hebrew_flash_cards/tests/test_apkg_builder.py
Sochen c85063ee2f Sprint 15: example sentence pipeline overhaul + corpus expansion + card improvements
- Regenerated all example sentences from scratch (deleted legacy + stale entries)
- Added .txt file support to epub_examples.py for Ben Yehuda corpus
- 7 Ben Yehuda nikkud'd children's texts + 3 new Time Tunnel EPUBs
- Maqaf-stripped construct form indexing (+68% inflected matches)
- Total: 3,598 words with examples, 3,289 with cloze (was ~2,900)
- Cloze prefix preservation (_cloze_prefix_len)
- Hebrew spoiler stripping from English meanings
- Gender field (זָכָר/נְקֵבָה) on vocab cards
- sec-table CSS layout for aligned key:value pairs
- Mishkal uses mishkal_hebrew on plural cards
- Improved mishkal extraction from pealim detail pages
- 21 new pytest tests (cloze, PoS, Hebrew stripping, gender, mishkal)
- 2 new validate_data.py tests + mishkal stats
- Colliding forms tracking (local-only)
- Release tag v0.17

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 10:44:14 +00:00

246 lines
9.6 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Unit tests for apkg_builder — Sprint 15 learnings.
Tests cover: cloze prefix preservation, Hebrew spoiler stripping from English
meanings, PoS exact matching, gender field population, and mishkal data integrity.
"""
import json
import re
import sys
from pathlib import Path
import pytest
# Ensure project root is on path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from apkg_builder import _categorize_pos, _cloze_prefix_len
# ---------------------------------------------------------------------------
# Cloze prefix preservation
# ---------------------------------------------------------------------------
class TestClozePrefix:
    """Checks that ``_cloze_prefix_len`` finds Hebrew prefix letters attached to a word.

    Every case passes a full token and the bare target word and inspects the
    integer that comes back.
    """

    def test_single_prefix_bet(self):
        # Token is bet (with its nikkud) attached in front of the target word.
        length = _cloze_prefix_len("בַּתּוֹר", "תּוֹר")
        assert length > 0

    def test_single_prefix_lamed(self):
        # Token is lamed (with its nikkud) attached in front of the target word.
        length = _cloze_prefix_len("לַמֶּלֶךְ", "מֶּלֶךְ")
        assert length > 0

    def test_two_consonant_prefix(self):
        # Two stacked prefix letters (shin + bet) in front of the word.
        full_token, target = "שֶׁבַּתּוֹר", "תּוֹר"
        cut = _cloze_prefix_len(full_token, target)
        assert cut > 0
        # Slicing the prefix off must leave text that begins with the target.
        remainder = full_token[cut:]
        assert remainder.startswith(target)

    def test_no_prefix_direct_match(self):
        # Token and word are identical, so nothing precedes the word.
        assert _cloze_prefix_len("תּוֹר", "תּוֹר") == 0

    def test_empty_inputs(self):
        # Any empty argument is expected to produce 0.
        for token, word in (("", "תּוֹר"), ("בַּתּוֹר", ""), ("", "")):
            assert _cloze_prefix_len(token, word) == 0

    def test_non_prefix_letter_returns_zero(self):
        # A leading tav is expected not to count as a prefix letter.
        # NOTE(review): the original comment lists _PREFIX_LETTERS as בהוכלמש;
        # that constant is defined outside this file — confirm.
        assert _cloze_prefix_len("תַּתּוֹר", "תּוֹר") == 0

    def test_prefix_preserves_nikkud(self):
        # The sliced-off prefix must hold exactly one base consonant (bet);
        # combining marks fall outside the alef..tav range and are ignored here.
        full_token, target = "בַּתּוֹר", "תּוֹר"
        head = full_token[: _cloze_prefix_len(full_token, target)]
        consonants = [ch for ch in head if "\u05d0" <= ch <= "\u05ea"]
        assert consonants == ["ב"]
# ---------------------------------------------------------------------------
# PoS exact matching (no substring collisions)
# ---------------------------------------------------------------------------
class TestCategorizePos:
    """Pins the category ``_categorize_pos`` returns for a handful of PoS labels.

    The key expectation is that a label which merely contains "Noun" as a
    substring (such as "Pronoun") is not categorised as a noun.
    """

    def test_noun_exact(self):
        category = _categorize_pos("Noun")
        assert category == "Noun"

    def test_pronoun_is_other(self):
        # "Pronoun" ends in "noun" but must not be reported as "Noun".
        category = _categorize_pos("Pronoun")
        assert category == "Other"

    def test_verb_exact(self):
        category = _categorize_pos("Verb")
        assert category == "Verb"

    def test_noun_with_dash(self):
        # A "Noun" label followed by a qualifier still maps to "Noun".
        # NOTE(review): the method name mentions a dash but the literal below
        # contains none — confirm it matches the real label format.
        category = _categorize_pos("Noun masculine")
        assert category == "Noun"

    def test_adjective(self):
        category = _categorize_pos("Adjective")
        assert category == "Adjective"

    def test_conjunction_is_other(self):
        category = _categorize_pos("Conjunction")
        assert category == "Other"
# ---------------------------------------------------------------------------
# Hebrew spoiler stripping from English meanings
# ---------------------------------------------------------------------------
class TestHebrewSpoilerStripping:
    """English meanings must not contain Hebrew text (it would spoil the card).

    ``_strip_hebrew`` re-implements, inside this test module, the cleaning steps
    that the original author attributes to ``build_vocab_deck``.  The tests run
    that helper on hand-written samples and, when the file is present, on every
    meaning in ``data/words.json``.
    """

    # One Hebrew-block character followed by any run of Hebrew-block characters,
    # whitespace or hyphens.  Kept verbatim: the original note says this is the
    # same pattern apkg_builder.py uses.
    HEBREW_STRIP_RE = re.compile(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*")
    # ";" or ":" followed by an em dash.
    _PUNCT_THEN_DASH_RE = re.compile(r"[;:]\s*—")
    # ";" followed by ":"; collapsed to a single ";".
    _SEMICOLON_THEN_COLON_RE = re.compile(r";\s*:")
    # Two or more whitespace characters in a row.
    _SPACE_RUN_RE = re.compile(r"\s{2,}")
    # Hebrew letters alef..tav only (no points or marks); detects leftovers.
    _HEBREW_LETTER_RE = re.compile(r"[\u05D0-\u05EA]")

    @classmethod
    def _strip_hebrew(cls, meaning: str) -> str:
        """Return *meaning* with Hebrew runs and the punctuation they orphan removed.

        Steps, in order: delete Hebrew runs, delete ";"/":" followed by an em
        dash, collapse ";" + ":" to ";", squeeze whitespace runs to one space,
        then trim leading/trailing commas, spaces, semicolons and colons.
        """
        # Use the shared compiled pattern so the helper cannot drift from it.
        meaning = cls.HEBREW_STRIP_RE.sub("", meaning)
        meaning = cls._PUNCT_THEN_DASH_RE.sub("", meaning)
        meaning = cls._SEMICOLON_THEN_COLON_RE.sub(";", meaning)
        return cls._SPACE_RUN_RE.sub(" ", meaning).strip(", ;:")

    def test_pure_english_unchanged(self):
        assert self._strip_hebrew("to eat, to consume") == "to eat, to consume"

    def test_hebrew_word_removed(self):
        result = self._strip_hebrew("to eat; אכל")
        assert "אכל" not in result
        # The English part must survive and the dangling "; " must be trimmed.
        assert result == "to eat"

    def test_hebrew_with_nikkud_removed(self):
        result = self._strip_hebrew("tall; גָּבוֹהַּ")
        assert "גָּבוֹהַּ" not in result
        assert "tall" in result

    def test_no_residual_hebrew_in_real_data(self):
        """Scan actual words.json — no meaning should contain Hebrew after stripping."""
        # Imported here as well so the skip path works wherever this class is collected.
        import pytest

        words_path = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")
        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)
        # Record every entry whose cleaned meaning still holds a Hebrew letter.
        spoilers = []
        for key, entry in words.items():
            cleaned = self._strip_hebrew(entry.get("meaning") or "")
            if self._HEBREW_LETTER_RE.search(cleaned):
                spoilers.append(f"{key}: {cleaned!r}")
        assert not spoilers, f"Hebrew found in {len(spoilers)} meanings after stripping: {spoilers[:5]}"
# ---------------------------------------------------------------------------
# Gender field for nouns (words.json data integrity)
# ---------------------------------------------------------------------------
class TestGenderDataIntegrity:
    """Data check: noun entries that carry ``noun_inflection`` should name a gender."""

    @pytest.fixture()
    def words(self):
        """Return the parsed ``data/words.json``; skip the test when it is absent."""
        data_file = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not data_file.exists():
            pytest.skip("words.json not available")
        with open(data_file, encoding="utf-8") as handle:
            return json.load(handle)

    def test_nouns_have_gender(self, words):
        """Fewer than 7% of inflected nouns may lack a recognised gender value."""
        accepted = ("masculine", "feminine", "masculine and feminine")
        noun_count = 0
        missing = []
        for key, entry in words.items():
            inflection = entry.get("noun_inflection")
            is_noun = (entry.get("pos") or "").startswith("Noun")
            # Only entries whose PoS starts with "Noun" and that have inflection data count.
            if not (is_noun and inflection):
                continue
            noun_count += 1
            gender = inflection.get("gender") or ""
            if gender not in accepted:
                missing.append(f"{key}: gender={gender!r}")
        # With no qualifying nouns there is no ratio to check.
        if noun_count == 0:
            return
        # Tolerance kept from the original: up to 7% (loan words, compounds, etc.).
        pct_missing = len(missing) / noun_count
        assert pct_missing < 0.07, f"{len(missing)}/{noun_count} nouns missing gender: {missing[:10]}"
# ---------------------------------------------------------------------------
# Mishkal data integrity
# ---------------------------------------------------------------------------
class TestMishkalIntegrity:
    """Validate mishkal data consistency in words.json."""

    @pytest.fixture()
    def words(self):
        """Return the parsed ``data/words.json``; skip the test when it is absent."""
        words_path = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")
        with open(words_path, encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def _mishkal_pairs(words):
        """Yield ``(key, mishkal, mishkal_hebrew)`` for each populated inflection.

        Looks at the ``noun_inflection`` and ``adjective_inflection`` values of
        every entry, skipping falsy ones.  Missing or falsy mishkal fields are
        yielded as empty strings.
        """
        for key, entry in words.items():
            for infl_key in ("noun_inflection", "adjective_inflection"):
                infl = entry.get(infl_key)
                if not infl:
                    continue
                yield key, infl.get("mishkal") or "", infl.get("mishkal_hebrew") or ""

    def test_mishkal_hebrew_matches_english(self, words):
        """If mishkal and mishkal_hebrew are both set, they should correspond via _mishkal_to_hebrew."""
        from pealim_detail_scrape import _mishkal_to_hebrew

        mismatches = []
        for key, mishkal_eng, mishkal_heb in self._mishkal_pairs(words):
            if not (mishkal_eng and mishkal_heb):
                continue
            # A falsy mapping result is not treated as a mismatch.
            expected = _mishkal_to_hebrew(mishkal_eng) or ""
            if expected and expected != mishkal_heb:
                # Separator keeps the English and Hebrew values readable in the report.
                mismatches.append(f"{key}: {mishkal_eng} -> {mishkal_heb} (expected {expected})")
        assert not mismatches, f"{len(mismatches)} mishkal mismatches: {mismatches[:10]}"

    def test_mishkal_hebrew_is_hebrew(self, words):
        """mishkal_hebrew must contain Hebrew characters."""
        hebrew_re = re.compile(r"[\u05D0-\u05EA]")
        bad = [
            f"{key}: mishkal_hebrew={mishkal_heb!r}"
            for key, _, mishkal_heb in self._mishkal_pairs(words)
            if mishkal_heb and not hebrew_re.search(mishkal_heb)
        ]
        assert not bad, f"{len(bad)} non-Hebrew mishkal_hebrew values: {bad[:10]}"

    def test_no_orphaned_mishkal(self, words):
        """If mishkal_hebrew is set, mishkal (English) must also be set."""
        orphans = [
            f"{key}: has mishkal_hebrew but no mishkal"
            for key, mishkal_eng, mishkal_heb in self._mishkal_pairs(words)
            if mishkal_heb and not mishkal_eng
        ]
        assert not orphans, f"{len(orphans)} orphaned mishkal_hebrew: {orphans[:10]}"