"""Unit tests for apkg_builder — Sprint 15 learnings. Tests cover: cloze prefix preservation, Hebrew spoiler stripping from English meanings, PoS exact matching, gender field population, and mishkal data integrity. """ import json import re import sys from pathlib import Path import pytest # Ensure project root is on path sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from apkg_builder import _categorize_pos, _cloze_prefix_len # --------------------------------------------------------------------------- # Cloze prefix preservation # --------------------------------------------------------------------------- class TestClozePrefix: """_cloze_prefix_len must detect Hebrew prefix letters before the word.""" def test_single_prefix_bet(self): # בַּתּוֹר = bet + patach + tor assert _cloze_prefix_len("בַּתּוֹר", "תּוֹר") > 0 def test_single_prefix_lamed(self): # לַמֶּלֶךְ = lamed + patach + melech assert _cloze_prefix_len("לַמֶּלֶךְ", "מֶּלֶךְ") > 0 def test_two_consonant_prefix(self): # שֶׁבַּתּוֹר = shin + bet + tor (two prefix letters) token = "שֶׁבַּתּוֹר" word = "תּוֹר" prefix_len = _cloze_prefix_len(token, word) assert prefix_len > 0 assert token[prefix_len:].startswith(word) def test_no_prefix_direct_match(self): # Word appears at start — no prefix assert _cloze_prefix_len("תּוֹר", "תּוֹר") == 0 def test_empty_inputs(self): assert _cloze_prefix_len("", "תּוֹר") == 0 assert _cloze_prefix_len("בַּתּוֹר", "") == 0 assert _cloze_prefix_len("", "") == 0 def test_non_prefix_letter_returns_zero(self): # If the "prefix" chars aren't valid prefix letters, return 0 # 'ת' is not in _PREFIX_LETTERS (בהוכלמש) assert _cloze_prefix_len("תַּתּוֹר", "תּוֹר") == 0 def test_prefix_preserves_nikkud(self): # Verify that prefix_len includes nikkud marks token = "בַּתּוֹר" word = "תּוֹר" prefix_len = _cloze_prefix_len(token, word) prefix = token[:prefix_len] # Prefix should contain at least bet + nikkud mark(s) base_letters = [c for c in prefix if "\u05d0" <= c <= "\u05ea"] assert base_letters == ["ב"] # --------------------------------------------------------------------------- # PoS exact matching (no substring collisions) # --------------------------------------------------------------------------- class TestCategorizePos: """_categorize_pos must not let 'Pronoun' match 'Noun'.""" def test_noun_exact(self): assert _categorize_pos("Noun") == "Noun" def test_pronoun_is_other(self): assert _categorize_pos("Pronoun") == "Other" def test_verb_exact(self): assert _categorize_pos("Verb") == "Verb" def test_noun_with_dash(self): assert _categorize_pos("Noun – masculine") == "Noun" def test_adjective(self): assert _categorize_pos("Adjective") == "Adjective" def test_conjunction_is_other(self): assert _categorize_pos("Conjunction") == "Other" # --------------------------------------------------------------------------- # Hebrew spoiler stripping from English meanings # --------------------------------------------------------------------------- class TestHebrewSpoilerStripping: """English meanings must not contain Hebrew text (spoils the card).""" # Use the same regex from apkg_builder.py HEBREW_STRIP_RE = re.compile(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*") @staticmethod def _strip_hebrew(meaning: str) -> str: """Replicate the meaning cleaning pipeline from build_vocab_deck.""" meaning = re.sub(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*", "", meaning) meaning = re.sub(r"[;:]\s*—", " —", meaning) meaning = re.sub(r";\s*:", ";", meaning) return re.sub(r"\s{2,}", " ", meaning).strip(", ;:") def test_pure_english_unchanged(self): assert self._strip_hebrew("to eat, to consume") == "to eat, to consume" def test_hebrew_word_removed(self): result = self._strip_hebrew("to eat; אכל") assert "אכל" not in result def test_hebrew_with_nikkud_removed(self): result = self._strip_hebrew("tall; גָּבוֹהַּ") assert "גָּבוֹהַּ" not in result assert "tall" in result def test_no_residual_hebrew_in_real_data(self): """Scan actual words.json — no meaning should contain Hebrew after stripping.""" words_path = Path(__file__).resolve().parent.parent / "data" / "words.json" if not words_path.exists(): pytest.skip("words.json not available") with open(words_path, encoding="utf-8") as f: words = json.load(f) # The regex used in apkg_builder hebrew_re = re.compile(r"[\u05D0-\u05EA]") spoilers = [] for key, entry in words.items(): meaning = entry.get("meaning") or "" cleaned = self._strip_hebrew(meaning) if hebrew_re.search(cleaned): spoilers.append(f"{key}: {cleaned!r}") assert not spoilers, f"Hebrew found in {len(spoilers)} meanings after stripping: {spoilers[:5]}" # --------------------------------------------------------------------------- # Gender field for nouns (words.json data integrity) # --------------------------------------------------------------------------- class TestGenderDataIntegrity: """Nouns with noun_inflection should have gender populated.""" @pytest.fixture() def words(self): words_path = Path(__file__).resolve().parent.parent / "data" / "words.json" if not words_path.exists(): pytest.skip("words.json not available") with open(words_path, encoding="utf-8") as f: return json.load(f) def test_nouns_have_gender(self, words): """Nouns with noun_inflection should have a valid gender.""" missing = [] for key, entry in words.items(): pos = entry.get("pos") or "" ni = entry.get("noun_inflection") if pos.startswith("Noun") and ni: gender = ni.get("gender") or "" if gender not in ("masculine", "feminine", "masculine and feminine"): missing.append(f"{key}: gender={gender!r}") # Allow up to 7% missing (loan words, compound words, etc.) noun_count = sum( 1 for e in words.values() if (e.get("pos") or "").startswith("Noun") and e.get("noun_inflection") ) if noun_count > 0: pct_missing = len(missing) / noun_count assert pct_missing < 0.07, f"{len(missing)}/{noun_count} nouns missing gender: {missing[:10]}" # --------------------------------------------------------------------------- # Mishkal data integrity # --------------------------------------------------------------------------- class TestMishkalIntegrity: """Validate mishkal data consistency in words.json.""" @pytest.fixture() def words(self): words_path = Path(__file__).resolve().parent.parent / "data" / "words.json" if not words_path.exists(): pytest.skip("words.json not available") with open(words_path, encoding="utf-8") as f: return json.load(f) def test_mishkal_hebrew_matches_english(self, words): """If mishkal and mishkal_hebrew are both set, they should correspond via _mishkal_to_hebrew.""" from pealim_detail_scrape import _mishkal_to_hebrew mismatches = [] for key, entry in words.items(): for infl_key in ("noun_inflection", "adjective_inflection"): infl = entry.get(infl_key) if not infl: continue mishkal_eng = infl.get("mishkal") or "" mishkal_heb = infl.get("mishkal_hebrew") or "" if mishkal_eng and mishkal_heb: expected = _mishkal_to_hebrew(mishkal_eng) or "" if expected and expected != mishkal_heb: mismatches.append(f"{key}: {mishkal_eng}→{mishkal_heb} (expected {expected})") assert not mismatches, f"{len(mismatches)} mishkal mismatches: {mismatches[:10]}" def test_mishkal_hebrew_is_hebrew(self, words): """mishkal_hebrew must contain Hebrew characters.""" hebrew_re = re.compile(r"[\u05D0-\u05EA]") bad = [] for key, entry in words.items(): for infl_key in ("noun_inflection", "adjective_inflection"): infl = entry.get(infl_key) if not infl: continue mishkal_heb = infl.get("mishkal_hebrew") or "" if mishkal_heb and not hebrew_re.search(mishkal_heb): bad.append(f"{key}: mishkal_hebrew={mishkal_heb!r}") assert not bad, f"{len(bad)} non-Hebrew mishkal_hebrew values: {bad[:10]}" def test_no_orphaned_mishkal(self, words): """If mishkal_hebrew is set, mishkal (English) must also be set.""" orphans = [] for key, entry in words.items(): for infl_key in ("noun_inflection", "adjective_inflection"): infl = entry.get(infl_key) if not infl: continue mishkal_heb = infl.get("mishkal_hebrew") or "" mishkal_eng = infl.get("mishkal") or "" if mishkal_heb and not mishkal_eng: orphans.append(f"{key}: has mishkal_hebrew but no mishkal") assert not orphans, f"{len(orphans)} orphaned mishkal_hebrew: {orphans[:10]}"