# hebrew_flash_cards/tests/test_apkg_builder.py

"""Unit tests for apkg_builder — Sprint 15 learnings.

Tests cover: cloze prefix preservation, Hebrew spoiler stripping from English
meanings, PoS exact matching, gender field population, and mishkal data integrity.
"""

import json
import re
import sys
from pathlib import Path

import pytest

# Ensure project root is on path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from apkg_builder import _categorize_pos, _cloze_prefix_len

# ---------------------------------------------------------------------------
# Cloze prefix preservation
# ---------------------------------------------------------------------------


class TestClozePrefix:
    """Exercise _cloze_prefix_len on tokens with and without attached prefix letters.

    Each test passes a vocalised token and the bare word it should contain,
    then checks the returned prefix length.
    """

    def test_single_prefix_bet(self):
        """A token made of bet + vowel + word yields a positive prefix length."""
        length = _cloze_prefix_len("בַּתּוֹר", "תּוֹר")
        assert length > 0

    def test_single_prefix_lamed(self):
        """A token made of lamed + vowel + word yields a positive prefix length."""
        length = _cloze_prefix_len("לַמֶּלֶךְ", "מֶּלֶךְ")
        assert length > 0

    def test_two_consonant_prefix(self):
        """Two stacked prefix letters (shin + bet) are both counted as prefix."""
        full, stem = "שֶׁבַּתּוֹר", "תּוֹר"
        cut = _cloze_prefix_len(full, stem)
        assert cut > 0
        # Whatever follows the reported prefix must begin with the bare word.
        assert full[cut:].startswith(stem)

    def test_no_prefix_direct_match(self):
        """When the token is exactly the word, the prefix length is zero."""
        length = _cloze_prefix_len("תּוֹר", "תּוֹר")
        assert length == 0

    def test_empty_inputs(self):
        """An empty token, an empty word, or both give a prefix length of zero."""
        for token, word in (("", "תּוֹר"), ("בַּתּוֹר", ""), ("", "")):
            assert _cloze_prefix_len(token, word) == 0

    def test_non_prefix_letter_returns_zero(self):
        """A leading tav must not be reported as a prefix.

        NOTE(review): the original comment says tav is absent from
        apkg_builder's prefix-letter set; that set is not visible from here.
        """
        length = _cloze_prefix_len("תַּתּוֹר", "תּוֹר")
        assert length == 0

    def test_prefix_preserves_nikkud(self):
        """The reported prefix of a bet-prefixed token contains exactly one consonant: bet."""
        full = "בַּתּוֹר"
        stem = "תּוֹר"
        cut = _cloze_prefix_len(full, stem)
        leading = full[:cut]
        # Keep only base consonants (alef..tav); vowel points fall outside this range.
        consonants = list(filter(lambda ch: "\u05d0" <= ch <= "\u05ea", leading))
        assert consonants == ["ב"]


# ---------------------------------------------------------------------------
# PoS exact matching (no substring collisions)
# ---------------------------------------------------------------------------


class TestCategorizePos:
    """Check the category _categorize_pos assigns to part-of-speech labels.

    The key regression guarded here is a substring collision: 'Pronoun'
    contains 'Noun' but must not be categorised as a noun.
    """

    def test_noun_exact(self):
        """The plain label 'Noun' maps to 'Noun'."""
        category = _categorize_pos("Noun")
        assert category == "Noun"

    def test_pronoun_is_other(self):
        """'Pronoun' falls into 'Other' rather than 'Noun'."""
        category = _categorize_pos("Pronoun")
        assert category == "Other"

    def test_verb_exact(self):
        """The plain label 'Verb' maps to 'Verb'."""
        category = _categorize_pos("Verb")
        assert category == "Verb"

    def test_noun_with_dash(self):
        """A noun label carrying a dash-separated qualifier still maps to 'Noun'."""
        category = _categorize_pos("Noun – masculine")
        assert category == "Noun"

    def test_adjective(self):
        """The plain label 'Adjective' maps to 'Adjective'."""
        category = _categorize_pos("Adjective")
        assert category == "Adjective"

    def test_conjunction_is_other(self):
        """'Conjunction' falls into 'Other'."""
        category = _categorize_pos("Conjunction")
        assert category == "Other"


# ---------------------------------------------------------------------------
# Hebrew spoiler stripping from English meanings
# ---------------------------------------------------------------------------


class TestHebrewSpoilerStripping:
    """English meanings must not contain Hebrew text (spoils the card).

    The cleaning steps are reproduced locally in ``_strip_hebrew`` rather than
    imported, so these tests validate this replica.
    NOTE(review): per the original comments the replica mirrors
    build_vocab_deck in apkg_builder.py — keep the two in sync by hand.
    """

    # Pattern copied verbatim from the original test, which states it matches
    # apkg_builder.py. It consumes a Hebrew-block character plus any following
    # run of Hebrew-block characters, whitespace, or hyphens. The inner
    # \u0591-\u05C7 range is already covered by \u0590-\u05FF; it is left
    # untouched so the copy stays textually comparable with its source.
    HEBREW_STRIP_RE = re.compile(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*")

    # Residual detector: any code point in the Hebrew block (letters, vowel
    # points, cantillation, punctuation) — the same range the stripper targets.
    HEBREW_CHAR_RE = re.compile(r"[\u0590-\u05FF]")

    @staticmethod
    def _strip_hebrew(meaning: str) -> str:
        """Return *meaning* with Hebrew runs removed and leftover separators tidied.

        Steps, in order: delete Hebrew runs; turn ';' or ':' followed by an em
        dash into ' —'; collapse '; :' into ';'; squeeze repeated whitespace to
        one space; trim leading/trailing commas, spaces, semicolons and colons.
        """
        # Use the shared class-level pattern so there is a single copy to maintain.
        meaning = TestHebrewSpoilerStripping.HEBREW_STRIP_RE.sub("", meaning)
        meaning = re.sub(r"[;:]\s*—", " —", meaning)
        meaning = re.sub(r";\s*:", ";", meaning)
        return re.sub(r"\s{2,}", " ", meaning).strip(", ;:")

    def test_pure_english_unchanged(self):
        """Text without Hebrew passes through untouched."""
        assert self._strip_hebrew("to eat, to consume") == "to eat, to consume"

    def test_hebrew_word_removed(self):
        """An unpointed Hebrew word and its dangling separator are removed."""
        result = self._strip_hebrew("to eat; אכל")
        assert "אכל" not in result
        # Pin the exact output so an over-eager cleaner (e.g. returning "") fails.
        assert result == "to eat"

    def test_hebrew_with_nikkud_removed(self):
        """A pointed Hebrew word is removed together with its vowel marks."""
        result = self._strip_hebrew("tall; גָּבוֹהַּ")
        assert "גָּבוֹהַּ" not in result
        assert "tall" in result
        # No stray combining marks or separators may survive.
        assert result == "tall"

    def test_no_residual_hebrew_in_real_data(self):
        """Scan actual words.json — no meaning should contain Hebrew after stripping.

        Skipped when data/words.json is absent.
        NOTE(review): Hebrew presentation forms (U+FB1D-U+FB4F) lie outside
        both patterns; if the data can contain them, widen both together.
        """
        words_path = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")

        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)

        spoilers = []
        for key, entry in words.items():
            # A missing or null meaning is treated as an empty string.
            cleaned = self._strip_hebrew(entry.get("meaning") or "")
            if self.HEBREW_CHAR_RE.search(cleaned):
                spoilers.append(f"{key}: {cleaned!r}")

        assert not spoilers, f"Hebrew found in {len(spoilers)} meanings after stripping: {spoilers[:5]}"


# ---------------------------------------------------------------------------
# Gender field for nouns (words.json data integrity)
# ---------------------------------------------------------------------------


class TestGenderDataIntegrity:
    """Data check: noun entries that carry noun_inflection should state a gender."""

    @pytest.fixture()
    def words(self):
        """Return the parsed contents of data/words.json, skipping the test if it is absent."""
        data_file = Path(__file__).resolve().parents[1] / "data" / "words.json"
        if not data_file.exists():
            pytest.skip("words.json not available")
        with data_file.open(encoding="utf-8") as handle:
            return json.load(handle)

    def test_nouns_have_gender(self, words):
        """Fewer than 7% of inflected nouns may lack a recognised gender.

        An entry counts as an inflected noun when its ``pos`` starts with
        "Noun" and its ``noun_inflection`` is truthy. Nothing is asserted
        when there are no such entries.
        """
        accepted = ("masculine", "feminine", "masculine and feminine")
        total_nouns = 0
        offenders = []

        for word_id, record in words.items():
            label = record.get("pos") or ""
            inflection = record.get("noun_inflection")
            if not label.startswith("Noun") or not inflection:
                continue
            total_nouns += 1
            value = inflection.get("gender") or ""
            if value in accepted:
                continue
            offenders.append(f"{word_id}: gender={value!r}")

        if total_nouns == 0:
            return

        # Tolerance for loan words, compounds and similar entries without gender.
        ratio = len(offenders) / total_nouns
        assert ratio < 0.07, f"{len(offenders)}/{total_nouns} nouns missing gender: {offenders[:10]}"


# ---------------------------------------------------------------------------
# Mishkal data integrity
# ---------------------------------------------------------------------------


class TestMishkalIntegrity:
    """Consistency checks over the mishkal fields stored in data/words.json."""

    @pytest.fixture()
    def words(self):
        """Return the parsed contents of data/words.json, skipping the test if it is absent."""
        data_file = Path(__file__).resolve().parents[1] / "data" / "words.json"
        if not data_file.exists():
            pytest.skip("words.json not available")
        with data_file.open(encoding="utf-8") as handle:
            return json.load(handle)

    @staticmethod
    def _inflection_tables(words):
        """Yield (key, table) for every truthy noun/adjective inflection of every entry."""
        for word_id, record in words.items():
            for field in ("noun_inflection", "adjective_inflection"):
                table = record.get(field)
                if table:
                    yield word_id, table

    def test_mishkal_hebrew_matches_english(self, words):
        """Where both names are present, mishkal_hebrew must equal _mishkal_to_hebrew(mishkal).

        Tables for which the helper returns a falsy value are not compared.
        """
        from pealim_detail_scrape import _mishkal_to_hebrew

        problems = []
        for word_id, table in self._inflection_tables(words):
            english = table.get("mishkal") or ""
            hebrew = table.get("mishkal_hebrew") or ""
            if not (english and hebrew):
                continue
            derived = _mishkal_to_hebrew(english) or ""
            if derived and derived != hebrew:
                problems.append(f"{word_id}: {english}→{hebrew} (expected {derived})")

        assert not problems, f"{len(problems)} mishkal mismatches: {problems[:10]}"

    def test_mishkal_hebrew_is_hebrew(self, words):
        """Every non-empty mishkal_hebrew contains at least one Hebrew letter (alef..tav)."""
        letter = re.compile(r"[\u05D0-\u05EA]")
        offenders = []
        for word_id, table in self._inflection_tables(words):
            hebrew = table.get("mishkal_hebrew") or ""
            if not hebrew:
                continue
            if letter.search(hebrew) is None:
                offenders.append(f"{word_id}: mishkal_hebrew={hebrew!r}")

        assert not offenders, f"{len(offenders)} non-Hebrew mishkal_hebrew values: {offenders[:10]}"

    def test_no_orphaned_mishkal(self, words):
        """A table that sets mishkal_hebrew must also set the English mishkal."""
        stray = []
        for word_id, table in self._inflection_tables(words):
            hebrew = table.get("mishkal_hebrew") or ""
            english = table.get("mishkal") or ""
            if hebrew and not english:
                stray.append(f"{word_id}: has mishkal_hebrew but no mishkal")

        assert not stray, f"{len(stray)} orphaned mishkal_hebrew: {stray[:10]}"