# hebrew_flash_cards/tests/test_sentence_difficulty.py

"""Tests for sentence difficulty scoring."""

import json
from pathlib import Path

import pytest

import frequency_lookup
from sentence_difficulty import DEFAULT_RANK, _resolve_token_frequency, build_nikkud_map, score_sentence


class TestBuildNikkudMap:
    """Tests for ``build_nikkud_map``: vocalized (nikkud) form -> ktiv male spelling."""

    def test_maps_direct_headwords(self):
        """The headword's own nikkud form maps to its ktiv male spelling."""
        words = {"אָב": {"word": {"nikkud": "אָב", "ktiv_male": "אב"}}}
        nmap = build_nikkud_map(words)
        assert nmap["אָב"] == "אב"

    def test_maps_conjugation_forms(self):
        """Active conjugation forms and the infinitive are included in the map."""
        words = {
            "שָׁמַר": {
                "word": {"nikkud": "שָׁמַר", "ktiv_male": "שמר"},
                "conjugation": {
                    "active_forms": [
                        {
                            "person": "1s",
                            "tense": "עָבָר",
                            "form": {"nikkud": "שָׁמַרְתִּי", "ktiv_male": "שמרתי"},
                        },
                    ],
                    "infinitive": {"nikkud": "לִשְׁמֹר", "ktiv_male": "לשמור"},
                    "reference_form": {"nikkud": "שָׁמַר", "ktiv_male": "שמר"},
                },
            }
        }
        nmap = build_nikkud_map(words)
        assert nmap["שָׁמַרְתִּי"] == "שמרתי"
        assert nmap["לִשְׁמֹר"] == "לשמור"

    def test_maps_noun_inflections(self):
        """Plural and pronominal-suffix noun forms are included in the map."""
        words = {
            "אָב": {
                "word": {"nikkud": "אָב", "ktiv_male": "אב"},
                "noun_inflection": {
                    "singular": {"nikkud": "אָב", "ktiv_male": "אב"},
                    "plural": {"nikkud": "אָבוֹת", "ktiv_male": "אבות"},
                    "pronominal_suffixes": {"1s": {"nikkud": "אָבִי", "ktiv_male": "אבי"}},
                },
            }
        }
        nmap = build_nikkud_map(words)
        assert nmap["אָבוֹת"] == "אבות"
        assert nmap["אָבִי"] == "אבי"

    def test_maps_adjective_inflections(self):
        """Gender/number adjective forms are included in the map."""
        words = {
            "גָּדוֹל": {
                "word": {"nikkud": "גָּדוֹל", "ktiv_male": "גדול"},
                "adjective_inflection": {
                    "ms": {"nikkud": "גָּדוֹל", "ktiv_male": "גדול"},
                    "fs": {"nikkud": "גְּדוֹלָה", "ktiv_male": "גדולה"},
                    "mp": {"nikkud": "גְּדוֹלִים", "ktiv_male": "גדולים"},
                    "fp": {"nikkud": "גְּדוֹלוֹת", "ktiv_male": "גדולות"},
                },
            }
        }
        nmap = build_nikkud_map(words)
        assert nmap["גְּדוֹלָה"] == "גדולה"
        assert nmap["גְּדוֹלִים"] == "גדולים"

    def test_construct_forms_strip_maqaf(self):
        """A construct form is indexed both with and without its trailing maqaf."""
        # The headword deliberately differs from the construct form. If the
        # headword were itself the maqaf-less spelling, that key would be present
        # via the headword alone and the stripping path would never be exercised.
        words = {
            "בַּיִת": {
                "word": {"nikkud": "בַּיִת", "ktiv_male": "בית"},
                "noun_inflection": {
                    "construct_singular": {"nikkud": "בֵּית־", "ktiv_male": "בית"},
                },
            }
        }
        nmap = build_nikkud_map(words)
        assert "בֵּית־" in nmap
        assert "בֵּית" in nmap

    def test_handles_missing_fields(self):
        """``None`` inflection/conjugation sections are tolerated."""
        words = {
            "test": {
                "word": {"nikkud": "טֶסְט", "ktiv_male": "טסט"},
                "conjugation": None,
                "noun_inflection": None,
                "adjective_inflection": None,
            }
        }
        nmap = build_nikkud_map(words)
        assert nmap["טֶסְט"] == "טסט"

    def test_real_words_json_coverage(self):
        """The real ``data/words.json`` yields more than 90,000 mapped forms.

        Skipped when the data file is not present.
        """
        words_path = Path(__file__).parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")
        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)
        nmap = build_nikkud_map(words)
        assert len(nmap) > 90_000


class TestResolveTokenFrequency:
    """Tests for ``_resolve_token_frequency`` run against the real project data."""

    @pytest.fixture(scope="class")
    def freq_setup(self):
        """Return ``(nikkud_map, nikkud_index, freq_data)``, built once per class.

        Class scope avoids re-reading ``words.json`` and rebuilding both lookup
        structures for every test. Requesting tests are skipped when
        ``data/words.json`` is not present.
        """
        words_path = Path(__file__).parent.parent / "data" / "words.json"
        # Decide on skipping before doing any loading work.
        if not words_path.exists():
            pytest.skip("words.json not available")

        frequency_lookup.load()
        freq_data = frequency_lookup.get_freq_data()
        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)
        from epub_examples import _build_nikkud_index

        nikkud_map = build_nikkud_map(words)
        nikkud_index = _build_nikkud_index(words)
        return nikkud_map, nikkud_index, freq_data

    def test_tier1_known_mapping(self, freq_setup):
        """The token "אָב" resolves to a rank better than ``DEFAULT_RANK``."""
        nikkud_map, nikkud_index, freq_data = freq_setup
        rank = _resolve_token_frequency("אָב", nikkud_map, nikkud_index, freq_data)
        assert rank is not None
        assert rank < DEFAULT_RANK

    def test_tier3_academy_converter(self, freq_setup):
        """The token "שָׁלוֹם" resolves to a rank below 1000."""
        nikkud_map, nikkud_index, freq_data = freq_setup
        rank = _resolve_token_frequency("שָׁלוֹם", nikkud_map, nikkud_index, freq_data)
        assert rank is not None
        assert rank < 1000

    def test_unknown_token_returns_default(self, freq_setup):
        """An unresolvable token falls back to ``DEFAULT_RANK``."""
        nikkud_map, nikkud_index, freq_data = freq_setup
        rank = _resolve_token_frequency("קְסַנְתּוֹפּוּלוֹס", nikkud_map, nikkud_index, freq_data)
        assert rank == DEFAULT_RANK

    def test_tier5_ktiv_male_prefix_strip(self, freq_setup):
        """The frequency data contains an entry for "שלום"."""
        # NOTE(review): despite its name this test never calls
        # _resolve_token_frequency; it only checks a precondition on the
        # frequency data. Consider adding a prefixed-token case that exercises
        # the prefix-stripping path once its expected rank is known.
        _, _, freq_data = freq_setup
        assert freq_data.get("שלום") is not None


class TestScoreSentence:
    """Tests for ``score_sentence`` run against the real project data."""

    @pytest.fixture(scope="class")
    def scoring_setup(self):
        """Return ``(nikkud_map, nikkud_index, freq_data)``, built once per class.

        Class scope avoids re-reading ``words.json`` and rebuilding both lookup
        structures for every test. Requesting tests are skipped when
        ``data/words.json`` is not present.
        """
        words_path = Path(__file__).parent.parent / "data" / "words.json"
        # Decide on skipping before doing any loading work.
        if not words_path.exists():
            pytest.skip("words.json not available")

        frequency_lookup.load()
        freq_data = frequency_lookup.get_freq_data()
        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)
        from epub_examples import _build_nikkud_index

        nikkud_map = build_nikkud_map(words)
        nikkud_index = _build_nikkud_index(words)
        return nikkud_map, nikkud_index, freq_data

    @staticmethod
    def _span(text, target):
        """Return the ``(start, end)`` character offsets of *target* within *text*."""
        start = text.index(target)
        return start, start + len(target)

    def test_returns_integer(self, scoring_setup):
        """The score of an ordinary sentence is an ``int``."""
        nmap, nidx, freq = scoring_setup
        text = "הוּא הָלַךְ הַבַּיְתָה"
        start, end = self._span(text, "הָלַךְ")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_easy_sentence_scores_lower(self, scoring_setup):
        """The sentence labelled easy scores strictly lower than the one labelled hard."""
        nmap, nidx, freq = scoring_setup
        easy = "הוּא אָמַר שָׁלוֹם"
        hard = "הַפַּרְדֵּס נִשְׁתַּטֵּחַ בַּדַּהֲרָה"
        easy_start, easy_end = self._span(easy, "אָמַר")
        hard_start, hard_end = self._span(hard, "נִשְׁתַּטֵּחַ")
        easy_score = score_sentence(easy, easy_start, easy_end, nmap, nidx, freq)
        hard_score = score_sentence(hard, hard_start, hard_end, nmap, nidx, freq)
        assert easy_score < hard_score

    def test_single_context_token(self, scoring_setup):
        """A sentence with the target plus one other word still yields an ``int``."""
        nmap, nidx, freq = scoring_setup
        text = "הוּא טוֹב"
        # The target is the first word, so the span starts at offset 0.
        start, end = self._span(text, "הוּא")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_handles_punctuation(self, scoring_setup):
        """Surrounding quotes and an exclamation mark do not break scoring."""
        nmap, nidx, freq = scoring_setup
        text = '"הוּא טוֹב!"'
        start, end = self._span(text, "טוֹב")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_splits_on_maqaf(self, scoring_setup):
        """A sentence containing a maqaf-joined pair yields an ``int`` score."""
        nmap, nidx, freq = scoring_setup
        text = "בֵּית־סֵפֶר גָּדוֹל"
        start, end = self._span(text, "גָּדוֹל")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_no_context_tokens_returns_default(self, scoring_setup):
        """Scoring "א ב" with the first character as target gives ``DEFAULT_RANK``."""
        # NOTE(review): presumably the lone remaining letter is not counted as a
        # context token by score_sentence - confirm against the implementation.
        nmap, nidx, freq = scoring_setup
        text = "א ב"
        score = score_sentence(text, 0, 1, nmap, nidx, freq)
        assert score == DEFAULT_RANK