feat: add sentence_difficulty module with 5-tier frequency scoring

Implements build_nikkud_map(), _resolve_token_frequency(), and score_sentence() for v0.20 adaptive cloze sentence selection. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-15 13:23:21 +00:00 · 2026-03-15 13:23:21 +00:00 · fb12f806a8
commit fb12f806a8
parent 00fba934fb
2 changed files with 405 additions and 0 deletions
--- a/sentence_difficulty.py
+++ b/sentence_difficulty.py
@ -0,0 +1,198 @@
 """Sentence difficulty scoring by context-word frequency.
 Scores sentences by the median frequency rank of context words
 (excluding the cloze target). Lower score = easier sentence.
 Used by epub_examples.py to select the best cloze sentence.
 """
 from __future__ import annotations
 from statistics import median
 import helpers
 import nikkud_to_ktiv_male
 DEFAULT_RANK = 50_000
 # Hebrew prefix consonants for ktiv_male prefix stripping (tier 5)
 _KM_PREFIX_CHARS = set("בהוכלמשע")
 # Punctuation to strip from tokens
 _PUNCT = set('.,!?;:"\'"״׳–—()[]{}')
 # Maqaf (Hebrew hyphen) — splits tokens
 _MAQAF = "־"
 def build_nikkud_map(words: dict) -> dict[str, str]:
    """Build nikkud→ktiv_male lookup from words.json.
    Indexes: headwords, conjugation forms (active, passive, infinitive,
    reference_form), noun inflections (singular, plural, construct,
    pronominal suffixes), and adjective inflections (ms/fs/mp/fp).
    Args:
        words: The full words.json dict keyed by unique_key.
    Returns:
        Dict mapping nikkud form to ktiv_male string.
        When collisions occur, last-write wins (acceptable for frequency lookup).
    """
    nmap: dict[str, str] = {}
    def _add(nikkud: str | None, ktiv_male: str | None) -> None:
        if nikkud and ktiv_male:
            nmap[nikkud] = ktiv_male
    for entry in words.values():
        word = entry.get("word") or {}
        _add(word.get("nikkud"), word.get("ktiv_male"))
        # Conjugation forms
        conj = entry.get("conjugation") or {}
        for form_entry in conj.get("active_forms") or []:
            form = form_entry.get("form") or {}
            _add(form.get("nikkud"), form.get("ktiv_male"))
        for form_entry in conj.get("hufal_pual_forms") or []:
            form = form_entry.get("form") or {}
            _add(form.get("nikkud"), form.get("ktiv_male"))
        inf = conj.get("infinitive") or {}
        _add(inf.get("nikkud"), inf.get("ktiv_male"))
        ref = conj.get("reference_form") or {}
        _add(ref.get("nikkud"), ref.get("ktiv_male"))
        # Noun inflection forms
        noun = entry.get("noun_inflection") or {}
        for field in ("singular", "plural", "construct_singular", "construct_plural"):
            sub = noun.get(field) or {}
            nikkud_form = sub.get("nikkud")
            ktiv = sub.get("ktiv_male")
            _add(nikkud_form, ktiv)
            # Index construct forms without maqaf
            if nikkud_form and nikkud_form.endswith("־") and ktiv:
                _add(nikkud_form[:-1], ktiv)
        pronominal = noun.get("pronominal_suffixes") or {}
        for sub in pronominal.values():
            if isinstance(sub, dict):
                _add(sub.get("nikkud"), sub.get("ktiv_male"))
        # Adjective inflection forms
        adj = entry.get("adjective_inflection") or {}
        for field in ("ms", "fs", "mp", "fp"):
            sub = adj.get(field) or {}
            _add(sub.get("nikkud"), sub.get("ktiv_male"))
    return nmap
 def _resolve_token_frequency(
    token: str,
    nikkud_map: dict[str, str],
    nikkud_index: dict,
    freq_data: dict[str, int],
 ) -> int:
    """Resolve a nikkud sentence token to its frequency rank.
    Uses a 5-tier pipeline:
    1. Known mapping (nikkud_map from words.json)
    2. Nikkud prefix stripping (epub_examples.try_strip_prefix)
    3. Academy rules converter (nikkud_to_ktiv_male.convert)
    4. strip_nikkud fallback (helpers.strip_nikkud)
    5. Ktiv_male prefix stripping on the converted form
    Returns:
        Frequency rank (1 = most common). DEFAULT_RANK (50000) if not found.
    """
    # Tier 1: Direct lookup in nikkud→ktiv_male map
    ktiv = nikkud_map.get(token)
    if ktiv and ktiv in freq_data:
        return freq_data[ktiv]
    # Tier 2: Nikkud prefix stripping → resolve remainder via nikkud_map
    from epub_examples import try_strip_prefix
    prefix_hits = try_strip_prefix(token, nikkud_index)
    for _unique_key, _match_type, matched_remainder in prefix_hits:
        remainder_ktiv = nikkud_map.get(matched_remainder)
        if remainder_ktiv and remainder_ktiv in freq_data:
            return freq_data[remainder_ktiv]
    # Tier 3: Academy rules converter
    converted = nikkud_to_ktiv_male.convert(token)
    if converted in freq_data:
        return freq_data[converted]
    # Tier 4: strip_nikkud fallback
    stripped = helpers.strip_nikkud(token)
    if stripped != converted and stripped in freq_data:
        return freq_data[stripped]
    # Tier 5: Ktiv_male prefix stripping on converted/stripped form
    for form in (converted, stripped):
        for prefix_len in (1, 2):
            if len(form) > prefix_len + 1:
                prefix = form[:prefix_len]
                if all(c in _KM_PREFIX_CHARS for c in prefix):
                    stem = form[prefix_len:]
                    if stem in freq_data:
                        return freq_data[stem]
    return DEFAULT_RANK
 def score_sentence(
    text: str,
    target_start: int,
    target_end: int,
    nikkud_map: dict[str, str],
    nikkud_index: dict,
    freq_data: dict[str, int],
 ) -> int:
    """Score a sentence by median frequency rank of context words.
    Args:
        text: The full sentence text (with nikkud).
        target_start: Character offset where the cloze target word starts.
        target_end: Character offset where the cloze target word ends.
        nikkud_map: nikkud→ktiv_male mapping from build_nikkud_map().
        nikkud_index: nikkud index from epub_examples._build_nikkud_index().
        freq_data: Frequency dict from frequency_lookup.get_freq_data().
    Returns:
        Median frequency rank of context tokens (int). Lower = easier.
        Returns DEFAULT_RANK if no scoreable context tokens.
    """
    # Tokenize: split on whitespace, then split on maqaf
    raw_tokens = text.split()
    tokens_with_pos: list[tuple[str, int, int]] = []
    pos = 0
    for raw in raw_tokens:
        start = text.index(raw, pos)
        # Split on maqaf
        parts = raw.split(_MAQAF)
        sub_pos = start
        for part in parts:
            if part:
                tokens_with_pos.append((part, sub_pos, sub_pos + len(part)))
            sub_pos += len(part) + 1  # +1 for maqaf
        pos = start + len(raw)
    # Filter: exclude target word, strip punctuation, skip short tokens
    context_ranks: list[int] = []
    for token, tok_start, tok_end in tokens_with_pos:
        # Exclude target word by overlap with char offsets
        if tok_start < target_end and tok_end > target_start:
            continue
        # Strip punctuation from edges
        cleaned = token.strip("".join(_PUNCT))
        if len(cleaned) < 2:
            continue
        rank = _resolve_token_frequency(cleaned, nikkud_map, nikkud_index, freq_data)
        context_ranks.append(rank)
    if not context_ranks:
        return DEFAULT_RANK
    return int(median(context_ranks))
--- a/tests/test_sentence_difficulty.py
+++ b/tests/test_sentence_difficulty.py
@ -0,0 +1,207 @@
 """Tests for sentence difficulty scoring."""
 import json
 from pathlib import Path
 import pytest
 import frequency_lookup
 from sentence_difficulty import DEFAULT_RANK, _resolve_token_frequency, build_nikkud_map, score_sentence
 class TestBuildNikkudMap:
    def test_maps_direct_headwords(self):
        words = {"אָב": {"word": {"nikkud": "אָב", "ktiv_male": "אב"}}}
        nmap = build_nikkud_map(words)
        assert nmap["אָב"] == "אב"
    def test_maps_conjugation_forms(self):
        words = {
            "שָׁמַר": {
                "word": {"nikkud": "שָׁמַר", "ktiv_male": "שמר"},
                "conjugation": {
                    "active_forms": [
                        {
                            "person": "1s",
                            "tense": "עָבָר",
                            "form": {"nikkud": "שָׁמַרְתִּי", "ktiv_male": "שמרתי"},
                        },
                    ],
                    "infinitive": {"nikkud": "לִשְׁמֹר", "ktiv_male": "לשמור"},
                    "reference_form": {"nikkud": "שָׁמַר", "ktiv_male": "שמר"},
                },
            }
        }
        nmap = build_nikkud_map(words)
        assert nmap["שָׁמַרְתִּי"] == "שמרתי"
        assert nmap["לִשְׁמֹר"] == "לשמור"
    def test_maps_noun_inflections(self):
        words = {
            "אָב": {
                "word": {"nikkud": "אָב", "ktiv_male": "אב"},
                "noun_inflection": {
                    "singular": {"nikkud": "אָב", "ktiv_male": "אב"},
                    "plural": {"nikkud": "אָבוֹת", "ktiv_male": "אבות"},
                    "pronominal_suffixes": {"1s": {"nikkud": "אָבִי", "ktiv_male": "אבי"}},
                },
            }
        }
        nmap = build_nikkud_map(words)
        assert nmap["אָבוֹת"] == "אבות"
        assert nmap["אָבִי"] == "אבי"
    def test_maps_adjective_inflections(self):
        words = {
            "גָּדוֹל": {
                "word": {"nikkud": "גָּדוֹל", "ktiv_male": "גדול"},
                "adjective_inflection": {
                    "ms": {"nikkud": "גָּדוֹל", "ktiv_male": "גדול"},
                    "fs": {"nikkud": "גְּדוֹלָה", "ktiv_male": "גדולה"},
                    "mp": {"nikkud": "גְּדוֹלִים", "ktiv_male": "גדולים"},
                    "fp": {"nikkud": "גְּדוֹלוֹת", "ktiv_male": "גדולות"},
                },
            }
        }
        nmap = build_nikkud_map(words)
        assert nmap["גְּדוֹלָה"] == "גדולה"
        assert nmap["גְּדוֹלִים"] == "גדולים"
    def test_construct_forms_strip_maqaf(self):
        words = {
            "בֵּית": {
                "word": {"nikkud": "בֵּית", "ktiv_male": "בית"},
                "noun_inflection": {
                    "construct_singular": {"nikkud": "בֵּית־", "ktiv_male": "בית"},
                },
            }
        }
        nmap = build_nikkud_map(words)
        assert "בֵּית־" in nmap
        assert "בֵּית" in nmap
    def test_handles_missing_fields(self):
        words = {
            "test": {
                "word": {"nikkud": "טֶסְט", "ktiv_male": "טסט"},
                "conjugation": None,
                "noun_inflection": None,
                "adjective_inflection": None,
            }
        }
        nmap = build_nikkud_map(words)
        assert nmap["טֶסְט"] == "טסט"
    def test_real_words_json_coverage(self):
        words_path = Path(__file__).parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")
        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)
        nmap = build_nikkud_map(words)
        assert len(nmap) > 90_000
 class TestResolveTokenFrequency:
    @pytest.fixture()
    def freq_setup(self):
        frequency_lookup.load()
        freq_data = frequency_lookup.get_freq_data()
        words_path = Path(__file__).parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")
        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)
        from epub_examples import _build_nikkud_index
        nikkud_map = build_nikkud_map(words)
        nikkud_index = _build_nikkud_index(words)
        return nikkud_map, nikkud_index, freq_data
    def test_tier1_known_mapping(self, freq_setup):
        nikkud_map, nikkud_index, freq_data = freq_setup
        rank = _resolve_token_frequency("אָב", nikkud_map, nikkud_index, freq_data)
        assert rank is not None
        assert rank < 50_000
    def test_tier3_academy_converter(self, freq_setup):
        nikkud_map, nikkud_index, freq_data = freq_setup
        rank = _resolve_token_frequency("שָׁלוֹם", nikkud_map, nikkud_index, freq_data)
        assert rank is not None
        assert rank < 1000
    def test_unknown_token_returns_default(self, freq_setup):
        nikkud_map, nikkud_index, freq_data = freq_setup
        rank = _resolve_token_frequency("קְסַנְתּוֹפּוּלוֹס", nikkud_map, nikkud_index, freq_data)
        assert rank == 50_000
    def test_tier5_ktiv_male_prefix_strip(self, freq_setup):
        nikkud_map, nikkud_index, freq_data = freq_setup
        assert freq_data.get("שלום") is not None
 class TestScoreSentence:
    @pytest.fixture()
    def scoring_setup(self):
        frequency_lookup.load()
        freq_data = frequency_lookup.get_freq_data()
        words_path = Path(__file__).parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")
        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)
        from epub_examples import _build_nikkud_index
        nikkud_map = build_nikkud_map(words)
        nikkud_index = _build_nikkud_index(words)
        return nikkud_map, nikkud_index, freq_data
    def test_returns_integer(self, scoring_setup):
        nmap, nidx, freq = scoring_setup
        text = "הוּא הָלַךְ הַבַּיְתָה"
        start = text.index("הָלַךְ")
        end = start + len("הָלַךְ")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)
    def test_easy_sentence_scores_lower(self, scoring_setup):
        nmap, nidx, freq = scoring_setup
        easy = "הוּא אָמַר שָׁלוֹם"
        easy_start = easy.index("אָמַר")
        easy_end = easy_start + len("אָמַר")
        hard = "הַפַּרְדֵּס נִשְׁתַּטֵּחַ בַּדַּהֲרָה"
        hard_start = hard.index("נִשְׁתַּטֵּחַ")
        hard_end = hard_start + len("נִשְׁתַּטֵּחַ")
        easy_score = score_sentence(easy, easy_start, easy_end, nmap, nidx, freq)
        hard_score = score_sentence(hard, hard_start, hard_end, nmap, nidx, freq)
        assert easy_score < hard_score
    def test_single_context_token(self, scoring_setup):
        nmap, nidx, freq = scoring_setup
        text = "הוּא טוֹב"
        start = 0
        end = len("הוּא")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)
    def test_handles_punctuation(self, scoring_setup):
        nmap, nidx, freq = scoring_setup
        text = '"הוּא טוֹב!"'
        start = text.index("טוֹב")
        end = start + len("טוֹב")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)
    def test_splits_on_maqaf(self, scoring_setup):
        nmap, nidx, freq = scoring_setup
        text = "בֵּית־סֵפֶר גָּדוֹל"
        start = text.index("גָּדוֹל")
        end = start + len("גָּדוֹל")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)
    def test_no_context_tokens_returns_default(self, scoring_setup):
        nmap, nidx, freq = scoring_setup
        text = "א ב"
        score = score_sentence(text, 0, 1, nmap, nidx, freq)
        assert score == DEFAULT_RANK