# ---------------------------------------------------------------------------
# File: sentence_difficulty.py  (new file in this patch)
# ---------------------------------------------------------------------------
"""Sentence difficulty scoring by context-word frequency.

Scores sentences by the median frequency rank of context words
(excluding the cloze target). Lower score = easier sentence.
Used by epub_examples.py to select the best cloze sentence.
"""

from __future__ import annotations

from statistics import median

import helpers
import nikkud_to_ktiv_male

# Rank assigned to any token that cannot be resolved to a known frequency.
DEFAULT_RANK = 50_000

# Hebrew prefix consonants for ktiv_male prefix stripping (tier 5)
# NOTE(review): the standard Hebrew one-letter prefixes are ב ה ו כ ל מ ש;
# ע is not among them — confirm whether its inclusion here is intentional.
_KM_PREFIX_CHARS = set("בהוכלמשע")

# Punctuation to strip from tokens
_PUNCT = set('.,!?;:"\'"״׳–—()[]{}')

# Same characters as a string, built once so str.strip() can use it per token.
_PUNCT_STRIP = "".join(_PUNCT)

# Maqaf (Hebrew hyphen) — splits tokens
_MAQAF = "־"


def build_nikkud_map(words: dict) -> dict[str, str]:
    """Build nikkud→ktiv_male lookup from words.json.

    Indexes: headwords, conjugation forms (active, passive, infinitive,
    reference_form), noun inflections (singular, plural, construct,
    pronominal suffixes), and adjective inflections (ms/fs/mp/fp).

    Args:
        words: The full words.json dict keyed by unique_key.

    Returns:
        Dict mapping nikkud form to ktiv_male string.
        When collisions occur, last-write wins (acceptable for frequency lookup).
    """
    nmap: dict[str, str] = {}

    def _add(nikkud: str | None, ktiv_male: str | None) -> None:
        # Entries with a missing/empty side are silently skipped.
        if nikkud and ktiv_male:
            nmap[nikkud] = ktiv_male

    def _add_form(form: dict | None) -> None:
        form = form or {}
        _add(form.get("nikkud"), form.get("ktiv_male"))

    for entry in words.values():
        _add_form(entry.get("word"))

        # Conjugation forms (insertion order matters for last-write-wins:
        # active, then hufal/pual, then infinitive, then reference form).
        conj = entry.get("conjugation") or {}
        for list_key in ("active_forms", "hufal_pual_forms"):
            for form_entry in conj.get(list_key) or []:
                _add_form(form_entry.get("form"))
        _add_form(conj.get("infinitive"))
        _add_form(conj.get("reference_form"))

        # Noun inflection forms
        noun = entry.get("noun_inflection") or {}
        for field in ("singular", "plural", "construct_singular", "construct_plural"):
            sub = noun.get(field) or {}
            nikkud_form = sub.get("nikkud")
            ktiv = sub.get("ktiv_male")
            _add(nikkud_form, ktiv)
            # Construct forms may be stored with a trailing maqaf; also index
            # them without it, because sentence tokens are split on maqaf.
            if nikkud_form and nikkud_form.endswith(_MAQAF) and ktiv:
                _add(nikkud_form[: -len(_MAQAF)], ktiv)
        pronominal = noun.get("pronominal_suffixes") or {}
        for sub in pronominal.values():
            if isinstance(sub, dict):
                _add_form(sub)

        # Adjective inflection forms
        adj = entry.get("adjective_inflection") or {}
        for field in ("ms", "fs", "mp", "fp"):
            _add_form(adj.get(field))

    return nmap


def _resolve_token_frequency(
    token: str,
    nikkud_map: dict[str, str],
    nikkud_index: dict,
    freq_data: dict[str, int],
) -> int:
    """Resolve a nikkud sentence token to its frequency rank.

    Uses a 5-tier pipeline; the first tier that yields a form present in
    ``freq_data`` wins:
        1. Known mapping (nikkud_map from words.json)
        2. Nikkud prefix stripping (epub_examples.try_strip_prefix)
        3. Academy rules converter (nikkud_to_ktiv_male.convert)
        4. strip_nikkud fallback (helpers.strip_nikkud)
        5. Ktiv_male prefix stripping on the converted/stripped form

    Args:
        token: A single sentence token (punctuation already removed).
        nikkud_map: nikkud→ktiv_male mapping from build_nikkud_map().
        nikkud_index: Index passed through to epub_examples.try_strip_prefix.
        freq_data: Mapping of ktiv_male form to frequency rank.

    Returns:
        Frequency rank (1 = most common). DEFAULT_RANK (50000) if not found.
    """
    # Tier 1: Direct lookup in nikkud→ktiv_male map
    ktiv = nikkud_map.get(token)
    if ktiv and ktiv in freq_data:
        return freq_data[ktiv]

    # Tier 2: Nikkud prefix stripping → resolve remainder via nikkud_map.
    # Imported here rather than at module level; the module docstring says
    # epub_examples uses this module, so a top-level import would be circular.
    from epub_examples import try_strip_prefix

    prefix_hits = try_strip_prefix(token, nikkud_index)
    for _unique_key, _match_type, matched_remainder in prefix_hits:
        remainder_ktiv = nikkud_map.get(matched_remainder)
        if remainder_ktiv and remainder_ktiv in freq_data:
            return freq_data[remainder_ktiv]

    # Tier 3: Academy rules converter
    converted = nikkud_to_ktiv_male.convert(token)
    if converted in freq_data:
        return freq_data[converted]

    # Tier 4: strip_nikkud fallback (skipped when it equals the tier-3 form,
    # which has already been looked up)
    stripped = helpers.strip_nikkud(token)
    if stripped != converted and stripped in freq_data:
        return freq_data[stripped]

    # Tier 5: Ktiv_male prefix stripping on converted/stripped form.
    # Try one- then two-letter prefixes, requiring a stem of at least 2 chars.
    for form in (converted, stripped):
        for prefix_len in (1, 2):
            if len(form) <= prefix_len + 1:
                continue
            prefix, stem = form[:prefix_len], form[prefix_len:]
            if all(c in _KM_PREFIX_CHARS for c in prefix) and stem in freq_data:
                return freq_data[stem]

    return DEFAULT_RANK


def _tokenize_with_offsets(text: str) -> list[tuple[str, int, int]]:
    """Split text on whitespace and maqaf, returning (token, start, end) triples.

    Offsets are character positions into ``text``; ``end`` is exclusive.
    """
    tokens: list[tuple[str, int, int]] = []
    search_from = 0
    for raw in text.split():
        # Search from the end of the previous token so repeated words map to
        # their own occurrence rather than the first one in the sentence.
        raw_start = text.index(raw, search_from)
        part_start = raw_start
        for part in raw.split(_MAQAF):
            if part:
                tokens.append((part, part_start, part_start + len(part)))
            # Always step over the part and the maqaf that followed it, even
            # for empty parts, so later offsets stay aligned with ``text``.
            part_start += len(part) + len(_MAQAF)
        search_from = raw_start + len(raw)
    return tokens


def score_sentence(
    text: str,
    target_start: int,
    target_end: int,
    nikkud_map: dict[str, str],
    nikkud_index: dict,
    freq_data: dict[str, int],
) -> int:
    """Score a sentence by median frequency rank of context words.

    Args:
        text: The full sentence text (with nikkud).
        target_start: Character offset where the cloze target word starts.
        target_end: Character offset where the cloze target word ends.
        nikkud_map: nikkud→ktiv_male mapping from build_nikkud_map().
        nikkud_index: nikkud index from epub_examples._build_nikkud_index().
        freq_data: Frequency dict from frequency_lookup.get_freq_data().

    Returns:
        Median frequency rank of context tokens (int). Lower = easier.
        A fractional median (even number of tokens) is truncated by int().
        Returns DEFAULT_RANK if no scoreable context tokens.
    """
    context_ranks: list[int] = []
    for token, tok_start, tok_end in _tokenize_with_offsets(text):
        # Exclude target word by overlap with char offsets
        if tok_start < target_end and tok_end > target_start:
            continue

        # Strip punctuation from edges; skip tokens shorter than 2 characters
        cleaned = token.strip(_PUNCT_STRIP)
        if len(cleaned) < 2:
            continue

        context_ranks.append(
            _resolve_token_frequency(cleaned, nikkud_map, nikkud_index, freq_data)
        )

    if not context_ranks:
        return DEFAULT_RANK

    return int(median(context_ranks))


# ---------------------------------------------------------------------------
# File: tests/test_sentence_difficulty.py  (new file in this patch)
# ---------------------------------------------------------------------------
"""Tests for sentence difficulty scoring."""

import json
from pathlib import Path

import pytest

import frequency_lookup
from sentence_difficulty import DEFAULT_RANK, _resolve_token_frequency, build_nikkud_map, score_sentence

_WORDS_PATH = Path(__file__).parent.parent / "data" / "words.json"


def _load_words() -> dict:
    """Load data/words.json, skipping the calling test if it is absent."""
    if not _WORDS_PATH.exists():
        pytest.skip("words.json not available")
    with open(_WORDS_PATH, encoding="utf-8") as f:
        return json.load(f)


def _target_span(text: str, target: str) -> tuple:
    """Return (start, end) character offsets of the first ``target`` in ``text``."""
    start = text.index(target)
    return start, start + len(target)


@pytest.fixture(scope="module")
def lookup_tables():
    """Return (nikkud_map, nikkud_index, freq_data) built from project data.

    Module-scoped so words.json is parsed and indexed once, not per test.
    """
    frequency_lookup.load()
    freq_data = frequency_lookup.get_freq_data()
    words = _load_words()
    from epub_examples import _build_nikkud_index

    return build_nikkud_map(words), _build_nikkud_index(words), freq_data


class TestBuildNikkudMap:
    def test_maps_direct_headwords(self):
        words = {"אָב": {"word": {"nikkud": "אָב", "ktiv_male": "אב"}}}
        nmap = build_nikkud_map(words)
        assert nmap["אָב"] == "אב"

    def test_maps_conjugation_forms(self):
        words = {
            "שָׁמַר": {
                "word": {"nikkud": "שָׁמַר", "ktiv_male": "שמר"},
                "conjugation": {
                    "active_forms": [
                        {
                            "person": "1s",
                            "tense": "עָבָר",
                            "form": {"nikkud": "שָׁמַרְתִּי", "ktiv_male": "שמרתי"},
                        },
                    ],
                    "infinitive": {"nikkud": "לִשְׁמֹר", "ktiv_male": "לשמור"},
                    "reference_form": {"nikkud": "שָׁמַר", "ktiv_male": "שמר"},
                },
            }
        }
        nmap = build_nikkud_map(words)
        assert nmap["שָׁמַרְתִּי"] == "שמרתי"
        assert nmap["לִשְׁמֹר"] == "לשמור"

    def test_maps_noun_inflections(self):
        words = {
            "אָב": {
                "word": {"nikkud": "אָב", "ktiv_male": "אב"},
                "noun_inflection": {
                    "singular": {"nikkud": "אָב", "ktiv_male": "אב"},
                    "plural": {"nikkud": "אָבוֹת", "ktiv_male": "אבות"},
                    "pronominal_suffixes": {"1s": {"nikkud": "אָבִי", "ktiv_male": "אבי"}},
                },
            }
        }
        nmap = build_nikkud_map(words)
        assert nmap["אָבוֹת"] == "אבות"
        assert nmap["אָבִי"] == "אבי"

    def test_maps_adjective_inflections(self):
        words = {
            "גָּדוֹל": {
                "word": {"nikkud": "גָּדוֹל", "ktiv_male": "גדול"},
                "adjective_inflection": {
                    "ms": {"nikkud": "גָּדוֹל", "ktiv_male": "גדול"},
                    "fs": {"nikkud": "גְּדוֹלָה", "ktiv_male": "גדולה"},
                    "mp": {"nikkud": "גְּדוֹלִים", "ktiv_male": "גדולים"},
                    "fp": {"nikkud": "גְּדוֹלוֹת", "ktiv_male": "גדולות"},
                },
            }
        }
        nmap = build_nikkud_map(words)
        assert nmap["גְּדוֹלָה"] == "גדולה"
        assert nmap["גְּדוֹלִים"] == "גדולים"

    def test_construct_forms_strip_maqaf(self):
        words = {
            "בֵּית": {
                "word": {"nikkud": "בֵּית", "ktiv_male": "בית"},
                "noun_inflection": {
                    "construct_singular": {"nikkud": "בֵּית־", "ktiv_male": "בית"},
                },
            }
        }
        nmap = build_nikkud_map(words)
        assert "בֵּית־" in nmap
        assert "בֵּית" in nmap

    def test_handles_missing_fields(self):
        words = {
            "test": {
                "word": {"nikkud": "טֶסְט", "ktiv_male": "טסט"},
                "conjugation": None,
                "noun_inflection": None,
                "adjective_inflection": None,
            }
        }
        nmap = build_nikkud_map(words)
        assert nmap["טֶסְט"] == "טסט"

    def test_real_words_json_coverage(self):
        nmap = build_nikkud_map(_load_words())
        assert len(nmap) > 90_000


class TestResolveTokenFrequency:
    def test_tier1_known_mapping(self, lookup_tables):
        nikkud_map, nikkud_index, freq_data = lookup_tables
        rank = _resolve_token_frequency("אָב", nikkud_map, nikkud_index, freq_data)
        assert rank < DEFAULT_RANK

    def test_tier3_academy_converter(self, lookup_tables):
        nikkud_map, nikkud_index, freq_data = lookup_tables
        rank = _resolve_token_frequency("שָׁלוֹם", nikkud_map, nikkud_index, freq_data)
        assert rank < 1000

    def test_unknown_token_returns_default(self, lookup_tables):
        nikkud_map, nikkud_index, freq_data = lookup_tables
        rank = _resolve_token_frequency("קְסַנְתּוֹפּוּלוֹס", nikkud_map, nikkud_index, freq_data)
        assert rank == DEFAULT_RANK

    def test_tier5_ktiv_male_prefix_strip(self, lookup_tables):
        _nikkud_map, nikkud_index, freq_data = lookup_tables
        assert freq_data.get("שלום") is not None
        # An empty nikkud_map disables tiers 1-2, and a frequency table holding
        # only the bare stem makes tiers 3-4 miss, so the rank can only come
        # from tier 5 stripping the one-letter prefix.
        # NOTE(review): assumes convert()/strip_nikkud() accept an unpointed
        # token without raising — confirm against those helpers.
        rank = _resolve_token_frequency("ושלום", {}, nikkud_index, {"שלום": 7})
        assert rank == 7


class TestScoreSentence:
    def test_returns_integer(self, lookup_tables):
        nmap, nidx, freq = lookup_tables
        text = "הוּא הָלַךְ הַבַּיְתָה"
        start, end = _target_span(text, "הָלַךְ")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_easy_sentence_scores_lower(self, lookup_tables):
        nmap, nidx, freq = lookup_tables
        easy = "הוּא אָמַר שָׁלוֹם"
        easy_start, easy_end = _target_span(easy, "אָמַר")
        hard = "הַפַּרְדֵּס נִשְׁתַּטֵּחַ בַּדַּהֲרָה"
        hard_start, hard_end = _target_span(hard, "נִשְׁתַּטֵּחַ")
        easy_score = score_sentence(easy, easy_start, easy_end, nmap, nidx, freq)
        hard_score = score_sentence(hard, hard_start, hard_end, nmap, nidx, freq)
        assert easy_score < hard_score

    def test_single_context_token(self, lookup_tables):
        nmap, nidx, freq = lookup_tables
        text = "הוּא טוֹב"
        start = 0
        end = len("הוּא")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_handles_punctuation(self, lookup_tables):
        nmap, nidx, freq = lookup_tables
        text = '"הוּא טוֹב!"'
        start, end = _target_span(text, "טוֹב")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_splits_on_maqaf(self, lookup_tables):
        nmap, nidx, freq = lookup_tables
        text = "בֵּית־סֵפֶר גָּדוֹל"
        start, end = _target_span(text, "גָּדוֹל")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_no_context_tokens_returns_default(self, lookup_tables):
        nmap, nidx, freq = lookup_tables
        text = "א ב"
        score = score_sentence(text, 0, 1, nmap, nidx, freq)
        assert score == DEFAULT_RANK