"""Sentence difficulty scoring by context-word frequency. Scores sentences by the median frequency rank of context words (excluding the cloze target). Lower score = easier sentence. Used by epub_examples.py to select the best cloze sentence. """ from __future__ import annotations from statistics import median import helpers import nikkud_to_ktiv_male DEFAULT_RANK = 50_000 # Hebrew prefix consonants for ktiv_male prefix stripping (tier 5) _KM_PREFIX_CHARS = set("בהוכלמשע") # Punctuation to strip from tokens _PUNCT = set('.,!?;:"\'"״׳–—()[]{}') # Maqaf (Hebrew hyphen) — splits tokens _MAQAF = "־" def build_nikkud_map(words: dict) -> dict[str, str]: """Build nikkud→ktiv_male lookup from words.json. Indexes: headwords, conjugation forms (active, passive, infinitive, reference_form), noun inflections (singular, plural, construct, pronominal suffixes), and adjective inflections (ms/fs/mp/fp). Args: words: The full words.json dict keyed by unique_key. Returns: Dict mapping nikkud form to ktiv_male string. When collisions occur, last-write wins (acceptable for frequency lookup). """ nmap: dict[str, str] = {} def _add(nikkud: str | None, ktiv_male: str | None) -> None: if nikkud and ktiv_male: nmap[nikkud] = ktiv_male for entry in words.values(): word = entry.get("word") or {} _add(word.get("nikkud"), word.get("ktiv_male")) # Conjugation forms conj = entry.get("conjugation") or {} for form_entry in conj.get("active_forms") or []: form = form_entry.get("form") or {} _add(form.get("nikkud"), form.get("ktiv_male")) for form_entry in conj.get("hufal_pual_forms") or []: form = form_entry.get("form") or {} _add(form.get("nikkud"), form.get("ktiv_male")) inf = conj.get("infinitive") or {} _add(inf.get("nikkud"), inf.get("ktiv_male")) ref = conj.get("reference_form") or {} _add(ref.get("nikkud"), ref.get("ktiv_male")) # Noun inflection forms noun = entry.get("noun_inflection") or {} for field in ("singular", "plural", "construct_singular", "construct_plural"): sub = noun.get(field) or {} nikkud_form = sub.get("nikkud") ktiv = sub.get("ktiv_male") _add(nikkud_form, ktiv) # Index construct forms without maqaf if nikkud_form and nikkud_form.endswith("־") and ktiv: _add(nikkud_form[:-1], ktiv) pronominal = noun.get("pronominal_suffixes") or {} for sub in pronominal.values(): if isinstance(sub, dict): _add(sub.get("nikkud"), sub.get("ktiv_male")) # Adjective inflection forms adj = entry.get("adjective_inflection") or {} for field in ("ms", "fs", "mp", "fp"): sub = adj.get(field) or {} _add(sub.get("nikkud"), sub.get("ktiv_male")) return nmap def _resolve_token_frequency( token: str, nikkud_map: dict[str, str], nikkud_index: dict, freq_data: dict[str, int], ) -> int: """Resolve a nikkud sentence token to its frequency rank. Uses a 5-tier pipeline: 1. Known mapping (nikkud_map from words.json) 2. Nikkud prefix stripping (epub_examples.try_strip_prefix) 3. Academy rules converter (nikkud_to_ktiv_male.convert) 4. strip_nikkud fallback (helpers.strip_nikkud) 5. Ktiv_male prefix stripping on the converted form Returns: Frequency rank (1 = most common). DEFAULT_RANK (50000) if not found. """ # Tier 1: Direct lookup in nikkud→ktiv_male map ktiv = nikkud_map.get(token) if ktiv and ktiv in freq_data: return freq_data[ktiv] # Tier 2: Nikkud prefix stripping → resolve remainder via nikkud_map from epub_examples import try_strip_prefix prefix_hits = try_strip_prefix(token, nikkud_index) for _unique_key, _match_type, matched_remainder in prefix_hits: remainder_ktiv = nikkud_map.get(matched_remainder) if remainder_ktiv and remainder_ktiv in freq_data: return freq_data[remainder_ktiv] # Tier 3: Academy rules converter converted = nikkud_to_ktiv_male.convert(token) if converted in freq_data: return freq_data[converted] # Tier 4: strip_nikkud fallback stripped = helpers.strip_nikkud(token) if stripped != converted and stripped in freq_data: return freq_data[stripped] # Tier 5: Ktiv_male prefix stripping on converted/stripped form for form in (converted, stripped): for prefix_len in (1, 2): if len(form) > prefix_len + 1: prefix = form[:prefix_len] if all(c in _KM_PREFIX_CHARS for c in prefix): stem = form[prefix_len:] if stem in freq_data: return freq_data[stem] return DEFAULT_RANK def score_sentence( text: str, target_start: int, target_end: int, nikkud_map: dict[str, str], nikkud_index: dict, freq_data: dict[str, int], ) -> int: """Score a sentence by median frequency rank of context words. Args: text: The full sentence text (with nikkud). target_start: Character offset where the cloze target word starts. target_end: Character offset where the cloze target word ends. nikkud_map: nikkud→ktiv_male mapping from build_nikkud_map(). nikkud_index: nikkud index from epub_examples._build_nikkud_index(). freq_data: Frequency dict from frequency_lookup.get_freq_data(). Returns: Median frequency rank of context tokens (int). Lower = easier. Returns DEFAULT_RANK if no scoreable context tokens. """ # Tokenize: split on whitespace, then split on maqaf raw_tokens = text.split() tokens_with_pos: list[tuple[str, int, int]] = [] pos = 0 for raw in raw_tokens: start = text.index(raw, pos) # Split on maqaf parts = raw.split(_MAQAF) sub_pos = start for part in parts: if part: tokens_with_pos.append((part, sub_pos, sub_pos + len(part))) sub_pos += len(part) + 1 # +1 for maqaf pos = start + len(raw) # Filter: exclude target word, strip punctuation, skip short tokens context_ranks: list[int] = [] for token, tok_start, tok_end in tokens_with_pos: # Exclude target word by overlap with char offsets if tok_start < target_end and tok_end > target_start: continue # Strip punctuation from edges cleaned = token.strip("".join(_PUNCT)) if len(cleaned) < 2: continue rank = _resolve_token_frequency(cleaned, nikkud_map, nikkud_index, freq_data) context_ranks.append(rank) if not context_ranks: return DEFAULT_RANK return int(median(context_ranks))