# Implements build_nikkud_map(), _resolve_token_frequency(), and score_sentence()
# for v0.20 adaptive cloze sentence selection.
# Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
# 198 lines, 6.8 KiB, Python
"""Sentence difficulty scoring by context-word frequency.
|
|
|
|
Scores sentences by the median frequency rank of context words
|
|
(excluding the cloze target). Lower score = easier sentence.
|
|
Used by epub_examples.py to select the best cloze sentence.
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from statistics import median
|
|
|
|
import helpers
|
|
import nikkud_to_ktiv_male
|
|
|
|
# Rank returned for a token that none of the lookup tiers can resolve, and
# for a sentence that has no scoreable context tokens.
DEFAULT_RANK: int = 50_000

# Consonants accepted as a 1- or 2-letter prefix by the ktiv_male prefix
# stripping step (tier 5).
# NOTE(review): ע is not one of the usual one-letter Hebrew prefixes —
# confirm that including it here is intentional.
_KM_PREFIX_CHARS: set[str] = set("בהוכלמשע")

# Punctuation characters stripped from the edges of each token.
_PUNCT: set[str] = set('.,!?;:"\'"״׳–—()[]{}')

# Maqaf (Hebrew hyphen); whitespace-separated tokens are split on it.
_MAQAF: str = "־"
|
|
|
|
|
|
def build_nikkud_map(words: dict) -> dict[str, str]:
    """Build a nikkud -> ktiv_male lookup from words.json.

    For every entry the following forms are indexed, in this order:

    1. the headword (``word``),
    2. conjugation forms (``active_forms``, ``hufal_pual_forms``,
       ``infinitive``, ``reference_form``),
    3. noun inflections (``singular``, ``plural``, ``construct_singular``,
       ``construct_plural``, then every ``pronominal_suffixes`` value),
    4. adjective inflections (``ms``/``fs``/``mp``/``fp``).

    A form is indexed only when both its ``nikkud`` and ``ktiv_male`` values
    are non-empty strings. Noun forms whose nikkud ends with a maqaf are
    additionally indexed without the trailing maqaf.

    Missing, ``None`` or wrongly-typed sections (for example a string where a
    dict is expected) are skipped instead of raising, so one malformed entry
    cannot abort the whole build.

    Args:
        words: The full words.json dict keyed by unique_key.

    Returns:
        Dict mapping nikkud form to ktiv_male string.
        When collisions occur, last-write wins (acceptable for frequency
        lookup).
    """
    nmap: dict[str, str] = {}

    def _add(nikkud: object, ktiv_male: object) -> None:
        # Only complete string pairs are useful as lookup entries.
        if isinstance(nikkud, str) and isinstance(ktiv_male, str):
            if nikkud and ktiv_male:
                nmap[nikkud] = ktiv_male

    def _add_form(form: object) -> None:
        # A "form" is a dict carrying "nikkud" and "ktiv_male" keys.
        if isinstance(form, dict):
            _add(form.get("nikkud"), form.get("ktiv_male"))

    def _section(container: dict, key: str) -> dict:
        # Return container[key] when it is a dict, otherwise an empty dict.
        value = container.get(key)
        return value if isinstance(value, dict) else {}

    for entry in words.values():
        if not isinstance(entry, dict):
            continue

        # Headword
        _add_form(entry.get("word"))

        # Conjugation forms
        conj = _section(entry, "conjugation")
        for list_key in ("active_forms", "hufal_pual_forms"):
            form_entries = conj.get(list_key)
            if not isinstance(form_entries, (list, tuple)):
                continue
            for form_entry in form_entries:
                if isinstance(form_entry, dict):
                    _add_form(form_entry.get("form"))
        _add_form(conj.get("infinitive"))
        _add_form(conj.get("reference_form"))

        # Noun inflection forms
        noun = _section(entry, "noun_inflection")
        for field in ("singular", "plural", "construct_singular", "construct_plural"):
            sub = _section(noun, field)
            nikkud_form = sub.get("nikkud")
            ktiv = sub.get("ktiv_male")
            _add(nikkud_form, ktiv)
            # Also index the form without its trailing maqaf.
            if isinstance(nikkud_form, str) and nikkud_form.endswith("־"):
                _add(nikkud_form[:-1], ktiv)
        for sub in _section(noun, "pronominal_suffixes").values():
            _add_form(sub)

        # Adjective inflection forms
        adj = _section(entry, "adjective_inflection")
        for field in ("ms", "fs", "mp", "fp"):
            _add_form(adj.get(field))

    return nmap
|
|
|
|
|
|
def _resolve_token_frequency(
    token: str,
    nikkud_map: dict[str, str],
    nikkud_index: dict,
    freq_data: dict[str, int],
) -> int:
    """Look up the frequency rank of one nikkud sentence token.

    The lookup falls through five tiers and returns on the first hit:

    1. ``nikkud_map[token]`` is present in ``freq_data``.
    2. ``epub_examples.try_strip_prefix(token, nikkud_index)`` yields a
       remainder whose ``nikkud_map`` value is present in ``freq_data``.
    3. ``nikkud_to_ktiv_male.convert(token)`` is present in ``freq_data``.
    4. ``helpers.strip_nikkud(token)`` differs from the tier-3 form and is
       present in ``freq_data``.
    5. The tier-3 or tier-4 form, with a 1- or 2-letter prefix made only of
       ``_KM_PREFIX_CHARS`` removed, is present in ``freq_data``.

    Args:
        token: The token to resolve (punctuation already removed by caller).
        nikkud_map: nikkud -> ktiv_male mapping.
        nikkud_index: Index passed through to ``try_strip_prefix``.
        freq_data: Mapping of ktiv_male word to frequency rank.

    Returns:
        Frequency rank (1 = most common), or DEFAULT_RANK when no tier hits.
    """
    # Tier 1: the token is a known form.
    known = nikkud_map.get(token)
    if known and known in freq_data:
        return freq_data[known]

    # Tier 2: strip a nikkud prefix and resolve what is left.
    # The import is kept local, and after tier 1, exactly as in the original.
    # NOTE(review): presumably this avoids a circular import — confirm.
    from epub_examples import try_strip_prefix

    for _key, _kind, remainder in try_strip_prefix(token, nikkud_index):
        candidate = nikkud_map.get(remainder)
        if candidate and candidate in freq_data:
            return freq_data[candidate]

    # Tier 3: rule-based nikkud -> ktiv_male conversion.
    converted = nikkud_to_ktiv_male.convert(token)
    if converted in freq_data:
        return freq_data[converted]

    # Tier 4: plain nikkud removal, only worth checking if it differs.
    stripped = helpers.strip_nikkud(token)
    if stripped != converted and stripped in freq_data:
        return freq_data[stripped]

    # Tier 5: drop a 1- or 2-letter prefix from either unpointed form.
    for candidate in (converted, stripped):
        for cut in (1, 2):
            # The stem left after the cut must be at least two characters.
            if len(candidate) <= cut + 1:
                continue
            head, stem = candidate[:cut], candidate[cut:]
            if not _KM_PREFIX_CHARS.issuperset(head):
                continue
            if stem in freq_data:
                return freq_data[stem]

    return DEFAULT_RANK
|
|
|
|
|
|
def score_sentence(
    text: str,
    target_start: int,
    target_end: int,
    nikkud_map: dict[str, str],
    nikkud_index: dict,
    freq_data: dict[str, int],
) -> int:
    """Return the median frequency rank of a sentence's context words.

    The sentence is split on whitespace and then on maqaf. Every resulting
    token that overlaps the ``[target_start, target_end)`` character span is
    treated as the cloze target and ignored. The remaining tokens have edge
    punctuation removed, are dropped if shorter than two characters, and are
    ranked with ``_resolve_token_frequency``.

    Args:
        text: The full sentence text (with nikkud).
        target_start: Character offset where the cloze target word starts.
        target_end: Character offset where the cloze target word ends.
        nikkud_map: nikkud -> ktiv_male mapping from build_nikkud_map().
        nikkud_index: nikkud index from epub_examples._build_nikkud_index().
        freq_data: Frequency dict from frequency_lookup.get_freq_data().

    Returns:
        Median rank of the context tokens, truncated to int. Lower = easier.
        DEFAULT_RANK when no context token is scoreable.
    """
    edge_punct = "".join(_PUNCT)
    ranks: list[int] = []

    search_from = 0
    for chunk in text.split():
        # Locate the chunk in the original text so offsets stay comparable
        # with target_start / target_end.
        chunk_start = text.index(chunk, search_from)
        search_from = chunk_start + len(chunk)

        next_begin = chunk_start
        for piece in chunk.split(_MAQAF):
            begin = next_begin
            end = begin + len(piece)
            next_begin = end + 1  # step over the maqaf that followed

            if not piece:
                continue

            # Anything overlapping the target span is the cloze word itself.
            if begin < target_end and end > target_start:
                continue

            word = piece.strip(edge_punct)
            # NOTE(review): len() counts code points of the pointed text, so
            # nikkud marks presumably count toward this minimum — confirm.
            if len(word) < 2:
                continue

            ranks.append(
                _resolve_token_frequency(word, nikkud_map, nikkud_index, freq_data)
            )

    return int(median(ranks)) if ranks else DEFAULT_RANK
|