Task 7: Replace length-based scoring with frequency-based scoring in update_words_json

- Import `build_nikkud_map` and `score_sentence` from `sentence_difficulty`, plus the `frequency_lookup` module
- Build `nikkud_index`, `nikkud_map`, `freq_data` once before the per-word loop
- Replace the body of the `_score()` closure with a call to `score_sentence()` (median context-word rank)
- Store `difficulty_score` in cloze dict for downstream use

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Sochen 2026-03-15 13:29:22 +00:00
parent fb12f806a8
commit 272a2a080d

View file

@@ -18,7 +18,9 @@ import zipfile
from html.parser import HTMLParser from html.parser import HTMLParser
from pathlib import Path from pathlib import Path
import frequency_lookup
from helpers import strip_nikkud from helpers import strip_nikkud
from sentence_difficulty import build_nikkud_map, score_sentence
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@@ -658,6 +660,11 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
updated = 0 updated = 0
# Build frequency scoring infrastructure (once for all words)
nikkud_index = _build_nikkud_index(words)
nikkud_map = build_nikkud_map(words)
freq_data = frequency_lookup.get_freq_data()
for unique_key, sent_list in matches.items(): for unique_key, sent_list in matches.items():
if unique_key not in words: if unique_key not in words:
continue continue
@@ -677,11 +684,18 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
prefix_only = [s for s in unique if "prefix" in s["match_method"]] prefix_only = [s for s in unique if "prefix" in s["match_method"]]
pool = direct if direct else prefix_only pool = direct if direct else prefix_only
# Score: prefer 6-12 word sentences # Score: prefer sentences with easier (more common) context words
def _score(s: dict) -> tuple[int,]: def _score(s: dict) -> tuple[int,]:
wc = s["word_count"] return (
length_score = abs(wc - 9) if not (6 <= wc <= 12) else 0 score_sentence(
return (length_score,) s["text"],
s["char_offset"],
s["char_end"],
nikkud_map,
nikkud_index,
freq_data,
),
)
pool.sort(key=_score) pool.sort(key=_score)
best = pool[:3] best = pool[:3]
@@ -716,6 +730,7 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
"cloze_word_end": top["char_end"], "cloze_word_end": top["char_end"],
"cloze_hint": None, "cloze_hint": None,
"cloze_guid": cloze_guid, "cloze_guid": cloze_guid,
"difficulty_score": _score(top)[0],
} }
elif is_confusable: elif is_confusable:
examples.pop("cloze", None) examples.pop("cloze", None)