Task 7: Replace length-based scoring with frequency-based scoring in update_words_json

- Import `build_nikkud_map` and `score_sentence` from `sentence_difficulty`, plus the `frequency_lookup` module
- Build `nikkud_index`, `nikkud_map`, and `freq_data` once, before the per-word loop
- Replace the length-based body of the `_score()` closure with a call to `score_sentence()` (median context-word rank)
- Store `difficulty_score` in the cloze dict for downstream use

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-15 13:29:22 +00:00 · 2026-03-15 13:29:22 +00:00 · 272a2a080d
commit 272a2a080d
parent fb12f806a8
1 changed file with 19 additions and 4 deletions
--- a/epub_examples.py
+++ b/epub_examples.py
@@ -18,7 +18,9 @@ import zipfile
 from html.parser import HTMLParser
 from pathlib import Path
 import frequency_lookup
 from helpers import strip_nikkud
 from sentence_difficulty import build_nikkud_map, score_sentence
 logger = logging.getLogger(__name__)
@@ -658,6 +660,11 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
    updated = 0
    # Build frequency scoring infrastructure (once for all words)
    nikkud_index = _build_nikkud_index(words)
    nikkud_map = build_nikkud_map(words)
    freq_data = frequency_lookup.get_freq_data()
    for unique_key, sent_list in matches.items():
        if unique_key not in words:
            continue
@@ -677,11 +684,18 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
        prefix_only = [s for s in unique if "prefix" in s["match_method"]]
        pool = direct if direct else prefix_only
-        # Score: prefer 6–12 word sentences
+        # Score: prefer sentences with easier (more common) context words
        def _score(s: dict) -> tuple[int,]:
-            wc = s["word_count"]
+            return (
-            length_score = abs(wc - 9) if not (6 <= wc <= 12) else 0
+                score_sentence(
-            return (length_score,)
+                    s["text"],
                    s["char_offset"],
                    s["char_end"],
                    nikkud_map,
                    nikkud_index,
                    freq_data,
                ),
            )
        pool.sort(key=_score)
        best = pool[:3]
@@ -716,6 +730,7 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
                "cloze_word_end": top["char_end"],
                "cloze_hint": None,
                "cloze_guid": cloze_guid,
                "difficulty_score": _score(top)[0],
            }
        elif is_confusable:
            examples.pop("cloze", None)