From 272a2a080db6e388e807f2d2333d2da19fff6324 Mon Sep 17 00:00:00 2001 From: Sochen Date: Sun, 15 Mar 2026 13:29:22 +0000 Subject: [PATCH] Task 7: Replace length-based scoring with frequency-based scoring in update_words_json - Import `sentence_difficulty.build_nikkud_map`, `score_sentence`, and `frequency_lookup` - Build `nikkud_index`, `nikkud_map`, `freq_data` once before the per-word loop - Rewrite the `_score()` closure to delegate to `score_sentence()` (median context-word rank) - Store `difficulty_score` in cloze dict for downstream use Co-Authored-By: Claude Sonnet 4.6 --- epub_examples.py | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/epub_examples.py b/epub_examples.py index ed1f266..435da2e 100644 --- a/epub_examples.py +++ b/epub_examples.py @@ -18,7 +18,9 @@ import zipfile from html.parser import HTMLParser from pathlib import Path +import frequency_lookup from helpers import strip_nikkud +from sentence_difficulty import build_nikkud_map, score_sentence logger = logging.getLogger(__name__) @@ -658,6 +660,11 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) -> updated = 0 + # Build frequency scoring infrastructure (once for all words) + nikkud_index = _build_nikkud_index(words) + nikkud_map = build_nikkud_map(words) + freq_data = frequency_lookup.get_freq_data() + for unique_key, sent_list in matches.items(): if unique_key not in words: continue @@ -677,11 +684,18 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) -> prefix_only = [s for s in unique if "prefix" in s["match_method"]] pool = direct if direct else prefix_only - # Score: prefer 6–12 word sentences + # Score: prefer sentences with easier (more common) context words def _score(s: dict) -> tuple[int,]: - wc = s["word_count"] - length_score = abs(wc - 9) if not (6 <= wc <= 12) else 0 - return (length_score,) + return ( + score_sentence( + s["text"], + s["char_offset"], + s["char_end"], + nikkud_map, 
+ nikkud_index, + freq_data, + ), + ) pool.sort(key=_score) best = pool[:3] @@ -716,6 +730,7 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) -> "cloze_word_end": top["char_end"], "cloze_hint": None, "cloze_guid": cloze_guid, + "difficulty_score": _score(top)[0], } elif is_confusable: examples.pop("cloze", None)