diff --git a/epub_examples.py b/epub_examples.py index ed1f266..435da2e 100644 --- a/epub_examples.py +++ b/epub_examples.py @@ -18,7 +18,9 @@ import zipfile from html.parser import HTMLParser from pathlib import Path +import frequency_lookup from helpers import strip_nikkud +from sentence_difficulty import build_nikkud_map, score_sentence logger = logging.getLogger(__name__) @@ -658,6 +660,11 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) -> updated = 0 + # Build frequency scoring infrastructure (once for all words) + nikkud_index = _build_nikkud_index(words) + nikkud_map = build_nikkud_map(words) + freq_data = frequency_lookup.get_freq_data() + for unique_key, sent_list in matches.items(): if unique_key not in words: continue @@ -677,11 +684,18 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) -> prefix_only = [s for s in unique if "prefix" in s["match_method"]] pool = direct if direct else prefix_only - # Score: prefer 6–12 word sentences + # Score: prefer sentences with easier (more common) context words def _score(s: dict) -> tuple[int,]: - wc = s["word_count"] - length_score = abs(wc - 9) if not (6 <= wc <= 12) else 0 - return (length_score,) + return ( + score_sentence( + s["text"], + s["char_offset"], + s["char_end"], + nikkud_map, + nikkud_index, + freq_data, + ), + ) pool.sort(key=_score) best = pool[:3] @@ -716,6 +730,7 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) -> "cloze_word_end": top["char_end"], "cloze_hint": None, "cloze_guid": cloze_guid, + "difficulty_score": _score(top)[0], } elif is_confusable: examples.pop("cloze", None)