From 272a2a080db6e388e807f2d2333d2da19fff6324 Mon Sep 17 00:00:00 2001
From: Sochen <sochen@nevo.engineer>
Date: Sun, 15 Mar 2026 13:29:22 +0000
Subject: [PATCH] Task 7: Replace length-based scoring with frequency-based
 scoring in update_words_json

- Import `build_nikkud_map` and `score_sentence` from `sentence_difficulty`, plus the `frequency_lookup` module
- Build `nikkud_index`, `nikkud_map`, `freq_data` once before the per-word loop
- Replace the length-based body of the `_score()` closure with a call to `score_sentence()` (median context-word rank)
- Store `difficulty_score` in cloze dict for downstream use

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 epub_examples.py | 23 +++++++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/epub_examples.py b/epub_examples.py
index ed1f266..435da2e 100644
--- a/epub_examples.py
+++ b/epub_examples.py
@@ -18,7 +18,9 @@ import zipfile
 from html.parser import HTMLParser
 from pathlib import Path
 
+import frequency_lookup
 from helpers import strip_nikkud
+from sentence_difficulty import build_nikkud_map, score_sentence
 
 logger = logging.getLogger(__name__)
 
@@ -658,6 +660,11 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
 
     updated = 0
 
+    # Build frequency scoring infrastructure (once for all words)
+    nikkud_index = _build_nikkud_index(words)
+    nikkud_map = build_nikkud_map(words)
+    freq_data = frequency_lookup.get_freq_data()
+
     for unique_key, sent_list in matches.items():
         if unique_key not in words:
             continue
@@ -677,11 +684,18 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
         prefix_only = [s for s in unique if "prefix" in s["match_method"]]
         pool = direct if direct else prefix_only
 
-        # Score: prefer 6–12 word sentences
+        # Score: prefer sentences with easier (more common) context words
         def _score(s: dict) -> tuple[int,]:
-            wc = s["word_count"]
-            length_score = abs(wc - 9) if not (6 <= wc <= 12) else 0
-            return (length_score,)
+            return (
+                score_sentence(
+                    s["text"],
+                    s["char_offset"],
+                    s["char_end"],
+                    nikkud_map,
+                    nikkud_index,
+                    freq_data,
+                ),
+            )
 
         pool.sort(key=_score)
         best = pool[:3]
@@ -716,6 +730,7 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
                 "cloze_word_end": top["char_end"],
                 "cloze_hint": None,
                 "cloze_guid": cloze_guid,
+                "difficulty_score": _score(top)[0],
             }
         elif is_confusable:
             examples.pop("cloze", None)