Task 7: Replace length-based scoring with frequency-based scoring in update_words_json
- Import `build_nikkud_map` and `score_sentence` from `sentence_difficulty`, and import `frequency_lookup`.
- Build `nikkud_index`, `nikkud_map`, and `freq_data` once, before the per-word loop.
- Replace the body of the `_score()` closure with a call to `score_sentence()` (median context-word rank).
- Store `difficulty_score` in the cloze dict for downstream use.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
fb12f806a8
commit
272a2a080d
1 changed file with 19 additions and 4 deletions
|
|
@ -18,7 +18,9 @@ import zipfile
|
||||||
from html.parser import HTMLParser
|
from html.parser import HTMLParser
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
import frequency_lookup
|
||||||
from helpers import strip_nikkud
|
from helpers import strip_nikkud
|
||||||
|
from sentence_difficulty import build_nikkud_map, score_sentence
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
|
@ -658,6 +660,11 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
|
||||||
|
|
||||||
updated = 0
|
updated = 0
|
||||||
|
|
||||||
|
# Build frequency scoring infrastructure (once for all words)
|
||||||
|
nikkud_index = _build_nikkud_index(words)
|
||||||
|
nikkud_map = build_nikkud_map(words)
|
||||||
|
freq_data = frequency_lookup.get_freq_data()
|
||||||
|
|
||||||
for unique_key, sent_list in matches.items():
|
for unique_key, sent_list in matches.items():
|
||||||
if unique_key not in words:
|
if unique_key not in words:
|
||||||
continue
|
continue
|
||||||
|
|
@ -677,11 +684,18 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
|
||||||
prefix_only = [s for s in unique if "prefix" in s["match_method"]]
|
prefix_only = [s for s in unique if "prefix" in s["match_method"]]
|
||||||
pool = direct if direct else prefix_only
|
pool = direct if direct else prefix_only
|
||||||
|
|
||||||
# Score: prefer 6–12 word sentences
|
# Score: prefer sentences with easier (more common) context words
|
||||||
def _score(s: dict) -> tuple[int,]:
|
def _score(s: dict) -> tuple[int,]:
|
||||||
wc = s["word_count"]
|
return (
|
||||||
length_score = abs(wc - 9) if not (6 <= wc <= 12) else 0
|
score_sentence(
|
||||||
return (length_score,)
|
s["text"],
|
||||||
|
s["char_offset"],
|
||||||
|
s["char_end"],
|
||||||
|
nikkud_map,
|
||||||
|
nikkud_index,
|
||||||
|
freq_data,
|
||||||
|
),
|
||||||
|
)
|
||||||
|
|
||||||
pool.sort(key=_score)
|
pool.sort(key=_score)
|
||||||
best = pool[:3]
|
best = pool[:3]
|
||||||
|
|
@ -716,6 +730,7 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
|
||||||
"cloze_word_end": top["char_end"],
|
"cloze_word_end": top["char_end"],
|
||||||
"cloze_hint": None,
|
"cloze_hint": None,
|
||||||
"cloze_guid": cloze_guid,
|
"cloze_guid": cloze_guid,
|
||||||
|
"difficulty_score": _score(top)[0],
|
||||||
}
|
}
|
||||||
elif is_confusable:
|
elif is_confusable:
|
||||||
examples.pop("cloze", None)
|
examples.pop("cloze", None)
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue