From 14d567a261df5e0a950d9b1a6a9fafdff7b48761 Mon Sep 17 00:00:00 2001 From: Sochen Date: Sun, 15 Mar 2026 13:30:13 +0000 Subject: [PATCH] schema: add difficulty_score field + update spec with MIN_WORDS=3 Co-Authored-By: Claude Opus 4.6 --- SCHEMA.yaml | 1 + .../specs/2026-03-15-adaptive-sentence-difficulty-design.md | 2 ++ 2 files changed, 3 insertions(+) diff --git a/SCHEMA.yaml b/SCHEMA.yaml index f3b7450..2ae43d2 100644 --- a/SCHEMA.yaml +++ b/SCHEMA.yaml @@ -69,6 +69,7 @@ entry: cloze_word_end: 4 # End offset — enables exact extraction regardless of nikkud changes cloze_hint: "family member" cloze_guid: "def456..." # GUID for the cloze note + difficulty_score: 234 # Median frequency rank of context words (lower = easier); optional rejected_count: 0 # --- Noun-specific: Inflection Forms --- diff --git a/docs/superpowers/specs/2026-03-15-adaptive-sentence-difficulty-design.md b/docs/superpowers/specs/2026-03-15-adaptive-sentence-difficulty-design.md index 7ab5e58..3cffe9c 100644 --- a/docs/superpowers/specs/2026-03-15-adaptive-sentence-difficulty-design.md +++ b/docs/superpowers/specs/2026-03-15-adaptive-sentence-difficulty-design.md @@ -54,6 +54,8 @@ def _score(s: dict) -> tuple[int,]: New scoring replaces length with frequency-based difficulty. The `_score` function gains access to the frequency pipeline via closure over the nikkud_map, nikkud_index, and freq_data built once at the start of `update_words_json()`. +**Minimum sentence length:** Reduced from 4 words to 3 words (`MIN_WORDS = 3` in epub_examples.py). Hebrew is more concise than English — 3-word sentences are valid and common. This expands the candidate pool for cloze selection. + **Behavioral change:** Because `pool.sort(key=_score)` determines which 3 sentences are selected as `best = pool[:3]`, changing the scoring function changes **which sentences are selected**, not just their order. This is intentional — we want the easiest sentences as cloze candidates, not the closest-to-9-words ones. Existing cloze GUIDs will be preserved when the same sentence text is re-selected; entries where a different sentence wins will get new GUIDs. ## Data Model Changes