Compare commits
17 commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 6d2d446ed5 | |||
| f978e5f39a | |||
| 5f617af4ba | |||
| f3496998f5 | |||
| 138acb06d8 | |||
| 0a85291975 | |||
| 14d567a261 | |||
| 8b24d0fd26 | |||
| 272a2a080d | |||
| fb12f806a8 | |||
| 00fba934fb | |||
| d2a7c9d483 | |||
| d0f4aea58d | |||
| b3ea086e85 | |||
| af186e2030 | |||
| 0d92451271 | |||
| c85063ee2f |
22 changed files with 115712 additions and 32137 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -15,6 +15,7 @@ __pycache__/
|
|||
|
||||
# Large generated cache files (rebuild locally)
|
||||
data/benyehuda_index.json
|
||||
data/colliding_forms.json
|
||||
|
||||
# Audio directories (large; rebuild locally)
|
||||
data/audio/
|
||||
|
|
|
|||
|
|
@ -27,6 +27,7 @@ entry:
|
|||
pos_hebrew: "שֵׁם עֶצֶם" # Part of speech in Hebrew (with nikkud)
|
||||
meaning: "father" # English meaning (cleaned — no inline emoji, no Hebrew prepositions)
|
||||
meaning_raw: "father 👨" # Original meaning as scraped (may contain emoji and/or Hebrew preps)
|
||||
prep: "על" # Hebrew preposition(s) governing this word, extracted from meaning_raw (e.g. "(על)" → "על"); null if none
|
||||
audio_url: "https://..." # Pealim audio URL
|
||||
audio_file: "6009-av.mp3" # Local filename (slug-based for confusables, consonant-based otherwise)
|
||||
tags: "" # Pealim tags if any
|
||||
|
|
@ -68,6 +69,7 @@ entry:
|
|||
cloze_word_end: 4 # End offset — enables exact extraction regardless of nikkud changes
|
||||
cloze_hint: "family member"
|
||||
cloze_guid: "def456..." # GUID for the cloze note
|
||||
difficulty_score: 234 # Median frequency rank of context words (lower = easier); optional
|
||||
rejected_count: 0
|
||||
|
||||
# --- Noun-specific: Inflection Forms ---
|
||||
|
|
|
|||
522
apkg_builder.py
522
apkg_builder.py
|
|
@ -35,7 +35,7 @@ COMPLETE_PLURAL_DECK_ID = 1_234_567_903
|
|||
|
||||
# Release version tag added to all notes so users can identify which release
|
||||
# their cards come from (visible in Anki's Browse view and card info).
|
||||
RELEASE_TAG = "v0.16"
|
||||
RELEASE_TAG = "v0.20"
|
||||
|
||||
# Regex for extracting emoji and Hebrew prepositions from meaning strings
|
||||
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
|
||||
|
|
@ -152,12 +152,6 @@ CARD_CSS = """
|
|||
direction: rtl;
|
||||
text-align: center;
|
||||
}
|
||||
.root-info {
|
||||
font-size: 26px;
|
||||
color: #222;
|
||||
margin-top: 6px;
|
||||
direction: rtl;
|
||||
}
|
||||
.example {
|
||||
font-size: 24px;
|
||||
color: #222;
|
||||
|
|
@ -185,39 +179,100 @@ CARD_CSS = """
|
|||
font-weight: normal;
|
||||
color: #555;
|
||||
}
|
||||
.sec-table {
|
||||
display: table;
|
||||
margin: 6px auto 0;
|
||||
direction: rtl;
|
||||
border-collapse: collapse;
|
||||
}
|
||||
.sec-label {
|
||||
display: table-row;
|
||||
font-size: 28px;
|
||||
font-weight: normal;
|
||||
color: #222;
|
||||
direction: rtl;
|
||||
text-align: center;
|
||||
margin-top: 6px;
|
||||
}
|
||||
.sec-key {
|
||||
display: table-cell;
|
||||
font-size: 28px;
|
||||
color: #222;
|
||||
font-weight: bold;
|
||||
text-align: right;
|
||||
padding: 2px 0 2px 8px;
|
||||
white-space: nowrap;
|
||||
}
|
||||
.sec-val {
|
||||
display: table-cell;
|
||||
font-size: 28px;
|
||||
color: #222;
|
||||
text-align: right;
|
||||
padding: 2px 0;
|
||||
}
|
||||
.definitions {
|
||||
direction: rtl;
|
||||
text-align: center;
|
||||
}
|
||||
.more-toggle {
|
||||
text-align: center;
|
||||
direction: rtl;
|
||||
margin-top: 8px;
|
||||
}
|
||||
.more-header {
|
||||
display: inline-block;
|
||||
font-size: 18px;
|
||||
color: #555;
|
||||
cursor: pointer;
|
||||
list-style: none;
|
||||
border: 1px solid #ccc;
|
||||
border-radius: 16px;
|
||||
padding: 4px 16px;
|
||||
margin: 4px 0;
|
||||
background: #f8f8f8;
|
||||
}
|
||||
.more-header::-webkit-details-marker { display: none; }
|
||||
.more-header::before { content: "○ "; font-size: 14px; }
|
||||
details[open] > .more-header::before { content: "● "; }
|
||||
.related-header {
|
||||
font-size: 22px;
|
||||
color: #555;
|
||||
text-align: center;
|
||||
margin: 4px 0;
|
||||
}
|
||||
.rw-word {
|
||||
display: table-cell;
|
||||
font-size: 28px;
|
||||
color: #222;
|
||||
font-weight: normal;
|
||||
text-align: right;
|
||||
padding: 2px 0 2px 8px;
|
||||
white-space: nowrap;
|
||||
}
|
||||
.rw-meaning {
|
||||
display: table-cell;
|
||||
font-size: 24px;
|
||||
color: #555;
|
||||
text-align: left;
|
||||
direction: ltr;
|
||||
padding: 2px 0;
|
||||
}
|
||||
.conf-entry {
|
||||
margin: 8px 0;
|
||||
font-size: 28px;
|
||||
direction: rtl;
|
||||
}
|
||||
.related-group {
|
||||
direction: rtl;
|
||||
text-align: center;
|
||||
margin: 2px 0;
|
||||
font-size: 26px;
|
||||
}
|
||||
.emoji-img {
|
||||
font-size: 3.5em;
|
||||
text-align: center;
|
||||
margin: 0.3em 0;
|
||||
}
|
||||
.plural-direction {
|
||||
font-size: 32px;
|
||||
color: #444;
|
||||
text-align: center;
|
||||
direction: rtl;
|
||||
margin: 8px 0;
|
||||
font-weight: bold;
|
||||
}
|
||||
.card [type="button"], .card button, .replay-button {
|
||||
display: block !important;
|
||||
margin: 4px auto !important;
|
||||
|
|
@ -228,16 +283,39 @@ CARD_CSS = """
|
|||
.hebrew { color: #f0f0f0; }
|
||||
.hebrew-sm { color: #e0e0e0; }
|
||||
.meaning { color: #82b0ff; }
|
||||
.root-info { color: #e0e0e0; }
|
||||
.sec-label { color: #e0e0e0; }
|
||||
.sec-key { color: #e0e0e0; }
|
||||
.sec-val { color: #e0e0e0; }
|
||||
.conf-entry { color: #ddd; }
|
||||
.hint { color: #777; }
|
||||
.voice-label { color: #888; }
|
||||
.example { color: #e0e0e0; border-right-color: #555; }
|
||||
.divider { border-top-color: #333; }
|
||||
.freq-badge { color: #888; border-color: #444; }
|
||||
.more-header { color: #bbb; background: #2a2a2e; border-color: #555; }
|
||||
.related-header { color: #999; }
|
||||
.rw-word { color: #e0e0e0; }
|
||||
.rw-meaning { color: #999; }
|
||||
.plural-direction { color: #aaa; }
|
||||
}
|
||||
.nightMode .card { color: #e8e8e8; background: #1c1c1e; }
|
||||
.nightMode .hebrew { color: #f0f0f0; }
|
||||
.nightMode .hebrew-sm { color: #e0e0e0; }
|
||||
.nightMode .meaning { color: #82b0ff; }
|
||||
.nightMode .sec-label { color: #e0e0e0; }
|
||||
.nightMode .sec-key { color: #e0e0e0; }
|
||||
.nightMode .sec-val { color: #e0e0e0; }
|
||||
.nightMode .conf-entry { color: #ddd; }
|
||||
.nightMode .hint { color: #777; }
|
||||
.nightMode .voice-label { color: #888; }
|
||||
.nightMode .example { color: #e0e0e0; border-right-color: #555; }
|
||||
.nightMode .divider { border-top-color: #333; }
|
||||
.nightMode .freq-badge { color: #888; border-color: #444; }
|
||||
.nightMode .more-header { color: #bbb; background: #2a2a2e; border-color: #555; }
|
||||
.nightMode .related-header { color: #999; }
|
||||
.nightMode .rw-word { color: #e0e0e0; }
|
||||
.nightMode .rw-meaning { color: #999; }
|
||||
.nightMode .plural-direction { color: #aaa; }
|
||||
"""
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
|
@ -255,14 +333,19 @@ VOCAB_BACK_HEB = """
|
|||
<div class="meaning">{{Meaning}}</div>
|
||||
{{#Emoji}}<div class="emoji-img">{{Emoji}}</div>{{/Emoji}}
|
||||
{{^Emoji}}{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;" onerror="this.parentElement.style.display='none'"></div>{{/Image}}{{/Emoji}}
|
||||
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>{{/Root}}
|
||||
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span> {{PoS}}</div>{{/PoS}}
|
||||
<details class="more-toggle"><summary class="more-header">מידע נוסף</summary>
|
||||
<div class="sec-table">
|
||||
{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span><span class="sec-val">{{WordNoNikkud}}</span></div>{{/WordNoNikkud}}
|
||||
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">{{Root}}</span></div>{{/Root}}
|
||||
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span><span class="sec-val">{{PoS}}{{#Gender}}, {{Gender}}{{/Gender}}</span></div>{{/PoS}}
|
||||
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span><span class="sec-val">{{Plural}}</span></div>{{/Plural}}
|
||||
</div>
|
||||
{{#SharedRoots}}
|
||||
<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
|
||||
<div class="root-info">{{SharedRoots}}</div>
|
||||
<div class="divider" style="margin:6px 0;"></div>
|
||||
<div class="related-header" style="cursor:default;">מִילִים קְשׁוּרוֹת</div>
|
||||
<div class="sec-table">{{SharedRoots}}</div>
|
||||
{{/SharedRoots}}
|
||||
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
|
||||
{{#Frequency}}<div class="freq-badge">#{{Frequency}}</div>{{/Frequency}}
|
||||
</details>
|
||||
"""
|
||||
|
||||
VOCAB_FRONT_ENG = """
|
||||
|
|
@ -277,14 +360,19 @@ VOCAB_BACK_ENG = """
|
|||
<div class="divider"></div>
|
||||
<div class="hebrew">{{Word}}{{#Prep}} <span class="hebrew-sm">{{Prep}}</span>{{/Prep}}</div>
|
||||
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
|
||||
{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span> {{WordNoNikkud}}</div>{{/WordNoNikkud}}
|
||||
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>{{/Root}}
|
||||
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span> {{PoS}}</div>{{/PoS}}
|
||||
<details class="more-toggle"><summary class="more-header">מידע נוסף</summary>
|
||||
<div class="sec-table">
|
||||
{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span><span class="sec-val">{{WordNoNikkud}}</span></div>{{/WordNoNikkud}}
|
||||
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">{{Root}}</span></div>{{/Root}}
|
||||
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span><span class="sec-val">{{PoS}}{{#Gender}}, {{Gender}}{{/Gender}}</span></div>{{/PoS}}
|
||||
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span><span class="sec-val">{{Plural}}</span></div>{{/Plural}}
|
||||
</div>
|
||||
{{#SharedRoots}}
|
||||
<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
|
||||
<div class="root-info">{{SharedRoots}}</div>
|
||||
<div class="divider" style="margin:6px 0;"></div>
|
||||
<div class="related-header" style="cursor:default;">מִילִים קְשׁוּרוֹת</div>
|
||||
<div class="sec-table">{{SharedRoots}}</div>
|
||||
{{/SharedRoots}}
|
||||
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
|
||||
</details>
|
||||
"""
|
||||
|
||||
VOCAB_FRONT_CLOZE = """
|
||||
|
|
@ -318,6 +406,7 @@ VOCAB_MODEL = genanki.Model(
|
|||
{"name": "Prep"},
|
||||
{"name": "Hint"},
|
||||
{"name": "Plural"},
|
||||
{"name": "Gender"},
|
||||
{"name": "ClozeExample"},
|
||||
{"name": "ClozeHint"},
|
||||
],
|
||||
|
|
@ -349,7 +438,7 @@ VOCAB_MODEL = genanki.Model(
|
|||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
CONJ_FRONT = """
|
||||
<div class="sec-label">אֵיךְ אוֹמְרִים</div>
|
||||
<div class="hint">אֵיךְ אוֹמְרִים</div>
|
||||
<div class="hebrew">{{Pronoun}}</div>
|
||||
<div class="hebrew" style="color:#1a1a8c;">{{Infinitive}}{{#Prep}} <span class="hebrew-sm">({{Prep}})</span>{{/Prep}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
|
||||
<div class="hebrew">{{Tense}}</div>
|
||||
|
|
@ -359,11 +448,18 @@ CONJ_BACK = """
|
|||
{{FrontSide}}<hr>
|
||||
<div class="hebrew">{{ConjugatedForm}}{{#Prep}} ({{Prep}}){{/Prep}}</div>
|
||||
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
|
||||
{{#Meaning}}<div class="sec-label">{{Meaning}}</div>{{/Meaning}}
|
||||
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>
|
||||
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span> {{Binyan}}</div>
|
||||
{{#RelatedVocab}}<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
|
||||
<div class="sec-label">{{RelatedVocab}}</div>{{/RelatedVocab}}
|
||||
<details class="more-toggle"><summary class="more-header">מידע נוסף</summary>
|
||||
{{#Meaning}}<div class="meaning" style="font-size:28px;">{{Meaning}}</div>{{/Meaning}}
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">{{Root}}</span></div>
|
||||
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span><span class="sec-val">{{Binyan}}</span></div>
|
||||
</div>
|
||||
{{#RelatedVocab}}
|
||||
<div class="divider" style="margin:6px 0;"></div>
|
||||
<div class="related-header" style="cursor:default;">מִילִים קְשׁוּרוֹת</div>
|
||||
<div class="sec-table">{{RelatedVocab}}</div>
|
||||
{{/RelatedVocab}}
|
||||
</details>
|
||||
"""
|
||||
|
||||
CONJ_CSS = CARD_CSS
|
||||
|
|
@ -624,6 +720,116 @@ _EMOJI_STOP = frozenset(
|
|||
"bar",
|
||||
"wheel",
|
||||
"horizontal",
|
||||
# Polysemous keywords producing wrong-sense emoji (Sprint 17 audit)
|
||||
"high", # ⚡ high voltage, not "tall"
|
||||
"down", # 🫳 palm down, not "descend"
|
||||
"off", # 📴 phone off, not "remove"
|
||||
"away", # 💨 dashing away, not "depart"
|
||||
"together", # 🤲 palms together, not "unite"
|
||||
"top", # 🎩 top hat, not "upper"
|
||||
"low", # 🔈 low volume, not "short"
|
||||
"flat", # 🥿 ballet flat, not "apartment"
|
||||
"soft", # 🍦 soft serve, not "quiet"
|
||||
"broken", # 💔 broken heart, not "damaged"
|
||||
"round", # 📍 round pushpin, not "circular"
|
||||
"cool", # 🆒 COOL button, not "cold"
|
||||
"free", # 🆓 FREE button, not "liberated"
|
||||
"long", # 🪘 long drum, not "lengthy"
|
||||
"straight", # 📏 straight ruler, not "direct"
|
||||
"empty", # 🪹 empty nest, not "void"
|
||||
"hot", # 🥵 hot face, not "warm"
|
||||
"cross", # ✝️ latin cross, not "intersect"
|
||||
"bright", # 🔆 bright button, not "luminous"
|
||||
"old", # 👴 old man, not "aged"
|
||||
"head", # 🙂↔️ shaking head, not "leader"
|
||||
# Category words that match generic emoji
|
||||
"military", # 🎖️ military medal for any military term
|
||||
"sports", # 🏅 sports medal for any sports term
|
||||
"food", # 😋 yummy face for any food term
|
||||
"city", # 🇻🇦 Vatican flag for any city
|
||||
"china", # 🇨🇳 China flag for "porcelain"
|
||||
"polish", # 💅 nail polish for "to polish/shine"
|
||||
"aid", # 🦻 hearing aid for "to help"
|
||||
"office", # 🧑💼 office worker for "bureau"
|
||||
"construction", # 🏛️ classical building, not construction
|
||||
"cinema", # 🎦 cinema emoji for any film term
|
||||
"ceremony", # 🎑 moon ceremony for any ceremony
|
||||
"building", # 🏛️ classical building for any structure
|
||||
# Body parts / human features → wrong emoji
|
||||
"arm", # 🦾 mechanical arm for "to arm"
|
||||
"hair", # 👱 blond person for "hair"
|
||||
"nose", # 😤 steam from nose
|
||||
"tongue", # 😛 tongue-out face
|
||||
"chest", # not a chest
|
||||
"eyes", # 😃 face with eyes
|
||||
# Abstract/vague words
|
||||
"fear", # 😱 screaming face
|
||||
"anger", # 💢 anger symbol
|
||||
"angry", # 😠 angry face
|
||||
"tired", # 😫 tired face
|
||||
"sad", # 😥 sad face
|
||||
"joy", # 😂 tears of joy
|
||||
"love", # 💌 love letter
|
||||
"cold", # 🥶 cold face
|
||||
"pile", # 💩 pile of poo
|
||||
"man", # 👨 man
|
||||
"woman", # 👩 woman
|
||||
"boy", # 👦 boy
|
||||
"girl", # 👧 girl
|
||||
"baby", # 👶 baby
|
||||
"children", # 🚸 children crossing
|
||||
"student", # 🧑🎓 student
|
||||
"adult", # 🧑🧑🧒 family
|
||||
"name", # 📛 name badge
|
||||
"check", # ✅ check mark
|
||||
"line", # 🫥 dotted line face
|
||||
"floor", # 🤣 ROFL (rolling on floor)
|
||||
"room", # 🧖 person in steamy room
|
||||
"bubble", # 👁️🗨️ speech bubble
|
||||
"car", # 🚃 railway car, not automobile
|
||||
"bullet", # 🚅 bullet train
|
||||
"steam", # 😤 face with steam
|
||||
"fly", # 🪰 the insect, not the verb
|
||||
"plant", # 🪴 potted plant for all "X (plant)" entries
|
||||
"tree", # 🌲 evergreen for all "X (tree)" entries
|
||||
"ball", # ⛹️ person bouncing ball
|
||||
"bag", # 👝 clutch bag
|
||||
"fight", # not a fight
|
||||
"cloud", # not a cloud
|
||||
"video", # 🎮 video game, not video
|
||||
"rescue", # ⛑️ rescue worker helmet
|
||||
"exchange", # 💱 currency exchange
|
||||
"cut", # 🥩 cut of meat, not "to cut"
|
||||
"key", # 🔐 locked with key
|
||||
"walking", # 🚶 person walking
|
||||
"running", # 🏃 person running
|
||||
"climbing", # 🧗 person climbing
|
||||
"speaking", # 🗣️ speaking head
|
||||
"playing", # 🤽 person playing
|
||||
"feeding", # 👩🍼 person feeding
|
||||
"shooting", # 🌠 shooting star
|
||||
"clapping", # 👏 clapping hands
|
||||
"cooking", # 🍳 cooking emoji
|
||||
"holding", # 🥹 face holding back tears
|
||||
# More wrong-sense matches from remaining audit
|
||||
"paper", # 🏮 red lantern for "paper"
|
||||
"track", # 🛤️ railroad for "track record"
|
||||
"vertical", # 🚦 traffic light for "vertical"
|
||||
"speaker", # 🔇 muted speaker for "speaker (person)"
|
||||
"square", # 🟥 red square for "plaza"
|
||||
"wrapped", # 🎁 gift for "wrapped, bound"
|
||||
"volume", # 🔈 speaker for "volume (book)"
|
||||
"mobile", # 📱 phone for "mobile, moveable"
|
||||
"flash", # 📸 camera flash for "to shine"
|
||||
"identification", # 🪪 ID card for "locating"
|
||||
"service", # 🐕🦺 service dog for "service, term"
|
||||
"ground", # ⛱️ umbrella on ground
|
||||
"machine", # 🎰 slot machine for "mechanism"
|
||||
"liquid", # 🫗 pouring for "liquid, drop"
|
||||
"vehicle", # 🚙 SUV for any vehicle mention
|
||||
"window", # 🪟 window pane for "window, gap"
|
||||
"information", # ℹ️ info symbol
|
||||
"child", # 🧒 child emoji
|
||||
}
|
||||
)
|
||||
|
||||
|
|
@ -703,6 +909,32 @@ def _forms_list_to_dict(forms_list: list[dict]) -> dict[str, dict]:
|
|||
return result
|
||||
|
||||
|
||||
# Hebrew prefix letters (אותיות השימוש): בהוכלמש
|
||||
_PREFIX_LETTERS = frozenset("בהוכלמש")
|
||||
|
||||
|
||||
def _cloze_prefix_len(cloze_token: str, word_nikkud: str) -> int:
|
||||
"""Return the number of characters in the cloze token that are prefix (not part of the word).
|
||||
|
||||
For "בַּתּוֹר" with word_nikkud "תּוֹר", returns 2 (ב + patach = 2 chars).
|
||||
Returns 0 if the token starts with the word directly.
|
||||
"""
|
||||
if not word_nikkud or not cloze_token:
|
||||
return 0
|
||||
# If the token starts with the word nikkud, no prefix
|
||||
if cloze_token.startswith(word_nikkud):
|
||||
return 0
|
||||
# Check if word nikkud appears as a suffix of the token
|
||||
idx = cloze_token.find(word_nikkud)
|
||||
if idx > 0:
|
||||
# Verify prefix chars are valid Hebrew prefix letters
|
||||
prefix_part = cloze_token[:idx]
|
||||
base_letters = [c for c in prefix_part if "\u05d0" <= c <= "\u05ea"]
|
||||
if base_letters and all(c in _PREFIX_LETTERS for c in base_letters):
|
||||
return idx
|
||||
return 0
|
||||
|
||||
|
||||
def build_vocab_deck(
|
||||
words: dict[str, dict],
|
||||
limit: int | None = None,
|
||||
|
|
@ -737,9 +969,11 @@ def build_vocab_deck(
|
|||
if word_nikkud not in word_to_pos_cat:
|
||||
word_to_pos_cat[word_nikkud] = _categorize_pos(pos_raw) if pos_raw else "Other"
|
||||
|
||||
# Sort entries by frequency (null → 999999), applying limit after sort
|
||||
# Sort entries by effective frequency (pseudo_frequency for confusables,
|
||||
# else regular frequency; null → 999999), applying limit after sort
|
||||
def _freq_key(item: tuple[str, dict]) -> int:
|
||||
return item[1].get("frequency") or 999_999
|
||||
e = item[1]
|
||||
return e.get("pseudo_frequency") or e.get("frequency") or 999_999
|
||||
|
||||
sorted_entries = sorted(words.items(), key=_freq_key)
|
||||
if limit:
|
||||
|
|
@ -758,10 +992,13 @@ def build_vocab_deck(
|
|||
pos_heb = entry.get("pos_hebrew", "")
|
||||
meaning = EMOJI_RE.sub("", entry.get("meaning", "") or "").strip()
|
||||
meaning = HBPAREN_RE.sub("", meaning).strip()
|
||||
meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ")
|
||||
# Strip remaining bare Hebrew text (spoiler: ktiv male visible in English meaning)
|
||||
meaning = re.sub(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*", "", meaning)
|
||||
meaning = re.sub(r"[;:]\s*—", " —", meaning) # clean "; —" → " —"
|
||||
meaning = re.sub(r";\s*:", ";", meaning) # clean "; :" → ";"
|
||||
meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ;:")
|
||||
meaning = re.sub(r"(\w)\(", r"\1 (", meaning) # space before opening paren
|
||||
meaning = re.sub(r",(\S)", r", \1", meaning) # space after comma
|
||||
meaning_raw = entry.get("meaning_raw", "") or ""
|
||||
slug = entry.get("slug", "") or ""
|
||||
frequency = entry.get("frequency") or 999_999
|
||||
audio_file = entry.get("audio_file", "") or ""
|
||||
|
|
@ -796,20 +1033,22 @@ def build_vocab_deck(
|
|||
else:
|
||||
freq_display = "Unlisted"
|
||||
|
||||
# Emoji: use entry's emoji if emoji_visible, else fall back to emoji_lookup
|
||||
# Emoji: use entry's emoji if emoji_visible, else fall back to emoji_lookup.
|
||||
# Skip fallback for verbs — keyword matching on verb definitions produces
|
||||
# wrong-sense emoji (e.g. "to cut" → 🥩, "to arm" → 🦾).
|
||||
emoji_str = ""
|
||||
if entry.get("emoji_visible") and entry.get("emoji"):
|
||||
emoji_str = entry["emoji"]
|
||||
elif not emoji_str and emoji_lookup:
|
||||
elif emoji_lookup and not meaning.startswith("to "):
|
||||
meaning_clean_for_emoji = EMOJI_RE.sub("", meaning).strip()
|
||||
for kw in re.sub(r"[^\w\s]", " ", meaning_clean_for_emoji.lower()).split()[:5]:
|
||||
if len(kw) > 2 and kw not in _EMOJI_STOP and kw in emoji_lookup:
|
||||
emoji_str = emoji_lookup[kw]
|
||||
break
|
||||
|
||||
# Extract Hebrew prepositions from meaning_raw
|
||||
preps = HBPAREN_RE.findall(meaning_raw)
|
||||
prep_str = " ".join(f"({p})" for p in preps)
|
||||
# Hebrew prepositions — extracted upstream by list scraper
|
||||
entry_prep = entry.get("prep")
|
||||
prep_str = " ".join(f"({p})" for p in entry_prep.split()) if entry_prep else ""
|
||||
|
||||
# Audio — use audio_file from entry; for confusables it's already slug-based
|
||||
audio_tag = ""
|
||||
|
|
@ -850,10 +1089,13 @@ def build_vocab_deck(
|
|||
start = cloze_data.get("cloze_word_start")
|
||||
end = cloze_data.get("cloze_word_end")
|
||||
if cloze_text and start is not None and end is not None:
|
||||
cloze_example = cloze_text[:start] + "_____" + cloze_text[end:]
|
||||
# Clean up duplicate/misplaced quotation marks
|
||||
# Preserve Hebrew prefix letters in the cloze blank
|
||||
# e.g. "בַּתּוֹר" for word "תּוֹר" → "בַּ_____" not "_____"
|
||||
cloze_token = cloze_text[start:end]
|
||||
prefix_chars = _cloze_prefix_len(cloze_token, word_nikkud)
|
||||
cloze_example = cloze_text[: start + prefix_chars] + "_____" + cloze_text[end:]
|
||||
# Clean up duplicate adjacent quotation marks (e.g. "" → ")
|
||||
cloze_example = re.sub(r'["״]\s*["״]', '"', cloze_example)
|
||||
cloze_example = re.sub(r'^\s*["״]\s*', "", cloze_example)
|
||||
raw_hint = cloze_data.get("cloze_hint") or ""
|
||||
if raw_hint:
|
||||
cloze_hint = raw_hint
|
||||
|
|
@ -863,35 +1105,47 @@ def build_vocab_deck(
|
|||
if pos_cat == "Verb" and pos_heb:
|
||||
cloze_hint = f"{meaning} ({pos_heb})"
|
||||
|
||||
# Related words (shared roots) grouped by PoS category
|
||||
# Related words (shared roots) as a table: word — meaning, sorted by frequency
|
||||
related_html = ""
|
||||
if shared_roots_keys:
|
||||
groups: dict[str, list[str]] = {}
|
||||
rw_items: list[tuple[int, str, str]] = [] # (sort_key, nikkud, meaning)
|
||||
for rw_key in shared_roots_keys:
|
||||
rw_entry = words.get(rw_key)
|
||||
if rw_entry:
|
||||
rw_nikkud = rw_entry["word"]["nikkud"]
|
||||
cat = _categorize_pos(rw_entry.get("pos", ""))
|
||||
rw_meaning = rw_entry.get("meaning") or ""
|
||||
if len(rw_meaning) > 40:
|
||||
rw_meaning = rw_meaning[:37] + "…"
|
||||
rw_freq = rw_entry.get("frequency") or 999999
|
||||
else:
|
||||
# Key not found: use the key itself as display text
|
||||
rw_nikkud = rw_key
|
||||
cat = "Other"
|
||||
groups.setdefault(cat, []).append(rw_nikkud)
|
||||
parts = []
|
||||
for cat, rw_words in groups.items():
|
||||
if cat == "Other":
|
||||
parts.append(f'<div class="related-group">{" ".join(rw_words)}</div>')
|
||||
else:
|
||||
label = POS_CATEGORY_LABELS.get(cat, cat)
|
||||
parts.append(f'<div class="related-group"><b>{label}:</b> {" ".join(rw_words)}</div>')
|
||||
related_html = "\n".join(parts)
|
||||
rw_meaning = ""
|
||||
rw_freq = 999999
|
||||
rw_items.append((rw_freq, rw_nikkud, rw_meaning))
|
||||
rw_items.sort(key=lambda x: x[0])
|
||||
rows_html: list[str] = []
|
||||
for _freq, rw_nikkud, rw_meaning in rw_items:
|
||||
rows_html.append(
|
||||
f'<div class="sec-label">'
|
||||
f'<span class="rw-word">{rw_nikkud}</span>'
|
||||
f'<span class="rw-meaning">{rw_meaning}</span>'
|
||||
f"</div>"
|
||||
)
|
||||
related_html = "\n".join(rows_html)
|
||||
|
||||
# Plural form (nouns only — guard against adjective/verb inflection bleed)
|
||||
# Plural form and gender (nouns only)
|
||||
plural_str = ""
|
||||
gender_str = ""
|
||||
if pos_raw.startswith("Noun"):
|
||||
noun_inflection = entry.get("noun_inflection")
|
||||
if noun_inflection and noun_inflection.get("plural"):
|
||||
if noun_inflection:
|
||||
if noun_inflection.get("plural"):
|
||||
plural_str = noun_inflection["plural"].get("nikkud", "")
|
||||
gender_raw = noun_inflection.get("gender") or ""
|
||||
if gender_raw == "masculine":
|
||||
gender_str = "זָכָר"
|
||||
elif gender_raw == "feminine":
|
||||
gender_str = "נְקֵבָה"
|
||||
|
||||
# Image
|
||||
image_tag = ""
|
||||
|
|
@ -927,6 +1181,7 @@ def build_vocab_deck(
|
|||
prep_str,
|
||||
hint_str,
|
||||
plural_str,
|
||||
gender_str,
|
||||
cloze_example,
|
||||
cloze_hint,
|
||||
],
|
||||
|
|
@ -941,7 +1196,8 @@ def build_vocab_deck(
|
|||
prep_count = sum(1 for n in deck.notes if n.fields[12])
|
||||
hint_count = sum(1 for n in deck.notes if n.fields[13])
|
||||
plural_count = sum(1 for n in deck.notes if n.fields[14])
|
||||
cloze_count = sum(1 for n in deck.notes if n.fields[15])
|
||||
gender_count = sum(1 for n in deck.notes if n.fields[15])
|
||||
cloze_count = sum(1 for n in deck.notes if n.fields[16])
|
||||
unlisted = sum(1 for _, e in sorted_entries if (e.get("frequency") or 999_999) >= 999_999)
|
||||
if emoji_count:
|
||||
logger.info(f" Emoji extracted: {emoji_count} words")
|
||||
|
|
@ -951,6 +1207,8 @@ def build_vocab_deck(
|
|||
logger.info(f" Eng→Heb hints: {hint_count} words")
|
||||
if plural_count:
|
||||
logger.info(f" Noun plurals on vocab cards: {plural_count} words")
|
||||
if gender_count:
|
||||
logger.info(f" Noun gender on vocab cards: {gender_count} words")
|
||||
if cloze_count:
|
||||
logger.info(f" Sentence cloze cards: {cloze_count} words")
|
||||
logger.info(f" Unlisted words (not in frequency corpus): {unlisted}/{len(sorted_entries)}")
|
||||
|
|
@ -969,13 +1227,17 @@ def build_conj_deck(
|
|||
note_count = 0
|
||||
verb_count = 0
|
||||
|
||||
# Build root → [related word nikkud] lookup for cross-linking
|
||||
root_words: dict[str, list[str]] = {}
|
||||
# Build root → [(freq, nikkud, meaning)] lookup for cross-linking
|
||||
root_words: dict[str, list[tuple[int, str, str]]] = {}
|
||||
for entry in words.values():
|
||||
root_list = entry.get("root") or []
|
||||
root_key = " ".join(root_list)
|
||||
if root_key:
|
||||
root_words.setdefault(root_key, []).append(entry["word"]["nikkud"])
|
||||
rw_meaning = entry.get("meaning") or ""
|
||||
if len(rw_meaning) > 40:
|
||||
rw_meaning = rw_meaning[:37] + "…"
|
||||
rw_freq = entry.get("frequency") or 999999
|
||||
root_words.setdefault(root_key, []).append((rw_freq, entry["word"]["nikkud"], rw_meaning))
|
||||
|
||||
for _unique_key, entry in words.items():
|
||||
conj = entry.get("conjugation")
|
||||
|
|
@ -996,28 +1258,27 @@ def build_conj_deck(
|
|||
root = ".".join(root_list)
|
||||
voice = VOICE_MAP.get(binyan, "")
|
||||
|
||||
meaning_raw = entry.get("meaning_raw", "") or ""
|
||||
meaning = entry.get("meaning", "") or ""
|
||||
# Extract Hebrew preposition — strip from meaning, show on Hebrew side
|
||||
# Hebrew preposition — extracted upstream by scraper
|
||||
prep_str = ""
|
||||
conj_prep = conj.get("prep")
|
||||
conj_prep = conj.get("prep") or entry.get("prep")
|
||||
if conj_prep:
|
||||
# Strip any parentheses from stored prep value
|
||||
prep_str = conj_prep.strip("() ")
|
||||
elif meaning_raw:
|
||||
preps = HBPAREN_RE.findall(meaning_raw)
|
||||
if preps:
|
||||
prep_str = preps[0]
|
||||
# Strip Hebrew prepositions from English meaning to avoid duplication
|
||||
if prep_str:
|
||||
meaning = HBPAREN_RE.sub("", meaning).strip()
|
||||
# Also strip from meaning_raw patterns like "(על)"
|
||||
meaning = re.sub(r"\(\s*" + re.escape(prep_str) + r"\s*-?\s*\)", "", meaning).strip()
|
||||
# Clean up double spaces and trailing commas
|
||||
meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ")
|
||||
|
||||
related = [w for w in root_words.get(root, []) if w != infinitive]
|
||||
related_str = " ".join(related[:8]) if related else ""
|
||||
related = [(f, w, m) for f, w, m in root_words.get(root, []) if w != infinitive]
|
||||
if related:
|
||||
related.sort(key=lambda x: x[0])
|
||||
related_rows = []
|
||||
for _freq, rw_nikkud, rw_meaning in related[:8]:
|
||||
related_rows.append(
|
||||
f'<div class="sec-label">'
|
||||
f'<span class="rw-word">{rw_nikkud}</span>'
|
||||
f'<span class="rw-meaning">{rw_meaning}</span>'
|
||||
f"</div>"
|
||||
)
|
||||
related_str = "\n".join(related_rows)
|
||||
else:
|
||||
related_str = ""
|
||||
|
||||
forms = _forms_list_to_dict(active_forms_list)
|
||||
|
||||
|
|
@ -1299,9 +1560,12 @@ def build_confusables_deck(
|
|||
guid = genanki.guid_for("confusable", entry["word"].get("ktiv_male", unique_key))
|
||||
guid_to_entries.setdefault(guid, []).append(entry)
|
||||
|
||||
def _eff_freq(e: dict) -> int:
|
||||
return e.get("pseudo_frequency") or e.get("frequency") or 999_999
|
||||
|
||||
for guid, group_entries in sorted(
|
||||
guid_to_entries.items(),
|
||||
key=lambda x: sum(e.get("frequency") or 999_999 for e in x[1]) / len(x[1]),
|
||||
key=lambda x: sum(_eff_freq(e) for e in x[1]) / len(x[1]),
|
||||
):
|
||||
if guid in seen_guids:
|
||||
continue
|
||||
|
|
@ -1320,9 +1584,13 @@ def build_confusables_deck(
|
|||
unique_entries.append(e)
|
||||
if len(unique_entries) < 2:
|
||||
continue
|
||||
# Sort by pseudo/frequency so most common meaning appears first
|
||||
unique_entries.sort(key=_eff_freq)
|
||||
if len(unique_entries) < 2:
|
||||
continue
|
||||
|
||||
word_no_nik = unique_entries[0]["word"].get("ktiv_male", "")
|
||||
words_display = " / ".join(e["word"]["nikkud"] for e in unique_entries)
|
||||
words_display = word_no_nik # Show ktiv male (shared form) on front
|
||||
|
||||
defs_parts: list[str] = []
|
||||
audio_parts: list[str] = []
|
||||
|
|
@ -1330,10 +1598,12 @@ def build_confusables_deck(
|
|||
w = e["word"]["nikkud"]
|
||||
m = e.get("meaning", "")
|
||||
p = e.get("pos_hebrew", "")
|
||||
pos_label = f" ({p})" if p else ""
|
||||
pos_div = f'<div style="font-size:18px; color:#888;">{p}</div>' if p else ""
|
||||
defs_parts.append(
|
||||
f'<div class="conf-entry"><span class="hebrew" style="font-size:24px;">{w}</span>'
|
||||
f" = {m}{pos_label}</div>"
|
||||
f'<div class="conf-entry">'
|
||||
f'<span class="hebrew" style="font-size:24px;">{w}</span>'
|
||||
f'<div style="direction:ltr; text-align:center; font-size:22px;">{m}</div>'
|
||||
f"{pos_div}</div>"
|
||||
)
|
||||
if include_audio:
|
||||
af = e.get("audio_file", "") or ""
|
||||
|
|
@ -1389,31 +1659,35 @@ def write_conf_apkg(
|
|||
PLURAL_FRONT_SG = """
|
||||
<div class="hebrew" style="color:#1a1a8c;">{{Singular}}</div>
|
||||
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
|
||||
<div class="sec-label">{{Meaning}}</div>
|
||||
<div class="hint" style="font-size:28px;">יָחִיד ← רַבִּים</div>
|
||||
<div class="meaning" style="font-size:28px;">{{Meaning}}</div>
|
||||
<div class="plural-direction">יָחִיד ← רַבִּים</div>
|
||||
"""
|
||||
|
||||
PLURAL_BACK_SG = """
|
||||
{{FrontSide}}<hr>
|
||||
<div class="hebrew">{{Plural}}</div>
|
||||
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
|
||||
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span> {{Gender}}</div>{{/Gender}}
|
||||
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
|
||||
<div class="sec-table">
|
||||
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span><span class="sec-val">{{Gender}}</span></div>{{/Gender}}
|
||||
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span><span class="sec-val">{{Mishkal}}</span></div>{{/Mishkal}}
|
||||
</div>
|
||||
"""
|
||||
|
||||
PLURAL_FRONT_PL = """
|
||||
<div class="hebrew" style="color:#1a1a8c;">{{Plural}}</div>
|
||||
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
|
||||
<div class="hint" style="font-size:28px;">רַבִּים ← יָחִיד</div>
|
||||
<div class="plural-direction">רַבִּים ← יָחִיד</div>
|
||||
"""
|
||||
|
||||
PLURAL_BACK_PL = """
|
||||
{{FrontSide}}<hr>
|
||||
<div class="hebrew">{{Singular}}</div>
|
||||
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
|
||||
<div class="sec-label">{{Meaning}}</div>
|
||||
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span> {{Gender}}</div>{{/Gender}}
|
||||
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
|
||||
<div class="meaning" style="font-size:28px;">{{Meaning}}</div>
|
||||
<div class="sec-table">
|
||||
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span><span class="sec-val">{{Gender}}</span></div>{{/Gender}}
|
||||
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span><span class="sec-val">{{Mishkal}}</span></div>{{/Mishkal}}
|
||||
</div>
|
||||
"""
|
||||
|
||||
PLURAL_CSS = CARD_CSS
|
||||
|
|
@ -1501,13 +1775,42 @@ def build_plural_deck(
|
|||
elif mishkal:
|
||||
by_mishkal.setdefault(mishkal, []).append((unique_key, entry, noun_inflection))
|
||||
|
||||
# Select exemplars per mishkal, preferring high-frequency words
|
||||
per_mishkal = 6
|
||||
# Select regular exemplars to achieve a 2:1 regular:irregular ratio.
|
||||
# Distribute evenly across mishkal patterns, preferring high-frequency words.
|
||||
irregular_count = len(irregulars)
|
||||
target_regular = irregular_count * 2
|
||||
mishkal_count = len(by_mishkal) or 1
|
||||
# Over-sample per mishkal to compensate for small patterns, then trim
|
||||
per_mishkal = max(3, (target_regular * 3) // (mishkal_count * 2))
|
||||
|
||||
selected: list[tuple[str, dict, dict]] = list(irregulars)
|
||||
regular_pool: list[tuple[str, dict, dict]] = []
|
||||
for _mishkal, entries in sorted(by_mishkal.items()):
|
||||
entries.sort(key=lambda e: e[1].get("frequency") or 999_999)
|
||||
selected.extend(entries[:per_mishkal])
|
||||
regular_pool.extend(entries[:per_mishkal])
|
||||
|
||||
# If we overshot, trim to target (keeping highest-frequency across all mishkals)
|
||||
if len(regular_pool) > target_regular:
|
||||
regular_pool.sort(key=lambda e: e[1].get("frequency") or 999_999)
|
||||
regular_pool = regular_pool[:target_regular]
|
||||
|
||||
# Sort both pools by frequency, then interleave for homogeneous 2:1 regular:irregular
|
||||
irregulars.sort(key=lambda e: e[1].get("frequency") or 999_999)
|
||||
regular_pool.sort(key=lambda e: e[1].get("frequency") or 999_999)
|
||||
|
||||
# Interleave: for every 1 irregular, insert 2 regulars
|
||||
selected: list[tuple[str, dict, dict]] = []
|
||||
ri = 0 # regular index
|
||||
for _ii, irr in enumerate(irregulars):
|
||||
# Insert 2 regulars before each irregular (when available)
|
||||
for _ in range(2):
|
||||
if ri < len(regular_pool):
|
||||
selected.append(regular_pool[ri])
|
||||
ri += 1
|
||||
selected.append(irr)
|
||||
# Append remaining regulars
|
||||
while ri < len(regular_pool):
|
||||
selected.append(regular_pool[ri])
|
||||
ri += 1
|
||||
|
||||
note_count = 0
|
||||
for _unique_key, entry, noun_inflection in selected:
|
||||
|
|
@ -1517,7 +1820,7 @@ def build_plural_deck(
|
|||
plural_ktiv = noun_inflection["plural"].get("ktiv_male", "")
|
||||
gender = noun_inflection.get("gender") or ""
|
||||
gender_heb = {"masculine": "זָכָר", "feminine": "נְקֵבָה"}.get(gender, gender)
|
||||
mishkal = noun_inflection.get("mishkal") or ""
|
||||
mishkal_heb = noun_inflection.get("mishkal_hebrew") or ""
|
||||
meaning = EMOJI_RE.sub("", entry.get("meaning") or "").strip()
|
||||
root_list = entry.get("root") or []
|
||||
root = ".".join(root_list)
|
||||
|
|
@ -1530,16 +1833,25 @@ def build_plural_deck(
|
|||
sg_audio = ""
|
||||
pl_audio = ""
|
||||
if include_audio:
|
||||
sg_tag = _audio_tag(singular_ktiv)
|
||||
slug = entry.get("slug", "")
|
||||
sg_tag = _audio_tag(singular_ktiv, slug=slug)
|
||||
if sg_tag:
|
||||
sg_audio = sg_tag
|
||||
mp3_path = AUDIO_DIR / sg_tag.removeprefix("[sound:").removesuffix("]")
|
||||
if mp3_path not in media_files:
|
||||
media_files.append(mp3_path)
|
||||
# Plural audio: {slug}_plural.mp3
|
||||
if slug:
|
||||
pl_mp3 = AUDIO_DIR / f"{slug}_plural.mp3"
|
||||
if pl_mp3.exists():
|
||||
pl_audio = f"[sound:{pl_mp3.name}]"
|
||||
if pl_mp3 not in media_files:
|
||||
media_files.append(pl_mp3)
|
||||
|
||||
mishkal_eng = noun_inflection.get("mishkal") or ""
|
||||
tags = [RELEASE_TAG]
|
||||
if mishkal:
|
||||
tags.append(f"mishkal::{mishkal}")
|
||||
if mishkal_eng:
|
||||
tags.append(f"mishkal::{mishkal_eng}")
|
||||
if _is_irregular_plural(gender, plural_ktiv):
|
||||
tags.append("irregular")
|
||||
|
||||
|
|
@ -1553,7 +1865,7 @@ def build_plural_deck(
|
|||
pl_audio,
|
||||
meaning,
|
||||
root,
|
||||
mishkal,
|
||||
mishkal_heb,
|
||||
gender_heb,
|
||||
],
|
||||
tags=tags,
|
||||
|
|
|
|||
110
card_preview.html
Normal file
110
card_preview.html
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
<!DOCTYPE html>
|
||||
<html dir="rtl">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<style>
|
||||
body { font-family: 'Heebo', 'Arial Hebrew', sans-serif; background: #fff; max-width: 600px; margin: 20px auto; }
|
||||
.card-container { border: 1px solid #ccc; border-radius: 8px; margin: 20px 0; overflow: hidden; }
|
||||
.card-label { background: #333; color: #fff; padding: 6px 12px; font-size: 14px; font-family: sans-serif; direction: ltr; }
|
||||
.card-content { padding: 16px; text-align: center; }
|
||||
.card-content hr { border: none; border-top: 1px solid #ccc; margin: 12px 0; }
|
||||
|
||||
.hebrew { font-size: 48px; font-weight: bold; color: #222; direction: rtl; text-align: center; }
|
||||
.hebrew-sm { font-size: 28px; font-weight: normal; color: #222; direction: rtl; }
|
||||
.meaning { font-size: 28px; color: #1a1a8c; text-align: center; direction: ltr; margin: 4px 0; }
|
||||
.emoji-img { font-size: 48px; text-align: center; margin: 4px 0; }
|
||||
.divider { border-top: 1px solid #ccc; margin: 8px 0; }
|
||||
.sec-table { display: table; margin: 6px auto 0; direction: rtl; border-collapse: collapse; }
|
||||
.sec-label { display: table-row; font-size: 28px; font-weight: normal; color: #222; direction: rtl; }
|
||||
.sec-key { display: table-cell; font-size: 28px; color: #222; font-weight: bold; text-align: right; padding: 2px 0 2px 8px; white-space: nowrap; }
|
||||
.sec-val { display: table-cell; font-size: 28px; color: #222; text-align: right; padding: 2px 0; }
|
||||
.hint { font-size: 22px; color: #555; margin: 4px 0; direction: rtl; text-align: center; }
|
||||
.example { font-size: 24px; color: #222; padding: 6px 8px; direction: rtl; text-align: center; border-left: 3px solid #ccc; font-style: italic; margin: 6px auto; max-width: 90%; }
|
||||
.voice-label { font-size: 20px; color: #888; }
|
||||
|
||||
.more-toggle { text-align: center; direction: rtl; margin-top: 8px; }
|
||||
.more-header {
|
||||
display: inline-block; font-size: 18px; color: #555; cursor: pointer; list-style: none;
|
||||
border: 1px solid #ccc; border-radius: 16px; padding: 4px 16px; margin: 4px 0; background: #f8f8f8;
|
||||
}
|
||||
.more-header::-webkit-details-marker { display: none; }
|
||||
.more-header::before { content: "○ "; font-size: 14px; }
|
||||
details[open] > .more-header::before { content: "● "; }
|
||||
|
||||
.related-header { font-size: 22px; color: #555; text-align: center; margin: 4px 0; }
|
||||
.rw-word { display: table-cell; font-size: 28px; color: #222; font-weight: normal; text-align: right; padding: 2px 0 2px 8px; white-space: nowrap; }
|
||||
.rw-meaning { display: table-cell; font-size: 24px; color: #555; text-align: left; direction: ltr; padding: 2px 0; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<h2 style="font-family:sans-serif;direction:ltr;">Vocab: English → Hebrew (BACK) — collapsed</h2>
|
||||
<div class="card-container">
|
||||
<div class="card-label">English → Hebrew — Back (default: collapsed)</div>
|
||||
<div class="card-content">
|
||||
|
||||
<div class="meaning">time (occasion), time round; once (when used as an adverb)</div>
|
||||
<div class="emoji-img">📍</div>
|
||||
<div class="divider"></div>
|
||||
<div class="hebrew">פַּעַם</div>
|
||||
|
||||
<details class="more-toggle"><summary class="more-header">מידע נוסף</summary>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span><span class="sec-val">פעם</span></div>
|
||||
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">פ.ע.ם</span></div>
|
||||
<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span><span class="sec-val">שֵׁם עֶצֶם, נְקֵבָה</span></div>
|
||||
<div class="sec-label"><span class="sec-key">רַבִּים:</span><span class="sec-val">פְּעָמִים</span></div>
|
||||
</div>
|
||||
<div class="divider" style="margin:6px 0;"></div>
|
||||
<div class="related-header">מִילִים קְשׁוּרוֹת</div>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="rw-word">פַּעְמַיִם</span><span class="rw-meaning">twice, two times</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְפַעֵם</span><span class="rw-meaning">to surge (feeling, emotion)</span></div>
|
||||
<div class="sec-label"><span class="rw-word">פַּעֲמוֹן</span><span class="rw-meaning">bell</span></div>
|
||||
<div class="sec-label"><span class="rw-word">פְּעִימָה</span><span class="rw-meaning">heartbeat; beat; stroke (technolo…</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לִפְעֹם</span><span class="rw-meaning">to beat, to pulse, to throb</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהִתְפַּעֵם</span><span class="rw-meaning">to be excited (emotionally)</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהַפְעִים</span><span class="rw-meaning">to excite, to agitate (lit.)</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהִפָּעֵם</span><span class="rw-meaning">to be excited, to be thrilled</span></div>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h2 style="font-family:sans-serif;direction:ltr;">Same card — EXPANDED</h2>
|
||||
<div class="card-container">
|
||||
<div class="card-label">English → Hebrew — Back (expanded)</div>
|
||||
<div class="card-content">
|
||||
|
||||
<div class="meaning">time (occasion), time round; once (when used as an adverb)</div>
|
||||
<div class="emoji-img">📍</div>
|
||||
<div class="divider"></div>
|
||||
<div class="hebrew">פַּעַם</div>
|
||||
|
||||
<details class="more-toggle" open><summary class="more-header">מידע נוסף</summary>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span><span class="sec-val">פעם</span></div>
|
||||
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">פ.ע.ם</span></div>
|
||||
<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span><span class="sec-val">שֵׁם עֶצֶם, נְקֵבָה</span></div>
|
||||
<div class="sec-label"><span class="sec-key">רַבִּים:</span><span class="sec-val">פְּעָמִים</span></div>
|
||||
</div>
|
||||
<div class="divider" style="margin:6px 0;"></div>
|
||||
<div class="related-header">מִילִים קְשׁוּרוֹת</div>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="rw-word">פַּעְמַיִם</span><span class="rw-meaning">twice, two times</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְפַעֵם</span><span class="rw-meaning">to surge (feeling, emotion)</span></div>
|
||||
<div class="sec-label"><span class="rw-word">פַּעֲמוֹן</span><span class="rw-meaning">bell</span></div>
|
||||
<div class="sec-label"><span class="rw-word">פְּעִימָה</span><span class="rw-meaning">heartbeat; beat; stroke (technolo…</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לִפְעֹם</span><span class="rw-meaning">to beat, to pulse, to throb</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהִתְפַּעֵם</span><span class="rw-meaning">to be excited (emotionally)</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהַפְעִים</span><span class="rw-meaning">to excite, to agitate (lit.)</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהִפָּעֵם</span><span class="rw-meaning">to be excited, to be thrilled</span></div>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
114
card_preview_conj.html
Normal file
114
card_preview_conj.html
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
<!DOCTYPE html>
|
||||
<html dir="rtl">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<style>
|
||||
body { font-family: 'Heebo', 'Arial Hebrew', sans-serif; background: #fff; max-width: 600px; margin: 20px auto; }
|
||||
.card-container { border: 1px solid #ccc; border-radius: 8px; margin: 20px 0; overflow: hidden; }
|
||||
.card-label { background: #333; color: #fff; padding: 6px 12px; font-size: 14px; font-family: sans-serif; direction: ltr; }
|
||||
.card-content { padding: 16px; text-align: center; }
|
||||
.card-content hr { border: none; border-top: 1px solid #ccc; margin: 12px 0; }
|
||||
.hebrew { font-size: 48px; font-weight: bold; color: #222; direction: rtl; text-align: center; }
|
||||
.hebrew-sm { font-size: 28px; font-weight: normal; color: #222; direction: rtl; }
|
||||
.meaning { font-size: 28px; color: #1a1a8c; text-align: center; direction: ltr; margin: 4px 0; }
|
||||
.hint { font-size: 22px; color: #555; margin: 4px 0; direction: rtl; text-align: center; }
|
||||
.divider { border-top: 1px solid #ccc; margin: 8px 0; }
|
||||
.sec-table { display: table; margin: 6px auto 0; direction: rtl; border-collapse: collapse; }
|
||||
.sec-label { display: table-row; font-size: 28px; font-weight: normal; color: #222; direction: rtl; }
|
||||
.sec-key { display: table-cell; font-size: 28px; color: #222; font-weight: bold; text-align: right; padding: 2px 0 2px 8px; white-space: nowrap; }
|
||||
.sec-val { display: table-cell; font-size: 28px; color: #222; text-align: right; padding: 2px 0; }
|
||||
.voice-label { font-size: 20px; color: #888; }
|
||||
|
||||
.more-toggle { text-align: center; direction: rtl; margin-top: 8px; }
|
||||
.more-header {
|
||||
display: inline-block; font-size: 18px; color: #555; cursor: pointer; list-style: none;
|
||||
border: 1px solid #ccc; border-radius: 16px; padding: 4px 16px; margin: 4px 0; background: #f8f8f8;
|
||||
}
|
||||
.more-header::-webkit-details-marker { display: none; }
|
||||
.more-header::before { content: "○ "; font-size: 14px; }
|
||||
details[open] > .more-header::before { content: "● "; }
|
||||
|
||||
.related-header { font-size: 22px; color: #555; text-align: center; margin: 4px 0; }
|
||||
.rw-word { display: table-cell; font-size: 28px; color: #222; font-weight: normal; text-align: right; padding: 2px 0 2px 8px; white-space: nowrap; }
|
||||
.rw-meaning { display: table-cell; font-size: 24px; color: #555; text-align: left; direction: ltr; padding: 2px 0; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<h2 style="font-family:sans-serif;direction:ltr;">Conjugation Card — FRONT</h2>
|
||||
<div class="card-container">
|
||||
<div class="card-label">Front</div>
|
||||
<div class="card-content">
|
||||
|
||||
<div class="hint">אֵיךְ אוֹמְרִים</div>
|
||||
<div class="hebrew">אַתָּה</div>
|
||||
<div class="hebrew" style="color:#1a1a8c;">לִשְׁמֹר <span class="hebrew-sm">(על)</span></div>
|
||||
<div class="hebrew">בַּהוֹוֶה</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h2 style="font-family:sans-serif;direction:ltr;">Conjugation Card — BACK (collapsed)</h2>
|
||||
<div class="card-container">
|
||||
<div class="card-label">Back — default state</div>
|
||||
<div class="card-content">
|
||||
|
||||
<div class="hint">אֵיךְ אוֹמְרִים</div>
|
||||
<div class="hebrew">אַתָּה</div>
|
||||
<div class="hebrew" style="color:#1a1a8c;">לִשְׁמֹר <span class="hebrew-sm">(על)</span></div>
|
||||
<div class="hebrew">בַּהוֹוֶה</div>
|
||||
<hr>
|
||||
<div class="hebrew">שׁוֹמֵר (על)</div>
|
||||
|
||||
<details class="more-toggle"><summary class="more-header">מידע נוסף</summary>
|
||||
<div class="sec-label" style="text-align:center;display:block;">to guard; to keep, to maintain</div>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">שׁ.מ.ר</span></div>
|
||||
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span><span class="sec-val">פָּעַל</span></div>
|
||||
</div>
|
||||
<div class="divider" style="margin:6px 0;"></div>
|
||||
<div class="related-header">מִילִים קְשׁוּרוֹת</div>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="rw-word">מִשְׁמָר</span><span class="rw-meaning">guard, watch; shift</span></div>
|
||||
<div class="sec-label"><span class="rw-word">שׁוֹמֵר</span><span class="rw-meaning">guard, watchman</span></div>
|
||||
<div class="sec-label"><span class="rw-word">שְׁמִירָה</span><span class="rw-meaning">guarding, watching</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהִשָּׁמֵר</span><span class="rw-meaning">to beware, to be careful</span></div>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h2 style="font-family:sans-serif;direction:ltr;">Conjugation Card — BACK (expanded)</h2>
|
||||
<div class="card-container">
|
||||
<div class="card-label">Back — expanded</div>
|
||||
<div class="card-content">
|
||||
|
||||
<div class="hint">אֵיךְ אוֹמְרִים</div>
|
||||
<div class="hebrew">אַתָּה</div>
|
||||
<div class="hebrew" style="color:#1a1a8c;">לִשְׁמֹר <span class="hebrew-sm">(על)</span></div>
|
||||
<div class="hebrew">בַּהוֹוֶה</div>
|
||||
<hr>
|
||||
<div class="hebrew">שׁוֹמֵר (על)</div>
|
||||
|
||||
<details class="more-toggle" open><summary class="more-header">מידע נוסף</summary>
|
||||
<div class="sec-label" style="text-align:center;display:block;">to guard; to keep, to maintain</div>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">שׁ.מ.ר</span></div>
|
||||
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span><span class="sec-val">פָּעַל</span></div>
|
||||
</div>
|
||||
<div class="divider" style="margin:6px 0;"></div>
|
||||
<div class="related-header">מִילִים קְשׁוּרוֹת</div>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="rw-word">מִשְׁמָר</span><span class="rw-meaning">guard, watch; shift</span></div>
|
||||
<div class="sec-label"><span class="rw-word">שׁוֹמֵר</span><span class="rw-meaning">guard, watchman</span></div>
|
||||
<div class="sec-label"><span class="rw-word">שְׁמִירָה</span><span class="rw-meaning">guarding, watching</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהִשָּׁמֵר</span><span class="rw-meaning">to beware, to be careful</span></div>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
50000
data/en_50k.txt
Normal file
50000
data/en_50k.txt
Normal file
File diff suppressed because it is too large
Load diff
95256
data/words.json
95256
data/words.json
File diff suppressed because it is too large
Load diff
|
|
@ -0,0 +1,150 @@
|
|||
# Adaptive Sentence Difficulty Cloze — v0.20 Design Spec
|
||||
|
||||
**Date:** 2026-03-15
|
||||
**Status:** Approved
|
||||
**Release:** v0.20
|
||||
|
||||
## Problem
|
||||
|
||||
Cloze cards currently select example sentences by length alone: sentences of 6–12 words are preferred, and anything outside that range is ranked by its distance from 9 words. This ignores whether the surrounding context words are familiar to the learner. A sentence full of rare words is harder than one with common words, regardless of length.
|
||||
|
||||
## Solution
|
||||
|
||||
Replace the length-based `_score()` function in `epub_examples.py` with a **frequency-based difficulty score**. The easiest sentence (most common context words) becomes the cloze. All vetted sentences remain on the card, ordered easy→hard.
|
||||
|
||||
## Scoring Pipeline
|
||||
|
||||
### Token Frequency Lookup (5-tier)
|
||||
|
||||
Given a nikkud sentence token, resolve its frequency rank:
|
||||
|
||||
1. **Known mapping** — look up token in the nikkud→ktiv_male map built from words.json headwords, conjugations, and inflections (94k mappings). If found, look up the ktiv_male in the frequency data.
|
||||
2. **Nikkud prefix stripping** — use `_try_strip_prefix()` to strip validated Hebrew prefixes (בהוכלמש), then resolve the remainder via the known mapping.
|
||||
3. **Academy rules converter** — apply `nikkud_to_ktiv_male.convert()` (91.6% accuracy) to produce ktiv_male, look up in frequency data.
|
||||
4. **strip_nikkud fallback** — use `helpers.strip_nikkud()` as a lossy fallback.
|
||||
5. **Ktiv_male prefix stripping** — strip 1-2 character Hebrew prefixes from the converted/stripped form and look up the stem.
|
||||
|
||||
Tokens not found in any tier are assigned a default high rank (50,000).
|
||||
|
||||
**Coverage:** ~93% of example sentence tokens resolve to a frequency rank (measured empirically on 7,588 sentences).
|
||||
|
||||
**Frequency data source:** Use `frequency_lookup.py`, which auto-selects `frequency_clean.json` when available and falls back to `frequency_cache.json` otherwise.
|
||||
|
||||
### Sentence Difficulty Score
|
||||
|
||||
For a given word's candidate sentence:
|
||||
|
||||
1. Tokenize: split on whitespace, strip punctuation (`.,!?;:"'"״׳–—()[]{}`), split on maqaf (`־`).
|
||||
2. Exclude the target word's token using `cloze_word_start`/`cloze_word_end` offsets from the matched sentence.
|
||||
3. For each remaining token (length >= 2), resolve its frequency rank via the 5-tier pipeline.
|
||||
4. **Score = median frequency rank of context tokens.**
|
||||
|
||||
Lower score = easier (context words are more common). Median resists outliers (one rare proper noun shouldn't dominate).
|
||||
|
||||
### Integration Point
|
||||
|
||||
The scoring integrates into `epub_examples.py`'s existing `_score()` closure inside `update_words_json()` (line ~677). Currently:
|
||||
|
||||
```python
|
||||
def _score(s: dict) -> tuple[int,]:
|
||||
wc = s["word_count"]
|
||||
length_score = abs(wc - 9) if not (6 <= wc <= 12) else 0
|
||||
return (length_score,)
|
||||
```
|
||||
|
||||
New scoring replaces length with frequency-based difficulty. The `_score` function gains access to the frequency pipeline via closure over `nikkud_map`, `nikkud_index`, and `freq_data`, which are built once at the start of `update_words_json()`.
|
||||
|
||||
**Minimum sentence length:** Reduced from 4 words to 3 words (`MIN_WORDS = 3` in epub_examples.py). Hebrew is more concise than English — 3-word sentences are valid and common. This expands the candidate pool for cloze selection.
|
||||
|
||||
**Behavioral change:** Because `pool.sort(key=_score)` determines which 3 sentences are selected as `best = pool[:3]`, changing the scoring function changes **which sentences are selected**, not just their order. This is intentional — we want the easiest sentences as cloze candidates, not the closest-to-9-words ones. Existing cloze GUIDs will be preserved when the same sentence text is re-selected; entries where a different sentence wins will get new GUIDs.
|
||||
|
||||
## Data Model Changes
|
||||
|
||||
### words.json
|
||||
|
||||
The `examples.cloze` dict (single sentence) gains an optional `difficulty_score` field:
|
||||
|
||||
```json
|
||||
{
|
||||
"examples": {
|
||||
"vetted": [
|
||||
{"text": "...", "source": "...", "match_method": "..."},
|
||||
{"text": "...", "source": "...", "match_method": "..."}
|
||||
],
|
||||
"cloze": {
|
||||
"text": "...",
|
||||
"cloze_word_start": 5,
|
||||
"cloze_word_end": 10,
|
||||
"cloze_hint": null,
|
||||
"cloze_guid": "abc123",
|
||||
"difficulty_score": 234
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The vetted list is also sorted by difficulty (easiest first), so the card back shows sentences in pedagogically useful order.
|
||||
|
||||
### SCHEMA.yaml
|
||||
|
||||
Add `difficulty_score` as optional integer field under `examples.cloze`.
|
||||
|
||||
## Implementation Scope
|
||||
|
||||
### New file: `sentence_difficulty.py`
|
||||
|
||||
Standalone module for sentence scoring. No pipeline step — called by `epub_examples.py`.
|
||||
|
||||
- `score_sentence(sentence_text: str, target_start: int, target_end: int, nikkud_map: dict, nikkud_index: dict, freq_data: dict) -> int` — returns median context frequency rank. Uses `target_start`/`target_end` character offsets to exclude the cloze target token.
|
||||
- `build_nikkud_map(words: dict) -> dict[str, str]` — builds nikkud→ktiv_male lookup from words.json (headwords + conjugation forms + noun inflections). Returns `{nikkud_form: ktiv_male_form}`. Implementation note: should share iteration logic with `epub_examples._build_nikkud_index()` or derive from its output to avoid duplicating the traversal of words.json forms.
|
||||
- `_resolve_token_frequency(token: str, nikkud_map: dict, nikkud_index: dict, freq_data: dict) -> int` — the 5-tier lookup. Uses `_try_strip_prefix` from epub_examples (made importable by removing underscore or adding a public wrapper).
|
||||
|
||||
### Modified files
|
||||
|
||||
- **`epub_examples.py`**:
|
||||
- Import `sentence_difficulty.score_sentence` and `sentence_difficulty.build_nikkud_map`
|
||||
- In `update_words_json()`: build nikkud_map and load freq_data once at start (before per-word loop)
|
||||
- Replace `_score()` closure with frequency-based scoring that calls `score_sentence()`
|
||||
- Sort vetted list by difficulty score (easiest first)
|
||||
- Store `difficulty_score` in the cloze dict
|
||||
- Make `_try_strip_prefix` importable (rename to `try_strip_prefix` or add public alias)
|
||||
- **`frequency_lookup.py`** — add `get_freq_data() -> dict` public accessor to expose the loaded frequency dict (avoids accessing private `_freq` directly)
|
||||
- **`SCHEMA.yaml`** — add `difficulty_score` field
|
||||
- **`run.py`** — no changes; scoring happens inside epub_examples step
|
||||
|
||||
### Not modified
|
||||
|
||||
- **`apkg_builder.py`** — reads cloze as-is; vetted order is already respected
|
||||
- **`nikkud_to_ktiv_male.py`** — used as-is
|
||||
- **Card templates** — no changes needed
|
||||
|
||||
## Dependencies
|
||||
|
||||
- `nikkud_to_ktiv_male.convert()` — Academy rules converter (already written)
|
||||
- `epub_examples._try_strip_prefix()` / `_build_nikkud_index()` — nikkud prefix stripping and index
|
||||
- `frequency_lookup.py` — loads frequency data (auto-selects clean vs cache)
|
||||
- `helpers.strip_nikkud()` — fallback converter
|
||||
|
||||
## Validation
|
||||
|
||||
- **Unit tests** for `score_sentence()` with known easy/hard sentences
|
||||
- **Unit tests** for `_resolve_token_frequency()` covering all 5 tiers
|
||||
- **Integration test**: verify cloze selection picks easiest sentence, vetted list is sorted
|
||||
- **Spot check**: manually review 10 words with 3+ sentences to confirm ordering
|
||||
- **Regression**: existing tests pass, GUID coverage unchanged, deck validates
|
||||
|
||||
## Constraints
|
||||
|
||||
- `examples.cloze` remains a single dict (not converted to list)
|
||||
- No new Anki card types or fields
|
||||
- No runtime JS in Anki cards
|
||||
- No network calls during scoring
|
||||
- `difficulty_score` is informational metadata; card rendering doesn't depend on it
|
||||
- Existing cloze GUIDs preserved when the same sentence is re-selected
|
||||
|
||||
## Scope Exclusions (Future Work)
|
||||
|
||||
- **Pronominal suffix stripping** — would improve the ~7% unscored token rate; deferred (PROJECT_NOTES.md)
|
||||
- **Kamatz katan disambiguation** — requires morphological analysis; accepted limitation
|
||||
- **Per-learner adaptive difficulty** — requires Anki plugin; out of scope for static deck
|
||||
- **Multiple cloze sentences per card** — would require schema migration to list; deferred
|
||||
136
epub_examples.py
136
epub_examples.py
|
|
@ -18,7 +18,9 @@ import zipfile
|
|||
from html.parser import HTMLParser
|
||||
from pathlib import Path
|
||||
|
||||
import frequency_lookup
|
||||
from helpers import strip_nikkud
|
||||
from sentence_difficulty import build_nikkud_map, score_sentence
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -29,7 +31,7 @@ WORDS_JSON = DATA_DIR / "words.json"
|
|||
|
||||
# Book metadata: filename -> display name
|
||||
def _discover_epubs() -> dict[str, str]:
|
||||
"""Auto-discover all .epub files in EPUB_DIR, returning {filepath: display_name}."""
|
||||
"""Auto-discover all .epub and .txt files in EPUB_DIR, returning {filepath: display_name}."""
|
||||
if not EPUB_DIR.exists():
|
||||
return {}
|
||||
books: dict[str, str] = {}
|
||||
|
|
@ -50,11 +52,14 @@ def _discover_epubs() -> dict[str, str]:
|
|||
else:
|
||||
name = stem_stripped[:40]
|
||||
books[str(path)] = name
|
||||
# Also discover plain-text files (e.g. Ben Yehuda downloads)
|
||||
for path in sorted(EPUB_DIR.glob("*.txt")):
|
||||
books[str(path)] = path.stem
|
||||
return books
|
||||
|
||||
|
||||
# Sentence length bounds (word count)
|
||||
MIN_WORDS = 4
|
||||
MIN_WORDS = 3
|
||||
MAX_WORDS = 15
|
||||
|
||||
|
||||
|
|
@ -196,6 +201,20 @@ def extract_sentences_from_epub(epub_path: Path, book_name: str) -> list[dict]:
|
|||
return _split_into_sentences(full_text, book_name)
|
||||
|
||||
|
||||
def extract_sentences_from_text(text_path: Path, book_name: str) -> list[dict]:
|
||||
"""Extract sentences from a plain-text file (e.g. Ben Yehuda downloads).
|
||||
|
||||
Args:
|
||||
text_path: Path to the .txt file.
|
||||
book_name: Human-readable book name used as the ``source`` field.
|
||||
|
||||
Returns:
|
||||
List of ``{"text": str, "source": str}`` dicts.
|
||||
"""
|
||||
full_text = text_path.read_text(encoding="utf-8")
|
||||
return _split_into_sentences(full_text, book_name)
|
||||
|
||||
|
||||
# ── Sentence splitting ───────────────────────────────────────────
|
||||
|
||||
# Hebrew sentence terminators: period, exclamation, question mark, sof pasuk
|
||||
|
|
@ -431,6 +450,10 @@ def _try_strip_prefix(token: str, nikkud_index: dict) -> list[tuple[str, str, st
|
|||
return results
|
||||
|
||||
|
||||
# Public alias for use by sentence_difficulty module
|
||||
try_strip_prefix = _try_strip_prefix
|
||||
|
||||
|
||||
def _build_nikkud_index(words: dict) -> dict[str, list[tuple[str, str]]]:
|
||||
"""Build a mapping from nikkud form to list of (unique_key, match_type).
|
||||
|
||||
|
|
@ -480,7 +503,12 @@ def _build_nikkud_index(words: dict) -> dict[str, list[tuple[str, str]]]:
|
|||
|
||||
for field in ("singular", "plural", "construct_singular", "construct_plural"):
|
||||
sub = noun.get(field) or {}
|
||||
_add(sub.get("nikkud"), unique_key, "inflected")
|
||||
form = sub.get("nikkud")
|
||||
_add(form, unique_key, "inflected")
|
||||
# Index construct forms without maqaf too — modern text often
|
||||
# writes smichut as two space-separated words without maqaf
|
||||
if form and form.endswith("־"):
|
||||
_add(form[:-1], unique_key, "inflected")
|
||||
|
||||
pronominal = noun.get("pronominal_suffixes") or {}
|
||||
for _person, sub in pronominal.items():
|
||||
|
|
@ -632,6 +660,11 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
|
|||
|
||||
updated = 0
|
||||
|
||||
# Build frequency scoring infrastructure (once for all words)
|
||||
nikkud_index = _build_nikkud_index(words)
|
||||
nikkud_map = build_nikkud_map(words)
|
||||
freq_data = frequency_lookup.get_freq_data()
|
||||
|
||||
for unique_key, sent_list in matches.items():
|
||||
if unique_key not in words:
|
||||
continue
|
||||
|
|
@ -651,11 +684,18 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
|
|||
prefix_only = [s for s in unique if "prefix" in s["match_method"]]
|
||||
pool = direct if direct else prefix_only
|
||||
|
||||
# Score: prefer 6–12 word sentences
|
||||
# Score: prefer sentences with easier (more common) context words
|
||||
def _score(s: dict) -> tuple[int,]:
|
||||
wc = s["word_count"]
|
||||
length_score = abs(wc - 9) if not (6 <= wc <= 12) else 0
|
||||
return (length_score,)
|
||||
return (
|
||||
score_sentence(
|
||||
s["text"],
|
||||
s["char_offset"],
|
||||
s["char_end"],
|
||||
nikkud_map,
|
||||
nikkud_index,
|
||||
freq_data,
|
||||
),
|
||||
)
|
||||
|
||||
pool.sort(key=_score)
|
||||
best = pool[:3]
|
||||
|
|
@ -690,6 +730,7 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
|
|||
"cloze_word_end": top["char_end"],
|
||||
"cloze_hint": None,
|
||||
"cloze_guid": cloze_guid,
|
||||
"difficulty_score": _score(top)[0],
|
||||
}
|
||||
elif is_confusable:
|
||||
examples.pop("cloze", None)
|
||||
|
|
@ -697,9 +738,87 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
|
|||
examples["rejected_count"] = 0
|
||||
updated += 1
|
||||
|
||||
# Deduplicate shared examples across confusable groups
|
||||
cleared = _deduplicate_confusable_examples(words)
|
||||
if cleared:
|
||||
logger.info(f" Cleared shared examples from {cleared} confusable entries")
|
||||
|
||||
return updated
|
||||
|
||||
|
||||
def _deduplicate_confusable_examples(words: dict) -> int:
|
||||
"""Remove shared examples from less-common confusable group members.
|
||||
|
||||
After example matching assigns sentences, confusable entries often share
|
||||
identical examples (matched via shared nikkud forms). This function keeps
|
||||
examples only on the highest-frequency member, clearing others.
|
||||
|
||||
Args:
|
||||
words: The full words.json dict, modified in place (examples already
|
||||
assigned).
|
||||
|
||||
Returns:
|
||||
Count of entries whose examples were cleared.
|
||||
"""
|
||||
from collections import defaultdict
|
||||
|
||||
# Build confusable group map: group_id → [unique_key, ...]
|
||||
group_map: dict[tuple[str, ...], list[str]] = defaultdict(list)
|
||||
for key, entry in words.items():
|
||||
cg = entry.get("confusable_group")
|
||||
if cg:
|
||||
group_id = tuple(sorted(cg))
|
||||
group_map[group_id].append(key)
|
||||
|
||||
cleared = 0
|
||||
|
||||
for _group_id, members in group_map.items():
|
||||
if len(members) < 2:
|
||||
continue
|
||||
|
||||
# Collect vetted sentence text sets per member
|
||||
member_texts: dict[str, frozenset[str]] = {}
|
||||
for key in members:
|
||||
vetted = (words[key].get("examples") or {}).get("vetted") or []
|
||||
texts = frozenset(e.get("text", "") for e in vetted)
|
||||
member_texts[key] = texts
|
||||
|
||||
# Find members with identical non-empty sentence sets
|
||||
# Group members by their sentence set
|
||||
text_groups: dict[frozenset[str], list[str]] = defaultdict(list)
|
||||
for key, texts in member_texts.items():
|
||||
if texts: # skip entries with no examples
|
||||
text_groups[texts].append(key)
|
||||
|
||||
# For each set of members sharing identical examples, keep only the
|
||||
# highest-frequency one
|
||||
for _texts, sharing_keys in text_groups.items():
|
||||
if len(sharing_keys) < 2:
|
||||
continue
|
||||
|
||||
# Sort by frequency_rank (lower = more common = winner).
|
||||
# No frequency → sort last (use large sentinel).
|
||||
# Tie-break: alphabetical by unique_key.
|
||||
def _sort_key(k: str) -> tuple[int, str]:
|
||||
rank = words[k].get("frequency_rank")
|
||||
return (rank if rank is not None else 999999, k)
|
||||
|
||||
sharing_keys.sort(key=_sort_key)
|
||||
winner = sharing_keys[0]
|
||||
losers = sharing_keys[1:]
|
||||
|
||||
for loser_key in losers:
|
||||
entry = words[loser_key]
|
||||
examples = entry.get("examples") or {}
|
||||
examples["vetted"] = []
|
||||
examples.pop("cloze", None)
|
||||
entry["examples"] = examples
|
||||
cleared += 1
|
||||
logger.debug(f" Cleared examples from {loser_key} (kept on {winner})")
|
||||
|
||||
return cleared
|
||||
|
||||
|
||||
# ── Public API ───────────────────────────────────────────────────
|
||||
|
||||
|
||||
|
|
@ -720,6 +839,9 @@ def run(words: dict) -> dict:
|
|||
|
||||
for filepath, book_name in _discover_epubs().items():
|
||||
path = Path(filepath)
|
||||
if path.suffix == ".txt":
|
||||
sentences = extract_sentences_from_text(path, book_name)
|
||||
else:
|
||||
sentences = extract_sentences_from_epub(path, book_name)
|
||||
book_counts[book_name] = len(sentences)
|
||||
all_sentences.extend(sentences)
|
||||
|
|
|
|||
|
|
@ -74,6 +74,16 @@ def get_frequency_rank(word_no_nikkud: str) -> int | None:
|
|||
return _freq.get(clean)
|
||||
|
||||
|
||||
def get_freq_data() -> dict[str, int]:
    """Return the module-level frequency mapping (word -> rank).

    When the mapping is still empty, ``load()`` is called first so callers
    never have to trigger loading themselves.
    """
    if _freq:
        return _freq
    load()
    return _freq
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
|
||||
load()
|
||||
|
|
|
|||
185
nikkud_to_ktiv_male.py
Normal file
185
nikkud_to_ktiv_male.py
Normal file
|
|
@ -0,0 +1,185 @@
|
|||
"""Convert nikkud (vocalized) Hebrew to ktiv male (plene spelling).
|
||||
|
||||
Implements Hebrew Academy rules for matres lectionis insertion:
|
||||
- Rule A: U vowel (kubutz) → always insert vav
|
||||
- Rule B: O vowel (holam on non-vav) → insert vav
|
||||
- Rule C: I vowel (hiriq) → insert yod (conditionally)
|
||||
- Rule D: E vowel (tsere) → insert yod (limited cases)
|
||||
- Rule E/F: Consonantal vav/yod doubling
|
||||
|
||||
Reference: https://hebrew-academy.org.il/topic/hahlatot/missingvocalizationspelling/
|
||||
"""
|
||||
|
||||
import unicodedata
|
||||
|
||||
# Hebrew nikkud code points
|
||||
SHVA = "\u05b0"
|
||||
HATAF_SEGOL = "\u05b1"
|
||||
HATAF_PATAH = "\u05b2"
|
||||
HATAF_KAMATZ = "\u05b3"
|
||||
HIRIQ = "\u05b4"
|
||||
TSERE = "\u05b5"
|
||||
SEGOL = "\u05b6"
|
||||
PATAH = "\u05b7"
|
||||
KAMATZ = "\u05b8"
|
||||
HOLAM = "\u05b9"
|
||||
HOLAM_HASER = "\u05ba"
|
||||
KUBUTZ = "\u05bb"
|
||||
DAGESH = "\u05bc"
|
||||
METEG = "\u05bd"
|
||||
RAFE = "\u05bf"
|
||||
SHIN_DOT = "\u05c1"
|
||||
SIN_DOT = "\u05c2"
|
||||
|
||||
VAV = "ו"
|
||||
YOD = "י"
|
||||
MAQAF = "־"
|
||||
|
||||
VOWELS = {SHVA, HATAF_SEGOL, HATAF_PATAH, HATAF_KAMATZ, HIRIQ, TSERE, SEGOL, PATAH, KAMATZ, HOLAM, HOLAM_HASER, KUBUTZ}
|
||||
|
||||
NIKKUD_MARKS = VOWELS | {DAGESH, METEG, RAFE, SHIN_DOT, SIN_DOT}
|
||||
|
||||
|
||||
def _parse_segments(text: str) -> list[tuple[str, list[str]]]:
|
||||
"""Parse nikkud text into (character, [marks]) segments."""
|
||||
segments: list[tuple[str, list[str]]] = []
|
||||
cur_char: str | None = None
|
||||
cur_marks: list[str] = []
|
||||
|
||||
for ch in text:
|
||||
if unicodedata.category(ch) == "Mn":
|
||||
cur_marks.append(ch)
|
||||
else:
|
||||
if cur_char is not None:
|
||||
segments.append((cur_char, cur_marks))
|
||||
cur_char = ch
|
||||
cur_marks = []
|
||||
|
||||
if cur_char is not None:
|
||||
segments.append((cur_char, cur_marks))
|
||||
|
||||
return segments
|
||||
|
||||
|
||||
def _get_vowel(marks: list[str]) -> str | None:
    """Return the first mark in *marks* that is a member of ``VOWELS``, or None."""
    return next((mark for mark in marks if mark in VOWELS), None)
|
||||
|
||||
|
||||
def _has_dagesh(marks: list[str]) -> bool:
    """Return True when a dagesh point is among the combining marks."""
    return any(mark == DAGESH for mark in marks)
|
||||
|
||||
|
||||
def _is_hebrew_letter(ch: str) -> bool:
|
||||
return "\u05d0" <= ch <= "\u05ea"
|
||||
|
||||
|
||||
def _is_word_separator(ch: str) -> bool:
    """Return True when *ch* ends a word: any whitespace character or a maqaf."""
    return ch.isspace() or ch == MAQAF


def _letters_left_in_word(segments: list[tuple[str, list[str]]], start: int) -> int:
    """Count Hebrew letters from ``segments[start]`` up to the next word separator."""
    count = 0
    for idx in range(start, len(segments)):
        base = segments[idx][0]
        if _is_word_separator(base):
            break
        if _is_hebrew_letter(base):
            count += 1
    return count


def convert(text: str) -> str:
    """Convert nikkud Hebrew text to ktiv male.

    All combining marks are stripped and matres lectionis (vav/yod) are
    inserted following the rules listed in the module docstring.

    Position-dependent rules (the "short suffix" exception of Rule C and the
    "word-initial" condition of Rule D) are evaluated per word: whitespace and
    maqaf start a new word. For single-word input the result is the same as
    counting over the whole text.

    Args:
        text: Vocalized Hebrew; may contain several words and non-Hebrew
            characters, which are copied through without their marks.

    Returns:
        The unvocalized plene spelling of *text*.
    """
    segments = _parse_segments(text)
    result: list[str] = []
    letters_before = 0  # Hebrew letters already seen in the current word

    for i, (ch, marks) in enumerate(segments):
        if not _is_hebrew_letter(ch):
            # Non-Hebrew character: output as-is (its marks are dropped).
            if _is_word_separator(ch):
                letters_before = 0
            result.append(ch)
            continue

        hebrew_pos = letters_before  # 0-based index of this letter in its word
        letters_before += 1

        vowel = _get_vowel(marks)
        has_dag = _has_dagesh(marks)
        has_next = i + 1 < len(segments)

        # The base letter is always emitted; everything below only decides
        # whether a mater lectionis follows it.
        result.append(ch)

        # --- Rule A: U vowel (kubutz) -> always add vav ---
        if vowel == KUBUTZ:
            result.append(VAV)
            continue

        # --- Shuruk: vav carrying only a dagesh is itself the mater ---
        if ch == VAV and has_dag and vowel is None:
            continue

        # --- Rule B: O vowel (holam) -> add vav ---
        if vowel in (HOLAM, HOLAM_HASER):
            if ch != VAV:
                # Exception: holam before aleph (pe-aleph verbs) takes no vav,
                # e.g. תֹּאבַד→תאבד, יֹאבַד→יאבד, נֹאבַד→נאבד
                next_is_aleph = has_next and segments[i + 1][0] == "א"
                if not next_is_aleph:
                    result.append(VAV)
            # When ch is vav itself (holam male) the vav is already in place.
            continue

        # --- Rule C: I vowel (hiriq) -> conditionally add yod ---
        if vowel == HIRIQ:
            if ch == YOD:
                # Yod already present, don't double it.
                continue

            if has_next and segments[i + 1][0] == YOD:
                # The following letter already is the yod.
                continue

            add_yod = True
            if has_next:
                next_ch, next_marks = segments[i + 1]
                next_vowel = _get_vowel(next_marks)

                if next_vowel == SHVA and not _has_dagesh(next_marks):
                    # Shva without dagesh on the next consonant (shva nach)
                    # closes the syllable -> no yod.
                    add_yod = False
                elif next_vowel is None and _is_hebrew_letter(next_ch):
                    # Vowel-less next consonant close to the end of the word
                    # (short suffix such as תי, נו) -> no yod.
                    if _letters_left_in_word(segments, i + 1) <= 2:
                        add_yod = False

            if add_yod:
                result.append(YOD)
            continue

        # --- Rule D: E vowel (tsere) -> generally NO yod ---
        # Exception: tsere before a guttural/resh gets yod only on the first
        # or second letter of the word (dagesh substitution),
        # e.g. הֵחֵל→היחל, תֵּאָבֵד→תיאבד, הֵרִיעַ→היריע
        # but not mid-word: מְסַפֵּר→מספר, מְעַבֵּר→מעבר
        if vowel == TSERE:
            if has_next and segments[i + 1][0] in "אהחער" and hebrew_pos <= 1:
                result.append(YOD)
            continue

        # All other vowels (patah, kamatz, segol, shva, hataf-*) and bare
        # letters: no mater lectionis is inserted.

    return "".join(result)
|
||||
|
|
@ -40,6 +40,9 @@ SAVE_INTERVAL = 50 # write words.json every N processed entries
|
|||
|
||||
WORDS_JSON = Path(__file__).parent / "data" / "words.json"
|
||||
|
||||
# Regex for Hebrew prepositions wrapped in parentheses, e.g. "(על)" or "(ב-)"
|
||||
HBPAREN_RE = re.compile(r"\(([\u05b0-\u05ea\u05f0-\u05f4\-]+)\)")
|
||||
|
||||
BINYAN_NAMES: tuple[str, ...] = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al")
|
||||
_BINYAN_NAMES_LOWER: tuple[str, ...] = tuple(b.lower() for b in BINYAN_NAMES)
|
||||
|
||||
|
|
@ -459,15 +462,29 @@ def _parse_noun_gender_mishkal(soup: BeautifulSoup) -> tuple[str, str]:
|
|||
"""
|
||||
Extract (gender, mishkal) from the PoS section of the detail page.
|
||||
Returns ("masculine"|"feminine"|"", mishkal_english|"").
|
||||
|
||||
Pealim HTML structure:
|
||||
<p>Noun – <a href="/dict/?pos=noun&nm=qetel"><i>ketel</i> pattern</a>, masculine</p>
|
||||
The mishkal is in the <i> tag (k-notation, e.g. "ketel") or the nm= URL param (q-notation).
|
||||
Some nouns have no mishkal link: <p>Noun – masculine</p>
|
||||
"""
|
||||
gender = ""
|
||||
mishkal = ""
|
||||
|
||||
# Try various selectors that pealim uses for PoS info
|
||||
pos_section = soup.find("div", class_="pos") or soup.find("p", class_="pos")
|
||||
# Find the PoS <p> tag — on pealim detail pages it's a bare <p> like
|
||||
# "Noun – ketel pattern, masculine" or "Adjective – katul pattern"
|
||||
pos_section = None
|
||||
for p in soup.find_all("p"):
|
||||
text = p.get_text(" ", strip=True)
|
||||
if re.match(r"^(Noun|Adjective)\b", text):
|
||||
pos_section = p
|
||||
break
|
||||
|
||||
# Fall back to older selectors (div.pos, p.pos, div.page-header)
|
||||
if not pos_section:
|
||||
# Look for it in the page header area
|
||||
pos_section = soup.find("div", class_="page-header")
|
||||
pos_section = (
|
||||
soup.find("div", class_="pos") or soup.find("p", class_="pos") or soup.find("div", class_="page-header")
|
||||
)
|
||||
|
||||
if pos_section:
|
||||
text = pos_section.get_text(" ", strip=True)
|
||||
|
|
@ -476,13 +493,21 @@ def _parse_noun_gender_mishkal(soup: BeautifulSoup) -> tuple[str, str]:
|
|||
if raw in text.lower():
|
||||
gender = canonical
|
||||
break
|
||||
# Mishkal detection: look for CaCaC-style patterns
|
||||
mishkal_match = re.search(r"\b([A-Z][a-zA-Z\']+)\b", text)
|
||||
if mishkal_match:
|
||||
candidate = mishkal_match.group(1)
|
||||
# Validate: mishkal names contain uppercase letters in CaCaC pattern
|
||||
if re.match(r"^[A-Za-z\']+$", candidate) and any(c.isupper() for c in candidate):
|
||||
mishkal = candidate
|
||||
|
||||
# Mishkal detection: extract from <a href="...nm=XXXX"><i>YYYY</i> pattern</a>
|
||||
# Nouns use nm= param, adjectives use am= param
|
||||
mishkal_link = pos_section.find("a", href=re.compile(r"[na]m="))
|
||||
if mishkal_link:
|
||||
# Prefer <i> tag text (k-notation, matches _MISHKAL_HEBREW_Q after k→q)
|
||||
i_tag = mishkal_link.find("i")
|
||||
if i_tag:
|
||||
mishkal = i_tag.get_text(strip=True)
|
||||
else:
|
||||
# Fall back to nm= URL parameter (already q-notation)
|
||||
href = mishkal_link.get("href", "")
|
||||
nm_match = re.search(r"[na]m=([a-zA-Z']+)", href)
|
||||
if nm_match:
|
||||
mishkal = nm_match.group(1)
|
||||
|
||||
# Also check the og:description or breadcrumbs for gender
|
||||
if not gender:
|
||||
|
|
@ -926,9 +951,17 @@ def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: di
|
|||
binyan = _extract_binyan_from_page(mo_soup)
|
||||
|
||||
meaning = ""
|
||||
prep: str | None = None
|
||||
lead_div = mo_soup.find("div", class_="lead")
|
||||
if lead_div:
|
||||
meaning = lead_div.get_text(strip=True)
|
||||
# Extract preposition(s) from the lead text, e.g. "(על)" → "על"
|
||||
prep_matches = HBPAREN_RE.findall(meaning)
|
||||
if prep_matches:
|
||||
prep = " ".join(prep_matches)
|
||||
# Fall back to any prep already stored (e.g. from a previous manual edit)
|
||||
if prep is None:
|
||||
prep = existing.get("prep")
|
||||
|
||||
# Parse active forms
|
||||
mo_active = _parse_conjugation_table(mo_soup, passive=False)
|
||||
|
|
@ -980,7 +1013,7 @@ def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: di
|
|||
"binyan": binyan,
|
||||
"binyan_hebrew": BINYAN_HEBREW.get(binyan, ""),
|
||||
"meaning": meaning,
|
||||
"prep": existing.get("prep"),
|
||||
"prep": prep,
|
||||
"active_forms": active_forms,
|
||||
"hufal_pual_forms": hufal_pual_forms,
|
||||
"reference_form_passive": reference_form_passive,
|
||||
|
|
|
|||
|
|
@ -82,10 +82,13 @@ BINYAN_HEBREW: dict[str, str] = {
|
|||
|
||||
# Regex for extracting emoji characters
|
||||
EMOJI_RE = re.compile(
|
||||
r"[\U0001F300-\U0001FFFF\U00002600-\U000027BF\U0001F000-\U0001F9FF\u2600-\u26FF\u2700-\u27BF]+",
|
||||
r"[\U0001F300-\U0001FFFF\U00002600-\U000027BF\U0001F000-\U0001F9FF\u2600-\u26FF\u2700-\u27BF\uFE0E\uFE0F\u200D]+",
|
||||
re.UNICODE,
|
||||
)
|
||||
|
||||
# Regex for extracting Hebrew prepositions wrapped in parentheses, e.g. "(על)" or "(ב-)"
|
||||
HBPAREN_RE = re.compile(r"\(([\u05b0-\u05ea\u05f0-\u05f4\-]+)\)")
|
||||
|
||||
# Fields that must never be overwritten when updating an existing entry
|
||||
PROTECTED_FIELDS = frozenset(
|
||||
[
|
||||
|
|
@ -149,6 +152,7 @@ def _default_entry() -> dict:
|
|||
"image": None,
|
||||
"image_source": None,
|
||||
"hint": "",
|
||||
"prep": None,
|
||||
"shared_roots": [],
|
||||
"confusable_group": None,
|
||||
"confusables_guid": None,
|
||||
|
|
@ -170,8 +174,9 @@ def _extract_emoji(text: str) -> str | None:
|
|||
|
||||
|
||||
def _clean_meaning(raw: str) -> str:
|
||||
"""Strip emoji and extra whitespace from a raw meaning string."""
|
||||
"""Strip emoji, Hebrew parenthesized prepositions, and extra whitespace from a raw meaning string."""
|
||||
cleaned = EMOJI_RE.sub("", raw)
|
||||
cleaned = HBPAREN_RE.sub("", cleaned)
|
||||
return " ".join(cleaned.split())
|
||||
|
||||
|
||||
|
|
@ -453,6 +458,9 @@ def _merge_row(
|
|||
emoji = _extract_emoji(meaning_raw_raw)
|
||||
tags = _build_tags(pos_en, root)
|
||||
audio_file = _compute_audio_file(slug, ktiv_male)
|
||||
# Extract Hebrew preposition(s) from the raw meaning (e.g. "(על)" → "על")
|
||||
prep_matches = HBPAREN_RE.findall(meaning_raw)
|
||||
prep: str | None = " ".join(prep_matches) if prep_matches else None
|
||||
|
||||
# ---- locate existing entry ----
|
||||
unique_key: str | None = slug_index.get(slug) if slug else None
|
||||
|
|
@ -468,6 +476,7 @@ def _merge_row(
|
|||
entry["pos_hebrew"] = pos_heb
|
||||
entry["meaning"] = meaning
|
||||
entry["meaning_raw"] = meaning_raw
|
||||
entry["prep"] = prep
|
||||
entry["audio_url"] = audio_url
|
||||
entry["audio_file"] = audio_file
|
||||
entry["tags"] = tags
|
||||
|
|
@ -484,6 +493,7 @@ def _merge_row(
|
|||
entry["pos_hebrew"] = pos_heb
|
||||
entry["meaning"] = meaning
|
||||
entry["meaning_raw"] = meaning_raw
|
||||
entry["prep"] = prep
|
||||
entry["emoji"] = emoji
|
||||
entry["emoji_source"] = "from_pealim" if emoji else None
|
||||
entry["audio_url"] = audio_url
|
||||
|
|
|
|||
|
|
@ -20,8 +20,11 @@ from pathlib import Path
|
|||
|
||||
import requests
|
||||
|
||||
sys.path.insert(0, "/home/node/projects")
|
||||
import load_keeshare
|
||||
|
||||
REPO_API = "https://git.nevo.engineer/api/v1/repos/nevo/hebrew_flash_cards"
|
||||
FORGEJO_TOKEN = "f023bd4cfd4b77aac584647f2fa8481df3906578"
|
||||
FORGEJO_TOKEN: str = load_keeshare.get_entry("git.nevo.engineer")["password"]
|
||||
OUTPUT_DIR = Path(__file__).parent / "output"
|
||||
|
||||
# All deck variants to include in release
|
||||
|
|
|
|||
269
scripts/assign_pseudo_frequency.py
Normal file
269
scripts/assign_pseudo_frequency.py
Normal file
|
|
@ -0,0 +1,269 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Assign pseudo-frequency to confusable groups using English word frequency.
|
||||
|
||||
Problem: Confusable entries share the same ktiv_male and thus the same Hebrew
|
||||
frequency rank. This script uses English frequency to differentiate them so
|
||||
Anki sorts more-common meanings first.
|
||||
|
||||
Algorithm:
|
||||
1. For each confusable group where all entries share the same Hebrew frequency,
|
||||
extract the first meaningful English keyword from each entry's meaning field.
|
||||
2. Look up English frequency rank for each keyword.
|
||||
3. Assign pseudo_frequency: the most frequent English meaning keeps the original
|
||||
Hebrew rank; less frequent meanings get progressively higher (worse) ranks
|
||||
by adding an offset (100 * position in group).
|
||||
|
||||
Usage:
|
||||
python3 scripts/assign_pseudo_frequency.py # assign and save
|
||||
python3 scripts/assign_pseudo_frequency.py --dry-run # preview only
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PROJECT_ROOT = Path(__file__).parent.parent
|
||||
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
|
||||
EN_FREQ_PATH = PROJECT_ROOT / "data" / "en_50k.txt"
|
||||
|
||||
# Words too common/vague to use as frequency signal
|
||||
_EN_STOP = frozenset(
|
||||
{
|
||||
"to",
|
||||
"be",
|
||||
"a",
|
||||
"an",
|
||||
"the",
|
||||
"of",
|
||||
"in",
|
||||
"on",
|
||||
"at",
|
||||
"for",
|
||||
"and",
|
||||
"with",
|
||||
"by",
|
||||
"or",
|
||||
"but",
|
||||
"not",
|
||||
"as",
|
||||
"its",
|
||||
"it",
|
||||
"is",
|
||||
"was",
|
||||
"are",
|
||||
"from",
|
||||
"that",
|
||||
"this",
|
||||
"have",
|
||||
"has",
|
||||
"had",
|
||||
"do",
|
||||
"does",
|
||||
"did",
|
||||
"will",
|
||||
"would",
|
||||
"can",
|
||||
"could",
|
||||
"may",
|
||||
"might",
|
||||
"shall",
|
||||
"should",
|
||||
"must",
|
||||
"no",
|
||||
"yes",
|
||||
"very",
|
||||
"too",
|
||||
"also",
|
||||
"just",
|
||||
"only",
|
||||
"so",
|
||||
"up",
|
||||
"out",
|
||||
"into",
|
||||
"over",
|
||||
"after",
|
||||
"before",
|
||||
"about",
|
||||
"more",
|
||||
"than",
|
||||
"other",
|
||||
"some",
|
||||
"any",
|
||||
"all",
|
||||
"each",
|
||||
"every",
|
||||
"both",
|
||||
"few",
|
||||
"many",
|
||||
"much",
|
||||
"most",
|
||||
"such",
|
||||
"own",
|
||||
"same",
|
||||
"well",
|
||||
"still",
|
||||
"even",
|
||||
"how",
|
||||
"what",
|
||||
"when",
|
||||
"where",
|
||||
"which",
|
||||
"who",
|
||||
"whom",
|
||||
"whose",
|
||||
"why",
|
||||
"because",
|
||||
"if",
|
||||
"then",
|
||||
"else",
|
||||
"while",
|
||||
"until",
|
||||
"though",
|
||||
"whether",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def _load_en_freq() -> dict[str, int]:
|
||||
"""Load English frequency data: word -> rank (1 = most common).

Reads EN_FREQ_PATH as UTF-8, one entry per line. Only the first
whitespace-separated token of each non-blank line is used, lower-cased;
a word that occurs more than once keeps the rank of its first occurrence.
"""
|
||||
freq: dict[str, int] = {}
|
||||
rank = 1
|
||||
with open(EN_FREQ_PATH, encoding="utf-8") as f:
|
||||
for line in f:
|
||||
# Any further columns on the line are ignored.
parts = line.strip().split()
|
||||
if parts:
|
||||
word = parts[0].lower()
|
||||
# First occurrence wins.
if word not in freq:
|
||||
freq[word] = rank
|
||||
# NOTE(review): indentation is lost in this view, so it is unclear whether
# rank advances per stored word or per line - confirm against the file.
rank += 1
|
||||
return freq
|
||||
|
||||
|
||||
def _extract_keywords(meaning: str) -> list[str]:
    """Pull candidate English keywords out of a meaning string.

    Parenthesized spans and punctuation are replaced by spaces, then every
    remaining token longer than two characters that is not in ``_EN_STOP`` is
    kept, lower-cased, in its original order.
    """
    without_parens = re.sub(r"\([^)]*\)", " ", meaning)
    tokens = re.sub(r"[^\w\s]", " ", without_parens).split()

    keywords: list[str] = []
    for token in tokens:
        lowered = token.lower()
        if len(token) > 2 and lowered not in _EN_STOP:
            keywords.append(lowered)
    return keywords
|
||||
|
||||
|
||||
def assign_pseudo_frequencies(
    words: dict,
    en_freq: dict[str, int],
    dry_run: bool = False,
) -> int:
    """Assign ``pseudo_frequency`` to members of undifferentiated confusable groups.

    Entries are grouped by ``confusables_guid``. A group is processed only when
    all its members have the same ``frequency`` value (possibly all ``None``)
    and their English keywords yield at least two different ranks. Members are
    then ordered by English rank: with a shared Hebrew frequency, the member at
    position ``p`` gets ``frequency + 100 * p``; without one, each member gets
    ``50000 + english_rank``.

    Args:
        words: The words.json mapping; updated in place unless *dry_run*.
        en_freq: English word -> rank (lower = more common).
        dry_run: When true, nothing is written to *words*; each would-be
            assignment is logged instead.

    Returns:
        Number of assignments made, or that would be made in a dry run.
    """
    # Group by confusables_guid
    groups: dict[str, list[str]] = defaultdict(list)
    for key, entry in words.items():
        guid = entry.get("confusables_guid")
        if guid:
            groups[guid].append(key)

    changes = 0
    assigned_groups = 0
    skipped_diff = 0
    skipped_no_en = 0

    for keys in groups.values():
        freqs = [words[k].get("frequency") for k in keys]

        # Skip groups that are already differentiated
        if len(set(freqs)) > 1:
            skipped_diff += 1
            continue

        base_freq = freqs[0]  # All same (or all None)

        # English rank of the first of up to five keywords found in en_freq
        en_ranks: list[tuple[int, str]] = []  # (en_rank, key)
        for key in keys:
            # "meaning" may be present but null, so normalise to "" first.
            keywords = _extract_keywords(words[key].get("meaning") or "")
            en_rank = 999_999
            for kw in keywords[:5]:
                found = en_freq.get(kw)
                if found is not None:
                    en_rank = found
                    break
            en_ranks.append((en_rank, key))

        # Sort by English frequency (lower rank = more common)
        en_ranks.sort()

        # All members share one English rank -> no signal to order them by
        if len({r for r, _ in en_ranks}) <= 1:
            skipped_no_en += 1
            continue

        assigned_groups += 1

        # Most common meaning keeps the base rank, others get an offset
        for position, (en_rank, key) in enumerate(en_ranks):
            pseudo = base_freq + position * 100 if base_freq is not None else 50000 + en_rank

            # Counted in both modes so a dry run reports what would change.
            changes += 1

            if dry_run:
                meaning = (words[key].get("meaning") or "")[:40]
                logger.info(
                    " [en:%5d] pseudo=%6d %s",
                    en_rank,
                    pseudo,
                    meaning,
                )
            else:
                words[key]["pseudo_frequency"] = pseudo

    logger.info(
        "Pseudo-frequency: %d groups assigned, %d already differentiated, %d no English signal",
        assigned_groups,
        skipped_diff,
        skipped_no_en,
    )
    return changes
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: load data, assign pseudo-frequencies, save.

    With ``--dry-run`` the assignments are only previewed and words.json is
    left untouched.
    """
    cli = argparse.ArgumentParser(description="Assign pseudo-frequency to confusables")
    cli.add_argument("--dry-run", action="store_true", help="Preview without saving")
    options = cli.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    logger.info("Loading English frequency data: %s", EN_FREQ_PATH)
    en_freq = _load_en_freq()
    logger.info("English frequency: %d entries", len(en_freq))

    with open(WORDS_JSON, encoding="utf-8") as src:
        words: dict = json.load(src)

    changes = assign_pseudo_frequencies(words, en_freq, dry_run=options.dry_run)

    if not options.dry_run:
        with open(WORDS_JSON, "w", encoding="utf-8") as dst:
            json.dump(words, dst, ensure_ascii=False, indent=2)
        logger.info("Saved %d pseudo-frequency assignments to words.json", changes)
        return

    logger.info("Dry run — %d changes would be made", changes)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -685,6 +685,112 @@ def test_no_stripped_form_sentence_collisions(data: dict[str, Any]) -> None:
|
|||
_pass(name)
|
||||
|
||||
|
||||
def test_no_shared_confusable_examples(data: dict[str, Any]) -> None:
    """Fail when two members of a confusable group carry identical vetted examples.

    Members are grouped by their sorted ``confusable_group`` list. Within a
    group, each member's non-empty set of vetted sentence texts must be unique;
    a repeat means the deduplication step in epub_examples.py did not leave the
    examples on a single member.
    """
    from collections import defaultdict

    name = "no_shared_confusable_examples"
    errors: list[str] = []

    # Build confusable group map
    members_by_group: dict[tuple[str, ...], list[str]] = defaultdict(list)
    for key, entry in data.items():
        group = entry.get("confusable_group")
        if group:
            members_by_group[tuple(sorted(group))].append(key)

    for members in members_by_group.values():
        if len(members) < 2:
            continue

        # Maps each sentence-text set to the first member seen carrying it.
        seen: dict[frozenset[str], str] = {}
        for key in members:
            vetted = (data[key].get("examples") or {}).get("vetted") or []
            texts = frozenset(e.get("text", "") for e in vetted)
            if not texts:
                continue
            if texts not in seen:
                seen[texts] = key
                continue
            meaning_a = (data[seen[texts]].get("meaning") or "")[:30]
            meaning_b = (data[key].get("meaning") or "")[:30]
            errors.append(
                f"{seen[texts]} ({meaning_a}) and {key} ({meaning_b}) share {len(texts)} identical example(s)"
            )

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_no_hebrew_in_meaning(data: dict[str, Any]) -> None:
    """Fail when an English meaning still shows Hebrew letters after cleaning.

    Each ``meaning`` is first stripped of Hebrew-block runs and surplus
    whitespace (the original comment describes this as the same cleaning
    pipeline apkg_builder applies); any Hebrew letter left over is reported.
    """
    name = "no_hebrew_in_meaning"
    hebrew_re = re.compile(r"[\u05D0-\u05EA]")
    hebrew_run_re = re.compile(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*")
    multi_space_re = re.compile(r"\s{2,}")

    errors: list[str] = []
    for key, entry in data.items():
        cleaned = hebrew_run_re.sub("", entry.get("meaning") or "")
        cleaned = multi_space_re.sub(" ", cleaned).strip(", ;:")
        if hebrew_re.search(cleaned):
            errors.append(f"[{key}] meaning still contains Hebrew after cleaning: {cleaned!r}")

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_mishkal_consistency(data: dict[str, Any]) -> None:
    """Check that ``mishkal_hebrew`` agrees with ``mishkal`` on every inflection.

    For both ``noun_inflection`` and ``adjective_inflection``: a Hebrew mishkal
    without an English one is an error, and when both are present the Hebrew
    value must equal ``_mishkal_to_hebrew(mishkal)`` whenever that conversion
    returns something. The check is skipped with a warning when the converter
    cannot be imported.
    """
    name = "mishkal_consistency"
    errors: list[str] = []

    try:
        from pealim_detail_scrape import _mishkal_to_hebrew
    except ImportError:
        _warn(name, ["Could not import _mishkal_to_hebrew — skipping"])
        return

    for key, entry in data.items():
        for infl_key in ("noun_inflection", "adjective_inflection"):
            infl = entry.get(infl_key)
            if not infl:
                continue

            mishkal_eng = infl.get("mishkal") or ""
            mishkal_heb = infl.get("mishkal_hebrew") or ""
            if not mishkal_heb:
                # Nothing to compare against.
                continue

            if not mishkal_eng:
                errors.append(f"[{key}] {infl_key}: has mishkal_hebrew but no mishkal")
                continue

            expected = _mishkal_to_hebrew(mishkal_eng) or ""
            if expected and expected != mishkal_heb:
                errors.append(f"[{key}] {infl_key}: {mishkal_eng}→{mishkal_heb} (expected {expected})")

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Stats summary
|
||||
# ---------------------------------------------------------------------------
|
||||
|
|
@ -702,6 +808,11 @@ def print_stats(data: dict[str, Any]) -> None:
|
|||
with_guid = sum(1 for e in data.values() if e.get("vocab_legacy_guid"))
|
||||
in_confusable = sum(1 for e in data.values() if e.get("confusable_group"))
|
||||
with_shared_roots = sum(1 for e in data.values() if e.get("shared_roots"))
|
||||
with_mishkal = sum(
|
||||
1
|
||||
for e in data.values()
|
||||
if (e.get("noun_inflection") or {}).get("mishkal") or (e.get("adjective_inflection") or {}).get("mishkal")
|
||||
)
|
||||
|
||||
print()
|
||||
print("Stats Summary")
|
||||
|
|
@ -709,6 +820,7 @@ def print_stats(data: dict[str, Any]) -> None:
|
|||
print(f" Total entries: {total:>6}")
|
||||
print(f" With conjugation data: {with_conj:>6}")
|
||||
print(f" With noun_inflection: {with_noun_inf:>6}")
|
||||
print(f" With mishkal: {with_mishkal:>6}")
|
||||
print(f" With vetted examples: {with_vetted:>6}")
|
||||
print(f" With cloze examples: {with_cloze:>6}")
|
||||
print(f" With images: {with_image:>6}")
|
||||
|
|
@ -740,6 +852,9 @@ ALL_TESTS: dict[str, Any] = {
|
|||
"conjugation_form_guids": test_conjugation_form_guids,
|
||||
"conjugation_person_codes": test_conjugation_person_codes,
|
||||
"no_stripped_form_sentence_collisions": test_no_stripped_form_sentence_collisions,
|
||||
"no_shared_confusable_examples": test_no_shared_confusable_examples,
|
||||
"no_hebrew_in_meaning": test_no_hebrew_in_meaning,
|
||||
"mishkal_consistency": test_mishkal_consistency,
|
||||
}
|
||||
|
||||
|
||||
|
|
|
|||
198
sentence_difficulty.py
Normal file
198
sentence_difficulty.py
Normal file
|
|
@ -0,0 +1,198 @@
|
|||
"""Sentence difficulty scoring by context-word frequency.
|
||||
|
||||
Scores sentences by the median frequency rank of context words
|
||||
(excluding the cloze target). Lower score = easier sentence.
|
||||
Used by epub_examples.py to select the best cloze sentence.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from statistics import median
|
||||
|
||||
import helpers
|
||||
import nikkud_to_ktiv_male
|
||||
|
||||
DEFAULT_RANK = 50_000
|
||||
|
||||
# Hebrew prefix consonants for ktiv_male prefix stripping (tier 5)
|
||||
_KM_PREFIX_CHARS = set("בהוכלמשע")
|
||||
|
||||
# Punctuation to strip from tokens
|
||||
_PUNCT = set('.,!?;:"\'"״׳–—()[]{}')
|
||||
|
||||
# Maqaf (Hebrew hyphen) — splits tokens
|
||||
_MAQAF = "־"
|
||||
|
||||
|
||||
def build_nikkud_map(words: dict) -> dict[str, str]:
    """Collect every vocalised form in words.json with its ktiv_male spelling.

    Forms are gathered, per entry, in this order: headword, active conjugation
    forms, hufal/pual forms, infinitive, reference form, noun inflections
    (singular, plural, construct singular, construct plural — construct forms
    ending in a maqaf are also indexed without it), pronominal suffixes, and
    adjective inflections (ms/fs/mp/fp).

    Args:
        words: The full words.json dict keyed by unique_key.

    Returns:
        Dict mapping nikkud form to ktiv_male string. Pairs with an empty or
        missing side are skipped. On collisions the last form seen wins
        (acceptable for frequency lookup).
    """

    def _pairs(entry: dict):
        """Yield (nikkud, ktiv_male) candidates for one entry, in indexing order."""
        headword = entry.get("word") or {}
        yield headword.get("nikkud"), headword.get("ktiv_male")

        conjugation = entry.get("conjugation") or {}
        for table in ("active_forms", "hufal_pual_forms"):
            for row in conjugation.get(table) or []:
                cell = row.get("form") or {}
                yield cell.get("nikkud"), cell.get("ktiv_male")
        for single in ("infinitive", "reference_form"):
            cell = conjugation.get(single) or {}
            yield cell.get("nikkud"), cell.get("ktiv_male")

        noun = entry.get("noun_inflection") or {}
        for slot in ("singular", "plural", "construct_singular", "construct_plural"):
            cell = noun.get(slot) or {}
            vocalised, plain = cell.get("nikkud"), cell.get("ktiv_male")
            yield vocalised, plain
            # Construct forms are stored with a trailing maqaf; sentence tokens
            # are split on maqaf, so index the bare form too.
            if vocalised and vocalised.endswith("־"):
                yield vocalised[:-1], plain
        for cell in (noun.get("pronominal_suffixes") or {}).values():
            if isinstance(cell, dict):
                yield cell.get("nikkud"), cell.get("ktiv_male")

        adjective = entry.get("adjective_inflection") or {}
        for slot in ("ms", "fs", "mp", "fp"):
            cell = adjective.get(slot) or {}
            yield cell.get("nikkud"), cell.get("ktiv_male")

    return {
        vocalised: plain
        for entry in words.values()
        for vocalised, plain in _pairs(entry)
        if vocalised and plain
    }
|
||||
|
||||
|
||||
def _resolve_token_frequency(
    token: str,
    nikkud_map: dict[str, str],
    nikkud_index: dict,
    freq_data: dict[str, int],
) -> int:
    """Look up the frequency rank of one vocalised sentence token.

    Resolution falls through five tiers, returning at the first hit:
      1. The token itself is a key of ``nikkud_map`` and its ktiv_male
         spelling is in ``freq_data``.
      2. ``epub_examples.try_strip_prefix`` yields a remainder that is a key
         of ``nikkud_map`` with a ranked ktiv_male spelling.
      3. ``nikkud_to_ktiv_male.convert(token)`` is ranked.
      4. ``helpers.strip_nikkud(token)`` is ranked.
      5. The converted or stripped form, minus a one- or two-character prefix
         drawn from ``_KM_PREFIX_CHARS``, is ranked (the remaining stem must
         keep at least two characters).

    Args:
        token: Sentence token with nikkud, punctuation already removed.
        nikkud_map: nikkud -> ktiv_male mapping.
        nikkud_index: Index passed through to ``try_strip_prefix``.
        freq_data: ktiv_male -> frequency rank.

    Returns:
        The rank found in ``freq_data``, or ``DEFAULT_RANK`` when no tier matches.
    """
    # Tier 1: the token is a known form.
    known = nikkud_map.get(token)
    if known and known in freq_data:
        return freq_data[known]

    # Tier 2: peel a vocalised prefix and look the remainder up in the map.
    # Imported at call time rather than at module top.
    from epub_examples import try_strip_prefix

    for _unique_key, _match_type, remainder in try_strip_prefix(token, nikkud_index):
        mapped = nikkud_map.get(remainder)
        if mapped and mapped in freq_data:
            return freq_data[mapped]

    # Tier 3: rule-based nikkud -> ktiv_male conversion.
    converted = nikkud_to_ktiv_male.convert(token)
    if converted in freq_data:
        return freq_data[converted]

    # Tier 4: plain nikkud removal (only worth checking when it differs).
    stripped = helpers.strip_nikkud(token)
    if stripped != converted and stripped in freq_data:
        return freq_data[stripped]

    # Tier 5: drop 1 then 2 prefix consonants from each unvocalised candidate.
    stems = (
        form[cut:]
        for form in (converted, stripped)
        for cut in (1, 2)
        if len(form) > cut + 1 and all(ch in _KM_PREFIX_CHARS for ch in form[:cut])
    )
    return next((freq_data[stem] for stem in stems if stem in freq_data), DEFAULT_RANK)
|
||||
|
||||
|
||||
def score_sentence(
    text: str,
    target_start: int,
    target_end: int,
    nikkud_map: dict[str, str],
    nikkud_index: dict,
    freq_data: dict[str, int],
) -> int:
    """Rate a sentence by the median frequency rank of its context words.

    The sentence is split on whitespace and then on maqaf. Any token whose
    character span overlaps ``[target_start, target_end)`` is treated as the
    cloze target and left out. Remaining tokens have edge punctuation removed
    and are ignored when fewer than two characters remain.

    Args:
        text: The full sentence text (with nikkud).
        target_start: Character offset where the cloze target word starts.
        target_end: Character offset where the cloze target word ends.
        nikkud_map: nikkud -> ktiv_male mapping from build_nikkud_map().
        nikkud_index: Index handed to _resolve_token_frequency().
        freq_data: ktiv_male -> frequency rank.

    Returns:
        Median rank of the context tokens, truncated to int (lower = easier),
        or DEFAULT_RANK when no context token could be scored.
    """
    edge_chars = "".join(_PUNCT)
    ranks: list[int] = []

    cursor = 0
    for chunk in text.split():
        # Locate the chunk in the original text so spans are true offsets.
        chunk_start = text.index(chunk, cursor)
        cursor = chunk_start + len(chunk)

        piece_start = chunk_start
        for piece in chunk.split(_MAQAF):
            piece_end = piece_start + len(piece)
            span_start = piece_start
            piece_start = piece_end + 1  # step over the maqaf separator

            if not piece:
                continue
            # Overlap test against the cloze target's span.
            if span_start < target_end and piece_end > target_start:
                continue

            word = piece.strip(edge_chars)
            if len(word) < 2:
                continue
            ranks.append(_resolve_token_frequency(word, nikkud_map, nikkud_index, freq_data))

    return int(median(ranks)) if ranks else DEFAULT_RANK
|
||||
246
tests/test_apkg_builder.py
Normal file
246
tests/test_apkg_builder.py
Normal file
|
|
@ -0,0 +1,246 @@
|
|||
"""Unit tests for apkg_builder — Sprint 15 learnings.
|
||||
|
||||
Tests cover: cloze prefix preservation, Hebrew spoiler stripping from English
|
||||
meanings, PoS exact matching, gender field population, and mishkal data integrity.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
# Ensure project root is on path
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
|
||||
from apkg_builder import _categorize_pos, _cloze_prefix_len
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cloze prefix preservation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestClozePrefix:
    """_cloze_prefix_len reports how much of a token is Hebrew prefix before the word."""

    def test_single_prefix_bet(self):
        # Token is the word with a single bet prefix attached.
        prefix_len = _cloze_prefix_len("בַּתּוֹר", "תּוֹר")
        assert prefix_len > 0

    def test_single_prefix_lamed(self):
        # Token is the word with a single lamed prefix attached.
        prefix_len = _cloze_prefix_len("לַמֶּלֶךְ", "מֶּלֶךְ")
        assert prefix_len > 0

    def test_two_consonant_prefix(self):
        # Two stacked prefix letters (shin + bet) before the word.
        token, word = "שֶׁבַּתּוֹר", "תּוֹר"
        cut = _cloze_prefix_len(token, word)
        assert cut > 0
        assert token[cut:].startswith(word)

    def test_no_prefix_direct_match(self):
        # Token and word are identical, so there is nothing to cut.
        assert _cloze_prefix_len("תּוֹר", "תּוֹר") == 0

    def test_empty_inputs(self):
        # Any empty argument yields a zero-length prefix.
        for token, word in (("", "תּוֹר"), ("בַּתּוֹר", ""), ("", "")):
            assert _cloze_prefix_len(token, word) == 0

    def test_non_prefix_letter_returns_zero(self):
        # The leading tav is not a prefix letter, so no prefix is reported.
        assert _cloze_prefix_len("תַּתּוֹר", "תּוֹר") == 0

    def test_prefix_preserves_nikkud(self):
        # The reported prefix holds exactly one consonant (bet); any other
        # characters in it are non-letter marks.
        token, word = "בַּתּוֹר", "תּוֹר"
        leading = token[: _cloze_prefix_len(token, word)]
        consonants = [ch for ch in leading if "\u05d0" <= ch <= "\u05ea"]
        assert consonants == ["ב"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# PoS exact matching (no substring collisions)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestCategorizePos:
    """_categorize_pos matches part-of-speech labels exactly, not by substring."""

    def test_noun_exact(self):
        category = _categorize_pos("Noun")
        assert category == "Noun"

    def test_pronoun_is_other(self):
        # "Pronoun" contains "noun" but must not be categorised as a noun.
        category = _categorize_pos("Pronoun")
        assert category == "Other"

    def test_verb_exact(self):
        category = _categorize_pos("Verb")
        assert category == "Verb"

    def test_noun_with_dash(self):
        # A gender suffix after the dash keeps the base category.
        category = _categorize_pos("Noun – masculine")
        assert category == "Noun"

    def test_adjective(self):
        category = _categorize_pos("Adjective")
        assert category == "Adjective"

    def test_conjunction_is_other(self):
        category = _categorize_pos("Conjunction")
        assert category == "Other"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Hebrew spoiler stripping from English meanings
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestHebrewSpoilerStripping:
    """English meanings must not contain Hebrew text (spoils the card)."""

    # Stripping pattern: one Hebrew-block character followed by any run of
    # Hebrew-block characters, whitespace or hyphens. Kept byte-identical to
    # the pattern this test mirrors; _strip_hebrew uses this single compiled
    # copy so the two cannot drift apart.
    HEBREW_STRIP_RE = re.compile(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*")

    # Detector for leftovers: any Hebrew letter (alef through tav).
    HEBREW_LETTER_RE = re.compile(r"[\u05D0-\u05EA]")

    @classmethod
    def _strip_hebrew(cls, meaning: str) -> str:
        """Replicate the meaning cleaning pipeline from build_vocab_deck.

        Removes Hebrew runs, tidies the punctuation they leave behind,
        collapses repeated whitespace and trims separator characters.
        """
        meaning = cls.HEBREW_STRIP_RE.sub("", meaning)
        meaning = re.sub(r"[;:]\s*—", " —", meaning)
        meaning = re.sub(r";\s*:", ";", meaning)
        return re.sub(r"\s{2,}", " ", meaning).strip(", ;:")

    def test_pure_english_unchanged(self):
        assert self._strip_hebrew("to eat, to consume") == "to eat, to consume"

    def test_hebrew_word_removed(self):
        result = self._strip_hebrew("to eat; אכל")
        assert "אכל" not in result
        # The dangling separator must be trimmed along with the Hebrew.
        assert result == "to eat"

    def test_hebrew_with_nikkud_removed(self):
        result = self._strip_hebrew("tall; גָּבוֹהַּ")
        assert "גָּבוֹהַּ" not in result
        assert "tall" in result
        assert result == "tall"

    def test_no_residual_hebrew_in_real_data(self):
        """Scan actual words.json — no meaning should contain Hebrew after stripping."""
        words_path = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")

        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)

        spoilers = []
        for key, entry in words.items():
            cleaned = self._strip_hebrew(entry.get("meaning") or "")
            if self.HEBREW_LETTER_RE.search(cleaned):
                spoilers.append(f"{key}: {cleaned!r}")

        assert not spoilers, f"Hebrew found in {len(spoilers)} meanings after stripping: {spoilers[:5]}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Gender field for nouns (words.json data integrity)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestGenderDataIntegrity:
    """Noun entries that carry noun_inflection data should also carry a gender."""

    @pytest.fixture()
    def words(self):
        """Load data/words.json, skipping the test when the file is absent."""
        words_path = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")
        with open(words_path, encoding="utf-8") as handle:
            return json.load(handle)

    def test_nouns_have_gender(self, words):
        """Fewer than 7% of inflected nouns may lack a recognised gender."""
        accepted = ("masculine", "feminine", "masculine and feminine")

        # Only entries whose pos starts with "Noun" and that have inflection data.
        inflected_nouns = {
            key: entry["noun_inflection"]
            for key, entry in words.items()
            if (entry.get("pos") or "").startswith("Noun") and entry.get("noun_inflection")
        }

        missing = []
        for key, inflection in inflected_nouns.items():
            gender = inflection.get("gender") or ""
            if gender not in accepted:
                missing.append(f"{key}: gender={gender!r}")

        # Allow up to 7% missing (loan words, compound words, etc.)
        noun_count = len(inflected_nouns)
        if noun_count > 0:
            pct_missing = len(missing) / noun_count
            assert pct_missing < 0.07, f"{len(missing)}/{noun_count} nouns missing gender: {missing[:10]}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Mishkal data integrity
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestMishkalIntegrity:
    """Consistency checks for the mishkal fields stored in words.json."""

    @pytest.fixture()
    def words(self):
        """Load data/words.json, skipping the test when the file is absent."""
        words_path = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")
        with open(words_path, encoding="utf-8") as handle:
            return json.load(handle)

    @staticmethod
    def _mishkal_pairs(words):
        """Yield (key, mishkal, mishkal_hebrew) for every non-empty inflection block.

        Missing or null values are normalised to empty strings.
        """
        for key, entry in words.items():
            for infl_key in ("noun_inflection", "adjective_inflection"):
                infl = entry.get(infl_key)
                if infl:
                    yield key, infl.get("mishkal") or "", infl.get("mishkal_hebrew") or ""

    def test_mishkal_hebrew_matches_english(self, words):
        """When both names are set, the Hebrew one must equal the converter's output."""
        from pealim_detail_scrape import _mishkal_to_hebrew

        mismatches = []
        for key, mishkal_eng, mishkal_heb in self._mishkal_pairs(words):
            if not (mishkal_eng and mishkal_heb):
                continue
            expected = _mishkal_to_hebrew(mishkal_eng) or ""
            # An empty conversion result means there is nothing to compare against.
            if expected and expected != mishkal_heb:
                mismatches.append(f"{key}: {mishkal_eng}→{mishkal_heb} (expected {expected})")

        assert not mismatches, f"{len(mismatches)} mishkal mismatches: {mismatches[:10]}"

    def test_mishkal_hebrew_is_hebrew(self, words):
        """A non-empty mishkal_hebrew must contain at least one Hebrew letter."""
        hebrew_re = re.compile(r"[\u05D0-\u05EA]")
        bad = [
            f"{key}: mishkal_hebrew={mishkal_heb!r}"
            for key, _mishkal_eng, mishkal_heb in self._mishkal_pairs(words)
            if mishkal_heb and not hebrew_re.search(mishkal_heb)
        ]

        assert not bad, f"{len(bad)} non-Hebrew mishkal_hebrew values: {bad[:10]}"

    def test_no_orphaned_mishkal(self, words):
        """A mishkal_hebrew value must always be accompanied by an English mishkal."""
        orphans = [
            f"{key}: has mishkal_hebrew but no mishkal"
            for key, mishkal_eng, mishkal_heb in self._mishkal_pairs(words)
            if mishkal_heb and not mishkal_eng
        ]

        assert not orphans, f"{len(orphans)} orphaned mishkal_hebrew: {orphans[:10]}"
|
||||
|
|
@ -484,3 +484,41 @@ class TestScrapePrepositionDetail:
|
|||
def test_empty_on_no_table(self) -> None:
|
||||
result = _scrape_preposition_detail("missing", "<html><body></body></html>", "<html><body></body></html>")
|
||||
assert result == {}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests for _parse_noun_gender_mishkal mishkal extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
from bs4 import BeautifulSoup # noqa: E402
|
||||
|
||||
from pealim_detail_scrape import _parse_noun_gender_mishkal # noqa: E402
|
||||
|
||||
|
||||
class TestNounGenderMishkal:
    """_parse_noun_gender_mishkal extracts (gender, mishkal) from a part-of-speech paragraph."""

    @staticmethod
    def _parse(html):
        """Parse an HTML snippet and return the function's (gender, mishkal) result."""
        return _parse_noun_gender_mishkal(BeautifulSoup(html, "html.parser"))

    def test_noun_with_mishkal(self):
        # Pattern link present, gender after the comma.
        parsed = self._parse(
            '<p>Noun – <a href="/dict/?pos=noun&nm=qetel"><i>ketel</i> pattern</a>, masculine</p>'
        )
        gender, mishkal = parsed
        assert gender == "masculine"
        assert mishkal == "ketel"

    def test_noun_without_mishkal(self):
        # No pattern link: mishkal comes back as an empty string.
        gender, mishkal = self._parse("<p>Noun – masculine</p>")
        assert gender == "masculine"
        assert mishkal == ""

    def test_adjective_mishkal(self):
        # Only the pattern is checked for adjectives.
        _, mishkal = self._parse(
            '<p>Adjective – <a href="/dict/?pos=adjective&am=qatul"><i>katul</i> pattern</a></p>'
        )
        assert mishkal == "katul"

    def test_feminine_noun(self):
        parsed = self._parse(
            '<p>Noun – <a href="/dict/?pos=noun&nm=qetel"><i>ketel</i> pattern</a>, feminine</p>'
        )
        gender, mishkal = parsed
        assert gender == "feminine"
        assert mishkal == "ketel"
|
||||
|
|
|
|||
127
tests/test_epub_examples.py
Normal file
127
tests/test_epub_examples.py
Normal file
|
|
@ -0,0 +1,127 @@
|
|||
"""Tests for epub_examples deduplication of confusable group examples."""
|
||||
|
||||
from epub_examples import _deduplicate_confusable_examples
|
||||
|
||||
|
||||
def _make_entry(meaning, confusable_group, vetted_texts=None, frequency_rank=None):
|
||||
"""Build a minimal words.json entry for testing."""
|
||||
entry = {
|
||||
"meaning": meaning,
|
||||
"confusable_group": confusable_group,
|
||||
}
|
||||
if vetted_texts is not None:
|
||||
entry["examples"] = {
|
||||
"vetted": [{"text": t, "source": "test", "match_method": "direct"} for t in vetted_texts],
|
||||
}
|
||||
if frequency_rank is not None:
|
||||
entry["frequency_rank"] = frequency_rank
|
||||
return entry
|
||||
|
||||
|
||||
class TestDeduplicateConfusableExamples:
    """Behaviour of _deduplicate_confusable_examples() on confusable groups."""

    def test_shared_examples_kept_on_higher_frequency(self):
        """Identical example sets stay with the member whose frequency_rank
        is lowest (i.e. the more common word)."""
        group = ["key_a", "key_b"]
        lexicon = {
            "key_a": _make_entry("brother", group, ["sent1", "sent2"], frequency_rank=500),
            "key_b": _make_entry("fireplace", group, ["sent1", "sent2"], frequency_rank=8000),
        }
        removed = _deduplicate_confusable_examples(lexicon)
        assert removed == 1
        assert len(lexicon["key_a"]["examples"]["vetted"]) == 2
        assert lexicon["key_b"]["examples"]["vetted"] == []

    def test_no_action_when_examples_differ(self):
        """Members with different example sets are both left alone."""
        group = ["key_a", "key_b"]
        lexicon = {
            "key_a": _make_entry("meaning1", group, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("meaning2", group, ["sent2"], frequency_rank=200),
        }
        removed = _deduplicate_confusable_examples(lexicon)
        assert removed == 0
        assert len(lexicon["key_a"]["examples"]["vetted"]) == 1
        assert len(lexicon["key_b"]["examples"]["vetted"]) == 1

    def test_no_action_when_one_has_no_examples(self):
        """A group where only one member has examples needs no deduplication."""
        group = ["key_a", "key_b"]
        lexicon = {
            "key_a": _make_entry("meaning1", group, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("meaning2", group, frequency_rank=200),
        }
        removed = _deduplicate_confusable_examples(lexicon)
        assert removed == 0

    def test_no_frequency_uses_alphabetical_tiebreak(self):
        """Without any frequency data the alphabetically first key keeps the examples."""
        group = ["alpha_key", "beta_key"]
        lexicon = {
            "alpha_key": _make_entry("meaning1", group, ["sent1"]),
            "beta_key": _make_entry("meaning2", group, ["sent1"]),
        }
        removed = _deduplicate_confusable_examples(lexicon)
        assert removed == 1
        assert len(lexicon["alpha_key"]["examples"]["vetted"]) == 1
        assert lexicon["beta_key"]["examples"]["vetted"] == []

    def test_three_way_group(self):
        """In a three-member group the most common word wins and both others are cleared."""
        group = ["key_a", "key_b", "key_c"]
        lexicon = {
            "key_a": _make_entry("yes", group, ["sent1", "sent2"], frequency_rank=50),
            "key_b": _make_entry("honest", group, ["sent1", "sent2"], frequency_rank=3000),
            "key_c": _make_entry("pedestal", group, ["sent1", "sent2"], frequency_rank=15000),
        }
        removed = _deduplicate_confusable_examples(lexicon)
        assert removed == 2
        assert len(lexicon["key_a"]["examples"]["vetted"]) == 2
        assert lexicon["key_b"]["examples"]["vetted"] == []
        assert lexicon["key_c"]["examples"]["vetted"] == []

    def test_cloze_removed_from_losers(self):
        """A losing entry also loses its cloze data."""
        group = ["key_a", "key_b"]
        lexicon = {
            "key_a": _make_entry("common", group, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("rare", group, ["sent1"], frequency_rank=9000),
        }
        # Give the entry that is expected to lose (key_b) a cloze to be removed.
        lexicon["key_b"]["examples"]["cloze"] = {"text": "sent1", "cloze_guid": "abc"}
        removed = _deduplicate_confusable_examples(lexicon)
        assert removed == 1
        assert "cloze" not in lexicon["key_b"]["examples"]

    def test_no_confusable_groups_returns_zero(self):
        """Entries with no confusable_group are never touched."""
        lexicon = {
            "key_a": {"meaning": "word1", "examples": {"vetted": [{"text": "s1"}]}},
            "key_b": {"meaning": "word2", "examples": {"vetted": [{"text": "s1"}]}},
        }
        removed = _deduplicate_confusable_examples(lexicon)
        assert removed == 0

    def test_mixed_frequency_and_none(self):
        """A member with a frequency rank beats one that has none."""
        group = ["key_a", "key_b"]
        lexicon = {
            "key_a": _make_entry("has_freq", group, ["sent1"], frequency_rank=5000),
            "key_b": _make_entry("no_freq", group, ["sent1"]),
        }
        removed = _deduplicate_confusable_examples(lexicon)
        assert removed == 1
        assert len(lexicon["key_a"]["examples"]["vetted"]) == 1
        assert lexicon["key_b"]["examples"]["vetted"] == []

    def test_partial_overlap_not_deduplicated(self):
        """Overlapping but non-identical sentence sets do not trigger deduplication."""
        group = ["key_a", "key_b"]
        lexicon = {
            "key_a": _make_entry("m1", group, ["sent1", "sent2"], frequency_rank=100),
            "key_b": _make_entry("m2", group, ["sent1", "sent3"], frequency_rank=200),
        }
        removed = _deduplicate_confusable_examples(lexicon)
        assert removed == 0
|
||||
83
tests/test_scoring_integration.py
Normal file
83
tests/test_scoring_integration.py
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
"""Integration tests for frequency-based sentence scoring in update_words_json."""
|
||||
|
||||
|
||||
def _make_sentence(text, source="test", match_method="direct", word_count=None, char_offset=0, char_end=3):
|
||||
"""Build a minimal sentence dict as match_sentences would produce."""
|
||||
if word_count is None:
|
||||
word_count = len(text.split())
|
||||
return {
|
||||
"text": text,
|
||||
"source": source,
|
||||
"match_method": match_method,
|
||||
"word_count": word_count,
|
||||
"char_offset": char_offset,
|
||||
"char_end": char_end,
|
||||
}
|
||||
|
||||
|
||||
class TestScoringIntegration:
    """Tests that update_words_json uses frequency scoring."""

    # The cloze target used by every test in this class.
    TARGET = "טוֹב"

    @classmethod
    def _words(cls):
        """Return a fresh one-entry words dict for the target word."""
        return {
            cls.TARGET: {
                "word": {"nikkud": cls.TARGET, "ktiv_male": "טוב"},
                "examples": {},
            }
        }

    @classmethod
    def _sentence(cls, text):
        """Build a sentence dict whose offsets really delimit the target in ``text``.

        Offsets are derived from the text because nikkud marks are separate
        code points: hand-counted consonant positions do not line up with the
        vocalised string and can mark the wrong word as the cloze target.
        """
        start = text.index(cls.TARGET)
        return _make_sentence(text, char_offset=start, char_end=start + len(cls.TARGET))

    def test_cloze_has_difficulty_score(self):
        """Cloze dict includes difficulty_score field."""
        from epub_examples import update_words_json

        words = self._words()
        matches = {self.TARGET: [self._sentence("הוּא אָדָם טוֹב מְאוֹד")]}
        update_words_json(words, matches, confusable_keys=set())

        cloze = words[self.TARGET]["examples"].get("cloze")
        assert cloze is not None
        assert "difficulty_score" in cloze
        assert isinstance(cloze["difficulty_score"], int)

    def test_vetted_sorted_by_difficulty(self):
        """All matched sentences are retained in the vetted list.

        NOTE(review): only retention is asserted here; the ordering of the
        vetted list is not checked by this test.
        """
        from epub_examples import update_words_json

        words = self._words()
        matches = {
            self.TARGET: [
                self._sentence("הוּא טוֹב"),
                self._sentence("הַתַּפְנִיט טוֹב בְּיוֹתֵר"),
                self._sentence("אֲנִי טוֹב הַיּוֹם"),
            ]
        }
        update_words_json(words, matches, confusable_keys=set())

        vetted = words[self.TARGET]["examples"]["vetted"]
        assert len(vetted) == 3

    def test_easiest_sentence_becomes_cloze(self):
        """The sentence with the lowest difficulty score becomes the cloze."""
        from epub_examples import update_words_json

        words = self._words()
        easy_text = "הוּא טוֹב מְאוֹד"
        hard_text = "הַפַּרְנָסִימוֹן טוֹב לְהַפְלִיא"
        # Hard sentence is listed first so input order alone cannot make the test pass.
        matches = {self.TARGET: [self._sentence(hard_text), self._sentence(easy_text)]}
        update_words_json(words, matches, confusable_keys=set())

        cloze = words[self.TARGET]["examples"]["cloze"]
        assert cloze["text"] == easy_text
|
||||
207
tests/test_sentence_difficulty.py
Normal file
207
tests/test_sentence_difficulty.py
Normal file
|
|
@ -0,0 +1,207 @@
|
|||
"""Tests for sentence difficulty scoring."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
import frequency_lookup
|
||||
from sentence_difficulty import DEFAULT_RANK, _resolve_token_frequency, build_nikkud_map, score_sentence
|
||||
|
||||
|
||||
class TestBuildNikkudMap:
    """build_nikkud_map indexes every vocalised form found in a words dict."""

    def test_maps_direct_headwords(self):
        lexicon = {"אָב": {"word": {"nikkud": "אָב", "ktiv_male": "אב"}}}
        mapping = build_nikkud_map(lexicon)
        assert mapping["אָב"] == "אב"

    def test_maps_conjugation_forms(self):
        # Active forms and the infinitive must both be indexed.
        conjugation = {
            "active_forms": [
                {
                    "person": "1s",
                    "tense": "עָבָר",
                    "form": {"nikkud": "שָׁמַרְתִּי", "ktiv_male": "שמרתי"},
                },
            ],
            "infinitive": {"nikkud": "לִשְׁמֹר", "ktiv_male": "לשמור"},
            "reference_form": {"nikkud": "שָׁמַר", "ktiv_male": "שמר"},
        }
        lexicon = {
            "שָׁמַר": {
                "word": {"nikkud": "שָׁמַר", "ktiv_male": "שמר"},
                "conjugation": conjugation,
            }
        }
        mapping = build_nikkud_map(lexicon)
        assert mapping["שָׁמַרְתִּי"] == "שמרתי"
        assert mapping["לִשְׁמֹר"] == "לשמור"

    def test_maps_noun_inflections(self):
        # Plural and pronominal-suffix forms must both be indexed.
        noun_inflection = {
            "singular": {"nikkud": "אָב", "ktiv_male": "אב"},
            "plural": {"nikkud": "אָבוֹת", "ktiv_male": "אבות"},
            "pronominal_suffixes": {"1s": {"nikkud": "אָבִי", "ktiv_male": "אבי"}},
        }
        lexicon = {
            "אָב": {
                "word": {"nikkud": "אָב", "ktiv_male": "אב"},
                "noun_inflection": noun_inflection,
            }
        }
        mapping = build_nikkud_map(lexicon)
        assert mapping["אָבוֹת"] == "אבות"
        assert mapping["אָבִי"] == "אבי"

    def test_maps_adjective_inflections(self):
        adjective_inflection = {
            "ms": {"nikkud": "גָּדוֹל", "ktiv_male": "גדול"},
            "fs": {"nikkud": "גְּדוֹלָה", "ktiv_male": "גדולה"},
            "mp": {"nikkud": "גְּדוֹלִים", "ktiv_male": "גדולים"},
            "fp": {"nikkud": "גְּדוֹלוֹת", "ktiv_male": "גדולות"},
        }
        lexicon = {
            "גָּדוֹל": {
                "word": {"nikkud": "גָּדוֹל", "ktiv_male": "גדול"},
                "adjective_inflection": adjective_inflection,
            }
        }
        mapping = build_nikkud_map(lexicon)
        assert mapping["גְּדוֹלָה"] == "גדולה"
        assert mapping["גְּדוֹלִים"] == "גדולים"

    def test_construct_forms_strip_maqaf(self):
        # A construct form stored with a trailing maqaf is indexed both with
        # and without it.
        lexicon = {
            "בֵּית": {
                "word": {"nikkud": "בֵּית", "ktiv_male": "בית"},
                "noun_inflection": {
                    "construct_singular": {"nikkud": "בֵּית־", "ktiv_male": "בית"},
                },
            }
        }
        mapping = build_nikkud_map(lexicon)
        assert "בֵּית־" in mapping
        assert "בֵּית" in mapping

    def test_handles_missing_fields(self):
        # Explicit None sections must not break indexing of the headword.
        lexicon = {
            "test": {
                "word": {"nikkud": "טֶסְט", "ktiv_male": "טסט"},
                "conjugation": None,
                "noun_inflection": None,
                "adjective_inflection": None,
            }
        }
        mapping = build_nikkud_map(lexicon)
        assert mapping["טֶסְט"] == "טסט"

    def test_real_words_json_coverage(self):
        # Sanity floor on the size of the map built from the real data file.
        words_path = Path(__file__).parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")
        with open(words_path, encoding="utf-8") as handle:
            lexicon = json.load(handle)
        mapping = build_nikkud_map(lexicon)
        assert len(mapping) > 90_000
|
||||
|
||||
|
||||
class TestResolveTokenFrequency:
    """Tests for ``_resolve_token_frequency`` against the real word and frequency data."""

    @pytest.fixture(scope="class")
    def freq_setup(self):
        """Return ``(nikkud_map, nikkud_index, freq_data)`` built from ``data/words.json``.

        Class-scoped so the data file is parsed and the indexes are built once
        for the whole class instead of once per test; the tests only read the
        returned structures.  Skips when ``words.json`` is not present.
        """
        words_path = Path(__file__).parent.parent / "data" / "words.json"
        # Check for the data file first so a skip does not pay for loading
        # the frequency data.
        if not words_path.exists():
            pytest.skip("words.json not available")

        frequency_lookup.load()
        freq_data = frequency_lookup.get_freq_data()
        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)

        from epub_examples import _build_nikkud_index

        nikkud_map = build_nikkud_map(words)
        nikkud_index = _build_nikkud_index(words)
        return nikkud_map, nikkud_index, freq_data

    def test_tier1_known_mapping(self, freq_setup):
        """A token present in the nikkud map resolves to a rank below the fallback."""
        nikkud_map, nikkud_index, freq_data = freq_setup
        rank = _resolve_token_frequency("אָב", nikkud_map, nikkud_index, freq_data)
        assert rank is not None
        assert rank < DEFAULT_RANK

    def test_tier3_academy_converter(self, freq_setup):
        """A very common word resolves to a rank under 1000."""
        nikkud_map, nikkud_index, freq_data = freq_setup
        rank = _resolve_token_frequency("שָׁלוֹם", nikkud_map, nikkud_index, freq_data)
        assert rank is not None
        assert rank < 1000

    def test_unknown_token_returns_default(self, freq_setup):
        """A token that cannot be resolved gets the fallback rank."""
        nikkud_map, nikkud_index, freq_data = freq_setup
        rank = _resolve_token_frequency("קְסַנְתּוֹפּוּלוֹס", nikkud_map, nikkud_index, freq_data)
        # Compare against the imported constant rather than a literal so the
        # test follows the module's fallback value.
        assert rank == DEFAULT_RANK

    def test_tier5_ktiv_male_prefix_strip(self, freq_setup):
        """The frequency data contains the unprefixed ktiv male spelling."""
        # NOTE(review): this only checks the precondition for the prefix-strip
        # tier; it never calls _resolve_token_frequency.  A prefixed token
        # should be resolved here once the expected result is confirmed.
        _, _, freq_data = freq_setup
        assert freq_data.get("שלום") is not None
|
||||
|
||||
|
||||
class TestScoreSentence:
    """Tests for ``score_sentence`` against the real word and frequency data."""

    @pytest.fixture(scope="class")
    def scoring_setup(self):
        """Return ``(nikkud_map, nikkud_index, freq_data)`` built from ``data/words.json``.

        Class-scoped so the data file is parsed and the indexes are built once
        for the whole class instead of once per test; the tests only read the
        returned structures.  Skips when ``words.json`` is not present.
        """
        words_path = Path(__file__).parent.parent / "data" / "words.json"
        # Check for the data file first so a skip does not pay for loading
        # the frequency data.
        if not words_path.exists():
            pytest.skip("words.json not available")

        frequency_lookup.load()
        freq_data = frequency_lookup.get_freq_data()
        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)

        from epub_examples import _build_nikkud_index

        nikkud_map = build_nikkud_map(words)
        nikkud_index = _build_nikkud_index(words)
        return nikkud_map, nikkud_index, freq_data

    @staticmethod
    def _span(text, word):
        """Return ``(start, end)`` offsets of the first occurrence of *word* in *text*."""
        start = text.index(word)
        return start, start + len(word)

    def test_returns_integer(self, scoring_setup):
        """The score of an ordinary sentence is an ``int``."""
        nmap, nidx, freq = scoring_setup
        text = "הוּא הָלַךְ הַבַּיְתָה"
        start, end = self._span(text, "הָלַךְ")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_easy_sentence_scores_lower(self, scoring_setup):
        """A sentence with common context words scores below one with rare words."""
        nmap, nidx, freq = scoring_setup
        easy = "הוּא אָמַר שָׁלוֹם"
        easy_start, easy_end = self._span(easy, "אָמַר")
        hard = "הַפַּרְדֵּס נִשְׁתַּטֵּחַ בַּדַּהֲרָה"
        hard_start, hard_end = self._span(hard, "נִשְׁתַּטֵּחַ")
        easy_score = score_sentence(easy, easy_start, easy_end, nmap, nidx, freq)
        hard_score = score_sentence(hard, hard_start, hard_end, nmap, nidx, freq)
        assert easy_score < hard_score

    def test_single_context_token(self, scoring_setup):
        """A sentence with one word besides the target still yields an ``int``."""
        nmap, nidx, freq = scoring_setup
        text = "הוּא טוֹב"
        # The target is the first word, so the span starts at offset 0.
        start, end = self._span(text, "הוּא")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_handles_punctuation(self, scoring_setup):
        """Quotes and punctuation around the words do not break scoring."""
        nmap, nidx, freq = scoring_setup
        text = '"הוּא טוֹב!"'
        start, end = self._span(text, "טוֹב")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_splits_on_maqaf(self, scoring_setup):
        """A maqaf-joined compound in the context does not break scoring."""
        nmap, nidx, freq = scoring_setup
        text = "בֵּית־סֵפֶר גָּדוֹל"
        start, end = self._span(text, "גָּדוֹל")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_no_context_tokens_returns_default(self, scoring_setup):
        """With no usable context tokens the score is ``DEFAULT_RANK``."""
        nmap, nidx, freq = scoring_setup
        text = "א ב"
        score = score_sentence(text, 0, 1, nmap, nidx, freq)
        assert score == DEFAULT_RANK
|
||||
Loading…
Reference in a new issue