- Full pealim.com rescrape: 9,120 words (15 new), all with audio URLs - Plurals deck: 2:1 regular:irregular ratio (649 notes), RTL arrows, 1.6x hint text - Conjugation deck: blue infinitive on front, plain meaning on back, nikkud labels - Confusables deck: larger prompt text (32px), audio only when all words have it - Validator: non-audio variants no longer false-fail on audio check - 14 new audio files downloaded Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1867 lines
68 KiB
Python
1867 lines
68 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Build Anki .apkg files for both the vocabulary deck and the conjugation deck.
|
||
Uses genanki for reliable, stable deck generation.
|
||
|
||
Deck IDs are hardcoded integers — same ID on re-import updates the existing deck
|
||
in Anki rather than creating a duplicate.
|
||
"""
|
||
|
||
import json
|
||
import logging
|
||
import random
|
||
import re
|
||
import unicodedata
|
||
from pathlib import Path
|
||
|
||
import genanki
|
||
import pandas as pd
|
||
|
||
from helpers import strip_nikkud as _strip_nikkud
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
# Stable deck/model IDs — do not change these
|
||
VOCAB_DECK_ID = 1_234_567_890
|
||
VOCAB_MODEL_ID = 1_701_222_017_968 # matches Nevo's original Anki model
|
||
CONJ_DECK_ID = 1_234_567_892
|
||
CONJ_MODEL_ID = 1_234_567_893
|
||
CONF_DECK_ID = 1_234_567_894
|
||
CONF_MODEL_ID = 1_234_567_895
|
||
PLURAL_DECK_ID = 1_234_567_896
|
||
PLURAL_MODEL_ID = 1_234_567_897
|
||
|
||
# Subdeck IDs for combined "Hebrew::*" package — MUST differ from standalone IDs
|
||
COMPLETE_VOCAB_DECK_ID = 1_234_567_900
|
||
COMPLETE_CONJ_DECK_ID = 1_234_567_901
|
||
COMPLETE_CONF_DECK_ID = 1_234_567_902
|
||
COMPLETE_PLURAL_DECK_ID = 1_234_567_903
|
||
|
||
# Release version tag added to all notes so users can identify which release
|
||
# their cards come from (visible in Anki's Browse view and card info).
|
||
RELEASE_TAG = "v0.14"
|
||
|
||
# Regex for extracting emoji and Hebrew prepositions from meaning strings
|
||
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
|
||
HBPAREN_RE = re.compile(r"\(([\u05b0-\u05ea\u05f0-\u05f4]+)\)")
|
||
|
||
# Directory holding all input data files, resolved relative to this module.
DATA_DIR = Path(__file__).parent / "data"

# Legacy GUID map from Nevo's original Anki deck (imported ~Jul 2025).
# Preserves study progress on reimport by reusing the same note GUIDs.
_LEGACY_GUID_PATH = DATA_DIR / "legacy_guid_map.json"
_LEGACY_GUIDS: dict[str, str] = {}
if _LEGACY_GUID_PATH.exists():
    # Decode explicitly as UTF-8: the map is looked up by Hebrew words, and
    # the platform default encoding is locale-dependent (e.g. cp1252 on
    # Windows), which would fail or silently mangle the keys.
    # A malformed file is deliberately allowed to raise here rather than
    # fall back to an empty map and generate fresh GUIDs unnoticed.
    with open(_LEGACY_GUID_PATH, encoding="utf-8") as _f:
        _LEGACY_GUIDS = json.load(_f)
|
||
|
||
|
||
def _vocab_guid(word: str, meaning: str = "") -> str:
    """Resolve the note GUID for a vocabulary entry.

    Candidate keys are tried against the legacy GUID map in this order:

    1. ``<NFC word>||<meaning>`` where the meaning part is lowercased,
       stripped and cut to 30 characters (only when *meaning* is non-empty;
       this is what tells homographs apart).
    2. The NFC-normalized word on its own.

    If neither key is present, a deterministic GUID is derived with
    ``genanki.guid_for`` from ``(word, meaning)``, or from ``word`` alone
    when *meaning* is empty.
    """
    normalized = unicodedata.normalize("NFC", word)

    candidates = []
    if meaning:
        candidates.append(f"{normalized}||{meaning.lower().strip()[:30]}")
    candidates.append(normalized)

    for candidate in candidates:
        if candidate in _LEGACY_GUIDS:
            return _LEGACY_GUIDS[candidate]

    # No legacy entry: hash the raw (un-normalized) inputs, as before.
    if not meaning:
        return genanki.guid_for(word)
    return genanki.guid_for(word, meaning)
|
||
|
||
|
||
AUDIO_DIR = DATA_DIR / "audio"
|
||
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
|
||
OUTPUT_DIR = Path(__file__).parent / "output"
|
||
|
||
VOCAB_APKG = OUTPUT_DIR / "hebrew_vocabulary.apkg"
|
||
VOCAB_APKG_AUDIO = OUTPUT_DIR / "hebrew_vocabulary_audio.apkg"
|
||
VOCAB_APKG_IMAGES = OUTPUT_DIR / "hebrew_vocabulary_images.apkg"
|
||
VOCAB_APKG_AUDIO_IMAGES = OUTPUT_DIR / "hebrew_vocabulary_audio_images.apkg"
|
||
CONJ_APKG = OUTPUT_DIR / "hebrew_conjugations.apkg"
|
||
CONJ_APKG_AUDIO = OUTPUT_DIR / "hebrew_conjugations_audio.apkg"
|
||
CONF_APKG = OUTPUT_DIR / "hebrew_confusables.apkg"
|
||
CONF_APKG_AUDIO = OUTPUT_DIR / "hebrew_confusables_audio.apkg"
|
||
PLURAL_APKG = OUTPUT_DIR / "hebrew_plurals.apkg"
|
||
PLURAL_APKG_AUDIO = OUTPUT_DIR / "hebrew_plurals_audio.apkg"
|
||
COMPLETE_APKG = OUTPUT_DIR / "hebrew_complete.apkg"
|
||
COMPLETE_APKG_AUDIO = OUTPUT_DIR / "hebrew_complete_audio.apkg"
|
||
|
||
# ──────────────────────────────────────────────────────────────────────────────
|
||
# Binyan → Hebrew label mapping (for conjugation card display)
|
||
# ──────────────────────────────────────────────────────────────────────────────
|
||
|
||
BINYAN_TO_HEBREW: dict[str, str] = {
|
||
"Pa'al": "פָּעַל",
|
||
"Nif'al": "נִפְעַל",
|
||
"Pi'el": "פִּעֵל",
|
||
"Pu'al": "פֻּעַל",
|
||
"Hitpa'el": "הִתְפַּעֵל",
|
||
"Hif'il": "הִפְעִיל",
|
||
"Huf'al": "הֻפְעַל",
|
||
}
|
||
|
||
# ──────────────────────────────────────────────────────────────────────────────
|
||
# PoS → Hebrew label mapping
|
||
# ──────────────────────────────────────────────────────────────────────────────
|
||
|
||
POS_TO_HEBREW = {
|
||
"Noun": "שם עצם",
|
||
"Verb": "פועל",
|
||
"Adjective": "שם תואר",
|
||
"Adverb": "תואר הפועל",
|
||
"Preposition": "מילת יחס",
|
||
"Conjunction": "מילת חיבור",
|
||
"Pronoun": "כינוי גוף",
|
||
"Particle": "מילית",
|
||
}
|
||
|
||
# PoS category groupings for related-words display
|
||
POS_CATEGORY_LABELS = {
|
||
"Verb": "פעלים",
|
||
"Noun": "שמות עצם",
|
||
"Adjective": "שמות תואר",
|
||
"Adverb": "תוארי הפועל",
|
||
}
|
||
|
||
# ──────────────────────────────────────────────────────────────────────────────
|
||
# Shared CSS
|
||
# ──────────────────────────────────────────────────────────────────────────────
|
||
|
||
FONTS_DIR = DATA_DIR / "fonts"
|
||
|
||
CARD_CSS = """
|
||
@font-face {
|
||
font-family: 'Heebo';
|
||
src: url('_Heebo-Regular.ttf');
|
||
font-weight: normal;
|
||
}
|
||
@font-face {
|
||
font-family: 'Heebo';
|
||
src: url('_Heebo-Bold.ttf');
|
||
font-weight: bold;
|
||
}
|
||
.card {
|
||
font-family: 'Heebo', 'Arial Hebrew', 'David', Arial, sans-serif;
|
||
font-size: 20px;
|
||
text-align: center;
|
||
color: #222;
|
||
background: #fff;
|
||
padding: 16px;
|
||
}
|
||
.hebrew {
|
||
font-size: 36px;
|
||
font-weight: bold;
|
||
direction: rtl;
|
||
text-align: center;
|
||
line-height: 1.5;
|
||
color: #222;
|
||
}
|
||
.hebrew-sm {
|
||
font-size: 24px;
|
||
font-weight: normal;
|
||
direction: rtl;
|
||
text-align: center;
|
||
color: #333;
|
||
}
|
||
.meaning {
|
||
font-size: 28px;
|
||
color: #1a1a8c;
|
||
margin: 8px 0;
|
||
}
|
||
.hint {
|
||
font-size: 16px;
|
||
color: #888;
|
||
margin: 4px 0;
|
||
direction: rtl;
|
||
}
|
||
.root-info {
|
||
font-size: 18px;
|
||
color: #555;
|
||
margin-top: 6px;
|
||
direction: rtl;
|
||
}
|
||
.example {
|
||
font-size: 18px;
|
||
color: #444;
|
||
direction: rtl;
|
||
text-align: right;
|
||
font-style: italic;
|
||
margin: 10px auto 0;
|
||
max-width: 90%;
|
||
border-right: 3px solid #aaa;
|
||
padding-right: 8px;
|
||
}
|
||
.divider { border-top: 1px solid #ddd; margin: 10px 0; }
|
||
.freq-badge {
|
||
display: inline-block;
|
||
font-size: 11px;
|
||
color: #aaa;
|
||
background: transparent;
|
||
border: 1px solid #eee;
|
||
border-radius: 10px;
|
||
padding: 2px 8px;
|
||
margin-top: 4px;
|
||
}
|
||
.voice-label {
|
||
font-size: 0.6em;
|
||
font-weight: normal;
|
||
color: #555;
|
||
}
|
||
.sec-label {
|
||
font-size: 32px;
|
||
color: #555;
|
||
direction: rtl;
|
||
text-align: center;
|
||
margin-top: 6px;
|
||
}
|
||
.sec-key {
|
||
font-size: 24px;
|
||
color: #888;
|
||
}
|
||
.related-group {
|
||
direction: rtl;
|
||
text-align: right;
|
||
margin: 2px 0;
|
||
font-size: 18px;
|
||
}
|
||
.emoji-img {
|
||
font-size: 3.5em;
|
||
text-align: center;
|
||
margin: 0.3em 0;
|
||
}
|
||
@media (prefers-color-scheme: dark) {
|
||
.card { color: #e8e8e8; background: #1c1c1e; }
|
||
.hebrew { color: #f0f0f0; }
|
||
.hebrew-sm { color: #ddd; }
|
||
.meaning { color: #82b0ff; }
|
||
.root-info { color: #aaa; }
|
||
.sec-label { color: #aaa; }
|
||
.sec-key { color: #666; }
|
||
.hint { color: #777; }
|
||
.voice-label { color: #888; }
|
||
.example { color: #bbb; border-right-color: #555; }
|
||
.divider { border-top-color: #333; }
|
||
.freq-badge { color: #888; border-color: #444; }
|
||
}
|
||
"""
|
||
|
||
# ──────────────────────────────────────────────────────────────────────────────
|
||
# Vocabulary Deck
|
||
# ──────────────────────────────────────────────────────────────────────────────
|
||
|
||
VOCAB_FRONT_HEB = """
|
||
<div class="hebrew">{{Word}}{{#Prep}} <span class="hebrew-sm">{{Prep}}</span>{{/Prep}}</div>
|
||
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
|
||
"""
|
||
|
||
VOCAB_BACK_HEB = """
|
||
{{FrontSide}}
|
||
<div class="divider"></div>
|
||
<div class="meaning">{{Meaning}}</div>
|
||
{{#Emoji}}<div class="emoji-img">{{Emoji}}</div>{{/Emoji}}
|
||
{{^Emoji}}{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;" onerror="this.parentElement.style.display='none'"></div>{{/Image}}{{/Emoji}}
|
||
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>{{/Root}}
|
||
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span> {{PoS}}</div>{{/PoS}}
|
||
{{#SharedRoots}}
|
||
<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
|
||
<div class="root-info">{{SharedRoots}}</div>
|
||
{{/SharedRoots}}
|
||
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
|
||
{{#Example}}
|
||
<div class="example">{{Example}}</div>
|
||
{{/Example}}
|
||
{{#Frequency}}<div class="freq-badge">#{{Frequency}}</div>{{/Frequency}}
|
||
"""
|
||
|
||
VOCAB_FRONT_ENG = """
|
||
<div class="meaning">{{Meaning}}</div>
|
||
{{#Hint}}<div class="hint">{{Hint}}</div>{{/Hint}}
|
||
{{#Emoji}}<div class="emoji-img">{{Emoji}}</div>{{/Emoji}}
|
||
{{^Emoji}}{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;" onerror="this.parentElement.style.display='none'"></div>{{/Image}}{{/Emoji}}
|
||
"""
|
||
|
||
VOCAB_BACK_ENG = """
|
||
{{FrontSide}}
|
||
<div class="divider"></div>
|
||
<div class="hebrew">{{Word}}{{#Prep}} <span class="hebrew-sm">{{Prep}}</span>{{/Prep}}</div>
|
||
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
|
||
{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span> {{WordNoNikkud}}</div>{{/WordNoNikkud}}
|
||
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>{{/Root}}
|
||
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span> {{PoS}}</div>{{/PoS}}
|
||
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
|
||
{{#Example}}
|
||
<div class="example">{{Example}}</div>
|
||
{{/Example}}
|
||
"""
|
||
|
||
VOCAB_FRONT_CLOZE = """
|
||
<div class="example" style="font-size:24px;font-style:normal;border:none;padding:0;text-align:center;">{{ClozeExample}}</div>
|
||
{{#ClozeHint}}<div class="hint">{{ClozeHint}}</div>{{/ClozeHint}}
|
||
"""
|
||
|
||
VOCAB_BACK_CLOZE = """
|
||
{{FrontSide}}
|
||
<div class="divider"></div>
|
||
<div class="hebrew">{{Word}}</div>
|
||
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
|
||
<div class="meaning">{{Meaning}}</div>
|
||
"""
|
||
|
||
VOCAB_MODEL = genanki.Model(
|
||
VOCAB_MODEL_ID,
|
||
"Hebrew Flash Cards",
|
||
fields=[
|
||
{"name": "Word"},
|
||
{"name": "Root"},
|
||
{"name": "PoS"},
|
||
{"name": "Meaning"},
|
||
{"name": "WordNoNikkud"},
|
||
{"name": "SharedRoots"},
|
||
{"name": "Tags"},
|
||
{"name": "Audio"},
|
||
{"name": "Example"},
|
||
{"name": "Frequency"},
|
||
{"name": "Image"},
|
||
{"name": "Emoji"},
|
||
{"name": "Prep"},
|
||
{"name": "Hint"},
|
||
{"name": "Plural"},
|
||
{"name": "ClozeExample"},
|
||
{"name": "ClozeHint"},
|
||
],
|
||
templates=[
|
||
{
|
||
# ord 0 — matches Nevo's original "Card 2" (Eng→Heb)
|
||
"name": "English → Hebrew",
|
||
"qfmt": VOCAB_FRONT_ENG,
|
||
"afmt": VOCAB_BACK_ENG,
|
||
},
|
||
{
|
||
# ord 1 — matches Nevo's original "Card 3" (Heb→Eng)
|
||
"name": "Hebrew → English",
|
||
"qfmt": VOCAB_FRONT_HEB,
|
||
"afmt": VOCAB_BACK_HEB,
|
||
},
|
||
{
|
||
# ord 2 — Cloze-style sentence fill-in (only generated when ClozeExample is non-empty)
|
||
"name": "Sentence Cloze",
|
||
"qfmt": VOCAB_FRONT_CLOZE,
|
||
"afmt": VOCAB_BACK_CLOZE,
|
||
},
|
||
],
|
||
css=CARD_CSS,
|
||
)
|
||
|
||
# ──────────────────────────────────────────────────────────────────────────────
|
||
# Conjugation Deck
|
||
# ──────────────────────────────────────────────────────────────────────────────
|
||
|
||
CONJ_FRONT = """
|
||
<div class="hebrew">{{Pronoun}}</div>
|
||
<div class="meaning" style="font-size:28px;">{{ReferenceForm}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
|
||
<div class="hebrew">{{Tense}}</div>
|
||
"""
|
||
|
||
CONJ_BACK = """
|
||
{{FrontSide}}<hr>
|
||
<div class="hebrew">{{ConjugatedForm}}</div>
|
||
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
|
||
{{#Meaning}}<div class="sec-label">{{Meaning}}</div>{{/Meaning}}
|
||
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>
|
||
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span> {{Binyan}}</div>
|
||
{{#RelatedVocab}}<div class="sec-label">{{RelatedVocab}}</div>{{/RelatedVocab}}
|
||
"""
|
||
|
||
CONJ_CSS = CARD_CSS
|
||
|
||
CONJ_MODEL = genanki.Model(
|
||
CONJ_MODEL_ID,
|
||
"Pealim Conjugation",
|
||
fields=[
|
||
{"name": "Infinitive"},
|
||
{"name": "ReferenceForm"},
|
||
{"name": "Pronoun"},
|
||
{"name": "Tense"},
|
||
{"name": "ConjugatedForm"},
|
||
{"name": "Root"},
|
||
{"name": "Binyan"},
|
||
{"name": "Voice"},
|
||
{"name": "Audio"},
|
||
{"name": "Meaning"},
|
||
{"name": "RelatedVocab"},
|
||
],
|
||
templates=[
|
||
{
|
||
"name": "Conjugation Drill",
|
||
"qfmt": CONJ_FRONT,
|
||
"afmt": CONJ_BACK,
|
||
}
|
||
],
|
||
css=CONJ_CSS,
|
||
)
|
||
|
||
# Present-tense expansion: each form key → list of (pronoun, tense_label)
|
||
PRESENT_EXPANSION = {
|
||
"present_ms": [
|
||
("אֲנִי (זָכָר)", "הוֹוֶה"),
|
||
("אַתָּה", "הוֹוֶה"),
|
||
("הוּא", "הוֹוֶה"),
|
||
],
|
||
"present_fs": [
|
||
("אֲנִי (נְקֵבָה)", "הוֹוֶה"),
|
||
("אַתְּ", "הוֹוֶה"),
|
||
("הִיא", "הוֹוֶה"),
|
||
],
|
||
"present_mp": [
|
||
("אֲנַחְנוּ (זָכָר)", "הוֹוֶה"),
|
||
("אַתֶּם", "הוֹוֶה"),
|
||
("הֵם", "הוֹוֶה"),
|
||
],
|
||
"present_fp": [
|
||
("אֲנַחְנוּ (נְקֵבָה)", "הוֹוֶה"),
|
||
("אַתֶּן", "הוֹוֶה"),
|
||
("הֵן", "הוֹוֶה"),
|
||
],
|
||
}
|
||
|
||
# Modern Hebrew: 2fp/3fp future and imperative default to mp form (classical in parens)
|
||
FP_MODERN_FALLBACK = {
|
||
"future_2fp": "future_2mp",
|
||
"future_3fp": "future_3mp",
|
||
"imperative_fp": "imperative_mp",
|
||
}
|
||
|
||
# 3rd person plural past: same form for m/f — generate two separate pronoun cards
|
||
PAST_3P_EXPANSION = [
|
||
("הֵם", "עָבָר"),
|
||
("הֵן", "עָבָר"),
|
||
]
|
||
|
||
# Voice field: passive label only (shown inline on card front for Pu'al/Huf'al)
|
||
VOICE_MAP = {
|
||
"Pu'al": "סָבִיל",
|
||
"Huf'al": "סָבִיל",
|
||
}
|
||
|
||
|
||
# ──────────────────────────────────────────────────────────────────────────────
|
||
# Helpers
|
||
# ──────────────────────────────────────────────────────────────────────────────
|
||
|
||
|
||
def _audio_tag(word_no_nikkud: str, audio_dir: Path = AUDIO_DIR) -> str:
    """Build the Anki ``[sound:...]`` tag for a word's audio file.

    Every character outside the range U+05D0-U+05EA is removed from
    *word_no_nikkud*; what remains is the file stem looked up as
    ``<stem>.mp3`` inside *audio_dir*.

    Returns an empty string when nothing remains after filtering or when
    the file does not exist.
    """
    letters_only = re.sub(r"[^\u05d0-\u05ea]", "", word_no_nikkud)
    if letters_only:
        candidate = audio_dir / f"{letters_only}.mp3"
        if candidate.exists():
            return f"[sound:{candidate.name}]"
    return ""
|
||
|
||
|
||
def _conj_audio_tag(slug: str, form_key: str) -> str:
    """Build the Anki ``[sound:...]`` tag for one conjugated form.

    The expected file is ``<slug>_<form_key>.mp3`` inside ``AUDIO_CONJ_DIR``.
    Returns an empty string when that file is not present on disk.
    """
    mp3_name = f"{slug}_{form_key}.mp3"
    found = (AUDIO_CONJ_DIR / mp3_name).exists()
    return f"[sound:{mp3_name}]" if found else ""
|
||
|
||
|
||
# Keywords excluded when building emoji lookup AND matching meaning text.
|
||
# Curated from manual review of all 2,261 emoji-word pairs (Sprint 8).
|
||
_EMOJI_STOP = frozenset(
|
||
{
|
||
# Basic stop words
|
||
"to",
|
||
"be",
|
||
"a",
|
||
"an",
|
||
"the",
|
||
"of",
|
||
"in",
|
||
"on",
|
||
"at",
|
||
"for",
|
||
"and",
|
||
"with",
|
||
"by",
|
||
"or",
|
||
"but",
|
||
"not",
|
||
"as",
|
||
"its",
|
||
# Generic emoji description words (too vague)
|
||
"face",
|
||
"hand",
|
||
"sign",
|
||
"symbol",
|
||
"button",
|
||
"small",
|
||
"large",
|
||
"light",
|
||
"dark",
|
||
"open",
|
||
"closed",
|
||
# Numbers → clock emoji (🕐🕑🕒 etc.)
|
||
"one",
|
||
"two",
|
||
"three",
|
||
"four",
|
||
"five",
|
||
"six",
|
||
"seven",
|
||
"eight",
|
||
"nine",
|
||
"ten",
|
||
"hundred",
|
||
"thousand",
|
||
# UI/media buttons (⏭️▶️⏪⏯️🔁🔚🔜⏺️)
|
||
"next",
|
||
"fast",
|
||
"play",
|
||
"pause",
|
||
"repeat",
|
||
"end",
|
||
"soon",
|
||
"record",
|
||
# Abstract words → misleading object emoji
|
||
"part",
|
||
"place",
|
||
"mark",
|
||
"post",
|
||
"department",
|
||
"store",
|
||
"note",
|
||
"control",
|
||
"level",
|
||
"stop",
|
||
"cover",
|
||
"roll",
|
||
"rolling",
|
||
"pick",
|
||
"over",
|
||
"right",
|
||
"way",
|
||
"skin",
|
||
"drop",
|
||
"middle",
|
||
"piece",
|
||
"section",
|
||
# Country/direction words → flag emoji (🇰🇵🇬🇸🇮🇴🇻🇦🇨🇫 etc.)
|
||
"north",
|
||
"south",
|
||
"northern",
|
||
"southern",
|
||
"western",
|
||
"eastern",
|
||
"central",
|
||
"territory",
|
||
"kingdom",
|
||
"united",
|
||
"virgin",
|
||
# Common words producing bad emoji matches
|
||
"new",
|
||
"big",
|
||
"full",
|
||
"last",
|
||
"first",
|
||
"double",
|
||
"slightly",
|
||
"without",
|
||
"from",
|
||
"behind",
|
||
"people",
|
||
"position",
|
||
"status",
|
||
"situation",
|
||
"game",
|
||
"call",
|
||
"trade",
|
||
"male",
|
||
"female",
|
||
"person",
|
||
"letter",
|
||
# Polysemous words → wrong emoji sense
|
||
"french",
|
||
"fried",
|
||
"board",
|
||
"bow",
|
||
"water",
|
||
"union",
|
||
"rock",
|
||
"left",
|
||
"back",
|
||
"crane",
|
||
"dash",
|
||
"bar",
|
||
"wheel",
|
||
"horizontal",
|
||
}
|
||
)
|
||
|
||
|
||
def _parse_emoji_test(text: str) -> dict[str, str]:
    """Build a ``{keyword: emoji_char}`` map from the text of emoji-test.txt.

    Only lines containing ``fully-qualified`` are considered. From each such
    line the emoji and its description are taken from the trailing
    ``# <emoji> E<version> <description>`` comment. Description words are
    lowercased and stripped of surrounding ``. , ' " -``; words of length
    <= 2 and words in ``_EMOJI_STOP`` are skipped. The first emoji seen for
    a keyword wins.
    """
    entry_re = re.compile(r"#\s+(\S+)\s+E[\d.]+\s+(.+)")
    lookup: dict[str, str] = {}
    for line in text.splitlines():
        if "fully-qualified" not in line:
            continue
        m = entry_re.search(line)
        if not m:
            continue
        emoji_char = m.group(1)
        for word in m.group(2).lower().strip().split():
            word = word.strip(".,'\"-")
            if len(word) > 2 and word not in _EMOJI_STOP:
                lookup.setdefault(word, emoji_char)
    return lookup


def _load_emoji_lookup() -> dict[str, str]:
    """Load or fetch Unicode emoji keyword→character lookup.

    Parses unicode.org emoji-test.txt to build {keyword: emoji_char} mapping.
    Result is cached in data/emoji_lookup.json (always read and written as
    UTF-8). An unreadable or corrupt cache is ignored and rebuilt.

    Returns:
        The keyword→emoji mapping, or an empty dict when the data cannot be
        fetched (safe fallback). A failure to write the cache is logged and
        does not discard the freshly built mapping.
    """
    cache_file = DATA_DIR / "emoji_lookup.json"
    if cache_file.exists():
        try:
            # Explicit UTF-8: the cache stores raw emoji (ensure_ascii=False),
            # which the locale default encoding may not be able to decode.
            with open(cache_file, encoding="utf-8") as f:
                return json.load(f)
        except (ValueError, OSError) as e:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError
            # (e.g. a cache written earlier under a non-UTF-8 locale).
            logger.warning(f"Could not read emoji cache {cache_file}: {e}. Rebuilding.")

    # Imported lazily so the dependency is only needed when the cache is missing.
    import requests

    try:
        resp = requests.get(
            "https://unicode.org/Public/emoji/latest/emoji-test.txt",
            timeout=30,
        )
        resp.raise_for_status()
    except Exception as e:
        logger.warning(f"Could not fetch emoji data: {e}. Emoji lookup disabled.")
        return {}

    # Unicode data files are UTF-8; don't rely on requests guessing the
    # charset from headers (it falls back to ISO-8859-1 for text/* responses).
    resp.encoding = "utf-8"
    lookup = _parse_emoji_test(resp.text)

    try:
        cache_file.write_text(json.dumps(lookup, ensure_ascii=False), encoding="utf-8")
    except OSError as e:
        logger.warning(f"Could not write emoji cache {cache_file}: {e}")
    else:
        logger.info(f"Built emoji lookup: {len(lookup)} keywords → {cache_file}")
    return lookup
|
||
|
||
|
||
def _translate_pos(pos_str: str) -> str:
    """Translate a part-of-speech string to its Hebrew label.

    The English labels in ``POS_TO_HEBREW`` are matched case-insensitively
    against *pos_str*, in dict order. A label only matches where it is not
    preceded by another ASCII letter, so "Adverb" is not mistaken for
    "Verb" and "Pronoun" is not mistaken for "Noun" (plain substring
    matching got both wrong because "Verb"/"Noun" come first in the table).

    For verbs, the binyan name is looked up in ``BINYAN_TO_HEBREW`` and, if
    found, appended to the Hebrew label.

    Returns:
        The Hebrew label, or *pos_str* unchanged when no label matches.
    """
    lowered = pos_str.lower()
    for eng, heb in POS_TO_HEBREW.items():
        # Negative lookbehind: the label must start a word, not continue one.
        if not re.search(rf"(?<![a-z]){re.escape(eng.lower())}", lowered):
            continue
        if eng == "Verb":
            # Extract binyan from strings like "Verb – Pi'el" or "Verb –pi'el"
            normalized = lowered.replace("–", "-").replace("—", "-")
            for binyan_eng, binyan_heb in BINYAN_TO_HEBREW.items():
                if binyan_eng.lower() in normalized:
                    return f"פועל — {binyan_heb}"
        return heb
    return pos_str
|
||
|
||
|
||
def _categorize_pos(pos_str: str) -> str:
    """Return the canonical PoS category key for grouping.

    The keys of ``POS_CATEGORY_LABELS`` are matched case-insensitively
    against *pos_str*, in dict order. A key only matches where it is not
    preceded by another ASCII letter, so "Adverb" is not categorized as
    "Verb" and "Pronoun" is not categorized as "Noun" (plain substring
    matching got both wrong).

    Returns:
        The matching category key, or ``"Other"`` when none matches.
    """
    lowered = pos_str.lower()
    for cat in POS_CATEGORY_LABELS:
        # Negative lookbehind: the key must start a word, not continue one.
        if re.search(rf"(?<![a-z]){re.escape(cat.lower())}", lowered):
            return cat
    return "Other"
|
||
|
||
|
||
def build_vocab_deck(
|
||
dict_csv: Path,
|
||
examples_cache: dict | None = None,
|
||
freq_cache: dict | None = None,
|
||
image_cache: dict | None = None,
|
||
emoji_lookup: dict | None = None,
|
||
limit: int | None = None,
|
||
include_audio: bool = True,
|
||
include_images: bool = True,
|
||
) -> tuple[genanki.Deck, list[Path]]:
|
||
"""
|
||
Build the vocabulary deck from pealim_dict_for_anki.csv (or pealim_dict.csv).
|
||
Returns (deck, list_of_media_files).
|
||
"""
|
||
logger.info(f"Loading dictionary from {dict_csv}")
|
||
try:
|
||
df = pd.read_csv(dict_csv, sep=";", index_col=0)
|
||
if df.shape[1] < 3:
|
||
raise ValueError("too few columns")
|
||
except (ValueError, pd.errors.ParserError):
|
||
df = pd.read_csv(dict_csv, index_col=0)
|
||
|
||
if limit:
|
||
df = df.head(limit)
|
||
|
||
logger.info(f" {len(df)} rows loaded")
|
||
|
||
examples_cache = examples_cache or {}
|
||
freq_cache = freq_cache or {}
|
||
image_cache = image_cache or {}
|
||
|
||
# Load EPUB/PDF sentence matches (nikkud'd — preferred over Ben Yehuda)
|
||
epub_examples: dict[str, list[str]] = {}
|
||
epub_path = DATA_DIR / "vocab_sentence_matches.json"
|
||
if epub_path.exists():
|
||
try:
|
||
with open(epub_path) as _f:
|
||
raw_epub = json.load(_f)
|
||
for word_key, info in raw_epub.items():
|
||
sents = info.get("sentences", [])
|
||
if sents:
|
||
epub_examples[word_key] = [s["text"] if isinstance(s, dict) else s for s in sents]
|
||
# Also index by nikkud form
|
||
nikkud_word = info.get("word_nikkud", "")
|
||
if nikkud_word and nikkud_word != word_key:
|
||
epub_examples[nikkud_word] = epub_examples[word_key]
|
||
logger.info(f" EPUB sentence matches loaded: {len(epub_examples)} words")
|
||
except (json.JSONDecodeError, OSError):
|
||
pass
|
||
|
||
# Load AI-vetted sentences for cloze cards (only approved sentences)
|
||
vetted_cloze: dict[str, list[str]] = {} # word_nikkud → [good sentences]
|
||
vetted_path = DATA_DIR / "vetted_sentences.json"
|
||
if vetted_path.exists():
|
||
try:
|
||
with open(vetted_path) as _f:
|
||
raw_vetted = json.load(_f)
|
||
for word_key, info in raw_vetted.items():
|
||
good = info.get("good_sentences", [])
|
||
if good:
|
||
texts = [s["text"] if isinstance(s, dict) else s for s in good]
|
||
nikkud_word = info.get("word_nikkud", word_key)
|
||
vetted_cloze[nikkud_word] = texts
|
||
if word_key != nikkud_word:
|
||
vetted_cloze[word_key] = texts
|
||
logger.info(f" Vetted cloze sentences loaded: {len(vetted_cloze)} words")
|
||
except (json.JSONDecodeError, OSError):
|
||
pass
|
||
|
||
# Load noun plural forms for vocab card back display
|
||
noun_plural_lookup: dict[str, str] = {} # word (nikkud) → plural (nikkud)
|
||
_noun_plural_stripped: dict[str, str] = {} # word (stripped) → plural (nikkud), fallback
|
||
noun_plural_path = DATA_DIR / "noun_plurals.json"
|
||
if noun_plural_path.exists():
|
||
try:
|
||
with open(noun_plural_path) as _f:
|
||
_noun_data = json.load(_f)
|
||
for _entry in _noun_data.values():
|
||
sg = _entry.get("singular", "")
|
||
pl = _entry.get("plural", "")
|
||
if sg and pl:
|
||
noun_plural_lookup[sg] = pl
|
||
s = _strip_nikkud(sg)
|
||
if s not in _noun_plural_stripped:
|
||
_noun_plural_stripped[s] = pl
|
||
logger.info(f" Noun plurals loaded: {len(noun_plural_lookup)} entries")
|
||
except (json.JSONDecodeError, OSError):
|
||
pass
|
||
|
||
# Load refined meanings for synonym disambiguation (layer 2)
|
||
refined_meanings: dict[str, str] = {}
|
||
refined_path = DATA_DIR / "refined_meanings.json"
|
||
if refined_path.exists():
|
||
try:
|
||
with open(refined_path) as _f:
|
||
refined_meanings = json.load(_f)
|
||
logger.info(f" Refined meanings loaded: {len(refined_meanings)} entries")
|
||
except (json.JSONDecodeError, OSError):
|
||
pass
|
||
|
||
# Load image cache from disk if not passed in
|
||
image_cache_path = DATA_DIR / "image_cache.json"
|
||
if not image_cache and image_cache_path.exists():
|
||
try:
|
||
with open(image_cache_path) as _f:
|
||
image_cache = json.load(_f)
|
||
except (json.JSONDecodeError, OSError) as e:
|
||
logger.debug(f"Could not load image cache from disk: {e}")
|
||
|
||
images_dir = DATA_DIR / "images"
|
||
|
||
# Build word_stripped → pos_category dict for related-words grouping
|
||
word_to_pos_cat: dict[str, str] = {}
|
||
for _, row in df.iterrows():
|
||
wni = str(row.get("Word Without Nikkud", "")).strip()
|
||
pos_raw = str(row.get("Part of speech", row.get("Part of Speech", ""))).strip()
|
||
if wni and pos_raw and pos_raw not in ("nan", "None"):
|
||
word_to_pos_cat[_strip_nikkud(wni)] = _categorize_pos(pos_raw)
|
||
|
||
# Build confusable words set: consonant-only forms with multiple entries
|
||
# Uses _strip_nikkud (removes combining marks) rather than Word Without Nikkud
|
||
# (which preserves matres lectionis) — since sentence matching also uses
|
||
# _strip_nikkud, we need to detect collisions at that level.
|
||
_strip_to_nikkud: dict[str, set[str]] = {}
|
||
for _, row in df.iterrows():
|
||
w = str(row.get("Word", "")).strip()
|
||
if w and w not in ("nan", "None"):
|
||
consonants = _strip_nikkud(w)
|
||
_strip_to_nikkud.setdefault(consonants, set()).add(w)
|
||
_confusable_words: set[str] = {k for k, v in _strip_to_nikkud.items() if len(v) > 1}
|
||
if _confusable_words:
|
||
logger.info(f" Confusable words (homographs): {len(_confusable_words)} stripped forms")
|
||
|
||
# Build ambiguity index: group words by normalized meaning to detect
|
||
# Eng→Heb collisions. A word needs a hint when another word shares
|
||
# the same English meaning. Hint = PoS (+ binyan for verbs).
|
||
_meaning_groups: dict[str, list[tuple[str, str]]] = {} # norm_meaning → [(word, pos_raw)]
|
||
for _, row in df.iterrows():
|
||
w = str(row.get("Word", "")).strip()
|
||
m = str(row.get("Meaning", "")).strip()
|
||
p = str(row.get("Part of speech", row.get("Part of Speech", ""))).strip()
|
||
if not w or not m or m in ("nan", "None"):
|
||
continue
|
||
# Normalize: strip emoji, Hebrew parens, take text before first semicolon
|
||
m_clean = EMOJI_RE.sub("", m).strip()
|
||
m_clean = HBPAREN_RE.sub("", m_clean).strip().strip(",").strip()
|
||
m_norm = m_clean.split(";")[0].strip().lower()
|
||
if m_norm:
|
||
_meaning_groups.setdefault(m_norm, []).append((w, p if p not in ("nan", "None") else ""))
|
||
|
||
# For each word in an ambiguous group, build its hint string
|
||
_word_hints: dict[tuple[str, str], str] = {} # (word, meaning) → hint
|
||
for _m_norm, entries in _meaning_groups.items():
|
||
if len(entries) < 2:
|
||
continue
|
||
# Check if the group has genuinely different PoS/binyan (not just duplicates)
|
||
pos_set = set()
|
||
for _, p in entries:
|
||
pos_set.add(_translate_pos(p) if p else "")
|
||
if len(pos_set) < 2:
|
||
continue
|
||
for w, p in entries:
|
||
hint = _translate_pos(p) if p else ""
|
||
if hint:
|
||
# Find original meaning for this word to build the (word, meaning) key
|
||
_word_hints.setdefault((w, hint), hint)
|
||
|
||
# Rebuild as (word, full_meaning) → hint for lookup during note creation
|
||
_word_meaning_hints: dict[tuple[str, str], str] = {}
|
||
for _, row in df.iterrows():
|
||
w = str(row.get("Word", "")).strip()
|
||
m = str(row.get("Meaning", "")).strip()
|
||
p = str(row.get("Part of speech", row.get("Part of Speech", ""))).strip()
|
||
if not w or not m or m in ("nan", "None"):
|
||
continue
|
||
hint = _translate_pos(p) if p and p not in ("nan", "None") else ""
|
||
if (w, hint) in _word_hints:
|
||
_word_meaning_hints[(w, m)] = hint
|
||
|
||
if _word_meaning_hints:
|
||
logger.info(f" Eng→Heb disambiguation hints: {len(_word_meaning_hints)} words")
|
||
|
||
# Sort by frequency rank
|
||
def freq_sort_key(row):
|
||
word_plain = _strip_nikkud(str(row.get("Word Without Nikkud", row.get("WordNoNikkud", ""))).strip())
|
||
return freq_cache.get(word_plain, 999_999)
|
||
|
||
df["_freq_rank"] = df.apply(freq_sort_key, axis=1)
|
||
df = df.sort_values("_freq_rank")
|
||
|
||
deck = genanki.Deck(VOCAB_DECK_ID, "Hebrew Vocabulary")
|
||
media_files: list[Path] = []
|
||
seen_words: set[tuple[str, str]] = set()
|
||
|
||
for _, row in df.iterrows():
|
||
word = str(row.get("Word", "")).strip()
|
||
root = str(row.get("Root", "")).strip()
|
||
pos_raw = str(row.get("Part of speech", row.get("Part of Speech", ""))).strip()
|
||
meaning = str(row.get("Meaning", "")).strip()
|
||
word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
|
||
shared_roots = str(row.get("shared roots", row.get("SharedRoots", ""))).strip()
|
||
tags_str = str(row.get("tags", row.get("Tags", ""))).strip()
|
||
freq_rank_raw = row["_freq_rank"]
|
||
if freq_rank_raw <= 500:
|
||
freq_display = f"Core #{freq_rank_raw}"
|
||
elif freq_rank_raw <= 1500:
|
||
freq_display = f"Essential #{freq_rank_raw}"
|
||
elif freq_rank_raw <= 3000:
|
||
freq_display = f"Intermediate #{freq_rank_raw}"
|
||
elif freq_rank_raw <= 5000:
|
||
freq_display = f"Upper-intermediate #{freq_rank_raw}"
|
||
elif freq_rank_raw <= 10000:
|
||
freq_display = f"Advanced #{freq_rank_raw}"
|
||
elif freq_rank_raw < 999_999:
|
||
freq_display = f"Rare #{freq_rank_raw}"
|
||
else:
|
||
freq_display = "Unlisted"
|
||
|
||
root = "" if root in ("nan", "None", "-") else root
|
||
pos_raw = "" if pos_raw in ("nan", "None") else pos_raw
|
||
meaning = "" if meaning in ("nan", "None") else meaning
|
||
word_no_nik = "" if word_no_nik in ("nan", "None") else word_no_nik
|
||
shared_roots = "" if shared_roots in ("nan", "None") else shared_roots
|
||
tags_str = "" if tags_str in ("nan", "None") else tags_str
|
||
|
||
if not word or not meaning:
|
||
continue
|
||
|
||
# Skip exact duplicates (same word AND same meaning — true dupes).
|
||
# Homographs (same word, different meaning) are kept as separate notes.
|
||
word_meaning_key = (word, meaning)
|
||
if word_meaning_key in seen_words:
|
||
logger.debug(f" Skipping duplicate word+meaning: {word}")
|
||
continue
|
||
seen_words.add(word_meaning_key)
|
||
|
||
# Extract emoji from meaning (pealim embeds emoji in meaning text)
|
||
emoji_str = "".join(EMOJI_RE.findall(meaning))
|
||
meaning_clean = EMOJI_RE.sub("", meaning).strip()
|
||
|
||
# Fallback: look up emoji from Unicode standard by English keyword
|
||
if not emoji_str and emoji_lookup:
|
||
for kw in re.sub(r"[^\w\s]", " ", meaning_clean.lower()).split()[:5]:
|
||
if len(kw) > 2 and kw not in _EMOJI_STOP and kw in emoji_lookup:
|
||
emoji_str = emoji_lookup[kw]
|
||
break
|
||
|
||
# Extract Hebrew parentheticals (prepositions) from meaning
|
||
preps = HBPAREN_RE.findall(meaning_clean)
|
||
prep_str = " ".join(f"({p})" for p in preps)
|
||
meaning_clean = HBPAREN_RE.sub("", meaning_clean).strip().strip(",").strip()
|
||
|
||
# Apply refined meaning if available (AI disambiguation layer 2)
|
||
if word in refined_meanings:
|
||
meaning_clean = refined_meanings[word]
|
||
|
||
# Translate PoS to Hebrew
|
||
pos_heb = _translate_pos(pos_raw) if pos_raw else ""
|
||
|
||
# Eng→Heb disambiguation hint (PoS + binyan, shown only for ambiguous meanings)
|
||
hint_str = _word_meaning_hints.get((word, meaning), "")
|
||
|
||
# Audio
|
||
audio_tag = _audio_tag(word_no_nik) if include_audio else ""
|
||
if audio_tag:
|
||
mp3_name = audio_tag.removeprefix("[sound:").removesuffix("]")
|
||
mp3_path = AUDIO_DIR / mp3_name
|
||
if mp3_path not in media_files:
|
||
media_files.append(mp3_path)
|
||
|
||
# Consonant-only form for confusable detection and cloze matching
|
||
word_consonants = _strip_nikkud(word)
|
||
is_confusable = word_consonants in _confusable_words
|
||
|
||
# Example sentences — priority: EPUB (nikkud'd) > Ben Yehuda > none
|
||
# For confusable words (same consonants, different nikkud), only match by
|
||
# exact nikkud form to avoid showing wrong-word sentences.
|
||
example_html = ""
|
||
# 1. EPUB/PDF sentences (full nikkud)
|
||
epub_sents = epub_examples.get(word)
|
||
if not epub_sents and not is_confusable:
|
||
epub_sents = epub_examples.get(word_no_nik) or epub_examples.get(_strip_nikkud(word_no_nik))
|
||
if epub_sents:
|
||
example_html = epub_sents[0]
|
||
else:
|
||
# 2. Ben Yehuda examples (some have nikkud from nikkud corpus)
|
||
by_sents = examples_cache.get(word)
|
||
if not by_sents and not is_confusable:
|
||
by_sents = examples_cache.get(word_no_nik) or examples_cache.get(_strip_nikkud(word_no_nik))
|
||
if by_sents:
|
||
# Prefer nikkud'd Ben Yehuda sentences (contain combining marks)
|
||
nikkud_sents = [s for s in by_sents if any("\u0591" <= c <= "\u05c7" for c in s)]
|
||
example_html = nikkud_sents[0] if nikkud_sents else by_sents[0]
|
||
|
||
# Cloze example: replace target word with blank in example sentence.
|
||
# Priority: AI-vetted sentences > EPUB/Ben Yehuda sentences.
|
||
# Uses stripped (no-nikkud) matching. Skips homographs (confusable words).
|
||
cloze_example = ""
|
||
cloze_hint = ""
|
||
if word_consonants and not is_confusable:
|
||
# Pick best sentence for cloze: vetted first, then example_html
|
||
cloze_source = None
|
||
vetted = vetted_cloze.get(word)
|
||
if not vetted and not is_confusable:
|
||
vetted = vetted_cloze.get(word_no_nik) or vetted_cloze.get(_strip_nikkud(word_no_nik))
|
||
if vetted:
|
||
cloze_source = vetted[0]
|
||
elif example_html:
|
||
cloze_source = example_html
|
||
|
||
if cloze_source:
|
||
tokens = cloze_source.split()
|
||
word_stripped = _strip_nikkud(word)
|
||
replaced = False
|
||
if word_stripped:
|
||
for i, tok in enumerate(tokens):
|
||
tok_stripped = _strip_nikkud(tok)
|
||
m = re.match(r'^(.*?)([\.,!?;:"\u0027]*)$', tok_stripped)
|
||
tok_core = m.group(1) if m else tok_stripped
|
||
punct_match = re.search(r'[.,!?;:"\u0027]+$', tok)
|
||
trailing = punct_match.group() if punct_match else ""
|
||
if tok_core == word_stripped:
|
||
tokens[i] = "_____" + trailing
|
||
replaced = True
|
||
break
|
||
if replaced:
|
||
cloze_example = " ".join(tokens)
|
||
pos_cat = _categorize_pos(pos_raw)
|
||
cloze_hint = meaning_clean
|
||
if pos_cat == "Verb" and pos_heb:
|
||
cloze_hint = f"{meaning_clean} ({pos_heb})"
|
||
|
||
# Related words grouped by PoS category
|
||
related_html = ""
|
||
if shared_roots:
|
||
related_words = shared_roots.split()
|
||
groups: dict[str, list[str]] = {}
|
||
for rw in related_words:
|
||
cat = word_to_pos_cat.get(_strip_nikkud(rw), "Other")
|
||
groups.setdefault(cat, []).append(rw)
|
||
parts = []
|
||
for cat, words in groups.items():
|
||
if cat == "Other":
|
||
# No label for uncategorized words — just list them plain
|
||
parts.append(f'<div class="related-group">{" ".join(words)}</div>')
|
||
else:
|
||
label = POS_CATEGORY_LABELS.get(cat, cat)
|
||
parts.append(f'<div class="related-group"><b>{label}:</b> {" ".join(words)}</div>')
|
||
related_html = "\n".join(parts)
|
||
|
||
# Image: look up by stripped word (no-nikkud)
|
||
image_tag = ""
|
||
if include_images:
|
||
image_filename = image_cache.get(word_no_nik or _strip_nikkud(word), None)
|
||
if image_filename:
|
||
image_path = images_dir / image_filename
|
||
if image_path.exists():
|
||
image_tag = image_filename
|
||
if image_path not in media_files:
|
||
media_files.append(image_path)
|
||
|
||
note = genanki.Note(
|
||
model=VOCAB_MODEL,
|
||
# Stable GUID: uses legacy GUID from Nevo's original deck when
|
||
# available, otherwise deterministic from word + meaning.
|
||
guid=_vocab_guid(word, meaning),
|
||
fields=[
|
||
word,
|
||
root,
|
||
pos_heb,
|
||
meaning_clean,
|
||
word_no_nik,
|
||
related_html or shared_roots,
|
||
tags_str,
|
||
audio_tag,
|
||
example_html,
|
||
freq_display,
|
||
image_tag,
|
||
emoji_str,
|
||
prep_str,
|
||
hint_str,
|
||
noun_plural_lookup.get(word, "") or _noun_plural_stripped.get(word_consonants, ""),
|
||
cloze_example,
|
||
cloze_hint,
|
||
],
|
||
tags=(tags_str.split() if tags_str else [])
|
||
+ [RELEASE_TAG]
|
||
+ [f"freq::{freq_display.split()[0]}" if freq_display != "Unlisted" else "freq::Unlisted"],
|
||
)
|
||
deck.add_note(note)
|
||
|
||
# Diagnostic: count words with emoji/prep/hint/plural/cloze extracted
|
||
emoji_count = sum(1 for n in deck.notes if n.fields[11])
|
||
prep_count = sum(1 for n in deck.notes if n.fields[12])
|
||
hint_count = sum(1 for n in deck.notes if n.fields[13])
|
||
plural_count = sum(1 for n in deck.notes if n.fields[14])
|
||
cloze_count = sum(1 for n in deck.notes if n.fields[15])
|
||
if emoji_count:
|
||
logger.info(f" Emoji extracted: {emoji_count} words")
|
||
if prep_count:
|
||
logger.info(f" Hebrew prepositions extracted: {prep_count} words")
|
||
if hint_count:
|
||
logger.info(f" Eng→Heb hints: {hint_count} words")
|
||
if plural_count:
|
||
logger.info(f" Noun plurals on vocab cards: {plural_count} words")
|
||
if cloze_count:
|
||
logger.info(f" Sentence cloze cards: {cloze_count} words")
|
||
|
||
# Diagnostic: count words without PoS coverage in shared_roots
|
||
other_count = 0
|
||
for _, row in df.iterrows():
|
||
sr = str(row.get("shared roots", row.get("SharedRoots", ""))).strip()
|
||
if sr and sr not in ("nan", "None"):
|
||
other_count += sum(1 for rw in sr.split() if word_to_pos_cat.get(_strip_nikkud(rw)) is None)
|
||
unlisted = int((df["_freq_rank"] >= 999_999).sum())
|
||
logger.info(f" Unlisted words (not in frequency corpus): {unlisted}/{len(df)}")
|
||
logger.info(f" Related-words without PoS coverage: {other_count} (shown unlabeled)")
|
||
logger.info(f"Vocabulary deck: {len(deck.notes)} notes")
|
||
return deck, media_files
|
||
|
||
|
||
def build_conj_deck(
    conjugations: dict,
    audio_dir: Path = AUDIO_CONJ_DIR,
    include_audio: bool = True,
    dict_csv: Path | None = None,
) -> tuple[genanki.Deck, list[Path]]:
    """Build the conjugation drill deck from conjugations.json data.

    Args:
        conjugations: Mapping of infinitive → verb record. Each record is read
            for ``forms`` (required; records without it are skipped), ``root``,
            ``binyan``, ``reference_form``, ``slug``, ``meaning`` and
            ``alternate_forms``.
        audio_dir: Directory holding ``{slug}_{form_key}.mp3`` files.
        include_audio: When true, attach an audio tag (and its media path) to
            every form for which ``_conj_audio_tag`` returns a tag.
        dict_csv: Optional ``;``-separated vocab CSV. Used as a fallback source
            for the verb meaning and to list other words sharing the root.

    Returns:
        ``(deck, media_files)`` — the populated deck and the de-duplicated list
        of audio paths referenced by its notes, in first-seen order.
    """
    deck = genanki.Deck(CONJ_DECK_ID, "Hebrew Conjugations")
    media_files: list[Path] = []
    seen_media: set[Path] = set()  # O(1) membership test alongside the ordered list
    note_count = 0

    # Build lookup tables from vocab CSV for cross-linking
    verb_meaning: dict[str, str] = {}  # word_no_nikkud → meaning
    root_words: dict[str, list[str]] = {}  # root → [related words]
    if dict_csv and dict_csv.exists():
        vdf = pd.read_csv(dict_csv, sep=";", index_col=0)
        for _, row in vdf.iterrows():
            word = str(row.get("Word", "")).strip()
            word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
            meaning = str(row.get("Meaning", "")).strip()
            root = str(row.get("Root", "")).strip()
            if root and root not in ("nan", "None", "-"):
                root_words.setdefault(root, []).append(word)
            if meaning and meaning not in ("nan", "None"):
                # Use Word Without Nikkud (ktiv male) for matching
                if word_no_nik and word_no_nik not in ("nan", "None"):
                    verb_meaning[word_no_nik] = meaning
                verb_meaning[_strip_nikkud(word)] = meaning

    for infinitive, data in conjugations.items():
        if not data or not data.get("forms"):
            continue

        root = data.get("root", "")
        binyan = data.get("binyan", "")
        binyan_heb = BINYAN_TO_HEBREW.get(binyan, binyan)
        ref_form = data.get("reference_form", infinitive)
        slug = data.get("slug", "")
        voice = VOICE_MAP.get(binyan, "")

        # Meaning: prefer scraped meaning from pealim page, fall back to CSV cross-link
        meaning = (
            data.get("meaning", "")
            or verb_meaning.get(infinitive, "")
            or verb_meaning.get(_strip_nikkud(infinitive), "")
        )
        related = [w for w in root_words.get(root, []) if w != infinitive]
        related_str = " ".join(related[:8]) if related else ""
        forms = data["forms"]

        def add_note(
            pronoun: str,
            tense: str,
            conj_form: str,
            audio_tag: str,
            *,
            _infinitive: str = infinitive,
            _ref_form: str = ref_form,
            _root: str = root,
            _binyan_heb: str = binyan_heb,
            _voice: str = voice,
            _meaning: str = meaning,
            _related_str: str = related_str,
        ) -> None:
            """Add one quiz note for the current verb; skip forms with no Hebrew letters."""
            nonlocal note_count
            if not conj_form or not re.search(r"[\u05d0-\u05ea]", conj_form):
                return
            note = genanki.Note(
                model=CONJ_MODEL,
                guid=genanki.guid_for(_infinitive, pronoun, tense),
                fields=[
                    _infinitive,
                    _ref_form,
                    pronoun,
                    tense,
                    conj_form,
                    _root,
                    _binyan_heb,
                    _voice,
                    audio_tag,
                    _meaning,
                    _related_str,
                ],
                tags=[RELEASE_TAG],
            )
            deck.add_note(note)
            note_count += 1

        # A record may carry an explicit null here; treat it like "no alternates".
        alternate_forms = data.get("alternate_forms") or {}

        # Seeded RNG per verb — deterministic pronoun/gender choices.
        # Seed with the string itself: str hash() is salted per process, so
        # seeding with hash(infinitive) produced different picks on every build.
        # Those picks are part of the note GUID, so they must be reproducible.
        verb_rng = random.Random(infinitive)

        for form_key, form_data in forms.items():
            primary_form = form_data.get("form", "")
            alt_form = alternate_forms.get(form_key, "")
            conj_form = f"{primary_form} / {alt_form}" if alt_form else primary_form
            # Infinitive: shown on card front as reference — skip as a quiz form
            if form_key == "infinitive":
                continue

            # Audio tag: use downloaded file if present
            audio_tag = ""
            if include_audio and slug:
                audio_tag = _conj_audio_tag(slug, form_key)
                if audio_tag:
                    mp3_path = audio_dir / f"{slug}_{form_key}.mp3"
                    if mp3_path not in seen_media:
                        seen_media.add(mp3_path)
                        media_files.append(mp3_path)

            # Present tense expansion: 4 form keys → 1 card each (seeded RNG)
            if form_key in PRESENT_EXPANSION:
                chosen = verb_rng.choice(PRESENT_EXPANSION[form_key])
                add_note(chosen[0], chosen[1], conj_form, audio_tag)
                continue

            # Past 3rd plural: same form for m/f → 1 card (seeded RNG)
            if form_key == "past_3p":
                chosen = verb_rng.choice(PAST_3P_EXPANSION)
                add_note(chosen[0], chosen[1], conj_form, audio_tag)
                continue

            pronoun = form_data.get("pronoun", "")
            tense = form_data.get("tense", "")

            # 2fp/3fp future and imperative: show modern (mp) form + classical (fp) in parens
            if form_key in FP_MODERN_FALLBACK:
                mp_key = FP_MODERN_FALLBACK[form_key]
                mp_form = forms.get(mp_key, {}).get("form", "")
                fp_form = conj_form
                display_form = f"{mp_form} ({fp_form})" if mp_form and mp_form != fp_form else fp_form
                add_note(pronoun, tense, display_form, audio_tag)
                continue

            # Standard card.
            # 1st-person forms get a randomly assigned gender label (deterministic per verb)
            if form_key in {"past_1s", "past_1p", "future_1s", "future_1p"}:
                gender = verb_rng.choice(["זָכָר", "נְקֵבָה"])
                pronoun = f"{pronoun} ({gender})"

            add_note(pronoun, tense, conj_form, audio_tag)

    logger.info(f"Conjugation deck: {note_count} notes across {sum(1 for v in conjugations.values() if v)} verbs")
    return deck, media_files
|
||
|
||
|
||
# ──────────────────────────────────────────────────────────────────────────────
|
||
# Confusables deck — words that look identical without nikkud
|
||
# ──────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Front of a confusables card: the look-alike words plus the prompt line.
CONF_FRONT = """
<div class="hebrew" style="font-size:36px;">{{Words}}</div>
<div class="meaning" style="font-size:32px;">מה ההבדל?</div>
"""

# Back of a confusables card: front side, then definitions and (if the Audio
# field is non-empty) the audio.
CONF_BACK = """
{{FrontSide}}<hr>
<div class="definitions">{{Definitions}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
"""

# Confusables cards reuse the shared card stylesheet.
CONF_CSS = CARD_CSS

# Note type for the confusables deck: four fields, a single card template.
CONF_MODEL = genanki.Model(
    CONF_MODEL_ID,
    "Hebrew Confusables",
    fields=[
        {"name": field_name}
        for field_name in ("Words", "Definitions", "Audio", "WordNoNikkud")
    ],
    templates=[
        dict(name="Confusable", qfmt=CONF_FRONT, afmt=CONF_BACK),
    ],
    css=CONF_CSS,
)
|
||
|
||
|
||
def build_confusables_deck(
    dict_csv: Path,
    include_audio: bool = True,
) -> tuple[genanki.Deck, list[Path]]:
    """Build confusables deck from vocab CSV — groups words identical without nikkud.

    Rows are grouped by their ``Word Without Nikkud`` value; every group with
    at least two distinct ``(word, meaning)`` pairs becomes one note listing
    all the words on the front and their definitions on the back.

    Args:
        dict_csv: Vocab CSV. Read as ``;``-separated first; if that yields
            fewer than three columns or fails to parse, re-read with the
            default separator.
        include_audio: When true, attach audio to a note only if *every* word
            in its group has an audio tag.

    Returns:
        ``(deck, media_files)`` — the deck and the de-duplicated audio paths
        referenced by notes that actually carry audio.
    """
    logger.info("Building confusables deck …")
    try:
        df = pd.read_csv(dict_csv, sep=";", index_col=0)
        if df.shape[1] < 3:
            raise ValueError("too few columns")
    except (ValueError, pd.errors.ParserError):
        df = pd.read_csv(dict_csv, index_col=0)

    deck = genanki.Deck(CONF_DECK_ID, "Hebrew Confusables")
    media_files: list[Path] = []
    seen_media: set[Path] = set()
    note_count = 0

    # Group by Word Without Nikkud
    groups: dict[str, list[tuple[str, str, str]]] = {}
    for _, row in df.iterrows():
        word = str(row.get("Word", "")).strip()
        meaning = str(row.get("Meaning", "")).strip()
        word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
        pos_raw = str(row.get("Part of speech", row.get("Part of Speech", ""))).strip()
        if not word or not meaning or meaning in ("nan", "None"):
            continue
        if not word_no_nik or word_no_nik in ("nan", "None"):
            continue
        pos_heb = _translate_pos(pos_raw) if pos_raw and pos_raw not in ("nan", "None") else ""
        groups.setdefault(word_no_nik, []).append((word, meaning, pos_heb))

    for word_no_nik, entries in sorted(groups.items()):
        if len(entries) < 2:
            continue

        # Deduplicate: skip entries with identical word+meaning (first one wins)
        unique: dict[tuple[str, str], tuple[str, str, str]] = {}
        for entry in entries:
            unique.setdefault((entry[0], entry[1]), entry)
        unique_entries = list(unique.values())
        if len(unique_entries) < 2:
            continue

        # Build card content
        words_display = " / ".join(w for w, _, _ in unique_entries)
        defs_parts = []
        for w, m, p in unique_entries:
            pos_label = f" ({p})" if p else ""
            defs_parts.append(
                f'<div class="conf-entry"><span class="hebrew" style="font-size:24px;">{w}</span>'
                f" = {m}{pos_label}</div>"
            )

        # Only include audio if every word in the group has it.
        # A tag that repeats one already collected still counts as "has audio";
        # only an empty tag marks the group as incomplete.
        audio_parts: list[str] = []
        all_have_audio = True
        if include_audio:
            for w, _, _ in unique_entries:
                at = _audio_tag(_strip_nikkud(w))
                if not at:
                    all_have_audio = False
                elif at not in audio_parts:
                    audio_parts.append(at)
        if not all_have_audio:
            audio_parts = []

        # Register media only for audio that ends up on the note.
        for at in audio_parts:
            mp3_path = AUDIO_DIR / at.removeprefix("[sound:").removesuffix("]")
            if mp3_path not in seen_media:
                seen_media.add(mp3_path)
                media_files.append(mp3_path)

        defs_html = "\n".join(defs_parts)
        audio_html = " ".join(audio_parts)

        note = genanki.Note(
            model=CONF_MODEL,
            guid=genanki.guid_for("confusable", word_no_nik),
            fields=[words_display, defs_html, audio_html, word_no_nik],
            tags=[RELEASE_TAG],
        )
        deck.add_note(note)
        note_count += 1

    logger.info(f"Confusables deck: {note_count} notes")
    return deck, media_files
|
||
|
||
|
||
def write_conf_apkg(
    deck: genanki.Deck,
    media_files: list[Path] | None = None,
    out_path: Path = CONF_APKG,
) -> None:
    """Write the confusables deck to ``out_path`` as an .apkg.

    Creates the output directory if needed. Media paths that do not exist on
    disk are dropped; the Heebo font files are appended after the deck media.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    existing_media = [str(path) for path in (media_files or []) if path.exists()]
    package = genanki.Package(deck)
    package.media_files = [*existing_media, *_font_media_files()]
    package.write_to_file(str(out_path))
    logger.info(f"Confusables deck written → {out_path}")
|
||
|
||
|
||
# ──────────────────────────────────────────────────────────────────────────────
|
||
# Noun plurals deck — singular↔plural drilling
|
||
# ──────────────────────────────────────────────────────────────────────────────
|
||
|
||
# Card 1 front (singular shown, plural asked): singular, optional audio,
# meaning, and the direction hint.
PLURAL_FRONT_SG = """
<div class="hebrew">{{Singular}}</div>
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
<div class="meaning">{{Meaning}}</div>
<div class="hint" style="font-size:28px;">יָחִיד ← רַבִּים</div>
"""

# Card 1 back: front side, then the plural, optional plural audio and mishkal badge.
PLURAL_BACK_SG = """
{{FrontSide}}<hr>
<div class="hebrew">{{Plural}}</div>
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
{{#Mishkal}}<div class="freq-badge">{{Mishkal}}</div>{{/Mishkal}}
"""

# Card 2 front (plural shown, singular asked): plural, optional audio, direction hint.
PLURAL_FRONT_PL = """
<div class="hebrew">{{Plural}}</div>
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
<div class="hint" style="font-size:28px;">רַבִּים ← יָחִיד</div>
"""

# Card 2 back: front side, then singular, optional audio, meaning and mishkal badge.
PLURAL_BACK_PL = """
{{FrontSide}}<hr>
<div class="hebrew">{{Singular}}</div>
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
<div class="meaning">{{Meaning}}</div>
{{#Mishkal}}<div class="freq-badge">{{Mishkal}}</div>{{/Mishkal}}
"""

# Plural cards reuse the shared card stylesheet.
PLURAL_CSS = CARD_CSS

# Note type for the plurals deck: eight fields, two card templates (one per direction).
PLURAL_MODEL = genanki.Model(
    PLURAL_MODEL_ID,
    "Hebrew Plurals",
    fields=[
        {"name": field_name}
        for field_name in (
            "Singular",
            "SingularAudio",
            "Plural",
            "PluralAudio",
            "Meaning",
            "Root",
            "Mishkal",
            "Gender",
        )
    ],
    templates=[
        dict(name="Singular → Plural", qfmt=PLURAL_FRONT_SG, afmt=PLURAL_BACK_SG),
        dict(name="Plural → Singular", qfmt=PLURAL_FRONT_PL, afmt=PLURAL_BACK_PL),
    ],
    css=PLURAL_CSS,
)
|
||
|
||
|
||
def _is_irregular_plural(gender: str, plural: str) -> bool:
    """Return True when the plural suffix does not match the noun's gender.

    A masculine noun whose nikkud-stripped plural ends in ות, or a feminine
    noun whose nikkud-stripped plural ends in ים, counts as irregular. Any
    other gender value yields False.
    """
    bare_plural = _strip_nikkud(plural)
    if gender == "masculine":
        return bare_plural.endswith("ות")
    if gender == "feminine":
        return bare_plural.endswith("ים")
    return False
|
||
|
||
|
||
def build_plural_deck(
    noun_plurals_path: Path = DATA_DIR / "noun_plurals.json",
    dict_csv: Path | None = None,
    include_audio: bool = False,
) -> tuple[genanki.Deck, list[Path]]:
    """Build noun plurals deck.

    Selection: ALL irregular plurals, plus up to ``per_mishkal`` (6) regular
    exemplars per mishkal pattern, preferring entries that sort earliest by
    the vocab CSV index. Regular nouns without a mishkal are not selected.

    Args:
        noun_plurals_path: JSON file mapping a word key to a record with
            ``singular``, ``plural``, ``gender`` and ``mishkal``.
        dict_csv: Optional vocab CSV supplying ordering, meanings and roots.
            Read as ``;``-separated first, falling back to the default
            separator if that yields fewer than three columns or fails.
        include_audio: When true, attach singular audio where
            ``_audio_tag`` returns a tag.

    Returns:
        ``(deck, media_files)`` — the deck and the de-duplicated audio paths
        referenced by its notes.
    """
    logger.info("Building plurals deck …")

    # Explicit encoding: the file holds Hebrew text and the platform default
    # encoding is not guaranteed to be UTF-8.
    with open(noun_plurals_path, encoding="utf-8") as f:
        all_nouns: dict[str, dict] = json.load(f)

    # Load frequency order, meanings and roots from the vocab CSV in one pass
    freq_order: dict[str, int] = {}
    meanings: dict[str, str] = {}
    roots: dict[str, str] = {}
    if dict_csv and dict_csv.exists():
        try:
            vdf = pd.read_csv(dict_csv, sep=";", index_col=0)
            if vdf.shape[1] < 3:
                raise ValueError("too few columns")
        except (ValueError, pd.errors.ParserError):
            vdf = pd.read_csv(dict_csv, index_col=0)
        for idx, row in vdf.iterrows():
            word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
            if word_no_nik and word_no_nik not in ("nan", "None"):
                freq_order[word_no_nik] = idx  # lower index = higher frequency
            word = str(row.get("Word", "")).strip()
            if word:
                csv_meaning = str(row.get("Meaning", "")).strip()
                # Keep only real meanings so no lookup can ever surface the
                # literal "nan"/"None" that str() produces for empty cells.
                if csv_meaning and csv_meaning not in ("nan", "None"):
                    meanings[word] = csv_meaning
                roots[word] = str(row.get("Root", "")).strip()

    deck = genanki.Deck(PLURAL_DECK_ID, "Hebrew Plurals")
    media_files: list[Path] = []
    seen_media: set[Path] = set()

    # Separate irregular plurals from regular (by mishkal)
    irregulars: list[tuple[str, dict]] = []
    by_mishkal: dict[str, list[tuple[str, dict]]] = {}

    for word_key, data in all_nouns.items():
        singular = data.get("singular", "")
        plural = data.get("plural", "")
        gender = data.get("gender", "")
        mishkal = data.get("mishkal", "")
        if not singular or not plural:
            continue

        if _is_irregular_plural(gender, plural):
            irregulars.append((word_key, data))
        elif mishkal:
            by_mishkal.setdefault(mishkal, []).append((word_key, data))

    # Select exemplars per mishkal, preferring high-frequency words.
    # Target ≥2:1 regular:irregular ratio — 6 per mishkal compensates for
    # small groups (<6 entries) that can't fill their quota.
    per_mishkal = 6

    selected: list[tuple[str, dict]] = list(irregulars)
    for _mishkal, entries in sorted(by_mishkal.items()):
        # Sort by frequency (lower index = more common)
        entries.sort(key=lambda e: freq_order.get(e[0], 999999))
        selected.extend(entries[:per_mishkal])

    note_count = 0
    for _word_key, data in selected:
        singular = data["singular"]
        plural = data["plural"]
        gender = data.get("gender", "")
        mishkal = data.get("mishkal", "")

        # Exact (nikkud) match first, then try without nikkud
        meaning = meanings.get(singular, "") or meanings.get(_strip_nikkud(singular), "")
        root = roots.get(singular, "")
        if not root or root in ("nan", "None", "-"):
            root = ""

        # Audio tags. Only the singular is looked up; the PluralAudio field is
        # left empty because this function has no plural audio source.
        sg_audio = ""
        pl_audio = ""
        if include_audio:
            # Use local audio files if available
            sg_tag = _audio_tag(_strip_nikkud(singular))
            if sg_tag:
                sg_audio = sg_tag
                mp3_path = AUDIO_DIR / sg_tag.removeprefix("[sound:").removesuffix("]")
                if mp3_path not in seen_media:
                    seen_media.add(mp3_path)
                    media_files.append(mp3_path)

        tags = [RELEASE_TAG]
        if mishkal:
            tags.append(f"mishkal::{mishkal}")
        if _is_irregular_plural(gender, plural):
            tags.append("irregular")

        note = genanki.Note(
            model=PLURAL_MODEL,
            guid=genanki.guid_for("plural", singular),
            fields=[
                singular,
                sg_audio,
                plural,
                pl_audio,
                meaning,
                root,
                mishkal,
                gender,
            ],
            tags=tags,
        )
        deck.add_note(note)
        note_count += 1

    irregular_count = len(irregulars)
    regular_count = note_count - irregular_count
    logger.info(
        f"Plurals deck: {note_count} notes "
        f"({irregular_count} irregular + {regular_count} regular exemplars "
        f"from {len(by_mishkal)} mishkal patterns)"
    )
    return deck, media_files
|
||
|
||
|
||
def write_plural_apkg(
    deck: genanki.Deck,
    media_files: list[Path] | None = None,
    out_path: Path = PLURAL_APKG,
) -> None:
    """Write the plurals deck to ``out_path`` as an .apkg.

    Creates the output directory if needed. Media paths that do not exist on
    disk are dropped; the Heebo font files are appended after the deck media.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    existing_media = [str(path) for path in (media_files or []) if path.exists()]
    package = genanki.Package(deck)
    package.media_files = [*existing_media, *_font_media_files()]
    package.write_to_file(str(out_path))
    logger.info(f"Plurals deck written → {out_path}")
|
||
|
||
|
||
def _font_media_files() -> list[str]:
    """Collect the ``_Heebo*.ttf`` files found in ``FONTS_DIR`` as string paths.

    Used to bundle the fonts into every .apkg; only paths that exist are returned.
    """
    bundled: list[str] = []
    for font_path in list(FONTS_DIR.glob("_Heebo*.ttf")):
        if font_path.exists():
            bundled.append(str(font_path))
    return bundled
|
||
|
||
|
||
class _RandomOrderPackage(genanki.Package):
    """Package variant whose deck configs use new-card order 0 (random) rather than 1 (insertion order)."""

    def write_to_db(self, cursor, timestamp, id_gen):
        """Write the package normally, then patch ``new.order`` in the stored deck configs."""
        super().write_to_db(cursor, timestamp, id_gen)

        stored = cursor.execute("SELECT dconf FROM col").fetchone()
        if not stored:
            return

        deck_configs = json.loads(stored[0])
        for config in deck_configs.values():
            # Only touch entries that actually carry a "new" section.
            if not isinstance(config, dict) or "new" not in config:
                continue
            config["new"]["order"] = 0
        cursor.execute("UPDATE col SET dconf = ?", [json.dumps(deck_configs)])
|
||
|
||
|
||
def write_vocab_apkg(
    deck: genanki.Deck,
    media_files: list[Path],
    out_path: Path = VOCAB_APKG,
) -> None:
    """Write the vocabulary deck to ``out_path`` as an .apkg.

    Creates the output directory if needed. Media paths that do not exist on
    disk are dropped; the Heebo font files are appended after the deck media.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    existing_media = [str(path) for path in media_files if path.exists()]
    # Plain Package: insertion order = frequency rank (new.order=1 default)
    package = genanki.Package(deck)
    package.media_files = [*existing_media, *_font_media_files()]
    package.write_to_file(str(out_path))
    logger.info(f"Vocabulary deck written → {out_path}")
|
||
|
||
|
||
def write_conj_apkg(
    deck: genanki.Deck,
    media_files: list[Path] | None = None,
    out_path: Path = CONJ_APKG,
) -> None:
    """Write the conjugation deck to ``out_path`` as an .apkg via ``_RandomOrderPackage``.

    Creates the output directory if needed. Media paths that do not exist on
    disk are dropped; the Heebo font files are appended after the deck media.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    existing_media = [str(path) for path in (media_files or []) if path.exists()]
    package = _RandomOrderPackage(deck)
    package.media_files = [*existing_media, *_font_media_files()]
    package.write_to_file(str(out_path))
    logger.info(f"Conjugation deck written → {out_path}")
|
||
|
||
|
||
def build_complete_deck(
    dict_csv: Path,
    conjugations: dict,
    examples_cache: dict | None = None,
    freq_cache: dict | None = None,
    image_cache: dict | None = None,
    emoji_lookup: dict | None = None,
    limit: int | None = None,
    include_audio: bool = False,
) -> tuple[list[genanki.Deck], list[Path]]:
    """Build all subdecks under 'Hebrew::*' for the combined .apkg.

    Each standalone builder is run, and its notes are re-homed into a new deck
    that has a ``Hebrew::`` name and a dedicated ``COMPLETE_*`` deck ID. The
    plurals subdeck is included only when ``noun_plurals.json`` exists in
    ``DATA_DIR``.

    Returns (list_of_decks, deduplicated_media_files).
    """

    def _as_subdeck(deck_id: int, name: str, source: genanki.Deck) -> genanki.Deck:
        """Create a deck with the given ID/name holding the same notes as ``source``."""
        subdeck = genanki.Deck(deck_id, name)
        for note in source.notes:
            subdeck.add_note(note)
        return subdeck

    logger.info(f" Building complete deck (audio={'yes' if include_audio else 'no'}) …")

    # Build standalone decks using existing functions
    vocab_deck, vocab_media = build_vocab_deck(
        dict_csv,
        examples_cache=examples_cache,
        freq_cache=freq_cache,
        image_cache=image_cache or {},
        emoji_lookup=emoji_lookup,
        limit=limit,
        include_audio=include_audio,
        include_images=True,
    )
    conj_deck, conj_media = build_conj_deck(
        conjugations,
        include_audio=include_audio,
        dict_csv=dict_csv,
    )
    conf_deck, conf_media = build_confusables_deck(
        dict_csv,
        include_audio=include_audio,
    )

    # Re-home the notes under subdeck names with IDs distinct from the standalone decks
    complete_vocab = _as_subdeck(COMPLETE_VOCAB_DECK_ID, "Hebrew::Vocabulary", vocab_deck)
    complete_conj = _as_subdeck(COMPLETE_CONJ_DECK_ID, "Hebrew::Conjugations", conj_deck)
    complete_conf = _as_subdeck(COMPLETE_CONF_DECK_ID, "Hebrew::Confusables", conf_deck)

    all_source_media = vocab_media + conj_media + conf_media

    # Plurals subdeck (only if data exists)
    complete_plural = None
    plural_data_path = DATA_DIR / "noun_plurals.json"
    if plural_data_path.exists():
        plural_deck, plural_media = build_plural_deck(
            noun_plurals_path=plural_data_path,
            dict_csv=dict_csv,
            include_audio=include_audio,
        )
        complete_plural = _as_subdeck(COMPLETE_PLURAL_DECK_ID, "Hebrew::Plurals", plural_deck)
        all_source_media += plural_media

    # Deduplicate media files by resolved path (first occurrence wins)
    seen_paths: set[str] = set()
    all_media: list[Path] = []
    for media_path in all_source_media:
        if media_path.exists():
            identity = str(media_path.resolve())
        else:
            identity = str(media_path)
        if identity in seen_paths:
            continue
        seen_paths.add(identity)
        all_media.append(media_path)

    decks = [complete_vocab, complete_conj, complete_conf]
    if complete_plural:
        decks.append(complete_plural)

    plural_info = f" + {len(complete_plural.notes)} plural" if complete_plural else ""
    logger.info(
        f" Complete deck: {len(complete_vocab.notes)} vocab + "
        f"{len(complete_conj.notes)} conj + {len(complete_conf.notes)} conf{plural_info} notes, "
        f"{len(all_media)} media files"
    )
    return decks, all_media
|
||
|
||
|
||
def write_complete_apkg(
    decks: list[genanki.Deck],
    media_files: list[Path],
    out_path: Path = COMPLETE_APKG,
) -> None:
    """Write a combined .apkg with multiple subdecks.

    Creates the output directory if needed. Media paths that do not exist on
    disk are dropped; the Heebo font files are appended after the deck media.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    existing_media = [str(path) for path in media_files if path.exists()]
    package = genanki.Package(decks)
    package.media_files = [*existing_media, *_font_media_files()]
    package.write_to_file(str(out_path))
    logger.info(f"Complete deck written → {out_path}")
|
||
|
||
|
||
def build_all_variants(
    dict_csv: Path,
    conjugations: dict,
    examples_cache: dict | None = None,
    freq_cache: dict | None = None,
    image_cache: dict | None = None,
    limit: int | None = None,
) -> None:
    """Build and write every release package variant.

    Packages are produced in this order:

    * four vocabulary packages (each audio/images combination),
    * conjugation packages without and with audio,
    * confusables packages without and with audio,
    * noun-plural packages without and with audio (skipped when
      ``noun_plurals.json`` is absent from ``DATA_DIR``),
    * combined "Hebrew::*" packages without and with audio.

    Args:
        dict_csv: Dictionary CSV handed to every deck builder.
        conjugations: Conjugation data for the conjugation and combined
            decks.
        examples_cache: Optional cache forwarded to the vocabulary and
            combined builders.
        freq_cache: Optional cache forwarded to the vocabulary and
            combined builders.
        image_cache: Optional image cache. The vocabulary builder gets an
            empty dict when this is falsy; the combined builder receives
            the value exactly as given.
        limit: Optional limit forwarded unchanged to the vocabulary and
            combined builders.
    """

    def _yes_no(flag: bool) -> str:
        # Render a boolean switch the way the log labels expect it.
        return "yes" if flag else "no"

    logger.info("Building all release variants …")

    emoji_lookup = _load_emoji_lookup()
    logger.info(f" Emoji lookup: {len(emoji_lookup)} keywords loaded")

    # Vocabulary packages: (with_audio, with_images, destination).
    for with_audio, with_images, target in (
        (False, False, VOCAB_APKG),
        (True, False, VOCAB_APKG_AUDIO),
        (False, True, VOCAB_APKG_IMAGES),
        (True, True, VOCAB_APKG_AUDIO_IMAGES),
    ):
        label = f"audio={_yes_no(with_audio)} images={_yes_no(with_images)}"
        logger.info(f" Vocab variant: {label} → {target.name}")
        vocab_deck, vocab_media = build_vocab_deck(
            dict_csv,
            examples_cache=examples_cache,
            freq_cache=freq_cache,
            image_cache=image_cache or {},
            emoji_lookup=emoji_lookup,
            limit=limit,
            include_audio=with_audio,
            include_images=with_images,
        )
        write_vocab_apkg(vocab_deck, vocab_media, out_path=target)

    # Conjugation packages: (with_audio, destination).
    for with_audio, target in (
        (False, CONJ_APKG),
        (True, CONJ_APKG_AUDIO),
    ):
        label = f"audio={_yes_no(with_audio)}"
        logger.info(f" Conj variant: {label} → {target.name}")
        conj_deck, conj_media = build_conj_deck(
            conjugations,
            include_audio=with_audio,
            dict_csv=dict_csv,
        )
        write_conj_apkg(conj_deck, conj_media, out_path=target)

    # Confusables packages: (with_audio, destination).
    for with_audio, target in (
        (False, CONF_APKG),
        (True, CONF_APKG_AUDIO),
    ):
        label = f"audio={_yes_no(with_audio)}"
        logger.info(f" Conf variant: {label} → {target.name}")
        conf_deck, conf_media = build_confusables_deck(dict_csv, include_audio=with_audio)
        write_conf_apkg(conf_deck, conf_media, out_path=target)

    # Noun-plural packages are optional: they need the plurals data file.
    plural_data_path = DATA_DIR / "noun_plurals.json"
    if not plural_data_path.exists():
        logger.info(" Skipping plural deck (data/noun_plurals.json not found)")
    else:
        for with_audio, target in (
            (False, PLURAL_APKG),
            (True, PLURAL_APKG_AUDIO),
        ):
            label = f"audio={_yes_no(with_audio)}"
            logger.info(f" Plural variant: {label} → {target.name}")
            plural_deck, plural_media = build_plural_deck(
                noun_plurals_path=plural_data_path,
                dict_csv=dict_csv,
                include_audio=with_audio,
            )
            write_plural_apkg(plural_deck, plural_media, out_path=target)

    # Combined "Hebrew::*" packages bundling all subdecks together.
    for with_audio, target in (
        (False, COMPLETE_APKG),
        (True, COMPLETE_APKG_AUDIO),
    ):
        subdecks, combined_media = build_complete_deck(
            dict_csv,
            conjugations=conjugations,
            examples_cache=examples_cache,
            freq_cache=freq_cache,
            image_cache=image_cache,
            emoji_lookup=emoji_lookup,
            limit=limit,
            include_audio=with_audio,
        )
        write_complete_apkg(subdecks, combined_media, out_path=target)

    logger.info("All variants built.")
|
||
|
||
|
||
if __name__ == "__main__":
|
||
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
|
||
|
||
csv_path = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||
if not csv_path.exists():
|
||
csv_path = DATA_DIR / "hebrew_dict.csv"
|
||
if not csv_path.exists():
|
||
csv_path = DATA_DIR / "pealim_dict_for_anki.csv"
|
||
if not csv_path.exists():
|
||
csv_path = DATA_DIR / "pealim_dict.csv"
|
||
|
||
deck, media = build_vocab_deck(csv_path, limit=20)
|
||
write_vocab_apkg(deck, media)
|
||
|
||
conj_path = DATA_DIR / "conjugations.json"
|
||
if conj_path.exists():
|
||
with open(conj_path) as f:
|
||
conjugations = json.load(f)
|
||
csv_path = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||
conj_deck, conj_media = build_conj_deck(conjugations, dict_csv=csv_path)
|
||
write_conj_apkg(conj_deck, conj_media)
|