#!/usr/bin/env python3 """ Build Anki .apkg files for both the vocabulary deck and the conjugation deck. Uses genanki for reliable, stable deck generation. Deck IDs are hardcoded integers — same ID on re-import updates the existing deck in Anki rather than creating a duplicate. """ import json import logging import random import re from pathlib import Path import genanki logger = logging.getLogger(__name__) # Stable deck/model IDs — do not change these VOCAB_DECK_ID = 1_234_567_890 VOCAB_MODEL_ID = 1_701_222_017_968 # matches Nevo's original Anki model CONJ_DECK_ID = 1_234_567_892 CONJ_MODEL_ID = 1_234_567_893 CONF_DECK_ID = 1_234_567_894 CONF_MODEL_ID = 1_234_567_895 PLURAL_DECK_ID = 1_234_567_896 PLURAL_MODEL_ID = 1_234_567_897 # Subdeck IDs for combined "Hebrew::*" package — MUST differ from standalone IDs COMPLETE_VOCAB_DECK_ID = 1_234_567_900 COMPLETE_CONJ_DECK_ID = 1_234_567_901 COMPLETE_CONF_DECK_ID = 1_234_567_902 COMPLETE_PLURAL_DECK_ID = 1_234_567_903 # Release version tag added to all notes so users can identify which release # their cards come from (visible in Anki's Browse view and card info). RELEASE_TAG = "v0.20" # Regex for extracting emoji and Hebrew prepositions from meaning strings EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+") HBPAREN_RE = re.compile(r"\(([\u05b0-\u05ea\u05f0-\u05f4]+)\)") DATA_DIR = Path(__file__).parent / "data" AUDIO_DIR = DATA_DIR / "audio" AUDIO_CONJ_DIR = DATA_DIR / "audio_conj" OUTPUT_DIR = Path(__file__).parent / "output" VOCAB_APKG = OUTPUT_DIR / "hebrew_vocabulary.apkg" VOCAB_APKG_AUDIO = OUTPUT_DIR / "hebrew_vocabulary_audio.apkg" VOCAB_APKG_IMAGES = OUTPUT_DIR / "hebrew_vocabulary_images.apkg" VOCAB_APKG_AUDIO_IMAGES = OUTPUT_DIR / "hebrew_vocabulary_audio_images.apkg" CONJ_APKG = OUTPUT_DIR / "hebrew_conjugations.apkg" CONJ_APKG_AUDIO = OUTPUT_DIR / "hebrew_conjugations_audio.apkg" CONF_APKG = OUTPUT_DIR / "hebrew_confusables.apkg" CONF_APKG_AUDIO = OUTPUT_DIR / "hebrew_confusables_audio.apkg" PLURAL_APKG = OUTPUT_DIR / "hebrew_plurals.apkg" PLURAL_APKG_AUDIO = OUTPUT_DIR / "hebrew_plurals_audio.apkg" COMPLETE_APKG = OUTPUT_DIR / "hebrew_complete.apkg" COMPLETE_APKG_AUDIO = OUTPUT_DIR / "hebrew_complete_audio.apkg" # ────────────────────────────────────────────────────────────────────────────── # Binyan → Hebrew label mapping (for conjugation card display) # ────────────────────────────────────────────────────────────────────────────── BINYAN_TO_HEBREW: dict[str, str] = { "Pa'al": "פָּעַל", "Nif'al": "נִפְעַל", "Pi'el": "פִּעֵל", "Pu'al": "פֻּעַל", "Hitpa'el": "הִתְפַּעֵל", "Hif'il": "הִפְעִיל", "Huf'al": "הֻפְעַל", } # ────────────────────────────────────────────────────────────────────────────── # PoS → Hebrew label mapping # ────────────────────────────────────────────────────────────────────────────── POS_TO_HEBREW = { "Noun": "שם עצם", "Verb": "פועל", "Adjective": "שם תואר", "Adverb": "תואר הפועל", "Preposition": "מילת יחס", "Conjunction": "מילת חיבור", "Pronoun": "כינוי גוף", "Particle": "מילית", } # PoS category groupings for related-words display POS_CATEGORY_LABELS = { "Verb": "פעלים", "Noun": "שמות עצם", "Adjective": "שמות תואר", "Adverb": "תוארי הפועל", } # ────────────────────────────────────────────────────────────────────────────── # Shared CSS # ────────────────────────────────────────────────────────────────────────────── FONTS_DIR = DATA_DIR / "fonts" CARD_CSS = """ @font-face { font-family: 'Heebo'; src: url('_Heebo-Regular.ttf'); font-weight: normal; } @font-face { font-family: 'Heebo'; src: url('_Heebo-Bold.ttf'); font-weight: bold; } .card { font-family: 'Heebo', 'Arial Hebrew', 'David', Arial, sans-serif; font-size: 20px; text-align: right; color: #222; background: #fff; padding: 16px; max-width: 600px; margin: 0 auto; } .hebrew { font-size: 42px; font-weight: bold; direction: rtl; text-align: center; line-height: 1.5; color: #222; } .hebrew-sm { font-size: 30px; font-weight: normal; direction: rtl; text-align: center; color: #222; } .meaning { font-size: 34px; color: #1a1a8c; margin: 8px 0; text-align: center; } .hint { font-size: 22px; color: #555; margin: 4px 0; direction: rtl; text-align: center; } .example { font-size: 24px; color: #222; direction: rtl; text-align: right; font-style: italic; margin: 10px auto 0; max-width: 90%; border-right: 3px solid #aaa; padding-right: 8px; } .divider { border-top: 1px solid #ddd; margin: 10px 0; } .freq-badge { display: inline-block; font-size: 11px; color: #aaa; background: transparent; border: 1px solid #eee; border-radius: 10px; padding: 2px 8px; margin-top: 4px; } .voice-label { font-size: 0.6em; font-weight: normal; color: #555; } .sec-table { display: table; margin: 6px auto 0; direction: rtl; border-collapse: collapse; } .sec-label { display: table-row; font-size: 28px; font-weight: normal; color: #222; direction: rtl; } .sec-key { display: table-cell; font-size: 28px; color: #222; font-weight: bold; text-align: right; padding: 2px 0 2px 8px; white-space: nowrap; } .sec-val { display: table-cell; font-size: 28px; color: #222; text-align: right; padding: 2px 0; } .definitions { direction: rtl; text-align: center; } .more-toggle { text-align: center; direction: rtl; margin-top: 8px; } .more-header { display: inline-block; font-size: 18px; color: #555; cursor: pointer; list-style: none; border: 1px solid #ccc; border-radius: 16px; padding: 4px 16px; margin: 4px 0; background: #f8f8f8; } .more-header::-webkit-details-marker { display: none; } .more-header::before { content: "○ "; font-size: 14px; } details[open] > .more-header::before { content: "● "; } .related-header { font-size: 22px; color: #555; text-align: center; margin: 4px 0; } .rw-word { display: table-cell; font-size: 28px; color: #222; font-weight: normal; text-align: right; padding: 2px 0 2px 8px; white-space: nowrap; } .rw-meaning { display: table-cell; font-size: 24px; color: #555; text-align: left; direction: ltr; padding: 2px 0; } .conf-entry { margin: 8px 0; font-size: 28px; direction: rtl; } .emoji-img { font-size: 3.5em; text-align: center; margin: 0.3em 0; } .plural-direction { font-size: 32px; color: #444; text-align: center; direction: rtl; margin: 8px 0; font-weight: bold; } .card [type="button"], .card button, .replay-button { display: block !important; margin: 4px auto !important; text-align: center; } @media (prefers-color-scheme: dark) { .card { color: #e8e8e8; background: #1c1c1e; } .hebrew { color: #f0f0f0; } .hebrew-sm { color: #e0e0e0; } .meaning { color: #82b0ff; } .sec-label { color: #e0e0e0; } .sec-key { color: #e0e0e0; } .sec-val { color: #e0e0e0; } .conf-entry { color: #ddd; } .hint { color: #777; } .voice-label { color: #888; } .example { color: #e0e0e0; border-right-color: #555; } .divider { border-top-color: #333; } .freq-badge { color: #888; border-color: #444; } .more-header { color: #bbb; background: #2a2a2e; border-color: #555; } .related-header { color: #999; } .rw-word { color: #e0e0e0; } .rw-meaning { color: #999; } .plural-direction { color: #aaa; } } .nightMode .card { color: #e8e8e8; background: #1c1c1e; } .nightMode .hebrew { color: #f0f0f0; } .nightMode .hebrew-sm { color: #e0e0e0; } .nightMode .meaning { color: #82b0ff; } .nightMode .sec-label { color: #e0e0e0; } .nightMode .sec-key { color: #e0e0e0; } .nightMode .sec-val { color: #e0e0e0; } .nightMode .conf-entry { color: #ddd; } .nightMode .hint { color: #777; } .nightMode .voice-label { color: #888; } .nightMode .example { color: #e0e0e0; border-right-color: #555; } .nightMode .divider { border-top-color: #333; } .nightMode .freq-badge { color: #888; border-color: #444; } .nightMode .more-header { color: #bbb; background: #2a2a2e; border-color: #555; } .nightMode .related-header { color: #999; } .nightMode .rw-word { color: #e0e0e0; } .nightMode .rw-meaning { color: #999; } .nightMode .plural-direction { color: #aaa; } """ # ────────────────────────────────────────────────────────────────────────────── # Vocabulary Deck # ────────────────────────────────────────────────────────────────────────────── VOCAB_FRONT_HEB = """
{{Word}}{{#Prep}} {{Prep}}{{/Prep}}
{{#Audio}}
{{Audio}}
{{/Audio}} """ VOCAB_BACK_HEB = """ {{FrontSide}}
{{Meaning}}
{{#Emoji}}
{{Emoji}}
{{/Emoji}} {{^Emoji}}{{#Image}}
{{/Image}}{{/Emoji}}
מידע נוסף
{{#WordNoNikkud}}
לְלֹא נִיקּוּד:{{WordNoNikkud}}
{{/WordNoNikkud}} {{#Root}}
שֹׁרֶשׁ:{{Root}}
{{/Root}} {{#PoS}}
חֵלֶק דִּיבּוּר:{{PoS}}{{#Gender}}, {{Gender}}{{/Gender}}
{{/PoS}} {{#Plural}}
רַבִּים:{{Plural}}
{{/Plural}}
{{#SharedRoots}}
{{SharedRoots}}
{{/SharedRoots}}
""" VOCAB_FRONT_ENG = """
{{Meaning}}
{{#Hint}}
{{Hint}}
{{/Hint}} {{#Emoji}}
{{Emoji}}
{{/Emoji}} {{^Emoji}}{{#Image}}
{{/Image}}{{/Emoji}} """ VOCAB_BACK_ENG = """ {{FrontSide}}
{{Word}}{{#Prep}} {{Prep}}{{/Prep}}
{{#Audio}}
{{Audio}}
{{/Audio}}
מידע נוסף
{{#WordNoNikkud}}
לְלֹא נִיקּוּד:{{WordNoNikkud}}
{{/WordNoNikkud}} {{#Root}}
שֹׁרֶשׁ:{{Root}}
{{/Root}} {{#PoS}}
חֵלֶק דִּיבּוּר:{{PoS}}{{#Gender}}, {{Gender}}{{/Gender}}
{{/PoS}} {{#Plural}}
רַבִּים:{{Plural}}
{{/Plural}}
{{#SharedRoots}}
{{SharedRoots}}
{{/SharedRoots}}
""" VOCAB_FRONT_CLOZE = """
{{ClozeExample}}
{{#ClozeHint}}
{{ClozeHint}}
{{/ClozeHint}} """ VOCAB_BACK_CLOZE = """ {{FrontSide}}
{{Word}}
{{#Audio}}
{{Audio}}
{{/Audio}} """ VOCAB_MODEL = genanki.Model( VOCAB_MODEL_ID, "Hebrew Flash Cards", fields=[ {"name": "Word"}, {"name": "Root"}, {"name": "PoS"}, {"name": "Meaning"}, {"name": "WordNoNikkud"}, {"name": "SharedRoots"}, {"name": "Tags"}, {"name": "Audio"}, {"name": "Example"}, {"name": "Frequency"}, {"name": "Image"}, {"name": "Emoji"}, {"name": "Prep"}, {"name": "Hint"}, {"name": "Plural"}, {"name": "Gender"}, {"name": "ClozeExample"}, {"name": "ClozeHint"}, ], templates=[ { # ord 0 — matches Nevo's original "Card 2" (Eng→Heb) "name": "English → Hebrew", "qfmt": VOCAB_FRONT_ENG, "afmt": VOCAB_BACK_ENG, }, { # ord 1 — matches Nevo's original "Card 3" (Heb→Eng) "name": "Hebrew → English", "qfmt": VOCAB_FRONT_HEB, "afmt": VOCAB_BACK_HEB, }, { # ord 2 — Cloze-style sentence fill-in (only generated when ClozeExample is non-empty) "name": "Sentence Cloze", "qfmt": VOCAB_FRONT_CLOZE, "afmt": VOCAB_BACK_CLOZE, }, ], css=CARD_CSS, ) # ────────────────────────────────────────────────────────────────────────────── # Conjugation Deck # ────────────────────────────────────────────────────────────────────────────── CONJ_FRONT = """
אֵיךְ אוֹמְרִים
{{Pronoun}}
{{Infinitive}}{{#Prep}} ({{Prep}}){{/Prep}}{{#Voice}} ({{Voice}}){{/Voice}}
{{Tense}}
""" CONJ_BACK = """ {{FrontSide}}
{{ConjugatedForm}}{{#Prep}} ({{Prep}}){{/Prep}}
{{#Audio}}
{{Audio}}
{{/Audio}}
מידע נוסף {{#Meaning}}
{{Meaning}}
{{/Meaning}}
שֹׁרֶשׁ:{{Root}}
בִּנְיָן:{{Binyan}}
{{#RelatedVocab}}
{{RelatedVocab}}
{{/RelatedVocab}}
""" CONJ_CSS = CARD_CSS CONJ_MODEL = genanki.Model( CONJ_MODEL_ID, "Hebrew Conjugation", fields=[ {"name": "Infinitive"}, {"name": "ReferenceForm"}, {"name": "Pronoun"}, {"name": "Tense"}, {"name": "ConjugatedForm"}, {"name": "Root"}, {"name": "Binyan"}, {"name": "Voice"}, {"name": "Audio"}, {"name": "Meaning"}, {"name": "RelatedVocab"}, {"name": "Prep"}, ], templates=[ { "name": "Conjugation Drill", "qfmt": CONJ_FRONT, "afmt": CONJ_BACK, } ], css=CONJ_CSS, ) # Present-tense expansion: each form key → list of (pronoun, tense_label) PRESENT_EXPANSION = { "present_ms": [ ("אֲנִי (זָכָר)", "הוֹוֶה"), ("אַתָּה", "הוֹוֶה"), ("הוּא", "הוֹוֶה"), ], "present_fs": [ ("אֲנִי (נְקֵבָה)", "הוֹוֶה"), ("אַתְּ", "הוֹוֶה"), ("הִיא", "הוֹוֶה"), ], "present_mp": [ ("אֲנַחְנוּ (זָכָר)", "הוֹוֶה"), ("אַתֶּם", "הוֹוֶה"), ("הֵם", "הוֹוֶה"), ], "present_fp": [ ("אֲנַחְנוּ (נְקֵבָה)", "הוֹוֶה"), ("אַתֶּן", "הוֹוֶה"), ("הֵן", "הוֹוֶה"), ], } # Modern Hebrew: 2fp/3fp future and imperative default to mp form (classical in parens) FP_MODERN_FALLBACK = { "future_2fp": "future_2mp", "future_3fp": "future_3mp", "imperative_fp": "imperative_mp", } # 3rd person plural past: same form for m/f — generate two separate pronoun cards PAST_3P_EXPANSION = [ ("הֵם", "עָבָר"), ("הֵן", "עָבָר"), ] # Tense labels with "בְּ" prefix for display on cards TENSE_WITH_BE = { "עָבָר": "בֶּעָבָר", "הוֹוֶה": "בַּהוֹוֶה", "עָתִיד": "בֶּעָתִיד", "צִיּוּוּי": "בַּצִּוּוּי", } # Voice field: passive label only (shown inline on card front for Pu'al/Huf'al) VOICE_MAP = { "Pu'al": "סָבִיל", "Huf'al": "סָבִיל", } # Tense Hebrew label → English key prefix (for form_key construction) TENSE_KEY_MAP = { "עָבָר": "past", "הוֹוֶה": "present", "עָתִיד": "future", "צִוּוּי": "imperative", "צִיּוּוּי": "imperative", # alternate spelling } # ────────────────────────────────────────────────────────────────────────────── # Helpers # ────────────────────────────────────────────────────────────────────────────── def _load_words() -> dict[str, dict]: """Load the unified words.json data store.""" path = DATA_DIR / "words.json" with open(path, encoding="utf-8") as f: return json.load(f) def _audio_tag(word_no_nikkud: str, audio_dir: Path = AUDIO_DIR, slug: str = "") -> str: """Return [sound:xxx.mp3] if audio file exists, else empty string. Tries slug-based filename first (for confusable words), then consonant-based. """ if slug: slug_path = audio_dir / f"{slug}.mp3" if slug_path.exists(): return f"[sound:{slug_path.name}]" safe = re.sub(r"[^\u05d0-\u05ea]", "", word_no_nikkud) if not safe: return "" mp3_path = audio_dir / f"{safe}.mp3" if mp3_path.exists(): return f"[sound:{mp3_path.name}]" return "" def _conj_audio_tag(slug: str, form_key: str) -> str: """Return [sound:xxx.mp3] for conjugation audio if downloaded.""" filename = f"{slug}_{form_key}.mp3" mp3_path = AUDIO_CONJ_DIR / filename if mp3_path.exists(): return f"[sound:{filename}]" return "" # Keywords excluded when building emoji lookup AND matching meaning text. # Curated from manual review of all 2,261 emoji-word pairs (Sprint 8). _EMOJI_STOP = frozenset( { # Basic stop words "to", "be", "a", "an", "the", "of", "in", "on", "at", "for", "and", "with", "by", "or", "but", "not", "as", "its", # Generic emoji description words (too vague) "face", "hand", "sign", "symbol", "button", "small", "large", "light", "dark", "open", "closed", # Numbers → clock emoji (🕐🕑🕒 etc.) "one", "two", "three", "four", "five", "six", "seven", "eight", "nine", "ten", "hundred", "thousand", # UI/media buttons (⏭️▶️⏪⏯️🔁🔚🔜⏺️) "next", "fast", "play", "pause", "repeat", "end", "soon", "record", # Abstract words → misleading object emoji "part", "place", "mark", "post", "department", "store", "note", "control", "level", "stop", "cover", "roll", "rolling", "pick", "over", "right", "way", "skin", "drop", "middle", "piece", "section", # Country/direction words → flag emoji (🇰🇵🇬🇸🇮🇴🇻🇦🇨🇫 etc.) "north", "south", "northern", "southern", "western", "eastern", "central", "territory", "kingdom", "united", "virgin", # Common words producing bad emoji matches "new", "big", "full", "last", "first", "double", "slightly", "without", "from", "behind", "people", "position", "status", "situation", "game", "call", "trade", "male", "female", "person", "letter", # Polysemous words → wrong emoji sense "french", "fried", "board", "bow", "water", "union", "rock", "left", "back", "crane", "dash", "bar", "wheel", "horizontal", # Polysemous keywords producing wrong-sense emoji (Sprint 17 audit) "high", # ⚡ high voltage, not "tall" "down", # 🫳 palm down, not "descend" "off", # 📴 phone off, not "remove" "away", # 💨 dashing away, not "depart" "together", # 🤲 palms together, not "unite" "top", # 🎩 top hat, not "upper" "low", # 🔈 low volume, not "short" "flat", # 🥿 ballet flat, not "apartment" "soft", # 🍦 soft serve, not "quiet" "broken", # 💔 broken heart, not "damaged" "round", # 📍 round pushpin, not "circular" "cool", # 🆒 COOL button, not "cold" "free", # 🆓 FREE button, not "liberated" "long", # 🪘 long drum, not "lengthy" "straight", # 📏 straight ruler, not "direct" "empty", # 🪹 empty nest, not "void" "hot", # 🥵 hot face, not "warm" "cross", # ✝️ latin cross, not "intersect" "bright", # 🔆 bright button, not "luminous" "old", # 👴 old man, not "aged" "head", # 🙂‍↔️ shaking head, not "leader" # Category words that match generic emoji "military", # 🎖️ military medal for any military term "sports", # 🏅 sports medal for any sports term "food", # 😋 yummy face for any food term "city", # 🇻🇦 Vatican flag for any city "china", # 🇨🇳 China flag for "porcelain" "polish", # 💅 nail polish for "to polish/shine" "aid", # 🦻 hearing aid for "to help" "office", # 🧑‍💼 office worker for "bureau" "construction", # 🏛️ classical building, not construction "cinema", # 🎦 cinema emoji for any film term "ceremony", # 🎑 moon ceremony for any ceremony "building", # 🏛️ classical building for any structure # Body parts / human features → wrong emoji "arm", # 🦾 mechanical arm for "to arm" "hair", # 👱 blond person for "hair" "nose", # 😤 steam from nose "tongue", # 😛 tongue-out face "chest", # 🪎 not a chest "eyes", # 😃 face with eyes # Abstract/vague words "fear", # 😱 screaming face "anger", # 💢 anger symbol "angry", # 😠 angry face "tired", # 😫 tired face "sad", # 😥 sad face "joy", # 😂 tears of joy "love", # 💌 love letter "cold", # 🥶 cold face "pile", # 💩 pile of poo "man", # 👨 man "woman", # 👩 woman "boy", # 👦 boy "girl", # 👧 girl "baby", # 👶 baby "children", # 🚸 children crossing "student", # 🧑‍🎓 student "adult", # 🧑‍🧑‍🧒 family "name", # 📛 name badge "check", # ✅ check mark "line", # 🫥 dotted line face "floor", # 🤣 ROFL (rolling on floor) "room", # 🧖 person in steamy room "bubble", # 👁️‍🗨️ speech bubble "car", # 🚃 railway car, not automobile "bullet", # 🚅 bullet train "steam", # 😤 face with steam "fly", # 🪰 the insect, not the verb "plant", # 🪴 potted plant for all "X (plant)" entries "tree", # 🌲 evergreen for all "X (tree)" entries "ball", # ⛹️ person bouncing ball "bag", # 👝 clutch bag "fight", # 🫯 not a fight "cloud", # 🫯 not a cloud "video", # 🎮 video game, not video "rescue", # ⛑️ rescue worker helmet "exchange", # 💱 currency exchange "cut", # 🥩 cut of meat, not "to cut" "key", # 🔐 locked with key "walking", # 🚶 person walking "running", # 🏃 person running "climbing", # 🧗 person climbing "speaking", # 🗣️ speaking head "playing", # 🤽 person playing "feeding", # 👩‍🍼 person feeding "shooting", # 🌠 shooting star "clapping", # 👏 clapping hands "cooking", # 🍳 cooking emoji "holding", # 🥹 face holding back tears # More wrong-sense matches from remaining audit "paper", # 🏮 red lantern for "paper" "track", # 🛤️ railroad for "track record" "vertical", # 🚦 traffic light for "vertical" "speaker", # 🔇 muted speaker for "speaker (person)" "square", # 🟥 red square for "plaza" "wrapped", # 🎁 gift for "wrapped, bound" "volume", # 🔈 speaker for "volume (book)" "mobile", # 📱 phone for "mobile, moveable" "flash", # 📸 camera flash for "to shine" "identification", # 🪪 ID card for "locating" "service", # 🐕‍🦺 service dog for "service, term" "ground", # ⛱️ umbrella on ground "machine", # 🎰 slot machine for "mechanism" "liquid", # 🫗 pouring for "liquid, drop" "vehicle", # 🚙 SUV for any vehicle mention "window", # 🪟 window pane for "window, gap" "information", # ℹ️ info symbol "child", # 🧒 child emoji } ) def _load_emoji_lookup() -> dict[str, str]: """Load or fetch Unicode emoji keyword→character lookup. Parses unicode.org emoji-test.txt to build {keyword: emoji_char} mapping. Result is cached in data/emoji_lookup.json. Returns empty dict on network failure (safe fallback). """ cache_file = DATA_DIR / "emoji_lookup.json" if cache_file.exists(): with open(cache_file) as f: return json.load(f) import requests try: resp = requests.get( "https://unicode.org/Public/emoji/latest/emoji-test.txt", timeout=30, ) resp.raise_for_status() except Exception as e: logger.warning(f"Could not fetch emoji data: {e}. Emoji lookup disabled.") return {} lookup: dict[str, str] = {} for line in resp.text.splitlines(): if "fully-qualified" not in line: continue m = re.search(r"#\s+(\S+)\s+E[\d.]+\s+(.+)", line) if not m: continue emoji_char = m.group(1) desc = m.group(2).lower().strip() for word in desc.split(): word = word.strip(".,'\"-") if len(word) > 2 and word not in _EMOJI_STOP and word not in lookup: lookup[word] = emoji_char cache_file.write_text(json.dumps(lookup, ensure_ascii=False)) logger.info(f"Built emoji lookup: {len(lookup)} keywords → {cache_file}") return lookup def _categorize_pos(pos_str: str) -> str: """Return the canonical PoS category key for grouping.""" base = pos_str.split("–")[0].split("—")[0].strip() for cat in POS_CATEGORY_LABELS: if base == cat: return cat return "Other" def _forms_list_to_dict(forms_list: list[dict]) -> dict[str, dict]: """Convert schema's active_forms list to the keyed format the card generator expects. Keys are like ``past_1s``, ``present_ms``, ``future_2mp``, ``imperative_fs``. Each value dict has: form, form_ktiv, pronoun (Hebrew string), tense (Hebrew label), audio_url, guid, guid_candidates. """ result: dict[str, dict] = {} for f in forms_list: tense_en = TENSE_KEY_MAP.get(f["tense"], f["tense"]) key = f"{tense_en}_{f['person']}" result[key] = { "form": f["form"]["nikkud"], "form_ktiv": f["form"].get("ktiv_male", ""), "pronoun": f.get("pronoun_hebrew", ""), # Hebrew pronoun string "tense": f["tense"], # Hebrew tense label "audio_url": f.get("audio_url", ""), "guid": f.get("guid"), "guid_candidates": f.get("guid_candidates"), } return result # Hebrew prefix letters (אותיות השימוש): בהוכלמש _PREFIX_LETTERS = frozenset("בהוכלמש") def _cloze_prefix_len(cloze_token: str, word_nikkud: str) -> int: """Return the number of characters in the cloze token that are prefix (not part of the word). For "בַּתּוֹר" with word_nikkud "תּוֹר", returns 2 (ב + patach = 2 chars). Returns 0 if the token starts with the word directly. """ if not word_nikkud or not cloze_token: return 0 # If the token starts with the word nikkud, no prefix if cloze_token.startswith(word_nikkud): return 0 # Check if word nikkud appears as a suffix of the token idx = cloze_token.find(word_nikkud) if idx > 0: # Verify prefix chars are valid Hebrew prefix letters prefix_part = cloze_token[:idx] base_letters = [c for c in prefix_part if "\u05d0" <= c <= "\u05ea"] if base_letters and all(c in _PREFIX_LETTERS for c in base_letters): return idx return 0 def build_vocab_deck( words: dict[str, dict], limit: int | None = None, include_audio: bool = True, include_images: bool = True, emoji_lookup: dict | None = None, ) -> tuple[genanki.Deck, list[Path]]: """Build the vocabulary deck from the unified words dict. Args: words: Unified data dict keyed by unique_key (from words.json). limit: If set, only process the first N entries (by frequency). include_audio: Whether to include audio tags in notes. include_images: Whether to include image tags in notes. emoji_lookup: Optional Unicode emoji keyword→char mapping for fallback emoji. Returns: (deck, list_of_media_files) """ logger.info(f"Building vocabulary deck from {len(words)} words …") images_dir = DATA_DIR / "images" # Build word_unique_key → pos_category dict for related-words grouping word_to_pos_cat: dict[str, str] = {} for unique_key, entry in words.items(): pos_raw = entry.get("pos", "") if pos_raw: word_to_pos_cat[unique_key] = _categorize_pos(pos_raw) # Also index by nikkud word (for shared_roots lookup by nikkud form) word_nikkud = entry["word"]["nikkud"] if word_nikkud not in word_to_pos_cat: word_to_pos_cat[word_nikkud] = _categorize_pos(pos_raw) if pos_raw else "Other" # Sort entries by effective frequency (pseudo_frequency for confusables, # else regular frequency; null → 999999), applying limit after sort def _freq_key(item: tuple[str, dict]) -> int: e = item[1] return e.get("pseudo_frequency") or e.get("frequency") or 999_999 sorted_entries = sorted(words.items(), key=_freq_key) if limit: sorted_entries = sorted_entries[:limit] deck = genanki.Deck(VOCAB_DECK_ID, "Hebrew Vocabulary") media_files: list[Path] = [] seen_words: set[tuple[str, str]] = set() for _unique_key, entry in sorted_entries: word_nikkud = entry["word"]["nikkud"] word_no_nik = entry["word"].get("ktiv_male", "") root_list = entry.get("root") or [] root = ".".join(root_list) pos_raw = entry.get("pos", "") pos_heb = entry.get("pos_hebrew", "") meaning = EMOJI_RE.sub("", entry.get("meaning", "") or "").strip() meaning = HBPAREN_RE.sub("", meaning).strip() # Strip remaining bare Hebrew text (spoiler: ktiv male visible in English meaning) meaning = re.sub(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*", "", meaning) meaning = re.sub(r"[;:]\s*—", " —", meaning) # clean "; —" → " —" meaning = re.sub(r";\s*:", ";", meaning) # clean "; :" → ";" meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ;:") meaning = re.sub(r"(\w)\(", r"\1 (", meaning) # space before opening paren meaning = re.sub(r",(\S)", r", \1", meaning) # space after comma slug = entry.get("slug", "") or "" frequency = entry.get("frequency") or 999_999 audio_file = entry.get("audio_file", "") or "" tags_str = entry.get("tags", "") or "" hint_str = entry.get("hint", "") or "" shared_roots_keys = entry.get("shared_roots") or [] is_confusable = entry.get("confusable_group") is not None if not word_nikkud or not meaning: continue # Skip exact duplicates (same word AND same meaning) word_meaning_key = (word_nikkud, meaning) if word_meaning_key in seen_words: logger.debug(f" Skipping duplicate word+meaning: {word_nikkud}") continue seen_words.add(word_meaning_key) # Frequency display label if frequency <= 500: freq_display = f"Core #{frequency}" elif frequency <= 1500: freq_display = f"Essential #{frequency}" elif frequency <= 3000: freq_display = f"Intermediate #{frequency}" elif frequency <= 5000: freq_display = f"Upper-intermediate #{frequency}" elif frequency <= 10000: freq_display = f"Advanced #{frequency}" elif frequency < 999_999: freq_display = f"Rare #{frequency}" else: freq_display = "Unlisted" # Emoji: use entry's emoji if emoji_visible, else fall back to emoji_lookup. # Skip fallback for verbs — keyword matching on verb definitions produces # wrong-sense emoji (e.g. "to cut" → 🥩, "to arm" → 🦾). emoji_str = "" if entry.get("emoji_visible") and entry.get("emoji"): emoji_str = entry["emoji"] elif emoji_lookup and not meaning.startswith("to "): meaning_clean_for_emoji = EMOJI_RE.sub("", meaning).strip() for kw in re.sub(r"[^\w\s]", " ", meaning_clean_for_emoji.lower()).split()[:5]: if len(kw) > 2 and kw not in _EMOJI_STOP and kw in emoji_lookup: emoji_str = emoji_lookup[kw] break # Hebrew prepositions — extracted upstream by list scraper entry_prep = entry.get("prep") prep_str = " ".join(f"({p})" for p in entry_prep.split()) if entry_prep else "" # Audio — use audio_file from entry; for confusables it's already slug-based audio_tag = "" if include_audio and audio_file: mp3_path = AUDIO_DIR / audio_file if mp3_path.exists(): audio_tag = f"[sound:{audio_file}]" if mp3_path not in media_files: media_files.append(mp3_path) else: # Fallback: try consonant-based filename audio_tag = _audio_tag(word_no_nik, slug=slug if is_confusable else "") if audio_tag: mp3_name = audio_tag.removeprefix("[sound:").removesuffix("]") mp3_path_fb = AUDIO_DIR / mp3_name if mp3_path_fb not in media_files: media_files.append(mp3_path_fb) elif include_audio: audio_tag = _audio_tag(word_no_nik, slug=slug if is_confusable else "") if audio_tag: mp3_name = audio_tag.removeprefix("[sound:").removesuffix("]") mp3_path_fb = AUDIO_DIR / mp3_name if mp3_path_fb not in media_files: media_files.append(mp3_path_fb) # Example sentence from vetted examples example_html = "" examples = entry.get("examples") or {} if examples.get("vetted"): example_html = examples["vetted"][0]["text"] # Cloze: use pre-computed cloze from words.json cloze_example = "" cloze_hint = "" if not is_confusable and examples.get("cloze"): cloze_data = examples["cloze"] cloze_text = cloze_data.get("text", "") start = cloze_data.get("cloze_word_start") end = cloze_data.get("cloze_word_end") if cloze_text and start is not None and end is not None: # Preserve Hebrew prefix letters in the cloze blank # e.g. "בַּתּוֹר" for word "תּוֹר" → "בַּ_____" not "_____" cloze_token = cloze_text[start:end] prefix_chars = _cloze_prefix_len(cloze_token, word_nikkud) cloze_example = cloze_text[: start + prefix_chars] + "_____" + cloze_text[end:] # Clean up duplicate adjacent quotation marks (e.g. "" → ") cloze_example = re.sub(r'["״]\s*["״]', '"', cloze_example) raw_hint = cloze_data.get("cloze_hint") or "" if raw_hint: cloze_hint = raw_hint else: pos_cat = _categorize_pos(pos_raw) if pos_raw else "Other" cloze_hint = meaning if pos_cat == "Verb" and pos_heb: cloze_hint = f"{meaning} ({pos_heb})" # Related words (shared roots) as a table: word — meaning, sorted by frequency related_html = "" if shared_roots_keys: rw_items: list[tuple[int, str, str]] = [] # (sort_key, nikkud, meaning) for rw_key in shared_roots_keys: rw_entry = words.get(rw_key) if rw_entry: rw_nikkud = rw_entry["word"]["nikkud"] rw_meaning = rw_entry.get("meaning") or "" if len(rw_meaning) > 40: rw_meaning = rw_meaning[:37] + "…" rw_freq = rw_entry.get("frequency") or 999999 else: rw_nikkud = rw_key rw_meaning = "" rw_freq = 999999 rw_items.append((rw_freq, rw_nikkud, rw_meaning)) rw_items.sort(key=lambda x: x[0]) rows_html: list[str] = [] for _freq, rw_nikkud, rw_meaning in rw_items: rows_html.append( f'
' f'{rw_nikkud}' f'{rw_meaning}' f"
" ) related_html = "\n".join(rows_html) # Plural form and gender (nouns only) plural_str = "" gender_str = "" if pos_raw.startswith("Noun"): noun_inflection = entry.get("noun_inflection") if noun_inflection: if noun_inflection.get("plural"): plural_str = noun_inflection["plural"].get("nikkud", "") gender_raw = noun_inflection.get("gender") or "" if gender_raw == "masculine": gender_str = "זָכָר" elif gender_raw == "feminine": gender_str = "נְקֵבָה" # Image image_tag = "" if include_images: image_filename = entry.get("image") or "" if image_filename: image_path = images_dir / image_filename if image_path.exists(): image_tag = image_filename if image_path not in media_files: media_files.append(image_path) # GUID: use vocab_legacy_guid from entry, fall back to deterministic legacy_guid = entry.get("vocab_legacy_guid") note_guid = legacy_guid or genanki.guid_for(word_nikkud, meaning) note = genanki.Note( model=VOCAB_MODEL, guid=note_guid, fields=[ word_nikkud, root, pos_heb, meaning, word_no_nik, related_html or "", tags_str, audio_tag, example_html, freq_display, image_tag, emoji_str, prep_str, hint_str, plural_str, gender_str, cloze_example, cloze_hint, ], tags=(tags_str.split() if tags_str else []) + [RELEASE_TAG] + [f"freq::{freq_display.split()[0]}" if freq_display != "Unlisted" else "freq::Unlisted"], ) deck.add_note(note) # Diagnostics emoji_count = sum(1 for n in deck.notes if n.fields[11]) prep_count = sum(1 for n in deck.notes if n.fields[12]) hint_count = sum(1 for n in deck.notes if n.fields[13]) plural_count = sum(1 for n in deck.notes if n.fields[14]) gender_count = sum(1 for n in deck.notes if n.fields[15]) cloze_count = sum(1 for n in deck.notes if n.fields[16]) unlisted = sum(1 for _, e in sorted_entries if (e.get("frequency") or 999_999) >= 999_999) if emoji_count: logger.info(f" Emoji extracted: {emoji_count} words") if prep_count: logger.info(f" Hebrew prepositions extracted: {prep_count} words") if hint_count: logger.info(f" Eng→Heb hints: {hint_count} words") if plural_count: logger.info(f" Noun plurals on vocab cards: {plural_count} words") if gender_count: logger.info(f" Noun gender on vocab cards: {gender_count} words") if cloze_count: logger.info(f" Sentence cloze cards: {cloze_count} words") logger.info(f" Unlisted words (not in frequency corpus): {unlisted}/{len(sorted_entries)}") logger.info(f"Vocabulary deck: {len(deck.notes)} notes") return deck, media_files def build_conj_deck( words: dict[str, dict], audio_dir: Path = AUDIO_CONJ_DIR, include_audio: bool = True, ) -> tuple[genanki.Deck, list[Path]]: """Build the conjugation drill deck from words with in_conjugation_deck=True.""" deck = genanki.Deck(CONJ_DECK_ID, "Hebrew Conjugations") media_files: list[Path] = [] note_count = 0 verb_count = 0 # Build root → [(freq, nikkud, meaning)] lookup for cross-linking root_words: dict[str, list[tuple[int, str, str]]] = {} for entry in words.values(): root_list = entry.get("root") or [] root_key = " ".join(root_list) if root_key: rw_meaning = entry.get("meaning") or "" if len(rw_meaning) > 40: rw_meaning = rw_meaning[:37] + "…" rw_freq = entry.get("frequency") or 999999 root_words.setdefault(root_key, []).append((rw_freq, entry["word"]["nikkud"], rw_meaning)) for _unique_key, entry in words.items(): conj = entry.get("conjugation") if not conj or not conj.get("in_conjugation_deck"): continue active_forms_list = conj.get("active_forms") or [] if not active_forms_list: continue verb_count += 1 infinitive = conj["infinitive"]["nikkud"] ref_form = conj["reference_form"]["nikkud"] binyan = conj.get("binyan", "") binyan_heb: str = conj.get("binyan_hebrew") or BINYAN_TO_HEBREW.get(binyan, binyan) or "" slug = entry.get("slug", "") or "" root_list = entry.get("root") or [] root = ".".join(root_list) voice = VOICE_MAP.get(binyan, "") meaning = entry.get("meaning", "") or "" # Hebrew preposition — extracted upstream by scraper prep_str = "" conj_prep = conj.get("prep") or entry.get("prep") if conj_prep: prep_str = conj_prep.strip("() ") related = [(f, w, m) for f, w, m in root_words.get(root, []) if w != infinitive] if related: related.sort(key=lambda x: x[0]) related_rows = [] for _freq, rw_nikkud, rw_meaning in related[:8]: related_rows.append( f'
' f'{rw_nikkud}' f'{rw_meaning}' f"
" ) related_str = "\n".join(related_rows) else: related_str = "" forms = _forms_list_to_dict(active_forms_list) def add_note( pronoun: str, tense: str, conj_form: str, audio_tag: str, _form_key_for_guid: str, guid_val: str | None = None, guid_candidates: list[str] | None = None, *, _infinitive: str = infinitive, _ref_form: str = ref_form, _root: str = root, _binyan_heb: str = binyan_heb, _voice: str = voice, _meaning: str = meaning, _related_str: str = related_str, _prep_str: str = prep_str, ) -> None: nonlocal note_count if not conj_form or not re.search(r"[\u05d0-\u05ea]", conj_form): return # Apply tense prefix (בְּ) display_tense = TENSE_WITH_BE.get(tense, tense) # GUID: use stored guid, then first candidate, then deterministic fallback if guid_val: note_guid = guid_val elif guid_candidates: note_guid = guid_candidates[0] else: note_guid = genanki.guid_for(_infinitive, pronoun, tense, _binyan_heb) note = genanki.Note( model=CONJ_MODEL, guid=note_guid, fields=[ _infinitive, _ref_form, pronoun, display_tense, conj_form, _root, _binyan_heb, _voice, audio_tag, _meaning, _related_str, _prep_str, ], tags=[RELEASE_TAG], ) deck.add_note(note) note_count += 1 # Seeded RNG per verb — deterministic pronoun/gender choices verb_rng = random.Random(hash(infinitive) & 0xFFFFFFFF) for form_key, form_data in forms.items(): primary_form = form_data.get("form", "") conj_form = primary_form # Infinitive: shown on card front as reference — skip as a quiz form if form_key == "infinitive": continue # Audio tag audio_tag = "" if include_audio and slug: audio_tag = _conj_audio_tag(slug, form_key) if audio_tag: mp3_path = audio_dir / f"{slug}_{form_key}.mp3" if mp3_path not in media_files: media_files.append(mp3_path) guid_val = form_data.get("guid") guid_candidates = form_data.get("guid_candidates") # Present tense expansion: 4 form keys → 1 card each (seeded RNG) if form_key in PRESENT_EXPANSION: chosen = verb_rng.choice(PRESENT_EXPANSION[form_key]) add_note(chosen[0], chosen[1], conj_form, audio_tag, form_key, guid_val, guid_candidates) continue # Past 3rd plural: same form for m/f → 1 card (seeded RNG) if form_key == "past_3p": chosen = verb_rng.choice(PAST_3P_EXPANSION) add_note(chosen[0], chosen[1], conj_form, audio_tag, form_key, guid_val, guid_candidates) continue # 2fp/3fp future and imperative: show modern (mp) form + classical (fp) in parens if form_key in FP_MODERN_FALLBACK: mp_key = FP_MODERN_FALLBACK[form_key] mp_form = forms.get(mp_key, {}).get("form", "") fp_form = conj_form display_form = f"{mp_form} ({fp_form})" if mp_form and mp_form != fp_form else fp_form pronoun = form_data.get("pronoun", "") tense = form_data.get("tense", "") add_note(pronoun, tense, display_form, audio_tag, form_key, guid_val, guid_candidates) continue # Standard card pronoun = form_data.get("pronoun", "") tense = form_data.get("tense", "") # 1st-person forms get a randomly assigned gender label (deterministic per verb) if form_key in {"past_1s", "past_1p", "future_1s", "future_1p"}: gender = verb_rng.choice(["זָכָר", "נְקֵבָה"]) pronoun = f"{pronoun} ({gender})" add_note(pronoun, tense, conj_form, audio_tag, form_key, guid_val, guid_candidates) # Passive partner forms (Huf'al/Pu'al counterpart) hufal_forms_list = conj.get("hufal_pual_forms") if hufal_forms_list: ref_passive = conj.get("reference_form_passive") ref_form_passive = ref_passive["nikkud"] if ref_passive else ref_form passive_binyan = "Huf'al" if binyan == "Hif'il" else "Pu'al" passive_binyan_heb = BINYAN_TO_HEBREW.get(passive_binyan, passive_binyan) passive_voice = VOICE_MAP.get(passive_binyan, "סָבִיל") passive_forms = _forms_list_to_dict(hufal_forms_list) for form_key, form_data in passive_forms.items(): primary_form = form_data.get("form", "") conj_form = primary_form if form_key == "infinitive": continue audio_tag = "" if include_audio and slug: passive_audio_key = f"passive_{form_key}" audio_tag = _conj_audio_tag(slug, passive_audio_key) if audio_tag: mp3_path = audio_dir / f"{slug}_{passive_audio_key}.mp3" if mp3_path not in media_files: media_files.append(mp3_path) guid_val = form_data.get("guid") guid_candidates = form_data.get("guid_candidates") if form_key in PRESENT_EXPANSION: chosen = verb_rng.choice(PRESENT_EXPANSION[form_key]) add_note( chosen[0], chosen[1], conj_form, audio_tag, form_key, guid_val, guid_candidates, _ref_form=ref_form_passive, _binyan_heb=passive_binyan_heb, _voice=passive_voice, ) continue if form_key == "past_3p": chosen = verb_rng.choice(PAST_3P_EXPANSION) add_note( chosen[0], chosen[1], conj_form, audio_tag, form_key, guid_val, guid_candidates, _ref_form=ref_form_passive, _binyan_heb=passive_binyan_heb, _voice=passive_voice, ) continue if form_key in FP_MODERN_FALLBACK: mp_key = FP_MODERN_FALLBACK[form_key] mp_form = passive_forms.get(mp_key, {}).get("form", "") fp_form = conj_form display_form = f"{mp_form} ({fp_form})" if mp_form and mp_form != fp_form else fp_form pronoun = form_data.get("pronoun", "") tense = form_data.get("tense", "") add_note( pronoun, tense, display_form, audio_tag, form_key, guid_val, guid_candidates, _ref_form=ref_form_passive, _binyan_heb=passive_binyan_heb, _voice=passive_voice, ) continue pronoun = form_data.get("pronoun", "") tense = form_data.get("tense", "") if form_key in {"past_1s", "past_1p", "future_1s", "future_1p"}: gender = verb_rng.choice(["זָכָר", "נְקֵבָה"]) pronoun = f"{pronoun} ({gender})" add_note( pronoun, tense, conj_form, audio_tag, form_key, guid_val, guid_candidates, _ref_form=ref_form_passive, _binyan_heb=passive_binyan_heb, _voice=passive_voice, ) logger.info(f"Conjugation deck: {note_count} notes across {verb_count} verbs") return deck, media_files # ────────────────────────────────────────────────────────────────────────────── # Confusables deck — words that look identical without nikkud # ────────────────────────────────────────────────────────────────────────────── CONF_FRONT = """
{{Words}}
מה ההבדל?
""" CONF_BACK = """ {{FrontSide}}
{{Definitions}}
{{#Audio}}
{{Audio}}
{{/Audio}} """ CONF_CSS = CARD_CSS CONF_MODEL = genanki.Model( CONF_MODEL_ID, "Hebrew Confusables", fields=[ {"name": "Words"}, {"name": "Definitions"}, {"name": "Audio"}, {"name": "WordNoNikkud"}, ], templates=[ { "name": "Confusable", "qfmt": CONF_FRONT, "afmt": CONF_BACK, }, ], css=CONF_CSS, ) def build_confusables_deck( words: dict[str, dict], include_audio: bool = True, ) -> tuple[genanki.Deck, list[Path]]: """Build confusables deck from words dict — groups words by confusable_group.""" logger.info("Building confusables deck …") deck = genanki.Deck(CONF_DECK_ID, "Hebrew Confusables") media_files: list[Path] = [] note_count = 0 # Group entries by shared ktiv_male (confusable_group members share the same ktiv_male) # Use confusables_guid as the stable note GUID — all members of a group share it. # Process each unique guid once. seen_guids: set[str] = set() # Build guid → list of entries guid_to_entries: dict[str, list[dict]] = {} for unique_key, entry in words.items(): if entry.get("confusable_group") is None: continue guid = entry.get("confusables_guid") if not guid: # Fall back to ktiv_male-based guid guid = genanki.guid_for("confusable", entry["word"].get("ktiv_male", unique_key)) guid_to_entries.setdefault(guid, []).append(entry) def _eff_freq(e: dict) -> int: return e.get("pseudo_frequency") or e.get("frequency") or 999_999 for guid, group_entries in sorted( guid_to_entries.items(), key=lambda x: sum(_eff_freq(e) for e in x[1]) / len(x[1]), ): if guid in seen_guids: continue seen_guids.add(guid) if len(group_entries) < 2: continue # Deduplicate: skip entries with identical word+meaning seen: set[tuple[str, str]] = set() unique_entries: list[dict] = [] for e in group_entries: key = (e["word"]["nikkud"], e.get("meaning", "")) if key not in seen: seen.add(key) unique_entries.append(e) if len(unique_entries) < 2: continue # Sort by pseudo/frequency so most common meaning appears first unique_entries.sort(key=_eff_freq) if len(unique_entries) < 2: continue word_no_nik = unique_entries[0]["word"].get("ktiv_male", "") words_display = word_no_nik # Show ktiv male (shared form) on front defs_parts: list[str] = [] audio_parts: list[str] = [] for e in unique_entries: w = e["word"]["nikkud"] m = e.get("meaning", "") p = e.get("pos_hebrew", "") pos_div = f'
{p}
' if p else "" defs_parts.append( f'
' f'{w}' f'
{m}
' f"{pos_div}
" ) if include_audio: af = e.get("audio_file", "") or "" at = "" if af: mp3_path = AUDIO_DIR / af if mp3_path.exists(): at = f"[sound:{af}]" if not at: slug = e.get("slug", "") or "" ktiv_male = e.get("word", {}).get("ktiv_male", "") or "" at = _audio_tag(ktiv_male, slug=slug) if at and at not in audio_parts: audio_parts.append(at) mp3_name = at.removeprefix("[sound:").removesuffix("]") mp3_path = AUDIO_DIR / mp3_name if mp3_path not in media_files: media_files.append(mp3_path) defs_html = "\n".join(defs_parts) audio_html = " ".join(audio_parts) note = genanki.Note( model=CONF_MODEL, guid=guid, fields=[words_display, defs_html, audio_html, word_no_nik], tags=[RELEASE_TAG], ) deck.add_note(note) note_count += 1 logger.info(f"Confusables deck: {note_count} notes") return deck, media_files def write_conf_apkg( deck: genanki.Deck, media_files: list[Path] | None = None, out_path: Path = CONF_APKG, ) -> None: out_path.parent.mkdir(parents=True, exist_ok=True) pkg = genanki.Package(deck) base = [str(p) for p in (media_files or []) if p.exists()] pkg.media_files = base + _font_media_files() pkg.write_to_file(str(out_path)) logger.info(f"Confusables deck written → {out_path}") # ────────────────────────────────────────────────────────────────────────────── # Noun plurals deck — singular↔plural drilling # ────────────────────────────────────────────────────────────────────────────── PLURAL_FRONT_SG = """
{{Singular}}
{{#SingularAudio}}
{{SingularAudio}}
{{/SingularAudio}}
{{Meaning}}
יָחִיד ← רַבִּים
""" PLURAL_BACK_SG = """ {{FrontSide}}
{{Plural}}
{{#PluralAudio}}
{{PluralAudio}}
{{/PluralAudio}}
{{#Gender}}
מִין:{{Gender}}
{{/Gender}} {{#Mishkal}}
מִשְׁקָל:{{Mishkal}}
{{/Mishkal}}
""" PLURAL_FRONT_PL = """
{{Plural}}
{{#PluralAudio}}
{{PluralAudio}}
{{/PluralAudio}}
רַבִּים ← יָחִיד
""" PLURAL_BACK_PL = """ {{FrontSide}}
{{Singular}}
{{#SingularAudio}}
{{SingularAudio}}
{{/SingularAudio}}
{{Meaning}}
{{#Gender}}
מִין:{{Gender}}
{{/Gender}} {{#Mishkal}}
מִשְׁקָל:{{Mishkal}}
{{/Mishkal}}
""" PLURAL_CSS = CARD_CSS PLURAL_MODEL = genanki.Model( PLURAL_MODEL_ID, "Hebrew Plurals", fields=[ {"name": "Singular"}, {"name": "SingularAudio"}, {"name": "Plural"}, {"name": "PluralAudio"}, {"name": "Meaning"}, {"name": "Root"}, {"name": "Mishkal"}, {"name": "Gender"}, ], templates=[ { "name": "Singular → Plural", "qfmt": PLURAL_FRONT_SG, "afmt": PLURAL_BACK_SG, }, { "name": "Plural → Singular", "qfmt": PLURAL_FRONT_PL, "afmt": PLURAL_BACK_PL, }, ], css=PLURAL_CSS, ) def _is_irregular_plural(gender: str, plural_ktiv: str) -> bool: """Detect irregular plurals: masc nouns with ות- suffix, fem with ים- suffix. Args: gender: ``"masculine"`` or ``"feminine"``. plural_ktiv: ktiv male (no nikkud) form of the plural. """ return (gender == "masculine" and plural_ktiv.endswith("ות")) or ( gender == "feminine" and plural_ktiv.endswith("ים") ) def build_plural_deck( words: dict[str, dict], include_audio: bool = False, ) -> tuple[genanki.Deck, list[Path]]: """Build noun plurals deck from words with noun_inflection data. Selection: ALL irregular plurals + 2-3 high-frequency exemplars per mishkal pattern (for regular nouns). """ logger.info("Building plurals deck …") deck = genanki.Deck(PLURAL_DECK_ID, "Hebrew Plurals") media_files: list[Path] = [] # Collect all nouns with both singular and plural irregulars: list[tuple[str, dict, dict]] = [] # (unique_key, entry, noun_inflection) by_mishkal: dict[str, list[tuple[str, dict, dict]]] = {} for unique_key, entry in words.items(): if not entry.get("pos", "").startswith("Noun"): continue noun_inflection = entry.get("noun_inflection") if not noun_inflection: continue singular_data = noun_inflection.get("singular") plural_data = noun_inflection.get("plural") if not singular_data or not plural_data: continue singular = singular_data.get("nikkud", "") plural = plural_data.get("nikkud", "") plural_ktiv = plural_data.get("ktiv_male", "") if not singular or not plural: continue gender = noun_inflection.get("gender", "") mishkal = noun_inflection.get("mishkal") or "" if _is_irregular_plural(gender, plural_ktiv): irregulars.append((unique_key, entry, noun_inflection)) elif mishkal: by_mishkal.setdefault(mishkal, []).append((unique_key, entry, noun_inflection)) # Select regular exemplars to achieve a 2:1 regular:irregular ratio. # Distribute evenly across mishkal patterns, preferring high-frequency words. irregular_count = len(irregulars) target_regular = irregular_count * 2 mishkal_count = len(by_mishkal) or 1 # Over-sample per mishkal to compensate for small patterns, then trim per_mishkal = max(3, (target_regular * 3) // (mishkal_count * 2)) regular_pool: list[tuple[str, dict, dict]] = [] for _mishkal, entries in sorted(by_mishkal.items()): entries.sort(key=lambda e: e[1].get("frequency") or 999_999) regular_pool.extend(entries[:per_mishkal]) # If we overshot, trim to target (keeping highest-frequency across all mishkals) if len(regular_pool) > target_regular: regular_pool.sort(key=lambda e: e[1].get("frequency") or 999_999) regular_pool = regular_pool[:target_regular] # Sort both pools by frequency, then interleave for homogeneous 2:1 regular:irregular irregulars.sort(key=lambda e: e[1].get("frequency") or 999_999) regular_pool.sort(key=lambda e: e[1].get("frequency") or 999_999) # Interleave: for every 1 irregular, insert 2 regulars selected: list[tuple[str, dict, dict]] = [] ri = 0 # regular index for _ii, irr in enumerate(irregulars): # Insert 2 regulars before each irregular (when available) for _ in range(2): if ri < len(regular_pool): selected.append(regular_pool[ri]) ri += 1 selected.append(irr) # Append remaining regulars while ri < len(regular_pool): selected.append(regular_pool[ri]) ri += 1 note_count = 0 for _unique_key, entry, noun_inflection in selected: singular = noun_inflection["singular"]["nikkud"] singular_ktiv = noun_inflection["singular"].get("ktiv_male", "") plural = noun_inflection["plural"]["nikkud"] plural_ktiv = noun_inflection["plural"].get("ktiv_male", "") gender = noun_inflection.get("gender") or "" gender_heb = {"masculine": "זָכָר", "feminine": "נְקֵבָה"}.get(gender, gender) mishkal_heb = noun_inflection.get("mishkal_hebrew") or "" meaning = EMOJI_RE.sub("", entry.get("meaning") or "").strip() root_list = entry.get("root") or [] root = ".".join(root_list) # GUID from noun_inflection note_guid_raw = noun_inflection.get("plurals_guid") note_guid = note_guid_raw if note_guid_raw else genanki.guid_for("plural", singular, meaning) # Audio tags sg_audio = "" pl_audio = "" if include_audio: slug = entry.get("slug", "") sg_tag = _audio_tag(singular_ktiv, slug=slug) if sg_tag: sg_audio = sg_tag mp3_path = AUDIO_DIR / sg_tag.removeprefix("[sound:").removesuffix("]") if mp3_path not in media_files: media_files.append(mp3_path) # Plural audio: {slug}_plural.mp3 if slug: pl_mp3 = AUDIO_DIR / f"{slug}_plural.mp3" if pl_mp3.exists(): pl_audio = f"[sound:{pl_mp3.name}]" if pl_mp3 not in media_files: media_files.append(pl_mp3) mishkal_eng = noun_inflection.get("mishkal") or "" tags = [RELEASE_TAG] if mishkal_eng: tags.append(f"mishkal::{mishkal_eng}") if _is_irregular_plural(gender, plural_ktiv): tags.append("irregular") note = genanki.Note( model=PLURAL_MODEL, guid=note_guid, fields=[ singular, sg_audio, plural, pl_audio, meaning, root, mishkal_heb, gender_heb, ], tags=tags, ) deck.add_note(note) note_count += 1 irregular_count = len(irregulars) regular_count = note_count - irregular_count logger.info( f"Plurals deck: {note_count} notes " f"({irregular_count} irregular + {regular_count} regular exemplars " f"from {len(by_mishkal)} mishkal patterns)" ) return deck, media_files def write_plural_apkg( deck: genanki.Deck, media_files: list[Path] | None = None, out_path: Path = PLURAL_APKG, ) -> None: out_path.parent.mkdir(parents=True, exist_ok=True) pkg = genanki.Package(deck) base = [str(p) for p in (media_files or []) if p.exists()] pkg.media_files = base + _font_media_files() pkg.write_to_file(str(out_path)) logger.info(f"Plurals deck written → {out_path}") def _font_media_files() -> list[str]: """Return list of Heebo font file paths that exist, for bundling in .apkg.""" font_paths = list(FONTS_DIR.glob("_Heebo*.ttf")) return [str(p) for p in font_paths if p.exists()] class _RandomOrderPackage(genanki.Package): """genanki.Package subclass that sets new card order to random (0) instead of insertion order (1).""" def write_to_db(self, cursor, timestamp, id_gen): super().write_to_db(cursor, timestamp, id_gen) row = cursor.execute("SELECT dconf FROM col").fetchone() if row: dconf = json.loads(row[0]) for conf in dconf.values(): if isinstance(conf, dict) and "new" in conf: conf["new"]["order"] = 0 cursor.execute("UPDATE col SET dconf = ?", [json.dumps(dconf)]) def write_vocab_apkg( deck: genanki.Deck, media_files: list[Path], out_path: Path = VOCAB_APKG, ) -> None: out_path.parent.mkdir(parents=True, exist_ok=True) pkg = genanki.Package(deck) # insertion order = frequency rank (new.order=1 default) pkg.media_files = [str(p) for p in media_files if p.exists()] + _font_media_files() pkg.write_to_file(str(out_path)) logger.info(f"Vocabulary deck written → {out_path}") def write_conj_apkg( deck: genanki.Deck, media_files: list[Path] | None = None, out_path: Path = CONJ_APKG, ) -> None: out_path.parent.mkdir(parents=True, exist_ok=True) pkg = _RandomOrderPackage(deck) base = [str(p) for p in (media_files or []) if p.exists()] pkg.media_files = base + _font_media_files() pkg.write_to_file(str(out_path)) logger.info(f"Conjugation deck written → {out_path}") def build_complete_deck( words: dict[str, dict], limit: int | None = None, include_audio: bool = False, emoji_lookup: dict | None = None, ) -> tuple[list[genanki.Deck], list[Path]]: """Build all subdecks under 'Hebrew::*' for the combined .apkg. Returns (list_of_decks, deduplicated_media_files). """ logger.info(f" Building complete deck (audio={'yes' if include_audio else 'no'}) …") # Build standalone decks using existing functions vocab_deck, vocab_media = build_vocab_deck( words, limit=limit, include_audio=include_audio, include_images=True, emoji_lookup=emoji_lookup, ) conj_deck, conj_media = build_conj_deck( words, include_audio=include_audio, ) conf_deck, conf_media = build_confusables_deck( words, include_audio=include_audio, ) plural_deck, plural_media = build_plural_deck( words, include_audio=include_audio, ) # Create new Deck objects with subdeck names and different IDs complete_vocab = genanki.Deck(COMPLETE_VOCAB_DECK_ID, "Hebrew::Vocabulary") for note in vocab_deck.notes: complete_vocab.add_note(note) complete_conj = genanki.Deck(COMPLETE_CONJ_DECK_ID, "Hebrew::Conjugations") for note in conj_deck.notes: complete_conj.add_note(note) complete_conf = genanki.Deck(COMPLETE_CONF_DECK_ID, "Hebrew::Confusables") for note in conf_deck.notes: complete_conf.add_note(note) complete_plural = genanki.Deck(COMPLETE_PLURAL_DECK_ID, "Hebrew::Plurals") for note in plural_deck.notes: complete_plural.add_note(note) all_source_media = vocab_media + conj_media + conf_media + plural_media # Deduplicate media files by resolved path seen_paths: set[str] = set() all_media: list[Path] = [] for mf in all_source_media: resolved = str(mf.resolve()) if mf.exists() else str(mf) if resolved not in seen_paths: seen_paths.add(resolved) all_media.append(mf) decks = [complete_vocab, complete_conj, complete_conf, complete_plural] plural_info = f" + {len(complete_plural.notes)} plural" logger.info( f" Complete deck: {len(complete_vocab.notes)} vocab + " f"{len(complete_conj.notes)} conj + {len(complete_conf.notes)} conf{plural_info} notes, " f"{len(all_media)} media files" ) return decks, all_media def write_complete_apkg( decks: list[genanki.Deck], media_files: list[Path], out_path: Path = COMPLETE_APKG, ) -> None: """Write a combined .apkg with multiple subdecks.""" out_path.parent.mkdir(parents=True, exist_ok=True) pkg = genanki.Package(decks) pkg.media_files = [str(p) for p in media_files if p.exists()] + _font_media_files() pkg.write_to_file(str(out_path)) logger.info(f"Complete deck written → {out_path}") def build_all_variants( words: dict[str, dict], limit: int | None = None, ) -> None: """Build all 12 release variants into output/.""" logger.info("Building all release variants …") emoji_lookup = _load_emoji_lookup() logger.info(f" Emoji lookup: {len(emoji_lookup)} keywords loaded") vocab_variants = [ (False, False, VOCAB_APKG), (True, False, VOCAB_APKG_AUDIO), (False, True, VOCAB_APKG_IMAGES), (True, True, VOCAB_APKG_AUDIO_IMAGES), ] for audio, images, path in vocab_variants: label = f"audio={'yes' if audio else 'no'} images={'yes' if images else 'no'}" logger.info(f" Vocab variant: {label} → {path.name}") deck, media = build_vocab_deck( words, limit=limit, include_audio=audio, include_images=images, emoji_lookup=emoji_lookup, ) write_vocab_apkg(deck, media, out_path=path) conj_variants = [ (False, CONJ_APKG), (True, CONJ_APKG_AUDIO), ] for audio, path in conj_variants: label = f"audio={'yes' if audio else 'no'}" logger.info(f" Conj variant: {label} → {path.name}") deck, media = build_conj_deck(words, include_audio=audio) write_conj_apkg(deck, media, out_path=path) conf_variants = [ (False, CONF_APKG), (True, CONF_APKG_AUDIO), ] for audio, path in conf_variants: label = f"audio={'yes' if audio else 'no'}" logger.info(f" Conf variant: {label} → {path.name}") deck, media = build_confusables_deck(words, include_audio=audio) write_conf_apkg(deck, media, out_path=path) plural_variants = [ (False, PLURAL_APKG), (True, PLURAL_APKG_AUDIO), ] for audio, path in plural_variants: label = f"audio={'yes' if audio else 'no'}" logger.info(f" Plural variant: {label} → {path.name}") deck, media = build_plural_deck(words, include_audio=audio) write_plural_apkg(deck, media, out_path=path) # Combined "Hebrew::*" complete decks complete_variants = [ (False, COMPLETE_APKG), (True, COMPLETE_APKG_AUDIO), ] for audio, path in complete_variants: decks, media = build_complete_deck( words, limit=limit, include_audio=audio, emoji_lookup=emoji_lookup, ) write_complete_apkg(decks, media, out_path=path) logger.info("All variants built.") if __name__ == "__main__": logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s") words = _load_words() deck, media = build_vocab_deck(words, limit=20) write_vocab_apkg(deck, media) conj_deck, conj_media = build_conj_deck(words) write_conj_apkg(conj_deck, conj_media)