#!/usr/bin/env python3
"""
Build Anki .apkg files for both the vocabulary deck and the conjugation deck.
Uses genanki for reliable, stable deck generation.
Deck IDs are hardcoded integers — same ID on re-import updates the existing deck
in Anki rather than creating a duplicate.
"""
import json
import logging
import random
import re
from pathlib import Path
import genanki
logger = logging.getLogger(__name__)
# Stable deck/model IDs — do not change these.
# Per the module docstring, re-importing a package with the same deck ID updates
# the existing deck in Anki instead of creating a duplicate.
VOCAB_DECK_ID = 1_234_567_890
VOCAB_MODEL_ID = 1_701_222_017_968  # matches Nevo's original Anki model
CONJ_DECK_ID = 1_234_567_892
CONJ_MODEL_ID = 1_234_567_893
CONF_DECK_ID = 1_234_567_894
CONF_MODEL_ID = 1_234_567_895
PLURAL_DECK_ID = 1_234_567_896
PLURAL_MODEL_ID = 1_234_567_897
# Subdeck IDs for combined "Hebrew::*" package — MUST differ from standalone IDs
COMPLETE_VOCAB_DECK_ID = 1_234_567_900
COMPLETE_CONJ_DECK_ID = 1_234_567_901
COMPLETE_CONF_DECK_ID = 1_234_567_902
COMPLETE_PLURAL_DECK_ID = 1_234_567_903
# Release version tag added to all notes so users can identify which release
# their cards come from (visible in Anki's Browse view and card info).
RELEASE_TAG = "v0.20"
# Patterns applied to meaning strings. build_vocab_deck uses both with
# ``.sub("", …)``, i.e. to strip the matched text rather than capture it.
# EMOJI_RE: runs of code points in U+1F000–U+1FFFF, U+2600–U+27FF,
#           U+2300–U+23FF and the variation selectors U+FE00–U+FE0F.
# HBPAREN_RE: a parenthesised run of Hebrew points/letters
#             (U+05B0–U+05EA, U+05F0–U+05F4); group 1 is the text inside.
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
HBPAREN_RE = re.compile(r"\(([\u05b0-\u05ea\u05f0-\u05f4]+)\)")
# Input locations, all resolved relative to this file.
DATA_DIR = Path(__file__).parent / "data"
AUDIO_DIR = DATA_DIR / "audio"  # vocabulary audio (mp3)
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"  # conjugation audio (mp3)
# Output package paths. The filename suffixes (_audio, _images, _audio_images)
# name the media variant of each package.
OUTPUT_DIR = Path(__file__).parent / "output"
VOCAB_APKG = OUTPUT_DIR / "hebrew_vocabulary.apkg"
VOCAB_APKG_AUDIO = OUTPUT_DIR / "hebrew_vocabulary_audio.apkg"
VOCAB_APKG_IMAGES = OUTPUT_DIR / "hebrew_vocabulary_images.apkg"
VOCAB_APKG_AUDIO_IMAGES = OUTPUT_DIR / "hebrew_vocabulary_audio_images.apkg"
CONJ_APKG = OUTPUT_DIR / "hebrew_conjugations.apkg"
CONJ_APKG_AUDIO = OUTPUT_DIR / "hebrew_conjugations_audio.apkg"
CONF_APKG = OUTPUT_DIR / "hebrew_confusables.apkg"
CONF_APKG_AUDIO = OUTPUT_DIR / "hebrew_confusables_audio.apkg"
PLURAL_APKG = OUTPUT_DIR / "hebrew_plurals.apkg"
PLURAL_APKG_AUDIO = OUTPUT_DIR / "hebrew_plurals_audio.apkg"
COMPLETE_APKG = OUTPUT_DIR / "hebrew_complete.apkg"
COMPLETE_APKG_AUDIO = OUTPUT_DIR / "hebrew_complete_audio.apkg"
# ──────────────────────────────────────────────────────────────────────────────
# Binyan → Hebrew label mapping (for conjugation card display)
# ──────────────────────────────────────────────────────────────────────────────
# Keys are the transliterated binyan names; values are the vocalised Hebrew
# labels. build_conj_deck falls back to this table when a conjugation record
# carries no "binyan_hebrew" value of its own.
BINYAN_TO_HEBREW: dict[str, str] = {
    "Pa'al": "פָּעַל",
    "Nif'al": "נִפְעַל",
    "Pi'el": "פִּעֵל",
    "Pu'al": "פֻּעַל",
    "Hitpa'el": "הִתְפַּעֵל",
    "Hif'il": "הִפְעִיל",
    "Huf'al": "הֻפְעַל",
}
# ──────────────────────────────────────────────────────────────────────────────
# PoS → Hebrew label mapping
# ──────────────────────────────────────────────────────────────────────────────
# English part-of-speech name → Hebrew label (unvocalised).
POS_TO_HEBREW: dict[str, str] = {
    "Noun": "שם עצם",
    "Verb": "פועל",
    "Adjective": "שם תואר",
    "Adverb": "תואר הפועל",
    "Preposition": "מילת יחס",
    "Conjunction": "מילת חיבור",
    "Pronoun": "כינוי גוף",
    "Particle": "מילית",
}
# PoS category groupings for related-words display
# Keys double as the set of canonical categories recognised by
# _categorize_pos (anything else becomes "Other"); values are the Hebrew
# plural headings for each category.
POS_CATEGORY_LABELS: dict[str, str] = {
    "Verb": "פעלים",
    "Noun": "שמות עצם",
    "Adjective": "שמות תואר",
    "Adverb": "תוארי הפועל",
}
# ──────────────────────────────────────────────────────────────────────────────
# Shared CSS
# ──────────────────────────────────────────────────────────────────────────────
# Directory under data/ for font files.
# NOTE(review): presumably holds the Heebo .ttf files that CARD_CSS references
# as '_Heebo-Regular.ttf' / '_Heebo-Bold.ttf' — confirm where they are added
# to the package media.
FONTS_DIR = DATA_DIR / "fonts"
# Stylesheet shared by the card models (VOCAB_MODEL uses it directly and
# CONJ_CSS aliases it). The dark-theme colours are declared twice on purpose:
# once under ``@media (prefers-color-scheme: dark)`` and once under the
# ``.nightMode`` class selector, so keep the two lists in sync when editing.
CARD_CSS = """
@font-face {
font-family: 'Heebo';
src: url('_Heebo-Regular.ttf');
font-weight: normal;
}
@font-face {
font-family: 'Heebo';
src: url('_Heebo-Bold.ttf');
font-weight: bold;
}
.card {
font-family: 'Heebo', 'Arial Hebrew', 'David', Arial, sans-serif;
font-size: 20px;
text-align: right;
color: #222;
background: #fff;
padding: 16px;
max-width: 600px;
margin: 0 auto;
}
.hebrew {
font-size: 42px;
font-weight: bold;
direction: rtl;
text-align: center;
line-height: 1.5;
color: #222;
}
.hebrew-sm {
font-size: 30px;
font-weight: normal;
direction: rtl;
text-align: center;
color: #222;
}
.meaning {
font-size: 34px;
color: #1a1a8c;
margin: 8px 0;
text-align: center;
}
.hint {
font-size: 22px;
color: #555;
margin: 4px 0;
direction: rtl;
text-align: center;
}
.example {
font-size: 24px;
color: #222;
direction: rtl;
text-align: right;
font-style: italic;
margin: 10px auto 0;
max-width: 90%;
border-right: 3px solid #aaa;
padding-right: 8px;
}
.divider { border-top: 1px solid #ddd; margin: 10px 0; }
.freq-badge {
display: inline-block;
font-size: 11px;
color: #aaa;
background: transparent;
border: 1px solid #eee;
border-radius: 10px;
padding: 2px 8px;
margin-top: 4px;
}
.voice-label {
font-size: 0.6em;
font-weight: normal;
color: #555;
}
.sec-table {
display: table;
margin: 6px auto 0;
direction: rtl;
border-collapse: collapse;
}
.sec-label {
display: table-row;
font-size: 28px;
font-weight: normal;
color: #222;
direction: rtl;
}
.sec-key {
display: table-cell;
font-size: 28px;
color: #222;
font-weight: bold;
text-align: right;
padding: 2px 0 2px 8px;
white-space: nowrap;
}
.sec-val {
display: table-cell;
font-size: 28px;
color: #222;
text-align: right;
padding: 2px 0;
}
.definitions {
direction: rtl;
text-align: center;
}
.more-toggle {
text-align: center;
direction: rtl;
margin-top: 8px;
}
.more-header {
display: inline-block;
font-size: 18px;
color: #555;
cursor: pointer;
list-style: none;
border: 1px solid #ccc;
border-radius: 16px;
padding: 4px 16px;
margin: 4px 0;
background: #f8f8f8;
}
.more-header::-webkit-details-marker { display: none; }
.more-header::before { content: "○ "; font-size: 14px; }
details[open] > .more-header::before { content: "● "; }
.related-header {
font-size: 22px;
color: #555;
text-align: center;
margin: 4px 0;
}
.rw-word {
display: table-cell;
font-size: 28px;
color: #222;
font-weight: normal;
text-align: right;
padding: 2px 0 2px 8px;
white-space: nowrap;
}
.rw-meaning {
display: table-cell;
font-size: 24px;
color: #555;
text-align: left;
direction: ltr;
padding: 2px 0;
}
.conf-entry {
margin: 8px 0;
font-size: 28px;
direction: rtl;
}
.emoji-img {
font-size: 3.5em;
text-align: center;
margin: 0.3em 0;
}
.plural-direction {
font-size: 32px;
color: #444;
text-align: center;
direction: rtl;
margin: 8px 0;
font-weight: bold;
}
.card [type="button"], .card button, .replay-button {
display: block !important;
margin: 4px auto !important;
text-align: center;
}
@media (prefers-color-scheme: dark) {
.card { color: #e8e8e8; background: #1c1c1e; }
.hebrew { color: #f0f0f0; }
.hebrew-sm { color: #e0e0e0; }
.meaning { color: #82b0ff; }
.sec-label { color: #e0e0e0; }
.sec-key { color: #e0e0e0; }
.sec-val { color: #e0e0e0; }
.conf-entry { color: #ddd; }
.hint { color: #777; }
.voice-label { color: #888; }
.example { color: #e0e0e0; border-right-color: #555; }
.divider { border-top-color: #333; }
.freq-badge { color: #888; border-color: #444; }
.more-header { color: #bbb; background: #2a2a2e; border-color: #555; }
.related-header { color: #999; }
.rw-word { color: #e0e0e0; }
.rw-meaning { color: #999; }
.plural-direction { color: #aaa; }
}
.nightMode .card { color: #e8e8e8; background: #1c1c1e; }
.nightMode .hebrew { color: #f0f0f0; }
.nightMode .hebrew-sm { color: #e0e0e0; }
.nightMode .meaning { color: #82b0ff; }
.nightMode .sec-label { color: #e0e0e0; }
.nightMode .sec-key { color: #e0e0e0; }
.nightMode .sec-val { color: #e0e0e0; }
.nightMode .conf-entry { color: #ddd; }
.nightMode .hint { color: #777; }
.nightMode .voice-label { color: #888; }
.nightMode .example { color: #e0e0e0; border-right-color: #555; }
.nightMode .divider { border-top-color: #333; }
.nightMode .freq-badge { color: #888; border-color: #444; }
.nightMode .more-header { color: #bbb; background: #2a2a2e; border-color: #555; }
.nightMode .related-header { color: #999; }
.nightMode .rw-word { color: #e0e0e0; }
.nightMode .rw-meaning { color: #999; }
.nightMode .plural-direction { color: #aaa; }
"""
# ──────────────────────────────────────────────────────────────────────────────
# Vocabulary Deck
# ──────────────────────────────────────────────────────────────────────────────
# Card templates for the vocabulary note type. ``{{Field}}`` inserts a field,
# ``{{#Field}}…{{/Field}}`` renders only when the field is non-empty and
# ``{{^Field}}…{{/Field}}`` only when it is empty.
# NOTE(review): as written these templates contain field references and Hebrew
# labels only — no elements carrying the CARD_CSS classes — and the
# ``{{#Image}}{{/Image}}`` section is empty, so the Image field is never
# rendered. Confirm that markup was not lost from these strings.

# Hebrew → English, question side: the word, optional preposition, optional audio.
VOCAB_FRONT_HEB = """
{{Word}}{{#Prep}} {{Prep}}{{/Prep}}
{{#Audio}}{{Audio}}
{{/Audio}}
"""
# Hebrew → English, answer side: meaning, emoji, then the optional detail rows
# (spelling without nikkud, root, part of speech + gender, plural, shared roots).
VOCAB_BACK_HEB = """
{{FrontSide}}
{{Meaning}}
{{#Emoji}}{{Emoji}}
{{/Emoji}}
{{^Emoji}}{{#Image}}{{/Image}}{{/Emoji}}
{{#WordNoNikkud}}
לְלֹא נִיקּוּד:{{WordNoNikkud}}
{{/WordNoNikkud}}
{{#Root}}
שֹׁרֶשׁ:{{Root}}
{{/Root}}
{{#PoS}}
חֵלֶק דִּיבּוּר:{{PoS}}{{#Gender}}, {{Gender}}{{/Gender}}
{{/PoS}}
{{#Plural}}
רַבִּים:{{Plural}}
{{/Plural}}
{{#SharedRoots}}
{{SharedRoots}}
{{/SharedRoots}}
"""
# English → Hebrew, question side: meaning, optional hint, optional emoji.
VOCAB_FRONT_ENG = """
{{Meaning}}
{{#Hint}}{{Hint}}
{{/Hint}}
{{#Emoji}}{{Emoji}}
{{/Emoji}}
{{^Emoji}}{{#Image}}{{/Image}}{{/Emoji}}
"""
# English → Hebrew, answer side: the word, audio and the same detail rows as
# VOCAB_BACK_HEB.
VOCAB_BACK_ENG = """
{{FrontSide}}
{{Word}}{{#Prep}} {{Prep}}{{/Prep}}
{{#Audio}}{{Audio}}
{{/Audio}}
{{#WordNoNikkud}}
לְלֹא נִיקּוּד:{{WordNoNikkud}}
{{/WordNoNikkud}}
{{#Root}}
שֹׁרֶשׁ:{{Root}}
{{/Root}}
{{#PoS}}
חֵלֶק דִּיבּוּר:{{PoS}}{{#Gender}}, {{Gender}}{{/Gender}}
{{/PoS}}
{{#Plural}}
רַבִּים:{{Plural}}
{{/Plural}}
{{#SharedRoots}}
{{SharedRoots}}
{{/SharedRoots}}
"""
# Sentence cloze, question side: the blanked sentence plus an optional hint.
VOCAB_FRONT_CLOZE = """
{{ClozeExample}}
{{#ClozeHint}}{{ClozeHint}}
{{/ClozeHint}}
"""
# Sentence cloze, answer side: the word and its audio.
VOCAB_BACK_CLOZE = """
{{FrontSide}}
{{Word}}
{{#Audio}}{{Audio}}
{{/Audio}}
"""
# Note type for vocabulary cards.
# The field ORDER is part of the contract: build_vocab_deck passes a positional
# ``fields=[…]`` list in exactly this order and its diagnostics read fields by
# index (11 = Emoji … 16 = ClozeExample). Append new fields at the end or
# update both places together.
VOCAB_MODEL = genanki.Model(
    VOCAB_MODEL_ID,
    "Hebrew Flash Cards",
    fields=[
        {"name": "Word"},
        {"name": "Root"},
        {"name": "PoS"},
        {"name": "Meaning"},
        {"name": "WordNoNikkud"},
        {"name": "SharedRoots"},
        {"name": "Tags"},
        {"name": "Audio"},
        {"name": "Example"},
        {"name": "Frequency"},
        {"name": "Image"},
        {"name": "Emoji"},
        {"name": "Prep"},
        {"name": "Hint"},
        {"name": "Plural"},
        {"name": "Gender"},
        {"name": "ClozeExample"},
        {"name": "ClozeHint"},
    ],
    templates=[
        {
            # ord 0 — matches Nevo's original "Card 2" (Eng→Heb)
            "name": "English → Hebrew",
            "qfmt": VOCAB_FRONT_ENG,
            "afmt": VOCAB_BACK_ENG,
        },
        {
            # ord 1 — matches Nevo's original "Card 3" (Heb→Eng)
            "name": "Hebrew → English",
            "qfmt": VOCAB_FRONT_HEB,
            "afmt": VOCAB_BACK_HEB,
        },
        {
            # ord 2 — Cloze-style sentence fill-in (only generated when ClozeExample is non-empty)
            "name": "Sentence Cloze",
            "qfmt": VOCAB_FRONT_CLOZE,
            "afmt": VOCAB_BACK_CLOZE,
        },
    ],
    css=CARD_CSS,
)
# ──────────────────────────────────────────────────────────────────────────────
# Conjugation Deck
# ──────────────────────────────────────────────────────────────────────────────
# Conjugation drill, question side: "how do you say" + pronoun + infinitive
# (with optional preposition and voice label in parentheses) + tense.
CONJ_FRONT = """
אֵיךְ אוֹמְרִים
{{Pronoun}}
{{Infinitive}}{{#Prep}} ({{Prep}}){{/Prep}}{{#Voice}} ({{Voice}}){{/Voice}}
{{Tense}}
"""
# Conjugation drill, answer side: the conjugated form, optional audio and
# meaning, then root, binyan and the optional related-vocabulary section.
# Root and Binyan are rendered unconditionally (no {{#…}} guard).
CONJ_BACK = """
{{FrontSide}}
{{ConjugatedForm}}{{#Prep}} ({{Prep}}){{/Prep}}
{{#Audio}}{{Audio}}
{{/Audio}}
{{#Meaning}}{{Meaning}}
{{/Meaning}}
שֹׁרֶשׁ:{{Root}}
בִּנְיָן:{{Binyan}}
{{#RelatedVocab}}
{{RelatedVocab}}
{{/RelatedVocab}}
"""
# The conjugation note type reuses the shared stylesheet unchanged.
CONJ_CSS = CARD_CSS
# Note type for conjugation drill cards: one template, twelve fields.
# The ReferenceForm field is declared but not referenced by either template.
CONJ_MODEL = genanki.Model(
    CONJ_MODEL_ID,
    "Hebrew Conjugation",
    fields=[
        {"name": "Infinitive"},
        {"name": "ReferenceForm"},
        {"name": "Pronoun"},
        {"name": "Tense"},
        {"name": "ConjugatedForm"},
        {"name": "Root"},
        {"name": "Binyan"},
        {"name": "Voice"},
        {"name": "Audio"},
        {"name": "Meaning"},
        {"name": "RelatedVocab"},
        {"name": "Prep"},
    ],
    templates=[
        {
            "name": "Conjugation Drill",
            "qfmt": CONJ_FRONT,
            "afmt": CONJ_BACK,
        }
    ],
    css=CONJ_CSS,
)
# Present-tense expansion: each form key → list of (pronoun, tense_label).
# Every present-tense form key maps to three pronouns (1st, 2nd and 3rd person
# of that gender/number), all carrying the same present-tense label.
PRESENT_EXPANSION = {
    "present_ms": [
        ("אֲנִי (זָכָר)", "הוֹוֶה"),
        ("אַתָּה", "הוֹוֶה"),
        ("הוּא", "הוֹוֶה"),
    ],
    "present_fs": [
        ("אֲנִי (נְקֵבָה)", "הוֹוֶה"),
        ("אַתְּ", "הוֹוֶה"),
        ("הִיא", "הוֹוֶה"),
    ],
    "present_mp": [
        ("אֲנַחְנוּ (זָכָר)", "הוֹוֶה"),
        ("אַתֶּם", "הוֹוֶה"),
        ("הֵם", "הוֹוֶה"),
    ],
    "present_fp": [
        ("אֲנַחְנוּ (נְקֵבָה)", "הוֹוֶה"),
        ("אַתֶּן", "הוֹוֶה"),
        ("הֵן", "הוֹוֶה"),
    ],
}
# Modern Hebrew: 2fp/3fp future and imperative default to mp form (classical in parens).
# Maps each feminine-plural form key to the masculine-plural key used instead.
FP_MODERN_FALLBACK = {
    "future_2fp": "future_2mp",
    "future_3fp": "future_3mp",
    "imperative_fp": "imperative_mp",
}
# 3rd person plural past: same form for m/f — generate two separate pronoun cards.
# Entries are (pronoun, tense_label) pairs, the same shape as the
# PRESENT_EXPANSION values.
PAST_3P_EXPANSION = [
    ("הֵם", "עָבָר"),
    ("הֵן", "עָבָר"),
]
# Tense labels with "בְּ" prefix for display on cards.
# Keys are the Hebrew tense labels found in the data. TENSE_KEY_MAP accepts two
# spellings of the imperative label, so both are listed here as well; otherwise
# a form tagged with the shorter spelling would miss this lookup while still
# being recognised as an imperative elsewhere.
TENSE_WITH_BE = {
    "עָבָר": "בֶּעָבָר",
    "הוֹוֶה": "בַּהוֹוֶה",
    "עָתִיד": "בֶּעָתִיד",
    "צִוּוּי": "בַּצִּוּוּי",
    "צִיּוּוּי": "בַּצִּוּוּי",  # alternate spelling
}
# Voice field: passive label only (shown inline on card front for Pu'al/Huf'al).
# build_conj_deck looks the binyan up with ``.get(binyan, "")``, so every
# binyan not listed here yields an empty Voice field.
VOICE_MAP = {
    "Pu'al": "סָבִיל",
    "Huf'al": "סָבִיל",
}
# Tense Hebrew label → English key prefix (for form_key construction).
# _forms_list_to_dict joins this prefix with the form's "person" code to get
# keys such as "past_1s"; labels missing from this map are used verbatim.
TENSE_KEY_MAP = {
    "עָבָר": "past",
    "הוֹוֶה": "present",
    "עָתִיד": "future",
    "צִוּוּי": "imperative",
    "צִיּוּוּי": "imperative",  # alternate spelling
}
# ──────────────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────────────
def _load_words() -> dict[str, dict]:
    """Read ``words.json`` from DATA_DIR and return the parsed word store."""
    words_file = DATA_DIR / "words.json"
    return json.loads(words_file.read_text(encoding="utf-8"))
def _audio_tag(word_no_nikkud: str, audio_dir: Path = AUDIO_DIR, slug: str = "") -> str:
    """Build the Anki ``[sound:…]`` tag for a word, or ``""`` when no mp3 exists.

    Two file stems are probed in ``audio_dir``, in this order:

    1. ``slug`` (used for confusable words), when one is given;
    2. the Hebrew letters of ``word_no_nikkud`` (every character outside
       U+05D0–U+05EA is dropped), when any letters remain.

    The first stem whose ``<stem>.mp3`` file exists wins.
    """
    stems = [slug] if slug else []
    letters_only = "".join(ch for ch in word_no_nikkud if "\u05d0" <= ch <= "\u05ea")
    if letters_only:
        stems.append(letters_only)
    for stem in stems:
        candidate = audio_dir / f"{stem}.mp3"
        if candidate.exists():
            return f"[sound:{candidate.name}]"
    return ""
def _conj_audio_tag(slug: str, form_key: str) -> str:
"""Return [sound:xxx.mp3] for conjugation audio if downloaded."""
filename = f"{slug}_{form_key}.mp3"
mp3_path = AUDIO_CONJ_DIR / filename
if mp3_path.exists():
return f"[sound:{filename}]"
return ""
# Keywords excluded when building emoji lookup AND matching meaning text.
# Curated from manual review of all 2,261 emoji-word pairs (Sprint 8).
# Consulted in two places: _load_emoji_lookup (keywords taken from emoji
# descriptions) and build_vocab_deck (keywords taken from a word's meaning).
# NOTE(review): both call sites also require ``len(word) > 2`` before checking
# this set, so the one- and two-letter entries below ("to", "be", "a", …)
# can never be the deciding filter; they are harmless but redundant.
_EMOJI_STOP = frozenset(
    {
        # Basic stop words
        "to",
        "be",
        "a",
        "an",
        "the",
        "of",
        "in",
        "on",
        "at",
        "for",
        "and",
        "with",
        "by",
        "or",
        "but",
        "not",
        "as",
        "its",
        # Generic emoji description words (too vague)
        "face",
        "hand",
        "sign",
        "symbol",
        "button",
        "small",
        "large",
        "light",
        "dark",
        "open",
        "closed",
        # Numbers → clock emoji (🕐🕑🕒 etc.)
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
        "ten",
        "hundred",
        "thousand",
        # UI/media buttons (⏭️▶️⏪⏯️🔁🔚🔜⏺️)
        "next",
        "fast",
        "play",
        "pause",
        "repeat",
        "end",
        "soon",
        "record",
        # Abstract words → misleading object emoji
        "part",
        "place",
        "mark",
        "post",
        "department",
        "store",
        "note",
        "control",
        "level",
        "stop",
        "cover",
        "roll",
        "rolling",
        "pick",
        "over",
        "right",
        "way",
        "skin",
        "drop",
        "middle",
        "piece",
        "section",
        # Country/direction words → flag emoji (🇰🇵🇬🇸🇮🇴🇻🇦🇨🇫 etc.)
        "north",
        "south",
        "northern",
        "southern",
        "western",
        "eastern",
        "central",
        "territory",
        "kingdom",
        "united",
        "virgin",
        # Common words producing bad emoji matches
        "new",
        "big",
        "full",
        "last",
        "first",
        "double",
        "slightly",
        "without",
        "from",
        "behind",
        "people",
        "position",
        "status",
        "situation",
        "game",
        "call",
        "trade",
        "male",
        "female",
        "person",
        "letter",
        # Polysemous words → wrong emoji sense
        "french",
        "fried",
        "board",
        "bow",
        "water",
        "union",
        "rock",
        "left",
        "back",
        "crane",
        "dash",
        "bar",
        "wheel",
        "horizontal",
        # Polysemous keywords producing wrong-sense emoji (Sprint 17 audit)
        "high",  # ⚡ high voltage, not "tall"
        "down",  # 🫳 palm down, not "descend"
        "off",  # 📴 phone off, not "remove"
        "away",  # 💨 dashing away, not "depart"
        "together",  # 🤲 palms together, not "unite"
        "top",  # 🎩 top hat, not "upper"
        "low",  # 🔈 low volume, not "short"
        "flat",  # 🥿 ballet flat, not "apartment"
        "soft",  # 🍦 soft serve, not "quiet"
        "broken",  # 💔 broken heart, not "damaged"
        "round",  # 📍 round pushpin, not "circular"
        "cool",  # 🆒 COOL button, not "cold"
        "free",  # 🆓 FREE button, not "liberated"
        "long",  # 🪘 long drum, not "lengthy"
        "straight",  # 📏 straight ruler, not "direct"
        "empty",  # 🪹 empty nest, not "void"
        "hot",  # 🥵 hot face, not "warm"
        "cross",  # ✝️ latin cross, not "intersect"
        "bright",  # 🔆 bright button, not "luminous"
        "old",  # 👴 old man, not "aged"
        "head",  # 🙂↔️ shaking head, not "leader"
        # Category words that match generic emoji
        "military",  # 🎖️ military medal for any military term
        "sports",  # 🏅 sports medal for any sports term
        "food",  # 😋 yummy face for any food term
        "city",  # 🇻🇦 Vatican flag for any city
        "china",  # 🇨🇳 China flag for "porcelain"
        "polish",  # 💅 nail polish for "to polish/shine"
        "aid",  # 🦻 hearing aid for "to help"
        "office",  # 🧑💼 office worker for "bureau"
        "construction",  # 🏛️ classical building, not construction
        "cinema",  # 🎦 cinema emoji for any film term
        "ceremony",  # 🎑 moon ceremony for any ceremony
        "building",  # 🏛️ classical building for any structure
        # Body parts / human features → wrong emoji
        "arm",  # 🦾 mechanical arm for "to arm"
        "hair",  # 👱 blond person for "hair"
        "nose",  # 😤 steam from nose
        "tongue",  # 😛 tongue-out face
        "chest",  # not a chest
        "eyes",  # 😃 face with eyes
        # Abstract/vague words
        "fear",  # 😱 screaming face
        "anger",  # 💢 anger symbol
        "angry",  # 😠 angry face
        "tired",  # 😫 tired face
        "sad",  # 😥 sad face
        "joy",  # 😂 tears of joy
        "love",  # 💌 love letter
        "cold",  # 🥶 cold face
        "pile",  # 💩 pile of poo
        "man",  # 👨 man
        "woman",  # 👩 woman
        "boy",  # 👦 boy
        "girl",  # 👧 girl
        "baby",  # 👶 baby
        "children",  # 🚸 children crossing
        "student",  # 🧑🎓 student
        "adult",  # 🧑🧑🧒 family
        "name",  # 📛 name badge
        "check",  # ✅ check mark
        "line",  # 🫥 dotted line face
        "floor",  # 🤣 ROFL (rolling on floor)
        "room",  # 🧖 person in steamy room
        "bubble",  # 👁️🗨️ speech bubble
        "car",  # 🚃 railway car, not automobile
        "bullet",  # 🚅 bullet train
        "steam",  # 😤 face with steam
        "fly",  # 🪰 the insect, not the verb
        "plant",  # 🪴 potted plant for all "X (plant)" entries
        "tree",  # 🌲 evergreen for all "X (tree)" entries
        "ball",  # ⛹️ person bouncing ball
        "bag",  # 👝 clutch bag
        "fight",  # not a fight
        "cloud",  # not a cloud
        "video",  # 🎮 video game, not video
        "rescue",  # ⛑️ rescue worker helmet
        "exchange",  # 💱 currency exchange
        "cut",  # 🥩 cut of meat, not "to cut"
        "key",  # 🔐 locked with key
        "walking",  # 🚶 person walking
        "running",  # 🏃 person running
        "climbing",  # 🧗 person climbing
        "speaking",  # 🗣️ speaking head
        "playing",  # 🤽 person playing
        "feeding",  # 👩🍼 person feeding
        "shooting",  # 🌠 shooting star
        "clapping",  # 👏 clapping hands
        "cooking",  # 🍳 cooking emoji
        "holding",  # 🥹 face holding back tears
        # More wrong-sense matches from remaining audit
        "paper",  # 🏮 red lantern for "paper"
        "track",  # 🛤️ railroad for "track record"
        "vertical",  # 🚦 traffic light for "vertical"
        "speaker",  # 🔇 muted speaker for "speaker (person)"
        "square",  # 🟥 red square for "plaza"
        "wrapped",  # 🎁 gift for "wrapped, bound"
        "volume",  # 🔈 speaker for "volume (book)"
        "mobile",  # 📱 phone for "mobile, moveable"
        "flash",  # 📸 camera flash for "to shine"
        "identification",  # 🪪 ID card for "locating"
        "service",  # 🐕🦺 service dog for "service, term"
        "ground",  # ⛱️ umbrella on ground
        "machine",  # 🎰 slot machine for "mechanism"
        "liquid",  # 🫗 pouring for "liquid, drop"
        "vehicle",  # 🚙 SUV for any vehicle mention
        "window",  # 🪟 window pane for "window, gap"
        "information",  # ℹ️ info symbol
        "child",  # 🧒 child emoji
    }
)
def _load_emoji_lookup() -> dict[str, str]:
    """Load or fetch the Unicode emoji keyword→character lookup.

    On a cache hit the mapping is read from ``emoji_lookup.json`` in DATA_DIR.
    Otherwise unicode.org's ``emoji-test.txt`` is downloaded and parsed: for
    every ``fully-qualified`` line, each word of the emoji's description that
    is longer than two characters and not in _EMOJI_STOP becomes a key, and
    the first emoji seen for a keyword wins. The result is then written to
    the cache file.

    Returns:
        ``{keyword: emoji_char}``; an empty dict if the download fails
        (safe fallback — nothing is cached in that case).
    """
    cache_file = DATA_DIR / "emoji_lookup.json"
    if cache_file.exists():
        # The cache stores raw emoji (ensure_ascii=False), so it must be read
        # as UTF-8 rather than with the platform's locale encoding.
        with open(cache_file, encoding="utf-8") as f:
            return json.load(f)

    import requests  # local import: only needed on a cache miss

    try:
        resp = requests.get(
            "https://unicode.org/Public/emoji/latest/emoji-test.txt",
            timeout=30,
        )
        resp.raise_for_status()
    except Exception as e:
        # Deliberately broad: any failure here just disables the emoji fallback.
        logger.warning("Could not fetch emoji data: %s. Emoji lookup disabled.", e)
        return {}

    # emoji-test.txt is UTF-8. Without an explicit charset header, requests
    # decodes text/* bodies as ISO-8859-1, which would garble every emoji.
    resp.encoding = "utf-8"

    lookup: dict[str, str] = {}
    for line in resp.text.splitlines():
        if "fully-qualified" not in line:
            continue
        # Trailing comment format: "# <emoji> E<version> <description>"
        m = re.search(r"#\s+(\S+)\s+E[\d.]+\s+(.+)", line)
        if not m:
            continue
        emoji_char = m.group(1)
        desc = m.group(2).lower().strip()
        for word in desc.split():
            word = word.strip(".,'\"-")
            if len(word) > 2 and word not in _EMOJI_STOP and word not in lookup:
                lookup[word] = emoji_char

    # Write with an explicit encoding for the same reason as the read above;
    # the locale default cannot represent emoji on some platforms.
    cache_file.write_text(json.dumps(lookup, ensure_ascii=False), encoding="utf-8")
    logger.info("Built emoji lookup: %d keywords → %s", len(lookup), cache_file)
    return lookup
def _categorize_pos(pos_str: str) -> str:
    """Map a raw part-of-speech string to its grouping category.

    Only the text before the first en dash or em dash is considered, with
    surrounding whitespace removed. If that text is one of the
    POS_CATEGORY_LABELS keys it is returned; otherwise ``"Other"``.
    """
    head, _, _ = pos_str.partition("–")
    head, _, _ = head.partition("—")
    base = head.strip()
    return base if base in POS_CATEGORY_LABELS else "Other"
def _forms_list_to_dict(forms_list: list[dict]) -> dict[str, dict]:
    """Re-key a schema ``active_forms`` list for the card generator.

    Each output key is ``<tense>_<person>`` (e.g. ``past_1s``, ``present_ms``,
    ``future_2mp``, ``imperative_fs``), where the tense part is the
    TENSE_KEY_MAP translation of the form's Hebrew tense label, or the label
    itself when it has no translation. When two forms produce the same key,
    the later one replaces the earlier.

    Each value holds: ``form`` (nikkud spelling), ``form_ktiv`` (ktiv male,
    ``""`` if absent), ``pronoun`` (Hebrew pronoun string, ``""`` if absent),
    ``tense`` (the original Hebrew label), ``audio_url`` (``""`` if absent),
    ``guid`` and ``guid_candidates`` (``None`` if absent).
    """
    keyed: dict[str, dict] = {}
    for item in forms_list:
        hebrew_tense = item["tense"]
        tense_prefix = TENSE_KEY_MAP.get(hebrew_tense, hebrew_tense)
        form_key = f"{tense_prefix}_{item['person']}"
        spelling = item["form"]
        keyed[form_key] = dict(
            form=spelling["nikkud"],
            form_ktiv=spelling.get("ktiv_male", ""),
            pronoun=item.get("pronoun_hebrew", ""),
            tense=hebrew_tense,
            audio_url=item.get("audio_url", ""),
            guid=item.get("guid"),
            guid_candidates=item.get("guid_candidates"),
        )
    return keyed
# Hebrew prefix letters (אותיות השימוש): בהוכלמש
_PREFIX_LETTERS = frozenset("בהוכלמש")
def _cloze_prefix_len(cloze_token: str, word_nikkud: str) -> int:
"""Return the number of characters in the cloze token that are prefix (not part of the word).
For "בַּתּוֹר" with word_nikkud "תּוֹר", returns 2 (ב + patach = 2 chars).
Returns 0 if the token starts with the word directly.
"""
if not word_nikkud or not cloze_token:
return 0
# If the token starts with the word nikkud, no prefix
if cloze_token.startswith(word_nikkud):
return 0
# Check if word nikkud appears as a suffix of the token
idx = cloze_token.find(word_nikkud)
if idx > 0:
# Verify prefix chars are valid Hebrew prefix letters
prefix_part = cloze_token[:idx]
base_letters = [c for c in prefix_part if "\u05d0" <= c <= "\u05ea"]
if base_letters and all(c in _PREFIX_LETTERS for c in base_letters):
return idx
return 0
def build_vocab_deck(
    words: dict[str, dict],
    limit: int | None = None,
    include_audio: bool = True,
    include_images: bool = True,
    emoji_lookup: dict | None = None,
) -> tuple[genanki.Deck, list[Path]]:
    """Build the vocabulary deck from the unified words dict.

    Entries are sorted by effective frequency, cleaned, de-duplicated on
    (nikkud word, cleaned meaning) and turned into one VOCAB_MODEL note each.
    Entries with an empty word or an empty cleaned meaning are skipped.

    Args:
        words: Unified data dict keyed by unique_key (from words.json).
        limit: If set, only process the first N entries (by frequency).
            Tested for truthiness, so ``0`` behaves like ``None`` (no limit).
        include_audio: Whether to include audio tags in notes.
        include_images: Whether to include image tags in notes.
        emoji_lookup: Optional Unicode emoji keyword→char mapping for fallback emoji.

    Returns:
        (deck, list_of_media_files) — the media list holds the audio and image
        paths referenced by the notes, each path once.
    """
    logger.info(f"Building vocabulary deck from {len(words)} words …")
    images_dir = DATA_DIR / "images"
    # Build word_unique_key → pos_category dict for related-words grouping
    # NOTE(review): word_to_pos_cat is filled here but not read again anywhere
    # in this function — confirm whether it is still needed.
    word_to_pos_cat: dict[str, str] = {}
    for unique_key, entry in words.items():
        pos_raw = entry.get("pos", "")
        if pos_raw:
            word_to_pos_cat[unique_key] = _categorize_pos(pos_raw)
        # Also index by nikkud word (for shared_roots lookup by nikkud form)
        word_nikkud = entry["word"]["nikkud"]
        if word_nikkud not in word_to_pos_cat:
            word_to_pos_cat[word_nikkud] = _categorize_pos(pos_raw) if pos_raw else "Other"

    # Sort entries by effective frequency (pseudo_frequency for confusables,
    # else regular frequency; null → 999999), applying limit after sort
    def _freq_key(item: tuple[str, dict]) -> int:
        e = item[1]
        # ``or`` chaining: a missing, None or 0 value falls through to the next.
        return e.get("pseudo_frequency") or e.get("frequency") or 999_999

    sorted_entries = sorted(words.items(), key=_freq_key)
    if limit:
        sorted_entries = sorted_entries[:limit]
    deck = genanki.Deck(VOCAB_DECK_ID, "Hebrew Vocabulary")
    media_files: list[Path] = []
    seen_words: set[tuple[str, str]] = set()
    for _unique_key, entry in sorted_entries:
        word_nikkud = entry["word"]["nikkud"]
        word_no_nik = entry["word"].get("ktiv_male", "")
        root_list = entry.get("root") or []
        root = ".".join(root_list)
        pos_raw = entry.get("pos", "")
        pos_heb = entry.get("pos_hebrew", "")
        # Meaning clean-up, in order: drop emoji, drop parenthesised Hebrew,
        # drop remaining Hebrew runs, then repair the punctuation left behind.
        meaning = EMOJI_RE.sub("", entry.get("meaning", "") or "").strip()
        meaning = HBPAREN_RE.sub("", meaning).strip()
        # Strip remaining bare Hebrew text (spoiler: ktiv male visible in English meaning)
        meaning = re.sub(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*", "", meaning)
        meaning = re.sub(r"[;:]\s*—", " —", meaning)  # clean "; —" → " —"
        meaning = re.sub(r";\s*:", ";", meaning)  # clean "; :" → ";"
        meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ;:")
        meaning = re.sub(r"(\w)\(", r"\1 (", meaning)  # space before opening paren
        meaning = re.sub(r",(\S)", r", \1", meaning)  # space after comma
        slug = entry.get("slug", "") or ""
        # Display frequency uses "frequency" only (not pseudo_frequency).
        frequency = entry.get("frequency") or 999_999
        audio_file = entry.get("audio_file", "") or ""
        tags_str = entry.get("tags", "") or ""
        hint_str = entry.get("hint", "") or ""
        shared_roots_keys = entry.get("shared_roots") or []
        is_confusable = entry.get("confusable_group") is not None
        if not word_nikkud or not meaning:
            continue
        # Skip exact duplicates (same word AND same meaning)
        word_meaning_key = (word_nikkud, meaning)
        if word_meaning_key in seen_words:
            logger.debug(f" Skipping duplicate word+meaning: {word_nikkud}")
            continue
        seen_words.add(word_meaning_key)
        # Frequency display label
        if frequency <= 500:
            freq_display = f"Core #{frequency}"
        elif frequency <= 1500:
            freq_display = f"Essential #{frequency}"
        elif frequency <= 3000:
            freq_display = f"Intermediate #{frequency}"
        elif frequency <= 5000:
            freq_display = f"Upper-intermediate #{frequency}"
        elif frequency <= 10000:
            freq_display = f"Advanced #{frequency}"
        elif frequency < 999_999:
            freq_display = f"Rare #{frequency}"
        else:
            freq_display = "Unlisted"
        # Emoji: use entry's emoji if emoji_visible, else fall back to emoji_lookup.
        # Skip fallback for verbs — keyword matching on verb definitions produces
        # wrong-sense emoji (e.g. "to cut" → 🥩, "to arm" → 🦾).
        emoji_str = ""
        if entry.get("emoji_visible") and entry.get("emoji"):
            emoji_str = entry["emoji"]
        elif emoji_lookup and not meaning.startswith("to "):
            meaning_clean_for_emoji = EMOJI_RE.sub("", meaning).strip()
            # Only the first five words of the meaning are tried; first hit wins.
            for kw in re.sub(r"[^\w\s]", " ", meaning_clean_for_emoji.lower()).split()[:5]:
                if len(kw) > 2 and kw not in _EMOJI_STOP and kw in emoji_lookup:
                    emoji_str = emoji_lookup[kw]
                    break
        # Hebrew prepositions — extracted upstream by list scraper
        entry_prep = entry.get("prep")
        # Each whitespace-separated preposition is wrapped in its own parentheses.
        prep_str = " ".join(f"({p})" for p in entry_prep.split()) if entry_prep else ""
        # Audio — use audio_file from entry; for confusables it's already slug-based
        audio_tag = ""
        if include_audio and audio_file:
            mp3_path = AUDIO_DIR / audio_file
            if mp3_path.exists():
                audio_tag = f"[sound:{audio_file}]"
                if mp3_path not in media_files:
                    media_files.append(mp3_path)
            else:
                # Fallback: try consonant-based filename
                audio_tag = _audio_tag(word_no_nik, slug=slug if is_confusable else "")
                if audio_tag:
                    # Recover the file name from the "[sound:…]" tag.
                    mp3_name = audio_tag.removeprefix("[sound:").removesuffix("]")
                    mp3_path_fb = AUDIO_DIR / mp3_name
                    if mp3_path_fb not in media_files:
                        media_files.append(mp3_path_fb)
        elif include_audio:
            # No audio_file recorded: same fallback lookup as above.
            audio_tag = _audio_tag(word_no_nik, slug=slug if is_confusable else "")
            if audio_tag:
                mp3_name = audio_tag.removeprefix("[sound:").removesuffix("]")
                mp3_path_fb = AUDIO_DIR / mp3_name
                if mp3_path_fb not in media_files:
                    media_files.append(mp3_path_fb)
        # Example sentence from vetted examples
        example_html = ""
        examples = entry.get("examples") or {}
        if examples.get("vetted"):
            example_html = examples["vetted"][0]["text"]
        # Cloze: use pre-computed cloze from words.json
        cloze_example = ""
        cloze_hint = ""
        if not is_confusable and examples.get("cloze"):
            cloze_data = examples["cloze"]
            cloze_text = cloze_data.get("text", "")
            start = cloze_data.get("cloze_word_start")
            end = cloze_data.get("cloze_word_end")
            if cloze_text and start is not None and end is not None:
                # Preserve Hebrew prefix letters in the cloze blank
                # e.g. "בַּתּוֹר" for word "תּוֹר" → "בַּ_____" not "_____"
                cloze_token = cloze_text[start:end]
                prefix_chars = _cloze_prefix_len(cloze_token, word_nikkud)
                cloze_example = cloze_text[: start + prefix_chars] + "_____" + cloze_text[end:]
                # Clean up duplicate adjacent quotation marks (e.g. "" → ")
                cloze_example = re.sub(r'["״]\s*["״]', '"', cloze_example)
                raw_hint = cloze_data.get("cloze_hint") or ""
                if raw_hint:
                    cloze_hint = raw_hint
                else:
                    # No stored hint: fall back to the meaning, with the Hebrew
                    # part-of-speech label appended for verbs.
                    pos_cat = _categorize_pos(pos_raw) if pos_raw else "Other"
                    cloze_hint = meaning
                    if pos_cat == "Verb" and pos_heb:
                        cloze_hint = f"{meaning} ({pos_heb})"
        # Related words (shared roots) as a table: word — meaning, sorted by frequency
        related_html = ""
        if shared_roots_keys:
            rw_items: list[tuple[int, str, str]] = []  # (sort_key, nikkud, meaning)
            for rw_key in shared_roots_keys:
                rw_entry = words.get(rw_key)
                if rw_entry:
                    rw_nikkud = rw_entry["word"]["nikkud"]
                    # Raw meaning (not the cleaned one), truncated to 40 chars.
                    rw_meaning = rw_entry.get("meaning") or ""
                    if len(rw_meaning) > 40:
                        rw_meaning = rw_meaning[:37] + "…"
                    rw_freq = rw_entry.get("frequency") or 999999
                else:
                    # Key not found in words: show the key itself, sorted last.
                    rw_nikkud = rw_key
                    rw_meaning = ""
                    rw_freq = 999999
                rw_items.append((rw_freq, rw_nikkud, rw_meaning))
            rw_items.sort(key=lambda x: x[0])
            rows_html: list[str] = []
            for _freq, rw_nikkud, rw_meaning in rw_items:
                # NOTE(review): the string pieces below look damaged — the first
                # literal is empty and the last one is split across two lines,
                # which is not valid Python. Presumably HTML wrapper markup was
                # lost here; restore it from version control before running.
                rows_html.append(
                    f''
                    f'{rw_nikkud}'
                    f'{rw_meaning}'
                    f"
                    "
                )
            related_html = "\n".join(rows_html)
        # Plural form and gender (nouns only)
        plural_str = ""
        gender_str = ""
        if pos_raw.startswith("Noun"):
            noun_inflection = entry.get("noun_inflection")
            if noun_inflection:
                if noun_inflection.get("plural"):
                    plural_str = noun_inflection["plural"].get("nikkud", "")
                gender_raw = noun_inflection.get("gender") or ""
                if gender_raw == "masculine":
                    gender_str = "זָכָר"
                elif gender_raw == "feminine":
                    gender_str = "נְקֵבָה"
        # Image
        image_tag = ""
        if include_images:
            image_filename = entry.get("image") or ""
            if image_filename:
                image_path = images_dir / image_filename
                if image_path.exists():
                    # The field stores the bare file name, not an <img> element.
                    image_tag = image_filename
                    if image_path not in media_files:
                        media_files.append(image_path)
        # GUID: use vocab_legacy_guid from entry, fall back to deterministic
        legacy_guid = entry.get("vocab_legacy_guid")
        note_guid = legacy_guid or genanki.guid_for(word_nikkud, meaning)
        note = genanki.Note(
            model=VOCAB_MODEL,
            guid=note_guid,
            # Positional: must stay in the same order as VOCAB_MODEL's fields.
            fields=[
                word_nikkud,
                root,
                pos_heb,
                meaning,
                word_no_nik,
                related_html or "",
                tags_str,
                audio_tag,
                example_html,
                freq_display,
                image_tag,
                emoji_str,
                prep_str,
                hint_str,
                plural_str,
                gender_str,
                cloze_example,
                cloze_hint,
            ],
            # Tags: the entry's own tags, the release tag, and "freq::<band>"
            # where <band> is the first word of the frequency label.
            tags=(tags_str.split() if tags_str else [])
            + [RELEASE_TAG]
            + [f"freq::{freq_display.split()[0]}" if freq_display != "Unlisted" else "freq::Unlisted"],
        )
        deck.add_note(note)
    # Diagnostics
    # Field indices follow VOCAB_MODEL: 11 Emoji, 12 Prep, 13 Hint, 14 Plural,
    # 15 Gender, 16 ClozeExample.
    emoji_count = sum(1 for n in deck.notes if n.fields[11])
    prep_count = sum(1 for n in deck.notes if n.fields[12])
    hint_count = sum(1 for n in deck.notes if n.fields[13])
    plural_count = sum(1 for n in deck.notes if n.fields[14])
    gender_count = sum(1 for n in deck.notes if n.fields[15])
    cloze_count = sum(1 for n in deck.notes if n.fields[16])
    # Counted over all sorted entries, including ones skipped as empty/duplicate.
    unlisted = sum(1 for _, e in sorted_entries if (e.get("frequency") or 999_999) >= 999_999)
    if emoji_count:
        logger.info(f" Emoji extracted: {emoji_count} words")
    if prep_count:
        logger.info(f" Hebrew prepositions extracted: {prep_count} words")
    if hint_count:
        logger.info(f" Eng→Heb hints: {hint_count} words")
    if plural_count:
        logger.info(f" Noun plurals on vocab cards: {plural_count} words")
    if gender_count:
        logger.info(f" Noun gender on vocab cards: {gender_count} words")
    if cloze_count:
        logger.info(f" Sentence cloze cards: {cloze_count} words")
    logger.info(f" Unlisted words (not in frequency corpus): {unlisted}/{len(sorted_entries)}")
    logger.info(f"Vocabulary deck: {len(deck.notes)} notes")
    return deck, media_files
def build_conj_deck(
    words: dict[str, dict],
    audio_dir: Path = AUDIO_CONJ_DIR,
    include_audio: bool = True,
) -> tuple[genanki.Deck, list[Path]]:
    """Build the conjugation drill deck from words with in_conjugation_deck=True.

    For every entry whose ``conjugation`` dict has ``in_conjugation_deck`` set
    and a non-empty ``active_forms`` list, one note is emitted per form (the
    infinitive is never quizzed).  When ``hufal_pual_forms`` is present the
    passive partner forms are emitted the same way, with the passive reference
    form, binyan label and voice substituted.  Forms that contain no Hebrew
    letter are skipped.

    Note GUIDs come from the form's stored ``guid``, then the first entry of
    ``guid_candidates``, then ``genanki.guid_for(infinitive, pronoun, tense,
    binyan)``.

    Args:
        words: Mapping of unique key → word entry.  Entries with a root must
            carry ``word.nikkud``; verbs must carry ``conjugation.infinitive``
            and ``conjugation.reference_form``.
        audio_dir: Directory used to build ``{slug}_{form_key}.mp3`` paths
            (``{slug}_passive_{form_key}.mp3`` for passive forms).
        include_audio: When true, attach audio tags and collect mp3 paths.

    Returns:
        ``(deck, media_files)`` — the populated deck and the list of audio
        paths referenced by its notes, without duplicates, in first-use order.
    """
    deck = genanki.Deck(CONJ_DECK_ID, "Hebrew Conjugations")
    media_files: list[Path] = []
    seen_media: set[Path] = set()  # O(1) duplicate check for media_files
    note_count = 0
    verb_count = 0
    hebrew_letter_re = re.compile(r"[\u05d0-\u05ea]")
    first_person_keys = {"past_1s", "past_1p", "future_1s", "future_1p"}

    def root_lookup_key(entry: dict) -> str:
        """Key used for BOTH storing and querying the root cross-link table."""
        return " ".join(entry.get("root") or [])

    # Build root → [(freq, nikkud, meaning)] lookup for cross-linking
    root_words: dict[str, list[tuple[int, str, str]]] = {}
    for entry in words.values():
        root_key = root_lookup_key(entry)
        if not root_key:
            continue
        rw_meaning = entry.get("meaning") or ""
        if len(rw_meaning) > 40:
            rw_meaning = rw_meaning[:37] + "…"
        rw_freq = entry.get("frequency") or 999999
        root_words.setdefault(root_key, []).append((rw_freq, entry["word"]["nikkud"], rw_meaning))

    def add_note(
        verb: dict[str, str],
        pronoun: str,
        tense: str,
        conj_form: str,
        audio_tag: str,
        guid_val: str | None,
        guid_candidates: list[str] | None,
    ) -> None:
        """Append one conjugation note for ``verb`` unless the form has no Hebrew letters."""
        nonlocal note_count
        if not conj_form or not hebrew_letter_re.search(conj_form):
            return
        # Tense label shown on the card; labels missing from the map pass through.
        display_tense = TENSE_WITH_BE.get(tense, tense)
        # GUID: use stored guid, then first candidate, then deterministic fallback
        if guid_val:
            note_guid = guid_val
        elif guid_candidates:
            note_guid = guid_candidates[0]
        else:
            note_guid = genanki.guid_for(verb["infinitive"], pronoun, tense, verb["binyan_heb"])
        deck.add_note(
            genanki.Note(
                model=CONJ_MODEL,
                guid=note_guid,
                fields=[
                    verb["infinitive"],
                    verb["ref_form"],
                    pronoun,
                    display_tense,
                    conj_form,
                    verb["root"],
                    verb["binyan_heb"],
                    verb["voice"],
                    audio_tag,
                    verb["meaning"],
                    verb["related"],
                    verb["prep"],
                ],
                tags=[RELEASE_TAG],
            )
        )
        note_count += 1

    def add_form_notes(
        verb: dict[str, str],
        forms: dict[str, dict],
        rng: random.Random,
        slug: str,
        audio_prefix: str,
    ) -> None:
        """Emit one note per quizzable form in ``forms``.

        ``audio_prefix`` is prepended to the form key when resolving audio
        (``""`` for active forms, ``"passive_"`` for passive forms).
        """
        for form_key, form_data in forms.items():
            # Infinitive: shown on card front as reference — skip as a quiz form
            if form_key == "infinitive":
                continue
            conj_form = form_data.get("form", "")

            audio_tag = ""
            if include_audio and slug:
                audio_key = f"{audio_prefix}{form_key}"
                audio_tag = _conj_audio_tag(slug, audio_key)
                if audio_tag:
                    mp3_path = audio_dir / f"{slug}_{audio_key}.mp3"
                    if mp3_path not in seen_media:
                        seen_media.add(mp3_path)
                        media_files.append(mp3_path)

            pronoun = form_data.get("pronoun", "")
            tense = form_data.get("tense", "")
            if form_key in PRESENT_EXPANSION:
                # Present tense expansion: one card per form key (seeded RNG)
                chosen = rng.choice(PRESENT_EXPANSION[form_key])
                pronoun, tense = chosen[0], chosen[1]
            elif form_key == "past_3p":
                # Past 3rd plural: same form for m/f → 1 card (seeded RNG)
                chosen = rng.choice(PAST_3P_EXPANSION)
                pronoun, tense = chosen[0], chosen[1]
            elif form_key in FP_MODERN_FALLBACK:
                # 2fp/3fp future and imperative: modern (mp) form + classical (fp) in parens
                mp_form = forms.get(FP_MODERN_FALLBACK[form_key], {}).get("form", "")
                if mp_form and mp_form != conj_form:
                    conj_form = f"{mp_form} ({conj_form})"
            elif form_key in first_person_keys:
                # 1st-person forms get a gender label chosen by the per-verb RNG
                gender = rng.choice(["זָכָר", "נְקֵבָה"])
                pronoun = f"{pronoun} ({gender})"

            add_note(
                verb,
                pronoun,
                tense,
                conj_form,
                audio_tag,
                form_data.get("guid"),
                form_data.get("guid_candidates"),
            )

    for entry in words.values():
        conj = entry.get("conjugation")
        if not conj or not conj.get("in_conjugation_deck"):
            continue
        active_forms_list = conj.get("active_forms") or []
        if not active_forms_list:
            continue
        verb_count += 1

        infinitive = conj["infinitive"]["nikkud"]
        ref_form = conj["reference_form"]["nikkud"]
        binyan = conj.get("binyan", "")
        binyan_heb: str = conj.get("binyan_hebrew") or BINYAN_TO_HEBREW.get(binyan, binyan) or ""
        slug = entry.get("slug", "") or ""

        # Hebrew preposition, with surrounding parentheses/spaces removed
        prep_str = ""
        conj_prep = conj.get("prep") or entry.get("prep")
        if conj_prep:
            prep_str = conj_prep.strip("() ")

        # Other words sharing this root, most frequent first, at most eight.
        # The table must be queried with the same key it was built with; the
        # dotted form below is only the display value of the Root field.
        related = sorted(
            (item for item in root_words.get(root_lookup_key(entry), []) if item[1] != infinitive),
            key=lambda item: item[0],
        )
        # NOTE(review): each row is the word, its meaning and a newline, exactly
        # as the characters appear in this file; confirm the Related field's
        # expected format against the card template.
        related_str = "\n".join(f"{rw_nikkud}{rw_meaning}\n" for _freq, rw_nikkud, rw_meaning in related[:8])

        verb = {
            "infinitive": infinitive,
            "ref_form": ref_form,
            "root": ".".join(entry.get("root") or []),
            "binyan_heb": binyan_heb,
            "voice": VOICE_MAP.get(binyan, ""),
            "meaning": entry.get("meaning", "") or "",
            "related": related_str,
            "prep": prep_str,
        }

        # Seeded RNG per verb — deterministic pronoun/gender choices.
        # A str seed is hashed by random.seed itself, so the stream is the same
        # in every interpreter run; the built-in hash() of a str is salted per
        # process and would make the choices (and fallback GUIDs) unstable.
        verb_rng = random.Random(infinitive)

        add_form_notes(verb, _forms_list_to_dict(active_forms_list), verb_rng, slug, "")

        # Passive partner forms (Huf'al/Pu'al counterpart)
        hufal_forms_list = conj.get("hufal_pual_forms")
        if hufal_forms_list:
            ref_passive = conj.get("reference_form_passive")
            passive_binyan = "Huf'al" if binyan == "Hif'il" else "Pu'al"
            passive_verb = {
                **verb,
                "ref_form": ref_passive["nikkud"] if ref_passive else ref_form,
                "binyan_heb": BINYAN_TO_HEBREW.get(passive_binyan, passive_binyan),
                "voice": VOICE_MAP.get(passive_binyan, "סָבִיל"),
            }
            add_form_notes(
                passive_verb,
                _forms_list_to_dict(hufal_forms_list),
                verb_rng,
                slug,
                "passive_",
            )

    logger.info(f"Conjugation deck: {note_count} notes across {verb_count} verbs")
    return deck, media_files
# ──────────────────────────────────────────────────────────────────────────────
# Confusables deck — words that look identical without nikkud
# ──────────────────────────────────────────────────────────────────────────────
# Question-side template (``qfmt``) of the single "Confusable" card.
CONF_FRONT = """
"""
# Answer-side template (``afmt``): repeats the front side, then shows the
# Definitions field; the Audio field sits inside a {{#Audio}}…{{/Audio}} section.
CONF_BACK = """
{{FrontSide}}
{{Definitions}}
{{#Audio}}{{Audio}}
{{/Audio}}
"""
# Confusable cards reuse the CARD_CSS stylesheet.
CONF_CSS = CARD_CSS
# Note type for the confusables deck: four fields and one card template.
CONF_MODEL = genanki.Model(
    CONF_MODEL_ID,
    "Hebrew Confusables",
    fields=[
        {"name": field_name}
        for field_name in ("Words", "Definitions", "Audio", "WordNoNikkud")
    ],
    templates=[{"name": "Confusable", "qfmt": CONF_FRONT, "afmt": CONF_BACK}],
    css=CONF_CSS,
)
def build_confusables_deck(
    words: dict[str, dict],
    include_audio: bool = True,
) -> tuple[genanki.Deck, list[Path]]:
    """Build confusables deck from words dict — groups words by confusable_group.

    Entries whose ``confusable_group`` is ``None`` are ignored.  The remaining
    entries are bucketed by ``confusables_guid`` (or a GUID derived from the
    entry's ``ktiv_male`` when that is missing); every bucket that still has at
    least two distinct (nikkud, meaning) pairs becomes one note whose GUID is
    the bucket key.

    Args:
        words: Mapping of unique key → word entry.
        include_audio: When true, collect one audio tag per entry and the
            matching mp3 paths under ``AUDIO_DIR``.

    Returns:
        ``(deck, media_files)`` — the populated deck and the audio paths
        referenced by its notes, without duplicates.
    """
    logger.info("Building confusables deck …")
    deck = genanki.Deck(CONF_DECK_ID, "Hebrew Confusables")
    media_files: list[Path] = []
    note_count = 0
    # Group entries by shared ktiv_male (confusable_group members share the same ktiv_male)
    # Use confusables_guid as the stable note GUID — all members of a group share it.
    # Process each unique guid once.
    seen_guids: set[str] = set()
    # Build guid → list of entries
    guid_to_entries: dict[str, list[dict]] = {}
    for unique_key, entry in words.items():
        if entry.get("confusable_group") is None:
            continue
        guid = entry.get("confusables_guid")
        if not guid:
            # Fall back to ktiv_male-based guid
            guid = genanki.guid_for("confusable", entry["word"].get("ktiv_male", unique_key))
        guid_to_entries.setdefault(guid, []).append(entry)

    def _eff_freq(e: dict) -> int:
        # Effective rank: pseudo_frequency, else frequency, else 999_999.
        return e.get("pseudo_frequency") or e.get("frequency") or 999_999

    # Visit groups in ascending order of their members' mean effective frequency.
    for guid, group_entries in sorted(
        guid_to_entries.items(),
        key=lambda x: sum(_eff_freq(e) for e in x[1]) / len(x[1]),
    ):
        # NOTE(review): dict keys are already unique, so this guard never fires.
        if guid in seen_guids:
            continue
        seen_guids.add(guid)
        # A single word cannot be confused with itself.
        if len(group_entries) < 2:
            continue
        # Deduplicate: skip entries with identical word+meaning
        seen: set[tuple[str, str]] = set()
        unique_entries: list[dict] = []
        for e in group_entries:
            key = (e["word"]["nikkud"], e.get("meaning", ""))
            if key not in seen:
                seen.add(key)
                unique_entries.append(e)
        if len(unique_entries) < 2:
            continue
        # Sort by pseudo/frequency so most common meaning appears first
        unique_entries.sort(key=_eff_freq)
        # NOTE(review): sorting cannot change the length, so this repeats the check above.
        if len(unique_entries) < 2:
            continue
        word_no_nik = unique_entries[0]["word"].get("ktiv_male", "")
        words_display = word_no_nik  # Show ktiv male (shared form) on front
        defs_parts: list[str] = []
        audio_parts: list[str] = []
        for e in unique_entries:
            w = e["word"]["nikkud"]
            m = e.get("meaning", "")
            p = e.get("pos_hebrew", "")
            # The part-of-speech fragment is only produced when pos_hebrew is set.
            pos_div = f'{p}
' if p else ""
            defs_parts.append(
                f''
                f'
{w}'
                f'
{m}
'
                f"{pos_div}
"
            )
            if include_audio:
                # Prefer the entry's own audio_file when it exists on disk ...
                af = e.get("audio_file", "") or ""
                at = ""
                if af:
                    mp3_path = AUDIO_DIR / af
                    if mp3_path.exists():
                        at = f"[sound:{af}]"
                # ... otherwise ask _audio_tag for a tag based on ktiv_male/slug.
                if not at:
                    slug = e.get("slug", "") or ""
                    ktiv_male = e.get("word", {}).get("ktiv_male", "") or ""
                    at = _audio_tag(ktiv_male, slug=slug)
                # Each distinct tag is attached once per note.
                if at and at not in audio_parts:
                    audio_parts.append(at)
                    # Recover the file name from the "[sound:…]" tag.
                    mp3_name = at.removeprefix("[sound:").removesuffix("]")
                    mp3_path = AUDIO_DIR / mp3_name
                    if mp3_path not in media_files:
                        media_files.append(mp3_path)
        defs_html = "\n".join(defs_parts)
        audio_html = " ".join(audio_parts)
        note = genanki.Note(
            model=CONF_MODEL,
            guid=guid,
            fields=[words_display, defs_html, audio_html, word_no_nik],
            tags=[RELEASE_TAG],
        )
        deck.add_note(note)
        note_count += 1
    logger.info(f"Confusables deck: {note_count} notes")
    return deck, media_files
def write_conf_apkg(
    deck: genanki.Deck,
    media_files: list[Path] | None = None,
    out_path: Path = CONF_APKG,
) -> None:
    """Write the confusables deck to ``out_path``.

    The output directory is created if needed.  Only media paths that exist
    on disk are bundled, followed by the font files from ``_font_media_files``.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    existing_media = [str(media) for media in media_files or [] if media.exists()]
    package = genanki.Package(deck)
    package.media_files = [*existing_media, *_font_media_files()]
    package.write_to_file(str(out_path))
    logger.info(f"Confusables deck written → {out_path}")
# ──────────────────────────────────────────────────────────────────────────────
# Noun plurals deck — singular↔plural drilling
# ──────────────────────────────────────────────────────────────────────────────
# Question side of the "Singular → Plural" card: Singular field, optional
# SingularAudio section, Meaning field and a direction label.
PLURAL_FRONT_SG = """
{{Singular}}
{{#SingularAudio}}{{SingularAudio}}
{{/SingularAudio}}
{{Meaning}}
יָחִיד ← רַבִּים
"""
# Answer side of the "Singular → Plural" card: front side, Plural field,
# optional PluralAudio, then the Gender and Mishkal sections when filled in.
PLURAL_BACK_SG = """
{{FrontSide}}
{{Plural}}
{{#PluralAudio}}{{PluralAudio}}
{{/PluralAudio}}
{{#Gender}}
מִין:{{Gender}}
{{/Gender}}
{{#Mishkal}}
מִשְׁקָל:{{Mishkal}}
{{/Mishkal}}
"""
# Question side of the "Plural → Singular" card: Plural field, optional
# PluralAudio section and a direction label (no Meaning field on this side).
PLURAL_FRONT_PL = """
{{Plural}}
{{#PluralAudio}}{{PluralAudio}}
{{/PluralAudio}}
רַבִּים ← יָחִיד
"""
# Answer side of the "Plural → Singular" card: front side, Singular field,
# optional SingularAudio, Meaning, then the Gender and Mishkal sections.
PLURAL_BACK_PL = """
{{FrontSide}}
{{Singular}}
{{#SingularAudio}}{{SingularAudio}}
{{/SingularAudio}}
{{Meaning}}
{{#Gender}}
מִין:{{Gender}}
{{/Gender}}
{{#Mishkal}}
מִשְׁקָל:{{Mishkal}}
{{/Mishkal}}
"""
# Plural cards reuse the CARD_CSS stylesheet.
PLURAL_CSS = CARD_CSS
# Note type for the plurals deck: eight fields, two card templates
# (one per drilling direction).
PLURAL_MODEL = genanki.Model(
    PLURAL_MODEL_ID,
    "Hebrew Plurals",
    fields=[
        {"name": field_name}
        for field_name in (
            "Singular",
            "SingularAudio",
            "Plural",
            "PluralAudio",
            "Meaning",
            "Root",
            "Mishkal",
            "Gender",
        )
    ],
    templates=[
        {"name": title, "qfmt": front, "afmt": back}
        for title, front, back in (
            ("Singular → Plural", PLURAL_FRONT_SG, PLURAL_BACK_SG),
            ("Plural → Singular", PLURAL_FRONT_PL, PLURAL_BACK_PL),
        )
    ],
    css=PLURAL_CSS,
)
def _is_irregular_plural(gender: str, plural_ktiv: str) -> bool:
"""Detect irregular plurals: masc nouns with ות- suffix, fem with ים- suffix.
Args:
gender: ``"masculine"`` or ``"feminine"``.
plural_ktiv: ktiv male (no nikkud) form of the plural.
"""
return (gender == "masculine" and plural_ktiv.endswith("ות")) or (
gender == "feminine" and plural_ktiv.endswith("ים")
)
def build_plural_deck(
    words: dict[str, dict],
    include_audio: bool = False,
) -> tuple[genanki.Deck, list[Path]]:
    """Build noun plurals deck from words with noun_inflection data.

    Candidates are entries whose ``pos`` starts with ``"Noun"`` and whose
    ``noun_inflection`` has non-empty singular and plural nikkud forms.
    Every irregular plural is kept.  Regular nouns (those with a mishkal) are
    sampled per mishkal by frequency and capped at twice the number of
    irregulars.  Notes are added as two regulars followed by one irregular,
    each pool ordered by frequency.

    Args:
        words: Mapping of unique key → word entry.
        include_audio: When true, attach singular/plural audio tags and
            collect the matching mp3 paths under ``AUDIO_DIR``.

    Returns:
        ``(deck, media_files)`` — the populated deck and the audio paths
        referenced by its notes, without duplicates.
    """
    logger.info("Building plurals deck …")
    deck = genanki.Deck(PLURAL_DECK_ID, "Hebrew Plurals")
    media_files: list[Path] = []

    def freq_rank(candidate: tuple[str, dict, dict]) -> int:
        """Sort key: the entry's frequency, unranked entries last."""
        return candidate[1].get("frequency") or 999_999

    def remember_media(path: Path) -> None:
        """Record ``path`` once."""
        if path not in media_files:
            media_files.append(path)

    # Collect all nouns with both singular and plural: (unique_key, entry, noun_inflection)
    irregulars: list[tuple[str, dict, dict]] = []
    by_mishkal: dict[str, list[tuple[str, dict, dict]]] = {}
    for unique_key, entry in words.items():
        if not entry.get("pos", "").startswith("Noun"):
            continue
        inflection = entry.get("noun_inflection")
        if not inflection:
            continue
        sg_data = inflection.get("singular")
        pl_data = inflection.get("plural")
        if not (sg_data and pl_data):
            continue
        if not (sg_data.get("nikkud", "") and pl_data.get("nikkud", "")):
            continue
        candidate = (unique_key, entry, inflection)
        mishkal_key = inflection.get("mishkal") or ""
        if _is_irregular_plural(inflection.get("gender", ""), pl_data.get("ktiv_male", "")):
            irregulars.append(candidate)
        elif mishkal_key:
            by_mishkal.setdefault(mishkal_key, []).append(candidate)

    # Aim for two regular exemplars per irregular; over-sample each mishkal
    # (at least three per pattern), then keep the most frequent overall.
    target_regular = 2 * len(irregulars)
    per_mishkal = max(3, (target_regular * 3) // ((len(by_mishkal) or 1) * 2))
    sampled: list[tuple[str, dict, dict]] = []
    for mishkal_key in sorted(by_mishkal):
        sampled.extend(sorted(by_mishkal[mishkal_key], key=freq_rank)[:per_mishkal])
    regular_pool = sorted(sampled, key=freq_rank)[:target_regular]
    irregulars.sort(key=freq_rank)

    # Interleave: two regulars (when available) ahead of each irregular,
    # then whatever regulars are left over.
    selected: list[tuple[str, dict, dict]] = []
    for position, irregular in enumerate(irregulars):
        selected.extend(regular_pool[2 * position : 2 * position + 2])
        selected.append(irregular)
    selected.extend(regular_pool[2 * len(irregulars) :])

    for _key, entry, inflection in selected:
        sg_form = inflection["singular"]
        pl_form = inflection["plural"]
        singular = sg_form["nikkud"]
        singular_ktiv = sg_form.get("ktiv_male", "")
        plural = pl_form["nikkud"]
        plural_ktiv = pl_form.get("ktiv_male", "")
        gender = inflection.get("gender") or ""
        gender_heb = {"masculine": "זָכָר", "feminine": "נְקֵבָה"}.get(gender, gender)
        meaning = EMOJI_RE.sub("", entry.get("meaning") or "").strip()

        # GUID from noun_inflection, else derived from singular + meaning
        note_guid = inflection.get("plurals_guid") or genanki.guid_for("plural", singular, meaning)

        # Audio tags
        sg_audio = ""
        pl_audio = ""
        if include_audio:
            slug = entry.get("slug", "")
            sg_tag = _audio_tag(singular_ktiv, slug=slug)
            if sg_tag:
                sg_audio = sg_tag
                remember_media(AUDIO_DIR / sg_tag.removeprefix("[sound:").removesuffix("]"))
            # Plural audio: {slug}_plural.mp3
            if slug:
                pl_mp3 = AUDIO_DIR / f"{slug}_plural.mp3"
                if pl_mp3.exists():
                    pl_audio = f"[sound:{pl_mp3.name}]"
                    remember_media(pl_mp3)

        tags = [RELEASE_TAG]
        mishkal_eng = inflection.get("mishkal") or ""
        if mishkal_eng:
            tags.append(f"mishkal::{mishkal_eng}")
        if _is_irregular_plural(gender, plural_ktiv):
            tags.append("irregular")

        deck.add_note(
            genanki.Note(
                model=PLURAL_MODEL,
                guid=note_guid,
                fields=[
                    singular,
                    sg_audio,
                    plural,
                    pl_audio,
                    meaning,
                    ".".join(entry.get("root") or []),
                    inflection.get("mishkal_hebrew") or "",
                    gender_heb,
                ],
                tags=tags,
            )
        )

    note_count = len(selected)
    irregular_count = len(irregulars)
    regular_count = note_count - irregular_count
    logger.info(
        f"Plurals deck: {note_count} notes "
        f"({irregular_count} irregular + {regular_count} regular exemplars "
        f"from {len(by_mishkal)} mishkal patterns)"
    )
    return deck, media_files
def write_plural_apkg(
    deck: genanki.Deck,
    media_files: list[Path] | None = None,
    out_path: Path = PLURAL_APKG,
) -> None:
    """Write the plurals deck to ``out_path``.

    The output directory is created if needed.  Only media paths that exist
    on disk are bundled, followed by the font files from ``_font_media_files``.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    existing_media = [str(media) for media in media_files or [] if media.exists()]
    package = genanki.Package(deck)
    package.media_files = [*existing_media, *_font_media_files()]
    package.write_to_file(str(out_path))
    logger.info(f"Plurals deck written → {out_path}")
def _font_media_files() -> list[str]:
    """Return the existing ``_Heebo*.ttf`` files in FONTS_DIR as strings, for bundling in .apkg."""
    return [str(font) for font in FONTS_DIR.glob("_Heebo*.ttf") if font.exists()]
class _RandomOrderPackage(genanki.Package):
    """genanki.Package subclass that sets new card order to random (0) instead of insertion order (1)."""

    def write_to_db(self, cursor, timestamp, id_gen):
        """Write the package as usual, then rewrite every deck config's ``new.order`` to 0."""
        super().write_to_db(cursor, timestamp, id_gen)
        stored = cursor.execute("SELECT dconf FROM col").fetchone()
        if not stored:
            return
        deck_configs = json.loads(stored[0])
        for config in deck_configs.values():
            # Only touch entries that are dicts and carry a "new" section.
            if not isinstance(config, dict) or "new" not in config:
                continue
            config["new"]["order"] = 0
        cursor.execute("UPDATE col SET dconf = ?", [json.dumps(deck_configs)])
def write_vocab_apkg(
    deck: genanki.Deck,
    media_files: list[Path],
    out_path: Path = VOCAB_APKG,
) -> None:
    """Write the vocabulary deck to ``out_path``.

    Uses a plain ``genanki.Package`` so cards keep insertion order
    (= frequency rank; new.order=1 default).  Only media paths that exist on
    disk are bundled, followed by the font files from ``_font_media_files``.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    existing_media = [str(media) for media in media_files if media.exists()]
    package = genanki.Package(deck)
    package.media_files = [*existing_media, *_font_media_files()]
    package.write_to_file(str(out_path))
    logger.info(f"Vocabulary deck written → {out_path}")
def write_conj_apkg(
    deck: genanki.Deck,
    media_files: list[Path] | None = None,
    out_path: Path = CONJ_APKG,
) -> None:
    """Write the conjugation deck to ``out_path`` via ``_RandomOrderPackage``.

    The output directory is created if needed.  Only media paths that exist
    on disk are bundled, followed by the font files from ``_font_media_files``.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    existing_media = [str(media) for media in media_files or [] if media.exists()]
    package = _RandomOrderPackage(deck)
    package.media_files = [*existing_media, *_font_media_files()]
    package.write_to_file(str(out_path))
    logger.info(f"Conjugation deck written → {out_path}")
def build_complete_deck(
    words: dict[str, dict],
    limit: int | None = None,
    include_audio: bool = False,
    emoji_lookup: dict | None = None,
) -> tuple[list[genanki.Deck], list[Path]]:
    """Build all subdecks under 'Hebrew::*' for the combined .apkg.

    The four standalone builders are run (vocabulary always with images), and
    their notes are re-homed into new decks that use the ``COMPLETE_*`` IDs and
    ``Hebrew::…`` names.

    Returns (list_of_decks, deduplicated_media_files); decks are ordered
    vocabulary, conjugations, confusables, plurals.
    """
    logger.info(f" Building complete deck (audio={'yes' if include_audio else 'no'}) …")

    # Build standalone decks using existing functions
    vocab_deck, vocab_media = build_vocab_deck(
        words,
        limit=limit,
        include_audio=include_audio,
        include_images=True,
        emoji_lookup=emoji_lookup,
    )
    conj_deck, conj_media = build_conj_deck(words, include_audio=include_audio)
    conf_deck, conf_media = build_confusables_deck(words, include_audio=include_audio)
    plural_deck, plural_media = build_plural_deck(words, include_audio=include_audio)

    # Create new Deck objects with subdeck names and different IDs
    decks: list[genanki.Deck] = []
    for deck_id, deck_name, source_deck in (
        (COMPLETE_VOCAB_DECK_ID, "Hebrew::Vocabulary", vocab_deck),
        (COMPLETE_CONJ_DECK_ID, "Hebrew::Conjugations", conj_deck),
        (COMPLETE_CONF_DECK_ID, "Hebrew::Confusables", conf_deck),
        (COMPLETE_PLURAL_DECK_ID, "Hebrew::Plurals", plural_deck),
    ):
        subdeck = genanki.Deck(deck_id, deck_name)
        for note in source_deck.notes:
            subdeck.add_note(note)
        decks.append(subdeck)
    complete_vocab, complete_conj, complete_conf, complete_plural = decks

    # Deduplicate media files by resolved path (raw path for missing files)
    seen_paths: set[str] = set()
    all_media: list[Path] = []
    for media_path in [*vocab_media, *conj_media, *conf_media, *plural_media]:
        identity = str(media_path.resolve()) if media_path.exists() else str(media_path)
        if identity in seen_paths:
            continue
        seen_paths.add(identity)
        all_media.append(media_path)

    plural_info = f" + {len(complete_plural.notes)} plural"
    logger.info(
        f" Complete deck: {len(complete_vocab.notes)} vocab + "
        f"{len(complete_conj.notes)} conj + {len(complete_conf.notes)} conf{plural_info} notes, "
        f"{len(all_media)} media files"
    )
    return decks, all_media
def write_complete_apkg(
    decks: list[genanki.Deck],
    media_files: list[Path],
    out_path: Path = COMPLETE_APKG,
) -> None:
    """Write a combined .apkg with multiple subdecks.

    The output directory is created if needed.  Only media paths that exist
    on disk are bundled, followed by the font files from ``_font_media_files``.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    existing_media = [str(media) for media in media_files if media.exists()]
    package = genanki.Package(decks)
    package.media_files = [*existing_media, *_font_media_files()]
    package.write_to_file(str(out_path))
    logger.info(f"Complete deck written → {out_path}")
def build_all_variants(
    words: dict[str, dict],
    limit: int | None = None,
) -> None:
    """Build all 12 release variants into output/.

    Four vocabulary packages (audio × images), then a plain and an audio
    package each for conjugations, confusables, plurals and the combined
    ``Hebrew::*`` deck.  ``limit`` is forwarded to the vocabulary and combined
    builders only.
    """
    logger.info("Building all release variants …")
    emoji_lookup = _load_emoji_lookup()
    logger.info(f" Emoji lookup: {len(emoji_lookup)} keywords loaded")

    # Vocabulary: every (audio, images) combination, in release order.
    vocab_targets = {
        (False, False): VOCAB_APKG,
        (True, False): VOCAB_APKG_AUDIO,
        (False, True): VOCAB_APKG_IMAGES,
        (True, True): VOCAB_APKG_AUDIO_IMAGES,
    }
    for (audio, images), path in vocab_targets.items():
        label = f"audio={'yes' if audio else 'no'} images={'yes' if images else 'no'}"
        logger.info(f" Vocab variant: {label} → {path.name}")
        deck, media = build_vocab_deck(
            words,
            limit=limit,
            include_audio=audio,
            include_images=images,
            emoji_lookup=emoji_lookup,
        )
        write_vocab_apkg(deck, media, out_path=path)

    # Conjugations: silent build first, then the audio build.
    for audio, path in zip((False, True), (CONJ_APKG, CONJ_APKG_AUDIO)):
        label = f"audio={'yes' if audio else 'no'}"
        logger.info(f" Conj variant: {label} → {path.name}")
        deck, media = build_conj_deck(words, include_audio=audio)
        write_conj_apkg(deck, media, out_path=path)

    # Confusables
    for audio, path in zip((False, True), (CONF_APKG, CONF_APKG_AUDIO)):
        label = f"audio={'yes' if audio else 'no'}"
        logger.info(f" Conf variant: {label} → {path.name}")
        deck, media = build_confusables_deck(words, include_audio=audio)
        write_conf_apkg(deck, media, out_path=path)

    # Plurals
    for audio, path in zip((False, True), (PLURAL_APKG, PLURAL_APKG_AUDIO)):
        label = f"audio={'yes' if audio else 'no'}"
        logger.info(f" Plural variant: {label} → {path.name}")
        deck, media = build_plural_deck(words, include_audio=audio)
        write_plural_apkg(deck, media, out_path=path)

    # Combined "Hebrew::*" complete decks
    for audio, path in zip((False, True), (COMPLETE_APKG, COMPLETE_APKG_AUDIO)):
        decks, media = build_complete_deck(
            words,
            limit=limit,
            include_audio=audio,
            emoji_lookup=emoji_lookup,
        )
        write_complete_apkg(decks, media, out_path=path)

    logger.info("All variants built.")
if __name__ == "__main__":
    # Script entry point: build a 20-word vocabulary package and the full
    # conjugation package, each written to its default output path.
    logging.basicConfig(format="%(levelname)s %(message)s", level=logging.INFO)
    loaded_words = _load_words()
    vocab_deck, vocab_media = build_vocab_deck(loaded_words, limit=20)
    write_vocab_apkg(vocab_deck, vocab_media)
    conjugation_deck, conjugation_media = build_conj_deck(loaded_words)
    write_conj_apkg(conjugation_deck, conjugation_media)