hebrew_flash_cards/apkg_builder.py
Sochen 0d92451271 Sprint 16: collapsible card details + related words table
- All secondary fields (shoresh, PoS, ktiv male, plural, related words)
  behind a "מידע נוסף" toggle button using HTML <details>/<summary>
- Conjugation back: English meaning, binyan also behind toggle
- Related words: table format with word + meaning, sorted by frequency
- Hebrew words not bold, English meanings 24px gray (#555)
- "מִילִים קְשׁוּרוֹת" sub-header with nikkud inside toggle
- "אֵיךְ אוֹמְרִים" prompt centered using hint class
- New CSS: .more-toggle, .more-header, .related-header, .rw-word, .rw-meaning
- Dark mode support for all new classes
- Bump to v0.18

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-11 01:34:14 +00:00

1956 lines
70 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Build Anki .apkg files for the Hebrew flash-card decks: vocabulary, conjugations,
confusables, plurals, and the combined "Hebrew::*" package.
Uses genanki for reliable, stable deck generation.
Deck and model IDs are hardcoded integers — re-importing a package with the same
IDs updates the existing deck in Anki rather than creating a duplicate.
"""
import json
import logging
import random
import re
from pathlib import Path
import genanki
logger = logging.getLogger(__name__)
# Stable deck/model IDs — do not change these.
# Per the module docstring, re-importing a package built with the same ID
# updates the existing deck in Anki instead of creating a duplicate.
VOCAB_DECK_ID = 1_234_567_890
VOCAB_MODEL_ID = 1_701_222_017_968 # matches Nevo's original Anki model
CONJ_DECK_ID = 1_234_567_892
CONJ_MODEL_ID = 1_234_567_893
CONF_DECK_ID = 1_234_567_894
CONF_MODEL_ID = 1_234_567_895
PLURAL_DECK_ID = 1_234_567_896
PLURAL_MODEL_ID = 1_234_567_897
# Subdeck IDs for combined "Hebrew::*" package — MUST differ from standalone IDs
COMPLETE_VOCAB_DECK_ID = 1_234_567_900
COMPLETE_CONJ_DECK_ID = 1_234_567_901
COMPLETE_CONF_DECK_ID = 1_234_567_902
COMPLETE_PLURAL_DECK_ID = 1_234_567_903
# Release version tag added to all notes so users can identify which release
# their cards come from (visible in Anki's Browse view and card info).
RELEASE_TAG = "v0.18"
# Regex for extracting emoji and Hebrew prepositions from meaning strings.
# EMOJI_RE matches a run of one or more code points from the listed ranges.
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
# HBPAREN_RE matches a parenthesised run of Hebrew characters (U+05B0–U+05EA,
# U+05F0–U+05F4) and captures the text between the parentheses.
HBPAREN_RE = re.compile(r"\(([\u05b0-\u05ea\u05f0-\u05f4]+)\)")
# Input locations, all relative to this module's directory.
DATA_DIR = Path(__file__).parent / "data"
AUDIO_DIR = DATA_DIR / "audio"
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
# Output package paths; one file name per deck and media variant.
OUTPUT_DIR = Path(__file__).parent / "output"
VOCAB_APKG = OUTPUT_DIR / "hebrew_vocabulary.apkg"
VOCAB_APKG_AUDIO = OUTPUT_DIR / "hebrew_vocabulary_audio.apkg"
VOCAB_APKG_IMAGES = OUTPUT_DIR / "hebrew_vocabulary_images.apkg"
VOCAB_APKG_AUDIO_IMAGES = OUTPUT_DIR / "hebrew_vocabulary_audio_images.apkg"
CONJ_APKG = OUTPUT_DIR / "hebrew_conjugations.apkg"
CONJ_APKG_AUDIO = OUTPUT_DIR / "hebrew_conjugations_audio.apkg"
CONF_APKG = OUTPUT_DIR / "hebrew_confusables.apkg"
CONF_APKG_AUDIO = OUTPUT_DIR / "hebrew_confusables_audio.apkg"
PLURAL_APKG = OUTPUT_DIR / "hebrew_plurals.apkg"
PLURAL_APKG_AUDIO = OUTPUT_DIR / "hebrew_plurals_audio.apkg"
COMPLETE_APKG = OUTPUT_DIR / "hebrew_complete.apkg"
COMPLETE_APKG_AUDIO = OUTPUT_DIR / "hebrew_complete_audio.apkg"
# ──────────────────────────────────────────────────────────────────────────────
# Binyan → Hebrew label mapping (for conjugation card display)
# ──────────────────────────────────────────────────────────────────────────────
# Keys are transliterated binyan names; values are the vocalised Hebrew labels.
BINYAN_TO_HEBREW: dict[str, str] = {
    "Pa'al": "פָּעַל",
    "Nif'al": "נִפְעַל",
    "Pi'el": "פִּעֵל",
    "Pu'al": "פֻּעַל",
    "Hitpa'el": "הִתְפַּעֵל",
    "Hif'il": "הִפְעִיל",
    "Huf'al": "הֻפְעַל",
}
# ──────────────────────────────────────────────────────────────────────────────
# PoS → Hebrew label mapping
# ──────────────────────────────────────────────────────────────────────────────
# English part-of-speech name → Hebrew (unvocalised) part-of-speech name.
POS_TO_HEBREW: dict[str, str] = {
    "Noun": "שם עצם",
    "Verb": "פועל",
    "Adjective": "שם תואר",
    "Adverb": "תואר הפועל",
    "Preposition": "מילת יחס",
    "Conjunction": "מילת חיבור",
    "Pronoun": "כינוי גוף",
    "Particle": "מילית",
}
# PoS category groupings for related-words display.
# The keys double as the set of recognised categories in _categorize_pos();
# anything else is reported there as "Other".
POS_CATEGORY_LABELS: dict[str, str] = {
    "Verb": "פעלים",
    "Noun": "שמות עצם",
    "Adjective": "שמות תואר",
    "Adverb": "תוארי הפועל",
}
# ──────────────────────────────────────────────────────────────────────────────
# Shared CSS
# ──────────────────────────────────────────────────────────────────────────────
FONTS_DIR = DATA_DIR / "fonts"
# Stylesheet passed as ``css=`` to the note models below.
# Layout of the string:
#   * @font-face rules referencing _Heebo-Regular.ttf / _Heebo-Bold.ttf
#   * base classes (.card, .hebrew, .hebrew-sm, .meaning, .hint, .example, ...)
#   * table-like classes for the detail rows (.sec-table/.sec-label/.sec-key/.sec-val)
#   * classes for the <details> toggle (.more-toggle, .more-header) and the
#     related-words rows (.related-header, .rw-word, .rw-meaning)
#   * a prefers-color-scheme: dark block overriding colours for those classes
# NOTE(review): both ``.more-header::before`` rules set ``content`` to an empty
# string, so no marker is rendered — confirm whether disclosure glyphs were
# intended there.
CARD_CSS = """
@font-face {
font-family: 'Heebo';
src: url('_Heebo-Regular.ttf');
font-weight: normal;
}
@font-face {
font-family: 'Heebo';
src: url('_Heebo-Bold.ttf');
font-weight: bold;
}
.card {
font-family: 'Heebo', 'Arial Hebrew', 'David', Arial, sans-serif;
font-size: 20px;
text-align: right;
color: #222;
background: #fff;
padding: 16px;
max-width: 600px;
margin: 0 auto;
}
.hebrew {
font-size: 42px;
font-weight: bold;
direction: rtl;
text-align: center;
line-height: 1.5;
color: #222;
}
.hebrew-sm {
font-size: 30px;
font-weight: normal;
direction: rtl;
text-align: center;
color: #222;
}
.meaning {
font-size: 34px;
color: #1a1a8c;
margin: 8px 0;
text-align: center;
}
.hint {
font-size: 22px;
color: #555;
margin: 4px 0;
direction: rtl;
text-align: center;
}
.example {
font-size: 24px;
color: #222;
direction: rtl;
text-align: right;
font-style: italic;
margin: 10px auto 0;
max-width: 90%;
border-right: 3px solid #aaa;
padding-right: 8px;
}
.divider { border-top: 1px solid #ddd; margin: 10px 0; }
.freq-badge {
display: inline-block;
font-size: 11px;
color: #aaa;
background: transparent;
border: 1px solid #eee;
border-radius: 10px;
padding: 2px 8px;
margin-top: 4px;
}
.voice-label {
font-size: 0.6em;
font-weight: normal;
color: #555;
}
.sec-table {
display: table;
margin: 6px auto 0;
direction: rtl;
border-collapse: collapse;
}
.sec-label {
display: table-row;
font-size: 28px;
font-weight: normal;
color: #222;
direction: rtl;
}
.sec-key {
display: table-cell;
font-size: 28px;
color: #222;
font-weight: bold;
text-align: right;
padding: 2px 0 2px 8px;
white-space: nowrap;
}
.sec-val {
display: table-cell;
font-size: 28px;
color: #222;
text-align: right;
padding: 2px 0;
}
.definitions {
direction: rtl;
text-align: center;
}
.more-toggle {
text-align: center;
direction: rtl;
margin-top: 8px;
}
.more-header {
display: inline-block;
font-size: 18px;
color: #555;
cursor: pointer;
list-style: none;
border: 1px solid #ccc;
border-radius: 16px;
padding: 4px 16px;
margin: 4px 0;
background: #f8f8f8;
}
.more-header::-webkit-details-marker { display: none; }
.more-header::before { content: ""; font-size: 14px; }
details[open] > .more-header::before { content: ""; }
.related-header {
font-size: 22px;
color: #555;
text-align: center;
margin: 4px 0;
}
.rw-word {
display: table-cell;
font-size: 28px;
color: #222;
font-weight: normal;
text-align: right;
padding: 2px 0 2px 8px;
white-space: nowrap;
}
.rw-meaning {
display: table-cell;
font-size: 24px;
color: #555;
text-align: left;
direction: ltr;
padding: 2px 0;
}
.conf-entry {
margin: 8px 0;
font-size: 28px;
direction: rtl;
}
.emoji-img {
font-size: 3.5em;
text-align: center;
margin: 0.3em 0;
}
.card [type="button"], .card button, .replay-button {
display: block !important;
margin: 4px auto !important;
text-align: center;
}
@media (prefers-color-scheme: dark) {
.card { color: #e8e8e8; background: #1c1c1e; }
.hebrew { color: #f0f0f0; }
.hebrew-sm { color: #e0e0e0; }
.meaning { color: #82b0ff; }
.sec-label { color: #e0e0e0; }
.sec-key { color: #e0e0e0; }
.sec-val { color: #e0e0e0; }
.conf-entry { color: #ddd; }
.hint { color: #777; }
.voice-label { color: #888; }
.example { color: #e0e0e0; border-right-color: #555; }
.divider { border-top-color: #333; }
.freq-badge { color: #888; border-color: #444; }
.more-header { color: #bbb; background: #2a2a2e; border-color: #555; }
.related-header { color: #999; }
.rw-word { color: #e0e0e0; }
.rw-meaning { color: #999; }
}
"""
# ──────────────────────────────────────────────────────────────────────────────
# Vocabulary Deck
# ──────────────────────────────────────────────────────────────────────────────
# Card templates for the vocabulary note type. ``{{Field}}`` inserts a note
# field; ``{{#Field}}…{{/Field}}`` renders its content only when the field is
# non-empty and ``{{^Field}}…{{/Field}}`` only when it is empty.

# Hebrew → English, question side: the word, optional preposition, optional audio.
VOCAB_FRONT_HEB = """
<div class="hebrew">{{Word}}{{#Prep}} <span class="hebrew-sm">{{Prep}}</span>{{/Prep}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
"""
# Hebrew → English, answer side: meaning, emoji (or image when there is no
# emoji), and the secondary fields inside a collapsed <details> element.
VOCAB_BACK_HEB = """
{{FrontSide}}
<div class="divider"></div>
<div class="meaning">{{Meaning}}</div>
{{#Emoji}}<div class="emoji-img">{{Emoji}}</div>{{/Emoji}}
{{^Emoji}}{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;" onerror="this.parentElement.style.display='none'"></div>{{/Image}}{{/Emoji}}
<details class="more-toggle"><summary class="more-header">מידע נוסף</summary>
<div class="sec-table">
{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span><span class="sec-val">{{WordNoNikkud}}</span></div>{{/WordNoNikkud}}
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">{{Root}}</span></div>{{/Root}}
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span><span class="sec-val">{{PoS}}{{#Gender}}, {{Gender}}{{/Gender}}</span></div>{{/PoS}}
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span><span class="sec-val">{{Plural}}</span></div>{{/Plural}}
</div>
{{#SharedRoots}}
<div class="divider" style="margin:6px 0;"></div>
<div class="related-header" style="cursor:default;">מִילִים קְשׁוּרוֹת</div>
<div class="sec-table">{{SharedRoots}}</div>
{{/SharedRoots}}
</details>
"""
# English → Hebrew, question side: meaning, optional hint, emoji or image.
VOCAB_FRONT_ENG = """
<div class="meaning">{{Meaning}}</div>
{{#Hint}}<div class="hint">{{Hint}}</div>{{/Hint}}
{{#Emoji}}<div class="emoji-img">{{Emoji}}</div>{{/Emoji}}
{{^Emoji}}{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;" onerror="this.parentElement.style.display='none'"></div>{{/Image}}{{/Emoji}}
"""
# English → Hebrew, answer side: the word, audio, and the same collapsed
# details section used on the Hebrew → English answer side.
VOCAB_BACK_ENG = """
{{FrontSide}}
<div class="divider"></div>
<div class="hebrew">{{Word}}{{#Prep}} <span class="hebrew-sm">{{Prep}}</span>{{/Prep}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
<details class="more-toggle"><summary class="more-header">מידע נוסף</summary>
<div class="sec-table">
{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span><span class="sec-val">{{WordNoNikkud}}</span></div>{{/WordNoNikkud}}
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">{{Root}}</span></div>{{/Root}}
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span><span class="sec-val">{{PoS}}{{#Gender}}, {{Gender}}{{/Gender}}</span></div>{{/PoS}}
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span><span class="sec-val">{{Plural}}</span></div>{{/Plural}}
</div>
{{#SharedRoots}}
<div class="divider" style="margin:6px 0;"></div>
<div class="related-header" style="cursor:default;">מִילִים קְשׁוּרוֹת</div>
<div class="sec-table">{{SharedRoots}}</div>
{{/SharedRoots}}
</details>
"""
# Sentence cloze, question side: the blanked example sentence plus optional hint.
# The inline style overrides the italic/bordered look of the .example class.
VOCAB_FRONT_CLOZE = """
<div class="example" style="font-size:32px;font-style:normal;border:none;padding:0;text-align:center;">{{ClozeExample}}</div>
{{#ClozeHint}}<div class="hint">{{ClozeHint}}</div>{{/ClozeHint}}
"""
# Sentence cloze, answer side: reveals the word and its audio.
VOCAB_BACK_CLOZE = """
{{FrontSide}}
<div class="divider"></div>
<div class="hebrew">{{Word}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
"""
# Note type for vocabulary cards. The position of each field name below is the
# index used when notes are built (and when their fields are inspected later),
# so the order must not change.
VOCAB_MODEL = genanki.Model(
    VOCAB_MODEL_ID,
    "Hebrew Flash Cards",
    fields=[
        {"name": field_name}
        for field_name in (
            "Word",
            "Root",
            "PoS",
            "Meaning",
            "WordNoNikkud",
            "SharedRoots",
            "Tags",
            "Audio",
            "Example",
            "Frequency",
            "Image",
            "Emoji",
            "Prep",
            "Hint",
            "Plural",
            "Gender",
            "ClozeExample",
            "ClozeHint",
        )
    ],
    templates=[
        # ord 0 — matches Nevo's original "Card 2" (Eng→Heb)
        dict(name="English → Hebrew", qfmt=VOCAB_FRONT_ENG, afmt=VOCAB_BACK_ENG),
        # ord 1 — matches Nevo's original "Card 3" (Heb→Eng)
        dict(name="Hebrew → English", qfmt=VOCAB_FRONT_HEB, afmt=VOCAB_BACK_HEB),
        # ord 2 — Cloze-style sentence fill-in (only generated when ClozeExample is non-empty)
        dict(name="Sentence Cloze", qfmt=VOCAB_FRONT_CLOZE, afmt=VOCAB_BACK_CLOZE),
    ],
    css=CARD_CSS,
)
# ──────────────────────────────────────────────────────────────────────────────
# Conjugation Deck
# ──────────────────────────────────────────────────────────────────────────────
# Question side: a fixed Hebrew prompt line, then the pronoun, the infinitive
# (with optional preposition and voice label in parentheses) and the tense.
CONJ_FRONT = """
<div class="hint">אֵיךְ אוֹמְרִים</div>
<div class="hebrew">{{Pronoun}}</div>
<div class="hebrew" style="color:#1a1a8c;">{{Infinitive}}{{#Prep}} <span class="hebrew-sm">({{Prep}})</span>{{/Prep}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
<div class="hebrew">{{Tense}}</div>
"""
# Answer side: the conjugated form and audio; meaning, root, binyan and related
# vocabulary sit inside a collapsed <details> element.
CONJ_BACK = """
{{FrontSide}}<hr>
<div class="hebrew">{{ConjugatedForm}}{{#Prep}} ({{Prep}}){{/Prep}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
<details class="more-toggle"><summary class="more-header">מידע נוסף</summary>
{{#Meaning}}<div class="sec-label" style="text-align:center;display:block;">{{Meaning}}</div>{{/Meaning}}
<div class="sec-table">
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">{{Root}}</span></div>
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span><span class="sec-val">{{Binyan}}</span></div>
</div>
{{#RelatedVocab}}
<div class="divider" style="margin:6px 0;"></div>
<div class="related-header" style="cursor:default;">מִילִים קְשׁוּרוֹת</div>
<div class="sec-table">{{RelatedVocab}}</div>
{{/RelatedVocab}}
</details>
"""
# The conjugation cards reuse the shared stylesheet unchanged.
CONJ_CSS = CARD_CSS
# Note type for conjugation drill cards; field order defines the field indexes
# used when the notes are created, so it must stay as listed.
CONJ_MODEL = genanki.Model(
    CONJ_MODEL_ID,
    "Hebrew Conjugation",
    fields=[
        {"name": field_name}
        for field_name in (
            "Infinitive",
            "ReferenceForm",
            "Pronoun",
            "Tense",
            "ConjugatedForm",
            "Root",
            "Binyan",
            "Voice",
            "Audio",
            "Meaning",
            "RelatedVocab",
            "Prep",
        )
    ],
    templates=[
        dict(name="Conjugation Drill", qfmt=CONJ_FRONT, afmt=CONJ_BACK),
    ],
    css=CONJ_CSS,
)
# Present-tense expansion: each form key → list of (pronoun, tense_label).
# One present-tense form is shared by three pronouns, so every key lists the
# candidate pronouns that can be shown with that form.
PRESENT_EXPANSION: dict[str, list[tuple[str, str]]] = {
    "present_ms": [
        ("אֲנִי (זָכָר)", "הוֹוֶה"),
        ("אַתָּה", "הוֹוֶה"),
        ("הוּא", "הוֹוֶה"),
    ],
    "present_fs": [
        ("אֲנִי (נְקֵבָה)", "הוֹוֶה"),
        ("אַתְּ", "הוֹוֶה"),
        ("הִיא", "הוֹוֶה"),
    ],
    "present_mp": [
        ("אֲנַחְנוּ (זָכָר)", "הוֹוֶה"),
        ("אַתֶּם", "הוֹוֶה"),
        ("הֵם", "הוֹוֶה"),
    ],
    "present_fp": [
        ("אֲנַחְנוּ (נְקֵבָה)", "הוֹוֶה"),
        ("אַתֶּן", "הוֹוֶה"),
        ("הֵן", "הוֹוֶה"),
    ],
}
# Modern Hebrew: 2fp/3fp future and imperative default to mp form (classical in parens).
# Maps each feminine-plural form key to the masculine-plural key used instead.
FP_MODERN_FALLBACK: dict[str, str] = {
    "future_2fp": "future_2mp",
    "future_3fp": "future_3mp",
    "imperative_fp": "imperative_mp",
}
# 3rd person plural past: same form for m/f — generate two separate pronoun cards.
# Each item is (pronoun, tense_label), the same shape as PRESENT_EXPANSION values.
PAST_3P_EXPANSION: list[tuple[str, str]] = [
    ("הֵם", "עָבָר"),
    ("הֵן", "עָבָר"),
]
# Tense labels with "בְּ" prefix for display on cards.
# Looked up with ``TENSE_WITH_BE.get(tense, tense)``, so a label missing here is
# shown without its prefix. The imperative label occurs in two spellings (both
# are accepted as tense labels elsewhere in this module), so both are mapped to
# the same prefixed form.
TENSE_WITH_BE: dict[str, str] = {
    "עָבָר": "בֶּעָבָר",
    "הוֹוֶה": "בַּהוֹוֶה",
    "עָתִיד": "בֶּעָתִיד",
    "צִיּוּוּי": "בַּצִּוּוּי",
    "צִוּוּי": "בַּצִּוּוּי",  # spelling without the yod
}
# Voice field: passive label only (shown inline on card front for Pu'al/Huf'al).
# Binyanim absent from this map get an empty Voice field.
VOICE_MAP: dict[str, str] = {
    "Pu'al": "סָבִיל",
    "Huf'al": "סָבִיל",
}
# Tense Hebrew label → English key prefix (for form_key construction).
# Combined with the person code to form keys such as ``past_1s``.
TENSE_KEY_MAP: dict[str, str] = {
    "עָבָר": "past",
    "הוֹוֶה": "present",
    "עָתִיד": "future",
    "צִוּוּי": "imperative",
    "צִיּוּוּי": "imperative", # alternate spelling
}
# ──────────────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────────────
def _load_words() -> dict[str, dict]:
    """Read ``words.json`` from the data directory and return the parsed JSON."""
    with open(DATA_DIR / "words.json", encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed
def _audio_tag(word_no_nikkud: str, audio_dir: Path = AUDIO_DIR, slug: str = "") -> str:
    """Build an Anki ``[sound:…]`` tag for a word's MP3, or ``""`` if none is on disk.

    Lookup order inside ``audio_dir``:
      1. ``<slug>.mp3`` when a slug is given (used for confusable words);
      2. ``<letters>.mp3`` where ``letters`` is ``word_no_nikkud`` reduced to
         the Hebrew letters U+05D0–U+05EA.
    """

    def _tag_if_present(stem: str) -> str:
        # The tag carries only the file name, never the directory.
        candidate = audio_dir / f"{stem}.mp3"
        return f"[sound:{candidate.name}]" if candidate.exists() else ""

    if slug:
        slug_tag = _tag_if_present(slug)
        if slug_tag:
            return slug_tag
    letters_only = "".join(ch for ch in word_no_nikkud if "\u05d0" <= ch <= "\u05ea")
    return _tag_if_present(letters_only) if letters_only else ""
def _conj_audio_tag(slug: str, form_key: str) -> str:
    """Return the ``[sound:<slug>_<form_key>.mp3]`` tag when that file exists.

    The file is looked up in ``AUDIO_CONJ_DIR``; an empty string is returned
    when it is not there.
    """
    filename = f"{slug}_{form_key}.mp3"
    return f"[sound:{filename}]" if (AUDIO_CONJ_DIR / filename).exists() else ""
# Keywords excluded when building emoji lookup AND matching meaning text.
# Curated from manual review of all 2,261 emoji-word pairs (Sprint 8).
# A frozenset gives constant-time membership tests and cannot be mutated by
# accident; entries are lower-case because both users lower-case their input.
_EMOJI_STOP: frozenset[str] = frozenset(
    {
        # Basic stop words
        "to",
        "be",
        "a",
        "an",
        "the",
        "of",
        "in",
        "on",
        "at",
        "for",
        "and",
        "with",
        "by",
        "or",
        "but",
        "not",
        "as",
        "its",
        # Generic emoji description words (too vague)
        "face",
        "hand",
        "sign",
        "symbol",
        "button",
        "small",
        "large",
        "light",
        "dark",
        "open",
        "closed",
        # Numbers → clock emoji (🕐🕑🕒 etc.)
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
        "ten",
        "hundred",
        "thousand",
        # UI/media buttons (⏭️▶️⏪⏯️🔁🔚🔜⏺️)
        "next",
        "fast",
        "play",
        "pause",
        "repeat",
        "end",
        "soon",
        "record",
        # Abstract words → misleading object emoji
        "part",
        "place",
        "mark",
        "post",
        "department",
        "store",
        "note",
        "control",
        "level",
        "stop",
        "cover",
        "roll",
        "rolling",
        "pick",
        "over",
        "right",
        "way",
        "skin",
        "drop",
        "middle",
        "piece",
        "section",
        # Country/direction words → flag emoji (🇰🇵🇬🇸🇮🇴🇻🇦🇨🇫 etc.)
        "north",
        "south",
        "northern",
        "southern",
        "western",
        "eastern",
        "central",
        "territory",
        "kingdom",
        "united",
        "virgin",
        # Common words producing bad emoji matches
        "new",
        "big",
        "full",
        "last",
        "first",
        "double",
        "slightly",
        "without",
        "from",
        "behind",
        "people",
        "position",
        "status",
        "situation",
        "game",
        "call",
        "trade",
        "male",
        "female",
        "person",
        "letter",
        # Polysemous words → wrong emoji sense
        "french",
        "fried",
        "board",
        "bow",
        "water",
        "union",
        "rock",
        "left",
        "back",
        "crane",
        "dash",
        "bar",
        "wheel",
        "horizontal",
    }
)
def _load_emoji_lookup() -> dict[str, str]:
    """Load or fetch the Unicode emoji keyword→character lookup.

    If ``data/emoji_lookup.json`` exists it is returned as-is. Otherwise
    unicode.org's ``emoji-test.txt`` is downloaded and parsed into a
    ``{keyword: emoji_char}`` mapping, which is written to that cache file.

    Only "fully-qualified" lines are used. For each description word (longer
    than two characters and not in ``_EMOJI_STOP``) the first emoji seen wins.

    Returns:
        The keyword→emoji mapping, or an empty dict when the download fails
        (safe fallback: emoji lookup is simply disabled).
    """
    cache_file = DATA_DIR / "emoji_lookup.json"
    if cache_file.exists():
        # The cache holds raw emoji (ensure_ascii=False), so it must be read
        # as UTF-8 regardless of the platform's default encoding.
        with open(cache_file, encoding="utf-8") as f:
            return json.load(f)

    # Imported lazily: only needed on the first run, when the cache is built.
    import requests

    try:
        resp = requests.get(
            "https://unicode.org/Public/emoji/latest/emoji-test.txt",
            timeout=30,
        )
        resp.raise_for_status()
    except requests.RequestException as e:
        logger.warning("Could not fetch emoji data: %s. Emoji lookup disabled.", e)
        return {}

    lookup: dict[str, str] = {}
    for line in resp.text.splitlines():
        if "fully-qualified" not in line:
            continue
        # Expected tail of a data line: "# <emoji> E<version> <description>"
        m = re.search(r"#\s+(\S+)\s+E[\d.]+\s+(.+)", line)
        if not m:
            continue
        emoji_char = m.group(1)
        desc = m.group(2).lower().strip()
        for word in desc.split():
            word = word.strip(".,'\"-")
            if len(word) > 2 and word not in _EMOJI_STOP:
                # setdefault keeps the first emoji found for a keyword
                lookup.setdefault(word, emoji_char)

    # Write with the same explicit encoding the reader above uses.
    cache_file.write_text(json.dumps(lookup, ensure_ascii=False), encoding="utf-8")
    logger.info("Built emoji lookup: %d keywords → %s", len(lookup), cache_file)
    return lookup
def _categorize_pos(pos_str: str) -> str:
    """Return the canonical PoS category key for grouping.

    Only the head of the label is considered: everything from the first dash
    onwards is discarded and surrounding whitespace is stripped. The result is
    that head when it is one of the ``POS_CATEGORY_LABELS`` keys, otherwise
    ``"Other"``.
    """
    # The previous code called str.split("") which always raises
    # "ValueError: empty separator".
    # NOTE(review): the separators are presumed to be the en dash (U+2013) and
    # em dash (U+2014) — confirm against the PoS strings in words.json.
    base = re.split(r"[\u2013\u2014]", pos_str, maxsplit=1)[0].strip()
    return base if base in POS_CATEGORY_LABELS else "Other"
def _forms_list_to_dict(forms_list: list[dict]) -> dict[str, dict]:
    """Re-key a list of conjugation form records by ``<tense>_<person>``.

    The tense part of the key is the English prefix from ``TENSE_KEY_MAP``
    (the raw tense label is used when it has no mapping), giving keys such as
    ``past_1s``, ``present_ms``, ``future_2mp`` or ``imperative_fs``. When two
    records produce the same key, the later one replaces the earlier.

    Each value holds: ``form`` (vocalised), ``form_ktiv``, ``pronoun`` (Hebrew
    pronoun string), ``tense`` (Hebrew label), ``audio_url``, ``guid`` and
    ``guid_candidates``.
    """
    keyed: dict[str, dict] = {}
    for item in forms_list:
        hebrew_tense = item["tense"]
        tense_prefix = TENSE_KEY_MAP.get(hebrew_tense, hebrew_tense)
        form_key = f"{tense_prefix}_{item['person']}"
        surface = item["form"]
        keyed[form_key] = dict(
            form=surface["nikkud"],
            form_ktiv=surface.get("ktiv_male", ""),
            pronoun=item.get("pronoun_hebrew", ""),
            tense=hebrew_tense,
            audio_url=item.get("audio_url", ""),
            guid=item.get("guid"),
            guid_candidates=item.get("guid_candidates"),
        )
    return keyed
# Hebrew prefix letters (אותיות השימוש): בהוכלמש
_PREFIX_LETTERS = frozenset("בהוכלמש")
def _cloze_prefix_len(cloze_token: str, word_nikkud: str) -> int:
"""Return the number of characters in the cloze token that are prefix (not part of the word).
For "בַּתּוֹר" with word_nikkud "תּוֹר", returns 2 (ב + patach = 2 chars).
Returns 0 if the token starts with the word directly.
"""
if not word_nikkud or not cloze_token:
return 0
# If the token starts with the word nikkud, no prefix
if cloze_token.startswith(word_nikkud):
return 0
# Check if word nikkud appears as a suffix of the token
idx = cloze_token.find(word_nikkud)
if idx > 0:
# Verify prefix chars are valid Hebrew prefix letters
prefix_part = cloze_token[:idx]
base_letters = [c for c in prefix_part if "\u05d0" <= c <= "\u05ea"]
if base_letters and all(c in _PREFIX_LETTERS for c in base_letters):
return idx
return 0
def _vocab_clean_meaning(raw_meaning: str) -> str:
    """Return an English meaning with emoji, Hebrew text and stray punctuation removed."""
    meaning = EMOJI_RE.sub("", raw_meaning).strip()
    meaning = HBPAREN_RE.sub("", meaning).strip()
    # Strip remaining bare Hebrew text (spoiler: ktiv male visible in English meaning)
    meaning = re.sub(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*", "", meaning)
    meaning = re.sub(r"[;:]\s*—", "", meaning)  # drop ";"/":" followed by an em dash
    meaning = re.sub(r";\s*:", ";", meaning)  # clean "; :" → ";"
    meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ;:")
    meaning = re.sub(r"(\w)\(", r"\1 (", meaning)  # space before opening paren
    meaning = re.sub(r",(\S)", r", \1", meaning)  # space after comma
    return meaning


def _vocab_frequency_label(frequency: int) -> str:
    """Return the display label (band name plus rank) for a frequency rank."""
    bands = (
        (500, "Core"),
        (1500, "Essential"),
        (3000, "Intermediate"),
        (5000, "Upper-intermediate"),
        (10000, "Advanced"),
    )
    for upper_bound, band_name in bands:
        if frequency <= upper_bound:
            return f"{band_name} #{frequency}"
    if frequency < 999_999:
        return f"Rare #{frequency}"
    # 999_999 is the sentinel used for words without a frequency rank
    return "Unlisted"


def _vocab_related_words_html(words: dict[str, dict], related_keys: list[str]) -> str:
    """Render related words as table rows (word + meaning), most frequent first.

    Keys missing from ``words`` are shown as-is with no meaning and sort last.
    """
    items: list[tuple[int, str, str]] = []  # (sort_key, nikkud, meaning)
    for rw_key in related_keys:
        rw_entry = words.get(rw_key)
        if rw_entry:
            rw_nikkud = rw_entry["word"]["nikkud"]
            rw_meaning = rw_entry.get("meaning") or ""
            if len(rw_meaning) > 40:
                # Mark the cut so a shortened gloss is not mistaken for the full one
                rw_meaning = rw_meaning[:37] + "\u2026"
            rw_freq = rw_entry.get("frequency") or 999_999
        else:
            rw_nikkud, rw_meaning, rw_freq = rw_key, "", 999_999
        items.append((rw_freq, rw_nikkud, rw_meaning))
    items.sort(key=lambda item: item[0])
    return "\n".join(
        f'<div class="sec-label">'
        f'<span class="rw-word">{rw_nikkud}</span>'
        f'<span class="rw-meaning">{rw_meaning}</span>'
        f"</div>"
        for _freq, rw_nikkud, rw_meaning in items
    )


def build_vocab_deck(
    words: dict[str, dict],
    limit: int | None = None,
    include_audio: bool = True,
    include_images: bool = True,
    emoji_lookup: dict | None = None,
) -> tuple[genanki.Deck, list[Path]]:
    """Build the vocabulary deck from the unified words dict.

    Entries are processed in ascending frequency order (entries without a
    frequency sort last). Entries lacking a word or a cleaned meaning are
    skipped, as are exact word+meaning duplicates.

    Args:
        words: Unified data dict keyed by unique_key (from words.json).
        limit: If set, only process the first N entries (by frequency).
        include_audio: Whether to include audio tags in notes.
        include_images: Whether to include image tags in notes.
        emoji_lookup: Optional Unicode emoji keyword→char mapping for fallback emoji.

    Returns:
        (deck, list_of_media_files) — media files are listed once each, in the
        order they were first referenced.
    """
    logger.info("Building vocabulary deck from %d words …", len(words))
    images_dir = DATA_DIR / "images"

    # Sort entries by frequency (null → 999999), applying limit after sort
    sorted_entries = sorted(words.items(), key=lambda item: item[1].get("frequency") or 999_999)
    if limit:
        sorted_entries = sorted_entries[:limit]

    deck = genanki.Deck(VOCAB_DECK_ID, "Hebrew Vocabulary")
    media_files: list[Path] = []
    media_seen: set[Path] = set()  # O(1) duplicate check; the list keeps the order
    seen_words: set[tuple[str, str]] = set()

    def _track_media(path: Path) -> None:
        if path not in media_seen:
            media_seen.add(path)
            media_files.append(path)

    for _unique_key, entry in sorted_entries:
        word_nikkud = entry["word"]["nikkud"]
        meaning = _vocab_clean_meaning(entry.get("meaning", "") or "")
        if not word_nikkud or not meaning:
            continue

        # Skip exact duplicates (same word AND same meaning)
        word_meaning_key = (word_nikkud, meaning)
        if word_meaning_key in seen_words:
            logger.debug(" Skipping duplicate word+meaning: %s", word_nikkud)
            continue
        seen_words.add(word_meaning_key)

        word_no_nik = entry["word"].get("ktiv_male", "")
        root = ".".join(entry.get("root") or [])
        pos_raw = entry.get("pos", "")
        pos_heb = entry.get("pos_hebrew", "")
        meaning_raw = entry.get("meaning_raw", "") or ""
        slug = entry.get("slug", "") or ""
        audio_file = entry.get("audio_file", "") or ""
        tags_str = entry.get("tags", "") or ""
        hint_str = entry.get("hint", "") or ""
        shared_roots_keys = entry.get("shared_roots") or []
        is_confusable = entry.get("confusable_group") is not None
        freq_display = _vocab_frequency_label(entry.get("frequency") or 999_999)

        # Emoji: use entry's emoji if emoji_visible, else fall back to emoji_lookup
        emoji_str = ""
        if entry.get("emoji_visible") and entry.get("emoji"):
            emoji_str = entry["emoji"]
        elif emoji_lookup:
            meaning_clean_for_emoji = EMOJI_RE.sub("", meaning).strip()
            # Only the first five words of the meaning are tried
            for kw in re.sub(r"[^\w\s]", " ", meaning_clean_for_emoji.lower()).split()[:5]:
                if len(kw) > 2 and kw not in _EMOJI_STOP and kw in emoji_lookup:
                    emoji_str = emoji_lookup[kw]
                    break

        # Extract Hebrew prepositions from meaning_raw
        prep_str = " ".join(f"({p})" for p in HBPAREN_RE.findall(meaning_raw))

        # Audio — use audio_file from entry; for confusables it's already slug-based
        audio_tag = ""
        if include_audio:
            mp3_path = AUDIO_DIR / audio_file if audio_file else None
            if mp3_path is not None and mp3_path.exists():
                audio_tag = f"[sound:{audio_file}]"
                _track_media(mp3_path)
            else:
                # Fallback: slug-based (confusables only) or consonant-based filename
                audio_tag = _audio_tag(word_no_nik, slug=slug if is_confusable else "")
                if audio_tag:
                    mp3_name = audio_tag.removeprefix("[sound:").removesuffix("]")
                    _track_media(AUDIO_DIR / mp3_name)

        # Example sentence from vetted examples
        example_html = ""
        examples = entry.get("examples") or {}
        if examples.get("vetted"):
            example_html = examples["vetted"][0]["text"]

        # Cloze: use pre-computed cloze from words.json
        cloze_example = ""
        cloze_hint = ""
        if not is_confusable and examples.get("cloze"):
            cloze_data = examples["cloze"]
            cloze_text = cloze_data.get("text", "")
            start = cloze_data.get("cloze_word_start")
            end = cloze_data.get("cloze_word_end")
            if cloze_text and start is not None and end is not None:
                # Preserve Hebrew prefix letters in the cloze blank
                # e.g. "בַּתּוֹר" for word "תּוֹר" → "בַּ_____" not "_____"
                cloze_token = cloze_text[start:end]
                prefix_chars = _cloze_prefix_len(cloze_token, word_nikkud)
                cloze_example = cloze_text[: start + prefix_chars] + "_____" + cloze_text[end:]
                # Clean up duplicate adjacent quotation marks (e.g. "" → ")
                cloze_example = re.sub(r'["״]\s*["״]', '"', cloze_example)
                # The hint is only set together with the sentence, so a note
                # never carries a hint without a cloze sentence.
                raw_hint = cloze_data.get("cloze_hint") or ""
                if raw_hint:
                    cloze_hint = raw_hint
                else:
                    pos_cat = _categorize_pos(pos_raw) if pos_raw else "Other"
                    cloze_hint = meaning
                    if pos_cat == "Verb" and pos_heb:
                        cloze_hint = f"{meaning} ({pos_heb})"

        # Related words (shared roots) as a table: word — meaning, sorted by frequency
        related_html = ""
        if shared_roots_keys:
            related_html = _vocab_related_words_html(words, shared_roots_keys)

        # Plural form and gender (nouns only)
        plural_str = ""
        gender_str = ""
        if pos_raw.startswith("Noun"):
            noun_inflection = entry.get("noun_inflection")
            if noun_inflection:
                if noun_inflection.get("plural"):
                    plural_str = noun_inflection["plural"].get("nikkud", "")
                gender_raw = noun_inflection.get("gender") or ""
                if gender_raw == "masculine":
                    gender_str = "זָכָר"
                elif gender_raw == "feminine":
                    gender_str = "נְקֵבָה"

        # Image
        image_tag = ""
        if include_images:
            image_filename = entry.get("image") or ""
            if image_filename:
                image_path = images_dir / image_filename
                if image_path.exists():
                    image_tag = image_filename
                    _track_media(image_path)

        # GUID: use vocab_legacy_guid from entry, fall back to deterministic
        legacy_guid = entry.get("vocab_legacy_guid")
        note_guid = legacy_guid or genanki.guid_for(word_nikkud, meaning)

        note = genanki.Note(
            model=VOCAB_MODEL,
            guid=note_guid,
            # Order must match the field list of VOCAB_MODEL
            fields=[
                word_nikkud,
                root,
                pos_heb,
                meaning,
                word_no_nik,
                related_html or "",
                tags_str,
                audio_tag,
                example_html,
                freq_display,
                image_tag,
                emoji_str,
                prep_str,
                hint_str,
                plural_str,
                gender_str,
                cloze_example,
                cloze_hint,
            ],
            # The band name is the first word of the label ("Unlisted" included)
            tags=[*tags_str.split(), RELEASE_TAG, f"freq::{freq_display.split()[0]}"],
        )
        deck.add_note(note)

    # Diagnostics: how many notes have each optional field filled in
    # (field index in the note, log message)
    field_reports = (
        (11, " Emoji extracted: %d words"),
        (12, " Hebrew prepositions extracted: %d words"),
        (13, " Eng→Heb hints: %d words"),
        (14, " Noun plurals on vocab cards: %d words"),
        (15, " Noun gender on vocab cards: %d words"),
        (16, " Sentence cloze cards: %d words"),
    )
    for field_index, message in field_reports:
        filled = sum(1 for n in deck.notes if n.fields[field_index])
        if filled:
            logger.info(message, filled)
    unlisted = sum(1 for _, e in sorted_entries if (e.get("frequency") or 999_999) >= 999_999)
    logger.info(" Unlisted words (not in frequency corpus): %d/%d", unlisted, len(sorted_entries))
    logger.info("Vocabulary deck: %d notes", len(deck.notes))
    return deck, media_files
def build_conj_deck(
words: dict[str, dict],
audio_dir: Path = AUDIO_CONJ_DIR,
include_audio: bool = True,
) -> tuple[genanki.Deck, list[Path]]:
"""Build the conjugation drill deck from words with in_conjugation_deck=True."""
deck = genanki.Deck(CONJ_DECK_ID, "Hebrew Conjugations")
media_files: list[Path] = []
note_count = 0
verb_count = 0
# Build root → [(freq, nikkud, meaning)] lookup for cross-linking
root_words: dict[str, list[tuple[int, str, str]]] = {}
for entry in words.values():
root_list = entry.get("root") or []
root_key = " ".join(root_list)
if root_key:
rw_meaning = entry.get("meaning") or ""
if len(rw_meaning) > 40:
rw_meaning = rw_meaning[:37] + ""
rw_freq = entry.get("frequency") or 999999
root_words.setdefault(root_key, []).append((rw_freq, entry["word"]["nikkud"], rw_meaning))
for _unique_key, entry in words.items():
conj = entry.get("conjugation")
if not conj or not conj.get("in_conjugation_deck"):
continue
active_forms_list = conj.get("active_forms") or []
if not active_forms_list:
continue
verb_count += 1
infinitive = conj["infinitive"]["nikkud"]
ref_form = conj["reference_form"]["nikkud"]
binyan = conj.get("binyan", "")
binyan_heb: str = conj.get("binyan_hebrew") or BINYAN_TO_HEBREW.get(binyan, binyan) or ""
slug = entry.get("slug", "") or ""
root_list = entry.get("root") or []
root = ".".join(root_list)
voice = VOICE_MAP.get(binyan, "")
meaning_raw = entry.get("meaning_raw", "") or ""
meaning = entry.get("meaning", "") or ""
# Extract Hebrew preposition — strip from meaning, show on Hebrew side
prep_str = ""
conj_prep = conj.get("prep")
if conj_prep:
# Strip any parentheses from stored prep value
prep_str = conj_prep.strip("() ")
elif meaning_raw:
preps = HBPAREN_RE.findall(meaning_raw)
if preps:
prep_str = preps[0]
# Strip Hebrew prepositions from English meaning to avoid duplication
if prep_str:
meaning = HBPAREN_RE.sub("", meaning).strip()
# Also strip from meaning_raw patterns like "(על)"
meaning = re.sub(r"\(\s*" + re.escape(prep_str) + r"\s*-?\s*\)", "", meaning).strip()
# Clean up double spaces and trailing commas
meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ")
related = [(f, w, m) for f, w, m in root_words.get(root, []) if w != infinitive]
if related:
related.sort(key=lambda x: x[0])
related_rows = []
for _freq, rw_nikkud, rw_meaning in related[:8]:
related_rows.append(
f'<div class="sec-label">'
f'<span class="rw-word">{rw_nikkud}</span>'
f'<span class="rw-meaning">{rw_meaning}</span>'
f"</div>"
)
related_str = "\n".join(related_rows)
else:
related_str = ""
forms = _forms_list_to_dict(active_forms_list)
def add_note(
pronoun: str,
tense: str,
conj_form: str,
audio_tag: str,
_form_key_for_guid: str,
guid_val: str | None = None,
guid_candidates: list[str] | None = None,
*,
_infinitive: str = infinitive,
_ref_form: str = ref_form,
_root: str = root,
_binyan_heb: str = binyan_heb,
_voice: str = voice,
_meaning: str = meaning,
_related_str: str = related_str,
_prep_str: str = prep_str,
) -> None:
nonlocal note_count
if not conj_form or not re.search(r"[\u05d0-\u05ea]", conj_form):
return
# Apply tense prefix (בְּ)
display_tense = TENSE_WITH_BE.get(tense, tense)
# GUID: use stored guid, then first candidate, then deterministic fallback
if guid_val:
note_guid = guid_val
elif guid_candidates:
note_guid = guid_candidates[0]
else:
note_guid = genanki.guid_for(_infinitive, pronoun, tense, _binyan_heb)
note = genanki.Note(
model=CONJ_MODEL,
guid=note_guid,
fields=[
_infinitive,
_ref_form,
pronoun,
display_tense,
conj_form,
_root,
_binyan_heb,
_voice,
audio_tag,
_meaning,
_related_str,
_prep_str,
],
tags=[RELEASE_TAG],
)
deck.add_note(note)
note_count += 1
# Seeded RNG per verb — deterministic pronoun/gender choices
verb_rng = random.Random(hash(infinitive) & 0xFFFFFFFF)
for form_key, form_data in forms.items():
primary_form = form_data.get("form", "")
conj_form = primary_form
# Infinitive: shown on card front as reference — skip as a quiz form
if form_key == "infinitive":
continue
# Audio tag
audio_tag = ""
if include_audio and slug:
audio_tag = _conj_audio_tag(slug, form_key)
if audio_tag:
mp3_path = audio_dir / f"{slug}_{form_key}.mp3"
if mp3_path not in media_files:
media_files.append(mp3_path)
guid_val = form_data.get("guid")
guid_candidates = form_data.get("guid_candidates")
# Present tense expansion: 4 form keys → 1 card each (seeded RNG)
if form_key in PRESENT_EXPANSION:
chosen = verb_rng.choice(PRESENT_EXPANSION[form_key])
add_note(chosen[0], chosen[1], conj_form, audio_tag, form_key, guid_val, guid_candidates)
continue
# Past 3rd plural: same form for m/f → 1 card (seeded RNG)
if form_key == "past_3p":
chosen = verb_rng.choice(PAST_3P_EXPANSION)
add_note(chosen[0], chosen[1], conj_form, audio_tag, form_key, guid_val, guid_candidates)
continue
# 2fp/3fp future and imperative: show modern (mp) form + classical (fp) in parens
if form_key in FP_MODERN_FALLBACK:
mp_key = FP_MODERN_FALLBACK[form_key]
mp_form = forms.get(mp_key, {}).get("form", "")
fp_form = conj_form
display_form = f"{mp_form} ({fp_form})" if mp_form and mp_form != fp_form else fp_form
pronoun = form_data.get("pronoun", "")
tense = form_data.get("tense", "")
add_note(pronoun, tense, display_form, audio_tag, form_key, guid_val, guid_candidates)
continue
# Standard card
pronoun = form_data.get("pronoun", "")
tense = form_data.get("tense", "")
# 1st-person forms get a randomly assigned gender label (deterministic per verb)
if form_key in {"past_1s", "past_1p", "future_1s", "future_1p"}:
gender = verb_rng.choice(["זָכָר", "נְקֵבָה"])
pronoun = f"{pronoun} ({gender})"
add_note(pronoun, tense, conj_form, audio_tag, form_key, guid_val, guid_candidates)
# Passive partner forms (Huf'al/Pu'al counterpart)
hufal_forms_list = conj.get("hufal_pual_forms")
if hufal_forms_list:
ref_passive = conj.get("reference_form_passive")
ref_form_passive = ref_passive["nikkud"] if ref_passive else ref_form
passive_binyan = "Huf'al" if binyan == "Hif'il" else "Pu'al"
passive_binyan_heb = BINYAN_TO_HEBREW.get(passive_binyan, passive_binyan)
passive_voice = VOICE_MAP.get(passive_binyan, "סָבִיל")
passive_forms = _forms_list_to_dict(hufal_forms_list)
for form_key, form_data in passive_forms.items():
primary_form = form_data.get("form", "")
conj_form = primary_form
if form_key == "infinitive":
continue
audio_tag = ""
if include_audio and slug:
passive_audio_key = f"passive_{form_key}"
audio_tag = _conj_audio_tag(slug, passive_audio_key)
if audio_tag:
mp3_path = audio_dir / f"{slug}_{passive_audio_key}.mp3"
if mp3_path not in media_files:
media_files.append(mp3_path)
guid_val = form_data.get("guid")
guid_candidates = form_data.get("guid_candidates")
if form_key in PRESENT_EXPANSION:
chosen = verb_rng.choice(PRESENT_EXPANSION[form_key])
add_note(
chosen[0],
chosen[1],
conj_form,
audio_tag,
form_key,
guid_val,
guid_candidates,
_ref_form=ref_form_passive,
_binyan_heb=passive_binyan_heb,
_voice=passive_voice,
)
continue
if form_key == "past_3p":
chosen = verb_rng.choice(PAST_3P_EXPANSION)
add_note(
chosen[0],
chosen[1],
conj_form,
audio_tag,
form_key,
guid_val,
guid_candidates,
_ref_form=ref_form_passive,
_binyan_heb=passive_binyan_heb,
_voice=passive_voice,
)
continue
if form_key in FP_MODERN_FALLBACK:
mp_key = FP_MODERN_FALLBACK[form_key]
mp_form = passive_forms.get(mp_key, {}).get("form", "")
fp_form = conj_form
display_form = f"{mp_form} ({fp_form})" if mp_form and mp_form != fp_form else fp_form
pronoun = form_data.get("pronoun", "")
tense = form_data.get("tense", "")
add_note(
pronoun,
tense,
display_form,
audio_tag,
form_key,
guid_val,
guid_candidates,
_ref_form=ref_form_passive,
_binyan_heb=passive_binyan_heb,
_voice=passive_voice,
)
continue
pronoun = form_data.get("pronoun", "")
tense = form_data.get("tense", "")
if form_key in {"past_1s", "past_1p", "future_1s", "future_1p"}:
gender = verb_rng.choice(["זָכָר", "נְקֵבָה"])
pronoun = f"{pronoun} ({gender})"
add_note(
pronoun,
tense,
conj_form,
audio_tag,
form_key,
guid_val,
guid_candidates,
_ref_form=ref_form_passive,
_binyan_heb=passive_binyan_heb,
_voice=passive_voice,
)
logger.info(f"Conjugation deck: {note_count} notes across {verb_count} verbs")
return deck, media_files
# ──────────────────────────────────────────────────────────────────────────────
# Confusables deck — words that look identical without nikkud
# ──────────────────────────────────────────────────────────────────────────────
# Question side of a confusables card: the look-alike spellings ({{Words}})
# above a fixed Hebrew prompt asking what the difference is.
CONF_FRONT = """
<div style="direction:rtl; text-align:center;">
<div class="hebrew" style="font-size:36px;">{{Words}}</div>
<div class="meaning" style="font-size:32px; direction:rtl; text-align:center;">מה ההבדל?</div>
</div>
"""
# Answer side: repeats the front, then shows the {{Definitions}} HTML.
# The {{#Audio}}…{{/Audio}} section only renders when the Audio field is non-empty.
CONF_BACK = """
{{FrontSide}}<hr>
<div class="definitions">{{Definitions}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
"""
# Confusables cards reuse the shared card stylesheet unchanged.
CONF_CSS = CARD_CSS
# Note type for the confusables deck. build_confusables_deck fills the fields
# positionally, so the order here must match the `fields=[...]` list it passes:
# Words, Definitions, Audio, WordNoNikkud.
CONF_MODEL = genanki.Model(
    CONF_MODEL_ID,
    "Hebrew Confusables",
    fields=[
        {"name": "Words"},
        {"name": "Definitions"},
        {"name": "Audio"},
        {"name": "WordNoNikkud"},
    ],
    templates=[
        {
            "name": "Confusable",
            "qfmt": CONF_FRONT,
            "afmt": CONF_BACK,
        },
    ],
    css=CONF_CSS,
)
def build_confusables_deck(
    words: dict[str, dict],
    include_audio: bool = True,
) -> tuple[genanki.Deck, list[Path]]:
    """Build the confusables deck: one note per group of look-alike words.

    Entries whose ``confusable_group`` is not ``None`` are bucketed by their
    ``confusables_guid`` (or, when that is missing, a GUID derived from the
    entry's ``ktiv_male``). Buckets are emitted in ascending order of their
    mean ``frequency`` value. A bucket only yields a note when it holds at
    least two entries that differ in (nikkud, meaning).

    Args:
        words: Mapping of unique key to word entry.
        include_audio: When true, collect one ``[sound:...]`` tag per group
            member and record the corresponding mp3 path.

    Returns:
        ``(deck, media_files)``. Paths in ``media_files`` are not guaranteed
        to exist; the ``write_*`` functions filter on existence.
    """
    logger.info("Building confusables deck …")
    deck = genanki.Deck(CONF_DECK_ID, "Hebrew Confusables")
    media_files: list[Path] = []
    note_count = 0

    # The shared GUID is the stable note identity: every member of a group
    # carries the same confusables_guid, so it doubles as the grouping key.
    guid_to_entries: dict[str, list[dict]] = {}
    for unique_key, entry in words.items():
        if entry.get("confusable_group") is None:
            continue
        guid = entry.get("confusables_guid")
        if not guid:
            # Fall back to ktiv_male-based guid
            guid = genanki.guid_for("confusable", entry["word"].get("ktiv_male", unique_key))
        guid_to_entries.setdefault(guid, []).append(entry)

    for guid, group_entries in sorted(
        guid_to_entries.items(),
        key=lambda x: sum(e.get("frequency") or 999_999 for e in x[1]) / len(x[1]),
    ):
        if len(group_entries) < 2:
            continue

        # Deduplicate: skip entries with identical word+meaning.
        # A null meaning is treated as "" so it never reaches the card as "None".
        seen: set[tuple[str, str]] = set()
        unique_entries: list[dict] = []
        for e in group_entries:
            key = (e["word"]["nikkud"], e.get("meaning") or "")
            if key not in seen:
                seen.add(key)
                unique_entries.append(e)
        if len(unique_entries) < 2:
            continue

        # Note fields must be strings, so a null ktiv_male becomes "".
        word_no_nik = unique_entries[0]["word"].get("ktiv_male") or ""
        words_display = " / ".join(e["word"]["nikkud"] for e in unique_entries)

        defs_parts: list[str] = []
        audio_parts: list[str] = []
        for e in unique_entries:
            w = e["word"]["nikkud"]
            m = e.get("meaning") or ""
            p = e.get("pos_hebrew") or ""
            pos_div = f'<div style="font-size:18px; color:#888;">{p}</div>' if p else ""
            defs_parts.append(
                f'<div class="conf-entry">'
                f'<span class="hebrew" style="font-size:24px;">{w}</span>'
                f'<div style="direction:ltr; text-align:center; font-size:22px;">{m}</div>'
                f"{pos_div}</div>"
            )
            if include_audio:
                # Prefer the entry's own audio_file when it is on disk;
                # otherwise ask _audio_tag to resolve one from ktiv_male/slug.
                af = e.get("audio_file", "") or ""
                at = ""
                if af:
                    mp3_path = AUDIO_DIR / af
                    if mp3_path.exists():
                        at = f"[sound:{af}]"
                if not at:
                    slug = e.get("slug", "") or ""
                    ktiv_male = e.get("word", {}).get("ktiv_male", "") or ""
                    at = _audio_tag(ktiv_male, slug=slug)
                # Members that resolve to the same clip contribute it once.
                if at and at not in audio_parts:
                    audio_parts.append(at)
                    mp3_name = at.removeprefix("[sound:").removesuffix("]")
                    mp3_path = AUDIO_DIR / mp3_name
                    if mp3_path not in media_files:
                        media_files.append(mp3_path)

        defs_html = "\n".join(defs_parts)
        audio_html = " ".join(audio_parts)
        note = genanki.Note(
            model=CONF_MODEL,
            guid=guid,
            fields=[words_display, defs_html, audio_html, word_no_nik],
            tags=[RELEASE_TAG],
        )
        deck.add_note(note)
        note_count += 1

    logger.info(f"Confusables deck: {note_count} notes")
    return deck, media_files
def write_conf_apkg(
    deck: genanki.Deck,
    media_files: list[Path] | None = None,
    out_path: Path = CONF_APKG,
) -> None:
    """Package the confusables deck and write it to ``out_path``.

    The parent directory is created when missing. Media paths that do not
    exist on disk are left out of the package; the font files reported by
    ``_font_media_files()`` are appended after the remaining media.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    existing_media: list[str] = []
    for media_path in media_files or []:
        if media_path.exists():
            existing_media.append(str(media_path))

    package = genanki.Package(deck)
    package.media_files = existing_media + _font_media_files()
    package.write_to_file(str(out_path))
    logger.info(f"Confusables deck written → {out_path}")
# ──────────────────────────────────────────────────────────────────────────────
# Noun plurals deck — singular↔plural drilling
# ──────────────────────────────────────────────────────────────────────────────
# Card 1 front (singular → plural): singular form, its audio when the
# SingularAudio field is non-empty, the meaning, and a direction hint.
PLURAL_FRONT_SG = """
<div class="hebrew" style="color:#1a1a8c;">{{Singular}}</div>
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
<div class="sec-label">{{Meaning}}</div>
<div class="hint" style="font-size:28px;">יָחִיד ← רַבִּים</div>
"""
# Card 1 back: the plural form plus optional audio, then a small table with
# gender and mishkal rows, each rendered only when its field is non-empty.
PLURAL_BACK_SG = """
{{FrontSide}}<hr>
<div class="hebrew">{{Plural}}</div>
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
<div class="sec-table">
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span><span class="sec-val">{{Gender}}</span></div>{{/Gender}}
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span><span class="sec-val">{{Mishkal}}</span></div>{{/Mishkal}}
</div>
"""
# Card 2 front (plural → singular): plural form and optional audio only.
# The meaning is kept off this side and appears on the back instead.
PLURAL_FRONT_PL = """
<div class="hebrew" style="color:#1a1a8c;">{{Plural}}</div>
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
<div class="hint" style="font-size:28px;">רַבִּים ← יָחִיד</div>
"""
# Card 2 back: singular form, optional audio, the meaning, and the same
# gender/mishkal table as card 1.
PLURAL_BACK_PL = """
{{FrontSide}}<hr>
<div class="hebrew">{{Singular}}</div>
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
<div class="sec-label" style="text-align:center;display:block;">{{Meaning}}</div>
<div class="sec-table">
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span><span class="sec-val">{{Gender}}</span></div>{{/Gender}}
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span><span class="sec-val">{{Mishkal}}</span></div>{{/Mishkal}}
</div>
"""
# Plural cards reuse the shared card stylesheet unchanged.
PLURAL_CSS = CARD_CSS
# Note type for the plurals deck; each note yields two cards (one per template).
# build_plural_deck fills the fields positionally, so the order here must match
# the `fields=[...]` list it passes. Root is stored but not referenced by any
# of the four templates above.
PLURAL_MODEL = genanki.Model(
    PLURAL_MODEL_ID,
    "Hebrew Plurals",
    fields=[
        {"name": "Singular"},
        {"name": "SingularAudio"},
        {"name": "Plural"},
        {"name": "PluralAudio"},
        {"name": "Meaning"},
        {"name": "Root"},
        {"name": "Mishkal"},
        {"name": "Gender"},
    ],
    templates=[
        {
            "name": "Singular → Plural",
            "qfmt": PLURAL_FRONT_SG,
            "afmt": PLURAL_BACK_SG,
        },
        {
            "name": "Plural → Singular",
            "qfmt": PLURAL_FRONT_PL,
            "afmt": PLURAL_BACK_PL,
        },
    ],
    css=PLURAL_CSS,
)
def _is_irregular_plural(gender: str, plural_ktiv: str) -> bool:
"""Detect irregular plurals: masc nouns with ות- suffix, fem with ים- suffix.
Args:
gender: ``"masculine"`` or ``"feminine"``.
plural_ktiv: ktiv male (no nikkud) form of the plural.
"""
return (gender == "masculine" and plural_ktiv.endswith("ות")) or (
gender == "feminine" and plural_ktiv.endswith("ים")
)
def build_plural_deck(
    words: dict[str, dict],
    include_audio: bool = False,
) -> tuple[genanki.Deck, list[Path]]:
    """Build the noun plurals deck from entries carrying ``noun_inflection`` data.

    Selection: every irregular plural (per ``_is_irregular_plural``), plus
    regular nouns taken per mishkal pattern in ascending ``frequency`` order.
    The regular set is capped at twice the number of irregulars, so with no
    irregular plurals no regular exemplars are emitted either.

    Args:
        words: Mapping of unique key to word entry. Only entries whose ``pos``
            starts with ``"Noun"`` and that have both a singular and a plural
            nikkud form are considered.
        include_audio: When true, attach ``[sound:...]`` tags for the singular
            and plural forms and record their mp3 paths.

    Returns:
        ``(deck, media_files)`` — the populated deck and the mp3 paths
        referenced by its notes.
    """
    logger.info("Building plurals deck …")
    deck = genanki.Deck(PLURAL_DECK_ID, "Hebrew Plurals")
    media_files: list[Path] = []

    # Each candidate is (unique_key, entry, noun_inflection).
    irregulars: list[tuple[str, dict, dict]] = []
    by_mishkal: dict[str, list[tuple[str, dict, dict]]] = {}
    for unique_key, entry in words.items():
        # `or ""` keeps an explicit null pos from crashing .startswith().
        if not (entry.get("pos") or "").startswith("Noun"):
            continue
        noun_inflection = entry.get("noun_inflection")
        if not noun_inflection:
            continue
        singular_data = noun_inflection.get("singular")
        plural_data = noun_inflection.get("plural")
        if not singular_data or not plural_data:
            continue
        if not singular_data.get("nikkud", "") or not plural_data.get("nikkud", ""):
            continue
        # Null gender / ktiv_male are normalised so the suffix test cannot
        # hit .endswith() on None.
        gender = noun_inflection.get("gender") or ""
        plural_ktiv = plural_data.get("ktiv_male") or ""
        mishkal = noun_inflection.get("mishkal") or ""
        candidate = (unique_key, entry, noun_inflection)
        if _is_irregular_plural(gender, plural_ktiv):
            irregulars.append(candidate)
        elif mishkal:
            # Regular nouns without a mishkal are dropped from the deck.
            by_mishkal.setdefault(mishkal, []).append(candidate)

    def _frequency(candidate: tuple[str, dict, dict]) -> int:
        """Sort key: the entry's frequency value, missing/zero sorted last."""
        return candidate[1].get("frequency") or 999_999

    # Select regular exemplars to achieve a 2:1 regular:irregular ratio.
    # Distribute evenly across mishkal patterns, lowest frequency value first.
    irregular_count = len(irregulars)
    target_regular = irregular_count * 2
    per_mishkal = max(2, target_regular // (len(by_mishkal) or 1))
    regular_pool: list[tuple[str, dict, dict]] = []
    for _mishkal, entries in sorted(by_mishkal.items()):
        entries.sort(key=_frequency)
        regular_pool.extend(entries[:per_mishkal])
    # The per-mishkal minimum of 2 can overshoot the target; trim globally.
    if len(regular_pool) > target_regular:
        regular_pool.sort(key=_frequency)
        regular_pool = regular_pool[:target_regular]
    selected = irregulars + regular_pool

    note_count = 0
    for _unique_key, entry, noun_inflection in selected:
        singular = noun_inflection["singular"]["nikkud"]
        singular_ktiv = noun_inflection["singular"].get("ktiv_male", "")
        plural = noun_inflection["plural"]["nikkud"]
        plural_ktiv = noun_inflection["plural"].get("ktiv_male") or ""
        gender = noun_inflection.get("gender") or ""
        gender_heb = {"masculine": "זָכָר", "feminine": "נְקֵבָה"}.get(gender, gender)
        mishkal_heb = noun_inflection.get("mishkal_hebrew") or ""
        mishkal_eng = noun_inflection.get("mishkal") or ""
        meaning = EMOJI_RE.sub("", entry.get("meaning") or "").strip()
        root = ".".join(entry.get("root") or [])

        # Prefer the stored GUID so re-imports update existing notes.
        note_guid = noun_inflection.get("plurals_guid") or genanki.guid_for("plural", singular, meaning)

        sg_audio = ""
        pl_audio = ""
        if include_audio:
            sg_tag = _audio_tag(singular_ktiv)
            if sg_tag:
                sg_audio = sg_tag
                sg_mp3 = AUDIO_DIR / sg_tag.removeprefix("[sound:").removesuffix("]")
                if sg_mp3 not in media_files:
                    media_files.append(sg_mp3)
            # The plural tag is attached only when its mp3 is actually on
            # disk, so a noun without plural audio keeps PluralAudio empty.
            pl_tag = _audio_tag(plural_ktiv) if plural_ktiv else ""
            if pl_tag:
                pl_mp3 = AUDIO_DIR / pl_tag.removeprefix("[sound:").removesuffix("]")
                if pl_mp3.exists():
                    pl_audio = pl_tag
                    if pl_mp3 not in media_files:
                        media_files.append(pl_mp3)

        tags = [RELEASE_TAG]
        if mishkal_eng:
            tags.append(f"mishkal::{mishkal_eng}")
        if _is_irregular_plural(gender, plural_ktiv):
            tags.append("irregular")

        note = genanki.Note(
            model=PLURAL_MODEL,
            guid=note_guid,
            fields=[
                singular,
                sg_audio,
                plural,
                pl_audio,
                meaning,
                root,
                mishkal_heb,
                gender_heb,
            ],
            tags=tags,
        )
        deck.add_note(note)
        note_count += 1

    regular_count = note_count - irregular_count
    logger.info(
        f"Plurals deck: {note_count} notes "
        f"({irregular_count} irregular + {regular_count} regular exemplars "
        f"from {len(by_mishkal)} mishkal patterns)"
    )
    return deck, media_files
def write_plural_apkg(
    deck: genanki.Deck,
    media_files: list[Path] | None = None,
    out_path: Path = PLURAL_APKG,
) -> None:
    """Package the plurals deck and write it to ``out_path``.

    The parent directory is created when missing. Media paths that are not
    present on disk are skipped; font files from ``_font_media_files()`` are
    appended after the remaining media.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    candidates = media_files or []
    on_disk = [str(candidate) for candidate in candidates if candidate.exists()]

    package = genanki.Package(deck)
    package.media_files = on_disk + _font_media_files()
    package.write_to_file(str(out_path))
    logger.info(f"Plurals deck written → {out_path}")
def _font_media_files() -> list[str]:
    """Collect the Heebo font files to bundle in an .apkg, as path strings.

    Matches ``_Heebo*.ttf`` directly inside ``FONTS_DIR`` and keeps only the
    matches that exist on disk.
    """
    return [str(font) for font in FONTS_DIR.glob("_Heebo*.ttf") if font.exists()]
class _RandomOrderPackage(genanki.Package):
    """genanki.Package variant that switches new-card order to random.

    After the base class has written the collection, every deck config in
    ``col.dconf`` that has a ``new`` section gets ``new.order`` set to 0
    (random) instead of 1 (insertion order).
    """

    def write_to_db(self, cursor, timestamp, id_gen):
        super().write_to_db(cursor, timestamp, id_gen)

        dconf_row = cursor.execute("SELECT dconf FROM col").fetchone()
        if not dconf_row:
            return

        deck_configs = json.loads(dconf_row[0])
        for deck_config in deck_configs.values():
            # Skip anything that is not a config dict with a "new" section.
            if not isinstance(deck_config, dict) or "new" not in deck_config:
                continue
            deck_config["new"]["order"] = 0
        cursor.execute("UPDATE col SET dconf = ?", [json.dumps(deck_configs)])
def write_vocab_apkg(
    deck: genanki.Deck,
    media_files: list[Path],
    out_path: Path = VOCAB_APKG,
) -> None:
    """Package the vocabulary deck and write it to ``out_path``.

    The parent directory is created when missing. Only media paths that exist
    on disk are bundled, followed by the fonts from ``_font_media_files()``.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    bundled = [str(media_path) for media_path in media_files if media_path.exists()]
    bundled.extend(_font_media_files())

    # Plain Package on purpose: insertion order = frequency rank (new.order=1 default)
    package = genanki.Package(deck)
    package.media_files = bundled
    package.write_to_file(str(out_path))
    logger.info(f"Vocabulary deck written → {out_path}")
def write_conj_apkg(
    deck: genanki.Deck,
    media_files: list[Path] | None = None,
    out_path: Path = CONJ_APKG,
) -> None:
    """Package the conjugation deck and write it to ``out_path``.

    Uses ``_RandomOrderPackage`` so new cards are presented in random order.
    The parent directory is created when missing. Media paths that are not on
    disk are skipped, and fonts from ``_font_media_files()`` are appended.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    available = [str(item) for item in (media_files if media_files else []) if item.exists()]

    package = _RandomOrderPackage(deck)
    package.media_files = available + _font_media_files()
    package.write_to_file(str(out_path))
    logger.info(f"Conjugation deck written → {out_path}")
def build_complete_deck(
    words: dict[str, dict],
    limit: int | None = None,
    include_audio: bool = False,
    emoji_lookup: dict | None = None,
) -> tuple[list[genanki.Deck], list[Path]]:
    """Build the four ``Hebrew::*`` subdecks for the combined .apkg.

    Each standalone builder is run once; its notes are then re-homed into a
    fresh deck that carries the subdeck name and the ``COMPLETE_*`` deck ID.
    The vocabulary deck is always built with images enabled.

    Returns:
        ``(decks, media_files)`` — decks in the order vocab, conj, conf,
        plural, and the media list with duplicates removed.
    """
    logger.info(f" Building complete deck (audio={'yes' if include_audio else 'no'}) …")

    def _as_subdeck(deck_id: int, name: str, source: genanki.Deck) -> genanki.Deck:
        """Create a deck with the given id/name holding ``source``'s notes."""
        subdeck = genanki.Deck(deck_id, name)
        for note in source.notes:
            subdeck.add_note(note)
        return subdeck

    # Run the standalone builders in the same order as the subdecks are listed.
    vocab_src, vocab_media = build_vocab_deck(
        words,
        limit=limit,
        include_audio=include_audio,
        include_images=True,
        emoji_lookup=emoji_lookup,
    )
    conj_src, conj_media = build_conj_deck(words, include_audio=include_audio)
    conf_src, conf_media = build_confusables_deck(words, include_audio=include_audio)
    plural_src, plural_media = build_plural_deck(words, include_audio=include_audio)

    complete_vocab = _as_subdeck(COMPLETE_VOCAB_DECK_ID, "Hebrew::Vocabulary", vocab_src)
    complete_conj = _as_subdeck(COMPLETE_CONJ_DECK_ID, "Hebrew::Conjugations", conj_src)
    complete_conf = _as_subdeck(COMPLETE_CONF_DECK_ID, "Hebrew::Confusables", conf_src)
    complete_plural = _as_subdeck(COMPLETE_PLURAL_DECK_ID, "Hebrew::Plurals", plural_src)

    # Deduplicate media by resolved path; paths missing on disk are compared
    # by their literal string instead. First occurrence wins.
    seen_paths: set[str] = set()
    all_media: list[Path] = []
    for media_path in vocab_media + conj_media + conf_media + plural_media:
        identity = str(media_path.resolve() if media_path.exists() else media_path)
        if identity in seen_paths:
            continue
        seen_paths.add(identity)
        all_media.append(media_path)

    plural_part = f" + {len(complete_plural.notes)} plural"
    logger.info(
        f" Complete deck: {len(complete_vocab.notes)} vocab + "
        f"{len(complete_conj.notes)} conj + {len(complete_conf.notes)} conf{plural_part} notes, "
        f"{len(all_media)} media files"
    )
    return [complete_vocab, complete_conj, complete_conf, complete_plural], all_media
def write_complete_apkg(
    decks: list[genanki.Deck],
    media_files: list[Path],
    out_path: Path = COMPLETE_APKG,
) -> None:
    """Write a combined .apkg containing all of ``decks`` as subdecks.

    The parent directory is created when missing. Only media paths that exist
    on disk are bundled, followed by the fonts from ``_font_media_files()``.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    present: list[str] = []
    for media_path in media_files:
        if media_path.exists():
            present.append(str(media_path))

    package = genanki.Package(decks)
    package.media_files = present + _font_media_files()
    package.write_to_file(str(out_path))
    logger.info(f"Complete deck written → {out_path}")
def build_all_variants(
    words: dict[str, dict],
    limit: int | None = None,
) -> None:
    """Build and write all 12 release variants.

    Variants, in build order: 4 vocabulary (audio × images), 2 conjugation,
    2 confusables, 2 plural (each without/with audio), and 2 combined
    ``Hebrew::*`` packages (without/with audio).

    Args:
        words: Mapping of unique key to word entry, passed to every builder.
        limit: Forwarded to ``build_vocab_deck`` and ``build_complete_deck``;
            the other builders do not take it.
    """
    logger.info("Building all release variants …")
    emoji_lookup = _load_emoji_lookup()
    logger.info(f" Emoji lookup: {len(emoji_lookup)} keywords loaded")

    def _yes_no(flag: bool) -> str:
        """Render a boolean option for the progress log."""
        return "yes" if flag else "no"

    vocab_variants = [
        (False, False, VOCAB_APKG),
        (True, False, VOCAB_APKG_AUDIO),
        (False, True, VOCAB_APKG_IMAGES),
        (True, True, VOCAB_APKG_AUDIO_IMAGES),
    ]
    for audio, images, path in vocab_variants:
        label = f"audio={_yes_no(audio)} images={_yes_no(images)}"
        # Separator keeps the option label and file name from running together.
        logger.info(f" Vocab variant: {label} → {path.name}")
        deck, media = build_vocab_deck(
            words,
            limit=limit,
            include_audio=audio,
            include_images=images,
            emoji_lookup=emoji_lookup,
        )
        write_vocab_apkg(deck, media, out_path=path)

    # Decks whose only option is audio share one build/write shape:
    # (log name, builder, writer, [(include_audio, output path), ...]).
    audio_only_variants = [
        ("Conj", build_conj_deck, write_conj_apkg, [(False, CONJ_APKG), (True, CONJ_APKG_AUDIO)]),
        ("Conf", build_confusables_deck, write_conf_apkg, [(False, CONF_APKG), (True, CONF_APKG_AUDIO)]),
        ("Plural", build_plural_deck, write_plural_apkg, [(False, PLURAL_APKG), (True, PLURAL_APKG_AUDIO)]),
    ]
    for kind, build, write, variants in audio_only_variants:
        for audio, path in variants:
            label = f"audio={_yes_no(audio)}"
            logger.info(f" {kind} variant: {label} → {path.name}")
            deck, media = build(words, include_audio=audio)
            write(deck, media, out_path=path)

    # Combined "Hebrew::*" complete decks
    complete_variants = [
        (False, COMPLETE_APKG),
        (True, COMPLETE_APKG_AUDIO),
    ]
    for audio, path in complete_variants:
        decks, media = build_complete_deck(
            words,
            limit=limit,
            include_audio=audio,
            emoji_lookup=emoji_lookup,
        )
        write_complete_apkg(decks, media, out_path=path)

    logger.info("All variants built.")
# Script entry point: builds a small vocabulary deck and the conjugation deck
# and writes each to the default out_path of its write_* function.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    words = _load_words()
    # limit=20 is forwarded to build_vocab_deck — presumably caps the deck at
    # 20 entries for a quick smoke run; confirm against build_vocab_deck.
    deck, media = build_vocab_deck(words, limit=20)
    write_vocab_apkg(deck, media)
    # build_conj_deck is called with its default options (no limit argument).
    conj_deck, conj_media = build_conj_deck(words)
    write_conj_apkg(conj_deck, conj_media)