hebrew_flash_cards/apkg_builder.py
Sochen 802c369365 v0.14: rescrape vocab, formatting fixes for all decks
- Full pealim.com rescrape: 9,120 words (15 new), all with audio URLs
- Plurals deck: 2:1 regular:irregular ratio (649 notes), RTL arrows, 1.6x hint text
- Conjugation deck: blue infinitive on front, plain meaning on back, nikkud labels
- Confusables deck: larger prompt text (32px), audio only when all words have it
- Validator: non-audio variants no longer false-fail on audio check
- 14 new audio files downloaded

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-07 09:26:41 +00:00

1867 lines
68 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Build Anki .apkg files for both the vocabulary deck and the conjugation deck.
Uses genanki for reliable, stable deck generation.
Deck IDs are hardcoded integers — same ID on re-import updates the existing deck
in Anki rather than creating a duplicate.
"""
import json
import logging
import random
import re
import unicodedata
from pathlib import Path
import genanki
import pandas as pd
from helpers import strip_nikkud as _strip_nikkud
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Stable deck/model IDs — do not change these. As the module docstring notes,
# re-importing a package with the same ID updates the existing deck in Anki
# instead of creating a duplicate.
VOCAB_DECK_ID = 1_234_567_890
VOCAB_MODEL_ID = 1_701_222_017_968  # matches Nevo's original Anki model
CONJ_DECK_ID = 1_234_567_892
CONJ_MODEL_ID = 1_234_567_893
CONF_DECK_ID = 1_234_567_894
CONF_MODEL_ID = 1_234_567_895
PLURAL_DECK_ID = 1_234_567_896
PLURAL_MODEL_ID = 1_234_567_897

# Subdeck IDs for combined "Hebrew::*" package — MUST differ from standalone IDs
COMPLETE_VOCAB_DECK_ID = 1_234_567_900
COMPLETE_CONJ_DECK_ID = 1_234_567_901
COMPLETE_CONF_DECK_ID = 1_234_567_902
COMPLETE_PLURAL_DECK_ID = 1_234_567_903

# Release version tag added to all notes so users can identify which release
# their cards come from (visible in Anki's Browse view and card info).
RELEASE_TAG = "v0.14"

# EMOJI_RE matches a run of one or more code points in U+1F000–U+1FFFF,
# U+2600–U+27FF, U+2300–U+23FF, or the variation selectors U+FE00–U+FE0F.
# It is used to pull emoji out of meaning strings.
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
# HBPAREN_RE matches a parenthesised run of characters in U+05B0–U+05EA or
# U+05F0–U+05F4 and captures the text between the parentheses. It is used to
# pull Hebrew prepositions such as "(…)" out of meaning strings.
HBPAREN_RE = re.compile(r"\(([\u05b0-\u05ea\u05f0-\u05f4]+)\)")
# Directory holding the data files this module reads (JSON caches, audio, …).
DATA_DIR = Path(__file__).parent / "data"

# Legacy GUID map from Nevo's original Anki deck (imported ~Jul 2025).
# Preserves study progress on reimport by reusing the same note GUIDs.
_LEGACY_GUID_PATH = DATA_DIR / "legacy_guid_map.json"
_LEGACY_GUIDS: dict[str, str] = {}
if _LEGACY_GUID_PATH.exists():
    # The map is looked up by Hebrew word (see _vocab_guid), so decode it
    # explicitly as UTF-8 rather than with the platform's locale encoding,
    # which is not UTF-8 on every system (e.g. cp1252 on Windows).
    with open(_LEGACY_GUID_PATH, encoding="utf-8") as _f:
        _LEGACY_GUIDS = json.load(_f)
def _vocab_guid(word: str, meaning: str = "") -> str:
    """Pick the note GUID for *word*, preferring entries of the legacy map.

    Lookup order in ``_LEGACY_GUIDS``:

    1. ``"<word>||<meaning prefix>"`` — only when *meaning* is non-empty; the
       prefix is the lower-cased, stripped meaning cut to 30 characters. This
       lets homographs (same word, different meanings) keep distinct GUIDs.
    2. the NFC-normalised word on its own.

    When neither key is present, a deterministic GUID is derived with
    ``genanki.guid_for`` from ``(word, meaning)``, or from ``word`` alone when
    *meaning* is empty.
    """
    normalized = unicodedata.normalize("NFC", word)

    candidate_keys = []
    if meaning:
        candidate_keys.append(f"{normalized}||{meaning.lower().strip()[:30]}")
    candidate_keys.append(normalized)

    for candidate in candidate_keys:
        try:
            return _LEGACY_GUIDS[candidate]
        except KeyError:
            continue

    # No legacy entry: fall back to a GUID computed from the raw inputs.
    if meaning:
        return genanki.guid_for(word, meaning)
    return genanki.guid_for(word)
# Media input directories (under DATA_DIR) and the package output directory.
AUDIO_DIR = DATA_DIR / "audio"  # default directory searched by _audio_tag
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"  # directory searched by _conj_audio_tag
OUTPUT_DIR = Path(__file__).parent / "output"

# Output .apkg paths, one per deck and variant.
# NOTE(review): the "_audio" / "_images" suffixes presumably mark variants that
# bundle the corresponding media — confirm against the code that writes them.
VOCAB_APKG = OUTPUT_DIR / "hebrew_vocabulary.apkg"
VOCAB_APKG_AUDIO = OUTPUT_DIR / "hebrew_vocabulary_audio.apkg"
VOCAB_APKG_IMAGES = OUTPUT_DIR / "hebrew_vocabulary_images.apkg"
VOCAB_APKG_AUDIO_IMAGES = OUTPUT_DIR / "hebrew_vocabulary_audio_images.apkg"
CONJ_APKG = OUTPUT_DIR / "hebrew_conjugations.apkg"
CONJ_APKG_AUDIO = OUTPUT_DIR / "hebrew_conjugations_audio.apkg"
CONF_APKG = OUTPUT_DIR / "hebrew_confusables.apkg"
CONF_APKG_AUDIO = OUTPUT_DIR / "hebrew_confusables_audio.apkg"
PLURAL_APKG = OUTPUT_DIR / "hebrew_plurals.apkg"
PLURAL_APKG_AUDIO = OUTPUT_DIR / "hebrew_plurals_audio.apkg"
COMPLETE_APKG = OUTPUT_DIR / "hebrew_complete.apkg"
COMPLETE_APKG_AUDIO = OUTPUT_DIR / "hebrew_complete_audio.apkg"
# ──────────────────────────────────────────────────────────────────────────────
# Binyan → Hebrew label mapping (for conjugation card display)
# ──────────────────────────────────────────────────────────────────────────────
# Keys are the English binyan names written with an ASCII apostrophe; values
# are the Hebrew labels with nikkud. _translate_pos also uses this table to
# append the binyan to a verb's part-of-speech label.
BINYAN_TO_HEBREW: dict[str, str] = {
    "Pa'al": "פָּעַל",
    "Nif'al": "נִפְעַל",
    "Pi'el": "פִּעֵל",
    "Pu'al": "פֻּעַל",
    "Hitpa'el": "הִתְפַּעֵל",
    "Hif'il": "הִפְעִיל",
    "Huf'al": "הֻפְעַל",
}
# ──────────────────────────────────────────────────────────────────────────────
# PoS → Hebrew label mapping
# ──────────────────────────────────────────────────────────────────────────────
# _translate_pos walks this dict in insertion order and returns on the first
# key that matches, so the order of entries is significant.
POS_TO_HEBREW: dict[str, str] = {
    "Noun": "שם עצם",
    "Verb": "פועל",
    "Adjective": "שם תואר",
    "Adverb": "תואר הפועל",
    "Preposition": "מילת יחס",
    "Conjunction": "מילת חיבור",
    "Pronoun": "כינוי גוף",
    "Particle": "מילית",
}
# PoS category groupings for related-words display.
# Keys double as the category names returned by _categorize_pos (which falls
# back to "Other"); values are the Hebrew group headings shown on the card.
POS_CATEGORY_LABELS: dict[str, str] = {
    "Verb": "פעלים",
    "Noun": "שמות עצם",
    "Adjective": "שמות תואר",
    "Adverb": "תוארי הפועל",
}
# ──────────────────────────────────────────────────────────────────────────────
# Shared CSS
# ──────────────────────────────────────────────────────────────────────────────
# NOTE(review): presumably the directory that holds the Heebo font files
# referenced by the @font-face rules below — confirm against the packaging code.
FONTS_DIR = DATA_DIR / "fonts"
# Stylesheet passed as `css=` to the note models in this file. It declares the
# 'Heebo' font faces, the classes the card templates use (.hebrew, .hebrew-sm,
# .meaning, .hint, .root-info, .example, .divider, .freq-badge, .voice-label,
# .sec-label, .sec-key, .related-group, .emoji-img) and dark-mode colour
# overrides under `prefers-color-scheme: dark`.
CARD_CSS = """
@font-face {
font-family: 'Heebo';
src: url('_Heebo-Regular.ttf');
font-weight: normal;
}
@font-face {
font-family: 'Heebo';
src: url('_Heebo-Bold.ttf');
font-weight: bold;
}
.card {
font-family: 'Heebo', 'Arial Hebrew', 'David', Arial, sans-serif;
font-size: 20px;
text-align: center;
color: #222;
background: #fff;
padding: 16px;
}
.hebrew {
font-size: 36px;
font-weight: bold;
direction: rtl;
text-align: center;
line-height: 1.5;
color: #222;
}
.hebrew-sm {
font-size: 24px;
font-weight: normal;
direction: rtl;
text-align: center;
color: #333;
}
.meaning {
font-size: 28px;
color: #1a1a8c;
margin: 8px 0;
}
.hint {
font-size: 16px;
color: #888;
margin: 4px 0;
direction: rtl;
}
.root-info {
font-size: 18px;
color: #555;
margin-top: 6px;
direction: rtl;
}
.example {
font-size: 18px;
color: #444;
direction: rtl;
text-align: right;
font-style: italic;
margin: 10px auto 0;
max-width: 90%;
border-right: 3px solid #aaa;
padding-right: 8px;
}
.divider { border-top: 1px solid #ddd; margin: 10px 0; }
.freq-badge {
display: inline-block;
font-size: 11px;
color: #aaa;
background: transparent;
border: 1px solid #eee;
border-radius: 10px;
padding: 2px 8px;
margin-top: 4px;
}
.voice-label {
font-size: 0.6em;
font-weight: normal;
color: #555;
}
.sec-label {
font-size: 32px;
color: #555;
direction: rtl;
text-align: center;
margin-top: 6px;
}
.sec-key {
font-size: 24px;
color: #888;
}
.related-group {
direction: rtl;
text-align: right;
margin: 2px 0;
font-size: 18px;
}
.emoji-img {
font-size: 3.5em;
text-align: center;
margin: 0.3em 0;
}
@media (prefers-color-scheme: dark) {
.card { color: #e8e8e8; background: #1c1c1e; }
.hebrew { color: #f0f0f0; }
.hebrew-sm { color: #ddd; }
.meaning { color: #82b0ff; }
.root-info { color: #aaa; }
.sec-label { color: #aaa; }
.sec-key { color: #666; }
.hint { color: #777; }
.voice-label { color: #888; }
.example { color: #bbb; border-right-color: #555; }
.divider { border-top-color: #333; }
.freq-badge { color: #888; border-color: #444; }
}
"""
# ──────────────────────────────────────────────────────────────────────────────
# Vocabulary Deck
# ──────────────────────────────────────────────────────────────────────────────
# Card templates use Anki's field syntax: {{Field}} inserts a field,
# {{#Field}}…{{/Field}} renders only when the field is non-empty and
# {{^Field}}…{{/Field}} only when it is empty.

# Hebrew → English, front: the word (plus its preposition, if any) and audio.
VOCAB_FRONT_HEB = """
<div class="hebrew">{{Word}}{{#Prep}} <span class="hebrew-sm">{{Prep}}</span>{{/Prep}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
"""
# Hebrew → English, back: meaning, then emoji (or the image when there is no
# emoji), root, part of speech, related words, plural, example and frequency.
VOCAB_BACK_HEB = """
{{FrontSide}}
<div class="divider"></div>
<div class="meaning">{{Meaning}}</div>
{{#Emoji}}<div class="emoji-img">{{Emoji}}</div>{{/Emoji}}
{{^Emoji}}{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;" onerror="this.parentElement.style.display='none'"></div>{{/Image}}{{/Emoji}}
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>{{/Root}}
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span> {{PoS}}</div>{{/PoS}}
{{#SharedRoots}}
<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
<div class="root-info">{{SharedRoots}}</div>
{{/SharedRoots}}
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
{{#Example}}
<div class="example">{{Example}}</div>
{{/Example}}
{{#Frequency}}<div class="freq-badge">#{{Frequency}}</div>{{/Frequency}}
"""
# English → Hebrew, front: meaning, optional disambiguation hint, and the
# emoji (or the image when there is no emoji).
VOCAB_FRONT_ENG = """
<div class="meaning">{{Meaning}}</div>
{{#Hint}}<div class="hint">{{Hint}}</div>{{/Hint}}
{{#Emoji}}<div class="emoji-img">{{Emoji}}</div>{{/Emoji}}
{{^Emoji}}{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;" onerror="this.parentElement.style.display='none'"></div>{{/Image}}{{/Emoji}}
"""
# English → Hebrew, back: the word with audio, its unpointed spelling, root,
# part of speech, plural and example.
VOCAB_BACK_ENG = """
{{FrontSide}}
<div class="divider"></div>
<div class="hebrew">{{Word}}{{#Prep}} <span class="hebrew-sm">{{Prep}}</span>{{/Prep}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span> {{WordNoNikkud}}</div>{{/WordNoNikkud}}
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>{{/Root}}
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span> {{PoS}}</div>{{/PoS}}
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
{{#Example}}
<div class="example">{{Example}}</div>
{{/Example}}
"""
# Sentence cloze, front: the example sentence with the word blanked out, plus
# an optional hint. The inline style overrides the .example class defaults.
VOCAB_FRONT_CLOZE = """
<div class="example" style="font-size:24px;font-style:normal;border:none;padding:0;text-align:center;">{{ClozeExample}}</div>
{{#ClozeHint}}<div class="hint">{{ClozeHint}}</div>{{/ClozeHint}}
"""
# Sentence cloze, back: the word, its audio and its meaning.
VOCAB_BACK_CLOZE = """
{{FrontSide}}
<div class="divider"></div>
<div class="hebrew">{{Word}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
<div class="meaning">{{Meaning}}</div>
"""
# Note model for the vocabulary deck.
# Field order is significant: build_vocab_deck supplies note fields
# positionally in exactly this order, and its diagnostics index into
# ``note.fields`` by position (11 = Emoji … 15 = ClozeExample).
VOCAB_MODEL = genanki.Model(
    VOCAB_MODEL_ID,
    "Hebrew Flash Cards",
    fields=[
        {"name": "Word"},
        {"name": "Root"},
        {"name": "PoS"},
        {"name": "Meaning"},
        {"name": "WordNoNikkud"},
        {"name": "SharedRoots"},
        {"name": "Tags"},
        {"name": "Audio"},
        {"name": "Example"},
        {"name": "Frequency"},
        {"name": "Image"},
        {"name": "Emoji"},
        {"name": "Prep"},
        {"name": "Hint"},
        {"name": "Plural"},
        {"name": "ClozeExample"},
        {"name": "ClozeHint"},
    ],
    templates=[
        {
            # ord 0 — matches Nevo's original "Card 2" (Eng→Heb)
            "name": "English → Hebrew",
            "qfmt": VOCAB_FRONT_ENG,
            "afmt": VOCAB_BACK_ENG,
        },
        {
            # ord 1 — matches Nevo's original "Card 3" (Heb→Eng)
            "name": "Hebrew → English",
            "qfmt": VOCAB_FRONT_HEB,
            "afmt": VOCAB_BACK_HEB,
        },
        {
            # ord 2 — Cloze-style sentence fill-in (only generated when ClozeExample is non-empty)
            "name": "Sentence Cloze",
            "qfmt": VOCAB_FRONT_CLOZE,
            "afmt": VOCAB_BACK_CLOZE,
        },
    ],
    css=CARD_CSS,
)
# ──────────────────────────────────────────────────────────────────────────────
# Conjugation Deck
# ──────────────────────────────────────────────────────────────────────────────
# Front: the pronoun, the verb's reference form (with a voice label when the
# Voice field is non-empty), and the tense.
CONJ_FRONT = """
<div class="hebrew">{{Pronoun}}</div>
<div class="meaning" style="font-size:28px;">{{ReferenceForm}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
<div class="hebrew">{{Tense}}</div>
"""
# Back: the conjugated form, followed by audio, meaning, root, binyan and
# related vocabulary. Root and binyan are rendered unconditionally; the other
# sections appear only when their field is non-empty.
CONJ_BACK = """
{{FrontSide}}<hr>
<div class="hebrew">{{ConjugatedForm}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
{{#Meaning}}<div class="sec-label">{{Meaning}}</div>{{/Meaning}}
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span> {{Binyan}}</div>
{{#RelatedVocab}}<div class="sec-label">{{RelatedVocab}}</div>{{/RelatedVocab}}
"""
# The conjugation deck reuses the shared stylesheet unchanged.
CONJ_CSS = CARD_CSS
# Note model for the conjugation drill deck. The "Infinitive" field is stored
# on the note but is not referenced by either template above.
CONJ_MODEL = genanki.Model(
    CONJ_MODEL_ID,
    "Pealim Conjugation",
    fields=[
        {"name": "Infinitive"},
        {"name": "ReferenceForm"},
        {"name": "Pronoun"},
        {"name": "Tense"},
        {"name": "ConjugatedForm"},
        {"name": "Root"},
        {"name": "Binyan"},
        {"name": "Voice"},
        {"name": "Audio"},
        {"name": "Meaning"},
        {"name": "RelatedVocab"},
    ],
    templates=[
        {
            "name": "Conjugation Drill",
            "qfmt": CONJ_FRONT,
            "afmt": CONJ_BACK,
        }
    ],
    css=CONJ_CSS,
)
# Present-tense expansion: each form key → list of (pronoun, tense_label).
# Each present-tense form key (masc./fem. × singular/plural) is paired with
# three pronoun labels, all carrying the same tense label.
PRESENT_EXPANSION: dict[str, list[tuple[str, str]]] = {
    "present_ms": [
        ("אֲנִי (זָכָר)", "הוֹוֶה"),
        ("אַתָּה", "הוֹוֶה"),
        ("הוּא", "הוֹוֶה"),
    ],
    "present_fs": [
        ("אֲנִי (נְקֵבָה)", "הוֹוֶה"),
        ("אַתְּ", "הוֹוֶה"),
        ("הִיא", "הוֹוֶה"),
    ],
    "present_mp": [
        ("אֲנַחְנוּ (זָכָר)", "הוֹוֶה"),
        ("אַתֶּם", "הוֹוֶה"),
        ("הֵם", "הוֹוֶה"),
    ],
    "present_fp": [
        ("אֲנַחְנוּ (נְקֵבָה)", "הוֹוֶה"),
        ("אַתֶּן", "הוֹוֶה"),
        ("הֵן", "הוֹוֶה"),
    ],
}
# Modern Hebrew: 2fp/3fp future and imperative default to mp form (classical in parens).
# Maps each feminine-plural form key to the masculine-plural key used instead.
FP_MODERN_FALLBACK: dict[str, str] = {
    "future_2fp": "future_2mp",
    "future_3fp": "future_3mp",
    "imperative_fp": "imperative_mp",
}
# 3rd person plural past: same form for m/f — generate two separate pronoun cards.
# Each entry is a (pronoun, tense_label) pair.
PAST_3P_EXPANSION: list[tuple[str, str]] = [
    ("הֵם", "עָבָר"),
    ("הֵן", "עָבָר"),
]
# Voice field: passive label only (shown inline on card front for Pu'al/Huf'al).
# Binyanim missing from this map get no voice label.
VOICE_MAP: dict[str, str] = {
    "Pu'al": "סָבִיל",
    "Huf'al": "סָבִיל",
}
# ──────────────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────────────
def _audio_tag(word_no_nikkud: str, audio_dir: Path = AUDIO_DIR) -> str:
    """Build the Anki sound tag for a word's recording, if one is on disk.

    Everything except the Hebrew letters U+05D0–U+05EA is removed from
    *word_no_nikkud*; the remainder is the stem of the ``.mp3`` file looked up
    in *audio_dir*.

    Returns ``"[sound:<name>.mp3]"`` when that file exists. Returns ``""`` when
    no Hebrew letters remain or the file is absent.
    """
    hebrew_letters = re.sub(r"[^\u05d0-\u05ea]", "", word_no_nikkud)
    if hebrew_letters:
        recording = audio_dir / f"{hebrew_letters}.mp3"
        if recording.exists():
            return f"[sound:{recording.name}]"
    return ""
def _conj_audio_tag(slug: str, form_key: str) -> str:
    """Build the Anki sound tag for one conjugated form, if it was downloaded.

    The recording is expected at ``AUDIO_CONJ_DIR / "<slug>_<form_key>.mp3"``.
    Returns ``"[sound:<slug>_<form_key>.mp3]"`` when that file exists and
    ``""`` otherwise.
    """
    clip_name = f"{slug}_{form_key}.mp3"
    clip_exists = (AUDIO_CONJ_DIR / clip_name).exists()
    return f"[sound:{clip_name}]" if clip_exists else ""
# Keywords excluded when building emoji lookup AND matching meaning text.
# Curated from manual review of all 2,261 emoji-word pairs (Sprint 8).
# Each group below is one space-separated string; adjacent literals are
# concatenated and split on whitespace to form the set.
_EMOJI_STOP = frozenset(
    (
        # Basic stop words
        "to be a an the of in on at for and with by or but not as its "
        # Generic emoji description words (too vague)
        "face hand sign symbol button small large light dark open closed "
        # Numbers, which would otherwise map to clock emoji
        "one two three four five six seven eight nine ten hundred thousand "
        # UI/media button names
        "next fast play pause repeat end soon record "
        # Abstract words that map to misleading object emoji
        "part place mark post department store note control level stop cover "
        "roll rolling pick over right way skin drop middle piece section "
        # Country/direction words that map to flag emoji
        "north south northern southern western eastern central territory "
        "kingdom united virgin "
        # Common words producing bad emoji matches
        "new big full last first double slightly without from behind people "
        "position status situation game call trade male female person letter "
        # Polysemous words that pick the wrong emoji sense
        "french fried board bow water union rock left back crane dash bar "
        "wheel horizontal"
    ).split()
)
def _load_emoji_lookup() -> dict[str, str]:
    """Load or fetch the Unicode emoji keyword → character lookup.

    If ``data/emoji_lookup.json`` exists it is returned as-is. Otherwise
    unicode.org's ``emoji-test.txt`` is downloaded and every "fully-qualified"
    line is parsed: each word of the emoji's description that is longer than
    two characters and not in ``_EMOJI_STOP`` becomes a key, and the first
    emoji seen for a keyword wins. The result is then written to the cache
    file.

    Both the cache and the download are handled explicitly as UTF-8, since the
    data consists of emoji and must not depend on the platform's locale
    encoding or on the charset the server happens to declare.

    Returns:
        ``{keyword: emoji_char}``; an empty dict when the download fails
        (safe fallback — nothing is cached in that case).
    """
    cache_file = DATA_DIR / "emoji_lookup.json"
    if cache_file.exists():
        with open(cache_file, encoding="utf-8") as f:
            return json.load(f)

    # Imported lazily so the dependency is only needed when the cache is cold.
    import requests

    try:
        resp = requests.get(
            "https://unicode.org/Public/emoji/latest/emoji-test.txt",
            timeout=30,
        )
        resp.raise_for_status()
    except Exception as e:  # deliberate best-effort: any fetch failure disables emoji
        logger.warning(f"Could not fetch emoji data: {e}. Emoji lookup disabled.")
        return {}

    # Decode the body as UTF-8 regardless of the Content-Type header.
    resp.encoding = "utf-8"

    # Line shape: "<code points> ; fully-qualified # <emoji> E<version> <description>"
    entry_re = re.compile(r"#\s+(\S+)\s+E[\d.]+\s+(.+)")
    lookup: dict[str, str] = {}
    for line in resp.text.splitlines():
        if "fully-qualified" not in line:
            continue
        m = entry_re.search(line)
        if not m:
            continue
        emoji_char = m.group(1)
        desc = m.group(2).lower().strip()
        for word in desc.split():
            word = word.strip(".,'\"-")
            if len(word) > 2 and word not in _EMOJI_STOP and word not in lookup:
                lookup[word] = emoji_char

    cache_file.write_text(json.dumps(lookup, ensure_ascii=False), encoding="utf-8")
    logger.info(f"Built emoji lookup: {len(lookup)} keywords → {cache_file}")
    return lookup
# Typographic dash and apostrophe variants folded to ASCII before matching a
# binyan name. Written as escapes so that no invisible or look-alike character
# can hide in the source.
_POS_CHAR_FOLD = str.maketrans(
    {
        "\u00ad": "-",  # soft hyphen
        "\u2010": "-",  # hyphen
        "\u2011": "-",  # non-breaking hyphen
        "\u2012": "-",  # figure dash
        "\u2013": "-",  # en dash
        "\u2014": "-",  # em dash
        "\u2212": "-",  # minus sign
        "\u2018": "'",  # left single quotation mark
        "\u2019": "'",  # right single quotation mark
        "\u02bc": "'",  # modifier letter apostrophe
    }
)


def _translate_pos(pos_str: str) -> str:
    """Translate a part-of-speech string to its Hebrew label.

    The first key of ``POS_TO_HEBREW`` (in dict order) that occurs in
    *pos_str* as a whole word, case-insensitively, selects the label.
    Whole-word matching matters: with plain substring matching "Adverb" is
    caught by "Verb" and "Pronoun" by "Noun", because those keys come first.

    For verbs, the binyan is appended when one of the ``BINYAN_TO_HEBREW``
    names appears in the string (e.g. "Verb Pi'el" or "Verb pi'el").

    Returns:
        The Hebrew label, ``"פועל — <binyan>"`` for a verb with a recognised
        binyan, or *pos_str* unchanged when no part of speech is recognised.
    """
    lowered = pos_str.lower()
    for eng, heb in POS_TO_HEBREW.items():
        if not re.search(rf"\b{re.escape(eng.lower())}\b", lowered):
            continue
        if eng == "Verb":
            # Binyan names are keyed with ASCII apostrophes, so fold any
            # typographic variants in the input before comparing.
            folded = lowered.translate(_POS_CHAR_FOLD)
            for binyan_eng, binyan_heb in BINYAN_TO_HEBREW.items():
                if binyan_eng.lower() in folded:
                    return f"פועל — {binyan_heb}"
        return heb
    return pos_str
def _categorize_pos(pos_str: str) -> str:
    """Return the canonical PoS category key used for grouping related words.

    The first key of ``POS_CATEGORY_LABELS`` (in dict order) that occurs in
    *pos_str* as a whole word, case-insensitively, is returned. Whole-word
    matching keeps "Adverb" from being classified as "Verb" and "Pronoun" as
    "Noun", which plain substring matching would do.

    Returns:
        A key of ``POS_CATEGORY_LABELS``, or ``"Other"`` when none matches.
    """
    lowered = pos_str.lower()
    for cat in POS_CATEGORY_LABELS:
        if re.search(rf"\b{re.escape(cat.lower())}\b", lowered):
            return cat
    return "Other"
def _load_optional_json(path: Path, description: str):
    """Parse an optional UTF-8 JSON data file; None when missing or unreadable."""
    if not path.exists():
        return None
    try:
        with open(path, encoding="utf-8") as fh:
            return json.load(fh)
    except (ValueError, OSError) as exc:
        # Best-effort input: report the problem and carry on without the file.
        logger.warning(f"Could not load {description} from {path}: {exc}")
        return None


def build_vocab_deck(
    dict_csv: Path,
    examples_cache: dict | None = None,
    freq_cache: dict | None = None,
    image_cache: dict | None = None,
    emoji_lookup: dict | None = None,
    limit: int | None = None,
    include_audio: bool = True,
    include_images: bool = True,
) -> tuple[genanki.Deck, list[Path]]:
    """
    Build the vocabulary deck from pealim_dict_for_anki.csv (or pealim_dict.csv).

    Optional JSON side files under ``DATA_DIR`` (sentence matches, vetted cloze
    sentences, noun plurals, refined meanings, image cache) are used when
    present. A file that is missing or cannot be parsed is skipped with a
    warning.

    Args:
        dict_csv: Dictionary CSV. Read with ``;`` as separator first, falling
            back to pandas' default separator when that fails or yields fewer
            than three columns.
        examples_cache: word → list of example sentences, used when no EPUB
            sentence is available.
        freq_cache: unpointed word → frequency rank; words that are absent get
            rank 999_999 ("Unlisted").
        image_cache: word → image filename inside ``DATA_DIR / "images"``.
            Loaded from ``image_cache.json`` when empty or not given.
        emoji_lookup: English keyword → emoji, used when the meaning text
            itself contains no emoji.
        limit: keep only the first *limit* CSV rows (falsy = all rows).
        include_audio: attach ``[sound:…]`` tags and collect the mp3 files.
        include_images: attach image filenames and collect the image files.

    Returns:
        ``(deck, media_files)``: the deck and the de-duplicated list of media
        paths referenced by its notes, in first-use order.
    """
    logger.info(f"Loading dictionary from {dict_csv}")
    try:
        df = pd.read_csv(dict_csv, sep=";", index_col=0)
        if df.shape[1] < 3:
            raise ValueError("too few columns")
    except (ValueError, pd.errors.ParserError):
        df = pd.read_csv(dict_csv, index_col=0)
    if limit:
        df = df.head(limit)
    logger.info(f" {len(df)} rows loaded")
    examples_cache = examples_cache or {}
    freq_cache = freq_cache or {}
    image_cache = image_cache or {}

    # Load EPUB/PDF sentence matches (nikkud'd — preferred over Ben Yehuda)
    epub_examples: dict[str, list[str]] = {}
    raw_epub = _load_optional_json(DATA_DIR / "vocab_sentence_matches.json", "EPUB sentence matches")
    if raw_epub is not None:
        for word_key, info in raw_epub.items():
            sents = info.get("sentences", [])
            if sents:
                epub_examples[word_key] = [s["text"] if isinstance(s, dict) else s for s in sents]
                # Also index by nikkud form
                nikkud_word = info.get("word_nikkud", "")
                if nikkud_word and nikkud_word != word_key:
                    epub_examples[nikkud_word] = epub_examples[word_key]
        logger.info(f" EPUB sentence matches loaded: {len(epub_examples)} words")

    # Load AI-vetted sentences for cloze cards (only approved sentences)
    vetted_cloze: dict[str, list[str]] = {}  # word_nikkud → [good sentences]
    raw_vetted = _load_optional_json(DATA_DIR / "vetted_sentences.json", "vetted cloze sentences")
    if raw_vetted is not None:
        for word_key, info in raw_vetted.items():
            good = info.get("good_sentences", [])
            if good:
                texts = [s["text"] if isinstance(s, dict) else s for s in good]
                nikkud_word = info.get("word_nikkud", word_key)
                vetted_cloze[nikkud_word] = texts
                if word_key != nikkud_word:
                    vetted_cloze[word_key] = texts
        logger.info(f" Vetted cloze sentences loaded: {len(vetted_cloze)} words")

    # Load noun plural forms for vocab card back display
    noun_plural_lookup: dict[str, str] = {}  # word (nikkud) → plural (nikkud)
    _noun_plural_stripped: dict[str, str] = {}  # word (stripped) → plural (nikkud), fallback
    _noun_data = _load_optional_json(DATA_DIR / "noun_plurals.json", "noun plurals")
    if _noun_data is not None:
        for _entry in _noun_data.values():
            sg = _entry.get("singular", "")
            pl = _entry.get("plural", "")
            if sg and pl:
                noun_plural_lookup[sg] = pl
                s = _strip_nikkud(sg)
                # First entry wins for the stripped fallback key.
                if s not in _noun_plural_stripped:
                    _noun_plural_stripped[s] = pl
        logger.info(f" Noun plurals loaded: {len(noun_plural_lookup)} entries")

    # Load refined meanings for synonym disambiguation (layer 2)
    refined_meanings: dict[str, str] = {}
    _refined = _load_optional_json(DATA_DIR / "refined_meanings.json", "refined meanings")
    if _refined is not None:
        refined_meanings = _refined
        logger.info(f" Refined meanings loaded: {len(refined_meanings)} entries")

    # Load image cache from disk if not passed in
    if not image_cache:
        _disk_images = _load_optional_json(DATA_DIR / "image_cache.json", "image cache")
        if _disk_images is not None:
            image_cache = _disk_images
    images_dir = DATA_DIR / "images"

    # Build word_stripped → pos_category dict for related-words grouping
    word_to_pos_cat: dict[str, str] = {}
    for _, row in df.iterrows():
        wni = str(row.get("Word Without Nikkud", "")).strip()
        pos_raw = str(row.get("Part of speech", row.get("Part of Speech", ""))).strip()
        if wni and pos_raw and pos_raw not in ("nan", "None"):
            word_to_pos_cat[_strip_nikkud(wni)] = _categorize_pos(pos_raw)

    # Build confusable words set: consonant-only forms with multiple entries
    # Uses _strip_nikkud (removes combining marks) rather than Word Without Nikkud
    # (which preserves matres lectionis) — since sentence matching also uses
    # _strip_nikkud, we need to detect collisions at that level.
    _strip_to_nikkud: dict[str, set[str]] = {}
    for _, row in df.iterrows():
        w = str(row.get("Word", "")).strip()
        if w and w not in ("nan", "None"):
            consonants = _strip_nikkud(w)
            _strip_to_nikkud.setdefault(consonants, set()).add(w)
    _confusable_words: set[str] = {k for k, v in _strip_to_nikkud.items() if len(v) > 1}
    if _confusable_words:
        logger.info(f" Confusable words (homographs): {len(_confusable_words)} stripped forms")

    # Build ambiguity index: group words by normalized meaning to detect
    # Eng→Heb collisions. A word needs a hint when another word shares
    # the same English meaning. Hint = PoS (+ binyan for verbs).
    _meaning_groups: dict[str, list[tuple[str, str]]] = {}  # norm_meaning → [(word, pos_raw)]
    for _, row in df.iterrows():
        w = str(row.get("Word", "")).strip()
        m = str(row.get("Meaning", "")).strip()
        p = str(row.get("Part of speech", row.get("Part of Speech", ""))).strip()
        if not w or not m or m in ("nan", "None"):
            continue
        # Normalize: strip emoji, Hebrew parens, take text before first semicolon
        m_clean = EMOJI_RE.sub("", m).strip()
        m_clean = HBPAREN_RE.sub("", m_clean).strip().strip(",").strip()
        m_norm = m_clean.split(";")[0].strip().lower()
        if m_norm:
            _meaning_groups.setdefault(m_norm, []).append((w, p if p not in ("nan", "None") else ""))

    # For each word in an ambiguous group, record its hint string.
    _word_hints: dict[tuple[str, str], str] = {}  # (word, hint) → hint
    for _m_norm, entries in _meaning_groups.items():
        if len(entries) < 2:
            continue
        # Check if the group has genuinely different PoS/binyan (not just duplicates)
        pos_set = {_translate_pos(p) if p else "" for _, p in entries}
        if len(pos_set) < 2:
            continue
        for w, p in entries:
            hint = _translate_pos(p) if p else ""
            if hint:
                _word_hints.setdefault((w, hint), hint)

    # Rebuild as (word, full_meaning) → hint for lookup during note creation
    _word_meaning_hints: dict[tuple[str, str], str] = {}
    for _, row in df.iterrows():
        w = str(row.get("Word", "")).strip()
        m = str(row.get("Meaning", "")).strip()
        p = str(row.get("Part of speech", row.get("Part of Speech", ""))).strip()
        if not w or not m or m in ("nan", "None"):
            continue
        hint = _translate_pos(p) if p and p not in ("nan", "None") else ""
        if (w, hint) in _word_hints:
            _word_meaning_hints[(w, m)] = hint
    if _word_meaning_hints:
        logger.info(f" Eng→Heb disambiguation hints: {len(_word_meaning_hints)} words")

    # Sort by frequency rank
    def freq_sort_key(row):
        word_plain = _strip_nikkud(str(row.get("Word Without Nikkud", row.get("WordNoNikkud", ""))).strip())
        return freq_cache.get(word_plain, 999_999)

    # Assigned positionally from a list so an empty frame is handled as well.
    df["_freq_rank"] = [freq_sort_key(row) for _, row in df.iterrows()]
    df = df.sort_values("_freq_rank")

    deck = genanki.Deck(VOCAB_DECK_ID, "Hebrew Vocabulary")
    media_files: list[Path] = []
    media_seen: set[Path] = set()  # O(1) de-duplication for media_files
    seen_words: set[tuple[str, str]] = set()

    # Token patterns for cloze matching, compiled once for the whole run.
    tok_core_re = re.compile(r'^(.*?)([\.,!?;:"\u0027]*)$')
    tok_trailing_re = re.compile(r'[.,!?;:"\u0027]+$')

    for _, row in df.iterrows():
        word = str(row.get("Word", "")).strip()
        root = str(row.get("Root", "")).strip()
        pos_raw = str(row.get("Part of speech", row.get("Part of Speech", ""))).strip()
        meaning = str(row.get("Meaning", "")).strip()
        word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
        shared_roots = str(row.get("shared roots", row.get("SharedRoots", ""))).strip()
        tags_str = str(row.get("tags", row.get("Tags", ""))).strip()
        freq_rank_raw = row["_freq_rank"]
        if freq_rank_raw <= 500:
            freq_display = f"Core #{freq_rank_raw}"
        elif freq_rank_raw <= 1500:
            freq_display = f"Essential #{freq_rank_raw}"
        elif freq_rank_raw <= 3000:
            freq_display = f"Intermediate #{freq_rank_raw}"
        elif freq_rank_raw <= 5000:
            freq_display = f"Upper-intermediate #{freq_rank_raw}"
        elif freq_rank_raw <= 10000:
            freq_display = f"Advanced #{freq_rank_raw}"
        elif freq_rank_raw < 999_999:
            freq_display = f"Rare #{freq_rank_raw}"
        else:
            freq_display = "Unlisted"

        # str() of a missing CSV cell yields "nan"/"None"; treat those as empty.
        root = "" if root in ("nan", "None", "-") else root
        pos_raw = "" if pos_raw in ("nan", "None") else pos_raw
        meaning = "" if meaning in ("nan", "None") else meaning
        word_no_nik = "" if word_no_nik in ("nan", "None") else word_no_nik
        shared_roots = "" if shared_roots in ("nan", "None") else shared_roots
        tags_str = "" if tags_str in ("nan", "None") else tags_str
        if not word or not meaning:
            continue

        # Skip exact duplicates (same word AND same meaning — true dupes).
        # Homographs (same word, different meaning) are kept as separate notes.
        word_meaning_key = (word, meaning)
        if word_meaning_key in seen_words:
            logger.debug(f" Skipping duplicate word+meaning: {word}")
            continue
        seen_words.add(word_meaning_key)

        # Extract emoji from meaning (pealim embeds emoji in meaning text)
        emoji_str = "".join(EMOJI_RE.findall(meaning))
        meaning_clean = EMOJI_RE.sub("", meaning).strip()
        # Fallback: look up emoji from Unicode standard by English keyword
        if not emoji_str and emoji_lookup:
            for kw in re.sub(r"[^\w\s]", " ", meaning_clean.lower()).split()[:5]:
                if len(kw) > 2 and kw not in _EMOJI_STOP and kw in emoji_lookup:
                    emoji_str = emoji_lookup[kw]
                    break

        # Extract Hebrew parentheticals (prepositions) from meaning
        preps = HBPAREN_RE.findall(meaning_clean)
        prep_str = " ".join(f"({p})" for p in preps)
        meaning_clean = HBPAREN_RE.sub("", meaning_clean).strip().strip(",").strip()

        # Apply refined meaning if available (AI disambiguation layer 2)
        if word in refined_meanings:
            meaning_clean = refined_meanings[word]

        # Translate PoS to Hebrew
        pos_heb = _translate_pos(pos_raw) if pos_raw else ""

        # Eng→Heb disambiguation hint (PoS + binyan, shown only for ambiguous meanings)
        hint_str = _word_meaning_hints.get((word, meaning), "")

        # Audio
        audio_tag = _audio_tag(word_no_nik) if include_audio else ""
        if audio_tag:
            mp3_name = audio_tag.removeprefix("[sound:").removesuffix("]")
            mp3_path = AUDIO_DIR / mp3_name
            if mp3_path not in media_seen:
                media_seen.add(mp3_path)
                media_files.append(mp3_path)

        # Consonant-only form for confusable detection and cloze matching
        word_consonants = _strip_nikkud(word)
        is_confusable = word_consonants in _confusable_words

        # Example sentences — priority: EPUB (nikkud'd) > Ben Yehuda > none
        # For confusable words (same consonants, different nikkud), only match by
        # exact nikkud form to avoid showing wrong-word sentences.
        example_html = ""
        # 1. EPUB/PDF sentences (full nikkud)
        epub_sents = epub_examples.get(word)
        if not epub_sents and not is_confusable:
            epub_sents = epub_examples.get(word_no_nik) or epub_examples.get(_strip_nikkud(word_no_nik))
        if epub_sents:
            example_html = epub_sents[0]
        else:
            # 2. Ben Yehuda examples (some have nikkud from nikkud corpus)
            by_sents = examples_cache.get(word)
            if not by_sents and not is_confusable:
                by_sents = examples_cache.get(word_no_nik) or examples_cache.get(_strip_nikkud(word_no_nik))
            if by_sents:
                # Prefer nikkud'd Ben Yehuda sentences (contain combining marks)
                nikkud_sents = [s for s in by_sents if any("\u0591" <= c <= "\u05c7" for c in s)]
                example_html = nikkud_sents[0] if nikkud_sents else by_sents[0]

        # Cloze example: replace target word with blank in example sentence.
        # Priority: AI-vetted sentences > EPUB/Ben Yehuda sentences.
        # Uses stripped (no-nikkud) matching. Skips homographs (confusable words).
        cloze_example = ""
        cloze_hint = ""
        if word_consonants and not is_confusable:
            # Pick best sentence for cloze: vetted first, then example_html
            cloze_source = None
            vetted = vetted_cloze.get(word)
            if not vetted:
                vetted = vetted_cloze.get(word_no_nik) or vetted_cloze.get(_strip_nikkud(word_no_nik))
            if vetted:
                cloze_source = vetted[0]
            elif example_html:
                cloze_source = example_html
            if cloze_source:
                tokens = cloze_source.split()
                replaced = False
                for i, tok in enumerate(tokens):
                    tok_stripped = _strip_nikkud(tok)
                    # Compare the token without its trailing punctuation …
                    m = tok_core_re.match(tok_stripped)
                    tok_core = m.group(1) if m else tok_stripped
                    # … but keep that punctuation after the blank.
                    punct_match = tok_trailing_re.search(tok)
                    trailing = punct_match.group() if punct_match else ""
                    if tok_core == word_consonants:
                        tokens[i] = "_____" + trailing
                        replaced = True
                        break
                if replaced:
                    cloze_example = " ".join(tokens)
                    pos_cat = _categorize_pos(pos_raw)
                    cloze_hint = meaning_clean
                    if pos_cat == "Verb" and pos_heb:
                        cloze_hint = f"{meaning_clean} ({pos_heb})"

        # Related words grouped by PoS category
        related_html = ""
        if shared_roots:
            related_words = shared_roots.split()
            groups: dict[str, list[str]] = {}
            for rw in related_words:
                cat = word_to_pos_cat.get(_strip_nikkud(rw), "Other")
                groups.setdefault(cat, []).append(rw)
            parts = []
            for cat, words in groups.items():
                if cat == "Other":
                    # No label for uncategorized words — just list them plain
                    parts.append(f'<div class="related-group">{" ".join(words)}</div>')
                else:
                    label = POS_CATEGORY_LABELS.get(cat, cat)
                    parts.append(f'<div class="related-group"><b>{label}:</b> {" ".join(words)}</div>')
            related_html = "\n".join(parts)

        # Image: look up by stripped word (no-nikkud)
        image_tag = ""
        if include_images:
            image_filename = image_cache.get(word_no_nik or _strip_nikkud(word), None)
            if image_filename:
                image_path = images_dir / image_filename
                if image_path.exists():
                    image_tag = image_filename
                    if image_path not in media_seen:
                        media_seen.add(image_path)
                        media_files.append(image_path)

        note = genanki.Note(
            model=VOCAB_MODEL,
            # Stable GUID: uses legacy GUID from Nevo's original deck when
            # available, otherwise deterministic from word + meaning.
            guid=_vocab_guid(word, meaning),
            # Positional: must follow the field order declared on VOCAB_MODEL.
            fields=[
                word,
                root,
                pos_heb,
                meaning_clean,
                word_no_nik,
                related_html or shared_roots,
                tags_str,
                audio_tag,
                example_html,
                freq_display,
                image_tag,
                emoji_str,
                prep_str,
                hint_str,
                noun_plural_lookup.get(word, "") or _noun_plural_stripped.get(word_consonants, ""),
                cloze_example,
                cloze_hint,
            ],
            tags=(tags_str.split() if tags_str else [])
            + [RELEASE_TAG]
            + [f"freq::{freq_display.split()[0]}" if freq_display != "Unlisted" else "freq::Unlisted"],
        )
        deck.add_note(note)

    # Diagnostic: count words with emoji/prep/hint/plural/cloze extracted
    # (indices follow the VOCAB_MODEL field order).
    emoji_count = sum(1 for n in deck.notes if n.fields[11])
    prep_count = sum(1 for n in deck.notes if n.fields[12])
    hint_count = sum(1 for n in deck.notes if n.fields[13])
    plural_count = sum(1 for n in deck.notes if n.fields[14])
    cloze_count = sum(1 for n in deck.notes if n.fields[15])
    if emoji_count:
        logger.info(f" Emoji extracted: {emoji_count} words")
    if prep_count:
        logger.info(f" Hebrew prepositions extracted: {prep_count} words")
    if hint_count:
        logger.info(f" Eng→Heb hints: {hint_count} words")
    if plural_count:
        logger.info(f" Noun plurals on vocab cards: {plural_count} words")
    if cloze_count:
        logger.info(f" Sentence cloze cards: {cloze_count} words")

    # Diagnostic: count words without PoS coverage in shared_roots
    other_count = 0
    for _, row in df.iterrows():
        sr = str(row.get("shared roots", row.get("SharedRoots", ""))).strip()
        if sr and sr not in ("nan", "None"):
            other_count += sum(1 for rw in sr.split() if word_to_pos_cat.get(_strip_nikkud(rw)) is None)
    unlisted = int((df["_freq_rank"] >= 999_999).sum())
    logger.info(f" Unlisted words (not in frequency corpus): {unlisted}/{len(df)}")
    logger.info(f" Related-words without PoS coverage: {other_count} (shown unlabeled)")
    logger.info(f"Vocabulary deck: {len(deck.notes)} notes")
    return deck, media_files
def build_conj_deck(
    conjugations: dict,
    audio_dir: Path = AUDIO_CONJ_DIR,
    include_audio: bool = True,
    dict_csv: Path | None = None,
) -> tuple[genanki.Deck, list[Path]]:
    """Build the conjugation drill deck from conjugations.json data.

    Args:
        conjugations: Mapping of infinitive → verb data. Entries that are
            empty or have no ``forms`` are skipped.
        audio_dir: Directory used to build the ``{slug}_{form_key}.mp3`` media
            paths for forms that have an audio tag.
        include_audio: When true, attach the tag returned by
            ``_conj_audio_tag`` and register the matching mp3 as media.
        dict_csv: Optional vocabulary CSV used to cross-link meanings and
            same-root words. Ignored when missing.

    Returns:
        ``(deck, media_files)`` — the populated deck and the de-duplicated
        list of mp3 paths referenced by its notes.
    """
    deck = genanki.Deck(CONJ_DECK_ID, "Hebrew Conjugations")
    media_files: list[Path] = []
    seen_media: set[Path] = set()  # O(1) duplicate check for media_files
    note_count = 0
    missing = ("nan", "None")  # str() renderings of empty CSV cells
    hebrew_letter = re.compile(r"[\u05d0-\u05ea]")
    first_person_keys = {"past_1s", "past_1p", "future_1s", "future_1p"}

    # Build lookup tables from vocab CSV for cross-linking
    verb_meaning: dict[str, str] = {}  # word_no_nikkud → meaning
    root_words: dict[str, list[str]] = {}  # root → [related words]
    if dict_csv and dict_csv.exists():
        # Same semicolon-then-default-separator fallback as the other builders.
        try:
            vdf = pd.read_csv(dict_csv, sep=";", index_col=0)
            if vdf.shape[1] < 3:
                raise ValueError("too few columns")
        except (ValueError, pd.errors.ParserError):
            vdf = pd.read_csv(dict_csv, index_col=0)
        for _, row in vdf.iterrows():
            word = str(row.get("Word", "")).strip()
            word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
            meaning = str(row.get("Meaning", "")).strip()
            root = str(row.get("Root", "")).strip()
            if root and root not in ("nan", "None", "-"):
                root_words.setdefault(root, []).append(word)
            if meaning and meaning not in missing:
                # Use Word Without Nikkud (ktiv male) for matching
                if word_no_nik and word_no_nik not in missing:
                    verb_meaning[word_no_nik] = meaning
                verb_meaning[_strip_nikkud(word)] = meaning

    def add_note(
        verb_fields: tuple[str, str, str, str, str, str, str],
        pronoun: str,
        tense: str,
        conj_form: str,
        audio_tag: str,
    ) -> None:
        """Add one quiz note; forms without any Hebrew letter are dropped."""
        nonlocal note_count
        if not conj_form or not hebrew_letter.search(conj_form):
            return
        infinitive, ref_form, root, binyan_heb, voice, meaning, related_str = verb_fields
        note = genanki.Note(
            model=CONJ_MODEL,
            guid=genanki.guid_for(infinitive, pronoun, tense),
            fields=[
                infinitive,
                ref_form,
                pronoun,
                tense,
                conj_form,
                root,
                binyan_heb,
                voice,
                audio_tag,
                meaning,
                related_str,
            ],
            tags=[RELEASE_TAG],
        )
        deck.add_note(note)
        note_count += 1

    for infinitive, data in conjugations.items():
        if not data or not data.get("forms"):
            continue
        root = data.get("root", "")
        binyan = data.get("binyan", "")
        slug = data.get("slug", "")
        # Meaning: prefer scraped meaning from pealim page, fall back to CSV cross-link
        meaning = (
            data.get("meaning", "")
            or verb_meaning.get(infinitive, "")
            or verb_meaning.get(_strip_nikkud(infinitive), "")
        )
        related = [w for w in root_words.get(root, []) if w != infinitive]
        verb_fields = (
            infinitive,
            data.get("reference_form", infinitive),
            root,
            BINYAN_TO_HEBREW.get(binyan, binyan),
            VOICE_MAP.get(binyan, ""),
            meaning,
            " ".join(related[:8]),
        )
        forms = data["forms"]
        alternate_forms = data.get("alternate_forms", {})

        # Seeded RNG per verb — deterministic pronoun/gender choices.
        # Seed with the string itself: hash() of a str is salted per process,
        # so seeding from it gave different choices (and therefore different
        # note GUIDs, which include the chosen pronoun) on every build.
        verb_rng = random.Random(infinitive)

        for form_key, form_data in forms.items():
            # Infinitive: shown on card front as reference — skip as a quiz form
            if form_key == "infinitive":
                continue

            primary_form = form_data.get("form", "")
            alt_form = alternate_forms.get(form_key, "")
            conj_form = f"{primary_form} / {alt_form}" if alt_form else primary_form

            # Audio tag: use downloaded file if present
            audio_tag = ""
            if include_audio and slug:
                audio_tag = _conj_audio_tag(slug, form_key)
                if audio_tag:
                    mp3_path = audio_dir / f"{slug}_{form_key}.mp3"
                    if mp3_path not in seen_media:
                        seen_media.add(mp3_path)
                        media_files.append(mp3_path)

            # Present tense expansion: 4 form keys → 1 card each (seeded RNG)
            if form_key in PRESENT_EXPANSION:
                chosen = verb_rng.choice(PRESENT_EXPANSION[form_key])
                add_note(verb_fields, chosen[0], chosen[1], conj_form, audio_tag)
                continue

            # Past 3rd plural: same form for m/f → 1 card (seeded RNG)
            if form_key == "past_3p":
                chosen = verb_rng.choice(PAST_3P_EXPANSION)
                add_note(verb_fields, chosen[0], chosen[1], conj_form, audio_tag)
                continue

            pronoun = form_data.get("pronoun", "")
            tense = form_data.get("tense", "")

            # 2fp/3fp future and imperative: show modern (mp) form + classical (fp) in parens
            if form_key in FP_MODERN_FALLBACK:
                mp_form = forms.get(FP_MODERN_FALLBACK[form_key], {}).get("form", "")
                if mp_form and mp_form != conj_form:
                    display_form = f"{mp_form} ({conj_form})"
                else:
                    display_form = conj_form
                add_note(verb_fields, pronoun, tense, display_form, audio_tag)
                continue

            # Standard card.
            # 1st-person forms get a randomly assigned gender label (deterministic per verb)
            if form_key in first_person_keys:
                gender = verb_rng.choice(["זָכָר", "נְקֵבָה"])
                pronoun = f"{pronoun} ({gender})"
            add_note(verb_fields, pronoun, tense, conj_form, audio_tag)

    logger.info(f"Conjugation deck: {note_count} notes across {sum(1 for v in conjugations.values() if v)} verbs")
    return deck, media_files
# ──────────────────────────────────────────────────────────────────────────────
# Confusables deck — words that look identical without nikkud
# ──────────────────────────────────────────────────────────────────────────────
# Card front: the look-alike words (the Words field) above a fixed Hebrew prompt.
CONF_FRONT = """
<div class="hebrew" style="font-size:36px;">{{Words}}</div>
<div class="meaning" style="font-size:32px;">מה ההבדל?</div>
"""
# Card back: repeats the front, then the Definitions field; the audio <div> is
# wrapped in a {{#Audio}} section so it is only emitted when Audio is non-empty.
CONF_BACK = """
{{FrontSide}}<hr>
<div class="definitions">{{Definitions}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
"""
# The confusables model uses the shared card stylesheet as-is.
CONF_CSS = CARD_CSS
# Note type for the confusables deck: four fields, one card template.
# Field order must match the `fields=[...]` list passed to genanki.Note in
# build_confusables_deck. WordNoNikkud is stored on the note but is not
# referenced by either template above.
CONF_MODEL = genanki.Model(
CONF_MODEL_ID,
"Hebrew Confusables",
fields=[
{"name": "Words"},
{"name": "Definitions"},
{"name": "Audio"},
{"name": "WordNoNikkud"},
],
templates=[
{
"name": "Confusable",
"qfmt": CONF_FRONT,
"afmt": CONF_BACK,
},
],
css=CONF_CSS,
)
def build_confusables_deck(
    dict_csv: Path,
    include_audio: bool = True,
) -> tuple[genanki.Deck, list[Path]]:
    """Build confusables deck from vocab CSV — groups words identical without nikkud.

    Rows are grouped by their "Word Without Nikkud" value; every group that
    still has at least two distinct (word, meaning) pairs becomes one note.

    Args:
        dict_csv: Vocabulary CSV. Read with ``;`` as separator first, falling
            back to the pandas default when that yields fewer than 3 columns
            or fails to parse.
        include_audio: When true, attach audio to a note — but only if every
            word in its group has an audio tag.

    Returns:
        ``(deck, media_files)`` — the deck and the de-duplicated mp3 paths
        found while building it.
    """
    logger.info("Building confusables deck …")
    try:
        df = pd.read_csv(dict_csv, sep=";", index_col=0)
        if df.shape[1] < 3:
            raise ValueError("too few columns")
    except (ValueError, pd.errors.ParserError):
        df = pd.read_csv(dict_csv, index_col=0)

    deck = genanki.Deck(CONF_DECK_ID, "Hebrew Confusables")
    media_files: list[Path] = []
    seen_media: set[Path] = set()  # O(1) duplicate check for media_files
    note_count = 0
    missing = ("nan", "None")  # str() renderings of empty CSV cells

    # Group by Word Without Nikkud
    groups: dict[str, list[tuple[str, str, str]]] = {}
    for _, row in df.iterrows():
        word = str(row.get("Word", "")).strip()
        meaning = str(row.get("Meaning", "")).strip()
        word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
        pos_raw = str(row.get("Part of speech", row.get("Part of Speech", ""))).strip()
        if not word or not meaning or meaning in missing:
            continue
        if not word_no_nik or word_no_nik in missing:
            continue
        pos_heb = _translate_pos(pos_raw) if pos_raw and pos_raw not in missing else ""
        groups.setdefault(word_no_nik, []).append((word, meaning, pos_heb))

    for word_no_nik, entries in sorted(groups.items()):
        if len(entries) < 2:
            continue

        # Deduplicate: skip entries with identical word+meaning (first one wins)
        seen: set[tuple[str, str]] = set()
        unique_entries: list[tuple[str, str, str]] = []
        for w, m, p in entries:
            if (w, m) not in seen:
                seen.add((w, m))
                unique_entries.append((w, m, p))
        if len(unique_entries) < 2:
            continue

        # Build card content
        words_display = " / ".join(w for w, _, _ in unique_entries)
        defs_parts: list[str] = []
        audio_parts: list[str] = []
        all_have_audio = True
        for w, m, p in unique_entries:
            pos_label = f" ({p})" if p else ""
            defs_parts.append(
                f'<div class="conf-entry"><span class="hebrew" style="font-size:24px;">{w}</span>'
                f" = {m}{pos_label}</div>"
            )
            if not include_audio:
                continue
            at = _audio_tag(_strip_nikkud(w))
            if not at:
                # Only a genuinely missing tag disqualifies the group. A tag
                # that repeats one already collected (homographs resolving to
                # the same clip) must not count as "missing audio".
                all_have_audio = False
            elif at not in audio_parts:
                audio_parts.append(at)
                mp3_path = AUDIO_DIR / at.removeprefix("[sound:").removesuffix("]")
                if mp3_path not in seen_media:
                    seen_media.add(mp3_path)
                    media_files.append(mp3_path)

        # Only include audio if every word in the group has it
        if not all_have_audio:
            audio_parts = []

        note = genanki.Note(
            model=CONF_MODEL,
            guid=genanki.guid_for("confusable", word_no_nik),
            fields=[words_display, "\n".join(defs_parts), " ".join(audio_parts), word_no_nik],
            tags=[RELEASE_TAG],
        )
        deck.add_note(note)
        note_count += 1

    logger.info(f"Confusables deck: {note_count} notes")
    return deck, media_files
def write_conf_apkg(
    deck: genanki.Deck,
    media_files: list[Path] | None = None,
    out_path: Path = CONF_APKG,
) -> None:
    """Package the confusables deck with its media and fonts and write it to *out_path*.

    Media paths that do not exist on disk are left out; the parent directory
    of *out_path* is created if needed.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    package = genanki.Package(deck)
    existing_media = [str(path) for path in media_files or [] if path.exists()]
    package.media_files = [*existing_media, *_font_media_files()]
    package.write_to_file(str(out_path))
    logger.info(f"Confusables deck written → {out_path}")
# ──────────────────────────────────────────────────────────────────────────────
# Noun plurals deck — singular↔plural drilling
# ──────────────────────────────────────────────────────────────────────────────
# Card 1 front (singular shown): the singular form, its audio when the
# SingularAudio field is non-empty, the meaning, and a fixed direction hint.
PLURAL_FRONT_SG = """
<div class="hebrew">{{Singular}}</div>
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
<div class="meaning">{{Meaning}}</div>
<div class="hint" style="font-size:28px;">יָחִיד ← רַבִּים</div>
"""
# Card 1 back: repeats the front, then reveals the plural, its audio and the
# mishkal badge (the latter two only when their fields are non-empty).
PLURAL_BACK_SG = """
{{FrontSide}}<hr>
<div class="hebrew">{{Plural}}</div>
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
{{#Mishkal}}<div class="freq-badge">{{Mishkal}}</div>{{/Mishkal}}
"""
# Card 2 front (plural shown): the plural form, optional audio and the
# direction hint. Unlike card 1, the meaning is not shown on the front.
PLURAL_FRONT_PL = """
<div class="hebrew">{{Plural}}</div>
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
<div class="hint" style="font-size:28px;">רַבִּים ← יָחִיד</div>
"""
# Card 2 back: repeats the front, then reveals the singular, its optional
# audio, the meaning and the optional mishkal badge.
PLURAL_BACK_PL = """
{{FrontSide}}<hr>
<div class="hebrew">{{Singular}}</div>
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
<div class="meaning">{{Meaning}}</div>
{{#Mishkal}}<div class="freq-badge">{{Mishkal}}</div>{{/Mishkal}}
"""
# The plurals model uses the shared card stylesheet as-is.
PLURAL_CSS = CARD_CSS
# Note type for the plurals deck: eight fields, two card templates (one per
# drilling direction). Field order must match the `fields=[...]` list passed
# to genanki.Note in build_plural_deck. Root and Gender are stored on the
# note but are not referenced by any of the four templates above.
PLURAL_MODEL = genanki.Model(
PLURAL_MODEL_ID,
"Hebrew Plurals",
fields=[
{"name": "Singular"},
{"name": "SingularAudio"},
{"name": "Plural"},
{"name": "PluralAudio"},
{"name": "Meaning"},
{"name": "Root"},
{"name": "Mishkal"},
{"name": "Gender"},
],
templates=[
{
"name": "Singular → Plural",
"qfmt": PLURAL_FRONT_SG,
"afmt": PLURAL_BACK_SG,
},
{
"name": "Plural → Singular",
"qfmt": PLURAL_FRONT_PL,
"afmt": PLURAL_BACK_PL,
},
],
css=PLURAL_CSS,
)
def _is_irregular_plural(gender: str, plural: str) -> bool:
    """Return True when the plural ending is the one not expected for *gender*.

    A "masculine" noun whose nikkud-stripped plural ends in ות, or a
    "feminine" noun whose nikkud-stripped plural ends in ים, is irregular.
    Any other gender value yields False.
    """
    bare_plural = _strip_nikkud(plural)
    if gender == "masculine":
        return bare_plural.endswith("ות")
    if gender == "feminine":
        return bare_plural.endswith("ים")
    return False
def build_plural_deck(
    noun_plurals_path: Path = DATA_DIR / "noun_plurals.json",
    dict_csv: Path | None = None,
    include_audio: bool = False,
) -> tuple[genanki.Deck, list[Path]]:
    """Build noun plurals deck.

    Selection: ALL irregular plurals, plus up to 6 exemplars per mishkal
    pattern for regular nouns, preferring the entries that rank best in the
    vocab CSV. Regular nouns without a mishkal are not selected.

    Args:
        noun_plurals_path: JSON file mapping a word key to a dict with
            ``singular``, ``plural``, ``gender`` and ``mishkal`` entries.
        dict_csv: Optional vocabulary CSV supplying frequency order, meanings
            and roots. Ignored when missing.
        include_audio: When true, attach singular audio where ``_audio_tag``
            finds it. The PluralAudio field is always left empty here.

    Returns:
        ``(deck, media_files)`` — the deck and the de-duplicated mp3 paths
        referenced by its notes.
    """
    logger.info("Building plurals deck …")
    # Explicit UTF-8: the file holds Hebrew text and must not depend on the
    # platform's default encoding.
    with open(noun_plurals_path, encoding="utf-8") as f:
        all_nouns: dict[str, dict] = json.load(f)

    missing = ("nan", "None")  # str() renderings of empty CSV cells

    # One pass over the vocab CSV fills all three lookup tables.
    freq_order: dict[str, int] = {}  # word without nikkud → row index
    meanings: dict[str, str] = {}  # word → meaning
    roots: dict[str, str] = {}  # word → root
    if dict_csv and dict_csv.exists():
        try:
            vdf = pd.read_csv(dict_csv, sep=";", index_col=0)
            if vdf.shape[1] < 3:
                raise ValueError
        except (ValueError, pd.errors.ParserError):
            vdf = pd.read_csv(dict_csv, index_col=0)
        for idx, row in vdf.iterrows():
            word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
            if word_no_nik and word_no_nik not in missing:
                freq_order[word_no_nik] = idx  # lower index = higher frequency
            word = str(row.get("Word", "")).strip()
            if word:
                meanings[word] = str(row.get("Meaning", "")).strip()
                roots[word] = str(row.get("Root", "")).strip()

    deck = genanki.Deck(PLURAL_DECK_ID, "Hebrew Plurals")
    media_files: list[Path] = []
    seen_media: set[Path] = set()  # O(1) duplicate check for media_files

    # Separate irregular plurals from regular (by mishkal)
    irregulars: list[tuple[str, dict]] = []
    by_mishkal: dict[str, list[tuple[str, dict]]] = {}
    for word_key, data in all_nouns.items():
        if not data.get("singular", "") or not data.get("plural", ""):
            continue
        mishkal = data.get("mishkal", "")
        if _is_irregular_plural(data.get("gender", ""), data["plural"]):
            irregulars.append((word_key, data))
        elif mishkal:
            by_mishkal.setdefault(mishkal, []).append((word_key, data))

    # Select exemplars per mishkal, preferring high-frequency words.
    # Target ≥2:1 regular:irregular ratio to avoid over-representing
    # irregulars — 6 per mishkal compensates for small groups (<6 entries)
    # that can't fill their quota.
    per_mishkal = 6
    selected: list[tuple[str, dict]] = list(irregulars)
    for _mishkal, entries in sorted(by_mishkal.items()):
        # Sort by frequency (lower index = more common); unknown words go last
        ranked = sorted(entries, key=lambda e: freq_order.get(e[0], 999999))
        selected.extend(ranked[:per_mishkal])

    note_count = 0
    for _word_key, data in selected:
        singular = data["singular"]
        plural = data["plural"]
        gender = data.get("gender", "")
        mishkal = data.get("mishkal", "")

        meaning = meanings.get(singular, "")
        if not meaning or meaning in missing:
            # Try without nikkud
            meaning = meanings.get(_strip_nikkud(singular), "")
            if meaning in missing:
                # Never show the literal "nan"/"None" of an empty cell.
                meaning = ""
        root = roots.get(singular, "")
        if not root or root in ("nan", "None", "-"):
            root = ""

        # Audio tags
        sg_audio = ""
        pl_audio = ""
        if include_audio:
            # Use local audio files if available
            sg_tag = _audio_tag(_strip_nikkud(singular))
            if sg_tag:
                sg_audio = sg_tag
                mp3_path = AUDIO_DIR / sg_tag.removeprefix("[sound:").removesuffix("]")
                if mp3_path not in seen_media:
                    seen_media.add(mp3_path)
                    media_files.append(mp3_path)

        tags = [RELEASE_TAG]
        if mishkal:
            tags.append(f"mishkal::{mishkal}")
        if _is_irregular_plural(gender, plural):
            tags.append("irregular")

        note = genanki.Note(
            model=PLURAL_MODEL,
            guid=genanki.guid_for("plural", singular),
            fields=[
                singular,
                sg_audio,
                plural,
                pl_audio,
                meaning,
                root,
                mishkal,
                gender,
            ],
            tags=tags,
        )
        deck.add_note(note)
        note_count += 1

    irregular_count = len(irregulars)
    regular_count = note_count - irregular_count
    logger.info(
        f"Plurals deck: {note_count} notes "
        f"({irregular_count} irregular + {regular_count} regular exemplars "
        f"from {len(by_mishkal)} mishkal patterns)"
    )
    return deck, media_files
def write_plural_apkg(
    deck: genanki.Deck,
    media_files: list[Path] | None = None,
    out_path: Path = PLURAL_APKG,
) -> None:
    """Package the plurals deck with its media and fonts and write it to *out_path*.

    Media paths that do not exist on disk are left out; the parent directory
    of *out_path* is created if needed.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    package = genanki.Package(deck)
    existing_media = [str(path) for path in media_files or [] if path.exists()]
    package.media_files = [*existing_media, *_font_media_files()]
    package.write_to_file(str(out_path))
    logger.info(f"Plurals deck written → {out_path}")
def _font_media_files() -> list[str]:
    """Collect the `_Heebo*.ttf` files found in FONTS_DIR as strings, for bundling in .apkg."""
    bundled: list[str] = []
    for font_file in list(FONTS_DIR.glob("_Heebo*.ttf")):
        if font_file.exists():
            bundled.append(str(font_file))
    return bundled
class _RandomOrderPackage(genanki.Package):
    """genanki.Package variant whose deck configs present new cards in random order.

    After the normal write, every deck config in the collection that has a
    "new" section gets its "order" set to 0 (random) instead of 1 (insertion
    order).
    """

    def write_to_db(self, cursor, timestamp, id_gen):
        super().write_to_db(cursor, timestamp, id_gen)
        stored = cursor.execute("SELECT dconf FROM col").fetchone()
        if not stored:
            return
        deck_configs = json.loads(stored[0])
        for config in deck_configs.values():
            if isinstance(config, dict) and "new" in config:
                config["new"]["order"] = 0
        cursor.execute("UPDATE col SET dconf = ?", [json.dumps(deck_configs)])
def write_vocab_apkg(
    deck: genanki.Deck,
    media_files: list[Path],
    out_path: Path = VOCAB_APKG,
) -> None:
    """Package the vocabulary deck with its media and fonts and write it to *out_path*.

    Media paths that do not exist on disk are left out; the parent directory
    of *out_path* is created if needed.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # Plain Package: insertion order = frequency rank (new.order=1 default)
    package = genanki.Package(deck)
    existing_media = [str(path) for path in media_files if path.exists()]
    package.media_files = [*existing_media, *_font_media_files()]
    package.write_to_file(str(out_path))
    logger.info(f"Vocabulary deck written → {out_path}")
def write_conj_apkg(
    deck: genanki.Deck,
    media_files: list[Path] | None = None,
    out_path: Path = CONJ_APKG,
) -> None:
    """Package the conjugation deck with its media and fonts and write it to *out_path*.

    Uses _RandomOrderPackage rather than the plain genanki package. Media
    paths that do not exist on disk are left out; the parent directory of
    *out_path* is created if needed.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    package = _RandomOrderPackage(deck)
    existing_media = [str(path) for path in media_files or [] if path.exists()]
    package.media_files = [*existing_media, *_font_media_files()]
    package.write_to_file(str(out_path))
    logger.info(f"Conjugation deck written → {out_path}")
def build_complete_deck(
    dict_csv: Path,
    conjugations: dict,
    examples_cache: dict | None = None,
    freq_cache: dict | None = None,
    image_cache: dict | None = None,
    emoji_lookup: dict | None = None,
    limit: int | None = None,
    include_audio: bool = False,
) -> tuple[list[genanki.Deck], list[Path]]:
    """Assemble the 'Hebrew::*' subdecks that make up the combined .apkg.

    Each standalone builder is run once and its notes are re-homed into a
    subdeck with its own deck ID. The plurals subdeck is only produced when
    data/noun_plurals.json exists.

    Returns (list_of_decks, deduplicated_media_files).
    """

    def as_subdeck(deck_id: int, name: str, source: genanki.Deck) -> genanki.Deck:
        """Copy every note of *source* into a fresh deck with the given ID and name."""
        subdeck = genanki.Deck(deck_id, name)
        for note in source.notes:
            subdeck.add_note(note)
        return subdeck

    logger.info(f" Building complete deck (audio={'yes' if include_audio else 'no'}) …")

    # Run the standalone builders first, in the same order as the subdecks.
    vocab_deck, vocab_media = build_vocab_deck(
        dict_csv,
        examples_cache=examples_cache,
        freq_cache=freq_cache,
        image_cache=image_cache or {},
        emoji_lookup=emoji_lookup,
        limit=limit,
        include_audio=include_audio,
        include_images=True,
    )
    conj_deck, conj_media = build_conj_deck(
        conjugations,
        include_audio=include_audio,
        dict_csv=dict_csv,
    )
    conf_deck, conf_media = build_confusables_deck(
        dict_csv,
        include_audio=include_audio,
    )

    # Subdeck names and IDs differ from the standalone decks'.
    complete_vocab = as_subdeck(COMPLETE_VOCAB_DECK_ID, "Hebrew::Vocabulary", vocab_deck)
    complete_conj = as_subdeck(COMPLETE_CONJ_DECK_ID, "Hebrew::Conjugations", conj_deck)
    complete_conf = as_subdeck(COMPLETE_CONF_DECK_ID, "Hebrew::Confusables", conf_deck)
    decks = [complete_vocab, complete_conj, complete_conf]
    all_source_media = vocab_media + conj_media + conf_media

    # Plurals subdeck (only if data exists)
    complete_plural = None
    plural_data_path = DATA_DIR / "noun_plurals.json"
    if plural_data_path.exists():
        plural_deck, plural_media = build_plural_deck(
            noun_plurals_path=plural_data_path,
            dict_csv=dict_csv,
            include_audio=include_audio,
        )
        complete_plural = as_subdeck(COMPLETE_PLURAL_DECK_ID, "Hebrew::Plurals", plural_deck)
        decks.append(complete_plural)
        all_source_media += plural_media

    # Deduplicate media files by resolved path, keeping first occurrences.
    seen_paths: set[str] = set()
    all_media: list[Path] = []
    for media_path in all_source_media:
        identity = str(media_path.resolve()) if media_path.exists() else str(media_path)
        if identity in seen_paths:
            continue
        seen_paths.add(identity)
        all_media.append(media_path)

    if complete_plural is not None:
        plural_info = f" + {len(complete_plural.notes)} plural"
    else:
        plural_info = ""
    logger.info(
        f" Complete deck: {len(complete_vocab.notes)} vocab + "
        f"{len(complete_conj.notes)} conj + {len(complete_conf.notes)} conf{plural_info} notes, "
        f"{len(all_media)} media files"
    )
    return decks, all_media
def write_complete_apkg(
    decks: list[genanki.Deck],
    media_files: list[Path],
    out_path: Path = COMPLETE_APKG,
) -> None:
    """Write all given subdecks into one combined .apkg at *out_path*.

    Media paths that do not exist on disk are left out; the parent directory
    of *out_path* is created if needed.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    package = genanki.Package(decks)
    existing_media = [str(path) for path in media_files if path.exists()]
    package.media_files = [*existing_media, *_font_media_files()]
    package.write_to_file(str(out_path))
    logger.info(f"Complete deck written → {out_path}")
def build_all_variants(
    dict_csv: Path,
    conjugations: dict,
    examples_cache: dict | None = None,
    freq_cache: dict | None = None,
    image_cache: dict | None = None,
    limit: int | None = None,
) -> None:
    """Build and write every release variant of every deck.

    In order: the four audio/image combinations of the vocabulary deck, then
    the without-audio and with-audio builds of the conjugation, confusables,
    plurals (skipped when data/noun_plurals.json is absent) and combined
    "Hebrew::*" decks. Each variant is written to its own .apkg path constant.
    """

    def yes_no(flag: bool) -> str:
        return "yes" if flag else "no"

    logger.info("Building all release variants …")
    emoji_lookup = _load_emoji_lookup()
    logger.info(f" Emoji lookup: {len(emoji_lookup)} keywords loaded")

    # Vocabulary: (audio, images, output path)
    for audio, images, path in (
        (False, False, VOCAB_APKG),
        (True, False, VOCAB_APKG_AUDIO),
        (False, True, VOCAB_APKG_IMAGES),
        (True, True, VOCAB_APKG_AUDIO_IMAGES),
    ):
        label = f"audio={yes_no(audio)} images={yes_no(images)}"
        logger.info(f" Vocab variant: {label}{path.name}")
        deck, media = build_vocab_deck(
            dict_csv,
            examples_cache=examples_cache,
            freq_cache=freq_cache,
            image_cache=image_cache or {},
            emoji_lookup=emoji_lookup,
            limit=limit,
            include_audio=audio,
            include_images=images,
        )
        write_vocab_apkg(deck, media, out_path=path)

    # Conjugations
    for audio, path in zip((False, True), (CONJ_APKG, CONJ_APKG_AUDIO)):
        label = f"audio={yes_no(audio)}"
        logger.info(f" Conj variant: {label}{path.name}")
        deck, media = build_conj_deck(conjugations, include_audio=audio, dict_csv=dict_csv)
        write_conj_apkg(deck, media, out_path=path)

    # Confusables
    for audio, path in zip((False, True), (CONF_APKG, CONF_APKG_AUDIO)):
        label = f"audio={yes_no(audio)}"
        logger.info(f" Conf variant: {label}{path.name}")
        deck, media = build_confusables_deck(dict_csv, include_audio=audio)
        write_conf_apkg(deck, media, out_path=path)

    # Noun plurals (only if data exists)
    plural_data_path = DATA_DIR / "noun_plurals.json"
    if not plural_data_path.exists():
        logger.info(" Skipping plural deck (data/noun_plurals.json not found)")
    else:
        for audio, path in zip((False, True), (PLURAL_APKG, PLURAL_APKG_AUDIO)):
            label = f"audio={yes_no(audio)}"
            logger.info(f" Plural variant: {label}{path.name}")
            deck, media = build_plural_deck(
                noun_plurals_path=plural_data_path,
                dict_csv=dict_csv,
                include_audio=audio,
            )
            write_plural_apkg(deck, media, out_path=path)

    # Combined "Hebrew::*" complete decks
    for audio, path in zip((False, True), (COMPLETE_APKG, COMPLETE_APKG_AUDIO)):
        decks, media = build_complete_deck(
            dict_csv,
            conjugations=conjugations,
            examples_cache=examples_cache,
            freq_cache=freq_cache,
            image_cache=image_cache,
            emoji_lookup=emoji_lookup,
            limit=limit,
            include_audio=audio,
        )
        write_complete_apkg(decks, media, out_path=path)

    logger.info("All variants built.")
if __name__ == "__main__":
    # Manual smoke build: a 20-word vocabulary deck, plus the conjugation
    # deck when conjugations.json is present.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")

    # Use the first vocabulary CSV that exists; if none does, the last
    # candidate is passed through unchanged.
    csv_candidates = (
        DATA_DIR / "hebrew_dict_for_anki.csv",
        DATA_DIR / "hebrew_dict.csv",
        DATA_DIR / "pealim_dict_for_anki.csv",
        DATA_DIR / "pealim_dict.csv",
    )
    csv_path = next(
        (candidate for candidate in csv_candidates if candidate.exists()),
        csv_candidates[-1],
    )
    deck, media = build_vocab_deck(csv_path, limit=20)
    write_vocab_apkg(deck, media)

    conj_path = DATA_DIR / "conjugations.json"
    if conj_path.exists():
        # Explicit UTF-8: the file holds Hebrew text and must not depend on
        # the platform's default encoding.
        with open(conj_path, encoding="utf-8") as f:
            conjugations = json.load(f)
        # NOTE(review): cross-linking always uses hebrew_dict_for_anki.csv,
        # not the CSV resolved above — confirm this is intended.
        csv_path = DATA_DIR / "hebrew_dict_for_anki.csv"
        conj_deck, conj_media = build_conj_deck(conjugations, dict_csv=csv_path)
        write_conj_apkg(conj_deck, conj_media)