# hebrew_flash_cards/apkg_builder.py

#!/usr/bin/env python3
"""
Build Anki .apkg files for the Hebrew decks: vocabulary, conjugations,
confusables, plurals, and the combined "Hebrew::*" package.
Uses genanki for reliable, stable deck generation.

Deck IDs are hardcoded integers — same ID on re-import updates the existing deck
in Anki rather than creating a duplicate.
"""

import json
import logging
import random
import re
import unicodedata
from pathlib import Path

import genanki
import pandas as pd

from helpers import strip_nikkud as _strip_nikkud

logger = logging.getLogger(__name__)

# Stable deck/model IDs — do not change these.
# As the module docstring notes, re-importing a package whose deck ID already
# exists in Anki updates that deck instead of creating a duplicate.
VOCAB_DECK_ID = 1_234_567_890
VOCAB_MODEL_ID = 1_701_222_017_968  # matches Nevo's original Anki model
CONJ_DECK_ID = 1_234_567_892
CONJ_MODEL_ID = 1_234_567_893
CONF_DECK_ID = 1_234_567_894
CONF_MODEL_ID = 1_234_567_895
PLURAL_DECK_ID = 1_234_567_896
PLURAL_MODEL_ID = 1_234_567_897

# Subdeck IDs for combined "Hebrew::*" package — MUST differ from standalone IDs
# (one per standalone deck above: vocabulary, conjugations, confusables, plurals).
COMPLETE_VOCAB_DECK_ID = 1_234_567_900
COMPLETE_CONJ_DECK_ID = 1_234_567_901
COMPLETE_CONF_DECK_ID = 1_234_567_902
COMPLETE_PLURAL_DECK_ID = 1_234_567_903

# Release version tag added to all notes so users can identify which release
# their cards come from (visible in Anki's Browse view and card info).
RELEASE_TAG = "v0.14"

# Regex for extracting emoji and Hebrew prepositions from meaning strings.
#
# EMOJI_RE matches a run of one or more code points drawn from:
#   U+1F000–U+1FFFF  (supplementary-plane symbols and pictographs)
#   U+2600–U+27FF and U+2300–U+23FF  (BMP symbol ranges)
#   U+FE00–U+FE0F    (variation selectors, so a trailing selector stays
#                     attached to the symbol it follows)
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
# HBPAREN_RE matches a parenthesised run of Hebrew characters (U+05B0–U+05EA,
# i.e. vowel points through the letter tav, plus U+05F0–U+05F4). Group 1 is
# the text between the parentheses; the parentheses themselves are not captured.
HBPAREN_RE = re.compile(r"\(([\u05b0-\u05ea\u05f0-\u05f4]+)\)")

DATA_DIR = Path(__file__).parent / "data"

# Legacy GUID map from Nevo's original Anki deck (imported ~Jul 2025).
# Preserves study progress on reimport by reusing the same note GUIDs.
_LEGACY_GUID_PATH = DATA_DIR / "legacy_guid_map.json"
_LEGACY_GUIDS: dict[str, str] = {}
if _LEGACY_GUID_PATH.exists():
    with open(_LEGACY_GUID_PATH) as _f:
        _LEGACY_GUIDS = json.load(_f)


def _vocab_guid(word: str, meaning: str = "") -> str:
    """Pick the note GUID for a vocabulary entry, preferring legacy GUIDs.

    Keys are tried against the legacy map in this order:

    1. ``<NFC word>||<first 30 chars of the lowercased, stripped meaning>`` —
       only when *meaning* is non-empty; this tells homographs apart.
    2. The NFC-normalized word on its own.

    If neither key is present, a deterministic GUID is derived with
    ``genanki.guid_for`` from ``(word, meaning)``, or from ``word`` alone when
    *meaning* is empty. The fallback hashes *word* exactly as passed in, not
    its NFC-normalized form.
    """
    nfc_word = unicodedata.normalize("NFC", word)

    # Most specific key first: compound (homograph) key, then the bare word.
    lookup_keys = [nfc_word]
    if meaning:
        meaning_prefix = meaning.lower().strip()[:30]
        lookup_keys.insert(0, f"{nfc_word}||{meaning_prefix}")

    for candidate in lookup_keys:
        if candidate in _LEGACY_GUIDS:
            return _LEGACY_GUIDS[candidate]

    if not meaning:
        return genanki.guid_for(word)
    return genanki.guid_for(word, meaning)


# Audio inputs: AUDIO_DIR is the default lookup directory for vocabulary
# recordings (_audio_tag); AUDIO_CONJ_DIR holds conjugation recordings
# (_conj_audio_tag).
AUDIO_DIR = DATA_DIR / "audio"
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
# Output directory, located next to this module.
OUTPUT_DIR = Path(__file__).parent / "output"

# Output package paths. The filename suffix names the media variant
# (_audio, _images, _audio_images); no suffix means the plain package.
VOCAB_APKG = OUTPUT_DIR / "hebrew_vocabulary.apkg"
VOCAB_APKG_AUDIO = OUTPUT_DIR / "hebrew_vocabulary_audio.apkg"
VOCAB_APKG_IMAGES = OUTPUT_DIR / "hebrew_vocabulary_images.apkg"
VOCAB_APKG_AUDIO_IMAGES = OUTPUT_DIR / "hebrew_vocabulary_audio_images.apkg"
CONJ_APKG = OUTPUT_DIR / "hebrew_conjugations.apkg"
CONJ_APKG_AUDIO = OUTPUT_DIR / "hebrew_conjugations_audio.apkg"
CONF_APKG = OUTPUT_DIR / "hebrew_confusables.apkg"
CONF_APKG_AUDIO = OUTPUT_DIR / "hebrew_confusables_audio.apkg"
PLURAL_APKG = OUTPUT_DIR / "hebrew_plurals.apkg"
PLURAL_APKG_AUDIO = OUTPUT_DIR / "hebrew_plurals_audio.apkg"
COMPLETE_APKG = OUTPUT_DIR / "hebrew_complete.apkg"
COMPLETE_APKG_AUDIO = OUTPUT_DIR / "hebrew_complete_audio.apkg"

# ──────────────────────────────────────────────────────────────────────────────
# Binyan → Hebrew label mapping (for conjugation card display)
# ──────────────────────────────────────────────────────────────────────────────

BINYAN_TO_HEBREW: dict[str, str] = {
    "Pa'al": "פָּעַל",
    "Nif'al": "נִפְעַל",
    "Pi'el": "פִּעֵל",
    "Pu'al": "פֻּעַל",
    "Hitpa'el": "הִתְפַּעֵל",
    "Hif'il": "הִפְעִיל",
    "Huf'al": "הֻפְעַל",
}

# ──────────────────────────────────────────────────────────────────────────────
# PoS → Hebrew label mapping
# ──────────────────────────────────────────────────────────────────────────────

POS_TO_HEBREW = {
    "Noun": "שם עצם",
    "Verb": "פועל",
    "Adjective": "שם תואר",
    "Adverb": "תואר הפועל",
    "Preposition": "מילת יחס",
    "Conjunction": "מילת חיבור",
    "Pronoun": "כינוי גוף",
    "Particle": "מילית",
}

# PoS category groupings for related-words display
POS_CATEGORY_LABELS = {
    "Verb": "פעלים",
    "Noun": "שמות עצם",
    "Adjective": "שמות תואר",
    "Adverb": "תוארי הפועל",
}

# ──────────────────────────────────────────────────────────────────────────────
# Shared CSS
# ──────────────────────────────────────────────────────────────────────────────

FONTS_DIR = DATA_DIR / "fonts"

CARD_CSS = """
@font-face {
  font-family: 'Heebo';
  src: url('_Heebo-Regular.ttf');
  font-weight: normal;
}
@font-face {
  font-family: 'Heebo';
  src: url('_Heebo-Bold.ttf');
  font-weight: bold;
}
.card {
  font-family: 'Heebo', 'Arial Hebrew', 'David', Arial, sans-serif;
  font-size: 20px;
  text-align: center;
  color: #222;
  background: #fff;
  padding: 16px;
}
.hebrew {
  font-size: 36px;
  font-weight: bold;
  direction: rtl;
  text-align: center;
  line-height: 1.5;
  color: #222;
}
.hebrew-sm {
  font-size: 24px;
  font-weight: normal;
  direction: rtl;
  text-align: center;
  color: #333;
}
.meaning {
  font-size: 28px;
  color: #1a1a8c;
  margin: 8px 0;
}
.hint {
  font-size: 16px;
  color: #888;
  margin: 4px 0;
  direction: rtl;
}
.root-info {
  font-size: 18px;
  color: #555;
  margin-top: 6px;
  direction: rtl;
}
.example {
  font-size: 18px;
  color: #444;
  direction: rtl;
  text-align: right;
  font-style: italic;
  margin: 10px auto 0;
  max-width: 90%;
  border-right: 3px solid #aaa;
  padding-right: 8px;
}
.divider { border-top: 1px solid #ddd; margin: 10px 0; }
.freq-badge {
  display: inline-block;
  font-size: 11px;
  color: #aaa;
  background: transparent;
  border: 1px solid #eee;
  border-radius: 10px;
  padding: 2px 8px;
  margin-top: 4px;
}
.voice-label {
  font-size: 0.6em;
  font-weight: normal;
  color: #555;
}
.sec-label {
  font-size: 32px;
  color: #555;
  direction: rtl;
  text-align: center;
  margin-top: 6px;
}
.sec-key {
  font-size: 24px;
  color: #888;
}
.related-group {
  direction: rtl;
  text-align: right;
  margin: 2px 0;
  font-size: 18px;
}
.emoji-img {
  font-size: 3.5em;
  text-align: center;
  margin: 0.3em 0;
}
@media (prefers-color-scheme: dark) {
  .card        { color: #e8e8e8; background: #1c1c1e; }
  .hebrew      { color: #f0f0f0; }
  .hebrew-sm   { color: #ddd; }
  .meaning     { color: #82b0ff; }
  .root-info   { color: #aaa; }
  .sec-label   { color: #aaa; }
  .sec-key     { color: #666; }
  .hint        { color: #777; }
  .voice-label { color: #888; }
  .example     { color: #bbb; border-right-color: #555; }
  .divider     { border-top-color: #333; }
  .freq-badge  { color: #888; border-color: #444; }
}
"""

# ──────────────────────────────────────────────────────────────────────────────
# Vocabulary Deck
# ──────────────────────────────────────────────────────────────────────────────

VOCAB_FRONT_HEB = """
<div class="hebrew">{{Word}}{{#Prep}} <span class="hebrew-sm">{{Prep}}</span>{{/Prep}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
"""

VOCAB_BACK_HEB = """
{{FrontSide}}
<div class="divider"></div>
<div class="meaning">{{Meaning}}</div>
{{#Emoji}}<div class="emoji-img">{{Emoji}}</div>{{/Emoji}}
{{^Emoji}}{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;" onerror="this.parentElement.style.display='none'"></div>{{/Image}}{{/Emoji}}
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>{{/Root}}
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span> {{PoS}}</div>{{/PoS}}
{{#SharedRoots}}
<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
<div class="root-info">{{SharedRoots}}</div>
{{/SharedRoots}}
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
{{#Example}}
<div class="example">{{Example}}</div>
{{/Example}}
{{#Frequency}}<div class="freq-badge">#{{Frequency}}</div>{{/Frequency}}
"""

VOCAB_FRONT_ENG = """
<div class="meaning">{{Meaning}}</div>
{{#Hint}}<div class="hint">{{Hint}}</div>{{/Hint}}
{{#Emoji}}<div class="emoji-img">{{Emoji}}</div>{{/Emoji}}
{{^Emoji}}{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;" onerror="this.parentElement.style.display='none'"></div>{{/Image}}{{/Emoji}}
"""

VOCAB_BACK_ENG = """
{{FrontSide}}
<div class="divider"></div>
<div class="hebrew">{{Word}}{{#Prep}} <span class="hebrew-sm">{{Prep}}</span>{{/Prep}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span> {{WordNoNikkud}}</div>{{/WordNoNikkud}}
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>{{/Root}}
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span> {{PoS}}</div>{{/PoS}}
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
{{#Example}}
<div class="example">{{Example}}</div>
{{/Example}}
"""

VOCAB_FRONT_CLOZE = """
<div class="example" style="font-size:24px;font-style:normal;border:none;padding:0;text-align:center;">{{ClozeExample}}</div>
{{#ClozeHint}}<div class="hint">{{ClozeHint}}</div>{{/ClozeHint}}
"""

VOCAB_BACK_CLOZE = """
{{FrontSide}}
<div class="divider"></div>
<div class="hebrew">{{Word}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
<div class="meaning">{{Meaning}}</div>
"""

VOCAB_MODEL = genanki.Model(
    VOCAB_MODEL_ID,
    "Hebrew Flash Cards",
    fields=[
        {"name": "Word"},
        {"name": "Root"},
        {"name": "PoS"},
        {"name": "Meaning"},
        {"name": "WordNoNikkud"},
        {"name": "SharedRoots"},
        {"name": "Tags"},
        {"name": "Audio"},
        {"name": "Example"},
        {"name": "Frequency"},
        {"name": "Image"},
        {"name": "Emoji"},
        {"name": "Prep"},
        {"name": "Hint"},
        {"name": "Plural"},
        {"name": "ClozeExample"},
        {"name": "ClozeHint"},
    ],
    templates=[
        {
            # ord 0 — matches Nevo's original "Card 2" (Eng→Heb)
            "name": "English → Hebrew",
            "qfmt": VOCAB_FRONT_ENG,
            "afmt": VOCAB_BACK_ENG,
        },
        {
            # ord 1 — matches Nevo's original "Card 3" (Heb→Eng)
            "name": "Hebrew → English",
            "qfmt": VOCAB_FRONT_HEB,
            "afmt": VOCAB_BACK_HEB,
        },
        {
            # ord 2 — Cloze-style sentence fill-in (only generated when ClozeExample is non-empty)
            "name": "Sentence Cloze",
            "qfmt": VOCAB_FRONT_CLOZE,
            "afmt": VOCAB_BACK_CLOZE,
        },
    ],
    css=CARD_CSS,
)

# ──────────────────────────────────────────────────────────────────────────────
# Conjugation Deck
# ──────────────────────────────────────────────────────────────────────────────

CONJ_FRONT = """
<div class="hebrew">{{Pronoun}}</div>
<div class="meaning" style="font-size:28px;">{{ReferenceForm}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
<div class="hebrew">{{Tense}}</div>
"""

CONJ_BACK = """
{{FrontSide}}<hr>
<div class="hebrew">{{ConjugatedForm}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
{{#Meaning}}<div class="sec-label">{{Meaning}}</div>{{/Meaning}}
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span> {{Binyan}}</div>
{{#RelatedVocab}}<div class="sec-label">{{RelatedVocab}}</div>{{/RelatedVocab}}
"""

CONJ_CSS = CARD_CSS

CONJ_MODEL = genanki.Model(
    CONJ_MODEL_ID,
    "Pealim Conjugation",
    fields=[
        {"name": "Infinitive"},
        {"name": "ReferenceForm"},
        {"name": "Pronoun"},
        {"name": "Tense"},
        {"name": "ConjugatedForm"},
        {"name": "Root"},
        {"name": "Binyan"},
        {"name": "Voice"},
        {"name": "Audio"},
        {"name": "Meaning"},
        {"name": "RelatedVocab"},
    ],
    templates=[
        {
            "name": "Conjugation Drill",
            "qfmt": CONJ_FRONT,
            "afmt": CONJ_BACK,
        }
    ],
    css=CONJ_CSS,
)

# Present-tense expansion: each form key → list of (pronoun, tense_label)
PRESENT_EXPANSION = {
    "present_ms": [
        ("אֲנִי (זָכָר)", "הוֹוֶה"),
        ("אַתָּה", "הוֹוֶה"),
        ("הוּא", "הוֹוֶה"),
    ],
    "present_fs": [
        ("אֲנִי (נְקֵבָה)", "הוֹוֶה"),
        ("אַתְּ", "הוֹוֶה"),
        ("הִיא", "הוֹוֶה"),
    ],
    "present_mp": [
        ("אֲנַחְנוּ (זָכָר)", "הוֹוֶה"),
        ("אַתֶּם", "הוֹוֶה"),
        ("הֵם", "הוֹוֶה"),
    ],
    "present_fp": [
        ("אֲנַחְנוּ (נְקֵבָה)", "הוֹוֶה"),
        ("אַתֶּן", "הוֹוֶה"),
        ("הֵן", "הוֹוֶה"),
    ],
}

# Modern Hebrew: 2fp/3fp future and imperative default to mp form (classical in parens)
FP_MODERN_FALLBACK = {
    "future_2fp": "future_2mp",
    "future_3fp": "future_3mp",
    "imperative_fp": "imperative_mp",
}

# 3rd person plural past: same form for m/f — generate two separate pronoun cards
PAST_3P_EXPANSION = [
    ("הֵם", "עָבָר"),
    ("הֵן", "עָבָר"),
]

# Voice field: passive label only (shown inline on card front for Pu'al/Huf'al)
VOICE_MAP = {
    "Pu'al": "סָבִיל",
    "Huf'al": "סָבִיל",
}


# ──────────────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────────────


def _audio_tag(word_no_nikkud: str, audio_dir: Path = AUDIO_DIR) -> str:
    """Build the Anki ``[sound:...]`` tag for a word's recording, if one exists.

    The filename is the word reduced to the Hebrew letters U+05D0–U+05EA
    (every other character is dropped) plus ``.mp3``, looked up in *audio_dir*.

    Returns:
        ``[sound:<name>.mp3]`` when that file exists; otherwise an empty
        string (also when no Hebrew letters remain after filtering).
    """
    hebrew_letters = "".join(re.findall(r"[\u05d0-\u05ea]", word_no_nikkud))
    if hebrew_letters:
        candidate = audio_dir / (hebrew_letters + ".mp3")
        if candidate.exists():
            return "[sound:" + candidate.name + "]"
    return ""


def _conj_audio_tag(slug: str, form_key: str) -> str:
    """Build the Anki ``[sound:...]`` tag for one conjugated form's recording.

    The expected file is ``<slug>_<form_key>.mp3`` inside ``AUDIO_CONJ_DIR``.

    Returns:
        ``[sound:<slug>_<form_key>.mp3]`` when that file exists, else an
        empty string.
    """
    mp3_name = "{}_{}.mp3".format(slug, form_key)
    found = (AUDIO_CONJ_DIR / mp3_name).exists()
    return "[sound:" + mp3_name + "]" if found else ""


# Keywords excluded when building emoji lookup AND matching meaning text.
# Curated from manual review of all 2,261 emoji-word pairs (Sprint 8).
_EMOJI_STOP = frozenset(
    {
        # Basic stop words
        "to",
        "be",
        "a",
        "an",
        "the",
        "of",
        "in",
        "on",
        "at",
        "for",
        "and",
        "with",
        "by",
        "or",
        "but",
        "not",
        "as",
        "its",
        # Generic emoji description words (too vague)
        "face",
        "hand",
        "sign",
        "symbol",
        "button",
        "small",
        "large",
        "light",
        "dark",
        "open",
        "closed",
        # Numbers → clock emoji (🕐🕑🕒 etc.)
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
        "ten",
        "hundred",
        "thousand",
        # UI/media buttons (⏭️▶️⏪⏯️🔁🔚🔜⏺️)
        "next",
        "fast",
        "play",
        "pause",
        "repeat",
        "end",
        "soon",
        "record",
        # Abstract words → misleading object emoji
        "part",
        "place",
        "mark",
        "post",
        "department",
        "store",
        "note",
        "control",
        "level",
        "stop",
        "cover",
        "roll",
        "rolling",
        "pick",
        "over",
        "right",
        "way",
        "skin",
        "drop",
        "middle",
        "piece",
        "section",
        # Country/direction words → flag emoji (🇰🇵🇬🇸🇮🇴🇻🇦🇨🇫 etc.)
        "north",
        "south",
        "northern",
        "southern",
        "western",
        "eastern",
        "central",
        "territory",
        "kingdom",
        "united",
        "virgin",
        # Common words producing bad emoji matches
        "new",
        "big",
        "full",
        "last",
        "first",
        "double",
        "slightly",
        "without",
        "from",
        "behind",
        "people",
        "position",
        "status",
        "situation",
        "game",
        "call",
        "trade",
        "male",
        "female",
        "person",
        "letter",
        # Polysemous words → wrong emoji sense
        "french",
        "fried",
        "board",
        "bow",
        "water",
        "union",
        "rock",
        "left",
        "back",
        "crane",
        "dash",
        "bar",
        "wheel",
        "horizontal",
    }
)


def _load_emoji_lookup() -> dict[str, str]:
    """Load or fetch the Unicode emoji keyword → character lookup.

    Parses unicode.org emoji-test.txt to build a ``{keyword: emoji_char}``
    mapping. Only "fully-qualified" entries are used; for each keyword the
    first emoji encountered wins. Keywords of two characters or fewer and
    those listed in ``_EMOJI_STOP`` are skipped.

    The result is cached in data/emoji_lookup.json, always read and written
    as UTF-8. An unreadable or corrupt cache is ignored and rebuilt. A failure
    to write the cache is logged but does not discard the freshly built lookup.

    Returns:
        The keyword → emoji mapping, or an empty dict when the data cannot be
        downloaded (safe fallback).
    """
    cache_file = DATA_DIR / "emoji_lookup.json"
    if cache_file.exists():
        try:
            # The cache stores raw emoji (ensure_ascii=False), so the encoding
            # must be explicit rather than the platform's locale default.
            with open(cache_file, encoding="utf-8") as f:
                return json.load(f)
        except (ValueError, OSError) as e:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            logger.warning(f"Emoji cache {cache_file} is unreadable ({e}); rebuilding.")

    import requests  # deferred: only needed when the cache has to be (re)built

    try:
        resp = requests.get(
            "https://unicode.org/Public/emoji/latest/emoji-test.txt",
            timeout=30,
        )
        resp.raise_for_status()
    except Exception as e:
        logger.warning(f"Could not fetch emoji data: {e}. Emoji lookup disabled.")
        return {}

    # emoji-test.txt is UTF-8; do not depend on the encoding requests infers
    # from the response headers.
    resp.encoding = "utf-8"

    # Captures the emoji (group 1) and its description (group 2) from the
    # trailing "# <emoji> E<version> <description>" part of a data line.
    entry_re = re.compile(r"#\s+(\S+)\s+E[\d.]+\s+(.+)")

    lookup: dict[str, str] = {}
    for line in resp.text.splitlines():
        if "fully-qualified" not in line:
            continue
        m = entry_re.search(line)
        if not m:
            continue
        emoji_char = m.group(1)
        desc = m.group(2).lower().strip()
        for word in desc.split():
            word = word.strip(".,'\"-")
            # First emoji seen for a keyword wins; later ones are ignored.
            if len(word) > 2 and word not in _EMOJI_STOP and word not in lookup:
                lookup[word] = emoji_char

    try:
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        cache_file.write_text(json.dumps(lookup, ensure_ascii=False), encoding="utf-8")
    except OSError as e:
        # Caching is an optimisation only — still return what was built.
        logger.warning(f"Could not write emoji cache {cache_file}: {e}")
    else:
        logger.info(f"Built emoji lookup: {len(lookup)} keywords → {cache_file}")
    return lookup


def _translate_pos(pos_str: str) -> str:
    """Translate an English part-of-speech string to its Hebrew label.

    Labels from ``POS_TO_HEBREW`` are tried in dictionary order and matched
    case-insensitively. A label only matches where it starts a word, so
    "Adverb" is not mistaken for "Verb" and "Pronoun" is not mistaken for
    "Noun". Trailing text such as "Noun – ... pattern" or a plural "Verbs"
    still matches.

    For verbs, the binyan is appended when one of the ``BINYAN_TO_HEBREW``
    names occurs in the string (typographic apostrophes are treated like
    ``'``). A verb with no recognised binyan gets the plain verb label.

    Returns:
        The Hebrew label, or *pos_str* unchanged when no label matches.
    """
    lowered = pos_str.lower()
    for eng, heb in POS_TO_HEBREW.items():
        # Require a non-letter (or start of string) before the label: a bare
        # substring test lets "verb" hit "adverb" and "noun" hit "pronoun".
        if not re.search(rf"(?<![a-z]){re.escape(eng.lower())}", lowered):
            continue
        if eng == "Verb":
            # Extract binyan from strings like "Verb – Pi'el" or "Verb –pi'el"
            normalized = lowered.replace("\u2019", "'").replace("\u2018", "'")
            for binyan_eng, binyan_heb in BINYAN_TO_HEBREW.items():
                if binyan_eng.lower() in normalized:
                    return f"פועל — {binyan_heb}"
        return heb
    return pos_str


def _categorize_pos(pos_str: str) -> str:
    """Return the canonical PoS category key used to group related words.

    Categories are the keys of ``POS_CATEGORY_LABELS``, tried in dictionary
    order and matched case-insensitively. A category only matches where it
    starts a word, so "Adverb" is not filed under "Verb" and "Pronoun" is not
    filed under "Noun".

    Returns:
        The matching category key, or ``"Other"`` when none matches.
    """
    lowered = pos_str.lower()
    for cat in POS_CATEGORY_LABELS:
        # Require a non-letter (or start of string) before the category name;
        # a bare substring test would let "verb" hit "adverb".
        if re.search(rf"(?<![a-z]){re.escape(cat.lower())}", lowered):
            return cat
    return "Other"


def build_vocab_deck(
    dict_csv: Path,
    examples_cache: dict | None = None,
    freq_cache: dict | None = None,
    image_cache: dict | None = None,
    emoji_lookup: dict | None = None,
    limit: int | None = None,
    include_audio: bool = True,
    include_images: bool = True,
) -> tuple[genanki.Deck, list[Path]]:
    """
    Build the vocabulary deck from pealim_dict_for_anki.csv (or pealim_dict.csv).
    Returns (deck, list_of_media_files).
    """
    logger.info(f"Loading dictionary from {dict_csv}")
    try:
        df = pd.read_csv(dict_csv, sep=";", index_col=0)
        if df.shape[1] < 3:
            raise ValueError("too few columns")
    except (ValueError, pd.errors.ParserError):
        df = pd.read_csv(dict_csv, index_col=0)

    if limit:
        df = df.head(limit)

    logger.info(f"  {len(df)} rows loaded")

    examples_cache = examples_cache or {}
    freq_cache = freq_cache or {}
    image_cache = image_cache or {}

    # Load EPUB/PDF sentence matches (nikkud'd — preferred over Ben Yehuda)
    epub_examples: dict[str, list[str]] = {}
    epub_path = DATA_DIR / "vocab_sentence_matches.json"
    if epub_path.exists():
        try:
            with open(epub_path) as _f:
                raw_epub = json.load(_f)
            for word_key, info in raw_epub.items():
                sents = info.get("sentences", [])
                if sents:
                    epub_examples[word_key] = [s["text"] if isinstance(s, dict) else s for s in sents]
                    # Also index by nikkud form
                    nikkud_word = info.get("word_nikkud", "")
                    if nikkud_word and nikkud_word != word_key:
                        epub_examples[nikkud_word] = epub_examples[word_key]
            logger.info(f"  EPUB sentence matches loaded: {len(epub_examples)} words")
        except (json.JSONDecodeError, OSError):
            pass

    # Load AI-vetted sentences for cloze cards (only approved sentences)
    vetted_cloze: dict[str, list[str]] = {}  # word_nikkud → [good sentences]
    vetted_path = DATA_DIR / "vetted_sentences.json"
    if vetted_path.exists():
        try:
            with open(vetted_path) as _f:
                raw_vetted = json.load(_f)
            for word_key, info in raw_vetted.items():
                good = info.get("good_sentences", [])
                if good:
                    texts = [s["text"] if isinstance(s, dict) else s for s in good]
                    nikkud_word = info.get("word_nikkud", word_key)
                    vetted_cloze[nikkud_word] = texts
                    if word_key != nikkud_word:
                        vetted_cloze[word_key] = texts
            logger.info(f"  Vetted cloze sentences loaded: {len(vetted_cloze)} words")
        except (json.JSONDecodeError, OSError):
            pass

    # Load noun plural forms for vocab card back display
    noun_plural_lookup: dict[str, str] = {}  # word (nikkud) → plural (nikkud)
    _noun_plural_stripped: dict[str, str] = {}  # word (stripped) → plural (nikkud), fallback
    noun_plural_path = DATA_DIR / "noun_plurals.json"
    if noun_plural_path.exists():
        try:
            with open(noun_plural_path) as _f:
                _noun_data = json.load(_f)
            for _entry in _noun_data.values():
                sg = _entry.get("singular", "")
                pl = _entry.get("plural", "")
                if sg and pl:
                    noun_plural_lookup[sg] = pl
                    s = _strip_nikkud(sg)
                    if s not in _noun_plural_stripped:
                        _noun_plural_stripped[s] = pl
            logger.info(f"  Noun plurals loaded: {len(noun_plural_lookup)} entries")
        except (json.JSONDecodeError, OSError):
            pass

    # Load refined meanings for synonym disambiguation (layer 2)
    refined_meanings: dict[str, str] = {}
    refined_path = DATA_DIR / "refined_meanings.json"
    if refined_path.exists():
        try:
            with open(refined_path) as _f:
                refined_meanings = json.load(_f)
            logger.info(f"  Refined meanings loaded: {len(refined_meanings)} entries")
        except (json.JSONDecodeError, OSError):
            pass

    # Load image cache from disk if not passed in
    image_cache_path = DATA_DIR / "image_cache.json"
    if not image_cache and image_cache_path.exists():
        try:
            with open(image_cache_path) as _f:
                image_cache = json.load(_f)
        except (json.JSONDecodeError, OSError) as e:
            logger.debug(f"Could not load image cache from disk: {e}")

    images_dir = DATA_DIR / "images"

    # Build word_stripped → pos_category dict for related-words grouping
    word_to_pos_cat: dict[str, str] = {}
    for _, row in df.iterrows():
        wni = str(row.get("Word Without Nikkud", "")).strip()
        pos_raw = str(row.get("Part of speech", row.get("Part of Speech", ""))).strip()
        if wni and pos_raw and pos_raw not in ("nan", "None"):
            word_to_pos_cat[_strip_nikkud(wni)] = _categorize_pos(pos_raw)

    # Build confusable words set: consonant-only forms with multiple entries
    # Uses _strip_nikkud (removes combining marks) rather than Word Without Nikkud
    # (which preserves matres lectionis) — since sentence matching also uses
    # _strip_nikkud, we need to detect collisions at that level.
    _strip_to_nikkud: dict[str, set[str]] = {}
    for _, row in df.iterrows():
        w = str(row.get("Word", "")).strip()
        if w and w not in ("nan", "None"):
            consonants = _strip_nikkud(w)
            _strip_to_nikkud.setdefault(consonants, set()).add(w)
    _confusable_words: set[str] = {k for k, v in _strip_to_nikkud.items() if len(v) > 1}
    if _confusable_words:
        logger.info(f"  Confusable words (homographs): {len(_confusable_words)} stripped forms")

    # Build ambiguity index: group words by normalized meaning to detect
    # Eng→Heb collisions. A word needs a hint when another word shares
    # the same English meaning. Hint = PoS (+ binyan for verbs).
    _meaning_groups: dict[str, list[tuple[str, str]]] = {}  # norm_meaning → [(word, pos_raw)]
    for _, row in df.iterrows():
        w = str(row.get("Word", "")).strip()
        m = str(row.get("Meaning", "")).strip()
        p = str(row.get("Part of speech", row.get("Part of Speech", ""))).strip()
        if not w or not m or m in ("nan", "None"):
            continue
        # Normalize: strip emoji, Hebrew parens, take text before first semicolon
        m_clean = EMOJI_RE.sub("", m).strip()
        m_clean = HBPAREN_RE.sub("", m_clean).strip().strip(",").strip()
        m_norm = m_clean.split(";")[0].strip().lower()
        if m_norm:
            _meaning_groups.setdefault(m_norm, []).append((w, p if p not in ("nan", "None") else ""))

    # For each word in an ambiguous group, build its hint string
    _word_hints: dict[tuple[str, str], str] = {}  # (word, meaning) → hint
    for _m_norm, entries in _meaning_groups.items():
        if len(entries) < 2:
            continue
        # Check if the group has genuinely different PoS/binyan (not just duplicates)
        pos_set = set()
        for _, p in entries:
            pos_set.add(_translate_pos(p) if p else "")
        if len(pos_set) < 2:
            continue
        for w, p in entries:
            hint = _translate_pos(p) if p else ""
            if hint:
                # Find original meaning for this word to build the (word, meaning) key
                _word_hints.setdefault((w, hint), hint)

    # Rebuild as (word, full_meaning) → hint for lookup during note creation
    _word_meaning_hints: dict[tuple[str, str], str] = {}
    for _, row in df.iterrows():
        w = str(row.get("Word", "")).strip()
        m = str(row.get("Meaning", "")).strip()
        p = str(row.get("Part of speech", row.get("Part of Speech", ""))).strip()
        if not w or not m or m in ("nan", "None"):
            continue
        hint = _translate_pos(p) if p and p not in ("nan", "None") else ""
        if (w, hint) in _word_hints:
            _word_meaning_hints[(w, m)] = hint

    if _word_meaning_hints:
        logger.info(f"  Eng→Heb disambiguation hints: {len(_word_meaning_hints)} words")

    # Sort by frequency rank
    def freq_sort_key(row):
        word_plain = _strip_nikkud(str(row.get("Word Without Nikkud", row.get("WordNoNikkud", ""))).strip())
        return freq_cache.get(word_plain, 999_999)

    df["_freq_rank"] = df.apply(freq_sort_key, axis=1)
    df = df.sort_values("_freq_rank")

    deck = genanki.Deck(VOCAB_DECK_ID, "Hebrew Vocabulary")
    media_files: list[Path] = []
    seen_words: set[tuple[str, str]] = set()

    for _, row in df.iterrows():
        word = str(row.get("Word", "")).strip()
        root = str(row.get("Root", "")).strip()
        pos_raw = str(row.get("Part of speech", row.get("Part of Speech", ""))).strip()
        meaning = str(row.get("Meaning", "")).strip()
        word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
        shared_roots = str(row.get("shared roots", row.get("SharedRoots", ""))).strip()
        tags_str = str(row.get("tags", row.get("Tags", ""))).strip()
        freq_rank_raw = row["_freq_rank"]
        if freq_rank_raw <= 500:
            freq_display = f"Core #{freq_rank_raw}"
        elif freq_rank_raw <= 1500:
            freq_display = f"Essential #{freq_rank_raw}"
        elif freq_rank_raw <= 3000:
            freq_display = f"Intermediate #{freq_rank_raw}"
        elif freq_rank_raw <= 5000:
            freq_display = f"Upper-intermediate #{freq_rank_raw}"
        elif freq_rank_raw <= 10000:
            freq_display = f"Advanced #{freq_rank_raw}"
        elif freq_rank_raw < 999_999:
            freq_display = f"Rare #{freq_rank_raw}"
        else:
            freq_display = "Unlisted"

        root = "" if root in ("nan", "None", "-") else root
        pos_raw = "" if pos_raw in ("nan", "None") else pos_raw
        meaning = "" if meaning in ("nan", "None") else meaning
        word_no_nik = "" if word_no_nik in ("nan", "None") else word_no_nik
        shared_roots = "" if shared_roots in ("nan", "None") else shared_roots
        tags_str = "" if tags_str in ("nan", "None") else tags_str

        if not word or not meaning:
            continue

        # Skip exact duplicates (same word AND same meaning — true dupes).
        # Homographs (same word, different meaning) are kept as separate notes.
        word_meaning_key = (word, meaning)
        if word_meaning_key in seen_words:
            logger.debug(f"  Skipping duplicate word+meaning: {word}")
            continue
        seen_words.add(word_meaning_key)

        # Extract emoji from meaning (pealim embeds emoji in meaning text)
        emoji_str = "".join(EMOJI_RE.findall(meaning))
        meaning_clean = EMOJI_RE.sub("", meaning).strip()

        # Fallback: look up emoji from Unicode standard by English keyword
        if not emoji_str and emoji_lookup:
            for kw in re.sub(r"[^\w\s]", " ", meaning_clean.lower()).split()[:5]:
                if len(kw) > 2 and kw not in _EMOJI_STOP and kw in emoji_lookup:
                    emoji_str = emoji_lookup[kw]
                    break

        # Extract Hebrew parentheticals (prepositions) from meaning
        preps = HBPAREN_RE.findall(meaning_clean)
        prep_str = " ".join(f"({p})" for p in preps)
        meaning_clean = HBPAREN_RE.sub("", meaning_clean).strip().strip(",").strip()

        # Apply refined meaning if available (AI disambiguation layer 2)
        if word in refined_meanings:
            meaning_clean = refined_meanings[word]

        # Translate PoS to Hebrew
        pos_heb = _translate_pos(pos_raw) if pos_raw else ""

        # Eng→Heb disambiguation hint (PoS + binyan, shown only for ambiguous meanings)
        hint_str = _word_meaning_hints.get((word, meaning), "")

        # Audio
        audio_tag = _audio_tag(word_no_nik) if include_audio else ""
        if audio_tag:
            mp3_name = audio_tag.removeprefix("[sound:").removesuffix("]")
            mp3_path = AUDIO_DIR / mp3_name
            if mp3_path not in media_files:
                media_files.append(mp3_path)

        # Consonant-only form for confusable detection and cloze matching
        word_consonants = _strip_nikkud(word)
        is_confusable = word_consonants in _confusable_words

        # Example sentences — priority: EPUB (nikkud'd) > Ben Yehuda > none
        # For confusable words (same consonants, different nikkud), only match by
        # exact nikkud form to avoid showing wrong-word sentences.
        example_html = ""
        # 1. EPUB/PDF sentences (full nikkud)
        epub_sents = epub_examples.get(word)
        if not epub_sents and not is_confusable:
            epub_sents = epub_examples.get(word_no_nik) or epub_examples.get(_strip_nikkud(word_no_nik))
        if epub_sents:
            example_html = epub_sents[0]
        else:
            # 2. Ben Yehuda examples (some have nikkud from nikkud corpus)
            by_sents = examples_cache.get(word)
            if not by_sents and not is_confusable:
                by_sents = examples_cache.get(word_no_nik) or examples_cache.get(_strip_nikkud(word_no_nik))
            if by_sents:
                # Prefer nikkud'd Ben Yehuda sentences (contain combining marks)
                nikkud_sents = [s for s in by_sents if any("\u0591" <= c <= "\u05c7" for c in s)]
                example_html = nikkud_sents[0] if nikkud_sents else by_sents[0]

        # Cloze example: replace target word with blank in example sentence.
        # Priority: AI-vetted sentences > EPUB/Ben Yehuda sentences.
        # Uses stripped (no-nikkud) matching. Skips homographs (confusable words).
        cloze_example = ""
        cloze_hint = ""
        if word_consonants and not is_confusable:
            # Pick best sentence for cloze: vetted first, then example_html
            cloze_source = None
            vetted = vetted_cloze.get(word)
            if not vetted and not is_confusable:
                vetted = vetted_cloze.get(word_no_nik) or vetted_cloze.get(_strip_nikkud(word_no_nik))
            if vetted:
                cloze_source = vetted[0]
            elif example_html:
                cloze_source = example_html

            if cloze_source:
                tokens = cloze_source.split()
                word_stripped = _strip_nikkud(word)
                replaced = False
                if word_stripped:
                    for i, tok in enumerate(tokens):
                        tok_stripped = _strip_nikkud(tok)
                        m = re.match(r'^(.*?)([\.,!?;:"\u0027]*)$', tok_stripped)
                        tok_core = m.group(1) if m else tok_stripped
                        punct_match = re.search(r'[.,!?;:"\u0027]+$', tok)
                        trailing = punct_match.group() if punct_match else ""
                        if tok_core == word_stripped:
                            tokens[i] = "_____" + trailing
                            replaced = True
                            break
                if replaced:
                    cloze_example = " ".join(tokens)
                    pos_cat = _categorize_pos(pos_raw)
                    cloze_hint = meaning_clean
                    if pos_cat == "Verb" and pos_heb:
                        cloze_hint = f"{meaning_clean} ({pos_heb})"

        # Related words grouped by PoS category
        related_html = ""
        if shared_roots:
            related_words = shared_roots.split()
            groups: dict[str, list[str]] = {}
            for rw in related_words:
                cat = word_to_pos_cat.get(_strip_nikkud(rw), "Other")
                groups.setdefault(cat, []).append(rw)
            parts = []
            for cat, words in groups.items():
                if cat == "Other":
                    # No label for uncategorized words — just list them plain
                    parts.append(f'<div class="related-group">{" ".join(words)}</div>')
                else:
                    label = POS_CATEGORY_LABELS.get(cat, cat)
                    parts.append(f'<div class="related-group"><b>{label}:</b> {" ".join(words)}</div>')
            related_html = "\n".join(parts)

        # Image: look up by stripped word (no-nikkud)
        image_tag = ""
        if include_images:
            image_filename = image_cache.get(word_no_nik or _strip_nikkud(word), None)
            if image_filename:
                image_path = images_dir / image_filename
                if image_path.exists():
                    image_tag = image_filename
                    if image_path not in media_files:
                        media_files.append(image_path)

        note = genanki.Note(
            model=VOCAB_MODEL,
            # Stable GUID: uses legacy GUID from Nevo's original deck when
            # available, otherwise deterministic from word + meaning.
            guid=_vocab_guid(word, meaning),
            fields=[
                word,
                root,
                pos_heb,
                meaning_clean,
                word_no_nik,
                related_html or shared_roots,
                tags_str,
                audio_tag,
                example_html,
                freq_display,
                image_tag,
                emoji_str,
                prep_str,
                hint_str,
                noun_plural_lookup.get(word, "") or _noun_plural_stripped.get(word_consonants, ""),
                cloze_example,
                cloze_hint,
            ],
            tags=(tags_str.split() if tags_str else [])
            + [RELEASE_TAG]
            + [f"freq::{freq_display.split()[0]}" if freq_display != "Unlisted" else "freq::Unlisted"],
        )
        deck.add_note(note)

    # Diagnostic: count words with emoji/prep/hint/plural/cloze extracted
    emoji_count = sum(1 for n in deck.notes if n.fields[11])
    prep_count = sum(1 for n in deck.notes if n.fields[12])
    hint_count = sum(1 for n in deck.notes if n.fields[13])
    plural_count = sum(1 for n in deck.notes if n.fields[14])
    cloze_count = sum(1 for n in deck.notes if n.fields[15])
    if emoji_count:
        logger.info(f"  Emoji extracted: {emoji_count} words")
    if prep_count:
        logger.info(f"  Hebrew prepositions extracted: {prep_count} words")
    if hint_count:
        logger.info(f"  Eng→Heb hints: {hint_count} words")
    if plural_count:
        logger.info(f"  Noun plurals on vocab cards: {plural_count} words")
    if cloze_count:
        logger.info(f"  Sentence cloze cards: {cloze_count} words")

    # Diagnostic: count words without PoS coverage in shared_roots
    other_count = 0
    for _, row in df.iterrows():
        sr = str(row.get("shared roots", row.get("SharedRoots", ""))).strip()
        if sr and sr not in ("nan", "None"):
            other_count += sum(1 for rw in sr.split() if word_to_pos_cat.get(_strip_nikkud(rw)) is None)
    unlisted = int((df["_freq_rank"] >= 999_999).sum())
    logger.info(f"  Unlisted words (not in frequency corpus): {unlisted}/{len(df)}")
    logger.info(f"  Related-words without PoS coverage: {other_count} (shown unlabeled)")
    logger.info(f"Vocabulary deck: {len(deck.notes)} notes")
    return deck, media_files


def build_conj_deck(
    conjugations: dict,
    audio_dir: Path = AUDIO_CONJ_DIR,
    include_audio: bool = True,
    dict_csv: Path | None = None,
) -> tuple[genanki.Deck, list[Path]]:
    """Build the conjugation drill deck from conjugations.json data.

    Args:
        conjugations: Mapping of infinitive → verb data. Each usable entry has a
            non-empty ``"forms"`` mapping (form key → dict with ``"form"``,
            ``"pronoun"``, ``"tense"``) and may carry ``"root"``, ``"binyan"``,
            ``"reference_form"``, ``"slug"``, ``"meaning"`` and
            ``"alternate_forms"``.
        audio_dir: Directory used to build the ``{slug}_{form_key}.mp3`` media paths.
        include_audio: When true, attach audio tags (via ``_conj_audio_tag``) and
            collect the matching mp3 paths.
        dict_csv: Optional ``;``-separated vocab CSV used to cross-link meanings
            and same-root related words.

    Returns:
        ``(deck, media_files)`` — the populated deck and the mp3 paths referenced
        by its notes.
    """
    deck = genanki.Deck(CONJ_DECK_ID, "Hebrew Conjugations")
    media_files: list[Path] = []
    note_count = 0

    # Build lookup tables from vocab CSV for cross-linking
    verb_meaning: dict[str, str] = {}  # word_no_nikkud → meaning
    root_words: dict[str, list[str]] = {}  # root → [related words]
    if dict_csv and dict_csv.exists():
        vdf = pd.read_csv(dict_csv, sep=";", index_col=0)
        for _, row in vdf.iterrows():
            word = str(row.get("Word", "")).strip()
            word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
            meaning = str(row.get("Meaning", "")).strip()
            root = str(row.get("Root", "")).strip()
            if root and root not in ("nan", "None", "-"):
                root_words.setdefault(root, []).append(word)
            if meaning and meaning not in ("nan", "None"):
                # Use Word Without Nikkud (ktiv male) for matching
                if word_no_nik and word_no_nik not in ("nan", "None"):
                    verb_meaning[word_no_nik] = meaning
                verb_meaning[_strip_nikkud(word)] = meaning

    for infinitive, data in conjugations.items():
        if not data or not data.get("forms"):
            continue

        root = data.get("root", "")
        binyan = data.get("binyan", "")
        binyan_heb = BINYAN_TO_HEBREW.get(binyan, binyan)
        ref_form = data.get("reference_form", infinitive)
        slug = data.get("slug", "")
        voice = VOICE_MAP.get(binyan, "")

        # Meaning: prefer scraped meaning from pealim page, fall back to CSV cross-link
        meaning = (
            data.get("meaning", "")
            or verb_meaning.get(infinitive, "")
            or verb_meaning.get(_strip_nikkud(infinitive), "")
        )
        related = [w for w in root_words.get(root, []) if w != infinitive]
        related_str = " ".join(related[:8]) if related else ""
        forms = data["forms"]

        # Per-verb values are bound as keyword defaults so the closure captures
        # this iteration's values rather than late-binding to the loop variables.
        def add_note(
            pronoun: str,
            tense: str,
            conj_form: str,
            audio_tag: str,
            *,
            _infinitive: str = infinitive,
            _ref_form: str = ref_form,
            _root: str = root,
            _binyan_heb: str = binyan_heb,
            _voice: str = voice,
            _meaning: str = meaning,
            _related_str: str = related_str,
        ) -> None:
            nonlocal note_count
            # Skip empty forms and forms containing no Hebrew letter at all.
            if not conj_form or not re.search(r"[\u05d0-\u05ea]", conj_form):
                return
            note = genanki.Note(
                model=CONJ_MODEL,
                guid=genanki.guid_for(_infinitive, pronoun, tense),
                fields=[
                    _infinitive,
                    _ref_form,
                    pronoun,
                    tense,
                    conj_form,
                    _root,
                    _binyan_heb,
                    _voice,
                    audio_tag,
                    _meaning,
                    _related_str,
                ],
                tags=[RELEASE_TAG],
            )
            deck.add_note(note)
            note_count += 1

        alternate_forms = data.get("alternate_forms", {})

        # Seeded RNG per verb — deterministic pronoun/gender choices.
        # Seed with the string itself: built-in hash() of a str is salted per
        # process (PYTHONHASHSEED), which would make the picks — and the note
        # GUIDs derived from the chosen pronoun — differ between builds and
        # create duplicates on re-import.
        verb_rng = random.Random(infinitive)

        for form_key, form_data in forms.items():
            primary_form = form_data.get("form", "")
            alt_form = alternate_forms.get(form_key, "")
            conj_form = f"{primary_form} / {alt_form}" if alt_form else primary_form
            # Infinitive: shown on card front as reference — skip as a quiz form
            if form_key == "infinitive":
                continue

            # Audio tag: use downloaded file if present
            audio_tag = ""
            if include_audio and slug:
                audio_tag = _conj_audio_tag(slug, form_key)
                if audio_tag:
                    mp3_path = audio_dir / f"{slug}_{form_key}.mp3"
                    if mp3_path not in media_files:
                        media_files.append(mp3_path)

            # Present tense expansion: 4 form keys → 1 card each (seeded RNG)
            if form_key in PRESENT_EXPANSION:
                chosen = verb_rng.choice(PRESENT_EXPANSION[form_key])
                add_note(chosen[0], chosen[1], conj_form, audio_tag)
                continue

            # Past 3rd plural: same form for m/f → 1 card (seeded RNG)
            if form_key == "past_3p":
                chosen = verb_rng.choice(PAST_3P_EXPANSION)
                add_note(chosen[0], chosen[1], conj_form, audio_tag)
                continue

            # 2fp/3fp future and imperative: show modern (mp) form + classical (fp) in parens
            if form_key in FP_MODERN_FALLBACK:
                mp_key = FP_MODERN_FALLBACK[form_key]
                mp_form = forms.get(mp_key, {}).get("form", "")
                fp_form = conj_form
                display_form = f"{mp_form} ({fp_form})" if mp_form and mp_form != fp_form else fp_form
                pronoun = form_data.get("pronoun", "")
                tense = form_data.get("tense", "")
                add_note(pronoun, tense, display_form, audio_tag)
                continue

            # Standard card
            pronoun = form_data.get("pronoun", "")
            tense = form_data.get("tense", "")

            # 1st-person forms get a randomly assigned gender label (deterministic per verb)
            if form_key in {"past_1s", "past_1p", "future_1s", "future_1p"}:
                gender = verb_rng.choice(["זָכָר", "נְקֵבָה"])
                pronoun = f"{pronoun} ({gender})"

            add_note(pronoun, tense, conj_form, audio_tag)

    logger.info(f"Conjugation deck: {note_count} notes across {sum(1 for v in conjugations.values() if v)} verbs")
    return deck, media_files


# ──────────────────────────────────────────────────────────────────────────────
# Confusables deck — words that look identical without nikkud
# ──────────────────────────────────────────────────────────────────────────────

# Question side: the group of look-alike words plus a fixed Hebrew prompt line.
CONF_FRONT = """
<div class="hebrew" style="font-size:36px;">{{Words}}</div>
<div class="meaning" style="font-size:32px;">מה ההבדל?</div>
"""

# Answer side: repeats the front, then the per-word definitions; the audio
# block is rendered only when the Audio field is non-empty.
CONF_BACK = """
{{FrontSide}}<hr>
<div class="definitions">{{Definitions}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
"""

# Confusable cards reuse the shared card stylesheet.
CONF_CSS = CARD_CSS

# Note type for the confusables deck: a single card template per note.
# The WordNoNikkud field is stored on the note but not referenced by either template.
CONF_MODEL = genanki.Model(
    CONF_MODEL_ID,
    "Hebrew Confusables",
    fields=[
        {"name": "Words"},
        {"name": "Definitions"},
        {"name": "Audio"},
        {"name": "WordNoNikkud"},
    ],
    templates=[
        {
            "name": "Confusable",
            "qfmt": CONF_FRONT,
            "afmt": CONF_BACK,
        },
    ],
    css=CONF_CSS,
)


def build_confusables_deck(
    dict_csv: Path,
    include_audio: bool = True,
) -> tuple[genanki.Deck, list[Path]]:
    """Build confusables deck from vocab CSV — groups words identical without nikkud.

    Rows are grouped by their "Word Without Nikkud" column; every group that
    still holds at least two distinct (word, meaning) pairs becomes one note
    listing all the words and their definitions.

    Args:
        dict_csv: Vocab CSV. Parsed as ``;``-separated first; if that fails or
            yields fewer than three columns it is re-read with the default separator.
        include_audio: When true, attach audio to a note only if every word in
            its group has an audio tag.

    Returns:
        ``(deck, media_files)`` — the populated deck and the mp3 paths referenced
        by its notes.
    """
    logger.info("Building confusables deck …")
    try:
        df = pd.read_csv(dict_csv, sep=";", index_col=0)
        if df.shape[1] < 3:
            raise ValueError("too few columns")
    except (ValueError, pd.errors.ParserError):
        df = pd.read_csv(dict_csv, index_col=0)

    deck = genanki.Deck(CONF_DECK_ID, "Hebrew Confusables")
    media_files: list[Path] = []
    note_count = 0

    # Group by Word Without Nikkud
    groups: dict[str, list[tuple[str, str, str]]] = {}
    for _, row in df.iterrows():
        word = str(row.get("Word", "")).strip()
        meaning = str(row.get("Meaning", "")).strip()
        word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
        pos_raw = str(row.get("Part of speech", row.get("Part of Speech", ""))).strip()
        if not word or not meaning or meaning in ("nan", "None"):
            continue
        if not word_no_nik or word_no_nik in ("nan", "None"):
            continue
        pos_heb = _translate_pos(pos_raw) if pos_raw and pos_raw not in ("nan", "None") else ""
        groups.setdefault(word_no_nik, []).append((word, meaning, pos_heb))

    for word_no_nik, entries in sorted(groups.items()):
        if len(entries) < 2:
            continue

        # Deduplicate: skip entries with identical word+meaning
        seen = set()
        unique_entries = []
        for w, m, p in entries:
            key = (w, m)
            if key not in seen:
                seen.add(key)
                unique_entries.append((w, m, p))
        if len(unique_entries) < 2:
            continue

        # Build card content
        words_display = " / ".join(w for w, _, _ in unique_entries)
        defs_parts = []
        audio_parts: list[str] = []
        group_media: list[Path] = []
        all_have_audio = True
        for w, m, p in unique_entries:
            pos_label = f" ({p})" if p else ""
            defs_parts.append(
                f'<div class="conf-entry"><span class="hebrew" style="font-size:24px;">{w}</span>'
                f" = {m}{pos_label}</div>"
            )
            if include_audio:
                at = _audio_tag(_strip_nikkud(w))
                if not at:
                    # Only a genuinely missing tag disqualifies the group.
                    all_have_audio = False
                elif at not in audio_parts:
                    # A tag already collected for this group (words sharing one
                    # audio file) is simply not repeated — it is not "missing".
                    audio_parts.append(at)
                    mp3_name = at.removeprefix("[sound:").removesuffix("]")
                    group_media.append(AUDIO_DIR / mp3_name)

        # Only include audio if every word in the group has it
        if all_have_audio:
            for mp3_path in group_media:
                if mp3_path not in media_files:
                    media_files.append(mp3_path)
        else:
            # Drop the tags, and do not bundle clips this note will not reference.
            audio_parts = []

        defs_html = "\n".join(defs_parts)
        audio_html = " ".join(audio_parts)

        note = genanki.Note(
            model=CONF_MODEL,
            guid=genanki.guid_for("confusable", word_no_nik),
            fields=[words_display, defs_html, audio_html, word_no_nik],
            tags=[RELEASE_TAG],
        )
        deck.add_note(note)
        note_count += 1

    logger.info(f"Confusables deck: {note_count} notes")
    return deck, media_files


def write_conf_apkg(
    deck: genanki.Deck,
    media_files: list[Path] | None = None,
    out_path: Path = CONF_APKG,
) -> None:
    """Write the confusables deck to ``out_path`` as an .apkg.

    Media paths that do not exist on disk are left out; the font files
    returned by ``_font_media_files()`` are always appended.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    package = genanki.Package(deck)
    present_media: list[str] = []
    for candidate in media_files or []:
        if candidate.exists():
            present_media.append(str(candidate))
    package.media_files = present_media + _font_media_files()
    package.write_to_file(str(out_path))
    logger.info(f"Confusables deck written → {out_path}")


# ──────────────────────────────────────────────────────────────────────────────
# Noun plurals deck — singular↔plural drilling
# ──────────────────────────────────────────────────────────────────────────────

# Card 1 front: singular form (with optional audio), its meaning, and a
# Hebrew direction hint line.
PLURAL_FRONT_SG = """
<div class="hebrew">{{Singular}}</div>
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
<div class="meaning">{{Meaning}}</div>
<div class="hint" style="font-size:28px;">יָחִיד ← רַבִּים</div>
"""

# Card 1 back: repeats the front, then reveals the plural, optional plural
# audio and the mishkal badge when the Mishkal field is non-empty.
PLURAL_BACK_SG = """
{{FrontSide}}<hr>
<div class="hebrew">{{Plural}}</div>
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
{{#Mishkal}}<div class="freq-badge">{{Mishkal}}</div>{{/Mishkal}}
"""

# Card 2 front: plural form (with optional audio) and the reverse direction
# hint; the meaning is not shown on this side.
PLURAL_FRONT_PL = """
<div class="hebrew">{{Plural}}</div>
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
<div class="hint" style="font-size:28px;">רַבִּים ← יָחִיד</div>
"""

# Card 2 back: repeats the front, then reveals the singular, optional
# singular audio, the meaning and the mishkal badge.
PLURAL_BACK_PL = """
{{FrontSide}}<hr>
<div class="hebrew">{{Singular}}</div>
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
<div class="meaning">{{Meaning}}</div>
{{#Mishkal}}<div class="freq-badge">{{Mishkal}}</div>{{/Mishkal}}
"""

# Plural cards reuse the shared card stylesheet.
PLURAL_CSS = CARD_CSS

# Note type for the plurals deck: each note yields two cards, one per direction.
# The Root and Gender fields are stored on the note but not referenced by any template.
PLURAL_MODEL = genanki.Model(
    PLURAL_MODEL_ID,
    "Hebrew Plurals",
    fields=[
        {"name": "Singular"},
        {"name": "SingularAudio"},
        {"name": "Plural"},
        {"name": "PluralAudio"},
        {"name": "Meaning"},
        {"name": "Root"},
        {"name": "Mishkal"},
        {"name": "Gender"},
    ],
    templates=[
        {
            "name": "Singular → Plural",
            "qfmt": PLURAL_FRONT_SG,
            "afmt": PLURAL_BACK_SG,
        },
        {
            "name": "Plural → Singular",
            "qfmt": PLURAL_FRONT_PL,
            "afmt": PLURAL_BACK_PL,
        },
    ],
    css=PLURAL_CSS,
)


def _is_irregular_plural(gender: str, plural: str) -> bool:
    """Return True when the plural suffix does not match the noun's gender.

    A "masculine" noun whose nikkud-stripped plural ends in ות, or a
    "feminine" noun whose stripped plural ends in ים, counts as irregular.
    Any other gender value yields False.
    """
    bare_plural = _strip_nikkud(plural)
    if gender == "masculine":
        return bare_plural.endswith("ות")
    if gender == "feminine":
        return bare_plural.endswith("ים")
    return False


def build_plural_deck(
    noun_plurals_path: Path = DATA_DIR / "noun_plurals.json",
    dict_csv: Path | None = None,
    include_audio: bool = False,
) -> tuple[genanki.Deck, list[Path]]:
    """Build noun plurals deck.

    Selection: ALL irregular plurals plus up to ``per_mishkal`` high-frequency
    exemplars per mishkal pattern (for regular nouns). Cross-references
    frequency, meaning and root from the vocab CSV.

    Args:
        noun_plurals_path: JSON file mapping a word key to a dict with
            ``"singular"``, ``"plural"`` and optionally ``"gender"`` / ``"mishkal"``.
        dict_csv: Optional vocab CSV. Parsed as ``;``-separated first; if that
            fails or yields fewer than three columns it is re-read with the
            default separator.
        include_audio: When true, attach the singular's audio tag (if one
            exists) and collect its mp3 path.

    Returns:
        ``(deck, media_files)`` — the populated deck and the mp3 paths referenced
        by its notes.
    """
    logger.info("Building plurals deck …")

    # Explicit encoding: the data is Hebrew text and must not depend on the
    # platform's locale default.
    with open(noun_plurals_path, encoding="utf-8") as f:
        all_nouns: dict[str, dict] = json.load(f)

    # Load frequency order, meanings and roots from the vocab CSV in one pass.
    freq_order: dict[str, int] = {}
    meanings: dict[str, str] = {}
    roots: dict[str, str] = {}
    if dict_csv and dict_csv.exists():
        try:
            vdf = pd.read_csv(dict_csv, sep=";", index_col=0)
            if vdf.shape[1] < 3:
                raise ValueError
        except (ValueError, pd.errors.ParserError):
            vdf = pd.read_csv(dict_csv, index_col=0)
        for idx, row in vdf.iterrows():
            word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
            if word_no_nik and word_no_nik not in ("nan", "None"):
                # lower index = higher frequency; a later row with the same
                # spelling overwrites an earlier one.
                freq_order[word_no_nik] = idx
            word = str(row.get("Word", "")).strip()
            if word:
                meanings[word] = str(row.get("Meaning", "")).strip()
                roots[word] = str(row.get("Root", "")).strip()

    deck = genanki.Deck(PLURAL_DECK_ID, "Hebrew Plurals")
    media_files: list[Path] = []

    # Separate irregular plurals from regular (by mishkal)
    irregulars: list[tuple[str, dict]] = []
    by_mishkal: dict[str, list[tuple[str, dict]]] = {}

    for word_key, data in all_nouns.items():
        singular = data.get("singular", "")
        plural = data.get("plural", "")
        gender = data.get("gender", "")
        mishkal = data.get("mishkal", "")
        if not singular or not plural:
            continue

        if _is_irregular_plural(gender, plural):
            irregulars.append((word_key, data))
        elif mishkal:
            by_mishkal.setdefault(mishkal, []).append((word_key, data))

    # Select exemplars per mishkal, preferring high-frequency words.
    # Target ≥2:1 regular:irregular ratio — 6 per mishkal compensates for
    # small groups (<6 entries) that can't fill their quota.
    per_mishkal = 6

    selected: list[tuple[str, dict]] = list(irregulars)
    for _mishkal, entries in sorted(by_mishkal.items()):
        # Sort by frequency (lower index = more common)
        entries.sort(key=lambda e: freq_order.get(e[0], 999999))
        selected.extend(entries[:per_mishkal])

    note_count = 0
    for _word_key, data in selected:
        singular = data["singular"]
        plural = data["plural"]
        gender = data.get("gender", "")
        mishkal = data.get("mishkal", "")

        meaning = meanings.get(singular, "")
        if not meaning or meaning in ("nan", "None"):
            # Try without nikkud
            meaning = meanings.get(_strip_nikkud(singular), "")
        if meaning in ("nan", "None"):
            # Never put a stringified missing value on the card.
            meaning = ""
        root = roots.get(singular, "")
        if not root or root in ("nan", "None", "-"):
            root = ""

        # Audio tags. Only the singular is looked up here; pl_audio stays empty.
        sg_audio = ""
        pl_audio = ""
        if include_audio:
            # Use local audio files if available
            sg_no_nik = _strip_nikkud(singular)
            sg_tag = _audio_tag(sg_no_nik)
            if sg_tag:
                sg_audio = sg_tag
                mp3_path = AUDIO_DIR / sg_tag.removeprefix("[sound:").removesuffix("]")
                if mp3_path not in media_files:
                    media_files.append(mp3_path)

        tags = [RELEASE_TAG]
        if mishkal:
            tags.append(f"mishkal::{mishkal}")
        if _is_irregular_plural(gender, plural):
            tags.append("irregular")

        note = genanki.Note(
            model=PLURAL_MODEL,
            guid=genanki.guid_for("plural", singular),
            fields=[
                singular,
                sg_audio,
                plural,
                pl_audio,
                meaning,
                root,
                mishkal,
                gender,
            ],
            tags=tags,
        )
        deck.add_note(note)
        note_count += 1

    irregular_count = len(irregulars)
    regular_count = note_count - irregular_count
    logger.info(
        f"Plurals deck: {note_count} notes "
        f"({irregular_count} irregular + {regular_count} regular exemplars "
        f"from {len(by_mishkal)} mishkal patterns)"
    )
    return deck, media_files


def write_plural_apkg(
    deck: genanki.Deck,
    media_files: list[Path] | None = None,
    out_path: Path = PLURAL_APKG,
) -> None:
    """Write the plurals deck to ``out_path`` as an .apkg.

    Media paths that do not exist on disk are left out; the font files
    returned by ``_font_media_files()`` are always appended.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    package = genanki.Package(deck)
    present_media: list[str] = []
    for candidate in media_files or []:
        if candidate.exists():
            present_media.append(str(candidate))
    package.media_files = present_media + _font_media_files()
    package.write_to_file(str(out_path))
    logger.info(f"Plurals deck written → {out_path}")


def _font_media_files() -> list[str]:
    """Collect the existing ``_Heebo*.ttf`` files under FONTS_DIR as strings for .apkg bundling."""
    bundled: list[str] = []
    for font_file in list(FONTS_DIR.glob("_Heebo*.ttf")):
        if font_file.exists():
            bundled.append(str(font_file))
    return bundled


class _RandomOrderPackage(genanki.Package):
    """genanki.Package variant whose deck configs use new-card order 0 (random) instead of 1 (insertion order)."""

    def write_to_db(self, cursor, timestamp, id_gen):
        """Write the package as usual, then rewrite ``col.dconf`` with ``new.order`` forced to 0."""
        super().write_to_db(cursor, timestamp, id_gen)
        fetched = cursor.execute("SELECT dconf FROM col").fetchone()
        if not fetched:
            return
        deck_configs = json.loads(fetched[0])
        for config in deck_configs.values():
            # Only touch entries that are dicts carrying a "new" section.
            if not isinstance(config, dict) or "new" not in config:
                continue
            config["new"]["order"] = 0
        cursor.execute("UPDATE col SET dconf = ?", [json.dumps(deck_configs)])


def write_vocab_apkg(
    deck: genanki.Deck,
    media_files: list[Path],
    out_path: Path = VOCAB_APKG,
) -> None:
    """Write the vocabulary deck to ``out_path`` as an .apkg.

    Uses the plain ``genanki.Package`` so new cards keep insertion order.
    Media paths that do not exist on disk are left out; font files are appended.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    # insertion order = frequency rank (new.order=1 default)
    package = genanki.Package(deck)
    present_media: list[str] = []
    for candidate in media_files:
        if candidate.exists():
            present_media.append(str(candidate))
    package.media_files = present_media + _font_media_files()
    package.write_to_file(str(out_path))
    logger.info(f"Vocabulary deck written → {out_path}")


def write_conj_apkg(
    deck: genanki.Deck,
    media_files: list[Path] | None = None,
    out_path: Path = CONJ_APKG,
) -> None:
    """Write the conjugation deck to ``out_path`` as an .apkg.

    Packaged through ``_RandomOrderPackage``. Media paths that do not exist
    on disk are left out; font files are appended.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    package = _RandomOrderPackage(deck)
    present_media: list[str] = []
    for candidate in media_files or []:
        if candidate.exists():
            present_media.append(str(candidate))
    package.media_files = present_media + _font_media_files()
    package.write_to_file(str(out_path))
    logger.info(f"Conjugation deck written → {out_path}")


def build_complete_deck(
    dict_csv: Path,
    conjugations: dict,
    examples_cache: dict | None = None,
    freq_cache: dict | None = None,
    image_cache: dict | None = None,
    emoji_lookup: dict | None = None,
    limit: int | None = None,
    include_audio: bool = False,
) -> tuple[list[genanki.Deck], list[Path]]:
    """Assemble every subdeck of the combined 'Hebrew::*' package.

    The standalone builders are run first and their notes are re-added to
    freshly created decks carrying the ``COMPLETE_*`` IDs and ``Hebrew::``
    names. The plurals subdeck is included only when
    ``DATA_DIR / "noun_plurals.json"`` exists.

    Returns (list_of_decks, deduplicated_media_files).
    """
    logger.info(f"  Building complete deck (audio={'yes' if include_audio else 'no'}) …")

    def _as_subdeck(deck_id: int, deck_name: str, source: genanki.Deck) -> genanki.Deck:
        # Re-add the source deck's notes to a new deck with the subdeck ID/name.
        subdeck = genanki.Deck(deck_id, deck_name)
        for source_note in source.notes:
            subdeck.add_note(source_note)
        return subdeck

    # Run the existing standalone builders
    vocab_deck, vocab_media = build_vocab_deck(
        dict_csv,
        examples_cache=examples_cache,
        freq_cache=freq_cache,
        image_cache=image_cache or {},
        emoji_lookup=emoji_lookup,
        limit=limit,
        include_audio=include_audio,
        include_images=True,
    )
    conj_deck, conj_media = build_conj_deck(
        conjugations,
        include_audio=include_audio,
        dict_csv=dict_csv,
    )
    conf_deck, conf_media = build_confusables_deck(
        dict_csv,
        include_audio=include_audio,
    )

    # Subdecks use different IDs from the standalone decks
    complete_vocab = _as_subdeck(COMPLETE_VOCAB_DECK_ID, "Hebrew::Vocabulary", vocab_deck)
    complete_conj = _as_subdeck(COMPLETE_CONJ_DECK_ID, "Hebrew::Conjugations", conj_deck)
    complete_conf = _as_subdeck(COMPLETE_CONF_DECK_ID, "Hebrew::Confusables", conf_deck)

    collected_media = [*vocab_media, *conj_media, *conf_media]

    # Plurals subdeck (only if data exists)
    complete_plural = None
    plural_data_path = DATA_DIR / "noun_plurals.json"
    if plural_data_path.exists():
        plural_deck, plural_media = build_plural_deck(
            noun_plurals_path=plural_data_path,
            dict_csv=dict_csv,
            include_audio=include_audio,
        )
        complete_plural = _as_subdeck(COMPLETE_PLURAL_DECK_ID, "Hebrew::Plurals", plural_deck)
        collected_media.extend(plural_media)

    # Deduplicate media files by resolved path (first occurrence wins)
    seen_paths: set[str] = set()
    all_media: list[Path] = []
    for media_path in collected_media:
        if media_path.exists():
            dedup_key = str(media_path.resolve())
        else:
            dedup_key = str(media_path)
        if dedup_key in seen_paths:
            continue
        seen_paths.add(dedup_key)
        all_media.append(media_path)

    decks = [complete_vocab, complete_conj, complete_conf]
    if complete_plural:
        decks.append(complete_plural)
        plural_info = f" + {len(complete_plural.notes)} plural"
    else:
        plural_info = ""

    logger.info(
        f"  Complete deck: {len(complete_vocab.notes)} vocab + "
        f"{len(complete_conj.notes)} conj + {len(complete_conf.notes)} conf{plural_info} notes, "
        f"{len(all_media)} media files"
    )
    return decks, all_media


def write_complete_apkg(
    decks: list[genanki.Deck],
    media_files: list[Path],
    out_path: Path = COMPLETE_APKG,
) -> None:
    """Write all given subdecks into one combined .apkg at ``out_path``.

    Media paths that do not exist on disk are left out; font files are appended.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    package = genanki.Package(decks)
    present_media: list[str] = []
    for candidate in media_files:
        if candidate.exists():
            present_media.append(str(candidate))
    package.media_files = present_media + _font_media_files()
    package.write_to_file(str(out_path))
    logger.info(f"Complete deck written → {out_path}")


def build_all_variants(
    dict_csv: Path,
    conjugations: dict,
    examples_cache: dict | None = None,
    freq_cache: dict | None = None,
    image_cache: dict | None = None,
    limit: int | None = None,
) -> None:
    """Build every release variant and hand each one to its writer.

    Variants are produced in this order:

    * vocabulary: 4 packages (audio on/off x images on/off)
    * conjugations: 2 packages (audio on/off)
    * confusables: 2 packages (audio on/off)
    * noun plurals: 2 packages (audio on/off), skipped when
      ``data/noun_plurals.json`` does not exist
    * combined "Hebrew::*" package: 2 packages (audio on/off)

    Args:
        dict_csv: Dictionary CSV passed to every deck builder.
        conjugations: Conjugation data passed to ``build_conj_deck`` and
            ``build_complete_deck``.
        examples_cache: Forwarded to the vocab and complete builders.
        freq_cache: Forwarded to the vocab and complete builders.
        image_cache: Forwarded unchanged to the complete builder; the vocab
            builder receives an empty dict instead when this is falsy.
        limit: Forwarded to the vocab and complete builders.
    """
    yes_no = {True: "yes", False: "no"}

    logger.info("Building all release variants …")

    emoji_lookup = _load_emoji_lookup()
    logger.info(f"  Emoji lookup: {len(emoji_lookup)} keywords loaded")

    # Keyword arguments common to the vocab builder and the complete builder.
    shared_kwargs = {
        "examples_cache": examples_cache,
        "freq_cache": freq_cache,
        "emoji_lookup": emoji_lookup,
        "limit": limit,
    }

    # --- Vocabulary -------------------------------------------------------
    for with_audio, with_images, target in (
        (False, False, VOCAB_APKG),
        (True, False, VOCAB_APKG_AUDIO),
        (False, True, VOCAB_APKG_IMAGES),
        (True, True, VOCAB_APKG_AUDIO_IMAGES),
    ):
        label = f"audio={yes_no[with_audio]} images={yes_no[with_images]}"
        logger.info(f"  Vocab variant: {label} → {target.name}")
        vocab_deck, vocab_media = build_vocab_deck(
            dict_csv,
            image_cache=image_cache or {},
            include_audio=with_audio,
            include_images=with_images,
            **shared_kwargs,
        )
        write_vocab_apkg(vocab_deck, vocab_media, out_path=target)

    # --- Conjugations -----------------------------------------------------
    for with_audio, target in ((False, CONJ_APKG), (True, CONJ_APKG_AUDIO)):
        label = f"audio={yes_no[with_audio]}"
        logger.info(f"  Conj variant: {label} → {target.name}")
        conj_deck, conj_media = build_conj_deck(
            conjugations, include_audio=with_audio, dict_csv=dict_csv
        )
        write_conj_apkg(conj_deck, conj_media, out_path=target)

    # --- Confusables ------------------------------------------------------
    for with_audio, target in ((False, CONF_APKG), (True, CONF_APKG_AUDIO)):
        label = f"audio={yes_no[with_audio]}"
        logger.info(f"  Conf variant: {label} → {target.name}")
        conf_deck, conf_media = build_confusables_deck(dict_csv, include_audio=with_audio)
        write_conf_apkg(conf_deck, conf_media, out_path=target)

    # --- Noun plurals (only when the data file is present) ----------------
    plural_source = DATA_DIR / "noun_plurals.json"
    if not plural_source.exists():
        logger.info("  Skipping plural deck (data/noun_plurals.json not found)")
    else:
        for with_audio, target in ((False, PLURAL_APKG), (True, PLURAL_APKG_AUDIO)):
            label = f"audio={yes_no[with_audio]}"
            logger.info(f"  Plural variant: {label} → {target.name}")
            plural_deck, plural_media = build_plural_deck(
                noun_plurals_path=plural_source,
                dict_csv=dict_csv,
                include_audio=with_audio,
            )
            write_plural_apkg(plural_deck, plural_media, out_path=target)

    # --- Combined "Hebrew::*" packages ------------------------------------
    for with_audio, target in ((False, COMPLETE_APKG), (True, COMPLETE_APKG_AUDIO)):
        subdecks, combined_media = build_complete_deck(
            dict_csv,
            conjugations=conjugations,
            image_cache=image_cache,
            include_audio=with_audio,
            **shared_kwargs,
        )
        write_complete_apkg(subdecks, combined_media, out_path=target)

    logger.info("All variants built.")


if __name__ == "__main__":
    # Quick manual smoke build: a 20-word vocab deck plus, when conjugation
    # data is available, the conjugation deck.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")

    # Use the first dictionary CSV that exists, in order of preference. When
    # none exists, fall through to the last candidate so the builder reports
    # the missing file itself.
    csv_candidates = [
        DATA_DIR / "hebrew_dict_for_anki.csv",
        DATA_DIR / "hebrew_dict.csv",
        DATA_DIR / "pealim_dict_for_anki.csv",
        DATA_DIR / "pealim_dict.csv",
    ]
    csv_path = next((p for p in csv_candidates if p.exists()), csv_candidates[-1])

    deck, media = build_vocab_deck(csv_path, limit=20)
    write_vocab_apkg(deck, media)

    conj_path = DATA_DIR / "conjugations.json"
    if conj_path.exists():
        # Explicit UTF-8: the data holds Hebrew text and must not depend on
        # the platform's default locale encoding.
        with open(conj_path, encoding="utf-8") as f:
            conjugations = json.load(f)
        # Reuse the CSV resolved above instead of re-hardcoding the first
        # candidate, which may not exist when a fallback file was selected.
        conj_deck, conj_media = build_conj_deck(conjugations, dict_csv=csv_path)
        write_conj_apkg(conj_deck, conj_media)