hebrew_flash_cards/apkg_builder.py

#!/usr/bin/env python3
"""
Build Anki .apkg files for the Hebrew flash-card decks declared below: vocabulary,
conjugations, confusables, plurals, and the combined "Hebrew::*" package.
Uses genanki for reliable, stable deck generation.

Deck IDs are hardcoded integers — same ID on re-import updates the existing deck
in Anki rather than creating a duplicate.
"""

import json
import logging
import random
import re
from pathlib import Path

import genanki

logger = logging.getLogger(__name__)

# Stable deck/model IDs — do not change these
VOCAB_DECK_ID = 1_234_567_890
VOCAB_MODEL_ID = 1_701_222_017_968  # matches Nevo's original Anki model
CONJ_DECK_ID = 1_234_567_892
CONJ_MODEL_ID = 1_234_567_893
CONF_DECK_ID = 1_234_567_894
CONF_MODEL_ID = 1_234_567_895
PLURAL_DECK_ID = 1_234_567_896
PLURAL_MODEL_ID = 1_234_567_897

# Subdeck IDs for combined "Hebrew::*" package — MUST differ from standalone IDs
COMPLETE_VOCAB_DECK_ID = 1_234_567_900
COMPLETE_CONJ_DECK_ID = 1_234_567_901
COMPLETE_CONF_DECK_ID = 1_234_567_902
COMPLETE_PLURAL_DECK_ID = 1_234_567_903

# Release version tag added to all notes so users can identify which release
# their cards come from (visible in Anki's Browse view and card info).
RELEASE_TAG = "v0.18"

# Regex for extracting emoji and Hebrew prepositions from meaning strings
# EMOJI_RE matches one or more consecutive code points in U+1F000–U+1FFFF,
# U+2600–U+27FF, U+2300–U+23FF or U+FE00–U+FE0F.
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
# HBPAREN_RE matches a parenthesised run of code points in U+05B0–U+05EA or
# U+05F0–U+05F4; group 1 captures the text between the parentheses.
HBPAREN_RE = re.compile(r"\(([\u05b0-\u05ea\u05f0-\u05f4]+)\)")

DATA_DIR = Path(__file__).parent / "data"

AUDIO_DIR = DATA_DIR / "audio"
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
OUTPUT_DIR = Path(__file__).parent / "output"

VOCAB_APKG = OUTPUT_DIR / "hebrew_vocabulary.apkg"
VOCAB_APKG_AUDIO = OUTPUT_DIR / "hebrew_vocabulary_audio.apkg"
VOCAB_APKG_IMAGES = OUTPUT_DIR / "hebrew_vocabulary_images.apkg"
VOCAB_APKG_AUDIO_IMAGES = OUTPUT_DIR / "hebrew_vocabulary_audio_images.apkg"
CONJ_APKG = OUTPUT_DIR / "hebrew_conjugations.apkg"
CONJ_APKG_AUDIO = OUTPUT_DIR / "hebrew_conjugations_audio.apkg"
CONF_APKG = OUTPUT_DIR / "hebrew_confusables.apkg"
CONF_APKG_AUDIO = OUTPUT_DIR / "hebrew_confusables_audio.apkg"
PLURAL_APKG = OUTPUT_DIR / "hebrew_plurals.apkg"
PLURAL_APKG_AUDIO = OUTPUT_DIR / "hebrew_plurals_audio.apkg"
COMPLETE_APKG = OUTPUT_DIR / "hebrew_complete.apkg"
COMPLETE_APKG_AUDIO = OUTPUT_DIR / "hebrew_complete_audio.apkg"

# ──────────────────────────────────────────────────────────────────────────────
# Binyan → Hebrew label mapping (for conjugation card display)
# ──────────────────────────────────────────────────────────────────────────────

BINYAN_TO_HEBREW: dict[str, str] = {
    "Pa'al": "פָּעַל",
    "Nif'al": "נִפְעַל",
    "Pi'el": "פִּעֵל",
    "Pu'al": "פֻּעַל",
    "Hitpa'el": "הִתְפַּעֵל",
    "Hif'il": "הִפְעִיל",
    "Huf'al": "הֻפְעַל",
}

# ──────────────────────────────────────────────────────────────────────────────
# PoS → Hebrew label mapping
# ──────────────────────────────────────────────────────────────────────────────

POS_TO_HEBREW = {
    "Noun": "שם עצם",
    "Verb": "פועל",
    "Adjective": "שם תואר",
    "Adverb": "תואר הפועל",
    "Preposition": "מילת יחס",
    "Conjunction": "מילת חיבור",
    "Pronoun": "כינוי גוף",
    "Particle": "מילית",
}

# PoS category groupings for related-words display
POS_CATEGORY_LABELS = {
    "Verb": "פעלים",
    "Noun": "שמות עצם",
    "Adjective": "שמות תואר",
    "Adverb": "תוארי הפועל",
}

# ──────────────────────────────────────────────────────────────────────────────
# Shared CSS
# ──────────────────────────────────────────────────────────────────────────────

FONTS_DIR = DATA_DIR / "fonts"

CARD_CSS = """
@font-face {
  font-family: 'Heebo';
  src: url('_Heebo-Regular.ttf');
  font-weight: normal;
}
@font-face {
  font-family: 'Heebo';
  src: url('_Heebo-Bold.ttf');
  font-weight: bold;
}
.card {
  font-family: 'Heebo', 'Arial Hebrew', 'David', Arial, sans-serif;
  font-size: 20px;
  text-align: right;
  color: #222;
  background: #fff;
  padding: 16px;
  max-width: 600px;
  margin: 0 auto;
}
.hebrew {
  font-size: 42px;
  font-weight: bold;
  direction: rtl;
  text-align: center;
  line-height: 1.5;
  color: #222;
}
.hebrew-sm {
  font-size: 30px;
  font-weight: normal;
  direction: rtl;
  text-align: center;
  color: #222;
}
.meaning {
  font-size: 34px;
  color: #1a1a8c;
  margin: 8px 0;
  text-align: center;
}
.hint {
  font-size: 22px;
  color: #555;
  margin: 4px 0;
  direction: rtl;
  text-align: center;
}
.example {
  font-size: 24px;
  color: #222;
  direction: rtl;
  text-align: right;
  font-style: italic;
  margin: 10px auto 0;
  max-width: 90%;
  border-right: 3px solid #aaa;
  padding-right: 8px;
}
.divider { border-top: 1px solid #ddd; margin: 10px 0; }
.freq-badge {
  display: inline-block;
  font-size: 11px;
  color: #aaa;
  background: transparent;
  border: 1px solid #eee;
  border-radius: 10px;
  padding: 2px 8px;
  margin-top: 4px;
}
.voice-label {
  font-size: 0.6em;
  font-weight: normal;
  color: #555;
}
.sec-table {
  display: table;
  margin: 6px auto 0;
  direction: rtl;
  border-collapse: collapse;
}
.sec-label {
  display: table-row;
  font-size: 28px;
  font-weight: normal;
  color: #222;
  direction: rtl;
}
.sec-key {
  display: table-cell;
  font-size: 28px;
  color: #222;
  font-weight: bold;
  text-align: right;
  padding: 2px 0 2px 8px;
  white-space: nowrap;
}
.sec-val {
  display: table-cell;
  font-size: 28px;
  color: #222;
  text-align: right;
  padding: 2px 0;
}
.definitions {
  direction: rtl;
  text-align: center;
}
.more-toggle {
  text-align: center;
  direction: rtl;
  margin-top: 8px;
}
.more-header {
  display: inline-block;
  font-size: 18px;
  color: #555;
  cursor: pointer;
  list-style: none;
  border: 1px solid #ccc;
  border-radius: 16px;
  padding: 4px 16px;
  margin: 4px 0;
  background: #f8f8f8;
}
.more-header::-webkit-details-marker { display: none; }
.more-header::before { content: "○ "; font-size: 14px; }
details[open] > .more-header::before { content: "● "; }
.related-header {
  font-size: 22px;
  color: #555;
  text-align: center;
  margin: 4px 0;
}
.rw-word {
  display: table-cell;
  font-size: 28px;
  color: #222;
  font-weight: normal;
  text-align: right;
  padding: 2px 0 2px 8px;
  white-space: nowrap;
}
.rw-meaning {
  display: table-cell;
  font-size: 24px;
  color: #555;
  text-align: left;
  direction: ltr;
  padding: 2px 0;
}
.conf-entry {
  margin: 8px 0;
  font-size: 28px;
  direction: rtl;
}
.emoji-img {
  font-size: 3.5em;
  text-align: center;
  margin: 0.3em 0;
}
.card [type="button"], .card button, .replay-button {
  display: block !important;
  margin: 4px auto !important;
  text-align: center;
}
@media (prefers-color-scheme: dark) {
  .card        { color: #e8e8e8; background: #1c1c1e; }
  .hebrew      { color: #f0f0f0; }
  .hebrew-sm   { color: #e0e0e0; }
  .meaning     { color: #82b0ff; }
  .sec-label   { color: #e0e0e0; }
  .sec-key     { color: #e0e0e0; }
  .sec-val     { color: #e0e0e0; }
  .conf-entry  { color: #ddd; }
  .hint        { color: #777; }
  .voice-label { color: #888; }
  .example     { color: #e0e0e0; border-right-color: #555; }
  .divider     { border-top-color: #333; }
  .freq-badge  { color: #888; border-color: #444; }
  .more-header { color: #bbb; background: #2a2a2e; border-color: #555; }
  .related-header { color: #999; }
  .rw-word     { color: #e0e0e0; }
  .rw-meaning  { color: #999; }
}
"""

# ──────────────────────────────────────────────────────────────────────────────
# Vocabulary Deck
# ──────────────────────────────────────────────────────────────────────────────

VOCAB_FRONT_HEB = """
<div class="hebrew">{{Word}}{{#Prep}} <span class="hebrew-sm">{{Prep}}</span>{{/Prep}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
"""

VOCAB_BACK_HEB = """
{{FrontSide}}
<div class="divider"></div>
<div class="meaning">{{Meaning}}</div>
{{#Emoji}}<div class="emoji-img">{{Emoji}}</div>{{/Emoji}}
{{^Emoji}}{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;" onerror="this.parentElement.style.display='none'"></div>{{/Image}}{{/Emoji}}
<details class="more-toggle"><summary class="more-header">מידע נוסף</summary>
<div class="sec-table">
{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span><span class="sec-val">{{WordNoNikkud}}</span></div>{{/WordNoNikkud}}
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">{{Root}}</span></div>{{/Root}}
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span><span class="sec-val">{{PoS}}{{#Gender}}, {{Gender}}{{/Gender}}</span></div>{{/PoS}}
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span><span class="sec-val">{{Plural}}</span></div>{{/Plural}}
</div>
{{#SharedRoots}}
<div class="divider" style="margin:6px 0;"></div>
<div class="related-header" style="cursor:default;">מִילִים קְשׁוּרוֹת</div>
<div class="sec-table">{{SharedRoots}}</div>
{{/SharedRoots}}
</details>
"""

VOCAB_FRONT_ENG = """
<div class="meaning">{{Meaning}}</div>
{{#Hint}}<div class="hint">{{Hint}}</div>{{/Hint}}
{{#Emoji}}<div class="emoji-img">{{Emoji}}</div>{{/Emoji}}
{{^Emoji}}{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;" onerror="this.parentElement.style.display='none'"></div>{{/Image}}{{/Emoji}}
"""

VOCAB_BACK_ENG = """
{{FrontSide}}
<div class="divider"></div>
<div class="hebrew">{{Word}}{{#Prep}} <span class="hebrew-sm">{{Prep}}</span>{{/Prep}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
<details class="more-toggle"><summary class="more-header">מידע נוסף</summary>
<div class="sec-table">
{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span><span class="sec-val">{{WordNoNikkud}}</span></div>{{/WordNoNikkud}}
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">{{Root}}</span></div>{{/Root}}
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span><span class="sec-val">{{PoS}}{{#Gender}}, {{Gender}}{{/Gender}}</span></div>{{/PoS}}
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span><span class="sec-val">{{Plural}}</span></div>{{/Plural}}
</div>
{{#SharedRoots}}
<div class="divider" style="margin:6px 0;"></div>
<div class="related-header" style="cursor:default;">מִילִים קְשׁוּרוֹת</div>
<div class="sec-table">{{SharedRoots}}</div>
{{/SharedRoots}}
</details>
"""

VOCAB_FRONT_CLOZE = """
<div class="example" style="font-size:32px;font-style:normal;border:none;padding:0;text-align:center;">{{ClozeExample}}</div>
{{#ClozeHint}}<div class="hint">{{ClozeHint}}</div>{{/ClozeHint}}
"""

VOCAB_BACK_CLOZE = """
{{FrontSide}}
<div class="divider"></div>
<div class="hebrew">{{Word}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
"""

VOCAB_MODEL = genanki.Model(
    VOCAB_MODEL_ID,
    "Hebrew Flash Cards",
    fields=[
        {"name": "Word"},
        {"name": "Root"},
        {"name": "PoS"},
        {"name": "Meaning"},
        {"name": "WordNoNikkud"},
        {"name": "SharedRoots"},
        {"name": "Tags"},
        {"name": "Audio"},
        {"name": "Example"},
        {"name": "Frequency"},
        {"name": "Image"},
        {"name": "Emoji"},
        {"name": "Prep"},
        {"name": "Hint"},
        {"name": "Plural"},
        {"name": "Gender"},
        {"name": "ClozeExample"},
        {"name": "ClozeHint"},
    ],
    templates=[
        {
            # ord 0 — matches Nevo's original "Card 2" (Eng→Heb)
            "name": "English → Hebrew",
            "qfmt": VOCAB_FRONT_ENG,
            "afmt": VOCAB_BACK_ENG,
        },
        {
            # ord 1 — matches Nevo's original "Card 3" (Heb→Eng)
            "name": "Hebrew → English",
            "qfmt": VOCAB_FRONT_HEB,
            "afmt": VOCAB_BACK_HEB,
        },
        {
            # ord 2 — Cloze-style sentence fill-in (only generated when ClozeExample is non-empty)
            "name": "Sentence Cloze",
            "qfmt": VOCAB_FRONT_CLOZE,
            "afmt": VOCAB_BACK_CLOZE,
        },
    ],
    css=CARD_CSS,
)

# ──────────────────────────────────────────────────────────────────────────────
# Conjugation Deck
# ──────────────────────────────────────────────────────────────────────────────

CONJ_FRONT = """
<div class="hint">אֵיךְ אוֹמְרִים</div>
<div class="hebrew">{{Pronoun}}</div>
<div class="hebrew" style="color:#1a1a8c;">{{Infinitive}}{{#Prep}} <span class="hebrew-sm">({{Prep}})</span>{{/Prep}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
<div class="hebrew">{{Tense}}</div>
"""

CONJ_BACK = """
{{FrontSide}}<hr>
<div class="hebrew">{{ConjugatedForm}}{{#Prep}} ({{Prep}}){{/Prep}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
<details class="more-toggle"><summary class="more-header">מידע נוסף</summary>
{{#Meaning}}<div class="sec-label" style="text-align:center;display:block;">{{Meaning}}</div>{{/Meaning}}
<div class="sec-table">
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">{{Root}}</span></div>
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span><span class="sec-val">{{Binyan}}</span></div>
</div>
{{#RelatedVocab}}
<div class="divider" style="margin:6px 0;"></div>
<div class="related-header" style="cursor:default;">מִילִים קְשׁוּרוֹת</div>
<div class="sec-table">{{RelatedVocab}}</div>
{{/RelatedVocab}}
</details>
"""

CONJ_CSS = CARD_CSS

CONJ_MODEL = genanki.Model(
    CONJ_MODEL_ID,
    "Hebrew Conjugation",
    fields=[
        {"name": "Infinitive"},
        {"name": "ReferenceForm"},
        {"name": "Pronoun"},
        {"name": "Tense"},
        {"name": "ConjugatedForm"},
        {"name": "Root"},
        {"name": "Binyan"},
        {"name": "Voice"},
        {"name": "Audio"},
        {"name": "Meaning"},
        {"name": "RelatedVocab"},
        {"name": "Prep"},
    ],
    templates=[
        {
            "name": "Conjugation Drill",
            "qfmt": CONJ_FRONT,
            "afmt": CONJ_BACK,
        }
    ],
    css=CONJ_CSS,
)

# Present-tense expansion: each form key → list of (pronoun, tense_label)
PRESENT_EXPANSION = {
    "present_ms": [
        ("אֲנִי (זָכָר)", "הוֹוֶה"),
        ("אַתָּה", "הוֹוֶה"),
        ("הוּא", "הוֹוֶה"),
    ],
    "present_fs": [
        ("אֲנִי (נְקֵבָה)", "הוֹוֶה"),
        ("אַתְּ", "הוֹוֶה"),
        ("הִיא", "הוֹוֶה"),
    ],
    "present_mp": [
        ("אֲנַחְנוּ (זָכָר)", "הוֹוֶה"),
        ("אַתֶּם", "הוֹוֶה"),
        ("הֵם", "הוֹוֶה"),
    ],
    "present_fp": [
        ("אֲנַחְנוּ (נְקֵבָה)", "הוֹוֶה"),
        ("אַתֶּן", "הוֹוֶה"),
        ("הֵן", "הוֹוֶה"),
    ],
}

# Modern Hebrew: 2fp/3fp future and imperative default to mp form (classical in parens)
FP_MODERN_FALLBACK = {
    "future_2fp": "future_2mp",
    "future_3fp": "future_3mp",
    "imperative_fp": "imperative_mp",
}

# 3rd person plural past: same form for m/f — generate two separate pronoun cards
PAST_3P_EXPANSION = [
    ("הֵם", "עָבָר"),
    ("הֵן", "עָבָר"),
]

# Tense labels with "בְּ" prefix for display on cards.
# Both spellings of the imperative label that TENSE_KEY_MAP accepts are listed,
# so a lookup succeeds whichever spelling the data uses.
TENSE_WITH_BE = {
    "עָבָר": "בֶּעָבָר",
    "הוֹוֶה": "בַּהוֹוֶה",
    "עָתִיד": "בֶּעָתִיד",
    "צִוּוּי": "בַּצִּוּוּי",
    "צִיּוּוּי": "בַּצִּוּוּי",  # alternate spelling
}

# Voice field: passive label only (shown inline on card front for Pu'al/Huf'al)
VOICE_MAP = {
    "Pu'al": "סָבִיל",
    "Huf'al": "סָבִיל",
}

# Tense Hebrew label → English key prefix (for form_key construction)
TENSE_KEY_MAP = {
    "עָבָר": "past",
    "הוֹוֶה": "present",
    "עָתִיד": "future",
    "צִוּוּי": "imperative",
    "צִיּוּוּי": "imperative",  # alternate spelling
}


# ──────────────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────────────


def _load_words() -> dict[str, dict]:
    """Read ``words.json`` from DATA_DIR (as UTF-8) and return the parsed JSON."""
    words_path = DATA_DIR / "words.json"
    return json.loads(words_path.read_text(encoding="utf-8"))


def _audio_tag(word_no_nikkud: str, audio_dir: Path = AUDIO_DIR, slug: str = "") -> str:
    """Return an Anki ``[sound:<file>.mp3]`` tag for the first audio file found.

    Candidates are probed inside ``audio_dir`` in this order:

    1. ``<slug>.mp3`` — only when ``slug`` is non-empty.
    2. ``<letters>.mp3`` — where ``letters`` is ``word_no_nikkud`` reduced to the
       characters in U+05D0–U+05EA; skipped when nothing is left.

    Returns an empty string when no candidate exists on disk.
    """

    def _candidate_stems():
        # Lazy on purpose: the word is only examined when the slug file is absent.
        if slug:
            yield slug
        letters_only = "".join(ch for ch in word_no_nikkud if "\u05d0" <= ch <= "\u05ea")
        if letters_only:
            yield letters_only

    for stem in _candidate_stems():
        candidate = audio_dir / f"{stem}.mp3"
        if candidate.exists():
            return f"[sound:{candidate.name}]"
    return ""


def _conj_audio_tag(slug: str, form_key: str) -> str:
    """Return ``[sound:<slug>_<form_key>.mp3]`` when that file exists in AUDIO_CONJ_DIR.

    Returns an empty string when the file is not on disk.
    """
    mp3_name = f"{slug}_{form_key}.mp3"
    return f"[sound:{mp3_name}]" if (AUDIO_CONJ_DIR / mp3_name).exists() else ""


# Keywords excluded when building emoji lookup AND matching meaning text.
# Curated from manual review of all 2,261 emoji-word pairs (Sprint 8).
_EMOJI_STOP = frozenset(
    {
        # Basic stop words
        "to",
        "be",
        "a",
        "an",
        "the",
        "of",
        "in",
        "on",
        "at",
        "for",
        "and",
        "with",
        "by",
        "or",
        "but",
        "not",
        "as",
        "its",
        # Generic emoji description words (too vague)
        "face",
        "hand",
        "sign",
        "symbol",
        "button",
        "small",
        "large",
        "light",
        "dark",
        "open",
        "closed",
        # Numbers → clock emoji (🕐🕑🕒 etc.)
        "one",
        "two",
        "three",
        "four",
        "five",
        "six",
        "seven",
        "eight",
        "nine",
        "ten",
        "hundred",
        "thousand",
        # UI/media buttons (⏭️▶️⏪⏯️🔁🔚🔜⏺️)
        "next",
        "fast",
        "play",
        "pause",
        "repeat",
        "end",
        "soon",
        "record",
        # Abstract words → misleading object emoji
        "part",
        "place",
        "mark",
        "post",
        "department",
        "store",
        "note",
        "control",
        "level",
        "stop",
        "cover",
        "roll",
        "rolling",
        "pick",
        "over",
        "right",
        "way",
        "skin",
        "drop",
        "middle",
        "piece",
        "section",
        # Country/direction words → flag emoji (🇰🇵🇬🇸🇮🇴🇻🇦🇨🇫 etc.)
        "north",
        "south",
        "northern",
        "southern",
        "western",
        "eastern",
        "central",
        "territory",
        "kingdom",
        "united",
        "virgin",
        # Common words producing bad emoji matches
        "new",
        "big",
        "full",
        "last",
        "first",
        "double",
        "slightly",
        "without",
        "from",
        "behind",
        "people",
        "position",
        "status",
        "situation",
        "game",
        "call",
        "trade",
        "male",
        "female",
        "person",
        "letter",
        # Polysemous words → wrong emoji sense
        "french",
        "fried",
        "board",
        "bow",
        "water",
        "union",
        "rock",
        "left",
        "back",
        "crane",
        "dash",
        "bar",
        "wheel",
        "horizontal",
    }
)


def _load_emoji_lookup() -> dict[str, str]:
    """Load or fetch Unicode emoji keyword→character lookup.

    Parses unicode.org emoji-test.txt to build {keyword: emoji_char} mapping.
    Only "fully-qualified" lines are used; for each description word longer than
    two characters that is not in _EMOJI_STOP, the first emoji seen wins.

    The result is cached in data/emoji_lookup.json. The cache stores raw emoji
    characters, so it is always read and written as UTF-8 rather than with the
    platform's default encoding.

    Returns:
        The keyword→emoji mapping, or an empty dict when the data cannot be
        fetched (network failure, HTTP error, or ``requests`` not installed).
    """
    cache_file = DATA_DIR / "emoji_lookup.json"
    if cache_file.exists():
        with open(cache_file, encoding="utf-8") as f:
            return json.load(f)

    try:
        # Imported lazily and inside the try: it is only needed on a cache miss,
        # and a missing package should degrade to "no emoji" like any other
        # fetch failure instead of aborting the build.
        import requests

        resp = requests.get(
            "https://unicode.org/Public/emoji/latest/emoji-test.txt",
            timeout=30,
        )
        resp.raise_for_status()
    except Exception as e:
        logger.warning(f"Could not fetch emoji data: {e}. Emoji lookup disabled.")
        return {}

    # Decode explicitly as UTF-8 instead of trusting header-based charset
    # guessing, which can fall back to a single-byte encoding and garble emoji.
    resp.encoding = "utf-8"

    entry_re = re.compile(r"#\s+(\S+)\s+E[\d.]+\s+(.+)")
    lookup: dict[str, str] = {}
    for line in resp.text.splitlines():
        if "fully-qualified" not in line:
            continue
        m = entry_re.search(line)
        if not m:
            continue
        emoji_char = m.group(1)
        desc = m.group(2).lower().strip()
        for raw_word in desc.split():
            word = raw_word.strip(".,'\"-")
            if len(word) > 2 and word not in _EMOJI_STOP and word not in lookup:
                lookup[word] = emoji_char

    cache_file.write_text(json.dumps(lookup, ensure_ascii=False), encoding="utf-8")
    logger.info(f"Built emoji lookup: {len(lookup)} keywords → {cache_file}")
    return lookup


def _categorize_pos(pos_str: str) -> str:
    """Map a part-of-speech string to its grouping category.

    Only the text before the first en dash or em dash is considered (trimmed of
    surrounding whitespace). If that text is a key of POS_CATEGORY_LABELS it is
    returned; anything else yields ``"Other"``.
    """
    head, _, _ = pos_str.partition("–")
    head, _, _ = head.partition("—")
    head = head.strip()
    return head if head in POS_CATEGORY_LABELS else "Other"


def _forms_list_to_dict(forms_list: list[dict]) -> dict[str, dict]:
    """Re-key a schema ``active_forms`` list by ``<tense>_<person>``.

    The tense part of the key is the English prefix from TENSE_KEY_MAP (the raw
    tense value is used unchanged when it has no mapping); the person part is the
    form's ``person`` value. When two forms yield the same key, the later one wins.

    Each value dict has: form, form_ktiv, pronoun, tense (the original tense
    value), audio_url, guid, guid_candidates.
    """
    keyed: dict[str, dict] = {}
    for item in forms_list:
        tense_label = item["tense"]
        tense_prefix = TENSE_KEY_MAP.get(tense_label, tense_label)
        person = item["person"]
        spelling = item["form"]
        keyed[f"{tense_prefix}_{person}"] = dict(
            form=spelling["nikkud"],
            form_ktiv=spelling.get("ktiv_male", ""),
            pronoun=item.get("pronoun_hebrew", ""),  # Hebrew pronoun string
            tense=tense_label,  # original (un-mapped) tense label
            audio_url=item.get("audio_url", ""),
            guid=item.get("guid"),
            guid_candidates=item.get("guid_candidates"),
        )
    return keyed


# Hebrew prefix letters (אותיות השימוש): בהוכלמש
_PREFIX_LETTERS = frozenset("בהוכלמש")


def _cloze_prefix_len(cloze_token: str, word_nikkud: str) -> int:
    """Return how many leading characters of ``cloze_token`` are an attached prefix.

    The result is the index at which ``word_nikkud`` first occurs inside the
    token — i.e. the prefix letter(s) together with any marks that sit between
    them and the word. It is 0 when:

    * either argument is empty,
    * the token starts with the word, or does not contain it at all,
    * the part before the word contains no letter in U+05D0–U+05EA, or contains
      one that is not in _PREFIX_LETTERS.
    """
    if not (word_nikkud and cloze_token):
        return 0

    offset = cloze_token.find(word_nikkud)
    if offset <= 0:
        # -1 → word absent; 0 → token begins with the word. No prefix either way.
        return 0

    # Ignore marks; only the base letters decide whether this is a real prefix.
    consonants = [ch for ch in cloze_token[:offset] if "\u05d0" <= ch <= "\u05ea"]
    if not consonants:
        return 0
    return offset if _PREFIX_LETTERS.issuperset(consonants) else 0


def build_vocab_deck(
    words: dict[str, dict],
    limit: int | None = None,
    include_audio: bool = True,
    include_images: bool = True,
    emoji_lookup: dict | None = None,
) -> tuple[genanki.Deck, list[Path]]:
    """Build the vocabulary deck from the unified words dict.

    Entries are visited in ascending ``frequency`` order (entries without a
    frequency sort last). An entry is skipped when its nikkud form or its cleaned
    meaning is empty, or when an earlier entry already produced the same
    (nikkud, cleaned meaning) pair.

    Args:
        words: Unified data dict keyed by unique_key (from words.json).
        limit: If set, only process the first N entries (by frequency).
            A falsy value (``None`` or ``0``) applies no limit.
        include_audio: Whether to include audio tags in notes.
        include_images: Whether to include image tags in notes.
        emoji_lookup: Optional Unicode emoji keyword→char mapping for fallback emoji.

    Returns:
        (deck, list_of_media_files) — media files are de-duplicated and kept in
        first-seen order.
    """
    logger.info(f"Building vocabulary deck from {len(words)} words …")

    images_dir = DATA_DIR / "images"

    # Sort entries by frequency (null → 999999), applying limit after sort
    def _freq_key(item: tuple[str, dict]) -> int:
        return item[1].get("frequency") or 999_999

    sorted_entries = sorted(words.items(), key=_freq_key)
    if limit:
        sorted_entries = sorted_entries[:limit]

    deck = genanki.Deck(VOCAB_DECK_ID, "Hebrew Vocabulary")
    media_files: list[Path] = []
    media_seen: set[Path] = set()  # O(1) membership companion to media_files
    seen_words: set[tuple[str, str]] = set()

    def _add_media(path: Path) -> None:
        """Append *path* to media_files unless it was already collected."""
        if path not in media_seen:
            media_seen.add(path)
            media_files.append(path)

    # (inclusive upper bound, label) — first band whose bound is >= frequency wins.
    freq_bands = (
        (500, "Core"),
        (1500, "Essential"),
        (3000, "Intermediate"),
        (5000, "Upper-intermediate"),
        (10000, "Advanced"),
    )

    # Per-field "non-empty" counters for the diagnostics at the end.
    field_counts = dict.fromkeys(("emoji", "prep", "hint", "plural", "gender", "cloze"), 0)

    for _unique_key, entry in sorted_entries:
        word_nikkud = entry["word"]["nikkud"]
        word_no_nik = entry["word"].get("ktiv_male") or ""
        root = ".".join(entry.get("root") or [])
        pos_raw = entry.get("pos") or ""
        pos_heb = entry.get("pos_hebrew") or ""

        meaning = EMOJI_RE.sub("", entry.get("meaning", "") or "").strip()
        meaning = HBPAREN_RE.sub("", meaning).strip()
        # Strip remaining bare Hebrew text (spoiler: ktiv male visible in English meaning)
        meaning = re.sub(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*", "", meaning)
        meaning = re.sub(r"[;:]\s*—", " —", meaning)  # clean "; —" → " —"
        meaning = re.sub(r";\s*:", ";", meaning)  # clean "; :" → ";"
        meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ;:")
        meaning = re.sub(r"(\w)\(", r"\1 (", meaning)  # space before opening paren
        meaning = re.sub(r",(\S)", r", \1", meaning)  # space after comma

        if not word_nikkud or not meaning:
            continue

        # Skip exact duplicates (same word AND same meaning)
        word_meaning_key = (word_nikkud, meaning)
        if word_meaning_key in seen_words:
            logger.debug(f"  Skipping duplicate word+meaning: {word_nikkud}")
            continue
        seen_words.add(word_meaning_key)

        meaning_raw = entry.get("meaning_raw", "") or ""
        slug = entry.get("slug", "") or ""
        frequency = entry.get("frequency") or 999_999
        audio_file = entry.get("audio_file", "") or ""
        tags_str = entry.get("tags", "") or ""
        hint_str = entry.get("hint", "") or ""
        shared_roots_keys = entry.get("shared_roots") or []
        is_confusable = entry.get("confusable_group") is not None

        # Frequency display label
        if frequency < 999_999:
            band = next((label for bound, label in freq_bands if frequency <= bound), "Rare")
            freq_display = f"{band} #{frequency}"
        else:
            freq_display = "Unlisted"

        # Emoji: use entry's emoji if emoji_visible, else fall back to emoji_lookup
        emoji_str = ""
        if entry.get("emoji_visible") and entry.get("emoji"):
            emoji_str = entry["emoji"]
        elif emoji_lookup:
            # `meaning` already had emoji removed above, so it is tokenised as-is;
            # only the first five words are considered.
            for kw in re.sub(r"[^\w\s]", " ", meaning.lower()).split()[:5]:
                if len(kw) > 2 and kw not in _EMOJI_STOP and kw in emoji_lookup:
                    emoji_str = emoji_lookup[kw]
                    break

        # Extract Hebrew prepositions from meaning_raw
        preps = HBPAREN_RE.findall(meaning_raw)
        prep_str = " ".join(f"({p})" for p in preps)

        # Audio — use audio_file from entry; for confusables it's already slug-based.
        # When the entry names no file, or the named file is missing on disk,
        # fall back to the slug-/consonant-based lookup in _audio_tag.
        audio_tag = ""
        if include_audio:
            named_mp3 = AUDIO_DIR / audio_file if audio_file else None
            if named_mp3 is not None and named_mp3.exists():
                audio_tag = f"[sound:{audio_file}]"
                _add_media(named_mp3)
            else:
                audio_tag = _audio_tag(word_no_nik, slug=slug if is_confusable else "")
                if audio_tag:
                    mp3_name = audio_tag.removeprefix("[sound:").removesuffix("]")
                    _add_media(AUDIO_DIR / mp3_name)

        # Example sentence from vetted examples
        example_html = ""
        examples = entry.get("examples") or {}
        if examples.get("vetted"):
            example_html = examples["vetted"][0]["text"]

        # Cloze: use pre-computed cloze from words.json
        cloze_example = ""
        cloze_hint = ""
        if not is_confusable and examples.get("cloze"):
            cloze_data = examples["cloze"]
            cloze_text = cloze_data.get("text", "")
            start = cloze_data.get("cloze_word_start")
            end = cloze_data.get("cloze_word_end")
            if cloze_text and start is not None and end is not None:
                # Preserve Hebrew prefix letters in the cloze blank
                # e.g. "בַּתּוֹר" for word "תּוֹר" → "בַּ_____" not "_____"
                cloze_token = cloze_text[start:end]
                prefix_chars = _cloze_prefix_len(cloze_token, word_nikkud)
                cloze_example = cloze_text[: start + prefix_chars] + "_____" + cloze_text[end:]
                # Clean up duplicate adjacent quotation marks (e.g. "" → ")
                cloze_example = re.sub(r'["״]\s*["״]', '"', cloze_example)
                raw_hint = cloze_data.get("cloze_hint") or ""
                if raw_hint:
                    cloze_hint = raw_hint
                else:
                    pos_cat = _categorize_pos(pos_raw) if pos_raw else "Other"
                    cloze_hint = meaning
                    if pos_cat == "Verb" and pos_heb:
                        cloze_hint = f"{meaning} ({pos_heb})"

        # Related words (shared roots) as a table: word — meaning, sorted by frequency
        related_html = ""
        if shared_roots_keys:
            rw_items: list[tuple[int, str, str]] = []  # (sort_key, nikkud, meaning)
            for rw_key in shared_roots_keys:
                rw_entry = words.get(rw_key)
                if rw_entry:
                    rw_nikkud = rw_entry["word"]["nikkud"]
                    rw_meaning = rw_entry.get("meaning") or ""
                    if len(rw_meaning) > 40:
                        rw_meaning = rw_meaning[:37] + "…"
                    rw_freq = rw_entry.get("frequency") or 999999
                else:
                    # Key not present in `words`: show the key itself, no meaning.
                    rw_nikkud = rw_key
                    rw_meaning = ""
                    rw_freq = 999999
                rw_items.append((rw_freq, rw_nikkud, rw_meaning))
            rw_items.sort(key=lambda x: x[0])
            related_html = "\n".join(
                f'<div class="sec-label">'
                f'<span class="rw-word">{rw_nikkud}</span>'
                f'<span class="rw-meaning">{rw_meaning}</span>'
                f"</div>"
                for _freq, rw_nikkud, rw_meaning in rw_items
            )

        # Plural form and gender (nouns only)
        plural_str = ""
        gender_str = ""
        if pos_raw.startswith("Noun"):
            noun_inflection = entry.get("noun_inflection")
            if noun_inflection:
                if noun_inflection.get("plural"):
                    plural_str = noun_inflection["plural"].get("nikkud", "")
                gender_raw = noun_inflection.get("gender") or ""
                if gender_raw == "masculine":
                    gender_str = "זָכָר"
                elif gender_raw == "feminine":
                    gender_str = "נְקֵבָה"

        # Image — only referenced when the file actually exists on disk
        image_tag = ""
        if include_images:
            image_filename = entry.get("image") or ""
            if image_filename:
                image_path = images_dir / image_filename
                if image_path.exists():
                    image_tag = image_filename
                    _add_media(image_path)

        # GUID: use vocab_legacy_guid from entry, fall back to deterministic
        legacy_guid = entry.get("vocab_legacy_guid")
        note_guid = legacy_guid or genanki.guid_for(word_nikkud, meaning)

        # First word of the label ("Core", "Rare", "Unlisted", …) becomes the tag.
        freq_tag = f"freq::{freq_display.split()[0]}"

        # Field order must match VOCAB_MODEL's field list.
        note = genanki.Note(
            model=VOCAB_MODEL,
            guid=note_guid,
            fields=[
                word_nikkud,
                root,
                pos_heb,
                meaning,
                word_no_nik,
                related_html or "",
                tags_str,
                audio_tag,
                example_html,
                freq_display,
                image_tag,
                emoji_str,
                prep_str,
                hint_str,
                plural_str,
                gender_str,
                cloze_example,
                cloze_hint,
            ],
            tags=tags_str.split() + [RELEASE_TAG, freq_tag],
        )
        deck.add_note(note)

        for label, value in (
            ("emoji", emoji_str),
            ("prep", prep_str),
            ("hint", hint_str),
            ("plural", plural_str),
            ("gender", gender_str),
            ("cloze", cloze_example),
        ):
            if value:
                field_counts[label] += 1

    # Diagnostics
    unlisted = sum(1 for _, e in sorted_entries if (e.get("frequency") or 999_999) >= 999_999)
    if field_counts["emoji"]:
        logger.info(f"  Emoji extracted: {field_counts['emoji']} words")
    if field_counts["prep"]:
        logger.info(f"  Hebrew prepositions extracted: {field_counts['prep']} words")
    if field_counts["hint"]:
        logger.info(f"  Eng→Heb hints: {field_counts['hint']} words")
    if field_counts["plural"]:
        logger.info(f"  Noun plurals on vocab cards: {field_counts['plural']} words")
    if field_counts["gender"]:
        logger.info(f"  Noun gender on vocab cards: {field_counts['gender']} words")
    if field_counts["cloze"]:
        logger.info(f"  Sentence cloze cards: {field_counts['cloze']} words")
    logger.info(f"  Unlisted words (not in frequency corpus): {unlisted}/{len(sorted_entries)}")
    logger.info(f"Vocabulary deck: {len(deck.notes)} notes")
    return deck, media_files


def build_conj_deck(
    words: dict[str, dict],
    audio_dir: Path = AUDIO_CONJ_DIR,
    include_audio: bool = True,
) -> tuple[genanki.Deck, list[Path]]:
    """Build the conjugation drill deck from words with in_conjugation_deck=True.

    For each qualifying verb one note is emitted per active form (the
    infinitive is skipped), followed by one note per passive-partner form when
    the entry carries ``hufal_pual_forms``.  Pronoun and gender labels that are
    picked at random use an RNG seeded from the infinitive so that rebuilding
    the deck yields the same cards.

    Args:
        words: Mapping of unique key to word entry.
        audio_dir: Directory under which per-form mp3 paths are built.
        include_audio: When true, attach audio tags and collect mp3 paths.

    Returns:
        ``(deck, media_files)`` where ``media_files`` lists each referenced
        mp3 path once, in first-use order.
    """
    # Local import keeps this block self-contained (zlib is standard library).
    import zlib

    deck = genanki.Deck(CONJ_DECK_ID, "Hebrew Conjugations")
    media_files: list[Path] = []
    seen_media: set[Path] = set()  # O(1) membership companion of media_files
    note_count = 0
    verb_count = 0

    first_person_keys = {"past_1s", "past_1p", "future_1s", "future_1p"}

    # Build root → [(freq, nikkud, meaning)] lookup for cross-linking
    root_words: dict[str, list[tuple[int, str, str]]] = {}
    for entry in words.values():
        root_key = " ".join(entry.get("root") or [])
        if not root_key:
            continue
        rw_meaning = entry.get("meaning") or ""
        if len(rw_meaning) > 40:
            rw_meaning = rw_meaning[:37] + "…"
        rw_freq = entry.get("frequency") or 999999
        root_words.setdefault(root_key, []).append((rw_freq, entry["word"]["nikkud"], rw_meaning))

    for _unique_key, entry in words.items():
        conj = entry.get("conjugation")
        if not conj or not conj.get("in_conjugation_deck"):
            continue

        active_forms_list = conj.get("active_forms") or []
        if not active_forms_list:
            continue

        verb_count += 1
        infinitive = conj["infinitive"]["nikkud"]
        ref_form = conj["reference_form"]["nikkud"]
        binyan = conj.get("binyan", "")
        binyan_heb: str = conj.get("binyan_hebrew") or BINYAN_TO_HEBREW.get(binyan, binyan) or ""
        slug = entry.get("slug", "") or ""
        root_list = entry.get("root") or []
        root = ".".join(root_list)  # display form written to the Root field
        voice = VOICE_MAP.get(binyan, "")

        meaning_raw = entry.get("meaning_raw", "") or ""
        meaning = entry.get("meaning", "") or ""
        # Extract Hebrew preposition — strip from meaning, show on Hebrew side
        prep_str = ""
        conj_prep = conj.get("prep")
        if conj_prep:
            # Strip any parentheses from stored prep value
            prep_str = conj_prep.strip("() ")
        elif meaning_raw:
            preps = HBPAREN_RE.findall(meaning_raw)
            if preps:
                prep_str = preps[0]
        # Strip Hebrew prepositions from English meaning to avoid duplication
        if prep_str:
            meaning = HBPAREN_RE.sub("", meaning).strip()
            # Also strip from meaning_raw patterns like "(על)"
            meaning = re.sub(r"\(\s*" + re.escape(prep_str) + r"\s*-?\s*\)", "", meaning).strip()
            # Clean up double spaces and trailing commas
            meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ")

        # The lookup table is keyed by the space-joined root, so it must be
        # queried with that same key rather than the dotted display form.
        same_root = root_words.get(" ".join(root_list), [])
        related = [(f, w, m) for f, w, m in same_root if w != infinitive]
        related.sort(key=lambda x: x[0])
        related_str = "\n".join(
            f'<div class="sec-label">'
            f'<span class="rw-word">{rw_nikkud}</span>'
            f'<span class="rw-meaning">{rw_meaning}</span>'
            f"</div>"
            for _freq, rw_nikkud, rw_meaning in related[:8]
        )

        forms = _forms_list_to_dict(active_forms_list)

        # Per-verb values are bound as keyword defaults so the helper does not
        # close over loop variables; the passive pass overrides three of them.
        def add_note(
            pronoun: str,
            tense: str,
            conj_form: str,
            audio_tag: str,
            _form_key_for_guid: str,
            guid_val: str | None = None,
            guid_candidates: list[str] | None = None,
            *,
            _infinitive: str = infinitive,
            _ref_form: str = ref_form,
            _root: str = root,
            _binyan_heb: str = binyan_heb,
            _voice: str = voice,
            _meaning: str = meaning,
            _related_str: str = related_str,
            _prep_str: str = prep_str,
        ) -> None:
            """Add one conjugation note; forms without a Hebrew letter are skipped."""
            nonlocal note_count
            if not conj_form or not re.search(r"[\u05d0-\u05ea]", conj_form):
                return
            # Apply tense prefix (בְּ)
            display_tense = TENSE_WITH_BE.get(tense, tense)
            # GUID: use stored guid, then first candidate, then deterministic fallback
            if guid_val:
                note_guid = guid_val
            elif guid_candidates:
                note_guid = guid_candidates[0]
            else:
                note_guid = genanki.guid_for(_infinitive, pronoun, tense, _binyan_heb)
            note = genanki.Note(
                model=CONJ_MODEL,
                guid=note_guid,
                fields=[
                    _infinitive,
                    _ref_form,
                    pronoun,
                    display_tense,
                    conj_form,
                    _root,
                    _binyan_heb,
                    _voice,
                    audio_tag,
                    _meaning,
                    _related_str,
                    _prep_str,
                ],
                tags=[RELEASE_TAG],
            )
            deck.add_note(note)
            note_count += 1

        # Seeded RNG per verb — deterministic pronoun/gender choices.
        # The built-in hash() of a str is salted per interpreter process, so a
        # CRC of the UTF-8 bytes is used to get a seed that is stable across runs.
        verb_rng = random.Random(zlib.crc32(infinitive.encode("utf-8")))

        # Each pass is (forms dict, audio-key prefix, add_note keyword overrides).
        # The active pass always runs first so the RNG draw order is fixed.
        form_passes: list[tuple[dict, str, dict[str, str]]] = [(forms, "", {})]

        # Passive partner forms (Huf'al/Pu'al counterpart)
        hufal_forms_list = conj.get("hufal_pual_forms")
        if hufal_forms_list:
            ref_passive = conj.get("reference_form_passive")
            passive_binyan = "Huf'al" if binyan == "Hif'il" else "Pu'al"
            form_passes.append(
                (
                    _forms_list_to_dict(hufal_forms_list),
                    "passive_",
                    {
                        "_ref_form": ref_passive["nikkud"] if ref_passive else ref_form,
                        "_binyan_heb": BINYAN_TO_HEBREW.get(passive_binyan, passive_binyan),
                        "_voice": VOICE_MAP.get(passive_binyan, "סָבִיל"),
                    },
                )
            )

        for form_map, audio_prefix, overrides in form_passes:
            for form_key, form_data in form_map.items():
                # Infinitive: shown on card front as reference — skip as a quiz form
                if form_key == "infinitive":
                    continue
                conj_form = form_data.get("form", "")

                # Audio tag
                audio_tag = ""
                if include_audio and slug:
                    audio_key = f"{audio_prefix}{form_key}"
                    audio_tag = _conj_audio_tag(slug, audio_key)
                    if audio_tag:
                        mp3_path = audio_dir / f"{slug}_{audio_key}.mp3"
                        if mp3_path not in seen_media:
                            seen_media.add(mp3_path)
                            media_files.append(mp3_path)

                guid_val = form_data.get("guid")
                guid_candidates = form_data.get("guid_candidates")

                if form_key in PRESENT_EXPANSION:
                    # Present tense expansion: 4 form keys → 1 card each (seeded RNG)
                    chosen = verb_rng.choice(PRESENT_EXPANSION[form_key])
                    pronoun, tense = chosen[0], chosen[1]
                elif form_key == "past_3p":
                    # Past 3rd plural: same form for m/f → 1 card (seeded RNG)
                    chosen = verb_rng.choice(PAST_3P_EXPANSION)
                    pronoun, tense = chosen[0], chosen[1]
                else:
                    pronoun = form_data.get("pronoun", "")
                    tense = form_data.get("tense", "")
                    if form_key in FP_MODERN_FALLBACK:
                        # 2fp/3fp future and imperative: show modern (mp) form
                        # + classical (fp) in parens
                        mp_form = form_map.get(FP_MODERN_FALLBACK[form_key], {}).get("form", "")
                        if mp_form and mp_form != conj_form:
                            conj_form = f"{mp_form} ({conj_form})"
                    elif form_key in first_person_keys:
                        # 1st-person forms get a randomly assigned gender label
                        # (deterministic per verb)
                        gender = verb_rng.choice(["זָכָר", "נְקֵבָה"])
                        pronoun = f"{pronoun} ({gender})"

                add_note(
                    pronoun,
                    tense,
                    conj_form,
                    audio_tag,
                    form_key,
                    guid_val,
                    guid_candidates,
                    **overrides,
                )

    logger.info(f"Conjugation deck: {note_count} notes across {verb_count} verbs")
    return deck, media_files


# ──────────────────────────────────────────────────────────────────────────────
# Confusables deck — words that look identical without nikkud
# ──────────────────────────────────────────────────────────────────────────────

# Question side: the confusable words from the Words field, followed by a
# fixed Hebrew prompt ("What is the difference?").
CONF_FRONT = """
<div style="direction:rtl; text-align:center;">
<div class="hebrew" style="font-size:36px;">{{Words}}</div>
<div class="meaning" style="font-size:32px; direction:rtl; text-align:center;">מה ההבדל?</div>
</div>
"""

# Answer side: repeats the front, then shows the Definitions field and, only
# when the Audio field is non-empty, the audio block.
CONF_BACK = """
{{FrontSide}}<hr>
<div class="definitions">{{Definitions}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
"""

# The confusables cards reuse the CARD_CSS stylesheet defined elsewhere in this file.
CONF_CSS = CARD_CSS

# Note type for the confusables deck: four fields, one card template.
# WordNoNikkud is stored on the note but not referenced by either template above.
CONF_MODEL = genanki.Model(
    CONF_MODEL_ID,
    "Hebrew Confusables",
    fields=[
        {"name": "Words"},
        {"name": "Definitions"},
        {"name": "Audio"},
        {"name": "WordNoNikkud"},
    ],
    templates=[
        {
            "name": "Confusable",
            "qfmt": CONF_FRONT,
            "afmt": CONF_BACK,
        },
    ],
    css=CONF_CSS,
)


def build_confusables_deck(
    words: dict[str, dict],
    include_audio: bool = True,
) -> tuple[genanki.Deck, list[Path]]:
    """Build confusables deck from words dict — groups words by confusable_group.

    Entries whose ``confusable_group`` is not ``None`` are grouped by their
    ``confusables_guid`` (falling back to a GUID derived from ``ktiv_male``).
    Each group with at least two distinct (nikkud, meaning) members becomes a
    single note; groups are emitted in ascending order of mean frequency.

    Args:
        words: Mapping of unique key to word entry.
        include_audio: When true, attach one audio tag per distinct recording
            and collect the matching mp3 paths.

    Returns:
        ``(deck, media_files)`` where ``media_files`` lists each referenced
        mp3 path once, in first-use order.
    """
    logger.info("Building confusables deck …")

    deck = genanki.Deck(CONF_DECK_ID, "Hebrew Confusables")
    media_files: list[Path] = []
    seen_media: set[Path] = set()  # O(1) membership companion of media_files
    note_count = 0

    # Build guid → list of entries. The shared confusables_guid doubles as the
    # stable note GUID; dict keys are unique, so each group is visited once.
    guid_to_entries: dict[str, list[dict]] = {}
    for unique_key, entry in words.items():
        if entry.get("confusable_group") is None:
            continue
        # Fall back to ktiv_male-based guid when none is stored
        guid = entry.get("confusables_guid") or genanki.guid_for(
            "confusable", entry["word"].get("ktiv_male", unique_key)
        )
        guid_to_entries.setdefault(guid, []).append(entry)

    def mean_frequency(item: tuple[str, list[dict]]) -> float:
        """Average frequency rank of a group; unranked members count as 999_999."""
        members = item[1]
        return sum(e.get("frequency") or 999_999 for e in members) / len(members)

    for guid, group_entries in sorted(guid_to_entries.items(), key=mean_frequency):
        if len(group_entries) < 2:
            continue

        # Deduplicate: skip entries with identical word+meaning
        seen: set[tuple[str, str]] = set()
        unique_entries: list[dict] = []
        for e in group_entries:
            key = (e["word"]["nikkud"], e.get("meaning") or "")
            if key not in seen:
                seen.add(key)
                unique_entries.append(e)
        if len(unique_entries) < 2:
            continue

        word_no_nik = unique_entries[0]["word"].get("ktiv_male") or ""
        words_display = " / ".join(e["word"]["nikkud"] for e in unique_entries)

        defs_parts: list[str] = []
        audio_parts: list[str] = []
        for e in unique_entries:
            w = e["word"]["nikkud"]
            # `or ""` so a stored None is not rendered as the text "None"
            m = e.get("meaning") or ""
            p = e.get("pos_hebrew", "")
            pos_div = f'<div style="font-size:18px; color:#888;">{p}</div>' if p else ""
            defs_parts.append(
                f'<div class="conf-entry">'
                f'<span class="hebrew" style="font-size:24px;">{w}</span>'
                f'<div style="direction:ltr; text-align:center; font-size:22px;">{m}</div>'
                f"{pos_div}</div>"
            )
            if not include_audio:
                continue

            # Prefer the entry's own audio_file when it exists on disk,
            # otherwise ask _audio_tag for one based on ktiv_male / slug.
            af = e.get("audio_file", "") or ""
            at = ""
            if af and (AUDIO_DIR / af).exists():
                at = f"[sound:{af}]"
            if not at:
                slug = e.get("slug", "") or ""
                ktiv_male = e.get("word", {}).get("ktiv_male", "") or ""
                at = _audio_tag(ktiv_male, slug=slug)
            if not at or at in audio_parts:
                continue
            audio_parts.append(at)
            mp3_path = AUDIO_DIR / at.removeprefix("[sound:").removesuffix("]")
            if mp3_path not in seen_media:
                seen_media.add(mp3_path)
                media_files.append(mp3_path)

        defs_html = "\n".join(defs_parts)
        audio_html = " ".join(audio_parts)

        note = genanki.Note(
            model=CONF_MODEL,
            guid=guid,
            fields=[words_display, defs_html, audio_html, word_no_nik],
            tags=[RELEASE_TAG],
        )
        deck.add_note(note)
        note_count += 1

    logger.info(f"Confusables deck: {note_count} notes")
    return deck, media_files


def write_conf_apkg(
    deck: genanki.Deck,
    media_files: list[Path] | None = None,
    out_path: Path = CONF_APKG,
) -> None:
    """Write the confusables deck to ``out_path`` as an .apkg package.

    The output directory is created if needed. Only media paths that exist on
    disk are bundled, followed by the font files from ``_font_media_files()``.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    existing_media: list[str] = []
    for media_path in media_files or []:
        if media_path.exists():
            existing_media.append(str(media_path))

    package = genanki.Package(deck)
    package.media_files = existing_media + _font_media_files()
    package.write_to_file(str(out_path))
    logger.info(f"Confusables deck written → {out_path}")


# ──────────────────────────────────────────────────────────────────────────────
# Noun plurals deck — singular↔plural drilling
# ──────────────────────────────────────────────────────────────────────────────

# Card 1 question (singular → plural): the singular form, its audio when the
# SingularAudio field is non-empty, the meaning, and a direction hint.
PLURAL_FRONT_SG = """
<div class="hebrew" style="color:#1a1a8c;">{{Singular}}</div>
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
<div class="sec-label">{{Meaning}}</div>
<div class="hint" style="font-size:28px;">יָחִיד ← רַבִּים</div>
"""

# Card 1 answer: repeats the front, then the plural form, optional plural
# audio, and the Gender / Mishkal rows (each shown only when non-empty).
PLURAL_BACK_SG = """
{{FrontSide}}<hr>
<div class="hebrew">{{Plural}}</div>
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
<div class="sec-table">
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span><span class="sec-val">{{Gender}}</span></div>{{/Gender}}
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span><span class="sec-val">{{Mishkal}}</span></div>{{/Mishkal}}
</div>
"""

# Card 2 question (plural → singular): the plural form, optional plural audio,
# and a direction hint. The meaning is not shown on this front.
PLURAL_FRONT_PL = """
<div class="hebrew" style="color:#1a1a8c;">{{Plural}}</div>
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
<div class="hint" style="font-size:28px;">רַבִּים ← יָחִיד</div>
"""

# Card 2 answer: repeats the front, then the singular form, optional singular
# audio, the meaning, and the Gender / Mishkal rows (each shown only when non-empty).
PLURAL_BACK_PL = """
{{FrontSide}}<hr>
<div class="hebrew">{{Singular}}</div>
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
<div class="sec-label" style="text-align:center;display:block;">{{Meaning}}</div>
<div class="sec-table">
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span><span class="sec-val">{{Gender}}</span></div>{{/Gender}}
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span><span class="sec-val">{{Mishkal}}</span></div>{{/Mishkal}}
</div>
"""

# The plural cards reuse the CARD_CSS stylesheet defined elsewhere in this file.
PLURAL_CSS = CARD_CSS

# Note type for the plurals deck: eight fields, two card templates (one per
# drilling direction). Root is stored on the note but not referenced by any of
# the four templates above.
PLURAL_MODEL = genanki.Model(
    PLURAL_MODEL_ID,
    "Hebrew Plurals",
    fields=[
        {"name": "Singular"},
        {"name": "SingularAudio"},
        {"name": "Plural"},
        {"name": "PluralAudio"},
        {"name": "Meaning"},
        {"name": "Root"},
        {"name": "Mishkal"},
        {"name": "Gender"},
    ],
    templates=[
        {
            "name": "Singular → Plural",
            "qfmt": PLURAL_FRONT_SG,
            "afmt": PLURAL_BACK_SG,
        },
        {
            "name": "Plural → Singular",
            "qfmt": PLURAL_FRONT_PL,
            "afmt": PLURAL_BACK_PL,
        },
    ],
    css=PLURAL_CSS,
)


def _is_irregular_plural(gender: str, plural_ktiv: str) -> bool:
    """Detect irregular plurals: masc nouns with ות- suffix, fem with ים- suffix.

    Args:
        gender: ``"masculine"`` or ``"feminine"``.
        plural_ktiv: ktiv male (no nikkud) form of the plural.
    """
    return (gender == "masculine" and plural_ktiv.endswith("ות")) or (
        gender == "feminine" and plural_ktiv.endswith("ים")
    )


def build_plural_deck(
    words: dict[str, dict],
    include_audio: bool = False,
) -> tuple[genanki.Deck, list[Path]]:
    """Build noun plurals deck from words with noun_inflection data.

    Selection: ALL irregular plurals, plus regular exemplars drawn from each
    mishkal pattern in frequency order (at least two per pattern).  When
    irregulars exist, the regular pool is capped at twice their number,
    keeping the highest-frequency words across all patterns.

    Args:
        words: Mapping of unique key to word entry.
        include_audio: When true, attach a singular audio tag where
            ``_audio_tag`` returns one and collect the matching mp3 paths.

    Returns:
        ``(deck, media_files)`` where ``media_files`` lists each referenced
        mp3 path once, in first-use order.
    """
    logger.info("Building plurals deck …")

    deck = genanki.Deck(PLURAL_DECK_ID, "Hebrew Plurals")
    media_files: list[Path] = []
    seen_media: set[Path] = set()  # O(1) membership companion of media_files

    def by_frequency(candidate: tuple[str, dict, dict]) -> int:
        """Sort key: frequency rank of the entry, unranked words last."""
        return candidate[1].get("frequency") or 999_999

    # Collect all nouns with both singular and plural
    irregulars: list[tuple[str, dict, dict]] = []  # (unique_key, entry, noun_inflection)
    by_mishkal: dict[str, list[tuple[str, dict, dict]]] = {}

    for unique_key, entry in words.items():
        # `or ""` guards against an explicit None stored under "pos"
        if not (entry.get("pos") or "").startswith("Noun"):
            continue
        noun_inflection = entry.get("noun_inflection")
        if not noun_inflection:
            continue
        singular_data = noun_inflection.get("singular")
        plural_data = noun_inflection.get("plural")
        if not singular_data or not plural_data:
            continue
        if not singular_data.get("nikkud", "") or not plural_data.get("nikkud", ""):
            continue

        gender = noun_inflection.get("gender") or ""
        mishkal = noun_inflection.get("mishkal") or ""
        plural_ktiv = plural_data.get("ktiv_male") or ""

        candidate = (unique_key, entry, noun_inflection)
        if _is_irregular_plural(gender, plural_ktiv):
            irregulars.append(candidate)
        elif mishkal:
            by_mishkal.setdefault(mishkal, []).append(candidate)

    # Select regular exemplars to achieve a 2:1 regular:irregular ratio.
    # Distribute evenly across mishkal patterns, preferring high-frequency words.
    irregular_count = len(irregulars)
    target_regular = irregular_count * 2
    per_mishkal = max(2, target_regular // (len(by_mishkal) or 1))

    regular_pool: list[tuple[str, dict, dict]] = []
    for _mishkal, entries in sorted(by_mishkal.items()):
        entries.sort(key=by_frequency)
        regular_pool.extend(entries[:per_mishkal])

    # If we overshot, trim to target (keeping highest-frequency across all mishkals).
    # With no irregulars the target is 0; trimming then would discard every
    # exemplar and leave the deck empty, so the per-mishkal picks are kept as-is.
    if irregular_count and len(regular_pool) > target_regular:
        regular_pool.sort(key=by_frequency)
        regular_pool = regular_pool[:target_regular]

    selected = irregulars + regular_pool

    note_count = 0
    for _unique_key, entry, noun_inflection in selected:
        singular = noun_inflection["singular"]["nikkud"]
        singular_ktiv = noun_inflection["singular"].get("ktiv_male") or ""
        plural = noun_inflection["plural"]["nikkud"]
        plural_ktiv = noun_inflection["plural"].get("ktiv_male") or ""
        gender = noun_inflection.get("gender") or ""
        gender_heb = {"masculine": "זָכָר", "feminine": "נְקֵבָה"}.get(gender, gender)
        mishkal_heb = noun_inflection.get("mishkal_hebrew") or ""
        meaning = EMOJI_RE.sub("", entry.get("meaning") or "").strip()
        root = ".".join(entry.get("root") or [])

        # GUID from noun_inflection, else derived from singular + meaning
        note_guid = noun_inflection.get("plurals_guid") or genanki.guid_for("plural", singular, meaning)

        # Audio tags
        sg_audio = ""
        # NOTE(review): the PluralAudio field is never populated here, even
        # with include_audio=True — confirm whether plural audio is intended.
        pl_audio = ""
        if include_audio:
            sg_tag = _audio_tag(singular_ktiv)
            if sg_tag:
                sg_audio = sg_tag
                mp3_path = AUDIO_DIR / sg_tag.removeprefix("[sound:").removesuffix("]")
                if mp3_path not in seen_media:
                    seen_media.add(mp3_path)
                    media_files.append(mp3_path)

        mishkal_eng = noun_inflection.get("mishkal") or ""
        tags = [RELEASE_TAG]
        if mishkal_eng:
            tags.append(f"mishkal::{mishkal_eng}")
        if _is_irregular_plural(gender, plural_ktiv):
            tags.append("irregular")

        note = genanki.Note(
            model=PLURAL_MODEL,
            guid=note_guid,
            fields=[
                singular,
                sg_audio,
                plural,
                pl_audio,
                meaning,
                root,
                mishkal_heb,
                gender_heb,
            ],
            tags=tags,
        )
        deck.add_note(note)
        note_count += 1

    regular_count = note_count - irregular_count
    logger.info(
        f"Plurals deck: {note_count} notes "
        f"({irregular_count} irregular + {regular_count} regular exemplars "
        f"from {len(by_mishkal)} mishkal patterns)"
    )
    return deck, media_files


def write_plural_apkg(
    deck: genanki.Deck,
    media_files: list[Path] | None = None,
    out_path: Path = PLURAL_APKG,
) -> None:
    """Write the plurals deck to ``out_path`` as an .apkg package.

    The output directory is created if needed. Only media paths that exist on
    disk are bundled, followed by the font files from ``_font_media_files()``.
    """
    target_dir = out_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    package = genanki.Package(deck)
    candidates = media_files if media_files else []
    on_disk = list(map(str, (path for path in candidates if path.exists())))
    package.media_files = on_disk + _font_media_files()
    package.write_to_file(str(out_path))
    logger.info(f"Plurals deck written → {out_path}")


def _font_media_files() -> list[str]:
    """Collect the ``_Heebo*.ttf`` files found in FONTS_DIR as string paths.

    The result is appended to every package's media list so the fonts are
    bundled in the .apkg.
    """
    bundled: list[str] = []
    for font_file in FONTS_DIR.glob("_Heebo*.ttf"):
        if font_file.exists():
            bundled.append(str(font_file))
    return bundled


class _RandomOrderPackage(genanki.Package):
    """Package variant whose deck options use random new-card order (0) rather than insertion order (1)."""

    def write_to_db(self, cursor, timestamp, id_gen):
        """Write the package as usual, then patch ``new.order`` in the stored deck options."""
        super().write_to_db(cursor, timestamp, id_gen)

        col_row = cursor.execute("SELECT dconf FROM col").fetchone()
        if not col_row:
            return

        deck_options = json.loads(col_row[0])
        for options in deck_options.values():
            # Only option groups that actually carry a "new" section are touched.
            if not isinstance(options, dict) or "new" not in options:
                continue
            options["new"]["order"] = 0
        cursor.execute("UPDATE col SET dconf = ?", [json.dumps(deck_options)])


def write_vocab_apkg(
    deck: genanki.Deck,
    media_files: list[Path],
    out_path: Path = VOCAB_APKG,
) -> None:
    """Write the vocabulary deck to ``out_path`` as an .apkg package.

    A plain ``genanki.Package`` is used, so new cards keep insertion order
    (frequency rank; new.order=1 default). Only media paths that exist on disk
    are bundled, followed by the font files from ``_font_media_files()``.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    bundled_media: list[str] = []
    for media_path in media_files:
        if media_path.exists():
            bundled_media.append(str(media_path))
    bundled_media += _font_media_files()

    package = genanki.Package(deck)
    package.media_files = bundled_media
    package.write_to_file(str(out_path))
    logger.info(f"Vocabulary deck written → {out_path}")


def write_conj_apkg(
    deck: genanki.Deck,
    media_files: list[Path] | None = None,
    out_path: Path = CONJ_APKG,
) -> None:
    """Write the conjugation deck to ``out_path`` as an .apkg package.

    Uses ``_RandomOrderPackage`` so new cards are presented in random order.
    Only media paths that exist on disk are bundled, followed by the font
    files from ``_font_media_files()``.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    package = _RandomOrderPackage(deck)
    present = [path for path in (media_files or []) if path.exists()]
    package.media_files = [str(path) for path in present] + _font_media_files()
    package.write_to_file(str(out_path))
    logger.info(f"Conjugation deck written → {out_path}")


def build_complete_deck(
    words: dict[str, dict],
    limit: int | None = None,
    include_audio: bool = False,
    emoji_lookup: dict | None = None,
) -> tuple[list[genanki.Deck], list[Path]]:
    """Assemble the four 'Hebrew::*' subdecks for the combined .apkg.

    Each standalone builder is run once and its notes are re-homed into a deck
    with a subdeck name and a dedicated ID. The vocabulary deck is always built
    with images enabled.

    Returns (list_of_decks, deduplicated_media_files).
    """
    logger.info(f"  Building complete deck (audio={'yes' if include_audio else 'no'}) …")

    # Run the standalone builders in a fixed order: vocab, conj, conf, plural.
    vocab_deck, vocab_media = build_vocab_deck(
        words,
        limit=limit,
        include_audio=include_audio,
        include_images=True,
        emoji_lookup=emoji_lookup,
    )
    conj_deck, conj_media = build_conj_deck(words, include_audio=include_audio)
    conf_deck, conf_media = build_confusables_deck(words, include_audio=include_audio)
    plural_deck, plural_media = build_plural_deck(words, include_audio=include_audio)

    # Copy every note into a subdeck that has its own name and ID.
    subdeck_specs = (
        (COMPLETE_VOCAB_DECK_ID, "Hebrew::Vocabulary", vocab_deck),
        (COMPLETE_CONJ_DECK_ID, "Hebrew::Conjugations", conj_deck),
        (COMPLETE_CONF_DECK_ID, "Hebrew::Confusables", conf_deck),
        (COMPLETE_PLURAL_DECK_ID, "Hebrew::Plurals", plural_deck),
    )
    decks: list[genanki.Deck] = []
    for deck_id, deck_name, standalone in subdeck_specs:
        subdeck = genanki.Deck(deck_id, deck_name)
        for note in standalone.notes:
            subdeck.add_note(note)
        decks.append(subdeck)
    complete_vocab, complete_conj, complete_conf, complete_plural = decks

    # Deduplicate media files by resolved path (raw path when the file is missing).
    seen_paths: set[str] = set()
    all_media: list[Path] = []
    for source_media in (vocab_media, conj_media, conf_media, plural_media):
        for media_path in source_media:
            identity = str(media_path.resolve()) if media_path.exists() else str(media_path)
            if identity in seen_paths:
                continue
            seen_paths.add(identity)
            all_media.append(media_path)

    logger.info(
        f"  Complete deck: {len(complete_vocab.notes)} vocab + "
        f"{len(complete_conj.notes)} conj + {len(complete_conf.notes)} conf"
        f" + {len(complete_plural.notes)} plural notes, "
        f"{len(all_media)} media files"
    )
    return decks, all_media


def write_complete_apkg(
    decks: list[genanki.Deck],
    media_files: list[Path],
    out_path: Path = COMPLETE_APKG,
) -> None:
    """Write several subdecks into one combined .apkg at ``out_path``.

    Only media paths that exist on disk are bundled, followed by the font
    files from ``_font_media_files()``.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)

    on_disk = [str(media_path) for media_path in media_files if media_path.exists()]
    package = genanki.Package(decks)
    package.media_files = on_disk + _font_media_files()
    package.write_to_file(str(out_path))
    logger.info(f"Complete deck written → {out_path}")


def build_all_variants(
    words: dict[str, dict],
    limit: int | None = None,
) -> None:
    """Build and write every release package (12 files in total).

    Variants are produced in this order:

    * vocabulary — 4 packages (audio on/off × images on/off)
    * conjugations — 2 packages (audio on/off)
    * confusables — 2 packages (audio on/off)
    * plurals — 2 packages (audio on/off)
    * combined "Hebrew::*" package — 2 packages (audio on/off)

    Args:
        words: Word data handed unchanged to every deck builder.
        limit: Forwarded to the vocabulary and combined builders only; the
            conjugation, confusables and plural builders are called without it.
    """

    def yes_no(flag: bool) -> str:
        # Human-readable rendering of a boolean for the progress log.
        return "yes" if flag else "no"

    logger.info("Building all release variants …")

    # Loaded once and shared by every builder that accepts it.
    emoji_lookup = _load_emoji_lookup()
    logger.info(f"  Emoji lookup: {len(emoji_lookup)} keywords loaded")

    # Vocabulary is the only family with two independent switches.
    vocab_matrix = (
        (False, False, VOCAB_APKG),
        (True, False, VOCAB_APKG_AUDIO),
        (False, True, VOCAB_APKG_IMAGES),
        (True, True, VOCAB_APKG_AUDIO_IMAGES),
    )
    for with_audio, with_images, target in vocab_matrix:
        label = f"audio={yes_no(with_audio)} images={yes_no(with_images)}"
        logger.info(f"  Vocab variant: {label} → {target.name}")
        vocab_deck, vocab_media = build_vocab_deck(
            words,
            limit=limit,
            include_audio=with_audio,
            include_images=with_images,
            emoji_lookup=emoji_lookup,
        )
        write_vocab_apkg(vocab_deck, vocab_media, out_path=target)

    # The remaining single-deck families differ only in builder, writer and
    # output paths, so they are driven from one table (order is significant).
    audio_only_families = (
        ("Conj", build_conj_deck, write_conj_apkg, CONJ_APKG, CONJ_APKG_AUDIO),
        ("Conf", build_confusables_deck, write_conf_apkg, CONF_APKG, CONF_APKG_AUDIO),
        ("Plural", build_plural_deck, write_plural_apkg, PLURAL_APKG, PLURAL_APKG_AUDIO),
    )
    for family, build, write, silent_path, audio_path in audio_only_families:
        for with_audio, target in ((False, silent_path), (True, audio_path)):
            label = f"audio={yes_no(with_audio)}"
            logger.info(f"  {family} variant: {label} → {target.name}")
            family_deck, family_media = build(words, include_audio=with_audio)
            write(family_deck, family_media, out_path=target)

    # Combined "Hebrew::*" complete decks
    for with_audio, target in ((False, COMPLETE_APKG), (True, COMPLETE_APKG_AUDIO)):
        complete_decks, complete_media = build_complete_deck(
            words,
            limit=limit,
            include_audio=with_audio,
            emoji_lookup=emoji_lookup,
        )
        write_complete_apkg(complete_decks, complete_media, out_path=target)

    logger.info("All variants built.")


if __name__ == "__main__":
    # Script entry point: set up INFO-level logging, then build and write one
    # vocabulary package and one conjugation package.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")

    words = _load_words()
    # The vocab build is called with limit=20 — presumably a small sample for
    # quick manual runs (TODO confirm against build_vocab_deck).
    deck, media = build_vocab_deck(words, limit=20)
    # No out_path is passed, so the writer falls back to its own default.
    write_vocab_apkg(deck, media)

    # The conjugation deck is built with the builder's default arguments and
    # likewise written to the writer's default location.
    conj_deck, conj_media = build_conj_deck(words)
    write_conj_apkg(conj_deck, conj_media)