# hebrew_flash_cards/scripts/migrate_to_json.py

"""Migration script: builds data/words.json from all existing data sources.

Run:
    python3 scripts/migrate_to_json.py
    python3 scripts/migrate_to_json.py --dry-run
"""

from __future__ import annotations

import argparse
import csv
import json
import logging
import re
import sys
import unicodedata
from collections import defaultdict
from pathlib import Path
from typing import Any

import genanki

# ---------------------------------------------------------------------------
# Bootstrap: parent package helpers
# ---------------------------------------------------------------------------
sys.path.insert(0, str(Path(__file__).parent.parent))
from helpers import strip_nikkud  # noqa: E402

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Configure logging at import time: INFO and above, rendered as
# "<LEVEL>  <message>" (two spaces, no timestamp or logger name).
logging.basicConfig(
    format="%(levelname)s  %(message)s",
    level=logging.INFO,
)
# Module-level logger for this script.
log = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# data/ directory, resolved relative to this script (<script dir>/../data).
# The input files loaded by migrate() are read from here.
DATA_DIR = Path(__file__).parent.parent / "data"
# Path of the words.json file this migration builds (see module docstring).
OUTPUT_FILE = DATA_DIR / "words.json"
# Fixed date string for this migration run.
# NOTE(review): not referenced in the visible part of this file — presumably
# stamped into the generated output; confirm.
MIGRATION_DATE = "2026-03-08"

# Matches a run of one or more consecutive characters taken from the emoji /
# pictograph / dingbat code-point ranges listed below.
# NOTE(review): the ranges overlap (U+2600-U+26FF and U+2700-U+27BF are both
# inside U+2600-U+27BF; U+1F300-U+1F9FF is covered twice). This is harmless
# inside a character class but could be simplified. re.UNICODE is already the
# default for str patterns in Python 3.
EMOJI_RE = re.compile(
    r"[\U0001F300-\U0001FFFF"
    r"\U00002600-\U000027BF"
    r"\U0001F000-\U0001F9FF"
    r"\u2600-\u26FF"
    r"\u2700-\u27BF]+",
    re.UNICODE,
)


# NFC-normalise once; used throughout for consistent Unicode comparisons.
def _nfc(s: str) -> str:
    return unicodedata.normalize("NFC", s)


# ---------------------------------------------------------------------------
# PoS → Hebrew mapping
# ---------------------------------------------------------------------------
# Canonical English part-of-speech label → Hebrew label (with nikkud).
# _parse_pos() reads this table with .get(label, ""), so a label that is not
# listed here yields an empty Hebrew string rather than an error.
# "Numeral" and "Cardinal numeral" map to the same Hebrew label.
POS_HEBREW: dict[str, str] = {
    "Noun": "שֵׁם עֶצֶם",
    "Verb": "פֹּעַל",
    "Adjective": "שֵׁם תֹּאַר",
    "Adverb": "תֹּאַר הַפֹּעַל",
    "Pronoun": "כִּנּוּי גּוּף",
    "Preposition": "מִילַּת יַחַס",
    "Conjunction": "מִילַּת חִבּוּר",
    "Interjection": "מִילַּת קְרִיאָה",
    "Numeral": "שֵׁם מִסְפָּר",
    "Cardinal numeral": "שֵׁם מִסְפָּר",
    "Particle": "מִילִּית",
    "Determiner": "מְגַדִּיר",
    "Existential": "מִילַּת קִיּוּם",
    "Interrogative": "מִילַּת שְׁאֵלָה",
}

# Transliterated binyan name → Hebrew binyan name (with nikkud).
# _parse_pos() matches the keys case-insensitively against the text after the
# dash in a verb's PoS string and appends the Hebrew name to pos_hebrew;
# migrate() looks the value up directly from a conjugation record's "binyan".
BINYAN_HEBREW: dict[str, str] = {
    "Pa'al": "פָּעַל",
    "Nif'al": "נִפְעַל",
    "Pi'el": "פִּיעֵל",
    "Pu'al": "פֻּעַל",
    "Hif'il": "הִפְעִיל",
    "Huf'al": "הֻפְעַל",
    "Hitpa'el": "הִתְפַּעֵל",
}

# Conjugation form-key ("<tense>_<person>") → person code stored in the output.
# build_conjugation_forms() logs a warning and skips any form key that is not
# listed here. "infinitive" is listed, but that function skips it before the
# lookup because the infinitive is stored separately on the conjugation block.
FORM_KEY_TO_PERSON: dict[str, str] = {
    "present_ms": "ms",
    "present_fs": "fs",
    "present_mp": "mp",
    "present_fp": "fp",
    "past_1s": "1s",
    "past_1p": "1p",
    "past_2ms": "2ms",
    "past_2fs": "2fs",
    "past_2mp": "2mp",
    "past_2fp": "2fp",
    "past_3ms": "3ms",
    "past_3fs": "3fs",
    "past_3p": "3p",
    "future_1s": "1s",
    "future_1p": "1p",
    "future_2ms": "2ms",
    "future_2fs": "2fs",
    "future_2mp": "2mp",
    "future_2fp": "2fp",
    "future_3ms": "3ms",
    "future_3fs": "3fs",
    "future_3mp": "3mp",
    "future_3fp": "3fp",
    "imperative_ms": "ms",
    "imperative_fs": "fs",
    "imperative_mp": "mp",
    "imperative_fp": "fp",
    "infinitive": "inf",
}

# Mirrors apkg_builder.PRESENT_EXPANSION — all pronoun/tense choices per present form key.
# The builder uses a per-verb seeded RNG to pick one; we store all possible GUIDs.
# Each value is a list of (pronoun, tense) pairs; _conj_guids() produces one
# candidate GUID per pair by hashing it together with the verb's infinitive.
PRESENT_EXPANSION: dict[str, list[tuple[str, str]]] = {
    "present_ms": [
        ("אֲנִי (זָכָר)", "הוֹוֶה"),
        ("אַתָּה", "הוֹוֶה"),
        ("הוּא", "הוֹוֶה"),
    ],
    "present_fs": [
        ("אֲנִי (נְקֵבָה)", "הוֹוֶה"),
        ("אַתְּ", "הוֹוֶה"),
        ("הִיא", "הוֹוֶה"),
    ],
    "present_mp": [
        ("אֲנַחְנוּ (זָכָר)", "הוֹוֶה"),
        ("אַתֶּם", "הוֹוֶה"),
        ("הֵם", "הוֹוֶה"),
    ],
    "present_fp": [
        ("אֲנַחְנוּ (נְקֵבָה)", "הוֹוֶה"),
        ("אַתֶּן", "הוֹוֶה"),
        ("הֵן", "הוֹוֶה"),
    ],
}

# Mirrors apkg_builder.PAST_3P_EXPANSION.
# The two (pronoun, tense) pairs for the "past_3p" form key; _conj_guids()
# emits one candidate GUID for each pair.
PAST_3P_EXPANSION: list[tuple[str, str]] = [
    ("הֵם", "עָבָר"),
    ("הֵן", "עָבָר"),
]

# Mirrors apkg_builder.FP_MODERN_FALLBACK.
# Feminine-plural form key → masculine-plural form key. In this file only the
# keys are consulted: _conj_guids() tests membership and then derives a single
# GUID from the form's own pronoun/tense.
FP_MODERN_FALLBACK: dict[str, str] = {
    "future_2fp": "future_2mp",
    "future_3fp": "future_3mp",
    "imperative_fp": "imperative_mp",
}

# 1st-person forms that get a randomly assigned gender label in the builder.
# For these, _conj_guids() returns two candidate GUIDs: one with the masculine
# and one with the feminine label appended to the pronoun.
_FIRST_PERSON_GENDERED: set[str] = {"past_1s", "past_1p", "future_1s", "future_1p"}


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _strip(text: str) -> str:
    """Return *text* with nikkud removed, delegating to ``helpers.strip_nikkud``."""
    without_nikkud = strip_nikkud(text)
    return without_nikkud


def _hebrew_word(nikkud: str) -> dict[str, str]:
    """Wrap a vocalised word in the ``{nikkud, ktiv_male}`` sub-object.

    ``nikkud`` holds the string exactly as given; ``ktiv_male`` holds the same
    string after it has been passed through ``_strip``.
    """
    plain = _strip(nikkud)
    return dict(nikkud=nikkud, ktiv_male=plain)


def _parse_root(raw: str) -> list[str]:
    """Parse root string like 'שׁ - מ - ר' into list of consonants.

    Returns empty list for '-' or empty input.
    """
    raw = raw.strip()
    if not raw or raw == "-":
        return []
    parts = [p.strip() for p in raw.split(" - ")]
    return [p for p in parts if p]


def _extract_emoji(meaning: str) -> tuple[str, str | None]:
    """Separate emoji from a meaning string.

    Returns:
        ``(clean_meaning, emoji)``. ``emoji`` is the first run of characters
        matched by ``EMOJI_RE`` (or ``None`` when there is no match).
        ``clean_meaning`` is the meaning with *every* such run removed,
        runs of whitespace collapsed to one space, and the ends trimmed.
    """
    first_run = EMOJI_RE.search(meaning)
    if first_run is None:
        return meaning.strip(), None

    without_emoji = EMOJI_RE.sub("", meaning).strip()
    # Removing an emoji from the middle of the text can leave a double space.
    collapsed = re.sub(r"\s{2,}", " ", without_emoji).strip()
    return collapsed, first_run.group(0)


def _parse_pos(raw_pos: str) -> tuple[str, str]:
    """Derive ``(pos_english, pos_hebrew)`` from a raw part-of-speech string.

    The raw value looks like "<PoS> – <details>", where the separator is an
    en dash (U+2013) with or without surrounding spaces. Examples:

    - "Noun – masculine"             → ("Noun", "שֵׁם עֶצֶם")
    - "Verb –pa'al"                  → ("Verb", "פֹּעַל — פָּעַל")
    - "Noun –ketelpattern, feminine" → ("Noun", "שֵׁם עֶצֶם")
    - "–"                            → ("Existential", "מִילַּת קִיּוּם")
    - "Cardinal numeral – masculine" → ("Cardinal numeral", "שֵׁם מִסְפָּר")

    A PoS with no entry in ``POS_HEBREW`` gets an empty Hebrew label. For
    verbs, a recognised binyan is appended after an em dash (" — ").
    """
    text = raw_pos.strip()

    # A bare en dash is how the source marks the existential words (יש, אין).
    if text == "–":
        return "Existential", POS_HEBREW["Existential"]

    # Everything before the first en dash is the PoS proper.
    head = re.split(r"\s*–", text)[0].strip()

    if head.lower().startswith("cardinal"):
        # The only canonical label that is two words long.
        pos_en = "Cardinal numeral"
    elif head:
        pos_en = head.split()[0].capitalize()
    else:
        # Nothing precedes the dash: fall back to the whole (trimmed) string.
        pos_en = text

    # For verbs, the text between the dash and the first comma names the binyan.
    binyan_label: str | None = None
    if pos_en == "Verb":
        _, dash, tail = text.partition("–")
        if dash:
            wanted = tail.split(",")[0].strip().lower()
            binyan_label = next(
                (hebrew for name, hebrew in BINYAN_HEBREW.items() if name.lower() == wanted),
                None,
            )

    base_label = POS_HEBREW.get(pos_en, "")
    if not binyan_label:
        return pos_en, base_label
    if base_label:
        return pos_en, f"{base_label} — {binyan_label}"
    return pos_en, binyan_label


def _strip_construct_hyphen(form: str) -> str:
    """Remove trailing maqqef hyphen from construct form (e.g. 'אֲבִי־' → 'אֲבִי')."""
    return form.rstrip("־").rstrip("-").strip()


# ---------------------------------------------------------------------------
# Data loaders
# ---------------------------------------------------------------------------


def load_csv(path: Path) -> list[dict[str, str]]:
    """Read a semicolon-delimited, UTF-8 CSV file into a list of row dicts.

    The first line of the file supplies the column names (``csv.DictReader``
    default). The number of rows loaded is logged at INFO level.

    Args:
        path: Location of the CSV file.

    Returns:
        One plain ``dict`` per data row, in file order.
    """
    # newline="" hands raw line endings to the csv module, as its documentation
    # requires; otherwise newlines embedded in quoted fields can be altered.
    with path.open(encoding="utf-8", newline="") as f:
        rows: list[dict[str, str]] = [dict(row) for row in csv.DictReader(f, delimiter=";")]
    log.info("Loaded %d rows from %s", len(rows), path.name)
    return rows


def load_json(path: Path) -> Any:
    """Parse the UTF-8 JSON file at *path* and return the decoded value.

    The file name and ``len()`` of the decoded value are logged at INFO level,
    so the top-level JSON value must be sized (an object, array or string).
    """
    with path.open(encoding="utf-8") as handle:
        raw_text = handle.read()
    parsed = json.loads(raw_text)
    log.info("Loaded %s (%d entries)", path.name, len(parsed))
    return parsed


# ---------------------------------------------------------------------------
# Build legacy GUID lookup
# ---------------------------------------------------------------------------


def build_guid_lookup(
    guid_map: dict[str, str],
) -> tuple[dict[str, str], dict[tuple[str, str], str]]:
    """Partition the legacy GUID map into two lookup tables.

    Keys of the form ``"word||meaning"`` go into a table keyed by the
    ``(word, meaning)`` tuple, split at the first ``"||"``. All other keys go
    into a table keyed by the word alone. Every key is NFC-normalised before
    it is inspected, so lookups must normalise the same way.

    Returns:
        ``(plain_word_table, disambiguated_table)``.
    """
    plain: dict[str, str] = {}
    by_meaning: dict[tuple[str, str], str] = {}
    for raw_key in guid_map:
        guid = guid_map[raw_key]
        key = _nfc(raw_key)
        word, separator, meaning = key.partition("||")
        if separator:
            by_meaning[(word, meaning)] = guid
        else:
            plain[key] = guid
    return plain, by_meaning


def resolve_guid(
    word_nikkud: str,
    meaning: str,
    base: dict[str, str],
    disambig: dict[tuple[str, str], str],
) -> str | None:
    """Look up the legacy GUID for a word, preferring meaning-specific entries.

    Resolution order (word and meaning are NFC-normalised first):
        1. Exact ``(word, meaning)`` hit in *disambig*.
        2. First *disambig* entry for the same word whose stored meaning is a
           prefix of *meaning*, or which starts with the first 20 characters
           of *meaning* (covers meanings truncated on either side).
        3. Plain-word entry in *base*.

    Args:
        word_nikkud: Vocalised word as it appears in the CSV.
        meaning: Raw meaning string of the row.
        base: Plain-word table from ``build_guid_lookup``.
        disambig: ``(word, meaning)`` table from ``build_guid_lookup``.

    Returns:
        The GUID, or ``None`` when nothing matches.
    """
    w = _nfc(word_nikkud)
    m = _nfc(meaning)
    # Prefer explicit disambiguation
    if (w, m) in disambig:
        return disambig[(w, m)]
    # Prefix matching only makes sense for a non-empty meaning: every string
    # starts with "", so an empty meaning would otherwise "match" whichever
    # disambiguated entry for this word happens to come first.
    if m:
        m_head = m[:20]
        for (dw, dm), g in disambig.items():
            if dw == w and (m.startswith(dm) or dm.startswith(m_head)):
                return g
    return base.get(w)


# ---------------------------------------------------------------------------
# Unique key generation
# ---------------------------------------------------------------------------


def build_unique_keys(
    rows: list[dict[str, str]],
) -> tuple[dict[int, str], list[str]]:
    """Give every CSV row (identified by its index) a unique key.

    A row keeps the shortest key that is unique among the rows it collides
    with, escalating one level at a time:

        1. ``word``                   (the "Word" column)
        2. ``word|pos``               (if the word alone collides)
        3. ``word|pos|meaning``       (if word+pos collides)
        4. ``word|pos|meaning|N``     (N = 2, 3, … for exact duplicates; the
                                       first duplicate keeps the level-3 key)

    ``pos`` is the text before the first en dash or hyphen in the
    "Part of Speech" column; a bare en dash becomes "Existential".

    Returns:
        idx_to_key  — map from CSV row index to unique key
        collisions  — human-readable description of every escalation step
    """
    collisions: list[str] = []
    idx_to_key: dict[int, str] = {}

    def _pos_short(pos: str) -> str:
        """Short PoS label used inside keys."""
        return "Existential" if pos == "–" else re.split(r"\s*[–-]", pos)[0].strip()

    def _bucket(indices, key_of) -> dict[str, list[int]]:
        """Group row indices by ``key_of(index)``, keeping first-seen order."""
        buckets: dict[str, list[int]] = defaultdict(list)
        for idx in indices:
            buckets[key_of(idx)].append(idx)
        return buckets

    # Level 1: the vocalised word on its own.
    by_word = _bucket(range(len(rows)), lambda idx: rows[idx]["Word"])
    for word, word_rows in by_word.items():
        if len(word_rows) == 1:
            idx_to_key[word_rows[0]] = word
            continue

        collisions.append(f"Nikkud collision '{word}' ({len(word_rows)} rows) — escalating to word|pos")

        # Level 2: word plus short part of speech.
        by_pos = _bucket(
            word_rows,
            lambda idx: f"{word}|{_pos_short(rows[idx]['Part of Speech'])}",
        )
        for pos_key, pos_rows in by_pos.items():
            if len(pos_rows) == 1:
                idx_to_key[pos_rows[0]] = pos_key
                continue

            collisions.append(
                f"  Nikkud+PoS collision '{pos_key}' ({len(pos_rows)} rows) — escalating to word|pos|meaning"
            )

            # Level 3: word, part of speech and meaning.
            by_meaning = _bucket(pos_rows, lambda idx: f"{pos_key}|{rows[idx]['Meaning']}")
            for full_key, same_rows in by_meaning.items():
                if len(same_rows) > 1:
                    collisions.append(
                        f"    Exact duplicate '{full_key}' ({len(same_rows)} rows, same slug) "
                        f"— appending numeric suffix"
                    )
                # Level 4: the first row keeps the bare key, later ones get |2, |3, …
                first, *extras = same_rows
                idx_to_key[first] = full_key
                for n, extra in enumerate(extras, start=2):
                    idx_to_key[extra] = f"{full_key}|{n}"

    # Sanity check: every row index must have received a key.
    missing = sum(1 for idx in range(len(rows)) if idx not in idx_to_key)
    if missing:
        log.error("BUG: %d rows have no unique_key assigned!", missing)

    return idx_to_key, collisions


# ---------------------------------------------------------------------------
# Conjugation builder
# ---------------------------------------------------------------------------


def _conj_guids(
    infinitive_nikkud: str,
    form_key: str,
    form_data: dict,
) -> list[str]:
    """List every GUID a conjugation form's note could have.

    Each GUID is ``genanki.guid_for(infinitive_nikkud, pronoun, tense)``; the
    function's job is to decide which (pronoun, tense) pairs apply:

    - Present-tense keys: every pair listed in ``PRESENT_EXPANSION``.
    - ``past_3p``: both pairs of ``PAST_3P_EXPANSION``.
    - ``FP_MODERN_FALLBACK`` keys: the form's own pronoun/tense, one GUID.
    - Gendered 1st-person keys: the form's pronoun with the masculine and
      the feminine label appended, two GUIDs.
    - Anything else: the form's own pronoun/tense, one GUID.

    Per the mirrored builder tables, the builder picks a single variant with a
    seeded RNG; all candidates are returned here so the live one can be
    identified later.
    """
    if form_key in PRESENT_EXPANSION:
        pairs = PRESENT_EXPANSION[form_key]
    elif form_key == "past_3p":
        pairs = PAST_3P_EXPANSION
    else:
        pronoun = form_data.get("pronoun", "")
        tense = form_data.get("tense", "")
        gendered = form_key in _FIRST_PERSON_GENDERED and form_key not in FP_MODERN_FALLBACK
        if gendered:
            # Builder appends " (זָכָר)" or " (נְקֵבָה)" — keep both variants.
            pairs = [
                (f"{pronoun} (זָכָר)", tense),
                (f"{pronoun} (נְקֵבָה)", tense),
            ]
        else:
            pairs = [(pronoun, tense)]

    return [genanki.guid_for(infinitive_nikkud, who, when) for who, when in pairs]


def build_conjugation_forms(forms_dict: dict, infinitive_nikkud: str = "") -> list[dict]:
    """Turn a raw ``forms`` mapping into a list of conjugation-form records.

    Skipped entries: the ``"infinitive"`` key (stored separately on the
    conjugation block), keys missing from ``FORM_KEY_TO_PERSON`` (a warning
    is logged), and entries whose ``"form"`` is empty.

    Args:
        forms_dict: Raw forms dict from conjugations.json, keyed by form key.
        infinitive_nikkud: Vocalised infinitive used to derive GUIDs. When
            empty, no GUIDs are generated.

    Returns:
        One dict per kept form. ``guid`` is set when exactly one GUID is
        possible, ``guid_candidates`` when several are; the other is ``None``.
    """
    records: list[dict] = []
    for form_key in forms_dict:
        if form_key == "infinitive":
            continue

        person = FORM_KEY_TO_PERSON.get(form_key)
        if person is None:
            log.warning("Unknown form key: %s", form_key)
            continue

        form_data = forms_dict[form_key]
        surface = form_data.get("form", "")
        if not surface:
            continue

        # All candidate GUIDs are kept; none is selected here.
        candidates: list[str] = []
        if infinitive_nikkud:
            candidates = _conj_guids(infinitive_nikkud, form_key, form_data)
        only_one = len(candidates) == 1
        several = len(candidates) > 1

        record = {
            "person": person,
            "tense": form_data.get("tense", ""),
            "pronoun_hebrew": form_data.get("pronoun", ""),
            "form": _hebrew_word(surface),
            "audio_url": form_data.get("audio_url") or None,
            "audio_file": None,
            "guid": candidates[0] if only_one else None,
            "guid_candidates": candidates if several else None,
        }
        records.append(record)
    return records


# ---------------------------------------------------------------------------
# Main migration
# ---------------------------------------------------------------------------


def migrate(dry_run: bool = False) -> None:  # noqa: C901 (complex but linear)
    # ------------------------------------------------------------------
    # 1. Load all sources
    # ------------------------------------------------------------------
    csv_rows = load_csv(DATA_DIR / "hebrew_dict_for_anki.csv")
    conjugations: dict = load_json(DATA_DIR / "conjugations.json")
    noun_plurals: dict = load_json(DATA_DIR / "noun_plurals.json")
    vetted_sentences: dict = load_json(DATA_DIR / "vetted_sentences.json")
    guid_map_raw: dict = load_json(DATA_DIR / "legacy_guid_map.json")
    refined_meanings: dict = load_json(DATA_DIR / "refined_meanings.json")
    image_cache: dict = load_json(DATA_DIR / "image_cache.json")
    frequency_cache: dict = load_json(DATA_DIR / "frequency_cache.json")
    # ------------------------------------------------------------------
    # 2. Pre-process lookups
    # ------------------------------------------------------------------
    guid_base, guid_disambig = build_guid_lookup(guid_map_raw)

    # noun_plurals: two lookup maps — by slug (primary), by nikkud singular (fallback)
    plurals_by_slug: dict[str, dict] = {}
    plurals_by_nikkud: dict[str, dict] = {}
    for pdata in noun_plurals.values():
        slug = pdata.get("slug", "")
        if slug:
            plurals_by_slug[slug] = pdata
        sing = _nfc(pdata.get("singular", ""))
        if sing:
            plurals_by_nikkud[sing] = pdata

    # vetted_sentences: keyed by stripped word; build NFC lookup of word_nikkud too
    sentences_by_stripped: dict[str, dict] = {}
    for sdata in vetted_sentences.values():
        wn = sdata.get("word_nikkud", "")
        if wn:
            sentences_by_stripped[_strip(wn)] = sdata

    # conjugations: indexed by slug (100% coverage) and by stripped infinitive
    # Some active/passive pairs share the same slug (e.g. הופל/להפיל → 1231-lehapil).
    # When slug collides, always prefer the ACTIVE verb in conj_by_slug so the
    # entry is correctly associated with its active conjugation data.
    conj_by_slug: dict[str, dict] = {}
    conj_by_stripped_inf: dict[str, dict] = {}
    for cdata in conjugations.values():
        slug = cdata.get("slug", "")
        if slug:
            existing = conj_by_slug.get(slug)
            if existing is None:
                conj_by_slug[slug] = cdata
            elif cdata.get("is_passive") and not existing.get("is_passive"):
                # Keep the active verb; skip overwriting with passive
                pass
            elif existing.get("is_passive") and not cdata.get("is_passive"):
                # Replace passive with active
                conj_by_slug[slug] = cdata
            else:
                conj_by_slug[slug] = cdata
        inf = cdata.get("infinitive", "")
        if inf:
            conj_by_stripped_inf[_strip(inf)] = cdata

    # Build passive→active link:
    # passive verbs store reference_form = nikkud infinitive of the ACTIVE verb
    # We need: active_slug → passive_conj_data
    passive_by_active_slug: dict[str, dict] = {}
    for cdata in conjugations.values():
        if not cdata.get("is_passive"):
            continue
        ref_nikkud = cdata.get("reference_form", "")
        ref_stripped = _strip(ref_nikkud)
        # find the active verb's slug
        active_cdata = conj_by_stripped_inf.get(ref_stripped)
        if active_cdata:
            active_slug = active_cdata.get("slug", "")
            if active_slug:
                passive_by_active_slug[active_slug] = cdata
        else:
            log.warning(
                "Passive verb '%s' references active '%s' (stripped='%s') — no match in conjugations",
                cdata.get("infinitive"),
                ref_nikkud,
                ref_stripped,
            )

    # refined_meanings: NFC-keyed
    refined_nfc: dict[str, str] = {_nfc(k): v for k, v in refined_meanings.items()}

    # image_cache: stripped-word keyed
    image_stripped: dict[str, str | None] = dict(image_cache)

    # frequency_cache: stripped-word keyed
    freq_stripped: dict[str, int] = {k: int(v) for k, v in frequency_cache.items() if v is not None}

    # ------------------------------------------------------------------
    # 3. Assign unique keys
    # ------------------------------------------------------------------
    idx_to_key, collisions = build_unique_keys(csv_rows)
    for msg in collisions:
        log.info("KEY COLLISION: %s", msg)
    log.info("Collision summary: %d collision events", len(collisions))

    # ------------------------------------------------------------------
    # 3b. Identify exact-duplicate |N suffix rows to skip
    # ------------------------------------------------------------------
    # |N suffix rows (N=2,3,…) are true CSV exact-duplicates that share the
    # same slug as the base entry.  We drop them entirely so the unique_key
    # space stays clean and no GUID collisions are emitted.
    import re as _re

    _dup_indices: set[int] = set()
    for _i, _k in idx_to_key.items():
        if _re.search(r"\|\d+$", _k):
            _base_k = _re.sub(r"\|\d+$", "", _k)
            _base_i = next((j for j, kk in idx_to_key.items() if kk == _base_k), None)
            if _base_i is not None and csv_rows[_i]["slug"] == csv_rows[_base_i]["slug"]:
                _dup_indices.add(_i)
    if _dup_indices:
        log.info(
            "Skipping %d exact-duplicate |N suffix rows (same slug as base entry)",
            len(_dup_indices),
        )

    # ------------------------------------------------------------------
    # 4. Confusable groups: group by ktiv_male (from ktiv_male_forms)
    # ------------------------------------------------------------------
    # Build: stripped_word → set of slugs sharing that ktiv_male form
    # We care about the *base* form (absolute_singular or absolute form of the word).
    # Strategy: use "Word Without Nikkud" from CSV as ktiv_male, then group slugs.
    # A confusable group = multiple *different* slugs sharing the same ktiv_male.
    slug_to_ktiv_male: dict[str, str] = {}
    for row in csv_rows:
        slug_to_ktiv_male[row["slug"]] = row["Word Without Nikkud"]

    ktiv_male_to_slugs: dict[str, set[str]] = defaultdict(set)
    for slug, km in slug_to_ktiv_male.items():
        ktiv_male_to_slugs[km].add(slug)

    # Only keep those with >1 distinct slug
    confusable_slug_groups: dict[str, set[str]] = {
        km: slugs for km, slugs in ktiv_male_to_slugs.items() if len(slugs) > 1
    }
    log.info("Confusable ktiv_male groups: %d", len(confusable_slug_groups))

    # Build reverse: slug → list of co-confusable slugs
    slug_to_confusable_slugs: dict[str, set[str]] = {}
    for _km, slugs in confusable_slug_groups.items():
        for slug in slugs:
            slug_to_confusable_slugs[slug] = slugs - {slug}

    # We need to map slug → unique_key(s) for the confusable_group field
    # But unique_key is per-row; one slug may map to multiple keys (duplicate entries with same slug).
    # Exclude exact-duplicate rows so dropped entries don't pollute confusable groups.
    slug_to_unique_keys: dict[str, list[str]] = defaultdict(list)
    for i, row in enumerate(csv_rows):
        if i not in _dup_indices:
            slug_to_unique_keys[row["slug"]].append(idx_to_key[i])

    # ------------------------------------------------------------------
    # 5. Build entries
    # ------------------------------------------------------------------
    words: dict[str, dict] = {}
    stats = {
        "total": 0,
        "has_conjugation": 0,
        "has_noun_inflection": 0,
        "has_examples": 0,
        "has_guid": 0,
        "has_image": 0,
        "has_frequency": 0,
        "has_hint": 0,
        "has_emoji": 0,
        "key_collisions": len(collisions),
    }

    for i, row in enumerate(csv_rows):
        if i in _dup_indices:
            continue
        unique_key = idx_to_key[i]
        word_nikkud = row["Word"]
        word_ktiv = row["Word Without Nikkud"]
        slug = row["slug"]
        raw_pos = row["Part of Speech"]
        meaning_raw = row["Meaning"]
        audio_url = row["audio_url"] or None
        tags = row["tags"] or ""

        # -- PoS
        pos_en, pos_hebrew = _parse_pos(raw_pos)

        # -- Root
        root = _parse_root(row["Root"])

        # -- Meaning + emoji
        meaning_clean, emoji_char = _extract_emoji(meaning_raw)

        # -- GUID
        guid = resolve_guid(word_nikkud, meaning_raw, guid_base, guid_disambig)
        if guid:
            stats["has_guid"] += 1

        # -- Frequency (keyed by ktiv_male / stripped)
        frequency = freq_stripped.get(word_ktiv)
        if frequency:
            stats["has_frequency"] += 1

        # -- Image
        image_filename = image_stripped.get(word_ktiv)
        if image_filename:
            stats["has_image"] += 1

        # -- Hint (refined_meanings, NFC-keyed by nikkud)
        hint = refined_nfc.get(_nfc(word_nikkud), "")
        if hint:
            stats["has_hint"] += 1

        # -- Examples (vetted_sentences keyed by stripped word)
        examples_block: dict | None = None
        s_data = sentences_by_stripped.get(word_ktiv)
        if s_data:
            good = s_data.get("good_sentences", [])
            if good:
                vetted_list = [
                    {
                        "text": s["text"],
                        "source": s.get("book", "unknown"),
                        "vetted": True,
                    }
                    for s in good
                ]
                # Pick best cloze sentence (first good one)
                cloze_sent = good[0]
                # cloze_guid: deterministic ID for the cloze card on this vocab note.
                # Pattern: guid_for(word_nikkud, "cloze") — unique per word.
                _cloze_guid = genanki.guid_for(word_nikkud, "cloze")
                _cloze_text = cloze_sent["text"]

                # Compute cloze_word_start / cloze_word_end from the text.
                # Strategy (in order):
                #   1. Use stored offsets if already present in source data.
                #   2. Exact nikkud form search.
                #   3. Exact ktiv_male (plain consonants) search in the raw text.
                #   4. Scan each Hebrew word token in the text; match by stripped consonants.
                #      This handles inflected/construct/plural forms with different nikkud.
                _cw_start: int | None = cloze_sent.get("cloze_word_start")
                _cw_end: int | None = cloze_sent.get("cloze_word_end")
                if _cw_start is None or _cw_end is None:
                    _idx = _cloze_text.find(word_nikkud)
                    if _idx >= 0:
                        _cw_start = _idx
                        _cw_end = _idx + len(word_nikkud)
                    else:
                        # Try exact ktiv_male substring
                        _idx2 = _cloze_text.find(word_ktiv)
                        if _idx2 >= 0:
                            _cw_start = _idx2
                            _cw_end = _idx2 + len(word_ktiv)
                        else:
                            # Scan Hebrew word tokens; find one whose stripped form
                            # matches word_ktiv (handles inflected/construct/plural).
                            _HEBREW_TOK = re.compile(
                                r"[\u05D0-\u05FA\u05B0-\u05BD\u05BF\u05C1\u05C2\u05C7"
                                r"\uFB1D-\uFB4E]+"
                            )
                            for _m in _HEBREW_TOK.finditer(_cloze_text):
                                if _strip(_m.group(0)) == word_ktiv:
                                    _cw_start = _m.start()
                                    _cw_end = _m.end()
                                    break
                            # else leave both as None

                cloze_block = {
                    "text": _cloze_text,
                    "cloze_word_start": _cw_start,
                    "cloze_word_end": _cw_end,
                    "cloze_hint": cloze_sent.get("cloze_hint"),
                    "cloze_guid": _cloze_guid,
                }
                examples_block = {
                    "vetted": vetted_list,
                    "cloze": cloze_block,
                    "rejected_count": s_data.get("rejected_count", 0),
                }
                stats["has_examples"] += 1

        # -- Noun inflection
        noun_inflection: dict | None = None
        pdata = plurals_by_slug.get(slug) or plurals_by_nikkud.get(_nfc(word_nikkud))
        if pdata and pos_en.startswith("Noun"):

            def _hw_or_null(nk: str) -> dict | None:
                nk = _strip_construct_hyphen(nk)
                return _hebrew_word(nk) if nk else None

            gender = pdata.get("gender") or None
            gender_hebrew_map = {
                "masculine": {"nikkud": "זָכָר", "ktiv_male": "זכר"},
                "feminine": {"nikkud": "נְקֵבָה", "ktiv_male": "נקבה"},
            }
            # Plural GUID mirrors apkg_builder line 1609: guid_for("plural", singular_nikkud)
            _plural_singular_nikkud = pdata.get("singular", "")
            _plurals_guid = genanki.guid_for("plural", _plural_singular_nikkud) if _plural_singular_nikkud else None
            noun_inflection = {
                "plurals_guid": _plurals_guid,
                "singular": _hw_or_null(pdata.get("singular", "")),
                "plural": _hw_or_null(pdata.get("plural", "")),
                "singular_audio": pdata.get("singular_audio") or None,
                "plural_audio": pdata.get("plural_audio") or None,
                "construct_singular": _hw_or_null(pdata.get("construct_singular", "")),
                "construct_plural": _hw_or_null(pdata.get("construct_plural", "")),
                "pronominal_suffixes": None,
                "gender": gender,
                "gender_hebrew": gender_hebrew_map.get(gender) if gender else None,
                "mishkal": pdata.get("mishkal") or None,
                "mishkal_hebrew": None,
            }
            stats["has_noun_inflection"] += 1

        # -- Verb conjugation
        conjugation_block: dict | None = None
        cdata = conj_by_slug.get(slug)
        if cdata and not cdata.get("is_passive"):
            # This entry is an active verb with conjugation data
            forms_dict = cdata.get("forms", {})
            # Resolve infinitive nikkud for GUID generation (prefer forms dict, fall back to cdata key)
            _inf_data = forms_dict.get("infinitive", {})
            _inf_nikkud_for_guid = _inf_data.get("form", "") or cdata.get("infinitive", "")
            active_forms = build_conjugation_forms(forms_dict, _inf_nikkud_for_guid)

            # Passive counterpart, if any
            passive_cdata = passive_by_active_slug.get(slug)
            hufal_pual_forms: list | None = None
            reference_form_passive: dict | None = None
            if passive_cdata:
                passive_forms_dict = passive_cdata.get("forms", {})
                _passive_inf_data = passive_forms_dict.get("infinitive", {})
                _passive_inf_nikkud = _passive_inf_data.get("form", "") or passive_cdata.get("infinitive", "")
                hufal_pual_forms = build_conjugation_forms(passive_forms_dict, _passive_inf_nikkud)
                # reference_form of passive = active infinitive; 3ms past is in its forms
                rf_passive_nikkud = passive_cdata.get("forms", {}).get("past_3ms", {}).get("form", "")
                if rf_passive_nikkud:
                    reference_form_passive = _hebrew_word(rf_passive_nikkud)

            # Infinitive form (from forms dict)
            inf_form_data = forms_dict.get("infinitive", {})
            inf_nikkud = inf_form_data.get("form", "") or cdata.get("infinitive", "")
            infinitive_hw = _hebrew_word(inf_nikkud) if inf_nikkud else None

            # Reference form
            ref_nikkud = cdata.get("reference_form", "")
            reference_form_hw = _hebrew_word(ref_nikkud) if ref_nikkud else None

            binyan = cdata.get("binyan", "")
            binyan_hebrew = BINYAN_HEBREW.get(binyan, "")

            conjugation_block = {
                "in_conjugation_deck": True,
                "infinitive": infinitive_hw,
                "reference_form": reference_form_hw,
                "binyan": binyan,
                "binyan_hebrew": binyan_hebrew,
                "prep": None,
                "active_forms": active_forms,
                "hufal_pual_forms": hufal_pual_forms,
                "reference_form_passive": reference_form_passive,
            }
            stats["has_conjugation"] += 1

        elif cdata and cdata.get("is_passive"):
            # Passive-only entry: store a minimal conjugation block referencing the active verb
            binyan = cdata.get("binyan", "")
            binyan_hebrew = BINYAN_HEBREW.get(binyan, "")
            forms_dict = cdata.get("forms", {})
            _passive_only_inf_data = forms_dict.get("infinitive", {})
            _passive_only_inf_nikkud = _passive_only_inf_data.get("form", "") or cdata.get("infinitive", "")
            passive_forms = build_conjugation_forms(forms_dict, _passive_only_inf_nikkud)

            inf_form_data = forms_dict.get("infinitive", {})
            inf_nikkud = inf_form_data.get("form", "") or cdata.get("infinitive", "")
            infinitive_hw = _hebrew_word(inf_nikkud) if inf_nikkud else None

            ref_nikkud = cdata.get("reference_form", "")
            reference_form_hw = _hebrew_word(ref_nikkud) if ref_nikkud else None

            conjugation_block = {
                "in_conjugation_deck": True,
                "infinitive": infinitive_hw,
                "reference_form": reference_form_hw,
                "binyan": binyan,
                "binyan_hebrew": binyan_hebrew,
                "prep": None,
                "active_forms": passive_forms,
                "hufal_pual_forms": None,
                "reference_form_passive": None,
            }
            stats["has_conjugation"] += 1

        # -- Confusable group (filled in pass 2 below)
        # -- Shared roots (filled in pass 2 below)

        # -- Audio filename: slug-based for confusables, word-based otherwise
        audio_file = f"{word_ktiv}.mp3"

        entry: dict = {
            "word": {"nikkud": word_nikkud, "ktiv_male": word_ktiv},
            "slug": slug,
            "root": root,
            "pos": pos_en,
            "pos_hebrew": pos_hebrew,
            "meaning": meaning_clean,
            "meaning_raw": meaning_raw,
            "audio_url": audio_url,
            "audio_file": audio_file,
            "tags": tags,
            "last_scrape_date": MIGRATION_DATE,
            # Identity
            "vocab_legacy_guid": guid,
            # Frequency
            "frequency": frequency,
            "pseudo_frequency": None,
            # Display
            "emoji": emoji_char,
            "emoji_source": "from_pealim" if emoji_char else None,
            "emoji_visible": False,
            "image": image_filename,
            "image_source": "wikipedia" if image_filename else None,
            "hint": hint,
            # Populated in pass 2
            "shared_roots": [],
            "confusable_group": None,
            "confusables_guid": None,
            # Sub-sections
            "examples": examples_block,
            "noun_inflection": noun_inflection,
            "conjugation": conjugation_block,
            "adjective_inflection": None,
            "preposition_inflection": None,
        }

        if emoji_char:
            stats["has_emoji"] += 1

        if unique_key in words:
            log.warning(
                "DUPLICATE unique_key '%s' — row %d would overwrite row %d",
                unique_key,
                i,
                list(words.keys()).index(unique_key),
            )
        words[unique_key] = entry
        stats["total"] += 1

    # ------------------------------------------------------------------
    # 6. Pass 2 — shared_roots and confusable_group
    # ------------------------------------------------------------------

    # shared_roots: group unique_keys by root tuple
    root_to_keys: dict[tuple, list[str]] = defaultdict(list)
    for uk, entry in words.items():
        r = entry["root"]
        if r:
            root_to_keys[tuple(r)].append(uk)

    for uks in root_to_keys.values():
        if len(uks) > 1:
            for uk in uks:
                words[uk]["shared_roots"] = [k for k in uks if k != uk]

    # confusable_group: update audio_file to slug-based for confusable words
    # Also set confusables_guid: genanki.guid_for("confusable", ktiv_male)
    # where ktiv_male is the shared stripped form (key in confusable_slug_groups).
    # Build reverse: slug → ktiv_male (for GUID generation)
    slug_to_confusable_ktiv_male: dict[str, str] = {}
    for km, slugs in confusable_slug_groups.items():
        for slug_in_group in slugs:
            slug_to_confusable_ktiv_male[slug_in_group] = km

    for i, row in enumerate(csv_rows):
        if i in _dup_indices:
            continue
        slug = row["slug"]
        uk = idx_to_key[i]
        co_slugs = slug_to_confusable_slugs.get(slug, set())
        if co_slugs:
            # Gather all unique_keys for co-confusable slugs
            group_keys: list[str] = []
            for co_slug in co_slugs:
                group_keys.extend(slug_to_unique_keys.get(co_slug, []))
            group_keys.append(uk)
            group_keys = sorted(set(group_keys))
            words[uk]["confusable_group"] = group_keys
            # confusables_guid: mirrors apkg_builder line 1401
            ktiv_male_key = slug_to_confusable_ktiv_male.get(slug, "")
            if ktiv_male_key:
                words[uk]["confusables_guid"] = genanki.guid_for("confusable", ktiv_male_key)
            # Use slug-based audio file for confusables to disambiguate
            words[uk]["audio_file"] = f"{slug}.mp3"

    # ------------------------------------------------------------------
    # 7. Stats report
    # ------------------------------------------------------------------
    log.info("=" * 60)
    log.info("MIGRATION COMPLETE — summary stats:")
    log.info("  Total entries:       %d", stats["total"])
    log.info("  Key collision events: %d", stats["key_collisions"])
    log.info("  Has conjugation:     %d", stats["has_conjugation"])
    log.info("  Has noun_inflection: %d", stats["has_noun_inflection"])
    log.info("  Has examples:        %d", stats["has_examples"])
    log.info("  Has legacy GUID:     %d", stats["has_guid"])
    log.info("  Has image:           %d", stats["has_image"])
    log.info("  Has frequency:       %d", stats["has_frequency"])
    log.info("  Has hint:            %d", stats["has_hint"])
    log.info("  Has emoji:           %d", stats["has_emoji"])
    # Confusable entries
    confusable_entries = sum(1 for e in words.values() if e["confusable_group"])
    log.info("  In confusable group: %d", confusable_entries)
    # Entries with shared roots
    with_shared_roots = sum(1 for e in words.values() if e["shared_roots"])
    log.info("  Has shared roots:    %d", with_shared_roots)

    if dry_run:
        log.info("DRY RUN — output file NOT written.")
        return

    # ------------------------------------------------------------------
    # 8. Write output
    # ------------------------------------------------------------------
    with OUTPUT_FILE.open("w", encoding="utf-8") as f:
        json.dump(words, f, ensure_ascii=False, indent=2)
        f.write("\n")

    log.info("Wrote %d entries to %s", len(words), OUTPUT_FILE)


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------


def main() -> None:
    """Command-line entry point: parse the options and run the migration.

    The only option is ``--dry-run``. Its boolean value is forwarded to
    ``migrate`` as the ``dry_run`` keyword argument; when set, the migration
    logs its summary stats but does not write the output file.
    """
    cli = argparse.ArgumentParser(description="Migrate all pealim data sources into data/words.json")
    cli.add_argument("--dry-run", action="store_true", help="Print stats without writing the output file.")

    # argparse exposes "--dry-run" under the attribute name ``dry_run``.
    options = cli.parse_args()
    migrate(dry_run=options.dry_run)


# Run the CLI only when this file is executed as a script (e.g.
# ``python3 scripts/migrate_to_json.py``), not when it is imported.
if __name__ == "__main__":
    main()