# hebrew_flash_cards/scripts/check_guid_coverage.py

"""Check that every GUID in the last-release complete .apkg exists in words.json.

Extracts GUIDs from the Anki SQLite database inside the .apkg (zip) file,
then compares against all GUID fields stored in data/words.json.

Usage:
    python3 scripts/check_guid_coverage.py
    python3 scripts/check_guid_coverage.py --apkg output/hebrew_complete.apkg
    python3 scripts/check_guid_coverage.py --verbose
"""

from __future__ import annotations

import argparse
import json
import os
import sqlite3
import sys
import tempfile
import zipfile
from pathlib import Path
from typing import Any

# Parent of the directory that contains this script.
PROJECT_ROOT = Path(__file__).parent.parent
# Package that is checked when --apkg is not given on the command line.
DEFAULT_APKG = PROJECT_ROOT / "output" / "hebrew_complete.apkg"
# Word data file whose GUID fields are compared against the package.
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"

# Known model IDs (from apkg_builder.py)
# Keys are matched against the ``mid`` column of the package's ``notes`` table;
# values are the deck labels used when reporting results.
MODEL_IDS: dict[int, str] = {
    1701222017968: "vocab",
    1234567893: "conjugation",
    1234567897: "plurals",
    1234567895: "confusables",
}


def extract_apkg_guids(apkg_path: Path) -> dict[int, set[str]]:
    """Extract note GUIDs from an .apkg archive, grouped by model ID.

    Unpacks the ``collection.anki2`` SQLite database from the zip archive into
    a temporary directory and reads the ``guid`` and ``mid`` columns of its
    ``notes`` table.

    Args:
        apkg_path: Path to the .apkg (zip) file.

    Returns:
        Mapping of model ID (``notes.mid``) to the set of GUIDs of the notes
        that use that model. Empty when the table has no rows.

    Raises:
        zipfile.BadZipFile: If *apkg_path* is not a valid zip archive.
        sqlite3.OperationalError: If the archive has no ``collection.anki2``
            member, or that database has no ``notes`` table.
    """
    # NOTE(review): some Anki export formats reportedly keep the real data in
    # a differently named member (e.g. collection.anki21) -- confirm that the
    # release package still stores its notes in collection.anki2.
    member = "collection.anki2"
    by_model: dict[int, set[str]] = {}
    with zipfile.ZipFile(apkg_path) as z, tempfile.TemporaryDirectory() as td:
        if member not in z.namelist():
            # Without this check sqlite3.connect() would silently create an
            # empty database and fail later with "no such table: notes".
            # The same exception type is raised, with an accurate message.
            raise sqlite3.OperationalError(f"{member} not found in {apkg_path}")
        # Only the collection database is needed; media files stay packed.
        db_path = z.extract(member, td)
        conn = sqlite3.connect(db_path)
        try:
            for guid, mid in conn.execute("SELECT guid, mid FROM notes"):
                by_model.setdefault(mid, set()).add(guid)
        finally:
            # Close even when the query fails so the temporary directory can
            # always be removed (an open handle blocks deletion on Windows).
            conn.close()
    return by_model


def collect_words_json_guids(data: dict[str, Any]) -> dict[str, set[str]]:
    """Gather every GUID recorded in words.json, bucketed by deck type.

    Args:
        data: Parsed words.json content; only its values (the per-word
            entries) are inspected.

    Returns:
        Dict with the keys ``"vocab"``, ``"cloze"``, ``"conjugation"``,
        ``"plurals"`` and ``"confusables"``, each mapping to the set of GUIDs
        found for that deck type.
    """
    buckets: dict[str, set[str]] = {
        deck: set()
        for deck in ("vocab", "cloze", "conjugation", "plurals", "confusables")
    }

    def remember(deck: str, guid: Any) -> None:
        # Absent or empty GUID values are ignored.
        if guid:
            buckets[deck].add(guid)

    for word in data.values():
        # Legacy vocab GUID sits at the top level of the entry.
        remember("vocab", word.get("vocab_legacy_guid"))

        # Cloze GUID is nested: examples -> cloze -> cloze_guid.
        cloze_block = (word.get("examples") or {}).get("cloze") or {}
        remember("cloze", cloze_block.get("cloze_guid"))

        # Plurals GUID is kept inside noun_inflection.
        inflection = word.get("noun_inflection") or {}
        remember("plurals", inflection.get("plurals_guid"))

        # Confusables GUID is a top-level field.
        remember("confusables", word.get("confusables_guid"))

        # Each conjugation form carries its own GUID plus optional candidates.
        conjugation = word.get("conjugation") or {}
        for list_name in ("active_forms", "hufal_pual_forms"):
            for verb_form in conjugation.get(list_name) or ():
                remember("conjugation", verb_form.get("guid"))
                # Candidates are added as-is, without the emptiness filter.
                buckets["conjugation"].update(verb_form.get("guid_candidates") or ())

    return buckets


def _print_guid_sample(heading: str, guids: set[str], limit: int) -> None:
    """Print *heading*, then up to *limit* GUIDs in sorted order and a count of the rest."""
    print(heading)
    for g in sorted(guids)[:limit]:
        print(f"           {g!r}")
    if len(guids) > limit:
        print(f"           ... ({len(guids) - limit} more)")


def main() -> None:
    """Compare the GUIDs in an .apkg with those in words.json and report coverage.

    Command-line options:
        --apkg PATH    Package to check (defaults to ``DEFAULT_APKG``).
        --verbose, -v  Also list a sample of the missing and extra GUIDs.

    For every model in ``MODEL_IDS`` a PASS/FAIL line is printed. A model
    fails when the package contains GUIDs that words.json does not; GUIDs that
    exist only in words.json are reported as "extra" but do not fail the run.
    Notes whose model ID is not in ``MODEL_IDS`` are reported with a warning
    and are not compared.

    Exit status (always leaves via ``sys.exit``):
        0: every GUID of the known models is present in words.json.
        1: at least one package GUID is missing from words.json.
        2: the .apkg or words.json file does not exist.
    """
    parser = argparse.ArgumentParser(description="Check GUID coverage between .apkg and words.json")
    parser.add_argument(
        "--apkg",
        type=Path,
        default=DEFAULT_APKG,
        help=f"Path to .apkg file (default: {DEFAULT_APKG})",
    )
    parser.add_argument("--verbose", "-v", action="store_true")
    args = parser.parse_args()

    if not args.apkg.exists():
        print(f"ERROR: apkg not found: {args.apkg}")
        sys.exit(2)
    if not WORDS_JSON.exists():
        print(f"ERROR: words.json not found: {WORDS_JSON}")
        sys.exit(2)

    print(f"Checking: {args.apkg}")
    print(f"Against:  {WORDS_JSON}")
    print()

    apkg_by_model = extract_apkg_guids(args.apkg)
    # Context manager so the file handle is closed deterministically.
    with WORDS_JSON.open(encoding="utf-8") as fh:
        data = json.load(fh)
    wj = collect_words_json_guids(data)

    total_apkg = sum(len(s) for s in apkg_by_model.values())
    total_wj = sum(len(s) for s in wj.values())
    print(f"Total GUIDs in apkg:      {total_apkg}")
    print(f"Total GUIDs in words.json: {total_wj}")
    print()

    # Map each apkg model to the words.json GUID sets it should be covered by.
    wj_sets_by_deck = {
        # Vocab notes cover both vocab cards (ord 0,1) and cloze (ord 2)
        # They share the note GUID — vocab_legacy_guid IS the note guid
        "vocab": wj["vocab"] | wj["cloze"],
        "conjugation": wj["conjugation"],
        "plurals": wj["plurals"],
        "confusables": wj["confusables"],
    }

    all_missing = 0
    all_extra = 0

    for mid, deck_name in MODEL_IDS.items():
        apkg_set = apkg_by_model.get(mid, set())
        wj_set = wj_sets_by_deck.get(deck_name, set())

        missing = apkg_set - wj_set
        extra = wj_set - apkg_set
        matched = apkg_set & wj_set
        all_missing += len(missing)
        all_extra += len(extra)

        # Only GUIDs missing from words.json fail a deck; extras are informational.
        status = "PASS" if not missing else "FAIL"
        print(f"  {status}  {deck_name} (mid={mid})")
        print(
            f"         apkg={len(apkg_set)}, words.json={len(wj_set)}, "
            f"matched={len(matched)}, missing={len(missing)}, extra={len(extra)}"
        )

        if missing and args.verbose:
            _print_guid_sample(
                "         Missing GUIDs (in apkg, not in words.json):", missing, 20
            )

        if extra and args.verbose:
            _print_guid_sample(
                "         Extra GUIDs (in words.json, not in apkg):", extra, 10
            )

        print()

    # Check for unknown model IDs in apkg
    unknown_mids = set(apkg_by_model.keys()) - set(MODEL_IDS.keys())
    if unknown_mids:
        print(f"  WARNING: Unknown model IDs in apkg: {unknown_mids}")
        for mid in unknown_mids:
            print(f"    mid={mid}: {len(apkg_by_model[mid])} notes")
    # Notes of unknown models were never compared against words.json.
    unchecked = sum(len(apkg_by_model[mid]) for mid in unknown_mids)

    print("─" * 60)
    if all_missing:
        print(f"  FAILED: {all_missing} apkg GUIDs not found in words.json")
        print("          (These notes would lose study progress on reimport)")
        sys.exit(1)
    elif unchecked:
        # Do not claim coverage for notes that were skipped above.
        print(
            f"  All {total_apkg - unchecked} apkg GUIDs in known models accounted for "
            f"in words.json ({unchecked} in unknown models were not checked)."
        )
        sys.exit(0)
    else:
        print(f"  All {total_apkg} apkg GUIDs accounted for in words.json.")
        sys.exit(0)


# Run the check only when executed as a script, not when imported.
if __name__ == "__main__":
    main()