"""Check that every GUID in the last-release complete .apkg exists in words.json. Extracts GUIDs from the Anki SQLite database inside the .apkg (zip) file, then compares against all GUID fields stored in data/words.json. Usage: python3 scripts/check_guid_coverage.py python3 scripts/check_guid_coverage.py --apkg output/hebrew_complete.apkg python3 scripts/check_guid_coverage.py --verbose """ from __future__ import annotations import argparse import json import os import sqlite3 import sys import tempfile import zipfile from pathlib import Path from typing import Any PROJECT_ROOT = Path(__file__).parent.parent DEFAULT_APKG = PROJECT_ROOT / "output" / "hebrew_complete.apkg" WORDS_JSON = PROJECT_ROOT / "data" / "words.json" # Known model IDs (from apkg_builder.py) MODEL_IDS = { 1701222017968: "vocab", 1234567893: "conjugation", 1234567897: "plurals", 1234567895: "confusables", } def extract_apkg_guids(apkg_path: Path) -> dict[int, set[str]]: """Extract GUIDs from .apkg grouped by model ID.""" by_model: dict[int, set[str]] = {} with zipfile.ZipFile(apkg_path) as z, tempfile.TemporaryDirectory() as td: z.extractall(td) db_path = os.path.join(td, "collection.anki2") conn = sqlite3.connect(db_path) cur = conn.cursor() cur.execute("SELECT guid, mid FROM notes") for guid, mid in cur.fetchall(): by_model.setdefault(mid, set()).add(guid) conn.close() return by_model def collect_words_json_guids(data: dict[str, Any]) -> dict[str, set[str]]: """Collect all GUIDs from words.json grouped by deck type.""" vocab_guids: set[str] = set() cloze_guids: set[str] = set() conj_guids: set[str] = set() plurals_guids: set[str] = set() confusables_guids: set[str] = set() for entry in data.values(): # Vocab legacy GUID g = entry.get("vocab_legacy_guid") if g: vocab_guids.add(g) # Cloze GUID (stored in examples.cloze.cloze_guid) examples = entry.get("examples") if examples: cloze = examples.get("cloze") if cloze: g = cloze.get("cloze_guid") if g: cloze_guids.add(g) # Plurals GUID (stored inside noun_inflection) ni = entry.get("noun_inflection") if ni: g = ni.get("plurals_guid") if g: plurals_guids.add(g) # Confusables GUID (top-level) g = entry.get("confusables_guid") if g: confusables_guids.add(g) # Conjugation form GUIDs conj = entry.get("conjugation") if conj: for form_list_key in ("active_forms", "hufal_pual_forms"): forms = conj.get(form_list_key) if not forms: continue for form in forms: g = form.get("guid") if g: conj_guids.add(g) gc = form.get("guid_candidates") if gc: for g2 in gc: conj_guids.add(g2) return { "vocab": vocab_guids, "cloze": cloze_guids, "conjugation": conj_guids, "plurals": plurals_guids, "confusables": confusables_guids, } def main() -> None: parser = argparse.ArgumentParser(description="Check GUID coverage between .apkg and words.json") parser.add_argument( "--apkg", type=Path, default=DEFAULT_APKG, help=f"Path to .apkg file (default: {DEFAULT_APKG})", ) parser.add_argument("--verbose", "-v", action="store_true") args = parser.parse_args() if not args.apkg.exists(): print(f"ERROR: apkg not found: {args.apkg}") sys.exit(2) if not WORDS_JSON.exists(): print(f"ERROR: words.json not found: {WORDS_JSON}") sys.exit(2) print(f"Checking: {args.apkg}") print(f"Against: {WORDS_JSON}") print() apkg_by_model = extract_apkg_guids(args.apkg) data = json.load(WORDS_JSON.open(encoding="utf-8")) wj = collect_words_json_guids(data) total_apkg = sum(len(s) for s in apkg_by_model.values()) total_wj = sum(len(s) for s in wj.values()) print(f"Total GUIDs in apkg: {total_apkg}") print(f"Total GUIDs in words.json: {total_wj}") print() all_missing = 0 all_extra = 0 for mid, deck_name in MODEL_IDS.items(): apkg_set = apkg_by_model.get(mid, set()) # Map apkg model to words.json GUID sets if deck_name == "vocab": # Vocab notes cover both vocab cards (ord 0,1) and cloze (ord 2) # They share the note GUID — vocab_legacy_guid IS the note guid wj_set = wj["vocab"] | wj["cloze"] elif deck_name == "conjugation": wj_set = wj["conjugation"] elif deck_name == "plurals": wj_set = wj["plurals"] elif deck_name == "confusables": wj_set = wj["confusables"] else: wj_set = set() missing = apkg_set - wj_set extra = wj_set - apkg_set matched = apkg_set & wj_set all_missing += len(missing) all_extra += len(extra) status = "PASS" if not missing else "FAIL" print(f" {status} {deck_name} (mid={mid})") print( f" apkg={len(apkg_set)}, words.json={len(wj_set)}, " f"matched={len(matched)}, missing={len(missing)}, extra={len(extra)}" ) if missing and args.verbose: # Try to find what word each missing GUID belongs to in the apkg print(" Missing GUIDs (in apkg, not in words.json):") for g in sorted(missing)[:20]: print(f" {g!r}") if len(missing) > 20: print(f" ... ({len(missing) - 20} more)") if extra and args.verbose: print(" Extra GUIDs (in words.json, not in apkg):") for g in sorted(extra)[:10]: print(f" {g!r}") if len(extra) > 10: print(f" ... ({len(extra) - 10} more)") print() # Check for unknown model IDs in apkg unknown_mids = set(apkg_by_model.keys()) - set(MODEL_IDS.keys()) if unknown_mids: print(f" WARNING: Unknown model IDs in apkg: {unknown_mids}") for mid in unknown_mids: print(f" mid={mid}: {len(apkg_by_model[mid])} notes") print("─" * 60) if all_missing: print(f" FAILED: {all_missing} apkg GUIDs not found in words.json") print(" (These notes would lose study progress on reimport)") sys.exit(1) else: print(f" All {total_apkg} apkg GUIDs accounted for in words.json.") sys.exit(0) if __name__ == "__main__": main()