hebrew_flash_cards/scripts/check_guid_coverage.py
Sochen 08fb7009d8 Sprint 11: unified JSON architecture + consolidated scraping pipeline
Migrate from fragmented CSV + 10 JSON files to a single data/words.json
(9,104 entries) as the unified data store. All GUIDs preserved for Anki
study progress continuity.

New files:
- SCHEMA.yaml: authoritative schema for words.json
- pealim_list_scrape.py: consolidated list page scraper → words.json
- pealim_detail_scrape.py: noun/verb detail scraper → words.json
- pealim_audio_download.py: audio downloader reading from words.json
- scripts/migrate_to_json.py: one-time CSV→JSON migration
- scripts/validate_data.py: 17 data integrity tests
- scripts/check_guid_coverage.py: GUID preservation checker
- scripts/repair_slugs.py: slug deduplication repair tool
- tests/test_scraper_integration.py: live scraper integration tests

Updated:
- apkg_builder.py: reads from words.json (no more pandas)
- run.py: 8-step pipeline (list scrape → frequency → examples →
  detail scrape → audio download → fonts → images → build)
- benyehuda.py, frequency_lookup.py, image_fetch.py: TODO markers
  for future words.json integration

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 10:54:58 +00:00

212 lines
6.8 KiB
Python

"""Check that every GUID in the last-release complete .apkg exists in words.json.
Extracts GUIDs from the Anki SQLite database inside the .apkg (zip) file,
then compares against all GUID fields stored in data/words.json.
Usage:
python3 scripts/check_guid_coverage.py
python3 scripts/check_guid_coverage.py --apkg output/hebrew_complete.apkg
python3 scripts/check_guid_coverage.py --verbose
"""
from __future__ import annotations
import argparse
import json
import os
import sqlite3
import sys
import tempfile
import zipfile
from pathlib import Path
from typing import Any
# Two directory levels above this file.
PROJECT_ROOT = Path(__file__).parent.parent

# Default inputs, both resolved relative to PROJECT_ROOT.
DEFAULT_APKG = PROJECT_ROOT.joinpath("output", "hebrew_complete.apkg")
WORDS_JSON = PROJECT_ROOT.joinpath("data", "words.json")

# Anki note-model ID -> deck name; the IDs are the known ones from
# apkg_builder.py.
MODEL_IDS: dict[int, str] = dict(
    [
        (1701222017968, "vocab"),
        (1234567893, "conjugation"),
        (1234567897, "plurals"),
        (1234567895, "confusables"),
    ]
)
def extract_apkg_guids(apkg_path: Path) -> dict[int, set[str]]:
    """Extract note GUIDs from an .apkg archive, grouped by model ID.

    Only the ``collection.anki2`` member is unpacked (into a temporary
    directory that is removed before returning); other archive members are
    never written to disk.

    Args:
        apkg_path: Path to the .apkg (zip) file.

    Returns:
        Mapping of ``notes.mid`` to the set of ``notes.guid`` values that
        carry that model ID.

    Raises:
        ValueError: If the archive has no ``collection.anki2`` member.
        zipfile.BadZipFile: If ``apkg_path`` is not a valid zip archive.
        sqlite3.Error: If the collection database cannot be queried.
    """
    member = "collection.anki2"
    by_model: dict[int, set[str]] = {}
    with zipfile.ZipFile(apkg_path) as z, tempfile.TemporaryDirectory() as td:
        try:
            db_path = z.extract(member, td)
        except KeyError:
            # sqlite3.connect() would silently create an empty database for a
            # missing file, so fail here with a message that names the cause.
            raise ValueError(f"{apkg_path} does not contain {member}") from None
        conn = sqlite3.connect(db_path)
        try:
            for guid, mid in conn.execute("SELECT guid, mid FROM notes"):
                by_model.setdefault(mid, set()).add(guid)
        finally:
            # Close even when the query fails, so the temporary directory can
            # always be removed (an open handle blocks deletion on Windows).
            conn.close()
    return by_model
def collect_words_json_guids(data: dict[str, Any]) -> dict[str, set[str]]:
    """Gather every GUID recorded in words.json, bucketed by deck type.

    Only the values of *data* are inspected. For each entry the following
    fields are read, and a missing or falsy value at any level is skipped:

    * ``vocab_legacy_guid``                      -> ``"vocab"``
    * ``examples.cloze.cloze_guid``              -> ``"cloze"``
    * ``noun_inflection.plurals_guid``           -> ``"plurals"``
    * ``confusables_guid``                       -> ``"confusables"``
    * ``guid`` and every item of ``guid_candidates`` on each form in
      ``conjugation.active_forms`` / ``conjugation.hufal_pual_forms``
                                                 -> ``"conjugation"``

    Returns:
        A dict with exactly the five keys above, each mapping to a set of
        GUID strings (possibly empty).
    """
    buckets: dict[str, set[str]] = {
        "vocab": set(),
        "cloze": set(),
        "conjugation": set(),
        "plurals": set(),
        "confusables": set(),
    }

    for word in data.values():
        # Normalise absent/empty nested containers to {} so the lookups
        # below need no further nesting.
        examples = word.get("examples") or {}
        cloze = examples.get("cloze") or {}
        inflection = word.get("noun_inflection") or {}

        single_guids = (
            ("vocab", word.get("vocab_legacy_guid")),
            ("cloze", cloze.get("cloze_guid")),
            ("plurals", inflection.get("plurals_guid")),
            ("confusables", word.get("confusables_guid")),
        )
        for bucket, guid in single_guids:
            if guid:
                buckets[bucket].add(guid)

        conjugation = word.get("conjugation") or {}
        for list_key in ("active_forms", "hufal_pual_forms"):
            for form in conjugation.get(list_key) or ():
                guid = form.get("guid")
                if guid:
                    buckets["conjugation"].add(guid)
                # Candidates are added as-is, without a per-item falsy check.
                buckets["conjugation"].update(form.get("guid_candidates") or ())

    return buckets
def main() -> None:
    """Compare the GUIDs in an .apkg against words.json and report coverage.

    Prints one PASS/FAIL line per known model ID (see MODEL_IDS) with
    matched / missing / extra counts, warns about model IDs in the apkg that
    are not in MODEL_IDS, and terminates the process via ``sys.exit``:

    * 0 - every apkg GUID belonging to a known model is present in words.json
    * 1 - at least one such GUID is missing from words.json
    * 2 - the apkg or words.json file does not exist

    GUIDs that exist only in words.json ("extra") are reported but do not
    affect the exit status. Notes with unknown model IDs are not compared;
    they only produce a warning.
    """
    parser = argparse.ArgumentParser(description="Check GUID coverage between .apkg and words.json")
    parser.add_argument(
        "--apkg",
        type=Path,
        default=DEFAULT_APKG,
        help=f"Path to .apkg file (default: {DEFAULT_APKG})",
    )
    parser.add_argument("--verbose", "-v", action="store_true")
    args = parser.parse_args()

    # Fatal input problems go to stderr so they are not mixed into the report.
    if not args.apkg.exists():
        print(f"ERROR: apkg not found: {args.apkg}", file=sys.stderr)
        sys.exit(2)
    if not WORDS_JSON.exists():
        print(f"ERROR: words.json not found: {WORDS_JSON}", file=sys.stderr)
        sys.exit(2)

    print(f"Checking: {args.apkg}")
    print(f"Against: {WORDS_JSON}")
    print()

    apkg_by_model = extract_apkg_guids(args.apkg)
    # Use a context manager so the file handle is closed deterministically.
    with WORDS_JSON.open(encoding="utf-8") as fh:
        data = json.load(fh)
    wj = collect_words_json_guids(data)

    total_apkg = sum(len(s) for s in apkg_by_model.values())
    total_wj = sum(len(s) for s in wj.values())
    print(f"Total GUIDs in apkg: {total_apkg}")
    print(f"Total GUIDs in words.json: {total_wj}")
    print()

    # words.json GUID set to compare against for each apkg deck. Notes of the
    # vocab model are matched against both the vocab and the cloze GUIDs.
    wj_by_deck = {
        "vocab": wj["vocab"] | wj["cloze"],
        "conjugation": wj["conjugation"],
        "plurals": wj["plurals"],
        "confusables": wj["confusables"],
    }

    all_missing = 0
    checked_apkg = 0  # apkg GUIDs that were actually compared
    for mid, deck_name in MODEL_IDS.items():
        apkg_set = apkg_by_model.get(mid, set())
        wj_set = wj_by_deck.get(deck_name, set())

        missing = apkg_set - wj_set
        extra = wj_set - apkg_set
        matched = apkg_set & wj_set
        all_missing += len(missing)
        checked_apkg += len(apkg_set)

        status = "PASS" if not missing else "FAIL"
        print(f" {status} {deck_name} (mid={mid})")
        print(
            f" apkg={len(apkg_set)}, words.json={len(wj_set)}, "
            f"matched={len(matched)}, missing={len(missing)}, extra={len(extra)}"
        )
        if missing and args.verbose:
            print(" Missing GUIDs (in apkg, not in words.json):")
            for g in sorted(missing)[:20]:
                print(f" {g!r}")
            if len(missing) > 20:
                print(f" ... ({len(missing) - 20} more)")
        if extra and args.verbose:
            print(" Extra GUIDs (in words.json, not in apkg):")
            for g in sorted(extra)[:10]:
                print(f" {g!r}")
            if len(extra) > 10:
                print(f" ... ({len(extra) - 10} more)")
        print()

    # Check for unknown model IDs in apkg
    unknown_mids = set(apkg_by_model.keys()) - set(MODEL_IDS.keys())
    if unknown_mids:
        print(f" WARNING: Unknown model IDs in apkg: {unknown_mids}")
        for mid in sorted(unknown_mids):
            print(f" mid={mid}: {len(apkg_by_model[mid])} notes")

    # Separator rule before the final verdict.
    print("-" * 60)
    if all_missing:
        print(f" FAILED: {all_missing} apkg GUIDs not found in words.json")
        print(" (These notes would lose study progress on reimport)")
        sys.exit(1)

    unchecked = total_apkg - checked_apkg
    if unchecked:
        # Do not claim coverage for notes that were never compared.
        print(
            f" All {checked_apkg} checked apkg GUIDs accounted for in words.json "
            f"({unchecked} with unknown model IDs were not checked)."
        )
    else:
        print(f" All {total_apkg} apkg GUIDs accounted for in words.json.")
    sys.exit(0)


if __name__ == "__main__":
    main()