Migrate from fragmented CSV + 10 JSON files to a single data/words.json (9,104 entries) as the unified data store. All GUIDs preserved for Anki study progress continuity. New files: - SCHEMA.yaml: authoritative schema for words.json - pealim_list_scrape.py: consolidated list page scraper → words.json - pealim_detail_scrape.py: noun/verb detail scraper → words.json - pealim_audio_download.py: audio downloader reading from words.json - scripts/migrate_to_json.py: one-time CSV→JSON migration - scripts/validate_data.py: 17 data integrity tests - scripts/check_guid_coverage.py: GUID preservation checker - scripts/repair_slugs.py: slug deduplication repair tool - tests/test_scraper_integration.py: live scraper integration tests Updated: - apkg_builder.py: reads from words.json (no more pandas) - run.py: 8-step pipeline (list scrape → frequency → examples → detail scrape → audio download → fonts → images → build) - benyehuda.py, frequency_lookup.py, image_fetch.py: TODO markers for future words.json integration Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
212 lines
6.8 KiB
Python
212 lines
6.8 KiB
Python
"""Check that every GUID in the last-release complete .apkg exists in words.json.
|
|
|
|
Extracts GUIDs from the Anki SQLite database inside the .apkg (zip) file,
|
|
then compares against all GUID fields stored in data/words.json.
|
|
|
|
Usage:
|
|
python3 scripts/check_guid_coverage.py
|
|
python3 scripts/check_guid_coverage.py --apkg output/hebrew_complete.apkg
|
|
python3 scripts/check_guid_coverage.py --verbose
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import os
|
|
import sqlite3
|
|
import sys
|
|
import tempfile
|
|
import zipfile
|
|
from pathlib import Path
|
|
from typing import Any
|
|
|
|
# Resolve to an absolute path first: with a relative __file__ (for example
# when the script is launched from inside scripts/), ``.parent.parent``
# would otherwise collapse to "." and point at the wrong directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Default archive to check when --apkg is not given.
DEFAULT_APKG = PROJECT_ROOT / "output" / "hebrew_complete.apkg"
# Unified data store whose GUID fields are compared against the archive.
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"

# Known model IDs (from apkg_builder.py), mapped to the deck name used in
# the coverage report.
MODEL_IDS = {
    1701222017968: "vocab",
    1234567893: "conjugation",
    1234567897: "plurals",
    1234567895: "confusables",
}
|
|
|
|
|
|
def extract_apkg_guids(apkg_path: Path) -> dict[int, set[str]]:
    """Extract note GUIDs from an .apkg archive, grouped by model ID.

    The archive is opened as a zip file, its ``collection.anki2`` SQLite
    database is unpacked into a temporary directory, and every row of the
    ``notes`` table is read.

    Args:
        apkg_path: Path to the .apkg (zip) file.

    Returns:
        A mapping from model ID (the ``mid`` column) to the set of ``guid``
        values of the notes that use that model.

    Raises:
        KeyError: If the archive has no ``collection.anki2`` member.
        sqlite3.Error: If the ``notes`` table cannot be queried.
    """
    by_model: dict[int, set[str]] = {}
    with zipfile.ZipFile(apkg_path) as archive, tempfile.TemporaryDirectory() as td:
        # Only the collection database is read, so unpack just that member
        # instead of every file in the archive.
        db_path = archive.extract("collection.anki2", td)
        conn = sqlite3.connect(db_path)
        try:
            for guid, mid in conn.execute("SELECT guid, mid FROM notes"):
                by_model.setdefault(mid, set()).add(guid)
        finally:
            # Close even when the query fails so the handle is not leaked
            # and the temporary directory can be removed afterwards.
            conn.close()
    return by_model
|
|
|
|
|
|
def collect_words_json_guids(data: dict[str, Any]) -> dict[str, set[str]]:
    """Gather every GUID stored in the words.json mapping, bucketed by deck.

    Args:
        data: The parsed words.json content; only its values (the entries)
            are inspected.

    Returns:
        A dict with the keys ``vocab``, ``cloze``, ``conjugation``,
        ``plurals`` and ``confusables``, each holding the set of non-empty
        GUIDs found for that deck type.
    """
    buckets: dict[str, set[str]] = {
        "vocab": set(),
        "cloze": set(),
        "conjugation": set(),
        "plurals": set(),
        "confusables": set(),
    }

    for record in data.values():
        # Top-level legacy GUID of the vocab note.
        legacy_guid = record.get("vocab_legacy_guid")
        if legacy_guid:
            buckets["vocab"].add(legacy_guid)

        # Cloze GUID lives at examples -> cloze -> cloze_guid; a missing or
        # empty level simply yields nothing.
        cloze_section = (record.get("examples") or {}).get("cloze") or {}
        cloze_guid = cloze_section.get("cloze_guid")
        if cloze_guid:
            buckets["cloze"].add(cloze_guid)

        # Plurals GUID is nested inside noun_inflection.
        plurals_guid = (record.get("noun_inflection") or {}).get("plurals_guid")
        if plurals_guid:
            buckets["plurals"].add(plurals_guid)

        # Confusables GUID is a top-level field.
        confusables_guid = record.get("confusables_guid")
        if confusables_guid:
            buckets["confusables"].add(confusables_guid)

        # Each conjugation form contributes its own GUID plus any candidates.
        conjugation = record.get("conjugation") or {}
        for list_name in ("active_forms", "hufal_pual_forms"):
            for form in conjugation.get(list_name) or ():
                form_guid = form.get("guid")
                if form_guid:
                    buckets["conjugation"].add(form_guid)
                candidates = form.get("guid_candidates")
                if candidates:
                    buckets["conjugation"].update(candidates)

    return buckets
|
|
|
|
|
|
def main() -> None:
    """Compare the GUIDs in an .apkg against words.json and print a report.

    Parses ``--apkg`` (default: DEFAULT_APKG) and ``--verbose``/``-v``, then,
    for every model ID in MODEL_IDS, reports how many apkg GUIDs are matched,
    missing from, or extra in words.json. Model IDs present in the apkg but
    not in MODEL_IDS are listed as a warning and are not compared.

    Always terminates the process via ``sys.exit``:
        0: every checked apkg GUID exists in words.json.
        1: at least one checked apkg GUID is missing from words.json.
        2: the apkg file or words.json does not exist.
    """
    parser = argparse.ArgumentParser(description="Check GUID coverage between .apkg and words.json")
    parser.add_argument(
        "--apkg",
        type=Path,
        default=DEFAULT_APKG,
        help=f"Path to .apkg file (default: {DEFAULT_APKG})",
    )
    parser.add_argument("--verbose", "-v", action="store_true")
    args = parser.parse_args()

    if not args.apkg.exists():
        print(f"ERROR: apkg not found: {args.apkg}")
        sys.exit(2)
    if not WORDS_JSON.exists():
        print(f"ERROR: words.json not found: {WORDS_JSON}")
        sys.exit(2)

    print(f"Checking: {args.apkg}")
    print(f"Against: {WORDS_JSON}")
    print()

    apkg_by_model = extract_apkg_guids(args.apkg)
    # Use a context manager so the file handle is closed deterministically.
    with WORDS_JSON.open(encoding="utf-8") as fh:
        data = json.load(fh)
    wj = collect_words_json_guids(data)

    total_apkg = sum(len(s) for s in apkg_by_model.values())
    total_wj = sum(len(s) for s in wj.values())
    print(f"Total GUIDs in apkg: {total_apkg}")
    print(f"Total GUIDs in words.json: {total_wj}")
    print()

    # words.json GUID set that each apkg deck is compared against.
    wj_by_deck = {
        # Vocab notes cover both vocab cards (ord 0,1) and cloze (ord 2)
        # They share the note GUID — vocab_legacy_guid IS the note guid
        "vocab": wj["vocab"] | wj["cloze"],
        "conjugation": wj["conjugation"],
        "plurals": wj["plurals"],
        "confusables": wj["confusables"],
    }

    all_missing = 0
    # GUIDs that were actually compared; notes of unknown models are excluded.
    total_checked = 0

    for mid, deck_name in MODEL_IDS.items():
        apkg_set = apkg_by_model.get(mid, set())
        wj_set = wj_by_deck.get(deck_name, set())

        missing = apkg_set - wj_set
        extra = wj_set - apkg_set
        matched = apkg_set & wj_set
        all_missing += len(missing)
        total_checked += len(apkg_set)

        # Only missing GUIDs fail a deck; extra GUIDs are informational.
        status = "PASS" if not missing else "FAIL"
        print(f" {status} {deck_name} (mid={mid})")
        print(
            f" apkg={len(apkg_set)}, words.json={len(wj_set)}, "
            f"matched={len(matched)}, missing={len(missing)}, extra={len(extra)}"
        )

        if missing and args.verbose:
            print(" Missing GUIDs (in apkg, not in words.json):")
            for g in sorted(missing)[:20]:
                print(f" {g!r}")
            if len(missing) > 20:
                print(f" ... ({len(missing) - 20} more)")

        if extra and args.verbose:
            print(" Extra GUIDs (in words.json, not in apkg):")
            for g in sorted(extra)[:10]:
                print(f" {g!r}")
            if len(extra) > 10:
                print(f" ... ({len(extra) - 10} more)")

        print()

    # Check for unknown model IDs in apkg
    unknown_mids = set(apkg_by_model.keys()) - set(MODEL_IDS.keys())
    if unknown_mids:
        print(f" WARNING: Unknown model IDs in apkg: {unknown_mids}")
        for mid in unknown_mids:
            print(f" mid={mid}: {len(apkg_by_model[mid])} notes")

    print("─" * 60)
    if all_missing:
        print(f" FAILED: {all_missing} apkg GUIDs not found in words.json")
        print(" (These notes would lose study progress on reimport)")
        sys.exit(1)
    else:
        # Report the number of GUIDs that were compared; this equals the
        # apkg total whenever the archive contains no unknown model IDs.
        print(f" All {total_checked} apkg GUIDs accounted for in words.json.")
        sys.exit(0)
|
|
|
|
|
|
# Run the coverage check only when executed as a script, not on import.
# main() always ends the process through sys.exit with its status code.
if __name__ == "__main__":
    main()
|