diff --git a/apkg_builder.py b/apkg_builder.py
index dbd6421..bdc1aa6 100644
--- a/apkg_builder.py
+++ b/apkg_builder.py
@@ -25,6 +25,10 @@ VOCAB_MODEL_ID = 1_234_567_891
 CONJ_DECK_ID = 1_234_567_892
 CONJ_MODEL_ID = 1_234_567_893
 
+# Release version tag added to all notes so users can identify which release
+# their cards come from (visible in Anki's Browse view and card info).
+RELEASE_TAG = "v0.10"
+
 DATA_DIR = Path(__file__).parent / "data"
 AUDIO_DIR = DATA_DIR / "audio"
 AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
@@ -253,8 +257,8 @@ VOCAB_MODEL = genanki.Model(
 # ──────────────────────────────────────────────────────────────────────────────
 CONJ_FRONT = """
-
{{ReferenceForm}}{{#Voice}} ({{Voice}}){{/Voice}}
{{Pronoun}}
+
{{ReferenceForm}}{{#Voice}} ({{Voice}}){{/Voice}}
{{Tense}}
 """
@@ -525,7 +529,7 @@ def build_vocab_deck(
                 freq_display,
                 image_tag,
             ],
-            tags=tags_str.split() if tags_str else [],
+            tags=(tags_str.split() if tags_str else []) + [RELEASE_TAG],
         )
         deck.add_note(note)
 
@@ -583,6 +587,7 @@ def build_conj_deck(
                 voice,
                 audio_tag,
             ],
+            tags=[RELEASE_TAG],
         )
         deck.add_note(note)
         note_count += 1
diff --git a/validate_apkg.py b/validate_apkg.py
new file mode 100644
index 0000000..538434b
--- /dev/null
+++ b/validate_apkg.py
@@ -0,0 +1,311 @@
+#!/usr/bin/env python3
+"""
+validate_apkg.py — Anki deck integrity validation.
+
+Checks both pealim_vocabulary.apkg and pealim_conjugations.apkg for
+structural correctness, media consistency, and card data integrity.
+
+Usage:
+    python3 validate_apkg.py [--vocab | --conjugations] [path/to/deck.apkg]
+"""
+
+import argparse
+import contextlib
+import json
+import os
+import re
+import sqlite3
+import struct
+import sys
+import tempfile
+import zipfile
+from pathlib import Path
+
+VOCAB_APKG = Path("output/pealim_vocabulary.apkg")
+CONJ_APKG = Path("output/pealim_conjugations.apkg")
+
+PASS = "\033[32m✓\033[0m"
+FAIL = "\033[31m✗\033[0m"
+WARN = "\033[33m⚠\033[0m"
+
+
+def check(label: str, ok: bool, detail: str = "") -> bool:
+    """Print one pass/fail result line and return ``ok`` unchanged."""
+    icon = PASS if ok else FAIL
+    line = f" {icon} {label}"
+    if detail:
+        line += f": {detail}"
+    print(line)
+    return ok
+
+
+def warn(label: str, detail: str = "") -> None:
+    """Print a warning line; warnings never count as failures."""
+    line = f" {WARN} {label}"
+    if detail:
+        line += f": {detail}"
+    print(line)
+
+
+def _detect_format(data: bytes) -> str:
+    """Name the audio container suggested by the leading bytes of ``data``."""
+    if data[:3] == b"ID3":
+        return "MP3 (ID3)"
+    if data[:2] in (b"\xff\xfb", b"\xff\xf3", b"\xff\xf2", b"\xff\xfa"):
+        return "MP3 (raw)"
+    if data[:4] == b"OggS":
+        return "OGG"
+    if data[:4] == b"fLaC":
+        return "FLAC"
+    if data[:4] == b"RIFF":
+        return "WAV"
+    return f"unknown ({data[:4].hex()})"
+
+
+def validate_apkg(apkg_path: Path) -> int:
+    """Run all checks on one .apkg file. Returns number of failures.
+
+    Only checks whose result is added to the tally affect the return
+    value; the remaining check()/warn() lines are informational.
+    """
+    print(f"\n{'='*60}")
+    print(f" Validating: {apkg_path}")
+    print(f"{'='*60}")
+
+    failures = 0
+
+    if not apkg_path.exists():
+        print(f" {FAIL} File not found: {apkg_path}")
+        return 1
+
+    file_size_mb = apkg_path.stat().st_size / 1_048_576
+    print(f"\n File size: {file_size_mb:.1f} MB")
+
+    # --- ZIP structure ---
+    print("\n[ZIP structure]")
+    try:
+        zf = zipfile.ZipFile(apkg_path)
+    except zipfile.BadZipFile as e:
+        print(f" {FAIL} Invalid ZIP: {e}")
+        return 1
+
+    # The stack unwinds in reverse order: the SQLite connection is closed
+    # before the temporary directory is removed, then the archive is closed.
+    with contextlib.ExitStack() as stack:
+        stack.enter_context(zf)
+        zip_names = set(zf.namelist())
+        has_db = "collection.anki2" in zip_names
+        has_media = "media" in zip_names
+        failures += 0 if check("collection.anki2 present", has_db) else 1
+        failures += 0 if check("media manifest present", has_media) else 1
+        if not (has_db and has_media):
+            # Every later section reads one of these two members.
+            print()
+            return failures
+
+        tmpdir = stack.enter_context(tempfile.TemporaryDirectory())
+        # Media is inspected straight from the archive, so only the
+        # database has to exist on disk for sqlite3 to open it.
+        zf.extract("collection.anki2", tmpdir)
+
+        # --- Media manifest ---
+        print("\n[Media manifest]")
+        try:
+            # json.loads decodes UTF-8 bytes itself, whatever the locale.
+            media_map: dict[str, str] = json.loads(zf.read("media"))
+        except (json.JSONDecodeError, UnicodeDecodeError) as e:
+            print(f" {FAIL} Invalid media JSON: {e}")
+            failures += 1
+            media_map = {}
+
+        original_names = set(media_map.values())
+        zip_numbered = zip_names - {"collection.anki2", "media"}
+
+        check(
+            "Manifest count matches ZIP entries",
+            len(media_map) == len(zip_numbered),
+            f"{len(media_map)} manifest vs {len(zip_numbered)} ZIP files",
+        )
+
+        # Check for zero-byte media files
+        zero_byte = [
+            orig for num, orig in media_map.items()
+            if num in zip_names and zf.getinfo(num).file_size == 0
+        ]
+        failures += 0 if check("No zero-byte media files", not zero_byte,
+                               f"{len(zero_byte)} empty" if zero_byte else "") else 1
+
+        # Check audio format sample (first 20 mp3s)
+        mp3_names = [num for num, orig in media_map.items() if orig.endswith(".mp3")]
+        sample = mp3_names[:20]
+        bad_format = []
+        for num in sample:
+            if num not in zip_names:
+                # Listed in the manifest but absent from the archive.
+                bad_format.append(f"{media_map[num]}: missing from ZIP")
+                continue
+            fmt = _detect_format(zf.read(num)[:8])
+            if "MP3" not in fmt:
+                bad_format.append(f"{media_map[num]}: {fmt}")
+        failures += 0 if check(
+            f"Audio format (sampled {len(sample)} files)",
+            not bad_format,
+            "; ".join(bad_format) if bad_format else "all MP3",
+        ) else 1
+
+        # Fonts present
+        font_files = [v for v in original_names if v.endswith(".ttf")]
+        check("Heebo font files bundled", len(font_files) >= 1,
+              ", ".join(font_files) if font_files else "none found")
+
+        # --- Database ---
+        print("\n[Database]")
+        db_path = os.path.join(tmpdir, "collection.anki2")
+        conn = stack.enter_context(contextlib.closing(sqlite3.connect(db_path)))
+
+        schema_ver = conn.execute("SELECT ver FROM col").fetchone()[0]
+        failures += 0 if check("Schema version 11 (Anki 2.1)", schema_ver == 11,
+                               f"got {schema_ver}") else 1
+
+        note_count = conn.execute("SELECT COUNT(*) FROM notes").fetchone()[0]
+        card_count = conn.execute("SELECT COUNT(*) FROM cards").fetchone()[0]
+        failures += 0 if check("Notes present", note_count > 0, f"{note_count:,} notes") else 1
+        failures += 0 if check("Cards present", card_count > 0, f"{card_count:,} cards") else 1
+
+        # Determine expected cards per note from model templates
+        models_json_raw = conn.execute("SELECT models FROM col").fetchone()[0]
+        models_raw = json.loads(models_json_raw)
+        tmpl_counts = [len(m["tmpls"]) for m in models_raw.values()]
+        expected_ratio = tmpl_counts[0] if len(set(tmpl_counts)) == 1 else None
+        if expected_ratio:
+            failures += 0 if check(
+                f"{expected_ratio} card(s) per note",
+                card_count == note_count * expected_ratio,
+                f"{note_count} notes × {expected_ratio} = {note_count * expected_ratio}, got {card_count}",
+            ) else 1
+
+        # Duplicate GUIDs
+        dup_guids = conn.execute(
+            "SELECT guid, COUNT(*) c FROM notes GROUP BY guid HAVING c > 1"
+        ).fetchall()
+        failures += 0 if check("No duplicate GUIDs", len(dup_guids) == 0,
+                               f"{len(dup_guids)} duplicates") else 1
+
+        # Card queue states
+        queues = conn.execute(
+            "SELECT type, queue, COUNT(*) FROM cards GROUP BY type, queue"
+        ).fetchall()
+        queue_map = {(t, q): cnt for t, q, cnt in queues}
+        new_cards = queue_map.get((0, 0), 0)
+        suspended = queue_map.get((0, -1), 0) + queue_map.get((1, -1), 0) + queue_map.get((2, -1), 0)
+        if new_cards > 0:
+            check("Cards in new queue (type=0, queue=0)", True, f"{new_cards:,}")
+        if suspended > 0:
+            warn("Suspended cards", f"{suspended:,}")
+
+        # dconf — new card order
+        dconf_json = conn.execute("SELECT dconf FROM col").fetchone()[0]
+        dconf = json.loads(dconf_json)
+        orders = {dc.get("new", {}).get("order") for dc in dconf.values() if isinstance(dc, dict)}
+        per_days = {dc.get("new", {}).get("perDay") for dc in dconf.values() if isinstance(dc, dict)}
+        check("new.order configured", bool(orders), f"{orders}")
+        if per_days:
+            check("new.perDay > 0", all(p and p > 0 for p in per_days if p is not None),
+                  f"perDay={per_days}")
+
+        # Deck assignment
+        decks_json = conn.execute("SELECT decks FROM col").fetchone()[0]
+        decks = json.loads(decks_json)
+        real_decks = {did: d for did, d in decks.items() if did != "1"}
+        if real_decks:
+            check("Custom deck exists (not Default only)", True,
+                  ", ".join(d["name"] for d in real_decks.values()))
+            # All cards in the custom deck?
+            for did_str in real_decks:
+                assigned = conn.execute(
+                    "SELECT COUNT(*) FROM cards WHERE did=?", [int(did_str)]
+                ).fetchone()[0]
+                check(f"Cards in deck '{real_decks[did_str]['name']}'", assigned > 0,
+                      f"{assigned:,}/{card_count:,}")
+
+        # --- Sound references vs media manifest ---
+        print("\n[Sound references]")
+        notes_flds = conn.execute("SELECT flds FROM notes").fetchall()
+        sound_refs: set[str] = set()
+        for (flds,) in notes_flds:
+            for ref in re.findall(r"\[sound:([^\]]+)\]", flds):
+                sound_refs.add(ref)
+
+        missing_audio = sound_refs - original_names
+        orphaned_audio = original_names - sound_refs - set(font_files)
+        failures += 0 if check("All sound refs in media manifest", len(missing_audio) == 0,
+                               f"{len(missing_audio)} missing" if missing_audio else "") else 1
+        if orphaned_audio:
+            warn("Media files not referenced by any card", f"{len(orphaned_audio)} orphaned")
+
+        notes_with_audio = sum(
+            1 for (flds,) in notes_flds if "[sound:" in flds
+        )
+        pct = notes_with_audio / note_count * 100 if note_count else 0
+        check("Notes with audio", notes_with_audio > 0, f"{notes_with_audio:,}/{note_count:,} ({pct:.0f}%)")
+
+        # --- Empty fields check ---
+        print("\n[Field content]")
+        for mid_str, model in models_raw.items():
+            field_names = [f["name"] for f in model["flds"]]
+            # Fetch and split each note of this model once; the fields of
+            # a note are separated by the 0x1f character.
+            rows = conn.execute(
+                "SELECT flds FROM notes WHERE mid=?", [int(mid_str)]
+            ).fetchall()
+            split_rows = [flds.split("\x1f") for (flds,) in rows]
+            # Check required fields (first 3) are not empty
+            for idx, fname in enumerate(field_names[:3]):
+                empty = sum(
+                    1 for parts in split_rows
+                    if len(parts) <= idx or not parts[idx].strip()
+                )
+                if empty > 0:
+                    warn(f"Model '{model['name']}' field '{fname}' empty in {empty} notes")
+                else:
+                    check(f"Model '{model['name']}' field '{fname}' populated", True)
+
+    print()
+    return failures
+
+
+def main() -> None:
+    """Parse CLI arguments, validate the chosen decks, exit 1 on failure."""
+    parser = argparse.ArgumentParser(description="Validate Pealim .apkg files")
+    parser.add_argument("path", nargs="?", help="Path to .apkg file (validates both if omitted)")
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument("--vocab", action="store_true", help="Validate vocabulary deck only")
+    group.add_argument("--conjugations", action="store_true", help="Validate conjugation deck only")
+    args = parser.parse_args()
+
+    targets: list[Path] = []
+    if args.path:
+        targets = [Path(args.path)]
+    elif args.vocab:
+        targets = [VOCAB_APKG]
+    elif args.conjugations:
+        targets = [CONJ_APKG]
+    else:
+        targets = [VOCAB_APKG, CONJ_APKG]
+
+    total_failures = 0
+    for path in targets:
+        total_failures += validate_apkg(path)
+
+    print(f"\n{'='*60}")
+    if total_failures == 0:
+        print(f" {PASS} All checks passed")
+    else:
+        print(f" {FAIL} {total_failures} check(s) failed")
+    print(f"{'='*60}\n")
+
+    sys.exit(0 if total_failures == 0 else 1)
+
+
+if __name__ == "__main__":
+    main()