Sprint 6: release tagging, conjugation front swap, validate_apkg.py

- Add RELEASE_TAG="v0.10" constant; tag all notes (vocab + conj) so users can identify which release their cards came from via Anki Browse
- Swap conjugation card front: Pronoun now above Infinitive for easier recall
- Add validate_apkg.py: comprehensive .apkg integrity checker covering ZIP structure, media manifest, audio format, DB schema, card counts, sound refs, and field content; runs on both decks
- Configure Forgejo v0.10 release with conjugation .apkg as downloadable asset
- Update releases/pealim_conjugations.apkg with tagged notes

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-05 05:09:45 +00:00 · 2026-03-05 05:09:45 +00:00 · 4fcc5cff60
commit 4fcc5cff60
parent 39fb388f6c
2 changed files with 308 additions and 2 deletions
--- a/apkg_builder.py
+++ b/apkg_builder.py
@@ -25,6 +25,10 @@ VOCAB_MODEL_ID  = 1_234_567_891
 CONJ_DECK_ID    = 1_234_567_892
 CONJ_MODEL_ID   = 1_234_567_893

+# Release version tag added to all notes so users can identify which release
+# their cards come from (visible in Anki's Browse view and card info).
+RELEASE_TAG = "v0.10"
+
 DATA_DIR       = Path(__file__).parent / "data"
 AUDIO_DIR      = DATA_DIR / "audio"
 AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
@@ -253,8 +257,8 @@ VOCAB_MODEL = genanki.Model(
 # ──────────────────────────────────────────────────────────────────────────────

 CONJ_FRONT = """
-<div class="hebrew">{{ReferenceForm}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
 <div class="hebrew">{{Pronoun}}</div>
+<div class="hebrew">{{ReferenceForm}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
 <div class="hebrew">{{Tense}}</div>
 """

@@ -525,7 +529,7 @@ def build_vocab_deck(
                freq_display,
                image_tag,
            ],
-            tags=tags_str.split() if tags_str else [],
+            tags=(tags_str.split() if tags_str else []) + [RELEASE_TAG],
        )
        deck.add_note(note)

@@ -583,6 +587,7 @@ def build_conj_deck(
                    voice,
                    audio_tag,
                ],
+                tags=[RELEASE_TAG],
            )
            deck.add_note(note)
            note_count += 1
--- a/validate_apkg.py
+++ b/validate_apkg.py
@@ -0,0 +1,301 @@
+#!/usr/bin/env python3
+"""
+validate_apkg.py — Anki deck integrity validation.
+
+Checks both pealim_vocabulary.apkg and pealim_conjugations.apkg for
+structural correctness, media consistency, and card data integrity.
+
+Usage:
+    python3 validate_apkg.py [--vocab | --conjugations] [path/to/deck.apkg]
+"""
+
+import argparse
+import json
+import os
+import re
+import sqlite3
+import struct
+import sys
+import tempfile
+import zipfile
+from pathlib import Path
+
+# Default deck locations (relative to the current working directory); main()
+# falls back to these when no explicit path is given on the command line.
+VOCAB_APKG = Path("output/pealim_vocabulary.apkg")
+CONJ_APKG = Path("output/pealim_conjugations.apkg")
+
+# Status icons wrapped in ANSI colour escapes: green ✓, red ✗, yellow ⚠.
+# "\033[0m" resets the terminal colour after each icon.
+PASS = "\033[32m✓\033[0m"
+FAIL = "\033[31m✗\033[0m"
+WARN = "\033[33m⚠\033[0m"
+
+
+def check(label: str, ok: bool, detail: str = "") -> bool:
+    icon = PASS if ok else FAIL
+    line = f"  {icon}  {label}"
+    if detail:
+        line += f": {detail}"
+    print(line)
+    return ok
+
+
+def warn(label: str, detail: str = "") -> None:
+    line = f"  {WARN}  {label}"
+    if detail:
+        line += f": {detail}"
+    print(line)
+
+
+def _detect_format(data: bytes) -> str:
+    if data[:3] == b"ID3":
+        return "MP3 (ID3)"
+    if data[:2] in (b"\xff\xfb", b"\xff\xf3", b"\xff\xf2", b"\xff\xfa"):
+        return "MP3 (raw)"
+    if data[:4] == b"OggS":
+        return "OGG"
+    if data[:4] == b"fLaC":
+        return "FLAC"
+    if data[:4] == b"RIFF":
+        return "WAV"
+    return f"unknown ({data[:4].hex()})"
+
+
+def validate_apkg(apkg_path: Path) -> int:
+    """Run all checks. Returns number of failures."""
+    name = apkg_path.name
+    print(f"\n{'='*60}")
+    print(f"  Validating: {apkg_path}")
+    print(f"{'='*60}")
+
+    failures = 0
+
+    if not apkg_path.exists():
+        print(f"  {FAIL}  File not found: {apkg_path}")
+        return 1
+
+    file_size_mb = apkg_path.stat().st_size / 1_048_576
+    print(f"\n  File size: {file_size_mb:.1f} MB")
+
+    # --- ZIP structure ---
+    print("\n[ZIP structure]")
+    try:
+        zf = zipfile.ZipFile(apkg_path)
+        namelist = zf.namelist()
+        has_db = "collection.anki2" in namelist
+        has_media = "media" in namelist
+        failures += 0 if check("collection.anki2 present", has_db) else 1
+        failures += 0 if check("media manifest present", has_media) else 1
+    except zipfile.BadZipFile as e:
+        print(f"  {FAIL}  Invalid ZIP: {e}")
+        return 1
+
+    with tempfile.TemporaryDirectory() as tmpdir:
+        zf.extractall(tmpdir)
+
+        # --- Media manifest ---
+        print("\n[Media manifest]")
+        media_path = os.path.join(tmpdir, "media")
+        with open(media_path) as f:
+            try:
+                media_map: dict[str, str] = json.load(f)
+            except json.JSONDecodeError as e:
+                print(f"  {FAIL}  Invalid media JSON: {e}")
+                failures += 1
+                media_map = {}
+
+        original_names = set(media_map.values())
+        zip_numbered = set(namelist) - {"collection.anki2", "media"}
+
+        check(
+            "Manifest count matches ZIP entries",
+            len(media_map) == len(zip_numbered),
+            f"{len(media_map)} manifest vs {len(zip_numbered)} ZIP files",
+        )
+
+        # Check for zero-byte media files
+        zero_byte = []
+        for num, orig in media_map.items():
+            size = zf.getinfo(num).file_size if num in zf.NameToInfo else -1
+            if size == 0:
+                zero_byte.append(orig)
+        failures += 0 if check("No zero-byte media files", len(zero_byte) == 0,
+                               f"{len(zero_byte)} empty" if zero_byte else "") else 1
+
+        # Check audio format sample (first 20 mp3s)
+        mp3_names = [num for num, orig in media_map.items() if orig.endswith(".mp3")]
+        bad_format = []
+        for num in mp3_names[:20]:
+            data = zf.read(num)[:8]
+            fmt = _detect_format(data)
+            if "MP3" not in fmt:
+                bad_format.append(f"{media_map[num]}: {fmt}")
+        failures += 0 if check(
+            f"Audio format (sampled {min(20, len(mp3_names))} files)",
+            len(bad_format) == 0,
+            "; ".join(bad_format) if bad_format else f"all MP3",
+        ) else 1
+
+        # Fonts present
+        font_files = [v for v in original_names if v.endswith(".ttf")]
+        check("Heebo font files bundled", len(font_files) >= 1,
+              ", ".join(font_files) if font_files else "none found")
+
+        # --- Database ---
+        print("\n[Database]")
+        db_path = os.path.join(tmpdir, "collection.anki2")
+        conn = sqlite3.connect(db_path)
+
+        schema_ver = conn.execute("SELECT ver FROM col").fetchone()[0]
+        failures += 0 if check("Schema version 11 (Anki 2.1)", schema_ver == 11,
+                               f"got {schema_ver}") else 1
+
+        note_count = conn.execute("SELECT COUNT(*) FROM notes").fetchone()[0]
+        card_count = conn.execute("SELECT COUNT(*) FROM cards").fetchone()[0]
+        failures += 0 if check("Notes present", note_count > 0, f"{note_count:,} notes") else 1
+        failures += 0 if check("Cards present", card_count > 0, f"{card_count:,} cards") else 1
+
+        # Determine expected cards per note from model templates
+        models_json_raw = conn.execute("SELECT models FROM col").fetchone()[0]
+        models_raw = json.loads(models_json_raw)
+        tmpl_counts = [len(m["tmpls"]) for m in models_raw.values()]
+        expected_ratio = tmpl_counts[0] if len(set(tmpl_counts)) == 1 else None
+        if expected_ratio:
+            failures += 0 if check(
+                f"{expected_ratio} card(s) per note",
+                card_count == note_count * expected_ratio,
+                f"{note_count} notes × {expected_ratio} = {note_count * expected_ratio}, got {card_count}",
+            ) else 1
+
+        # Duplicate GUIDs
+        dup_guids = conn.execute(
+            "SELECT guid, COUNT(*) c FROM notes GROUP BY guid HAVING c > 1"
+        ).fetchall()
+        failures += 0 if check("No duplicate GUIDs", len(dup_guids) == 0,
+                               f"{len(dup_guids)} duplicates") else 1
+
+        # Card queue states
+        queues = conn.execute(
+            "SELECT type, queue, COUNT(*) FROM cards GROUP BY type, queue"
+        ).fetchall()
+        queue_map = {(t, q): cnt for t, q, cnt in queues}
+        new_cards = queue_map.get((0, 0), 0)
+        suspended = queue_map.get((0, -1), 0) + queue_map.get((1, -1), 0) + queue_map.get((2, -1), 0)
+        if new_cards > 0:
+            check(f"Cards in new queue (type=0, queue=0)", True, f"{new_cards:,}")
+        if suspended > 0:
+            warn("Suspended cards", f"{suspended:,}")
+
+        # dconf — new card order
+        dconf_json = conn.execute("SELECT dconf FROM col").fetchone()[0]
+        dconf = json.loads(dconf_json)
+        orders = {dc.get("new", {}).get("order") for dc in dconf.values() if isinstance(dc, dict)}
+        per_days = {dc.get("new", {}).get("perDay") for dc in dconf.values() if isinstance(dc, dict)}
+        check("new.order configured", bool(orders), f"{orders}")
+        if per_days:
+            check("new.perDay > 0", all(p and p > 0 for p in per_days if p is not None),
+                  f"perDay={per_days}")
+
+        # Deck assignment
+        decks_json = conn.execute("SELECT decks FROM col").fetchone()[0]
+        decks = json.loads(decks_json)
+        real_decks = {did: d for did, d in decks.items() if did != "1"}
+        if real_decks:
+            check("Custom deck exists (not Default only)", True,
+                  ", ".join(d["name"] for d in real_decks.values()))
+            # All cards in the custom deck?
+            for did_str in real_decks:
+                assigned = conn.execute(
+                    "SELECT COUNT(*) FROM cards WHERE did=?", [int(did_str)]
+                ).fetchone()[0]
+                check(f"Cards in deck '{real_decks[did_str]['name']}'", assigned > 0,
+                      f"{assigned:,}/{card_count:,}")
+
+        # --- Sound references vs media manifest ---
+        print("\n[Sound references]")
+        notes_flds = conn.execute("SELECT flds FROM notes").fetchall()
+        sound_refs: set[str] = set()
+        for (flds,) in notes_flds:
+            for ref in re.findall(r"\[sound:([^\]]+)\]", flds):
+                sound_refs.add(ref)
+
+        missing_audio = sound_refs - original_names
+        orphaned_audio = original_names - sound_refs - set(font_files)
+        failures += 0 if check("All sound refs in media manifest", len(missing_audio) == 0,
+                               f"{len(missing_audio)} missing" if missing_audio else "") else 1
+        if orphaned_audio:
+            warn("Media files not referenced by any card", f"{len(orphaned_audio)} orphaned")
+
+        notes_with_audio = sum(
+            1 for (flds,) in notes_flds if "[sound:" in flds
+        )
+        pct = notes_with_audio / note_count * 100 if note_count else 0
+        check(f"Notes with audio", notes_with_audio > 0, f"{notes_with_audio:,}/{note_count:,} ({pct:.0f}%)")
+
+        # --- Empty fields check ---
+        print("\n[Field content]")
+        models = models_raw
+        for mid_str, model in models.items():
+            field_names = [f["name"] for f in model["flds"]]
+            # Check required fields (first 3) are not empty
+            required_idx = list(range(min(3, len(field_names))))
+            for idx in required_idx:
+                fname = field_names[idx]
+                empty_count = conn.execute(
+                    """SELECT COUNT(*) FROM notes
+                       WHERE mid=? AND (
+                           flds LIKE ? OR
+                           instr(flds, char(31)) = 0
+                       )""",
+                    [int(mid_str), "\x1f" * idx + "\x1f%"],
+                ).fetchone()[0]
+                # Simpler: count notes where field idx is empty
+                all_notes_for_model = conn.execute(
+                    "SELECT flds FROM notes WHERE mid=?", [int(mid_str)]
+                ).fetchall()
+                empty = sum(
+                    1 for (flds,) in all_notes_for_model
+                    if len(flds.split("\x1f")) <= idx or not flds.split("\x1f")[idx].strip()
+                )
+                if empty > 0:
+                    warn(f"Model '{model['name']}' field '{fname}' empty in {empty} notes")
+                else:
+                    check(f"Model '{model['name']}' field '{fname}' populated", True)
+
+        conn.close()
+
+    print()
+    return failures
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(description="Validate Pealim .apkg files")
+    parser.add_argument("path", nargs="?", help="Path to .apkg file (validates both if omitted)")
+    group = parser.add_mutually_exclusive_group()
+    group.add_argument("--vocab", action="store_true", help="Validate vocabulary deck only")
+    group.add_argument("--conjugations", action="store_true", help="Validate conjugation deck only")
+    args = parser.parse_args()
+
+    targets: list[Path] = []
+    if args.path:
+        targets = [Path(args.path)]
+    elif args.vocab:
+        targets = [VOCAB_APKG]
+    elif args.conjugations:
+        targets = [CONJ_APKG]
+    else:
+        targets = [VOCAB_APKG, CONJ_APKG]
+
+    total_failures = 0
+    for path in targets:
+        total_failures += validate_apkg(path)
+
+    print(f"\n{'='*60}")
+    if total_failures == 0:
+        print(f"  {PASS}  All checks passed")
+    else:
+        print(f"  {FAIL}  {total_failures} check(s) failed")
+    print(f"{'='*60}\n")
+
+    sys.exit(0 if total_failures == 0 else 1)
+
+
+if __name__ == "__main__":
+    main()