From ccd7d61efbc3d91b9ac05e1ff52bad95c561660a Mon Sep 17 00:00:00 2001 From: Sochen Date: Thu, 5 Mar 2026 20:58:06 +0000 Subject: [PATCH] Add 6-variant release build (4 vocab + 2 conj), bump to v0.12 - build_vocab_deck(): include_audio/include_images flags - build_conj_deck(): include_audio flag - build_all_variants(): builds all 6 apkg files in one call - Variants: hebrew_vocabulary{,_audio,_images,_audio_images}.apkg hebrew_conjugations{,_audio}.apkg - run.py: step_build_all() replaces step_build_vocab(); conjugation extraction reuses cached conjugations.json unless refreshed - RELEASE_TAG bumped to v0.12 Co-Authored-By: Claude Sonnet 4.6 --- apkg_builder.py | 76 +++++++++++++++++++++++++++++++++++++++++-------- run.py | 67 ++++++++++++++++++++++++------------------- 2 files changed, 102 insertions(+), 41 deletions(-) diff --git a/apkg_builder.py b/apkg_builder.py index 3a70619..1572995 100644 --- a/apkg_builder.py +++ b/apkg_builder.py @@ -28,7 +28,7 @@ CONJ_MODEL_ID = 1_234_567_893 # Release version tag added to all notes so users can identify which release # their cards come from (visible in Anki's Browse view and card info). -RELEASE_TAG = "v0.11" +RELEASE_TAG = "v0.12" # Regex for extracting emoji and Hebrew prepositions from meaning strings EMOJI_RE = re.compile(r'[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+') @@ -39,8 +39,12 @@ AUDIO_DIR = DATA_DIR / "audio" AUDIO_CONJ_DIR = DATA_DIR / "audio_conj" OUTPUT_DIR = Path(__file__).parent / "output" -VOCAB_APKG = OUTPUT_DIR / "hebrew_vocabulary.apkg" -CONJ_APKG = OUTPUT_DIR / "hebrew_conjugations.apkg" +VOCAB_APKG = OUTPUT_DIR / "hebrew_vocabulary.apkg" +VOCAB_APKG_AUDIO = OUTPUT_DIR / "hebrew_vocabulary_audio.apkg" +VOCAB_APKG_IMAGES = OUTPUT_DIR / "hebrew_vocabulary_images.apkg" +VOCAB_APKG_AUDIO_IMAGES = OUTPUT_DIR / "hebrew_vocabulary_audio_images.apkg" +CONJ_APKG = OUTPUT_DIR / "hebrew_conjugations.apkg" +CONJ_APKG_AUDIO = OUTPUT_DIR / "hebrew_conjugations_audio.apkg" # ────────────────────────────────────────────────────────────────────────────── # Binyan → Hebrew label mapping (for conjugation card display) @@ -410,6 +414,8 @@ def build_vocab_deck( freq_cache: Optional[dict] = None, image_cache: Optional[dict] = None, limit: Optional[int] = None, + include_audio: bool = True, + include_images: bool = True, ) -> tuple[genanki.Deck, list[Path]]: """ Build the vocabulary deck from pealim_dict_for_anki.csv (or pealim_dict.csv). @@ -505,7 +511,7 @@ def build_vocab_deck( pos_heb = _translate_pos(pos_raw) if pos_raw else "" # Audio - audio_tag = _audio_tag(word_no_nik) + audio_tag = _audio_tag(word_no_nik) if include_audio else "" if audio_tag: mp3_name = audio_tag.removeprefix("[sound:").removesuffix("]") mp3_path = AUDIO_DIR / mp3_name @@ -538,14 +544,15 @@ def build_vocab_deck( related_html = "\n".join(parts) # Image: look up by stripped word (no-nikkud) - image_filename = image_cache.get(word_no_nik or _strip_nikkud(word), None) image_tag = "" - if image_filename: - image_path = images_dir / image_filename - if image_path.exists(): - image_tag = image_filename - if image_path not in media_files: - media_files.append(image_path) + if include_images: + image_filename = image_cache.get(word_no_nik or _strip_nikkud(word), None) + if image_filename: + image_path = images_dir / image_filename + if image_path.exists(): + image_tag = image_filename + if image_path not in media_files: + media_files.append(image_path) note = genanki.Note( model=VOCAB_MODEL, @@ -598,6 +605,7 @@ def build_vocab_deck( def build_conj_deck( conjugations: dict, audio_dir: Path = AUDIO_CONJ_DIR, + include_audio: bool = True, ) -> tuple[genanki.Deck, list[Path]]: """Build the conjugation drill deck from conjugations.json data.""" deck = genanki.Deck(CONJ_DECK_ID, "Hebrew Conjugations") @@ -659,7 +667,7 @@ def build_conj_deck( # Audio tag: use downloaded file if present audio_tag = "" - if slug: + if include_audio and slug: audio_tag = _conj_audio_tag(slug, form_key) if audio_tag: mp3_path = audio_dir / f"{slug}_{form_key}.mp3" @@ -756,6 +764,50 @@ def write_conj_apkg( logger.info(f"Conjugation deck written → {out_path}") +def build_all_variants( + dict_csv: Path, + conjugations: dict, + examples_cache: Optional[dict] = None, + freq_cache: Optional[dict] = None, + image_cache: Optional[dict] = None, + limit: Optional[int] = None, +) -> None: + """Build all 6 release variants (4 vocab + 2 conj) into output/.""" + logger.info("Building all release variants …") + + vocab_variants = [ + (False, False, VOCAB_APKG), + (True, False, VOCAB_APKG_AUDIO), + (False, True, VOCAB_APKG_IMAGES), + (True, True, VOCAB_APKG_AUDIO_IMAGES), + ] + for audio, images, path in vocab_variants: + label = f"audio={'yes' if audio else 'no'} images={'yes' if images else 'no'}" + logger.info(f" Vocab variant: {label} → {path.name}") + deck, media = build_vocab_deck( + dict_csv, + examples_cache=examples_cache, + freq_cache=freq_cache, + image_cache=image_cache or {}, + limit=limit, + include_audio=audio, + include_images=images, + ) + write_vocab_apkg(deck, media, out_path=path) + + conj_variants = [ + (False, CONJ_APKG), + (True, CONJ_APKG_AUDIO), + ] + for audio, path in conj_variants: + label = f"audio={'yes' if audio else 'no'}" + logger.info(f" Conj variant: {label} → {path.name}") + deck, media = build_conj_deck(conjugations, include_audio=audio) + write_conj_apkg(deck, media, out_path=path) + + logger.info("All variants built.") + + if __name__ == "__main__": logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s") diff --git a/run.py b/run.py index 70b0424..819f440 100644 --- a/run.py +++ b/run.py @@ -358,9 +358,9 @@ def step_images(args) -> dict: return image_fetch.run(limit=limit) -def step_build_vocab(args, examples_cache: dict, freq_cache: dict, image_cache: dict | None = None): - """Step 5 — build vocabulary .apkg.""" - logger.info("[5] Building vocabulary deck …") +def step_build_all(args, examples_cache: dict, freq_cache: dict, conjugations: dict | None, image_cache: dict | None = None): + """Step 5 — build all 6 release variants (4 vocab + 2 conj).""" + logger.info("[5] Building all deck variants …") import apkg_builder dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv" @@ -371,20 +371,18 @@ def step_build_vocab(args, examples_cache: dict, freq_cache: dict, image_cache: if not dict_csv.exists(): dict_csv = DATA_DIR / "pealim_dict.csv" - deck, media = apkg_builder.build_vocab_deck( + apkg_builder.build_all_variants( dict_csv, + conjugations=conjugations or {}, examples_cache=examples_cache, freq_cache=freq_cache, image_cache=image_cache or {}, limit=args.test, ) - apkg_builder.write_vocab_apkg(deck, media) - logger.info(f" Vocabulary .apkg → {apkg_builder.VOCAB_APKG}") - return deck def step_conjugations(args): - """Step 6 — extract conjugations and build conjugation deck.""" + """Step 6 — extract conjugations (returns data; building handled by step_build_all).""" if args.skip_conjugations: logger.info("[6] Skipping conjugations (--skip-conjugations)") return None @@ -394,18 +392,21 @@ def step_conjugations(args): logger.info("[6] verbs_input.txt not found — skipping conjugation deck") return None - logger.info("[6] Extracting verb conjugations …") - import conjugation_extract - conjugations = conjugation_extract.main(verbs_file) + # Use cached conjugations.json if available (skip re-extraction) + conj_cache = DATA_DIR / "conjugations.json" + if conj_cache.exists() and not getattr(args, 'refresh_conjugations', False): + logger.info("[6] Using cached conjugations.json …") + with open(conj_cache) as f: + import json as _json + conjugations = _json.load(f) + else: + logger.info("[6] Extracting verb conjugations …") + import conjugation_extract + conjugations = conjugation_extract.main(verbs_file) # Download conjugation audio step_conj_audio(args, conjugations) - import apkg_builder - conj_deck, conj_media = apkg_builder.build_conj_deck(conjugations) - apkg_builder.write_conj_apkg(conj_deck, conj_media) - logger.info(f" Conjugation .apkg → {apkg_builder.CONJ_APKG}") - return conjugations @@ -453,17 +454,18 @@ def print_summary(args, examples_cache, freq_cache, conjugations): found_imgs = sum(1 for v in ic.values() if v) logger.info(f" Images: {found_imgs}/{len(ic)} nouns with images") - vocab_apkg = OUTPUT_DIR / "hebrew_vocabulary.apkg" - conj_apkg = OUTPUT_DIR / "hebrew_conjugations.apkg" - if vocab_apkg.exists(): - size_mb = vocab_apkg.stat().st_size / 1e6 - logger.info(f" Vocabulary .apkg: {size_mb:.1f} MB → {vocab_apkg}") - if conj_apkg.exists(): - size_mb = conj_apkg.stat().st_size / 1e6 - logger.info(f" Conjugation .apkg: {size_mb:.1f} MB → {conj_apkg}") - if conjugations: - verb_count = sum(1 for v in conjugations.values() if v) - logger.info(f" Verbs in conjugation deck: {verb_count}") + import apkg_builder as _ab + all_apkgs = [ + _ab.VOCAB_APKG, _ab.VOCAB_APKG_AUDIO, _ab.VOCAB_APKG_IMAGES, _ab.VOCAB_APKG_AUDIO_IMAGES, + _ab.CONJ_APKG, _ab.CONJ_APKG_AUDIO, + ] + for apkg in all_apkgs: + if apkg.exists(): + size_mb = apkg.stat().st_size / 1e6 + logger.info(f" {apkg.name}: {size_mb:.1f} MB") + if conjugations: + verb_count = sum(1 for v in conjugations.values() if v) + logger.info(f" Verbs in conjugation deck: {verb_count}") logger.info("=" * 60) logger.info("DONE") @@ -485,6 +487,13 @@ def main(): if args.only == "conjugations": step_fonts(args) conjugations = step_conjugations(args) + if conjugations: + import apkg_builder + apkg_builder.build_all_variants( + DATA_DIR / "hebrew_dict_for_anki.csv", + conjugations=conjugations, + limit=args.test, + ) print_summary(args, {}, {}, conjugations or {}) return @@ -497,8 +506,8 @@ def main(): step_audio(args) step_fonts(args) image_cache = step_images(args) - step_build_vocab(args, examples_cache, freq_cache, image_cache) - conjugations = step_conjugations(args) + conjugations = step_conjugations(args) + step_build_all(args, examples_cache, freq_cache, conjugations, image_cache) print_summary(args, examples_cache, freq_cache, conjugations or {})