#!/usr/bin/env python3
"""Download audio files from URLs stored in words.json.

Three audio categories are handled:

1. Vocab audio  → data/audio/{audio_file}
2. Noun plural  → data/audio/{slug}_plural.mp3
3. Conjugation  → data/audio_conj/{slug}_{form_key}.mp3
                  data/audio_conj/{slug}_passive_{form_key}.mp3
"""

import argparse
import json
import logging
import re
import time
from pathlib import Path

import requests

logger = logging.getLogger(__name__)

DATA_DIR = Path(__file__).parent / "data"
AUDIO_DIR = DATA_DIR / "audio"
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
WORDS_JSON = DATA_DIR / "words.json"

DOWNLOAD_DELAY = 0.3
MAX_RETRIES = 3

# Map Hebrew tense names to English prefixes for form_key construction.
# "מְקוֹר" (infinitive) is included for forward compatibility; it does not
# appear in the current dataset but the form_key collapses to bare "infinitive".
TENSE_TO_PREFIX = {
    "הוֹוֶה": "present",
    "עָבָר": "past",
    "עָתִיד": "future",
    "צִוּוּי": "imperative",
    "מְקוֹר": "infinitive",
}

# Matches every character outside the Hebrew letter range U+05D0..U+05EA
# (this also removes niqqud, spaces and punctuation).
_NON_HEBREW_LETTER_RE = re.compile(r"[^\u05d0-\u05ea]")


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _make_audio_file(entry: dict) -> str:
    """Derive the vocab audio filename when audio_file is absent.

    Slug-based for confusable entries (slug contains the disambiguating ID),
    consonant-only for all others.

    Args:
        entry: A words.json entry dict.

    Returns:
        Filename string, e.g. ``"1234-shalom.mp3"`` or ``"שלום.mp3"``.
    """
    audio_file = entry.get("audio_file", "")
    if audio_file:
        return audio_file

    # Fallback: use slug for confusables, ktiv_male for others.
    if entry.get("confusable_group"):
        return f"{entry.get('slug', '')}.mp3"

    # ``or`` guards against keys that are present but null in the JSON;
    # a plain ``.get(key, default)`` would hand back None in that case.
    word = entry.get("word") or {}
    ktiv_male = word.get("ktiv_male") or ""
    safe_name = _NON_HEBREW_LETTER_RE.sub("", ktiv_male)
    return f"{safe_name}.mp3"


def _form_key(person: str, tense: str) -> str:
    """Build a form key from person and tense fields.

    Tenses not listed in ``TENSE_TO_PREFIX`` are used verbatim as the prefix.

    Args:
        person: Person code, e.g. ``"1s"``, ``"3fp"``, ``"ms"``.
        tense: Hebrew tense string from the conjugation form.

    Returns:
        Form key such as ``"past_1s"`` or ``"present_ms"``. Infinitive tense
        always returns ``"infinitive"`` (no person suffix).
    """
    prefix = TENSE_TO_PREFIX.get(tense, tense)
    if prefix == "infinitive":
        return "infinitive"
    return f"{prefix}_{person}"


def _write_atomic(dest: Path, payload: bytes) -> None:
    """Write *payload* to *dest* so that *dest* never holds a partial file.

    The bytes go to a sibling ``.part`` file first and are then renamed over
    *dest*. Callers use ``dest.exists()`` as their cache test, so a truncated
    file left behind by an interrupted write would otherwise be treated as a
    valid cached download on every later run.

    Args:
        dest: Final path of the file.
        payload: Complete file content.

    Raises:
        OSError: If writing or renaming fails.
    """
    tmp = dest.with_name(f"{dest.name}.part")
    try:
        tmp.write_bytes(payload)
        tmp.replace(dest)
    finally:
        # After a successful rename the temp file is gone; this only removes
        # leftovers when the write or the rename failed.
        tmp.unlink(missing_ok=True)


def _download(url: str, dest: Path, session: requests.Session) -> bool:
    """Download *url* to *dest*, retrying up to MAX_RETRIES times.

    Skips the download silently if *dest* already exists. Failed attempts
    are followed by an exponential back-off of ``2**attempt`` seconds.

    Args:
        url: HTTP(S) URL to download.
        dest: Local path to write the file to.
        session: Shared requests session.

    Returns:
        ``True`` if the file was downloaded (or already existed),
        ``False`` if all retries were exhausted.

    Raises:
        OSError: If the file cannot be written to disk.
    """
    if dest.exists():
        return True

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = session.get(url, timeout=15)
            resp.raise_for_status()
            payload = resp.content
        except requests.RequestException as exc:
            if attempt < MAX_RETRIES:
                wait = 2**attempt
                logger.warning(
                    "Attempt %d/%d failed for %s (%s) — retrying in %ds",
                    attempt,
                    MAX_RETRIES,
                    url,
                    exc,
                    wait,
                )
                time.sleep(wait)
            else:
                logger.error("All %d attempts failed for %s: %s", MAX_RETRIES, url, exc)
            continue

        _write_atomic(dest, payload)
        logger.debug("Downloaded %s → %s", url, dest.name)
        return True

    return False


# ---------------------------------------------------------------------------
# Per-category downloaders
# ---------------------------------------------------------------------------


def download_vocab_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int, int]:
    """Download vocabulary audio files.

    Destination: ``data/audio/{audio_file}``, where the filename comes from
    ``_make_audio_file``.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached, no_url) counts. ``no_url`` also
        includes entries whose download failed on every retry.
    """
    downloaded = cached = no_url = 0
    for entry in entries:
        url: str | None = entry.get("audio_url")
        if not url:
            no_url += 1
            continue

        # _make_audio_file already prefers the entry's own audio_file value.
        dest = AUDIO_DIR / _make_audio_file(entry)
        if dest.exists():
            cached += 1
            continue

        if _download(url, dest, session):
            downloaded += 1
            time.sleep(DOWNLOAD_DELAY)
        else:
            no_url += 1  # count persistent failures alongside missing URLs
    return downloaded, cached, no_url


def download_noun_plural_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int]:
    """Download noun plural audio files.

    Destination: ``data/audio/{slug}_plural.mp3``

    Entries without a dict-valued ``noun_inflection``, or whose
    ``plural_audio`` value does not start with ``"http"``, are skipped.
    Failed downloads are logged by ``_download`` but not counted here.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached) counts.

    Raises:
        KeyError: If an entry with plural audio has no ``slug``.
    """
    downloaded = cached = 0
    for entry in entries:
        ni = entry.get("noun_inflection")
        if not ni or not isinstance(ni, dict):
            continue

        url: str | None = ni.get("plural_audio")
        if not url or not url.startswith("http"):
            continue

        slug: str = entry["slug"]
        dest = AUDIO_DIR / f"{slug}_plural.mp3"
        if dest.exists():
            cached += 1
            continue

        if _download(url, dest, session):
            downloaded += 1
            time.sleep(DOWNLOAD_DELAY)
    return downloaded, cached


def download_conjugation_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int, int]:
    """Download conjugation form audio files.

    Active forms  → ``data/audio_conj/{slug}_{form_key}.mp3``
    Passive forms → ``data/audio_conj/{slug}_passive_{form_key}.mp3``

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached, failed) counts.

    Raises:
        KeyError: If an entry with a conjugation has no ``slug``.
    """
    downloaded = cached = failed = 0
    for entry in entries:
        conj = entry.get("conjugation")
        if not conj:
            continue

        slug: str = entry["slug"]
        # (filename prefix, forms) pairs; ``or []`` covers null-valued keys.
        form_sets: list[tuple[str, list]] = [
            ("", conj.get("active_forms") or []),
            ("passive_", conj.get("hufal_pual_forms") or []),
        ]
        for prefix, forms in form_sets:
            for form in forms:
                url: str | None = form.get("audio_url")
                if not url:
                    continue

                key = _form_key(form.get("person", ""), form.get("tense", ""))
                dest = AUDIO_CONJ_DIR / f"{slug}_{prefix}{key}.mp3"
                if dest.exists():
                    cached += 1
                    continue

                if _download(url, dest, session):
                    downloaded += 1
                    time.sleep(DOWNLOAD_DELAY)
                else:
                    failed += 1
    return downloaded, cached, failed


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------


def main() -> None:
    """Parse CLI args and run the audio download pipeline.

    Noun plural audio is always processed; vocab and conjugation audio can
    be skipped with ``--skip-vocab`` and ``--skip-conj``.
    """
    parser = argparse.ArgumentParser(description="Download Pealim audio files from words.json URLs.")
    parser.add_argument(
        "--skip-vocab",
        action="store_true",
        help="Skip vocabulary audio downloads.",
    )
    parser.add_argument(
        "--skip-conj",
        action="store_true",
        help="Skip conjugation audio downloads.",
    )
    parser.add_argument(
        "--test",
        metavar="N",
        type=int,
        default=None,
        help="Limit processing to the first N words.json entries.",
    )
    args = parser.parse_args()

    # A negative N would silently drop the *last* |N| entries via slicing
    # instead of limiting the run, so reject it up front.
    if args.test is not None and args.test < 0:
        parser.error("--test must be a non-negative integer")

    logging.basicConfig(
        level=logging.INFO,
        format="%(message)s",
    )

    AUDIO_DIR.mkdir(parents=True, exist_ok=True)
    AUDIO_CONJ_DIR.mkdir(parents=True, exist_ok=True)

    with open(WORDS_JSON, encoding="utf-8") as fh:
        raw: dict[str, dict] = json.load(fh)

    entries = list(raw.values())
    if args.test is not None:
        entries = entries[: args.test]

    logger.info("[4] Downloading audio files …")

    # The context manager closes the session's pooled connections on exit,
    # including when a downloader raises.
    with requests.Session() as session:
        session.headers["User-Agent"] = "Mozilla/5.0 (compatible; PealimAnkiDeck/1.0; audio-fetch)"

        # --- Vocab ---
        if not args.skip_vocab:
            v_dl, v_cached, v_no_url = download_vocab_audio(entries, session)
        else:
            v_dl = v_cached = v_no_url = 0

        # --- Noun plural ---
        np_dl, np_cached = download_noun_plural_audio(entries, session)

        # --- Conjugation ---
        if not args.skip_conj:
            c_dl, c_cached, c_failed = download_conjugation_audio(entries, session)
        else:
            c_dl = c_cached = c_failed = 0

    # --- Summary ---
    if not args.skip_vocab:
        logger.info(
            " Vocab: %d downloaded, %d cached, %d no URL",
            v_dl,
            v_cached,
            v_no_url,
        )
    logger.info(" Noun plural: %d downloaded, %d cached", np_dl, np_cached)
    if not args.skip_conj:
        failed_msg = f", {c_failed} failed" if c_failed else ""
        logger.info(
            " Conjugation: %d downloaded, %d cached%s",
            c_dl,
            c_cached,
            failed_msg,
        )


if __name__ == "__main__":
    main()