hebrew_flash_cards/pealim_audio_download.py
Sochen b2fef5aa8a Sprint 11.1: strip_nikkud cleanup, dead code removal, test fixes
Remove strip_nikkud from all pipeline files — use ktiv_male directly.
Fix case-insensitive binyan matching in detail scraper (og:description
uses UPPERCASE). Fix integration test slugs and test limits. Delete
legacy CSVs, stale .apkg, and dead scripts from git. Add vulture to
pre-commit hook.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 04:03:47 +00:00

348 lines
9.7 KiB
Python

#!/usr/bin/env python3
"""Download audio files from URLs stored in words.json.
Three audio categories are handled:

1. Vocab audio  → data/audio/{audio_file}
2. Noun plural  → data/audio/{slug}_plural.mp3
3. Conjugation  → data/audio_conj/{slug}_{form_key}.mp3          (active forms)
                  data/audio_conj/{slug}_passive_{form_key}.mp3  (passive forms)
"""
import argparse
import json
import logging
import re
import time
from pathlib import Path
import requests
# Module-level logger; its level and format are configured in main().
logger = logging.getLogger(__name__)

# All paths are anchored at the "data" directory next to this file.
DATA_DIR = Path(__file__).parent / "data"
AUDIO_DIR = DATA_DIR / "audio"  # vocab and noun-plural audio
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"  # conjugation-form audio
WORDS_JSON = DATA_DIR / "words.json"  # input file holding the audio URLs

DOWNLOAD_DELAY = 0.3  # seconds slept after each successful download
MAX_RETRIES = 3  # attempts per URL before _download() gives up

# Map Hebrew tense names to English prefixes for form_key construction.
# "מְקוֹר" (infinitive) is included for forward compatibility; it does not
# appear in the current dataset but the form_key collapses to bare "infinitive".
TENSE_TO_PREFIX = {
    "הוֹוֶה": "present",
    "עָבָר": "past",
    "עָתִיד": "future",
    "צִוּוּי": "imperative",
    "מְקוֹר": "infinitive",
}
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_audio_file(entry: dict) -> str:
    """Return the vocab audio filename for a words.json entry.

    A non-empty ``audio_file`` value on the entry always wins. Otherwise the
    name is derived: entries with a truthy ``confusable_group`` are named
    after their ``slug``, and every other entry is named after its
    ``word.ktiv_male`` spelling with everything outside the Hebrew letter
    range U+05D0–U+05EA removed.

    Args:
        entry: A words.json entry dict.

    Returns:
        Filename ending in ``.mp3``, e.g. ``"1234-shalom.mp3"`` or
        ``"שלום.mp3"``. When the source field is missing or empty the stem
        is empty and the result is just ``".mp3"``.
    """
    explicit_name = entry.get("audio_file", "")
    if explicit_name:
        return explicit_name

    if entry.get("confusable_group"):
        # Confusable entries share a spelling, so only the slug is unique.
        stem = entry.get("slug", "")
    else:
        spelling = entry.get("word", {}).get("ktiv_male", "")
        # Keep only the letters alef..tav.
        stem = re.sub(r"[^\u05d0-\u05ea]", "", spelling)
    return f"{stem}.mp3"
def _form_key(person: str, tense: str) -> str:
    """Combine a tense and a person code into a conjugation form key.

    The tense is translated through ``TENSE_TO_PREFIX``; a tense that is not
    in the map is used unchanged as the prefix.

    Args:
        person: Person code, e.g. ``"1s"``, ``"3fp"``, ``"ms"``.
        tense: Hebrew tense string from the conjugation form.

    Returns:
        ``"{prefix}_{person}"``, such as ``"past_1s"`` or ``"present_ms"``.
        The infinitive has no person suffix and is always ``"infinitive"``.
    """
    tense_label = TENSE_TO_PREFIX.get(tense, tense)
    return "infinitive" if tense_label == "infinitive" else f"{tense_label}_{person}"
def _download(url: str, dest: Path, session: requests.Session) -> bool:
    """Download *url* to *dest*, retrying up to MAX_RETRIES times.

    Skips the download silently if *dest* already exists. The response body
    is first written to a sibling ``<name>.part`` file and then renamed onto
    *dest*, so an interrupted write never leaves a truncated file that a
    later run would mistake for a finished download.

    Args:
        url: HTTP(S) URL to download.
        dest: Local path to write the file to.
        session: Shared requests session.

    Returns:
        ``True`` if the file was downloaded (or already existed),
        ``False`` if all retries were exhausted.

    Raises:
        OSError: If writing or renaming the file fails; only request errors
            are retried.
    """
    if dest.exists():
        return True

    # Stage next to dest so the final rename stays within one directory.
    partial = dest.with_name(f"{dest.name}.part")

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = session.get(url, timeout=15)
            resp.raise_for_status()
            payload = resp.content
        except requests.RequestException as exc:
            if attempt < MAX_RETRIES:
                wait = 2**attempt  # exponential backoff: 2s, 4s, ...
                logger.warning(
                    "Attempt %d/%d failed for %s (%s) — retrying in %ds",
                    attempt,
                    MAX_RETRIES,
                    url,
                    exc,
                    wait,
                )
                time.sleep(wait)
            else:
                logger.error("All %d attempts failed for %s: %s", MAX_RETRIES, url, exc)
            continue

        try:
            partial.write_bytes(payload)
            partial.replace(dest)
        finally:
            # No-op after a successful rename; removes the leftover on error.
            partial.unlink(missing_ok=True)
        logger.debug("Downloaded %s → %s", url, dest.name)
        return True

    return False
# ---------------------------------------------------------------------------
# Per-category downloaders
# ---------------------------------------------------------------------------
def download_vocab_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int, int]:
    """Fetch the vocabulary audio file of every entry into ``AUDIO_DIR``.

    The target filename is the entry's ``audio_file`` value, or the name
    derived by ``_make_audio_file`` when that field is missing or empty.
    Files already on disk are counted as cached and not requested again.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached, no_url) counts. ``no_url`` covers both
        entries without an ``audio_url`` and downloads that failed after all
        retries.
    """
    fetched = already_present = skipped = 0

    for item in entries:
        source_url = item.get("audio_url")
        if not source_url:
            skipped += 1
            continue

        filename = item.get("audio_file") or _make_audio_file(item)
        target = AUDIO_DIR / filename

        if target.exists():
            already_present += 1
        elif _download(source_url, target, session):
            fetched += 1
            time.sleep(DOWNLOAD_DELAY)
        else:
            # Persistent failures are reported together with missing URLs.
            skipped += 1

    return fetched, already_present, skipped
def download_noun_plural_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int]:
    """Fetch plural-form audio for entries that carry a noun inflection.

    Destination: ``data/audio/{slug}_plural.mp3``

    Only entries whose ``noun_inflection`` is a non-empty dict with a
    ``plural_audio`` value starting with ``"http"`` are considered. Failed
    downloads are not counted in either total.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached) counts.

    Raises:
        KeyError: If an entry with a usable plural URL has no ``slug``.
    """
    fetched = already_present = 0

    for item in entries:
        inflection = item.get("noun_inflection")
        if not isinstance(inflection, dict) or not inflection:
            continue

        plural_url = inflection.get("plural_audio")
        if not (plural_url and plural_url.startswith("http")):
            continue

        target = AUDIO_DIR / f"{item['slug']}_plural.mp3"
        if target.exists():
            already_present += 1
        elif _download(plural_url, target, session):
            fetched += 1
            time.sleep(DOWNLOAD_DELAY)

    return fetched, already_present
def download_conjugation_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int, int]:
    """Fetch audio for every conjugated form that has an ``audio_url``.

    Active forms  → ``data/audio_conj/{slug}_{form_key}.mp3``
    Passive forms → ``data/audio_conj/{slug}_passive_{form_key}.mp3``

    Active forms are read from the conjugation's ``active_forms`` list and
    passive forms from ``hufal_pual_forms``; a missing or empty list is
    treated as no forms. The form key is built by ``_form_key`` from each
    form's ``person`` and ``tense`` fields.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached, failed) counts.

    Raises:
        KeyError: If an entry with a conjugation has no ``slug``.
    """
    fetched = already_present = errors = 0

    for item in entries:
        conjugation = item.get("conjugation")
        if not conjugation:
            continue

        slug = item["slug"]
        voices = (
            ("", conjugation.get("active_forms") or []),
            ("passive_", conjugation.get("hufal_pual_forms") or []),
        )
        # Flatten to (filename prefix, form) pairs, active forms first.
        tagged_forms = ((voice, form) for voice, forms in voices for form in forms)

        for voice, form in tagged_forms:
            source_url = form.get("audio_url")
            if not source_url:
                continue

            key = _form_key(form.get("person", ""), form.get("tense", ""))
            target = AUDIO_CONJ_DIR / f"{slug}_{voice}{key}.mp3"

            if target.exists():
                already_present += 1
            elif _download(source_url, target, session):
                fetched += 1
                time.sleep(DOWNLOAD_DELAY)
            else:
                errors += 1

    return fetched, already_present, errors
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main() -> None:
    """Parse CLI args and run the audio download pipeline.

    Creates the audio output directories, loads ``words.json``, then runs the
    vocab, noun-plural and conjugation downloaders over one shared HTTP
    session and logs a per-category summary. Noun-plural audio is always
    processed; ``--skip-vocab`` and ``--skip-conj`` disable the other two
    categories. ``--test N`` restricts the run to the first N entries and
    rejects a negative N with a usage error.

    Raises:
        OSError: If ``words.json`` cannot be opened or the output
            directories cannot be created.
        json.JSONDecodeError: If ``words.json`` is not valid JSON.
    """
    parser = argparse.ArgumentParser(description="Download Pealim audio files from words.json URLs.")
    parser.add_argument(
        "--skip-vocab",
        action="store_true",
        help="Skip vocabulary audio downloads.",
    )
    parser.add_argument(
        "--skip-conj",
        action="store_true",
        help="Skip conjugation audio downloads.",
    )
    parser.add_argument(
        "--test",
        metavar="N",
        type=int,
        default=None,
        help="Limit processing to the first N words.json entries.",
    )
    args = parser.parse_args()
    if args.test is not None and args.test < 0:
        # A negative slice bound would drop entries from the end instead of
        # keeping the first N, contradicting the option's documented meaning.
        parser.error("--test N must be a non-negative integer")

    logging.basicConfig(
        level=logging.INFO,
        format="%(message)s",
    )

    AUDIO_DIR.mkdir(parents=True, exist_ok=True)
    AUDIO_CONJ_DIR.mkdir(parents=True, exist_ok=True)

    with open(WORDS_JSON, encoding="utf-8") as fh:
        raw: dict[str, dict] = json.load(fh)
    entries = list(raw.values())
    if args.test is not None:
        entries = entries[: args.test]

    logger.info("[4] Downloading audio files …")

    # The context manager closes the session's pooled connections on exit,
    # including when a downloader raises.
    with requests.Session() as session:
        session.headers["User-Agent"] = "Mozilla/5.0 (compatible; PealimAnkiDeck/1.0; audio-fetch)"

        # --- Vocab ---
        if not args.skip_vocab:
            v_dl, v_cached, v_no_url = download_vocab_audio(entries, session)
        else:
            v_dl = v_cached = v_no_url = 0

        # --- Noun plural ---
        np_dl, np_cached = download_noun_plural_audio(entries, session)

        # --- Conjugation ---
        if not args.skip_conj:
            c_dl, c_cached, c_failed = download_conjugation_audio(entries, session)
        else:
            c_dl = c_cached = c_failed = 0

    # --- Summary ---
    if not args.skip_vocab:
        logger.info(
            " Vocab: %d downloaded, %d cached, %d no URL",
            v_dl,
            v_cached,
            v_no_url,
        )
    logger.info(" Noun plural: %d downloaded, %d cached", np_dl, np_cached)
    if not args.skip_conj:
        # Mention failures only when there were any.
        failed_msg = f", {c_failed} failed" if c_failed else ""
        logger.info(
            " Conjugation: %d downloaded, %d cached%s",
            c_dl,
            c_cached,
            failed_msg,
        )
# Run the download pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()