Remove strip_nikkud from all pipeline files — use ktiv_male directly. Fix case-insensitive binyan matching in detail scraper (og:description uses UPPERCASE). Fix integration test slugs and test limits. Delete legacy CSVs, stale .apkg, and dead scripts from git. Add vulture to pre-commit hook. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
348 lines
9.7 KiB
Python
348 lines
9.7 KiB
Python
#!/usr/bin/env python3
|
|
"""Download audio files from URLs stored in words.json.
|
|
|
|
Three audio categories are handled:
  1. Vocab audio  → data/audio/{audio_file}
  2. Noun plural  → data/audio/{slug}_plural.mp3
  3. Conjugation  → data/audio_conj/{slug}_{form_key}.mp3 (active forms)
                    data/audio_conj/{slug}_passive_{form_key}.mp3 (passive forms)
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import re
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Everything is read from / written to the "data" directory beside this script.
DATA_DIR = Path(__file__).parent.joinpath("data")
AUDIO_DIR = DATA_DIR.joinpath("audio")
AUDIO_CONJ_DIR = DATA_DIR.joinpath("audio_conj")
WORDS_JSON = DATA_DIR.joinpath("words.json")

# Pause, in seconds, taken after each successful download.
DOWNLOAD_DELAY = 0.3
# Number of attempts made for one URL before it is reported as failed.
MAX_RETRIES = 3

# Hebrew tense name → English prefix used when building a form_key.
# "מְקוֹר" (infinitive) is kept for forward compatibility: it does not occur in
# the current dataset, and its form_key is the bare word "infinitive".
TENSE_TO_PREFIX = {
    "הוֹוֶה": "present",
    "עָבָר": "past",
    "עָתִיד": "future",
    "צִוּוּי": "imperative",
    "מְקוֹר": "infinitive",
}
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _make_audio_file(entry: dict) -> str:
|
|
"""Derive the vocab audio filename when audio_file is absent.
|
|
|
|
Slug-based for confusable entries (slug contains the disambiguating ID),
|
|
consonant-only for all others.
|
|
|
|
Args:
|
|
entry: A words.json entry dict.
|
|
|
|
Returns:
|
|
Filename string, e.g. ``"1234-shalom.mp3"`` or ``"שלום.mp3"``.
|
|
"""
|
|
audio_file = entry.get("audio_file", "")
|
|
if audio_file:
|
|
return audio_file
|
|
# Fallback: use slug for confusables, ktiv_male for others
|
|
slug = entry.get("slug", "")
|
|
if entry.get("confusable_group"):
|
|
return f"{slug}.mp3"
|
|
ktiv_male = entry.get("word", {}).get("ktiv_male", "")
|
|
safe_name = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male)
|
|
return f"{safe_name}.mp3"
|
|
|
|
|
|
def _form_key(person: str, tense: str) -> str:
    """Combine a tense and a person code into one form key.

    The tense is translated through ``TENSE_TO_PREFIX``; a tense that is not
    in the table is used verbatim as the prefix.

    Args:
        person: Person code, e.g. ``"1s"``, ``"3fp"``, ``"ms"``.
        tense: Hebrew tense string from the conjugation form.

    Returns:
        ``"{prefix}_{person}"``, such as ``"past_1s"`` or ``"present_ms"``.
        When the prefix is ``"infinitive"`` the person is ignored and the
        bare string ``"infinitive"`` is returned.
    """
    label = TENSE_TO_PREFIX.get(tense, tense)
    # The infinitive has a single form, so it carries no person suffix.
    return label if label == "infinitive" else f"{label}_{person}"
|
|
|
|
|
|
def _download(url: str, dest: Path, session: requests.Session) -> bool:
    """Download *url* to *dest*, retrying up to MAX_RETRIES times.

    Skips the download silently if *dest* already exists.  Each request uses
    a 15 second timeout; after a failed attempt that is not the last one the
    function sleeps ``2**attempt`` seconds before trying again.

    The payload is first written to a sibling ``<name>.part`` file and then
    renamed onto *dest*.  Callers treat "file exists" as "already downloaded",
    so *dest* must never exist in a truncated state.

    Args:
        url: HTTP(S) URL to download.
        dest: Local path to write the file to.
        session: Shared requests session.

    Returns:
        ``True`` if the file was downloaded (or already existed),
        ``False`` if all retries were exhausted.

    Raises:
        OSError: If writing or renaming the local file fails (not retried).
    """
    if dest.exists():
        return True

    partial = dest.with_name(dest.name + ".part")

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = session.get(url, timeout=15)
            resp.raise_for_status()
        except requests.RequestException as exc:
            if attempt >= MAX_RETRIES:
                logger.error("All %d attempts failed for %s: %s", MAX_RETRIES, url, exc)
                return False
            # Exponential back-off, only computed when a retry will follow.
            wait = 2**attempt
            logger.warning(
                "Attempt %d/%d failed for %s (%s) — retrying in %ds",
                attempt,
                MAX_RETRIES,
                url,
                exc,
                wait,
            )
            time.sleep(wait)
            continue

        try:
            partial.write_bytes(resp.content)
            # Path.replace is an atomic rename within one directory.
            partial.replace(dest)
        finally:
            # After a successful rename this is a no-op; after a failed write
            # it removes the incomplete staging file.
            partial.unlink(missing_ok=True)

        logger.debug("Downloaded %s → %s", url, dest.name)
        return True

    # Only reachable when MAX_RETRIES is configured as zero or negative.
    return False
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Per-category downloaders
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def download_vocab_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int, int]:
    """Fetch the vocabulary audio file of every entry that has an ``audio_url``.

    The destination is ``AUDIO_DIR / audio_file``; when the entry has no
    ``audio_file`` the name comes from ``_make_audio_file``.  Files already on
    disk are counted as cached and not requested again.  A short pause
    (``DOWNLOAD_DELAY``) follows each successful download.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached, no_url) counts.  ``no_url`` also
        includes entries whose download failed on every retry.
    """
    fetched = 0
    already_present = 0
    skipped = 0

    for item in entries:
        source_url = item.get("audio_url")
        if not source_url:
            skipped += 1
            continue

        filename = item.get("audio_file") or _make_audio_file(item)
        target = AUDIO_DIR / filename

        if target.exists():
            already_present += 1
        elif _download(source_url, target, session):
            fetched += 1
            time.sleep(DOWNLOAD_DELAY)
        else:
            # Persistent failures are reported together with missing URLs.
            skipped += 1

    return fetched, already_present, skipped
|
|
|
|
|
|
def download_noun_plural_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int]:
    """Fetch the plural-form audio of noun entries.

    Only entries whose ``noun_inflection`` is a non-empty dict with a
    ``plural_audio`` value starting with ``"http"`` are considered.

    Destination: ``data/audio/{slug}_plural.mp3``

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached) counts.  Downloads that fail on every
        retry are not included in either count.
    """
    fetched = 0
    already_present = 0

    for item in entries:
        inflection = item.get("noun_inflection")
        if not (inflection and isinstance(inflection, dict)):
            continue

        source_url = inflection.get("plural_audio")
        if not (source_url and source_url.startswith("http")):
            continue

        target = AUDIO_DIR / f"{item['slug']}_plural.mp3"

        if target.exists():
            already_present += 1
        elif _download(source_url, target, session):
            fetched += 1
            time.sleep(DOWNLOAD_DELAY)

    return fetched, already_present
|
|
|
|
|
|
def download_conjugation_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int, int]:
    """Fetch the audio of every conjugation form that has an ``audio_url``.

    Forms are read from ``conjugation["active_forms"]`` and
    ``conjugation["hufal_pual_forms"]``; the latter are stored with a
    ``passive_`` infix.

    Active forms  → ``data/audio_conj/{slug}_{form_key}.mp3``
    Passive forms → ``data/audio_conj/{slug}_passive_{form_key}.mp3``

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached, failed) counts.
    """
    fetched = 0
    already_present = 0
    errors = 0

    for item in entries:
        conj = item.get("conjugation")
        if not conj:
            continue

        slug = item["slug"]
        # (filename infix, forms) for each voice; a missing or null list
        # becomes an empty one.
        voices = (
            ("", conj.get("active_forms") or []),
            ("passive_", conj.get("hufal_pual_forms") or []),
        )

        for infix, forms in voices:
            for form in forms:
                source_url = form.get("audio_url")
                if not source_url:
                    continue

                form_key = _form_key(form.get("person", ""), form.get("tense", ""))
                target = AUDIO_CONJ_DIR / f"{slug}_{infix}{form_key}.mp3"

                if target.exists():
                    already_present += 1
                elif _download(source_url, target, session):
                    fetched += 1
                    time.sleep(DOWNLOAD_DELAY)
                else:
                    errors += 1

    return fetched, already_present, errors
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Entry point
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def main() -> None:
    """Parse CLI args and run the audio download pipeline.

    Steps: configure logging, create the two audio directories, load
    ``words.json`` (optionally limited to the first N entries via ``--test``),
    run the per-category downloaders, then log a summary.  Noun plural audio
    is always downloaded; ``--skip-vocab`` and ``--skip-conj`` only affect
    their own category.
    """
    parser = argparse.ArgumentParser(description="Download Pealim audio files from words.json URLs.")
    parser.add_argument(
        "--skip-vocab",
        action="store_true",
        help="Skip vocabulary audio downloads.",
    )
    parser.add_argument(
        "--skip-conj",
        action="store_true",
        help="Skip conjugation audio downloads.",
    )
    parser.add_argument(
        "--test",
        metavar="N",
        type=int,
        default=None,
        help="Limit processing to the first N words.json entries.",
    )
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(message)s",
    )

    AUDIO_DIR.mkdir(parents=True, exist_ok=True)
    AUDIO_CONJ_DIR.mkdir(parents=True, exist_ok=True)

    with open(WORDS_JSON, encoding="utf-8") as fh:
        raw: dict[str, dict] = json.load(fh)

    entries = list(raw.values())
    if args.test is not None:
        entries = entries[: args.test]

    logger.info("[4] Downloading audio files …")

    # Defaults used when a category is skipped.
    v_dl = v_cached = v_no_url = 0
    c_dl = c_cached = c_failed = 0

    # The session is a context manager so its pooled connections are closed
    # once all downloads have finished, even if a downloader raises.
    with requests.Session() as session:
        session.headers["User-Agent"] = "Mozilla/5.0 (compatible; PealimAnkiDeck/1.0; audio-fetch)"

        # --- Vocab ---
        if not args.skip_vocab:
            v_dl, v_cached, v_no_url = download_vocab_audio(entries, session)

        # --- Noun plural ---
        np_dl, np_cached = download_noun_plural_audio(entries, session)

        # --- Conjugation ---
        if not args.skip_conj:
            c_dl, c_cached, c_failed = download_conjugation_audio(entries, session)

    # --- Summary ---
    if not args.skip_vocab:
        logger.info(
            " Vocab: %d downloaded, %d cached, %d no URL",
            v_dl,
            v_cached,
            v_no_url,
        )
    logger.info(" Noun plural: %d downloaded, %d cached", np_dl, np_cached)
    if not args.skip_conj:
        # The failure count is only shown when at least one download failed.
        failed_msg = f", {c_failed} failed" if c_failed else ""
        logger.info(
            " Conjugation: %d downloaded, %d cached%s",
            c_dl,
            c_cached,
            failed_msg,
        )
|
|
|
|
|
|
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|