# hebrew_flash_cards/pealim_audio_download.py

#!/usr/bin/env python3
"""Download audio files from URLs stored in words.json.

Three audio categories are handled:
  1. Vocab audio  → data/audio/{audio_file}
  2. Noun plural  → data/audio/{slug}_plural.mp3
  3. Conjugation  → data/audio_conj/{slug}_{form_key}.mp3
                    data/audio_conj/{slug}_passive_{form_key}.mp3
"""

import argparse
import json
import logging
import re
import time
from pathlib import Path

import requests

# Module-level logger; main() configures the root handler and level.
logger = logging.getLogger(__name__)

# All paths are resolved relative to this module's own directory.
DATA_DIR = Path(__file__).parent / "data"
# Destination for vocab audio and noun-plural audio.
AUDIO_DIR = DATA_DIR / "audio"
# Destination for conjugation-form audio (active and passive).
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
# Input file holding the entries whose audio URLs are downloaded.
WORDS_JSON = DATA_DIR / "words.json"

# Seconds to sleep after each successful download.
DOWNLOAD_DELAY = 0.3
# Total number of attempts made per URL (the first try counts as attempt 1).
MAX_RETRIES = 3

# Map Hebrew tense names to English prefixes for form_key construction.
# "מְקוֹר" (infinitive) is included for forward compatibility: it does not
# appear in the current dataset, but if it does, _form_key collapses the key
# to bare "infinitive" with no person suffix.
TENSE_TO_PREFIX = {
    "הוֹוֶה": "present",
    "עָבָר": "past",
    "עָתִיד": "future",
    "צִוּוּי": "imperative",
    "מְקוֹר": "infinitive",
}


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _make_audio_file(entry: dict) -> str:
    """Derive the vocab audio filename when audio_file is absent.

    Slug-based for confusable entries (slug contains the disambiguating ID),
    consonant-only for all others.

    Args:
        entry: A words.json entry dict.

    Returns:
        Filename string, e.g. ``"1234-shalom.mp3"`` or ``"שלום.mp3"``.
    """
    audio_file = entry.get("audio_file", "")
    if audio_file:
        return audio_file
    # Fallback: use slug for confusables, ktiv_male for others
    slug = entry.get("slug", "")
    if entry.get("confusable_group"):
        return f"{slug}.mp3"
    ktiv_male = entry.get("word", {}).get("ktiv_male", "")
    safe_name = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male)
    return f"{safe_name}.mp3"


def _form_key(person: str, tense: str) -> str:
    """Combine a tense and a person code into a conjugation form key.

    The Hebrew *tense* is translated through ``TENSE_TO_PREFIX``; a tense
    that is not in the table is used verbatim as the prefix.

    Args:
        person: Person code, e.g. ``"1s"``, ``"3fp"``, ``"ms"``.
        tense: Hebrew tense string from the conjugation form.

    Returns:
        ``"{prefix}_{person}"``, e.g. ``"past_1s"`` or ``"present_ms"``.
        When the prefix is ``"infinitive"`` the person is dropped and the
        bare string ``"infinitive"`` is returned.
    """
    english = TENSE_TO_PREFIX.get(tense, tense)
    # The infinitive has no person, so its key carries no suffix.
    return "infinitive" if english == "infinitive" else f"{english}_{person}"


def _download(url: str, dest: Path, session: requests.Session) -> bool:
    """Download *url* to *dest*, making at most MAX_RETRIES attempts.

    Skips the download silently if *dest* already exists.

    The response body is first written to a sibling ``<name>.part`` file and
    then renamed onto *dest*. Because callers treat "file exists" as "already
    downloaded", this prevents an interrupted write from leaving a truncated
    file that every later run would accept as cached.

    Args:
        url: HTTP(S) URL to download.
        dest: Local path to write the file to.
        session: Shared requests session.

    Returns:
        ``True`` if the file was downloaded (or already existed),
        ``False`` if all attempts failed with a ``requests`` error.

    Raises:
        OSError: If writing or renaming the local file fails; filesystem
            errors are not retried.
    """
    if dest.exists():
        return True

    # Stage in the same directory so the final rename stays on one filesystem.
    staging = dest.with_name(f"{dest.name}.part")

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = session.get(url, timeout=15)
            resp.raise_for_status()
        except requests.RequestException as exc:
            if attempt < MAX_RETRIES:
                wait = 2**attempt  # exponential backoff between attempts
                logger.warning(
                    "Attempt %d/%d failed for %s (%s) — retrying in %ds",
                    attempt,
                    MAX_RETRIES,
                    url,
                    exc,
                    wait,
                )
                time.sleep(wait)
            else:
                logger.error("All %d attempts failed for %s: %s", MAX_RETRIES, url, exc)
        else:
            staging.write_bytes(resp.content)
            staging.replace(dest)
            logger.debug("Downloaded %s → %s", url, dest.name)
            return True
    return False


# ---------------------------------------------------------------------------
# Per-category downloaders
# ---------------------------------------------------------------------------


def download_vocab_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int, int]:
    """Fetch the vocabulary audio file of every entry into ``AUDIO_DIR``.

    The target filename is the entry's ``audio_file`` when set, otherwise
    whatever ``_make_audio_file`` derives for it.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached, no_url) counts. The third value counts
        entries without an ``audio_url`` and also entries whose download
        failed on every attempt.
    """
    fetched = 0
    already_present = 0
    skipped = 0

    for entry in entries:
        url = entry.get("audio_url")
        if not url:
            skipped += 1
            continue

        # _make_audio_file returns a non-empty audio_file unchanged and only
        # derives a name when it is missing.
        target = AUDIO_DIR / _make_audio_file(entry)

        if target.exists():
            already_present += 1
        elif _download(url, target, session):
            fetched += 1
            time.sleep(DOWNLOAD_DELAY)
        else:
            # Persistent failures are reported together with missing URLs.
            skipped += 1

    return fetched, already_present, skipped


def download_noun_plural_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int]:
    """Fetch plural-form audio for entries that carry a noun inflection.

    Only entries whose ``noun_inflection`` is a non-empty dict with a
    ``plural_audio`` value starting with ``"http"`` are considered.

    Destination: ``data/audio/{slug}_plural.mp3``

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached) counts. Failed downloads are not
        included in either count.
    """
    fetched = 0
    already_present = 0

    for entry in entries:
        inflection = entry.get("noun_inflection")
        if not (inflection and isinstance(inflection, dict)):
            continue

        url = inflection.get("plural_audio")
        if not (url and url.startswith("http")):
            continue

        slug = entry["slug"]
        target = AUDIO_DIR / f"{slug}_plural.mp3"

        if target.exists():
            already_present += 1
        elif _download(url, target, session):
            fetched += 1
            time.sleep(DOWNLOAD_DELAY)

    return fetched, already_present


def download_conjugation_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int, int]:
    """Fetch audio for every conjugation form that has an ``audio_url``.

    Forms are read from the ``active_forms`` and ``hufal_pual_forms`` lists
    of each entry's ``conjugation`` value.

    Active forms   → ``data/audio_conj/{slug}_{form_key}.mp3``
    Passive forms  → ``data/audio_conj/{slug}_passive_{form_key}.mp3``

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached, failed) counts.
    """
    # (filename infix, conjugation field) for each group of forms.
    form_groups = (
        ("", "active_forms"),
        ("passive_", "hufal_pual_forms"),
    )

    fetched = 0
    already_present = 0
    errors = 0

    for entry in entries:
        conj = entry.get("conjugation")
        if not conj:
            continue

        slug = entry["slug"]

        for infix, field in form_groups:
            # A missing or null list is treated as "no forms".
            for form in conj.get(field) or []:
                url = form.get("audio_url")
                if not url:
                    continue

                key = _form_key(form.get("person", ""), form.get("tense", ""))
                target = AUDIO_CONJ_DIR / f"{slug}_{infix}{key}.mp3"

                if target.exists():
                    already_present += 1
                elif _download(url, target, session):
                    fetched += 1
                    time.sleep(DOWNLOAD_DELAY)
                else:
                    errors += 1

    return fetched, already_present, errors


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------


def main() -> None:
    """Parse CLI args and run the audio download pipeline.

    Steps: configure logging, create the output directories, load
    ``words.json``, run the vocab / noun-plural / conjugation downloaders
    over one shared HTTP session, then log a per-category summary.

    Noun-plural audio is always processed; ``--skip-vocab`` and
    ``--skip-conj`` only disable their own category. A negative ``--test``
    value is rejected with a usage error, because slicing with it would
    drop entries from the end instead of keeping the first N.
    """
    parser = argparse.ArgumentParser(description="Download Pealim audio files from words.json URLs.")
    parser.add_argument(
        "--skip-vocab",
        action="store_true",
        help="Skip vocabulary audio downloads.",
    )
    parser.add_argument(
        "--skip-conj",
        action="store_true",
        help="Skip conjugation audio downloads.",
    )
    parser.add_argument(
        "--test",
        metavar="N",
        type=int,
        default=None,
        help="Limit processing to the first N words.json entries.",
    )
    args = parser.parse_args()

    if args.test is not None and args.test < 0:
        parser.error("--test N must be a non-negative integer")

    logging.basicConfig(
        level=logging.INFO,
        format="%(message)s",
    )

    AUDIO_DIR.mkdir(parents=True, exist_ok=True)
    AUDIO_CONJ_DIR.mkdir(parents=True, exist_ok=True)

    with open(WORDS_JSON, encoding="utf-8") as fh:
        raw: dict[str, dict] = json.load(fh)

    entries = list(raw.values())
    if args.test is not None:
        entries = entries[: args.test]

    logger.info("[4] Downloading audio files …")

    # The context manager closes the session's pooled connections on exit,
    # including when a downloader raises.
    with requests.Session() as session:
        session.headers["User-Agent"] = "Mozilla/5.0 (compatible; PealimAnkiDeck/1.0; audio-fetch)"

        # --- Vocab ---
        if not args.skip_vocab:
            v_dl, v_cached, v_no_url = download_vocab_audio(entries, session)
        else:
            v_dl = v_cached = v_no_url = 0

        # --- Noun plural ---
        np_dl, np_cached = download_noun_plural_audio(entries, session)

        # --- Conjugation ---
        if not args.skip_conj:
            c_dl, c_cached, c_failed = download_conjugation_audio(entries, session)
        else:
            c_dl = c_cached = c_failed = 0

    # --- Summary ---
    if not args.skip_vocab:
        logger.info(
            "    Vocab: %d downloaded, %d cached, %d no URL",
            v_dl,
            v_cached,
            v_no_url,
        )
    logger.info("    Noun plural: %d downloaded, %d cached", np_dl, np_cached)
    if not args.skip_conj:
        # Mention failures only when there were any.
        failed_msg = f", {c_failed} failed" if c_failed else ""
        logger.info(
            "    Conjugation: %d downloaded, %d cached%s",
            c_dl,
            c_cached,
            failed_msg,
        )


if __name__ == "__main__":
    main()