# hebrew_flash_cards/benyehuda.py

#!/usr/bin/env python3
"""
Ben Yehuda corpus example-sentence lookup (nikkud corpus).

TODO: Rewrite to update words.json examples fields directly instead of
writing to a separate examples_cache.json. Currently the migration script
bridges the gap. See Phase 5 in SPRINT_LOG.md.

Downloads the nikkud-bearing plaintext ZIP once, indexes sentences by nikkud word form,
then answers queries locally.

Exposed API:
  load(force_rebuild=False)
  get_examples(word_nikkud, confusable_consonants=None) -> list[str]   (returns 0 or 1 examples)
  save_examples_cache()
"""

import json
import logging
import re
import zipfile
from io import BytesIO
from pathlib import Path

import requests

from helpers import strip_nikkud as _strip_nikkud

# Module logger, named after this module.
logger = logging.getLogger(__name__)

# Nikkud-bearing corpus (txt.zip instead of txt_stripped.zip)
CORPUS_URL = "https://github.com/projectbenyehuda/public_domain_dump/releases/download/2025-10/txt.zip"
# Persisted word -> sentences index, stored in the data/ directory next to this module.
INDEX_PATH = Path(__file__).parent / "data" / "benyehuda_index.json"
# Persisted get_examples() results, stored in the same data/ directory.
EXAMPLES_CACHE_PATH = Path(__file__).parent / "data" / "examples_cache.json"
# `timeout` argument for the corpus download (requests interprets it in seconds).
REQUEST_TIMEOUT = 120
# Inclusive bounds on the character length of a sentence kept as an example.
MIN_SENTENCE_LEN = 20
MAX_SENTENCE_LEN = 200
MAX_INDEX_ENTRIES = 500  # cap examples kept per word in index to limit memory

# Module-level state
_index: dict[str, list[str]] = {}  # word (with nikkud) -> [sentence, ...]
# word -> cached get_examples() result; reloaded from EXAMPLES_CACHE_PATH by load()
# and written back by save_examples_cache().
_examples_cache: dict[str, list[str]] = {}


def _split_sentences(text: str) -> list[str]:
    """
    Treat every line of *text* as one candidate sentence.

    Each line is trimmed of surrounding whitespace, then of leading/trailing
    quote and punctuation characters (" ' . , ; : ! ?), then of whitespace
    again.  Only candidates whose trimmed length lies between
    MIN_SENTENCE_LEN and MAX_SENTENCE_LEN (inclusive) are returned, in the
    order they appear in the text.
    """
    trimmed_lines = (
        raw_line.strip().strip("\"'.,;:!?").strip()
        for raw_line in text.split("\n")
    )
    return [
        candidate
        for candidate in trimmed_lines
        if MIN_SENTENCE_LEN <= len(candidate) <= MAX_SENTENCE_LEN
    ]


def _build_index(corpus_zip_bytes: bytes) -> None:
    """
    Parse the corpus ZIP and build the word (nikkud) → sentences index.

    Every ``.txt`` member of the archive is decoded as UTF-8 (undecodable
    bytes are dropped), split into sentences with ``_split_sentences`` and
    each sentence is filed under every distinct Hebrew token it contains.
    Tokens shorter than 2 characters are ignored and each word keeps at most
    MAX_INDEX_ENTRIES sentences.

    The result replaces the module-level ``_index``; nothing is returned.
    Members that cannot be read are skipped with a warning so one bad file
    does not abort the whole build.
    """
    global _index
    index: dict[str, list[str]] = {}
    # Reset the global up front (as before) and fill the same dict in place.
    _index = index
    logger.info("Building Ben Yehuda index from nikkud corpus …")

    # Hebrew letters and points, plus ASCII quotes used as geresh/gershayim.
    token_re = re.compile(r"[\u05d0-\u05ea\u05b0-\u05c7'\"]+")
    # The U+05B0–U+05C7 range above also contains four punctuation marks:
    # maqaf (U+05BE), paseq (U+05C0), sof pasuq (U+05C3), nun hafukha (U+05C6).
    # They glue neighbouring words into a single token, so the parts are
    # indexed as well.
    punct_re = re.compile(r"[\u05be\u05c0\u05c3\u05c6]")

    with zipfile.ZipFile(BytesIO(corpus_zip_bytes)) as zf:
        txt_files = [n for n in zf.namelist() if n.endswith(".txt")]
        logger.info(f"  Corpus contains {len(txt_files)} text files")
        for fname in txt_files:
            try:
                raw = zf.read(fname).decode("utf-8", errors="ignore")
            except Exception as exc:
                # Best effort: report the unreadable member and carry on.
                logger.warning(f"  Skipping unreadable corpus file {fname}: {exc}")
                continue
            for sentence in _split_sentences(raw):
                # Index by each unique Hebrew token (with nikkud) in the sentence.
                tokens: set[str] = set()
                for token in token_re.findall(sentence):
                    # Keep the fused token so existing keys stay valid ...
                    tokens.add(token)
                    # ... and add the individual words it is made of.
                    if punct_re.search(token):
                        tokens.update(punct_re.split(token))
                for w in tokens:
                    if len(w) >= 2:
                        bucket = index.setdefault(w, [])
                        if len(bucket) < MAX_INDEX_ENTRIES:
                            bucket.append(sentence)

    logger.info(f"Index built: {len(index)} unique word forms")


def _save_index() -> None:
    """
    Persist the in-memory index to INDEX_PATH as JSON.

    The data is first written to a sibling ``.tmp`` file and then moved over
    INDEX_PATH, so an interrupted write cannot leave a truncated index file
    behind for ``_load_index`` to choke on.  The parent directory is created
    if it does not exist.
    """
    INDEX_PATH.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = INDEX_PATH.with_name(INDEX_PATH.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(_index, f, ensure_ascii=False)
        tmp_path.replace(INDEX_PATH)
    finally:
        # No-op after a successful replace; removes the partial file on failure.
        tmp_path.unlink(missing_ok=True)
    logger.info(f"Ben Yehuda index saved → {INDEX_PATH}")


def _load_index() -> None:
    """Read the JSON index stored at INDEX_PATH into the module-level ``_index``."""
    global _index
    with INDEX_PATH.open(encoding="utf-8") as handle:
        loaded = json.load(handle)
    # Only replace the in-memory index once the file has parsed successfully.
    _index = loaded
    logger.info(f"Ben Yehuda index loaded: {len(_index)} word forms")


def load(force_rebuild: bool = False) -> None:
    """
    Load or build the Ben Yehuda index, downloading the corpus if needed.

    Behaviour:
      * If an index is already in memory and ``force_rebuild`` is false,
        nothing happens.
      * With ``force_rebuild`` the on-disk index is deleted and the in-memory
        examples cache is discarded.
      * Otherwise the persisted examples cache (if any) is read from
        EXAMPLES_CACHE_PATH.
      * If an index file exists it is loaded; if not, the corpus ZIP is
        downloaded from CORPUS_URL, indexed and saved to INDEX_PATH.

    Raises whatever ``requests`` raises for a failed download, including the
    HTTP error from ``raise_for_status()``.
    """
    global _examples_cache
    if _index and not force_rebuild:
        return

    if force_rebuild:
        # Delete old index and discard examples cache
        if INDEX_PATH.exists():
            INDEX_PATH.unlink()
            logger.info("Deleted old Ben Yehuda index (force rebuild)")
        _examples_cache = {}
    elif EXAMPLES_CACHE_PATH.exists():
        # Load persisted examples cache (not needed on rebuild)
        with open(EXAMPLES_CACHE_PATH, encoding="utf-8") as f:
            _examples_cache = json.load(f)

    if INDEX_PATH.exists():
        _load_index()
        return

    logger.info("Downloading Ben Yehuda nikkud corpus … (this may take 2-3 minutes)")
    # stream=True lets the status be checked before the body is fetched; the
    # context manager closes the response on every path, including a failing
    # raise_for_status().
    with requests.get(CORPUS_URL, timeout=REQUEST_TIMEOUT, stream=True) as resp:
        resp.raise_for_status()
        data = resp.content
    logger.info(f"Corpus downloaded: {len(data) / 1e6:.1f} MB")

    _build_index(data)
    _save_index()


def save_examples_cache() -> None:
    """
    Persist the in-memory examples cache to EXAMPLES_CACHE_PATH as JSON.

    The data is first written to a sibling ``.tmp`` file and then moved over
    EXAMPLES_CACHE_PATH, so an interrupted write cannot leave a truncated
    cache file that a later ``load()`` would fail to parse.  The parent
    directory is created if it does not exist.
    """
    EXAMPLES_CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = EXAMPLES_CACHE_PATH.with_name(EXAMPLES_CACHE_PATH.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(_examples_cache, f, ensure_ascii=False)
        tmp_path.replace(EXAMPLES_CACHE_PATH)
    finally:
        # No-op after a successful replace; removes the partial file on failure.
        tmp_path.unlink(missing_ok=True)
    logger.info(f"Examples cache saved: {len(_examples_cache)} entries → {EXAMPLES_CACHE_PATH}")


def get_examples(word_nikkud: str, confusable_consonants: set[str] | None = None) -> list[str]:
    """
    Look up at most one example sentence for a word given in nikkud form.

    The index is loaded on first use.  Results are memoised in the module's
    examples cache under the whitespace-trimmed word.

    Candidate sentences come from:
    1. the index bucket for the exact nikkud form, or, if that is empty,
    2. the bucket of the first index key whose nikkud-stripped form equals
       the stripped word.  This fallback is skipped when the stripped word
       is in ``confusable_consonants`` (to avoid returning a sentence for
       the wrong homograph).

    A candidate is kept only if the stripped word occurs in the stripped
    sentence as a whole token and the sentence length lies within
    MIN_SENTENCE_LEN..MAX_SENTENCE_LEN.  The longest surviving sentence is
    returned as a one-element list; otherwise the list is empty.

    NOTE(review): the cache key does not include ``confusable_consonants``,
    so a cached answer is returned even if a later call passes a different
    set — confirm callers always pass the same one.
    """
    if not _index:
        load()

    key = word_nikkud.strip()
    bare = _strip_nikkud(key)

    try:
        return _examples_cache[key]
    except KeyError:
        pass  # not cached yet; compute below

    # Exact nikkud match first, stripped-form fallback second.
    pool = _index.get(key, [])
    if not pool and bare:
        excluded = confusable_consonants or set()
        if bare not in excluded:
            pool = next(
                (sentences for form, sentences in _index.items() if _strip_nikkud(form) == bare),
                pool,
            )

    # Whole-token test runs on stripped text so nikkud variants still match.
    token_re = re.compile(r"(?<!\w)" + re.escape(bare) + r"(?!\w)") if bare else None

    longest = None
    for sentence in pool:
        if token_re is not None and not token_re.search(_strip_nikkud(sentence)):
            continue
        if not (MIN_SENTENCE_LEN <= len(sentence) <= MAX_SENTENCE_LEN):
            continue
        # Strictly-greater keeps the earliest sentence among equal lengths.
        if longest is None or len(sentence) > len(longest):
            longest = sentence

    result = [] if longest is None else [longest]
    _examples_cache[key] = result
    return result


if __name__ == "__main__":
    # Manual smoke test: load the index, look up a few common words, print
    # what was found (truncated to 100 characters) and persist the cache.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    load()
    sample_words = ["שָׁלוֹם", "בַּיִת", "סֵפֶר", "מַיִם", "אַהֲבָה", "יֶלֶד"]
    for sample in sample_words:
        found = get_examples(sample)
        print(f"\n{sample}: {len(found)} example(s)")
        for sentence in found:
            print(f"  → {sentence[:100]}")
    save_examples_cache()