hebrew_flash_cards/benyehuda.py
Sochen 17f7458d19 Sprint 9: cloze cards, plurals deck, project reorg, lint tooling
- Cloze card pipeline: 924 cards from 2,296 AI-vetted Hebrew book sentences
- Plurals deck: 375 notes (144 irregular + 231 regular from 86 mishkal patterns)
- Ktiv male forms expanded to 20,711 entries for sentence matching
- Project reorg: helpers.py (deduped strip_nikkud from 10 files), scripts/ for
  one-off tools, tests/ with smoke tests, deleted 3 dead files
- Lint tooling: pyproject.toml with ruff/vulture/bandit/pytest config, .editorconfig,
  fixed all 129 ruff errors (B023 closure fix, SIM103, unused vars)
- validate_apkg.py: card count range check for optional cloze template
- Data caches committed: vetted_sentences, ktiv_male_forms, noun_plurals,
  noun_slug_map, vocab_sentence_matches, epub_sentence_index

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-07 08:09:39 +00:00

196 lines
6.4 KiB
Python

#!/usr/bin/env python3
"""
Ben Yehuda corpus example-sentence lookup (nikkud corpus).
Downloads the nikkud-bearing plaintext ZIP once, indexes sentences by nikkud word form,
then answers queries locally.
Exposed API:
load(force_rebuild=False)
get_examples(word_nikkud) -> list[str] (returns 0 or 1 examples)
save_examples_cache()
"""
import json
import logging
import re
import zipfile
from io import BytesIO
from pathlib import Path
import requests
from helpers import strip_nikkud as _strip_nikkud
logger = logging.getLogger(__name__)

# --- Remote corpus -----------------------------------------------------------
# The nikkud-bearing dump is txt.zip (txt_stripped.zip is the variant without nikkud).
CORPUS_URL = "https://github.com/projectbenyehuda/public_domain_dump/releases/download/2025-10/txt.zip"
REQUEST_TIMEOUT = 120

# --- On-disk artefacts (kept in the "data" directory beside this module) ------
_DATA_DIR = Path(__file__).parent / "data"
INDEX_PATH = _DATA_DIR / "benyehuda_index.json"
EXAMPLES_CACHE_PATH = _DATA_DIR / "examples_cache.json"

# --- Sentence / index limits -------------------------------------------------
MIN_SENTENCE_LEN = 20
MAX_SENTENCE_LEN = 200
# Upper bound on sentences stored per word form, to keep the index's memory use in check.
MAX_INDEX_ENTRIES = 500

# --- Module-level state ------------------------------------------------------
# Word form (with nikkud) -> sentences containing it.
_index: dict[str, list[str]] = {}
# Query word -> result already computed for it.
_examples_cache: dict[str, list[str]] = {}
def _split_sentences(text: str) -> list[str]:
    """
    Break *text* into candidate sentences, one per newline-delimited line.

    Each line is trimmed of surrounding whitespace, then of leading/trailing
    quote and punctuation characters, then of whitespace once more. A line is
    kept only when what remains is between MIN_SENTENCE_LEN and
    MAX_SENTENCE_LEN characters long (inclusive). Order is preserved.
    """
    trimmed_lines = (
        raw_line.strip().strip("\"'.,;:!?").strip()
        for raw_line in text.split("\n")
    )
    return [
        candidate
        for candidate in trimmed_lines
        if MIN_SENTENCE_LEN <= len(candidate) <= MAX_SENTENCE_LEN
    ]
def _build_index(corpus_zip_bytes: bytes) -> None:
    """
    Parse the corpus ZIP and build the word (nikkud) → sentences index.

    Every archive member whose name ends in ``.txt`` is decoded as UTF-8
    (undecodable bytes are dropped), split with ``_split_sentences``, and each
    sentence is filed under every distinct token of two or more characters it
    contains. At most MAX_INDEX_ENTRIES sentences are kept per token.

    The module-level ``_index`` is reset before the build starts, so the
    previous index is released before the new one grows.

    Args:
        corpus_zip_bytes: Raw bytes of the corpus ZIP archive.

    Raises:
        zipfile.BadZipFile: If the bytes cannot be opened as a ZIP archive.
    """
    global _index
    _index = {}
    # Compiled once, outside the per-sentence loop. A token is a run of Hebrew
    # letters (U+05D0–U+05EA), marks in U+05B0–U+05C7, and ASCII quote characters.
    token_re = re.compile(r"[\u05d0-\u05ea\u05b0-\u05c7'\"]+")
    logger.info("Building Ben Yehuda index from nikkud corpus …")
    with zipfile.ZipFile(BytesIO(corpus_zip_bytes)) as zf:
        txt_files = [n for n in zf.namelist() if n.endswith(".txt")]
        logger.info(f" Corpus contains {len(txt_files)} text files")
        for fname in txt_files:
            try:
                raw = zf.read(fname).decode("utf-8", errors="ignore")
            except Exception as exc:
                # Best effort: one unreadable member must not abort the whole
                # build, but leave a trace so gaps in the index are explainable.
                logger.warning("Skipping unreadable corpus file %s: %s", fname, exc)
                continue
            for sentence in _split_sentences(raw):
                # set() so a token repeated within a sentence files it only once.
                for w in set(token_re.findall(sentence)):
                    if len(w) >= 2:
                        bucket = _index.setdefault(w, [])
                        if len(bucket) < MAX_INDEX_ENTRIES:
                            bucket.append(sentence)
    logger.info(f"Index built: {len(_index)} unique word forms")
def _save_index() -> None:
    """Write the in-memory index to INDEX_PATH as UTF-8 JSON, creating its directory if missing."""
    target_dir = INDEX_PATH.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with INDEX_PATH.open("w", encoding="utf-8") as out:
        # ensure_ascii=False keeps the Hebrew text readable in the file.
        json.dump(_index, out, ensure_ascii=False)
    logger.info(f"Ben Yehuda index saved → {INDEX_PATH}")
def _load_index() -> None:
    """Replace the in-memory index with the JSON document stored at INDEX_PATH."""
    global _index
    serialized = INDEX_PATH.read_text(encoding="utf-8")
    _index = json.loads(serialized)
    logger.info(f"Ben Yehuda index loaded: {len(_index)} word forms")
def load(force_rebuild: bool = False) -> None:
    """
    Load or build the Ben Yehuda index, downloading the corpus if needed.

    Behaviour:
      * If an index is already in memory and ``force_rebuild`` is false, do
        nothing.
      * With ``force_rebuild``, delete the on-disk index file, reset the
        in-memory examples cache, then download and re-index the corpus.
      * Otherwise read the persisted examples cache (if the file exists) and
        the on-disk index (if the file exists); the corpus is downloaded and
        indexed only when no index file is present.

    A freshly built index is written back to INDEX_PATH.

    Args:
        force_rebuild: Ignore any existing index and rebuild from a fresh
            download.

    Raises:
        requests.RequestException: If the download fails or the server
            answers with an HTTP error status.
    """
    global _examples_cache
    if _index and not force_rebuild:
        return
    if force_rebuild:
        # Delete old index and discard the in-memory examples cache.
        if INDEX_PATH.exists():
            INDEX_PATH.unlink()
            logger.info("Deleted old Ben Yehuda index (force rebuild)")
        _examples_cache = {}
    else:
        # Load persisted examples cache (not needed on rebuild).
        if EXAMPLES_CACHE_PATH.exists():
            with open(EXAMPLES_CACHE_PATH, encoding="utf-8") as f:
                _examples_cache = json.load(f)
        if INDEX_PATH.exists():
            _load_index()
            return
    logger.info("Downloading Ben Yehuda nikkud corpus … (this may take 2-3 minutes)")
    # With stream=True the connection stays checked out until the body is fully
    # read or the response is closed. The with-block guarantees it is released
    # even when raise_for_status() raises before the body is consumed.
    with requests.get(CORPUS_URL, timeout=REQUEST_TIMEOUT, stream=True) as resp:
        resp.raise_for_status()
        data = resp.content
    logger.info(f"Corpus downloaded: {len(data) / 1e6:.1f} MB")
    _build_index(data)
    _save_index()
def save_examples_cache() -> None:
    """Persist the in-memory examples cache to EXAMPLES_CACHE_PATH as UTF-8 JSON."""
    cache_file = EXAMPLES_CACHE_PATH
    cache_file.parent.mkdir(parents=True, exist_ok=True)
    with cache_file.open("w", encoding="utf-8") as out:
        json.dump(_examples_cache, out, ensure_ascii=False)
    logger.info(f"Examples cache saved: {len(_examples_cache)} entries → {EXAMPLES_CACHE_PATH}")
def get_examples(word_nikkud: str) -> list[str]:
    """
    Look up at most one example sentence for a word given in its nikkud form.

    The index is loaded on first use. Results are memoised in the module-level
    examples cache, keyed by the whitespace-trimmed input.

    Candidate sentences come from the index entry for the exact nikkud form.
    When that yields nothing, the entry of the first index key whose
    nikkud-stripped form equals the stripped query is used instead. A
    candidate survives when the stripped word occurs in the stripped sentence
    with no word character directly before or after it, and the sentence
    length is within MIN_SENTENCE_LEN..MAX_SENTENCE_LEN.

    Returns:
        A one-element list holding the longest surviving sentence, or an
        empty list when nothing qualifies.
    """
    if not _index:
        load()

    word = word_nikkud.strip()
    bare_word = _strip_nikkud(word)
    if word in _examples_cache:
        return _examples_cache[word]

    # Exact nikkud hit first; otherwise fall back to the first key that
    # matches once nikkud is removed from both sides.
    sentences = _index.get(word, [])
    if not sentences and bare_word:
        sentences = next(
            (hits for key, hits in _index.items() if _strip_nikkud(key) == bare_word),
            sentences,
        )

    # Whole-token test runs on nikkud-stripped text so pointing variants
    # inside the sentence do not prevent a match. With an empty stripped
    # word there is nothing to match, so only the length test applies.
    token_re = (
        re.compile(r"(?<!\w)" + re.escape(bare_word) + r"(?!\w)") if bare_word else None
    )
    usable = [
        s
        for s in sentences
        if (token_re is None or token_re.search(_strip_nikkud(s)))
        and MIN_SENTENCE_LEN <= len(s) <= MAX_SENTENCE_LEN
    ]

    # max() returns the earliest sentence among those of equal greatest length.
    result = [max(usable, key=len)] if usable else []
    _examples_cache[word] = result
    return result
if __name__ == "__main__":
    # Manual smoke run: load the index, print the example found for a handful
    # of sample words (first 100 characters each), then persist the cache.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    load()
    for sample in ("שָׁלוֹם", "בַּיִת", "סֵפֶר", "מַיִם", "אַהֲבָה", "יֶלֶד"):
        found = get_examples(sample)
        print(f"\n{sample}: {len(found)} example(s)")
        for sentence in found:
            print(f"{sentence[:100]}")
    save_examples_cache()