Migrate from fragmented CSV + 10 JSON files to a single data/words.json (9,104 entries) as the unified data store. All GUIDs preserved for Anki study progress continuity. New files: - SCHEMA.yaml: authoritative schema for words.json - pealim_list_scrape.py: consolidated list page scraper → words.json - pealim_detail_scrape.py: noun/verb detail scraper → words.json - pealim_audio_download.py: audio downloader reading from words.json - scripts/migrate_to_json.py: one-time CSV→JSON migration - scripts/validate_data.py: 17 data integrity tests - scripts/check_guid_coverage.py: GUID preservation checker - scripts/repair_slugs.py: slug deduplication repair tool - tests/test_scraper_integration.py: live scraper integration tests Updated: - apkg_builder.py: reads from words.json (no more pandas) - run.py: 8-step pipeline (list scrape → frequency → examples → detail scrape → audio download → fonts → images → build) - benyehuda.py, frequency_lookup.py, image_fetch.py: TODO markers for future words.json integration Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
202 lines
6.9 KiB
Python
202 lines
6.9 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Ben Yehuda corpus example-sentence lookup (nikkud corpus).
|
|
|
|
TODO: Rewrite to update words.json examples fields directly instead of
|
|
writing to a separate examples_cache.json. Currently the migration script
|
|
bridges the gap. See Phase 5 in SPRINT_LOG.md.
|
|
|
|
Downloads the nikkud-bearing plaintext ZIP once, indexes sentences by nikkud word form,
|
|
then answers queries locally.
|
|
|
|
Exposed API:
|
|
load(force_rebuild=False)
|
|
get_examples(word_nikkud, confusable_consonants=None) -> list[str] (returns 0 or 1 examples)
|
|
save_examples_cache()
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
import zipfile
|
|
from io import BytesIO
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
|
|
from helpers import strip_nikkud as _strip_nikkud
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
# Nikkud-bearing corpus (txt.zip instead of txt_stripped.zip)
CORPUS_URL = "https://github.com/projectbenyehuda/public_domain_dump/releases/download/2025-10/txt.zip"
# JSON file the built index is written to / read back from (./data next to this module).
INDEX_PATH = Path(__file__).parent / "data" / "benyehuda_index.json"
# JSON file holding memoized get_examples() results; read by load(), written by save_examples_cache().
EXAMPLES_CACHE_PATH = Path(__file__).parent / "data" / "examples_cache.json"
REQUEST_TIMEOUT = 120  # passed as `timeout=` to requests.get() for the corpus download
MIN_SENTENCE_LEN = 20  # shortest trimmed line (in characters) accepted as a sentence
MAX_SENTENCE_LEN = 200  # longest trimmed line (in characters) accepted as a sentence
MAX_INDEX_ENTRIES = 500  # cap examples kept per word in index to limit memory

# Module-level state
_index: dict[str, list[str]] = {}  # word (with nikkud) -> [sentence, ...]
# whitespace-trimmed lookup word -> get_examples() result; loaded from and saved to EXAMPLES_CACHE_PATH
_examples_cache: dict[str, list[str]] = {}
|
|
|
|
|
|
def _split_sentences(text: str) -> list[str]:
    """
    Break *text* into candidate sentences, one per newline-delimited line.

    Each line is trimmed of surrounding whitespace, then of leading/trailing
    quote and punctuation characters, then of whitespace again. Only lines
    whose trimmed length lies between MIN_SENTENCE_LEN and MAX_SENTENCE_LEN
    (inclusive) are returned, in their original order.
    """
    # Trim lazily so no intermediate list of every corpus line is built.
    trimmed_lines = (
        raw_line.strip().strip("\"'.,;:!?").strip()
        for raw_line in text.split("\n")
    )
    return [
        candidate
        for candidate in trimmed_lines
        if MIN_SENTENCE_LEN <= len(candidate) <= MAX_SENTENCE_LEN
    ]
|
|
|
|
|
|
def _build_index(corpus_zip_bytes: bytes) -> None:
    """
    Parse the corpus ZIP and build the word (nikkud) → sentences index.

    Every ``.txt`` member of the archive is decoded as UTF-8 (undecodable
    bytes are dropped), split with ``_split_sentences`` and each sentence is
    filed under every distinct Hebrew token of two or more characters that it
    contains. At most MAX_INDEX_ENTRIES sentences are kept per token.

    The module-level ``_index`` is cleared on entry and only replaced once the
    whole archive has been processed, so a failure part-way through leaves an
    empty index rather than a partial one that later calls would mistake for
    a fully loaded index.

    Args:
        corpus_zip_bytes: Raw bytes of the corpus ZIP archive.

    Raises:
        zipfile.BadZipFile: If the bytes are not a readable ZIP archive.
    """
    global _index
    _index = {}
    logger.info("Building Ben Yehuda index from nikkud corpus …")

    # Hebrew letters, nikkud marks, and quote characters; compiled once
    # instead of being looked up again for every sentence.
    token_re = re.compile(r"[\u05d0-\u05ea\u05b0-\u05c7'\"]+")
    index: dict[str, list[str]] = {}
    skipped = 0

    with zipfile.ZipFile(BytesIO(corpus_zip_bytes)) as zf:
        txt_files = [n for n in zf.namelist() if n.endswith(".txt")]
        logger.info(f" Corpus contains {len(txt_files)} text files")
        for fname in txt_files:
            try:
                raw = zf.read(fname).decode("utf-8", errors="ignore")
            except Exception:
                # Best effort: one damaged member must not abort the build,
                # but record it so gaps in the index can be explained.
                skipped += 1
                logger.warning("Skipping unreadable corpus file %s", fname, exc_info=True)
                continue
            for sentence in _split_sentences(raw):
                # Index by each unique Hebrew token (with nikkud) in the sentence
                for w in set(token_re.findall(sentence)):
                    if len(w) >= 2:
                        bucket = index.setdefault(w, [])
                        if len(bucket) < MAX_INDEX_ENTRIES:
                            bucket.append(sentence)

    if skipped:
        logger.warning("Skipped %d unreadable corpus file(s)", skipped)

    # Publish only the completed index.
    _index = index
    logger.info(f"Index built: {len(_index)} unique word forms")
|
|
|
|
|
|
def _save_index() -> None:
    """
    Persist the in-memory index to INDEX_PATH as UTF-8 JSON.

    The data is first written to a sibling ``.tmp`` file and then renamed
    over INDEX_PATH, so an interrupted write cannot leave a truncated index
    file behind for the next ``load()`` to parse. The parent directory is
    created if it does not exist.

    Raises:
        OSError: If the directory or file cannot be written.
    """
    INDEX_PATH.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = INDEX_PATH.with_name(INDEX_PATH.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(_index, f, ensure_ascii=False)
        # Path.replace overwrites an existing target in a single rename.
        tmp_path.replace(INDEX_PATH)
    except BaseException:
        # Do not leave a half-written temp file lying around.
        tmp_path.unlink(missing_ok=True)
        raise
    logger.info(f"Ben Yehuda index saved → {INDEX_PATH}")
|
|
|
|
|
|
def _load_index() -> None:
    """Replace the module-level index with the JSON content of INDEX_PATH."""
    global _index
    with INDEX_PATH.open(encoding="utf-8") as index_file:
        loaded = json.load(index_file)
    _index = loaded
    logger.info(f"Ben Yehuda index loaded: {len(_index)} word forms")
|
|
|
|
|
|
def load(force_rebuild: bool = False) -> None:
    """
    Make the Ben Yehuda index available in memory.

    Does nothing when an index is already loaded and no rebuild is requested.
    Otherwise the index is read from INDEX_PATH if that file exists; failing
    that, the corpus is downloaded from CORPUS_URL, indexed and saved.

    Args:
        force_rebuild: When true, delete the on-disk index, start with an
            empty examples cache, and rebuild from a fresh download. When
            false, the persisted examples cache is loaded if present.
    """
    global _index, _examples_cache

    index_ready = bool(_index)
    if index_ready and not force_rebuild:
        return

    if not force_rebuild:
        # Normal start: pick up results memoized by earlier runs.
        if EXAMPLES_CACHE_PATH.exists():
            with EXAMPLES_CACHE_PATH.open(encoding="utf-8") as cache_file:
                _examples_cache = json.load(cache_file)
    else:
        # Rebuild: drop the stale index file and anything derived from it.
        if INDEX_PATH.exists():
            INDEX_PATH.unlink()
            logger.info("Deleted old Ben Yehuda index (force rebuild)")
        _examples_cache = {}

    if INDEX_PATH.exists():
        _load_index()
        return

    logger.info("Downloading Ben Yehuda nikkud corpus … (this may take 2-3 minutes)")
    response = requests.get(CORPUS_URL, timeout=REQUEST_TIMEOUT, stream=True)
    response.raise_for_status()
    payload = response.content
    logger.info(f"Corpus downloaded: {len(payload) / 1e6:.1f} MB")

    _build_index(payload)
    _save_index()
|
|
|
|
|
|
def save_examples_cache() -> None:
    """
    Persist the memoized example lookups to EXAMPLES_CACHE_PATH as UTF-8 JSON.

    The data is first written to a sibling ``.tmp`` file and then renamed
    over EXAMPLES_CACHE_PATH, so an interrupted write cannot leave a
    truncated cache file that a later ``load()`` would fail to parse. The
    parent directory is created if it does not exist.

    Raises:
        OSError: If the directory or file cannot be written.
    """
    EXAMPLES_CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = EXAMPLES_CACHE_PATH.with_name(EXAMPLES_CACHE_PATH.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(_examples_cache, f, ensure_ascii=False)
        # Path.replace overwrites an existing target in a single rename.
        tmp_path.replace(EXAMPLES_CACHE_PATH)
    except BaseException:
        # Do not leave a half-written temp file lying around.
        tmp_path.unlink(missing_ok=True)
        raise
    logger.info(f"Examples cache saved: {len(_examples_cache)} entries → {EXAMPLES_CACHE_PATH}")
|
|
|
|
|
|
def get_examples(word_nikkud: str, confusable_consonants: set[str] | None = None) -> list[str]:
    """
    Look up at most one example sentence for a word given in nikkud form.

    The index is loaded on first use. Results are memoized in the module-level
    examples cache under the whitespace-trimmed word; the cache key does not
    include ``confusable_consonants``, so a cached answer is returned as-is
    whatever set is passed on later calls.

    Lookup order:
      1. Exact match of the nikkud form against the index keys.
      2. If that yields nothing, the first index key whose nikkud-stripped
         form equals the stripped word. This step is skipped when the
         stripped word is in ``confusable_consonants``, to avoid returning a
         sentence for a different homograph.

    Candidates are kept only if the stripped word occurs in the stripped
    sentence as a whole token and the sentence length lies between
    MIN_SENTENCE_LEN and MAX_SENTENCE_LEN. The longest survivor is returned.

    Args:
        word_nikkud: The word to look up, with nikkud.
        confusable_consonants: Stripped forms for which the fallback lookup
            must not be attempted; ``None`` means no restriction.

    Returns:
        A list holding the chosen sentence, or an empty list if none qualify.
    """
    if not _index:
        load()

    lookup = word_nikkud.strip()
    bare = _strip_nikkud(lookup)

    if lookup in _examples_cache:
        return _examples_cache[lookup]

    # Step 1: exact nikkud match.
    pool = _index.get(lookup, [])

    # Step 2: stripped fallback, taking the first index key that matches.
    if not pool and bare:
        if bare not in (confusable_consonants or set()):
            pool = next(
                (
                    sentences
                    for form, sentences in _index.items()
                    if _strip_nikkud(form) == bare
                ),
                pool,
            )

    # Whole-token filter, compared on stripped text so that nikkud variants
    # of the word inside the sentence still count.
    if bare:
        token_pattern = r"(?<!\w)" + re.escape(bare) + r"(?!\w)"
        hits = [
            sentence
            for sentence in pool
            if re.search(token_pattern, _strip_nikkud(sentence))
        ]
    else:
        hits = list(pool)

    # Length filter, then keep only the longest remaining sentence.
    usable = [s for s in hits if MIN_SENTENCE_LEN <= len(s) <= MAX_SENTENCE_LEN]
    result = [max(usable, key=len)] if usable else []

    _examples_cache[lookup] = result
    return result
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: load the index, look up a few common words, print
    # the first 100 characters of each example, then persist the cache.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    load()
    for sample in ("שָׁלוֹם", "בַּיִת", "סֵפֶר", "מַיִם", "אַהֲבָה", "יֶלֶד"):
        found = get_examples(sample)
        print(f"\n{sample}: {len(found)} example(s)")
        for sentence in found:
            print(f" → {sentence[:100]}")
    save_examples_cache()
|