Implements four major improvements to the Pealim Anki deck pipeline:
1. Automated .apkg generation (genanki) — no more manual Anki Desktop step.
Both vocabulary and conjugation decks are built programmatically.
2. Word frequency ranking from hermitdave/FrequencyWords he_50k corpus.
Notes sorted by rank so Anki presents most common words first.
3. Example sentences from Ben Yehuda public domain corpus (not pealim.com).
Downloads txt_stripped.zip, indexes 25k texts, ~89% coverage on test set.
4. Conjugation drill deck — one card per form × verb.
Input: verbs_input.txt (Hebrew infinitives). Initial set: 7 verbs (one
per binyan). Extracts 28 forms each via pealim.com/search/ + table parse.
New files:
apkg_builder.py — genanki deck builder for both decks
benyehuda.py — Ben Yehuda corpus downloader + sentence indexer
frequency_lookup.py — FrequencyWords downloader + rank lookup
verbs_input.txt — verb input list (7 test verbs, one per binyan)
data/ — baseline CSVs + generated caches
Updated:
conjugation_extract.py — rewritten: reads verbs_input.txt, searches
/search/?q= for slug, parses table by row labels
requirements.txt — add genanki, beautifulsoup4, lxml
run.py — full orchestration pipeline with CLI flags
.gitignore — exclude venv/, benyehuda_index.json, audio/, output/
CLI:
python run.py --skip-scrape --skip-audio --test 20 (quick test)
python run.py --skip-scrape (full build)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
160 lines
5.1 KiB
Python
160 lines
5.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Ben Yehuda corpus example-sentence lookup.
|
|
Downloads plaintext-no-nikkud ZIP once, indexes sentences, then answers queries locally.
|
|
Exposed API: get_examples(word_no_nikkud) -> list[str]
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
import unicodedata
|
|
import zipfile
|
|
from io import BytesIO
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
|
|
logger = logging.getLogger(__name__)

# Release asset holding the nikkud-stripped plaintext corpus.
CORPUS_URL = (
    "https://github.com/projectbenyehuda/public_domain_dump/releases/"
    "download/2025-10/txt_stripped.zip"
)

# On-disk artifacts live in a "data" directory next to this module.
INDEX_PATH = Path(__file__).parent.joinpath("data", "benyehuda_index.json")
EXAMPLES_CACHE_PATH = Path(__file__).parent.joinpath("data", "examples_cache.json")

REQUEST_TIMEOUT = 120  # passed as the `timeout` of the corpus download request
MIN_SENTENCE_LEN = 15  # shorter fragments are dropped when splitting sentences
MAX_EXAMPLES_PER_WORD = 2  # number of sentences returned per lookup
MAX_INDEX_ENTRIES = 500  # cap examples kept per word in index to limit memory

# Module-level state
_index: dict[str, list[str]] = {}  # word -> [sentence, ...]
_examples_cache: dict[str, list[str]] = {}  # word -> cached result for this run
|
|
|
|
|
|
def _strip_nikkud(text: str) -> str:
|
|
return "".join(
|
|
ch for ch in unicodedata.normalize("NFD", text)
|
|
if unicodedata.category(ch) != "Mn"
|
|
)
|
|
|
|
|
|
def _split_sentences(text: str) -> list[str]:
    """Break *text* into trimmed sentence candidates.

    Splitting happens on runs of one to three of ``.``, ``!``, ``?``, the
    Arabic question mark, or a newline. Pieces shorter than
    ``MIN_SENTENCE_LEN`` characters after trimming are discarded.
    """
    trimmed = (piece.strip() for piece in re.split(r"[.!?؟\n]{1,3}", text))
    return [piece for piece in trimmed if len(piece) >= MIN_SENTENCE_LEN]
|
|
|
|
|
|
def _build_index(corpus_zip_bytes: bytes) -> None:
    """Parse the corpus ZIP and rebuild the word -> sentences index.

    Every ``.txt`` member of the archive is decoded as UTF-8 (undecodable
    bytes are ignored), split into sentences, and each distinct token of a
    sentence is mapped to that sentence in the module-level ``_index``.
    At most ``MAX_INDEX_ENTRIES`` sentences are kept per word.

    A token is a run of Hebrew letters and ASCII quote characters. Quotes
    are part of the token so that abbreviations such as ``צה"ל`` or ``וכו'``
    stay intact. Each token is indexed both verbatim and with surrounding
    quotes trimmed, so a word that appears inside quotation marks can still
    be found when the bare word is looked up.

    Args:
        corpus_zip_bytes: Raw bytes of the corpus ZIP archive.

    Raises:
        zipfile.BadZipFile: If the bytes are not a valid ZIP archive.
    """
    global _index
    _index = {}
    logger.info("Building Ben Yehuda index from corpus …")

    # Compiled once: the same pattern is applied to every sentence.
    token_re = re.compile(r"[\u05d0-\u05ea'\"]+")
    quote_chars = "'\""

    with zipfile.ZipFile(BytesIO(corpus_zip_bytes)) as zf:
        txt_files = [n for n in zf.namelist() if n.endswith(".txt")]
        logger.info(f" Corpus contains {len(txt_files)} text files")
        for fname in txt_files:
            try:
                raw = zf.read(fname).decode("utf-8", errors="ignore")
            except Exception as exc:
                # Best effort: one damaged member must not abort the whole
                # build, but it should not disappear silently either.
                logger.warning("Skipping unreadable corpus file %s: %s", fname, exc)
                continue
            for sentence in _split_sentences(raw):
                keys = set()
                for token in token_re.findall(sentence):
                    bare = token.strip(quote_chars)
                    if not bare:
                        # Token consisted of quote characters only.
                        continue
                    keys.add(token)
                    keys.add(bare)
                for w in keys:
                    if len(w) < 2:
                        continue
                    bucket = _index.setdefault(w, [])
                    if len(bucket) < MAX_INDEX_ENTRIES:
                        bucket.append(sentence)

    logger.info(f"Index built: {len(_index)} unique words")
|
|
|
|
|
|
def _save_index() -> None:
    """Persist the in-memory index to ``INDEX_PATH`` as UTF-8 JSON.

    The parent directory is created if missing. The JSON is first written
    to a sibling temporary file and then moved over ``INDEX_PATH``, so an
    interrupted write cannot leave a truncated index that a later
    ``json.load`` would fail on.
    """
    INDEX_PATH.parent.mkdir(parents=True, exist_ok=True)
    # Same directory as the target so the final replace is a plain rename.
    tmp_path = INDEX_PATH.with_name(INDEX_PATH.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(_index, f, ensure_ascii=False)
    tmp_path.replace(INDEX_PATH)
    logger.info(f"Ben Yehuda index saved → {INDEX_PATH}")
|
|
|
|
|
|
def _load_index() -> None:
    """Replace the in-memory index with the JSON stored at ``INDEX_PATH``."""
    global _index
    payload = INDEX_PATH.read_text(encoding="utf-8")
    _index = json.loads(payload)
    logger.info(f"Ben Yehuda index loaded: {len(_index)} words")
|
|
|
|
|
|
def load(force_rebuild: bool = False) -> None:
    """Load or build the Ben Yehuda index, downloading the corpus if needed.

    Order of operations:
      1. Return immediately if an index is already in memory, unless
         ``force_rebuild`` is set.
      2. Read the persisted examples cache, if the file exists.
      3. Read the persisted index, if it exists and no rebuild was forced.
      4. Otherwise download the corpus ZIP, build the index and save it.

    Both on-disk files are derived data, so an unreadable or corrupt file
    is logged and skipped rather than aborting: a bad examples cache is
    ignored, and a bad index falls through to a fresh download and rebuild.

    Args:
        force_rebuild: When true, ignore any in-memory or on-disk index and
            rebuild it from a freshly downloaded corpus.

    Raises:
        requests.RequestException: If the corpus download fails.
    """
    global _examples_cache
    if _index and not force_rebuild:
        return

    # Load persisted examples cache
    if EXAMPLES_CACHE_PATH.exists():
        try:
            with open(EXAMPLES_CACHE_PATH, encoding="utf-8") as f:
                _examples_cache = json.load(f)
        except (OSError, ValueError) as exc:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            logger.warning(
                "Ignoring unreadable examples cache %s: %s", EXAMPLES_CACHE_PATH, exc
            )

    if INDEX_PATH.exists() and not force_rebuild:
        try:
            _load_index()
            return
        except (OSError, ValueError) as exc:
            logger.warning(
                "Index file %s is unreadable (%s); rebuilding", INDEX_PATH, exc
            )

    logger.info("Downloading Ben Yehuda corpus … (this may take 1-2 minutes)")
    # The context manager releases the streamed connection even when
    # raise_for_status() raises before the body has been consumed.
    with requests.get(CORPUS_URL, timeout=REQUEST_TIMEOUT, stream=True) as resp:
        resp.raise_for_status()
        data = resp.content
    logger.info(f"Corpus downloaded: {len(data) / 1e6:.1f} MB")

    _build_index(data)
    _save_index()
|
|
|
|
|
|
def save_examples_cache() -> None:
    """Persist the in-memory examples cache to ``EXAMPLES_CACHE_PATH``.

    The parent directory is created if missing. The JSON is written to a
    sibling temporary file and then moved into place, so an interrupted
    write cannot leave a truncated cache file behind.
    """
    EXAMPLES_CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
    # Same directory as the target so the final replace is a plain rename.
    tmp_path = EXAMPLES_CACHE_PATH.with_name(EXAMPLES_CACHE_PATH.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(_examples_cache, f, ensure_ascii=False)
    tmp_path.replace(EXAMPLES_CACHE_PATH)
    logger.info(f"Examples cache saved: {len(_examples_cache)} entries → {EXAMPLES_CACHE_PATH}")
|
|
|
|
|
|
def get_examples(word_no_nikkud: str) -> list[str]:
    """Return example sentences containing *word_no_nikkud* as a whole token.

    The word is trimmed and stripped of combining marks, then looked up in
    the index (which is loaded on first use). Candidate sentences are
    de-duplicated, filtered to those where the word is not directly
    adjacent to another word character, and sorted shortest first. At most
    ``MAX_EXAMPLES_PER_WORD`` sentences are returned.

    Results are memoized per word in the module-level examples cache. A
    fresh list is returned on every call, so callers may mutate it without
    affecting the cache.

    Args:
        word_no_nikkud: The word to look up.

    Returns:
        A list of matching sentences; empty when nothing matches.
    """
    if not _index:
        load()

    word = _strip_nikkud(word_no_nikkud.strip())
    if not word:
        return []

    if word in _examples_cache:
        return list(_examples_cache[word])

    # The same sentence can occur in several corpus texts; keep the first
    # occurrence only so both returned examples are never identical.
    candidates = list(dict.fromkeys(_index.get(word, [])))

    # Whole-token match: no word character immediately before or after.
    pattern = re.compile(r"(?<!\w)" + re.escape(word) + r"(?!\w)")
    matched = [s for s in candidates if pattern.search(s)]

    # Prefer shorter sentences; sorted() is stable, so ties keep index order.
    result = sorted(matched, key=len)[:MAX_EXAMPLES_PER_WORD]
    _examples_cache[word] = result
    return list(result)
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke test: look up a few common words and print what was found.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    load()
    for sample in ("שלום", "בית", "ספר", "מים", "אהבה", "ילד"):
        found = get_examples(sample)
        print(f"\n{sample}: {len(found)} examples")
        for sentence in found:
            print(f" → {sentence[:80]}")
    save_examples_cache()
|