hebrew_flash_cards/frequency_lookup.py
Sochen b086123bec feat: add apkg builder, frequency, Ben Yehuda examples, conjugation deck
Implements four major improvements to the Pealim Anki deck pipeline:

1. Automated .apkg generation (genanki) — no more manual Anki Desktop step.
   Both vocabulary and conjugation decks are built programmatically.

2. Word frequency ranking from hermitdave/FrequencyWords he_50k corpus.
   Notes sorted by rank so Anki presents most common words first.

3. Example sentences from Ben Yehuda public domain corpus (not pealim.com).
   Downloads txt_stripped.zip, indexes 25k texts, ~89% coverage on test set.

4. Conjugation drill deck — one card per form × verb.
   Input: verbs_input.txt (Hebrew infinitives). Initial set: 7 verbs (one
   per binyan). Extracts 28 forms each via pealim.com/search/ + table parse.

New files:
  apkg_builder.py     — genanki deck builder for both decks
  benyehuda.py        — Ben Yehuda corpus downloader + sentence indexer
  frequency_lookup.py — FrequencyWords downloader + rank lookup
  verbs_input.txt     — verb input list (7 test verbs, one per binyan)
  data/               — baseline CSVs + generated caches

Updated:
  conjugation_extract.py — rewritten: reads verbs_input.txt, searches
                           /search/?q= for slug, parses table by row labels
  requirements.txt       — add genanki, beautifulsoup4, lxml
  run.py                 — full orchestration pipeline with CLI flags
  .gitignore             — exclude venv/, benyehuda_index.json, audio/, output/

CLI:
  python run.py --skip-scrape --skip-audio --test 20  (quick test)
  python run.py --skip-scrape                          (full build)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 01:58:31 +00:00

85 lines
2.5 KiB
Python

#!/usr/bin/env python3
"""
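Hebrew word frequency lookup from the hermitdave/FrequencyWords corpus.

Downloads he_50k.txt once and stores the derived word -> rank table as JSON;
subsequent runs read from that cache instead of the network.

Exposed API:
    load(cache_path) -> None
    get_frequency_rank(word_no_nikkud) -> int | None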
"""
import json
import logging
import re
import unicodedata
from pathlib import Path
import requests
logger = logging.getLogger(__name__)

# Raw GitHub location of the 50k-entry Hebrew word list.
FREQ_URL = "/".join(
    (
        "https://raw.githubusercontent.com",
        "hermitdave/FrequencyWords",
        "master/content/2016/he/he_50k.txt",
    )
)

# JSON cache of the rank table, kept in a "data" directory beside this module.
CACHE_PATH = Path(__file__).parent.joinpath("data", "frequency_cache.json")

# Value passed as ``timeout`` to the download request.
REQUEST_TIMEOUT = 30

# In-memory rank table shared by the whole module:
# nikkud-free word -> rank, where rank 1 is the most common word.
_freq: dict[str, int] = {}
def _strip_nikkud(text: str) -> str:
"""Remove Hebrew nikkud (diacritics) from a string."""
return "".join(
ch for ch in unicodedata.normalize("NFD", text)
if unicodedata.category(ch) != "Mn"
)
def load(cache_path: Path = CACHE_PATH) -> None:
    """Populate the module-level rank table, downloading the corpus if needed.

    If *cache_path* exists and holds a JSON object, that object becomes the
    rank table and no network access happens. Otherwise the FrequencyWords
    list is downloaded, parsed into ``word -> rank`` (1 = most common, nikkud
    stripped, first occurrence of a word wins) and written to *cache_path*.

    Args:
        cache_path: Location of the JSON cache file. Its parent directory is
            created when the cache is written.

    Raises:
        requests.HTTPError: The download returned an error status.
        OSError: The cache file could not be read or written.
    """
    global _freq

    if cache_path.exists():
        try:
            with open(cache_path, encoding="utf-8") as f:
                cached = json.load(f)
        except ValueError:
            # json.JSONDecodeError and UnicodeDecodeError are both ValueErrors.
            # A truncated or corrupt cache should not break every later run.
            cached = None
        if isinstance(cached, dict):
            _freq = cached
            logger.info("Frequency cache loaded: %d entries", len(_freq))
            return
        logger.warning(
            "Frequency cache %s is unreadable; downloading again", cache_path
        )

    logger.info("Downloading FrequencyWords he_50k.txt …")
    resp = requests.get(FREQ_URL, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    # The word list is UTF-8 text. Pin the encoding so a response without a
    # charset header is not decoded with requests' ISO-8859-1 fallback.
    resp.encoding = "utf-8"

    # Build into a fresh dict: stale entries from an earlier load() cannot
    # leak into the new ranking, and a failure mid-parse leaves the
    # module-level table untouched.
    ranks: dict[str, int] = {}
    for line in resp.text.splitlines():
        parts = line.split()
        if not parts:
            continue  # blank or whitespace-only line
        word = _strip_nikkud(parts[0])
        if word and word not in ranks:
            # Ranks are dense: only words actually stored consume a rank.
            ranks[word] = len(ranks) + 1

    _freq = ranks

    cache_path.parent.mkdir(parents=True, exist_ok=True)
    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(_freq, f, ensure_ascii=False)
    logger.info("Frequency cache saved: %d entries → %s", len(_freq), cache_path)
def get_frequency_rank(word_no_nikkud: str) -> int | None:
    """Look up how common a word is.

    The rank table is loaded on demand the first time it is found empty.
    Surrounding whitespace is trimmed and nikkud is stripped from the input
    before the lookup, so vocalised input is accepted too.

    Returns:
        The word's rank, where 1 is the most common word, or ``None`` when
        the word does not appear in the table.
    """
    if len(_freq) == 0:
        load()

    key = _strip_nikkud(word_no_nikkud.strip())
    try:
        return _freq[key]
    except KeyError:
        return None
if __name__ == "__main__":
    # Manual smoke test: load the table and print ranks for a few common words.
    logging.basicConfig(format="%(levelname)s %(message)s", level=logging.INFO)
    load()
    for sample in ("שלום", "ספר", "בית", "מים", "כלב"):
        rank = get_frequency_rank(sample)
        print(f"{sample}: rank {rank}")