- Cloze card pipeline: 924 cards from 2,296 AI-vetted Hebrew book sentences - Plurals deck: 375 notes (144 irregular + 231 regular from 86 mishkal patterns) - Ktiv male forms expanded to 20,711 entries for sentence matching - Project reorg: helpers.py (deduped strip_nikkud from 10 files), scripts/ for one-off tools, tests/ with smoke tests, deleted 3 dead files - Lint tooling: pyproject.toml with ruff/vulture/bandit/pytest config, .editorconfig, fixed all 129 ruff errors (B023 closure fix, SIM103, unused vars) - validate_apkg.py: card count range check for optional cloze template - Data caches committed: vetted_sentences, ktiv_male_forms, noun_plurals, noun_slug_map, vocab_sentence_matches, epub_sentence_index Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
72 lines
2.2 KiB
Python
72 lines
2.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Hebrew word frequency lookup from hermitdave/FrequencyWords corpus.
|
|
Downloads he_50k.txt once; subsequent runs read from cache.
|
|
Exposed API: get_frequency_rank(word_no_nikkud) -> int | None
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
|
|
from helpers import strip_nikkud as _strip_nikkud
|
|
|
|
# Module-scoped logger; handler/level configuration is left to the caller.
logger = logging.getLogger(__name__)

# Remote word list fetched when no local cache file exists.
FREQ_URL = "https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2016/he/he_50k.txt"
# JSON cache kept in a "data" directory next to this module.
CACHE_PATH = Path(__file__).parent.joinpath("data", "frequency_cache.json")
# Passed as the ``timeout`` argument of the download request.
REQUEST_TIMEOUT = 30

# In-memory lookup table: nikkud-free word -> rank (1 = most common).
# Starts empty and is filled by load().
_freq: dict[str, int] = {}
|
|
|
|
|
|
def load(cache_path: Path = CACHE_PATH) -> None:
    """Populate the module-level frequency table.

    If ``cache_path`` exists, the table is read from that JSON file.
    Otherwise the word list at ``FREQ_URL`` is downloaded, reduced to a
    ``word -> rank`` mapping (first token of each non-empty line, nikkud
    stripped, first occurrence wins) and written to ``cache_path`` so later
    runs can skip the download.

    Args:
        cache_path: Location of the JSON cache file. Its parent directory is
            created if it does not exist.

    Raises:
        requests.HTTPError: If the download responds with an error status
            (raised by ``raise_for_status``).
        OSError: If the cache file cannot be read or written.
    """
    global _freq

    if cache_path.exists():
        with open(cache_path, encoding="utf-8") as f:
            _freq = json.load(f)
        logger.info("Frequency cache loaded: %d entries", len(_freq))
        return

    logger.info("Downloading FrequencyWords he_50k.txt …")
    resp = requests.get(FREQ_URL, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    # The word list is expected to be UTF-8; decode it as such rather than
    # relying on whichever charset the server happens to declare.
    resp.encoding = "utf-8"

    # Build into a fresh dict so that (a) entries from an earlier load are
    # never mixed into the new table and (b) a failure while parsing does not
    # leave a partially filled global behind.
    ranks: dict[str, int] = {}
    for raw_line in resp.text.splitlines():
        fields = raw_line.split()
        if not fields:
            continue
        word = _strip_nikkud(fields[0])
        if word and word not in ranks:
            # Rank advances only when a new word is stored, so ranks are
            # dense: 1, 2, 3, … in order of first appearance.
            ranks[word] = len(ranks) + 1

    # Publish before touching the disk: lookups keep working for this process
    # even if persisting the cache fails below.
    _freq = ranks

    cache_path.parent.mkdir(parents=True, exist_ok=True)
    # Write to a sibling temp file and rename it into place, so an
    # interrupted run cannot leave a truncated JSON file that would break
    # every subsequent load.
    tmp_path = cache_path.with_name(cache_path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(ranks, f, ensure_ascii=False)
    tmp_path.replace(cache_path)
    logger.info("Frequency cache saved: %d entries → %s", len(ranks), cache_path)
|
|
|
|
|
|
def get_frequency_rank(word_no_nikkud: str) -> int | None:
    """
    Look up how common a word is in the frequency table.

    The input is stripped of surrounding whitespace and passed through
    ``_strip_nikkud`` before the lookup. The table is loaded on first use
    via ``load()`` when it is still empty.

    Returns the rank (1 = most common), or None when the word is not in
    the table.
    """
    # Lazy initialisation: only hit the cache/network when nothing is loaded.
    if not _freq:
        load()

    trimmed = word_no_nikkud.strip()
    lookup_key = _strip_nikkud(trimmed)
    try:
        return _freq[lookup_key]
    except KeyError:
        return None
|
|
|
|
|
|
if __name__ == "__main__":
    # Manual smoke run: load the table and print ranks for a few sample words.
    logging.basicConfig(format="%(levelname)s %(message)s", level=logging.INFO)
    load()
    for sample in ("שלום", "ספר", "בית", "מים", "כלב"):
        rank = get_frequency_rank(sample)
        print(f"{sample}: rank {rank}")
|