#!/usr/bin/env python3 """ Hebrew word frequency lookup from hermitdave/FrequencyWords corpus. Downloads he_50k.txt once; subsequent runs read from cache. Exposed API: get_frequency_rank(word_no_nikkud) -> int | None TODO: Rewrite to update words.json frequency field directly instead of writing to a separate frequency_cache.json. Currently the migration script bridges the gap. See Phase 5 in SPRINT_LOG.md. """ import json import logging from pathlib import Path import requests logger = logging.getLogger(__name__) FREQ_URL = "https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2016/he/he_50k.txt" CACHE_PATH = Path(__file__).parent / "data" / "frequency_cache.json" CLEAN_CACHE_PATH = Path(__file__).parent / "data" / "frequency_clean.json" REQUEST_TIMEOUT = 30 # Module-level cache: word_no_nikkud -> rank (1 = most common) _freq: dict[str, int] = {} def load(cache_path: Path = CACHE_PATH) -> None: """Load frequency data from cache, downloading if not present. Prefers frequency_clean.json (YAP-filtered) over raw frequency_cache.json. """ global _freq # Prefer YAP-cleaned frequency data if available clean_path = cache_path.parent / "frequency_clean.json" if cache_path == CACHE_PATH else None load_path = clean_path if clean_path and clean_path.exists() else cache_path if load_path.exists(): with open(load_path, encoding="utf-8") as f: _freq = json.load(f) label = "clean" if load_path == clean_path else "raw" logger.info(f"Frequency cache loaded ({label}): {len(_freq)} entries") return logger.info("Downloading FrequencyWords he_50k.txt …") resp = requests.get(FREQ_URL, timeout=REQUEST_TIMEOUT) resp.raise_for_status() rank = 1 for line in resp.text.splitlines(): line = line.strip() if not line: continue word = line.split()[0] if word and word not in _freq: _freq[word] = rank rank += 1 cache_path.parent.mkdir(parents=True, exist_ok=True) with open(cache_path, "w", encoding="utf-8") as f: json.dump(_freq, f, ensure_ascii=False) logger.info(f"Frequency cache saved: {len(_freq)} entries → {cache_path}") def get_frequency_rank(word_no_nikkud: str) -> int | None: """ Return the frequency rank of a word (1 = most common). Returns None if not found in the corpus. Expects ktiv male (no nikkud) input. """ if not _freq: load() clean = word_no_nikkud.strip() return _freq.get(clean) def get_freq_data() -> dict[str, int]: """Return the full frequency dict (word -> rank). Auto-loads from cache if not yet loaded. """ if not _freq: load() return _freq if __name__ == "__main__": logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s") load() tests = ["שלום", "ספר", "בית", "מים", "כלב"] for w in tests: print(f"{w}: rank {get_frequency_rank(w)}")