# hebrew_flash_cards/frequency_lookup.py

#!/usr/bin/env python3
"""
Hebrew word frequency lookup from hermitdave/FrequencyWords corpus.
Downloads he_50k.txt once; subsequent runs read from cache.
Exposed API: get_frequency_rank(word_no_nikkud) -> int | None
"""

import json
import logging
import re
import unicodedata
from pathlib import Path

import requests

# Per-module logger; handlers and levels are left to the application.
logger = logging.getLogger(__name__)

# Raw GitHub location of the 50k-entry Hebrew list in hermitdave/FrequencyWords.
FREQ_URL = (
    "https://raw.githubusercontent.com"
    "/hermitdave/FrequencyWords"
    "/master/content/2016/he/he_50k.txt"
)
# JSON cache lives in a "data" directory next to this module.
CACHE_PATH = Path(__file__).parent.joinpath("data", "frequency_cache.json")
# Value passed as ``timeout`` to the download request.
REQUEST_TIMEOUT = 30

# In-memory table filled by load(): word without nikkud -> rank (1 = most common).
_freq: dict[str, int] = {}


def _strip_nikkud(text: str) -> str:
    """Return *text* without nikkud.

    The string is NFD-normalized and every character whose Unicode category
    is ``Mn`` (non-spacing mark) is dropped, so the result is in decomposed
    form with all such combining marks removed.
    """
    decomposed = unicodedata.normalize("NFD", text)
    kept = [char for char in decomposed if unicodedata.category(char) != "Mn"]
    return "".join(kept)


def _parse_frequency_text(text: str) -> dict[str, int]:
    """Turn frequency-list text into a ``word -> rank`` mapping.

    The first whitespace-separated token of every non-blank line is taken as
    the word and stripped of nikkud. Ranks start at 1 and follow line order;
    a word that was already seen keeps its earlier (better) rank and does not
    consume a rank number.
    """
    ranks: dict[str, int] = {}
    for line in text.splitlines():
        fields = line.split()
        if not fields:
            continue
        word = _strip_nikkud(fields[0])
        if word and word not in ranks:
            ranks[word] = len(ranks) + 1
    return ranks


def load(cache_path: Path = CACHE_PATH) -> None:
    """Populate the module-level frequency table.

    If *cache_path* exists and holds a JSON object, that object becomes the
    table. Otherwise (no cache, or a cache that cannot be used) the word list
    is downloaded from ``FREQ_URL``, parsed, stored in the table and written
    to *cache_path* as JSON.

    Args:
        cache_path: Location of the JSON cache file.

    Raises:
        requests.RequestException: The download failed or returned an HTTP
            error status (via ``raise_for_status``).
        OSError: The cache file could not be read or written.
    """
    global _freq
    if cache_path.exists():
        try:
            with open(cache_path, encoding="utf-8") as f:
                cached = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError):
            # e.g. a cache truncated by an interrupted earlier run
            cached = None
        if isinstance(cached, dict):
            _freq = cached
            logger.info("Frequency cache loaded: %d entries", len(_freq))
            return
        logger.warning(
            "Ignoring unusable frequency cache at %s; downloading again",
            cache_path,
        )

    logger.info("Downloading FrequencyWords he_50k.txt …")
    resp = requests.get(FREQ_URL, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    # Do not rely on the charset requests infers from the headers: a text/*
    # response without an explicit charset would be decoded as ISO-8859-1
    # and corrupt every Hebrew word.
    resp.encoding = "utf-8"

    # Build a fresh table so ranks never mix with entries already in memory.
    _freq = _parse_frequency_text(resp.text)

    cache_path.parent.mkdir(parents=True, exist_ok=True)
    # Write to a sibling temp file and rename it into place so an interrupted
    # run cannot leave a half-written cache behind.
    tmp_path = cache_path.with_name(cache_path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(_freq, f, ensure_ascii=False)
    tmp_path.replace(cache_path)
    logger.info("Frequency cache saved: %d entries → %s", len(_freq), cache_path)


def get_frequency_rank(word_no_nikkud: str) -> int | None:
    """
    Look up how common a word is (rank 1 = most common).

    The frequency table is loaded on demand when it is still empty. The input
    is trimmed of surrounding whitespace and stripped of nikkud before the
    lookup. Returns None when the word is not in the table.
    """
    if not _freq:
        load()
    key = _strip_nikkud(word_no_nikkud.strip())
    try:
        return _freq[key]
    except KeyError:
        return None


if __name__ == "__main__":
    # Manual smoke test: load the table and print ranks for a few sample words.
    logging.basicConfig(format="%(levelname)s %(message)s", level=logging.INFO)
    load()
    for sample in ("שלום", "ספר", "בית", "מים", "כלב"):
        print(f"{sample}: rank {get_frequency_rank(sample)}")