#!/usr/bin/env python3 """Assign pseudo-frequency to confusable groups using English word frequency. Problem: Confusable entries share the same ktiv_male and thus the same Hebrew frequency rank. This script uses English frequency to differentiate them so Anki sorts more-common meanings first. Algorithm: 1. For each confusable group where all entries share the same Hebrew frequency, extract the first meaningful English keyword from each entry's meaning field. 2. Look up English frequency rank for each keyword. 3. Assign pseudo_frequency: the most frequent English meaning keeps the original Hebrew rank; less frequent meanings get progressively higher (worse) ranks by adding an offset (100 * position in group). Usage: python3 scripts/assign_pseudo_frequency.py # assign and save python3 scripts/assign_pseudo_frequency.py --dry-run # preview only """ from __future__ import annotations import argparse import json import logging import re from collections import defaultdict from pathlib import Path logger = logging.getLogger(__name__) PROJECT_ROOT = Path(__file__).parent.parent WORDS_JSON = PROJECT_ROOT / "data" / "words.json" EN_FREQ_PATH = PROJECT_ROOT / "data" / "en_50k.txt" # Words too common/vague to use as frequency signal _EN_STOP = frozenset( { "to", "be", "a", "an", "the", "of", "in", "on", "at", "for", "and", "with", "by", "or", "but", "not", "as", "its", "it", "is", "was", "are", "from", "that", "this", "have", "has", "had", "do", "does", "did", "will", "would", "can", "could", "may", "might", "shall", "should", "must", "no", "yes", "very", "too", "also", "just", "only", "so", "up", "out", "into", "over", "after", "before", "about", "more", "than", "other", "some", "any", "all", "each", "every", "both", "few", "many", "much", "most", "such", "own", "same", "well", "still", "even", "how", "what", "when", "where", "which", "who", "whom", "whose", "why", "because", "if", "then", "else", "while", "until", "though", "whether", } ) def _load_en_freq() -> dict[str, int]: """Load English frequency data: word -> rank (1 = most common).""" freq: dict[str, int] = {} rank = 1 with open(EN_FREQ_PATH, encoding="utf-8") as f: for line in f: parts = line.strip().split() if parts: word = parts[0].lower() if word not in freq: freq[word] = rank rank += 1 return freq def _extract_keywords(meaning: str) -> list[str]: """Extract meaningful English keywords from a meaning string. Returns list of lowercase words, filtered for stop words and short words. """ # Strip parenthesized content, punctuation cleaned = re.sub(r"\([^)]*\)", " ", meaning) cleaned = re.sub(r"[^\w\s]", " ", cleaned) return [w.lower() for w in cleaned.split() if len(w) > 2 and w.lower() not in _EN_STOP] def assign_pseudo_frequencies( words: dict, en_freq: dict[str, int], dry_run: bool = False, ) -> int: """Assign pseudo_frequency to confusable groups. Returns count of changes.""" # Group by confusables_guid groups: dict[str, list[str]] = defaultdict(list) for key, entry in words.items(): cg = entry.get("confusables_guid") if cg: groups[cg].append(key) changes = 0 assigned_groups = 0 skipped_diff = 0 skipped_no_en = 0 for _guid, keys in groups.items(): entries = [words[k] for k in keys] freqs = [e.get("frequency") for e in entries] # Skip groups that are already differentiated unique_freqs = set(freqs) if len(unique_freqs) > 1: skipped_diff += 1 continue base_freq = freqs[0] # All same (or all None) # Look up English frequency for each entry en_ranks: list[tuple[int, str]] = [] # (en_rank, key) for key, entry in zip(keys, entries, strict=True): keywords = _extract_keywords(entry.get("meaning", "")) en_rank = 999_999 for kw in keywords[:5]: r = en_freq.get(kw) if r is not None: en_rank = r break en_ranks.append((en_rank, key)) # Sort by English frequency (lower rank = more common) en_ranks.sort() # Check if all entries have the same English rank (no signal) if len({r for r, _ in en_ranks}) <= 1: skipped_no_en += 1 continue assigned_groups += 1 # Assign pseudo_frequency: most common gets base, others get offset for position, (en_rank, key) in enumerate(en_ranks): pseudo = base_freq + position * 100 if base_freq is not None else 50000 + en_rank if not dry_run: words[key]["pseudo_frequency"] = pseudo changes += 1 if dry_run: meaning = words[key].get("meaning", "")[:40] logger.info( " [en:%5d] pseudo=%6d %s", en_rank, pseudo, meaning, ) logger.info( "Pseudo-frequency: %d groups assigned, %d already differentiated, %d no English signal", assigned_groups, skipped_diff, skipped_no_en, ) return changes def main() -> None: parser = argparse.ArgumentParser(description="Assign pseudo-frequency to confusables") parser.add_argument("--dry-run", action="store_true", help="Preview without saving") args = parser.parse_args() logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s", ) logger.info("Loading English frequency data: %s", EN_FREQ_PATH) en_freq = _load_en_freq() logger.info("English frequency: %d entries", len(en_freq)) with open(WORDS_JSON, encoding="utf-8") as f: words: dict = json.load(f) changes = assign_pseudo_frequencies(words, en_freq, dry_run=args.dry_run) if args.dry_run: logger.info("Dry run — %d changes would be made", changes) return with open(WORDS_JSON, "w", encoding="utf-8") as f: json.dump(words, f, ensure_ascii=False, indent=2) logger.info("Saved %d pseudo-frequency assignments to words.json", changes) if __name__ == "__main__": main()