feat: pseudo-frequency for confusables using English word frequency

264 confusable groups where all entries shared the same Hebrew frequency now have differentiated pseudo_frequency values based on English word commonality (hermitdave en_50k.txt). Most common meaning keeps base rank; less common meanings get +100 offset per position. Examples: - אב: "father" (en:194) → 2491, "bud" (en:2963) → 2591 - אח: "brother" (en:300) → 911, "fireplace" (en:9389) → 1011 Builder uses pseudo_frequency for sort order when available. Confusable card definitions now sorted most-common-first. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-04-03 05:28:30 +00:00 · 2026-04-03 05:28:30 +00:00 · 6d2d446ed5
commit 6d2d446ed5
parent f978e5f39a
4 changed files with 50821 additions and 543 deletions
--- a/apkg_builder.py
+++ b/apkg_builder.py
@ -969,9 +969,11 @@ def build_vocab_deck(
        if word_nikkud not in word_to_pos_cat:
            word_to_pos_cat[word_nikkud] = _categorize_pos(pos_raw) if pos_raw else "Other"
-    # Sort entries by frequency (null → 999999), applying limit after sort
+    # Sort entries by effective frequency (pseudo_frequency for confusables,
    # else regular frequency; null → 999999), applying limit after sort
    def _freq_key(item: tuple[str, dict]) -> int:
-        return item[1].get("frequency") or 999_999
+        e = item[1]
        return e.get("pseudo_frequency") or e.get("frequency") or 999_999
    sorted_entries = sorted(words.items(), key=_freq_key)
    if limit:
@ -1558,9 +1560,12 @@ def build_confusables_deck(
            guid = genanki.guid_for("confusable", entry["word"].get("ktiv_male", unique_key))
        guid_to_entries.setdefault(guid, []).append(entry)
    def _eff_freq(e: dict) -> int:
        return e.get("pseudo_frequency") or e.get("frequency") or 999_999
    for guid, group_entries in sorted(
        guid_to_entries.items(),
-        key=lambda x: sum(e.get("frequency") or 999_999 for e in x[1]) / len(x[1]),
+        key=lambda x: sum(_eff_freq(e) for e in x[1]) / len(x[1]),
    ):
        if guid in seen_guids:
            continue
@ -1579,6 +1584,10 @@ def build_confusables_deck(
                unique_entries.append(e)
        if len(unique_entries) < 2:
            continue
        # Sort by pseudo/frequency so most common meaning appears first
        unique_entries.sort(key=_eff_freq)
        if len(unique_entries) < 2:
            continue
        word_no_nik = unique_entries[0]["word"].get("ktiv_male", "")
        words_display = word_no_nik  # Show ktiv male (shared form) on front
--- a/data/en_50k.txt
+++ b/data/en_50k.txt
--- a/data/words.json
+++ b/data/words.json
--- a/scripts/assign_pseudo_frequency.py
+++ b/scripts/assign_pseudo_frequency.py
@ -0,0 +1,269 @@
 #!/usr/bin/env python3
 """Assign pseudo-frequency to confusable groups using English word frequency.
 Problem: Confusable entries share the same ktiv_male and thus the same Hebrew
 frequency rank.  This script uses English frequency to differentiate them so
 Anki sorts more-common meanings first.
 Algorithm:
  1. For each confusable group where all entries share the same Hebrew frequency,
     extract the first meaningful English keyword from each entry's meaning field.
  2. Look up English frequency rank for each keyword.
  3. Assign pseudo_frequency: the most frequent English meaning keeps the original
     Hebrew rank; less frequent meanings get progressively higher (worse) ranks
     by adding an offset (100 * position in group).
 Usage:
    python3 scripts/assign_pseudo_frequency.py              # assign and save
    python3 scripts/assign_pseudo_frequency.py --dry-run    # preview only
 """
 from __future__ import annotations
 import argparse
 import json
 import logging
 import re
 from collections import defaultdict
 from pathlib import Path
 # Module-level logger; its level and format are configured by main().
 logger = logging.getLogger(__name__)
 # Directory two levels above this file (the parent of the directory holding this script).
 PROJECT_ROOT = Path(__file__).parent.parent
 # Word database that main() loads and, unless --dry-run is given, overwrites in place.
 WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
 # English frequency list read by _load_en_freq(): the first token of each line
 # is the word, and line order defines the rank (first line = rank 1).
 EN_FREQ_PATH = PROJECT_ROOT / "data" / "en_50k.txt"
 # Words too common/vague to use as frequency signal.  _extract_keywords()
 # discards any token found here, so lookup falls through to the next keyword
 # of the meaning.
 # NOTE: _extract_keywords() tests token length (> 2) before consulting this
 # set, so the one- and two-letter entries below ("a", "to", "be", "of", ...)
 # are already excluded by the length check and are never actually looked up.
 _EN_STOP = frozenset(
    {
        "to",
        "be",
        "a",
        "an",
        "the",
        "of",
        "in",
        "on",
        "at",
        "for",
        "and",
        "with",
        "by",
        "or",
        "but",
        "not",
        "as",
        "its",
        "it",
        "is",
        "was",
        "are",
        "from",
        "that",
        "this",
        "have",
        "has",
        "had",
        "do",
        "does",
        "did",
        "will",
        "would",
        "can",
        "could",
        "may",
        "might",
        "shall",
        "should",
        "must",
        "no",
        "yes",
        "very",
        "too",
        "also",
        "just",
        "only",
        "so",
        "up",
        "out",
        "into",
        "over",
        "after",
        "before",
        "about",
        "more",
        "than",
        "other",
        "some",
        "any",
        "all",
        "each",
        "every",
        "both",
        "few",
        "many",
        "much",
        "most",
        "such",
        "own",
        "same",
        "well",
        "still",
        "even",
        "how",
        "what",
        "when",
        "where",
        "which",
        "who",
        "whom",
        "whose",
        "why",
        "because",
        "if",
        "then",
        "else",
        "while",
        "until",
        "though",
        "whether",
    }
 )
 def _load_en_freq(path: str | Path | None = None) -> dict[str, int]:
    """Load English frequency data as a ``word -> rank`` mapping.

    Each non-blank line contributes its first whitespace-separated token as the
    word; anything after it on the line is ignored.  Words are lower-cased.
    Ranks start at 1 for the first word read, so earlier lines are treated as
    more common.  A word seen again keeps its first rank and does not consume
    a new one, so ranks stay dense.

    Args:
        path: Frequency file to read.  Defaults to ``EN_FREQ_PATH`` so existing
            zero-argument callers are unaffected.

    Returns:
        Mapping from lower-cased word to its 1-based rank.
    """
    source = EN_FREQ_PATH if path is None else path
    freq: dict[str, int] = {}
    with open(source, encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if parts:
                # len(freq) + 1 is the next unused rank; setdefault leaves an
                # already-ranked word untouched, so duplicates cause no gaps.
                freq.setdefault(parts[0].lower(), len(freq) + 1)
    return freq
 def _extract_keywords(meaning: str) -> list[str]:
    """Pull candidate English keywords out of a meaning string.

    Parenthesized spans and punctuation are replaced by spaces, then the text
    is split on whitespace.  Tokens of one or two characters and tokens listed
    in ``_EN_STOP`` are dropped; the survivors are returned lower-cased, in
    their original order.
    """
    without_parens = re.sub(r"\([^)]*\)", " ", meaning)
    tokens = re.sub(r"[^\w\s]", " ", without_parens).split()

    keywords: list[str] = []
    for token in tokens:
        # Length is checked first, so short tokens never reach the stop list.
        if len(token) <= 2:
            continue
        lowered = token.lower()
        if lowered in _EN_STOP:
            continue
        keywords.append(lowered)
    return keywords
 def assign_pseudo_frequencies(
    words: dict,
    en_freq: dict[str, int],
    dry_run: bool = False,
 ) -> int:
    """Assign ``pseudo_frequency`` to confusable groups and return the change count.

    Entries are grouped by their ``confusables_guid``.  A group is processed
    only when all of its entries share one ``frequency`` value (possibly
    ``None``) and their English ranks are not all identical.  An entry's
    English rank is the rank of the first of its first five meaning keywords
    that appears in *en_freq*, or 999_999 when none does.

    Within a processed group, entries are ordered by ``(English rank, key)``
    and receive ``frequency + 100 * position``; when the shared frequency is
    ``None`` they receive ``50000 + English rank`` instead.  Entries whose
    English ranks tie are therefore ordered by their key.

    Args:
        words: Mapping of entry key to entry dict; mutated in place unless
            *dry_run* is true.
        en_freq: English word -> rank mapping (lower rank = more common).
        dry_run: When true, log each would-be assignment and leave *words*
            untouched.

    Returns:
        Number of entries that were (or, in a dry run, would be) assigned.
    """
    # Group by confusables_guid
    groups: dict[str, list[str]] = defaultdict(list)
    for key, entry in words.items():
        cg = entry.get("confusables_guid")
        if cg:
            groups[cg].append(key)

    changes = 0
    assigned_groups = 0
    skipped_diff = 0
    skipped_no_en = 0

    for keys in groups.values():
        entries = [words[k] for k in keys]
        freqs = [e.get("frequency") for e in entries]

        # Skip groups that are already differentiated
        if len(set(freqs)) > 1:
            skipped_diff += 1
            continue
        base_freq = freqs[0]  # All same (or all None)

        # Look up English frequency for each entry
        en_ranks: list[tuple[int, str]] = []  # (en_rank, key)
        for key, entry in zip(keys, entries, strict=True):
            # ``or ""`` also covers a "meaning" key that is present but null;
            # .get()'s default only applies when the key is missing entirely.
            keywords = _extract_keywords(entry.get("meaning") or "")
            en_rank = next(
                (r for r in map(en_freq.get, keywords[:5]) if r is not None),
                999_999,
            )
            en_ranks.append((en_rank, key))

        # Sort by English frequency (lower rank = more common)
        en_ranks.sort()

        # Check if all entries have the same English rank (no signal)
        if len({r for r, _ in en_ranks}) <= 1:
            skipped_no_en += 1
            continue

        assigned_groups += 1

        # Assign pseudo_frequency: most common gets base, others get offset
        for position, (en_rank, key) in enumerate(en_ranks):
            pseudo = base_freq + position * 100 if base_freq is not None else 50000 + en_rank
            changes += 1
            if dry_run:
                meaning = (words[key].get("meaning") or "")[:40]
                logger.info(
                    "  [en:%5d] pseudo=%6d  %s",
                    en_rank,
                    pseudo,
                    meaning,
                )
            else:
                words[key]["pseudo_frequency"] = pseudo

    logger.info(
        "Pseudo-frequency: %d groups assigned, %d already differentiated, %d no English signal",
        assigned_groups,
        skipped_diff,
        skipped_no_en,
    )
    return changes
 def main() -> None:
    """Command-line entry point.

    Parses ``--dry-run``, configures logging, loads the English frequency list
    and ``words.json``, computes the pseudo-frequency assignments, and writes
    ``words.json`` back unless this is a dry run.
    """
    arg_parser = argparse.ArgumentParser(description="Assign pseudo-frequency to confusables")
    arg_parser.add_argument("--dry-run", action="store_true", help="Preview without saving")
    dry_run: bool = arg_parser.parse_args().dry_run

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    logger.info("Loading English frequency data: %s", EN_FREQ_PATH)
    english_ranks = _load_en_freq()
    logger.info("English frequency: %d entries", len(english_ranks))

    with WORDS_JSON.open(encoding="utf-8") as src:
        words: dict = json.load(src)

    n_changes = assign_pseudo_frequencies(words, english_ranks, dry_run=dry_run)

    if dry_run:
        # Preview only: report the count and leave words.json untouched.
        logger.info("Dry run — %d changes would be made", n_changes)
    else:
        with WORDS_JSON.open("w", encoding="utf-8") as dst:
            json.dump(words, dst, ensure_ascii=False, indent=2)
        logger.info("Saved %d pseudo-frequency assignments to words.json", n_changes)
 if __name__ == "__main__":
    main()