feat: YAP-cleaned frequency corpus + two-tier assignment pipeline

- Add clean_frequency_corpus.py: the YAP morphological analyzer removes prefix+word combos (e.g. בבית=ב+בית) from the he_50k frequency data. Headwords are always protected. 30,430 clean entries remain from 49,999 raw.
- Add assign_frequency.py: two-tier assignment with PoS-aware homograph handling. Tier 1 matches headwords; Tier 2 matches inflections (any rank) and conjugations (rank > 5000 only, to avoid false positives). Function words claim frequency over content words in homograph groups, with manual overrides for 12 common dual-use words.
- frequency_lookup.py auto-prefers frequency_clean.json when available.
- 6,691 entries now have frequency (was 5,974); 717 newly assigned.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 06:22:55 +00:00 · 2026-03-10 06:22:55 +00:00 · 3b0f9defa9
commit 3b0f9defa9
parent b8b65442cb
6 changed files with 1884034 additions and 65460 deletions
--- a/data/frequency_clean.json
+++ b/data/frequency_clean.json
--- a/data/frequency_discarded.json
+++ b/data/frequency_discarded.json
--- a/data/words.json
+++ b/data/words.json
--- a/frequency_lookup.py
+++ b/frequency_lookup.py
@ -19,6 +19,7 @@ logger = logging.getLogger(__name__)

 FREQ_URL = "https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2016/he/he_50k.txt"
 CACHE_PATH = Path(__file__).parent / "data" / "frequency_cache.json"
+CLEAN_CACHE_PATH = Path(__file__).parent / "data" / "frequency_clean.json"
 REQUEST_TIMEOUT = 30

 # Module-level cache: word_no_nikkud -> rank (1 = most common)
@ -26,12 +27,19 @@ _freq: dict[str, int] = {}


 def load(cache_path: Path = CACHE_PATH) -> None:
-    """Load frequency data from cache, downloading if not present."""
+    """Load frequency data from cache, downloading if not present.
+
+    Prefers frequency_clean.json (YAP-filtered) over raw frequency_cache.json.
+    """
    global _freq
-    if cache_path.exists():
-        with open(cache_path, encoding="utf-8") as f:
+    # Prefer YAP-cleaned frequency data if available
+    clean_path = cache_path.parent / "frequency_clean.json" if cache_path == CACHE_PATH else None
+    load_path = clean_path if clean_path and clean_path.exists() else cache_path
+    if load_path.exists():
+        with open(load_path, encoding="utf-8") as f:
            _freq = json.load(f)
-        logger.info(f"Frequency cache loaded: {len(_freq)} entries")
+        label = "clean" if load_path == clean_path else "raw"
+        logger.info(f"Frequency cache loaded ({label}): {len(_freq)} entries")
        return

    logger.info("Downloading FrequencyWords he_50k.txt …")
--- a/scripts/assign_frequency.py
+++ b/scripts/assign_frequency.py
@ -0,0 +1,392 @@
+#!/usr/bin/env python3
+"""Assign frequency ranks from the cleaned corpus to words.json entries.
+
+Two-tier assignment with PoS priority:
+  Tier 1: Match headword ktiv_male directly against corpus
+  Tier 2: Match conjugated/inflected forms (only if no other entry already
+           claimed that corpus word via tier 1)
+
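+PoS handling (based on standalone-word likelihood in Hebrew text):
+  In a homograph group, function words (כינויי_גוף Pronoun, מילות_חיבור
+  Conjunction, מילית Particle, תוארי_הפועל Adverb, מילות_יחס Preposition) take
+  the frequency and content words (שם_עצם Noun, שם_תואר Adjective, פעלים Verb)
+  do not, unless the spelling is listed in SHARE_ALL_WORDS.
+  Statistics are printed in this PoS order:
+  כינויי_גוף (Pronoun) > מילות_חיבור (Conjunction) > שם_תואר (Adjective) >
+  מילית (Particle) > שם_עצם (Noun) > תוארי_הפועל (Adverb) >
+  מילות_יחס (Preposition) > פעלים (Verb)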
+
+Usage:
+    python3 scripts/assign_frequency.py              # assign and save
+    python3 scripts/assign_frequency.py --dry-run    # preview only
+    python3 scripts/assign_frequency.py --stats      # show statistics only
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+from collections import defaultdict
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+PROJECT_ROOT = Path(__file__).parent.parent
+WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
+CLEAN_CACHE = PROJECT_ROOT / "data" / "frequency_clean.json"
+RAW_CACHE = PROJECT_ROOT / "data" / "frequency_cache.json"
+
+# Function word PoS — these dominate content words in homograph groups
+FUNCTION_POS = frozenset({"כינויי_גוף", "מילות_חיבור", "מילית", "מילות_יחס", "תוארי_הפועל"})
+
+# Content PoS that loses frequency when a function word dominates
+# Adjectives also lose (e.g. כן "honest" vs כן "yes") — they're rare collisions
+CONTENT_POS = frozenset({"שם_עצם", "שם_תואר", "פעלים"})
+
+# Manual overrides: at these ktiv_male forms, ALL homographs share frequency.
+# The set is keyed by spelling (it is compared against the corpus word), not by rank.
+# These are cases where the content word is genuinely common enough to deserve it,
+# e.g. עם "people" (NN) alongside עם "with" (PREP), corpus rank 15.
+SHARE_ALL_WORDS = frozenset(
+    {
+        "עם",  # "people" (NN) + "with" (PREP)
+        "שם",  # "name" (NN) + "there" (ADV)
+        "אל",  # "god" (NN) + "to" (PREP) + "don't" (PART)
+        "עד",  # "witness"/"eternity" (NN) + "until" (PREP)
+        "פה",  # "mouth" (NN) + "here" (ADV)
+        "לאחר",  # "to be late" (VB) + "after" (PREP)
+        "יופי",  # "beauty" (NN) + "great!" (ADV)
+        "המון",  # "crowd" (NN) + "lots of" (ADV)
+        "חבל",  # "rope" (NN) + "it's a pity" (ADV)
+        "ראשית",  # "beginning" (NN) + "firstly" (ADV)
+        "עקב",  # "heel"/"footprint" (NN) + "due to" (CONJ)
+        "אולם",  # "hall" (NN) + "however" (ADV)
+    }
+)
+
+
+def _get_pos_tag(entry: dict) -> str:
+    """Return the entry's first tag that is not a root marker.
+
+    The ``tags`` field is a whitespace-separated string; tokens starting with
+    "שורש" are skipped. Returns "unknown" when the field is missing, empty,
+    or holds only such tokens.
+    """
+    raw_tags = entry.get("tags") or ""
+    return next(
+        (token for token in raw_tags.split() if not token.startswith("שורש")),
+        "unknown",
+    )
+
+
+def _build_form_index(words: dict) -> dict[str, list[tuple[str, str]]]:
+    """Build the reverse index ktiv_male form -> [(unique_key, match_type), ...].
+
+    match_type is "headword" for the entry's own spelling, "conjugation" for
+    forms listed under ``conjugation.active_forms`` / ``hufal_pual_forms``, and
+    "inflection" for noun / preposition / adjective inflection cells.
+    """
+    reverse: dict[str, list[tuple[str, str]]] = defaultdict(list)
+    conjugation_lists = ("active_forms", "hufal_pual_forms")
+    inflection_fields = ("noun_inflection", "preposition_inflection", "adjective_inflection")
+
+    for unique_key, record in words.items():
+        headword = (record.get("word") or {}).get("ktiv_male")
+        if headword:
+            reverse[headword].append((unique_key, "headword"))
+
+        # Verb conjugations collide with unrelated headwords, so they are tagged
+        # separately; the caller only uses them for entries without a rank.
+        conjugation = record.get("conjugation") or {}
+        for list_name in conjugation_lists:
+            for item in conjugation.get(list_name) or []:
+                if not isinstance(item, dict):
+                    continue
+                spelling = (item.get("form") or {}).get("ktiv_male")
+                if spelling:
+                    # Drop trailing "!", right-to-left marks and spaces.
+                    reverse[spelling.rstrip("!\u200f ")].append((unique_key, "conjugation"))
+
+        for field_name in inflection_fields:
+            for cell in (record.get(field_name) or {}).values():
+                if isinstance(cell, dict) and cell.get("ktiv_male"):
+                    reverse[cell["ktiv_male"]].append((unique_key, "inflection"))
+
+    return dict(reverse)
+
+
+def _should_get_frequency(
+    entry: dict,
+    all_headword_entries: list[tuple[str, str]],
+    corpus_word: str,
+    words: dict,
+) -> bool:
+    """Tell whether ``entry`` keeps the frequency of a shared spelling.
+
+    Rules, in order:
+    - A group of at most one headword match always qualifies.
+    - A spelling listed in SHARE_ALL_WORDS qualifies for every member.
+    - A content-word entry is refused when any member of the group is a
+      function word.
+    - Everything else qualifies.
+    """
+    # Lone matches and manually whitelisted spellings need no PoS check.
+    if len(all_headword_entries) <= 1 or corpus_word in SHARE_ALL_WORDS:
+        return True
+
+    own_pos = _get_pos_tag(entry)
+
+    group_has_function_word = False
+    for member_key, _match_type in all_headword_entries:
+        if _get_pos_tag(words[member_key]) in FUNCTION_POS:
+            group_has_function_word = True
+            break
+
+    if group_has_function_word and own_pos in CONTENT_POS:
+        return False
+    return True
+
+
+def assign_frequencies(
+    words: dict,
+    freq_corpus: dict[str, int],
+    raw_corpus: dict[str, int] | None = None,
+    upgrade: bool = False,
+) -> dict[str, dict]:
+    """Assign frequency ranks to words.json entries. Returns assignment details.
+
+    Args:
+        words: words.json content, unique_key -> entry dict. Not modified.
+        freq_corpus: controls which words are valid (cleaned corpus) and the
+            order in which corpus words are visited (ascending rank value).
+        raw_corpus: provides original rank numbers (with gaps). If not
+            provided, freq_corpus ranks are used (re-ranked, no gaps).
+        upgrade: if True, tier 2 may replace an entry's rank when an
+            *inflected* form has a better (lower) rank than the one already
+            assigned. Conjugated forms never upgrade.
+
+    Returns:
+        unique_key -> {"rank", "source", "corpus_word"} where source is
+        "headword", "inflection", "conjugation" or "upgrade:inflection".
+    """
+    rank_source = raw_corpus if raw_corpus is not None else freq_corpus
+    form_index = _build_form_index(words)
+
+    # Track which corpus words have been claimed by tier 1
+    tier1_claimed: set[str] = set()
+
+    # Results tracking
+    assignments: dict[str, dict] = {}  # unique_key -> {rank, source, corpus_word}
+
+    # --- Tier 1: headword matches ---
+    # For each corpus word, find all headword matches and assign to eligible entries.
+    # Homograph groups: function words get frequency, content words don't (unless overridden).
+    corpus_by_rank = sorted(freq_corpus.items(), key=lambda x: x[1])
+
+    for corpus_word, _clean_rank in corpus_by_rank:
+        matches = form_index.get(corpus_word, [])
+        headword_matches = [(k, t) for k, t in matches if t == "headword"]
+        if not headword_matches:
+            continue
+
+        # Fall back to the cleaned rank when the raw corpus lacks this word.
+        original_rank = rank_source.get(corpus_word, _clean_rank)
+        assigned_any = False
+        for entry_key, _ in headword_matches:
+            if entry_key in assignments:
+                continue
+            if _should_get_frequency(words[entry_key], headword_matches, corpus_word, words):
+                assignments[entry_key] = {
+                    "rank": original_rank,
+                    "source": "headword",
+                    "corpus_word": corpus_word,
+                }
+                assigned_any = True
+
+        # A corpus word counts as claimed only if some entry actually took it.
+        if assigned_any:
+            tier1_claimed.add(corpus_word)
+
+    tier1_count = len(assignments)
+    logger.info("Tier 1 (headword): %d entries assigned", tier1_count)
+
+    # --- Tier 2: conjugation/inflection matches ---
+    # Only use corpus words NOT claimed in tier 1.
+    # A corpus word that matches an inflection is "owned" by that headword —
+    # it cannot also be used by an unrelated verb via conjugation.
+    # Upgrades (when enabled) are only ever taken from inflection matches.
+    for corpus_word, _clean_rank in corpus_by_rank:
+        if corpus_word in tier1_claimed:
+            continue
+
+        matches = form_index.get(corpus_word, [])
+        secondary_matches = [(k, t) for k, t in matches if t in ("conjugation", "inflection")]
+        if not secondary_matches:
+            continue
+
+        original_rank = rank_source.get(corpus_word, _clean_rank)
+
+        # Split by type: inflections take priority over conjugations
+        inflection_matches = [(k, t) for k, t in secondary_matches if t == "inflection"]
+        conjugation_matches = [(k, t) for k, t in secondary_matches if t == "conjugation"]
+
+        # If any inflection matches exist, this corpus word belongs to inflection.
+        # Don't let conjugations claim it.
+        active_matches = inflection_matches if inflection_matches else conjugation_matches
+
+        # At most ONE entry is assigned or upgraded per corpus word: both
+        # successful branches below end with `break`.
+        for entry_key, match_type in active_matches:
+            existing = assignments.get(entry_key)
+            if existing is None:
+                # New assignment — conjugations only allowed for rank > 5000
+                # (too many false positives in the important tiers)
+                if match_type == "conjugation" and original_rank <= 5000:
+                    continue
+                assignments[entry_key] = {
+                    "rank": original_rank,
+                    "source": match_type,
+                    "corpus_word": corpus_word,
+                }
+                break
+            if upgrade and match_type == "inflection" and original_rank < existing["rank"]:
+                # Upgrade — only allowed for inflections (conjugations collide too much)
+                assignments[entry_key] = {
+                    "rank": original_rank,
+                    "source": f"upgrade:{match_type}",
+                    "corpus_word": corpus_word,
+                }
+                break
+
+    # Upgrades overwrite existing keys, so they are not part of this count.
+    tier2_count = len(assignments) - tier1_count
+    logger.info("Tier 2 (conjugation/inflection): %d entries assigned", tier2_count)
+
+    return assignments
+
+
+def print_stats(words: dict, assignments: dict, freq_corpus: dict) -> None:
+    """Print detailed statistics about frequency assignment to stdout.
+
+    Args:
+        words: words.json content (unique_key -> entry), still holding the
+            frequencies from before this run.
+        assignments: result of assign_frequencies
+            (unique_key -> {"rank", "source", "corpus_word"}).
+        freq_corpus: the corpus used for assignment; only its size is shown.
+
+    Entries whose ``word`` field or ``ktiv_male`` value is null are printed
+    with an empty spelling instead of raising.
+    """
+    from collections import Counter
+
+    total = len(words)
+    assigned = len(assignments)
+    previously_had = sum(1 for e in words.values() if e.get("frequency") is not None)
+
+    print(f"\n{'=' * 60}")
+    print("Frequency Assignment Statistics")
+    print(f"{'=' * 60}")
+    print(f"Words.json entries:      {total}")
+    print(f"Clean corpus size:       {len(freq_corpus)}")
+    print(f"Previously had freq:     {previously_had}")
+    print(f"Now assigned:            {assigned}")
+    # Net change: entries that lost their frequency reduce this number.
+    print(f"Newly gained:            {assigned - previously_had}")
+    print(f"Still unlisted:          {total - assigned}")
+
+    # By tier ("upgrade:*" sources are not counted in any of the three lines)
+    tier1 = sum(1 for a in assignments.values() if a["source"] == "headword")
+    tier2_conj = sum(1 for a in assignments.values() if a["source"] == "conjugation")
+    tier2_inf = sum(1 for a in assignments.values() if a["source"] == "inflection")
+    print("\nBy assignment tier:")
+    print(f"  Tier 1 (headword):     {tier1}")
+    print(f"  Tier 2 (conjugation):  {tier2_conj}")
+    print(f"  Tier 2 (inflection):   {tier2_inf}")
+
+    # By PoS
+    print("\nBy PoS:")
+    pos_assigned: Counter = Counter()
+    pos_total: Counter = Counter()
+    for k, v in words.items():
+        pos = _get_pos_tag(v)
+        pos_total[pos] += 1
+        if k in assignments:
+            pos_assigned[pos] += 1
+    pos_order = [
+        "כינויי_גוף",
+        "מילות_חיבור",
+        "שם_תואר",
+        "מילית",
+        "שם_עצם",
+        "תוארי_הפועל",
+        "מילות_יחס",
+        "פעלים",
+        "unknown",
+    ]
+    # Tags outside pos_order sort last (key 99).
+    for pos in sorted(pos_total, key=lambda p: pos_order.index(p) if p in pos_order else 99):
+        a = pos_assigned[pos]
+        t = pos_total[pos]
+        pct = a / t * 100 if t else 0
+        print(f"  {pos:20s}: {a:5d}/{t:5d} ({pct:.0f}%)")
+
+    # By frequency tier (using apkg_builder tiers)
+    print("\nBy frequency tier:")
+    tiers = {
+        "Core (1-500)": (1, 500),
+        "Essential (501-1500)": (501, 1500),
+        "Intermediate (1501-3000)": (1501, 3000),
+        "Upper-intermediate (3001-5000)": (3001, 5000),
+        "Advanced (5001-10000)": (5001, 10000),
+        "Rare (10001+)": (10001, 999999),
+    }
+    for label, (lo, hi) in tiers.items():
+        count = sum(1 for a in assignments.values() if lo <= a["rank"] <= hi)
+        print(f"  {label:35s}: {count}")
+
+    # Top 20 newly assigned (entries that didn't have frequency before)
+    newly = []
+    for k, a in assignments.items():
+        if words[k].get("frequency") is None:
+            # "word" may be null, and tier-2 entries may lack a headword
+            # spelling; fall back to "" so the :15s format below cannot fail.
+            w = words[k].get("word") or {}
+            newly.append((a["rank"], k, w.get("ktiv_male") or "", a["source"], a["corpus_word"]))
+    newly.sort()
+    if newly:
+        print("\nTop 20 newly assigned entries:")
+        for rank, _key, ktiv, source, corpus_word in newly[:20]:
+            print(f"  rank {rank:5d}: {ktiv:15s} via {source:12s} (corpus: {corpus_word})")
+
+    # Entries that LOST frequency (had it before, not assigned now)
+    lost = []
+    for k, v in words.items():
+        old_freq = v.get("frequency")
+        if old_freq is not None and k not in assignments:
+            w = v.get("word") or {}
+            lost.append((old_freq, k, w.get("ktiv_male") or ""))
+    lost.sort()
+    if lost:
+        print(f"\nEntries that would LOSE frequency ({len(lost)} total):")
+        for rank, _key, ktiv in lost[:20]:
+            print(f"  was rank {rank:5d}: {ktiv}")
+
+
+def main() -> None:
+    """CLI entry point: load the corpus and words.json, assign ranks, save.
+
+    The cleaned corpus is used when present, otherwise the raw one. When the
+    cleaned corpus is used and the raw one also exists, the raw corpus supplies
+    the rank numbers. Statistics are always printed; with ``--stats`` or
+    ``--dry-run`` nothing is written. Otherwise every entry's ``frequency`` is
+    set to its assigned rank, or to None when it received no assignment, and
+    words.json is rewritten.
+
+    Exits with status 1 when no frequency corpus file exists.
+    """
+    parser = argparse.ArgumentParser(description="Assign frequency to words.json")
+    parser.add_argument("--dry-run", action="store_true", help="Preview without saving")
+    parser.add_argument("--stats", action="store_true", help="Show statistics only")
+    parser.add_argument(
+        "--upgrade",
+        action="store_true",
+        # Only inflection matches can upgrade; conjugations never do.
+        help="Allow tier 2 to upgrade an entry's rank from a better-ranked inflected form",
+    )
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+
+    # Load data
+    freq_path = CLEAN_CACHE if CLEAN_CACHE.exists() else RAW_CACHE
+    if not freq_path.exists():
+        # Neither corpus is available: fail with a clear message, not a traceback.
+        logger.error("Frequency corpus not found: %s (or %s)", CLEAN_CACHE, RAW_CACHE)
+        raise SystemExit(1)
+    logger.info("Loading frequency corpus: %s", freq_path)
+    with open(freq_path, encoding="utf-8") as f:
+        freq_corpus: dict[str, int] = json.load(f)
+
+    # Load raw corpus for original rank numbers (with gaps)
+    raw_corpus: dict[str, int] | None = None
+    if RAW_CACHE.exists() and freq_path != RAW_CACHE:
+        with open(RAW_CACHE, encoding="utf-8") as f:
+            raw_corpus = json.load(f)
+        logger.info("Using original ranks from %s", RAW_CACHE)
+
+    with open(WORDS_JSON, encoding="utf-8") as f:
+        words: dict = json.load(f)
+
+    logger.info("Corpus: %d entries, Words.json: %d entries", len(freq_corpus), len(words))
+
+    # Run assignment
+    assignments = assign_frequencies(words, freq_corpus, raw_corpus, upgrade=args.upgrade)
+
+    # Stats (printed before words is mutated, so "previous" values are still visible)
+    print_stats(words, assignments, freq_corpus)
+
+    if args.stats or args.dry_run:
+        if args.dry_run:
+            logger.info("Dry run — no changes saved")
+        return
+
+    # Apply to words.json: assigned entries get their rank, all others are cleared.
+    changed = 0
+    for key, entry in words.items():
+        if key in assignments:
+            new_rank = assignments[key]["rank"]
+            if entry.get("frequency") != new_rank:
+                entry["frequency"] = new_rank
+                changed += 1
+        elif entry.get("frequency") is not None:
+            entry["frequency"] = None
+            changed += 1
+
+    with open(WORDS_JSON, "w", encoding="utf-8") as f:
+        json.dump(words, f, ensure_ascii=False, indent=2)
+
+    logger.info("Updated %d entries in words.json", changed)
+
+if __name__ == "__main__":
+    main()
--- a/scripts/clean_frequency_corpus.py
+++ b/scripts/clean_frequency_corpus.py
@ -0,0 +1,400 @@
+#!/usr/bin/env python3
+"""Clean the Hebrew frequency corpus by removing prefix+word combinations.
+
+Two modes:
+  --mode yap       (default) Use YAP morphological analyzer for accurate prefix detection.
+                   Requires YAP API running at localhost:8000.
+  --mode heuristic Use rule-based prefix stripping (no external dependencies).
+
+Both modes preserve words that exist as known dictionary forms in words.json.
+
+Usage:
+    python3 scripts/clean_frequency_corpus.py                    # YAP mode
+    python3 scripts/clean_frequency_corpus.py --mode heuristic   # heuristic fallback
+    python3 scripts/clean_frequency_corpus.py --dry-run          # preview only
+    python3 scripts/clean_frequency_corpus.py --resume           # resume YAP from checkpoint
+    python3 scripts/clean_frequency_corpus.py --limit 1000       # process first N entries
+
+Input:  data/frequency_cache.json   (raw he_50k.txt, 49999 entries)
+Output: data/frequency_clean.json   (filtered, prefix combos removed)
+        data/frequency_discarded.json (discarded entries with reason)
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import os
+import sys
+import time
+from pathlib import Path
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+PROJECT_ROOT = Path(__file__).parent.parent
+RAW_CACHE = PROJECT_ROOT / "data" / "frequency_cache.json"
+CLEAN_CACHE = PROJECT_ROOT / "data" / "frequency_clean.json"
+DISCARDED = PROJECT_ROOT / "data" / "frequency_discarded.json"
+WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
+CHECKPOINT = PROJECT_ROOT / "data" / "_yap_checkpoint.json"
+
+YAP_URL = os.environ.get("YAP_URL", "http://localhost:8000/yap/heb/joint")
+YAP_TIMEOUT = 10
+BATCH_SAVE_INTERVAL = 500
+
+# --- YAP mode constants ---
+# POS tags that mark a lattice arc as a prefix morpheme (checked against both
+# the cpos and pos columns of the arc)
+PREFIX_POS = frozenset({"PREPOSITION", "CONJ", "DEF", "REL"})
+# POS tags of the host word: a prefix arc directly followed by an arc carrying
+# one of these tags makes the corpus word count as a prefix+word combo (discard)
+HOST_POS = frozenset({"NN", "NNP", "NNT", "PRP", "CD", "DT", "EX"})
+
+# --- Heuristic mode constants ---
+# Hebrew prefix combinations, longest first for greedy matching.
+PREFIXES = [
+    # 4-char
+    "וכשמ",
+    "וכשב",
+    "וכשל",
+    "וכשה",
+    # 3-char
+    "וכש",
+    "ומה",
+    "ובה",
+    "וכה",
+    "ולה",
+    "ומש",
+    "ובש",
+    "וכב",
+    "ולב",
+    "ומב",
+    "וכל",
+    "ולכ",
+    "שבה",
+    "שמה",
+    # 2-char
+    "כש",
+    "מה",
+    "בה",
+    "כה",
+    "לה",
+    "מש",
+    "בש",
+    "וב",
+    "וה",
+    "וכ",
+    "ול",
+    "ומ",
+    "וש",
+    "כב",
+    "לב",
+    "מב",
+    "כל",
+    "לכ",
+    "שב",
+    "שה",
+    "שכ",
+    "של",
+    "שמ",
+    # 1-char
+    "ב",
+    "ה",
+    "ו",
+    "כ",
+    "ל",
+    "מ",
+    "ש",
+]
+MIN_REMAINDER_LEN = 2
+
+
+def _load_known_forms(words_path: Path) -> set[str]:
+    """Load all known ktiv_male forms from words.json.
+
+    Collects headwords, verb forms and noun / preposition / adjective
+    inflections. Verb forms are read from both layouts: nested under
+    ``entry["conjugation"]`` with the spelling in a ``"form"`` sub-dict (the
+    layout scripts/assign_frequency.py reads), and the flat top-level layout
+    this function originally expected. Verb spellings are added both verbatim
+    and with trailing "!", right-to-left marks and spaces removed, so they
+    compare equal to corpus words.
+
+    Args:
+        words_path: location of words.json.
+
+    Returns:
+        The set of spellings; empty (with a warning) when the file is missing.
+    """
+    if not words_path.exists():
+        logger.warning("words.json not found at %s — no dictionary filter", words_path)
+        return set()
+
+    with open(words_path, encoding="utf-8") as f:
+        words = json.load(f)
+
+    known: set[str] = set()
+    for entry in words.values():
+        w = entry.get("word") or {}
+        if km := w.get("ktiv_male"):
+            known.add(km)
+
+        # Verb forms may sit directly on the entry or under "conjugation".
+        containers = [entry]
+        conj = entry.get("conjugation")
+        if isinstance(conj, dict):
+            containers.append(conj)
+        for container in containers:
+            for list_name in ("active_forms", "hufal_pual_forms"):
+                for form in container.get(list_name) or []:
+                    if not isinstance(form, dict):
+                        continue
+                    # The spelling is either on the item itself or in item["form"].
+                    nested = form.get("form")
+                    form_data = nested if isinstance(nested, dict) else form
+                    if km2 := form_data.get("ktiv_male"):
+                        known.add(km2)
+                        if stripped := km2.rstrip("!\u200f "):
+                            known.add(stripped)
+
+        for field in ("noun_inflection", "preposition_inflection", "adjective_inflection"):
+            for inf_data in (entry.get(field) or {}).values():
+                if isinstance(inf_data, dict) and (km4 := inf_data.get("ktiv_male")):
+                    known.add(km4)
+
+    logger.info("Loaded %d known dictionary forms from words.json", len(known))
+    return known
+
+
+# ── YAP mode ──────────────────────────────────────────────────────────────
+
+
+def query_yap(word: str) -> dict | None:
+    """Send a single word to YAP and return the decoded JSON response.
+
+    Args:
+        word: the corpus word to analyze.
+
+    Returns:
+        The parsed response, or None (after logging a warning) when the request
+        fails, the server answers with an error status, or the body is not
+        valid JSON.
+    """
+    # NOTE(review): the text is sent with two trailing spaces, presumably a YAP
+    # input convention — confirm before changing.
+    payload = {"text": f"{word}  "}
+    try:
+        resp = requests.post(YAP_URL, json=payload, timeout=YAP_TIMEOUT)
+        resp.raise_for_status()
+        return resp.json()
+    except (requests.RequestException, ValueError) as e:
+        # ValueError: on older requests releases a non-JSON body raises a plain
+        # ValueError subclass, which RequestException alone does not cover.
+        logger.warning("YAP request failed for '%s': %s", word, e)
+        return None
+
+
+def is_prefix_combo_yap(yap_response: dict) -> tuple[bool, str]:
+    """Report whether any lattice path splits the word into prefix + host.
+
+    The ``ma_lattice`` value is read as tab-separated rows whose first six
+    columns are from, to, form, lemma, cpos, pos; shorter rows are ignored.
+    A hit is a row tagged as a prefix (cpos or pos in PREFIX_POS) whose end
+    node is the start node of a row tagged as a host (cpos or pos in HOST_POS).
+    One such pair anywhere in the lattice is enough.
+
+    Returns:
+        (True, "prefix(cpos)+host(cpos)") for the first hit in row order,
+        otherwise (False, "").
+    """
+    lattice = yap_response.get("ma_lattice", "")
+    if not lattice:
+        return False, ""
+
+    rows = [
+        columns
+        for columns in (raw.split("\t") for raw in lattice.strip().split("\n") if raw.strip())
+        if len(columns) >= 6
+    ]
+
+    # A prefix+host split needs at least two arcs.
+    if len(rows) < 2:
+        return False, ""
+
+    for _src, dst, prefix_form, _lemma, prefix_cpos, prefix_pos, *_ in rows:
+        if not (prefix_cpos in PREFIX_POS or prefix_pos in PREFIX_POS):
+            continue
+        for nxt_src, _dst, host_form, _host_lemma, host_cpos, host_pos, *_ in rows:
+            if nxt_src == dst and (host_cpos in HOST_POS or host_pos in HOST_POS):
+                return True, f"{prefix_form}({prefix_cpos})+{host_form}({host_cpos})"
+
+    return False, ""
+
+
+# ── Heuristic mode ────────────────────────────────────────────────────────
+
+
+def find_prefix_decomposition(word: str, freq: dict[str, int]) -> tuple[str, str] | None:
+    """Find a prefix whose remainder is a better-ranked corpus word (heuristic).
+
+    Prefixes are tried in PREFIXES order. The first one for which the rest of
+    the word is at least MIN_REMAINDER_LEN long, is in ``freq``, and has a
+    strictly lower rank value than ``word`` wins.
+
+    Returns:
+        (prefix, remainder), or None when no prefix qualifies or the word is
+        no longer than MIN_REMAINDER_LEN.
+    """
+    if len(word) <= MIN_REMAINDER_LEN:
+        return None
+
+    # Words missing from the corpus count as ranked after everything else.
+    own_rank = freq.get(word, 999999)
+
+    splits = ((head, word[len(head) :]) for head in PREFIXES if word.startswith(head))
+    for head, tail in splits:
+        if len(tail) >= MIN_REMAINDER_LEN and tail in freq and freq[tail] < own_rank:
+            return head, tail
+
+    return None
+
+
+# ── Main ──────────────────────────────────────────────────────────────────
+
+
+def main() -> None:
+    """CLI entry point: filter the raw frequency corpus and write the results.
+
+    Loads the raw cache (exits with status 1 if it is missing), optionally
+    truncates it to the first ``--limit`` words by rank, runs the selected
+    detection mode, and, unless ``--dry-run`` is given, writes the surviving
+    words re-ranked from 1 without gaps plus the list of discarded entries.
+    """
+    cli = argparse.ArgumentParser(description="Clean frequency corpus")
+    cli.add_argument("--mode", choices=["yap", "heuristic"], default="yap", help="Detection mode")
+    cli.add_argument("--dry-run", action="store_true", help="Show removals without saving")
+    cli.add_argument("--resume", action="store_true", help="Resume YAP mode from checkpoint")
+    cli.add_argument("--limit", type=int, default=0, help="Process only first N words (0=all)")
+    args = cli.parse_args()
+
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+
+    if not RAW_CACHE.exists():
+        logger.error("Raw frequency cache not found: %s", RAW_CACHE)
+        sys.exit(1)
+
+    with open(RAW_CACHE, encoding="utf-8") as source:
+        raw_freq: dict[str, int] = json.load(source)
+
+    logger.info("Raw frequency corpus: %d entries", len(raw_freq))
+
+    # Most frequent first; --limit keeps only the head of that ordering.
+    words_by_rank = sorted(raw_freq.items(), key=lambda pair: pair[1])
+    if args.limit:
+        words_by_rank = words_by_rank[: args.limit]
+
+    if args.mode == "heuristic":
+        dictionary_forms = _load_known_forms(WORDS_JSON)
+        discarded_list = _run_heuristic_mode(words_by_rank, raw_freq, dictionary_forms)
+    else:
+        discarded_list = _run_yap_mode(words_by_rank, args)
+
+    logger.info(
+        "Done. Kept: %d, Discarded: %d",
+        len(words_by_rank) - len(discarded_list),
+        len(discarded_list),
+    )
+
+    if args.dry_run:
+        logger.info("Dry run — no files written")
+        return
+
+    # Surviving words keep their relative order and are renumbered 1..N.
+    dropped = {item["word"] for item in discarded_list}
+    survivors = [word for word, _old_rank in words_by_rank if word not in dropped]
+    clean_freq: dict[str, int] = {word: position for position, word in enumerate(survivors, start=1)}
+
+    with open(CLEAN_CACHE, "w", encoding="utf-8") as sink:
+        json.dump(clean_freq, sink, ensure_ascii=False)
+    logger.info("Clean frequency saved: %d entries → %s", len(clean_freq), CLEAN_CACHE)
+
+    with open(DISCARDED, "w", encoding="utf-8") as sink:
+        json.dump(discarded_list, sink, ensure_ascii=False, indent=2)
+    logger.info("Discarded entries saved: %d → %s", len(discarded_list), DISCARDED)
+
+
+def _run_yap_mode(
+    words_by_rank: list[tuple[str, int]],
+    args: argparse.Namespace,
+) -> list[dict]:
+    """Run YAP-based prefix detection.
+
+    Args:
+        words_by_rank: (word, rank) pairs, most frequent first.
+        args: parsed CLI options; ``resume`` and ``dry_run`` are read.
+
+    Returns:
+        One {"word", "original_rank", "reason"} dict per discarded word.
+
+    Behaviour:
+    - Exits with status 1 when the YAP API cannot be reached.
+    - Words that are known dictionary forms in words.json are always kept, as
+      the module docstring promises for both modes. Previously only heuristic
+      mode applied this protection.
+    - Single-character and ASCII-only words are kept without querying YAP.
+    - A word whose YAP request fails is kept for this run but NOT stored in the
+      checkpoint, so ``--resume`` retries it. "yap_error" entries found in an
+      older checkpoint are retried too.
+    - Progress is checkpointed every BATCH_SAVE_INTERVAL words (skipped on dry
+      runs); the checkpoint file is removed once the run completes.
+    """
+    # Check YAP connectivity
+    test = query_yap("בדיקה")
+    if test is None:
+        logger.error("Cannot connect to YAP API at %s", YAP_URL)
+        sys.exit(1)
+    logger.info("YAP API connected")
+
+    # Dictionary protection: these spellings are never discarded.
+    known_forms = _load_known_forms(WORDS_JSON)
+
+    # Load checkpoint if resuming
+    analyzed: dict[str, dict] = {}
+    if args.resume and CHECKPOINT.exists():
+        with open(CHECKPOINT, encoding="utf-8") as f:
+            analyzed = json.load(f)
+        logger.info("Resumed from checkpoint: %d words already analyzed", len(analyzed))
+
+    discarded_list: list[dict] = []
+    discarded_count = 0
+    kept_count = 0
+    error_count = 0
+
+    for i, (word, rank) in enumerate(words_by_rank):
+        cached = analyzed.get(word)
+        if cached is not None and cached.get("reason") == "yap_error":
+            # Left behind by an earlier failed request: analyze again.
+            cached = None
+
+        if word in known_forms or len(word) <= 1 or word.isascii():
+            # Protected dictionary form, or trivial (single char / ASCII): keep.
+            analyzed[word] = {"discard": False, "reason": ""}
+            kept_count += 1
+        elif cached is not None:
+            # Already analyzed (from checkpoint)
+            if cached["discard"]:
+                discarded_count += 1
+                discarded_list.append({"word": word, "original_rank": rank, "reason": cached["reason"]})
+            else:
+                kept_count += 1
+        else:
+            result = query_yap(word)
+            if result is None:
+                # Keep the word for now, but do not record the failure in
+                # `analyzed`, so a resumed run asks YAP again.
+                analyzed.pop(word, None)
+                error_count += 1
+                kept_count += 1
+                time.sleep(0.5)
+            else:
+                is_combo, reason = is_prefix_combo_yap(result)
+                analyzed[word] = {"discard": is_combo, "reason": reason}
+
+                if is_combo:
+                    discarded_count += 1
+                    discarded_list.append({"word": word, "original_rank": rank, "reason": reason})
+                    if rank <= 500 or discarded_count <= 50:
+                        logger.info("  DISCARD rank %5d: %s (%s)", rank, word, reason)
+                else:
+                    kept_count += 1
+
+                # Rate limit
+                if i % 10 == 0:
+                    time.sleep(0.01)
+
+        # Checkpoint: evaluated for every word, so an interval boundary that
+        # falls on a skipped word still triggers a save.
+        if (i + 1) % BATCH_SAVE_INTERVAL == 0:
+            if not args.dry_run:
+                with open(CHECKPOINT, "w", encoding="utf-8") as f:
+                    json.dump(analyzed, f, ensure_ascii=False)
+            logger.info(
+                "  [%d/%d] kept=%d discarded=%d errors=%d",
+                i + 1,
+                len(words_by_rank),
+                kept_count,
+                discarded_count,
+                error_count,
+            )
+
+    # Run complete: the checkpoint is no longer needed, remove it.
+    if not args.dry_run and CHECKPOINT.exists():
+        CHECKPOINT.unlink()
+
+    if error_count:
+        logger.warning("%d YAP errors encountered", error_count)
+
+    return discarded_list
+
+
+def _run_heuristic_mode(
+    words_by_rank: list[tuple[str, int]],
+    raw_freq: dict[str, int],
+    known_forms: set[str],
+) -> list[dict]:
+    """Detect prefix+word combos with the rule-based splitter (no external services).
+
+    Single-character words, ASCII-only words and known dictionary forms are
+    never discarded. Every other word is discarded when
+    find_prefix_decomposition finds a split for it.
+
+    Returns:
+        One {"word", "original_rank", "reason"} dict per discarded word, in
+        input order.
+    """
+    removed: list[dict] = []
+
+    for word, rank in words_by_rank:
+        is_exempt = len(word) <= 1 or word.isascii() or word in known_forms
+        if is_exempt:
+            continue
+
+        split = find_prefix_decomposition(word, raw_freq)
+        if split is None:
+            continue
+
+        prefix, remainder = split
+        reason = f"{prefix}+{remainder} (rank {raw_freq[remainder]})"
+        removed.append({"word": word, "original_rank": rank, "reason": reason})
+
+        # Log every discard among the top 500 ranks, plus the first 50 overall.
+        if rank <= 500 or len(removed) <= 50:
+            logger.info("  DISCARD rank %5d: %s = %s", rank, word, reason)
+
+    return removed
+
+
+if __name__ == "__main__":
+    main()