feat: YAP-cleaned frequency corpus + two-tier assignment pipeline
- Add clean_frequency_corpus.py: YAP morphological analyzer removes prefix+word combos (e.g. בבית=ב+בית) from he_50k frequency data. Headwords always protected. 30,430 clean entries from 49,999 raw.
- Add assign_frequency.py: two-tier assignment with PoS-aware homograph handling. Tier 1 matches headwords; Tier 2 matches inflections (any rank) and conjugations (rank>5000 only, to avoid false positives). Function words claim frequency over content words in homograph groups, with manual overrides for 12 common dual-use words.
- frequency_lookup.py auto-prefers frequency_clean.json when available.
- 6,691 entries now have frequency (was 5,974), 717 newly assigned.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
b8b65442cb
commit
3b0f9defa9
6 changed files with 1884034 additions and 65460 deletions
1
data/frequency_clean.json
Normal file
1
data/frequency_clean.json
Normal file
File diff suppressed because one or more lines are too long
97847
data/frequency_discarded.json
Normal file
97847
data/frequency_discarded.json
Normal file
File diff suppressed because it is too large
Load diff
1850838
data/words.json
1850838
data/words.json
File diff suppressed because it is too large
Load diff
|
|
@ -19,6 +19,7 @@ logger = logging.getLogger(__name__)
|
|||
|
||||
FREQ_URL = "https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2016/he/he_50k.txt"
|
||||
CACHE_PATH = Path(__file__).parent / "data" / "frequency_cache.json"
|
||||
CLEAN_CACHE_PATH = Path(__file__).parent / "data" / "frequency_clean.json"
|
||||
REQUEST_TIMEOUT = 30
|
||||
|
||||
# Module-level cache: word_no_nikkud -> rank (1 = most common)
|
||||
|
|
@ -26,12 +27,19 @@ _freq: dict[str, int] = {}
|
|||
|
||||
|
||||
def load(cache_path: Path = CACHE_PATH) -> None:
|
||||
"""Load frequency data from cache, downloading if not present."""
|
||||
"""Load frequency data from cache, downloading if not present.
|
||||
|
||||
Prefers frequency_clean.json (YAP-filtered) over raw frequency_cache.json.
|
||||
"""
|
||||
global _freq
|
||||
if cache_path.exists():
|
||||
with open(cache_path, encoding="utf-8") as f:
|
||||
# Prefer YAP-cleaned frequency data if available
|
||||
clean_path = cache_path.parent / "frequency_clean.json" if cache_path == CACHE_PATH else None
|
||||
load_path = clean_path if clean_path and clean_path.exists() else cache_path
|
||||
if load_path.exists():
|
||||
with open(load_path, encoding="utf-8") as f:
|
||||
_freq = json.load(f)
|
||||
logger.info(f"Frequency cache loaded: {len(_freq)} entries")
|
||||
label = "clean" if load_path == clean_path else "raw"
|
||||
logger.info(f"Frequency cache loaded ({label}): {len(_freq)} entries")
|
||||
return
|
||||
|
||||
logger.info("Downloading FrequencyWords he_50k.txt …")
|
||||
|
|
|
|||
392
scripts/assign_frequency.py
Normal file
392
scripts/assign_frequency.py
Normal file
|
|
@ -0,0 +1,392 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Assign frequency ranks from the cleaned corpus to words.json entries.
|
||||
|
||||
Two-tier assignment with PoS priority:
|
||||
Tier 1: Match headword ktiv_male directly against corpus
|
||||
Tier 2: Match conjugated/inflected forms (only if no other entry already
|
||||
claimed that corpus word via tier 1)
|
||||
|
||||
PoS priority (based on standalone-word likelihood in Hebrew text):
|
||||
כינויי_גוף (Pronoun) > מילות_חיבור (Conjunction) > שם_תואר (Adjective) >
|
||||
מילית (Particle) > שם_עצם (Noun) > תוארי_הפועל (Adverb) >
|
||||
מילות_יחס (Preposition) > פעלים (Verb)
|
||||
|
||||
Usage:
|
||||
python3 scripts/assign_frequency.py # assign and save
|
||||
python3 scripts/assign_frequency.py --dry-run # preview only
|
||||
python3 scripts/assign_frequency.py --stats # show statistics only
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)

# Repository root, derived from this script's location (scripts/assign_frequency.py).
PROJECT_ROOT = Path(__file__).parent.parent
# Dictionary entries that receive the "frequency" field; rewritten by main().
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
# Filtered corpus; main() prefers it over RAW_CACHE when the file exists.
CLEAN_CACHE = PROJECT_ROOT / "data" / "frequency_clean.json"
# Unfiltered corpus; main() also reads it for the original (gapped) rank numbers.
RAW_CACHE = PROJECT_ROOT / "data" / "frequency_cache.json"

# Function word PoS — these dominate content words in homograph groups
FUNCTION_POS = frozenset({"כינויי_גוף", "מילות_חיבור", "מילית", "מילות_יחס", "תוארי_הפועל"})

# Content PoS that loses frequency when a function word dominates
# Adjectives also lose (e.g. כן "honest" vs כן "yes") — they're rare collisions
CONTENT_POS = frozenset({"שם_עצם", "שם_תואר", "פעלים"})

# Manual overrides: at these ktiv_male forms, ALL homographs share frequency.
# These are cases where the content word is genuinely common enough to deserve it.
# e.g. עם "people" (NN) alongside עם "with" (PREP) — noted as corpus rank 15.
# The set is matched against the corpus word itself (see _should_get_frequency),
# not against a rank number.
SHARE_ALL_WORDS = frozenset(
    {
        "עם",  # "people" (NN) + "with" (PREP)
        "שם",  # "name" (NN) + "there" (ADV)
        "אל",  # "god" (NN) + "to" (PREP) + "don't" (PART)
        "עד",  # "witness"/"eternity" (NN) + "until" (PREP)
        "פה",  # "mouth" (NN) + "here" (ADV)
        "לאחר",  # "to be late" (VB) + "after" (PREP)
        "יופי",  # "beauty" (NN) + "great!" (ADV)
        "המון",  # "crowd" (NN) + "lots of" (ADV)
        "חבל",  # "rope" (NN) + "it's a pity" (ADV)
        "ראשית",  # "beginning" (NN) + "firstly" (ADV)
        "עקב",  # "heel"/"footprint" (NN) + "due to" (CONJ)
        "אולם",  # "hall" (NN) + "however" (ADV)
    }
)
|
||||
|
||||
|
||||
def _get_pos_tag(entry: dict) -> str:
|
||||
"""Extract primary PoS tag from entry's tags field."""
|
||||
tags = (entry.get("tags") or "").split()
|
||||
for t in tags:
|
||||
if not t.startswith("שורש"):
|
||||
return t
|
||||
return "unknown"
|
||||
|
||||
|
||||
def _build_form_index(words: dict) -> dict[str, list[tuple[str, str]]]:
|
||||
"""Build reverse index: ktiv_male_form -> [(unique_key, match_type), ...]"""
|
||||
index: dict[str, list[tuple[str, str]]] = defaultdict(list)
|
||||
|
||||
for key, entry in words.items():
|
||||
w = entry.get("word") or {}
|
||||
if km := w.get("ktiv_male"):
|
||||
index[km].append((key, "headword"))
|
||||
|
||||
# Verb conjugations: indexed for new-assignment-only matching (no upgrades).
|
||||
# Conjugated forms collide with unrelated headwords, so tier 2 only uses
|
||||
# these for entries that have NO existing frequency.
|
||||
conj = entry.get("conjugation") or {}
|
||||
for form in conj.get("active_forms") or []:
|
||||
if isinstance(form, dict):
|
||||
form_data = form.get("form") or {}
|
||||
if km2 := form_data.get("ktiv_male"):
|
||||
km2 = km2.rstrip("!\u200f ")
|
||||
index[km2].append((key, "conjugation"))
|
||||
|
||||
for hp in conj.get("hufal_pual_forms") or []:
|
||||
if isinstance(hp, dict):
|
||||
hp_data = hp.get("form") or {}
|
||||
if km3 := hp_data.get("ktiv_male"):
|
||||
km3 = km3.rstrip("!\u200f ")
|
||||
index[km3].append((key, "conjugation"))
|
||||
|
||||
for field in ("noun_inflection", "preposition_inflection", "adjective_inflection"):
|
||||
for inf_data in (entry.get(field) or {}).values():
|
||||
if isinstance(inf_data, dict) and (km4 := inf_data.get("ktiv_male")):
|
||||
index[km4].append((key, "inflection"))
|
||||
|
||||
return dict(index)
|
||||
|
||||
|
||||
def _should_get_frequency(
    entry: dict,
    all_headword_entries: list[tuple[str, str]],
    corpus_word: str,
    words: dict,
) -> bool:
    """Tell whether ``entry`` may take the frequency of ``corpus_word``.

    ``all_headword_entries`` is the full homograph group: every
    ``(unique_key, match_type)`` pair whose headword equals ``corpus_word``.

    Decision order:

    1. A group of zero or one entries always qualifies.
    2. Words listed in ``SHARE_ALL_WORDS`` let every homograph qualify.
    3. If any member of the group has a PoS in ``FUNCTION_POS``, entries whose
       PoS is in ``CONTENT_POS`` are rejected.
    4. Everything else qualifies.
    """
    is_homograph_group = len(all_headword_entries) > 1
    if not is_homograph_group or corpus_word in SHARE_ALL_WORDS:
        return True

    own_pos = _get_pos_tag(entry)
    group_has_function_word = any(
        _get_pos_tag(words[member_key]) in FUNCTION_POS for member_key, _match_type in all_headword_entries
    )

    if group_has_function_word and own_pos in CONTENT_POS:
        return False
    return True
|
||||
|
||||
|
||||
def assign_frequencies(
    words: dict,
    freq_corpus: dict[str, int],
    raw_corpus: dict[str, int] | None = None,
    upgrade: bool = False,
) -> dict[str, dict]:
    """Compute frequency ranks for words.json entries without mutating them.

    Args:
        words: mapping ``unique_key -> entry``.
        freq_corpus: the corpus that decides which words are eligible and in
            which order they are processed (ascending by its rank values).
        raw_corpus: optional source of the rank number that is actually
            stored. When given, a word's rank is looked up here first and
            falls back to its ``freq_corpus`` rank. When ``None``, the
            ``freq_corpus`` ranks are used.
        upgrade: when True, a tier-2 *inflection* match may replace an
            existing assignment if its rank is strictly lower. Conjugation
            matches never upgrade.

    Returns:
        ``unique_key -> {"rank", "source", "corpus_word"}`` where ``source`` is
        ``"headword"``, ``"inflection"``, ``"conjugation"`` or
        ``"upgrade:inflection"``.
    """
    ranks = freq_corpus if raw_corpus is None else raw_corpus
    index = _build_form_index(words)
    ordered_corpus = sorted(freq_corpus.items(), key=lambda item: item[1])

    assignments: dict[str, dict] = {}  # unique_key -> {rank, source, corpus_word}
    claimed_by_headword: set[str] = set()

    # --- Tier 1: headword matches ---
    # Every eligible entry of a homograph group receives the rank; eligibility
    # is decided by _should_get_frequency.
    for token, clean_rank in ordered_corpus:
        headwords = [pair for pair in index.get(token, []) if pair[1] == "headword"]
        if not headwords:
            continue

        rank = ranks.get(token, clean_rank)
        claimed_here = False
        for key, _match_type in headwords:
            if key in assignments:
                continue
            if not _should_get_frequency(words[key], headwords, token, words):
                continue
            assignments[key] = {"rank": rank, "source": "headword", "corpus_word": token}
            claimed_here = True

        if claimed_here:
            claimed_by_headword.add(token)

    tier1_count = len(assignments)
    logger.info("Tier 1 (headword): %d entries assigned", tier1_count)

    # --- Tier 2: conjugation/inflection matches ---
    # Corpus words claimed in tier 1 are off limits. A word that matches any
    # inflection is owned by the inflection side; conjugations are considered
    # only when no inflection matches at all.
    for token, clean_rank in ordered_corpus:
        if token in claimed_by_headword:
            continue

        candidates = index.get(token, [])
        inflections = [pair for pair in candidates if pair[1] == "inflection"]
        conjugations = [pair for pair in candidates if pair[1] == "conjugation"]
        if not inflections and not conjugations:
            continue

        rank = ranks.get(token, clean_rank)
        pool = inflections or conjugations

        for key, kind in pool:
            current = assignments.get(key)

            if current is not None:
                # Already ranked: only an inflection with a better rank may
                # replace it, and only when upgrades are enabled.
                if upgrade and kind == "inflection" and rank < current["rank"]:
                    assignments[key] = {"rank": rank, "source": f"upgrade:{kind}", "corpus_word": token}
                    break
                continue

            # New assignment — conjugations are accepted only for rank > 5000
            # (too many false positives in the more important tiers).
            if kind == "conjugation" and rank <= 5000:
                continue
            assignments[key] = {"rank": rank, "source": kind, "corpus_word": token}
            break  # one corpus word feeds at most one entry in tier 2

    tier2_count = len(assignments) - tier1_count
    logger.info("Tier 2 (conjugation/inflection): %d entries assigned", tier2_count)

    return assignments
|
||||
|
||||
|
||||
def print_stats(words: dict, assignments: dict, freq_corpus: dict) -> None:
    """Print a human-readable report comparing ``assignments`` with words.json.

    Args:
        words: mapping ``unique_key -> entry``; each entry's current
            ``"frequency"`` value is treated as the "before" state.
        assignments: result of ``assign_frequencies`` (the "after" state).
        freq_corpus: the corpus used for assignment; only its size is reported.

    The report covers totals, a per-tier breakdown, a per-PoS breakdown, a
    per-rank-band breakdown, the 20 best-ranked newly assigned entries and
    the first 20 entries that would lose their frequency.
    """
    # Local import: the module-level import only brings in defaultdict.
    from collections import Counter

    def _headword(entry: dict) -> str:
        # Tolerate "word": null and "ktiv_male": null so formatting never fails.
        return (entry.get("word") or {}).get("ktiv_male") or ""

    total = len(words)
    assigned = len(assignments)
    previously_had = sum(1 for e in words.values() if e.get("frequency") is not None)

    # Entries that had no frequency before and receive one now.
    newly = sorted(
        (a["rank"], k, _headword(words[k]), a["source"], a["corpus_word"])
        for k, a in assignments.items()
        if words[k].get("frequency") is None
    )
    # Entries that had a frequency before and are not assigned now.
    lost = sorted(
        (v["frequency"], k, _headword(v))
        for k, v in words.items()
        if v.get("frequency") is not None and k not in assignments
    )

    print(f"\n{'=' * 60}")
    print("Frequency Assignment Statistics")
    print(f"{'=' * 60}")
    print(f"Words.json entries: {total}")
    print(f"Clean corpus size: {len(freq_corpus)}")
    print(f"Previously had freq: {previously_had}")
    print(f"Now assigned: {assigned}")
    # "Newly gained" counts real gains; the net figure is reported separately
    # because gains and losses can cancel each other out.
    print(f"Newly gained: {len(newly)}")
    print(f"Lost: {len(lost)}")
    print(f"Net change: {assigned - previously_had:+d}")
    print(f"Still unlisted: {total - assigned}")

    # By tier
    sources = Counter(a["source"] for a in assignments.values())
    upgraded = sum(n for source, n in sources.items() if source.startswith("upgrade:"))
    print("\nBy assignment tier:")
    print(f" Tier 1 (headword): {sources['headword']}")
    print(f" Tier 2 (conjugation): {sources['conjugation']}")
    print(f" Tier 2 (inflection): {sources['inflection']}")
    if upgraded:
        # Only present when assign_frequencies ran with upgrade=True.
        print(f" Tier 2 (upgrades): {upgraded}")

    # By PoS
    print("\nBy PoS:")
    pos_assigned: Counter = Counter()
    pos_total: Counter = Counter()
    for k, v in words.items():
        pos = _get_pos_tag(v)
        pos_total[pos] += 1
        if k in assignments:
            pos_assigned[pos] += 1
    pos_order = [
        "כינויי_גוף",
        "מילות_חיבור",
        "שם_תואר",
        "מילית",
        "שם_עצם",
        "תוארי_הפועל",
        "מילות_יחס",
        "פעלים",
        "unknown",
    ]
    pos_position = {pos: i for i, pos in enumerate(pos_order)}
    # Tags outside pos_order sort last, keeping their first-seen order.
    for pos in sorted(pos_total, key=lambda p: pos_position.get(p, 99)):
        a = pos_assigned[pos]
        t = pos_total[pos]
        pct = a / t * 100 if t else 0
        print(f" {pos:20s}: {a:5d}/{t:5d} ({pct:.0f}%)")

    # By frequency tier (using apkg_builder tiers)
    print("\nBy frequency tier:")
    tiers = {
        "Core (1-500)": (1, 500),
        "Essential (501-1500)": (501, 1500),
        "Intermediate (1501-3000)": (1501, 3000),
        "Upper-intermediate (3001-5000)": (3001, 5000),
        "Advanced (5001-10000)": (5001, 10000),
        "Rare (10001+)": (10001, 999999),
    }
    for label, (lo, hi) in tiers.items():
        count = sum(1 for a in assignments.values() if lo <= a["rank"] <= hi)
        print(f" {label:35s}: {count}")

    # Top 20 newly assigned (entries that didn't have frequency before)
    if newly:
        print("\nTop 20 newly assigned entries:")
        for rank, _key, ktiv, source, corpus_word in newly[:20]:
            print(f" rank {rank:5d}: {ktiv:15s} via {source:12s} (corpus: {corpus_word})")

    # Entries that LOST frequency (had it before, not assigned now)
    if lost:
        print(f"\nEntries that would LOSE frequency ({len(lost)} total):")
        for rank, _key, ktiv in lost[:20]:
            print(f" was rank {rank:5d}: {ktiv}")
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: compute frequency ranks and write them into words.json.

    Flow:
      1. Load the frequency corpus (``CLEAN_CACHE`` when present, otherwise
         ``RAW_CACHE``) and, when the clean corpus is used, the raw corpus for
         the original rank numbers.
      2. Run ``assign_frequencies`` and print the statistics report.
      3. Unless ``--stats`` or ``--dry-run`` was given, set each entry's
         ``"frequency"`` to its assigned rank (or ``None`` when unassigned)
         and save words.json.

    Exits with status 1 when a required input file is missing.
    """
    parser = argparse.ArgumentParser(description="Assign frequency to words.json")
    parser.add_argument("--dry-run", action="store_true", help="Preview without saving")
    parser.add_argument("--stats", action="store_true", help="Show statistics only")
    parser.add_argument(
        "--upgrade", action="store_true", help="Allow tier 2 to upgrade headword rank from conjugated forms"
    )
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    # Load data
    freq_path = CLEAN_CACHE if CLEAN_CACHE.exists() else RAW_CACHE
    # Fail with a clear message instead of a FileNotFoundError traceback.
    for required in (freq_path, WORDS_JSON):
        if not required.exists():
            logger.error("Required input file not found: %s", required)
            raise SystemExit(1)

    logger.info("Loading frequency corpus: %s", freq_path)
    with open(freq_path, encoding="utf-8") as f:
        freq_corpus: dict[str, int] = json.load(f)

    # Load raw corpus for original rank numbers (with gaps)
    raw_corpus: dict[str, int] | None = None
    if RAW_CACHE.exists() and freq_path != RAW_CACHE:
        with open(RAW_CACHE, encoding="utf-8") as f:
            raw_corpus = json.load(f)
        logger.info("Using original ranks from %s", RAW_CACHE)

    with open(WORDS_JSON, encoding="utf-8") as f:
        words: dict = json.load(f)

    logger.info("Corpus: %d entries, Words.json: %d entries", len(freq_corpus), len(words))

    # Run assignment
    assignments = assign_frequencies(words, freq_corpus, raw_corpus, upgrade=args.upgrade)

    # Stats (printed before any change so "previous" values are still intact)
    print_stats(words, assignments, freq_corpus)

    if args.stats or args.dry_run:
        if args.dry_run:
            logger.info("Dry run — no changes saved")
        return

    # Apply to words.json: assigned entries get their rank, all others are
    # reset to None so stale frequencies do not survive.
    changed = 0
    for key, entry in words.items():
        assignment = assignments.get(key)
        new_rank = assignment["rank"] if assignment is not None else None
        if entry.get("frequency") != new_rank:
            entry["frequency"] = new_rank
            changed += 1

    # Write to a sibling temp file and swap it in, so an interrupted dump
    # cannot leave a truncated words.json behind.
    tmp_path = WORDS_JSON.with_name(WORDS_JSON.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(words, f, ensure_ascii=False, indent=2)
    tmp_path.replace(WORDS_JSON)

    logger.info("Updated %d entries in words.json", changed)
|
||||
|
||||
|
||||
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
400
scripts/clean_frequency_corpus.py
Normal file
400
scripts/clean_frequency_corpus.py
Normal file
|
|
@ -0,0 +1,400 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Clean the Hebrew frequency corpus by removing prefix+word combinations.
|
||||
|
||||
Two modes:
|
||||
--mode yap (default) Use YAP morphological analyzer for accurate prefix detection.
|
||||
Requires YAP API running at localhost:8000.
|
||||
--mode heuristic Use rule-based prefix stripping (no external dependencies).
|
||||
|
||||
Both modes preserve words that exist as known dictionary forms in words.json.
|
||||
|
||||
Usage:
|
||||
python3 scripts/clean_frequency_corpus.py # YAP mode
|
||||
python3 scripts/clean_frequency_corpus.py --mode heuristic # heuristic fallback
|
||||
python3 scripts/clean_frequency_corpus.py --dry-run # preview only
|
||||
python3 scripts/clean_frequency_corpus.py --resume # resume YAP from checkpoint
|
||||
python3 scripts/clean_frequency_corpus.py --limit 1000 # process first N entries
|
||||
|
||||
Input: data/frequency_cache.json (raw he_50k.txt, 49999 entries)
|
||||
Output: data/frequency_clean.json (filtered, prefix combos removed)
|
||||
data/frequency_discarded.json (discarded entries with reason)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
logger = logging.getLogger(__name__)

# Repository root, derived from this script's location (scripts/clean_frequency_corpus.py).
PROJECT_ROOT = Path(__file__).parent.parent
# Input: raw word -> rank mapping.
RAW_CACHE = PROJECT_ROOT / "data" / "frequency_cache.json"
# Output: surviving words, re-ranked without gaps.
CLEAN_CACHE = PROJECT_ROOT / "data" / "frequency_clean.json"
# Output: list of discarded words with their original rank and the reason.
DISCARDED = PROJECT_ROOT / "data" / "frequency_discarded.json"
# Dictionary used to collect known forms (see _load_known_forms).
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
# Progress file for YAP mode; read back when --resume is given.
CHECKPOINT = PROJECT_ROOT / "data" / "_yap_checkpoint.json"

# YAP endpoint; can be overridden through the YAP_URL environment variable.
YAP_URL = os.environ.get("YAP_URL", "http://localhost:8000/yap/heb/joint")
# Passed as the `timeout` argument of requests.post for each YAP call.
YAP_TIMEOUT = 10
# YAP mode writes the checkpoint whenever the processed index hits a multiple of this.
BATCH_SAVE_INTERVAL = 500

# --- YAP mode constants ---
# POS tags that indicate a prefix
PREFIX_POS = frozenset({"PREPOSITION", "CONJ", "DEF", "REL"})
# POS tags for the host word: a prefix arc directly followed by an arc carrying
# one of these tags marks the corpus word as a prefix+word combo (discarded).
HOST_POS = frozenset({"NN", "NNP", "NNT", "PRP", "CD", "DT", "EX"})
|
||||
|
||||
# --- Heuristic mode constants ---
# Hebrew prefix combinations, longest first for greedy matching.
# find_prefix_decomposition walks this list in order and returns on the first
# prefix that yields a better-ranked remainder, so the order is significant.
PREFIXES = [
    # 4-char
    "וכשמ",
    "וכשב",
    "וכשל",
    "וכשה",
    # 3-char
    "וכש",
    "ומה",
    "ובה",
    "וכה",
    "ולה",
    "ומש",
    "ובש",
    "וכב",
    "ולב",
    "ומב",
    "וכל",
    "ולכ",
    "שבה",
    "שמה",
    # 2-char
    "כש",
    "מה",
    "בה",
    "כה",
    "לה",
    "מש",
    "בש",
    "וב",
    "וה",
    "וכ",
    "ול",
    "ומ",
    "וש",
    "כב",
    "לב",
    "מב",
    "כל",
    "לכ",
    "שב",
    "שה",
    "שכ",
    "של",
    "שמ",
    # 1-char
    "ב",
    "ה",
    "ו",
    "כ",
    "ל",
    "מ",
    "ש",
]
# Minimum length of the part left after stripping a prefix; words that are not
# longer than this are never decomposed.
MIN_REMAINDER_LEN = 2
|
||||
|
||||
|
||||
def _load_known_forms(words_path: Path) -> set[str]:
    """Collect every known ktiv_male spelling from words.json.

    Gathers headwords, verb conjugation forms and noun/preposition/adjective
    inflection forms.

    Conjugation forms are accepted in two layouts:

    * flat   — ``entry["active_forms"][i]["ktiv_male"]``
    * nested — ``entry["conjugation"]["active_forms"][i]["form"]["ktiv_male"]``

    The nested layout is the one scripts/assign_frequency.py indexes; reading
    only the flat layout would silently miss every conjugation stored that way.
    NOTE(review): confirm against the real words.json which layout is in use.

    Conjugation spellings are added both verbatim and with a trailing ``!``,
    right-to-left mark, or space removed, so the result matches the
    normalisation applied when conjugations are indexed for assignment.

    Args:
        words_path: path to words.json.

    Returns:
        The set of spellings, or an empty set (with a warning) when the file
        does not exist.
    """
    if not words_path.exists():
        logger.warning("words.json not found at %s — no dictionary filter", words_path)
        return set()

    with open(words_path, encoding="utf-8") as f:
        words = json.load(f)

    known: set[str] = set()

    def _add_conjugation(form: object) -> None:
        # Accept {"ktiv_male": ...} as well as {"form": {"ktiv_male": ...}}.
        if not isinstance(form, dict):
            return
        nested = form.get("form")
        spellings = [form.get("ktiv_male")]
        if isinstance(nested, dict):
            spellings.append(nested.get("ktiv_male"))
        for spelling in spellings:
            if isinstance(spelling, str) and spelling:
                known.add(spelling)
                stripped = spelling.rstrip("!\u200f ")
                if stripped:
                    known.add(stripped)

    for entry in words.values():
        w = entry.get("word") or {}
        if km := w.get("ktiv_male"):
            known.add(km)

        conjugation = entry.get("conjugation")
        containers = [entry]
        if isinstance(conjugation, dict):
            containers.append(conjugation)
        for container in containers:
            for group in ("active_forms", "hufal_pual_forms"):
                for form in container.get(group) or []:
                    _add_conjugation(form)

        for field in ("noun_inflection", "preposition_inflection", "adjective_inflection"):
            for inf_data in (entry.get(field) or {}).values():
                if isinstance(inf_data, dict) and (km4 := inf_data.get("ktiv_male")):
                    known.add(km4)

    logger.info("Loaded %d known dictionary forms from words.json", len(known))
    return known
|
||||
|
||||
|
||||
# ── YAP mode ──────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def query_yap(word: str) -> dict | None:
    """Send a single word to the YAP endpoint and return its decoded JSON object.

    Args:
        word: the corpus word to analyse.

    Returns:
        The decoded JSON object, or ``None`` when the request fails, the
        server answers with an error status, the body is not valid JSON, or
        the decoded payload is not a JSON object. Failures are logged as
        warnings; nothing is raised for them.
    """
    # The trailing space is part of the request text as originally written —
    # presumably YAP needs a token terminator; confirm before removing.
    payload = {"text": f"{word} "}
    try:
        resp = requests.post(YAP_URL, json=payload, timeout=YAP_TIMEOUT)
        resp.raise_for_status()
        data = resp.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decode errors on requests versions whose
        # decode error does not derive from RequestException.
        logger.warning("YAP request failed for '%s': %s", word, e)
        return None

    if not isinstance(data, dict):
        # Callers use dict access on the result; treat anything else as a failure.
        logger.warning("YAP request failed for '%s': unexpected %s payload", word, type(data).__name__)
        return None
    return data
|
||||
|
||||
|
||||
def is_prefix_combo_yap(yap_response: dict) -> tuple[bool, str]:
    """Report whether YAP's lattice contains a prefix arc followed by a host arc.

    The ``ma_lattice`` value is read as newline-separated rows of tab-separated
    columns; the first six columns are taken as from-node, to-node, form,
    lemma, coarse PoS and PoS. Rows with fewer than six columns are ignored.

    Conservative by design: a single prefix→host pair anywhere in the lattice
    is enough to flag the word.

    Returns:
        ``(True, "<prefix form>(<cpos>)+<host form>(<cpos>)")`` for the first
        matching pair in lattice order, otherwise ``(False, "")``.
    """
    lattice = yap_response.get("ma_lattice", "")
    if not lattice:
        return False, ""

    columns = ("from", "to", "form", "lemma", "cpos", "pos")
    rows = (row.split("\t") for row in lattice.strip().split("\n") if row.strip())
    arcs = [dict(zip(columns, cells)) for cells in rows if len(cells) >= len(columns)]

    # A prefix+host reading needs at least two arcs.
    if len(arcs) < 2:
        return False, ""

    prefix_arcs = [arc for arc in arcs if arc["cpos"] in PREFIX_POS or arc["pos"] in PREFIX_POS]
    for prefix_arc in prefix_arcs:
        for host_arc in arcs:
            starts_where_prefix_ends = host_arc["from"] == prefix_arc["to"]
            is_host = host_arc["cpos"] in HOST_POS or host_arc["pos"] in HOST_POS
            if starts_where_prefix_ends and is_host:
                return True, f"{prefix_arc['form']}({prefix_arc['cpos']})+{host_arc['form']}({host_arc['cpos']})"

    return False, ""
|
||||
|
||||
|
||||
# ── Heuristic mode ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def find_prefix_decomposition(word: str, freq: dict[str, int]) -> tuple[str, str] | None:
    """Heuristically split ``word`` into a known prefix plus a better-ranked word.

    Prefixes are tried in ``PREFIXES`` order. The first one qualifies when the
    remainder is at least ``MIN_REMAINDER_LEN`` characters long, exists in
    ``freq`` and has a strictly lower (better) rank than ``word`` itself. A
    word missing from ``freq`` is treated as rank 999999.

    Returns:
        ``(prefix, remainder)`` for the first qualifying split, or ``None``.
    """
    if len(word) <= MIN_REMAINDER_LEN:
        return None

    own_rank = freq.get(word, 999999)

    for candidate in PREFIXES:
        if word.startswith(candidate):
            rest = word[len(candidate) :]
            if len(rest) >= MIN_REMAINDER_LEN:
                if rest in freq and freq[rest] < own_rank:
                    return candidate, rest

    return None
|
||||
|
||||
|
||||
# ── Main ──────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: filter the raw frequency corpus and write the results.

    Reads ``RAW_CACHE``, orders the words by rank (optionally truncated by
    ``--limit``), runs the selected detection mode, then — unless ``--dry-run``
    is set — writes the re-ranked survivors to ``CLEAN_CACHE`` and the
    discarded entries to ``DISCARDED``. Exits with status 1 when the raw cache
    is missing.
    """
    cli = argparse.ArgumentParser(description="Clean frequency corpus")
    cli.add_argument("--mode", choices=["yap", "heuristic"], default="yap", help="Detection mode")
    cli.add_argument("--dry-run", action="store_true", help="Show removals without saving")
    cli.add_argument("--resume", action="store_true", help="Resume YAP mode from checkpoint")
    cli.add_argument("--limit", type=int, default=0, help="Process only first N words (0=all)")
    args = cli.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    if not RAW_CACHE.exists():
        logger.error("Raw frequency cache not found: %s", RAW_CACHE)
        sys.exit(1)

    with open(RAW_CACHE, encoding="utf-8") as f:
        raw_freq: dict[str, int] = json.load(f)

    logger.info("Raw frequency corpus: %d entries", len(raw_freq))

    # Best rank first; --limit keeps only the head of that ordering.
    ranked = sorted(raw_freq.items(), key=lambda pair: pair[1])
    if args.limit:
        ranked = ranked[: args.limit]

    if args.mode == "heuristic":
        discarded_list = _run_heuristic_mode(ranked, raw_freq, _load_known_forms(WORDS_JSON))
    else:
        discarded_list = _run_yap_mode(ranked, args)

    logger.info("Done. Kept: %d, Discarded: %d", len(ranked) - len(discarded_list), len(discarded_list))

    if args.dry_run:
        logger.info("Dry run — no files written")
        return

    # Survivors are renumbered 1..N in their original order, closing the gaps
    # left by the discarded words.
    dropped = {item["word"] for item in discarded_list}
    survivors = (word for word, _rank in ranked if word not in dropped)
    clean_freq: dict[str, int] = {word: position for position, word in enumerate(survivors, start=1)}

    with open(CLEAN_CACHE, "w", encoding="utf-8") as f:
        json.dump(clean_freq, f, ensure_ascii=False)
    logger.info("Clean frequency saved: %d entries → %s", len(clean_freq), CLEAN_CACHE)

    with open(DISCARDED, "w", encoding="utf-8") as f:
        json.dump(discarded_list, f, ensure_ascii=False, indent=2)
    logger.info("Discarded entries saved: %d → %s", len(discarded_list), DISCARDED)
|
||||
|
||||
|
||||
def _run_yap_mode(
    words_by_rank: list[tuple[str, int]],
    args: argparse.Namespace,
    known_forms: set[str] | None = None,
) -> list[dict]:
    """Run YAP-based prefix detection over the ranked corpus words.

    Args:
        words_by_rank: ``(word, original_rank)`` pairs, best rank first.
        args: parsed CLI options; ``resume`` and ``dry_run`` are read.
        known_forms: spellings that must never be discarded. When ``None``
            they are loaded from ``WORDS_JSON``, which gives YAP mode the same
            dictionary protection the heuristic mode applies.

    Returns:
        One ``{"word", "original_rank", "reason"}`` dict per discarded word.

    Behaviour notes:
        * Exits with status 1 when the initial YAP connectivity probe fails.
        * Single-character words, ASCII words and known dictionary forms are
          kept without querying YAP.
        * A failed YAP call keeps the word but is NOT recorded in the
          checkpoint, so a later ``--resume`` run retries it.
        * The checkpoint is written atomically and removed after a completed
          (non-dry) run.
    """
    # Check YAP connectivity
    if query_yap("בדיקה") is None:
        logger.error("Cannot connect to YAP API at %s", YAP_URL)
        sys.exit(1)
    logger.info("YAP API connected")

    if known_forms is None:
        known_forms = _load_known_forms(WORDS_JSON)

    # Load checkpoint if resuming
    analyzed: dict[str, dict] = {}
    if args.resume and CHECKPOINT.exists():
        with open(CHECKPOINT, encoding="utf-8") as f:
            analyzed = json.load(f)
        logger.info("Resumed from checkpoint: %d words already analyzed", len(analyzed))

    checkpoint_tmp = CHECKPOINT.with_name(CHECKPOINT.name + ".tmp")

    discarded_list: list[dict] = []
    discarded_count = 0
    kept_count = 0
    error_count = 0

    for i, (word, rank) in enumerate(words_by_rank):
        # Trivial (single char / ASCII) or protected dictionary form: keep.
        # Checked before the checkpoint so a stale "discard" verdict cannot
        # override the dictionary protection.
        if len(word) <= 1 or word.isascii() or word in known_forms:
            kept_count += 1
            continue

        # Already analyzed (from checkpoint). Entries recorded as "yap_error"
        # by earlier runs are treated as not analyzed so they get retried.
        cached = analyzed.get(word)
        if cached is not None and cached.get("reason") != "yap_error":
            if cached["discard"]:
                discarded_count += 1
                discarded_list.append({"word": word, "original_rank": rank, "reason": cached["reason"]})
            else:
                kept_count += 1
            continue

        result = query_yap(word)
        if result is None:
            # Keep the word for this run, but leave it out of `analyzed` so the
            # failure is not frozen into the checkpoint.
            analyzed.pop(word, None)
            error_count += 1
            kept_count += 1
            time.sleep(0.5)  # back off briefly after a failure
            continue

        is_combo, reason = is_prefix_combo_yap(result)
        analyzed[word] = {"discard": is_combo, "reason": reason}

        if is_combo:
            discarded_count += 1
            discarded_list.append({"word": word, "original_rank": rank, "reason": reason})
            # Log only high-ranked words and the first 50 discards to limit noise.
            if rank <= 500 or discarded_count <= 50:
                logger.info(" DISCARD rank %5d: %s (%s)", rank, word, reason)
        else:
            kept_count += 1

        # Rate limit
        if i % 10 == 0:
            time.sleep(0.01)

        # Checkpoint
        if (i + 1) % BATCH_SAVE_INTERVAL == 0:
            if not args.dry_run:
                # Write to a temp file and swap it in: an interruption during
                # the dump must not destroy the checkpoint needed for --resume.
                with open(checkpoint_tmp, "w", encoding="utf-8") as f:
                    json.dump(analyzed, f, ensure_ascii=False)
                checkpoint_tmp.replace(CHECKPOINT)
            logger.info(
                " [%d/%d] kept=%d discarded=%d errors=%d",
                i + 1,
                len(words_by_rank),
                kept_count,
                discarded_count,
                error_count,
            )

    # Run completed: the checkpoint is no longer needed.
    if not args.dry_run and CHECKPOINT.exists():
        CHECKPOINT.unlink()

    if error_count:
        logger.warning("%d YAP errors encountered", error_count)

    return discarded_list
|
||||
|
||||
|
||||
def _run_heuristic_mode(
    words_by_rank: list[tuple[str, int]],
    raw_freq: dict[str, int],
    known_forms: set[str],
) -> list[dict]:
    """Run the rule-based prefix detection (no external service needed).

    A word is discarded when ``find_prefix_decomposition`` splits it into a
    prefix plus a better-ranked corpus word. Single-character words, ASCII
    words and spellings in ``known_forms`` are never examined.

    Returns:
        One ``{"word", "original_rank", "reason"}`` dict per discarded word,
        in input order.
    """
    discarded: list[dict] = []

    for word, rank in words_by_rank:
        untouchable = len(word) <= 1 or word.isascii() or word in known_forms
        if untouchable:
            continue

        split = find_prefix_decomposition(word, raw_freq)
        if split is None:
            continue

        prefix, remainder = split
        reason = f"{prefix}+{remainder} (rank {raw_freq[remainder]})"
        discarded.append({"word": word, "original_rank": rank, "reason": reason})

        # Log only high-ranked words and the first 50 discards to limit noise.
        if rank <= 500 or len(discarded) <= 50:
            logger.info(" DISCARD rank %5d: %s = %s", rank, word, reason)

    return discarded
|
||||
|
||||
|
||||
# Script entry point: run the CLI only when executed directly, not on import.
if __name__ == "__main__":
    main()
|
||||
Loading…
Reference in a new issue