feat: pseudo-frequency for confusables using English word frequency
264 confusable groups whose entries all shared the same Hebrew frequency now have differentiated pseudo_frequency values based on English word commonality (hermitdave en_50k.txt). The most common meaning keeps the base rank; each less common meaning gets a +100 offset per position.

Examples:
- אב: "father" (en:194) → 2491, "bud" (en:2963) → 2591
- אח: "brother" (en:300) → 911, "fireplace" (en:9389) → 1011

The builder uses pseudo_frequency for sort order when available. Confusable card definitions are now sorted most-common-first.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
f978e5f39a
commit
6d2d446ed5
4 changed files with 50821 additions and 543 deletions
|
|
@ -969,9 +969,11 @@ def build_vocab_deck(
|
||||||
if word_nikkud not in word_to_pos_cat:
|
if word_nikkud not in word_to_pos_cat:
|
||||||
word_to_pos_cat[word_nikkud] = _categorize_pos(pos_raw) if pos_raw else "Other"
|
word_to_pos_cat[word_nikkud] = _categorize_pos(pos_raw) if pos_raw else "Other"
|
||||||
|
|
||||||
# Sort entries by frequency (null → 999999), applying limit after sort
|
# Sort entries by effective frequency (pseudo_frequency for confusables,
|
||||||
|
# else regular frequency; null → 999999), applying limit after sort
|
||||||
def _freq_key(item: tuple[str, dict]) -> int:
|
def _freq_key(item: tuple[str, dict]) -> int:
|
||||||
return item[1].get("frequency") or 999_999
|
e = item[1]
|
||||||
|
return e.get("pseudo_frequency") or e.get("frequency") or 999_999
|
||||||
|
|
||||||
sorted_entries = sorted(words.items(), key=_freq_key)
|
sorted_entries = sorted(words.items(), key=_freq_key)
|
||||||
if limit:
|
if limit:
|
||||||
|
|
@ -1558,9 +1560,12 @@ def build_confusables_deck(
|
||||||
guid = genanki.guid_for("confusable", entry["word"].get("ktiv_male", unique_key))
|
guid = genanki.guid_for("confusable", entry["word"].get("ktiv_male", unique_key))
|
||||||
guid_to_entries.setdefault(guid, []).append(entry)
|
guid_to_entries.setdefault(guid, []).append(entry)
|
||||||
|
|
||||||
|
def _eff_freq(e: dict) -> int:
|
||||||
|
return e.get("pseudo_frequency") or e.get("frequency") or 999_999
|
||||||
|
|
||||||
for guid, group_entries in sorted(
|
for guid, group_entries in sorted(
|
||||||
guid_to_entries.items(),
|
guid_to_entries.items(),
|
||||||
key=lambda x: sum(e.get("frequency") or 999_999 for e in x[1]) / len(x[1]),
|
key=lambda x: sum(_eff_freq(e) for e in x[1]) / len(x[1]),
|
||||||
):
|
):
|
||||||
if guid in seen_guids:
|
if guid in seen_guids:
|
||||||
continue
|
continue
|
||||||
|
|
@ -1579,6 +1584,10 @@ def build_confusables_deck(
|
||||||
unique_entries.append(e)
|
unique_entries.append(e)
|
||||||
if len(unique_entries) < 2:
|
if len(unique_entries) < 2:
|
||||||
continue
|
continue
|
||||||
|
# Sort by pseudo/frequency so most common meaning appears first
|
||||||
|
unique_entries.sort(key=_eff_freq)
|
||||||
|
if len(unique_entries) < 2:
|
||||||
|
continue
|
||||||
|
|
||||||
word_no_nik = unique_entries[0]["word"].get("ktiv_male", "")
|
word_no_nik = unique_entries[0]["word"].get("ktiv_male", "")
|
||||||
words_display = word_no_nik # Show ktiv male (shared form) on front
|
words_display = word_no_nik # Show ktiv male (shared form) on front
|
||||||
|
|
|
||||||
50000
data/en_50k.txt
Normal file
50000
data/en_50k.txt
Normal file
File diff suppressed because it is too large
Load diff
1080
data/words.json
1080
data/words.json
File diff suppressed because it is too large
Load diff
269
scripts/assign_pseudo_frequency.py
Normal file
269
scripts/assign_pseudo_frequency.py
Normal file
|
|
@ -0,0 +1,269 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Assign pseudo-frequency to confusable groups using English word frequency.
|
||||||
|
|
||||||
|
Problem: Confusable entries share the same ktiv_male and thus the same Hebrew
|
||||||
|
frequency rank. This script uses English frequency to differentiate them so
|
||||||
|
Anki sorts more-common meanings first.
|
||||||
|
|
||||||
|
Algorithm:
|
||||||
|
1. For each confusable group where all entries share the same Hebrew frequency,
|
||||||
|
extract the first meaningful English keyword from each entry's meaning field.
|
||||||
|
2. Look up English frequency rank for each keyword.
|
||||||
|
3. Assign pseudo_frequency: the most frequent English meaning keeps the original
|
||||||
|
Hebrew rank; less frequent meanings get progressively higher (worse) ranks
|
||||||
|
by adding an offset (100 * position in group).
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/assign_pseudo_frequency.py # assign and save
|
||||||
|
python3 scripts/assign_pseudo_frequency.py --dry-run # preview only
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from collections import defaultdict
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Module-level logger; level and format are configured by main().
logger = logging.getLogger(__name__)

# Two levels up from this file (scripts/<this file> -> repository root).
PROJECT_ROOT = Path(__file__).parent.parent
# Vocabulary database that main() reads and rewrites.
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
# English frequency list; the first token of each line is the word.
EN_FREQ_PATH = PROJECT_ROOT / "data" / "en_50k.txt"
|
||||||
|
|
||||||
|
# Words too common/vague to use as frequency signal.
# Kept as whitespace-separated runs so the list stays compact and stable under
# auto-formatting; the resulting frozenset holds exactly these 91 words.
_EN_STOP = frozenset(
    (
        "to be a an the of in on at for and with by or but not as its it is "
        "was are from that this have has had do does did will would can could "
        "may might shall should must no yes very too also just only so up out "
        "into over after before about more than other some any all each every "
        "both few many much most such own same well still even how what when "
        "where which who whom whose why because if then else while until "
        "though whether"
    ).split()
)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_en_freq() -> dict[str, int]:
|
||||||
|
"""Load English frequency data: word -> rank (1 = most common)."""
|
||||||
|
freq: dict[str, int] = {}
|
||||||
|
rank = 1
|
||||||
|
with open(EN_FREQ_PATH, encoding="utf-8") as f:
|
||||||
|
for line in f:
|
||||||
|
parts = line.strip().split()
|
||||||
|
if parts:
|
||||||
|
word = parts[0].lower()
|
||||||
|
if word not in freq:
|
||||||
|
freq[word] = rank
|
||||||
|
rank += 1
|
||||||
|
return freq
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_keywords(meaning: str) -> list[str]:
|
||||||
|
"""Extract meaningful English keywords from a meaning string.
|
||||||
|
|
||||||
|
Returns list of lowercase words, filtered for stop words and short words.
|
||||||
|
"""
|
||||||
|
# Strip parenthesized content, punctuation
|
||||||
|
cleaned = re.sub(r"\([^)]*\)", " ", meaning)
|
||||||
|
cleaned = re.sub(r"[^\w\s]", " ", cleaned)
|
||||||
|
return [w.lower() for w in cleaned.split() if len(w) > 2 and w.lower() not in _EN_STOP]
|
||||||
|
|
||||||
|
|
||||||
|
def _best_en_rank(meaning: str | None, en_freq: dict[str, int]) -> int:
    """Return the rank of the first of the first five keywords found in *en_freq*, else 999_999."""
    # `or ""` guards against a JSON null meaning, which .get("meaning", "") would pass through.
    for kw in _extract_keywords(meaning or "")[:5]:
        rank = en_freq.get(kw)
        if rank is not None:
            return rank
    return 999_999


def assign_pseudo_frequencies(
    words: dict,
    en_freq: dict[str, int],
    dry_run: bool = False,
) -> int:
    """Assign pseudo_frequency to confusable groups. Returns count of changes.

    Entries are grouped by their ``confusables_guid``. A group is processed
    only when all its entries have the same ``frequency`` value (possibly all
    missing) and their English ranks are not all identical.

    Args:
        words: Word database keyed by entry id; each value is a dict that may
            hold ``confusables_guid``, ``frequency`` and ``meaning``. Mutated
            in place (``pseudo_frequency`` is set) unless *dry_run* is true.
        en_freq: English word -> rank mapping (lower rank = more common).
        dry_run: When true, nothing is written; each would-be assignment is
            logged instead.

    Returns:
        Number of entries that received (or, in a dry run, would receive) a
        pseudo_frequency.
    """
    # Group word keys by confusables_guid
    groups: dict[str, list[str]] = defaultdict(list)
    for key, entry in words.items():
        cg = entry.get("confusables_guid")
        if cg:
            groups[cg].append(key)

    changes = 0
    assigned_groups = 0
    skipped_diff = 0
    skipped_no_en = 0

    for keys in groups.values():
        # Skip groups that are already differentiated
        hebrew_freqs = {words[k].get("frequency") for k in keys}
        if len(hebrew_freqs) > 1:
            skipped_diff += 1
            continue

        base_freq = words[keys[0]].get("frequency")  # All same (or all None)

        # (en_rank, key) pairs sorted by English frequency (lower rank = more
        # common); equal ranks fall back to key order via tuple comparison.
        en_ranks = sorted((_best_en_rank(words[k].get("meaning"), en_freq), k) for k in keys)

        # Check if all entries have the same English rank (no signal)
        if len({r for r, _ in en_ranks}) <= 1:
            skipped_no_en += 1
            continue

        assigned_groups += 1

        # Assign pseudo_frequency: most common gets base, others get offset
        for position, (en_rank, key) in enumerate(en_ranks):
            if base_freq is not None:
                pseudo = base_freq + position * 100
            else:
                # No Hebrew rank to anchor on: derive one from the English rank.
                pseudo = 50000 + en_rank

            if not dry_run:
                words[key]["pseudo_frequency"] = pseudo
            changes += 1

            if dry_run:
                meaning = (words[key].get("meaning") or "")[:40]
                logger.info(
                    " [en:%5d] pseudo=%6d %s",
                    en_rank,
                    pseudo,
                    meaning,
                )

    logger.info(
        "Pseudo-frequency: %d groups assigned, %d already differentiated, %d no English signal",
        assigned_groups,
        skipped_diff,
        skipped_no_en,
    )
    return changes
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point: load data, assign pseudo-frequencies, save.

    With ``--dry-run`` the assignments are only logged and ``words.json`` is
    left untouched. Otherwise the updated database is written to a sibling
    temporary file and then moved over ``words.json``, so an interrupted
    write cannot leave a truncated data file behind.
    """
    parser = argparse.ArgumentParser(description="Assign pseudo-frequency to confusables")
    parser.add_argument("--dry-run", action="store_true", help="Preview without saving")
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
    )

    logger.info("Loading English frequency data: %s", EN_FREQ_PATH)
    en_freq = _load_en_freq()
    logger.info("English frequency: %d entries", len(en_freq))

    with open(WORDS_JSON, encoding="utf-8") as f:
        words: dict = json.load(f)

    changes = assign_pseudo_frequencies(words, en_freq, dry_run=args.dry_run)

    if args.dry_run:
        logger.info("Dry run — %d changes would be made", changes)
        return

    tmp_path = WORDS_JSON.with_name(WORDS_JSON.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(words, f, ensure_ascii=False, indent=2)
        # Same-directory rename: the data file is swapped in one step.
        tmp_path.replace(WORDS_JSON)
    finally:
        # No-op after a successful replace; cleans up after a failed dump.
        tmp_path.unlink(missing_ok=True)

    logger.info("Saved %d pseudo-frequency assignments to words.json", changes)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||||
Loading…
Reference in a new issue