Compare commits
4 commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 6d2d446ed5 | |||
| f978e5f39a | |||
| 5f617af4ba | |||
| f3496998f5 |
6 changed files with 51132 additions and 717 deletions
220
apkg_builder.py
220
apkg_builder.py
|
|
@ -265,6 +265,14 @@ details[open] > .more-header::before { content: "● "; }
|
||||||
text-align: center;
|
text-align: center;
|
||||||
margin: 0.3em 0;
|
margin: 0.3em 0;
|
||||||
}
|
}
|
||||||
|
.plural-direction {
|
||||||
|
font-size: 32px;
|
||||||
|
color: #444;
|
||||||
|
text-align: center;
|
||||||
|
direction: rtl;
|
||||||
|
margin: 8px 0;
|
||||||
|
font-weight: bold;
|
||||||
|
}
|
||||||
.card [type="button"], .card button, .replay-button {
|
.card [type="button"], .card button, .replay-button {
|
||||||
display: block !important;
|
display: block !important;
|
||||||
margin: 4px auto !important;
|
margin: 4px auto !important;
|
||||||
|
|
@ -288,7 +296,26 @@ details[open] > .more-header::before { content: "● "; }
|
||||||
.related-header { color: #999; }
|
.related-header { color: #999; }
|
||||||
.rw-word { color: #e0e0e0; }
|
.rw-word { color: #e0e0e0; }
|
||||||
.rw-meaning { color: #999; }
|
.rw-meaning { color: #999; }
|
||||||
|
.plural-direction { color: #aaa; }
|
||||||
}
|
}
|
||||||
|
.nightMode .card { color: #e8e8e8; background: #1c1c1e; }
|
||||||
|
.nightMode .hebrew { color: #f0f0f0; }
|
||||||
|
.nightMode .hebrew-sm { color: #e0e0e0; }
|
||||||
|
.nightMode .meaning { color: #82b0ff; }
|
||||||
|
.nightMode .sec-label { color: #e0e0e0; }
|
||||||
|
.nightMode .sec-key { color: #e0e0e0; }
|
||||||
|
.nightMode .sec-val { color: #e0e0e0; }
|
||||||
|
.nightMode .conf-entry { color: #ddd; }
|
||||||
|
.nightMode .hint { color: #777; }
|
||||||
|
.nightMode .voice-label { color: #888; }
|
||||||
|
.nightMode .example { color: #e0e0e0; border-right-color: #555; }
|
||||||
|
.nightMode .divider { border-top-color: #333; }
|
||||||
|
.nightMode .freq-badge { color: #888; border-color: #444; }
|
||||||
|
.nightMode .more-header { color: #bbb; background: #2a2a2e; border-color: #555; }
|
||||||
|
.nightMode .related-header { color: #999; }
|
||||||
|
.nightMode .rw-word { color: #e0e0e0; }
|
||||||
|
.nightMode .rw-meaning { color: #999; }
|
||||||
|
.nightMode .plural-direction { color: #aaa; }
|
||||||
"""
|
"""
|
||||||
|
|
||||||
# ──────────────────────────────────────────────────────────────────────────────
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
@ -422,7 +449,7 @@ CONJ_BACK = """
|
||||||
<div class="hebrew">{{ConjugatedForm}}{{#Prep}} ({{Prep}}){{/Prep}}</div>
|
<div class="hebrew">{{ConjugatedForm}}{{#Prep}} ({{Prep}}){{/Prep}}</div>
|
||||||
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
|
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
|
||||||
<details class="more-toggle"><summary class="more-header">מידע נוסף</summary>
|
<details class="more-toggle"><summary class="more-header">מידע נוסף</summary>
|
||||||
{{#Meaning}}<div class="sec-label" style="text-align:center;display:block;">{{Meaning}}</div>{{/Meaning}}
|
{{#Meaning}}<div class="meaning" style="font-size:28px;">{{Meaning}}</div>{{/Meaning}}
|
||||||
<div class="sec-table">
|
<div class="sec-table">
|
||||||
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">{{Root}}</span></div>
|
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">{{Root}}</span></div>
|
||||||
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span><span class="sec-val">{{Binyan}}</span></div>
|
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span><span class="sec-val">{{Binyan}}</span></div>
|
||||||
|
|
@ -693,6 +720,116 @@ _EMOJI_STOP = frozenset(
|
||||||
"bar",
|
"bar",
|
||||||
"wheel",
|
"wheel",
|
||||||
"horizontal",
|
"horizontal",
|
||||||
|
# Polysemous keywords producing wrong-sense emoji (Sprint 17 audit)
|
||||||
|
"high", # ⚡ high voltage, not "tall"
|
||||||
|
"down", # 🫳 palm down, not "descend"
|
||||||
|
"off", # 📴 phone off, not "remove"
|
||||||
|
"away", # 💨 dashing away, not "depart"
|
||||||
|
"together", # 🤲 palms together, not "unite"
|
||||||
|
"top", # 🎩 top hat, not "upper"
|
||||||
|
"low", # 🔈 low volume, not "short"
|
||||||
|
"flat", # 🥿 ballet flat, not "apartment"
|
||||||
|
"soft", # 🍦 soft serve, not "quiet"
|
||||||
|
"broken", # 💔 broken heart, not "damaged"
|
||||||
|
"round", # 📍 round pushpin, not "circular"
|
||||||
|
"cool", # 🆒 COOL button, not "cold"
|
||||||
|
"free", # 🆓 FREE button, not "liberated"
|
||||||
|
"long", # 🪘 long drum, not "lengthy"
|
||||||
|
"straight", # 📏 straight ruler, not "direct"
|
||||||
|
"empty", # 🪹 empty nest, not "void"
|
||||||
|
"hot", # 🥵 hot face, not "warm"
|
||||||
|
"cross", # ✝️ latin cross, not "intersect"
|
||||||
|
"bright", # 🔆 bright button, not "luminous"
|
||||||
|
"old", # 👴 old man, not "aged"
|
||||||
|
"head", # 🙂↔️ shaking head, not "leader"
|
||||||
|
# Category words that match generic emoji
|
||||||
|
"military", # 🎖️ military medal for any military term
|
||||||
|
"sports", # 🏅 sports medal for any sports term
|
||||||
|
"food", # 😋 yummy face for any food term
|
||||||
|
"city", # 🇻🇦 Vatican flag for any city
|
||||||
|
"china", # 🇨🇳 China flag for "porcelain"
|
||||||
|
"polish", # 💅 nail polish for "to polish/shine"
|
||||||
|
"aid", # 🦻 hearing aid for "to help"
|
||||||
|
"office", # 🧑💼 office worker for "bureau"
|
||||||
|
"construction", # 🏛️ classical building, not construction
|
||||||
|
"cinema", # 🎦 cinema emoji for any film term
|
||||||
|
"ceremony", # 🎑 moon ceremony for any ceremony
|
||||||
|
"building", # 🏛️ classical building for any structure
|
||||||
|
# Body parts / human features → wrong emoji
|
||||||
|
"arm", # 🦾 mechanical arm for "to arm"
|
||||||
|
"hair", # 👱 blond person for "hair"
|
||||||
|
"nose", # 😤 steam from nose
|
||||||
|
"tongue", # 😛 tongue-out face
|
||||||
|
"chest", # not a chest
|
||||||
|
"eyes", # 😃 face with eyes
|
||||||
|
# Abstract/vague words
|
||||||
|
"fear", # 😱 screaming face
|
||||||
|
"anger", # 💢 anger symbol
|
||||||
|
"angry", # 😠 angry face
|
||||||
|
"tired", # 😫 tired face
|
||||||
|
"sad", # 😥 sad face
|
||||||
|
"joy", # 😂 tears of joy
|
||||||
|
"love", # 💌 love letter
|
||||||
|
"cold", # 🥶 cold face
|
||||||
|
"pile", # 💩 pile of poo
|
||||||
|
"man", # 👨 man
|
||||||
|
"woman", # 👩 woman
|
||||||
|
"boy", # 👦 boy
|
||||||
|
"girl", # 👧 girl
|
||||||
|
"baby", # 👶 baby
|
||||||
|
"children", # 🚸 children crossing
|
||||||
|
"student", # 🧑🎓 student
|
||||||
|
"adult", # 🧑🧑🧒 family
|
||||||
|
"name", # 📛 name badge
|
||||||
|
"check", # ✅ check mark
|
||||||
|
"line", # 🫥 dotted line face
|
||||||
|
"floor", # 🤣 ROFL (rolling on floor)
|
||||||
|
"room", # 🧖 person in steamy room
|
||||||
|
"bubble", # 👁️🗨️ speech bubble
|
||||||
|
"car", # 🚃 railway car, not automobile
|
||||||
|
"bullet", # 🚅 bullet train
|
||||||
|
"steam", # 😤 face with steam
|
||||||
|
"fly", # 🪰 the insect, not the verb
|
||||||
|
"plant", # 🪴 potted plant for all "X (plant)" entries
|
||||||
|
"tree", # 🌲 evergreen for all "X (tree)" entries
|
||||||
|
"ball", # ⛹️ person bouncing ball
|
||||||
|
"bag", # 👝 clutch bag
|
||||||
|
"fight", # not a fight
|
||||||
|
"cloud", # not a cloud
|
||||||
|
"video", # 🎮 video game, not video
|
||||||
|
"rescue", # ⛑️ rescue worker helmet
|
||||||
|
"exchange", # 💱 currency exchange
|
||||||
|
"cut", # 🥩 cut of meat, not "to cut"
|
||||||
|
"key", # 🔐 locked with key
|
||||||
|
"walking", # 🚶 person walking
|
||||||
|
"running", # 🏃 person running
|
||||||
|
"climbing", # 🧗 person climbing
|
||||||
|
"speaking", # 🗣️ speaking head
|
||||||
|
"playing", # 🤽 person playing
|
||||||
|
"feeding", # 👩🍼 person feeding
|
||||||
|
"shooting", # 🌠 shooting star
|
||||||
|
"clapping", # 👏 clapping hands
|
||||||
|
"cooking", # 🍳 cooking emoji
|
||||||
|
"holding", # 🥹 face holding back tears
|
||||||
|
# More wrong-sense matches from remaining audit
|
||||||
|
"paper", # 🏮 red lantern for "paper"
|
||||||
|
"track", # 🛤️ railroad for "track record"
|
||||||
|
"vertical", # 🚦 traffic light for "vertical"
|
||||||
|
"speaker", # 🔇 muted speaker for "speaker (person)"
|
||||||
|
"square", # 🟥 red square for "plaza"
|
||||||
|
"wrapped", # 🎁 gift for "wrapped, bound"
|
||||||
|
"volume", # 🔈 speaker for "volume (book)"
|
||||||
|
"mobile", # 📱 phone for "mobile, moveable"
|
||||||
|
"flash", # 📸 camera flash for "to shine"
|
||||||
|
"identification", # 🪪 ID card for "locating"
|
||||||
|
"service", # 🐕🦺 service dog for "service, term"
|
||||||
|
"ground", # ⛱️ umbrella on ground
|
||||||
|
"machine", # 🎰 slot machine for "mechanism"
|
||||||
|
"liquid", # 🫗 pouring for "liquid, drop"
|
||||||
|
"vehicle", # 🚙 SUV for any vehicle mention
|
||||||
|
"window", # 🪟 window pane for "window, gap"
|
||||||
|
"information", # ℹ️ info symbol
|
||||||
|
"child", # 🧒 child emoji
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
@ -832,9 +969,11 @@ def build_vocab_deck(
|
||||||
if word_nikkud not in word_to_pos_cat:
|
if word_nikkud not in word_to_pos_cat:
|
||||||
word_to_pos_cat[word_nikkud] = _categorize_pos(pos_raw) if pos_raw else "Other"
|
word_to_pos_cat[word_nikkud] = _categorize_pos(pos_raw) if pos_raw else "Other"
|
||||||
|
|
||||||
# Sort entries by frequency (null → 999999), applying limit after sort
|
# Sort entries by effective frequency (pseudo_frequency for confusables,
|
||||||
|
# else regular frequency; null → 999999), applying limit after sort
|
||||||
def _freq_key(item: tuple[str, dict]) -> int:
|
def _freq_key(item: tuple[str, dict]) -> int:
|
||||||
return item[1].get("frequency") or 999_999
|
e = item[1]
|
||||||
|
return e.get("pseudo_frequency") or e.get("frequency") or 999_999
|
||||||
|
|
||||||
sorted_entries = sorted(words.items(), key=_freq_key)
|
sorted_entries = sorted(words.items(), key=_freq_key)
|
||||||
if limit:
|
if limit:
|
||||||
|
|
@ -860,7 +999,6 @@ def build_vocab_deck(
|
||||||
meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ;:")
|
meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ;:")
|
||||||
meaning = re.sub(r"(\w)\(", r"\1 (", meaning) # space before opening paren
|
meaning = re.sub(r"(\w)\(", r"\1 (", meaning) # space before opening paren
|
||||||
meaning = re.sub(r",(\S)", r", \1", meaning) # space after comma
|
meaning = re.sub(r",(\S)", r", \1", meaning) # space after comma
|
||||||
meaning_raw = entry.get("meaning_raw", "") or ""
|
|
||||||
slug = entry.get("slug", "") or ""
|
slug = entry.get("slug", "") or ""
|
||||||
frequency = entry.get("frequency") or 999_999
|
frequency = entry.get("frequency") or 999_999
|
||||||
audio_file = entry.get("audio_file", "") or ""
|
audio_file = entry.get("audio_file", "") or ""
|
||||||
|
|
@ -895,25 +1033,22 @@ def build_vocab_deck(
|
||||||
else:
|
else:
|
||||||
freq_display = "Unlisted"
|
freq_display = "Unlisted"
|
||||||
|
|
||||||
# Emoji: use entry's emoji if emoji_visible, else fall back to emoji_lookup
|
# Emoji: use entry's emoji if emoji_visible, else fall back to emoji_lookup.
|
||||||
|
# Skip fallback for verbs — keyword matching on verb definitions produces
|
||||||
|
# wrong-sense emoji (e.g. "to cut" → 🥩, "to arm" → 🦾).
|
||||||
emoji_str = ""
|
emoji_str = ""
|
||||||
if entry.get("emoji_visible") and entry.get("emoji"):
|
if entry.get("emoji_visible") and entry.get("emoji"):
|
||||||
emoji_str = entry["emoji"]
|
emoji_str = entry["emoji"]
|
||||||
elif not emoji_str and emoji_lookup:
|
elif emoji_lookup and not meaning.startswith("to "):
|
||||||
meaning_clean_for_emoji = EMOJI_RE.sub("", meaning).strip()
|
meaning_clean_for_emoji = EMOJI_RE.sub("", meaning).strip()
|
||||||
for kw in re.sub(r"[^\w\s]", " ", meaning_clean_for_emoji.lower()).split()[:5]:
|
for kw in re.sub(r"[^\w\s]", " ", meaning_clean_for_emoji.lower()).split()[:5]:
|
||||||
if len(kw) > 2 and kw not in _EMOJI_STOP and kw in emoji_lookup:
|
if len(kw) > 2 and kw not in _EMOJI_STOP and kw in emoji_lookup:
|
||||||
emoji_str = emoji_lookup[kw]
|
emoji_str = emoji_lookup[kw]
|
||||||
break
|
break
|
||||||
|
|
||||||
# Extract Hebrew prepositions: prefer upstream-parsed prep field, fall back to meaning_raw scan
|
# Hebrew prepositions — extracted upstream by list scraper
|
||||||
# (fallback covers entries scraped before prep was moved upstream)
|
|
||||||
entry_prep = entry.get("prep")
|
entry_prep = entry.get("prep")
|
||||||
if entry_prep:
|
prep_str = " ".join(f"({p})" for p in entry_prep.split()) if entry_prep else ""
|
||||||
prep_str = " ".join(f"({p})" for p in entry_prep.split())
|
|
||||||
else:
|
|
||||||
preps = HBPAREN_RE.findall(meaning_raw)
|
|
||||||
prep_str = " ".join(f"({p})" for p in preps)
|
|
||||||
|
|
||||||
# Audio — use audio_file from entry; for confusables it's already slug-based
|
# Audio — use audio_file from entry; for confusables it's already slug-based
|
||||||
audio_tag = ""
|
audio_tag = ""
|
||||||
|
|
@ -1123,25 +1258,12 @@ def build_conj_deck(
|
||||||
root = ".".join(root_list)
|
root = ".".join(root_list)
|
||||||
voice = VOICE_MAP.get(binyan, "")
|
voice = VOICE_MAP.get(binyan, "")
|
||||||
|
|
||||||
meaning_raw = entry.get("meaning_raw", "") or ""
|
|
||||||
meaning = entry.get("meaning", "") or ""
|
meaning = entry.get("meaning", "") or ""
|
||||||
# Extract Hebrew preposition — strip from meaning, show on Hebrew side
|
# Hebrew preposition — extracted upstream by scraper
|
||||||
prep_str = ""
|
prep_str = ""
|
||||||
conj_prep = conj.get("prep")
|
conj_prep = conj.get("prep") or entry.get("prep")
|
||||||
if conj_prep:
|
if conj_prep:
|
||||||
# Strip any parentheses from stored prep value
|
|
||||||
prep_str = conj_prep.strip("() ")
|
prep_str = conj_prep.strip("() ")
|
||||||
elif meaning_raw:
|
|
||||||
preps = HBPAREN_RE.findall(meaning_raw)
|
|
||||||
if preps:
|
|
||||||
prep_str = preps[0]
|
|
||||||
# Strip Hebrew prepositions from English meaning to avoid duplication
|
|
||||||
if prep_str:
|
|
||||||
meaning = HBPAREN_RE.sub("", meaning).strip()
|
|
||||||
# Also strip from meaning_raw patterns like "(על)"
|
|
||||||
meaning = re.sub(r"\(\s*" + re.escape(prep_str) + r"\s*-?\s*\)", "", meaning).strip()
|
|
||||||
# Clean up double spaces and trailing commas
|
|
||||||
meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ")
|
|
||||||
|
|
||||||
related = [(f, w, m) for f, w, m in root_words.get(root, []) if w != infinitive]
|
related = [(f, w, m) for f, w, m in root_words.get(root, []) if w != infinitive]
|
||||||
if related:
|
if related:
|
||||||
|
|
@ -1438,9 +1560,12 @@ def build_confusables_deck(
|
||||||
guid = genanki.guid_for("confusable", entry["word"].get("ktiv_male", unique_key))
|
guid = genanki.guid_for("confusable", entry["word"].get("ktiv_male", unique_key))
|
||||||
guid_to_entries.setdefault(guid, []).append(entry)
|
guid_to_entries.setdefault(guid, []).append(entry)
|
||||||
|
|
||||||
|
def _eff_freq(e: dict) -> int:
|
||||||
|
return e.get("pseudo_frequency") or e.get("frequency") or 999_999
|
||||||
|
|
||||||
for guid, group_entries in sorted(
|
for guid, group_entries in sorted(
|
||||||
guid_to_entries.items(),
|
guid_to_entries.items(),
|
||||||
key=lambda x: sum(e.get("frequency") or 999_999 for e in x[1]) / len(x[1]),
|
key=lambda x: sum(_eff_freq(e) for e in x[1]) / len(x[1]),
|
||||||
):
|
):
|
||||||
if guid in seen_guids:
|
if guid in seen_guids:
|
||||||
continue
|
continue
|
||||||
|
|
@ -1459,9 +1584,13 @@ def build_confusables_deck(
|
||||||
unique_entries.append(e)
|
unique_entries.append(e)
|
||||||
if len(unique_entries) < 2:
|
if len(unique_entries) < 2:
|
||||||
continue
|
continue
|
||||||
|
# Sort by pseudo/frequency so most common meaning appears first
|
||||||
|
unique_entries.sort(key=_eff_freq)
|
||||||
|
if len(unique_entries) < 2:
|
||||||
|
continue
|
||||||
|
|
||||||
word_no_nik = unique_entries[0]["word"].get("ktiv_male", "")
|
word_no_nik = unique_entries[0]["word"].get("ktiv_male", "")
|
||||||
words_display = " / ".join(e["word"]["nikkud"] for e in unique_entries)
|
words_display = word_no_nik # Show ktiv male (shared form) on front
|
||||||
|
|
||||||
defs_parts: list[str] = []
|
defs_parts: list[str] = []
|
||||||
audio_parts: list[str] = []
|
audio_parts: list[str] = []
|
||||||
|
|
@ -1530,8 +1659,8 @@ def write_conf_apkg(
|
||||||
PLURAL_FRONT_SG = """
|
PLURAL_FRONT_SG = """
|
||||||
<div class="hebrew" style="color:#1a1a8c;">{{Singular}}</div>
|
<div class="hebrew" style="color:#1a1a8c;">{{Singular}}</div>
|
||||||
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
|
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
|
||||||
<div class="sec-label">{{Meaning}}</div>
|
<div class="meaning" style="font-size:28px;">{{Meaning}}</div>
|
||||||
<div class="hint" style="font-size:28px;">יָחִיד ← רַבִּים</div>
|
<div class="plural-direction">יָחִיד ← רַבִּים</div>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
PLURAL_BACK_SG = """
|
PLURAL_BACK_SG = """
|
||||||
|
|
@ -1547,14 +1676,14 @@ PLURAL_BACK_SG = """
|
||||||
PLURAL_FRONT_PL = """
|
PLURAL_FRONT_PL = """
|
||||||
<div class="hebrew" style="color:#1a1a8c;">{{Plural}}</div>
|
<div class="hebrew" style="color:#1a1a8c;">{{Plural}}</div>
|
||||||
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
|
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
|
||||||
<div class="hint" style="font-size:28px;">רַבִּים ← יָחִיד</div>
|
<div class="plural-direction">רַבִּים ← יָחִיד</div>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
PLURAL_BACK_PL = """
|
PLURAL_BACK_PL = """
|
||||||
{{FrontSide}}<hr>
|
{{FrontSide}}<hr>
|
||||||
<div class="hebrew">{{Singular}}</div>
|
<div class="hebrew">{{Singular}}</div>
|
||||||
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
|
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
|
||||||
<div class="sec-label" style="text-align:center;display:block;">{{Meaning}}</div>
|
<div class="meaning" style="font-size:28px;">{{Meaning}}</div>
|
||||||
<div class="sec-table">
|
<div class="sec-table">
|
||||||
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span><span class="sec-val">{{Gender}}</span></div>{{/Gender}}
|
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span><span class="sec-val">{{Gender}}</span></div>{{/Gender}}
|
||||||
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span><span class="sec-val">{{Mishkal}}</span></div>{{/Mishkal}}
|
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span><span class="sec-val">{{Mishkal}}</span></div>{{/Mishkal}}
|
||||||
|
|
@ -1651,9 +1780,9 @@ def build_plural_deck(
|
||||||
irregular_count = len(irregulars)
|
irregular_count = len(irregulars)
|
||||||
target_regular = irregular_count * 2
|
target_regular = irregular_count * 2
|
||||||
mishkal_count = len(by_mishkal) or 1
|
mishkal_count = len(by_mishkal) or 1
|
||||||
per_mishkal = max(2, target_regular // mishkal_count)
|
# Over-sample per mishkal to compensate for small patterns, then trim
|
||||||
|
per_mishkal = max(3, (target_regular * 3) // (mishkal_count * 2))
|
||||||
|
|
||||||
selected: list[tuple[str, dict, dict]] = list(irregulars)
|
|
||||||
regular_pool: list[tuple[str, dict, dict]] = []
|
regular_pool: list[tuple[str, dict, dict]] = []
|
||||||
for _mishkal, entries in sorted(by_mishkal.items()):
|
for _mishkal, entries in sorted(by_mishkal.items()):
|
||||||
entries.sort(key=lambda e: e[1].get("frequency") or 999_999)
|
entries.sort(key=lambda e: e[1].get("frequency") or 999_999)
|
||||||
|
|
@ -1664,7 +1793,24 @@ def build_plural_deck(
|
||||||
regular_pool.sort(key=lambda e: e[1].get("frequency") or 999_999)
|
regular_pool.sort(key=lambda e: e[1].get("frequency") or 999_999)
|
||||||
regular_pool = regular_pool[:target_regular]
|
regular_pool = regular_pool[:target_regular]
|
||||||
|
|
||||||
selected.extend(regular_pool)
|
# Sort both pools by frequency, then interleave for homogeneous 2:1 regular:irregular
|
||||||
|
irregulars.sort(key=lambda e: e[1].get("frequency") or 999_999)
|
||||||
|
regular_pool.sort(key=lambda e: e[1].get("frequency") or 999_999)
|
||||||
|
|
||||||
|
# Interleave: for every 1 irregular, insert 2 regulars
|
||||||
|
selected: list[tuple[str, dict, dict]] = []
|
||||||
|
ri = 0 # regular index
|
||||||
|
for _ii, irr in enumerate(irregulars):
|
||||||
|
# Insert 2 regulars before each irregular (when available)
|
||||||
|
for _ in range(2):
|
||||||
|
if ri < len(regular_pool):
|
||||||
|
selected.append(regular_pool[ri])
|
||||||
|
ri += 1
|
||||||
|
selected.append(irr)
|
||||||
|
# Append remaining regulars
|
||||||
|
while ri < len(regular_pool):
|
||||||
|
selected.append(regular_pool[ri])
|
||||||
|
ri += 1
|
||||||
|
|
||||||
note_count = 0
|
note_count = 0
|
||||||
for _unique_key, entry, noun_inflection in selected:
|
for _unique_key, entry, noun_inflection in selected:
|
||||||
|
|
|
||||||
50000
data/en_50k.txt
Normal file
50000
data/en_50k.txt
Normal file
File diff suppressed because it is too large
Load diff
1356
data/words.json
1356
data/words.json
File diff suppressed because it is too large
Load diff
|
|
@ -82,7 +82,7 @@ BINYAN_HEBREW: dict[str, str] = {
|
||||||
|
|
||||||
# Regex for extracting emoji characters
|
# Regex for extracting emoji characters
|
||||||
EMOJI_RE = re.compile(
|
EMOJI_RE = re.compile(
|
||||||
r"[\U0001F300-\U0001FFFF\U00002600-\U000027BF\U0001F000-\U0001F9FF\u2600-\u26FF\u2700-\u27BF]+",
|
r"[\U0001F300-\U0001FFFF\U00002600-\U000027BF\U0001F000-\U0001F9FF\u2600-\u26FF\u2700-\u27BF\uFE0E\uFE0F\u200D]+",
|
||||||
re.UNICODE,
|
re.UNICODE,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -24,7 +24,7 @@ sys.path.insert(0, "/home/node/projects")
|
||||||
import load_keeshare
|
import load_keeshare
|
||||||
|
|
||||||
REPO_API = "https://git.nevo.engineer/api/v1/repos/nevo/hebrew_flash_cards"
|
REPO_API = "https://git.nevo.engineer/api/v1/repos/nevo/hebrew_flash_cards"
|
||||||
FORGEJO_TOKEN: str = load_keeshare.get_entry("git.nevo.engineer")["API_TOKEN"]
|
FORGEJO_TOKEN: str = load_keeshare.get_entry("git.nevo.engineer")["password"]
|
||||||
OUTPUT_DIR = Path(__file__).parent / "output"
|
OUTPUT_DIR = Path(__file__).parent / "output"
|
||||||
|
|
||||||
# All deck variants to include in release
|
# All deck variants to include in release
|
||||||
|
|
|
||||||
269
scripts/assign_pseudo_frequency.py
Normal file
269
scripts/assign_pseudo_frequency.py
Normal file
|
|
@ -0,0 +1,269 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Assign pseudo-frequency to confusable groups using English word frequency.
|
||||||
|
|
||||||
|
Problem: Confusable entries share the same ktiv_male and thus the same Hebrew
|
||||||
|
frequency rank. This script uses English frequency to differentiate them so
|
||||||
|
Anki sorts more-common meanings first.
|
||||||
|
|
||||||
|
Algorithm:
|
||||||
|
1. For each confusable group where all entries share the same Hebrew frequency,
|
||||||
|
extract the meaningful English keywords from each entry's meaning field.
|
||||||
|
2. Use the English frequency rank of the first of those keywords (up to five are tried) that appears in the English list.
|
||||||
|
3. Assign pseudo_frequency: the most frequent English meaning keeps the original
|
||||||
|
Hebrew rank; less frequent meanings get progressively higher (worse) ranks
|
||||||
|
by adding an offset (100 * position in group).
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/assign_pseudo_frequency.py # assign and save
|
||||||
|
python3 scripts/assign_pseudo_frequency.py --dry-run # preview only
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from collections import defaultdict
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
# Module-level logger; level and format are configured in main().
logger = logging.getLogger(__name__)

# Data files are resolved relative to the directory two levels above this file.
PROJECT_ROOT = Path(__file__).parent.parent
WORDS_JSON = PROJECT_ROOT.joinpath("data", "words.json")
EN_FREQ_PATH = PROJECT_ROOT.joinpath("data", "en_50k.txt")

# Words too common/vague to use as frequency signal
_EN_STOP = frozenset(
    """
    to be a an the of in on at for
    and with by or but not as its it is
    was are from that this have has had do does
    did will would can could may might shall should must
    no yes very too also just only so up out
    into over after before about more than other some any
    all each every both few many much most such own
    same well still even how what when where which who
    whom whose why because if then else while until though
    whether
    """.split()
)
|
||||||
|
|
||||||
|
|
||||||
|
def _load_en_freq() -> dict[str, int]:
|
||||||
|
"""Load English frequency data: word -> rank (1 = most common)."""
|
||||||
|
freq: dict[str, int] = {}
|
||||||
|
rank = 1
|
||||||
|
with open(EN_FREQ_PATH, encoding="utf-8") as f:
|
||||||
|
for line in f:
|
||||||
|
parts = line.strip().split()
|
||||||
|
if parts:
|
||||||
|
word = parts[0].lower()
|
||||||
|
if word not in freq:
|
||||||
|
freq[word] = rank
|
||||||
|
rank += 1
|
||||||
|
return freq
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_keywords(meaning: str) -> list[str]:
|
||||||
|
"""Extract meaningful English keywords from a meaning string.
|
||||||
|
|
||||||
|
Returns list of lowercase words, filtered for stop words and short words.
|
||||||
|
"""
|
||||||
|
# Strip parenthesized content, punctuation
|
||||||
|
cleaned = re.sub(r"\([^)]*\)", " ", meaning)
|
||||||
|
cleaned = re.sub(r"[^\w\s]", " ", cleaned)
|
||||||
|
return [w.lower() for w in cleaned.split() if len(w) > 2 and w.lower() not in _EN_STOP]
|
||||||
|
|
||||||
|
|
||||||
|
def assign_pseudo_frequencies(
    words: dict,
    en_freq: dict[str, int],
    dry_run: bool = False,
) -> int:
    """Assign pseudo_frequency to confusable groups. Returns count of changes.

    Entries are grouped by their ``confusables_guid``. A group is processed
    only when all members carry the same ``frequency`` value (including all
    missing) and their English ranks are not all identical. Within a
    processed group, members are ordered by English rank; the first keeps
    the shared frequency and each later one gets a further +100.

    Args:
        words: Mapping of entry key to entry dict. Unless ``dry_run`` is set,
            ``pseudo_frequency`` is written into the affected entries in place.
        en_freq: English word -> rank, where a lower rank is more common.
        dry_run: When true, log each would-be assignment and leave ``words``
            unmodified.

    Returns:
        Number of entries that received (or, in a dry run, would receive) a
        ``pseudo_frequency``.
    """
    # Group entry keys by confusables_guid
    groups: dict[str, list[str]] = defaultdict(list)
    for key, entry in words.items():
        cg = entry.get("confusables_guid")
        if cg:
            groups[cg].append(key)

    changes = 0
    assigned_groups = 0
    skipped_diff = 0
    skipped_no_en = 0

    for keys in groups.values():
        freqs = [words[k].get("frequency") for k in keys]

        # Skip groups that are already differentiated
        if len(set(freqs)) > 1:
            skipped_diff += 1
            continue

        base_freq = freqs[0]  # All same (or all None)

        # English rank per entry: the first of up to five keywords that has a
        # rank decides; 999_999 marks "no keyword found in the English list".
        en_ranks: list[tuple[int, str]] = []  # (en_rank, key)
        for key in keys:
            # "or ''" guards against a null meaning, which would crash the
            # regex-based keyword extraction.
            keywords = _extract_keywords(words[key].get("meaning") or "")
            en_rank = next(
                (r for kw in keywords[:5] if (r := en_freq.get(kw)) is not None),
                999_999,
            )
            en_ranks.append((en_rank, key))

        # Sort by English frequency (lower rank = more common); ties fall
        # back to the entry key via tuple comparison.
        en_ranks.sort()

        # All entries share one English rank: nothing to differentiate on
        if len({r for r, _ in en_ranks}) <= 1:
            skipped_no_en += 1
            continue

        assigned_groups += 1

        # Most common meaning keeps the base rank, others get an offset.
        # Groups without any Hebrew frequency are placed at 50000 + English rank.
        for position, (en_rank, key) in enumerate(en_ranks):
            pseudo = base_freq + position * 100 if base_freq is not None else 50000 + en_rank

            # Counted in both modes so a dry run can report how many changes
            # would be made.
            changes += 1

            if dry_run:
                meaning = (words[key].get("meaning") or "")[:40]
                logger.info(
                    "  [en:%5d] pseudo=%6d  %s",
                    en_rank,
                    pseudo,
                    meaning,
                )
            else:
                words[key]["pseudo_frequency"] = pseudo

    logger.info(
        "Pseudo-frequency: %d groups assigned, %d already differentiated, %d no English signal",
        assigned_groups,
        skipped_diff,
        skipped_no_en,
    )
    return changes
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Command-line entry point.

    Loads the English frequency list and words.json, runs
    ``assign_pseudo_frequencies`` and, unless ``--dry-run`` was given,
    writes the updated entries back to words.json.
    """
    cli = argparse.ArgumentParser(description="Assign pseudo-frequency to confusables")
    cli.add_argument("--dry-run", action="store_true", help="Preview without saving")
    opts = cli.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    logger.info("Loading English frequency data: %s", EN_FREQ_PATH)
    rank_by_word = _load_en_freq()
    logger.info("English frequency: %d entries", len(rank_by_word))

    with open(WORDS_JSON, encoding="utf-8") as src:
        words: dict = json.load(src)

    n_changed = assign_pseudo_frequencies(words, rank_by_word, dry_run=opts.dry_run)

    if not args_require_save(opts):
        logger.info("Dry run — %d changes would be made", n_changed)
        return

    with open(WORDS_JSON, "w", encoding="utf-8") as dst:
        json.dump(words, dst, ensure_ascii=False, indent=2)

    logger.info("Saved %d pseudo-frequency assignments to words.json", n_changed)


def args_require_save(opts: argparse.Namespace) -> bool:
    """Return True when the parsed options ask for words.json to be rewritten."""
    return not opts.dry_run


if __name__ == "__main__":
    main()
|
||||||
Loading…
Reference in a new issue