From f3496998f55e871cb422af95ee6ddbe00232483c Mon Sep 17 00:00:00 2001 From: Sochen Date: Sat, 21 Mar 2026 02:19:03 +0000 Subject: [PATCH] feat: confusables show ktiv male, emoji/prep stripping fully upstream MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Confusables deck front now shows shared ktiv male form instead of nikkud variants joined by "/". Back still shows nikkud with definitions. - Fixed list scraper EMOJI_RE to catch variation selectors (U+FE0F) and ZWJ (U+200D) — cleaned 17 entries with leftover selectors in meaning. - Removed build-time prep extraction fallback (0 entries relied on it). - release.py: fix keeshare field name (API_TOKEN → password). Closes: Pealim #11 (emoji/prep upstream), Pealim #16 (confusables ktiv male) Co-Authored-By: Claude Opus 4.6 --- apkg_builder.py | 89 +++++++++++++++++++++++++++---------------- data/words.json | 34 ++++++++--------- pealim_list_scrape.py | 2 +- release.py | 2 +- 4 files changed, 76 insertions(+), 51 deletions(-) diff --git a/apkg_builder.py b/apkg_builder.py index 6ae60e5..97dea4f 100644 --- a/apkg_builder.py +++ b/apkg_builder.py @@ -265,6 +265,14 @@ details[open] > .more-header::before { content: "● "; } text-align: center; margin: 0.3em 0; } +.plural-direction { + font-size: 32px; + color: #444; + text-align: center; + direction: rtl; + margin: 8px 0; + font-weight: bold; +} .card [type="button"], .card button, .replay-button { display: block !important; margin: 4px auto !important; @@ -288,7 +296,26 @@ details[open] > .more-header::before { content: "● "; } .related-header { color: #999; } .rw-word { color: #e0e0e0; } .rw-meaning { color: #999; } + .plural-direction { color: #aaa; } } +.nightMode .card { color: #e8e8e8; background: #1c1c1e; } +.nightMode .hebrew { color: #f0f0f0; } +.nightMode .hebrew-sm { color: #e0e0e0; } +.nightMode .meaning { color: #82b0ff; } +.nightMode .sec-label { color: #e0e0e0; } +.nightMode .sec-key { color: #e0e0e0; } +.nightMode .sec-val { color: #e0e0e0; } +.nightMode .conf-entry { color: #ddd; } +.nightMode .hint { color: #777; } +.nightMode .voice-label { color: #888; } +.nightMode .example { color: #e0e0e0; border-right-color: #555; } +.nightMode .divider { border-top-color: #333; } +.nightMode .freq-badge { color: #888; border-color: #444; } +.nightMode .more-header { color: #bbb; background: #2a2a2e; border-color: #555; } +.nightMode .related-header { color: #999; } +.nightMode .rw-word { color: #e0e0e0; } +.nightMode .rw-meaning { color: #999; } +.nightMode .plural-direction { color: #aaa; } """ # ────────────────────────────────────────────────────────────────────────────── @@ -422,7 +449,7 @@ CONJ_BACK = """
{{ConjugatedForm}}{{#Prep}} ({{Prep}}){{/Prep}}
{{#Audio}}
{{Audio}}
{{/Audio}}
מידע נוסף -{{#Meaning}}
{{Meaning}}
{{/Meaning}} +{{#Meaning}}
{{Meaning}}
{{/Meaning}}
שֹׁרֶשׁ:{{Root}}
בִּנְיָן:{{Binyan}}
@@ -860,7 +887,6 @@ def build_vocab_deck( meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ;:") meaning = re.sub(r"(\w)\(", r"\1 (", meaning) # space before opening paren meaning = re.sub(r",(\S)", r", \1", meaning) # space after comma - meaning_raw = entry.get("meaning_raw", "") or "" slug = entry.get("slug", "") or "" frequency = entry.get("frequency") or 999_999 audio_file = entry.get("audio_file", "") or "" @@ -906,14 +932,9 @@ def build_vocab_deck( emoji_str = emoji_lookup[kw] break - # Extract Hebrew prepositions: prefer upstream-parsed prep field, fall back to meaning_raw scan - # (fallback covers entries scraped before prep was moved upstream) + # Hebrew prepositions — extracted upstream by list scraper entry_prep = entry.get("prep") - if entry_prep: - prep_str = " ".join(f"({p})" for p in entry_prep.split()) - else: - preps = HBPAREN_RE.findall(meaning_raw) - prep_str = " ".join(f"({p})" for p in preps) + prep_str = " ".join(f"({p})" for p in entry_prep.split()) if entry_prep else "" # Audio — use audio_file from entry; for confusables it's already slug-based audio_tag = "" @@ -1123,25 +1144,12 @@ def build_conj_deck( root = ".".join(root_list) voice = VOICE_MAP.get(binyan, "") - meaning_raw = entry.get("meaning_raw", "") or "" meaning = entry.get("meaning", "") or "" - # Extract Hebrew preposition — strip from meaning, show on Hebrew side + # Hebrew preposition — extracted upstream by scraper prep_str = "" - conj_prep = conj.get("prep") + conj_prep = conj.get("prep") or entry.get("prep") if conj_prep: - # Strip any parentheses from stored prep value prep_str = conj_prep.strip("() ") - elif meaning_raw: - preps = HBPAREN_RE.findall(meaning_raw) - if preps: - prep_str = preps[0] - # Strip Hebrew prepositions from English meaning to avoid duplication - if prep_str: - meaning = HBPAREN_RE.sub("", meaning).strip() - # Also strip from meaning_raw patterns like "(על)" - meaning = re.sub(r"\(\s*" + re.escape(prep_str) + r"\s*-?\s*\)", "", meaning).strip() - # Clean up double spaces and trailing commas - meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ") related = [(f, w, m) for f, w, m in root_words.get(root, []) if w != infinitive] if related: @@ -1461,7 +1469,7 @@ def build_confusables_deck( continue word_no_nik = unique_entries[0]["word"].get("ktiv_male", "") - words_display = " / ".join(e["word"]["nikkud"] for e in unique_entries) + words_display = word_no_nik # Show ktiv male (shared form) on front defs_parts: list[str] = [] audio_parts: list[str] = [] @@ -1530,8 +1538,8 @@ def write_conf_apkg( PLURAL_FRONT_SG = """
{{Singular}}
{{#SingularAudio}}
{{SingularAudio}}
{{/SingularAudio}} -
{{Meaning}}
-
יָחִיד ← רַבִּים
+
{{Meaning}}
+
יָחִיד ← רַבִּים
""" PLURAL_BACK_SG = """ @@ -1547,14 +1555,14 @@ PLURAL_BACK_SG = """ PLURAL_FRONT_PL = """
{{Plural}}
{{#PluralAudio}}
{{PluralAudio}}
{{/PluralAudio}} -
רַבִּים ← יָחִיד
+
רַבִּים ← יָחִיד
""" PLURAL_BACK_PL = """ {{FrontSide}}
{{Singular}}
{{#SingularAudio}}
{{SingularAudio}}
{{/SingularAudio}} -
{{Meaning}}
+
{{Meaning}}
{{#Gender}}
מִין:{{Gender}}
{{/Gender}} {{#Mishkal}}
מִשְׁקָל:{{Mishkal}}
{{/Mishkal}} @@ -1651,9 +1659,9 @@ def build_plural_deck( irregular_count = len(irregulars) target_regular = irregular_count * 2 mishkal_count = len(by_mishkal) or 1 - per_mishkal = max(2, target_regular // mishkal_count) + # Over-sample per mishkal to compensate for small patterns, then trim + per_mishkal = max(3, (target_regular * 3) // (mishkal_count * 2)) - selected: list[tuple[str, dict, dict]] = list(irregulars) regular_pool: list[tuple[str, dict, dict]] = [] for _mishkal, entries in sorted(by_mishkal.items()): entries.sort(key=lambda e: e[1].get("frequency") or 999_999) @@ -1664,7 +1672,24 @@ def build_plural_deck( regular_pool.sort(key=lambda e: e[1].get("frequency") or 999_999) regular_pool = regular_pool[:target_regular] - selected.extend(regular_pool) + # Sort both pools by frequency, then interleave for homogeneous 2:1 regular:irregular + irregulars.sort(key=lambda e: e[1].get("frequency") or 999_999) + regular_pool.sort(key=lambda e: e[1].get("frequency") or 999_999) + + # Interleave: for every 1 irregular, insert 2 regulars + selected: list[tuple[str, dict, dict]] = [] + ri = 0 # regular index + for _ii, irr in enumerate(irregulars): + # Insert 2 regulars before each irregular (when available) + for _ in range(2): + if ri < len(regular_pool): + selected.append(regular_pool[ri]) + ri += 1 + selected.append(irr) + # Append remaining regulars + while ri < len(regular_pool): + selected.append(regular_pool[ri]) + ri += 1 note_count = 0 for _unique_key, entry, noun_inflection in selected: diff --git a/data/words.json b/data/words.json index 09973ae..0a707fc 100644 --- a/data/words.json +++ b/data/words.json @@ -28003,7 +28003,7 @@ "root": [], "pos": "Noun", "pos_hebrew": "שֵׁם עֶצֶם", - "meaning": "️ motorbike, motorcycle", + "meaning": "motorbike, motorcycle", "meaning_raw": "🏍️ motorbike, motorcycle", "audio_url": "https://audio.pealim.com/v0/1s/1s8poqw78fk8v.mp3", "audio_file": "אופנוע.mp3", @@ -107000,7 +107000,7 @@ "root": [], "pos": "Noun", "pos_hebrew": "שֵׁם עֶצֶם", - "meaning": "️ stadium", + "meaning": "stadium", "meaning_raw": "🏟️ stadium", "audio_url": "https://audio.pealim.com/v0/vl/vln4sqr0ez7n.mp3", "audio_file": "אצטדיון.mp3", @@ -197835,7 +197835,7 @@ ], "pos": "Noun", "pos_hebrew": "שֵׁם עֶצֶם", - "meaning": "recreation, leisure, pleasure️; spending (time, resource)", + "meaning": "recreation, leisure, pleasure; spending (time, resource)", "meaning_raw": "recreation, leisure, pleasure🏖️; spending (time, resource)", "audio_url": "https://audio.pealim.com/v0/12/12s7w66rute00.mp3", "audio_file": "בילוי.mp3", @@ -573280,7 +573280,7 @@ ], "pos": "Noun", "pos_hebrew": "שֵׁם עֶצֶם", - "meaning": "️ smile", + "meaning": "smile", "meaning_raw": "☺️ smile", "audio_url": "https://audio.pealim.com/v0/1o/1o0qkdlzzgz5y.mp3", "audio_file": "חיוך.mp3", @@ -723335,7 +723335,7 @@ "root": [], "pos": "Noun", "pos_hebrew": "שֵׁם עֶצֶם", - "meaning": "️ telephone; (coll.) telephone call", + "meaning": "telephone; (coll.) telephone call", "meaning_raw": "☎️ telephone; (coll.) telephone call", "audio_url": "https://audio.pealim.com/v0/q4/q4vlmpn3xv25.mp3", "audio_file": "טלפון.mp3", @@ -768548,7 +768548,7 @@ "root": [], "pos": "Noun", "pos_hebrew": "שֵׁם עֶצֶם", - "meaning": "diamond; ️ diamonds (cards)", + "meaning": "diamond; diamonds (cards)", "meaning_raw": "💎 diamond; ♦️ diamonds (cards)", "audio_url": "https://audio.pealim.com/v0/2n/2ng3y19ptzo6.mp3", "audio_file": "יהלום.mp3", @@ -778091,7 +778091,7 @@ "root": [], "pos": "Noun", "pos_hebrew": "שֵׁם עֶצֶם", - "meaning": "️ yacht", + "meaning": "yacht", "meaning_raw": "🛥️ yacht", "audio_url": "https://audio.pealim.com/v0/fn/fnqancplf3p9.mp3", "audio_file": "יכטה.mp3", @@ -927031,7 +927031,7 @@ ], "pos": "Noun", "pos_hebrew": "שֵׁם עֶצֶם", - "meaning": "ability, fitness, capability; ️ physical fitness", + "meaning": "ability, fitness, capability; physical fitness", "meaning_raw": "ability, fitness, capability; 🏋️ physical fitness", "audio_url": "https://audio.pealim.com/v0/15/15dggrlz3ui39.mp3", "audio_file": "כושר.mp3", @@ -1003336,7 +1003336,7 @@ "root": [], "pos": "Noun", "pos_hebrew": "שֵׁם עֶצֶם", - "meaning": "‍️ meditation", + "meaning": "meditation", "meaning_raw": "🧘‍♂️ meditation", "audio_url": "https://audio.pealim.com/v0/3w/3wecs1tujod3.mp3", "audio_file": "מדיטציה.mp3", @@ -1019869,7 +1019869,7 @@ ], "pos": "Noun", "pos_hebrew": "שֵׁם עֶצֶם", - "meaning": "️ recycling", + "meaning": "recycling", "meaning_raw": "♻️ recycling", "audio_url": "https://audio.pealim.com/v0/1y/1y0idioumf0oj.mp3", "audio_file": "מיחזור.mp3", @@ -1107382,7 +1107382,7 @@ "root": [], "pos": "Noun", "pos_hebrew": "שֵׁם עֶצֶם", - "meaning": "️ sunglasses", + "meaning": "sunglasses", "meaning_raw": "🕶️ sunglasses", "audio_url": "https://audio.pealim.com/v0/18/18evs0abnwrg0.mp3", "audio_file": "משקפי שמש.mp3", @@ -1384284,7 +1384284,7 @@ ], "pos": "Noun", "pos_hebrew": "שֵׁם עֶצֶם", - "meaning": "️ scissors", + "meaning": "scissors", "meaning_raw": "✂️ scissors", "audio_url": "https://audio.pealim.com/v0/gl/gllnqtowivaa.mp3", "audio_file": "מספריים.mp3", @@ -1451640,7 +1451640,7 @@ "root": [], "pos": "Noun", "pos_hebrew": "שֵׁם עֶצֶם", - "meaning": "️ spider", + "meaning": "spider", "meaning_raw": "🕷️ spider", "audio_url": "https://audio.pealim.com/v0/1u/1umo4y75g98kf.mp3", "audio_file": "עכביש.mp3", @@ -1451701,7 +1451701,7 @@ "root": [], "pos": "Noun", "pos_hebrew": "שֵׁם עֶצֶם", - "meaning": "mouse ️", + "meaning": "mouse", "meaning_raw": "mouse 🐀🖱️", "audio_url": "https://audio.pealim.com/v0/eu/euem3vbl9inl.mp3", "audio_file": "עכבר.mp3", @@ -1548300,7 +1548300,7 @@ "root": [], "pos": "Noun", "pos_hebrew": "שֵׁם עֶצֶם", - "meaning": "️ park", + "meaning": "park", "meaning_raw": "🏞️ park", "audio_url": "https://audio.pealim.com/v0/fb/fb2nrwhv75ib.mp3", "audio_file": "פארק.mp3", @@ -1597759,7 +1597759,7 @@ "root": [], "pos": "Noun", "pos_hebrew": "שֵׁם עֶצֶם", - "meaning": "️ pepper", + "meaning": "pepper", "meaning_raw": "🌶️ pepper", "audio_url": "https://audio.pealim.com/v0/1w/1wnt7hycmb6re.mp3", "audio_file": "פלפל.mp3", @@ -1891786,7 +1891786,7 @@ ], "pos": "Noun", "pos_hebrew": "שֵׁם עֶצֶם", - "meaning": "️ slide projector; radiator; barrel jacket (weaponry)", + "meaning": "slide projector; radiator; barrel jacket (weaponry)", "meaning_raw": "📽️ slide projector; radiator; barrel jacket (weaponry)", "audio_url": "https://audio.pealim.com/v0/ur/urcm907agh5r.mp3", "audio_file": "מקרן.mp3", diff --git a/pealim_list_scrape.py b/pealim_list_scrape.py index 5bb1190..011e8d5 100644 --- a/pealim_list_scrape.py +++ b/pealim_list_scrape.py @@ -82,7 +82,7 @@ BINYAN_HEBREW: dict[str, str] = { # Regex for extracting emoji characters EMOJI_RE = re.compile( - r"[\U0001F300-\U0001FFFF\U00002600-\U000027BF\U0001F000-\U0001F9FF\u2600-\u26FF\u2700-\u27BF]+", + r"[\U0001F300-\U0001FFFF\U00002600-\U000027BF\U0001F000-\U0001F9FF\u2600-\u26FF\u2700-\u27BF\uFE0E\uFE0F\u200D]+", re.UNICODE, ) diff --git a/release.py b/release.py index 082176b..bd7e306 100644 --- a/release.py +++ b/release.py @@ -24,7 +24,7 @@ sys.path.insert(0, "/home/node/projects") import load_keeshare REPO_API = "https://git.nevo.engineer/api/v1/repos/nevo/hebrew_flash_cards" -FORGEJO_TOKEN: str = load_keeshare.get_entry("git.nevo.engineer")["API_TOKEN"] +FORGEJO_TOKEN: str = load_keeshare.get_entry("git.nevo.engineer")["password"] OUTPUT_DIR = Path(__file__).parent / "output" # All deck variants to include in release