v0.15: PoS fix, slug-based audio, CSS cleanup, template improvements

- Fix PoS substring bug: "Pronoun" no longer matches "Noun"
- CSS: reduce sec-label/sec-key font sizes, add .definitions/.conf-entry
- Slug-based audio filenames for confusable words (no more collisions)
- Scraper captures slug from pealim.com list page links
- Confusables: RTL alignment, re-enable audio (remove all-must-have gate)
- Plurals: blue given word, gray meaning, labeled mishkal badge
- Conjugation: add "אֵיךְ אוֹמְרִים" prompt, tense prefix (בְּ), Prep field from HBPAREN_RE, labeled RelatedVocab
- Ben Yehuda: skip stripped fallback for confusable words
- Bump RELEASE_TAG to v0.15

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-07 17:50:23 +00:00 · 2026-03-07 17:50:23 +00:00 · 2e48109d7f
commit 2e48109d7f
parent 802c369365
6 changed files with 9310 additions and 9157 deletions
--- a/apkg_builder.py
+++ b/apkg_builder.py
@ -39,7 +39,7 @@ COMPLETE_PLURAL_DECK_ID = 1_234_567_903

 # Release version tag added to all notes so users can identify which release
 # their cards come from (visible in Anki's Browse view and card info).
-RELEASE_TAG = "v0.14"
+RELEASE_TAG = "v0.15"

 # Regex for extracting emoji and Hebrew prepositions from meaning strings
 EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
@ -212,16 +212,26 @@ CARD_CSS = """
  color: #555;
 }
 .sec-label {
-  font-size: 32px;
+  font-size: 20px;
+  font-weight: normal;
  color: #555;
  direction: rtl;
  text-align: center;
  margin-top: 6px;
 }
 .sec-key {
-  font-size: 24px;
+  font-size: 18px;
  color: #888;
 }
+.definitions {
+  direction: rtl;
+  text-align: center;
+}
+.conf-entry {
+  margin: 8px 0;
+  font-size: 20px;
+  direction: rtl;
+}
 .related-group {
  direction: rtl;
  text-align: right;
@ -241,6 +251,7 @@ CARD_CSS = """
  .root-info   { color: #aaa; }
  .sec-label   { color: #aaa; }
  .sec-key     { color: #666; }
+  .conf-entry  { color: #ddd; }
  .hint        { color: #777; }
  .voice-label { color: #888; }
  .example     { color: #bbb; border-right-color: #555; }
@ -361,19 +372,21 @@ VOCAB_MODEL = genanki.Model(
 # ──────────────────────────────────────────────────────────────────────────────

 CONJ_FRONT = """
+<div class="sec-label">אֵיךְ אוֹמְרִים</div>
+<div class="hebrew" style="color:#1a1a8c;">{{ReferenceForm}}{{#Prep}} ({{Prep}}){{/Prep}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
 <div class="hebrew">{{Pronoun}}</div>
-<div class="meaning" style="font-size:28px;">{{ReferenceForm}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
 <div class="hebrew">{{Tense}}</div>
 """

 CONJ_BACK = """
 {{FrontSide}}<hr>
-<div class="hebrew">{{ConjugatedForm}}</div>
+<div class="hebrew">{{ConjugatedForm}}{{#Prep}} ({{Prep}}){{/Prep}}</div>
 {{#Audio}}<div>{{Audio}}</div>{{/Audio}}
 {{#Meaning}}<div class="sec-label">{{Meaning}}</div>{{/Meaning}}
 <div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>
 <div class="sec-label"><span class="sec-key">בִּנְיָן:</span> {{Binyan}}</div>
-{{#RelatedVocab}}<div class="sec-label">{{RelatedVocab}}</div>{{/RelatedVocab}}
+{{#RelatedVocab}}<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
+<div class="sec-label">{{RelatedVocab}}</div>{{/RelatedVocab}}
 """

 CONJ_CSS = CARD_CSS
@ -393,6 +406,7 @@ CONJ_MODEL = genanki.Model(
        {"name": "Audio"},
        {"name": "Meaning"},
        {"name": "RelatedVocab"},
+        {"name": "Prep"},
    ],
    templates=[
        {
@ -441,6 +455,14 @@ PAST_3P_EXPANSION = [
    ("הֵן", "עָבָר"),
 ]

+# Tense labels with "בְּ" prefix for display on cards
+TENSE_WITH_BE = {
+    "עָבָר": "בֶּעָבָר",
+    "הוֹוֶה": "בַּהוֹוֶה",
+    "עָתִיד": "בֶּעָתִיד",
+    "צִיּוּוּי": "בַּצִּוּוּי",
+}
+
 # Voice field: passive label only (shown inline on card front for Pu'al/Huf'al)
 VOICE_MAP = {
    "Pu'al": "סָבִיל",
@ -453,8 +475,15 @@ VOICE_MAP = {
 # ──────────────────────────────────────────────────────────────────────────────


-def _audio_tag(word_no_nikkud: str, audio_dir: Path = AUDIO_DIR) -> str:
-    """Return [sound:xxx.mp3] if audio file exists, else empty string."""
+def _audio_tag(word_no_nikkud: str, audio_dir: Path = AUDIO_DIR, slug: str = "") -> str:
+    """Return [sound:xxx.mp3] if audio file exists, else empty string.
+
+    Tries slug-based filename first (for confusable words), then consonant-based.
+    """
+    if slug:
+        slug_path = audio_dir / f"{slug}.mp3"
+        if slug_path.exists():
+            return f"[sound:{slug_path.name}]"
    safe = re.sub(r"[^\u05d0-\u05ea]", "", word_no_nikkud)
    if not safe:
        return ""
@ -651,8 +680,9 @@ def _load_emoji_lookup() -> dict[str, str]:

 def _translate_pos(pos_str: str) -> str:
    """Translate PoS string to Hebrew. For verbs, appends binyan."""
+    base = pos_str.split("–")[0].split("—")[0].strip()
    for eng, heb in POS_TO_HEBREW.items():
-        if eng.lower() in pos_str.lower():
+        if base == eng:
            if eng == "Verb":
                # Extract binyan from strings like "Verb – Pi'el" or "Verb –pi'el"
                for binyan_eng, binyan_heb in BINYAN_TO_HEBREW.items():
@ -932,18 +962,20 @@ def build_vocab_deck(
        # Eng→Heb disambiguation hint (PoS + binyan, shown only for ambiguous meanings)
        hint_str = _word_meaning_hints.get((word, meaning), "")

-        # Audio
-        audio_tag = _audio_tag(word_no_nik) if include_audio else ""
+        # Consonant-only form for confusable detection and cloze matching
+        word_consonants = _strip_nikkud(word)
+        is_confusable = word_consonants in _confusable_words
+
+        # Audio — use slug-based filename for confusable words
+        slug_val = str(row.get("slug", "")).strip()
+        slug_val = "" if slug_val in ("nan", "None") else slug_val
+        audio_tag = _audio_tag(word_no_nik, slug=slug_val if is_confusable else "") if include_audio else ""
        if audio_tag:
            mp3_name = audio_tag.removeprefix("[sound:").removesuffix("]")
            mp3_path = AUDIO_DIR / mp3_name
            if mp3_path not in media_files:
                media_files.append(mp3_path)

-        # Consonant-only form for confusable detection and cloze matching
-        word_consonants = _strip_nikkud(word)
-        is_confusable = word_consonants in _confusable_words
-
        # Example sentences — priority: EPUB (nikkud'd) > Ben Yehuda > none
        # For confusable words (same consonants, different nikkud), only match by
        # exact nikkud form to avoid showing wrong-word sentences.
@ -1137,6 +1169,12 @@ def build_conj_deck(
            or verb_meaning.get(infinitive, "")
            or verb_meaning.get(_strip_nikkud(infinitive), "")
        )
+        # Extract Hebrew preposition from meaning (e.g., "(על)" → prep_str)
+        prep_str = ""
+        if meaning:
+            preps = HBPAREN_RE.findall(meaning)
+            prep_str = " ".join(f"({p})" for p in preps)
+
        related = [w for w in root_words.get(root, []) if w != infinitive]
        related_str = " ".join(related[:8]) if related else ""
        forms = data["forms"]
@ -1154,10 +1192,13 @@ def build_conj_deck(
            _voice: str = voice,
            _meaning: str = meaning,
            _related_str: str = related_str,
+            _prep_str: str = prep_str,
        ) -> None:
            nonlocal note_count
            if not conj_form or not re.search(r"[\u05d0-\u05ea]", conj_form):
                return
+            # Apply tense prefix (בְּ)
+            display_tense = TENSE_WITH_BE.get(tense, tense)
            note = genanki.Note(
                model=CONJ_MODEL,
                guid=genanki.guid_for(_infinitive, pronoun, tense),
@ -1165,7 +1206,7 @@ def build_conj_deck(
                    _infinitive,
                    _ref_form,
                    pronoun,
-                    tense,
+                    display_tense,
                    conj_form,
                    _root,
                    _binyan_heb,
@ -1173,6 +1214,7 @@ def build_conj_deck(
                    audio_tag,
                    _meaning,
                    _related_str,
+                    _prep_str,
                ],
                tags=[RELEASE_TAG],
            )
@ -1245,7 +1287,7 @@ def build_conj_deck(

 CONF_FRONT = """
 <div class="hebrew" style="font-size:36px;">{{Words}}</div>
-<div class="meaning" style="font-size:32px;">מה ההבדל?</div>
+<div class="meaning" style="font-size:32px; direction:rtl; text-align:center;">מה ההבדל?</div>
 """

 CONF_BACK = """
@ -1293,6 +1335,15 @@ def build_confusables_deck(
    media_files: list[Path] = []
    note_count = 0

+    # Build slug lookup: word (nikkud) → slug
+    slug_lookup: dict[str, str] = {}
+    if "slug" in df.columns:
+        for _, row in df.iterrows():
+            w = str(row.get("Word", "")).strip()
+            s = str(row.get("slug", "")).strip()
+            if w and s and s not in ("nan", "None"):
+                slug_lookup[w] = s
+
    # Group by Word Without Nikkud
    groups = {}
    for _, row in df.iterrows():
@ -1326,7 +1377,6 @@ def build_confusables_deck(
        words_display = " / ".join(w for w, _, _ in unique_entries)
        defs_parts = []
        audio_parts = []
-        all_have_audio = True
        for w, m, p in unique_entries:
            pos_label = f" ({p})" if p else ""
            defs_parts.append(
@ -1334,19 +1384,14 @@ def build_confusables_deck(
                f" = {m}{pos_label}</div>"
            )
            if include_audio:
-                at = _audio_tag(_strip_nikkud(w))
+                slug = slug_lookup.get(w, "")
+                at = _audio_tag(_strip_nikkud(w), slug=slug)
                if at and at not in audio_parts:
                    audio_parts.append(at)
                    mp3_name = at.removeprefix("[sound:").removesuffix("]")
                    mp3_path = AUDIO_DIR / mp3_name
                    if mp3_path not in media_files:
                        media_files.append(mp3_path)
-                else:
-                    all_have_audio = False
-
-        # Only include audio if every word in the group has it
-        if not all_have_audio:
-            audio_parts = []

        defs_html = "\n".join(defs_parts)
        audio_html = " ".join(audio_parts)
@ -1382,9 +1427,9 @@ def write_conf_apkg(
 # ──────────────────────────────────────────────────────────────────────────────

 PLURAL_FRONT_SG = """
-<div class="hebrew">{{Singular}}</div>
+<div class="hebrew" style="color:#1a1a8c;">{{Singular}}</div>
 {{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
-<div class="meaning">{{Meaning}}</div>
+<div class="sec-label">{{Meaning}}</div>
 <div class="hint" style="font-size:28px;">יָחִיד ← רַבִּים</div>
 """

@ -1392,11 +1437,11 @@ PLURAL_BACK_SG = """
 {{FrontSide}}<hr>
 <div class="hebrew">{{Plural}}</div>
 {{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
-{{#Mishkal}}<div class="freq-badge">{{Mishkal}}</div>{{/Mishkal}}
+{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
 """

 PLURAL_FRONT_PL = """
-<div class="hebrew">{{Plural}}</div>
+<div class="hebrew" style="color:#1a1a8c;">{{Plural}}</div>
 {{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
 <div class="hint" style="font-size:28px;">רַבִּים ← יָחִיד</div>
 """
@ -1405,8 +1450,8 @@ PLURAL_BACK_PL = """
 {{FrontSide}}<hr>
 <div class="hebrew">{{Singular}}</div>
 {{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
-<div class="meaning">{{Meaning}}</div>
-{{#Mishkal}}<div class="freq-badge">{{Mishkal}}</div>{{/Mishkal}}
+<div class="sec-label">{{Meaning}}</div>
+{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
 """

 PLURAL_CSS = CARD_CSS
--- a/benyehuda.py
+++ b/benyehuda.py
@ -131,13 +131,15 @@ def save_examples_cache() -> None:
    logger.info(f"Examples cache saved: {len(_examples_cache)} entries → {EXAMPLES_CACHE_PATH}")


-def get_examples(word_nikkud: str) -> list[str]:
+def get_examples(word_nikkud: str, confusable_consonants: set[str] | None = None) -> list[str]:
    """
    Return 0 or 1 example sentences for the given word (nikkud form).

    Lookup strategy:
    1. Try exact nikkud match in index.
    2. Fall back to stripped (no-nikkud) match against index keys.
+       Skipped when word's consonants are in confusable_consonants set
+       (to avoid returning sentences for the wrong homograph).

    Returns the single longest sentence ≤ MAX_SENTENCE_LEN that contains
    the word as a whole token.
@ -155,7 +157,7 @@ def get_examples(word_nikkud: str) -> list[str]:

    # Lookup: try exact nikkud first, then stripped fallback
    candidates = _index.get(word, [])
-    if not candidates and word_stripped:
+    if not candidates and word_stripped and word_stripped not in (confusable_consonants or set()):
        # Try looking up by stripped form across index keys
        for k, v in _index.items():
            if _strip_nikkud(k) == word_stripped:
--- a/data/hebrew_dict_for_anki.csv
+++ b/data/hebrew_dict_for_anki.csv
--- a/hebrew_extract.py
+++ b/hebrew_extract.py
@ -5,6 +5,7 @@ Scrapes word entries, roots, parts of speech, and audio URLs for Anki flashcards
 """

 import logging
+import re
 import time

 import pandas as pd
@ -41,7 +42,7 @@ def get_total_pages() -> int:
 def _parse_page_with_audio(html_bytes: bytes) -> list[dict]:
    """
    Parse a dict page with BeautifulSoup to extract word data + audio URL.
-    Returns list of dicts with keys: Word, Root, Part of Speech, Meaning, audio_url.
+    Returns list of dicts with keys: Word, Root, Part of Speech, Meaning, audio_url, slug.
    """
    soup = BeautifulSoup(html_bytes, "html.parser")
    rows = []
@ -52,6 +53,13 @@ def _parse_page_with_audio(html_bytes: bytes) -> list[dict]:
        # Audio URL from span[data-audio] in first td
        audio_span = tds[0].find(attrs={"data-audio": True})
        audio_url = audio_span["data-audio"] if audio_span else ""
+        # Slug from the detail page link (e.g., /dict/6009-av/ → 6009-av)
+        slug = ""
+        link = tds[0].find("a", href=True)
+        if link:
+            m = re.search(r"/dict/([^/]+)/", link["href"])
+            if m:
+                slug = m.group(1)
        # Word with nikkud
        menukad = tds[0].find("span", class_="menukad")
        word = menukad.get_text(strip=True) if menukad else tds[0].get_text(strip=True)
@ -69,6 +77,7 @@ def _parse_page_with_audio(html_bytes: bytes) -> list[dict]:
                    "Part of Speech": pos,
                    "Meaning": meaning,
                    "audio_url": audio_url,
+                    "slug": slug,
                }
            )
    return rows
--- a/run.py
+++ b/run.py
@ -136,12 +136,35 @@ def step_examples(args, freq_cache: dict):
        if args.test:
            df = df.head(args.test)

+        # Build confusable consonant set from CSV
+        consonant_counts: dict[str, int] = {}
+        for _, row in df.iterrows():
+            word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
+            if word_no_nik and word_no_nik not in ("nan", "None"):
+                safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_no_nik))
+                if safe:
+                    consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
+        confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
+
+        # Delete stale cache entries for confusable words so they get re-fetched
+        stale_deleted = 0
+        for _, row in df.iterrows():
+            word_nikkud = str(row.get("Word", "")).strip()
+            word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
+            if word_nikkud and word_no_nik:
+                safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_no_nik))
+                if safe in confusable_consonants and word_nikkud in benyehuda._examples_cache:
+                    del benyehuda._examples_cache[word_nikkud]
+                    stale_deleted += 1
+        if stale_deleted:
+            logger.info(f"    Deleted {stale_deleted} stale confusable cache entries")
+
        logger.info(f"    Pre-fetching examples for {len(df)} words …")
        for _, row in df.iterrows():
            # Use nikkud word form as primary key (nikkud corpus)
            word_nikkud = str(row.get("Word", "")).strip()
            if word_nikkud:
-                benyehuda.get_examples(word_nikkud)
+                benyehuda.get_examples(word_nikkud, confusable_consonants=confusable_consonants)

    except Exception as e:
        logger.warning(f"    Could not pre-fetch all examples: {e}")
@ -184,6 +207,17 @@ def step_audio(args):
        if args.test:
            df = df.head(args.test)

+        # Build confusable set: consonant forms that appear more than once
+        confusable_consonants: set[str] = set()
+        consonant_counts: dict[str, int] = {}
+        for _, row in df.iterrows():
+            word_plain = str(row.get("Word Without Nikkud", "")).strip()
+            if word_plain and word_plain not in ("nan", "None"):
+                safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_plain))
+                if safe:
+                    consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
+        confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
+
        AUDIO_DIR.mkdir(parents=True, exist_ok=True)
        downloaded = 0
        skipped = 0
@ -193,6 +227,7 @@ def step_audio(args):
            word = str(row.get("Word", "")).strip()
            word_plain = str(row.get("Word Without Nikkud", "")).strip()
            audio_url = str(row.get("audio_url", "")).strip()
+            slug = str(row.get("slug", "")).strip()

            if not word:
                continue
@ -200,7 +235,12 @@ def step_audio(args):
            safe_name = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_plain or word))
            if not safe_name:
                continue
-            mp3_path = AUDIO_DIR / f"{safe_name}.mp3"
+
+            # Confusable words: use slug-based filename to avoid collisions
+            if safe_name in confusable_consonants and slug and slug not in ("nan", "None"):
+                mp3_path = AUDIO_DIR / f"{slug}.mp3"
+            else:
+                mp3_path = AUDIO_DIR / f"{safe_name}.mp3"

            if mp3_path.exists():
                skipped += 1
--- a/scripts/add_slugs.py
+++ b/scripts/add_slugs.py
@ -0,0 +1,57 @@
+#!/usr/bin/env python3
+"""One-time script: scrape slugs from pealim.com dict pages and add to CSV."""
+
+import logging
+import re
+import sys
+import time
+
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s", stream=sys.stderr)
+logger = logging.getLogger()
+
+dict_csv = "data/hebrew_dict_for_anki.csv"
+df = pd.read_csv(dict_csv, sep=";", index_col=0)
+logger.info(f"Loaded {len(df)} rows")
+
+session = requests.Session()
+session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-scraper/1.0)"})
+
+word_slug_map: dict[str, str] = {}
+total_pages = 608
+
+for page_num in range(1, total_pages + 1):
+    url = f"https://www.pealim.com/dict/?page={page_num}"
+    cookies = {"translit": "none", "hebstyle": "mo"}
+    try:
+        resp = session.get(url, cookies=cookies, timeout=10)
+        resp.raise_for_status()
+        soup = BeautifulSoup(resp.content, "html.parser")
+        for tr in soup.select("table tr"):
+            tds = tr.find_all("td")
+            if len(tds) < 4:
+                continue
+            menukad = tds[0].find("span", class_="menukad")
+            word = menukad.get_text(strip=True) if menukad else tds[0].get_text(strip=True)
+            link = tds[0].find("a", href=True)
+            slug = ""
+            if link:
+                m = re.search(r"/dict/([^/]+)/", link["href"])
+                if m:
+                    slug = m.group(1)
+            if word and slug:
+                word_slug_map[word] = slug
+    except Exception as e:
+        logger.warning(f"Page {page_num} failed: {e}")
+
+    if page_num % 50 == 0:
+        logger.info(f"Scraped {page_num}/{total_pages} pages ({len(word_slug_map)} slugs)")
+    time.sleep(0.8)
+
+df["slug"] = df["Word"].map(word_slug_map).fillna("")
+df.to_csv(dict_csv, sep=";", index=True)
+matched = (df["slug"] != "").sum()
+logger.info(f"Done. {matched}/{len(df)} words have slugs. Saved → {dict_csv}")