v0.15: PoS fix, slug-based audio, CSS cleanup, template improvements

- Fix PoS substring bug: "Pronoun" no longer matches "Noun"
- CSS: reduce sec-label/sec-key font sizes, add .definitions/.conf-entry
- Slug-based audio filenames for confusable words (no more collisions)
- Scraper captures slug from pealim.com list page links
- Confusables: RTL alignment, re-enable audio (remove all-must-have gate)
- Plurals: blue given word, gray meaning, labeled mishkal badge
- Conjugation: add "אֵיךְ אוֹמְרִים" prompt, tense prefix (בְּ),
  Prep field from HBPAREN_RE, labeled RelatedVocab
- Ben Yehuda: skip stripped fallback for confusable words
- Bump RELEASE_TAG to v0.15

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Sochen 2026-03-07 17:50:23 +00:00
parent 802c369365
commit 2e48109d7f
6 changed files with 9310 additions and 9157 deletions

View file

@ -39,7 +39,7 @@ COMPLETE_PLURAL_DECK_ID = 1_234_567_903
# Release version tag added to all notes so users can identify which release
# their cards come from (visible in Anki's Browse view and card info).
RELEASE_TAG = "v0.14"
RELEASE_TAG = "v0.15"
# Regex for extracting emoji and Hebrew prepositions from meaning strings
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
@ -212,16 +212,26 @@ CARD_CSS = """
color: #555;
}
.sec-label {
font-size: 32px;
font-size: 20px;
font-weight: normal;
color: #555;
direction: rtl;
text-align: center;
margin-top: 6px;
}
.sec-key {
font-size: 24px;
font-size: 18px;
color: #888;
}
.definitions {
direction: rtl;
text-align: center;
}
.conf-entry {
margin: 8px 0;
font-size: 20px;
direction: rtl;
}
.related-group {
direction: rtl;
text-align: right;
@ -241,6 +251,7 @@ CARD_CSS = """
.root-info { color: #aaa; }
.sec-label { color: #aaa; }
.sec-key { color: #666; }
.conf-entry { color: #ddd; }
.hint { color: #777; }
.voice-label { color: #888; }
.example { color: #bbb; border-right-color: #555; }
@ -361,19 +372,21 @@ VOCAB_MODEL = genanki.Model(
# ──────────────────────────────────────────────────────────────────────────────
CONJ_FRONT = """
<div class="sec-label">אֵיךְ אוֹמְרִים</div>
<div class="hebrew" style="color:#1a1a8c;">{{ReferenceForm}}{{#Prep}} ({{Prep}}){{/Prep}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
<div class="hebrew">{{Pronoun}}</div>
<div class="meaning" style="font-size:28px;">{{ReferenceForm}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
<div class="hebrew">{{Tense}}</div>
"""
CONJ_BACK = """
{{FrontSide}}<hr>
<div class="hebrew">{{ConjugatedForm}}</div>
<div class="hebrew">{{ConjugatedForm}}{{#Prep}} ({{Prep}}){{/Prep}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
{{#Meaning}}<div class="sec-label">{{Meaning}}</div>{{/Meaning}}
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span> {{Binyan}}</div>
{{#RelatedVocab}}<div class="sec-label">{{RelatedVocab}}</div>{{/RelatedVocab}}
{{#RelatedVocab}}<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
<div class="sec-label">{{RelatedVocab}}</div>{{/RelatedVocab}}
"""
CONJ_CSS = CARD_CSS
@ -393,6 +406,7 @@ CONJ_MODEL = genanki.Model(
{"name": "Audio"},
{"name": "Meaning"},
{"name": "RelatedVocab"},
{"name": "Prep"},
],
templates=[
{
@ -441,6 +455,14 @@ PAST_3P_EXPANSION = [
("הֵן", "עָבָר"),
]
# Tense labels with "בְּ" prefix for display on cards.
# Keys are the bare tense names; values are the same names with the prefix
# attached. The conjugation deck looks a tense up with
# TENSE_WITH_BE.get(tense, tense), so a tense missing from this table is
# displayed unchanged.
# NOTE(review): in the last entry the key is spelled with a yod after the
# tsadi while the value is not — presumably the key mirrors the spelling used
# in the source data; confirm this is intentional.
TENSE_WITH_BE = {
"עָבָר": "בֶּעָבָר",
"הוֹוֶה": "בַּהוֹוֶה",
"עָתִיד": "בֶּעָתִיד",
"צִיּוּוּי": "בַּצִּוּוּי",
}
# Voice field: passive label only (shown inline on card front for Pu'al/Huf'al)
VOICE_MAP = {
"Pu'al": "סָבִיל",
@ -453,8 +475,15 @@ VOICE_MAP = {
# ──────────────────────────────────────────────────────────────────────────────
def _audio_tag(word_no_nikkud: str, audio_dir: Path = AUDIO_DIR) -> str:
"""Return [sound:xxx.mp3] if audio file exists, else empty string."""
def _audio_tag(word_no_nikkud: str, audio_dir: Path = AUDIO_DIR, slug: str = "") -> str:
"""Return [sound:xxx.mp3] if audio file exists, else empty string.
Tries slug-based filename first (for confusable words), then consonant-based.
"""
if slug:
slug_path = audio_dir / f"{slug}.mp3"
if slug_path.exists():
return f"[sound:{slug_path.name}]"
safe = re.sub(r"[^\u05d0-\u05ea]", "", word_no_nikkud)
if not safe:
return ""
@ -651,8 +680,9 @@ def _load_emoji_lookup() -> dict[str, str]:
def _translate_pos(pos_str: str) -> str:
"""Translate PoS string to Hebrew. For verbs, appends binyan."""
base = pos_str.split("")[0].split("")[0].strip()
for eng, heb in POS_TO_HEBREW.items():
if eng.lower() in pos_str.lower():
if base == eng:
if eng == "Verb":
# Extract binyan from strings like "Verb Pi'el" or "Verb pi'el"
for binyan_eng, binyan_heb in BINYAN_TO_HEBREW.items():
@ -932,18 +962,20 @@ def build_vocab_deck(
# Eng→Heb disambiguation hint (PoS + binyan, shown only for ambiguous meanings)
hint_str = _word_meaning_hints.get((word, meaning), "")
# Audio
audio_tag = _audio_tag(word_no_nik) if include_audio else ""
# Consonant-only form for confusable detection and cloze matching
word_consonants = _strip_nikkud(word)
is_confusable = word_consonants in _confusable_words
# Audio — use slug-based filename for confusable words
slug_val = str(row.get("slug", "")).strip()
slug_val = "" if slug_val in ("nan", "None") else slug_val
audio_tag = _audio_tag(word_no_nik, slug=slug_val if is_confusable else "") if include_audio else ""
if audio_tag:
mp3_name = audio_tag.removeprefix("[sound:").removesuffix("]")
mp3_path = AUDIO_DIR / mp3_name
if mp3_path not in media_files:
media_files.append(mp3_path)
# Consonant-only form for confusable detection and cloze matching
word_consonants = _strip_nikkud(word)
is_confusable = word_consonants in _confusable_words
# Example sentences — priority: EPUB (nikkud'd) > Ben Yehuda > none
# For confusable words (same consonants, different nikkud), only match by
# exact nikkud form to avoid showing wrong-word sentences.
@ -1137,6 +1169,12 @@ def build_conj_deck(
or verb_meaning.get(infinitive, "")
or verb_meaning.get(_strip_nikkud(infinitive), "")
)
# Extract Hebrew preposition from meaning (e.g., "(על)" → prep_str)
prep_str = ""
if meaning:
preps = HBPAREN_RE.findall(meaning)
prep_str = " ".join(f"({p})" for p in preps)
related = [w for w in root_words.get(root, []) if w != infinitive]
related_str = " ".join(related[:8]) if related else ""
forms = data["forms"]
@ -1154,10 +1192,13 @@ def build_conj_deck(
_voice: str = voice,
_meaning: str = meaning,
_related_str: str = related_str,
_prep_str: str = prep_str,
) -> None:
nonlocal note_count
if not conj_form or not re.search(r"[\u05d0-\u05ea]", conj_form):
return
# Apply tense prefix (בְּ)
display_tense = TENSE_WITH_BE.get(tense, tense)
note = genanki.Note(
model=CONJ_MODEL,
guid=genanki.guid_for(_infinitive, pronoun, tense),
@ -1165,7 +1206,7 @@ def build_conj_deck(
_infinitive,
_ref_form,
pronoun,
tense,
display_tense,
conj_form,
_root,
_binyan_heb,
@ -1173,6 +1214,7 @@ def build_conj_deck(
audio_tag,
_meaning,
_related_str,
_prep_str,
],
tags=[RELEASE_TAG],
)
@ -1245,7 +1287,7 @@ def build_conj_deck(
CONF_FRONT = """
<div class="hebrew" style="font-size:36px;">{{Words}}</div>
<div class="meaning" style="font-size:32px;">מה ההבדל?</div>
<div class="meaning" style="font-size:32px; direction:rtl; text-align:center;">מה ההבדל?</div>
"""
CONF_BACK = """
@ -1293,6 +1335,15 @@ def build_confusables_deck(
media_files: list[Path] = []
note_count = 0
# Build slug lookup: word (nikkud) → slug
slug_lookup: dict[str, str] = {}
if "slug" in df.columns:
for _, row in df.iterrows():
w = str(row.get("Word", "")).strip()
s = str(row.get("slug", "")).strip()
if w and s and s not in ("nan", "None"):
slug_lookup[w] = s
# Group by Word Without Nikkud
groups = {}
for _, row in df.iterrows():
@ -1326,7 +1377,6 @@ def build_confusables_deck(
words_display = " / ".join(w for w, _, _ in unique_entries)
defs_parts = []
audio_parts = []
all_have_audio = True
for w, m, p in unique_entries:
pos_label = f" ({p})" if p else ""
defs_parts.append(
@ -1334,19 +1384,14 @@ def build_confusables_deck(
f" = {m}{pos_label}</div>"
)
if include_audio:
at = _audio_tag(_strip_nikkud(w))
slug = slug_lookup.get(w, "")
at = _audio_tag(_strip_nikkud(w), slug=slug)
if at and at not in audio_parts:
audio_parts.append(at)
mp3_name = at.removeprefix("[sound:").removesuffix("]")
mp3_path = AUDIO_DIR / mp3_name
if mp3_path not in media_files:
media_files.append(mp3_path)
else:
all_have_audio = False
# Only include audio if every word in the group has it
if not all_have_audio:
audio_parts = []
defs_html = "\n".join(defs_parts)
audio_html = " ".join(audio_parts)
@ -1382,9 +1427,9 @@ def write_conf_apkg(
# ──────────────────────────────────────────────────────────────────────────────
PLURAL_FRONT_SG = """
<div class="hebrew">{{Singular}}</div>
<div class="hebrew" style="color:#1a1a8c;">{{Singular}}</div>
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
<div class="meaning">{{Meaning}}</div>
<div class="sec-label">{{Meaning}}</div>
<div class="hint" style="font-size:28px;">יָחִיד רַבִּים</div>
"""
@ -1392,11 +1437,11 @@ PLURAL_BACK_SG = """
{{FrontSide}}<hr>
<div class="hebrew">{{Plural}}</div>
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
{{#Mishkal}}<div class="freq-badge">{{Mishkal}}</div>{{/Mishkal}}
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
"""
PLURAL_FRONT_PL = """
<div class="hebrew">{{Plural}}</div>
<div class="hebrew" style="color:#1a1a8c;">{{Plural}}</div>
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
<div class="hint" style="font-size:28px;">רַבִּים יָחִיד</div>
"""
@ -1405,8 +1450,8 @@ PLURAL_BACK_PL = """
{{FrontSide}}<hr>
<div class="hebrew">{{Singular}}</div>
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
<div class="meaning">{{Meaning}}</div>
{{#Mishkal}}<div class="freq-badge">{{Mishkal}}</div>{{/Mishkal}}
<div class="sec-label">{{Meaning}}</div>
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
"""
PLURAL_CSS = CARD_CSS

View file

@ -131,13 +131,15 @@ def save_examples_cache() -> None:
logger.info(f"Examples cache saved: {len(_examples_cache)} entries → {EXAMPLES_CACHE_PATH}")
def get_examples(word_nikkud: str) -> list[str]:
def get_examples(word_nikkud: str, confusable_consonants: set[str] | None = None) -> list[str]:
"""
Return 0 or 1 example sentences for the given word (nikkud form).
Lookup strategy:
1. Try exact nikkud match in index.
2. Fall back to stripped (no-nikkud) match against index keys.
Skipped when word's consonants are in confusable_consonants set
(to avoid returning sentences for the wrong homograph).
Returns the single longest sentence ≤ MAX_SENTENCE_LEN that contains
the word as a whole token.
@ -155,7 +157,7 @@ def get_examples(word_nikkud: str) -> list[str]:
# Lookup: try exact nikkud first, then stripped fallback
candidates = _index.get(word, [])
if not candidates and word_stripped:
if not candidates and word_stripped and word_stripped not in (confusable_consonants or set()):
# Try looking up by stripped form across index keys
for k, v in _index.items():
if _strip_nikkud(k) == word_stripped:

File diff suppressed because it is too large Load diff

View file

@ -5,6 +5,7 @@ Scrapes word entries, roots, parts of speech, and audio URLs for Anki flashcards
"""
import logging
import re
import time
import pandas as pd
@ -41,7 +42,7 @@ def get_total_pages() -> int:
def _parse_page_with_audio(html_bytes: bytes) -> list[dict]:
"""
Parse a dict page with BeautifulSoup to extract word data + audio URL.
Returns list of dicts with keys: Word, Root, Part of Speech, Meaning, audio_url.
Returns list of dicts with keys: Word, Root, Part of Speech, Meaning, audio_url, slug.
"""
soup = BeautifulSoup(html_bytes, "html.parser")
rows = []
@ -52,6 +53,13 @@ def _parse_page_with_audio(html_bytes: bytes) -> list[dict]:
# Audio URL from span[data-audio] in first td
audio_span = tds[0].find(attrs={"data-audio": True})
audio_url = audio_span["data-audio"] if audio_span else ""
# Slug from the detail page link (e.g., /dict/6009-av/ → 6009-av)
slug = ""
link = tds[0].find("a", href=True)
if link:
m = re.search(r"/dict/([^/]+)/", link["href"])
if m:
slug = m.group(1)
# Word with nikkud
menukad = tds[0].find("span", class_="menukad")
word = menukad.get_text(strip=True) if menukad else tds[0].get_text(strip=True)
@ -69,6 +77,7 @@ def _parse_page_with_audio(html_bytes: bytes) -> list[dict]:
"Part of Speech": pos,
"Meaning": meaning,
"audio_url": audio_url,
"slug": slug,
}
)
return rows

44
run.py
View file

@ -136,12 +136,35 @@ def step_examples(args, freq_cache: dict):
if args.test:
df = df.head(args.test)
# Build confusable consonant set from CSV
consonant_counts: dict[str, int] = {}
for _, row in df.iterrows():
word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
if word_no_nik and word_no_nik not in ("nan", "None"):
safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_no_nik))
if safe:
consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
# Delete stale cache entries for confusable words so they get re-fetched
stale_deleted = 0
for _, row in df.iterrows():
word_nikkud = str(row.get("Word", "")).strip()
word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
if word_nikkud and word_no_nik:
safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_no_nik))
if safe in confusable_consonants and word_nikkud in benyehuda._examples_cache:
del benyehuda._examples_cache[word_nikkud]
stale_deleted += 1
if stale_deleted:
logger.info(f" Deleted {stale_deleted} stale confusable cache entries")
logger.info(f" Pre-fetching examples for {len(df)} words …")
for _, row in df.iterrows():
# Use nikkud word form as primary key (nikkud corpus)
word_nikkud = str(row.get("Word", "")).strip()
if word_nikkud:
benyehuda.get_examples(word_nikkud)
benyehuda.get_examples(word_nikkud, confusable_consonants=confusable_consonants)
except Exception as e:
logger.warning(f" Could not pre-fetch all examples: {e}")
@ -184,6 +207,17 @@ def step_audio(args):
if args.test:
df = df.head(args.test)
# Build confusable set: consonant forms that appear more than once
confusable_consonants: set[str] = set()
consonant_counts: dict[str, int] = {}
for _, row in df.iterrows():
word_plain = str(row.get("Word Without Nikkud", "")).strip()
if word_plain and word_plain not in ("nan", "None"):
safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_plain))
if safe:
consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
AUDIO_DIR.mkdir(parents=True, exist_ok=True)
downloaded = 0
skipped = 0
@ -193,6 +227,7 @@ def step_audio(args):
word = str(row.get("Word", "")).strip()
word_plain = str(row.get("Word Without Nikkud", "")).strip()
audio_url = str(row.get("audio_url", "")).strip()
slug = str(row.get("slug", "")).strip()
if not word:
continue
@ -200,7 +235,12 @@ def step_audio(args):
safe_name = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_plain or word))
if not safe_name:
continue
mp3_path = AUDIO_DIR / f"{safe_name}.mp3"
# Confusable words: use slug-based filename to avoid collisions
if safe_name in confusable_consonants and slug and slug not in ("nan", "None"):
mp3_path = AUDIO_DIR / f"{slug}.mp3"
else:
mp3_path = AUDIO_DIR / f"{safe_name}.mp3"
if mp3_path.exists():
skipped += 1

57
scripts/add_slugs.py Normal file
View file

@ -0,0 +1,57 @@
#!/usr/bin/env python3
"""One-time script: scrape slugs from pealim.com dict pages and add to CSV."""
import logging
import re
import sys
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s", stream=sys.stderr)
logger = logging.getLogger()

# Captures the entry identifier from a detail-page link (/dict/<slug>/).
_SLUG_IN_HREF = re.compile(r"/dict/([^/]+)/")


def _iter_word_slugs(html):
    """Lazily yield (word, slug) pairs found on one dictionary list page.

    Table rows with fewer than four cells are ignored. The word is taken from
    the ``menukad`` span of the first cell when present, otherwise from the
    whole first cell; the slug comes from the first link in that cell. Rows
    where either value ends up empty are not yielded.
    """
    for table_row in BeautifulSoup(html, "html.parser").select("table tr"):
        cells = table_row.find_all("td")
        if len(cells) < 4:
            continue
        head_cell = cells[0]
        vocalized = head_cell.find("span", class_="menukad")
        text_source = head_cell if vocalized is None else vocalized
        entry_word = text_source.get_text(strip=True)

        entry_slug = ""
        anchor = head_cell.find("a", href=True)
        if anchor:
            hit = _SLUG_IN_HREF.search(anchor["href"])
            if hit:
                entry_slug = hit.group(1)

        if entry_word and entry_slug:
            yield entry_word, entry_slug


dict_csv = "data/hebrew_dict_for_anki.csv"
df = pd.read_csv(dict_csv, sep=";", index_col=0)
logger.info(f"Loaded {len(df)} rows")

session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-scraper/1.0)"})

word_slug_map: dict[str, str] = {}
total_pages = 608
# Same display-preference cookies are sent with every list-page request.
page_cookies = {"translit": "none", "hebstyle": "mo"}

for page_num in range(1, total_pages + 1):
    try:
        page = session.get(
            f"https://www.pealim.com/dict/?page={page_num}",
            cookies=page_cookies,
            timeout=10,
        )
        page.raise_for_status()
        # Entries are stored as they are parsed, so a failure part-way through
        # a page keeps whatever was already collected from it.
        for entry_word, entry_slug in _iter_word_slugs(page.content):
            word_slug_map[entry_word] = entry_slug
    except Exception as e:
        # Best effort: a failed page is logged and the crawl moves on.
        logger.warning(f"Page {page_num} failed: {e}")

    if page_num % 50 == 0:
        logger.info(f"Scraped {page_num}/{total_pages} pages ({len(word_slug_map)} slugs)")
    # Pause between requests.
    time.sleep(0.8)

# Words without a scraped slug get an empty string; the CSV is rewritten in place.
df["slug"] = df["Word"].map(word_slug_map).fillna("")
df.to_csv(dict_csv, sep=";", index=True)
matched = (df["slug"] != "").sum()
logger.info(f"Done. {matched}/{len(df)} words have slugs. Saved → {dict_csv}")