feat: Sprint 3 — verb validation, Heebo font, images, card fixes

Verb validation:
- Add validate_verb_list.py: queries pealim.com for all 70 entries in
  nevo_typed_verbs_from_modern_hebrew, classifies as OK/3ms/REVIEW/NOT_FOUND,
  writes cleaned verbs_input.txt with structured sections and REVIEW flags
- New verbs_input.txt: 51 verified verbs, 15 Pu'al/Huf'al 3ms-past forms,
  4 REVIEW entries flagged for manual correction before conjugation re-run

Card fixes (apkg_builder.py):
- Conjugation card: binyan field now in Hebrew (פָּעַל/נִפְעַל etc.) via
  BINYAN_TO_HEBREW map; root and binyan on separate lines in CONJ_BACK template
- Vocabulary card: remove "דוגמה:" label (keep right-border quote styling)
- Related-words: "Other" category shown unlabeled (no spurious Hebrew header)
- Frequency: unlisted words (not in 50k corpus) now display "50k+" badge
- Add Image field to VOCAB_MODEL and templates ({{#Image}}<img>{{/Image}})
- Diagnostic logging: unlisted word count and uncategorized related-words count

Hebrew font (Heebo):
- Download Heebo variable font TTF from Google Fonts GitHub → data/fonts/
- Add @font-face declarations to CARD_CSS for both decks
- Bundle _Heebo-Regular.ttf and _Heebo-Bold.ttf in every .apkg via
  write_vocab_apkg() / write_conj_apkg() using _font_media_files() helper

Image infrastructure (image_fetch.py):
- New script: fetches Wikipedia pageimages + Wikimedia Commons thumbnails
  for concrete Noun-PoS entries (concreteness heuristic: ≤4 words, no
  abstract suffixes: -tion/-ity/-ness/-ment/-ance/-ence/-ism/-hood/-ship/-ure)
- Caches results in data/image_cache.json; downloads to data/images/
- Resume-safe; supports --limit/--dry-run/--word flags

Pipeline (run.py):
- Add step_fonts(): downloads Heebo TTF files to data/fonts/ (cached)
- Add step_images(): calls image_fetch.run(), respects --skip-images
- Add --skip-images flag
- Pass image_cache to build_vocab_deck(); add image stats to print_summary()

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Sochen 2026-03-03 08:07:37 +00:00
parent 78cc7f0ef1
commit d9e2533166
8 changed files with 895 additions and 104 deletions

View file

@ -33,6 +33,20 @@ OUTPUT_DIR = Path(__file__).parent / "output"
VOCAB_APKG = OUTPUT_DIR / "pealim_vocabulary.apkg"
CONJ_APKG = OUTPUT_DIR / "pealim_conjugations.apkg"
# ──────────────────────────────────────────────────────────────────────────────
# Binyan → Hebrew label mapping (for conjugation card display)
# ──────────────────────────────────────────────────────────────────────────────
BINYAN_TO_HEBREW: dict[str, str] = {
"Pa'al": "פָּעַל",
"Nif'al": "נִפְעַל",
"Pi'el": "פִּעֵל",
"Pu'al": "פֻּעַל",
"Hitpa'el": "הִתְפַּעֵל",
"Hif'il": "הִפְעִיל",
"Huf'al": "הֻפְעַל",
}
# ──────────────────────────────────────────────────────────────────────────────
# PoS → Hebrew label mapping
# ──────────────────────────────────────────────────────────────────────────────
@ -60,9 +74,21 @@ POS_CATEGORY_LABELS = {
# Shared CSS
# ──────────────────────────────────────────────────────────────────────────────
FONTS_DIR = DATA_DIR / "fonts"
CARD_CSS = """
@font-face {
font-family: 'Heebo';
src: url('_Heebo-Regular.ttf');
font-weight: normal;
}
@font-face {
font-family: 'Heebo';
src: url('_Heebo-Bold.ttf');
font-weight: bold;
}
.card {
font-family: Arial, sans-serif;
font-family: 'Heebo', 'Arial Hebrew', 'David', Arial, sans-serif;
font-size: 20px;
text-align: center;
color: #222;
@ -149,8 +175,8 @@ VOCAB_BACK_HEB = """
<div class="sec-label">מילים קשורות:</div>
<div class="root-info">{{SharedRoots}}</div>
{{/SharedRoots}}
{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;"></div>{{/Image}}
{{#Example}}
<div class="sec-label">דוגמה:</div>
<div class="example">{{Example}}</div>
{{/Example}}
{{#Frequency}}<div class="freq-badge">#{{Frequency}}</div>{{/Frequency}}
@ -168,8 +194,8 @@ VOCAB_BACK_ENG = """
{{#WordNoNikkud}}<div class="sec-label">ללא ניקוד: {{WordNoNikkud}}</div>{{/WordNoNikkud}}
{{#Root}}<div class="sec-label">שורש: {{Root}}</div>{{/Root}}
{{#PoS}}<div class="sec-label">חלק דיבור: {{PoS}}</div>{{/PoS}}
{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;"></div>{{/Image}}
{{#Example}}
<div class="sec-label">דוגמה:</div>
<div class="example">{{Example}}</div>
{{/Example}}
"""
@ -188,6 +214,7 @@ VOCAB_MODEL = genanki.Model(
{"name": "Audio"},
{"name": "Example"},
{"name": "Frequency"},
{"name": "Image"},
],
templates=[
{
@ -219,7 +246,8 @@ CONJ_BACK = """
{{FrontSide}}<hr>
<div class="hebrew">{{ConjugatedForm}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
<div class="sec-label">שורש: {{Root}} | בניין: {{Binyan}}</div>
<div class="sec-label">שורש: {{Root}}</div>
<div class="sec-label">בניין: {{Binyan}}</div>
"""
CONJ_CSS = CARD_CSS
@ -339,6 +367,7 @@ def build_vocab_deck(
dict_csv: Path,
examples_cache: Optional[dict] = None,
freq_cache: Optional[dict] = None,
image_cache: Optional[dict] = None,
limit: Optional[int] = None,
) -> tuple[genanki.Deck, list[Path]]:
"""
@ -360,6 +389,18 @@ def build_vocab_deck(
examples_cache = examples_cache or {}
freq_cache = freq_cache or {}
image_cache = image_cache or {}
# Load image cache from disk if not passed in
image_cache_path = DATA_DIR / "image_cache.json"
if not image_cache and image_cache_path.exists():
try:
with open(image_cache_path) as _f:
image_cache = json.load(_f)
except Exception:
pass
images_dir = DATA_DIR / "images"
# Build word_stripped → pos_category dict for related-words grouping
word_to_pos_cat: dict[str, str] = {}
@ -390,7 +431,8 @@ def build_vocab_deck(
word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
shared_roots = str(row.get("shared roots", row.get("SharedRoots", ""))).strip()
tags_str = str(row.get("tags", row.get("Tags", ""))).strip()
freq_rank = int(row["_freq_rank"]) if row["_freq_rank"] < 999_999 else ""
freq_rank_raw = int(row["_freq_rank"])
freq_display = str(freq_rank_raw) if freq_rank_raw < 999_999 else "50k+"
root = "" if root in ("nan", "None", "-") else root
pos_raw = "" if pos_raw in ("nan", "None") else pos_raw
@ -430,12 +472,26 @@ def build_vocab_deck(
groups[cat].append(rw)
parts = []
for cat, words in groups.items():
label = POS_CATEGORY_LABELS.get(cat, cat)
parts.append(
f'<div class="related-group"><b>{label}:</b> {" ".join(words)}</div>'
)
if cat == "Other":
# No label for uncategorized words — just list them plain
parts.append(f'<div class="related-group">{" ".join(words)}</div>')
else:
label = POS_CATEGORY_LABELS.get(cat, cat)
parts.append(
f'<div class="related-group"><b>{label}:</b> {" ".join(words)}</div>'
)
related_html = "\n".join(parts)
# Image: look up by stripped word (no-nikkud)
image_filename = image_cache.get(word_no_nik or _strip_nikkud(word), None)
image_tag = ""
if image_filename:
image_path = images_dir / image_filename
if image_path.exists():
image_tag = image_filename
if image_path not in media_files:
media_files.append(image_path)
note = genanki.Note(
model=VOCAB_MODEL,
fields=[
@ -448,12 +504,23 @@ def build_vocab_deck(
tags_str,
audio_tag,
example_html,
str(freq_rank),
freq_display,
image_tag,
],
tags=tags_str.split() if tags_str else [],
)
deck.add_note(note)
# Diagnostic: count words without PoS coverage in shared_roots
other_count = sum(
1 for _, row in df.iterrows()
for rw in str(row.get("shared roots", row.get("SharedRoots", ""))).split()
if str(row.get("shared roots", row.get("SharedRoots", ""))) not in ("nan", "None", "")
and word_to_pos_cat.get(_strip_nikkud(rw)) is None
)
unlisted = sum(1 for v in df["_freq_rank"] if int(v) >= 999_999)
logger.info(f" Unlisted words (not in frequency corpus): {unlisted}/{len(df)}")
logger.info(f" Related-words without PoS coverage: {other_count} (shown unlabeled)")
logger.info(f"Vocabulary deck: {len(deck.notes)} notes")
return deck, media_files
@ -471,11 +538,12 @@ def build_conj_deck(
if not data or not data.get("forms"):
continue
root = data.get("root", "")
binyan = data.get("binyan", "")
ref_form = data.get("reference_form", infinitive)
slug = data.get("slug", "")
voice = VOICE_MAP.get(binyan, "")
root = data.get("root", "")
binyan = data.get("binyan", "")
binyan_heb = BINYAN_TO_HEBREW.get(binyan, binyan)
ref_form = data.get("reference_form", infinitive)
slug = data.get("slug", "")
voice = VOICE_MAP.get(binyan, "")
forms = data["forms"]
def add_note(pronoun: str, tense: str, conj_form: str, audio_tag: str) -> None:
@ -491,7 +559,7 @@ def build_conj_deck(
tense,
conj_form,
root,
binyan,
binyan_heb,
voice,
audio_tag,
],
@ -540,11 +608,12 @@ def build_conj_deck(
# Also process passive partner forms if present
passive = data.get("passive_partner")
if passive and passive.get("forms"):
passive_root = passive.get("root", root)
passive_binyan = passive.get("binyan", "")
passive_ref = passive.get("reference_form", ref_form)
passive_voice = VOICE_MAP.get(passive_binyan, "")
passive_slug = passive.get("slug", slug)
passive_root = passive.get("root", root)
passive_binyan = passive.get("binyan", "")
passive_binyan_heb = BINYAN_TO_HEBREW.get(passive_binyan, passive_binyan)
passive_ref = passive.get("reference_form", ref_form)
passive_voice = VOICE_MAP.get(passive_binyan, "")
passive_slug = passive.get("slug", slug)
for form_key, form_data in passive["forms"].items():
conj_form = form_data.get("form", "")
@ -573,7 +642,7 @@ def build_conj_deck(
tense,
conj_form,
passive_root,
passive_binyan,
passive_binyan_heb,
passive_voice,
audio_tag,
],
@ -588,6 +657,12 @@ def build_conj_deck(
return deck, media_files
def _font_media_files() -> list[str]:
    """Collect the Heebo font files that should be bundled into an .apkg.

    Returns:
        String paths of every ``_Heebo*.ttf`` file found in ``FONTS_DIR``;
        an empty list when no font file is present.
    """
    bundled: list[str] = []
    for font_file in FONTS_DIR.glob("_Heebo*.ttf"):
        # Defensive re-check: skip entries that are gone by the time we bundle.
        if font_file.exists():
            bundled.append(str(font_file))
    return bundled
def write_vocab_apkg(
deck: genanki.Deck,
media_files: list[Path],
@ -595,7 +670,7 @@ def write_vocab_apkg(
) -> None:
out_path.parent.mkdir(parents=True, exist_ok=True)
pkg = genanki.Package(deck)
pkg.media_files = [str(p) for p in media_files if p.exists()]
pkg.media_files = [str(p) for p in media_files if p.exists()] + _font_media_files()
pkg.write_to_file(str(out_path))
logger.info(f"Vocabulary deck written → {out_path}")
@ -607,8 +682,8 @@ def write_conj_apkg(
) -> None:
out_path.parent.mkdir(parents=True, exist_ok=True)
pkg = genanki.Package(deck)
if media_files:
pkg.media_files = [str(p) for p in media_files if p.exists()]
base = [str(p) for p in (media_files or []) if p.exists()]
pkg.media_files = base + _font_media_files()
pkg.write_to_file(str(out_path))
logger.info(f"Conjugation deck written → {out_path}")

BIN
data/fonts/_Heebo-Bold.ttf Normal file

Binary file not shown.

Binary file not shown.

313
image_fetch.py Normal file
View file

@ -0,0 +1,313 @@
#!/usr/bin/env python3
"""
Fetch images for concrete Hebrew nouns from Wikipedia / Wikimedia Commons.
Scope: Noun PoS entries only. Concreteness heuristic:
- The last word of the English meaning ends in none of the abstract suffixes
  listed in ABSTRACT_SUFFIXES (-tion, -ity, -ness, -ment, -ance, -ence, -ism,
  -hood, -ship, -ure, -age)
- Meaning is ≤ 4 words (after stripping a leading article)
Image sources (tried in order):
1. Wikipedia page image via pageimages API
2. Wikimedia Commons search (first image file result)
Cache: data/image_cache.json (word_no_nikkud → filename or null)
Output: data/images/<safe_name>.jpg
Usage:
python3 image_fetch.py [--limit N] [--word WORD] [--dry-run]
"""
import argparse
import json
import logging
import re
import sys
import time
import unicodedata
from pathlib import Path
import requests
# Module-level logger; handlers/levels are configured by the caller (see main()).
logger = logging.getLogger(__name__)
# All persistent artefacts live under <this file's directory>/data.
DATA_DIR = Path(__file__).parent / "data"
# Directory that downloaded thumbnails are written to.
IMAGES_DIR = DATA_DIR / "images"
# JSON file mapping a word (without nikkud) to an image filename or null.
CACHE_PATH = DATA_DIR / "image_cache.json"
# Seconds slept after each API lookup (passed to time.sleep).
REQUEST_DELAY = 0.5
# Per-request timeout in seconds (passed to requests as ``timeout=``).
REQUEST_TIMEOUT = 10
# Abstract noun suffixes — words whose English meaning ends in these are skipped.
# Each suffix is listed exactly once; matching is done with str.endswith on the
# last word of the meaning.
ABSTRACT_SUFFIXES = (
    "tion", "ity", "ness", "ment", "ance", "ence", "ism",
    "hood", "ship", "ure", "age",
)
# One HTTP session shared by every request in this module; the custom
# User-Agent header set here is sent with each call made through it.
session = requests.Session()
session.headers.update({
"User-Agent": "pealim-anki/3.0 (educational Hebrew Anki deck builder; contact: anki@pealim.invalid)"
})
def _strip_nikkud(text: str) -> str:
    """Remove nikkud (and any other combining marks) from *text*.

    The text is decomposed to NFD so that marks become separate code points,
    then every code point whose Unicode category is ``Mn`` is dropped. The
    result is left in decomposed form.
    """
    decomposed = unicodedata.normalize("NFD", text)
    kept = []
    for char in decomposed:
        if unicodedata.category(char) == "Mn":
            continue
        kept.append(char)
    return "".join(kept)
def is_concrete(english_meaning: str) -> bool:
"""Return True if the English meaning looks like a concrete noun."""
meaning = english_meaning.strip().lower()
# Strip leading article
meaning = re.sub(r"^(a|an|the)\s+", "", meaning)
words = meaning.split()
if len(words) > 4:
return False
# Check last word for abstract suffixes
last = words[-1] if words else ""
for suffix in ABSTRACT_SUFFIXES:
if last.endswith(suffix):
return False
return True
def _safe_name(word_no_nikkud: str) -> str:
    """Derive an image file stem from a Hebrew word.

    Combining marks are removed first, then every character outside the
    Hebrew letter range U+05D0–U+05EA is discarded. The stem is therefore
    made of Hebrew letters only (it is not ASCII). Returns ``"unknown"``
    when no Hebrew letter remains.
    """
    without_marks = _strip_nikkud(word_no_nikkud)
    letters = re.sub(r"[^\u05d0-\u05ea]", "", without_marks)
    if not letters:
        return "unknown"
    return letters
def _try_wikipedia(query: str) -> str | None:
    """Ask the English Wikipedia ``pageimages`` API for a thumbnail of *query*.

    Args:
        query: Page title to look up (redirects are followed by the API).

    Returns:
        The thumbnail URL of the first returned page that has one, or None
        when no page has a thumbnail or the request fails. Failures are
        logged at debug level and never raised.
    """
    api_url = "https://en.wikipedia.org/w/api.php"
    request_params = {
        "action": "query",
        "titles": query,
        "prop": "pageimages",
        "format": "json",
        "pithumbsize": 200,
        "redirects": 1,
    }
    try:
        response = session.get(api_url, params=request_params, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        payload = response.json()
        pages = payload.get("query", {}).get("pages", {})
        thumbnails = (
            entry["thumbnail"]["source"]
            for entry in pages.values()
            if "thumbnail" in entry
        )
        return next(thumbnails, None)
    except Exception as e:
        logger.debug(f"Wikipedia API error for {query!r}: {e}")
    return None
def _try_commons(query: str) -> str | None:
    """Search Wikimedia Commons for a file matching *query*.

    Two API calls are made: a search restricted to namespace 6 that keeps
    only the top hit, then an ``imageinfo`` lookup on that hit requesting a
    200-pixel-wide thumbnail.

    Returns:
        The thumbnail URL, or None when the search has no hit, the hit has
        no thumbnail, or a request fails. Failures are logged at debug level
        and never raised.
    """
    api_url = "https://commons.wikimedia.org/w/api.php"
    search_params = {
        "action": "query",
        "list": "search",
        "srnamespace": 6,
        "srsearch": query,
        "format": "json",
        "srlimit": 1,
    }
    try:
        search_resp = session.get(api_url, params=search_params, timeout=REQUEST_TIMEOUT)
        search_resp.raise_for_status()
        results = search_resp.json().get("query", {}).get("search", [])
        if not results:
            return None
        file_title = results[0]["title"]  # e.g. "File:Cat_portrait.jpg"

        # Second call: resolve the file title to a thumbnail URL.
        info_resp = session.get(
            api_url,
            params={
                "action": "query",
                "titles": file_title,
                "prop": "imageinfo",
                "iiprop": "url",
                "iiurlwidth": 200,
                "format": "json",
            },
            timeout=REQUEST_TIMEOUT,
        )
        info_resp.raise_for_status()
        info_pages = info_resp.json().get("query", {}).get("pages", {})
        for entry in info_pages.values():
            image_info = entry.get("imageinfo", [])
            if image_info and "thumburl" in image_info[0]:
                return image_info[0]["thumburl"]
    except Exception as e:
        logger.debug(f"Commons API error for {query!r}: {e}")
    return None
def _download_image(image_url: str, dest_path: Path) -> bool:
    """Download *image_url* to *dest_path*.

    The response is streamed so the Content-Type header can be checked before
    the body is read, and it is always closed so the connection is released
    even when the body is never consumed. The payload is written to a
    sibling ``.part`` file and then renamed, so *dest_path* never holds a
    truncated image.

    Args:
        image_url: URL of the image to fetch.
        dest_path: Final location of the downloaded file.

    Returns:
        True when the file was written; False when the response is not an
        image, the body is empty, or any error occurs (logged at debug level).
    """
    try:
        with session.get(image_url, timeout=REQUEST_TIMEOUT, stream=True) as resp:
            resp.raise_for_status()
            content_type = resp.headers.get("content-type", "")
            if "image" not in content_type:
                return False
            payload = resp.content
        if not payload:
            return False
        # Write-then-rename: callers treat an existing dest_path as complete.
        tmp_path = dest_path.with_name(dest_path.name + ".part")
        tmp_path.write_bytes(payload)
        tmp_path.replace(dest_path)
        return True
    except Exception as e:
        logger.debug(f"Download failed {image_url}: {e}")
        return False
def get_image(english_meaning: str, word_no_nikkud: str) -> str | None:
    """Fetch a thumbnail for a word and store it as IMAGES_DIR/<safe_name>.jpg.

    Meanings rejected by ``is_concrete`` are skipped. A file already present
    at the destination is reused without any network access. Otherwise
    Wikipedia is tried first and Wikimedia Commons second, sleeping
    ``REQUEST_DELAY`` seconds after each lookup.

    Args:
        english_meaning: English gloss; lower-cased with a leading article
            removed, it is used as the search query.
        word_no_nikkud: Hebrew word used to derive the file name.

    Returns:
        The image file name (relative to IMAGES_DIR), or None when the
        meaning is not concrete, nothing was found, or the download failed.
    """
    if not is_concrete(english_meaning):
        return None

    target = IMAGES_DIR / f"{_safe_name(word_no_nikkud)}.jpg"
    if target.exists():
        return target.name

    # Try Wikipedia first, then Commons
    search_term = re.sub(r"^(a|an|the)\s+", "", english_meaning.strip().lower())
    found_url = _try_wikipedia(search_term)
    time.sleep(REQUEST_DELAY)
    if not found_url:
        found_url = _try_commons(search_term)
        time.sleep(REQUEST_DELAY)
    if not found_url:
        return None

    IMAGES_DIR.mkdir(parents=True, exist_ok=True)
    if not _download_image(found_url, target):
        return None
    logger.info(f" {word_no_nikkud!r} ({english_meaning!r}) → {target.name}")
    return target.name
def load_cache() -> dict:
if CACHE_PATH.exists():
try:
with open(CACHE_PATH, encoding="utf-8") as f:
return json.load(f)
except Exception:
pass
return {}
def save_cache(cache: dict) -> None:
CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(CACHE_PATH, "w", encoding="utf-8") as f:
json.dump(cache, f, ensure_ascii=False, indent=2, sort_keys=True)
def run(limit: int | None = None, dry_run: bool = False, single_word: str | None = None) -> dict:
    """
    Fetch images for all Noun-PoS words in pealim_dict_for_anki.csv.

    Args:
        limit: Stop after this many concrete nouns have been processed.
            ``None`` or ``0`` means no limit.
        dry_run: Only report which nouns would be fetched. Nothing is
            downloaded and the on-disk cache is left untouched, so a later
            real run still fetches those nouns.
        single_word: When given, only the row whose "Word Without Nikkud"
            equals this value is considered.

    Returns:
        The image cache dict (word without nikkud → filename or None), or an
        empty dict when no dictionary CSV is found.
    """
    import pandas as pd

    dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    if not dict_csv.exists():
        dict_csv = DATA_DIR / "pealim_dict.csv"
    if not dict_csv.exists():
        logger.error("Dictionary CSV not found")
        return {}

    # The CSV may be ';'- or ','-separated: a ';' parse yielding fewer than
    # three columns is treated as the wrong separator.
    try:
        df = pd.read_csv(dict_csv, sep=";", index_col=0)
        if df.shape[1] < 3:
            raise ValueError
    except Exception:
        df = pd.read_csv(dict_csv, index_col=0)

    cache = load_cache()
    processed = 0
    hits = 0
    skipped_abstract = 0
    skipped_cached = 0

    for _, row in df.iterrows():
        if limit and processed >= limit:
            break
        word = str(row.get("Word", "")).strip()
        meaning = str(row.get("Meaning", "")).strip()
        word_plain = str(row.get("Word Without Nikkud", "")).strip()
        pos_raw = str(row.get("Part of speech", row.get("Part of Speech", ""))).strip()

        if not word or not meaning or meaning in ("nan", "None"):
            continue
        if "nan" in pos_raw.lower() or "Noun" not in pos_raw:
            continue
        if single_word and word_plain != single_word:
            continue

        cache_key = word_plain or _strip_nikkud(word)
        if cache_key in cache:
            skipped_cached += 1
            continue
        if not is_concrete(meaning):
            skipped_abstract += 1
            cache[cache_key] = None
            continue

        processed += 1
        logger.info(f"[{processed}] {word_plain!r} ({meaning!r}) …")
        if dry_run:
            # Deliberately no cache entry: recording None here would make the
            # next real run skip this noun as "already cached".
            logger.info(" [dry-run] would fetch image")
            continue

        filename = get_image(meaning, cache_key)
        cache[cache_key] = filename
        if filename:
            hits += 1
        # Save cache periodically
        if processed % 10 == 0:
            save_cache(cache)

    if dry_run:
        logger.info(
            f"Image fetch dry-run complete: {processed} would be fetched, "
            f"{skipped_abstract} abstract (skipped), "
            f"{skipped_cached} cached"
        )
        return cache

    save_cache(cache)
    logger.info(
        f"Image fetch complete: {hits} found, "
        f"{processed - hits} not found, "
        f"{skipped_abstract} abstract (skipped), "
        f"{skipped_cached} cached"
    )
    return cache
def main() -> None:
    """Command-line entry point: parse flags, run the fetch, print a summary."""
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    parser = argparse.ArgumentParser(description="Fetch images for concrete Hebrew nouns")
    parser.add_argument("--limit", type=int, metavar="N", help="Process at most N nouns (for testing)")
    parser.add_argument("--dry-run", action="store_true", help="Don't download, just check concreteness")
    parser.add_argument("--word", metavar="WORD", help="Fetch image for a specific word (no-nikkud form)")
    options = parser.parse_args()

    image_cache = run(limit=options.limit, dry_run=options.dry_run, single_word=options.word)

    # Keep only the entries that actually resolved to an image file.
    with_images = [entry for entry in image_cache.items() if entry[1]]
    print(f"\n{len(with_images)} words with images (of {len(image_cache)} in cache)")
    sample = with_images[:5]
    if sample:
        print("Sample:", sample)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,70 @@
לשמור
ללמוד
לאסוף
לעבוד
לחבוש
לאכול
לשאול
לשלוח
לגבוה
לשבת
לרשת
לפול
לקום
לשים
לחון
לקרוא
לקנות
להיבדק
להרדם
לההרג
להחקר
להישאר
להיפגע
להיוולד
להנצל
להיסוג
להימצא
להיבנות
לדבר
לברך
לנהל
לנצח
לקומם
למלא
לחכות
לגלגל
בותל
תואם
קומם
דוכא
זוכה
פורסם
להתלבש
להסתלק
להצטלם
להזדקק
להתנהג
להתלקלח
להתקומם
להתפלא
להתגלות
להתקלקל
להכניס
להעסיק
להחליט
להבטיח
להוריד
להפיל
להקים
להקלל
המציא
להרשות
הוגבל
העבר
הוזהר
הופל
הוקם
הוחל
הוקפא
הופנה

87
run.py
View file

@ -10,6 +10,7 @@ Options:
--skip-audio Skip audio .mp3 downloads
--skip-examples Skip Ben Yehuda example fetching
--skip-conjugations Skip verb conjugation extraction
--skip-images Skip image fetching for concrete nouns
--refresh-examples Force rebuild of Ben Yehuda index (delete old, download nikkud corpus)
--test N Process only the first N dictionary words (for quick testing)
"""
@ -34,6 +35,7 @@ DATA_DIR = Path(__file__).parent / "data"
OUTPUT_DIR = Path(__file__).parent / "output"
AUDIO_DIR = DATA_DIR / "audio"
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
FONTS_DIR = DATA_DIR / "fonts"
def parse_args():
@ -42,6 +44,7 @@ def parse_args():
p.add_argument("--skip-audio", action="store_true", help="Skip audio downloads")
p.add_argument("--skip-examples", action="store_true", help="Skip Ben Yehuda example lookup")
p.add_argument("--skip-conjugations", action="store_true", help="Skip verb conjugation extraction")
p.add_argument("--skip-images", action="store_true", help="Skip image fetching")
p.add_argument("--refresh-examples", action="store_true", help="Force rebuild of Ben Yehuda index")
p.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
return p.parse_args()
@ -270,7 +273,77 @@ def step_conj_audio(args, conjugations: dict):
)
def step_build_vocab(args, examples_cache: dict, freq_cache: dict):
def step_fonts(args):
    """Step 4c — download Heebo font files (one-time, cached).

    Fetches the Google Fonts stylesheet for Heebo 400/700, pairs every
    ``@font-face`` block's ``font-weight`` with its ``src`` URL, and saves the
    TTF for weight 400 as ``_Heebo-Regular.ttf`` and for weight 700 as
    ``_Heebo-Bold.ttf`` in ``FONTS_DIR``. Files that already exist are kept.
    Any failure is logged as a warning and never raised.

    Args:
        args: Parsed command-line arguments (not used by this step).
    """
    FONTS_DIR.mkdir(parents=True, exist_ok=True)
    # CSS font-weight → destination file.
    targets = {
        400: FONTS_DIR / "_Heebo-Regular.ttf",
        700: FONTS_DIR / "_Heebo-Bold.ttf",
    }
    if all(dest.exists() for dest in targets.values()):
        logger.info("[4c] Heebo fonts already cached")
        return

    logger.info("[4c] Downloading Heebo fonts from Google Fonts …")
    import requests as _req

    css_url = "https://fonts.googleapis.com/css2?family=Heebo:wght@400;700"
    try:
        # NOTE(review): Google Fonts chooses the font format from the
        # User-Agent. A browser UA is expected to receive woff2 split into
        # per-script subsets, while the default python-requests UA should
        # receive one truetype file per weight — confirm against a live
        # response. No browser UA is sent for that reason.
        css_resp = _req.get(css_url, timeout=15)
        css_resp.raise_for_status()

        # Pair each declared weight with its source URL instead of relying on
        # the order of URLs in the stylesheet; keep the first TTF per weight.
        urls_by_weight: dict[int, str] = {}
        for face in re.findall(r"@font-face\s*\{([^}]*)\}", css_resp.text):
            weight_match = re.search(r"font-weight:\s*(\d+)", face)
            src_match = re.search(r"src:\s*url\(([^)]+)\)", face)
            if not weight_match or not src_match:
                continue
            font_url = src_match.group(1).strip("'\"")
            # Only real TTF files may be stored under a .ttf name.
            if not font_url.split("?")[0].lower().endswith(".ttf"):
                continue
            urls_by_weight.setdefault(int(weight_match.group(1)), font_url)
        logger.debug(f" Found {len(urls_by_weight)} font URL(s) in CSS")

        downloaded = []
        for weight, dest in targets.items():
            if dest.exists():
                continue
            font_url = urls_by_weight.get(weight)
            if font_url is None:
                raise RuntimeError(
                    f"no TTF source for Heebo weight {weight} in Google Fonts CSS"
                )
            fr = _req.get(font_url, timeout=15)
            fr.raise_for_status()
            dest.write_bytes(fr.content)
            downloaded.append(dest.name)
            logger.info(f" Downloaded → {dest.name}")
        if not downloaded:
            logger.info(" All font files already present")
    except Exception as e:
        # Best effort: the deck still builds, cards just use fallback fonts.
        logger.warning(f" Heebo download failed: {e}")
        logger.warning(" Cards will fall back to Arial Hebrew / David.")
        logger.warning(
            " To install manually: download Heebo-Regular.ttf and Heebo-Bold.ttf "
            "from https://fonts.google.com/specimen/Heebo and rename with _ prefix "
            f"into {FONTS_DIR}"
        )
def step_images(args) -> dict:
    """Step 4d — fetch images for concrete nouns (resume-safe).

    Args:
        args: Parsed command-line arguments; reads ``skip_images`` and
            ``test``.

    Returns:
        The image cache dict. With ``--skip-images`` this is whatever is
        already stored in ``data/image_cache.json`` (empty dict when the file
        is absent); otherwise it is the result of ``image_fetch.run``.
    """
    if args.skip_images:
        logger.info("[4d] Skipping images (--skip-images)")
        cache_path = DATA_DIR / "image_cache.json"
        if cache_path.exists():
            # The cache is written as UTF-8 with raw Hebrew keys, so it must
            # not be decoded with the platform default encoding.
            with open(cache_path, encoding="utf-8") as f:
                return json.load(f)
        return {}

    limit = args.test  # When in test mode, limit images too
    logger.info("[4d] Fetching images for concrete nouns …")
    import image_fetch
    return image_fetch.run(limit=limit)
def step_build_vocab(args, examples_cache: dict, freq_cache: dict, image_cache: dict = None):
"""Step 5 — build vocabulary .apkg."""
logger.info("[5] Building vocabulary deck …")
import apkg_builder
@ -283,6 +356,7 @@ def step_build_vocab(args, examples_cache: dict, freq_cache: dict):
dict_csv,
examples_cache=examples_cache,
freq_cache=freq_cache,
image_cache=image_cache or {},
limit=args.test,
)
apkg_builder.write_vocab_apkg(deck, media)
@ -349,6 +423,13 @@ def print_summary(args, examples_cache, freq_cache, conjugations):
mp3s = list(AUDIO_CONJ_DIR.glob("*.mp3"))
logger.info(f" Conjugation audio files: {len(mp3s)}")
image_cache_path = DATA_DIR / "image_cache.json"
if image_cache_path.exists():
with open(image_cache_path) as f:
ic = json.load(f)
found_imgs = sum(1 for v in ic.values() if v)
logger.info(f" Images: {found_imgs}/{len(ic)} nouns with images")
vocab_apkg = OUTPUT_DIR / "pealim_vocabulary.apkg"
conj_apkg = OUTPUT_DIR / "pealim_conjugations.apkg"
if vocab_apkg.exists():
@ -380,7 +461,9 @@ def main():
freq_cache = step_frequency()
examples_cache = step_examples(args, freq_cache)
step_audio(args)
step_build_vocab(args, examples_cache, freq_cache)
step_fonts(args)
image_cache = step_images(args)
step_build_vocab(args, examples_cache, freq_cache, image_cache)
conjugations = step_conjugations(args)
print_summary(args, examples_cache, freq_cache, conjugations or {})

251
validate_verb_list.py Normal file
View file

@ -0,0 +1,251 @@
#!/usr/bin/env python3
"""
Validate nevo_typed_verbs_from_modern_hebrew against pealim.com.
For each verb:
1. Classifies it by position in the file (Pa'al/Nif'al/Pi'el/Pu'al/Hitpa'el/Hif'il/Huf'al)
2. Searches pealim.com to find URL slug
3. Fetches the page to confirm the binyan
4. Flags known-problem entries and detects: not-found, binyan mismatch, suspected typos
Output:
verbs_input.txt — cleaned verb list for conjugation_extract.py
stdout — printed validation report table
Usage:
python3 validate_verb_list.py
After running, review verbs_input.txt (especially REVIEW-flagged entries) before
running conjugation extraction.
"""
import re
import sys
import time
import urllib.parse
from pathlib import Path
import requests
from bs4 import BeautifulSoup
# Root URL of the site that search and dictionary pages are fetched from.
PEALIM_BASE = "https://www.pealim.com"
# Seconds slept before each HTTP request (passed to time.sleep).
REQUEST_DELAY = 1.5
# Per-request timeout in seconds (passed to requests as ``timeout=``).
REQUEST_TIMEOUT = 15
# Input list (one verb per line) and the cleaned list this script writes;
# both live next to this file.
SOURCE_FILE = Path(__file__).parent / "nevo_typed_verbs_from_modern_hebrew"
OUTPUT_FILE = Path(__file__).parent / "verbs_input.txt"
# Known problem entries: word → (action, note)
# action: "REVIEW" = comment out and flag, "3ms" = treat as 3ms past form
KNOWN_ISSUES: dict[str, tuple[str, str]] = {
"לגבוה": ("REVIEW", "not a standard infinitive form; likely defective spelling or wrong word"),
"לההרג": ("REVIEW", "extra ה; should probably be להיהרג (Nif'al of הרג)"),
"להתלקלח": ("REVIEW", "not a real word; likely typo for להתקלקל"),
"להקלל": ("REVIEW", "ambiguous: could be Hif'il לְהָקֵל (to ease) or Nif'al of קלל"),
"המציא": ("3ms", "Hif'il 3ms past form, not an infinitive"),
"קומם": ("3ms", "ambiguous: Pu'al 3ms past; Pi'el infinitive is לְקוֹמֵם"),
}
# Expected binyan by line range (1-indexed) per plan analysis
# range() excludes its end value, so range(1, 18) covers entries 1–17; taken
# together the ranges cover entries 1–70. Entry numbers count non-blank source
# lines only, because main() drops blank lines before numbering.
LINE_RANGES: list[tuple[range, str]] = [
(range(1, 18), "Pa'al"),
(range(18, 29), "Nif'al"),
(range(29, 37), "Pi'el"),
(range(37, 43), "Pu'al"),
(range(43, 53), "Hitpa'el"),
(range(53, 63), "Hif'il"),
(range(63, 71), "Huf'al"),
]
SECTION_HEADERS: dict[str, str] = {
"Pa'al": "# Pa'al (פָּעַל)",
"Nif'al": "# Nif'al (נִפְעַל)",
"Pi'el": "# Pi'el (פִּעֵל)",
"Pu'al": "# Pu'al (פֻּעַל) — 3ms past, no infinitive",
"Hitpa'el": "# Hitpa'el (הִתְפַּעֵל)",
"Hif'il": "# Hif'il (הִפְעִיל)",
"Huf'al": "# Huf'al (הֻפְעַל) — 3ms past, no infinitive",
}
# One HTTP session shared by find_slug() and get_page_binyan(); the
# User-Agent header set here is sent with every request made through it.
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki/3.0)"})
def classify_by_line(line_num: int) -> str:
    """Map a 1-indexed entry number to the binyan expected at that position.

    Returns:
        The binyan of the first ``LINE_RANGES`` entry whose range contains
        *line_num*, or ``"Unknown"`` when no range matches.
    """
    matches = (name for span, name in LINE_RANGES if line_num in span)
    return next(matches, "Unknown")
def find_slug(query: str) -> str | None:
    """Search pealim.com for *query* and return the first dictionary slug.

    A slug is the ``<digits>-<name>`` path segment of the first ``/dict/…/``
    link found in the search result page.

    Returns:
        The slug, or None when the page contains no such link or the request
        fails (the error is printed to stderr).
    """
    search_url = f"{PEALIM_BASE}/search/?q={urllib.parse.quote(query)}"
    try:
        response = session.get(search_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        first_hit = re.search(r"/dict/(\d+[-][^/?\"<>\s]+)/", response.text)
        if first_hit is None:
            return None
        return first_hit.group(1)
    except Exception as e:
        print(f" ERROR searching {query!r}: {e}", file=sys.stderr)
        return None
def get_page_binyan(slug: str) -> str:
    """Fetch /dict/<slug>/ and report which binyan name the page mentions.

    The ``h3.page-header`` elements are inspected first, then the
    ``og:description`` meta tag. Within each text the seven binyan names are
    tried in a fixed order and the first one present wins.

    Returns:
        The binyan name, or an empty string when none is found or the request
        fails (the error is printed to stderr).
    """
    page_url = f"{PEALIM_BASE}/dict/{slug}/"
    known_binyanim = ["Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al"]

    def first_binyan_in(text: str) -> str:
        # Empty string doubles as "no match"; every binyan name is non-empty.
        return next((name for name in known_binyanim if name in text), "")

    try:
        resp = session.get(page_url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "lxml")

        for header in soup.find_all("h3", class_="page-header"):
            found = first_binyan_in(header.get_text(" ", strip=True))
            if found:
                return found

        og_description = soup.find("meta", {"property": "og:description"})
        if og_description:
            found = first_binyan_in(og_description.get("content", ""))
            if found:
                return found
    except Exception as e:
        print(f" ERROR fetching {slug}: {e}", file=sys.stderr)
    return ""
def _annotated_line(tag: str, word: str, notes: str) -> str:
    """Format a commented-out entry as '# TAG: word — notes' (notes optional)."""
    if notes:
        return f"# {tag}: {word} — {notes}"
    return f"# {tag}: {word}"


def _validate_word(line_num: int, total: int, word: str) -> dict:
    """Check one source entry against pealim.com and return its report row."""
    expected_binyan = classify_by_line(line_num)
    issue_type, issue_note = KNOWN_ISSUES.get(word, (None, ""))
    # Positions 37-42 (Pu'al) and 63-70 (Huf'al) are 3ms past forms
    is_3ms_by_position = expected_binyan in ("Pu'al", "Huf'al")
    print(f"[{line_num:2d}/{total}] {word:<20}", end=" ", flush=True)

    if issue_type == "REVIEW":
        # Don't query pealim for known-bad entries
        print("REVIEW (skipping query)")
        return {
            "line": line_num, "word": word,
            "expected_binyan": expected_binyan,
            "slug": "", "page_binyan": "",
            "status": "REVIEW", "notes": issue_note,
            "is_3ms": is_3ms_by_position,
        }

    time.sleep(REQUEST_DELAY)
    slug = find_slug(word)
    page_binyan = ""
    if slug:
        time.sleep(REQUEST_DELAY)
        page_binyan = get_page_binyan(slug)

    # Determine status
    if issue_type == "3ms" or is_3ms_by_position:
        status = "3ms"
        notes = issue_note or "Pu'al/Huf'al 3ms past form"
    elif not slug:
        status = "NOT_FOUND"
        notes = "no search result on pealim.com"
    elif page_binyan and expected_binyan and page_binyan != expected_binyan:
        status = "MISMATCH"
        notes = f"expected {expected_binyan}, page says {page_binyan}"
    else:
        status = "OK"
        notes = ""

    print(f"{status:<12} slug={slug or '-':<35} binyan={page_binyan or '-'}")
    return {
        "line": line_num, "word": word,
        "expected_binyan": expected_binyan,
        "slug": slug or "", "page_binyan": page_binyan,
        "status": status, "notes": notes,
        "is_3ms": is_3ms_by_position or issue_type == "3ms",
    }


def _render_verbs_input(results: list[dict]) -> list[str]:
    """Build the lines of verbs_input.txt from the validation results."""
    sections: dict[str, list[str]] = {b: [] for b in SECTION_HEADERS}
    review_lines: list[str] = []
    # Entries whose position matches no known range land in the first section.
    fallback = next(iter(sections), None)

    for r in results:
        b = r["expected_binyan"]
        if b not in sections:
            b = fallback
        if r["status"] == "REVIEW":
            review_lines.append(_annotated_line("REVIEW", r["word"], r["notes"]))
        elif r["status"] == "3ms":
            sections[b].append(f"# 3ms: {r['word']}")
        elif r["status"] in ("OK", "MISMATCH"):
            sections[b].append(r["word"])
        else:  # NOT_FOUND
            sections[b].append(_annotated_line("NOT_FOUND", r["word"], r["notes"]))

    output_lines = [
        "# Verb list — validated against pealim.com from nevo_typed_verbs_from_modern_hebrew",
        "# Lines prefixed '# 3ms:' are searched by 3ms past form (Pu'al/Huf'al).",
        "# Lines prefixed '# REVIEW:' need manual correction before conjugation extraction.",
        "# Lines prefixed '# NOT_FOUND:' had no pealim.com result — check spelling.",
        "",
    ]
    for binyan, header in SECTION_HEADERS.items():
        if sections.get(binyan):
            output_lines.append(header)
            output_lines.extend(sections[binyan])
            output_lines.append("")
    if review_lines:
        output_lines.append("# ── Entries flagged for manual review ──────────────────────────────────────────")
        output_lines.extend(review_lines)
        output_lines.append("")
    return output_lines


def _print_report(results: list[dict]) -> None:
    """Print the per-verb validation table followed by status totals."""
    print("\n" + "=" * 95)
    print("VALIDATION REPORT")
    print("=" * 95)
    print(f"{'#':>4} {'Verb':<22} {'Status':<14} {'Slug':<38} {'Binyan':<12} Notes")
    print("-" * 95)
    for r in results:
        print(
            f"{r['line']:>4} {r['word']:<22} {r['status']:<14} "
            f"{r['slug'][:36]:<38} {r['page_binyan'] or '-':<12} {r['notes']}"
        )
    print("=" * 95)

    counts = {s: sum(1 for r in results if r["status"] == s)
              for s in ("OK", "3ms", "MISMATCH", "REVIEW", "NOT_FOUND")}
    print(
        f"\nSummary: {counts['OK']} OK | {counts['3ms']} 3ms-past | "
        f"{counts['MISMATCH']} MISMATCH | {counts['REVIEW']} REVIEW | {counts['NOT_FOUND']} NOT_FOUND"
    )
    print(f"Total entries: {len(results)}")
    if counts["REVIEW"] > 0 or counts["NOT_FOUND"] > 0 or counts["MISMATCH"] > 0:
        print(
            "\n⚠ Review flagged entries in verbs_input.txt before running:\n"
            " python3 conjugation_extract.py"
        )


def main() -> None:
    """Validate every verb in SOURCE_FILE and write the cleaned OUTPUT_FILE.

    Reads the non-blank lines of SOURCE_FILE, validates each against
    pealim.com (entries marked REVIEW in KNOWN_ISSUES are not queried),
    writes the sectioned verb list to OUTPUT_FILE and prints a report table.
    Exits with status 1 when SOURCE_FILE does not exist.
    """
    if not SOURCE_FILE.exists():
        print(f"ERROR: {SOURCE_FILE} not found", file=sys.stderr)
        sys.exit(1)

    raw_lines = SOURCE_FILE.read_text(encoding="utf-8").splitlines()
    lines = [entry.strip() for entry in raw_lines if entry.strip()]
    print(f"Loaded {len(lines)} entries from {SOURCE_FILE.name}")
    print(f"Querying pealim.com (delay {REQUEST_DELAY}s per request)…\n")

    results = [
        _validate_word(line_num, len(lines), word)
        for line_num, word in enumerate(lines, start=1)
    ]

    # ── Write cleaned verbs_input.txt ────────────────────────────────────────────
    OUTPUT_FILE.write_text("\n".join(_render_verbs_input(results)), encoding="utf-8")
    print(f"\nWrote → {OUTPUT_FILE}")

    # ── Print summary table ──────────────────────────────────────────────────────
    _print_report(results)
if __name__ == "__main__":
main()

View file

@ -1,91 +1,90 @@
# Verb list from Coffin & Bolozky, A Reference Grammar of Modern Hebrew (2005), Appendix 1.
# Citation: Coffin, Edna Amir and Shmuel Bolozky. A Reference Grammar of Modern Hebrew.
# Cambridge University Press, 2005.
# Lines prefixed '# 3ms:' are Pu'al/Huf'al verbs searched by 3ms past form.
# Verb list — validated against pealim.com from nevo_typed_verbs_from_modern_hebrew
# Lines prefixed '# 3ms:' are searched by 3ms past form (Pu'al/Huf'al).
# Lines prefixed '# REVIEW:' need manual correction before conjugation extraction.
# Lines prefixed '# NOT_FOUND:' had no pealim.com result — check spelling.
# Pa'al (פָּעַל)
לָלֶכֶת
לָבוֹא
לָשֶׁבֶת
לָקוּם
לָשִׂים
לָדַעַת
לִרְאוֹת
לוֹמַר
לַעֲשׂוֹת
לִתֵּן
לִקְחַת
לֶאֱכֹל
לִשְׁתּוֹת
לִכְתּוֹב
לִקְרוֹא
לִשְׁמוֹר
לִשְׁמֹעַ
לִפְתּוֹחַ
לִסְגּוֹר
לִנְסוֹעַ
לִרְכּוֹב
לִשְׁכַּב
לַחְשׁוֹב
לִבְכּוֹת
לָרוּץ
לִשְׁאֹל
לַעֲנוֹת
לִמְכּוֹר
לִקְנוֹת
לִלְמֹד
לשמור
ללמוד
לאסוף
לעבוד
לחבוש
לאכול
לשאול
לשלוח
לשבת
לרשת
לפול
לקום
לשים
לחון
לקרוא
לקנות
# Nif'al (נִפְעַל)
לְהִכָּנֵס
לְהִפָּתַח
לְהִסָּגֵר
לְהִשָּׁמֵר
לְהִמָּצֵא
לְהִרְאוֹת
לְהִכָּתֵב
לְהִשָּׁבֵר
להיבדק
להרדם
להחקר
להישאר
להיפגע
להיוולד
להנצל
להיסוג
להימצא
להיבנות
# Pi'el (פִּעֵל)
לְדַבֵּר
לְסַפֵּר
לְבַקֵּשׁ
לְקַבֵּל
לְשַׁלֵּם
לְצַלֵּם
לְנַסּוֹת
לְחַכּוֹת
לְטַלְפֵן
לְבַשֵּׁל
לדבר
לברך
לנהל
לנצח
לקומם
למלא
לחכות
לגלגל
# Pu'al (פֻּעַל) — 3ms past, no infinitive
# 3ms: דֻּבַּר
# 3ms: סֻפַּר
# 3ms: בֻּקַּשׁ
# 3ms: קֻבַּל
# 3ms: בותל
# 3ms: תואם
# 3ms: קומם
# 3ms: דוכא
# 3ms: זוכה
# 3ms: פורסם
# Hitpa'el (הִתְפַּעֵל)
לְהִתְלַבֵּשׁ
לְהִתְרַחֵץ
לְהִתְנַהֵג
לְהִתְחַתֵּן
לְהִתְגּוֹרֵר
לְהִתְכּוֹנֵן
לְהִתְחִיל
להתלבש
להסתלק
להצטלם
להזדקק
להתנהג
להתקומם
להתפלא
להתגלות
להתקלקל
# Hif'il (הִפְעִיל)
לְהַגִּיד
לְהַבִּין
לְהַכִּיר
לְהַרְגִּישׁ
לְהַחְלִיט
לְהַתְחִיל
לְהַכְנִיס
לְהוֹצִיא
לְהוֹרִיד
לְהַעְלוֹת
להכניס
להעסיק
להחליט
להבטיח
להוריד
להפיל
להקים
# 3ms: המציא
להרשות
# Huf'al (הֻפְעַל) — 3ms past, no infinitive
# 3ms: הוּגַד
# 3ms: הוּבַן
# 3ms: הוּכְנַס
# 3ms: הוּצָא
# 3ms: הוגבל
# 3ms: העבר
# 3ms: הוזהר
# 3ms: הופל
# 3ms: הוקם
# 3ms: הוחל
# 3ms: הוקפא
# 3ms: הופנה
# ── Entries flagged for manual review ──────────────────────────────────────────
# REVIEW: לגבוה — not a standard infinitive form; likely defective spelling or wrong word
# REVIEW: לההרג — extra ה; should probably be להיהרג (Nif'al of הרג)
# REVIEW: להתלקלח — not a real word; likely typo for להתקלקל
# REVIEW: להקלל — ambiguous: could be Hif'il לְהָקֵל (to ease) or Nif'al of קלל