feat: Sprint 3 — verb validation, Heebo font, images, card fixes

Verb validation: - Add validate_verb_list.py: queries pealim.com for all 70 entries in nevo_typed_verbs_from_modern_hebrew, classifies as OK/3ms/REVIEW/NOT_FOUND, writes cleaned verbs_input.txt with structured sections and REVIEW flags - New verbs_input.txt: 51 verified verbs, 15 Pu'al/Huf'al 3ms-past forms, 4 REVIEW entries flagged for manual correction before conjugation re-run Card fixes (apkg_builder.py): - Conjugation card: binyan field now in Hebrew (פָּעַל/נִפְעַל etc.) via BINYAN_TO_HEBREW map; root and binyan on separate lines in CONJ_BACK template - Vocabulary card: remove "דוגמה:" label (keep right-border quote styling) - Related-words: "Other" category shown unlabeled (no spurious Hebrew header) - Frequency: unlisted words (not in 50k corpus) now display "50k+" badge - Add Image field to VOCAB_MODEL and templates ({{#Image}}<img>{{/Image}}) - Diagnostic logging: unlisted word count and uncategorized related-words count Hebrew font (Heebo): - Download Heebo variable font TTF from Google Fonts GitHub → data/fonts/ - Add @font-face declarations to CARD_CSS for both decks - Bundle _Heebo-Regular.ttf and _Heebo-Bold.ttf in every .apkg via write_vocab_apkg() / write_conj_apkg() using _font_media_files() helper Image infrastructure (image_fetch.py): - New script: fetches Wikipedia pageimages + Wikimedia Commons thumbnails for concrete Noun-PoS entries (concreteness heuristic: ≤4 words, no abstract suffixes: -tion/-ity/-ness/-ment/-ance/-ence/-ism/-hood/-ship/-ure) - Caches results in data/image_cache.json; downloads to data/images/ - Resume-safe; supports --limit/--dry-run/--word flags Pipeline (run.py): - Add step_fonts(): downloads Heebo TTF files to data/fonts/ (cached) - Add step_images(): calls image_fetch.run(), respects --skip-images - Add --skip-images flag - Pass image_cache to build_vocab_deck(); add image stats to print_summary() Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 08:07:37 +00:00 · 2026-03-03 08:07:37 +00:00 · d9e2533166
commit d9e2533166
parent 78cc7f0ef1
8 changed files with 895 additions and 104 deletions
--- a/apkg_builder.py
+++ b/apkg_builder.py
@ -33,6 +33,20 @@ OUTPUT_DIR     = Path(__file__).parent / "output"
 VOCAB_APKG  = OUTPUT_DIR / "pealim_vocabulary.apkg"
 CONJ_APKG   = OUTPUT_DIR / "pealim_conjugations.apkg"

+# ──────────────────────────────────────────────────────────────────────────────
+# Binyan → Hebrew label mapping (for conjugation card display)
+# ──────────────────────────────────────────────────────────────────────────────
+
+BINYAN_TO_HEBREW: dict[str, str] = {
+    "Pa'al":    "פָּעַל",
+    "Nif'al":   "נִפְעַל",
+    "Pi'el":    "פִּעֵל",
+    "Pu'al":    "פֻּעַל",
+    "Hitpa'el": "הִתְפַּעֵל",
+    "Hif'il":   "הִפְעִיל",
+    "Huf'al":   "הֻפְעַל",
+}
+
 # ──────────────────────────────────────────────────────────────────────────────
 # PoS → Hebrew label mapping
 # ──────────────────────────────────────────────────────────────────────────────
@ -60,9 +74,21 @@ POS_CATEGORY_LABELS = {
 # Shared CSS
 # ──────────────────────────────────────────────────────────────────────────────

+FONTS_DIR = DATA_DIR / "fonts"
+
 CARD_CSS = """
+@font-face {
+  font-family: 'Heebo';
+  src: url('_Heebo-Regular.ttf');
+  font-weight: normal;
+}
+@font-face {
+  font-family: 'Heebo';
+  src: url('_Heebo-Bold.ttf');
+  font-weight: bold;
+}
 .card {
-  font-family: Arial, sans-serif;
+  font-family: 'Heebo', 'Arial Hebrew', 'David', Arial, sans-serif;
  font-size: 20px;
  text-align: center;
  color: #222;
@ -149,8 +175,8 @@ VOCAB_BACK_HEB = """
 <div class="sec-label">מילים קשורות:</div>
 <div class="root-info">{{SharedRoots}}</div>
 {{/SharedRoots}}
+{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;"></div>{{/Image}}
 {{#Example}}
-<div class="sec-label">דוגמה:</div>
 <div class="example">{{Example}}</div>
 {{/Example}}
 {{#Frequency}}<div class="freq-badge">#{{Frequency}}</div>{{/Frequency}}
@ -168,8 +194,8 @@ VOCAB_BACK_ENG = """
 {{#WordNoNikkud}}<div class="sec-label">ללא ניקוד: {{WordNoNikkud}}</div>{{/WordNoNikkud}}
 {{#Root}}<div class="sec-label">שורש: {{Root}}</div>{{/Root}}
 {{#PoS}}<div class="sec-label">חלק דיבור: {{PoS}}</div>{{/PoS}}
+{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;"></div>{{/Image}}
 {{#Example}}
-<div class="sec-label">דוגמה:</div>
 <div class="example">{{Example}}</div>
 {{/Example}}
 """
@ -188,6 +214,7 @@ VOCAB_MODEL = genanki.Model(
        {"name": "Audio"},
        {"name": "Example"},
        {"name": "Frequency"},
+        {"name": "Image"},
    ],
    templates=[
        {
@ -219,7 +246,8 @@ CONJ_BACK = """
 {{FrontSide}}<hr>
 <div class="hebrew">{{ConjugatedForm}}</div>
 {{#Audio}}<div>{{Audio}}</div>{{/Audio}}
-<div class="sec-label">שורש: {{Root}} | בניין: {{Binyan}}</div>
+<div class="sec-label">שורש: {{Root}}</div>
+<div class="sec-label">בניין: {{Binyan}}</div>
 """

 CONJ_CSS = CARD_CSS
@ -339,6 +367,7 @@ def build_vocab_deck(
    dict_csv: Path,
    examples_cache: Optional[dict] = None,
    freq_cache: Optional[dict] = None,
+    image_cache: Optional[dict] = None,
    limit: Optional[int] = None,
 ) -> tuple[genanki.Deck, list[Path]]:
    """
@ -360,6 +389,18 @@ def build_vocab_deck(

    examples_cache = examples_cache or {}
    freq_cache = freq_cache or {}
+    image_cache = image_cache or {}
+
+    # Load image cache from disk if not passed in
+    image_cache_path = DATA_DIR / "image_cache.json"
+    if not image_cache and image_cache_path.exists():
+        try:
+            with open(image_cache_path) as _f:
+                image_cache = json.load(_f)
+        except Exception:
+            pass
+
+    images_dir = DATA_DIR / "images"

    # Build word_stripped → pos_category dict for related-words grouping
    word_to_pos_cat: dict[str, str] = {}
@ -390,7 +431,8 @@ def build_vocab_deck(
        word_no_nik  = str(row.get("Word Without Nikkud", "")).strip()
        shared_roots = str(row.get("shared roots", row.get("SharedRoots", ""))).strip()
        tags_str     = str(row.get("tags", row.get("Tags", ""))).strip()
-        freq_rank    = int(row["_freq_rank"]) if row["_freq_rank"] < 999_999 else ""
+        freq_rank_raw = int(row["_freq_rank"])
+        freq_display  = str(freq_rank_raw) if freq_rank_raw < 999_999 else "50k+"

        root        = "" if root in ("nan", "None", "-") else root
        pos_raw     = "" if pos_raw in ("nan", "None") else pos_raw
@ -430,12 +472,26 @@ def build_vocab_deck(
                groups[cat].append(rw)
            parts = []
            for cat, words in groups.items():
-                label = POS_CATEGORY_LABELS.get(cat, cat)
-                parts.append(
-                    f'<div class="related-group"><b>{label}:</b> {" ".join(words)}</div>'
-                )
+                if cat == "Other":
+                    # No label for uncategorized words — just list them plain
+                    parts.append(f'<div class="related-group">{" ".join(words)}</div>')
+                else:
+                    label = POS_CATEGORY_LABELS.get(cat, cat)
+                    parts.append(
+                        f'<div class="related-group"><b>{label}:</b> {" ".join(words)}</div>'
+                    )
            related_html = "\n".join(parts)

+        # Image: look up by stripped word (no-nikkud)
+        image_filename = image_cache.get(word_no_nik or _strip_nikkud(word), None)
+        image_tag = ""
+        if image_filename:
+            image_path = images_dir / image_filename
+            if image_path.exists():
+                image_tag = image_filename
+                if image_path not in media_files:
+                    media_files.append(image_path)
+
        note = genanki.Note(
            model=VOCAB_MODEL,
            fields=[
@ -448,12 +504,23 @@ def build_vocab_deck(
                tags_str,
                audio_tag,
                example_html,
-                str(freq_rank),
+                freq_display,
+                image_tag,
            ],
            tags=tags_str.split() if tags_str else [],
        )
        deck.add_note(note)

+    # Diagnostic: count words without PoS coverage in shared_roots
+    other_count = sum(
+        1 for _, row in df.iterrows()
+        for rw in str(row.get("shared roots", row.get("SharedRoots", ""))).split()
+        if str(row.get("shared roots", row.get("SharedRoots", ""))) not in ("nan", "None", "")
+        and word_to_pos_cat.get(_strip_nikkud(rw)) is None
+    )
+    unlisted = sum(1 for v in df["_freq_rank"] if int(v) >= 999_999)
+    logger.info(f"  Unlisted words (not in frequency corpus): {unlisted}/{len(df)}")
+    logger.info(f"  Related-words without PoS coverage: {other_count} (shown unlabeled)")
    logger.info(f"Vocabulary deck: {len(deck.notes)} notes")
    return deck, media_files

@ -471,11 +538,12 @@ def build_conj_deck(
        if not data or not data.get("forms"):
            continue

-        root     = data.get("root", "")
-        binyan   = data.get("binyan", "")
-        ref_form = data.get("reference_form", infinitive)
-        slug     = data.get("slug", "")
-        voice    = VOICE_MAP.get(binyan, "")
+        root       = data.get("root", "")
+        binyan     = data.get("binyan", "")
+        binyan_heb = BINYAN_TO_HEBREW.get(binyan, binyan)
+        ref_form   = data.get("reference_form", infinitive)
+        slug       = data.get("slug", "")
+        voice      = VOICE_MAP.get(binyan, "")
        forms    = data["forms"]

        def add_note(pronoun: str, tense: str, conj_form: str, audio_tag: str) -> None:
@ -491,7 +559,7 @@ def build_conj_deck(
                    tense,
                    conj_form,
                    root,
-                    binyan,
+                    binyan_heb,
                    voice,
                    audio_tag,
                ],
@ -540,11 +608,12 @@ def build_conj_deck(
        # Also process passive partner forms if present
        passive = data.get("passive_partner")
        if passive and passive.get("forms"):
-            passive_root    = passive.get("root", root)
-            passive_binyan  = passive.get("binyan", "")
-            passive_ref     = passive.get("reference_form", ref_form)
-            passive_voice   = VOICE_MAP.get(passive_binyan, "")
-            passive_slug    = passive.get("slug", slug)
+            passive_root       = passive.get("root", root)
+            passive_binyan     = passive.get("binyan", "")
+            passive_binyan_heb = BINYAN_TO_HEBREW.get(passive_binyan, passive_binyan)
+            passive_ref        = passive.get("reference_form", ref_form)
+            passive_voice      = VOICE_MAP.get(passive_binyan, "")
+            passive_slug       = passive.get("slug", slug)

            for form_key, form_data in passive["forms"].items():
                conj_form = form_data.get("form", "")
@ -573,7 +642,7 @@ def build_conj_deck(
                        tense,
                        conj_form,
                        passive_root,
-                        passive_binyan,
+                        passive_binyan_heb,
                        passive_voice,
                        audio_tag,
                    ],
@ -588,6 +657,12 @@ def build_conj_deck(
    return deck, media_files


+def _font_media_files() -> list[str]:
+    """Collect the Heebo fonts found in FONTS_DIR (``_Heebo*.ttf``) as string paths for the .apkg."""
+    return [str(font) for font in FONTS_DIR.glob("_Heebo*.ttf") if font.exists()]
+
+
 def write_vocab_apkg(
    deck: genanki.Deck,
    media_files: list[Path],
@ -595,7 +670,7 @@ def write_vocab_apkg(
 ) -> None:
    out_path.parent.mkdir(parents=True, exist_ok=True)
    pkg = genanki.Package(deck)
-    pkg.media_files = [str(p) for p in media_files if p.exists()]
+    pkg.media_files = [str(p) for p in media_files if p.exists()] + _font_media_files()
    pkg.write_to_file(str(out_path))
    logger.info(f"Vocabulary deck written → {out_path}")

@ -607,8 +682,8 @@ def write_conj_apkg(
 ) -> None:
    out_path.parent.mkdir(parents=True, exist_ok=True)
    pkg = genanki.Package(deck)
-    if media_files:
-        pkg.media_files = [str(p) for p in media_files if p.exists()]
+    base = [str(p) for p in (media_files or []) if p.exists()]
+    pkg.media_files = base + _font_media_files()
    pkg.write_to_file(str(out_path))
    logger.info(f"Conjugation deck written → {out_path}")

--- a/data/fonts/_Heebo-Bold.ttf
+++ b/data/fonts/_Heebo-Bold.ttf
--- a/data/fonts/_Heebo-Regular.ttf
+++ b/data/fonts/_Heebo-Regular.ttf
--- a/image_fetch.py
+++ b/image_fetch.py
@ -0,0 +1,313 @@
+#!/usr/bin/env python3
+"""
+Fetch images for concrete Hebrew nouns from Wikipedia / Wikimedia Commons.
+
+Scope: Noun PoS entries only. Concreteness heuristic:
+  - Last word of the English meaning has no abstract suffix (-tion, -ity, -ness, -ment,
+    -ance, -ence, -ism, -hood, -ship, -ure, -age)
+  - Meaning is ≤ 4 words
+
+Image sources (tried in order):
+  1. Wikipedia page image via pageimages API
+  2. Wikimedia Commons search (first image file result)
+
+Cache: data/image_cache.json  (word_no_nikkud → filename or null)
+Output: data/images/<safe_name>.jpg
+
+Usage:
+  python3 image_fetch.py [--limit N] [--word WORD] [--dry-run]
+"""
+
+import argparse
+import json
+import logging
+import re
+import sys
+import time
+import unicodedata
+from pathlib import Path
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+DATA_DIR    = Path(__file__).parent / "data"
+IMAGES_DIR  = DATA_DIR / "images"
+CACHE_PATH  = DATA_DIR / "image_cache.json"
+
+REQUEST_DELAY   = 0.5
+REQUEST_TIMEOUT = 10
+
+# Abstract noun suffixes — a meaning whose last word ends in one of these is
+# treated as abstract and skipped (see is_concrete). Each suffix appears once.
+ABSTRACT_SUFFIXES = (
+    "tion", "ity", "ness", "ment", "ance", "ence", "ism",
+    "hood", "ship", "ure", "age",
+)
+
+session = requests.Session()
+session.headers.update({
+    "User-Agent": "pealim-anki/3.0 (educational Hebrew Anki deck builder; contact: anki@pealim.invalid)"
+})
+
+
+def _strip_nikkud(text: str) -> str:
+    """Return *text* NFD-decomposed with every nonspacing mark (category ``Mn``) removed."""
+    decomposed = unicodedata.normalize("NFD", text)
+    kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
+    return "".join(kept)
+
+
+def is_concrete(english_meaning: str) -> bool:
+    """Heuristically decide whether an English gloss names a concrete noun.
+
+    The gloss is lower-cased and a leading article (a/an/the) is dropped.
+    It is rejected when it has more than four words or when its last word
+    ends with one of ``ABSTRACT_SUFFIXES``; anything else, including an
+    empty gloss, is accepted.
+    """
+    normalized = re.sub(r"^(a|an|the)\s+", "", english_meaning.strip().lower())
+    tokens = normalized.split()
+    if len(tokens) > 4:
+        return False
+    final_word = tokens[-1] if tokens else ""
+    return not any(final_word.endswith(suffix) for suffix in ABSTRACT_SUFFIXES)
+
+
+def _safe_name(word_no_nikkud: str) -> str:
+    """Build a filename stem from a Hebrew word.
+
+    Combining marks are stripped and every character outside the Hebrew
+    letter range U+05D0–U+05EA is removed, so the result consists of Hebrew
+    letters only (it is not ASCII). Returns ``"unknown"`` when nothing is left.
+    """
+    hebrew_only = re.sub(r"[^\u05d0-\u05ea]", "", _strip_nikkud(word_no_nikkud))
+    return hebrew_only if hebrew_only else "unknown"
+
+
+def _try_wikipedia(query: str) -> str | None:
+    """Look up *query* as an English Wikipedia title and return its page thumbnail URL.
+
+    Uses the ``pageimages`` prop with a 200px thumbnail and redirect following.
+    Any request or parsing error is logged at debug level and yields ``None``,
+    as does a response in which no page carries a thumbnail.
+    """
+    endpoint = "https://en.wikipedia.org/w/api.php"
+    request_params = {
+        "action": "query",
+        "titles": query,
+        "prop": "pageimages",
+        "format": "json",
+        "pithumbsize": 200,
+        "redirects": 1,
+    }
+    try:
+        response = session.get(endpoint, params=request_params, timeout=REQUEST_TIMEOUT)
+        response.raise_for_status()
+        page_map = response.json().get("query", {}).get("pages", {})
+        # Lazy scan: stop at the first page that has a thumbnail
+        thumb_page = next((pg for pg in page_map.values() if "thumbnail" in pg), None)
+        if thumb_page is not None:
+            return thumb_page["thumbnail"]["source"]
+    except Exception as e:
+        logger.debug(f"Wikipedia API error for {query!r}: {e}")
+    return None
+
+
+def _try_commons(query: str) -> str | None:
+    """Search Wikimedia Commons for *query* and return a thumbnail URL.
+
+    Two requests are made: a search with ``srnamespace=6`` that keeps only the
+    top hit, then an ``imageinfo`` query asking for a 200px-wide thumbnail of
+    that title. Returns ``None`` when there is no hit, no ``thumburl`` in the
+    answer, or any error occurs (errors are logged at debug level).
+    """
+    api = "https://commons.wikimedia.org/w/api.php"
+    try:
+        search = session.get(
+            api,
+            params={
+                "action": "query",
+                "list": "search",
+                "srnamespace": 6,
+                "srsearch": query,
+                "format": "json",
+                "srlimit": 1,
+            },
+            timeout=REQUEST_TIMEOUT,
+        )
+        search.raise_for_status()
+        results = search.json().get("query", {}).get("search", [])
+        if not results:
+            return None
+        top_title = results[0]["title"]  # e.g. "File:Cat_portrait.jpg"
+
+        # Second request: resolve the file title to a thumbnail URL
+        detail = session.get(
+            api,
+            params={
+                "action": "query",
+                "titles": top_title,
+                "prop": "imageinfo",
+                "iiprop": "url",
+                "iiurlwidth": 200,
+                "format": "json",
+            },
+            timeout=REQUEST_TIMEOUT,
+        )
+        detail.raise_for_status()
+        for entry in detail.json().get("query", {}).get("pages", {}).values():
+            image_info = entry.get("imageinfo", [])
+            if image_info and "thumburl" in image_info[0]:
+                return image_info[0]["thumburl"]
+    except Exception as e:
+        logger.debug(f"Commons API error for {query!r}: {e}")
+    return None
+
+
+def _download_image(image_url: str, dest_path: Path) -> bool:
+    """Download *image_url* to *dest_path*.
+
+    Returns True when the response has an image content type and its body was
+    written to *dest_path*. Returns False for a non-image response or on any
+    error (errors are logged at debug level).
+    """
+    try:
+        # The context manager closes the streamed response on every path, so the
+        # connection is released even when a non-image body is never read.
+        with session.get(image_url, timeout=REQUEST_TIMEOUT, stream=True) as resp:
+            resp.raise_for_status()
+            content_type = resp.headers.get("content-type", "")
+            if "image" not in content_type:
+                return False
+            dest_path.write_bytes(resp.content)
+        return True
+    except Exception as e:
+        logger.debug(f"Download failed {image_url}: {e}")
+        return False
+
+
+def get_image(english_meaning: str, word_no_nikkud: str) -> str | None:
+    """
+    Find and store a thumbnail for one word.
+
+    Returns the file name (relative to IMAGES_DIR) of ``<safe_name>.jpg``, or
+    None when the meaning is not concrete, no source yields an image URL, or
+    the download fails. An already-downloaded file is reused without any request.
+    """
+    if not is_concrete(english_meaning):
+        return None
+
+    target = IMAGES_DIR / f"{_safe_name(word_no_nikkud)}.jpg"
+    if target.exists():
+        return target.name
+
+    search_term = re.sub(r"^(a|an|the)\s+", "", english_meaning.strip().lower())
+
+    # Sources in priority order; pause after every lookup, stop at the first hit
+    found_url = None
+    for lookup in (_try_wikipedia, _try_commons):
+        found_url = lookup(search_term)
+        time.sleep(REQUEST_DELAY)
+        if found_url:
+            break
+
+    if not found_url:
+        return None
+
+    IMAGES_DIR.mkdir(parents=True, exist_ok=True)
+    if not _download_image(found_url, target):
+        return None
+
+    logger.info(f"  {word_no_nikkud!r} ({english_meaning!r}) → {target.name}")
+    return target.name
+
+
+def load_cache() -> dict:
+    """Load the image cache from ``CACHE_PATH``.
+
+    Returns the parsed JSON object, or an empty dict when the file is missing,
+    unreadable, not valid JSON, or not a JSON object. Read failures are logged
+    as warnings so a corrupt cache does not go unnoticed.
+    """
+    if not CACHE_PATH.exists():
+        return {}
+    try:
+        with open(CACHE_PATH, encoding="utf-8") as f:
+            data = json.load(f)
+    except (OSError, ValueError) as e:
+        # ValueError covers json.JSONDecodeError and UnicodeDecodeError
+        logger.warning(f"Could not read image cache {CACHE_PATH}: {e}; starting with an empty cache")
+        return {}
+    if not isinstance(data, dict):
+        logger.warning(f"Image cache {CACHE_PATH} is not a JSON object; starting with an empty cache")
+        return {}
+    return data
+
+
+def save_cache(cache: dict) -> None:
+    """Write *cache* to ``CACHE_PATH`` as pretty-printed, key-sorted UTF-8 JSON.
+
+    The data goes to a sibling ``.tmp`` file first and is then moved over the
+    target, so an interrupted write cannot leave a truncated cache file behind.
+    """
+    CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
+    tmp_path = CACHE_PATH.with_name(CACHE_PATH.name + ".tmp")
+    with open(tmp_path, "w", encoding="utf-8") as f:
+        json.dump(cache, f, ensure_ascii=False, indent=2, sort_keys=True)
+    tmp_path.replace(CACHE_PATH)
+
+
+def run(limit: int | None = None, dry_run: bool = False, single_word: str | None = None) -> dict:
+    """
+    Fetch images for all Noun-PoS words in pealim_dict_for_anki.csv.
+
+    Rows are skipped when they have no word/meaning, are not nouns, do not
+    match ``single_word`` (when given), or already have a cache entry.
+    Abstract-looking meanings are recorded as ``None`` without any request.
+
+    Args:
+        limit: Stop after this many fetch candidates (a falsy value means no limit).
+        dry_run: Only log what would be fetched. Nothing is downloaded and the
+            on-disk cache is not written, so a later real run still fetches.
+        single_word: Restrict processing to this no-nikkud word.
+
+    Returns:
+        The image cache dict (word_no_nikkud → filename or None); an empty
+        dict when no dictionary CSV is found.
+    """
+    import pandas as pd
+
+    dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
+    if not dict_csv.exists():
+        dict_csv = DATA_DIR / "pealim_dict.csv"
+    if not dict_csv.exists():
+        logger.error("Dictionary CSV not found")
+        return {}
+
+    # Try ';' as separator first; fall back to ',' when that yields too few columns
+    try:
+        df = pd.read_csv(dict_csv, sep=";", index_col=0)
+        if df.shape[1] < 3:
+            raise ValueError
+    except Exception:
+        df = pd.read_csv(dict_csv, index_col=0)
+
+    cache = load_cache()
+    processed = 0
+    hits = 0
+    skipped_abstract = 0
+    skipped_cached = 0
+    # Candidates seen during a dry run. They are tracked here instead of being
+    # written into the cache as None, which would make later real runs skip them.
+    dry_run_seen: set[str] = set()
+
+    for _, row in df.iterrows():
+        if limit and processed >= limit:
+            break
+
+        word      = str(row.get("Word", "")).strip()
+        meaning   = str(row.get("Meaning", "")).strip()
+        word_plain = str(row.get("Word Without Nikkud", "")).strip()
+        pos_raw   = str(row.get("Part of speech", row.get("Part of Speech", ""))).strip()
+
+        if not word or not meaning or meaning in ("nan", "None"):
+            continue
+        if "nan" in pos_raw.lower() or "Noun" not in pos_raw:
+            continue
+
+        if single_word and word_plain != single_word:
+            continue
+
+        cache_key = word_plain or _strip_nikkud(word)
+
+        if cache_key in cache or cache_key in dry_run_seen:
+            skipped_cached += 1
+            continue
+
+        if not is_concrete(meaning):
+            skipped_abstract += 1
+            cache[cache_key] = None
+            continue
+
+        processed += 1
+        logger.info(f"[{processed}] {word_plain!r} ({meaning!r}) …")
+
+        if dry_run:
+            logger.info("  [dry-run] would fetch image")
+            dry_run_seen.add(cache_key)
+            continue
+
+        # NOTE(review): a failed lookup is stored as None and therefore not
+        # retried on later runs — confirm that is intended for transient errors.
+        filename = get_image(meaning, cache_key)
+        cache[cache_key] = filename
+        if filename:
+            hits += 1
+
+        # Save cache periodically
+        if processed % 10 == 0:
+            save_cache(cache)
+
+    # A dry run must not modify the on-disk cache
+    if not dry_run:
+        save_cache(cache)
+
+    logger.info(
+        f"Image fetch complete: {hits} found, "
+        f"{processed - hits} not found, "
+        f"{skipped_abstract} abstract (skipped), "
+        f"{skipped_cached} cached"
+    )
+    return cache
+
+
+def main() -> None:
+    """CLI entry point: parse the flags, run the fetch, and print a short summary."""
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+
+    parser = argparse.ArgumentParser(description="Fetch images for concrete Hebrew nouns")
+    parser.add_argument("--limit", type=int, metavar="N", help="Process at most N nouns (for testing)")
+    parser.add_argument("--dry-run", action="store_true", help="Don't download, just check concreteness")
+    parser.add_argument("--word", metavar="WORD", help="Fetch image for a specific word (no-nikkud form)")
+    options = parser.parse_args()
+
+    result = run(limit=options.limit, dry_run=options.dry_run, single_word=options.word)
+    with_images = [(key, filename) for key, filename in result.items() if filename]
+    print(f"\n{len(with_images)} words with images (of {len(result)} in cache)")
+    sample = with_images[:5]
+    if sample:
+        print("Sample:", sample)
+
+
+if __name__ == "__main__":
+    main()
--- a/70
+++ b/70
@ -0,0 +1,70 @@
+לשמור
+ללמוד
+לאסוף
+לעבוד
+לחבוש
+לאכול
+לשאול
+לשלוח
+לגבוה
+לשבת
+לרשת
+לפול
+לקום
+לשים
+לחון
+לקרוא
+לקנות
+להיבדק
+להרדם
+לההרג
+להחקר
+להישאר
+להיפגע
+להיוולד
+להנצל
+להיסוג
+להימצא
+להיבנות
+לדבר
+לברך
+לנהל
+לנצח
+לקומם
+למלא
+לחכות
+לגלגל
+בותל
+תואם
+קומם
+דוכא
+זוכה
+פורסם
+להתלבש
+להסתלק
+להצטלם
+להזדקק
+להתנהג
+להתלקלח
+להתקומם
+להתפלא
+להתגלות
+להתקלקל
+להכניס
+להעסיק
+להחליט
+להבטיח
+להוריד
+להפיל
+להקים
+להקלל
+המציא
+להרשות
+הוגבל
+העבר
+הוזהר
+הופל
+הוקם
+הוחל
+הוקפא
+הופנה
--- a/run.py
+++ b/run.py
@ -10,6 +10,7 @@ Options:
  --skip-audio         Skip audio .mp3 downloads
  --skip-examples      Skip Ben Yehuda example fetching
  --skip-conjugations  Skip verb conjugation extraction
+  --skip-images        Skip image fetching for concrete nouns
  --refresh-examples   Force rebuild of Ben Yehuda index (delete old, download nikkud corpus)
  --test N             Process only the first N dictionary words (for quick testing)
 """
@ -34,6 +35,7 @@ DATA_DIR       = Path(__file__).parent / "data"
 OUTPUT_DIR     = Path(__file__).parent / "output"
 AUDIO_DIR      = DATA_DIR / "audio"
 AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
+FONTS_DIR      = DATA_DIR / "fonts"


 def parse_args():
@ -42,6 +44,7 @@ def parse_args():
    p.add_argument("--skip-audio",         action="store_true", help="Skip audio downloads")
    p.add_argument("--skip-examples",      action="store_true", help="Skip Ben Yehuda example lookup")
    p.add_argument("--skip-conjugations",  action="store_true", help="Skip verb conjugation extraction")
+    p.add_argument("--skip-images",        action="store_true", help="Skip image fetching")
    p.add_argument("--refresh-examples",   action="store_true", help="Force rebuild of Ben Yehuda index")
    p.add_argument("--test",               type=int, metavar="N", help="Limit to first N words")
    return p.parse_args()
@ -270,7 +273,77 @@ def step_conj_audio(args, conjugations: dict):
    )


-def step_build_vocab(args, examples_cache: dict, freq_cache: dict):
+def step_fonts(args):
+    """Step 4c — download Heebo font files (one-time, cached).
+
+    Does nothing when both ``_Heebo-Regular.ttf`` and ``_Heebo-Bold.ttf``
+    already exist in FONTS_DIR. Otherwise the Google Fonts stylesheet is
+    fetched and each ``@font-face`` block is matched to its destination by
+    ``font-weight`` (400 → regular, 700 → bold); only ``.ttf`` sources are
+    saved. Any failure is logged as a warning with manual-install instructions;
+    nothing is raised.
+    """
+    FONTS_DIR.mkdir(parents=True, exist_ok=True)
+    regular = FONTS_DIR / "_Heebo-Regular.ttf"
+    bold    = FONTS_DIR / "_Heebo-Bold.ttf"
+
+    if regular.exists() and bold.exists():
+        logger.info("[4c] Heebo fonts already cached")
+        return
+
+    logger.info("[4c] Downloading Heebo fonts from Google Fonts …")
+
+    import requests as _req
+    css_url = "https://fonts.googleapis.com/css2?family=Heebo:wght@400;700"
+    # Pick the destination from the declared weight, not from the position in the CSS
+    dest_by_weight = {"400": regular, "700": bold}
+    try:
+        # NOTE(review): Google Fonts tailors the stylesheet to the User-Agent.
+        # No browser UA is sent, on the assumption that a non-browser client is
+        # offered TrueType sources — confirm. Non-.ttf sources are skipped below
+        # rather than being saved under a .ttf name.
+        css_resp = _req.get(css_url, timeout=15)
+        css_resp.raise_for_status()
+        css_text = css_resp.text
+
+        blocks = re.findall(r"@font-face\s*\{(.*?)\}", css_text, flags=re.DOTALL)
+        logger.debug(f"    Found {len(blocks)} @font-face block(s) in CSS")
+
+        for block in blocks:
+            weight_m = re.search(r"font-weight:\s*(\d+)", block)
+            url_m = re.search(r"src:\s*url\(([^)]+)\)", block)
+            if not (weight_m and url_m):
+                continue
+            dest = dest_by_weight.get(weight_m.group(1))
+            if dest is None or dest.exists():
+                continue
+            fu = url_m.group(1).strip("'\"")
+            if not fu.lower().split("?")[0].endswith(".ttf"):
+                logger.debug(f"    Skipping non-TTF source: {fu}")
+                continue
+            fr = _req.get(fu, timeout=15)
+            fr.raise_for_status()
+            dest.write_bytes(fr.content)
+            logger.info(f"    Downloaded → {dest.name}")
+
+        missing = [p.name for p in (regular, bold) if not p.exists()]
+        if missing:
+            # Routed through the handler below so the fallback hints are printed once
+            raise RuntimeError(f"no TrueType source found for {', '.join(missing)}")
+
+    except Exception as e:
+        logger.warning(f"    Heebo download failed: {e}")
+        logger.warning("    Cards will fall back to Arial Hebrew / David.")
+        logger.warning(
+            "    To install manually: download Heebo-Regular.ttf and Heebo-Bold.ttf "
+            "from https://fonts.google.com/specimen/Heebo and rename with _ prefix "
+            f"into {FONTS_DIR}"
+        )
+
+
+def step_images(args) -> dict:
+    """Step 4d — fetch images for concrete nouns (resume-safe).
+
+    With ``--skip-images`` nothing is fetched: the existing
+    data/image_cache.json is returned if present, otherwise an empty dict.
+    Without it, ``image_fetch.run`` is called with ``args.test`` as the limit
+    and its cache dict is returned.
+    """
+    if args.skip_images:
+        logger.info("[4d] Skipping images (--skip-images)")
+        cache_path = DATA_DIR / "image_cache.json"
+        if cache_path.exists():
+            # The cache is written as UTF-8 with raw (non-escaped) Hebrew keys;
+            # read it the same way instead of relying on the locale default.
+            with open(cache_path, encoding="utf-8") as f:
+                return json.load(f)
+        return {}
+
+    limit = args.test  # When in test mode, limit images too
+    logger.info("[4d] Fetching images for concrete nouns …")
+    import image_fetch
+    return image_fetch.run(limit=limit)
+
+
+def step_build_vocab(args, examples_cache: dict, freq_cache: dict, image_cache: dict = None):
    """Step 5 — build vocabulary .apkg."""
    logger.info("[5] Building vocabulary deck …")
    import apkg_builder
@ -283,6 +356,7 @@ def step_build_vocab(args, examples_cache: dict, freq_cache: dict):
        dict_csv,
        examples_cache=examples_cache,
        freq_cache=freq_cache,
+        image_cache=image_cache or {},
        limit=args.test,
    )
    apkg_builder.write_vocab_apkg(deck, media)
@ -349,6 +423,13 @@ def print_summary(args, examples_cache, freq_cache, conjugations):
        mp3s = list(AUDIO_CONJ_DIR.glob("*.mp3"))
        logger.info(f"  Conjugation audio files: {len(mp3s)}")

+    image_cache_path = DATA_DIR / "image_cache.json"
+    if image_cache_path.exists():
+        with open(image_cache_path) as f:
+            ic = json.load(f)
+        found_imgs = sum(1 for v in ic.values() if v)
+        logger.info(f"  Images: {found_imgs}/{len(ic)} nouns with images")
+
    vocab_apkg = OUTPUT_DIR / "pealim_vocabulary.apkg"
    conj_apkg  = OUTPUT_DIR / "pealim_conjugations.apkg"
    if vocab_apkg.exists():
@ -380,7 +461,9 @@ def main():
    freq_cache     = step_frequency()
    examples_cache = step_examples(args, freq_cache)
    step_audio(args)
-    step_build_vocab(args, examples_cache, freq_cache)
+    step_fonts(args)
+    image_cache    = step_images(args)
+    step_build_vocab(args, examples_cache, freq_cache, image_cache)
    conjugations = step_conjugations(args)

    print_summary(args, examples_cache, freq_cache, conjugations or {})
--- a/validate_verb_list.py
+++ b/validate_verb_list.py
@ -0,0 +1,251 @@
+#!/usr/bin/env python3
+"""
+Validate nevo_typed_verbs_from_modern_hebrew against pealim.com.
+
+For each verb:
+  1. Classifies it by position in the file (Pa'al/Nif'al/Pi'el/Pu'al/Hitpa'el/Hif'il/Huf'al)
+  2. Searches pealim.com to find URL slug
+  3. Fetches the page to confirm the binyan
+  4. Flags hand-listed known-problem entries (KNOWN_ISSUES, e.g. suspected typos) and detects: not-found, binyan mismatch
+
+Output:
+  verbs_input.txt  — cleaned verb list for conjugation_extract.py
+  Printed validation report table
+
+Usage:
+  python3 validate_verb_list.py
+
+After running, review verbs_input.txt (especially REVIEW-flagged entries) before
+running conjugation extraction.
+"""
+
+import re
+import sys
+import time
+import urllib.parse
+from pathlib import Path
+
+import requests
+from bs4 import BeautifulSoup
+
+# Site root; the search URL and the /dict/<slug>/ URL are both built from it.
+PEALIM_BASE    = "https://www.pealim.com"
+# Seconds passed to time.sleep() before each request to pealim.com.
+REQUEST_DELAY  = 1.5
+# Passed as the `timeout` argument of every session.get() call.
+REQUEST_TIMEOUT = 15
+# Input list and generated output both live next to this script.
+SOURCE_FILE    = Path(__file__).parent / "nevo_typed_verbs_from_modern_hebrew"
+OUTPUT_FILE    = Path(__file__).parent / "verbs_input.txt"
+
+# Known problem entries: word → (action, note)
+# action: "REVIEW" = comment out and flag, "3ms" = treat as 3ms past form
+# main() skips the pealim.com lookup entirely for "REVIEW" entries.
+KNOWN_ISSUES: dict[str, tuple[str, str]] = {
+    "לגבוה":   ("REVIEW", "not a standard infinitive form; likely defective spelling or wrong word"),
+    "לההרג":   ("REVIEW", "extra ה; should probably be להיהרג (Nif'al of הרג)"),
+    "להתלקלח": ("REVIEW", "not a real word; likely typo for להתקלקל"),
+    "להקלל":   ("REVIEW", "ambiguous: could be Hif'il לְהָקֵל (to ease) or Nif'al of קלל"),
+    "המציא":   ("3ms",    "Hif'il 3ms past form, not an infinitive"),
+    "קומם":    ("3ms",    "ambiguous: Pu'al 3ms past; Pi'el infinitive is לְקוֹמֵם"),
+}
+
+# Expected binyan by line range (1-indexed) per plan analysis
+# Line numbers count non-blank entries only (main() drops blank lines before
+# enumerating). Each range is half-open, so together they cover entries 1-70;
+# classify_by_line() returns "Unknown" for anything outside them.
+LINE_RANGES: list[tuple[range, str]] = [
+    (range(1,  18),  "Pa'al"),
+    (range(18, 29),  "Nif'al"),
+    (range(29, 37),  "Pi'el"),
+    (range(37, 43),  "Pu'al"),
+    (range(43, 53),  "Hitpa'el"),
+    (range(53, 63),  "Hif'il"),
+    (range(63, 71),  "Huf'al"),
+]
+
+# Header comment written to verbs_input.txt above each binyan's entries.
+# Dict order is the order in which the sections appear in the output file.
+SECTION_HEADERS: dict[str, str] = {
+    "Pa'al":    "# Pa'al (פָּעַל)",
+    "Nif'al":   "# Nif'al (נִפְעַל)",
+    "Pi'el":    "# Pi'el (פִּעֵל)",
+    "Pu'al":    "# Pu'al (פֻּעַל) — 3ms past, no infinitive",
+    "Hitpa'el": "# Hitpa'el (הִתְפַּעֵל)",
+    "Hif'il":   "# Hif'il (הִפְעִיל)",
+    "Huf'al":   "# Huf'al (הֻפְעַל) — 3ms past, no infinitive",
+}
+
+# One shared HTTP session so every request carries the same User-Agent header.
+session = requests.Session()
+session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki/3.0)"})
+
+
+def classify_by_line(line_num: int) -> str:
+    """Map a 1-indexed entry position to its expected binyan.
+
+    Looks the position up in LINE_RANGES and yields "Unknown" when no range
+    contains it.
+    """
+    matching = (name for span, name in LINE_RANGES if line_num in span)
+    return next(matching, "Unknown")
+
+
+def find_slug(query: str) -> str | None:
+    """Look *query* up via the pealim.com search page.
+
+    Returns the first ``<digits>-<name>`` slug that appears in a
+    ``/dict/<slug>/`` link of the response body, or ``None`` when there is no
+    such link or the request fails (failures are reported on stderr).
+    """
+    search_url = PEALIM_BASE + "/search/?q=" + urllib.parse.quote(query)
+    try:
+        response = session.get(search_url, timeout=REQUEST_TIMEOUT)
+        response.raise_for_status()
+        first_hit = re.search(r"/dict/(\d+[-][^/?\"<>\s]+)/", response.text)
+    except Exception as err:
+        # Best effort: any failure is logged and treated as "no result".
+        print(f"  ERROR searching {query!r}: {err}", file=sys.stderr)
+        return None
+    if first_hit is None:
+        return None
+    return first_hit.group(1)
+
+
+def get_page_binyan(slug: str) -> str:
+    """Download ``/dict/<slug>/`` and report which binyan name the page mentions.
+
+    The ``h3.page-header`` elements are checked first, then the
+    ``og:description`` meta tag. Returns "" when no binyan name is found or
+    when the fetch/parse fails (failures are reported on stderr).
+    """
+    known_binyanim = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al")
+    page_url = f"{PEALIM_BASE}/dict/{slug}/"
+    try:
+        response = session.get(page_url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.text, "lxml")
+
+        # Candidate texts in priority order: page headers, then the meta description.
+        candidates = [
+            header.get_text(" ", strip=True)
+            for header in soup.find_all("h3", class_="page-header")
+        ]
+        og_description = soup.find("meta", {"property": "og:description"})
+        if og_description:
+            candidates.append(og_description.get("content", ""))
+
+        for text in candidates:
+            for name in known_binyanim:
+                if name in text:
+                    return name
+    except Exception as err:
+        print(f"  ERROR fetching {slug}: {err}", file=sys.stderr)
+    return ""
+
+
+def _check_entries(words: list[str]) -> list[dict]:
+    """Look every entry up on pealim.com and classify it.
+
+    Args:
+        words: Non-blank entries of the source file, in file order.
+
+    Returns:
+        One record per entry with the keys ``line``, ``word``,
+        ``expected_binyan``, ``slug``, ``page_binyan``, ``status``, ``notes``
+        and ``is_3ms``. ``status`` is one of "OK", "3ms", "MISMATCH",
+        "REVIEW" or "NOT_FOUND".
+    """
+    results = []
+    total = len(words)
+
+    for line_num, word in enumerate(words, start=1):
+        expected_binyan = classify_by_line(line_num)
+        issue_type, issue_note = KNOWN_ISSUES.get(word, (None, ""))
+
+        # Positions 37-42 (Pu'al) and 63-70 (Huf'al) are 3ms past forms
+        is_3ms_by_position = expected_binyan in ("Pu'al", "Huf'al")
+
+        print(f"[{line_num:2d}/{total}] {word:<20}", end=" ", flush=True)
+
+        if issue_type == "REVIEW":
+            # Don't query pealim for known-bad entries
+            print("REVIEW  (skipping query)")
+            results.append({
+                "line": line_num, "word": word,
+                "expected_binyan": expected_binyan,
+                "slug": "", "page_binyan": "",
+                "status": "REVIEW", "notes": issue_note,
+                "is_3ms": is_3ms_by_position,
+            })
+            continue
+
+        # Throttle: pause before every request sent to the site.
+        time.sleep(REQUEST_DELAY)
+        slug = find_slug(word)
+
+        page_binyan = ""
+        if slug:
+            time.sleep(REQUEST_DELAY)
+            page_binyan = get_page_binyan(slug)
+
+        # Determine status. A 3ms classification takes precedence over
+        # NOT_FOUND and MISMATCH, so those are only reported for other entries.
+        if issue_type == "3ms" or is_3ms_by_position:
+            status = "3ms"
+            notes = issue_note or "Pu'al/Huf'al 3ms past form"
+        elif not slug:
+            status = "NOT_FOUND"
+            notes = "no search result on pealim.com"
+        elif page_binyan and expected_binyan and page_binyan != expected_binyan:
+            status = "MISMATCH"
+            notes = f"expected {expected_binyan}, page says {page_binyan}"
+        else:
+            status = "OK"
+            notes = ""
+
+        print(f"{status:<12}  slug={slug or '-':<35}  binyan={page_binyan or '-'}")
+        results.append({
+            "line": line_num, "word": word,
+            "expected_binyan": expected_binyan,
+            "slug": slug or "", "page_binyan": page_binyan,
+            "status": status, "notes": notes,
+            "is_3ms": is_3ms_by_position or issue_type == "3ms",
+        })
+
+    return results
+
+
+def _build_output_lines(results: list[dict]) -> list[str]:
+    """Render the validation records as the lines of verbs_input.txt.
+
+    Entries are grouped under their expected binyan in SECTION_HEADERS order;
+    REVIEW entries are collected in a trailing block of comment lines.
+    """
+    sections: dict[str, list[str]] = {b: [] for b in SECTION_HEADERS}
+    review_lines: list[str] = []
+
+    for r in results:
+        b = r["expected_binyan"]
+        if b not in sections:
+            # Entries outside LINE_RANGES ("Unknown") are filed under the first section.
+            b = next(iter(sections))
+
+        if r["status"] == "REVIEW":
+            review_lines.append(f"# REVIEW: {r['word']}  — {r['notes']}")
+        elif r["status"] == "3ms":
+            sections[b].append(f"# 3ms: {r['word']}")
+        elif r["status"] in ("OK", "MISMATCH"):
+            # MISMATCH entries are kept as plain verbs; they are only called
+            # out in the printed report.
+            sections[b].append(r["word"])
+        else:  # NOT_FOUND
+            sections[b].append(f"# NOT_FOUND: {r['word']}  — {r['notes']}")
+
+    output_lines = [
+        "# Verb list — validated against pealim.com from nevo_typed_verbs_from_modern_hebrew",
+        "# Lines prefixed '# 3ms:' are searched by 3ms past form (Pu'al/Huf'al).",
+        "# Lines prefixed '# REVIEW:' need manual correction before conjugation extraction.",
+        "# Lines prefixed '# NOT_FOUND:' had no pealim.com result — check spelling.",
+        "",
+    ]
+    for binyan, header in SECTION_HEADERS.items():
+        if sections[binyan]:
+            output_lines.append(header)
+            output_lines.extend(sections[binyan])
+            output_lines.append("")
+
+    if review_lines:
+        output_lines.append("# ── Entries flagged for manual review ──────────────────────────────────────────")
+        output_lines.extend(review_lines)
+        output_lines.append("")
+
+    return output_lines
+
+
+def _print_report(results: list[dict]) -> None:
+    """Print the per-entry validation table and the status summary to stdout."""
+    print("\n" + "=" * 95)
+    print("VALIDATION REPORT")
+    print("=" * 95)
+    print(f"{'#':>4}  {'Verb':<22} {'Status':<14} {'Slug':<38} {'Binyan':<12}  Notes")
+    print("-" * 95)
+    for r in results:
+        print(
+            f"{r['line']:>4}  {r['word']:<22} {r['status']:<14} "
+            f"{r['slug'][:36]:<38} {r['page_binyan'] or '-':<12}  {r['notes']}"
+        )
+    print("=" * 95)
+
+    counts = {s: sum(1 for r in results if r["status"] == s)
+              for s in ("OK", "3ms", "MISMATCH", "REVIEW", "NOT_FOUND")}
+    print(
+        f"\nSummary: {counts['OK']} OK | {counts['3ms']} 3ms-past | "
+        f"{counts['MISMATCH']} MISMATCH | {counts['REVIEW']} REVIEW | {counts['NOT_FOUND']} NOT_FOUND"
+    )
+    print(f"Total entries: {len(results)}")
+
+    if counts["REVIEW"] > 0 or counts["NOT_FOUND"] > 0 or counts["MISMATCH"] > 0:
+        print(
+            "\n⚠  Review flagged entries in verbs_input.txt before running:\n"
+            "   python3 conjugation_extract.py"
+        )
+
+
+def main() -> None:
+    """Validate SOURCE_FILE against pealim.com, write OUTPUT_FILE, print a report.
+
+    Exits with status 1 when SOURCE_FILE does not exist.
+    """
+    if not SOURCE_FILE.exists():
+        print(f"ERROR: {SOURCE_FILE} not found", file=sys.stderr)
+        sys.exit(1)
+
+    # Blank lines are dropped, so positions used for classification count entries only.
+    lines = [raw.strip() for raw in SOURCE_FILE.read_text(encoding="utf-8").splitlines() if raw.strip()]
+    print(f"Loaded {len(lines)} entries from {SOURCE_FILE.name}")
+    print(f"Querying pealim.com (delay {REQUEST_DELAY}s per request)…\n")
+
+    results = _check_entries(lines)
+
+    # ── Write cleaned verbs_input.txt ────────────────────────────────────────────
+    OUTPUT_FILE.write_text("\n".join(_build_output_lines(results)), encoding="utf-8")
+    print(f"\nWrote → {OUTPUT_FILE}")
+
+    # ── Print summary table ──────────────────────────────────────────────────────
+    _print_report(results)
+
+
+if __name__ == "__main__":
+    main()
--- a/verbs_input.txt
+++ b/verbs_input.txt
@ -1,91 +1,90 @@
-# Verb list from Coffin & Bolozky, A Reference Grammar of Modern Hebrew (2005), Appendix 1.
-# Citation: Coffin, Edna Amir and Shmuel Bolozky. A Reference Grammar of Modern Hebrew.
-#   Cambridge University Press, 2005.
-# Lines prefixed '# 3ms:' are Pu'al/Huf'al verbs searched by 3ms past form.
+# Verb list — validated against pealim.com from nevo_typed_verbs_from_modern_hebrew
+# Lines prefixed '# 3ms:' are searched by 3ms past form (Pu'al/Huf'al).
+# Lines prefixed '# REVIEW:' need manual correction before conjugation extraction.
+# Lines prefixed '# NOT_FOUND:' had no pealim.com result — check spelling.

 # Pa'al (פָּעַל)
-לָלֶכֶת
-לָבוֹא
-לָשֶׁבֶת
-לָקוּם
-לָשִׂים
-לָדַעַת
-לִרְאוֹת
-לוֹמַר
-לַעֲשׂוֹת
-לִתֵּן
-לִקְחַת
-לֶאֱכֹל
-לִשְׁתּוֹת
-לִכְתּוֹב
-לִקְרוֹא
-לִשְׁמוֹר
-לִשְׁמֹעַ
-לִפְתּוֹחַ
-לִסְגּוֹר
-לִנְסוֹעַ
-לִרְכּוֹב
-לִשְׁכַּב
-לַחְשׁוֹב
-לִבְכּוֹת
-לָרוּץ
-לִשְׁאֹל
-לַעֲנוֹת
-לִמְכּוֹר
-לִקְנוֹת
-לִלְמֹד
+לשמור
+ללמוד
+לאסוף
+לעבוד
+לחבוש
+לאכול
+לשאול
+לשלוח
+לשבת
+לרשת
+לפול
+לקום
+לשים
+לחון
+לקרוא
+לקנות

 # Nif'al (נִפְעַל)
-לְהִכָּנֵס
-לְהִפָּתַח
-לְהִסָּגֵר
-לְהִשָּׁמֵר
-לְהִמָּצֵא
-לְהִרְאוֹת
-לְהִכָּתֵב
-לְהִשָּׁבֵר
+להיבדק
+להרדם
+להחקר
+להישאר
+להיפגע
+להיוולד
+להנצל
+להיסוג
+להימצא
+להיבנות

 # Pi'el (פִּעֵל)
-לְדַבֵּר
-לְסַפֵּר
-לְבַקֵּשׁ
-לְקַבֵּל
-לְשַׁלֵּם
-לְצַלֵּם
-לְנַסּוֹת
-לְחַכּוֹת
-לְטַלְפֵן
-לְבַשֵּׁל
+לדבר
+לברך
+לנהל
+לנצח
+לקומם
+למלא
+לחכות
+לגלגל

 # Pu'al (פֻּעַל) — 3ms past, no infinitive
-# 3ms: דֻּבַּר
-# 3ms: סֻפַּר
-# 3ms: בֻּקַּשׁ
-# 3ms: קֻבַּל
+# 3ms: בותל
+# 3ms: תואם
+# 3ms: קומם
+# 3ms: דוכא
+# 3ms: זוכה
+# 3ms: פורסם

 # Hitpa'el (הִתְפַּעֵל)
-לְהִתְלַבֵּשׁ
-לְהִתְרַחֵץ
-לְהִתְנַהֵג
-לְהִתְחַתֵּן
-לְהִתְגּוֹרֵר
-לְהִתְכּוֹנֵן
-לְהִתְחִיל
+להתלבש
+להסתלק
+להצטלם
+להזדקק
+להתנהג
+להתקומם
+להתפלא
+להתגלות
+להתקלקל

 # Hif'il (הִפְעִיל)
-לְהַגִּיד
-לְהַבִּין
-לְהַכִּיר
-לְהַרְגִּישׁ
-לְהַחְלִיט
-לְהַתְחִיל
-לְהַכְנִיס
-לְהוֹצִיא
-לְהוֹרִיד
-לְהַעְלוֹת
+להכניס
+להעסיק
+להחליט
+להבטיח
+להוריד
+להפיל
+להקים
+# 3ms: המציא
+להרשות

 # Huf'al (הֻפְעַל) — 3ms past, no infinitive
-# 3ms: הוּגַד
-# 3ms: הוּבַן
-# 3ms: הוּכְנַס
-# 3ms: הוּצָא
+# 3ms: הוגבל
+# 3ms: העבר
+# 3ms: הוזהר
+# 3ms: הופל
+# 3ms: הוקם
+# 3ms: הוחל
+# 3ms: הוקפא
+# 3ms: הופנה
+
+# ── Entries flagged for manual review ──────────────────────────────────────────
+# REVIEW: לגבוה  — not a standard infinitive form; likely defective spelling or wrong word
+# REVIEW: לההרג  — extra ה; should probably be להיהרג (Nif'al of הרג)
+# REVIEW: להתלקלח  — not a real word; likely typo for להתקלקל
+# REVIEW: להקלל  — ambiguous: could be Hif'il לְהָקֵל (to ease) or Nif'al of קלל