feat: add apkg builder, frequency, Ben Yehuda examples, conjugation deck

Implements four major improvements to the Pealim Anki deck pipeline: 1. Automated .apkg generation (genanki) — no more manual Anki Desktop step. Both vocabulary and conjugation decks are built programmatically. 2. Word frequency ranking from hermitdave/FrequencyWords he_50k corpus. Notes sorted by rank so Anki presents most common words first. 3. Example sentences from Ben Yehuda public domain corpus (not pealim.com). Downloads txt_stripped.zip, indexes 25k texts, ~89% coverage on test set. 4. Conjugation drill deck — one card per form × verb. Input: verbs_input.txt (Hebrew infinitives). Initial set: 7 verbs (one per binyan). Extracts 28 forms each via pealim.com/search/ + table parse. New files: apkg_builder.py — genanki deck builder for both decks benyehuda.py — Ben Yehuda corpus downloader + sentence indexer frequency_lookup.py — FrequencyWords downloader + rank lookup verbs_input.txt — verb input list (7 test verbs, one per binyan) data/ — baseline CSVs + generated caches Updated: conjugation_extract.py — rewritten: reads verbs_input.txt, searches /search/?q= for slug, parses table by row labels requirements.txt — add genanki, beautifulsoup4, lxml run.py — full orchestration pipeline with CLI flags .gitignore — exclude venv/, benyehuda_index.json, audio/, output/ CLI: python run.py --skip-scrape --skip-audio --test 20 (quick test) python run.py --skip-scrape (full build) Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 01:58:31 +00:00 · 2026-03-03 01:58:31 +00:00 · b086123bec
commit b086123bec
parent e23b353064
13 changed files with 23502 additions and 162 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,3 +8,15 @@ lib**
 include**
 lib64**
 pyvenv.cfg
+venv/
+__pycache__/
+*.pyc
+
+# Large generated cache files (rebuild locally)
+data/benyehuda_index.json
+
+# Audio directory (large; rebuild with --skip-scrape)
+data/audio/
+
+# Output .apkg files (generated by pipeline)
+output/
--- a/apkg_builder.py
+++ b/apkg_builder.py
@ -0,0 +1,428 @@
+#!/usr/bin/env python3
+"""
+Build Anki .apkg files for both the vocabulary deck and the conjugation deck.
+Uses genanki for reliable, stable deck generation.
+
+Deck IDs are hardcoded integers — same ID on re-import updates the existing deck
+in Anki rather than creating a duplicate.
+"""
+
+import json
+import logging
+import unicodedata
+from pathlib import Path
+from typing import Optional
+
+import genanki
+import pandas as pd
+
+logger = logging.getLogger(__name__)
+
+# Stable deck/model IDs — do not change these
+# Re-importing a package whose deck ID matches an existing deck updates that
+# deck instead of creating a duplicate (see the module docstring).
+# NOTE(review): these are sequential, sample-looking values; genanki's docs
+# recommend generating a random ID once per deck/model so it cannot collide
+# with other people's decks — confirm before the decks are shared publicly.
+VOCAB_DECK_ID   = 1_234_567_890
+VOCAB_MODEL_ID  = 1_234_567_891
+CONJ_DECK_ID    = 1_234_567_892
+CONJ_MODEL_ID   = 1_234_567_893
+
+# Directories are resolved relative to this source file.
+DATA_DIR   = Path(__file__).parent / "data"
+AUDIO_DIR  = DATA_DIR / "audio"                   # <word>.mp3 clips looked up by _audio_tag()
+OUTPUT_DIR = Path(__file__).parent / "output"
+
+# Default output locations used by write_vocab_apkg() / write_conj_apkg().
+VOCAB_APKG  = OUTPUT_DIR / "pealim_vocabulary.apkg"
+CONJ_APKG   = OUTPUT_DIR / "pealim_conjugations.apkg"
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Shared CSS
+# ──────────────────────────────────────────────────────────────────────────────
+
+CARD_CSS = """
+.card {
+  font-family: Arial, sans-serif;
+  font-size: 20px;
+  text-align: center;
+  color: #222;
+  background: #fff;
+  padding: 16px;
+}
+.hebrew {
+  font-size: 36px;
+  font-weight: bold;
+  direction: rtl;
+  text-align: right;
+  line-height: 1.5;
+  color: #1a1a8c;
+}
+.hebrew-sm {
+  font-size: 24px;
+  direction: rtl;
+  text-align: right;
+  color: #333;
+}
+.label {
+  font-size: 13px;
+  color: #888;
+  text-transform: uppercase;
+  letter-spacing: 0.05em;
+  margin-top: 10px;
+}
+.meaning {
+  font-size: 28px;
+  color: #111;
+  margin: 8px 0;
+}
+.root-info {
+  font-size: 16px;
+  color: #555;
+  margin-top: 6px;
+  direction: rtl;
+}
+.example {
+  font-size: 16px;
+  color: #444;
+  direction: rtl;
+  text-align: right;
+  font-style: italic;
+  margin-top: 10px;
+  border-left: 3px solid #aaa;
+  padding-left: 8px;
+}
+.divider { border-top: 1px solid #ddd; margin: 10px 0; }
+.freq-badge {
+  display: inline-block;
+  font-size: 12px;
+  color: #fff;
+  background: #0070c0;
+  border-radius: 10px;
+  padding: 2px 8px;
+  margin-top: 4px;
+}
+"""
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Vocabulary Deck
+# ──────────────────────────────────────────────────────────────────────────────
+
+VOCAB_FRONT_HEB = """
+<div class="hebrew">{{Word}}</div>
+{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
+<div class="label">What does this mean?</div>
+"""
+
+VOCAB_BACK_HEB = """
+{{FrontSide}}
+<div class="divider"></div>
+<div class="meaning">{{Meaning}}</div>
+<div class="label">Root</div>
+<div class="hebrew-sm">{{Root}}</div>
+<div class="label">Part of Speech</div>
+<div style="font-size:15px;color:#555">{{PoS}}</div>
+{{#SharedRoots}}
+<div class="label">Related words (same root)</div>
+<div class="root-info">{{SharedRoots}}</div>
+{{/SharedRoots}}
+{{#Example}}
+<div class="label">Example</div>
+<div class="example">{{Example}}</div>
+{{/Example}}
+{{#Frequency}}<div class="freq-badge">Rank #{{Frequency}}</div>{{/Frequency}}
+"""
+
+VOCAB_FRONT_ENG = """
+<div class="meaning">{{Meaning}}</div>
+<div class="label">Translate to Hebrew</div>
+"""
+
+VOCAB_BACK_ENG = """
+{{FrontSide}}
+<div class="divider"></div>
+<div class="hebrew">{{Word}}</div>
+{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
+<div class="label">Without nikkud</div>
+<div class="hebrew-sm">{{WordNoNikkud}}</div>
+<div class="label">Root</div>
+<div class="hebrew-sm">{{Root}}</div>
+<div class="label">Part of Speech</div>
+<div style="font-size:15px;color:#555">{{PoS}}</div>
+{{#Example}}
+<div class="label">Example</div>
+<div class="example">{{Example}}</div>
+{{/Example}}
+"""
+
+VOCAB_MODEL = genanki.Model(
+    VOCAB_MODEL_ID,
+    "Pealim Hebrew",
+    fields=[
+        {"name": "Word"},
+        {"name": "Root"},
+        {"name": "PoS"},
+        {"name": "Meaning"},
+        {"name": "WordNoNikkud"},
+        {"name": "SharedRoots"},
+        {"name": "Tags"},
+        {"name": "Audio"},
+        {"name": "Example"},
+        {"name": "Frequency"},
+    ],
+    templates=[
+        {
+            "name": "Hebrew → English",
+            "qfmt": VOCAB_FRONT_HEB,
+            "afmt": VOCAB_BACK_HEB,
+        },
+        {
+            "name": "English → Hebrew",
+            "qfmt": VOCAB_FRONT_ENG,
+            "afmt": VOCAB_BACK_ENG,
+        },
+    ],
+    css=CARD_CSS,
+)
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Conjugation Deck
+# ──────────────────────────────────────────────────────────────────────────────
+
+CONJ_FRONT = """
+<div class="label">פועל (Verb)</div>
+<div class="hebrew">{{ReferenceForm}}</div>
+{{#Pronoun}}<div class="hebrew-sm">{{Pronoun}}</div>{{/Pronoun}}
+<div class="label">זמן (Tense)</div>
+<div class="hebrew-sm">{{Tense}}</div>
+<div class="label">מה הצורה? (What is the form?)</div>
+"""
+
+CONJ_BACK = """
+{{FrontSide}}
+<div class="divider"></div>
+<div class="hebrew">{{ConjugatedForm}}</div>
+<div class="label">שורש (Root): {{Root}} &nbsp;|&nbsp; בניין (Binyan): {{Binyan}}</div>
+"""
+
+CONJ_CSS = CARD_CSS + """
+.card { direction: rtl; }
+.label { direction: ltr; }
+"""
+
+CONJ_MODEL = genanki.Model(
+    CONJ_MODEL_ID,
+    "Pealim Conjugation",
+    fields=[
+        {"name": "Infinitive"},
+        {"name": "ReferenceForm"},
+        {"name": "Pronoun"},
+        {"name": "Tense"},
+        {"name": "ConjugatedForm"},
+        {"name": "Root"},
+        {"name": "Binyan"},
+    ],
+    templates=[
+        {
+            "name": "Conjugation Drill",
+            "qfmt": CONJ_FRONT,
+            "afmt": CONJ_BACK,
+        }
+    ],
+    css=CONJ_CSS,
+)
+
+
+# ──────────────────────────────────────────────────────────────────────────────
+# Helpers
+# ──────────────────────────────────────────────────────────────────────────────
+
+def _strip_nikkud(text: str) -> str:
+    """Return *text* with every non-spacing mark (Unicode category Mn) removed.
+
+    The input is NFD-normalised first, so the result is in decomposed form
+    with nikkud and any other combining marks dropped.
+    """
+    decomposed = unicodedata.normalize("NFD", text)
+    kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
+    return "".join(kept)
+
+
+def _audio_tag(word_no_nikkud: str) -> str:
+    """Build the Anki ``[sound:...]`` tag for a word, or "" when no clip exists.
+
+    Only the Hebrew letters א–ת of *word_no_nikkud* are kept to form the file
+    stem; a tag is produced only if ``<stem>.mp3`` is present in AUDIO_DIR.
+    ``re`` is imported at module level further down this file.
+    """
+    stem = re.sub(r"[^\u05d0-\u05ea]", "", word_no_nikkud)
+    candidate = AUDIO_DIR / f"{stem}.mp3"
+    # An empty stem short-circuits before the filesystem is touched.
+    if stem and candidate.exists():
+        return f"[sound:{candidate.name}]"
+    return ""
+
+
+import re
+
+
+def _clean_cell(value: object, empty_markers: tuple[str, ...] = ("nan", "None")) -> str:
+    """Stringify a CSV cell, mapping missing-value spellings to ""."""
+    text = str(value).strip()
+    return "" if text in empty_markers else text
+
+
+def build_vocab_deck(
+    dict_csv: Path,
+    examples_cache: Optional[dict] = None,
+    freq_cache: Optional[dict] = None,
+    limit: Optional[int] = None,
+) -> tuple[genanki.Deck, list[Path]]:
+    """
+    Build the vocabulary deck from pealim_dict_for_anki.csv (or pealim_dict.csv).
+
+    Args:
+        dict_csv: Dictionary CSV; read with ";" first, then "," as a fallback.
+        examples_cache: Maps a word without nikkud to a list of example
+            sentences; at most two are put on a card.
+        freq_cache: Maps a word without nikkud to its frequency rank; words
+            that are absent sort last and get an empty Frequency field.
+        limit: If truthy, only the first ``limit`` CSV rows are used (the cut
+            is applied before the frequency sort).
+
+    Returns:
+        (deck, list_of_media_files). Media files are the de-duplicated audio
+        paths referenced by the notes that were actually added.
+    """
+    logger.info(f"Loading dictionary from {dict_csv}")
+    # Try semicolon separator first (enriched CSV), fall back to comma.
+    # pandas' ParserError/EmptyDataError and decode errors are all ValueError
+    # subclasses; anything else (e.g. a missing file) propagates unchanged.
+    try:
+        df = pd.read_csv(dict_csv, sep=";", index_col=0)
+        if df.shape[1] < 3:
+            raise ValueError("too few columns")
+    except ValueError:
+        df = pd.read_csv(dict_csv, index_col=0)
+
+    if limit:
+        df = df.head(limit)
+
+    logger.info(f"  {len(df)} rows loaded")
+
+    examples_cache = examples_cache or {}
+    freq_cache = freq_cache or {}
+
+    def plain_word(row) -> str:
+        # Same column fallback for both the sort key and the note field.
+        return _clean_cell(row.get("Word Without Nikkud", row.get("WordNoNikkud", "")))
+
+    # Sort by frequency rank (ascending) so Anki presents common words first.
+    # Ranks are built with a plain loop (safe on an empty frame) and the sort is
+    # stable so unranked words keep their CSV order.
+    ranks = [
+        freq_cache.get(_strip_nikkud(plain_word(row)), 999_999)
+        for _, row in df.iterrows()
+    ]
+    df = df.assign(_freq_rank=ranks).sort_values("_freq_rank", kind="mergesort")
+
+    deck = genanki.Deck(VOCAB_DECK_ID, "Pealim Hebrew Vocabulary")
+    media_files: list[Path] = []
+    seen_media: set[Path] = set()
+
+    for _, row in df.iterrows():
+        word    = _clean_cell(row.get("Word", ""))
+        meaning = _clean_cell(row.get("Meaning", ""))
+        # Skip unusable rows before collecting media so no orphaned audio is packaged.
+        if not word or not meaning:
+            continue
+
+        root         = _clean_cell(row.get("Root", ""), ("nan", "None", "-"))
+        pos          = _clean_cell(row.get("Part of speech", row.get("Part of Speech", "")))
+        word_no_nik  = plain_word(row)
+        shared_roots = _clean_cell(row.get("shared roots", row.get("SharedRoots", "")))
+        tags_str     = _clean_cell(row.get("tags", row.get("Tags", "")))
+        rank         = row["_freq_rank"]
+        freq_field   = str(int(rank)) if rank < 999_999 else ""
+
+        # Audio
+        audio_tag = _audio_tag(word_no_nik)
+        if audio_tag:
+            mp3_path = AUDIO_DIR / audio_tag[len("[sound:"):-1]
+            if mp3_path not in seen_media:
+                seen_media.add(mp3_path)
+                media_files.append(mp3_path)
+
+        # Example sentences (nikkud-stripped key first, raw key as fallback)
+        examples_list = examples_cache.get(
+            _strip_nikkud(word_no_nik), examples_cache.get(word_no_nik, [])
+        )
+        example_html = "<br>".join(examples_list[:2]) if examples_list else ""
+
+        note = genanki.Note(
+            model=VOCAB_MODEL,
+            fields=[
+                word,
+                root,
+                pos,
+                meaning,
+                word_no_nik,
+                shared_roots,
+                tags_str,
+                audio_tag,
+                example_html,
+                freq_field,
+            ],
+            tags=tags_str.split(),
+        )
+        deck.add_note(note)
+
+    logger.info(f"Vocabulary deck: {len(deck.notes)} notes")
+    return deck, media_files
+
+
+def build_conj_deck(conjugations: dict) -> genanki.Deck:
+    """Assemble the conjugation drill deck: one note per usable verb form.
+
+    *conjugations* maps an infinitive to a dict with "forms" (form_key ->
+    {"form", "pronoun", "tense"}) and optional "root", "binyan" and
+    "reference_form". Verbs without forms, and forms that contain no Hebrew
+    letter, are skipped.
+    """
+    deck = genanki.Deck(CONJ_DECK_ID, "Pealim Hebrew Conjugations")
+    hebrew_letter = re.compile(r"[\u05d0-\u05ea]")
+    added = 0
+
+    for infinitive, verb in conjugations.items():
+        if not (verb and verb.get("forms")):
+            continue
+
+        ref_form = verb.get("reference_form", infinitive)
+        root     = verb.get("root", "")
+        binyan   = verb.get("binyan", "")
+
+        for entry in verb["forms"].values():
+            conj_form = entry.get("form", "")
+            if not conj_form or not hebrew_letter.search(conj_form):
+                continue
+
+            fields = [
+                infinitive,
+                ref_form,
+                entry.get("pronoun", ""),
+                entry.get("tense", ""),
+                conj_form,
+                root,
+                binyan,
+            ]
+            deck.add_note(genanki.Note(model=CONJ_MODEL, fields=fields))
+            added += 1
+
+    logger.info(f"Conjugation deck: {added} notes across {sum(1 for v in conjugations.values() if v)} verbs")
+    return deck
+
+
+def write_vocab_apkg(
+    deck: genanki.Deck,
+    media_files: list[Path],
+    out_path: Path = VOCAB_APKG,
+) -> None:
+    """Write *deck* to *out_path*, bundling the media files that exist on disk.
+
+    The output directory is created if needed; media paths that do not exist
+    are left out of the package.
+    """
+    existing_media = [str(path) for path in media_files if path.exists()]
+    out_path.parent.mkdir(parents=True, exist_ok=True)
+
+    package = genanki.Package(deck)
+    package.media_files = existing_media
+    package.write_to_file(str(out_path))
+    logger.info(f"Vocabulary deck written → {out_path}")
+
+
+def write_conj_apkg(deck: genanki.Deck, out_path: Path = CONJ_APKG) -> None:
+    """Write the conjugation *deck* to *out_path* (no media), creating its directory."""
+    target_dir = out_path.parent
+    target_dir.mkdir(parents=True, exist_ok=True)
+    package = genanki.Package(deck)
+    package.write_to_file(str(out_path))
+    logger.info(f"Conjugation deck written → {out_path}")
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
+
+    # Quick self-test: first 20 dictionary rows, without example sentences or
+    # frequency data. Audio clips already present under data/audio are bundled.
+    csv_path = DATA_DIR / "pealim_dict_for_anki.csv"
+    if not csv_path.exists():
+        csv_path = DATA_DIR / "pealim_dict.csv"
+
+    deck, media = build_vocab_deck(csv_path, limit=20)
+    write_vocab_apkg(deck, media)
+
+    conj_path = DATA_DIR / "conjugations.json"
+    if conj_path.exists():
+        # The file holds Hebrew text; read it as UTF-8 explicitly instead of
+        # relying on the platform's default encoding.
+        with open(conj_path, encoding="utf-8") as f:
+            conjugations = json.load(f)
+        conj_deck = build_conj_deck(conjugations)
+        write_conj_apkg(conj_deck)
--- a/benyehuda.py
+++ b/benyehuda.py
@ -0,0 +1,160 @@
+#!/usr/bin/env python3
+"""
+Ben Yehuda corpus example-sentence lookup.
+Downloads plaintext-no-nikkud ZIP once, indexes sentences, then answers queries locally.
+Exposed API: get_examples(word_no_nikkud) -> list[str]
+"""
+
+import json
+import logging
+import re
+import unicodedata
+import zipfile
+from io import BytesIO
+from pathlib import Path
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+# Release asset downloaded by load() when no index file exists yet.
+CORPUS_URL = (
+    "https://github.com/projectbenyehuda/public_domain_dump/releases/"
+    "download/2025-10/txt_stripped.zip"
+)
+INDEX_PATH = Path(__file__).parent / "data" / "benyehuda_index.json"          # word -> sentences index (JSON)
+EXAMPLES_CACHE_PATH = Path(__file__).parent / "data" / "examples_cache.json"  # persisted get_examples() results
+REQUEST_TIMEOUT = 120       # passed as `timeout=` to requests.get for the corpus download
+MIN_SENTENCE_LEN = 15       # shorter fragments are discarded by _split_sentences()
+MAX_EXAMPLES_PER_WORD = 2   # number of sentences get_examples() returns at most
+MAX_INDEX_ENTRIES = 500  # cap examples kept per word in index to limit memory
+
+# Module-level state
+_index: dict[str, list[str]] = {}          # word -> [sentence, ...]
+_examples_cache: dict[str, list[str]] = {} # word -> cached result; reloaded from / saved to EXAMPLES_CACHE_PATH
+
+
+def _strip_nikkud(text: str) -> str:
+    """NFD-normalise *text* and remove all characters of Unicode category Mn."""
+    def is_base_char(ch: str) -> bool:
+        return unicodedata.category(ch) != "Mn"
+
+    return "".join(filter(is_base_char, unicodedata.normalize("NFD", text)))
+
+
+def _split_sentences(text: str) -> list[str]:
+    """Break *text* at runs of 1–3 sentence terminators or newlines.
+
+    Each piece is stripped of surrounding whitespace; pieces shorter than
+    MIN_SENTENCE_LEN characters are dropped.
+    """
+    pieces = (piece.strip() for piece in re.split(r"[.!?؟\n]{1,3}", text))
+    return [piece for piece in pieces if len(piece) >= MIN_SENTENCE_LEN]
+
+
+def _build_index(corpus_zip_bytes: bytes) -> None:
+    """Rebuild the module-level word → sentences index from the corpus ZIP.
+
+    Every ``.txt`` member is decoded as UTF-8 (undecodable bytes ignored) and
+    split into sentences. Each distinct token of two or more characters in a
+    sentence gets that sentence appended, up to MAX_INDEX_ENTRIES per token.
+    Members that cannot be read are skipped.
+    """
+    global _index
+    _index = {}
+    logger.info("Building Ben Yehuda index from corpus …")
+
+    with zipfile.ZipFile(BytesIO(corpus_zip_bytes)) as archive:
+        members = [name for name in archive.namelist() if name.endswith(".txt")]
+        logger.info(f"  Corpus contains {len(members)} text files")
+        for member in members:
+            try:
+                text = archive.read(member).decode("utf-8", errors="ignore")
+            except Exception:
+                continue
+            for sentence in _split_sentences(text):
+                tokens = set(re.findall(r"[\u05d0-\u05ea'\"]+", sentence))
+                for token in tokens:
+                    if len(token) < 2:
+                        continue
+                    bucket = _index.setdefault(token, [])
+                    if len(bucket) < MAX_INDEX_ENTRIES:
+                        bucket.append(sentence)
+
+    logger.info(f"Index built: {len(_index)} unique words")
+
+
+def _save_index() -> None:
+    """Serialise the in-memory index to INDEX_PATH as UTF-8 JSON."""
+    target_dir = INDEX_PATH.parent
+    target_dir.mkdir(parents=True, exist_ok=True)
+    with INDEX_PATH.open("w", encoding="utf-8") as handle:
+        json.dump(_index, handle, ensure_ascii=False)
+    logger.info(f"Ben Yehuda index saved → {INDEX_PATH}")
+
+
+def _load_index() -> None:
+    """Replace the in-memory index with the JSON stored at INDEX_PATH."""
+    global _index
+    with INDEX_PATH.open(encoding="utf-8") as handle:
+        _index = json.load(handle)
+    logger.info(f"Ben Yehuda index loaded: {len(_index)} words")
+
+
+def load(force_rebuild: bool = False) -> None:
+    """Make the sentence index available, building it from the corpus if required.
+
+    Does nothing when an index is already in memory and *force_rebuild* is
+    false. Otherwise the persisted examples cache is read (if present), then
+    the index is either loaded from INDEX_PATH or — when that file is missing
+    or *force_rebuild* is true — rebuilt from a fresh corpus download and saved.
+    Raises whatever ``requests`` raises on a failed download.
+    """
+    global _index, _examples_cache
+    already_loaded = bool(_index)
+    if already_loaded and not force_rebuild:
+        return
+
+    # Restore previously computed example lookups.
+    if EXAMPLES_CACHE_PATH.exists():
+        with EXAMPLES_CACHE_PATH.open(encoding="utf-8") as handle:
+            _examples_cache = json.load(handle)
+
+    if not force_rebuild and INDEX_PATH.exists():
+        _load_index()
+        return
+
+    logger.info("Downloading Ben Yehuda corpus … (this may take 1-2 minutes)")
+    response = requests.get(CORPUS_URL, timeout=REQUEST_TIMEOUT, stream=True)
+    response.raise_for_status()
+    corpus_bytes = response.content
+    logger.info(f"Corpus downloaded: {len(corpus_bytes) / 1e6:.1f} MB")
+
+    _build_index(corpus_bytes)
+    _save_index()
+
+
+def save_examples_cache() -> None:
+    """Persist the in-memory examples cache to EXAMPLES_CACHE_PATH as UTF-8 JSON."""
+    cache_dir = EXAMPLES_CACHE_PATH.parent
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    with EXAMPLES_CACHE_PATH.open("w", encoding="utf-8") as handle:
+        json.dump(_examples_cache, handle, ensure_ascii=False)
+    logger.info(f"Examples cache saved: {len(_examples_cache)} entries → {EXAMPLES_CACHE_PATH}")
+
+
+def get_examples(word_no_nikkud: str) -> list[str]:
+    """
+    Return the shortest indexed sentences that contain the word as a whole token.
+
+    At most MAX_EXAMPLES_PER_WORD sentences are returned. The word is stripped
+    of whitespace and nikkud before lookup, the index is loaded on first use,
+    and every answer (including an empty one) is stored in the examples cache
+    so repeated calls return the cached list.
+    """
+    if not _index:
+        load()
+
+    word = _strip_nikkud(word_no_nikkud.strip())
+
+    try:
+        return _examples_cache[word]
+    except KeyError:
+        pass
+
+    # The word must not be directly preceded or followed by a word character.
+    token_re = re.compile(r"(?<![^\s\W])" + re.escape(word) + r"(?![^\s\W])")
+    hits = [sentence for sentence in _index.get(word, []) if token_re.search(sentence)]
+
+    # Shorter sentences first (stable for equal lengths), then keep the top few.
+    shortest_first = sorted(hits, key=len)
+    _examples_cache[word] = shortest_first[:MAX_EXAMPLES_PER_WORD]
+    return _examples_cache[word]
+
+
+if __name__ == "__main__":
+    # Manual smoke test: look up a few common words and persist the cache.
+    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
+    load()
+    for sample in ("שלום", "בית", "ספר", "מים", "אהבה", "ילד"):
+        found = get_examples(sample)
+        print(f"\n{sample}: {len(found)} examples")
+        for sentence in found:
+            print(f"  → {sentence[:80]}")
+    save_examples_cache()
--- a/conjugation_extract.py
+++ b/conjugation_extract.py
@ -1,153 +1,408 @@
 #!/usr/bin/env python3
 """
 Extract Hebrew verb conjugations from pealim.com.
-Scrapes conjugation tables for specific verbs.
+Input: verbs_input.txt  (one Hebrew infinitive per line)
+Output: data/conjugations.json
+
+For each verb:
+  1. Search pealim.com/search/?q=<verb> to find URL slug
+  2. Fetch /dict/<slug>/ with hebstyle=mo cookie
+  3. Parse conjugation table by row labels
+
+Resume-safe: verbs already in conjugations.json are skipped.
 """

-import requests
-import pandas as pd
-import numpy as np
+import json
 import logging
+import re
 import time
+import urllib.parse
+from pathlib import Path
+
+import requests
+from bs4 import BeautifulSoup

-# Configure logging
-logging.basicConfig(
-    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s'
-)
 logger = logging.getLogger(__name__)

-# Session for connection pooling
+# Site root; search and dictionary URLs are built from it.
+PEALIM_BASE = "https://www.pealim.com"
+# NOTE(review): presumably the pause in seconds between requests — its use is
+# outside the visible part of this file; confirm.
+REQUEST_DELAY = 1.5
+# Passed as `timeout=` to every session.get() call below.
+REQUEST_TIMEOUT = 15
+# Input list of infinitives and output JSON, both relative to this file.
+VERBS_INPUT = Path(__file__).parent / "verbs_input.txt"
+CONJUGATIONS_PATH = Path(__file__).parent / "data" / "conjugations.json"
+
+# Pronoun labels (for card front display)
+PRONOUN_LABELS = {
+    "present_ms": "",
+    "present_fs": "",
+    "present_mp": "",
+    "present_fp": "",
+    "past_1s":    "אֲנִי",
+    "past_1p":    "אֲנַחְנוּ",
+    "past_2ms":   "אַתָּה",
+    "past_2fs":   "אַתְּ",
+    "past_2mp":   "אַתֶּם",
+    "past_2fp":   "אַתֶּן",
+    "past_3ms":   "הוּא",
+    "past_3fs":   "הִיא",
+    "past_3p":    "הֵם / הֵן",
+    "future_1s":  "אֲנִי",
+    "future_1p":  "אֲנַחְנוּ",
+    "future_2ms": "אַתָּה",
+    "future_2fs": "אַתְּ",
+    "future_2mp": "אַתֶּם",
+    "future_2fp": "אַתֶּן",
+    "future_3ms": "הוּא",
+    "future_3fs": "הִיא",
+    "future_3mp": "הֵם",
+    "future_3fp": "הֵן",
+    "imperative_ms": "אַתָּה",
+    "imperative_fs": "אַתְּ",
+    "imperative_mp": "אַתֶּם",
+    "imperative_fp": "אַתֶּן",
+    "infinitive": "",
+}
+
+# Human-readable tense description for card front
+TENSE_DESCRIPTION = {
+    "present_ms": "הוֹוֶה (זכר יחיד)",
+    "present_fs": "הוֹוֶה (נקבה יחיד)",
+    "present_mp": "הוֹוֶה (זכר רבים)",
+    "present_fp": "הוֹוֶה (נקבה רבים)",
+    "past_1s":    "עָבָר",
+    "past_1p":    "עָבָר",
+    "past_2ms":   "עָבָר",
+    "past_2fs":   "עָבָר",
+    "past_2mp":   "עָבָר",
+    "past_2fp":   "עָבָר",
+    "past_3ms":   "עָבָר",
+    "past_3fs":   "עָבָר",
+    "past_3p":    "עָבָר",
+    "future_1s":  "עָתִיד",
+    "future_1p":  "עָתִיד",
+    "future_2ms": "עָתִיד",
+    "future_2fs": "עָתִיד",
+    "future_2mp": "עָתִיד",
+    "future_2fp": "עָתִיד",
+    "future_3ms": "עָתִיד",
+    "future_3fs": "עָתִיד",
+    "future_3mp": "עָתִיד",
+    "future_3fp": "עָתִיד",
+    "imperative_ms": "צִוּוּי",
+    "imperative_fs": "צִוּוּי",
+    "imperative_mp": "צִוּוּי",
+    "imperative_fp": "צִוּוּי",
+    "infinitive": "מְקוֹר",
+}
+
 session = requests.Session()
-session.headers.update({
-    'User-Agent': 'Mozilla/5.0 (compatible; pealim-scraper/1.0)'
-})
-
-PEALIM_BASE_URL = "https://www.pealim.com/dict"
-REQUEST_TIMEOUT = 10
-REQUEST_DELAY = 1.0  # seconds between requests (respectful scraping)
-
-# Conjugation column order (standard Hebrew verb forms)
-CONJUGATION_COLUMNS = [
-    'present_ms', 'present_fs', 'present_mp', 'present_fp',
-    'past_1s', 'past_1p', 'past_2ms', 'past_2fs', 'past_2mp', 'past_2fp',
-    'past_3ms', 'past_3fs', 'past_3p',
-    'future_1s', 'future_1p', 'future_2ms', 'future_2fs', 'future_2mp', 'future_2fp',
-    'future_3ms', 'future_3fs', 'future_3mp', 'future_3fp',
-    'imperative_ms', 'imperative_fs', 'imperative_mp', 'imperative_fp',
-    'infinitive'
-]
+session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki/2.0)"})


-def extract_verb(url_suffix: str, max_retries: int = 3) -> pd.DataFrame:
-    """
-    Extract conjugation table for a single verb.
-    
-    Args:
-        url_suffix: URL suffix (e.g., '2255-lishmor', '860-lishon')
-        max_retries: Maximum retry attempts on failure
-    
-    Returns:
-        DataFrame with conjugation forms, or None if extraction fails
-    """
-    url = f"{PEALIM_BASE_URL}/{url_suffix}"
-    
-    for attempt in range(max_retries):
+def _find_slug(infinitive: str) -> str | None:
+    """Search pealim.com/search/?q=<verb> and return the URL slug.
+
+    The slug is the first ``/dict/<digits>-<name>/`` link found anywhere in
+    the search-result HTML. Returns None when no such link is present or when
+    the request fails; failures are logged, not raised.
+    """
+    url = f"{PEALIM_BASE}/search/?q={urllib.parse.quote(infinitive)}"
    try:
-            logger.info(f"Fetching: {url} (attempt {attempt + 1}/{max_retries})")
+        resp = session.get(url, timeout=REQUEST_TIMEOUT)
+        resp.raise_for_status()
+        # Slugs look like /dict/2255-lishmor/
+        slugs = re.findall(r"/dict/(\d+[-][^/?\"<>\s]+)/", resp.text)
+        if slugs:
+            # First match wins; later links on the page are ignored.
+            slug = slugs[0]
+            logger.info(f"  Slug: {slug}")
+            return slug
+    except Exception as e:
+        # Best effort: any network/HTTP error is reported and treated as "not found".
+        logger.error(f"  Error searching for '{infinitive}': {e}")
+    return None

-            cookies = {
-                'translit': 'none',
-                'hebstyle': 'bp',
-                'showmeaning': 'off'
+
+def _is_passive_binyan(binyan: str) -> bool:
+    """Tell whether *binyan* names Pu'al or Huf'al (case-insensitive substring match)."""
+    haystack = binyan.lower()
+    passive_markers = ("פֻּעַל", "הֻפְעַל", "Pu'al", "Huf'al", "pual", "hufal")
+    return any(marker.lower() in haystack for marker in passive_markers)
+
+
+def _get_menukad(cell) -> str:
+    """Return the Hebrew form shown in a table cell, or "" if there is none.
+
+    The text of the cell's ``<span class="menukad">`` is preferred. Without
+    such a span, the cell's whole text is returned only when it contains at
+    least one Hebrew letter.
+    """
+    menukad = cell.find("span", class_="menukad")
+    if menukad:
+        return menukad.get_text(strip=True)
+
+    # fallback: any Hebrew text in cell
+    fallback = cell.get_text(strip=True)
+    return fallback if re.search(r"[\u05d0-\u05ea]", fallback) else ""
+
+
+def _parse_table(soup: BeautifulSoup) -> dict[str, str]:
+    """
+    Parse the pealim conjugation table and return form_key -> Hebrew form mapping.
+
+    Returns an empty dict when the page has no ``table.conjugation-table`` or
+    the table has fewer than 9 rows. Keys that cannot be filled are omitted.
+
+    Table structure (rows after two header rows):
+      Row 2  (Present): [label x2] [ms] [fs] [mp] [fp]
+      Row 3  (Past 1):  [Past x1] [1st x1] [1s x2] [1p x2]
+      Row 4  (Past 2):  [2nd x1] [2ms] [2fs] [2mp] [2fp]
+      Row 5  (Past 3):  [3rd x1] [3ms] [3fs] [3p x2]
+      Row 6  (Fut 1):   [Future x1] [1st x1] [1s x2] [1p x2]
+      Row 7  (Fut 2):   [2nd x1] [2ms] [2fs] [2mp] [2fp]
+      Row 8  (Fut 3):   [3rd x1] [3ms] [3fs] [3mp] [3fp]
+      Row 9  (Imp):     [Imp x2] [ms] [fs] [mp] [fp]
+      Row 10 (Inf):     [Inf x2] [form x4]
+    """
+    table = soup.find("table", class_="conjugation-table")
+    if not table:
+        return {}
+
+    rows = table.find_all("tr")
+    if len(rows) < 9:
+        return {}
+
+    forms: dict[str, str] = {}
+
+    def hebrew_cells(row_idx: int) -> list[str]:
+        """Hebrew forms of a row, label cells skipped, each repeated per colspan."""
+        result: list[str] = []
+        for cell in rows[row_idx].find_all(["th", "td"]):
+            txt = _get_menukad(cell)
+            if txt and re.search(r"[\u05d0-\u05ea]", txt):
+                result.extend([txt] * int(cell.get("colspan", 1)))
+        return result
+
+    def unique_in_order(values: list[str]) -> list[str]:
+        """Drop repeated values (anywhere in the list), keeping first occurrences."""
+        return list(dict.fromkeys(values))
+
+    def assign(keys: list[str], values: list[str]) -> None:
+        """Pair keys with values positionally; surplus keys or values are ignored."""
+        forms.update(zip(keys, values))
+
+    # Locate the first row mentioning each tense. A row claims at most one
+    # tense: the first one, in this order, that it mentions and that is still
+    # unassigned.
+    tense_row: dict[str, int] = {}
+    for idx, row in enumerate(rows):
+        label = row.get_text(" ", strip=True).lower()
+        for tense in ("present", "past", "future", "imperative", "infinitive"):
+            if tense in label and tense not in tense_row:
+                tense_row[tense] = idx
+                break
+
+    # Present tense (4 forms: ms fs mp fp)
+    if "present" in tense_row:
+        assign(
+            ["present_ms", "present_fs", "present_mp", "present_fp"],
+            hebrew_cells(tense_row["present"]),
+        )
+
+    # Past tense: three consecutive rows (1st, 2nd, 3rd person)
+    if "past" in tense_row:
+        start = tense_row["past"]
+        # 1s and 1p each span two columns, so collapse the repeats.
+        assign(["past_1s", "past_1p"], unique_in_order(hebrew_cells(start)))
+        if start + 1 < len(rows):
+            assign(
+                ["past_2ms", "past_2fs", "past_2mp", "past_2fp"],
+                hebrew_cells(start + 1),
+            )
+        if start + 2 < len(rows):
+            # 3p spans two columns, so collapse the repeat.
+            assign(
+                ["past_3ms", "past_3fs", "past_3p"],
+                unique_in_order(hebrew_cells(start + 2)),
+            )
+
+    # Future tense: three consecutive rows (1st, 2nd, 3rd person)
+    if "future" in tense_row:
+        start = tense_row["future"]
+        assign(["future_1s", "future_1p"], unique_in_order(hebrew_cells(start)))
+        if start + 1 < len(rows):
+            assign(
+                ["future_2ms", "future_2fs", "future_2mp", "future_2fp"],
+                hebrew_cells(start + 1),
+            )
+        if start + 2 < len(rows):
+            assign(
+                ["future_3ms", "future_3fs", "future_3mp", "future_3fp"],
+                hebrew_cells(start + 2),
+            )
+
+    # Imperative
+    if "imperative" in tense_row:
+        assign(
+            ["imperative_ms", "imperative_fs", "imperative_mp", "imperative_fp"],
+            hebrew_cells(tense_row["imperative"]),
+        )
+
+    # Infinitive: first Hebrew cell of the row
+    if "infinitive" in tense_row:
+        assign(["infinitive"], hebrew_cells(tense_row["infinitive"]))
+
+    return forms
+
+
+def _extract_conjugations(slug: str, infinitive: str) -> dict | None:
+    """Fetch /dict/<slug>/ and parse conjugation table.
+
+    Returns a dict with "infinitive", "slug", "root", "binyan", "is_passive",
+    "reference_form" and "forms" (form_key -> {"form", "pronoun", "tense"}),
+    or None when the page cannot be fetched or no forms are parsed.
+    """
+    url = f"{PEALIM_BASE}/dict/{slug}/"
+    try:
+        resp = session.get(url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
+        resp.raise_for_status()
+    except Exception as e:
+        logger.error(f"  Error fetching {url}: {e}")
+        return None
+
+    soup = BeautifulSoup(resp.text, "lxml")
+
+    # Root: the first menukad span on the page whose text has a Hebrew letter
+    # and a hyphen. NOTE(review): nothing restricts the search to the page
+    # header — confirm a conjugated form can never match first.
+    root = ""
+    for span in soup.find_all("span", class_="menukad"):
+        txt = span.get_text(strip=True)
+        if txt and re.search(r"[\u05d0-\u05ea]", txt) and "-" in txt:
+            root = txt
+            break
+
+    # Binyan: the first English binyan name found in the og:description meta tag.
+    binyan = ""
+    meta = soup.find("meta", {"property": "og:description"})
+    if meta:
+        desc = meta.get("content", "")
+        for bname in ["Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al"]:
+            if bname in desc:
+                binyan = bname
+                break
+
+    forms = _parse_table(soup)
+
+    if not forms:
+        logger.warning(f"  No forms found for {slug}")
+        return None
+
+    # Passive binyanim use the past 3ms form as the card's reference form;
+    # all others use the parsed infinitive. Either falls back to the input.
+    is_passive = _is_passive_binyan(binyan)
+    reference_form = forms.get("infinitive", infinitive) if not is_passive else forms.get("past_3ms", infinitive)
+
+    result = {
+        "infinitive": infinitive,
+        "slug": slug,
+        "root": root,
+        "binyan": binyan,
+        "is_passive": is_passive,
+        "reference_form": reference_form,
+        "forms": {},
+    }
+    # Only form keys known to PRONOUN_LABELS are kept.
+    for key, form in forms.items():
+        if key in PRONOUN_LABELS:
+            result["forms"][key] = {
+                "form": form,
+                "pronoun": PRONOUN_LABELS[key],
+                "tense": TENSE_DESCRIPTION.get(key, ""),
            }

-            response = session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
-            response.raise_for_status()
+    logger.info(f"  Extracted {len(result['forms'])} forms for {infinitive}")
+    return result

-            # Parse HTML table
-            dfs = pd.read_html(response.content)
-            if not dfs:
-                logger.warning(f"No tables found for {url_suffix}")
-                return None

-            df = dfs[0]
+def _load_conjugations() -> dict:
+    """Return the cached conjugations read from CONJUGATIONS_PATH.
+
+    An empty dict is returned when the cache file does not exist.
+    """
+    if not CONJUGATIONS_PATH.exists():
+        return {}
+    with CONJUGATIONS_PATH.open(encoding="utf-8") as cache_file:
+        return json.load(cache_file)

-            # Extract conjugation forms (skip header columns, flatten)
-            # Adjust indices based on actual table structure
-            np_flat = df.iloc[:, 2:].values.flatten()

-            # Remove NaN and invalid entries
-            np_flat = np.delete(np_flat, [5, 7, 15, 17, 19, 33, 34, 35])
+def _save_conjugations(data: dict) -> None:
+    """Write *data* to CONJUGATIONS_PATH as indented UTF-8 JSON.
+
+    The parent directory of the cache file is created first if needed.
+    """
+    cache_dir = CONJUGATIONS_PATH.parent
+    cache_dir.mkdir(parents=True, exist_ok=True)
+    with CONJUGATIONS_PATH.open("w", encoding="utf-8") as cache_file:
+        json.dump(data, cache_file, ensure_ascii=False, indent=2)

-            # Create DataFrame with proper column names
-            df_result = pd.DataFrame([np_flat], columns=CONJUGATION_COLUMNS)
-            logger.info(f"✓ Extracted {url_suffix}")

-            return df_result
+def main(verbs_file: Path = VERBS_INPUT) -> dict:
+    """Read verbs from file and extract conjugations. Returns full conjugations dict.
+
+    Lines that are blank or that start with "#" once surrounding whitespace
+    is trimmed are ignored. Verbs already present in the cache are skipped,
+    including those stored as ``None`` by an earlier failed attempt. The
+    cache is written back after every verb that is looked up.
+
+    Args:
+        verbs_file: Text file with one Hebrew infinitive per line.
+
+    Returns:
+        Mapping of verb -> extracted data (or ``None`` when no slug or no
+        forms could be obtained). If *verbs_file* does not exist, the
+        existing cache is returned unchanged.
+    """
+    if not verbs_file.exists():
+        logger.warning(f"verbs_input.txt not found at {verbs_file} — skipping")
+        return _load_conjugations()

-        except requests.RequestException as e:
-            logger.error(f"Network error for {url_suffix} (attempt {attempt + 1}): {e}")
-            if attempt < max_retries - 1:
-                time.sleep(2 ** attempt)  # Exponential backoff
+    # Trim each line *before* testing for "#", so an indented comment is not
+    # mistaken for a verb and sent to the site as a search query.
+    lines = (raw.strip() for raw in verbs_file.read_text(encoding="utf-8").splitlines())
+    verbs = [v for v in lines if v and not v.startswith("#")]
+    logger.info(f"Loaded {len(verbs)} verbs from {verbs_file}")
+
+    conjugations = _load_conjugations()
+    new_count = 0
+
+    for verb in verbs:
+        if verb in conjugations:
+            logger.info(f"Skipping {verb} (cached)")
+            continue
+
+        logger.info(f"Processing: {verb}")
+        time.sleep(REQUEST_DELAY)
+        slug = _find_slug(verb)
+        if not slug:
+            logger.warning(f"  No slug found for {verb}")
+            # Remember the miss so the verb is not searched again on the next run.
+            conjugations[verb] = None
+            _save_conjugations(conjugations)
+            continue
+
+        time.sleep(REQUEST_DELAY)
+        data = _extract_conjugations(slug, verb)
+        conjugations[verb] = data
+        _save_conjugations(conjugations)
+        # Count only verbs that actually produced data, matching the no-slug
+        # branch above, which does not count its failure either.
+        if data is not None:
+            new_count += 1
+
+    logger.info(f"Done: {new_count} new verbs processed")
+    return conjugations
+
+
+# Manual run: call main(), then print a short summary for each verb — the
+# number of forms, the binyan, and the first three forms — or "no data" for
+# verbs whose cached entry is empty.
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+    result = main()
+    for verb, data in result.items():
+        if data:
+            forms = data.get("forms", {})
+            print(f"{verb}: {len(forms)} forms, binyan={data.get('binyan')}")
+            for k, v in list(forms.items())[:3]:
+                print(f"  {k}: {v['form']}")
        else:
-                return None
-        except Exception as e:
-            logger.error(f"Error parsing {url_suffix}: {e}")
-            return None
-
-
-def extract_from_website(url_suffixes: list = None) -> pd.DataFrame:
-    """
-    Extract conjugations for multiple verbs.
-    
-    Args:
-        url_suffixes: List of URL suffixes to process
-    
-    Returns:
-        Combined DataFrame with all conjugations
-    """
-    if url_suffixes is None:
-        # Default verbs: "to guard" and "to sleep"
-        url_suffixes = ['2255-lishmor', '860-lishon']
-    
-    logger.info(f"Starting extraction for {len(url_suffixes)} verb(s)...")
-    
-    all_dfs = []
-    for url_suffix in url_suffixes:
-        df = extract_verb(url_suffix)
-        if df is not None:
-            all_dfs.append(df)
-        time.sleep(0.5)  # Small delay between requests
-    
-    if not all_dfs:
-        logger.error("No data extracted!")
-        return pd.DataFrame()
-    
-    combined_df = pd.concat(all_dfs, ignore_index=True)
-    logger.info(f"Extraction complete. Total verbs: {len(combined_df)}")
-    
-    return combined_df
-
-
-def main():
-    """Main entry point."""
-    try:
-        df = extract_from_website()
-        
-        if df.empty:
-            logger.error("No data to save!")
-            return
-        
-        df.to_csv('conjugations.csv', sep=';', index=True)
-        logger.info("Saved: conjugations.csv")
-        logger.info("\n" + df.to_string())
-        logger.info("✅ Complete!")
-        
-    except Exception as e:
-        logger.error(f"Fatal error: {e}")
-        raise
-
-
-if __name__ == '__main__':
-    main()
+            print(f"{verb}: no data")
--- a/data/conjugations.json
+++ b/data/conjugations.json
@ -0,0 +1,903 @@
+{
+  "לִשְׁמוֹר": {
+    "infinitive": "לִשְׁמוֹר",
+    "slug": "2255-lishmor",
+    "root": "שׁ - מ - ר",
+    "binyan": "",
+    "is_passive": false,
+    "reference_form": "לִשְׁמֹר",
+    "forms": {
+      "present_ms": {
+        "form": "שׁוֹמֵר",
+        "pronoun": "",
+        "tense": "הוֹוֶה (זכר יחיד)"
+      },
+      "present_fs": {
+        "form": "שׁוֹמֶרֶת",
+        "pronoun": "",
+        "tense": "הוֹוֶה (נקבה יחיד)"
+      },
+      "present_mp": {
+        "form": "שׁוֹמְרִים",
+        "pronoun": "",
+        "tense": "הוֹוֶה (זכר רבים)"
+      },
+      "present_fp": {
+        "form": "שׁוֹמְרוֹת",
+        "pronoun": "",
+        "tense": "הוֹוֶה (נקבה רבים)"
+      },
+      "past_1s": {
+        "form": "שָׁמַרְתִּי",
+        "pronoun": "אֲנִי",
+        "tense": "עָבָר"
+      },
+      "past_1p": {
+        "form": "שָׁמַרְנוּ",
+        "pronoun": "אֲנַחְנוּ",
+        "tense": "עָבָר"
+      },
+      "past_2ms": {
+        "form": "שָׁמַרְתָּ",
+        "pronoun": "אַתָּה",
+        "tense": "עָבָר"
+      },
+      "past_2fs": {
+        "form": "שָׁמַרְתְּ",
+        "pronoun": "אַתְּ",
+        "tense": "עָבָר"
+      },
+      "past_2mp": {
+        "form": "שְׁמַרְתֶּם",
+        "pronoun": "אַתֶּם",
+        "tense": "עָבָר"
+      },
+      "past_2fp": {
+        "form": "שְׁמַרְתֶּן",
+        "pronoun": "אַתֶּן",
+        "tense": "עָבָר"
+      },
+      "past_3ms": {
+        "form": "שָׁמַר",
+        "pronoun": "הוּא",
+        "tense": "עָבָר"
+      },
+      "past_3fs": {
+        "form": "שָׁמְרָה",
+        "pronoun": "הִיא",
+        "tense": "עָבָר"
+      },
+      "past_3p": {
+        "form": "שָׁמְרוּ",
+        "pronoun": "הֵם / הֵן",
+        "tense": "עָבָר"
+      },
+      "future_1s": {
+        "form": "אֶשְׁמֹר",
+        "pronoun": "אֲנִי",
+        "tense": "עָתִיד"
+      },
+      "future_1p": {
+        "form": "נִשְׁמֹר",
+        "pronoun": "אֲנַחְנוּ",
+        "tense": "עָתִיד"
+      },
+      "future_2ms": {
+        "form": "תִּשְׁמֹר",
+        "pronoun": "אַתָּה",
+        "tense": "עָתִיד"
+      },
+      "future_2fs": {
+        "form": "תִּשְׁמְרִי",
+        "pronoun": "אַתְּ",
+        "tense": "עָתִיד"
+      },
+      "future_2mp": {
+        "form": "תִּשְׁמְרוּ",
+        "pronoun": "אַתֶּם",
+        "tense": "עָתִיד"
+      },
+      "future_2fp": {
+        "form": "תִּשְׁמֹרְנָה",
+        "pronoun": "אַתֶּן",
+        "tense": "עָתִיד"
+      },
+      "future_3ms": {
+        "form": "יִשְׁמֹר",
+        "pronoun": "הוּא",
+        "tense": "עָתִיד"
+      },
+      "future_3fs": {
+        "form": "תִּשְׁמֹר",
+        "pronoun": "הִיא",
+        "tense": "עָתִיד"
+      },
+      "future_3mp": {
+        "form": "יִשְׁמְרוּ",
+        "pronoun": "הֵם",
+        "tense": "עָתִיד"
+      },
+      "future_3fp": {
+        "form": "תִּשְׁמֹרְנָה",
+        "pronoun": "הֵן",
+        "tense": "עָתִיד"
+      },
+      "imperative_ms": {
+        "form": "שְׁמֹר!‏",
+        "pronoun": "אַתָּה",
+        "tense": "צִוּוּי"
+      },
+      "imperative_fs": {
+        "form": "שִׁמְרִי!‏",
+        "pronoun": "אַתְּ",
+        "tense": "צִוּוּי"
+      },
+      "imperative_mp": {
+        "form": "שִׁמְרוּ!‏",
+        "pronoun": "אַתֶּם",
+        "tense": "צִוּוּי"
+      },
+      "imperative_fp": {
+        "form": "שְׁמֹרְנָה!‏",
+        "pronoun": "אַתֶּן",
+        "tense": "צִוּוּי"
+      },
+      "infinitive": {
+        "form": "לִשְׁמֹר",
+        "pronoun": "",
+        "tense": "מְקוֹר"
+      }
+    }
+  },
+  "לְהִשָּׁמֵר": {
+    "infinitive": "לְהִשָּׁמֵר",
+    "slug": "2256-lehishamer",
+    "root": "שׁ - מ - ר",
+    "binyan": "",
+    "is_passive": false,
+    "reference_form": "לְהִשָּׁמֵר",
+    "forms": {
+      "present_ms": {
+        "form": "נִשְׁמָר",
+        "pronoun": "",
+        "tense": "הוֹוֶה (זכר יחיד)"
+      },
+      "present_fs": {
+        "form": "נִשְׁמֶרֶת",
+        "pronoun": "",
+        "tense": "הוֹוֶה (נקבה יחיד)"
+      },
+      "present_mp": {
+        "form": "נִשְׁמָרִים",
+        "pronoun": "",
+        "tense": "הוֹוֶה (זכר רבים)"
+      },
+      "present_fp": {
+        "form": "נִשְׁמָרוֹת",
+        "pronoun": "",
+        "tense": "הוֹוֶה (נקבה רבים)"
+      },
+      "past_1s": {
+        "form": "נִשְׁמַרְתִּי",
+        "pronoun": "אֲנִי",
+        "tense": "עָבָר"
+      },
+      "past_1p": {
+        "form": "נִשְׁמַרְנוּ",
+        "pronoun": "אֲנַחְנוּ",
+        "tense": "עָבָר"
+      },
+      "past_2ms": {
+        "form": "נִשְׁמַרְתָּ",
+        "pronoun": "אַתָּה",
+        "tense": "עָבָר"
+      },
+      "past_2fs": {
+        "form": "נִשְׁמַרְתְּ",
+        "pronoun": "אַתְּ",
+        "tense": "עָבָר"
+      },
+      "past_2mp": {
+        "form": "נִשְׁמַרְתֶּם",
+        "pronoun": "אַתֶּם",
+        "tense": "עָבָר"
+      },
+      "past_2fp": {
+        "form": "נִשְׁמַרְתֶּן",
+        "pronoun": "אַתֶּן",
+        "tense": "עָבָר"
+      },
+      "past_3ms": {
+        "form": "נִשְׁמַר",
+        "pronoun": "הוּא",
+        "tense": "עָבָר"
+      },
+      "past_3fs": {
+        "form": "נִשְׁמְרָה",
+        "pronoun": "הִיא",
+        "tense": "עָבָר"
+      },
+      "past_3p": {
+        "form": "נִשְׁמְרוּ",
+        "pronoun": "הֵם / הֵן",
+        "tense": "עָבָר"
+      },
+      "future_1s": {
+        "form": "אֶשָּׁמֵר",
+        "pronoun": "אֲנִי",
+        "tense": "עָתִיד"
+      },
+      "future_1p": {
+        "form": "נִשָּׁמֵר",
+        "pronoun": "אֲנַחְנוּ",
+        "tense": "עָתִיד"
+      },
+      "future_2ms": {
+        "form": "תִּשָּׁמֵר",
+        "pronoun": "אַתָּה",
+        "tense": "עָתִיד"
+      },
+      "future_2fs": {
+        "form": "תִּשָּׁמְרִי",
+        "pronoun": "אַתְּ",
+        "tense": "עָתִיד"
+      },
+      "future_2mp": {
+        "form": "תִּשָּׁמְרוּ",
+        "pronoun": "אַתֶּם",
+        "tense": "עָתִיד"
+      },
+      "future_2fp": {
+        "form": "תִּשָּׁמַרְנָה",
+        "pronoun": "אַתֶּן",
+        "tense": "עָתִיד"
+      },
+      "future_3ms": {
+        "form": "יִשָּׁמֵר",
+        "pronoun": "הוּא",
+        "tense": "עָתִיד"
+      },
+      "future_3fs": {
+        "form": "תִּשָּׁמֵר",
+        "pronoun": "הִיא",
+        "tense": "עָתִיד"
+      },
+      "future_3mp": {
+        "form": "יִשָּׁמְרוּ",
+        "pronoun": "הֵם",
+        "tense": "עָתִיד"
+      },
+      "future_3fp": {
+        "form": "תִּשָּׁמַרְנָה",
+        "pronoun": "הֵן",
+        "tense": "עָתִיד"
+      },
+      "imperative_ms": {
+        "form": "הִשָּׁמֵר!‏",
+        "pronoun": "אַתָּה",
+        "tense": "צִוּוּי"
+      },
+      "imperative_fs": {
+        "form": "הִשָּׁמְרִי!‏",
+        "pronoun": "אַתְּ",
+        "tense": "צִוּוּי"
+      },
+      "imperative_mp": {
+        "form": "הִשָּׁמְרוּ!‏",
+        "pronoun": "אַתֶּם",
+        "tense": "צִוּוּי"
+      },
+      "imperative_fp": {
+        "form": "הִשָּׁמַרְנָה!‏",
+        "pronoun": "אַתֶּן",
+        "tense": "צִוּוּי"
+      },
+      "infinitive": {
+        "form": "לְהִשָּׁמֵר",
+        "pronoun": "",
+        "tense": "מְקוֹר"
+      }
+    }
+  },
+  "לְדַבֵּר": {
+    "infinitive": "לְדַבֵּר",
+    "slug": "2-ledaber",
+    "root": "ד - ב - ר",
+    "binyan": "",
+    "is_passive": false,
+    "reference_form": "לְדַבֵּר",
+    "forms": {
+      "present_ms": {
+        "form": "מְדַבֵּר",
+        "pronoun": "",
+        "tense": "הוֹוֶה (זכר יחיד)"
+      },
+      "present_fs": {
+        "form": "מְדַבֶּרֶת",
+        "pronoun": "",
+        "tense": "הוֹוֶה (נקבה יחיד)"
+      },
+      "present_mp": {
+        "form": "מְדַבְּרִים",
+        "pronoun": "",
+        "tense": "הוֹוֶה (זכר רבים)"
+      },
+      "present_fp": {
+        "form": "מְדַבְּרוֹת",
+        "pronoun": "",
+        "tense": "הוֹוֶה (נקבה רבים)"
+      },
+      "past_1s": {
+        "form": "דִּבַּרְתִּי",
+        "pronoun": "אֲנִי",
+        "tense": "עָבָר"
+      },
+      "past_1p": {
+        "form": "דִּבַּרְנוּ",
+        "pronoun": "אֲנַחְנוּ",
+        "tense": "עָבָר"
+      },
+      "past_2ms": {
+        "form": "דִּבַּרְתָּ",
+        "pronoun": "אַתָּה",
+        "tense": "עָבָר"
+      },
+      "past_2fs": {
+        "form": "דִּבַּרְתְּ",
+        "pronoun": "אַתְּ",
+        "tense": "עָבָר"
+      },
+      "past_2mp": {
+        "form": "דִּבַּרְתֶּם",
+        "pronoun": "אַתֶּם",
+        "tense": "עָבָר"
+      },
+      "past_2fp": {
+        "form": "דִּבַּרְתֶּן",
+        "pronoun": "אַתֶּן",
+        "tense": "עָבָר"
+      },
+      "past_3ms": {
+        "form": "דִּבֵּר",
+        "pronoun": "הוּא",
+        "tense": "עָבָר"
+      },
+      "past_3fs": {
+        "form": "דִּבְּרָה",
+        "pronoun": "הִיא",
+        "tense": "עָבָר"
+      },
+      "past_3p": {
+        "form": "דִּבְּרוּ",
+        "pronoun": "הֵם / הֵן",
+        "tense": "עָבָר"
+      },
+      "future_1s": {
+        "form": "אֲדַבֵּר",
+        "pronoun": "אֲנִי",
+        "tense": "עָתִיד"
+      },
+      "future_1p": {
+        "form": "נְדַבֵּר",
+        "pronoun": "אֲנַחְנוּ",
+        "tense": "עָתִיד"
+      },
+      "future_2ms": {
+        "form": "תְּדַבֵּר",
+        "pronoun": "אַתָּה",
+        "tense": "עָתִיד"
+      },
+      "future_2fs": {
+        "form": "תְּדַבְּרִי",
+        "pronoun": "אַתְּ",
+        "tense": "עָתִיד"
+      },
+      "future_2mp": {
+        "form": "תְּדַבְּרוּ",
+        "pronoun": "אַתֶּם",
+        "tense": "עָתִיד"
+      },
+      "future_2fp": {
+        "form": "תְּדַבֵּרְנָה",
+        "pronoun": "אַתֶּן",
+        "tense": "עָתִיד"
+      },
+      "future_3ms": {
+        "form": "יְדַבֵּר",
+        "pronoun": "הוּא",
+        "tense": "עָתִיד"
+      },
+      "future_3fs": {
+        "form": "תְּדַבֵּר",
+        "pronoun": "הִיא",
+        "tense": "עָתִיד"
+      },
+      "future_3mp": {
+        "form": "יְדַבְּרוּ",
+        "pronoun": "הֵם",
+        "tense": "עָתִיד"
+      },
+      "future_3fp": {
+        "form": "תְּדַבֵּרְנָה",
+        "pronoun": "הֵן",
+        "tense": "עָתִיד"
+      },
+      "imperative_ms": {
+        "form": "דַּבֵּר!‏",
+        "pronoun": "אַתָּה",
+        "tense": "צִוּוּי"
+      },
+      "imperative_fs": {
+        "form": "דַּבְּרִי!‏",
+        "pronoun": "אַתְּ",
+        "tense": "צִוּוּי"
+      },
+      "imperative_mp": {
+        "form": "דַּבְּרוּ!‏",
+        "pronoun": "אַתֶּם",
+        "tense": "צִוּוּי"
+      },
+      "imperative_fp": {
+        "form": "דַּבֵּרְנָה!‏",
+        "pronoun": "אַתֶּן",
+        "tense": "צִוּוּי"
+      },
+      "infinitive": {
+        "form": "לְדַבֵּר",
+        "pronoun": "",
+        "tense": "מְקוֹר"
+      }
+    }
+  },
+  "לְדֻבַּר": {
+    "infinitive": "לְדֻבַּר",
+    "slug": "2-ledaber",
+    "root": "ד - ב - ר",
+    "binyan": "",
+    "is_passive": false,
+    "reference_form": "לְדַבֵּר",
+    "forms": {
+      "present_ms": {
+        "form": "מְדַבֵּר",
+        "pronoun": "",
+        "tense": "הוֹוֶה (זכר יחיד)"
+      },
+      "present_fs": {
+        "form": "מְדַבֶּרֶת",
+        "pronoun": "",
+        "tense": "הוֹוֶה (נקבה יחיד)"
+      },
+      "present_mp": {
+        "form": "מְדַבְּרִים",
+        "pronoun": "",
+        "tense": "הוֹוֶה (זכר רבים)"
+      },
+      "present_fp": {
+        "form": "מְדַבְּרוֹת",
+        "pronoun": "",
+        "tense": "הוֹוֶה (נקבה רבים)"
+      },
+      "past_1s": {
+        "form": "דִּבַּרְתִּי",
+        "pronoun": "אֲנִי",
+        "tense": "עָבָר"
+      },
+      "past_1p": {
+        "form": "דִּבַּרְנוּ",
+        "pronoun": "אֲנַחְנוּ",
+        "tense": "עָבָר"
+      },
+      "past_2ms": {
+        "form": "דִּבַּרְתָּ",
+        "pronoun": "אַתָּה",
+        "tense": "עָבָר"
+      },
+      "past_2fs": {
+        "form": "דִּבַּרְתְּ",
+        "pronoun": "אַתְּ",
+        "tense": "עָבָר"
+      },
+      "past_2mp": {
+        "form": "דִּבַּרְתֶּם",
+        "pronoun": "אַתֶּם",
+        "tense": "עָבָר"
+      },
+      "past_2fp": {
+        "form": "דִּבַּרְתֶּן",
+        "pronoun": "אַתֶּן",
+        "tense": "עָבָר"
+      },
+      "past_3ms": {
+        "form": "דִּבֵּר",
+        "pronoun": "הוּא",
+        "tense": "עָבָר"
+      },
+      "past_3fs": {
+        "form": "דִּבְּרָה",
+        "pronoun": "הִיא",
+        "tense": "עָבָר"
+      },
+      "past_3p": {
+        "form": "דִּבְּרוּ",
+        "pronoun": "הֵם / הֵן",
+        "tense": "עָבָר"
+      },
+      "future_1s": {
+        "form": "אֲדַבֵּר",
+        "pronoun": "אֲנִי",
+        "tense": "עָתִיד"
+      },
+      "future_1p": {
+        "form": "נְדַבֵּר",
+        "pronoun": "אֲנַחְנוּ",
+        "tense": "עָתִיד"
+      },
+      "future_2ms": {
+        "form": "תְּדַבֵּר",
+        "pronoun": "אַתָּה",
+        "tense": "עָתִיד"
+      },
+      "future_2fs": {
+        "form": "תְּדַבְּרִי",
+        "pronoun": "אַתְּ",
+        "tense": "עָתִיד"
+      },
+      "future_2mp": {
+        "form": "תְּדַבְּרוּ",
+        "pronoun": "אַתֶּם",
+        "tense": "עָתִיד"
+      },
+      "future_2fp": {
+        "form": "תְּדַבֵּרְנָה",
+        "pronoun": "אַתֶּן",
+        "tense": "עָתִיד"
+      },
+      "future_3ms": {
+        "form": "יְדַבֵּר",
+        "pronoun": "הוּא",
+        "tense": "עָתִיד"
+      },
+      "future_3fs": {
+        "form": "תְּדַבֵּר",
+        "pronoun": "הִיא",
+        "tense": "עָתִיד"
+      },
+      "future_3mp": {
+        "form": "יְדַבְּרוּ",
+        "pronoun": "הֵם",
+        "tense": "עָתִיד"
+      },
+      "future_3fp": {
+        "form": "תְּדַבֵּרְנָה",
+        "pronoun": "הֵן",
+        "tense": "עָתִיד"
+      },
+      "imperative_ms": {
+        "form": "דַּבֵּר!‏",
+        "pronoun": "אַתָּה",
+        "tense": "צִוּוּי"
+      },
+      "imperative_fs": {
+        "form": "דַּבְּרִי!‏",
+        "pronoun": "אַתְּ",
+        "tense": "צִוּוּי"
+      },
+      "imperative_mp": {
+        "form": "דַּבְּרוּ!‏",
+        "pronoun": "אַתֶּם",
+        "tense": "צִוּוּי"
+      },
+      "imperative_fp": {
+        "form": "דַּבֵּרְנָה!‏",
+        "pronoun": "אַתֶּן",
+        "tense": "צִוּוּי"
+      },
+      "infinitive": {
+        "form": "לְדַבֵּר",
+        "pronoun": "",
+        "tense": "מְקוֹר"
+      }
+    }
+  },
+  "לְהִתְלַבֵּשׁ": {
+    "infinitive": "לְהִתְלַבֵּשׁ",
+    "slug": "974-lehitlabesh",
+    "root": "ל - ב - שׁ",
+    "binyan": "",
+    "is_passive": false,
+    "reference_form": "לְהִתְלַבֵּשׁ",
+    "forms": {
+      "present_ms": {
+        "form": "מִתְלַבֵּשׁ",
+        "pronoun": "",
+        "tense": "הוֹוֶה (זכר יחיד)"
+      },
+      "present_fs": {
+        "form": "מִתְלַבֶּשֶׁת",
+        "pronoun": "",
+        "tense": "הוֹוֶה (נקבה יחיד)"
+      },
+      "present_mp": {
+        "form": "מִתְלַבְּשִׁים",
+        "pronoun": "",
+        "tense": "הוֹוֶה (זכר רבים)"
+      },
+      "present_fp": {
+        "form": "מִתְלַבְּשׁוֹת",
+        "pronoun": "",
+        "tense": "הוֹוֶה (נקבה רבים)"
+      },
+      "past_1s": {
+        "form": "הִתְלַבַּשְׁתִּי",
+        "pronoun": "אֲנִי",
+        "tense": "עָבָר"
+      },
+      "past_1p": {
+        "form": "הִתְלַבַּשְׁנוּ",
+        "pronoun": "אֲנַחְנוּ",
+        "tense": "עָבָר"
+      },
+      "past_2ms": {
+        "form": "הִתְלַבַּשְׁתָּ",
+        "pronoun": "אַתָּה",
+        "tense": "עָבָר"
+      },
+      "past_2fs": {
+        "form": "הִתְלַבַּשְׁתְּ",
+        "pronoun": "אַתְּ",
+        "tense": "עָבָר"
+      },
+      "past_2mp": {
+        "form": "הִתְלַבַּשְׁתֶּם",
+        "pronoun": "אַתֶּם",
+        "tense": "עָבָר"
+      },
+      "past_2fp": {
+        "form": "הִתְלַבַּשְׁתֶּן",
+        "pronoun": "אַתֶּן",
+        "tense": "עָבָר"
+      },
+      "past_3ms": {
+        "form": "הִתְלַבֵּשׁ",
+        "pronoun": "הוּא",
+        "tense": "עָבָר"
+      },
+      "past_3fs": {
+        "form": "הִתְלַבְּשָׁה",
+        "pronoun": "הִיא",
+        "tense": "עָבָר"
+      },
+      "past_3p": {
+        "form": "הִתְלַבְּשׁוּ",
+        "pronoun": "הֵם / הֵן",
+        "tense": "עָבָר"
+      },
+      "future_1s": {
+        "form": "אֶתְלַבֵּשׁ",
+        "pronoun": "אֲנִי",
+        "tense": "עָתִיד"
+      },
+      "future_1p": {
+        "form": "נִתְלַבֵּשׁ",
+        "pronoun": "אֲנַחְנוּ",
+        "tense": "עָתִיד"
+      },
+      "future_2ms": {
+        "form": "תִּתְלַבֵּשׁ",
+        "pronoun": "אַתָּה",
+        "tense": "עָתִיד"
+      },
+      "future_2fs": {
+        "form": "תִּתְלַבְּשִׁי",
+        "pronoun": "אַתְּ",
+        "tense": "עָתִיד"
+      },
+      "future_2mp": {
+        "form": "תִּתְלַבְּשׁוּ",
+        "pronoun": "אַתֶּם",
+        "tense": "עָתִיד"
+      },
+      "future_2fp": {
+        "form": "תִּתְלַבֵּשְׁנָה",
+        "pronoun": "אַתֶּן",
+        "tense": "עָתִיד"
+      },
+      "future_3ms": {
+        "form": "יִתְלַבֵּשׁ",
+        "pronoun": "הוּא",
+        "tense": "עָתִיד"
+      },
+      "future_3fs": {
+        "form": "תִּתְלַבֵּשׁ",
+        "pronoun": "הִיא",
+        "tense": "עָתִיד"
+      },
+      "future_3mp": {
+        "form": "יִתְלַבְּשׁוּ",
+        "pronoun": "הֵם",
+        "tense": "עָתִיד"
+      },
+      "future_3fp": {
+        "form": "תִּתְלַבֵּשְׁנָה",
+        "pronoun": "הֵן",
+        "tense": "עָתִיד"
+      },
+      "imperative_ms": {
+        "form": "הִתְלַבֵּשׁ!‏",
+        "pronoun": "אַתָּה",
+        "tense": "צִוּוּי"
+      },
+      "imperative_fs": {
+        "form": "הִתְלַבְּשִׁי!‏",
+        "pronoun": "אַתְּ",
+        "tense": "צִוּוּי"
+      },
+      "imperative_mp": {
+        "form": "הִתְלַבְּשׁוּ!‏",
+        "pronoun": "אַתֶּם",
+        "tense": "צִוּוּי"
+      },
+      "imperative_fp": {
+        "form": "הִתְלַבֵּשְׁנָה!‏",
+        "pronoun": "אַתֶּן",
+        "tense": "צִוּוּי"
+      },
+      "infinitive": {
+        "form": "לְהִתְלַבֵּשׁ",
+        "pronoun": "",
+        "tense": "מְקוֹר"
+      }
+    }
+  },
+  "לְהַגִּיד": {
+    "infinitive": "לְהַגִּיד",
+    "slug": "1135-lehagid",
+    "root": "נ - ג - ד",
+    "binyan": "",
+    "is_passive": false,
+    "reference_form": "לְהַגִּיד",
+    "forms": {
+      "present_ms": {
+        "form": "מַגִּיד",
+        "pronoun": "",
+        "tense": "הוֹוֶה (זכר יחיד)"
+      },
+      "present_fs": {
+        "form": "מַגִּידָה",
+        "pronoun": "",
+        "tense": "הוֹוֶה (נקבה יחיד)"
+      },
+      "present_mp": {
+        "form": "מַגִּידִים",
+        "pronoun": "",
+        "tense": "הוֹוֶה (זכר רבים)"
+      },
+      "present_fp": {
+        "form": "מַגִּידוֹת",
+        "pronoun": "",
+        "tense": "הוֹוֶה (נקבה רבים)"
+      },
+      "past_1s": {
+        "form": "הִגַּדְתִּי",
+        "pronoun": "אֲנִי",
+        "tense": "עָבָר"
+      },
+      "past_1p": {
+        "form": "הִגַּדְנוּ",
+        "pronoun": "אֲנַחְנוּ",
+        "tense": "עָבָר"
+      },
+      "past_2ms": {
+        "form": "הִגַּדְתָּ",
+        "pronoun": "אַתָּה",
+        "tense": "עָבָר"
+      },
+      "past_2fs": {
+        "form": "הִגַּדְתְּ",
+        "pronoun": "אַתְּ",
+        "tense": "עָבָר"
+      },
+      "past_2mp": {
+        "form": "הִגַּדְתֶּם",
+        "pronoun": "אַתֶּם",
+        "tense": "עָבָר"
+      },
+      "past_2fp": {
+        "form": "הִגַּדְתֶּן",
+        "pronoun": "אַתֶּן",
+        "tense": "עָבָר"
+      },
+      "past_3ms": {
+        "form": "הִגִּיד",
+        "pronoun": "הוּא",
+        "tense": "עָבָר"
+      },
+      "past_3fs": {
+        "form": "הִגִּידָה",
+        "pronoun": "הִיא",
+        "tense": "עָבָר"
+      },
+      "past_3p": {
+        "form": "הִגִּידוּ",
+        "pronoun": "הֵם / הֵן",
+        "tense": "עָבָר"
+      },
+      "future_1s": {
+        "form": "אַגִּיד",
+        "pronoun": "אֲנִי",
+        "tense": "עָתִיד"
+      },
+      "future_1p": {
+        "form": "נַגִּיד",
+        "pronoun": "אֲנַחְנוּ",
+        "tense": "עָתִיד"
+      },
+      "future_2ms": {
+        "form": "תַּגִּיד",
+        "pronoun": "אַתָּה",
+        "tense": "עָתִיד"
+      },
+      "future_2fs": {
+        "form": "תַּגִּידִי",
+        "pronoun": "אַתְּ",
+        "tense": "עָתִיד"
+      },
+      "future_2mp": {
+        "form": "תַּגִּידוּ",
+        "pronoun": "אַתֶּם",
+        "tense": "עָתִיד"
+      },
+      "future_2fp": {
+        "form": "תַּגֵּדְנָה",
+        "pronoun": "אַתֶּן",
+        "tense": "עָתִיד"
+      },
+      "future_3ms": {
+        "form": "יַגִּיד",
+        "pronoun": "הוּא",
+        "tense": "עָתִיד"
+      },
+      "future_3fs": {
+        "form": "תַּגִּיד",
+        "pronoun": "הִיא",
+        "tense": "עָתִיד"
+      },
+      "future_3mp": {
+        "form": "יַגִּידוּ",
+        "pronoun": "הֵם",
+        "tense": "עָתִיד"
+      },
+      "future_3fp": {
+        "form": "תַּגֵּדְנָה",
+        "pronoun": "הֵן",
+        "tense": "עָתִיד"
+      },
+      "imperative_ms": {
+        "form": "הַגֵּד!‏",
+        "pronoun": "אַתָּה",
+        "tense": "צִוּוּי"
+      },
+      "imperative_fs": {
+        "form": "הַגִּידִי!‏",
+        "pronoun": "אַתְּ",
+        "tense": "צִוּוּי"
+      },
+      "imperative_mp": {
+        "form": "הַגִּידוּ!‏",
+        "pronoun": "אַתֶּם",
+        "tense": "צִוּוּי"
+      },
+      "imperative_fp": {
+        "form": "הַגֵּדְנָה!‏",
+        "pronoun": "אַתֶּן",
+        "tense": "צִוּוּי"
+      },
+      "infinitive": {
+        "form": "לְהַגִּיד",
+        "pronoun": "",
+        "tense": "מְקוֹר"
+      }
+    }
+  },
+  "לְהוּגַד": null
+}
--- a/data/examples_cache.json
+++ b/data/examples_cache.json
@ -0,0 +1 @@
+{"אב": ["לא אב לחגלה אתה", "כרחם אב על בנים"], "אבא": ["כך כך אבא יקירי", "“אבא איננו בבית"], "אביבי": ["אמרת: תם אביבי,", "אמרת: תם אביבי,"], "אביב": ["אביב כי יתחדש –", "ברחובות תל־אביב"], "אבידה": ["אבידה בדבר מועט", "ואם לרבות אבידה"], "לאבוד": ["אבל נאנחתי לאבוד", "אנו הולכים לאבוד"], "להיאבד": [], "להתאבד": ["מעמד והחליטה להתאבד", "היא נסתה פעם להתאבד"], "איבוד": ["איבוד דמי משלוח", "איבוד עצמו לדעת"], "התאבדות": ["והביאו לידי התאבדות", "הקלון, בלתי אם התאבדות"], "להאביד": ["ויאמר להאביד זכרם –", "קול שם רשעים להאביד"], "אבדה": ["ועתה אבדה תקותה", "וכל תשועתו אבדה"], "אבוד": ["— הה, הנני אבוד", "“אבוד עצמי לדעת"], "לאבד": ["אין לכם מה לאבד", "יש חשש לאבד שנה"], "אבדון": ["אבדון, אש הנעורת", "אבדון, פתחי עולם"], "אבוקדו": ["מטעים רצופים של עצי אבוקדו ומנגו", "את זרעי הפירות וגידלו מהם שתילים חדשים של אבוקדו"], "אבזם": ["רצו אל הטבח, הגישו לו הפעם חגורה עם אבזם מבריק… הביאו כוס", "רגליו היו עטופות לפפות חדשות ומתניו חגורות חגורה חדשה עם אבזם"], "לאבזר": [], "איבחון": ["לשלבים גבוהים יותר של איבחון וריפוי", "איבחון נחפז, כמוהו כהיסוס מופרז, עלול לגרור תוצאות בלתי־נעימות"]}
--- a/data/frequency_cache.json
+++ b/data/frequency_cache.json
--- a/data/pealim_dict.csv
+++ b/data/pealim_dict.csv
--- a/data/pealim_dict_for_anki.csv
+++ b/data/pealim_dict_for_anki.csv
--- a/frequency_lookup.py
+++ b/frequency_lookup.py
@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+"""
+Hebrew word frequency lookup from hermitdave/FrequencyWords corpus.
+Downloads he_50k.txt once; subsequent runs read from cache.
+Exposed API: get_frequency_rank(word_no_nikkud) -> int | None
+"""
+
+import json
+import logging
+import re
+import unicodedata
+from pathlib import Path
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+FREQ_URL = (
+    "https://raw.githubusercontent.com/hermitdave/FrequencyWords/"
+    "master/content/2016/he/he_50k.txt"
+)
+CACHE_PATH = Path(__file__).parent / "data" / "frequency_cache.json"
+REQUEST_TIMEOUT = 30
+
+# Module-level cache: word_no_nikkud -> rank (1 = most common)
+_freq: dict[str, int] = {}
+
+
+def _strip_nikkud(text: str) -> str:
+    """Return *text* with nikkud and other nonspacing marks removed.
+
+    The input is NFD-decomposed first, then every character in Unicode
+    category ``Mn`` is dropped. The result is left in decomposed form.
+    """
+    decomposed = unicodedata.normalize("NFD", text)
+    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
+    return "".join(base_chars)
+
+
+def load(cache_path: Path = CACHE_PATH) -> None:
+    """Populate the module-level frequency table.
+
+    When *cache_path* exists, the table is read from that JSON file.
+    Otherwise the word list at ``FREQ_URL`` is downloaded, ranked, and
+    written to *cache_path* for later runs.
+
+    Args:
+        cache_path: Location of the JSON cache file.
+    """
+    global _freq
+
+    # Fast path: reuse the cache written by an earlier run.
+    if cache_path.exists():
+        with cache_path.open(encoding="utf-8") as cached:
+            _freq = json.load(cached)
+        logger.info(f"Frequency cache loaded: {len(_freq)} entries")
+        return
+
+    logger.info("Downloading FrequencyWords he_50k.txt …")
+    resp = requests.get(FREQ_URL, timeout=REQUEST_TIMEOUT)
+    resp.raise_for_status()
+
+    # Only the first token of each line is used. A word gets the next rank
+    # the first time it is seen; later duplicates (after nikkud stripping)
+    # are ignored and do not consume a rank.
+    next_rank = 1
+    for raw_line in resp.text.splitlines():
+        tokens = raw_line.split()
+        if not tokens:
+            continue
+        word = _strip_nikkud(tokens[0])
+        if not word or word in _freq:
+            continue
+        _freq[word] = next_rank
+        next_rank += 1
+
+    cache_path.parent.mkdir(parents=True, exist_ok=True)
+    with cache_path.open("w", encoding="utf-8") as out:
+        json.dump(_freq, out, ensure_ascii=False)
+    logger.info(f"Frequency cache saved: {len(_freq)} entries → {cache_path}")
+
+
+def get_frequency_rank(word_no_nikkud: str) -> int | None:
+    """Look up a word's frequency rank (1 = most common).
+
+    The input is trimmed and has its nikkud stripped before the lookup, so
+    pointed text is accepted as well.
+
+    Returns:
+        The rank, or ``None`` when the word is not in the table.
+    """
+    if not _freq:
+        # Table is filled on first use.
+        load()
+    lookup_key = _strip_nikkud(word_no_nikkud.strip())
+    return _freq.get(lookup_key)
+
+
+if __name__ == "__main__":
+    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
+    load()
+    # Spot-check a handful of words against the loaded table.
+    for sample in ("שלום", "ספר", "בית", "מים", "כלב"):
+        print(f"{sample}: rank {get_frequency_rank(sample)}")
--- a/requirements.txt
+++ b/requirements.txt
@ -1,3 +1,6 @@
 pandas>=1.3.0
 requests>=2.26.0
 numpy>=1.21.0
+genanki>=0.8.0
+beautifulsoup4>=4.11.0
+lxml>=4.9.0
--- a/run.py
+++ b/run.py
@ -1,48 +1,313 @@
 #!/usr/bin/env python3
 """
-Main entry point: orchestrate dictionary and conjugation extraction.
+Pealim Anki Deck Builder — full pipeline orchestrator.
+
+Usage:
+  python run.py [options]
+
+Options:
+  --skip-scrape        Use existing data/pealim_dict.csv (no pealim.com dict scraping)
+  --skip-audio         Skip audio .mp3 downloads
+  --skip-examples      Skip Ben Yehuda example fetching
+  --skip-conjugations  Skip verb conjugation extraction
+  --test N             Process only the first N dictionary words (for quick testing)
 """

+import argparse
+import json
 import logging
 import sys
+import time
 from pathlib import Path

-# Add current directory to path
 sys.path.insert(0, str(Path(__file__).parent))

-import pealim_extract
-import conjugation_extract
-
 logging.basicConfig(
    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s'
+    format="%(asctime)s %(levelname)s %(message)s",
 )
 logger = logging.getLogger(__name__)

+# Directory holding the dictionary CSVs, JSON caches and the audio/ folder.
+DATA_DIR = Path(__file__).parent / "data"
+# Directory print_summary() looks in for the generated .apkg files.
+OUTPUT_DIR = Path(__file__).parent / "output"

-def main():
-    """Run all extraction tasks."""
-    logger.info("=" * 60)
-    logger.info("PEALIM EXTRACTION SUITE")
-    logger.info("=" * 60)
+
+def parse_args():
+    """Parse the command line and return the resulting argparse namespace.
+
+    Defines four boolean ``--skip-*`` switches and an optional integer
+    ``--test N`` limit.
+    """
+    parser = argparse.ArgumentParser(description="Pealim Anki deck builder")
+
+    # (flag, help text) for every store_true switch, in display order.
+    switches = (
+        ("--skip-scrape", "Skip dict scraping; use cached CSV"),
+        ("--skip-audio", "Skip audio downloads"),
+        ("--skip-examples", "Skip Ben Yehuda example lookup"),
+        ("--skip-conjugations", "Skip verb conjugation extraction"),
+    )
+    for flag, help_text in switches:
+        parser.add_argument(flag, action="store_true", help=help_text)
+
+    parser.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
+    return parser.parse_args()
+
+
+def step_scrape(args):
+    """Step 1 — scrape the dictionary, or confirm a cached copy exists.
+
+    With ``--skip-scrape`` the function only checks that
+    ``data/pealim_dict.csv`` is present and exits the process with status 1
+    if it is not. Otherwise it scrapes via ``pealim_extract`` and writes both
+    the raw CSV and the ';'-separated Anki CSV.
+    """
+    dict_csv = DATA_DIR / "pealim_dict.csv"
+    anki_csv = DATA_DIR / "pealim_dict_for_anki.csv"
+
+    if args.skip_scrape:
+        if not dict_csv.exists():
+            logger.error(f"[1] --skip-scrape set but {dict_csv} not found. Aborting.")
+            sys.exit(1)
+        logger.info(f"[1] Using existing {dict_csv}")
+        return
+
+    logger.info("[1] Scraping dictionary from pealim.com …")
+    import pealim_extract
+    import pandas as pd  # not referenced directly in this step
+
+    scraped = pealim_extract.extract_from_website()
+    scraped.to_csv(dict_csv, index=True)
+    logger.info(f"    Saved {len(scraped)} words → {dict_csv}")
+
+    anki_ready = pealim_extract.modify_for_anki(scraped)
+    anki_ready.to_csv(anki_csv, sep=";", index=True)
+    logger.info(f"    Saved Anki CSV → {anki_csv}")
+
+
+def step_frequency():
+    """Step 2 — load (or download) word frequencies and return the table.
+
+    Returns the ``_freq`` dict held by the ``frequency_lookup`` module after
+    its ``load()`` has run.
+    """
+    logger.info("[2] Loading word frequency data …")
+    import frequency_lookup as freq_module
+
+    freq_module.load()
+    return freq_module._freq
+
+
+def step_examples(args, freq_cache: dict):
+    """Step 3 — load/build Ben Yehuda example index.
+
+    With ``--skip-examples`` no lookup is done: the contents of
+    ``data/examples_cache.json`` are returned if that file exists, else ``{}``.
+    Otherwise the Ben Yehuda index is loaded, examples are pre-fetched for
+    every dictionary word (limited by ``--test``), the cache is saved, and
+    ``benyehuda._examples_cache`` is returned.
+
+    Args:
+        args: Parsed CLI namespace; reads ``skip_examples`` and ``test``.
+        freq_cache: Accepted for signature symmetry; not used in this step.
+    """
+    if args.skip_examples:
+        logger.info("[3] Skipping examples (--skip-examples)")
+        examples_path = DATA_DIR / "examples_cache.json"
+        if examples_path.exists():
+            # Explicit UTF-8: the cached examples are Hebrew text, and the
+            # platform default encoding is not UTF-8 everywhere.
+            with open(examples_path, encoding="utf-8") as f:
+                return json.load(f)
+        return {}
+
+    logger.info("[3] Loading Ben Yehuda example index …")
+    import benyehuda
+    benyehuda.load()
+    # Pre-fetch examples for all words in the dict (uses cache)
+    dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
+    if not dict_csv.exists():
+        dict_csv = DATA_DIR / "pealim_dict.csv"

     try:
-        # Extract dictionary
-        logger.info("\n[1/2] Extracting dictionary...")
-        pealim_extract.main()
+        import pandas as pd
+        # The Anki CSV is ';'-separated while the plain dict CSV uses ','.
+        # A ';' parse yielding fewer than 3 columns is treated as the wrong
+        # separator and retried with the default.
+        try:
+            df = pd.read_csv(dict_csv, sep=";", index_col=0)
+            if df.shape[1] < 3:
+                raise ValueError
+        except Exception:
+            df = pd.read_csv(dict_csv, index_col=0)

-        # Extract conjugations
-        logger.info("\n[2/2] Extracting conjugations...")
-        conjugation_extract.main()
+        if args.test:
+            df = df.head(args.test)

-        logger.info("\n" + "=" * 60)
-        logger.info("✅ ALL TASKS COMPLETE")
-        logger.info("=" * 60)
+        import unicodedata
+        def strip(t):
+            return "".join(c for c in unicodedata.normalize("NFD", str(t))
+                           if unicodedata.category(c) != "Mn")
+
+        logger.info(f"    Pre-fetching examples for {len(df)} words …")
+        for _, row in df.iterrows():
+            word_plain = strip(str(row.get("Word Without Nikkud", "")).strip())
+            if word_plain:
+                benyehuda.get_examples(word_plain)

     except Exception as e:
-        logger.error(f"\n❌ EXTRACTION FAILED: {e}")
-        sys.exit(1)
+        # Best effort: a failed pre-fetch still saves whatever was cached.
+        logger.warning(f"    Could not pre-fetch all examples: {e}")
+
+    benyehuda.save_examples_cache()
+    return benyehuda._examples_cache


-if __name__ == '__main__':
+def step_audio(args):
+    """Step 4 — download audio .mp3 files.
+
+    For each dictionary row (limited by ``--test``) a file name is derived
+    from the word's Hebrew letters. Words whose .mp3 already exists under
+    ``data/audio`` are skipped; for the rest, the URL returned by
+    ``audio_extract.extract_audio_url`` is downloaded. Errors while reading
+    the CSV or iterating rows are logged as a warning rather than raised, and
+    individual download failures are logged at debug level.
+
+    Args:
+        args: Parsed CLI namespace; reads ``skip_audio`` and ``test``.
+    """
+    if args.skip_audio:
+        logger.info("[4] Skipping audio (--skip-audio)")
+        return
+
+    logger.info("[4] Downloading audio files …")
+    # Load audio URL cache (from old workspace if available)
+    audio_cache_path = DATA_DIR / "audio_cache.json"
+    audio_url_cache: dict = {}
+    if audio_cache_path.exists():
+        # Explicit UTF-8 so the read does not depend on the platform default.
+        with open(audio_cache_path, encoding="utf-8") as f:
+            audio_url_cache = json.load(f)
+
+    import audio_extract as ae
+    ae._audio_cache = audio_url_cache
+
+    dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
+    if not dict_csv.exists():
+        dict_csv = DATA_DIR / "pealim_dict.csv"
+
+    import re
+    import unicodedata
+
+    import pandas as pd
+    import requests
+
+    # Defined locally because this module has no AUDIO_DIR constant; this is
+    # the same directory print_summary() scans for .mp3 files.
+    audio_dir = DATA_DIR / "audio"
+    # Anything outside the Hebrew letter range is dropped from file names.
+    non_hebrew = re.compile(r"[^\u05d0-\u05ea]")
+
+    def strip_nik(t):
+        return "".join(c for c in unicodedata.normalize("NFD", t)
+                       if unicodedata.category(c) != "Mn")
+
+    try:
+        # Try the ';'-separated Anki layout first, then fall back to ','.
+        try:
+            df = pd.read_csv(dict_csv, sep=";", index_col=0)
+            if df.shape[1] < 3:
+                raise ValueError
+        except Exception:
+            df = pd.read_csv(dict_csv, index_col=0)
+
+        if args.test:
+            df = df.head(args.test)
+
+        audio_dir.mkdir(parents=True, exist_ok=True)
+        downloaded = 0
+        skipped = 0
+
+        for _, row in df.iterrows():
+            word = str(row.get("Word", "")).strip()
+            word_plain = str(row.get("Word Without Nikkud", "")).strip()
+            if not word:
+                continue
+
+            safe_name = non_hebrew.sub("", strip_nik(word_plain or word))
+            if not safe_name:
+                continue
+            mp3_path = audio_dir / f"{safe_name}.mp3"
+
+            if mp3_path.exists():
+                skipped += 1
+                continue
+
+            # Get audio URL from cache or fetch
+            audio_url = ae.extract_audio_url(word)
+            if audio_url:
+                try:
+                    resp = requests.get(audio_url, timeout=10)
+                    resp.raise_for_status()
+                    mp3_path.write_bytes(resp.content)
+                    downloaded += 1
+                    # Short pause between successful downloads.
+                    time.sleep(0.3)
+                except Exception as e:
+                    logger.debug(f"    Audio download failed for {word}: {e}")
+
+        ae.save_audio_cache(str(audio_cache_path))
+        logger.info(f"    Audio: {downloaded} downloaded, {skipped} already cached")
+
+    except Exception as e:
+        logger.warning(f"    Audio step failed: {e}")
+
+
+def step_build_vocab(args, examples_cache: dict, freq_cache: dict):
+    """Step 5 — build the vocabulary deck and write its .apkg.
+
+    Uses the Anki CSV when present, otherwise the plain dictionary CSV, and
+    passes ``args.test`` through as the row limit. Returns the deck object
+    produced by ``apkg_builder.build_vocab_deck``.
+    """
+    logger.info("[5] Building vocabulary deck …")
+    import apkg_builder
+
+    anki_csv = DATA_DIR / "pealim_dict_for_anki.csv"
+    source_csv = anki_csv if anki_csv.exists() else DATA_DIR / "pealim_dict.csv"
+
+    built = apkg_builder.build_vocab_deck(
+        source_csv,
+        examples_cache=examples_cache,
+        freq_cache=freq_cache,
+        limit=args.test,
+    )
+    deck, media = built
+    apkg_builder.write_vocab_apkg(deck, media)
+    logger.info(f"    Vocabulary .apkg → {apkg_builder.VOCAB_APKG}")
+    return deck
+
+
+def step_conjugations(args):
+    """Step 6 — extract verb conjugations and write the conjugation .apkg.
+
+    Returns whatever ``conjugation_extract.main`` produced for
+    ``verbs_input.txt``, or ``None`` when the step is skipped (flag set, or
+    the verbs file is missing).
+    """
+    if args.skip_conjugations:
+        logger.info("[6] Skipping conjugations (--skip-conjugations)")
+        return None
+
+    verb_list = Path(__file__).parent / "verbs_input.txt"
+    if not verb_list.exists():
+        logger.info("[6] verbs_input.txt not found — skipping conjugation deck")
+        return None
+
+    logger.info("[6] Extracting verb conjugations …")
+    import conjugation_extract
+    extracted = conjugation_extract.main(verb_list)
+
+    import apkg_builder
+    apkg_builder.write_conj_apkg(apkg_builder.build_conj_deck(extracted))
+    logger.info(f"    Conjugation .apkg → {apkg_builder.CONJ_APKG}")
+
+    return extracted
+
+
+def print_summary(args, examples_cache, freq_cache, conjugations):
+    """Log a closing report of what the pipeline produced.
+
+    Reports the dictionary row count, frequency and example cache sizes,
+    example coverage, number of .mp3 files, and the size of each .apkg that
+    exists under the output directory. ``args`` is accepted but not read.
+    """
+    rule = "=" * 60
+    logger.info("")
+    logger.info(rule)
+    logger.info("SUMMARY")
+    logger.info(rule)
+
+    anki_csv = DATA_DIR / "pealim_dict_for_anki.csv"
+    dict_csv = anki_csv if anki_csv.exists() else DATA_DIR / "pealim_dict.csv"
+    if dict_csv.exists():
+        import pandas as pd
+        # Try the ';'-separated layout first, then fall back to ','.
+        try:
+            df = pd.read_csv(dict_csv, sep=";", index_col=0)
+            if df.shape[1] < 3:
+                raise ValueError
+        except Exception:
+            df = pd.read_csv(dict_csv, index_col=0)
+        logger.info(f"  Dictionary words: {len(df)}")
+
+    total_examples = len(examples_cache)
+    logger.info(f"  Frequency entries: {len(freq_cache)}")
+    logger.info(f"  Example cache entries: {total_examples}")
+    # A word counts as covered when its cached value is non-empty.
+    covered = sum(1 for found in examples_cache.values() if found)
+    if examples_cache:
+        logger.info(f"  Example coverage: {covered}/{total_examples} ({100*covered//total_examples}%)")
+
+    audio_dir = DATA_DIR / "audio"
+    if audio_dir.exists():
+        mp3_count = len(list(audio_dir.glob("*.mp3")))
+        logger.info(f"  Audio files: {mp3_count}")
+
+    vocab_apkg = OUTPUT_DIR / "pealim_vocabulary.apkg"
+    if vocab_apkg.exists():
+        vocab_mb = vocab_apkg.stat().st_size / 1e6
+        logger.info(f"  Vocabulary .apkg: {vocab_mb:.1f} MB → {vocab_apkg}")
+
+    conj_apkg = OUTPUT_DIR / "pealim_conjugations.apkg"
+    if conj_apkg.exists():
+        conj_mb = conj_apkg.stat().st_size / 1e6
+        logger.info(f"  Conjugation .apkg: {conj_mb:.1f} MB → {conj_apkg}")
+        if conjugations:
+            # Only verbs with a non-empty result are counted.
+            verb_count = sum(1 for forms in conjugations.values() if forms)
+            logger.info(f"  Verbs in conjugation deck: {verb_count}")
+
+    logger.info(rule)
+    logger.info("✅ DONE")
+
+
+def main():
+    """Run the pipeline steps in order, then log the summary."""
+    args = parse_args()
+
+    rule = "=" * 60
+    logger.info(rule)
+    logger.info("PEALIM ANKI DECK BUILDER")
+    if args.test:
+        logger.info(f"  TEST MODE: {args.test} words")
+    logger.info(rule)
+
+    step_scrape(args)
+    freq_cache = step_frequency()
+    examples_cache = step_examples(args, freq_cache)
+    step_audio(args)
+    step_build_vocab(args, examples_cache, freq_cache)
+
+    # The conjugation step returns None when skipped; the summary gets {}.
+    conjugations = step_conjugations(args)
+    if not conjugations:
+        conjugations = {}
+
+    print_summary(args, examples_cache, freq_cache, conjugations)
+
+
+if __name__ == "__main__":
    main()
--- a/verbs_input.txt
+++ b/verbs_input.txt
@ -0,0 +1,10 @@
+# One Hebrew infinitive per line.
+# Lines starting with # are ignored.
+# Initial test set — one verb per binyan:
+לִשְׁמוֹר
+לְהִשָּׁמֵר
+לְדַבֵּר
+לְדֻבַּר
+לְהִתְלַבֵּשׁ
+לְהַגִּיד
+לְהוּגַד
				`@ -0,0 +1 @@`
				{"אב": ["לא אב לחגלה אתה", "כרחם אב על בנים"], "אבא": ["כך כך אבא יקירי", "“אבא איננו בבית"], "אביבי": ["אמרת: תם אביבי,", "אמרת: תם אביבי,"], "אביב": ["אביב כי יתחדש –", "ברחובות תל־אביב"], "אבידה": ["אבידה בדבר מועט", "ואם לרבות אבידה"], "לאבוד": ["אבל נאנחתי לאבוד", "אנו הולכים לאבוד"], "להיאבד": [], "להתאבד": ["מעמד והחליטה להתאבד", "היא נסתה פעם להתאבד"], "איבוד": ["איבוד דמי משלוח", "איבוד עצמו לדעת"], "התאבדות": ["והביאו לידי התאבדות", "הקלון, בלתי אם התאבדות"], "להאביד": ["ויאמר להאביד זכרם –", "קול שם רשעים להאביד"], "אבדה": ["ועתה אבדה תקותה", "וכל תשועתו אבדה"], "אבוד": ["— הה, הנני אבוד", "“אבוד עצמי לדעת"], "לאבד": ["אין לכם מה לאבד", "יש חשש לאבד שנה"], "אבדון": ["אבדון, אש הנעורת", "אבדון, פתחי עולם"], "אבוקדו": ["מטעים רצופים של עצי אבוקדו ומנגו", "את זרעי הפירות וגידלו מהם שתילים חדשים של אבוקדו"], "אבזם": ["רצו אל הטבח, הגישו לו הפעם חגורה עם אבזם מבריק… הביאו כוס", "רגליו היו עטופות לפפות חדשות ומתניו חגורות חגורה חדשה עם אבזם"], "לאבזר": [], "איבחון": ["לשלבים גבוהים יותר של איבחון וריפוי", "איבחון נחפז, כמוהו כהיסוס מופרז, עלול לגרור תוצאות בלתי־נעימות"]}