feat: add apkg builder, frequency, Ben Yehuda examples, conjugation deck

Implements four major improvements to the Pealim Anki deck pipeline:

1. Automated .apkg generation (genanki) — no more manual Anki Desktop step.
   Both vocabulary and conjugation decks are built programmatically.
2. Word frequency ranking from hermitdave/FrequencyWords he_50k corpus.
   Notes sorted by rank so Anki presents most common words first.
3. Example sentences from Ben Yehuda public domain corpus (not pealim.com).
   Downloads txt_stripped.zip, indexes 25k texts, ~89% coverage on test set.
4. Conjugation drill deck — one card per form × verb.
   Input: verbs_input.txt (Hebrew infinitives). Initial set: 7 verbs (one per binyan).
   Extracts 28 forms each via pealim.com/search/ + table parse.

New files:
  apkg_builder.py — genanki deck builder for both decks
  benyehuda.py — Ben Yehuda corpus downloader + sentence indexer
  frequency_lookup.py — FrequencyWords downloader + rank lookup
  verbs_input.txt — verb input list (7 test verbs, one per binyan)
  data/ — baseline CSVs + generated caches

Updated:
  conjugation_extract.py — rewritten: reads verbs_input.txt, searches /search/?q= for slug, parses table by row labels
  requirements.txt — add genanki, beautifulsoup4, lxml
  run.py — full orchestration pipeline with CLI flags
  .gitignore — exclude venv/, benyehuda_index.json, audio/, output/

CLI:
  python run.py --skip-scrape --skip-audio --test 20   (quick test)
  python run.py --skip-scrape                          (full build)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 01:58:31 +00:00 · 2026-03-03 01:58:31 +00:00 · b086123bec
commit b086123bec
parent e23b353064
13 changed files with 23502 additions and 162 deletions
--- a/.gitignore
+++ b/.gitignore
@ -8,3 +8,15 @@ lib**
 include**
 lib64**
 pyvenv.cfg
 venv/
 __pycache__/
 *.pyc
 # Large generated cache files (rebuild locally)
 data/benyehuda_index.json
 # Audio directory (large; rebuild with --skip-scrape)
 data/audio/
 # Output .apkg files (generated by pipeline)
 output/
--- a/apkg_builder.py
+++ b/apkg_builder.py
@ -0,0 +1,428 @@
 #!/usr/bin/env python3
 """
 Build Anki .apkg files for both the vocabulary deck and the conjugation deck.
 Uses genanki for reliable, stable deck generation.
 Deck IDs are hardcoded integers — same ID on re-import updates the existing deck
 in Anki rather than creating a duplicate.
 """
 import json
 import logging
 import unicodedata
 from pathlib import Path
 from typing import Optional
 import genanki
 import pandas as pd
 logger = logging.getLogger(__name__)
 # Stable deck/model IDs — do not change these
 VOCAB_DECK_ID   = 1_234_567_890
 VOCAB_MODEL_ID  = 1_234_567_891
 CONJ_DECK_ID    = 1_234_567_892
 CONJ_MODEL_ID   = 1_234_567_893
 DATA_DIR   = Path(__file__).parent / "data"
 AUDIO_DIR  = DATA_DIR / "audio"
 OUTPUT_DIR = Path(__file__).parent / "output"
 VOCAB_APKG  = OUTPUT_DIR / "pealim_vocabulary.apkg"
 CONJ_APKG   = OUTPUT_DIR / "pealim_conjugations.apkg"
 # ──────────────────────────────────────────────────────────────────────────────
 # Shared CSS
 # ──────────────────────────────────────────────────────────────────────────────
 CARD_CSS = """
 .card {
  font-family: Arial, sans-serif;
  font-size: 20px;
  text-align: center;
  color: #222;
  background: #fff;
  padding: 16px;
 }
 .hebrew {
  font-size: 36px;
  font-weight: bold;
  direction: rtl;
  text-align: right;
  line-height: 1.5;
  color: #1a1a8c;
 }
 .hebrew-sm {
  font-size: 24px;
  direction: rtl;
  text-align: right;
  color: #333;
 }
 .label {
  font-size: 13px;
  color: #888;
  text-transform: uppercase;
  letter-spacing: 0.05em;
  margin-top: 10px;
 }
 .meaning {
  font-size: 28px;
  color: #111;
  margin: 8px 0;
 }
 .root-info {
  font-size: 16px;
  color: #555;
  margin-top: 6px;
  direction: rtl;
 }
 .example {
  font-size: 16px;
  color: #444;
  direction: rtl;
  text-align: right;
  font-style: italic;
  margin-top: 10px;
  border-left: 3px solid #aaa;
  padding-left: 8px;
 }
 .divider { border-top: 1px solid #ddd; margin: 10px 0; }
 .freq-badge {
  display: inline-block;
  font-size: 12px;
  color: #fff;
  background: #0070c0;
  border-radius: 10px;
  padding: 2px 8px;
  margin-top: 4px;
 }
 """
 # ──────────────────────────────────────────────────────────────────────────────
 # Vocabulary Deck
 # ──────────────────────────────────────────────────────────────────────────────
 VOCAB_FRONT_HEB = """
 <div class="hebrew">{{Word}}</div>
 {{#Audio}}<div>{{Audio}}</div>{{/Audio}}
 <div class="label">What does this mean?</div>
 """
 VOCAB_BACK_HEB = """
 {{FrontSide}}
 <div class="divider"></div>
 <div class="meaning">{{Meaning}}</div>
 <div class="label">Root</div>
 <div class="hebrew-sm">{{Root}}</div>
 <div class="label">Part of Speech</div>
 <div style="font-size:15px;color:#555">{{PoS}}</div>
 {{#SharedRoots}}
 <div class="label">Related words (same root)</div>
 <div class="root-info">{{SharedRoots}}</div>
 {{/SharedRoots}}
 {{#Example}}
 <div class="label">Example</div>
 <div class="example">{{Example}}</div>
 {{/Example}}
 {{#Frequency}}<div class="freq-badge">Rank #{{Frequency}}</div>{{/Frequency}}
 """
 VOCAB_FRONT_ENG = """
 <div class="meaning">{{Meaning}}</div>
 <div class="label">Translate to Hebrew</div>
 """
 VOCAB_BACK_ENG = """
 {{FrontSide}}
 <div class="divider"></div>
 <div class="hebrew">{{Word}}</div>
 {{#Audio}}<div>{{Audio}}</div>{{/Audio}}
 <div class="label">Without nikkud</div>
 <div class="hebrew-sm">{{WordNoNikkud}}</div>
 <div class="label">Root</div>
 <div class="hebrew-sm">{{Root}}</div>
 <div class="label">Part of Speech</div>
 <div style="font-size:15px;color:#555">{{PoS}}</div>
 {{#Example}}
 <div class="label">Example</div>
 <div class="example">{{Example}}</div>
 {{/Example}}
 """
 VOCAB_MODEL = genanki.Model(
    VOCAB_MODEL_ID,
    "Pealim Hebrew",
    fields=[
        {"name": "Word"},
        {"name": "Root"},
        {"name": "PoS"},
        {"name": "Meaning"},
        {"name": "WordNoNikkud"},
        {"name": "SharedRoots"},
        {"name": "Tags"},
        {"name": "Audio"},
        {"name": "Example"},
        {"name": "Frequency"},
    ],
    templates=[
        {
            "name": "Hebrew → English",
            "qfmt": VOCAB_FRONT_HEB,
            "afmt": VOCAB_BACK_HEB,
        },
        {
            "name": "English → Hebrew",
            "qfmt": VOCAB_FRONT_ENG,
            "afmt": VOCAB_BACK_ENG,
        },
    ],
    css=CARD_CSS,
 )
 # ──────────────────────────────────────────────────────────────────────────────
 # Conjugation Deck
 # ──────────────────────────────────────────────────────────────────────────────
 CONJ_FRONT = """
 <div class="label">פועל (Verb)</div>
 <div class="hebrew">{{ReferenceForm}}</div>
 {{#Pronoun}}<div class="hebrew-sm">{{Pronoun}}</div>{{/Pronoun}}
 <div class="label">זמן (Tense)</div>
 <div class="hebrew-sm">{{Tense}}</div>
 <div class="label">מה הצורה? (What is the form?)</div>
 """
 CONJ_BACK = """
 {{FrontSide}}
 <div class="divider"></div>
 <div class="hebrew">{{ConjugatedForm}}</div>
 <div class="label">שורש (Root): {{Root}} &nbsp;|&nbsp; בניין (Binyan): {{Binyan}}</div>
 """
 CONJ_CSS = CARD_CSS + """
 .card { direction: rtl; }
 .label { direction: ltr; }
 """
 CONJ_MODEL = genanki.Model(
    CONJ_MODEL_ID,
    "Pealim Conjugation",
    fields=[
        {"name": "Infinitive"},
        {"name": "ReferenceForm"},
        {"name": "Pronoun"},
        {"name": "Tense"},
        {"name": "ConjugatedForm"},
        {"name": "Root"},
        {"name": "Binyan"},
    ],
    templates=[
        {
            "name": "Conjugation Drill",
            "qfmt": CONJ_FRONT,
            "afmt": CONJ_BACK,
        }
    ],
    css=CONJ_CSS,
 )
 # ──────────────────────────────────────────────────────────────────────────────
 # Helpers
 # ──────────────────────────────────────────────────────────────────────────────
 def _strip_nikkud(text: str) -> str:
    return "".join(
        ch for ch in unicodedata.normalize("NFD", text)
        if unicodedata.category(ch) != "Mn"
    )
 def _audio_tag(word_no_nikkud: str) -> str:
    """Return [sound:xxx.mp3] if audio file exists, else empty string."""
    safe = re.sub(r"[^\u05d0-\u05ea]", "", word_no_nikkud)
    if not safe:
        return ""
    mp3_path = AUDIO_DIR / f"{safe}.mp3"
    if mp3_path.exists():
        return f"[sound:{mp3_path.name}]"
    return ""
 import re
 def build_vocab_deck(
    dict_csv: Path,
    examples_cache: Optional[dict] = None,
    freq_cache: Optional[dict] = None,
    limit: Optional[int] = None,
 ) -> tuple[genanki.Deck, list[Path]]:
    """
    Build the vocabulary deck from pealim_dict_for_anki.csv (or pealim_dict.csv).

    Args:
        dict_csv: Dictionary CSV; read with ';' as separator first, falling
            back to ',' when that fails or yields fewer than 3 columns.
        examples_cache: Mapping of nikkud-free word -> list of example
            sentences. At most the first two are put on a card.
        freq_cache: Mapping of nikkud-free word -> frequency rank (lower is
            more common). Words missing from it are sorted last.
        limit: If truthy, only the first *limit* CSV rows are used (applied
            before the frequency sort).

    Returns (deck, list_of_media_files).
    """
    unranked = 999_999  # sentinel rank for words absent from freq_cache

    def clean(value, blanks=("nan", "None")) -> str:
        """Stringify a CSV cell, mapping missing-value markers to ''."""
        text = str(value).strip()
        return "" if text in blanks else text

    def plain_word(row) -> str:
        """Nikkud-free spelling, accepting either column naming."""
        return clean(row.get("Word Without Nikkud", row.get("WordNoNikkud", "")))

    logger.info("Loading dictionary from %s", dict_csv)
    # Try semicolon separator first (enriched CSV), fall back to comma
    try:
        df = pd.read_csv(dict_csv, sep=";", index_col=0)
        if df.shape[1] < 3:
            raise ValueError("too few columns")
    except Exception:
        df = pd.read_csv(dict_csv, index_col=0)
    if limit:
        df = df.head(limit)
    logger.info("  %d rows loaded", len(df))
    examples_cache = examples_cache or {}
    freq_cache = freq_cache or {}

    # Sort by frequency rank (ascending) so Anki presents common words first.
    # Ranks are computed row by row (works for an empty frame too) and attached
    # with assign() so the head() slice is never mutated in place. The stable
    # sort keeps CSV order among words that share a rank (e.g. all unranked).
    ranks = [
        freq_cache.get(_strip_nikkud(plain_word(row)), unranked)
        for _, row in df.iterrows()
    ]
    df = df.assign(_freq_rank=ranks).sort_values("_freq_rank", kind="stable")

    deck = genanki.Deck(VOCAB_DECK_ID, "Pealim Hebrew Vocabulary")
    media_files: list[Path] = []
    seen_media: set[Path] = set()  # O(1) duplicate check for media_files
    for _, row in df.iterrows():
        word = clean(row.get("Word", ""))
        meaning = clean(row.get("Meaning", ""))
        if not word or not meaning:
            # A card needs both sides; this also rejects missing cells that
            # pandas renders as "nan".
            continue
        root = clean(row.get("Root", ""), blanks=("nan", "None", "-"))
        pos = clean(row.get("Part of speech", row.get("Part of Speech", "")))
        word_no_nik = plain_word(row)
        shared_roots = clean(row.get("shared roots", row.get("SharedRoots", "")))
        tags_str = clean(row.get("tags", row.get("Tags", "")))
        rank = row["_freq_rank"]
        freq_rank = str(int(rank)) if rank < unranked else ""

        # Audio
        audio_tag = _audio_tag(word_no_nik)
        if audio_tag:
            mp3_path = AUDIO_DIR / audio_tag[len("[sound:"):-1]
            if mp3_path not in seen_media:
                seen_media.add(mp3_path)
                media_files.append(mp3_path)

        # Example sentences
        plain_key = _strip_nikkud(word_no_nik)
        examples_list = examples_cache.get(plain_key, examples_cache.get(word_no_nik, []))
        example_html = "<br>".join(examples_list[:2]) if examples_list else ""

        note = genanki.Note(
            model=VOCAB_MODEL,
            fields=[
                word,
                root,
                pos,
                meaning,
                word_no_nik,
                shared_roots,
                tags_str,
                audio_tag,
                example_html,
                freq_rank,
            ],
            tags=tags_str.split(),
        )
        deck.add_note(note)
    logger.info("Vocabulary deck: %d notes", len(deck.notes))
    return deck, media_files
 def build_conj_deck(conjugations: dict) -> genanki.Deck:
    """Create the conjugation drill deck: one note per usable form of each verb.

    *conjugations* maps an infinitive to a dict with ``forms`` and optional
    ``root`` / ``binyan`` / ``reference_form`` entries (the conjugations.json
    layout). Verbs without forms, and forms without any Hebrew letter, are
    skipped.
    """
    deck = genanki.Deck(CONJ_DECK_ID, "Pealim Hebrew Conjugations")
    note_count = 0
    for infinitive, data in conjugations.items():
        if not (data and data.get("forms")):
            continue
        verb_fields = (
            data.get("root", ""),
            data.get("binyan", ""),
        )
        ref_form = data.get("reference_form", infinitive)
        for form_data in data["forms"].values():
            conj_form = form_data.get("form", "")
            has_hebrew = bool(conj_form) and re.search(r"[\u05d0-\u05ea]", conj_form)
            if not has_hebrew:
                continue
            deck.add_note(
                genanki.Note(
                    model=CONJ_MODEL,
                    fields=[
                        infinitive,
                        ref_form,
                        form_data.get("pronoun", ""),
                        form_data.get("tense", ""),
                        conj_form,
                        *verb_fields,
                    ],
                )
            )
            note_count += 1
    verb_total = sum(1 for entry in conjugations.values() if entry)
    logger.info(f"Conjugation deck: {note_count} notes across {verb_total} verbs")
    return deck
 def write_vocab_apkg(
    deck: genanki.Deck,
    media_files: list[Path],
    out_path: Path = VOCAB_APKG,
 ) -> None:
    """Write *deck* to *out_path*, bundling the media files that exist on disk.

    The parent directory of *out_path* is created when missing.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    package = genanki.Package(deck)
    existing_media = []
    for media_path in media_files:
        if media_path.exists():
            existing_media.append(str(media_path))
    package.media_files = existing_media
    package.write_to_file(str(out_path))
    logger.info(f"Vocabulary deck written → {out_path}")
 def write_conj_apkg(deck: genanki.Deck, out_path: Path = CONJ_APKG) -> None:
    """Write the conjugation *deck* (no media) to *out_path*.

    The parent directory of *out_path* is created when missing.
    """
    target_dir = out_path.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    package = genanki.Package(deck)
    package.write_to_file(str(out_path))
    logger.info(f"Conjugation deck written → {out_path}")
 if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    # Quick self-test with 20 words, no audio, no examples
    csv_path = DATA_DIR / "pealim_dict_for_anki.csv"
    if not csv_path.exists():
        csv_path = DATA_DIR / "pealim_dict.csv"
    deck, media = build_vocab_deck(csv_path, limit=20)
    write_vocab_apkg(deck, media)
    conj_path = DATA_DIR / "conjugations.json"
    if conj_path.exists():
        # The file holds Hebrew text: read it as UTF-8 explicitly instead of
        # relying on the platform's default encoding.
        with open(conj_path, encoding="utf-8") as f:
            conjugations = json.load(f)
        conj_deck = build_conj_deck(conjugations)
        write_conj_apkg(conj_deck)
--- a/benyehuda.py
+++ b/benyehuda.py
@ -0,0 +1,160 @@
 #!/usr/bin/env python3
 """
 Ben Yehuda corpus example-sentence lookup.
 Downloads plaintext-no-nikkud ZIP once, indexes sentences, then answers queries locally.
 Exposed API: get_examples(word_no_nikkud) -> list[str]
 """
 import json
 import logging
 import re
 import unicodedata
 import zipfile
 from io import BytesIO
 from pathlib import Path
 import requests
 logger = logging.getLogger(__name__)
 CORPUS_URL = (
    "https://github.com/projectbenyehuda/public_domain_dump/releases/"
    "download/2025-10/txt_stripped.zip"
 )
 INDEX_PATH = Path(__file__).parent / "data" / "benyehuda_index.json"
 EXAMPLES_CACHE_PATH = Path(__file__).parent / "data" / "examples_cache.json"
 REQUEST_TIMEOUT = 120
 MIN_SENTENCE_LEN = 15
 MAX_EXAMPLES_PER_WORD = 2
 MAX_INDEX_ENTRIES = 500  # cap examples kept per word in index to limit memory
 # Module-level state
 _index: dict[str, list[str]] = {}          # word -> [sentence, ...]
 _examples_cache: dict[str, list[str]] = {} # word -> cached result for this run
 def _strip_nikkud(text: str) -> str:
    return "".join(
        ch for ch in unicodedata.normalize("NFD", text)
        if unicodedata.category(ch) != "Mn"
    )
 def _split_sentences(text: str) -> list[str]:
    """Cut *text* at runs of 1–3 sentence terminators or newlines.

    Each piece is stripped of surrounding whitespace; pieces shorter than
    MIN_SENTENCE_LEN characters are discarded.
    """
    pieces = (piece.strip() for piece in re.split(r"[.!?؟\n]{1,3}", text))
    return [piece for piece in pieces if len(piece) >= MIN_SENTENCE_LEN]
 def _build_index(corpus_zip_bytes: bytes) -> None:
    """Parse corpus ZIP and build word → sentences index.

    Every ``.txt`` member is decoded as UTF-8 (undecodable bytes ignored) and
    split into sentences. Each distinct token of two or more characters in a
    sentence is mapped to that sentence, keeping at most MAX_INDEX_ENTRIES
    sentences per token. Members that cannot be read are logged and skipped.

    The module-level ``_index`` is replaced only after the whole archive has
    been processed, so a failure part-way through leaves the previous index
    in place.
    """
    global _index
    # Tokens are runs of Hebrew letters plus quote characters.
    token_re = re.compile(r"[\u05d0-\u05ea'\"]+")
    index: dict[str, list[str]] = {}
    logger.info("Building Ben Yehuda index from corpus …")
    with zipfile.ZipFile(BytesIO(corpus_zip_bytes)) as zf:
        txt_files = [n for n in zf.namelist() if n.endswith(".txt")]
        logger.info(f"  Corpus contains {len(txt_files)} text files")
        for fname in txt_files:
            try:
                raw = zf.read(fname).decode("utf-8", errors="ignore")
            except Exception as exc:
                # Best effort: one unreadable member must not abort the build,
                # but it should not vanish silently either.
                logger.warning("  Skipping unreadable corpus file %s: %s", fname, exc)
                continue
            for sentence in _split_sentences(raw):
                for w in set(token_re.findall(sentence)):
                    if len(w) < 2:
                        continue
                    bucket = index.setdefault(w, [])
                    if len(bucket) < MAX_INDEX_ENTRIES:
                        bucket.append(sentence)
    _index = index
    logger.info(f"Index built: {len(_index)} unique words")
 def _save_index() -> None:
    """Serialise the in-memory index to INDEX_PATH as UTF-8 JSON."""
    target_dir = INDEX_PATH.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with INDEX_PATH.open("w", encoding="utf-8") as handle:
        json.dump(_index, handle, ensure_ascii=False)
    logger.info(f"Ben Yehuda index saved → {INDEX_PATH}")
 def _load_index() -> None:
    """Replace the in-memory index with the JSON stored at INDEX_PATH."""
    global _index
    with INDEX_PATH.open(encoding="utf-8") as handle:
        loaded = json.load(handle)
    _index = loaded
    logger.info(f"Ben Yehuda index loaded: {len(_index)} words")
 def load(force_rebuild: bool = False) -> None:
    """Make the sentence index available, building it when necessary.

    Does nothing when an index is already in memory, unless *force_rebuild*
    is set. Otherwise the persisted examples cache is read (if present) and
    the index is either loaded from INDEX_PATH or, when that file is missing
    or a rebuild is forced, rebuilt from a fresh corpus download and saved.
    """
    global _index, _examples_cache
    already_loaded = bool(_index)
    if already_loaded and not force_rebuild:
        return
    # Load persisted examples cache
    if EXAMPLES_CACHE_PATH.exists():
        with open(EXAMPLES_CACHE_PATH, encoding="utf-8") as cache_file:
            _examples_cache = json.load(cache_file)
    index_on_disk = INDEX_PATH.exists()
    if index_on_disk and not force_rebuild:
        _load_index()
        return
    logger.info("Downloading Ben Yehuda corpus … (this may take 1-2 minutes)")
    response = requests.get(CORPUS_URL, timeout=REQUEST_TIMEOUT, stream=True)
    response.raise_for_status()
    payload = response.content
    size_mb = len(payload) / 1e6
    logger.info(f"Corpus downloaded: {size_mb:.1f} MB")
    _build_index(payload)
    _save_index()
 def save_examples_cache() -> None:
    """Persist the per-word examples cache to EXAMPLES_CACHE_PATH as UTF-8 JSON."""
    cache_dir = EXAMPLES_CACHE_PATH.parent
    cache_dir.mkdir(parents=True, exist_ok=True)
    with EXAMPLES_CACHE_PATH.open("w", encoding="utf-8") as handle:
        json.dump(_examples_cache, handle, ensure_ascii=False)
    entry_count = len(_examples_cache)
    logger.info(f"Examples cache saved: {entry_count} entries → {EXAMPLES_CACHE_PATH}")
 def get_examples(word_no_nikkud: str) -> list[str]:
    """
    Return up to MAX_EXAMPLES_PER_WORD of the shortest distinct indexed
    sentences containing word_no_nikkud as a whole token.

    The index is loaded on first use. Nikkud and surrounding whitespace are
    stripped from the query before lookup. Results are cached per word, so
    subsequent calls for the same word are instant and return the same list
    object.
    """
    if not _index:
        load()
    word = _strip_nikkud(word_no_nikkud.strip())
    if word in _examples_cache:
        return _examples_cache[word]
    candidates = _index.get(word, [])
    # Filter: must contain word as whole token (no word character on either side)
    pattern = re.compile(r"(?<![^\s\W])" + re.escape(word) + r"(?![^\s\W])")
    # The index stores a sentence once per occurrence in the corpus, so the
    # same text can appear several times; de-duplicate (keeping first-seen
    # order) so the examples returned are different sentences.
    matched = list(dict.fromkeys(s for s in candidates if pattern.search(s)))
    # Sort by length (prefer shorter, more natural sentences)
    matched.sort(key=len)
    result = matched[:MAX_EXAMPLES_PER_WORD]
    _examples_cache[word] = result
    return result
 if __name__ == "__main__":
    # Manual smoke test: look up a handful of common words and print the
    # (truncated) examples found, then persist the cache.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    load()
    for sample in ("שלום", "בית", "ספר", "מים", "אהבה", "ילד"):
        found = get_examples(sample)
        print(f"\n{sample}: {len(found)} examples")
        for sentence in found:
            print(f"  → {sentence[:80]}")
    save_examples_cache()
--- a/conjugation_extract.py
+++ b/conjugation_extract.py
@ -1,153 +1,408 @@
 #!/usr/bin/env python3
 """
 Extract Hebrew verb conjugations from pealim.com.
-Scrapes conjugation tables for specific verbs.
+Input: verbs_input.txt  (one Hebrew infinitive per line)
 Output: data/conjugations.json
 For each verb:
  1. Search pealim.com/search/?q=<verb> to find URL slug
  2. Fetch /dict/<slug>/ with hebstyle=mo cookie
  3. Parse conjugation table by row labels
 Resume-safe: verbs already in conjugations.json are skipped.
 """
-import requests
+import json
 import pandas as pd
 import numpy as np
 import logging
 import re
 import time
 import urllib.parse
 from pathlib import Path
 import requests
 from bs4 import BeautifulSoup
 # Configure logging
 logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
 )
 logger = logging.getLogger(__name__)
-# Session for connection pooling
+PEALIM_BASE = "https://www.pealim.com"
 REQUEST_DELAY = 1.5
 REQUEST_TIMEOUT = 15
 VERBS_INPUT = Path(__file__).parent / "verbs_input.txt"
 CONJUGATIONS_PATH = Path(__file__).parent / "data" / "conjugations.json"
 # Pronoun labels (for card front display)
 PRONOUN_LABELS = {
    "present_ms": "",
    "present_fs": "",
    "present_mp": "",
    "present_fp": "",
    "past_1s":    "אֲנִי",
    "past_1p":    "אֲנַחְנוּ",
    "past_2ms":   "אַתָּה",
    "past_2fs":   "אַתְּ",
    "past_2mp":   "אַתֶּם",
    "past_2fp":   "אַתֶּן",
    "past_3ms":   "הוּא",
    "past_3fs":   "הִיא",
    "past_3p":    "הֵם / הֵן",
    "future_1s":  "אֲנִי",
    "future_1p":  "אֲנַחְנוּ",
    "future_2ms": "אַתָּה",
    "future_2fs": "אַתְּ",
    "future_2mp": "אַתֶּם",
    "future_2fp": "אַתֶּן",
    "future_3ms": "הוּא",
    "future_3fs": "הִיא",
    "future_3mp": "הֵם",
    "future_3fp": "הֵן",
    "imperative_ms": "אַתָּה",
    "imperative_fs": "אַתְּ",
    "imperative_mp": "אַתֶּם",
    "imperative_fp": "אַתֶּן",
    "infinitive": "",
 }
 # Human-readable tense description for card front
 TENSE_DESCRIPTION = {
    "present_ms": "הוֹוֶה (זכר יחיד)",
    "present_fs": "הוֹוֶה (נקבה יחיד)",
    "present_mp": "הוֹוֶה (זכר רבים)",
    "present_fp": "הוֹוֶה (נקבה רבים)",
    "past_1s":    "עָבָר",
    "past_1p":    "עָבָר",
    "past_2ms":   "עָבָר",
    "past_2fs":   "עָבָר",
    "past_2mp":   "עָבָר",
    "past_2fp":   "עָבָר",
    "past_3ms":   "עָבָר",
    "past_3fs":   "עָבָר",
    "past_3p":    "עָבָר",
    "future_1s":  "עָתִיד",
    "future_1p":  "עָתִיד",
    "future_2ms": "עָתִיד",
    "future_2fs": "עָתִיד",
    "future_2mp": "עָתִיד",
    "future_2fp": "עָתִיד",
    "future_3ms": "עָתִיד",
    "future_3fs": "עָתִיד",
    "future_3mp": "עָתִיד",
    "future_3fp": "עָתִיד",
    "imperative_ms": "צִוּוּי",
    "imperative_fs": "צִוּוּי",
    "imperative_mp": "צִוּוּי",
    "imperative_fp": "צִוּוּי",
    "infinitive": "מְקוֹר",
 }
 session = requests.Session()
-session.headers.update({
+session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki/2.0)"})
    'User-Agent': 'Mozilla/5.0 (compatible; pealim-scraper/1.0)'
 })
 PEALIM_BASE_URL = "https://www.pealim.com/dict"
 REQUEST_TIMEOUT = 10
 REQUEST_DELAY = 1.0  # seconds between requests (respectful scraping)
 # Conjugation column order (standard Hebrew verb forms)
 CONJUGATION_COLUMNS = [
    'present_ms', 'present_fs', 'present_mp', 'present_fp',
    'past_1s', 'past_1p', 'past_2ms', 'past_2fs', 'past_2mp', 'past_2fp',
    'past_3ms', 'past_3fs', 'past_3p',
    'future_1s', 'future_1p', 'future_2ms', 'future_2fs', 'future_2mp', 'future_2fp',
    'future_3ms', 'future_3fs', 'future_3mp', 'future_3fp',
    'imperative_ms', 'imperative_fs', 'imperative_mp', 'imperative_fp',
    'infinitive'
 ]
-def extract_verb(url_suffix: str, max_retries: int = 3) -> pd.DataFrame:
+def _find_slug(infinitive: str) -> str | None:
-    """
+    """Search pealim.com/search/?q=<verb> and return the URL slug."""
-    Extract conjugation table for a single verb.
+    url = f"{PEALIM_BASE}/search/?q={urllib.parse.quote(infinitive)}"
    Args:
        url_suffix: URL suffix (e.g., '2255-lishmor', '860-lishon')
        max_retries: Maximum retry attempts on failure
    Returns:
        DataFrame with conjugation forms, or None if extraction fails
    """
    url = f"{PEALIM_BASE_URL}/{url_suffix}"
    for attempt in range(max_retries):
        try:
            logger.info(f"Fetching: {url} (attempt {attempt + 1}/{max_retries})")
            cookies = {
                'translit': 'none',
                'hebstyle': 'bp',
                'showmeaning': 'off'
            }
            response = session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            # Parse HTML table
            dfs = pd.read_html(response.content)
            if not dfs:
                logger.warning(f"No tables found for {url_suffix}")
                return None
            df = dfs[0]
            # Extract conjugation forms (skip header columns, flatten)
            # Adjust indices based on actual table structure
            np_flat = df.iloc[:, 2:].values.flatten()
            # Remove NaN and invalid entries
            np_flat = np.delete(np_flat, [5, 7, 15, 17, 19, 33, 34, 35])
            # Create DataFrame with proper column names
            df_result = pd.DataFrame([np_flat], columns=CONJUGATION_COLUMNS)
            logger.info(f"✓ Extracted {url_suffix}")
            return df_result
        except requests.RequestException as e:
            logger.error(f"Network error for {url_suffix} (attempt {attempt + 1}): {e}")
            if attempt < max_retries - 1:
                time.sleep(2 ** attempt)  # Exponential backoff
            else:
                return None
        except Exception as e:
            logger.error(f"Error parsing {url_suffix}: {e}")
            return None
 def extract_from_website(url_suffixes: list = None) -> pd.DataFrame:
    """
    Extract conjugations for multiple verbs.
    Args:
        url_suffixes: List of URL suffixes to process
    Returns:
        Combined DataFrame with all conjugations
    """
    if url_suffixes is None:
        # Default verbs: "to guard" and "to sleep"
        url_suffixes = ['2255-lishmor', '860-lishon']
    logger.info(f"Starting extraction for {len(url_suffixes)} verb(s)...")
    all_dfs = []
    for url_suffix in url_suffixes:
        df = extract_verb(url_suffix)
        if df is not None:
            all_dfs.append(df)
        time.sleep(0.5)  # Small delay between requests
    if not all_dfs:
        logger.error("No data extracted!")
        return pd.DataFrame()
    combined_df = pd.concat(all_dfs, ignore_index=True)
    logger.info(f"Extraction complete. Total verbs: {len(combined_df)}")
    return combined_df
 def main():
    """Main entry point."""
    try:
-        df = extract_from_website()
+        resp = session.get(url, timeout=REQUEST_TIMEOUT)
-        
+        resp.raise_for_status()
-        if df.empty:
+        # Slugs look like /dict/2255-lishmor/
-            logger.error("No data to save!")
+        slugs = re.findall(r"/dict/(\d+[-][^/?\"<>\s]+)/", resp.text)
-            return
+        if slugs:
-        
+            slug = slugs[0]
-        df.to_csv('conjugations.csv', sep=';', index=True)
+            logger.info(f"  Slug: {slug}")
-        logger.info("Saved: conjugations.csv")
+            return slug
        logger.info("\n" + df.to_string())
        logger.info("✅ Complete!")
    except Exception as e:
-        logger.error(f"Fatal error: {e}")
+        logger.error(f"  Error searching for '{infinitive}': {e}")
-        raise
+    return None
-if __name__ == '__main__':
+def _is_passive_binyan(binyan: str) -> bool:
-    main()
+    for marker in ["פֻּעַל", "הֻפְעַל", "Pu'al", "Huf'al", "pual", "hufal"]:
        if marker.lower() in binyan.lower():
            return True
    return False
 def _get_menukad(cell) -> str:
    """Extract nikkud Hebrew text from a table cell."""
    span = cell.find("span", class_="menukad")
    if span:
        return span.get_text(strip=True)
    # fallback: any Hebrew text in cell
    txt = cell.get_text(strip=True)
    if re.search(r"[\u05d0-\u05ea]", txt):
        return txt
    return ""
 def _parse_table(soup: BeautifulSoup) -> dict[str, str]:
    """
    Parse the pealim conjugation table and return form_key -> Hebrew form mapping.

    Returns an empty dict when the page has no ``conjugation-table`` or the
    table has fewer than 9 rows. Tense blocks are located by the first row
    whose text contains the tense name; person rows are taken relative to it.

    Table structure (rows after two header rows):
      Row 2  (Present): [label x2] [ms] [fs] [mp] [fp]
      Row 3  (Past 1):  [Past x1] [1st x1] [1s x2] [1p x2]
      Row 4  (Past 2):  [2nd x1] [2ms] [2fs] [2mp] [2fp]
      Row 5  (Past 3):  [3rd x1] [3ms] [3fs] [3p x2]
      Row 6  (Fut 1):   [Future x1] [1st x1] [1s x2] [1p x2]
      Row 7  (Fut 2):   [2nd x1] [2ms] [2fs] [2mp] [2fp]
      Row 8  (Fut 3):   [3rd x1] [3ms] [3fs] [3mp] [3fp]
      Row 9  (Imp):     [Imp x2] [ms] [fs] [mp] [fp]
      Row 10 (Inf):     [Inf x2] [form x4]
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}
    rows = table.find_all("tr")
    if len(rows) < 9:
        return {}

    hebrew_re = re.compile(r"[\u05d0-\u05ea]")
    forms: dict[str, str] = {}

    def hebrew_cells(row_idx: int) -> list[str]:
        """Hebrew forms of a row in column order; label cells are skipped and
        each form is repeated once per column it spans."""
        values: list[str] = []
        for cell in rows[row_idx].find_all(["th", "td"]):
            txt = _get_menukad(cell)
            if txt and hebrew_re.search(txt):
                values.extend([txt] * int(cell.get("colspan", 1)))
        return values

    def assign(row_idx: int, keys: list[str], dedupe: bool) -> None:
        """Map *keys* onto the forms of row *row_idx* (no-op past the table end)."""
        if row_idx >= len(rows):
            return
        values = hebrew_cells(row_idx)
        if dedupe:
            # Collapse the repeats produced by colspan=2 cells (order kept).
            values = list(dict.fromkeys(values))
        forms.update(zip(keys, values))

    # Locate the first row of each tense block. Checked in this order, one
    # match per row, and only the first row found for each tense counts.
    tenses = ("present", "past", "future", "imperative", "infinitive")
    located = dict.fromkeys(tenses, -1)
    for i, row in enumerate(rows):
        label = row.get_text(" ", strip=True).lower()
        for tense in tenses:
            if located[tense] < 0 and tense in label:
                located[tense] = i
                break

    # Per tense: (row offset from the tense's first row, form keys, dedupe?)
    layout = {
        "present": [
            (0, ["present_ms", "present_fs", "present_mp", "present_fp"], False),
        ],
        "past": [
            (0, ["past_1s", "past_1p"], True),
            (1, ["past_2ms", "past_2fs", "past_2mp", "past_2fp"], False),
            (2, ["past_3ms", "past_3fs", "past_3p"], True),
        ],
        "future": [
            (0, ["future_1s", "future_1p"], True),
            (1, ["future_2ms", "future_2fs", "future_2mp", "future_2fp"], False),
            (2, ["future_3ms", "future_3fs", "future_3mp", "future_3fp"], False),
        ],
        "imperative": [
            (0, ["imperative_ms", "imperative_fs", "imperative_mp", "imperative_fp"], False),
        ],
        "infinitive": [
            (0, ["infinitive"], False),
        ],
    }
    for tense, specs in layout.items():
        base = located[tense]
        if base < 0:
            continue  # tense block not present in this table
        for offset, keys, dedupe in specs:
            assign(base + offset, keys, dedupe)
    return forms
 def _extract_conjugations(slug: str, infinitive: str) -> dict | None:
    """Fetch /dict/<slug>/ and parse its conjugation table.

    Args:
        slug: Dictionary slug interpolated into ``{PEALIM_BASE}/dict/{slug}/``.
        infinitive: The verb as supplied by the caller. It is stored in the
            result and used as the fallback reference form.

    Returns:
        A dict with the keys ``infinitive``, ``slug``, ``root``, ``binyan``,
        ``is_passive``, ``reference_form`` and ``forms``, or ``None`` when the
        page cannot be fetched or no forms could be parsed from it.
    """
    url = f"{PEALIM_BASE}/dict/{slug}/"
    try:
        resp = session.get(url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
    except Exception as e:
        # Best effort: a failed fetch is logged and reported as "no data".
        logger.error(f"  Error fetching {url}: {e}")
        return None
    soup = BeautifulSoup(resp.text, "lxml")
    # Root: the first "menukad" span that contains Hebrew letters and a hyphen.
    root = ""
    for span in soup.find_all("span", class_="menukad"):
        txt = span.get_text(strip=True)
        if txt and re.search(r"[\u05d0-\u05ea]", txt) and "-" in txt:
            root = txt
            break
    # Binyan: looked up in the og:description meta tag.
    binyan = ""
    meta = soup.find("meta", {"property": "og:description"})
    if meta:
        # Compare case-insensitively and with apostrophe look-alikes folded to
        # "'", so spellings such as "PA'AL" or "Pa\u2019al" are recognised too.
        # NOTE(review): presumably the page does not use the exact spelling
        # listed below -- confirm against a live page.
        desc = (meta.get("content", "") or "").casefold()
        for variant in ("\u2019", "\u2018", "\u02bc", "\u05f3"):
            desc = desc.replace(variant, "'")
        for bname in ["Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al"]:
            if bname.casefold() in desc:
                binyan = bname
                break
    forms = _parse_table(soup)
    if not forms:
        logger.warning(f"  No forms found for {slug}")
        return None
    is_passive = _is_passive_binyan(binyan)
    # Passive verbs are referenced by their past 3ms form, all others by the
    # parsed infinitive; either falls back to the caller-supplied infinitive.
    reference_form = forms.get("infinitive", infinitive) if not is_passive else forms.get("past_3ms", infinitive)
    result = {
        "infinitive": infinitive,
        "slug": slug,
        "root": root,
        "binyan": binyan,
        "is_passive": is_passive,
        "reference_form": reference_form,
        "forms": {},
    }
    # Only form keys that have a pronoun label are kept.
    for key, form in forms.items():
        if key in PRONOUN_LABELS:
            result["forms"][key] = {
                "form": form,
                "pronoun": PRONOUN_LABELS[key],
                "tense": TENSE_DESCRIPTION.get(key, ""),
            }
    logger.info(f"  Extracted {len(result['forms'])} forms for {infinitive}")
    return result
 def _load_conjugations() -> dict:
    """Return the JSON content of CONJUGATIONS_PATH, or an empty dict if the file is absent."""
    if not CONJUGATIONS_PATH.exists():
        return {}
    with open(CONJUGATIONS_PATH, encoding="utf-8") as cache_file:
        return json.loads(cache_file.read())
 def _save_conjugations(data: dict) -> None:
    """Write *data* to CONJUGATIONS_PATH as indented UTF-8 JSON, creating the parent directory if needed."""
    target_dir = CONJUGATIONS_PATH.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with open(CONJUGATIONS_PATH, "w", encoding="utf-8") as cache_file:
        json.dump(data, cache_file, ensure_ascii=False, indent=2)
 def main(verbs_file: Path = VERBS_INPUT) -> dict:
    """Read verbs from file and extract conjugations. Returns full conjugations dict.

    Each non-blank line of *verbs_file* that does not start with ``#`` (after
    stripping surrounding whitespace) is treated as one verb. Verbs already
    present in the cached conjugations -- including ones cached as ``None`` --
    are skipped. The cache is saved after every processed verb.

    Args:
        verbs_file: Text file listing the verbs, one per line.

    Returns:
        The conjugations mapping (verb -> data dict or ``None``). If
        *verbs_file* does not exist, the cached mapping is returned unchanged.
    """
    if not verbs_file.exists():
        logger.warning(f"verbs_input.txt not found at {verbs_file} — skipping")
        return _load_conjugations()
    # Strip before filtering so an indented "# comment" line is not mistaken
    # for a verb.
    stripped_lines = (
        line.strip() for line in verbs_file.read_text(encoding="utf-8").splitlines()
    )
    verbs = [v for v in stripped_lines if v and not v.startswith("#")]
    logger.info(f"Loaded {len(verbs)} verbs from {verbs_file}")
    conjugations = _load_conjugations()
    new_count = 0
    for verb in verbs:
        if verb in conjugations:
            logger.info(f"Skipping {verb} (cached)")
            continue
        logger.info(f"Processing: {verb}")
        # Pause before each outgoing request.
        time.sleep(REQUEST_DELAY)
        slug = _find_slug(verb)
        if not slug:
            logger.warning(f"  No slug found for {verb}")
            # Record the miss so the verb is not looked up again on later runs.
            conjugations[verb] = None
            _save_conjugations(conjugations)
            continue
        time.sleep(REQUEST_DELAY)
        data = _extract_conjugations(slug, verb)
        # NOTE(review): a None result (fetch error or empty table) is cached
        # too, so the verb will not be retried -- confirm this is intended.
        conjugations[verb] = data
        _save_conjugations(conjugations)
        new_count += 1
    logger.info(f"Done: {new_count} new verbs processed")
    return conjugations
 if __name__ == "__main__":
    # Script entry point: run the extraction and print a short summary per verb.
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
    for verb, entry in main().items():
        if not entry:
            print(f"{verb}: no data")
            continue
        verb_forms = entry.get("forms", {})
        print(f"{verb}: {len(verb_forms)} forms, binyan={entry.get('binyan')}")
        # Show only the first three forms as a sample.
        for key in list(verb_forms)[:3]:
            print(f"  {key}: {verb_forms[key]['form']}")
--- a/data/conjugations.json
+++ b/data/conjugations.json
@ -0,0 +1,903 @@
 {
  "לִשְׁמוֹר": {
    "infinitive": "לִשְׁמוֹר",
    "slug": "2255-lishmor",
    "root": "שׁ - מ - ר",
    "binyan": "",
    "is_passive": false,
    "reference_form": "לִשְׁמֹר",
    "forms": {
      "present_ms": {
        "form": "שׁוֹמֵר",
        "pronoun": "",
        "tense": "הוֹוֶה (זכר יחיד)"
      },
      "present_fs": {
        "form": "שׁוֹמֶרֶת",
        "pronoun": "",
        "tense": "הוֹוֶה (נקבה יחיד)"
      },
      "present_mp": {
        "form": "שׁוֹמְרִים",
        "pronoun": "",
        "tense": "הוֹוֶה (זכר רבים)"
      },
      "present_fp": {
        "form": "שׁוֹמְרוֹת",
        "pronoun": "",
        "tense": "הוֹוֶה (נקבה רבים)"
      },
      "past_1s": {
        "form": "שָׁמַרְתִּי",
        "pronoun": "אֲנִי",
        "tense": "עָבָר"
      },
      "past_1p": {
        "form": "שָׁמַרְנוּ",
        "pronoun": "אֲנַחְנוּ",
        "tense": "עָבָר"
      },
      "past_2ms": {
        "form": "שָׁמַרְתָּ",
        "pronoun": "אַתָּה",
        "tense": "עָבָר"
      },
      "past_2fs": {
        "form": "שָׁמַרְתְּ",
        "pronoun": "אַתְּ",
        "tense": "עָבָר"
      },
      "past_2mp": {
        "form": "שְׁמַרְתֶּם",
        "pronoun": "אַתֶּם",
        "tense": "עָבָר"
      },
      "past_2fp": {
        "form": "שְׁמַרְתֶּן",
        "pronoun": "אַתֶּן",
        "tense": "עָבָר"
      },
      "past_3ms": {
        "form": "שָׁמַר",
        "pronoun": "הוּא",
        "tense": "עָבָר"
      },
      "past_3fs": {
        "form": "שָׁמְרָה",
        "pronoun": "הִיא",
        "tense": "עָבָר"
      },
      "past_3p": {
        "form": "שָׁמְרוּ",
        "pronoun": "הֵם / הֵן",
        "tense": "עָבָר"
      },
      "future_1s": {
        "form": "אֶשְׁמֹר",
        "pronoun": "אֲנִי",
        "tense": "עָתִיד"
      },
      "future_1p": {
        "form": "נִשְׁמֹר",
        "pronoun": "אֲנַחְנוּ",
        "tense": "עָתִיד"
      },
      "future_2ms": {
        "form": "תִּשְׁמֹר",
        "pronoun": "אַתָּה",
        "tense": "עָתִיד"
      },
      "future_2fs": {
        "form": "תִּשְׁמְרִי",
        "pronoun": "אַתְּ",
        "tense": "עָתִיד"
      },
      "future_2mp": {
        "form": "תִּשְׁמְרוּ",
        "pronoun": "אַתֶּם",
        "tense": "עָתִיד"
      },
      "future_2fp": {
        "form": "תִּשְׁמֹרְנָה",
        "pronoun": "אַתֶּן",
        "tense": "עָתִיד"
      },
      "future_3ms": {
        "form": "יִשְׁמֹר",
        "pronoun": "הוּא",
        "tense": "עָתִיד"
      },
      "future_3fs": {
        "form": "תִּשְׁמֹר",
        "pronoun": "הִיא",
        "tense": "עָתִיד"
      },
      "future_3mp": {
        "form": "יִשְׁמְרוּ",
        "pronoun": "הֵם",
        "tense": "עָתִיד"
      },
      "future_3fp": {
        "form": "תִּשְׁמֹרְנָה",
        "pronoun": "הֵן",
        "tense": "עָתִיד"
      },
      "imperative_ms": {
        "form": "שְׁמֹר!‏",
        "pronoun": "אַתָּה",
        "tense": "צִוּוּי"
      },
      "imperative_fs": {
        "form": "שִׁמְרִי!‏",
        "pronoun": "אַתְּ",
        "tense": "צִוּוּי"
      },
      "imperative_mp": {
        "form": "שִׁמְרוּ!‏",
        "pronoun": "אַתֶּם",
        "tense": "צִוּוּי"
      },
      "imperative_fp": {
        "form": "שְׁמֹרְנָה!‏",
        "pronoun": "אַתֶּן",
        "tense": "צִוּוּי"
      },
      "infinitive": {
        "form": "לִשְׁמֹר",
        "pronoun": "",
        "tense": "מְקוֹר"
      }
    }
  },
  "לְהִשָּׁמֵר": {
    "infinitive": "לְהִשָּׁמֵר",
    "slug": "2256-lehishamer",
    "root": "שׁ - מ - ר",
    "binyan": "",
    "is_passive": false,
    "reference_form": "לְהִשָּׁמֵר",
    "forms": {
      "present_ms": {
        "form": "נִשְׁמָר",
        "pronoun": "",
        "tense": "הוֹוֶה (זכר יחיד)"
      },
      "present_fs": {
        "form": "נִשְׁמֶרֶת",
        "pronoun": "",
        "tense": "הוֹוֶה (נקבה יחיד)"
      },
      "present_mp": {
        "form": "נִשְׁמָרִים",
        "pronoun": "",
        "tense": "הוֹוֶה (זכר רבים)"
      },
      "present_fp": {
        "form": "נִשְׁמָרוֹת",
        "pronoun": "",
        "tense": "הוֹוֶה (נקבה רבים)"
      },
      "past_1s": {
        "form": "נִשְׁמַרְתִּי",
        "pronoun": "אֲנִי",
        "tense": "עָבָר"
      },
      "past_1p": {
        "form": "נִשְׁמַרְנוּ",
        "pronoun": "אֲנַחְנוּ",
        "tense": "עָבָר"
      },
      "past_2ms": {
        "form": "נִשְׁמַרְתָּ",
        "pronoun": "אַתָּה",
        "tense": "עָבָר"
      },
      "past_2fs": {
        "form": "נִשְׁמַרְתְּ",
        "pronoun": "אַתְּ",
        "tense": "עָבָר"
      },
      "past_2mp": {
        "form": "נִשְׁמַרְתֶּם",
        "pronoun": "אַתֶּם",
        "tense": "עָבָר"
      },
      "past_2fp": {
        "form": "נִשְׁמַרְתֶּן",
        "pronoun": "אַתֶּן",
        "tense": "עָבָר"
      },
      "past_3ms": {
        "form": "נִשְׁמַר",
        "pronoun": "הוּא",
        "tense": "עָבָר"
      },
      "past_3fs": {
        "form": "נִשְׁמְרָה",
        "pronoun": "הִיא",
        "tense": "עָבָר"
      },
      "past_3p": {
        "form": "נִשְׁמְרוּ",
        "pronoun": "הֵם / הֵן",
        "tense": "עָבָר"
      },
      "future_1s": {
        "form": "אֶשָּׁמֵר",
        "pronoun": "אֲנִי",
        "tense": "עָתִיד"
      },
      "future_1p": {
        "form": "נִשָּׁמֵר",
        "pronoun": "אֲנַחְנוּ",
        "tense": "עָתִיד"
      },
      "future_2ms": {
        "form": "תִּשָּׁמֵר",
        "pronoun": "אַתָּה",
        "tense": "עָתִיד"
      },
      "future_2fs": {
        "form": "תִּשָּׁמְרִי",
        "pronoun": "אַתְּ",
        "tense": "עָתִיד"
      },
      "future_2mp": {
        "form": "תִּשָּׁמְרוּ",
        "pronoun": "אַתֶּם",
        "tense": "עָתִיד"
      },
      "future_2fp": {
        "form": "תִּשָּׁמַרְנָה",
        "pronoun": "אַתֶּן",
        "tense": "עָתִיד"
      },
      "future_3ms": {
        "form": "יִשָּׁמֵר",
        "pronoun": "הוּא",
        "tense": "עָתִיד"
      },
      "future_3fs": {
        "form": "תִּשָּׁמֵר",
        "pronoun": "הִיא",
        "tense": "עָתִיד"
      },
      "future_3mp": {
        "form": "יִשָּׁמְרוּ",
        "pronoun": "הֵם",
        "tense": "עָתִיד"
      },
      "future_3fp": {
        "form": "תִּשָּׁמַרְנָה",
        "pronoun": "הֵן",
        "tense": "עָתִיד"
      },
      "imperative_ms": {
        "form": "הִשָּׁמֵר!‏",
        "pronoun": "אַתָּה",
        "tense": "צִוּוּי"
      },
      "imperative_fs": {
        "form": "הִשָּׁמְרִי!‏",
        "pronoun": "אַתְּ",
        "tense": "צִוּוּי"
      },
      "imperative_mp": {
        "form": "הִשָּׁמְרוּ!‏",
        "pronoun": "אַתֶּם",
        "tense": "צִוּוּי"
      },
      "imperative_fp": {
        "form": "הִשָּׁמַרְנָה!‏",
        "pronoun": "אַתֶּן",
        "tense": "צִוּוּי"
      },
      "infinitive": {
        "form": "לְהִשָּׁמֵר",
        "pronoun": "",
        "tense": "מְקוֹר"
      }
    }
  },
  "לְדַבֵּר": {
    "infinitive": "לְדַבֵּר",
    "slug": "2-ledaber",
    "root": "ד - ב - ר",
    "binyan": "",
    "is_passive": false,
    "reference_form": "לְדַבֵּר",
    "forms": {
      "present_ms": {
        "form": "מְדַבֵּר",
        "pronoun": "",
        "tense": "הוֹוֶה (זכר יחיד)"
      },
      "present_fs": {
        "form": "מְדַבֶּרֶת",
        "pronoun": "",
        "tense": "הוֹוֶה (נקבה יחיד)"
      },
      "present_mp": {
        "form": "מְדַבְּרִים",
        "pronoun": "",
        "tense": "הוֹוֶה (זכר רבים)"
      },
      "present_fp": {
        "form": "מְדַבְּרוֹת",
        "pronoun": "",
        "tense": "הוֹוֶה (נקבה רבים)"
      },
      "past_1s": {
        "form": "דִּבַּרְתִּי",
        "pronoun": "אֲנִי",
        "tense": "עָבָר"
      },
      "past_1p": {
        "form": "דִּבַּרְנוּ",
        "pronoun": "אֲנַחְנוּ",
        "tense": "עָבָר"
      },
      "past_2ms": {
        "form": "דִּבַּרְתָּ",
        "pronoun": "אַתָּה",
        "tense": "עָבָר"
      },
      "past_2fs": {
        "form": "דִּבַּרְתְּ",
        "pronoun": "אַתְּ",
        "tense": "עָבָר"
      },
      "past_2mp": {
        "form": "דִּבַּרְתֶּם",
        "pronoun": "אַתֶּם",
        "tense": "עָבָר"
      },
      "past_2fp": {
        "form": "דִּבַּרְתֶּן",
        "pronoun": "אַתֶּן",
        "tense": "עָבָר"
      },
      "past_3ms": {
        "form": "דִּבֵּר",
        "pronoun": "הוּא",
        "tense": "עָבָר"
      },
      "past_3fs": {
        "form": "דִּבְּרָה",
        "pronoun": "הִיא",
        "tense": "עָבָר"
      },
      "past_3p": {
        "form": "דִּבְּרוּ",
        "pronoun": "הֵם / הֵן",
        "tense": "עָבָר"
      },
      "future_1s": {
        "form": "אֲדַבֵּר",
        "pronoun": "אֲנִי",
        "tense": "עָתִיד"
      },
      "future_1p": {
        "form": "נְדַבֵּר",
        "pronoun": "אֲנַחְנוּ",
        "tense": "עָתִיד"
      },
      "future_2ms": {
        "form": "תְּדַבֵּר",
        "pronoun": "אַתָּה",
        "tense": "עָתִיד"
      },
      "future_2fs": {
        "form": "תְּדַבְּרִי",
        "pronoun": "אַתְּ",
        "tense": "עָתִיד"
      },
      "future_2mp": {
        "form": "תְּדַבְּרוּ",
        "pronoun": "אַתֶּם",
        "tense": "עָתִיד"
      },
      "future_2fp": {
        "form": "תְּדַבֵּרְנָה",
        "pronoun": "אַתֶּן",
        "tense": "עָתִיד"
      },
      "future_3ms": {
        "form": "יְדַבֵּר",
        "pronoun": "הוּא",
        "tense": "עָתִיד"
      },
      "future_3fs": {
        "form": "תְּדַבֵּר",
        "pronoun": "הִיא",
        "tense": "עָתִיד"
      },
      "future_3mp": {
        "form": "יְדַבְּרוּ",
        "pronoun": "הֵם",
        "tense": "עָתִיד"
      },
      "future_3fp": {
        "form": "תְּדַבֵּרְנָה",
        "pronoun": "הֵן",
        "tense": "עָתִיד"
      },
      "imperative_ms": {
        "form": "דַּבֵּר!‏",
        "pronoun": "אַתָּה",
        "tense": "צִוּוּי"
      },
      "imperative_fs": {
        "form": "דַּבְּרִי!‏",
        "pronoun": "אַתְּ",
        "tense": "צִוּוּי"
      },
      "imperative_mp": {
        "form": "דַּבְּרוּ!‏",
        "pronoun": "אַתֶּם",
        "tense": "צִוּוּי"
      },
      "imperative_fp": {
        "form": "דַּבֵּרְנָה!‏",
        "pronoun": "אַתֶּן",
        "tense": "צִוּוּי"
      },
      "infinitive": {
        "form": "לְדַבֵּר",
        "pronoun": "",
        "tense": "מְקוֹר"
      }
    }
  },
  "לְדֻבַּר": {
    "infinitive": "לְדֻבַּר",
    "slug": "2-ledaber",
    "root": "ד - ב - ר",
    "binyan": "",
    "is_passive": false,
    "reference_form": "לְדַבֵּר",
    "forms": {
      "present_ms": {
        "form": "מְדַבֵּר",
        "pronoun": "",
        "tense": "הוֹוֶה (זכר יחיד)"
      },
      "present_fs": {
        "form": "מְדַבֶּרֶת",
        "pronoun": "",
        "tense": "הוֹוֶה (נקבה יחיד)"
      },
      "present_mp": {
        "form": "מְדַבְּרִים",
        "pronoun": "",
        "tense": "הוֹוֶה (זכר רבים)"
      },
      "present_fp": {
        "form": "מְדַבְּרוֹת",
        "pronoun": "",
        "tense": "הוֹוֶה (נקבה רבים)"
      },
      "past_1s": {
        "form": "דִּבַּרְתִּי",
        "pronoun": "אֲנִי",
        "tense": "עָבָר"
      },
      "past_1p": {
        "form": "דִּבַּרְנוּ",
        "pronoun": "אֲנַחְנוּ",
        "tense": "עָבָר"
      },
      "past_2ms": {
        "form": "דִּבַּרְתָּ",
        "pronoun": "אַתָּה",
        "tense": "עָבָר"
      },
      "past_2fs": {
        "form": "דִּבַּרְתְּ",
        "pronoun": "אַתְּ",
        "tense": "עָבָר"
      },
      "past_2mp": {
        "form": "דִּבַּרְתֶּם",
        "pronoun": "אַתֶּם",
        "tense": "עָבָר"
      },
      "past_2fp": {
        "form": "דִּבַּרְתֶּן",
        "pronoun": "אַתֶּן",
        "tense": "עָבָר"
      },
      "past_3ms": {
        "form": "דִּבֵּר",
        "pronoun": "הוּא",
        "tense": "עָבָר"
      },
      "past_3fs": {
        "form": "דִּבְּרָה",
        "pronoun": "הִיא",
        "tense": "עָבָר"
      },
      "past_3p": {
        "form": "דִּבְּרוּ",
        "pronoun": "הֵם / הֵן",
        "tense": "עָבָר"
      },
      "future_1s": {
        "form": "אֲדַבֵּר",
        "pronoun": "אֲנִי",
        "tense": "עָתִיד"
      },
      "future_1p": {
        "form": "נְדַבֵּר",
        "pronoun": "אֲנַחְנוּ",
        "tense": "עָתִיד"
      },
      "future_2ms": {
        "form": "תְּדַבֵּר",
        "pronoun": "אַתָּה",
        "tense": "עָתִיד"
      },
      "future_2fs": {
        "form": "תְּדַבְּרִי",
        "pronoun": "אַתְּ",
        "tense": "עָתִיד"
      },
      "future_2mp": {
        "form": "תְּדַבְּרוּ",
        "pronoun": "אַתֶּם",
        "tense": "עָתִיד"
      },
      "future_2fp": {
        "form": "תְּדַבֵּרְנָה",
        "pronoun": "אַתֶּן",
        "tense": "עָתִיד"
      },
      "future_3ms": {
        "form": "יְדַבֵּר",
        "pronoun": "הוּא",
        "tense": "עָתִיד"
      },
      "future_3fs": {
        "form": "תְּדַבֵּר",
        "pronoun": "הִיא",
        "tense": "עָתִיד"
      },
      "future_3mp": {
        "form": "יְדַבְּרוּ",
        "pronoun": "הֵם",
        "tense": "עָתִיד"
      },
      "future_3fp": {
        "form": "תְּדַבֵּרְנָה",
        "pronoun": "הֵן",
        "tense": "עָתִיד"
      },
      "imperative_ms": {
        "form": "דַּבֵּר!‏",
        "pronoun": "אַתָּה",
        "tense": "צִוּוּי"
      },
      "imperative_fs": {
        "form": "דַּבְּרִי!‏",
        "pronoun": "אַתְּ",
        "tense": "צִוּוּי"
      },
      "imperative_mp": {
        "form": "דַּבְּרוּ!‏",
        "pronoun": "אַתֶּם",
        "tense": "צִוּוּי"
      },
      "imperative_fp": {
        "form": "דַּבֵּרְנָה!‏",
        "pronoun": "אַתֶּן",
        "tense": "צִוּוּי"
      },
      "infinitive": {
        "form": "לְדַבֵּר",
        "pronoun": "",
        "tense": "מְקוֹר"
      }
    }
  },
  "לְהִתְלַבֵּשׁ": {
    "infinitive": "לְהִתְלַבֵּשׁ",
    "slug": "974-lehitlabesh",
    "root": "ל - ב - שׁ",
    "binyan": "",
    "is_passive": false,
    "reference_form": "לְהִתְלַבֵּשׁ",
    "forms": {
      "present_ms": {
        "form": "מִתְלַבֵּשׁ",
        "pronoun": "",
        "tense": "הוֹוֶה (זכר יחיד)"
      },
      "present_fs": {
        "form": "מִתְלַבֶּשֶׁת",
        "pronoun": "",
        "tense": "הוֹוֶה (נקבה יחיד)"
      },
      "present_mp": {
        "form": "מִתְלַבְּשִׁים",
        "pronoun": "",
        "tense": "הוֹוֶה (זכר רבים)"
      },
      "present_fp": {
        "form": "מִתְלַבְּשׁוֹת",
        "pronoun": "",
        "tense": "הוֹוֶה (נקבה רבים)"
      },
      "past_1s": {
        "form": "הִתְלַבַּשְׁתִּי",
        "pronoun": "אֲנִי",
        "tense": "עָבָר"
      },
      "past_1p": {
        "form": "הִתְלַבַּשְׁנוּ",
        "pronoun": "אֲנַחְנוּ",
        "tense": "עָבָר"
      },
      "past_2ms": {
        "form": "הִתְלַבַּשְׁתָּ",
        "pronoun": "אַתָּה",
        "tense": "עָבָר"
      },
      "past_2fs": {
        "form": "הִתְלַבַּשְׁתְּ",
        "pronoun": "אַתְּ",
        "tense": "עָבָר"
      },
      "past_2mp": {
        "form": "הִתְלַבַּשְׁתֶּם",
        "pronoun": "אַתֶּם",
        "tense": "עָבָר"
      },
      "past_2fp": {
        "form": "הִתְלַבַּשְׁתֶּן",
        "pronoun": "אַתֶּן",
        "tense": "עָבָר"
      },
      "past_3ms": {
        "form": "הִתְלַבֵּשׁ",
        "pronoun": "הוּא",
        "tense": "עָבָר"
      },
      "past_3fs": {
        "form": "הִתְלַבְּשָׁה",
        "pronoun": "הִיא",
        "tense": "עָבָר"
      },
      "past_3p": {
        "form": "הִתְלַבְּשׁוּ",
        "pronoun": "הֵם / הֵן",
        "tense": "עָבָר"
      },
      "future_1s": {
        "form": "אֶתְלַבֵּשׁ",
        "pronoun": "אֲנִי",
        "tense": "עָתִיד"
      },
      "future_1p": {
        "form": "נִתְלַבֵּשׁ",
        "pronoun": "אֲנַחְנוּ",
        "tense": "עָתִיד"
      },
      "future_2ms": {
        "form": "תִּתְלַבֵּשׁ",
        "pronoun": "אַתָּה",
        "tense": "עָתִיד"
      },
      "future_2fs": {
        "form": "תִּתְלַבְּשִׁי",
        "pronoun": "אַתְּ",
        "tense": "עָתִיד"
      },
      "future_2mp": {
        "form": "תִּתְלַבְּשׁוּ",
        "pronoun": "אַתֶּם",
        "tense": "עָתִיד"
      },
      "future_2fp": {
        "form": "תִּתְלַבֵּשְׁנָה",
        "pronoun": "אַתֶּן",
        "tense": "עָתִיד"
      },
      "future_3ms": {
        "form": "יִתְלַבֵּשׁ",
        "pronoun": "הוּא",
        "tense": "עָתִיד"
      },
      "future_3fs": {
        "form": "תִּתְלַבֵּשׁ",
        "pronoun": "הִיא",
        "tense": "עָתִיד"
      },
      "future_3mp": {
        "form": "יִתְלַבְּשׁוּ",
        "pronoun": "הֵם",
        "tense": "עָתִיד"
      },
      "future_3fp": {
        "form": "תִּתְלַבֵּשְׁנָה",
        "pronoun": "הֵן",
        "tense": "עָתִיד"
      },
      "imperative_ms": {
        "form": "הִתְלַבֵּשׁ!‏",
        "pronoun": "אַתָּה",
        "tense": "צִוּוּי"
      },
      "imperative_fs": {
        "form": "הִתְלַבְּשִׁי!‏",
        "pronoun": "אַתְּ",
        "tense": "צִוּוּי"
      },
      "imperative_mp": {
        "form": "הִתְלַבְּשׁוּ!‏",
        "pronoun": "אַתֶּם",
        "tense": "צִוּוּי"
      },
      "imperative_fp": {
        "form": "הִתְלַבֵּשְׁנָה!‏",
        "pronoun": "אַתֶּן",
        "tense": "צִוּוּי"
      },
      "infinitive": {
        "form": "לְהִתְלַבֵּשׁ",
        "pronoun": "",
        "tense": "מְקוֹר"
      }
    }
  },
  "לְהַגִּיד": {
    "infinitive": "לְהַגִּיד",
    "slug": "1135-lehagid",
    "root": "נ - ג - ד",
    "binyan": "",
    "is_passive": false,
    "reference_form": "לְהַגִּיד",
    "forms": {
      "present_ms": {
        "form": "מַגִּיד",
        "pronoun": "",
        "tense": "הוֹוֶה (זכר יחיד)"
      },
      "present_fs": {
        "form": "מַגִּידָה",
        "pronoun": "",
        "tense": "הוֹוֶה (נקבה יחיד)"
      },
      "present_mp": {
        "form": "מַגִּידִים",
        "pronoun": "",
        "tense": "הוֹוֶה (זכר רבים)"
      },
      "present_fp": {
        "form": "מַגִּידוֹת",
        "pronoun": "",
        "tense": "הוֹוֶה (נקבה רבים)"
      },
      "past_1s": {
        "form": "הִגַּדְתִּי",
        "pronoun": "אֲנִי",
        "tense": "עָבָר"
      },
      "past_1p": {
        "form": "הִגַּדְנוּ",
        "pronoun": "אֲנַחְנוּ",
        "tense": "עָבָר"
      },
      "past_2ms": {
        "form": "הִגַּדְתָּ",
        "pronoun": "אַתָּה",
        "tense": "עָבָר"
      },
      "past_2fs": {
        "form": "הִגַּדְתְּ",
        "pronoun": "אַתְּ",
        "tense": "עָבָר"
      },
      "past_2mp": {
        "form": "הִגַּדְתֶּם",
        "pronoun": "אַתֶּם",
        "tense": "עָבָר"
      },
      "past_2fp": {
        "form": "הִגַּדְתֶּן",
        "pronoun": "אַתֶּן",
        "tense": "עָבָר"
      },
      "past_3ms": {
        "form": "הִגִּיד",
        "pronoun": "הוּא",
        "tense": "עָבָר"
      },
      "past_3fs": {
        "form": "הִגִּידָה",
        "pronoun": "הִיא",
        "tense": "עָבָר"
      },
      "past_3p": {
        "form": "הִגִּידוּ",
        "pronoun": "הֵם / הֵן",
        "tense": "עָבָר"
      },
      "future_1s": {
        "form": "אַגִּיד",
        "pronoun": "אֲנִי",
        "tense": "עָתִיד"
      },
      "future_1p": {
        "form": "נַגִּיד",
        "pronoun": "אֲנַחְנוּ",
        "tense": "עָתִיד"
      },
      "future_2ms": {
        "form": "תַּגִּיד",
        "pronoun": "אַתָּה",
        "tense": "עָתִיד"
      },
      "future_2fs": {
        "form": "תַּגִּידִי",
        "pronoun": "אַתְּ",
        "tense": "עָתִיד"
      },
      "future_2mp": {
        "form": "תַּגִּידוּ",
        "pronoun": "אַתֶּם",
        "tense": "עָתִיד"
      },
      "future_2fp": {
        "form": "תַּגֵּדְנָה",
        "pronoun": "אַתֶּן",
        "tense": "עָתִיד"
      },
      "future_3ms": {
        "form": "יַגִּיד",
        "pronoun": "הוּא",
        "tense": "עָתִיד"
      },
      "future_3fs": {
        "form": "תַּגִּיד",
        "pronoun": "הִיא",
        "tense": "עָתִיד"
      },
      "future_3mp": {
        "form": "יַגִּידוּ",
        "pronoun": "הֵם",
        "tense": "עָתִיד"
      },
      "future_3fp": {
        "form": "תַּגֵּדְנָה",
        "pronoun": "הֵן",
        "tense": "עָתִיד"
      },
      "imperative_ms": {
        "form": "הַגֵּד!‏",
        "pronoun": "אַתָּה",
        "tense": "צִוּוּי"
      },
      "imperative_fs": {
        "form": "הַגִּידִי!‏",
        "pronoun": "אַתְּ",
        "tense": "צִוּוּי"
      },
      "imperative_mp": {
        "form": "הַגִּידוּ!‏",
        "pronoun": "אַתֶּם",
        "tense": "צִוּוּי"
      },
      "imperative_fp": {
        "form": "הַגֵּדְנָה!‏",
        "pronoun": "אַתֶּן",
        "tense": "צִוּוּי"
      },
      "infinitive": {
        "form": "לְהַגִּיד",
        "pronoun": "",
        "tense": "מְקוֹר"
      }
    }
  },
  "לְהוּגַד": null
 }
--- a/data/examples_cache.json
+++ b/data/examples_cache.json
@ -0,0 +1 @@
 {"אב": ["לא אב לחגלה אתה", "כרחם אב על בנים"], "אבא": ["כך כך אבא יקירי", "“אבא איננו בבית"], "אביבי": ["אמרת: תם אביבי,", "אמרת: תם אביבי,"], "אביב": ["אביב כי יתחדש –", "ברחובות תל־אביב"], "אבידה": ["אבידה בדבר מועט", "ואם לרבות אבידה"], "לאבוד": ["אבל נאנחתי לאבוד", "אנו הולכים לאבוד"], "להיאבד": [], "להתאבד": ["מעמד והחליטה להתאבד", "היא נסתה פעם להתאבד"], "איבוד": ["איבוד דמי משלוח", "איבוד עצמו לדעת"], "התאבדות": ["והביאו לידי התאבדות", "הקלון, בלתי אם התאבדות"], "להאביד": ["ויאמר להאביד זכרם –", "קול שם רשעים להאביד"], "אבדה": ["ועתה אבדה תקותה", "וכל תשועתו אבדה"], "אבוד": ["— הה, הנני אבוד", "“אבוד עצמי לדעת"], "לאבד": ["אין לכם מה לאבד", "יש חשש לאבד שנה"], "אבדון": ["אבדון, אש הנעורת", "אבדון, פתחי עולם"], "אבוקדו": ["מטעים רצופים של עצי אבוקדו ומנגו", "את זרעי הפירות וגידלו מהם שתילים חדשים של אבוקדו"], "אבזם": ["רצו אל הטבח, הגישו לו הפעם חגורה עם אבזם מבריק… הביאו כוס", "רגליו היו עטופות לפפות חדשות ומתניו חגורות חגורה חדשה עם אבזם"], "לאבזר": [], "איבחון": ["לשלבים גבוהים יותר של איבחון וריפוי", "איבחון נחפז, כמוהו כהיסוס מופרז, עלול לגרור תוצאות בלתי־נעימות"]}
--- a/data/frequency_cache.json
+++ b/data/frequency_cache.json
--- a/data/pealim_dict.csv
+++ b/data/pealim_dict.csv
--- a/data/pealim_dict_for_anki.csv
+++ b/data/pealim_dict_for_anki.csv
--- a/frequency_lookup.py
+++ b/frequency_lookup.py
@ -0,0 +1,85 @@
 #!/usr/bin/env python3
 """
 Hebrew word frequency lookup from hermitdave/FrequencyWords corpus.
 Downloads he_50k.txt once; subsequent runs read from cache.
 Exposed API: get_frequency_rank(word_no_nikkud) -> int | None
 """
 import json
 import logging
 import re
 import unicodedata
 from pathlib import Path
 import requests
 # Module-level logger named after this module.
 logger = logging.getLogger(__name__)
 # Raw he_50k.txt word list in the hermitdave/FrequencyWords GitHub repository.
 FREQ_URL = (
    "https://raw.githubusercontent.com/hermitdave/FrequencyWords/"
    "master/content/2016/he/he_50k.txt"
 )
 # JSON cache of the word -> rank table, kept in data/ next to this module.
 CACHE_PATH = Path(__file__).parent / "data" / "frequency_cache.json"
 # Timeout value handed to requests.get() when downloading FREQ_URL.
 REQUEST_TIMEOUT = 30
 # Module-level cache: word_no_nikkud -> rank (1 = most common)
 _freq: dict[str, int] = {}
 def _strip_nikkud(text: str) -> str:
    """Return *text* in NFD form with every nonspacing mark (category Mn) dropped.

    Hebrew nikkud points are nonspacing marks, so the result is the unpointed
    spelling of the input.
    """
    kept = []
    for char in unicodedata.normalize("NFD", text):
        if unicodedata.category(char) == "Mn":
            continue
        kept.append(char)
    return "".join(kept)
 def load(cache_path: Path = CACHE_PATH) -> None:
    """Load frequency data from cache, downloading if not present.

    If *cache_path* exists, its JSON content becomes the module-level table.
    Otherwise the word list at FREQ_URL is downloaded; the first
    whitespace-separated token of each non-empty line is stripped of nikkud
    and ranked by order of first appearance (first word = rank 1). The table
    is then written to *cache_path* as JSON.

    Args:
        cache_path: Location of the JSON cache file.

    Raises:
        requests.HTTPError: If the download returns an error status.
    """
    global _freq
    if cache_path.exists():
        with open(cache_path, encoding="utf-8") as f:
            _freq = json.load(f)
        logger.info(f"Frequency cache loaded: {len(_freq)} entries")
        return
    logger.info("Downloading FrequencyWords he_50k.txt …")
    resp = requests.get(FREQ_URL, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    # Build the table in a fresh dict so entries left over from an earlier
    # load() cannot leak into the new ranking or the cache file.
    ranks: dict[str, int] = {}
    for line in resp.text.splitlines():
        parts = line.split()
        if not parts:
            continue
        word = _strip_nikkud(parts[0])
        # Keep the first (best) rank when stripping makes two words collide.
        if word and word not in ranks:
            ranks[word] = len(ranks) + 1
    _freq = ranks
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(_freq, f, ensure_ascii=False)
    logger.info(f"Frequency cache saved: {len(_freq)} entries → {cache_path}")
 def get_frequency_rank(word_no_nikkud: str) -> int | None:
    """
    Look up a word's frequency rank (1 = most common).

    The frequency table is loaded on demand while it is still empty. The
    input is trimmed and stripped of nikkud before the lookup. Returns None
    for words that are not in the table.
    """
    if len(_freq) == 0:
        load()
    key = _strip_nikkud(word_no_nikkud.strip())
    if key in _freq:
        return _freq[key]
    return None
 if __name__ == "__main__":
    # Smoke test: load the table and print the rank of a few sample words.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    load()
    for sample in ("שלום", "ספר", "בית", "מים", "כלב"):
        print(f"{sample}: rank {get_frequency_rank(sample)}")
--- a/requirements.txt
+++ b/requirements.txt
@ -1,3 +1,6 @@
 pandas>=1.3.0
 requests>=2.26.0
 numpy>=1.21.0
 genanki>=0.8.0
 beautifulsoup4>=4.11.0
 lxml>=4.9.0
--- a/run.py
+++ b/run.py
@ -1,48 +1,313 @@
 #!/usr/bin/env python3
 """
-Main entry point: orchestrate dictionary and conjugation extraction.
+Pealim Anki Deck Builder — full pipeline orchestrator.
 Usage:
  python run.py [options]
 Options:
  --skip-scrape        Use existing data/pealim_dict.csv (no pealim.com dict scraping)
  --skip-audio         Skip audio .mp3 downloads
  --skip-examples      Skip Ben Yehuda example fetching
  --skip-conjugations  Skip verb conjugation extraction
  --test N             Process only the first N dictionary words (for quick testing)
 """
 import argparse
 import json
 import logging
 import sys
 import time
 from pathlib import Path
 # Add current directory to path
 sys.path.insert(0, str(Path(__file__).parent))
 import pealim_extract
 import conjugation_extract
 logging.basicConfig(
    level=logging.INFO,
-    format='%(asctime)s - %(levelname)s - %(message)s'
+    format="%(asctime)s %(levelname)s %(message)s",
 )
 logger = logging.getLogger(__name__)
 DATA_DIR = Path(__file__).parent / "data"
 OUTPUT_DIR = Path(__file__).parent / "output"
 def parse_args():
    """Parse the command line and return the resulting argparse namespace.
    Four boolean ``--skip-*`` switches (default False) and an optional
    integer ``--test N`` (default None) are recognised.
    """
    parser = argparse.ArgumentParser(description="Pealim Anki deck builder")
    switches = (
        ("--skip-scrape", "Skip dict scraping; use cached CSV"),
        ("--skip-audio", "Skip audio downloads"),
        ("--skip-examples", "Skip Ben Yehuda example lookup"),
        ("--skip-conjugations", "Skip verb conjugation extraction"),
    )
    for flag, help_text in switches:
        parser.add_argument(flag, action="store_true", help=help_text)
    parser.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
    return parser.parse_args()
 def step_scrape(args):
    """Step 1 — scrape the dictionary, or verify that a cached CSV exists.
    With ``args.skip_scrape`` set, nothing is scraped: the function only
    checks that ``data/pealim_dict.csv`` is present and exits the process
    with status 1 if it is not.
    Otherwise the dictionary is scraped via ``pealim_extract`` and written
    twice: the raw frame to ``pealim_dict.csv`` and the Anki-formatted
    frame (``;``-separated) to ``pealim_dict_for_anki.csv``.
    """
    dict_csv = DATA_DIR / "pealim_dict.csv"
    anki_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    if args.skip_scrape:
        if not dict_csv.exists():
            logger.error(f"[1] --skip-scrape set but {dict_csv} not found. Aborting.")
            sys.exit(1)
        logger.info(f"[1] Using existing {dict_csv}")
        return
    logger.info("[1] Scraping dictionary from pealim.com …")
    import pealim_extract
    # DataFrame.to_csv does not create missing parent directories, so make
    # sure data/ exists before writing (e.g. on a fresh checkout).
    DATA_DIR.mkdir(parents=True, exist_ok=True)
    df = pealim_extract.extract_from_website()
    df.to_csv(dict_csv, index=True)
    logger.info(f"    Saved {len(df)} words → {dict_csv}")
    df = pealim_extract.modify_for_anki(df)
    df.to_csv(anki_csv, sep=";", index=True)
    logger.info(f"    Saved Anki CSV → {anki_csv}")
 def step_frequency():
    """Step 2 — make sure the word-frequency table is loaded and return it.
    Delegates to ``frequency_lookup.load()`` and hands back that module's
    ``_freq`` mapping as it is after the load.
    """
    logger.info("[2] Loading word frequency data …")
    import frequency_lookup as freq_module
    freq_module.load()
    # Read the attribute only after load(): load() may rebind the module global.
    table = freq_module._freq
    return table
 def step_examples(args, freq_cache: dict):
    """Step 3 — load or build the Ben Yehuda example-sentence cache.
    With ``args.skip_examples`` set, returns the contents of
    ``data/examples_cache.json`` if that file exists, else an empty dict.
    Otherwise loads the Ben Yehuda index, looks up examples for every word
    in the dictionary CSV (first ``args.test`` rows when given), saves the
    cache and returns ``benyehuda._examples_cache``.
    A failure during the pre-fetch loop is logged as a warning and does not
    abort the pipeline. ``freq_cache`` is accepted for call-site symmetry
    and is not used here.
    """
    if args.skip_examples:
        logger.info("[3] Skipping examples (--skip-examples)")
        examples_path = DATA_DIR / "examples_cache.json"
        if examples_path.exists():
            # The cache holds Hebrew text; do not depend on the locale's
            # default encoding. NOTE(review): presumably written as UTF-8 by
            # benyehuda.save_examples_cache — confirm.
            with open(examples_path, encoding="utf-8") as f:
                return json.load(f)
        return {}
    logger.info("[3] Loading Ben Yehuda example index …")
    import unicodedata
    import benyehuda
    benyehuda.load()
    def strip(t):
        """Drop combining marks (nikkud) from ``t``."""
        return "".join(c for c in unicodedata.normalize("NFD", str(t))
                       if unicodedata.category(c) != "Mn")
    # Pre-fetch examples for all words in the dict (uses cache)
    dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    if not dict_csv.exists():
        dict_csv = DATA_DIR / "pealim_dict.csv"
    try:
        import pandas as pd
        try:
            # The Anki CSV is ';'-separated; a ','-separated file parsed this
            # way collapses to too few columns, which triggers the fallback.
            df = pd.read_csv(dict_csv, sep=";", index_col=0)
            if df.shape[1] < 3:
                raise ValueError
        except Exception:
            df = pd.read_csv(dict_csv, index_col=0)
        if args.test:
            df = df.head(args.test)
        logger.info(f"    Pre-fetching examples for {len(df)} words …")
        for _, row in df.iterrows():
            raw = row.get("Word Without Nikkud", "")
            # Empty CSV cells come back as NaN; str(NaN) would be looked up
            # as the literal word "nan".
            if pd.isna(raw):
                continue
            word_plain = strip(str(raw).strip())
            if word_plain:
                benyehuda.get_examples(word_plain)
    except Exception as e:
        logger.warning(f"    Could not pre-fetch all examples: {e}")
    benyehuda.save_examples_cache()
    return benyehuda._examples_cache
 def step_audio(args):
    """Step 4 — download one .mp3 per dictionary word into ``data/audio``.
    Does nothing when ``args.skip_audio`` is set. Otherwise seeds
    ``audio_extract`` with the URL cache from ``data/audio_cache.json`` (if
    present), walks the dictionary CSV (first ``args.test`` rows when
    given) and downloads every word whose target file does not exist yet.
    File names are the word's Hebrew letters only (nikkud and any other
    characters removed). Individual download failures are logged at debug
    level; any other failure is logged as a warning and the step returns
    without raising.
    """
    if args.skip_audio:
        logger.info("[4] Skipping audio (--skip-audio)")
        return
    logger.info("[4] Downloading audio files …")
    # Load audio URL cache (from old workspace if available)
    audio_cache_path = DATA_DIR / "audio_cache.json"
    audio_url_cache: dict = {}
    if audio_cache_path.exists():
        with open(audio_cache_path, encoding="utf-8") as f:
            audio_url_cache = json.load(f)
    import re
    import unicodedata
    import audio_extract as ae
    ae._audio_cache = audio_url_cache
    dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    if not dict_csv.exists():
        dict_csv = DATA_DIR / "pealim_dict.csv"
    import pandas as pd
    import requests
    # No AUDIO_DIR constant is defined in this module; use the same
    # data/audio location that print_summary() counts files in.
    # NOTE(review): confirm apkg_builder reads media from this directory.
    audio_dir = DATA_DIR / "audio"
    non_hebrew = re.compile(r"[^\u05d0-\u05ea]")
    def strip_nik(t):
        """Drop combining marks (nikkud) from ``t``."""
        return "".join(c for c in unicodedata.normalize("NFD", t)
                       if unicodedata.category(c) != "Mn")
    def cell_text(row, column):
        """Return the trimmed cell text, or "" for a missing/NaN cell."""
        value = row.get(column, "")
        return "" if pd.isna(value) else str(value).strip()
    try:
        try:
            df = pd.read_csv(dict_csv, sep=";", index_col=0)
            if df.shape[1] < 3:
                raise ValueError
        except Exception:
            df = pd.read_csv(dict_csv, index_col=0)
        if args.test:
            df = df.head(args.test)
        audio_dir.mkdir(parents=True, exist_ok=True)
        downloaded = 0
        skipped = 0
        for _, row in df.iterrows():
            word = cell_text(row, "Word")
            word_plain = cell_text(row, "Word Without Nikkud")
            if not word:
                continue
            safe_name = non_hebrew.sub("", strip_nik(word_plain or word))
            if not safe_name:
                continue
            mp3_path = audio_dir / f"{safe_name}.mp3"
            if mp3_path.exists():
                skipped += 1
                continue
            # Get audio URL from cache or fetch
            audio_url = ae.extract_audio_url(word)
            if audio_url:
                try:
                    resp = requests.get(audio_url, timeout=10)
                    resp.raise_for_status()
                    mp3_path.write_bytes(resp.content)
                    downloaded += 1
                    # Small pause between downloads.
                    time.sleep(0.3)
                except Exception as e:
                    logger.debug(f"    Audio download failed for {word}: {e}")
        ae.save_audio_cache(str(audio_cache_path))
        logger.info(f"    Audio: {downloaded} downloaded, {skipped} already cached")
    except Exception as e:
        logger.warning(f"    Audio step failed: {e}")
 def step_build_vocab(args, examples_cache: dict, freq_cache: dict):
    """Step 5 — assemble the vocabulary deck and write its .apkg file.
    Uses ``pealim_dict_for_anki.csv`` when present, else ``pealim_dict.csv``.
    ``args.test`` is forwarded as the row limit. Returns the built deck.
    """
    logger.info("[5] Building vocabulary deck …")
    import apkg_builder
    anki_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    source_csv = anki_csv if anki_csv.exists() else DATA_DIR / "pealim_dict.csv"
    build_options = {
        "examples_cache": examples_cache,
        "freq_cache": freq_cache,
        "limit": args.test,
    }
    deck, media = apkg_builder.build_vocab_deck(source_csv, **build_options)
    apkg_builder.write_vocab_apkg(deck, media)
    logger.info(f"    Vocabulary .apkg → {apkg_builder.VOCAB_APKG}")
    return deck
 def step_conjugations(args):
    """Step 6 — extract verb conjugations and write the conjugation .apkg.
    Returns the conjugation data produced by ``conjugation_extract.main``,
    or None when the step is skipped (``--skip-conjugations`` given, or
    ``verbs_input.txt`` is missing next to this script).
    """
    if args.skip_conjugations:
        logger.info("[6] Skipping conjugations (--skip-conjugations)")
        return None
    verb_list_path = Path(__file__).parent / "verbs_input.txt"
    if not verb_list_path.exists():
        logger.info("[6] verbs_input.txt not found — skipping conjugation deck")
        return None
    logger.info("[6] Extracting verb conjugations …")
    import conjugation_extract
    extracted = conjugation_extract.main(verb_list_path)
    import apkg_builder
    apkg_builder.write_conj_apkg(apkg_builder.build_conj_deck(extracted))
    logger.info(f"    Conjugation .apkg → {apkg_builder.CONJ_APKG}")
    return extracted
 def print_summary(args, examples_cache, freq_cache, conjugations):
    """Log an end-of-run report.
    Reports dictionary size, frequency and example cache sizes, example
    coverage, number of audio files, and the size of each generated .apkg.
    Lines whose underlying file or directory is missing are omitted.
    ``args`` is accepted but not used.
    """
    rule = "=" * 60
    logger.info("")
    logger.info(rule)
    logger.info("SUMMARY")
    logger.info(rule)
    anki_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    dict_csv = anki_csv if anki_csv.exists() else DATA_DIR / "pealim_dict.csv"
    if dict_csv.exists():
        import pandas as pd
        try:
            # Try the ';'-separated Anki layout first, then plain CSV.
            df = pd.read_csv(dict_csv, sep=";", index_col=0)
            if df.shape[1] < 3:
                raise ValueError
        except Exception:
            df = pd.read_csv(dict_csv, index_col=0)
        logger.info(f"  Dictionary words: {len(df)}")
    logger.info(f"  Frequency entries: {len(freq_cache)}")
    logger.info(f"  Example cache entries: {len(examples_cache)}")
    covered = len([hits for hits in examples_cache.values() if hits])
    if examples_cache:
        total = len(examples_cache)
        percent = 100 * covered // total
        logger.info(f"  Example coverage: {covered}/{total} ({percent}%)")
    audio_dir = DATA_DIR / "audio"
    if audio_dir.exists():
        mp3_count = len(list(audio_dir.glob("*.mp3")))
        logger.info(f"  Audio files: {mp3_count}")
    vocab_apkg = OUTPUT_DIR / "pealim_vocabulary.apkg"
    conj_apkg = OUTPUT_DIR / "pealim_conjugations.apkg"
    if vocab_apkg.exists():
        vocab_mb = vocab_apkg.stat().st_size / 1e6
        logger.info(f"  Vocabulary .apkg: {vocab_mb:.1f} MB → {vocab_apkg}")
    if conj_apkg.exists():
        conj_mb = conj_apkg.stat().st_size / 1e6
        logger.info(f"  Conjugation .apkg: {conj_mb:.1f} MB → {conj_apkg}")
        if conjugations:
            # Count only verbs that actually produced conjugation data.
            verb_count = len([forms for forms in conjugations.values() if forms])
            logger.info(f"  Verbs in conjugation deck: {verb_count}")
    logger.info(rule)
    logger.info("✅ DONE")
 def main():
    # Entry point. In the added (+) lines: parse CLI flags, log a banner,
    # run steps 1-6 in order (scrape, frequency, examples, audio, vocabulary
    # deck, conjugations), then log the summary.
-    """Run all extraction tasks."""
+    args = parse_args()
    logger.info("=" * 60)
-    logger.info("PEALIM EXTRACTION SUITE")
+    logger.info("PEALIM ANKI DECK BUILDER")
    if args.test:
        logger.info(f"  TEST MODE: {args.test} words")
    logger.info("=" * 60)
-    
+
-    try:
+    step_scrape(args)
-        # Extract dictionary
+    freq_cache     = step_frequency()
-        logger.info("\n[1/2] Extracting dictionary...")
+    examples_cache = step_examples(args, freq_cache)
-        pealim_extract.main()
+    step_audio(args)
-        
+    step_build_vocab(args, examples_cache, freq_cache)
-        # Extract conjugations
+    conjugations = step_conjugations(args)
-        logger.info("\n[2/2] Extracting conjugations...")
+
-        conjugation_extract.main()
+    print_summary(args, examples_cache, freq_cache, conjugations or {})
        # NOTE(review): the lines below carry no +/- marker, yet the `try:`
        # they belong to is a removed (-) line — presumably these are leftovers
        # of the old main(); confirm they are absent from the new file.
        logger.info("\n" + "=" * 60)
        logger.info("✅ ALL TASKS COMPLETE")
        logger.info("=" * 60)
    except Exception as e:
        logger.error(f"\n❌ EXTRACTION FAILED: {e}")
        sys.exit(1)
-if __name__ == '__main__':
+if __name__ == "__main__":
    main()
--- a/verbs_input.txt
+++ b/verbs_input.txt
@ -0,0 +1,10 @@
 # One Hebrew infinitive per line.
 # Lines starting with # are ignored.
 # Initial test set — one verb per binyan:
 לִשְׁמוֹר
 לְהִשָּׁמֵר
 לְדַבֵּר
 לְדֻבַּר
 לְהִתְלַבֵּשׁ
 לְהַגִּיד
 לְהוּגַד
		`@ -0,0 +1 @@`
							{"אב": ["לא אב לחגלה אתה", "כרחם אב על בנים"], "אבא": ["כך כך אבא יקירי", "“אבא איננו בבית"], "אביבי": ["אמרת: תם אביבי,", "אמרת: תם אביבי,"], "אביב": ["אביב כי יתחדש –", "ברחובות תל־אביב"], "אבידה": ["אבידה בדבר מועט", "ואם לרבות אבידה"], "לאבוד": ["אבל נאנחתי לאבוד", "אנו הולכים לאבוד"], "להיאבד": [], "להתאבד": ["מעמד והחליטה להתאבד", "היא נסתה פעם להתאבד"], "איבוד": ["איבוד דמי משלוח", "איבוד עצמו לדעת"], "התאבדות": ["והביאו לידי התאבדות", "הקלון, בלתי אם התאבדות"], "להאביד": ["ויאמר להאביד זכרם –", "קול שם רשעים להאביד"], "אבדה": ["ועתה אבדה תקותה", "וכל תשועתו אבדה"], "אבוד": ["— הה, הנני אבוד", "“אבוד עצמי לדעת"], "לאבד": ["אין לכם מה לאבד", "יש חשש לאבד שנה"], "אבדון": ["אבדון, אש הנעורת", "אבדון, פתחי עולם"], "אבוקדו": ["מטעים רצופים של עצי אבוקדו ומנגו", "את זרעי הפירות וגידלו מהם שתילים חדשים של אבוקדו"], "אבזם": ["רצו אל הטבח, הגישו לו הפעם חגורה עם אבזם מבריק… הביאו כוס", "רגליו היו עטופות לפפות חדשות ומתניו חגורות חגורה חדשה עם אבזם"], "לאבזר": [], "איבחון": ["לשלבים גבוהים יותר של איבחון וריפוי", "איבחון נחפז, כמוהו כהיסוס מופרז, עלול לגרור תוצאות בלתי־נעימות"]}