Sprint 7: emoji/prep extraction, conjugation reduction, project rename

- Item 1/2: Extract emoji and Hebrew parentheticals (prepositions) from Meaning field; display emoji with 3.5em font, prep inline after Hebrew word. Add Emoji and Prep fields to Hebrew Flash Cards model.
- Item 3: Seeded RNG per verb reduces conjugation cards by ~630 (4 present forms → 1 pronoun each; past_3p → 1 gender). 1st-person forms gain gender label (זכר/נקבה). Total: 1,834 conj cards (was ~2,464).
- Item 4: hebrew_extract.py uses BeautifulSoup to capture data-audio URLs from pealim.com list pages during scraping. step_audio() reads audio_url column from CSV (no longer needs audio_extract.py).
- Item 5: Rename to 'Hebrew Flash Cards'. New filenames: hebrew_dict.csv, hebrew_extract.py, hebrew_vocabulary.apkg, hebrew_conjugations.apkg. Deck/model names updated throughout. Forgejo repo rename pending (sochen lacks admin rights — Nevo must do via UI).
- Fix: Deduplicate entries with same Hebrew word before adding notes (eliminates GUID collisions from duplicate source CSV rows).
- Bump RELEASE_TAG to v0.11.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-05 05:49:51 +00:00 · 2026-03-05 05:49:51 +00:00 · 64a1b18951
commit 64a1b18951
parent f8e4873349
9 changed files with 21580 additions and 60 deletions
--- a/README.md
+++ b/README.md
@ -1,4 +1,4 @@
-# Pealim — Hebrew Vocabulary & Verb Flashcards for Anki
+# Hebrew Flash Cards — Hebrew Vocabulary & Verb Flashcards for Anki

 ![Flashcard screenshot](flashcard.png)

@ -21,7 +21,7 @@ All card data comes from open or academic sources:

 ## Just give me the flashcards

-1. Download the `.apkg` files from [Releases](https://git.nevo.engineer/nevo/pealim/releases)
+1. Download the `.apkg` files from [Releases](https://git.nevo.engineer/nevo/hebrew_flash_cards/releases)
 2. Double-click to import into [Anki](https://apps.ankiweb.net/) (free, cross-platform)
 3. Start studying

@ -136,7 +136,7 @@ python run.py --skip-scrape --refresh-examples
 ```
 python run.py [options]

-  --skip-scrape        Use cached data/pealim_dict.csv (no pealim.com scraping)
+  --skip-scrape        Use cached data/hebrew_dict.csv (no pealim.com scraping)
  --skip-audio         Skip audio .mp3 downloads
  --skip-examples      Skip Ben Yehuda example fetching
  --only {vocab,conjugations}  Run only one deck (skips all unrelated steps)
@ -150,20 +150,20 @@ python run.py [options]

 | File | Description |
 |------|-------------|
-| `data/pealim_dict.csv` | Raw dictionary |
-| `data/pealim_dict_for_anki.csv` | Enriched Anki CSV |
+| `data/hebrew_dict.csv` | Raw dictionary |
+| `data/hebrew_dict_for_anki.csv` | Enriched Anki CSV |
 | `data/conjugations.json` | Verb conjugation data |
 | `data/audio/` | Vocabulary audio (.mp3) |
 | `data/audio_conj/` | Conjugation audio (.mp3) |
 | `data/fonts/` | Heebo font files (bundled in .apkg) |
 | `data/images/` | Noun images from Wikipedia/Commons |
 | `data/image_cache.json` | Image fetch cache |
-| `output/pealim_vocabulary.apkg` | Vocabulary Anki deck |
-| `output/pealim_conjugations.apkg` | Conjugation Anki deck |
+| `output/hebrew_vocabulary.apkg` | Vocabulary Anki deck |
+| `output/hebrew_conjugations.apkg` | Conjugation Anki deck |

 ### Pipeline overview

-1. `pealim_extract.py` — scrapes pealim.com dictionary
+1. `hebrew_extract.py` — scrapes pealim.com dictionary
 2. `frequency_lookup.py` — downloads/loads Hebrew frequency data
 3. `benyehuda.py` — builds sentence index from Ben-Yehuda corpus
 4. `extract_verb_list.py` — extracts verb list from Coffin & Bolozky PDF
--- a/apkg_builder.py
+++ b/apkg_builder.py
@ -9,6 +9,7 @@ in Anki rather than creating a duplicate.

 import json
 import logging
+import random
 import re
 import unicodedata
 from pathlib import Path
@ -27,15 +28,19 @@ CONJ_MODEL_ID   = 1_234_567_893

 # Release version tag added to all notes so users can identify which release
 # their cards come from (visible in Anki's Browse view and card info).
-RELEASE_TAG = "v0.10"
+RELEASE_TAG = "v0.11"
+
+# Regex for extracting emoji and Hebrew prepositions from meaning strings
+EMOJI_RE   = re.compile(r'[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+')
+HBPAREN_RE = re.compile(r'\(([\u05b0-\u05ea\u05f0-\u05f4]+)\)')

 DATA_DIR       = Path(__file__).parent / "data"
 AUDIO_DIR      = DATA_DIR / "audio"
 AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
 OUTPUT_DIR     = Path(__file__).parent / "output"

-VOCAB_APKG  = OUTPUT_DIR / "pealim_vocabulary.apkg"
-CONJ_APKG   = OUTPUT_DIR / "pealim_conjugations.apkg"
+VOCAB_APKG  = OUTPUT_DIR / "hebrew_vocabulary.apkg"
+CONJ_APKG   = OUTPUT_DIR / "hebrew_conjugations.apkg"

 # ──────────────────────────────────────────────────────────────────────────────
 # Binyan → Hebrew label mapping (for conjugation card display)
@ -163,6 +168,11 @@ CARD_CSS = """
  margin: 2px 0;
  font-size: 15px;
 }
+.emoji-img {
+  font-size: 3.5em;
+  text-align: center;
+  margin: 0.3em 0;
+}
@media (prefers-color-scheme: dark) {
  .card        { color: #e8e8e8; background: #1c1c1e; }
  .hebrew      { color: #f0f0f0; }
@ -182,7 +192,7 @@ CARD_CSS = """
 # ──────────────────────────────────────────────────────────────────────────────

 VOCAB_FRONT_HEB = """
-<div class="hebrew">{{Word}}</div>
+<div class="hebrew">{{Word}}{{#Prep}} <span class="hebrew-sm">{{Prep}}</span>{{/Prep}}</div>
 {{#Audio}}<div>{{Audio}}</div>{{/Audio}}
 """

@ -197,6 +207,7 @@ VOCAB_BACK_HEB = """
 <div class="root-info">{{SharedRoots}}</div>
 {{/SharedRoots}}
 {{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;"></div>{{/Image}}
+{{#Emoji}}<div class="emoji-img">{{Emoji}}</div>{{/Emoji}}
 {{#Example}}
 <div class="example">{{Example}}</div>
 {{/Example}}
@ -210,12 +221,13 @@ VOCAB_FRONT_ENG = """
 VOCAB_BACK_ENG = """
 {{FrontSide}}
 <div class="divider"></div>
-<div class="hebrew">{{Word}}</div>
+<div class="hebrew">{{Word}}{{#Prep}} <span class="hebrew-sm">{{Prep}}</span>{{/Prep}}</div>
 {{#Audio}}<div>{{Audio}}</div>{{/Audio}}
 {{#WordNoNikkud}}<div class="sec-label">ללא ניקוד: {{WordNoNikkud}}</div>{{/WordNoNikkud}}
 {{#Root}}<div class="sec-label">שורש: {{Root}}</div>{{/Root}}
 {{#PoS}}<div class="sec-label">חלק דיבור: {{PoS}}</div>{{/PoS}}
 {{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;"></div>{{/Image}}
+{{#Emoji}}<div class="emoji-img">{{Emoji}}</div>{{/Emoji}}
 {{#Example}}
 <div class="example">{{Example}}</div>
 {{/Example}}
@ -223,7 +235,7 @@ VOCAB_BACK_ENG = """

 VOCAB_MODEL = genanki.Model(
    VOCAB_MODEL_ID,
-    "Pealim Hebrew",
+    "Hebrew Flash Cards",
    fields=[
        {"name": "Word"},
        {"name": "Root"},
@ -236,6 +248,8 @@ VOCAB_MODEL = genanki.Model(
        {"name": "Example"},
        {"name": "Frequency"},
        {"name": "Image"},
+        {"name": "Emoji"},
+        {"name": "Prep"},
    ],
    templates=[
        {
@ -444,8 +458,9 @@ def build_vocab_deck(
    df["_freq_rank"] = df.apply(freq_sort_key, axis=1)
    df = df.sort_values("_freq_rank")

-    deck = genanki.Deck(VOCAB_DECK_ID, "Pealim Hebrew Vocabulary")
+    deck = genanki.Deck(VOCAB_DECK_ID, "Hebrew Vocabulary")
    media_files: list[Path] = []
+    seen_words: set[str] = set()

    for _, row in df.iterrows():
        word         = str(row.get("Word", "")).strip()
@ -468,6 +483,21 @@ def build_vocab_deck(
        if not word or not meaning:
            continue

+        # Skip exact duplicates (same Hebrew word with nikkud)
+        if word in seen_words:
+            logger.debug(f"  Skipping duplicate word: {word}")
+            continue
+        seen_words.add(word)
+
+        # Extract emoji from meaning
+        emoji_str   = ''.join(EMOJI_RE.findall(meaning))
+        meaning_clean = EMOJI_RE.sub('', meaning).strip()
+
+        # Extract Hebrew parentheticals (prepositions) from meaning
+        preps = HBPAREN_RE.findall(meaning_clean)
+        prep_str = ' '.join(f'({p})' for p in preps)
+        meaning_clean = HBPAREN_RE.sub('', meaning_clean).strip().strip(',').strip()
+
        # Translate PoS to Hebrew
        pos_heb = _translate_pos(pos_raw) if pos_raw else ""

@ -523,7 +553,7 @@ def build_vocab_deck(
                word,
                root,
                pos_heb,
-                meaning,
+                meaning_clean,
                word_no_nik,
                related_html or shared_roots,
                tags_str,
@ -531,11 +561,21 @@ def build_vocab_deck(
                example_html,
                freq_display,
                image_tag,
+                emoji_str,
+                prep_str,
            ],
            tags=(tags_str.split() if tags_str else []) + [RELEASE_TAG],
        )
        deck.add_note(note)

+    # Diagnostic: count words with emoji/prep extracted
+    emoji_count = sum(1 for n in deck.notes if n.fields[11])
+    prep_count  = sum(1 for n in deck.notes if n.fields[12])
+    if emoji_count:
+        logger.info(f"  Emoji extracted: {emoji_count} words")
+    if prep_count:
+        logger.info(f"  Hebrew prepositions extracted: {prep_count} words")
+
    # Diagnostic: count words without PoS coverage in shared_roots
    other_count = 0
    for _, row in df.iterrows():
@ -557,7 +597,7 @@ def build_conj_deck(
    audio_dir: Path = AUDIO_CONJ_DIR,
 ) -> tuple[genanki.Deck, list[Path]]:
    """Build the conjugation drill deck from conjugations.json data."""
-    deck = genanki.Deck(CONJ_DECK_ID, "Pealim Hebrew Conjugations")
+    deck = genanki.Deck(CONJ_DECK_ID, "Hebrew Conjugations")
    media_files: list[Path] = []
    note_count = 0

@ -601,6 +641,9 @@ def build_conj_deck(

        alternate_forms = data.get("alternate_forms", {})

+        # Seeded RNG per verb — deterministic pronoun/gender choices
+        verb_rng = random.Random(hash(infinitive) & 0xFFFFFFFF)
+
        for form_key, form_data in forms.items():
            primary_form = form_data.get("form", "")
            alt_form = alternate_forms.get(form_key, "")
@ -620,16 +663,16 @@ def build_conj_deck(
                    if mp3_path not in media_files:
                        media_files.append(mp3_path)

-            # Present tense expansion: 4 form keys → 3 cards each = 12 cards
+            # Present tense expansion: 4 form keys → 1 card each (seeded RNG)
            if form_key in PRESENT_EXPANSION:
-                for pronoun, tense_label in PRESENT_EXPANSION[form_key]:
-                    add_note(pronoun, tense_label, conj_form, audio_tag)
+                chosen = verb_rng.choice(PRESENT_EXPANSION[form_key])
+                add_note(chosen[0], chosen[1], conj_form, audio_tag)
                continue

-            # Past 3rd plural: same form for m/f → two separate pronoun cards
+            # Past 3rd plural: same form for m/f → 1 card (seeded RNG)
            if form_key == "past_3p":
-                for pronoun, tense_label in PAST_3P_EXPANSION:
-                    add_note(pronoun, tense_label, conj_form, audio_tag)
+                chosen = verb_rng.choice(PAST_3P_EXPANSION)
+                add_note(chosen[0], chosen[1], conj_form, audio_tag)
                continue

            # 2fp/3fp future and imperative: show modern (mp) form + classical (fp) in parens
@ -649,6 +692,12 @@ def build_conj_deck(
            # Standard card
            pronoun = form_data.get("pronoun", "")
            tense   = form_data.get("tense", "")
+
+            # 1st-person forms get a randomly assigned gender label (deterministic per verb)
+            if form_key in {"past_1s", "past_1p", "future_1s", "future_1p"}:
+                gender = verb_rng.choice(["זכר", "נקבה"])
+                pronoun = f"{pronoun} ({gender})"
+
            add_note(pronoun, tense, conj_form, audio_tag)


@ -707,7 +756,11 @@ def write_conj_apkg(
 if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")

-    csv_path = DATA_DIR / "pealim_dict_for_anki.csv"
+    csv_path = DATA_DIR / "hebrew_dict_for_anki.csv"
+    if not csv_path.exists():
+        csv_path = DATA_DIR / "hebrew_dict.csv"
+    if not csv_path.exists():
+        csv_path = DATA_DIR / "pealim_dict_for_anki.csv"
    if not csv_path.exists():
        csv_path = DATA_DIR / "pealim_dict.csv"

--- a/conjugation_extract.py
+++ b/conjugation_extract.py
@ -33,7 +33,13 @@ REQUEST_DELAY = 1.5
 REQUEST_TIMEOUT = 15
 VERBS_INPUT = Path(__file__).parent / "verbs_input.txt"
 CONJUGATIONS_PATH = Path(__file__).parent / "data" / "conjugations.json"
-DICT_CSV = Path(__file__).parent / "data" / "pealim_dict_for_anki.csv"
+DICT_CSV = next(
+    (p for p in [
+        Path(__file__).parent / "data" / "hebrew_dict_for_anki.csv",
+        Path(__file__).parent / "data" / "pealim_dict_for_anki.csv",
+    ] if p.exists()),
+    Path(__file__).parent / "data" / "hebrew_dict_for_anki.csv",
+)

 # Pronoun labels (for card front display)
 PRONOUN_LABELS = {
--- a/data/hebrew_dict.csv
+++ b/data/hebrew_dict.csv
--- a/data/hebrew_dict_for_anki.csv
+++ b/data/hebrew_dict_for_anki.csv
--- a/hebrew_extract.py
+++ b/hebrew_extract.py
@ -0,0 +1,219 @@
+#!/usr/bin/env python3
+"""
+Extract Hebrew vocabulary from pealim.com dictionary.
+Scrapes word entries, roots, parts of speech, and audio URLs for Anki flashcards.
+"""
+
+import requests
+import pandas as pd
+from bs4 import BeautifulSoup
+import logging
+import time
+from typing import Optional
+
+# Configure logging: INFO level with a timestamped format.
+# NOTE(review): this statement sits at module top level, so it also executes
+# when the module is imported by another script, not only when run directly.
+logging.basicConfig(
+    level=logging.INFO,
+    format='%(asctime)s - %(levelname)s - %(message)s'
+)
+logger = logging.getLogger(__name__)
+
+# Single shared session used by every request in this module (connection
+# pooling); all requests carry the custom User-Agent set below.
+session = requests.Session()
+session.headers.update({
+    'User-Agent': 'Mozilla/5.0 (compatible; pealim-scraper/1.0)'
+})
+
+# Base URL of the dictionary listing; individual pages are requested as
+# f"{PEALIM_DICT_URL}?page={n}".
+PEALIM_DICT_URL = "https://www.pealim.com/dict/"
+REQUEST_DELAY = 1.5  # seconds between requests (respectful scraping)
+REQUEST_TIMEOUT = 10  # seconds; passed as timeout= to every session.get call
+
+
+def get_total_pages() -> int:
+    """Return the number of dictionary pages to scrape.
+
+    The count is NOT read from the site: it is a fixed estimate. The request
+    to the first dictionary page only acts as a reachability probe; if it
+    fails, the error is logged and the same default is returned, so callers
+    always receive an int.
+
+    Returns:
+        The default page count (608).
+    """
+    # Hardcoded — pealim.com has ~608 pages at ~15 words/page
+    default_pages = 608
+    try:
+        logger.info("Fetching total page count...")
+        cookies = {'translit': 'none', 'hebstyle': 'mo'}
+        response = session.get(PEALIM_DICT_URL, cookies=cookies, timeout=REQUEST_TIMEOUT)
+        response.raise_for_status()
+    except requests.RequestException as e:
+        # Only network/HTTP failures are expected here; anything else is a
+        # programming error and should surface instead of being swallowed.
+        logger.error(f"Error fetching page count: {e}. Using default ({default_pages}).")
+    return default_pages
+
+
+def _parse_page_with_audio(html_bytes: bytes) -> list[dict]:
+    """
+    Turn one dictionary listing page into a list of word records.
+
+    Every <tr> under a <table> with at least four <td> cells is read as
+    cell 0 = word (plus an optional element carrying a data-audio attribute),
+    cell 1 = root, cell 2 = part of speech, cell 3 = meaning. Rows whose word
+    text comes out empty are dropped.
+
+    Returns list of dicts with keys: Word, Root, Part of Speech, Meaning, audio_url.
+    """
+    def cell_text(node) -> str:
+        return node.get_text(strip=True)
+
+    records = []
+    for table_row in BeautifulSoup(html_bytes, 'html.parser').select('table tr'):
+        cells = table_row.find_all('td')
+        if len(cells) < 4:
+            # Header rows / layout rows: not a dictionary entry
+            continue
+        word_cell, root_cell, pos_cell, meaning_cell = cells[:4]
+
+        # Prefer the text of span.menukad; otherwise use the whole first cell
+        nikkud_span = word_cell.find('span', class_='menukad')
+        word = cell_text(nikkud_span if nikkud_span else word_cell)
+        if not word:
+            continue
+
+        # First element in the word cell that has a data-audio attribute, if any
+        audio_holder = word_cell.find(attrs={'data-audio': True})
+        records.append({
+            'Word': word,
+            'Root': cell_text(root_cell) or '-',
+            'Part of Speech': cell_text(pos_cell),
+            'Meaning': cell_text(meaning_cell),
+            'audio_url': audio_holder['data-audio'] if audio_holder else '',
+        })
+    return records
+
+
+def _fetch_page_rows(url: str) -> list[dict]:
+    """Fetch one listing page with and without nikkud and merge the two views.
+
+    Raises requests.RequestException if either request fails.
+    """
+    # First request: with nikkud — parse with BeautifulSoup for audio URL
+    cookies = {'translit': 'none', 'hebstyle': 'mo'}
+    response = session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
+    response.raise_for_status()
+    page_rows = _parse_page_with_audio(response.content)
+
+    # Second request: without nikkud — just get the word column
+    cookies_vl = {'translit': 'none', 'hebstyle': 'vl', 'showmeaning': 'off'}
+    resp_vl = session.get(url, cookies=cookies_vl, timeout=REQUEST_TIMEOUT)
+    resp_vl.raise_for_status()
+    soup_vl = BeautifulSoup(resp_vl.content, 'html.parser')
+    no_nik_words = []
+    for tr in soup_vl.select('table tr'):
+        tds = tr.find_all('td')
+        if len(tds) < 4:
+            continue
+        menukad = tds[0].find('span', class_='menukad')
+        w = menukad.get_text(strip=True) if menukad else tds[0].get_text(strip=True)
+        # _parse_page_with_audio drops rows with an empty word; apply the same
+        # filter here so both lists stay index-aligned.
+        if w:
+            no_nik_words.append(w)
+
+    if len(no_nik_words) != len(page_rows):
+        # The merge below is positional, so a length mismatch means some
+        # 'Word Without Nikkud' values on this page may be wrong or empty.
+        logger.warning(
+            f"Row count mismatch on {url}: {len(page_rows)} with nikkud vs "
+            f"{len(no_nik_words)} without"
+        )
+
+    # Merge no-nikkud words into rows
+    for i, row in enumerate(page_rows):
+        row['Word Without Nikkud'] = no_nik_words[i] if i < len(no_nik_words) else ''
+    return page_rows
+
+
+def extract_from_website(max_pages: Optional[int] = None) -> pd.DataFrame:
+    """
+    Extract dictionary entries from pealim.com.
+    Captures audio URLs from each word entry's data-audio attribute.
+
+    Pages 1..N (inclusive) are fetched. A page that fails with a network/HTTP
+    error is retried once; a page that still fails, or that raises any other
+    error, is logged and skipped so one bad page does not abort the run.
+
+    Args:
+        max_pages: Maximum pages to scrape (None = all)
+
+    Returns:
+        DataFrame with Word, Root, Part of Speech, Meaning, Word Without Nikkud, audio_url columns
+        (an empty, column-less DataFrame if nothing could be scraped)
+    """
+    total_pages = max_pages or get_total_pages()
+    logger.info(f"Starting extraction from {total_pages} pages...")
+
+    max_attempts = 2  # initial try + one retry on network errors
+    all_rows: list[dict] = []
+
+    # Page numbers are 1-based, so the upper bound must be inclusive;
+    # range(1, total_pages) would silently drop the last page.
+    for page_num in range(1, total_pages + 1):
+        url = f"{PEALIM_DICT_URL}?page={page_num}"
+
+        for attempt in range(1, max_attempts + 1):
+            try:
+                all_rows.extend(_fetch_page_rows(url))
+                break
+            except requests.RequestException as e:
+                if attempt < max_attempts:
+                    logger.error(f"Error fetching page {page_num}: {e}. Retrying...")
+                    time.sleep(REQUEST_DELAY * 2)
+                else:
+                    logger.error(f"Error fetching page {page_num}: {e}. Giving up on this page.")
+            except Exception as e:
+                # Parsing problems will not be fixed by retrying the request
+                logger.error(f"Unexpected error on page {page_num}: {e}")
+                break
+
+        if page_num % 50 == 0:
+            logger.info(f"Processed {page_num}/{total_pages} pages ({len(all_rows)} words so far)...")
+
+        time.sleep(REQUEST_DELAY)
+
+    df = pd.DataFrame(all_rows)
+    # With zero rows the DataFrame has no columns at all, hence the guard
+    audio_count = (df['audio_url'] != '').sum() if 'audio_url' in df.columns else 0
+    logger.info(f"Extraction complete. Total words: {len(df)}, with audio URL: {audio_count}")
+    return df
+
+
+def modify_for_anki(df: pd.DataFrame) -> pd.DataFrame:
+    """
+    Transform dictionary DataFrame for Anki import.
+    Adds shared root words and Hebrew tags. Preserves audio_url column.
+
+    Two columns are added to ``df`` in place and the same object is returned:
+      - 'shared roots': space-separated Words of the other rows that have the
+        same Root (empty when Root is '-' or missing).
+      - 'tags': space-separated tags — a "שורש::<root>" tag when a root is
+        present, followed by one part-of-speech tag when the Part of Speech
+        text contains a known English label.
+    """
+    logger.info("Preparing data for Anki...")
+
+    # English PoS label (substring match, first hit wins) → Hebrew tag.
+    # Built once here instead of once per row.
+    pos_tags = {
+        'Adverb': 'תוארי_הפועל',
+        'Pronoun': 'כינויי_גוף',
+        'Noun': 'שם_עצם',
+        'Verb': 'פעלים',
+        'Adjective': 'שם_תואר',
+        'Preposition': 'מילות_יחס',
+        'Conjunction': 'מילות_חיבור',
+        'Particle': 'מילית'
+    }
+
+    # Pass 1: group words by root in row order, so each row's siblings can be
+    # looked up directly instead of re-filtering the whole DataFrame per row.
+    words_by_root: dict = {}
+    for _, row in df.iterrows():
+        root = row['Root']
+        if root != '-' and pd.notna(root):
+            words_by_root.setdefault(root, []).append(row['Word'])
+
+    # Pass 2: build both derived columns
+    shared_root_words = []
+    tags = []
+    for _, row in df.iterrows():
+        root = row['Root']
+        word = row['Word']
+
+        # Find shared root words (every same-root word except this one)
+        if root != '-' and pd.notna(root):
+            siblings = (str(w) for w in words_by_root[root] if w != word)
+            shared_root_words.append(' '.join(siblings))
+        else:
+            shared_root_words.append('')
+
+        # Generate Hebrew tags
+        tag_parts = []
+
+        root_key = str(root).replace(' ', '').replace('-', '')
+        # str() turns a missing root into 'nan'; skip that and the '-' placeholder
+        if 'nan' not in root_key and root_key:
+            root_clean = root_key.replace('.', '')
+            tag_parts.append(f"שורש::{root_clean}")
+
+        pos = str(row['Part of Speech'])
+        for key, value in pos_tags.items():
+            if key in pos:
+                tag_parts.append(value)
+                break
+
+        tags.append(' '.join(tag_parts))
+
+    df['shared roots'] = shared_root_words
+    df['tags'] = tags
+    logger.info("Anki preparation complete.")
+    return df
+
+
+def main():
+    """Scrape the dictionary and write both CSV files to the current directory.
+
+    Writes 'hebrew_dict.csv' (raw scrape, comma-separated) and
+    'hebrew_dict_for_anki.csv' (with shared roots and tags, ';'-separated).
+    Any exception is logged as fatal and then re-raised.
+    """
+    try:
+        # Stage 1: raw scrape
+        raw_df = extract_from_website()
+        raw_df.to_csv('hebrew_dict.csv', index=True)
+        logger.info("Saved: hebrew_dict.csv")
+
+        # Stage 2: Anki enrichment
+        anki_df = modify_for_anki(raw_df)
+        anki_df.to_csv('hebrew_dict_for_anki.csv', sep=';', index=True)
+        logger.info("Saved: hebrew_dict_for_anki.csv")
+
+        logger.info("Complete!")
+
+    except Exception as exc:
+        logger.error(f"Fatal error: {exc}")
+        raise
+
+
+# Run the scraper only when executed as a script, not on import
+if __name__ == '__main__':
+    main()
--- a/image_fetch.py
+++ b/image_fetch.py
@ -214,7 +214,11 @@ def run(limit: int | None = None, dry_run: bool = False, single_word: str | None
    """
    import pandas as pd

-    dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
+    dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
+    if not dict_csv.exists():
+        dict_csv = DATA_DIR / "hebrew_dict.csv"
+    if not dict_csv.exists():
+        dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    if not dict_csv.exists():
        dict_csv = DATA_DIR / "pealim_dict.csv"
    if not dict_csv.exists():
--- a/run.py
+++ b/run.py
@ -55,26 +55,31 @@ def parse_args():

 def step_scrape(args):
    """Step 1 — scrape or load dictionary."""
-    dict_csv = DATA_DIR / "pealim_dict.csv"
-    anki_csv = DATA_DIR / "pealim_dict_for_anki.csv"
+    dict_csv = DATA_DIR / "hebrew_dict.csv"
+    anki_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
+    # Legacy fallback names
+    legacy_dict = DATA_DIR / "pealim_dict.csv"
+    legacy_anki = DATA_DIR / "pealim_dict_for_anki.csv"

    if args.skip_scrape:
        if dict_csv.exists():
            logger.info(f"[1] Using existing {dict_csv}")
+        elif legacy_dict.exists():
+            logger.info(f"[1] Using legacy {legacy_dict} (consider renaming)")
        else:
            logger.error(f"[1] --skip-scrape set but {dict_csv} not found. Aborting.")
            sys.exit(1)
        return

    logger.info("[1] Scraping dictionary from pealim.com …")
-    import pealim_extract
+    import hebrew_extract
    import pandas as pd

-    df = pealim_extract.extract_from_website()
+    df = hebrew_extract.extract_from_website()
    df.to_csv(dict_csv, index=True)
    logger.info(f"    Saved {len(df)} words → {dict_csv}")

-    df = pealim_extract.modify_for_anki(df)
+    df = hebrew_extract.modify_for_anki(df)
    df.to_csv(anki_csv, sep=";", index=True)
    logger.info(f"    Saved Anki CSV → {anki_csv}")

@ -101,7 +106,11 @@ def step_examples(args, freq_cache: dict):
    import benyehuda
    benyehuda.load(force_rebuild=args.refresh_examples)

-    dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
+    dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
+    if not dict_csv.exists():
+        dict_csv = DATA_DIR / "hebrew_dict.csv"
+    if not dict_csv.exists():
+        dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    if not dict_csv.exists():
        dict_csv = DATA_DIR / "pealim_dict.csv"

@ -132,22 +141,18 @@ def step_examples(args, freq_cache: dict):


 def step_audio(args):
-    """Step 4 — download vocabulary audio .mp3 files."""
+    """Step 4 — download vocabulary audio .mp3 files from audio_url column in CSV."""
    if args.skip_audio:
        logger.info("[4] Skipping audio (--skip-audio)")
        return

    logger.info("[4] Downloading vocabulary audio files …")
-    audio_cache_path = DATA_DIR / "audio_cache.json"
-    audio_url_cache: dict = {}
-    if audio_cache_path.exists():
-        with open(audio_cache_path) as f:
-            audio_url_cache = json.load(f)

-    import audio_extract as ae
-    ae._audio_cache = audio_url_cache
-
-    dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
+    dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
+    if not dict_csv.exists():
+        dict_csv = DATA_DIR / "hebrew_dict.csv"
+    if not dict_csv.exists():
+        dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    if not dict_csv.exists():
        dict_csv = DATA_DIR / "pealim_dict.csv"

@ -161,12 +166,17 @@ def step_audio(args):
        except (ValueError, pd.errors.ParserError):
            df = pd.read_csv(dict_csv, index_col=0)

+        if 'audio_url' not in df.columns:
+            logger.warning("    No audio_url column in CSV — re-scrape with hebrew_extract.py to capture audio URLs")
+            return
+
        if args.test:
            df = df.head(args.test)

        AUDIO_DIR.mkdir(parents=True, exist_ok=True)
        downloaded = 0
        skipped = 0
+        no_url = 0

        def strip_nik(t: str) -> str:
            return "".join(c for c in unicodedata.normalize("NFD", t)
@ -175,6 +185,8 @@ def step_audio(args):
        for _, row in df.iterrows():
            word = str(row.get("Word", "")).strip()
            word_plain = str(row.get("Word Without Nikkud", "")).strip()
+            audio_url = str(row.get("audio_url", "")).strip()
+
            if not word:
                continue

@ -187,19 +199,20 @@ def step_audio(args):
                skipped += 1
                continue

-            audio_url = ae.extract_audio_url(word)
-            if audio_url:
-                try:
-                    resp = requests.get(audio_url, timeout=10)
-                    resp.raise_for_status()
-                    mp3_path.write_bytes(resp.content)
-                    downloaded += 1
-                    time.sleep(0.3)
-                except Exception as e:
-                    logger.debug(f"    Audio download failed for {word}: {e}")
+            if not audio_url or audio_url in ("nan", "None", ""):
+                no_url += 1
+                continue

-        ae.save_audio_cache(str(audio_cache_path))
-        logger.info(f"    Audio: {downloaded} downloaded, {skipped} already cached")
+            try:
+                resp = requests.get(audio_url, timeout=10)
+                resp.raise_for_status()
+                mp3_path.write_bytes(resp.content)
+                downloaded += 1
+                time.sleep(0.3)
+            except Exception as e:
+                logger.debug(f"    Audio download failed for {word}: {e}")
+
+        logger.info(f"    Audio: {downloaded} downloaded, {skipped} already cached, {no_url} without URL")

    except Exception as e:
        logger.warning(f"    Audio step failed: {e}")
@ -350,7 +363,11 @@ def step_build_vocab(args, examples_cache: dict, freq_cache: dict, image_cache:
    logger.info("[5] Building vocabulary deck …")
    import apkg_builder

-    dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
+    dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
+    if not dict_csv.exists():
+        dict_csv = DATA_DIR / "hebrew_dict.csv"
+    if not dict_csv.exists():
+        dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    if not dict_csv.exists():
        dict_csv = DATA_DIR / "pealim_dict.csv"

@ -398,7 +415,11 @@ def print_summary(args, examples_cache, freq_cache, conjugations):
    logger.info("SUMMARY")
    logger.info("=" * 60)

-    dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
+    dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
+    if not dict_csv.exists():
+        dict_csv = DATA_DIR / "hebrew_dict.csv"
+    if not dict_csv.exists():
+        dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    if not dict_csv.exists():
        dict_csv = DATA_DIR / "pealim_dict.csv"
    if dict_csv.exists():
@ -432,8 +453,8 @@ def print_summary(args, examples_cache, freq_cache, conjugations):
        found_imgs = sum(1 for v in ic.values() if v)
        logger.info(f"  Images: {found_imgs}/{len(ic)} nouns with images")

-    vocab_apkg = OUTPUT_DIR / "pealim_vocabulary.apkg"
-    conj_apkg  = OUTPUT_DIR / "pealim_conjugations.apkg"
+    vocab_apkg = OUTPUT_DIR / "hebrew_vocabulary.apkg"
+    conj_apkg  = OUTPUT_DIR / "hebrew_conjugations.apkg"
    if vocab_apkg.exists():
        size_mb = vocab_apkg.stat().st_size / 1e6
        logger.info(f"  Vocabulary .apkg: {size_mb:.1f} MB → {vocab_apkg}")
--- a/validate_apkg.py
+++ b/validate_apkg.py
@ -20,8 +20,8 @@ import tempfile
 import zipfile
 from pathlib import Path

-VOCAB_APKG = Path("output/pealim_vocabulary.apkg")
-CONJ_APKG = Path("output/pealim_conjugations.apkg")
+VOCAB_APKG = Path("output/hebrew_vocabulary.apkg")
+CONJ_APKG = Path("output/hebrew_conjugations.apkg")

 PASS = "\033[32m✓\033[0m"
 FAIL = "\033[31m✗\033[0m"