hebrew_flash_cards/run.py
Sochen b086123bec feat: add apkg builder, frequency, Ben Yehuda examples, conjugation deck
Implements four major improvements to the Pealim Anki deck pipeline:

1. Automated .apkg generation (genanki) — no more manual Anki Desktop step.
   Both vocabulary and conjugation decks are built programmatically.

2. Word frequency ranking from hermitdave/FrequencyWords he_50k corpus.
   Notes sorted by rank so Anki presents most common words first.

3. Example sentences from Ben Yehuda public domain corpus (not pealim.com).
   Downloads txt_stripped.zip, indexes 25k texts, ~89% coverage on test set.

4. Conjugation drill deck — one card per form × verb.
   Input: verbs_input.txt (Hebrew infinitives). Initial set: 7 verbs (one
   per binyan). Extracts 28 forms each via pealim.com/search/ + table parse.

New files:
  apkg_builder.py     — genanki deck builder for both decks
  benyehuda.py        — Ben Yehuda corpus downloader + sentence indexer
  frequency_lookup.py — FrequencyWords downloader + rank lookup
  verbs_input.txt     — verb input list (7 test verbs, one per binyan)
  data/               — baseline CSVs + generated caches

Updated:
  conjugation_extract.py — rewritten: reads verbs_input.txt, searches
                           /search/?q= for slug, parses table by row labels
  requirements.txt       — add genanki, beautifulsoup4, lxml
  run.py                 — full orchestration pipeline with CLI flags
  .gitignore             — exclude venv/, benyehuda_index.json, audio/, output/

CLI:
  python run.py --skip-scrape --skip-audio --test 20  (quick test)
  python run.py --skip-scrape                          (full build)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 01:58:31 +00:00

313 lines
10 KiB
Python

#!/usr/bin/env python3
"""
Pealim Anki Deck Builder — full pipeline orchestrator.
Usage:
python run.py [options]
Options:
--skip-scrape Use existing data/pealim_dict.csv (no pealim.com dict scraping)
--skip-audio Skip audio .mp3 downloads
--skip-examples Skip Ben Yehuda example fetching
--skip-conjugations Skip verb conjugation extraction
--test N Process only the first N dictionary words (for quick testing)
"""
import argparse
import json
import logging
import sys
import time
from pathlib import Path
# Put this script's directory at the front of the import path; the step
# functions below import their helper modules lazily by bare name.
_PROJECT_DIR = Path(__file__).parent
sys.path.insert(0, str(_PROJECT_DIR))

# Root logger: INFO and above, each record prefixed with time and level.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# data/ holds the dictionary CSVs and JSON caches; output/ is where the
# summary looks for the built .apkg files.
DATA_DIR = _PROJECT_DIR / "data"
OUTPUT_DIR = _PROJECT_DIR / "output"
def parse_args():
    """Parse the command-line options listed in the module docstring.

    Returns:
        argparse.Namespace with the boolean attributes ``skip_scrape``,
        ``skip_audio``, ``skip_examples`` and ``skip_conjugations`` (all
        False unless the flag is given) and ``test`` (an int, or None when
        ``--test`` is absent).
    """
    parser = argparse.ArgumentParser(description="Pealim Anki deck builder")

    # Every "skip" switch is a plain boolean flag, so register them from a table.
    skip_flags = (
        ("--skip-scrape", "Skip dict scraping; use cached CSV"),
        ("--skip-audio", "Skip audio downloads"),
        ("--skip-examples", "Skip Ben Yehuda example lookup"),
        ("--skip-conjugations", "Skip verb conjugation extraction"),
    )
    for flag, help_text in skip_flags:
        parser.add_argument(flag, action="store_true", help=help_text)

    parser.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
    return parser.parse_args()
def step_scrape(args):
    """Step 1 — scrape the dictionary, or confirm a cached copy exists.

    With ``--skip-scrape`` nothing is fetched: the function only checks that
    ``data/pealim_dict.csv`` is present and exits with status 1 if it is not.
    Otherwise it calls ``pealim_extract.extract_from_website()``, saves the
    result as ``pealim_dict.csv``, then saves the output of
    ``pealim_extract.modify_for_anki()`` as the semicolon-separated
    ``pealim_dict_for_anki.csv``.

    Args:
        args: Parsed CLI namespace; only ``skip_scrape`` is read.

    Raises:
        SystemExit: ``--skip-scrape`` was given but the cached CSV is missing.
    """
    dict_csv = DATA_DIR / "pealim_dict.csv"
    anki_csv = DATA_DIR / "pealim_dict_for_anki.csv"

    if args.skip_scrape:
        if not dict_csv.exists():
            logger.error(f"[1] --skip-scrape set but {dict_csv} not found. Aborting.")
            sys.exit(1)
        logger.info(f"[1] Using existing {dict_csv}")
        return

    logger.info("[1] Scraping dictionary from pealim.com …")
    import pealim_extract

    # Make sure the target directory exists *before* scraping: to_csv() does
    # not create parent directories, and failing after the scrape has run
    # would throw the scraped data away.
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    df = pealim_extract.extract_from_website()
    df.to_csv(dict_csv, index=True)
    logger.info(f" Saved {len(df)} words → {dict_csv}")

    df = pealim_extract.modify_for_anki(df)
    df.to_csv(anki_csv, sep=";", index=True)
    logger.info(f" Saved Anki CSV → {anki_csv}")
def step_frequency():
    """Step 2 — load/download word frequency data.

    Returns:
        The ``_freq`` attribute of the ``frequency_lookup`` module as it
        stands after ``frequency_lookup.load()`` has run.
    """
    logger.info("[2] Loading word frequency data …")
    import frequency_lookup as freq_module

    freq_module.load()
    # Read the attribute only after load() so a rebinding inside load() is seen.
    loaded = freq_module._freq
    return loaded
def step_examples(args, freq_cache: dict):
    """Step 3 — load/build Ben Yehuda example index.

    With ``--skip-examples`` the function returns the contents of
    ``data/examples_cache.json`` if that file exists, else an empty dict.
    Otherwise it loads the ``benyehuda`` module, calls
    ``benyehuda.get_examples()`` once for every dictionary word (nikkud
    stripped), saves the example cache and returns it.

    Args:
        args: Parsed CLI namespace; ``skip_examples`` and ``test`` are read.
        freq_cache: Not used by this step; kept so existing callers keep
            working.

    Returns:
        The example cache: either the decoded JSON file or
        ``benyehuda._examples_cache``.
    """
    if args.skip_examples:
        logger.info("[3] Skipping examples (--skip-examples)")
        examples_path = DATA_DIR / "examples_cache.json"
        if examples_path.exists():
            # The cache is keyed by Hebrew text; do not depend on the locale's
            # default encoding to decode it.
            with open(examples_path, encoding="utf-8") as f:
                return json.load(f)
        return {}

    logger.info("[3] Loading Ben Yehuda example index …")
    import unicodedata

    import benyehuda
    benyehuda.load()

    def strip_marks(text):
        """Drop combining marks (Unicode category Mn) from *text*."""
        decomposed = unicodedata.normalize("NFD", text)
        return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")

    # Pre-fetch examples for all words in the dict (uses cache)
    dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    if not dict_csv.exists():
        dict_csv = DATA_DIR / "pealim_dict.csv"

    # Best effort: a failure here must not stop the rest of the pipeline.
    try:
        import pandas as pd

        # Try the semicolon-separated Anki layout first; if that yields fewer
        # than three data columns, re-read as an ordinary comma-separated CSV.
        try:
            df = pd.read_csv(dict_csv, sep=";", index_col=0)
            if df.shape[1] < 3:
                raise ValueError
        except Exception:
            df = pd.read_csv(dict_csv, index_col=0)

        if args.test:
            df = df.head(args.test)

        logger.info(f" Pre-fetching examples for {len(df)} words …")
        for _, row in df.iterrows():
            raw = row.get("Word Without Nikkud", "")
            # An empty CSV cell is read as NaN; str(NaN) is "nan", which would
            # otherwise be looked up (and cached) as if it were a word.
            if pd.isna(raw):
                continue
            word_plain = strip_marks(str(raw).strip())
            if word_plain:
                benyehuda.get_examples(word_plain)
    except Exception as e:
        logger.warning(f" Could not pre-fetch all examples: {e}")

    benyehuda.save_examples_cache()
    return benyehuda._examples_cache
def step_audio(args):
    """Step 4 — download audio .mp3 files.

    For every dictionary row with a non-empty "Word", the audio URL is
    obtained from ``audio_extract.extract_audio_url(word)`` and the file is
    saved as ``<hebrew letters of the word>.mp3`` in the audio directory.
    Files that already exist are counted as cached and not fetched again.
    The URL cache is seeded from ``data/audio_cache.json`` (when present) and
    written back through ``audio_extract.save_audio_cache()`` after the loop.

    A failed download of a single word is logged at DEBUG level and skipped;
    any other failure aborts the step with a warning but does not raise.

    Args:
        args: Parsed CLI namespace; ``skip_audio`` and ``test`` are read.
    """
    if args.skip_audio:
        logger.info("[4] Skipping audio (--skip-audio)")
        return
    logger.info("[4] Downloading audio files …")

    import re
    import unicodedata

    # Load audio URL cache (from old workspace if available)
    audio_cache_path = DATA_DIR / "audio_cache.json"
    audio_url_cache: dict = {}
    if audio_cache_path.exists():
        # Keys/values may contain Hebrew; do not rely on the locale encoding.
        with open(audio_cache_path, encoding="utf-8") as f:
            audio_url_cache = json.load(f)
    import audio_extract as ae
    ae._audio_cache = audio_url_cache

    # This module never defined AUDIO_DIR, so every run used to end in a
    # swallowed NameError. Use the directory print_summary() counts .mp3
    # files in.
    # NOTE(review): confirm apkg_builder reads audio from the same location.
    audio_dir = DATA_DIR / "audio"

    dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    if not dict_csv.exists():
        dict_csv = DATA_DIR / "pealim_dict.csv"

    import pandas as pd
    import requests

    # Anything outside the Hebrew letter range alef..tav is dropped from the
    # file name. Compiled once instead of once per row.
    non_hebrew = re.compile(r"[^\u05d0-\u05ea]")

    def strip_nik(t):
        """Drop combining marks (Unicode category Mn) from *t*."""
        return "".join(c for c in unicodedata.normalize("NFD", t)
                       if unicodedata.category(c) != "Mn")

    def cell(row, column):
        """Return the stripped text of a cell, or "" for a missing/NaN cell."""
        value = row.get(column, "")
        # str(NaN) would be the truthy string "nan" and defeat the
        # `word_plain or word` fallback below.
        return "" if pd.isna(value) else str(value).strip()

    try:
        # Try the semicolon-separated Anki layout first; if that yields fewer
        # than three data columns, re-read as an ordinary comma-separated CSV.
        try:
            df = pd.read_csv(dict_csv, sep=";", index_col=0)
            if df.shape[1] < 3:
                raise ValueError
        except Exception:
            df = pd.read_csv(dict_csv, index_col=0)
        if args.test:
            df = df.head(args.test)

        audio_dir.mkdir(parents=True, exist_ok=True)
        downloaded = 0
        skipped = 0
        for _, row in df.iterrows():
            word = cell(row, "Word")
            word_plain = cell(row, "Word Without Nikkud")
            if not word:
                continue
            safe_name = non_hebrew.sub("", strip_nik(word_plain or word))
            if not safe_name:
                continue
            mp3_path = audio_dir / f"{safe_name}.mp3"
            if mp3_path.exists():
                skipped += 1
                continue
            # Get audio URL from cache or fetch
            audio_url = ae.extract_audio_url(word)
            if audio_url:
                try:
                    resp = requests.get(audio_url, timeout=10)
                    resp.raise_for_status()
                    mp3_path.write_bytes(resp.content)
                    downloaded += 1
                    # Pause between successful downloads.
                    time.sleep(0.3)
                except Exception as e:
                    logger.debug(f" Audio download failed for {word}: {e}")
        ae.save_audio_cache(str(audio_cache_path))
        logger.info(f" Audio: {downloaded} downloaded, {skipped} already cached")
    except Exception as e:
        logger.warning(f" Audio step failed: {e}")
def step_build_vocab(args, examples_cache: dict, freq_cache: dict):
    """Step 5 — build vocabulary .apkg.

    Args:
        args: Parsed CLI namespace; ``test`` is forwarded as the word limit.
        examples_cache: Forwarded to ``apkg_builder.build_vocab_deck``.
        freq_cache: Forwarded to ``apkg_builder.build_vocab_deck``.

    Returns:
        The deck object returned by ``apkg_builder.build_vocab_deck``.
    """
    logger.info("[5] Building vocabulary deck …")
    import apkg_builder as builder

    # Prefer the Anki-formatted CSV; otherwise hand over the raw dictionary CSV.
    anki_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    source_csv = anki_csv if anki_csv.exists() else DATA_DIR / "pealim_dict.csv"

    build_options = {
        "examples_cache": examples_cache,
        "freq_cache": freq_cache,
        "limit": args.test,
    }
    deck, media = builder.build_vocab_deck(source_csv, **build_options)
    builder.write_vocab_apkg(deck, media)

    logger.info(f" Vocabulary .apkg → {builder.VOCAB_APKG}")
    return deck
def step_conjugations(args):
    """Step 6 — extract conjugations and build conjugation deck.

    Args:
        args: Parsed CLI namespace; only ``skip_conjugations`` is read.

    Returns:
        Whatever ``conjugation_extract.main(verbs_file)`` returns, or None
        when the step is skipped (by flag, or because ``verbs_input.txt`` is
        not present next to this script).
    """
    if args.skip_conjugations:
        logger.info("[6] Skipping conjugations (--skip-conjugations)")
        return None

    verbs_file = Path(__file__).parent / "verbs_input.txt"
    if not verbs_file.exists():
        logger.info("[6] verbs_input.txt not found — skipping conjugation deck")
        return None

    logger.info("[6] Extracting verb conjugations …")
    import conjugation_extract as extractor
    conjugations = extractor.main(verbs_file)

    import apkg_builder as builder
    builder.write_conj_apkg(builder.build_conj_deck(conjugations))
    logger.info(f" Conjugation .apkg → {builder.CONJ_APKG}")
    return conjugations
def print_summary(args, examples_cache, freq_cache, conjugations):
    """Log a closing report of what the pipeline produced.

    Reported, each only when the underlying file/value is available:
    dictionary row count, frequency and example cache sizes, example
    coverage, number of .mp3 files under ``data/audio``, size of each
    .apkg under ``output/`` and the number of verbs with conjugations.

    Args:
        args: Parsed CLI namespace (not read here).
        examples_cache: Mapping whose truthy values count as "covered".
        freq_cache: Sized container of frequency entries.
        conjugations: Mapping whose truthy values count as verbs in the deck.
    """
    rule = "=" * 60
    logger.info("")
    logger.info(rule)
    logger.info("SUMMARY")
    logger.info(rule)

    # Same CSV preference as the build steps: Anki layout first, raw second.
    anki_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    source_csv = anki_csv if anki_csv.exists() else DATA_DIR / "pealim_dict.csv"
    if source_csv.exists():
        import pandas as pd
        try:
            table = pd.read_csv(source_csv, sep=";", index_col=0)
            if table.shape[1] < 3:
                raise ValueError
        except Exception:
            # Not semicolon-separated after all — read it as a regular CSV.
            table = pd.read_csv(source_csv, index_col=0)
        logger.info(f" Dictionary words: {len(table)}")

    logger.info(f" Frequency entries: {len(freq_cache)}")
    total_examples = len(examples_cache)
    logger.info(f" Example cache entries: {total_examples}")
    covered = len([entry for entry in examples_cache.values() if entry])
    if examples_cache:
        percent = 100 * covered // total_examples
        logger.info(f" Example coverage: {covered}/{total_examples} ({percent}%)")

    audio_dir = DATA_DIR / "audio"
    if audio_dir.exists():
        mp3_count = sum(1 for _ in audio_dir.glob("*.mp3"))
        logger.info(f" Audio files: {mp3_count}")

    vocab_apkg = OUTPUT_DIR / "pealim_vocabulary.apkg"
    if vocab_apkg.exists():
        size_mb = vocab_apkg.stat().st_size / 1e6
        logger.info(f" Vocabulary .apkg: {size_mb:.1f} MB → {vocab_apkg}")

    conj_apkg = OUTPUT_DIR / "pealim_conjugations.apkg"
    if conj_apkg.exists():
        size_mb = conj_apkg.stat().st_size / 1e6
        logger.info(f" Conjugation .apkg: {size_mb:.1f} MB → {conj_apkg}")

    if conjugations:
        verb_count = len([forms for forms in conjugations.values() if forms])
        logger.info(f" Verbs in conjugation deck: {verb_count}")

    logger.info(rule)
    logger.info("✅ DONE")
def main():
    """Run the whole pipeline in order, then log the summary.

    Order: scrape → frequency → examples → audio → vocabulary deck →
    conjugation deck → summary. Each step decides for itself whether it is
    skipped, based on the parsed CLI flags.
    """
    args = parse_args()

    banner = "=" * 60
    logger.info(banner)
    logger.info("PEALIM ANKI DECK BUILDER")
    if args.test:
        logger.info(f" TEST MODE: {args.test} words")
    logger.info(banner)

    step_scrape(args)
    freq_cache = step_frequency()
    examples_cache = step_examples(args, freq_cache)
    step_audio(args)
    step_build_vocab(args, examples_cache, freq_cache)

    # The conjugation step returns None when skipped; the summary expects a mapping.
    conjugations = step_conjugations(args)
    if not conjugations:
        conjugations = {}
    print_summary(args, examples_cache, freq_cache, conjugations)


if __name__ == "__main__":
    main()