hebrew_flash_cards/run.py
Sochen efd0745ada Sprint 14: deck template/CSS overhaul + Sprint 12 detail scrape
Template & CSS fixes (15 items from Mar 9 feedback):
- Fix conjugation front showing 3ms form instead of infinitive
- Rename conjugation model to "Hebrew Conjugation"
- Strip Hebrew parenthesized text from English meanings
- Shoresh separator: spaces → dots (א.כ.ל)
- Remove duplicate English meaning from cloze back
- Remove example sentences from vocab front/back (cloze only)
- Center-align audio buttons on all decks
- Fix parenthesis spacing: "you(feminine,singular)" → "you (feminine, singular)"
- Unify sec-key/sec-label fonts, make keys bold
- Size overhaul: bigger Hebrew (42px), meaning (34px), secondary (28px)
- Center-align related words groups
- Sort confusables by average frequency
- Plurals: show Gender (Hebrew) before Mishkal, strip emoji from meaning
- Clean duplicate quotation marks in cloze sentences

Sprint 12 carry-forward (detail scrape + EPUB):
- Adjective/preposition detail scraping in pealim_detail_scrape.py
- EPUB example matching rewrite in epub_examples.py
- Delete benyehuda.py and rebuild_sentence_matches.py (merged)
- 49 parser tests for detail scraping
- SCHEMA.yaml updates for new fields

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 07:44:47 +00:00

362 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Pealim Anki Deck Builder — full pipeline orchestrator.
Usage:
python run.py [options]
Options:
--only {vocab,conjugations,confusables,plurals,complete} Run only one deck
Pipeline steps:
1. List scrape — scrape pealim.com list pages → words.json (captures slugs)
2. Detail scrape — scrape noun/verb detail pages using slugs → words.json
3. Frequency — load/download word frequency data
4. Examples — extract example sentences from Hebrew EPUBs
5. Audio download — download audio mp3 files
6. Fonts — download Heebo font files
7. Images — fetch noun images from Wikipedia
8. Build — build all .apkg deck variants
Options:
--skip-scrape Skip list page scraping (use existing words.json)
--skip-detail Skip detail page scraping
--skip-audio Skip audio .mp3 downloads
--skip-examples Skip EPUB example extraction
--skip-images Skip image fetching for concrete nouns
--test N Limit to first N words/pages
"""
import argparse
import json
import logging
import re
import sys
from pathlib import Path
# Directory that contains this script; every data/output path below hangs off it.
_SCRIPT_DIR = Path(__file__).parent

# Put the script's own directory first on sys.path. The pipeline step functions
# import their helper modules lazily by bare name (presumably siblings of this
# file — verify against the repository layout).
sys.path.insert(0, str(_SCRIPT_DIR))

# Root logging configuration: INFO and above, timestamped.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Filesystem layout used by the pipeline steps.
DATA_DIR = _SCRIPT_DIR / "data"
OUTPUT_DIR = _SCRIPT_DIR / "output"
AUDIO_DIR = DATA_DIR / "audio"            # *.mp3 files counted as vocabulary audio in the summary
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"  # *.mp3 files counted as conjugation audio in the summary
FONTS_DIR = DATA_DIR / "fonts"            # Heebo font cache
WORDS_JSON = DATA_DIR / "words.json"      # unified word data read/written by the steps
def parse_args():
    """Parse the command line (``sys.argv``) for the deck builder.

    Returns:
        argparse.Namespace with:
        ``only`` — one of the deck names, or None when not given;
        ``skip_scrape``, ``skip_detail``, ``skip_audio``, ``skip_examples``,
        ``skip_images`` — booleans, False unless the flag is present;
        ``test`` — int, or None when not given.
    """
    parser = argparse.ArgumentParser(description="Pealim Anki deck builder")
    parser.add_argument(
        "--only",
        choices=["vocab", "conjugations", "confusables", "plurals", "complete"],
        help="Run only one deck (skips all unrelated steps)",
    )

    # Every --skip-* switch is a plain store_true flag, so declare them from a
    # table; the order is kept so --help output is unchanged.
    skip_switches = (
        ("--skip-scrape", "Skip list page scraping"),
        ("--skip-detail", "Skip detail page scraping"),
        ("--skip-audio", "Skip audio downloads"),
        ("--skip-examples", "Skip EPUB example extraction"),
        ("--skip-images", "Skip image fetching"),
    )
    for switch, help_text in skip_switches:
        parser.add_argument(switch, action="store_true", help=help_text)

    parser.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
    return parser.parse_args()
def step_list_scrape(args):
    """Step 1 — scrape pealim.com list pages → words.json.

    With ``--skip-scrape`` nothing is scraped: an existing words.json is
    required, and the process exits with status 1 if it is missing.
    Otherwise ``pealim_list_scrape.run_scrape`` is invoked with
    ``force_refresh=False`` and ``--test`` as ``total_pages``.

    Args:
        args: parsed CLI namespace; reads ``skip_scrape`` and ``test``.
    """
    if not args.skip_scrape:
        logger.info("[1] Scraping dictionary list pages from pealim.com …")
        import pealim_list_scrape

        # A falsy --test (not given, or 0) is handed over as None.
        page_cap = args.test or None
        pealim_list_scrape.run_scrape(total_pages=page_cap, force_refresh=False)
        return

    # --skip-scrape: the rest of the pipeline needs a words.json from an earlier run.
    if not WORDS_JSON.exists():
        logger.error(f"[1] --skip-scrape set but {WORDS_JSON} not found. Aborting.")
        sys.exit(1)
    logger.info("[1] Using existing words.json (--skip-scrape)")
def step_frequency() -> dict[str, int]:
    """Step 3 — load/download word frequency data.

    Returns:
        The ``_freq`` attribute of the ``frequency_lookup`` module as it
        stands after ``frequency_lookup.load()`` has been called.
    """
    logger.info("[3] Loading word frequency data …")
    import frequency_lookup as freq_source

    freq_source.load()
    # Reads a private attribute of frequency_lookup; it is only meaningful after load().
    freq_table = freq_source._freq
    return freq_table
def step_examples(args) -> dict:
    """Step 4 — extract example sentences from Hebrew EPUBs.

    Loads words.json, passes the word dict to ``epub_examples.run``, then
    writes the dict back to words.json (``run`` is presumably expected to
    update it in place — verify against epub_examples).

    Args:
        args: parsed CLI namespace; reads ``skip_examples``.

    Returns:
        The stats dict returned by ``epub_examples.run``, or ``{}`` when the
        step is skipped or words.json does not exist.

    Raises:
        KeyError: if the stats dict lacks ``matched`` or ``total_vocab``
            (both are read for the coverage log line).
    """
    if args.skip_examples:
        logger.info("[4] Skipping examples (--skip-examples)")
        return {}
    logger.info("[4] Extracting EPUB example sentences …")
    import epub_examples

    if not WORDS_JSON.exists():
        logger.warning("[4] words.json not found, skipping examples")
        return {}
    with open(WORDS_JSON, encoding="utf-8") as f:
        words = json.load(f)
    stats = epub_examples.run(words)

    # Save updated words.json. Serialise into a sibling temp file first and
    # swap it in afterwards: opening words.json itself in "w" mode truncates
    # it immediately, so a serialisation error or an interrupted run would
    # leave the pipeline's main data file empty or half-written.
    tmp_path = WORDS_JSON.with_name(WORDS_JSON.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(words, f, ensure_ascii=False, indent=2)
    tmp_path.replace(WORDS_JSON)

    logger.info(f" Coverage: {stats['matched']}/{stats['total_vocab']}")
    return stats
def step_detail_scrape(args):
    """Step 2 — scrape detail pages for nouns and verbs → update words.json.

    Does nothing except log when ``--skip-detail`` is set; otherwise calls
    ``pealim_detail_scrape.run`` with ``force_refresh=False``.

    Args:
        args: parsed CLI namespace; reads ``skip_detail`` and ``test``.
    """
    if not args.skip_detail:
        logger.info("[2] Scraping detail pages from pealim.com …")
        import pealim_detail_scrape as detail_scraper

        # A falsy --test (not given, or 0) is handed over as None.
        detail_scraper.run(test=args.test or None, force_refresh=False)
    else:
        logger.info("[2] Skipping detail scrape (--skip-detail)")
def step_audio_download(args):
    """Step 5 — download audio .mp3 files from URLs in words.json.

    Does nothing except log when ``--skip-audio`` is set; otherwise
    delegates to ``pealim_audio_download.run``.

    Args:
        args: parsed CLI namespace; reads ``skip_audio`` and ``test``.
    """
    if not args.skip_audio:
        logger.info("[5] Downloading audio files …")
        import pealim_audio_download as audio_downloader

        # A falsy --test (not given, or 0) is handed over as None.
        audio_downloader.run(test=args.test or None)
    else:
        logger.info("[5] Skipping audio (--skip-audio)")
def _heebo_urls_by_weight(css_text: str) -> dict[str, str]:
    """Map each declared ``font-weight`` in *css_text* to the first font URL found for it."""
    found: dict[str, str] = {}
    for rule_body in re.findall(r"@font-face\s*\{([^}]*)\}", css_text):
        weight = re.search(r"font-weight:\s*(\d+)", rule_body)
        src = re.search(r"src:\s*url\(([^)]+)\)", rule_body)
        if weight and src:
            # setdefault: keep the first rule seen for a weight, ignore later ones.
            found.setdefault(weight.group(1), src.group(1).strip("'\""))
    return found


def step_fonts(_args: argparse.Namespace):
    """Step 6 — download Heebo font files (one-time, cached).

    Ensures ``FONTS_DIR`` exists, then downloads ``_Heebo-Regular.ttf``
    (weight 400) and ``_Heebo-Bold.ttf`` (weight 700) unless they are already
    present. Any failure is logged as a warning and swallowed: this step is
    best-effort and never aborts the pipeline.

    The font URL for each file is chosen by the ``font-weight`` declared in
    its ``@font-face`` rule rather than by position. A stylesheet may contain
    several rules per weight (e.g. one per unicode-range subset), in which
    case the first two URLs in the file are not the regular and bold faces.

    Args:
        _args: unused; accepted so all step functions share a call shape.
    """
    FONTS_DIR.mkdir(parents=True, exist_ok=True)
    regular = FONTS_DIR / "_Heebo-Regular.ttf"
    bold = FONTS_DIR / "_Heebo-Bold.ttf"
    if regular.exists() and bold.exists():
        logger.info("[6] Heebo fonts already cached")
        return
    logger.info("[6] Downloading Heebo fonts from Google Fonts …")
    import requests as _req

    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) Gecko/20100101 Firefox/120.0"}
    css_url = "https://fonts.googleapis.com/css2?family=Heebo:wght@400;700"
    try:
        css_resp = _req.get(css_url, headers=headers, timeout=15)
        css_resp.raise_for_status()
        css_text = css_resp.text

        urls = _heebo_urls_by_weight(css_text)
        if "400" not in urls and "700" not in urls:
            # No usable font-weight declarations: fall back to treating the
            # first two URLs as regular and bold, in that order.
            in_order = [u.strip("'\"") for u in re.findall(r"src:\s*url\(([^)]+)\)", css_text)]
            urls = dict(zip(("400", "700"), in_order))

        for weight, dest in (("400", regular), ("700", bold)):
            if dest.exists():
                continue
            font_url = urls.get(weight)
            if font_url is None:
                # Previously this case was skipped without any message.
                logger.warning(f" No Heebo font URL found for weight {weight}")
                logger.warning(" Cards will fall back to Arial Hebrew / David.")
                continue
            fr = _req.get(font_url, timeout=15)
            fr.raise_for_status()
            dest.write_bytes(fr.content)
            logger.info(f" Downloaded → {dest.name}")
    except Exception as e:  # deliberate best-effort boundary: log and carry on
        logger.warning(f" Heebo download failed: {e}")
        logger.warning(" Cards will fall back to Arial Hebrew / David.")
def step_images(args) -> dict:
    """Step 7 — fetch images for concrete nouns.

    With ``--skip-images`` no fetching happens; the contents of
    ``DATA_DIR / "image_cache.json"`` are returned if that file exists.
    Otherwise the work is delegated to ``image_fetch.run`` (described upstream
    as resume-safe; that behaviour lives in image_fetch, not here).

    Args:
        args: parsed CLI namespace; reads ``skip_images`` and ``test``.

    Returns:
        The parsed image cache, ``{}`` when skipping without a cache file, or
        whatever ``image_fetch.run`` returns.
    """
    if args.skip_images:
        logger.info("[7] Skipping images (--skip-images)")
        cache_path = DATA_DIR / "image_cache.json"
        if not cache_path.exists():
            return {}
        # Read as UTF-8 explicitly, like every other JSON file in this script;
        # without it open() uses the platform's locale encoding, which can
        # fail on non-ASCII content.
        with open(cache_path, encoding="utf-8") as f:
            return json.load(f)

    logger.info("[7] Fetching images for concrete nouns …")
    import image_fetch

    # --test is passed through unchanged (None when the flag was not given).
    return image_fetch.run(limit=args.test)
def step_build_all(args):
    """Step 8 — build all release variants from the unified words.json.

    Exits the process with status 1 when words.json is missing; otherwise
    loads it and hands the data to ``apkg_builder.build_all_variants``.

    Args:
        args: parsed CLI namespace; ``test`` is forwarded as ``limit``.
    """
    logger.info("[8] Building all deck variants …")
    import apkg_builder as deck_builder

    # Nothing can be built without the data file produced by the earlier steps.
    if not WORDS_JSON.exists():
        logger.error(f"[8] {WORDS_JSON} not found. Run the data pipeline first.")
        sys.exit(1)

    all_words = json.loads(WORDS_JSON.read_text(encoding="utf-8"))
    deck_builder.build_all_variants(all_words, limit=args.test)
def print_summary(_args: argparse.Namespace, example_stats: dict, freq_cache: dict):
    """Log an end-of-run summary of the data files and the built decks.

    Every section is optional: it is reported only when the file or directory
    it describes exists.

    Args:
        _args: unused; accepted for a uniform call shape.
        example_stats: stats dict from the examples step (may be empty). Reads
            the optional keys ``matched``, ``total_vocab`` and ``books``.
        freq_cache: frequency table; only its length is reported.
    """
    rule = "=" * 60
    logger.info("")
    logger.info(rule)
    logger.info("SUMMARY")
    logger.info(rule)

    # --- words.json --------------------------------------------------------
    if WORDS_JSON.exists():
        with open(WORDS_JSON, encoding="utf-8") as f:
            words = json.load(f)
        logger.info(f" Dictionary words: {len(words)}")
        nouns = sum(1 for e in words.values() if e.get("pos", "").startswith("Noun"))
        verbs = sum(1 for e in words.values() if e.get("pos", "").startswith("Verb"))
        detail_scraped = sum(1 for e in words.values() if e.get("detail_scraped"))
        logger.info(f" Nouns: {nouns}, Verbs: {verbs}, Detail-scraped: {detail_scraped}")

    # --- frequency and example coverage ------------------------------------
    logger.info(f" Frequency entries: {len(freq_cache)}")
    matched = example_stats.get("matched", 0)
    total = example_stats.get("total_vocab", 0)
    if total:  # also guards the integer division below
        logger.info(f" Example coverage: {matched}/{total} ({100 * matched // total}%)")
    for book, count in example_stats.get("books", {}).items():
        logger.info(f" {book}: {count} sentences")

    # --- audio -------------------------------------------------------------
    if AUDIO_DIR.exists():
        mp3s = list(AUDIO_DIR.glob("*.mp3"))
        logger.info(f" Vocabulary audio files: {len(mp3s)}")
    if AUDIO_CONJ_DIR.exists():
        # Files whose stem ends in "_infinitive" or contains "_passive_" are
        # left out of the "bundled" count.
        mp3s = [
            p
            for p in AUDIO_CONJ_DIR.glob("*.mp3")
            if not p.stem.endswith("_infinitive") and "_passive_" not in p.stem
        ]
        logger.info(f" Conjugation audio files (bundled): {len(mp3s)}")

    # --- images ------------------------------------------------------------
    image_cache_path = DATA_DIR / "image_cache.json"
    if image_cache_path.exists():
        # Explicit UTF-8, consistent with the words.json reads above.
        with open(image_cache_path, encoding="utf-8") as f:
            ic = json.load(f)
        found_imgs = sum(1 for v in ic.values() if v)
        logger.info(f" Images: {found_imgs}/{len(ic)} nouns with images")

    # --- built decks -------------------------------------------------------
    import apkg_builder as _ab

    all_apkgs = [
        _ab.VOCAB_APKG,
        _ab.VOCAB_APKG_AUDIO,
        _ab.VOCAB_APKG_IMAGES,
        _ab.VOCAB_APKG_AUDIO_IMAGES,
        _ab.CONJ_APKG,
        _ab.CONJ_APKG_AUDIO,
        _ab.CONF_APKG,
        _ab.CONF_APKG_AUDIO,
        # The plural decks were missing from this list, so a `--only plurals`
        # run (which writes exactly these two files and then calls this
        # function) reported no deck at all.
        _ab.PLURAL_APKG,
        _ab.PLURAL_APKG_AUDIO,
        _ab.COMPLETE_APKG,
        _ab.COMPLETE_APKG_AUDIO,
    ]
    for apkg in all_apkgs:
        if apkg.exists():
            size_mb = apkg.stat().st_size / 1e6
            logger.info(f" {apkg.name}: {size_mb:.1f} MB")

    logger.info(rule)
    logger.info("DONE")
def main():
    """Entry point: run one single-deck build (``--only``) or the full pipeline.

    For ``--only conjugations|confusables|plurals|complete`` the fonts step
    runs, words.json is loaded (exit status 1 if missing), the plain and the
    audio variant of that deck are built and written, and the summary is
    printed with empty example/frequency stats. In every other case the
    eight pipeline steps run in order, followed by the summary.
    """
    args = parse_args()

    rule = "=" * 60
    logger.info(rule)
    logger.info("PEALIM ANKI DECK BUILDER")
    if args.only:
        logger.info(f" MODE: --only {args.only}")
    if args.test:
        logger.info(f" TEST MODE: {args.test} words")
    logger.info(rule)

    def _load_words_for_only() -> dict:
        """Load words.json for a single-deck build, exiting if it is absent."""
        if not WORDS_JSON.exists():
            logger.error(f"words.json not found at {WORDS_JSON}. Run the data pipeline first.")
            sys.exit(1)
        with open(WORDS_JSON, encoding="utf-8") as f:
            return json.load(f)

    # NOTE(review): "vocab" is an accepted --only choice but has no branch
    # here, so `--only vocab` falls through to the full pipeline below —
    # confirm whether that is intended.
    if args.only in ("conjugations", "confusables", "plurals", "complete"):
        step_fonts(args)
        import apkg_builder

        words = _load_words_for_only()

        # Pick the builder, the writer, the (plain, audio) output paths and
        # any extra builder keyword arguments for the requested deck.
        build_kwargs = {}
        if args.only == "conjugations":
            build_deck = apkg_builder.build_conj_deck
            write_deck = apkg_builder.write_conj_apkg
            out_paths = (apkg_builder.CONJ_APKG, apkg_builder.CONJ_APKG_AUDIO)
        elif args.only == "confusables":
            build_deck = apkg_builder.build_confusables_deck
            write_deck = apkg_builder.write_conf_apkg
            out_paths = (apkg_builder.CONF_APKG, apkg_builder.CONF_APKG_AUDIO)
        elif args.only == "plurals":
            build_deck = apkg_builder.build_plural_deck
            write_deck = apkg_builder.write_plural_apkg
            out_paths = (apkg_builder.PLURAL_APKG, apkg_builder.PLURAL_APKG_AUDIO)
        else:  # "complete"
            build_kwargs["emoji_lookup"] = apkg_builder._load_emoji_lookup()
            build_deck = apkg_builder.build_complete_deck
            write_deck = apkg_builder.write_complete_apkg
            out_paths = (apkg_builder.COMPLETE_APKG, apkg_builder.COMPLETE_APKG_AUDIO)

        # Plain variant first, then the audio variant.
        for with_audio, out_path in zip((False, True), out_paths):
            built, media = build_deck(words, include_audio=with_audio, **build_kwargs)
            write_deck(built, media, out_path=out_path)

        print_summary(args, {}, {})
        return

    # Full pipeline
    step_list_scrape(args)  # 1 — scrape list pages → words.json (captures slugs)
    step_detail_scrape(args)  # 2 — scrape detail pages using slugs → words.json
    freq_cache = step_frequency()  # 3 — word frequency data
    example_stats = step_examples(args)  # 4 — EPUB example sentences
    step_audio_download(args)  # 5 — download audio mp3s
    step_fonts(args)  # 6 — download Heebo fonts
    step_images(args)  # 7 — fetch noun images
    step_build_all(args)  # 8 — build all .apkg variants
    print_summary(args, example_stats, freq_cache)
if __name__ == "__main__":
main()