hebrew_flash_cards/run.py
Sochen 08fb7009d8 Sprint 11: unified JSON architecture + consolidated scraping pipeline
Migrate from fragmented CSV + 10 JSON files to a single data/words.json
(9,104 entries) as the unified data store. All GUIDs preserved for Anki
study progress continuity.

New files:
- SCHEMA.yaml: authoritative schema for words.json
- pealim_list_scrape.py: consolidated list page scraper → words.json
- pealim_detail_scrape.py: noun/verb detail scraper → words.json
- pealim_audio_download.py: audio downloader reading from words.json
- scripts/migrate_to_json.py: one-time CSV→JSON migration
- scripts/validate_data.py: 17 data integrity tests
- scripts/check_guid_coverage.py: GUID preservation checker
- scripts/repair_slugs.py: slug deduplication repair tool
- tests/test_scraper_integration.py: live scraper integration tests

Updated:
- apkg_builder.py: reads from words.json (no more pandas)
- run.py: 8-step pipeline (list scrape → frequency → examples →
  detail scrape → audio download → fonts → images → build)
- benyehuda.py, frequency_lookup.py, image_fetch.py: TODO markers
  for future words.json integration

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 10:54:58 +00:00

388 lines
13 KiB
Python

#!/usr/bin/env python3
"""
Pealim Anki Deck Builder — full pipeline orchestrator.
Usage:
python run.py [options]
Options:
--only {vocab,conjugations,confusables,plurals,complete} Run only one deck
--skip-scrape Skip list page scraping (use existing words.json)
--skip-detail Skip detail page scraping
--skip-audio Skip audio .mp3 downloads
--skip-examples Skip Ben Yehuda example fetching
--skip-images Skip image fetching for concrete nouns
--refresh-examples Force rebuild of Ben Yehuda index (delete old, download nikkud corpus)
--test N Process only the first N dictionary words (for quick testing)
"""
import argparse
import json
import logging
import re
import sys
from pathlib import Path
# Put this file's directory on the import path *before* importing any
# project-local module, so the imports below (and the function-scope imports
# further down) resolve even when this module is loaded from another directory.
sys.path.insert(0, str(Path(__file__).parent))

from helpers import strip_nikkud  # noqa: E402 - must follow the sys.path tweak

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)
logger = logging.getLogger(__name__)

# Directory layout, all resolved relative to this file.
DATA_DIR = Path(__file__).parent / "data"
OUTPUT_DIR = Path(__file__).parent / "output"
AUDIO_DIR = DATA_DIR / "audio"
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
FONTS_DIR = DATA_DIR / "fonts"
# Unified data store read by most pipeline steps.
WORDS_JSON = DATA_DIR / "words.json"
def parse_args():
    """Build the command-line parser and parse ``sys.argv``.

    Returns:
        The ``argparse.Namespace`` with ``only``, the ``skip_*`` switches,
        ``refresh_examples`` and ``test``.
    """
    parser = argparse.ArgumentParser(description="Pealim Anki deck builder")
    parser.add_argument(
        "--only",
        choices=["vocab", "conjugations", "confusables", "plurals", "complete"],
        help="Run only one deck (skips all unrelated steps)",
    )

    # All on/off switches share the same shape, so register them from a table.
    switches = (
        ("--skip-scrape", "Skip list page scraping"),
        ("--skip-detail", "Skip detail page scraping"),
        ("--skip-audio", "Skip audio downloads"),
        ("--skip-examples", "Skip Ben Yehuda example lookup"),
        ("--skip-images", "Skip image fetching"),
        ("--refresh-examples", "Force rebuild of Ben Yehuda index"),
    )
    for flag, description in switches:
        parser.add_argument(flag, action="store_true", help=description)

    parser.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
    return parser.parse_args()
def step_list_scrape(args):
    """Step 1 — scrape pealim.com list pages → words.json.

    With ``--skip-scrape`` nothing is scraped; the step only verifies that
    ``WORDS_JSON`` already exists and exits with status 1 if it does not.
    Otherwise ``pealim_list_scrape.run_scrape`` is called, with ``--test N``
    forwarded as ``total_pages``.
    """
    if not args.skip_scrape:
        logger.info("[1] Scraping dictionary list pages from pealim.com …")
        import pealim_list_scrape

        # A missing or zero --test value means "no page limit".
        pealim_list_scrape.run_scrape(total_pages=args.test or None, force_refresh=False)
        return

    if not WORDS_JSON.exists():
        logger.error(f"[1] --skip-scrape set but {WORDS_JSON} not found. Aborting.")
        sys.exit(1)
    logger.info("[1] Using existing words.json (--skip-scrape)")
def step_frequency() -> dict[str, int]:
    """Step 2 — load/download word frequency data.

    Returns:
        The ``_freq`` attribute of the ``frequency_lookup`` module after its
        ``load()`` has run.
    """
    logger.info("[2] Loading word frequency data …")
    import frequency_lookup as freq_source

    freq_source.load()
    return freq_source._freq
def _consonant_skeleton(ktiv_male: str) -> str:
    """Return *ktiv_male* passed through ``strip_nikkud`` with everything outside U+05D0–U+05EA removed."""
    return re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(ktiv_male))


def step_examples(args, _freq_cache: dict):
    """Step 3 — load/build Ben Yehuda example index.

    Args:
        args: Parsed command-line namespace; reads ``skip_examples``,
            ``refresh_examples`` and ``test``.
        _freq_cache: Accepted for call-site compatibility; not used here.

    Returns:
        With ``--skip-examples``: the parsed ``examples_cache.json`` if it
        exists, else ``{}``. Otherwise ``{}`` when words.json is missing, or
        ``benyehuda._examples_cache`` after pre-fetching and saving.
    """
    if args.skip_examples:
        logger.info("[3] Skipping examples (--skip-examples)")
        examples_path = DATA_DIR / "examples_cache.json"
        if examples_path.exists():
            # Explicit UTF-8, like every words.json read in this file; the
            # platform default encoding is locale-dependent.
            with open(examples_path, encoding="utf-8") as f:
                return json.load(f)
        return {}

    logger.info("[3] Loading Ben Yehuda example index …")
    import benyehuda

    benyehuda.load(force_rebuild=args.refresh_examples)

    # Read word list from words.json instead of CSV
    if not WORDS_JSON.exists():
        logger.warning("[3] words.json not found, skipping examples")
        return {}
    with open(WORDS_JSON, encoding="utf-8") as f:
        words = json.load(f)
    entries = list(words.values())
    if args.test:
        entries = entries[: args.test]

    # Compute (nikkud form, consonant skeleton) once per entry; the skeleton
    # is "" when the entry has no ktiv_male spelling.
    forms: list[tuple[str, str]] = []
    for entry in entries:
        word = entry.get("word", {})
        ktiv_male = word.get("ktiv_male", "")
        skeleton = _consonant_skeleton(ktiv_male) if ktiv_male else ""
        forms.append((word.get("nikkud", ""), skeleton))

    # A skeleton shared by more than one entry marks those words as confusable.
    consonant_counts: dict[str, int] = {}
    for _, skeleton in forms:
        if skeleton:
            consonant_counts[skeleton] = consonant_counts.get(skeleton, 0) + 1
    confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}

    # Delete stale cache entries for confusable words so they get re-fetched
    stale_deleted = 0
    for word_nikkud, skeleton in forms:
        if (
            word_nikkud
            and skeleton in confusable_consonants
            and word_nikkud in benyehuda._examples_cache
        ):
            del benyehuda._examples_cache[word_nikkud]
            stale_deleted += 1
    if stale_deleted:
        logger.info(f" Deleted {stale_deleted} stale confusable cache entries")

    logger.info(f" Pre-fetching examples for {len(entries)} words …")
    for word_nikkud, _ in forms:
        if word_nikkud:
            benyehuda.get_examples(word_nikkud, confusable_consonants=confusable_consonants)
    benyehuda.save_examples_cache()
    return benyehuda._examples_cache
def step_detail_scrape(args):
    """Step 4 — scrape detail pages for nouns and verbs → update words.json.

    Does nothing but log when ``--skip-detail`` is set; otherwise calls
    ``pealim_detail_scrape.run`` with ``--test N`` forwarded as ``test``.
    """
    if not args.skip_detail:
        logger.info("[4] Scraping detail pages from pealim.com …")
        import pealim_detail_scrape

        # A missing or zero --test value means "no limit".
        pealim_detail_scrape.run(test=args.test or None, force_refresh=False)
    else:
        logger.info("[4] Skipping detail scrape (--skip-detail)")
def step_audio_download(args):
    """Step 5 — download audio .mp3 files from URLs in words.json.

    Does nothing but log when ``--skip-audio`` is set; otherwise calls
    ``pealim_audio_download.run`` with ``--test N`` forwarded as ``test``.
    """
    if not args.skip_audio:
        logger.info("[5] Downloading audio files …")
        import pealim_audio_download

        # A missing or zero --test value means "no limit".
        pealim_audio_download.run(test=args.test or None)
    else:
        logger.info("[5] Skipping audio (--skip-audio)")
def _font_urls_by_weight(css_text: str) -> dict[int, str]:
    """Map each ``font-weight`` in *css_text* to the first ``src: url(...)`` declared for it."""
    urls: dict[int, str] = {}
    for face in re.findall(r"@font-face\s*\{(.*?)\}", css_text, flags=re.DOTALL):
        weight = re.search(r"font-weight:\s*(\d+)", face)
        source = re.search(r"src:\s*url\(([^)]+)\)", face)
        if weight and source:
            # Keep the first URL seen per weight.
            urls.setdefault(int(weight.group(1)), source.group(1).strip("'\""))
    return urls


def step_fonts(_args: argparse.Namespace):
    """Step 6 — download Heebo font files (one-time, cached).

    Writes ``_Heebo-Regular.ttf`` and ``_Heebo-Bold.ttf`` into ``FONTS_DIR``
    unless both already exist. Any failure is logged as a warning and
    swallowed so the rest of the pipeline can continue.

    Args:
        _args: Parsed command-line namespace; unused.
    """
    FONTS_DIR.mkdir(parents=True, exist_ok=True)
    regular = FONTS_DIR / "_Heebo-Regular.ttf"
    bold = FONTS_DIR / "_Heebo-Bold.ttf"
    if regular.exists() and bold.exists():
        logger.info("[6] Heebo fonts already cached")
        return

    logger.info("[6] Downloading Heebo fonts from Google Fonts …")
    import requests as _req

    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) Gecko/20100101 Firefox/120.0"}
    css_url = "https://fonts.googleapis.com/css2?family=Heebo:wght@400;700"
    try:
        css_resp = _req.get(css_url, headers=headers, timeout=15)
        css_resp.raise_for_status()
        css_text = css_resp.text

        # Choose each file by its declared font-weight rather than by position:
        # a stylesheet may carry several @font-face blocks for the same weight,
        # in which case "first two URLs" would not be Regular + Bold.
        by_weight = _font_urls_by_weight(css_text)
        # Positional order is kept as a fallback when no weight could be parsed.
        in_order = [u.strip("'\"") for u in re.findall(r"src:\s*url\(([^)]+)\)", css_text)]

        for position, (weight, dest) in enumerate(((400, regular), (700, bold))):
            if dest.exists():
                continue
            font_url = by_weight.get(weight)
            if font_url is None and position < len(in_order):
                font_url = in_order[position]
            if font_url is None:
                logger.warning(f" No font URL found for {dest.name}")
                continue
            fr = _req.get(font_url, timeout=15)
            fr.raise_for_status()
            dest.write_bytes(fr.content)
            logger.info(f" Downloaded → {dest.name}")
    except Exception as e:
        # Deliberately best-effort: a missing font must not abort the build.
        logger.warning(f" Heebo download failed: {e}")
        logger.warning(" Cards will fall back to Arial Hebrew / David.")
def step_images(args) -> dict:
    """Step 7 — fetch images for concrete nouns (resume-safe).

    Args:
        args: Parsed command-line namespace; reads ``skip_images`` and ``test``.

    Returns:
        With ``--skip-images``: the parsed ``image_cache.json`` if it exists,
        else ``{}``. Otherwise whatever ``image_fetch.run`` returns.
    """
    if args.skip_images:
        logger.info("[7] Skipping images (--skip-images)")
        cache_path = DATA_DIR / "image_cache.json"
        if cache_path.exists():
            # Explicit UTF-8, like the words.json reads in this file; the
            # platform default encoding is locale-dependent.
            with open(cache_path, encoding="utf-8") as f:
                return json.load(f)
        return {}

    logger.info("[7] Fetching images for concrete nouns …")
    import image_fetch

    return image_fetch.run(limit=args.test)
def step_build_all(args):
    """Step 8 — build all 12 release variants from the unified words.json.

    Exits with status 1 if ``WORDS_JSON`` does not exist; otherwise loads it
    and hands it to ``apkg_builder.build_all_variants`` with ``--test N`` as
    ``limit``.
    """
    logger.info("[8] Building all deck variants …")
    import apkg_builder

    if not WORDS_JSON.exists():
        logger.error(f"[8] {WORDS_JSON} not found. Run the data pipeline first.")
        sys.exit(1)

    word_data = json.loads(WORDS_JSON.read_text(encoding="utf-8"))
    apkg_builder.build_all_variants(word_data, limit=args.test)
def print_summary(_args: argparse.Namespace, examples_cache: dict, freq_cache: dict):
    """Log an end-of-run report of the data and output files currently on disk.

    Args:
        _args: Parsed command-line namespace; unused.
        examples_cache: Mapping whose truthy values count as "covered" words.
        freq_cache: Frequency mapping; only its size is reported.
    """
    rule = "=" * 60
    logger.info("")
    logger.info(rule)
    logger.info("SUMMARY")
    logger.info(rule)

    if WORDS_JSON.exists():
        with open(WORDS_JSON, encoding="utf-8") as f:
            words = json.load(f)
        logger.info(f" Dictionary words: {len(words)}")
        nouns = sum(1 for e in words.values() if e.get("pos", "").startswith("Noun"))
        verbs = sum(1 for e in words.values() if e.get("pos", "").startswith("Verb"))
        detail_scraped = sum(1 for e in words.values() if e.get("detail_scraped"))
        logger.info(f" Nouns: {nouns}, Verbs: {verbs}, Detail-scraped: {detail_scraped}")

    logger.info(f" Frequency entries: {len(freq_cache)}")
    logger.info(f" Example cache entries: {len(examples_cache)}")
    if examples_cache:
        # Only computed for a non-empty cache, which also avoids dividing by zero.
        covered = sum(1 for v in examples_cache.values() if v)
        logger.info(f" Example coverage: {covered}/{len(examples_cache)} ({100 * covered // len(examples_cache)}%)")

    if AUDIO_DIR.exists():
        mp3s = list(AUDIO_DIR.glob("*.mp3"))
        logger.info(f" Vocabulary audio files: {len(mp3s)}")
    if AUDIO_CONJ_DIR.exists():
        # Files ending in "_infinitive" or containing "_passive_" are excluded from the count.
        mp3s = [
            p for p in AUDIO_CONJ_DIR.glob("*.mp3") if not p.stem.endswith("_infinitive") and "_passive_" not in p.stem
        ]
        logger.info(f" Conjugation audio files (bundled): {len(mp3s)}")

    image_cache_path = DATA_DIR / "image_cache.json"
    if image_cache_path.exists():
        # Explicit UTF-8, like the words.json read above; the platform default
        # encoding is locale-dependent.
        with open(image_cache_path, encoding="utf-8") as f:
            ic = json.load(f)
        found_imgs = sum(1 for v in ic.values() if v)
        logger.info(f" Images: {found_imgs}/{len(ic)} nouns with images")

    import apkg_builder as _ab

    all_apkgs = [
        _ab.VOCAB_APKG,
        _ab.VOCAB_APKG_AUDIO,
        _ab.VOCAB_APKG_IMAGES,
        _ab.VOCAB_APKG_AUDIO_IMAGES,
        _ab.CONJ_APKG,
        _ab.CONJ_APKG_AUDIO,
        _ab.CONF_APKG,
        _ab.CONF_APKG_AUDIO,
        # The plural decks are written by main() for --only plurals, so
        # they are reported here as well.
        _ab.PLURAL_APKG,
        _ab.PLURAL_APKG_AUDIO,
        _ab.COMPLETE_APKG,
        _ab.COMPLETE_APKG_AUDIO,
    ]
    for apkg in all_apkgs:
        if apkg.exists():
            size_mb = apkg.stat().st_size / 1e6
            logger.info(f" {apkg.name}: {size_mb:.1f} MB")

    logger.info(rule)
    logger.info("DONE")
def main():
    """Entry point: parse options, then run one deck build or the full pipeline.

    For ``--only`` values conjugations / confusables / plurals / complete, only
    the fonts step and the matching ``apkg_builder`` build/write pair are run
    (once without and once with audio). Every other case runs all eight
    pipeline steps in order.
    """
    args = parse_args()
    logger.info("=" * 60)
    logger.info("PEALIM ANKI DECK BUILDER")
    if args.only:
        logger.info(f" MODE: --only {args.only}")
    if args.test:
        logger.info(f" TEST MODE: {args.test} words")
    if args.refresh_examples:
        logger.info(" REFRESH EXAMPLES: Ben Yehuda index will be rebuilt")
    logger.info("=" * 60)

    def _load_words_for_only() -> dict:
        """Load words.json for a single-deck build, exiting with status 1 if absent."""
        if WORDS_JSON.exists():
            with open(WORDS_JSON, encoding="utf-8") as f:
                return json.load(f)
        logger.error(f"words.json not found at {WORDS_JSON}. Run the data pipeline first.")
        sys.exit(1)

    # Attribute names on apkg_builder for each single-deck mode:
    # (deck builder, apkg writer, output path without audio, output path with audio).
    # NOTE(review): "vocab" has no entry, so `--only vocab` falls through to
    # the full pipeline below.
    single_deck_recipes = {
        "conjugations": ("build_conj_deck", "write_conj_apkg", "CONJ_APKG", "CONJ_APKG_AUDIO"),
        "confusables": ("build_confusables_deck", "write_conf_apkg", "CONF_APKG", "CONF_APKG_AUDIO"),
        "plurals": ("build_plural_deck", "write_plural_apkg", "PLURAL_APKG", "PLURAL_APKG_AUDIO"),
        "complete": ("build_complete_deck", "write_complete_apkg", "COMPLETE_APKG", "COMPLETE_APKG_AUDIO"),
    }
    recipe = single_deck_recipes.get(args.only)
    if recipe is not None:
        builder_name, writer_name, plain_attr, audio_attr = recipe
        step_fonts(args)
        import apkg_builder

        words = _load_words_for_only()
        # Only the complete deck takes the extra emoji lookup.
        extra_kwargs = {}
        if args.only == "complete":
            extra_kwargs["emoji_lookup"] = apkg_builder._load_emoji_lookup()

        variants = [
            (False, getattr(apkg_builder, plain_attr)),
            (True, getattr(apkg_builder, audio_attr)),
        ]
        for audio, path in variants:
            built, media = getattr(apkg_builder, builder_name)(words, include_audio=audio, **extra_kwargs)
            getattr(apkg_builder, writer_name)(built, media, out_path=path)
        print_summary(args, {}, {})
        return

    # Full pipeline
    step_list_scrape(args)
    freq_cache = step_frequency()
    examples_cache = step_examples(args, freq_cache)
    step_detail_scrape(args)
    step_audio_download(args)
    step_fonts(args)
    step_images(args)
    step_build_all(args)
    print_summary(args, examples_cache, freq_cache)
if __name__ == "__main__":
main()