Remove strip_nikkud from all pipeline files — use ktiv_male directly. Fix case-insensitive binyan matching in detail scraper (og:description uses UPPERCASE). Fix integration test slugs and test limits. Delete legacy CSVs, stale .apkg, and dead scripts from git. Add vulture to pre-commit hook. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
398 lines
14 KiB
Python
398 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Pealim Anki Deck Builder — full pipeline orchestrator.
|
|
|
|
Usage:
|
|
python run.py [options]
|
|
|
|
Deck selection:
|
|
--only {vocab,conjugations,confusables,plurals,complete} Run only one deck
|
|
Pipeline steps:
|
|
1. List scrape — scrape pealim.com list pages → words.json (captures slugs)
|
|
2. Detail scrape — scrape noun/verb detail pages using slugs → words.json
|
|
3. Frequency — load/download word frequency data
|
|
4. Examples — fetch Ben Yehuda example sentences
|
|
5. Audio download — download audio mp3 files
|
|
6. Fonts — download Heebo font files
|
|
7. Images — fetch noun images from Wikipedia
|
|
8. Build — build all .apkg deck variants
|
|
|
|
Options:
|
|
--skip-scrape Skip list page scraping (use existing words.json)
|
|
--skip-detail Skip detail page scraping
|
|
--skip-audio Skip audio .mp3 downloads
|
|
--skip-examples Skip Ben Yehuda example fetching
|
|
--skip-images Skip image fetching for concrete nouns
|
|
--refresh-examples Force rebuild of Ben Yehuda index
|
|
--test N Limit to first N words/pages
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Put this script's directory at the front of sys.path — presumably so the
# pipeline modules imported lazily inside the step functions resolve no matter
# which working directory the script is launched from.
sys.path.insert(0, str(Path(__file__).parent))

# Root logging configuration for the whole run: INFO level, timestamped lines.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)
logger = logging.getLogger(__name__)

# Filesystem layout, all relative to the directory containing this script.
DATA_DIR = Path(__file__).parent / "data"
OUTPUT_DIR = Path(__file__).parent / "output"
AUDIO_DIR = DATA_DIR / "audio"  # counted in the summary as "Vocabulary audio files"
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"  # counted in the summary as "Conjugation audio files"
FONTS_DIR = DATA_DIR / "fonts"  # Heebo font files are cached here
WORDS_JSON = DATA_DIR / "words.json"  # dictionary file read by most steps
|
|
|
|
|
|
def parse_args():
    """Parse the command line (``sys.argv``) into an ``argparse.Namespace``.

    Defines ``--only`` (deck selection), a set of boolean switches
    (``--skip-*`` and ``--refresh-examples``) and the integer ``--test`` limit.
    """
    parser = argparse.ArgumentParser(description="Pealim Anki deck builder")
    parser.add_argument(
        "--only",
        choices=["vocab", "conjugations", "confusables", "plurals", "complete"],
        help="Run only one deck (skips all unrelated steps)",
    )

    # Every boolean switch shares the same shape, so register them from a table.
    # The order is kept so the generated --help output is unchanged.
    boolean_switches = (
        ("--skip-scrape", "Skip list page scraping"),
        ("--skip-detail", "Skip detail page scraping"),
        ("--skip-audio", "Skip audio downloads"),
        ("--skip-examples", "Skip Ben Yehuda example lookup"),
        ("--skip-images", "Skip image fetching"),
        ("--refresh-examples", "Force rebuild of Ben Yehuda index"),
    )
    for flag, description in boolean_switches:
        parser.add_argument(flag, action="store_true", help=description)

    parser.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
    return parser.parse_args()
|
|
|
|
|
|
def step_list_scrape(args):
    """Step 1 — scrape pealim.com list pages → words.json.

    With ``--skip-scrape`` nothing is scraped: an existing ``words.json`` is
    required, and the process exits with status 1 when it is absent.
    Otherwise ``--test N`` is forwarded to the scraper as the page limit.
    """
    if not args.skip_scrape:
        logger.info("[1] Scraping dictionary list pages from pealim.com …")
        import pealim_list_scrape

        # A missing or zero --test value means "no page limit".
        page_limit = args.test or None
        pealim_list_scrape.run_scrape(total_pages=page_limit, force_refresh=False)
        return

    if not WORDS_JSON.exists():
        logger.error(f"[1] --skip-scrape set but {WORDS_JSON} not found. Aborting.")
        sys.exit(1)
    else:
        logger.info("[1] Using existing words.json (--skip-scrape)")
|
|
|
|
|
|
def step_frequency() -> dict[str, int]:
    """Step 3 — load/download word frequency data.

    Returns the ``_freq`` mapping held by the ``frequency_lookup`` module once
    its ``load()`` has run.
    """
    logger.info("[3] Loading word frequency data …")
    import frequency_lookup as freq_source

    freq_source.load()
    table = freq_source._freq
    return table
|
|
|
|
|
|
def step_examples(args, _freq_cache: dict):
    """Step 4 — load/build the Ben Yehuda example index and pre-fetch examples.

    Args:
        args: Parsed CLI namespace; reads ``skip_examples``,
            ``refresh_examples`` and ``test``.
        _freq_cache: Unused by this step.

    Returns:
        With ``--skip-examples``: the parsed ``data/examples_cache.json`` if
        that file exists, otherwise ``{}``. ``{}`` as well when ``words.json``
        is missing. In the normal case, ``benyehuda._examples_cache`` after
        the pre-fetch has run and the cache has been saved.
    """
    if args.skip_examples:
        logger.info("[4] Skipping examples (--skip-examples)")
        examples_path = DATA_DIR / "examples_cache.json"
        if examples_path.exists():
            # Decode explicitly as UTF-8 (as words.json is) instead of relying
            # on the platform's locale encoding, which is not UTF-8 everywhere.
            with open(examples_path, encoding="utf-8") as f:
                return json.load(f)
        return {}

    logger.info("[4] Loading Ben Yehuda example index …")
    import benyehuda

    benyehuda.load(force_rebuild=args.refresh_examples)

    # Read word list from words.json instead of CSV
    if not WORDS_JSON.exists():
        logger.warning("[4] words.json not found, skipping examples")
        return {}

    with open(WORDS_JSON, encoding="utf-8") as f:
        words = json.load(f)

    entries = list(words.values())
    if args.test:
        entries = entries[: args.test]

    # Everything outside the Hebrew letter range U+05D0–U+05EA is dropped.
    non_hebrew_letters = re.compile(r"[^\u05d0-\u05ea]")

    def _letters_only(entry) -> str:
        """Return the entry's ktiv_male reduced to Hebrew letters ('' if absent)."""
        ktiv_male = entry.get("word", {}).get("ktiv_male", "")
        return non_hebrew_letters.sub("", ktiv_male) if ktiv_male else ""

    # Build confusable consonant set from words.json: a letter string shared
    # by more than one entry is treated as confusable.
    consonant_counts: dict[str, int] = {}
    for entry in entries:
        safe = _letters_only(entry)
        if safe:
            consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
    confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}

    # Delete stale cache entries for confusable words so they get re-fetched
    stale_deleted = 0
    for entry in entries:
        word_nikkud = entry.get("word", {}).get("nikkud", "")
        if not word_nikkud:
            continue
        # An empty letter string is never in the set, so entries without
        # ktiv_male are skipped here automatically.
        if _letters_only(entry) in confusable_consonants and word_nikkud in benyehuda._examples_cache:
            del benyehuda._examples_cache[word_nikkud]
            stale_deleted += 1
    if stale_deleted:
        logger.info(f" Deleted {stale_deleted} stale confusable cache entries")

    logger.info(f" Pre-fetching examples for {len(entries)} words …")
    for entry in entries:
        word_nikkud = entry.get("word", {}).get("nikkud", "")
        if word_nikkud:
            benyehuda.get_examples(word_nikkud, confusable_consonants=confusable_consonants)

    benyehuda.save_examples_cache()
    return benyehuda._examples_cache
|
|
|
|
|
|
def step_detail_scrape(args):
    """Step 2 — scrape detail pages for nouns and verbs → update words.json.

    Does nothing except log when ``--skip-detail`` is set; otherwise hands the
    optional ``--test`` limit to ``pealim_detail_scrape.run``.
    """
    if not args.skip_detail:
        logger.info("[2] Scraping detail pages from pealim.com …")
        import pealim_detail_scrape

        # A missing or zero --test value means "no limit".
        word_limit = args.test or None
        pealim_detail_scrape.run(test=word_limit, force_refresh=False)
    else:
        logger.info("[2] Skipping detail scrape (--skip-detail)")
|
|
|
|
|
|
def step_audio_download(args):
    """Step 5 — download audio .mp3 files from URLs in words.json.

    Does nothing except log when ``--skip-audio`` is set; otherwise hands the
    optional ``--test`` limit to ``pealim_audio_download.run``.
    """
    if not args.skip_audio:
        logger.info("[5] Downloading audio files …")

        import pealim_audio_download

        # A missing or zero --test value means "no limit".
        word_limit = args.test or None
        pealim_audio_download.run(test=word_limit)
    else:
        logger.info("[5] Skipping audio (--skip-audio)")
|
|
|
|
|
|
def step_fonts(_args: argparse.Namespace):
    """Step 6 — download Heebo font files (one-time, cached).

    Creates the fonts directory if needed and returns immediately when both
    ``_Heebo-Regular.ttf`` and ``_Heebo-Bold.ttf`` are already present.
    Otherwise the Google Fonts CSS is fetched and the first two font URLs it
    lists are saved as the regular and bold files (files that already exist
    are left alone).

    This step is best-effort: any failure is logged as a warning and the
    function returns normally.

    Args:
        _args: Parsed CLI namespace; unused.
    """
    FONTS_DIR.mkdir(parents=True, exist_ok=True)
    regular = FONTS_DIR / "_Heebo-Regular.ttf"
    bold = FONTS_DIR / "_Heebo-Bold.ttf"

    if regular.exists() and bold.exists():
        logger.info("[6] Heebo fonts already cached")
        return

    logger.info("[6] Downloading Heebo fonts from Google Fonts …")

    import requests as _req

    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) Gecko/20100101 Firefox/120.0"}
    css_url = "https://fonts.googleapis.com/css2?family=Heebo:wght@400;700"
    try:
        css_resp = _req.get(css_url, headers=headers, timeout=15)
        css_resp.raise_for_status()
        css_text = css_resp.text

        font_urls = re.findall(r"src:\s*url\(([^)]+)\)", css_text)

        # NOTE(review): this assumes the first two src URLs in the CSS are the
        # 400 and 700 weights, in that order. Google Fonts can emit several
        # @font-face blocks per weight (one per unicode subset) — confirm the
        # saved files really are the regular and bold faces.
        for dest, fu in zip((regular, bold), font_urls):
            fu = fu.strip("'\"")
            if dest.exists():
                continue
            fr = _req.get(fu, timeout=15)
            fr.raise_for_status()
            dest.write_bytes(fr.content)
            logger.info(f" Downloaded → {dest.name}")

        # Previously a CSS response with fewer than two URLs went unreported.
        if len(font_urls) < 2:
            logger.warning(
                f" Google Fonts CSS listed {len(font_urls)} font URL(s), expected at least 2;"
                " some Heebo files may be missing."
            )

    except Exception as e:
        # Deliberately broad: fonts are optional, so never abort the pipeline.
        logger.warning(f" Heebo download failed: {e}")
        logger.warning(" Cards will fall back to Arial Hebrew / David.")
|
|
|
|
|
|
def step_images(args) -> dict:
    """Step 7 — fetch images for concrete nouns (resume-safe).

    Args:
        args: Parsed CLI namespace; reads ``skip_images`` and ``test``.

    Returns:
        With ``--skip-images``: the parsed ``data/image_cache.json`` if it
        exists, otherwise ``{}``. In the normal case, whatever
        ``image_fetch.run`` returns.
    """
    if args.skip_images:
        logger.info("[7] Skipping images (--skip-images)")
        cache_path = DATA_DIR / "image_cache.json"
        if cache_path.exists():
            # Decode explicitly as UTF-8 (as words.json is) instead of relying
            # on the platform's locale encoding.
            with open(cache_path, encoding="utf-8") as f:
                return json.load(f)
        return {}

    # --test is passed through unchanged (None when the flag is absent).
    limit = args.test
    logger.info("[7] Fetching images for concrete nouns …")
    import image_fetch

    return image_fetch.run(limit=limit)
|
|
|
|
|
|
def step_build_all(args):
    """Step 8 — build every release variant from the unified words.json.

    Exits with status 1 when ``words.json`` is missing; otherwise loads it and
    delegates to ``apkg_builder.build_all_variants`` with ``--test`` as limit.
    """
    logger.info("[8] Building all deck variants …")
    import apkg_builder

    if WORDS_JSON.exists():
        with WORDS_JSON.open(encoding="utf-8") as handle:
            dictionary = json.load(handle)
        apkg_builder.build_all_variants(dictionary, limit=args.test)
        return

    logger.error(f"[8] {WORDS_JSON} not found. Run the data pipeline first.")
    sys.exit(1)
|
|
|
|
|
|
def print_summary(_args: argparse.Namespace, examples_cache: dict, freq_cache: dict):
    """Log an end-of-run summary of the data files and built packages.

    Reports, for whatever exists on disk: dictionary size and part-of-speech
    counts from ``words.json``, frequency and example cache sizes, audio file
    counts, image cache coverage, and the size of each built ``.apkg``.

    Args:
        _args: Parsed CLI namespace; unused.
        examples_cache: Mapping whose truthy values count as "covered".
        freq_cache: Mapping whose length is reported as frequency entries.
    """
    logger.info("")
    logger.info("=" * 60)
    logger.info("SUMMARY")
    logger.info("=" * 60)

    if WORDS_JSON.exists():
        with open(WORDS_JSON, encoding="utf-8") as f:
            words = json.load(f)
        logger.info(f" Dictionary words: {len(words)}")

        nouns = sum(1 for e in words.values() if e.get("pos", "").startswith("Noun"))
        verbs = sum(1 for e in words.values() if e.get("pos", "").startswith("Verb"))
        detail_scraped = sum(1 for e in words.values() if e.get("detail_scraped"))
        logger.info(f" Nouns: {nouns}, Verbs: {verbs}, Detail-scraped: {detail_scraped}")

    logger.info(f" Frequency entries: {len(freq_cache)}")
    logger.info(f" Example cache entries: {len(examples_cache)}")
    if examples_cache:
        # Only computed for a non-empty cache, which also avoids dividing by zero.
        covered = sum(1 for v in examples_cache.values() if v)
        logger.info(f" Example coverage: {covered}/{len(examples_cache)} ({100 * covered // len(examples_cache)}%)")

    if AUDIO_DIR.exists():
        mp3s = list(AUDIO_DIR.glob("*.mp3"))
        logger.info(f" Vocabulary audio files: {len(mp3s)}")

    if AUDIO_CONJ_DIR.exists():
        # Infinitive and passive recordings are left out of the "bundled" count.
        mp3s = [
            p for p in AUDIO_CONJ_DIR.glob("*.mp3") if not p.stem.endswith("_infinitive") and "_passive_" not in p.stem
        ]
        logger.info(f" Conjugation audio files (bundled): {len(mp3s)}")

    image_cache_path = DATA_DIR / "image_cache.json"
    if image_cache_path.exists():
        # Decode explicitly as UTF-8 (as words.json is) instead of relying on
        # the platform's locale encoding.
        with open(image_cache_path, encoding="utf-8") as f:
            ic = json.load(f)
        found_imgs = sum(1 for v in ic.values() if v)
        logger.info(f" Images: {found_imgs}/{len(ic)} nouns with images")

    import apkg_builder as _ab

    all_apkgs = [
        _ab.VOCAB_APKG,
        _ab.VOCAB_APKG_AUDIO,
        _ab.VOCAB_APKG_IMAGES,
        _ab.VOCAB_APKG_AUDIO_IMAGES,
        _ab.CONJ_APKG,
        _ab.CONJ_APKG_AUDIO,
        _ab.CONF_APKG,
        _ab.CONF_APKG_AUDIO,
        # The plural packages are built by `--only plurals` but were missing
        # from this listing, so they never showed up in the summary.
        _ab.PLURAL_APKG,
        _ab.PLURAL_APKG_AUDIO,
        _ab.COMPLETE_APKG,
        _ab.COMPLETE_APKG_AUDIO,
    ]
    for apkg in all_apkgs:
        if apkg.exists():
            size_mb = apkg.stat().st_size / 1e6
            logger.info(f" {apkg.name}: {size_mb:.1f} MB")

    logger.info("=" * 60)
    logger.info("DONE")
|
|
|
|
|
|
def main():
    """Entry point: parse CLI flags, then build one deck or run the full pipeline.

    ``--only conjugations|confusables|plurals|complete`` downloads fonts, loads
    the existing ``words.json`` and builds just that deck in two variants
    (without and with audio), then prints the summary with empty caches.
    Any other invocation runs pipeline steps 1–8 in order and prints the
    summary with the real example and frequency caches.
    """
    args = parse_args()

    logger.info("=" * 60)
    logger.info("PEALIM ANKI DECK BUILDER")
    if args.only:
        logger.info(f" MODE: --only {args.only}")
    if args.test:
        logger.info(f" TEST MODE: {args.test} words")
    if args.refresh_examples:
        logger.info(" REFRESH EXAMPLES: Ben Yehuda index will be rebuilt")
    logger.info("=" * 60)

    def _load_words_for_only() -> dict:
        """Return the parsed words.json, exiting with status 1 if it is missing."""
        if not WORDS_JSON.exists():
            logger.error(f"words.json not found at {WORDS_JSON}. Run the data pipeline first.")
            sys.exit(1)
        with open(WORDS_JSON, encoding="utf-8") as f:
            return json.load(f)

    def _build_variants(words, build, write, plain_path, audio_path, **build_kwargs):
        """Build and write the no-audio and with-audio variants of one deck."""
        for include_audio, out_path in ((False, plain_path), (True, audio_path)):
            deck, media = build(words, include_audio=include_audio, **build_kwargs)
            write(deck, media, out_path=out_path)

    if args.only in ("conjugations", "confusables", "plurals", "complete"):
        # All single-deck modes share the same preparation and wrap-up; only
        # the builder, writer and output paths differ.
        step_fonts(args)
        import apkg_builder as ab

        words = _load_words_for_only()
        if args.only == "conjugations":
            _build_variants(words, ab.build_conj_deck, ab.write_conj_apkg, ab.CONJ_APKG, ab.CONJ_APKG_AUDIO)
        elif args.only == "confusables":
            _build_variants(words, ab.build_confusables_deck, ab.write_conf_apkg, ab.CONF_APKG, ab.CONF_APKG_AUDIO)
        elif args.only == "plurals":
            _build_variants(words, ab.build_plural_deck, ab.write_plural_apkg, ab.PLURAL_APKG, ab.PLURAL_APKG_AUDIO)
        else:  # "complete" additionally needs the emoji lookup
            _build_variants(
                words,
                ab.build_complete_deck,
                ab.write_complete_apkg,
                ab.COMPLETE_APKG,
                ab.COMPLETE_APKG_AUDIO,
                emoji_lookup=ab._load_emoji_lookup(),
            )
        print_summary(args, {}, {})
        return

    if args.only == "vocab":
        # "vocab" is an accepted --only choice but has no single-deck path, so
        # it runs the full pipeline. Say so instead of doing it silently.
        # NOTE(review): add a dedicated vocab path if apkg_builder exposes one.
        logger.warning(" --only vocab has no dedicated build path; running the full pipeline")

    # Full pipeline
    step_list_scrape(args)  # 1 — scrape list pages → words.json (captures slugs)
    step_detail_scrape(args)  # 2 — scrape detail pages using slugs → words.json
    freq_cache = step_frequency()  # 3 — word frequency data
    examples_cache = step_examples(args, _freq_cache=freq_cache)  # 4 — Ben Yehuda examples
    step_audio_download(args)  # 5 — download audio mp3s
    step_fonts(args)  # 6 — download Heebo fonts
    step_images(args)  # 7 — fetch noun images
    step_build_all(args)  # 8 — build all .apkg variants

    print_summary(args, examples_cache, freq_cache)
|
|
|
|
|
|
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|