Template & CSS fixes (15 items from Mar 9 feedback): - Fix conjugation front showing 3ms form instead of infinitive - Rename conjugation model to "Hebrew Conjugation" - Strip Hebrew parenthesized text from English meanings - Shoresh separator: spaces → dots (א.כ.ל) - Remove duplicate English meaning from cloze back - Remove example sentences from vocab front/back (cloze only) - Center-align audio buttons on all decks - Fix parenthesis spacing: "you(feminine,singular)" → "you (feminine, singular)" - Unify sec-key/sec-label fonts, make keys bold - Size overhaul: bigger Hebrew (42px), meaning (34px), secondary (28px) - Center-align related words groups - Sort confusables by average frequency - Plurals: show Gender (Hebrew) before Mishkal, strip emoji from meaning - Clean duplicate quotation marks in cloze sentences Sprint 12 carry-forward (detail scrape + EPUB): - Adjective/preposition detail scraping in pealim_detail_scrape.py - EPUB example matching rewrite in epub_examples.py - Delete benyehuda.py and rebuild_sentence_matches.py (merged) - 49 parser tests for detail scraping - SCHEMA.yaml updates for new fields Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
362 lines
12 KiB
Python
362 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Pealim Anki Deck Builder — full pipeline orchestrator.
|
|
|
|
Usage:
|
|
python run.py [options]
|
|
|
|
Deck selection:
|
|
--only {vocab,conjugations,confusables,plurals,complete} Run only one deck
|
|
Pipeline steps:
|
|
1. List scrape — scrape pealim.com list pages → words.json (captures slugs)
|
|
2. Detail scrape — scrape noun/verb detail pages using slugs → words.json
|
|
3. Frequency — load/download word frequency data
|
|
4. Examples — extract example sentences from Hebrew EPUBs
|
|
5. Audio download — download audio mp3 files
|
|
6. Fonts — download Heebo font files
|
|
7. Images — fetch noun images from Wikipedia
|
|
8. Build — build all .apkg deck variants
|
|
|
|
Options:
|
|
--skip-scrape Skip list page scraping (use existing words.json)
|
|
--skip-detail Skip detail page scraping
|
|
--skip-audio Skip audio .mp3 downloads
|
|
--skip-examples Skip EPUB example extraction
|
|
--skip-images Skip image fetching for concrete nouns
|
|
--test N Limit to first N words/pages
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import re
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Put this script's directory at the front of sys.path so the pipeline
# modules imported lazily inside the step functions resolve from here first.
sys.path[:0] = [str(Path(__file__).parent)]

# One root-logger configuration for the whole run: INFO level, timestamped.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Filesystem layout, all anchored at the directory containing this script.
DATA_DIR = Path(__file__).parent.joinpath("data")
OUTPUT_DIR = Path(__file__).parent.joinpath("output")
AUDIO_DIR = DATA_DIR.joinpath("audio")
AUDIO_CONJ_DIR = DATA_DIR.joinpath("audio_conj")
FONTS_DIR = DATA_DIR.joinpath("fonts")
WORDS_JSON = DATA_DIR.joinpath("words.json")
|
|
|
|
|
|
def parse_args(argv=None):
    """Parse the command-line options of the deck builder.

    Args:
        argv: Optional list of argument strings to parse. When ``None`` (the
            default) argparse reads ``sys.argv[1:]``, so existing callers that
            pass nothing behave exactly as before.

    Returns:
        An ``argparse.Namespace`` with ``only`` (one of the deck names or
        ``None``), the booleans ``skip_scrape``, ``skip_detail``,
        ``skip_audio``, ``skip_examples``, ``skip_images``, and ``test``
        (an ``int`` or ``None``).
    """
    p = argparse.ArgumentParser(description="Pealim Anki deck builder")
    p.add_argument(
        "--only",
        choices=["vocab", "conjugations", "confusables", "plurals", "complete"],
        help="Run only one deck (skips all unrelated steps)",
    )
    p.add_argument("--skip-scrape", action="store_true", help="Skip list page scraping")
    p.add_argument("--skip-detail", action="store_true", help="Skip detail page scraping")
    p.add_argument("--skip-audio", action="store_true", help="Skip audio downloads")
    p.add_argument("--skip-examples", action="store_true", help="Skip EPUB example extraction")
    p.add_argument("--skip-images", action="store_true", help="Skip image fetching")
    p.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
    return p.parse_args(argv)
|
|
|
|
|
|
def step_list_scrape(args):
    """Step 1 — run the pealim.com list-page scraper, or reuse words.json.

    With ``--skip-scrape`` nothing is scraped; the process exits with status 1
    if ``WORDS_JSON`` is missing, because there is nothing to reuse. Otherwise
    ``pealim_list_scrape.run_scrape`` is called, with ``--test N`` passed on
    as the page limit.
    """
    if not args.skip_scrape:
        logger.info("[1] Scraping dictionary list pages from pealim.com …")
        import pealim_list_scrape as list_scraper

        # A missing or zero --test value means "no page limit".
        page_cap = args.test or None
        list_scraper.run_scrape(total_pages=page_cap, force_refresh=False)
        return

    if not WORDS_JSON.exists():
        logger.error(f"[1] --skip-scrape set but {WORDS_JSON} not found. Aborting.")
        sys.exit(1)
    logger.info("[1] Using existing words.json (--skip-scrape)")
|
|
|
|
|
|
def step_frequency() -> dict[str, int]:
    """Step 3 — call ``frequency_lookup.load()`` and return its frequency table."""
    logger.info("[3] Loading word frequency data …")
    import frequency_lookup as freq_module

    freq_module.load()
    # Read the attribute only after load() has run, and through the module
    # object, so whatever load() assigned is what gets returned.
    # NOTE(review): this reaches into the module's private ``_freq``.
    table = freq_module._freq
    return table
|
|
|
|
|
|
def step_examples(args) -> dict:
    """Step 4 — extract example sentences from Hebrew EPUBs.

    Loads ``words.json``, passes the loaded object to ``epub_examples.run``
    and writes it back to disk afterwards.

    Args:
        args: Parsed CLI namespace; only ``skip_examples`` is read.

    Returns:
        The stats object returned by ``epub_examples.run`` (its ``matched``
        and ``total_vocab`` entries are logged), or ``{}`` when the step is
        skipped or ``words.json`` does not exist.
    """
    if args.skip_examples:
        logger.info("[4] Skipping examples (--skip-examples)")
        return {}

    logger.info("[4] Extracting EPUB example sentences …")
    import epub_examples

    if not WORDS_JSON.exists():
        logger.warning("[4] words.json not found, skipping examples")
        return {}

    with open(WORDS_JSON, encoding="utf-8") as f:
        words = json.load(f)

    stats = epub_examples.run(words)

    # Save updated words.json. Write to a sibling temp file and swap it in so
    # an interrupted run cannot leave a truncated words.json behind.
    tmp_path = WORDS_JSON.with_name(WORDS_JSON.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(words, f, ensure_ascii=False, indent=2)
    tmp_path.replace(WORDS_JSON)

    logger.info(f" Coverage: {stats['matched']}/{stats['total_vocab']}")
    return stats
|
|
|
|
|
|
def step_detail_scrape(args):
    """Step 2 — run the pealim.com detail-page scraper unless ``--skip-detail``.

    ``--test N`` is forwarded to ``pealim_detail_scrape.run`` as ``test``;
    a missing or zero value is forwarded as ``None``.
    """
    if not args.skip_detail:
        logger.info("[2] Scraping detail pages from pealim.com …")
        import pealim_detail_scrape as detail_scraper

        detail_scraper.run(test=args.test or None, force_refresh=False)
    else:
        logger.info("[2] Skipping detail scrape (--skip-detail)")
|
|
|
|
|
|
def step_audio_download(args):
    """Step 5 — run the audio downloader unless ``--skip-audio`` is set.

    ``--test N`` is forwarded to ``pealim_audio_download.run`` as ``test``;
    a missing or zero value is forwarded as ``None``.
    """
    if not args.skip_audio:
        logger.info("[5] Downloading audio files …")

        import pealim_audio_download as audio_downloader

        audio_downloader.run(test=args.test or None)
    else:
        logger.info("[5] Skipping audio (--skip-audio)")
|
|
|
|
|
|
def step_fonts(_args: argparse.Namespace):
    """Step 6 — download the Heebo Regular and Bold font files if not cached.

    The two files are stored in ``FONTS_DIR``; when both already exist the
    function returns without touching the network. Any failure is logged as
    a warning and swallowed, so the rest of the pipeline still runs.

    Args:
        _args: Parsed CLI namespace. Unused; accepted so every step shares
            the same call shape.
    """
    FONTS_DIR.mkdir(parents=True, exist_ok=True)
    regular = FONTS_DIR / "_Heebo-Regular.ttf"
    bold = FONTS_DIR / "_Heebo-Bold.ttf"

    if regular.exists() and bold.exists():
        logger.info("[6] Heebo fonts already cached")
        return

    logger.info("[6] Downloading Heebo fonts from Google Fonts …")

    import requests as _req

    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) Gecko/20100101 Firefox/120.0"}
    css_url = "https://fonts.googleapis.com/css2?family=Heebo:wght@400;700"
    try:
        css_resp = _req.get(css_url, headers=headers, timeout=15)
        css_resp.raise_for_status()
        css_text = css_resp.text

        # Pair each file with the URL declared for its weight. Picking the
        # first two URLs by position is only right when the stylesheet has
        # exactly one @font-face per weight, in Regular-then-Bold order.
        by_weight = _heebo_urls_by_weight(css_text)
        if "400" in by_weight and "700" in by_weight:
            targets = [(regular, by_weight["400"]), (bold, by_weight["700"])]
        else:
            # Weights could not be read from the CSS: fall back to the
            # positional rule (first URL = regular, second = bold).
            font_urls = re.findall(r"src:\s*url\(([^)]+)\)", css_text)
            targets = [(dest, fu.strip("'\"")) for dest, fu in zip((regular, bold), font_urls)]

        if len(targets) < 2:
            logger.warning(f" Heebo stylesheet listed {len(targets)} font URL(s); expected 2.")

        # NOTE(review): the payload is stored under a .ttf name whatever
        # format the stylesheet points at — confirm the deck builder copes.
        for dest, fu in targets:
            if dest.exists():
                continue
            fr = _req.get(fu, timeout=15)
            fr.raise_for_status()
            dest.write_bytes(fr.content)
            logger.info(f" Downloaded → {dest.name}")

    except Exception as e:
        # Deliberately broad: fonts are optional, so a failure here must not
        # abort the run.
        logger.warning(f" Heebo download failed: {e}")
        logger.warning(" Cards will fall back to Arial Hebrew / David.")


def _heebo_urls_by_weight(css_text: str) -> dict[str, str]:
    """Map each ``font-weight`` found in *css_text* to its first ``src`` URL."""
    by_weight: dict[str, str] = {}
    for face in re.findall(r"@font-face\s*\{([^}]*)\}", css_text):
        weight = re.search(r"font-weight:\s*(\d+)", face)
        src = re.search(r"src:\s*url\(([^)]+)\)", face)
        if weight and src:
            # setdefault keeps the first block seen for a given weight.
            by_weight.setdefault(weight.group(1), src.group(1).strip("'\""))
    return by_weight
|
|
|
|
|
|
def step_images(args) -> dict:
    """Step 7 — fetch images for concrete nouns, or reuse the cached results.

    Args:
        args: Parsed CLI namespace; ``skip_images`` and ``test`` are read.

    Returns:
        With ``--skip-images``: the parsed contents of
        ``DATA_DIR / "image_cache.json"`` if that file exists, else ``{}``.
        Otherwise whatever ``image_fetch.run`` returns, with ``--test N``
        passed on as ``limit``.
    """
    if args.skip_images:
        logger.info("[7] Skipping images (--skip-images)")
        cache_path = DATA_DIR / "image_cache.json"
        if cache_path.exists():
            # Explicit UTF-8, like every other JSON read in this file, so
            # the result does not depend on the platform's locale encoding.
            with open(cache_path, encoding="utf-8") as f:
                return json.load(f)
        return {}

    limit = args.test
    logger.info("[7] Fetching images for concrete nouns …")
    import image_fetch

    return image_fetch.run(limit=limit)
|
|
|
|
|
|
def step_build_all(args):
    """Step 8 — hand words.json to ``apkg_builder.build_all_variants``.

    Exits with status 1 when ``WORDS_JSON`` does not exist. ``--test N`` is
    forwarded unchanged as ``limit``.
    """
    logger.info("[8] Building all deck variants …")
    import apkg_builder as builder

    if WORDS_JSON.exists():
        vocabulary = json.loads(WORDS_JSON.read_text(encoding="utf-8"))
        builder.build_all_variants(vocabulary, limit=args.test)
        return

    logger.error(f"[8] {WORDS_JSON} not found. Run the data pipeline first.")
    sys.exit(1)
|
|
|
|
|
|
def print_summary(_args: argparse.Namespace, example_stats: dict, freq_cache: dict):
    """Log an end-of-run report of the data and deck files found on disk.

    Args:
        _args: Parsed CLI namespace. Unused.
        example_stats: Stats from the examples step; ``matched``,
            ``total_vocab`` and ``books`` are read, each optional.
        freq_cache: Frequency table; only its length is reported.
    """
    logger.info("")
    logger.info("=" * 60)
    logger.info("SUMMARY")
    logger.info("=" * 60)

    if WORDS_JSON.exists():
        with open(WORDS_JSON, encoding="utf-8") as f:
            words = json.load(f)
        logger.info(f" Dictionary words: {len(words)}")

        # ``or ""`` also covers entries whose "pos" is present but null.
        nouns = sum(1 for e in words.values() if (e.get("pos") or "").startswith("Noun"))
        verbs = sum(1 for e in words.values() if (e.get("pos") or "").startswith("Verb"))
        detail_scraped = sum(1 for e in words.values() if e.get("detail_scraped"))
        logger.info(f" Nouns: {nouns}, Verbs: {verbs}, Detail-scraped: {detail_scraped}")

    logger.info(f" Frequency entries: {len(freq_cache)}")
    matched = example_stats.get("matched", 0)
    total = example_stats.get("total_vocab", 0)
    if total:
        logger.info(f" Example coverage: {matched}/{total} ({100 * matched // total}%)")
        for book, count in example_stats.get("books", {}).items():
            logger.info(f" {book}: {count} sentences")

    if AUDIO_DIR.exists():
        mp3s = list(AUDIO_DIR.glob("*.mp3"))
        logger.info(f" Vocabulary audio files: {len(mp3s)}")

    if AUDIO_CONJ_DIR.exists():
        # Files whose stem ends in "_infinitive" or contains "_passive_" are
        # left out of the "bundled" count.
        mp3s = [
            p for p in AUDIO_CONJ_DIR.glob("*.mp3") if not p.stem.endswith("_infinitive") and "_passive_" not in p.stem
        ]
        logger.info(f" Conjugation audio files (bundled): {len(mp3s)}")

    image_cache_path = DATA_DIR / "image_cache.json"
    if image_cache_path.exists():
        # Explicit UTF-8, matching the other JSON reads in this file.
        with open(image_cache_path, encoding="utf-8") as f:
            ic = json.load(f)
        found_imgs = sum(1 for v in ic.values() if v)
        logger.info(f" Images: {found_imgs}/{len(ic)} nouns with images")

    import apkg_builder as _ab

    all_apkgs = [
        _ab.VOCAB_APKG,
        _ab.VOCAB_APKG_AUDIO,
        _ab.VOCAB_APKG_IMAGES,
        _ab.VOCAB_APKG_AUDIO_IMAGES,
        _ab.CONJ_APKG,
        _ab.CONJ_APKG_AUDIO,
        _ab.CONF_APKG,
        _ab.CONF_APKG_AUDIO,
        # The plural decks are written by ``--only plurals`` and were missing
        # from this listing.
        _ab.PLURAL_APKG,
        _ab.PLURAL_APKG_AUDIO,
        _ab.COMPLETE_APKG,
        _ab.COMPLETE_APKG_AUDIO,
    ]
    for apkg in all_apkgs:
        if apkg.exists():
            size_mb = apkg.stat().st_size / 1e6
            logger.info(f" {apkg.name}: {size_mb:.1f} MB")

    logger.info("=" * 60)
    logger.info("DONE")
|
|
|
|
|
|
def main():
    """Entry point: parse the CLI, then run one deck build or the full pipeline.

    ``--only conjugations|confusables|plurals|complete`` makes sure the fonts
    are present, loads the existing ``words.json`` and builds just that deck,
    once without and once with audio. Anything else runs steps 1–8 in order
    and finishes with the summary.
    """
    args = parse_args()

    logger.info("=" * 60)
    logger.info("PEALIM ANKI DECK BUILDER")
    if args.only:
        logger.info(f" MODE: --only {args.only}")
    if args.test:
        logger.info(f" TEST MODE: {args.test} words")
    logger.info("=" * 60)

    def _load_words_for_only() -> dict:
        """Load words.json for an ``--only`` build; exit(1) if it is missing."""
        if not WORDS_JSON.exists():
            logger.error(f"words.json not found at {WORDS_JSON}. Run the data pipeline first.")
            sys.exit(1)
        with open(WORDS_JSON, encoding="utf-8") as f:
            return json.load(f)

    if args.only in {"conjugations", "confusables", "plurals", "complete"}:
        # Shared prologue of every single-deck build.
        step_fonts(args)
        import apkg_builder

        words = _load_words_for_only()

        if args.only == "conjugations":
            for audio, path in [(False, apkg_builder.CONJ_APKG), (True, apkg_builder.CONJ_APKG_AUDIO)]:
                deck, media = apkg_builder.build_conj_deck(words, include_audio=audio)
                apkg_builder.write_conj_apkg(deck, media, out_path=path)
        elif args.only == "confusables":
            for audio, path in [(False, apkg_builder.CONF_APKG), (True, apkg_builder.CONF_APKG_AUDIO)]:
                deck, media = apkg_builder.build_confusables_deck(words, include_audio=audio)
                apkg_builder.write_conf_apkg(deck, media, out_path=path)
        elif args.only == "plurals":
            for audio, path in [(False, apkg_builder.PLURAL_APKG), (True, apkg_builder.PLURAL_APKG_AUDIO)]:
                deck, media = apkg_builder.build_plural_deck(words, include_audio=audio)
                apkg_builder.write_plural_apkg(deck, media, out_path=path)
        else:  # complete
            emoji_lookup = apkg_builder._load_emoji_lookup()
            for audio, path in [(False, apkg_builder.COMPLETE_APKG), (True, apkg_builder.COMPLETE_APKG_AUDIO)]:
                decks, media = apkg_builder.build_complete_deck(
                    words,
                    include_audio=audio,
                    emoji_lookup=emoji_lookup,
                )
                apkg_builder.write_complete_apkg(decks, media, out_path=path)

        print_summary(args, {}, {})
        return

    if args.only == "vocab":
        # "vocab" is an accepted choice but has no single-deck path here, so
        # it used to fall through silently. Say so before the full run starts.
        logger.warning(" --only vocab has no dedicated build path; running the full pipeline")

    # Full pipeline
    step_list_scrape(args)  # 1 — scrape list pages → words.json (captures slugs)
    step_detail_scrape(args)  # 2 — scrape detail pages using slugs → words.json
    freq_cache = step_frequency()  # 3 — word frequency data
    example_stats = step_examples(args)  # 4 — EPUB example sentences
    step_audio_download(args)  # 5 — download audio mp3s
    step_fonts(args)  # 6 — download Heebo fonts
    step_images(args)  # 7 — fetch noun images
    step_build_all(args)  # 8 — build all .apkg variants

    print_summary(args, example_stats, freq_cache)
|
|
|
|
|
|
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|