diff --git a/run.py b/run.py index 84c6067..548ffb9 100644 --- a/run.py +++ b/run.py @@ -7,13 +7,24 @@ Usage: Options: --only {vocab,conjugations,confusables,plurals,complete} Run only one deck +Pipeline steps: + 1. List scrape — scrape pealim.com list pages → words.json (captures slugs) + 2. Detail scrape — scrape noun/verb detail pages using slugs → words.json + 3. Frequency — load/download word frequency data + 4. Examples — fetch Ben Yehuda example sentences + 5. Audio download — download audio mp3 files + 6. Fonts — download Heebo font files + 7. Images — fetch noun images from Wikipedia + 8. Build — build all .apkg deck variants + +Options: --skip-scrape Skip list page scraping (use existing words.json) --skip-detail Skip detail page scraping --skip-audio Skip audio .mp3 downloads --skip-examples Skip Ben Yehuda example fetching --skip-images Skip image fetching for concrete nouns - --refresh-examples Force rebuild of Ben Yehuda index (delete old, download nikkud corpus) - --test N Process only the first N dictionary words (for quick testing) + --refresh-examples Force rebuild of Ben Yehuda index + --test N Limit to first N words/pages """ import argparse @@ -76,8 +87,8 @@ def step_list_scrape(args): def step_frequency() -> dict[str, int]: - """Step 2 — load/download word frequency data.""" - logger.info("[2] Loading word frequency data …") + """Step 3 — load/download word frequency data.""" + logger.info("[3] Loading word frequency data …") import frequency_lookup frequency_lookup.load() @@ -85,23 +96,23 @@ def step_frequency() -> dict[str, int]: def step_examples(args, _freq_cache: dict): - """Step 3 — load/build Ben Yehuda example index.""" + """Step 4 — load/build Ben Yehuda example index.""" if args.skip_examples: - logger.info("[3] Skipping examples (--skip-examples)") + logger.info("[4] Skipping examples (--skip-examples)") examples_path = DATA_DIR / "examples_cache.json" if examples_path.exists(): with open(examples_path) as f: return json.load(f) return {} - logger.info("[3] Loading Ben Yehuda example index …") + logger.info("[4] Loading Ben Yehuda example index …") import benyehuda benyehuda.load(force_rebuild=args.refresh_examples) # Read word list from words.json instead of CSV if not WORDS_JSON.exists(): - logger.warning("[3] words.json not found, skipping examples") + logger.warning("[4] words.json not found, skipping examples") return {} with open(WORDS_JSON, encoding="utf-8") as f: @@ -145,12 +156,12 @@ def step_examples(args, _freq_cache: dict): def step_detail_scrape(args): - """Step 4 — scrape detail pages for nouns and verbs → update words.json.""" + """Step 2 — scrape detail pages for nouns and verbs → update words.json.""" if args.skip_detail: - logger.info("[4] Skipping detail scrape (--skip-detail)") + logger.info("[2] Skipping detail scrape (--skip-detail)") return - logger.info("[4] Scraping detail pages from pealim.com …") + logger.info("[2] Scraping detail pages from pealim.com …") import pealim_detail_scrape test_limit = args.test if args.test else None @@ -164,6 +175,7 @@ def step_audio_download(args): return logger.info("[5] Downloading audio files …") + import pealim_audio_download test_limit = args.test if args.test else None @@ -372,14 +384,14 @@ def main(): return # Full pipeline - step_list_scrape(args) - freq_cache = step_frequency() - examples_cache = step_examples(args, freq_cache) - step_detail_scrape(args) - step_audio_download(args) - step_fonts(args) - step_images(args) - step_build_all(args) + step_list_scrape(args) # 1 — scrape list pages → words.json (captures slugs) + step_detail_scrape(args) # 2 — scrape detail pages using slugs → words.json + freq_cache = step_frequency() # 3 — word frequency data + examples_cache = step_examples(args, _freq_cache=freq_cache) # 4 — Ben Yehuda examples + step_audio_download(args) # 5 — download audio mp3s + step_fonts(args) # 6 — download Heebo fonts + step_images(args) # 7 — fetch noun images + step_build_all(args) # 8 — build all .apkg variants print_summary(args, examples_cache, freq_cache)