fix: reorder pipeline — detail scrape immediately after list scrape

List scrape captures the slugs needed by detail scrape, so the two steps should be adjacent.
Reordered: list→detail→frequency→examples→audio→fonts→images→build.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 19:16:57 +00:00 · 2026-03-08 19:16:57 +00:00 · a1d970a782
commit a1d970a782
parent 6c2a0f8eed
1 changed file with 31 additions and 19 deletions
--- a/run.py
+++ b/run.py
@@ -7,13 +7,24 @@ Usage:

 Options:
  --only {vocab,conjugations,confusables,plurals,complete}  Run only one deck
+Pipeline steps:
+  1. List scrape    — scrape pealim.com list pages → words.json (captures slugs)
+  2. Detail scrape  — scrape noun/verb detail pages using slugs → words.json
+  3. Frequency      — load/download word frequency data
+  4. Examples       — fetch Ben Yehuda example sentences
+  5. Audio download — download audio mp3 files
+  6. Fonts          — download Heebo font files
+  7. Images         — fetch noun images from Wikipedia
+  8. Build          — build all .apkg deck variants
+
+Options:
  --skip-scrape        Skip list page scraping (use existing words.json)
  --skip-detail        Skip detail page scraping
  --skip-audio         Skip audio .mp3 downloads
  --skip-examples      Skip Ben Yehuda example fetching
  --skip-images        Skip image fetching for concrete nouns
-  --refresh-examples   Force rebuild of Ben Yehuda index (delete old, download nikkud corpus)
-  --test N             Process only the first N dictionary words (for quick testing)
+  --refresh-examples   Force rebuild of Ben Yehuda index
+  --test N             Limit to first N words/pages
 """

 import argparse
@@ -76,8 +87,8 @@ def step_list_scrape(args):


 def step_frequency() -> dict[str, int]:
-    """Step 2 — load/download word frequency data."""
-    logger.info("[2] Loading word frequency data …")
+    """Step 3 — load/download word frequency data."""
+    logger.info("[3] Loading word frequency data …")
    import frequency_lookup

    frequency_lookup.load()
@@ -85,23 +96,23 @@ def step_frequency() -> dict[str, int]:


 def step_examples(args, _freq_cache: dict):
-    """Step 3 — load/build Ben Yehuda example index."""
+    """Step 4 — load/build Ben Yehuda example index."""
    if args.skip_examples:
-        logger.info("[3] Skipping examples (--skip-examples)")
+        logger.info("[4] Skipping examples (--skip-examples)")
        examples_path = DATA_DIR / "examples_cache.json"
        if examples_path.exists():
            with open(examples_path) as f:
                return json.load(f)
        return {}

-    logger.info("[3] Loading Ben Yehuda example index …")
+    logger.info("[4] Loading Ben Yehuda example index …")
    import benyehuda

    benyehuda.load(force_rebuild=args.refresh_examples)

    # Read word list from words.json instead of CSV
    if not WORDS_JSON.exists():
-        logger.warning("[3] words.json not found, skipping examples")
+        logger.warning("[4] words.json not found, skipping examples")
        return {}

    with open(WORDS_JSON, encoding="utf-8") as f:
@@ -145,12 +156,12 @@ def step_examples(args, _freq_cache: dict):


 def step_detail_scrape(args):
-    """Step 4 — scrape detail pages for nouns and verbs → update words.json."""
+    """Step 2 — scrape detail pages for nouns and verbs → update words.json."""
    if args.skip_detail:
-        logger.info("[4] Skipping detail scrape (--skip-detail)")
+        logger.info("[2] Skipping detail scrape (--skip-detail)")
        return

-    logger.info("[4] Scraping detail pages from pealim.com …")
+    logger.info("[2] Scraping detail pages from pealim.com …")
    import pealim_detail_scrape

    test_limit = args.test if args.test else None
@@ -164,6 +175,7 @@ def step_audio_download(args):
        return

    logger.info("[5] Downloading audio files …")
+
    import pealim_audio_download

    test_limit = args.test if args.test else None
@@ -372,14 +384,14 @@ def main():
        return

    # Full pipeline
-    step_list_scrape(args)
-    freq_cache = step_frequency()
-    examples_cache = step_examples(args, freq_cache)
-    step_detail_scrape(args)
-    step_audio_download(args)
-    step_fonts(args)
-    step_images(args)
-    step_build_all(args)
+    step_list_scrape(args)  # 1 — scrape list pages → words.json (captures slugs)
+    step_detail_scrape(args)  # 2 — scrape detail pages using slugs → words.json
+    freq_cache = step_frequency()  # 3 — word frequency data
+    examples_cache = step_examples(args, _freq_cache=freq_cache)  # 4 — Ben Yehuda examples
+    step_audio_download(args)  # 5 — download audio mp3s
+    step_fonts(args)  # 6 — download Heebo fonts
+    step_images(args)  # 7 — fetch noun images
+    step_build_all(args)  # 8 — build all .apkg variants

    print_summary(args, examples_cache, freq_cache)