fix: reorder pipeline — detail scrape immediately after list scrape

The list scrape captures the slugs that the detail scrape needs, so the two
steps should run back to back. New order:
list→detail→frequency→examples→audio→fonts→images→build

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Sochen 2026-03-08 19:16:57 +00:00
parent 6c2a0f8eed
commit a1d970a782

50
run.py
View file

@ -7,13 +7,24 @@ Usage:
Options:
--only {vocab,conjugations,confusables,plurals,complete} Run only one deck
Pipeline steps:
1. List scrape      scrape pealim.com list pages → words.json (captures slugs)
2. Detail scrape    scrape noun/verb detail pages using slugs → words.json
3. Frequency        load/download word frequency data
4. Examples         fetch Ben Yehuda example sentences
5. Audio download   download audio mp3 files
6. Fonts            download Heebo font files
7. Images           fetch noun images from Wikipedia
8. Build            build all .apkg deck variants
Options:
--skip-scrape Skip list page scraping (use existing words.json)
--skip-detail Skip detail page scraping
--skip-audio Skip audio .mp3 downloads
--skip-examples Skip Ben Yehuda example fetching
--skip-images Skip image fetching for concrete nouns
--refresh-examples Force rebuild of Ben Yehuda index (delete old, download nikkud corpus)
--test N Process only the first N dictionary words (for quick testing)
--refresh-examples Force rebuild of Ben Yehuda index
--test N Limit to first N words/pages
"""
import argparse
@ -76,8 +87,8 @@ def step_list_scrape(args):
def step_frequency() -> dict[str, int]:
"""Step 2 — load/download word frequency data."""
logger.info("[2] Loading word frequency data …")
"""Step 3 — load/download word frequency data."""
logger.info("[3] Loading word frequency data …")
import frequency_lookup
frequency_lookup.load()
@ -85,23 +96,23 @@ def step_frequency() -> dict[str, int]:
def step_examples(args, _freq_cache: dict):
"""Step 3 — load/build Ben Yehuda example index."""
"""Step 4 — load/build Ben Yehuda example index."""
if args.skip_examples:
logger.info("[3] Skipping examples (--skip-examples)")
logger.info("[4] Skipping examples (--skip-examples)")
examples_path = DATA_DIR / "examples_cache.json"
if examples_path.exists():
with open(examples_path) as f:
return json.load(f)
return {}
logger.info("[3] Loading Ben Yehuda example index …")
logger.info("[4] Loading Ben Yehuda example index …")
import benyehuda
benyehuda.load(force_rebuild=args.refresh_examples)
# Read word list from words.json instead of CSV
if not WORDS_JSON.exists():
logger.warning("[3] words.json not found, skipping examples")
logger.warning("[4] words.json not found, skipping examples")
return {}
with open(WORDS_JSON, encoding="utf-8") as f:
@ -145,12 +156,12 @@ def step_examples(args, _freq_cache: dict):
def step_detail_scrape(args):
"""Step 4 — scrape detail pages for nouns and verbs → update words.json."""
"""Step 2 — scrape detail pages for nouns and verbs → update words.json."""
if args.skip_detail:
logger.info("[4] Skipping detail scrape (--skip-detail)")
logger.info("[2] Skipping detail scrape (--skip-detail)")
return
logger.info("[4] Scraping detail pages from pealim.com …")
logger.info("[2] Scraping detail pages from pealim.com …")
import pealim_detail_scrape
test_limit = args.test if args.test else None
@ -164,6 +175,7 @@ def step_audio_download(args):
return
logger.info("[5] Downloading audio files …")
import pealim_audio_download
test_limit = args.test if args.test else None
@ -372,14 +384,14 @@ def main():
return
# Full pipeline
step_list_scrape(args)
freq_cache = step_frequency()
examples_cache = step_examples(args, freq_cache)
step_detail_scrape(args)
step_audio_download(args)
step_fonts(args)
step_images(args)
step_build_all(args)
step_list_scrape(args) # 1 — scrape list pages → words.json (captures slugs)
step_detail_scrape(args) # 2 — scrape detail pages using slugs → words.json
freq_cache = step_frequency() # 3 — word frequency data
examples_cache = step_examples(args, _freq_cache=freq_cache) # 4 — Ben Yehuda examples
step_audio_download(args) # 5 — download audio mp3s
step_fonts(args) # 6 — download Heebo fonts
step_images(args) # 7 — fetch noun images
step_build_all(args) # 8 — build all .apkg variants
print_summary(args, examples_cache, freq_cache)