fix: reorder pipeline — detail scrape immediately after list scrape
The list scrape captures the slugs needed by the detail scrape, so the two steps should run back to back. Reordered pipeline: list → detail → frequency → examples → audio → fonts → images → build.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
6c2a0f8eed
commit
a1d970a782
1 changed file with 31 additions and 19 deletions
50
run.py
50
run.py
|
|
@@ -7,13 +7,24 @@ Usage:
|
|||
|
||||
Options:
|
||||
--only {vocab,conjugations,confusables,plurals,complete} Run only one deck
|
||||
Pipeline steps:
|
||||
1. List scrape — scrape pealim.com list pages → words.json (captures slugs)
|
||||
2. Detail scrape — scrape noun/verb detail pages using slugs → words.json
|
||||
3. Frequency — load/download word frequency data
|
||||
4. Examples — fetch Ben Yehuda example sentences
|
||||
5. Audio download — download audio mp3 files
|
||||
6. Fonts — download Heebo font files
|
||||
7. Images — fetch noun images from Wikipedia
|
||||
8. Build — build all .apkg deck variants
|
||||
|
||||
Options:
|
||||
--skip-scrape Skip list page scraping (use existing words.json)
|
||||
--skip-detail Skip detail page scraping
|
||||
--skip-audio Skip audio .mp3 downloads
|
||||
--skip-examples Skip Ben Yehuda example fetching
|
||||
--skip-images Skip image fetching for concrete nouns
|
||||
--refresh-examples Force rebuild of Ben Yehuda index (delete old, download nikkud corpus)
|
||||
--test N Process only the first N dictionary words (for quick testing)
|
||||
--refresh-examples Force rebuild of Ben Yehuda index
|
||||
--test N Limit to first N words/pages
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
|
@@ -76,8 +87,8 @@ def step_list_scrape(args):
|
|||
|
||||
|
||||
def step_frequency() -> dict[str, int]:
|
||||
"""Step 2 — load/download word frequency data."""
|
||||
logger.info("[2] Loading word frequency data …")
|
||||
"""Step 3 — load/download word frequency data."""
|
||||
logger.info("[3] Loading word frequency data …")
|
||||
import frequency_lookup
|
||||
|
||||
frequency_lookup.load()
|
||||
|
|
@@ -85,23 +96,23 @@ def step_frequency() -> dict[str, int]:
|
|||
|
||||
|
||||
def step_examples(args, _freq_cache: dict):
|
||||
"""Step 3 — load/build Ben Yehuda example index."""
|
||||
"""Step 4 — load/build Ben Yehuda example index."""
|
||||
if args.skip_examples:
|
||||
logger.info("[3] Skipping examples (--skip-examples)")
|
||||
logger.info("[4] Skipping examples (--skip-examples)")
|
||||
examples_path = DATA_DIR / "examples_cache.json"
|
||||
if examples_path.exists():
|
||||
with open(examples_path) as f:
|
||||
return json.load(f)
|
||||
return {}
|
||||
|
||||
logger.info("[3] Loading Ben Yehuda example index …")
|
||||
logger.info("[4] Loading Ben Yehuda example index …")
|
||||
import benyehuda
|
||||
|
||||
benyehuda.load(force_rebuild=args.refresh_examples)
|
||||
|
||||
# Read word list from words.json instead of CSV
|
||||
if not WORDS_JSON.exists():
|
||||
logger.warning("[3] words.json not found, skipping examples")
|
||||
logger.warning("[4] words.json not found, skipping examples")
|
||||
return {}
|
||||
|
||||
with open(WORDS_JSON, encoding="utf-8") as f:
|
||||
|
|
@@ -145,12 +156,12 @@ def step_examples(args, _freq_cache: dict):
|
|||
|
||||
|
||||
def step_detail_scrape(args):
|
||||
"""Step 4 — scrape detail pages for nouns and verbs → update words.json."""
|
||||
"""Step 2 — scrape detail pages for nouns and verbs → update words.json."""
|
||||
if args.skip_detail:
|
||||
logger.info("[4] Skipping detail scrape (--skip-detail)")
|
||||
logger.info("[2] Skipping detail scrape (--skip-detail)")
|
||||
return
|
||||
|
||||
logger.info("[4] Scraping detail pages from pealim.com …")
|
||||
logger.info("[2] Scraping detail pages from pealim.com …")
|
||||
import pealim_detail_scrape
|
||||
|
||||
test_limit = args.test if args.test else None
|
||||
|
|
@@ -164,6 +175,7 @@ def step_audio_download(args):
|
|||
return
|
||||
|
||||
logger.info("[5] Downloading audio files …")
|
||||
|
||||
import pealim_audio_download
|
||||
|
||||
test_limit = args.test if args.test else None
|
||||
|
|
@@ -372,14 +384,14 @@ def main():
|
|||
return
|
||||
|
||||
# Full pipeline
|
||||
step_list_scrape(args)
|
||||
freq_cache = step_frequency()
|
||||
examples_cache = step_examples(args, freq_cache)
|
||||
step_detail_scrape(args)
|
||||
step_audio_download(args)
|
||||
step_fonts(args)
|
||||
step_images(args)
|
||||
step_build_all(args)
|
||||
step_list_scrape(args) # 1 — scrape list pages → words.json (captures slugs)
|
||||
step_detail_scrape(args) # 2 — scrape detail pages using slugs → words.json
|
||||
freq_cache = step_frequency() # 3 — word frequency data
|
||||
examples_cache = step_examples(args, _freq_cache=freq_cache) # 4 — Ben Yehuda examples
|
||||
step_audio_download(args) # 5 — download audio mp3s
|
||||
step_fonts(args) # 6 — download Heebo fonts
|
||||
step_images(args) # 7 — fetch noun images
|
||||
step_build_all(args) # 8 — build all .apkg variants
|
||||
|
||||
print_summary(args, examples_cache, freq_cache)
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue