Sprint 11: unified JSON architecture + consolidated scraping pipeline

Migrate from fragmented CSV + 10 JSON files to a single data/words.json (9,104 entries) as the unified data store. All GUIDs preserved for Anki study progress continuity. New files: - SCHEMA.yaml: authoritative schema for words.json - pealim_list_scrape.py: consolidated list page scraper → words.json - pealim_detail_scrape.py: noun/verb detail scraper → words.json - pealim_audio_download.py: audio downloader reading from words.json - scripts/migrate_to_json.py: one-time CSV→JSON migration - scripts/validate_data.py: 17 data integrity tests - scripts/check_guid_coverage.py: GUID preservation checker - scripts/repair_slugs.py: slug deduplication repair tool - tests/test_scraper_integration.py: live scraper integration tests Updated: - apkg_builder.py: reads from words.json (no more pandas) - run.py: 8-step pipeline (list scrape → frequency → examples → detail scrape → audio download → fonts → images → build) - benyehuda.py, frequency_lookup.py, image_fetch.py: TODO markers for future words.json integration Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 10:54:58 +00:00 · 2026-03-08 10:54:58 +00:00 · 08fb7009d8
commit 08fb7009d8
parent 2e48109d7f
20 changed files with 561420 additions and 10124 deletions
--- a/.claude/settings.json
+++ b/.claude/settings.json
@ -0,0 +1,26 @@
 {
  "hooks": {
    "PostToolUse": [
      {
        "matcher": "Edit|Write",
        "hooks": [
          {
            "type": "command",
            "command": "file=\"$CLAUDE_FILE_PATH\"; if [ -n \"$file\" ] && echo \"$file\" | grep -q '\\.py$'; then ruff format --quiet \"$file\" && ruff check --fix --quiet \"$file\" 2>/dev/null; fi"
          }
        ]
      }
    ],
    "PreToolUse": [
      {
        "matcher": "Edit|Write",
        "hooks": [
          {
            "type": "command",
            "command": "file=\"$CLAUDE_FILE_PATH\"; if echo \"$file\" | grep -qE '(legacy_guid_map\\.json|\\.env)$'; then echo 'BLOCKED: Protected file — legacy_guid_map.json and .env are read-only' >&2; exit 2; fi"
          }
        ]
      }
    ]
  }
 }
--- a/README.md
+++ b/README.md
@ -56,7 +56,7 @@ Fields on each card:
 | Plural form | for nouns: רבים: שֻׁלְחָנוֹת |
 | Disambiguation hint | for ambiguous Eng→Heb cards |
-Cards are presented in **frequency order** — Anki will show you the most common words first.
+Cards are presented in **frequency order** — Anki will show you the most common words first. Note that frequency ranks are looked up using spellings without nikkud, so words that share the same letters but differ in nikkud are assigned the same frequency.
 ### Eng→Heb disambiguation
--- a/SCHEMA.yaml
+++ b/SCHEMA.yaml
@ -0,0 +1,148 @@
 # Hebrew Flash Cards — Unified Data Schema (words.json)
 # Revised based on Nevo's feedback (2026-03-08)
 #
 # Top-level: dict keyed by unique_key
 # Unique key: nikkud word for most entries (e.g. "אָב")
 #   For 146 homographs (same nikkud, different meaning): "word|pos" e.g. "אָח|Noun"
 #   For same nikkud AND same pos: "word|pos|meaning" e.g. "אָח|Noun|brother"
 #
 # Hebrew text fields use nikkud/ktiv_male subfields:
 #   field:
 #     nikkud: "אָב"       # with nikkud (hebstyle=mo)
 #     ktiv_male: "אב"     # plene spelling (hebstyle=vl)
 # This pattern applies to: word, singular, plural, construct forms, conjugated forms, etc.
 #
 # Pronoun notation for conjugation forms uses grammatical codes:
 #   1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp
 # (not Hebrew pronoun strings, which are ambiguous for gender in some persons)
 entry:
  # --- Core Identity ---
  word:
    nikkud: "אָב"
    ktiv_male: "אב"
  slug: "6009-av"                   # Pealim URL slug (e.g. pealim.com/dict/6009-av/)
  root: ["א", "ב"]                 # Shoresh as list of consonant chars
  pos: "Noun"                      # Part of speech in English (as from pealim)
  pos_hebrew: "שֵׁם עֶצֶם"          # Part of speech in Hebrew (with nikkud)
  meaning: "father"                # English meaning (cleaned — no inline emoji, no Hebrew prepositions)
  meaning_raw: "father 👨"         # Original meaning as scraped (may contain emoji and/or Hebrew preps)
  audio_url: "https://..."         # Pealim audio URL
  audio_file: "6009-av.mp3"        # Local filename (slug-based for confusables, consonant-based otherwise)
  tags: ""                         # Pealim tags if any
  last_scrape_date: "2026-03-08"   # ISO date of most recent pealim.com scrape for this entry
  # --- Identity & Progress ---
  vocab_legacy_guid: "abc123..."   # Vocab note GUID from legacy_guid_map.json
  # Other note GUIDs stored in their respective sections (cloze, plurals, conjugation)
  # --- Frequency ---
  frequency: 412                   # Hebrew frequency rank from hermitdave/FrequencyWords he_50k (ktiv male based)
  pseudo_frequency: null           # Adjusted frequency for confusable homographs (deferred to future sprint)
  # --- Display Enrichment ---
  emoji: "👨"
  emoji_source: "ai_vetted"       # One of: ai_vetted, from_pealim, null
  emoji_visible: false             # Whether to show on cards (false until emoji vetting is done)
  image: "father.jpg"              # Wikipedia/Commons image filename, or null
  image_source: "wikipedia"        # One of: wikipedia, commons, null
  hint: ""                         # Eng→Heb disambiguation hint (from refined_meanings.json)
  # --- Shared Roots ---
  shared_roots: []                 # List of unique_keys of other words sharing the same root
  # Computed by iterating all entries and grouping by root
  # --- Confusables ---
  confusable_group: null           # List of unique_keys sharing same ktiv_male, or null
  # e.g. ["אָח|Noun|brother", "אָח|Noun|fireplace"]
  # --- Example Sentences ---
  examples:
    vetted:                        # AI-vetted sentences from Ben Yehuda / EPUB corpus
      - text: "הָאָב הָלַךְ לַעֲבוֹדָה"
        source: "ben_yehuda"       # One of: ben_yehuda, epub_little_prince, epub_alice, ...
        vetted: true
    cloze:                         # Best sentence for cloze card, or null
      text: "הָאָב הָלַךְ לַעֲבוֹדָה"
      cloze_word_start: 0          # Character offset of the clozed word in text
      cloze_word_end: 4            # End offset — enables exact extraction regardless of nikkud changes
      cloze_hint: "family member"
      cloze_guid: "def456..."      # GUID for the cloze note
    rejected_count: 0
  # --- Noun-specific: Inflection Forms ---
  noun_inflection: null            # null for non-nouns
  # When populated:
  #   plurals_guid: "ghi789..."    # GUID for plurals deck note
  #   singular:                    # null if noun is inherently plural (e.g. bicycle/אופניים)
  #     nikkud: "אָב"
  #     ktiv_male: "אב"
  #   plural:
  #     nikkud: "אָבוֹת"
  #     ktiv_male: "אבות"
  #   singular_audio: "6009-av.mp3"
  #   plural_audio: null           # TODO: scrape from detail pages
  #   construct_singular:
  #     nikkud: "אֲבִי"
  #     ktiv_male: "אבי"
  #   construct_plural:
  #     nikkud: "אֲבוֹת"
  #     ktiv_male: "אבות"
  #   pronominal_suffixes:         # Scraped from pealim "forms with pronominal affixes" section
  #     1s:
  #       nikkud: "אָבִי"
  #       ktiv_male: "אבי"
  #     1p:
  #       nikkud: "אָבִינוּ"
  #       ktiv_male: "אבינו"
  #     2ms: ...
  #     2fs: ...
  #     2mp: ...
  #     2fp: ...
  #     3ms: ...
  #     3fs: ...
  #     3mp: ...
  #     3fp: ...
  #   gender: "masculine"
  #   gender_hebrew:
  #     nikkud: "זָכָר"
  #     ktiv_male: "זכר"
  #   mishkal: "CaCaC"             # English mishkal name (scraped from pealim PoS section)
  #   mishkal_hebrew: "קָטָל"      # Hebrew mishkal name (computed via mapping)
  # --- Verb-specific: Conjugation Data ---
  conjugation: null                # null for non-verbs
  # When populated:
  #   in_conjugation_deck: true    # Whether this verb is in the 71-verb conjugation deck
  #   infinitive:
  #     nikkud: "לִשְׁמֹר"
  #     ktiv_male: "לשמור"
  #   reference_form:              # 3ms past (the citation form)
  #     nikkud: "שָׁמַר"
  #     ktiv_male: "שמר"
  #   binyan: "Pa'al"              # English binyan name
  #   binyan_hebrew: "פָּעַל"      # Hebrew binyan name (with nikkud)
  #   prep: "על"                   # Hebrew preposition the verb takes, or null
  #   active_forms:
  #     - person: "1s"             # Grammatical code: 1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp
  #       tense: "עָבָר"
  #       form:
  #         nikkud: "שָׁמַרְתִּי"
  #         ktiv_male: "שמרתי"
  #       audio_url: "https://..."
  #       audio_file: null         # For future use
  #   hufal_pual_forms: null       # Same structure as active_forms; non-null only for hif'il/pi'el verbs
  #   # When non-null, binyan MUST be Hif'il or Pi'el (validated)
  #   reference_form_passive:      # 3ms past of the huf'al/pu'al counterpart, or null
  #     nikkud: "שֻׁמַּר"
  #     ktiv_male: "שומר"
  # --- Adjective-specific ---
  adjective_inflection: null       # Reserved for future use
  # When populated:
  #   ms/fs/mp/fp forms with nikkud/ktiv_male subfields
  # --- Preposition-specific ---
  preposition_inflection: null     # Reserved for future use
  # When populated:
  #   Inflected forms with pronominal suffixes (e.g. שלי, שלך, שלו...)
--- a/apkg_builder.py
+++ b/apkg_builder.py
--- a/benyehuda.py
+++ b/benyehuda.py
@ -2,6 +2,10 @@
 """
 Ben Yehuda corpus example-sentence lookup (nikkud corpus).
 TODO: Rewrite to update words.json examples fields directly instead of
 writing to a separate examples_cache.json. Currently the migration script
 bridges the gap. See Phase 5 in SPRINT_LOG.md.
 Downloads the nikkud-bearing plaintext ZIP once, indexes sentences by nikkud word form,
 then answers queries locally.
--- a/data/hebrew_dict_for_anki.csv
+++ b/data/hebrew_dict_for_anki.csv
--- a/data/words.json
+++ b/data/words.json
--- a/frequency_lookup.py
+++ b/frequency_lookup.py
@ -3,6 +3,10 @@
 Hebrew word frequency lookup from hermitdave/FrequencyWords corpus.
 Downloads he_50k.txt once; subsequent runs read from cache.
 Exposed API: get_frequency_rank(word_no_nikkud) -> int | None
 TODO: Rewrite to update words.json frequency field directly instead of
 writing to a separate frequency_cache.json. Currently the migration script
 bridges the gap. See Phase 5 in SPRINT_LOG.md.
 """
 import json
--- a/image_fetch.py
+++ b/image_fetch.py
@ -2,6 +2,10 @@
 """
 Fetch images for concrete Hebrew nouns from Wikipedia / Wikimedia Commons.
 TODO: Rewrite to update words.json image/image_source fields directly instead of
 writing to a separate image_cache.json. Currently the migration script bridges
 the gap. See Phase 5 in SPRINT_LOG.md.
 Scope: Noun PoS entries only. Concreteness heuristic:
  - English meaning has no abstract suffixes (-tion, -ity, -ness, -ment, -ance, -ism, -hood,
    -ship, -ure, -al, -ing when not a gerund, -ence)
@ -59,7 +63,6 @@ session.headers.update(
 )
 def is_concrete(english_meaning: str) -> bool:
    """Return True if the English meaning looks like a concrete noun."""
    meaning = english_meaning.strip().lower()
--- a/pealim_audio_download.py
+++ b/pealim_audio_download.py
@ -0,0 +1,346 @@
 #!/usr/bin/env python3
 """Download audio files from URLs stored in words.json.
 Three audio categories are handled:
  1. Vocab audio  → data/audio/{audio_file}
  2. Noun plural  → data/audio/{slug}_plural.mp3
  3. Conjugation  → data/audio_conj/{slug}_{form_key}.mp3
                    data/audio_conj/{slug}_passive_{form_key}.mp3
 """
 import argparse
 import json
 import logging
 import re
 import time
 from pathlib import Path
 import requests
 from helpers import strip_nikkud
 # Module-level logger; handlers/level are configured in main().
 logger = logging.getLogger(__name__)
 # All paths are resolved relative to this file's directory.
 DATA_DIR = Path(__file__).parent / "data"
 AUDIO_DIR = DATA_DIR / "audio"  # vocab + noun-plural audio
 AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"  # conjugation-form audio
 WORDS_JSON = DATA_DIR / "words.json"
 DOWNLOAD_DELAY = 0.3  # seconds slept after each successful download
 MAX_RETRIES = 3  # attempts per URL in _download()
 # Map Hebrew tense names to English prefixes for form_key construction.
 # "מְקוֹר" (infinitive) is included for forward compatibility; it does not
 # appear in the current dataset but the form_key collapses to bare "infinitive".
 # Tense strings missing from this map are used verbatim by _form_key().
 TENSE_TO_PREFIX = {
    "הוֹוֶה": "present",
    "עָבָר": "past",
    "עָתִיד": "future",
    "צִוּוּי": "imperative",
    "מְקוֹר": "infinitive",
 }
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
 def _make_audio_file(entry: dict) -> str:
    """Derive the vocab audio filename when audio_file is absent.
    Slug-based for confusable entries (slug contains the disambiguating ID),
    consonant-only for all others.
    Args:
        entry: A words.json entry dict.
    Returns:
        Filename string, e.g. ``"1234-shalom.mp3"`` or ``"שלום.mp3"``.
    """
    slug: str = entry["slug"]
    if entry.get("confusable_group"):
        return f"{slug}.mp3"
    word: str = entry.get("word", "")
    safe_name = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word))
    return f"{safe_name}.mp3"
 def _form_key(person: str, tense: str) -> str:
    """Combine a tense and a person code into the key used in audio filenames.

    The tense is translated through ``TENSE_TO_PREFIX``; a tense that is not
    in the map is used unchanged.

    Args:
        person: Person code, e.g. ``"1s"``, ``"3fp"``, ``"ms"``.
        tense: Hebrew tense string from the conjugation form.

    Returns:
        ``"<tense>_<person>"`` such as ``"past_1s"`` or ``"present_ms"``.
        When the tense resolves to ``"infinitive"`` the person is dropped and
        the bare string ``"infinitive"`` is returned.
    """
    tense_label = TENSE_TO_PREFIX.get(tense, tense)
    return "infinitive" if tense_label == "infinitive" else f"{tense_label}_{person}"
 def _download(url: str, dest: Path, session: requests.Session) -> bool:
    """Download *url* to *dest*, retrying up to MAX_RETRIES times.

    Skips the download silently if *dest* already exists.  The payload is
    first written to a sibling ``<name>.part`` file and renamed onto *dest*
    only after the write has finished, so an interrupted run cannot leave a
    truncated file that the ``dest.exists()`` check would later accept as a
    completed download.

    Args:
        url: HTTP(S) URL to download.
        dest: Local path to write the file to.
        session: Shared requests session.

    Returns:
        ``True`` if the file was downloaded (or already existed),
        ``False`` if all retries were exhausted.
    """
    if dest.exists():
        return True
    staging = dest.with_name(f"{dest.name}.part")
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = session.get(url, timeout=15)
            resp.raise_for_status()
            payload = resp.content
        except requests.RequestException as exc:
            if attempt < MAX_RETRIES:
                # Exponential back-off: 2s, 4s, ...
                wait = 2**attempt
                logger.warning(
                    "Attempt %d/%d failed for %s (%s) — retrying in %ds",
                    attempt,
                    MAX_RETRIES,
                    url,
                    exc,
                    wait,
                )
                time.sleep(wait)
            else:
                logger.error("All %d attempts failed for %s: %s", MAX_RETRIES, url, exc)
        else:
            staging.write_bytes(payload)
            # Rename within the same directory, so dest appears only when complete.
            staging.replace(dest)
            logger.debug("Downloaded %s → %s", url, dest.name)
            return True
    return False
 # ---------------------------------------------------------------------------
 # Per-category downloaders
 # ---------------------------------------------------------------------------
 def download_vocab_audio(
    entries: list[dict],
    session: requests.Session,
 ) -> tuple[int, int, int]:
    """Fetch the headword audio clip for every entry that has an ``audio_url``.

    The target filename is the entry's ``audio_file`` when set, otherwise the
    name derived by ``_make_audio_file``.  Files already present in
    ``AUDIO_DIR`` are counted as cached and not fetched again.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached, no_url) counts.  Downloads that fail
        after all retries are added to the ``no_url`` count.
    """
    n_fetched = n_cached = n_missing = 0
    for item in entries:
        source_url: str | None = item.get("audio_url")
        if not source_url:
            n_missing += 1
            continue
        filename: str = item.get("audio_file") or _make_audio_file(item)
        target = AUDIO_DIR / filename
        if target.exists():
            n_cached += 1
        elif _download(source_url, target, session):
            n_fetched += 1
            time.sleep(DOWNLOAD_DELAY)
        else:
            # count persistent failures alongside missing URLs
            n_missing += 1
    return n_fetched, n_cached, n_missing
 def download_noun_plural_audio(
    entries: list[dict],
    session: requests.Session,
 ) -> tuple[int, int]:
    """Fetch plural-form audio for nouns into ``data/audio/{slug}_plural.mp3``.

    Only entries whose ``noun_inflection`` is a non-empty dict and whose
    ``plural_audio`` value starts with ``"http"`` are considered.  Failed
    downloads are not counted in either total.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached) counts.
    """
    fetched = already_present = 0
    for item in entries:
        inflection = item.get("noun_inflection")
        if not (inflection and isinstance(inflection, dict)):
            continue
        plural_url: str | None = inflection.get("plural_audio")
        if not (plural_url and plural_url.startswith("http")):
            continue
        stem: str = item["slug"]
        target = AUDIO_DIR / f"{stem}_plural.mp3"
        if target.exists():
            already_present += 1
        elif _download(plural_url, target, session):
            fetched += 1
            time.sleep(DOWNLOAD_DELAY)
    return fetched, already_present
 def download_conjugation_audio(
    entries: list[dict],
    session: requests.Session,
 ) -> tuple[int, int, int]:
    """Fetch audio for every conjugated verb form that carries an ``audio_url``.

    Forms listed under ``active_forms`` are saved as
    ``data/audio_conj/{slug}_{form_key}.mp3``; forms listed under
    ``hufal_pual_forms`` are saved as
    ``data/audio_conj/{slug}_passive_{form_key}.mp3``.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached, failed) counts.
    """
    fetched = already_present = errors = 0
    # (filename infix, conjugation field) for the two form lists.
    form_fields = (("", "active_forms"), ("passive_", "hufal_pual_forms"))
    for item in entries:
        conjugation = item.get("conjugation")
        if not conjugation:
            continue
        stem: str = item["slug"]
        for infix, field in form_fields:
            for form in conjugation.get(field) or []:
                form_url: str | None = form.get("audio_url")
                if not form_url:
                    continue
                key = _form_key(form.get("person", ""), form.get("tense", ""))
                target = AUDIO_CONJ_DIR / f"{stem}_{infix}{key}.mp3"
                if target.exists():
                    already_present += 1
                elif _download(form_url, target, session):
                    fetched += 1
                    time.sleep(DOWNLOAD_DELAY)
                else:
                    errors += 1
    return fetched, already_present, errors
 # ---------------------------------------------------------------------------
 # Entry point
 # ---------------------------------------------------------------------------
 def main() -> None:
    """Parse CLI args and run the audio download pipeline.

    Loads ``words.json``, optionally keeps only the first N entries
    (``--test N``), then downloads vocab, noun-plural and conjugation audio
    through one shared HTTP session and logs a per-category summary.
    Noun-plural audio is always processed; only the vocab and conjugation
    passes have skip flags.  The session is used as a context manager so its
    pooled connections are released even if a download pass raises.
    """
    parser = argparse.ArgumentParser(description="Download Pealim audio files from words.json URLs.")
    parser.add_argument(
        "--skip-vocab",
        action="store_true",
        help="Skip vocabulary audio downloads.",
    )
    parser.add_argument(
        "--skip-conj",
        action="store_true",
        help="Skip conjugation audio downloads.",
    )
    parser.add_argument(
        "--test",
        metavar="N",
        type=int,
        default=None,
        help="Limit processing to the first N words.json entries.",
    )
    args = parser.parse_args()
    logging.basicConfig(
        level=logging.INFO,
        format="%(message)s",
    )
    AUDIO_DIR.mkdir(parents=True, exist_ok=True)
    AUDIO_CONJ_DIR.mkdir(parents=True, exist_ok=True)
    with open(WORDS_JSON, encoding="utf-8") as fh:
        raw: dict[str, dict] = json.load(fh)
    # words.json is keyed by unique_key; only the entry dicts are needed here.
    entries = list(raw.values())
    if args.test is not None:
        entries = entries[: args.test]
    logger.info("[4] Downloading audio files …")
    with requests.Session() as session:
        session.headers["User-Agent"] = "Mozilla/5.0 (compatible; PealimAnkiDeck/1.0; audio-fetch)"
        # --- Vocab ---
        if not args.skip_vocab:
            v_dl, v_cached, v_no_url = download_vocab_audio(entries, session)
        else:
            v_dl = v_cached = v_no_url = 0
        # --- Noun plural ---
        np_dl, np_cached = download_noun_plural_audio(entries, session)
        # --- Conjugation ---
        if not args.skip_conj:
            c_dl, c_cached, c_failed = download_conjugation_audio(entries, session)
        else:
            c_dl = c_cached = c_failed = 0
    # --- Summary ---
    if not args.skip_vocab:
        logger.info(
            "    Vocab: %d downloaded, %d cached, %d no URL",
            v_dl,
            v_cached,
            v_no_url,
        )
    logger.info("    Noun plural: %d downloaded, %d cached", np_dl, np_cached)
    if not args.skip_conj:
        # Only mention failures when there were any.
        failed_msg = f", {c_failed} failed" if c_failed else ""
        logger.info(
            "    Conjugation: %d downloaded, %d cached%s",
            c_dl,
            c_cached,
            failed_msg,
        )
 if __name__ == "__main__":
    main()
--- a/pealim_detail_scrape.py
+++ b/pealim_detail_scrape.py
--- a/pealim_list_scrape.py
+++ b/pealim_list_scrape.py
@ -0,0 +1,706 @@
 #!/usr/bin/env python3
 """
 Consolidated list page scraper for pealim.com.
 Scrapes /dict/?page=N with two cookie variants (hebstyle=mo for nikkud,
 hebstyle=vl for ktiv male) and writes results directly to data/words.json.
 Usage:
    python3 pealim_list_scrape.py [--test N] [--force-refresh]
 """
 import argparse
 import json
 import logging
 import os
 import re
 import time
 from datetime import date
 from pathlib import Path
 import requests
 from bs4 import BeautifulSoup
 from helpers import strip_nikkud
 # ---------------------------------------------------------------------------
 # Paths
 # ---------------------------------------------------------------------------
 # All data files live in the "data" directory next to this script.
 PROJECT_ROOT = Path(__file__).parent
 DATA_DIR = PROJECT_ROOT / "data"
 WORDS_JSON = DATA_DIR / "words.json"  # unified data store read and written by this scraper
 PROGRESS_JSON = DATA_DIR / "list_scrape_progress.json"  # holds the "completed_pages" list
 # ---------------------------------------------------------------------------
 # Constants
 # ---------------------------------------------------------------------------
 PEALIM_DICT_URL = "https://www.pealim.com/dict/"
 REQUEST_DELAY = 1.5  # seconds between requests
 REQUEST_TIMEOUT = 15  # seconds
 DEFAULT_TOTAL_PAGES = 608  # NOTE(review): presumably the list page count at time of writing — confirm
 SAVE_EVERY = 10  # pages between incremental saves
 # Evaluated once at import time, so a run that crosses midnight keeps the start date.
 TODAY = date.today().isoformat()
 # Prefer lxml if available; html.parser is the fallback
 try:
    # Imported only to probe availability; the name itself is unused (hence noqa).
    import lxml  # type: ignore[import-untyped]  # noqa: F401
    BS4_PARSER = "lxml"
 except ImportError:
    BS4_PARSER = "html.parser"
 # ---------------------------------------------------------------------------
 # Part-of-speech mappings
 # ---------------------------------------------------------------------------
 # English part-of-speech label → Hebrew label with nikkud.
 # "Numeral" and "Cardinal numeral" intentionally share one Hebrew label.
 POS_HEBREW: dict[str, str] = {
    "Noun": "שֵׁם עֶצֶם",
    "Verb": "פֹּעַל",
    "Adjective": "שֵׁם תֹּאַר",
    "Adverb": "תֹּאַר הַפֹּעַל",
    "Pronoun": "כִּנּוּי גּוּף",
    "Preposition": "מִילַּת יַחַס",
    "Conjunction": "מִילַּת חִבּוּר",
    "Interjection": "מִילַּת קְרִיאָה",
    "Numeral": "שֵׁם מִסְפָּר",
    "Cardinal numeral": "שֵׁם מִסְפָּר",
    "Particle": "מִילִּית",
    "Determiner": "מְגַדִּיר",
    "Existential": "מִילַּת קִיּוּם",
    "Interrogative": "מִילַּת שְׁאֵלָה",
 }
 # Use exact match on the POS string prefix; longer keys must be checked first.
 # Sorted by descending key length so that a longer key would win over a
 # shorter key that happens to be its prefix.
 POS_HEBREW_ORDERED: list[tuple[str, str]] = sorted(POS_HEBREW.items(), key=lambda x: -len(x[0]))
 # English binyan name → Hebrew binyan name with nikkud.
 BINYAN_HEBREW: dict[str, str] = {
    "Pa'al": "פָּעַל",
    "Nif'al": "נִפְעַל",
    "Pi'el": "פִּיעֵל",
    "Pu'al": "פֻּעַל",
    "Hif'il": "הִפְעִיל",
    "Huf'al": "הֻפְעַל",
    "Hitpa'el": "הִתְפַּעֵל",
 }
 # Regex for extracting emoji characters
 # The listed ranges overlap; the effective coverage is U+2600–U+27BF plus
 # U+1F000–U+1FFFF.  Variation selectors (U+FE0F) and zero-width joiners
 # (U+200D) are outside these ranges, so they are neither matched nor removed.
 EMOJI_RE = re.compile(
    r"[\U0001F300-\U0001FFFF\U00002600-\U000027BF\U0001F000-\U0001F9FF\u2600-\u26FF\u2700-\u27BF]+",
    re.UNICODE,
 )
 # Fields that must never be overwritten when updating an existing entry
 # (GUIDs, frequency data, enrichment and per-PoS inflection sections).
 PROTECTED_FIELDS = frozenset(
    [
        "vocab_legacy_guid",
        "confusables_guid",
        "frequency",
        "pseudo_frequency",
        "emoji",
        "emoji_source",
        "emoji_visible",
        "image",
        "image_source",
        "hint",
        "examples",
        "noun_inflection",
        "conjugation",
        "adjective_inflection",
        "preposition_inflection",
    ]
 )
 # ---------------------------------------------------------------------------
 # Logging
 # ---------------------------------------------------------------------------
 # NOTE: this configures the root logger as a side effect of importing the module.
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
 )
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # HTTP session
 # ---------------------------------------------------------------------------
 # One module-level session, created at import time, with a custom User-Agent.
 session = requests.Session()
 session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki-scraper/1.0)"})
 # ---------------------------------------------------------------------------
 # Default entry template
 # ---------------------------------------------------------------------------
 def _default_entry() -> dict:
    """Return a fresh entry with all fields initialised to safe defaults."""
    return {
        "word": {"nikkud": "", "ktiv_male": ""},
        "slug": "",
        "root": [],
        "pos": "",
        "pos_hebrew": "",
        "meaning": "",
        "meaning_raw": "",
        "audio_url": "",
        "audio_file": "",
        "tags": "",
        "last_scrape_date": "",
        "vocab_legacy_guid": None,
        "frequency": None,
        "pseudo_frequency": None,
        "emoji": None,
        "emoji_source": None,
        "emoji_visible": False,
        "image": None,
        "image_source": None,
        "hint": "",
        "shared_roots": [],
        "confusable_group": None,
        "confusables_guid": None,
        "examples": None,
        "noun_inflection": None,
        "conjugation": None,
        "adjective_inflection": None,
        "preposition_inflection": None,
    }
 # ---------------------------------------------------------------------------
 # Parsing helpers
 # ---------------------------------------------------------------------------
 def _extract_emoji(text: str) -> str | None:
    """Find the first consecutive run of emoji characters in *text*.

    Returns the matched run, or None when *text* contains no emoji.
    """
    found = EMOJI_RE.search(text)
    if found is None:
        return None
    return found.group(0)
 def _clean_meaning(raw: str) -> str:
    """Remove emoji from *raw* and collapse whitespace runs to single spaces."""
    without_emoji = EMOJI_RE.sub("", raw)
    # split() with no argument also trims leading/trailing whitespace.
    tokens = without_emoji.split()
    return " ".join(tokens)
 def _parse_pos(pos_raw: str) -> tuple[str, str]:
    """
    Split a raw part-of-speech cell into (pos_en, pos_hebrew).

    The English part is the first key of POS_HEBREW_ORDERED that prefixes the
    text; if none does, it is the text before the first " – " / " - ".
    For verbs, the Hebrew binyan name is appended when it is recognised.

    Examples:
        "Noun – masculine"        → ("Noun", "שֵׁם עֶצֶם")
        "Verb – pa'al"            → ("Verb", "פֹּעַל — פָּעַל")
        "Cardinal numeral"        → ("Cardinal numeral", "שֵׁם מִסְפָּר")
    """
    text = pos_raw.strip()
    base = next((name for name, _ in POS_HEBREW_ORDERED if text.startswith(name)), "")
    if not base:
        # Unknown PoS: keep whatever precedes the first dash separator.
        base = text.split(" – ")[0].split(" - ")[0].strip()
    hebrew = POS_HEBREW.get(base, base)
    if base != "Verb":
        return base, hebrew
    # Verbs look like "Verb – pa'al"; the binyan is the second dash-separated part.
    segments = re.split(r"\s*[–-]\s*", text)
    if len(segments) < 2:
        return base, hebrew
    binyan_text = segments[1].strip()
    wanted = binyan_text.lower()
    # Default lookup key, replaced by a case-insensitive match when one exists.
    lookup = binyan_text.capitalize()
    for candidate in BINYAN_HEBREW:
        if candidate.lower() == wanted:
            lookup = candidate
            break
    binyan_hebrew = BINYAN_HEBREW.get(lookup)
    if binyan_hebrew:
        hebrew = f"{hebrew} — {binyan_hebrew}"
    return base, hebrew
 def _parse_root(root_raw: str) -> list[str]:
    """
    Convert raw root text to a list of consonants.
    Pealim shows roots as "פ - ע - ל" or "פ.ע.ל" or "—" (no root).
    """
    if not root_raw or root_raw in ("-", "—", "–"):
        return []
    # Split on " - " or "." separators
    parts = re.split(r"\s*[-–—.]\s*", root_raw.strip())
    return [p.strip() for p in parts if p.strip()]
 def _build_tags(pos_en: str, root: list[str]) -> str:
    """
    Generate Anki tags string matching the existing project convention.
    Examples:
        pos=Noun, root=[]         → "שם_עצם"
        pos=Noun, root=["א","ב"] → "שורש::אב שם_עצם"
        pos=Verb, root=["שמר"]   → "שורש::שמר פעלים"
    """
    pos_tag_map = {
        "Noun": "שם_עצם",
        "Verb": "פעלים",
        "Adjective": "שם_תואר",
        "Adverb": "תוארי_הפועל",
        "Pronoun": "כינויי_גוף",
        "Preposition": "מילות_יחס",
        "Conjunction": "מילות_חיבור",
        "Particle": "מילית",
        "Numeral": "שם_מספר",
        "Cardinal numeral": "שם_מספר",
        "Determiner": "מגדיר",
        "Existential": "מילת_קיום",
        "Interrogative": "מילת_שאלה",
        "Interjection": "מילת_קריאה",
    }
    parts: list[str] = []
    if root:
        root_str = "".join(strip_nikkud(c) for c in root)
        parts.append(f"שורש::{root_str}")
    pos_heb_tag = pos_tag_map.get(pos_en, "")
    if pos_heb_tag:
        parts.append(pos_heb_tag)
    return " ".join(parts)
 def _compute_audio_file(slug: str, ktiv_male: str) -> str:
    """
    Return the local audio filename for an entry.
    The actual confusable detection happens later (after all pages are scraped);
    here we store a placeholder that post_process() will correct.
    We default to the consonant-based name; confusables get slug-based names.
    """
    consonants = strip_nikkud(ktiv_male) if ktiv_male else ""
    return f"{consonants}.mp3" if consonants else f"{slug}.mp3"
 # ---------------------------------------------------------------------------
 # Page parsing
 # ---------------------------------------------------------------------------
 def _parse_mo_page(html: bytes) -> list[dict]:
    """
    Extract the dictionary rows from a hebstyle=mo (nikkud) list page.

    Table rows with fewer than four cells, and rows whose word cell is empty,
    are skipped.  Each remaining row becomes a dict with keys:
        nikkud, slug, root_raw, pos_raw, meaning_raw, audio_url
    """
    parsed: list[dict] = []
    document = BeautifulSoup(html, BS4_PARSER)
    for table_row in document.select("table tr"):
        cells = table_row.find_all("td")
        if len(cells) < 4:
            continue
        head = cells[0]
        # Audio URL lives in a data-audio attribute inside the first cell.
        audio_holder = head.find(attrs={"data-audio": True})
        audio_url: str = audio_holder["data-audio"] if audio_holder else ""
        # The slug is the path segment after /dict/ in the word's link.
        slug = ""
        anchor = head.find("a", href=True)
        if anchor and (hit := re.search(r"/dict/([^/]+)/", anchor["href"])):
            slug = hit.group(1)
        # Prefer the dedicated "menukad" span; otherwise use the whole cell text.
        word_node = head.find("span", class_="menukad") or head
        nikkud = word_node.get_text(strip=True)
        if not nikkud:
            continue
        parsed.append(
            {
                "nikkud": nikkud,
                "slug": slug,
                "root_raw": cells[1].get_text(strip=True),
                "pos_raw": cells[2].get_text(strip=True),
                "meaning_raw": cells[3].get_text(strip=True),
                "audio_url": audio_url,
            }
        )
    return parsed
 def _parse_vl_words(html: bytes) -> list[str]:
    """
    Extract the word column from a hebstyle=vl (ktiv male) list page.

    Returns one string per table row that has at least four cells, in page
    order.  Rows with an empty word are kept (as empty strings).
    """
    document = BeautifulSoup(html, BS4_PARSER)
    spellings: list[str] = []
    for table_row in document.select("table tr"):
        cells = table_row.find_all("td")
        if len(cells) >= 4:
            # Prefer the dedicated "menukad" span; otherwise use the whole cell text.
            word_node = cells[0].find("span", class_="menukad") or cells[0]
            spellings.append(word_node.get_text(strip=True))
    return spellings
 # ---------------------------------------------------------------------------
 # words.json I/O
 # ---------------------------------------------------------------------------
 def _load_words() -> dict:
    """Return the parsed contents of words.json, or an empty dict when the file is absent."""
    if WORDS_JSON.exists():
        with WORDS_JSON.open(encoding="utf-8") as source:
            return json.load(source)
    logger.info("data/words.json not found — starting fresh.")
    return {}
 def _save_words(words: dict) -> None:
    """Write *words* to a sibling .json.tmp file, then swap it over words.json."""
    staging_path = WORDS_JSON.with_suffix(".json.tmp")
    with staging_path.open("w", encoding="utf-8") as sink:
        # ensure_ascii=False keeps non-ASCII text readable in the file instead of \u escapes.
        json.dump(words, sink, ensure_ascii=False, indent=2)
    # The real file is only replaced after the temp file has been fully written and closed.
    os.replace(staging_path, WORDS_JSON)
    entry_count = len(words)
    logger.info("Saved data/words.json (%d entries)", entry_count)
 # ---------------------------------------------------------------------------
 # Progress tracking
 # ---------------------------------------------------------------------------
 def _load_progress() -> set[int]:
    """Read the progress file and return its "completed_pages" as a set (empty if no file)."""
    if PROGRESS_JSON.exists():
        payload = json.loads(PROGRESS_JSON.read_text(encoding="utf-8"))
        return set(payload.get("completed_pages", []))
    return set()
 def _save_progress(completed: set[int]) -> None:
    """Persist the finished page numbers (sorted) via a temp file swapped into place."""
    staging_path = PROGRESS_JSON.with_suffix(".json.tmp")
    serialized = json.dumps({"completed_pages": sorted(completed)})
    staging_path.write_text(serialized, encoding="utf-8")
    # Replace the real file only once the temp file is complete.
    os.replace(staging_path, PROGRESS_JSON)
 # ---------------------------------------------------------------------------
 # Unique key generation
 # ---------------------------------------------------------------------------
 def _make_unique_key(nikkud: str, pos_en: str, meaning: str, existing_keys: set[str]) -> str:
    """
    Generate a collision-free unique key for a new entry.
    Escalation:
        1. nikkud
        2. nikkud|pos_en
        3. nikkud|pos_en|meaning
        4. nikkud|pos_en|meaning|N  (N = 2, 3, …)
    """
    candidate = nikkud
    if candidate not in existing_keys:
        return candidate
    candidate = f"{nikkud}|{pos_en}"
    if candidate not in existing_keys:
        return candidate
    candidate = f"{nikkud}|{pos_en}|{meaning}"
    if candidate not in existing_keys:
        return candidate
    n = 2
    while True:
        candidate = f"{nikkud}|{pos_en}|{meaning}|{n}"
        if candidate not in existing_keys:
            return candidate
        n += 1
 # ---------------------------------------------------------------------------
 # Core: merge one scraped row into words dict
 # ---------------------------------------------------------------------------
 def _merge_row(
    words: dict,
    slug_index: dict[str, str],
    nikkud: str,
    ktiv_male: str,
    slug: str,
    root_raw: str,
    pos_raw: str,
    meaning_raw_raw: str,
    audio_url: str,
 ) -> None:
    """
    Upsert a single scraped list-page row into *words* in-place.

    An existing entry is located through *slug_index* (slug → unique_key).
    When one is found, only the list-level fields assigned below are
    overwritten; every other field on the entry (including emoji) is left
    as it is. Otherwise a fresh entry is built from ``_default_entry()``,
    stored under a new collision-free key and, if the row has a slug,
    registered in *slug_index*.

    Args:
        words: unique_key → entry mapping; mutated in place.
        slug_index: slug → unique_key lookup; updated when an entry is created.
        nikkud: Word as scraped from the hebstyle=mo page.
        ktiv_male: Word as scraped from the hebstyle=vl page ("" if unavailable).
        slug: Dictionary slug of the row ("" if none was found).
        root_raw: Raw text of the root cell.
        pos_raw: Raw text of the part-of-speech cell.
        meaning_raw_raw: Raw text of the meaning cell, stored verbatim as
            "meaning_raw" and also used to derive "meaning" and "emoji".
        audio_url: Audio URL of the row ("" if none).
    """
    # Derived fields
    pos_en, pos_heb = _parse_pos(pos_raw)
    root = _parse_root(root_raw)
    meaning = _clean_meaning(meaning_raw_raw)
    emoji = _extract_emoji(meaning_raw_raw)
    tags = _build_tags(pos_en, root)
    audio_file = _compute_audio_file(slug, ktiv_male)

    # ---- locate existing entry (rows without a slug can never match one) ----
    unique_key: str | None = slug_index.get(slug) if slug else None
    is_new = not (unique_key and unique_key in words)
    if is_new:
        # A dict keys view already offers O(1) membership tests, so there is
        # no need to copy every key into a fresh set for each new row.
        unique_key = _make_unique_key(nikkud, pos_en, meaning, words.keys())
        entry = _default_entry()
    else:
        entry = words[unique_key]

    # ---- list-level fields, written for new and existing entries alike ----
    entry["word"]["nikkud"] = nikkud
    entry["word"]["ktiv_male"] = ktiv_male
    entry["slug"] = slug
    entry["root"] = root
    entry["pos"] = pos_en
    entry["pos_hebrew"] = pos_heb
    entry["meaning"] = meaning
    entry["meaning_raw"] = meaning_raw_raw
    if is_new:
        # Emoji is only seeded at creation; existing entries keep their value.
        entry["emoji"] = emoji
        entry["emoji_source"] = "from_pealim" if emoji else None
    entry["audio_url"] = audio_url
    entry["audio_file"] = audio_file
    entry["tags"] = tags
    entry["last_scrape_date"] = TODAY

    if is_new:
        # Register only after the entry is fully populated.
        words[unique_key] = entry
        if slug:
            slug_index[slug] = unique_key
 # ---------------------------------------------------------------------------
 # Post-processing: recompute confusable_group, shared_roots, audio_file
 # ---------------------------------------------------------------------------
 def _post_process(words: dict) -> None:
    """
    Recompute the cross-entry fields of every entry in *words*, in place.

    - confusable_group: sorted keys of all entries sharing this entry's
      ktiv_male when there are 2+ of them; otherwise reset to None unless
      the entry has a truthy "confusables_guid".
    - audio_file: "<slug>.mp3" for confusables that have a slug; for
      non-confusables "<consonants>.mp3" (ktiv_male passed through
      strip_nikkud), falling back to "<slug>.mp3" when that is empty.
    - shared_roots: sorted keys of the other entries with the same root,
      or [] when the entry has no root.
    """
    logger.info("Post-processing: recomputing confusable groups and shared roots...")

    # --- confusable groups ---
    keys_by_ktiv: dict[str, list[str]] = {}
    for word_key, item in words.items():
        spelling = item.get("word", {}).get("ktiv_male", "")
        if not spelling:
            continue
        keys_by_ktiv.setdefault(spelling, []).append(word_key)

    for item in words.values():
        spelling = item.get("word", {}).get("ktiv_male", "")
        lookalikes = keys_by_ktiv.get(spelling, [])
        slug = item.get("slug", "")
        if len(lookalikes) >= 2:
            item["confusable_group"] = sorted(lookalikes)
            # Confusable → slug-based audio filename (left untouched without a slug)
            if slug:
                item["audio_file"] = f"{slug}.mp3"
            continue
        # Only clear confusable_group if it wasn't set by enrichment (i.e. no confusables_guid)
        if not item.get("confusables_guid"):
            item["confusable_group"] = None
        # Non-confusable → consonant-based audio filename
        consonants = strip_nikkud(spelling) if spelling else ""
        item["audio_file"] = f"{consonants}.mp3" if consonants else f"{slug}.mp3"

    # --- shared roots ---
    keys_by_root: dict[str, list[str]] = {}
    for word_key, item in words.items():
        radicals = item.get("root")
        if radicals:
            # "|"-joined root is the canonical form used for grouping
            keys_by_root.setdefault("|".join(radicals), []).append(word_key)

    for word_key, item in words.items():
        radicals = item.get("root")
        if not radicals:
            item["shared_roots"] = []
            continue
        same_root = keys_by_root.get("|".join(radicals), [])
        relatives = [other for other in same_root if other != word_key]
        relatives.sort()
        item["shared_roots"] = relatives

    logger.info("Post-processing complete.")
 # ---------------------------------------------------------------------------
 # Scraping loop
 # ---------------------------------------------------------------------------
 def _build_slug_index(words: dict) -> dict[str, str]:
    """Build slug → unique_key lookup from the current words dict."""
    index: dict[str, str] = {}
    for key, entry in words.items():
        slug = entry.get("slug", "")
        if slug and slug not in index:
            index[slug] = key
    return index
 def _fetch_page(url: str, cookies: dict) -> bytes | None:
    """
    GET *url* with the given cookies through the shared session.

    Returns the raw response body, or None when the request raises a
    requests.RequestException (including an HTTP error status); the
    failure is logged at error level.
    """
    try:
        response = session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        payload = response.content
    except requests.RequestException as err:
        logger.error("Request failed for %s: %s", url, err)
        return None
    return payload
 def run_scrape(total_pages: int | None, force_refresh: bool) -> None:
    """
    Main scrape loop: fetch list pages, merge them into words.json, post-process.

    Every page is requested twice with different cookies — hebstyle=mo for
    nikkud/slug/audio and hebstyle=vl for ktiv male — and the two row lists
    are paired by position. Words and progress are written every SAVE_EVERY
    successfully merged pages and once more at the end. Pages whose fetch
    fails are not marked complete, so a later run picks them up again.

    Args:
        total_pages: Number of list pages to scrape. ``None`` falls back to
            DEFAULT_TOTAL_PAGES, so a caller without an explicit page limit
            can pass its "no limit" value straight through.
        force_refresh: If True, ignore progress file and re-scrape all pages.
    """
    if total_pages is None:
        total_pages = DEFAULT_TOTAL_PAGES

    words = _load_words()
    slug_index = _build_slug_index(words)

    if force_refresh:
        # The saved progress is read only to report how much is being
        # discarded; an unreadable progress file must not block a refresh.
        try:
            discarded = len(_load_progress())
        except (OSError, ValueError):
            discarded = 0
        if discarded:
            logger.info("--force-refresh: ignoring %d completed pages.", discarded)
        completed: set[int] = set()
    else:
        completed = _load_progress()

    pages_to_do = [p for p in range(1, total_pages + 1) if p not in completed]
    logger.info(
        "Pages to scrape: %d / %d (already done: %d)",
        len(pages_to_do),
        total_pages,
        len(completed),
    )

    pages_since_save = 0
    failed_pages: list[int] = []
    for page_num in pages_to_do:
        url = f"{PEALIM_DICT_URL}?page={page_num}"
        logger.info("Scraping page %d / %d …", page_num, total_pages)

        # --- hebstyle=mo (nikkud + audio + slug) ---
        mo_html = _fetch_page(url, {"translit": "none", "hebstyle": "mo"})
        if mo_html is None:
            logger.warning("Skipping page %d (mo fetch failed).", page_num)
            failed_pages.append(page_num)
            time.sleep(REQUEST_DELAY * 2)
            continue
        time.sleep(REQUEST_DELAY)

        # --- hebstyle=vl (ktiv male) ---
        vl_html = _fetch_page(url, {"translit": "none", "hebstyle": "vl"})
        if vl_html is None:
            logger.warning("Skipping page %d (vl fetch failed).", page_num)
            failed_pages.append(page_num)
            time.sleep(REQUEST_DELAY * 2)
            continue

        # Parse
        mo_rows = _parse_mo_page(mo_html)
        vl_words = _parse_vl_words(vl_html)
        if not mo_rows:
            logger.warning("Page %d returned no rows — might be past end.", page_num)
            completed.add(page_num)
            _save_progress(completed)
            time.sleep(REQUEST_DELAY)
            continue

        # Rows are paired purely by position, so differing counts mean some
        # ktiv male values may end up on the wrong word. Make that visible.
        if len(vl_words) != len(mo_rows):
            logger.warning(
                "Page %d: row count mismatch (mo=%d, vl=%d) — ktiv male values may be misaligned.",
                page_num,
                len(mo_rows),
                len(vl_words),
            )

        # Merge each row
        for i, row in enumerate(mo_rows):
            ktiv_male = vl_words[i] if i < len(vl_words) else ""
            _merge_row(
                words=words,
                slug_index=slug_index,
                nikkud=row["nikkud"],
                ktiv_male=ktiv_male,
                slug=row["slug"],
                root_raw=row["root_raw"],
                pos_raw=row["pos_raw"],
                meaning_raw_raw=row["meaning_raw"],
                audio_url=row["audio_url"],
            )
        completed.add(page_num)
        pages_since_save += 1

        # Incremental save every SAVE_EVERY pages
        if pages_since_save >= SAVE_EVERY:
            _save_words(words)
            _save_progress(completed)
            pages_since_save = 0
        time.sleep(REQUEST_DELAY)

    # Final save + post-processing
    if failed_pages:
        logger.warning(
            "%d page(s) could not be fetched and remain pending: %s",
            len(failed_pages),
            failed_pages,
        )
        logger.info("Scrape loop finished. Running post-processing…")
    else:
        logger.info("All pages scraped. Running post-processing…")
    _post_process(words)
    _save_words(words)
    _save_progress(completed)
    logger.info("Done. Total entries in words.json: %d", len(words))
 # ---------------------------------------------------------------------------
 # CLI
 # ---------------------------------------------------------------------------
 def main() -> None:
    """Parse the command-line flags and launch the list-page scrape."""
    cli = argparse.ArgumentParser(description="Scrape pealim.com list pages into data/words.json.")
    cli.add_argument(
        "--test",
        metavar="N",
        type=int,
        default=None,
        help="Scrape only the first N pages (for testing).",
    )
    cli.add_argument(
        "--force-refresh",
        action="store_true",
        default=False,
        help="Re-scrape all pages, ignoring existing progress.",
    )
    options = cli.parse_args()

    # Without --test, the full default page range is scraped.
    page_count = DEFAULT_TOTAL_PAGES if options.test is None else options.test
    refresh = options.force_refresh
    logger.info(
        "Starting pealim list scraper | pages=%d | force=%s | parser=%s",
        page_count,
        refresh,
        BS4_PARSER,
    )
    run_scrape(total_pages=page_count, force_refresh=refresh)
 # Run the scraper only when this file is executed as a script, not when it is imported.
 if __name__ == "__main__":
    main()
--- a/pyproject.toml
+++ b/pyproject.toml
@ -25,6 +25,9 @@ dev = [
 [tool.pytest.ini_options]
 testpaths = ["tests"]
 markers = [
    "integration: marks tests that hit the real pealim.com network (deselect with -m 'not integration')",
 ]
 [tool.ruff]
 target-version = "py311"
--- a/run.py
+++ b/run.py
@ -7,10 +7,10 @@ Usage:
 Options:
  --only {vocab,conjugations,confusables,plurals,complete}  Run only one deck
-  --skip-scrape        Use existing data/pealim_dict.csv (no pealim.com dict scraping)
+  --skip-scrape        Skip list page scraping (use existing words.json)
  --skip-detail        Skip detail page scraping
  --skip-audio         Skip audio .mp3 downloads
  --skip-examples      Skip Ben Yehuda example fetching
  --skip-conjugations  Skip verb conjugation extraction
  --skip-images        Skip image fetching for concrete nouns
  --refresh-examples   Force rebuild of Ben Yehuda index (delete old, download nikkud corpus)
  --test N             Process only the first N dictionary words (for quick testing)
@ -21,7 +21,6 @@ import json
 import logging
 import re
 import sys
 import time
 from pathlib import Path
 from helpers import strip_nikkud
@ -39,6 +38,7 @@ OUTPUT_DIR = Path(__file__).parent / "output"
 AUDIO_DIR = DATA_DIR / "audio"
 AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
 FONTS_DIR = DATA_DIR / "fonts"
 WORDS_JSON = DATA_DIR / "words.json"
 def parse_args():
@ -48,47 +48,31 @@ def parse_args():
        choices=["vocab", "conjugations", "confusables", "plurals", "complete"],
        help="Run only one deck (skips all unrelated steps)",
    )
-    p.add_argument("--skip-scrape", action="store_true", help="Skip dict scraping; use cached CSV")
+    p.add_argument("--skip-scrape", action="store_true", help="Skip list page scraping")
    p.add_argument("--skip-detail", action="store_true", help="Skip detail page scraping")
    p.add_argument("--skip-audio", action="store_true", help="Skip audio downloads")
    p.add_argument("--skip-examples", action="store_true", help="Skip Ben Yehuda example lookup")
    p.add_argument(
        "--skip-conjugations",
        action="store_true",
        help="Skip verb conjugation extraction (deprecated: use --only vocab)",
    )
    p.add_argument("--skip-images", action="store_true", help="Skip image fetching")
    p.add_argument("--refresh-examples", action="store_true", help="Force rebuild of Ben Yehuda index")
    p.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
    return p.parse_args()
-def step_scrape(args):
+def step_list_scrape(args):
-    """Step 1 — scrape or load dictionary."""
+    """Step 1 — scrape pealim.com list pages → words.json."""
    dict_csv = DATA_DIR / "hebrew_dict.csv"
    anki_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
    # Legacy fallback names
    legacy_dict = DATA_DIR / "pealim_dict.csv"
    if args.skip_scrape:
-        if dict_csv.exists():
+        if WORDS_JSON.exists():
-            logger.info(f"[1] Using existing {dict_csv}")
+            logger.info("[1] Using existing words.json (--skip-scrape)")
        elif legacy_dict.exists():
            logger.info(f"[1] Using legacy {legacy_dict} (consider renaming)")
        else:
-            logger.error(f"[1] --skip-scrape set but {dict_csv} not found. Aborting.")
+            logger.error(f"[1] --skip-scrape set but {WORDS_JSON} not found. Aborting.")
            sys.exit(1)
        return
-    logger.info("[1] Scraping dictionary from pealim.com …")
+    logger.info("[1] Scraping dictionary list pages from pealim.com …")
    import pealim_list_scrape
-    import hebrew_extract
+    total_pages = args.test if args.test else None
-
+    pealim_list_scrape.run_scrape(total_pages=total_pages, force_refresh=False)
    df = hebrew_extract.extract_from_website()
    df.to_csv(dict_csv, index=True)
    logger.info(f"    Saved {len(df)} words → {dict_csv}")
    df = hebrew_extract.modify_for_anki(df)
    df.to_csv(anki_csv, sep=";", index=True)
    logger.info(f"    Saved Anki CSV → {anki_csv}")
 def step_frequency() -> dict[str, int]:
@ -100,7 +84,7 @@ def step_frequency() -> dict[str, int]:
    return frequency_lookup._freq
-def step_examples(args, freq_cache: dict):
+def step_examples(args, _freq_cache: dict):
    """Step 3 — load/build Ben Yehuda example index."""
    if args.skip_examples:
        logger.info("[3] Skipping examples (--skip-examples)")
@ -115,255 +99,100 @@ def step_examples(args, freq_cache: dict):
    benyehuda.load(force_rebuild=args.refresh_examples)
-    dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
+    # Read word list from words.json instead of CSV
-    if not dict_csv.exists():
+    if not WORDS_JSON.exists():
-        dict_csv = DATA_DIR / "hebrew_dict.csv"
+        logger.warning("[3] words.json not found, skipping examples")
-    if not dict_csv.exists():
+        return {}
        dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    if not dict_csv.exists():
        dict_csv = DATA_DIR / "pealim_dict.csv"
-    try:
+    with open(WORDS_JSON, encoding="utf-8") as f:
-        import pandas as pd
+        words = json.load(f)
-        try:
+    entries = list(words.values())
-            df = pd.read_csv(dict_csv, sep=";", index_col=0)
+    if args.test:
-            if df.shape[1] < 3:
+        entries = entries[: args.test]
                raise ValueError("too few columns")
        except (ValueError, pd.errors.ParserError):
            df = pd.read_csv(dict_csv, index_col=0)
-        if args.test:
+    # Build confusable consonant set from words.json
-            df = df.head(args.test)
+    consonant_counts: dict[str, int] = {}
    for entry in entries:
        ktiv_male = entry.get("word", {}).get("ktiv_male", "")
        if ktiv_male:
            safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(ktiv_male))
            if safe:
                consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
    confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
-        # Build confusable consonant set from CSV
+    # Delete stale cache entries for confusable words so they get re-fetched
-        consonant_counts: dict[str, int] = {}
+    stale_deleted = 0
-        for _, row in df.iterrows():
+    for entry in entries:
-            word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
+        word_nikkud = entry.get("word", {}).get("nikkud", "")
-            if word_no_nik and word_no_nik not in ("nan", "None"):
+        ktiv_male = entry.get("word", {}).get("ktiv_male", "")
-                safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_no_nik))
+        if word_nikkud and ktiv_male:
-                if safe:
+            safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(ktiv_male))
-                    consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
+            if safe in confusable_consonants and word_nikkud in benyehuda._examples_cache:
-        confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
+                del benyehuda._examples_cache[word_nikkud]
                stale_deleted += 1
    if stale_deleted:
        logger.info(f"    Deleted {stale_deleted} stale confusable cache entries")
-        # Delete stale cache entries for confusable words so they get re-fetched
+    logger.info(f"    Pre-fetching examples for {len(entries)} words …")
-        stale_deleted = 0
+    for entry in entries:
-        for _, row in df.iterrows():
+        word_nikkud = entry.get("word", {}).get("nikkud", "")
-            word_nikkud = str(row.get("Word", "")).strip()
+        if word_nikkud:
-            word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
+            benyehuda.get_examples(word_nikkud, confusable_consonants=confusable_consonants)
            if word_nikkud and word_no_nik:
                safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_no_nik))
                if safe in confusable_consonants and word_nikkud in benyehuda._examples_cache:
                    del benyehuda._examples_cache[word_nikkud]
                    stale_deleted += 1
        if stale_deleted:
            logger.info(f"    Deleted {stale_deleted} stale confusable cache entries")
        logger.info(f"    Pre-fetching examples for {len(df)} words …")
        for _, row in df.iterrows():
            # Use nikkud word form as primary key (nikkud corpus)
            word_nikkud = str(row.get("Word", "")).strip()
            if word_nikkud:
                benyehuda.get_examples(word_nikkud, confusable_consonants=confusable_consonants)
    except Exception as e:
        logger.warning(f"    Could not pre-fetch all examples: {e}")
    benyehuda.save_examples_cache()
    return benyehuda._examples_cache
-def step_audio(args):
+def step_detail_scrape(args):
-    """Step 4 — download vocabulary audio .mp3 files from audio_url column in CSV."""
+    """Step 4 — scrape detail pages for nouns and verbs → update words.json."""
-    if args.skip_audio:
+    if args.skip_detail:
-        logger.info("[4] Skipping audio (--skip-audio)")
+        logger.info("[4] Skipping detail scrape (--skip-detail)")
        return
-    logger.info("[4] Downloading vocabulary audio files …")
+    logger.info("[4] Scraping detail pages from pealim.com …")
    import pealim_detail_scrape
-    dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
+    test_limit = args.test if args.test else None
-    if not dict_csv.exists():
+    pealim_detail_scrape.run(test=test_limit, force_refresh=False)
        dict_csv = DATA_DIR / "hebrew_dict.csv"
    if not dict_csv.exists():
        dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    if not dict_csv.exists():
        dict_csv = DATA_DIR / "pealim_dict.csv"
    import pandas as pd
    import requests
    try:
        try:
            df = pd.read_csv(dict_csv, sep=";", index_col=0)
            if df.shape[1] < 3:
                raise ValueError("too few columns")
        except (ValueError, pd.errors.ParserError):
            df = pd.read_csv(dict_csv, index_col=0)
        if "audio_url" not in df.columns:
            logger.warning("    No audio_url column in CSV — re-scrape with hebrew_extract.py to capture audio URLs")
            return
        if args.test:
            df = df.head(args.test)
        # Build confusable set: consonant forms that appear more than once
        confusable_consonants: set[str] = set()
        consonant_counts: dict[str, int] = {}
        for _, row in df.iterrows():
            word_plain = str(row.get("Word Without Nikkud", "")).strip()
            if word_plain and word_plain not in ("nan", "None"):
                safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_plain))
                if safe:
                    consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
        confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
        AUDIO_DIR.mkdir(parents=True, exist_ok=True)
        downloaded = 0
        skipped = 0
        no_url = 0
        for _, row in df.iterrows():
            word = str(row.get("Word", "")).strip()
            word_plain = str(row.get("Word Without Nikkud", "")).strip()
            audio_url = str(row.get("audio_url", "")).strip()
            slug = str(row.get("slug", "")).strip()
            if not word:
                continue
            safe_name = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_plain or word))
            if not safe_name:
                continue
            # Confusable words: use slug-based filename to avoid collisions
            if safe_name in confusable_consonants and slug and slug not in ("nan", "None"):
                mp3_path = AUDIO_DIR / f"{slug}.mp3"
            else:
                mp3_path = AUDIO_DIR / f"{safe_name}.mp3"
            if mp3_path.exists():
                skipped += 1
                continue
            if not audio_url or audio_url in ("nan", "None", ""):
                no_url += 1
                continue
            try:
                resp = requests.get(audio_url, timeout=10)
                resp.raise_for_status()
                mp3_path.write_bytes(resp.content)
                downloaded += 1
                time.sleep(0.3)
            except Exception as e:
                logger.debug(f"    Audio download failed for {word}: {e}")
        logger.info(f"    Audio: {downloaded} downloaded, {skipped} already cached, {no_url} without URL")
    except Exception as e:
        logger.warning(f"    Audio step failed: {e}")
-def step_conj_audio(args, conjugations: dict):
+def step_audio_download(args):
-    """Step 4b — download conjugation audio .mp3 files."""
+    """Step 5 — download audio .mp3 files from URLs in words.json."""
    if args.skip_audio:
-        logger.info("[4b] Skipping conjugation audio (--skip-audio)")
+        logger.info("[5] Skipping audio (--skip-audio)")
        return
-    logger.info("[4b] Downloading conjugation audio files …")
+    logger.info("[5] Downloading audio files …")
-    AUDIO_CONJ_DIR.mkdir(parents=True, exist_ok=True)
+    import pealim_audio_download
-    import requests
+    test_limit = args.test if args.test else None
-
+    pealim_audio_download.run(test=test_limit)
    downloaded = 0
    skipped = 0
    failed = 0
    for _infinitive, data in conjugations.items():
        if not data or not data.get("forms"):
            continue
        slug = data.get("slug", "")
        if not slug:
            continue
        # Active forms
        for form_key, form_data in data["forms"].items():
            audio_url = form_data.get("audio_url", "")
            if not audio_url:
                continue
            filename = f"{slug}_{form_key}.mp3"
            mp3_path = AUDIO_CONJ_DIR / filename
            if mp3_path.exists():
                skipped += 1
                continue
            try:
                resp = requests.get(audio_url, timeout=10)
                resp.raise_for_status()
                mp3_path.write_bytes(resp.content)
                downloaded += 1
                time.sleep(0.2)
            except Exception as e:
                logger.debug(f"    Conj audio failed {filename}: {e}")
                failed += 1
        # Passive partner forms
        passive = data.get("passive_partner")
        if passive and passive.get("forms"):
            for form_key, form_data in passive["forms"].items():
                audio_url = form_data.get("audio_url", "")
                if not audio_url:
                    continue
                filename = f"{slug}_passive_{form_key}.mp3"
                mp3_path = AUDIO_CONJ_DIR / filename
                if mp3_path.exists():
                    skipped += 1
                    continue
                try:
                    resp = requests.get(audio_url, timeout=10)
                    resp.raise_for_status()
                    mp3_path.write_bytes(resp.content)
                    downloaded += 1
                    time.sleep(0.2)
                except Exception as e:
                    logger.debug(f"    Conj audio failed {filename}: {e}")
                    failed += 1
    logger.info(f"    Conjugation audio: {downloaded} downloaded, {skipped} cached, {failed} failed")
-def step_fonts(args):
+def step_fonts(_args: argparse.Namespace):
-    """Step 4c — download Heebo font files (one-time, cached)."""
+    """Step 6 — download Heebo font files (one-time, cached)."""
    FONTS_DIR.mkdir(parents=True, exist_ok=True)
    regular = FONTS_DIR / "_Heebo-Regular.ttf"
    bold = FONTS_DIR / "_Heebo-Bold.ttf"
    if regular.exists() and bold.exists():
-        logger.info("[4c] Heebo fonts already cached")
+        logger.info("[6] Heebo fonts already cached")
        return
-    logger.info("[4c] Downloading Heebo fonts from Google Fonts …")
+    logger.info("[6] Downloading Heebo fonts from Google Fonts …")
    # Fetch CSS to get actual TTF source URLs (static subset for Hebrew + Latin)
    import requests as _req
-    headers = {
+    headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) Gecko/20100101 Firefox/120.0"}
        # Request TTF (not woff2) so Anki can embed them
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) Gecko/20100101 Firefox/120.0"
    }
    css_url = "https://fonts.googleapis.com/css2?family=Heebo:wght@400;700"
    try:
        css_resp = _req.get(css_url, headers=headers, timeout=15)
        css_resp.raise_for_status()
        css_text = css_resp.text
        # Find all src: url(...) references (may be woff2 for modern UA)
        font_urls = re.findall(r"src:\s*url\(([^)]+)\)", css_text)
        logger.debug(f"    Found {len(font_urls)} font URL(s) in CSS")
        # Prefer TTF; if only woff2 available, download first two and note
        downloaded = []
        for i, fu in enumerate(font_urls[:2]):
            fu = fu.strip("'\"")
            dest = regular if i == 0 else bold
@ -372,128 +201,60 @@ def step_fonts(args):
            fr = _req.get(fu, timeout=15)
            fr.raise_for_status()
            dest.write_bytes(fr.content)
            downloaded.append(dest.name)
            logger.info(f"    Downloaded → {dest.name}")
        if not downloaded:
            logger.info("    All font files already present")
    except Exception as e:
        logger.warning(f"    Heebo download failed: {e}")
        logger.warning("    Cards will fall back to Arial Hebrew / David.")
        logger.warning(
            "    To install manually: download Heebo-Regular.ttf and Heebo-Bold.ttf "
            "from https://fonts.google.com/specimen/Heebo and rename with _ prefix "
            f"into {FONTS_DIR}"
        )
 def step_images(args) -> dict:
-    """Step 4d — fetch images for concrete nouns (resume-safe)."""
+    """Step 7 — fetch images for concrete nouns (resume-safe)."""
    if args.skip_images:
-        logger.info("[4d] Skipping images (--skip-images)")
+        logger.info("[7] Skipping images (--skip-images)")
        cache_path = DATA_DIR / "image_cache.json"
        if cache_path.exists():
            with open(cache_path) as f:
                return json.load(f)
        return {}
-    limit = args.test  # When in test mode, limit images too
+    limit = args.test
-    logger.info("[4d] Fetching images for concrete nouns …")
+    logger.info("[7] Fetching images for concrete nouns …")
    import image_fetch
    return image_fetch.run(limit=limit)
-def step_build_all(
+def step_build_all(args):
-    args, examples_cache: dict, freq_cache: dict, conjugations: dict | None, image_cache: dict | None = None
+    """Step 8 — build all 12 release variants from the unified words.json."""
-):
+    logger.info("[8] Building all deck variants …")
    """Step 5 — build all 6 release variants (4 vocab + 2 conj)."""
    logger.info("[5] Building all deck variants …")
    import apkg_builder
-    dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
+    if not WORDS_JSON.exists():
-    if not dict_csv.exists():
+        logger.error(f"[8] {WORDS_JSON} not found. Run the data pipeline first.")
-        dict_csv = DATA_DIR / "hebrew_dict.csv"
+        sys.exit(1)
    if not dict_csv.exists():
        dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    if not dict_csv.exists():
        dict_csv = DATA_DIR / "pealim_dict.csv"
-    apkg_builder.build_all_variants(
+    with open(WORDS_JSON, encoding="utf-8") as f:
-        dict_csv,
+        words = json.load(f)
-        conjugations=conjugations or {},
+
-        examples_cache=examples_cache,
+    apkg_builder.build_all_variants(words, limit=args.test)
        freq_cache=freq_cache,
        image_cache=image_cache or {},
        limit=args.test,
    )
-def step_conjugations(args):
+def print_summary(_args: argparse.Namespace, examples_cache: dict, freq_cache: dict):
    """Step 6 — extract conjugations (returns data; building handled by step_build_all).
    --skip-conjugations skips re-extraction from pealim.com but still loads
    from cache so conj deck variants are built correctly.
    """
    conj_cache = DATA_DIR / "conjugations.json"
    if args.skip_conjugations:
        if conj_cache.exists():
            logger.info("[6] --skip-conjugations: loading from cache …")
            with open(conj_cache) as f:
                import json as _json
                return _json.load(f)
        logger.info("[6] --skip-conjugations: no cache found, skipping conj decks")
        return None
    verbs_file = Path(__file__).parent / "verbs_input.txt"
    if not verbs_file.exists():
        logger.info("[6] verbs_input.txt not found — skipping conjugation deck")
        return None
    if conj_cache.exists():
        logger.info("[6] Using cached conjugations.json …")
        with open(conj_cache) as f:
            import json as _json
            conjugations = _json.load(f)
    else:
        logger.info("[6] Extracting verb conjugations …")
        import conjugation_extract
        conjugations = conjugation_extract.main(verbs_file)
    # Download conjugation audio
    step_conj_audio(args, conjugations)
    return conjugations
 def print_summary(args, examples_cache, freq_cache, conjugations):
    logger.info("")
    logger.info("=" * 60)
    logger.info("SUMMARY")
    logger.info("=" * 60)
-    dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
+    if WORDS_JSON.exists():
-    if not dict_csv.exists():
+        with open(WORDS_JSON, encoding="utf-8") as f:
-        dict_csv = DATA_DIR / "hebrew_dict.csv"
+            words = json.load(f)
-    if not dict_csv.exists():
+        logger.info(f"  Dictionary words: {len(words)}")
        dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    if not dict_csv.exists():
        dict_csv = DATA_DIR / "pealim_dict.csv"
    if dict_csv.exists():
        import pandas as pd
-        try:
+        nouns = sum(1 for e in words.values() if e.get("pos", "").startswith("Noun"))
-            df = pd.read_csv(dict_csv, sep=";", index_col=0)
+        verbs = sum(1 for e in words.values() if e.get("pos", "").startswith("Verb"))
-            if df.shape[1] < 3:
+        detail_scraped = sum(1 for e in words.values() if e.get("detail_scraped"))
-                raise ValueError("too few columns")
+        logger.info(f"  Nouns: {nouns}, Verbs: {verbs}, Detail-scraped: {detail_scraped}")
        except (ValueError, pd.errors.ParserError):
            df = pd.read_csv(dict_csv, index_col=0)
        logger.info(f"  Dictionary words: {len(df)}")
    logger.info(f"  Frequency entries: {len(freq_cache)}")
    logger.info(f"  Example cache entries: {len(examples_cache)}")
@ -506,8 +267,6 @@ def print_summary(args, examples_cache, freq_cache, conjugations):
        logger.info(f"  Vocabulary audio files: {len(mp3s)}")
    if AUDIO_CONJ_DIR.exists():
        # Count only files that will be bundled: active non-infinitive forms
        # (excludes {slug}_passive_* and {slug}_infinitive.mp3 on-disk extras)
        mp3s = [
            p for p in AUDIO_CONJ_DIR.glob("*.mp3") if not p.stem.endswith("_infinitive") and "_passive_" not in p.stem
        ]
@ -538,9 +297,6 @@ def print_summary(args, examples_cache, freq_cache, conjugations):
        if apkg.exists():
            size_mb = apkg.stat().st_size / 1e6
            logger.info(f"  {apkg.name}: {size_mb:.1f} MB")
    if conjugations:
        verb_count = sum(1 for v in conjugations.values() if v)
        logger.info(f"  Verbs in conjugation deck: {verb_count}")
    logger.info("=" * 60)
    logger.info("DONE")
@ -559,88 +315,73 @@ def main():
        logger.info("  REFRESH EXAMPLES: Ben Yehuda index will be rebuilt")
    logger.info("=" * 60)
    def _load_words_for_only() -> dict:
        if not WORDS_JSON.exists():
            logger.error(f"words.json not found at {WORDS_JSON}. Run the data pipeline first.")
            sys.exit(1)
        with open(WORDS_JSON, encoding="utf-8") as f:
            return json.load(f)
    if args.only == "conjugations":
        step_fonts(args)
-        conjugations = step_conjugations(args)
+        import apkg_builder
        if conjugations:
            import apkg_builder
-            dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
+        words = _load_words_for_only()
-            if not dict_csv.exists():
+        for audio, path in [(False, apkg_builder.CONJ_APKG), (True, apkg_builder.CONJ_APKG_AUDIO)]:
-                dict_csv = DATA_DIR / "hebrew_dict.csv"
+            deck, media = apkg_builder.build_conj_deck(words, include_audio=audio)
-            for audio, path in [(False, apkg_builder.CONJ_APKG), (True, apkg_builder.CONJ_APKG_AUDIO)]:
+            apkg_builder.write_conj_apkg(deck, media, out_path=path)
-                deck, media = apkg_builder.build_conj_deck(
+        print_summary(args, {}, {})
                    conjugations,
                    include_audio=audio,
                    dict_csv=dict_csv,
                )
                apkg_builder.write_conj_apkg(deck, media, out_path=path)
        print_summary(args, {}, {}, conjugations or {})
        return
    if args.only == "confusables":
        step_fonts(args)
        import apkg_builder
-        dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
+        words = _load_words_for_only()
        for audio, path in [(False, apkg_builder.CONF_APKG), (True, apkg_builder.CONF_APKG_AUDIO)]:
-            deck, media = apkg_builder.build_confusables_deck(dict_csv, include_audio=audio)
+            deck, media = apkg_builder.build_confusables_deck(words, include_audio=audio)
            apkg_builder.write_conf_apkg(deck, media, out_path=path)
-        print_summary(args, {}, {}, {})
+        print_summary(args, {}, {})
        return
    if args.only == "plurals":
        step_fonts(args)
        import apkg_builder
-        dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
+        words = _load_words_for_only()
        if not dict_csv.exists():
            dict_csv = DATA_DIR / "hebrew_dict.csv"
        for audio, path in [(False, apkg_builder.PLURAL_APKG), (True, apkg_builder.PLURAL_APKG_AUDIO)]:
-            deck, media = apkg_builder.build_plural_deck(dict_csv=dict_csv, include_audio=audio)
+            deck, media = apkg_builder.build_plural_deck(words, include_audio=audio)
            apkg_builder.write_plural_apkg(deck, media, out_path=path)
-        print_summary(args, {}, {}, {})
+        print_summary(args, {}, {})
        return
    if args.only == "complete":
        step_fonts(args)
        freq_cache = step_frequency() if not args.skip_scrape else {}
        examples_cache = step_examples(args, freq_cache) if not args.skip_examples else {}
        image_cache = step_images(args) if not args.skip_images else {}
        conjugations = step_conjugations(args)
        import apkg_builder
-        dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
+        words = _load_words_for_only()
        if not dict_csv.exists():
            dict_csv = DATA_DIR / "hebrew_dict.csv"
        emoji_lookup = apkg_builder._load_emoji_lookup()
        for audio, path in [(False, apkg_builder.COMPLETE_APKG), (True, apkg_builder.COMPLETE_APKG_AUDIO)]:
            decks, media = apkg_builder.build_complete_deck(
-                dict_csv,
+                words,
                conjugations=conjugations or {},
                examples_cache=examples_cache,
                freq_cache=freq_cache,
                image_cache=image_cache,
                emoji_lookup=emoji_lookup,
                include_audio=audio,
                emoji_lookup=emoji_lookup,
            )
            apkg_builder.write_complete_apkg(decks, media, out_path=path)
-        print_summary(args, examples_cache, freq_cache, conjugations or {})
+        print_summary(args, {}, {})
        return
-    if args.only == "vocab":
+    # Full pipeline
-        args.skip_conjugations = True
+    step_list_scrape(args)
    step_scrape(args)
    freq_cache = step_frequency()
    examples_cache = step_examples(args, freq_cache)
-    step_audio(args)
+    step_detail_scrape(args)
    step_audio_download(args)
    step_fonts(args)
-    image_cache = step_images(args)
+    step_images(args)
-    conjugations = step_conjugations(args)
+    step_build_all(args)
    step_build_all(args, examples_cache, freq_cache, conjugations, image_cache)
-    print_summary(args, examples_cache, freq_cache, conjugations or {})
+    print_summary(args, examples_cache, freq_cache)
 if __name__ == "__main__":
--- a/scripts/check_guid_coverage.py
+++ b/scripts/check_guid_coverage.py
@ -0,0 +1,212 @@
 """Check that every GUID in the last-release complete .apkg exists in words.json.
 Extracts GUIDs from the Anki SQLite database inside the .apkg (zip) file,
 then compares against all GUID fields stored in data/words.json.
 Usage:
    python3 scripts/check_guid_coverage.py
    python3 scripts/check_guid_coverage.py --apkg output/hebrew_complete.apkg
    python3 scripts/check_guid_coverage.py --verbose
 """
 from __future__ import annotations
 import argparse
 import json
 import os
 import sqlite3
 import sys
 import tempfile
 import zipfile
 from pathlib import Path
 from typing import Any
 # Repository root: two levels up from this file's path.
 PROJECT_ROOT = Path(__file__).parent.parent
 # Archive audited when --apkg is not given on the command line.
 DEFAULT_APKG = PROJECT_ROOT / "output" / "hebrew_complete.apkg"
 # Unified data store whose GUID fields are compared against the archive.
 WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
 # Known model IDs (from apkg_builder.py)
 # Keys are values of the `mid` column in the archive's notes table; values are
 # the deck names main() uses to pick the matching words.json GUID set.
 MODEL_IDS = {
    1701222017968: "vocab",
    1234567893: "conjugation",
    1234567897: "plurals",
    1234567895: "confusables",
 }
 def extract_apkg_guids(apkg_path: Path) -> dict[int, set[str]]:
    """Extract note GUIDs from an .apkg archive, grouped by model ID.

    Only the ``collection.anki2`` SQLite database is unpacked (into a
    temporary directory); any other archive members stay inside the zip.

    Args:
        apkg_path: Path to the .apkg (zip) file.

    Returns:
        Mapping of model ID (``notes.mid``) to the set of note GUIDs
        (``notes.guid``) that use that model.

    Raises:
        KeyError: If the archive has no ``collection.anki2`` member.
        sqlite3.Error: If the database cannot be queried.
    """
    by_model: dict[int, set[str]] = {}
    with zipfile.ZipFile(apkg_path) as z, tempfile.TemporaryDirectory() as td:
        # Unpack just the database; extracting every other member is wasted I/O.
        db_path = z.extract("collection.anki2", td)
        conn = sqlite3.connect(db_path)
        try:
            for guid, mid in conn.execute("SELECT guid, mid FROM notes"):
                by_model.setdefault(mid, set()).add(guid)
        finally:
            # Close even when the query fails so the temporary directory can
            # always be removed (an open database file blocks deletion on
            # some platforms).
            conn.close()
    return by_model
 def collect_words_json_guids(data: dict[str, Any]) -> dict[str, set[str]]:
    """Gather every GUID stored in words.json, bucketed by deck type.

    The result always has the keys "vocab", "cloze", "conjugation",
    "plurals" and "confusables", each mapping to a (possibly empty) set of
    GUIDs.  Missing or empty fields are ignored.
    """
    buckets: dict[str, set[str]] = {
        "vocab": set(),
        "cloze": set(),
        "conjugation": set(),
        "plurals": set(),
        "confusables": set(),
    }

    for record in data.values():
        # GUIDs stored directly on the entry.
        for field, bucket in (("vocab_legacy_guid", "vocab"), ("confusables_guid", "confusables")):
            top_level = record.get(field)
            if top_level:
                buckets[bucket].add(top_level)

        # examples -> cloze -> cloze_guid
        cloze_block = (record.get("examples") or {}).get("cloze") or {}
        cloze_guid = cloze_block.get("cloze_guid")
        if cloze_guid:
            buckets["cloze"].add(cloze_guid)

        # noun_inflection -> plurals_guid
        plurals_guid = (record.get("noun_inflection") or {}).get("plurals_guid")
        if plurals_guid:
            buckets["plurals"].add(plurals_guid)

        # conjugation -> <form list> -> guid / guid_candidates
        conj_block = record.get("conjugation") or {}
        for list_key in ("active_forms", "hufal_pual_forms"):
            for form in conj_block.get(list_key) or ():
                form_guid = form.get("guid")
                if form_guid:
                    buckets["conjugation"].add(form_guid)
                candidates = form.get("guid_candidates")
                if candidates:
                    buckets["conjugation"].update(candidates)

    return buckets
 def main() -> None:
    """Report how well words.json covers the note GUIDs of an .apkg file.

    For every model ID in MODEL_IDS the GUIDs found in the archive are
    compared with the matching GUID set collected from words.json and a
    PASS/FAIL line is printed; --verbose additionally lists a sample of the
    missing and extra GUIDs.

    Exit status: 0 when no apkg GUID of a known model is missing from
    words.json, 1 when at least one is missing, 2 when the .apkg or
    words.json file does not exist.
    """
    parser = argparse.ArgumentParser(description="Check GUID coverage between .apkg and words.json")
    parser.add_argument(
        "--apkg",
        type=Path,
        default=DEFAULT_APKG,
        help=f"Path to .apkg file (default: {DEFAULT_APKG})",
    )
    parser.add_argument("--verbose", "-v", action="store_true")
    args = parser.parse_args()

    if not args.apkg.exists():
        print(f"ERROR: apkg not found: {args.apkg}")
        sys.exit(2)
    if not WORDS_JSON.exists():
        print(f"ERROR: words.json not found: {WORDS_JSON}")
        sys.exit(2)

    print(f"Checking: {args.apkg}")
    print(f"Against:  {WORDS_JSON}")
    print()

    apkg_by_model = extract_apkg_guids(args.apkg)
    # Context manager so the file handle is closed deterministically.
    with WORDS_JSON.open(encoding="utf-8") as fh:
        data = json.load(fh)
    wj = collect_words_json_guids(data)

    total_apkg = sum(len(s) for s in apkg_by_model.values())
    total_wj = sum(len(s) for s in wj.values())
    print(f"Total GUIDs in apkg:      {total_apkg}")
    print(f"Total GUIDs in words.json: {total_wj}")
    print()

    # words.json GUID set that corresponds to each apkg deck name.
    wj_by_deck = {
        # Vocab notes cover both vocab cards (ord 0,1) and cloze (ord 2)
        # They share the note GUID — vocab_legacy_guid IS the note guid
        "vocab": wj["vocab"] | wj["cloze"],
        "conjugation": wj["conjugation"],
        "plurals": wj["plurals"],
        "confusables": wj["confusables"],
    }

    all_missing = 0
    for mid, deck_name in MODEL_IDS.items():
        apkg_set = apkg_by_model.get(mid, set())
        wj_set = wj_by_deck.get(deck_name, set())

        missing = apkg_set - wj_set
        extra = wj_set - apkg_set
        matched = apkg_set & wj_set
        all_missing += len(missing)

        # Only GUIDs missing from words.json fail the check; extras are informational.
        status = "PASS" if not missing else "FAIL"
        print(f"  {status}  {deck_name} (mid={mid})")
        print(
            f"         apkg={len(apkg_set)}, words.json={len(wj_set)}, "
            f"matched={len(matched)}, missing={len(missing)}, extra={len(extra)}"
        )
        if missing and args.verbose:
            print("         Missing GUIDs (in apkg, not in words.json):")
            for g in sorted(missing)[:20]:
                print(f"           {g!r}")
            if len(missing) > 20:
                print(f"           ... ({len(missing) - 20} more)")
        if extra and args.verbose:
            print("         Extra GUIDs (in words.json, not in apkg):")
            for g in sorted(extra)[:10]:
                print(f"           {g!r}")
            if len(extra) > 10:
                print(f"           ... ({len(extra) - 10} more)")
        print()

    # Check for unknown model IDs in apkg
    unknown_mids = set(apkg_by_model.keys()) - set(MODEL_IDS.keys())
    if unknown_mids:
        print(f"  WARNING: Unknown model IDs in apkg: {unknown_mids}")
        # Sorted so the per-model lines come out in a stable order.
        for mid in sorted(unknown_mids):
            print(f"    mid={mid}: {len(apkg_by_model[mid])} notes")

    print("─" * 60)
    if all_missing:
        print(f"  FAILED: {all_missing} apkg GUIDs not found in words.json")
        print("          (These notes would lose study progress on reimport)")
        sys.exit(1)
    else:
        print(f"  All {total_apkg} apkg GUIDs accounted for in words.json.")
        sys.exit(0)
 if __name__ == "__main__":
    main()
--- a/scripts/migrate_to_json.py
+++ b/scripts/migrate_to_json.py
--- a/scripts/repair_slugs.py
+++ b/scripts/repair_slugs.py
@ -0,0 +1,420 @@
 #!/usr/bin/env python3
 """
 Repair duplicate slugs in data/words.json.
 Homographs (words with identical spelling but different meanings) were
 assigned the same slug by the scraper. This script fetches the pealim.com
 search page for each affected word, matches entries by meaning (and nikkud),
 and writes the corrected slugs back to words.json and the source CSV.
 Usage:
    python3 scripts/repair_slugs.py [--dry-run]
 """
 from __future__ import annotations
 import argparse
 import json
 import logging
 import re
 import sys
 import time
 from collections import defaultdict
 from difflib import SequenceMatcher
 from pathlib import Path
 import pandas as pd
 import requests
 from bs4 import BeautifulSoup
 # ---------------------------------------------------------------------------
 # Paths
 # ---------------------------------------------------------------------------
 # Repository root: two levels up from this file's resolved path.
 PROJECT_ROOT = Path(__file__).resolve().parent.parent
 # Unified data store that main() loads and rewrites.
 WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
 # Semicolon-delimited CSV whose 'slug' column update_csv() keeps in sync.
 CSV_PATH = PROJECT_ROOT / "data" / "hebrew_dict_for_anki.csv"
 # ---------------------------------------------------------------------------
 # HTTP session
 # ---------------------------------------------------------------------------
 # One shared session, used by _fetch_search_results() for every search request.
 SESSION = requests.Session()
 SESSION.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-scraper/1.0)"})
 # Cookies sent with every search request.
 # NOTE(review): presumably these select pealim.com display options
 # (transliteration off, a particular Hebrew style) — confirm against the site.
 COOKIES: dict[str, str] = {"translit": "none", "hebstyle": "mo"}
 REQUEST_DELAY = 1.5  # seconds between requests
 REQUEST_TIMEOUT = 15  # seconds
 # ---------------------------------------------------------------------------
 # Logging
 # ---------------------------------------------------------------------------
 # Configured at import time: INFO level, timestamped lines (HH:MM:SS).
 logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%H:%M:%S",
 )
 logger = logging.getLogger(__name__)
 # ---------------------------------------------------------------------------
 # Similarity helpers
 # ---------------------------------------------------------------------------
 # Minimum match score; repair_group() skips an entry whose best score is lower.
 FUZZY_THRESHOLD = 0.4
 def _similarity(a: str, b: str) -> float:
    """Return SequenceMatcher ratio between two strings (both lowercased)."""
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()
 def _best_match(
    our_meaning: str,
    candidates: list[dict],
    our_nikkud: str,
 ) -> tuple[dict | None, float]:
    """
    Pick the candidate whose "meaning" is most similar to *our_meaning*.

    A candidate whose "word" equals a non-empty *our_nikkud* gets +0.05 added
    to its score (capped at 1.0) so the matching homograph wins close calls.
    On equal scores the earliest candidate is kept.

    Returns (candidate, score), or (None, -1.0) when *candidates* is empty.
    """

    def _score(cand: dict) -> float:
        base = _similarity(our_meaning, cand["meaning"])
        if our_nikkud and cand["word"] == our_nikkud:
            return min(1.0, base + 0.05)
        return base

    scored = [(_score(cand), cand) for cand in candidates]
    if not scored:
        return None, -1.0
    # max() keeps the first of several equally scored items, i.e. earliest wins.
    top_score, top_candidate = max(scored, key=lambda pair: pair[0])
    return top_candidate, top_score
 # ---------------------------------------------------------------------------
 # Search-page parser
 # ---------------------------------------------------------------------------
 def _parse_search_results(html: bytes) -> list[dict]:
    """
    Turn a pealim.com search results page into a list of result dicts.

    Every ``div.verb-search-result`` block is inspected for:
    - div.verb-search-data > a[href]  → slug (the path segment after /dict/)
    - div.verb-search-lemma > span.menukad  → nikkud word (falls back to the
      whole lemma text when the span is absent)
    - div.verb-search-binyan  → part of speech ("Part of speech:" label removed)
    - div.verb-search-meaning  → meaning text

    Blocks without a data div or without a recognisable slug are dropped.
    Each returned dict has the keys: slug, word, pos, meaning.
    """
    parsed: list[dict] = []
    tree = BeautifulSoup(html, "html.parser")

    for result_div in tree.find_all("div", class_="verb-search-result"):
        data_div = result_div.find("div", class_="verb-search-data")
        if not data_div:
            continue

        # A result is only usable when its detail-page link yields a slug.
        anchor = data_div.find("a", href=True)
        slug_match = re.search(r"/dict/([^/#]+)/", anchor["href"]) if anchor else None
        if not slug_match:
            continue

        lemma_div = result_div.find("div", class_="verb-search-lemma")
        if lemma_div:
            menukad = lemma_div.find("span", class_="menukad")
            word_source = menukad if menukad else lemma_div
            word = word_source.get_text(strip=True)
        else:
            word = ""

        pos_div = result_div.find("div", class_="verb-search-binyan")
        if pos_div:
            pos = pos_div.get_text(strip=True).replace("Part of speech:", "").strip()
        else:
            pos = ""

        meaning_div = result_div.find("div", class_="verb-search-meaning")
        meaning = meaning_div.get_text(strip=True) if meaning_div else ""

        parsed.append(
            {
                "slug": slug_match.group(1),
                "word": word,
                "pos": pos,
                "meaning": meaning,
            }
        )

    return parsed
 def _fetch_search_results(ktiv_male: str) -> list[dict]:
    """Fetch and parse the pealim.com search results for *ktiv_male*.

    The query is handed to requests via ``params`` so it is URL-encoded for
    us; interpolating it into the URL by hand would let spellings containing
    reserved characters (``&``, ``#``, ``+``) corrupt the query string.

    Args:
        ktiv_male: The spelling to search for.

    Returns:
        The result dicts produced by ``_parse_search_results``.

    Raises:
        requests.RequestException: On connection failure, timeout, or a
            4xx/5xx response.
    """
    url = "https://www.pealim.com/search/"
    logger.debug("GET %s?q=%s", url, ktiv_male)
    resp = SESSION.get(url, params={"q": ktiv_male}, cookies=COOKIES, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    return _parse_search_results(resp.content)
 # ---------------------------------------------------------------------------
 # Core logic
 # ---------------------------------------------------------------------------
 def find_duplicate_groups(data: dict) -> dict[str, list[str]]:
    """
    Group the top-level keys of *data* by their entry's "slug" and keep only
    the slugs that occur on at least two entries.

    Entries with a missing or empty slug are ignored.  Keys within a group
    keep the iteration order of *data*.
    """
    keys_by_slug: dict[str, list[str]] = {}
    for word_key, record in data.items():
        slug = record.get("slug", "")
        if not slug:
            continue
        keys_by_slug.setdefault(slug, []).append(word_key)

    duplicates: dict[str, list[str]] = {}
    for slug, members in keys_by_slug.items():
        if len(members) >= 2:
            duplicates[slug] = members
    return duplicates
 def repair_group(
    slug: str,
    keys: list[str],
    data: dict,
    dry_run: bool,
 ) -> tuple[int, int]:
    """
    Attempt to repair one group of entries sharing *slug*.

    Homographs can have different ktiv_male spellings (e.g. אבידה vs אבדה for
    the two spellings of אֲבֵדָה).  We therefore build a union of all search
    results obtained by querying each distinct ktiv_male in the group.

    Args:
        slug: The slug currently shared by every entry in *keys* (used for
            logging only).
        keys: Top-level words.json keys of the entries in the group.
        data: The loaded words.json mapping; an entry's "slug" is replaced in
            place unless *dry_run* is set.
        dry_run: When true, log the proposed changes without mutating *data*.

    Returns (fixed_count, skipped_count).  An entry whose best match is its
    current slug is logged as SAME and counted as fixed, and a proposed
    change is counted as fixed even in dry-run mode.  When the searches
    yield no candidate, or none with a nikkud from the group, every entry
    is counted as skipped.
    """
    # Collect distinct ktiv_male values across the group (usually one, but
    # sometimes two when homographs have different consonant spellings).
    ktiv_to_keys: dict[str, list[str]] = defaultdict(list)
    for k in keys:
        ktiv = data[k]["word"]["ktiv_male"]
        ktiv_to_keys[ktiv].append(k)
    # The first entry's nikkud is only used to label the log line below.
    nikkud_word = data[keys[0]]["word"]["nikkud"]
    logger.info(
        "  Fetching search results for %s — %d entries share slug %s",
        nikkud_word,
        len(keys),
        slug,
    )
    # Fetch search results for every distinct ktiv_male and merge
    all_candidates: list[dict] = []
    seen_slugs: set[str] = set()
    for ktiv in ktiv_to_keys:
        try:
            results = _fetch_search_results(ktiv)
        except requests.RequestException as exc:
            # A failed lookup for one spelling must not abort the group;
            # treat it as "no results" and carry on with the other spellings.
            logger.warning("  HTTP error for %s: %s", ktiv, exc)
            results = []
        for r in results:
            # De-duplicate by slug so a result returned for two spellings
            # appears only once in the candidate list.
            if r["slug"] not in seen_slugs:
                seen_slugs.add(r["slug"])
                all_candidates.append(r)
        if len(ktiv_to_keys) > 1:
            # Small delay between sub-queries within the same group
            time.sleep(REQUEST_DELAY)
    if not all_candidates:
        logger.warning("  No search results — skipping group")
        return 0, len(keys)
    # Filter candidates to those whose nikkud word matches the entry's nikkud.
    # This avoids accidentally matching a completely different word that shares
    # the same consonant spelling (e.g. different voweling entirely).
    group_nikkuds = {data[k]["word"]["nikkud"] for k in keys}
    filtered = [c for c in all_candidates if c["word"] in group_nikkuds]
    if not filtered:
        logger.warning(
            "  Search results don't contain nikkud %s — candidates: %s — skipping",
            group_nikkuds,
            [c["word"] for c in all_candidates],
        )
        return 0, len(keys)
    fixed = 0
    skipped = 0
    # NOTE(review): each entry is matched independently, so two entries can
    # still resolve to the same candidate slug — confirm this is acceptable.
    for key in keys:
        entry = data[key]
        our_meaning = entry.get("meaning", "")
        our_nikkud = entry["word"]["nikkud"]
        # Only consider candidates that match this entry's nikkud
        nikkud_filtered = [c for c in filtered if c["word"] == our_nikkud]
        # Fall back to the group-wide candidates when none carries this
        # entry's exact nikkud.
        pool = nikkud_filtered if nikkud_filtered else filtered
        best, score = _best_match(our_meaning, pool, our_nikkud)
        if best is None or score < FUZZY_THRESHOLD:
            # Too weak a match to trust: leave the slug untouched.
            logger.warning(
                "    SKIP  key=%s | meaning=%r | best_score=%.2f",
                key,
                our_meaning,
                score,
            )
            skipped += 1
            continue
        new_slug = best["slug"]
        old_slug = entry["slug"]
        if new_slug == old_slug:
            # The current slug is already the best match; counted as fixed.
            logger.info("    SAME  key=%s | slug=%s (score=%.2f)", key, old_slug, score)
            fixed += 1
            continue
        logger.info(
            "    FIX   key=%s | %s → %s | matched=%r (score=%.2f)",
            key,
            old_slug,
            new_slug,
            best["meaning"],
            score,
        )
        if not dry_run:
            data[key]["slug"] = new_slug
        fixed += 1
    return fixed, skipped
 # ---------------------------------------------------------------------------
 # CSV update
 # ---------------------------------------------------------------------------
 def update_csv(data: dict, dry_run: bool) -> None:
    """
    Re-write the CSV so every row's slug column matches words.json.

    The CSV is semicolon-delimited; the slug column is named 'slug'.
    We match rows by 'Word Without Nikkud' (ktiv_male) AND 'Meaning' because
    homographs share the same ktiv_male.

    The file is written back with ``index=False``: it is read without an
    index column, so writing pandas' default RangeIndex would prepend one
    extra column on every run.

    Args:
        data: The loaded words.json mapping (source of the correct slugs).
        dry_run: When true, log the pending changes but do not write the CSV.

    Returns:
        None.  The update is skipped with a warning when the CSV file does
        not exist or has no 'slug' column.
    """
    # The CSV is a legacy artefact and may be absent; that is not an error.
    if not CSV_PATH.exists():
        logger.warning("CSV not found at %s — skipping CSV update", CSV_PATH)
        return

    df = pd.read_csv(CSV_PATH, sep=";", dtype=str)
    if "slug" not in df.columns:
        logger.warning("CSV has no 'slug' column — skipping CSV update")
        return

    # Build a lookup: (ktiv_male, meaning) → new_slug from words.json
    lookup: dict[tuple[str, str], str] = {}
    for entry in data.values():
        ktiv = entry["word"].get("ktiv_male", "")
        meaning = entry.get("meaning", "")
        slug = entry.get("slug", "")
        if ktiv and slug:
            lookup[(ktiv, meaning)] = slug

    changes = 0
    for idx, row in df.iterrows():
        ktiv = str(row.get("Word Without Nikkud", "")).strip()
        meaning = str(row.get("Meaning", "")).strip()
        new_slug = lookup.get((ktiv, meaning))
        if new_slug is None:
            # No words.json entry for this row; leave it as is.
            continue
        old_slug = str(row["slug"]).strip()
        if new_slug == old_slug:
            continue
        logger.info(
            "  CSV row %d: %s → %s  (%s)",
            idx,
            old_slug,
            new_slug,
            ktiv,
        )
        if not dry_run:
            df.at[idx, "slug"] = new_slug
        changes += 1

    logger.info("CSV: %d slug(s) to update", changes)
    if not dry_run and changes:
        # index=False keeps the column layout identical to what was read.
        df.to_csv(CSV_PATH, sep=";", index=False)
        logger.info("CSV written to %s", CSV_PATH)
    elif dry_run:
        logger.info("DRY-RUN: CSV not written")
 # ---------------------------------------------------------------------------
 # Main
 # ---------------------------------------------------------------------------
 def main(argv: list[str] | None = None) -> int:
    """
    Command-line entry point: repair every duplicate-slug group in words.json.

    Loads words.json, repairs each group of entries sharing a slug (pausing
    REQUEST_DELAY seconds between groups), then writes words.json and updates
    the CSV unless --dry-run was given.

    Returns 0 when no entry was skipped, otherwise 1.
    """
    cli = argparse.ArgumentParser(description="Repair duplicate slugs in data/words.json")
    cli.add_argument("--dry-run", action="store_true", help="Preview changes without writing any files")
    cli.add_argument("--verbose", "-v", action="store_true", help="Enable debug logging")
    opts = cli.parse_args(argv)

    if opts.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
    if opts.dry_run:
        logger.info("=== DRY-RUN mode — no files will be modified ===")

    # Load data
    logger.info("Loading %s", WORDS_JSON)
    with WORDS_JSON.open(encoding="utf-8") as source:
        data: dict = json.load(source)
    logger.info("Loaded %d entries", len(data))

    # Identify duplicate groups
    groups = find_duplicate_groups(data)
    group_count = len(groups)
    entry_count = sum(len(members) for members in groups.values())
    logger.info("Found %d duplicate-slug groups covering %d entries", group_count, entry_count)

    # Process each group in slug order
    fixed_total = 0
    skipped_total = 0
    for position, (slug, keys) in enumerate(sorted(groups.items()), start=1):
        logger.info("[%d/%d] slug=%s (%d entries)", position, group_count, slug, len(keys))
        fixed, skipped = repair_group(slug, keys, data, dry_run=opts.dry_run)
        fixed_total += fixed
        skipped_total += skipped
        # Respectful delay between HTTP requests (not needed after the last group)
        if position != group_count:
            time.sleep(REQUEST_DELAY)

    logger.info(
        "Summary: %d fixed, %d skipped (out of %d entries in %d groups)",
        fixed_total,
        skipped_total,
        entry_count,
        group_count,
    )

    # Write updated words.json
    if opts.dry_run:
        logger.info("DRY-RUN: words.json not written")
    else:
        logger.info("Writing %s", WORDS_JSON)
        with WORDS_JSON.open("w", encoding="utf-8") as sink:
            json.dump(data, sink, ensure_ascii=False, indent=2)
        logger.info("words.json written")

    # Update CSV
    logger.info("Updating CSV %s", CSV_PATH)
    update_csv(data, dry_run=opts.dry_run)

    if skipped_total:
        return 1
    return 0
 if __name__ == "__main__":
    sys.exit(main())
--- a/scripts/validate_data.py
+++ b/scripts/validate_data.py
@ -0,0 +1,800 @@
 """Standalone integrity validator for data/words.json.
 Validates the unified Hebrew Flash Cards data against the schema defined in
 SCHEMA.yaml. Each test prints PASS/FAIL with details on failures.
 Usage:
    python3 scripts/validate_data.py
    python3 scripts/validate_data.py --verbose
    python3 scripts/validate_data.py --test confusable_symmetric
 """
 from __future__ import annotations
 import argparse
 import json
 import re
 import sys
 import unicodedata
 from pathlib import Path
 from typing import Any
 # ---------------------------------------------------------------------------
 # Bootstrap: make project root importable so helpers.py is accessible
 # ---------------------------------------------------------------------------
 sys.path.insert(0, str(Path(__file__).parent.parent))
 # ---------------------------------------------------------------------------
 # Constants
 # ---------------------------------------------------------------------------
 # Path of the unified data store, two directory levels above this script.
 DATA_FILE = Path(__file__).parent.parent.joinpath("data", "words.json")
 # Inclusive code-point bounds used to recognise Hebrew consonants (alef–tav).
 HEBREW_CONSONANT_RANGE = (0x05D0, 0x05EA)
 # Person codes accepted in the conjugation form lists.
 VALID_PERSON_CODES: frozenset[str] = frozenset(
    {
        "1s",
        "1p",
        "2ms",
        "2fs",
        "2mp",
        "2fp",
        "3ms",
        "3fs",
        "3mp",
        "3fp",
        "3p",
        "ms",
        "fs",
        "mp",
        "fp",
    }
 )
 # Character class covering the emoji code-point ranges rejected in `meaning`.
 EMOJI_RE = re.compile(
    r"[\U0001f600-\U0001f64f"
    r"\U0001f300-\U0001f5ff"
    r"\U0001f680-\U0001f6ff"
    r"\U0001f1e0-\U0001f1ff"
    r"\U00002702-\U000027b0"
    r"\U0001f900-\U0001f9ff"
    r"\U0001fa00-\U0001fa6f"
    r"\U0001fa70-\U0001faff]"
 )
 # ---------------------------------------------------------------------------
 # Result tracking
 # ---------------------------------------------------------------------------
 # Names of the tests that failed, appended by _fail(); main() exits with status 1 when non-empty.
 _failures: list[str] = []
 # Warning detail lines handed to _warn(); main() reports only how many were collected.
 _warnings: list[str] = []
 # Set by main() from --verbose: disables the 20-line truncation and enables _verbose_print().
 _verbose: bool = False
 def _pass(name: str) -> None:
    """Print a PASS line for the check called *name*."""
    line = f"  PASS  {name}"
    print(line)
 def _fail(name: str, details: list[str]) -> None:
    """Record *name* as a failed check and print it with its detail lines.

    The name is appended to the module-level ``_failures`` list; each entry of
    *details* is printed on its own indented line beneath the FAIL header.
    """
    # Mutating the list in place needs no ``global`` declaration.
    _failures.append(name)
    print(f"  FAIL  {name}")
    for detail in details:
        print(f"          {detail}")
 def _warn(name: str, details: list[str]) -> None:
    """Print a WARN header for *name* followed by its detail lines.

    Every entry of *details* is also added to the module-level ``_warnings``
    list, so the final summary counts detail lines rather than warning names.
    """
    # Mutating the list in place needs no ``global`` declaration.
    _warnings.extend(details)
    print(f"  WARN  {name}")
    for detail in details:
        print(f"          {detail}")
 def _verbose_print(msg: str) -> None:
    """Print *msg* indented, but only while verbose mode is enabled."""
    if not _verbose:
        return
    print(f"        {msg}")
 # ---------------------------------------------------------------------------
 # Helper: load data
 # ---------------------------------------------------------------------------
 def load_data() -> dict[str, Any]:
    """Parse ``DATA_FILE`` as UTF-8 JSON and return the result.

    If the file does not exist, an error line is printed and the process
    exits with status 2.
    """
    if DATA_FILE.exists():
        with DATA_FILE.open(encoding="utf-8") as handle:
            parsed = json.load(handle)
        return parsed
    print(f"ERROR: data file not found: {DATA_FILE}")
    sys.exit(2)
 def _is_hebrew_consonant(ch: str) -> bool:
    """Return True if the base character of *ch* is a Hebrew consonant.

    *ch* may be a multi-codepoint string such as shin + shin dot.  It is
    NFD-normalised and only the first code point of the result is compared
    against ``HEBREW_CONSONANT_RANGE`` (inclusive bounds).

    An empty string has no base character, so it is reported as not a
    consonant instead of raising ``IndexError``.
    """
    if not ch:
        return False
    # After NFD decomposition the base letter comes first; combining marks follow.
    base = unicodedata.normalize("NFD", ch)[0]
    lowest, highest = HEBREW_CONSONANT_RANGE
    return lowest <= ord(base) <= highest
 # ---------------------------------------------------------------------------
 # Individual tests
 # ---------------------------------------------------------------------------
 def test_required_fields(data: dict[str, Any]) -> None:
    """Check that each entry carries its mandatory fields.

    An entry is reported when ``word`` is not a dict, when ``word.nikkud`` or
    ``word.ktiv_male`` is falsy, or when ``slug``, ``pos`` or ``meaning`` is
    falsy.  A ``frequency`` that is absent or null is reported separately as a
    warning named ``frequency_missing`` and does not fail the test.
    """
    name = "required_fields"
    problems: list[str] = []
    no_frequency: list[str] = []
    for entry_key, record in data.items():
        word_block = record.get("word")
        if isinstance(word_block, dict):
            if not word_block.get("nikkud"):
                problems.append(f"[{entry_key}] word.nikkud is missing or empty")
            if not word_block.get("ktiv_male"):
                problems.append(f"[{entry_key}] word.ktiv_male is missing or empty")
        else:
            problems.append(f"[{entry_key}] 'word' is missing or not a dict")
        if not record.get("slug"):
            problems.append(f"[{entry_key}] 'slug' is missing or empty")
        if not record.get("pos"):
            problems.append(f"[{entry_key}] 'pos' is missing or empty")
        if not record.get("meaning"):
            problems.append(f"[{entry_key}] 'meaning' is missing or empty")
        if record.get("frequency") is None:
            no_frequency.append(f"[{entry_key}] 'frequency' is null/missing")
    # Warnings are printed before the pass/fail verdict.
    if no_frequency:
        _warn("frequency_missing", no_frequency if _verbose else no_frequency[:20])
        if not _verbose and len(no_frequency) > 20:
            print(f"          ... ({len(no_frequency) - 20} more; use --verbose)")
    if not problems:
        _pass(name)
        return
    _fail(name, problems if _verbose else problems[:20])
    if not _verbose and len(problems) > 20:
        print(f"          ... ({len(problems) - 20} more; use --verbose)")
 def test_root_format(data: dict[str, Any]) -> None:
    """Check that ``root`` is an empty list or a list of 2-5 Hebrew consonants.

    A missing/null ``root``, a non-list value, a list of the wrong length, or
    a list containing an element that is not a non-empty string whose base
    character is a Hebrew consonant is reported.  At most one problem is
    recorded per entry.
    """
    name = "root_format"
    problems: list[str] = []
    for entry_key, record in data.items():
        root = record.get("root")
        if root is None:
            problems.append(f"[{entry_key}] 'root' key is absent (should be [] for rootless words)")
        elif not isinstance(root, list):
            problems.append(f"[{entry_key}] 'root' is not a list: {root!r}")
        elif root and not 2 <= len(root) <= 5:
            problems.append(f"[{entry_key}] root has {len(root)} elements (expected 2-5): {root!r}")
        else:
            # An empty list (rootless word) falls through here and loops zero times.
            for letter in root:
                # Elements may be multi-codepoint (base consonant + combining mark);
                # the helper inspects the base character after NFD decomposition.
                if isinstance(letter, str) and letter and _is_hebrew_consonant(letter):
                    continue
                problems.append(f"[{entry_key}] root char {letter!r} is not a Hebrew consonant (U+05D0..U+05EA)")
                break
    if not problems:
        _pass(name)
        return
    _fail(name, problems if _verbose else problems[:20])
    if not _verbose and len(problems) > 20:
        print(f"          ... ({len(problems) - 20} more; use --verbose)")
 def test_unique_slugs(data: dict[str, Any]) -> None:
    """Check that no non-empty ``slug`` is used by more than one entry."""
    name = "unique_slugs"
    keys_by_slug: dict[str, list[str]] = {}
    for entry_key, record in data.items():
        slug = record.get("slug")
        if not slug:
            continue  # empty/missing slugs are not compared
        keys_by_slug.setdefault(slug, []).append(entry_key)
    problems = [f"slug={slug!r} shared by: {keys}" for slug, keys in keys_by_slug.items() if len(keys) > 1]
    if not problems:
        _pass(name)
        return
    _fail(name, problems if _verbose else problems[:20])
    if not _verbose and len(problems) > 20:
        print(f"          ... ({len(problems) - 20} more; use --verbose)")
 def test_no_duplicate_keys(_data: dict[str, Any]) -> None:  # noqa: ARG001
    """Check that the raw JSON file contains no repeated object keys.

    ``json.load`` silently keeps the last value when a key is repeated, so the
    already-parsed ``_data`` cannot reveal the problem and is ignored.  The
    file is re-read with an ``object_pairs_hook`` that records every key seen
    twice.  The hook is invoked for each JSON object in the document, so
    repeated keys inside nested objects are reported as well.
    """
    name = "no_duplicate_keys"
    duplicates: list[str] = []

    def _record_repeats(pairs: list[tuple[str, Any]]) -> dict[str, Any]:
        # Rebuild the object exactly as json would, noting keys seen before.
        rebuilt: dict[str, Any] = {}
        for pair_key, pair_value in pairs:
            if pair_key in rebuilt:
                duplicates.append(pair_key)
            rebuilt[pair_key] = pair_value
        return rebuilt

    with DATA_FILE.open(encoding="utf-8") as handle:
        raw_text = handle.read()
    json.loads(raw_text, object_pairs_hook=_record_repeats)
    if not duplicates:
        _pass(name)
        return
    _fail(name, [f"duplicate key: {k!r}" for k in duplicates])
 def test_confusable_symmetric(data: dict[str, Any]) -> None:
    """Check that ``confusable_group`` links are reciprocal.

    Each key listed in an entry's ``confusable_group`` must exist in *data*,
    and that entry's own ``confusable_group`` must list the referring key.
    """
    name = "confusable_symmetric"
    problems: list[str] = []
    for entry_key, record in data.items():
        # A missing or empty group yields nothing to check.
        for peer_key in record.get("confusable_group") or ():
            peer = data.get(peer_key)
            if peer is None:
                problems.append(f"[{entry_key}] confusable_group references non-existent key {peer_key!r}")
                continue
            if entry_key not in (peer.get("confusable_group") or []):
                problems.append(
                    f"[{entry_key}] lists {peer_key!r} as confusable, but {peer_key!r} does not list {entry_key!r}"
                )
    if not problems:
        _pass(name)
        return
    _fail(name, problems if _verbose else problems[:20])
    if not _verbose and len(problems) > 20:
        print(f"          ... ({len(problems) - 20} more; use --verbose)")
 def test_shared_roots_valid_keys(data: dict[str, Any]) -> None:
    """Check that every key listed in ``shared_roots`` is a top-level key of *data*."""
    name = "shared_roots_valid_keys"
    problems: list[str] = []
    for entry_key, record in data.items():
        # A missing or empty list yields nothing to check.
        for ref_key in record.get("shared_roots") or ():
            if ref_key in data:
                continue
            problems.append(f"[{entry_key}] shared_roots references non-existent key {ref_key!r}")
    if not problems:
        _pass(name)
        return
    _fail(name, problems if _verbose else problems[:20])
    if not _verbose and len(problems) > 20:
        print(f"          ... ({len(problems) - 20} more; use --verbose)")
 def test_unique_legacy_guids(data: dict[str, Any]) -> None:
    """Check that a non-empty ``vocab_legacy_guid`` is not shared across different words.

    Entries sharing a GUID are tolerated when they all have the same
    ``word.nikkud`` value (part-of-speech homographs that inherited one legacy
    card); such groups are only mentioned in verbose mode.  A GUID shared by
    entries with differing ``word.nikkud`` is reported as a failure.
    """
    name = "unique_legacy_guids"
    owners: dict[str, list[str]] = {}
    for entry_key, record in data.items():
        guid = record.get("vocab_legacy_guid")
        if guid:
            owners.setdefault(guid, []).append(entry_key)
    problems: list[str] = []
    for guid, keys in owners.items():
        if len(keys) < 2:
            continue
        spellings = {(data[k].get("word") or {}).get("nikkud") for k in keys}
        if len(spellings) != 1:
            problems.append(f"guid={guid!r} shared by entries with DIFFERENT nikkud: {keys}")
            continue
        # Identical nikkud: the GUID was inherited from one legacy card, which is tolerated.
        _verbose_print(
            f"GUID {guid!r} shared by {len(keys)} entries with same nikkud ({next(iter(spellings))!r}): {keys}"
        )
    if not problems:
        _pass(name)
        return
    _fail(name, problems if _verbose else problems[:20])
    if not _verbose and len(problems) > 20:
        print(f"          ... ({len(problems) - 20} more; use --verbose)")
 def test_no_noun_inflection_on_non_nouns(data: dict[str, Any]) -> None:
    """Check that ``noun_inflection`` is null unless ``pos`` starts with ``Noun``.

    For example, an adjective entry must not carry ``noun_inflection`` data.
    """
    name = "no_noun_inflection_on_non_nouns"
    problems: list[str] = []
    for entry_key, record in data.items():
        pos = record.get("pos") or ""
        if pos.startswith("Noun") or record.get("noun_inflection") is None:
            continue
        problems.append(f"[{entry_key}] pos={pos!r} but noun_inflection is set")
        _verbose_print(f"offending entry: {entry_key!r}")
    if not problems:
        _pass(name)
        return
    _fail(name, problems if _verbose else problems[:20])
    if not _verbose and len(problems) > 20:
        print(f"          ... ({len(problems) - 20} more; use --verbose)")
 def test_no_emoji_in_meaning(data: dict[str, Any]) -> None:
    """Check that no entry's ``meaning`` contains a character matched by ``EMOJI_RE``."""
    name = "no_emoji_in_meaning"
    problems: list[str] = []
    for entry_key, record in data.items():
        meaning = record.get("meaning") or ""
        if EMOJI_RE.search(meaning) is None:
            continue
        problems.append(f"[{entry_key}] meaning contains emoji: {meaning!r}")
    if not problems:
        _pass(name)
        return
    _fail(name, problems if _verbose else problems[:20])
    if not _verbose and len(problems) > 20:
        print(f"          ... ({len(problems) - 20} more; use --verbose)")
 def test_example_sentences_contain_word(data: dict[str, Any]) -> None:
    """Warn when no vetted example sentence contains the entry's ``word.nikkud``.

    Matching is an exact substring test on the vocalised (nikkud) form; no
    stripping is applied.  Entries without vetted examples or without a nikkud
    form are skipped.  Mismatches are reported as warnings only: the test
    itself is always printed as PASS.
    """
    name = "example_sentences_contain_word"
    problems: list[str] = []
    for entry_key, record in data.items():
        vetted = (record.get("examples") or {}).get("vetted")
        if not vetted:
            continue
        nikkud_word = (record.get("word") or {}).get("nikkud") or ""
        if not nikkud_word:
            continue
        for sentence in vetted:
            if nikkud_word in (sentence.get("text") or ""):
                break
        else:
            # No sentence matched: show the first two as a preview.
            sentences_preview = [s.get("text", "") for s in vetted[:2]]
            problems.append(
                f"[{entry_key}] word {nikkud_word!r} not found in any vetted sentence. "
                f"Sentences: {sentences_preview!r}"
            )
    if problems:
        _warn(name, problems if _verbose else problems[:20])
        if not _verbose and len(problems) > 20:
            print(f"          ... ({len(problems) - 20} more; use --verbose)")
    _pass(name)
 def test_cloze_offsets_valid(data: dict[str, Any]) -> None:
    """Check that cloze word offsets describe a non-empty span inside the cloze text.

    For every entry with a truthy ``examples.cloze`` object:

    - if ``cloze_word_start`` or ``cloze_word_end`` is null, the entry is
      collected under the ``cloze_offsets_valid_null_offsets`` warning and
      skipped.  Null offsets are tolerated because some sentences contain only
      inflected/construct/plural forms that cannot be matched back to the base
      nikkud or ktiv_male; this is a data quality issue, not a schema
      violation.
    - otherwise both offsets must be integers, be non-negative, satisfy
      ``start < end``, and ``end`` must not exceed ``len(text)``.

    Booleans are rejected as offsets: ``bool`` is a subclass of ``int``, so a
    plain ``isinstance(value, int)`` check would let JSON ``true``/``false``
    through.  Only the first violated rule is reported per entry.
    """
    name = "cloze_offsets_valid"
    errors: list[str] = []
    null_warn: list[str] = []

    def _is_offset(value: Any) -> bool:
        # Exclude bool explicitly; True/False would otherwise count as 1/0.
        return isinstance(value, int) and not isinstance(value, bool)

    for key, entry in data.items():
        examples = entry.get("examples")
        if not examples:
            continue
        cloze = examples.get("cloze")
        if not cloze:
            continue
        text = cloze.get("text") or ""
        start = cloze.get("cloze_word_start")
        end = cloze.get("cloze_word_end")
        if start is None or end is None:
            null_warn.append(f"[{key}] cloze present but cloze_word_start/end are null")
            continue
        text_len = len(text)
        if not _is_offset(start) or not _is_offset(end):
            errors.append(f"[{key}] cloze_word_start/end are not integers: {start!r}, {end!r}")
            continue
        if start < 0 or end < 0:
            errors.append(f"[{key}] cloze offsets are negative: start={start}, end={end}")
            continue
        if start >= end:
            errors.append(f"[{key}] cloze start >= end: start={start}, end={end}")
            continue
        if end > text_len:
            errors.append(f"[{key}] cloze end={end} exceeds text length={text_len}: {text!r}")
    # Null-offset warnings are printed before the pass/fail verdict.
    if null_warn:
        _warn(f"{name}_null_offsets", null_warn[:20] if not _verbose else null_warn)
        if len(null_warn) > 20 and not _verbose:
            print(f"          ... ({len(null_warn) - 20} more; use --verbose)")
    if errors:
        _fail(name, errors[:20] if not _verbose else errors)
        if len(errors) > 20 and not _verbose:
            print(f"          ... ({len(errors) - 20} more; use --verbose)")
    else:
        _pass(name)
 def test_hufal_pual_only_on_hifil_piel(data: dict[str, Any]) -> None:
    """Check that ``hufal_pual_forms`` is only set on Hif'il or Pi'el verbs.

    The binyan is matched case-insensitively by substring: it must contain
    ``hif`` or ``pi`` whenever ``conjugation.hufal_pual_forms`` is not null.
    """
    name = "hufal_pual_only_on_hifil_piel"
    problems: list[str] = []
    for entry_key, record in data.items():
        conj = record.get("conjugation")
        if not conj or conj.get("hufal_pual_forms") is None:
            continue
        binyan = conj.get("binyan") or ""
        lowered = binyan.lower()
        if any(marker in lowered for marker in ("hif", "pi")):
            continue
        problems.append(f"[{entry_key}] hufal_pual_forms is set but binyan={binyan!r} (expected Hif'il or Pi'el)")
    if not problems:
        _pass(name)
        return
    _fail(name, problems if _verbose else problems[:20])
    if not _verbose and len(problems) > 20:
        print(f"          ... ({len(problems) - 20} more; use --verbose)")
 def test_confusable_group_shares_ktiv_male(data: dict[str, Any]) -> None:
    """Check that members of a ``confusable_group`` have the same ``word.ktiv_male``.

    Entries without a group or without their own ``ktiv_male`` are skipped, as
    are group members that do not exist or have no ``ktiv_male``.
    """
    name = "confusable_group_shares_ktiv_male"
    problems: list[str] = []
    for entry_key, record in data.items():
        group = record.get("confusable_group")
        if not group:
            continue
        own_ktiv = (record.get("word") or {}).get("ktiv_male")
        if not own_ktiv:
            continue
        for peer_key in group:
            peer = data.get(peer_key)
            if not peer:
                continue  # dangling references are reported by confusable_symmetric
            peer_ktiv = (peer.get("word") or {}).get("ktiv_male")
            if not peer_ktiv or peer_ktiv == own_ktiv:
                continue
            problems.append(
                f"[{entry_key}] ktiv_male={own_ktiv!r} but confusable member {peer_key!r} has ktiv_male={peer_ktiv!r}"
            )
    if not problems:
        _pass(name)
        return
    _fail(name, problems if _verbose else problems[:20])
    if not _verbose and len(problems) > 20:
        print(f"          ... ({len(problems) - 20} more; use --verbose)")
 def test_confusables_guid(data: dict[str, Any]) -> None:
    """Check that ``confusables_guid`` agrees with ``confusable_group``.

    Rules:
    - An entry with a non-empty ``confusable_group`` must have a
      ``confusables_guid``.
    - An entry without a ``confusable_group`` must have a null
      ``confusables_guid``.
    - Every existing member of an entry's group must carry the same
      ``confusables_guid`` as the entry itself.
    """
    name = "confusables_guid"
    problems: list[str] = []
    for entry_key, record in data.items():
        group = record.get("confusable_group")
        guid = record.get("confusables_guid")
        if not group:
            if guid is not None:
                problems.append(f"[{entry_key}] has confusables_guid={guid!r} but confusable_group is null")
            continue
        if not guid:
            problems.append(f"[{entry_key}] has confusable_group but confusables_guid is null/missing")
            continue
        for peer_key in group:
            peer = data.get(peer_key)
            if not peer:
                continue  # dangling references are reported by confusable_symmetric
            peer_guid = peer.get("confusables_guid")
            if peer_guid == guid:
                continue
            problems.append(
                f"[{entry_key}] confusables_guid={guid!r} but confusable member "
                f"{peer_key!r} has confusables_guid={peer_guid!r}"
            )
    if not problems:
        _pass(name)
        return
    _fail(name, problems if _verbose else problems[:20])
    if not _verbose and len(problems) > 20:
        print(f"          ... ({len(problems) - 20} more; use --verbose)")
 def test_conjugation_form_guids(data: dict[str, Any]) -> None:
    """Check that conjugation forms are identified and that GUIDs are unique per verb.

    Rules:
    - Each form in ``active_forms`` and ``hufal_pual_forms`` needs a truthy
      ``guid`` or a non-empty ``guid_candidates`` list.
    - Within one verb, across both form lists, a GUID (or, for forms without a
      ``guid``, each of its candidates) may not repeat one already seen.

    When a form has a ``guid``, its ``guid_candidates`` are not examined.
    """
    name = "conjugation_form_guids"
    problems: list[str] = []
    for entry_key, record in data.items():
        conj = record.get("conjugation")
        if not conj:
            continue
        first_use: dict[str, str] = {}  # guid -> label of the form that introduced it
        for form_list_key in ("active_forms", "hufal_pual_forms"):
            for form in conj.get(form_list_key) or ():
                person = form.get("person", "?")
                label = f"{form_list_key}[{person}]"
                guid = form.get("guid")
                guid_candidates = form.get("guid_candidates")
                if guid:
                    if guid in first_use:
                        problems.append(f"[{entry_key}] {label}: guid={guid!r} duplicates {first_use[guid]}")
                    else:
                        first_use[guid] = label
                    continue
                if not guid_candidates:
                    problems.append(f"[{entry_key}] {label}: missing or null 'guid' and no 'guid_candidates'")
                    continue
                for candidate in guid_candidates:
                    if candidate in first_use:
                        problems.append(
                            f"[{entry_key}] {label}: guid_candidate={candidate!r} duplicates {first_use[candidate]}"
                        )
                    else:
                        first_use[candidate] = label
    if not problems:
        _pass(name)
        return
    _fail(name, problems if _verbose else problems[:20])
    if not _verbose and len(problems) > 20:
        print(f"          ... ({len(problems) - 20} more; use --verbose)")
 def test_conjugation_person_codes(data: dict[str, Any]) -> None:
    """Check person codes in ``active_forms`` and ``hufal_pual_forms``.

    Every form's ``person`` value must be a member of ``VALID_PERSON_CODES``;
    a missing ``person`` is reported as the invalid code ``None``.
    """
    name = "conjugation_person_codes"
    problems: list[str] = []
    for entry_key, record in data.items():
        conj = record.get("conjugation")
        if not conj:
            continue
        for form_list_key in ("active_forms", "hufal_pual_forms"):
            for form in conj.get(form_list_key) or ():
                person = form.get("person")
                if person in VALID_PERSON_CODES:
                    continue
                problems.append(f"[{entry_key}] {form_list_key}: invalid person code {person!r}")
    if not problems:
        _pass(name)
        return
    _fail(name, problems if _verbose else problems[:20])
    if not _verbose and len(problems) > 20:
        print(f"          ... ({len(problems) - 20} more; use --verbose)")
 def test_no_stripped_form_sentence_collisions(data: dict[str, Any]) -> None:
    """Warn when an entry's vetted sentences contain a confusable partner's nikkud form.

    For each entry with a ``confusable_group`` and vetted examples, every group
    member whose ``word.nikkud`` differs from the entry's own is searched for
    in the entry's sentences.  At most one hit is recorded per (entry, member)
    pair.  Hits are reported as warnings only: the test itself is always
    printed as PASS.
    """
    name = "no_stripped_form_sentence_collisions"
    problems: list[str] = []
    for entry_key, record in data.items():
        group = record.get("confusable_group")
        if not group:
            continue
        vetted = (record.get("examples") or {}).get("vetted")
        if not vetted:
            continue
        own_nikkud = (record.get("word") or {}).get("nikkud") or ""
        own_texts = [sentence.get("text") or "" for sentence in vetted]
        for peer_key in group:
            peer = data.get(peer_key)
            if not peer:
                continue
            peer_nikkud = (peer.get("word") or {}).get("nikkud") or ""
            # Identical nikkud cannot be told apart, so such homographs are skipped.
            if not peer_nikkud or peer_nikkud == own_nikkud:
                continue
            # own_texts never contains None, so None safely signals "no hit".
            hit = next((text for text in own_texts if peer_nikkud in text), None)
            if hit is None:
                continue
            problems.append(f"[{entry_key}] sentence contains wrong homograph {peer_nikkud!r}: {hit!r}")
            _verbose_print(f"  my word: {own_nikkud!r}, wrong form: {peer_nikkud!r}")
    if problems:
        _warn(name, problems if _verbose else problems[:20])
        if not _verbose and len(problems) > 20:
            print(f"          ... ({len(problems) - 20} more; use --verbose)")
    _pass(name)
 # ---------------------------------------------------------------------------
 # Stats summary
 # ---------------------------------------------------------------------------
 def print_stats(data: dict[str, Any]) -> None:
    """Print how many entries have each optional kind of data.

    A field counts as present when its value is truthy; ``vetted`` and
    ``cloze`` are looked up inside the entry's ``examples`` object.
    """
    records = list(data.values())

    def _having(field: str) -> int:
        # Number of entries whose top-level *field* is truthy.
        return len([rec for rec in records if rec.get(field)])

    def _having_example(kind: str) -> int:
        # Number of entries whose examples[kind] is truthy.
        return len([rec for rec in records if (rec.get("examples") or {}).get(kind)])

    total = len(data)
    with_conj = _having("conjugation")
    with_noun_inf = _having("noun_inflection")
    with_vetted = _having_example("vetted")
    with_cloze = _having_example("cloze")
    with_image = _having("image")
    with_emoji = _having("emoji")
    with_guid = _having("vocab_legacy_guid")
    in_confusable = _having("confusable_group")
    with_shared_roots = _having("shared_roots")
    print()
    print("Stats Summary")
    print("─" * 42)
    print(f"  Total entries:                {total:>6}")
    print(f"  With conjugation data:        {with_conj:>6}")
    print(f"  With noun_inflection:         {with_noun_inf:>6}")
    print(f"  With vetted examples:         {with_vetted:>6}")
    print(f"  With cloze examples:          {with_cloze:>6}")
    print(f"  With images:                  {with_image:>6}")
    print(f"  With emoji:                   {with_emoji:>6}")
    print(f"  With legacy GUIDs:            {with_guid:>6}")
    print(f"  In confusable groups:         {in_confusable:>6}")
    print(f"  With shared roots:            {with_shared_roots:>6}")
 # ---------------------------------------------------------------------------
 # Test registry
 # ---------------------------------------------------------------------------
 # Registry mapping each test's command-line name (as accepted by --test) to
 # its function. main() runs the tests in this insertion order, and every
 # function is called with the parsed data dict as its only argument.
 ALL_TESTS: dict[str, Any] = {
    "required_fields": test_required_fields,
    "root_format": test_root_format,
    "unique_slugs": test_unique_slugs,
    "no_duplicate_keys": test_no_duplicate_keys,
    "confusable_symmetric": test_confusable_symmetric,
    "shared_roots_valid_keys": test_shared_roots_valid_keys,
    "unique_legacy_guids": test_unique_legacy_guids,
    "no_noun_inflection_on_non_nouns": test_no_noun_inflection_on_non_nouns,
    "no_emoji_in_meaning": test_no_emoji_in_meaning,
    "example_sentences_contain_word": test_example_sentences_contain_word,
    "cloze_offsets_valid": test_cloze_offsets_valid,
    "hufal_pual_only_on_hifil_piel": test_hufal_pual_only_on_hifil_piel,
    "confusable_group_shares_ktiv_male": test_confusable_group_shares_ktiv_male,
    "confusables_guid": test_confusables_guid,
    "conjugation_form_guids": test_conjugation_form_guids,
    "conjugation_person_codes": test_conjugation_person_codes,
    "no_stripped_form_sentence_collisions": test_no_stripped_form_sentence_collisions,
 }
 # ---------------------------------------------------------------------------
 # Entry point
 # ---------------------------------------------------------------------------
 def main() -> None:
    """Parse the command line, run the selected validation tests and exit.

    Exit status: 0 when no test failed, 1 when at least one test failed, and
    2 when the data file is missing or ``--test`` names an unknown test.
    Dataset statistics are printed only when the full suite is run.
    """
    global _verbose
    available = ", ".join(ALL_TESTS)
    parser = argparse.ArgumentParser(description="Validate data/words.json against the Hebrew Flash Cards schema.")
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Print full details for all failures (not just first 20).",
    )
    parser.add_argument(
        "--test",
        metavar="NAME",
        help=f"Run a single test by name. Available: {available}",
    )
    args = parser.parse_args()
    _verbose = args.verbose
    data = load_data()
    # Decide which tests to run.
    if not args.test:
        tests_to_run = ALL_TESTS
    elif args.test in ALL_TESTS:
        tests_to_run = {args.test: ALL_TESTS[args.test]}
    else:
        print(f"ERROR: unknown test {args.test!r}. Available: {available}")
        sys.exit(2)
    print(f"Validating {DATA_FILE} ({len(data)} entries)")
    print("─" * 60)
    # Every test receives the parsed dict; no_duplicate_keys ignores it and
    # re-reads the raw file instead.
    for run_test in tests_to_run.values():
        run_test(data)
    # Summary
    if not args.test:
        print_stats(data)
    print()
    print("─" * 60)
    if _warnings:
        print(f"  Warnings : {len(_warnings)}")
    if _failures:
        print(f"  FAILED: {len(_failures)} test(s): {', '.join(_failures)}")
        sys.exit(1)
    print(f"  All {len(tests_to_run)} test(s) passed.")
    sys.exit(0)
 # Run the validator when executed as a script; main() ends the process via sys.exit().
 if __name__ == "__main__":
    main()
--- a/tests/test_scraper_integration.py
+++ b/tests/test_scraper_integration.py
@ -0,0 +1,441 @@
 #!/usr/bin/env python3
 """Integration tests: scrape real pealim.com pages and validate data.
 These tests hit pealim.com directly. They are skipped when the environment
 variable SKIP_INTEGRATION is set to any non-empty string.
 Run with:
    pytest tests/test_scraper_integration.py -v -m integration
 """
 import json
 import os
 import re
 import sys
 import time
 from pathlib import Path
 import pytest
 # Add project root to path so all sibling modules are importable
 sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
 import pealim_detail_scrape
 import pealim_list_scrape
 # ---------------------------------------------------------------------------
 # Skip marker
 # ---------------------------------------------------------------------------
 # Skip marker applied to the live-network tests: active whenever the
 # SKIP_INTEGRATION environment variable is set to a non-empty string.
 skip_integration = pytest.mark.skipif(
    bool(os.environ.get("SKIP_INTEGRATION", "")),
    reason="SKIP_INTEGRATION is set",
 )
 # A known Hif'il verb slug that is not page-1 dependent.
 # לְהַגִּיד (to tell/say) — Hif'il, slug 4183-lehagid
 HIFIL_VERB_SLUG = "4183-lehagid"
 HIFIL_VERB_NIKKUD = "לְהַגִּיד"
 HIFIL_VERB_MEANING = "to say, to tell"
 # Minimum expected entries from a single list page
 MIN_LIST_ENTRIES = 10
 # Matches a single Hebrew letter in the range U+05D0–U+05EA (alef–tav);
 # nikkud marks and other characters of the Hebrew block are not matched.
 HEBREW_CHAR_RE = re.compile(r"[\u05d0-\u05ea]")
 # Slug pattern: one or more digits, hyphen, one or more word chars
 # NOTE(review): \w does not match "-", so a slug with a second hyphen
 # (e.g. "123-foo-bar") would be rejected — confirm pealim never emits those.
 SLUG_RE = re.compile(r"^\d+-\w+$")
 # ---------------------------------------------------------------------------
 # Helpers
 # ---------------------------------------------------------------------------
 def _has_hebrew(text: str) -> bool:
    """Tell whether *text* has at least one character matched by ``HEBREW_CHAR_RE``."""
    match = HEBREW_CHAR_RE.search(text)
    return match is not None
 def _words_from_file(path: Path) -> dict:
    """Read *path* as UTF-8 and return its parsed JSON content."""
    with path.open(encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)
 # ---------------------------------------------------------------------------
 # Test class: list page scrape
 # ---------------------------------------------------------------------------
@pytest.mark.integration
@skip_integration
class TestListScrape:
    """Validate pealim_list_scrape against a real /dict/?page=1 fetch.

    Every test performs its own one-page scrape into pytest's ``tmp_path``;
    the scraper module's ``WORDS_JSON`` and ``PROGRESS_JSON`` attributes are
    monkeypatched to point there for the duration of the test.
    """

    def _scrape_page_one(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> dict:
        """Scrape list page 1 into *tmp_path* and return the parsed words.json.

        Fails with an assertion (rather than FileNotFoundError) when the
        scraper did not produce a words.json at the patched location.
        """
        words_path = tmp_path / "words.json"
        progress_path = tmp_path / "list_scrape_progress.json"
        monkeypatch.setattr(pealim_list_scrape, "WORDS_JSON", words_path)
        monkeypatch.setattr(pealim_list_scrape, "PROGRESS_JSON", progress_path)
        # Scrape exactly one page
        pealim_list_scrape.run_scrape(total_pages=1, force_refresh=True)
        assert words_path.exists(), "words.json was not created after scrape"
        return _words_from_file(words_path)

    def test_list_page_1_produces_entries(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Page 1 must yield at least MIN_LIST_ENTRIES entries in words.json."""
        words = self._scrape_page_one(tmp_path, monkeypatch)
        assert len(words) >= MIN_LIST_ENTRIES, (
            f"Expected at least {MIN_LIST_ENTRIES} entries from page 1, got {len(words)}"
        )

    def test_list_entries_have_required_fields(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Every entry must have non-empty nikkud, ktiv_male, slug, pos, meaning."""
        words = self._scrape_page_one(tmp_path, monkeypatch)
        # Without this guard the loop below would pass vacuously on an empty scrape.
        assert words, "Page 1 scrape produced no entries; nothing to validate"
        for key, entry in words.items():
            word_block = entry.get("word", {})
            nikkud = word_block.get("nikkud", "")
            ktiv_male = word_block.get("ktiv_male", "")
            slug = entry.get("slug", "")
            pos = entry.get("pos", "")
            meaning = entry.get("meaning", "")
            assert nikkud, f"Entry '{key}': word.nikkud is empty"
            assert _has_hebrew(nikkud), f"Entry '{key}': word.nikkud has no Hebrew chars: {nikkud!r}"
            assert ktiv_male, f"Entry '{key}': word.ktiv_male is empty"
            assert slug, f"Entry '{key}': slug is empty"
            assert SLUG_RE.match(slug), f"Entry '{key}': slug does not match \\d+-\\w+ pattern: {slug!r}"
            assert pos, f"Entry '{key}': pos is empty"
            assert meaning, f"Entry '{key}': meaning is empty"

    def test_list_at_least_one_entry_has_root(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """At least one entry on page 1 must have a non-empty root list."""
        words = self._scrape_page_one(tmp_path, monkeypatch)
        entries_with_root = [e for e in words.values() if e.get("root")]
        assert entries_with_root, "No entries on page 1 have a non-empty root list"

    def test_list_at_least_one_entry_has_audio(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """At least one entry on page 1 must have a non-empty audio_url."""
        words = self._scrape_page_one(tmp_path, monkeypatch)
        entries_with_audio = [e for e in words.values() if e.get("audio_url")]
        assert entries_with_audio, "No entries on page 1 have a non-empty audio_url"

    def test_list_post_process_fields_exist(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """After scrape, every entry must have 'confusable_group' and 'shared_roots' keys (post-processed)."""
        words = self._scrape_page_one(tmp_path, monkeypatch)
        # Without this guard the loop below would pass vacuously on an empty scrape.
        assert words, "Page 1 scrape produced no entries; nothing to validate"
        for key, entry in words.items():
            assert "confusable_group" in entry, f"Entry '{key}' missing 'confusable_group' key"
            assert "shared_roots" in entry, f"Entry '{key}' missing 'shared_roots' key"
            assert isinstance(entry["shared_roots"], list), f"Entry '{key}': shared_roots is not a list"
 # ---------------------------------------------------------------------------
 # Test class: noun detail scrape
 # ---------------------------------------------------------------------------
@pytest.mark.integration
@skip_integration
class TestDetailScrapeNoun:
    """Validate pealim_detail_scrape for a real noun detail page.

    Each test scrapes list page 1 into ``tmp_path``, picks the first noun that
    has a root and a slug, runs the detail scraper with ``test=1`` and
    ``nouns_only=True``, then inspects that noun's entry in the rewritten file.
    NOTE(review): this presumes the detail scraper's single test entry is the
    same noun picked here — confirm against pealim_detail_scrape.run().
    """

    def _find_noun_with_root(self, words: dict) -> tuple[str, dict] | None:
        """Return the first (key, entry) whose pos starts with 'Noun' and that has a root and a slug."""
        matches = (
            (key, entry)
            for key, entry in words.items()
            if entry.get("pos", "").startswith("Noun") and entry.get("root") and entry.get("slug")
        )
        return next(matches, None)

    def _prepare_words_json(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> tuple[Path, dict]:
        """
        Scrape page 1 into a fresh words.json and return (path, words).
        The list scraper's WORDS_JSON / PROGRESS_JSON are monkeypatched to tmp_path.
        """
        target = tmp_path / "words.json"
        monkeypatch.setattr(pealim_list_scrape, "WORDS_JSON", target)
        monkeypatch.setattr(pealim_list_scrape, "PROGRESS_JSON", tmp_path / "list_scrape_progress.json")
        pealim_list_scrape.run_scrape(total_pages=1, force_refresh=True)
        return target, _words_from_file(target)

    def _scrape_noun_detail(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> tuple[str, dict, dict]:
        """Run list scrape then noun detail scrape; return (noun_key, original_entry, updated_words)."""
        words_path, words = self._prepare_words_json(tmp_path, monkeypatch)
        pair = self._find_noun_with_root(words)
        assert pair is not None, "No noun with a root found on page 1"
        noun_key, noun_entry = pair
        # Point the detail scraper at the same temporary words.json
        monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
        # Small rate-limit delay between list scrape and detail scrape
        time.sleep(1.0)
        pealim_detail_scrape.run(test=1, force_refresh=True, nouns_only=True)
        return noun_key, noun_entry, _words_from_file(words_path)

    def test_noun_detail_inflection_not_null(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """After detail scrape, noun_inflection must not be null."""
        noun_key, noun_entry, updated_words = self._scrape_noun_detail(tmp_path, monkeypatch)
        scraped = updated_words.get(noun_key, {})
        assert scraped.get("noun_inflection") is not None, (
            f"noun_inflection is None after detail scrape for '{noun_key}' (slug={noun_entry.get('slug')})"
        )

    def test_noun_detail_singular_and_plural_forms(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Noun singular and plural forms must have non-empty nikkud and ktiv_male."""
        noun_key, _noun_entry, updated_words = self._scrape_noun_detail(tmp_path, monkeypatch)
        inflection = updated_words[noun_key].get("noun_inflection") or {}
        singular = inflection.get("singular") or {}
        plural = inflection.get("plural") or {}
        assert singular.get("nikkud"), f"noun_inflection.singular.nikkud is empty for '{noun_key}'"
        assert singular.get("ktiv_male"), f"noun_inflection.singular.ktiv_male is empty for '{noun_key}'"
        assert plural.get("nikkud"), f"noun_inflection.plural.nikkud is empty for '{noun_key}'"
        assert plural.get("ktiv_male"), f"noun_inflection.plural.ktiv_male is empty for '{noun_key}'"

    def test_noun_detail_gender(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Noun gender must be 'masculine' or 'feminine'."""
        noun_key, _noun_entry, updated_words = self._scrape_noun_detail(tmp_path, monkeypatch)
        inflection = updated_words[noun_key].get("noun_inflection") or {}
        gender = inflection.get("gender", "")
        assert gender in ("masculine", "feminine"), (
            f"noun_inflection.gender is {gender!r} for '{noun_key}' (expected 'masculine' or 'feminine')"
        )

    def test_noun_detail_scraped_flag(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """detail_scraped must be True after a successful noun detail scrape."""
        noun_key, _, updated_words = self._scrape_noun_detail(tmp_path, monkeypatch)
        assert updated_words[noun_key].get("detail_scraped") is True, (
            f"detail_scraped is not True after scrape for '{noun_key}'"
        )
 # ---------------------------------------------------------------------------
 # Test class: verb detail scrape (Hif'il)
 # ---------------------------------------------------------------------------
@pytest.mark.integration
@skip_integration
class TestDetailScrapeVerb:
    """Validate pealim_detail_scrape for a known Hif'il verb (lehagid, slug 4183-lehagid).

    Each test writes a one-entry words.json into ``tmp_path``, points the
    detail scraper's ``WORDS_JSON`` at it, runs the scraper with ``test=1``
    and ``verbs_only=True``, and inspects the rewritten file.
    """

    def _build_test_words_json(self, tmp_path: Path) -> Path:
        """
        Write a minimal words.json containing only the known Hif'il verb entry.
        The detail scraper's run() is expected to pick it up because pos starts
        with 'Verb' and detail_scraped is absent/False.

        Returns the path of the file written.
        """
        words_path = tmp_path / "words.json"
        entry = {
            "word": {"nikkud": HIFIL_VERB_NIKKUD, "ktiv_male": "להגיד"},
            "slug": HIFIL_VERB_SLUG,
            "root": ["נ", "ג", "ד"],
            "pos": "Verb",
            "pos_hebrew": "פֹּעַל — הִפְעִיל",
            "meaning": HIFIL_VERB_MEANING,
            "meaning_raw": HIFIL_VERB_MEANING,
            "audio_url": "",
            "audio_file": "להגיד.mp3",
            "tags": "שורש::נגד פעלים",
            "last_scrape_date": "2026-03-08",
            "vocab_legacy_guid": None,
            "frequency": None,
            "pseudo_frequency": None,
            "emoji": None,
            "emoji_source": None,
            "emoji_visible": False,
            "image": None,
            "image_source": None,
            "hint": "",
            "shared_roots": [],
            "confusable_group": None,
            "confusables_guid": None,
            "examples": None,
            "noun_inflection": None,
            "conjugation": None,
            "adjective_inflection": None,
            "preposition_inflection": None,
            # Intentionally no detail_scraped key so the scraper processes it
        }
        words = {HIFIL_VERB_NIKKUD: entry}
        with words_path.open("w", encoding="utf-8") as fh:
            json.dump(words, fh, ensure_ascii=False, indent=2)
        return words_path

    def _scrape_verb(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> dict:
        """Seed the one-verb words.json, run the verb detail scrape, and return the reloaded words."""
        words_path = self._build_test_words_json(tmp_path)
        monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
        pealim_detail_scrape.run(test=1, force_refresh=True, verbs_only=True)
        return _words_from_file(words_path)

    def test_verb_detail_conjugation_not_null(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """After detail scrape, conjugation must not be null for the Hif'il verb."""
        words = self._scrape_verb(tmp_path, monkeypatch)
        entry = words.get(HIFIL_VERB_NIKKUD, {})
        assert entry.get("conjugation") is not None, f"conjugation is None after detail scrape for {HIFIL_VERB_SLUG}"

    def test_verb_detail_binyan(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """conjugation.binyan must be \"Hif'il\" and binyan_hebrew must be the correct nikkud."""
        words = self._scrape_verb(tmp_path, monkeypatch)
        conj = words[HIFIL_VERB_NIKKUD].get("conjugation") or {}
        # The expected value is shown in double quotes so the apostrophe needs
        # no escaping; the previous \\' form printed a stray backslash.
        assert conj.get("binyan") == "Hif'il", f"Expected binyan=\"Hif'il\", got {conj.get('binyan')!r}"
        assert conj.get("binyan_hebrew") == "הִפְעִיל", (
            f"Expected binyan_hebrew='הִפְעִיל', got {conj.get('binyan_hebrew')!r}"
        )

    def test_verb_detail_infinitive_and_reference_form(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """infinitive.nikkud and reference_form.nikkud must be non-empty Hebrew strings."""
        words = self._scrape_verb(tmp_path, monkeypatch)
        conj = words[HIFIL_VERB_NIKKUD].get("conjugation") or {}
        infinitive = conj.get("infinitive") or {}
        reference_form = conj.get("reference_form") or {}
        inf_nikkud = infinitive.get("nikkud", "")
        ref_nikkud = reference_form.get("nikkud", "")
        assert inf_nikkud and _has_hebrew(inf_nikkud), (
            f"infinitive.nikkud is empty or has no Hebrew chars: {inf_nikkud!r}"
        )
        assert ref_nikkud and _has_hebrew(ref_nikkud), (
            f"reference_form.nikkud (3ms past) is empty or has no Hebrew chars: {ref_nikkud!r}"
        )

    def test_verb_detail_active_forms_count_and_structure(
        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """active_forms must be a list of at least 20 entries, each with required sub-fields."""
        words = self._scrape_verb(tmp_path, monkeypatch)
        conj = words[HIFIL_VERB_NIKKUD].get("conjugation") or {}
        active_forms = conj.get("active_forms")
        assert isinstance(active_forms, list), f"active_forms is not a list: {type(active_forms)}"
        assert len(active_forms) >= 20, f"Expected at least 20 active forms, got {len(active_forms)}"
        for i, form in enumerate(active_forms):
            assert form.get("person"), f"active_forms[{i}].person is empty"
            assert form.get("tense"), f"active_forms[{i}].tense is empty"
            form_block = form.get("form") or {}
            assert form_block.get("nikkud") and _has_hebrew(form_block["nikkud"]), (
                f"active_forms[{i}].form.nikkud is empty or has no Hebrew: {form_block.get('nikkud')!r}"
            )
            assert form_block.get("ktiv_male") and _has_hebrew(form_block["ktiv_male"]), (
                f"active_forms[{i}].form.ktiv_male is empty or has no Hebrew: {form_block.get('ktiv_male')!r}"
            )

    def test_verb_detail_hufal_passive_section(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Hif'il verb must have a non-null hufal_pual_forms list and reference_form_passive."""
        words = self._scrape_verb(tmp_path, monkeypatch)
        conj = words[HIFIL_VERB_NIKKUD].get("conjugation") or {}
        hufal_forms = conj.get("hufal_pual_forms")
        assert hufal_forms is not None, "hufal_pual_forms is None — expected Huf'al passive section for a Hif'il verb"
        assert isinstance(hufal_forms, list), f"hufal_pual_forms is not a list: {type(hufal_forms)}"
        assert len(hufal_forms) > 0, "hufal_pual_forms list is empty"
        ref_passive = conj.get("reference_form_passive")
        assert ref_passive is not None, "reference_form_passive is None — expected a Huf'al 3ms past form"
        # `or {}` keeps a falsy non-None value (e.g. an empty string) from
        # raising AttributeError before the assertion below can report it.
        passive_nikkud = (ref_passive or {}).get("nikkud", "")
        assert passive_nikkud and _has_hebrew(passive_nikkud), (
            f"reference_form_passive.nikkud is empty or has no Hebrew: {passive_nikkud!r}"
        )

    def test_verb_detail_scraped_flag(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """detail_scraped must be True after a successful verb detail scrape."""
        words = self._scrape_verb(tmp_path, monkeypatch)
        entry = words.get(HIFIL_VERB_NIKKUD, {})
        assert entry.get("detail_scraped") is True, f"detail_scraped is not True after scrape for {HIFIL_VERB_SLUG}"
--- a/tests/test_smoke.py
+++ b/tests/test_smoke.py
@ -25,8 +25,7 @@ def test_apkg_builder_imports():
 def test_data_files_exist():
    data_dir = Path(__file__).resolve().parent.parent / "data"
-    assert (data_dir / "hebrew_dict_for_anki.csv").exists(), "vocab CSV missing"
+    assert (data_dir / "words.json").exists(), "words.json missing"
    assert (data_dir / "conjugations.json").exists(), "conjugations cache missing"
 def test_strip_nikkud_idempotent():
@ -42,4 +41,4 @@ def test_strip_nikkud_all_marks():
    # Comprehensive: patach, kamatz, segol, tsere, hiriq, holam, kubutz, shva, dagesh
    nikkud = "הַמַּלְכָּה"
    plain = strip_nikkud(nikkud)
-    assert all(ch < "\u0591" or ch > "\u05C7" for ch in plain), f"Residual nikkud in: {plain}"
+    assert all(ch < "\u0591" or ch > "\u05c7" for ch in plain), f"Residual nikkud in: {plain}"