hebrew_flash_cards/pealim_detail_scrape.py

#!/usr/bin/env python3
"""
Consolidated detail page scraper for pealim.com.

Visits /dict/<slug>/ detail pages for nouns, verbs, adjectives and prepositions
in data/words.json.
Makes two requests per slug:
  1. hebstyle=mo cookie  → nikkud forms
  2. hebstyle=vl cookie  → ktiv male forms

Updates entries in data/words.json with scraped detail data.

Usage:
    python3 pealim_detail_scrape.py [--test N] [--force-refresh-detail]
                                    [--nouns-only | --verbs-only |
                                     --adjectives-only | --prepositions-only]
"""

import argparse
import json
import logging
import os
import re
import time
from pathlib import Path

import requests
from bs4 import BeautifulSoup

logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Root URL of the site being scraped.
PEALIM_BASE = "https://www.pealim.com"
# Seconds between requests; also the default initial retry backoff of _fetch().
REQUEST_DELAY = 1.5
# Passed as the `timeout=` argument of every GET issued by _fetch().
REQUEST_TIMEOUT = 15
SAVE_INTERVAL = 50  # write words.json every N processed entries

# Location of the word database, resolved relative to this source file.
WORDS_JSON = Path(__file__).parent / "data" / "words.json"

# Regex for Hebrew prepositions wrapped in parentheses, e.g. "(על)" or "(ב-)".
# Group 1 is the text between the parentheses. The character class accepts
# U+05B0–U+05EA, U+05F0–U+05F4 and the ASCII hyphen.
HBPAREN_RE = re.compile(r"\(([\u05b0-\u05ea\u05f0-\u05f4\-]+)\)")

# English binyan names as matched (case-insensitively) against page text by
# _extract_binyan_from_page(); the lowercase tuple is index-aligned with
# BINYAN_NAMES so a hit can be mapped back to the canonical spelling.
BINYAN_NAMES: tuple[str, ...] = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al")
_BINYAN_NAMES_LOWER: tuple[str, ...] = tuple(b.lower() for b in BINYAN_NAMES)

# Hebrew spelling (with nikkud) of each binyan, keyed by the English names
# listed in BINYAN_NAMES.
BINYAN_HEBREW: dict[str, str] = {
    "Pa'al": "פָּעַל",
    "Nif'al": "נִפְעַל",
    "Pi'el": "פִּיעֵל",
    "Pu'al": "פֻּעַל",
    "Hif'il": "הִפְעִיל",
    "Huf'al": "הֻפְעַל",
    "Hitpa'el": "הִתְפַּעֵל",
}

# Hebrew subject pronoun for each conjugation form key. Present-tense forms
# and the infinitive map to an empty string. _forms_to_active_list() copies
# the value into the "pronoun_hebrew" field of every active form.
PRONOUN_LABELS: dict[str, str] = {
    "present_ms": "",
    "present_fs": "",
    "present_mp": "",
    "present_fp": "",
    "past_1s": "אֲנִי",
    "past_1p": "אֲנַחְנוּ",
    "past_2ms": "אַתָּה",
    "past_2fs": "אַתְּ",
    "past_2mp": "אַתֶּם",
    "past_2fp": "אַתֶּן",
    "past_3ms": "הוּא",
    "past_3fs": "הִיא",
    "past_3p": "הֵם / הֵן",
    "future_1s": "אֲנִי",
    "future_1p": "אֲנַחְנוּ",
    "future_2ms": "אַתָּה",
    "future_2fs": "אַתְּ",
    "future_2mp": "אַתֶּם",
    "future_2fp": "אַתֶּן",
    "future_3ms": "הוּא",
    "future_3fs": "הִיא",
    "future_3mp": "הֵם",
    "future_3fp": "הֵן",
    "imperative_ms": "אַתָּה",
    "imperative_fs": "אַתְּ",
    "imperative_mp": "אַתֶּם",
    "imperative_fp": "אַתֶּן",
    "infinitive": "",
}

# Hebrew tense/mood label (with nikkud) for every conjugation form key.
# Keys are grouped by tense; insertion order is present, past, future,
# imperative, infinitive. _forms_to_active_list() stores the label in the
# "tense" field and also uses it as part of the GUID-preservation lookup key.
TENSE_DESCRIPTION: dict[str, str] = {
    **dict.fromkeys(
        ("present_ms", "present_fs", "present_mp", "present_fp"),
        "הוֹוֶה",
    ),
    **dict.fromkeys(
        (
            "past_1s",
            "past_1p",
            "past_2ms",
            "past_2fs",
            "past_2mp",
            "past_2fp",
            "past_3ms",
            "past_3fs",
            "past_3p",
        ),
        "עָבָר",
    ),
    **dict.fromkeys(
        (
            "future_1s",
            "future_1p",
            "future_2ms",
            "future_2fs",
            "future_2mp",
            "future_2fp",
            "future_3ms",
            "future_3fs",
            "future_3mp",
            "future_3fp",
        ),
        "עָתִיד",
    ),
    **dict.fromkeys(
        ("imperative_ms", "imperative_fs", "imperative_mp", "imperative_fp"),
        "צִוּוּי",
    ),
    "infinitive": "מְקוֹר",
}

# Person/number/gender code for every conjugation form key. For all keys
# except "infinitive" the code is simply the part after the tense prefix
# ("past_2fs" -> "2fs"); the infinitive maps to "inf". Insertion order is
# present, past, future, imperative, infinitive.
FORM_KEY_TO_PERSON: dict[str, str] = {
    **{f"present_{p}": p for p in ("ms", "fs", "mp", "fp")},
    **{f"past_{p}": p for p in ("1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3p")},
    **{f"future_{p}": p for p in ("1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp")},
    **{f"imperative_{p}": p for p in ("ms", "fs", "mp", "fp")},
    "infinitive": "inf",
}

# Mishkal English name → Hebrew nikkud mapping
# Pealim dropdown uses q-notation (qatal), detail pages return k-notation (katal).
# We store q-notation keys; _mishkal_to_hebrew() handles the k→q conversion.
# Source: https://www.pealim.com/he/dict/?pos={adjective,noun}&am=qatil dropdowns
# Keys are lowercase ASCII and grouped below by their first letter; values are
# the same pattern names written in Hebrew with nikkud.
_MISHKAL_HEBREW_Q: dict[str, str] = {
    # --- a ---
    "aqtal": "אַקְטָל",
    "aqtala": "אַקְטָלָה",
    # --- e ---
    "eqtal": "אֶקְטָל",
    # --- h ---
    "haqtala": "הַקְטָלָה",
    "heqtel": "הֶקְטֵל",
    "hiqqatlut": "הִקָּטְלוּת",
    "hitqattlut": "הִתְקַטְּלוּת",
    # --- m ---
    "maqtal": "מַקְטָל",
    "maqtel": "מַקְטֵל",
    "maqtela": "מַקְטֵלָה",
    "maqtelet": "מַקְטֶלֶת",
    "maqtil": "מַקְטִיל",
    "maqtol": "מַקְטוֹל",
    "maqtolet": "מַקְטֹלֶת",
    "maqtul": "מַקְטוּל",
    "meqattel": "מְקַטֵּל",
    "meqila": "מְקִילָה",
    "mequla": "מְקוּלָה",
    "mequttal": "מְקֻטָּל",
    "miqtal": "מִקְטָל",
    "miqtala": "מִקְטָלָה",
    "miqtelet": "מִקְטֶלֶת",
    "miqtol": "מִקְטוֹל",
    "miqtolet": "מִקְטֹלֶת",
    "mitqattel": "מִתְקַטֵּל",
    "muqtal": "מֻקְטָל",
    # --- n ---
    "niqtal": "נִקְטָל",
    # --- q ---
    "qal": "קַל",
    "qatal": "קָטָל",
    "qatel": "קָטֵל",
    "qatil": "קָטִיל",
    "qatla": "קַטְלָה",
    "qatlan": "קַטְלָן",
    "qatlut": "קַטְלוּת",
    "qatol": "קָטוֹל",
    "qaton": "קָטוֹן",
    "qattal": "קַטָּל",
    "qattala": "קַטָּלָה",
    "qattelet": "קַטֶּלֶת",
    "qattil": "קַטִּיל",
    "qattila": "קַטִּילָה",
    "qattolet": "קַטֹּלֶת",
    "qattul": "קַטּוּל",
    "qatul": "קָטוּל",
    "qatut": "קָטוּת",
    "qetel": "קֶטֶל",
    "qeteh": "קֵטֶה",
    "qitla": "קִטְלָה",
    "qitlon": "קִטְלוֹן",
    "qittalon": "קִטָּלוֹן",
    "qittel": "קִטֵּל",
    "qittelet": "קִטֶּלֶת",
    "qittol": "קִטּוֹל",
    "qittolet": "קִטֹּלֶת",
    "qittul": "קִטּוּל",
    "qol": "קֹל",
    "qotal": "קוֹטָל",
    "qotel": "קוֹטֵל",
    "qotelet": "קוֹטֶלֶת",
    "qotla": "קָטְלָה",
    "qtal": "קְטָל",
    "qtala": "קְטָלָה",
    "qtaltal": "קְטַלְטַל",
    "qtaltan": "קְטַלְתָּן",
    "qtaltolet": "קְטַלְטֹלֶת",
    "qtel": "קְטֵל",
    "qtela": "קְטֵלָה",
    "qtelet": "קְטֶלֶת",
    "qtil": "קְטִיל",
    "qtila": "קְטִילָה",
    "qtili": "קְטִילִי",
    "qtol": "קְטוֹל",
    "qtola": "קְטוֹלָה",
    "qtolet": "קְטֹלֶת",
    "qtul": "קְטוּל",
    "qtula": "קְטוּלָה",
    "qtulla": "קְטֻלָּה",
    "qtut": "קְטוּת",
    "qutla": "קֻטְלָה",
    "quttolet": "קֻטּוֹלֶת",
    # --- t ---
    "taqtela": "תַּקְטֵלָה",
    "taqtil": "תַּקְטִיל",
    "taqtit": "תַּקְטִית",
    "taqtul": "תַּקְטוּל",
    "taqtula": "תַּקְטוּלָה",
    "taqtut": "תַּקְטוּת",
    "tiqtal": "תִּקְטָל",
    "tiqtala": "תִּקְטָלָה",
    "tiqtelet": "תִּקְטֶלֶת",
    "tiqtolet": "תִּקְטֹלֶת",
    "tqilla": "תְּקִלָּה",
    "tqula": "תְּקוּלָה",
    # --- y ---
    "yaqtul": "יַקְטוּל",
}


def _mishkal_to_hebrew(mishkal: str) -> str | None:
    """Look up Hebrew mishkal, handling k-notation → q-notation conversion."""
    if not mishkal:
        return None
    # Try as-is first (q-notation)
    result = _MISHKAL_HEBREW_Q.get(mishkal)
    if result:
        return result
    # Convert k-notation to q-notation and retry
    q_form = mishkal.replace("k", "q")
    return _MISHKAL_HEBREW_Q.get(q_form)


# ---------------------------------------------------------------------------
# HTTP session
# ---------------------------------------------------------------------------

# One module-level requests.Session shared by every _fetch() call; the
# User-Agent header set here is sent with each request made through it.
_session = requests.Session()
_session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki/2.0)"})


def _fetch(
    url: str,
    hebstyle: str,
    backoff: float = REQUEST_DELAY,
    max_retries: int = 8,
) -> str | None:
    """
    Fetch a URL with the given hebstyle cookie.

    Transient failures — HTTP 429/503 and network-level errors — are retried
    with exponential backoff: the wait starts at ``backoff`` seconds, doubles
    after every retry and is capped at 60 seconds. Both kinds of failure share
    one retry budget, so the function always terminates (previously a server
    that kept answering 429/503 was retried forever). A numeric
    ``Retry-After`` header on a 429/503 response lengthens the wait, still
    subject to the 60 second cap.

    Args:
        url: URL to GET.
        hebstyle: Value sent in the ``hebstyle`` cookie.
        backoff: Wait in seconds before the first retry.
        max_retries: Maximum number of retries after the initial attempt.

    Returns:
        The response body as text, or None on 404, on any other non-retryable
        HTTP status, or once the retry budget is exhausted.
    """
    cookies = {"translit": "none", "hebstyle": hebstyle}
    max_wait = 60.0
    retries = 0
    while True:
        wait = min(backoff, max_wait)
        try:
            resp = _session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            return resp.text
        except requests.exceptions.HTTPError as exc:
            status = exc.response.status_code if exc.response is not None else 0
            if status == 404:
                logger.warning("  404 for %s — skipping", url)
                return None
            if status not in (429, 503):
                logger.error("  HTTP %s for %s", status, url)
                return None
            if retries >= max_retries:
                logger.error("  Giving up on %s after %d retries (HTTP %s)", url, retries, status)
                return None
            # status is 429/503 here, so exc.response is known to be present.
            retry_after = str(exc.response.headers.get("Retry-After", "")).strip()
            if retry_after.isdecimal():
                # Respect the server's hint when it asks for a longer pause.
                wait = min(max(wait, float(retry_after)), max_wait)
            logger.warning("  Rate limited (%s) — waiting %.0fs", status, wait)
        except requests.RequestException as exc:
            # Check the budget BEFORE sleeping so we never wait and then quit
            # without actually retrying.
            if retries >= max_retries:
                logger.error("  Giving up on %s after %d retries: %s", url, retries, exc)
                return None
            logger.warning("  Request error for %s: %s — retrying in %.0fs", url, exc, wait)
        retries += 1
        time.sleep(wait)
        backoff = min(backoff * 2, max_wait)


# ---------------------------------------------------------------------------
# Shared HTML parsing utilities
# ---------------------------------------------------------------------------


def _get_menukad_and_audio(cell) -> tuple[str, str]:
    """Return ``(hebrew_text, audio_url)`` for one table cell.

    Audio: the ``data-audio`` attribute of the first <span> with a class
    containing "audio-play"; if that yields nothing, the cell's own
    ``data-audio`` attribute is used instead.

    Text: the stripped text of the first ``span.menukad`` when present;
    otherwise the cell's stripped text, but only if it contains at least one
    Hebrew letter (U+05D0–U+05EA). Falls back to "".
    """

    def is_audio_class(css_class):
        return css_class and "audio-play" in css_class

    player = cell.find("span", class_=is_audio_class)
    audio_url = player.get("data-audio", "") if player else ""
    if not audio_url:
        # Also accept a data-audio attribute placed directly on the cell.
        audio_url = cell.get("data-audio", "") or audio_url

    vocalised = cell.find("span", class_="menukad")
    if vocalised:
        text = vocalised.get_text(strip=True)
    else:
        raw = cell.get_text(strip=True)
        text = raw if re.search(r"[\u05d0-\u05ea]", raw) else ""
    return text, audio_url


def _get_plain_text(cell) -> str:
    """Return the Hebrew text of a cell — used for vl (ktiv male) pages.

    Prefers the stripped text of the first ``span.menukad``. Without such a
    span, the cell's own stripped text is returned when it contains at least
    one Hebrew letter (U+05D0–U+05EA); otherwise "".
    """
    marked = cell.find("span", class_="menukad")
    if not marked:
        content = cell.get_text(strip=True)
        return content if re.search(r"[\u05d0-\u05ea]", content) else ""
    return marked.get_text(strip=True)


# ---------------------------------------------------------------------------
# Noun detail parsing
# ---------------------------------------------------------------------------

# Gender keyword → canonical gender ("masculine" / "feminine").
#
# Consumers scan these keys in insertion order with a substring test
# (`raw in text`) and stop at the first hit. A key that is contained in
# another key must therefore come AFTER the longer one: "female" has to be
# tested before "male", otherwise any text containing "female" would be
# classified as masculine.
_GENDER_MAP = {
    "masculine": "masculine",
    "feminine": "feminine",
    "זכר": "masculine",
    "נקבה": "feminine",
    "female": "feminine",
    "male": "masculine",
}

# Hebrew rendering of each canonical gender, in vocalised (nikkud) and
# ktiv male spelling.
_GENDER_HEBREW = {
    "masculine": {"nikkud": "זָכָר", "ktiv_male": "זכר"},
    "feminine": {"nikkud": "נְקֵבָה", "ktiv_male": "נקבה"},
}


def _parse_noun_table(soup: BeautifulSoup) -> dict[str, dict | str]:
    """
    Read the noun declension table of a nikkud (mo) detail page.

    Each table row is classified by the text of its first <th> (or, failing
    that, its first <td>). Within a recognised row, the first Hebrew-bearing
    <td> is taken as the singular form and the second as the plural form.

    Possible keys of the returned dict (present only when the form was found):
      singular_nikkud, plural_nikkud                     – absolute-state row
      singular_audio, plural_audio                       – audio URLs, same row
      construct_singular_nikkud, construct_plural_nikkud – construct-state row

    Returns an empty dict if the page has no ``conjugation-table``.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    parsed: dict[str, dict | str] = {}

    for row in table.find_all("tr"):
        header = row.find("th") or row.find("td")
        if not header:
            continue
        label = header.get_text(strip=True).lower()

        if "absolute" in label or ("singular" in label and "construct" not in label):
            prefix = ""
        elif "construct" in label or "סמיכות" in label:
            prefix = "construct_"
        else:
            continue

        # Only <td> cells that actually contain Hebrew letters hold forms;
        # this also drops a label that happens to live in a <td>.
        hebrew_cells = [td for td in row.find_all("td") if re.search(r"[\u05d0-\u05ea]", td.get_text())]

        for number, cell in zip(("singular", "plural"), hebrew_cells, strict=False):
            nikkud, audio = _get_menukad_and_audio(cell)
            parsed[f"{prefix}{number}_nikkud"] = nikkud
            # Audio is recorded for the absolute-state forms only.
            if not prefix and audio:
                parsed[f"{number}_audio"] = audio

    return parsed


def _parse_noun_table_vl(soup: BeautifulSoup) -> dict[str, str]:
    """
    Read the noun declension table of a vl (ktiv male) detail page.

    Rows are classified by the text of their first <th>, or their first <td>
    when the row has no <th>. In a recognised row the first Hebrew-bearing
    <td> is the singular form and the second the plural form.

    Possible keys: singular_ktiv, plural_ktiv, construct_singular_ktiv,
    construct_plural_ktiv. Returns an empty dict when the page has no
    ``conjugation-table``.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    ktiv: dict[str, str] = {}

    for row in table.find_all("tr"):
        header = row.find("th") or row.find("td")
        if not header:
            continue
        label = header.get_text(strip=True).lower()

        if "absolute" in label or ("singular" in label and "construct" not in label):
            prefix = ""
        elif "construct" in label or "סמיכות" in label:
            prefix = "construct_"
        else:
            continue

        hebrew_cells = [td for td in row.find_all("td") if re.search(r"[\u05d0-\u05ea]", td.get_text())]
        for number, cell in zip(("singular", "plural"), hebrew_cells, strict=False):
            ktiv[f"{prefix}{number}_ktiv"] = _get_plain_text(cell)

    return ktiv


def _parse_noun_gender_mishkal(soup: BeautifulSoup) -> tuple[str, str]:
    """
    Extract (gender, mishkal) from the part-of-speech line of a detail page.

    Returns ("masculine"|"feminine"|"", mishkal_english|"").

    Expected Pealim markup:
      <p>Noun – <a href="/dict/?pos=noun&nm=qetel"><i>ketel</i> pattern</a>, masculine</p>
    The mishkal is read from the <i> tag (k-notation, e.g. "ketel") and, when
    the link has no <i>, from the nm=/am= URL parameter (q-notation).
    Some nouns carry no mishkal link at all: <p>Noun – masculine</p>

    Gender is searched in three places, stopping at the first that yields a
    result: the part-of-speech element, the og:description meta tag, and
    finally any <span> whose class contains "small", "muted" or "pos".
    """

    def gender_in(lowered: str) -> str:
        # First _GENDER_MAP keyword (in dict order) occurring in the text wins.
        for raw, canonical in _GENDER_MAP.items():
            if raw in lowered:
                return canonical
        return ""

    # The PoS line is a bare <p> starting with "Noun" or "Adjective".
    pos_section = next(
        (p for p in soup.find_all("p") if re.match(r"^(Noun|Adjective)\b", p.get_text(" ", strip=True))),
        None,
    )
    if not pos_section:
        # Older page layouts: div.pos, p.pos, div.page-header.
        pos_section = (
            soup.find("div", class_="pos") or soup.find("p", class_="pos") or soup.find("div", class_="page-header")
        )

    gender = ""
    mishkal = ""

    if pos_section:
        gender = gender_in(pos_section.get_text(" ", strip=True).lower())

        # Nouns link with an nm= parameter, adjectives with am=.
        pattern_link = pos_section.find("a", href=re.compile(r"[na]m="))
        if pattern_link:
            italic = pattern_link.find("i")
            if italic:
                mishkal = italic.get_text(strip=True)
            else:
                param = re.search(r"[na]m=([a-zA-Z']+)", pattern_link.get("href", ""))
                if param:
                    mishkal = param.group(1)

    if not gender:
        meta = soup.find("meta", {"property": "og:description"})
        if meta:
            gender = gender_in(meta.get("content", "").lower())

    if not gender:
        candidates = soup.find_all("span", class_=lambda c: c and ("small" in c or "muted" in c or "pos" in c))
        for span in candidates:
            gender = gender_in(span.get_text(strip=True).lower())
            if gender:
                break

    return gender, mishkal


def _scrape_noun_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
    """
    Build the noun_inflection data for one noun from its two detail pages.

    Args:
        _slug: Unused; kept so the signature parallels the other scrapers.
        mo_html: HTML of the nikkud (hebstyle=mo) page.
        vl_html: HTML of the ktiv male (hebstyle=vl) page.

    Returns:
        Dict with the four forms (each ``{"nikkud", "ktiv_male"}`` or None when
        no nikkud form was found), the two audio URLs (or None),
        ``pronominal_suffixes`` set to None, and — only when detected —
        ``gender``/``gender_hebrew`` and ``mishkal``/``mishkal_hebrew``.
    """
    mo_soup = BeautifulSoup(mo_html, "lxml")
    vl_soup = BeautifulSoup(vl_html, "lxml")

    nikkud_forms = _parse_noun_table(mo_soup)
    ktiv_forms = _parse_noun_table_vl(vl_soup)
    gender, mishkal = _parse_noun_gender_mishkal(mo_soup)

    def build_form(name: str) -> dict | None:
        nikkud = str(nikkud_forms.get(f"{name}_nikkud", ""))
        ktiv = str(ktiv_forms.get(f"{name}_ktiv", ""))
        if not nikkud:
            return None
        if not ktiv:
            logger.warning("No ktiv_male for noun form: %s", nikkud)
        return {"nikkud": nikkud, "ktiv_male": ktiv}

    result: dict = {
        name: build_form(name) for name in ("singular", "plural", "construct_singular", "construct_plural")
    }
    result["singular_audio"] = nikkud_forms.get("singular_audio")
    result["plural_audio"] = nikkud_forms.get("plural_audio")
    result["pronominal_suffixes"] = None
    # plurals_guid is PRESERVED by the merge step — not set here

    if gender:
        result.update(gender=gender, gender_hebrew=_GENDER_HEBREW.get(gender))

    if mishkal:
        result.update(mishkal=mishkal, mishkal_hebrew=_mishkal_to_hebrew(mishkal))

    return result


# ---------------------------------------------------------------------------
# Verb detail parsing (ported from conjugation_extract.py)
# ---------------------------------------------------------------------------


def _extract_binyan_from_page(soup: BeautifulSoup) -> str:
    """Return the binyan named on the page, or "" when none is recognised.

    The text of every ``h3.page-header`` is examined first, then the content
    of the og:description meta tag. Matching is a case-insensitive substring
    test; within one text the names are tried in BINYAN_NAMES order and the
    canonical spelling of the first hit is returned.
    """
    candidates = [heading.get_text(" ", strip=True) for heading in soup.find_all("h3", class_="page-header")]
    description = soup.find("meta", {"property": "og:description"})
    if description:
        candidates.append(str(description.get("content", "")))

    for candidate in candidates:
        lowered = candidate.lower()
        for name, name_lower in zip(BINYAN_NAMES, _BINYAN_NAMES_LOWER, strict=False):
            if name_lower in lowered:
                return name
    return ""


def _parse_conjugation_table(
    soup: BeautifulSoup,
    passive: bool = False,
    table_el=None,
) -> dict[str, dict]:
    """
    Extract verb forms from the conjugation table of a nikkud (mo) page.

    Table selection, in priority order:
      * passive=True    → the first ``conjugation-table`` found after an <h3>
                          whose text contains "passive" (table_el is ignored);
      * table_el given  → that element is parsed directly;
      * otherwise       → the first ``conjugation-table`` in the soup.

    Returns form_key -> {"form_nikkud": ..., "audio_url": ...}, with keys
    inserted in the order present, past, future, imperative, infinitive.
    Returns an empty dict when no table is found or it has fewer than 3 rows.
    """
    if passive:
        heading = None
        for h3 in soup.find_all("h3"):
            if "passive" in h3.get_text(strip=True).lower():
                heading = h3
                break
        if not heading:
            return {}
        table = None
        for element in heading.find_all_next():
            if element.name == "table" and "conjugation-table" in element.get("class", []):
                table = element
                break
    elif table_el is not None:
        table = table_el
    else:
        table = soup.find("table", class_="conjugation-table")

    if not table:
        return {}

    rows = table.find_all("tr")
    if len(rows) < 3:
        return {}

    def hebrew_cells(row_idx: int) -> list[tuple[str, str]]:
        """(nikkud_text, audio_url) per Hebrew cell, repeated `colspan` times."""
        pairs: list[tuple[str, str]] = []
        for cell in rows[row_idx].find_all(["th", "td"]):
            text, audio = _get_menukad_and_audio(cell)
            span = int(cell.get("colspan", 1))
            if text and re.search(r"[\u05d0-\u05ea]", text):
                pairs.extend([(text, audio)] * span)
        return pairs

    def first_occurrences(pairs: list[tuple[str, str]]) -> list[tuple[str, str]]:
        """Drop later pairs whose text was already seen, keeping order."""
        by_text: dict[str, tuple[str, str]] = {}
        for pair in pairs:
            by_text.setdefault(pair[0], pair)
        return list(by_text.values())

    # Index of the first row mentioning each tense. A row is claimed by at
    # most one tense: the earliest keyword (in this order) that occurs in the
    # row text and has not been located yet.
    located: dict[str, int] = {}
    for idx, row in enumerate(rows):
        label = row.get_text(" ", strip=True).lower()
        for keyword in ("present", "past", "future", "imperative", "infinitive"):
            if keyword in label and keyword not in located:
                located[keyword] = idx
                break

    forms: dict[str, dict] = {}

    def fill(keys: tuple[str, ...], row_idx: int, unique: bool = False) -> None:
        """Assign the Hebrew cells of one row to `keys`, left to right."""
        if row_idx >= len(rows):
            return
        cells = hebrew_cells(row_idx)
        if unique:
            cells = first_occurrences(cells)
        for key, (text, audio) in zip(keys, cells, strict=False):
            if text:
                forms[key] = {"form_nikkud": text, "audio_url": audio}

    # Present: ms fs mp fp
    if "present" in located:
        fill(("present_ms", "present_fs", "present_mp", "present_fp"), located["present"])

    # Past spans three rows: 1st person (deduplicated), 2nd person,
    # 3rd person (deduplicated).
    if "past" in located:
        start = located["past"]
        fill(("past_1s", "past_1p"), start, unique=True)
        fill(("past_2ms", "past_2fs", "past_2mp", "past_2fp"), start + 1)
        fill(("past_3ms", "past_3fs", "past_3p"), start + 2, unique=True)

    # Future spans three rows: 1st person (deduplicated), 2nd person, 3rd person.
    if "future" in located:
        start = located["future"]
        fill(("future_1s", "future_1p"), start, unique=True)
        fill(("future_2ms", "future_2fs", "future_2mp", "future_2fp"), start + 1)
        fill(("future_3ms", "future_3fs", "future_3mp", "future_3fp"), start + 2)

    # Imperative: ms fs mp fp
    if "imperative" in located:
        fill(("imperative_ms", "imperative_fs", "imperative_mp", "imperative_fp"), located["imperative"])

    # Infinitive: first Hebrew cell of the row
    if "infinitive" in located:
        fill(("infinitive",), located["infinitive"])

    return forms


def _parse_conjugation_table_vl(
    soup: BeautifulSoup,
    passive: bool = False,
    table_el=None,
) -> dict[str, str]:
    """
    Extract verb forms from the conjugation table of a vl (ktiv male) page.

    Same table selection and row layout as _parse_conjugation_table, but each
    form is stored as plain text: returns form_key -> ktiv_male_text, with
    keys inserted in the order present, past, future, imperative, infinitive.
    Returns an empty dict when no table is found or it has fewer than 3 rows.
    """
    if passive:
        heading = None
        for h3 in soup.find_all("h3"):
            if "passive" in h3.get_text(strip=True).lower():
                heading = h3
                break
        if not heading:
            return {}
        table = None
        for element in heading.find_all_next():
            if element.name == "table" and "conjugation-table" in element.get("class", []):
                table = element
                break
    elif table_el is not None:
        table = table_el
    else:
        table = soup.find("table", class_="conjugation-table")

    if not table:
        return {}

    rows = table.find_all("tr")
    if len(rows) < 3:
        return {}

    def hebrew_texts(row_idx: int) -> list[str]:
        """Text of each Hebrew cell in the row, repeated `colspan` times."""
        texts: list[str] = []
        for cell in rows[row_idx].find_all(["th", "td"]):
            text = _get_plain_text(cell)
            span = int(cell.get("colspan", 1))
            if text and re.search(r"[\u05d0-\u05ea]", text):
                texts.extend([text] * span)
        return texts

    # Index of the first row mentioning each tense. A row is claimed by at
    # most one tense: the earliest keyword (in this order) that occurs in the
    # row text and has not been located yet.
    located: dict[str, int] = {}
    for idx, row in enumerate(rows):
        label = row.get_text(" ", strip=True).lower()
        for keyword in ("present", "past", "future", "imperative", "infinitive"):
            if keyword in label and keyword not in located:
                located[keyword] = idx
                break

    ktiv_forms: dict[str, str] = {}

    def fill(keys: tuple[str, ...], row_idx: int, unique: bool = False) -> None:
        """Assign the Hebrew texts of one row to `keys`, left to right."""
        if row_idx >= len(rows):
            return
        texts = hebrew_texts(row_idx)
        if unique:
            # dict.fromkeys keeps the first occurrence of each text, in order.
            texts = list(dict.fromkeys(texts))
        for key, text in zip(keys, texts, strict=False):
            if text:
                ktiv_forms[key] = text

    if "present" in located:
        fill(("present_ms", "present_fs", "present_mp", "present_fp"), located["present"])

    if "past" in located:
        start = located["past"]
        fill(("past_1s", "past_1p"), start, unique=True)
        fill(("past_2ms", "past_2fs", "past_2mp", "past_2fp"), start + 1)
        fill(("past_3ms", "past_3fs", "past_3p"), start + 2, unique=True)

    if "future" in located:
        start = located["future"]
        fill(("future_1s", "future_1p"), start, unique=True)
        fill(("future_2ms", "future_2fs", "future_2mp", "future_2fp"), start + 1)
        fill(("future_3ms", "future_3fs", "future_3mp", "future_3fp"), start + 2)

    if "imperative" in located:
        fill(("imperative_ms", "imperative_fs", "imperative_mp", "imperative_fp"), located["imperative"])

    if "infinitive" in located:
        fill(("infinitive",), located["infinitive"])

    return ktiv_forms


def _forms_to_active_list(
    mo_forms: dict[str, dict],
    vl_forms: dict[str, str],
    existing_forms: list[dict] | None,
) -> list[dict]:
    """
    Convert parsed form dicts into the active_forms list structure (matches SCHEMA.yaml).
    Preserves guid and guid_candidates from existing_forms where present.
    """
    # Build a lookup of existing form data keyed by (person, tense) for GUID preservation
    existing_lookup: dict[tuple[str, str], dict] = {}
    if existing_forms:
        for ef in existing_forms:
            key = (ef.get("person", ""), ef.get("tense", ""))
            existing_lookup[key] = ef

    active_forms: list[dict] = []
    for form_key, form_data in mo_forms.items():
        person = FORM_KEY_TO_PERSON.get(form_key, form_key)
        tense = TENSE_DESCRIPTION.get(form_key, "")
        nikkud = form_data["form_nikkud"]
        ktiv = vl_forms.get(form_key, "")
        if not ktiv:
            logger.warning("No ktiv_male for verb form %s: %s", form_key, nikkud)
        audio_url = form_data.get("audio_url", "")
        pronoun = PRONOUN_LABELS.get(form_key, "")

        # Preserve GUIDs from existing entry
        existing = existing_lookup.get((person, tense), {})
        guid = existing.get("guid")
        guid_candidates = existing.get("guid_candidates")

        active_forms.append(
            {
                "person": person,
                "tense": tense,
                "pronoun_hebrew": pronoun,
                "form": {"nikkud": nikkud, "ktiv_male": ktiv},
                "audio_url": audio_url,
                "audio_file": existing.get("audio_file"),
                "guid": guid,
                "guid_candidates": guid_candidates,
            }
        )

    return active_forms


def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: dict | None) -> dict:
    """
    Parse the two verb detail pages for one slug (mo=nikkud, vl=ktiv male).

    Args:
        slug: Page slug, used only in log messages.
        mo_html: HTML of the nikkud page.
        vl_html: HTML of the ktiv male page.
        existing_conj: Conjugation dict already stored for the entry, or None.
            Its in_conjugation_deck flag, its prep (when the page yields none)
            and the per-form data kept by _forms_to_active_list are reused.

    Returns:
        Dict for the entry's conjugation field, or an empty dict when no
        active forms could be parsed from the nikkud page.
    """
    nikkud_soup = BeautifulSoup(mo_html, "lxml")
    ktiv_soup = BeautifulSoup(vl_html, "lxml")

    previous = existing_conj or {}

    # Metadata is read from the nikkud page only.
    binyan = _extract_binyan_from_page(nikkud_soup)

    meaning = ""
    prep: str | None = None
    lead = nikkud_soup.find("div", class_="lead")
    if lead:
        meaning = lead.get_text(strip=True)
        # Parenthesised Hebrew in the lead text, e.g. "(על)" → "על"; several are space-joined.
        governed = HBPAREN_RE.findall(meaning)
        prep = " ".join(governed) if governed else None
    if prep is None:
        # Nothing on the page: keep whatever prep was stored before.
        prep = previous.get("prep")

    active_nikkud = _parse_conjugation_table(nikkud_soup, passive=False)
    active_ktiv = _parse_conjugation_table_vl(ktiv_soup, passive=False)

    if not active_nikkud:
        logger.warning("  No active forms found for slug=%s", slug)
        return {}

    # Infinitive: stored only when the nikkud form was found.
    inf_nikkud = active_nikkud.get("infinitive", {}).get("form_nikkud", "")
    inf_ktiv = active_ktiv.get("infinitive", "")
    infinitive = None
    if inf_nikkud:
        if not inf_ktiv:
            logger.warning("No ktiv_male for infinitive: %s (slug=%s)", inf_nikkud, slug)
        infinitive = {"nikkud": inf_nikkud, "ktiv_male": inf_ktiv}

    # Reference form: past tense, 3rd person masculine singular.
    ref_nikkud = active_nikkud.get("past_3ms", {}).get("form_nikkud", "")
    ref_ktiv = active_ktiv.get("past_3ms", "")
    reference_form = None
    if ref_nikkud:
        if not ref_ktiv:
            logger.warning("No ktiv_male for past_3ms: %s (slug=%s)", ref_nikkud, slug)
        reference_form = {"nikkud": ref_nikkud, "ktiv_male": ref_ktiv}

    active_forms = _forms_to_active_list(active_nikkud, active_ktiv, previous.get("active_forms"))

    # A passive table is looked for only when some <h3> heading mentions "passive".
    has_passive_heading = any(
        "passive" in heading.get_text(strip=True).lower() for heading in nikkud_soup.find_all("h3")
    )
    passive_forms = None
    passive_reference = None

    if has_passive_heading:
        passive_nikkud = _parse_conjugation_table(nikkud_soup, passive=True)
        passive_ktiv = _parse_conjugation_table_vl(ktiv_soup, passive=True)
        if passive_nikkud:
            passive_forms = _forms_to_active_list(
                passive_nikkud,
                passive_ktiv,
                previous.get("hufal_pual_forms"),
            )
            p3_nikkud = passive_nikkud.get("past_3ms", {}).get("form_nikkud", "")
            p3_ktiv = passive_ktiv.get("past_3ms", "")
            if p3_nikkud:
                if not p3_ktiv:
                    logger.warning("No ktiv_male for passive past_3ms: %s (slug=%s)", p3_nikkud, slug)
                passive_reference = {"nikkud": p3_nikkud, "ktiv_male": p3_ktiv}

    return {
        "in_conjugation_deck": previous.get("in_conjugation_deck", False),
        "infinitive": infinitive,
        "reference_form": reference_form,
        "binyan": binyan,
        "binyan_hebrew": BINYAN_HEBREW.get(binyan, ""),
        "meaning": meaning,
        "prep": prep,
        "active_forms": active_forms,
        "hufal_pual_forms": passive_forms,
        "reference_form_passive": passive_reference,
    }


# ---------------------------------------------------------------------------
# Adjective detail parsing
# ---------------------------------------------------------------------------

_ADJECTIVE_CELL_IDS: tuple[str, ...] = ("ms-a", "fs-a", "mp-a", "fp-a")
_ADJECTIVE_FORM_KEYS: tuple[str, ...] = ("ms", "fs", "mp", "fp")


def _parse_adjective_table(soup: BeautifulSoup) -> dict[str, dict]:
    """
    Read the adjective forms from the nikkud (mo) version of a detail page.

    The first table with class "conjugation-table" is searched for the
    elements whose IDs are listed in _ADJECTIVE_CELL_IDS; the vocalised text
    and audio URL of each are obtained via _get_menukad_and_audio.

    Returns:
        Dict mapping form key ("ms", "fs", "mp", "fp") to
        {"nikkud": str, "audio_url": str}. Forms whose cell is missing or
        yields no nikkud text are left out; an empty dict is returned when
        the table itself is missing.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    forms: dict[str, dict] = {}
    for form_key, cell_id in zip(_ADJECTIVE_FORM_KEYS, _ADJECTIVE_CELL_IDS, strict=True):
        cell = table.find(id=cell_id)
        if cell:
            nikkud, audio_url = _get_menukad_and_audio(cell)
            if nikkud:
                forms[form_key] = {"nikkud": nikkud, "audio_url": audio_url}

    return forms


def _parse_adjective_table_vl(soup: BeautifulSoup) -> dict[str, str]:
    """
    Read the adjective forms from the ktiv male (vl) version of a detail page.

    Returns:
        Dict mapping form key ("ms", "fs", "mp", "fp") to the text that
        _get_plain_text extracts from the matching cell. Missing cells and
        empty texts are omitted; an empty dict is returned when the page has
        no "conjugation-table" table.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    # Lazy pipeline: each cell is located and read before the next ID is looked up.
    located = (
        (form_key, table.find(id=cell_id))
        for form_key, cell_id in zip(_ADJECTIVE_FORM_KEYS, _ADJECTIVE_CELL_IDS, strict=True)
    )
    spelled = ((form_key, _get_plain_text(cell)) for form_key, cell in located if cell)
    return {form_key: ktiv for form_key, ktiv in spelled if ktiv}


def _parse_adjective_mishkal(soup: BeautifulSoup) -> tuple[str, str]:
    """
    Extract the mishkal of an adjective detail page.

    Delegates to _parse_noun_gender_mishkal (ignoring the first element of
    its result) and converts the mishkal with _mishkal_to_hebrew.

    Returns:
        Tuple of (mishkal_english, mishkal_hebrew); the Hebrew part is ""
        when the conversion yields nothing.
    """
    _unused_gender, english = _parse_noun_gender_mishkal(soup)
    hebrew = _mishkal_to_hebrew(english)
    return english, (hebrew or "")


def _scrape_adjective_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
    """
    Parse the two adjective detail pages (mo=nikkud, vl=ktiv male).

    Args:
        _slug: Unused; kept so all detail scrapers share a call shape.
        mo_html: HTML of the nikkud page.
        vl_html: HTML of the ktiv male page.

    Returns:
        Dict with the keys ms, fs, mp, fp (each {nikkud, ktiv_male}, or None
        when the nikkud page has no such form) plus mishkal and
        mishkal_hebrew (None when empty). Empty dict if the nikkud page
        yielded no forms at all.
    """
    nikkud_soup = BeautifulSoup(mo_html, "lxml")
    ktiv_soup = BeautifulSoup(vl_html, "lxml")

    nikkud_forms = _parse_adjective_table(nikkud_soup)
    ktiv_forms = _parse_adjective_table_vl(ktiv_soup)
    mishkal, mishkal_hebrew = _parse_adjective_mishkal(nikkud_soup)

    if not nikkud_forms:
        return {}

    inflection: dict = {}
    for form_key in _ADJECTIVE_FORM_KEYS:
        parsed = nikkud_forms.get(form_key)
        if not parsed:
            # Keep the key present so every stored record has the same shape.
            inflection[form_key] = None
            continue

        nikkud = parsed["nikkud"]
        ktiv = ktiv_forms.get(form_key, "")
        if not ktiv:
            logger.warning("No ktiv_male for adjective form %s: %s", form_key, nikkud)
        inflection[form_key] = {"nikkud": nikkud, "ktiv_male": ktiv}

    inflection.update(mishkal=mishkal or None, mishkal_hebrew=mishkal_hebrew or None)
    return inflection


# ---------------------------------------------------------------------------
# Preposition detail parsing
# ---------------------------------------------------------------------------

_PREPOSITION_CELL_IDS: tuple[str, ...] = (
    "P-1s",
    "P-1p",
    "P-2ms",
    "P-2fs",
    "P-2mp",
    "P-2fp",
    "P-3ms",
    "P-3fs",
    "P-3mp",
    "P-3fp",
)
_PREPOSITION_FORM_KEYS: tuple[str, ...] = (
    "1s",
    "1p",
    "2ms",
    "2fs",
    "2mp",
    "2fp",
    "3ms",
    "3fs",
    "3mp",
    "3fp",
)


def _parse_preposition_table(soup: BeautifulSoup) -> dict[str, dict]:
    """
    Read the pronominal suffix forms from the nikkud (mo) version of a detail page.

    The first table with class "conjugation-table" is searched for the
    elements whose IDs are listed in _PREPOSITION_CELL_IDS (P-1s … P-3fp);
    the vocalised text and audio URL of each are obtained via
    _get_menukad_and_audio.

    Returns:
        Dict mapping person key ("1s", "1p", …, "3fp") to
        {"nikkud": str, "audio_url": str}. Persons whose cell is missing or
        yields no nikkud text are left out; an empty dict is returned when
        the table itself is missing.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    forms: dict[str, dict] = {}
    for person_key, cell_id in zip(_PREPOSITION_FORM_KEYS, _PREPOSITION_CELL_IDS, strict=True):
        cell = table.find(id=cell_id)
        if cell:
            nikkud, audio_url = _get_menukad_and_audio(cell)
            if nikkud:
                forms[person_key] = {"nikkud": nikkud, "audio_url": audio_url}

    return forms


def _parse_preposition_table_vl(soup: BeautifulSoup) -> dict[str, str]:
    """
    Read the pronominal suffix forms from the ktiv male (vl) version of a detail page.

    Returns:
        Dict mapping person key ("1s", "1p", …, "3fp") to the text that
        _get_plain_text extracts from the matching cell. Missing cells and
        empty texts are omitted; an empty dict is returned when the page has
        no "conjugation-table" table.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    # Lazy pipeline: each cell is located and read before the next ID is looked up.
    located = (
        (person_key, table.find(id=cell_id))
        for person_key, cell_id in zip(_PREPOSITION_FORM_KEYS, _PREPOSITION_CELL_IDS, strict=True)
    )
    spelled = ((person_key, _get_plain_text(cell)) for person_key, cell in located if cell)
    return {person_key: ktiv for person_key, ktiv in spelled if ktiv}


def _scrape_preposition_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
    """
    Parse the two preposition detail pages (mo=nikkud, vl=ktiv male).

    Args:
        _slug: Unused; kept so all detail scrapers share a call shape.
        mo_html: HTML of the nikkud page.
        vl_html: HTML of the ktiv male page.

    Returns:
        Dict with one key per person (1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs,
        3mp, 3fp), each {nikkud, ktiv_male}, or None when the nikkud page has
        no such form. Empty dict if the nikkud page yielded no forms at all.
    """
    nikkud_soup = BeautifulSoup(mo_html, "lxml")
    ktiv_soup = BeautifulSoup(vl_html, "lxml")

    nikkud_forms = _parse_preposition_table(nikkud_soup)
    ktiv_forms = _parse_preposition_table_vl(ktiv_soup)

    if not nikkud_forms:
        return {}

    inflection: dict = {}
    for person_key in _PREPOSITION_FORM_KEYS:
        parsed = nikkud_forms.get(person_key)
        if not parsed:
            # Keep the key present so every stored record has the same shape.
            inflection[person_key] = None
            continue

        nikkud = parsed["nikkud"]
        ktiv = ktiv_forms.get(person_key, "")
        if not ktiv:
            logger.warning("No ktiv_male for preposition form %s: %s", person_key, nikkud)
        inflection[person_key] = {"nikkud": nikkud, "ktiv_male": ktiv}

    return inflection


# ---------------------------------------------------------------------------
# Merging strategy
# ---------------------------------------------------------------------------


def _merge_noun_inflection(existing_ni: dict | None, scraped: dict) -> dict:
    """
    Merge scraped noun data into existing noun_inflection, preserving plurals_guid.
    """
    result = dict(scraped)
    if existing_ni:
        # PRESERVE existing plurals_guid — never overwrite
        if existing_ni.get("plurals_guid"):
            result["plurals_guid"] = existing_ni["plurals_guid"]
        # Preserve existing singular_audio if we didn't scrape one
        if not result.get("singular_audio") and existing_ni.get("singular_audio"):
            result["singular_audio"] = existing_ni["singular_audio"]
        # Preserve existing plural_audio if we didn't scrape one
        if not result.get("plural_audio") and existing_ni.get("plural_audio"):
            result["plural_audio"] = existing_ni["plural_audio"]
        # Preserve existing singular/plural if we failed to scrape them
        for field in ("singular", "plural", "construct_singular", "construct_plural"):
            if not result.get(field) and existing_ni.get(field):
                result[field] = existing_ni[field]
    else:
        result.setdefault("plurals_guid", None)

    return result


def _merge_conjugation(_existing_conj: dict | None, scraped: dict) -> dict:
    """
    Merge scraped verb data into existing conjugation, preserving in_conjugation_deck
    and all guid/guid_candidates fields (already handled in _forms_to_active_list).
    """
    # The scraped dict already preserves in_conjugation_deck and GUIDs via _forms_to_active_list
    return scraped


def _merge_adjective_inflection(_existing_ai: dict | None, scraped: dict) -> dict:
    """
    Merge scraped adjective data into existing adjective_inflection.
    No GUIDs to preserve — simple overwrite with scraped data.
    """
    return dict(scraped)


def _merge_preposition_inflection(_existing_pi: dict | None, scraped: dict) -> dict:
    """
    Merge scraped preposition data into existing preposition_inflection.
    No GUIDs to preserve — simple overwrite with scraped data.
    """
    return dict(scraped)


# ---------------------------------------------------------------------------
# I/O helpers
# ---------------------------------------------------------------------------


def _load_words() -> dict:
    """Read and return the parsed contents of words.json, or an empty dict if the file does not exist."""
    if not WORDS_JSON.exists():
        return {}
    with WORDS_JSON.open(encoding="utf-8") as handle:
        return json.load(handle)


def _save_words(data: dict) -> None:
    """
    Write ``data`` to words.json by way of a sibling ".tmp" file.

    The JSON is first written in full to "<words.json>.tmp" and then moved
    over the real file with os.replace, so a failure while serialising never
    leaves a truncated words.json. The parent directory is created if needed.
    """
    WORDS_JSON.parent.mkdir(parents=True, exist_ok=True)
    staging = WORDS_JSON.with_name(WORDS_JSON.name + ".tmp")
    with staging.open("w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps the Hebrew text readable in the file.
        json.dump(data, handle, ensure_ascii=False, indent=2)
    os.replace(staging, WORDS_JSON)


# ---------------------------------------------------------------------------
# Main scrape loop
# ---------------------------------------------------------------------------


def _should_process(
    entry: dict,
    pos: str,
    force: bool,
    nouns_only: bool,
    verbs_only: bool,
    adjectives_only: bool,
    prepositions_only: bool,
) -> bool:
    """Return True if this entry should be scraped."""
    if not pos.startswith(("Noun", "Verb", "Adjective", "Preposition")):
        return False
    if nouns_only and not pos.startswith("Noun"):
        return False
    if verbs_only and not pos.startswith("Verb"):
        return False
    if adjectives_only and not pos.startswith("Adjective"):
        return False
    if prepositions_only and not pos.startswith("Preposition"):
        return False
    return force or not entry.get("detail_scraped")


def run(
    test: int | None = None,
    force_refresh: bool = False,
    nouns_only: bool = False,
    verbs_only: bool = False,
    adjectives_only: bool = False,
    prepositions_only: bool = False,
) -> None:
    """
    Main scrape loop.

    Loads words.json, selects the entries accepted by _should_process that
    also have a slug, and for each one fetches the mo (nikkud) and vl
    (ktiv male) detail pages, parses them according to the entry's part of
    speech and stores the merged result on the entry. Successfully processed
    entries are marked with detail_scraped=True. Entries whose fetch or parse
    fails are counted as errors and left untouched.

    words.json is written every SAVE_INTERVAL processed entries and once more
    when the loop ends. The final write happens in a ``finally`` clause, so
    entries processed since the last auto-save are still saved when the run
    is interrupted (e.g. Ctrl-C) or an unexpected exception escapes the loop.

    Args:
        test: If set, scrape at most this many entries (for smoke-testing).
        force_refresh: Re-scrape entries where detail_scraped=True.
        nouns_only: Only scrape noun entries.
        verbs_only: Only scrape verb entries.
        adjectives_only: Only scrape adjective entries.
        prepositions_only: Only scrape preposition entries.
    """
    words = _load_words()

    candidates = [
        (unique_key, entry)
        for unique_key, entry in words.items()
        if _should_process(
            entry,
            entry.get("pos", ""),
            force_refresh,
            nouns_only,
            verbs_only,
            adjectives_only,
            prepositions_only,
        )
        and entry.get("slug")
    ]

    total = len(candidates)
    if test is not None:
        candidates = candidates[:test]
        logger.info("Test mode: processing %d of %d eligible entries", len(candidates), total)
    else:
        logger.info(
            "Processing %d eligible entries (nouns+verbs+adjectives+prepositions) from words.json",
            total,
        )

    processed = 0
    errors = 0

    try:
        for idx, (unique_key, entry) in enumerate(candidates, start=1):
            slug = entry["slug"]
            pos = entry.get("pos", "")
            # "word" may be absent or null; fall back to the dictionary key for logging.
            word_nikkud = (entry.get("word") or {}).get("nikkud", unique_key)
            url = f"{PEALIM_BASE}/dict/{slug}/"

            if pos.startswith("Noun"):
                label = "Noun"
            elif pos.startswith("Verb"):
                label = "Verb"
            elif pos.startswith("Adjective"):
                label = "Adjective"
            else:
                label = "Preposition"
            logger.info("[%d/%d] %s: %s (%s)", idx, len(candidates), label, word_nikkud, slug)

            # Fetch mo (nikkud) page; the delay precedes every request.
            time.sleep(REQUEST_DELAY)
            mo_html = _fetch(url, hebstyle="mo")
            if not mo_html:
                logger.warning("  Skipping %s — failed to fetch mo page", slug)
                errors += 1
                continue

            # Fetch vl (ktiv male) page
            time.sleep(REQUEST_DELAY)
            vl_html = _fetch(url, hebstyle="vl")
            if not vl_html:
                logger.warning("  Skipping %s — failed to fetch vl page", slug)
                errors += 1
                continue

            # Parse and merge
            try:
                if pos.startswith("Noun"):
                    scraped = _scrape_noun_detail(slug, mo_html, vl_html)
                    if scraped:
                        existing_ni = entry.get("noun_inflection") or {}
                        merged = _merge_noun_inflection(existing_ni, scraped)
                        words[unique_key]["noun_inflection"] = merged
                        sg = merged.get("singular", {}) or {}
                        pl = merged.get("plural", {}) or {}
                        logger.info(
                            "  singular=%s plural=%s",
                            sg.get("nikkud", "—"),
                            pl.get("nikkud", "—"),
                        )
                    else:
                        logger.warning("  No noun data scraped for %s", slug)
                        errors += 1
                        continue

                elif pos.startswith("Verb"):
                    existing_conj = entry.get("conjugation")
                    scraped = _scrape_verb_detail(slug, mo_html, vl_html, existing_conj)
                    if scraped:
                        merged = _merge_conjugation(existing_conj, scraped)
                        words[unique_key]["conjugation"] = merged
                        n_forms = len(merged.get("active_forms", []))
                        logger.info(
                            "  %s, %d forms",
                            merged.get("binyan", "?"),
                            n_forms,
                        )
                    else:
                        logger.warning("  No verb data scraped for %s", slug)
                        errors += 1
                        continue

                elif pos.startswith("Adjective"):
                    scraped = _scrape_adjective_detail(slug, mo_html, vl_html)
                    if scraped:
                        existing_ai = entry.get("adjective_inflection")
                        merged = _merge_adjective_inflection(existing_ai, scraped)
                        words[unique_key]["adjective_inflection"] = merged
                        ms = merged.get("ms", {}) or {}
                        fs = merged.get("fs", {}) or {}
                        logger.info(
                            "  ms=%s fs=%s mishkal=%s",
                            ms.get("nikkud", "—"),
                            fs.get("nikkud", "—"),
                            merged.get("mishkal", "—"),
                        )
                    else:
                        logger.warning("  No adjective data scraped for %s", slug)
                        errors += 1
                        continue

                else:  # Preposition
                    scraped = _scrape_preposition_detail(slug, mo_html, vl_html)
                    if scraped:
                        existing_pi = entry.get("preposition_inflection")
                        merged = _merge_preposition_inflection(existing_pi, scraped)
                        words[unique_key]["preposition_inflection"] = merged
                        form_1s = merged.get("1s", {}) or {}
                        logger.info(
                            "  1s=%s",
                            form_1s.get("nikkud", "—"),
                        )
                    else:
                        logger.warning("  No preposition data scraped for %s", slug)
                        errors += 1
                        continue

            except Exception as exc:  # noqa: BLE001
                # One unparseable page must not abort the whole run.
                logger.error("  Parse error for %s (%s): %s", slug, word_nikkud, exc, exc_info=True)
                errors += 1
                continue

            words[unique_key]["detail_scraped"] = True
            processed += 1

            # Incremental save every SAVE_INTERVAL entries
            if processed % SAVE_INTERVAL == 0:
                logger.info("  Auto-saving after %d entries...", processed)
                _save_words(words)
    finally:
        # Final save — also reached on interruption, so work since the last auto-save is kept.
        _save_words(words)

    logger.info(
        "Done. Processed=%d, Errors=%d, Total eligible=%d",
        processed,
        errors,
        len(candidates),
    )


# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------


def _build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
        description=("Scrape pealim.com detail pages for nouns, verbs, adjectives and prepositions in data/words.json.")
    )
    parser.add_argument(
        "--test",
        metavar="N",
        type=int,
        default=None,
        help="Scrape only N entries (smoke-test mode).",
    )
    parser.add_argument(
        "--force-refresh-detail",
        action="store_true",
        default=False,
        help="Re-scrape entries where detail_scraped=True.",
    )
    group = parser.add_mutually_exclusive_group()
    group.add_argument(
        "--nouns-only",
        action="store_true",
        default=False,
        help="Only scrape Noun entries.",
    )
    group.add_argument(
        "--verbs-only",
        action="store_true",
        default=False,
        help="Only scrape Verb entries.",
    )
    group.add_argument(
        "--adjectives-only",
        action="store_true",
        default=False,
        help="Only scrape Adjective entries.",
    )
    group.add_argument(
        "--prepositions-only",
        action="store_true",
        default=False,
        help="Only scrape Preposition entries.",
    )
    return parser


if __name__ == "__main__":
    # Script entry point: set up timestamped INFO logging, then run the
    # scraper with the options given on the command line.
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
        level=logging.INFO,
    )

    cli_args = _build_parser().parse_args()
    run(
        test=cli_args.test,
        force_refresh=cli_args.force_refresh_detail,
        nouns_only=cli_args.nouns_only,
        verbs_only=cli_args.verbs_only,
        adjectives_only=cli_args.adjectives_only,
        prepositions_only=cli_args.prepositions_only,
    )