hebrew_flash_cards/pealim_detail_scrape.py
Sochen af186e2030 Sprint 17: homograph example dedup + plural audio + prep extraction
- Homograph collision fix: _deduplicate_confusable_examples() clears
  shared examples from less-common confusable group members (36 entries
  fixed). Keeps examples only on highest-frequency meaning.
- Plural deck audio: wired up PluralAudio field in apkg_builder.py,
  downloaded 613 plural audio files from pealim.com for all deck entries.
- Prep extraction upstream: moved Hebrew preposition parsing from build
  time into list/detail scrapers (SCHEMA.yaml prep field added).
- Validation: new no_shared_confusable_examples check in validate_data.py
- Tests: 9 new unit tests for confusable deduplication (98 total)
- Release: v0.19

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-14 21:51:35 +00:00

1593 lines
54 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Consolidated detail page scraper for pealim.com.
Visits /dict/<slug>/ detail pages for nouns, verbs, adjectives and prepositions
in data/words.json.
Makes two requests per slug:
1. hebstyle=mo cookie → nikkud forms
2. hebstyle=vl cookie → ktiv male forms
Updates entries in data/words.json with scraped detail data.
Usage:
python3 pealim_detail_scrape.py [--test N] [--force-refresh-detail]
[--nouns-only | --verbs-only |
--adjectives-only | --prepositions-only]
"""
import argparse
import json
import logging
import os
import re
import time
from pathlib import Path
import requests
from bs4 import BeautifulSoup
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Site root: scheme and host, no trailing slash.
PEALIM_BASE = "https://www.pealim.com"
REQUEST_DELAY = 1.5  # seconds between requests; also the default initial retry wait of _fetch()
REQUEST_TIMEOUT = 15  # handed to requests as `timeout=`
SAVE_INTERVAL = 50  # write words.json every N processed entries
# The data file sits in a "data" directory next to this source file.
WORDS_JSON = Path(__file__).parent / "data" / "words.json"
# Regex for Hebrew prepositions wrapped in parentheses, e.g. "(על)" or "(ב-)"
# Group 1 is the text between the parentheses. The class accepts Hebrew points
# and letters (U+05B0–U+05EA), U+05F0–U+05F4, and the ASCII hyphen.
HBPAREN_RE = re.compile(r"\(([\u05b0-\u05ea\u05f0-\u05f4\-]+)\)")
# English binyan names. Order matters: _extract_binyan_from_page() returns the
# first name in this tuple that occurs in the page text.
BINYAN_NAMES: tuple[str, ...] = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al")
# Lower-cased copy of BINYAN_NAMES, index-aligned, for case-insensitive matching.
_BINYAN_NAMES_LOWER: tuple[str, ...] = tuple(b.lower() for b in BINYAN_NAMES)
# English binyan name (as spelled in BINYAN_NAMES) → Hebrew name with nikkud.
# Looked up when building a verb's "binyan_hebrew" field; a name missing here
# yields "" at that call site.
BINYAN_HEBREW: dict[str, str] = {
    "Pa'al": "פָּעַל",
    "Nif'al": "נִפְעַל",
    "Pi'el": "פִּיעֵל",
    "Pu'al": "פֻּעַל",
    "Hif'il": "הִפְעִיל",
    "Huf'al": "הֻפְעַל",
    "Hitpa'el": "הִתְפַּעֵל",
}
# Form key → Hebrew subject pronoun (with nikkud) stored as "pronoun_hebrew"
# on each conjugated form. Present-tense forms and the infinitive carry no
# pronoun and map to the empty string.
PRONOUN_LABELS: dict[str, str] = {
    "present_ms": "",
    "present_fs": "",
    "present_mp": "",
    "present_fp": "",
    "past_1s": "אֲנִי",
    "past_1p": "אֲנַחְנוּ",
    "past_2ms": "אַתָּה",
    "past_2fs": "אַתְּ",
    "past_2mp": "אַתֶּם",
    "past_2fp": "אַתֶּן",
    "past_3ms": "הוּא",
    "past_3fs": "הִיא",
    "past_3p": "הֵם / הֵן",  # one shared past form for both 3rd-person plurals
    "future_1s": "אֲנִי",
    "future_1p": "אֲנַחְנוּ",
    "future_2ms": "אַתָּה",
    "future_2fs": "אַתְּ",
    "future_2mp": "אַתֶּם",
    "future_2fp": "אַתֶּן",
    "future_3ms": "הוּא",
    "future_3fs": "הִיא",
    "future_3mp": "הֵם",
    "future_3fp": "הֵן",
    "imperative_ms": "אַתָּה",
    "imperative_fs": "אַתְּ",
    "imperative_mp": "אַתֶּם",
    "imperative_fp": "אַתֶּן",
    "infinitive": "",
}
# Form key → Hebrew tense name (with nikkud) stored as the "tense" field of
# each conjugated form. Together with the person code it forms the
# (person, tense) key used to carry GUIDs over from previously stored forms,
# so changing a value here breaks that matching for existing data.
TENSE_DESCRIPTION: dict[str, str] = {
    "present_ms": "הוֹוֶה",
    "present_fs": "הוֹוֶה",
    "present_mp": "הוֹוֶה",
    "present_fp": "הוֹוֶה",
    "past_1s": "עָבָר",
    "past_1p": "עָבָר",
    "past_2ms": "עָבָר",
    "past_2fs": "עָבָר",
    "past_2mp": "עָבָר",
    "past_2fp": "עָבָר",
    "past_3ms": "עָבָר",
    "past_3fs": "עָבָר",
    "past_3p": "עָבָר",
    "future_1s": "עָתִיד",
    "future_1p": "עָתִיד",
    "future_2ms": "עָתִיד",
    "future_2fs": "עָתִיד",
    "future_2mp": "עָתִיד",
    "future_2fp": "עָתִיד",
    "future_3ms": "עָתִיד",
    "future_3fs": "עָתִיד",
    "future_3mp": "עָתִיד",
    "future_3fp": "עָתִיד",
    "imperative_ms": "צִוּוּי",
    "imperative_fs": "צִוּוּי",
    "imperative_mp": "צִוּוּי",
    "imperative_fp": "צִוּוּי",
    "infinitive": "מְקוֹר",
}
# Form key → person/number/gender code stored as the "person" field of each
# conjugated form (the tense prefix is dropped; the infinitive becomes "inf").
# Codes repeat across tenses, so a form is only unique as (person, tense).
FORM_KEY_TO_PERSON: dict[str, str] = {
    "present_ms": "ms",
    "present_fs": "fs",
    "present_mp": "mp",
    "present_fp": "fp",
    "past_1s": "1s",
    "past_1p": "1p",
    "past_2ms": "2ms",
    "past_2fs": "2fs",
    "past_2mp": "2mp",
    "past_2fp": "2fp",
    "past_3ms": "3ms",
    "past_3fs": "3fs",
    "past_3p": "3p",
    "future_1s": "1s",
    "future_1p": "1p",
    "future_2ms": "2ms",
    "future_2fs": "2fs",
    "future_2mp": "2mp",
    "future_2fp": "2fp",
    "future_3ms": "3ms",
    "future_3fs": "3fs",
    "future_3mp": "3mp",
    "future_3fp": "3fp",
    "imperative_ms": "ms",
    "imperative_fs": "fs",
    "imperative_mp": "mp",
    "imperative_fp": "fp",
    "infinitive": "inf",
}
# Mishkal (noun/adjective pattern) English name → Hebrew spelling with nikkud.
# Pealim dropdowns use q-notation (qatal); detail pages return k-notation (katal).
# Keys here are q-notation only; _mishkal_to_hebrew() handles the k→q conversion,
# so do not add k-notation duplicates.
# Source: https://www.pealim.com/he/dict/?pos={adjective,noun}&am=qatil dropdowns
_MISHKAL_HEBREW_Q: dict[str, str] = {
    # --- a ---
    "aqtal": "אַקְטָל",
    "aqtala": "אַקְטָלָה",
    # --- e ---
    "eqtal": "אֶקְטָל",
    # --- h ---
    "haqtala": "הַקְטָלָה",
    "heqtel": "הֶקְטֵל",
    "hiqqatlut": "הִקָּטְלוּת",
    "hitqattlut": "הִתְקַטְּלוּת",
    # --- m ---
    "maqtal": "מַקְטָל",
    "maqtel": "מַקְטֵל",
    "maqtela": "מַקְטֵלָה",
    "maqtelet": "מַקְטֶלֶת",
    "maqtil": "מַקְטִיל",
    "maqtol": "מַקְטוֹל",
    "maqtolet": "מַקְטֹלֶת",
    "maqtul": "מַקְטוּל",
    "meqattel": "מְקַטֵּל",
    "meqila": "מְקִילָה",
    "mequla": "מְקוּלָה",
    "mequttal": "מְקֻטָּל",
    "miqtal": "מִקְטָל",
    "miqtala": "מִקְטָלָה",
    "miqtelet": "מִקְטֶלֶת",
    "miqtol": "מִקְטוֹל",
    "miqtolet": "מִקְטֹלֶת",
    "mitqattel": "מִתְקַטֵּל",
    "muqtal": "מֻקְטָל",
    # --- n ---
    "niqtal": "נִקְטָל",
    # --- q ---
    "qal": "קַל",
    "qatal": "קָטָל",
    "qatel": "קָטֵל",
    "qatil": "קָטִיל",
    "qatla": "קַטְלָה",
    "qatlan": "קַטְלָן",
    "qatlut": "קַטְלוּת",
    "qatol": "קָטוֹל",
    "qaton": "קָטוֹן",
    "qattal": "קַטָּל",
    "qattala": "קַטָּלָה",
    "qattelet": "קַטֶּלֶת",
    "qattil": "קַטִּיל",
    "qattila": "קַטִּילָה",
    "qattolet": "קַטֹּלֶת",
    "qattul": "קַטּוּל",
    "qatul": "קָטוּל",
    "qatut": "קָטוּת",
    "qetel": "קֶטֶל",
    "qeteh": "קֵטֶה",
    "qitla": "קִטְלָה",
    "qitlon": "קִטְלוֹן",
    "qittalon": "קִטָּלוֹן",
    "qittel": "קִטֵּל",
    "qittelet": "קִטֶּלֶת",
    "qittol": "קִטּוֹל",
    "qittolet": "קִטֹּלֶת",
    "qittul": "קִטּוּל",
    "qol": "קֹל",
    "qotal": "קוֹטָל",
    "qotel": "קוֹטֵל",
    "qotelet": "קוֹטֶלֶת",
    "qotla": "קָטְלָה",
    "qtal": "קְטָל",
    "qtala": "קְטָלָה",
    "qtaltal": "קְטַלְטַל",
    "qtaltan": "קְטַלְתָּן",
    "qtaltolet": "קְטַלְטֹלֶת",
    "qtel": "קְטֵל",
    "qtela": "קְטֵלָה",
    "qtelet": "קְטֶלֶת",
    "qtil": "קְטִיל",
    "qtila": "קְטִילָה",
    "qtili": "קְטִילִי",
    "qtol": "קְטוֹל",
    "qtola": "קְטוֹלָה",
    "qtolet": "קְטֹלֶת",
    "qtul": "קְטוּל",
    "qtula": "קְטוּלָה",
    "qtulla": "קְטֻלָּה",
    "qtut": "קְטוּת",
    "qutla": "קֻטְלָה",
    "quttolet": "קֻטּוֹלֶת",
    # --- t ---
    "taqtela": "תַּקְטֵלָה",
    "taqtil": "תַּקְטִיל",
    "taqtit": "תַּקְטִית",
    "taqtul": "תַּקְטוּל",
    "taqtula": "תַּקְטוּלָה",
    "taqtut": "תַּקְטוּת",
    "tiqtal": "תִּקְטָל",
    "tiqtala": "תִּקְטָלָה",
    "tiqtelet": "תִּקְטֶלֶת",
    "tiqtolet": "תִּקְטֹלֶת",
    "tqilla": "תְּקִלָּה",
    "tqula": "תְּקוּלָה",
    # --- y ---
    "yaqtul": "יַקְטוּל",
}
def _mishkal_to_hebrew(mishkal: str) -> str | None:
    """Return the Hebrew spelling of a mishkal name, or None when it is unknown.

    The lookup table is keyed in q-notation. A name that misses as given is
    retried with every "k" replaced by "q", which turns the k-notation used
    on detail pages (e.g. "ketel") into the table's q-notation ("qetel").
    An empty name yields None.
    """
    if not mishkal:
        return None
    # `or` falls through to the converted spelling only when the direct hit is missing.
    return _MISHKAL_HEBREW_Q.get(mishkal) or _MISHKAL_HEBREW_Q.get(mishkal.replace("k", "q"))
# ---------------------------------------------------------------------------
# HTTP session
# ---------------------------------------------------------------------------
# One shared Session for every request made by this module, so the custom
# User-Agent below is sent each time.
_session = requests.Session()
_session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki/2.0)"})
def _fetch(url: str, hebstyle: str, backoff: float = REQUEST_DELAY) -> str | None:
    """Fetch a URL with the given hebstyle cookie. Returns HTML string or None on failure.

    Args:
        url: URL to request.
        hebstyle: Value of the ``hebstyle`` cookie (the module docstring uses
            "mo" for nikkud forms and "vl" for ktiv male forms).
        backoff: Wait, in seconds, before the first retry. It is doubled after
            every retried failure and never exceeds 60 seconds.

    Returns:
        The response body, or None when the server answers 404, answers any
        other non-retryable HTTP error, or the retry wait has grown to the cap.

    Retried failures are HTTP 429/503 and any non-HTTP ``RequestException``.
    This function blocks in ``time.sleep`` between attempts.
    """
    # "translit": "none" presumably switches transliteration off — confirm against the site.
    cookies = {"translit": "none", "hebstyle": hebstyle}
    max_wait = 60.0  # upper bound for one wait; reaching it also ends the retries
    while True:
        try:
            resp = _session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            return resp.text
        except requests.exceptions.HTTPError as exc:
            # An HTTPError may carry no response object; treat that as "unknown status".
            status = exc.response.status_code if exc.response is not None else 0
            if status == 404:
                logger.warning(" 404 for %s — skipping", url)
                return None
            if status in (429, 503):
                wait = min(backoff, max_wait)
                logger.warning(" Rate limited (%s) — waiting %.0fs", status, wait)
                time.sleep(wait)
                backoff = min(backoff * 2, max_wait)
            else:
                logger.error(" HTTP %s for %s", status, url)
                return None
        except requests.RequestException as exc:
            # Connection errors, timeouts and the like: wait, then retry.
            wait = min(backoff, max_wait)
            logger.warning(" Request error for %s: %s — retrying in %.0fs", url, exc, wait)
            time.sleep(wait)
            backoff = min(backoff * 2, max_wait)
        # NOTE(review): indentation was lost in the copy this was restored from.
        # The give-up check is placed at loop level so it bounds both retry
        # paths; if it originally lived inside the RequestException handler,
        # 429/503 responses would be retried without limit — confirm.
        if backoff >= max_wait:
            logger.error(" Giving up on %s", url)
            return None
# ---------------------------------------------------------------------------
# Shared HTML parsing utilities
# ---------------------------------------------------------------------------
def _get_menukad_and_audio(cell) -> tuple[str, str]:
    """Return ``(nikkud_text, audio_url)`` for one table cell.

    Audio: the ``data-audio`` attribute of the first span whose class contains
    "audio-play", else the cell's own ``data-audio`` attribute, else "".
    Text: the first ``span.menukad`` if present; otherwise the cell's whole
    text, but only when it contains at least one Hebrew letter; otherwise "".
    """
    player = cell.find("span", class_=lambda c: c and "audio-play" in c)
    audio_url = player.get("data-audio", "") if player else ""
    if not audio_url:
        # Some cells carry the attribute directly instead of on a child span.
        audio_url = cell.get("data-audio", "") or audio_url

    marked = cell.find("span", class_="menukad")
    if marked:
        text = marked.get_text(strip=True)
    else:
        raw = cell.get_text(strip=True)
        text = raw if re.search(r"[\u05d0-\u05ea]", raw) else ""
    return text, audio_url
def _get_plain_text(cell) -> str:
    """Return the Hebrew text of a cell on a vl (ktiv male) page.

    Prefers the first ``span.menukad``; otherwise falls back to the cell's
    full text, which is accepted only if it contains a Hebrew letter.
    Returns "" when neither applies.
    """
    marked = cell.find("span", class_="menukad")
    if marked:
        return marked.get_text(strip=True)
    raw = cell.get_text(strip=True)
    return raw if re.search(r"[\u05d0-\u05ea]", raw) else ""
# ---------------------------------------------------------------------------
# Noun detail parsing
# ---------------------------------------------------------------------------
# Lower-case gender keyword (English or Hebrew) → canonical gender name.
# NOTE: "male" is a substring of "female", so a plain `in` test over these
# keys in insertion order reaches "male" first and can never report "female".
_GENDER_MAP = {
    "masculine": "masculine",
    "feminine": "feminine",
    "זכר": "masculine",
    "נקבה": "feminine",
    "male": "masculine",
    "female": "feminine",
}
# Canonical gender name → its Hebrew label in both spellings; stored as the
# "gender_hebrew" field of a noun.
_GENDER_HEBREW = {
    "masculine": {"nikkud": "זָכָר", "ktiv_male": "זכר"},
    "feminine": {"nikkud": "נְקֵבָה", "ktiv_male": "נקבה"},
}
def _parse_noun_table(soup: BeautifulSoup) -> dict[str, dict | str]:
    """
    Parse the noun declension table from a nikkud (mo) detail page.

    Each row is classified by the text of its first <th> (or first <td> when
    the row has no <th>): an "absolute"/"singular" row supplies the absolute
    forms, a "construct"/"סמיכות" row the construct forms. Within a row, the
    first Hebrew-bearing <td> is the singular and the second the plural.

    Returns a dict with any of the keys singular_nikkud, plural_nikkud,
    construct_singular_nikkud, construct_plural_nikkud (nikkud strings) and
    singular_audio, plural_audio (audio URLs, present only when non-empty).
    Returns an empty dict if no table is found.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    parsed: dict[str, dict | str] = {}
    for row in table.find_all("tr"):
        tds = row.find_all("td")
        label_cell = row.find("th") or (tds[0] if tds else None)
        if not label_cell:
            continue
        label = label_cell.get_text(strip=True).lower()

        # (field prefix, keep audio?) for the first and second Hebrew cell of the row.
        if "absolute" in label or ("singular" in label and "construct" not in label):
            slots = (("singular", True), ("plural", True))
        elif "construct" in label or "סמיכות" in label:
            slots = (("construct_singular", False), ("construct_plural", False))
        else:
            continue

        # Only cells that actually hold Hebrew text are forms; label cells are skipped.
        hebrew_cells = [td for td in tds if re.search(r"[\u05d0-\u05ea]", td.get_text())]
        for (prefix, keep_audio), cell in zip(slots, hebrew_cells):
            nikkud, audio = _get_menukad_and_audio(cell)
            parsed[f"{prefix}_nikkud"] = nikkud
            if keep_audio and audio:
                parsed[f"{prefix}_audio"] = audio
    return parsed
def _parse_noun_table_vl(soup: BeautifulSoup) -> dict[str, str]:
    """
    Parse the noun declension table from a vl (ktiv male) page.

    Rows are classified exactly as on the nikkud page: by the text of the
    first <th>, or of the first <td> when there is no <th>.

    Returns a dict with any of the keys singular_ktiv, plural_ktiv,
    construct_singular_ktiv, construct_plural_ktiv. Empty if no table is found.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    spelled: dict[str, str] = {}
    for row in table.find_all("tr"):
        tds = row.find_all("td")
        label_cell = row.find("th")
        if not label_cell and tds:
            label_cell = tds[0]
        if not label_cell:
            continue
        label = label_cell.get_text(strip=True).lower()

        if "absolute" in label or ("singular" in label and "construct" not in label):
            prefixes = ("singular", "plural")
        elif "construct" in label or "סמיכות" in label:
            prefixes = ("construct_singular", "construct_plural")
        else:
            continue

        # First Hebrew-bearing cell is the singular, second the plural.
        hebrew_cells = [td for td in tds if re.search(r"[\u05d0-\u05ea]", td.get_text())]
        for prefix, cell in zip(prefixes, hebrew_cells):
            spelled[f"{prefix}_ktiv"] = _get_plain_text(cell)
    return spelled
def _match_gender_keyword(text: str) -> str:
    """Return the canonical gender named in *text*, or "" if none is found.

    Keywords are tried in ``_GENDER_MAP`` order with case-insensitive
    substring matching. An occurrence that is only part of a longer keyword
    is ignored, so the "male" inside "female" is not reported as masculine.
    """
    lowered = text.lower()
    for raw, canonical in _GENDER_MAP.items():
        probe = lowered
        # Blank out every longer keyword that contains this one before testing.
        for other in _GENDER_MAP:
            if other != raw and raw in other:
                probe = probe.replace(other, " ")
        if raw in probe:
            return canonical
    return ""


def _parse_noun_gender_mishkal(soup: BeautifulSoup) -> tuple[str, str]:
    """
    Extract (gender, mishkal) from the PoS section of the detail page.
    Returns ("masculine"|"feminine"|"", mishkal_english|"").

    Pealim HTML structure:
    <p>Noun <a href="/dict/?pos=noun&nm=qetel"><i>ketel</i> pattern</a>, masculine</p>
    The mishkal is in the <i> tag (k-notation, e.g. "ketel") or the nm= URL param (q-notation).
    Some nouns have no mishkal link: <p>Noun masculine</p>

    Gender is looked for in the PoS section first, then in the og:description
    meta tag, then in spans whose class contains "small", "muted" or "pos".
    """
    gender = ""
    mishkal = ""

    # Find the PoS <p> tag — on pealim detail pages it's a bare <p> like
    # "Noun ketel pattern, masculine" or "Adjective katul pattern"
    pos_section = None
    for p in soup.find_all("p"):
        if re.match(r"^(Noun|Adjective)\b", p.get_text(" ", strip=True)):
            pos_section = p
            break
    # Fall back to older selectors (div.pos, p.pos, div.page-header)
    if not pos_section:
        pos_section = (
            soup.find("div", class_="pos") or soup.find("p", class_="pos") or soup.find("div", class_="page-header")
        )

    if pos_section:
        gender = _match_gender_keyword(pos_section.get_text(" ", strip=True))
        # Mishkal detection: extract from <a href="...nm=XXXX"><i>YYYY</i> pattern</a>
        # Nouns use nm= param, adjectives use am= param
        mishkal_link = pos_section.find("a", href=re.compile(r"[na]m="))
        if mishkal_link:
            # Prefer <i> tag text (k-notation, matches _MISHKAL_HEBREW_Q after k→q)
            i_tag = mishkal_link.find("i")
            if i_tag:
                mishkal = i_tag.get_text(strip=True)
            else:
                # Fall back to nm= URL parameter (already q-notation)
                nm_match = re.search(r"[na]m=([a-zA-Z']+)", mishkal_link.get("href", ""))
                if nm_match:
                    mishkal = nm_match.group(1)

    # Also check the og:description for gender
    if not gender:
        meta = soup.find("meta", {"property": "og:description"})
        if meta:
            gender = _match_gender_keyword(str(meta.get("content", "")))

    # Scan small/muted spans that often contain gender info
    if not gender:
        for span in soup.find_all("span", class_=lambda c: c and ("small" in c or "muted" in c or "pos" in c)):
            gender = _match_gender_keyword(span.get_text(strip=True))
            if gender:
                break

    return gender, mishkal
def _scrape_noun_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
    """
    Build a noun's inflection record from its two detail pages.

    mo_html is the nikkud page, vl_html the ktiv male page; _slug is unused.
    Every form (singular, plural, construct_singular, construct_plural) is
    either {"nikkud", "ktiv_male"} or None when the nikkud page has no such
    form. A form with nikkud but no ktiv male spelling is kept with an empty
    "ktiv_male" and a warning is logged. gender/gender_hebrew and
    mishkal/mishkal_hebrew are added only when detected.

    Returns dict to merge into entry's noun_inflection field.
    """
    mo_soup = BeautifulSoup(mo_html, "lxml")
    vl_soup = BeautifulSoup(vl_html, "lxml")
    mo_data = _parse_noun_table(mo_soup)
    vl_data = _parse_noun_table_vl(vl_soup)
    gender, mishkal = _parse_noun_gender_mishkal(mo_soup)

    def paired(name: str) -> dict | None:
        """Join the nikkud and ktiv male spellings of one form, or None if absent."""
        nikkud = str(mo_data.get(f"{name}_nikkud", ""))
        ktiv = str(vl_data.get(f"{name}_ktiv", ""))
        if not nikkud:
            return None
        if not ktiv:
            logger.warning("No ktiv_male for noun form: %s", nikkud)
        return {"nikkud": nikkud, "ktiv_male": ktiv}

    result: dict = {
        name: paired(name) for name in ("singular", "plural", "construct_singular", "construct_plural")
    }
    result["singular_audio"] = mo_data.get("singular_audio")
    result["plural_audio"] = mo_data.get("plural_audio")
    result["pronominal_suffixes"] = None
    # plurals_guid is PRESERVED by the merge step — not set here

    if gender:
        result.update(gender=gender, gender_hebrew=_GENDER_HEBREW.get(gender))
    if mishkal:
        result.update(mishkal=mishkal, mishkal_hebrew=_mishkal_to_hebrew(mishkal))
    return result
# ---------------------------------------------------------------------------
# Verb detail parsing (ported from conjugation_extract.py)
# ---------------------------------------------------------------------------
def _extract_binyan_from_page(soup: BeautifulSoup) -> str:
    """Return the verb's binyan as spelled in BINYAN_NAMES, or "" if none is found.

    Candidate texts are every ``h3.page-header`` in document order, followed
    by the og:description meta content. The first candidate that mentions a
    binyan wins; within one text, names are tried in BINYAN_NAMES order and
    matched case-insensitively as substrings.
    """
    candidates = [h3.get_text(" ", strip=True) for h3 in soup.find_all("h3", class_="page-header")]
    meta = soup.find("meta", {"property": "og:description"})
    if meta:
        candidates.append(str(meta.get("content", "")))

    for candidate in candidates:
        lowered = candidate.lower()
        for name, needle in zip(BINYAN_NAMES, _BINYAN_NAMES_LOWER):
            if needle in lowered:
                return name
    return ""
def _parse_conjugation_table(
    soup: BeautifulSoup,
    passive: bool = False,
    table_el=None,
) -> dict[str, dict]:
    """
    Parse a verb conjugation table from a nikkud (mo) page.

    Table selection: with passive=True, the first "conjugation-table" that
    follows the first <h3> whose text contains "passive"; otherwise table_el
    when provided; otherwise the first "conjugation-table" in the document.

    Returns form_key -> {"form_nikkud", "audio_url"}, with keys inserted in
    the order present, past, future, imperative, infinitive. Returns an empty
    dict when no table is found or the table has fewer than three rows.
    """
    if passive:
        heading = None
        for h3 in soup.find_all("h3"):
            if "passive" in h3.get_text(strip=True).lower():
                heading = h3
                break
        if not heading:
            return {}
        table = None
        for node in heading.find_all_next():
            if node.name == "table" and "conjugation-table" in node.get("class", []):
                table = node
                break
        if not table:
            return {}
    elif table_el is not None:
        table = table_el
    else:
        table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    rows = table.find_all("tr")
    if len(rows) < 3:
        return {}

    def hebrew_pairs(row_idx: int) -> list[tuple[str, str]]:
        """Return (nikkud_text, audio_url) per Hebrew cell, repeated colspan times."""
        pairs: list[tuple[str, str]] = []
        for cell in rows[row_idx].find_all(["th", "td"]):
            text, audio = _get_menukad_and_audio(cell)
            span = int(cell.get("colspan", 1))
            if text and re.search(r"[\u05d0-\u05ea]", text):
                pairs.extend([(text, audio)] * span)
        return pairs

    def drop_repeats(pairs: list[tuple[str, str]]) -> list[tuple[str, str]]:
        """Keep only the first pair for each distinct text, preserving order."""
        seen: set[str] = set()
        kept: list[tuple[str, str]] = []
        for text, audio in pairs:
            if text in seen:
                continue
            seen.add(text)
            kept.append((text, audio))
        return kept

    # Locate tense rows by label text: remember the first row mentioning each
    # keyword. A row claims at most one keyword — the earliest still unclaimed.
    first_row: dict[str, int] = {}
    for idx, row in enumerate(rows):
        label = row.get_text(" ", strip=True).lower()
        for word in ("present", "past", "future", "imperative", "infinitive"):
            if word in label and word not in first_row:
                first_row[word] = idx
                break

    # (tense keyword, row offset from the tense's first row, deduplicate?, form keys)
    # Rows that share one form across columns are deduplicated after the
    # colspan expansion so each key receives a distinct form.
    layout = (
        ("present", 0, False, ("present_ms", "present_fs", "present_mp", "present_fp")),
        ("past", 0, True, ("past_1s", "past_1p")),
        ("past", 1, False, ("past_2ms", "past_2fs", "past_2mp", "past_2fp")),
        ("past", 2, True, ("past_3ms", "past_3fs", "past_3p")),
        ("future", 0, True, ("future_1s", "future_1p")),
        ("future", 1, False, ("future_2ms", "future_2fs", "future_2mp", "future_2fp")),
        ("future", 2, False, ("future_3ms", "future_3fs", "future_3mp", "future_3fp")),
        ("imperative", 0, False, ("imperative_ms", "imperative_fs", "imperative_mp", "imperative_fp")),
        ("infinitive", 0, False, ("infinitive",)),
    )

    forms: dict[str, dict] = {}
    for word, offset, dedupe, keys in layout:
        if word not in first_row:
            continue
        row_idx = first_row[word] + offset
        if row_idx >= len(rows):
            continue
        pairs = hebrew_pairs(row_idx)
        if dedupe:
            pairs = drop_repeats(pairs)
        # zip stops at the shorter side, so missing cells simply leave keys unset.
        for key, (nikkud, audio_url) in zip(keys, pairs):
            if nikkud:
                forms[key] = {"form_nikkud": nikkud, "audio_url": audio_url}
    return forms
def _parse_conjugation_table_vl(
    soup: BeautifulSoup,
    passive: bool = False,
    table_el=None,
) -> dict[str, str]:
    """
    Parse a verb conjugation table from a vl (ktiv male) page.

    Follows the same table selection and row layout as the nikkud parser:
    with passive=True, the first "conjugation-table" after the first <h3>
    containing "passive"; otherwise table_el when provided; otherwise the
    first "conjugation-table" in the document.

    Returns form_key -> ktiv_male_text. Returns an empty dict when no table
    is found or the table has fewer than three rows.
    """
    if passive:
        heading = None
        for h3 in soup.find_all("h3"):
            if "passive" in h3.get_text(strip=True).lower():
                heading = h3
                break
        if not heading:
            return {}
        table = None
        for node in heading.find_all_next():
            if node.name == "table" and "conjugation-table" in node.get("class", []):
                table = node
                break
        if not table:
            return {}
    elif table_el is not None:
        table = table_el
    else:
        table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    rows = table.find_all("tr")
    if len(rows) < 3:
        return {}

    def hebrew_texts(row_idx: int) -> list[str]:
        """Return the text of each Hebrew cell in the row, repeated colspan times."""
        texts: list[str] = []
        for cell in rows[row_idx].find_all(["th", "td"]):
            text = _get_plain_text(cell)
            span = int(cell.get("colspan", 1))
            if text and re.search(r"[\u05d0-\u05ea]", text):
                texts.extend([text] * span)
        return texts

    def drop_repeats(items: list[str]) -> list[str]:
        """Keep the first occurrence of each text, preserving order."""
        seen: set[str] = set()
        kept: list[str] = []
        for item in items:
            if item in seen:
                continue
            seen.add(item)
            kept.append(item)
        return kept

    # First row mentioning each tense keyword; a row claims at most one
    # keyword — the earliest one that is still unclaimed.
    first_row: dict[str, int] = {}
    for idx, row in enumerate(rows):
        label = row.get_text(" ", strip=True).lower()
        for word in ("present", "past", "future", "imperative", "infinitive"):
            if word in label and word not in first_row:
                first_row[word] = idx
                break

    # (tense keyword, row offset from the tense's first row, deduplicate?, form keys)
    layout = (
        ("present", 0, False, ("present_ms", "present_fs", "present_mp", "present_fp")),
        ("past", 0, True, ("past_1s", "past_1p")),
        ("past", 1, False, ("past_2ms", "past_2fs", "past_2mp", "past_2fp")),
        ("past", 2, True, ("past_3ms", "past_3fs", "past_3p")),
        ("future", 0, True, ("future_1s", "future_1p")),
        ("future", 1, False, ("future_2ms", "future_2fs", "future_2mp", "future_2fp")),
        ("future", 2, False, ("future_3ms", "future_3fs", "future_3mp", "future_3fp")),
        ("imperative", 0, False, ("imperative_ms", "imperative_fs", "imperative_mp", "imperative_fp")),
        ("infinitive", 0, False, ("infinitive",)),
    )

    ktiv_forms: dict[str, str] = {}
    for word, offset, dedupe, keys in layout:
        if word not in first_row:
            continue
        row_idx = first_row[word] + offset
        if row_idx >= len(rows):
            continue
        texts = hebrew_texts(row_idx)
        if dedupe:
            texts = drop_repeats(texts)
        # zip stops at the shorter side, so missing cells simply leave keys unset.
        for key, text in zip(keys, texts):
            if text:
                ktiv_forms[key] = text
    return ktiv_forms
def _forms_to_active_list(
    mo_forms: dict[str, dict],
    vl_forms: dict[str, str],
    existing_forms: list[dict] | None,
) -> list[dict]:
    """
    Turn parsed conjugation forms into the stored list-of-forms structure.

    One entry is produced per key of mo_forms, in its iteration order. The
    ktiv male spelling comes from vl_forms; when it is missing the entry is
    kept with an empty "ktiv_male" and a warning is logged. audio_file, guid
    and guid_candidates are carried over from the entry of existing_forms
    with the same (person, tense), and are None when there is no such entry.
    """
    # Index the previously stored forms by (person, tense); later duplicates win.
    previous: dict[tuple[str, str], dict] = {
        (old.get("person", ""), old.get("tense", "")): old for old in existing_forms or []
    }

    rebuilt: list[dict] = []
    for form_key, parsed in mo_forms.items():
        person = FORM_KEY_TO_PERSON.get(form_key, form_key)
        tense = TENSE_DESCRIPTION.get(form_key, "")
        nikkud = parsed["form_nikkud"]
        ktiv = vl_forms.get(form_key, "")
        if not ktiv:
            logger.warning("No ktiv_male for verb form %s: %s", form_key, nikkud)
        carried = previous.get((person, tense), {})
        rebuilt.append(
            {
                "person": person,
                "tense": tense,
                "pronoun_hebrew": PRONOUN_LABELS.get(form_key, ""),
                "form": {"nikkud": nikkud, "ktiv_male": ktiv},
                "audio_url": parsed.get("audio_url", ""),
                "audio_file": carried.get("audio_file"),
                "guid": carried.get("guid"),
                "guid_candidates": carried.get("guid_candidates"),
            }
        )
    return rebuilt
def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: dict | None) -> dict:
    """
    Build a verb's conjugation record from its two detail pages.

    mo_html is the nikkud page, vl_html the ktiv male page. From existing_conj
    the function keeps in_conjugation_deck, the per-form audio_file / guid /
    guid_candidates, and — only when the page text names no preposition — prep.

    Returns dict to merge into entry's conjugation field, or an empty dict
    when the nikkud page yields no active forms.
    """
    mo_soup = BeautifulSoup(mo_html, "lxml")
    vl_soup = BeautifulSoup(vl_html, "lxml")
    previous = existing_conj or {}

    # Metadata comes from the nikkud page.
    binyan = _extract_binyan_from_page(mo_soup)
    lead = mo_soup.find("div", class_="lead")
    meaning = lead.get_text(strip=True) if lead else ""
    # Preposition(s) appear in the lead text in parentheses, e.g. "(על)" → "על".
    # Without any, fall back to a prep already stored (e.g. from a manual edit).
    found_preps = HBPAREN_RE.findall(meaning)
    prep: str | None = " ".join(found_preps) if found_preps else previous.get("prep")

    mo_active = _parse_conjugation_table(mo_soup, passive=False)
    vl_active = _parse_conjugation_table_vl(vl_soup, passive=False)
    if not mo_active:
        logger.warning(" No active forms found for slug=%s", slug)
        return {}

    def spelled(nikkud: str, ktiv: str) -> dict | None:
        """Pair both spellings of a form, or None when the form is absent."""
        return {"nikkud": nikkud, "ktiv_male": ktiv} if nikkud else None

    # Infinitive and reference form (past 3rd person masculine singular).
    infinitive_nikkud = mo_active.get("infinitive", {}).get("form_nikkud", "")
    infinitive_ktiv = vl_active.get("infinitive", "")
    if infinitive_nikkud and not infinitive_ktiv:
        logger.warning("No ktiv_male for infinitive: %s (slug=%s)", infinitive_nikkud, slug)
    past_3ms_nikkud = mo_active.get("past_3ms", {}).get("form_nikkud", "")
    past_3ms_ktiv = vl_active.get("past_3ms", "")
    if past_3ms_nikkud and not past_3ms_ktiv:
        logger.warning("No ktiv_male for past_3ms: %s (slug=%s)", past_3ms_nikkud, slug)

    active_forms = _forms_to_active_list(mo_active, vl_active, previous.get("active_forms"))

    # Passive section (Hif'il / Pi'el verbs): present when some <h3> mentions "passive".
    hufal_pual_forms = None
    reference_form_passive = None
    has_passive = any("passive" in h3.get_text(strip=True).lower() for h3 in mo_soup.find_all("h3"))
    if has_passive:
        mo_passive = _parse_conjugation_table(mo_soup, passive=True)
        vl_passive = _parse_conjugation_table_vl(vl_soup, passive=True)
        if mo_passive:
            hufal_pual_forms = _forms_to_active_list(mo_passive, vl_passive, previous.get("hufal_pual_forms"))
            passive_3ms_nikkud = mo_passive.get("past_3ms", {}).get("form_nikkud", "")
            passive_3ms_ktiv = vl_passive.get("past_3ms", "")
            if passive_3ms_nikkud and not passive_3ms_ktiv:
                logger.warning("No ktiv_male for passive past_3ms: %s (slug=%s)", passive_3ms_nikkud, slug)
            reference_form_passive = spelled(passive_3ms_nikkud, passive_3ms_ktiv)

    return {
        "in_conjugation_deck": previous.get("in_conjugation_deck", False),
        "infinitive": spelled(infinitive_nikkud, infinitive_ktiv),
        "reference_form": spelled(past_3ms_nikkud, past_3ms_ktiv),
        "binyan": binyan,
        "binyan_hebrew": BINYAN_HEBREW.get(binyan, ""),
        "meaning": meaning,
        "prep": prep,
        "active_forms": active_forms,
        "hufal_pual_forms": hufal_pual_forms,
        "reference_form_passive": reference_form_passive,
    }
# ---------------------------------------------------------------------------
# Adjective detail parsing
# ---------------------------------------------------------------------------
# HTML element ids of the four adjective form cells, and the form key each one
# is stored under. The two tuples are index-aligned and are zipped with
# strict=True, so they must stay the same length.
_ADJECTIVE_CELL_IDS: tuple[str, ...] = ("ms-a", "fs-a", "mp-a", "fp-a")
_ADJECTIVE_FORM_KEYS: tuple[str, ...] = ("ms", "fs", "mp", "fp")
def _parse_adjective_table(soup: BeautifulSoup) -> dict[str, dict]:
    """
    Parse the adjective inflection table from a nikkud (mo) detail page.

    Each form is found by element id (ms-a, fs-a, mp-a, fp-a) inside the
    first "conjugation-table". Forms whose element is missing or holds no
    Hebrew text are left out.

    Returns:
        Dict mapping form key ("ms", "fs", "mp", "fp") to
        {"nikkud": str, "audio_url": str}, or empty dict if table not found.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    forms: dict[str, dict] = {}
    for cell_id, form_key in zip(_ADJECTIVE_CELL_IDS, _ADJECTIVE_FORM_KEYS, strict=True):
        holder = table.find(id=cell_id)
        if holder:
            nikkud, audio_url = _get_menukad_and_audio(holder)
            if nikkud:
                forms[form_key] = {"nikkud": nikkud, "audio_url": audio_url}
    return forms
def _parse_adjective_table_vl(soup: BeautifulSoup) -> dict[str, str]:
    """
    Parse the adjective inflection table from a vl (ktiv male) page.

    Forms are located by the same element ids as on the nikkud page; a form
    whose element is missing or empty is left out.

    Returns:
        Dict mapping form key ("ms", "fs", "mp", "fp") to ktiv male string,
        or empty dict if table not found.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    spelled: dict[str, str] = {}
    for cell_id, form_key in zip(_ADJECTIVE_CELL_IDS, _ADJECTIVE_FORM_KEYS, strict=True):
        holder = table.find(id=cell_id)
        text = _get_plain_text(holder) if holder else ""
        if text:
            spelled[form_key] = text
    return spelled
def _parse_adjective_mishkal(soup: BeautifulSoup) -> tuple[str, str]:
    """
    Extract the mishkal of an adjective from its detail page.

    Delegates to the noun gender/mishkal parser and discards the gender.

    Returns:
        Tuple of (mishkal_english, mishkal_hebrew); either may be "".
    """
    mishkal = _parse_noun_gender_mishkal(soup)[1]
    return mishkal, (_mishkal_to_hebrew(mishkal) or "")
def _scrape_adjective_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
    """
    Build the adjective inflection record from the two detail pages.

    Args:
        _slug: Unused; accepted so the signature parallels the other scrapers.
        mo_html: HTML of the page fetched in mo (nikkud) style.
        vl_html: HTML of the page fetched in vl (ktiv male) style.

    Returns:
        Dict with one entry per form key (ms, fs, mp, fp), each either
        {"nikkud": str, "ktiv_male": str} or None when the form was not found
        on the nikkud page, plus "mishkal" and "mishkal_hebrew" (None when
        empty). Empty dict if the nikkud page yielded no forms at all.
        The audio URLs parsed from the nikkud table are not included.
    """
    mo_soup = BeautifulSoup(mo_html, "lxml")
    vl_soup = BeautifulSoup(vl_html, "lxml")
    nikkud_forms = _parse_adjective_table(mo_soup)
    ktiv_forms = _parse_adjective_table_vl(vl_soup)
    mishkal, mishkal_hebrew = _parse_adjective_mishkal(mo_soup)
    if not nikkud_forms:
        return {}

    inflection: dict = {}
    for form_key in _ADJECTIVE_FORM_KEYS:
        nikkud_form = nikkud_forms.get(form_key)
        if not nikkud_form:
            # Keep the key present so every record has the same shape.
            inflection[form_key] = None
            continue
        nikkud = nikkud_form["nikkud"]
        ktiv = ktiv_forms.get(form_key, "")
        if not ktiv:
            logger.warning("No ktiv_male for adjective form %s: %s", form_key, nikkud)
        inflection[form_key] = {"nikkud": nikkud, "ktiv_male": ktiv}

    inflection["mishkal"] = mishkal or None
    inflection["mishkal_hebrew"] = mishkal_hebrew or None
    return inflection
# ---------------------------------------------------------------------------
# Preposition detail parsing
# ---------------------------------------------------------------------------
_PREPOSITION_CELL_IDS: tuple[str, ...] = (
"P-1s",
"P-1p",
"P-2ms",
"P-2fs",
"P-2mp",
"P-2fp",
"P-3ms",
"P-3fs",
"P-3mp",
"P-3fp",
)
_PREPOSITION_FORM_KEYS: tuple[str, ...] = (
"1s",
"1p",
"2ms",
"2fs",
"2mp",
"2fp",
"3ms",
"3fs",
"3mp",
"3fp",
)
def _parse_preposition_table(soup: BeautifulSoup) -> dict[str, dict]:
    """
    Read the pronominal-suffix forms of a preposition from a mo (nikkud) detail page.

    The first <table class="conjugation-table"> is searched for the elements
    whose ids are listed in _PREPOSITION_CELL_IDS (P-1s, P-1p, …, P-3fp); each
    hit is handed to _get_menukad_and_audio for its nikkud text and audio URL.

    Returns:
        Dict mapping person key ("1s", "1p", …, "3fp") to
        {"nikkud": str, "audio_url": str}. Forms whose element is missing or
        whose nikkud text is empty are left out. Empty dict if the table is
        not found.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    forms: dict[str, dict] = {}
    for cell_id, form_key in zip(_PREPOSITION_CELL_IDS, _PREPOSITION_FORM_KEYS, strict=True):
        cell = table.find(id=cell_id)
        if cell:
            nikkud, audio_url = _get_menukad_and_audio(cell)
            if nikkud:
                forms[form_key] = {"nikkud": nikkud, "audio_url": audio_url}
    return forms
def _parse_preposition_table_vl(soup: BeautifulSoup) -> dict[str, str]:
    """
    Read the pronominal-suffix forms of a preposition from a vl (ktiv male) detail page.

    Looks up the same element ids as the nikkud parser, but extracts the text
    of each element with _get_plain_text.

    Returns:
        Dict mapping person key ("1s", "1p", …, "3fp") to its ktiv male
        string. Forms with a missing element or empty text are left out.
        Empty dict if the table is not found.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    spellings: dict[str, str] = {}
    for cell_id, form_key in zip(_PREPOSITION_CELL_IDS, _PREPOSITION_FORM_KEYS, strict=True):
        cell = table.find(id=cell_id)
        if cell:
            text = _get_plain_text(cell)
            if text:
                spellings[form_key] = text
    return spellings
def _scrape_preposition_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
    """
    Build the preposition inflection record from the two detail pages.

    Args:
        _slug: Unused; accepted so the signature parallels the other scrapers.
        mo_html: HTML of the page fetched in mo (nikkud) style.
        vl_html: HTML of the page fetched in vl (ktiv male) style.

    Returns:
        Dict with one entry per person key (1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms,
        3fs, 3mp, 3fp), each either {"nikkud": str, "ktiv_male": str} or None
        when the form was not found on the nikkud page. Empty dict if the
        nikkud page yielded no forms at all. The audio URLs parsed from the
        nikkud table are not included.
    """
    mo_soup = BeautifulSoup(mo_html, "lxml")
    vl_soup = BeautifulSoup(vl_html, "lxml")
    nikkud_forms = _parse_preposition_table(mo_soup)
    ktiv_forms = _parse_preposition_table_vl(vl_soup)
    if not nikkud_forms:
        return {}

    inflection: dict = {}
    for form_key in _PREPOSITION_FORM_KEYS:
        nikkud_form = nikkud_forms.get(form_key)
        if not nikkud_form:
            # Keep the key present so every record has the same shape.
            inflection[form_key] = None
            continue
        nikkud = nikkud_form["nikkud"]
        ktiv = ktiv_forms.get(form_key, "")
        if not ktiv:
            logger.warning("No ktiv_male for preposition form %s: %s", form_key, nikkud)
        inflection[form_key] = {"nikkud": nikkud, "ktiv_male": ktiv}
    return inflection
# ---------------------------------------------------------------------------
# Merging strategy
# ---------------------------------------------------------------------------
def _merge_noun_inflection(existing_ni: dict | None, scraped: dict) -> dict:
"""
Merge scraped noun data into existing noun_inflection, preserving plurals_guid.
"""
result = dict(scraped)
if existing_ni:
# PRESERVE existing plurals_guid — never overwrite
if existing_ni.get("plurals_guid"):
result["plurals_guid"] = existing_ni["plurals_guid"]
# Preserve existing singular_audio if we didn't scrape one
if not result.get("singular_audio") and existing_ni.get("singular_audio"):
result["singular_audio"] = existing_ni["singular_audio"]
# Preserve existing plural_audio if we didn't scrape one
if not result.get("plural_audio") and existing_ni.get("plural_audio"):
result["plural_audio"] = existing_ni["plural_audio"]
# Preserve existing singular/plural if we failed to scrape them
for field in ("singular", "plural", "construct_singular", "construct_plural"):
if not result.get(field) and existing_ni.get(field):
result[field] = existing_ni[field]
else:
result.setdefault("plurals_guid", None)
return result
def _merge_conjugation(_existing_conj: dict | None, scraped: dict) -> dict:
"""
Merge scraped verb data into existing conjugation, preserving in_conjugation_deck
and all guid/guid_candidates fields (already handled in _forms_to_active_list).
"""
# The scraped dict already preserves in_conjugation_deck and GUIDs via _forms_to_active_list
return scraped
def _merge_adjective_inflection(_existing_ai: dict | None, scraped: dict) -> dict:
"""
Merge scraped adjective data into existing adjective_inflection.
No GUIDs to preserve — simple overwrite with scraped data.
"""
return dict(scraped)
def _merge_preposition_inflection(_existing_pi: dict | None, scraped: dict) -> dict:
"""
Merge scraped preposition data into existing preposition_inflection.
No GUIDs to preserve — simple overwrite with scraped data.
"""
return dict(scraped)
# ---------------------------------------------------------------------------
# I/O helpers
# ---------------------------------------------------------------------------
def _load_words() -> dict:
    """Read and return the parsed contents of WORDS_JSON, or {} if the file does not exist."""
    if not WORDS_JSON.exists():
        return {}
    with open(WORDS_JSON, encoding="utf-8") as handle:
        return json.load(handle)
def _save_words(data: dict) -> None:
    """
    Serialize data to WORDS_JSON.

    The JSON is first written to a sibling "<name>.tmp" file and then moved
    over the target with os.replace, so the target is swapped in one step
    rather than rewritten in place. The parent directory is created if needed.
    """
    WORDS_JSON.parent.mkdir(parents=True, exist_ok=True)
    staging_path = f"{WORDS_JSON}.tmp"
    with open(staging_path, "w", encoding="utf-8") as handle:
        json.dump(data, handle, ensure_ascii=False, indent=2)
    os.replace(staging_path, WORDS_JSON)
# ---------------------------------------------------------------------------
# Main scrape loop
# ---------------------------------------------------------------------------
def _should_process(
entry: dict,
pos: str,
force: bool,
nouns_only: bool,
verbs_only: bool,
adjectives_only: bool,
prepositions_only: bool,
) -> bool:
"""Return True if this entry should be scraped."""
if not pos.startswith(("Noun", "Verb", "Adjective", "Preposition")):
return False
if nouns_only and not pos.startswith("Noun"):
return False
if verbs_only and not pos.startswith("Verb"):
return False
if adjectives_only and not pos.startswith("Adjective"):
return False
if prepositions_only and not pos.startswith("Preposition"):
return False
return force or not entry.get("detail_scraped")
def _detail_pos_label(pos: str) -> str:
    """Map a full part-of-speech string to "Noun", "Verb", "Adjective" or "Preposition"."""
    for label in ("Noun", "Verb", "Adjective"):
        if pos.startswith(label):
            return label
    # Candidates are pre-filtered by _should_process, so anything else is a preposition.
    return "Preposition"


def _store_scraped_detail(entry: dict, label: str, slug: str, mo_html: str, vl_html: str) -> bool:
    """
    Parse both detail pages for one entry and store the merged result on it.

    Args:
        entry: The words.json entry to update in place.
        label: Coarse part of speech as returned by _detail_pos_label.
        slug: pealim slug of the entry (used for parsing and log messages).
        mo_html: HTML of the page fetched in mo (nikkud) style.
        vl_html: HTML of the page fetched in vl (ktiv male) style.

    Returns:
        True if data was scraped and stored; False (after logging a warning)
        if the scraper returned nothing. Exceptions from the parsers propagate.
    """
    if label == "Noun":
        scraped = _scrape_noun_detail(slug, mo_html, vl_html)
        if not scraped:
            logger.warning(" No noun data scraped for %s", slug)
            return False
        existing_ni = entry.get("noun_inflection") or {}
        merged = _merge_noun_inflection(existing_ni, scraped)
        entry["noun_inflection"] = merged
        sg = merged.get("singular", {}) or {}
        pl = merged.get("plural", {}) or {}
        logger.info(
            " singular=%s plural=%s",
            sg.get("nikkud", ""),
            pl.get("nikkud", ""),
        )
        return True

    if label == "Verb":
        # The verb scraper needs the existing conjugation while it builds its result.
        existing_conj = entry.get("conjugation")
        scraped = _scrape_verb_detail(slug, mo_html, vl_html, existing_conj)
        if not scraped:
            logger.warning(" No verb data scraped for %s", slug)
            return False
        merged = _merge_conjugation(existing_conj, scraped)
        entry["conjugation"] = merged
        n_forms = len(merged.get("active_forms", []))
        logger.info(
            " %s, %d forms",
            merged.get("binyan", "?"),
            n_forms,
        )
        return True

    if label == "Adjective":
        scraped = _scrape_adjective_detail(slug, mo_html, vl_html)
        if not scraped:
            logger.warning(" No adjective data scraped for %s", slug)
            return False
        existing_ai = entry.get("adjective_inflection")
        merged = _merge_adjective_inflection(existing_ai, scraped)
        entry["adjective_inflection"] = merged
        ms = merged.get("ms", {}) or {}
        fs = merged.get("fs", {}) or {}
        logger.info(
            " ms=%s fs=%s mishkal=%s",
            ms.get("nikkud", ""),
            fs.get("nikkud", ""),
            merged.get("mishkal", ""),
        )
        return True

    # Preposition
    scraped = _scrape_preposition_detail(slug, mo_html, vl_html)
    if not scraped:
        logger.warning(" No preposition data scraped for %s", slug)
        return False
    existing_pi = entry.get("preposition_inflection")
    merged = _merge_preposition_inflection(existing_pi, scraped)
    entry["preposition_inflection"] = merged
    form_1s = merged.get("1s", {}) or {}
    logger.info(
        " 1s=%s",
        form_1s.get("nikkud", ""),
    )
    return True


def run(
    test: int | None = None,
    force_refresh: bool = False,
    nouns_only: bool = False,
    verbs_only: bool = False,
    adjectives_only: bool = False,
    prepositions_only: bool = False,
) -> None:
    """
    Main scrape loop.

    Loads words.json, selects the eligible entries that have a slug, fetches
    the mo and vl detail page of each (sleeping REQUEST_DELAY before every
    request), stores the parsed data on the entry and marks it
    detail_scraped. The file is saved every SAVE_INTERVAL successfully
    processed entries and once more at the end. Fetch failures, empty scrapes
    and parse exceptions are logged, counted as errors and skipped.

    Args:
        test: If set, scrape at most this many entries (for smoke-testing).
        force_refresh: Re-scrape entries where detail_scraped=True.
        nouns_only: Only scrape noun entries.
        verbs_only: Only scrape verb entries.
        adjectives_only: Only scrape adjective entries.
        prepositions_only: Only scrape preposition entries.
    """
    words = _load_words()
    candidates = [
        (unique_key, entry)
        for unique_key, entry in words.items()
        if _should_process(
            entry,
            entry.get("pos", ""),
            force_refresh,
            nouns_only,
            verbs_only,
            adjectives_only,
            prepositions_only,
        )
        and entry.get("slug")
    ]
    # Remember the full eligible count before any test-mode truncation.
    total = len(candidates)
    if test is not None:
        candidates = candidates[:test]
        logger.info("Test mode: processing %d of %d eligible entries", len(candidates), total)
    else:
        logger.info(
            "Processing %d eligible entries (nouns+verbs+adjectives+prepositions) from words.json",
            total,
        )

    processed = 0
    errors = 0
    for idx, (unique_key, entry) in enumerate(candidates, start=1):
        slug = entry["slug"]
        pos = entry.get("pos", "")
        word_nikkud = entry.get("word", {}).get("nikkud", unique_key)
        url = f"{PEALIM_BASE}/dict/{slug}/"
        label = _detail_pos_label(pos)
        logger.info("[%d/%d] %s: %s (%s)", idx, len(candidates), label, word_nikkud, slug)

        # Fetch mo (nikkud) page
        time.sleep(REQUEST_DELAY)
        mo_html = _fetch(url, hebstyle="mo")
        if not mo_html:
            logger.warning(" Skipping %s — failed to fetch mo page", slug)
            errors += 1
            continue

        # Fetch vl (ktiv male) page
        time.sleep(REQUEST_DELAY)
        vl_html = _fetch(url, hebstyle="vl")
        if not vl_html:
            logger.warning(" Skipping %s — failed to fetch vl page", slug)
            errors += 1
            continue

        # Parse and merge; one bad page must not abort the whole run.
        try:
            stored = _store_scraped_detail(entry, label, slug, mo_html, vl_html)
        except Exception as exc:  # noqa: BLE001
            logger.error(" Parse error for %s (%s): %s", slug, word_nikkud, exc, exc_info=True)
            errors += 1
            continue
        if not stored:
            errors += 1
            continue

        # entry is the same object as words[unique_key], so this updates words.
        entry["detail_scraped"] = True
        processed += 1

        # Incremental save every SAVE_INTERVAL entries
        if processed % SAVE_INTERVAL == 0:
            logger.info(" Auto-saving after %d entries...", processed)
            _save_words(words)

    # Final save
    _save_words(words)
    # Report the real eligible count, not the (possibly truncated) work list.
    logger.info(
        "Done. Processed=%d, Errors=%d, Total eligible=%d",
        processed,
        errors,
        total,
    )
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def _build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description=("Scrape pealim.com detail pages for nouns, verbs, adjectives and prepositions in data/words.json.")
)
parser.add_argument(
"--test",
metavar="N",
type=int,
default=None,
help="Scrape only N entries (smoke-test mode).",
)
parser.add_argument(
"--force-refresh-detail",
action="store_true",
default=False,
help="Re-scrape entries where detail_scraped=True.",
)
group = parser.add_mutually_exclusive_group()
group.add_argument(
"--nouns-only",
action="store_true",
default=False,
help="Only scrape Noun entries.",
)
group.add_argument(
"--verbs-only",
action="store_true",
default=False,
help="Only scrape Verb entries.",
)
group.add_argument(
"--adjectives-only",
action="store_true",
default=False,
help="Only scrape Adjective entries.",
)
group.add_argument(
"--prepositions-only",
action="store_true",
default=False,
help="Only scrape Preposition entries.",
)
return parser
if __name__ == "__main__":
    # Script entry point: configure INFO-level logging, parse the command
    # line, and hand every option to run().
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
        level=logging.INFO,
    )
    cli = _build_parser().parse_args()
    # The four *_only attributes map one-to-one onto run()'s keyword names.
    pos_filters = {
        name: getattr(cli, name)
        for name in ("nouns_only", "verbs_only", "adjectives_only", "prepositions_only")
    }
    run(test=cli.test, force_refresh=cli.force_refresh_detail, **pos_filters)