#!/usr/bin/env python3 """ Consolidated detail page scraper for pealim.com. Visits /dict// detail pages for nouns, verbs, adjectives and prepositions in data/words.json. Makes two requests per slug: 1. hebstyle=mo cookie → nikkud forms 2. hebstyle=vl cookie → ktiv male forms Updates entries in data/words.json with scraped detail data. Usage: python3 pealim_detail_scrape.py [--test N] [--force-refresh-detail] [--nouns-only | --verbs-only | --adjectives-only | --prepositions-only] """ import argparse import json import logging import os import re import time from pathlib import Path import requests from bs4 import BeautifulSoup logger = logging.getLogger(__name__) # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- PEALIM_BASE = "https://www.pealim.com" REQUEST_DELAY = 1.5 # seconds between requests REQUEST_TIMEOUT = 15 SAVE_INTERVAL = 50 # write words.json every N processed entries WORDS_JSON = Path(__file__).parent / "data" / "words.json" # Regex for Hebrew prepositions wrapped in parentheses, e.g. "(על)" or "(ב-)" HBPAREN_RE = re.compile(r"\(([\u05b0-\u05ea\u05f0-\u05f4\-]+)\)") BINYAN_NAMES: tuple[str, ...] = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al") _BINYAN_NAMES_LOWER: tuple[str, ...] 
= tuple(b.lower() for b in BINYAN_NAMES) BINYAN_HEBREW: dict[str, str] = { "Pa'al": "פָּעַל", "Nif'al": "נִפְעַל", "Pi'el": "פִּיעֵל", "Pu'al": "פֻּעַל", "Hif'il": "הִפְעִיל", "Huf'al": "הֻפְעַל", "Hitpa'el": "הִתְפַּעֵל", } PRONOUN_LABELS: dict[str, str] = { "present_ms": "", "present_fs": "", "present_mp": "", "present_fp": "", "past_1s": "אֲנִי", "past_1p": "אֲנַחְנוּ", "past_2ms": "אַתָּה", "past_2fs": "אַתְּ", "past_2mp": "אַתֶּם", "past_2fp": "אַתֶּן", "past_3ms": "הוּא", "past_3fs": "הִיא", "past_3p": "הֵם / הֵן", "future_1s": "אֲנִי", "future_1p": "אֲנַחְנוּ", "future_2ms": "אַתָּה", "future_2fs": "אַתְּ", "future_2mp": "אַתֶּם", "future_2fp": "אַתֶּן", "future_3ms": "הוּא", "future_3fs": "הִיא", "future_3mp": "הֵם", "future_3fp": "הֵן", "imperative_ms": "אַתָּה", "imperative_fs": "אַתְּ", "imperative_mp": "אַתֶּם", "imperative_fp": "אַתֶּן", "infinitive": "", } TENSE_DESCRIPTION: dict[str, str] = { "present_ms": "הוֹוֶה", "present_fs": "הוֹוֶה", "present_mp": "הוֹוֶה", "present_fp": "הוֹוֶה", "past_1s": "עָבָר", "past_1p": "עָבָר", "past_2ms": "עָבָר", "past_2fs": "עָבָר", "past_2mp": "עָבָר", "past_2fp": "עָבָר", "past_3ms": "עָבָר", "past_3fs": "עָבָר", "past_3p": "עָבָר", "future_1s": "עָתִיד", "future_1p": "עָתִיד", "future_2ms": "עָתִיד", "future_2fs": "עָתִיד", "future_2mp": "עָתִיד", "future_2fp": "עָתִיד", "future_3ms": "עָתִיד", "future_3fs": "עָתִיד", "future_3mp": "עָתִיד", "future_3fp": "עָתִיד", "imperative_ms": "צִוּוּי", "imperative_fs": "צִוּוּי", "imperative_mp": "צִוּוּי", "imperative_fp": "צִוּוּי", "infinitive": "מְקוֹר", } FORM_KEY_TO_PERSON: dict[str, str] = { "present_ms": "ms", "present_fs": "fs", "present_mp": "mp", "present_fp": "fp", "past_1s": "1s", "past_1p": "1p", "past_2ms": "2ms", "past_2fs": "2fs", "past_2mp": "2mp", "past_2fp": "2fp", "past_3ms": "3ms", "past_3fs": "3fs", "past_3p": "3p", "future_1s": "1s", "future_1p": "1p", "future_2ms": "2ms", "future_2fs": "2fs", "future_2mp": "2mp", "future_2fp": "2fp", "future_3ms": 
"3ms", "future_3fs": "3fs", "future_3mp": "3mp", "future_3fp": "3fp", "imperative_ms": "ms", "imperative_fs": "fs", "imperative_mp": "mp", "imperative_fp": "fp", "infinitive": "inf", } # Mishkal English name → Hebrew nikkud mapping # Pealim dropdown uses q-notation (qatal), detail pages return k-notation (katal). # We store q-notation keys; _mishkal_to_hebrew() handles the k→q conversion. # Source: https://www.pealim.com/he/dict/?pos={adjective,noun}&am=qatil dropdowns _MISHKAL_HEBREW_Q: dict[str, str] = { # --- a --- "aqtal": "אַקְטָל", "aqtala": "אַקְטָלָה", # --- e --- "eqtal": "אֶקְטָל", # --- h --- "haqtala": "הַקְטָלָה", "heqtel": "הֶקְטֵל", "hiqqatlut": "הִקָּטְלוּת", "hitqattlut": "הִתְקַטְּלוּת", # --- m --- "maqtal": "מַקְטָל", "maqtel": "מַקְטֵל", "maqtela": "מַקְטֵלָה", "maqtelet": "מַקְטֶלֶת", "maqtil": "מַקְטִיל", "maqtol": "מַקְטוֹל", "maqtolet": "מַקְטֹלֶת", "maqtul": "מַקְטוּל", "meqattel": "מְקַטֵּל", "meqila": "מְקִילָה", "mequla": "מְקוּלָה", "mequttal": "מְקֻטָּל", "miqtal": "מִקְטָל", "miqtala": "מִקְטָלָה", "miqtelet": "מִקְטֶלֶת", "miqtol": "מִקְטוֹל", "miqtolet": "מִקְטֹלֶת", "mitqattel": "מִתְקַטֵּל", "muqtal": "מֻקְטָל", # --- n --- "niqtal": "נִקְטָל", # --- q --- "qal": "קַל", "qatal": "קָטָל", "qatel": "קָטֵל", "qatil": "קָטִיל", "qatla": "קַטְלָה", "qatlan": "קַטְלָן", "qatlut": "קַטְלוּת", "qatol": "קָטוֹל", "qaton": "קָטוֹן", "qattal": "קַטָּל", "qattala": "קַטָּלָה", "qattelet": "קַטֶּלֶת", "qattil": "קַטִּיל", "qattila": "קַטִּילָה", "qattolet": "קַטֹּלֶת", "qattul": "קַטּוּל", "qatul": "קָטוּל", "qatut": "קָטוּת", "qetel": "קֶטֶל", "qeteh": "קֵטֶה", "qitla": "קִטְלָה", "qitlon": "קִטְלוֹן", "qittalon": "קִטָּלוֹן", "qittel": "קִטֵּל", "qittelet": "קִטֶּלֶת", "qittol": "קִטּוֹל", "qittolet": "קִטֹּלֶת", "qittul": "קִטּוּל", "qol": "קֹל", "qotal": "קוֹטָל", "qotel": "קוֹטֵל", "qotelet": "קוֹטֶלֶת", "qotla": "קָטְלָה", "qtal": "קְטָל", "qtala": "קְטָלָה", "qtaltal": "קְטַלְטַל", "qtaltan": "קְטַלְתָּן", "qtaltolet": "קְטַלְטֹלֶת", 
"qtel": "קְטֵל", "qtela": "קְטֵלָה", "qtelet": "קְטֶלֶת", "qtil": "קְטִיל", "qtila": "קְטִילָה", "qtili": "קְטִילִי", "qtol": "קְטוֹל", "qtola": "קְטוֹלָה", "qtolet": "קְטֹלֶת", "qtul": "קְטוּל", "qtula": "קְטוּלָה", "qtulla": "קְטֻלָּה", "qtut": "קְטוּת", "qutla": "קֻטְלָה", "quttolet": "קֻטּוֹלֶת", # --- t --- "taqtela": "תַּקְטֵלָה", "taqtil": "תַּקְטִיל", "taqtit": "תַּקְטִית", "taqtul": "תַּקְטוּל", "taqtula": "תַּקְטוּלָה", "taqtut": "תַּקְטוּת", "tiqtal": "תִּקְטָל", "tiqtala": "תִּקְטָלָה", "tiqtelet": "תִּקְטֶלֶת", "tiqtolet": "תִּקְטֹלֶת", "tqilla": "תְּקִלָּה", "tqula": "תְּקוּלָה", # --- y --- "yaqtul": "יַקְטוּל", } def _mishkal_to_hebrew(mishkal: str) -> str | None: """Look up Hebrew mishkal, handling k-notation → q-notation conversion.""" if not mishkal: return None # Try as-is first (q-notation) result = _MISHKAL_HEBREW_Q.get(mishkal) if result: return result # Convert k-notation to q-notation and retry q_form = mishkal.replace("k", "q") return _MISHKAL_HEBREW_Q.get(q_form) # --------------------------------------------------------------------------- # HTTP session # --------------------------------------------------------------------------- _session = requests.Session() _session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki/2.0)"}) def _fetch(url: str, hebstyle: str, backoff: float = REQUEST_DELAY) -> str | None: """Fetch a URL with the given hebstyle cookie. 
Returns HTML string or None on failure.""" cookies = {"translit": "none", "hebstyle": hebstyle} max_wait = 60.0 while True: try: resp = _session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT) resp.raise_for_status() return resp.text except requests.exceptions.HTTPError as exc: status = exc.response.status_code if exc.response is not None else 0 if status == 404: logger.warning(" 404 for %s — skipping", url) return None if status in (429, 503): wait = min(backoff, max_wait) logger.warning(" Rate limited (%s) — waiting %.0fs", status, wait) time.sleep(wait) backoff = min(backoff * 2, max_wait) else: logger.error(" HTTP %s for %s", status, url) return None except requests.RequestException as exc: wait = min(backoff, max_wait) logger.warning(" Request error for %s: %s — retrying in %.0fs", url, exc, wait) time.sleep(wait) backoff = min(backoff * 2, max_wait) if backoff >= max_wait: logger.error(" Giving up on %s", url) return None # --------------------------------------------------------------------------- # Shared HTML parsing utilities # --------------------------------------------------------------------------- def _get_menukad_and_audio(cell) -> tuple[str, str]: """Extract (nikkud_text, audio_url) from a table cell.""" audio_url = "" audio_span = cell.find("span", class_=lambda c: c and "audio-play" in c) if audio_span: audio_url = audio_span.get("data-audio", "") # Also check direct data-audio attribute on cell if not audio_url: da = cell.get("data-audio", "") if da: audio_url = da span = cell.find("span", class_="menukad") if span: return span.get_text(strip=True), audio_url txt = cell.get_text(strip=True) if re.search(r"[\u05d0-\u05ea]", txt): return txt, audio_url return "", audio_url def _get_plain_text(cell) -> str: """Extract plain Hebrew text (no nikkud) from a cell — used for vl pages.""" span = cell.find("span", class_="menukad") if span: return span.get_text(strip=True) txt = cell.get_text(strip=True) if re.search(r"[\u05d0-\u05ea]", txt): return 
txt return "" # --------------------------------------------------------------------------- # Noun detail parsing # --------------------------------------------------------------------------- _GENDER_MAP = { "masculine": "masculine", "feminine": "feminine", "זכר": "masculine", "נקבה": "feminine", "male": "masculine", "female": "feminine", } _GENDER_HEBREW = { "masculine": {"nikkud": "זָכָר", "ktiv_male": "זכר"}, "feminine": {"nikkud": "נְקֵבָה", "ktiv_male": "נקבה"}, } def _parse_noun_table(soup: BeautifulSoup) -> dict[str, dict | str]: """ Parse the noun declension table from a pealim detail page soup. Returns a dict with keys: singular, plural, construct_singular, construct_plural, singular_audio, plural_audio — values are nikkud strings or audio URLs. Returns empty dict if no table found. """ table = soup.find("table", class_="conjugation-table") if not table: return {} rows = table.find_all("tr") result: dict[str, dict | str] = {} for row in rows: label_cell = row.find("th") or (row.find("td") if row.find_all("td") else None) if not label_cell: continue label_text = label_cell.get_text(strip=True).lower() tds = row.find_all("td") # Some rows have th + tds; tds may include the first label td # We want data cells (the ones with Hebrew forms) data_cells = [td for td in tds if re.search(r"[\u05d0-\u05ea]", td.get_text())] if "absolute" in label_text or ("singular" in label_text and "construct" not in label_text): # Singular and plural forms in two cells if len(data_cells) >= 1: nikkud_sg, audio_sg = _get_menukad_and_audio(data_cells[0]) result["singular_nikkud"] = nikkud_sg if audio_sg: result["singular_audio"] = audio_sg if len(data_cells) >= 2: nikkud_pl, audio_pl = _get_menukad_and_audio(data_cells[1]) result["plural_nikkud"] = nikkud_pl if audio_pl: result["plural_audio"] = audio_pl elif "construct" in label_text or "סמיכות" in label_text: if len(data_cells) >= 1: nikkud_csg, _ = _get_menukad_and_audio(data_cells[0]) result["construct_singular_nikkud"] = 
nikkud_csg if len(data_cells) >= 2: nikkud_cpl, _ = _get_menukad_and_audio(data_cells[1]) result["construct_plural_nikkud"] = nikkud_cpl return result def _parse_noun_table_vl(soup: BeautifulSoup) -> dict[str, str]: """ Parse the noun declension table from a vl (ktiv male) page. Returns dict with keys: singular_ktiv, plural_ktiv, construct_singular_ktiv, construct_plural_ktiv. """ table = soup.find("table", class_="conjugation-table") if not table: return {} rows = table.find_all("tr") result: dict[str, str] = {} for row in rows: label_cell = row.find("th") if not label_cell: tds_all = row.find_all("td") if tds_all: label_cell = tds_all[0] if not label_cell: continue label_text = label_cell.get_text(strip=True).lower() tds = row.find_all("td") data_cells = [td for td in tds if re.search(r"[\u05d0-\u05ea]", td.get_text())] if "absolute" in label_text or ("singular" in label_text and "construct" not in label_text): if len(data_cells) >= 1: result["singular_ktiv"] = _get_plain_text(data_cells[0]) if len(data_cells) >= 2: result["plural_ktiv"] = _get_plain_text(data_cells[1]) elif "construct" in label_text or "סמיכות" in label_text: if len(data_cells) >= 1: result["construct_singular_ktiv"] = _get_plain_text(data_cells[0]) if len(data_cells) >= 2: result["construct_plural_ktiv"] = _get_plain_text(data_cells[1]) return result def _parse_noun_gender_mishkal(soup: BeautifulSoup) -> tuple[str, str]: """ Extract (gender, mishkal) from the PoS section of the detail page. Returns ("masculine"|"feminine"|"", mishkal_english|""). Pealim HTML structure:

Noun – ketel pattern, masculine

The mishkal is in the tag (k-notation, e.g. "ketel") or the nm= URL param (q-notation). Some nouns have no mishkal link:

Noun – masculine

""" gender = "" mishkal = "" # Find the PoS

tag — on pealim detail pages it's a bare

like # "Noun – ketel pattern, masculine" or "Adjective – katul pattern" pos_section = None for p in soup.find_all("p"): text = p.get_text(" ", strip=True) if re.match(r"^(Noun|Adjective)\b", text): pos_section = p break # Fall back to older selectors (div.pos, p.pos, div.page-header) if not pos_section: pos_section = ( soup.find("div", class_="pos") or soup.find("p", class_="pos") or soup.find("div", class_="page-header") ) if pos_section: text = pos_section.get_text(" ", strip=True) # Gender detection for raw, canonical in _GENDER_MAP.items(): if raw in text.lower(): gender = canonical break # Mishkal detection: extract from YYYY pattern # Nouns use nm= param, adjectives use am= param mishkal_link = pos_section.find("a", href=re.compile(r"[na]m=")) if mishkal_link: # Prefer tag text (k-notation, matches _MISHKAL_HEBREW_Q after k→q) i_tag = mishkal_link.find("i") if i_tag: mishkal = i_tag.get_text(strip=True) else: # Fall back to nm= URL parameter (already q-notation) href = mishkal_link.get("href", "") nm_match = re.search(r"[na]m=([a-zA-Z']+)", href) if nm_match: mishkal = nm_match.group(1) # Also check the og:description or breadcrumbs for gender if not gender: meta = soup.find("meta", {"property": "og:description"}) if meta: desc = meta.get("content", "").lower() for raw, canonical in _GENDER_MAP.items(): if raw in desc: gender = canonical break # Scan small/muted spans that often contain gender info if not gender: for span in soup.find_all("span", class_=lambda c: c and ("small" in c or "muted" in c or "pos" in c)): txt = span.get_text(strip=True).lower() for raw, canonical in _GENDER_MAP.items(): if raw in txt: gender = canonical break if gender: break return gender, mishkal def _scrape_noun_detail(_slug: str, mo_html: str, vl_html: str) -> dict: """ Parse noun detail pages (mo=nikkud, vl=ktiv male). Returns dict to merge into entry's noun_inflection field. 
""" mo_soup = BeautifulSoup(mo_html, "lxml") vl_soup = BeautifulSoup(vl_html, "lxml") mo_data = _parse_noun_table(mo_soup) vl_data = _parse_noun_table_vl(vl_soup) gender, mishkal = _parse_noun_gender_mishkal(mo_soup) def form_or_null(nikkud: str, ktiv: str) -> dict | None: if not nikkud: return None if not ktiv: logger.warning("No ktiv_male for noun form: %s", nikkud) return {"nikkud": nikkud, "ktiv_male": ktiv} singular_nikkud = str(mo_data.get("singular_nikkud", "")) plural_nikkud = str(mo_data.get("plural_nikkud", "")) construct_singular_nikkud = str(mo_data.get("construct_singular_nikkud", "")) construct_plural_nikkud = str(mo_data.get("construct_plural_nikkud", "")) singular_ktiv = str(vl_data.get("singular_ktiv", "")) plural_ktiv = str(vl_data.get("plural_ktiv", "")) construct_singular_ktiv = str(vl_data.get("construct_singular_ktiv", "")) construct_plural_ktiv = str(vl_data.get("construct_plural_ktiv", "")) result: dict = { "singular": form_or_null(singular_nikkud, singular_ktiv), "plural": form_or_null(plural_nikkud, plural_ktiv), "construct_singular": form_or_null(construct_singular_nikkud, construct_singular_ktiv), "construct_plural": form_or_null(construct_plural_nikkud, construct_plural_ktiv), "singular_audio": mo_data.get("singular_audio"), "plural_audio": mo_data.get("plural_audio"), "pronominal_suffixes": None, # plurals_guid is PRESERVED by the merge step — not set here } if gender: result["gender"] = gender result["gender_hebrew"] = _GENDER_HEBREW.get(gender) if mishkal: result["mishkal"] = mishkal result["mishkal_hebrew"] = _mishkal_to_hebrew(mishkal) return result # --------------------------------------------------------------------------- # Verb detail parsing (ported from conjugation_extract.py) # --------------------------------------------------------------------------- def _extract_binyan_from_page(soup: BeautifulSoup) -> str: """Extract binyan from page header span or og:description.""" texts = [h3.get_text(" ", strip=True) for h3 in 
soup.find_all("h3", class_="page-header")] meta = soup.find("meta", {"property": "og:description"}) if meta: texts.append(str(meta.get("content", ""))) for text in texts: text_lower = text.lower() for i, bname_lower in enumerate(_BINYAN_NAMES_LOWER): if bname_lower in text_lower: return BINYAN_NAMES[i] return "" def _parse_conjugation_table( soup: BeautifulSoup, passive: bool = False, table_el=None, ) -> dict[str, dict]: """ Parse conjugation table. Returns form_key -> {form_nikkud, audio_url} dict. If passive=True, locates the passive table (after "Passive" heading). If table_el is provided, parses that table directly. """ if passive: passive_h3 = next( (h for h in soup.find_all("h3") if "passive" in h.get_text(strip=True).lower()), None, ) if not passive_h3: return {} table = next( ( sib for sib in passive_h3.find_all_next() if sib.name == "table" and "conjugation-table" in sib.get("class", []) ), None, ) if not table: return {} elif table_el is not None: table = table_el else: table = soup.find("table", class_="conjugation-table") if not table: return {} rows = table.find_all("tr") if len(rows) < 3: return {} forms: dict[str, dict] = {} def heb_cells(row_idx: int) -> list[tuple[str, str]]: """Return (nikkud_text, audio_url) for each Hebrew-containing cell in the row.""" cells = rows[row_idx].find_all(["th", "td"]) result = [] for cell in cells: txt, au = _get_menukad_and_audio(cell) colspan = int(cell.get("colspan", 1)) if txt and re.search(r"[\u05d0-\u05ea]", txt): for _ in range(colspan): result.append((txt, au)) return result def deduplicate(pairs: list[tuple[str, str]]) -> list[tuple[str, str]]: seen: set[str] = set() out: list[tuple[str, str]] = [] for pair in pairs: if pair[0] not in seen: seen.add(pair[0]) out.append(pair) return out def store(key: str, nikkud: str, audio_url: str) -> None: if nikkud: forms[key] = {"form_nikkud": nikkud, "audio_url": audio_url} # Locate tense rows by label text present_row = past_row = future_row = imp_row = inf_row = -1 
for i, row in enumerate(rows): label = row.get_text(" ", strip=True).lower() if "present" in label and present_row < 0: present_row = i elif "past" in label and past_row < 0: past_row = i elif "future" in label and future_row < 0: future_row = i elif "imperative" in label and imp_row < 0: imp_row = i elif "infinitive" in label and inf_row < 0: inf_row = i # Present: ms fs mp fp if present_row >= 0: hf = heb_cells(present_row) for k, (v, au) in zip(["present_ms", "present_fs", "present_mp", "present_fp"], hf, strict=False): store(k, v, au) # Past row 0: 1s 1p (deduplicated) if past_row >= 0: uniq = deduplicate(heb_cells(past_row)) if len(uniq) >= 1: store("past_1s", uniq[0][0], uniq[0][1]) if len(uniq) >= 2: store("past_1p", uniq[1][0], uniq[1][1]) # Past row 1: 2ms 2fs 2mp 2fp if past_row + 1 < len(rows): for k, (v, au) in zip( ["past_2ms", "past_2fs", "past_2mp", "past_2fp"], heb_cells(past_row + 1), strict=False, ): store(k, v, au) # Past row 2: 3ms 3fs 3p (deduplicated) if past_row + 2 < len(rows): uniq3 = deduplicate(heb_cells(past_row + 2)) for k, (v, au) in zip(["past_3ms", "past_3fs", "past_3p"], uniq3, strict=False): store(k, v, au) # Future row 0: 1s 1p (deduplicated) if future_row >= 0: uniq_f = deduplicate(heb_cells(future_row)) if len(uniq_f) >= 1: store("future_1s", uniq_f[0][0], uniq_f[0][1]) if len(uniq_f) >= 2: store("future_1p", uniq_f[1][0], uniq_f[1][1]) # Future row 1: 2ms 2fs 2mp 2fp if future_row + 1 < len(rows): for k, (v, au) in zip( ["future_2ms", "future_2fs", "future_2mp", "future_2fp"], heb_cells(future_row + 1), strict=False, ): store(k, v, au) # Future row 2: 3ms 3fs 3mp 3fp if future_row + 2 < len(rows): for k, (v, au) in zip( ["future_3ms", "future_3fs", "future_3mp", "future_3fp"], heb_cells(future_row + 2), strict=False, ): store(k, v, au) # Imperative: ms fs mp fp if imp_row >= 0: hf = heb_cells(imp_row) for k, (v, au) in zip(["imperative_ms", "imperative_fs", "imperative_mp", "imperative_fp"], hf, strict=False): store(k, v, au) # 
Infinitive if inf_row >= 0: hf = heb_cells(inf_row) if hf: store("infinitive", hf[0][0], hf[0][1]) return forms def _parse_conjugation_table_vl( soup: BeautifulSoup, passive: bool = False, table_el=None, ) -> dict[str, str]: """ Parse conjugation table from a vl (ktiv male) page. Returns form_key -> ktiv_male_text dict. Mirrors _parse_conjugation_table but extracts plain text. """ if passive: passive_h3 = next( (h for h in soup.find_all("h3") if "passive" in h.get_text(strip=True).lower()), None, ) if not passive_h3: return {} table = next( ( sib for sib in passive_h3.find_all_next() if sib.name == "table" and "conjugation-table" in sib.get("class", []) ), None, ) if not table: return {} elif table_el is not None: table = table_el else: table = soup.find("table", class_="conjugation-table") if not table: return {} rows = table.find_all("tr") if len(rows) < 3: return {} ktiv_forms: dict[str, str] = {} def heb_cells_plain(row_idx: int) -> list[str]: cells = rows[row_idx].find_all(["th", "td"]) result = [] for cell in cells: txt = _get_plain_text(cell) colspan = int(cell.get("colspan", 1)) if txt and re.search(r"[\u05d0-\u05ea]", txt): for _ in range(colspan): result.append(txt) return result def deduplicate_str(items: list[str]) -> list[str]: seen: set[str] = set() out: list[str] = [] for item in items: if item not in seen: seen.add(item) out.append(item) return out present_row = past_row = future_row = imp_row = inf_row = -1 for i, row in enumerate(rows): label = row.get_text(" ", strip=True).lower() if "present" in label and present_row < 0: present_row = i elif "past" in label and past_row < 0: past_row = i elif "future" in label and future_row < 0: future_row = i elif "imperative" in label and imp_row < 0: imp_row = i elif "infinitive" in label and inf_row < 0: inf_row = i if present_row >= 0: hf = heb_cells_plain(present_row) for k, v in zip(["present_ms", "present_fs", "present_mp", "present_fp"], hf, strict=False): if v: ktiv_forms[k] = v if past_row >= 0: 
uniq = deduplicate_str(heb_cells_plain(past_row)) if len(uniq) >= 1: ktiv_forms["past_1s"] = uniq[0] if len(uniq) >= 2: ktiv_forms["past_1p"] = uniq[1] if past_row + 1 < len(rows): for k, v in zip( ["past_2ms", "past_2fs", "past_2mp", "past_2fp"], heb_cells_plain(past_row + 1), strict=False, ): if v: ktiv_forms[k] = v if past_row + 2 < len(rows): uniq3 = deduplicate_str(heb_cells_plain(past_row + 2)) for k, v in zip(["past_3ms", "past_3fs", "past_3p"], uniq3, strict=False): if v: ktiv_forms[k] = v if future_row >= 0: uniq_f = deduplicate_str(heb_cells_plain(future_row)) if len(uniq_f) >= 1: ktiv_forms["future_1s"] = uniq_f[0] if len(uniq_f) >= 2: ktiv_forms["future_1p"] = uniq_f[1] if future_row + 1 < len(rows): for k, v in zip( ["future_2ms", "future_2fs", "future_2mp", "future_2fp"], heb_cells_plain(future_row + 1), strict=False, ): if v: ktiv_forms[k] = v if future_row + 2 < len(rows): for k, v in zip( ["future_3ms", "future_3fs", "future_3mp", "future_3fp"], heb_cells_plain(future_row + 2), strict=False, ): if v: ktiv_forms[k] = v if imp_row >= 0: hf = heb_cells_plain(imp_row) for k, v in zip(["imperative_ms", "imperative_fs", "imperative_mp", "imperative_fp"], hf, strict=False): if v: ktiv_forms[k] = v if inf_row >= 0: hf = heb_cells_plain(inf_row) if hf: ktiv_forms["infinitive"] = hf[0] return ktiv_forms def _forms_to_active_list( mo_forms: dict[str, dict], vl_forms: dict[str, str], existing_forms: list[dict] | None, ) -> list[dict]: """ Convert parsed form dicts into the active_forms list structure (matches SCHEMA.yaml). Preserves guid and guid_candidates from existing_forms where present. 
""" # Build a lookup of existing form data keyed by (person, tense) for GUID preservation existing_lookup: dict[tuple[str, str], dict] = {} if existing_forms: for ef in existing_forms: key = (ef.get("person", ""), ef.get("tense", "")) existing_lookup[key] = ef active_forms: list[dict] = [] for form_key, form_data in mo_forms.items(): person = FORM_KEY_TO_PERSON.get(form_key, form_key) tense = TENSE_DESCRIPTION.get(form_key, "") nikkud = form_data["form_nikkud"] ktiv = vl_forms.get(form_key, "") if not ktiv: logger.warning("No ktiv_male for verb form %s: %s", form_key, nikkud) audio_url = form_data.get("audio_url", "") pronoun = PRONOUN_LABELS.get(form_key, "") # Preserve GUIDs from existing entry existing = existing_lookup.get((person, tense), {}) guid = existing.get("guid") guid_candidates = existing.get("guid_candidates") active_forms.append( { "person": person, "tense": tense, "pronoun_hebrew": pronoun, "form": {"nikkud": nikkud, "ktiv_male": ktiv}, "audio_url": audio_url, "audio_file": existing.get("audio_file"), "guid": guid, "guid_candidates": guid_candidates, } ) return active_forms def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: dict | None) -> dict: """ Parse verb detail pages (mo=nikkud, vl=ktiv male). Returns dict to merge into entry's conjugation field. Preserves in_conjugation_deck, guid, guid_candidates from existing_conj. """ mo_soup = BeautifulSoup(mo_html, "lxml") vl_soup = BeautifulSoup(vl_html, "lxml") existing = existing_conj or {} # Extract metadata from mo page binyan = _extract_binyan_from_page(mo_soup) meaning = "" prep: str | None = None lead_div = mo_soup.find("div", class_="lead") if lead_div: meaning = lead_div.get_text(strip=True) # Extract preposition(s) from the lead text, e.g. "(על)" → "על" prep_matches = HBPAREN_RE.findall(meaning) if prep_matches: prep = " ".join(prep_matches) # Fall back to any prep already stored (e.g. 
from a previous manual edit) if prep is None: prep = existing.get("prep") # Parse active forms mo_active = _parse_conjugation_table(mo_soup, passive=False) vl_active = _parse_conjugation_table_vl(vl_soup, passive=False) if not mo_active: logger.warning(" No active forms found for slug=%s", slug) return {} # Determine infinitive and reference form infinitive_nikkud = mo_active.get("infinitive", {}).get("form_nikkud", "") infinitive_ktiv = vl_active.get("infinitive", "") if infinitive_nikkud and not infinitive_ktiv: logger.warning("No ktiv_male for infinitive: %s (slug=%s)", infinitive_nikkud, slug) past_3ms_nikkud = mo_active.get("past_3ms", {}).get("form_nikkud", "") past_3ms_ktiv = vl_active.get("past_3ms", "") if past_3ms_nikkud and not past_3ms_ktiv: logger.warning("No ktiv_male for past_3ms: %s (slug=%s)", past_3ms_nikkud, slug) # Build active forms list, preserving GUIDs existing_active_forms = existing.get("active_forms") active_forms = _forms_to_active_list(mo_active, vl_active, existing_active_forms) # Check for passive section (Hif'il / Pi'el verbs) passive_h3 = next( (h for h in mo_soup.find_all("h3") if "passive" in h.get_text(strip=True).lower()), None, ) hufal_pual_forms = None reference_form_passive = None if passive_h3: mo_passive = _parse_conjugation_table(mo_soup, passive=True) vl_passive = _parse_conjugation_table_vl(vl_soup, passive=True) if mo_passive: existing_passive_forms = existing.get("hufal_pual_forms") hufal_pual_forms = _forms_to_active_list(mo_passive, vl_passive, existing_passive_forms) passive_3ms_nikkud = mo_passive.get("past_3ms", {}).get("form_nikkud", "") passive_3ms_ktiv = vl_passive.get("past_3ms", "") if passive_3ms_nikkud and not passive_3ms_ktiv: logger.warning("No ktiv_male for passive past_3ms: %s (slug=%s)", passive_3ms_nikkud, slug) if passive_3ms_nikkud: reference_form_passive = {"nikkud": passive_3ms_nikkud, "ktiv_male": passive_3ms_ktiv} result: dict = { "in_conjugation_deck": existing.get("in_conjugation_deck", 
def _parse_adjective_table(soup: BeautifulSoup) -> dict[str, dict]:
    """
    Read the four adjective forms from the nikkud (mo) version of a detail page.

    The first table with class "conjugation-table" is searched for the
    elements whose ids are listed in _ADJECTIVE_CELL_IDS; each one found is
    passed to _get_menukad_and_audio.

    Returns:
        Dict mapping form key ("ms", "fs", "mp", "fp") to
        {"nikkud": str, "audio_url": str}. Forms whose cell is missing or
        yields no nikkud text are left out; the dict is empty when the page
        has no such table.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    forms: dict[str, dict] = {}
    for form_key, cell_id in zip(_ADJECTIVE_FORM_KEYS, _ADJECTIVE_CELL_IDS, strict=True):
        cell = table.find(id=cell_id)
        if cell:
            nikkud, audio_url = _get_menukad_and_audio(cell)
            if nikkud:
                forms[form_key] = {"nikkud": nikkud, "audio_url": audio_url}
    return forms
""" table = soup.find("table", class_="conjugation-table") if not table: return {} result: dict[str, str] = {} for cell_id, form_key in zip(_ADJECTIVE_CELL_IDS, _ADJECTIVE_FORM_KEYS, strict=True): div = table.find(id=cell_id) if not div: continue ktiv = _get_plain_text(div) if ktiv: result[form_key] = ktiv return result def _parse_adjective_mishkal(soup: BeautifulSoup) -> tuple[str, str]: """ Extract mishkal from the PoS section of an adjective detail page. Reuses the same extraction logic as _parse_noun_gender_mishkal. Returns: Tuple of (mishkal_english, mishkal_hebrew) where either may be empty. """ _, mishkal = _parse_noun_gender_mishkal(soup) mishkal_hebrew = _mishkal_to_hebrew(mishkal) or "" return mishkal, mishkal_hebrew def _scrape_adjective_detail(_slug: str, mo_html: str, vl_html: str) -> dict: """ Parse adjective detail pages (mo=nikkud, vl=ktiv male). Returns: Dict matching the adjective_inflection schema: {ms, fs, mp, fp: {nikkud, ktiv_male}, mishkal, mishkal_hebrew}. Empty dict if no forms found. """ mo_soup = BeautifulSoup(mo_html, "lxml") vl_soup = BeautifulSoup(vl_html, "lxml") mo_data = _parse_adjective_table(mo_soup) vl_data = _parse_adjective_table_vl(vl_soup) mishkal, mishkal_hebrew = _parse_adjective_mishkal(mo_soup) if not mo_data: return {} result: dict = {} for form_key in _ADJECTIVE_FORM_KEYS: mo_form = mo_data.get(form_key) if mo_form: nikkud = mo_form["nikkud"] ktiv = vl_data.get(form_key, "") if not ktiv: logger.warning("No ktiv_male for adjective form %s: %s", form_key, nikkud) result[form_key] = {"nikkud": nikkud, "ktiv_male": ktiv} else: result[form_key] = None result["mishkal"] = mishkal or None result["mishkal_hebrew"] = mishkal_hebrew or None return result # --------------------------------------------------------------------------- # Preposition detail parsing # --------------------------------------------------------------------------- _PREPOSITION_CELL_IDS: tuple[str, ...] 
def _parse_preposition_table(soup: BeautifulSoup) -> dict[str, dict]:
    """
    Read the pronominal-suffix forms of a preposition from the nikkud (mo) page.

    The first table with class "conjugation-table" is searched for the
    elements whose ids are listed in _PREPOSITION_CELL_IDS (P-1s … P-3fp);
    each one found is passed to _get_menukad_and_audio.

    Returns:
        Dict mapping person key ("1s", "1p", …, "3fp") to
        {"nikkud": str, "audio_url": str}. Persons whose cell is missing or
        yields no nikkud text are left out; the dict is empty when the page
        has no such table.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    forms: dict[str, dict] = {}
    for person_key, cell_id in zip(_PREPOSITION_FORM_KEYS, _PREPOSITION_CELL_IDS, strict=True):
        cell = table.find(id=cell_id)
        if cell:
            nikkud, audio_url = _get_menukad_and_audio(cell)
            if nikkud:
                forms[person_key] = {"nikkud": nikkud, "audio_url": audio_url}
    return forms
""" mo_soup = BeautifulSoup(mo_html, "lxml") vl_soup = BeautifulSoup(vl_html, "lxml") mo_data = _parse_preposition_table(mo_soup) vl_data = _parse_preposition_table_vl(vl_soup) if not mo_data: return {} result: dict = {} for form_key in _PREPOSITION_FORM_KEYS: mo_form = mo_data.get(form_key) if mo_form: nikkud = mo_form["nikkud"] ktiv = vl_data.get(form_key, "") if not ktiv: logger.warning("No ktiv_male for preposition form %s: %s", form_key, nikkud) result[form_key] = {"nikkud": nikkud, "ktiv_male": ktiv} else: result[form_key] = None return result # --------------------------------------------------------------------------- # Merging strategy # --------------------------------------------------------------------------- def _merge_noun_inflection(existing_ni: dict | None, scraped: dict) -> dict: """ Merge scraped noun data into existing noun_inflection, preserving plurals_guid. """ result = dict(scraped) if existing_ni: # PRESERVE existing plurals_guid — never overwrite if existing_ni.get("plurals_guid"): result["plurals_guid"] = existing_ni["plurals_guid"] # Preserve existing singular_audio if we didn't scrape one if not result.get("singular_audio") and existing_ni.get("singular_audio"): result["singular_audio"] = existing_ni["singular_audio"] # Preserve existing plural_audio if we didn't scrape one if not result.get("plural_audio") and existing_ni.get("plural_audio"): result["plural_audio"] = existing_ni["plural_audio"] # Preserve existing singular/plural if we failed to scrape them for field in ("singular", "plural", "construct_singular", "construct_plural"): if not result.get(field) and existing_ni.get(field): result[field] = existing_ni[field] else: result.setdefault("plurals_guid", None) return result def _merge_conjugation(_existing_conj: dict | None, scraped: dict) -> dict: """ Merge scraped verb data into existing conjugation, preserving in_conjugation_deck and all guid/guid_candidates fields (already handled in _forms_to_active_list). 
""" # The scraped dict already preserves in_conjugation_deck and GUIDs via _forms_to_active_list return scraped def _merge_adjective_inflection(_existing_ai: dict | None, scraped: dict) -> dict: """ Merge scraped adjective data into existing adjective_inflection. No GUIDs to preserve — simple overwrite with scraped data. """ return dict(scraped) def _merge_preposition_inflection(_existing_pi: dict | None, scraped: dict) -> dict: """ Merge scraped preposition data into existing preposition_inflection. No GUIDs to preserve — simple overwrite with scraped data. """ return dict(scraped) # --------------------------------------------------------------------------- # I/O helpers # --------------------------------------------------------------------------- def _load_words() -> dict: """Load words.json. Returns empty dict if file not found.""" if WORDS_JSON.exists(): with open(WORDS_JSON, encoding="utf-8") as f: return json.load(f) return {} def _save_words(data: dict) -> None: """Atomically write words.json via a .tmp file.""" WORDS_JSON.parent.mkdir(parents=True, exist_ok=True) tmp_path = str(WORDS_JSON) + ".tmp" with open(tmp_path, "w", encoding="utf-8") as f: json.dump(data, f, ensure_ascii=False, indent=2) os.replace(tmp_path, WORDS_JSON) # --------------------------------------------------------------------------- # Main scrape loop # --------------------------------------------------------------------------- def _should_process( entry: dict, pos: str, force: bool, nouns_only: bool, verbs_only: bool, adjectives_only: bool, prepositions_only: bool, ) -> bool: """Return True if this entry should be scraped.""" if not pos.startswith(("Noun", "Verb", "Adjective", "Preposition")): return False if nouns_only and not pos.startswith("Noun"): return False if verbs_only and not pos.startswith("Verb"): return False if adjectives_only and not pos.startswith("Adjective"): return False if prepositions_only and not pos.startswith("Preposition"): return False return force or not 
entry.get("detail_scraped") def run( test: int | None = None, force_refresh: bool = False, nouns_only: bool = False, verbs_only: bool = False, adjectives_only: bool = False, prepositions_only: bool = False, ) -> None: """ Main scrape loop. Args: test: If set, scrape at most this many entries (for smoke-testing). force_refresh: Re-scrape entries where detail_scraped=True. nouns_only: Only scrape noun entries. verbs_only: Only scrape verb entries. adjectives_only: Only scrape adjective entries. prepositions_only: Only scrape preposition entries. """ words = _load_words() candidates = [ (unique_key, entry) for unique_key, entry in words.items() if _should_process( entry, entry.get("pos", ""), force_refresh, nouns_only, verbs_only, adjectives_only, prepositions_only, ) and entry.get("slug") ] total = len(candidates) if test is not None: candidates = candidates[:test] logger.info("Test mode: processing %d of %d eligible entries", len(candidates), total) else: logger.info( "Processing %d eligible entries (nouns+verbs+adjectives+prepositions) from words.json", total, ) processed = 0 errors = 0 for idx, (unique_key, entry) in enumerate(candidates, start=1): slug = entry["slug"] pos = entry.get("pos", "") word_nikkud = entry.get("word", {}).get("nikkud", unique_key) url = f"{PEALIM_BASE}/dict/{slug}/" if pos.startswith("Noun"): label = "Noun" elif pos.startswith("Verb"): label = "Verb" elif pos.startswith("Adjective"): label = "Adjective" else: label = "Preposition" logger.info("[%d/%d] %s: %s (%s)", idx, len(candidates), label, word_nikkud, slug) # Fetch mo (nikkud) page time.sleep(REQUEST_DELAY) mo_html = _fetch(url, hebstyle="mo") if not mo_html: logger.warning(" Skipping %s — failed to fetch mo page", slug) errors += 1 continue # Fetch vl (ktiv male) page time.sleep(REQUEST_DELAY) vl_html = _fetch(url, hebstyle="vl") if not vl_html: logger.warning(" Skipping %s — failed to fetch vl page", slug) errors += 1 continue # Parse and merge try: if pos.startswith("Noun"): 
scraped = _scrape_noun_detail(slug, mo_html, vl_html) if scraped: existing_ni = entry.get("noun_inflection") or {} merged = _merge_noun_inflection(existing_ni, scraped) words[unique_key]["noun_inflection"] = merged sg = merged.get("singular", {}) or {} pl = merged.get("plural", {}) or {} logger.info( " singular=%s plural=%s", sg.get("nikkud", "—"), pl.get("nikkud", "—"), ) else: logger.warning(" No noun data scraped for %s", slug) errors += 1 continue elif pos.startswith("Verb"): existing_conj = entry.get("conjugation") scraped = _scrape_verb_detail(slug, mo_html, vl_html, existing_conj) if scraped: merged = _merge_conjugation(existing_conj, scraped) words[unique_key]["conjugation"] = merged n_forms = len(merged.get("active_forms", [])) logger.info( " %s, %d forms", merged.get("binyan", "?"), n_forms, ) else: logger.warning(" No verb data scraped for %s", slug) errors += 1 continue elif pos.startswith("Adjective"): scraped = _scrape_adjective_detail(slug, mo_html, vl_html) if scraped: existing_ai = entry.get("adjective_inflection") merged = _merge_adjective_inflection(existing_ai, scraped) words[unique_key]["adjective_inflection"] = merged ms = merged.get("ms", {}) or {} fs = merged.get("fs", {}) or {} logger.info( " ms=%s fs=%s mishkal=%s", ms.get("nikkud", "—"), fs.get("nikkud", "—"), merged.get("mishkal", "—"), ) else: logger.warning(" No adjective data scraped for %s", slug) errors += 1 continue else: # Preposition scraped = _scrape_preposition_detail(slug, mo_html, vl_html) if scraped: existing_pi = entry.get("preposition_inflection") merged = _merge_preposition_inflection(existing_pi, scraped) words[unique_key]["preposition_inflection"] = merged form_1s = merged.get("1s", {}) or {} logger.info( " 1s=%s", form_1s.get("nikkud", "—"), ) else: logger.warning(" No preposition data scraped for %s", slug) errors += 1 continue except Exception as exc: # noqa: BLE001 logger.error(" Parse error for %s (%s): %s", slug, word_nikkud, exc, exc_info=True) errors += 1 
continue words[unique_key]["detail_scraped"] = True processed += 1 # Incremental save every SAVE_INTERVAL entries if processed % SAVE_INTERVAL == 0: logger.info(" Auto-saving after %d entries...", processed) _save_words(words) # Final save _save_words(words) logger.info( "Done. Processed=%d, Errors=%d, Total eligible=%d", processed, errors, len(candidates), ) # --------------------------------------------------------------------------- # CLI # --------------------------------------------------------------------------- def _build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser( description=("Scrape pealim.com detail pages for nouns, verbs, adjectives and prepositions in data/words.json.") ) parser.add_argument( "--test", metavar="N", type=int, default=None, help="Scrape only N entries (smoke-test mode).", ) parser.add_argument( "--force-refresh-detail", action="store_true", default=False, help="Re-scrape entries where detail_scraped=True.", ) group = parser.add_mutually_exclusive_group() group.add_argument( "--nouns-only", action="store_true", default=False, help="Only scrape Noun entries.", ) group.add_argument( "--verbs-only", action="store_true", default=False, help="Only scrape Verb entries.", ) group.add_argument( "--adjectives-only", action="store_true", default=False, help="Only scrape Adjective entries.", ) group.add_argument( "--prepositions-only", action="store_true", default=False, help="Only scrape Preposition entries.", ) return parser if __name__ == "__main__": logging.basicConfig( level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s", datefmt="%H:%M:%S", ) args = _build_parser().parse_args() run( test=args.test, force_refresh=args.force_refresh_detail, nouns_only=args.nouns_only, verbs_only=args.verbs_only, adjectives_only=args.adjectives_only, prepositions_only=args.prepositions_only, )