Template & CSS fixes (15 items from Mar 9 feedback): - Fix conjugation front showing 3ms form instead of infinitive - Rename conjugation model to "Hebrew Conjugation" - Strip Hebrew parenthesized text from English meanings - Shoresh separator: spaces → dots (א.כ.ל) - Remove duplicate English meaning from cloze back - Remove example sentences from vocab front/back (cloze only) - Center-align audio buttons on all decks - Fix parenthesis spacing: "you(feminine,singular)" → "you (feminine, singular)" - Unify sec-key/sec-label fonts, make keys bold - Size overhaul: bigger Hebrew (42px), meaning (34px), secondary (28px) - Center-align related words groups - Sort confusables by average frequency - Plurals: show Gender (Hebrew) before Mishkal, strip emoji from meaning - Clean duplicate quotation marks in cloze sentences Sprint 12 carry-forward (detail scrape + EPUB): - Adjective/preposition detail scraping in pealim_detail_scrape.py - EPUB example matching rewrite in epub_examples.py - Delete benyehuda.py and rebuild_sentence_matches.py (merged) - 49 parser tests for detail scraping - SCHEMA.yaml updates for new fields Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1560 lines
52 KiB
Python
1560 lines
52 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Consolidated detail page scraper for pealim.com.
|
|
|
|
Visits /dict/<slug>/ detail pages for nouns, verbs, adjectives and prepositions
|
|
in data/words.json.
|
|
Makes two requests per slug:
|
|
1. hebstyle=mo cookie → nikkud forms
|
|
2. hebstyle=vl cookie → ktiv male forms
|
|
|
|
Updates entries in data/words.json with scraped detail data.
|
|
|
|
Usage:
|
|
python3 pealim_detail_scrape.py [--test N] [--force-refresh-detail]
|
|
[--nouns-only | --verbs-only |
|
|
--adjectives-only | --prepositions-only]
|
|
"""
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Module-level logger; all scrape progress and warnings go through it.
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------

# Base URL of the pealim.com site.
PEALIM_BASE = "https://www.pealim.com"
# Also the default initial retry backoff used by _fetch().
REQUEST_DELAY = 1.5  # seconds between requests
# Passed as the ``timeout`` argument of every HTTP request made by _fetch().
REQUEST_TIMEOUT = 15
SAVE_INTERVAL = 50  # write words.json every N processed entries

# Word database, located in the "data" directory next to this script.
WORDS_JSON = Path(__file__).parent / "data" / "words.json"
|
|
|
|
# English binyan names, in the order _extract_binyan_from_page() tests them.
BINYAN_NAMES: tuple[str, ...] = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al")
# Lowercased copies, index-aligned with BINYAN_NAMES, for case-insensitive matching.
_BINYAN_NAMES_LOWER: tuple[str, ...] = tuple(b.lower() for b in BINYAN_NAMES)

# English binyan name -> Hebrew name with nikkud (stored as "binyan_hebrew").
BINYAN_HEBREW: dict[str, str] = {
    "Pa'al": "פָּעַל",
    "Nif'al": "נִפְעַל",
    "Pi'el": "פִּיעֵל",
    "Pu'al": "פֻּעַל",
    "Hif'il": "הִפְעִיל",
    "Huf'al": "הֻפְעַל",
    "Hitpa'el": "הִתְפַּעֵל",
}
|
|
|
|
# Form key ("<tense>_<person>" or "infinitive") -> Hebrew pronoun with nikkud.
# _forms_to_active_list() stores the value as "pronoun_hebrew"; present-tense
# forms and the infinitive map to an empty string.
PRONOUN_LABELS: dict[str, str] = {
    "present_ms": "",
    "present_fs": "",
    "present_mp": "",
    "present_fp": "",
    "past_1s": "אֲנִי",
    "past_1p": "אֲנַחְנוּ",
    "past_2ms": "אַתָּה",
    "past_2fs": "אַתְּ",
    "past_2mp": "אַתֶּם",
    "past_2fp": "אַתֶּן",
    "past_3ms": "הוּא",
    "past_3fs": "הִיא",
    "past_3p": "הֵם / הֵן",
    "future_1s": "אֲנִי",
    "future_1p": "אֲנַחְנוּ",
    "future_2ms": "אַתָּה",
    "future_2fs": "אַתְּ",
    "future_2mp": "אַתֶּם",
    "future_2fp": "אַתֶּן",
    "future_3ms": "הוּא",
    "future_3fs": "הִיא",
    "future_3mp": "הֵם",
    "future_3fp": "הֵן",
    "imperative_ms": "אַתָּה",
    "imperative_fs": "אַתְּ",
    "imperative_mp": "אַתֶּם",
    "imperative_fp": "אַתֶּן",
    "infinitive": "",
}
|
|
|
|
# Form key -> Hebrew tense label with nikkud. _forms_to_active_list() stores
# the value as "tense" and uses (person, tense) as the key for carrying GUIDs
# over from previously saved forms.
TENSE_DESCRIPTION: dict[str, str] = {
    "present_ms": "הוֹוֶה",
    "present_fs": "הוֹוֶה",
    "present_mp": "הוֹוֶה",
    "present_fp": "הוֹוֶה",
    "past_1s": "עָבָר",
    "past_1p": "עָבָר",
    "past_2ms": "עָבָר",
    "past_2fs": "עָבָר",
    "past_2mp": "עָבָר",
    "past_2fp": "עָבָר",
    "past_3ms": "עָבָר",
    "past_3fs": "עָבָר",
    "past_3p": "עָבָר",
    "future_1s": "עָתִיד",
    "future_1p": "עָתִיד",
    "future_2ms": "עָתִיד",
    "future_2fs": "עָתִיד",
    "future_2mp": "עָתִיד",
    "future_2fp": "עָתִיד",
    "future_3ms": "עָתִיד",
    "future_3fs": "עָתִיד",
    "future_3mp": "עָתִיד",
    "future_3fp": "עָתִיד",
    "imperative_ms": "צִוּוּי",
    "imperative_fs": "צִוּוּי",
    "imperative_mp": "צִוּוּי",
    "imperative_fp": "צִוּוּי",
    "infinitive": "מְקוֹר",
}
|
|
|
|
# Form key -> person/number code stored as "person" by _forms_to_active_list().
# The code is the part of the key after the tense prefix; "infinitive" maps
# to "inf".
FORM_KEY_TO_PERSON: dict[str, str] = {
    "present_ms": "ms",
    "present_fs": "fs",
    "present_mp": "mp",
    "present_fp": "fp",
    "past_1s": "1s",
    "past_1p": "1p",
    "past_2ms": "2ms",
    "past_2fs": "2fs",
    "past_2mp": "2mp",
    "past_2fp": "2fp",
    "past_3ms": "3ms",
    "past_3fs": "3fs",
    "past_3p": "3p",
    "future_1s": "1s",
    "future_1p": "1p",
    "future_2ms": "2ms",
    "future_2fs": "2fs",
    "future_2mp": "2mp",
    "future_2fp": "2fp",
    "future_3ms": "3ms",
    "future_3fs": "3fs",
    "future_3mp": "3mp",
    "future_3fp": "3fp",
    "imperative_ms": "ms",
    "imperative_fs": "fs",
    "imperative_mp": "mp",
    "imperative_fp": "fp",
    "infinitive": "inf",
}
|
|
|
|
# Mishkal English name → Hebrew nikkud mapping
# Pealim dropdown uses q-notation (qatal), detail pages return k-notation (katal).
# We store q-notation keys; _mishkal_to_hebrew() handles the k→q conversion.
# Source: https://www.pealim.com/he/dict/?pos={adjective,noun}&am=qatil dropdowns
# All keys are lowercase and grouped below by their first letter.
_MISHKAL_HEBREW_Q: dict[str, str] = {
    # --- a ---
    "aqtal": "אַקְטָל",
    "aqtala": "אַקְטָלָה",
    # --- e ---
    "eqtal": "אֶקְטָל",
    # --- h ---
    "haqtala": "הַקְטָלָה",
    "heqtel": "הֶקְטֵל",
    "hiqqatlut": "הִקָּטְלוּת",
    "hitqattlut": "הִתְקַטְּלוּת",
    # --- m ---
    "maqtal": "מַקְטָל",
    "maqtel": "מַקְטֵל",
    "maqtela": "מַקְטֵלָה",
    "maqtelet": "מַקְטֶלֶת",
    "maqtil": "מַקְטִיל",
    "maqtol": "מַקְטוֹל",
    "maqtolet": "מַקְטֹלֶת",
    "maqtul": "מַקְטוּל",
    "meqattel": "מְקַטֵּל",
    "meqila": "מְקִילָה",
    "mequla": "מְקוּלָה",
    "mequttal": "מְקֻטָּל",
    "miqtal": "מִקְטָל",
    "miqtala": "מִקְטָלָה",
    "miqtelet": "מִקְטֶלֶת",
    "miqtol": "מִקְטוֹל",
    "miqtolet": "מִקְטֹלֶת",
    "mitqattel": "מִתְקַטֵּל",
    "muqtal": "מֻקְטָל",
    # --- n ---
    "niqtal": "נִקְטָל",
    # --- q ---
    "qal": "קַל",
    "qatal": "קָטָל",
    "qatel": "קָטֵל",
    "qatil": "קָטִיל",
    "qatla": "קַטְלָה",
    "qatlan": "קַטְלָן",
    "qatlut": "קַטְלוּת",
    "qatol": "קָטוֹל",
    "qaton": "קָטוֹן",
    "qattal": "קַטָּל",
    "qattala": "קַטָּלָה",
    "qattelet": "קַטֶּלֶת",
    "qattil": "קַטִּיל",
    "qattila": "קַטִּילָה",
    "qattolet": "קַטֹּלֶת",
    "qattul": "קַטּוּל",
    "qatul": "קָטוּל",
    "qatut": "קָטוּת",
    "qetel": "קֶטֶל",
    "qeteh": "קֵטֶה",
    "qitla": "קִטְלָה",
    "qitlon": "קִטְלוֹן",
    "qittalon": "קִטָּלוֹן",
    "qittel": "קִטֵּל",
    "qittelet": "קִטֶּלֶת",
    "qittol": "קִטּוֹל",
    "qittolet": "קִטֹּלֶת",
    "qittul": "קִטּוּל",
    "qol": "קֹל",
    "qotal": "קוֹטָל",
    "qotel": "קוֹטֵל",
    "qotelet": "קוֹטֶלֶת",
    "qotla": "קָטְלָה",
    "qtal": "קְטָל",
    "qtala": "קְטָלָה",
    "qtaltal": "קְטַלְטַל",
    "qtaltan": "קְטַלְתָּן",
    "qtaltolet": "קְטַלְטֹלֶת",
    "qtel": "קְטֵל",
    "qtela": "קְטֵלָה",
    "qtelet": "קְטֶלֶת",
    "qtil": "קְטִיל",
    "qtila": "קְטִילָה",
    "qtili": "קְטִילִי",
    "qtol": "קְטוֹל",
    "qtola": "קְטוֹלָה",
    "qtolet": "קְטֹלֶת",
    "qtul": "קְטוּל",
    "qtula": "קְטוּלָה",
    "qtulla": "קְטֻלָּה",
    "qtut": "קְטוּת",
    "qutla": "קֻטְלָה",
    "quttolet": "קֻטּוֹלֶת",
    # --- t ---
    "taqtela": "תַּקְטֵלָה",
    "taqtil": "תַּקְטִיל",
    "taqtit": "תַּקְטִית",
    "taqtul": "תַּקְטוּל",
    "taqtula": "תַּקְטוּלָה",
    "taqtut": "תַּקְטוּת",
    "tiqtal": "תִּקְטָל",
    "tiqtala": "תִּקְטָלָה",
    "tiqtelet": "תִּקְטֶלֶת",
    "tiqtolet": "תִּקְטֹלֶת",
    "tqilla": "תְּקִלָּה",
    "tqula": "תְּקוּלָה",
    # --- y ---
    "yaqtul": "יַקְטוּל",
}
|
|
|
|
|
|
def _mishkal_to_hebrew(mishkal: str) -> str | None:
|
|
"""Look up Hebrew mishkal, handling k-notation → q-notation conversion."""
|
|
if not mishkal:
|
|
return None
|
|
# Try as-is first (q-notation)
|
|
result = _MISHKAL_HEBREW_Q.get(mishkal)
|
|
if result:
|
|
return result
|
|
# Convert k-notation to q-notation and retry
|
|
q_form = mishkal.replace("k", "q")
|
|
return _MISHKAL_HEBREW_Q.get(q_form)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# HTTP session
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# One shared requests session, used by _fetch() for every page request.
_session = requests.Session()
# Send an explicit User-Agent identifying this tool on every request.
_session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki/2.0)"})
|
|
|
|
|
|
def _fetch(url: str, hebstyle: str, backoff: float = REQUEST_DELAY) -> str | None:
    """Fetch a URL with the given hebstyle cookie, retrying with exponential backoff.

    Rate-limit responses (429/503) and transport-level errors are retried.
    The wait doubles after every retry and is capped at 60 seconds; once the
    backoff reaches the cap the function gives up. Both retry paths share the
    same bookkeeping, so neither can loop without bound.

    Args:
        url: URL to request.
        hebstyle: Value sent as the ``hebstyle`` cookie (the module docstring
            uses ``mo`` for nikkud pages and ``vl`` for ktiv male pages).
        backoff: Initial wait in seconds before the first retry.

    Returns:
        The response body as text, or None on a 404, on any other
        non-retryable HTTP status, or after retries are exhausted.
    """
    cookies = {"translit": "none", "hebstyle": hebstyle}
    max_wait = 60.0
    while True:
        try:
            resp = _session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
            resp.raise_for_status()
            return resp.text
        except requests.exceptions.HTTPError as exc:
            status = exc.response.status_code if exc.response is not None else 0
            if status == 404:
                logger.warning(" 404 for %s — skipping", url)
                return None
            if status not in (429, 503):
                logger.error(" HTTP %s for %s", status, url)
                return None
            wait = min(backoff, max_wait)
            logger.warning(" Rate limited (%s) — waiting %.0fs", status, wait)
        except requests.RequestException as exc:
            wait = min(backoff, max_wait)
            logger.warning(" Request error for %s: %s — retrying in %.0fs", url, exc, wait)

        # Shared retry bookkeeping: only retryable failures reach this point.
        time.sleep(wait)
        backoff = min(backoff * 2, max_wait)
        if backoff >= max_wait:
            logger.error(" Giving up on %s", url)
            return None
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Shared HTML parsing utilities
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _get_menukad_and_audio(cell) -> tuple[str, str]:
    """Return ``(nikkud_text, audio_url)`` for one table cell.

    The audio URL is read from the ``data-audio`` attribute of a descendant
    span whose class contains ``audio-play``, falling back to the same
    attribute on the cell itself. The text comes from a ``menukad`` span when
    one exists; otherwise the cell's own text is used, but only if it contains
    at least one Hebrew letter. Either element may be an empty string.
    """
    player = cell.find("span", class_=lambda c: c and "audio-play" in c)
    audio_url = player.get("data-audio", "") if player else ""
    if not audio_url:
        # Fall back to a data-audio attribute placed directly on the cell.
        audio_url = cell.get("data-audio", "") or audio_url

    menukad = cell.find("span", class_="menukad")
    if menukad:
        return menukad.get_text(strip=True), audio_url

    text = cell.get_text(strip=True)
    has_hebrew = re.search(r"[\u05d0-\u05ea]", text)
    return (text if has_hebrew else ""), audio_url
|
|
|
|
|
|
def _get_plain_text(cell) -> str:
    """Return the Hebrew text of a cell without audio info — used for vl pages.

    Prefers the text of a ``menukad`` span; otherwise returns the cell's own
    text when it contains at least one Hebrew letter, else an empty string.
    """
    menukad = cell.find("span", class_="menukad")
    if menukad:
        return menukad.get_text(strip=True)
    text = cell.get_text(strip=True)
    return text if re.search(r"[\u05d0-\u05ea]", text) else ""
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Noun detail parsing
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# Raw gender token (English or Hebrew, as it may appear in page text) ->
# canonical value "masculine" / "feminine". _parse_noun_gender_mishkal()
# tests the tokens in this insertion order.
_GENDER_MAP = {
    "masculine": "masculine",
    "feminine": "feminine",
    "זכר": "masculine",
    "נקבה": "feminine",
    "male": "masculine",
    "female": "feminine",
}

# Canonical gender -> Hebrew label in both spellings (stored as "gender_hebrew").
_GENDER_HEBREW = {
    "masculine": {"nikkud": "זָכָר", "ktiv_male": "זכר"},
    "feminine": {"nikkud": "נְקֵבָה", "ktiv_male": "נקבה"},
}
|
|
|
|
|
|
def _parse_noun_table(soup: BeautifulSoup) -> dict[str, dict | str]:
    """
    Parse the noun declension table from a nikkud (mo) detail page.

    Each row is classified by the text of its label cell (the first ``th``,
    or the first ``td`` when the row has no ``th``). The first two
    Hebrew-containing ``td`` cells of a row are read as singular and plural.

    Returns a dict that may contain: singular_nikkud, plural_nikkud,
    construct_singular_nikkud, construct_plural_nikkud (nikkud strings) and
    singular_audio, plural_audio (audio URLs, only when present).
    Returns an empty dict if no table is found.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    absolute_keys = (("singular_nikkud", "singular_audio"), ("plural_nikkud", "plural_audio"))
    construct_keys = ("construct_singular_nikkud", "construct_plural_nikkud")

    parsed: dict[str, dict | str] = {}
    for tr in table.find_all("tr"):
        cells = tr.find_all("td")
        header = tr.find("th") or (cells[0] if cells else None)
        if not header:
            continue
        label = header.get_text(strip=True).lower()

        # Only cells that actually hold Hebrew are data cells; a leading
        # label td is filtered out here.
        hebrew_cells = [td for td in cells if re.search(r"[\u05d0-\u05ea]", td.get_text())]

        if "absolute" in label or ("singular" in label and "construct" not in label):
            for (text_key, audio_key), cell in zip(absolute_keys, hebrew_cells):
                nikkud, audio = _get_menukad_and_audio(cell)
                parsed[text_key] = nikkud
                if audio:
                    parsed[audio_key] = audio
        elif "construct" in label or "סמיכות" in label:
            for text_key, cell in zip(construct_keys, hebrew_cells):
                nikkud, _ = _get_menukad_and_audio(cell)
                parsed[text_key] = nikkud

    return parsed
|
|
|
|
|
|
def _parse_noun_table_vl(soup: BeautifulSoup) -> dict[str, str]:
    """
    Parse the noun declension table from a vl (ktiv male) page.

    Rows are classified the same way as in the nikkud parser; the first two
    Hebrew-containing ``td`` cells of a row are read as singular and plural.

    Returns a dict that may contain: singular_ktiv, plural_ktiv,
    construct_singular_ktiv, construct_plural_ktiv. Empty if no table is found.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    absolute_keys = ("singular_ktiv", "plural_ktiv")
    construct_keys = ("construct_singular_ktiv", "construct_plural_ktiv")

    parsed: dict[str, str] = {}
    for tr in table.find_all("tr"):
        cells = tr.find_all("td")
        # Label cell: the row's th, else its first td.
        header = tr.find("th")
        if not header and cells:
            header = cells[0]
        if not header:
            continue
        label = header.get_text(strip=True).lower()

        hebrew_cells = [td for td in cells if re.search(r"[\u05d0-\u05ea]", td.get_text())]

        if "absolute" in label or ("singular" in label and "construct" not in label):
            keys = absolute_keys
        elif "construct" in label or "סמיכות" in label:
            keys = construct_keys
        else:
            continue
        for key, cell in zip(keys, hebrew_cells):
            parsed[key] = _get_plain_text(cell)

    return parsed
|
|
|
|
|
|
def _detect_gender_in_text(text: str) -> str:
    """Return the canonical gender named in *text*, or "" when none is found.

    Tokens from _GENDER_MAP are tried in the map's insertion order and must
    match as whole words. A plain substring test would report "masculine" for
    any text containing "female", because "male" is a substring of it and is
    tested first.
    """
    lowered = text.lower()
    for raw, canonical in _GENDER_MAP.items():
        if re.search(rf"(?<!\w){re.escape(raw)}(?!\w)", lowered):
            return canonical
    return ""


def _parse_noun_gender_mishkal(soup: BeautifulSoup) -> tuple[str, str]:
    """
    Extract (gender, mishkal) from the PoS section of the detail page.

    Gender is looked for, in order, in: the PoS section (``div.pos``,
    ``p.pos`` or ``div.page-header``), the ``og:description`` meta tag, and
    any span whose class contains "small", "muted" or "pos". Mishkal is only
    looked for in the PoS section.

    Returns ("masculine"|"feminine"|"", mishkal_english|"").
    """
    gender = ""
    mishkal = ""

    # Try the selectors that may hold PoS info, falling back to the page header.
    pos_section = (
        soup.find("div", class_="pos")
        or soup.find("p", class_="pos")
        or soup.find("div", class_="page-header")
    )

    if pos_section:
        text = pos_section.get_text(" ", strip=True)
        gender = _detect_gender_in_text(text)
        # Mishkal detection: first capitalised ASCII word in the section.
        # The pattern already guarantees letters/apostrophes only and an
        # uppercase first letter, so no further validation is needed.
        # NOTE(review): this picks the FIRST capitalised word, which could be
        # a PoS label rather than a mishkal name — confirm against real pages.
        mishkal_match = re.search(r"\b([A-Z][a-zA-Z\']+)\b", text)
        if mishkal_match:
            mishkal = mishkal_match.group(1)

    # Also check the og:description for gender.
    if not gender:
        meta = soup.find("meta", {"property": "og:description"})
        if meta:
            gender = _detect_gender_in_text(meta.get("content", ""))

    # Scan small/muted/pos spans as a last resort.
    if not gender:
        for span in soup.find_all("span", class_=lambda c: c and ("small" in c or "muted" in c or "pos" in c)):
            # A separator keeps adjacent text nodes from fusing into one
            # word, which would defeat whole-word matching.
            gender = _detect_gender_in_text(span.get_text(" ", strip=True))
            if gender:
                break

    return gender, mishkal
|
|
|
|
|
|
def _scrape_noun_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
    """
    Build the noun inflection record from the two detail pages.

    ``mo_html`` is the nikkud page and ``vl_html`` the ktiv male page. Each
    form becomes ``{"nikkud", "ktiv_male"}``, or None when the nikkud form is
    missing; a missing ktiv male spelling is logged as a warning. Gender and
    mishkal fields are added only when detected.

    Returns dict to merge into entry's noun_inflection field.
    """
    mo_soup = BeautifulSoup(mo_html, "lxml")
    vl_soup = BeautifulSoup(vl_html, "lxml")

    nikkud_forms = _parse_noun_table(mo_soup)
    ktiv_forms = _parse_noun_table_vl(vl_soup)
    gender, mishkal = _parse_noun_gender_mishkal(mo_soup)

    result: dict = {}
    for name in ("singular", "plural", "construct_singular", "construct_plural"):
        nikkud = str(nikkud_forms.get(name + "_nikkud", ""))
        ktiv = str(ktiv_forms.get(name + "_ktiv", ""))
        if not nikkud:
            result[name] = None
            continue
        if not ktiv:
            logger.warning("No ktiv_male for noun form: %s", nikkud)
        result[name] = {"nikkud": nikkud, "ktiv_male": ktiv}

    result["singular_audio"] = nikkud_forms.get("singular_audio")
    result["plural_audio"] = nikkud_forms.get("plural_audio")
    result["pronominal_suffixes"] = None
    # plurals_guid is PRESERVED by the merge step — not set here

    if gender:
        result["gender"] = gender
        result["gender_hebrew"] = _GENDER_HEBREW.get(gender)

    if mishkal:
        result["mishkal"] = mishkal
        result["mishkal_hebrew"] = _mishkal_to_hebrew(mishkal)

    return result
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Verb detail parsing (ported from conjugation_extract.py)
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _extract_binyan_from_page(soup: BeautifulSoup) -> str:
    """Return the binyan named in the page headers or og:description, or "".

    Every ``h3.page-header`` text is searched first, then the og:description
    content. Within a text, names are tried in BINYAN_NAMES order, ignoring case.
    """
    candidates = [h3.get_text(" ", strip=True) for h3 in soup.find_all("h3", class_="page-header")]
    meta = soup.find("meta", {"property": "og:description"})
    if meta:
        candidates.append(str(meta.get("content", "")))

    for candidate in candidates:
        haystack = candidate.lower()
        found = next(
            (name for name, lowered in zip(BINYAN_NAMES, _BINYAN_NAMES_LOWER) if lowered in haystack),
            None,
        )
        if found is not None:
            return found
    return ""
|
|
|
|
|
|
def _parse_conjugation_table(
    soup: BeautifulSoup,
    passive: bool = False,
    table_el=None,
) -> dict[str, dict]:
    """
    Parse a verb conjugation table from a nikkud (mo) page.

    Table selection: with ``passive=True`` the first ``conjugation-table``
    after an ``h3`` whose text contains "passive"; otherwise ``table_el`` if
    given; otherwise the first ``conjugation-table`` on the page.

    Rows are located by the first row whose text mentions each tense name.
    Past and future span three consecutive rows (1st, 2nd, 3rd person).

    Returns form_key -> {"form_nikkud", "audio_url"}; empty dict when the
    table is missing or has fewer than three rows.
    """
    if passive:
        heading = next(
            (h3 for h3 in soup.find_all("h3") if "passive" in h3.get_text(strip=True).lower()),
            None,
        )
        if not heading:
            return {}
        table = next(
            (
                el
                for el in heading.find_all_next()
                if el.name == "table" and "conjugation-table" in el.get("class", [])
            ),
            None,
        )
    elif table_el is not None:
        table = table_el
    else:
        table = soup.find("table", class_="conjugation-table")

    if not table:
        return {}

    rows = table.find_all("tr")
    if len(rows) < 3:
        return {}

    def hebrew_cells(row) -> list[tuple[str, str]]:
        """Return (nikkud_text, audio_url) per Hebrew cell, repeated per colspan."""
        found: list[tuple[str, str]] = []
        for cell in row.find_all(["th", "td"]):
            text, audio = _get_menukad_and_audio(cell)
            span = int(cell.get("colspan", 1))
            if text and re.search(r"[\u05d0-\u05ea]", text):
                found.extend([(text, audio)] * span)
        return found

    def unique_by_text(pairs: list[tuple[str, str]]) -> list[tuple[str, str]]:
        """Drop pairs whose text was already seen, keeping first occurrences."""
        seen: set[str] = set()
        kept: list[tuple[str, str]] = []
        for pair in pairs:
            if pair[0] in seen:
                continue
            seen.add(pair[0])
            kept.append(pair)
        return kept

    # Locate tense rows by label text: each row claims the first tense name it
    # mentions that has not been located yet.
    tense_row: dict[str, int] = {}
    for idx, row in enumerate(rows):
        label = row.get_text(" ", strip=True).lower()
        for tense in ("present", "past", "future", "imperative", "infinitive"):
            if tense in label and tense not in tense_row:
                tense_row[tense] = idx
                break

    # (tense, row offset from the tense's first row, form keys, deduplicate?)
    # Deduplicated rows are those where one cell spans several columns.
    layout = (
        ("present", 0, ("present_ms", "present_fs", "present_mp", "present_fp"), False),
        ("past", 0, ("past_1s", "past_1p"), True),
        ("past", 1, ("past_2ms", "past_2fs", "past_2mp", "past_2fp"), False),
        ("past", 2, ("past_3ms", "past_3fs", "past_3p"), True),
        ("future", 0, ("future_1s", "future_1p"), True),
        ("future", 1, ("future_2ms", "future_2fs", "future_2mp", "future_2fp"), False),
        ("future", 2, ("future_3ms", "future_3fs", "future_3mp", "future_3fp"), False),
        ("imperative", 0, ("imperative_ms", "imperative_fs", "imperative_mp", "imperative_fp"), False),
        ("infinitive", 0, ("infinitive",), False),
    )

    forms: dict[str, dict] = {}
    for tense, offset, keys, dedupe in layout:
        first = tense_row.get(tense)
        if first is None or first + offset >= len(rows):
            continue
        cells = hebrew_cells(rows[first + offset])
        if dedupe:
            cells = unique_by_text(cells)
        # zip stops at the shorter input, so missing cells leave keys unset.
        for key, (nikkud, audio_url) in zip(keys, cells):
            if nikkud:
                forms[key] = {"form_nikkud": nikkud, "audio_url": audio_url}

    return forms
|
|
|
|
|
|
def _parse_conjugation_table_vl(
    soup: BeautifulSoup,
    passive: bool = False,
    table_el=None,
) -> dict[str, str]:
    """
    Parse a verb conjugation table from a vl (ktiv male) page.

    Uses the same table selection and row layout as the nikkud parser, but
    extracts plain text only.

    Returns form_key -> ktiv_male_text; empty dict when the table is missing
    or has fewer than three rows.
    """
    if passive:
        heading = next(
            (h3 for h3 in soup.find_all("h3") if "passive" in h3.get_text(strip=True).lower()),
            None,
        )
        if not heading:
            return {}
        table = next(
            (
                el
                for el in heading.find_all_next()
                if el.name == "table" and "conjugation-table" in el.get("class", [])
            ),
            None,
        )
    elif table_el is not None:
        table = table_el
    else:
        table = soup.find("table", class_="conjugation-table")

    if not table:
        return {}

    rows = table.find_all("tr")
    if len(rows) < 3:
        return {}

    def hebrew_texts(row) -> list[str]:
        """Return the text of each Hebrew cell, repeated per colspan."""
        found: list[str] = []
        for cell in row.find_all(["th", "td"]):
            text = _get_plain_text(cell)
            span = int(cell.get("colspan", 1))
            if text and re.search(r"[\u05d0-\u05ea]", text):
                found.extend([text] * span)
        return found

    def unique(items: list[str]) -> list[str]:
        """Drop repeated strings, keeping first occurrences in order."""
        seen: set[str] = set()
        kept: list[str] = []
        for item in items:
            if item in seen:
                continue
            seen.add(item)
            kept.append(item)
        return kept

    # Each row claims the first tense name it mentions that is still unlocated.
    tense_row: dict[str, int] = {}
    for idx, row in enumerate(rows):
        label = row.get_text(" ", strip=True).lower()
        for tense in ("present", "past", "future", "imperative", "infinitive"):
            if tense in label and tense not in tense_row:
                tense_row[tense] = idx
                break

    # (tense, row offset from the tense's first row, form keys, deduplicate?)
    layout = (
        ("present", 0, ("present_ms", "present_fs", "present_mp", "present_fp"), False),
        ("past", 0, ("past_1s", "past_1p"), True),
        ("past", 1, ("past_2ms", "past_2fs", "past_2mp", "past_2fp"), False),
        ("past", 2, ("past_3ms", "past_3fs", "past_3p"), True),
        ("future", 0, ("future_1s", "future_1p"), True),
        ("future", 1, ("future_2ms", "future_2fs", "future_2mp", "future_2fp"), False),
        ("future", 2, ("future_3ms", "future_3fs", "future_3mp", "future_3fp"), False),
        ("imperative", 0, ("imperative_ms", "imperative_fs", "imperative_mp", "imperative_fp"), False),
        ("infinitive", 0, ("infinitive",), False),
    )

    ktiv_forms: dict[str, str] = {}
    for tense, offset, keys, dedupe in layout:
        first = tense_row.get(tense)
        if first is None or first + offset >= len(rows):
            continue
        texts = hebrew_texts(rows[first + offset])
        if dedupe:
            texts = unique(texts)
        for key, text in zip(keys, texts):
            if text:
                ktiv_forms[key] = text

    return ktiv_forms
|
|
|
|
|
|
def _forms_to_active_list(
|
|
mo_forms: dict[str, dict],
|
|
vl_forms: dict[str, str],
|
|
existing_forms: list[dict] | None,
|
|
) -> list[dict]:
|
|
"""
|
|
Convert parsed form dicts into the active_forms list structure (matches SCHEMA.yaml).
|
|
Preserves guid and guid_candidates from existing_forms where present.
|
|
"""
|
|
# Build a lookup of existing form data keyed by (person, tense) for GUID preservation
|
|
existing_lookup: dict[tuple[str, str], dict] = {}
|
|
if existing_forms:
|
|
for ef in existing_forms:
|
|
key = (ef.get("person", ""), ef.get("tense", ""))
|
|
existing_lookup[key] = ef
|
|
|
|
active_forms: list[dict] = []
|
|
for form_key, form_data in mo_forms.items():
|
|
person = FORM_KEY_TO_PERSON.get(form_key, form_key)
|
|
tense = TENSE_DESCRIPTION.get(form_key, "")
|
|
nikkud = form_data["form_nikkud"]
|
|
ktiv = vl_forms.get(form_key, "")
|
|
if not ktiv:
|
|
logger.warning("No ktiv_male for verb form %s: %s", form_key, nikkud)
|
|
audio_url = form_data.get("audio_url", "")
|
|
pronoun = PRONOUN_LABELS.get(form_key, "")
|
|
|
|
# Preserve GUIDs from existing entry
|
|
existing = existing_lookup.get((person, tense), {})
|
|
guid = existing.get("guid")
|
|
guid_candidates = existing.get("guid_candidates")
|
|
|
|
active_forms.append(
|
|
{
|
|
"person": person,
|
|
"tense": tense,
|
|
"pronoun_hebrew": pronoun,
|
|
"form": {"nikkud": nikkud, "ktiv_male": ktiv},
|
|
"audio_url": audio_url,
|
|
"audio_file": existing.get("audio_file"),
|
|
"guid": guid,
|
|
"guid_candidates": guid_candidates,
|
|
}
|
|
)
|
|
|
|
return active_forms
|
|
|
|
|
|
def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: dict | None) -> dict:
    """
    Build the conjugation record for a verb from its two detail pages.

    ``mo_html`` is the nikkud page and ``vl_html`` the ktiv male page.
    ``in_conjugation_deck``, ``prep`` and the per-form GUID data are carried
    over from ``existing_conj``. When the page has a "passive" heading, the
    passive table is parsed into ``hufal_pual_forms``.

    Returns dict to merge into entry's conjugation field, or an empty dict
    when no active forms could be parsed.
    """
    mo_soup = BeautifulSoup(mo_html, "lxml")
    vl_soup = BeautifulSoup(vl_html, "lxml")
    previous = existing_conj or {}

    # Metadata comes from the nikkud page.
    binyan = _extract_binyan_from_page(mo_soup)
    lead = mo_soup.find("div", class_="lead")
    meaning = lead.get_text(strip=True) if lead else ""

    mo_active = _parse_conjugation_table(mo_soup, passive=False)
    vl_active = _parse_conjugation_table_vl(vl_soup, passive=False)
    if not mo_active:
        logger.warning(" No active forms found for slug=%s", slug)
        return {}

    def pair_form(mo_forms: dict, vl_forms: dict, key: str, warning: str) -> dict | None:
        """Return {"nikkud", "ktiv_male"} for one form key, or None if absent."""
        nikkud = mo_forms.get(key, {}).get("form_nikkud", "")
        ktiv = vl_forms.get(key, "")
        if nikkud and not ktiv:
            logger.warning(warning, nikkud, slug)
        return {"nikkud": nikkud, "ktiv_male": ktiv} if nikkud else None

    infinitive = pair_form(mo_active, vl_active, "infinitive", "No ktiv_male for infinitive: %s (slug=%s)")
    reference_form = pair_form(mo_active, vl_active, "past_3ms", "No ktiv_male for past_3ms: %s (slug=%s)")

    # Build the active forms list, preserving GUIDs from the saved entry.
    active_forms = _forms_to_active_list(mo_active, vl_active, previous.get("active_forms"))

    # A "passive" heading marks a second table (Hif'il / Pi'el verbs).
    has_passive = any("passive" in h3.get_text(strip=True).lower() for h3 in mo_soup.find_all("h3"))
    hufal_pual_forms = None
    reference_form_passive = None
    if has_passive:
        mo_passive = _parse_conjugation_table(mo_soup, passive=True)
        vl_passive = _parse_conjugation_table_vl(vl_soup, passive=True)
        if mo_passive:
            hufal_pual_forms = _forms_to_active_list(mo_passive, vl_passive, previous.get("hufal_pual_forms"))
            reference_form_passive = pair_form(
                mo_passive,
                vl_passive,
                "past_3ms",
                "No ktiv_male for passive past_3ms: %s (slug=%s)",
            )

    return {
        "in_conjugation_deck": previous.get("in_conjugation_deck", False),
        "infinitive": infinitive,
        "reference_form": reference_form,
        "binyan": binyan,
        "binyan_hebrew": BINYAN_HEBREW.get(binyan, ""),
        "meaning": meaning,
        "prep": previous.get("prep"),
        "active_forms": active_forms,
        "hufal_pual_forms": hufal_pual_forms,
        "reference_form_passive": reference_form_passive,
    }
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Adjective detail parsing
|
|
# ---------------------------------------------------------------------------
|
|
|
|
# HTML element ids of the four adjective form cells, index-aligned with
# _ADJECTIVE_FORM_KEYS (the two tuples are zipped with strict=True).
_ADJECTIVE_CELL_IDS: tuple[str, ...] = ("ms-a", "fs-a", "mp-a", "fp-a")
# Output keys for the adjective forms, in the same order as the cell ids above.
_ADJECTIVE_FORM_KEYS: tuple[str, ...] = ("ms", "fs", "mp", "fp")
|
|
|
|
|
|
def _parse_adjective_table(soup: BeautifulSoup) -> dict[str, dict]:
    """
    Read the adjective inflection cells from a mo (nikkud) detail page.

    Looks up the first ``table.conjugation-table`` and, inside it, the
    elements whose IDs are listed in _ADJECTIVE_CELL_IDS. Each element found
    is passed to _get_menukad_and_audio; the returned pair is stored as
    ``nikkud`` and ``audio_url``.

    Returns:
        Dict mapping form key ("ms", "fs", "mp", "fp") to
        {"nikkud": ..., "audio_url": ...}. Forms whose cell is missing or
        whose nikkud value is falsy are omitted. Empty dict if the table is
        not found.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    # Lazy pairing of output key -> located cell; lookups happen one at a
    # time as the loop below advances.
    located_cells = (
        (form_key, table.find(id=cell_id))
        for cell_id, form_key in zip(_ADJECTIVE_CELL_IDS, _ADJECTIVE_FORM_KEYS, strict=True)
    )

    forms: dict[str, dict] = {}
    for form_key, cell in located_cells:
        if not cell:
            continue
        nikkud, audio_url = _get_menukad_and_audio(cell)
        if not nikkud:
            continue
        forms[form_key] = {"nikkud": nikkud, "audio_url": audio_url}
    return forms
|
|
|
|
|
|
def _parse_adjective_table_vl(soup: BeautifulSoup) -> dict[str, str]:
    """
    Read the adjective inflection cells from a vl (ktiv male) detail page.

    Uses the same table and cell IDs as the nikkud parser, but takes each
    cell's text via _get_plain_text.

    Returns:
        Dict mapping form key ("ms", "fs", "mp", "fp") to the ktiv male
        value. Forms whose cell is missing or whose text is falsy are
        omitted. Empty dict if the table is not found.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    located_cells = (
        (form_key, table.find(id=cell_id))
        for cell_id, form_key in zip(_ADJECTIVE_CELL_IDS, _ADJECTIVE_FORM_KEYS, strict=True)
    )
    cell_texts = ((form_key, _get_plain_text(cell)) for form_key, cell in located_cells if cell)
    return {form_key: ktiv for form_key, ktiv in cell_texts if ktiv}
|
|
|
|
|
|
def _parse_adjective_mishkal(soup: BeautifulSoup) -> tuple[str, str]:
    """
    Get the mishkal of an adjective detail page in English and Hebrew.

    Delegates to _parse_noun_gender_mishkal and keeps only the second element
    of its result; the first (gender, judging by the helper's name) is
    discarded. The Hebrew name comes from _mishkal_to_hebrew.

    Returns:
        Tuple of (mishkal_english, mishkal_hebrew). mishkal_hebrew is "" when
        _mishkal_to_hebrew returns a falsy value.
    """
    _unused_gender, mishkal_english = _parse_noun_gender_mishkal(soup)
    hebrew_name = _mishkal_to_hebrew(mishkal_english)
    return mishkal_english, hebrew_name or ""
|
|
|
|
|
|
def _scrape_adjective_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
    """
    Build the adjective inflection record from the two detail pages.

    Args:
        _slug: Not used by this function.
        mo_html: HTML of the page fetched with nikkud spelling.
        vl_html: HTML of the page fetched with ktiv male spelling.

    Returns:
        Dict with one entry per key in _ADJECTIVE_FORM_KEYS, each either
        {"nikkud": ..., "ktiv_male": ...} or None when the nikkud page had no
        such form, plus "mishkal" and "mishkal_hebrew" (None when empty).
        Returns an empty dict if the nikkud page yielded no forms at all.
    """
    nikkud_soup = BeautifulSoup(mo_html, "lxml")
    ktiv_soup = BeautifulSoup(vl_html, "lxml")

    nikkud_forms = _parse_adjective_table(nikkud_soup)
    ktiv_forms = _parse_adjective_table_vl(ktiv_soup)
    mishkal, mishkal_hebrew = _parse_adjective_mishkal(nikkud_soup)

    if not nikkud_forms:
        return {}

    inflection: dict = {}
    for form_key in _ADJECTIVE_FORM_KEYS:
        scraped_form = nikkud_forms.get(form_key)
        if not scraped_form:
            inflection[form_key] = None
            continue

        nikkud = scraped_form["nikkud"]
        ktiv = ktiv_forms.get(form_key, "")
        if not ktiv:
            # The form is still recorded, with an empty ktiv_male.
            logger.warning("No ktiv_male for adjective form %s: %s", form_key, nikkud)
        inflection[form_key] = {"nikkud": nikkud, "ktiv_male": ktiv}

    inflection["mishkal"] = mishkal or None
    inflection["mishkal_hebrew"] = mishkal_hebrew or None
    return inflection
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Preposition detail parsing
|
|
# ---------------------------------------------------------------------------
|
|
|
|
_PREPOSITION_CELL_IDS: tuple[str, ...] = (
|
|
"P-1s",
|
|
"P-1p",
|
|
"P-2ms",
|
|
"P-2fs",
|
|
"P-2mp",
|
|
"P-2fp",
|
|
"P-3ms",
|
|
"P-3fs",
|
|
"P-3mp",
|
|
"P-3fp",
|
|
)
|
|
_PREPOSITION_FORM_KEYS: tuple[str, ...] = (
|
|
"1s",
|
|
"1p",
|
|
"2ms",
|
|
"2fs",
|
|
"2mp",
|
|
"2fp",
|
|
"3ms",
|
|
"3fs",
|
|
"3mp",
|
|
"3fp",
|
|
)
|
|
|
|
|
|
def _parse_preposition_table(soup: BeautifulSoup) -> dict[str, dict]:
    """
    Read the pronominal-suffix cells of a preposition from a mo (nikkud) page.

    Looks up the first ``table.conjugation-table`` and, inside it, the
    elements whose IDs are listed in _PREPOSITION_CELL_IDS (P-1s … P-3fp).
    Each element found is passed to _get_menukad_and_audio; the returned pair
    is stored as ``nikkud`` and ``audio_url``.

    Returns:
        Dict mapping person key ("1s", "1p", …, "3fp") to
        {"nikkud": ..., "audio_url": ...}. Forms whose cell is missing or
        whose nikkud value is falsy are omitted. Empty dict if the table is
        not found.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    # Lazy pairing of output key -> located cell; lookups happen one at a
    # time as the loop below advances.
    located_cells = (
        (form_key, table.find(id=cell_id))
        for cell_id, form_key in zip(_PREPOSITION_CELL_IDS, _PREPOSITION_FORM_KEYS, strict=True)
    )

    forms: dict[str, dict] = {}
    for form_key, cell in located_cells:
        if not cell:
            continue
        nikkud, audio_url = _get_menukad_and_audio(cell)
        if not nikkud:
            continue
        forms[form_key] = {"nikkud": nikkud, "audio_url": audio_url}
    return forms
|
|
|
|
|
|
def _parse_preposition_table_vl(soup: BeautifulSoup) -> dict[str, str]:
    """
    Read the pronominal-suffix cells of a preposition from a vl (ktiv male) page.

    Uses the same table and cell IDs as the nikkud parser, but takes each
    cell's text via _get_plain_text.

    Returns:
        Dict mapping person key ("1s", "1p", …, "3fp") to the ktiv male
        value. Forms whose cell is missing or whose text is falsy are
        omitted. Empty dict if the table is not found.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    located_cells = (
        (form_key, table.find(id=cell_id))
        for cell_id, form_key in zip(_PREPOSITION_CELL_IDS, _PREPOSITION_FORM_KEYS, strict=True)
    )
    cell_texts = ((form_key, _get_plain_text(cell)) for form_key, cell in located_cells if cell)
    return {form_key: ktiv for form_key, ktiv in cell_texts if ktiv}
|
|
|
|
|
|
def _scrape_preposition_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
    """
    Build the preposition inflection record from the two detail pages.

    Args:
        _slug: Not used by this function.
        mo_html: HTML of the page fetched with nikkud spelling.
        vl_html: HTML of the page fetched with ktiv male spelling.

    Returns:
        Dict with one entry per key in _PREPOSITION_FORM_KEYS
        (1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp), each either
        {"nikkud": ..., "ktiv_male": ...} or None when the nikkud page had no
        such form. Returns an empty dict if the nikkud page yielded no forms
        at all.
    """
    nikkud_soup = BeautifulSoup(mo_html, "lxml")
    ktiv_soup = BeautifulSoup(vl_html, "lxml")

    nikkud_forms = _parse_preposition_table(nikkud_soup)
    ktiv_forms = _parse_preposition_table_vl(ktiv_soup)

    if not nikkud_forms:
        return {}

    inflection: dict = {}
    for form_key in _PREPOSITION_FORM_KEYS:
        scraped_form = nikkud_forms.get(form_key)
        if not scraped_form:
            inflection[form_key] = None
            continue

        nikkud = scraped_form["nikkud"]
        ktiv = ktiv_forms.get(form_key, "")
        if not ktiv:
            # The form is still recorded, with an empty ktiv_male.
            logger.warning("No ktiv_male for preposition form %s: %s", form_key, nikkud)
        inflection[form_key] = {"nikkud": nikkud, "ktiv_male": ktiv}

    return inflection
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Merging strategy
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _merge_noun_inflection(existing_ni: dict | None, scraped: dict) -> dict:
|
|
"""
|
|
Merge scraped noun data into existing noun_inflection, preserving plurals_guid.
|
|
"""
|
|
result = dict(scraped)
|
|
if existing_ni:
|
|
# PRESERVE existing plurals_guid — never overwrite
|
|
if existing_ni.get("plurals_guid"):
|
|
result["plurals_guid"] = existing_ni["plurals_guid"]
|
|
# Preserve existing singular_audio if we didn't scrape one
|
|
if not result.get("singular_audio") and existing_ni.get("singular_audio"):
|
|
result["singular_audio"] = existing_ni["singular_audio"]
|
|
# Preserve existing plural_audio if we didn't scrape one
|
|
if not result.get("plural_audio") and existing_ni.get("plural_audio"):
|
|
result["plural_audio"] = existing_ni["plural_audio"]
|
|
# Preserve existing singular/plural if we failed to scrape them
|
|
for field in ("singular", "plural", "construct_singular", "construct_plural"):
|
|
if not result.get(field) and existing_ni.get(field):
|
|
result[field] = existing_ni[field]
|
|
else:
|
|
result.setdefault("plurals_guid", None)
|
|
|
|
return result
|
|
|
|
|
|
def _merge_conjugation(_existing_conj: dict | None, scraped: dict) -> dict:
|
|
"""
|
|
Merge scraped verb data into existing conjugation, preserving in_conjugation_deck
|
|
and all guid/guid_candidates fields (already handled in _forms_to_active_list).
|
|
"""
|
|
# The scraped dict already preserves in_conjugation_deck and GUIDs via _forms_to_active_list
|
|
return scraped
|
|
|
|
|
|
def _merge_adjective_inflection(_existing_ai: dict | None, scraped: dict) -> dict:
|
|
"""
|
|
Merge scraped adjective data into existing adjective_inflection.
|
|
No GUIDs to preserve — simple overwrite with scraped data.
|
|
"""
|
|
return dict(scraped)
|
|
|
|
|
|
def _merge_preposition_inflection(_existing_pi: dict | None, scraped: dict) -> dict:
|
|
"""
|
|
Merge scraped preposition data into existing preposition_inflection.
|
|
No GUIDs to preserve — simple overwrite with scraped data.
|
|
"""
|
|
return dict(scraped)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# I/O helpers
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _load_words() -> dict:
    """Read and return the parsed contents of WORDS_JSON, or {} if the file does not exist."""
    if not WORDS_JSON.exists():
        return {}
    with open(WORDS_JSON, encoding="utf-8") as words_file:
        loaded = json.load(words_file)
    return loaded
|
|
|
|
|
|
def _save_words(data: dict) -> None:
    """
    Write ``data`` to WORDS_JSON as indented, non-ASCII-escaped JSON.

    The parent directory is created if needed. The JSON is first written to a
    sibling "<name>.tmp" file, which is then moved over WORDS_JSON with
    os.replace so the target is swapped in a single step.
    """
    target_dir = WORDS_JSON.parent
    target_dir.mkdir(parents=True, exist_ok=True)

    staging_path = f"{WORDS_JSON}.tmp"
    with open(staging_path, "w", encoding="utf-8") as staging_file:
        json.dump(data, staging_file, ensure_ascii=False, indent=2)

    os.replace(staging_path, WORDS_JSON)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main scrape loop
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _should_process(
|
|
entry: dict,
|
|
pos: str,
|
|
force: bool,
|
|
nouns_only: bool,
|
|
verbs_only: bool,
|
|
adjectives_only: bool,
|
|
prepositions_only: bool,
|
|
) -> bool:
|
|
"""Return True if this entry should be scraped."""
|
|
if not pos.startswith(("Noun", "Verb", "Adjective", "Preposition")):
|
|
return False
|
|
if nouns_only and not pos.startswith("Noun"):
|
|
return False
|
|
if verbs_only and not pos.startswith("Verb"):
|
|
return False
|
|
if adjectives_only and not pos.startswith("Adjective"):
|
|
return False
|
|
if prepositions_only and not pos.startswith("Preposition"):
|
|
return False
|
|
return force or not entry.get("detail_scraped")
|
|
|
|
|
|
def run(
    test: int | None = None,
    force_refresh: bool = False,
    nouns_only: bool = False,
    verbs_only: bool = False,
    adjectives_only: bool = False,
    prepositions_only: bool = False,
) -> None:
    """
    Main scrape loop.

    Loads words.json, selects the entries accepted by _should_process that
    also have a slug, and for each one fetches the mo (nikkud) and vl (ktiv
    male) detail pages, parses them according to the entry's part of speech,
    stores the merged result on the entry and sets detail_scraped=True.
    Entries whose fetch or parse fails are counted as errors and left
    unmarked. The data is saved every SAVE_INTERVAL processed entries and
    once more at the end.

    Args:
        test: If set, scrape at most this many entries (for smoke-testing).
            Values below zero are treated as zero.
        force_refresh: Re-scrape entries where detail_scraped=True.
        nouns_only: Only scrape noun entries.
        verbs_only: Only scrape verb entries.
        adjectives_only: Only scrape adjective entries.
        prepositions_only: Only scrape preposition entries.
    """
    words = _load_words()

    candidates = [
        (unique_key, entry)
        for unique_key, entry in words.items()
        if _should_process(
            entry,
            # "pos" may be present but null in the JSON; treat it as unknown
            # instead of crashing on None.startswith().
            entry.get("pos") or "",
            force_refresh,
            nouns_only,
            verbs_only,
            adjectives_only,
            prepositions_only,
        )
        and entry.get("slug")
    ]

    # Number of eligible entries before any --test truncation.
    total = len(candidates)
    if test is not None:
        # Clamp so a negative N selects nothing rather than slicing from the end.
        candidates = candidates[: max(test, 0)]
        logger.info("Test mode: processing %d of %d eligible entries", len(candidates), total)
    else:
        logger.info(
            "Processing %d eligible entries (nouns+verbs+adjectives+prepositions) from words.json",
            total,
        )

    processed = 0
    errors = 0

    for idx, (unique_key, entry) in enumerate(candidates, start=1):
        slug = entry["slug"]
        pos = entry.get("pos") or ""
        # "word" may be null; fall back to the dictionary key for logging.
        word_nikkud = (entry.get("word") or {}).get("nikkud", unique_key)
        url = f"{PEALIM_BASE}/dict/{slug}/"

        # _should_process only admits these four parts of speech, so anything
        # that is not a noun, verb or adjective is a preposition.
        if pos.startswith("Noun"):
            label = "Noun"
        elif pos.startswith("Verb"):
            label = "Verb"
        elif pos.startswith("Adjective"):
            label = "Adjective"
        else:
            label = "Preposition"
        logger.info("[%d/%d] %s: %s (%s)", idx, len(candidates), label, word_nikkud, slug)

        # Fetch mo (nikkud) page
        time.sleep(REQUEST_DELAY)
        mo_html = _fetch(url, hebstyle="mo")
        if not mo_html:
            logger.warning(" Skipping %s — failed to fetch mo page", slug)
            errors += 1
            continue

        # Fetch vl (ktiv male) page
        time.sleep(REQUEST_DELAY)
        vl_html = _fetch(url, hebstyle="vl")
        if not vl_html:
            logger.warning(" Skipping %s — failed to fetch vl page", slug)
            errors += 1
            continue

        # Parse and merge
        try:
            if label == "Noun":
                scraped = _scrape_noun_detail(slug, mo_html, vl_html)
                if scraped:
                    existing_ni = entry.get("noun_inflection") or {}
                    merged = _merge_noun_inflection(existing_ni, scraped)
                    words[unique_key]["noun_inflection"] = merged
                    sg = merged.get("singular", {}) or {}
                    pl = merged.get("plural", {}) or {}
                    logger.info(
                        " singular=%s plural=%s",
                        sg.get("nikkud", "—"),
                        pl.get("nikkud", "—"),
                    )
                else:
                    logger.warning(" No noun data scraped for %s", slug)
                    errors += 1
                    continue

            elif label == "Verb":
                existing_conj = entry.get("conjugation")
                scraped = _scrape_verb_detail(slug, mo_html, vl_html, existing_conj)
                if scraped:
                    merged = _merge_conjugation(existing_conj, scraped)
                    words[unique_key]["conjugation"] = merged
                    n_forms = len(merged.get("active_forms", []))
                    logger.info(
                        " %s, %d forms",
                        merged.get("binyan", "?"),
                        n_forms,
                    )
                else:
                    logger.warning(" No verb data scraped for %s", slug)
                    errors += 1
                    continue

            elif label == "Adjective":
                scraped = _scrape_adjective_detail(slug, mo_html, vl_html)
                if scraped:
                    existing_ai = entry.get("adjective_inflection")
                    merged = _merge_adjective_inflection(existing_ai, scraped)
                    words[unique_key]["adjective_inflection"] = merged
                    ms = merged.get("ms", {}) or {}
                    fs = merged.get("fs", {}) or {}
                    logger.info(
                        " ms=%s fs=%s mishkal=%s",
                        ms.get("nikkud", "—"),
                        fs.get("nikkud", "—"),
                        merged.get("mishkal", "—"),
                    )
                else:
                    logger.warning(" No adjective data scraped for %s", slug)
                    errors += 1
                    continue

            else:  # Preposition
                scraped = _scrape_preposition_detail(slug, mo_html, vl_html)
                if scraped:
                    existing_pi = entry.get("preposition_inflection")
                    merged = _merge_preposition_inflection(existing_pi, scraped)
                    words[unique_key]["preposition_inflection"] = merged
                    form_1s = merged.get("1s", {}) or {}
                    logger.info(
                        " 1s=%s",
                        form_1s.get("nikkud", "—"),
                    )
                else:
                    logger.warning(" No preposition data scraped for %s", slug)
                    errors += 1
                    continue

        # Per-entry boundary: one malformed page must not abort the whole run.
        except Exception as exc:  # noqa: BLE001
            logger.error(" Parse error for %s (%s): %s", slug, word_nikkud, exc, exc_info=True)
            errors += 1
            continue

        words[unique_key]["detail_scraped"] = True
        processed += 1

        # Incremental save every SAVE_INTERVAL entries
        if processed % SAVE_INTERVAL == 0:
            logger.info(" Auto-saving after %d entries...", processed)
            _save_words(words)

    # Final save
    _save_words(words)
    # Report the pre-truncation count: in test mode len(candidates) is only
    # the number attempted, not the number eligible.
    logger.info(
        "Done. Processed=%d, Errors=%d, Total eligible=%d",
        processed,
        errors,
        total,
    )
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CLI
|
|
# ---------------------------------------------------------------------------
|
|
|
|
|
|
def _build_parser() -> argparse.ArgumentParser:
|
|
parser = argparse.ArgumentParser(
|
|
description=("Scrape pealim.com detail pages for nouns, verbs, adjectives and prepositions in data/words.json.")
|
|
)
|
|
parser.add_argument(
|
|
"--test",
|
|
metavar="N",
|
|
type=int,
|
|
default=None,
|
|
help="Scrape only N entries (smoke-test mode).",
|
|
)
|
|
parser.add_argument(
|
|
"--force-refresh-detail",
|
|
action="store_true",
|
|
default=False,
|
|
help="Re-scrape entries where detail_scraped=True.",
|
|
)
|
|
group = parser.add_mutually_exclusive_group()
|
|
group.add_argument(
|
|
"--nouns-only",
|
|
action="store_true",
|
|
default=False,
|
|
help="Only scrape Noun entries.",
|
|
)
|
|
group.add_argument(
|
|
"--verbs-only",
|
|
action="store_true",
|
|
default=False,
|
|
help="Only scrape Verb entries.",
|
|
)
|
|
group.add_argument(
|
|
"--adjectives-only",
|
|
action="store_true",
|
|
default=False,
|
|
help="Only scrape Adjective entries.",
|
|
)
|
|
group.add_argument(
|
|
"--prepositions-only",
|
|
action="store_true",
|
|
default=False,
|
|
help="Only scrape Preposition entries.",
|
|
)
|
|
return parser
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: configure INFO-level logging, parse the command
    # line and hand the options to run().
    logging.basicConfig(
        format="%(asctime)s %(levelname)s %(message)s",
        datefmt="%H:%M:%S",
        level=logging.INFO,
    )

    cli_args = _build_parser().parse_args()
    run(
        test=cli_args.test,
        force_refresh=cli_args.force_refresh_detail,
        nouns_only=cli_args.nouns_only,
        verbs_only=cli_args.verbs_only,
        adjectives_only=cli_args.adjectives_only,
        prepositions_only=cli_args.prepositions_only,
    )
|