# hebrew_flash_cards/conjugation_extract.py

#!/usr/bin/env python3
"""
Extract Hebrew verb conjugations from pealim.com.
Input: verbs_input.txt  (one Hebrew infinitive per line)
Output: data/conjugations.json

For each verb:
  1. Search pealim.com/search/?q=<verb> to find URL slug
  2. Fetch /dict/<slug>/ with hebstyle=mo cookie
  3. Parse conjugation table by row labels

Resume-safe: verbs already in conjugations.json are skipped, including verbs
whose lookup failed and was stored as null.
"""

import json
import logging
import re
import time
import urllib.parse
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# Module-level logger; handlers are configured by the caller (see __main__).
logger = logging.getLogger(__name__)

# Site root that every search and dictionary URL is built from.
PEALIM_BASE = "https://www.pealim.com"
# Seconds slept before each HTTP request.
REQUEST_DELAY = 1.5
# Per-request timeout, in seconds, handed to requests.
REQUEST_TIMEOUT = 15
# Input list of infinitives, next to this module.
VERBS_INPUT = Path(__file__).parent.joinpath("verbs_input.txt")
# JSON file holding the extracted conjugations, under ./data next to this module.
CONJUGATIONS_PATH = Path(__file__).parent.joinpath("data", "conjugations.json")

# Subject pronoun displayed on the card front for each form key.
# An empty string means the card shows no pronoun for that form.
PRONOUN_LABELS = {
    # Present-tense forms carry no pronoun.
    **dict.fromkeys(("present_ms", "present_fs", "present_mp", "present_fp"), ""),
    # Past tense: a single 3rd-person plural entry covers both genders.
    **{
        f"past_{person}": pronoun
        for person, pronoun in (
            ("1s", "אֲנִי"),
            ("1p", "אֲנַחְנוּ"),
            ("2ms", "אַתָּה"),
            ("2fs", "אַתְּ"),
            ("2mp", "אַתֶּם"),
            ("2fp", "אַתֶּן"),
            ("3ms", "הוּא"),
            ("3fs", "הִיא"),
            ("3p", "הֵם / הֵן"),
        )
    },
    # Future tense: 3rd-person plural is split by gender.
    **{
        f"future_{person}": pronoun
        for person, pronoun in (
            ("1s", "אֲנִי"),
            ("1p", "אֲנַחְנוּ"),
            ("2ms", "אַתָּה"),
            ("2fs", "אַתְּ"),
            ("2mp", "אַתֶּם"),
            ("2fp", "אַתֶּן"),
            ("3ms", "הוּא"),
            ("3fs", "הִיא"),
            ("3mp", "הֵם"),
            ("3fp", "הֵן"),
        )
    },
    # Imperative forms use the 2nd-person pronouns.
    **{
        f"imperative_{gender}": pronoun
        for gender, pronoun in (
            ("ms", "אַתָּה"),
            ("fs", "אַתְּ"),
            ("mp", "אַתֶּם"),
            ("fp", "אַתֶּן"),
        )
    },
    "infinitive": "",
}

# Tense name displayed on the card front for each form key.
# Present-tense entries additionally spell out gender and number in parentheses.
TENSE_DESCRIPTION = {
    **{
        f"present_{suffix}": "הוֹוֶה" + f" ({gloss})"
        for suffix, gloss in (
            ("ms", "זכר יחיד"),
            ("fs", "נקבה יחיד"),
            ("mp", "זכר רבים"),
            ("fp", "נקבה רבים"),
        )
    },
    **dict.fromkeys(
        (
            f"past_{person}"
            for person in ("1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3p")
        ),
        "עָבָר",
    ),
    **dict.fromkeys(
        (
            f"future_{person}"
            for person in ("1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp")
        ),
        "עָתִיד",
    ),
    **dict.fromkeys(
        (f"imperative_{gender}" for gender in ("ms", "fs", "mp", "fp")),
        "צִוּוּי",
    ),
    "infinitive": "מְקוֹר",
}

# Shared HTTP session used for every pealim.com request in this module;
# it carries the User-Agent header sent with each of them.
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (compatible; pealim-anki/2.0)"


def _find_slug(infinitive: str) -> str | None:
    """Look up a verb on pealim.com and return the slug of the first dictionary link.

    Requests ``/search/?q=<infinitive>`` and scans the response body for the
    first link of the form ``/dict/<digits>-<name>/``.

    Args:
        infinitive: Hebrew infinitive to search for; it is percent-encoded
            before being placed in the query string.

    Returns:
        The path component after ``/dict/`` (e.g. ``2255-lishmor``), or
        ``None`` when the request fails or the page has no matching link.
        Failures are logged, never raised.
    """
    query = urllib.parse.quote(infinitive)
    search_url = f"{PEALIM_BASE}/search/?q={query}"
    try:
        response = session.get(search_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        # Slugs look like /dict/2255-lishmor/
        first_hit = re.search(r"/dict/(\d+[-][^/?\"<>\s]+)/", response.text)
        if first_hit is not None:
            slug = first_hit.group(1)
            logger.info(f"  Slug: {slug}")
            return slug
    except Exception as e:
        logger.error(f"  Error searching for '{infinitive}': {e}")
    return None


def _is_passive_binyan(binyan: str) -> bool:
    """Return True when *binyan* mentions one of the passive patterns.

    The test is a case-insensitive substring match against Hebrew and
    transliterated spellings of Pu'al and Huf'al.
    """
    passive_markers = ("פֻּעַל", "הֻפְעַל", "Pu'al", "Huf'al", "pual", "hufal")
    lowered = binyan.lower()
    return any(marker.lower() in lowered for marker in passive_markers)


def _get_menukad(cell) -> str:
    """Return the Hebrew text of a table cell, or "" when it has none.

    The text of the first ``<span class="menukad">`` inside the cell is
    preferred and returned as-is. Without such a span, the cell's whole
    stripped text is returned, but only if it contains at least one Hebrew
    letter (U+05D0..U+05EA).
    """
    menukad_span = cell.find("span", class_="menukad")
    if not menukad_span:
        # No dedicated span: fall back to the raw cell text if it is Hebrew.
        cell_text = cell.get_text(strip=True)
        has_hebrew = re.search(r"[\u05d0-\u05ea]", cell_text) is not None
        return cell_text if has_hebrew else ""
    return menukad_span.get_text(strip=True)


def _parse_table(soup: BeautifulSoup) -> dict[str, str]:
    """
    Parse the pealim conjugation table and return form_key -> Hebrew form mapping.

    Each tense is anchored at the first row whose text contains "present",
    "past", "future", "imperative" or "infinitive" (case-insensitive). For
    past and future, the 2nd- and 3rd-person rows are taken to be the two
    rows directly below the anchor row.

    Expected table structure (rows after two header rows):
      Row 2  (Present): [label x2] [ms] [fs] [mp] [fp]
      Row 3  (Past 1):  [Past x1] [1st x1] [1s x2] [1p x2]
      Row 4  (Past 2):  [2nd x1] [2ms] [2fs] [2mp] [2fp]
      Row 5  (Past 3):  [3rd x1] [3ms] [3fs] [3p x2]
      Row 6  (Fut 1):   [Future x1] [1st x1] [1s x2] [1p x2]
      Row 7  (Fut 2):   [2nd x1] [2ms] [2fs] [2mp] [2fp]
      Row 8  (Fut 3):   [3rd x1] [3ms] [3fs] [3mp] [3fp]
      Row 9  (Imp):     [Imp x2] [ms] [fs] [mp] [fp]
      Row 10 (Inf):     [Inf x2] [form x4]

    Args:
        soup: Parsed dictionary page.

    Returns:
        Mapping with keys such as ``present_ms``, ``past_1s``, ``future_3fp``,
        ``imperative_fs`` and ``infinitive``, holding only the forms that were
        found. ``{}`` when the page has no ``table.conjugation-table`` or the
        table has fewer than 9 rows.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    rows = table.find_all("tr")
    if len(rows) < 9:
        return {}

    hebrew_letter = re.compile(r"[\u05d0-\u05ea]")

    def colspan_of(cell) -> int:
        """Return the cell's colspan, treating a malformed value as 1."""
        try:
            return int(cell.get("colspan", 1))
        except (TypeError, ValueError):
            # A broken attribute on one cell should not abort the whole run.
            return 1

    def hebrew_forms(row_idx: int) -> list[str]:
        """Hebrew cell texts of a row: label cells skipped, colspans expanded."""
        if not 0 <= row_idx < len(rows):
            return []
        found: list[str] = []
        for cell in rows[row_idx].find_all(["th", "td"]):
            text = _get_menukad(cell)
            if text and hebrew_letter.search(text):
                found.extend([text] * colspan_of(cell))
        return found

    def collapse_runs(values: list[str]) -> list[str]:
        """Drop consecutive repeats (the copies a colspan > 1 cell expands to).

        Only adjacent duplicates are merged, so an identical form appearing
        later in the row keeps its own position.
        """
        collapsed: list[str] = []
        for value in values:
            if not collapsed or collapsed[-1] != value:
                collapsed.append(value)
        return collapsed

    # Anchor each tense at the first row that mentions it. A row is claimed by
    # the first not-yet-anchored tense found in its text, in the order below.
    anchors: dict[str, int] = {}
    for idx, row in enumerate(rows):
        label = row.get_text(" ", strip=True).lower()
        for tense in ("present", "past", "future", "imperative", "infinitive"):
            if tense in label and tense not in anchors:
                anchors[tense] = idx
                break

    genders = ("ms", "fs", "mp", "fp")
    second_person = ("2ms", "2fs", "2mp", "2fp")
    # (tense, row offset from the anchor, key suffixes in column order,
    #  whether the row has merged cells whose expanded copies must be collapsed)
    layout = (
        ("present", 0, genders, False),
        ("past", 0, ("1s", "1p"), True),
        ("past", 1, second_person, False),
        ("past", 2, ("3ms", "3fs", "3p"), True),
        ("future", 0, ("1s", "1p"), True),
        ("future", 1, second_person, False),
        ("future", 2, ("3ms", "3fs", "3mp", "3fp"), False),
        ("imperative", 0, genders, False),
    )

    forms: dict[str, str] = {}
    for tense, offset, suffixes, merged in layout:
        if tense not in anchors:
            continue
        values = hebrew_forms(anchors[tense] + offset)
        if merged:
            values = collapse_runs(values)
        # zip() stops at the shorter side, so missing cells leave keys unset.
        for suffix, value in zip(suffixes, values):
            forms[f"{tense}_{suffix}"] = value

    # Infinitive: a single form; take the first Hebrew cell of its row.
    if "infinitive" in anchors:
        infinitive_cells = hebrew_forms(anchors["infinitive"])
        if infinitive_cells:
            forms["infinitive"] = infinitive_cells[0]

    return forms


def _extract_conjugations(slug: str, infinitive: str) -> dict | None:
    """Download ``/dict/<slug>/`` and build the record stored for one verb.

    The page is requested with the ``hebstyle=mo`` cookie. Root and binyan are
    read from the page where present; the forms come from ``_parse_table``.

    Args:
        slug: Dictionary slug as returned by ``_find_slug``.
        infinitive: The infinitive as written in the input file; stored in the
            record and used as the fallback reference form.

    Returns:
        A dict with keys ``infinitive``, ``slug``, ``root``, ``binyan``,
        ``is_passive``, ``reference_form`` and ``forms``, or ``None`` when the
        request fails or no forms could be parsed.
    """
    page_url = f"{PEALIM_BASE}/dict/{slug}/"
    try:
        resp = session.get(page_url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
    except Exception as e:
        logger.error(f"  Error fetching {page_url}: {e}")
        return None

    soup = BeautifulSoup(resp.text, "lxml")

    # Root: the first menukad span holding Hebrew text that contains a hyphen.
    span_texts = (
        span.get_text(strip=True)
        for span in soup.find_all("span", class_="menukad")
    )
    root = next(
        (
            text
            for text in span_texts
            if text and re.search(r"[\u05d0-\u05ea]", text) and "-" in text
        ),
        "",
    )

    # Binyan: the first known name mentioned in the og:description meta tag.
    binyan = ""
    og_description = soup.find("meta", {"property": "og:description"})
    if og_description:
        description = og_description.get("content", "")
        known_binyanim = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al")
        binyan = next((name for name in known_binyanim if name in description), "")

    forms = _parse_table(soup)
    if not forms:
        logger.warning(f"  No forms found for {slug}")
        return None

    # Passive binyanim are referenced by their past 3ms form, all others by
    # the infinitive; either falls back to the input spelling.
    is_passive = _is_passive_binyan(binyan)
    if is_passive:
        reference_form = forms.get("past_3ms", infinitive)
    else:
        reference_form = forms.get("infinitive", infinitive)

    labelled_forms = {
        key: {
            "form": form,
            "pronoun": PRONOUN_LABELS[key],
            "tense": TENSE_DESCRIPTION.get(key, ""),
        }
        for key, form in forms.items()
        if key in PRONOUN_LABELS
    }
    result = {
        "infinitive": infinitive,
        "slug": slug,
        "root": root,
        "binyan": binyan,
        "is_passive": is_passive,
        "reference_form": reference_form,
        "forms": labelled_forms,
    }

    logger.info(f"  Extracted {len(result['forms'])} forms for {infinitive}")
    return result


def _load_conjugations() -> dict:
    """Return the contents of CONJUGATIONS_PATH, or ``{}`` if the file is absent.

    The file is read as UTF-8 JSON; a malformed file raises the JSON decoder's
    error to the caller.
    """
    if not CONJUGATIONS_PATH.exists():
        return {}
    return json.loads(CONJUGATIONS_PATH.read_text(encoding="utf-8"))


def _save_conjugations(data: dict) -> None:
    """Write *data* to CONJUGATIONS_PATH as indented UTF-8 JSON.

    The JSON is first written to a sibling ``<name>.tmp`` file and then renamed
    over the target. This file is the resume state and is rewritten after
    every verb, so an interrupted or failed dump must not leave a truncated
    file behind: on any error the previous contents stay untouched.

    Args:
        data: The full conjugations mapping to persist.

    Raises:
        OSError: If the directory or file cannot be written.
        TypeError, ValueError: If *data* is not JSON-serialisable.
    """
    CONJUGATIONS_PATH.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = CONJUGATIONS_PATH.with_name(CONJUGATIONS_PATH.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        tmp_path.replace(CONJUGATIONS_PATH)
    finally:
        # After a successful rename the temp file is gone and this is a no-op;
        # after a failure it removes the partial temp file.
        tmp_path.unlink(missing_ok=True)


def main(verbs_file: Path = VERBS_INPUT) -> dict:
    """Read verbs from file and extract conjugations. Returns full conjugations dict.

    Blank lines and lines whose first non-blank character is ``#`` are
    ignored. Every verb not yet present in the stored conjugations is looked
    up, and the result is saved immediately after each verb so that an
    interrupted run can be resumed.

    Args:
        verbs_file: UTF-8 text file with one Hebrew infinitive per line.

    Returns:
        The complete mapping of infinitive -> record (or ``None`` for a verb
        whose lookup or extraction failed). If *verbs_file* does not exist,
        the stored mapping is returned unchanged.
    """
    if not verbs_file.exists():
        logger.warning(f"verbs_input.txt not found at {verbs_file} — skipping")
        return _load_conjugations()

    # "utf-8-sig" reads plain UTF-8 too, but drops a leading BOM that would
    # otherwise become part of the first verb.
    stripped_lines = (
        line.strip()
        for line in verbs_file.read_text(encoding="utf-8-sig").splitlines()
    )
    # Test the stripped line so that indented "# ..." comments are skipped too.
    verbs = [line for line in stripped_lines if line and not line.startswith("#")]
    logger.info(f"Loaded {len(verbs)} verbs from {verbs_file}")

    conjugations = _load_conjugations()
    new_count = 0

    for verb in verbs:
        # NOTE(review): a verb stored as None (failed lookup) also counts as
        # cached and is not retried on later runs.
        if verb in conjugations:
            logger.info(f"Skipping {verb} (cached)")
            continue

        logger.info(f"Processing: {verb}")
        time.sleep(REQUEST_DELAY)
        slug = _find_slug(verb)
        if not slug:
            logger.warning(f"  No slug found for {verb}")
            conjugations[verb] = None
            _save_conjugations(conjugations)
            continue

        time.sleep(REQUEST_DELAY)
        data = _extract_conjugations(slug, verb)
        conjugations[verb] = data
        # Persist after every verb so progress survives an interruption.
        _save_conjugations(conjugations)
        new_count += 1

    logger.info(f"Done: {new_count} new verbs processed")
    return conjugations


if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
    # Run the extraction, then print a short summary per verb: the number of
    # forms, the binyan, and the first three forms as a sample.
    for verb, data in main().items():
        if not data:
            print(f"{verb}: no data")
            continue
        forms = data.get("forms", {})
        print(f"{verb}: {len(forms)} forms, binyan={data.get('binyan')}")
        for k in list(forms)[:3]:
            print(f"  {k}: {forms[k]['form']}")