# hebrew_flash_cards/validate_verb_list.py

#!/usr/bin/env python3
"""
Validate nevo_typed_verbs_from_modern_hebrew against pealim.com.

For each verb:
  1. Classifies it by its position among the non-blank lines of the file
     (Pa'al/Nif'al/Pi'el/Pu'al/Hitpa'el/Hif'il/Huf'al)
  2. Searches pealim.com and takes the URL slug of the first result
  3. Fetches that page to confirm the binyan
  4. Flags known-problem entries (including the suspected typos listed in
     KNOWN_ISSUES) and detects: not-found, binyan mismatch

Output:
  verbs_input.txt  — cleaned verb list for conjugation_extract.py
  Printed validation report table

Usage:
  python3 validate_verb_list.py

After running, review verbs_input.txt (especially REVIEW-flagged entries) before
running conjugation extraction.
"""

import re
import sys
import time
import urllib.parse
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# Base URL of pealim.com; the search and dictionary URLs are built from it.
PEALIM_BASE = "https://www.pealim.com"
# Pause, in seconds, passed to time.sleep() before each HTTP request.
REQUEST_DELAY = 1.5
# Value passed as `timeout=` to every session.get() call.
REQUEST_TIMEOUT = 15
# Input word list and generated output; both live next to this script.
SOURCE_FILE = Path(__file__).parent / "nevo_typed_verbs_from_modern_hebrew"
OUTPUT_FILE = Path(__file__).parent / "verbs_input.txt"

# Known problem entries: word → (action, note)
# action: "REVIEW" = comment out and flag, "3ms" = treat as 3ms past form
# "REVIEW" entries are never queried on pealim.com; "3ms" entries are still
# queried but are always reported with status "3ms".
KNOWN_ISSUES: dict[str, tuple[str, str]] = {
    "לגבוה": ("REVIEW", "not a standard infinitive form; likely defective spelling or wrong word"),
    "לההרג": ("REVIEW", "extra ה; should probably be להיהרג (Nif'al of הרג)"),
    "להתלקלח": ("REVIEW", "not a real word; likely typo for להתקלקל"),
    "להקלל": ("REVIEW", "ambiguous: could be Hif'il לְהָקֵל (to ease) or Nif'al of קלל"),
    "המציא": ("3ms", "Hif'il 3ms past form, not an infinitive"),
    "קומם": ("3ms", "ambiguous: Pu'al 3ms past; Pi'el infinitive is לְקוֹמֵם"),
}

# Expected binyan by line range (1-indexed) per plan analysis
# Each range end is exclusive, e.g. range(1, 18) covers positions 1-17.
# Positions are counted over non-blank lines only (blank lines are dropped
# before the entries are enumerated in main()).
LINE_RANGES: list[tuple[range, str]] = [
    (range(1, 18), "Pa'al"),
    (range(18, 29), "Nif'al"),
    (range(29, 37), "Pi'el"),
    (range(37, 43), "Pu'al"),
    (range(43, 53), "Hitpa'el"),
    (range(53, 63), "Hif'il"),
    (range(63, 71), "Huf'al"),
]

# Comment header written above each binyan section of verbs_input.txt.
# The dict's iteration order is the order of the sections in the output file.
SECTION_HEADERS: dict[str, str] = {
    "Pa'al": "# Pa'al (פָּעַל)",
    "Nif'al": "# Nif'al (נִפְעַל)",
    "Pi'el": "# Pi'el (פִּעֵל)",
    "Pu'al": "# Pu'al (פֻּעַל) — 3ms past, no infinitive",
    "Hitpa'el": "# Hitpa'el (הִתְפַּעֵל)",
    "Hif'il": "# Hif'il (הִפְעִיל)",
    "Huf'al": "# Huf'al (הֻפְעַל) — 3ms past, no infinitive",
}

# One HTTP session shared by all requests, created at import time and sending
# a custom User-Agent header.
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki/3.0)"})


def classify_by_line(line_num: int) -> str:
    """Map a 1-indexed entry position to the binyan expected at that position.

    The first range in LINE_RANGES that contains ``line_num`` wins; when no
    range contains it, the string "Unknown" is returned.
    """
    candidates = (name for span, name in LINE_RANGES if line_num in span)
    return next(candidates, "Unknown")


def find_slug(query: str) -> str | None:
    """Look up ``query`` on pealim.com and return the first /dict/ slug in the response.

    Returns None when the response contains no dictionary link, or when the
    request fails for any reason (the error is reported on stderr).
    """
    search_url = PEALIM_BASE + "/search/?q=" + urllib.parse.quote(query)
    try:
        response = session.get(search_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        # Slugs look like "<digits>-<name>"; only the first link on the page is used.
        first_hit = re.search(r"/dict/(\d+[-][^/?\"<>\s]+)/", response.text)
    except Exception as e:
        # Best effort: a failed lookup is reported and treated as "no result".
        print(f"  ERROR searching {query!r}: {e}", file=sys.stderr)
        return None
    if first_hit is None:
        return None
    return first_hit.group(1)


def get_page_binyan(slug: str) -> str:
    """Download /dict/<slug>/ and return the binyan name found on the page.

    The ``h3.page-header`` elements are checked first, then the
    ``og:description`` meta tag. Returns an empty string when no binyan name
    is found or when fetching/parsing fails (the error is reported on stderr).
    """
    known_binyanim = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al")

    def first_binyan_in(text: str) -> str:
        # First known binyan name (in list order) contained in text, else "".
        return next((name for name in known_binyanim if name in text), "")

    page_url = f"{PEALIM_BASE}/dict/{slug}/"
    try:
        response = session.get(page_url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        document = BeautifulSoup(response.text, "lxml")

        for header in document.find_all("h3", class_="page-header"):
            found = first_binyan_in(header.get_text(" ", strip=True))
            if found:
                return found

        # Fall back to the Open Graph description when no header matched.
        og_description = document.find("meta", {"property": "og:description"})
        if og_description:
            return first_binyan_in(og_description.get("content", ""))
    except Exception as e:
        # Best effort: a failed fetch is reported and treated as "no binyan".
        print(f"  ERROR fetching {slug}: {e}", file=sys.stderr)
    return ""


def _check_entry(line_num: int, word: str, total: int) -> dict:
    """Validate one entry against pealim.com and return its result record.

    Args:
        line_num: 1-indexed position of the entry among the non-blank source lines.
        word: The entry text as read from the source file.
        total: Number of entries; used only in the progress prefix.

    Returns:
        A dict with the keys "line", "word", "expected_binyan", "slug",
        "page_binyan", "status", "notes" and "is_3ms". "status" is one of
        "REVIEW", "3ms", "NOT_FOUND", "MISMATCH" or "OK".
    """
    expected_binyan = classify_by_line(line_num)
    issue_type, issue_note = KNOWN_ISSUES.get(word, (None, ""))

    # Positions 37-42 (Pu'al) and 63-70 (Huf'al) are 3ms past forms
    is_3ms_by_position = expected_binyan in ("Pu'al", "Huf'al")

    print(f"[{line_num:2d}/{total}] {word:<20}", end=" ", flush=True)

    if issue_type == "REVIEW":
        # Don't query pealim for known-bad entries
        print("REVIEW  (skipping query)")
        return {
            "line": line_num,
            "word": word,
            "expected_binyan": expected_binyan,
            "slug": "",
            "page_binyan": "",
            "status": "REVIEW",
            "notes": issue_note,
            "is_3ms": is_3ms_by_position,
        }

    # Throttle: pause before every request sent to pealim.com.
    time.sleep(REQUEST_DELAY)
    slug = find_slug(word)

    page_binyan = ""
    if slug:
        time.sleep(REQUEST_DELAY)
        page_binyan = get_page_binyan(slug)

    # Determine status
    is_3ms = is_3ms_by_position or issue_type == "3ms"
    if is_3ms:
        status = "3ms"
        notes = issue_note or "Pu'al/Huf'al 3ms past form"
    elif not slug:
        status = "NOT_FOUND"
        notes = "no search result on pealim.com"
    elif not page_binyan:
        # The entry exists, but its page could not be fetched or named no
        # binyan. Keep the OK status, but say so instead of passing silently.
        status = "OK"
        notes = "binyan could not be confirmed from the pealim.com page"
    elif expected_binyan and page_binyan != expected_binyan:
        status = "MISMATCH"
        notes = f"expected {expected_binyan}, page says {page_binyan}"
    else:
        status = "OK"
        notes = ""

    print(f"{status:<12}  slug={slug or '-':<35}  binyan={page_binyan or '-'}")
    return {
        "line": line_num,
        "word": word,
        "expected_binyan": expected_binyan,
        "slug": slug or "",
        "page_binyan": page_binyan,
        "status": status,
        "notes": notes,
        "is_3ms": is_3ms,
    }


def _write_verbs_input(results: list[dict]) -> None:
    """Write the cleaned, sectioned verb list to OUTPUT_FILE.

    Entries are grouped under the header of their expected binyan. An entry
    whose position is outside LINE_RANGES has no expected binyan: it is filed
    under the binyan reported by pealim.com when that is known, otherwise
    under the first section, and a warning is printed to stderr either way.
    """
    sections: dict[str, list[str]] = {b: [] for b in SECTION_HEADERS}
    fallback_section = next(iter(sections))
    review_lines: list[str] = []

    for r in results:
        if r["status"] == "REVIEW":
            # REVIEW entries are collected at the end of the file, not in a section.
            review_lines.append(f"# REVIEW: {r['word']}  — {r['notes']}")
            continue

        b = r["expected_binyan"]
        if b not in sections:
            b = r["page_binyan"] if r["page_binyan"] in sections else fallback_section
            print(
                f"  WARNING: entry {r['line']} ({r['word']}) is outside the expected "
                f"line ranges; filed under {b}",
                file=sys.stderr,
            )

        if r["status"] == "3ms":
            sections[b].append(f"# 3ms: {r['word']}")
        elif r["status"] in ("OK", "MISMATCH"):
            sections[b].append(r["word"])
        else:  # NOT_FOUND
            sections[b].append(f"# NOT_FOUND: {r['word']}  — {r['notes']}")

    output_lines = [
        "# Verb list — validated against pealim.com from nevo_typed_verbs_from_modern_hebrew",
        "# Lines prefixed '# 3ms:' are searched by 3ms past form (Pu'al/Huf'al).",
        "# Lines prefixed '# REVIEW:' need manual correction before conjugation extraction.",
        "# Lines prefixed '# NOT_FOUND:' had no pealim.com result — check spelling.",
        "",
    ]
    for binyan, header in SECTION_HEADERS.items():
        if sections[binyan]:
            output_lines.append(header)
            output_lines.extend(sections[binyan])
            output_lines.append("")

    if review_lines:
        output_lines.append("# ── Entries flagged for manual review ──────────────────────────────────────────")
        output_lines.extend(review_lines)
        output_lines.append("")

    OUTPUT_FILE.write_text("\n".join(output_lines), encoding="utf-8")


def _print_report(results: list[dict]) -> None:
    """Print the per-entry validation table followed by the status summary."""
    print("\n" + "=" * 95)
    print("VALIDATION REPORT")
    print("=" * 95)
    print(f"{'#':>4}  {'Verb':<22} {'Status':<14} {'Slug':<38} {'Binyan':<12}  Notes")
    print("-" * 95)
    for r in results:
        print(
            f"{r['line']:>4}  {r['word']:<22} {r['status']:<14} "
            f"{r['slug'][:36]:<38} {r['page_binyan'] or '-':<12}  {r['notes']}"
        )
    print("=" * 95)

    counts = {s: sum(1 for r in results if r["status"] == s) for s in ("OK", "3ms", "MISMATCH", "REVIEW", "NOT_FOUND")}
    print(
        f"\nSummary: {counts['OK']} OK | {counts['3ms']} 3ms-past | "
        f"{counts['MISMATCH']} MISMATCH | {counts['REVIEW']} REVIEW | {counts['NOT_FOUND']} NOT_FOUND"
    )
    print(f"Total entries: {len(results)}")

    if counts["REVIEW"] > 0 or counts["NOT_FOUND"] > 0 or counts["MISMATCH"] > 0:
        print("\n⚠  Review flagged entries in verbs_input.txt before running:\n   python3 conjugation_extract.py")


def main() -> None:
    """Validate every entry of SOURCE_FILE and write OUTPUT_FILE plus a report.

    Exits with status 1 when SOURCE_FILE does not exist. Otherwise each
    non-blank line is checked against pealim.com, the cleaned list is written
    to OUTPUT_FILE, and a validation table with a summary is printed.
    """
    if not SOURCE_FILE.exists():
        print(f"ERROR: {SOURCE_FILE} not found", file=sys.stderr)
        sys.exit(1)

    # "utf-8-sig" reads plain UTF-8 unchanged but drops a leading BOM, which
    # would otherwise become part of the first word and break its lookup.
    raw_text = SOURCE_FILE.read_text(encoding="utf-8-sig")
    lines = [line.strip() for line in raw_text.splitlines() if line.strip()]
    print(f"Loaded {len(lines)} entries from {SOURCE_FILE.name}")
    print(f"Querying pealim.com (delay {REQUEST_DELAY}s per request)…\n")

    results = [_check_entry(line_num, word, len(lines)) for line_num, word in enumerate(lines, start=1)]

    # ── Write cleaned verbs_input.txt ────────────────────────────────────────────
    _write_verbs_input(results)
    print(f"\nWrote → {OUTPUT_FILE}")

    # ── Print summary table ──────────────────────────────────────────────────────
    _print_report(results)


# Run the validation only when executed as a script, not when imported.
if __name__ == "__main__":
    main()