# hebrew_flash_cards/scripts/extract_verb_list.py

#!/usr/bin/env python3
"""
Extract verb list from Coffin & Bolozky Appendix 1 (pages 390-411).

Citation:
  Coffin, Edna Amir and Shmuel Bolozky. A Reference Grammar of Modern Hebrew.
  Cambridge University Press, 2005.

Downloads the PDF, extracts verb entries by binyan, writes verbs_input.txt.
Pu'al/Huf'al verbs (no infinitive) are written prefixed with '# 3ms:' so the
conjugation extractor searches by 3ms past form instead of infinitive.

If PDF extraction yields < 40 verbs, falls back to the hardcoded list below.
"""

import logging
import re
import sys
import unicodedata
from pathlib import Path

# Module-level logger; handlers and level are configured by main().
logger = logging.getLogger(__name__)

# Directory two levels above this script file.
PROJECT_ROOT = Path(__file__).resolve().parents[1]
PDF_URL = ""  # Set to URL or local path of Coffin & Bolozky PDF
PDF_PATH = Path("/tmp/coffin_bolozky.pdf")
OUTPUT_PATH = PROJECT_ROOT.joinpath("verbs_input.txt")

# Pages to scan (Appendix 1)
PAGE_START, PAGE_END = 390, 411

# One row per binyan: canonical name, vowelled heading, unvowelled heading.
_BINYAN_TABLE = (
    ("Pa'al", "פָּעַל", "פעל"),
    ("Nif'al", "נִפְעַל", "נפעל"),
    ("Pi'el", "פִּעֵל", "פיעל"),
    ("Pu'al", "פֻּעַל", "פועל"),
    ("Hitpa'el", "הִתְפַּעֵל", "התפעל"),
    ("Hif'il", "הִפְעִיל", "הפעיל"),
    ("Huf'al", "הֻפְעַל", "הופעל"),
)

# Binyan heading → canonical name (vowelled spelling first, then unvowelled)
BINYAN_CANONICAL = {
    heading: canonical
    for canonical, *headings in _BINYAN_TABLE
    for heading in headings
}

# Binyan headings in Hebrew, in the same order as the mapping above
BINYAN_HEADINGS_HEB = list(BINYAN_CANONICAL)

# Passive binyan names — no infinitive, use 3ms past
PASSIVE_BINYANS = {"Huf'al", "Pu'al"}


# ─────────────────────────────────────────────────────────────────────────────
# Fallback hardcoded list, grouped by binyan: 73 entries in total
# (65 infinitives + 8 passive forms written as '# 3ms:' lines).
# main() writes this text verbatim to verbs_input.txt when dependencies are
# missing, the PDF cannot be obtained, or extraction yields fewer than 40 verbs.
# NOTE(review): the last Hitpa'el entry has the same consonants (להתחיל) as one
# of the Hif'il entries — confirm it really belongs under Hitpa'el.
# ─────────────────────────────────────────────────────────────────────────────
FALLBACK_VERBS = """# Verb list from Coffin & Bolozky, A Reference Grammar of Modern Hebrew (2005), Appendix 1.
# Citation: Coffin, Edna Amir and Shmuel Bolozky. A Reference Grammar of Modern Hebrew.
#   Cambridge University Press, 2005.
# Lines prefixed '# 3ms:' are Pu'al/Huf'al verbs searched by 3ms past form.

# Pa'al (פָּעַל)
לָלֶכֶת
לָבוֹא
לָשֶׁבֶת
לָקוּם
לָשִׂים
לָדַעַת
לִרְאוֹת
לוֹמַר
לַעֲשׂוֹת
לִתֵּן
לִקְחַת
לֶאֱכֹל
לִשְׁתּוֹת
לִכְתּוֹב
לִקְרוֹא
לִשְׁמוֹר
לִשְׁמֹעַ
לִפְתּוֹחַ
לִסְגּוֹר
לִנְסוֹעַ
לִרְכּוֹב
לִשְׁכַּב
לַחְשׁוֹב
לִבְכּוֹת
לָרוּץ
לִשְׁאֹל
לַעֲנוֹת
לִמְכּוֹר
לִקְנוֹת
לִלְמֹד

# Nif'al (נִפְעַל)
לְהִכָּנֵס
לְהִפָּתַח
לְהִסָּגֵר
לְהִשָּׁמֵר
לְהִמָּצֵא
לְהִרְאוֹת
לְהִכָּתֵב
לְהִשָּׁבֵר

# Pi'el (פִּעֵל)
לְדַבֵּר
לְסַפֵּר
לְבַקֵּשׁ
לְקַבֵּל
לְשַׁלֵּם
לְצַלֵּם
לְנַסּוֹת
לְחַכּוֹת
לְטַלְפֵן
לְבַשֵּׁל

# Pu'al (פֻּעַל) — 3ms past, no infinitive
# 3ms: דֻּבַּר
# 3ms: סֻפַּר
# 3ms: בֻּקַּשׁ
# 3ms: קֻבַּל

# Hitpa'el (הִתְפַּעֵל)
לְהִתְלַבֵּשׁ
לְהִתְרַחֵץ
לְהִתְנַהֵג
לְהִתְחַתֵּן
לְהִתְגּוֹרֵר
לְהִתְכּוֹנֵן
לְהִתְחִיל

# Hif'il (הִפְעִיל)
לְהַגִּיד
לְהַבִּין
לְהַכִּיר
לְהַרְגִּישׁ
לְהַחְלִיט
לְהַתְחִיל
לְהַכְנִיס
לְהוֹצִיא
לְהוֹרִיד
לְהַעְלוֹת

# Huf'al (הֻפְעַל) — 3ms past, no infinitive
# 3ms: הוּגַד
# 3ms: הוּבַן
# 3ms: הוּכְנַס
# 3ms: הוּצָא
"""


def _install_deps():
    """Install pymupdf and python-bidi if not available."""
    try:
        import bidi  # noqa: F401
        import fitz  # noqa: F401

        return True
    except ImportError:
        logger.info("Installing pymupdf and python-bidi …")
        import subprocess

        result = subprocess.run(
            [sys.executable, "-m", "pip", "install", "pymupdf", "python-bidi", "--break-system-packages", "-q"],
            capture_output=True,
        )
        if result.returncode != 0:
            logger.error(f"pip install failed: {result.stderr.decode()[:200]}")
            return False
        return True


def _download_pdf() -> bool:
    """Make the source PDF available at PDF_PATH.

    A cached file larger than 100 kB is reused as-is. Otherwise PDF_URL is
    treated as a URL when it contains ``://`` and as a local file path when
    it does not. The data is first written to a ``.part`` file next to
    PDF_PATH and renamed into place only after it is complete, so a failed
    transfer never leaves a truncated file behind as the cache.

    Returns:
        True if PDF_PATH exists afterwards, False on any failure (including
        an unset PDF_URL or ``requests`` not being installed).
    """
    if PDF_PATH.exists() and PDF_PATH.stat().st_size > 100_000:
        logger.info(f"PDF already cached at {PDF_PATH}")
        return True

    if not PDF_URL:
        # Nothing to fetch; fail with a clear message instead of sending an
        # empty URL to requests.
        logger.error("PDF download failed: PDF_URL is not set")
        return False

    partial = PDF_PATH.with_name(PDF_PATH.name + ".part")
    try:
        if "://" in PDF_URL:
            logger.info(f"Downloading PDF from {PDF_URL} …")
            import requests

            # The context manager releases the connection even if a chunk
            # read or the file write fails mid-stream.
            with requests.get(PDF_URL, timeout=120, stream=True) as resp:
                resp.raise_for_status()
                with open(partial, "wb") as out:
                    for chunk in resp.iter_content(chunk_size=1 << 16):
                        out.write(chunk)
        else:
            import shutil

            source = Path(PDF_URL).expanduser()
            logger.info(f"Copying PDF from {source} …")
            shutil.copyfile(source, partial)

        partial.replace(PDF_PATH)
        logger.info(f"PDF downloaded: {PDF_PATH.stat().st_size / 1e6:.1f} MB")
        return True
    except Exception as e:
        # Best-effort by design: the caller falls back to the hardcoded list.
        logger.error(f"PDF download failed: {e}")
        try:
            partial.unlink(missing_ok=True)
        except OSError:
            pass
        return False


def _needs_bidi_fix(text: str) -> bool:
    """
    Heuristic: if Hebrew text starts with what would be a word-final letter
    (like ר, ל, ם, ן, ף, ך) where logically it should start with alef/bet/etc.
    for known verb forms, it's likely visually ordered and needs reversing.
    """
    # Simple check: does a clearly right-to-left prefix appear reversed?
    # Try to find לשמור or similar — if we see רומשל instead, we need bidi.
    heb_words = re.findall(r"[\u05d0-\u05ea\u05b0-\u05c7]+", text)
    if not heb_words:
        return False
    # Word starting with ל is common for infinitives in Pa'al/Nif'al/Pi'el/Hif'il
    # If we see words ending in ל instead of starting, likely reversed
    starts_with_lamed = sum(1 for w in heb_words[:50] if w.startswith("ל"))
    ends_with_lamed = sum(1 for w in heb_words[:50] if w.endswith("ל"))
    return ends_with_lamed > starts_with_lamed


def _strip_nikkud(text: str) -> str:
    return "".join(ch for ch in unicodedata.normalize("NFD", text) if unicodedata.category(ch) != "Mn")


def _extract_from_pdf() -> list[tuple[str, str, str]]:
    """
    Extract verb entries from PDF pages PAGE_START-PAGE_END (1-based, inclusive).

    Each page's text is scanned line by line. A line that is exactly a binyan
    heading switches the current binyan and is not scanned for verbs; lines
    before the first heading are ignored. Within a binyan, a line containing
    the infinitive label (שם פועל, with or without nikkud) contributes its
    remaining Hebrew words. In the passive binyans, other lines contribute
    words whose first consonant is פ (Pu'al) or ה (Huf'al).

    Returns:
        List of (binyan_name, form_type, form) tuples, where form_type is
        'infinitive' or '3ms'. Empty if pymupdf is not installed, PDF_PATH
        does not exist, or the file cannot be opened.
    """
    try:
        import fitz
    except ImportError:
        logger.error("pymupdf not available")
        return []

    if not PDF_PATH.exists():
        return []

    try:
        doc = fitz.open(str(PDF_PATH))
    except Exception as e:
        logger.error(f"Cannot open PDF: {e}")
        return []

    hebrew_letter_re = re.compile(r"[\u05d0-\u05ea]")
    hebrew_word_re = re.compile(r"[\u05d0-\u05ea\u05b0-\u05c7]{3,}")

    # Heading lookup tables. The exact table is keyed on the NFD form so that
    # differences in combining-mark order do not matter. The bare table is
    # keyed on the nikkud-stripped form and keeps the FIRST heading per key,
    # so a bare "פעל" still means Pa'al as before.
    exact_headings = {
        unicodedata.normalize("NFD", head): BINYAN_CANONICAL.get(head, head)
        for head in BINYAN_HEADINGS_HEB
    }
    bare_headings: dict[str, str] = {}
    for head in BINYAN_HEADINGS_HEB:
        bare_headings.setdefault(_strip_nikkud(head), BINYAN_CANONICAL.get(head, head))

    def match_heading(line: str):
        # An exact (vowelled) match must win: with nikkud stripped, the Pa'al,
        # Pi'el and Pu'al headings all collapse to the same consonants.
        exact = exact_headings.get(unicodedata.normalize("NFD", line))
        if exact is not None:
            return exact
        return bare_headings.get(_strip_nikkud(line))

    # Infinitive label, compared on nikkud-stripped text so both the vowelled
    # and the plene spelling are recognised regardless of mark order.
    label_phrases = (_strip_nikkud("שֵׁם פֹּעַל"), "שם פועל")
    label_words = {_strip_nikkud(w) for w in ("שֵׁם", "פֹּעַל", "שם", "פועל")}

    # NOTE(review): selecting passive forms by the pattern name's first letter
    # only catches Pu'al words that happen to begin with פ — confirm against
    # the actual page layout.
    passive_initial = {"Pu'al": "פ", "Huf'al": "ה"}

    entries = []
    current_binyan = None

    try:
        first_idx = PAGE_START - 1  # fitz page indices are 0-based
        end_idx = min(PAGE_END, doc.page_count)  # exclusive; keeps the last page

        # Sample the first pages of the scanned range to decide on bidi correction.
        test_text = ""
        try:
            for page_num in range(first_idx, min(first_idx + 3, end_idx)):
                test_text += doc[page_num].get_text("text")
        except Exception as e:
            # Best-effort: an unreadable sample just means less text to judge by.
            logger.debug(f"Bidi sample read failed: {e}")

        use_bidi = _needs_bidi_fix(test_text)
        logger.info(f"PDF bidi correction: {'YES' if use_bidi else 'NO'}")
        logger.debug(f"Sample text (first 300 chars): {repr(test_text[:300])}")

        get_display = None
        if use_bidi:
            try:
                from bidi.algorithm import get_display
            except ImportError:
                logger.warning("python-bidi not available — text left uncorrected")

        def fix_text(t: str) -> str:
            if get_display is None:
                return t
            try:
                # Only lines containing Hebrew letters are re-ordered.
                return "\n".join(
                    get_display(ln)[::-1] if hebrew_letter_re.search(ln) else ln
                    for ln in t.split("\n")
                )
            except Exception:
                return t

        for page_num in range(first_idx, end_idx):
            try:
                raw = doc[page_num].get_text("text")
            except Exception as e:
                logger.debug(f"Skipping unreadable page {page_num + 1}: {e}")
                continue

            for raw_line in fix_text(raw).split("\n"):
                line = raw_line.strip()
                if not line:
                    continue

                heading = match_heading(line)
                if heading is not None:
                    current_binyan = heading
                    logger.info(f"  Binyan heading found: {current_binyan} (page {page_num + 1})")
                    # The heading is not a verb; without this, passive
                    # headings would be recorded as 3ms forms below.
                    continue

                if current_binyan is None:
                    continue

                is_passive = current_binyan in PASSIVE_BINYANS
                bare_line = _strip_nikkud(line)

                if any(phrase in bare_line for phrase in label_phrases):
                    # Everything on a labelled line except the label words
                    # themselves is taken as a verb form.
                    form_type = "3ms" if is_passive else "infinitive"
                    for word in hebrew_word_re.findall(line):
                        if _strip_nikkud(word) not in label_words:
                            entries.append((current_binyan, form_type, word))
                elif is_passive:
                    initial = passive_initial.get(current_binyan)
                    if initial is None:
                        continue
                    for word in hebrew_word_re.findall(line):
                        if _strip_nikkud(word).startswith(initial):
                            entries.append((current_binyan, "3ms", word))
    finally:
        # Release the document even if scanning raises unexpectedly.
        doc.close()

    logger.info(f"PDF extraction complete: {len(entries)} raw entries")
    return entries


def _write_output(entries: list[tuple[str, str, str]]) -> None:
    """
    Render *entries* as verbs_input.txt at OUTPUT_PATH and log a summary.

    Entries without any Hebrew letter are dropped, and repeats of the same
    (binyan, nikkud-stripped form) pair are written only once. A section
    comment is emitted each time the binyan changes; '3ms' entries are
    written as '# 3ms:' comment lines, all others as plain lines.
    """
    header = [
        "# Verb list from Coffin & Bolozky, A Reference Grammar of Modern Hebrew (2005), Appendix 1.",
        "# Citation: Coffin, Edna Amir and Shmuel Bolozky. A Reference Grammar of Modern Hebrew.",
        "#   Cambridge University Press, 2005.",
        "# Lines prefixed '# 3ms:' are Pu'al/Huf'al verbs searched by 3ms past form.",
        "",
    ]

    body = []
    emitted = set()
    section = None

    for binyan, form_type, form in entries:
        if not (form and re.search(r"[\u05d0-\u05ea]", form)):
            continue

        dedupe_key = (binyan, _strip_nikkud(form))
        if dedupe_key in emitted:
            continue
        emitted.add(dedupe_key)

        if section != binyan:
            section = binyan
            body.append(f"# {binyan} ({_binyan_heb(binyan)})")

        body.append(f"# 3ms: {form}" if form_type == "3ms" else form)

    lines = header + body
    OUTPUT_PATH.write_text("\n".join(lines) + "\n", encoding="utf-8")

    # Summary counts are taken from the rendered lines themselves.
    verb_count = len([ln for ln in lines if ln and ln[0] != "#"])
    passive_count = len([ln for ln in lines if ln.startswith("# 3ms:")])
    logger.info(f"Written {verb_count} active verbs + {passive_count} passive (3ms) → {OUTPUT_PATH}")


def _binyan_heb(name: str) -> str:
    mapping = {
        "Pa'al": "פָּעַל",
        "Nif'al": "נִפְעַל",
        "Pi'el": "פִּעֵל",
        "Pu'al": "פֻּעַל",
        "Hitpa'el": "הִתְפַּעֵל",
        "Hif'il": "הִפְעִיל",
        "Huf'al": "הֻפְעַל",
    }
    return mapping.get(name, name)


def main():
    """
    Produce verbs_input.txt, preferring PDF extraction over the fallback list.

    The hardcoded FALLBACK_VERBS text is written instead when dependencies
    cannot be made available, the PDF cannot be obtained, or fewer than 40
    unique forms (infinitives plus passive 3ms) are extracted.
    """
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
    logger.info("=== Coffin & Bolozky Verb Extractor ===")

    def use_fallback(reason: str) -> None:
        # Warn first, then write, for every fallback route.
        logger.warning(reason)
        OUTPUT_PATH.write_text(FALLBACK_VERBS, encoding="utf-8")

    # Install deps
    if not _install_deps():
        use_fallback("Dependencies not available — using fallback list")
        return

    # Download PDF
    if not _download_pdf():
        use_fallback("PDF download failed — using fallback list")
        return

    # Extract from PDF and count distinct forms per type, ignoring nikkud
    entries = _extract_from_pdf()
    infinitives = set()
    passives = set()
    for _, form_type, form in entries:
        if form_type == "infinitive":
            infinitives.add(_strip_nikkud(form))
        elif form_type == "3ms":
            passives.add(_strip_nikkud(form))
    unique_verbs = len(infinitives)
    unique_passive = len(passives)
    logger.info(f"Unique infinitives: {unique_verbs}, passive 3ms: {unique_passive}")

    if unique_verbs + unique_passive >= 40:
        _write_output(entries)
        return

    use_fallback(
        f"Only {unique_verbs + unique_passive} verbs extracted — below threshold of 40. "
        "Using fallback hardcoded list."
    )
    logger.info(f"Fallback list written → {OUTPUT_PATH}")


# Run the extractor only when executed as a script, not when imported.
if __name__ == "__main__":
    main()