#!/usr/bin/env python3
"""
Extract verb list from Coffin & Bolozky Appendix 1 (pages 390-411).

Citation: Coffin, Edna Amir and Shmuel Bolozky. A Reference Grammar of Modern Hebrew.
Cambridge University Press, 2005.

Downloads the PDF, extracts verb entries by binyan, writes verbs_input.txt.
Pu'al/Huf'al verbs (no infinitive) are written prefixed with '# 3ms:' so the
conjugation extractor searches by 3ms past form instead of infinitive.

If PDF extraction yields < 40 verbs, falls back to the hardcoded list below.
"""

import importlib
import logging
import re
import shutil
import sys
import unicodedata
from pathlib import Path
from typing import Optional

logger = logging.getLogger(__name__)

PROJECT_ROOT = Path(__file__).resolve().parent.parent
PDF_URL = ""  # Set to URL or local path of Coffin & Bolozky PDF
PDF_PATH = Path("/tmp/coffin_bolozky.pdf")
OUTPUT_PATH = PROJECT_ROOT / "verbs_input.txt"

# Pages to scan (Appendix 1), 1-based and inclusive.
# NOTE(review): these are used directly as PDF page numbers; if the PDF has
# front matter the printed page numbers may be offset — confirm against the file.
PAGE_START = 390
PAGE_END = 411

# Minimum number of unique extracted forms before the PDF result is trusted.
MIN_EXTRACTED_VERBS = 40

# Binyan heading → canonical name (vowelled and unvowelled variants).
# Order matters: the vowelled spelling of each binyan comes first.
BINYAN_CANONICAL = {
    "פָּעַל": "Pa'al",
    "פעל": "Pa'al",
    "נִפְעַל": "Nif'al",
    "נפעל": "Nif'al",
    "פִּעֵל": "Pi'el",
    "פיעל": "Pi'el",
    "פֻּעַל": "Pu'al",
    "פועל": "Pu'al",
    "הִתְפַּעֵל": "Hitpa'el",
    "התפעל": "Hitpa'el",
    "הִפְעִיל": "Hif'il",
    "הפעיל": "Hif'il",
    "הֻפְעַל": "Huf'al",
    "הופעל": "Huf'al",
}

# Binyan headings in Hebrew (vowelled and unvowelled variants), same order as above.
BINYAN_HEADINGS_HEB = list(BINYAN_CANONICAL)

# Passive binyan names — no infinitive, use 3ms past
PASSIVE_BINYANS = {"Pu'al", "Huf'al"}

# ─────────────────────────────────────────────────────────────────────────────
# Fallback hardcoded list: 73 entries (65 infinitives + 8 passive 3ms forms)
# attributed to Coffin & Bolozky Appendix 1.
# ─────────────────────────────────────────────────────────────────────────────
FALLBACK_VERBS = """# Verb list from Coffin & Bolozky, A Reference Grammar of Modern Hebrew (2005), Appendix 1.
# Citation: Coffin, Edna Amir and Shmuel Bolozky. A Reference Grammar of Modern Hebrew.
# Cambridge University Press, 2005.
# Lines prefixed '# 3ms:' are Pu'al/Huf'al verbs searched by 3ms past form.

# Pa'al (פָּעַל)
לָלֶכֶת
לָבוֹא
לָשֶׁבֶת
לָקוּם
לָשִׂים
לָדַעַת
לִרְאוֹת
לוֹמַר
לַעֲשׂוֹת
לִתֵּן
לִקְחַת
לֶאֱכֹל
לִשְׁתּוֹת
לִכְתּוֹב
לִקְרוֹא
לִשְׁמוֹר
לִשְׁמֹעַ
לִפְתּוֹחַ
לִסְגּוֹר
לִנְסוֹעַ
לִרְכּוֹב
לִשְׁכַּב
לַחְשׁוֹב
לִבְכּוֹת
לָרוּץ
לִשְׁאֹל
לַעֲנוֹת
לִמְכּוֹר
לִקְנוֹת
לִלְמֹד

# Nif'al (נִפְעַל)
לְהִכָּנֵס
לְהִפָּתַח
לְהִסָּגֵר
לְהִשָּׁמֵר
לְהִמָּצֵא
לְהִרְאוֹת
לְהִכָּתֵב
לְהִשָּׁבֵר

# Pi'el (פִּעֵל)
לְדַבֵּר
לְסַפֵּר
לְבַקֵּשׁ
לְקַבֵּל
לְשַׁלֵּם
לְצַלֵּם
לְנַסּוֹת
לְחַכּוֹת
לְטַלְפֵן
לְבַשֵּׁל

# Pu'al (פֻּעַל) — 3ms past, no infinitive
# 3ms: דֻּבַּר
# 3ms: סֻפַּר
# 3ms: בֻּקַּשׁ
# 3ms: קֻבַּל

# Hitpa'el (הִתְפַּעֵל)
לְהִתְלַבֵּשׁ
לְהִתְרַחֵץ
לְהִתְנַהֵג
לְהִתְחַתֵּן
לְהִתְגּוֹרֵר
לְהִתְכּוֹנֵן
לְהִתְחִיל

# Hif'il (הִפְעִיל)
לְהַגִּיד
לְהַבִּין
לְהַכִּיר
לְהַרְגִּישׁ
לְהַחְלִיט
לְהַתְחִיל
לְהַכְנִיס
לְהוֹצִיא
לְהוֹרִיד
לְהַעְלוֹת

# Huf'al (הֻפְעַל) — 3ms past, no infinitive
# 3ms: הוּגַד
# 3ms: הוּבַן
# 3ms: הוּכְנַס
# 3ms: הוּצָא
"""

# Any run of Hebrew letters and points (used for the bidi heuristic).
_HEB_RUN_RE = re.compile(r"[\u05d0-\u05ea\u05b0-\u05c7]+")
# Candidate verb forms: runs of at least three Hebrew code points.
_HEB_WORD_RE = re.compile(r"[\u05d0-\u05ea\u05b0-\u05c7]{3,}")
# At least one Hebrew consonant.
_HEB_LETTER_RE = re.compile(r"[\u05d0-\u05ea]")

# Vowelled label that marks the infinitive in the appendix tables.
_INFINITIVE_LABEL = "שֵׁם פֹּעַל"


def _install_deps():
    """Make sure pymupdf (``fitz``) and python-bidi are importable.

    If either import fails, tries a one-off ``pip install`` with the running
    interpreter.

    Returns:
        True when the modules were already importable or pip exited with
        status 0; False when pip failed.
    """
    try:
        import bidi  # noqa: F401
        import fitz  # noqa: F401

        return True
    except ImportError:
        logger.info("Installing pymupdf and python-bidi …")
        import subprocess

        # NOTE(review): --break-system-packages overrides the PEP 668 guard and
        # is not recognised by older pip releases; a virtualenv would be safer.
        result = subprocess.run(
            [
                sys.executable,
                "-m",
                "pip",
                "install",
                "pymupdf",
                "python-bidi",
                "--break-system-packages",
                "-q",
            ],
            capture_output=True,
        )
        if result.returncode != 0:
            stderr_text = result.stderr.decode(errors="replace")[:200]
            logger.error(f"pip install failed: {stderr_text}")
            return False
        # Packages installed after interpreter start-up may be hidden by the
        # import system's directory caches until they are invalidated.
        importlib.invalidate_caches()
        return True


def _download_pdf() -> bool:
    """Make the PDF available at ``PDF_PATH``.

    ``PDF_URL`` may be an HTTP(S) URL or a path to a local file. An existing
    file at ``PDF_PATH`` larger than 100 kB is treated as a valid cache.

    Returns:
        True on success (cached, copied or downloaded), False otherwise.
    """
    if PDF_PATH.exists() and PDF_PATH.stat().st_size > 100_000:
        logger.info(f"PDF already cached at {PDF_PATH}")
        return True

    if not PDF_URL:
        logger.error("PDF download failed: PDF_URL is not set")
        return False

    # Local file: copy instead of going through HTTP.
    if "://" not in PDF_URL:
        local_source = Path(PDF_URL).expanduser()
        if local_source.is_file():
            try:
                shutil.copyfile(local_source, PDF_PATH)
            except OSError as e:
                logger.error(f"PDF copy failed: {e}")
                return False
            logger.info(f"PDF copied from {local_source}")
            return True

    logger.info(f"Downloading PDF from {PDF_URL} …")
    # Download next to the target and move into place only when complete, so
    # an interrupted transfer is never mistaken for a valid cached copy.
    partial_path = PDF_PATH.with_name(PDF_PATH.name + ".part")
    try:
        import requests

        with requests.get(PDF_URL, timeout=120, stream=True) as resp:
            resp.raise_for_status()
            with open(partial_path, "wb") as fh:
                for chunk in resp.iter_content(chunk_size=65536):
                    fh.write(chunk)
        partial_path.replace(PDF_PATH)
        logger.info(f"PDF downloaded: {PDF_PATH.stat().st_size / 1e6:.1f} MB")
        return True
    except Exception as e:  # best-effort: the caller falls back to the hardcoded list
        logger.error(f"PDF download failed: {e}")
        try:
            partial_path.unlink()
        except OSError:
            pass
        return False


def _needs_bidi_fix(text: str) -> bool:
    """Guess whether extracted Hebrew text is in visual (reversed) order.

    Heuristic: most infinitives begin with lamed. Among the first 50 Hebrew
    words, if more words *end* with lamed than *start* with it, the text is
    probably reversed (e.g. רומשל instead of לשמור).

    Returns:
        True when reversal is suspected; False otherwise, including when the
        text contains no Hebrew at all.
    """
    sample = _HEB_RUN_RE.findall(text)[:50]
    if not sample:
        return False
    starts_with_lamed = sum(1 for w in sample if w.startswith("ל"))
    ends_with_lamed = sum(1 for w in sample if w.endswith("ל"))
    return ends_with_lamed > starts_with_lamed


def _strip_nikkud(text: str) -> str:
    """Return *text* NFD-normalised with all non-spacing marks (nikkud) removed."""
    return "".join(
        ch for ch in unicodedata.normalize("NFD", text) if unicodedata.category(ch) != "Mn"
    )


def _build_heading_tables() -> tuple[dict[str, str], dict[str, str]]:
    """Build the exact (NFD) and nikkud-stripped heading lookup tables."""
    exact: dict[str, str] = {}
    stripped: dict[str, str] = {}
    for heading, canonical in BINYAN_CANONICAL.items():
        exact[unicodedata.normalize("NFD", heading)] = canonical
        # Several vowelled headings strip to the same letters (Pa'al, Pi'el and
        # Pu'al all become פעל); the first one listed keeps the stripped slot.
        stripped.setdefault(_strip_nikkud(heading), canonical)
    return exact, stripped


_HEADING_EXACT, _HEADING_STRIPPED = _build_heading_tables()

# Stripped spellings of the infinitive label (defective and plene).
_INFINITIVE_LABELS_STRIPPED = (_strip_nikkud(_INFINITIVE_LABEL), "שם פועל")
# Individual label words, stripped, so they are not mistaken for verbs.
_LABEL_WORDS_STRIPPED = {
    _strip_nikkud(w) for w in (*_INFINITIVE_LABEL.split(), "שם", "פועל")
}
# First letter expected of a 3ms past form in each passive binyan.
# NOTE(review): this only fits forms built on the paradigm root itself; real
# Pu'al verbs such as דֻּבַּר do not begin with פ — confirm against the PDF layout.
_PASSIVE_3MS_PREFIX = {"Pu'al": "פ", "Huf'al": "ה"}


def _match_binyan_heading(line: str) -> Optional[str]:
    """Return the canonical binyan name if *line* is a binyan heading, else None.

    An exact (vowelled) match wins over the nikkud-stripped comparison, because
    stripping makes the Pa'al, Pi'el and Pu'al headings indistinguishable.
    """
    exact = _HEADING_EXACT.get(unicodedata.normalize("NFD", line))
    if exact is not None:
        return exact
    return _HEADING_STRIPPED.get(_strip_nikkud(line))


def _entries_from_line(line: str, binyan: str) -> list[tuple[str, str, str]]:
    """Extract ``(binyan, form_type, form)`` tuples from one non-heading line.

    Lines carrying the infinitive label contribute every other Hebrew word on
    the line. In passive binyans, unlabelled lines contribute words starting
    with the binyan's expected first letter. Anything else yields nothing.
    """
    words = _HEB_WORD_RE.findall(line)
    stripped_line = _strip_nikkud(line)

    if any(label in stripped_line for label in _INFINITIVE_LABELS_STRIPPED):
        form_type = "3ms" if binyan in PASSIVE_BINYANS else "infinitive"
        return [
            (binyan, form_type, w)
            for w in words
            if _strip_nikkud(w) not in _LABEL_WORDS_STRIPPED
        ]

    if binyan in PASSIVE_BINYANS:
        prefix = _PASSIVE_3MS_PREFIX[binyan]
        return [(binyan, "3ms", w) for w in words if _strip_nikkud(w).startswith(prefix)]

    return []


def _parse_lines(lines, current_binyan, page_no=None):
    """Parse the text lines of one page.

    Args:
        lines: iterable of raw text lines.
        current_binyan: binyan in effect when the page starts (or None).
        page_no: 1-based page number, used only for log messages.

    Returns:
        ``(entries, current_binyan)`` — the extracted tuples and the binyan in
        effect after the last line, to be carried into the next page.
    """
    entries: list[tuple[str, str, str]] = []
    for raw_line in lines:
        line = raw_line.strip()
        if not line:
            continue

        heading = _match_binyan_heading(line)
        if heading is not None:
            current_binyan = heading
            logger.info(f"  Binyan heading found: {current_binyan} (page {page_no})")
            # The heading is not a verb; never feed it to the extractor.
            continue

        if current_binyan is None:
            continue
        entries.extend(_entries_from_line(line, current_binyan))
    return entries, current_binyan


def _fix_bidi(text: str) -> str:
    """Convert visually ordered Hebrew lines to logical order (best effort).

    Lines without Hebrew letters are left untouched. On any failure the input
    is returned unchanged.
    """
    try:
        from bidi.algorithm import get_display

        return "\n".join(
            get_display(line)[::-1] if _HEB_LETTER_RE.search(line) else line
            for line in text.split("\n")
        )
    except Exception as e:  # deliberate best-effort: fall back to the raw text
        logger.debug(f"bidi correction skipped: {e}")
        return text


def _extract_from_pdf() -> list[tuple[str, str, str]]:
    """
    Extract verb entries from PDF pages PAGE_START-PAGE_END.

    Returns list of (binyan_name, form_type, form) tuples.
    form_type is 'infinitive' or '3ms'. Returns an empty list when pymupdf is
    missing, the PDF is absent, or it cannot be opened.
    """
    try:
        import fitz
    except ImportError:
        logger.error("pymupdf not available")
        return []

    if not PDF_PATH.exists():
        return []

    try:
        doc = fitz.open(str(PDF_PATH))
    except Exception as e:
        logger.error(f"Cannot open PDF: {e}")
        return []

    entries: list[tuple[str, str, str]] = []
    current_binyan = None
    try:
        # fitz pages are 0-indexed; range() stops are exclusive, so page
        # PAGE_END (1-based) is index PAGE_END - 1 and the stop is PAGE_END.
        first_index = PAGE_START - 1
        stop_index = min(PAGE_END, doc.page_count)

        # Sample the first few appendix pages to decide on bidi correction.
        test_text = ""
        for page_num in range(first_index, min(first_index + 3, stop_index)):
            try:
                test_text += doc[page_num].get_text("text")
            except Exception as e:
                logger.debug(f"Could not sample page {page_num + 1}: {e}")

        use_bidi = _needs_bidi_fix(test_text)
        logger.info(f"PDF bidi correction: {'YES' if use_bidi else 'NO'}")
        logger.debug(f"Sample text (first 300 chars): {repr(test_text[:300])}")

        for page_num in range(first_index, stop_index):
            try:
                raw = doc[page_num].get_text("text")
            except Exception as e:
                logger.debug(f"Skipping unreadable page {page_num + 1}: {e}")
                continue

            text = _fix_bidi(raw) if use_bidi else raw
            page_entries, current_binyan = _parse_lines(
                text.split("\n"), current_binyan, page_num + 1
            )
            entries.extend(page_entries)
    finally:
        doc.close()

    logger.info(f"PDF extraction complete: {len(entries)} raw entries")
    return entries


def _write_output(entries: list[tuple[str, str, str]]) -> None:
    """Write verbs_input.txt from extracted entries.

    Entries without Hebrew letters are dropped and duplicates (same binyan and
    same nikkud-stripped form) are written once. A ``# <binyan>`` comment line
    is emitted whenever the binyan changes; 3ms forms are written as
    ``# 3ms: <form>`` lines.
    """
    lines = [
        "# Verb list from Coffin & Bolozky, A Reference Grammar of Modern Hebrew (2005), Appendix 1.",
        "# Citation: Coffin, Edna Amir and Shmuel Bolozky. A Reference Grammar of Modern Hebrew.",
        "# Cambridge University Press, 2005.",
        "# Lines prefixed '# 3ms:' are Pu'al/Huf'al verbs searched by 3ms past form.",
        "",
    ]

    current_binyan = None
    seen = set()

    for binyan, form_type, form in entries:
        if not form or not _HEB_LETTER_RE.search(form):
            continue
        key = (binyan, _strip_nikkud(form))
        if key in seen:
            continue
        seen.add(key)

        if binyan != current_binyan:
            current_binyan = binyan
            lines.append(f"# {binyan} ({_binyan_heb(binyan)})")

        if form_type == "3ms":
            lines.append(f"# 3ms: {form}")
        else:
            lines.append(form)

    OUTPUT_PATH.write_text("\n".join(lines) + "\n", encoding="utf-8")

    verb_count = sum(1 for ln in lines if ln and not ln.startswith("#"))
    passive_count = sum(1 for ln in lines if ln.startswith("# 3ms:"))
    logger.info(f"Written {verb_count} active verbs + {passive_count} passive (3ms) → {OUTPUT_PATH}")


def _binyan_heb(name: str) -> str:
    """Return the vowelled Hebrew heading for a canonical binyan name.

    Unknown names are returned unchanged.
    """
    # The vowelled spelling is listed first for every binyan in BINYAN_CANONICAL.
    for heading, canonical in BINYAN_CANONICAL.items():
        if canonical == name:
            return heading
    return name


def _write_fallback() -> None:
    """Write the hardcoded fallback verb list to OUTPUT_PATH."""
    OUTPUT_PATH.write_text(FALLBACK_VERBS, encoding="utf-8")
    logger.info(f"Fallback list written → {OUTPUT_PATH}")


def main():
    """Run the extractor, falling back to the hardcoded list on any failure."""
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    logger.info("=== Coffin & Bolozky Verb Extractor ===")

    # Install deps
    if not _install_deps():
        logger.warning("Dependencies not available — using fallback list")
        _write_fallback()
        return

    # Download PDF
    if not _download_pdf():
        logger.warning("PDF download failed — using fallback list")
        _write_fallback()
        return

    # Extract from PDF
    entries = _extract_from_pdf()

    unique_verbs = len({_strip_nikkud(f) for _, t, f in entries if t == "infinitive"})
    unique_passive = len({_strip_nikkud(f) for _, t, f in entries if t == "3ms"})
    logger.info(f"Unique infinitives: {unique_verbs}, passive 3ms: {unique_passive}")

    if unique_verbs + unique_passive < MIN_EXTRACTED_VERBS:
        logger.warning(
            f"Only {unique_verbs + unique_passive} verbs extracted — "
            f"below threshold of {MIN_EXTRACTED_VERBS}. "
            "Using fallback hardcoded list."
        )
        _write_fallback()
        return

    _write_output(entries)


if __name__ == "__main__":
    main()