hebrew_flash_cards/scripts/extract_verb_list.py
Sochen 17f7458d19 Sprint 9: cloze cards, plurals deck, project reorg, lint tooling
- Cloze card pipeline: 924 cards from 2,296 AI-vetted Hebrew book sentences
- Plurals deck: 375 notes (144 irregular + 231 regular from 86 mishkal patterns)
- Ktiv male forms expanded to 20,711 entries for sentence matching
- Project reorg: helpers.py (deduped strip_nikkud from 10 files), scripts/ for
  one-off tools, tests/ with smoke tests, deleted 3 dead files
- Lint tooling: pyproject.toml with ruff/vulture/bandit/pytest config, .editorconfig,
  fixed all 129 ruff errors (B023 closure fix, SIM103, unused vars)
- validate_apkg.py: card count range check for optional cloze template
- Data caches committed: vetted_sentences, ktiv_male_forms, noun_plurals,
  noun_slug_map, vocab_sentence_matches, epub_sentence_index

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-07 08:09:39 +00:00

431 lines
14 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Extract verb list from Coffin & Bolozky Appendix 1 (pages 390-411).
Citation:
Coffin, Edna Amir and Shmuel Bolozky. A Reference Grammar of Modern Hebrew.
Cambridge University Press, 2005.
Downloads the PDF, extracts verb entries by binyan, writes verbs_input.txt.
Pu'al/Huf'al verbs (no infinitive) are written prefixed with '# 3ms:' so the
conjugation extractor searches by 3ms past form instead of infinitive.
If PDF extraction yields < 40 verbs, falls back to the hardcoded list below.
"""
import logging
import re
import sys
import unicodedata
from pathlib import Path
# Module-level logger; its level and format are configured in main().
logger = logging.getLogger(__name__)
# Parent of the directory that contains this (resolved) script file.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Where to obtain the PDF from; left empty in the repository.
PDF_URL = "" # Set to URL or local path of Coffin & Bolozky PDF
# Local cache location for the PDF (reused between runs if already present).
PDF_PATH = Path("/tmp/coffin_bolozky.pdf")
# The verb list consumed by later steps is written here.
OUTPUT_PATH = PROJECT_ROOT / "verbs_input.txt"
# Pages to scan (Appendix 1); the extractor treats these as 1-indexed page numbers.
PAGE_START = 390
PAGE_END = 411
# Binyan headings in Hebrew (vowelled and unvowelled variants).
# Entries come in pairs: the vowelled spelling, then an unvowelled spelling.
# NOTE(review): once nikkud is stripped, the vowelled Pa'al, Pi'el and Pu'al
# headings are the same three consonants, so a nikkud-insensitive comparison
# cannot tell them apart; list order then decides which one matches first.
BINYAN_HEADINGS_HEB = [
    "פָּעַל",
    "פעל",
    "נִפְעַל",
    "נפעל",
    "פִּעֵל",
    "פיעל",
    "פֻּעַל",
    "פועל",
    "הִתְפַּעֵל",
    "התפעל",
    "הִפְעִיל",
    "הפעיל",
    "הֻפְעַל",
    "הופעל",
]
# Binyan heading → canonical name.
# Keys are the same spellings listed in BINYAN_HEADINGS_HEB; both spellings of
# a binyan map to one transliterated name.
BINYAN_CANONICAL = {
    "פָּעַל": "Pa'al",
    "פעל": "Pa'al",
    "נִפְעַל": "Nif'al",
    "נפעל": "Nif'al",
    "פִּעֵל": "Pi'el",
    "פיעל": "Pi'el",
    "פֻּעַל": "Pu'al",
    "פועל": "Pu'al",
    "הִתְפַּעֵל": "Hitpa'el",
    "התפעל": "Hitpa'el",
    "הִפְעִיל": "Hif'il",
    "הפעיל": "Hif'il",
    "הֻפְעַל": "Huf'al",
    "הופעל": "Huf'al",
}
# Passive binyan names — no infinitive, use 3ms past.
# Entries for these binyanim are emitted as '# 3ms:' lines in the output file.
PASSIVE_BINYANS = {"Pu'al", "Huf'al"}
# ─────────────────────────────────────────────────────────────────────────────
# Fallback hardcoded list (described as taken from Coffin & Bolozky Appendix 1).
# main() writes this text verbatim to verbs_input.txt when the PDF route fails.
# It holds 73 entries: 65 infinitives (Pa'al 30, Nif'al 8, Pi'el 10,
# Hitpa'el 7, Hif'il 10) plus 8 passive '# 3ms:' forms (Pu'al 4, Huf'al 4).
# NOTE(review): the last Hitpa'el entry looks like a mis-vocalised copy of a
# Hif'il infinitive that also appears in the Hif'il section — confirm against
# the source appendix before relying on it.
# ─────────────────────────────────────────────────────────────────────────────
FALLBACK_VERBS = """# Verb list from Coffin & Bolozky, A Reference Grammar of Modern Hebrew (2005), Appendix 1.
# Citation: Coffin, Edna Amir and Shmuel Bolozky. A Reference Grammar of Modern Hebrew.
# Cambridge University Press, 2005.
# Lines prefixed '# 3ms:' are Pu'al/Huf'al verbs searched by 3ms past form.
# Pa'al (פָּעַל)
לָלֶכֶת
לָבוֹא
לָשֶׁבֶת
לָקוּם
לָשִׂים
לָדַעַת
לִרְאוֹת
לוֹמַר
לַעֲשׂוֹת
לִתֵּן
לִקְחַת
לֶאֱכֹל
לִשְׁתּוֹת
לִכְתּוֹב
לִקְרוֹא
לִשְׁמוֹר
לִשְׁמֹעַ
לִפְתּוֹחַ
לִסְגּוֹר
לִנְסוֹעַ
לִרְכּוֹב
לִשְׁכַּב
לַחְשׁוֹב
לִבְכּוֹת
לָרוּץ
לִשְׁאֹל
לַעֲנוֹת
לִמְכּוֹר
לִקְנוֹת
לִלְמֹד
# Nif'al (נִפְעַל)
לְהִכָּנֵס
לְהִפָּתַח
לְהִסָּגֵר
לְהִשָּׁמֵר
לְהִמָּצֵא
לְהִרְאוֹת
לְהִכָּתֵב
לְהִשָּׁבֵר
# Pi'el (פִּעֵל)
לְדַבֵּר
לְסַפֵּר
לְבַקֵּשׁ
לְקַבֵּל
לְשַׁלֵּם
לְצַלֵּם
לְנַסּוֹת
לְחַכּוֹת
לְטַלְפֵן
לְבַשֵּׁל
# Pu'al (פֻּעַל) — 3ms past, no infinitive
# 3ms: דֻּבַּר
# 3ms: סֻפַּר
# 3ms: בֻּקַּשׁ
# 3ms: קֻבַּל
# Hitpa'el (הִתְפַּעֵל)
לְהִתְלַבֵּשׁ
לְהִתְרַחֵץ
לְהִתְנַהֵג
לְהִתְחַתֵּן
לְהִתְגּוֹרֵר
לְהִתְכּוֹנֵן
לְהִתְחִיל
# Hif'il (הִפְעִיל)
לְהַגִּיד
לְהַבִּין
לְהַכִּיר
לְהַרְגִּישׁ
לְהַחְלִיט
לְהַתְחִיל
לְהַכְנִיס
לְהוֹצִיא
לְהוֹרִיד
לְהַעְלוֹת
# Huf'al (הֻפְעַל) — 3ms past, no infinitive
# 3ms: הוּגַד
# 3ms: הוּבַן
# 3ms: הוּכְנַס
# 3ms: הוּצָא
"""
def _install_deps():
    """Ensure ``python-bidi`` (``bidi``) and ``pymupdf`` (``fitz``) are importable.

    Both modules are imported as a probe. If either import fails, both
    packages are installed with ``pip`` under the running interpreter and the
    probe is repeated, so a "successful" install that still leaves the modules
    unavailable is reported as a failure.

    Returns:
        True when both modules can be imported (before or after installing),
        False when pip exits non-zero or the modules still cannot be imported.
    """
    import importlib

    def _importable() -> bool:
        try:
            import bidi  # noqa: F401
            import fitz  # noqa: F401
        except ImportError:
            return False
        return True

    if _importable():
        return True

    logger.info("Installing pymupdf and python-bidi …")
    import subprocess

    result = subprocess.run(
        [sys.executable, "-m", "pip", "install", "pymupdf", "python-bidi", "--break-system-packages", "-q"],
        capture_output=True,
        check=False,
    )
    if result.returncode != 0:
        # errors="replace": pip output is not guaranteed to be valid UTF-8 and
        # a decode error here would mask the real failure.
        logger.error(f"pip install failed: {result.stderr.decode(errors='replace')[:200]}")
        return False

    # Packages installed while this process is running may be invisible to the
    # import system until its finder caches are dropped.
    importlib.invalidate_caches()
    if not _importable():
        logger.error("pip install succeeded but pymupdf/python-bidi still cannot be imported")
        return False
    return True
def _download_pdf() -> bool:
    """Make the source PDF available at ``PDF_PATH``.

    A file already at ``PDF_PATH`` that is larger than 100,000 bytes is
    treated as a valid cache and reused. Otherwise ``PDF_URL`` decides what
    happens:

    * empty            → nothing to fetch, failure;
    * ``http(s)://…``  → downloaded with ``requests``, streamed to a ``.part``
      file that is renamed into place only after the transfer completes;
    * anything else    → treated as a local file path and copied.

    Returns:
        True when ``PDF_PATH`` holds the PDF afterwards, False on any failure
        (unset ``PDF_URL``, missing local file, network/HTTP/filesystem error,
        ``requests`` not installed). Failures are logged, never raised.
    """
    import shutil

    if PDF_PATH.exists() and PDF_PATH.stat().st_size > 100_000:
        logger.info(f"PDF already cached at {PDF_PATH}")
        return True

    if not PDF_URL:
        logger.error("PDF download failed: PDF_URL is not set")
        return False

    if not PDF_URL.lower().startswith(("http://", "https://")):
        # Not a web URL: honour the "local path" option of PDF_URL.
        source = Path(PDF_URL).expanduser()
        if not source.is_file():
            logger.error(f"PDF download failed: local file not found: {source}")
            return False
        try:
            shutil.copyfile(source, PDF_PATH)
        except OSError as e:
            logger.error(f"PDF download failed: {e}")
            return False
        logger.info(f"PDF copied from {source}")
        return True

    logger.info(f"Downloading PDF from {PDF_URL}")
    partial = PDF_PATH.with_name(PDF_PATH.name + ".part")
    try:
        import requests

        # The context manager releases the connection; iter_content keeps
        # memory use bounded instead of buffering the whole body.
        with requests.get(PDF_URL, timeout=120, stream=True) as resp:
            resp.raise_for_status()
            with partial.open("wb") as fh:
                for chunk in resp.iter_content(chunk_size=65536):
                    fh.write(chunk)
        # Rename only after a complete transfer so an interrupted download can
        # never be mistaken for a valid cached PDF on the next run.
        partial.replace(PDF_PATH)
        logger.info(f"PDF downloaded: {PDF_PATH.stat().st_size / 1e6:.1f} MB")
        return True
    except Exception as e:  # boundary: any failure here means "use the fallback"
        try:
            partial.unlink(missing_ok=True)
        except OSError:
            pass
        logger.error(f"PDF download failed: {e}")
        return False
def _needs_bidi_fix(text: str) -> bool:
    """
    Guess whether extracted Hebrew text is in visual (reversed) order.

    Infinitives usually begin with lamed, so in logically ordered text more
    words start with lamed than end with it. Only the first 50 Hebrew runs
    (letters and points) of *text* are inspected. Returns True when strictly
    more of them end with lamed than start with it; text without any Hebrew
    yields False.
    """
    sample = re.findall(r"[\u05d0-\u05ea\u05b0-\u05c7]+", text)[:50]
    leading = 0
    trailing = 0
    for word in sample:
        # A one-letter word "ל" counts on both sides, exactly as two
        # independent tallies would.
        if word.startswith("ל"):
            leading += 1
        if word.endswith("ל"):
            trailing += 1
    return trailing > leading
def _strip_nikkud(text: str) -> str:
    """Return *text* NFD-decomposed with every nonspacing mark (category ``Mn``) dropped.

    This removes Hebrew vowel points and dagesh, and likewise any other
    combining mark (e.g. Latin accents). The result stays in decomposed form.
    """
    decomposed = unicodedata.normalize("NFD", text)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(kept)
# Consonant-only spellings of the infinitive label ("shem po'al"): plene and
# defective. Matching is done on nikkud-stripped text so the order in which a
# PDF emits vowel points and dagesh does not matter.
_INFINITIVE_LABELS = ("שם פועל", "שם פעל")
# Nikkud-stripped words that belong to the label itself and are never verbs.
_LABEL_WORDS = frozenset({"שם", "פועל", "פעל"})
# First consonant a word must have to be accepted as a 3ms past form.
# NOTE(review): for Pu'al this only accepts words beginning with pe, yet the
# Pu'al forms in this file's fallback list begin with other root consonants —
# the filter probably needs a vowel-pattern test instead; confirm against the PDF.
_PASSIVE_3MS_INITIAL = {"Pu'al": "פ", "Huf'al": "ה"}


def _match_binyan_heading(line):
    """Return the canonical binyan name if *line* is exactly a heading, else None.

    An exact match (after NFD normalisation, which makes the order of
    combining marks irrelevant) is tried first, because the vowelled Pa'al,
    Pi'el and Pu'al headings all reduce to the same consonants once nikkud is
    stripped. Only if no exact match exists is the nikkud-insensitive
    comparison used, taking the first hit in BINYAN_HEADINGS_HEB order.
    """
    nfd_line = unicodedata.normalize("NFD", line)
    for heading in BINYAN_HEADINGS_HEB:
        if nfd_line == unicodedata.normalize("NFD", heading):
            return BINYAN_CANONICAL.get(heading, heading)
    bare_line = _strip_nikkud(line)
    for heading in BINYAN_HEADINGS_HEB:
        if bare_line == _strip_nikkud(heading):
            return BINYAN_CANONICAL.get(heading, heading)
    return None


def _undo_visual_order(text, get_display):
    """Apply *get_display* and a reversal to every line of *text* containing Hebrew letters."""
    fixed = []
    for line in text.split("\n"):
        if re.search(r"[\u05d0-\u05ea]", line):
            fixed.append(get_display(line)[::-1])
        else:
            fixed.append(line)
    return "\n".join(fixed)


def _entries_from_line(line, binyan):
    """Return the (binyan, form_type, form) tuples found on one non-heading line."""
    bare_line = _strip_nikkud(line)
    if any(label in bare_line for label in _INFINITIVE_LABELS):
        # Labelled line: every Hebrew word of 3+ code points that is not part
        # of the label is taken as a form.
        form_type = "3ms" if binyan in PASSIVE_BINYANS else "infinitive"
        words = re.findall(r"[\u05d0-\u05ea\u05b0-\u05c7]{2,}", line)
        return [(binyan, form_type, w) for w in words if len(w) >= 3 and _strip_nikkud(w) not in _LABEL_WORDS]

    initial = _PASSIVE_3MS_INITIAL.get(binyan)
    if initial is None:
        return []
    # Unlabelled line inside a passive binyan: keep words with the expected
    # first consonant.
    words = re.findall(r"[\u05d0-\u05ea\u05b0-\u05c7]{3,}", line)
    return [(binyan, "3ms", w) for w in words if _strip_nikkud(w).startswith(initial)]


def _extract_from_pdf() -> list[tuple[str, str, str]]:
    """
    Extract verb entries from PDF pages PAGE_START-PAGE_END (1-indexed, inclusive).

    The page range is clamped to the pages the document actually has. The
    first (up to three) pages of the range are sampled to decide whether the
    text needs bidi correction. Each line is then either a binyan heading,
    which switches the current binyan and is not itself parsed for verbs, or
    content that is scanned for forms once a heading has been seen.

    Returns list of (binyan_name, form_type, form) tuples.
    form_type is 'infinitive' or '3ms'. An empty list is returned when
    pymupdf is missing, the PDF is absent, or it cannot be opened.
    """
    try:
        import fitz
    except ImportError:
        logger.error("pymupdf not available")
        return []
    if not PDF_PATH.exists():
        return []
    try:
        doc = fitz.open(str(PDF_PATH))
    except Exception as e:
        logger.error(f"Cannot open PDF: {e}")
        return []

    entries = []
    current_binyan = None
    try:
        # fitz pages are 0-indexed: 1-indexed page P lives at index P - 1, so
        # the inclusive range PAGE_START..PAGE_END is [PAGE_START - 1, PAGE_END).
        first_idx = max(PAGE_START - 1, 0)
        stop_idx = min(PAGE_END, doc.page_count)

        # Check if we need bidi correction, using the same pages we will parse.
        sample = ""
        for page_idx in range(first_idx, min(first_idx + 3, stop_idx)):
            try:
                sample += doc[page_idx].get_text("text")
            except Exception as e:
                logger.debug(f"Could not sample page {page_idx + 1}: {e}")
        use_bidi = _needs_bidi_fix(sample)
        get_display = None
        if use_bidi:
            try:
                from bidi.algorithm import get_display
            except ImportError:
                logger.warning("python-bidi not available — bidi correction skipped")
                use_bidi = False
        logger.info(f"PDF bidi correction: {'YES' if use_bidi else 'NO'}")
        logger.debug(f"Sample text (first 300 chars): {repr(sample[:300])}")

        for page_idx in range(first_idx, stop_idx):
            try:
                text = doc[page_idx].get_text("text")
            except Exception as e:
                logger.warning(f"Skipping unreadable page {page_idx + 1}: {e}")
                continue
            if use_bidi:
                try:
                    text = _undo_visual_order(text, get_display)
                except Exception as e:
                    # Best effort: fall back to the uncorrected page text.
                    logger.debug(f"Bidi correction failed on page {page_idx + 1}: {e}")

            for raw_line in text.split("\n"):
                line = raw_line.strip()
                if not line:
                    continue
                # NOTE(review): the unvowelled Pu'al heading is spelled like the
                # ordinary word for "verb"; a line holding only that word will
                # switch the section to Pu'al.
                heading = _match_binyan_heading(line)
                if heading is not None:
                    current_binyan = heading
                    logger.info(f" Binyan heading found: {current_binyan} (page {page_idx + 1})")
                    # The heading is a section marker, not a verb form.
                    continue
                if current_binyan is None:
                    continue
                entries.extend(_entries_from_line(line, current_binyan))
    finally:
        doc.close()

    logger.info(f"PDF extraction complete: {len(entries)} raw entries")
    return entries
def _write_output(entries: list[tuple[str, str, str]]) -> None:
    """Render *entries* as verbs_input.txt at OUTPUT_PATH.

    Entries without a Hebrew letter are skipped, as are repeats of the same
    (binyan, nikkud-stripped form) pair. A '# <binyan> (<Hebrew name>)'
    comment line is emitted whenever the binyan differs from that of the
    previously written entry. '3ms' forms are written as '# 3ms: <form>'
    lines; every other form is written bare.
    """
    hebrew_letter = re.compile(r"[\u05d0-\u05ea]")
    out_lines = [
        "# Verb list from Coffin & Bolozky, A Reference Grammar of Modern Hebrew (2005), Appendix 1.",
        "# Citation: Coffin, Edna Amir and Shmuel Bolozky. A Reference Grammar of Modern Hebrew.",
        "# Cambridge University Press, 2005.",
        "# Lines prefixed '# 3ms:' are Pu'al/Huf'al verbs searched by 3ms past form.",
        "",
    ]
    already_written = set()
    section = None
    for binyan, form_type, form in entries:
        if not form or hebrew_letter.search(form) is None:
            continue
        dedupe_key = (binyan, _strip_nikkud(form))
        if dedupe_key in already_written:
            continue
        already_written.add(dedupe_key)

        if binyan != section:
            section = binyan
            out_lines.append(f"# {binyan} ({_binyan_heb(binyan)})")

        out_lines.append(f"# 3ms: {form}" if form_type == "3ms" else form)

    OUTPUT_PATH.write_text("".join(f"{ln}\n" for ln in out_lines), encoding="utf-8")

    # Tallies are derived from the rendered lines, not from the entries.
    verb_count = len([ln for ln in out_lines if ln and not ln.startswith("#")])
    passive_count = len([ln for ln in out_lines if ln.startswith("# 3ms:")])
    logger.info(f"Written {verb_count} active verbs + {passive_count} passive (3ms) → {OUTPUT_PATH}")
def _binyan_heb(name: str) -> str:
    """Return the vowelled Hebrew heading for a canonical binyan *name*.

    Names that are not one of the seven known binyanim are returned unchanged.
    """
    vowelled = {
        "Pa'al": "פָּעַל",
        "Nif'al": "נִפְעַל",
        "Pi'el": "פִּעֵל",
        "Pu'al": "פֻּעַל",
        "Hitpa'el": "הִתְפַּעֵל",
        "Hif'il": "הִפְעִיל",
        "Huf'al": "הֻפְעַל",
    }
    try:
        return vowelled[name]
    except KeyError:
        return name
def main():
    """Produce verbs_input.txt from the PDF, or from the hardcoded fallback list.

    The fallback text is written when dependencies cannot be made available,
    when the PDF cannot be obtained, or when extraction yields fewer than 40
    unique forms (infinitives plus passive 3ms forms, compared without nikkud).
    """
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
    logger.info("=== Coffin & Bolozky Verb Extractor ===")

    # Prerequisites run in order; the first one to fail ends the run with the
    # fallback list and its own warning.
    prerequisites = (
        (_install_deps, "Dependencies not available — using fallback list"),
        (_download_pdf, "PDF download failed — using fallback list"),
    )
    for prepare, failure_note in prerequisites:
        if prepare():
            continue
        logger.warning(failure_note)
        OUTPUT_PATH.write_text(FALLBACK_VERBS, encoding="utf-8")
        return

    # Extract from PDF and count distinct forms per kind.
    entries = _extract_from_pdf()
    infinitive_keys = set()
    passive_keys = set()
    for _, kind, form in entries:
        if kind == "infinitive":
            infinitive_keys.add(_strip_nikkud(form))
        elif kind == "3ms":
            passive_keys.add(_strip_nikkud(form))
    unique_verbs = len(infinitive_keys)
    unique_passive = len(passive_keys)
    logger.info(f"Unique infinitives: {unique_verbs}, passive 3ms: {unique_passive}")

    if unique_verbs + unique_passive >= 40:
        _write_output(entries)
        return

    logger.warning(
        f"Only {unique_verbs + unique_passive} verbs extracted — below threshold of 40. "
        "Using fallback hardcoded list."
    )
    OUTPUT_PATH.write_text(FALLBACK_VERBS, encoding="utf-8")
    logger.info(f"Fallback list written → {OUTPUT_PATH}")


if __name__ == "__main__":
    main()