- Cloze card pipeline: 924 cards from 2,296 AI-vetted Hebrew book sentences - Plurals deck: 375 notes (144 irregular + 231 regular from 86 mishkal patterns) - Ktiv male forms expanded to 20,711 entries for sentence matching - Project reorg: helpers.py (deduped strip_nikkud from 10 files), scripts/ for one-off tools, tests/ with smoke tests, deleted 3 dead files - Lint tooling: pyproject.toml with ruff/vulture/bandit/pytest config, .editorconfig, fixed all 129 ruff errors (B023 closure fix, SIM103, unused vars) - validate_apkg.py: card count range check for optional cloze template - Data caches committed: vetted_sentences, ktiv_male_forms, noun_plurals, noun_slug_map, vocab_sentence_matches, epub_sentence_index Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
431 lines
14 KiB
Python
431 lines
14 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Extract verb list from Coffin & Bolozky Appendix 1 (pages 390-411).
|
||
|
||
Citation:
|
||
Coffin, Edna Amir and Shmuel Bolozky. A Reference Grammar of Modern Hebrew.
|
||
Cambridge University Press, 2005.
|
||
|
||
Downloads the PDF, extracts verb entries by binyan, writes verbs_input.txt.
|
||
Pu'al/Huf'al verbs (no infinitive) are written prefixed with '# 3ms:' so the
|
||
conjugation extractor searches by 3ms past form instead of infinitive.
|
||
|
||
If PDF extraction yields < 40 verbs, falls back to the hardcoded list below.
|
||
"""
|
||
|
||
import logging
|
||
import re
|
||
import sys
|
||
import unicodedata
|
||
from pathlib import Path
|
||
|
||
logger = logging.getLogger(__name__)
|
||
|
||
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
||
PDF_URL = "" # Set to URL or local path of Coffin & Bolozky PDF
|
||
PDF_PATH = Path("/tmp/coffin_bolozky.pdf")
|
||
OUTPUT_PATH = PROJECT_ROOT / "verbs_input.txt"
|
||
|
||
# Pages to scan (Appendix 1)
|
||
PAGE_START = 390
|
||
PAGE_END = 411
|
||
|
||
# Binyan headings in Hebrew (vowelled and unvowelled variants)
|
||
BINYAN_HEADINGS_HEB = [
|
||
"פָּעַל",
|
||
"פעל",
|
||
"נִפְעַל",
|
||
"נפעל",
|
||
"פִּעֵל",
|
||
"פיעל",
|
||
"פֻּעַל",
|
||
"פועל",
|
||
"הִתְפַּעֵל",
|
||
"התפעל",
|
||
"הִפְעִיל",
|
||
"הפעיל",
|
||
"הֻפְעַל",
|
||
"הופעל",
|
||
]
|
||
|
||
# Binyan heading → canonical name
|
||
BINYAN_CANONICAL = {
|
||
"פָּעַל": "Pa'al",
|
||
"פעל": "Pa'al",
|
||
"נִפְעַל": "Nif'al",
|
||
"נפעל": "Nif'al",
|
||
"פִּעֵל": "Pi'el",
|
||
"פיעל": "Pi'el",
|
||
"פֻּעַל": "Pu'al",
|
||
"פועל": "Pu'al",
|
||
"הִתְפַּעֵל": "Hitpa'el",
|
||
"התפעל": "Hitpa'el",
|
||
"הִפְעִיל": "Hif'il",
|
||
"הפעיל": "Hif'il",
|
||
"הֻפְעַל": "Huf'al",
|
||
"הופעל": "Huf'al",
|
||
}
|
||
|
||
# Passive binyan names — no infinitive, use 3ms past
|
||
PASSIVE_BINYANS = {"Pu'al", "Huf'al"}
|
||
|
||
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
# Fallback hardcoded list (71 verbs from Coffin & Bolozky Appendix 1)
|
||
# Pa'al
|
||
# ─────────────────────────────────────────────────────────────────────────────
|
||
FALLBACK_VERBS = """# Verb list from Coffin & Bolozky, A Reference Grammar of Modern Hebrew (2005), Appendix 1.
|
||
# Citation: Coffin, Edna Amir and Shmuel Bolozky. A Reference Grammar of Modern Hebrew.
|
||
# Cambridge University Press, 2005.
|
||
# Lines prefixed '# 3ms:' are Pu'al/Huf'al verbs searched by 3ms past form.
|
||
|
||
# Pa'al (פָּעַל)
|
||
לָלֶכֶת
|
||
לָבוֹא
|
||
לָשֶׁבֶת
|
||
לָקוּם
|
||
לָשִׂים
|
||
לָדַעַת
|
||
לִרְאוֹת
|
||
לוֹמַר
|
||
לַעֲשׂוֹת
|
||
לִתֵּן
|
||
לִקְחַת
|
||
לֶאֱכֹל
|
||
לִשְׁתּוֹת
|
||
לִכְתּוֹב
|
||
לִקְרוֹא
|
||
לִשְׁמוֹר
|
||
לִשְׁמֹעַ
|
||
לִפְתּוֹחַ
|
||
לִסְגּוֹר
|
||
לִנְסוֹעַ
|
||
לִרְכּוֹב
|
||
לִשְׁכַּב
|
||
לַחְשׁוֹב
|
||
לִבְכּוֹת
|
||
לָרוּץ
|
||
לִשְׁאֹל
|
||
לַעֲנוֹת
|
||
לִמְכּוֹר
|
||
לִקְנוֹת
|
||
לִלְמֹד
|
||
|
||
# Nif'al (נִפְעַל)
|
||
לְהִכָּנֵס
|
||
לְהִפָּתַח
|
||
לְהִסָּגֵר
|
||
לְהִשָּׁמֵר
|
||
לְהִמָּצֵא
|
||
לְהִרְאוֹת
|
||
לְהִכָּתֵב
|
||
לְהִשָּׁבֵר
|
||
|
||
# Pi'el (פִּעֵל)
|
||
לְדַבֵּר
|
||
לְסַפֵּר
|
||
לְבַקֵּשׁ
|
||
לְקַבֵּל
|
||
לְשַׁלֵּם
|
||
לְצַלֵּם
|
||
לְנַסּוֹת
|
||
לְחַכּוֹת
|
||
לְטַלְפֵן
|
||
לְבַשֵּׁל
|
||
|
||
# Pu'al (פֻּעַל) — 3ms past, no infinitive
|
||
# 3ms: דֻּבַּר
|
||
# 3ms: סֻפַּר
|
||
# 3ms: בֻּקַּשׁ
|
||
# 3ms: קֻבַּל
|
||
|
||
# Hitpa'el (הִתְפַּעֵל)
|
||
לְהִתְלַבֵּשׁ
|
||
לְהִתְרַחֵץ
|
||
לְהִתְנַהֵג
|
||
לְהִתְחַתֵּן
|
||
לְהִתְגּוֹרֵר
|
||
לְהִתְכּוֹנֵן
|
||
לְהִתְחִיל
|
||
|
||
# Hif'il (הִפְעִיל)
|
||
לְהַגִּיד
|
||
לְהַבִּין
|
||
לְהַכִּיר
|
||
לְהַרְגִּישׁ
|
||
לְהַחְלִיט
|
||
לְהַתְחִיל
|
||
לְהַכְנִיס
|
||
לְהוֹצִיא
|
||
לְהוֹרִיד
|
||
לְהַעְלוֹת
|
||
|
||
# Huf'al (הֻפְעַל) — 3ms past, no infinitive
|
||
# 3ms: הוּגַד
|
||
# 3ms: הוּבַן
|
||
# 3ms: הוּכְנַס
|
||
# 3ms: הוּצָא
|
||
"""
|
||
|
||
|
||
def _install_deps():
|
||
"""Install pymupdf and python-bidi if not available."""
|
||
try:
|
||
import bidi # noqa: F401
|
||
import fitz # noqa: F401
|
||
|
||
return True
|
||
except ImportError:
|
||
logger.info("Installing pymupdf and python-bidi …")
|
||
import subprocess
|
||
|
||
result = subprocess.run(
|
||
[sys.executable, "-m", "pip", "install", "pymupdf", "python-bidi", "--break-system-packages", "-q"],
|
||
capture_output=True,
|
||
)
|
||
if result.returncode != 0:
|
||
logger.error(f"pip install failed: {result.stderr.decode()[:200]}")
|
||
return False
|
||
return True
|
||
|
||
|
||
def _download_pdf() -> bool:
|
||
"""Download the PDF to /tmp/coffin_bolozky.pdf. Returns True on success."""
|
||
if PDF_PATH.exists() and PDF_PATH.stat().st_size > 100_000:
|
||
logger.info(f"PDF already cached at {PDF_PATH}")
|
||
return True
|
||
|
||
logger.info(f"Downloading PDF from {PDF_URL} …")
|
||
try:
|
||
import requests
|
||
|
||
resp = requests.get(PDF_URL, timeout=120, stream=True)
|
||
resp.raise_for_status()
|
||
PDF_PATH.write_bytes(resp.content)
|
||
logger.info(f"PDF downloaded: {PDF_PATH.stat().st_size / 1e6:.1f} MB")
|
||
return True
|
||
except Exception as e:
|
||
logger.error(f"PDF download failed: {e}")
|
||
return False
|
||
|
||
|
||
def _needs_bidi_fix(text: str) -> bool:
|
||
"""
|
||
Heuristic: if Hebrew text starts with what would be a word-final letter
|
||
(like ר, ל, ם, ן, ף, ך) where logically it should start with alef/bet/etc.
|
||
for known verb forms, it's likely visually ordered and needs reversing.
|
||
"""
|
||
# Simple check: does a clearly right-to-left prefix appear reversed?
|
||
# Try to find לשמור or similar — if we see רומשל instead, we need bidi.
|
||
heb_words = re.findall(r"[\u05d0-\u05ea\u05b0-\u05c7]+", text)
|
||
if not heb_words:
|
||
return False
|
||
# Word starting with ל is common for infinitives in Pa'al/Nif'al/Pi'el/Hif'il
|
||
# If we see words ending in ל instead of starting, likely reversed
|
||
starts_with_lamed = sum(1 for w in heb_words[:50] if w.startswith("ל"))
|
||
ends_with_lamed = sum(1 for w in heb_words[:50] if w.endswith("ל"))
|
||
return ends_with_lamed > starts_with_lamed
|
||
|
||
|
||
def _strip_nikkud(text: str) -> str:
|
||
return "".join(ch for ch in unicodedata.normalize("NFD", text) if unicodedata.category(ch) != "Mn")
|
||
|
||
|
||
def _extract_from_pdf() -> list[tuple[str, str, str]]:
|
||
"""
|
||
Extract verb entries from PDF pages PAGE_START-PAGE_END.
|
||
Returns list of (binyan_name, form_type, form) tuples.
|
||
form_type is 'infinitive' or '3ms'.
|
||
"""
|
||
try:
|
||
import fitz
|
||
except ImportError:
|
||
logger.error("pymupdf not available")
|
||
return []
|
||
|
||
if not PDF_PATH.exists():
|
||
return []
|
||
|
||
entries = []
|
||
current_binyan = None
|
||
|
||
try:
|
||
doc = fitz.open(str(PDF_PATH))
|
||
except Exception as e:
|
||
logger.error(f"Cannot open PDF: {e}")
|
||
return []
|
||
|
||
# Check if we need bidi correction
|
||
test_text = ""
|
||
try:
|
||
for page_num in range(min(PAGE_START, doc.page_count - 1), min(PAGE_START + 3, doc.page_count)):
|
||
test_text += doc[page_num].get_text("text")
|
||
except Exception: # noqa: S110
|
||
pass
|
||
|
||
use_bidi = _needs_bidi_fix(test_text)
|
||
logger.info(f"PDF bidi correction: {'YES' if use_bidi else 'NO'}")
|
||
logger.debug(f"Sample text (first 300 chars): {repr(test_text[:300])}")
|
||
|
||
def fix_text(t: str) -> str:
|
||
if not use_bidi:
|
||
return t
|
||
try:
|
||
from bidi.algorithm import get_display
|
||
|
||
lines = t.split("\n")
|
||
fixed = []
|
||
for line in lines:
|
||
if re.search(r"[\u05d0-\u05ea]", line):
|
||
fixed.append(get_display(line)[::-1])
|
||
else:
|
||
fixed.append(line)
|
||
return "\n".join(fixed)
|
||
except Exception:
|
||
return t
|
||
|
||
page_end = min(PAGE_END, doc.page_count - 1)
|
||
for page_num in range(PAGE_START - 1, page_end): # fitz is 0-indexed
|
||
try:
|
||
raw = doc[page_num].get_text("text")
|
||
except Exception: # noqa: S112
|
||
continue
|
||
|
||
text = fix_text(raw)
|
||
lines = text.split("\n")
|
||
|
||
for line in lines:
|
||
line = line.strip()
|
||
if not line:
|
||
continue
|
||
|
||
# Check for binyan heading
|
||
stripped = _strip_nikkud(line)
|
||
for heb_head in BINYAN_HEADINGS_HEB:
|
||
if stripped == _strip_nikkud(heb_head) or line == heb_head:
|
||
current_binyan = BINYAN_CANONICAL.get(heb_head, heb_head)
|
||
logger.info(f" Binyan heading found: {current_binyan} (page {page_num + 1})")
|
||
break
|
||
|
||
if current_binyan is None:
|
||
continue
|
||
|
||
# Look for infinitive marker שֵׁם פֹּעַל or שם פועל
|
||
if "שֵׁם פֹּעַל" in line or "שם פועל" in _strip_nikkud(line):
|
||
# Infinitive is typically on the same line after the label
|
||
# or may be labelled separately
|
||
heb_words = re.findall(r"[\u05d0-\u05ea\u05b0-\u05c7]{2,}", line)
|
||
# Filter out the label words themselves
|
||
label_stripped = {_strip_nikkud(w) for w in ["שֵׁם", "פֹּעַל", "שם", "פועל"]}
|
||
verbs = [w for w in heb_words if _strip_nikkud(w) not in label_stripped and len(w) >= 3]
|
||
for v in verbs:
|
||
if current_binyan in PASSIVE_BINYANS:
|
||
entries.append((current_binyan, "3ms", v))
|
||
else:
|
||
entries.append((current_binyan, "infinitive", v))
|
||
|
||
# For passive binyans, look for 3ms past (הוּא + form pattern)
|
||
elif current_binyan in PASSIVE_BINYANS:
|
||
# 3ms past forms for Pu'al/Huf'al start with פֻּ/הֻ
|
||
heb_words = re.findall(r"[\u05d0-\u05ea\u05b0-\u05c7]{3,}", line)
|
||
for w in heb_words:
|
||
stripped_w = _strip_nikkud(w)
|
||
if (
|
||
current_binyan == "Pu'al"
|
||
and stripped_w.startswith("פ")
|
||
or current_binyan == "Huf'al"
|
||
and stripped_w.startswith("ה")
|
||
):
|
||
entries.append((current_binyan, "3ms", w))
|
||
|
||
doc.close()
|
||
logger.info(f"PDF extraction complete: {len(entries)} raw entries")
|
||
return entries
|
||
|
||
|
||
def _write_output(entries: list[tuple[str, str, str]]) -> None:
|
||
"""Write verbs_input.txt from extracted entries."""
|
||
lines = [
|
||
"# Verb list from Coffin & Bolozky, A Reference Grammar of Modern Hebrew (2005), Appendix 1.",
|
||
"# Citation: Coffin, Edna Amir and Shmuel Bolozky. A Reference Grammar of Modern Hebrew.",
|
||
"# Cambridge University Press, 2005.",
|
||
"# Lines prefixed '# 3ms:' are Pu'al/Huf'al verbs searched by 3ms past form.",
|
||
"",
|
||
]
|
||
|
||
current_binyan = None
|
||
seen = set()
|
||
|
||
for binyan, form_type, form in entries:
|
||
if not form or not re.search(r"[\u05d0-\u05ea]", form):
|
||
continue
|
||
key = (binyan, _strip_nikkud(form))
|
||
if key in seen:
|
||
continue
|
||
seen.add(key)
|
||
|
||
if binyan != current_binyan:
|
||
current_binyan = binyan
|
||
lines.append(f"# {binyan} ({_binyan_heb(binyan)})")
|
||
|
||
if form_type == "3ms":
|
||
lines.append(f"# 3ms: {form}")
|
||
else:
|
||
lines.append(form)
|
||
|
||
OUTPUT_PATH.write_text("\n".join(lines) + "\n", encoding="utf-8")
|
||
verb_count = sum(1 for ln in lines if ln and not ln.startswith("#"))
|
||
passive_count = sum(1 for ln in lines if ln.startswith("# 3ms:"))
|
||
logger.info(f"Written {verb_count} active verbs + {passive_count} passive (3ms) → {OUTPUT_PATH}")
|
||
|
||
|
||
def _binyan_heb(name: str) -> str:
|
||
mapping = {
|
||
"Pa'al": "פָּעַל",
|
||
"Nif'al": "נִפְעַל",
|
||
"Pi'el": "פִּעֵל",
|
||
"Pu'al": "פֻּעַל",
|
||
"Hitpa'el": "הִתְפַּעֵל",
|
||
"Hif'il": "הִפְעִיל",
|
||
"Huf'al": "הֻפְעַל",
|
||
}
|
||
return mapping.get(name, name)
|
||
|
||
|
||
def main():
|
||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
||
|
||
logger.info("=== Coffin & Bolozky Verb Extractor ===")
|
||
|
||
# Install deps
|
||
if not _install_deps():
|
||
logger.warning("Dependencies not available — using fallback list")
|
||
OUTPUT_PATH.write_text(FALLBACK_VERBS, encoding="utf-8")
|
||
return
|
||
|
||
# Download PDF
|
||
if not _download_pdf():
|
||
logger.warning("PDF download failed — using fallback list")
|
||
OUTPUT_PATH.write_text(FALLBACK_VERBS, encoding="utf-8")
|
||
return
|
||
|
||
# Extract from PDF
|
||
entries = _extract_from_pdf()
|
||
unique_verbs = len({_strip_nikkud(f) for _, t, f in entries if t == "infinitive"})
|
||
unique_passive = len({_strip_nikkud(f) for _, t, f in entries if t == "3ms"})
|
||
logger.info(f"Unique infinitives: {unique_verbs}, passive 3ms: {unique_passive}")
|
||
|
||
if unique_verbs + unique_passive < 40:
|
||
logger.warning(
|
||
f"Only {unique_verbs + unique_passive} verbs extracted — below threshold of 40. "
|
||
"Using fallback hardcoded list."
|
||
)
|
||
OUTPUT_PATH.write_text(FALLBACK_VERBS, encoding="utf-8")
|
||
logger.info(f"Fallback list written → {OUTPUT_PATH}")
|
||
return
|
||
|
||
_write_output(entries)
|
||
|
||
|
||
if __name__ == "__main__":
|
||
main()
|