- Cloze card pipeline: 924 cards from 2,296 AI-vetted Hebrew book sentences - Plurals deck: 375 notes (144 irregular + 231 regular from 86 mishkal patterns) - Ktiv male forms expanded to 20,711 entries for sentence matching - Project reorg: helpers.py (deduped strip_nikkud from 10 files), scripts/ for one-off tools, tests/ with smoke tests, deleted 3 dead files - Lint tooling: pyproject.toml with ruff/vulture/bandit/pytest config, .editorconfig, fixed all 129 ruff errors (B023 closure fix, SIM103, unused vars) - validate_apkg.py: card count range check for optional cloze template - Data caches committed: vetted_sentences, ktiv_male_forms, noun_plurals, noun_slug_map, vocab_sentence_matches, epub_sentence_index Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
256 lines
9.6 KiB
Python
256 lines
9.6 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Validate nevo_typed_verbs_from_modern_hebrew against pealim.com.
|
|
|
|
For each verb:
|
|
1. Classifies it by position in the file (Pa'al/Nif'al/Pi'el/Pu'al/Hitpa'el/Hif'il/Huf'al)
|
|
2. Searches pealim.com to find URL slug
|
|
3. Fetches the page to confirm the binyan
|
|
4. Flags known-problem entries and detects: not-found, binyan mismatch, suspected typos
|
|
|
|
Output:
|
|
verbs_input.txt — cleaned verb list for conjugation_extract.py
|
|
Printed validation report table
|
|
|
|
Usage:
|
|
python3 validate_verb_list.py
|
|
|
|
After running, review verbs_input.txt (especially REVIEW-flagged entries) before
|
|
running conjugation extraction.
|
|
"""
|
|
|
|
import re
|
|
import sys
|
|
import time
|
|
import urllib.parse
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Root URL of the site; search and dictionary-entry URLs are built from it.
PEALIM_BASE = "https://www.pealim.com"

# Pause (passed to time.sleep) taken before every HTTP request in main().
REQUEST_DELAY = 1.5

# Value handed to requests as the `timeout=` argument of each request.
REQUEST_TIMEOUT = 15

# Input list and generated output both live next to this script.
SOURCE_FILE = Path(__file__).with_name("nevo_typed_verbs_from_modern_hebrew")
OUTPUT_FILE = Path(__file__).with_name("verbs_input.txt")
|
|
|
|
# Entries of the source list that need special handling, keyed by the exact
# word as it appears in the file. Each value is (action, note):
#   "REVIEW" -> main() skips the pealim.com lookup and lists the word in the
#               manual-review section of the output file;
#   "3ms"    -> main() treats the word as a 3ms past form, not an infinitive.
KNOWN_ISSUES: dict[str, tuple[str, str]] = {
    # --- flagged for manual review -------------------------------------
    "לגבוה": (
        "REVIEW",
        "not a standard infinitive form; likely defective spelling or wrong word",
    ),
    "לההרג": (
        "REVIEW",
        "extra ה; should probably be להיהרג (Nif'al of הרג)",
    ),
    "להתלקלח": (
        "REVIEW",
        "not a real word; likely typo for להתקלקל",
    ),
    "להקלל": (
        "REVIEW",
        "ambiguous: could be Hif'il לְהָקֵל (to ease) or Nif'al of קלל",
    ),
    # --- 3ms past forms ------------------------------------------------
    "המציא": (
        "3ms",
        "Hif'il 3ms past form, not an infinitive",
    ),
    "קומם": (
        "3ms",
        "ambiguous: Pu'al 3ms past; Pi'el infinitive is לְקוֹמֵם",
    ),
}
|
|
|
|
# Expected binyan for each 1-indexed position in the source list (per plan
# analysis). Written as inclusive first/last positions and converted to the
# half-open range objects that classify_by_line() tests membership against.
LINE_RANGES: list[tuple[range, str]] = [
    (range(first, last + 1), binyan)
    for first, last, binyan in (
        (1, 17, "Pa'al"),
        (18, 28, "Nif'al"),
        (29, 36, "Pi'el"),
        (37, 42, "Pu'al"),
        (43, 52, "Hitpa'el"),
        (53, 62, "Hif'il"),
        (63, 70, "Huf'al"),
    )
]
|
|
|
|
# Comment line written above each binyan's group of verbs in the output file.
# Insertion order is the order in which sections are emitted, and the first
# key doubles as the fallback section for entries with an unrecognised binyan.
SECTION_HEADERS: dict[str, str] = {
    "Pa'al": "# Pa'al (פָּעַל)",
    "Nif'al": "# Nif'al (נִפְעַל)",
    "Pi'el": "# Pi'el (פִּעֵל)",
    "Pu'al": "# Pu'al (פֻּעַל) — 3ms past, no infinitive",
    "Hitpa'el": "# Hitpa'el (הִתְפַּעֵל)",
    "Hif'il": "# Hif'il (הִפְעִיל)",
    "Huf'al": "# Huf'al (הֻפְעַל) — 3ms past, no infinitive",
}
|
|
|
|
# One shared HTTP session for every pealim.com request made by this script,
# sending a fixed User-Agent header with each of them.
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (compatible; pealim-anki/3.0)"
|
|
|
|
|
|
def classify_by_line(line_num: int) -> str:
    """Map a 1-indexed list position to the binyan expected at that position.

    The first entry of ``LINE_RANGES`` whose range contains *line_num* wins;
    positions covered by no range yield the string ``"Unknown"``.
    """
    return next(
        (binyan for span, binyan in LINE_RANGES if line_num in span),
        "Unknown",
    )
|
|
|
|
|
|
def find_slug(query: str) -> str | None:
    """Look *query* up through the pealim.com search page.

    Returns the first ``<digits>-<name>`` slug found in a ``/dict/<slug>/``
    link anywhere in the response body, or ``None`` when the body contains no
    such link. Any exception raised while requesting or scanning the page is
    reported on stderr and also results in ``None`` (best-effort lookup).
    """
    search_url = f"{PEALIM_BASE}/search/?q={urllib.parse.quote(query)}"
    try:
        response = session.get(search_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        # Only the first hit is ever used, so stop scanning at the first match.
        first_hit = re.search(r"/dict/(\d+[-][^/?\"<>\s]+)/", response.text)
    except Exception as e:
        print(f" ERROR searching {query!r}: {e}", file=sys.stderr)
        return None
    return first_hit.group(1) if first_hit else None
|
|
|
|
|
|
def get_page_binyan(slug: str) -> str:
    """Download ``/dict/<slug>/`` and report which binyan the page names.

    Every ``<h3 class="page-header">`` is inspected in document order; if none
    of them mentions a binyan, the ``og:description`` meta tag is tried. Within
    one piece of text the binyan names are tested in a fixed order and the
    first one present is returned.

    Returns the binyan name, or ``""`` when nothing matched or when the
    request/parsing raised (the error is printed to stderr).
    """
    known_binyanim = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al")

    def first_binyan_in(text):
        # First known name occurring in *text*; "" when none does.
        return next((name for name in known_binyanim if name in text), "")

    page_url = f"{PEALIM_BASE}/dict/{slug}/"
    try:
        # NOTE(review): the "hebstyle" cookie presumably selects how the site
        # renders Hebrew spelling; confirm against pealim.com.
        response = session.get(page_url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")

        for heading in soup.find_all("h3", class_="page-header"):
            found = first_binyan_in(heading.get_text(" ", strip=True))
            if found:
                return found

        og_description = soup.find("meta", {"property": "og:description"})
        if og_description:
            found = first_binyan_in(og_description.get("content", ""))
            if found:
                return found
    except Exception as e:
        print(f" ERROR fetching {slug}: {e}", file=sys.stderr)
    return ""
|
|
|
|
|
|
def _resolve_status(
    is_3ms: bool,
    issue_note: str,
    slug: str | None,
    page_binyan: str,
    expected_binyan: str,
) -> tuple[str, str]:
    """Return ``(status, notes)`` for an entry that was looked up on pealim.com."""
    if is_3ms:
        # 3ms classification wins over every lookup outcome.
        return "3ms", issue_note or "Pu'al/Huf'al 3ms past form"
    if not slug:
        return "NOT_FOUND", "no search result on pealim.com"
    if page_binyan and expected_binyan and page_binyan != expected_binyan:
        return "MISMATCH", f"expected {expected_binyan}, page says {page_binyan}"
    if not page_binyan:
        # The slug exists but no binyan could be read from its page, so the
        # entry is accepted without confirmation; say so in the report.
        return "OK", "binyan not confirmed (page fetch or parse failed)"
    return "OK", ""


def _validate_entry(line_num: int, total: int, word: str) -> dict:
    """Check one list entry, print its progress line, and return its result row."""
    expected_binyan = classify_by_line(line_num)
    issue_type, issue_note = KNOWN_ISSUES.get(word, (None, ""))

    # Positions 37-42 (Pu'al) and 63-70 (Huf'al) are 3ms past forms
    is_3ms = expected_binyan in ("Pu'al", "Huf'al") or issue_type == "3ms"

    print(f"[{line_num:2d}/{total}] {word:<20}", end=" ", flush=True)

    if issue_type == "REVIEW":
        # Don't query pealim for known-bad entries
        print("REVIEW (skipping query)")
        slug = None
        page_binyan = ""
        status, notes = "REVIEW", issue_note
    else:
        # Sleep before each request to stay polite to the site.
        time.sleep(REQUEST_DELAY)
        slug = find_slug(word)

        page_binyan = ""
        if slug:
            time.sleep(REQUEST_DELAY)
            page_binyan = get_page_binyan(slug)

        status, notes = _resolve_status(is_3ms, issue_note, slug, page_binyan, expected_binyan)
        print(f"{status:<12} slug={slug or '-':<35} binyan={page_binyan or '-'}")

    return {
        "line": line_num,
        "word": word,
        "expected_binyan": expected_binyan,
        "slug": slug or "",
        "page_binyan": page_binyan,
        "status": status,
        "notes": notes,
        "is_3ms": is_3ms,
    }


def _build_output_lines(results: list[dict]) -> list[str]:
    """Render the result rows as the lines of verbs_input.txt."""
    sections: dict[str, list[str]] = {b: [] for b in SECTION_HEADERS}
    review_lines: list[str] = []

    for r in results:
        b = r["expected_binyan"]
        if b not in sections:
            # Entries outside every known range are filed under the first section.
            b = next(iter(sections))

        status = r["status"]
        if status == "REVIEW":
            review_lines.append(f"# REVIEW: {r['word']} — {r['notes']}")
        elif status == "3ms":
            sections[b].append(f"# 3ms: {r['word']}")
        elif status in ("OK", "MISMATCH"):
            sections[b].append(r["word"])
        else:  # NOT_FOUND
            sections[b].append(f"# NOT_FOUND: {r['word']} — {r['notes']}")

    output_lines = [
        "# Verb list — validated against pealim.com from nevo_typed_verbs_from_modern_hebrew",
        "# Lines prefixed '# 3ms:' are searched by 3ms past form (Pu'al/Huf'al).",
        "# Lines prefixed '# REVIEW:' need manual correction before conjugation extraction.",
        "# Lines prefixed '# NOT_FOUND:' had no pealim.com result — check spelling.",
        "",
    ]
    for binyan, header in SECTION_HEADERS.items():
        if sections[binyan]:
            output_lines.append(header)
            output_lines.extend(sections[binyan])
            output_lines.append("")

    if review_lines:
        output_lines.append("# ── Entries flagged for manual review ──────────────────────────────────────────")
        output_lines.extend(review_lines)
        output_lines.append("")

    return output_lines


def _print_report(results: list[dict]) -> None:
    """Print the per-entry validation table followed by the status summary."""
    print("\n" + "=" * 95)
    print("VALIDATION REPORT")
    print("=" * 95)
    print(f"{'#':>4} {'Verb':<22} {'Status':<14} {'Slug':<38} {'Binyan':<12} Notes")
    print("-" * 95)
    for r in results:
        print(
            f"{r['line']:>4} {r['word']:<22} {r['status']:<14} "
            f"{r['slug'][:36]:<38} {r['page_binyan'] or '-':<12} {r['notes']}"
        )
    print("=" * 95)

    # Tally every status in a single pass over the results.
    counts = dict.fromkeys(("OK", "3ms", "MISMATCH", "REVIEW", "NOT_FOUND"), 0)
    for r in results:
        if r["status"] in counts:
            counts[r["status"]] += 1

    print(
        f"\nSummary: {counts['OK']} OK | {counts['3ms']} 3ms-past | "
        f"{counts['MISMATCH']} MISMATCH | {counts['REVIEW']} REVIEW | {counts['NOT_FOUND']} NOT_FOUND"
    )
    print(f"Total entries: {len(results)}")

    if counts["REVIEW"] > 0 or counts["NOT_FOUND"] > 0 or counts["MISMATCH"] > 0:
        print("\n⚠ Review flagged entries in verbs_input.txt before running:\n python3 conjugation_extract.py")


def main() -> None:
    """Validate every entry of SOURCE_FILE and write the cleaned verb list.

    Reads the non-blank lines of ``SOURCE_FILE``, checks each one (known
    issues, pealim.com search, binyan confirmation), writes the grouped result
    to ``OUTPUT_FILE`` and prints a report table with a status summary.

    Exits with status 1 if ``SOURCE_FILE`` does not exist.
    """
    if not SOURCE_FILE.exists():
        print(f"ERROR: {SOURCE_FILE} not found", file=sys.stderr)
        sys.exit(1)

    # Positions are counted over non-blank lines only, so blank lines in the
    # source file do not shift the binyan ranges.
    raw_lines = SOURCE_FILE.read_text(encoding="utf-8").splitlines()
    entries = [line.strip() for line in raw_lines if line.strip()]
    print(f"Loaded {len(entries)} entries from {SOURCE_FILE.name}")
    print(f"Querying pealim.com (delay {REQUEST_DELAY}s per request)…\n")

    results = []
    for line_num, word in enumerate(entries, start=1):
        results.append(_validate_entry(line_num, len(entries), word))

    # ── Write cleaned verbs_input.txt ────────────────────────────────────────────
    OUTPUT_FILE.write_text("\n".join(_build_output_lines(results)), encoding="utf-8")
    print(f"\nWrote → {OUTPUT_FILE}")

    # ── Print summary table ──────────────────────────────────────────────────────
    _print_report(results)
|
|
|
|
|
|
# Run the validation only when this file is executed as a script, so that
# importing the module does not trigger any network requests.
if __name__ == "__main__":
    main()
|