hebrew_flash_cards/conjugation_extract.py
Sochen b086123bec feat: add apkg builder, frequency, Ben Yehuda examples, conjugation deck
Implements four major improvements to the Pealim Anki deck pipeline:

1. Automated .apkg generation (genanki) — no more manual Anki Desktop step.
   Both vocabulary and conjugation decks are built programmatically.

2. Word frequency ranking from hermitdave/FrequencyWords he_50k corpus.
   Notes sorted by rank so Anki presents most common words first.

3. Example sentences from Ben Yehuda public domain corpus (not pealim.com).
   Downloads txt_stripped.zip, indexes 25k texts, ~89% coverage on test set.

4. Conjugation drill deck — one card per form × verb.
   Input: verbs_input.txt (Hebrew infinitives). Initial set: 7 verbs (one
   per binyan). Extracts 28 forms each via pealim.com/search/ + table parse.

New files:
  apkg_builder.py     — genanki deck builder for both decks
  benyehuda.py        — Ben Yehuda corpus downloader + sentence indexer
  frequency_lookup.py — FrequencyWords downloader + rank lookup
  verbs_input.txt     — verb input list (7 test verbs, one per binyan)
  data/               — baseline CSVs + generated caches

Updated:
  conjugation_extract.py — rewritten: reads verbs_input.txt, searches
                           /search/?q= for slug, parses table by row labels
  requirements.txt       — add genanki, beautifulsoup4, lxml
  run.py                 — full orchestration pipeline with CLI flags
  .gitignore             — exclude venv/, benyehuda_index.json, audio/, output/

CLI:
  python run.py --skip-scrape --skip-audio --test 20  (quick test)
  python run.py --skip-scrape                          (full build)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-03 01:58:31 +00:00

408 lines
13 KiB
Python
Executable file

#!/usr/bin/env python3
"""
Extract Hebrew verb conjugations from pealim.com.
Input: verbs_input.txt (one Hebrew infinitive per line)
Output: data/conjugations.json
For each verb:
1. Search pealim.com/search/?q=<verb> to find URL slug
2. Fetch /dict/<slug>/ with hebstyle=mo cookie
3. Parse conjugation table by row labels
Resume-safe: verbs already present in conjugations.json are skipped. This
includes verbs whose lookup failed: they are stored as null and are not retried.
"""
import json
import logging
import re
import time
import urllib.parse
from pathlib import Path
import requests
from bs4 import BeautifulSoup
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Site root from which the search and dictionary URLs are built.
PEALIM_BASE = "https://www.pealim.com"
# Pause before each HTTP request and per-request timeout (passed to
# time.sleep() and to the requests ``timeout`` argument respectively).
REQUEST_DELAY = 1.5
REQUEST_TIMEOUT = 15

# Input and output files are resolved relative to this module's directory.
_MODULE_DIR = Path(__file__).parent
VERBS_INPUT = _MODULE_DIR / "verbs_input.txt"
CONJUGATIONS_PATH = _MODULE_DIR.joinpath("data", "conjugations.json")
# Pronoun labels (for card front display), keyed by form key.
# Present-tense and infinitive forms carry no pronoun, hence the empty strings.
_I, _WE = "אֲנִי", "אֲנַחְנוּ"
_HE, _SHE = "הוּא", "הִיא"
_THEY_M, _THEY_F = "הֵם", "הֵן"
# Second-person pronouns keyed by gender/number suffix (ms, fs, mp, fp).
_YOU = {"ms": "אַתָּה", "fs": "אַתְּ", "mp": "אַתֶּם", "fp": "אַתֶּן"}

PRONOUN_LABELS = {
    **{f"present_{suffix}": "" for suffix in _YOU},
    "past_1s": _I,
    "past_1p": _WE,
    **{f"past_2{suffix}": pronoun for suffix, pronoun in _YOU.items()},
    "past_3ms": _HE,
    "past_3fs": _SHE,
    "past_3p": "הֵם / הֵן",
    "future_1s": _I,
    "future_1p": _WE,
    **{f"future_2{suffix}": pronoun for suffix, pronoun in _YOU.items()},
    "future_3ms": _HE,
    "future_3fs": _SHE,
    "future_3mp": _THEY_M,
    "future_3fp": _THEY_F,
    **{f"imperative_{suffix}": pronoun for suffix, pronoun in _YOU.items()},
    "infinitive": "",
}
# Human-readable tense description for the card front, keyed by form key.
# Each present-tense entry has its own parenthesised qualifier; every other
# tense shares a single label across all of its persons.
TENSE_DESCRIPTION = {
    "present_ms": "הוֹוֶה (זכר יחיד)",
    "present_fs": "הוֹוֶה (נקבה יחיד)",
    "present_mp": "הוֹוֶה (זכר רבים)",
    "present_fp": "הוֹוֶה (נקבה רבים)",
    **dict.fromkeys(
        (
            "past_1s", "past_1p",
            "past_2ms", "past_2fs", "past_2mp", "past_2fp",
            "past_3ms", "past_3fs", "past_3p",
        ),
        "עָבָר",
    ),
    **dict.fromkeys(
        (
            "future_1s", "future_1p",
            "future_2ms", "future_2fs", "future_2mp", "future_2fp",
            "future_3ms", "future_3fs", "future_3mp", "future_3fp",
        ),
        "עָתִיד",
    ),
    **dict.fromkeys(
        ("imperative_ms", "imperative_fs", "imperative_mp", "imperative_fp"),
        "צִוּוּי",
    ),
    "infinitive": "מְקוֹר",
}
# Shared HTTP session: every request made by this module goes through it and
# therefore sends the User-Agent header set here.
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (compatible; pealim-anki/2.0)"
def _find_slug(infinitive: str) -> str | None:
    """Look up *infinitive* on the pealim.com search page and return its slug.

    The slug is the ``<digits>-<name>`` path component of the first
    ``/dict/.../`` link found in the response body (e.g. ``2255-lishmor``).

    Returns:
        The slug, or ``None`` when the request fails or the response contains
        no such link. Failures are logged, never raised.
    """
    query = urllib.parse.quote(infinitive)
    search_url = f"{PEALIM_BASE}/search/?q={query}"
    try:
        response = session.get(search_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        # Slugs look like /dict/2255-lishmor/; only the first hit is used.
        first_hit = re.search(r"/dict/(\d+[-][^/?\"<>\s]+)/", response.text)
        if first_hit is not None:
            slug = first_hit.group(1)
            logger.info(f" Slug: {slug}")
            return slug
    except Exception as e:
        logger.error(f" Error searching for '{infinitive}': {e}")
    return None
def _is_passive_binyan(binyan: str) -> bool:
    """Return True when *binyan* names one of the passive binyanim.

    The check is a case-insensitive substring match against the Hebrew and
    transliterated spellings of Pu'al and Huf'al listed below.
    """
    passive_markers = ("פֻּעַל", "הֻפְעַל", "Pu'al", "Huf'al", "pual", "hufal")
    lowered = binyan.lower()
    return any(marker.lower() in lowered for marker in passive_markers)
def _get_menukad(cell) -> str:
    """Return the nikkud (vocalised) Hebrew text held by a table cell.

    Resolution order:
      1. the stripped text of the first ``<span class="menukad">`` in the cell;
      2. otherwise the stripped text of the whole cell, provided it contains
         at least one Hebrew letter (U+05D0..U+05EA);
      3. otherwise the empty string.
    """
    menukad_span = cell.find("span", class_="menukad")
    if menukad_span:
        return menukad_span.get_text(strip=True)
    # No dedicated span: fall back to the raw cell text, Hebrew only.
    cell_text = cell.get_text(strip=True)
    return cell_text if re.search(r"[\u05d0-\u05ea]", cell_text) else ""
def _parse_table(soup: BeautifulSoup) -> dict[str, str]:
    """Parse the pealim conjugation table into a ``form_key -> Hebrew form`` map.

    Table structure the parser was written against (rows after two header rows):
        Row 2  (Present): [label x2] [ms] [fs] [mp] [fp]
        Row 3  (Past 1):  [Past x1] [1st x1] [1s x2] [1p x2]
        Row 4  (Past 2):  [2nd x1] [2ms] [2fs] [2mp] [2fp]
        Row 5  (Past 3):  [3rd x1] [3ms] [3fs] [3p x2]
        Row 6  (Fut 1):   [Future x1] [1st x1] [1s x2] [1p x2]
        Row 7  (Fut 2):   [2nd x1] [2ms] [2fs] [2mp] [2fp]
        Row 8  (Fut 3):   [3rd x1] [3ms] [3fs] [3mp] [3fp]
        Row 9  (Imp):     [Imp x2] [ms] [fs] [mp] [fp]
        Row 10 (Inf):     [Inf x2] [form x4]

    Tense rows are located by the English tense word in the row's text, not
    by fixed index. For past and future, the located row is taken as the
    1st-person row and the two rows after it as 2nd and 3rd person.

    Returns:
        Mapping of form keys (``present_ms``, ``past_1s``, ...) to Hebrew
        forms. Forms that cannot be found are omitted. The mapping is empty
        when the table is missing or has fewer than nine rows.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}
    rows = table.find_all("tr")
    # Per the layout above: 2 header rows + present + 3 past + 3 future.
    if len(rows) < 9:
        return {}

    def colspan_of(cell) -> int:
        """Return the cell's colspan, treating a missing or malformed value as 1."""
        try:
            return int(cell.get("colspan", 1))
        except (TypeError, ValueError):
            # Scraped markup: do not let one bad attribute abort the whole parse.
            return 1

    def hebrew_forms(row_idx: int) -> list[str]:
        """Return the Hebrew forms of a row, each repeated ``colspan`` times.

        Cells without Hebrew text (the tense/person label cells) are skipped.
        """
        found: list[str] = []
        for cell in rows[row_idx].find_all(["th", "td"]):
            text = _get_menukad(cell)
            if text and re.search(r"[\u05d0-\u05ea]", text):
                found.extend([text] * colspan_of(cell))
        return found

    def distinct(values: list[str]) -> list[str]:
        """Drop repeated values, keeping first-seen order.

        Used for rows whose cells span two columns, so that the
        colspan-expanded repeats collapse back to one entry per form.
        """
        return list(dict.fromkeys(values))

    forms: dict[str, str] = {}

    def assign(keys, values) -> None:
        """Store values under keys pairwise; surplus keys or values are ignored."""
        for key, value in zip(keys, values):
            if value:
                forms[key] = value

    # Locate the first row mentioning each tense word. A row is claimed by the
    # first word (in this order) that it contains and that is still unlocated.
    tense_words = ("present", "past", "future", "imperative", "infinitive")
    row_of: dict[str, int] = {}
    for idx, row in enumerate(rows):
        row_text = row.get_text(" ", strip=True).lower()
        for word in tense_words:
            if word in row_text and word not in row_of:
                row_of[word] = idx
                break

    # Present tense (4 forms: ms fs mp fp)
    if "present" in row_of:
        assign(
            ("present_ms", "present_fs", "present_mp", "present_fp"),
            hebrew_forms(row_of["present"]),
        )

    # Past tense: 1st-person row, then 2nd- and 3rd-person rows below it.
    if "past" in row_of:
        first = row_of["past"]
        assign(("past_1s", "past_1p"), distinct(hebrew_forms(first)))
        if first + 1 < len(rows):
            assign(
                ("past_2ms", "past_2fs", "past_2mp", "past_2fp"),
                hebrew_forms(first + 1),
            )
        if first + 2 < len(rows):
            # 3ms, 3fs, 3p (3p spans two columns, so it appears twice).
            assign(
                ("past_3ms", "past_3fs", "past_3p"),
                distinct(hebrew_forms(first + 2)),
            )

    # Future tense: same three-row shape, but the 3rd person has four forms.
    if "future" in row_of:
        first = row_of["future"]
        assign(("future_1s", "future_1p"), distinct(hebrew_forms(first)))
        if first + 1 < len(rows):
            assign(
                ("future_2ms", "future_2fs", "future_2mp", "future_2fp"),
                hebrew_forms(first + 1),
            )
        if first + 2 < len(rows):
            assign(
                ("future_3ms", "future_3fs", "future_3mp", "future_3fp"),
                hebrew_forms(first + 2),
            )

    # Imperative
    if "imperative" in row_of:
        assign(
            ("imperative_ms", "imperative_fs", "imperative_mp", "imperative_fp"),
            hebrew_forms(row_of["imperative"]),
        )

    # Infinitive: a single form, so only the first Hebrew value is used.
    if "infinitive" in row_of:
        assign(("infinitive",), hebrew_forms(row_of["infinitive"]))

    return forms
def _extract_conjugations(slug: str, infinitive: str) -> dict | None:
    """Fetch ``/dict/<slug>/`` and build the conjugation record for one verb.

    Args:
        slug: Dictionary slug as returned by ``_find_slug``.
        infinitive: The infinitive as given in the input file. It is stored
            in the record and used as the fallback reference form.

    Returns:
        A dict with keys ``infinitive``, ``slug``, ``root``, ``binyan``,
        ``is_passive``, ``reference_form`` and ``forms``. ``forms`` maps each
        form key to ``{"form", "pronoun", "tense"}``. Returns ``None`` when
        the page cannot be fetched or no forms could be parsed.
    """
    url = f"{PEALIM_BASE}/dict/{slug}/"
    try:
        # The hebstyle=mo cookie is sent with the request, as described in the
        # module docstring.
        resp = session.get(url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
    except Exception as e:
        logger.error(f" Error fetching {url}: {e}")
        return None
    soup = BeautifulSoup(resp.text, "lxml")

    # Root: the first menukad span that contains Hebrew letters and a hyphen.
    root = ""
    for span in soup.find_all("span", class_="menukad"):
        txt = span.get_text(strip=True)
        if txt and re.search(r"[\u05d0-\u05ea]", txt) and "-" in txt:
            root = txt
            break

    # Binyan: the first known name mentioned in the og:description meta tag.
    # Matching ignores case and folds the typographic apostrophe (U+2019) to
    # "'", so spellings such as "PA'AL" are recognised as well. This keeps it
    # consistent with the case-insensitive test in _is_passive_binyan.
    # NOTE(review): the exact spelling used by the site was not verified here.
    binyan = ""
    meta = soup.find("meta", {"property": "og:description"})
    if meta:
        desc = (meta.get("content", "") or "").replace("\u2019", "'").lower()
        for bname in ["Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al"]:
            if bname.lower() in desc:
                binyan = bname
                break

    forms = _parse_table(soup)
    if not forms:
        logger.warning(f" No forms found for {slug}")
        return None

    is_passive = _is_passive_binyan(binyan)
    # Passive binyanim use the past 3ms form as reference instead of the
    # infinitive; both fall back to the infinitive given by the caller.
    if is_passive:
        reference_form = forms.get("past_3ms", infinitive)
    else:
        reference_form = forms.get("infinitive", infinitive)

    result = {
        "infinitive": infinitive,
        "slug": slug,
        "root": root,
        "binyan": binyan,
        "is_passive": is_passive,
        "reference_form": reference_form,
        # Only form keys that have a pronoun label are exported.
        "forms": {
            key: {
                "form": form,
                "pronoun": PRONOUN_LABELS[key],
                "tense": TENSE_DESCRIPTION.get(key, ""),
            }
            for key, form in forms.items()
            if key in PRONOUN_LABELS
        },
    }
    logger.info(f" Extracted {len(result['forms'])} forms for {infinitive}")
    return result
def _load_conjugations() -> dict:
    """Return the parsed contents of CONJUGATIONS_PATH, or ``{}`` if it is absent."""
    if not CONJUGATIONS_PATH.exists():
        return {}
    return json.loads(CONJUGATIONS_PATH.read_text(encoding="utf-8"))
def _save_conjugations(data: dict) -> None:
    """Write *data* to CONJUGATIONS_PATH as indented UTF-8 JSON.

    The JSON is written to a sibling ``<name>.tmp`` file first and then moved
    over the target. ``main`` saves after every verb and relies on this file
    to resume, so an interrupted write must not leave a truncated cache that
    ``_load_conjugations`` would fail to parse.
    """
    CONJUGATIONS_PATH.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = CONJUGATIONS_PATH.with_name(CONJUGATIONS_PATH.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    # Path.replace overwrites an existing target.
    tmp_path.replace(CONJUGATIONS_PATH)
def main(verbs_file: Path = VERBS_INPUT) -> dict:
    """Read verbs from *verbs_file*, extract their conjugations and cache them.

    The file holds one Hebrew infinitive per line. Blank lines and lines whose
    first non-blank character is ``#`` are ignored. Verbs already present in
    the cache are skipped, including failed lookups stored as ``None``. The
    cache is saved after every processed verb.

    Args:
        verbs_file: Path of the verb list. If it does not exist, the current
            cache is returned unchanged.

    Returns:
        The full conjugations dict (verb -> record, or ``None`` on failure).
    """
    if not verbs_file.exists():
        logger.warning(f"verbs_input.txt not found at {verbs_file} — skipping")
        return _load_conjugations()

    # "utf-8-sig" drops a leading BOM if present, so it cannot stick to the
    # first verb. The comment test runs on the stripped line so that indented
    # "#" comments are ignored as well.
    verbs = [
        verb
        for line in verbs_file.read_text(encoding="utf-8-sig").splitlines()
        if (verb := line.strip()) and not verb.startswith("#")
    ]
    logger.info(f"Loaded {len(verbs)} verbs from {verbs_file}")

    conjugations = _load_conjugations()
    new_count = 0
    for verb in verbs:
        if verb in conjugations:
            logger.info(f"Skipping {verb} (cached)")
            continue
        logger.info(f"Processing: {verb}")

        # Sleep before each of the two requests made per verb.
        time.sleep(REQUEST_DELAY)
        slug = _find_slug(verb)
        if not slug:
            logger.warning(f" No slug found for {verb}")
            # Record the failure so the verb is not looked up again on resume.
            conjugations[verb] = None
            _save_conjugations(conjugations)
            continue

        time.sleep(REQUEST_DELAY)
        conjugations[verb] = _extract_conjugations(slug, verb)
        _save_conjugations(conjugations)
        new_count += 1

    logger.info(f"Done: {new_count} new verbs processed")
    return conjugations
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
    # Run the extraction, then print a short summary per verb: number of forms,
    # binyan, and the first three forms.
    for verb, entry in main().items():
        if not entry:
            print(f"{verb}: no data")
            continue
        verb_forms = entry.get("forms", {})
        print(f"{verb}: {len(verb_forms)} forms, binyan={entry.get('binyan')}")
        for form_key, form_info in list(verb_forms.items())[:3]:
            print(f" {form_key}: {form_info['form']}")