Implements four major improvements to the Pealim Anki deck pipeline:
1. Automated .apkg generation (genanki) — no more manual Anki Desktop step.
Both vocabulary and conjugation decks are built programmatically.
2. Word frequency ranking from hermitdave/FrequencyWords he_50k corpus.
Notes sorted by rank so Anki presents most common words first.
3. Example sentences from Ben Yehuda public domain corpus (not pealim.com).
Downloads txt_stripped.zip, indexes 25k texts, ~89% coverage on test set.
4. Conjugation drill deck — one card per form × verb.
Input: verbs_input.txt (Hebrew infinitives). Initial set: 7 verbs (one
per binyan). Extracts 28 forms each via pealim.com/search/ + table parse.
New files:
apkg_builder.py — genanki deck builder for both decks
benyehuda.py — Ben Yehuda corpus downloader + sentence indexer
frequency_lookup.py — FrequencyWords downloader + rank lookup
verbs_input.txt — verb input list (7 test verbs, one per binyan)
data/ — baseline CSVs + generated caches
Updated:
conjugation_extract.py — rewritten: reads verbs_input.txt, searches
/search/?q= for slug, parses table by row labels
requirements.txt — add genanki, beautifulsoup4, lxml
run.py — full orchestration pipeline with CLI flags
.gitignore — exclude venv/, benyehuda_index.json, audio/, output/
CLI:
python run.py --skip-scrape --skip-audio --test 20 (quick test)
python run.py --skip-scrape (full build)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
408 lines
13 KiB
Python
Executable file
408 lines
13 KiB
Python
Executable file
#!/usr/bin/env python3
"""
Extract Hebrew verb conjugations from pealim.com.

Input:  verbs_input.txt (one Hebrew infinitive per line)
Output: data/conjugations.json

For each verb:
  1. Search pealim.com/search/?q=<verb> to find the URL slug
  2. Fetch /dict/<slug>/ with the hebstyle=mo cookie
  3. Parse the conjugation table by row labels

Resume-safe: verbs already in conjugations.json are skipped.
"""

import json
|
|
import logging
|
|
import re
|
|
import time
|
|
import urllib.parse
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
logger = logging.getLogger(__name__)

# Site root; search and dictionary URLs are built on top of it.
PEALIM_BASE = "https://www.pealim.com"

# Pause (seconds) taken before each HTTP request, and the per-request timeout.
REQUEST_DELAY = 1.5
REQUEST_TIMEOUT = 15

# Input list and output cache both live next to this module.
_MODULE_DIR = Path(__file__).parent
VERBS_INPUT = _MODULE_DIR / "verbs_input.txt"
CONJUGATIONS_PATH = _MODULE_DIR.joinpath("data", "conjugations.json")
|
|
|
|
# Pronoun labels (for card front display).
# Past and future share the same pronouns up to the 3rd person singular;
# only the 3rd person plural differs (one merged past form, two future forms).
_SHARED_PRONOUNS = {
    "1s": "אֲנִי",
    "1p": "אֲנַחְנוּ",
    "2ms": "אַתָּה",
    "2fs": "אַתְּ",
    "2mp": "אַתֶּם",
    "2fp": "אַתֶּן",
    "3ms": "הוּא",
    "3fs": "הִיא",
}

PRONOUN_LABELS = {
    # Present-tense forms carry no pronoun.
    **dict.fromkeys(("present_ms", "present_fs", "present_mp", "present_fp"), ""),
    **{f"past_{person}": pronoun for person, pronoun in _SHARED_PRONOUNS.items()},
    "past_3p": "הֵם / הֵן",
    **{f"future_{person}": pronoun for person, pronoun in _SHARED_PRONOUNS.items()},
    "future_3mp": "הֵם",
    "future_3fp": "הֵן",
    # Imperatives reuse the 2nd-person pronouns ("2ms" -> "imperative_ms", ...).
    **{
        f"imperative_{person[1:]}": pronoun
        for person, pronoun in _SHARED_PRONOUNS.items()
        if person.startswith("2")
    },
    "infinitive": "",
}
|
|
|
|
# Human-readable tense description for card front.
# Only the present tense spells out gender/number; every other tense uses a
# single label for all of its persons.
_PRESENT_DESCRIPTIONS = {
    "present_ms": "הוֹוֶה (זכר יחיד)",
    "present_fs": "הוֹוֶה (נקבה יחיד)",
    "present_mp": "הוֹוֶה (זכר רבים)",
    "present_fp": "הוֹוֶה (נקבה רבים)",
}
_PAST_PERSONS = ("1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3p")
_FUTURE_PERSONS = ("1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp")
_IMPERATIVE_PERSONS = ("ms", "fs", "mp", "fp")

TENSE_DESCRIPTION = {
    **_PRESENT_DESCRIPTIONS,
    **{f"past_{person}": "עָבָר" for person in _PAST_PERSONS},
    **{f"future_{person}": "עָתִיד" for person in _FUTURE_PERSONS},
    **{f"imperative_{person}": "צִוּוּי" for person in _IMPERATIVE_PERSONS},
    # NOTE(review): the standalone noun is usually vocalised with a kamatz
    # under the mem; confirm the intended spelling before changing it.
    "infinitive": "מְקוֹר",
}
|
|
|
|
# One shared HTTP session, so every request is sent with the same User-Agent.
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (compatible; pealim-anki/2.0)"
|
|
|
|
|
|
def _find_slug(infinitive: str) -> str | None:
    """Search pealim.com for a verb and return its dictionary URL slug.

    Requests ``/search/?q=<infinitive>`` and scans the response body for
    links shaped like ``/dict/<digits>-<name>/`` (e.g. ``/dict/2255-lishmor/``).
    The first such link in the page is taken as the answer.

    Args:
        infinitive: Hebrew infinitive to look up; it is percent-encoded
            before being placed in the query string.

    Returns:
        The slug without the surrounding ``/dict/`` and ``/`` (for example
        ``"2255-lishmor"``), or ``None`` when the HTTP request fails or the
        page contains no dictionary link.
    """
    url = f"{PEALIM_BASE}/search/?q={urllib.parse.quote(infinitive)}"
    # Only the network call can fail here, so only transport/HTTP errors are
    # caught; anything else is a programming error and should surface.
    try:
        resp = session.get(url, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
    except requests.RequestException as e:
        logger.error(f" Error searching for '{infinitive}': {e}")
        return None

    # Slugs look like /dict/2255-lishmor/
    match = re.search(r"/dict/(\d+[-][^/?\"<>\s]+)/", resp.text)
    if match is None:
        return None

    slug = match.group(1)
    logger.info(f" Slug: {slug}")
    return slug
|
|
|
|
|
|
def _is_passive_binyan(binyan: str) -> bool:
    """Tell whether *binyan* names a passive binyan (Pu'al or Huf'al).

    The check is a case-insensitive substring match against the Hebrew
    names and the Latin transliterations listed below.
    """
    passive_markers = ("פֻּעַל", "הֻפְעַל", "Pu'al", "Huf'al", "pual", "hufal")
    lowered = binyan.lower()
    return any(marker.lower() in lowered for marker in passive_markers)
|
|
|
|
|
|
def _get_menukad(cell) -> str:
    """Return the vocalised (nikkud) Hebrew text held by a table cell.

    The stripped text of the first ``<span class="menukad">`` inside *cell*
    is returned whenever such a span exists. Otherwise the cell's whole
    stripped text is used, but only if it contains at least one Hebrew
    letter (U+05D0-U+05EA); cells without Hebrew yield "".
    """
    menukad_span = cell.find("span", class_="menukad")
    if not menukad_span:
        # Fallback: accept the raw cell text when it is Hebrew at all.
        cell_text = cell.get_text(strip=True)
        contains_hebrew = re.search(r"[\u05d0-\u05ea]", cell_text) is not None
        return cell_text if contains_hebrew else ""
    return menukad_span.get_text(strip=True)
|
|
|
|
|
|
# Tense names looked for (case-insensitively) in each row's text, in the
# order in which they are tested.
_TENSE_LABELS = ("present", "past", "future", "imperative", "infinitive")

# One entry per table row that carries forms:
#   (tense label, row offset from the labelled row, form keys, dedupe)
# ``dedupe`` is set for rows whose cells span two columns, so the repeated
# copies produced by colspan expansion collapse back to one form per key.
_TABLE_LAYOUT = (
    ("present", 0, ("present_ms", "present_fs", "present_mp", "present_fp"), False),
    ("past", 0, ("past_1s", "past_1p"), True),
    ("past", 1, ("past_2ms", "past_2fs", "past_2mp", "past_2fp"), False),
    ("past", 2, ("past_3ms", "past_3fs", "past_3p"), True),
    ("future", 0, ("future_1s", "future_1p"), True),
    ("future", 1, ("future_2ms", "future_2fs", "future_2mp", "future_2fp"), False),
    ("future", 2, ("future_3ms", "future_3fs", "future_3mp", "future_3fp"), False),
    ("imperative", 0, ("imperative_ms", "imperative_fs", "imperative_mp", "imperative_fp"), False),
    ("infinitive", 0, ("infinitive",), False),
)


def _row_hebrew_forms(row) -> list[str]:
    """Return the Hebrew forms of one table row, skipping label cells.

    A cell contributes its form once per column it spans (``colspan``), so
    the result lines up with the table's gender/number columns. Cells whose
    text holds no Hebrew letter are ignored.
    """
    found: list[str] = []
    for cell in row.find_all(["th", "td"]):
        text = _get_menukad(cell)
        if not text or not re.search(r"[\u05d0-\u05ea]", text):
            continue
        found.extend([text] * int(cell.get("colspan", 1)))
    return found


def _parse_table(soup: BeautifulSoup) -> dict[str, str]:
    """Parse the pealim conjugation table into a form_key -> Hebrew form mapping.

    Expected table structure (rows after two header rows), as documented by
    the original author:

        Row 2  (Present): [label x2] [ms] [fs] [mp] [fp]
        Row 3  (Past 1):  [Past x1] [1st x1] [1s x2] [1p x2]
        Row 4  (Past 2):  [2nd x1] [2ms] [2fs] [2mp] [2fp]
        Row 5  (Past 3):  [3rd x1] [3ms] [3fs] [3p x2]
        Row 6  (Fut 1):   [Future x1] [1st x1] [1s x2] [1p x2]
        Row 7  (Fut 2):   [2nd x1] [2ms] [2fs] [2mp] [2fp]
        Row 8  (Fut 3):   [3rd x1] [3ms] [3fs] [3mp] [3fp]
        Row 9  (Imp):     [Imp x2] [ms] [fs] [mp] [fp]
        Row 10 (Inf):     [Inf x2] [form x4]

    Rows are located by their tense label rather than by fixed index: the
    first row whose text mentions a tense starts that tense, and the 2nd and
    3rd person rows are taken to be the two rows that follow it.

    Args:
        soup: Parsed dictionary page.

    Returns:
        Mapping such as ``{"past_1s": ..., "infinitive": ...}`` holding only
        the forms that were found. Empty when the page has no
        ``table.conjugation-table`` or the table has fewer than 9 rows.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    rows = table.find_all("tr")
    if len(rows) < 9:
        return {}

    # Find the first row mentioning each tense. A row claims at most one
    # tense: the first label it contains that has not been claimed yet.
    tense_rows: dict[str, int] = {}
    for idx, row in enumerate(rows):
        label = row.get_text(" ", strip=True).lower()
        for tense in _TENSE_LABELS:
            if tense in label and tense not in tense_rows:
                tense_rows[tense] = idx
                break

    forms: dict[str, str] = {}
    for tense, offset, keys, dedupe in _TABLE_LAYOUT:
        base = tense_rows.get(tense)
        if base is None or base + offset >= len(rows):
            continue
        values = _row_hebrew_forms(rows[base + offset])
        if dedupe:
            # Order-preserving removal of repeated forms (not just adjacent
            # ones), undoing the duplication introduced by colspan expansion.
            values = list(dict.fromkeys(values))
        # zip() stops at the shorter side, so missing cells leave keys unset.
        forms.update(zip(keys, values))

    return forms
|
|
|
|
|
|
def _extract_conjugations(slug: str, infinitive: str) -> dict | None:
    """Fetch ``/dict/<slug>/`` and build the conjugation record for one verb.

    The page is requested with the ``hebstyle=mo`` cookie and parsed with
    the ``lxml`` parser.

    Args:
        slug: Dictionary slug as returned by ``_find_slug``.
        infinitive: The infinitive exactly as given in the input file; it is
            stored in the record and used as the fallback reference form.

    Returns:
        ``None`` when the request fails or no forms could be parsed,
        otherwise a dict with the keys ``infinitive``, ``slug``, ``root``,
        ``binyan``, ``is_passive``, ``reference_form`` and ``forms``. Each
        entry of ``forms`` maps a form key to ``{"form", "pronoun", "tense"}``.
    """
    url = f"{PEALIM_BASE}/dict/{slug}/"
    # Only transport/HTTP failures are expected here; let real bugs surface.
    try:
        resp = session.get(url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
    except requests.RequestException as e:
        logger.error(f" Error fetching {url}: {e}")
        return None

    soup = BeautifulSoup(resp.text, "lxml")

    # Root: the first vocalised span that holds Hebrew text and a "-".
    root = ""
    for span in soup.find_all("span", class_="menukad"):
        txt = span.get_text(strip=True)
        if txt and re.search(r"[\u05d0-\u05ea]", txt) and "-" in txt:
            root = txt
            break

    # Binyan: the first known name mentioned in the og:description meta tag.
    # Matching ignores case (as _is_passive_binyan does) so a differently
    # capitalised description is still recognised; the canonical spelling
    # from the list below is what gets stored.
    binyan = ""
    meta = soup.find("meta", {"property": "og:description"})
    if meta:
        desc = meta.get("content", "").lower()
        for bname in ["Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al"]:
            if bname.lower() in desc:
                binyan = bname
                break

    forms = _parse_table(soup)
    if not forms:
        logger.warning(f" No forms found for {slug}")
        return None

    # Passive verbs are referenced by their past 3ms form, all others by the
    # parsed infinitive; either way the input infinitive is the fallback.
    is_passive = _is_passive_binyan(binyan)
    reference_key = "past_3ms" if is_passive else "infinitive"
    reference_form = forms.get(reference_key, infinitive)

    result = {
        "infinitive": infinitive,
        "slug": slug,
        "root": root,
        "binyan": binyan,
        "is_passive": is_passive,
        "reference_form": reference_form,
        "forms": {
            key: {
                "form": form,
                "pronoun": PRONOUN_LABELS[key],
                "tense": TENSE_DESCRIPTION.get(key, ""),
            }
            for key, form in forms.items()
            if key in PRONOUN_LABELS
        },
    }

    logger.info(f" Extracted {len(result['forms'])} forms for {infinitive}")
    return result
|
|
|
|
|
|
def _load_conjugations() -> dict:
    """Return the cached conjugations, or an empty dict if no cache file exists."""
    if not CONJUGATIONS_PATH.exists():
        return {}
    return json.loads(CONJUGATIONS_PATH.read_text(encoding="utf-8"))
|
|
|
|
|
|
def _save_conjugations(data: dict) -> None:
    """Write *data* to the conjugations cache as UTF-8 JSON.

    The parent directory is created when missing. The JSON is first written
    to a sibling ``<name>.tmp`` file and then moved over the real cache, so
    an interrupted run cannot leave a truncated file behind for the next
    resume to trip over.

    Args:
        data: Mapping of infinitive to conjugation record (or ``None``).
    """
    CONJUGATIONS_PATH.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = CONJUGATIONS_PATH.with_name(CONJUGATIONS_PATH.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    # Path.replace overwrites the destination in a single rename step.
    tmp_path.replace(CONJUGATIONS_PATH)
|
|
|
|
|
|
def main(verbs_file: Path = VERBS_INPUT) -> dict:
    """Read verbs from *verbs_file* and extract their conjugations.

    Blank lines and lines whose first non-blank character is ``#`` are
    ignored. Verbs that already have a record in the cache are skipped;
    verbs whose earlier lookup failed (cached as ``None``) are tried again,
    so a transient network error does not exclude a verb for good. The cache
    is saved after every verb, which keeps an interrupted run resumable.

    Args:
        verbs_file: Text file with one Hebrew infinitive per line.

    Returns:
        The full conjugations dict (cached plus newly fetched). A verb maps
        to ``None`` when no slug or no forms could be obtained for it. When
        *verbs_file* does not exist the cached dict is returned unchanged.
    """
    if not verbs_file.exists():
        logger.warning(f"verbs_input.txt not found at {verbs_file} — skipping")
        return _load_conjugations()

    # utf-8-sig drops a leading BOM that would otherwise stick to the first
    # verb; stripping before the "#" test lets indented comments be skipped.
    raw_lines = verbs_file.read_text(encoding="utf-8-sig").splitlines()
    stripped_lines = (line.strip() for line in raw_lines)
    verbs = [line for line in stripped_lines if line and not line.startswith("#")]
    logger.info(f"Loaded {len(verbs)} verbs from {verbs_file}")

    conjugations = _load_conjugations()
    new_count = 0

    for verb in verbs:
        if conjugations.get(verb) is not None:
            logger.info(f"Skipping {verb} (cached)")
            continue

        logger.info(f"Processing: {verb}")
        time.sleep(REQUEST_DELAY)
        slug = _find_slug(verb)
        if not slug:
            logger.warning(f" No slug found for {verb}")
            data = None
        else:
            time.sleep(REQUEST_DELAY)
            data = _extract_conjugations(slug, verb)

        # Failures are recorded as None so the verb still appears in the output.
        conjugations[verb] = data
        _save_conjugations(conjugations)
        if data is not None:
            new_count += 1

    logger.info(f"Done: {new_count} new verbs processed")
    return conjugations
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: run the extraction, then print a short per-verb summary.
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
    result = main()
    for verb, data in result.items():
        if not data:
            print(f"{verb}: no data")
            continue
        forms = data.get("forms", {})
        print(f"{verb}: {len(forms)} forms, binyan={data.get('binyan')}")
        # Show only the first three forms as a sample.
        preview = list(forms.items())[:3]
        for key, entry in preview:
            print(f" {key}: {entry['form']}")
|