- Cloze card pipeline: 924 cards from 2,296 AI-vetted Hebrew book sentences - Plurals deck: 375 notes (144 irregular + 231 regular from 86 mishkal patterns) - Ktiv male forms expanded to 20,711 entries for sentence matching - Project reorg: helpers.py (deduped strip_nikkud from 10 files), scripts/ for one-off tools, tests/ with smoke tests, deleted 3 dead files - Lint tooling: pyproject.toml with ruff/vulture/bandit/pytest config, .editorconfig, fixed all 129 ruff errors (B023 closure fix, SIM103, unused vars) - validate_apkg.py: card count range check for optional cloze template - Data caches committed: vetted_sentences, ktiv_male_forms, noun_plurals, noun_slug_map, vocab_sentence_matches, epub_sentence_index Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
237 lines
7.3 KiB
Python
237 lines
7.3 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Scrape ktiv male (plene/vowelless) forms from pealim.com.
|
|
|
|
Uses hebstyle=vl cookie to get vowelless writing with matres lectionis.
|
|
Builds a lookup: ktiv_male_form → [{word_nikkud, form_type, pos, slug}]
|
|
|
|
This enables matching Hebrew text (which is normally in ktiv male)
|
|
against our vocabulary, including conjugated verbs and noun plurals.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import sys
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Timestamped INFO-level logging for the whole script run.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# The data directory sits next to this script's parent directory.
DATA_DIR = Path(__file__).resolve().parent.parent.joinpath("data")
OUTPUT_PATH = DATA_DIR.joinpath("ktiv_male_forms.json")

# Sent with every request; hebstyle=vl selects vowelless (ktiv male) writing.
COOKIES = dict(translit="none", hebstyle="vl")

REQUEST_TIMEOUT = 15  # passed as the requests timeout
DELAY = 1.5  # seconds between requests
|
|
|
|
|
|
def fetch_verb_ktiv_male(slug: str, infinitive_nikkud: str) -> list[dict]:
    """Fetch all conjugated forms in ktiv male for a verb.

    Downloads the pealim.com dictionary page for ``slug`` (sending
    ``COOKIES``) and collects the stripped text of every ``span.menukad``:
    those inside ``div.lead`` are recorded with form type ``"infinitive"``,
    those inside the rows of ``table.conjugation-table`` with form type
    ``"conjugation"``.

    Args:
        slug: Path component of the dictionary page URL.
        infinitive_nikkud: Value stored as ``word_nikkud`` on every entry.

    Returns:
        A list of dicts with the keys ``ktiv_male``, ``word_nikkud``,
        ``form_type``, ``pos`` and ``slug``. The list is empty when the page
        has no conjugation table (the lead section is not read in that case).
        Conjugation entries never repeat a form already collected.

    Raises:
        requests.RequestException: If the request fails or the server
            answers with an error status.
    """
    url = f"https://www.pealim.com/dict/{slug}/"
    resp = requests.get(url, cookies=COOKIES, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    forms: list[dict] = []
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return forms

    # Forms collected so far, kept as a set so the duplicate check below is a
    # constant-time lookup instead of a rescan of ``forms`` for every span.
    seen: set[str] = set()

    # Also get the infinitive from the page
    lead = soup.find("div", class_="lead")
    if lead:
        for span in lead.find_all("span", class_="menukad"):
            ktiv = span.text.strip()
            if ktiv:
                forms.append(
                    {
                        "ktiv_male": ktiv,
                        "word_nikkud": infinitive_nikkud,
                        "form_type": "infinitive",
                        "pos": "Verb",
                        "slug": slug,
                    }
                )
                seen.add(ktiv)

    for row in table.find_all("tr"):
        for span in row.find_all("span", class_="menukad"):
            ktiv = span.text.strip()
            # Skip empty cells and anything already recorded.
            if ktiv and ktiv not in seen:
                seen.add(ktiv)
                forms.append(
                    {
                        "ktiv_male": ktiv,
                        "word_nikkud": infinitive_nikkud,
                        "form_type": "conjugation",
                        "pos": "Verb",
                        "slug": slug,
                    }
                )

    return forms
|
|
|
|
|
|
def fetch_noun_ktiv_male(slug: str, singular_nikkud: str, gender: str) -> list[dict]:
    """Fetch noun declension forms in ktiv male.

    Downloads the pealim.com dictionary page for ``slug`` and reads every
    non-empty ``span.menukad`` inside ``table.conjugation-table`` in document
    order. The first four forms are labelled absolute singular, absolute
    plural, construct singular and construct plural; any further form is
    labelled ``"other"``.

    Args:
        slug: Path component of the dictionary page URL.
        singular_nikkud: Value stored as ``word_nikkud`` on every entry.
        gender: Value stored as ``gender`` on every entry.

    Returns:
        A list of dicts with the keys ``ktiv_male``, ``word_nikkud``,
        ``form_type``, ``pos``, ``slug`` and ``gender``; empty when the page
        has no such table.
    """
    page = requests.get(
        f"https://www.pealim.com/dict/{slug}/",
        cookies=COOKIES,
        timeout=REQUEST_TIMEOUT,
    )
    page.raise_for_status()

    parsed = BeautifulSoup(page.text, "html.parser")
    declension = parsed.find("table", class_="conjugation-table")
    if not declension:
        return []

    labels = ("absolute_singular", "absolute_plural", "construct_singular", "construct_plural")

    # Gather the non-empty span texts first; only these consume a label slot.
    texts = []
    for tr in declension.find_all("tr"):
        for cell in tr.find_all("span", class_="menukad"):
            text = cell.text.strip()
            if text:
                texts.append(text)

    return [
        {
            "ktiv_male": text,
            "word_nikkud": singular_nikkud,
            "form_type": labels[idx] if idx < len(labels) else "other",
            "pos": "Noun",
            "slug": slug,
            "gender": gender,
        }
        for idx, text in enumerate(texts)
    ]
|
|
|
|
|
|
def scrape_verbs() -> list[dict]:
    """Scrape ktiv male forms for all verbs in conjugations.json.

    Reads ``DATA_DIR / "conjugations.json"`` (a mapping of verb to a record
    that carries a ``slug``), fetches each distinct slug once via
    ``fetch_verb_ktiv_male`` and sleeps ``DELAY`` seconds after every
    attempt. A verb whose fetch fails is logged and skipped.

    Returns:
        All collected form entries, or an empty list when the input file
        does not exist.
    """
    conj_path = DATA_DIR / "conjugations.json"
    if not conj_path.exists():
        logger.warning("No conjugations.json found")
        return []

    # The data contains Hebrew text: read it as UTF-8 explicitly rather than
    # relying on the platform's default encoding.
    with open(conj_path, encoding="utf-8") as f:
        conjugations = json.load(f)

    all_forms: list[dict] = []
    slugs_done: set[str] = set()

    for verb, data in conjugations.items():
        # Skip empty records and records without a usable slug.
        if not data or not data.get("slug"):
            continue
        slug = data["slug"]
        # Several verbs may share one page; fetch each page only once.
        if slug in slugs_done:
            continue
        slugs_done.add(slug)

        try:
            forms = fetch_verb_ktiv_male(slug, verb)
            all_forms.extend(forms)
            logger.info(f" Verb {verb} ({slug}): {len(forms)} forms")
        except Exception as e:
            # Best effort: one bad page must not abort the whole scrape.
            logger.warning(f" Verb {verb} ({slug}) failed: {e}")

        time.sleep(DELAY)

    return all_forms
|
|
|
|
|
|
def scrape_nouns() -> list[dict]:
    """Scrape ktiv male forms for all nouns in noun_slug_map.json.

    Reads ``DATA_DIR / "noun_slug_map.json"`` and, when present,
    ``DATA_DIR / "noun_plurals.json"``. For every noun with a slug it calls
    ``fetch_noun_ktiv_male`` and sleeps ``DELAY`` seconds after the attempt.
    Whenever a successful fetch brings the attempt counter to a multiple of
    50, progress is logged and the forms gathered so far are written through
    ``_save_forms(..., partial=True)``. A noun whose fetch fails is logged
    and skipped.

    Returns:
        All collected form entries, or an empty list when the slug map does
        not exist.
    """
    slug_path = DATA_DIR / "noun_slug_map.json"
    if not slug_path.exists():
        logger.warning("No noun_slug_map.json found")
        return []

    # The data contains Hebrew text: read it as UTF-8 explicitly rather than
    # relying on the platform's default encoding.
    with open(slug_path, encoding="utf-8") as f:
        slug_map = json.load(f)

    # Also load existing plurals to get nikkud singular form
    plurals_path = DATA_DIR / "noun_plurals.json"
    plurals = {}
    if plurals_path.exists():
        with open(plurals_path, encoding="utf-8") as f:
            plurals = json.load(f)

    all_forms: list[dict] = []
    done = 0
    total = len(slug_map)

    for word, info in slug_map.items():
        slug = info.get("slug", "")
        if not slug:
            continue

        # Get nikkud form from plurals data or slug map; the plurals entry
        # wins when it has a "singular" value.
        nikkud = info.get("word_nikkud", word)
        if word in plurals:
            nikkud = plurals[word].get("singular", nikkud)
        gender = info.get("gender", "")

        try:
            forms = fetch_noun_ktiv_male(slug, nikkud, gender)
            all_forms.extend(forms)
            done += 1
            if done % 50 == 0:
                logger.info(f" Nouns: {done}/{total} ({len(all_forms)} forms)")
                # Save incrementally
                _save_forms(all_forms, partial=True)
        except Exception as e:
            # Best effort: one bad page (or a failed checkpoint write) must
            # not abort the whole scrape.
            logger.warning(f" Noun {word} ({slug}) failed: {e}")
            done += 1

        time.sleep(DELAY)

    return all_forms
|
|
|
|
|
|
def _save_forms(all_forms: list[dict], partial: bool = False) -> None:
    """Build and save the ktiv male lookup dict.

    Groups the entries by their ``ktiv_male`` value and writes the resulting
    mapping as JSON next to ``OUTPUT_PATH``.

    Args:
        all_forms: Form entries; each must contain a ``ktiv_male`` key.
        partial: When true, write to ``OUTPUT_PATH`` with ``.partial``
            appended to the file name instead of to ``OUTPUT_PATH`` itself.

    Raises:
        KeyError: If an entry has no ``ktiv_male`` key.
        OSError: If the output file cannot be written.
    """
    lookup: dict[str, list[dict]] = {}
    for entry in all_forms:
        ktiv = entry["ktiv_male"]
        # Don't include ktiv_male in the stored entry (it's the key)
        stored = {k: v for k, v in entry.items() if k != "ktiv_male"}
        lookup.setdefault(ktiv, []).append(stored)

    suffix = ".partial" if partial else ""
    out = OUTPUT_PATH.parent / (OUTPUT_PATH.name + suffix)
    # ensure_ascii=False emits raw Hebrew characters, so the file must be
    # opened as UTF-8 explicitly; the platform default encoding may be unable
    # to represent them.
    with open(out, "w", encoding="utf-8") as f:
        json.dump(lookup, f, ensure_ascii=False, indent=1)

    logger.info(f" Saved {len(lookup)} unique ktiv male forms → {out}")
|
|
|
|
|
|
def main():
    """Run the scraper from the command line.

    The optional first argument selects what to scrape: ``verbs``, ``nouns``
    or ``all`` (the default). The collected forms are then written through
    ``_save_forms``.

    Raises:
        SystemExit: If the mode argument is not one of the accepted values.
    """
    mode = sys.argv[1] if len(sys.argv) > 1 else "all"

    # Reject unknown modes before doing anything: otherwise neither scraper
    # runs and the existing output file is overwritten with an empty lookup.
    if mode not in ("all", "verbs", "nouns"):
        sys.exit(f"Unknown mode {mode!r}; expected one of: all, verbs, nouns")

    all_forms = []

    if mode in ("all", "verbs"):
        logger.info("=== Scraping verb ktiv male forms ===")
        verb_forms = scrape_verbs()
        all_forms.extend(verb_forms)
        logger.info(f"Verbs done: {len(verb_forms)} forms from {len({f['slug'] for f in verb_forms})} verbs")

    if mode in ("all", "nouns"):
        logger.info("=== Scraping noun ktiv male forms ===")
        noun_forms = scrape_nouns()
        all_forms.extend(noun_forms)
        logger.info(f"Nouns done: {len(noun_forms)} forms")

    _save_forms(all_forms)
    logger.info(f"Total: {len(all_forms)} forms → {OUTPUT_PATH}")
|
|
|
|
|
|
# Run the scraper only when this file is executed as a script.
if __name__ == "__main__":
    main()
|