hebrew_flash_cards/scripts/scrape_ktiv_male.py
Sochen 17f7458d19 Sprint 9: cloze cards, plurals deck, project reorg, lint tooling
- Cloze card pipeline: 924 cards from 2,296 AI-vetted Hebrew book sentences
- Plurals deck: 375 notes (144 irregular + 231 regular from 86 mishkal patterns)
- Ktiv male forms expanded to 20,711 entries for sentence matching
- Project reorg: helpers.py (deduped strip_nikkud from 10 files), scripts/ for
  one-off tools, tests/ with smoke tests, deleted 3 dead files
- Lint tooling: pyproject.toml with ruff/vulture/bandit/pytest config, .editorconfig,
  fixed all 129 ruff errors (B023 closure fix, SIM103, unused vars)
- validate_apkg.py: card count range check for optional cloze template
- Data caches committed: vetted_sentences, ktiv_male_forms, noun_plurals,
  noun_slug_map, vocab_sentence_matches, epub_sentence_index

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-07 08:09:39 +00:00

237 lines
7.3 KiB
Python

#!/usr/bin/env python3
"""
Scrape ktiv male (plene/vowelless) forms from pealim.com.
Uses hebstyle=vl cookie to get vowelless writing with matres lectionis.
Builds a lookup: ktiv_male_form → [{word_nikkud, form_type, pos, slug}]
This enables matching Hebrew text (which is normally in ktiv male)
against our vocabulary, including conjugated verbs and noun plurals.
"""
import json
import logging
import sys
import time
from pathlib import Path
import requests
from bs4 import BeautifulSoup
# Configure root logging once for this script: timestamp, level, message.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)
# "data" directory located next to the directory that contains this script.
DATA_DIR = Path(__file__).resolve().parent.parent / "data"
# Final lookup file written by _save_forms (partial saves add a ".partial" suffix).
OUTPUT_PATH = DATA_DIR / "ktiv_male_forms.json"
# Cookies sent with every pealim.com request; per the module docstring,
# hebstyle=vl selects vowelless writing. NOTE(review): "translit": "none"
# presumably disables transliteration - confirm against the site.
COOKIES = {"translit": "none", "hebstyle": "vl"}
# Passed as the `timeout=` argument of requests.get.
REQUEST_TIMEOUT = 15
# Seconds to sleep between consecutive requests.
DELAY = 1.5
def fetch_verb_ktiv_male(slug: str, infinitive_nikkud: str) -> list[dict]:
    """Fetch all conjugated forms in ktiv male for a verb.

    Requests ``https://www.pealim.com/dict/{slug}/`` with the module-level
    ``COOKIES`` and collects the stripped text of every ``span.menukad``:
    those inside the first ``div.lead`` are recorded as ``"infinitive"``,
    those inside the first ``table.conjugation-table`` as ``"conjugation"``.

    Args:
        slug: pealim.com dictionary slug used to build the page URL.
        infinitive_nikkud: Value stored as ``word_nikkud`` on every entry.

    Returns:
        A list of dicts with keys ``ktiv_male``, ``word_nikkud``,
        ``form_type``, ``pos`` and ``slug``. Empty when the page has no
        conjugation table. Conjugation entries are unique by ``ktiv_male``
        and never repeat a spelling already recorded as an infinitive.

    Raises:
        requests.HTTPError: If the response status signals an error
            (via ``raise_for_status``). Other ``requests`` exceptions
            (timeouts, connection errors) propagate unchanged.
    """
    url = f"https://www.pealim.com/dict/{slug}/"
    resp = requests.get(url, cookies=COOKIES, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    table = soup.find("table", class_="conjugation-table")
    if not table:
        return []

    def make_entry(ktiv: str, form_type: str) -> dict:
        return {
            "ktiv_male": ktiv,
            "word_nikkud": infinitive_nikkud,
            "form_type": form_type,
            "pos": "Verb",
            "slug": slug,
        }

    forms: list[dict] = []
    # Spellings already emitted. Maintained incrementally so the membership
    # test below is O(1) instead of rebuilding a set from `forms` per span.
    seen: set[str] = set()

    # Also get the infinitive from the page header. Infinitive spans are not
    # de-duplicated among themselves, only tracked so the table loop skips them.
    lead = soup.find("div", class_="lead")
    if lead:
        for span in lead.find_all("span", class_="menukad"):
            ktiv = span.text.strip()
            if ktiv:
                seen.add(ktiv)
                forms.append(make_entry(ktiv, "infinitive"))

    for row in table.find_all("tr"):
        for span in row.find_all("span", class_="menukad"):
            ktiv = span.text.strip()
            if ktiv and ktiv not in seen:
                seen.add(ktiv)
                forms.append(make_entry(ktiv, "conjugation"))
    return forms
def fetch_noun_ktiv_male(slug: str, singular_nikkud: str, gender: str) -> list[dict]:
    """Download a noun's pealim.com page and list its ktiv male declension forms.

    The text of each non-empty ``span.menukad`` inside the first
    ``table.conjugation-table`` becomes one entry. The first four entries are
    labelled, in document order, ``absolute_singular``, ``absolute_plural``,
    ``construct_singular`` and ``construct_plural``; any further ones are
    labelled ``other``.

    Args:
        slug: pealim.com dictionary slug used to build the page URL.
        singular_nikkud: Value stored as ``word_nikkud`` on every entry.
        gender: Value stored as ``gender`` on every entry.

    Returns:
        A list of dicts with keys ``ktiv_male``, ``word_nikkud``,
        ``form_type``, ``pos``, ``slug`` and ``gender``; empty when the page
        has no declension table.

    Raises:
        requests.HTTPError: If ``raise_for_status`` rejects the response.
    """
    response = requests.get(
        f"https://www.pealim.com/dict/{slug}/",
        cookies=COOKIES,
        timeout=REQUEST_TIMEOUT,
    )
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    declension = page.find("table", class_="conjugation-table")
    if not declension:
        return []

    labels = ("absolute_singular", "absolute_plural", "construct_singular", "construct_plural")

    # Flatten the table to its non-empty span texts, in document order; the
    # position in this list decides which label a form receives.
    stripped = (
        cell.text.strip()
        for tr in declension.find_all("tr")
        for cell in tr.find_all("span", class_="menukad")
    )
    spellings = [text for text in stripped if text]

    return [
        {
            "ktiv_male": spelling,
            "word_nikkud": singular_nikkud,
            "form_type": labels[position] if position < len(labels) else "other",
            "pos": "Noun",
            "slug": slug,
            "gender": gender,
        }
        for position, spelling in enumerate(spellings)
    ]
def scrape_verbs() -> list[dict]:
    """Scrape ktiv male forms for all verbs in conjugations.json.

    Reads ``DATA_DIR / "conjugations.json"`` (a mapping of verb -> data dict),
    fetches each distinct ``slug`` once via ``fetch_verb_ktiv_male`` and
    sleeps ``DELAY`` seconds after every attempt.

    Returns:
        All entries returned by the successful fetches, concatenated in
        iteration order. Empty if conjugations.json does not exist.
    """
    conj_path = DATA_DIR / "conjugations.json"
    if not conj_path.exists():
        logger.warning("No conjugations.json found")
        return []
    # The file holds Hebrew text: decode explicitly as UTF-8 rather than with
    # the platform's locale encoding, which is not UTF-8 everywhere.
    with open(conj_path, encoding="utf-8") as f:
        conjugations = json.load(f)

    all_forms: list[dict] = []
    slugs_done: set[str] = set()
    for verb, data in conjugations.items():
        # Skip verbs with no data or no slug to look up.
        if not data or not data.get("slug"):
            continue
        slug = data["slug"]
        # Several verbs may share one slug; fetch each page only once.
        if slug in slugs_done:
            continue
        slugs_done.add(slug)
        try:
            forms = fetch_verb_ktiv_male(slug, verb)
            all_forms.extend(forms)
            logger.info(f" Verb {verb} ({slug}): {len(forms)} forms")
        except Exception as e:
            # Deliberate best-effort: one bad page must not abort the run.
            logger.warning(f" Verb {verb} ({slug}) failed: {e}")
        # Throttle after every attempt, successful or not.
        time.sleep(DELAY)
    return all_forms
def scrape_nouns() -> list[dict]:
    """Scrape ktiv male forms for all nouns in noun_slug_map.json.

    Reads ``DATA_DIR / "noun_slug_map.json"`` (word -> info dict) and, when
    present, ``DATA_DIR / "noun_plurals.json"`` to pick the vocalized singular.
    Each noun with a slug is fetched via ``fetch_noun_ktiv_male``, followed by
    a ``DELAY``-second sleep. After every 50th processed noun (counting
    failures, but only checked on a successful fetch) progress is logged and
    a ``.partial`` snapshot is written with ``_save_forms``.

    Returns:
        All entries returned by the successful fetches, concatenated in
        iteration order. Empty if noun_slug_map.json does not exist.
    """
    slug_path = DATA_DIR / "noun_slug_map.json"
    if not slug_path.exists():
        logger.warning("No noun_slug_map.json found")
        return []
    # Both inputs contain Hebrew text: decode explicitly as UTF-8 instead of
    # relying on the platform's locale encoding.
    with open(slug_path, encoding="utf-8") as f:
        slug_map = json.load(f)

    # Also load existing plurals to get nikkud singular form
    plurals_path = DATA_DIR / "noun_plurals.json"
    plurals = {}
    if plurals_path.exists():
        with open(plurals_path, encoding="utf-8") as f:
            plurals = json.load(f)

    all_forms: list[dict] = []
    done = 0
    # Note: counts every map entry, including ones skipped for lacking a slug.
    total = len(slug_map)
    for word, info in slug_map.items():
        slug = info.get("slug", "")
        if not slug:
            continue
        # Get nikkud form from plurals data or slug map
        nikkud = info.get("word_nikkud", word)
        if word in plurals:
            nikkud = plurals[word].get("singular", nikkud)
        gender = info.get("gender", "")
        try:
            forms = fetch_noun_ktiv_male(slug, nikkud, gender)
            all_forms.extend(forms)
            done += 1
            if done % 50 == 0:
                logger.info(f" Nouns: {done}/{total} ({len(all_forms)} forms)")
                # Save incrementally
                _save_forms(all_forms, partial=True)
        except Exception as e:
            # Deliberate best-effort: one bad page must not abort the run.
            logger.warning(f" Noun {word} ({slug}) failed: {e}")
            done += 1
        # Throttle after every attempt, successful or not.
        time.sleep(DELAY)
    return all_forms
def _save_forms(all_forms: list[dict], partial: bool = False) -> None:
    """Build and save the ktiv male lookup dict.

    Groups the entries by their ``ktiv_male`` spelling and writes the result
    as JSON next to ``OUTPUT_PATH``.

    Args:
        all_forms: Entries, each containing at least a ``ktiv_male`` key.
        partial: When true, write to ``OUTPUT_PATH`` with a ``.partial``
            suffix appended instead of to ``OUTPUT_PATH`` itself.
    """
    lookup: dict[str, list[dict]] = {}
    for entry in all_forms:
        ktiv = entry["ktiv_male"]
        # Don't include ktiv_male in the stored entry (it's the key)
        stored = {k: v for k, v in entry.items() if k != "ktiv_male"}
        lookup.setdefault(ktiv, []).append(stored)

    suffix = ".partial" if partial else ""
    out = OUTPUT_PATH.parent / (OUTPUT_PATH.name + suffix)
    # ensure_ascii=False emits raw Hebrew characters, so the file must be
    # opened as UTF-8 explicitly; the locale default may be unable to encode
    # them (UnicodeEncodeError) or may yield JSON that is not valid UTF-8.
    with open(out, "w", encoding="utf-8") as f:
        json.dump(lookup, f, ensure_ascii=False, indent=1)
    logger.info(f" Saved {len(lookup)} unique ktiv male forms → {out}")
def main():
    """Command-line entry point.

    The optional first argument selects what to scrape: ``all`` (default),
    ``verbs`` or ``nouns``. The collected forms are then written to
    ``OUTPUT_PATH`` by ``_save_forms``; in ``verbs`` or ``nouns`` mode the
    file therefore contains only that subset.

    Exits with status 2 on an unrecognised mode. Previously such a mode
    scraped nothing and then overwrote the output file with an empty lookup.
    """
    mode = sys.argv[1] if len(sys.argv) > 1 else "all"
    if mode not in ("all", "verbs", "nouns"):
        logger.error("Unknown mode %r; expected one of: all, verbs, nouns", mode)
        sys.exit(2)

    all_forms = []
    if mode in ("all", "verbs"):
        logger.info("=== Scraping verb ktiv male forms ===")
        verb_forms = scrape_verbs()
        all_forms.extend(verb_forms)
        logger.info(f"Verbs done: {len(verb_forms)} forms from {len({f['slug'] for f in verb_forms})} verbs")
    if mode in ("all", "nouns"):
        logger.info("=== Scraping noun ktiv male forms ===")
        noun_forms = scrape_nouns()
        all_forms.extend(noun_forms)
        logger.info(f"Nouns done: {len(noun_forms)} forms")
    _save_forms(all_forms)
    logger.info(f"Total: {len(all_forms)} forms → {OUTPUT_PATH}")
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()