#!/usr/bin/env python3
"""
Scrape ktiv male (plene/vowelless) forms from pealim.com.

Uses hebstyle=vl cookie to get vowelless writing with matres lectionis.
Builds a lookup: ktiv_male_form → [{word_nikkud, form_type, pos, slug}]

This enables matching Hebrew text (which is normally in ktiv male) against
our vocabulary, including conjugated verbs and noun plurals.

Usage: run with an optional mode argument, one of "all" (default), "verbs"
or "nouns". The output file is rewritten with the forms scraped in that run.
"""

import json
import logging
import sys
import time
from pathlib import Path

import requests
from bs4 import BeautifulSoup

logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)

DATA_DIR = Path(__file__).resolve().parent.parent / "data"
OUTPUT_PATH = DATA_DIR / "ktiv_male_forms.json"

COOKIES = {"translit": "none", "hebstyle": "vl"}
REQUEST_TIMEOUT = 15
DELAY = 1.5  # seconds between requests

# Labels assigned, in order, to the first non-empty forms found in a noun table;
# any further forms are labelled "other".
# NOTE(review): this assumes the page lists the forms in exactly this order — confirm.
NOUN_FORM_LABELS = (
    "absolute_singular",
    "absolute_plural",
    "construct_singular",
    "construct_plural",
)

VALID_MODES = ("all", "verbs", "nouns")


def _fetch_dict_page(slug: str) -> BeautifulSoup:
    """Download and parse the pealim.com dictionary page for *slug*.

    Raises:
        requests.RequestException: on network errors, timeouts or a non-2xx
            response (via ``raise_for_status``).
    """
    url = f"https://www.pealim.com/dict/{slug}/"
    resp = requests.get(url, cookies=COOKIES, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    return BeautifulSoup(resp.text, "html.parser")


def _menukad_texts(node):
    """Yield the stripped, non-empty text of every ``span.menukad`` under *node*."""
    for span in node.find_all("span", class_="menukad"):
        text = span.text.strip()
        if text:
            yield text


def _table_texts(table):
    """Yield the non-empty ``span.menukad`` texts of *table*, row by row."""
    for row in table.find_all("tr"):
        yield from _menukad_texts(row)


def _load_json(path: Path):
    """Read a JSON file as UTF-8 (the data contains Hebrew text)."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def fetch_verb_ktiv_male(slug: str, infinitive_nikkud: str) -> list[dict]:
    """Fetch all conjugated forms in ktiv male for a verb.

    Args:
        slug: pealim.com dictionary slug of the verb.
        infinitive_nikkud: Vocabulary key stored as ``word_nikkud`` on every
            returned entry.

    Returns:
        One dict per form with keys ``ktiv_male``, ``word_nikkud``,
        ``form_type`` ("infinitive" or "conjugation"), ``pos`` and ``slug``.
        Empty when the page has no conjugation table.

    Raises:
        requests.RequestException: if the page cannot be fetched.
    """
    soup = _fetch_dict_page(slug)

    forms: list[dict] = []
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return forms

    def entry(ktiv: str, form_type: str) -> dict:
        return {
            "ktiv_male": ktiv,
            "word_nikkud": infinitive_nikkud,
            "form_type": form_type,
            "pos": "Verb",
            "slug": slug,
        }

    # Track forms already emitted so table forms are de-duplicated in O(1)
    # instead of rebuilding a set from `forms` for every span.
    seen: set[str] = set()

    # Also get the infinitive from the page
    lead = soup.find("div", class_="lead")
    if lead:
        for ktiv in _menukad_texts(lead):
            forms.append(entry(ktiv, "infinitive"))
            seen.add(ktiv)

    for ktiv in _table_texts(table):
        if ktiv in seen:
            continue
        seen.add(ktiv)
        forms.append(entry(ktiv, "conjugation"))

    return forms


def fetch_noun_ktiv_male(slug: str, singular_nikkud: str, gender: str) -> list[dict]:
    """Fetch noun declension forms in ktiv male.

    Args:
        slug: pealim.com dictionary slug of the noun.
        singular_nikkud: Vocabulary key stored as ``word_nikkud`` on every
            returned entry.
        gender: Stored unchanged under ``gender`` on every returned entry.

    Returns:
        One dict per non-empty form found in the table, labelled in order
        with ``NOUN_FORM_LABELS`` and then "other". Empty when the page has
        no conjugation table.

    Raises:
        requests.RequestException: if the page cannot be fetched.
    """
    soup = _fetch_dict_page(slug)

    table = soup.find("table", class_="conjugation-table")
    if not table:
        return []

    forms = []
    for idx, ktiv in enumerate(_table_texts(table)):
        form_type = NOUN_FORM_LABELS[idx] if idx < len(NOUN_FORM_LABELS) else "other"
        forms.append(
            {
                "ktiv_male": ktiv,
                "word_nikkud": singular_nikkud,
                "form_type": form_type,
                "pos": "Noun",
                "slug": slug,
                "gender": gender,
            }
        )
    return forms


def scrape_verbs() -> list[dict]:
    """Scrape ktiv male forms for all verbs in conjugations.json.

    Each slug is fetched at most once. A verb whose fetch or parse fails is
    logged and skipped. Returns an empty list when conjugations.json is missing.
    """
    conj_path = DATA_DIR / "conjugations.json"
    if not conj_path.exists():
        logger.warning("No conjugations.json found")
        return []

    conjugations = _load_json(conj_path)

    all_forms: list[dict] = []
    slugs_done: set[str] = set()

    for verb, data in conjugations.items():
        if not data or not data.get("slug"):
            continue
        slug = data["slug"]
        if slug in slugs_done:
            continue
        slugs_done.add(slug)

        try:
            forms = fetch_verb_ktiv_male(slug, verb)
        except Exception as e:
            # Deliberately broad: one bad page (network or markup) must not
            # abort the whole scrape.
            logger.warning(" Verb %s (%s) failed: %s", verb, slug, e)
        else:
            all_forms.extend(forms)
            logger.info(" Verb %s (%s): %d forms", verb, slug, len(forms))

        time.sleep(DELAY)

    return all_forms


def scrape_nouns() -> list[dict]:
    """Scrape ktiv male forms for all nouns in noun_slug_map.json.

    The nikkud singular is taken from noun_plurals.json when the word is
    listed there, otherwise from the slug map entry (falling back to the map
    key). After a successful fetch, whenever the processed count is a
    multiple of 50, progress is logged and a ``.partial`` snapshot is saved.
    Returns an empty list when noun_slug_map.json is missing.
    """
    slug_path = DATA_DIR / "noun_slug_map.json"
    if not slug_path.exists():
        logger.warning("No noun_slug_map.json found")
        return []

    slug_map = _load_json(slug_path)

    # Also load existing plurals to get nikkud singular form
    plurals_path = DATA_DIR / "noun_plurals.json"
    plurals = _load_json(plurals_path) if plurals_path.exists() else {}

    all_forms: list[dict] = []
    done = 0
    total = len(slug_map)

    for word, info in slug_map.items():
        slug = info.get("slug", "")
        if not slug:
            continue

        # Get nikkud form from plurals data or slug map
        nikkud = info.get("word_nikkud", word)
        if word in plurals:
            nikkud = plurals[word].get("singular", nikkud)
        gender = info.get("gender", "")

        try:
            forms = fetch_noun_ktiv_male(slug, nikkud, gender)
        except Exception as e:
            # Deliberately broad: one bad page must not abort the whole scrape.
            logger.warning(" Noun %s (%s) failed: %s", word, slug, e)
            done += 1
        else:
            all_forms.extend(forms)
            done += 1
            if done % 50 == 0:
                logger.info(" Nouns: %d/%d (%d forms)", done, total, len(all_forms))
                # Save incrementally. Handled separately from the fetch so a
                # failed snapshot is not reported as a noun failure and does
                # not count the noun twice.
                try:
                    _save_forms(all_forms, partial=True)
                except OSError as e:
                    logger.warning(" Partial save failed: %s", e)

        time.sleep(DELAY)

    return all_forms


def _build_lookup(all_forms: list[dict]) -> dict[str, list[dict]]:
    """Group form entries by their ``ktiv_male`` value.

    The ``ktiv_male`` key is dropped from each stored entry because it is the
    lookup key itself.
    """
    lookup: dict[str, list[dict]] = {}
    for entry in all_forms:
        stored = {k: v for k, v in entry.items() if k != "ktiv_male"}
        lookup.setdefault(entry["ktiv_male"], []).append(stored)
    return lookup


def _save_forms(all_forms: list[dict], partial: bool = False) -> None:
    """Build and save the ktiv male lookup dict.

    Args:
        all_forms: Entries as produced by the ``fetch_*`` functions.
        partial: When true, write to ``<OUTPUT_PATH>.partial`` instead of
            ``OUTPUT_PATH``.

    Raises:
        OSError: if the output file cannot be written.
    """
    lookup = _build_lookup(all_forms)

    suffix = ".partial" if partial else ""
    out = OUTPUT_PATH.with_name(OUTPUT_PATH.name + suffix)
    out.parent.mkdir(parents=True, exist_ok=True)
    # Explicit UTF-8: ensure_ascii=False emits raw Hebrew characters, which a
    # non-UTF-8 locale default encoding may be unable to encode.
    with open(out, "w", encoding="utf-8") as f:
        json.dump(lookup, f, ensure_ascii=False, indent=1)
    logger.info(" Saved %d unique ktiv male forms → %s", len(lookup), out)


def main():
    """Run the scrape for the mode given as the first CLI argument.

    Exits with status 2 on an unrecognised mode, before anything is written,
    so a typo cannot overwrite the output file with an empty lookup.
    """
    mode = sys.argv[1] if len(sys.argv) > 1 else "all"
    if mode not in VALID_MODES:
        logger.error("Unknown mode %r; expected one of: %s", mode, ", ".join(VALID_MODES))
        sys.exit(2)

    all_forms: list[dict] = []

    if mode in ("all", "verbs"):
        logger.info("=== Scraping verb ktiv male forms ===")
        verb_forms = scrape_verbs()
        all_forms.extend(verb_forms)
        verb_count = len({f["slug"] for f in verb_forms})
        logger.info("Verbs done: %d forms from %d verbs", len(verb_forms), verb_count)

    if mode in ("all", "nouns"):
        logger.info("=== Scraping noun ktiv male forms ===")
        noun_forms = scrape_nouns()
        all_forms.extend(noun_forms)
        logger.info("Nouns done: %d forms", len(noun_forms))

    _save_forms(all_forms)
    logger.info("Total: %d forms → %s", len(all_forms), OUTPUT_PATH)


if __name__ == "__main__":
    main()