#!/usr/bin/env python3
"""Scrape ktiv male (vowelless plene) conjugation forms for top 500 verbs from pealim.com."""

import json
import os
import re
import sys
import time

sys.stdout.reconfigure(line_buffering=True)

import requests  # noqa: E402
from bs4 import BeautifulSoup  # noqa: E402

DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
INPUT_FILE = os.path.join(DATA_DIR, "top_verbs_to_scrape.json")
OUTPUT_FILE = os.path.join(DATA_DIR, "ktiv_male_forms.json")
PARTIAL_FILE = os.path.join(DATA_DIR, "ktiv_male_forms_partial.json")
PROGRESS_FILE = os.path.join(DATA_DIR, "ktiv_scrape_progress.json")

# Display preferences sent with every request.
# NOTE(review): presumably "no transliteration" and "vowelless spelling" --
# confirm against pealim.com's preference cookies.
COOKIES = {"translit": "none", "hebstyle": "vl"}
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; PealimScraper/1.0)"}

# Seconds to sleep after each HTTP request.
DELAY = 1.5

# Number of processed verbs between progress/partial-result checkpoints.
CHECKPOINT_EVERY = 50

# Matches result links such as /dict/123-some-slug/ and captures the slug.
DICT_LINK_RE = re.compile(r"/dict/(\d+-[^/]+)/")

session = requests.Session()
session.cookies.update(COOKIES)
session.headers.update(HEADERS)


def load_json(path):
    """Return the parsed JSON content of *path*, or ``{}`` if the file does not exist."""
    if not os.path.exists(path):
        return {}
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def save_json(data, path):
    """Serialize *data* as JSON to *path*.

    The data is written to a sibling temporary file first and then moved into
    place with ``os.replace``, so an interruption mid-write cannot leave a
    truncated file at *path*.
    """
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=1)
    os.replace(tmp_path, path)


def search_slug(wni):
    """Search pealim for a verb and return the first result's slug.

    Returns ``None`` when no link of the form ``/dict/<id>-<name>/`` is found
    on the search results page. Raises ``requests.HTTPError`` on a non-success
    HTTP status.
    """
    url = "https://www.pealim.com/search/"
    resp = session.get(url, params={"q": wni}, timeout=15)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    for anchor in soup.select("a[href]"):
        match = DICT_LINK_RE.match(anchor["href"])
        if match:
            return match.group(1)
    return None


def scrape_verb_forms(slug):
    """Fetch a verb's detail page and extract all ktiv male conjugation forms.

    Returns a ``(forms, word_nikkud)`` tuple: *forms* is the set of non-empty
    texts of every ``span.menukad`` on the page, and *word_nikkud* is the text
    of the first ``span.menukad`` inside the page's ``h1`` (``None`` if there
    is no such element). Raises ``requests.HTTPError`` on a non-success HTTP
    status.
    """
    url = f"https://www.pealim.com/dict/{slug}/"
    resp = session.get(url, timeout=15)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # A page-wide selection already includes the spans inside div.lead, so a
    # separate pass over the lead section is unnecessary.
    texts = (span.get_text(strip=True) for span in soup.select("span.menukad"))
    forms = {text for text in texts if text}

    word_nikkud = None
    title = soup.select_one("h1")
    if title:
        menukad_in_title = title.select_one("span.menukad")
        if menukad_in_title:
            word_nikkud = menukad_in_title.get_text(strip=True)

    return forms, word_nikkud


def _merge_forms(base, extra):
    """Return a new form -> entries mapping combining *base* and *extra*.

    Entries for the same form are deduplicated by their ``"slug"``. Neither
    input mapping (nor any list inside it) is mutated.
    """
    merged = {form: list(entries) for form, entries in base.items()}
    for form, entries in extra.items():
        bucket = merged.setdefault(form, [])
        known_slugs = {e["slug"] for e in bucket}
        for entry in entries:
            if entry["slug"] not in known_slugs:
                bucket.append(entry)
                known_slugs.add(entry["slug"])
    return merged


def _load_partial_forms():
    """Return forms checkpointed by an interrupted run, or ``{}`` if none are usable."""
    try:
        recovered = load_json(PARTIAL_FILE)
    except json.JSONDecodeError as e:
        # A damaged checkpoint must not block a fresh run.
        print(f"WARNING: ignoring unreadable partial file {PARTIAL_FILE}: {e}")
        return {}
    return recovered if isinstance(recovered, dict) else {}


def _resolve_slug(wni, slug_cache):
    """Return the slug for *wni*, or ``None`` if it cannot be found.

    A cached slug is used when present; otherwise pealim is searched (followed
    by the politeness delay) and a successful result is stored in *slug_cache*.
    """
    if wni in slug_cache:
        return slug_cache[wni] or None
    slug = search_slug(wni)
    time.sleep(DELAY)
    if slug:
        slug_cache[wni] = slug
    return slug


def _save_progress(done_wnis, slug_cache):
    """Persist the resume state (finished verbs and known slugs)."""
    save_json({"done_wnis": sorted(done_wnis), "slug_cache": slug_cache}, PROGRESS_FILE)


def main():
    """Scrape forms for every pending verb and merge them into the output file.

    The run is resumable: finished verbs and the slug cache are stored in
    PROGRESS_FILE, and the merged forms are checkpointed to PARTIAL_FILE.
    Forms found in a leftover PARTIAL_FILE are merged back in at startup so
    that verbs already marked as done do not lose their forms.
    """
    verbs = load_json(INPUT_FILE)
    if not verbs:
        print("ERROR: No verbs found in input file")
        sys.exit(1)

    # Forms from previous completed runs, plus anything an interrupted run
    # managed to checkpoint before stopping.
    existing_forms = load_json(OUTPUT_FILE)
    recovered_forms = _load_partial_forms()
    if recovered_forms:
        print(f"Recovered {len(recovered_forms)} form keys from partial file")
    base_forms = _merge_forms(existing_forms, recovered_forms)
    new_forms = {}  # Forms scraped during this run

    # Load progress to resume
    progress = load_json(PROGRESS_FILE)
    done_wnis = set(progress.get("done_wnis", []))
    slug_cache = progress.get("slug_cache", {})

    # Pre-populate slug cache from conjugations.json
    conj_file = os.path.join(DATA_DIR, "conjugations.json")
    if os.path.exists(conj_file):
        conj_data = load_json(conj_file)
        for wni_key, cdata in conj_data.items():
            if isinstance(cdata, dict) and "slug" in cdata and wni_key not in slug_cache:
                slug_cache[wni_key] = cdata["slug"]
        print(f"Pre-populated {len(slug_cache)} slugs from conjugations.json")

    # Deduplicate verbs by wni, keeping the first occurrence
    seen_wni = set()
    unique_verbs = []
    for v in verbs:
        if v["wni"] not in seen_wni:
            seen_wni.add(v["wni"])
            unique_verbs.append(v)

    total = len(unique_verbs)
    to_scrape = [v for v in unique_verbs if v["wni"] not in done_wnis]
    print(f"Total unique verbs: {total}, already done: {total - len(to_scrape)}, to scrape: {len(to_scrape)}")

    scraped_count = 0
    skipped_count = 0
    total_new_forms = 0
    sample_verbs = {}  # For summary: wni -> sorted list of forms

    for index, verb in enumerate(to_scrape, start=1):
        wni = verb["wni"]
        # The nikkud form from the input data is used for the entries; the
        # one parsed from the page is ignored.
        word_nikkud = verb["word"]

        try:
            slug = _resolve_slug(wni, slug_cache)
            if not slug:
                print(f" [{index}/{len(to_scrape)}] SKIP {wni} - not found on pealim")
                skipped_count += 1
                done_wnis.add(wni)
            else:
                forms, _page_nikkud = scrape_verb_forms(slug)
                time.sleep(DELAY)

                for form in forms:
                    bucket = new_forms.setdefault(form, [])
                    # Two input verbs may resolve to the same slug; record it once.
                    if not any(e["slug"] == slug for e in bucket):
                        bucket.append(
                            {
                                "word_nikkud": word_nikkud,
                                "form_type": "conjugation",
                                "pos": "Verb",
                                "slug": slug,
                            }
                        )
                        total_new_forms += 1

                scraped_count += 1

                # Collect samples (first 3 completed)
                if len(sample_verbs) < 3:
                    sample_verbs[wni] = sorted(forms)

                print(f" [{index}/{len(to_scrape)}] {wni} -> {slug} ({len(forms)} forms)")
                done_wnis.add(wni)
        except Exception as e:
            # Per-verb boundary: report and move on. The verb is deliberately
            # NOT marked as done so that a transient failure (timeout, HTTP
            # error) is retried on the next run.
            print(f" [{index}/{len(to_scrape)}] ERROR {wni}: {e}")
            skipped_count += 1

        # Checkpoint regardless of how this verb ended (scraped, skipped, failed)
        if index % CHECKPOINT_EVERY == 0:
            _save_progress(done_wnis, slug_cache)
            save_json(_merge_forms(base_forms, new_forms), PARTIAL_FILE)
            print(f" -- Progress saved at {index}/{len(to_scrape)} --")

    # Final merge
    merged = _merge_forms(base_forms, new_forms)
    save_json(merged, OUTPUT_FILE)

    # Save final progress
    _save_progress(done_wnis, slug_cache)

    # The partial file is only needed while a run is in flight
    if os.path.exists(PARTIAL_FILE):
        os.remove(PARTIAL_FILE)

    # Summary
    print(f"\n{'=' * 50}")
    print("SUMMARY")
    print(f"{'=' * 50}")
    print(f"Verbs scraped: {scraped_count}")
    print(f"Verbs skipped: {skipped_count}")
    print(f"New forms added: {total_new_forms}")
    print(f"Total unique ktiv male forms: {len(merged)}")
    print(f"Previous forms count: {len(existing_forms)}")
    print(f"Net new form keys: {len(merged) - len(existing_forms)}")

    if sample_verbs:
        print("\nSample verbs:")
        for wni, forms in sample_verbs.items():
            print(f"\n {wni} ({len(forms)} forms):")
            for f in forms[:8]:
                print(f" {f}")
            if len(forms) > 8:
                print(f" ... and {len(forms) - 8} more")


if __name__ == "__main__":
    main()