hebrew_flash_cards/scripts/scrape_verb_ktiv.py

#!/usr/bin/env python3
"""Scrape ktiv male (vowelless plene) conjugation forms for top 500 verbs from pealim.com."""

import json
import os
import re
import sys
import time

# Switch stdout to line-buffered mode so each printed progress line is
# flushed as soon as it is written.
sys.stdout.reconfigure(line_buffering=True)
import requests  # noqa: E402
from bs4 import BeautifulSoup  # noqa: E402

# All data files live in the "data" directory that sits next to the directory
# containing this script (i.e. <parent of this file's directory>/data).
DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
# Input: the verbs to process; main() reads a "wni" and a "word" key from each item.
INPUT_FILE = os.path.join(DATA_DIR, "top_verbs_to_scrape.json")
# Output: mapping of scraped form text -> list of entry dicts.
OUTPUT_FILE = os.path.join(DATA_DIR, "ktiv_male_forms.json")
# Intermediate merged result written at checkpoints and removed after a full run.
PARTIAL_FILE = os.path.join(DATA_DIR, "ktiv_male_forms_partial.json")
# Resume state: {"done_wnis": [...], "slug_cache": {...}}.
PROGRESS_FILE = os.path.join(DATA_DIR, "ktiv_scrape_progress.json")

# Cookies sent with every request.
# NOTE(review): presumably "translit": "none" disables transliteration and
# "hebstyle": "vl" selects the vowelless spelling on pealim.com - confirm.
COOKIES = {"translit": "none", "hebstyle": "vl"}
# Request headers; identifies this scraper through its User-Agent string.
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; PealimScraper/1.0)"}
# Pause, in seconds, passed to time.sleep() after each HTTP request in main().
DELAY = 1.5

# One shared HTTP session so the cookies and headers above apply to all requests.
session = requests.Session()
session.cookies.update(COOKIES)
session.headers.update(HEADERS)


def load_json(path):
    """Return the decoded JSON document stored at *path*.

    A path that does not exist yields an empty dict instead of an error.
    The file is read as UTF-8.
    """
    if not os.path.exists(path):
        return {}
    with open(path, encoding="utf-8") as handle:
        raw_text = handle.read()
    return json.loads(raw_text)


def save_json(data, path):
    """Serialize *data* as JSON into *path*, overwriting any existing file.

    The file is written as UTF-8 with non-ASCII characters kept verbatim
    (no ``\\u`` escapes) and a one-space indent.
    """
    dump_options = {"ensure_ascii": False, "indent": 1}
    with open(path, mode="w", encoding="utf-8") as sink:
        json.dump(data, sink, **dump_options)


def search_slug(wni):
    """Look *wni* up via the pealim.com search page and return a dictionary slug.

    The slug is taken from the first link on the result page whose ``href``
    starts with ``/dict/<digits>-<name>/``. Returns ``None`` when no link
    matches. Raises whatever ``raise_for_status`` raises on an HTTP error.
    """
    response = session.get(
        "https://www.pealim.com/search/", params={"q": wni}, timeout=15
    )
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    # Lazily test every link; the first dictionary link wins.
    candidates = (
        re.match(r"/dict/(\d+-[^/]+)/", link["href"])
        for link in page.select("a[href]")
    )
    return next((hit.group(1) for hit in candidates if hit), None)


def scrape_verb_forms(slug):
    """Fetch a verb's detail page and collect the text of its word forms.

    Args:
        slug: Dictionary slug of the verb, as used in ``/dict/<slug>/`` URLs.

    Returns:
        A ``(forms, word_nikkud)`` tuple:

        * ``forms`` - set of the non-empty, whitespace-stripped texts of every
          ``span.menukad`` element on the page.
        * ``word_nikkud`` - stripped text of the first ``span.menukad`` inside
          the page's ``h1``, or ``None`` when there is no such element.
          NOTE(review): the page is fetched with the module's session cookies,
          so this text may not actually carry nikkud - confirm before relying
          on it; ``main`` uses the form from its input data instead.

    Raises:
        Whatever ``raise_for_status`` raises when the HTTP request fails.
    """
    resp = session.get(f"https://www.pealim.com/dict/{slug}/", timeout=15)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    word_nikkud = None
    heading = soup.select_one("h1")
    if heading is not None:
        heading_span = heading.select_one("span.menukad")
        if heading_span is not None:
            word_nikkud = heading_span.get_text(strip=True)

    # A single page-wide pass is enough: it also covers the spans inside
    # div.lead, so no separate lookup for that element is needed.
    texts = (span.get_text(strip=True) for span in soup.select("span.menukad"))
    forms = {text for text in texts if text}

    return forms, word_nikkud


def _merge_forms(base, additions):
    """Return a new ``form -> entries`` mapping combining *base* and *additions*.

    An entry from *additions* is skipped when the form already holds an equal
    entry or an entry with the same ``slug``. Every list is copied, so neither
    argument is mutated or aliased by the result.
    """
    merged = {form: list(entries) for form, entries in base.items()}
    for form, entries in additions.items():
        bucket = merged.setdefault(form, [])
        known_slugs = {e.get("slug") for e in bucket}
        for entry in entries:
            slug = entry.get("slug")
            if entry in bucket or (slug is not None and slug in known_slugs):
                continue
            bucket.append(entry)
            known_slugs.add(slug)
    return merged


def _save_checkpoint(done_wnis, slug_cache, existing_forms, new_forms):
    """Persist the merged forms, then the resume state.

    The forms are written first so the progress file never lists a verb as
    done whose forms have not reached disk yet.
    """
    save_json(_merge_forms(existing_forms, new_forms), PARTIAL_FILE)
    save_json({"done_wnis": list(done_wnis), "slug_cache": slug_cache}, PROGRESS_FILE)


def main():
    """Scrape forms for every pending verb and merge them into OUTPUT_FILE.

    Reads the verb list from INPUT_FILE and resumes from PROGRESS_FILE. Forms
    checkpointed to PARTIAL_FILE by an interrupted run are folded back in.
    A checkpoint is written every 50 verbs. At the end the merged result goes
    to OUTPUT_FILE, PARTIAL_FILE is removed and a summary is printed.

    Verbs not found on the site are marked done. Verbs that raise an error
    are counted as skipped but are NOT marked done, so a later run retries
    them. Exits with status 1 when the input file yields no verbs.
    """
    verbs = load_json(INPUT_FILE)
    if not verbs:
        print("ERROR: No verbs found in input file")
        sys.exit(1)

    existing_forms = load_json(OUTPUT_FILE)

    # An interrupted run leaves its scraped forms only in PARTIAL_FILE while
    # the verbs are already recorded as done; recover them so they are not
    # silently dropped when those verbs are skipped below.
    leftover = load_json(PARTIAL_FILE)
    if leftover:
        existing_forms = _merge_forms(existing_forms, leftover)
        print(f"Recovered {len(leftover)} form keys from an interrupted run")

    new_forms = {}  # form text -> entries scraped during this run

    # Load progress to resume
    progress = load_json(PROGRESS_FILE)
    done_wnis = set(progress.get("done_wnis", []))
    slug_cache = progress.get("slug_cache", {})

    # Pre-populate slug cache from conjugations.json
    conj_file = os.path.join(DATA_DIR, "conjugations.json")
    if os.path.exists(conj_file):
        conj_data = load_json(conj_file)
        for wni_key, cdata in conj_data.items():
            if isinstance(cdata, dict) and "slug" in cdata and wni_key not in slug_cache:
                slug_cache[wni_key] = cdata["slug"]
        print(f"Pre-populated {len(slug_cache)} slugs from conjugations.json")

    # Deduplicate verbs by wni, keeping the first occurrence and input order
    by_wni = {}
    for v in verbs:
        by_wni.setdefault(v["wni"], v)
    unique_verbs = list(by_wni.values())

    total = len(unique_verbs)
    to_scrape = [v for v in unique_verbs if v["wni"] not in done_wnis]
    pending = len(to_scrape)
    print(f"Total unique verbs: {total}, already done: {total - pending}, to scrape: {pending}")

    scraped_count = 0
    skipped_count = 0
    total_new_forms = 0
    sample_verbs = {}  # For summary: wni -> sorted list of forms

    for i, verb in enumerate(to_scrape, start=1):
        wni = verb["wni"]
        # The nikkud form from the input data is used for every entry.
        word_nikkud_input = verb["word"]

        try:
            # Step 1: Find slug (cached, or via the site's search page)
            if wni in slug_cache:
                slug = slug_cache[wni]
            else:
                slug = search_slug(wni)
                time.sleep(DELAY)

            if slug:
                slug_cache[wni] = slug

                # Step 2: Scrape forms
                forms, _page_nikkud = scrape_verb_forms(slug)
                time.sleep(DELAY)

                for form in forms:
                    bucket = new_forms.setdefault(form, [])
                    # One entry per (form, slug) pair
                    if not any(e["slug"] == slug for e in bucket):
                        bucket.append(
                            {
                                "word_nikkud": word_nikkud_input,
                                "form_type": "conjugation",
                                "pos": "Verb",
                                "slug": slug,
                            }
                        )
                        total_new_forms += 1

                scraped_count += 1
                # Collect samples (first 3 completed)
                if len(sample_verbs) < 3:
                    sample_verbs[wni] = sorted(forms)

                print(f"  [{i}/{pending}] {wni} -> {slug} ({len(forms)} forms)")
            else:
                print(f"  [{i}/{pending}] SKIP {wni} - not found on pealim")
                skipped_count += 1

            done_wnis.add(wni)

        except Exception as e:
            # Deliberately not marked done: the failure may be transient
            # (network, HTTP error), so the verb is retried on the next run.
            print(f"  [{i}/{pending}] ERROR {wni}: {e}")
            skipped_count += 1

        # Save progress every 50 verbs (reached on every path, including skips)
        if i % 50 == 0:
            _save_checkpoint(done_wnis, slug_cache, existing_forms, new_forms)
            print(f"  -- Progress saved at {i}/{pending} --")

    # Final merge
    merged = _merge_forms(existing_forms, new_forms)
    save_json(merged, OUTPUT_FILE)

    # Save final progress
    save_json({"done_wnis": list(done_wnis), "slug_cache": slug_cache}, PROGRESS_FILE)

    # Clean up partial file; its content is now part of OUTPUT_FILE
    if os.path.exists(PARTIAL_FILE):
        os.remove(PARTIAL_FILE)

    # Summary
    print(f"\n{'=' * 50}")
    print("SUMMARY")
    print(f"{'=' * 50}")
    print(f"Verbs scraped:         {scraped_count}")
    print(f"Verbs skipped:         {skipped_count}")
    print(f"New forms added:       {total_new_forms}")
    print(f"Total unique ktiv male forms: {len(merged)}")
    print(f"Previous forms count:  {len(existing_forms)}")
    print(f"Net new form keys:     {len(merged) - len(existing_forms)}")

    if sample_verbs:
        print("\nSample verbs:")
        for wni, forms in list(sample_verbs.items())[:3]:
            print(f"\n  {wni} ({len(forms)} forms):")
            for f in forms[:8]:
                print(f"    {f}")
            if len(forms) > 8:
                print(f"    ... and {len(forms) - 8} more")


if __name__ == "__main__":
    main()