# hebrew_flash_cards/scripts/scrape_noun_plurals.py

#!/usr/bin/env python3
"""
Scrape pealim.com for noun plural and construct forms.

Step 1: Collect noun slugs from list pages (/dict/?pos=noun&page=N)
Step 2: Fetch detail pages for plural + construct forms
Step 3: Print summary statistics
"""

import json
import re
import time
from pathlib import Path

import requests
from bs4 import BeautifulSoup

# Site root; the list-page and detail-page URLs below are built from it.
BASE_URL = "https://www.pealim.com"
# Cookies sent with every request.
# NOTE(review): presumably these disable transliteration and select the
# vocalized Hebrew display style -- confirm against the site's settings.
COOKIES = {"translit": "none", "hebstyle": "mo"}
# User-Agent header sent with every request.
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; PealimScraper/1.0)"}
# "data" directory next to the directory that contains this script.
DATA_DIR = Path(__file__).resolve().parent.parent / "data"
# Step 1 output: noun metadata keyed by the unvocalized word.
SLUG_MAP_FILE = DATA_DIR / "noun_slug_map.json"
# Step 1 resume state: sorted list of list-page numbers already scraped.
PROGRESS_FILE = DATA_DIR / "noun_slug_map_progress.json"
# Step 2 output: plural/construct forms keyed by the unvocalized word.
PLURALS_FILE = DATA_DIR / "noun_plurals.json"
DELAY = 1.5  # seconds between requests


def load_json(path, default=None):
    """Load a JSON document from *path*, or return a fallback if it is absent.

    Args:
        path: ``pathlib.Path`` of the file to read.
        default: Value returned when the file does not exist. ``None``
            (the default) means "return a new empty dict".

    Returns:
        The decoded JSON value, or the fallback when the file is missing.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    if path.exists():
        # The data files hold raw (non-escaped) Hebrew text, so decode them as
        # UTF-8 explicitly instead of relying on the platform default encoding.
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    return default if default is not None else {}


def save_json(path, data):
    """Write *data* to *path* as indented UTF-8 JSON, replacing any old file.

    The document is written to a sibling ``<name>.tmp`` file first and then
    renamed over the target, so an interruption in the middle of a save cannot
    leave a truncated progress file behind.

    Args:
        path: Destination file (``pathlib.Path`` or path string).
        data: JSON-serializable value to store.

    Raises:
        TypeError: If *data* contains values ``json.dump`` cannot serialize.
        OSError: If the file or its parent directory cannot be written.
    """
    path = Path(path)
    # The data directory may not exist yet on a first run.
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = path.with_name(path.name + ".tmp")
    # ensure_ascii=False emits raw Hebrew characters, so the encoding must be
    # pinned to UTF-8 rather than left to the platform default.
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    tmp_path.replace(path)

def fetch_with_retry(url, max_retries=5):
    """Fetch *url* with exponential backoff between failed attempts.

    Every request carries the module's COOKIES and HEADERS and a 30 second
    timeout. A non-2xx status counts as a failure (``raise_for_status``).

    Args:
        url: Absolute URL to GET.
        max_retries: Total number of attempts before giving up.

    Returns:
        The ``requests.Response`` of the first successful attempt, or ``None``
        when every attempt failed.
    """
    for attempt in range(max_retries):
        try:
            r = requests.get(url, cookies=COOKIES, headers=HEADERS, timeout=30)
            r.raise_for_status()
            return r
        except (requests.RequestException, ConnectionError) as e:
            if attempt == max_retries - 1:
                # Nothing left to retry: report and give up right away instead
                # of sleeping through one more backoff interval first.
                print(f"  Attempt {attempt + 1}/{max_retries} for {url} failed: {e}")
                break
            # Backoff doubles each time (2s, 4s, 8s, ...) and is capped at 60s.
            wait = min(2**attempt * 2, 60)
            print(f"  Retry {attempt + 1}/{max_retries} for {url}: {e} (waiting {wait}s)")
            time.sleep(wait)
    print(f"  FAILED after {max_retries} retries: {url}")
    return None


def get_total_pages():
    """Return the highest page number linked from the first noun list page.

    Returns 0 when the first list page cannot be fetched, and 1 when the page
    has no pagination link carrying a ``page=<number>`` parameter.
    """
    response = fetch_with_retry(f"{BASE_URL}/dict/?pos=noun&page=1")
    if not response:
        return 0

    document = BeautifulSoup(response.text, "lxml")
    # Look for a page number in the href of every pagination link.
    matches = (
        re.search(r"page=(\d+)", link.get("href", ""))
        for link in document.select("ul.pagination li a")
    )
    page_numbers = {int(found.group(1)) for found in matches if found}
    return max(page_numbers, default=1)


def parse_list_page(html):
    """Extract the noun rows of one dictionary list page.

    Args:
        html: Markup of a list page.

    Returns:
        A list with one dict per usable table row, holding the keys
        ``word_plain``, ``slug``, ``word_nikkud``, ``pos``, ``gender`` and
        ``mishkal``. The list is empty when the page has no
        ``table.dict-table``.
    """
    document = BeautifulSoup(html, "lxml")
    dict_table = document.select_one("table.dict-table")
    if not dict_table:
        return []

    nouns = []
    # The first row is the table header.
    for tr in dict_table.select("tr")[1:]:
        cells = tr.select("td")
        if len(cells) < 3:
            continue

        # The first cell carries the word and the link to its detail page.
        word_cell = cells[0]
        link = word_cell.select_one("a")
        if not link:
            continue
        found_slug = re.search(r"/dict/([^/]+)/", link.get("href", ""))
        if not found_slug:
            continue

        vocalized_span = word_cell.select_one("span.menukad")
        vocalized = vocalized_span.get_text(strip=True) if vocalized_span else ""

        # The third cell is the part-of-speech description; gender and the
        # mishkal pattern name are both read out of its lower-cased text.
        pos_description = cells[2].get_text(strip=True)
        lowered = pos_description.lower()
        noun_gender = next(
            (name for name in ("masculine", "feminine") if name in lowered),
            "",
        )
        found_pattern = re.search(r"(\w+)\s*pattern", lowered)

        nouns.append(
            {
                # Same word with the U+0591..U+05C7 marks removed.
                "word_plain": re.sub(r"[\u0591-\u05C7]", "", vocalized),
                "slug": found_slug.group(1),
                "word_nikkud": vocalized,
                "pos": pos_description,
                "gender": noun_gender,
                "mishkal": found_pattern.group(1) if found_pattern else "",
            }
        )

    return nouns


def step1_collect_slugs():
    """Step 1: Collect noun slugs from list pages.

    Resumes from SLUG_MAP_FILE and PROGRESS_FILE when they exist, fetches every
    list page not yet recorded as completed, and checkpoints both files every
    10 completed pages as well as on exit from the loop (normal or not).

    Returns:
        dict mapping each unvocalized word to a dict with the keys ``slug``,
        ``word_nikkud``, ``pos``, ``gender`` and ``mishkal``. Entries that
        share an unvocalized spelling overwrite one another.
    """
    print("=" * 60)
    print("STEP 1: Collecting noun slugs from list pages")
    print("=" * 60)

    slug_map = load_json(SLUG_MAP_FILE, {})
    progress = load_json(PROGRESS_FILE, [])
    completed_pages = set(progress) if isinstance(progress, list) else set()

    # Get total pages
    total_pages = get_total_pages()
    if total_pages == 0:
        # 0 means the first list page could not be fetched; do not report that
        # as "everything already scraped".
        print("Could not fetch the first list page; keeping previously saved slugs")
        return slug_map
    print(f"Total pages: {total_pages}")
    print(f"Already completed: {len(completed_pages)} pages, {len(slug_map)} nouns")

    remaining = [p for p in range(1, total_pages + 1) if p not in completed_pages]
    print(f"Remaining pages: {len(remaining)}")

    if not remaining:
        print("All pages already scraped!")
        return slug_map

    unsaved_pages = 0
    try:
        for page_num in remaining:
            url = f"{BASE_URL}/dict/?pos=noun&page={page_num}"
            r = fetch_with_retry(url)
            if not r:
                print(f"  Skipping page {page_num}")
                continue

            entries = parse_list_page(r.text)
            if not entries:
                # Do not record the page as completed, so a later run fetches
                # it again instead of silently losing its nouns.
                print(f"  Page {page_num}: no nouns parsed; will retry on the next run")
                time.sleep(DELAY)
                continue

            for entry in entries:
                slug_map[entry["word_plain"]] = {
                    "slug": entry["slug"],
                    "word_nikkud": entry["word_nikkud"],
                    "pos": entry["pos"],
                    "gender": entry["gender"],
                    "mishkal": entry["mishkal"],
                }

            completed_pages.add(page_num)
            unsaved_pages += 1
            done = len(completed_pages)
            print(f"  Page {page_num} ({done}/{total_pages}): {len(entries)} nouns (total: {len(slug_map)})")

            # Save progress every 10 completed pages. Counting completed pages
            # (not loop iterations) keeps the checkpoint from being skipped
            # when the iteration at the boundary fails.
            if unsaved_pages >= 10:
                save_json(SLUG_MAP_FILE, slug_map)
                save_json(PROGRESS_FILE, sorted(completed_pages))
                unsaved_pages = 0
                print(f"  [Saved progress: {len(slug_map)} nouns, {done} pages]")

            time.sleep(DELAY)
    finally:
        # Final save; also runs on Ctrl-C or an unexpected error so pages
        # scraped since the last checkpoint are not lost.
        save_json(SLUG_MAP_FILE, slug_map)
        save_json(PROGRESS_FILE, sorted(completed_pages))

    print(f"\nStep 1 complete: {len(slug_map)} total nouns from {len(completed_pages)} pages")
    return slug_map


def parse_detail_page(html, slug, gender, mishkal):
    """Read the singular/plural and construct forms off a noun detail page.

    Only the first ``table.conjugation-table`` on the page is inspected. Rows
    whose header text contains "absolute" supply the singular and plural forms
    together with their audio references; rows whose header contains
    "construct" supply the construct forms.

    Args:
        html: Markup of the detail page.
        slug: Slug of the noun, copied into the result.
        gender: Gender string, copied into the result.
        mishkal: Mishkal pattern name, copied into the result.

    Returns:
        A dict of forms (missing values are empty strings), or ``None`` when
        the page has no conjugation table.
    """
    document = BeautifulSoup(html, "lxml")
    declension_tables = document.select("table.conjugation-table")
    if not declension_tables:
        return None

    def vocalized_text(cell):
        # Text of the cell's vocalized-word span, or "" when it has none.
        span = cell.select_one("span.menukad")
        return span.get_text(strip=True) if span else ""

    def audio_reference(cell):
        # Prefer a descendant carrying data-audio; otherwise use the cell's own.
        holder = cell.select_one("[data-audio]")
        return holder.get("data-audio", "") if holder else cell.get("data-audio", "")

    forms = {
        "slug": slug,
        "singular": "",
        "singular_audio": "",
        "plural": "",
        "plural_audio": "",
        "construct_singular": "",
        "construct_plural": "",
        "gender": gender,
        "mishkal": mishkal,
    }

    for tr in declension_tables[0].select("tr"):
        header = tr.select_one("th")
        if not header:
            continue
        row_label = header.get_text(strip=True).lower()

        if "absolute" in row_label:
            field_names, with_audio = ("singular", "plural"), True
        elif "construct" in row_label:
            field_names, with_audio = ("construct_singular", "construct_plural"), False
        else:
            continue

        # First cell -> singular column, second cell -> plural column; zip
        # stops early when the row has fewer cells.
        for field_name, cell in zip(field_names, tr.select("td")):
            forms[field_name] = vocalized_text(cell)
            if with_audio:
                forms[f"{field_name}_audio"] = audio_reference(cell)

    return forms


def step2_fetch_plurals(slug_map):
    """Step 2: Fetch detail pages for plural + construct forms.

    Resumes from PLURALS_FILE: only words of *slug_map* that are not yet keys
    of the stored data are fetched. Progress is checkpointed every 50 stored
    entries and once more on exit from the loop (normal or not).

    Args:
        slug_map: dict mapping each word to a dict with at least a ``slug``
            key; ``word_nikkud``, ``gender`` and ``mishkal`` are used when
            present.

    Returns:
        dict mapping each word to its forms. Words whose detail page has no
        declension table get a minimal entry flagged ``no_declension_table``;
        words whose page could not be fetched are left out.
    """
    print("\n" + "=" * 60)
    print("STEP 2: Fetching plural + construct forms from detail pages")
    print("=" * 60)

    plurals = load_json(PLURALS_FILE, {})
    already_count = len(plurals)

    # Build work list: nouns not yet in plurals
    work = [(word, info) for word, info in slug_map.items() if word not in plurals]

    print(f"Already have plural data: {already_count}")
    print(f"Remaining to fetch: {len(work)}")

    if not work:
        print("All nouns already have plural data!")
        return plurals

    total = already_count + len(work)
    skipped = 0
    unsaved = 0
    try:
        for i, (word, info) in enumerate(work):
            slug = info["slug"]
            url = f"{BASE_URL}/dict/{slug}/"
            r = fetch_with_retry(url)
            if not r:
                print(f"  Skipping {word} ({slug})")
                skipped += 1
                continue

            entry = parse_detail_page(r.text, slug, info.get("gender", ""), info.get("mishkal", ""))
            if entry:
                plurals[word] = entry
            else:
                # No declension table - store minimal entry
                plurals[word] = {
                    "slug": slug,
                    "singular": info.get("word_nikkud", ""),
                    "singular_audio": "",
                    "plural": "",
                    "plural_audio": "",
                    "construct_singular": "",
                    "construct_plural": "",
                    "gender": info.get("gender", ""),
                    "mishkal": info.get("mishkal", ""),
                    "no_declension_table": True,
                }
            unsaved += 1

            done = already_count + i + 1 - skipped
            if (i + 1) % 50 == 0 or i == 0:
                print(
                    f"  [{i + 1}/{len(work)}] {word} ({slug}): "
                    f"plural={entry['plural'] if entry else 'N/A'} "
                    f"(total: {done}/{total})"
                )

            # Save every 50 stored entries. Counting stored entries (not loop
            # iterations) keeps the checkpoint from being skipped when the
            # iteration at the boundary fails.
            if unsaved >= 50:
                save_json(PLURALS_FILE, plurals)
                unsaved = 0
                print(f"  [Saved: {len(plurals)} entries]")

            time.sleep(DELAY)
    finally:
        # Final save; also runs on Ctrl-C or an unexpected error so entries
        # fetched since the last checkpoint are not lost.
        save_json(PLURALS_FILE, plurals)

    print(f"\nStep 2 complete: {len(plurals)} total noun entries with plural data")
    return plurals


def step3_summary(slug_map, plurals):
    """Step 3: Print summary statistics.

    Args:
        slug_map: dict of collected nouns; only its size is reported.
        plurals: dict mapping each word to its forms dict.

    Prints the totals, how many entries have a plural form, any construct
    form, any audio reference or the ``no_declension_table`` flag, and how
    many plurals carry the ending associated with the other gender.
    """
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)

    records = list(plurals.values())
    with_plural = sum(bool(rec.get("plural")) for rec in records)
    with_construct = sum(
        bool(rec.get("construct_singular") or rec.get("construct_plural"))
        for rec in records
    )
    with_audio = sum(
        bool(rec.get("singular_audio") or rec.get("plural_audio"))
        for rec in records
    )
    without_table = sum(bool(rec.get("no_declension_table")) for rec in records)

    # Irregular plurals: masculine with ות- ending, feminine with ים- ending
    cross_gender_endings = (("masculine", "ות"), ("feminine", "ים"))
    irregular_count = 0
    for rec in records:
        plural_form = rec.get("plural", "")
        noun_gender = rec.get("gender", "")
        if not (plural_form and noun_gender):
            continue
        # Compare endings on the form with the U+0591..U+05C7 marks removed.
        bare_plural = re.sub(r"[\u0591-\u05C7]", "", plural_form)
        if any(
            noun_gender == name and bare_plural.endswith(ending)
            for name, ending in cross_gender_endings
        ):
            irregular_count += 1

    print(f"Total nouns in slug map:       {len(slug_map)}")
    print(f"Total nouns with plural data:  {len(plurals)}")
    print(f"  - With plural form:          {with_plural}")
    print(f"  - With construct forms:       {with_construct}")
    print(f"  - With audio URLs:            {with_audio}")
    print(f"  - No declension table:        {without_table}")
    print(f"  - Irregular plurals:          {irregular_count}")


def main():
    """Run the scraper: collect slugs, fetch plural forms, print the summary."""
    banner = ("Pealim Noun Plural Scraper", f"Data directory: {DATA_DIR}", "")
    for line in banner:
        print(line)

    nouns_by_word = step1_collect_slugs()
    forms_by_word = step2_fetch_plurals(nouns_by_word)
    step3_summary(nouns_by_word, forms_by_word)


# Only run the scraper when executed as a script, not when imported.
if __name__ == "__main__":
    main()