# hebrew_flash_cards/scripts/repair_slugs.py

#!/usr/bin/env python3
"""
Repair duplicate slugs in data/words.json.

Homographs (words with identical spelling but different meanings) were
assigned the same slug by the scraper. This script fetches the pealim.com
search page for each affected word, matches entries by meaning (and nikkud),
and writes the corrected slugs back to words.json and the source CSV.

Usage:
    python3 scripts/repair_slugs.py [--dry-run] [--verbose]
"""

from __future__ import annotations

import argparse
import json
import logging
import re
import sys
import time
from collections import defaultdict
from difflib import SequenceMatcher
from pathlib import Path

import pandas as pd
import requests
from bs4 import BeautifulSoup

# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# Everything is resolved relative to the project root (the parent of the
# directory that contains this script), so the script does not depend on the
# current working directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
CSV_PATH = PROJECT_ROOT / "data" / "hebrew_dict_for_anki.csv"

# ---------------------------------------------------------------------------
# HTTP session
# ---------------------------------------------------------------------------
# A single shared session is used for every search request.
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-scraper/1.0)"})
# Cookies sent along with each search request.
# NOTE(review): presumably these select the site's display options
# (transliteration off, a particular spelling style) — confirm against
# pealim.com before changing them.
COOKIES: dict[str, str] = {"translit": "none", "hebstyle": "mo"}
REQUEST_DELAY = 1.5  # seconds between requests
REQUEST_TIMEOUT = 15  # seconds

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Logging is configured at import time; INFO is the default level and main()
# raises it to DEBUG when --verbose is given.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Similarity helpers
# ---------------------------------------------------------------------------
# Minimum match score (0.0-1.0) a search candidate must reach; entries whose
# best candidate scores lower are skipped rather than re-slugged.
FUZZY_THRESHOLD = 0.4


def _similarity(a: str, b: str) -> float:
    """
    Case-insensitive similarity of two strings.

    Both inputs are lowercased and compared with difflib's SequenceMatcher;
    the result is its ratio, a float between 0.0 and 1.0.
    """
    matcher = SequenceMatcher(None)
    matcher.set_seqs(a.lower(), b.lower())
    return matcher.ratio()


def _best_match(
    our_meaning: str,
    candidates: list[dict],
    our_nikkud: str,
) -> tuple[dict | None, float]:
    """
    Pick the candidate whose meaning is most similar to *our_meaning*.

    Every candidate is scored by the similarity of its ``meaning`` field to
    our_meaning.  A candidate whose ``word`` equals a non-empty our_nikkud
    gets a 0.05 bonus (capped at 1.0) so that the correctly voweled homograph
    wins when meanings are close.  On equal scores the earliest candidate is
    kept.

    Returns (candidate, score); with no candidates this is (None, -1.0).
    """
    winner: dict | None = None
    top_score = -1.0

    for candidate in candidates:
        score = _similarity(our_meaning, candidate["meaning"])

        # The word is only inspected when we actually have a nikkud to compare
        nikkud_matches = bool(our_nikkud) and candidate["word"] == our_nikkud
        if nikkud_matches:
            score = min(1.0, score + 0.05)

        if score <= top_score:
            continue
        winner, top_score = candidate, score

    return winner, top_score


# ---------------------------------------------------------------------------
# Search-page parser
# ---------------------------------------------------------------------------
def _parse_search_results(html: bytes) -> list[dict]:
    """
    Extract the individual hits from a pealim.com search results page.

    One hit is read from each ``div.verb-search-result`` block:
    - slug: taken from the ``/dict/<slug>/`` part of the first link inside
      ``div.verb-search-data``
    - word: text of ``span.menukad`` inside ``div.verb-search-lemma``, or the
      text of the lemma div itself when that span is absent
    - pos: text of ``div.verb-search-binyan`` with the "Part of speech:"
      label removed
    - meaning: text of ``div.verb-search-meaning``

    Blocks without a data div or without an extractable slug are dropped.

    Returns a list of dicts with keys: slug, word, pos, meaning.
    """

    def _text(node) -> str:
        # Stripped text of a node, or "" when the node was not found
        return node.get_text(strip=True) if node else ""

    parsed: list[dict] = []
    document = BeautifulSoup(html, "html.parser")

    for result_div in document.find_all("div", class_="verb-search-result"):
        data_div = result_div.find("div", class_="verb-search-data")
        if not data_div:
            continue

        # A hit without a usable detail-page link cannot yield a slug
        anchor = data_div.find("a", href=True)
        slug_match = re.search(r"/dict/([^/#]+)/", anchor["href"]) if anchor else None
        if not slug_match:
            continue

        lemma_div = result_div.find("div", class_="verb-search-lemma")
        vocalized = lemma_div.find("span", class_="menukad") if lemma_div else None
        if vocalized:
            word = _text(vocalized)
        else:
            word = _text(lemma_div)

        pos_text = _text(result_div.find("div", class_="verb-search-binyan"))
        meaning_text = _text(result_div.find("div", class_="verb-search-meaning"))

        parsed.append(
            {
                "slug": slug_match.group(1),
                "word": word,
                "pos": pos_text.replace("Part of speech:", "").strip(),
                "meaning": meaning_text,
            }
        )

    return parsed


def _fetch_search_results(ktiv_male: str) -> list[dict]:
    """
    Fetch and parse the pealim.com search results for one spelling.

    *ktiv_male* is the spelling without nikkud and is sent as the ``q`` query
    parameter.  It is passed through ``params`` so that requests
    percent-encodes it; splicing it into the URL by hand would let reserved
    characters such as ``&`` or ``#`` corrupt the query.

    Returns the parsed hits as a list of dicts (slug, word, pos, meaning).
    Raises requests.RequestException on connection problems, timeouts, or a
    non-success HTTP status.
    """
    url = "https://www.pealim.com/search/"
    logger.debug("GET %s?q=%s", url, ktiv_male)
    resp = SESSION.get(
        url,
        params={"q": ktiv_male},
        cookies=COOKIES,
        timeout=REQUEST_TIMEOUT,
    )
    resp.raise_for_status()
    return _parse_search_results(resp.content)


# ---------------------------------------------------------------------------
# Core logic
# ---------------------------------------------------------------------------
def find_duplicate_groups(data: dict) -> dict[str, list[str]]:
    """
    Group the top-level keys of *data* by slug and keep only the collisions.

    Entries with a missing or empty ``slug`` are ignored.  The result maps
    each slug used by two or more entries to the list of their keys, in the
    order the entries appear in *data*.
    """
    keys_by_slug: dict[str, list[str]] = {}

    for word_key, entry in data.items():
        slug = entry.get("slug", "")
        if not slug:
            continue
        keys_by_slug.setdefault(slug, []).append(word_key)

    return {
        slug: word_keys
        for slug, word_keys in keys_by_slug.items()
        if len(word_keys) >= 2
    }


def repair_group(
    slug: str,
    keys: list[str],
    data: dict,
    dry_run: bool,
) -> tuple[int, int]:
    """
    Attempt to repair one group of entries sharing *slug*.

    Homographs can have different ktiv_male spellings (e.g. אבידה vs אבדה for
    the two spellings of אֲבֵדָה).  We therefore build a union of all search
    results obtained by querying each distinct ktiv_male in the group.

    Each entry is then matched against the candidates by meaning (restricted
    to candidates with the entry's own nikkud when any exist).  A match
    scoring below FUZZY_THRESHOLD is skipped.  Unless *dry_run* is set, the
    entry's slug in *data* is overwritten in place with the matched slug.

    *keys* are the top-level keys of *data* belonging to the group; every
    such entry must provide ``slug``, ``word.nikkud`` and ``word.ktiv_male``.

    Returns (fixed_count, skipped_count).  Entries whose matched slug equals
    their current slug are counted as fixed.  If no usable candidates are
    found, the whole group is counted as skipped.
    """
    # Distinct ktiv_male values in first-seen order (usually one, but
    # sometimes two when homographs have different consonant spellings).
    ktivs = list(dict.fromkeys(data[k]["word"]["ktiv_male"] for k in keys))

    nikkud_word = data[keys[0]]["word"]["nikkud"]
    logger.info(
        "  Fetching search results for %s — %d entries share slug %s",
        nikkud_word,
        len(keys),
        slug,
    )

    # Fetch search results for every distinct ktiv_male and merge them,
    # keeping only the first occurrence of each candidate slug.
    all_candidates: list[dict] = []
    seen_slugs: set[str] = set()
    for query_idx, ktiv in enumerate(ktivs):
        if query_idx:
            # Small delay between sub-queries within the same group.  No
            # delay follows the last one: the caller already pauses between
            # groups.
            time.sleep(REQUEST_DELAY)
        try:
            results = _fetch_search_results(ktiv)
        except requests.RequestException as exc:
            # Best effort: a failed sub-query just contributes no candidates
            logger.warning("  HTTP error for %s: %s", ktiv, exc)
            results = []
        for r in results:
            if r["slug"] not in seen_slugs:
                seen_slugs.add(r["slug"])
                all_candidates.append(r)

    if not all_candidates:
        logger.warning("  No search results — skipping group")
        return 0, len(keys)

    # Filter candidates to those whose nikkud word matches the entry's nikkud.
    # This avoids accidentally matching a completely different word that shares
    # the same consonant spelling (e.g. different voweling entirely).
    group_nikkuds = {data[k]["word"]["nikkud"] for k in keys}
    filtered = [c for c in all_candidates if c["word"] in group_nikkuds]

    if not filtered:
        logger.warning(
            "  Search results don't contain nikkud %s — candidates: %s — skipping",
            group_nikkuds,
            [c["word"] for c in all_candidates],
        )
        return 0, len(keys)

    fixed = 0
    skipped = 0

    for key in keys:
        entry = data[key]
        our_meaning = entry.get("meaning", "")
        our_nikkud = entry["word"]["nikkud"]

        # Prefer candidates that match this entry's nikkud; fall back to the
        # group-wide pool when none do.
        nikkud_filtered = [c for c in filtered if c["word"] == our_nikkud]
        pool = nikkud_filtered if nikkud_filtered else filtered

        best, score = _best_match(our_meaning, pool, our_nikkud)

        if best is None or score < FUZZY_THRESHOLD:
            logger.warning(
                "    SKIP  key=%s | meaning=%r | best_score=%.2f",
                key,
                our_meaning,
                score,
            )
            skipped += 1
            continue

        new_slug = best["slug"]
        old_slug = entry["slug"]

        if new_slug == old_slug:
            # This entry is the legitimate owner of the shared slug
            logger.info("    SAME  key=%s | slug=%s (score=%.2f)", key, old_slug, score)
            fixed += 1
            continue

        logger.info(
            "    FIX   key=%s | %s → %s | matched=%r (score=%.2f)",
            key,
            old_slug,
            new_slug,
            best["meaning"],
            score,
        )

        if not dry_run:
            data[key]["slug"] = new_slug

        fixed += 1

    return fixed, skipped


# ---------------------------------------------------------------------------
# CSV update
# ---------------------------------------------------------------------------
def update_csv(data: dict, dry_run: bool) -> None:
    """
    Re-write the CSV so every row's slug column matches words.json.

    The CSV is semicolon-delimited; the slug column is named 'slug'.
    We match rows by 'Word Without Nikkud' (ktiv_male) AND 'Meaning' because
    homographs share the same ktiv_male.

    *data* is the words.json mapping; only entries with a non-empty ktiv_male
    and slug take part in the matching.  With *dry_run* set, the pending
    changes are logged but the file is left untouched.  The file is also left
    untouched when no slug differs, or when the CSV has no 'slug' column.
    """
    # keep_default_na=False: every cell stays a plain string.  Otherwise
    # pandas turns cells such as "NA", "null" or "None" into NaN, and they
    # would be written back as empty cells.
    df = pd.read_csv(CSV_PATH, sep=";", dtype=str, keep_default_na=False)

    if "slug" not in df.columns:
        logger.warning("CSV has no 'slug' column — skipping CSV update")
        return

    # Build a lookup: (ktiv_male, meaning) → new_slug from words.json
    lookup: dict[tuple[str, str], str] = {}
    for entry in data.values():
        ktiv = entry["word"].get("ktiv_male", "")
        meaning = entry.get("meaning", "")
        slug = entry.get("slug", "")
        if ktiv and slug:
            lookup[(ktiv, meaning)] = slug

    changes = 0
    for idx, row in df.iterrows():
        ktiv = str(row.get("Word Without Nikkud", "")).strip()
        meaning = str(row.get("Meaning", "")).strip()
        new_slug = lookup.get((ktiv, meaning))
        if new_slug is None:
            # No words.json entry corresponds to this row
            continue

        old_slug = str(row["slug"]).strip()
        if new_slug == old_slug:
            continue

        logger.info(
            "  CSV row %d: %s → %s  (%s)",
            idx,
            old_slug,
            new_slug,
            ktiv,
        )
        if not dry_run:
            df.at[idx, "slug"] = new_slug
        changes += 1

    logger.info("CSV: %d slug(s) to update", changes)
    if not dry_run and changes:
        # index=False: the row index is not part of the source file; writing
        # it would prepend a new unnamed column on every run.
        df.to_csv(CSV_PATH, sep=";", index=False)
        logger.info("CSV written to %s", CSV_PATH)
    elif dry_run:
        logger.info("DRY-RUN: CSV not written")

# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main(argv: list[str] | None = None) -> int:
    """
    Command-line entry point.

    Loads words.json, repairs every group of entries that share a slug,
    writes words.json back (unless --dry-run) and then syncs the CSV.
    *argv* defaults to the process arguments when None.

    Returns the process exit code: 0 when no entry was skipped, 1 otherwise.
    """
    cli = argparse.ArgumentParser(description="Repair duplicate slugs in data/words.json")
    cli.add_argument("--dry-run", action="store_true", help="Preview changes without writing any files")
    cli.add_argument("--verbose", "-v", action="store_true", help="Enable debug logging")
    opts = cli.parse_args(argv)
    dry_run = opts.dry_run

    if opts.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
    if dry_run:
        logger.info("=== DRY-RUN mode — no files will be modified ===")

    # Read the word list
    logger.info("Loading %s", WORDS_JSON)
    with WORDS_JSON.open(encoding="utf-8") as fh:
        data: dict = json.load(fh)
    logger.info("Loaded %d entries", len(data))

    # Find the slugs that are shared by more than one entry
    groups = find_duplicate_groups(data)
    n_groups = len(groups)
    n_entries = sum(map(len, groups.values()))
    logger.info("Found %d duplicate-slug groups covering %d entries", n_groups, n_entries)

    # Repair the groups one by one, in slug order
    fixed_total = 0
    skipped_total = 0
    for position, slug in enumerate(sorted(groups), 1):
        group_keys = groups[slug]
        logger.info("[%d/%d] slug=%s (%d entries)", position, n_groups, slug, len(group_keys))

        fixed, skipped = repair_group(slug, group_keys, data, dry_run=dry_run)
        fixed_total += fixed
        skipped_total += skipped

        # Respectful delay between HTTP requests (not needed after the last group)
        if position < n_groups:
            time.sleep(REQUEST_DELAY)

    logger.info(
        "Summary: %d fixed, %d skipped (out of %d entries in %d groups)",
        fixed_total,
        skipped_total,
        n_entries,
        n_groups,
    )

    # Persist the repaired word list
    if dry_run:
        logger.info("DRY-RUN: words.json not written")
    else:
        logger.info("Writing %s", WORDS_JSON)
        with WORDS_JSON.open("w", encoding="utf-8") as fh:
            json.dump(data, fh, ensure_ascii=False, indent=2)
        logger.info("words.json written")

    # Bring the CSV in line with words.json
    logger.info("Updating CSV %s", CSV_PATH)
    update_csv(data, dry_run=dry_run)

    return 1 if skipped_total else 0


if __name__ == "__main__":
    sys.exit(main())