hebrew_flash_cards/scripts/repair_slugs.py
Sochen 08fb7009d8 Sprint 11: unified JSON architecture + consolidated scraping pipeline
Migrate from fragmented CSV + 10 JSON files to a single data/words.json
(9,104 entries) as the unified data store. All GUIDs preserved for Anki
study progress continuity.

New files:
- SCHEMA.yaml: authoritative schema for words.json
- pealim_list_scrape.py: consolidated list page scraper → words.json
- pealim_detail_scrape.py: noun/verb detail scraper → words.json
- pealim_audio_download.py: audio downloader reading from words.json
- scripts/migrate_to_json.py: one-time CSV→JSON migration
- scripts/validate_data.py: 17 data integrity tests
- scripts/check_guid_coverage.py: GUID preservation checker
- scripts/repair_slugs.py: slug deduplication repair tool
- tests/test_scraper_integration.py: live scraper integration tests

Updated:
- apkg_builder.py: reads from words.json (no more pandas)
- run.py: 8-step pipeline (list scrape → frequency → examples →
  detail scrape → audio download → fonts → images → build)
- benyehuda.py, frequency_lookup.py, image_fetch.py: TODO markers
  for future words.json integration

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 10:54:58 +00:00

420 lines
14 KiB
Python

#!/usr/bin/env python3
"""
Repair duplicate slugs in data/words.json.
Homographs (words with identical spelling but different meanings) were
assigned the same slug by the scraper. This script fetches the pealim.com
search page for each affected word, matches entries by meaning (and nikkud),
and writes the corrected slugs back to words.json and the source CSV.
Usage:
python3 scripts/repair_slugs.py [--dry-run]
"""
from __future__ import annotations
import argparse
import json
import logging
import re
import sys
import time
from collections import defaultdict
from difflib import SequenceMatcher
from pathlib import Path
import pandas as pd
import requests
from bs4 import BeautifulSoup
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# Repository root: this file sits in <root>/scripts/, hence two ``parent`` hops.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# JSON word store that main() loads and rewrites.
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
# Source CSV whose 'slug' column update_csv() keeps in sync with words.json.
CSV_PATH = PROJECT_ROOT / "data" / "hebrew_dict_for_anki.csv"
# ---------------------------------------------------------------------------
# HTTP session
# ---------------------------------------------------------------------------
# One shared session so every request carries the same User-Agent header.
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-scraper/1.0)"})
# Cookies sent with every search request.
# NOTE(review): presumably these switch off transliteration and select the
# Hebrew display style on pealim.com — confirm against the site.
COOKIES: dict[str, str] = {"translit": "none", "hebstyle": "mo"}
REQUEST_DELAY = 1.5 # seconds between requests
REQUEST_TIMEOUT = 15 # seconds
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Configured at import time with INFO as the default level; main() raises the
# root logger to DEBUG when --verbose is passed.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Similarity helpers
# ---------------------------------------------------------------------------
# Minimum score (similarity ratio plus any nikkud bonus) that repair_group()
# requires before it accepts a candidate; lower-scoring entries are skipped.
FUZZY_THRESHOLD = 0.4
def _similarity(a: str, b: str) -> float:
    """Score how alike two strings are, ignoring case.

    Both inputs are lowercased and compared with ``difflib.SequenceMatcher``
    (no junk function); the return value is the matcher's ``ratio()``, a
    float between 0.0 (nothing in common) and 1.0 (identical).
    """
    left, right = a.lower(), b.lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()
def _best_match(
    our_meaning: str,
    candidates: list[dict],
    our_nikkud: str,
) -> tuple[dict | None, float]:
    """
    Pick the candidate whose meaning is closest to *our_meaning*.

    Each candidate dict must provide ``"meaning"`` and ``"word"`` keys. The
    score is the ``_similarity`` ratio of the two meanings; a candidate whose
    ``"word"`` equals a non-empty *our_nikkud* gets +0.05 (capped at 1.0) so
    the correctly vowelled homograph wins near-ties.

    Returns ``(candidate, score)`` for the highest score; on equal scores the
    earliest candidate is kept. With no candidates the result is
    ``(None, -1.0)``.
    """
    winner: dict | None = None
    top_score = -1.0
    for candidate in candidates:
        score = _similarity(our_meaning, candidate["meaning"])
        # Short-circuit on an empty nikkud so "word" is only read when needed.
        has_nikkud_bonus = bool(our_nikkud) and candidate["word"] == our_nikkud
        if has_nikkud_bonus:
            score = min(1.0, score + 0.05)
        if score <= top_score:
            # Not strictly better than the current leader: keep the earlier one.
            continue
        winner, top_score = candidate, score
    return winner, top_score
# ---------------------------------------------------------------------------
# Search-page parser
# ---------------------------------------------------------------------------
def _parse_search_results(html: bytes) -> list[dict]:
    """
    Extract dictionary entries from a pealim.com search results page.

    Every ``div.verb-search-result`` block is inspected for:
    - div.verb-search-data > a[href] → slug (the path segment after /dict/)
    - div.verb-search-lemma > span.menukad → nikkud word (falls back to the
      whole lemma div text when the span is absent)
    - div.verb-search-binyan → part of speech ("Part of speech:" removed)
    - div.verb-search-meaning → meaning text

    Blocks without a data div, or without a link that yields a slug, are
    dropped. Returns a list of dicts with keys: slug, word, pos, meaning.
    """
    slug_pattern = r"/dict/([^/#]+)/"
    parsed: list[dict] = []
    document = BeautifulSoup(html, "html.parser")
    for result_div in document.find_all("div", class_="verb-search-result"):
        data_div = result_div.find("div", class_="verb-search-data")
        if not data_div:
            continue

        # A result is only usable when its detail-page link yields a slug.
        anchor = data_div.find("a", href=True)
        slug_match = re.search(slug_pattern, anchor["href"]) if anchor else None
        if slug_match is None:
            continue

        # Vowelled word: prefer the dedicated span, else the whole lemma div.
        lemma_div = result_div.find("div", class_="verb-search-lemma")
        if lemma_div:
            menukad_span = lemma_div.find("span", class_="menukad")
            word_source = menukad_span if menukad_span else lemma_div
            word = word_source.get_text(strip=True)
        else:
            word = ""

        # Part of speech, with its label prefix removed.
        pos = ""
        pos_div = result_div.find("div", class_="verb-search-binyan")
        if pos_div:
            pos_text = pos_div.get_text(strip=True)
            pos = pos_text.replace("Part of speech:", "").strip()

        # Meaning text.
        meaning = ""
        meaning_div = result_div.find("div", class_="verb-search-meaning")
        if meaning_div:
            meaning = meaning_div.get_text(strip=True)

        parsed.append(
            {
                "slug": slug_match.group(1),
                "word": word,
                "pos": pos,
                "meaning": meaning,
            }
        )
    return parsed
def _fetch_search_results(ktiv_male: str) -> list[dict]:
    """
    Fetch and parse the pealim.com search results for one spelling.

    Args:
        ktiv_male: The search term (the entry's ``ktiv_male`` spelling).

    Returns:
        The list of result dicts produced by ``_parse_search_results``.

    Raises:
        requests.RequestException: On connection errors, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    url = "https://www.pealim.com/search/"
    logger.debug("GET %s?q=%s", url, ktiv_male)
    # Pass the term through ``params`` so requests URL-encodes it; building
    # the query string by hand breaks on terms containing '&', '#', '+',
    # spaces or quote marks.
    resp = SESSION.get(
        url,
        params={"q": ktiv_male},
        cookies=COOKIES,
        timeout=REQUEST_TIMEOUT,
    )
    resp.raise_for_status()
    return _parse_search_results(resp.content)
# ---------------------------------------------------------------------------
# Core logic
# ---------------------------------------------------------------------------
def find_duplicate_groups(data: dict) -> dict[str, list[str]]:
    """
    Collect the slugs that are used by two or more entries.

    *data* maps word keys (the top-level keys of words.json) to entry dicts.
    Entries with a missing or empty ``"slug"`` are ignored. The result maps
    each shared slug to the list of word keys using it, with slugs and keys
    in the order they were first encountered.
    """
    keys_by_slug: dict[str, list[str]] = {}
    for word_key, record in data.items():
        slug = record.get("slug", "")
        if not slug:
            continue
        keys_by_slug.setdefault(slug, []).append(word_key)

    duplicates: dict[str, list[str]] = {}
    for slug, word_keys in keys_by_slug.items():
        if len(word_keys) >= 2:
            duplicates[slug] = word_keys
    return duplicates
def repair_group(
    slug: str,
    keys: list[str],
    data: dict,
    dry_run: bool,
) -> tuple[int, int]:
    """
    Attempt to repair one group of entries sharing *slug*.

    Homographs can have different ktiv_male spellings (e.g. אבידה vs אבדה for
    the two spellings of אֲבֵדָה). We therefore build a union of all search
    results obtained by querying each distinct ktiv_male in the group.

    Args:
        slug: The slug currently shared by every entry in *keys*.
        keys: Top-level words.json keys of the entries in this group.
        data: The full words.json mapping; matched entries get their
            ``"slug"`` overwritten in place unless *dry_run* is set.
        dry_run: When true, log the intended changes but leave *data* alone.

    Returns:
        ``(fixed_count, skipped_count)``. An entry counts as fixed when a
        candidate scored at least ``FUZZY_THRESHOLD`` (even if its slug was
        already correct, and also in dry-run mode); otherwise it is skipped.
        If the search returns nothing usable, the whole group is skipped.
        When two entries resolve to the same slug a warning is logged, but
        the counts are not affected.
    """
    # Collect distinct ktiv_male values across the group (usually one, but
    # sometimes two when homographs have different consonant spellings).
    ktiv_to_keys: dict[str, list[str]] = defaultdict(list)
    for k in keys:
        ktiv_to_keys[data[k]["word"]["ktiv_male"]].append(k)

    nikkud_word = data[keys[0]]["word"]["nikkud"]
    logger.info(
        " Fetching search results for %s: %d entries share slug %s",
        nikkud_word,
        len(keys),
        slug,
    )

    # Fetch search results for every distinct ktiv_male and merge them,
    # keeping only the first occurrence of each candidate slug.
    all_candidates: list[dict] = []
    seen_slugs: set[str] = set()
    for ktiv in ktiv_to_keys:
        try:
            results = _fetch_search_results(ktiv)
        except requests.RequestException as exc:
            # Best effort: a failed sub-query contributes no candidates.
            logger.warning(" HTTP error for %s: %s", ktiv, exc)
            results = []
        for r in results:
            if r["slug"] not in seen_slugs:
                seen_slugs.add(r["slug"])
                all_candidates.append(r)
        if len(ktiv_to_keys) > 1:
            # Small delay between sub-queries within the same group
            time.sleep(REQUEST_DELAY)

    if not all_candidates:
        logger.warning(" No search results — skipping group")
        return 0, len(keys)

    # Filter candidates to those whose nikkud word matches the entry's nikkud.
    # This avoids accidentally matching a completely different word that shares
    # the same consonant spelling (e.g. different voweling entirely).
    group_nikkuds = {data[k]["word"]["nikkud"] for k in keys}
    filtered = [c for c in all_candidates if c["word"] in group_nikkuds]
    if not filtered:
        logger.warning(
            " Search results don't contain nikkud %s — candidates: %s — skipping",
            group_nikkuds,
            [c["word"] for c in all_candidates],
        )
        return 0, len(keys)

    fixed = 0
    skipped = 0
    # Matched slug -> first key that claimed it. Entries are matched
    # independently, so two of them can land on the same candidate; that
    # leaves the duplicate in place and must be visible in the log.
    claimed: dict[str, str] = {}
    for key in keys:
        entry = data[key]
        our_meaning = entry.get("meaning", "")
        our_nikkud = entry["word"]["nikkud"]

        # Prefer candidates with exactly this entry's nikkud; fall back to
        # every candidate that matched some nikkud in the group.
        pool = [c for c in filtered if c["word"] == our_nikkud] or filtered
        best, score = _best_match(our_meaning, pool, our_nikkud)
        if best is None or score < FUZZY_THRESHOLD:
            logger.warning(
                " SKIP key=%s | meaning=%r | best_score=%.2f",
                key,
                our_meaning,
                score,
            )
            skipped += 1
            continue

        new_slug = best["slug"]
        old_slug = entry["slug"]
        if new_slug in claimed:
            logger.warning(
                " DUPLICATE key=%s | slug=%s was already matched to key=%s",
                key,
                new_slug,
                claimed[new_slug],
            )
        else:
            claimed[new_slug] = key

        if new_slug == old_slug:
            logger.info(" SAME key=%s | slug=%s (score=%.2f)", key, old_slug, score)
            fixed += 1
            continue

        logger.info(
            " FIX key=%s | %s -> %s | matched=%r (score=%.2f)",
            key,
            old_slug,
            new_slug,
            best["meaning"],
            score,
        )
        if not dry_run:
            data[key]["slug"] = new_slug
        fixed += 1
    return fixed, skipped
# ---------------------------------------------------------------------------
# CSV update
# ---------------------------------------------------------------------------
def update_csv(data: dict, dry_run: bool) -> None:
    """
    Re-write the CSV so every row's slug column matches words.json.

    The CSV is semicolon-delimited; the slug column is named 'slug'.
    We match rows by 'Word Without Nikkud' (ktiv_male) AND 'Meaning' because
    homographs share the same ktiv_male.

    Args:
        data: The words.json mapping holding the (possibly repaired) slugs.
        dry_run: When true, only log the rows that would change; the file is
            not written.

    The file is rewritten only when at least one slug differs, and without a
    pandas index column so its column layout stays the same.
    """
    # keep_default_na=False reads every cell as literal text. With the
    # default NA parsing, cells such as "null" or "NA" become NaN, compare
    # as "nan" below, and are written back as empty strings.
    df = pd.read_csv(CSV_PATH, sep=";", dtype=str, keep_default_na=False)
    if "slug" not in df.columns:
        logger.warning("CSV has no 'slug' column — skipping CSV update")
        return

    # Build a lookup: (ktiv_male, meaning) → new_slug from words.json
    lookup: dict[tuple[str, str], str] = {}
    for entry in data.values():
        ktiv = entry["word"].get("ktiv_male", "")
        meaning = entry.get("meaning", "")
        slug = entry.get("slug", "")
        if ktiv and slug:
            lookup[(ktiv, meaning)] = slug

    changes = 0
    for idx, row in df.iterrows():
        ktiv = str(row.get("Word Without Nikkud", "")).strip()
        meaning = str(row.get("Meaning", "")).strip()
        new_slug = lookup.get((ktiv, meaning))
        if new_slug is None:
            continue
        old_slug = str(row["slug"]).strip()
        if new_slug == old_slug:
            continue
        logger.info(
            " CSV row %d: %s -> %s (%s)",
            idx,
            old_slug,
            new_slug,
            ktiv,
        )
        if not dry_run:
            df.at[idx, "slug"] = new_slug
        # Counted in dry-run mode too, so the summary shows pending updates.
        changes += 1

    logger.info("CSV: %d slug(s) to update", changes)
    if not dry_run and changes:
        # index=False: the frame was read without an index column, so writing
        # the index would add a spurious leading column on every run.
        df.to_csv(CSV_PATH, sep=";", index=False)
        logger.info("CSV written to %s", CSV_PATH)
    elif dry_run:
        logger.info("DRY-RUN: CSV not written")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main(argv: list[str] | None = None) -> int:
    """
    Command-line entry point: repair every duplicate-slug group.

    Parses ``--dry-run`` and ``--verbose``/``-v`` from *argv* (``None`` makes
    argparse read ``sys.argv``), loads words.json, repairs each group of
    entries that share a slug (pausing ``REQUEST_DELAY`` seconds between
    groups), then writes words.json and syncs the CSV unless in dry-run mode.

    Returns 0 when no entry had to be skipped, otherwise 1.
    """
    cli = argparse.ArgumentParser(description="Repair duplicate slugs in data/words.json")
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview changes without writing any files",
    )
    cli.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable debug logging",
    )
    options = cli.parse_args(argv)
    dry_run: bool = options.dry_run

    if options.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
    if dry_run:
        logger.info("=== DRY-RUN mode — no files will be modified ===")

    # Load the word store.
    logger.info("Loading %s", WORDS_JSON)
    with WORDS_JSON.open(encoding="utf-8") as fh:
        data: dict = json.load(fh)
    logger.info("Loaded %d entries", len(data))

    # Find the slugs that more than one entry claims.
    groups = find_duplicate_groups(data)
    group_count = len(groups)
    entry_count = sum(map(len, groups.values()))
    logger.info(
        "Found %d duplicate-slug groups covering %d entries",
        group_count,
        entry_count,
    )

    # Repair the groups in slug order.
    fixed_total = 0
    skipped_total = 0
    ordered_groups = sorted(groups.items())
    for position, (slug, keys) in enumerate(ordered_groups, start=1):
        logger.info(
            "[%d/%d] slug=%s (%d entries)",
            position,
            group_count,
            slug,
            len(keys),
        )
        fixed, skipped = repair_group(slug, keys, data, dry_run=dry_run)
        fixed_total += fixed
        skipped_total += skipped
        # Be polite to the server: pause before every group except after the last.
        if position != group_count:
            time.sleep(REQUEST_DELAY)

    logger.info(
        "Summary: %d fixed, %d skipped (out of %d entries in %d groups)",
        fixed_total,
        skipped_total,
        entry_count,
        group_count,
    )

    # Persist the repaired word store.
    if dry_run:
        logger.info("DRY-RUN: words.json not written")
    else:
        logger.info("Writing %s", WORDS_JSON)
        with WORDS_JSON.open("w", encoding="utf-8") as fh:
            json.dump(data, fh, ensure_ascii=False, indent=2)
        logger.info("words.json written")

    # Bring the CSV's slug column in line with words.json.
    logger.info("Updating CSV %s", CSV_PATH)
    update_csv(data, dry_run=dry_run)

    return 1 if skipped_total else 0
if __name__ == "__main__":
    # Run only when executed as a script; main()'s return value becomes the
    # process exit status.
    raise SystemExit(main())