Migrate from fragmented CSV + 10 JSON files to a single data/words.json (9,104 entries) as the unified data store. All GUIDs preserved for Anki study progress continuity. New files: - SCHEMA.yaml: authoritative schema for words.json - pealim_list_scrape.py: consolidated list page scraper → words.json - pealim_detail_scrape.py: noun/verb detail scraper → words.json - pealim_audio_download.py: audio downloader reading from words.json - scripts/migrate_to_json.py: one-time CSV→JSON migration - scripts/validate_data.py: 17 data integrity tests - scripts/check_guid_coverage.py: GUID preservation checker - scripts/repair_slugs.py: slug deduplication repair tool - tests/test_scraper_integration.py: live scraper integration tests Updated: - apkg_builder.py: reads from words.json (no more pandas) - run.py: 8-step pipeline (list scrape → frequency → examples → detail scrape → audio download → fonts → images → build) - benyehuda.py, frequency_lookup.py, image_fetch.py: TODO markers for future words.json integration Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
420 lines
14 KiB
Python
420 lines
14 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Repair duplicate slugs in data/words.json.
|
|
|
|
Homographs (words with identical spelling but different meanings) were
|
|
assigned the same slug by the scraper. This script fetches the pealim.com
|
|
search page for each affected word, matches entries by meaning (and nikkud),
|
|
and writes the corrected slugs back to words.json and the source CSV.
|
|
|
|
Usage:
|
|
python3 scripts/repair_slugs.py [--dry-run] [--verbose]
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import re
|
|
import sys
|
|
import time
|
|
from collections import defaultdict
|
|
from difflib import SequenceMatcher
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Paths
|
|
# ---------------------------------------------------------------------------
|
|
# Directory two levels above this file; both data files live in its
# "data" sub-directory.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
WORDS_JSON = PROJECT_ROOT.joinpath("data", "words.json")
CSV_PATH = PROJECT_ROOT.joinpath("data", "hebrew_dict_for_anki.csv")
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# HTTP session
|
|
# ---------------------------------------------------------------------------
|
|
# A single shared session carries the User-Agent header for every request.
SESSION = requests.Session()
SESSION.headers["User-Agent"] = "Mozilla/5.0 (compatible; pealim-scraper/1.0)"

# Cookies passed explicitly with each search request.
COOKIES: dict[str, str] = dict(translit="none", hebstyle="mo")

REQUEST_DELAY = 1.5  # seconds between requests
REQUEST_TIMEOUT = 15  # seconds
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Logging
|
|
# ---------------------------------------------------------------------------
|
|
# Configure the root logger at import time (INFO level, time-stamped lines)
# and create this module's own logger.
logging.basicConfig(
    datefmt="%H:%M:%S",
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Similarity helpers
|
|
# ---------------------------------------------------------------------------
|
|
# Minimum match score an entry must reach before its slug is reassigned;
# entries whose best candidate scores below this are logged and skipped.
FUZZY_THRESHOLD = 0.4
|
|
|
|
|
|
def _similarity(a: str, b: str) -> float:
    """Return a case-insensitive SequenceMatcher ratio between two strings.

    Args:
        a: First string (``None`` is tolerated and treated as empty).
        b: Second string (``None`` is tolerated and treated as empty).

    Returns:
        A float in ``[0.0, 1.0]``. When either side is empty the result is
        ``0.0``: two empty strings would otherwise compare as identical
        (ratio 1.0), which would turn "no meaning available" into a perfect
        match.
    """
    if not a or not b:
        return 0.0
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()
|
|
|
|
|
|
def _best_match(
    our_meaning: str,
    candidates: list[dict],
    our_nikkud: str,
) -> tuple[dict | None, float]:
    """
    Pick the candidate whose meaning is most similar to *our_meaning*.

    Args:
        our_meaning: Meaning text of the entry being repaired.
        candidates: Search-result dicts; each must have "meaning" and "word".
        our_nikkud: Vocalised spelling of the entry; a candidate whose "word"
            equals it receives a small bonus.

    Returns:
        ``(best_candidate, score)`` where *score* is the similarity ratio plus
        any bonus, capped at 1.0. ``(None, -1.0)`` when *candidates* is empty.
        On an exact tie the earliest candidate wins.
    """
    nikkud_bonus = 0.05

    best: dict | None = None
    best_score = -1.0
    best_rank = -1.0

    for cand in candidates:
        rank = _similarity(our_meaning, cand["meaning"])
        if our_nikkud and cand["word"] == our_nikkud:
            rank += nikkud_bonus
        # Rank on the uncapped value: capping before the comparison would
        # erase the bonus exactly when two candidates both have a perfect
        # meaning match (ratio 1.0), which is the tie it exists to break.
        if rank > best_rank:
            best_rank = rank
            best_score = min(1.0, rank)
            best = cand

    return best, best_score
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Search-page parser
|
|
# ---------------------------------------------------------------------------
|
|
def _parse_search_results(html: bytes) -> list[dict]:
    """
    Extract one record per usable hit from a pealim.com search results page.

    Inside every ``div.verb-search-result`` the parser reads:
      - div.verb-search-data > a[href]           → slug (from ``/dict/<slug>/``)
      - div.verb-search-lemma > span.menukad     → nikkud word (falls back to
        the whole lemma div's text when the span is absent)
      - div.verb-search-binyan                   → part of speech
      - div.verb-search-meaning                  → meaning text

    Hits without a data div or without a recognisable slug are dropped.

    Returns a list of dicts with keys: slug, word, pos, meaning.
    """

    def _text(node) -> str:
        # Stripped text of a tag, or "" when the tag was not found.
        return node.get_text(strip=True) if node else ""

    parsed: list[dict] = []
    tree = BeautifulSoup(html, "html.parser")

    for hit in tree.find_all("div", class_="verb-search-result"):
        data_div = hit.find("div", class_="verb-search-data")
        if not data_div:
            continue

        # The slug is the path segment after /dict/ in the detail-page link.
        anchor = data_div.find("a", href=True)
        href_match = re.search(r"/dict/([^/#]+)/", anchor["href"]) if anchor else None
        if not href_match:
            continue

        lemma_div = hit.find("div", class_="verb-search-lemma")
        menukad = lemma_div.find("span", class_="menukad") if lemma_div else None
        word_node = menukad if menukad else lemma_div

        pos_text = _text(hit.find("div", class_="verb-search-binyan"))

        parsed.append(
            {
                "slug": href_match.group(1),
                "word": _text(word_node),
                "pos": pos_text.replace("Part of speech:", "").strip(),
                "meaning": _text(hit.find("div", class_="verb-search-meaning")),
            }
        )

    return parsed
|
|
|
|
|
|
def _fetch_search_results(ktiv_male: str) -> list[dict]:
    """Fetch and parse search results for a given consonant-only spelling.

    Args:
        ktiv_male: The search term sent as the ``q`` query parameter.

    Returns:
        The records produced by ``_parse_search_results`` for the response.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    url = "https://www.pealim.com/search/"
    logger.debug("GET %s?q=%s", url, ktiv_male)
    # Pass the term through ``params`` so requests percent-encodes it;
    # interpolating it into the URL breaks on '&', '#', '+' or '%'.
    resp = SESSION.get(
        url,
        params={"q": ktiv_male},
        cookies=COOKIES,
        timeout=REQUEST_TIMEOUT,
    )
    resp.raise_for_status()
    return _parse_search_results(resp.content)
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Core logic
|
|
# ---------------------------------------------------------------------------
|
|
def find_duplicate_groups(data: dict) -> dict[str, list[str]]:
    """
    Group word keys by slug and keep only the slugs used by 2+ entries.

    *data* maps each top-level words.json key to its entry dict. Entries with
    a missing or empty "slug" are ignored. Keys inside each group keep the
    order in which they appear in *data*.
    """
    keys_by_slug: dict[str, list[str]] = {}
    for word_key in data:
        slug = data[word_key].get("slug", "")
        if not slug:
            continue
        keys_by_slug.setdefault(slug, []).append(word_key)

    duplicates: dict[str, list[str]] = {}
    for slug, members in keys_by_slug.items():
        if len(members) >= 2:
            duplicates[slug] = members
    return duplicates
|
|
|
|
|
|
def repair_group(
    slug: str,
    keys: list[str],
    data: dict,
    dry_run: bool,
) -> tuple[int, int]:
    """
    Try to give every entry that shares *slug* its own correct slug.

    The entries in one group may use different ktiv_male spellings, so the
    search page is queried once per distinct spelling and the results are
    merged (first occurrence of each result slug wins). Candidates are then
    restricted to those whose nikkud word occurs in the group, and each entry
    is matched to a candidate by meaning.

    Args:
        slug: The slug currently shared by the group.
        keys: Top-level words.json keys of the entries in the group.
        data: The full words.json mapping; updated in place unless *dry_run*.
        dry_run: When true, only log what would change.

    Returns (fixed_count, skipped_count). Entries whose best candidate is the
    slug they already have are counted as fixed.
    """
    # Distinct spellings in first-seen order (usually one, sometimes two).
    spellings = list(dict.fromkeys(data[k]["word"]["ktiv_male"] for k in keys))
    several_queries = len(spellings) > 1

    logger.info(
        " Fetching search results for %s — %d entries share slug %s",
        data[keys[0]]["word"]["nikkud"],
        len(keys),
        slug,
    )

    # Union of all search results, de-duplicated by result slug.
    candidates_by_slug: dict[str, dict] = {}
    for spelling in spellings:
        try:
            hits = _fetch_search_results(spelling)
        except requests.RequestException as exc:
            logger.warning(" HTTP error for %s: %s", spelling, exc)
            hits = []
        for hit in hits:
            candidates_by_slug.setdefault(hit["slug"], hit)
        if several_queries:
            # Pause after each sub-query when the group needs several.
            time.sleep(REQUEST_DELAY)

    if not candidates_by_slug:
        logger.warning(" No search results — skipping group")
        return 0, len(keys)
    all_candidates = list(candidates_by_slug.values())

    # Drop results that share the consonants but carry a voweling that no
    # entry in this group has.
    group_nikkuds = {data[k]["word"]["nikkud"] for k in keys}
    same_voweling = [c for c in all_candidates if c["word"] in group_nikkuds]
    if not same_voweling:
        logger.warning(
            " Search results don't contain nikkud %s — candidates: %s — skipping",
            group_nikkuds,
            [c["word"] for c in all_candidates],
        )
        return 0, len(keys)

    fixed = 0
    skipped = 0

    for key in keys:
        entry = data[key]
        meaning = entry.get("meaning", "")
        nikkud = entry["word"]["nikkud"]

        # Prefer candidates with exactly this entry's nikkud; otherwise fall
        # back to everything that matched the group.
        exact = [c for c in same_voweling if c["word"] == nikkud]
        best, score = _best_match(meaning, exact if exact else same_voweling, nikkud)

        if best is None or score < FUZZY_THRESHOLD:
            logger.warning(
                " SKIP key=%s | meaning=%r | best_score=%.2f",
                key,
                meaning,
                score,
            )
            skipped += 1
            continue

        old_slug = entry["slug"]
        new_slug = best["slug"]

        if new_slug != old_slug:
            logger.info(
                " FIX key=%s | %s → %s | matched=%r (score=%.2f)",
                key,
                old_slug,
                new_slug,
                best["meaning"],
                score,
            )
            if not dry_run:
                entry["slug"] = new_slug
        else:
            logger.info(" SAME key=%s | slug=%s (score=%.2f)", key, old_slug, score)

        fixed += 1

    return fixed, skipped
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# CSV update
|
|
# ---------------------------------------------------------------------------
|
|
def update_csv(data: dict, dry_run: bool) -> None:
    """
    Re-write the CSV so every row's slug column matches words.json.

    The CSV is semicolon-delimited; the slug column is named 'slug'.
    Rows are matched by 'Word Without Nikkud' (ktiv_male) AND 'Meaning'
    because homographs share the same ktiv_male.

    Args:
        data: The words.json mapping (word key -> entry dict).
        dry_run: When true, log the pending changes but never write the file.

    The update is skipped with a warning when the CSV file does not exist or
    has no 'slug' column.
    """
    if not CSV_PATH.exists():
        logger.warning("CSV %s not found — skipping CSV update", CSV_PATH)
        return

    df = pd.read_csv(CSV_PATH, sep=";", dtype=str)

    if "slug" not in df.columns:
        logger.warning("CSV has no 'slug' column — skipping CSV update")
        return

    # Build a lookup: (ktiv_male, meaning) → slug from words.json
    lookup: dict[tuple[str, str], str] = {}
    for entry in data.values():
        ktiv = entry["word"].get("ktiv_male", "")
        meaning = entry.get("meaning", "")
        slug = entry.get("slug", "")
        if ktiv and slug:
            lookup[(ktiv, meaning)] = slug

    changes = 0
    for idx, row in df.iterrows():
        ktiv = str(row.get("Word Without Nikkud", "")).strip()
        meaning = str(row.get("Meaning", "")).strip()
        new_slug = lookup.get((ktiv, meaning))
        if new_slug is None:
            continue
        old_slug = str(row["slug"]).strip()
        if new_slug == old_slug:
            continue
        logger.info(
            " CSV row %d: %s → %s (%s)",
            idx,
            old_slug,
            new_slug,
            ktiv,
        )
        if not dry_run:
            df.at[idx, "slug"] = new_slug
        # Counted in dry-run mode too, so the summary shows pending updates.
        changes += 1

    logger.info("CSV: %d slug(s) to update", changes)
    if not dry_run and changes:
        # index=False: the frame was read without index_col, so writing the
        # row index would prepend an extra unnamed column on every run.
        df.to_csv(CSV_PATH, sep=";", index=False)
        logger.info("CSV written to %s", CSV_PATH)
    elif dry_run:
        logger.info("DRY-RUN: CSV not written")
|
|
|
|
|
|
# ---------------------------------------------------------------------------
|
|
# Main
|
|
# ---------------------------------------------------------------------------
|
|
def main(argv: list[str] | None = None) -> int:
    """
    Command-line entry point: repair every duplicate-slug group.

    Loads words.json, repairs each group of entries sharing a slug (pausing
    between groups), writes words.json back unless ``--dry-run`` is given,
    and then synchronises the CSV.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read sys.argv.

    Returns:
        0 when no entry was skipped, 1 otherwise.
    """
    cli = argparse.ArgumentParser(description="Repair duplicate slugs in data/words.json")
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview changes without writing any files",
    )
    cli.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable debug logging",
    )
    options = cli.parse_args(argv)
    dry_run = options.dry_run

    if options.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
    if dry_run:
        logger.info("=== DRY-RUN mode — no files will be modified ===")

    # Load data
    logger.info("Loading %s", WORDS_JSON)
    with WORDS_JSON.open(encoding="utf-8") as src:
        data: dict = json.load(src)
    logger.info("Loaded %d entries", len(data))

    # Identify duplicate groups
    groups = find_duplicate_groups(data)
    total_groups = len(groups)
    total_entries = sum(map(len, groups.values()))
    logger.info(
        "Found %d duplicate-slug groups covering %d entries",
        total_groups,
        total_entries,
    )

    # Process the groups in slug order
    total_fixed = 0
    total_skipped = 0
    for position, slug in enumerate(sorted(groups), start=1):
        members = groups[slug]
        logger.info(
            "[%d/%d] slug=%s (%d entries)",
            position,
            total_groups,
            slug,
            len(members),
        )
        fixed, skipped = repair_group(slug, members, data, dry_run=dry_run)
        total_fixed += fixed
        total_skipped += skipped

        # Respectful delay between groups; none needed after the last one
        if position < total_groups:
            time.sleep(REQUEST_DELAY)

    logger.info(
        "Summary: %d fixed, %d skipped (out of %d entries in %d groups)",
        total_fixed,
        total_skipped,
        total_entries,
        total_groups,
    )

    # Write updated words.json
    if dry_run:
        logger.info("DRY-RUN: words.json not written")
    else:
        logger.info("Writing %s", WORDS_JSON)
        with WORDS_JSON.open("w", encoding="utf-8") as dst:
            json.dump(data, dst, ensure_ascii=False, indent=2)
        logger.info("words.json written")

    # Update CSV
    logger.info("Updating CSV %s", CSV_PATH)
    update_csv(data, dry_run=dry_run)

    return 1 if total_skipped else 0
|
|
|
|
|
|
# Run as a script: main()'s return value becomes the process exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|