hebrew_flash_cards/scripts/scrape_verb_ktiv.py
Sochen 17f7458d19 Sprint 9: cloze cards, plurals deck, project reorg, lint tooling
- Cloze card pipeline: 924 cards from 2,296 AI-vetted Hebrew book sentences
- Plurals deck: 375 notes (144 irregular + 231 regular from 86 mishkal patterns)
- Ktiv male forms expanded to 20,711 entries for sentence matching
- Project reorg: helpers.py (deduped strip_nikkud from 10 files), scripts/ for
  one-off tools, tests/ with smoke tests, deleted 3 dead files
- Lint tooling: pyproject.toml with ruff/vulture/bandit/pytest config, .editorconfig,
  fixed all 129 ruff errors (B023 closure fix, SIM103, unused vars)
- validate_apkg.py: card count range check for optional cloze template
- Data caches committed: vetted_sentences, ktiv_male_forms, noun_plurals,
  noun_slug_map, vocab_sentence_matches, epub_sentence_index

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-07 08:09:39 +00:00

250 lines
8.2 KiB
Python

#!/usr/bin/env python3
"""Scrape ktiv male (vowelless plene) conjugation forms for top 500 verbs from pealim.com."""
import json
import os
import re
import sys
import time
# Flush stdout at every newline so per-verb progress lines appear as they are printed.
sys.stdout.reconfigure(line_buffering=True)
import requests # noqa: E402
from bs4 import BeautifulSoup # noqa: E402
# Every data file lives in the "data" directory next to this script's parent directory.
DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
# Input: the verbs to scrape; main() reads each item's "wni" and "word" keys.
INPUT_FILE = os.path.join(DATA_DIR, "top_verbs_to_scrape.json")
# Output: mapping of scraped form -> list of entry dicts, merged with what is already there.
OUTPUT_FILE = os.path.join(DATA_DIR, "ktiv_male_forms.json")
# Intermediate merged result written at each checkpoint and deleted after a full run.
PARTIAL_FILE = os.path.join(DATA_DIR, "ktiv_male_forms_partial.json")
# Resume state: {"done_wnis": [...], "slug_cache": {...}}.
PROGRESS_FILE = os.path.join(DATA_DIR, "ktiv_scrape_progress.json")
# NOTE(review): presumably these cookies make pealim.com drop transliteration and
# render the vowelless spelling - confirm against the site before changing them.
COOKIES = {"translit": "none", "hebstyle": "vl"}
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; PealimScraper/1.0)"}
# Seconds slept after each HTTP request to throttle the scraper.
DELAY = 1.5
# One shared HTTP session so every request carries the cookies and headers above.
session = requests.Session()
session.cookies.update(COOKIES)
session.headers.update(HEADERS)
def load_json(path):
    """Return the parsed JSON content of *path*, or ``{}`` if the file is missing.

    The file is opened directly instead of being checked with
    ``os.path.exists`` first, so there is no window in which the file can
    disappear between the check and the open.

    Args:
        path: Location of a UTF-8 encoded JSON file.

    Returns:
        Whatever ``json.load`` produces for the file, or an empty dict when
        the file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
        OSError: For failures other than a missing file (e.g. permissions).
    """
    try:
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        return {}
def save_json(data, path):
    """Serialize *data* as JSON to *path* without ever leaving a half-written file.

    The JSON is written to a sibling ``<path>.tmp`` file which is then moved
    over *path* with ``os.replace``. If serialization or the write fails, the
    previous content of *path* is left untouched and the temporary file is
    removed.

    Args:
        data: Any JSON-serializable object.
        path: Destination file; it is overwritten on success.

    Raises:
        TypeError: If *data* contains a value ``json.dump`` cannot serialize.
        OSError: If the temporary file cannot be written or moved.
    """
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps Hebrew text readable in the file.
            json.dump(data, f, ensure_ascii=False, indent=1)
        os.replace(tmp_path, path)
    finally:
        # After a successful replace the temp file is gone; this only fires on failure.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
def search_slug(wni):
    """Look *wni* up in pealim's search and return the slug of the first dictionary hit.

    Args:
        wni: The query string sent as the ``q`` parameter of the search page.

    Returns:
        The ``<digits>-<name>`` part of the first link whose href starts with
        ``/dict/<digits>-<name>/``, or ``None`` when no link matches.

    Raises:
        requests.HTTPError: If the search page answers with an error status.
    """
    response = session.get(
        "https://www.pealim.com/search/",
        params={"q": wni},
        timeout=15,
    )
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    # Result links look like /dict/SLUG/; anchors are examined lazily, in
    # document order, and the scan stops at the first one that matches.
    dict_link = re.compile(r"/dict/(\d+-[^/]+)/")
    candidates = (dict_link.match(anchor["href"]) for anchor in page.select("a[href]"))
    return next((hit.group(1) for hit in candidates if hit), None)
def scrape_verb_forms(slug):
    """Fetch a verb's detail page and collect the text of its ``span.menukad`` elements.

    Args:
        slug: The ``<digits>-<name>`` identifier used in ``/dict/<slug>/`` URLs.

    Returns:
        A ``(forms, word_nikkud)`` tuple. ``forms`` is the set of non-empty,
        whitespace-stripped texts of every ``span.menukad`` on the page.
        ``word_nikkud`` is the text of the first ``span.menukad`` inside the
        first ``<h1>``, or ``None`` if there is none.

    Raises:
        requests.HTTPError: If the page answers with an error status.
    """
    url = f"https://www.pealim.com/dict/{slug}/"
    resp = session.get(url, timeout=15)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # A single page-wide pass is enough: it already includes the spans inside
    # div.lead, and collecting into a set makes a separate pass a no-op.
    # NOTE(review): the selector is not limited to the conjugation tables, so
    # any other span.menukad on the page is collected too - confirm intended.
    texts = (span.get_text(strip=True) for span in soup.select("span.menukad"))
    forms = {text for text in texts if text}

    # Headline form, taken from the first <h1> only.
    # NOTE(review): with the session's "hebstyle" cookie this may not be a
    # vowelled spelling despite the name - verify before relying on it.
    word_nikkud = None
    title = soup.select_one("h1")
    if title:
        menukad_in_title = title.select_one("span.menukad")
        if menukad_in_title:
            word_nikkud = menukad_in_title.get_text(strip=True)

    return forms, word_nikkud
_CHECKPOINT_EVERY = 50  # verbs processed between two checkpoint saves


def _merge_forms(base, additions):
    """Return a new form -> entries mapping that combines *base* and *additions*.

    Entry lists are copied, so neither input is mutated or aliased by the
    result. Within one form, an entry from *additions* is skipped when an
    entry with the same "slug" is already present; entries without a slug
    are compared as whole dicts instead.
    """
    merged = {form: list(entries) for form, entries in base.items()}
    for form, entries in additions.items():
        bucket = merged.setdefault(form, [])
        known_slugs = {e.get("slug") for e in bucket}
        for entry in entries:
            slug = entry.get("slug")
            duplicate = entry in bucket if slug is None else slug in known_slugs
            if not duplicate:
                bucket.append(entry)
                known_slugs.add(slug)
    return merged


def _save_checkpoint(existing_forms, new_forms, done_wnis, slug_cache):
    """Write the partial merged forms, then the progress record.

    The forms are written first so the progress file never lists a verb as
    done whose forms have not been saved yet.
    """
    save_json(_merge_forms(existing_forms, new_forms), PARTIAL_FILE)
    save_json({"done_wnis": list(done_wnis), "slug_cache": slug_cache}, PROGRESS_FILE)


def main():
    """Scrape every not-yet-done verb from INPUT_FILE and merge its forms into OUTPUT_FILE.

    Resume behaviour:
      * Verbs listed in PROGRESS_FILE under "done_wnis" are skipped.
      * If PARTIAL_FILE is left over from an interrupted run, its forms are
        merged in at startup; otherwise the forms of verbs already marked
        done would never reach OUTPUT_FILE.
      * A verb whose scrape raises is counted as skipped for this run but is
        not marked done, so the next run retries it.

    Exits with status 1 when the input file is missing or empty.
    """
    verbs = load_json(INPUT_FILE)
    if not verbs:
        print("ERROR: No verbs found in input file")
        sys.exit(1)

    # Forms already on disk, plus anything checkpointed by an interrupted run.
    existing_forms = load_json(OUTPUT_FILE)
    leftover_forms = load_json(PARTIAL_FILE)
    if leftover_forms:
        existing_forms = _merge_forms(existing_forms, leftover_forms)
        print(f"Recovered {len(leftover_forms)} form keys from an interrupted run")
    new_forms = {}  # form -> entries scraped in this run

    # Load progress to resume
    progress = load_json(PROGRESS_FILE)
    done_wnis = set(progress.get("done_wnis", []))
    slug_cache = progress.get("slug_cache", {})

    # Pre-populate slug cache from conjugations.json (saves one search request per hit)
    conj_file = os.path.join(DATA_DIR, "conjugations.json")
    if os.path.exists(conj_file):
        conj_data = load_json(conj_file)
        for wni_key, cdata in conj_data.items():
            if isinstance(cdata, dict) and "slug" in cdata and wni_key not in slug_cache:
                slug_cache[wni_key] = cdata["slug"]
        print(f"Pre-populated {len(slug_cache)} slugs from conjugations.json")

    # Deduplicate verbs by wni, keeping the first occurrence
    seen_wni = set()
    unique_verbs = []
    for v in verbs:
        if v["wni"] not in seen_wni:
            seen_wni.add(v["wni"])
            unique_verbs.append(v)

    total = len(unique_verbs)
    to_scrape = [v for v in unique_verbs if v["wni"] not in done_wnis]
    pending = len(to_scrape)
    print(f"Total unique verbs: {total}, already done: {total - pending}, to scrape: {pending}")

    scraped_count = 0
    skipped_count = 0
    total_new_forms = 0
    sample_verbs = {}  # For summary: wni -> sorted list of forms

    for i, verb in enumerate(to_scrape, start=1):
        wni = verb["wni"]
        # The input file's spelling is stored in each entry; the one parsed
        # from the page is ignored.
        word_nikkud = verb["word"]
        try:
            # Step 1: Find slug
            if wni in slug_cache:
                slug = slug_cache[wni]
            else:
                slug = search_slug(wni)
                time.sleep(DELAY)
                if slug:
                    slug_cache[wni] = slug

            if slug:
                # Step 2: Scrape forms
                forms, _page_nikkud = scrape_verb_forms(slug)
                time.sleep(DELAY)

                for form in forms:
                    bucket = new_forms.setdefault(form, [])
                    # One entry per (form, slug) pair
                    if not any(e["slug"] == slug for e in bucket):
                        bucket.append(
                            {
                                "word_nikkud": word_nikkud,
                                "form_type": "conjugation",
                                "pos": "Verb",
                                "slug": slug,
                            }
                        )
                        total_new_forms += 1

                scraped_count += 1
                # Collect samples (first 3 completed)
                if len(sample_verbs) < 3:
                    sample_verbs[wni] = sorted(forms)
                print(f" [{i}/{pending}] {wni} -> {slug} ({len(forms)} forms)")
            else:
                print(f" [{i}/{pending}] SKIP {wni} - not found on pealim")
                skipped_count += 1

            # Reached only when nothing raised: found-and-scraped or definitively not found.
            done_wnis.add(wni)
        except Exception as e:
            # Best effort: report and move on. The verb is deliberately NOT
            # marked done, so a transient failure is retried on the next run.
            print(f" [{i}/{pending}] ERROR {wni}: {e}")
            skipped_count += 1
            # Keep throttling after a failed request as well.
            time.sleep(DELAY)

        # Checkpoint on every 50th verb, whether it was scraped, skipped or failed
        if i % _CHECKPOINT_EVERY == 0:
            _save_checkpoint(existing_forms, new_forms, done_wnis, slug_cache)
            print(f" -- Progress saved at {i}/{pending} --")

    # Final merge
    merged = _merge_forms(existing_forms, new_forms)
    save_json(merged, OUTPUT_FILE)

    # Save final progress
    save_json({"done_wnis": list(done_wnis), "slug_cache": slug_cache}, PROGRESS_FILE)

    # Clean up partial file; its content is now part of OUTPUT_FILE
    if os.path.exists(PARTIAL_FILE):
        os.remove(PARTIAL_FILE)

    # Summary
    print(f"\n{'=' * 50}")
    print("SUMMARY")
    print(f"{'=' * 50}")
    print(f"Verbs scraped: {scraped_count}")
    print(f"Verbs skipped: {skipped_count}")
    print(f"New forms added: {total_new_forms}")
    print(f"Total unique ktiv male forms: {len(merged)}")
    print(f"Previous forms count: {len(existing_forms)}")
    print(f"Net new form keys: {len(merged) - len(existing_forms)}")
    if sample_verbs:
        print("\nSample verbs:")
        for wni, forms in sample_verbs.items():
            print(f"\n {wni} ({len(forms)} forms):")
            for form in forms[:8]:
                print(f" {form}")
            if len(forms) > 8:
                print(f" ... and {len(forms) - 8} more")


if __name__ == "__main__":
    main()