- Cloze card pipeline: 924 cards from 2,296 AI-vetted Hebrew book sentences - Plurals deck: 375 notes (144 irregular + 231 regular from 86 mishkal patterns) - Ktiv male forms expanded to 20,711 entries for sentence matching - Project reorg: helpers.py (deduped strip_nikkud from 10 files), scripts/ for one-off tools, tests/ with smoke tests, deleted 3 dead files - Lint tooling: pyproject.toml with ruff/vulture/bandit/pytest config, .editorconfig, fixed all 129 ruff errors (B023 closure fix, SIM103, unused vars) - validate_apkg.py: card count range check for optional cloze template - Data caches committed: vetted_sentences, ktiv_male_forms, noun_plurals, noun_slug_map, vocab_sentence_matches, epub_sentence_index Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
250 lines
8.2 KiB
Python
250 lines
8.2 KiB
Python
#!/usr/bin/env python3
|
|
"""Scrape ktiv male (vowelless plene) conjugation forms for top 500 verbs from pealim.com."""
|
|
|
|
import json
|
|
import os
|
|
import re
|
|
import sys
|
|
import time
|
|
|
|
sys.stdout.reconfigure(line_buffering=True)
|
|
import requests # noqa: E402
|
|
from bs4 import BeautifulSoup # noqa: E402
|
|
|
|
# All data files live in the "data" directory located one level above the
# directory that contains this script.
_BASE_DIR = os.path.dirname(os.path.dirname(__file__))
DATA_DIR = os.path.join(_BASE_DIR, "data")

# Verb list read by main().
INPUT_FILE = os.path.join(DATA_DIR, "top_verbs_to_scrape.json")
# Final form -> entries mapping written by main().
OUTPUT_FILE = os.path.join(DATA_DIR, "ktiv_male_forms.json")
# Intermediate merged mapping written at each checkpoint, removed on success.
PARTIAL_FILE = os.path.join(DATA_DIR, "ktiv_male_forms_partial.json")
# Resume state: processed wnis plus the wni -> slug cache.
PROGRESS_FILE = os.path.join(DATA_DIR, "ktiv_scrape_progress.json")

# Cookies and headers attached to every request made through `session`.
COOKIES = {"translit": "none", "hebstyle": "vl"}
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; PealimScraper/1.0)"}

# Seconds slept after each HTTP request.
DELAY = 1.5

# One shared HTTP session so the cookies and headers above apply to all requests.
session = requests.Session()
session.headers.update(HEADERS)
session.cookies.update(COOKIES)
|
|
|
|
|
|
def load_json(path):
    """Parse the UTF-8 JSON file at ``path`` and return its content.

    Returns an empty dict when nothing exists at ``path``.
    """
    if not os.path.exists(path):
        return {}
    with open(path, encoding="utf-8") as handle:
        return json.load(handle)
|
|
|
|
|
|
def save_json(data, path):
    """Write ``data`` as UTF-8 JSON to ``path`` without ever leaving it half-written.

    The JSON is first written to ``<path>.tmp`` in the same directory and then
    moved over ``path`` with ``os.replace``. If serialization or the write is
    interrupted, the previous content of ``path`` stays intact, so a later
    resume can still load it.

    Args:
        data: JSON-serializable object.
        path: Destination file path.

    Raises:
        TypeError: If ``data`` is not JSON-serializable.
        OSError: If the file cannot be written or replaced.
    """
    tmp_path = f"{path}.tmp"
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            # ensure_ascii=False keeps non-ASCII text readable in the file.
            json.dump(data, f, ensure_ascii=False, indent=1)
        os.replace(tmp_path, path)
    except BaseException:
        # Also covers KeyboardInterrupt: drop the incomplete temp file, then
        # let the original exception propagate unchanged.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)
        raise
|
|
|
|
|
|
def search_slug(wni):
    """Look ``wni`` up on the pealim.com search page and return the first dictionary slug.

    Sends ``wni`` as the ``q`` query parameter through the shared ``session``
    (15 second timeout) and scans every link on the result page in document
    order. The first ``href`` that starts with ``/dict/<digits>-<text>/``
    wins; its ``<digits>-<text>`` part is returned.

    Returns:
        The slug string, or ``None`` when no link matches.

    Raises:
        requests.HTTPError: Via ``raise_for_status`` on an error status code.
    """
    response = session.get("https://www.pealim.com/search/", params={"q": wni}, timeout=15)
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    # Lazily match each link; stop at the first one that looks like /dict/SLUG/.
    candidates = (re.match(r"/dict/(\d+-[^/]+)/", link["href"]) for link in page.select("a[href]"))
    first_hit = next((hit for hit in candidates if hit), None)
    if first_hit is None:
        return None
    return first_hit.group(1)
|
|
|
|
|
|
def scrape_verb_forms(slug):
    """Download the pealim.com dictionary page for ``slug`` and collect its word forms.

    Every non-empty, whitespace-stripped text of a ``span.menukad`` element on
    the page is collected into a set (the spans inside ``div.lead`` are
    gathered first, then all spans on the page). The text of the first
    ``span.menukad`` inside the ``h1`` heading, if any, is returned separately.

    Returns:
        A ``(forms, word_nikkud)`` tuple: ``forms`` is a set of strings and
        ``word_nikkud`` is the heading span text or ``None``.

    Raises:
        requests.HTTPError: Via ``raise_for_status`` on an error status code.
    """
    response = session.get(f"https://www.pealim.com/dict/{slug}/", timeout=15)
    response.raise_for_status()
    page = BeautifulSoup(response.text, "html.parser")

    def span_texts(nodes):
        # Stripped text of each node, dropping nodes that contain no text.
        stripped = (node.get_text(strip=True) for node in nodes)
        return {text for text in stripped if text}

    # Spans in the lead block (where the page has one).
    lead_block = page.select_one("div.lead")
    collected = span_texts(lead_block.select("span.menukad")) if lead_block else set()

    # Heading form; callers may prefer the form from their own input data.
    heading = page.select_one("h1")
    heading_span = heading.select_one("span.menukad") if heading else None
    word_nikkud = heading_span.get_text(strip=True) if heading_span else None

    # Every span.menukad anywhere on the page.
    collected |= span_texts(page.select("span.menukad"))

    return collected, word_nikkud
|
|
|
|
|
|
def _merge_forms(base, additions):
    """Return a new form -> entries mapping that combines ``base`` and ``additions``.

    An entry from ``additions`` is appended to a form's list unless an entry
    with the same "slug" is already there. Neither input mapping nor any of
    their lists is mutated.
    """
    merged = {form: list(entries) for form, entries in base.items()}
    for form, entries in additions.items():
        bucket = merged.setdefault(form, [])
        seen_slugs = {e.get("slug") for e in bucket}
        for entry in entries:
            slug = entry.get("slug")
            if slug is None or slug not in seen_slugs:
                bucket.append(entry)
                seen_slugs.add(slug)
    return merged


def main():
    """Scrape forms for every verb in INPUT_FILE and merge them into OUTPUT_FILE.

    Workflow:
      1. Load the verb list, the existing output, and the resume state
         (processed wnis and the wni -> slug cache) from PROGRESS_FILE.
      2. Recover entries checkpointed in PARTIAL_FILE by an interrupted run;
         without this, verbs already marked done would never be re-scraped
         and their forms would be lost.
      3. For each remaining verb, resolve its slug (cache first, then the
         site search), scrape its forms and record one entry per form.
      4. Every 50 verbs write a checkpoint: merged forms first, then progress.
      5. At the end write OUTPUT_FILE and PROGRESS_FILE, delete PARTIAL_FILE
         and print a summary.

    Exits with status 1 when the input file is missing or empty.
    """
    verbs = load_json(INPUT_FILE)
    if not verbs:
        print("ERROR: No verbs found in input file")
        sys.exit(1)

    existing_forms = load_json(OUTPUT_FILE)

    # PARTIAL_FILE only exists when an earlier run stopped before its final
    # merge. Seed new_forms with the entries it holds that OUTPUT_FILE lacks.
    try:
        recovered = load_json(PARTIAL_FILE)
    except ValueError:
        # A truncated checkpoint cannot be parsed; continue without it.
        print("WARNING: partial file is not valid JSON, ignoring it")
        recovered = {}
    new_forms = {}  # Merged into existing_forms at checkpoints and at the end
    recovered_count = 0
    for form, entries in recovered.items():
        already_saved = existing_forms.get(form, [])
        pending = [e for e in entries if e not in already_saved]
        if pending:
            new_forms[form] = pending
            recovered_count += len(pending)
    if recovered_count:
        print(f"Recovered {recovered_count} form entries from an interrupted run")

    # Resume state
    progress = load_json(PROGRESS_FILE)
    done_wnis = set(progress.get("done_wnis", []))
    slug_cache = progress.get("slug_cache", {})

    # Pre-populate slug cache from conjugations.json
    conj_file = os.path.join(DATA_DIR, "conjugations.json")
    if os.path.exists(conj_file):
        prepopulated = 0
        for wni_key, cdata in load_json(conj_file).items():
            if isinstance(cdata, dict) and "slug" in cdata and wni_key not in slug_cache:
                slug_cache[wni_key] = cdata["slug"]
                prepopulated += 1
        # Report only the slugs added here, not the whole cache size.
        print(f"Pre-populated {prepopulated} slugs from conjugations.json")

    # Deduplicate verbs by wni, keeping the first occurrence
    seen_wni = set()
    unique_verbs = []
    for v in verbs:
        if v["wni"] not in seen_wni:
            seen_wni.add(v["wni"])
            unique_verbs.append(v)

    total = len(unique_verbs)
    to_scrape = [v for v in unique_verbs if v["wni"] not in done_wnis]
    print(f"Total unique verbs: {total}, already done: {total - len(to_scrape)}, to scrape: {len(to_scrape)}")

    def save_checkpoint():
        # Forms first, progress second: if the run dies between the two
        # writes, the affected verbs are merely re-scraped on resume
        # (duplicates are filtered by slug) instead of being lost.
        save_json(_merge_forms(existing_forms, new_forms), PARTIAL_FILE)
        save_json({"done_wnis": list(done_wnis), "slug_cache": slug_cache}, PROGRESS_FILE)

    scraped_count = 0
    skipped_count = 0
    total_new_forms = 0
    sample_verbs = {}  # For summary: wni -> sorted list of forms

    for i, verb in enumerate(to_scrape):
        wni = verb["wni"]
        # The nikkud form from the input data is stored with every entry.
        word_nikkud_input = verb["word"]
        tag = f" [{i + 1}/{len(to_scrape)}]"

        try:
            # Step 1: Find slug
            if wni in slug_cache:
                slug = slug_cache[wni]
            else:
                slug = search_slug(wni)
                time.sleep(DELAY)

            if not slug:
                print(f"{tag} SKIP {wni} - not found on pealim")
                skipped_count += 1
            else:
                slug_cache[wni] = slug

                # Step 2: Scrape forms (the nikkud form parsed from the page is not used)
                forms, _page_nikkud = scrape_verb_forms(slug)
                time.sleep(DELAY)

                # One entry per form, at most one per slug
                for form in forms:
                    bucket = new_forms.setdefault(form, [])
                    if not any(e.get("slug") == slug for e in bucket):
                        bucket.append(
                            {
                                "word_nikkud": word_nikkud_input,
                                "form_type": "conjugation",
                                "pos": "Verb",
                                "slug": slug,
                            }
                        )
                        total_new_forms += 1

                scraped_count += 1
                # Collect samples (first 3 completed)
                if len(sample_verbs) < 3:
                    sample_verbs[wni] = sorted(forms)

                print(f"{tag} {wni} -> {slug} ({len(forms)} forms)")

        except Exception as e:
            # Per-verb boundary: one failing verb must not abort the whole run.
            print(f"{tag} ERROR {wni}: {e}")
            skipped_count += 1

        # NOTE(review): failed and not-found verbs are marked done as well, so
        # a rerun will not retry them unless PROGRESS_FILE is edited.
        done_wnis.add(wni)

        # Save progress every 50 verbs, whatever the outcome of this verb
        if (i + 1) % 50 == 0:
            save_checkpoint()
            print(f" -- Progress saved at {i + 1}/{len(to_scrape)} --")

    # Final merge
    merged = _merge_forms(existing_forms, new_forms)
    save_json(merged, OUTPUT_FILE)

    # Save final progress
    save_json({"done_wnis": list(done_wnis), "slug_cache": slug_cache}, PROGRESS_FILE)

    # The checkpoint is redundant once the final output is written
    if os.path.exists(PARTIAL_FILE):
        os.remove(PARTIAL_FILE)

    # Summary
    print(f"\n{'=' * 50}")
    print("SUMMARY")
    print(f"{'=' * 50}")
    print(f"Verbs scraped: {scraped_count}")
    print(f"Verbs skipped: {skipped_count}")
    print(f"New forms added: {total_new_forms}")
    print(f"Total unique ktiv male forms: {len(merged)}")
    print(f"Previous forms count: {len(existing_forms)}")
    print(f"Net new form keys: {len(merged) - len(existing_forms)}")

    if sample_verbs:
        print("\nSample verbs:")
        for wni, forms in sample_verbs.items():
            print(f"\n {wni} ({len(forms)} forms):")
            for form in forms[:8]:
                print(f" {form}")
            if len(forms) > 8:
                print(f" ... and {len(forms) - 8} more")


if __name__ == "__main__":
    main()
|