- Cloze card pipeline: 924 cards from 2,296 AI-vetted Hebrew book sentences - Plurals deck: 375 notes (144 irregular + 231 regular from 86 mishkal patterns) - Ktiv male forms expanded to 20,711 entries for sentence matching - Project reorg: helpers.py (deduped strip_nikkud from 10 files), scripts/ for one-off tools, tests/ with smoke tests, deleted 3 dead files - Lint tooling: pyproject.toml with ruff/vulture/bandit/pytest config, .editorconfig, fixed all 129 ruff errors (B023 closure fix, SIM103, unused vars) - validate_apkg.py: card count range check for optional cloze template - Data caches committed: vetted_sentences, ktiv_male_forms, noun_plurals, noun_slug_map, vocab_sentence_matches, epub_sentence_index Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
365 lines
12 KiB
Python
365 lines
12 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Scrape pealim.com for noun plural and construct forms.
|
|
|
|
Step 1: Collect noun slugs from list pages (/dict/?pos=noun&page=N)
|
|
Step 2: Fetch detail pages for plural + construct forms
|
|
Step 3: Print summary statistics
|
|
"""
|
|
|
|
import json
|
|
import re
|
|
import time
|
|
from pathlib import Path
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Site root; list-page and detail-page URLs are built on top of it.
BASE_URL = "https://www.pealim.com"

# Cookies sent with every request.
# NOTE(review): presumably these select the site's display preferences
# (transliteration / Hebrew spelling style) -- confirm against the site.
COOKIES = dict(translit="none", hebstyle="mo")

# Request headers; only the User-Agent is overridden.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (compatible; PealimScraper/1.0)",
}

# All cache files live in the "data" directory two levels above this file's
# own location (i.e. next to the directory that contains this script).
DATA_DIR = Path(__file__).resolve().parent.parent.joinpath("data")
SLUG_MAP_FILE = DATA_DIR.joinpath("noun_slug_map.json")
PROGRESS_FILE = DATA_DIR.joinpath("noun_slug_map_progress.json")
PLURALS_FILE = DATA_DIR.joinpath("noun_plurals.json")

# Pause between consecutive requests, in seconds.
DELAY = 1.5
|
|
|
|
|
|
def load_json(path, default=None):
    """Load a JSON document from *path*, or return a fallback if it is missing.

    Args:
        path: Location of the JSON file (``pathlib.Path`` or string).
        default: Value returned when the file does not exist. ``None`` (the
            default) means "return a new empty dict".

    Returns:
        The decoded JSON value, or the fallback when the file is absent.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    try:
        # The caches are written with ensure_ascii=False and contain Hebrew,
        # so decode explicitly as UTF-8 rather than with the platform default.
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except FileNotFoundError:
        # EAFP: avoids the exists()/open() race of checking first.
        return {} if default is None else default
|
|
|
|
|
|
def save_json(path, data):
    """Write *data* to *path* as indented, non-ASCII-escaped UTF-8 JSON.

    The document is first written to a sibling ``<name>.tmp`` file and then
    moved over the target, so an interruption in the middle of a save cannot
    leave a truncated cache file behind.

    Args:
        path: Destination file (``pathlib.Path`` or string). Missing parent
            directories are created.
        data: Any JSON-serializable value.
    """
    path = Path(path)
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = path.with_name(path.name + ".tmp")
    # ensure_ascii=False emits raw Hebrew characters, so the encoding must be
    # explicit; the platform default is not guaranteed to be UTF-8.
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    tmp_path.replace(path)
|
|
|
|
|
|
def fetch_with_retry(url, max_retries=5):
    """Fetch URL with exponential backoff.

    Each attempt is a GET with the module's COOKIES and HEADERS and a 30 s
    timeout. After a failed attempt the function waits 2, 4, 8, ... seconds
    (capped at 60) before trying again. No wait is performed after the final
    attempt, and HTTP 404/410 responses are treated as permanent and are not
    retried.

    Args:
        url: Absolute URL to fetch.
        max_retries: Maximum number of attempts.

    Returns:
        The ``requests.Response`` of the first successful attempt, or ``None``
        if every attempt failed or the page is permanently missing.
    """
    for attempt in range(max_retries):
        try:
            r = requests.get(url, cookies=COOKIES, headers=HEADERS, timeout=30)
            r.raise_for_status()
            return r
        except (requests.RequestException, ConnectionError) as e:
            # Only HTTP errors carry a response; other failures yield None here.
            status = getattr(getattr(e, "response", None), "status_code", None)
            if status in (404, 410):
                # The page does not exist; retrying cannot change that.
                print(f" FAILED with HTTP {status} (not retrying): {url}")
                return None
            if attempt + 1 >= max_retries:
                # Last attempt: report it, but do not sleep before giving up.
                print(f" Attempt {attempt + 1}/{max_retries} for {url} failed: {e}")
                break
            wait = min(2**attempt * 2, 60)
            print(f" Retry {attempt + 1}/{max_retries} for {url}: {e} (waiting {wait}s)")
            time.sleep(wait)
    print(f" FAILED after {max_retries} retries: {url}")
    return None
|
|
|
|
|
|
def get_total_pages():
    """Return the highest page number linked from the first noun list page.

    Returns:
        0 when the first list page cannot be fetched, 1 when it contains no
        pagination link with a ``page=N`` parameter, otherwise the largest N.
    """
    response = fetch_with_retry(f"{BASE_URL}/dict/?pos=noun&page=1")
    if not response:
        return 0

    pagination_links = BeautifulSoup(response.text, "lxml").select("ul.pagination li a")
    hits = (re.search(r"page=(\d+)", link.get("href", "")) for link in pagination_links)
    page_numbers = {int(hit.group(1)) for hit in hits if hit}
    return max(page_numbers, default=1)
|
|
|
|
|
|
def parse_list_page(html):
    """Extract the noun rows of one dictionary list page.

    Args:
        html: HTML text of a list page.

    Returns:
        A list with one dict per usable table row, holding the keys
        ``word_plain``, ``slug``, ``word_nikkud``, ``pos``, ``gender`` and
        ``mishkal``. The list is empty when the page has no
        ``table.dict-table``.
    """
    table = BeautifulSoup(html, "lxml").select_one("table.dict-table")
    if not table:
        return []

    nouns = []
    # The first row is the table header.
    for row in table.select("tr")[1:]:
        cells = row.select("td")
        if len(cells) < 3:
            continue

        # The first cell holds the word and the link to its detail page.
        word_cell = cells[0]
        link = word_cell.select_one("a")
        if not link:
            continue
        slug_hit = re.search(r"/dict/([^/]+)/", link.get("href", ""))
        if not slug_hit:
            continue

        pointed = word_cell.select_one("span.menukad")
        word_nikkud = pointed.get_text(strip=True) if pointed else ""

        # The third cell holds the part-of-speech description; gender and the
        # mishkal name are picked out of its lower-cased text.
        pos_text = cells[2].get_text(strip=True)
        pos_lower = pos_text.lower()
        if "masculine" in pos_lower:
            gender = "masculine"
        elif "feminine" in pos_lower:
            gender = "feminine"
        else:
            gender = ""
        pattern_hit = re.search(r"(\w+)\s*pattern", pos_lower)

        nouns.append(
            {
                # Same word with code points U+0591..U+05C7 (the Hebrew
                # accents and points) removed.
                "word_plain": re.sub(r"[\u0591-\u05C7]", "", word_nikkud),
                "slug": slug_hit.group(1),
                "word_nikkud": word_nikkud,
                "pos": pos_text,
                "gender": gender,
                "mishkal": pattern_hit.group(1) if pattern_hit else "",
            }
        )

    return nouns
|
|
|
|
|
|
def step1_collect_slugs():
    """Step 1: Collect noun slugs from list pages.

    Resumes from the cached slug map and the cached list of completed page
    numbers, fetches every remaining list page and records its nouns. Progress
    is written to disk after every 10th processed page and again at the end.

    A page is only marked as completed when at least one noun was parsed from
    it, so pages that failed to fetch or parse are retried on the next run.
    If the page count cannot be determined, the cached slug map is returned
    unchanged instead of reporting that everything is already scraped.

    Returns:
        dict mapping each noun's un-pointed spelling to a dict with the keys
        ``slug``, ``word_nikkud``, ``pos``, ``gender`` and ``mishkal``.
    """
    print("=" * 60)
    print("STEP 1: Collecting noun slugs from list pages")
    print("=" * 60)

    slug_map = load_json(SLUG_MAP_FILE, {})
    progress = load_json(PROGRESS_FILE, [])
    completed_pages = set(progress) if isinstance(progress, list) else set()

    # Get total pages
    total_pages = get_total_pages()
    if total_pages < 1:
        # get_total_pages() returns 0 when the first list page is unreachable.
        print("Could not determine the number of list pages; keeping cached slug map.")
        return slug_map

    print(f"Total pages: {total_pages}")
    print(f"Already completed: {len(completed_pages)} pages, {len(slug_map)} nouns")

    remaining = [p for p in range(1, total_pages + 1) if p not in completed_pages]
    print(f"Remaining pages: {len(remaining)}")

    if not remaining:
        print("All pages already scraped!")
        return slug_map

    for i, page_num in enumerate(remaining):
        url = f"{BASE_URL}/dict/?pos=noun&page={page_num}"
        r = fetch_with_retry(url)
        if not r:
            print(f" Skipping page {page_num}")
            continue

        entries = parse_list_page(r.text)
        if not entries:
            # Nothing parsed: leave the page out of completed_pages so the
            # next run fetches it again.
            print(f" Page {page_num}: no nouns parsed, will retry on next run")
            time.sleep(DELAY)
            continue

        # NOTE(review): entries are keyed by the un-pointed spelling, so two
        # nouns sharing that spelling overwrite each other (last one wins).
        for entry in entries:
            slug_map[entry["word_plain"]] = {
                "slug": entry["slug"],
                "word_nikkud": entry["word_nikkud"],
                "pos": entry["pos"],
                "gender": entry["gender"],
                "mishkal": entry["mishkal"],
            }

        completed_pages.add(page_num)
        done = len(completed_pages)
        print(f" Page {page_num} ({done}/{total_pages}): {len(entries)} nouns (total: {len(slug_map)})")

        # Save progress every 10 pages
        if (i + 1) % 10 == 0 or page_num == remaining[-1]:
            save_json(SLUG_MAP_FILE, slug_map)
            save_json(PROGRESS_FILE, sorted(completed_pages))
            print(f" [Saved progress: {len(slug_map)} nouns, {done} pages]")

        time.sleep(DELAY)

    # Final save
    save_json(SLUG_MAP_FILE, slug_map)
    save_json(PROGRESS_FILE, sorted(completed_pages))
    print(f"\nStep 1 complete: {len(slug_map)} total nouns from {len(completed_pages)} pages")
    return slug_map
|
|
|
|
|
|
def parse_detail_page(html, slug, gender, mishkal):
    """Extract singular/plural and construct forms from a noun detail page.

    Only the first ``table.conjugation-table`` on the page is read. In a row
    whose header contains "absolute", the first cell supplies the singular
    form and audio and the second the plural form and audio; in a row whose
    header contains "construct", the first two cells supply the construct
    singular and construct plural.

    Args:
        html: HTML text of the detail page.
        slug: Page slug, copied into the result.
        gender: Gender string, copied into the result.
        mishkal: Mishkal name, copied into the result.

    Returns:
        A dict with the keys ``slug``, ``singular``, ``singular_audio``,
        ``plural``, ``plural_audio``, ``construct_singular``,
        ``construct_plural``, ``gender`` and ``mishkal`` (missing values are
        empty strings), or ``None`` when the page has no such table.
    """

    def pointed_text(cell):
        # Text of the cell's span.menukad, or "" when there is none.
        span = cell.select_one("span.menukad")
        return span.get_text(strip=True) if span else ""

    def audio_of(cell):
        # Prefer a descendant carrying data-audio; fall back to the cell itself.
        holder = cell.select_one("[data-audio]")
        return holder.get("data-audio", "") if holder else cell.get("data-audio", "")

    tables = BeautifulSoup(html, "lxml").select("table.conjugation-table")
    if not tables:
        return None

    result = {
        "slug": slug,
        "singular": "",
        "singular_audio": "",
        "plural": "",
        "plural_audio": "",
        "construct_singular": "",
        "construct_plural": "",
        "gender": gender,
        "mishkal": mishkal,
    }

    absolute_keys = (("singular", "singular_audio"), ("plural", "plural_audio"))
    construct_keys = ("construct_singular", "construct_plural")

    for row in tables[0].select("tr"):
        header = row.select_one("th")
        if not header:
            continue
        label = header.get_text(strip=True).lower()
        cells = row.select("td")

        # zip() stops at the shorter sequence, so at most the first two cells
        # are used and rows with fewer cells fill fewer keys.
        if "absolute" in label:
            for cell, (form_key, audio_key) in zip(cells, absolute_keys):
                result[form_key] = pointed_text(cell)
                result[audio_key] = audio_of(cell)
        elif "construct" in label:
            for cell, form_key in zip(cells, construct_keys):
                result[form_key] = pointed_text(cell)

    return result
|
|
|
|
|
|
def step2_fetch_plurals(slug_map):
    """Step 2: Fetch detail pages for plural + construct forms.

    Nouns already present in the cached plurals file are skipped. For every
    other noun the detail page is fetched and parsed; pages without a
    declension table get a minimal entry flagged ``no_declension_table``.
    The cache is written after every 50th noun and again at the end.

    Args:
        slug_map: dict mapping a word to its info dict (needs ``slug``; may
            carry ``word_nikkud``, ``gender`` and ``mishkal``).

    Returns:
        dict mapping each word to its plural/construct entry.
    """
    print("\n" + "=" * 60)
    print("STEP 2: Fetching plural + construct forms from detail pages")
    print("=" * 60)

    plurals = load_json(PLURALS_FILE, {})
    already_done = set(plurals.keys())

    # Work list: nouns not yet in plurals
    work = [(word, info) for word, info in slug_map.items() if word not in already_done]

    print(f"Already have plural data: {len(already_done)}")
    print(f"Remaining to fetch: {len(work)}")

    if not work:
        print("All nouns already have plural data!")
        return plurals

    pending = len(work)
    total = len(already_done) + pending
    skipped = 0
    for position, (word, info) in enumerate(work, start=1):
        slug = info["slug"]
        r = fetch_with_retry(f"{BASE_URL}/dict/{slug}/")
        if not r:
            print(f" Skipping {word} ({slug})")
            skipped += 1
            continue

        noun_gender = info.get("gender", "")
        noun_mishkal = info.get("mishkal", "")
        entry = parse_detail_page(r.text, slug, noun_gender, noun_mishkal)
        if entry:
            plurals[word] = entry
        else:
            # No declension table - store minimal entry
            plurals[word] = {
                "slug": slug,
                "singular": info.get("word_nikkud", ""),
                "singular_audio": "",
                "plural": "",
                "plural_audio": "",
                "construct_singular": "",
                "construct_plural": "",
                "gender": noun_gender,
                "mishkal": noun_mishkal,
                "no_declension_table": True,
            }

        # Report on the first noun and on every 50th one.
        if position % 50 == 0 or position == 1:
            done = len(already_done) + position - skipped
            shown_plural = entry["plural"] if entry else "N/A"
            print(f" [{position}/{pending}] {word} ({slug}): plural={shown_plural} (total: {done}/{total})")

        # Save every 50 entries
        if position % 50 == 0 or position == pending:
            save_json(PLURALS_FILE, plurals)
            print(f" [Saved: {len(plurals)} entries]")

        time.sleep(DELAY)

    save_json(PLURALS_FILE, plurals)
    print(f"\nStep 2 complete: {len(plurals)} total noun entries with plural data")
    return plurals
|
|
|
|
|
|
def step3_summary(slug_map, plurals):
    """Step 3: Print summary statistics.

    Args:
        slug_map: dict of collected nouns; only its size is reported.
        plurals: dict mapping a word to its plural/construct entry.

    Prints the number of nouns, how many entries have a plural form, any
    construct form, any audio URL or no declension table, and how many
    plurals count as irregular.
    """

    def is_irregular(record):
        # Irregular plurals: masculine with ות- ending, feminine with ים- ending
        plural = record.get("plural", "")
        gender = record.get("gender", "")
        if not plural or not gender:
            return False
        # Compare endings on the un-pointed spelling.
        bare = re.sub(r"[\u0591-\u05C7]", "", plural)
        if gender == "masculine":
            return bare.endswith("ות")
        if gender == "feminine":
            return bare.endswith("ים")
        return False

    banner = "=" * 60
    print("\n" + banner)
    print("SUMMARY")
    print(banner)

    records = list(plurals.values())
    has_plural = len([rec for rec in records if rec.get("plural")])
    has_construct = len(
        [rec for rec in records if rec.get("construct_singular") or rec.get("construct_plural")]
    )
    has_audio = len([rec for rec in records if rec.get("singular_audio") or rec.get("plural_audio")])
    no_table = len([rec for rec in records if rec.get("no_declension_table")])
    irregular = sum(1 for rec in records if is_irregular(rec))

    print(f"Total nouns in slug map: {len(slug_map)}")
    print(f"Total nouns with plural data: {len(plurals)}")
    print(f" - With plural form: {has_plural}")
    print(f" - With construct forms: {has_construct}")
    print(f" - With audio URLs: {has_audio}")
    print(f" - No declension table: {no_table}")
    print(f" - Irregular plurals: {irregular}")
|
|
|
|
|
|
def main():
    """Run the scraper: collect slugs, fetch plural forms, print the summary."""
    for line in ("Pealim Noun Plural Scraper", f"Data directory: {DATA_DIR}", ""):
        print(line)

    nouns = step1_collect_slugs()
    noun_plurals = step2_fetch_plurals(nouns)
    step3_summary(nouns, noun_plurals)


if __name__ == "__main__":
    main()
|