hebrew_flash_cards/scripts/scrape_noun_plurals.py
Sochen 17f7458d19 Sprint 9: cloze cards, plurals deck, project reorg, lint tooling
- Cloze card pipeline: 924 cards from 2,296 AI-vetted Hebrew book sentences
- Plurals deck: 375 notes (144 irregular + 231 regular from 86 mishkal patterns)
- Ktiv male forms expanded to 20,711 entries for sentence matching
- Project reorg: helpers.py (deduped strip_nikkud from 10 files), scripts/ for
  one-off tools, tests/ with smoke tests, deleted 3 dead files
- Lint tooling: pyproject.toml with ruff/vulture/bandit/pytest config, .editorconfig,
  fixed all 129 ruff errors (B023 closure fix, SIM103, unused vars)
- validate_apkg.py: card count range check for optional cloze template
- Data caches committed: vetted_sentences, ktiv_male_forms, noun_plurals,
  noun_slug_map, vocab_sentence_matches, epub_sentence_index

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-07 08:09:39 +00:00

365 lines
12 KiB
Python

#!/usr/bin/env python3
"""
Scrape pealim.com for noun plural and construct forms.
Step 1: Collect noun slugs from list pages (/dict/?pos=noun&page=N)
Step 2: Fetch detail pages for plural + construct forms
Step 3: Print summary statistics
"""
import json
import re
import time
from pathlib import Path
import requests
from bs4 import BeautifulSoup
# Root of the site being scraped; list and detail URLs are built from it.
BASE_URL = "https://www.pealim.com"
# Cookies sent with every request.
# NOTE(review): presumably "translit": "none" turns transliteration off and
# "hebstyle": "mo" selects the vocalized spelling -- confirm against the site.
COOKIES = {"translit": "none", "hebstyle": "mo"}
# Request headers; identifies the scraper through its User-Agent string.
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; PealimScraper/1.0)"}
# "data" directory located next to the directory that contains this script.
DATA_DIR = Path(__file__).resolve().parent.parent / "data"
# Step 1 output: unvocalized word -> slug, vocalized form, POS, gender, mishkal.
SLUG_MAP_FILE = DATA_DIR / "noun_slug_map.json"
# Step 1 resume state: sorted list of list-page numbers already scraped.
PROGRESS_FILE = DATA_DIR / "noun_slug_map_progress.json"
# Step 2 output: unvocalized word -> plural / construct forms and audio URLs.
PLURALS_FILE = DATA_DIR / "noun_plurals.json"
DELAY = 1.5 # seconds between requests
def load_json(path, default=None):
    """Load JSON from *path*, falling back to a default when the file is absent.

    Args:
        path: ``pathlib.Path`` of the JSON file to read.
        default: Value returned when the file does not exist. ``None`` (the
            default) means "return a new empty dict".

    Returns:
        The decoded JSON document, or the fallback value.
    """
    if path.exists():
        # The cache files contain raw Hebrew (they are written with
        # ensure_ascii=False), so decode as UTF-8 explicitly instead of
        # relying on the platform's locale encoding.
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    return default if default is not None else {}
def save_json(path, data):
    """Write *data* to *path* as indented, human-readable UTF-8 JSON.

    The document is first written to a sibling ``<name>.tmp`` file and then
    moved over the destination, so an interrupted run never leaves a
    half-written cache behind.

    Args:
        path: Destination file (``pathlib.Path`` or string path).
        data: JSON-serializable object.
    """
    path = Path(path)
    tmp_path = path.with_name(path.name + ".tmp")
    # ensure_ascii=False emits raw Hebrew, so the encoding must be explicit;
    # the locale default (e.g. cp1252) cannot encode it.
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    # Replace the old file only after the new content is completely written.
    tmp_path.replace(path)
def fetch_with_retry(url, max_retries=5):
    """Fetch *url*, backing off exponentially between failed attempts.

    Args:
        url: Absolute URL to GET (sent with the module's COOKIES and HEADERS).
        max_retries: Maximum number of attempts before giving up.

    Returns:
        The ``requests.Response`` of the first attempt that succeeds with a
        non-error status, or ``None`` when every attempt failed.
    """
    for attempt in range(max_retries):
        try:
            r = requests.get(url, cookies=COOKIES, headers=HEADERS, timeout=30)
            r.raise_for_status()
            return r
        except (requests.RequestException, ConnectionError) as e:
            if attempt + 1 >= max_retries:
                # Nothing is retried after the last attempt, so report the
                # error and give up right away instead of sleeping first.
                print(f" Attempt {attempt + 1}/{max_retries} for {url}: {e}")
                break
            # Backoff of 2s, 4s, 8s, ... capped at one minute.
            wait = min(2**attempt * 2, 60)
            print(f" Retry {attempt + 1}/{max_retries} for {url}: {e} (waiting {wait}s)")
            time.sleep(wait)
    print(f" FAILED after {max_retries} retries: {url}")
    return None
def get_total_pages():
    """Return how many noun list pages the dictionary has.

    The first list page is fetched and every pagination link is inspected
    for a ``page=N`` parameter.

    Returns:
        0 when the first page could not be fetched, 1 when no numbered
        pagination link is present, otherwise the highest page number seen.
    """
    response = fetch_with_retry(f"{BASE_URL}/dict/?pos=noun&page=1")
    if not response:
        return 0
    links = BeautifulSoup(response.text, "lxml").select("ul.pagination li a")
    hits = (re.search(r"page=(\d+)", link.get("href", "")) for link in links)
    return max((int(hit.group(1)) for hit in hits if hit), default=1)
def parse_list_page(html):
    """Turn the HTML of one noun list page into a list of noun records.

    Args:
        html: Markup of a ``/dict/?pos=noun&page=N`` page.

    Returns:
        One dict per usable table row with the keys ``word_plain``, ``slug``,
        ``word_nikkud``, ``pos``, ``gender`` and ``mishkal``. The list is
        empty when the page has no ``table.dict-table``.
    """
    table = BeautifulSoup(html, "lxml").select_one("table.dict-table")
    if not table:
        return []

    nouns = []
    # The first <tr> is the header row.
    for row in table.select("tr")[1:]:
        cells = row.select("td")
        if len(cells) < 3:
            continue

        # Column 1 carries the vocalized word and the link to its detail page.
        word_cell = cells[0]
        link = word_cell.select_one("a")
        if not link:
            continue
        slug_hit = re.search(r"/dict/([^/]+)/", link.get("href", ""))
        if not slug_hit:
            continue

        vocalized = word_cell.select_one("span.menukad")
        word_nikkud = vocalized.get_text(strip=True) if vocalized else ""

        # Column 3 is the part-of-speech text; gender and the mishkal name
        # are both read out of it.
        pos_text = cells[2].get_text(strip=True)
        pos_lower = pos_text.lower()
        gender = next((g for g in ("masculine", "feminine") if g in pos_lower), "")
        pattern_hit = re.search(r"(\w+)\s*pattern", pos_lower)

        nouns.append(
            {
                # Unvocalized spelling: drop the combining marks.
                "word_plain": re.sub(r"[\u0591-\u05C7]", "", word_nikkud),
                "slug": slug_hit.group(1),
                "word_nikkud": word_nikkud,
                "pos": pos_text,
                "gender": gender,
                "mishkal": pattern_hit.group(1) if pattern_hit else "",
            }
        )
    return nouns
def step1_collect_slugs():
    """Step 1: Collect noun slugs from list pages.

    Resumable: the slug map and the set of already scraped page numbers are
    loaded from SLUG_MAP_FILE / PROGRESS_FILE, only the missing pages are
    fetched, and both files are rewritten every 10 pages and once more at
    the end.

    Returns:
        Dict mapping each unvocalized word to its slug, vocalized form,
        part-of-speech text, gender and mishkal.
    """
    print("=" * 60)
    print("STEP 1: Collecting noun slugs from list pages")
    print("=" * 60)
    slug_map = load_json(SLUG_MAP_FILE, {})
    progress = load_json(PROGRESS_FILE, [])
    completed_pages = set(progress) if isinstance(progress, list) else set()

    # Get total pages
    total_pages = get_total_pages()
    if total_pages == 0:
        # get_total_pages() returns 0 only when the first list page could not
        # be fetched; without this guard the run would claim to be complete.
        print("Could not determine the number of list pages; keeping cached slug map")
        return slug_map
    print(f"Total pages: {total_pages}")
    print(f"Already completed: {len(completed_pages)} pages, {len(slug_map)} nouns")
    remaining = [p for p in range(1, total_pages + 1) if p not in completed_pages]
    print(f"Remaining pages: {len(remaining)}")
    if not remaining:
        print("All pages already scraped!")
        return slug_map

    for i, page_num in enumerate(remaining):
        url = f"{BASE_URL}/dict/?pos=noun&page={page_num}"
        r = fetch_with_retry(url)
        if not r:
            print(f" Skipping page {page_num}")
            continue
        entries = parse_list_page(r.text)
        if entries:
            for entry in entries:
                word = entry["word_plain"]
                slug_map[word] = {
                    "slug": entry["slug"],
                    "word_nikkud": entry["word_nikkud"],
                    "pos": entry["pos"],
                    "gender": entry["gender"],
                    "mishkal": entry["mishkal"],
                }
            completed_pages.add(page_num)
            done = len(completed_pages)
            print(f" Page {page_num} ({done}/{total_pages}): {len(entries)} nouns (total: {len(slug_map)})")
        else:
            # A page without rows is not recorded as completed, so the next
            # run fetches it again instead of silently losing its nouns.
            print(f" Page {page_num}: no nouns parsed, will retry on the next run")
        # Save progress every 10 pages
        if (i + 1) % 10 == 0 or page_num == remaining[-1]:
            save_json(SLUG_MAP_FILE, slug_map)
            save_json(PROGRESS_FILE, sorted(completed_pages))
            print(f" [Saved progress: {len(slug_map)} nouns, {len(completed_pages)} pages]")
        time.sleep(DELAY)

    # Final save
    save_json(SLUG_MAP_FILE, slug_map)
    save_json(PROGRESS_FILE, sorted(completed_pages))
    print(f"\nStep 1 complete: {len(slug_map)} total nouns from {len(completed_pages)} pages")
    return slug_map
def parse_detail_page(html, slug, gender, mishkal):
    """Read singular/plural and construct forms from a noun detail page.

    Only the first ``table.conjugation-table`` on the page is examined. In a
    row whose header mentions "absolute", the first two cells give the
    singular and plural (text plus audio URL); in a row whose header
    mentions "construct", they give the two construct forms.

    Args:
        html: Markup of the ``/dict/<slug>/`` page.
        slug: Slug of the noun, copied into the result.
        gender: Gender from the list page, copied into the result.
        mishkal: Mishkal name from the list page, copied into the result.

    Returns:
        A dict of forms (empty strings for anything not found), or ``None``
        when the page has no conjugation table.
    """
    tables = BeautifulSoup(html, "lxml").select("table.conjugation-table")
    if not tables:
        return None

    def vocalized_text(cell):
        """Text of the cell's ``span.menukad``, or "" when it has none."""
        span = cell.select_one("span.menukad")
        return span.get_text(strip=True) if span else ""

    def audio_url(cell):
        """``data-audio`` of a descendant if any, otherwise of the cell itself."""
        holder = cell.select_one("[data-audio]")
        if holder:
            return holder.get("data-audio", "")
        return cell.get("data-audio", "")

    result = {
        "slug": slug,
        "singular": "",
        "singular_audio": "",
        "plural": "",
        "plural_audio": "",
        "construct_singular": "",
        "construct_plural": "",
        "gender": gender,
        "mishkal": mishkal,
    }
    absolute_keys = (("singular", "singular_audio"), ("plural", "plural_audio"))
    construct_keys = ("construct_singular", "construct_plural")

    for row in tables[0].select("tr"):
        header = row.select_one("th")
        if not header:
            continue
        label = header.get_text(strip=True).lower()
        cells = row.select("td")
        # zip() stops at the shorter side, so a row with fewer than two
        # cells fills only the fields it actually has.
        if "absolute" in label:
            for cell, (form_key, audio_key) in zip(cells, absolute_keys):
                result[form_key] = vocalized_text(cell)
                result[audio_key] = audio_url(cell)
        elif "construct" in label:
            for cell, form_key in zip(cells, construct_keys):
                result[form_key] = vocalized_text(cell)
    return result
def step2_fetch_plurals(slug_map):
    """Step 2: Fetch detail pages for plural + construct forms.

    Resumable: words already present in PLURALS_FILE are not fetched again.
    Nouns whose page has no declension table get a minimal entry flagged
    with ``no_declension_table``. The file is rewritten every 50 nouns and
    once more at the end.

    Args:
        slug_map: Mapping of unvocalized word to its list-page info (needs
            at least a ``slug`` key).

    Returns:
        Dict mapping each unvocalized word to its scraped forms.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("STEP 2: Fetching plural + construct forms from detail pages")
    print(banner)

    plurals = load_json(PLURALS_FILE, {})
    already_done = set(plurals.keys())

    # Work list: every noun that has no entry in the plurals cache yet.
    pending = [(word, info) for word, info in slug_map.items() if word not in already_done]
    print(f"Already have plural data: {len(already_done)}")
    print(f"Remaining to fetch: {len(pending)}")
    if not pending:
        print("All nouns already have plural data!")
        return plurals

    grand_total = len(already_done) + len(pending)
    failures = 0
    for position, (word, info) in enumerate(pending, start=1):
        slug = info["slug"]
        response = fetch_with_retry(f"{BASE_URL}/dict/{slug}/")
        if not response:
            print(f" Skipping {word} ({slug})")
            failures += 1
            continue

        noun_gender = info.get("gender", "")
        noun_mishkal = info.get("mishkal", "")
        entry = parse_detail_page(response.text, slug, noun_gender, noun_mishkal)
        if entry:
            plurals[word] = entry
        else:
            # No declension table - store minimal entry
            plurals[word] = {
                "slug": slug,
                "singular": info.get("word_nikkud", ""),
                "singular_audio": "",
                "plural": "",
                "plural_audio": "",
                "construct_singular": "",
                "construct_plural": "",
                "gender": noun_gender,
                "mishkal": noun_mishkal,
                "no_declension_table": True,
            }

        at_checkpoint = position % 50 == 0
        # Log the first noun and then every 50th one.
        if at_checkpoint or position == 1:
            shown_plural = entry["plural"] if entry else "N/A"
            done = len(already_done) + position - failures
            print(
                f" [{position}/{len(pending)}] {word} ({slug}): "
                f"plural={shown_plural} "
                f"(total: {done}/{grand_total})"
            )
        # Save every 50 entries
        if at_checkpoint or position == len(pending):
            save_json(PLURALS_FILE, plurals)
            print(f" [Saved: {len(plurals)} entries]")
        time.sleep(DELAY)

    save_json(PLURALS_FILE, plurals)
    print(f"\nStep 2 complete: {len(plurals)} total noun entries with plural data")
    return plurals
def step3_summary(slug_map, plurals):
    """Step 3: Print summary statistics.

    Reports how many entries have a plural, construct forms, audio URLs or
    no declension table, and how many plurals look irregular (a masculine
    noun whose plural ends in ות, or a feminine noun whose plural ends in ים).

    Args:
        slug_map: Step 1 result; only its size is reported.
        plurals: Step 2 result, mapping word to its dict of forms.
    """
    banner = "=" * 60
    print("\n" + banner)
    print("SUMMARY")
    print(banner)

    total_slugs = len(slug_map)
    total_plurals = len(plurals)
    records = list(plurals.values())

    def tally(*fields):
        """Count the records in which at least one of *fields* is truthy."""
        return sum(1 for record in records if any(record.get(name) for name in fields))

    has_plural = tally("plural")
    has_construct = tally("construct_singular", "construct_plural")
    has_audio = tally("singular_audio", "plural_audio")
    no_table = tally("no_declension_table")

    irregular = 0
    for record in records:
        plural = record.get("plural", "")
        gender = record.get("gender", "")
        if not (plural and gender):
            continue
        # Compare endings on the unvocalized spelling.
        bare_plural = re.sub(r"[\u0591-\u05C7]", "", plural)
        if gender == "masculine":
            unexpected_ending = "ות"
        elif gender == "feminine":
            unexpected_ending = "ים"
        else:
            continue
        if bare_plural.endswith(unexpected_ending):
            irregular += 1

    print(f"Total nouns in slug map: {total_slugs}")
    print(f"Total nouns with plural data: {total_plurals}")
    print(f" - With plural form: {has_plural}")
    print(f" - With construct forms: {has_construct}")
    print(f" - With audio URLs: {has_audio}")
    print(f" - No declension table: {no_table}")
    print(f" - Irregular plurals: {irregular}")
def main():
    """Run the scraper end to end: collect slugs, fetch plurals, summarize."""
    print("Pealim Noun Plural Scraper")
    print(f"Data directory: {DATA_DIR}")
    print()
    nouns = step1_collect_slugs()
    noun_forms = step2_fetch_plurals(nouns)
    step3_summary(nouns, noun_forms)


# Only scrape when executed as a script, not when imported.
if __name__ == "__main__":
    main()