#!/usr/bin/env python3
"""
Scrape pealim.com for noun plural and construct forms.

Step 1: Collect noun slugs from list pages (/dict/?pos=noun&page=N)
Step 2: Fetch detail pages for plural + construct forms
Step 3: Print summary statistics

Both scraping steps are resumable: intermediate results are written to JSON
files under DATA_DIR and reloaded on the next run.
"""

import json
import re
import time
from pathlib import Path

import requests
from bs4 import BeautifulSoup

BASE_URL = "https://www.pealim.com"
# NOTE(review): presumably these cookies turn off transliteration and select
# the vocalized spelling style -- confirm against the site's preferences page.
COOKIES = {"translit": "none", "hebstyle": "mo"}
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; PealimScraper/1.0)"}

DATA_DIR = Path(__file__).resolve().parent.parent / "data"
SLUG_MAP_FILE = DATA_DIR / "noun_slug_map.json"
PROGRESS_FILE = DATA_DIR / "noun_slug_map_progress.json"
PLURALS_FILE = DATA_DIR / "noun_plurals.json"

DELAY = 1.5  # seconds between requests

# Patterns are compiled once because they are applied inside per-row loops.
_NIKKUD_RE = re.compile(r"[\u0591-\u05C7]")
_PAGE_RE = re.compile(r"page=(\d+)")
_SLUG_RE = re.compile(r"/dict/([^/]+)/")
_MISHKAL_RE = re.compile(r"(\w+)\s*pattern")

# HTTP client errors that are worth retrying (timeout / rate limiting).
_RETRYABLE_CLIENT_ERRORS = (408, 429)


def _strip_nikkud(text):
    """Return *text* with every code point in U+0591..U+05C7 removed."""
    return _NIKKUD_RE.sub("", text)


def load_json(path, default=None):
    """Load JSON from *path*.

    Returns the parsed content if the file exists; otherwise *default*, or an
    empty dict when *default* is None.
    """
    if path.exists():
        # Explicit UTF-8: the files hold raw Hebrew text (ensure_ascii=False),
        # which a non-UTF-8 locale default would fail to decode.
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    return default if default is not None else {}


def save_json(path, data):
    """Write *data* to *path* as pretty-printed UTF-8 JSON.

    The parent directory is created if missing. The data is written to a
    sibling temp file first and then moved into place, so an interrupted run
    cannot leave a half-written progress file behind.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = path.with_name(path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    tmp_path.replace(path)


def fetch_with_retry(url, max_retries=5):
    """Fetch URL with exponential backoff.

    Returns the successful ``requests.Response``, or None if every attempt
    failed or the server answered with a non-retryable 4xx status.
    """
    for attempt in range(max_retries):
        try:
            r = requests.get(url, cookies=COOKIES, headers=HEADERS, timeout=30)
            r.raise_for_status()
            return r
        except (requests.RequestException, ConnectionError) as e:
            # Permanent client errors (e.g. 404) will not get better by
            # waiting, so give up immediately instead of backing off.
            response = getattr(e, "response", None)
            status = getattr(response, "status_code", None)
            if (
                status is not None
                and 400 <= status < 500
                and status not in _RETRYABLE_CLIENT_ERRORS
            ):
                print(f" FAILED with HTTP {status} (not retrying): {url}")
                return None

            if attempt == max_retries - 1:
                # Last attempt: report, but do not sleep before giving up.
                print(f" Attempt {attempt + 1}/{max_retries} for {url}: {e}")
                break

            wait = min(2**attempt * 2, 60)
            print(f" Retry {attempt + 1}/{max_retries} for {url}: {e} (waiting {wait}s)")
            time.sleep(wait)

    print(f" FAILED after {max_retries} retries: {url}")
    return None


def get_total_pages():
    """Get total number of noun list pages.

    Returns 0 if the first list page cannot be fetched, 1 if it contains no
    pagination links, otherwise the highest page number linked from it.
    """
    r = fetch_with_retry(f"{BASE_URL}/dict/?pos=noun&page=1")
    if not r:
        return 0

    soup = BeautifulSoup(r.text, "lxml")
    pages = set()
    for a in soup.select("ul.pagination li a"):
        m = _PAGE_RE.search(a.get("href", ""))
        if m:
            pages.add(int(m.group(1)))
    return max(pages) if pages else 1


def parse_list_page(html):
    """Parse a noun list page and return list of noun entries.

    Each entry is a dict with the keys ``word_plain``, ``slug``,
    ``word_nikkud``, ``pos``, ``gender`` and ``mishkal``. Rows without at
    least three cells or without a ``/dict/<slug>/`` link are skipped.
    """
    soup = BeautifulSoup(html, "lxml")
    table = soup.select_one("table.dict-table")
    if not table:
        return []

    entries = []
    for row in table.select("tr")[1:]:  # skip header
        tds = row.select("td")
        if len(tds) < 3:
            continue

        # First td: word + link
        first_td = tds[0]
        a = first_td.select_one("a")
        if not a:
            continue
        slug_match = _SLUG_RE.search(a.get("href", ""))
        if not slug_match:
            continue
        slug = slug_match.group(1)

        menukad = first_td.select_one("span.menukad")
        word_nikkud = menukad.get_text(strip=True) if menukad else ""
        # Word without nikkud (used as the dictionary key by the callers)
        word_plain = _strip_nikkud(word_nikkud)

        # Third td: part of speech
        pos_text = tds[2].get_text(strip=True)
        pos_lower = pos_text.lower()

        # Gender
        gender = ""
        if "masculine" in pos_lower:
            gender = "masculine"
        elif "feminine" in pos_lower:
            gender = "feminine"

        # Mishkal pattern: the word immediately preceding "pattern"
        mishkal_match = _MISHKAL_RE.search(pos_lower)
        mishkal = mishkal_match.group(1) if mishkal_match else ""

        entries.append(
            {
                "word_plain": word_plain,
                "slug": slug,
                "word_nikkud": word_nikkud,
                "pos": pos_text,
                "gender": gender,
                "mishkal": mishkal,
            }
        )
    return entries


def step1_collect_slugs():
    """Step 1: Collect noun slugs from list pages.

    Resumes from SLUG_MAP_FILE / PROGRESS_FILE, fetches every list page that
    is not yet marked completed, and returns the slug map
    (``word_plain -> {slug, word_nikkud, pos, gender, mishkal}``).
    Pages whose fetch fails are left unmarked so a later run retries them.
    """
    print("=" * 60)
    print("STEP 1: Collecting noun slugs from list pages")
    print("=" * 60)

    slug_map = load_json(SLUG_MAP_FILE, {})
    progress = load_json(PROGRESS_FILE, [])
    completed_pages = set(progress) if isinstance(progress, list) else set()

    # Get total pages
    total_pages = get_total_pages()
    if total_pages == 0:
        # The first list page could not be fetched; do not pretend that
        # everything has already been scraped.
        print("Could not determine the number of list pages; using saved data only.")
        return slug_map

    print(f"Total pages: {total_pages}")
    print(f"Already completed: {len(completed_pages)} pages, {len(slug_map)} nouns")

    remaining = [p for p in range(1, total_pages + 1) if p not in completed_pages]
    print(f"Remaining pages: {len(remaining)}")
    if not remaining:
        print("All pages already scraped!")
        return slug_map

    try:
        for i, page_num in enumerate(remaining):
            url = f"{BASE_URL}/dict/?pos=noun&page={page_num}"
            r = fetch_with_retry(url)
            if not r:
                print(f" Skipping page {page_num}")
                continue

            entries = parse_list_page(r.text)
            for entry in entries:
                word = entry["word_plain"]
                if not word:
                    # No vocalized form was found; an empty key is useless.
                    continue
                # NOTE: keyed by unvocalized spelling, so a later homograph
                # overwrites an earlier one.
                slug_map[word] = {
                    "slug": entry["slug"],
                    "word_nikkud": entry["word_nikkud"],
                    "pos": entry["pos"],
                    "gender": entry["gender"],
                    "mishkal": entry["mishkal"],
                }

            completed_pages.add(page_num)
            done = len(completed_pages)
            print(f" Page {page_num} ({done}/{total_pages}): {len(entries)} nouns (total: {len(slug_map)})")

            # Save progress every 10 pages
            if (i + 1) % 10 == 0:
                save_json(SLUG_MAP_FILE, slug_map)
                save_json(PROGRESS_FILE, sorted(completed_pages))
                print(f" [Saved progress: {len(slug_map)} nouns, {done} pages]")

            # Be polite between requests, but do not wait after the last one.
            if i < len(remaining) - 1:
                time.sleep(DELAY)
    finally:
        # Final save; also runs on Ctrl-C or an unexpected error so that the
        # pages scraped since the last periodic save are not lost.
        save_json(SLUG_MAP_FILE, slug_map)
        save_json(PROGRESS_FILE, sorted(completed_pages))

    print(f"\nStep 1 complete: {len(slug_map)} total nouns from {len(completed_pages)} pages")
    return slug_map


def _cell_text(td):
    """Return the stripped text of the cell's ``span.menukad``, or ""."""
    span = td.select_one("span.menukad")
    return span.get_text(strip=True) if span else ""


def _cell_audio(td):
    """Return ``data-audio`` from a descendant of *td*, else from *td* itself, else ""."""
    audio_el = td.select_one("[data-audio]")
    if audio_el:
        return audio_el.get("data-audio", "")
    return td.get("data-audio", "")


def parse_detail_page(html, slug, gender, mishkal):
    """Parse a noun detail page for plural/construct forms.

    Reads the first ``table.conjugation-table``. In a row whose header
    contains "absolute", the first cell is taken as the singular and the
    second as the plural (with audio); a row whose header contains
    "construct" supplies the construct forms the same way.

    Returns a dict of forms, or None when the page has no such table.
    """
    soup = BeautifulSoup(html, "lxml")
    tables = soup.select("table.conjugation-table")
    if not tables:
        return None

    result = {
        "slug": slug,
        "singular": "",
        "singular_audio": "",
        "plural": "",
        "plural_audio": "",
        "construct_singular": "",
        "construct_plural": "",
        "gender": gender,
        "mishkal": mishkal,
    }

    for row in tables[0].select("tr"):
        th = row.select_one("th")
        if not th:
            continue
        label = th.get_text(strip=True).lower()
        tds = row.select("td")

        if "absolute" in label:
            if len(tds) >= 1:
                result["singular"] = _cell_text(tds[0])
                result["singular_audio"] = _cell_audio(tds[0])
            if len(tds) >= 2:
                result["plural"] = _cell_text(tds[1])
                result["plural_audio"] = _cell_audio(tds[1])
        elif "construct" in label:
            if len(tds) >= 1:
                result["construct_singular"] = _cell_text(tds[0])
            if len(tds) >= 2:
                result["construct_plural"] = _cell_text(tds[1])

    return result


def step2_fetch_plurals(slug_map):
    """Step 2: Fetch detail pages for plural + construct forms.

    Resumes from PLURALS_FILE and fetches the detail page of every word in
    *slug_map* that has no entry yet. Words whose page cannot be fetched are
    left out so a later run retries them; pages without a declension table
    get a minimal entry flagged with ``no_declension_table``.

    Returns the dict ``word_plain -> forms``.
    """
    print("\n" + "=" * 60)
    print("STEP 2: Fetching plural + construct forms from detail pages")
    print("=" * 60)

    plurals = load_json(PLURALS_FILE, {})
    already_done = set(plurals)

    # Build work list: nouns not yet in plurals
    work = [(word, info) for word, info in slug_map.items() if word not in already_done]

    print(f"Already have plural data: {len(already_done)}")
    print(f"Remaining to fetch: {len(work)}")
    if not work:
        print("All nouns already have plural data!")
        return plurals

    total = len(already_done) + len(work)
    skipped = 0
    try:
        for i, (word, info) in enumerate(work):
            slug = info["slug"]
            url = f"{BASE_URL}/dict/{slug}/"
            r = fetch_with_retry(url)
            if not r:
                print(f" Skipping {word} ({slug})")
                skipped += 1
                continue

            entry = parse_detail_page(r.text, slug, info.get("gender", ""), info.get("mishkal", ""))
            if entry:
                plurals[word] = entry
            else:
                # No declension table - store minimal entry
                plurals[word] = {
                    "slug": slug,
                    "singular": info.get("word_nikkud", ""),
                    "singular_audio": "",
                    "plural": "",
                    "plural_audio": "",
                    "construct_singular": "",
                    "construct_plural": "",
                    "gender": info.get("gender", ""),
                    "mishkal": info.get("mishkal", ""),
                    "no_declension_table": True,
                }

            done = len(already_done) + i + 1 - skipped
            if (i + 1) % 50 == 0 or i == 0:
                print(
                    f" [{i + 1}/{len(work)}] {word} ({slug}): "
                    f"plural={entry['plural'] if entry else 'N/A'} "
                    f"(total: {done}/{total})"
                )

            # Save every 50 entries
            if (i + 1) % 50 == 0:
                save_json(PLURALS_FILE, plurals)
                print(f" [Saved: {len(plurals)} entries]")

            # Be polite between requests, but do not wait after the last one.
            if i < len(work) - 1:
                time.sleep(DELAY)
    finally:
        # Final save; also runs on Ctrl-C or an unexpected error so that the
        # entries fetched since the last periodic save are not lost.
        save_json(PLURALS_FILE, plurals)
        print(f" [Saved: {len(plurals)} entries]")

    print(f"\nStep 2 complete: {len(plurals)} total noun entries with plural data")
    return plurals


def step3_summary(slug_map, plurals):
    """Step 3: Print summary statistics.

    Prints counts of entries with a plural form, construct forms, audio URLs
    and missing declension tables, plus the number of "irregular" plurals
    (masculine ending in ות, feminine ending in ים after nikkud is stripped).
    """
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)

    total_slugs = len(slug_map)
    total_plurals = len(plurals)
    has_plural = sum(1 for v in plurals.values() if v.get("plural"))
    has_construct = sum(1 for v in plurals.values() if v.get("construct_singular") or v.get("construct_plural"))
    has_audio = sum(1 for v in plurals.values() if v.get("singular_audio") or v.get("plural_audio"))
    no_table = sum(1 for v in plurals.values() if v.get("no_declension_table"))

    # Irregular plurals: masculine with ות- ending, feminine with ים- ending
    irregular = 0
    for v in plurals.values():
        plural = v.get("plural", "")
        gender = v.get("gender", "")
        if not plural or not gender:
            continue
        plain_plural = _strip_nikkud(plural)
        if (gender == "masculine" and plain_plural.endswith("ות")) or (
            gender == "feminine" and plain_plural.endswith("ים")
        ):
            irregular += 1

    print(f"Total nouns in slug map: {total_slugs}")
    print(f"Total nouns with plural data: {total_plurals}")
    print(f" - With plural form: {has_plural}")
    print(f" - With construct forms: {has_construct}")
    print(f" - With audio URLs: {has_audio}")
    print(f" - No declension table: {no_table}")
    print(f" - Irregular plurals: {irregular}")


def main():
    """Run all three steps in order."""
    print("Pealim Noun Plural Scraper")
    print(f"Data directory: {DATA_DIR}")
    print()

    slug_map = step1_collect_slugs()
    plurals = step2_fetch_plurals(slug_map)
    step3_summary(slug_map, plurals)


if __name__ == "__main__":
    main()