- Fix PoS substring bug: "Pronoun" no longer matches "Noun"
- CSS: reduce sec-label/sec-key font sizes, add .definitions/.conf-entry
- Slug-based audio filenames for confusable words (no more collisions)
- Scraper captures slug from pealim.com list page links
- Confusables: RTL alignment, re-enable audio (remove all-must-have gate)
- Plurals: blue given word, gray meaning, labeled mishkal badge
- Conjugation: add "אֵיךְ אוֹמְרִים" prompt, tense prefix (בְּ), Prep field from HBPAREN_RE, labeled RelatedVocab
- Ben Yehuda: skip stripped fallback for confusable words
- Bump RELEASE_TAG to v0.15

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
57 lines
1.9 KiB
Python
57 lines
1.9 KiB
Python
#!/usr/bin/env python3
|
|
"""One-time script: scrape slugs from pealim.com dict pages and add to CSV."""
|
|
|
|
import logging
|
|
import re
|
|
import sys
|
|
import time
|
|
|
|
import pandas as pd
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
|
|
# Root logger writes timestamped INFO-and-above records to stderr.
logging.basicConfig(
    stream=sys.stderr,
    level=logging.INFO,
    format="%(asctime)s %(message)s",
)
logger = logging.getLogger()

# Semicolon-separated dictionary CSV; its first column is used as the index.
dict_csv = "data/hebrew_dict_for_anki.csv"
df = pd.read_csv(dict_csv, sep=";", index_col=0)
logger.info(f"Loaded {len(df)} rows")

# A single HTTP session is reused for every request and carries the
# scraper's own User-Agent header.
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (compatible; pealim-scraper/1.0)"

# Word text -> pealim.com slug, populated while the list pages are scraped.
word_slug_map: dict[str, str] = {}

# Number of list pages to request.
# NOTE(review): hard-coded; presumably the site's page count when the script
# was written - confirm before re-running.
total_pages = 608
|
|
|
|
# Loop invariants, built once instead of on every page.
# NOTE(review): the cookie values presumably disable transliteration and
# request vowel-pointed ("menukad") spelling - confirm against pealim.com.
cookies = {"translit": "none", "hebstyle": "mo"}
slug_re = re.compile(r"/dict/([^/]+)/")

# Pages whose request or parsing raised; reported once after the loop so an
# incomplete scrape is visible in the log.
failed_pages: list[int] = []

# Walk every dictionary list page and record word -> slug for each table row.
for page_num in range(1, total_pages + 1):
    url = f"https://www.pealim.com/dict/?page={page_num}"
    try:
        resp = session.get(url, cookies=cookies, timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, "html.parser")
        for tr in soup.select("table tr"):
            tds = tr.find_all("td")
            # Rows with fewer than four cells are skipped (presumably header
            # or layout rows rather than dictionary entries).
            if len(tds) < 4:
                continue
            first_cell = tds[0]

            # The slug is the path segment after /dict/ in the row's link.
            link = first_cell.find("a", href=True)
            match = slug_re.search(link["href"]) if link else None
            if match is None:
                continue

            # Prefer the text of the "menukad" span; otherwise use the text
            # of the whole first cell.
            menukad = first_cell.find("span", class_="menukad")
            if menukad:
                word = menukad.get_text(strip=True)
            else:
                word = first_cell.get_text(strip=True)

            # A word seen again later overwrites the earlier slug.
            if word:
                word_slug_map[word] = match.group(1)
    except Exception as e:
        # Best effort: a failing page is logged and skipped, not fatal.
        # Rows already parsed from that page stay in the map.
        failed_pages.append(page_num)
        logger.warning("Page %d failed: %s", page_num, e)

    if page_num % 50 == 0:
        logger.info(
            "Scraped %d/%d pages (%d slugs)",
            page_num,
            total_pages,
            len(word_slug_map),
        )
    # Pause between requests to keep the request rate low.
    time.sleep(0.8)

if failed_pages:
    logger.warning(
        "%d/%d pages failed; their slugs may be missing: %s",
        len(failed_pages),
        total_pages,
        failed_pages,
    )
|
|
|
|
# Look up each row's "Word" in the scraped map; rows without a match get an
# empty string in the "slug" column.
slug_column = df["Word"].map(word_slug_map)
df["slug"] = slug_column.fillna("")

# Rewrite the same CSV file in place, keeping the index column.
df.to_csv(dict_csv, sep=";", index=True)

# Count the rows that ended up with a non-empty slug for the summary line.
matched = df["slug"].ne("").sum()
logger.info(f"Done. {matched}/{len(df)} words have slugs. Saved → {dict_csv}")
|