v0.15: PoS fix, slug-based audio, CSS cleanup, template improvements
- Fix PoS substring bug: "Pronoun" no longer matches "Noun"
- CSS: reduce sec-label/sec-key font sizes, add .definitions/.conf-entry
- Slug-based audio filenames for confusable words (no more collisions)
- Scraper captures slug from pealim.com list page links
- Confusables: RTL alignment, re-enable audio (remove all-must-have gate)
- Plurals: blue given word, gray meaning, labeled mishkal badge
- Conjugation: add "אֵיךְ אוֹמְרִים" prompt, tense prefix (בְּ), Prep field from HBPAREN_RE, labeled RelatedVocab
- Ben Yehuda: skip stripped fallback for confusable words
- Bump RELEASE_TAG to v0.15

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
802c369365
commit
2e48109d7f
6 changed files with 9310 additions and 9157 deletions
107
apkg_builder.py
107
apkg_builder.py
|
|
@ -39,7 +39,7 @@ COMPLETE_PLURAL_DECK_ID = 1_234_567_903
|
|||
|
||||
# Release version tag added to all notes so users can identify which release
|
||||
# their cards come from (visible in Anki's Browse view and card info).
|
||||
RELEASE_TAG = "v0.14"
|
||||
RELEASE_TAG = "v0.15"
|
||||
|
||||
# Regex for extracting emoji and Hebrew prepositions from meaning strings
|
||||
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
|
||||
|
|
@ -212,16 +212,26 @@ CARD_CSS = """
|
|||
color: #555;
|
||||
}
|
||||
.sec-label {
|
||||
font-size: 32px;
|
||||
font-size: 20px;
|
||||
font-weight: normal;
|
||||
color: #555;
|
||||
direction: rtl;
|
||||
text-align: center;
|
||||
margin-top: 6px;
|
||||
}
|
||||
.sec-key {
|
||||
font-size: 24px;
|
||||
font-size: 18px;
|
||||
color: #888;
|
||||
}
|
||||
.definitions {
|
||||
direction: rtl;
|
||||
text-align: center;
|
||||
}
|
||||
.conf-entry {
|
||||
margin: 8px 0;
|
||||
font-size: 20px;
|
||||
direction: rtl;
|
||||
}
|
||||
.related-group {
|
||||
direction: rtl;
|
||||
text-align: right;
|
||||
|
|
@ -241,6 +251,7 @@ CARD_CSS = """
|
|||
.root-info { color: #aaa; }
|
||||
.sec-label { color: #aaa; }
|
||||
.sec-key { color: #666; }
|
||||
.conf-entry { color: #ddd; }
|
||||
.hint { color: #777; }
|
||||
.voice-label { color: #888; }
|
||||
.example { color: #bbb; border-right-color: #555; }
|
||||
|
|
@ -361,19 +372,21 @@ VOCAB_MODEL = genanki.Model(
|
|||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
CONJ_FRONT = """
|
||||
<div class="sec-label">אֵיךְ אוֹמְרִים</div>
|
||||
<div class="hebrew" style="color:#1a1a8c;">{{ReferenceForm}}{{#Prep}} ({{Prep}}){{/Prep}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
|
||||
<div class="hebrew">{{Pronoun}}</div>
|
||||
<div class="meaning" style="font-size:28px;">{{ReferenceForm}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
|
||||
<div class="hebrew">{{Tense}}</div>
|
||||
"""
|
||||
|
||||
CONJ_BACK = """
|
||||
{{FrontSide}}<hr>
|
||||
<div class="hebrew">{{ConjugatedForm}}</div>
|
||||
<div class="hebrew">{{ConjugatedForm}}{{#Prep}} ({{Prep}}){{/Prep}}</div>
|
||||
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
|
||||
{{#Meaning}}<div class="sec-label">{{Meaning}}</div>{{/Meaning}}
|
||||
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>
|
||||
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span> {{Binyan}}</div>
|
||||
{{#RelatedVocab}}<div class="sec-label">{{RelatedVocab}}</div>{{/RelatedVocab}}
|
||||
{{#RelatedVocab}}<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
|
||||
<div class="sec-label">{{RelatedVocab}}</div>{{/RelatedVocab}}
|
||||
"""
|
||||
|
||||
CONJ_CSS = CARD_CSS
|
||||
|
|
@ -393,6 +406,7 @@ CONJ_MODEL = genanki.Model(
|
|||
{"name": "Audio"},
|
||||
{"name": "Meaning"},
|
||||
{"name": "RelatedVocab"},
|
||||
{"name": "Prep"},
|
||||
],
|
||||
templates=[
|
||||
{
|
||||
|
|
@ -441,6 +455,14 @@ PAST_3P_EXPANSION = [
|
|||
("הֵן", "עָבָר"),
|
||||
]
|
||||
|
||||
# Tense labels with "בְּ" prefix for display on cards
|
||||
TENSE_WITH_BE = {
|
||||
"עָבָר": "בֶּעָבָר",
|
||||
"הוֹוֶה": "בַּהוֹוֶה",
|
||||
"עָתִיד": "בֶּעָתִיד",
|
||||
"צִיּוּוּי": "בַּצִּוּוּי",
|
||||
}
|
||||
|
||||
# Voice field: passive label only (shown inline on card front for Pu'al/Huf'al)
|
||||
VOICE_MAP = {
|
||||
"Pu'al": "סָבִיל",
|
||||
|
|
@ -453,8 +475,15 @@ VOICE_MAP = {
|
|||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def _audio_tag(word_no_nikkud: str, audio_dir: Path = AUDIO_DIR) -> str:
|
||||
"""Return [sound:xxx.mp3] if audio file exists, else empty string."""
|
||||
def _audio_tag(word_no_nikkud: str, audio_dir: Path = AUDIO_DIR, slug: str = "") -> str:
|
||||
"""Return [sound:xxx.mp3] if audio file exists, else empty string.
|
||||
|
||||
Tries slug-based filename first (for confusable words), then consonant-based.
|
||||
"""
|
||||
if slug:
|
||||
slug_path = audio_dir / f"{slug}.mp3"
|
||||
if slug_path.exists():
|
||||
return f"[sound:{slug_path.name}]"
|
||||
safe = re.sub(r"[^\u05d0-\u05ea]", "", word_no_nikkud)
|
||||
if not safe:
|
||||
return ""
|
||||
|
|
@ -651,8 +680,9 @@ def _load_emoji_lookup() -> dict[str, str]:
|
|||
|
||||
def _translate_pos(pos_str: str) -> str:
|
||||
"""Translate PoS string to Hebrew. For verbs, appends binyan."""
|
||||
base = pos_str.split("–")[0].split("—")[0].strip()
|
||||
for eng, heb in POS_TO_HEBREW.items():
|
||||
if eng.lower() in pos_str.lower():
|
||||
if base == eng:
|
||||
if eng == "Verb":
|
||||
# Extract binyan from strings like "Verb – Pi'el" or "Verb –pi'el"
|
||||
for binyan_eng, binyan_heb in BINYAN_TO_HEBREW.items():
|
||||
|
|
@ -932,18 +962,20 @@ def build_vocab_deck(
|
|||
# Eng→Heb disambiguation hint (PoS + binyan, shown only for ambiguous meanings)
|
||||
hint_str = _word_meaning_hints.get((word, meaning), "")
|
||||
|
||||
# Audio
|
||||
audio_tag = _audio_tag(word_no_nik) if include_audio else ""
|
||||
# Consonant-only form for confusable detection and cloze matching
|
||||
word_consonants = _strip_nikkud(word)
|
||||
is_confusable = word_consonants in _confusable_words
|
||||
|
||||
# Audio — use slug-based filename for confusable words
|
||||
slug_val = str(row.get("slug", "")).strip()
|
||||
slug_val = "" if slug_val in ("nan", "None") else slug_val
|
||||
audio_tag = _audio_tag(word_no_nik, slug=slug_val if is_confusable else "") if include_audio else ""
|
||||
if audio_tag:
|
||||
mp3_name = audio_tag.removeprefix("[sound:").removesuffix("]")
|
||||
mp3_path = AUDIO_DIR / mp3_name
|
||||
if mp3_path not in media_files:
|
||||
media_files.append(mp3_path)
|
||||
|
||||
# Consonant-only form for confusable detection and cloze matching
|
||||
word_consonants = _strip_nikkud(word)
|
||||
is_confusable = word_consonants in _confusable_words
|
||||
|
||||
# Example sentences — priority: EPUB (nikkud'd) > Ben Yehuda > none
|
||||
# For confusable words (same consonants, different nikkud), only match by
|
||||
# exact nikkud form to avoid showing wrong-word sentences.
|
||||
|
|
@ -1137,6 +1169,12 @@ def build_conj_deck(
|
|||
or verb_meaning.get(infinitive, "")
|
||||
or verb_meaning.get(_strip_nikkud(infinitive), "")
|
||||
)
|
||||
# Extract Hebrew preposition from meaning (e.g., "(על)" → prep_str)
|
||||
prep_str = ""
|
||||
if meaning:
|
||||
preps = HBPAREN_RE.findall(meaning)
|
||||
prep_str = " ".join(f"({p})" for p in preps)
|
||||
|
||||
related = [w for w in root_words.get(root, []) if w != infinitive]
|
||||
related_str = " ".join(related[:8]) if related else ""
|
||||
forms = data["forms"]
|
||||
|
|
@ -1154,10 +1192,13 @@ def build_conj_deck(
|
|||
_voice: str = voice,
|
||||
_meaning: str = meaning,
|
||||
_related_str: str = related_str,
|
||||
_prep_str: str = prep_str,
|
||||
) -> None:
|
||||
nonlocal note_count
|
||||
if not conj_form or not re.search(r"[\u05d0-\u05ea]", conj_form):
|
||||
return
|
||||
# Apply tense prefix (בְּ)
|
||||
display_tense = TENSE_WITH_BE.get(tense, tense)
|
||||
note = genanki.Note(
|
||||
model=CONJ_MODEL,
|
||||
guid=genanki.guid_for(_infinitive, pronoun, tense),
|
||||
|
|
@ -1165,7 +1206,7 @@ def build_conj_deck(
|
|||
_infinitive,
|
||||
_ref_form,
|
||||
pronoun,
|
||||
tense,
|
||||
display_tense,
|
||||
conj_form,
|
||||
_root,
|
||||
_binyan_heb,
|
||||
|
|
@ -1173,6 +1214,7 @@ def build_conj_deck(
|
|||
audio_tag,
|
||||
_meaning,
|
||||
_related_str,
|
||||
_prep_str,
|
||||
],
|
||||
tags=[RELEASE_TAG],
|
||||
)
|
||||
|
|
@ -1245,7 +1287,7 @@ def build_conj_deck(
|
|||
|
||||
CONF_FRONT = """
|
||||
<div class="hebrew" style="font-size:36px;">{{Words}}</div>
|
||||
<div class="meaning" style="font-size:32px;">מה ההבדל?</div>
|
||||
<div class="meaning" style="font-size:32px; direction:rtl; text-align:center;">מה ההבדל?</div>
|
||||
"""
|
||||
|
||||
CONF_BACK = """
|
||||
|
|
@ -1293,6 +1335,15 @@ def build_confusables_deck(
|
|||
media_files: list[Path] = []
|
||||
note_count = 0
|
||||
|
||||
# Build slug lookup: word (nikkud) → slug
|
||||
slug_lookup: dict[str, str] = {}
|
||||
if "slug" in df.columns:
|
||||
for _, row in df.iterrows():
|
||||
w = str(row.get("Word", "")).strip()
|
||||
s = str(row.get("slug", "")).strip()
|
||||
if w and s and s not in ("nan", "None"):
|
||||
slug_lookup[w] = s
|
||||
|
||||
# Group by Word Without Nikkud
|
||||
groups = {}
|
||||
for _, row in df.iterrows():
|
||||
|
|
@ -1326,7 +1377,6 @@ def build_confusables_deck(
|
|||
words_display = " / ".join(w for w, _, _ in unique_entries)
|
||||
defs_parts = []
|
||||
audio_parts = []
|
||||
all_have_audio = True
|
||||
for w, m, p in unique_entries:
|
||||
pos_label = f" ({p})" if p else ""
|
||||
defs_parts.append(
|
||||
|
|
@ -1334,19 +1384,14 @@ def build_confusables_deck(
|
|||
f" = {m}{pos_label}</div>"
|
||||
)
|
||||
if include_audio:
|
||||
at = _audio_tag(_strip_nikkud(w))
|
||||
slug = slug_lookup.get(w, "")
|
||||
at = _audio_tag(_strip_nikkud(w), slug=slug)
|
||||
if at and at not in audio_parts:
|
||||
audio_parts.append(at)
|
||||
mp3_name = at.removeprefix("[sound:").removesuffix("]")
|
||||
mp3_path = AUDIO_DIR / mp3_name
|
||||
if mp3_path not in media_files:
|
||||
media_files.append(mp3_path)
|
||||
else:
|
||||
all_have_audio = False
|
||||
|
||||
# Only include audio if every word in the group has it
|
||||
if not all_have_audio:
|
||||
audio_parts = []
|
||||
|
||||
defs_html = "\n".join(defs_parts)
|
||||
audio_html = " ".join(audio_parts)
|
||||
|
|
@ -1382,9 +1427,9 @@ def write_conf_apkg(
|
|||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
PLURAL_FRONT_SG = """
|
||||
<div class="hebrew">{{Singular}}</div>
|
||||
<div class="hebrew" style="color:#1a1a8c;">{{Singular}}</div>
|
||||
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
|
||||
<div class="meaning">{{Meaning}}</div>
|
||||
<div class="sec-label">{{Meaning}}</div>
|
||||
<div class="hint" style="font-size:28px;">יָחִיד ← רַבִּים</div>
|
||||
"""
|
||||
|
||||
|
|
@ -1392,11 +1437,11 @@ PLURAL_BACK_SG = """
|
|||
{{FrontSide}}<hr>
|
||||
<div class="hebrew">{{Plural}}</div>
|
||||
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
|
||||
{{#Mishkal}}<div class="freq-badge">{{Mishkal}}</div>{{/Mishkal}}
|
||||
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
|
||||
"""
|
||||
|
||||
PLURAL_FRONT_PL = """
|
||||
<div class="hebrew">{{Plural}}</div>
|
||||
<div class="hebrew" style="color:#1a1a8c;">{{Plural}}</div>
|
||||
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
|
||||
<div class="hint" style="font-size:28px;">רַבִּים ← יָחִיד</div>
|
||||
"""
|
||||
|
|
@ -1405,8 +1450,8 @@ PLURAL_BACK_PL = """
|
|||
{{FrontSide}}<hr>
|
||||
<div class="hebrew">{{Singular}}</div>
|
||||
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
|
||||
<div class="meaning">{{Meaning}}</div>
|
||||
{{#Mishkal}}<div class="freq-badge">{{Mishkal}}</div>{{/Mishkal}}
|
||||
<div class="sec-label">{{Meaning}}</div>
|
||||
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
|
||||
"""
|
||||
|
||||
PLURAL_CSS = CARD_CSS
|
||||
|
|
|
|||
|
|
@ -131,13 +131,15 @@ def save_examples_cache() -> None:
|
|||
logger.info(f"Examples cache saved: {len(_examples_cache)} entries → {EXAMPLES_CACHE_PATH}")
|
||||
|
||||
|
||||
def get_examples(word_nikkud: str) -> list[str]:
|
||||
def get_examples(word_nikkud: str, confusable_consonants: set[str] | None = None) -> list[str]:
|
||||
"""
|
||||
Return 0 or 1 example sentences for the given word (nikkud form).
|
||||
|
||||
Lookup strategy:
|
||||
1. Try exact nikkud match in index.
|
||||
2. Fall back to stripped (no-nikkud) match against index keys.
|
||||
Skipped when word's consonants are in confusable_consonants set
|
||||
(to avoid returning sentences for the wrong homograph).
|
||||
|
||||
Returns the single longest sentence ≤ MAX_SENTENCE_LEN that contains
|
||||
the word as a whole token.
|
||||
|
|
@ -155,7 +157,7 @@ def get_examples(word_nikkud: str) -> list[str]:
|
|||
|
||||
# Lookup: try exact nikkud first, then stripped fallback
|
||||
candidates = _index.get(word, [])
|
||||
if not candidates and word_stripped:
|
||||
if not candidates and word_stripped and word_stripped not in (confusable_consonants or set()):
|
||||
# Try looking up by stripped form across index keys
|
||||
for k, v in _index.items():
|
||||
if _strip_nikkud(k) == word_stripped:
|
||||
|
|
|
|||
File diff suppressed because it is too large
Load diff
|
|
@ -5,6 +5,7 @@ Scrapes word entries, roots, parts of speech, and audio URLs for Anki flashcards
|
|||
"""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
|
||||
import pandas as pd
|
||||
|
|
@ -41,7 +42,7 @@ def get_total_pages() -> int:
|
|||
def _parse_page_with_audio(html_bytes: bytes) -> list[dict]:
|
||||
"""
|
||||
Parse a dict page with BeautifulSoup to extract word data + audio URL.
|
||||
Returns list of dicts with keys: Word, Root, Part of Speech, Meaning, audio_url.
|
||||
Returns list of dicts with keys: Word, Root, Part of Speech, Meaning, audio_url, slug.
|
||||
"""
|
||||
soup = BeautifulSoup(html_bytes, "html.parser")
|
||||
rows = []
|
||||
|
|
@ -52,6 +53,13 @@ def _parse_page_with_audio(html_bytes: bytes) -> list[dict]:
|
|||
# Audio URL from span[data-audio] in first td
|
||||
audio_span = tds[0].find(attrs={"data-audio": True})
|
||||
audio_url = audio_span["data-audio"] if audio_span else ""
|
||||
# Slug from the detail page link (e.g., /dict/6009-av/ → 6009-av)
|
||||
slug = ""
|
||||
link = tds[0].find("a", href=True)
|
||||
if link:
|
||||
m = re.search(r"/dict/([^/]+)/", link["href"])
|
||||
if m:
|
||||
slug = m.group(1)
|
||||
# Word with nikkud
|
||||
menukad = tds[0].find("span", class_="menukad")
|
||||
word = menukad.get_text(strip=True) if menukad else tds[0].get_text(strip=True)
|
||||
|
|
@ -69,6 +77,7 @@ def _parse_page_with_audio(html_bytes: bytes) -> list[dict]:
|
|||
"Part of Speech": pos,
|
||||
"Meaning": meaning,
|
||||
"audio_url": audio_url,
|
||||
"slug": slug,
|
||||
}
|
||||
)
|
||||
return rows
|
||||
|
|
|
|||
44
run.py
44
run.py
|
|
@ -136,12 +136,35 @@ def step_examples(args, freq_cache: dict):
|
|||
if args.test:
|
||||
df = df.head(args.test)
|
||||
|
||||
# Build confusable consonant set from CSV
|
||||
consonant_counts: dict[str, int] = {}
|
||||
for _, row in df.iterrows():
|
||||
word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
|
||||
if word_no_nik and word_no_nik not in ("nan", "None"):
|
||||
safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_no_nik))
|
||||
if safe:
|
||||
consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
|
||||
confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
|
||||
|
||||
# Delete stale cache entries for confusable words so they get re-fetched
|
||||
stale_deleted = 0
|
||||
for _, row in df.iterrows():
|
||||
word_nikkud = str(row.get("Word", "")).strip()
|
||||
word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
|
||||
if word_nikkud and word_no_nik:
|
||||
safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_no_nik))
|
||||
if safe in confusable_consonants and word_nikkud in benyehuda._examples_cache:
|
||||
del benyehuda._examples_cache[word_nikkud]
|
||||
stale_deleted += 1
|
||||
if stale_deleted:
|
||||
logger.info(f" Deleted {stale_deleted} stale confusable cache entries")
|
||||
|
||||
logger.info(f" Pre-fetching examples for {len(df)} words …")
|
||||
for _, row in df.iterrows():
|
||||
# Use nikkud word form as primary key (nikkud corpus)
|
||||
word_nikkud = str(row.get("Word", "")).strip()
|
||||
if word_nikkud:
|
||||
benyehuda.get_examples(word_nikkud)
|
||||
benyehuda.get_examples(word_nikkud, confusable_consonants=confusable_consonants)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f" Could not pre-fetch all examples: {e}")
|
||||
|
|
@ -184,6 +207,17 @@ def step_audio(args):
|
|||
if args.test:
|
||||
df = df.head(args.test)
|
||||
|
||||
# Build confusable set: consonant forms that appear more than once
|
||||
confusable_consonants: set[str] = set()
|
||||
consonant_counts: dict[str, int] = {}
|
||||
for _, row in df.iterrows():
|
||||
word_plain = str(row.get("Word Without Nikkud", "")).strip()
|
||||
if word_plain and word_plain not in ("nan", "None"):
|
||||
safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_plain))
|
||||
if safe:
|
||||
consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
|
||||
confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
|
||||
|
||||
AUDIO_DIR.mkdir(parents=True, exist_ok=True)
|
||||
downloaded = 0
|
||||
skipped = 0
|
||||
|
|
@ -193,6 +227,7 @@ def step_audio(args):
|
|||
word = str(row.get("Word", "")).strip()
|
||||
word_plain = str(row.get("Word Without Nikkud", "")).strip()
|
||||
audio_url = str(row.get("audio_url", "")).strip()
|
||||
slug = str(row.get("slug", "")).strip()
|
||||
|
||||
if not word:
|
||||
continue
|
||||
|
|
@ -200,7 +235,12 @@ def step_audio(args):
|
|||
safe_name = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_plain or word))
|
||||
if not safe_name:
|
||||
continue
|
||||
mp3_path = AUDIO_DIR / f"{safe_name}.mp3"
|
||||
|
||||
# Confusable words: use slug-based filename to avoid collisions
|
||||
if safe_name in confusable_consonants and slug and slug not in ("nan", "None"):
|
||||
mp3_path = AUDIO_DIR / f"{slug}.mp3"
|
||||
else:
|
||||
mp3_path = AUDIO_DIR / f"{safe_name}.mp3"
|
||||
|
||||
if mp3_path.exists():
|
||||
skipped += 1
|
||||
|
|
|
|||
57
scripts/add_slugs.py
Normal file
57
scripts/add_slugs.py
Normal file
|
|
@ -0,0 +1,57 @@
|
|||
#!/usr/bin/env python3
|
||||
"""One-time script: scrape slugs from pealim.com dict pages and add to CSV."""
|
||||
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(message)s", stream=sys.stderr)
logger = logging.getLogger()

# Semicolon-separated dictionary CSV; its first column is the saved index.
dict_csv = "data/hebrew_dict_for_anki.csv"
df = pd.read_csv(dict_csv, sep=";", index_col=0)
logger.info("Loaded %d rows", len(df))

session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-scraper/1.0)"})

# The same cookies are sent with every request, so build them once.
# NOTE(review): presumably these select "no transliteration" and a nikkud'd
# Hebrew style so scraped words match the CSV "Word" column — confirm.
cookies = {"translit": "none", "hebstyle": "mo"}

# Maps the word text scraped from the list page to its detail-page slug.
# If the same word text appears on several rows, the last slug seen wins.
word_slug_map: dict[str, str] = {}
# Pages whose fetch or parse raised; reported at the end so gaps are visible.
failed_pages: list[int] = []
total_pages = 608

for page_num in range(1, total_pages + 1):
    url = f"https://www.pealim.com/dict/?page={page_num}"
    try:
        resp = session.get(url, cookies=cookies, timeout=10)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.content, "html.parser")
        for tr in soup.select("table tr"):
            tds = tr.find_all("td")
            if len(tds) < 4:
                # Rows with fewer than four cells are not treated as word entries.
                continue
            menukad = tds[0].find("span", class_="menukad")
            word = menukad.get_text(strip=True) if menukad else tds[0].get_text(strip=True)
            link = tds[0].find("a", href=True)
            slug = ""
            if link:
                # e.g. /dict/6009-av/ → 6009-av
                m = re.search(r"/dict/([^/]+)/", link["href"])
                if m:
                    slug = m.group(1)
            if word and slug:
                word_slug_map[word] = slug
    except Exception as e:
        # Deliberately best-effort: one bad page must not abort the whole
        # run, but remember it so the final summary can list it.
        failed_pages.append(page_num)
        logger.warning("Page %d failed: %s", page_num, e)

    if page_num % 50 == 0:
        logger.info("Scraped %d/%d pages (%d slugs)", page_num, total_pages, len(word_slug_map))
    # Pause between requests to avoid hammering the site.
    time.sleep(0.8)

scraped_slugs = df["Word"].map(word_slug_map)
if "slug" in df.columns:
    # On a re-run, keep slugs already stored in the CSV for words this run
    # could not resolve (e.g. their page failed) instead of blanking them.
    scraped_slugs = scraped_slugs.fillna(df["slug"])
df["slug"] = scraped_slugs.fillna("")
df.to_csv(dict_csv, sep=";", index=True)
matched = (df["slug"] != "").sum()
logger.info("Done. %d/%d words have slugs. Saved → %s", matched, len(df), dict_csv)
if failed_pages:
    logger.warning("%d page(s) failed and may be missing slugs: %s", len(failed_pages), failed_pages)
|
||||
Loading…
Reference in a new issue