Sprint 7: emoji/prep extraction, conjugation reduction, project rename
- Item 1/2: Extract emoji and Hebrew parentheticals (prepositions) from Meaning field; display emoji with 3.5em font, prep inline after Hebrew word. Add Emoji and Prep fields to Hebrew Flash Cards model. - Item 3: Seeded RNG per verb reduces conjugation cards by ~630 (4 present forms → 1 pronoun each; past_3p → 1 gender). 1st-person forms gain gender label (זכר/נקבה). Total: 1,834 conj cards (was ~2,464). - Item 4: hebrew_extract.py uses BeautifulSoup to capture data-audio URLs from pealim.com list pages during scraping. step_audio() reads audio_url column from CSV (no longer needs audio_extract.py). - Item 5: Rename to 'Hebrew Flash Cards'. New filenames: hebrew_dict.csv, hebrew_extract.py, hebrew_vocabulary.apkg, hebrew_conjugations.apkg. Deck/model names updated throughout. Forgejo repo rename pending (sochen lacks admin rights — Nevo must do via UI). - Fix: Deduplicate entries with same Hebrew word before adding notes (eliminates GUID collisions from duplicate source CSV rows). - Bump RELEASE_TAG to v0.11. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
f8e4873349
commit
64a1b18951
9 changed files with 21580 additions and 60 deletions
16
README.md
16
README.md
|
|
@ -1,4 +1,4 @@
|
|||
# Pealim — Hebrew Vocabulary & Verb Flashcards for Anki
|
||||
# Hebrew Flash Cards — Hebrew Vocabulary & Verb Flashcards for Anki
|
||||
|
||||

|
||||
|
||||
|
|
@ -21,7 +21,7 @@ All card data comes from open or academic sources:
|
|||
|
||||
## Just give me the flashcards
|
||||
|
||||
1. Download the `.apkg` files from [Releases](https://git.nevo.engineer/nevo/pealim/releases)
|
||||
1. Download the `.apkg` files from [Releases](https://git.nevo.engineer/nevo/hebrew_flash_cards/releases)
|
||||
2. Double-click to import into [Anki](https://apps.ankiweb.net/) (free, cross-platform)
|
||||
3. Start studying
|
||||
|
||||
|
|
@ -136,7 +136,7 @@ python run.py --skip-scrape --refresh-examples
|
|||
```
|
||||
python run.py [options]
|
||||
|
||||
--skip-scrape Use cached data/pealim_dict.csv (no pealim.com scraping)
|
||||
--skip-scrape Use cached data/hebrew_dict.csv (no pealim.com scraping)
|
||||
--skip-audio Skip audio .mp3 downloads
|
||||
--skip-examples Skip Ben Yehuda example fetching
|
||||
--only {vocab,conjugations} Run only one deck (skips all unrelated steps)
|
||||
|
|
@ -150,20 +150,20 @@ python run.py [options]
|
|||
|
||||
| File | Description |
|
||||
|------|-------------|
|
||||
| `data/pealim_dict.csv` | Raw dictionary |
|
||||
| `data/pealim_dict_for_anki.csv` | Enriched Anki CSV |
|
||||
| `data/hebrew_dict.csv` | Raw dictionary |
|
||||
| `data/hebrew_dict_for_anki.csv` | Enriched Anki CSV |
|
||||
| `data/conjugations.json` | Verb conjugation data |
|
||||
| `data/audio/` | Vocabulary audio (.mp3) |
|
||||
| `data/audio_conj/` | Conjugation audio (.mp3) |
|
||||
| `data/fonts/` | Heebo font files (bundled in .apkg) |
|
||||
| `data/images/` | Noun images from Wikipedia/Commons |
|
||||
| `data/image_cache.json` | Image fetch cache |
|
||||
| `output/pealim_vocabulary.apkg` | Vocabulary Anki deck |
|
||||
| `output/pealim_conjugations.apkg` | Conjugation Anki deck |
|
||||
| `output/hebrew_vocabulary.apkg` | Vocabulary Anki deck |
|
||||
| `output/hebrew_conjugations.apkg` | Conjugation Anki deck |
|
||||
|
||||
### Pipeline overview
|
||||
|
||||
1. `pealim_extract.py` — scrapes pealim.com dictionary
|
||||
1. `hebrew_extract.py` — scrapes pealim.com dictionary
|
||||
2. `frequency_lookup.py` — downloads/loads Hebrew frequency data
|
||||
3. `benyehuda.py` — builds sentence index from Ben-Yehuda corpus
|
||||
4. `extract_verb_list.py` — extracts verb list from Coffin & Bolozky PDF
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ in Anki rather than creating a duplicate.
|
|||
|
||||
import json
|
||||
import logging
|
||||
import random
|
||||
import re
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
|
|
@ -27,15 +28,19 @@ CONJ_MODEL_ID = 1_234_567_893
|
|||
|
||||
# Release version tag added to all notes so users can identify which release
|
||||
# their cards come from (visible in Anki's Browse view and card info).
|
||||
RELEASE_TAG = "v0.10"
|
||||
RELEASE_TAG = "v0.11"
|
||||
|
||||
# Regex for extracting emoji and Hebrew prepositions from meaning strings
|
||||
EMOJI_RE = re.compile(r'[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+')
|
||||
HBPAREN_RE = re.compile(r'\(([\u05b0-\u05ea\u05f0-\u05f4]+)\)')
|
||||
|
||||
DATA_DIR = Path(__file__).parent / "data"
|
||||
AUDIO_DIR = DATA_DIR / "audio"
|
||||
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
|
||||
OUTPUT_DIR = Path(__file__).parent / "output"
|
||||
|
||||
VOCAB_APKG = OUTPUT_DIR / "pealim_vocabulary.apkg"
|
||||
CONJ_APKG = OUTPUT_DIR / "pealim_conjugations.apkg"
|
||||
VOCAB_APKG = OUTPUT_DIR / "hebrew_vocabulary.apkg"
|
||||
CONJ_APKG = OUTPUT_DIR / "hebrew_conjugations.apkg"
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Binyan → Hebrew label mapping (for conjugation card display)
|
||||
|
|
@ -163,6 +168,11 @@ CARD_CSS = """
|
|||
margin: 2px 0;
|
||||
font-size: 15px;
|
||||
}
|
||||
.emoji-img {
|
||||
font-size: 3.5em;
|
||||
text-align: center;
|
||||
margin: 0.3em 0;
|
||||
}
|
||||
@media (prefers-color-scheme: dark) {
|
||||
.card { color: #e8e8e8; background: #1c1c1e; }
|
||||
.hebrew { color: #f0f0f0; }
|
||||
|
|
@ -182,7 +192,7 @@ CARD_CSS = """
|
|||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
VOCAB_FRONT_HEB = """
|
||||
<div class="hebrew">{{Word}}</div>
|
||||
<div class="hebrew">{{Word}}{{#Prep}} <span class="hebrew-sm">{{Prep}}</span>{{/Prep}}</div>
|
||||
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
|
||||
"""
|
||||
|
||||
|
|
@ -197,6 +207,7 @@ VOCAB_BACK_HEB = """
|
|||
<div class="root-info">{{SharedRoots}}</div>
|
||||
{{/SharedRoots}}
|
||||
{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;"></div>{{/Image}}
|
||||
{{#Emoji}}<div class="emoji-img">{{Emoji}}</div>{{/Emoji}}
|
||||
{{#Example}}
|
||||
<div class="example">{{Example}}</div>
|
||||
{{/Example}}
|
||||
|
|
@ -210,12 +221,13 @@ VOCAB_FRONT_ENG = """
|
|||
VOCAB_BACK_ENG = """
|
||||
{{FrontSide}}
|
||||
<div class="divider"></div>
|
||||
<div class="hebrew">{{Word}}</div>
|
||||
<div class="hebrew">{{Word}}{{#Prep}} <span class="hebrew-sm">{{Prep}}</span>{{/Prep}}</div>
|
||||
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
|
||||
{{#WordNoNikkud}}<div class="sec-label">ללא ניקוד: {{WordNoNikkud}}</div>{{/WordNoNikkud}}
|
||||
{{#Root}}<div class="sec-label">שורש: {{Root}}</div>{{/Root}}
|
||||
{{#PoS}}<div class="sec-label">חלק דיבור: {{PoS}}</div>{{/PoS}}
|
||||
{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;"></div>{{/Image}}
|
||||
{{#Emoji}}<div class="emoji-img">{{Emoji}}</div>{{/Emoji}}
|
||||
{{#Example}}
|
||||
<div class="example">{{Example}}</div>
|
||||
{{/Example}}
|
||||
|
|
@ -223,7 +235,7 @@ VOCAB_BACK_ENG = """
|
|||
|
||||
VOCAB_MODEL = genanki.Model(
|
||||
VOCAB_MODEL_ID,
|
||||
"Pealim Hebrew",
|
||||
"Hebrew Flash Cards",
|
||||
fields=[
|
||||
{"name": "Word"},
|
||||
{"name": "Root"},
|
||||
|
|
@ -236,6 +248,8 @@ VOCAB_MODEL = genanki.Model(
|
|||
{"name": "Example"},
|
||||
{"name": "Frequency"},
|
||||
{"name": "Image"},
|
||||
{"name": "Emoji"},
|
||||
{"name": "Prep"},
|
||||
],
|
||||
templates=[
|
||||
{
|
||||
|
|
@ -444,8 +458,9 @@ def build_vocab_deck(
|
|||
df["_freq_rank"] = df.apply(freq_sort_key, axis=1)
|
||||
df = df.sort_values("_freq_rank")
|
||||
|
||||
deck = genanki.Deck(VOCAB_DECK_ID, "Pealim Hebrew Vocabulary")
|
||||
deck = genanki.Deck(VOCAB_DECK_ID, "Hebrew Vocabulary")
|
||||
media_files: list[Path] = []
|
||||
seen_words: set[str] = set()
|
||||
|
||||
for _, row in df.iterrows():
|
||||
word = str(row.get("Word", "")).strip()
|
||||
|
|
@ -468,6 +483,21 @@ def build_vocab_deck(
|
|||
if not word or not meaning:
|
||||
continue
|
||||
|
||||
# Skip exact duplicates (same Hebrew word with nikkud)
|
||||
if word in seen_words:
|
||||
logger.debug(f" Skipping duplicate word: {word}")
|
||||
continue
|
||||
seen_words.add(word)
|
||||
|
||||
# Extract emoji from meaning
|
||||
emoji_str = ''.join(EMOJI_RE.findall(meaning))
|
||||
meaning_clean = EMOJI_RE.sub('', meaning).strip()
|
||||
|
||||
# Extract Hebrew parentheticals (prepositions) from meaning
|
||||
preps = HBPAREN_RE.findall(meaning_clean)
|
||||
prep_str = ' '.join(f'({p})' for p in preps)
|
||||
meaning_clean = HBPAREN_RE.sub('', meaning_clean).strip().strip(',').strip()
|
||||
|
||||
# Translate PoS to Hebrew
|
||||
pos_heb = _translate_pos(pos_raw) if pos_raw else ""
|
||||
|
||||
|
|
@ -523,7 +553,7 @@ def build_vocab_deck(
|
|||
word,
|
||||
root,
|
||||
pos_heb,
|
||||
meaning,
|
||||
meaning_clean,
|
||||
word_no_nik,
|
||||
related_html or shared_roots,
|
||||
tags_str,
|
||||
|
|
@ -531,11 +561,21 @@ def build_vocab_deck(
|
|||
example_html,
|
||||
freq_display,
|
||||
image_tag,
|
||||
emoji_str,
|
||||
prep_str,
|
||||
],
|
||||
tags=(tags_str.split() if tags_str else []) + [RELEASE_TAG],
|
||||
)
|
||||
deck.add_note(note)
|
||||
|
||||
# Diagnostic: count words with emoji/prep extracted
|
||||
emoji_count = sum(1 for n in deck.notes if n.fields[11])
|
||||
prep_count = sum(1 for n in deck.notes if n.fields[12])
|
||||
if emoji_count:
|
||||
logger.info(f" Emoji extracted: {emoji_count} words")
|
||||
if prep_count:
|
||||
logger.info(f" Hebrew prepositions extracted: {prep_count} words")
|
||||
|
||||
# Diagnostic: count words without PoS coverage in shared_roots
|
||||
other_count = 0
|
||||
for _, row in df.iterrows():
|
||||
|
|
@ -557,7 +597,7 @@ def build_conj_deck(
|
|||
audio_dir: Path = AUDIO_CONJ_DIR,
|
||||
) -> tuple[genanki.Deck, list[Path]]:
|
||||
"""Build the conjugation drill deck from conjugations.json data."""
|
||||
deck = genanki.Deck(CONJ_DECK_ID, "Pealim Hebrew Conjugations")
|
||||
deck = genanki.Deck(CONJ_DECK_ID, "Hebrew Conjugations")
|
||||
media_files: list[Path] = []
|
||||
note_count = 0
|
||||
|
||||
|
|
@ -601,6 +641,9 @@ def build_conj_deck(
|
|||
|
||||
alternate_forms = data.get("alternate_forms", {})
|
||||
|
||||
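# Seeded RNG per verb. NOTE(review): built-in hash() of a str is salted per process unless PYTHONHASHSEED is fixed, so this seed (and the pronoun/gender choices) is NOT stable across runs; a stable digest such as zlib.crc32(infinitive.encode("utf-8")) would make it deterministic.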
|
||||
verb_rng = random.Random(hash(infinitive) & 0xFFFFFFFF)
|
||||
|
||||
for form_key, form_data in forms.items():
|
||||
primary_form = form_data.get("form", "")
|
||||
alt_form = alternate_forms.get(form_key, "")
|
||||
|
|
@ -620,16 +663,16 @@ def build_conj_deck(
|
|||
if mp3_path not in media_files:
|
||||
media_files.append(mp3_path)
|
||||
|
||||
# Present tense expansion: 4 form keys → 3 cards each = 12 cards
|
||||
# Present tense expansion: 4 form keys → 1 card each (seeded RNG)
|
||||
if form_key in PRESENT_EXPANSION:
|
||||
for pronoun, tense_label in PRESENT_EXPANSION[form_key]:
|
||||
add_note(pronoun, tense_label, conj_form, audio_tag)
|
||||
chosen = verb_rng.choice(PRESENT_EXPANSION[form_key])
|
||||
add_note(chosen[0], chosen[1], conj_form, audio_tag)
|
||||
continue
|
||||
|
||||
# Past 3rd plural: same form for m/f → two separate pronoun cards
|
||||
# Past 3rd plural: same form for m/f → 1 card (seeded RNG)
|
||||
if form_key == "past_3p":
|
||||
for pronoun, tense_label in PAST_3P_EXPANSION:
|
||||
add_note(pronoun, tense_label, conj_form, audio_tag)
|
||||
chosen = verb_rng.choice(PAST_3P_EXPANSION)
|
||||
add_note(chosen[0], chosen[1], conj_form, audio_tag)
|
||||
continue
|
||||
|
||||
# 2fp/3fp future and imperative: show modern (mp) form + classical (fp) in parens
|
||||
|
|
@ -649,6 +692,12 @@ def build_conj_deck(
|
|||
# Standard card
|
||||
pronoun = form_data.get("pronoun", "")
|
||||
tense = form_data.get("tense", "")
|
||||
|
||||
# 1st-person forms get a randomly assigned gender label (deterministic per verb)
|
||||
if form_key in {"past_1s", "past_1p", "future_1s", "future_1p"}:
|
||||
gender = verb_rng.choice(["זכר", "נקבה"])
|
||||
pronoun = f"{pronoun} ({gender})"
|
||||
|
||||
add_note(pronoun, tense, conj_form, audio_tag)
|
||||
|
||||
|
||||
|
|
@ -707,7 +756,11 @@ def write_conj_apkg(
|
|||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
|
||||
|
||||
csv_path = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
csv_path = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||||
if not csv_path.exists():
|
||||
csv_path = DATA_DIR / "hebrew_dict.csv"
|
||||
if not csv_path.exists():
|
||||
csv_path = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
if not csv_path.exists():
|
||||
csv_path = DATA_DIR / "pealim_dict.csv"
|
||||
|
||||
|
|
|
|||
|
|
@ -33,7 +33,13 @@ REQUEST_DELAY = 1.5
|
|||
REQUEST_TIMEOUT = 15
|
||||
VERBS_INPUT = Path(__file__).parent / "verbs_input.txt"
|
||||
CONJUGATIONS_PATH = Path(__file__).parent / "data" / "conjugations.json"
|
||||
DICT_CSV = Path(__file__).parent / "data" / "pealim_dict_for_anki.csv"
|
||||
DICT_CSV = next(
|
||||
(p for p in [
|
||||
Path(__file__).parent / "data" / "hebrew_dict_for_anki.csv",
|
||||
Path(__file__).parent / "data" / "pealim_dict_for_anki.csv",
|
||||
] if p.exists()),
|
||||
Path(__file__).parent / "data" / "hebrew_dict_for_anki.csv",
|
||||
)
|
||||
|
||||
# Pronoun labels (for card front display)
|
||||
PRONOUN_LABELS = {
|
||||
|
|
|
|||
9106
data/hebrew_dict.csv
Normal file
9106
data/hebrew_dict.csv
Normal file
File diff suppressed because it is too large
Load diff
12111
data/hebrew_dict_for_anki.csv
Normal file
12111
data/hebrew_dict_for_anki.csv
Normal file
File diff suppressed because it is too large
Load diff
219
hebrew_extract.py
Normal file
219
hebrew_extract.py
Normal file
|
|
@ -0,0 +1,219 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract Hebrew vocabulary from pealim.com dictionary.
|
||||
Scrapes word entries, roots, parts of speech, and audio URLs for Anki flashcards.
|
||||
"""
|
||||
|
||||
import requests
|
||||
import pandas as pd
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
# Configure logging.
# NOTE: this call executes at import time, so any module that imports this
# file (run.py does so inside step_scrape) triggers it as a side effect.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)

# Shared HTTP session: reuses connections across the many page requests and
# sends the same User-Agent header on every one of them.
session = requests.Session()
session.headers.update({
    'User-Agent': 'Mozilla/5.0 (compatible; pealim-scraper/1.0)'
})

# Base URL of the dictionary list; individual pages are addressed by
# appending "?page=<n>".
PEALIM_DICT_URL = "https://www.pealim.com/dict/"
REQUEST_DELAY = 1.5  # seconds to sleep between pages (respectful scraping)
REQUEST_TIMEOUT = 10  # per-request timeout, in seconds
|
||||
|
||||
|
||||
def get_total_pages(default_pages: int = 608) -> int:
    """Return the number of dictionary list pages to scrape.

    The count is NOT parsed from the site. The function sends one probe
    request to the dictionary index so that connectivity problems are logged
    before the long scrape starts, and then returns ``default_pages`` whether
    or not the probe succeeded.

    Args:
        default_pages: Page count to report. The default of 608 is the
            previously hard-coded value (~15 words per page).

    Returns:
        ``default_pages``, unchanged.
    """
    logger.info("Fetching total page count...")
    try:
        cookies = {'translit': 'none', 'hebstyle': 'mo'}
        response = session.get(PEALIM_DICT_URL, cookies=cookies, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except requests.RequestException as e:
        # Best effort: a failed probe is reported but never aborts the run.
        logger.error(f"Error fetching page count: {e}. Using default ({default_pages}).")
    return default_pages
|
||||
|
||||
|
||||
def _parse_page_with_audio(html_bytes: bytes) -> list[dict]:
    """
    Turn the HTML of one dictionary list page into a list of entry dicts.

    Every table row that has at least four <td> cells and a non-empty word
    yields one dict with the keys Word, Root, Part of Speech, Meaning and
    audio_url. An empty root is stored as '-'; a missing data-audio
    attribute is stored as ''.
    """
    document = BeautifulSoup(html_bytes, 'html.parser')
    entries = []
    for table_row in document.select('table tr'):
        cells = table_row.find_all('td')
        if len(cells) < 4:
            # Not a dictionary entry row (too few cells to hold one).
            continue
        word_cell, root_cell, pos_cell, meaning_cell = cells[:4]

        # Prefer the dedicated nikkud span; otherwise use the whole cell text.
        nikkud_span = word_cell.find('span', class_='menukad')
        word_source = nikkud_span if nikkud_span else word_cell
        word = word_source.get_text(strip=True)
        if not word:
            continue

        # The audio URL lives on whichever element in the word cell carries
        # a data-audio attribute.
        audio_holder = word_cell.find(attrs={'data-audio': True})
        root_text = root_cell.get_text(strip=True)  # may be a link or plain text

        entries.append({
            'Word': word,
            'Root': root_text or '-',
            'Part of Speech': pos_cell.get_text(strip=True),
            'Meaning': meaning_cell.get_text(strip=True),
            'audio_url': audio_holder['data-audio'] if audio_holder else '',
        })
    return entries
|
||||
|
||||
|
||||
def _fetch_dict_page(url: str, cookies: dict, attempts: int = 3) -> bytes:
    """Download one dictionary page, retrying transient request failures.

    Raises:
        requests.RequestException: if every attempt fails.
    """
    attempts = max(1, attempts)
    for attempt in range(1, attempts + 1):
        try:
            response = session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            return response.content
        except requests.RequestException as e:
            if attempt == attempts:
                raise
            logger.warning(f"Request failed for {url} (attempt {attempt}/{attempts}): {e}. Retrying...")
            time.sleep(REQUEST_DELAY * 2)
    raise AssertionError("unreachable")  # loop always returns or raises


def _parse_plain_words(html_bytes: bytes) -> list[str]:
    """Return the word-column text of every entry row on a no-nikkud page."""
    soup = BeautifulSoup(html_bytes, 'html.parser')
    words = []
    for tr in soup.select('table tr'):
        tds = tr.find_all('td')
        if len(tds) < 4:
            continue
        menukad = tds[0].find('span', class_='menukad')
        words.append(menukad.get_text(strip=True) if menukad else tds[0].get_text(strip=True))
    return words


def extract_from_website(max_pages: Optional[int] = None) -> pd.DataFrame:
    """
    Extract dictionary entries from pealim.com.
    Captures audio URLs from each word entry's data-audio attribute.

    Each page is requested twice: once with nikkud (full entry data plus
    audio URL) and once without nikkud (word column only). The two word
    lists are merged by row position.

    Args:
        max_pages: Maximum pages to scrape (None = all). Pages 1..max_pages
            inclusive are fetched.

    Returns:
        DataFrame with Word, Root, Part of Speech, Meaning, Word Without Nikkud, audio_url columns.
        Pages that still fail after retries are logged and skipped, so the
        frame may be incomplete (or empty, with no columns) on a bad run.
    """
    total_pages = max_pages or get_total_pages()
    logger.info(f"Starting extraction from {total_pages} pages...")

    all_rows: list[dict] = []

    # Pages are 1-based and the last page is included (the previous
    # range(1, total_pages) silently dropped it).
    for page_num in range(1, total_pages + 1):
        url = f"{PEALIM_DICT_URL}?page={page_num}"
        try:
            # First request: with nikkud; parsed for entry data + audio URL
            page_rows = _parse_page_with_audio(
                _fetch_dict_page(url, {'translit': 'none', 'hebstyle': 'mo'})
            )
            # Second request: without nikkud; only the word column is used
            no_nik_words = _parse_plain_words(
                _fetch_dict_page(url, {'translit': 'none', 'hebstyle': 'vl', 'showmeaning': 'off'})
            )
        except requests.RequestException as e:
            logger.error(f"Error fetching page {page_num}: {e}. Skipping page after retries.")
            time.sleep(REQUEST_DELAY * 2)
            continue
        except Exception as e:
            # Best effort: one malformed page must not abort the whole scrape.
            logger.error(f"Unexpected error on page {page_num}: {e}")
            continue

        if len(no_nik_words) != len(page_rows):
            # The merge below is positional, so a count mismatch means some
            # rows may receive the wrong (or an empty) no-nikkud spelling.
            logger.warning(
                f"Page {page_num}: {len(page_rows)} entries but "
                f"{len(no_nik_words)} no-nikkud words; positional merge may be misaligned."
            )

        # Merge no-nikkud words into rows
        for i, row in enumerate(page_rows):
            row['Word Without Nikkud'] = no_nik_words[i] if i < len(no_nik_words) else ''

        all_rows.extend(page_rows)

        if page_num % 50 == 0:
            logger.info(f"Processed {page_num}/{total_pages} pages ({len(all_rows)} words so far)...")

        time.sleep(REQUEST_DELAY)

    df = pd.DataFrame(all_rows)
    # An empty scrape yields a frame with no columns at all, hence the guard.
    audio_count = (df['audio_url'] != '').sum() if 'audio_url' in df.columns else 0
    logger.info(f"Extraction complete. Total words: {len(df)}, with audio URL: {audio_count}")
    return df
|
||||
|
||||
|
||||
def modify_for_anki(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform dictionary DataFrame for Anki import.
    Adds shared root words and Hebrew tags. Preserves audio_url column.

    Two columns are added to ``df`` IN PLACE (the same object is returned):

    * ``shared roots``: space-joined words from other rows that have the
      same root, in DataFrame order; empty for rows whose root is '-' or
      missing. Rows with an identical Word are excluded.
    * ``tags``: space-joined Anki tags: a root tag (root with spaces,
      dashes and dots removed) followed by at most one part-of-speech tag.

    Args:
        df: Frame with at least Root, Word and Part of Speech columns. A
            completely empty frame is accepted and just gains empty columns.

    Returns:
        The same DataFrame object, with the two columns added.
    """
    logger.info("Preparing data for Anki...")

    # English PoS keyword -> Hebrew tag. The first keyword (in this order)
    # found inside the PoS text wins.
    pos_tags = {
        'Adverb': 'תוארי_הפועל',
        'Pronoun': 'כינויי_גוף',
        'Noun': 'שם_עצם',
        'Verb': 'פעלים',
        'Adjective': 'שם_תואר',
        'Preposition': 'מילות_יחס',
        'Conjunction': 'מילות_חיבור',
        'Particle': 'מילית'
    }

    if df.empty:
        # A failed scrape can produce a frame without any columns.
        roots, words, parts = [], [], []
    else:
        roots = list(df['Root'])
        words = list(df['Word'])
        parts = list(df['Part of Speech'])

    def has_root(root) -> bool:
        return root != '-' and pd.notna(root)

    # Index words by root once, instead of filtering the whole frame per row.
    words_by_root: dict = {}
    for root, word in zip(roots, words):
        if has_root(root):
            words_by_root.setdefault(root, []).append(word)

    # Find shared root words
    shared_root_words = []
    for root, word in zip(roots, words):
        if has_root(root):
            siblings = (str(w) for w in words_by_root[root] if w != word)
            shared_root_words.append(' '.join(siblings))
        else:
            shared_root_words.append('')

    df['shared roots'] = shared_root_words

    # Generate Hebrew tags
    tags = []
    for root, pos in zip(roots, parts):
        tag_parts = []

        root_key = str(root).replace(' ', '').replace('-', '')
        # str(NaN) == 'nan': a missing root must not produce a tag.
        if root_key and 'nan' not in root_key:
            tag_parts.append(f"שורש::{root_key.replace('.', '')}")

        pos_text = str(pos)
        pos_tag = next((tag for key, tag in pos_tags.items() if key in pos_text), None)
        if pos_tag is not None:
            tag_parts.append(pos_tag)

        tags.append(' '.join(tag_parts))

    df['tags'] = tags
    logger.info("Anki preparation complete.")
    return df
|
||||
|
||||
|
||||
def main():
    """Scrape the dictionary and write the raw and Anki-ready CSV files.

    Both files are written relative to the current working directory. Any
    failure is logged and then re-raised.
    """
    try:
        # Stage 1: raw scrape, comma-separated.
        scraped = extract_from_website()
        scraped.to_csv('hebrew_dict.csv', index=True)
        logger.info("Saved: hebrew_dict.csv")

        # Stage 2: enriched with shared roots and tags, semicolon-separated.
        enriched = modify_for_anki(scraped)
        enriched.to_csv('hebrew_dict_for_anki.csv', sep=';', index=True)
        logger.info("Saved: hebrew_dict_for_anki.csv")

        logger.info("Complete!")
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        raise


if __name__ == '__main__':
    main()
|
||||
|
|
@ -214,7 +214,11 @@ def run(limit: int | None = None, dry_run: bool = False, single_word: str | None
|
|||
"""
|
||||
import pandas as pd
|
||||
|
||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict.csv"
|
||||
if not dict_csv.exists():
|
||||
|
|
|
|||
85
run.py
85
run.py
|
|
@ -55,26 +55,31 @@ def parse_args():
|
|||
|
||||
def step_scrape(args):
|
||||
"""Step 1 — scrape or load dictionary."""
|
||||
dict_csv = DATA_DIR / "pealim_dict.csv"
|
||||
anki_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
||||
anki_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||||
# Legacy fallback names
|
||||
legacy_dict = DATA_DIR / "pealim_dict.csv"
|
||||
legacy_anki = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
|
||||
if args.skip_scrape:
|
||||
if dict_csv.exists():
|
||||
logger.info(f"[1] Using existing {dict_csv}")
|
||||
elif legacy_dict.exists():
|
||||
logger.info(f"[1] Using legacy {legacy_dict} (consider renaming)")
|
||||
else:
|
||||
logger.error(f"[1] --skip-scrape set but {dict_csv} not found. Aborting.")
|
||||
sys.exit(1)
|
||||
return
|
||||
|
||||
logger.info("[1] Scraping dictionary from pealim.com …")
|
||||
import pealim_extract
|
||||
import hebrew_extract
|
||||
import pandas as pd
|
||||
|
||||
df = pealim_extract.extract_from_website()
|
||||
df = hebrew_extract.extract_from_website()
|
||||
df.to_csv(dict_csv, index=True)
|
||||
logger.info(f" Saved {len(df)} words → {dict_csv}")
|
||||
|
||||
df = pealim_extract.modify_for_anki(df)
|
||||
df = hebrew_extract.modify_for_anki(df)
|
||||
df.to_csv(anki_csv, sep=";", index=True)
|
||||
logger.info(f" Saved Anki CSV → {anki_csv}")
|
||||
|
||||
|
|
@ -101,7 +106,11 @@ def step_examples(args, freq_cache: dict):
|
|||
import benyehuda
|
||||
benyehuda.load(force_rebuild=args.refresh_examples)
|
||||
|
||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict.csv"
|
||||
|
||||
|
|
@ -132,22 +141,18 @@ def step_examples(args, freq_cache: dict):
|
|||
|
||||
|
||||
def step_audio(args):
|
||||
"""Step 4 — download vocabulary audio .mp3 files."""
|
||||
"""Step 4 — download vocabulary audio .mp3 files from audio_url column in CSV."""
|
||||
if args.skip_audio:
|
||||
logger.info("[4] Skipping audio (--skip-audio)")
|
||||
return
|
||||
|
||||
logger.info("[4] Downloading vocabulary audio files …")
|
||||
audio_cache_path = DATA_DIR / "audio_cache.json"
|
||||
audio_url_cache: dict = {}
|
||||
if audio_cache_path.exists():
|
||||
with open(audio_cache_path) as f:
|
||||
audio_url_cache = json.load(f)
|
||||
|
||||
import audio_extract as ae
|
||||
ae._audio_cache = audio_url_cache
|
||||
|
||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict.csv"
|
||||
|
||||
|
|
@ -161,12 +166,17 @@ def step_audio(args):
|
|||
except (ValueError, pd.errors.ParserError):
|
||||
df = pd.read_csv(dict_csv, index_col=0)
|
||||
|
||||
if 'audio_url' not in df.columns:
|
||||
logger.warning(" No audio_url column in CSV — re-scrape with hebrew_extract.py to capture audio URLs")
|
||||
return
|
||||
|
||||
if args.test:
|
||||
df = df.head(args.test)
|
||||
|
||||
AUDIO_DIR.mkdir(parents=True, exist_ok=True)
|
||||
downloaded = 0
|
||||
skipped = 0
|
||||
no_url = 0
|
||||
|
||||
def strip_nik(t: str) -> str:
|
||||
return "".join(c for c in unicodedata.normalize("NFD", t)
|
||||
|
|
@ -175,6 +185,8 @@ def step_audio(args):
|
|||
for _, row in df.iterrows():
|
||||
word = str(row.get("Word", "")).strip()
|
||||
word_plain = str(row.get("Word Without Nikkud", "")).strip()
|
||||
audio_url = str(row.get("audio_url", "")).strip()
|
||||
|
||||
if not word:
|
||||
continue
|
||||
|
||||
|
|
@ -187,19 +199,20 @@ def step_audio(args):
|
|||
skipped += 1
|
||||
continue
|
||||
|
||||
audio_url = ae.extract_audio_url(word)
|
||||
if audio_url:
|
||||
try:
|
||||
resp = requests.get(audio_url, timeout=10)
|
||||
resp.raise_for_status()
|
||||
mp3_path.write_bytes(resp.content)
|
||||
downloaded += 1
|
||||
time.sleep(0.3)
|
||||
except Exception as e:
|
||||
logger.debug(f" Audio download failed for {word}: {e}")
|
||||
if not audio_url or audio_url in ("nan", "None", ""):
|
||||
no_url += 1
|
||||
continue
|
||||
|
||||
ae.save_audio_cache(str(audio_cache_path))
|
||||
logger.info(f" Audio: {downloaded} downloaded, {skipped} already cached")
|
||||
try:
|
||||
resp = requests.get(audio_url, timeout=10)
|
||||
resp.raise_for_status()
|
||||
mp3_path.write_bytes(resp.content)
|
||||
downloaded += 1
|
||||
time.sleep(0.3)
|
||||
except Exception as e:
|
||||
logger.debug(f" Audio download failed for {word}: {e}")
|
||||
|
||||
logger.info(f" Audio: {downloaded} downloaded, {skipped} already cached, {no_url} without URL")
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f" Audio step failed: {e}")
|
||||
|
|
@ -350,7 +363,11 @@ def step_build_vocab(args, examples_cache: dict, freq_cache: dict, image_cache:
|
|||
logger.info("[5] Building vocabulary deck …")
|
||||
import apkg_builder
|
||||
|
||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict.csv"
|
||||
|
||||
|
|
@ -398,7 +415,11 @@ def print_summary(args, examples_cache, freq_cache, conjugations):
|
|||
logger.info("SUMMARY")
|
||||
logger.info("=" * 60)
|
||||
|
||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict.csv"
|
||||
if dict_csv.exists():
|
||||
|
|
@ -432,8 +453,8 @@ def print_summary(args, examples_cache, freq_cache, conjugations):
|
|||
found_imgs = sum(1 for v in ic.values() if v)
|
||||
logger.info(f" Images: {found_imgs}/{len(ic)} nouns with images")
|
||||
|
||||
vocab_apkg = OUTPUT_DIR / "pealim_vocabulary.apkg"
|
||||
conj_apkg = OUTPUT_DIR / "pealim_conjugations.apkg"
|
||||
vocab_apkg = OUTPUT_DIR / "hebrew_vocabulary.apkg"
|
||||
conj_apkg = OUTPUT_DIR / "hebrew_conjugations.apkg"
|
||||
if vocab_apkg.exists():
|
||||
size_mb = vocab_apkg.stat().st_size / 1e6
|
||||
logger.info(f" Vocabulary .apkg: {size_mb:.1f} MB → {vocab_apkg}")
|
||||
|
|
|
|||
|
|
@ -20,8 +20,8 @@ import tempfile
|
|||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
VOCAB_APKG = Path("output/pealim_vocabulary.apkg")
|
||||
CONJ_APKG = Path("output/pealim_conjugations.apkg")
|
||||
VOCAB_APKG = Path("output/hebrew_vocabulary.apkg")
|
||||
CONJ_APKG = Path("output/hebrew_conjugations.apkg")
|
||||
|
||||
PASS = "\033[32m✓\033[0m"
|
||||
FAIL = "\033[31m✗\033[0m"
|
||||
|
|
|
|||
Loading…
Reference in a new issue