Sprint 7: emoji/prep extraction, conjugation reduction, project rename

- Item 1/2: Extract emoji and Hebrew parentheticals (prepositions) from
  Meaning field; display emoji with 3.5em font, prep inline after Hebrew
  word. Add Emoji and Prep fields to Hebrew Flash Cards model.
- Item 3: Seeded RNG per verb reduces conjugation cards by ~630 (4 present
  forms → 1 pronoun each; past_3p → 1 gender). 1st-person forms gain gender
  label (זכר/נקבה). Total: 1,834 conj cards (was ~2,464).
- Item 4: hebrew_extract.py uses BeautifulSoup to capture data-audio URLs
  from pealim.com list pages during scraping. step_audio() reads audio_url
  column from CSV (no longer needs audio_extract.py).
- Item 5: Rename to 'Hebrew Flash Cards'. New filenames: hebrew_dict.csv,
  hebrew_extract.py, hebrew_vocabulary.apkg, hebrew_conjugations.apkg.
  Deck/model names updated throughout. Forgejo repo rename pending (sochen
  lacks admin rights — Nevo must do via UI).
- Fix: Deduplicate entries with same Hebrew word before adding notes
  (eliminates GUID collisions from duplicate source CSV rows).
- Bump RELEASE_TAG to v0.11.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Sochen 2026-03-05 05:49:51 +00:00
parent f8e4873349
commit 64a1b18951
9 changed files with 21580 additions and 60 deletions

View file

@ -1,4 +1,4 @@
# Pealim — Hebrew Vocabulary & Verb Flashcards for Anki
# Hebrew Flash Cards — Hebrew Vocabulary & Verb Flashcards for Anki
![Flashcard screenshot](flashcard.png)
@ -21,7 +21,7 @@ All card data comes from open or academic sources:
## Just give me the flashcards
1. Download the `.apkg` files from [Releases](https://git.nevo.engineer/nevo/pealim/releases)
1. Download the `.apkg` files from [Releases](https://git.nevo.engineer/nevo/hebrew_flash_cards/releases)
2. Double-click to import into [Anki](https://apps.ankiweb.net/) (free, cross-platform)
3. Start studying
@ -136,7 +136,7 @@ python run.py --skip-scrape --refresh-examples
```
python run.py [options]
--skip-scrape Use cached data/pealim_dict.csv (no pealim.com scraping)
--skip-scrape Use cached data/hebrew_dict.csv (no pealim.com scraping)
--skip-audio Skip audio .mp3 downloads
--skip-examples Skip Ben Yehuda example fetching
--only {vocab,conjugations} Run only one deck (skips all unrelated steps)
@ -150,20 +150,20 @@ python run.py [options]
| File | Description |
|------|-------------|
| `data/pealim_dict.csv` | Raw dictionary |
| `data/pealim_dict_for_anki.csv` | Enriched Anki CSV |
| `data/hebrew_dict.csv` | Raw dictionary |
| `data/hebrew_dict_for_anki.csv` | Enriched Anki CSV |
| `data/conjugations.json` | Verb conjugation data |
| `data/audio/` | Vocabulary audio (.mp3) |
| `data/audio_conj/` | Conjugation audio (.mp3) |
| `data/fonts/` | Heebo font files (bundled in .apkg) |
| `data/images/` | Noun images from Wikipedia/Commons |
| `data/image_cache.json` | Image fetch cache |
| `output/pealim_vocabulary.apkg` | Vocabulary Anki deck |
| `output/pealim_conjugations.apkg` | Conjugation Anki deck |
| `output/hebrew_vocabulary.apkg` | Vocabulary Anki deck |
| `output/hebrew_conjugations.apkg` | Conjugation Anki deck |
### Pipeline overview
1. `pealim_extract.py` — scrapes pealim.com dictionary
1. `hebrew_extract.py` — scrapes pealim.com dictionary
2. `frequency_lookup.py` — downloads/loads Hebrew frequency data
3. `benyehuda.py` — builds sentence index from Ben-Yehuda corpus
4. `extract_verb_list.py` — extracts verb list from Coffin & Bolozky PDF

View file

@ -9,6 +9,7 @@ in Anki rather than creating a duplicate.
import json
import logging
import random
import re
import unicodedata
from pathlib import Path
@ -27,15 +28,19 @@ CONJ_MODEL_ID = 1_234_567_893
# Release version tag added to all notes so users can identify which release
# their cards come from (visible in Anki's Browse view and card info).
RELEASE_TAG = "v0.10"
RELEASE_TAG = "v0.11"
# Regex for extracting emoji and Hebrew prepositions from meaning strings
EMOJI_RE = re.compile(r'[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+')
HBPAREN_RE = re.compile(r'\(([\u05b0-\u05ea\u05f0-\u05f4]+)\)')
DATA_DIR = Path(__file__).parent / "data"
AUDIO_DIR = DATA_DIR / "audio"
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
OUTPUT_DIR = Path(__file__).parent / "output"
VOCAB_APKG = OUTPUT_DIR / "pealim_vocabulary.apkg"
CONJ_APKG = OUTPUT_DIR / "pealim_conjugations.apkg"
VOCAB_APKG = OUTPUT_DIR / "hebrew_vocabulary.apkg"
CONJ_APKG = OUTPUT_DIR / "hebrew_conjugations.apkg"
# ──────────────────────────────────────────────────────────────────────────────
# Binyan → Hebrew label mapping (for conjugation card display)
@ -163,6 +168,11 @@ CARD_CSS = """
margin: 2px 0;
font-size: 15px;
}
.emoji-img {
font-size: 3.5em;
text-align: center;
margin: 0.3em 0;
}
@media (prefers-color-scheme: dark) {
.card { color: #e8e8e8; background: #1c1c1e; }
.hebrew { color: #f0f0f0; }
@ -182,7 +192,7 @@ CARD_CSS = """
# ──────────────────────────────────────────────────────────────────────────────
VOCAB_FRONT_HEB = """
<div class="hebrew">{{Word}}</div>
<div class="hebrew">{{Word}}{{#Prep}} <span class="hebrew-sm">{{Prep}}</span>{{/Prep}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
"""
@ -197,6 +207,7 @@ VOCAB_BACK_HEB = """
<div class="root-info">{{SharedRoots}}</div>
{{/SharedRoots}}
{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;"></div>{{/Image}}
{{#Emoji}}<div class="emoji-img">{{Emoji}}</div>{{/Emoji}}
{{#Example}}
<div class="example">{{Example}}</div>
{{/Example}}
@ -210,12 +221,13 @@ VOCAB_FRONT_ENG = """
VOCAB_BACK_ENG = """
{{FrontSide}}
<div class="divider"></div>
<div class="hebrew">{{Word}}</div>
<div class="hebrew">{{Word}}{{#Prep}} <span class="hebrew-sm">{{Prep}}</span>{{/Prep}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
{{#WordNoNikkud}}<div class="sec-label">ללא ניקוד: {{WordNoNikkud}}</div>{{/WordNoNikkud}}
{{#Root}}<div class="sec-label">שורש: {{Root}}</div>{{/Root}}
{{#PoS}}<div class="sec-label">חלק דיבור: {{PoS}}</div>{{/PoS}}
{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;"></div>{{/Image}}
{{#Emoji}}<div class="emoji-img">{{Emoji}}</div>{{/Emoji}}
{{#Example}}
<div class="example">{{Example}}</div>
{{/Example}}
@ -223,7 +235,7 @@ VOCAB_BACK_ENG = """
VOCAB_MODEL = genanki.Model(
VOCAB_MODEL_ID,
"Pealim Hebrew",
"Hebrew Flash Cards",
fields=[
{"name": "Word"},
{"name": "Root"},
@ -236,6 +248,8 @@ VOCAB_MODEL = genanki.Model(
{"name": "Example"},
{"name": "Frequency"},
{"name": "Image"},
{"name": "Emoji"},
{"name": "Prep"},
],
templates=[
{
@ -444,8 +458,9 @@ def build_vocab_deck(
df["_freq_rank"] = df.apply(freq_sort_key, axis=1)
df = df.sort_values("_freq_rank")
deck = genanki.Deck(VOCAB_DECK_ID, "Pealim Hebrew Vocabulary")
deck = genanki.Deck(VOCAB_DECK_ID, "Hebrew Vocabulary")
media_files: list[Path] = []
seen_words: set[str] = set()
for _, row in df.iterrows():
word = str(row.get("Word", "")).strip()
@ -468,6 +483,21 @@ def build_vocab_deck(
if not word or not meaning:
continue
# Skip exact duplicates (same Hebrew word with nikkud)
if word in seen_words:
logger.debug(f" Skipping duplicate word: {word}")
continue
seen_words.add(word)
# Extract emoji from meaning
emoji_str = ''.join(EMOJI_RE.findall(meaning))
meaning_clean = EMOJI_RE.sub('', meaning).strip()
# Extract Hebrew parentheticals (prepositions) from meaning
preps = HBPAREN_RE.findall(meaning_clean)
prep_str = ' '.join(f'({p})' for p in preps)
meaning_clean = HBPAREN_RE.sub('', meaning_clean).strip().strip(',').strip()
# Translate PoS to Hebrew
pos_heb = _translate_pos(pos_raw) if pos_raw else ""
@ -523,7 +553,7 @@ def build_vocab_deck(
word,
root,
pos_heb,
meaning,
meaning_clean,
word_no_nik,
related_html or shared_roots,
tags_str,
@ -531,11 +561,21 @@ def build_vocab_deck(
example_html,
freq_display,
image_tag,
emoji_str,
prep_str,
],
tags=(tags_str.split() if tags_str else []) + [RELEASE_TAG],
)
deck.add_note(note)
# Diagnostic: count words with emoji/prep extracted
emoji_count = sum(1 for n in deck.notes if n.fields[11])
prep_count = sum(1 for n in deck.notes if n.fields[12])
if emoji_count:
logger.info(f" Emoji extracted: {emoji_count} words")
if prep_count:
logger.info(f" Hebrew prepositions extracted: {prep_count} words")
# Diagnostic: count words without PoS coverage in shared_roots
other_count = 0
for _, row in df.iterrows():
@ -557,7 +597,7 @@ def build_conj_deck(
audio_dir: Path = AUDIO_CONJ_DIR,
) -> tuple[genanki.Deck, list[Path]]:
"""Build the conjugation drill deck from conjugations.json data."""
deck = genanki.Deck(CONJ_DECK_ID, "Pealim Hebrew Conjugations")
deck = genanki.Deck(CONJ_DECK_ID, "Hebrew Conjugations")
media_files: list[Path] = []
note_count = 0
@ -601,6 +641,9 @@ def build_conj_deck(
alternate_forms = data.get("alternate_forms", {})
# Seeded RNG per verb — deterministic pronoun/gender choices.
# Seed with the infinitive string itself rather than hash(infinitive):
# str hashes are salted per interpreter process (PYTHONHASHSEED), so a
# hash()-derived seed would pick different pronouns/genders on every run.
# random.Random derives the seed from the string's contents, which is stable.
verb_rng = random.Random(infinitive)
for form_key, form_data in forms.items():
primary_form = form_data.get("form", "")
alt_form = alternate_forms.get(form_key, "")
@ -620,16 +663,16 @@ def build_conj_deck(
if mp3_path not in media_files:
media_files.append(mp3_path)
# Present tense expansion: 4 form keys → 3 cards each = 12 cards
# Present tense expansion: 4 form keys → 1 card each (seeded RNG)
if form_key in PRESENT_EXPANSION:
for pronoun, tense_label in PRESENT_EXPANSION[form_key]:
add_note(pronoun, tense_label, conj_form, audio_tag)
chosen = verb_rng.choice(PRESENT_EXPANSION[form_key])
add_note(chosen[0], chosen[1], conj_form, audio_tag)
continue
# Past 3rd plural: same form for m/f → two separate pronoun cards
# Past 3rd plural: same form for m/f → 1 card (seeded RNG)
if form_key == "past_3p":
for pronoun, tense_label in PAST_3P_EXPANSION:
add_note(pronoun, tense_label, conj_form, audio_tag)
chosen = verb_rng.choice(PAST_3P_EXPANSION)
add_note(chosen[0], chosen[1], conj_form, audio_tag)
continue
# 2fp/3fp future and imperative: show modern (mp) form + classical (fp) in parens
@ -649,6 +692,12 @@ def build_conj_deck(
# Standard card
pronoun = form_data.get("pronoun", "")
tense = form_data.get("tense", "")
# 1st-person forms get a randomly assigned gender label (deterministic per verb)
if form_key in {"past_1s", "past_1p", "future_1s", "future_1p"}:
gender = verb_rng.choice(["זכר", "נקבה"])
pronoun = f"{pronoun} ({gender})"
add_note(pronoun, tense, conj_form, audio_tag)
@ -707,7 +756,11 @@ def write_conj_apkg(
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
csv_path = DATA_DIR / "pealim_dict_for_anki.csv"
csv_path = DATA_DIR / "hebrew_dict_for_anki.csv"
if not csv_path.exists():
csv_path = DATA_DIR / "hebrew_dict.csv"
if not csv_path.exists():
csv_path = DATA_DIR / "pealim_dict_for_anki.csv"
if not csv_path.exists():
csv_path = DATA_DIR / "pealim_dict.csv"

View file

@ -33,7 +33,13 @@ REQUEST_DELAY = 1.5
REQUEST_TIMEOUT = 15
VERBS_INPUT = Path(__file__).parent / "verbs_input.txt"
CONJUGATIONS_PATH = Path(__file__).parent / "data" / "conjugations.json"
DICT_CSV = Path(__file__).parent / "data" / "pealim_dict_for_anki.csv"
DICT_CSV = next(
(p for p in [
Path(__file__).parent / "data" / "hebrew_dict_for_anki.csv",
Path(__file__).parent / "data" / "pealim_dict_for_anki.csv",
] if p.exists()),
Path(__file__).parent / "data" / "hebrew_dict_for_anki.csv",
)
# Pronoun labels (for card front display)
PRONOUN_LABELS = {

9106
data/hebrew_dict.csv Normal file

File diff suppressed because it is too large Load diff

12111
data/hebrew_dict_for_anki.csv Normal file

File diff suppressed because it is too large Load diff

219
hebrew_extract.py Normal file
View file

@ -0,0 +1,219 @@
#!/usr/bin/env python3
"""
Extract Hebrew vocabulary from pealim.com dictionary.
Scrapes word entries, roots, parts of speech, and audio URLs for Anki flashcards.
"""
import requests
import pandas as pd
from bs4 import BeautifulSoup
import logging
import time
from typing import Optional
# Configure logging.
# NOTE: this runs at import time, so importing this module sets up the root
# logger (logging.basicConfig does nothing if the root logger already has
# handlers configured).
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
# Module-level logger used by the functions in this file.
logger = logging.getLogger(__name__)
# Session for connection pooling: a single requests.Session is shared by all
# requests in this file, and the custom User-Agent set here is sent with each.
session = requests.Session()
session.headers.update({
'User-Agent': 'Mozilla/5.0 (compatible; pealim-scraper/1.0)'
})
# Base URL of the paginated dictionary list; pages are requested as ?page=N.
PEALIM_DICT_URL = "https://www.pealim.com/dict/"
# Pause after each scraped page; doubled after a failed request.
REQUEST_DELAY = 1.5 # seconds between requests (respectful scraping)
# Passed as timeout= to every session.get call.
REQUEST_TIMEOUT = 10 # seconds
def get_total_pages() -> int:
    """
    Return the number of dictionary list pages to scrape.

    The count is currently hard-coded; it is NOT parsed from the site.
    The request to the dictionary index only serves as a reachability check:
    if it fails, the error is logged and the same default is returned, so no
    Exception propagates to the caller.

    Returns:
        The hard-coded page count (608).
    """
    # Hardcoded — pealim.com has ~608 pages at ~15 words/page
    default_pages = 608
    try:
        logger.info("Fetching total page count...")
        cookies = {'translit': 'none', 'hebstyle': 'mo'}
        # The response body is not inspected; only the HTTP status matters.
        session.get(
            PEALIM_DICT_URL, cookies=cookies, timeout=REQUEST_TIMEOUT
        ).raise_for_status()
    except Exception as e:
        logger.error(f"Error fetching page count: {e}. Using default ({default_pages}).")
    return default_pages
def _parse_page_with_audio(html_bytes: bytes) -> list[dict]:
    """
    Turn one dictionary list page into a list of entry dicts.

    Every table row with at least four cells is read as
    (word, root, part of speech, meaning). Rows whose word cell yields no
    text are dropped.

    Args:
        html_bytes: Raw HTML of a dictionary list page.

    Returns:
        One dict per entry with keys: Word, Root, Part of Speech, Meaning,
        audio_url. Root is '-' when the root cell is empty; audio_url is ''
        when the word cell has no element carrying a data-audio attribute.
    """
    entries: list[dict] = []
    document = BeautifulSoup(html_bytes, 'html.parser')

    for table_row in document.select('table tr'):
        cells = table_row.find_all('td')
        if len(cells) < 4:
            # Header rows / layout rows do not carry a full entry.
            continue
        word_cell, root_cell, pos_cell, meaning_cell = cells[:4]

        # Audio URL lives on an element with a data-audio attribute in the word cell.
        audio_holder = word_cell.find(attrs={'data-audio': True})
        if audio_holder:
            audio_url = audio_holder['data-audio']
        else:
            audio_url = ''

        # Prefer the dedicated 'menukad' span (word with nikkud); otherwise
        # fall back to the whole cell's text.
        vocalized = word_cell.find('span', class_='menukad')
        word_source = vocalized if vocalized else word_cell
        word = word_source.get_text(strip=True)
        if not word:
            continue

        root_text = root_cell.get_text(strip=True)
        entries.append({
            'Word': word,
            'Root': root_text if root_text else '-',
            'Part of Speech': pos_cell.get_text(strip=True),
            'Meaning': meaning_cell.get_text(strip=True),
            'audio_url': audio_url,
        })

    return entries
def extract_from_website(max_pages: Optional[int] = None) -> pd.DataFrame:
    """
    Extract dictionary entries from pealim.com.

    Each list page is fetched twice: once with nikkud (parsed by
    _parse_page_with_audio, which also captures the data-audio URL) and once
    with the 'vl' style cookie to read the word column without nikkud. The
    two passes are merged by row position. A page that fails is logged and
    skipped; it is not retried.

    Args:
        max_pages: Maximum pages to scrape (None = all)

    Returns:
        DataFrame with Word, Root, Part of Speech, Meaning, audio_url and
        Word Without Nikkud columns (empty DataFrame if nothing was scraped).
    """
    total_pages = max_pages or get_total_pages()
    logger.info(f"Starting extraction from {total_pages} pages...")
    all_rows: list[dict] = []

    # Pages are requested as ?page=1 .. ?page=total_pages, so the upper bound
    # must be inclusive; range(1, total_pages) never fetched the last page
    # (and fetched nothing at all for max_pages=1).
    for page_num in range(1, total_pages + 1):
        try:
            url = f"{PEALIM_DICT_URL}?page={page_num}"

            # First request: with nikkud — parse with BeautifulSoup for audio URL
            cookies = {'translit': 'none', 'hebstyle': 'mo'}
            response = session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            page_rows = _parse_page_with_audio(response.content)

            # Second request: without nikkud — just get the word column
            cookies_vl = {'translit': 'none', 'hebstyle': 'vl', 'showmeaning': 'off'}
            resp_vl = session.get(url, cookies=cookies_vl, timeout=REQUEST_TIMEOUT)
            resp_vl.raise_for_status()
            soup_vl = BeautifulSoup(resp_vl.content, 'html.parser')
            no_nik_words = []
            for tr in soup_vl.select('table tr'):
                tds = tr.find_all('td')
                if len(tds) < 4:
                    continue
                menukad = tds[0].find('span', class_='menukad')
                w = menukad.get_text(strip=True) if menukad else tds[0].get_text(strip=True)
                no_nik_words.append(w)

            # Merge no-nikkud words into rows by position.
            # NOTE(review): this assumes both passes yield the same rows in the
            # same order; the first pass drops rows with an empty word while
            # this pass does not — confirm that cannot misalign entries.
            for i, row in enumerate(page_rows):
                row['Word Without Nikkud'] = no_nik_words[i] if i < len(no_nik_words) else ''

            all_rows.extend(page_rows)

            if page_num % 50 == 0:
                logger.info(f"Processed {page_num}/{total_pages} pages ({len(all_rows)} words so far)...")

            time.sleep(REQUEST_DELAY)
        except requests.RequestException as e:
            # The page is not retried, so say so; back off before the next page.
            logger.error(f"Error fetching page {page_num}: {e}. Skipping page.")
            time.sleep(REQUEST_DELAY * 2)
        except Exception as e:
            # Best-effort scrape: a parsing problem on one page must not abort the run.
            logger.error(f"Unexpected error on page {page_num}: {e}")

    df = pd.DataFrame(all_rows)
    # An empty scrape produces a DataFrame without columns, hence the guard.
    audio_count = (df['audio_url'] != '').sum() if 'audio_url' in df.columns else 0
    logger.info(f"Extraction complete. Total words: {len(df)}, with audio URL: {audio_count}")
    return df
def modify_for_anki(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform dictionary DataFrame for Anki import.

    Adds two columns and leaves every existing column (including audio_url)
    untouched:

    * 'shared roots' — space-separated words of the other rows that have the
      same Root (empty when Root is '-' or missing).
    * 'tags' — space-separated Hebrew tags: a root tag (when the row has a
      root) followed by at most one part-of-speech tag.

    Args:
        df: DataFrame with at least 'Word', 'Root' and 'Part of Speech'
            columns. It is modified in place.

    Returns:
        The same DataFrame object, with the two columns added.
    """
    logger.info("Preparing data for Anki...")

    # English part-of-speech keyword -> Hebrew tag. The first keyword (in this
    # order) found inside the row's 'Part of Speech' text wins.
    # NOTE(review): the 'Pronoun' tag looks like a typo (perhaps meant
    # 'כינויי_גוף') — confirm before changing, existing decks may carry it.
    pos_tags = {
        'Adverb': 'תוארי_הפועל',
        'Pronoun': 'כינוייוף',
        'Noun': 'שם_עצם',
        'Verb': 'פעלים',
        'Adjective': 'שם_תואר',
        'Preposition': 'מילות_יחס',
        'Conjunction': 'מילות_חיבור',
        'Particle': 'מילית'
    }

    # Pass 1: bucket words by root once, instead of filtering the whole
    # DataFrame for every row (which was quadratic in the number of rows).
    words_by_root: dict = {}
    for _, row in df.iterrows():
        root = row['Root']
        if root != '-' and pd.notna(root):
            words_by_root.setdefault(root, []).append(row['Word'])

    # Pass 2: build both new columns row by row.
    shared_root_words = []
    tags = []
    for _, row in df.iterrows():
        root = row['Root']
        word = row['Word']

        # Find shared root words (same root, different word), in row order
        if root != '-' and pd.notna(root):
            shared = ' '.join(str(w) for w in words_by_root[root] if w != word)
        else:
            shared = ''
        shared_root_words.append(shared)

        # Generate Hebrew tags
        tag_parts = []
        root_text = str(root).replace(' ', '').replace('-', '')
        # A missing root stringifies to 'nan'; skip it along with empty roots.
        if 'nan' not in root_text and root_text:
            root_clean = root_text.replace('.', '')
            tag_parts.append(f"שורש::{root_clean}")

        pos = str(row['Part of Speech'])
        for key, value in pos_tags.items():
            if key in pos:
                tag_parts.append(value)
                break
        tags.append(' '.join(tag_parts))

    df['shared roots'] = shared_root_words
    df['tags'] = tags
    logger.info("Anki preparation complete.")
    return df
def main():
    """
    Script entry point: scrape the dictionary and write both CSV files.

    Output paths are relative, so the files land in the current working
    directory. Any failure is logged and then re-raised.
    """
    try:
        # Stage 1: raw scrape -> comma-separated CSV.
        scraped = extract_from_website()
        scraped.to_csv('hebrew_dict.csv', index=True)
        logger.info("Saved: hebrew_dict.csv")

        # Stage 2: enrich for Anki -> semicolon-separated CSV.
        anki_ready = modify_for_anki(scraped)
        anki_ready.to_csv('hebrew_dict_for_anki.csv', sep=';', index=True)
        logger.info("Saved: hebrew_dict_for_anki.csv")

        logger.info("Complete!")
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        raise


if __name__ == '__main__':
    main()

View file

@ -214,7 +214,11 @@ def run(limit: int | None = None, dry_run: bool = False, single_word: str | None
"""
import pandas as pd
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "hebrew_dict.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict.csv"
if not dict_csv.exists():

85
run.py
View file

@ -55,26 +55,31 @@ def parse_args():
def step_scrape(args):
"""Step 1 — scrape or load dictionary."""
dict_csv = DATA_DIR / "pealim_dict.csv"
anki_csv = DATA_DIR / "pealim_dict_for_anki.csv"
dict_csv = DATA_DIR / "hebrew_dict.csv"
anki_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
# Legacy fallback names
legacy_dict = DATA_DIR / "pealim_dict.csv"
legacy_anki = DATA_DIR / "pealim_dict_for_anki.csv"
if args.skip_scrape:
if dict_csv.exists():
logger.info(f"[1] Using existing {dict_csv}")
elif legacy_dict.exists():
logger.info(f"[1] Using legacy {legacy_dict} (consider renaming)")
else:
logger.error(f"[1] --skip-scrape set but {dict_csv} not found. Aborting.")
sys.exit(1)
return
logger.info("[1] Scraping dictionary from pealim.com …")
import pealim_extract
import hebrew_extract
import pandas as pd
df = pealim_extract.extract_from_website()
df = hebrew_extract.extract_from_website()
df.to_csv(dict_csv, index=True)
logger.info(f" Saved {len(df)} words → {dict_csv}")
df = pealim_extract.modify_for_anki(df)
df = hebrew_extract.modify_for_anki(df)
df.to_csv(anki_csv, sep=";", index=True)
logger.info(f" Saved Anki CSV → {anki_csv}")
@ -101,7 +106,11 @@ def step_examples(args, freq_cache: dict):
import benyehuda
benyehuda.load(force_rebuild=args.refresh_examples)
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "hebrew_dict.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict.csv"
@ -132,22 +141,18 @@ def step_examples(args, freq_cache: dict):
def step_audio(args):
"""Step 4 — download vocabulary audio .mp3 files."""
"""Step 4 — download vocabulary audio .mp3 files from audio_url column in CSV."""
if args.skip_audio:
logger.info("[4] Skipping audio (--skip-audio)")
return
logger.info("[4] Downloading vocabulary audio files …")
audio_cache_path = DATA_DIR / "audio_cache.json"
audio_url_cache: dict = {}
if audio_cache_path.exists():
with open(audio_cache_path) as f:
audio_url_cache = json.load(f)
import audio_extract as ae
ae._audio_cache = audio_url_cache
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "hebrew_dict.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict.csv"
@ -161,12 +166,17 @@ def step_audio(args):
except (ValueError, pd.errors.ParserError):
df = pd.read_csv(dict_csv, index_col=0)
if 'audio_url' not in df.columns:
logger.warning(" No audio_url column in CSV — re-scrape with hebrew_extract.py to capture audio URLs")
return
if args.test:
df = df.head(args.test)
AUDIO_DIR.mkdir(parents=True, exist_ok=True)
downloaded = 0
skipped = 0
no_url = 0
def strip_nik(t: str) -> str:
return "".join(c for c in unicodedata.normalize("NFD", t)
@ -175,6 +185,8 @@ def step_audio(args):
for _, row in df.iterrows():
word = str(row.get("Word", "")).strip()
word_plain = str(row.get("Word Without Nikkud", "")).strip()
audio_url = str(row.get("audio_url", "")).strip()
if not word:
continue
@ -187,19 +199,20 @@ def step_audio(args):
skipped += 1
continue
audio_url = ae.extract_audio_url(word)
if audio_url:
try:
resp = requests.get(audio_url, timeout=10)
resp.raise_for_status()
mp3_path.write_bytes(resp.content)
downloaded += 1
time.sleep(0.3)
except Exception as e:
logger.debug(f" Audio download failed for {word}: {e}")
if not audio_url or audio_url in ("nan", "None", ""):
no_url += 1
continue
ae.save_audio_cache(str(audio_cache_path))
logger.info(f" Audio: {downloaded} downloaded, {skipped} already cached")
try:
resp = requests.get(audio_url, timeout=10)
resp.raise_for_status()
mp3_path.write_bytes(resp.content)
downloaded += 1
time.sleep(0.3)
except Exception as e:
logger.debug(f" Audio download failed for {word}: {e}")
logger.info(f" Audio: {downloaded} downloaded, {skipped} already cached, {no_url} without URL")
except Exception as e:
logger.warning(f" Audio step failed: {e}")
@ -350,7 +363,11 @@ def step_build_vocab(args, examples_cache: dict, freq_cache: dict, image_cache:
logger.info("[5] Building vocabulary deck …")
import apkg_builder
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "hebrew_dict.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict.csv"
@ -398,7 +415,11 @@ def print_summary(args, examples_cache, freq_cache, conjugations):
logger.info("SUMMARY")
logger.info("=" * 60)
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "hebrew_dict.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict.csv"
if dict_csv.exists():
@ -432,8 +453,8 @@ def print_summary(args, examples_cache, freq_cache, conjugations):
found_imgs = sum(1 for v in ic.values() if v)
logger.info(f" Images: {found_imgs}/{len(ic)} nouns with images")
vocab_apkg = OUTPUT_DIR / "pealim_vocabulary.apkg"
conj_apkg = OUTPUT_DIR / "pealim_conjugations.apkg"
vocab_apkg = OUTPUT_DIR / "hebrew_vocabulary.apkg"
conj_apkg = OUTPUT_DIR / "hebrew_conjugations.apkg"
if vocab_apkg.exists():
size_mb = vocab_apkg.stat().st_size / 1e6
logger.info(f" Vocabulary .apkg: {size_mb:.1f} MB → {vocab_apkg}")

View file

@ -20,8 +20,8 @@ import tempfile
import zipfile
from pathlib import Path
VOCAB_APKG = Path("output/pealim_vocabulary.apkg")
CONJ_APKG = Path("output/pealim_conjugations.apkg")
VOCAB_APKG = Path("output/hebrew_vocabulary.apkg")
CONJ_APKG = Path("output/hebrew_conjugations.apkg")
PASS = "\033[32m✓\033[0m"
FAIL = "\033[31m✗\033[0m"