feat: add apkg builder, frequency, Ben Yehuda examples, conjugation deck

Implements four major improvements to the Pealim Anki deck pipeline:

1. Automated .apkg generation (genanki) — no more manual Anki Desktop step.
   Both vocabulary and conjugation decks are built programmatically.

2. Word frequency ranking from hermitdave/FrequencyWords he_50k corpus.
   Notes sorted by rank so Anki presents most common words first.

3. Example sentences from Ben Yehuda public domain corpus (not pealim.com).
   Downloads txt_stripped.zip, indexes 25k texts, ~89% coverage on test set.

4. Conjugation drill deck — one card per form × verb.
   Input: verbs_input.txt (Hebrew infinitives). Initial set: 7 verbs (one
   per binyan). Extracts 28 forms each via pealim.com/search/ + table parse.

New files:
  apkg_builder.py     — genanki deck builder for both decks
  benyehuda.py        — Ben Yehuda corpus downloader + sentence indexer
  frequency_lookup.py — FrequencyWords downloader + rank lookup
  verbs_input.txt     — verb input list (7 test verbs, one per binyan)
  data/               — baseline CSVs + generated caches

Updated:
  conjugation_extract.py — rewritten: reads verbs_input.txt, searches
                           /search/?q= for slug, parses table by row labels
  requirements.txt       — add genanki, beautifulsoup4, lxml
  run.py                 — full orchestration pipeline with CLI flags
  .gitignore             — exclude venv/, benyehuda_index.json, audio/, output/

CLI:
  python run.py --skip-scrape --skip-audio --test 20  (quick test)
  python run.py --skip-scrape                          (full build)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Sochen 2026-03-03 01:58:31 +00:00
parent e23b353064
commit b086123bec
13 changed files with 23502 additions and 162 deletions

12
.gitignore vendored
View file

@ -8,3 +8,15 @@ lib**
include**
lib64**
pyvenv.cfg
venv/
__pycache__/
*.pyc
# Large generated cache files (rebuild locally)
data/benyehuda_index.json
# Audio directory (large; rebuild with --skip-scrape)
data/audio/
# Output .apkg files (generated by pipeline)
output/

428
apkg_builder.py Normal file
View file

@ -0,0 +1,428 @@
#!/usr/bin/env python3
"""
Build Anki .apkg files for both the vocabulary deck and the conjugation deck.
Uses genanki for reliable, stable deck generation.
Deck IDs are hardcoded integers, so re-importing a package with the same ID updates the existing deck
in Anki rather than creating a duplicate.
"""
import json
import logging
import unicodedata
from pathlib import Path
from typing import Optional
import genanki
import pandas as pd
logger = logging.getLogger(__name__)
# Stable deck/model IDs — do not change these
VOCAB_DECK_ID = 1_234_567_890
VOCAB_MODEL_ID = 1_234_567_891
CONJ_DECK_ID = 1_234_567_892
CONJ_MODEL_ID = 1_234_567_893
DATA_DIR = Path(__file__).parent / "data"
AUDIO_DIR = DATA_DIR / "audio"
OUTPUT_DIR = Path(__file__).parent / "output"
VOCAB_APKG = OUTPUT_DIR / "pealim_vocabulary.apkg"
CONJ_APKG = OUTPUT_DIR / "pealim_conjugations.apkg"
# ──────────────────────────────────────────────────────────────────────────────
# Shared CSS
# ──────────────────────────────────────────────────────────────────────────────
CARD_CSS = """
.card {
font-family: Arial, sans-serif;
font-size: 20px;
text-align: center;
color: #222;
background: #fff;
padding: 16px;
}
.hebrew {
font-size: 36px;
font-weight: bold;
direction: rtl;
text-align: right;
line-height: 1.5;
color: #1a1a8c;
}
.hebrew-sm {
font-size: 24px;
direction: rtl;
text-align: right;
color: #333;
}
.label {
font-size: 13px;
color: #888;
text-transform: uppercase;
letter-spacing: 0.05em;
margin-top: 10px;
}
.meaning {
font-size: 28px;
color: #111;
margin: 8px 0;
}
.root-info {
font-size: 16px;
color: #555;
margin-top: 6px;
direction: rtl;
}
.example {
font-size: 16px;
color: #444;
direction: rtl;
text-align: right;
font-style: italic;
margin-top: 10px;
border-left: 3px solid #aaa;
padding-left: 8px;
}
.divider { border-top: 1px solid #ddd; margin: 10px 0; }
.freq-badge {
display: inline-block;
font-size: 12px;
color: #fff;
background: #0070c0;
border-radius: 10px;
padding: 2px 8px;
margin-top: 4px;
}
"""
# ──────────────────────────────────────────────────────────────────────────────
# Vocabulary Deck
# ──────────────────────────────────────────────────────────────────────────────
VOCAB_FRONT_HEB = """
<div class="hebrew">{{Word}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
<div class="label">What does this mean?</div>
"""
VOCAB_BACK_HEB = """
{{FrontSide}}
<div class="divider"></div>
<div class="meaning">{{Meaning}}</div>
<div class="label">Root</div>
<div class="hebrew-sm">{{Root}}</div>
<div class="label">Part of Speech</div>
<div style="font-size:15px;color:#555">{{PoS}}</div>
{{#SharedRoots}}
<div class="label">Related words (same root)</div>
<div class="root-info">{{SharedRoots}}</div>
{{/SharedRoots}}
{{#Example}}
<div class="label">Example</div>
<div class="example">{{Example}}</div>
{{/Example}}
{{#Frequency}}<div class="freq-badge">Rank #{{Frequency}}</div>{{/Frequency}}
"""
VOCAB_FRONT_ENG = """
<div class="meaning">{{Meaning}}</div>
<div class="label">Translate to Hebrew</div>
"""
VOCAB_BACK_ENG = """
{{FrontSide}}
<div class="divider"></div>
<div class="hebrew">{{Word}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
<div class="label">Without nikkud</div>
<div class="hebrew-sm">{{WordNoNikkud}}</div>
<div class="label">Root</div>
<div class="hebrew-sm">{{Root}}</div>
<div class="label">Part of Speech</div>
<div style="font-size:15px;color:#555">{{PoS}}</div>
{{#Example}}
<div class="label">Example</div>
<div class="example">{{Example}}</div>
{{/Example}}
"""
VOCAB_MODEL = genanki.Model(
VOCAB_MODEL_ID,
"Pealim Hebrew",
fields=[
{"name": "Word"},
{"name": "Root"},
{"name": "PoS"},
{"name": "Meaning"},
{"name": "WordNoNikkud"},
{"name": "SharedRoots"},
{"name": "Tags"},
{"name": "Audio"},
{"name": "Example"},
{"name": "Frequency"},
],
templates=[
{
"name": "Hebrew → English",
"qfmt": VOCAB_FRONT_HEB,
"afmt": VOCAB_BACK_HEB,
},
{
"name": "English → Hebrew",
"qfmt": VOCAB_FRONT_ENG,
"afmt": VOCAB_BACK_ENG,
},
],
css=CARD_CSS,
)
# ──────────────────────────────────────────────────────────────────────────────
# Conjugation Deck
# ──────────────────────────────────────────────────────────────────────────────
CONJ_FRONT = """
<div class="label">פועל (Verb)</div>
<div class="hebrew">{{ReferenceForm}}</div>
{{#Pronoun}}<div class="hebrew-sm">{{Pronoun}}</div>{{/Pronoun}}
<div class="label">זמן (Tense)</div>
<div class="hebrew-sm">{{Tense}}</div>
<div class="label">מה הצורה? (What is the form?)</div>
"""
CONJ_BACK = """
{{FrontSide}}
<div class="divider"></div>
<div class="hebrew">{{ConjugatedForm}}</div>
<div class="label">שורש (Root): {{Root}} &nbsp;|&nbsp; בניין (Binyan): {{Binyan}}</div>
"""
CONJ_CSS = CARD_CSS + """
.card { direction: rtl; }
.label { direction: ltr; }
"""
CONJ_MODEL = genanki.Model(
CONJ_MODEL_ID,
"Pealim Conjugation",
fields=[
{"name": "Infinitive"},
{"name": "ReferenceForm"},
{"name": "Pronoun"},
{"name": "Tense"},
{"name": "ConjugatedForm"},
{"name": "Root"},
{"name": "Binyan"},
],
templates=[
{
"name": "Conjugation Drill",
"qfmt": CONJ_FRONT,
"afmt": CONJ_BACK,
}
],
css=CONJ_CSS,
)
# ──────────────────────────────────────────────────────────────────────────────
# Helpers
# ──────────────────────────────────────────────────────────────────────────────
def _strip_nikkud(text: str) -> str:
return "".join(
ch for ch in unicodedata.normalize("NFD", text)
if unicodedata.category(ch) != "Mn"
)
def _audio_tag(word_no_nikkud: str) -> str:
"""Return [sound:xxx.mp3] if audio file exists, else empty string."""
safe = re.sub(r"[^\u05d0-\u05ea]", "", word_no_nikkud)
if not safe:
return ""
mp3_path = AUDIO_DIR / f"{safe}.mp3"
if mp3_path.exists():
return f"[sound:{mp3_path.name}]"
return ""
import re
def build_vocab_deck(
    dict_csv: Path,
    examples_cache: Optional[dict] = None,
    freq_cache: Optional[dict] = None,
    limit: Optional[int] = None,
) -> tuple[genanki.Deck, list[Path]]:
    """
    Build the vocabulary deck from pealim_dict_for_anki.csv (or pealim_dict.csv).

    Args:
        dict_csv: Dictionary CSV. It is read with ``;`` as separator first and
            re-read with the default separator if that fails or yields fewer
            than three columns.
        examples_cache: Mapping of plain word -> list of example sentences;
            at most the first two are placed on the card.
        freq_cache: Mapping of plain word -> frequency rank. Words that are
            not in the mapping sort last and get an empty Frequency field.
        limit: If truthy, only the first ``limit`` CSV rows are used (the cut
            happens before the frequency sort).

    Returns:
        (deck, list_of_media_files). Media paths are unique and listed in the
        order their notes were added.
    """
    unranked = 999_999  # sentinel rank for words missing from freq_cache

    def cell(row, *columns: str) -> str:
        """Return the first present column as stripped text ('' when missing)."""
        for column in columns:
            if column in row:
                text = str(row[column]).strip()
                # pandas renders empty cells as NaN, which str() turns into "nan"
                return "" if text in ("nan", "None") else text
        return ""

    def plain_word(row) -> str:
        """Un-pointed spelling, accepting either column naming."""
        return cell(row, "Word Without Nikkud", "WordNoNikkud")

    logger.info(f"Loading dictionary from {dict_csv}")
    # Try semicolon separator first (enriched CSV), fall back to comma.
    # The broad except is deliberate: any failure of the first attempt simply
    # triggers the second read, which raises on its own if the file is unusable.
    try:
        df = pd.read_csv(dict_csv, sep=";", index_col=0)
        if df.shape[1] < 3:
            raise ValueError("too few columns")
    except Exception:
        df = pd.read_csv(dict_csv, index_col=0)
    if limit:
        df = df.head(limit)
    logger.info(f" {len(df)} rows loaded")

    examples_cache = examples_cache or {}
    freq_cache = freq_cache or {}

    # Sort by frequency rank (ascending) so Anki presents common words first.
    # The ranks are built as a plain list so an empty frame works too, and the
    # sort is stable so unranked words keep their CSV order.
    ranks = [
        freq_cache.get(_strip_nikkud(plain_word(row)), unranked)
        for _, row in df.iterrows()
    ]
    df = df.assign(_freq_rank=ranks).sort_values("_freq_rank", kind="stable")

    deck = genanki.Deck(VOCAB_DECK_ID, "Pealim Hebrew Vocabulary")
    media_files: list[Path] = []
    seen_media: set[Path] = set()

    for _, row in df.iterrows():
        word = cell(row, "Word")
        meaning = cell(row, "Meaning")
        if not word or not meaning:
            # A card needs both sides; skip before collecting any media.
            continue

        root = cell(row, "Root")
        if root == "-":  # "-" is treated the same as a missing root
            root = ""
        pos = cell(row, "Part of speech", "Part of Speech")
        word_no_nik = plain_word(row)
        shared_roots = cell(row, "shared roots", "SharedRoots")
        tags_str = cell(row, "tags", "Tags")

        rank = row["_freq_rank"]
        freq_rank = str(int(rank)) if rank < unranked else ""

        # Audio
        audio_tag = _audio_tag(word_no_nik)
        if audio_tag:
            mp3_name = audio_tag[7:-1]  # strip [sound: and ]
            mp3_path = AUDIO_DIR / mp3_name
            if mp3_path not in seen_media:
                seen_media.add(mp3_path)
                media_files.append(mp3_path)

        # Example sentences
        plain_key = _strip_nikkud(word_no_nik)
        examples_list = examples_cache.get(plain_key, examples_cache.get(word_no_nik, []))
        example_html = "<br>".join(examples_list[:2]) if examples_list else ""

        note = genanki.Note(
            model=VOCAB_MODEL,
            fields=[
                word,
                root,
                pos,
                meaning,
                word_no_nik,
                shared_roots,
                tags_str,
                audio_tag,
                example_html,
                freq_rank,
            ],
            tags=tags_str.split() if tags_str else [],
        )
        deck.add_note(note)

    logger.info(f"Vocabulary deck: {len(deck.notes)} notes")
    return deck, media_files
def build_conj_deck(conjugations: dict) -> genanki.Deck:
    """Build the conjugation drill deck from conjugations.json data.

    ``conjugations`` maps an infinitive to its extracted record (or a falsy
    value when extraction failed). One note is added per form whose text
    contains at least one Hebrew letter; records without forms are skipped.
    """
    deck = genanki.Deck(CONJ_DECK_ID, "Pealim Hebrew Conjugations")
    note_count = 0

    for infinitive, data in conjugations.items():
        forms = data.get("forms") if data else None
        if not forms:
            continue

        root = data.get("root", "")
        binyan = data.get("binyan", "")
        ref_form = data.get("reference_form", infinitive)

        for form_data in forms.values():
            conj_form = form_data.get("form", "")
            # Skip empty cells and cells that hold no Hebrew letters at all.
            if not (conj_form and re.search(r"[\u05d0-\u05ea]", conj_form)):
                continue

            fields = [
                infinitive,
                ref_form,
                form_data.get("pronoun", ""),
                form_data.get("tense", ""),
                conj_form,
                root,
                binyan,
            ]
            deck.add_note(genanki.Note(model=CONJ_MODEL, fields=fields))
            note_count += 1

    logger.info(f"Conjugation deck: {note_count} notes across {sum(1 for v in conjugations.values() if v)} verbs")
    return deck
def write_vocab_apkg(
    deck: genanki.Deck,
    media_files: list[Path],
    out_path: Path = VOCAB_APKG,
) -> None:
    """Write the vocabulary deck and its media to *out_path* as an .apkg file.

    Media paths that do not exist on disk are left out of the package. The
    output directory is created if needed.
    """
    existing_media = []
    for media in media_files:
        if media.exists():
            existing_media.append(str(media))

    package = genanki.Package(deck)
    package.media_files = existing_media

    out_path.parent.mkdir(parents=True, exist_ok=True)
    package.write_to_file(str(out_path))
    logger.info(f"Vocabulary deck written → {out_path}")
def write_conj_apkg(deck: genanki.Deck, out_path: Path = CONJ_APKG) -> None:
    """Write the conjugation deck to *out_path*, creating its directory if needed."""
    destination = str(out_path)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    package = genanki.Package(deck)
    package.write_to_file(destination)
    logger.info(f"Conjugation deck written → {out_path}")
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    # Quick self-test with 20 words, no audio, no examples
    csv_path = DATA_DIR / "pealim_dict_for_anki.csv"
    if not csv_path.exists():
        csv_path = DATA_DIR / "pealim_dict.csv"
    deck, media = build_vocab_deck(csv_path, limit=20)
    write_vocab_apkg(deck, media)

    conj_path = DATA_DIR / "conjugations.json"
    if conj_path.exists():
        # The file holds Hebrew text; read it as UTF-8 explicitly instead of
        # relying on the platform's default encoding.
        with open(conj_path, encoding="utf-8") as f:
            conjugations = json.load(f)
        conj_deck = build_conj_deck(conjugations)
        write_conj_apkg(conj_deck)

160
benyehuda.py Normal file
View file

@ -0,0 +1,160 @@
#!/usr/bin/env python3
"""
Ben Yehuda corpus example-sentence lookup.
Downloads plaintext-no-nikkud ZIP once, indexes sentences, then answers queries locally.
Exposed API: get_examples(word_no_nikkud) -> list[str]
"""
import json
import logging
import re
import unicodedata
import zipfile
from io import BytesIO
from pathlib import Path
import requests
logger = logging.getLogger(__name__)
CORPUS_URL = (
"https://github.com/projectbenyehuda/public_domain_dump/releases/"
"download/2025-10/txt_stripped.zip"
)
INDEX_PATH = Path(__file__).parent / "data" / "benyehuda_index.json"
EXAMPLES_CACHE_PATH = Path(__file__).parent / "data" / "examples_cache.json"
REQUEST_TIMEOUT = 120
MIN_SENTENCE_LEN = 15
MAX_EXAMPLES_PER_WORD = 2
MAX_INDEX_ENTRIES = 500 # cap examples kept per word in index to limit memory
# Module-level state
_index: dict[str, list[str]] = {} # word -> [sentence, ...]
_examples_cache: dict[str, list[str]] = {} # word -> cached result for this run
def _strip_nikkud(text: str) -> str:
return "".join(
ch for ch in unicodedata.normalize("NFD", text)
if unicodedata.category(ch) != "Mn"
)
def _split_sentences(text: str) -> list[str]:
    """Split *text* on runs of 1-3 sentence terminators or newlines.

    Each piece is stripped of surrounding whitespace; pieces shorter than
    ``MIN_SENTENCE_LEN`` characters are discarded.
    """
    pieces = (chunk.strip() for chunk in re.split(r"[.!?؟\n]{1,3}", text))
    return [piece for piece in pieces if len(piece) >= MIN_SENTENCE_LEN]
def _build_index(corpus_zip_bytes: bytes) -> None:
    """Parse corpus ZIP and build word → sentences index.

    Every ``.txt`` member of the archive is decoded as UTF-8 (undecodable
    bytes are ignored), split into sentences, and each distinct token of a
    sentence is mapped to that sentence. A token is a run of Hebrew letters
    and quote characters; it is indexed both as written and with surrounding
    quote characters removed, so a word that appears inside quotation marks
    can still be found under its plain spelling. Keys shorter than two
    characters are skipped and each key keeps at most ``MAX_INDEX_ENTRIES``
    sentences.

    The result replaces the module-level ``_index``.
    """
    global _index
    _index = {}
    token_re = re.compile(r"[\u05d0-\u05ea'\"]+")
    logger.info("Building Ben Yehuda index from corpus …")
    with zipfile.ZipFile(BytesIO(corpus_zip_bytes)) as zf:
        txt_files = [n for n in zf.namelist() if n.endswith(".txt")]
        logger.info(f" Corpus contains {len(txt_files)} text files")
        for fname in txt_files:
            try:
                raw = zf.read(fname).decode("utf-8", errors="ignore")
            except Exception as exc:
                # Best effort: one unreadable member must not abort the whole
                # build, but it should not disappear without a trace either.
                logger.warning(f" Skipping unreadable corpus file {fname}: {exc}")
                continue
            for sentence in _split_sentences(raw):
                keys = set()
                for token in token_re.findall(sentence):
                    keys.add(token)
                    # Quotes inside a token (abbreviations) are kept; only
                    # the surrounding ones are removed for the extra key.
                    keys.add(token.strip("'\""))
                for key in keys:
                    if len(key) < 2:
                        continue
                    bucket = _index.setdefault(key, [])
                    if len(bucket) < MAX_INDEX_ENTRIES:
                        bucket.append(sentence)
    logger.info(f"Index built: {len(_index)} unique words")
def _save_index() -> None:
    """Write the in-memory index to ``INDEX_PATH`` as UTF-8 JSON."""
    target_dir = INDEX_PATH.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with INDEX_PATH.open("w", encoding="utf-8") as fh:
        json.dump(_index, fh, ensure_ascii=False)
    logger.info(f"Ben Yehuda index saved → {INDEX_PATH}")
def _load_index() -> None:
    """Replace the in-memory index with the JSON stored at ``INDEX_PATH``."""
    global _index
    with INDEX_PATH.open(encoding="utf-8") as fh:
        loaded = json.load(fh)
    _index = loaded
    logger.info(f"Ben Yehuda index loaded: {len(_index)} words")
def load(force_rebuild: bool = False) -> None:
    """Load or build the Ben Yehuda index. Downloads corpus if needed.

    Does nothing when an index is already in memory and no rebuild is forced.
    Otherwise the persisted examples cache is read (if present); then the
    index is taken from ``INDEX_PATH`` when that file exists and no rebuild is
    forced, or else the corpus is downloaded, indexed and saved.
    """
    global _index, _examples_cache

    already_loaded = bool(_index)
    if already_loaded and not force_rebuild:
        return

    # Load persisted examples cache
    if EXAMPLES_CACHE_PATH.exists():
        with EXAMPLES_CACHE_PATH.open(encoding="utf-8") as fh:
            _examples_cache = json.load(fh)

    if not force_rebuild and INDEX_PATH.exists():
        _load_index()
        return

    logger.info("Downloading Ben Yehuda corpus … (this may take 1-2 minutes)")
    response = requests.get(CORPUS_URL, timeout=REQUEST_TIMEOUT, stream=True)
    response.raise_for_status()
    data = response.content
    logger.info(f"Corpus downloaded: {len(data) / 1e6:.1f} MB")

    _build_index(data)
    _save_index()
def save_examples_cache() -> None:
    """Persist the per-word example results to ``EXAMPLES_CACHE_PATH`` as UTF-8 JSON."""
    cache_dir = EXAMPLES_CACHE_PATH.parent
    cache_dir.mkdir(parents=True, exist_ok=True)
    with EXAMPLES_CACHE_PATH.open("w", encoding="utf-8") as fh:
        json.dump(_examples_cache, fh, ensure_ascii=False)
    logger.info(f"Examples cache saved: {len(_examples_cache)} entries → {EXAMPLES_CACHE_PATH}")
def get_examples(word_no_nikkud: str) -> list[str]:
    """
    Return the shortest indexed sentences that contain the word as a whole token.

    At most ``MAX_EXAMPLES_PER_WORD`` sentences are returned, shortest first.
    The index is loaded on first use. Results (including empty ones) are
    stored in the examples cache, so repeated calls for the same word skip the
    search.
    """
    if not _index:
        load()

    word = _strip_nikkud(word_no_nikkud.strip())
    try:
        return _examples_cache[word]
    except KeyError:
        pass

    # Filter: the word must not touch a word character on either side.
    whole_token = re.compile(r"(?<![^\s\W])" + re.escape(word) + r"(?![^\s\W])")
    hits = sorted(
        (sentence for sentence in _index.get(word, []) if whole_token.search(sentence)),
        key=len,
    )

    result = hits[:MAX_EXAMPLES_PER_WORD]
    _examples_cache[word] = result
    return result
if __name__ == "__main__":
    # Manual smoke test: look up a handful of common words and print the
    # examples found, then persist the cache.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    load()
    tests = ["שלום", "בית", "ספר", "מים", "אהבה", "ילד"]
    for w in tests:
        exs = get_examples(w)
        print(f"\n{w}: {len(exs)} examples")
        for ex in exs:
            print(f"{ex[:80]}")
    save_examples_cache()

View file

@ -1,153 +1,408 @@
#!/usr/bin/env python3
"""
Extract Hebrew verb conjugations from pealim.com.
Scrapes conjugation tables for specific verbs.
Input: verbs_input.txt (one Hebrew infinitive per line)
Output: data/conjugations.json
For each verb:
1. Search pealim.com/search/?q=<verb> to find URL slug
2. Fetch /dict/<slug>/ with hebstyle=mo cookie
3. Parse conjugation table by row labels
Resume-safe: verbs already in conjugations.json are skipped.
"""
import requests
import pandas as pd
import numpy as np
import json
import logging
import re
import time
import urllib.parse
from pathlib import Path
import requests
from bs4 import BeautifulSoup
# Configure logging
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
)
logger = logging.getLogger(__name__)
# Session for connection pooling
PEALIM_BASE = "https://www.pealim.com"
REQUEST_DELAY = 1.5
REQUEST_TIMEOUT = 15
VERBS_INPUT = Path(__file__).parent / "verbs_input.txt"
CONJUGATIONS_PATH = Path(__file__).parent / "data" / "conjugations.json"
# Pronoun labels (for card front display)
PRONOUN_LABELS = {
"present_ms": "",
"present_fs": "",
"present_mp": "",
"present_fp": "",
"past_1s": "אֲנִי",
"past_1p": "אֲנַחְנוּ",
"past_2ms": "אַתָּה",
"past_2fs": "אַתְּ",
"past_2mp": "אַתֶּם",
"past_2fp": "אַתֶּן",
"past_3ms": "הוּא",
"past_3fs": "הִיא",
"past_3p": "הֵם / הֵן",
"future_1s": "אֲנִי",
"future_1p": "אֲנַחְנוּ",
"future_2ms": "אַתָּה",
"future_2fs": "אַתְּ",
"future_2mp": "אַתֶּם",
"future_2fp": "אַתֶּן",
"future_3ms": "הוּא",
"future_3fs": "הִיא",
"future_3mp": "הֵם",
"future_3fp": "הֵן",
"imperative_ms": "אַתָּה",
"imperative_fs": "אַתְּ",
"imperative_mp": "אַתֶּם",
"imperative_fp": "אַתֶּן",
"infinitive": "",
}
# Human-readable tense description for card front
TENSE_DESCRIPTION = {
"present_ms": "הוֹוֶה (זכר יחיד)",
"present_fs": "הוֹוֶה (נקבה יחיד)",
"present_mp": "הוֹוֶה (זכר רבים)",
"present_fp": "הוֹוֶה (נקבה רבים)",
"past_1s": "עָבָר",
"past_1p": "עָבָר",
"past_2ms": "עָבָר",
"past_2fs": "עָבָר",
"past_2mp": "עָבָר",
"past_2fp": "עָבָר",
"past_3ms": "עָבָר",
"past_3fs": "עָבָר",
"past_3p": "עָבָר",
"future_1s": "עָתִיד",
"future_1p": "עָתִיד",
"future_2ms": "עָתִיד",
"future_2fs": "עָתִיד",
"future_2mp": "עָתִיד",
"future_2fp": "עָתִיד",
"future_3ms": "עָתִיד",
"future_3fs": "עָתִיד",
"future_3mp": "עָתִיד",
"future_3fp": "עָתִיד",
"imperative_ms": "צִוּוּי",
"imperative_fs": "צִוּוּי",
"imperative_mp": "צִוּוּי",
"imperative_fp": "צִוּוּי",
"infinitive": "מְקוֹר",
}
session = requests.Session()
session.headers.update({
'User-Agent': 'Mozilla/5.0 (compatible; pealim-scraper/1.0)'
})
PEALIM_BASE_URL = "https://www.pealim.com/dict"
REQUEST_TIMEOUT = 10
REQUEST_DELAY = 1.0 # seconds between requests (respectful scraping)
# Conjugation column order (standard Hebrew verb forms)
CONJUGATION_COLUMNS = [
'present_ms', 'present_fs', 'present_mp', 'present_fp',
'past_1s', 'past_1p', 'past_2ms', 'past_2fs', 'past_2mp', 'past_2fp',
'past_3ms', 'past_3fs', 'past_3p',
'future_1s', 'future_1p', 'future_2ms', 'future_2fs', 'future_2mp', 'future_2fp',
'future_3ms', 'future_3fs', 'future_3mp', 'future_3fp',
'imperative_ms', 'imperative_fs', 'imperative_mp', 'imperative_fp',
'infinitive'
]
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki/2.0)"})
def extract_verb(url_suffix: str, max_retries: int = 3) -> pd.DataFrame:
"""
Extract conjugation table for a single verb.
Args:
url_suffix: URL suffix (e.g., '2255-lishmor', '860-lishon')
max_retries: Maximum retry attempts on failure
Returns:
DataFrame with conjugation forms, or None if extraction fails
"""
url = f"{PEALIM_BASE_URL}/{url_suffix}"
for attempt in range(max_retries):
def _find_slug(infinitive: str) -> str | None:
"""Search pealim.com/search/?q=<verb> and return the URL slug."""
url = f"{PEALIM_BASE}/search/?q={urllib.parse.quote(infinitive)}"
try:
logger.info(f"Fetching: {url} (attempt {attempt + 1}/{max_retries})")
resp = session.get(url, timeout=REQUEST_TIMEOUT)
resp.raise_for_status()
# Slugs look like /dict/2255-lishmor/
slugs = re.findall(r"/dict/(\d+[-][^/?\"<>\s]+)/", resp.text)
if slugs:
slug = slugs[0]
logger.info(f" Slug: {slug}")
return slug
except Exception as e:
logger.error(f" Error searching for '{infinitive}': {e}")
return None
cookies = {
'translit': 'none',
'hebstyle': 'bp',
'showmeaning': 'off'
def _is_passive_binyan(binyan: str) -> bool:
for marker in ["פֻּעַל", "הֻפְעַל", "Pu'al", "Huf'al", "pual", "hufal"]:
if marker.lower() in binyan.lower():
return True
return False
def _get_menukad(cell) -> str:
"""Extract nikkud Hebrew text from a table cell."""
span = cell.find("span", class_="menukad")
if span:
return span.get_text(strip=True)
# fallback: any Hebrew text in cell
txt = cell.get_text(strip=True)
if re.search(r"[\u05d0-\u05ea]", txt):
return txt
return ""
def _parse_table(soup: BeautifulSoup) -> dict[str, str]:
    """
    Parse the pealim conjugation table and return form_key -> Hebrew form mapping.

    Table structure (rows after two header rows):
    Row 2 (Present): [label x2] [ms] [fs] [mp] [fp]
    Row 3 (Past 1): [Past x1] [1st x1] [1s x2] [1p x2]
    Row 4 (Past 2): [2nd x1] [2ms] [2fs] [2mp] [2fp]
    Row 5 (Past 3): [3rd x1] [3ms] [3fs] [3p x2]
    Row 6 (Fut 1): [Future x1] [1st x1] [1s x2] [1p x2]
    Row 7 (Fut 2): [2nd x1] [2ms] [2fs] [2mp] [2fp]
    Row 8 (Fut 3): [3rd x1] [3ms] [3fs] [3mp] [3fp]
    Row 9 (Imp): [Imp x2] [ms] [fs] [mp] [fp]
    Row 10 (Inf): [Inf x2] [form x4]

    Rows are located by the first occurrence of the words "present", "past",
    "future", "imperative" and "infinitive" in the row text, not by fixed
    position. The second- and third-person rows of past and future are taken
    to be the two rows that follow the tense's first row.

    Returns an empty dict when there is no ``conjugation-table`` or it has
    fewer than nine rows. Keys for which no Hebrew cell was found are simply
    absent from the result.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}
    rows = table.find_all("tr")
    if len(rows) < 9:
        return {}

    def hebrew_cells(row_idx: int) -> list[str]:
        """Hebrew cell texts of a row, each repeated colspan times (labels skipped)."""
        values: list[str] = []
        for cell in rows[row_idx].find_all(["th", "td"]):
            text = _get_menukad(cell)
            if text and re.search(r"[\u05d0-\u05ea]", text):
                values.extend([text] * int(cell.get("colspan", 1)))
        return values

    def distinct(values: list[str]) -> list[str]:
        """Drop repeated values (anywhere in the list), keeping first-seen order."""
        return list(dict.fromkeys(values))

    forms: dict[str, str] = {}

    def assign(keys: list[str], values: list[str]) -> None:
        """Pair keys with values positionally; surplus keys or values are ignored."""
        for key, value in zip(keys, values):
            if value:
                forms[key] = value

    # Find rows by tense label. A row is claimed by the first tense (in this
    # order) that its text mentions and that has not been located yet.
    tense_order = ("present", "past", "future", "imperative", "infinitive")
    first_row: dict[str, int] = {}
    for idx, row in enumerate(rows):
        label = row.get_text(" ", strip=True).lower()
        for tense in tense_order:
            if tense in label and tense not in first_row:
                first_row[tense] = idx
                break

    # Present tense (4 forms: ms fs mp fp)
    if "present" in first_row:
        assign(
            ["present_ms", "present_fs", "present_mp", "present_fp"],
            hebrew_cells(first_row["present"]),
        )

    # Past and future span three consecutive rows (1st, 2nd, 3rd person).
    # The flag says whether repeats caused by colspan=2 cells must be
    # collapsed before pairing values with keys.
    person_rows = {
        "past": (
            (["past_1s", "past_1p"], True),
            (["past_2ms", "past_2fs", "past_2mp", "past_2fp"], False),
            (["past_3ms", "past_3fs", "past_3p"], True),
        ),
        "future": (
            (["future_1s", "future_1p"], True),
            (["future_2ms", "future_2fs", "future_2mp", "future_2fp"], False),
            (["future_3ms", "future_3fs", "future_3mp", "future_3fp"], False),
        ),
    }
    for tense in ("past", "future"):
        if tense not in first_row:
            continue
        start = first_row[tense]
        for offset, (keys, collapse) in enumerate(person_rows[tense]):
            if start + offset >= len(rows):
                break
            values = hebrew_cells(start + offset)
            assign(keys, distinct(values) if collapse else values)

    # Imperative
    if "imperative" in first_row:
        assign(
            ["imperative_ms", "imperative_fs", "imperative_mp", "imperative_fp"],
            hebrew_cells(first_row["imperative"]),
        )

    # Infinitive (single form, possibly repeated by colspan)
    if "infinitive" in first_row:
        assign(["infinitive"], hebrew_cells(first_row["infinitive"]))

    return forms
def _extract_conjugations(slug: str, infinitive: str) -> dict | None:
"""Fetch /dict/<slug>/ and parse conjugation table."""
url = f"{PEALIM_BASE}/dict/{slug}/"
try:
resp = session.get(url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
resp.raise_for_status()
except Exception as e:
logger.error(f" Error fetching {url}: {e}")
return None
soup = BeautifulSoup(resp.text, "lxml")
# Extract root from menukad span in header
root = ""
for span in soup.find_all("span", class_="menukad"):
txt = span.get_text(strip=True)
if txt and re.search(r"[\u05d0-\u05ea]", txt) and "-" in txt:
root = txt
break
# Extract binyan / verb type from lead text or title
binyan = ""
meta = soup.find("meta", {"property": "og:description"})
if meta:
desc = meta.get("content", "")
for bname in ["Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al"]:
if bname in desc:
binyan = bname
break
forms = _parse_table(soup)
if not forms:
logger.warning(f" No forms found for {slug}")
return None
is_passive = _is_passive_binyan(binyan)
reference_form = forms.get("infinitive", infinitive) if not is_passive else forms.get("past_3ms", infinitive)
result = {
"infinitive": infinitive,
"slug": slug,
"root": root,
"binyan": binyan,
"is_passive": is_passive,
"reference_form": reference_form,
"forms": {},
}
for key, form in forms.items():
if key in PRONOUN_LABELS:
result["forms"][key] = {
"form": form,
"pronoun": PRONOUN_LABELS[key],
"tense": TENSE_DESCRIPTION.get(key, ""),
}
response = session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
response.raise_for_status()
logger.info(f" Extracted {len(result['forms'])} forms for {infinitive}")
return result
# Parse HTML table
dfs = pd.read_html(response.content)
if not dfs:
logger.warning(f"No tables found for {url_suffix}")
return None
df = dfs[0]
def _load_conjugations() -> dict:
    """Return the contents of ``CONJUGATIONS_PATH``, or ``{}`` if the file is absent."""
    if not CONJUGATIONS_PATH.exists():
        return {}
    with CONJUGATIONS_PATH.open(encoding="utf-8") as fh:
        return json.load(fh)
# Extract conjugation forms (skip header columns, flatten)
# Adjust indices based on actual table structure
np_flat = df.iloc[:, 2:].values.flatten()
# Remove NaN and invalid entries
np_flat = np.delete(np_flat, [5, 7, 15, 17, 19, 33, 34, 35])
def _save_conjugations(data: dict) -> None:
    """Write *data* to ``CONJUGATIONS_PATH`` as indented UTF-8 JSON."""
    out_dir = CONJUGATIONS_PATH.parent
    out_dir.mkdir(parents=True, exist_ok=True)
    with CONJUGATIONS_PATH.open("w", encoding="utf-8") as fh:
        json.dump(data, fh, ensure_ascii=False, indent=2)
# Create DataFrame with proper column names
df_result = pd.DataFrame([np_flat], columns=CONJUGATION_COLUMNS)
logger.info(f"✓ Extracted {url_suffix}")
return df_result
def main(verbs_file: Path = VERBS_INPUT) -> dict:
"""Read verbs from file and extract conjugations. Returns full conjugations dict."""
if not verbs_file.exists():
logger.warning(f"verbs_input.txt not found at {verbs_file} — skipping")
return _load_conjugations()
except requests.RequestException as e:
logger.error(f"Network error for {url_suffix} (attempt {attempt + 1}): {e}")
if attempt < max_retries - 1:
time.sleep(2 ** attempt) # Exponential backoff
verbs = [v.strip() for v in verbs_file.read_text(encoding="utf-8").splitlines()
if v.strip() and not v.startswith("#")]
logger.info(f"Loaded {len(verbs)} verbs from {verbs_file}")
conjugations = _load_conjugations()
new_count = 0
for verb in verbs:
if verb in conjugations:
logger.info(f"Skipping {verb} (cached)")
continue
logger.info(f"Processing: {verb}")
time.sleep(REQUEST_DELAY)
slug = _find_slug(verb)
if not slug:
logger.warning(f" No slug found for {verb}")
conjugations[verb] = None
_save_conjugations(conjugations)
continue
time.sleep(REQUEST_DELAY)
data = _extract_conjugations(slug, verb)
conjugations[verb] = data
_save_conjugations(conjugations)
new_count += 1
logger.info(f"Done: {new_count} new verbs processed")
return conjugations
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
result = main()
for verb, data in result.items():
if data:
forms = data.get("forms", {})
print(f"{verb}: {len(forms)} forms, binyan={data.get('binyan')}")
for k, v in list(forms.items())[:3]:
print(f" {k}: {v['form']}")
else:
return None
except Exception as e:
logger.error(f"Error parsing {url_suffix}: {e}")
return None
def extract_from_website(url_suffixes: list = None) -> pd.DataFrame:
"""
Extract conjugations for multiple verbs.
Args:
url_suffixes: List of URL suffixes to process
Returns:
Combined DataFrame with all conjugations
"""
if url_suffixes is None:
# Default verbs: "to guard" and "to sleep"
url_suffixes = ['2255-lishmor', '860-lishon']
logger.info(f"Starting extraction for {len(url_suffixes)} verb(s)...")
all_dfs = []
for url_suffix in url_suffixes:
df = extract_verb(url_suffix)
if df is not None:
all_dfs.append(df)
time.sleep(0.5) # Small delay between requests
if not all_dfs:
logger.error("No data extracted!")
return pd.DataFrame()
combined_df = pd.concat(all_dfs, ignore_index=True)
logger.info(f"Extraction complete. Total verbs: {len(combined_df)}")
return combined_df
def main():
"""Main entry point."""
try:
df = extract_from_website()
if df.empty:
logger.error("No data to save!")
return
df.to_csv('conjugations.csv', sep=';', index=True)
logger.info("Saved: conjugations.csv")
logger.info("\n" + df.to_string())
logger.info("✅ Complete!")
except Exception as e:
logger.error(f"Fatal error: {e}")
raise
if __name__ == '__main__':
main()
print(f"{verb}: no data")

903
data/conjugations.json Normal file
View file

@ -0,0 +1,903 @@
{
"לִשְׁמוֹר": {
"infinitive": "לִשְׁמוֹר",
"slug": "2255-lishmor",
"root": "שׁ - מ - ר",
"binyan": "",
"is_passive": false,
"reference_form": "לִשְׁמֹר",
"forms": {
"present_ms": {
"form": "שׁוֹמֵר",
"pronoun": "",
"tense": "הוֹוֶה (זכר יחיד)"
},
"present_fs": {
"form": "שׁוֹמֶרֶת",
"pronoun": "",
"tense": "הוֹוֶה (נקבה יחיד)"
},
"present_mp": {
"form": "שׁוֹמְרִים",
"pronoun": "",
"tense": "הוֹוֶה (זכר רבים)"
},
"present_fp": {
"form": "שׁוֹמְרוֹת",
"pronoun": "",
"tense": "הוֹוֶה (נקבה רבים)"
},
"past_1s": {
"form": "שָׁמַרְתִּי",
"pronoun": "אֲנִי",
"tense": "עָבָר"
},
"past_1p": {
"form": "שָׁמַרְנוּ",
"pronoun": "אֲנַחְנוּ",
"tense": "עָבָר"
},
"past_2ms": {
"form": "שָׁמַרְתָּ",
"pronoun": "אַתָּה",
"tense": "עָבָר"
},
"past_2fs": {
"form": "שָׁמַרְתְּ",
"pronoun": "אַתְּ",
"tense": "עָבָר"
},
"past_2mp": {
"form": "שְׁמַרְתֶּם",
"pronoun": "אַתֶּם",
"tense": "עָבָר"
},
"past_2fp": {
"form": "שְׁמַרְתֶּן",
"pronoun": "אַתֶּן",
"tense": "עָבָר"
},
"past_3ms": {
"form": "שָׁמַר",
"pronoun": "הוּא",
"tense": "עָבָר"
},
"past_3fs": {
"form": "שָׁמְרָה",
"pronoun": "הִיא",
"tense": "עָבָר"
},
"past_3p": {
"form": "שָׁמְרוּ",
"pronoun": "הֵם / הֵן",
"tense": "עָבָר"
},
"future_1s": {
"form": "אֶשְׁמֹר",
"pronoun": "אֲנִי",
"tense": "עָתִיד"
},
"future_1p": {
"form": "נִשְׁמֹר",
"pronoun": "אֲנַחְנוּ",
"tense": "עָתִיד"
},
"future_2ms": {
"form": "תִּשְׁמֹר",
"pronoun": "אַתָּה",
"tense": "עָתִיד"
},
"future_2fs": {
"form": "תִּשְׁמְרִי",
"pronoun": "אַתְּ",
"tense": "עָתִיד"
},
"future_2mp": {
"form": "תִּשְׁמְרוּ",
"pronoun": "אַתֶּם",
"tense": "עָתִיד"
},
"future_2fp": {
"form": "תִּשְׁמֹרְנָה",
"pronoun": "אַתֶּן",
"tense": "עָתִיד"
},
"future_3ms": {
"form": "יִשְׁמֹר",
"pronoun": "הוּא",
"tense": "עָתִיד"
},
"future_3fs": {
"form": "תִּשְׁמֹר",
"pronoun": "הִיא",
"tense": "עָתִיד"
},
"future_3mp": {
"form": "יִשְׁמְרוּ",
"pronoun": "הֵם",
"tense": "עָתִיד"
},
"future_3fp": {
"form": "תִּשְׁמֹרְנָה",
"pronoun": "הֵן",
"tense": "עָתִיד"
},
"imperative_ms": {
"form": "שְׁמֹר!",
"pronoun": "אַתָּה",
"tense": "צִוּוּי"
},
"imperative_fs": {
"form": "שִׁמְרִי!",
"pronoun": "אַתְּ",
"tense": "צִוּוּי"
},
"imperative_mp": {
"form": "שִׁמְרוּ!",
"pronoun": "אַתֶּם",
"tense": "צִוּוּי"
},
"imperative_fp": {
"form": "שְׁמֹרְנָה!",
"pronoun": "אַתֶּן",
"tense": "צִוּוּי"
},
"infinitive": {
"form": "לִשְׁמֹר",
"pronoun": "",
"tense": "מְקוֹר"
}
}
},
"לְהִשָּׁמֵר": {
"infinitive": "לְהִשָּׁמֵר",
"slug": "2256-lehishamer",
"root": "שׁ - מ - ר",
"binyan": "",
"is_passive": false,
"reference_form": "לְהִשָּׁמֵר",
"forms": {
"present_ms": {
"form": "נִשְׁמָר",
"pronoun": "",
"tense": "הוֹוֶה (זכר יחיד)"
},
"present_fs": {
"form": "נִשְׁמֶרֶת",
"pronoun": "",
"tense": "הוֹוֶה (נקבה יחיד)"
},
"present_mp": {
"form": "נִשְׁמָרִים",
"pronoun": "",
"tense": "הוֹוֶה (זכר רבים)"
},
"present_fp": {
"form": "נִשְׁמָרוֹת",
"pronoun": "",
"tense": "הוֹוֶה (נקבה רבים)"
},
"past_1s": {
"form": "נִשְׁמַרְתִּי",
"pronoun": "אֲנִי",
"tense": "עָבָר"
},
"past_1p": {
"form": "נִשְׁמַרְנוּ",
"pronoun": "אֲנַחְנוּ",
"tense": "עָבָר"
},
"past_2ms": {
"form": "נִשְׁמַרְתָּ",
"pronoun": "אַתָּה",
"tense": "עָבָר"
},
"past_2fs": {
"form": "נִשְׁמַרְתְּ",
"pronoun": "אַתְּ",
"tense": "עָבָר"
},
"past_2mp": {
"form": "נִשְׁמַרְתֶּם",
"pronoun": "אַתֶּם",
"tense": "עָבָר"
},
"past_2fp": {
"form": "נִשְׁמַרְתֶּן",
"pronoun": "אַתֶּן",
"tense": "עָבָר"
},
"past_3ms": {
"form": "נִשְׁמַר",
"pronoun": "הוּא",
"tense": "עָבָר"
},
"past_3fs": {
"form": "נִשְׁמְרָה",
"pronoun": "הִיא",
"tense": "עָבָר"
},
"past_3p": {
"form": "נִשְׁמְרוּ",
"pronoun": "הֵם / הֵן",
"tense": "עָבָר"
},
"future_1s": {
"form": "אֶשָּׁמֵר",
"pronoun": "אֲנִי",
"tense": "עָתִיד"
},
"future_1p": {
"form": "נִשָּׁמֵר",
"pronoun": "אֲנַחְנוּ",
"tense": "עָתִיד"
},
"future_2ms": {
"form": "תִּשָּׁמֵר",
"pronoun": "אַתָּה",
"tense": "עָתִיד"
},
"future_2fs": {
"form": "תִּשָּׁמְרִי",
"pronoun": "אַתְּ",
"tense": "עָתִיד"
},
"future_2mp": {
"form": "תִּשָּׁמְרוּ",
"pronoun": "אַתֶּם",
"tense": "עָתִיד"
},
"future_2fp": {
"form": "תִּשָּׁמַרְנָה",
"pronoun": "אַתֶּן",
"tense": "עָתִיד"
},
"future_3ms": {
"form": "יִשָּׁמֵר",
"pronoun": "הוּא",
"tense": "עָתִיד"
},
"future_3fs": {
"form": "תִּשָּׁמֵר",
"pronoun": "הִיא",
"tense": "עָתִיד"
},
"future_3mp": {
"form": "יִשָּׁמְרוּ",
"pronoun": "הֵם",
"tense": "עָתִיד"
},
"future_3fp": {
"form": "תִּשָּׁמַרְנָה",
"pronoun": "הֵן",
"tense": "עָתִיד"
},
"imperative_ms": {
"form": "הִשָּׁמֵר!",
"pronoun": "אַתָּה",
"tense": "צִוּוּי"
},
"imperative_fs": {
"form": "הִשָּׁמְרִי!",
"pronoun": "אַתְּ",
"tense": "צִוּוּי"
},
"imperative_mp": {
"form": "הִשָּׁמְרוּ!",
"pronoun": "אַתֶּם",
"tense": "צִוּוּי"
},
"imperative_fp": {
"form": "הִשָּׁמַרְנָה!",
"pronoun": "אַתֶּן",
"tense": "צִוּוּי"
},
"infinitive": {
"form": "לְהִשָּׁמֵר",
"pronoun": "",
"tense": "מְקוֹר"
}
}
},
"לְדַבֵּר": {
"infinitive": "לְדַבֵּר",
"slug": "2-ledaber",
"root": "ד - ב - ר",
"binyan": "",
"is_passive": false,
"reference_form": "לְדַבֵּר",
"forms": {
"present_ms": {
"form": "מְדַבֵּר",
"pronoun": "",
"tense": "הוֹוֶה (זכר יחיד)"
},
"present_fs": {
"form": "מְדַבֶּרֶת",
"pronoun": "",
"tense": "הוֹוֶה (נקבה יחיד)"
},
"present_mp": {
"form": "מְדַבְּרִים",
"pronoun": "",
"tense": "הוֹוֶה (זכר רבים)"
},
"present_fp": {
"form": "מְדַבְּרוֹת",
"pronoun": "",
"tense": "הוֹוֶה (נקבה רבים)"
},
"past_1s": {
"form": "דִּבַּרְתִּי",
"pronoun": "אֲנִי",
"tense": "עָבָר"
},
"past_1p": {
"form": "דִּבַּרְנוּ",
"pronoun": "אֲנַחְנוּ",
"tense": "עָבָר"
},
"past_2ms": {
"form": "דִּבַּרְתָּ",
"pronoun": "אַתָּה",
"tense": "עָבָר"
},
"past_2fs": {
"form": "דִּבַּרְתְּ",
"pronoun": "אַתְּ",
"tense": "עָבָר"
},
"past_2mp": {
"form": "דִּבַּרְתֶּם",
"pronoun": "אַתֶּם",
"tense": "עָבָר"
},
"past_2fp": {
"form": "דִּבַּרְתֶּן",
"pronoun": "אַתֶּן",
"tense": "עָבָר"
},
"past_3ms": {
"form": "דִּבֵּר",
"pronoun": "הוּא",
"tense": "עָבָר"
},
"past_3fs": {
"form": "דִּבְּרָה",
"pronoun": "הִיא",
"tense": "עָבָר"
},
"past_3p": {
"form": "דִּבְּרוּ",
"pronoun": "הֵם / הֵן",
"tense": "עָבָר"
},
"future_1s": {
"form": "אֲדַבֵּר",
"pronoun": "אֲנִי",
"tense": "עָתִיד"
},
"future_1p": {
"form": "נְדַבֵּר",
"pronoun": "אֲנַחְנוּ",
"tense": "עָתִיד"
},
"future_2ms": {
"form": "תְּדַבֵּר",
"pronoun": "אַתָּה",
"tense": "עָתִיד"
},
"future_2fs": {
"form": "תְּדַבְּרִי",
"pronoun": "אַתְּ",
"tense": "עָתִיד"
},
"future_2mp": {
"form": "תְּדַבְּרוּ",
"pronoun": "אַתֶּם",
"tense": "עָתִיד"
},
"future_2fp": {
"form": "תְּדַבֵּרְנָה",
"pronoun": "אַתֶּן",
"tense": "עָתִיד"
},
"future_3ms": {
"form": "יְדַבֵּר",
"pronoun": "הוּא",
"tense": "עָתִיד"
},
"future_3fs": {
"form": "תְּדַבֵּר",
"pronoun": "הִיא",
"tense": "עָתִיד"
},
"future_3mp": {
"form": "יְדַבְּרוּ",
"pronoun": "הֵם",
"tense": "עָתִיד"
},
"future_3fp": {
"form": "תְּדַבֵּרְנָה",
"pronoun": "הֵן",
"tense": "עָתִיד"
},
"imperative_ms": {
"form": "דַּבֵּר!",
"pronoun": "אַתָּה",
"tense": "צִוּוּי"
},
"imperative_fs": {
"form": "דַּבְּרִי!",
"pronoun": "אַתְּ",
"tense": "צִוּוּי"
},
"imperative_mp": {
"form": "דַּבְּרוּ!",
"pronoun": "אַתֶּם",
"tense": "צִוּוּי"
},
"imperative_fp": {
"form": "דַּבֵּרְנָה!",
"pronoun": "אַתֶּן",
"tense": "צִוּוּי"
},
"infinitive": {
"form": "לְדַבֵּר",
"pronoun": "",
"tense": "מְקוֹר"
}
}
},
"לְדֻבַּר": {
"infinitive": "לְדֻבַּר",
"slug": "2-ledaber",
"root": "ד - ב - ר",
"binyan": "",
"is_passive": false,
"reference_form": "לְדַבֵּר",
"forms": {
"present_ms": {
"form": "מְדַבֵּר",
"pronoun": "",
"tense": "הוֹוֶה (זכר יחיד)"
},
"present_fs": {
"form": "מְדַבֶּרֶת",
"pronoun": "",
"tense": "הוֹוֶה (נקבה יחיד)"
},
"present_mp": {
"form": "מְדַבְּרִים",
"pronoun": "",
"tense": "הוֹוֶה (זכר רבים)"
},
"present_fp": {
"form": "מְדַבְּרוֹת",
"pronoun": "",
"tense": "הוֹוֶה (נקבה רבים)"
},
"past_1s": {
"form": "דִּבַּרְתִּי",
"pronoun": "אֲנִי",
"tense": "עָבָר"
},
"past_1p": {
"form": "דִּבַּרְנוּ",
"pronoun": "אֲנַחְנוּ",
"tense": "עָבָר"
},
"past_2ms": {
"form": "דִּבַּרְתָּ",
"pronoun": "אַתָּה",
"tense": "עָבָר"
},
"past_2fs": {
"form": "דִּבַּרְתְּ",
"pronoun": "אַתְּ",
"tense": "עָבָר"
},
"past_2mp": {
"form": "דִּבַּרְתֶּם",
"pronoun": "אַתֶּם",
"tense": "עָבָר"
},
"past_2fp": {
"form": "דִּבַּרְתֶּן",
"pronoun": "אַתֶּן",
"tense": "עָבָר"
},
"past_3ms": {
"form": "דִּבֵּר",
"pronoun": "הוּא",
"tense": "עָבָר"
},
"past_3fs": {
"form": "דִּבְּרָה",
"pronoun": "הִיא",
"tense": "עָבָר"
},
"past_3p": {
"form": "דִּבְּרוּ",
"pronoun": "הֵם / הֵן",
"tense": "עָבָר"
},
"future_1s": {
"form": "אֲדַבֵּר",
"pronoun": "אֲנִי",
"tense": "עָתִיד"
},
"future_1p": {
"form": "נְדַבֵּר",
"pronoun": "אֲנַחְנוּ",
"tense": "עָתִיד"
},
"future_2ms": {
"form": "תְּדַבֵּר",
"pronoun": "אַתָּה",
"tense": "עָתִיד"
},
"future_2fs": {
"form": "תְּדַבְּרִי",
"pronoun": "אַתְּ",
"tense": "עָתִיד"
},
"future_2mp": {
"form": "תְּדַבְּרוּ",
"pronoun": "אַתֶּם",
"tense": "עָתִיד"
},
"future_2fp": {
"form": "תְּדַבֵּרְנָה",
"pronoun": "אַתֶּן",
"tense": "עָתִיד"
},
"future_3ms": {
"form": "יְדַבֵּר",
"pronoun": "הוּא",
"tense": "עָתִיד"
},
"future_3fs": {
"form": "תְּדַבֵּר",
"pronoun": "הִיא",
"tense": "עָתִיד"
},
"future_3mp": {
"form": "יְדַבְּרוּ",
"pronoun": "הֵם",
"tense": "עָתִיד"
},
"future_3fp": {
"form": "תְּדַבֵּרְנָה",
"pronoun": "הֵן",
"tense": "עָתִיד"
},
"imperative_ms": {
"form": "דַּבֵּר!",
"pronoun": "אַתָּה",
"tense": "צִוּוּי"
},
"imperative_fs": {
"form": "דַּבְּרִי!",
"pronoun": "אַתְּ",
"tense": "צִוּוּי"
},
"imperative_mp": {
"form": "דַּבְּרוּ!",
"pronoun": "אַתֶּם",
"tense": "צִוּוּי"
},
"imperative_fp": {
"form": "דַּבֵּרְנָה!",
"pronoun": "אַתֶּן",
"tense": "צִוּוּי"
},
"infinitive": {
"form": "לְדַבֵּר",
"pronoun": "",
"tense": "מְקוֹר"
}
}
},
"לְהִתְלַבֵּשׁ": {
"infinitive": "לְהִתְלַבֵּשׁ",
"slug": "974-lehitlabesh",
"root": "ל - ב - שׁ",
"binyan": "",
"is_passive": false,
"reference_form": "לְהִתְלַבֵּשׁ",
"forms": {
"present_ms": {
"form": "מִתְלַבֵּשׁ",
"pronoun": "",
"tense": "הוֹוֶה (זכר יחיד)"
},
"present_fs": {
"form": "מִתְלַבֶּשֶׁת",
"pronoun": "",
"tense": "הוֹוֶה (נקבה יחיד)"
},
"present_mp": {
"form": "מִתְלַבְּשִׁים",
"pronoun": "",
"tense": "הוֹוֶה (זכר רבים)"
},
"present_fp": {
"form": "מִתְלַבְּשׁוֹת",
"pronoun": "",
"tense": "הוֹוֶה (נקבה רבים)"
},
"past_1s": {
"form": "הִתְלַבַּשְׁתִּי",
"pronoun": "אֲנִי",
"tense": "עָבָר"
},
"past_1p": {
"form": "הִתְלַבַּשְׁנוּ",
"pronoun": "אֲנַחְנוּ",
"tense": "עָבָר"
},
"past_2ms": {
"form": "הִתְלַבַּשְׁתָּ",
"pronoun": "אַתָּה",
"tense": "עָבָר"
},
"past_2fs": {
"form": "הִתְלַבַּשְׁתְּ",
"pronoun": "אַתְּ",
"tense": "עָבָר"
},
"past_2mp": {
"form": "הִתְלַבַּשְׁתֶּם",
"pronoun": "אַתֶּם",
"tense": "עָבָר"
},
"past_2fp": {
"form": "הִתְלַבַּשְׁתֶּן",
"pronoun": "אַתֶּן",
"tense": "עָבָר"
},
"past_3ms": {
"form": "הִתְלַבֵּשׁ",
"pronoun": "הוּא",
"tense": "עָבָר"
},
"past_3fs": {
"form": "הִתְלַבְּשָׁה",
"pronoun": "הִיא",
"tense": "עָבָר"
},
"past_3p": {
"form": "הִתְלַבְּשׁוּ",
"pronoun": "הֵם / הֵן",
"tense": "עָבָר"
},
"future_1s": {
"form": "אֶתְלַבֵּשׁ",
"pronoun": "אֲנִי",
"tense": "עָתִיד"
},
"future_1p": {
"form": "נִתְלַבֵּשׁ",
"pronoun": "אֲנַחְנוּ",
"tense": "עָתִיד"
},
"future_2ms": {
"form": "תִּתְלַבֵּשׁ",
"pronoun": "אַתָּה",
"tense": "עָתִיד"
},
"future_2fs": {
"form": "תִּתְלַבְּשִׁי",
"pronoun": "אַתְּ",
"tense": "עָתִיד"
},
"future_2mp": {
"form": "תִּתְלַבְּשׁוּ",
"pronoun": "אַתֶּם",
"tense": "עָתִיד"
},
"future_2fp": {
"form": "תִּתְלַבֵּשְׁנָה",
"pronoun": "אַתֶּן",
"tense": "עָתִיד"
},
"future_3ms": {
"form": "יִתְלַבֵּשׁ",
"pronoun": "הוּא",
"tense": "עָתִיד"
},
"future_3fs": {
"form": "תִּתְלַבֵּשׁ",
"pronoun": "הִיא",
"tense": "עָתִיד"
},
"future_3mp": {
"form": "יִתְלַבְּשׁוּ",
"pronoun": "הֵם",
"tense": "עָתִיד"
},
"future_3fp": {
"form": "תִּתְלַבֵּשְׁנָה",
"pronoun": "הֵן",
"tense": "עָתִיד"
},
"imperative_ms": {
"form": "הִתְלַבֵּשׁ!",
"pronoun": "אַתָּה",
"tense": "צִוּוּי"
},
"imperative_fs": {
"form": "הִתְלַבְּשִׁי!",
"pronoun": "אַתְּ",
"tense": "צִוּוּי"
},
"imperative_mp": {
"form": "הִתְלַבְּשׁוּ!",
"pronoun": "אַתֶּם",
"tense": "צִוּוּי"
},
"imperative_fp": {
"form": "הִתְלַבֵּשְׁנָה!",
"pronoun": "אַתֶּן",
"tense": "צִוּוּי"
},
"infinitive": {
"form": "לְהִתְלַבֵּשׁ",
"pronoun": "",
"tense": "מְקוֹר"
}
}
},
"לְהַגִּיד": {
"infinitive": "לְהַגִּיד",
"slug": "1135-lehagid",
"root": "נ - ג - ד",
"binyan": "",
"is_passive": false,
"reference_form": "לְהַגִּיד",
"forms": {
"present_ms": {
"form": "מַגִּיד",
"pronoun": "",
"tense": "הוֹוֶה (זכר יחיד)"
},
"present_fs": {
"form": "מַגִּידָה",
"pronoun": "",
"tense": "הוֹוֶה (נקבה יחיד)"
},
"present_mp": {
"form": "מַגִּידִים",
"pronoun": "",
"tense": "הוֹוֶה (זכר רבים)"
},
"present_fp": {
"form": "מַגִּידוֹת",
"pronoun": "",
"tense": "הוֹוֶה (נקבה רבים)"
},
"past_1s": {
"form": "הִגַּדְתִּי",
"pronoun": "אֲנִי",
"tense": "עָבָר"
},
"past_1p": {
"form": "הִגַּדְנוּ",
"pronoun": "אֲנַחְנוּ",
"tense": "עָבָר"
},
"past_2ms": {
"form": "הִגַּדְתָּ",
"pronoun": "אַתָּה",
"tense": "עָבָר"
},
"past_2fs": {
"form": "הִגַּדְתְּ",
"pronoun": "אַתְּ",
"tense": "עָבָר"
},
"past_2mp": {
"form": "הִגַּדְתֶּם",
"pronoun": "אַתֶּם",
"tense": "עָבָר"
},
"past_2fp": {
"form": "הִגַּדְתֶּן",
"pronoun": "אַתֶּן",
"tense": "עָבָר"
},
"past_3ms": {
"form": "הִגִּיד",
"pronoun": "הוּא",
"tense": "עָבָר"
},
"past_3fs": {
"form": "הִגִּידָה",
"pronoun": "הִיא",
"tense": "עָבָר"
},
"past_3p": {
"form": "הִגִּידוּ",
"pronoun": "הֵם / הֵן",
"tense": "עָבָר"
},
"future_1s": {
"form": "אַגִּיד",
"pronoun": "אֲנִי",
"tense": "עָתִיד"
},
"future_1p": {
"form": "נַגִּיד",
"pronoun": "אֲנַחְנוּ",
"tense": "עָתִיד"
},
"future_2ms": {
"form": "תַּגִּיד",
"pronoun": "אַתָּה",
"tense": "עָתִיד"
},
"future_2fs": {
"form": "תַּגִּידִי",
"pronoun": "אַתְּ",
"tense": "עָתִיד"
},
"future_2mp": {
"form": "תַּגִּידוּ",
"pronoun": "אַתֶּם",
"tense": "עָתִיד"
},
"future_2fp": {
"form": "תַּגֵּדְנָה",
"pronoun": "אַתֶּן",
"tense": "עָתִיד"
},
"future_3ms": {
"form": "יַגִּיד",
"pronoun": "הוּא",
"tense": "עָתִיד"
},
"future_3fs": {
"form": "תַּגִּיד",
"pronoun": "הִיא",
"tense": "עָתִיד"
},
"future_3mp": {
"form": "יַגִּידוּ",
"pronoun": "הֵם",
"tense": "עָתִיד"
},
"future_3fp": {
"form": "תַּגֵּדְנָה",
"pronoun": "הֵן",
"tense": "עָתִיד"
},
"imperative_ms": {
"form": "הַגֵּד!",
"pronoun": "אַתָּה",
"tense": "צִוּוּי"
},
"imperative_fs": {
"form": "הַגִּידִי!",
"pronoun": "אַתְּ",
"tense": "צִוּוּי"
},
"imperative_mp": {
"form": "הַגִּידוּ!",
"pronoun": "אַתֶּם",
"tense": "צִוּוּי"
},
"imperative_fp": {
"form": "הַגֵּדְנָה!",
"pronoun": "אַתֶּן",
"tense": "צִוּוּי"
},
"infinitive": {
"form": "לְהַגִּיד",
"pronoun": "",
"tense": "מְקוֹר"
}
}
},
"לְהוּגַד": null
}

1
data/examples_cache.json Normal file
View file

@ -0,0 +1 @@
{"אב": ["לא אב לחגלה אתה", "כרחם אב על בנים"], "אבא": ["כך כך אבא יקירי", "“אבא איננו בבית"], "אביבי": ["אמרת: תם אביבי,", "אמרת: תם אביבי,"], "אביב": ["אביב כי יתחדש ", "ברחובות תל־אביב"], "אבידה": ["אבידה בדבר מועט", "ואם לרבות אבידה"], "לאבוד": ["אבל נאנחתי לאבוד", "אנו הולכים לאבוד"], "להיאבד": [], "להתאבד": ["מעמד והחליטה להתאבד", "היא נסתה פעם להתאבד"], "איבוד": ["איבוד דמי משלוח", "איבוד עצמו לדעת"], "התאבדות": ["והביאו לידי התאבדות", "הקלון, בלתי אם התאבדות"], "להאביד": ["ויאמר להאביד זכרם ", "קול שם רשעים להאביד"], "אבדה": ["ועתה אבדה תקותה", "וכל תשועתו אבדה"], "אבוד": ["— הה, הנני אבוד", "“אבוד עצמי לדעת"], "לאבד": ["אין לכם מה לאבד", "יש חשש לאבד שנה"], "אבדון": ["אבדון, אש הנעורת", "אבדון, פתחי עולם"], "אבוקדו": ["מטעים רצופים של עצי אבוקדו ומנגו", "את זרעי הפירות וגידלו מהם שתילים חדשים של אבוקדו"], "אבזם": ["רצו אל הטבח, הגישו לו הפעם חגורה עם אבזם מבריק… הביאו כוס", "רגליו היו עטופות לפפות חדשות ומתניו חגורות חגורה חדשה עם אבזם"], "לאבזר": [], "איבחון": ["לשלבים גבוהים יותר של איבחון וריפוי", "איבחון נחפז, כמוהו כהיסוס מופרז, עלול לגרור תוצאות בלתי־נעימות"]}

File diff suppressed because one or more lines are too long

9106
data/pealim_dict.csv Normal file

File diff suppressed because it is too large Load diff

12111
data/pealim_dict_for_anki.csv Normal file

File diff suppressed because it is too large Load diff

85
frequency_lookup.py Normal file
View file

@ -0,0 +1,85 @@
#!/usr/bin/env python3
"""
Hebrew word frequency lookup from hermitdave/FrequencyWords corpus.
Downloads he_50k.txt once; subsequent runs read from cache.
Exposed API: get_frequency_rank(word_no_nikkud) -> int | None
"""
import json
import logging
import re
import unicodedata
from pathlib import Path
import requests
logger = logging.getLogger(__name__)
# Raw GitHub URL of the he_50k.txt word list that load() downloads when no
# cache file exists yet.
FREQ_URL = (
"https://raw.githubusercontent.com/hermitdave/FrequencyWords/"
"master/content/2016/he/he_50k.txt"
)
# JSON cache file, kept in the data/ directory next to this module.
CACHE_PATH = Path(__file__).parent / "data" / "frequency_cache.json"
# Passed as timeout= to requests.get() for the download.
REQUEST_TIMEOUT = 30
# Module-level cache: word_no_nikkud -> rank (1 = most common)
_freq: dict[str, int] = {}
def _strip_nikkud(text: str) -> str:
"""Remove Hebrew nikkud (diacritics) from a string."""
return "".join(
ch for ch in unicodedata.normalize("NFD", text)
if unicodedata.category(ch) != "Mn"
)
def load(cache_path: Path = CACHE_PATH) -> None:
    """Populate the module-level ``_freq`` table, downloading if needed.

    If *cache_path* exists and parses as JSON, its contents become ``_freq``
    and nothing is downloaded. Otherwise the word list at ``FREQ_URL`` is
    fetched, each line's first whitespace-separated token is stripped of
    nikkud, and distinct words are ranked in file order starting at 1. The
    result is written back to *cache_path* as JSON.

    Args:
        cache_path: Location of the JSON cache file.

    Raises:
        Whatever ``requests.get`` / ``raise_for_status`` raise when the
        download fails; file-system errors from writing the cache.
    """
    global _freq

    if cache_path.exists():
        try:
            with open(cache_path, encoding="utf-8") as f:
                _freq = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError) as exc:
            # A truncated or corrupt cache must not block every later run;
            # fall through and rebuild it from the network.
            logger.warning(
                f"Frequency cache {cache_path} is unreadable ({exc}); re-downloading"
            )
        else:
            logger.info(f"Frequency cache loaded: {len(_freq)} entries")
            return

    logger.info("Downloading FrequencyWords he_50k.txt …")
    resp = requests.get(FREQ_URL, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    # Decode as UTF-8 regardless of what charset the response headers
    # advertise, so Hebrew text never comes back as mojibake.
    resp.encoding = "utf-8"

    # Build into a fresh dict so ranks always start at 1, even if _freq
    # already held entries from an earlier load.
    ranks: dict[str, int] = {}
    for line in resp.text.splitlines():
        parts = line.split()
        if not parts:
            continue
        word = _strip_nikkud(parts[0])
        # Two spellings can collapse to one key after stripping; the first
        # (most frequent) occurrence keeps its rank.
        if word and word not in ranks:
            ranks[word] = len(ranks) + 1
    _freq = ranks

    cache_path.parent.mkdir(parents=True, exist_ok=True)
    # Write to a sibling temp file and rename, so an interrupted run cannot
    # leave a half-written cache behind.
    tmp_path = cache_path.with_name(cache_path.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(_freq, f, ensure_ascii=False)
    tmp_path.replace(cache_path)
    logger.info(f"Frequency cache saved: {len(_freq)} entries → {cache_path}")
def get_frequency_rank(word_no_nikkud: str) -> int | None:
    """
    Look up a word's frequency rank (1 = most common).

    Calls load() first whenever the module-level table is empty. The input
    is whitespace-trimmed and stripped of nikkud before the lookup.
    Returns None when the word is not in the table.
    """
    if len(_freq) == 0:
        load()
    key = _strip_nikkud(word_no_nikkud.strip())
    if key in _freq:
        return _freq[key]
    return None
if __name__ == "__main__":
    # Manual smoke test: load the table, then print the rank of a few words.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    load()
    for w in ("שלום", "ספר", "בית", "מים", "כלב"):
        print(f"{w}: rank {get_frequency_rank(w)}")

View file

@ -1,3 +1,6 @@
pandas>=1.3.0
requests>=2.26.0
numpy>=1.21.0
genanki>=0.8.0
beautifulsoup4>=4.11.0
lxml>=4.9.0

311
run.py
View file

@ -1,48 +1,313 @@
#!/usr/bin/env python3
"""
Main entry point: orchestrate dictionary and conjugation extraction.
Pealim Anki Deck Builder — full pipeline orchestrator.
Usage:
python run.py [options]
Options:
--skip-scrape Use existing data/pealim_dict.csv (no pealim.com dict scraping)
--skip-audio Skip audio .mp3 downloads
--skip-examples Skip Ben Yehuda example fetching
--skip-conjugations Skip verb conjugation extraction
--test N Process only the first N dictionary words (for quick testing)
"""
import argparse
import json
import logging
import sys
import time
from pathlib import Path
# Add current directory to path
sys.path.insert(0, str(Path(__file__).parent))
import pealim_extract
import conjugation_extract
logging.basicConfig(
level=logging.INFO,
format='%(asctime)s - %(levelname)s - %(message)s'
format="%(asctime)s %(levelname)s %(message)s",
)
logger = logging.getLogger(__name__)
DATA_DIR = Path(__file__).parent / "data"
OUTPUT_DIR = Path(__file__).parent / "output"
def main():
"""Run all extraction tasks."""
logger.info("=" * 60)
logger.info("PEALIM EXTRACTION SUITE")
logger.info("=" * 60)
def parse_args():
    """Parse the command line and return the resulting argparse namespace.

    Four boolean ``--skip-*`` switches plus ``--test N`` (integer, optional).
    """
    parser = argparse.ArgumentParser(description="Pealim Anki deck builder")
    switches = (
        ("--skip-scrape", "Skip dict scraping; use cached CSV"),
        ("--skip-audio", "Skip audio downloads"),
        ("--skip-examples", "Skip Ben Yehuda example lookup"),
        ("--skip-conjugations", "Skip verb conjugation extraction"),
    )
    for flag, description in switches:
        parser.add_argument(flag, action="store_true", help=description)
    parser.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
    return parser.parse_args()
def step_scrape(args):
    """Step 1 — scrape or load dictionary.

    With ``--skip-scrape`` this only verifies that data/pealim_dict.csv is
    present and exits the process with status 1 if it is not. Otherwise the
    dictionary is scraped via ``pealim_extract`` and written twice: the raw
    frame to pealim_dict.csv and the ``modify_for_anki`` result to the
    semicolon-separated pealim_dict_for_anki.csv.

    Args:
        args: Parsed CLI namespace; only ``skip_scrape`` is read.
    """
    dict_csv = DATA_DIR / "pealim_dict.csv"
    anki_csv = DATA_DIR / "pealim_dict_for_anki.csv"

    if args.skip_scrape:
        if not dict_csv.exists():
            logger.error(f"[1] --skip-scrape set but {dict_csv} not found. Aborting.")
            sys.exit(1)
        logger.info(f"[1] Using existing {dict_csv}")
        return

    logger.info("[1] Scraping dictionary from pealim.com …")
    import pealim_extract

    # Create the output directory before scraping so a missing data/ folder
    # cannot throw away the scrape at write time.
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    df = pealim_extract.extract_from_website()
    df.to_csv(dict_csv, index=True)
    logger.info(f" Saved {len(df)} words → {dict_csv}")

    df = pealim_extract.modify_for_anki(df)
    df.to_csv(anki_csv, sep=";", index=True)
    logger.info(f" Saved Anki CSV → {anki_csv}")
def step_frequency():
    """Step 2 — load/download word frequency data.

    Returns the frequency_lookup module's ``_freq`` table after load().
    """
    logger.info("[2] Loading word frequency data …")
    import frequency_lookup as freq_module

    freq_module.load()
    table = freq_module._freq
    return table
def step_examples(args, freq_cache: dict):
    """Step 3 — load/build Ben Yehuda example index.

    With ``--skip-examples`` the existing data/examples_cache.json is
    returned as-is (or an empty dict when the file is absent). Otherwise the
    Ben Yehuda index is loaded and ``benyehuda.get_examples`` is called for
    each dictionary word so the examples cache is filled, then saved.

    Args:
        args: Parsed CLI namespace; ``skip_examples`` and ``test`` are read.
        freq_cache: Accepted for signature compatibility; not used here.

    Returns:
        The examples cache mapping (``benyehuda._examples_cache`` or the
        JSON loaded from disk).
    """
    if args.skip_examples:
        logger.info("[3] Skipping examples (--skip-examples)")
        examples_path = DATA_DIR / "examples_cache.json"
        if examples_path.exists():
            # The cache stores raw Hebrew text, so read it as UTF-8 rather
            # than whatever the platform's locale encoding happens to be.
            with open(examples_path, encoding="utf-8") as f:
                return json.load(f)
        return {}

    logger.info("[3] Loading Ben Yehuda example index …")
    import benyehuda
    benyehuda.load()

    # Pre-fetch examples for all words in the dict (uses cache)
    dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    if not dict_csv.exists():
        dict_csv = DATA_DIR / "pealim_dict.csv"

    try:
        import pandas as pd
        import unicodedata

        # Try the semicolon-separated layout first; fewer than 3 columns
        # means the separator was wrong, so re-read with the default comma.
        try:
            df = pd.read_csv(dict_csv, sep=";", index_col=0)
            if df.shape[1] < 3:
                raise ValueError
        except Exception:
            df = pd.read_csv(dict_csv, index_col=0)

        if args.test:
            df = df.head(args.test)

        def strip(t):
            return "".join(c for c in unicodedata.normalize("NFD", str(t))
                           if unicodedata.category(c) != "Mn")

        logger.info(f" Pre-fetching examples for {len(df)} words …")
        for _, row in df.iterrows():
            word_plain = strip(str(row.get("Word Without Nikkud", "")).strip())
            if word_plain:
                benyehuda.get_examples(word_plain)
    except Exception as e:
        # Best effort: a failed pre-fetch still saves whatever was cached.
        logger.warning(f" Could not pre-fetch all examples: {e}")

    benyehuda.save_examples_cache()
    return benyehuda._examples_cache
if __name__ == '__main__':
def step_audio(args):
    """Step 4 — download audio .mp3 files.

    For each dictionary row, derives a file name from the Hebrew letters of
    the word, skips it when that .mp3 already exists, and otherwise asks
    ``audio_extract.extract_audio_url`` for a URL and downloads it into
    data/audio/. Failures inside the download loop are best-effort: a single
    failed download is logged at debug level, and any other error aborts the
    step with a warning instead of stopping the pipeline.

    Args:
        args: Parsed CLI namespace; ``skip_audio`` and ``test`` are read.
    """
    if args.skip_audio:
        logger.info("[4] Skipping audio (--skip-audio)")
        return

    logger.info("[4] Downloading audio files …")

    # Load audio URL cache (from old workspace if available)
    audio_cache_path = DATA_DIR / "audio_cache.json"
    audio_url_cache: dict = {}
    if audio_cache_path.exists():
        with open(audio_cache_path, encoding="utf-8") as f:
            audio_url_cache = json.load(f)

    import audio_extract as ae
    ae._audio_cache = audio_url_cache

    dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    if not dict_csv.exists():
        dict_csv = DATA_DIR / "pealim_dict.csv"

    import re
    import unicodedata

    import pandas as pd
    import requests

    # Defined here because this module has no AUDIO_DIR constant; this is
    # the same directory print_summary() counts .mp3 files in.
    audio_dir = DATA_DIR / "audio"
    # Anything outside the Hebrew letter range alef..tav is dropped from
    # the file name.
    non_hebrew = re.compile(r"[^\u05d0-\u05ea]")

    def strip_nik(t):
        return "".join(c for c in unicodedata.normalize("NFD", t)
                       if unicodedata.category(c) != "Mn")

    try:
        # Try the semicolon-separated layout first; fewer than 3 columns
        # means the separator was wrong, so re-read with the default comma.
        try:
            df = pd.read_csv(dict_csv, sep=";", index_col=0)
            if df.shape[1] < 3:
                raise ValueError
        except Exception:
            df = pd.read_csv(dict_csv, index_col=0)

        if args.test:
            df = df.head(args.test)

        audio_dir.mkdir(parents=True, exist_ok=True)
        downloaded = 0
        skipped = 0

        for _, row in df.iterrows():
            word = str(row.get("Word", "")).strip()
            word_plain = str(row.get("Word Without Nikkud", "")).strip()
            if not word:
                continue

            safe_name = non_hebrew.sub("", strip_nik(word_plain or word))
            if not safe_name:
                continue

            mp3_path = audio_dir / f"{safe_name}.mp3"
            if mp3_path.exists():
                skipped += 1
                continue

            # Get audio URL from cache or fetch
            audio_url = ae.extract_audio_url(word)
            if audio_url:
                try:
                    resp = requests.get(audio_url, timeout=10)
                    resp.raise_for_status()
                    mp3_path.write_bytes(resp.content)
                    downloaded += 1
                    time.sleep(0.3)
                except Exception as e:
                    logger.debug(f" Audio download failed for {word}: {e}")

        ae.save_audio_cache(str(audio_cache_path))
        logger.info(f" Audio: {downloaded} downloaded, {skipped} already cached")

    except Exception as e:
        logger.warning(f" Audio step failed: {e}")
def step_build_vocab(args, examples_cache: dict, freq_cache: dict):
    """Step 5 — build vocabulary .apkg.

    Hands the dictionary CSV (the Anki-formatted one when present, else the
    raw one) together with both caches to ``apkg_builder`` and writes the
    package. ``args.test`` is forwarded as the row limit.

    Returns:
        The deck object produced by ``apkg_builder.build_vocab_deck``.
    """
    logger.info("[5] Building vocabulary deck …")
    import apkg_builder

    anki_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    source_csv = anki_csv if anki_csv.exists() else DATA_DIR / "pealim_dict.csv"

    built = apkg_builder.build_vocab_deck(
        source_csv,
        examples_cache=examples_cache,
        freq_cache=freq_cache,
        limit=args.test,
    )
    vocab_deck, media_files = built
    apkg_builder.write_vocab_apkg(vocab_deck, media_files)
    logger.info(f" Vocabulary .apkg → {apkg_builder.VOCAB_APKG}")
    return vocab_deck
def step_conjugations(args):
    """Step 6 — extract conjugations and build conjugation deck.

    Returns the mapping produced by ``conjugation_extract.main``, or None
    when the step is skipped (flag set, or verbs_input.txt missing).
    """
    if args.skip_conjugations:
        logger.info("[6] Skipping conjugations (--skip-conjugations)")
        return None

    verb_list = Path(__file__).with_name("verbs_input.txt")
    if not verb_list.exists():
        logger.info("[6] verbs_input.txt not found — skipping conjugation deck")
        return None

    logger.info("[6] Extracting verb conjugations …")
    import conjugation_extract
    extracted = conjugation_extract.main(verb_list)

    import apkg_builder
    apkg_builder.write_conj_apkg(apkg_builder.build_conj_deck(extracted))
    logger.info(f" Conjugation .apkg → {apkg_builder.CONJ_APKG}")
    return extracted
def print_summary(args, examples_cache, freq_cache, conjugations):
    """Log an end-of-run report.

    Reports, where available: dictionary row count, frequency and example
    cache sizes, example coverage, number of .mp3 files in data/audio, the
    size of each generated .apkg, and how many verbs have conjugation data.
    """
    rule = "=" * 60
    logger.info("")
    logger.info(rule)
    logger.info("SUMMARY")
    logger.info(rule)

    anki_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    dict_csv = anki_csv if anki_csv.exists() else DATA_DIR / "pealim_dict.csv"
    if dict_csv.exists():
        import pandas as pd
        # Semicolon layout first; fewer than 3 columns means wrong separator.
        try:
            df = pd.read_csv(dict_csv, sep=";", index_col=0)
            if df.shape[1] < 3:
                raise ValueError
        except Exception:
            df = pd.read_csv(dict_csv, index_col=0)
        logger.info(f" Dictionary words: {len(df)}")

    logger.info(f" Frequency entries: {len(freq_cache)}")
    logger.info(f" Example cache entries: {len(examples_cache)}")
    covered = len([hits for hits in examples_cache.values() if hits])
    if examples_cache:
        logger.info(f" Example coverage: {covered}/{len(examples_cache)} ({100*covered//len(examples_cache)}%)")

    audio_dir = DATA_DIR / "audio"
    if audio_dir.exists():
        mp3s = list(audio_dir.glob("*.mp3"))
        logger.info(f" Audio files: {len(mp3s)}")

    vocab_apkg = OUTPUT_DIR / "pealim_vocabulary.apkg"
    if vocab_apkg.exists():
        size_mb = vocab_apkg.stat().st_size / 1e6
        logger.info(f" Vocabulary .apkg: {size_mb:.1f} MB → {vocab_apkg}")

    conj_apkg = OUTPUT_DIR / "pealim_conjugations.apkg"
    if conj_apkg.exists():
        size_mb = conj_apkg.stat().st_size / 1e6
        logger.info(f" Conjugation .apkg: {size_mb:.1f} MB → {conj_apkg}")

    if conjugations:
        verb_count = len([entry for entry in conjugations.values() if entry])
        logger.info(f" Verbs in conjugation deck: {verb_count}")

    logger.info(rule)
    logger.info("✅ DONE")
def main():
    """Parse the CLI, run pipeline steps 1-6 in order, then log the summary."""
    args = parse_args()

    banner = "=" * 60
    logger.info(banner)
    logger.info("PEALIM ANKI DECK BUILDER")
    if args.test:
        logger.info(f" TEST MODE: {args.test} words")
    logger.info(banner)

    step_scrape(args)
    freq_cache = step_frequency()
    examples_cache = step_examples(args, freq_cache)
    step_audio(args)
    step_build_vocab(args, examples_cache, freq_cache)

    conjugations = step_conjugations(args)
    if not conjugations:
        # The step returns None when skipped; the summary expects a mapping.
        conjugations = {}
    print_summary(args, examples_cache, freq_cache, conjugations)


if __name__ == "__main__":
    main()

10
verbs_input.txt Normal file
View file

@ -0,0 +1,10 @@
# One Hebrew infinitive per line.
# Lines starting with # are ignored.
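# Initial test set — intended as one verb per binyan (7 entries).
# NOTE: pu'al and huf'al normally have no infinitive of their own, so the two
# constructed passive forms below (4th and 7th entries) do not look up
# correctly: in data/conjugations.json the 4th resolves to the pi'el page
# (slug 2-ledaber, identical forms) and the 7th is null.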
לִשְׁמוֹר
לְהִשָּׁמֵר
לְדַבֵּר
לְדֻבַּר
לְהִתְלַבֵּשׁ
לְהַגִּיד
לְהוּגַד