feat: add apkg builder, frequency, Ben Yehuda examples, conjugation deck
Implements four major improvements to the Pealim Anki deck pipeline:
1. Automated .apkg generation (genanki) — no more manual Anki Desktop step.
Both vocabulary and conjugation decks are built programmatically.
2. Word frequency ranking from hermitdave/FrequencyWords he_50k corpus.
Notes sorted by rank so Anki presents most common words first.
3. Example sentences from Ben Yehuda public domain corpus (not pealim.com).
Downloads txt_stripped.zip, indexes 25k texts, ~89% coverage on test set.
4. Conjugation drill deck — one card per form × verb.
Input: verbs_input.txt (Hebrew infinitives). Initial set: 7 verbs (one
per binyan). Extracts 28 forms each via pealim.com/search/ + table parse.
New files:
apkg_builder.py — genanki deck builder for both decks
benyehuda.py — Ben Yehuda corpus downloader + sentence indexer
frequency_lookup.py — FrequencyWords downloader + rank lookup
verbs_input.txt — verb input list (7 test verbs, one per binyan)
data/ — baseline CSVs + generated caches
Updated:
conjugation_extract.py — rewritten: reads verbs_input.txt, searches
/search/?q= for slug, parses table by row labels
requirements.txt — add genanki, beautifulsoup4, lxml
run.py — full orchestration pipeline with CLI flags
.gitignore — exclude venv/, benyehuda_index.json, audio/, output/
CLI:
python run.py --skip-scrape --skip-audio --test 20 (quick test)
python run.py --skip-scrape (full build)
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
e23b353064
commit
b086123bec
13 changed files with 23502 additions and 162 deletions
12
.gitignore
vendored
12
.gitignore
vendored
|
|
@ -8,3 +8,15 @@ lib**
|
||||||
include**
|
include**
|
||||||
lib64**
|
lib64**
|
||||||
pyvenv.cfg
|
pyvenv.cfg
|
||||||
|
venv/
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
|
||||||
|
# Large generated cache files (rebuild locally)
|
||||||
|
data/benyehuda_index.json
|
||||||
|
|
||||||
|
# Audio directory (large; rebuild with --skip-scrape)
|
||||||
|
data/audio/
|
||||||
|
|
||||||
|
# Output .apkg files (generated by pipeline)
|
||||||
|
output/
|
||||||
|
|
|
||||||
428
apkg_builder.py
Normal file
428
apkg_builder.py
Normal file
|
|
@ -0,0 +1,428 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Build Anki .apkg files for both the vocabulary deck and the conjugation deck.
|
||||||
|
Uses genanki for reliable, stable deck generation.
|
||||||
|
|
||||||
|
Deck IDs are hardcoded integers — same ID on re-import updates the existing deck
|
||||||
|
in Anki rather than creating a duplicate.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
import logging
import re
import unicodedata
from pathlib import Path
from typing import Optional

import genanki
import pandas as pd
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Stable deck/model IDs — do not change these
|
||||||
|
VOCAB_DECK_ID = 1_234_567_890
|
||||||
|
VOCAB_MODEL_ID = 1_234_567_891
|
||||||
|
CONJ_DECK_ID = 1_234_567_892
|
||||||
|
CONJ_MODEL_ID = 1_234_567_893
|
||||||
|
|
||||||
|
DATA_DIR = Path(__file__).parent / "data"
|
||||||
|
AUDIO_DIR = DATA_DIR / "audio"
|
||||||
|
OUTPUT_DIR = Path(__file__).parent / "output"
|
||||||
|
|
||||||
|
VOCAB_APKG = OUTPUT_DIR / "pealim_vocabulary.apkg"
|
||||||
|
CONJ_APKG = OUTPUT_DIR / "pealim_conjugations.apkg"
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Shared CSS
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
CARD_CSS = """
|
||||||
|
.card {
|
||||||
|
font-family: Arial, sans-serif;
|
||||||
|
font-size: 20px;
|
||||||
|
text-align: center;
|
||||||
|
color: #222;
|
||||||
|
background: #fff;
|
||||||
|
padding: 16px;
|
||||||
|
}
|
||||||
|
.hebrew {
|
||||||
|
font-size: 36px;
|
||||||
|
font-weight: bold;
|
||||||
|
direction: rtl;
|
||||||
|
text-align: right;
|
||||||
|
line-height: 1.5;
|
||||||
|
color: #1a1a8c;
|
||||||
|
}
|
||||||
|
.hebrew-sm {
|
||||||
|
font-size: 24px;
|
||||||
|
direction: rtl;
|
||||||
|
text-align: right;
|
||||||
|
color: #333;
|
||||||
|
}
|
||||||
|
.label {
|
||||||
|
font-size: 13px;
|
||||||
|
color: #888;
|
||||||
|
text-transform: uppercase;
|
||||||
|
letter-spacing: 0.05em;
|
||||||
|
margin-top: 10px;
|
||||||
|
}
|
||||||
|
.meaning {
|
||||||
|
font-size: 28px;
|
||||||
|
color: #111;
|
||||||
|
margin: 8px 0;
|
||||||
|
}
|
||||||
|
.root-info {
|
||||||
|
font-size: 16px;
|
||||||
|
color: #555;
|
||||||
|
margin-top: 6px;
|
||||||
|
direction: rtl;
|
||||||
|
}
|
||||||
|
.example {
|
||||||
|
font-size: 16px;
|
||||||
|
color: #444;
|
||||||
|
direction: rtl;
|
||||||
|
text-align: right;
|
||||||
|
font-style: italic;
|
||||||
|
margin-top: 10px;
|
||||||
|
border-left: 3px solid #aaa;
|
||||||
|
padding-left: 8px;
|
||||||
|
}
|
||||||
|
.divider { border-top: 1px solid #ddd; margin: 10px 0; }
|
||||||
|
.freq-badge {
|
||||||
|
display: inline-block;
|
||||||
|
font-size: 12px;
|
||||||
|
color: #fff;
|
||||||
|
background: #0070c0;
|
||||||
|
border-radius: 10px;
|
||||||
|
padding: 2px 8px;
|
||||||
|
margin-top: 4px;
|
||||||
|
}
|
||||||
|
"""
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Vocabulary Deck
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
VOCAB_FRONT_HEB = """
|
||||||
|
<div class="hebrew">{{Word}}</div>
|
||||||
|
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
|
||||||
|
<div class="label">What does this mean?</div>
|
||||||
|
"""
|
||||||
|
|
||||||
|
VOCAB_BACK_HEB = """
|
||||||
|
{{FrontSide}}
|
||||||
|
<div class="divider"></div>
|
||||||
|
<div class="meaning">{{Meaning}}</div>
|
||||||
|
<div class="label">Root</div>
|
||||||
|
<div class="hebrew-sm">{{Root}}</div>
|
||||||
|
<div class="label">Part of Speech</div>
|
||||||
|
<div style="font-size:15px;color:#555">{{PoS}}</div>
|
||||||
|
{{#SharedRoots}}
|
||||||
|
<div class="label">Related words (same root)</div>
|
||||||
|
<div class="root-info">{{SharedRoots}}</div>
|
||||||
|
{{/SharedRoots}}
|
||||||
|
{{#Example}}
|
||||||
|
<div class="label">Example</div>
|
||||||
|
<div class="example">{{Example}}</div>
|
||||||
|
{{/Example}}
|
||||||
|
{{#Frequency}}<div class="freq-badge">Rank #{{Frequency}}</div>{{/Frequency}}
|
||||||
|
"""
|
||||||
|
|
||||||
|
VOCAB_FRONT_ENG = """
|
||||||
|
<div class="meaning">{{Meaning}}</div>
|
||||||
|
<div class="label">Translate to Hebrew</div>
|
||||||
|
"""
|
||||||
|
|
||||||
|
VOCAB_BACK_ENG = """
|
||||||
|
{{FrontSide}}
|
||||||
|
<div class="divider"></div>
|
||||||
|
<div class="hebrew">{{Word}}</div>
|
||||||
|
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
|
||||||
|
<div class="label">Without nikkud</div>
|
||||||
|
<div class="hebrew-sm">{{WordNoNikkud}}</div>
|
||||||
|
<div class="label">Root</div>
|
||||||
|
<div class="hebrew-sm">{{Root}}</div>
|
||||||
|
<div class="label">Part of Speech</div>
|
||||||
|
<div style="font-size:15px;color:#555">{{PoS}}</div>
|
||||||
|
{{#Example}}
|
||||||
|
<div class="label">Example</div>
|
||||||
|
<div class="example">{{Example}}</div>
|
||||||
|
{{/Example}}
|
||||||
|
"""
|
||||||
|
|
||||||
|
VOCAB_MODEL = genanki.Model(
|
||||||
|
VOCAB_MODEL_ID,
|
||||||
|
"Pealim Hebrew",
|
||||||
|
fields=[
|
||||||
|
{"name": "Word"},
|
||||||
|
{"name": "Root"},
|
||||||
|
{"name": "PoS"},
|
||||||
|
{"name": "Meaning"},
|
||||||
|
{"name": "WordNoNikkud"},
|
||||||
|
{"name": "SharedRoots"},
|
||||||
|
{"name": "Tags"},
|
||||||
|
{"name": "Audio"},
|
||||||
|
{"name": "Example"},
|
||||||
|
{"name": "Frequency"},
|
||||||
|
],
|
||||||
|
templates=[
|
||||||
|
{
|
||||||
|
"name": "Hebrew → English",
|
||||||
|
"qfmt": VOCAB_FRONT_HEB,
|
||||||
|
"afmt": VOCAB_BACK_HEB,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "English → Hebrew",
|
||||||
|
"qfmt": VOCAB_FRONT_ENG,
|
||||||
|
"afmt": VOCAB_BACK_ENG,
|
||||||
|
},
|
||||||
|
],
|
||||||
|
css=CARD_CSS,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Conjugation Deck
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
CONJ_FRONT = """
|
||||||
|
<div class="label">פועל (Verb)</div>
|
||||||
|
<div class="hebrew">{{ReferenceForm}}</div>
|
||||||
|
{{#Pronoun}}<div class="hebrew-sm">{{Pronoun}}</div>{{/Pronoun}}
|
||||||
|
<div class="label">זמן (Tense)</div>
|
||||||
|
<div class="hebrew-sm">{{Tense}}</div>
|
||||||
|
<div class="label">מה הצורה? (What is the form?)</div>
|
||||||
|
"""
|
||||||
|
|
||||||
|
CONJ_BACK = """
|
||||||
|
{{FrontSide}}
|
||||||
|
<div class="divider"></div>
|
||||||
|
<div class="hebrew">{{ConjugatedForm}}</div>
|
||||||
|
<div class="label">שורש (Root): {{Root}} | בניין (Binyan): {{Binyan}}</div>
|
||||||
|
"""
|
||||||
|
|
||||||
|
CONJ_CSS = CARD_CSS + """
|
||||||
|
.card { direction: rtl; }
|
||||||
|
.label { direction: ltr; }
|
||||||
|
"""
|
||||||
|
|
||||||
|
CONJ_MODEL = genanki.Model(
|
||||||
|
CONJ_MODEL_ID,
|
||||||
|
"Pealim Conjugation",
|
||||||
|
fields=[
|
||||||
|
{"name": "Infinitive"},
|
||||||
|
{"name": "ReferenceForm"},
|
||||||
|
{"name": "Pronoun"},
|
||||||
|
{"name": "Tense"},
|
||||||
|
{"name": "ConjugatedForm"},
|
||||||
|
{"name": "Root"},
|
||||||
|
{"name": "Binyan"},
|
||||||
|
],
|
||||||
|
templates=[
|
||||||
|
{
|
||||||
|
"name": "Conjugation Drill",
|
||||||
|
"qfmt": CONJ_FRONT,
|
||||||
|
"afmt": CONJ_BACK,
|
||||||
|
}
|
||||||
|
],
|
||||||
|
css=CONJ_CSS,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
# Helpers
|
||||||
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
def _strip_nikkud(text: str) -> str:
|
||||||
|
return "".join(
|
||||||
|
ch for ch in unicodedata.normalize("NFD", text)
|
||||||
|
if unicodedata.category(ch) != "Mn"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _audio_tag(word_no_nikkud: str) -> str:
    """Return an Anki ``[sound:<name>.mp3]`` tag for a word, or "" if it has no audio.

    The file name is the word reduced to the Hebrew letters U+05D0..U+05EA;
    the tag is produced only when that file is present in AUDIO_DIR.

    Args:
        word_no_nikkud: The word as spelled without vowel points.

    Returns:
        The sound tag, or "" when the word has no Hebrew letters or no file.
    """
    safe = re.sub(r"[^\u05d0-\u05ea]", "", word_no_nikkud)
    if not safe:
        return ""
    mp3_path = AUDIO_DIR / f"{safe}.mp3"
    # is_file() rather than exists(): a directory with this name is not audio.
    return f"[sound:{mp3_path.name}]" if mp3_path.is_file() else ""
|
||||||
|
|
||||||
|
|
||||||
|
_UNRANKED = 999_999  # rank assigned to words missing from the frequency list
_EMPTY_CELL_TEXT = ("nan", "None")  # how str() renders a missing CSV cell


def _cell_text(row, *columns: str) -> str:
    """Return the first of *columns* found in *row* as stripped text.

    A column that is absent, or whose cell renders as "nan"/"None", gives "".
    Only the first column name present in the row is consulted.
    """
    for column in columns:
        if column in row:
            text = str(row[column]).strip()
            return "" if text in _EMPTY_CELL_TEXT else text
    return ""


def build_vocab_deck(
    dict_csv: Path,
    examples_cache: Optional[dict] = None,
    freq_cache: Optional[dict] = None,
    limit: Optional[int] = None,
) -> tuple[genanki.Deck, list[Path]]:
    """
    Build the vocabulary deck from pealim_dict_for_anki.csv (or pealim_dict.csv).

    Args:
        dict_csv: Dictionary CSV, either semicolon- or comma-separated.
        examples_cache: Maps a word without nikkud to a list of example
            sentences; at most two are put on a card.
        freq_cache: Maps a word without nikkud to its frequency rank
            (smaller = more common). Words not in it sort last.
        limit: If truthy, only the first *limit* CSV rows are used.

    Returns (deck, list_of_media_files). Rows lacking a word or a meaning are
    skipped; media files are listed once each, in note order.
    """
    logger.info(f"Loading dictionary from {dict_csv}")
    # Try semicolon separator first (enriched CSV), fall back to comma
    try:
        df = pd.read_csv(dict_csv, sep=";", index_col=0)
        if df.shape[1] < 3:
            raise ValueError("too few columns")
    except Exception as exc:
        # Deliberately broad: any failure of the first attempt just means
        # "retry with the default separator"; a real I/O error resurfaces there.
        logger.debug(f"Semicolon parse rejected ({exc}); retrying with comma")
        df = pd.read_csv(dict_csv, index_col=0)

    if limit:
        df = df.head(limit)

    logger.info(f" {len(df)} rows loaded")

    examples_cache = examples_cache or {}
    freq_cache = freq_cache or {}

    # Sort by frequency rank (ascending) so Anki presents common words first.
    # A list comprehension (not DataFrame.apply) keeps this valid for an empty
    # frame, and the stable sort keeps CSV order among equally ranked words.
    ranks = [
        freq_cache.get(
            _strip_nikkud(_cell_text(row, "Word Without Nikkud", "WordNoNikkud")),
            _UNRANKED,
        )
        for _, row in df.iterrows()
    ]
    df = df.assign(_freq_rank=ranks).sort_values("_freq_rank", kind="stable")

    deck = genanki.Deck(VOCAB_DECK_ID, "Pealim Hebrew Vocabulary")
    media_files: list[Path] = []
    seen_media: set[Path] = set()

    for _, row in df.iterrows():
        word = _cell_text(row, "Word")
        meaning = _cell_text(row, "Meaning")
        # Skip before touching audio so unused rows contribute no media.
        if not word or not meaning:
            continue

        root = _cell_text(row, "Root")
        if root == "-":  # the CSV uses "-" for "no root"
            root = ""
        pos = _cell_text(row, "Part of speech", "Part of Speech")
        word_no_nik = _cell_text(row, "Word Without Nikkud", "WordNoNikkud")
        shared_roots = _cell_text(row, "shared roots", "SharedRoots")
        tags_str = _cell_text(row, "tags", "Tags")

        rank = row["_freq_rank"]
        frequency = str(int(rank)) if rank < _UNRANKED else ""

        # Audio
        audio_tag = _audio_tag(word_no_nik)
        if audio_tag:
            mp3_path = AUDIO_DIR / audio_tag[len("[sound:"):-1]
            if mp3_path not in seen_media:
                seen_media.add(mp3_path)
                media_files.append(mp3_path)

        # Example sentences
        plain_key = _strip_nikkud(word_no_nik)
        examples_list = examples_cache.get(plain_key, examples_cache.get(word_no_nik, []))
        example_html = "<br>".join(examples_list[:2]) if examples_list else ""

        note = genanki.Note(
            model=VOCAB_MODEL,
            fields=[
                word,
                root,
                pos,
                meaning,
                word_no_nik,
                shared_roots,
                tags_str,
                audio_tag,
                example_html,
                frequency,
            ],
            tags=tags_str.split() if tags_str else [],
            # genanki's default GUID hashes every field, so a new example,
            # rank or audio file would import as a duplicate card. Key the
            # note on the dictionary entry itself instead.
            guid=genanki.guid_for(word, root, pos, meaning),
        )
        deck.add_note(note)

    logger.info(f"Vocabulary deck: {len(deck.notes)} notes")
    return deck, media_files
|
||||||
|
|
||||||
|
|
||||||
|
def build_conj_deck(conjugations: dict) -> genanki.Deck:
    """Build the conjugation drill deck from conjugations.json data.

    Args:
        conjugations: Maps an infinitive to a dict that may hold "root",
            "binyan", "reference_form" and "forms". "forms" maps a form key
            to a dict with "form", "pronoun" and "tense".

    Returns:
        A deck with one note per form whose text contains a Hebrew letter.
        Verbs with no data or no forms contribute nothing.
    """
    deck = genanki.Deck(CONJ_DECK_ID, "Pealim Hebrew Conjugations")
    hebrew_letter = re.compile(r"[\u05d0-\u05ea]")
    note_count = 0
    verb_count = 0

    for infinitive, data in conjugations.items():
        forms = data.get("forms") if data else None
        if not forms:
            continue

        # `or` (not a .get default) also covers JSON nulls: genanki joins the
        # fields as strings and fails on None when the package is written.
        root = data.get("root") or ""
        binyan = data.get("binyan") or ""
        ref_form = data.get("reference_form") or infinitive

        added_for_verb = 0
        for form_data in forms.values():
            conj_form = form_data.get("form") or ""
            pronoun = form_data.get("pronoun") or ""
            tense = form_data.get("tense") or ""

            # Placeholder cells (empty, dashes, ...) carry no Hebrew letters.
            if not hebrew_letter.search(conj_form):
                continue

            deck.add_note(
                genanki.Note(
                    model=CONJ_MODEL,
                    fields=[
                        infinitive,
                        ref_form,
                        pronoun,
                        tense,
                        conj_form,
                        root,
                        binyan,
                    ],
                )
            )
            added_for_verb += 1

        if added_for_verb:
            verb_count += 1
            note_count += added_for_verb

    # Count only verbs that actually produced cards.
    logger.info(f"Conjugation deck: {note_count} notes across {verb_count} verbs")
    return deck
|
||||||
|
|
||||||
|
|
||||||
|
def write_vocab_apkg(
    deck: genanki.Deck,
    media_files: list[Path],
    out_path: Path = VOCAB_APKG,
) -> None:
    """Package the vocabulary deck with its media and write it to *out_path*.

    The parent directory of *out_path* is created when missing. Media paths
    that do not exist on disk are left out of the package.
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    existing_media = [str(path) for path in media_files if path.exists()]
    package = genanki.Package(deck)
    package.media_files = existing_media
    package.write_to_file(str(out_path))
    logger.info(f"Vocabulary deck written → {out_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def write_conj_apkg(deck: genanki.Deck, out_path: Path = CONJ_APKG) -> None:
    """Write the conjugation deck (no media) to *out_path*, creating its directory."""
    destination_dir = out_path.parent
    destination_dir.mkdir(parents=True, exist_ok=True)
    package = genanki.Package(deck)
    package.write_to_file(str(out_path))
    logger.info(f"Conjugation deck written → {out_path}")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")

    # Quick self-test: first 20 dictionary rows, without example sentences or
    # frequency data. Audio is attached only for words already in AUDIO_DIR.
    csv_path = DATA_DIR / "pealim_dict_for_anki.csv"
    if not csv_path.exists():
        csv_path = DATA_DIR / "pealim_dict.csv"

    deck, media = build_vocab_deck(csv_path, limit=20)
    write_vocab_apkg(deck, media)

    conj_path = DATA_DIR / "conjugations.json"
    if conj_path.exists():
        # The file holds Hebrew text; do not rely on the platform's default
        # encoding (which is not UTF-8 everywhere).
        with open(conj_path, encoding="utf-8") as f:
            conjugations = json.load(f)
        conj_deck = build_conj_deck(conjugations)
        write_conj_apkg(conj_deck)
|
||||||
160
benyehuda.py
Normal file
160
benyehuda.py
Normal file
|
|
@ -0,0 +1,160 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Ben Yehuda corpus example-sentence lookup.
|
||||||
|
Downloads plaintext-no-nikkud ZIP once, indexes sentences, then answers queries locally.
|
||||||
|
Exposed API: get_examples(word_no_nikkud) -> list[str]
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
|
import zipfile
|
||||||
|
from io import BytesIO
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
CORPUS_URL = (
|
||||||
|
"https://github.com/projectbenyehuda/public_domain_dump/releases/"
|
||||||
|
"download/2025-10/txt_stripped.zip"
|
||||||
|
)
|
||||||
|
INDEX_PATH = Path(__file__).parent / "data" / "benyehuda_index.json"
|
||||||
|
EXAMPLES_CACHE_PATH = Path(__file__).parent / "data" / "examples_cache.json"
|
||||||
|
REQUEST_TIMEOUT = 120
|
||||||
|
MIN_SENTENCE_LEN = 15
|
||||||
|
MAX_EXAMPLES_PER_WORD = 2
|
||||||
|
MAX_INDEX_ENTRIES = 500 # cap examples kept per word in index to limit memory
|
||||||
|
|
||||||
|
# Module-level state
|
||||||
|
_index: dict[str, list[str]] = {} # word -> [sentence, ...]
|
||||||
|
_examples_cache: dict[str, list[str]] = {} # word -> cached result for this run
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_nikkud(text: str) -> str:
|
||||||
|
return "".join(
|
||||||
|
ch for ch in unicodedata.normalize("NFD", text)
|
||||||
|
if unicodedata.category(ch) != "Mn"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _split_sentences(text: str) -> list[str]:
    """Break *text* into trimmed fragments of at least MIN_SENTENCE_LEN characters.

    Fragments are delimited by runs of one to three of ``.``, ``!``, ``?``,
    ``؟`` or newline characters; shorter fragments are discarded.
    """
    fragments = (piece.strip() for piece in re.split(r"[.!?؟\n]{1,3}", text))
    return [fragment for fragment in fragments if len(fragment) >= MIN_SENTENCE_LEN]
|
||||||
|
|
||||||
|
|
||||||
|
def _build_index(corpus_zip_bytes: bytes) -> None:
    """Parse corpus ZIP and build word → sentences index.

    Replaces the module-level ``_index``. Every ``.txt`` member of the archive
    is split into sentences; each sentence is filed under every distinct token
    of two or more characters it contains, up to MAX_INDEX_ENTRIES sentences
    per token.

    Args:
        corpus_zip_bytes: Raw bytes of the corpus ZIP archive.
    """
    global _index
    _index = {}
    logger.info("Building Ben Yehuda index from corpus …")

    # Hebrew letters plus quote characters, so acronyms written with
    # gershayim (letters-"-letter) stay in one piece.
    token_re = re.compile(r"[\u05d0-\u05ea'\"]+")

    with zipfile.ZipFile(BytesIO(corpus_zip_bytes)) as zf:
        txt_files = [n for n in zf.namelist() if n.endswith(".txt")]
        logger.info(f" Corpus contains {len(txt_files)} text files")
        for fname in txt_files:
            try:
                raw = zf.read(fname).decode("utf-8", errors="ignore")
            except Exception as exc:
                # Best effort: one damaged member must not abort the build,
                # but it should not vanish without a trace either.
                logger.warning(f" Skipping unreadable corpus member {fname}: {exc}")
                continue
            for sentence in _split_sentences(raw):
                # Peel quotation marks off the token edges so a quoted word is
                # indexed under the bare word that lookups use. A trailing
                # apostrophe is kept: it can be an abbreviation's geresh.
                tokens = {
                    tok.lstrip("'\"").rstrip('"')
                    for tok in token_re.findall(sentence)
                }
                for tok in tokens:
                    if len(tok) < 2:
                        continue
                    bucket = _index.setdefault(tok, [])
                    if len(bucket) < MAX_INDEX_ENTRIES:
                        bucket.append(sentence)

    logger.info(f"Index built: {len(_index)} unique words")
|
||||||
|
|
||||||
|
|
||||||
|
def _save_index() -> None:
    """Write the in-memory index to INDEX_PATH as UTF-8 JSON, creating its directory."""
    target_dir = INDEX_PATH.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with INDEX_PATH.open("w", encoding="utf-8") as fh:
        json.dump(_index, fh, ensure_ascii=False)
    logger.info(f"Ben Yehuda index saved → {INDEX_PATH}")
|
||||||
|
|
||||||
|
|
||||||
|
def _load_index() -> None:
    """Replace the in-memory index with the JSON stored at INDEX_PATH."""
    global _index
    with INDEX_PATH.open(encoding="utf-8") as fh:
        loaded = json.load(fh)
    _index = loaded
    logger.info(f"Ben Yehuda index loaded: {len(_index)} words")
|
||||||
|
|
||||||
|
|
||||||
|
def load(force_rebuild: bool = False) -> None:
    """Load or build the Ben Yehuda index. Downloads corpus if needed.

    Does nothing when an index is already in memory, unless *force_rebuild*
    is set. Otherwise the persisted examples cache is read (if present), then
    the index is taken from INDEX_PATH when that file exists, or else built
    from a fresh corpus download and saved.

    Args:
        force_rebuild: Ignore any in-memory or on-disk index and rebuild it
            from a fresh download.

    Raises:
        requests.RequestException: If the corpus download fails.
    """
    global _index, _examples_cache
    if _index and not force_rebuild:
        return

    # Load persisted examples cache
    # NOTE(review): this also runs on force_rebuild, so cached examples taken
    # from the previous index keep being served — confirm that is intended.
    if EXAMPLES_CACHE_PATH.exists():
        try:
            with open(EXAMPLES_CACHE_PATH, encoding="utf-8") as f:
                _examples_cache = json.load(f)
        except json.JSONDecodeError as exc:
            # The cache is regenerable; a truncated file must not block the run.
            logger.warning(f"Ignoring corrupt examples cache {EXAMPLES_CACHE_PATH}: {exc}")
            _examples_cache = {}

    if INDEX_PATH.exists() and not force_rebuild:
        _load_index()
        return

    logger.info("Downloading Ben Yehuda corpus … (this may take 1-2 minutes)")
    # The whole body is needed in memory for zipfile, so streaming buys
    # nothing; the context manager returns the connection to the pool.
    with requests.get(CORPUS_URL, timeout=REQUEST_TIMEOUT) as resp:
        resp.raise_for_status()
        data = resp.content
    logger.info(f"Corpus downloaded: {len(data) / 1e6:.1f} MB")

    _build_index(data)
    _save_index()
|
||||||
|
|
||||||
|
|
||||||
|
def save_examples_cache() -> None:
    """Persist the per-word examples cache to EXAMPLES_CACHE_PATH as UTF-8 JSON."""
    cache_dir = EXAMPLES_CACHE_PATH.parent
    cache_dir.mkdir(parents=True, exist_ok=True)
    with EXAMPLES_CACHE_PATH.open("w", encoding="utf-8") as fh:
        json.dump(_examples_cache, fh, ensure_ascii=False)
    logger.info(f"Examples cache saved: {len(_examples_cache)} entries → {EXAMPLES_CACHE_PATH}")
|
||||||
|
|
||||||
|
|
||||||
|
def get_examples(word_no_nikkud: str) -> list[str]:
    """
    Return the shortest indexed sentences that contain the word as a whole token.

    At most MAX_EXAMPLES_PER_WORD sentences are returned. The index is loaded
    on first use, and results are cached per word so repeated lookups skip the
    search.

    Args:
        word_no_nikkud: The word to look up; surrounding whitespace and any
            combining marks are removed before the lookup.

    Returns:
        A new list of sentences (possibly empty), shortest first.
    """
    if not _index:
        load()

    word = _strip_nikkud(word_no_nikkud.strip())
    if not word:
        # Nothing to look up; also keeps an empty key out of the saved cache.
        return []

    cached = _examples_cache.get(word)
    if cached is not None:
        # Hand out a copy so callers cannot alter the cached entry.
        return list(cached)

    candidates = _index.get(word, [])
    # Whole-token filter: the word may not touch a word character on either
    # side. Equivalent to the former (?<![^\s\W]) ... (?![^\s\W]) lookarounds.
    whole_word = re.compile(r"(?<!\w)" + re.escape(word) + r"(?!\w)")
    # Shorter sentences first; sorted() is stable, so ties keep index order.
    matched = sorted((s for s in candidates if whole_word.search(s)), key=len)

    result = matched[:MAX_EXAMPLES_PER_WORD]
    _examples_cache[word] = result
    return list(result)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    load()
    # Smoke test: look up a few common words and print what was found.
    for sample in ("שלום", "בית", "ספר", "מים", "אהבה", "ילד"):
        found = get_examples(sample)
        print(f"\n{sample}: {len(found)} examples")
        for sentence in found:
            print(f" → {sentence[:80]}")
    save_examples_cache()
|
||||||
|
|
@ -1,153 +1,408 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
Extract Hebrew verb conjugations from pealim.com.
|
Extract Hebrew verb conjugations from pealim.com.
|
||||||
Scrapes conjugation tables for specific verbs.
|
Input: verbs_input.txt (one Hebrew infinitive per line)
|
||||||
|
Output: data/conjugations.json
|
||||||
|
|
||||||
|
For each verb:
|
||||||
|
1. Search pealim.com/search/?q=<verb> to find URL slug
|
||||||
|
2. Fetch /dict/<slug>/ with hebstyle=mo cookie
|
||||||
|
3. Parse conjugation table by row labels
|
||||||
|
|
||||||
|
Resume-safe: verbs already in conjugations.json are skipped.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import requests
|
import json
|
||||||
import pandas as pd
|
|
||||||
import numpy as np
|
|
||||||
import logging
|
import logging
|
||||||
|
import re
|
||||||
import time
|
import time
|
||||||
|
import urllib.parse
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
# Configure logging
|
|
||||||
logging.basicConfig(
|
|
||||||
level=logging.INFO,
|
|
||||||
format='%(asctime)s - %(levelname)s - %(message)s'
|
|
||||||
)
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Session for connection pooling
|
PEALIM_BASE = "https://www.pealim.com"
|
||||||
|
REQUEST_DELAY = 1.5
|
||||||
|
REQUEST_TIMEOUT = 15
|
||||||
|
VERBS_INPUT = Path(__file__).parent / "verbs_input.txt"
|
||||||
|
CONJUGATIONS_PATH = Path(__file__).parent / "data" / "conjugations.json"
|
||||||
|
|
||||||
|
# Pronoun labels (for card front display)
|
||||||
|
PRONOUN_LABELS = {
|
||||||
|
"present_ms": "",
|
||||||
|
"present_fs": "",
|
||||||
|
"present_mp": "",
|
||||||
|
"present_fp": "",
|
||||||
|
"past_1s": "אֲנִי",
|
||||||
|
"past_1p": "אֲנַחְנוּ",
|
||||||
|
"past_2ms": "אַתָּה",
|
||||||
|
"past_2fs": "אַתְּ",
|
||||||
|
"past_2mp": "אַתֶּם",
|
||||||
|
"past_2fp": "אַתֶּן",
|
||||||
|
"past_3ms": "הוּא",
|
||||||
|
"past_3fs": "הִיא",
|
||||||
|
"past_3p": "הֵם / הֵן",
|
||||||
|
"future_1s": "אֲנִי",
|
||||||
|
"future_1p": "אֲנַחְנוּ",
|
||||||
|
"future_2ms": "אַתָּה",
|
||||||
|
"future_2fs": "אַתְּ",
|
||||||
|
"future_2mp": "אַתֶּם",
|
||||||
|
"future_2fp": "אַתֶּן",
|
||||||
|
"future_3ms": "הוּא",
|
||||||
|
"future_3fs": "הִיא",
|
||||||
|
"future_3mp": "הֵם",
|
||||||
|
"future_3fp": "הֵן",
|
||||||
|
"imperative_ms": "אַתָּה",
|
||||||
|
"imperative_fs": "אַתְּ",
|
||||||
|
"imperative_mp": "אַתֶּם",
|
||||||
|
"imperative_fp": "אַתֶּן",
|
||||||
|
"infinitive": "",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Human-readable tense description for card front
|
||||||
|
TENSE_DESCRIPTION = {
|
||||||
|
"present_ms": "הוֹוֶה (זכר יחיד)",
|
||||||
|
"present_fs": "הוֹוֶה (נקבה יחיד)",
|
||||||
|
"present_mp": "הוֹוֶה (זכר רבים)",
|
||||||
|
"present_fp": "הוֹוֶה (נקבה רבים)",
|
||||||
|
"past_1s": "עָבָר",
|
||||||
|
"past_1p": "עָבָר",
|
||||||
|
"past_2ms": "עָבָר",
|
||||||
|
"past_2fs": "עָבָר",
|
||||||
|
"past_2mp": "עָבָר",
|
||||||
|
"past_2fp": "עָבָר",
|
||||||
|
"past_3ms": "עָבָר",
|
||||||
|
"past_3fs": "עָבָר",
|
||||||
|
"past_3p": "עָבָר",
|
||||||
|
"future_1s": "עָתִיד",
|
||||||
|
"future_1p": "עָתִיד",
|
||||||
|
"future_2ms": "עָתִיד",
|
||||||
|
"future_2fs": "עָתִיד",
|
||||||
|
"future_2mp": "עָתִיד",
|
||||||
|
"future_2fp": "עָתִיד",
|
||||||
|
"future_3ms": "עָתִיד",
|
||||||
|
"future_3fs": "עָתִיד",
|
||||||
|
"future_3mp": "עָתִיד",
|
||||||
|
"future_3fp": "עָתִיד",
|
||||||
|
"imperative_ms": "צִוּוּי",
|
||||||
|
"imperative_fs": "צִוּוּי",
|
||||||
|
"imperative_mp": "צִוּוּי",
|
||||||
|
"imperative_fp": "צִוּוּי",
|
||||||
|
"infinitive": "מְקוֹר",
|
||||||
|
}
|
||||||
|
|
||||||
session = requests.Session()
|
session = requests.Session()
|
||||||
session.headers.update({
|
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki/2.0)"})
|
||||||
'User-Agent': 'Mozilla/5.0 (compatible; pealim-scraper/1.0)'
|
|
||||||
})
|
|
||||||
|
|
||||||
PEALIM_BASE_URL = "https://www.pealim.com/dict"
|
|
||||||
REQUEST_TIMEOUT = 10
|
|
||||||
REQUEST_DELAY = 1.0 # seconds between requests (respectful scraping)
|
|
||||||
|
|
||||||
# Conjugation column order (standard Hebrew verb forms)
|
|
||||||
CONJUGATION_COLUMNS = [
|
|
||||||
'present_ms', 'present_fs', 'present_mp', 'present_fp',
|
|
||||||
'past_1s', 'past_1p', 'past_2ms', 'past_2fs', 'past_2mp', 'past_2fp',
|
|
||||||
'past_3ms', 'past_3fs', 'past_3p',
|
|
||||||
'future_1s', 'future_1p', 'future_2ms', 'future_2fs', 'future_2mp', 'future_2fp',
|
|
||||||
'future_3ms', 'future_3fs', 'future_3mp', 'future_3fp',
|
|
||||||
'imperative_ms', 'imperative_fs', 'imperative_mp', 'imperative_fp',
|
|
||||||
'infinitive'
|
|
||||||
]
|
|
||||||
|
|
||||||
|
|
||||||
def extract_verb(url_suffix: str, max_retries: int = 3) -> pd.DataFrame:
|
def _find_slug(infinitive: str) -> str | None:
|
||||||
"""
|
"""Search pealim.com/search/?q=<verb> and return the URL slug."""
|
||||||
Extract conjugation table for a single verb.
|
url = f"{PEALIM_BASE}/search/?q={urllib.parse.quote(infinitive)}"
|
||||||
|
|
||||||
Args:
|
|
||||||
url_suffix: URL suffix (e.g., '2255-lishmor', '860-lishon')
|
|
||||||
max_retries: Maximum retry attempts on failure
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
DataFrame with conjugation forms, or None if extraction fails
|
|
||||||
"""
|
|
||||||
url = f"{PEALIM_BASE_URL}/{url_suffix}"
|
|
||||||
|
|
||||||
for attempt in range(max_retries):
|
|
||||||
try:
|
|
||||||
logger.info(f"Fetching: {url} (attempt {attempt + 1}/{max_retries})")
|
|
||||||
|
|
||||||
cookies = {
|
|
||||||
'translit': 'none',
|
|
||||||
'hebstyle': 'bp',
|
|
||||||
'showmeaning': 'off'
|
|
||||||
}
|
|
||||||
|
|
||||||
response = session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
|
|
||||||
response.raise_for_status()
|
|
||||||
|
|
||||||
# Parse HTML table
|
|
||||||
dfs = pd.read_html(response.content)
|
|
||||||
if not dfs:
|
|
||||||
logger.warning(f"No tables found for {url_suffix}")
|
|
||||||
return None
|
|
||||||
|
|
||||||
df = dfs[0]
|
|
||||||
|
|
||||||
# Extract conjugation forms (skip header columns, flatten)
|
|
||||||
# Adjust indices based on actual table structure
|
|
||||||
np_flat = df.iloc[:, 2:].values.flatten()
|
|
||||||
|
|
||||||
# Remove NaN and invalid entries
|
|
||||||
np_flat = np.delete(np_flat, [5, 7, 15, 17, 19, 33, 34, 35])
|
|
||||||
|
|
||||||
# Create DataFrame with proper column names
|
|
||||||
df_result = pd.DataFrame([np_flat], columns=CONJUGATION_COLUMNS)
|
|
||||||
logger.info(f"✓ Extracted {url_suffix}")
|
|
||||||
|
|
||||||
return df_result
|
|
||||||
|
|
||||||
except requests.RequestException as e:
|
|
||||||
logger.error(f"Network error for {url_suffix} (attempt {attempt + 1}): {e}")
|
|
||||||
if attempt < max_retries - 1:
|
|
||||||
time.sleep(2 ** attempt) # Exponential backoff
|
|
||||||
else:
|
|
||||||
return None
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"Error parsing {url_suffix}: {e}")
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def extract_from_website(url_suffixes: list = None) -> pd.DataFrame:
|
|
||||||
"""
|
|
||||||
Extract conjugations for multiple verbs.
|
|
||||||
|
|
||||||
Args:
|
|
||||||
url_suffixes: List of URL suffixes to process
|
|
||||||
|
|
||||||
Returns:
|
|
||||||
Combined DataFrame with all conjugations
|
|
||||||
"""
|
|
||||||
if url_suffixes is None:
|
|
||||||
# Default verbs: "to guard" and "to sleep"
|
|
||||||
url_suffixes = ['2255-lishmor', '860-lishon']
|
|
||||||
|
|
||||||
logger.info(f"Starting extraction for {len(url_suffixes)} verb(s)...")
|
|
||||||
|
|
||||||
all_dfs = []
|
|
||||||
for url_suffix in url_suffixes:
|
|
||||||
df = extract_verb(url_suffix)
|
|
||||||
if df is not None:
|
|
||||||
all_dfs.append(df)
|
|
||||||
time.sleep(0.5) # Small delay between requests
|
|
||||||
|
|
||||||
if not all_dfs:
|
|
||||||
logger.error("No data extracted!")
|
|
||||||
return pd.DataFrame()
|
|
||||||
|
|
||||||
combined_df = pd.concat(all_dfs, ignore_index=True)
|
|
||||||
logger.info(f"Extraction complete. Total verbs: {len(combined_df)}")
|
|
||||||
|
|
||||||
return combined_df
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
"""Main entry point."""
|
|
||||||
try:
|
try:
|
||||||
df = extract_from_website()
|
resp = session.get(url, timeout=REQUEST_TIMEOUT)
|
||||||
|
resp.raise_for_status()
|
||||||
if df.empty:
|
# Slugs look like /dict/2255-lishmor/
|
||||||
logger.error("No data to save!")
|
slugs = re.findall(r"/dict/(\d+[-][^/?\"<>\s]+)/", resp.text)
|
||||||
return
|
if slugs:
|
||||||
|
slug = slugs[0]
|
||||||
df.to_csv('conjugations.csv', sep=';', index=True)
|
logger.info(f" Slug: {slug}")
|
||||||
logger.info("Saved: conjugations.csv")
|
return slug
|
||||||
logger.info("\n" + df.to_string())
|
|
||||||
logger.info("✅ Complete!")
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Fatal error: {e}")
|
logger.error(f" Error searching for '{infinitive}': {e}")
|
||||||
raise
|
return None
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
def _is_passive_binyan(binyan: str) -> bool:
|
||||||
main()
|
for marker in ["פֻּעַל", "הֻפְעַל", "Pu'al", "Huf'al", "pual", "hufal"]:
|
||||||
|
if marker.lower() in binyan.lower():
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _get_menukad(cell) -> str:
    """Return the Hebrew text of a table cell, or "" when it has none.

    The text of the first ``<span class="menukad">`` inside the cell is
    preferred. Without such a span, the cell's whole text is returned, but
    only if it contains at least one Hebrew letter (U+05D0..U+05EA).
    """
    marked = cell.find("span", class_="menukad")
    if not marked:
        # fallback: any Hebrew text in cell
        plain = cell.get_text(strip=True)
        return plain if re.search(r"[\u05d0-\u05ea]", plain) else ""
    return marked.get_text(strip=True)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_table(soup: BeautifulSoup) -> dict[str, str]:
    """
    Parse the pealim conjugation table and return form_key -> Hebrew form mapping.

    Table structure (rows after two header rows):
      Row 2 (Present):  [label x2] [ms] [fs] [mp] [fp]
      Row 3 (Past 1):   [Past x1] [1st x1] [1s x2] [1p x2]
      Row 4 (Past 2):   [2nd x1] [2ms] [2fs] [2mp] [2fp]
      Row 5 (Past 3):   [3rd x1] [3ms] [3fs] [3p x2]
      Row 6 (Fut 1):    [Future x1] [1st x1] [1s x2] [1p x2]
      Row 7 (Fut 2):    [2nd x1] [2ms] [2fs] [2mp] [2fp]
      Row 8 (Fut 3):    [3rd x1] [3ms] [3fs] [3mp] [3fp]
      Row 9 (Imp):      [Imp x2] [ms] [fs] [mp] [fp]
      Row 10 (Inf):     [Inf x2] [form x4]

    Rows are located by their English tense label rather than by fixed index;
    the 2nd- and 3rd-person rows are taken to be the two rows following the
    labelled past/future row.

    Args:
        soup: Parsed HTML of a verb page.

    Returns:
        Mapping such as ``{"present_ms": ..., "past_1s": ..., "infinitive": ...}``.
        Keys whose cell is missing are simply absent. An empty dict is returned
        when there is no ``table.conjugation-table`` or it has fewer than 9 rows.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    rows = table.find_all("tr")
    if len(rows) < 9:
        return {}

    forms: dict[str, str] = {}

    def hebrew_cells(row_idx: int) -> list[str]:
        """Hebrew texts of a row's cells (label cells skipped), repeated per colspan."""
        found = []
        for cell in rows[row_idx].find_all(["th", "td"]):
            txt = _get_menukad(cell)
            colspan = int(cell.get("colspan", 1))
            if txt and re.search(r"[\u05d0-\u05ea]", txt):
                found.extend([txt] * colspan)
        return found

    def distinct(values: list[str]) -> list[str]:
        """Drop every repeated value (not only adjacent ones), keeping first-seen order."""
        return list(dict.fromkeys(values))

    def assign(keys, values) -> None:
        """Store values under keys pairwise; surplus keys or values are ignored."""
        for key, value in zip(keys, values):
            if value:
                forms[key] = value

    # Find rows by tense label: the first row mentioning each tense wins, and a
    # row is claimed by at most one tense (checked in this order).
    tense_row = {"present": -1, "past": -1, "future": -1, "imperative": -1, "infinitive": -1}
    for i, row in enumerate(rows):
        label = row.get_text(" ", strip=True).lower()
        for tense, found_at in tense_row.items():
            if found_at < 0 and tense in label:
                tense_row[tense] = i
                break

    # Present tense (4 forms: ms fs mp fp)
    present_row = tense_row["present"]
    if present_row >= 0:
        assign(
            ("present_ms", "present_fs", "present_mp", "present_fp"),
            hebrew_cells(present_row),
        )

    # Past tense (rows: 1st person, 2nd person, 3rd person)
    past_row = tense_row["past"]
    if past_row >= 0:
        # 1st person: 1s and 1p each span two columns, so collapse the repeats.
        assign(("past_1s", "past_1p"), distinct(hebrew_cells(past_row)))
        if past_row + 1 < len(rows):
            assign(
                ("past_2ms", "past_2fs", "past_2mp", "past_2fp"),
                hebrew_cells(past_row + 1),
            )
        if past_row + 2 < len(rows):
            # 3ms, 3fs, 3p (3p has colspan=2 so it appears twice before collapsing)
            assign(
                ("past_3ms", "past_3fs", "past_3p"),
                distinct(hebrew_cells(past_row + 2)),
            )

    # Future tense (same three-row layout; the 3rd-person row has 4 separate cells)
    future_row = tense_row["future"]
    if future_row >= 0:
        assign(("future_1s", "future_1p"), distinct(hebrew_cells(future_row)))
        if future_row + 1 < len(rows):
            assign(
                ("future_2ms", "future_2fs", "future_2mp", "future_2fp"),
                hebrew_cells(future_row + 1),
            )
        if future_row + 2 < len(rows):
            assign(
                ("future_3ms", "future_3fs", "future_3mp", "future_3fp"),
                hebrew_cells(future_row + 2),
            )

    # Imperative
    imp_row = tense_row["imperative"]
    if imp_row >= 0:
        assign(
            ("imperative_ms", "imperative_fs", "imperative_mp", "imperative_fp"),
            hebrew_cells(imp_row),
        )

    # Infinitive: only the first Hebrew cell of the row is used.
    inf_row = tense_row["infinitive"]
    if inf_row >= 0:
        assign(("infinitive",), hebrew_cells(inf_row))

    return forms
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_conjugations(slug: str, infinitive: str) -> dict | None:
|
||||||
|
"""Fetch /dict/<slug>/ and parse conjugation table."""
|
||||||
|
url = f"{PEALIM_BASE}/dict/{slug}/"
|
||||||
|
try:
|
||||||
|
resp = session.get(url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
|
||||||
|
resp.raise_for_status()
|
||||||
|
except Exception as e:
|
||||||
|
logger.error(f" Error fetching {url}: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
soup = BeautifulSoup(resp.text, "lxml")
|
||||||
|
|
||||||
|
# Extract root from menukad span in header
|
||||||
|
root = ""
|
||||||
|
for span in soup.find_all("span", class_="menukad"):
|
||||||
|
txt = span.get_text(strip=True)
|
||||||
|
if txt and re.search(r"[\u05d0-\u05ea]", txt) and "-" in txt:
|
||||||
|
root = txt
|
||||||
|
break
|
||||||
|
|
||||||
|
# Extract binyan / verb type from lead text or title
|
||||||
|
binyan = ""
|
||||||
|
meta = soup.find("meta", {"property": "og:description"})
|
||||||
|
if meta:
|
||||||
|
desc = meta.get("content", "")
|
||||||
|
for bname in ["Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al"]:
|
||||||
|
if bname in desc:
|
||||||
|
binyan = bname
|
||||||
|
break
|
||||||
|
|
||||||
|
forms = _parse_table(soup)
|
||||||
|
|
||||||
|
if not forms:
|
||||||
|
logger.warning(f" No forms found for {slug}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
is_passive = _is_passive_binyan(binyan)
|
||||||
|
reference_form = forms.get("infinitive", infinitive) if not is_passive else forms.get("past_3ms", infinitive)
|
||||||
|
|
||||||
|
result = {
|
||||||
|
"infinitive": infinitive,
|
||||||
|
"slug": slug,
|
||||||
|
"root": root,
|
||||||
|
"binyan": binyan,
|
||||||
|
"is_passive": is_passive,
|
||||||
|
"reference_form": reference_form,
|
||||||
|
"forms": {},
|
||||||
|
}
|
||||||
|
for key, form in forms.items():
|
||||||
|
if key in PRONOUN_LABELS:
|
||||||
|
result["forms"][key] = {
|
||||||
|
"form": form,
|
||||||
|
"pronoun": PRONOUN_LABELS[key],
|
||||||
|
"tense": TENSE_DESCRIPTION.get(key, ""),
|
||||||
|
}
|
||||||
|
|
||||||
|
logger.info(f" Extracted {len(result['forms'])} forms for {infinitive}")
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def _load_conjugations() -> dict:
    """Return the cached conjugations read from CONJUGATIONS_PATH, or {} if the file is absent."""
    if not CONJUGATIONS_PATH.exists():
        return {}
    with open(CONJUGATIONS_PATH, encoding="utf-8") as cache_file:
        return json.load(cache_file)
|
||||||
|
|
||||||
|
|
||||||
|
def _save_conjugations(data: dict) -> None:
    """Write *data* to CONJUGATIONS_PATH as indented UTF-8 JSON.

    The parent directory is created if needed. The JSON is written to a
    sibling ``<name>.tmp`` file and then renamed over the target, so an
    interruption mid-write cannot leave a truncated cache behind.

    Args:
        data: The full conjugations mapping to persist.
    """
    CONJUGATIONS_PATH.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = CONJUGATIONS_PATH.with_name(CONJUGATIONS_PATH.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
    # Path.replace overwrites an existing destination file.
    tmp_path.replace(CONJUGATIONS_PATH)
|
||||||
|
|
||||||
|
|
||||||
|
def main(verbs_file: Path = VERBS_INPUT) -> dict:
    """Read verbs from file and extract conjugations. Returns full conjugations dict.

    The input file holds one verb per line; blank lines and lines whose first
    non-blank character is ``#`` are ignored. Verbs that already have cached
    data are skipped. A verb whose cached value is ``None`` (an earlier lookup
    or fetch failed) is tried again, because those failures can be transient.
    The cache is saved after every attempt.

    Args:
        verbs_file: Path of the verb list.

    Returns:
        Mapping of verb -> extracted data (``None`` for verbs that failed).
        When *verbs_file* does not exist, the cached mapping is returned as is.
    """
    if not verbs_file.exists():
        logger.warning(f"verbs_input.txt not found at {verbs_file} — skipping")
        return _load_conjugations()

    # Strip before testing for "#", so indented comment lines are ignored too.
    stripped_lines = (
        line.strip() for line in verbs_file.read_text(encoding="utf-8").splitlines()
    )
    verbs = [v for v in stripped_lines if v and not v.startswith("#")]
    logger.info(f"Loaded {len(verbs)} verbs from {verbs_file}")

    conjugations = _load_conjugations()
    new_count = 0

    for verb in verbs:
        # Only real data counts as cached; a stored None is retried.
        if conjugations.get(verb):
            logger.info(f"Skipping {verb} (cached)")
            continue

        logger.info(f"Processing: {verb}")
        time.sleep(REQUEST_DELAY)
        slug = _find_slug(verb)
        if not slug:
            logger.warning(f" No slug found for {verb}")
            conjugations[verb] = None
            _save_conjugations(conjugations)
            continue

        time.sleep(REQUEST_DELAY)
        data = _extract_conjugations(slug, verb)
        # Failures are still recorded as None so the verb shows up in the result.
        conjugations[verb] = data
        _save_conjugations(conjugations)
        if data is not None:
            new_count += 1

    logger.info(f"Done: {new_count} new verbs processed")
    return conjugations
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: run the extraction, then print a short summary per verb.
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
    result = main()
    for verb, data in result.items():
        if not data:
            print(f"{verb}: no data")
            continue
        forms = data.get("forms", {})
        print(f"{verb}: {len(forms)} forms, binyan={data.get('binyan')}")
        # Show only the first three forms as a sample.
        for k, v in list(forms.items())[:3]:
            print(f" {k}: {v['form']}")
|
||||||
|
|
|
||||||
903
data/conjugations.json
Normal file
903
data/conjugations.json
Normal file
|
|
@ -0,0 +1,903 @@
|
||||||
|
{
|
||||||
|
"לִשְׁמוֹר": {
|
||||||
|
"infinitive": "לִשְׁמוֹר",
|
||||||
|
"slug": "2255-lishmor",
|
||||||
|
"root": "שׁ - מ - ר",
|
||||||
|
"binyan": "",
|
||||||
|
"is_passive": false,
|
||||||
|
"reference_form": "לִשְׁמֹר",
|
||||||
|
"forms": {
|
||||||
|
"present_ms": {
|
||||||
|
"form": "שׁוֹמֵר",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (זכר יחיד)"
|
||||||
|
},
|
||||||
|
"present_fs": {
|
||||||
|
"form": "שׁוֹמֶרֶת",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (נקבה יחיד)"
|
||||||
|
},
|
||||||
|
"present_mp": {
|
||||||
|
"form": "שׁוֹמְרִים",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (זכר רבים)"
|
||||||
|
},
|
||||||
|
"present_fp": {
|
||||||
|
"form": "שׁוֹמְרוֹת",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (נקבה רבים)"
|
||||||
|
},
|
||||||
|
"past_1s": {
|
||||||
|
"form": "שָׁמַרְתִּי",
|
||||||
|
"pronoun": "אֲנִי",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_1p": {
|
||||||
|
"form": "שָׁמַרְנוּ",
|
||||||
|
"pronoun": "אֲנַחְנוּ",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2ms": {
|
||||||
|
"form": "שָׁמַרְתָּ",
|
||||||
|
"pronoun": "אַתָּה",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2fs": {
|
||||||
|
"form": "שָׁמַרְתְּ",
|
||||||
|
"pronoun": "אַתְּ",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2mp": {
|
||||||
|
"form": "שְׁמַרְתֶּם",
|
||||||
|
"pronoun": "אַתֶּם",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2fp": {
|
||||||
|
"form": "שְׁמַרְתֶּן",
|
||||||
|
"pronoun": "אַתֶּן",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_3ms": {
|
||||||
|
"form": "שָׁמַר",
|
||||||
|
"pronoun": "הוּא",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_3fs": {
|
||||||
|
"form": "שָׁמְרָה",
|
||||||
|
"pronoun": "הִיא",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_3p": {
|
||||||
|
"form": "שָׁמְרוּ",
|
||||||
|
"pronoun": "הֵם / הֵן",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"future_1s": {
|
||||||
|
"form": "אֶשְׁמֹר",
|
||||||
|
"pronoun": "אֲנִי",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_1p": {
|
||||||
|
"form": "נִשְׁמֹר",
|
||||||
|
"pronoun": "אֲנַחְנוּ",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2ms": {
|
||||||
|
"form": "תִּשְׁמֹר",
|
||||||
|
"pronoun": "אַתָּה",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2fs": {
|
||||||
|
"form": "תִּשְׁמְרִי",
|
||||||
|
"pronoun": "אַתְּ",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2mp": {
|
||||||
|
"form": "תִּשְׁמְרוּ",
|
||||||
|
"pronoun": "אַתֶּם",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2fp": {
|
||||||
|
"form": "תִּשְׁמֹרְנָה",
|
||||||
|
"pronoun": "אַתֶּן",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3ms": {
|
||||||
|
"form": "יִשְׁמֹר",
|
||||||
|
"pronoun": "הוּא",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3fs": {
|
||||||
|
"form": "תִּשְׁמֹר",
|
||||||
|
"pronoun": "הִיא",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3mp": {
|
||||||
|
"form": "יִשְׁמְרוּ",
|
||||||
|
"pronoun": "הֵם",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3fp": {
|
||||||
|
"form": "תִּשְׁמֹרְנָה",
|
||||||
|
"pronoun": "הֵן",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"imperative_ms": {
|
||||||
|
"form": "שְׁמֹר!",
|
||||||
|
"pronoun": "אַתָּה",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"imperative_fs": {
|
||||||
|
"form": "שִׁמְרִי!",
|
||||||
|
"pronoun": "אַתְּ",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"imperative_mp": {
|
||||||
|
"form": "שִׁמְרוּ!",
|
||||||
|
"pronoun": "אַתֶּם",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"imperative_fp": {
|
||||||
|
"form": "שְׁמֹרְנָה!",
|
||||||
|
"pronoun": "אַתֶּן",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"infinitive": {
|
||||||
|
"form": "לִשְׁמֹר",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "מְקוֹר"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"לְהִשָּׁמֵר": {
|
||||||
|
"infinitive": "לְהִשָּׁמֵר",
|
||||||
|
"slug": "2256-lehishamer",
|
||||||
|
"root": "שׁ - מ - ר",
|
||||||
|
"binyan": "",
|
||||||
|
"is_passive": false,
|
||||||
|
"reference_form": "לְהִשָּׁמֵר",
|
||||||
|
"forms": {
|
||||||
|
"present_ms": {
|
||||||
|
"form": "נִשְׁמָר",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (זכר יחיד)"
|
||||||
|
},
|
||||||
|
"present_fs": {
|
||||||
|
"form": "נִשְׁמֶרֶת",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (נקבה יחיד)"
|
||||||
|
},
|
||||||
|
"present_mp": {
|
||||||
|
"form": "נִשְׁמָרִים",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (זכר רבים)"
|
||||||
|
},
|
||||||
|
"present_fp": {
|
||||||
|
"form": "נִשְׁמָרוֹת",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (נקבה רבים)"
|
||||||
|
},
|
||||||
|
"past_1s": {
|
||||||
|
"form": "נִשְׁמַרְתִּי",
|
||||||
|
"pronoun": "אֲנִי",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_1p": {
|
||||||
|
"form": "נִשְׁמַרְנוּ",
|
||||||
|
"pronoun": "אֲנַחְנוּ",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2ms": {
|
||||||
|
"form": "נִשְׁמַרְתָּ",
|
||||||
|
"pronoun": "אַתָּה",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2fs": {
|
||||||
|
"form": "נִשְׁמַרְתְּ",
|
||||||
|
"pronoun": "אַתְּ",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2mp": {
|
||||||
|
"form": "נִשְׁמַרְתֶּם",
|
||||||
|
"pronoun": "אַתֶּם",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2fp": {
|
||||||
|
"form": "נִשְׁמַרְתֶּן",
|
||||||
|
"pronoun": "אַתֶּן",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_3ms": {
|
||||||
|
"form": "נִשְׁמַר",
|
||||||
|
"pronoun": "הוּא",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_3fs": {
|
||||||
|
"form": "נִשְׁמְרָה",
|
||||||
|
"pronoun": "הִיא",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_3p": {
|
||||||
|
"form": "נִשְׁמְרוּ",
|
||||||
|
"pronoun": "הֵם / הֵן",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"future_1s": {
|
||||||
|
"form": "אֶשָּׁמֵר",
|
||||||
|
"pronoun": "אֲנִי",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_1p": {
|
||||||
|
"form": "נִשָּׁמֵר",
|
||||||
|
"pronoun": "אֲנַחְנוּ",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2ms": {
|
||||||
|
"form": "תִּשָּׁמֵר",
|
||||||
|
"pronoun": "אַתָּה",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2fs": {
|
||||||
|
"form": "תִּשָּׁמְרִי",
|
||||||
|
"pronoun": "אַתְּ",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2mp": {
|
||||||
|
"form": "תִּשָּׁמְרוּ",
|
||||||
|
"pronoun": "אַתֶּם",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2fp": {
|
||||||
|
"form": "תִּשָּׁמַרְנָה",
|
||||||
|
"pronoun": "אַתֶּן",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3ms": {
|
||||||
|
"form": "יִשָּׁמֵר",
|
||||||
|
"pronoun": "הוּא",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3fs": {
|
||||||
|
"form": "תִּשָּׁמֵר",
|
||||||
|
"pronoun": "הִיא",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3mp": {
|
||||||
|
"form": "יִשָּׁמְרוּ",
|
||||||
|
"pronoun": "הֵם",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3fp": {
|
||||||
|
"form": "תִּשָּׁמַרְנָה",
|
||||||
|
"pronoun": "הֵן",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"imperative_ms": {
|
||||||
|
"form": "הִשָּׁמֵר!",
|
||||||
|
"pronoun": "אַתָּה",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"imperative_fs": {
|
||||||
|
"form": "הִשָּׁמְרִי!",
|
||||||
|
"pronoun": "אַתְּ",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"imperative_mp": {
|
||||||
|
"form": "הִשָּׁמְרוּ!",
|
||||||
|
"pronoun": "אַתֶּם",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"imperative_fp": {
|
||||||
|
"form": "הִשָּׁמַרְנָה!",
|
||||||
|
"pronoun": "אַתֶּן",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"infinitive": {
|
||||||
|
"form": "לְהִשָּׁמֵר",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "מְקוֹר"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"לְדַבֵּר": {
|
||||||
|
"infinitive": "לְדַבֵּר",
|
||||||
|
"slug": "2-ledaber",
|
||||||
|
"root": "ד - ב - ר",
|
||||||
|
"binyan": "",
|
||||||
|
"is_passive": false,
|
||||||
|
"reference_form": "לְדַבֵּר",
|
||||||
|
"forms": {
|
||||||
|
"present_ms": {
|
||||||
|
"form": "מְדַבֵּר",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (זכר יחיד)"
|
||||||
|
},
|
||||||
|
"present_fs": {
|
||||||
|
"form": "מְדַבֶּרֶת",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (נקבה יחיד)"
|
||||||
|
},
|
||||||
|
"present_mp": {
|
||||||
|
"form": "מְדַבְּרִים",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (זכר רבים)"
|
||||||
|
},
|
||||||
|
"present_fp": {
|
||||||
|
"form": "מְדַבְּרוֹת",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (נקבה רבים)"
|
||||||
|
},
|
||||||
|
"past_1s": {
|
||||||
|
"form": "דִּבַּרְתִּי",
|
||||||
|
"pronoun": "אֲנִי",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_1p": {
|
||||||
|
"form": "דִּבַּרְנוּ",
|
||||||
|
"pronoun": "אֲנַחְנוּ",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2ms": {
|
||||||
|
"form": "דִּבַּרְתָּ",
|
||||||
|
"pronoun": "אַתָּה",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2fs": {
|
||||||
|
"form": "דִּבַּרְתְּ",
|
||||||
|
"pronoun": "אַתְּ",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2mp": {
|
||||||
|
"form": "דִּבַּרְתֶּם",
|
||||||
|
"pronoun": "אַתֶּם",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2fp": {
|
||||||
|
"form": "דִּבַּרְתֶּן",
|
||||||
|
"pronoun": "אַתֶּן",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_3ms": {
|
||||||
|
"form": "דִּבֵּר",
|
||||||
|
"pronoun": "הוּא",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_3fs": {
|
||||||
|
"form": "דִּבְּרָה",
|
||||||
|
"pronoun": "הִיא",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_3p": {
|
||||||
|
"form": "דִּבְּרוּ",
|
||||||
|
"pronoun": "הֵם / הֵן",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"future_1s": {
|
||||||
|
"form": "אֲדַבֵּר",
|
||||||
|
"pronoun": "אֲנִי",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_1p": {
|
||||||
|
"form": "נְדַבֵּר",
|
||||||
|
"pronoun": "אֲנַחְנוּ",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2ms": {
|
||||||
|
"form": "תְּדַבֵּר",
|
||||||
|
"pronoun": "אַתָּה",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2fs": {
|
||||||
|
"form": "תְּדַבְּרִי",
|
||||||
|
"pronoun": "אַתְּ",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2mp": {
|
||||||
|
"form": "תְּדַבְּרוּ",
|
||||||
|
"pronoun": "אַתֶּם",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2fp": {
|
||||||
|
"form": "תְּדַבֵּרְנָה",
|
||||||
|
"pronoun": "אַתֶּן",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3ms": {
|
||||||
|
"form": "יְדַבֵּר",
|
||||||
|
"pronoun": "הוּא",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3fs": {
|
||||||
|
"form": "תְּדַבֵּר",
|
||||||
|
"pronoun": "הִיא",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3mp": {
|
||||||
|
"form": "יְדַבְּרוּ",
|
||||||
|
"pronoun": "הֵם",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3fp": {
|
||||||
|
"form": "תְּדַבֵּרְנָה",
|
||||||
|
"pronoun": "הֵן",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"imperative_ms": {
|
||||||
|
"form": "דַּבֵּר!",
|
||||||
|
"pronoun": "אַתָּה",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"imperative_fs": {
|
||||||
|
"form": "דַּבְּרִי!",
|
||||||
|
"pronoun": "אַתְּ",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"imperative_mp": {
|
||||||
|
"form": "דַּבְּרוּ!",
|
||||||
|
"pronoun": "אַתֶּם",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"imperative_fp": {
|
||||||
|
"form": "דַּבֵּרְנָה!",
|
||||||
|
"pronoun": "אַתֶּן",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"infinitive": {
|
||||||
|
"form": "לְדַבֵּר",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "מְקוֹר"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"לְדֻבַּר": {
|
||||||
|
"infinitive": "לְדֻבַּר",
|
||||||
|
"slug": "2-ledaber",
|
||||||
|
"root": "ד - ב - ר",
|
||||||
|
"binyan": "",
|
||||||
|
"is_passive": false,
|
||||||
|
"reference_form": "לְדַבֵּר",
|
||||||
|
"forms": {
|
||||||
|
"present_ms": {
|
||||||
|
"form": "מְדַבֵּר",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (זכר יחיד)"
|
||||||
|
},
|
||||||
|
"present_fs": {
|
||||||
|
"form": "מְדַבֶּרֶת",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (נקבה יחיד)"
|
||||||
|
},
|
||||||
|
"present_mp": {
|
||||||
|
"form": "מְדַבְּרִים",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (זכר רבים)"
|
||||||
|
},
|
||||||
|
"present_fp": {
|
||||||
|
"form": "מְדַבְּרוֹת",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (נקבה רבים)"
|
||||||
|
},
|
||||||
|
"past_1s": {
|
||||||
|
"form": "דִּבַּרְתִּי",
|
||||||
|
"pronoun": "אֲנִי",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_1p": {
|
||||||
|
"form": "דִּבַּרְנוּ",
|
||||||
|
"pronoun": "אֲנַחְנוּ",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2ms": {
|
||||||
|
"form": "דִּבַּרְתָּ",
|
||||||
|
"pronoun": "אַתָּה",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2fs": {
|
||||||
|
"form": "דִּבַּרְתְּ",
|
||||||
|
"pronoun": "אַתְּ",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2mp": {
|
||||||
|
"form": "דִּבַּרְתֶּם",
|
||||||
|
"pronoun": "אַתֶּם",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2fp": {
|
||||||
|
"form": "דִּבַּרְתֶּן",
|
||||||
|
"pronoun": "אַתֶּן",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_3ms": {
|
||||||
|
"form": "דִּבֵּר",
|
||||||
|
"pronoun": "הוּא",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_3fs": {
|
||||||
|
"form": "דִּבְּרָה",
|
||||||
|
"pronoun": "הִיא",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_3p": {
|
||||||
|
"form": "דִּבְּרוּ",
|
||||||
|
"pronoun": "הֵם / הֵן",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"future_1s": {
|
||||||
|
"form": "אֲדַבֵּר",
|
||||||
|
"pronoun": "אֲנִי",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_1p": {
|
||||||
|
"form": "נְדַבֵּר",
|
||||||
|
"pronoun": "אֲנַחְנוּ",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2ms": {
|
||||||
|
"form": "תְּדַבֵּר",
|
||||||
|
"pronoun": "אַתָּה",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2fs": {
|
||||||
|
"form": "תְּדַבְּרִי",
|
||||||
|
"pronoun": "אַתְּ",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2mp": {
|
||||||
|
"form": "תְּדַבְּרוּ",
|
||||||
|
"pronoun": "אַתֶּם",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2fp": {
|
||||||
|
"form": "תְּדַבֵּרְנָה",
|
||||||
|
"pronoun": "אַתֶּן",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3ms": {
|
||||||
|
"form": "יְדַבֵּר",
|
||||||
|
"pronoun": "הוּא",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3fs": {
|
||||||
|
"form": "תְּדַבֵּר",
|
||||||
|
"pronoun": "הִיא",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3mp": {
|
||||||
|
"form": "יְדַבְּרוּ",
|
||||||
|
"pronoun": "הֵם",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3fp": {
|
||||||
|
"form": "תְּדַבֵּרְנָה",
|
||||||
|
"pronoun": "הֵן",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"imperative_ms": {
|
||||||
|
"form": "דַּבֵּר!",
|
||||||
|
"pronoun": "אַתָּה",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"imperative_fs": {
|
||||||
|
"form": "דַּבְּרִי!",
|
||||||
|
"pronoun": "אַתְּ",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"imperative_mp": {
|
||||||
|
"form": "דַּבְּרוּ!",
|
||||||
|
"pronoun": "אַתֶּם",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"imperative_fp": {
|
||||||
|
"form": "דַּבֵּרְנָה!",
|
||||||
|
"pronoun": "אַתֶּן",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"infinitive": {
|
||||||
|
"form": "לְדַבֵּר",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "מְקוֹר"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"לְהִתְלַבֵּשׁ": {
|
||||||
|
"infinitive": "לְהִתְלַבֵּשׁ",
|
||||||
|
"slug": "974-lehitlabesh",
|
||||||
|
"root": "ל - ב - שׁ",
|
||||||
|
"binyan": "",
|
||||||
|
"is_passive": false,
|
||||||
|
"reference_form": "לְהִתְלַבֵּשׁ",
|
||||||
|
"forms": {
|
||||||
|
"present_ms": {
|
||||||
|
"form": "מִתְלַבֵּשׁ",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (זכר יחיד)"
|
||||||
|
},
|
||||||
|
"present_fs": {
|
||||||
|
"form": "מִתְלַבֶּשֶׁת",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (נקבה יחיד)"
|
||||||
|
},
|
||||||
|
"present_mp": {
|
||||||
|
"form": "מִתְלַבְּשִׁים",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (זכר רבים)"
|
||||||
|
},
|
||||||
|
"present_fp": {
|
||||||
|
"form": "מִתְלַבְּשׁוֹת",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (נקבה רבים)"
|
||||||
|
},
|
||||||
|
"past_1s": {
|
||||||
|
"form": "הִתְלַבַּשְׁתִּי",
|
||||||
|
"pronoun": "אֲנִי",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_1p": {
|
||||||
|
"form": "הִתְלַבַּשְׁנוּ",
|
||||||
|
"pronoun": "אֲנַחְנוּ",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2ms": {
|
||||||
|
"form": "הִתְלַבַּשְׁתָּ",
|
||||||
|
"pronoun": "אַתָּה",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2fs": {
|
||||||
|
"form": "הִתְלַבַּשְׁתְּ",
|
||||||
|
"pronoun": "אַתְּ",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2mp": {
|
||||||
|
"form": "הִתְלַבַּשְׁתֶּם",
|
||||||
|
"pronoun": "אַתֶּם",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2fp": {
|
||||||
|
"form": "הִתְלַבַּשְׁתֶּן",
|
||||||
|
"pronoun": "אַתֶּן",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_3ms": {
|
||||||
|
"form": "הִתְלַבֵּשׁ",
|
||||||
|
"pronoun": "הוּא",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_3fs": {
|
||||||
|
"form": "הִתְלַבְּשָׁה",
|
||||||
|
"pronoun": "הִיא",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_3p": {
|
||||||
|
"form": "הִתְלַבְּשׁוּ",
|
||||||
|
"pronoun": "הֵם / הֵן",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"future_1s": {
|
||||||
|
"form": "אֶתְלַבֵּשׁ",
|
||||||
|
"pronoun": "אֲנִי",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_1p": {
|
||||||
|
"form": "נִתְלַבֵּשׁ",
|
||||||
|
"pronoun": "אֲנַחְנוּ",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2ms": {
|
||||||
|
"form": "תִּתְלַבֵּשׁ",
|
||||||
|
"pronoun": "אַתָּה",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2fs": {
|
||||||
|
"form": "תִּתְלַבְּשִׁי",
|
||||||
|
"pronoun": "אַתְּ",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2mp": {
|
||||||
|
"form": "תִּתְלַבְּשׁוּ",
|
||||||
|
"pronoun": "אַתֶּם",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2fp": {
|
||||||
|
"form": "תִּתְלַבֵּשְׁנָה",
|
||||||
|
"pronoun": "אַתֶּן",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3ms": {
|
||||||
|
"form": "יִתְלַבֵּשׁ",
|
||||||
|
"pronoun": "הוּא",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3fs": {
|
||||||
|
"form": "תִּתְלַבֵּשׁ",
|
||||||
|
"pronoun": "הִיא",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3mp": {
|
||||||
|
"form": "יִתְלַבְּשׁוּ",
|
||||||
|
"pronoun": "הֵם",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3fp": {
|
||||||
|
"form": "תִּתְלַבֵּשְׁנָה",
|
||||||
|
"pronoun": "הֵן",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"imperative_ms": {
|
||||||
|
"form": "הִתְלַבֵּשׁ!",
|
||||||
|
"pronoun": "אַתָּה",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"imperative_fs": {
|
||||||
|
"form": "הִתְלַבְּשִׁי!",
|
||||||
|
"pronoun": "אַתְּ",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"imperative_mp": {
|
||||||
|
"form": "הִתְלַבְּשׁוּ!",
|
||||||
|
"pronoun": "אַתֶּם",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"imperative_fp": {
|
||||||
|
"form": "הִתְלַבֵּשְׁנָה!",
|
||||||
|
"pronoun": "אַתֶּן",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"infinitive": {
|
||||||
|
"form": "לְהִתְלַבֵּשׁ",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "מְקוֹר"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"לְהַגִּיד": {
|
||||||
|
"infinitive": "לְהַגִּיד",
|
||||||
|
"slug": "1135-lehagid",
|
||||||
|
"root": "נ - ג - ד",
|
||||||
|
"binyan": "",
|
||||||
|
"is_passive": false,
|
||||||
|
"reference_form": "לְהַגִּיד",
|
||||||
|
"forms": {
|
||||||
|
"present_ms": {
|
||||||
|
"form": "מַגִּיד",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (זכר יחיד)"
|
||||||
|
},
|
||||||
|
"present_fs": {
|
||||||
|
"form": "מַגִּידָה",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (נקבה יחיד)"
|
||||||
|
},
|
||||||
|
"present_mp": {
|
||||||
|
"form": "מַגִּידִים",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (זכר רבים)"
|
||||||
|
},
|
||||||
|
"present_fp": {
|
||||||
|
"form": "מַגִּידוֹת",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "הוֹוֶה (נקבה רבים)"
|
||||||
|
},
|
||||||
|
"past_1s": {
|
||||||
|
"form": "הִגַּדְתִּי",
|
||||||
|
"pronoun": "אֲנִי",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_1p": {
|
||||||
|
"form": "הִגַּדְנוּ",
|
||||||
|
"pronoun": "אֲנַחְנוּ",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2ms": {
|
||||||
|
"form": "הִגַּדְתָּ",
|
||||||
|
"pronoun": "אַתָּה",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2fs": {
|
||||||
|
"form": "הִגַּדְתְּ",
|
||||||
|
"pronoun": "אַתְּ",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2mp": {
|
||||||
|
"form": "הִגַּדְתֶּם",
|
||||||
|
"pronoun": "אַתֶּם",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_2fp": {
|
||||||
|
"form": "הִגַּדְתֶּן",
|
||||||
|
"pronoun": "אַתֶּן",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_3ms": {
|
||||||
|
"form": "הִגִּיד",
|
||||||
|
"pronoun": "הוּא",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_3fs": {
|
||||||
|
"form": "הִגִּידָה",
|
||||||
|
"pronoun": "הִיא",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"past_3p": {
|
||||||
|
"form": "הִגִּידוּ",
|
||||||
|
"pronoun": "הֵם / הֵן",
|
||||||
|
"tense": "עָבָר"
|
||||||
|
},
|
||||||
|
"future_1s": {
|
||||||
|
"form": "אַגִּיד",
|
||||||
|
"pronoun": "אֲנִי",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_1p": {
|
||||||
|
"form": "נַגִּיד",
|
||||||
|
"pronoun": "אֲנַחְנוּ",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2ms": {
|
||||||
|
"form": "תַּגִּיד",
|
||||||
|
"pronoun": "אַתָּה",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2fs": {
|
||||||
|
"form": "תַּגִּידִי",
|
||||||
|
"pronoun": "אַתְּ",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2mp": {
|
||||||
|
"form": "תַּגִּידוּ",
|
||||||
|
"pronoun": "אַתֶּם",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_2fp": {
|
||||||
|
"form": "תַּגֵּדְנָה",
|
||||||
|
"pronoun": "אַתֶּן",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3ms": {
|
||||||
|
"form": "יַגִּיד",
|
||||||
|
"pronoun": "הוּא",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3fs": {
|
||||||
|
"form": "תַּגִּיד",
|
||||||
|
"pronoun": "הִיא",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3mp": {
|
||||||
|
"form": "יַגִּידוּ",
|
||||||
|
"pronoun": "הֵם",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"future_3fp": {
|
||||||
|
"form": "תַּגֵּדְנָה",
|
||||||
|
"pronoun": "הֵן",
|
||||||
|
"tense": "עָתִיד"
|
||||||
|
},
|
||||||
|
"imperative_ms": {
|
||||||
|
"form": "הַגֵּד!",
|
||||||
|
"pronoun": "אַתָּה",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"imperative_fs": {
|
||||||
|
"form": "הַגִּידִי!",
|
||||||
|
"pronoun": "אַתְּ",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"imperative_mp": {
|
||||||
|
"form": "הַגִּידוּ!",
|
||||||
|
"pronoun": "אַתֶּם",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"imperative_fp": {
|
||||||
|
"form": "הַגֵּדְנָה!",
|
||||||
|
"pronoun": "אַתֶּן",
|
||||||
|
"tense": "צִוּוּי"
|
||||||
|
},
|
||||||
|
"infinitive": {
|
||||||
|
"form": "לְהַגִּיד",
|
||||||
|
"pronoun": "",
|
||||||
|
"tense": "מְקוֹר"
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
"לְהוּגַד": null
|
||||||
|
}
|
||||||
1
data/examples_cache.json
Normal file
1
data/examples_cache.json
Normal file
|
|
@ -0,0 +1 @@
|
||||||
|
{"אב": ["לא אב לחגלה אתה", "כרחם אב על בנים"], "אבא": ["כך כך אבא יקירי", "“אבא איננו בבית"], "אביבי": ["אמרת: תם אביבי,", "אמרת: תם אביבי,"], "אביב": ["אביב כי יתחדש –", "ברחובות תל־אביב"], "אבידה": ["אבידה בדבר מועט", "ואם לרבות אבידה"], "לאבוד": ["אבל נאנחתי לאבוד", "אנו הולכים לאבוד"], "להיאבד": [], "להתאבד": ["מעמד והחליטה להתאבד", "היא נסתה פעם להתאבד"], "איבוד": ["איבוד דמי משלוח", "איבוד עצמו לדעת"], "התאבדות": ["והביאו לידי התאבדות", "הקלון, בלתי אם התאבדות"], "להאביד": ["ויאמר להאביד זכרם –", "קול שם רשעים להאביד"], "אבדה": ["ועתה אבדה תקותה", "וכל תשועתו אבדה"], "אבוד": ["— הה, הנני אבוד", "“אבוד עצמי לדעת"], "לאבד": ["אין לכם מה לאבד", "יש חשש לאבד שנה"], "אבדון": ["אבדון, אש הנעורת", "אבדון, פתחי עולם"], "אבוקדו": ["מטעים רצופים של עצי אבוקדו ומנגו", "את זרעי הפירות וגידלו מהם שתילים חדשים של אבוקדו"], "אבזם": ["רצו אל הטבח, הגישו לו הפעם חגורה עם אבזם מבריק… הביאו כוס", "רגליו היו עטופות לפפות חדשות ומתניו חגורות חגורה חדשה עם אבזם"], "לאבזר": [], "איבחון": ["לשלבים גבוהים יותר של איבחון וריפוי", "איבחון נחפז, כמוהו כהיסוס מופרז, עלול לגרור תוצאות בלתי־נעימות"]}
|
||||||
1
data/frequency_cache.json
Normal file
1
data/frequency_cache.json
Normal file
File diff suppressed because one or more lines are too long
9106
data/pealim_dict.csv
Normal file
9106
data/pealim_dict.csv
Normal file
File diff suppressed because it is too large
Load diff
12111
data/pealim_dict_for_anki.csv
Normal file
12111
data/pealim_dict_for_anki.csv
Normal file
File diff suppressed because it is too large
Load diff
85
frequency_lookup.py
Normal file
85
frequency_lookup.py
Normal file
|
|
@ -0,0 +1,85 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Hebrew word frequency lookup from hermitdave/FrequencyWords corpus.
|
||||||
|
Downloads he_50k.txt once; subsequent runs read from cache.
|
||||||
|
Exposed API: get_frequency_rank(word_no_nikkud) -> int | None
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import unicodedata
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
FREQ_URL = (
|
||||||
|
"https://raw.githubusercontent.com/hermitdave/FrequencyWords/"
|
||||||
|
"master/content/2016/he/he_50k.txt"
|
||||||
|
)
|
||||||
|
CACHE_PATH = Path(__file__).parent / "data" / "frequency_cache.json"
|
||||||
|
REQUEST_TIMEOUT = 30
|
||||||
|
|
||||||
|
# Module-level cache: word_no_nikkud -> rank (1 = most common)
|
||||||
|
_freq: dict[str, int] = {}
|
||||||
|
|
||||||
|
|
||||||
|
def _strip_nikkud(text: str) -> str:
|
||||||
|
"""Remove Hebrew nikkud (diacritics) from a string."""
|
||||||
|
return "".join(
|
||||||
|
ch for ch in unicodedata.normalize("NFD", text)
|
||||||
|
if unicodedata.category(ch) != "Mn"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def load(cache_path: Path = CACHE_PATH) -> None:
    """Populate the module-level frequency table ``_freq``.

    If ``cache_path`` exists and holds valid JSON, the table is read from it.
    Otherwise the corpus at ``FREQ_URL`` is downloaded, each word is ranked by
    the position of its line (1 = first non-empty line), and the resulting
    mapping is written to ``cache_path`` for later runs.

    Args:
        cache_path: JSON file mapping word (without nikkud) to rank.

    Raises:
        requests.RequestException: If the download fails or returns an HTTP
            error status.
        OSError: If the cache file cannot be read or written.
    """
    global _freq

    if cache_path.exists():
        try:
            with open(cache_path, encoding="utf-8") as f:
                _freq = json.load(f)
        except ValueError as exc:
            # JSONDecodeError and UnicodeDecodeError are both ValueErrors. A
            # truncated or damaged cache should be rebuilt from the corpus,
            # not abort the whole pipeline.
            logger.warning(
                f"Frequency cache {cache_path} is unreadable ({exc}); re-downloading"
            )
        else:
            logger.info(f"Frequency cache loaded: {len(_freq)} entries")
            return

    logger.info("Downloading FrequencyWords he_50k.txt …")
    resp = requests.get(FREQ_URL, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    # Decode explicitly as UTF-8 instead of relying on the encoding requests
    # guesses from the response headers.
    resp.encoding = "utf-8"

    # Build into a fresh dict so entries left over from an earlier load can
    # neither survive nor block first-occurrence ranks.
    ranks: dict[str, int] = {}
    rank = 1
    for line in resp.text.splitlines():
        parts = line.split()
        if not parts:
            continue
        word = _strip_nikkud(parts[0])
        # Keep the best (first-seen) rank when stripping makes words collide.
        if word and word not in ranks:
            ranks[word] = rank
        rank += 1
    _freq = ranks

    cache_path.parent.mkdir(parents=True, exist_ok=True)
    with open(cache_path, "w", encoding="utf-8") as f:
        json.dump(_freq, f, ensure_ascii=False)
    logger.info(f"Frequency cache saved: {len(_freq)} entries → {cache_path}")
|
||||||
|
|
||||||
|
|
||||||
|
def get_frequency_rank(word_no_nikkud: str) -> int | None:
    """Look up the frequency rank of a word.

    Surrounding whitespace and any nikkud are removed from the argument
    before the lookup. The frequency table is loaded on demand when it is
    still empty.

    Returns:
        The rank (1 = most common), or ``None`` when the word is not in the
        table.
    """
    if not _freq:
        # Table not populated yet: read the cache file or download the corpus.
        load()
    lookup_key = _strip_nikkud(word_no_nikkud.strip())
    rank = _freq.get(lookup_key)
    return rank
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Manual smoke test: load the table and print the rank of a few words.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    load()
    for sample in ("שלום", "ספר", "בית", "מים", "כלב"):
        print(f"{sample}: rank {get_frequency_rank(sample)}")
|
||||||
|
|
@ -1,3 +1,6 @@
|
||||||
pandas>=1.3.0
|
pandas>=1.3.0
|
||||||
requests>=2.26.0
|
requests>=2.26.0
|
||||||
numpy>=1.21.0
|
numpy>=1.21.0
|
||||||
|
genanki>=0.8.0
|
||||||
|
beautifulsoup4>=4.11.0
|
||||||
|
lxml>=4.9.0
|
||||||
|
|
|
||||||
317
run.py
317
run.py
|
|
@ -1,48 +1,313 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
Main entry point: orchestrate dictionary and conjugation extraction.
|
Pealim Anki Deck Builder — full pipeline orchestrator.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python run.py [options]
|
||||||
|
|
||||||
|
Options:
|
||||||
|
--skip-scrape Use existing data/pealim_dict.csv (no pealim.com dict scraping)
|
||||||
|
--skip-audio Skip audio .mp3 downloads
|
||||||
|
--skip-examples Skip Ben Yehuda example fetching
|
||||||
|
--skip-conjugations Skip verb conjugation extraction
|
||||||
|
--test N Process only the first N dictionary words (for quick testing)
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
|
import time
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
# Add current directory to path
|
|
||||||
sys.path.insert(0, str(Path(__file__).parent))
|
sys.path.insert(0, str(Path(__file__).parent))
|
||||||
|
|
||||||
import pealim_extract
|
|
||||||
import conjugation_extract
|
|
||||||
|
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
level=logging.INFO,
|
level=logging.INFO,
|
||||||
format='%(asctime)s - %(levelname)s - %(message)s'
|
format="%(asctime)s %(levelname)s %(message)s",
|
||||||
)
|
)
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
DATA_DIR = Path(__file__).parent / "data"
|
||||||
|
OUTPUT_DIR = Path(__file__).parent / "output"
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """Parse the command line and return the resulting argparse namespace.

    Four boolean ``--skip-*`` switches disable individual pipeline steps;
    ``--test N`` carries an integer word limit (``None`` when omitted).
    """
    parser = argparse.ArgumentParser(description="Pealim Anki deck builder")
    switches = (
        ("--skip-scrape", "Skip dict scraping; use cached CSV"),
        ("--skip-audio", "Skip audio downloads"),
        ("--skip-examples", "Skip Ben Yehuda example lookup"),
        ("--skip-conjugations", "Skip verb conjugation extraction"),
    )
    for flag, description in switches:
        parser.add_argument(flag, action="store_true", help=description)
    parser.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
    return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def step_scrape(args):
    """Step 1 — scrape the dictionary from pealim.com, or reuse the saved CSV.

    With ``--skip-scrape`` nothing is fetched: the function only checks that
    ``data/pealim_dict.csv`` exists and terminates the process with exit
    status 1 when it does not. Otherwise the frame returned by
    ``pealim_extract.extract_from_website()`` is saved as
    ``data/pealim_dict.csv``, passed through
    ``pealim_extract.modify_for_anki()`` and saved again, ``;``-separated, as
    ``data/pealim_dict_for_anki.csv``.

    Args:
        args: Parsed CLI namespace; only ``skip_scrape`` is read.
    """
    dict_csv = DATA_DIR / "pealim_dict.csv"
    anki_csv = DATA_DIR / "pealim_dict_for_anki.csv"

    if args.skip_scrape:
        if not dict_csv.exists():
            logger.error(f"[1] --skip-scrape set but {dict_csv} not found. Aborting.")
            sys.exit(1)
        logger.info(f"[1] Using existing {dict_csv}")
        return

    logger.info("[1] Scraping dictionary from pealim.com …")
    import pealim_extract

    df = pealim_extract.extract_from_website()

    # A fresh checkout may not have data/ yet; create it so to_csv cannot fail
    # on a missing directory after a long scrape.
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    df.to_csv(dict_csv, index=True)
    logger.info(f" Saved {len(df)} words → {dict_csv}")

    df = pealim_extract.modify_for_anki(df)
    df.to_csv(anki_csv, sep=";", index=True)
    logger.info(f" Saved Anki CSV → {anki_csv}")
|
||||||
|
|
||||||
|
|
||||||
|
def step_frequency():
    """Step 2 — make sure the word-frequency table is loaded.

    Returns:
        The ``frequency_lookup`` module's internal ``_freq`` mapping as it
        stands after ``frequency_lookup.load()`` has run.
    """
    logger.info("[2] Loading word frequency data …")
    import frequency_lookup as freq_module

    freq_module.load()
    table = freq_module._freq
    return table
|
||||||
|
|
||||||
|
|
||||||
|
def step_examples(args, freq_cache: dict):
    """Step 3 — load or build the Ben Yehuda example-sentence cache.

    With ``--skip-examples`` the function only returns the contents of
    ``data/examples_cache.json`` (or ``{}`` when that file is absent).
    Otherwise it loads the Ben Yehuda index, calls ``benyehuda.get_examples``
    for every dictionary word so the module's cache gets filled, saves that
    cache, and returns it.

    Args:
        args: Parsed CLI namespace; reads ``skip_examples`` and ``test``.
        freq_cache: Not used by this step; kept for the existing call
            signature.

    Returns:
        A dict of cached examples: the parsed JSON file when skipping,
        ``benyehuda._examples_cache`` otherwise.
    """
    if args.skip_examples:
        logger.info("[3] Skipping examples (--skip-examples)")
        examples_path = DATA_DIR / "examples_cache.json"
        if examples_path.exists():
            # The cache holds raw Hebrew text. Read it as UTF-8 explicitly so
            # the platform's default encoding cannot break decoding.
            with open(examples_path, encoding="utf-8") as f:
                return json.load(f)
        return {}

    logger.info("[3] Loading Ben Yehuda example index …")
    import benyehuda
    benyehuda.load()

    # Pre-fetch examples for all words in the dict (uses cache)
    dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    if not dict_csv.exists():
        dict_csv = DATA_DIR / "pealim_dict.csv"

    try:
        import pandas as pd
        import unicodedata

        # The Anki CSV is ';'-separated and the plain one ','-separated. Too
        # few columns after a ';' parse means the separator guess was wrong.
        try:
            df = pd.read_csv(dict_csv, sep=";", index_col=0)
            if df.shape[1] < 3:
                raise ValueError
        except Exception:
            df = pd.read_csv(dict_csv, index_col=0)

        if args.test:
            df = df.head(args.test)

        def strip(t):
            return "".join(c for c in unicodedata.normalize("NFD", str(t))
                           if unicodedata.category(c) != "Mn")

        logger.info(f" Pre-fetching examples for {len(df)} words …")
        for _, row in df.iterrows():
            raw = row.get("Word Without Nikkud", "")
            # An empty CSV cell arrives as NaN; str() would turn it into the
            # literal word "nan" and pollute the examples cache.
            if pd.isna(raw):
                continue
            word_plain = strip(str(raw).strip())
            if word_plain:
                benyehuda.get_examples(word_plain)

    except Exception as e:
        # Best effort: keep whatever was fetched before the failure.
        logger.warning(f" Could not pre-fetch all examples: {e}")

    benyehuda.save_examples_cache()
    return benyehuda._examples_cache
|
||||||
|
|
||||||
|
|
||||||
|
def step_audio(args):
    """Step 4 — download pronunciation .mp3 files for the dictionary words.

    For every CSV row with a non-empty "Word", the Hebrew letters of
    "Word Without Nikkud" (falling back to "Word") form the file name
    ``<letters>.mp3`` inside ``data/audio``. Existing files are counted as
    already cached. Otherwise the URL returned by
    ``audio_extract.extract_audio_url`` is fetched and written to disk, with
    a short pause after each successful download.

    Args:
        args: Parsed CLI namespace; reads ``skip_audio`` and ``test``.

    Errors while reading the CSV or downloading are logged rather than
    raised.
    """
    if args.skip_audio:
        logger.info("[4] Skipping audio (--skip-audio)")
        return

    logger.info("[4] Downloading audio files …")
    import re
    import unicodedata

    # Load audio URL cache (from old workspace if available)
    audio_cache_path = DATA_DIR / "audio_cache.json"
    audio_url_cache: dict = {}
    if audio_cache_path.exists():
        with open(audio_cache_path, encoding="utf-8") as f:
            audio_url_cache = json.load(f)

    import audio_extract as ae
    ae._audio_cache = audio_url_cache

    # AUDIO_DIR was never defined in this module, so this step always died
    # with a NameError that the outer except reduced to a warning.
    # data/audio is the directory print_summary() counts .mp3 files in.
    # NOTE(review): confirm apkg_builder reads its media from the same place.
    audio_dir = DATA_DIR / "audio"

    dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    if not dict_csv.exists():
        dict_csv = DATA_DIR / "pealim_dict.csv"

    import pandas as pd
    import requests

    # Hoisted out of the row loop: the pattern and helper never change.
    non_hebrew_letters = re.compile(r"[^\u05d0-\u05ea]")

    def strip_nik(t):
        return "".join(c for c in unicodedata.normalize("NFD", t)
                       if unicodedata.category(c) != "Mn")

    try:
        # The Anki CSV is ';'-separated and the plain one ','-separated. Too
        # few columns after a ';' parse means the separator guess was wrong.
        try:
            df = pd.read_csv(dict_csv, sep=";", index_col=0)
            if df.shape[1] < 3:
                raise ValueError
        except Exception:
            df = pd.read_csv(dict_csv, index_col=0)

        if args.test:
            df = df.head(args.test)

        audio_dir.mkdir(parents=True, exist_ok=True)
        downloaded = 0
        skipped = 0

        for _, row in df.iterrows():
            word = str(row.get("Word", "")).strip()
            word_plain = str(row.get("Word Without Nikkud", "")).strip()
            if not word:
                continue

            # File names consist solely of the base Hebrew letters.
            safe_name = non_hebrew_letters.sub("", strip_nik(word_plain or word))
            if not safe_name:
                continue
            mp3_path = audio_dir / f"{safe_name}.mp3"

            if mp3_path.exists():
                skipped += 1
                continue

            # Get audio URL from cache or fetch
            audio_url = ae.extract_audio_url(word)
            if audio_url:
                try:
                    resp = requests.get(audio_url, timeout=10)
                    resp.raise_for_status()
                    mp3_path.write_bytes(resp.content)
                    downloaded += 1
                    # Pause between downloads to go easy on the server.
                    time.sleep(0.3)
                except Exception as e:
                    logger.debug(f" Audio download failed for {word}: {e}")

        ae.save_audio_cache(str(audio_cache_path))
        logger.info(f" Audio: {downloaded} downloaded, {skipped} already cached")

    except Exception as e:
        logger.warning(f" Audio step failed: {e}")
|
||||||
|
|
||||||
|
|
||||||
|
def step_build_vocab(args, examples_cache: dict, freq_cache: dict):
    """Step 5 — build the vocabulary deck and write its .apkg file.

    The Anki-flavoured CSV is used when present, the plain dictionary CSV
    otherwise. ``args.test`` is forwarded to the builder as ``limit``.

    Returns:
        The deck object produced by ``apkg_builder.build_vocab_deck``.
    """
    logger.info("[5] Building vocabulary deck …")
    import apkg_builder

    anki_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    source_csv = anki_csv if anki_csv.exists() else DATA_DIR / "pealim_dict.csv"

    built = apkg_builder.build_vocab_deck(
        source_csv,
        examples_cache=examples_cache,
        freq_cache=freq_cache,
        limit=args.test,
    )
    deck, media = built
    apkg_builder.write_vocab_apkg(deck, media)
    logger.info(f" Vocabulary .apkg → {apkg_builder.VOCAB_APKG}")
    return deck
|
||||||
|
|
||||||
|
|
||||||
|
def step_conjugations(args):
    """Step 6 — extract verb conjugations and write the conjugation deck.

    The step is skipped (returning ``None``) when ``--skip-conjugations`` is
    set or when ``verbs_input.txt`` is missing next to this script.

    Returns:
        Whatever ``conjugation_extract.main`` returned for the verbs file,
        or ``None`` when the step was skipped.
    """
    if args.skip_conjugations:
        logger.info("[6] Skipping conjugations (--skip-conjugations)")
        return None

    script_dir = Path(__file__).parent
    verbs_file = script_dir / "verbs_input.txt"
    if not verbs_file.exists():
        logger.info("[6] verbs_input.txt not found — skipping conjugation deck")
        return None

    logger.info("[6] Extracting verb conjugations …")
    import conjugation_extract
    conjugations = conjugation_extract.main(verbs_file)

    import apkg_builder
    apkg_builder.write_conj_apkg(apkg_builder.build_conj_deck(conjugations))
    logger.info(f" Conjugation .apkg → {apkg_builder.CONJ_APKG}")

    return conjugations
|
||||||
|
|
||||||
|
|
||||||
|
def print_summary(args, examples_cache, freq_cache, conjugations):
    """Log a closing report of what the pipeline has on disk and in memory.

    Reported, in order: dictionary row count (when a dictionary CSV exists),
    frequency and example cache sizes, example coverage, number of .mp3 files
    under data/audio, size of each generated .apkg, and the number of verbs
    with a non-empty conjugation entry. ``args`` is accepted but not read.
    """
    rule = "=" * 60
    logger.info("")
    logger.info(rule)
    logger.info("SUMMARY")
    logger.info(rule)

    # Prefer the Anki CSV, fall back to the plain one, else report nothing.
    candidates = (
        DATA_DIR / "pealim_dict_for_anki.csv",
        DATA_DIR / "pealim_dict.csv",
    )
    dict_csv = next((path for path in candidates if path.exists()), None)
    if dict_csv is not None:
        import pandas as pd
        # Try ';' first; too few columns means the file is ','-separated.
        try:
            df = pd.read_csv(dict_csv, sep=";", index_col=0)
            if df.shape[1] < 3:
                raise ValueError
        except Exception:
            df = pd.read_csv(dict_csv, index_col=0)
        logger.info(f" Dictionary words: {len(df)}")

    logger.info(f" Frequency entries: {len(freq_cache)}")
    logger.info(f" Example cache entries: {len(examples_cache)}")
    if examples_cache:
        total = len(examples_cache)
        covered = len([entry for entry in examples_cache.values() if entry])
        logger.info(f" Example coverage: {covered}/{total} ({100*covered//total}%)")

    audio_dir = DATA_DIR / "audio"
    if audio_dir.exists():
        mp3_count = len(list(audio_dir.glob("*.mp3")))
        logger.info(f" Audio files: {mp3_count}")

    packages = (
        ("Vocabulary", "pealim_vocabulary.apkg"),
        ("Conjugation", "pealim_conjugations.apkg"),
    )
    for label, filename in packages:
        apkg_path = OUTPUT_DIR / filename
        if not apkg_path.exists():
            continue
        size_mb = apkg_path.stat().st_size / 1e6
        logger.info(f" {label} .apkg: {size_mb:.1f} MB → {apkg_path}")

    if conjugations:
        verb_count = len([forms for forms in conjugations.values() if forms])
        logger.info(f" Verbs in conjugation deck: {verb_count}")

    logger.info(rule)
    logger.info("✅ DONE")
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
"""Run all extraction tasks."""
|
args = parse_args()
|
||||||
|
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
logger.info("PEALIM EXTRACTION SUITE")
|
logger.info("PEALIM ANKI DECK BUILDER")
|
||||||
|
if args.test:
|
||||||
|
logger.info(f" TEST MODE: {args.test} words")
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
|
|
||||||
try:
|
step_scrape(args)
|
||||||
# Extract dictionary
|
freq_cache = step_frequency()
|
||||||
logger.info("\n[1/2] Extracting dictionary...")
|
examples_cache = step_examples(args, freq_cache)
|
||||||
pealim_extract.main()
|
step_audio(args)
|
||||||
|
step_build_vocab(args, examples_cache, freq_cache)
|
||||||
# Extract conjugations
|
conjugations = step_conjugations(args)
|
||||||
logger.info("\n[2/2] Extracting conjugations...")
|
|
||||||
conjugation_extract.main()
|
print_summary(args, examples_cache, freq_cache, conjugations or {})
|
||||||
|
|
||||||
logger.info("\n" + "=" * 60)
|
|
||||||
logger.info("✅ ALL TASKS COMPLETE")
|
|
||||||
logger.info("=" * 60)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.error(f"\n❌ EXTRACTION FAILED: {e}")
|
|
||||||
sys.exit(1)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|
|
||||||
10
verbs_input.txt
Normal file
10
verbs_input.txt
Normal file
|
|
@ -0,0 +1,10 @@
|
||||||
|
# One Hebrew infinitive per line.
|
||||||
|
# Lines starting with # are ignored.
|
||||||
|
# Initial test set — one verb per binyan:
|
||||||
|
לִשְׁמוֹר
|
||||||
|
לְהִשָּׁמֵר
|
||||||
|
לְדַבֵּר
|
||||||
|
לְדֻבַּר
|
||||||
|
לְהִתְלַבֵּשׁ
|
||||||
|
לְהַגִּיד
|
||||||
|
לְהוּגַד
|
||||||
Loading…
Reference in a new issue