Sprint 14: deck template/CSS overhaul + Sprint 12 detail scrape

Template & CSS fixes (15 items from Mar 9 feedback):
- Fix conjugation front showing 3ms form instead of infinitive
- Rename conjugation model to "Hebrew Conjugation"
- Strip Hebrew parenthesized text from English meanings
- Shoresh separator: spaces → dots (א.כ.ל)
- Remove duplicate English meaning from cloze back
- Remove example sentences from vocab front/back (cloze only)
- Center-align audio buttons on all decks
- Fix parenthesis spacing: "you(feminine,singular)" → "you (feminine, singular)"
- Unify sec-key/sec-label fonts, make keys bold
- Size overhaul: bigger Hebrew (42px), meaning (34px), secondary (28px)
- Center-align related words groups
- Sort confusables by average frequency
- Plurals: show Gender (Hebrew) before Mishkal, strip emoji from meaning
- Clean duplicate quotation marks in cloze sentences

Sprint 12 carry-forward (detail scrape + EPUB):
- Adjective/preposition detail scraping in pealim_detail_scrape.py
- EPUB example matching rewrite in epub_examples.py
- Delete benyehuda.py and rebuild_sentence_matches.py (merged)
- 49 parser tests for detail scraping
- SCHEMA.yaml updates for new fields

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Sochen 2026-03-10 07:44:47 +00:00
parent 3b0f9defa9
commit efd0745ada
10 changed files with 1669 additions and 741 deletions

View file

@ -138,11 +138,53 @@ entry:
# ktiv_male: "שומר"
# --- Adjective-specific ---
adjective_inflection: null # Reserved for future use
adjective_inflection: null # null for non-adjectives
# When populated:
# ms/fs/mp/fp forms with nikkud/ktiv_male subfields
# ms:
# nikkud: "גָּדוֹל"
# ktiv_male: "גדול"
# fs:
# nikkud: "גְּדוֹלָה"
# ktiv_male: "גדולה"
# mp:
# nikkud: "גְּדוֹלִים"
# ktiv_male: "גדולים"
# fp:
# nikkud: "גְּדוֹלוֹת"
# ktiv_male: "גדולות"
# mishkal: "CaCaC" # English mishkal name (scraped from pealim PoS section)
# mishkal_hebrew: "קָטָל" # Hebrew mishkal name (computed via mapping)
# --- Preposition-specific ---
preposition_inflection: null # Reserved for future use
preposition_inflection: null # null for non-prepositions
# When populated:
# Inflected forms with pronominal suffixes (e.g. שלי, שלך, שלו...)
# 1s:
# nikkud: "שֶׁלִּי"
# ktiv_male: "שלי"
# 1p:
# nikkud: "שֶׁלָּנוּ"
# ktiv_male: "שלנו"
# 2ms:
# nikkud: "שֶׁלְּךָ"
# ktiv_male: "שלך"
# 2fs:
# nikkud: "שֶׁלָּךְ"
# ktiv_male: "שלך"
# 2mp:
# nikkud: "שֶׁלָּכֶם"
# ktiv_male: "שלכם"
# 2fp:
# nikkud: "שֶׁלָּכֶן"
# ktiv_male: "שלכן"
# 3ms:
# nikkud: "שֶׁלּוֹ"
# ktiv_male: "שלו"
# 3fs:
# nikkud: "שֶׁלָּהּ"
# ktiv_male: "שלה"
# 3mp:
# nikkud: "שֶׁלָּהֶם"
# ktiv_male: "שלהם"
# 3fp:
# nikkud: "שֶׁלָּהֶן"
# ktiv_male: "שלהן"

View file

@ -35,7 +35,7 @@ COMPLETE_PLURAL_DECK_ID = 1_234_567_903
# Release version tag added to all notes so users can identify which release
# their cards come from (visible in Anki's Browse view and card info).
RELEASE_TAG = "v0.15.1"
RELEASE_TAG = "v0.16"
# Regex for extracting emoji and Hebrew prepositions from meaning strings
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
@ -117,13 +117,15 @@ CARD_CSS = """
.card {
font-family: 'Heebo', 'Arial Hebrew', 'David', Arial, sans-serif;
font-size: 20px;
text-align: center;
text-align: right;
color: #222;
background: #fff;
padding: 16px;
max-width: 600px;
margin: 0 auto;
}
.hebrew {
font-size: 36px;
font-size: 42px;
font-weight: bold;
direction: rtl;
text-align: center;
@ -131,32 +133,34 @@ CARD_CSS = """
color: #222;
}
.hebrew-sm {
font-size: 24px;
font-size: 30px;
font-weight: normal;
direction: rtl;
text-align: center;
color: #333;
color: #222;
}
.meaning {
font-size: 28px;
font-size: 34px;
color: #1a1a8c;
margin: 8px 0;
text-align: center;
}
.hint {
font-size: 16px;
color: #888;
font-size: 22px;
color: #555;
margin: 4px 0;
direction: rtl;
text-align: center;
}
.root-info {
font-size: 18px;
color: #555;
font-size: 26px;
color: #222;
margin-top: 6px;
direction: rtl;
}
.example {
font-size: 18px;
color: #444;
font-size: 24px;
color: #222;
direction: rtl;
text-align: right;
font-style: italic;
@ -182,16 +186,17 @@ CARD_CSS = """
color: #555;
}
.sec-label {
font-size: 20px;
font-size: 28px;
font-weight: normal;
color: #555;
color: #222;
direction: rtl;
text-align: center;
margin-top: 6px;
}
.sec-key {
font-size: 18px;
color: #888;
font-size: 28px;
color: #222;
font-weight: bold;
}
.definitions {
direction: rtl;
@ -199,32 +204,37 @@ CARD_CSS = """
}
.conf-entry {
margin: 8px 0;
font-size: 20px;
font-size: 28px;
direction: rtl;
}
.related-group {
direction: rtl;
text-align: right;
text-align: center;
margin: 2px 0;
font-size: 18px;
font-size: 26px;
}
.emoji-img {
font-size: 3.5em;
text-align: center;
margin: 0.3em 0;
}
.card [type="button"], .card button, .replay-button {
display: block !important;
margin: 4px auto !important;
text-align: center;
}
@media (prefers-color-scheme: dark) {
.card { color: #e8e8e8; background: #1c1c1e; }
.hebrew { color: #f0f0f0; }
.hebrew-sm { color: #ddd; }
.hebrew-sm { color: #e0e0e0; }
.meaning { color: #82b0ff; }
.root-info { color: #aaa; }
.sec-label { color: #aaa; }
.sec-key { color: #666; }
.root-info { color: #e0e0e0; }
.sec-label { color: #e0e0e0; }
.sec-key { color: #e0e0e0; }
.conf-entry { color: #ddd; }
.hint { color: #777; }
.voice-label { color: #888; }
.example { color: #bbb; border-right-color: #555; }
.example { color: #e0e0e0; border-right-color: #555; }
.divider { border-top-color: #333; }
.freq-badge { color: #888; border-color: #444; }
}
@ -252,9 +262,6 @@ VOCAB_BACK_HEB = """
<div class="root-info">{{SharedRoots}}</div>
{{/SharedRoots}}
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
{{#Example}}
<div class="example">{{Example}}</div>
{{/Example}}
{{#Frequency}}<div class="freq-badge">#{{Frequency}}</div>{{/Frequency}}
"""
@ -273,14 +280,15 @@ VOCAB_BACK_ENG = """
{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span> {{WordNoNikkud}}</div>{{/WordNoNikkud}}
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>{{/Root}}
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span> {{PoS}}</div>{{/PoS}}
{{#SharedRoots}}
<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
<div class="root-info">{{SharedRoots}}</div>
{{/SharedRoots}}
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
{{#Example}}
<div class="example">{{Example}}</div>
{{/Example}}
"""
VOCAB_FRONT_CLOZE = """
<div class="example" style="font-size:24px;font-style:normal;border:none;padding:0;text-align:center;">{{ClozeExample}}</div>
<div class="example" style="font-size:32px;font-style:normal;border:none;padding:0;text-align:center;">{{ClozeExample}}</div>
{{#ClozeHint}}<div class="hint">{{ClozeHint}}</div>{{/ClozeHint}}
"""
@ -289,7 +297,6 @@ VOCAB_BACK_CLOZE = """
<div class="divider"></div>
<div class="hebrew">{{Word}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
<div class="meaning">{{Meaning}}</div>
"""
VOCAB_MODEL = genanki.Model(
@ -343,8 +350,8 @@ VOCAB_MODEL = genanki.Model(
CONJ_FRONT = """
<div class="sec-label">אֵיךְ אוֹמְרִים</div>
<div class="hebrew" style="color:#1a1a8c;">{{ReferenceForm}}{{#Prep}} ({{Prep}}){{/Prep}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
<div class="hebrew">{{Pronoun}}</div>
<div class="hebrew" style="color:#1a1a8c;">{{Infinitive}}{{#Prep}} <span class="hebrew-sm">({{Prep}})</span>{{/Prep}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
<div class="hebrew">{{Tense}}</div>
"""
@ -363,7 +370,7 @@ CONJ_CSS = CARD_CSS
CONJ_MODEL = genanki.Model(
CONJ_MODEL_ID,
"Pealim Conjugation",
"Hebrew Conjugation",
fields=[
{"name": "Infinitive"},
{"name": "ReferenceForm"},
@ -666,8 +673,9 @@ def _load_emoji_lookup() -> dict[str, str]:
def _categorize_pos(pos_str: str) -> str:
    """Return the canonical PoS category key for grouping.

    Only the leading part of ``pos_str`` (the text before the first dash
    separator) is considered, and it must equal one of the entries of
    ``POS_CATEGORY_LABELS`` exactly. Anything else is grouped under
    ``"Other"``.

    Args:
        pos_str: Raw part-of-speech string of an entry.

    Returns:
        The matching category key, or ``"Other"`` when none matches.
    """
    # NOTE(review): the separator literals were empty in the reviewed text, and
    # ``str.split("")`` raises ValueError. En dash / em dash is assumed here
    # (e.g. "Noun – ketel pattern") — confirm against the real PoS strings.
    base = pos_str.split("\u2013")[0].split("\u2014")[0].strip()
    for cat in POS_CATEGORY_LABELS:
        # Exact match on the leading part; the earlier substring test
        # (``cat.lower() in pos_str.lower()``) is superseded by this check.
        if base == cat:
            return cat
    return "Other"
@ -745,10 +753,14 @@ def build_vocab_deck(
word_nikkud = entry["word"]["nikkud"]
word_no_nik = entry["word"].get("ktiv_male", "")
root_list = entry.get("root") or []
root = " ".join(root_list)
root = ".".join(root_list)
pos_raw = entry.get("pos", "")
pos_heb = entry.get("pos_hebrew", "")
meaning = entry.get("meaning", "") or ""
meaning = EMOJI_RE.sub("", entry.get("meaning", "") or "").strip()
meaning = HBPAREN_RE.sub("", meaning).strip()
meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ")
meaning = re.sub(r"(\w)\(", r"\1 (", meaning) # space before opening paren
meaning = re.sub(r",(\S)", r", \1", meaning) # space after comma
meaning_raw = entry.get("meaning_raw", "") or ""
slug = entry.get("slug", "") or ""
frequency = entry.get("frequency") or 999_999
@ -839,6 +851,9 @@ def build_vocab_deck(
end = cloze_data.get("cloze_word_end")
if cloze_text and start is not None and end is not None:
cloze_example = cloze_text[:start] + "_____" + cloze_text[end:]
# Clean up duplicate/misplaced quotation marks
cloze_example = re.sub(r'["״]\s*["״]', '"', cloze_example)
cloze_example = re.sub(r'^\s*["״]\s*', "", cloze_example)
raw_hint = cloze_data.get("cloze_hint") or ""
if raw_hint:
cloze_hint = raw_hint
@ -871,8 +886,9 @@ def build_vocab_deck(
parts.append(f'<div class="related-group"><b>{label}:</b> {" ".join(rw_words)}</div>')
related_html = "\n".join(parts)
# Plural form (for nouns)
# Plural form (nouns only — guard against adjective/verb inflection bleed)
plural_str = ""
if pos_raw.startswith("Noun"):
noun_inflection = entry.get("noun_inflection")
if noun_inflection and noun_inflection.get("plural"):
plural_str = noun_inflection["plural"].get("nikkud", "")
@ -977,18 +993,28 @@ def build_conj_deck(
binyan_heb: str = conj.get("binyan_hebrew") or BINYAN_TO_HEBREW.get(binyan, binyan) or ""
slug = entry.get("slug", "") or ""
root_list = entry.get("root") or []
root = " ".join(root_list)
root = ".".join(root_list)
voice = VOICE_MAP.get(binyan, "")
meaning_raw = entry.get("meaning_raw", "") or ""
meaning = entry.get("meaning", "") or ""
# Extract Hebrew preposition from meaning_raw
# Extract Hebrew preposition — strip from meaning, show on Hebrew side
prep_str = ""
conj_prep = conj.get("prep")
if conj_prep:
prep_str = f"({conj_prep})"
elif meaning:
preps = HBPAREN_RE.findall(entry.get("meaning_raw", "") or "")
prep_str = " ".join(f"({p})" for p in preps)
# Strip any parentheses from stored prep value
prep_str = conj_prep.strip("() ")
elif meaning_raw:
preps = HBPAREN_RE.findall(meaning_raw)
if preps:
prep_str = preps[0]
# Strip Hebrew prepositions from English meaning to avoid duplication
if prep_str:
meaning = HBPAREN_RE.sub("", meaning).strip()
# Also strip from meaning_raw patterns like "(על)"
meaning = re.sub(r"\(\s*" + re.escape(prep_str) + r"\s*-?\s*\)", "", meaning).strip()
# Clean up double spaces and trailing commas
meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ")
related = [w for w in root_words.get(root, []) if w != infinitive]
related_str = " ".join(related[:8]) if related else ""
@ -1024,7 +1050,7 @@ def build_conj_deck(
elif guid_candidates:
note_guid = guid_candidates[0]
else:
note_guid = genanki.guid_for(_infinitive, pronoun, tense)
note_guid = genanki.guid_for(_infinitive, pronoun, tense, _binyan_heb)
note = genanki.Note(
model=CONJ_MODEL,
guid=note_guid,
@ -1213,8 +1239,10 @@ def build_conj_deck(
# ──────────────────────────────────────────────────────────────────────────────
CONF_FRONT = """
<div style="direction:rtl; text-align:center;">
<div class="hebrew" style="font-size:36px;">{{Words}}</div>
<div class="meaning" style="font-size:32px; direction:rtl; text-align:center;">מה ההבדל?</div>
</div>
"""
CONF_BACK = """
@ -1271,7 +1299,10 @@ def build_confusables_deck(
guid = genanki.guid_for("confusable", entry["word"].get("ktiv_male", unique_key))
guid_to_entries.setdefault(guid, []).append(entry)
for guid, group_entries in sorted(guid_to_entries.items(), key=lambda x: x[0]):
for guid, group_entries in sorted(
guid_to_entries.items(),
key=lambda x: sum(e.get("frequency") or 999_999 for e in x[1]) / len(x[1]),
):
if guid in seen_guids:
continue
seen_guids.add(guid)
@ -1366,6 +1397,7 @@ PLURAL_BACK_SG = """
{{FrontSide}}<hr>
<div class="hebrew">{{Plural}}</div>
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span> {{Gender}}</div>{{/Gender}}
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
"""
@ -1380,6 +1412,7 @@ PLURAL_BACK_PL = """
<div class="hebrew">{{Singular}}</div>
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
<div class="sec-label">{{Meaning}}</div>
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span> {{Gender}}</div>{{/Gender}}
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
"""
@ -1483,10 +1516,11 @@ def build_plural_deck(
plural = noun_inflection["plural"]["nikkud"]
plural_ktiv = noun_inflection["plural"].get("ktiv_male", "")
gender = noun_inflection.get("gender") or ""
gender_heb = {"masculine": "זָכָר", "feminine": "נְקֵבָה"}.get(gender, gender)
mishkal = noun_inflection.get("mishkal") or ""
meaning = entry.get("meaning") or ""
meaning = EMOJI_RE.sub("", entry.get("meaning") or "").strip()
root_list = entry.get("root") or []
root = " ".join(root_list)
root = ".".join(root_list)
# GUID from noun_inflection
note_guid_raw = noun_inflection.get("plurals_guid")
@ -1520,7 +1554,7 @@ def build_plural_deck(
meaning,
root,
mishkal,
gender,
gender_heb,
],
tags=tags,
)

View file

@ -1,202 +0,0 @@
#!/usr/bin/env python3
"""
Ben Yehuda corpus example-sentence lookup (nikkud corpus).
TODO: Rewrite to update words.json examples fields directly instead of
writing to a separate examples_cache.json. Currently the migration script
bridges the gap. See Phase 5 in SPRINT_LOG.md.
Downloads the nikkud-bearing plaintext ZIP once, indexes sentences by nikkud word form,
then answers queries locally.
Exposed API:
load(force_rebuild=False)
get_examples(word_nikkud) -> list[str] (returns 0 or 1 examples)
save_examples_cache()
"""
import json
import logging
import re
import zipfile
from io import BytesIO
from pathlib import Path
import requests
from helpers import strip_nikkud as _strip_nikkud
logger = logging.getLogger(__name__)
# Nikkud-bearing corpus (txt.zip instead of txt_stripped.zip)
CORPUS_URL = "https://github.com/projectbenyehuda/public_domain_dump/releases/download/2025-10/txt.zip"
# On-disk locations, both under the "data" directory next to this module:
# the word -> sentences index and the per-word lookup results.
INDEX_PATH = Path(__file__).parent / "data" / "benyehuda_index.json"
EXAMPLES_CACHE_PATH = Path(__file__).parent / "data" / "examples_cache.json"
# Passed as ``timeout=`` to requests.get() when downloading the corpus.
REQUEST_TIMEOUT = 120
# Sentence length bounds, measured with len() on the trimmed line (characters).
MIN_SENTENCE_LEN = 20
MAX_SENTENCE_LEN = 200
MAX_INDEX_ENTRIES = 500  # cap examples kept per word in index to limit memory
# Module-level state
_index: dict[str, list[str]] = {}  # word (with nikkud) -> [sentence, ...]
# word -> lookup result; filled by get_examples(), reloaded from
# EXAMPLES_CACHE_PATH by load() and written back by save_examples_cache().
_examples_cache: dict[str, list[str]] = {}
def _split_sentences(text: str) -> list[str]:
    """
    Treat every line of ``text`` as one candidate sentence.

    Each line is trimmed of whitespace, then of surrounding quote and
    punctuation characters, then of whitespace again. Only candidates whose
    length lies within [MIN_SENTENCE_LEN, MAX_SENTENCE_LEN] are returned, in
    their original order.
    """
    trimmed = (line.strip().strip("\"'.,;:!?").strip() for line in text.split("\n"))
    return [s for s in trimmed if MIN_SENTENCE_LEN <= len(s) <= MAX_SENTENCE_LEN]
def _build_index(corpus_zip_bytes: bytes) -> None:
    """Parse the corpus ZIP and rebuild the word (nikkud) → sentences index.

    Replaces the module-level ``_index``. Every ``.txt`` member is decoded as
    UTF-8 (undecodable bytes are dropped), split with ``_split_sentences``,
    and each sentence is filed under every distinct token of length >= 2 it
    contains, keeping at most ``MAX_INDEX_ENTRIES`` sentences per token.

    Args:
        corpus_zip_bytes: Raw bytes of the corpus ZIP archive.
    """
    global _index
    _index = {}
    # Hebrew letters, nikkud marks and quote characters form one token.
    # Compiled once instead of once per sentence.
    token_re = re.compile(r"[\u05d0-\u05ea\u05b0-\u05c7'\"]+")
    logger.info("Building Ben Yehuda index from nikkud corpus …")
    with zipfile.ZipFile(BytesIO(corpus_zip_bytes)) as zf:
        txt_files = [n for n in zf.namelist() if n.endswith(".txt")]
        logger.info(f" Corpus contains {len(txt_files)} text files")
        for fname in txt_files:
            try:
                raw = zf.read(fname).decode("utf-8", errors="ignore")
            except Exception as exc:
                # Best effort: one unreadable member must not abort the whole
                # build, but leave a trace instead of dropping it silently.
                logger.warning("Skipping unreadable corpus file %s: %s", fname, exc)
                continue
            for sentence in _split_sentences(raw):
                # Index by each unique Hebrew token (with nikkud) in the sentence
                for w in set(token_re.findall(sentence)):
                    if len(w) < 2:
                        continue
                    bucket = _index.setdefault(w, [])
                    if len(bucket) < MAX_INDEX_ENTRIES:
                        bucket.append(sentence)
    logger.info(f"Index built: {len(_index)} unique word forms")
def _save_index() -> None:
    """Write the in-memory index to ``INDEX_PATH`` as UTF-8 JSON.

    The parent directory is created first if it does not exist.
    """
    INDEX_PATH.parent.mkdir(parents=True, exist_ok=True)
    with INDEX_PATH.open("w", encoding="utf-8") as out:
        json.dump(_index, out, ensure_ascii=False)
    logger.info(f"Ben Yehuda index saved → {INDEX_PATH}")
def _load_index() -> None:
    """Replace the in-memory index with the JSON document stored at ``INDEX_PATH``."""
    global _index
    with INDEX_PATH.open(encoding="utf-8") as src:
        _index = json.load(src)
    logger.info(f"Ben Yehuda index loaded: {len(_index)} word forms")
def load(force_rebuild: bool = False) -> None:
    """Load or build the Ben Yehuda index, downloading the corpus if needed.

    Behaviour:
      * Index already in memory and no rebuild requested: nothing happens.
      * ``force_rebuild``: the on-disk index is deleted, the in-memory
        examples cache is reset, and the corpus is downloaded and re-indexed.
      * Otherwise: the persisted examples cache (if any) is read, and the
        on-disk index is loaded when present; only when it is missing is the
        corpus downloaded and indexed.

    Args:
        force_rebuild: Discard the existing index and cache and rebuild from a
            fresh download.

    Raises:
        requests.HTTPError: If the corpus download returns an error status.
    """
    global _index, _examples_cache
    if _index and not force_rebuild:
        return
    if force_rebuild:
        # Delete old index and discard examples cache
        if INDEX_PATH.exists():
            INDEX_PATH.unlink()
            logger.info("Deleted old Ben Yehuda index (force rebuild)")
        _examples_cache = {}
    else:
        # Load persisted examples cache (not needed on rebuild)
        if EXAMPLES_CACHE_PATH.exists():
            with open(EXAMPLES_CACHE_PATH, encoding="utf-8") as f:
                _examples_cache = json.load(f)
        if INDEX_PATH.exists():
            _load_index()
            return
    logger.info("Downloading Ben Yehuda nikkud corpus … (this may take 2-3 minutes)")
    # A streamed response keeps its connection until the body is consumed or
    # the response is closed; the ``with`` block guarantees the close even
    # when raise_for_status() raises before the body is read.
    with requests.get(CORPUS_URL, timeout=REQUEST_TIMEOUT, stream=True) as resp:
        resp.raise_for_status()
        data = resp.content
    logger.info(f"Corpus downloaded: {len(data) / 1e6:.1f} MB")
    _build_index(data)
    _save_index()
def save_examples_cache() -> None:
    """Persist the per-word lookup results to ``EXAMPLES_CACHE_PATH`` as UTF-8 JSON.

    The parent directory is created first if it does not exist.
    """
    EXAMPLES_CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
    with EXAMPLES_CACHE_PATH.open("w", encoding="utf-8") as out:
        json.dump(_examples_cache, out, ensure_ascii=False)
    logger.info(f"Examples cache saved: {len(_examples_cache)} entries → {EXAMPLES_CACHE_PATH}")
def get_examples(word_nikkud: str, confusable_consonants: set[str] | None = None) -> list[str]:
    """
    Look up at most one example sentence for ``word_nikkud``.

    The index is loaded on demand. Results are memoised per (trimmed) word in
    the module-level examples cache, so a repeated query returns the cached
    list.

    Lookup strategy:
    1. Exact nikkud match in the index.
    2. Otherwise the first index key whose nikkud-stripped form equals the
       stripped query. This fallback is skipped when the stripped query is in
       ``confusable_consonants`` (to avoid sentences for the wrong homograph).

    Candidates must contain the stripped word as a whole token and be between
    MIN_SENTENCE_LEN and MAX_SENTENCE_LEN characters long.

    Returns:
        ``[sentence]`` with the single longest sentence ≤ MAX_SENTENCE_LEN
        that qualifies, or ``[]`` when nothing qualifies.
    """
    if not _index:
        load()

    word = word_nikkud.strip()
    bare = _strip_nikkud(word)
    if word in _examples_cache:
        return _examples_cache[word]

    blocked = confusable_consonants or set()
    candidates = _index.get(word, [])
    if not candidates and bare and bare not in blocked:
        # Fallback: first index key that agrees once nikkud is removed.
        candidates = next(
            (sentences for key, sentences in _index.items() if _strip_nikkud(key) == bare),
            candidates,
        )

    if bare:
        # Whole-token match on the stripped forms, so nikkud variants inside
        # the sentence do not prevent a hit.
        token_re = re.compile(r"(?<!\w)" + re.escape(bare) + r"(?!\w)")
        matched = [s for s in candidates if token_re.search(_strip_nikkud(s))]
    else:
        matched = candidates[:]

    matched = [s for s in matched if MIN_SENTENCE_LEN <= len(s) <= MAX_SENTENCE_LEN]
    result = [max(matched, key=len)] if matched else []

    _examples_cache[word] = result
    return result
if __name__ == "__main__":
    # Manual smoke test: load the index, look up a handful of common words,
    # print what was found, then persist the lookup results.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    load()
    tests = ["שָׁלוֹם", "בַּיִת", "סֵפֶר", "מַיִם", "אַהֲבָה", "יֶלֶד"]
    for sample in tests:
        found = get_examples(sample)
        print(f"\n{sample}: {len(found)} example(s)")
        for sentence in found:
            print(f"{sentence[:100]}")
    save_examples_cache()

View file

@ -1,18 +1,17 @@
#!/usr/bin/env python3
"""
Extract example sentences from nikud'd Hebrew EPUBs (and PDFs where possible),
match them against the vocab list, and produce examples_cache.json.
Extract example sentences from nikud'd Hebrew EPUB files, match them against
the vocabulary list in data/words.json, and write matched examples back into
words.json.
Usage:
Usage (standalone):
python3 epub_examples.py
Outputs:
data/epub_sentence_index.json full sentence corpus
data/examples_cache.json best sentence(s) per vocab word
Called from run.py via:
run(words) — words dict is passed in and updated in place
"""
import csv
import json
import logging
import os
import re
import zipfile
@ -21,20 +20,38 @@ from pathlib import Path
from helpers import strip_nikkud
logger = logging.getLogger(__name__)
DATA_DIR = Path(__file__).parent / "data"
EPUB_DIR = DATA_DIR / "epubs"
DICT_CSV = DATA_DIR / "hebrew_dict_for_anki.csv"
WORDS_JSON = DATA_DIR / "words.json"
# Book metadata: filename -> display name
EPUB_BOOKS = {
"little_prince.epub": "הנסיך הקטן",
"time_tunnel_82.epub": "מנהרת הזמן 82",
}
def _discover_epubs() -> dict[str, str]:
    """Find every ``*.epub`` in EPUB_DIR and give each a short label.

    Returns:
        ``{str(path): label}`` in sorted path order; empty when EPUB_DIR does
        not exist. Known titles get a fixed label; any other file is labelled
        with the first 40 characters of its lower-cased, nikkud-free stem.
    """

    def _label_for(stem: str) -> str:
        # Two views of the filename: the whole stem, and the title part that
        # precedes the first " -- " separator.
        flat = strip_nikkud(stem).lower()
        title = strip_nikkud(stem.split(" -- ")[0]).strip().lower()
        if "alice" in flat or "אליס" in title:
            return "alice_wonderland"
        if "little_prince" in flat or "נסיך" in title:
            return "little_prince"
        if "מנהרת" in title or "time_tunnel" in flat:
            digits = re.search(r"(\d+)", flat)
            suffix = digits.group(1) if digits else flat.replace("time_tunnel_", "")
            return f"time_tunnel_{suffix}"
        return flat[:40]

    if not EPUB_DIR.exists():
        return {}
    return {str(epub): _label_for(epub.stem) for epub in sorted(EPUB_DIR.glob("*.epub"))}
# PDF books are excluded — pypdf produces garbled RTL text (reversed chars within
# words). If/when a proper EPUB version becomes available on Calibre, add it to
# EPUB_BOOKS above instead.
PDF_BOOKS: dict[str, str] = {}
# Sentence length bounds (word count)
MIN_WORDS = 4
@ -58,7 +75,7 @@ class _TextExtractor(HTMLParser):
_ = attrs # required by HTMLParser interface
if tag in self.SKIP_TAGS:
self._skip_depth += 1
# Insert space for block-level elements to avoid word concatenation
# Insert newline for block-level elements to avoid word concatenation
if tag in (
"p",
"div",
@ -102,7 +119,6 @@ def extract_text_from_html(html: str) -> str:
def _content_files_from_epub(zf: zipfile.ZipFile) -> list[str]:
"""Get ordered list of content XHTML files from the OPF manifest."""
# Find the OPF file
opf_path = None
for name in zf.namelist():
if name.endswith(".opf"):
@ -124,7 +140,7 @@ def _content_files_from_epub(zf: zipfile.ZipFile) -> list[str]:
opf_dir = os.path.dirname(opf_path)
# Extract manifest items: id -> href
manifest = {}
manifest: dict[str, str] = {}
for m in re.finditer(r'<item\s+[^>]*id="([^"]+)"[^>]*href="([^"]+)"', opf_content):
manifest[m.group(1)] = m.group(2)
# Also try reversed attribute order
@ -157,7 +173,12 @@ def _content_files_from_epub(zf: zipfile.ZipFile) -> list[str]:
def extract_sentences_from_epub(epub_path: Path, book_name: str) -> list[dict]:
"""Extract sentences from an EPUB file.
Returns list of {"text": str, "book": str, "stripped": str}
Args:
epub_path: Path to the .epub file.
book_name: Human-readable book name used as the ``source`` field.
Returns:
List of ``{"text": str, "source": str}`` dicts.
"""
zf = zipfile.ZipFile(epub_path)
content_files = _content_files_from_epub(zf)
@ -175,41 +196,6 @@ def extract_sentences_from_epub(epub_path: Path, book_name: str) -> list[dict]:
return _split_into_sentences(full_text, book_name)
# ── PDF processing ───────────────────────────────────────────────
def extract_sentences_from_pdf(pdf_path: Path, book_name: str) -> list[dict]:
    """Best-effort sentence extraction from a PDF.

    Text is pulled page by page with pypdf. On lines that are predominantly
    Hebrew (more than 30% of characters in U+0590–U+05FF) and have more than
    one word, the word order is reversed. The repaired text is then handed to
    ``_split_into_sentences``.

    Args:
        pdf_path: Path to the PDF file.
        book_name: Label passed through to ``_split_into_sentences``.

    Returns:
        The sentence dicts produced by ``_split_into_sentences``, or ``[]``
        when pypdf is not installed.
    """
    try:
        import pypdf
    except ImportError:
        print(f" [SKIP] pypdf not installed, cannot process {pdf_path.name}")
        return []

    def _repair_line(line: str) -> str:
        # Reverse word order only on predominantly Hebrew, multi-word lines.
        words = line.split()
        hebrew_chars = sum(1 for c in line if "\u0590" <= c <= "\u05ff")
        if hebrew_chars > len(line) * 0.3 and len(words) > 1:
            return " ".join(reversed(words))
        return line

    page_texts = []
    for page in pypdf.PdfReader(pdf_path).pages:
        raw = page.extract_text()
        if raw:
            page_texts.append("\n".join(_repair_line(ln) for ln in raw.split("\n")))

    return _split_into_sentences("\n".join(page_texts), book_name)
# ── Sentence splitting ───────────────────────────────────────────
# Hebrew sentence terminators: period, exclamation, question mark, sof pasuk
@ -217,18 +203,27 @@ _SENT_SPLIT = re.compile(r"[.!?\u05C3]+")
# Punctuation to strip from word boundaries when matching
_PUNCT = re.compile(
r'^[\u0022\u0027\u05F4\u05F3,;:\-–—…\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+|[\u0022\u0027\u05F4\u05F3,;:\-–—…\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+$'
r'^[\u0022\u0027\u05F4\u05F3,;:\-\u2013\u2014\u2026\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+|'
r'[\u0022\u0027\u05F4\u05F3,;:\-\u2013\u2014\u2026\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+$'
)
def _split_into_sentences(text: str, book_name: str) -> list[dict]:
"""Split text into sentences and filter by length."""
"""Split text into Hebrew sentences and filter by word count.
Args:
text: Raw extracted text from an EPUB chapter.
book_name: Source label for each sentence dict.
Returns:
List of ``{"text": str, "source": str}`` dicts, deduplicated by exact text.
"""
# Normalize whitespace
text = re.sub(r"\s+", " ", text).strip()
raw_sentences = _SENT_SPLIT.split(text)
results = []
seen = set()
results: list[dict] = []
seen: set[str] = set()
for sent in raw_sentences:
sent = sent.strip()
@ -242,205 +237,555 @@ def _split_into_sentences(text: str, book_name: str) -> list[dict]:
if len(hebrew_words) < MIN_WORDS or len(hebrew_words) > MAX_WORDS:
continue
# Skip duplicates
stripped = strip_nikkud(sent)
if stripped in seen:
# Deduplicate by exact nikkud text
if sent in seen:
continue
seen.add(stripped)
seen.add(sent)
results.append(
{
"text": sent,
"book": book_name,
"stripped": stripped,
}
)
results.append({"text": sent, "source": book_name})
return results
# ── Vocab loading ────────────────────────────────────────────────
# ── Nikkud index ─────────────────────────────────────────────────
# Unicode ranges for Hebrew combining marks
_NIKKUD_LOW = 0x05B0 # start of vowel points (shva)
_NIKKUD_HIGH = 0x05BD # end of vowel range (meteg); 0x05BE is maqaf (punctuation)
_DAGESH = "\u05bc"
_SHIN_DOT = "\u05c1"
_SIN_DOT = "\u05c2"
# Valid prefix consonants
_PREFIX_CONSONANTS = set("בהוכלמש")
# Named vowel combining marks
_SHVA = "\u05b0"
_HIRIQ = "\u05b4"
_TSERE = "\u05b5"
_SEGOL = "\u05b6"
_PATACH = "\u05b7"
_QAMATZ = "\u05b8"
# Valid nikkud patterns on each prefix consonant.
# Key = consonant, Value = set of frozensets of combining marks valid for that prefix.
_VALID_PREFIX_MARKS: dict[str, set[frozenset]] = {
"ב": {
frozenset({_SHVA, _DAGESH}), # בְּ standard
frozenset({_HIRIQ, _DAGESH}), # בִּ before shva
frozenset({_PATACH, _DAGESH}), # בַּ with definite article
frozenset({_QAMATZ, _DAGESH}), # בָּ before chataf qamatz
frozenset({_SEGOL, _DAGESH}), # בֶּ before chataf segol
},
"כ": {
frozenset({_SHVA, _DAGESH}), # כְּ
frozenset({_HIRIQ, _DAGESH}), # כִּ
frozenset({_PATACH, _DAGESH}), # כַּ
frozenset({_QAMATZ, _DAGESH}), # כָּ
frozenset({_SEGOL, _DAGESH}), # כֶּ
},
"ל": {
frozenset({_SHVA}), # לְ standard
frozenset({_HIRIQ}), # לִ before shva
frozenset({_PATACH}), # לַ with definite article
frozenset({_QAMATZ}), # לָ demonstratives
frozenset({_SEGOL}), # לֶ before chataf segol
},
"ו": {
frozenset({_SHVA}), # וְ standard
frozenset({_DAGESH}), # וּ (shureq) before shva/bumf
frozenset({_PATACH}), # וַ before chataf patach
frozenset({_QAMATZ}), # וָ before chataf qamatz
frozenset({_SEGOL}), # וֶ before chataf segol
frozenset({_HIRIQ}), # וִ before yud-shva
},
"מ": {
frozenset({_HIRIQ}), # מִ standard
frozenset({_TSERE}), # מֵ before gutturals
},
"ש": {
frozenset({_SEGOL, _DAGESH}), # שֶׁ standard
frozenset({_SEGOL, _DAGESH, _SHIN_DOT}), # שֶׁ with explicit shin dot
},
"ה": {
frozenset({_PATACH}), # הַ standard definite article
frozenset({_QAMATZ}), # הָ before gutturals
frozenset({_SEGOL}), # הֶ before qamatz-bearing gutturals
},
}
def load_vocab(csv_path: Path) -> dict:
"""Load vocab CSV and return {stripped_form: nikkud_word} mapping.
def _is_combining_mark(ch: str) -> bool:
    """Tell whether ``ch`` is a Hebrew combining mark.

    A mark is any code point within [_NIKKUD_LOW, _NIKKUD_HIGH], or one of
    the dagesh, shin-dot and sin-dot characters.
    """
    in_vowel_range = _NIKKUD_LOW <= ord(ch) <= _NIKKUD_HIGH
    return in_vowel_range or ch in (_DAGESH, _SHIN_DOT, _SIN_DOT)
Also returns reverse mapping for lookup.
Returns (word_to_nikkud, nikkud_words_set)
def _decompose_first_char(token: str) -> tuple[str, frozenset, str]:
"""Split token into (first_consonant, its_combining_marks, remainder).
Args:
token: A nikkud Hebrew token string.
Returns:
A tuple of (consonant, marks, rest). Returns ("", frozenset(), token)
if the token does not start with a Hebrew consonant (alef–tav range).
"""
words_by_stripped: dict[str, list[str]] = {} # stripped -> [nikkud words]
if not token:
return ("", frozenset(), token)
with open(csv_path, encoding="utf-8") as f:
reader = csv.DictReader(f, delimiter=";")
for row in reader:
nikkud_word = row.get("Word", "").strip()
word_no_nik = row.get("Word Without Nikkud", "").strip()
if not nikkud_word:
continue
first = token[0]
# Check it's a Hebrew consonant (alef–tav)
if not ("\u05d0" <= first <= "\u05ea"):
return ("", frozenset(), token)
# Method 1: strip nikkud from the Word column
stripped_from_nikkud = strip_nikkud(nikkud_word)
# Collect all combining marks that follow the consonant
marks: set[str] = set()
i = 1
while i < len(token):
ch = token[i]
if _is_combining_mark(ch):
marks.add(ch)
i += 1
else:
break
# Add both forms for matching
for form in {stripped_from_nikkud, word_no_nik}:
return (first, frozenset(marks), token[i:])
def _is_valid_prefix(consonant: str, marks: frozenset) -> bool:
    """Decide whether ``consonant`` vocalised with ``marks`` is a known prefix.

    Args:
        consonant: The candidate prefix consonant.
        marks: The combining marks found on that consonant.

    Returns:
        True when the exact mark set is listed for the consonant in
        ``_VALID_PREFIX_MARKS``. For ש the shin dot is optional: the mark set
        is also accepted if it is listed once the shin dot is removed.
    """
    allowed = _VALID_PREFIX_MARKS.get(consonant)
    if not allowed:
        return False
    if marks in allowed:
        return True
    # ש only: retry with the shin dot dropped.
    return consonant == "ש" and (marks - {_SHIN_DOT}) in allowed
def _rebuild_token(consonant: str, marks: frozenset, rest: str) -> str:
"""Reassemble a token from its decomposed parts, sorting marks by codepoint."""
return consonant + "".join(sorted(marks)) + rest
def _try_strip_prefix(token: str, nikkud_index: dict) -> list[tuple[str, str, str]]:
    """Look for index hits after peeling one or two prefix letters off ``token``.

    For each remainder (after one prefix letter, and after two when the first
    is ו or ש and the second is itself a valid prefix) two forms are tried:
    the remainder as is, and the remainder with the dagesh removed from its
    first letter (absorbed definite article: לַמֶּלֶךְ → מֶּלֶךְ → מֶלֶךְ).

    Args:
        token: A cleaned nikkud word token.
        nikkud_index: Mapping from nikkud form to list of (unique_key, match_type).

    Returns:
        ``(unique_key, match_type + "_prefix", matched_form)`` for every hit,
        one-letter-prefix hits first. Empty when the token has no valid
        prefix or nothing remains after it.
    """
    hits: list[tuple[str, str, str]] = []

    def _record(form: str) -> None:
        # Append every index entry stored under ``form``.
        if form in nikkud_index:
            for unique_key, match_type in nikkud_index[form]:
                hits.append((unique_key, match_type + "_prefix", form))

    def _probe(form: str) -> None:
        # Try ``form`` itself, then ``form`` without its leading dagesh.
        _record(form)
        head, head_marks, tail = _decompose_first_char(form)
        if head and _DAGESH in head_marks:
            softened = _rebuild_token(head, head_marks - {_DAGESH}, tail)
            if softened != form:
                _record(softened)

    first, first_marks, after_one = _decompose_first_char(token)
    if not first or not _is_valid_prefix(first, first_marks) or not after_one:
        return hits

    _probe(after_one)

    # ו and ש commonly stack with another prefix
    if first in "וש":
        second, second_marks, after_two = _decompose_first_char(after_one)
        if (
            second
            and second in _PREFIX_CONSONANTS
            and _is_valid_prefix(second, second_marks)
            and after_two
        ):
            _probe(after_two)

    return hits
def _build_nikkud_index(words: dict) -> dict[str, list[tuple[str, str]]]:
"""Build a mapping from nikkud form to list of (unique_key, match_type).
Indexes the following sources per entry:
- ``word.nikkud`` "direct"
- conjugation active/passive forms "conjugated"
- conjugation infinitive and reference_form "conjugated"
- noun inflection singular/plural/construct/pronominal "inflected"
Args:
words: The full words.json dict keyed by unique_key.
Returns:
Dict mapping each nikkud form to a list of (unique_key, match_type) tuples.
"""
index: dict[str, list[tuple[str, str]]] = {}
def _add(form: str | None, unique_key: str, match_type: str) -> None:
if form:
words_by_stripped.setdefault(form, []).append(nikkud_word)
index.setdefault(form, []).append((unique_key, match_type))
return words_by_stripped
for unique_key, entry in words.items():
# Direct word form
word = entry.get("word") or {}
_add(word.get("nikkud"), unique_key, "direct")
# Conjugation forms
conj = entry.get("conjugation") or {}
for form_entry in conj.get("active_forms") or []:
form = (form_entry.get("form") or {}).get("nikkud")
_add(form, unique_key, "conjugated")
for form_entry in conj.get("hufal_pual_forms") or []:
form = (form_entry.get("form") or {}).get("nikkud")
_add(form, unique_key, "conjugated")
inf = conj.get("infinitive") or {}
_add(inf.get("nikkud"), unique_key, "conjugated")
ref = conj.get("reference_form") or {}
_add(ref.get("nikkud"), unique_key, "conjugated")
# Noun inflection forms
noun = entry.get("noun_inflection") or {}
for field in ("singular", "plural", "construct_singular", "construct_plural"):
sub = noun.get(field) or {}
_add(sub.get("nikkud"), unique_key, "inflected")
pronominal = noun.get("pronominal_suffixes") or {}
for _person, sub in pronominal.items():
if isinstance(sub, dict):
_add(sub.get("nikkud"), unique_key, "inflected")
return index
def _filter_collision_forms(nikkud_index: dict) -> dict:
    """Drop ambiguous form mappings for entries that can be matched otherwise.

    A form is ambiguous ("colliding") when it maps to two or more distinct
    unique_keys.  A unique_key that owns at least one non-colliding form is
    removed from every colliding form it appears under.  A unique_key whose
    indexed forms *all* collide is left in place, since removing it would
    leave that entry with no way to match.

    Returns a new index dict with the same structure; a colliding form that
    loses all of its entries is omitted.
    """
    # Single pass: reverse map (key -> its forms) and the set of ambiguous forms.
    forms_by_key: dict[str, set[str]] = {}
    ambiguous: set[str] = set()
    for form, entries in nikkud_index.items():
        owners: set[str] = set()
        for unique_key, _match_type in entries:
            owners.add(unique_key)
            forms_by_key.setdefault(unique_key, set()).add(form)
        if len(owners) > 1:
            ambiguous.add(form)

    # Keys that own at least one form outside the ambiguous set.
    has_unambiguous_form: set[str] = {
        unique_key for unique_key, forms in forms_by_key.items() if not forms <= ambiguous
    }

    pruned: dict[str, list[tuple[str, str]]] = {}
    dropped = 0
    for form, entries in nikkud_index.items():
        if form not in ambiguous:
            pruned[form] = entries
            continue
        survivors = [(uk, mt) for uk, mt in entries if uk not in has_unambiguous_form]
        dropped += len(entries) - len(survivors)
        if survivors:
            pruned[form] = survivors

    logger.info(f" Filtered {dropped} collision mappings from entries with unique forms")
    return pruned
# ── Matching ─────────────────────────────────────────────────────
def match_sentences(
    sentences: list[dict],
    nikkud_index: dict,
    confusable_keys: set[str],
) -> dict:
    """Match sentences to vocab words using the nikkud index.

    Each whitespace-separated word of a sentence is stripped of punctuation
    and looked up in ``nikkud_index``.  Prefix stripping is attempted only
    when the cleaned word has no direct entry in the index.

    Args:
        sentences: List of ``{"text": str, "source": str}`` dicts.
        nikkud_index: Output of ``_build_nikkud_index``.
        confusable_keys: Set of unique_keys that are in confusable groups.
            Not consulted here; kept so callers can pass the same arguments
            they pass to ``update_words_json``.

    Returns:
        Dict mapping unique_key → list of match dicts, each containing:
        ``text``, ``source``, ``match_method``, ``word_count``,
        ``matched_form``, ``char_offset``, ``char_end``.
    """
    matches: dict[str, list[dict]] = {}

    for sent_info in sentences:
        text = sent_info["text"]
        source = sent_info["source"]
        words_in_sent = text.split()
        word_count = len(words_in_sent)

        # Search cursor: each word is located at or after the end of the
        # previous one so repeated words get distinct offsets.
        char_pos = 0
        for raw_word in words_in_sent:
            word_start = text.find(raw_word, char_pos)
            cleaned = _PUNCT.sub("", raw_word)

            if not cleaned:
                # Pure punctuation: just advance the cursor past it.
                if word_start >= 0:
                    char_pos = word_start + len(raw_word)
                continue

            if word_start < 0:
                word_start = char_pos
            char_pos = word_start + len(raw_word)

            # Offset of the cleaned text inside the raw word; falls back to 0
            # when the cleaned text is not a contiguous substring of it.
            clean_offset = max(raw_word.find(cleaned), 0)
            clean_start = word_start + clean_offset
            clean_end = clean_start + len(cleaned)

            if cleaned in nikkud_index:
                # Direct nikkud match
                found = list(nikkud_index[cleaned])
            else:
                # Prefix stripping — only if no direct match exists
                found = [
                    (unique_key, match_type)
                    for unique_key, match_type, _remainder in _try_strip_prefix(cleaned, nikkud_index)
                ]

            for unique_key, match_method in found:
                matches.setdefault(unique_key, []).append(
                    {
                        "text": text,
                        "source": source,
                        "match_method": match_method,
                        "word_count": word_count,
                        "matched_form": cleaned,
                        "char_offset": clean_start,
                        "char_end": clean_end,
                    }
                )

    return matches
# ── Writing results ──────────────────────────────────────────────
def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) -> int:
    """Update words dict entries with matched example sentences.

    Selects up to 3 best sentences per word (scoring prefers 6–12 word
    sentences; prefix matches are used only when no other match exists).
    Also generates a cloze entry for the top match, unless the word is in
    the confusable set, in which case any existing cloze is removed.

    Args:
        words: The full words.json dict, modified in place.
        matches: Output of ``match_sentences``.
        confusable_keys: Set of unique_keys in confusable groups.

    Returns:
        Count of words.json entries that were updated.
    """

    def _length_penalty(sentence: dict) -> int:
        # 0 inside the preferred 6–12 word window, otherwise distance from 9.
        wc = sentence["word_count"]
        return 0 if 6 <= wc <= 12 else abs(wc - 9)

    updated = 0
    for unique_key, sent_list in matches.items():
        if unique_key not in words:
            continue
        entry = words[unique_key]

        # Deduplicate by sentence text, keeping the first occurrence (and
        # therefore the first character offsets) of each sentence.
        first_by_text: dict[str, dict] = {}
        for s in sent_list:
            first_by_text.setdefault(s["text"], s)
        unique = list(first_by_text.values())

        # Prefer direct matches; only fall back to prefix matches if none exist
        # (when there are no direct matches, every sentence is a prefix match).
        direct = [s for s in unique if "prefix" not in s["match_method"]]
        pool = direct or unique

        # Stable sort: ties keep their original encounter order.
        best = sorted(pool, key=_length_penalty)[:3]

        # Build vetted list
        if not entry.get("examples"):
            entry["examples"] = {}
        examples: dict = entry["examples"]
        examples["vetted"] = [
            {
                "text": s["text"],
                "source": s["source"],
                "match_method": s["match_method"],
            }
            for s in best
        ]

        # Build cloze from best sentence (skip confusables)
        if unique_key in confusable_keys:
            examples.pop("cloze", None)
        elif best:
            top = best[0]
            # Preserve existing cloze_guid if sentence text unchanged
            old_cloze = examples.get("cloze") or {}
            cloze_guid = None
            if old_cloze.get("text") == top["text"]:
                cloze_guid = old_cloze.get("cloze_guid")
            if not cloze_guid:
                # Either the sentence changed or the old cloze never had a
                # guid; in both cases a guid must be generated rather than
                # storing None.
                import genanki  # noqa: PLC0415 — import only where needed

                cloze_guid = genanki.guid_for("cloze", unique_key)
            examples["cloze"] = {
                "text": top["text"],
                "cloze_word_start": top["char_offset"],
                "cloze_word_end": top["char_end"],
                "cloze_hint": None,
                "cloze_guid": cloze_guid,
            }

        examples["rejected_count"] = 0
        updated += 1

    return updated
# ── Main ─────────────────────────────────────────────────────────
# ── Public API ───────────────────────────────────────────────────
def run(words: dict) -> dict:
    """Extract EPUB sentences, match against words, update words dict in place.

    Called from run.py with the already-loaded words.json dict.

    Args:
        words: The full words.json dict keyed by unique_key. Modified in place.

    Returns:
        Summary stats dict with keys ``books``, ``matched``, ``total_vocab``.
        When no sentences could be extracted, ``books`` is empty, ``matched``
        is 0 and ``words`` is left untouched.
    """
    logger.info(" Extracting sentences from EPUBs ...")
    all_sentences: list[dict] = []
    book_counts: dict[str, int] = {}

    for filepath, book_name in _discover_epubs().items():
        path = Path(filepath)
        sentences = extract_sentences_from_epub(path, book_name)
        book_counts[book_name] = len(sentences)
        all_sentences.extend(sentences)
        logger.info(f" {book_name}: {len(sentences)} sentences")

    if not all_sentences:
        logger.warning(" No EPUB files found — skipping example extraction")
        return {"books": {}, "matched": 0, "total_vocab": len(words)}

    logger.info(f" Total sentences: {len(all_sentences)}")

    # Build nikkud index
    logger.info(" Building nikkud index from words.json ...")
    nikkud_index = _build_nikkud_index(words)
    logger.info(f" {len(nikkud_index)} unique nikkud forms indexed")

    # Filter out collision forms for entries that have unique forms
    nikkud_index = _filter_collision_forms(nikkud_index)

    # Build confusable key set
    confusable_keys: set[str] = {
        key for key, entry in words.items() if entry.get("confusable_group")
    }

    # Match sentences
    logger.info(" Matching sentences against vocab ...")
    matches = match_sentences(all_sentences, nikkud_index, confusable_keys)
    logger.info(f" {len(matches)} words matched")

    # Break down by match method
    method_counts: dict[str, int] = {}
    for sent_list in matches.values():
        for s in sent_list:
            method = s["match_method"]
            method_counts[method] = method_counts.get(method, 0) + 1
    for method, count in sorted(method_counts.items()):
        logger.info(f" {method}: {count} sentence-word pairs")

    # Update words dict in place
    updated = update_words_json(words, matches, confusable_keys)
    logger.info(f" Updated {updated} entries in words.json")

    return {
        "books": book_counts,
        "matched": len(matches),
        "total_vocab": len(words),
    }
# ── Standalone entry point ───────────────────────────────────────
if __name__ == "__main__":
    # Standalone use: load words.json, run the pipeline, write the result back.
    import json

    logging.basicConfig(level=logging.INFO, format="%(message)s")

    words_path = DATA_DIR / "words.json"
    with open(words_path, encoding="utf-8") as f:
        words = json.load(f)

    stats = run(words)

    # Save updated words.json
    with open(words_path, "w", encoding="utf-8") as f:
        json.dump(words, f, ensure_ascii=False, indent=2)

    # Guard against division by zero for an empty words.json.
    coverage = stats["matched"] * 100 / stats["total_vocab"] if stats["total_vocab"] else 0
    logger.info(f" Coverage: {stats['matched']}/{stats['total_vocab']} ({coverage:.1f}%)")

View file

@ -2,7 +2,8 @@
"""
Consolidated detail page scraper for pealim.com.
Visits /dict/<slug>/ detail pages for nouns and verbs in data/words.json.
Visits /dict/<slug>/ detail pages for nouns, verbs, adjectives and prepositions
in data/words.json.
Makes two requests per slug:
1. hebstyle=mo cookie → nikkud forms
2. hebstyle=vl cookie → ktiv male forms
@ -11,7 +12,8 @@ Updates entries in data/words.json with scraped detail data.
Usage:
python3 pealim_detail_scrape.py [--test N] [--force-refresh-detail]
[--nouns-only | --verbs-only]
[--nouns-only | --verbs-only |
--adjectives-only | --prepositions-only]
"""
import argparse
@ -144,28 +146,128 @@ FORM_KEY_TO_PERSON: dict[str, str] = {
"infinitive": "inf",
}
# Mishkal English name → Hebrew nikkud mapping (common patterns)
MISHKAL_HEBREW: dict[str, str] = {
"CaCaC": "קָטָל",
"CeCeC": "קֶטֶל",
"CiCeC": "קִטֶל",
"CaCeC": "קָטֶל",
"CoCeC": "קוֹטֵל",
"CaCiC": "קָטִיד",
"CaCuC": "קָטוּר",
"miCCaC": "מִקְטָל",
"miCCeC": "מִקְטֶל",
"maCCeC": "מַקְטֶל",
"maCCiC": "מַקְטִיר",
"hiCCiC": "הִקְטִיל",
"CiCCuC": "קִטּוּל",
"hitCaCCeC": "הִתְקַטֵּל",
"CaCCan": "קַטְּלָן",
"CaCCaC": "קַטָּל",
"CiCCon": "קִטְּרוֹן",
"CaCCeC": "קַטֶּלֶת",
# Mishkal English name → Hebrew nikkud mapping
# Pealim dropdown uses q-notation (qatal), detail pages return k-notation (katal).
# We store q-notation keys; _mishkal_to_hebrew() handles the k→q conversion.
# Source: https://www.pealim.com/he/dict/?pos={adjective,noun}&am=qatil dropdowns
_MISHKAL_HEBREW_Q: dict[str, str] = {
# --- a ---
"aqtal": "אַקְטָל",
"aqtala": "אַקְטָלָה",
# --- e ---
"eqtal": "אֶקְטָל",
# --- h ---
"haqtala": "הַקְטָלָה",
"heqtel": "הֶקְטֵל",
"hiqqatlut": "הִקָּטְלוּת",
"hitqattlut": "הִתְקַטְּלוּת",
# --- m ---
"maqtal": "מַקְטָל",
"maqtel": "מַקְטֵל",
"maqtela": "מַקְטֵלָה",
"maqtelet": "מַקְטֶלֶת",
"maqtil": "מַקְטִיל",
"maqtol": "מַקְטוֹל",
"maqtolet": "מַקְטֹלֶת",
"maqtul": "מַקְטוּל",
"meqattel": "מְקַטֵּל",
"meqila": "מְקִילָה",
"mequla": "מְקוּלָה",
"mequttal": "מְקֻטָּל",
"miqtal": "מִקְטָל",
"miqtala": "מִקְטָלָה",
"miqtelet": "מִקְטֶלֶת",
"miqtol": "מִקְטוֹל",
"miqtolet": "מִקְטֹלֶת",
"mitqattel": "מִתְקַטֵּל",
"muqtal": "מֻקְטָל",
# --- n ---
"niqtal": "נִקְטָל",
# --- q ---
"qal": "קַל",
"qatal": "קָטָל",
"qatel": "קָטֵל",
"qatil": "קָטִיל",
"qatla": "קַטְלָה",
"qatlan": "קַטְלָן",
"qatlut": "קַטְלוּת",
"qatol": "קָטוֹל",
"qaton": "קָטוֹן",
"qattal": "קַטָּל",
"qattala": "קַטָּלָה",
"qattelet": "קַטֶּלֶת",
"qattil": "קַטִּיל",
"qattila": "קַטִּילָה",
"qattolet": "קַטֹּלֶת",
"qattul": "קַטּוּל",
"qatul": "קָטוּל",
"qatut": "קָטוּת",
"qetel": "קֶטֶל",
"qeteh": "קֵטֶה",
"qitla": "קִטְלָה",
"qitlon": "קִטְלוֹן",
"qittalon": "קִטָּלוֹן",
"qittel": "קִטֵּל",
"qittelet": "קִטֶּלֶת",
"qittol": "קִטּוֹל",
"qittolet": "קִטֹּלֶת",
"qittul": "קִטּוּל",
"qol": "קֹל",
"qotal": "קוֹטָל",
"qotel": "קוֹטֵל",
"qotelet": "קוֹטֶלֶת",
"qotla": "קָטְלָה",
"qtal": "קְטָל",
"qtala": "קְטָלָה",
"qtaltal": "קְטַלְטַל",
"qtaltan": "קְטַלְתָּן",
"qtaltolet": "קְטַלְטֹלֶת",
"qtel": "קְטֵל",
"qtela": "קְטֵלָה",
"qtelet": "קְטֶלֶת",
"qtil": "קְטִיל",
"qtila": "קְטִילָה",
"qtili": "קְטִילִי",
"qtol": "קְטוֹל",
"qtola": "קְטוֹלָה",
"qtolet": "קְטֹלֶת",
"qtul": "קְטוּל",
"qtula": "קְטוּלָה",
"qtulla": "קְטֻלָּה",
"qtut": "קְטוּת",
"qutla": "קֻטְלָה",
"quttolet": "קֻטּוֹלֶת",
# --- t ---
"taqtela": "תַּקְטֵלָה",
"taqtil": "תַּקְטִיל",
"taqtit": "תַּקְטִית",
"taqtul": "תַּקְטוּל",
"taqtula": "תַּקְטוּלָה",
"taqtut": "תַּקְטוּת",
"tiqtal": "תִּקְטָל",
"tiqtala": "תִּקְטָלָה",
"tiqtelet": "תִּקְטֶלֶת",
"tiqtolet": "תִּקְטֹלֶת",
"tqilla": "תְּקִלָּה",
"tqula": "תְּקוּלָה",
# --- y ---
"yaqtul": "יַקְטוּל",
}
def _mishkal_to_hebrew(mishkal: str) -> str | None:
    """Return the Hebrew mishkal name for an English one, or None if unknown.

    The name is looked up as given first (q-notation); when that finds
    nothing, every "k" is replaced by "q" and the lookup is retried.
    """
    if not mishkal:
        return None
    # `or` evaluates the k→q retry only when the as-is lookup found nothing.
    return _MISHKAL_HEBREW_Q.get(mishkal) or _MISHKAL_HEBREW_Q.get(mishkal.replace("k", "q"))
# ---------------------------------------------------------------------------
# HTTP session
# ---------------------------------------------------------------------------
@ -452,7 +554,7 @@ def _scrape_noun_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
if mishkal:
result["mishkal"] = mishkal
result["mishkal_hebrew"] = MISHKAL_HEBREW.get(mishkal)
result["mishkal_hebrew"] = _mishkal_to_hebrew(mishkal)
return result
@ -887,6 +989,228 @@ def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: di
return result
# ---------------------------------------------------------------------------
# Adjective detail parsing
# ---------------------------------------------------------------------------
_ADJECTIVE_CELL_IDS: tuple[str, ...] = ("ms-a", "fs-a", "mp-a", "fp-a")
_ADJECTIVE_FORM_KEYS: tuple[str, ...] = ("ms", "fs", "mp", "fp")
def _parse_adjective_table(soup: BeautifulSoup) -> dict[str, dict]:
    """
    Read the adjective inflection table of a pealim detail page (mo/nikkud).

    Each cell is found by its element id (ms-a, fs-a, mp-a, fp-a) inside the
    first ``conjugation-table`` table; its nikkud text and audio URL are
    taken from ``_get_menukad_and_audio``.

    Returns:
        Dict mapping form key ("ms", "fs", "mp", "fp") to
        {"nikkud": str, "audio_url": str}.  Cells that are missing or have no
        nikkud text are left out; the dict is empty if the table is not found.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    forms: dict[str, dict] = {}
    for cell_id, form_key in zip(_ADJECTIVE_CELL_IDS, _ADJECTIVE_FORM_KEYS, strict=True):
        cell = table.find(id=cell_id)
        if cell:
            nikkud, audio_url = _get_menukad_and_audio(cell)
            if nikkud:
                forms[form_key] = {"nikkud": nikkud, "audio_url": audio_url}
    return forms
def _parse_adjective_table_vl(soup: BeautifulSoup) -> dict[str, str]:
    """
    Read the adjective inflection table of a vl (ktiv male) page.

    Returns:
        Dict mapping form key ("ms", "fs", "mp", "fp") to ktiv male string.
        Cells that are missing or yield no text are left out; the dict is
        empty if the table is not found.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    spellings: dict[str, str] = {}
    for cell_id, form_key in zip(_ADJECTIVE_CELL_IDS, _ADJECTIVE_FORM_KEYS, strict=True):
        cell = table.find(id=cell_id)
        if cell and (ktiv := _get_plain_text(cell)):
            spellings[form_key] = ktiv
    return spellings
def _parse_adjective_mishkal(soup: BeautifulSoup) -> tuple[str, str]:
    """
    Extract the mishkal of an adjective from its detail page.

    Delegates to ``_parse_noun_gender_mishkal`` and discards the gender part
    of its result.

    Returns:
        Tuple of (mishkal_english, mishkal_hebrew).  The Hebrew name is ""
        when the lookup finds none.
    """
    _gender, mishkal = _parse_noun_gender_mishkal(soup)
    return mishkal, _mishkal_to_hebrew(mishkal) or ""
def _scrape_adjective_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
    """
    Build the adjective_inflection record from the two detail pages.

    Args:
        _slug: Unused.
        mo_html: HTML of the mo (nikkud) page.
        vl_html: HTML of the vl (ktiv male) page.

    Returns:
        Dict matching the adjective_inflection schema:
        {ms, fs, mp, fp: {nikkud, ktiv_male}, mishkal, mishkal_hebrew}.
        A form absent from the nikkud page is stored as None; a form with no
        ktiv male spelling gets ktiv_male "" and a warning is logged.
        Empty dict if no forms found.
    """
    mo_soup = BeautifulSoup(mo_html, "lxml")
    vl_soup = BeautifulSoup(vl_html, "lxml")

    nikkud_forms = _parse_adjective_table(mo_soup)
    ktiv_forms = _parse_adjective_table_vl(vl_soup)
    mishkal, mishkal_hebrew = _parse_adjective_mishkal(mo_soup)

    if not nikkud_forms:
        return {}

    inflection: dict = {}
    for form_key in _ADJECTIVE_FORM_KEYS:
        parsed = nikkud_forms.get(form_key)
        if not parsed:
            inflection[form_key] = None
            continue
        nikkud = parsed["nikkud"]
        ktiv = ktiv_forms.get(form_key, "")
        if not ktiv:
            logger.warning("No ktiv_male for adjective form %s: %s", form_key, nikkud)
        inflection[form_key] = {"nikkud": nikkud, "ktiv_male": ktiv}

    # Empty strings are normalised to None for storage.
    inflection["mishkal"] = mishkal or None
    inflection["mishkal_hebrew"] = mishkal_hebrew or None
    return inflection
# ---------------------------------------------------------------------------
# Preposition detail parsing
# ---------------------------------------------------------------------------
_PREPOSITION_CELL_IDS: tuple[str, ...] = (
"P-1s",
"P-1p",
"P-2ms",
"P-2fs",
"P-2mp",
"P-2fp",
"P-3ms",
"P-3fs",
"P-3mp",
"P-3fp",
)
_PREPOSITION_FORM_KEYS: tuple[str, ...] = (
"1s",
"1p",
"2ms",
"2fs",
"2mp",
"2fp",
"3ms",
"3fs",
"3mp",
"3fp",
)
def _parse_preposition_table(soup: BeautifulSoup) -> dict[str, dict]:
    """
    Read the preposition pronominal suffix table of a pealim detail page (mo/nikkud).

    Each cell is found by its element id (P-1s, P-1p, P-2ms, …, P-3fp) inside
    the first ``conjugation-table`` table; its nikkud text and audio URL are
    taken from ``_get_menukad_and_audio``.

    Returns:
        Dict mapping person key ("1s", "1p", …, "3fp") to
        {"nikkud": str, "audio_url": str}.  Cells that are missing or have no
        nikkud text are left out; the dict is empty if the table is not found.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    forms: dict[str, dict] = {}
    for cell_id, person in zip(_PREPOSITION_CELL_IDS, _PREPOSITION_FORM_KEYS, strict=True):
        cell = table.find(id=cell_id)
        if cell:
            nikkud, audio_url = _get_menukad_and_audio(cell)
            if nikkud:
                forms[person] = {"nikkud": nikkud, "audio_url": audio_url}
    return forms
def _parse_preposition_table_vl(soup: BeautifulSoup) -> dict[str, str]:
    """
    Read the preposition pronominal suffix table of a vl (ktiv male) page.

    Returns:
        Dict mapping person key ("1s", "1p", …, "3fp") to ktiv male string.
        Cells that are missing or yield no text are left out; the dict is
        empty if the table is not found.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    spellings: dict[str, str] = {}
    for cell_id, person in zip(_PREPOSITION_CELL_IDS, _PREPOSITION_FORM_KEYS, strict=True):
        cell = table.find(id=cell_id)
        if cell and (ktiv := _get_plain_text(cell)):
            spellings[person] = ktiv
    return spellings
def _scrape_preposition_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
    """
    Build the preposition_inflection record from the two detail pages.

    Args:
        _slug: Unused.
        mo_html: HTML of the mo (nikkud) page.
        vl_html: HTML of the vl (ktiv male) page.

    Returns:
        Dict matching the preposition_inflection schema:
        {1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp: {nikkud, ktiv_male}}.
        A person absent from the nikkud page is stored as None; a form with
        no ktiv male spelling gets ktiv_male "" and a warning is logged.
        Empty dict if no forms found.
    """
    mo_soup = BeautifulSoup(mo_html, "lxml")
    vl_soup = BeautifulSoup(vl_html, "lxml")

    nikkud_forms = _parse_preposition_table(mo_soup)
    ktiv_forms = _parse_preposition_table_vl(vl_soup)

    if not nikkud_forms:
        return {}

    inflection: dict = {}
    for person in _PREPOSITION_FORM_KEYS:
        parsed = nikkud_forms.get(person)
        if not parsed:
            inflection[person] = None
            continue
        nikkud = parsed["nikkud"]
        ktiv = ktiv_forms.get(person, "")
        if not ktiv:
            logger.warning("No ktiv_male for preposition form %s: %s", person, nikkud)
        inflection[person] = {"nikkud": nikkud, "ktiv_male": ktiv}
    return inflection
# ---------------------------------------------------------------------------
# Merging strategy
# ---------------------------------------------------------------------------
@ -926,6 +1250,22 @@ def _merge_conjugation(_existing_conj: dict | None, scraped: dict) -> dict:
return scraped
def _merge_adjective_inflection(_existing_ai: dict | None, scraped: dict) -> dict:
"""
Merge scraped adjective data into existing adjective_inflection.
No GUIDs to preserve simple overwrite with scraped data.
"""
return dict(scraped)
def _merge_preposition_inflection(_existing_pi: dict | None, scraped: dict) -> dict:
"""
Merge scraped preposition data into existing preposition_inflection.
No GUIDs to preserve simple overwrite with scraped data.
"""
return dict(scraped)
# ---------------------------------------------------------------------------
# I/O helpers
# ---------------------------------------------------------------------------
@ -953,14 +1293,26 @@ def _save_words(data: dict) -> None:
# ---------------------------------------------------------------------------
def _should_process(entry: dict, pos: str, force: bool, nouns_only: bool, verbs_only: bool) -> bool:
def _should_process(
entry: dict,
pos: str,
force: bool,
nouns_only: bool,
verbs_only: bool,
adjectives_only: bool,
prepositions_only: bool,
) -> bool:
"""Return True if this entry should be scraped."""
if not pos.startswith(("Noun", "Verb")):
if not pos.startswith(("Noun", "Verb", "Adjective", "Preposition")):
return False
if nouns_only and not pos.startswith("Noun"):
return False
if verbs_only and not pos.startswith("Verb"):
return False
if adjectives_only and not pos.startswith("Adjective"):
return False
if prepositions_only and not pos.startswith("Preposition"):
return False
return force or not entry.get("detail_scraped")
@ -969,6 +1321,8 @@ def run(
force_refresh: bool = False,
nouns_only: bool = False,
verbs_only: bool = False,
adjectives_only: bool = False,
prepositions_only: bool = False,
) -> None:
"""
Main scrape loop.
@ -978,13 +1332,24 @@ def run(
force_refresh: Re-scrape entries where detail_scraped=True.
nouns_only: Only scrape noun entries.
verbs_only: Only scrape verb entries.
adjectives_only: Only scrape adjective entries.
prepositions_only: Only scrape preposition entries.
"""
words = _load_words()
candidates = [
(unique_key, entry)
for unique_key, entry in words.items()
if _should_process(entry, entry.get("pos", ""), force_refresh, nouns_only, verbs_only) and entry.get("slug")
if _should_process(
entry,
entry.get("pos", ""),
force_refresh,
nouns_only,
verbs_only,
adjectives_only,
prepositions_only,
)
and entry.get("slug")
]
total = len(candidates)
@ -992,7 +1357,10 @@ def run(
candidates = candidates[:test]
logger.info("Test mode: processing %d of %d eligible entries", len(candidates), total)
else:
logger.info("Processing %d eligible entries (nouns+verbs) from words.json", total)
logger.info(
"Processing %d eligible entries (nouns+verbs+adjectives+prepositions) from words.json",
total,
)
processed = 0
errors = 0
@ -1003,7 +1371,14 @@ def run(
word_nikkud = entry.get("word", {}).get("nikkud", unique_key)
url = f"{PEALIM_BASE}/dict/{slug}/"
label = "Noun" if pos.startswith("Noun") else "Verb"
if pos.startswith("Noun"):
label = "Noun"
elif pos.startswith("Verb"):
label = "Verb"
elif pos.startswith("Adjective"):
label = "Adjective"
else:
label = "Preposition"
logger.info("[%d/%d] %s: %s (%s)", idx, len(candidates), label, word_nikkud, slug)
# Fetch mo (nikkud) page
@ -1042,7 +1417,7 @@ def run(
errors += 1
continue
else: # Verb
elif pos.startswith("Verb"):
existing_conj = entry.get("conjugation")
scraped = _scrape_verb_detail(slug, mo_html, vl_html, existing_conj)
if scraped:
@ -1059,6 +1434,41 @@ def run(
errors += 1
continue
elif pos.startswith("Adjective"):
scraped = _scrape_adjective_detail(slug, mo_html, vl_html)
if scraped:
existing_ai = entry.get("adjective_inflection")
merged = _merge_adjective_inflection(existing_ai, scraped)
words[unique_key]["adjective_inflection"] = merged
ms = merged.get("ms", {}) or {}
fs = merged.get("fs", {}) or {}
logger.info(
" ms=%s fs=%s mishkal=%s",
ms.get("nikkud", ""),
fs.get("nikkud", ""),
merged.get("mishkal", ""),
)
else:
logger.warning(" No adjective data scraped for %s", slug)
errors += 1
continue
else: # Preposition
scraped = _scrape_preposition_detail(slug, mo_html, vl_html)
if scraped:
existing_pi = entry.get("preposition_inflection")
merged = _merge_preposition_inflection(existing_pi, scraped)
words[unique_key]["preposition_inflection"] = merged
form_1s = merged.get("1s", {}) or {}
logger.info(
" 1s=%s",
form_1s.get("nikkud", ""),
)
else:
logger.warning(" No preposition data scraped for %s", slug)
errors += 1
continue
except Exception as exc: # noqa: BLE001
logger.error(" Parse error for %s (%s): %s", slug, word_nikkud, exc, exc_info=True)
errors += 1
@ -1089,7 +1499,7 @@ def run(
def _build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description="Scrape pealim.com detail pages for nouns and verbs in data/words.json."
description=("Scrape pealim.com detail pages for nouns, verbs, adjectives and prepositions in data/words.json.")
)
parser.add_argument(
"--test",
@ -1117,6 +1527,18 @@ def _build_parser() -> argparse.ArgumentParser:
default=False,
help="Only scrape Verb entries.",
)
group.add_argument(
"--adjectives-only",
action="store_true",
default=False,
help="Only scrape Adjective entries.",
)
group.add_argument(
"--prepositions-only",
action="store_true",
default=False,
help="Only scrape Preposition entries.",
)
return parser
@ -1133,4 +1555,6 @@ if __name__ == "__main__":
force_refresh=args.force_refresh_detail,
nouns_only=args.nouns_only,
verbs_only=args.verbs_only,
adjectives_only=args.adjectives_only,
prepositions_only=args.prepositions_only,
)

View file

@ -1,183 +0,0 @@
#!/usr/bin/env python3
"""
Rebuild vocab_sentence_matches.json using both direct word matching
and ktiv male conjugated/declined form matching.
This dramatically improves sentence coverage by matching not just
dictionary forms but all conjugated verbs and declined nouns.
"""
import json
import logging
import re
from pathlib import Path
import pandas as pd
from helpers import strip_nikkud as _strip_nikkud
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)
DATA_DIR = Path(__file__).parent / "data"
def _length_score(sentence: dict) -> int:
    """Rank a matched sentence by token count; lower is better (0 = 6-12 tokens)."""
    wc = sentence["word_count"]
    if 6 <= wc <= 12:
        return 0  # ideal
    return abs(wc - 9)  # distance from ideal


def main() -> None:
    """Rebuild ``vocab_sentence_matches.json`` under ``DATA_DIR``.

    Reads ``epub_sentence_index.json``, ``hebrew_dict_for_anki.csv`` and, when
    present, ``ktiv_male_forms.json``. Every sentence of 4-15 tokens is matched
    token-by-token against both the nikkud-stripped dictionary forms and the
    ktiv male forms; up to three sentences per vocabulary word are kept,
    preferring sentences of 6-12 tokens. Coverage statistics are logged.
    """
    # Load sentences (explicit UTF-8: the files contain Hebrew text and must
    # not depend on the platform's default encoding).
    with open(DATA_DIR / "epub_sentence_index.json", encoding="utf-8") as f:
        sentences = json.load(f).get("sentences", [])
    logger.info(f"Loaded {len(sentences)} sentences")

    # Load vocab CSV: try ';' first, fall back to the default ',' separator
    # when the result has too few columns or fails to parse.
    csv_path = DATA_DIR / "hebrew_dict_for_anki.csv"
    try:
        df = pd.read_csv(csv_path, sep=";", index_col=0)
        if df.shape[1] < 3:
            raise ValueError
    except (ValueError, pd.errors.ParserError):
        df = pd.read_csv(csv_path, index_col=0)
    logger.info(f"Loaded {len(df)} vocab entries")

    # Build word lookup: stripped_form → [(word_nikkud, word_no_nikkud)]
    word_lookup: dict[str, list[tuple[str, str]]] = {}
    for _, row in df.iterrows():
        word = str(row.get("Word", "")).strip()
        wni = str(row.get("Word Without Nikkud", "")).strip()
        if not word or word in ("nan", "None"):
            continue
        stripped = _strip_nikkud(word)
        if not stripped:
            continue
        # A missing cell stringifies to "nan"; fall back to the stripped form
        # so such rows do not all collapse onto one bogus output key.
        if not wni or wni in ("nan", "None"):
            wni = stripped
        word_lookup.setdefault(stripped, []).append((word, wni))

    # Load ktiv male forms: ktiv_male_form → [{word_nikkud, form_type, ...}]
    ktiv_path = DATA_DIR / "ktiv_male_forms.json"
    ktiv_forms: dict[str, list[dict]] = {}
    if ktiv_path.exists():
        with open(ktiv_path, encoding="utf-8") as f:
            ktiv_forms = json.load(f)
        logger.info(f"Loaded {len(ktiv_forms)} ktiv male forms")
    else:
        logger.warning("No ktiv_male_forms.json — only using direct matching")

    # Build reverse lookup: surface form → set of dictionary words (nikkud)
    ktiv_to_word: dict[str, set[str]] = {}
    for ktiv, entries in ktiv_forms.items():
        for entry in entries:
            word_nikkud = entry.get("word_nikkud", "")
            if word_nikkud:
                ktiv_to_word.setdefault(ktiv, set()).add(word_nikkud)
    # Also add all vocab words' own stripped forms
    for stripped, entries in word_lookup.items():
        for word_nikkud, _ in entries:
            ktiv_to_word.setdefault(stripped, set()).add(word_nikkud)
    logger.info(f"Total matchable forms: {len(ktiv_to_word)}")

    # Tokenize each sentence once and collect matches:
    # word_nikkud → [sentence_info]
    punct_re = re.compile(r'[.,!?;:"\'\u05be]')  # compiled once, not per token
    matches: dict[str, list[dict]] = {}
    for sent in sentences:
        if "stripped" in sent:
            stripped = sent["stripped"]
        else:
            stripped = _strip_nikkud(sent.get("text", ""))
        tokens = [tok for tok in (punct_re.sub("", t) for t in stripped.split()) if tok]
        word_len = len(tokens)
        # Skip sentences that are too short or too long
        if not 4 <= word_len <= 15:
            continue
        text = sent.get("text", "")
        book = sent.get("book", "")
        for tok in tokens:
            for word_nikkud in ktiv_to_word.get(tok, ()):
                matches.setdefault(word_nikkud, []).append(
                    {
                        "text": text,
                        "book": book,
                        "matched_form": tok,
                        "word_count": word_len,
                    }
                )
    logger.info(f"Words with at least 1 match: {len(matches)}")

    # Deduplicate and limit to 3 best sentences per word
    output: dict[str, dict] = {}
    # The saved sentences drop "matched_form", so remember the forms here for
    # the match-type breakdown logged at the end.
    matched_forms_by_key: dict[str, set[str]] = {}
    for word_nikkud, sents in matches.items():
        seen_texts: set[str] = set()
        unique = []
        for s in sents:
            if s["text"] not in seen_texts:
                seen_texts.add(s["text"])
                unique.append(s)
        unique.sort(key=_length_score)
        best = unique[:3]

        # Find the Word Without Nikkud for this word (default: stripped form)
        stripped = _strip_nikkud(word_nikkud)
        wni = next(
            (w_wni for wn, w_wni in word_lookup.get(stripped, ()) if wn == word_nikkud),
            stripped,
        )
        output[wni] = {
            "word_nikkud": word_nikkud,
            "sentences": [{"text": s["text"], "book": s["book"]} for s in best],
        }
        matched_forms_by_key[wni] = {s["matched_form"] for s in best}

    # Save
    out_path = DATA_DIR / "vocab_sentence_matches.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=1)
    total_sents = sum(len(v["sentences"]) for v in output.values())
    logger.info(f"Saved {len(output)} words with {total_sents} sentences → {out_path}")

    # Stats (guard against an empty CSV so the summary cannot crash the run
    # after the output file has already been written)
    total_vocab = len(df)
    pct = len(output) * 100 / total_vocab if total_vocab else 0.0
    logger.info(f"Coverage: {len(output)}/{total_vocab} ({pct:.1f}%)")

    # Breakdown by match type: "direct" means a kept sentence contained the
    # nikkud-stripped dictionary form itself; "ktiv" means it matched through
    # some other surface form.
    direct_only = 0
    ktiv_only = 0
    both = 0
    for wni, info in output.items():
        stripped = _strip_nikkud(info["word_nikkud"])
        forms = matched_forms_by_key[wni]
        has_direct = stripped in forms
        has_ktiv = bool(forms - {stripped})
        if has_direct and has_ktiv:
            both += 1
        elif has_ktiv:
            ktiv_only += 1
        else:
            direct_only += 1
    logger.info(f" Direct matches only: {direct_only}")
    logger.info(f" Ktiv male matches only: {ktiv_only}")
    logger.info(f" Both: {both}")


if __name__ == "__main__":
    main()

80
run.py
View file

@ -11,7 +11,7 @@ Pipeline steps:
1. List scrape scrape pealim.com list pages words.json (captures slugs)
2. Detail scrape scrape noun/verb detail pages using slugs words.json
3. Frequency load/download word frequency data
4. Examples fetch Ben Yehuda example sentences
4. Examples extract example sentences from Hebrew EPUBs
5. Audio download download audio mp3 files
6. Fonts download Heebo font files
7. Images fetch noun images from Wikipedia
@ -21,9 +21,8 @@ Options:
--skip-scrape Skip list page scraping (use existing words.json)
--skip-detail Skip detail page scraping
--skip-audio Skip audio .mp3 downloads
--skip-examples Skip Ben Yehuda example fetching
--skip-examples Skip EPUB example extraction
--skip-images Skip image fetching for concrete nouns
--refresh-examples Force rebuild of Ben Yehuda index
--test N Limit to first N words/pages
"""
@ -60,9 +59,8 @@ def parse_args():
p.add_argument("--skip-scrape", action="store_true", help="Skip list page scraping")
p.add_argument("--skip-detail", action="store_true", help="Skip detail page scraping")
p.add_argument("--skip-audio", action="store_true", help="Skip audio downloads")
p.add_argument("--skip-examples", action="store_true", help="Skip Ben Yehuda example lookup")
p.add_argument("--skip-examples", action="store_true", help="Skip EPUB example extraction")
p.add_argument("--skip-images", action="store_true", help="Skip image fetching")
p.add_argument("--refresh-examples", action="store_true", help="Force rebuild of Ben Yehuda index")
p.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
return p.parse_args()
@ -93,22 +91,15 @@ def step_frequency() -> dict[str, int]:
return frequency_lookup._freq
def step_examples(args, _freq_cache: dict):
"""Step 4 — load/build Ben Yehuda example index."""
def step_examples(args) -> dict:
"""Step 4 — extract example sentences from Hebrew EPUBs."""
if args.skip_examples:
logger.info("[4] Skipping examples (--skip-examples)")
examples_path = DATA_DIR / "examples_cache.json"
if examples_path.exists():
with open(examples_path) as f:
return json.load(f)
return {}
logger.info("[4] Loading Ben Yehuda example index")
import benyehuda
logger.info("[4] Extracting EPUB example sentences …")
import epub_examples
benyehuda.load(force_rebuild=args.refresh_examples)
# Read word list from words.json instead of CSV
if not WORDS_JSON.exists():
logger.warning("[4] words.json not found, skipping examples")
return {}
@ -116,41 +107,14 @@ def step_examples(args, _freq_cache: dict):
with open(WORDS_JSON, encoding="utf-8") as f:
words = json.load(f)
entries = list(words.values())
if args.test:
entries = entries[: args.test]
stats = epub_examples.run(words)
# Build confusable consonant set from words.json
consonant_counts: dict[str, int] = {}
for entry in entries:
ktiv_male = entry.get("word", {}).get("ktiv_male", "")
if ktiv_male:
safe = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male)
if safe:
consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
# Save updated words.json
with open(WORDS_JSON, "w", encoding="utf-8") as f:
json.dump(words, f, ensure_ascii=False, indent=2)
# Delete stale cache entries for confusable words so they get re-fetched
stale_deleted = 0
for entry in entries:
word_nikkud = entry.get("word", {}).get("nikkud", "")
ktiv_male = entry.get("word", {}).get("ktiv_male", "")
if word_nikkud and ktiv_male:
safe = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male)
if safe in confusable_consonants and word_nikkud in benyehuda._examples_cache:
del benyehuda._examples_cache[word_nikkud]
stale_deleted += 1
if stale_deleted:
logger.info(f" Deleted {stale_deleted} stale confusable cache entries")
logger.info(f" Pre-fetching examples for {len(entries)} words …")
for entry in entries:
word_nikkud = entry.get("word", {}).get("nikkud", "")
if word_nikkud:
benyehuda.get_examples(word_nikkud, confusable_consonants=confusable_consonants)
benyehuda.save_examples_cache()
return benyehuda._examples_cache
logger.info(f" Coverage: {stats['matched']}/{stats['total_vocab']}")
return stats
def step_detail_scrape(args):
@ -250,7 +214,7 @@ def step_build_all(args):
apkg_builder.build_all_variants(words, limit=args.test)
def print_summary(_args: argparse.Namespace, examples_cache: dict, freq_cache: dict):
def print_summary(_args: argparse.Namespace, example_stats: dict, freq_cache: dict):
logger.info("")
logger.info("=" * 60)
logger.info("SUMMARY")
@ -267,10 +231,12 @@ def print_summary(_args: argparse.Namespace, examples_cache: dict, freq_cache: d
logger.info(f" Nouns: {nouns}, Verbs: {verbs}, Detail-scraped: {detail_scraped}")
logger.info(f" Frequency entries: {len(freq_cache)}")
logger.info(f" Example cache entries: {len(examples_cache)}")
covered = sum(1 for v in examples_cache.values() if v)
if examples_cache:
logger.info(f" Example coverage: {covered}/{len(examples_cache)} ({100 * covered // len(examples_cache)}%)")
matched = example_stats.get("matched", 0)
total = example_stats.get("total_vocab", 0)
if total:
logger.info(f" Example coverage: {matched}/{total} ({100 * matched // total}%)")
for book, count in example_stats.get("books", {}).items():
logger.info(f" {book}: {count} sentences")
if AUDIO_DIR.exists():
mp3s = list(AUDIO_DIR.glob("*.mp3"))
@ -321,8 +287,6 @@ def main():
logger.info(f" MODE: --only {args.only}")
if args.test:
logger.info(f" TEST MODE: {args.test} words")
if args.refresh_examples:
logger.info(" REFRESH EXAMPLES: Ben Yehuda index will be rebuilt")
logger.info("=" * 60)
def _load_words_for_only() -> dict:
@ -385,13 +349,13 @@ def main():
step_list_scrape(args) # 1 — scrape list pages → words.json (captures slugs)
step_detail_scrape(args) # 2 — scrape detail pages using slugs → words.json
freq_cache = step_frequency() # 3 — word frequency data
examples_cache = step_examples(args, _freq_cache=freq_cache) # 4 — Ben Yehuda examples
example_stats = step_examples(args) # 4 — EPUB example sentences
step_audio_download(args) # 5 — download audio mp3s
step_fonts(args) # 6 — download Heebo fonts
step_images(args) # 7 — fetch noun images
step_build_all(args) # 8 — build all .apkg variants
print_summary(args, examples_cache, freq_cache)
print_summary(args, example_stats, freq_cache)
if __name__ == "__main__":

View file

@ -32,7 +32,7 @@ DATA_FILE = Path(__file__).parent.parent / "data" / "words.json"
HEBREW_CONSONANT_RANGE = (0x05D0, 0x05EA) # aleftav
VALID_PERSON_CODES: frozenset[str] = frozenset(
["1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp", "3p", "ms", "fs", "mp", "fp"]
["inf", "1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp", "3p", "ms", "fs", "mp", "fp"]
)
EMOJI_RE = re.compile(
@ -561,6 +561,7 @@ def test_conjugation_form_guids(data: dict[str, Any]) -> None:
"""
name = "conjugation_form_guids"
errors: list[str] = []
warnings: list[str] = []
for key, entry in data.items():
conj = entry.get("conjugation")
@ -580,7 +581,8 @@ def test_conjugation_form_guids(data: dict[str, Any]) -> None:
guid_candidates = form.get("guid_candidates")
if not guid and not guid_candidates:
errors.append(f"[{key}] {label}: missing or null 'guid' and no 'guid_candidates'")
# New forms from rescrape use deterministic fallback — warn, don't fail
warnings.append(f"[{key}] {label}: missing or null 'guid' and no 'guid_candidates'")
continue
if guid:
@ -597,6 +599,8 @@ def test_conjugation_form_guids(data: dict[str, Any]) -> None:
else:
seen_guids[candidate] = label
if warnings:
_warn(name + "_missing", [f"{len(warnings)} forms missing guid (deterministic fallback used)"])
if errors:
_fail(name, errors[:20] if not _verbose else errors)
if len(errors) > 20 and not _verbose:

486
tests/test_detail_scrape.py Normal file
View file

@ -0,0 +1,486 @@
"""Tests for adjective and preposition detail page parsing in pealim_detail_scrape.py."""
import sys
from pathlib import Path
import pytest
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from pealim_detail_scrape import (
_parse_adjective_table,
_parse_adjective_table_vl,
_parse_preposition_table,
_parse_preposition_table_vl,
_scrape_adjective_detail,
_scrape_preposition_detail,
)
# ---------------------------------------------------------------------------
# Fixtures — real HTML snippets from pealim.com
# ---------------------------------------------------------------------------
ADJECTIVE_MO_TABLE = """
<table class="table table-condensed conjugation-table">
<thead>
<tr>
<th class="column-header" colspan="2">Singular</th>
<th class="column-header" colspan="2">Plural</th>
</tr>
<tr>
<th class="column-header">Masculine</th>
<th class="column-header">Feminine</th>
<th class="column-header">Masculine</th>
<th class="column-header">Feminine</th>
</tr>
</thead>
<tbody>
<tr>
<td class="conj-td">
<div id="ms-a">
<div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/dn/dngfpnovmytc.mp3">&#128266;</span>
<span class="menukad">אֲבִיבִי</span>
</div></div>
<div class="meaning">spring-like, vernal</div>
</div>
</td>
<td class="conj-td">
<div id="fs-a">
<div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/1j/1j6srg3do7n5k.mp3">&#128266;</span>
<span class="menukad">אֲבִיבִית</span>
</div></div>
<div class="meaning">spring-like, vernal</div>
</div>
</td>
<td class="conj-td">
<div id="mp-a">
<div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/tj/tjrhw0b5dkhc.mp3">&#128266;</span>
<span class="menukad">אֲבִיבִיִּים</span>
</div></div>
<div class="meaning">spring-like, vernal</div>
</div>
</td>
<td class="conj-td">
<div id="fp-a">
<div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/h3/h3u1ml5a4xcf.mp3">&#128266;</span>
<span class="menukad">אֲבִיבִיּוֹת</span>
</div></div>
<div class="meaning">spring-like, vernal</div>
</div>
</td>
</tr>
</tbody>
</table>
"""
# VL version: menukad spans contain unvowelled text (hebstyle=vl)
ADJECTIVE_VL_TABLE = """
<table class="table table-condensed conjugation-table">
<tbody>
<tr>
<td class="conj-td">
<div id="ms-a"><div><div>
<span class="menukad">אביבי</span>
</div></div></div>
</td>
<td class="conj-td">
<div id="fs-a"><div><div>
<span class="menukad">אביבית</span>
</div></div></div>
</td>
<td class="conj-td">
<div id="mp-a"><div><div>
<span class="menukad">אביביים</span>
</div></div></div>
</td>
<td class="conj-td">
<div id="fp-a"><div><div>
<span class="menukad">אביביות</span>
</div></div></div>
</td>
</tr>
</tbody>
</table>
"""
PREPOSITION_MO_TABLE = """
<table class="table table-condensed conjugation-table">
<thead>
<tr>
<th rowspan="2">Person</th>
<th class="column-header" colspan="2">Singular</th>
<th class="column-header" colspan="2">Plural</th>
</tr>
<tr>
<th class="column-header">Masculine</th>
<th class="column-header">Feminine</th>
<th class="column-header">Masculine</th>
<th class="column-header">Feminine</th>
</tr>
</thead>
<tbody>
<tr>
<th>1st</th>
<td class="conj-td" colspan="2">
<div id="P-1s"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/fk/fkp5faeteecr.mp3">&#128266;</span>
<span class="menukad">שֶׁלִּי</span>
</div></div><div class="meaning"><strong>of mine</strong></div></div>
</td>
<td class="conj-td" colspan="2">
<div id="P-1p"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/13/13uvi0dz6tgcc.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּנוּ</span>
</div></div><div class="meaning"><strong>of ours</strong></div></div>
</td>
</tr>
<tr>
<th>2nd</th>
<td class="conj-td">
<div id="P-2ms"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/shbxafq8ietx.mp3">&#128266;</span>
<span class="menukad">שֶׁלְּךָ</span>
</div></div><div class="meaning"><strong>of yours</strong> <em>m. sg.</em></div></div>
</td>
<td class="conj-td">
<div id="P-2fs"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/sh9ue3a8buo3.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּךְ</span>
</div></div><div class="meaning"><strong>of yours</strong> <em>f. sg.</em></div></div>
</td>
<td class="conj-td">
<div id="P-2mp"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/ol/olx8vzsctlzn.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּכֶם</span>
</div></div><div class="meaning"><strong>of yours</strong> <em>m. pl.</em></div></div>
</td>
<td class="conj-td">
<div id="P-2fp"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/ol/olxrms6dl8eq.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּכֶן</span>
</div></div><div class="meaning"><strong>of yours</strong> <em>f. pl.</em></div></div>
</td>
</tr>
<tr>
<th>3rd</th>
<td class="conj-td">
<div id="P-3ms"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/fk/fkp5qigelthg.mp3">&#128266;</span>
<span class="menukad">שֶׁלּוֹ</span>
</div></div><div class="meaning"><strong>of his</strong></div></div>
</td>
<td class="conj-td">
<div id="P-3fs"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/sh9w36hojm5w.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּהּ</span>
</div></div><div class="meaning"><strong>of hers</strong></div></div>
</td>
<td class="conj-td">
<div id="P-3mp"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/n9/n99z0jr8pint.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּהֶם</span>
</div></div><div class="meaning"><strong>of theirs</strong> <em>m.</em></div></div>
</td>
<td class="conj-td">
<div id="P-3fp"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/n9/n9ahrc59h52w.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּהֶן</span>
</div></div><div class="meaning"><strong>of theirs</strong> <em>f.</em></div></div>
</td>
</tr>
</tbody>
</table>
"""
PREPOSITION_VL_TABLE = """
<table class="table table-condensed conjugation-table">
<tbody>
<tr>
<th>1st</th>
<td colspan="2"><div id="P-1s"><div><div>
<span class="menukad">שלי</span>
</div></div></div></td>
<td colspan="2"><div id="P-1p"><div><div>
<span class="menukad">שלנו</span>
</div></div></div></td>
</tr>
<tr>
<th>2nd</th>
<td><div id="P-2ms"><div><div>
<span class="menukad">שלך</span>
</div></div></div></td>
<td><div id="P-2fs"><div><div>
<span class="menukad">שלך</span>
</div></div></div></td>
<td><div id="P-2mp"><div><div>
<span class="menukad">שלכם</span>
</div></div></div></td>
<td><div id="P-2fp"><div><div>
<span class="menukad">שלכן</span>
</div></div></div></td>
</tr>
<tr>
<th>3rd</th>
<td><div id="P-3ms"><div><div>
<span class="menukad">שלו</span>
</div></div></div></td>
<td><div id="P-3fs"><div><div>
<span class="menukad">שלה</span>
</div></div></div></td>
<td><div id="P-3mp"><div><div>
<span class="menukad">שלהם</span>
</div></div></div></td>
<td><div id="P-3fp"><div><div>
<span class="menukad">שלהן</span>
</div></div></div></td>
</tr>
</tbody>
</table>
"""
# Minimal full-page wrappers so _scrape_*_detail() can parse them
_ADJECTIVE_MO_PAGE = f"<html><body>{ADJECTIVE_MO_TABLE}</body></html>"
_ADJECTIVE_VL_PAGE = f"<html><body>{ADJECTIVE_VL_TABLE}</body></html>"
_PREPOSITION_MO_PAGE = f"<html><body>{PREPOSITION_MO_TABLE}</body></html>"
_PREPOSITION_VL_PAGE = f"<html><body>{PREPOSITION_VL_TABLE}</body></html>"
# ---------------------------------------------------------------------------
# Adjective table tests
# ---------------------------------------------------------------------------
class TestParseAdjectiveTable:
    """Tests for _parse_adjective_table (mo/nikkud page)."""

    @pytest.fixture()
    def result(self) -> dict:
        """Parse the vowelled adjective fixture table once per test."""
        from bs4 import BeautifulSoup

        return _parse_adjective_table(BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml"))

    def test_returns_four_form_keys(self, result: dict) -> None:
        assert set(result.keys()) == {"ms", "fs", "mp", "fp"}

    def test_ms_nikkud(self, result: dict) -> None:
        assert result["ms"]["nikkud"] == "אֲבִיבִי"

    def test_fs_nikkud(self, result: dict) -> None:
        assert result["fs"]["nikkud"] == "אֲבִיבִית"

    def test_mp_nikkud(self, result: dict) -> None:
        assert result["mp"]["nikkud"] == "אֲבִיבִיִּים"

    def test_fp_nikkud(self, result: dict) -> None:
        assert result["fp"]["nikkud"] == "אֲבִיבִיּוֹת"

    def test_audio_url_present(self, result: dict) -> None:
        assert result["ms"]["audio_url"].startswith("https://audio.pealim.com/")

    def test_empty_on_missing_table(self) -> None:
        # A page without any conjugation table must yield an empty mapping.
        from bs4 import BeautifulSoup

        parsed = _parse_adjective_table(BeautifulSoup("<html><body></body></html>", "lxml"))
        assert parsed == {}
class TestParseAdjectiveTableVl:
    """Tests for _parse_adjective_table_vl (ktiv male page)."""

    @pytest.fixture()
    def result(self) -> dict:
        """Parse the unvowelled adjective fixture table once per test."""
        from bs4 import BeautifulSoup

        return _parse_adjective_table_vl(BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml"))

    def test_returns_four_form_keys(self, result: dict) -> None:
        assert set(result.keys()) == {"ms", "fs", "mp", "fp"}

    def test_ms_ktiv(self, result: dict) -> None:
        assert result["ms"] == "אביבי"

    def test_fs_ktiv(self, result: dict) -> None:
        assert result["fs"] == "אביבית"

    def test_mp_ktiv(self, result: dict) -> None:
        assert result["mp"] == "אביביים"

    def test_fp_ktiv(self, result: dict) -> None:
        assert result["fp"] == "אביביות"
# ---------------------------------------------------------------------------
# _scrape_adjective_detail tests
# ---------------------------------------------------------------------------
class TestScrapeAdjectiveDetail:
    """Schema checks for the merged output of _scrape_adjective_detail."""

    @pytest.fixture()
    def result(self) -> dict:
        """Scrape result built from the minimal mo/vl adjective fixture pages."""
        scraped = _scrape_adjective_detail("9098-avivi", _ADJECTIVE_MO_PAGE, _ADJECTIVE_VL_PAGE)
        return scraped

    def test_returns_non_empty_dict(self, result: dict) -> None:
        assert result

    def test_ms_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["ms"]
        assert form["nikkud"] == "אֲבִיבִי"
        assert form["ktiv_male"] == "אביבי"

    def test_fs_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["fs"]
        assert form["nikkud"] == "אֲבִיבִית"
        assert form["ktiv_male"] == "אביבית"

    def test_mp_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["mp"]
        assert form["nikkud"] == "אֲבִיבִיִּים"
        assert form["ktiv_male"] == "אביביים"

    def test_fp_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["fp"]
        assert form["nikkud"] == "אֲבִיבִיּוֹת"
        assert form["ktiv_male"] == "אביביות"

    def test_mishkal_key_present(self, result: dict) -> None:
        # Only the key is checked: the minimal fixture page has no PoS
        # section, so the value itself may be None.
        assert "mishkal" in result

    def test_mishkal_hebrew_key_present(self, result: dict) -> None:
        assert "mishkal_hebrew" in result

    def test_all_schema_keys_present(self, result: dict) -> None:
        required = {"ms", "fs", "mp", "fp", "mishkal", "mishkal_hebrew"}
        missing = required - set(result)
        assert not missing

    def test_empty_on_no_table(self) -> None:
        blank_page = "<html><body></body></html>"
        scraped = _scrape_adjective_detail("missing", blank_page, blank_page)
        assert scraped == {}
# ---------------------------------------------------------------------------
# Preposition table tests
# ---------------------------------------------------------------------------
class TestParsePrepositionTable:
    """Tests for _parse_preposition_table (mo/nikkud page)."""

    @pytest.fixture()
    def result(self) -> dict:
        """Parse the vowelled preposition fixture table once per test."""
        from bs4 import BeautifulSoup

        return _parse_preposition_table(BeautifulSoup(PREPOSITION_MO_TABLE, "lxml"))

    def test_returns_ten_form_keys(self, result: dict) -> None:
        expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
        assert set(result.keys()) == expected

    def test_1s_nikkud(self, result: dict) -> None:
        assert result["1s"]["nikkud"] == "שֶׁלִּי"

    def test_1p_nikkud(self, result: dict) -> None:
        assert result["1p"]["nikkud"] == "שֶׁלָּנוּ"

    def test_2ms_nikkud(self, result: dict) -> None:
        assert result["2ms"]["nikkud"] == "שֶׁלְּךָ"

    def test_2fs_nikkud(self, result: dict) -> None:
        assert result["2fs"]["nikkud"] == "שֶׁלָּךְ"

    def test_2mp_nikkud(self, result: dict) -> None:
        assert result["2mp"]["nikkud"] == "שֶׁלָּכֶם"

    def test_2fp_nikkud(self, result: dict) -> None:
        assert result["2fp"]["nikkud"] == "שֶׁלָּכֶן"

    def test_3ms_nikkud(self, result: dict) -> None:
        assert result["3ms"]["nikkud"] == "שֶׁלּוֹ"

    def test_3fs_nikkud(self, result: dict) -> None:
        assert result["3fs"]["nikkud"] == "שֶׁלָּהּ"

    def test_3mp_nikkud(self, result: dict) -> None:
        assert result["3mp"]["nikkud"] == "שֶׁלָּהֶם"

    def test_3fp_nikkud(self, result: dict) -> None:
        assert result["3fp"]["nikkud"] == "שֶׁלָּהֶן"

    def test_audio_url_present(self, result: dict) -> None:
        assert result["1s"]["audio_url"].startswith("https://audio.pealim.com/")

    def test_empty_on_missing_table(self) -> None:
        # A page without any conjugation table must yield an empty mapping.
        from bs4 import BeautifulSoup

        parsed = _parse_preposition_table(BeautifulSoup("<html><body></body></html>", "lxml"))
        assert parsed == {}
class TestParsePrepositionTableVl:
    """Tests for _parse_preposition_table_vl (ktiv male page)."""

    @pytest.fixture()
    def result(self) -> dict:
        """Parse the unvowelled preposition fixture table once per test."""
        from bs4 import BeautifulSoup

        return _parse_preposition_table_vl(BeautifulSoup(PREPOSITION_VL_TABLE, "lxml"))

    def test_returns_ten_form_keys(self, result: dict) -> None:
        expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
        assert set(result.keys()) == expected

    def test_1s_ktiv(self, result: dict) -> None:
        assert result["1s"] == "שלי"

    def test_1p_ktiv(self, result: dict) -> None:
        assert result["1p"] == "שלנו"

    def test_2ms_ktiv(self, result: dict) -> None:
        assert result["2ms"] == "שלך"

    def test_3ms_ktiv(self, result: dict) -> None:
        assert result["3ms"] == "שלו"

    def test_3fp_ktiv(self, result: dict) -> None:
        assert result["3fp"] == "שלהן"
# ---------------------------------------------------------------------------
# _scrape_preposition_detail tests
# ---------------------------------------------------------------------------
class TestScrapePrepositionDetail:
    """Schema checks for the merged output of _scrape_preposition_detail."""

    @pytest.fixture()
    def result(self) -> dict:
        """Scrape result built from the minimal mo/vl preposition fixture pages."""
        scraped = _scrape_preposition_detail("2643-shel", _PREPOSITION_MO_PAGE, _PREPOSITION_VL_PAGE)
        return scraped

    def test_returns_non_empty_dict(self, result: dict) -> None:
        assert result

    def test_all_ten_person_keys_present(self, result: dict) -> None:
        required = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
        missing = required - set(result)
        assert not missing

    def test_1s_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["1s"]
        assert form["nikkud"] == "שֶׁלִּי"
        assert form["ktiv_male"] == "שלי"

    def test_1p_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["1p"]
        assert form["nikkud"] == "שֶׁלָּנוּ"
        assert form["ktiv_male"] == "שלנו"

    def test_2ms_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["2ms"]
        assert form["nikkud"] == "שֶׁלְּךָ"
        assert form["ktiv_male"] == "שלך"

    def test_3ms_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["3ms"]
        assert form["nikkud"] == "שֶׁלּוֹ"
        assert form["ktiv_male"] == "שלו"

    def test_3fs_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["3fs"]
        assert form["nikkud"] == "שֶׁלָּהּ"
        assert form["ktiv_male"] == "שלה"

    def test_3fp_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["3fp"]
        assert form["nikkud"] == "שֶׁלָּהֶן"
        assert form["ktiv_male"] == "שלהן"

    def test_empty_on_no_table(self) -> None:
        blank_page = "<html><body></body></html>"
        scraped = _scrape_preposition_detail("missing", blank_page, blank_page)
        assert scraped == {}

View file

@ -42,3 +42,17 @@ def test_strip_nikkud_all_marks():
nikkud = "הַמַּלְכָּה"
plain = strip_nikkud(nikkud)
assert all(ch < "\u0591" or ch > "\u05c7" for ch in plain), f"Residual nikkud in: {plain}"
def test_categorize_pos_no_substring_match():
    """Regression guard: a PoS label that merely contains 'Noun' (e.g. 'Pronoun') is not a Noun."""
    from apkg_builder import _categorize_pos

    # (PoS label, expected category, optional failure note), checked in order.
    cases = [
        ("Noun", "Noun", None),
        ("Verb", "Verb", None),
        ("Adjective", "Adjective", None),
        ("Adverb", "Adverb", None),
        ("Pronoun", "Other", "Pronoun must not match Noun"),
        ("Preposition", "Other", None),
        ("Conjunction", "Other", None),
        ("Cardinal numeral", "Other", None),
    ]
    for pos_label, expected, note in cases:
        category = _categorize_pos(pos_label)
        if note is None:
            assert category == expected
        else:
            assert category == expected, note