diff --git a/SCHEMA.yaml b/SCHEMA.yaml
index 80a4973..132eb38 100644
--- a/SCHEMA.yaml
+++ b/SCHEMA.yaml
@@ -138,11 +138,53 @@ entry:
# ktiv_male: "שומר"
# --- Adjective-specific ---
- adjective_inflection: null # Reserved for future use
+ adjective_inflection: null # null for non-adjectives
# When populated:
- # ms/fs/mp/fp forms with nikkud/ktiv_male subfields
+ # ms:
+ # nikkud: "גָּדוֹל"
+ # ktiv_male: "גדול"
+ # fs:
+ # nikkud: "גְּדוֹלָה"
+ # ktiv_male: "גדולה"
+ # mp:
+ # nikkud: "גְּדוֹלִים"
+ # ktiv_male: "גדולים"
+ # fp:
+ # nikkud: "גְּדוֹלוֹת"
+ # ktiv_male: "גדולות"
+ # mishkal: "CaCaC" # English mishkal name (scraped from pealim PoS section)
+ # mishkal_hebrew: "קָטָל" # Hebrew mishkal name (computed via mapping)
# --- Preposition-specific ---
- preposition_inflection: null # Reserved for future use
+ preposition_inflection: null # null for non-prepositions
# When populated:
- # Inflected forms with pronominal suffixes (e.g. שלי, שלך, שלו...)
+ # 1s:
+ # nikkud: "שֶׁלִּי"
+ # ktiv_male: "שלי"
+ # 1p:
+ # nikkud: "שֶׁלָּנוּ"
+ # ktiv_male: "שלנו"
+ # 2ms:
+ # nikkud: "שֶׁלְּךָ"
+ # ktiv_male: "שלך"
+ # 2fs:
+ # nikkud: "שֶׁלָּךְ"
+ # ktiv_male: "שלך"
+ # 2mp:
+ # nikkud: "שֶׁלָּכֶם"
+ # ktiv_male: "שלכם"
+ # 2fp:
+ # nikkud: "שֶׁלָּכֶן"
+ # ktiv_male: "שלכן"
+ # 3ms:
+ # nikkud: "שֶׁלּוֹ"
+ # ktiv_male: "שלו"
+ # 3fs:
+ # nikkud: "שֶׁלָּהּ"
+ # ktiv_male: "שלה"
+ # 3mp:
+ # nikkud: "שֶׁלָּהֶם"
+ # ktiv_male: "שלהם"
+ # 3fp:
+ # nikkud: "שֶׁלָּהֶן"
+ # ktiv_male: "שלהן"
diff --git a/apkg_builder.py b/apkg_builder.py
index 74dd182..d038b9e 100644
--- a/apkg_builder.py
+++ b/apkg_builder.py
@@ -35,7 +35,7 @@ COMPLETE_PLURAL_DECK_ID = 1_234_567_903
# Release version tag added to all notes so users can identify which release
# their cards come from (visible in Anki's Browse view and card info).
-RELEASE_TAG = "v0.15.1"
+RELEASE_TAG = "v0.16"
# Regex for extracting emoji and Hebrew prepositions from meaning strings
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
@@ -117,13 +117,15 @@ CARD_CSS = """
.card {
font-family: 'Heebo', 'Arial Hebrew', 'David', Arial, sans-serif;
font-size: 20px;
- text-align: center;
+ text-align: right;
color: #222;
background: #fff;
padding: 16px;
+ max-width: 600px;
+ margin: 0 auto;
}
.hebrew {
- font-size: 36px;
+ font-size: 42px;
font-weight: bold;
direction: rtl;
text-align: center;
@@ -131,32 +133,34 @@ CARD_CSS = """
color: #222;
}
.hebrew-sm {
- font-size: 24px;
+ font-size: 30px;
font-weight: normal;
direction: rtl;
text-align: center;
- color: #333;
+ color: #222;
}
.meaning {
- font-size: 28px;
+ font-size: 34px;
color: #1a1a8c;
margin: 8px 0;
+ text-align: center;
}
.hint {
- font-size: 16px;
- color: #888;
+ font-size: 22px;
+ color: #555;
margin: 4px 0;
direction: rtl;
+ text-align: center;
}
.root-info {
- font-size: 18px;
- color: #555;
+ font-size: 26px;
+ color: #222;
margin-top: 6px;
direction: rtl;
}
.example {
- font-size: 18px;
- color: #444;
+ font-size: 24px;
+ color: #222;
direction: rtl;
text-align: right;
font-style: italic;
@@ -182,16 +186,17 @@ CARD_CSS = """
color: #555;
}
.sec-label {
- font-size: 20px;
+ font-size: 28px;
font-weight: normal;
- color: #555;
+ color: #222;
direction: rtl;
text-align: center;
margin-top: 6px;
}
.sec-key {
- font-size: 18px;
- color: #888;
+ font-size: 28px;
+ color: #222;
+ font-weight: bold;
}
.definitions {
direction: rtl;
@@ -199,32 +204,37 @@ CARD_CSS = """
}
.conf-entry {
margin: 8px 0;
- font-size: 20px;
+ font-size: 28px;
direction: rtl;
}
.related-group {
direction: rtl;
- text-align: right;
+ text-align: center;
margin: 2px 0;
- font-size: 18px;
+ font-size: 26px;
}
.emoji-img {
font-size: 3.5em;
text-align: center;
margin: 0.3em 0;
}
+.card [type="button"], .card button, .replay-button {
+ display: block !important;
+ margin: 4px auto !important;
+ text-align: center;
+}
@media (prefers-color-scheme: dark) {
.card { color: #e8e8e8; background: #1c1c1e; }
.hebrew { color: #f0f0f0; }
- .hebrew-sm { color: #ddd; }
+ .hebrew-sm { color: #e0e0e0; }
.meaning { color: #82b0ff; }
- .root-info { color: #aaa; }
- .sec-label { color: #aaa; }
- .sec-key { color: #666; }
+ .root-info { color: #e0e0e0; }
+ .sec-label { color: #e0e0e0; }
+ .sec-key { color: #e0e0e0; }
.conf-entry { color: #ddd; }
.hint { color: #777; }
.voice-label { color: #888; }
- .example { color: #bbb; border-right-color: #555; }
+ .example { color: #e0e0e0; border-right-color: #555; }
.divider { border-top-color: #333; }
.freq-badge { color: #888; border-color: #444; }
}
@@ -252,9 +262,6 @@ VOCAB_BACK_HEB = """
{{SharedRoots}}
{{/SharedRoots}}
{{#Plural}}רַבִּים: {{Plural}}
{{/Plural}}
-{{#Example}}
-{{Example}}
-{{/Example}}
{{#Frequency}}#{{Frequency}}
{{/Frequency}}
"""
@@ -273,14 +280,15 @@ VOCAB_BACK_ENG = """
{{#WordNoNikkud}}לְלֹא נִיקּוּד: {{WordNoNikkud}}
{{/WordNoNikkud}}
{{#Root}}שֹׁרֶשׁ: {{Root}}
{{/Root}}
{{#PoS}}חֵלֶק דִּיבּוּר: {{PoS}}
{{/PoS}}
+{{#SharedRoots}}
+מִילִים קְשׁוּרוֹת:
+{{SharedRoots}}
+{{/SharedRoots}}
{{#Plural}}רַבִּים: {{Plural}}
{{/Plural}}
-{{#Example}}
-{{Example}}
-{{/Example}}
"""
VOCAB_FRONT_CLOZE = """
-{{ClozeExample}}
+{{ClozeExample}}
{{#ClozeHint}}{{ClozeHint}}
{{/ClozeHint}}
"""
@@ -289,7 +297,6 @@ VOCAB_BACK_CLOZE = """
{{Word}}
{{#Audio}}{{Audio}}
{{/Audio}}
-{{Meaning}}
"""
VOCAB_MODEL = genanki.Model(
@@ -343,8 +350,8 @@ VOCAB_MODEL = genanki.Model(
CONJ_FRONT = """
אֵיךְ אוֹמְרִים
-{{ReferenceForm}}{{#Prep}} ({{Prep}}){{/Prep}}{{#Voice}} ({{Voice}}){{/Voice}}
{{Pronoun}}
+{{Infinitive}}{{#Prep}} ({{Prep}}){{/Prep}}{{#Voice}} ({{Voice}}){{/Voice}}
{{Tense}}
"""
@@ -363,7 +370,7 @@ CONJ_CSS = CARD_CSS
CONJ_MODEL = genanki.Model(
CONJ_MODEL_ID,
- "Pealim Conjugation",
+ "Hebrew Conjugation",
fields=[
{"name": "Infinitive"},
{"name": "ReferenceForm"},
@@ -666,8 +673,9 @@ def _load_emoji_lookup() -> dict[str, str]:
def _categorize_pos(pos_str: str) -> str:
"""Return the canonical PoS category key for grouping."""
+ base = pos_str.split("–")[0].split("—")[0].strip()
for cat in POS_CATEGORY_LABELS:
- if cat.lower() in pos_str.lower():
+ if base == cat:
return cat
return "Other"
@@ -745,10 +753,14 @@ def build_vocab_deck(
word_nikkud = entry["word"]["nikkud"]
word_no_nik = entry["word"].get("ktiv_male", "")
root_list = entry.get("root") or []
- root = " ".join(root_list)
+ root = ".".join(root_list)
pos_raw = entry.get("pos", "")
pos_heb = entry.get("pos_hebrew", "")
- meaning = entry.get("meaning", "") or ""
+ meaning = EMOJI_RE.sub("", entry.get("meaning", "") or "").strip()
+ meaning = HBPAREN_RE.sub("", meaning).strip()
+ meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ")
+ meaning = re.sub(r"(\w)\(", r"\1 (", meaning) # space before opening paren
+ meaning = re.sub(r",(\S)", r", \1", meaning) # space after comma
meaning_raw = entry.get("meaning_raw", "") or ""
slug = entry.get("slug", "") or ""
frequency = entry.get("frequency") or 999_999
@@ -839,6 +851,9 @@ def build_vocab_deck(
end = cloze_data.get("cloze_word_end")
if cloze_text and start is not None and end is not None:
cloze_example = cloze_text[:start] + "_____" + cloze_text[end:]
+ # Clean up duplicate/misplaced quotation marks
+ cloze_example = re.sub(r'["״]\s*["״]', '"', cloze_example)
+ cloze_example = re.sub(r'^\s*["״]\s*', "", cloze_example)
raw_hint = cloze_data.get("cloze_hint") or ""
if raw_hint:
cloze_hint = raw_hint
@@ -871,11 +886,12 @@ def build_vocab_deck(
parts.append(f'{label}: {" ".join(rw_words)}
')
related_html = "\n".join(parts)
- # Plural form (for nouns)
+ # Plural form (nouns only — guard against adjective/verb inflection bleed)
plural_str = ""
- noun_inflection = entry.get("noun_inflection")
- if noun_inflection and noun_inflection.get("plural"):
- plural_str = noun_inflection["plural"].get("nikkud", "")
+ if pos_raw.startswith("Noun"):
+ noun_inflection = entry.get("noun_inflection")
+ if noun_inflection and noun_inflection.get("plural"):
+ plural_str = noun_inflection["plural"].get("nikkud", "")
# Image
image_tag = ""
@@ -977,18 +993,28 @@ def build_conj_deck(
binyan_heb: str = conj.get("binyan_hebrew") or BINYAN_TO_HEBREW.get(binyan, binyan) or ""
slug = entry.get("slug", "") or ""
root_list = entry.get("root") or []
- root = " ".join(root_list)
+ root = ".".join(root_list)
voice = VOICE_MAP.get(binyan, "")
+ meaning_raw = entry.get("meaning_raw", "") or ""
meaning = entry.get("meaning", "") or ""
- # Extract Hebrew preposition from meaning_raw
+ # Extract Hebrew preposition — strip from meaning, show on Hebrew side
prep_str = ""
conj_prep = conj.get("prep")
if conj_prep:
- prep_str = f"({conj_prep})"
- elif meaning:
- preps = HBPAREN_RE.findall(entry.get("meaning_raw", "") or "")
- prep_str = " ".join(f"({p})" for p in preps)
+ # Strip any parentheses from stored prep value
+ prep_str = conj_prep.strip("() ")
+ elif meaning_raw:
+ preps = HBPAREN_RE.findall(meaning_raw)
+ if preps:
+ prep_str = preps[0]
+ # Strip Hebrew prepositions from English meaning to avoid duplication
+ if prep_str:
+ meaning = HBPAREN_RE.sub("", meaning).strip()
+ # Also strip from meaning_raw patterns like "(על)"
+ meaning = re.sub(r"\(\s*" + re.escape(prep_str) + r"\s*-?\s*\)", "", meaning).strip()
+ # Clean up double spaces and trailing commas
+ meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ")
related = [w for w in root_words.get(root, []) if w != infinitive]
related_str = " ".join(related[:8]) if related else ""
@@ -1024,7 +1050,7 @@ def build_conj_deck(
elif guid_candidates:
note_guid = guid_candidates[0]
else:
- note_guid = genanki.guid_for(_infinitive, pronoun, tense)
+ note_guid = genanki.guid_for(_infinitive, pronoun, tense, _binyan_heb)
note = genanki.Note(
model=CONJ_MODEL,
guid=note_guid,
@@ -1213,8 +1239,10 @@ def build_conj_deck(
# ──────────────────────────────────────────────────────────────────────────────
CONF_FRONT = """
+
"""
CONF_BACK = """
@@ -1271,7 +1299,10 @@ def build_confusables_deck(
guid = genanki.guid_for("confusable", entry["word"].get("ktiv_male", unique_key))
guid_to_entries.setdefault(guid, []).append(entry)
- for guid, group_entries in sorted(guid_to_entries.items(), key=lambda x: x[0]):
+ for guid, group_entries in sorted(
+ guid_to_entries.items(),
+ key=lambda x: sum(e.get("frequency") or 999_999 for e in x[1]) / len(x[1]),
+ ):
if guid in seen_guids:
continue
seen_guids.add(guid)
@@ -1366,6 +1397,7 @@ PLURAL_BACK_SG = """
{{FrontSide}}
{{Plural}}
{{#PluralAudio}}{{PluralAudio}}
{{/PluralAudio}}
+{{#Gender}}מִין: {{Gender}}
{{/Gender}}
{{#Mishkal}}מִשְׁקָל: {{Mishkal}}
{{/Mishkal}}
"""
@@ -1380,6 +1412,7 @@ PLURAL_BACK_PL = """
{{Singular}}
{{#SingularAudio}}{{SingularAudio}}
{{/SingularAudio}}
{{Meaning}}
+{{#Gender}}מִין: {{Gender}}
{{/Gender}}
{{#Mishkal}}מִשְׁקָל: {{Mishkal}}
{{/Mishkal}}
"""
@@ -1483,10 +1516,11 @@ def build_plural_deck(
plural = noun_inflection["plural"]["nikkud"]
plural_ktiv = noun_inflection["plural"].get("ktiv_male", "")
gender = noun_inflection.get("gender") or ""
+ gender_heb = {"masculine": "זָכָר", "feminine": "נְקֵבָה"}.get(gender, gender)
mishkal = noun_inflection.get("mishkal") or ""
- meaning = entry.get("meaning") or ""
+ meaning = EMOJI_RE.sub("", entry.get("meaning") or "").strip()
root_list = entry.get("root") or []
- root = " ".join(root_list)
+ root = ".".join(root_list)
# GUID from noun_inflection
note_guid_raw = noun_inflection.get("plurals_guid")
@@ -1520,7 +1554,7 @@ def build_plural_deck(
meaning,
root,
mishkal,
- gender,
+ gender_heb,
],
tags=tags,
)
diff --git a/benyehuda.py b/benyehuda.py
deleted file mode 100644
index e3e94e8..0000000
--- a/benyehuda.py
+++ /dev/null
@@ -1,202 +0,0 @@
-#!/usr/bin/env python3
-"""
-Ben Yehuda corpus example-sentence lookup (nikkud corpus).
-
-TODO: Rewrite to update words.json examples fields directly instead of
-writing to a separate examples_cache.json. Currently the migration script
-bridges the gap. See Phase 5 in SPRINT_LOG.md.
-
-Downloads the nikkud-bearing plaintext ZIP once, indexes sentences by nikkud word form,
-then answers queries locally.
-
-Exposed API:
- load(force_rebuild=False)
- get_examples(word_nikkud) -> list[str] (returns 0 or 1 examples)
- save_examples_cache()
-"""
-
-import json
-import logging
-import re
-import zipfile
-from io import BytesIO
-from pathlib import Path
-
-import requests
-
-from helpers import strip_nikkud as _strip_nikkud
-
-logger = logging.getLogger(__name__)
-
-# Nikkud-bearing corpus (txt.zip instead of txt_stripped.zip)
-CORPUS_URL = "https://github.com/projectbenyehuda/public_domain_dump/releases/download/2025-10/txt.zip"
-INDEX_PATH = Path(__file__).parent / "data" / "benyehuda_index.json"
-EXAMPLES_CACHE_PATH = Path(__file__).parent / "data" / "examples_cache.json"
-REQUEST_TIMEOUT = 120
-MIN_SENTENCE_LEN = 20
-MAX_SENTENCE_LEN = 200
-MAX_INDEX_ENTRIES = 500 # cap examples kept per word in index to limit memory
-
-# Module-level state
-_index: dict[str, list[str]] = {} # word (with nikkud) -> [sentence, ...]
-_examples_cache: dict[str, list[str]] = {} # word -> cached result for this run
-
-
-def _split_sentences(text: str) -> list[str]:
- """
- Split text into sentences on newlines only (Hebrew sentences don't have
- mid-word period issues like English). Min 20 chars, max 200 chars.
- """
- out = []
- for line in text.split("\n"):
- s = line.strip().strip("\"'.,;:!?")
- s = s.strip()
- if MIN_SENTENCE_LEN <= len(s) <= MAX_SENTENCE_LEN:
- out.append(s)
- return out
-
-
-def _build_index(corpus_zip_bytes: bytes) -> None:
- """Parse corpus ZIP and build word (nikkud) → sentences index."""
- global _index
- _index = {}
- logger.info("Building Ben Yehuda index from nikkud corpus …")
-
- with zipfile.ZipFile(BytesIO(corpus_zip_bytes)) as zf:
- txt_files = [n for n in zf.namelist() if n.endswith(".txt")]
- logger.info(f" Corpus contains {len(txt_files)} text files")
- for fname in txt_files:
- try:
- raw = zf.read(fname).decode("utf-8", errors="ignore")
- except Exception: # noqa: S112
- continue
- for sentence in _split_sentences(raw):
- # Index by each unique Hebrew token (with nikkud) in the sentence
- words = re.findall(r"[\u05d0-\u05ea\u05b0-\u05c7'\"]+", sentence)
- for w in set(words):
- if len(w) >= 2:
- bucket = _index.setdefault(w, [])
- if len(bucket) < MAX_INDEX_ENTRIES:
- bucket.append(sentence)
-
- logger.info(f"Index built: {len(_index)} unique word forms")
-
-
-def _save_index() -> None:
- INDEX_PATH.parent.mkdir(parents=True, exist_ok=True)
- with open(INDEX_PATH, "w", encoding="utf-8") as f:
- json.dump(_index, f, ensure_ascii=False)
- logger.info(f"Ben Yehuda index saved → {INDEX_PATH}")
-
-
-def _load_index() -> None:
- global _index
- with open(INDEX_PATH, encoding="utf-8") as f:
- _index = json.load(f)
- logger.info(f"Ben Yehuda index loaded: {len(_index)} word forms")
-
-
-def load(force_rebuild: bool = False) -> None:
- """Load or build the Ben Yehuda index. Downloads corpus if needed."""
- global _index, _examples_cache
- if _index and not force_rebuild:
- return
-
- if force_rebuild:
- # Delete old index and discard examples cache
- if INDEX_PATH.exists():
- INDEX_PATH.unlink()
- logger.info("Deleted old Ben Yehuda index (force rebuild)")
- _examples_cache = {}
- else:
- # Load persisted examples cache (not needed on rebuild)
- if EXAMPLES_CACHE_PATH.exists():
- with open(EXAMPLES_CACHE_PATH, encoding="utf-8") as f:
- _examples_cache = json.load(f)
-
- if INDEX_PATH.exists():
- _load_index()
- return
-
- logger.info("Downloading Ben Yehuda nikkud corpus … (this may take 2-3 minutes)")
- resp = requests.get(CORPUS_URL, timeout=REQUEST_TIMEOUT, stream=True)
- resp.raise_for_status()
- data = resp.content
- logger.info(f"Corpus downloaded: {len(data) / 1e6:.1f} MB")
-
- _build_index(data)
- _save_index()
-
-
-def save_examples_cache() -> None:
- EXAMPLES_CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
- with open(EXAMPLES_CACHE_PATH, "w", encoding="utf-8") as f:
- json.dump(_examples_cache, f, ensure_ascii=False)
- logger.info(f"Examples cache saved: {len(_examples_cache)} entries → {EXAMPLES_CACHE_PATH}")
-
-
-def get_examples(word_nikkud: str, confusable_consonants: set[str] | None = None) -> list[str]:
- """
- Return 0 or 1 example sentences for the given word (nikkud form).
-
- Lookup strategy:
- 1. Try exact nikkud match in index.
- 2. Fall back to stripped (no-nikkud) match against index keys.
- Skipped when word's consonants are in confusable_consonants set
- (to avoid returning sentences for the wrong homograph).
-
- Returns the single longest sentence ≤ MAX_SENTENCE_LEN that contains
- the word as a whole token.
- """
- if not _index:
- load()
-
- word = word_nikkud.strip()
- word_stripped = _strip_nikkud(word)
-
- cache_key = word
-
- if cache_key in _examples_cache:
- return _examples_cache[cache_key]
-
- # Lookup: try exact nikkud first, then stripped fallback
- candidates = _index.get(word, [])
- if not candidates and word_stripped and word_stripped not in (confusable_consonants or set()):
- # Try looking up by stripped form across index keys
- for k, v in _index.items():
- if _strip_nikkud(k) == word_stripped:
- candidates = v
- break
-
- # Filter: word must appear as a whole token
- # Match the stripped form (for robustness with nikkud variants in sentence)
- if word_stripped:
- pattern = r"(? display name
-EPUB_BOOKS = {
- "little_prince.epub": "הנסיך הקטן",
- "time_tunnel_82.epub": "מנהרת הזמן 82",
-}
+def _discover_epubs() -> dict[str, str]:
+ """Auto-discover all .epub files in EPUB_DIR, returning {filepath: display_name}.
+
+ Returns:
+ Dict keyed by ``str(path)`` for every ``*.epub`` found directly in
+ ``EPUB_DIR`` (visited in sorted path order), mapped to a short label
+ derived from the file name. Empty dict when ``EPUB_DIR`` does not exist.
+ """
+ if not EPUB_DIR.exists():
+ return {}
+ books: dict[str, str] = {}
+ for path in sorted(EPUB_DIR.glob("*.epub")):
+ stem = path.stem
+ stem_stripped = strip_nikkud(stem).lower()
+ # Derive a brief English display name from the filename
+ # Only the part before the first " -- " separator is used for the
+ # Hebrew title checks; the Latin keyword checks look at the whole stem.
+ parts = stem.split(" -- ")
+ title_part = strip_nikkud(parts[0]).strip().lower()
+ if "alice" in stem_stripped or "אליס" in title_part:
+ name = "alice_wonderland"
+ elif "little_prince" in stem_stripped or "נסיך" in title_part:
+ name = "little_prince"
+ elif "מנהרת" in title_part or "time_tunnel" in stem_stripped:
+ # The first run of digits anywhere in the stem is taken as the volume
+ # number; without digits the stem (minus "time_tunnel_") is used as-is.
+ num_match = re.search(r"(\d+)", stem_stripped)
+ num = num_match.group(1) if num_match else stem_stripped.replace("time_tunnel_", "")
+ name = f"time_tunnel_{num}"
+ else:
+ # Unknown book: fall back to the stripped, lower-cased stem, capped at
+ # 40 characters (this label is not necessarily English).
+ name = stem_stripped[:40]
+ books[str(path)] = name
+ return books
-# PDF books are excluded — pypdf produces garbled RTL text (reversed chars within
-# words). If/when a proper EPUB version becomes available on Calibre, add it to
-# EPUB_BOOKS above instead.
-PDF_BOOKS: dict[str, str] = {}
# Sentence length bounds (word count)
MIN_WORDS = 4
@@ -58,7 +75,7 @@ class _TextExtractor(HTMLParser):
_ = attrs # required by HTMLParser interface
if tag in self.SKIP_TAGS:
self._skip_depth += 1
- # Insert space for block-level elements to avoid word concatenation
+ # Insert newline for block-level elements to avoid word concatenation
if tag in (
"p",
"div",
@@ -102,7 +119,6 @@ def extract_text_from_html(html: str) -> str:
def _content_files_from_epub(zf: zipfile.ZipFile) -> list[str]:
"""Get ordered list of content XHTML files from the OPF manifest."""
- # Find the OPF file
opf_path = None
for name in zf.namelist():
if name.endswith(".opf"):
@@ -124,7 +140,7 @@ def _content_files_from_epub(zf: zipfile.ZipFile) -> list[str]:
opf_dir = os.path.dirname(opf_path)
# Extract manifest items: id -> href
- manifest = {}
+ manifest: dict[str, str] = {}
for m in re.finditer(r'- ]*id="([^"]+)"[^>]*href="([^"]+)"', opf_content):
manifest[m.group(1)] = m.group(2)
# Also try reversed attribute order
@@ -157,7 +173,12 @@ def _content_files_from_epub(zf: zipfile.ZipFile) -> list[str]:
def extract_sentences_from_epub(epub_path: Path, book_name: str) -> list[dict]:
"""Extract sentences from an EPUB file.
- Returns list of {"text": str, "book": str, "stripped": str}
+ Args:
+ epub_path: Path to the .epub file.
+ book_name: Human-readable book name used as the ``source`` field.
+
+ Returns:
+ List of ``{"text": str, "source": str}`` dicts.
"""
zf = zipfile.ZipFile(epub_path)
content_files = _content_files_from_epub(zf)
@@ -175,41 +196,6 @@ def extract_sentences_from_epub(epub_path: Path, book_name: str) -> list[dict]:
return _split_into_sentences(full_text, book_name)
-# ── PDF processing ───────────────────────────────────────────────
-
-
-def extract_sentences_from_pdf(pdf_path: Path, book_name: str) -> list[dict]:
- """Extract sentences from a PDF file (best-effort, handles RTL reversal)."""
- try:
- import pypdf
- except ImportError:
- print(f" [SKIP] pypdf not installed, cannot process {pdf_path.name}")
- return []
-
- reader = pypdf.PdfReader(pdf_path)
- all_text_parts = []
-
- for page in reader.pages:
- raw = page.extract_text()
- if not raw:
- continue
- # pypdf often reverses word order for RTL text; fix it
- fixed_lines = []
- for line in raw.split("\n"):
- words = line.split()
- # Check if this line is predominantly Hebrew
- hebrew_chars = sum(1 for c in line if "\u0590" <= c <= "\u05ff")
- if hebrew_chars > len(line) * 0.3 and len(words) > 1:
- # Reverse word order
- fixed_lines.append(" ".join(reversed(words)))
- else:
- fixed_lines.append(line)
- all_text_parts.append("\n".join(fixed_lines))
-
- full_text = "\n".join(all_text_parts)
- return _split_into_sentences(full_text, book_name)
-
-
# ── Sentence splitting ───────────────────────────────────────────
# Hebrew sentence terminators: period, exclamation, question mark, sof pasuk
@@ -217,18 +203,27 @@ _SENT_SPLIT = re.compile(r"[.!?\u05C3]+")
# Punctuation to strip from word boundaries when matching
_PUNCT = re.compile(
- r'^[\u0022\u0027\u05F4\u05F3,;:\-–—…\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+|[\u0022\u0027\u05F4\u05F3,;:\-–—…\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+$'
+ r'^[\u0022\u0027\u05F4\u05F3,;:\-\u2013\u2014\u2026\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+|'
+ r'[\u0022\u0027\u05F4\u05F3,;:\-\u2013\u2014\u2026\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+$'
)
def _split_into_sentences(text: str, book_name: str) -> list[dict]:
- """Split text into sentences and filter by length."""
+ """Split text into Hebrew sentences and filter by word count.
+
+ Args:
+ text: Raw extracted text from an EPUB chapter.
+ book_name: Source label for each sentence dict.
+
+ Returns:
+ List of ``{"text": str, "source": str}`` dicts, deduplicated by exact text.
+ """
# Normalize whitespace
text = re.sub(r"\s+", " ", text).strip()
raw_sentences = _SENT_SPLIT.split(text)
- results = []
- seen = set()
+ results: list[dict] = []
+ seen: set[str] = set()
for sent in raw_sentences:
sent = sent.strip()
@@ -242,205 +237,555 @@ def _split_into_sentences(text: str, book_name: str) -> list[dict]:
if len(hebrew_words) < MIN_WORDS or len(hebrew_words) > MAX_WORDS:
continue
- # Skip duplicates
- stripped = strip_nikkud(sent)
- if stripped in seen:
+ # Deduplicate by exact nikkud text
+ if sent in seen:
continue
- seen.add(stripped)
+ seen.add(sent)
- results.append(
- {
- "text": sent,
- "book": book_name,
- "stripped": stripped,
- }
- )
+ results.append({"text": sent, "source": book_name})
return results
-# ── Vocab loading ────────────────────────────────────────────────
+# ── Nikkud index ─────────────────────────────────────────────────
+
+# Unicode ranges for Hebrew combining marks
+_NIKKUD_LOW = 0x05B0 # start of vowel points (shva)
+_NIKKUD_HIGH = 0x05BD # end of vowel range (meteg); 0x05BE is maqaf (punctuation)
+_DAGESH = "\u05bc"
+_SHIN_DOT = "\u05c1"
+_SIN_DOT = "\u05c2"
+
+# Valid prefix consonants
+_PREFIX_CONSONANTS = set("בהוכלמש")
+
+# Named vowel combining marks
+_SHVA = "\u05b0"
+_HIRIQ = "\u05b4"
+_TSERE = "\u05b5"
+_SEGOL = "\u05b6"
+_PATACH = "\u05b7"
+_QAMATZ = "\u05b8"
+
+# Valid nikkud patterns on each prefix consonant.
+# Key = consonant, Value = set of frozensets of combining marks valid for that prefix.
+_VALID_PREFIX_MARKS: dict[str, set[frozenset]] = {
+ "ב": {
+ frozenset({_SHVA, _DAGESH}), # בְּ standard
+ frozenset({_HIRIQ, _DAGESH}), # בִּ before shva
+ frozenset({_PATACH, _DAGESH}), # בַּ with definite article
+ frozenset({_QAMATZ, _DAGESH}), # בָּ before chataf qamatz
+ frozenset({_SEGOL, _DAGESH}), # בֶּ before chataf segol
+ },
+ "כ": {
+ frozenset({_SHVA, _DAGESH}), # כְּ
+ frozenset({_HIRIQ, _DAGESH}), # כִּ
+ frozenset({_PATACH, _DAGESH}), # כַּ
+ frozenset({_QAMATZ, _DAGESH}), # כָּ
+ frozenset({_SEGOL, _DAGESH}), # כֶּ
+ },
+ "ל": {
+ frozenset({_SHVA}), # לְ standard
+ frozenset({_HIRIQ}), # לִ before shva
+ frozenset({_PATACH}), # לַ with definite article
+ frozenset({_QAMATZ}), # לָ demonstratives
+ frozenset({_SEGOL}), # לֶ before chataf segol
+ },
+ "ו": {
+ frozenset({_SHVA}), # וְ standard
+ frozenset({_DAGESH}), # וּ (shureq) before shva/bumf
+ frozenset({_PATACH}), # וַ before chataf patach
+ frozenset({_QAMATZ}), # וָ before chataf qamatz
+ frozenset({_SEGOL}), # וֶ before chataf segol
+ frozenset({_HIRIQ}), # וִ before yud-shva
+ },
+ "מ": {
+ frozenset({_HIRIQ}), # מִ standard
+ frozenset({_TSERE}), # מֵ before gutturals
+ },
+ "ש": {
+ frozenset({_SEGOL, _DAGESH}), # שֶׁ standard
+ frozenset({_SEGOL, _DAGESH, _SHIN_DOT}), # שֶׁ with explicit shin dot
+ },
+ "ה": {
+ frozenset({_PATACH}), # הַ standard definite article
+ frozenset({_QAMATZ}), # הָ before gutturals
+ frozenset({_SEGOL}), # הֶ before qamatz-bearing gutturals
+ },
+}
-def load_vocab(csv_path: Path) -> dict:
- """Load vocab CSV and return {stripped_form: nikkud_word} mapping.
+def _is_combining_mark(ch: str) -> bool:
+ """Return True if ch is a Hebrew combining mark (nikkud, dagesh, or dots).
+
+ Args:
+ ch: A single character (``ord`` is applied to it directly).
+ """
+ cp = ord(ch)
+ # Range test: everything between _NIKKUD_LOW and _NIKKUD_HIGH inclusive.
+ if _NIKKUD_LOW <= cp <= _NIKKUD_HIGH:
+ return True
+ # The shin/sin dots (U+05C1, U+05C2) lie above that range, so they need an
+ # explicit membership test; dagesh (U+05BC) is listed here as well.
+ return ch in (_DAGESH, _SHIN_DOT, _SIN_DOT)
- Also returns reverse mapping for lookup.
- Returns (word_to_nikkud, nikkud_words_set)
+
+def _decompose_first_char(token: str) -> tuple[str, frozenset, str]:
+ """Split token into (first_consonant, its_combining_marks, remainder).
+
+ Args:
+ token: A nikkud Hebrew token string.
+
+ Returns:
+ A tuple of (consonant, marks, rest). Returns ("", frozenset(), token)
+ if the token does not start with a Hebrew consonant (alef–tav range).
"""
- words_by_stripped: dict[str, list[str]] = {} # stripped -> [nikkud words]
+ if not token:
+ return ("", frozenset(), token)
- with open(csv_path, encoding="utf-8") as f:
- reader = csv.DictReader(f, delimiter=";")
- for row in reader:
- nikkud_word = row.get("Word", "").strip()
- word_no_nik = row.get("Word Without Nikkud", "").strip()
- if not nikkud_word:
- continue
+ first = token[0]
+ # Check it's a Hebrew consonant (alef–tav)
+ if not ("\u05d0" <= first <= "\u05ea"):
+ return ("", frozenset(), token)
- # Method 1: strip nikkud from the Word column
- stripped_from_nikkud = strip_nikkud(nikkud_word)
+ # Collect all combining marks that follow the consonant
+ marks: set[str] = set()
+ i = 1
+ while i < len(token):
+ ch = token[i]
+ if _is_combining_mark(ch):
+ marks.add(ch)
+ i += 1
+ else:
+ break
- # Add both forms for matching
- for form in {stripped_from_nikkud, word_no_nik}:
- if form:
- words_by_stripped.setdefault(form, []).append(nikkud_word)
+ return (first, frozenset(marks), token[i:])
- return words_by_stripped
+
+def _is_valid_prefix(consonant: str, marks: frozenset) -> bool:
+ """Check if consonant + marks form a valid Hebrew prefix combination.
+
+ Args:
+ consonant: The prefix consonant character.
+ marks: Frozenset of combining mark characters on that consonant.
+
+ Returns:
+ True if this is a recognised Hebrew prefix vocalization.
+ """
+ # Consonants with no entry in _VALID_PREFIX_MARKS can never be prefixes.
+ valid = _VALID_PREFIX_MARKS.get(consonant)
+ if not valid:
+ return False
+ # For ש, allow shin dot to be present or absent
+ # (the mark set is accepted if it matches either with the dot removed or as given).
+ if consonant == "ש":
+ marks_without_shin = marks - {_SHIN_DOT}
+ return marks_without_shin in valid or marks in valid
+ # All other consonants require an exact match of the full mark set.
+ return marks in valid
+
+
+def _rebuild_token(consonant: str, marks: frozenset, rest: str) -> str:
+ """Reassemble a token from its decomposed parts, sorting marks by codepoint.
+
+ Args:
+ consonant: The leading consonant.
+ marks: Combining marks to attach to that consonant (unordered).
+ rest: Everything that followed the consonant's marks in the original token.
+
+ Returns:
+ ``consonant`` + marks in ascending codepoint order + ``rest``.
+ """
+ # NOTE(review): the result only matches an index key if the indexed forms
+ # store their marks in the same (codepoint) order — confirm against the data.
+ return consonant + "".join(sorted(marks)) + rest
+
+
+def _try_strip_prefix(token: str, nikkud_index: dict) -> list[tuple[str, str, str]]:
+ """Try stripping 1 or 2 prefix letters from a nikkud token.
+
+ Args:
+ token: A cleaned nikkud word token.
+ nikkud_index: Mapping from nikkud form to list of (unique_key, match_type).
+
+ Returns:
+ List of (unique_key, match_type, matched_remainder) for each hit found.
+ The match_type will have ``"_prefix"`` appended to the base type.
+ Hits are appended in the order the candidates are tried and are not
+ de-duplicated, so the same unique_key can appear more than once.
+ The list is empty when the first letter is not a valid prefix or
+ nothing is left after removing it.
+ """
+ results: list[tuple[str, str, str]] = []
+
+ # Try 1-letter prefix
+ c1, m1, rest1 = _decompose_first_char(token)
+ # Bail out early: without a valid first prefix no 2-letter prefix is tried either.
+ if not (c1 and _is_valid_prefix(c1, m1) and rest1):
+ return results
+
+ # Direct match on 1-prefix remainder
+ if rest1 in nikkud_index:
+ for unique_key, match_type in nikkud_index[rest1]:
+ results.append((unique_key, match_type + "_prefix", rest1))
+
+ # Try removing dagesh from first letter of remainder
+ # (handles absorbed definite article: לַמֶּלֶךְ → מֶּלֶךְ → מֶלֶךְ)
+ c2, m2, rest2_inner = _decompose_first_char(rest1)
+ if c2 and _DAGESH in m2:
+ without_dagesh = _rebuild_token(c2, m2 - {_DAGESH}, rest2_inner)
+ # The inequality check avoids reporting the same remainder twice.
+ if without_dagesh != rest1 and without_dagesh in nikkud_index:
+ for unique_key, match_type in nikkud_index[without_dagesh]:
+ results.append((unique_key, match_type + "_prefix", without_dagesh))
+
+ # Try 2-letter prefix (ו and ש commonly stack with another prefix)
+ if c1 in "וש":
+ # The second letter must itself be a prefix consonant with a valid vocalization.
+ c2b, m2b, rest2b = _decompose_first_char(rest1)
+ if c2b and c2b in _PREFIX_CONSONANTS and _is_valid_prefix(c2b, m2b) and rest2b:
+ if rest2b in nikkud_index:
+ for unique_key, match_type in nikkud_index[rest2b]:
+ results.append((unique_key, match_type + "_prefix", rest2b))
+
+ # Also try dagesh removal on remainder of 2-letter prefix
+ c3, m3, rest3_inner = _decompose_first_char(rest2b)
+ if c3 and _DAGESH in m3:
+ without_dagesh2 = _rebuild_token(c3, m3 - {_DAGESH}, rest3_inner)
+ if without_dagesh2 != rest2b and without_dagesh2 in nikkud_index:
+ for unique_key, match_type in nikkud_index[without_dagesh2]:
+ results.append((unique_key, match_type + "_prefix", without_dagesh2))
+
+ return results
+
+
+def _build_nikkud_index(words: dict) -> dict[str, list[tuple[str, str]]]:
+ """Build a mapping from nikkud form to list of (unique_key, match_type).
+
+ Indexes the following sources per entry:
+
+ - ``word.nikkud`` → "direct"
+ - conjugation active/passive forms → "conjugated"
+ - conjugation infinitive and reference_form → "conjugated"
+ - noun inflection singular/plural/construct/pronominal → "inflected"
+
+ Missing or null sub-objects are treated as empty, and empty/None forms
+ are skipped. Tuples are appended without de-duplication, so a form that
+ occurs in several fields of one entry is listed once per occurrence.
+
+ Args:
+ words: The full words.json dict keyed by unique_key.
+
+ Returns:
+ Dict mapping each nikkud form to a list of (unique_key, match_type) tuples.
+ """
+ index: dict[str, list[tuple[str, str]]] = {}
+
+ def _add(form: str | None, unique_key: str, match_type: str) -> None:
+ # Falsy forms (None or "") are ignored rather than indexed.
+ if form:
+ index.setdefault(form, []).append((unique_key, match_type))
+
+ for unique_key, entry in words.items():
+ # Direct word form
+ word = entry.get("word") or {}
+ _add(word.get("nikkud"), unique_key, "direct")
+
+ # Conjugation forms
+ conj = entry.get("conjugation") or {}
+
+ for form_entry in conj.get("active_forms") or []:
+ form = (form_entry.get("form") or {}).get("nikkud")
+ _add(form, unique_key, "conjugated")
+
+ # Passive forms are stored under the "hufal_pual_forms" key.
+ for form_entry in conj.get("hufal_pual_forms") or []:
+ form = (form_entry.get("form") or {}).get("nikkud")
+ _add(form, unique_key, "conjugated")
+
+ inf = conj.get("infinitive") or {}
+ _add(inf.get("nikkud"), unique_key, "conjugated")
+
+ ref = conj.get("reference_form") or {}
+ _add(ref.get("nikkud"), unique_key, "conjugated")
+
+ # Noun inflection forms
+ noun = entry.get("noun_inflection") or {}
+
+ for field in ("singular", "plural", "construct_singular", "construct_plural"):
+ sub = noun.get(field) or {}
+ _add(sub.get("nikkud"), unique_key, "inflected")
+
+ # Only dict-valued pronominal entries are indexed; other value types are skipped.
+ pronominal = noun.get("pronominal_suffixes") or {}
+ for _person, sub in pronominal.items():
+ if isinstance(sub, dict):
+ _add(sub.get("nikkud"), unique_key, "inflected")
+
+ return index
+
+
+def _filter_collision_forms(nikkud_index: dict) -> dict:
+ """Remove colliding forms for entries that have other unique forms.
+
+ A "colliding form" maps to 2+ unique_keys. For each unique_key that
+ appears in a collision, check whether it also has at least one
+ non-colliding form in the index. If so, remove it from the colliding
+ form's entry list. If a unique_key's *only* indexed forms all collide,
+ keep them (otherwise the entry would get zero matches).
+
+ Args:
+ nikkud_index: Mapping from nikkud form to a list of
+ (unique_key, match_type) tuples; it is read but not modified.
+
+ Returns a new index dict with the same structure. A colliding form whose
+ entries are all removed is left out of the result entirely. The number
+ of removed mappings is reported via ``logger.info``.
+ """
+ # Identify collision forms and build reverse map (key → its forms)
+ collision_forms: set[str] = set()
+ key_to_forms: dict[str, set[str]] = {}
+
+ for form, entries in nikkud_index.items():
+ # Distinct keys only: one key listed twice for a form is not a collision.
+ keys = {uk for uk, _ in entries}
+ if len(keys) >= 2:
+ collision_forms.add(form)
+ for uk, _ in entries:
+ key_to_forms.setdefault(uk, set()).add(form)
+
+ # For each key, check if it has any non-colliding form
+ keys_with_unique_forms: set[str] = set()
+ for uk, forms in key_to_forms.items():
+ if forms - collision_forms:
+ keys_with_unique_forms.add(uk)
+
+ # Build filtered index
+ filtered: dict[str, list[tuple[str, str]]] = {}
+ removed = 0
+ for form, entries in nikkud_index.items():
+ if form in collision_forms:
+ kept = [(uk, mt) for uk, mt in entries if uk not in keys_with_unique_forms]
+ removed += len(entries) - len(kept)
+ # Drop the form altogether when nothing is left for it.
+ if kept:
+ filtered[form] = kept
+ else:
+ # Non-colliding forms are carried over unchanged (same list object).
+ filtered[form] = entries
+
+ logger.info(f" Filtered {removed} collision mappings from entries with unique forms")
+ return filtered
# ── Matching ─────────────────────────────────────────────────────
-def match_sentences(sentences: list[dict], words_by_stripped: dict) -> dict:
- """Match sentences against vocab words.
+def match_sentences(
+ sentences: list[dict],
+ nikkud_index: dict,
+ confusable_keys: set[str],
+) -> dict:
+ """Match sentences to vocab words using the nikkud index.
- Returns {nikkud_word: [sentences]} with best (shortest) first.
+ Every whitespace-separated token of each sentence is cleaned with
+ ``_PUNCT`` and looked up in ``nikkud_index``; only when the cleaned token
+ is not an index key is ``_try_strip_prefix`` consulted. One match dict is
+ appended per (token, unique_key) hit, so the same sentence can occur more
+ than once in a key's list.
+
+ Args:
+ sentences: List of ``{"text": str, "source": str}`` dicts.
+ nikkud_index: Output of ``_build_nikkud_index``.
+ confusable_keys: Set of unique_keys that are in confusable groups.
+ Not read anywhere in this function's body.
+
+ Returns:
+ Dict mapping unique_key → list of match dicts, each containing:
+ ``text``, ``source``, ``match_method``, ``word_count``,
+ ``matched_form``, ``char_offset``, ``char_end``.
"""
- # Build a set of all stripped forms for fast lookup
- all_forms = set(words_by_stripped.keys())
-
- # Hebrew single-letter prefixes: ב, ה, ו, כ, ל, מ, ש, ד (של)
- _HEB_PREFIXES = set("בהוכלמשד")
-
- # For each sentence, extract stripped words
- matches: dict[str, list[tuple[int, str]]] = {} # nikkud_word -> [(word_count, sentence)]
+ matches: dict[str, list[dict]] = {}
for sent_info in sentences:
- sent_text = sent_info["text"]
- sent_stripped = sent_info["stripped"]
- word_count = len(sent_text.split())
+ text = sent_info["text"]
+ source = sent_info["source"]
+ words_in_sent = text.split()
+ word_count = len(words_in_sent)
- # Get stripped words from the sentence
- raw_words = sent_stripped.split()
- # Map: candidate_form -> set of original cleaned words that produced it
- # This lets us verify that prefix stripping is plausible
- candidates: dict[str, str] = {} # form -> original_word
- for w in raw_words:
- cleaned = _PUNCT.sub("", w)
+ # char_pos is a cursor into text: each token is searched for from the end
+ # of the previous one, so repeated tokens resolve to successive occurrences.
+ char_pos = 0
+ for raw_word in words_in_sent:
+ cleaned = _PUNCT.sub("", raw_word)
if not cleaned:
+ # Token vanished after cleaning: advance the cursor past it, record nothing.
+ word_start = text.find(raw_word, char_pos)
+ char_pos = word_start + len(raw_word) if word_start >= 0 else char_pos
continue
- # Direct match (always try)
- candidates[cleaned] = cleaned
- # Prefix stripping: only if remaining stem is >= 2 chars
- # and the prefix char is a known Hebrew prefix letter
- for prefix_len in (1, 2):
- if len(cleaned) > prefix_len + 1:
- prefix = cleaned[:prefix_len]
- stem = cleaned[prefix_len:]
- if all(c in _HEB_PREFIXES for c in prefix) and len(stem) >= 2:
- candidates[stem] = cleaned
- # Check which vocab words appear in this sentence
- matched_forms = set(candidates.keys()) & all_forms
- for form in matched_forms:
- # Skip spurious matches: very short vocab forms (1-2 chars)
- # should only match via direct word match, not prefix stripping
- if len(form) <= 2 and form not in {_PUNCT.sub("", w) for w in raw_words}:
- continue
- for nikkud_word in words_by_stripped[form]:
- matches.setdefault(nikkud_word, []).append((word_count, sent_text))
+ # Locate positions within the sentence
+ word_start_in_sent = text.find(raw_word, char_pos)
+ if word_start_in_sent < 0:
+ word_start_in_sent = char_pos
+ # NOTE(review): if cleaning removed characters from the middle of the
+ # token, cleaned is not a substring of raw_word; the offset then falls
+ # back to the token start and char_end is start + len(cleaned).
+ clean_offset_in_raw = raw_word.find(cleaned)
+ if clean_offset_in_raw < 0:
+ clean_offset_in_raw = 0
+ clean_start = word_start_in_sent + clean_offset_in_raw
+ clean_end = clean_start + len(cleaned)
- # Sort by word count (prefer shorter sentences) and deduplicate
- result = {}
- for nikkud_word, sent_list in matches.items():
- sent_list.sort(key=lambda x: x[0])
- seen = set()
- unique = []
- for _, sent in sent_list:
- if sent not in seen:
- seen.add(sent)
- unique.append(sent)
- if len(unique) >= 5: # Keep top 5 per word
- break
- result[nikkud_word] = unique
+ found: list[tuple[str, str]] = []
- return result
+ # Direct nikkud match
+ if cleaned in nikkud_index:
+ for unique_key, match_type in nikkud_index[cleaned]:
+ found.append((unique_key, match_type))
+
+ # Prefix stripping — only if no direct match exists
+ if cleaned not in nikkud_index:
+ for unique_key, match_type, _remainder in _try_strip_prefix(cleaned, nikkud_index):
+ found.append((unique_key, match_type))
+
+ # matched_form and the offsets always describe the cleaned token as it
+ # appears in the sentence, whichever lookup produced the hit.
+ for unique_key, match_method in found:
+ matches.setdefault(unique_key, []).append(
+ {
+ "text": text,
+ "source": source,
+ "match_method": match_method,
+ "word_count": word_count,
+ "matched_form": cleaned,
+ "char_offset": clean_start,
+ "char_end": clean_end,
+ }
+ )
+
+ char_pos = word_start_in_sent + len(raw_word)
+
+ return matches
-# ── Main ─────────────────────────────────────────────────────────
+# ── Writing results ──────────────────────────────────────────────
-def main():
- print("=" * 60)
- print("EPUB Example Sentence Extraction Pipeline")
- print("=" * 60)
+def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) -> int:
+    """Update words dict entries with matched example sentences.
- # Step 1: Extract sentences from all books
- all_sentences = []
- book_counts = {}
+
+    Selects up to 3 best sentences per word (scoring prefers 6–12 word
+    sentences and non-prefix matches). Also generates a cloze entry for
+    the top match, unless the word is in the confusable set.
- for filename, book_name in EPUB_BOOKS.items():
- path = EPUB_DIR / filename
- if not path.exists():
- print(f"\n[SKIP] {filename} not found")
+
+    For every updated entry ``examples["vetted"]`` is replaced and
+    ``examples["rejected_count"]`` is reset to 0. An existing cloze GUID is
+    reused when the cloze sentence text is unchanged and a GUID is actually
+    stored; otherwise it is derived with ``genanki.guid_for``. Confusable
+    entries have any existing ``cloze`` removed.
+
+    Args:
+        words: The full words.json dict, modified in place.
+        matches: Output of ``match_sentences``.
+        confusable_keys: Set of unique_keys in confusable groups.
+
+    Returns:
+        Count of words.json entries that were updated.
+    """
+    import genanki  # noqa: PLC0415 — import only where needed
+
+    def _score(s: dict) -> int:
+        """Return 0 inside the preferred 6–12 word band, else the distance from 9."""
+        wc = s["word_count"]
+        return 0 if 6 <= wc <= 12 else abs(wc - 9)
+
+    updated = 0
+
+    for unique_key, sent_list in matches.items():
+        if unique_key not in words:
             continue
- print(f"\n[EPUB] Extracting: {book_name} ({filename})")
+
+        entry = words[unique_key]
+
+        # Deduplicate by sentence text (the first match for a sentence wins)
+        seen_texts: set[str] = set()
+        unique: list[dict] = []
+        for s in sent_list:
+            if s["text"] not in seen_texts:
+                seen_texts.add(s["text"])
+                unique.append(s)
+
+        # Prefer direct matches; only fall back to prefix if none exist
+        direct = [s for s in unique if "prefix" not in s["match_method"]]
+        prefix_only = [s for s in unique if "prefix" in s["match_method"]]
+        pool = direct if direct else prefix_only
+
+        # Stable sort: equally scored sentences keep their extraction order.
+        best = sorted(pool, key=_score)[:3]
+
+        # Build vetted list
+        if not entry.get("examples"):
+            entry["examples"] = {}
+        examples: dict = entry["examples"]
+        examples["vetted"] = [
+            {
+                "text": s["text"],
+                "source": s["source"],
+                "match_method": s["match_method"],
+            }
+            for s in best
+        ]
+
+        # Build cloze from best sentence (skip confusables)
+        if unique_key in confusable_keys:
+            examples.pop("cloze", None)
+        elif best:
+            top = best[0]
+            old_cloze = examples.get("cloze") or {}
+            # Preserve the existing cloze_guid if the sentence text is unchanged.
+            cloze_guid = None
+            if old_cloze.get("text") == top["text"]:
+                cloze_guid = old_cloze.get("cloze_guid")
+            # A stored cloze without a GUID must not leave the new cloze with
+            # a null GUID, so fall back to the derived one.
+            if not cloze_guid:
+                cloze_guid = genanki.guid_for("cloze", unique_key)
+
+            examples["cloze"] = {
+                "text": top["text"],
+                "cloze_word_start": top["char_offset"],
+                "cloze_word_end": top["char_end"],
+                "cloze_hint": None,
+                "cloze_guid": cloze_guid,
+            }
+
+        examples["rejected_count"] = 0
+        updated += 1
+
+    return updated
+
+
+# ── Public API ───────────────────────────────────────────────────
+
+
+def run(words: dict) -> dict:
+ """Extract EPUB sentences, match against words, update words dict in place.
+
+ Called from run.py with the already-loaded words.json dict.
+
+ Args:
+ words: The full words.json dict keyed by unique_key. Modified in place.
+
+ Returns:
+ Summary stats dict with keys ``books``, ``matched``, ``total_vocab``:
+ ``books`` maps book name to its sentence count, ``matched`` is the
+ number of unique_keys with at least one match, and ``total_vocab``
+ is ``len(words)``.
+ """
+ logger.info(" Extracting sentences from EPUBs ...")
+ all_sentences: list[dict] = []
+ book_counts: dict[str, int] = {}
+
+ for filepath, book_name in _discover_epubs().items():
+ path = Path(filepath)
sentences = extract_sentences_from_epub(path, book_name)
book_counts[book_name] = len(sentences)
all_sentences.extend(sentences)
- print(f" -> {len(sentences)} sentences")
+ logger.info(f" {book_name}: {len(sentences)} sentences")
- for filename, book_name in PDF_BOOKS.items():
- path = EPUB_DIR / filename
- if not path.exists():
- print(f"\n[SKIP] {filename} not found")
- continue
- print(f"\n[PDF] Extracting: {book_name} ({filename})")
- sentences = extract_sentences_from_pdf(path, book_name)
- book_counts[book_name] = len(sentences)
- all_sentences.extend(sentences)
- print(f" -> {len(sentences)} sentences")
+ # Taken whenever nothing was extracted: no books discovered, or every
+ # discovered book yielded zero sentences. words is left untouched.
+ if not all_sentences:
+ logger.warning(" No EPUB files found — skipping example extraction")
+ return {"books": {}, "matched": 0, "total_vocab": len(words)}
- print(f"\nTotal sentences: {len(all_sentences)}")
+ logger.info(f" Total sentences: {len(all_sentences)}")
- # Step 2: Save sentence index
- index_path = DATA_DIR / "epub_sentence_index.json"
- with open(index_path, "w", encoding="utf-8") as f:
- json.dump({"sentences": all_sentences}, f, ensure_ascii=False, indent=2)
- print(f"\nSaved sentence index: {index_path}")
+ # Build nikkud index
+ logger.info(" Building nikkud index from words.json ...")
+ nikkud_index = _build_nikkud_index(words)
+ logger.info(f" {len(nikkud_index)} unique nikkud forms indexed")
- # Step 3: Load vocab and match
- print(f"\nLoading vocab from {DICT_CSV} ...")
- words_by_stripped = load_vocab(DICT_CSV)
- total_vocab = len({w for wlist in words_by_stripped.values() for w in wlist})
- print(f" {total_vocab} unique vocab words ({len(words_by_stripped)} lookup forms)")
+ # Filter out collision forms for entries that have unique forms
+ nikkud_index = _filter_collision_forms(nikkud_index)
- print("\nMatching sentences against vocab ...")
- examples_cache = match_sentences(all_sentences, words_by_stripped)
+ # Build confusable key set
+ confusable_keys: set[str] = set()
+ for key, entry in words.items():
+ if entry.get("confusable_group"):
+ confusable_keys.add(key)
- # Step 4: Save examples_cache
- cache_path = DATA_DIR / "examples_cache.json"
- with open(cache_path, "w", encoding="utf-8") as f:
- json.dump(examples_cache, f, ensure_ascii=False, indent=2)
- print(f"Saved examples cache: {cache_path}")
+ # Match sentences
+ logger.info(" Matching sentences against vocab ...")
+ matches = match_sentences(all_sentences, nikkud_index, confusable_keys)
+ logger.info(f" {len(matches)} words matched")
- # Step 5: Summary stats
- print("\n" + "=" * 60)
- print("SUMMARY")
- print("=" * 60)
- print("\nSentences per book:")
- for book_name, count in book_counts.items():
- print(f" {book_name}: {count}")
- print(f" Total: {len(all_sentences)}")
+ # Break down by match method
+ # (logging only: the tally is not part of the returned stats)
+ method_counts: dict[str, int] = {}
+ for sent_list in matches.values():
+ for s in sent_list:
+ method = s["match_method"]
+ method_counts[method] = method_counts.get(method, 0) + 1
+ for method, count in sorted(method_counts.items()):
+ logger.info(f" {method}: {count} sentence-word pairs")
- print("\nVocab matching:")
- print(f" Total vocab words: {total_vocab}")
- print(f" Words with examples: {len(examples_cache)}")
- coverage = 100 * len(examples_cache) / total_vocab if total_vocab else 0
- print(f" Coverage: {coverage:.1f}%")
+ # Update words dict in place
+ updated = update_words_json(words, matches, confusable_keys)
+ logger.info(f" Updated {updated} entries in words.json")
- # Show some sample matches
- print("\nSample matches:")
- count = 0
- for word, sents in examples_cache.items():
- if count >= 5:
- break
- print(f" {word} -> {sents[0][:60]}...")
- count += 1
+ return {
+ "books": book_counts,
+ "matched": len(matches),
+ "total_vocab": len(words),
+ }
- return examples_cache
+# ── Standalone entry point ───────────────────────────────────────
if __name__ == "__main__":
- main()
+ # Standalone use: load words.json, let run() mutate the dict in place,
+ # then overwrite the same file and log the match coverage.
+ import json
+
+ logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+ words_path = DATA_DIR / "words.json"
+ with open(words_path, encoding="utf-8") as f:
+ words = json.load(f)
+
+ stats = run(words)
+
+ # Save updated words.json
+ with open(words_path, "w", encoding="utf-8") as f:
+ json.dump(words, f, ensure_ascii=False, indent=2)
+
+ # Guard against division by zero when words.json is empty.
+ coverage = stats["matched"] * 100 / stats["total_vocab"] if stats["total_vocab"] else 0
+ logger.info(f" Coverage: {stats['matched']}/{stats['total_vocab']} ({coverage:.1f}%)")
diff --git a/pealim_detail_scrape.py b/pealim_detail_scrape.py
index 22bef58..36730ba 100644
--- a/pealim_detail_scrape.py
+++ b/pealim_detail_scrape.py
@@ -2,7 +2,8 @@
"""
Consolidated detail page scraper for pealim.com.
-Visits /dict// detail pages for nouns and verbs in data/words.json.
+Visits /dict/<slug>/ detail pages for nouns, verbs, adjectives and prepositions
+in data/words.json.
Makes two requests per slug:
1. hebstyle=mo cookie → nikkud forms
2. hebstyle=vl cookie → ktiv male forms
@@ -11,7 +12,8 @@ Updates entries in data/words.json with scraped detail data.
Usage:
python3 pealim_detail_scrape.py [--test N] [--force-refresh-detail]
- [--nouns-only | --verbs-only]
+ [--nouns-only | --verbs-only |
+ --adjectives-only | --prepositions-only]
"""
import argparse
@@ -144,28 +146,128 @@ FORM_KEY_TO_PERSON: dict[str, str] = {
"infinitive": "inf",
}
-# Mishkal English name → Hebrew nikkud mapping (common patterns)
-MISHKAL_HEBREW: dict[str, str] = {
- "CaCaC": "קָטָל",
- "CeCeC": "קֶטֶל",
- "CiCeC": "קִטֶל",
- "CaCeC": "קָטֶל",
- "CoCeC": "קוֹטֵל",
- "CaCiC": "קָטִיד",
- "CaCuC": "קָטוּר",
- "miCCaC": "מִקְטָל",
- "miCCeC": "מִקְטֶל",
- "maCCeC": "מַקְטֶל",
- "maCCiC": "מַקְטִיר",
- "hiCCiC": "הִקְטִיל",
- "CiCCuC": "קִטּוּל",
- "hitCaCCeC": "הִתְקַטֵּל",
- "CaCCan": "קַטְּלָן",
- "CaCCaC": "קַטָּל",
- "CiCCon": "קִטְּרוֹן",
- "CaCCeC": "קַטֶּלֶת",
+# Mishkal English name → Hebrew nikkud mapping
+# Pealim dropdown uses q-notation (qatal), detail pages return k-notation (katal).
+# We store q-notation keys; _mishkal_to_hebrew() handles the k→q conversion.
+# Source: https://www.pealim.com/he/dict/?pos={adjective,noun}&am=qatil dropdowns
+_MISHKAL_HEBREW_Q: dict[str, str] = {
+ # --- a ---
+ "aqtal": "אַקְטָל",
+ "aqtala": "אַקְטָלָה",
+ # --- e ---
+ "eqtal": "אֶקְטָל",
+ # --- h ---
+ "haqtala": "הַקְטָלָה",
+ "heqtel": "הֶקְטֵל",
+ "hiqqatlut": "הִקָּטְלוּת",
+ "hitqattlut": "הִתְקַטְּלוּת",
+ # --- m ---
+ "maqtal": "מַקְטָל",
+ "maqtel": "מַקְטֵל",
+ "maqtela": "מַקְטֵלָה",
+ "maqtelet": "מַקְטֶלֶת",
+ "maqtil": "מַקְטִיל",
+ "maqtol": "מַקְטוֹל",
+ "maqtolet": "מַקְטֹלֶת",
+ "maqtul": "מַקְטוּל",
+ "meqattel": "מְקַטֵּל",
+ "meqila": "מְקִילָה",
+ "mequla": "מְקוּלָה",
+ "mequttal": "מְקֻטָּל",
+ "miqtal": "מִקְטָל",
+ "miqtala": "מִקְטָלָה",
+ "miqtelet": "מִקְטֶלֶת",
+ "miqtol": "מִקְטוֹל",
+ "miqtolet": "מִקְטֹלֶת",
+ "mitqattel": "מִתְקַטֵּל",
+ "muqtal": "מֻקְטָל",
+ # --- n ---
+ "niqtal": "נִקְטָל",
+ # --- q ---
+ "qal": "קַל",
+ "qatal": "קָטָל",
+ "qatel": "קָטֵל",
+ "qatil": "קָטִיל",
+ "qatla": "קַטְלָה",
+ "qatlan": "קַטְלָן",
+ "qatlut": "קַטְלוּת",
+ "qatol": "קָטוֹל",
+ "qaton": "קָטוֹן",
+ "qattal": "קַטָּל",
+ "qattala": "קַטָּלָה",
+ "qattelet": "קַטֶּלֶת",
+ "qattil": "קַטִּיל",
+ "qattila": "קַטִּילָה",
+ "qattolet": "קַטֹּלֶת",
+ "qattul": "קַטּוּל",
+ "qatul": "קָטוּל",
+ "qatut": "קָטוּת",
+ "qetel": "קֶטֶל",
+ "qeteh": "קֵטֶה",
+ "qitla": "קִטְלָה",
+ "qitlon": "קִטְלוֹן",
+ "qittalon": "קִטָּלוֹן",
+ "qittel": "קִטֵּל",
+ "qittelet": "קִטֶּלֶת",
+ "qittol": "קִטּוֹל",
+ "qittolet": "קִטֹּלֶת",
+ "qittul": "קִטּוּל",
+ "qol": "קֹל",
+ "qotal": "קוֹטָל",
+ "qotel": "קוֹטֵל",
+ "qotelet": "קוֹטֶלֶת",
+ "qotla": "קָטְלָה",
+ "qtal": "קְטָל",
+ "qtala": "קְטָלָה",
+ "qtaltal": "קְטַלְטַל",
+ "qtaltan": "קְטַלְתָּן",
+ "qtaltolet": "קְטַלְטֹלֶת",
+ "qtel": "קְטֵל",
+ "qtela": "קְטֵלָה",
+ "qtelet": "קְטֶלֶת",
+ "qtil": "קְטִיל",
+ "qtila": "קְטִילָה",
+ "qtili": "קְטִילִי",
+ "qtol": "קְטוֹל",
+ "qtola": "קְטוֹלָה",
+ "qtolet": "קְטֹלֶת",
+ "qtul": "קְטוּל",
+ "qtula": "קְטוּלָה",
+ "qtulla": "קְטֻלָּה",
+ "qtut": "קְטוּת",
+ "qutla": "קֻטְלָה",
+ "quttolet": "קֻטּוֹלֶת",
+ # --- t ---
+ "taqtela": "תַּקְטֵלָה",
+ "taqtil": "תַּקְטִיל",
+ "taqtit": "תַּקְטִית",
+ "taqtul": "תַּקְטוּל",
+ "taqtula": "תַּקְטוּלָה",
+ "taqtut": "תַּקְטוּת",
+ "tiqtal": "תִּקְטָל",
+ "tiqtala": "תִּקְטָלָה",
+ "tiqtelet": "תִּקְטֶלֶת",
+ "tiqtolet": "תִּקְטֹלֶת",
+ "tqilla": "תְּקִלָּה",
+ "tqula": "תְּקוּלָה",
+ # --- y ---
+ "yaqtul": "יַקְטוּל",
}
+
+def _mishkal_to_hebrew(mishkal: str) -> str | None:
+    """Return the Hebrew mishkal name for an English one, or None if unknown.
+
+    The name is looked up in ``_MISHKAL_HEBREW_Q`` as given first; when that
+    finds nothing, every "k" is replaced by "q" (k-notation → q-notation)
+    and the lookup is retried. An empty or missing name yields None.
+    """
+    if not mishkal:
+        return None
+    return _MISHKAL_HEBREW_Q.get(mishkal) or _MISHKAL_HEBREW_Q.get(mishkal.replace("k", "q"))
+
+
# ---------------------------------------------------------------------------
# HTTP session
# ---------------------------------------------------------------------------
@@ -452,7 +554,7 @@ def _scrape_noun_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
if mishkal:
result["mishkal"] = mishkal
- result["mishkal_hebrew"] = MISHKAL_HEBREW.get(mishkal)
+ result["mishkal_hebrew"] = _mishkal_to_hebrew(mishkal)
return result
@@ -887,6 +989,228 @@ def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: di
return result
+# ---------------------------------------------------------------------------
+# Adjective detail parsing
+# ---------------------------------------------------------------------------
+
+# Adjective form keys in table order; each cell's HTML id is the key plus "-a".
+_ADJECTIVE_FORM_KEYS: tuple[str, ...] = ("ms", "fs", "mp", "fp")
+_ADJECTIVE_CELL_IDS: tuple[str, ...] = tuple(f"{key}-a" for key in _ADJECTIVE_FORM_KEYS)
+
+
+def _parse_adjective_table(soup: BeautifulSoup) -> dict[str, dict]:
+    """
+    Collect the nikkud adjective forms from a pealim detail page (mo variant).
+
+    Finds the ``conjugation-table`` table, then looks up each cell by its
+    HTML id (ms-a, fs-a, mp-a, fp-a) and reads the text/audio pair that
+    ``_get_menukad_and_audio`` returns for it. Cells that are missing, or
+    that yield no nikkud text, are left out of the result.
+
+    Returns:
+        Dict mapping form key ("ms", "fs", "mp", "fp") to
+        {"nikkud": ..., "audio_url": ...}, or empty dict if table not found.
+    """
+    table = soup.find("table", class_="conjugation-table")
+    if not table:
+        return {}
+
+    forms: dict[str, dict] = {}
+    for cell_id, form_key in zip(_ADJECTIVE_CELL_IDS, _ADJECTIVE_FORM_KEYS, strict=True):
+        cell = table.find(id=cell_id)
+        if cell:
+            nikkud, audio_url = _get_menukad_and_audio(cell)
+            if nikkud:
+                forms[form_key] = {"nikkud": nikkud, "audio_url": audio_url}
+    return forms
+
+
+def _parse_adjective_table_vl(soup: BeautifulSoup) -> dict[str, str]:
+    """
+    Collect the ktiv male adjective forms from a vl page.
+
+    Uses the same table and cell ids as the nikkud parser, but takes each
+    cell's text from ``_get_plain_text``. Missing cells and cells with no
+    text are omitted.
+
+    Returns:
+        Dict mapping form key ("ms", "fs", "mp", "fp") to ktiv male string;
+        empty dict if the table is not found.
+    """
+    table = soup.find("table", class_="conjugation-table")
+    if not table:
+        return {}
+
+    forms: dict[str, str] = {}
+    for cell_id, form_key in zip(_ADJECTIVE_CELL_IDS, _ADJECTIVE_FORM_KEYS, strict=True):
+        cell = table.find(id=cell_id)
+        if cell and (ktiv := _get_plain_text(cell)):
+            forms[form_key] = ktiv
+    return forms
+
+
+def _parse_adjective_mishkal(soup: BeautifulSoup) -> tuple[str, str]:
+    """
+    Read the mishkal of an adjective detail page.
+
+    Delegates to ``_parse_noun_gender_mishkal`` (discarding the first element
+    of its result) and maps the returned name through ``_mishkal_to_hebrew``.
+
+    Returns:
+        Tuple of (mishkal_english, mishkal_hebrew). The Hebrew name is ""
+        when the lookup finds nothing.
+    """
+    _gender, mishkal = _parse_noun_gender_mishkal(soup)
+    hebrew = _mishkal_to_hebrew(mishkal)
+    return mishkal, hebrew or ""
+
+
+def _scrape_adjective_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
+    """
+    Build the adjective_inflection record from the two detail pages.
+
+    ``mo_html`` supplies the nikkud forms and the mishkal; ``vl_html``
+    supplies the ktiv male forms. ``_slug`` is not used.
+
+    Returns:
+        Dict with one entry per form key (ms, fs, mp, fp), each either
+        {nikkud, ktiv_male} or None when the nikkud page lacks that form,
+        plus ``mishkal`` and ``mishkal_hebrew`` (None when empty). A form
+        without a ktiv male counterpart gets ktiv_male "" and a warning is
+        logged. Empty dict if the nikkud page yields no forms at all.
+    """
+    mo_soup = BeautifulSoup(mo_html, "lxml")
+    vl_soup = BeautifulSoup(vl_html, "lxml")
+
+    nikkud_forms = _parse_adjective_table(mo_soup)
+    ktiv_forms = _parse_adjective_table_vl(vl_soup)
+    mishkal, mishkal_hebrew = _parse_adjective_mishkal(mo_soup)
+
+    if not nikkud_forms:
+        return {}
+
+    record: dict = {}
+    for form_key in _ADJECTIVE_FORM_KEYS:
+        mo_form = nikkud_forms.get(form_key)
+        if not mo_form:
+            record[form_key] = None
+            continue
+        nikkud = mo_form["nikkud"]
+        ktiv = ktiv_forms.get(form_key, "")
+        if not ktiv:
+            logger.warning("No ktiv_male for adjective form %s: %s", form_key, nikkud)
+        record[form_key] = {"nikkud": nikkud, "ktiv_male": ktiv}
+
+    record["mishkal"] = mishkal or None
+    record["mishkal_hebrew"] = mishkal_hebrew or None
+    return record
+
+
+# ---------------------------------------------------------------------------
+# Preposition detail parsing
+# ---------------------------------------------------------------------------
+
+# Person keys in table order; each cell's HTML id is the key prefixed with "P-".
+_PREPOSITION_FORM_KEYS: tuple[str, ...] = (
+    "1s",
+    "1p",
+    "2ms",
+    "2fs",
+    "2mp",
+    "2fp",
+    "3ms",
+    "3fs",
+    "3mp",
+    "3fp",
+)
+_PREPOSITION_CELL_IDS: tuple[str, ...] = tuple(f"P-{key}" for key in _PREPOSITION_FORM_KEYS)
+
+
+def _parse_preposition_table(soup: BeautifulSoup) -> dict[str, dict]:
+    """
+    Collect the nikkud pronominal-suffix forms of a preposition (mo variant).
+
+    Finds the ``conjugation-table`` table, then looks up each cell by its
+    HTML id (P-1s, P-1p, P-2ms, …, P-3fp) and reads the text/audio pair that
+    ``_get_menukad_and_audio`` returns for it. Cells that are missing, or
+    that yield no nikkud text, are left out of the result.
+
+    Returns:
+        Dict mapping person key ("1s", "1p", …, "3fp") to
+        {"nikkud": ..., "audio_url": ...}, or empty dict if table not found.
+    """
+    table = soup.find("table", class_="conjugation-table")
+    if not table:
+        return {}
+
+    forms: dict[str, dict] = {}
+    for cell_id, form_key in zip(_PREPOSITION_CELL_IDS, _PREPOSITION_FORM_KEYS, strict=True):
+        cell = table.find(id=cell_id)
+        if cell:
+            nikkud, audio_url = _get_menukad_and_audio(cell)
+            if nikkud:
+                forms[form_key] = {"nikkud": nikkud, "audio_url": audio_url}
+    return forms
+
+
+def _parse_preposition_table_vl(soup: BeautifulSoup) -> dict[str, str]:
+    """
+    Collect the ktiv male pronominal-suffix forms of a preposition (vl page).
+
+    Uses the same table and cell ids as the nikkud parser, but takes each
+    cell's text from ``_get_plain_text``. Missing cells and cells with no
+    text are omitted.
+
+    Returns:
+        Dict mapping person key ("1s", "1p", …, "3fp") to ktiv male string;
+        empty dict if the table is not found.
+    """
+    table = soup.find("table", class_="conjugation-table")
+    if not table:
+        return {}
+
+    forms: dict[str, str] = {}
+    for cell_id, form_key in zip(_PREPOSITION_CELL_IDS, _PREPOSITION_FORM_KEYS, strict=True):
+        cell = table.find(id=cell_id)
+        if cell and (ktiv := _get_plain_text(cell)):
+            forms[form_key] = ktiv
+    return forms
+
+
+def _scrape_preposition_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
+    """
+    Build the preposition_inflection record from the two detail pages.
+
+    ``mo_html`` supplies the nikkud forms; ``vl_html`` supplies the ktiv
+    male forms. ``_slug`` is not used.
+
+    Returns:
+        Dict with one entry per person key (1s, 1p, 2ms, 2fs, 2mp, 2fp,
+        3ms, 3fs, 3mp, 3fp), each either {nikkud, ktiv_male} or None when
+        the nikkud page lacks that form. A form without a ktiv male
+        counterpart gets ktiv_male "" and a warning is logged.
+        Empty dict if the nikkud page yields no forms at all.
+    """
+    mo_soup = BeautifulSoup(mo_html, "lxml")
+    vl_soup = BeautifulSoup(vl_html, "lxml")
+
+    nikkud_forms = _parse_preposition_table(mo_soup)
+    ktiv_forms = _parse_preposition_table_vl(vl_soup)
+
+    if not nikkud_forms:
+        return {}
+
+    record: dict = {}
+    for form_key in _PREPOSITION_FORM_KEYS:
+        mo_form = nikkud_forms.get(form_key)
+        if not mo_form:
+            record[form_key] = None
+            continue
+        nikkud = mo_form["nikkud"]
+        ktiv = ktiv_forms.get(form_key, "")
+        if not ktiv:
+            logger.warning("No ktiv_male for preposition form %s: %s", form_key, nikkud)
+        record[form_key] = {"nikkud": nikkud, "ktiv_male": ktiv}
+
+    return record
+
+
# ---------------------------------------------------------------------------
# Merging strategy
# ---------------------------------------------------------------------------
@@ -926,6 +1250,22 @@ def _merge_conjugation(_existing_conj: dict | None, scraped: dict) -> dict:
return scraped
+def _merge_adjective_inflection(_existing_ai: dict | None, scraped: dict) -> dict:
+    """
+    Produce the adjective_inflection value to store for an entry.
+
+    The existing value is ignored: there are no GUIDs to carry over, so the
+    result is simply a shallow copy of the scraped data.
+    """
+    merged: dict = {}
+    merged.update(scraped)
+    return merged
+
+
+def _merge_preposition_inflection(_existing_pi: dict | None, scraped: dict) -> dict:
+    """
+    Produce the preposition_inflection value to store for an entry.
+
+    The existing value is ignored: there are no GUIDs to carry over, so the
+    result is simply a shallow copy of the scraped data.
+    """
+    merged: dict = {}
+    merged.update(scraped)
+    return merged
+
+
# ---------------------------------------------------------------------------
# I/O helpers
# ---------------------------------------------------------------------------
@@ -953,14 +1293,26 @@ def _save_words(data: dict) -> None:
# ---------------------------------------------------------------------------
-def _should_process(entry: dict, pos: str, force: bool, nouns_only: bool, verbs_only: bool) -> bool:
+def _should_process(
+ entry: dict,
+ pos: str,
+ force: bool,
+ nouns_only: bool,
+ verbs_only: bool,
+ adjectives_only: bool,
+ prepositions_only: bool,
+) -> bool:
"""Return True if this entry should be scraped."""
+ # Eligibility is decided from the prefix of pos: only Noun, Verb, Adjective
+ # and Preposition entries qualify, and each *_only flag further restricts
+ # the run to its own part of speech. Entries already marked
+ # detail_scraped are skipped unless force is set.
- if not pos.startswith(("Noun", "Verb")):
+ if not pos.startswith(("Noun", "Verb", "Adjective", "Preposition")):
return False
if nouns_only and not pos.startswith("Noun"):
return False
if verbs_only and not pos.startswith("Verb"):
return False
+ if adjectives_only and not pos.startswith("Adjective"):
+ return False
+ if prepositions_only and not pos.startswith("Preposition"):
+ return False
return force or not entry.get("detail_scraped")
@@ -969,6 +1321,8 @@ def run(
force_refresh: bool = False,
nouns_only: bool = False,
verbs_only: bool = False,
+ adjectives_only: bool = False,
+ prepositions_only: bool = False,
) -> None:
"""
Main scrape loop.
@@ -978,13 +1332,24 @@ def run(
force_refresh: Re-scrape entries where detail_scraped=True.
nouns_only: Only scrape noun entries.
verbs_only: Only scrape verb entries.
+ adjectives_only: Only scrape adjective entries.
+ prepositions_only: Only scrape preposition entries.
"""
words = _load_words()
candidates = [
(unique_key, entry)
for unique_key, entry in words.items()
- if _should_process(entry, entry.get("pos", ""), force_refresh, nouns_only, verbs_only) and entry.get("slug")
+ if _should_process(
+ entry,
+ entry.get("pos", ""),
+ force_refresh,
+ nouns_only,
+ verbs_only,
+ adjectives_only,
+ prepositions_only,
+ )
+ and entry.get("slug")
]
total = len(candidates)
@@ -992,7 +1357,10 @@ def run(
candidates = candidates[:test]
logger.info("Test mode: processing %d of %d eligible entries", len(candidates), total)
else:
- logger.info("Processing %d eligible entries (nouns+verbs) from words.json", total)
+ logger.info(
+ "Processing %d eligible entries (nouns+verbs+adjectives+prepositions) from words.json",
+ total,
+ )
processed = 0
errors = 0
@@ -1003,7 +1371,14 @@ def run(
word_nikkud = entry.get("word", {}).get("nikkud", unique_key)
url = f"{PEALIM_BASE}/dict/{slug}/"
- label = "Noun" if pos.startswith("Noun") else "Verb"
+ if pos.startswith("Noun"):
+ label = "Noun"
+ elif pos.startswith("Verb"):
+ label = "Verb"
+ elif pos.startswith("Adjective"):
+ label = "Adjective"
+ else:
+ label = "Preposition"
logger.info("[%d/%d] %s: %s (%s)", idx, len(candidates), label, word_nikkud, slug)
# Fetch mo (nikkud) page
@@ -1042,7 +1417,7 @@ def run(
errors += 1
continue
- else: # Verb
+ elif pos.startswith("Verb"):
existing_conj = entry.get("conjugation")
scraped = _scrape_verb_detail(slug, mo_html, vl_html, existing_conj)
if scraped:
@@ -1059,6 +1434,41 @@ def run(
errors += 1
continue
+ elif pos.startswith("Adjective"):
+ scraped = _scrape_adjective_detail(slug, mo_html, vl_html)
+ if scraped:
+ existing_ai = entry.get("adjective_inflection")
+ merged = _merge_adjective_inflection(existing_ai, scraped)
+ words[unique_key]["adjective_inflection"] = merged
+ ms = merged.get("ms", {}) or {}
+ fs = merged.get("fs", {}) or {}
+ logger.info(
+ " ms=%s fs=%s mishkal=%s",
+ ms.get("nikkud", "—"),
+ fs.get("nikkud", "—"),
+ merged.get("mishkal", "—"),
+ )
+ else:
+ logger.warning(" No adjective data scraped for %s", slug)
+ errors += 1
+ continue
+
+ else: # Preposition
+ scraped = _scrape_preposition_detail(slug, mo_html, vl_html)
+ if scraped:
+ existing_pi = entry.get("preposition_inflection")
+ merged = _merge_preposition_inflection(existing_pi, scraped)
+ words[unique_key]["preposition_inflection"] = merged
+ form_1s = merged.get("1s", {}) or {}
+ logger.info(
+ " 1s=%s",
+ form_1s.get("nikkud", "—"),
+ )
+ else:
+ logger.warning(" No preposition data scraped for %s", slug)
+ errors += 1
+ continue
+
except Exception as exc: # noqa: BLE001
logger.error(" Parse error for %s (%s): %s", slug, word_nikkud, exc, exc_info=True)
errors += 1
@@ -1089,7 +1499,7 @@ def run(
def _build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
- description="Scrape pealim.com detail pages for nouns and verbs in data/words.json."
+ description=("Scrape pealim.com detail pages for nouns, verbs, adjectives and prepositions in data/words.json.")
)
parser.add_argument(
"--test",
@@ -1117,6 +1527,18 @@ def _build_parser() -> argparse.ArgumentParser:
default=False,
help="Only scrape Verb entries.",
)
+ group.add_argument(
+ "--adjectives-only",
+ action="store_true",
+ default=False,
+ help="Only scrape Adjective entries.",
+ )
+ group.add_argument(
+ "--prepositions-only",
+ action="store_true",
+ default=False,
+ help="Only scrape Preposition entries.",
+ )
return parser
@@ -1133,4 +1555,6 @@ if __name__ == "__main__":
force_refresh=args.force_refresh_detail,
nouns_only=args.nouns_only,
verbs_only=args.verbs_only,
+ adjectives_only=args.adjectives_only,
+ prepositions_only=args.prepositions_only,
)
diff --git a/rebuild_sentence_matches.py b/rebuild_sentence_matches.py
deleted file mode 100644
index 1d8b1cb..0000000
--- a/rebuild_sentence_matches.py
+++ /dev/null
@@ -1,183 +0,0 @@
-#!/usr/bin/env python3
-"""
-Rebuild vocab_sentence_matches.json using both direct word matching
-and ktiv male conjugated/declined form matching.
-
-This dramatically improves sentence coverage by matching not just
-dictionary forms but all conjugated verbs and declined nouns.
-"""
-
-import json
-import logging
-import re
-from pathlib import Path
-
-import pandas as pd
-
-from helpers import strip_nikkud as _strip_nikkud
-
-logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
-logger = logging.getLogger(__name__)
-
-DATA_DIR = Path(__file__).parent / "data"
-
-
-def main():
- # Load sentences
- with open(DATA_DIR / "epub_sentence_index.json") as f:
- sentences = json.load(f).get("sentences", [])
- logger.info(f"Loaded {len(sentences)} sentences")
-
- # Load vocab CSV
- csv_path = DATA_DIR / "hebrew_dict_for_anki.csv"
- try:
- df = pd.read_csv(csv_path, sep=";", index_col=0)
- if df.shape[1] < 3:
- raise ValueError
- except (ValueError, pd.errors.ParserError):
- df = pd.read_csv(csv_path, index_col=0)
- logger.info(f"Loaded {len(df)} vocab entries")
-
- # Build word lookup: stripped_form → (word_nikkud, word_no_nikkud)
- word_lookup: dict[str, list[tuple[str, str]]] = {}
- for _, row in df.iterrows():
- word = str(row.get("Word", "")).strip()
- wni = str(row.get("Word Without Nikkud", "")).strip()
- if not word or word in ("nan", "None"):
- continue
- stripped = _strip_nikkud(word)
- if stripped:
- word_lookup.setdefault(stripped, []).append((word, wni))
-
- # Load ktiv male forms: ktiv_male_form → [{word_nikkud, form_type, ...}]
- ktiv_path = DATA_DIR / "ktiv_male_forms.json"
- ktiv_forms: dict[str, list[dict]] = {}
- if ktiv_path.exists():
- with open(ktiv_path) as f:
- ktiv_forms = json.load(f)
- logger.info(f"Loaded {len(ktiv_forms)} ktiv male forms")
- else:
- logger.warning("No ktiv_male_forms.json — only using direct matching")
-
- # Build reverse lookup: ktiv_male → set of dictionary words (nikkud)
- ktiv_to_word: dict[str, set[str]] = {}
- for ktiv, entries in ktiv_forms.items():
- for entry in entries:
- word_nikkud = entry.get("word_nikkud", "")
- if word_nikkud:
- ktiv_to_word.setdefault(ktiv, set()).add(word_nikkud)
-
- # Also add all vocab words' own stripped forms to ktiv_to_word
- for stripped, entries in word_lookup.items():
- for word_nikkud, _ in entries:
- ktiv_to_word.setdefault(stripped, set()).add(word_nikkud)
-
- logger.info(f"Total matchable forms: {len(ktiv_to_word)}")
-
- # Tokenize all sentences once
- sentence_tokens: list[tuple[dict, list[str]]] = []
- for s in sentences:
- stripped = s.get("stripped", _strip_nikkud(s.get("text", "")))
- tokens = [re.sub(r'[.,!?;:"\'\u05be]', "", t) for t in stripped.split()]
- tokens = [t for t in tokens if t] # remove empty
- sentence_tokens.append((s, tokens))
-
- # Match: for each sentence token, check ktiv_to_word lookup
- # Build word_nikkud → [sentence_info]
- matches: dict[str, list[dict]] = {} # word_nikkud → [sentences]
-
- for sent, tokens in sentence_tokens:
- text = sent.get("text", "")
- book = sent.get("book", "")
- word_len = len(tokens)
-
- # Skip sentences that are too short or too long
- if word_len < 4 or word_len > 15:
- continue
-
- for tok in tokens:
- if tok in ktiv_to_word:
- for word_nikkud in ktiv_to_word[tok]:
- matches.setdefault(word_nikkud, []).append(
- {
- "text": text,
- "book": book,
- "matched_form": tok,
- "word_count": word_len,
- }
- )
-
- logger.info(f"Words with at least 1 match: {len(matches)}")
-
- # Deduplicate and limit to 3 best sentences per word
- # Prefer shorter sentences (6-12 words ideal)
- output: dict[str, dict] = {}
- for word_nikkud, sents in matches.items():
- # Deduplicate by text
- seen_texts = set()
- unique = []
- for s in sents:
- if s["text"] not in seen_texts:
- seen_texts.add(s["text"])
- unique.append(s)
-
- # Score: prefer 6-12 word sentences
- def score(s):
- wc = s["word_count"]
- if 6 <= wc <= 12:
- return 0 # ideal
- return abs(wc - 9) # distance from ideal
-
- unique.sort(key=score)
- best = unique[:3]
-
- # Find the Word Without Nikkud for this word
- stripped = _strip_nikkud(word_nikkud)
- wni = stripped # default
- if stripped in word_lookup:
- for wn, w_wni in word_lookup[stripped]:
- if wn == word_nikkud:
- wni = w_wni
- break
-
- output[wni] = {
- "word_nikkud": word_nikkud,
- "sentences": [{"text": s["text"], "book": s["book"]} for s in best],
- }
-
- # Save
- out_path = DATA_DIR / "vocab_sentence_matches.json"
- with open(out_path, "w") as f:
- json.dump(output, f, ensure_ascii=False, indent=1)
-
- total_sents = sum(len(v["sentences"]) for v in output.values())
- logger.info(f"Saved {len(output)} words with {total_sents} sentences → {out_path}")
-
- # Stats
- total_vocab = len(df)
- pct = len(output) * 100 / total_vocab
- logger.info(f"Coverage: {len(output)}/{total_vocab} ({pct:.1f}%)")
-
- # Breakdown by match type
- direct_only = 0
- ktiv_only = 0
- both = 0
- for _wni, info in output.items():
- word = info["word_nikkud"]
- stripped = _strip_nikkud(word)
- has_direct = stripped in word_lookup
- has_ktiv = any(s.get("matched_form", "") != stripped for s in info["sentences"])
- if has_direct and has_ktiv:
- both += 1
- elif has_ktiv:
- ktiv_only += 1
- else:
- direct_only += 1
-
- logger.info(f" Direct matches only: {direct_only}")
- logger.info(f" Ktiv male matches only: {ktiv_only}")
- logger.info(f" Both: {both}")
-
-
-if __name__ == "__main__":
- main()
diff --git a/run.py b/run.py
index 93142f5..b3d527c 100644
--- a/run.py
+++ b/run.py
@@ -11,7 +11,7 @@ Pipeline steps:
1. List scrape — scrape pealim.com list pages → words.json (captures slugs)
2. Detail scrape — scrape noun/verb detail pages using slugs → words.json
3. Frequency — load/download word frequency data
- 4. Examples — fetch Ben Yehuda example sentences
+ 4. Examples — extract example sentences from Hebrew EPUBs
5. Audio download — download audio mp3 files
6. Fonts — download Heebo font files
7. Images — fetch noun images from Wikipedia
@@ -21,9 +21,8 @@ Options:
--skip-scrape Skip list page scraping (use existing words.json)
--skip-detail Skip detail page scraping
--skip-audio Skip audio .mp3 downloads
- --skip-examples Skip Ben Yehuda example fetching
+ --skip-examples Skip EPUB example extraction
--skip-images Skip image fetching for concrete nouns
- --refresh-examples Force rebuild of Ben Yehuda index
--test N Limit to first N words/pages
"""
@@ -60,9 +59,8 @@ def parse_args():
p.add_argument("--skip-scrape", action="store_true", help="Skip list page scraping")
p.add_argument("--skip-detail", action="store_true", help="Skip detail page scraping")
p.add_argument("--skip-audio", action="store_true", help="Skip audio downloads")
- p.add_argument("--skip-examples", action="store_true", help="Skip Ben Yehuda example lookup")
+ p.add_argument("--skip-examples", action="store_true", help="Skip EPUB example extraction")
p.add_argument("--skip-images", action="store_true", help="Skip image fetching")
- p.add_argument("--refresh-examples", action="store_true", help="Force rebuild of Ben Yehuda index")
p.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
return p.parse_args()
@@ -93,22 +91,15 @@ def step_frequency() -> dict[str, int]:
return frequency_lookup._freq
-def step_examples(args, _freq_cache: dict):
- """Step 4 — load/build Ben Yehuda example index."""
+def step_examples(args) -> dict:
+ """Step 4 — extract example sentences from Hebrew EPUBs."""
if args.skip_examples:
logger.info("[4] Skipping examples (--skip-examples)")
- examples_path = DATA_DIR / "examples_cache.json"
- if examples_path.exists():
- with open(examples_path) as f:
- return json.load(f)
return {}
- logger.info("[4] Loading Ben Yehuda example index …")
- import benyehuda
+ logger.info("[4] Extracting EPUB example sentences …")
+ import epub_examples
- benyehuda.load(force_rebuild=args.refresh_examples)
-
- # Read word list from words.json instead of CSV
if not WORDS_JSON.exists():
logger.warning("[4] words.json not found, skipping examples")
return {}
@@ -116,41 +107,14 @@ def step_examples(args, _freq_cache: dict):
with open(WORDS_JSON, encoding="utf-8") as f:
words = json.load(f)
- entries = list(words.values())
- if args.test:
- entries = entries[: args.test]
+ stats = epub_examples.run(words)
- # Build confusable consonant set from words.json
- consonant_counts: dict[str, int] = {}
- for entry in entries:
- ktiv_male = entry.get("word", {}).get("ktiv_male", "")
- if ktiv_male:
- safe = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male)
- if safe:
- consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
- confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
+ # Save updated words.json
+ with open(WORDS_JSON, "w", encoding="utf-8") as f:
+ json.dump(words, f, ensure_ascii=False, indent=2)
- # Delete stale cache entries for confusable words so they get re-fetched
- stale_deleted = 0
- for entry in entries:
- word_nikkud = entry.get("word", {}).get("nikkud", "")
- ktiv_male = entry.get("word", {}).get("ktiv_male", "")
- if word_nikkud and ktiv_male:
- safe = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male)
- if safe in confusable_consonants and word_nikkud in benyehuda._examples_cache:
- del benyehuda._examples_cache[word_nikkud]
- stale_deleted += 1
- if stale_deleted:
- logger.info(f" Deleted {stale_deleted} stale confusable cache entries")
-
- logger.info(f" Pre-fetching examples for {len(entries)} words …")
- for entry in entries:
- word_nikkud = entry.get("word", {}).get("nikkud", "")
- if word_nikkud:
- benyehuda.get_examples(word_nikkud, confusable_consonants=confusable_consonants)
-
- benyehuda.save_examples_cache()
- return benyehuda._examples_cache
+ logger.info(f" Coverage: {stats['matched']}/{stats['total_vocab']}")
+ return stats
def step_detail_scrape(args):
@@ -250,7 +214,7 @@ def step_build_all(args):
apkg_builder.build_all_variants(words, limit=args.test)
-def print_summary(_args: argparse.Namespace, examples_cache: dict, freq_cache: dict):
+def print_summary(_args: argparse.Namespace, example_stats: dict, freq_cache: dict):
logger.info("")
logger.info("=" * 60)
logger.info("SUMMARY")
@@ -267,10 +231,12 @@ def print_summary(_args: argparse.Namespace, examples_cache: dict, freq_cache: d
logger.info(f" Nouns: {nouns}, Verbs: {verbs}, Detail-scraped: {detail_scraped}")
logger.info(f" Frequency entries: {len(freq_cache)}")
- logger.info(f" Example cache entries: {len(examples_cache)}")
- covered = sum(1 for v in examples_cache.values() if v)
- if examples_cache:
- logger.info(f" Example coverage: {covered}/{len(examples_cache)} ({100 * covered // len(examples_cache)}%)")
+ matched = example_stats.get("matched", 0)
+ total = example_stats.get("total_vocab", 0)
+ if total:
+ logger.info(f" Example coverage: {matched}/{total} ({100 * matched // total}%)")
+ for book, count in example_stats.get("books", {}).items():
+ logger.info(f" {book}: {count} sentences")
if AUDIO_DIR.exists():
mp3s = list(AUDIO_DIR.glob("*.mp3"))
@@ -321,8 +287,6 @@ def main():
logger.info(f" MODE: --only {args.only}")
if args.test:
logger.info(f" TEST MODE: {args.test} words")
- if args.refresh_examples:
- logger.info(" REFRESH EXAMPLES: Ben Yehuda index will be rebuilt")
logger.info("=" * 60)
def _load_words_for_only() -> dict:
@@ -385,13 +349,13 @@ def main():
step_list_scrape(args) # 1 — scrape list pages → words.json (captures slugs)
step_detail_scrape(args) # 2 — scrape detail pages using slugs → words.json
freq_cache = step_frequency() # 3 — word frequency data
- examples_cache = step_examples(args, _freq_cache=freq_cache) # 4 — Ben Yehuda examples
+ example_stats = step_examples(args) # 4 — EPUB example sentences
step_audio_download(args) # 5 — download audio mp3s
step_fonts(args) # 6 — download Heebo fonts
step_images(args) # 7 — fetch noun images
step_build_all(args) # 8 — build all .apkg variants
- print_summary(args, examples_cache, freq_cache)
+ print_summary(args, example_stats, freq_cache)
if __name__ == "__main__":
diff --git a/scripts/validate_data.py b/scripts/validate_data.py
index 69dfb9d..5ce760d 100644
--- a/scripts/validate_data.py
+++ b/scripts/validate_data.py
@@ -32,7 +32,7 @@ DATA_FILE = Path(__file__).parent.parent / "data" / "words.json"
HEBREW_CONSONANT_RANGE = (0x05D0, 0x05EA) # alef–tav
VALID_PERSON_CODES: frozenset[str] = frozenset(
- ["1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp", "3p", "ms", "fs", "mp", "fp"]
+ ["inf", "1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp", "3p", "ms", "fs", "mp", "fp"]
)
EMOJI_RE = re.compile(
@@ -561,6 +561,7 @@ def test_conjugation_form_guids(data: dict[str, Any]) -> None:
"""
name = "conjugation_form_guids"
errors: list[str] = []
+ warnings: list[str] = []
for key, entry in data.items():
conj = entry.get("conjugation")
@@ -580,7 +581,8 @@ def test_conjugation_form_guids(data: dict[str, Any]) -> None:
guid_candidates = form.get("guid_candidates")
if not guid and not guid_candidates:
- errors.append(f"[{key}] {label}: missing or null 'guid' and no 'guid_candidates'")
+ # New forms from rescrape use deterministic fallback — warn, don't fail
+ warnings.append(f"[{key}] {label}: missing or null 'guid' and no 'guid_candidates'")
continue
if guid:
@@ -597,6 +599,8 @@ def test_conjugation_form_guids(data: dict[str, Any]) -> None:
else:
seen_guids[candidate] = label
+ if warnings:
+ _warn(name + "_missing", [f"{len(warnings)} forms missing guid (deterministic fallback used)"])
if errors:
_fail(name, errors[:20] if not _verbose else errors)
if len(errors) > 20 and not _verbose:
diff --git a/tests/test_detail_scrape.py b/tests/test_detail_scrape.py
new file mode 100644
index 0000000..8a040c5
--- /dev/null
+++ b/tests/test_detail_scrape.py
@@ -0,0 +1,486 @@
+"""Tests for adjective and preposition detail page parsing in pealim_detail_scrape.py."""
+
+import sys
+from pathlib import Path
+
+import pytest
+
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+
+from pealim_detail_scrape import (
+ _parse_adjective_table,
+ _parse_adjective_table_vl,
+ _parse_preposition_table,
+ _parse_preposition_table_vl,
+ _scrape_adjective_detail,
+ _scrape_preposition_detail,
+)
+
+# ---------------------------------------------------------------------------
+# Fixtures — real HTML snippets from pealim.com
+# ---------------------------------------------------------------------------
+
+ADJECTIVE_MO_TABLE = """
+
+
+
+ | Singular |
+ Plural |
+
+
+ | Masculine |
+ Feminine |
+ Masculine |
+ Feminine |
+
+
+
+
+
+
+
+ spring-like, vernal
+
+ |
+
+
+
+ spring-like, vernal
+
+ |
+
+
+
+ spring-like, vernal
+
+ |
+
+
+
+ spring-like, vernal
+
+ |
+
+
+
+"""
+
+# VL version: menukad spans contain unvowelled text (hebstyle=vl)
+ADJECTIVE_VL_TABLE = """
+
+
+
+ |
+
+ |
+
+
+ |
+
+
+ |
+
+
+ |
+
+
+
+"""
+
+PREPOSITION_MO_TABLE = """
+
+
+
+ | Person |
+ Singular |
+ Plural |
+
+
+ | Masculine |
+ Feminine |
+ Masculine |
+ Feminine |
+
+
+
+
+ | 1st |
+
+
+ |
+
+
+ |
+
+
+ | 2nd |
+
+
+ |
+
+
+ |
+
+
+ |
+
+
+ |
+
+
+ | 3rd |
+
+
+ |
+
+
+ |
+
+
+ |
+
+
+ |
+
+
+
+"""
+
+PREPOSITION_VL_TABLE = """
+
+
+
+ | 1st |
+ |
+ |
+
+
+ | 2nd |
+ |
+ |
+ |
+ |
+
+
+ | 3rd |
+ |
+ |
+ |
+ |
+
+
+
+"""
+
+# Minimal full-page wrappers so _scrape_*_detail() can parse them
+_ADJECTIVE_MO_PAGE = f"{ADJECTIVE_MO_TABLE}"
+_ADJECTIVE_VL_PAGE = f"{ADJECTIVE_VL_TABLE}"
+_PREPOSITION_MO_PAGE = f"{PREPOSITION_MO_TABLE}"
+_PREPOSITION_VL_PAGE = f"{PREPOSITION_VL_TABLE}"
+
+
+# ---------------------------------------------------------------------------
+# Adjective table tests
+# ---------------------------------------------------------------------------
+
+
+class TestParseAdjectiveTable:
+    """Tests for _parse_adjective_table against the nikkud ("mo") fixture table."""
+
+    @pytest.fixture()
+    def result(self) -> dict:
+        """Parse ADJECTIVE_MO_TABLE once per test, like the preposition test classes do."""
+        return _parse_adjective_table(__import__("bs4").BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml"))
+
+    def test_returns_four_form_keys(self, result: dict) -> None:
+        assert set(result.keys()) == {"ms", "fs", "mp", "fp"}
+
+    def test_ms_nikkud(self, result: dict) -> None:
+        assert result["ms"]["nikkud"] == "אֲבִיבִי"
+
+    def test_fs_nikkud(self, result: dict) -> None:
+        assert result["fs"]["nikkud"] == "אֲבִיבִית"
+
+    def test_mp_nikkud(self, result: dict) -> None:
+        assert result["mp"]["nikkud"] == "אֲבִיבִיִּים"
+
+    def test_fp_nikkud(self, result: dict) -> None:
+        assert result["fp"]["nikkud"] == "אֲבִיבִיּוֹת"
+
+    def test_audio_url_present(self, result: dict) -> None:
+        assert result["ms"]["audio_url"].startswith("https://audio.pealim.com/")
+
+    def test_empty_on_missing_table(self) -> None:
+        # An empty document contains no table, so the parser is expected to return {}.
+        result = _parse_adjective_table(__import__("bs4").BeautifulSoup("", "lxml"))
+        assert result == {}
+
+
+class TestParseAdjectiveTableVl:
+    """Tests for _parse_adjective_table_vl against the ktiv male ("vl") fixture table."""
+
+    @pytest.fixture()
+    def result(self) -> dict:
+        """Parse ADJECTIVE_VL_TABLE once per test; values are compared as plain strings."""
+        return _parse_adjective_table_vl(__import__("bs4").BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml"))
+
+    def test_returns_four_form_keys(self, result: dict) -> None:
+        assert set(result.keys()) == {"ms", "fs", "mp", "fp"}
+
+    def test_ms_ktiv(self, result: dict) -> None:
+        assert result["ms"] == "אביבי"
+
+    def test_fs_ktiv(self, result: dict) -> None:
+        assert result["fs"] == "אביבית"
+
+    def test_mp_ktiv(self, result: dict) -> None:
+        assert result["mp"] == "אביביים"
+
+    def test_fp_ktiv(self, result: dict) -> None:
+        assert result["fp"] == "אביביות"
+
+
+# ---------------------------------------------------------------------------
+# _scrape_adjective_detail tests
+# ---------------------------------------------------------------------------
+
+
+class TestScrapeAdjectiveDetail:
+ """Tests for _scrape_adjective_detail: per-form nikkud/ktiv_male values and the mishkal keys."""
+
+ @pytest.fixture()
+ def result(self) -> dict:
+ return _scrape_adjective_detail("9098-avivi", _ADJECTIVE_MO_PAGE, _ADJECTIVE_VL_PAGE)
+
+ def test_returns_non_empty_dict(self, result: dict) -> None:
+ assert result
+
+ def test_ms_has_nikkud_and_ktiv(self, result: dict) -> None:
+ assert result["ms"]["nikkud"] == "אֲבִיבִי"
+ assert result["ms"]["ktiv_male"] == "אביבי"
+
+ def test_fs_has_nikkud_and_ktiv(self, result: dict) -> None:
+ assert result["fs"]["nikkud"] == "אֲבִיבִית"
+ assert result["fs"]["ktiv_male"] == "אביבית"
+
+ def test_mp_has_nikkud_and_ktiv(self, result: dict) -> None:
+ assert result["mp"]["nikkud"] == "אֲבִיבִיִּים"
+ assert result["mp"]["ktiv_male"] == "אביביים"
+
+ def test_fp_has_nikkud_and_ktiv(self, result: dict) -> None:
+ assert result["fp"]["nikkud"] == "אֲבִיבִיּוֹת"
+ assert result["fp"]["ktiv_male"] == "אביביות"
+
+ def test_mishkal_key_present(self, result: dict) -> None:
+ # Only the key is checked: the value may be None because the minimal fixture has no PoS section.
+ assert "mishkal" in result
+
+ def test_mishkal_hebrew_key_present(self, result: dict) -> None:
+ assert "mishkal_hebrew" in result
+
+ def test_all_schema_keys_present(self, result: dict) -> None:
+ expected = {"ms", "fs", "mp", "fp", "mishkal", "mishkal_hebrew"}
+ assert expected.issubset(result.keys())  # subset check: additional keys are tolerated
+
+ def test_empty_on_no_table(self) -> None:
+ result = _scrape_adjective_detail("missing", "", "")
+ assert result == {}
+
+
+# ---------------------------------------------------------------------------
+# Preposition table tests
+# ---------------------------------------------------------------------------
+
+
+class TestParsePrepositionTable:
+ """Tests for _parse_preposition_table (mo/nikkud page): one test per person key plus audio and empty input."""
+
+ @pytest.fixture()
+ def result(self) -> dict:
+ return _parse_preposition_table(__import__("bs4").BeautifulSoup(PREPOSITION_MO_TABLE, "lxml"))
+
+ def test_returns_ten_form_keys(self, result: dict) -> None:
+ expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
+ assert set(result.keys()) == expected  # exact match: no extra keys allowed
+
+ def test_1s_nikkud(self, result: dict) -> None:
+ assert result["1s"]["nikkud"] == "שֶׁלִּי"
+
+ def test_1p_nikkud(self, result: dict) -> None:
+ assert result["1p"]["nikkud"] == "שֶׁלָּנוּ"
+
+ def test_2ms_nikkud(self, result: dict) -> None:
+ assert result["2ms"]["nikkud"] == "שֶׁלְּךָ"
+
+ def test_2fs_nikkud(self, result: dict) -> None:
+ assert result["2fs"]["nikkud"] == "שֶׁלָּךְ"
+
+ def test_2mp_nikkud(self, result: dict) -> None:
+ assert result["2mp"]["nikkud"] == "שֶׁלָּכֶם"
+
+ def test_2fp_nikkud(self, result: dict) -> None:
+ assert result["2fp"]["nikkud"] == "שֶׁלָּכֶן"
+
+ def test_3ms_nikkud(self, result: dict) -> None:
+ assert result["3ms"]["nikkud"] == "שֶׁלּוֹ"
+
+ def test_3fs_nikkud(self, result: dict) -> None:
+ assert result["3fs"]["nikkud"] == "שֶׁלָּהּ"
+
+ def test_3mp_nikkud(self, result: dict) -> None:
+ assert result["3mp"]["nikkud"] == "שֶׁלָּהֶם"
+
+ def test_3fp_nikkud(self, result: dict) -> None:
+ assert result["3fp"]["nikkud"] == "שֶׁלָּהֶן"
+
+ def test_audio_url_present(self, result: dict) -> None:
+ assert result["1s"]["audio_url"].startswith("https://audio.pealim.com/")
+
+ def test_empty_on_missing_table(self) -> None:
+ result = _parse_preposition_table(__import__("bs4").BeautifulSoup("", "lxml"))
+ assert result == {}
+
+
+class TestParsePrepositionTableVl:
+ """Tests for _parse_preposition_table_vl (ktiv male page): values are compared as plain strings per person key."""
+
+ @pytest.fixture()
+ def result(self) -> dict:
+ return _parse_preposition_table_vl(__import__("bs4").BeautifulSoup(PREPOSITION_VL_TABLE, "lxml"))
+
+ def test_returns_ten_form_keys(self, result: dict) -> None:
+ expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
+ assert set(result.keys()) == expected  # exact match: no extra keys allowed
+
+ def test_1s_ktiv(self, result: dict) -> None:
+ assert result["1s"] == "שלי"
+
+ def test_1p_ktiv(self, result: dict) -> None:
+ assert result["1p"] == "שלנו"
+
+ def test_2ms_ktiv(self, result: dict) -> None:
+ assert result["2ms"] == "שלך"
+
+ def test_3ms_ktiv(self, result: dict) -> None:
+ assert result["3ms"] == "שלו"
+
+ def test_3fp_ktiv(self, result: dict) -> None:
+ assert result["3fp"] == "שלהן"
+
+
+# ---------------------------------------------------------------------------
+# _scrape_preposition_detail tests
+# ---------------------------------------------------------------------------
+
+
+class TestScrapePrepositionDetail:
+ """Tests for _scrape_preposition_detail: each checked person key carries both nikkud and ktiv_male."""
+
+ @pytest.fixture()
+ def result(self) -> dict:
+ return _scrape_preposition_detail("2643-shel", _PREPOSITION_MO_PAGE, _PREPOSITION_VL_PAGE)
+
+ def test_returns_non_empty_dict(self, result: dict) -> None:
+ assert result
+
+ def test_all_ten_person_keys_present(self, result: dict) -> None:
+ expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
+ assert expected.issubset(result.keys())  # subset check: additional keys are tolerated
+
+ def test_1s_has_nikkud_and_ktiv(self, result: dict) -> None:
+ assert result["1s"]["nikkud"] == "שֶׁלִּי"
+ assert result["1s"]["ktiv_male"] == "שלי"
+
+ def test_1p_has_nikkud_and_ktiv(self, result: dict) -> None:
+ assert result["1p"]["nikkud"] == "שֶׁלָּנוּ"
+ assert result["1p"]["ktiv_male"] == "שלנו"
+
+ def test_2ms_has_nikkud_and_ktiv(self, result: dict) -> None:
+ assert result["2ms"]["nikkud"] == "שֶׁלְּךָ"
+ assert result["2ms"]["ktiv_male"] == "שלך"
+
+ def test_3ms_has_nikkud_and_ktiv(self, result: dict) -> None:
+ assert result["3ms"]["nikkud"] == "שֶׁלּוֹ"
+ assert result["3ms"]["ktiv_male"] == "שלו"
+
+ def test_3fs_has_nikkud_and_ktiv(self, result: dict) -> None:
+ assert result["3fs"]["nikkud"] == "שֶׁלָּהּ"
+ assert result["3fs"]["ktiv_male"] == "שלה"
+
+ def test_3fp_has_nikkud_and_ktiv(self, result: dict) -> None:
+ assert result["3fp"]["nikkud"] == "שֶׁלָּהֶן"
+ assert result["3fp"]["ktiv_male"] == "שלהן"
+
+ def test_empty_on_no_table(self) -> None:
+ result = _scrape_preposition_detail("missing", "", "")
+ assert result == {}
diff --git a/tests/test_smoke.py b/tests/test_smoke.py
index 78851ef..d64223d 100644
--- a/tests/test_smoke.py
+++ b/tests/test_smoke.py
@@ -42,3 +42,17 @@ def test_strip_nikkud_all_marks():
nikkud = "הַמַּלְכָּה"
plain = strip_nikkud(nikkud)
assert all(ch < "\u0591" or ch > "\u05c7" for ch in plain), f"Residual nikkud in: {plain}"
+
+
+def test_categorize_pos_no_substring_match():
+ """Regression: 'Pronoun' must NOT match 'Noun' category; unlisted PoS labels must map to 'Other'."""
+ from apkg_builder import _categorize_pos
+
+ assert _categorize_pos("Noun") == "Noun"
+ assert _categorize_pos("Verb") == "Verb"
+ assert _categorize_pos("Adjective") == "Adjective"
+ assert _categorize_pos("Adverb") == "Adverb"
+ assert _categorize_pos("Pronoun") == "Other", "Pronoun must not match Noun"  # contains "noun" as a substring
+ assert _categorize_pos("Preposition") == "Other"
+ assert _categorize_pos("Conjunction") == "Other"
+ assert _categorize_pos("Cardinal numeral") == "Other"