Compare commits

...

3 commits

Author SHA1 Message Date
efd0745ada Sprint 14: deck template/CSS overhaul + Sprint 12 detail scrape
Template & CSS fixes (15 items from Mar 9 feedback):
- Fix conjugation front showing 3ms form instead of infinitive
- Rename conjugation model to "Hebrew Conjugation"
- Strip Hebrew parenthesized text from English meanings
- Shoresh separator: spaces → dots (א.כ.ל)
- Remove duplicate English meaning from cloze back
- Remove example sentences from vocab front/back (cloze only)
- Center-align audio buttons on all decks
- Fix parenthesis spacing: "you(feminine,singular)" → "you (feminine, singular)"
- Unify sec-key/sec-label fonts, make keys bold
- Size overhaul: bigger Hebrew (42px), meaning (34px), secondary (28px)
- Center-align related words groups
- Sort confusables by average frequency
- Plurals: show Gender (Hebrew) before Mishkal, strip emoji from meaning
- Clean duplicate quotation marks in cloze sentences

Sprint 12 carry-forward (detail scrape + EPUB):
- Adjective/preposition detail scraping in pealim_detail_scrape.py
- EPUB example matching rewrite in epub_examples.py
- Delete benyehuda.py and rebuild_sentence_matches.py (merged)
- 49 parser tests for detail scraping
- SCHEMA.yaml updates for new fields

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 07:44:47 +00:00
3b0f9defa9 feat: YAP-cleaned frequency corpus + two-tier assignment pipeline
- Add clean_frequency_corpus.py: YAP morphological analyzer removes
  prefix+word combos (e.g. בבית=ב+בית) from he_50k frequency data.
  Headwords always protected. 30,430 clean entries from 49,999 raw.
- Add assign_frequency.py: two-tier assignment with PoS-aware homograph
  handling. Tier 1 matches headwords; Tier 2 matches inflections (any rank)
  and conjugations (rank>5000 only, to avoid false positives).
  Function words claim frequency over content words in homograph groups,
  with manual overrides for 12 common dual-use words.
- frequency_lookup.py auto-prefers frequency_clean.json when available
- 6,691 entries now have frequency (was 5,974), 717 newly assigned

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 06:22:55 +00:00
b8b65442cb restore epub_examples.py and rebuild_sentence_matches.py
Accidentally removed in 6c2a0f8 — these are the EPUB sentence
extraction and matching scripts used to build vetted_sentences.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 04:33:32 +00:00
15 changed files with 1885939 additions and 65808 deletions

View file

@ -138,11 +138,53 @@ entry:
# ktiv_male: "שומר"
# --- Adjective-specific ---
adjective_inflection: null # Reserved for future use
adjective_inflection: null # null for non-adjectives
# When populated:
# ms/fs/mp/fp forms with nikkud/ktiv_male subfields
# ms:
# nikkud: "גָּדוֹל"
# ktiv_male: "גדול"
# fs:
# nikkud: "גְּדוֹלָה"
# ktiv_male: "גדולה"
# mp:
# nikkud: "גְּדוֹלִים"
# ktiv_male: "גדולים"
# fp:
# nikkud: "גְּדוֹלוֹת"
# ktiv_male: "גדולות"
# mishkal: "CaCaC" # English mishkal name (scraped from pealim PoS section)
# mishkal_hebrew: "קָטָל" # Hebrew mishkal name (computed via mapping)
# --- Preposition-specific ---
preposition_inflection: null # Reserved for future use
preposition_inflection: null # null for non-prepositions
# When populated:
# Inflected forms with pronominal suffixes (e.g. שלי, שלך, שלו...)
# 1s:
# nikkud: "שֶׁלִּי"
# ktiv_male: "שלי"
# 1p:
# nikkud: "שֶׁלָּנוּ"
# ktiv_male: "שלנו"
# 2ms:
# nikkud: "שֶׁלְּךָ"
# ktiv_male: "שלך"
# 2fs:
# nikkud: "שֶׁלָּךְ"
# ktiv_male: "שלך"
# 2mp:
# nikkud: "שֶׁלָּכֶם"
# ktiv_male: "שלכם"
# 2fp:
# nikkud: "שֶׁלָּכֶן"
# ktiv_male: "שלכן"
# 3ms:
# nikkud: "שֶׁלּוֹ"
# ktiv_male: "שלו"
# 3fs:
# nikkud: "שֶׁלָּהּ"
# ktiv_male: "שלה"
# 3mp:
# nikkud: "שֶׁלָּהֶם"
# ktiv_male: "שלהם"
# 3fp:
# nikkud: "שֶׁלָּהֶן"
# ktiv_male: "שלהן"

View file

@ -35,7 +35,7 @@ COMPLETE_PLURAL_DECK_ID = 1_234_567_903
# Release version tag added to all notes so users can identify which release
# their cards come from (visible in Anki's Browse view and card info).
RELEASE_TAG = "v0.15.1"
RELEASE_TAG = "v0.16"
# Regex for extracting emoji and Hebrew prepositions from meaning strings
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
@ -117,13 +117,15 @@ CARD_CSS = """
.card {
font-family: 'Heebo', 'Arial Hebrew', 'David', Arial, sans-serif;
font-size: 20px;
text-align: center;
text-align: right;
color: #222;
background: #fff;
padding: 16px;
max-width: 600px;
margin: 0 auto;
}
.hebrew {
font-size: 36px;
font-size: 42px;
font-weight: bold;
direction: rtl;
text-align: center;
@ -131,32 +133,34 @@ CARD_CSS = """
color: #222;
}
.hebrew-sm {
font-size: 24px;
font-size: 30px;
font-weight: normal;
direction: rtl;
text-align: center;
color: #333;
color: #222;
}
.meaning {
font-size: 28px;
font-size: 34px;
color: #1a1a8c;
margin: 8px 0;
text-align: center;
}
.hint {
font-size: 16px;
color: #888;
font-size: 22px;
color: #555;
margin: 4px 0;
direction: rtl;
text-align: center;
}
.root-info {
font-size: 18px;
color: #555;
font-size: 26px;
color: #222;
margin-top: 6px;
direction: rtl;
}
.example {
font-size: 18px;
color: #444;
font-size: 24px;
color: #222;
direction: rtl;
text-align: right;
font-style: italic;
@ -182,16 +186,17 @@ CARD_CSS = """
color: #555;
}
.sec-label {
font-size: 20px;
font-size: 28px;
font-weight: normal;
color: #555;
color: #222;
direction: rtl;
text-align: center;
margin-top: 6px;
}
.sec-key {
font-size: 18px;
color: #888;
font-size: 28px;
color: #222;
font-weight: bold;
}
.definitions {
direction: rtl;
@ -199,32 +204,37 @@ CARD_CSS = """
}
.conf-entry {
margin: 8px 0;
font-size: 20px;
font-size: 28px;
direction: rtl;
}
.related-group {
direction: rtl;
text-align: right;
text-align: center;
margin: 2px 0;
font-size: 18px;
font-size: 26px;
}
.emoji-img {
font-size: 3.5em;
text-align: center;
margin: 0.3em 0;
}
.card [type="button"], .card button, .replay-button {
display: block !important;
margin: 4px auto !important;
text-align: center;
}
@media (prefers-color-scheme: dark) {
.card { color: #e8e8e8; background: #1c1c1e; }
.hebrew { color: #f0f0f0; }
.hebrew-sm { color: #ddd; }
.hebrew-sm { color: #e0e0e0; }
.meaning { color: #82b0ff; }
.root-info { color: #aaa; }
.sec-label { color: #aaa; }
.sec-key { color: #666; }
.root-info { color: #e0e0e0; }
.sec-label { color: #e0e0e0; }
.sec-key { color: #e0e0e0; }
.conf-entry { color: #ddd; }
.hint { color: #777; }
.voice-label { color: #888; }
.example { color: #bbb; border-right-color: #555; }
.example { color: #e0e0e0; border-right-color: #555; }
.divider { border-top-color: #333; }
.freq-badge { color: #888; border-color: #444; }
}
@ -252,9 +262,6 @@ VOCAB_BACK_HEB = """
<div class="root-info">{{SharedRoots}}</div>
{{/SharedRoots}}
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
{{#Example}}
<div class="example">{{Example}}</div>
{{/Example}}
{{#Frequency}}<div class="freq-badge">#{{Frequency}}</div>{{/Frequency}}
"""
@ -273,14 +280,15 @@ VOCAB_BACK_ENG = """
{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span> {{WordNoNikkud}}</div>{{/WordNoNikkud}}
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>{{/Root}}
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span> {{PoS}}</div>{{/PoS}}
{{#SharedRoots}}
<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
<div class="root-info">{{SharedRoots}}</div>
{{/SharedRoots}}
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
{{#Example}}
<div class="example">{{Example}}</div>
{{/Example}}
"""
VOCAB_FRONT_CLOZE = """
<div class="example" style="font-size:24px;font-style:normal;border:none;padding:0;text-align:center;">{{ClozeExample}}</div>
<div class="example" style="font-size:32px;font-style:normal;border:none;padding:0;text-align:center;">{{ClozeExample}}</div>
{{#ClozeHint}}<div class="hint">{{ClozeHint}}</div>{{/ClozeHint}}
"""
@ -289,7 +297,6 @@ VOCAB_BACK_CLOZE = """
<div class="divider"></div>
<div class="hebrew">{{Word}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
<div class="meaning">{{Meaning}}</div>
"""
VOCAB_MODEL = genanki.Model(
@ -343,8 +350,8 @@ VOCAB_MODEL = genanki.Model(
CONJ_FRONT = """
<div class="sec-label">אֵיךְ אוֹמְרִים</div>
<div class="hebrew" style="color:#1a1a8c;">{{ReferenceForm}}{{#Prep}} ({{Prep}}){{/Prep}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
<div class="hebrew">{{Pronoun}}</div>
<div class="hebrew" style="color:#1a1a8c;">{{Infinitive}}{{#Prep}} <span class="hebrew-sm">({{Prep}})</span>{{/Prep}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
<div class="hebrew">{{Tense}}</div>
"""
@ -363,7 +370,7 @@ CONJ_CSS = CARD_CSS
CONJ_MODEL = genanki.Model(
CONJ_MODEL_ID,
"Pealim Conjugation",
"Hebrew Conjugation",
fields=[
{"name": "Infinitive"},
{"name": "ReferenceForm"},
@ -666,8 +673,9 @@ def _load_emoji_lookup() -> dict[str, str]:
def _categorize_pos(pos_str: str) -> str:
"""Return the canonical PoS category key for grouping."""
base = pos_str.split("")[0].split("")[0].strip()
for cat in POS_CATEGORY_LABELS:
if cat.lower() in pos_str.lower():
if base == cat:
return cat
return "Other"
@ -745,10 +753,14 @@ def build_vocab_deck(
word_nikkud = entry["word"]["nikkud"]
word_no_nik = entry["word"].get("ktiv_male", "")
root_list = entry.get("root") or []
root = " ".join(root_list)
root = ".".join(root_list)
pos_raw = entry.get("pos", "")
pos_heb = entry.get("pos_hebrew", "")
meaning = entry.get("meaning", "") or ""
meaning = EMOJI_RE.sub("", entry.get("meaning", "") or "").strip()
meaning = HBPAREN_RE.sub("", meaning).strip()
meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ")
meaning = re.sub(r"(\w)\(", r"\1 (", meaning) # space before opening paren
meaning = re.sub(r",(\S)", r", \1", meaning) # space after comma
meaning_raw = entry.get("meaning_raw", "") or ""
slug = entry.get("slug", "") or ""
frequency = entry.get("frequency") or 999_999
@ -839,6 +851,9 @@ def build_vocab_deck(
end = cloze_data.get("cloze_word_end")
if cloze_text and start is not None and end is not None:
cloze_example = cloze_text[:start] + "_____" + cloze_text[end:]
# Clean up duplicate/misplaced quotation marks
cloze_example = re.sub(r'["״]\s*["״]', '"', cloze_example)
cloze_example = re.sub(r'^\s*["״]\s*', "", cloze_example)
raw_hint = cloze_data.get("cloze_hint") or ""
if raw_hint:
cloze_hint = raw_hint
@ -871,11 +886,12 @@ def build_vocab_deck(
parts.append(f'<div class="related-group"><b>{label}:</b> {" ".join(rw_words)}</div>')
related_html = "\n".join(parts)
# Plural form (for nouns)
# Plural form (nouns only — guard against adjective/verb inflection bleed)
plural_str = ""
noun_inflection = entry.get("noun_inflection")
if noun_inflection and noun_inflection.get("plural"):
plural_str = noun_inflection["plural"].get("nikkud", "")
if pos_raw.startswith("Noun"):
noun_inflection = entry.get("noun_inflection")
if noun_inflection and noun_inflection.get("plural"):
plural_str = noun_inflection["plural"].get("nikkud", "")
# Image
image_tag = ""
@ -977,18 +993,28 @@ def build_conj_deck(
binyan_heb: str = conj.get("binyan_hebrew") or BINYAN_TO_HEBREW.get(binyan, binyan) or ""
slug = entry.get("slug", "") or ""
root_list = entry.get("root") or []
root = " ".join(root_list)
root = ".".join(root_list)
voice = VOICE_MAP.get(binyan, "")
meaning_raw = entry.get("meaning_raw", "") or ""
meaning = entry.get("meaning", "") or ""
# Extract Hebrew preposition from meaning_raw
# Extract Hebrew preposition — strip from meaning, show on Hebrew side
prep_str = ""
conj_prep = conj.get("prep")
if conj_prep:
prep_str = f"({conj_prep})"
elif meaning:
preps = HBPAREN_RE.findall(entry.get("meaning_raw", "") or "")
prep_str = " ".join(f"({p})" for p in preps)
# Strip any parentheses from stored prep value
prep_str = conj_prep.strip("() ")
elif meaning_raw:
preps = HBPAREN_RE.findall(meaning_raw)
if preps:
prep_str = preps[0]
# Strip Hebrew prepositions from English meaning to avoid duplication
if prep_str:
meaning = HBPAREN_RE.sub("", meaning).strip()
# Also strip from meaning_raw patterns like "(על)"
meaning = re.sub(r"\(\s*" + re.escape(prep_str) + r"\s*-?\s*\)", "", meaning).strip()
# Clean up double spaces and trailing commas
meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ")
related = [w for w in root_words.get(root, []) if w != infinitive]
related_str = " ".join(related[:8]) if related else ""
@ -1024,7 +1050,7 @@ def build_conj_deck(
elif guid_candidates:
note_guid = guid_candidates[0]
else:
note_guid = genanki.guid_for(_infinitive, pronoun, tense)
note_guid = genanki.guid_for(_infinitive, pronoun, tense, _binyan_heb)
note = genanki.Note(
model=CONJ_MODEL,
guid=note_guid,
@ -1213,8 +1239,10 @@ def build_conj_deck(
# ──────────────────────────────────────────────────────────────────────────────
CONF_FRONT = """
<div style="direction:rtl; text-align:center;">
<div class="hebrew" style="font-size:36px;">{{Words}}</div>
<div class="meaning" style="font-size:32px; direction:rtl; text-align:center;">מה ההבדל?</div>
</div>
"""
CONF_BACK = """
@ -1271,7 +1299,10 @@ def build_confusables_deck(
guid = genanki.guid_for("confusable", entry["word"].get("ktiv_male", unique_key))
guid_to_entries.setdefault(guid, []).append(entry)
for guid, group_entries in sorted(guid_to_entries.items(), key=lambda x: x[0]):
for guid, group_entries in sorted(
guid_to_entries.items(),
key=lambda x: sum(e.get("frequency") or 999_999 for e in x[1]) / len(x[1]),
):
if guid in seen_guids:
continue
seen_guids.add(guid)
@ -1366,6 +1397,7 @@ PLURAL_BACK_SG = """
{{FrontSide}}<hr>
<div class="hebrew">{{Plural}}</div>
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span> {{Gender}}</div>{{/Gender}}
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
"""
@ -1380,6 +1412,7 @@ PLURAL_BACK_PL = """
<div class="hebrew">{{Singular}}</div>
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
<div class="sec-label">{{Meaning}}</div>
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span> {{Gender}}</div>{{/Gender}}
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
"""
@ -1483,10 +1516,11 @@ def build_plural_deck(
plural = noun_inflection["plural"]["nikkud"]
plural_ktiv = noun_inflection["plural"].get("ktiv_male", "")
gender = noun_inflection.get("gender") or ""
gender_heb = {"masculine": "זָכָר", "feminine": "נְקֵבָה"}.get(gender, gender)
mishkal = noun_inflection.get("mishkal") or ""
meaning = entry.get("meaning") or ""
meaning = EMOJI_RE.sub("", entry.get("meaning") or "").strip()
root_list = entry.get("root") or []
root = " ".join(root_list)
root = ".".join(root_list)
# GUID from noun_inflection
note_guid_raw = noun_inflection.get("plurals_guid")
@ -1520,7 +1554,7 @@ def build_plural_deck(
meaning,
root,
mishkal,
gender,
gender_heb,
],
tags=tags,
)

View file

@ -1,202 +0,0 @@
#!/usr/bin/env python3
"""
Ben Yehuda corpus example-sentence lookup (nikkud corpus).
TODO: Rewrite to update words.json examples fields directly instead of
writing to a separate examples_cache.json. Currently the migration script
bridges the gap. See Phase 5 in SPRINT_LOG.md.
Downloads the nikkud-bearing plaintext ZIP once, indexes sentences by nikkud word form,
then answers queries locally.
Exposed API:
load(force_rebuild=False)
get_examples(word_nikkud) -> list[str] (returns 0 or 1 examples)
save_examples_cache()
"""
import json
import logging
import re
import zipfile
from io import BytesIO
from pathlib import Path
import requests
from helpers import strip_nikkud as _strip_nikkud
logger = logging.getLogger(__name__)
# Nikkud-bearing corpus (txt.zip instead of txt_stripped.zip)
CORPUS_URL = "https://github.com/projectbenyehuda/public_domain_dump/releases/download/2025-10/txt.zip"
INDEX_PATH = Path(__file__).parent / "data" / "benyehuda_index.json"
EXAMPLES_CACHE_PATH = Path(__file__).parent / "data" / "examples_cache.json"
REQUEST_TIMEOUT = 120
MIN_SENTENCE_LEN = 20
MAX_SENTENCE_LEN = 200
MAX_INDEX_ENTRIES = 500 # cap examples kept per word in index to limit memory
# Module-level state
_index: dict[str, list[str]] = {} # word (with nikkud) -> [sentence, ...]
_examples_cache: dict[str, list[str]] = {} # word -> cached result for this run
def _split_sentences(text: str) -> list[str]:
"""
Split text into sentences on newlines only (Hebrew sentences don't have
mid-word period issues like English). Min 20 chars, max 200 chars.
"""
out = []
for line in text.split("\n"):
s = line.strip().strip("\"'.,;:!?")
s = s.strip()
if MIN_SENTENCE_LEN <= len(s) <= MAX_SENTENCE_LEN:
out.append(s)
return out
def _build_index(corpus_zip_bytes: bytes) -> None:
"""Parse corpus ZIP and build word (nikkud) → sentences index."""
global _index
_index = {}
logger.info("Building Ben Yehuda index from nikkud corpus …")
with zipfile.ZipFile(BytesIO(corpus_zip_bytes)) as zf:
txt_files = [n for n in zf.namelist() if n.endswith(".txt")]
logger.info(f" Corpus contains {len(txt_files)} text files")
for fname in txt_files:
try:
raw = zf.read(fname).decode("utf-8", errors="ignore")
except Exception: # noqa: S112
continue
for sentence in _split_sentences(raw):
# Index by each unique Hebrew token (with nikkud) in the sentence
words = re.findall(r"[\u05d0-\u05ea\u05b0-\u05c7'\"]+", sentence)
for w in set(words):
if len(w) >= 2:
bucket = _index.setdefault(w, [])
if len(bucket) < MAX_INDEX_ENTRIES:
bucket.append(sentence)
logger.info(f"Index built: {len(_index)} unique word forms")
def _save_index() -> None:
INDEX_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(INDEX_PATH, "w", encoding="utf-8") as f:
json.dump(_index, f, ensure_ascii=False)
logger.info(f"Ben Yehuda index saved → {INDEX_PATH}")
def _load_index() -> None:
global _index
with open(INDEX_PATH, encoding="utf-8") as f:
_index = json.load(f)
logger.info(f"Ben Yehuda index loaded: {len(_index)} word forms")
def load(force_rebuild: bool = False) -> None:
"""Load or build the Ben Yehuda index. Downloads corpus if needed."""
global _index, _examples_cache
if _index and not force_rebuild:
return
if force_rebuild:
# Delete old index and discard examples cache
if INDEX_PATH.exists():
INDEX_PATH.unlink()
logger.info("Deleted old Ben Yehuda index (force rebuild)")
_examples_cache = {}
else:
# Load persisted examples cache (not needed on rebuild)
if EXAMPLES_CACHE_PATH.exists():
with open(EXAMPLES_CACHE_PATH, encoding="utf-8") as f:
_examples_cache = json.load(f)
if INDEX_PATH.exists():
_load_index()
return
logger.info("Downloading Ben Yehuda nikkud corpus … (this may take 2-3 minutes)")
resp = requests.get(CORPUS_URL, timeout=REQUEST_TIMEOUT, stream=True)
resp.raise_for_status()
data = resp.content
logger.info(f"Corpus downloaded: {len(data) / 1e6:.1f} MB")
_build_index(data)
_save_index()
def save_examples_cache() -> None:
EXAMPLES_CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
with open(EXAMPLES_CACHE_PATH, "w", encoding="utf-8") as f:
json.dump(_examples_cache, f, ensure_ascii=False)
logger.info(f"Examples cache saved: {len(_examples_cache)} entries → {EXAMPLES_CACHE_PATH}")
def get_examples(word_nikkud: str, confusable_consonants: set[str] | None = None) -> list[str]:
"""
Return 0 or 1 example sentences for the given word (nikkud form).
Lookup strategy:
1. Try exact nikkud match in index.
2. Fall back to stripped (no-nikkud) match against index keys.
Skipped when word's consonants are in confusable_consonants set
(to avoid returning sentences for the wrong homograph).
Returns the single longest sentence MAX_SENTENCE_LEN that contains
the word as a whole token.
"""
if not _index:
load()
word = word_nikkud.strip()
word_stripped = _strip_nikkud(word)
cache_key = word
if cache_key in _examples_cache:
return _examples_cache[cache_key]
# Lookup: try exact nikkud first, then stripped fallback
candidates = _index.get(word, [])
if not candidates and word_stripped and word_stripped not in (confusable_consonants or set()):
# Try looking up by stripped form across index keys
for k, v in _index.items():
if _strip_nikkud(k) == word_stripped:
candidates = v
break
# Filter: word must appear as a whole token
# Match the stripped form (for robustness with nikkud variants in sentence)
if word_stripped:
pattern = r"(?<!\w)" + re.escape(word_stripped) + r"(?!\w)"
matched = [s for s in candidates if re.search(pattern, _strip_nikkud(s))]
else:
matched = candidates[:]
# Filter by length
matched = [s for s in matched if MIN_SENTENCE_LEN <= len(s) <= MAX_SENTENCE_LEN]
# Return the single longest sentence ≤ MAX_SENTENCE_LEN
if matched:
best = max(matched, key=len)
result = [best]
else:
result = []
_examples_cache[cache_key] = result
return result
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
load()
tests = ["שָׁלוֹם", "בַּיִת", "סֵפֶר", "מַיִם", "אַהֲבָה", "יֶלֶד"]
for w in tests:
exs = get_examples(w)
print(f"\n{w}: {len(exs)} example(s)")
for ex in exs:
print(f"{ex[:100]}")
save_examples_cache()

File diff suppressed because one or more lines are too long

97847
data/frequency_discarded.json Normal file

File diff suppressed because it is too large Load diff

1850838
data/words.json

File diff suppressed because it is too large Load diff

791
epub_examples.py Normal file
View file

@ -0,0 +1,791 @@
#!/usr/bin/env python3
"""
Extract example sentences from nikud'd Hebrew EPUB files, match them against
the vocabulary list in data/words.json, and write matched examples back into
words.json.
Usage (standalone):
python3 epub_examples.py
Called from run.py via:
run(words) words dict is passed in and updated in place
"""
import logging
import os
import re
import zipfile
from html.parser import HTMLParser
from pathlib import Path
from helpers import strip_nikkud
logger = logging.getLogger(__name__)
DATA_DIR = Path(__file__).parent / "data"
EPUB_DIR = DATA_DIR / "epubs"
WORDS_JSON = DATA_DIR / "words.json"
# Book metadata: filename -> display name
def _discover_epubs() -> dict[str, str]:
"""Auto-discover all .epub files in EPUB_DIR, returning {filepath: display_name}."""
if not EPUB_DIR.exists():
return {}
books: dict[str, str] = {}
for path in sorted(EPUB_DIR.glob("*.epub")):
stem = path.stem
stem_stripped = strip_nikkud(stem).lower()
# Derive a brief English display name from the filename
parts = stem.split(" -- ")
title_part = strip_nikkud(parts[0]).strip().lower()
if "alice" in stem_stripped or "אליס" in title_part:
name = "alice_wonderland"
elif "little_prince" in stem_stripped or "נסיך" in title_part:
name = "little_prince"
elif "מנהרת" in title_part or "time_tunnel" in stem_stripped:
num_match = re.search(r"(\d+)", stem_stripped)
num = num_match.group(1) if num_match else stem_stripped.replace("time_tunnel_", "")
name = f"time_tunnel_{num}"
else:
name = stem_stripped[:40]
books[str(path)] = name
return books
# Sentence length bounds (word count)
MIN_WORDS = 4
MAX_WORDS = 15
# ── HTML text extraction ─────────────────────────────────────────
class _TextExtractor(HTMLParser):
"""Extract text content from HTML, skipping script/style tags."""
SKIP_TAGS = {"script", "style", "head"}
def __init__(self):
super().__init__()
self.parts: list[str] = []
self._skip_depth = 0
def handle_starttag(self, tag, attrs):
_ = attrs # required by HTMLParser interface
if tag in self.SKIP_TAGS:
self._skip_depth += 1
# Insert newline for block-level elements to avoid word concatenation
if tag in (
"p",
"div",
"br",
"li",
"h1",
"h2",
"h3",
"h4",
"h5",
"h6",
"td",
"th",
"tr",
"blockquote",
"section",
):
self.parts.append("\n")
def handle_endtag(self, tag):
if tag in self.SKIP_TAGS:
self._skip_depth = max(0, self._skip_depth - 1)
def handle_data(self, data):
if self._skip_depth == 0:
self.parts.append(data)
def get_text(self) -> str:
return "".join(self.parts)
def extract_text_from_html(html: str) -> str:
"""Parse HTML and return plain text."""
parser = _TextExtractor()
parser.feed(html)
return parser.get_text()
# ── EPUB processing ──────────────────────────────────────────────
def _content_files_from_epub(zf: zipfile.ZipFile) -> list[str]:
"""Get ordered list of content XHTML files from the OPF manifest."""
opf_path = None
for name in zf.namelist():
if name.endswith(".opf"):
opf_path = name
break
if not opf_path:
# Fallback: just use all xhtml files
return sorted(
n
for n in zf.namelist()
if n.endswith((".xhtml", ".html"))
and "toc" not in n.lower()
and "cover" not in n.lower()
and "nav" not in n.lower()
)
# Parse OPF to get spine order
opf_content = zf.read(opf_path).decode("utf-8")
opf_dir = os.path.dirname(opf_path)
# Extract manifest items: id -> href
manifest: dict[str, str] = {}
for m in re.finditer(r'<item\s+[^>]*id="([^"]+)"[^>]*href="([^"]+)"', opf_content):
manifest[m.group(1)] = m.group(2)
# Also try reversed attribute order
for m in re.finditer(r'<item\s+[^>]*href="([^"]+)"[^>]*id="([^"]+)"', opf_content):
manifest[m.group(2)] = m.group(1)
# Extract spine order
spine_ids = re.findall(r'<itemref\s+[^>]*idref="([^"]+)"', opf_content)
result = []
for sid in spine_ids:
href = manifest.get(sid, "")
if href and href.endswith((".xhtml", ".html")):
full_path = os.path.join(opf_dir, href) if opf_dir else href
# Normalize path separators
full_path = full_path.replace("\\", "/")
if full_path in zf.namelist():
result.append(full_path)
if not result:
# Fallback
return sorted(
n
for n in zf.namelist()
if n.endswith((".xhtml", ".html")) and "toc" not in n.lower() and "cover" not in n.lower()
)
return result
def extract_sentences_from_epub(epub_path: Path, book_name: str) -> list[dict]:
"""Extract sentences from an EPUB file.
Args:
epub_path: Path to the .epub file.
book_name: Human-readable book name used as the ``source`` field.
Returns:
List of ``{"text": str, "source": str}`` dicts.
"""
zf = zipfile.ZipFile(epub_path)
content_files = _content_files_from_epub(zf)
all_text = []
for cf in content_files:
try:
html = zf.read(cf).decode("utf-8")
except (KeyError, UnicodeDecodeError):
continue
text = extract_text_from_html(html)
all_text.append(text)
full_text = "\n".join(all_text)
return _split_into_sentences(full_text, book_name)
# ── Sentence splitting ───────────────────────────────────────────
# Hebrew sentence terminators: period, exclamation, question mark, sof pasuk
_SENT_SPLIT = re.compile(r"[.!?\u05C3]+")
# Punctuation to strip from word boundaries when matching
_PUNCT = re.compile(
r'^[\u0022\u0027\u05F4\u05F3,;:\-\u2013\u2014\u2026\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+|'
r'[\u0022\u0027\u05F4\u05F3,;:\-\u2013\u2014\u2026\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+$'
)
def _split_into_sentences(text: str, book_name: str) -> list[dict]:
"""Split text into Hebrew sentences and filter by word count.
Args:
text: Raw extracted text from an EPUB chapter.
book_name: Source label for each sentence dict.
Returns:
List of ``{"text": str, "source": str}`` dicts, deduplicated by exact text.
"""
# Normalize whitespace
text = re.sub(r"\s+", " ", text).strip()
raw_sentences = _SENT_SPLIT.split(text)
results: list[dict] = []
seen: set[str] = set()
for sent in raw_sentences:
sent = sent.strip()
if not sent:
continue
# Count Hebrew words (skip non-Hebrew tokens like numbers)
words = sent.split()
hebrew_words = [w for w in words if any("\u0590" <= c <= "\u05ff" for c in w)]
if len(hebrew_words) < MIN_WORDS or len(hebrew_words) > MAX_WORDS:
continue
# Deduplicate by exact nikkud text
if sent in seen:
continue
seen.add(sent)
results.append({"text": sent, "source": book_name})
return results
# ── Nikkud index ─────────────────────────────────────────────────
# Unicode ranges for Hebrew combining marks
_NIKKUD_LOW = 0x05B0 # start of vowel points (shva)
_NIKKUD_HIGH = 0x05BD # end of vowel range (meteg); 0x05BE is maqaf (punctuation)
_DAGESH = "\u05bc"
_SHIN_DOT = "\u05c1"
_SIN_DOT = "\u05c2"
# Valid prefix consonants
_PREFIX_CONSONANTS = set("בהוכלמש")
# Named vowel combining marks
_SHVA = "\u05b0"
_HIRIQ = "\u05b4"
_TSERE = "\u05b5"
_SEGOL = "\u05b6"
_PATACH = "\u05b7"
_QAMATZ = "\u05b8"
# Valid nikkud patterns on each prefix consonant.
# Key = consonant, Value = set of frozensets of combining marks valid for that prefix.
_VALID_PREFIX_MARKS: dict[str, set[frozenset]] = {
"ב": {
frozenset({_SHVA, _DAGESH}), # בְּ standard
frozenset({_HIRIQ, _DAGESH}), # בִּ before shva
frozenset({_PATACH, _DAGESH}), # בַּ with definite article
frozenset({_QAMATZ, _DAGESH}), # בָּ before chataf qamatz
frozenset({_SEGOL, _DAGESH}), # בֶּ before chataf segol
},
"כ": {
frozenset({_SHVA, _DAGESH}), # כְּ
frozenset({_HIRIQ, _DAGESH}), # כִּ
frozenset({_PATACH, _DAGESH}), # כַּ
frozenset({_QAMATZ, _DAGESH}), # כָּ
frozenset({_SEGOL, _DAGESH}), # כֶּ
},
"ל": {
frozenset({_SHVA}), # לְ standard
frozenset({_HIRIQ}), # לִ before shva
frozenset({_PATACH}), # לַ with definite article
frozenset({_QAMATZ}), # לָ demonstratives
frozenset({_SEGOL}), # לֶ before chataf segol
},
"ו": {
frozenset({_SHVA}), # וְ standard
frozenset({_DAGESH}), # וּ (shureq) before shva/bumf
frozenset({_PATACH}), # וַ before chataf patach
frozenset({_QAMATZ}), # וָ before chataf qamatz
frozenset({_SEGOL}), # וֶ before chataf segol
frozenset({_HIRIQ}), # וִ before yud-shva
},
"מ": {
frozenset({_HIRIQ}), # מִ standard
frozenset({_TSERE}), # מֵ before gutturals
},
"ש": {
frozenset({_SEGOL, _DAGESH}), # שֶׁ standard
frozenset({_SEGOL, _DAGESH, _SHIN_DOT}), # שֶׁ with explicit shin dot
},
"ה": {
frozenset({_PATACH}), # הַ standard definite article
frozenset({_QAMATZ}), # הָ before gutturals
frozenset({_SEGOL}), # הֶ before qamatz-bearing gutturals
},
}
def _is_combining_mark(ch: str) -> bool:
"""Return True if ch is a Hebrew combining mark (nikkud, dagesh, or dots)."""
cp = ord(ch)
if _NIKKUD_LOW <= cp <= _NIKKUD_HIGH:
return True
return ch in (_DAGESH, _SHIN_DOT, _SIN_DOT)
def _decompose_first_char(token: str) -> tuple[str, frozenset, str]:
"""Split token into (first_consonant, its_combining_marks, remainder).
Args:
token: A nikkud Hebrew token string.
Returns:
A tuple of (consonant, marks, rest). Returns ("", frozenset(), token)
if the token does not start with a Hebrew consonant (aleftav range).
"""
if not token:
return ("", frozenset(), token)
first = token[0]
# Check it's a Hebrew consonant (aleftav)
if not ("\u05d0" <= first <= "\u05ea"):
return ("", frozenset(), token)
# Collect all combining marks that follow the consonant
marks: set[str] = set()
i = 1
while i < len(token):
ch = token[i]
if _is_combining_mark(ch):
marks.add(ch)
i += 1
else:
break
return (first, frozenset(marks), token[i:])
def _is_valid_prefix(consonant: str, marks: frozenset) -> bool:
"""Check if consonant + marks form a valid Hebrew prefix combination.
Args:
consonant: The prefix consonant character.
marks: Frozenset of combining mark characters on that consonant.
Returns:
True if this is a recognised Hebrew prefix vocalization.
"""
valid = _VALID_PREFIX_MARKS.get(consonant)
if not valid:
return False
# For ש, allow shin dot to be present or absent
if consonant == "ש":
marks_without_shin = marks - {_SHIN_DOT}
return marks_without_shin in valid or marks in valid
return marks in valid
def _rebuild_token(consonant: str, marks: frozenset, rest: str) -> str:
"""Reassemble a token from its decomposed parts, sorting marks by codepoint."""
return consonant + "".join(sorted(marks)) + rest
def _try_strip_prefix(token: str, nikkud_index: dict) -> list[tuple[str, str, str]]:
"""Try stripping 1 or 2 prefix letters from a nikkud token.
Args:
token: A cleaned nikkud word token.
nikkud_index: Mapping from nikkud form to list of (unique_key, match_type).
Returns:
List of (unique_key, match_type, matched_remainder) for each hit found.
The match_type will have ``"_prefix"`` appended to the base type.
"""
results: list[tuple[str, str, str]] = []
# Try 1-letter prefix
c1, m1, rest1 = _decompose_first_char(token)
if not (c1 and _is_valid_prefix(c1, m1) and rest1):
return results
# Direct match on 1-prefix remainder
if rest1 in nikkud_index:
for unique_key, match_type in nikkud_index[rest1]:
results.append((unique_key, match_type + "_prefix", rest1))
# Try removing dagesh from first letter of remainder
# (handles absorbed definite article: לַמֶּלֶךְ → מֶּלֶךְ → מֶלֶךְ)
c2, m2, rest2_inner = _decompose_first_char(rest1)
if c2 and _DAGESH in m2:
without_dagesh = _rebuild_token(c2, m2 - {_DAGESH}, rest2_inner)
if without_dagesh != rest1 and without_dagesh in nikkud_index:
for unique_key, match_type in nikkud_index[without_dagesh]:
results.append((unique_key, match_type + "_prefix", without_dagesh))
# Try 2-letter prefix (ו and ש commonly stack with another prefix)
if c1 in "וש":
c2b, m2b, rest2b = _decompose_first_char(rest1)
if c2b and c2b in _PREFIX_CONSONANTS and _is_valid_prefix(c2b, m2b) and rest2b:
if rest2b in nikkud_index:
for unique_key, match_type in nikkud_index[rest2b]:
results.append((unique_key, match_type + "_prefix", rest2b))
# Also try dagesh removal on remainder of 2-letter prefix
c3, m3, rest3_inner = _decompose_first_char(rest2b)
if c3 and _DAGESH in m3:
without_dagesh2 = _rebuild_token(c3, m3 - {_DAGESH}, rest3_inner)
if without_dagesh2 != rest2b and without_dagesh2 in nikkud_index:
for unique_key, match_type in nikkud_index[without_dagesh2]:
results.append((unique_key, match_type + "_prefix", without_dagesh2))
return results
def _build_nikkud_index(words: dict) -> dict[str, list[tuple[str, str]]]:
"""Build a mapping from nikkud form to list of (unique_key, match_type).
Indexes the following sources per entry:
- ``word.nikkud`` "direct"
- conjugation active/passive forms "conjugated"
- conjugation infinitive and reference_form "conjugated"
- noun inflection singular/plural/construct/pronominal "inflected"
Args:
words: The full words.json dict keyed by unique_key.
Returns:
Dict mapping each nikkud form to a list of (unique_key, match_type) tuples.
"""
index: dict[str, list[tuple[str, str]]] = {}
def _add(form: str | None, unique_key: str, match_type: str) -> None:
if form:
index.setdefault(form, []).append((unique_key, match_type))
for unique_key, entry in words.items():
# Direct word form
word = entry.get("word") or {}
_add(word.get("nikkud"), unique_key, "direct")
# Conjugation forms
conj = entry.get("conjugation") or {}
for form_entry in conj.get("active_forms") or []:
form = (form_entry.get("form") or {}).get("nikkud")
_add(form, unique_key, "conjugated")
for form_entry in conj.get("hufal_pual_forms") or []:
form = (form_entry.get("form") or {}).get("nikkud")
_add(form, unique_key, "conjugated")
inf = conj.get("infinitive") or {}
_add(inf.get("nikkud"), unique_key, "conjugated")
ref = conj.get("reference_form") or {}
_add(ref.get("nikkud"), unique_key, "conjugated")
# Noun inflection forms
noun = entry.get("noun_inflection") or {}
for field in ("singular", "plural", "construct_singular", "construct_plural"):
sub = noun.get(field) or {}
_add(sub.get("nikkud"), unique_key, "inflected")
pronominal = noun.get("pronominal_suffixes") or {}
for _person, sub in pronominal.items():
if isinstance(sub, dict):
_add(sub.get("nikkud"), unique_key, "inflected")
return index
def _filter_collision_forms(nikkud_index: dict) -> dict:
"""Remove colliding forms for entries that have other unique forms.
A "colliding form" maps to 2+ unique_keys. For each unique_key that
appears in a collision, check whether it also has at least one
non-colliding form in the index. If so, remove it from the colliding
form's entry list. If a unique_key's *only* indexed forms all collide,
keep them (otherwise the entry would get zero matches).
Returns a new index dict with the same structure.
"""
# Identify collision forms and build reverse map (key → its forms)
collision_forms: set[str] = set()
key_to_forms: dict[str, set[str]] = {}
for form, entries in nikkud_index.items():
keys = {uk for uk, _ in entries}
if len(keys) >= 2:
collision_forms.add(form)
for uk, _ in entries:
key_to_forms.setdefault(uk, set()).add(form)
# For each key, check if it has any non-colliding form
keys_with_unique_forms: set[str] = set()
for uk, forms in key_to_forms.items():
if forms - collision_forms:
keys_with_unique_forms.add(uk)
# Build filtered index
filtered: dict[str, list[tuple[str, str]]] = {}
removed = 0
for form, entries in nikkud_index.items():
if form in collision_forms:
kept = [(uk, mt) for uk, mt in entries if uk not in keys_with_unique_forms]
removed += len(entries) - len(kept)
if kept:
filtered[form] = kept
else:
filtered[form] = entries
logger.info(f" Filtered {removed} collision mappings from entries with unique forms")
return filtered
# ── Matching ─────────────────────────────────────────────────────
def match_sentences(
sentences: list[dict],
nikkud_index: dict,
confusable_keys: set[str],
) -> dict:
"""Match sentences to vocab words using the nikkud index.
Args:
sentences: List of ``{"text": str, "source": str}`` dicts.
nikkud_index: Output of ``_build_nikkud_index``.
confusable_keys: Set of unique_keys that are in confusable groups.
Returns:
Dict mapping unique_key list of match dicts, each containing:
``text``, ``source``, ``match_method``, ``word_count``,
``matched_form``, ``char_offset``, ``char_end``.
"""
matches: dict[str, list[dict]] = {}
for sent_info in sentences:
text = sent_info["text"]
source = sent_info["source"]
words_in_sent = text.split()
word_count = len(words_in_sent)
char_pos = 0
for raw_word in words_in_sent:
cleaned = _PUNCT.sub("", raw_word)
if not cleaned:
word_start = text.find(raw_word, char_pos)
char_pos = word_start + len(raw_word) if word_start >= 0 else char_pos
continue
# Locate positions within the sentence
word_start_in_sent = text.find(raw_word, char_pos)
if word_start_in_sent < 0:
word_start_in_sent = char_pos
clean_offset_in_raw = raw_word.find(cleaned)
if clean_offset_in_raw < 0:
clean_offset_in_raw = 0
clean_start = word_start_in_sent + clean_offset_in_raw
clean_end = clean_start + len(cleaned)
found: list[tuple[str, str]] = []
# Direct nikkud match
if cleaned in nikkud_index:
for unique_key, match_type in nikkud_index[cleaned]:
found.append((unique_key, match_type))
# Prefix stripping — only if no direct match exists
if cleaned not in nikkud_index:
for unique_key, match_type, _remainder in _try_strip_prefix(cleaned, nikkud_index):
found.append((unique_key, match_type))
for unique_key, match_method in found:
matches.setdefault(unique_key, []).append(
{
"text": text,
"source": source,
"match_method": match_method,
"word_count": word_count,
"matched_form": cleaned,
"char_offset": clean_start,
"char_end": clean_end,
}
)
char_pos = word_start_in_sent + len(raw_word)
return matches
# ── Writing results ──────────────────────────────────────────────
def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) -> int:
"""Update words dict entries with matched example sentences.
Selects up to 3 best sentences per word (scoring prefers 612 word
sentences and non-prefix matches). Also generates a cloze entry for
the top match, unless the word is in the confusable set.
Args:
words: The full words.json dict, modified in place.
matches: Output of ``match_sentences``.
confusable_keys: Set of unique_keys in confusable groups.
Returns:
Count of words.json entries that were updated.
"""
import genanki # noqa: PLC0415 — import only where needed
updated = 0
for unique_key, sent_list in matches.items():
if unique_key not in words:
continue
entry = words[unique_key]
# Deduplicate by sentence text
seen_texts: set[str] = set()
unique: list[dict] = []
for s in sent_list:
if s["text"] not in seen_texts:
seen_texts.add(s["text"])
unique.append(s)
# Prefer direct matches; only fall back to prefix if none exist
direct = [s for s in unique if "prefix" not in s["match_method"]]
prefix_only = [s for s in unique if "prefix" in s["match_method"]]
pool = direct if direct else prefix_only
# Score: prefer 612 word sentences
def _score(s: dict) -> tuple[int,]:
wc = s["word_count"]
length_score = abs(wc - 9) if not (6 <= wc <= 12) else 0
return (length_score,)
pool.sort(key=_score)
best = pool[:3]
# Build vetted list
if not entry.get("examples"):
entry["examples"] = {}
examples: dict = entry["examples"]
examples["vetted"] = [
{
"text": s["text"],
"source": s["source"],
"match_method": s["match_method"],
}
for s in best
]
# Build cloze from best sentence (skip confusables)
is_confusable = unique_key in confusable_keys
if not is_confusable and best:
top = best[0]
# Preserve existing cloze_guid if sentence text unchanged
old_cloze = examples.get("cloze") or {}
if old_cloze.get("text") == top["text"]:
cloze_guid = old_cloze.get("cloze_guid")
else:
cloze_guid = genanki.guid_for("cloze", unique_key)
examples["cloze"] = {
"text": top["text"],
"cloze_word_start": top["char_offset"],
"cloze_word_end": top["char_end"],
"cloze_hint": None,
"cloze_guid": cloze_guid,
}
elif is_confusable:
examples.pop("cloze", None)
examples["rejected_count"] = 0
updated += 1
return updated
# ── Public API ───────────────────────────────────────────────────
def run(words: dict) -> dict:
"""Extract EPUB sentences, match against words, update words dict in place.
Called from run.py with the already-loaded words.json dict.
Args:
words: The full words.json dict keyed by unique_key. Modified in place.
Returns:
Summary stats dict with keys ``books``, ``matched``, ``total_vocab``.
"""
logger.info(" Extracting sentences from EPUBs ...")
all_sentences: list[dict] = []
book_counts: dict[str, int] = {}
for filepath, book_name in _discover_epubs().items():
path = Path(filepath)
sentences = extract_sentences_from_epub(path, book_name)
book_counts[book_name] = len(sentences)
all_sentences.extend(sentences)
logger.info(f" {book_name}: {len(sentences)} sentences")
if not all_sentences:
logger.warning(" No EPUB files found — skipping example extraction")
return {"books": {}, "matched": 0, "total_vocab": len(words)}
logger.info(f" Total sentences: {len(all_sentences)}")
# Build nikkud index
logger.info(" Building nikkud index from words.json ...")
nikkud_index = _build_nikkud_index(words)
logger.info(f" {len(nikkud_index)} unique nikkud forms indexed")
# Filter out collision forms for entries that have unique forms
nikkud_index = _filter_collision_forms(nikkud_index)
# Build confusable key set
confusable_keys: set[str] = set()
for key, entry in words.items():
if entry.get("confusable_group"):
confusable_keys.add(key)
# Match sentences
logger.info(" Matching sentences against vocab ...")
matches = match_sentences(all_sentences, nikkud_index, confusable_keys)
logger.info(f" {len(matches)} words matched")
# Break down by match method
method_counts: dict[str, int] = {}
for sent_list in matches.values():
for s in sent_list:
method = s["match_method"]
method_counts[method] = method_counts.get(method, 0) + 1
for method, count in sorted(method_counts.items()):
logger.info(f" {method}: {count} sentence-word pairs")
# Update words dict in place
updated = update_words_json(words, matches, confusable_keys)
logger.info(f" Updated {updated} entries in words.json")
return {
"books": book_counts,
"matched": len(matches),
"total_vocab": len(words),
}
# ── Standalone entry point ───────────────────────────────────────
if __name__ == "__main__":
import json
logging.basicConfig(level=logging.INFO, format="%(message)s")
words_path = DATA_DIR / "words.json"
with open(words_path, encoding="utf-8") as f:
words = json.load(f)
stats = run(words)
# Save updated words.json
with open(words_path, "w", encoding="utf-8") as f:
json.dump(words, f, ensure_ascii=False, indent=2)
coverage = stats["matched"] * 100 / stats["total_vocab"] if stats["total_vocab"] else 0
logger.info(f" Coverage: {stats['matched']}/{stats['total_vocab']} ({coverage:.1f}%)")

View file

@ -19,6 +19,7 @@ logger = logging.getLogger(__name__)
FREQ_URL = "https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2016/he/he_50k.txt"
CACHE_PATH = Path(__file__).parent / "data" / "frequency_cache.json"
CLEAN_CACHE_PATH = Path(__file__).parent / "data" / "frequency_clean.json"
REQUEST_TIMEOUT = 30
# Module-level cache: word_no_nikkud -> rank (1 = most common)
@ -26,12 +27,19 @@ _freq: dict[str, int] = {}
def load(cache_path: Path = CACHE_PATH) -> None:
"""Load frequency data from cache, downloading if not present."""
"""Load frequency data from cache, downloading if not present.
Prefers frequency_clean.json (YAP-filtered) over raw frequency_cache.json.
"""
global _freq
if cache_path.exists():
with open(cache_path, encoding="utf-8") as f:
# Prefer YAP-cleaned frequency data if available
clean_path = cache_path.parent / "frequency_clean.json" if cache_path == CACHE_PATH else None
load_path = clean_path if clean_path and clean_path.exists() else cache_path
if load_path.exists():
with open(load_path, encoding="utf-8") as f:
_freq = json.load(f)
logger.info(f"Frequency cache loaded: {len(_freq)} entries")
label = "clean" if load_path == clean_path else "raw"
logger.info(f"Frequency cache loaded ({label}): {len(_freq)} entries")
return
logger.info("Downloading FrequencyWords he_50k.txt …")

View file

@ -2,7 +2,8 @@
"""
Consolidated detail page scraper for pealim.com.
Visits /dict/<slug>/ detail pages for nouns and verbs in data/words.json.
Visits /dict/<slug>/ detail pages for nouns, verbs, adjectives and prepositions
in data/words.json.
Makes two requests per slug:
1. hebstyle=mo cookie nikkud forms
2. hebstyle=vl cookie ktiv male forms
@ -11,7 +12,8 @@ Updates entries in data/words.json with scraped detail data.
Usage:
python3 pealim_detail_scrape.py [--test N] [--force-refresh-detail]
[--nouns-only | --verbs-only]
[--nouns-only | --verbs-only |
--adjectives-only | --prepositions-only]
"""
import argparse
@ -144,28 +146,128 @@ FORM_KEY_TO_PERSON: dict[str, str] = {
"infinitive": "inf",
}
# Mishkal English name → Hebrew nikkud mapping (common patterns)
MISHKAL_HEBREW: dict[str, str] = {
"CaCaC": "קָטָל",
"CeCeC": "קֶטֶל",
"CiCeC": "קִטֶל",
"CaCeC": "קָטֶל",
"CoCeC": "קוֹטֵל",
"CaCiC": "קָטִיד",
"CaCuC": "קָטוּר",
"miCCaC": "מִקְטָל",
"miCCeC": "מִקְטֶל",
"maCCeC": "מַקְטֶל",
"maCCiC": "מַקְטִיר",
"hiCCiC": "הִקְטִיל",
"CiCCuC": "קִטּוּל",
"hitCaCCeC": "הִתְקַטֵּל",
"CaCCan": "קַטְּלָן",
"CaCCaC": "קַטָּל",
"CiCCon": "קִטְּרוֹן",
"CaCCeC": "קַטֶּלֶת",
# Mishkal English name → Hebrew nikkud mapping
# Pealim dropdown uses q-notation (qatal), detail pages return k-notation (katal).
# We store q-notation keys; _mishkal_to_hebrew() handles the k→q conversion.
# Source: https://www.pealim.com/he/dict/?pos={adjective,noun}&am=qatil dropdowns
_MISHKAL_HEBREW_Q: dict[str, str] = {
# --- a ---
"aqtal": "אַקְטָל",
"aqtala": "אַקְטָלָה",
# --- e ---
"eqtal": "אֶקְטָל",
# --- h ---
"haqtala": "הַקְטָלָה",
"heqtel": "הֶקְטֵל",
"hiqqatlut": "הִקָּטְלוּת",
"hitqattlut": "הִתְקַטְּלוּת",
# --- m ---
"maqtal": "מַקְטָל",
"maqtel": "מַקְטֵל",
"maqtela": "מַקְטֵלָה",
"maqtelet": "מַקְטֶלֶת",
"maqtil": "מַקְטִיל",
"maqtol": "מַקְטוֹל",
"maqtolet": "מַקְטֹלֶת",
"maqtul": "מַקְטוּל",
"meqattel": "מְקַטֵּל",
"meqila": "מְקִילָה",
"mequla": "מְקוּלָה",
"mequttal": "מְקֻטָּל",
"miqtal": "מִקְטָל",
"miqtala": "מִקְטָלָה",
"miqtelet": "מִקְטֶלֶת",
"miqtol": "מִקְטוֹל",
"miqtolet": "מִקְטֹלֶת",
"mitqattel": "מִתְקַטֵּל",
"muqtal": "מֻקְטָל",
# --- n ---
"niqtal": "נִקְטָל",
# --- q ---
"qal": "קַל",
"qatal": "קָטָל",
"qatel": "קָטֵל",
"qatil": "קָטִיל",
"qatla": "קַטְלָה",
"qatlan": "קַטְלָן",
"qatlut": "קַטְלוּת",
"qatol": "קָטוֹל",
"qaton": "קָטוֹן",
"qattal": "קַטָּל",
"qattala": "קַטָּלָה",
"qattelet": "קַטֶּלֶת",
"qattil": "קַטִּיל",
"qattila": "קַטִּילָה",
"qattolet": "קַטֹּלֶת",
"qattul": "קַטּוּל",
"qatul": "קָטוּל",
"qatut": "קָטוּת",
"qetel": "קֶטֶל",
"qeteh": "קֵטֶה",
"qitla": "קִטְלָה",
"qitlon": "קִטְלוֹן",
"qittalon": "קִטָּלוֹן",
"qittel": "קִטֵּל",
"qittelet": "קִטֶּלֶת",
"qittol": "קִטּוֹל",
"qittolet": "קִטֹּלֶת",
"qittul": "קִטּוּל",
"qol": "קֹל",
"qotal": "קוֹטָל",
"qotel": "קוֹטֵל",
"qotelet": "קוֹטֶלֶת",
"qotla": "קָטְלָה",
"qtal": "קְטָל",
"qtala": "קְטָלָה",
"qtaltal": "קְטַלְטַל",
"qtaltan": "קְטַלְתָּן",
"qtaltolet": "קְטַלְטֹלֶת",
"qtel": "קְטֵל",
"qtela": "קְטֵלָה",
"qtelet": "קְטֶלֶת",
"qtil": "קְטִיל",
"qtila": "קְטִילָה",
"qtili": "קְטִילִי",
"qtol": "קְטוֹל",
"qtola": "קְטוֹלָה",
"qtolet": "קְטֹלֶת",
"qtul": "קְטוּל",
"qtula": "קְטוּלָה",
"qtulla": "קְטֻלָּה",
"qtut": "קְטוּת",
"qutla": "קֻטְלָה",
"quttolet": "קֻטּוֹלֶת",
# --- t ---
"taqtela": "תַּקְטֵלָה",
"taqtil": "תַּקְטִיל",
"taqtit": "תַּקְטִית",
"taqtul": "תַּקְטוּל",
"taqtula": "תַּקְטוּלָה",
"taqtut": "תַּקְטוּת",
"tiqtal": "תִּקְטָל",
"tiqtala": "תִּקְטָלָה",
"tiqtelet": "תִּקְטֶלֶת",
"tiqtolet": "תִּקְטֹלֶת",
"tqilla": "תְּקִלָּה",
"tqula": "תְּקוּלָה",
# --- y ---
"yaqtul": "יַקְטוּל",
}
def _mishkal_to_hebrew(mishkal: str) -> str | None:
"""Look up Hebrew mishkal, handling k-notation → q-notation conversion."""
if not mishkal:
return None
# Try as-is first (q-notation)
result = _MISHKAL_HEBREW_Q.get(mishkal)
if result:
return result
# Convert k-notation to q-notation and retry
q_form = mishkal.replace("k", "q")
return _MISHKAL_HEBREW_Q.get(q_form)
# ---------------------------------------------------------------------------
# HTTP session
# ---------------------------------------------------------------------------
@ -452,7 +554,7 @@ def _scrape_noun_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
if mishkal:
result["mishkal"] = mishkal
result["mishkal_hebrew"] = MISHKAL_HEBREW.get(mishkal)
result["mishkal_hebrew"] = _mishkal_to_hebrew(mishkal)
return result
@ -887,6 +989,228 @@ def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: di
return result
# ---------------------------------------------------------------------------
# Adjective detail parsing
# ---------------------------------------------------------------------------
_ADJECTIVE_CELL_IDS: tuple[str, ...] = ("ms-a", "fs-a", "mp-a", "fp-a")
_ADJECTIVE_FORM_KEYS: tuple[str, ...] = ("ms", "fs", "mp", "fp")
def _parse_adjective_table(soup: BeautifulSoup) -> dict[str, dict]:
"""
Parse the adjective inflection table from a pealim detail page (mo/nikkud).
Locates cells by ID (ms-a, fs-a, mp-a, fp-a) and extracts nikkud text
and audio URL from each.
Returns:
Dict mapping form key ("ms", "fs", "mp", "fp") to
{"nikkud": str, "audio_url": str}, or empty dict if table not found.
"""
table = soup.find("table", class_="conjugation-table")
if not table:
return {}
result: dict[str, dict] = {}
for cell_id, form_key in zip(_ADJECTIVE_CELL_IDS, _ADJECTIVE_FORM_KEYS, strict=True):
div = table.find(id=cell_id)
if not div:
continue
nikkud, audio_url = _get_menukad_and_audio(div)
if nikkud:
result[form_key] = {"nikkud": nikkud, "audio_url": audio_url}
return result
def _parse_adjective_table_vl(soup: BeautifulSoup) -> dict[str, str]:
"""
Parse the adjective inflection table from a vl (ktiv male) page.
Returns:
Dict mapping form key ("ms", "fs", "mp", "fp") to ktiv male string.
"""
table = soup.find("table", class_="conjugation-table")
if not table:
return {}
result: dict[str, str] = {}
for cell_id, form_key in zip(_ADJECTIVE_CELL_IDS, _ADJECTIVE_FORM_KEYS, strict=True):
div = table.find(id=cell_id)
if not div:
continue
ktiv = _get_plain_text(div)
if ktiv:
result[form_key] = ktiv
return result
def _parse_adjective_mishkal(soup: BeautifulSoup) -> tuple[str, str]:
"""
Extract mishkal from the PoS section of an adjective detail page.
Reuses the same extraction logic as _parse_noun_gender_mishkal.
Returns:
Tuple of (mishkal_english, mishkal_hebrew) where either may be empty.
"""
_, mishkal = _parse_noun_gender_mishkal(soup)
mishkal_hebrew = _mishkal_to_hebrew(mishkal) or ""
return mishkal, mishkal_hebrew
def _scrape_adjective_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
"""
Parse adjective detail pages (mo=nikkud, vl=ktiv male).
Returns:
Dict matching the adjective_inflection schema:
{ms, fs, mp, fp: {nikkud, ktiv_male}, mishkal, mishkal_hebrew}.
Empty dict if no forms found.
"""
mo_soup = BeautifulSoup(mo_html, "lxml")
vl_soup = BeautifulSoup(vl_html, "lxml")
mo_data = _parse_adjective_table(mo_soup)
vl_data = _parse_adjective_table_vl(vl_soup)
mishkal, mishkal_hebrew = _parse_adjective_mishkal(mo_soup)
if not mo_data:
return {}
result: dict = {}
for form_key in _ADJECTIVE_FORM_KEYS:
mo_form = mo_data.get(form_key)
if mo_form:
nikkud = mo_form["nikkud"]
ktiv = vl_data.get(form_key, "")
if not ktiv:
logger.warning("No ktiv_male for adjective form %s: %s", form_key, nikkud)
result[form_key] = {"nikkud": nikkud, "ktiv_male": ktiv}
else:
result[form_key] = None
result["mishkal"] = mishkal or None
result["mishkal_hebrew"] = mishkal_hebrew or None
return result
# ---------------------------------------------------------------------------
# Preposition detail parsing
# ---------------------------------------------------------------------------
_PREPOSITION_CELL_IDS: tuple[str, ...] = (
"P-1s",
"P-1p",
"P-2ms",
"P-2fs",
"P-2mp",
"P-2fp",
"P-3ms",
"P-3fs",
"P-3mp",
"P-3fp",
)
_PREPOSITION_FORM_KEYS: tuple[str, ...] = (
"1s",
"1p",
"2ms",
"2fs",
"2mp",
"2fp",
"3ms",
"3fs",
"3mp",
"3fp",
)
def _parse_preposition_table(soup: BeautifulSoup) -> dict[str, dict]:
"""
Parse the preposition pronominal suffix table from a pealim detail page (mo/nikkud).
Locates cells by ID (P-1s, P-1p, P-2ms, , P-3fp) and extracts nikkud
text and audio URL from each.
Returns:
Dict mapping person key ("1s", "1p", , "3fp") to
{"nikkud": str, "audio_url": str}, or empty dict if table not found.
"""
table = soup.find("table", class_="conjugation-table")
if not table:
return {}
result: dict[str, dict] = {}
for cell_id, form_key in zip(_PREPOSITION_CELL_IDS, _PREPOSITION_FORM_KEYS, strict=True):
div = table.find(id=cell_id)
if not div:
continue
nikkud, audio_url = _get_menukad_and_audio(div)
if nikkud:
result[form_key] = {"nikkud": nikkud, "audio_url": audio_url}
return result
def _parse_preposition_table_vl(soup: BeautifulSoup) -> dict[str, str]:
"""
Parse the preposition pronominal suffix table from a vl (ktiv male) page.
Returns:
Dict mapping person key ("1s", "1p", , "3fp") to ktiv male string.
"""
table = soup.find("table", class_="conjugation-table")
if not table:
return {}
result: dict[str, str] = {}
for cell_id, form_key in zip(_PREPOSITION_CELL_IDS, _PREPOSITION_FORM_KEYS, strict=True):
div = table.find(id=cell_id)
if not div:
continue
ktiv = _get_plain_text(div)
if ktiv:
result[form_key] = ktiv
return result
def _scrape_preposition_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
"""
Parse preposition detail pages (mo=nikkud, vl=ktiv male).
Returns:
Dict matching the preposition_inflection schema:
{1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp: {nikkud, ktiv_male}}.
Empty dict if no forms found.
"""
mo_soup = BeautifulSoup(mo_html, "lxml")
vl_soup = BeautifulSoup(vl_html, "lxml")
mo_data = _parse_preposition_table(mo_soup)
vl_data = _parse_preposition_table_vl(vl_soup)
if not mo_data:
return {}
result: dict = {}
for form_key in _PREPOSITION_FORM_KEYS:
mo_form = mo_data.get(form_key)
if mo_form:
nikkud = mo_form["nikkud"]
ktiv = vl_data.get(form_key, "")
if not ktiv:
logger.warning("No ktiv_male for preposition form %s: %s", form_key, nikkud)
result[form_key] = {"nikkud": nikkud, "ktiv_male": ktiv}
else:
result[form_key] = None
return result
# ---------------------------------------------------------------------------
# Merging strategy
# ---------------------------------------------------------------------------
@ -926,6 +1250,22 @@ def _merge_conjugation(_existing_conj: dict | None, scraped: dict) -> dict:
return scraped
def _merge_adjective_inflection(_existing_ai: dict | None, scraped: dict) -> dict:
"""
Merge scraped adjective data into existing adjective_inflection.
No GUIDs to preserve simple overwrite with scraped data.
"""
return dict(scraped)
def _merge_preposition_inflection(_existing_pi: dict | None, scraped: dict) -> dict:
"""
Merge scraped preposition data into existing preposition_inflection.
No GUIDs to preserve simple overwrite with scraped data.
"""
return dict(scraped)
# ---------------------------------------------------------------------------
# I/O helpers
# ---------------------------------------------------------------------------
@ -953,14 +1293,26 @@ def _save_words(data: dict) -> None:
# ---------------------------------------------------------------------------
def _should_process(entry: dict, pos: str, force: bool, nouns_only: bool, verbs_only: bool) -> bool:
def _should_process(
entry: dict,
pos: str,
force: bool,
nouns_only: bool,
verbs_only: bool,
adjectives_only: bool,
prepositions_only: bool,
) -> bool:
"""Return True if this entry should be scraped."""
if not pos.startswith(("Noun", "Verb")):
if not pos.startswith(("Noun", "Verb", "Adjective", "Preposition")):
return False
if nouns_only and not pos.startswith("Noun"):
return False
if verbs_only and not pos.startswith("Verb"):
return False
if adjectives_only and not pos.startswith("Adjective"):
return False
if prepositions_only and not pos.startswith("Preposition"):
return False
return force or not entry.get("detail_scraped")
@ -969,6 +1321,8 @@ def run(
force_refresh: bool = False,
nouns_only: bool = False,
verbs_only: bool = False,
adjectives_only: bool = False,
prepositions_only: bool = False,
) -> None:
"""
Main scrape loop.
@ -978,13 +1332,24 @@ def run(
force_refresh: Re-scrape entries where detail_scraped=True.
nouns_only: Only scrape noun entries.
verbs_only: Only scrape verb entries.
adjectives_only: Only scrape adjective entries.
prepositions_only: Only scrape preposition entries.
"""
words = _load_words()
candidates = [
(unique_key, entry)
for unique_key, entry in words.items()
if _should_process(entry, entry.get("pos", ""), force_refresh, nouns_only, verbs_only) and entry.get("slug")
if _should_process(
entry,
entry.get("pos", ""),
force_refresh,
nouns_only,
verbs_only,
adjectives_only,
prepositions_only,
)
and entry.get("slug")
]
total = len(candidates)
@ -992,7 +1357,10 @@ def run(
candidates = candidates[:test]
logger.info("Test mode: processing %d of %d eligible entries", len(candidates), total)
else:
logger.info("Processing %d eligible entries (nouns+verbs) from words.json", total)
logger.info(
"Processing %d eligible entries (nouns+verbs+adjectives+prepositions) from words.json",
total,
)
processed = 0
errors = 0
@ -1003,7 +1371,14 @@ def run(
word_nikkud = entry.get("word", {}).get("nikkud", unique_key)
url = f"{PEALIM_BASE}/dict/{slug}/"
label = "Noun" if pos.startswith("Noun") else "Verb"
if pos.startswith("Noun"):
label = "Noun"
elif pos.startswith("Verb"):
label = "Verb"
elif pos.startswith("Adjective"):
label = "Adjective"
else:
label = "Preposition"
logger.info("[%d/%d] %s: %s (%s)", idx, len(candidates), label, word_nikkud, slug)
# Fetch mo (nikkud) page
@ -1042,7 +1417,7 @@ def run(
errors += 1
continue
else: # Verb
elif pos.startswith("Verb"):
existing_conj = entry.get("conjugation")
scraped = _scrape_verb_detail(slug, mo_html, vl_html, existing_conj)
if scraped:
@ -1059,6 +1434,41 @@ def run(
errors += 1
continue
elif pos.startswith("Adjective"):
scraped = _scrape_adjective_detail(slug, mo_html, vl_html)
if scraped:
existing_ai = entry.get("adjective_inflection")
merged = _merge_adjective_inflection(existing_ai, scraped)
words[unique_key]["adjective_inflection"] = merged
ms = merged.get("ms", {}) or {}
fs = merged.get("fs", {}) or {}
logger.info(
" ms=%s fs=%s mishkal=%s",
ms.get("nikkud", ""),
fs.get("nikkud", ""),
merged.get("mishkal", ""),
)
else:
logger.warning(" No adjective data scraped for %s", slug)
errors += 1
continue
else: # Preposition
scraped = _scrape_preposition_detail(slug, mo_html, vl_html)
if scraped:
existing_pi = entry.get("preposition_inflection")
merged = _merge_preposition_inflection(existing_pi, scraped)
words[unique_key]["preposition_inflection"] = merged
form_1s = merged.get("1s", {}) or {}
logger.info(
" 1s=%s",
form_1s.get("nikkud", ""),
)
else:
logger.warning(" No preposition data scraped for %s", slug)
errors += 1
continue
except Exception as exc: # noqa: BLE001
logger.error(" Parse error for %s (%s): %s", slug, word_nikkud, exc, exc_info=True)
errors += 1
@ -1089,7 +1499,7 @@ def run(
def _build_parser() -> argparse.ArgumentParser:
parser = argparse.ArgumentParser(
description="Scrape pealim.com detail pages for nouns and verbs in data/words.json."
description=("Scrape pealim.com detail pages for nouns, verbs, adjectives and prepositions in data/words.json.")
)
parser.add_argument(
"--test",
@ -1117,6 +1527,18 @@ def _build_parser() -> argparse.ArgumentParser:
default=False,
help="Only scrape Verb entries.",
)
group.add_argument(
"--adjectives-only",
action="store_true",
default=False,
help="Only scrape Adjective entries.",
)
group.add_argument(
"--prepositions-only",
action="store_true",
default=False,
help="Only scrape Preposition entries.",
)
return parser
@ -1133,4 +1555,6 @@ if __name__ == "__main__":
force_refresh=args.force_refresh_detail,
nouns_only=args.nouns_only,
verbs_only=args.verbs_only,
adjectives_only=args.adjectives_only,
prepositions_only=args.prepositions_only,
)

80
run.py
View file

@ -11,7 +11,7 @@ Pipeline steps:
1. List scrape scrape pealim.com list pages words.json (captures slugs)
2. Detail scrape scrape noun/verb detail pages using slugs words.json
3. Frequency load/download word frequency data
4. Examples fetch Ben Yehuda example sentences
4. Examples extract example sentences from Hebrew EPUBs
5. Audio download download audio mp3 files
6. Fonts download Heebo font files
7. Images fetch noun images from Wikipedia
@ -21,9 +21,8 @@ Options:
--skip-scrape Skip list page scraping (use existing words.json)
--skip-detail Skip detail page scraping
--skip-audio Skip audio .mp3 downloads
--skip-examples Skip Ben Yehuda example fetching
--skip-examples Skip EPUB example extraction
--skip-images Skip image fetching for concrete nouns
--refresh-examples Force rebuild of Ben Yehuda index
--test N Limit to first N words/pages
"""
@ -60,9 +59,8 @@ def parse_args():
p.add_argument("--skip-scrape", action="store_true", help="Skip list page scraping")
p.add_argument("--skip-detail", action="store_true", help="Skip detail page scraping")
p.add_argument("--skip-audio", action="store_true", help="Skip audio downloads")
p.add_argument("--skip-examples", action="store_true", help="Skip Ben Yehuda example lookup")
p.add_argument("--skip-examples", action="store_true", help="Skip EPUB example extraction")
p.add_argument("--skip-images", action="store_true", help="Skip image fetching")
p.add_argument("--refresh-examples", action="store_true", help="Force rebuild of Ben Yehuda index")
p.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
return p.parse_args()
@ -93,22 +91,15 @@ def step_frequency() -> dict[str, int]:
return frequency_lookup._freq
def step_examples(args, _freq_cache: dict):
"""Step 4 — load/build Ben Yehuda example index."""
def step_examples(args) -> dict:
"""Step 4 — extract example sentences from Hebrew EPUBs."""
if args.skip_examples:
logger.info("[4] Skipping examples (--skip-examples)")
examples_path = DATA_DIR / "examples_cache.json"
if examples_path.exists():
with open(examples_path) as f:
return json.load(f)
return {}
logger.info("[4] Loading Ben Yehuda example index")
import benyehuda
logger.info("[4] Extracting EPUB example sentences …")
import epub_examples
benyehuda.load(force_rebuild=args.refresh_examples)
# Read word list from words.json instead of CSV
if not WORDS_JSON.exists():
logger.warning("[4] words.json not found, skipping examples")
return {}
@ -116,41 +107,14 @@ def step_examples(args, _freq_cache: dict):
with open(WORDS_JSON, encoding="utf-8") as f:
words = json.load(f)
entries = list(words.values())
if args.test:
entries = entries[: args.test]
stats = epub_examples.run(words)
# Build confusable consonant set from words.json
consonant_counts: dict[str, int] = {}
for entry in entries:
ktiv_male = entry.get("word", {}).get("ktiv_male", "")
if ktiv_male:
safe = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male)
if safe:
consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
# Save updated words.json
with open(WORDS_JSON, "w", encoding="utf-8") as f:
json.dump(words, f, ensure_ascii=False, indent=2)
# Delete stale cache entries for confusable words so they get re-fetched
stale_deleted = 0
for entry in entries:
word_nikkud = entry.get("word", {}).get("nikkud", "")
ktiv_male = entry.get("word", {}).get("ktiv_male", "")
if word_nikkud and ktiv_male:
safe = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male)
if safe in confusable_consonants and word_nikkud in benyehuda._examples_cache:
del benyehuda._examples_cache[word_nikkud]
stale_deleted += 1
if stale_deleted:
logger.info(f" Deleted {stale_deleted} stale confusable cache entries")
logger.info(f" Pre-fetching examples for {len(entries)} words …")
for entry in entries:
word_nikkud = entry.get("word", {}).get("nikkud", "")
if word_nikkud:
benyehuda.get_examples(word_nikkud, confusable_consonants=confusable_consonants)
benyehuda.save_examples_cache()
return benyehuda._examples_cache
logger.info(f" Coverage: {stats['matched']}/{stats['total_vocab']}")
return stats
def step_detail_scrape(args):
@ -250,7 +214,7 @@ def step_build_all(args):
apkg_builder.build_all_variants(words, limit=args.test)
def print_summary(_args: argparse.Namespace, examples_cache: dict, freq_cache: dict):
def print_summary(_args: argparse.Namespace, example_stats: dict, freq_cache: dict):
logger.info("")
logger.info("=" * 60)
logger.info("SUMMARY")
@ -267,10 +231,12 @@ def print_summary(_args: argparse.Namespace, examples_cache: dict, freq_cache: d
logger.info(f" Nouns: {nouns}, Verbs: {verbs}, Detail-scraped: {detail_scraped}")
logger.info(f" Frequency entries: {len(freq_cache)}")
logger.info(f" Example cache entries: {len(examples_cache)}")
covered = sum(1 for v in examples_cache.values() if v)
if examples_cache:
logger.info(f" Example coverage: {covered}/{len(examples_cache)} ({100 * covered // len(examples_cache)}%)")
matched = example_stats.get("matched", 0)
total = example_stats.get("total_vocab", 0)
if total:
logger.info(f" Example coverage: {matched}/{total} ({100 * matched // total}%)")
for book, count in example_stats.get("books", {}).items():
logger.info(f" {book}: {count} sentences")
if AUDIO_DIR.exists():
mp3s = list(AUDIO_DIR.glob("*.mp3"))
@ -321,8 +287,6 @@ def main():
logger.info(f" MODE: --only {args.only}")
if args.test:
logger.info(f" TEST MODE: {args.test} words")
if args.refresh_examples:
logger.info(" REFRESH EXAMPLES: Ben Yehuda index will be rebuilt")
logger.info("=" * 60)
def _load_words_for_only() -> dict:
@ -385,13 +349,13 @@ def main():
step_list_scrape(args) # 1 — scrape list pages → words.json (captures slugs)
step_detail_scrape(args) # 2 — scrape detail pages using slugs → words.json
freq_cache = step_frequency() # 3 — word frequency data
examples_cache = step_examples(args, _freq_cache=freq_cache) # 4 — Ben Yehuda examples
example_stats = step_examples(args) # 4 — EPUB example sentences
step_audio_download(args) # 5 — download audio mp3s
step_fonts(args) # 6 — download Heebo fonts
step_images(args) # 7 — fetch noun images
step_build_all(args) # 8 — build all .apkg variants
print_summary(args, examples_cache, freq_cache)
print_summary(args, example_stats, freq_cache)
if __name__ == "__main__":

392
scripts/assign_frequency.py Normal file
View file

@ -0,0 +1,392 @@
#!/usr/bin/env python3
"""Assign frequency ranks from the cleaned corpus to words.json entries.
Two-tier assignment with PoS priority:
Tier 1: Match headword ktiv_male directly against corpus
Tier 2: Match conjugated/inflected forms (only if no other entry already
claimed that corpus word via tier 1)
PoS priority (based on standalone-word likelihood in Hebrew text):
כינוייוף (Pronoun) > מילות_חיבור (Conjunction) > שם_תואר (Adjective) >
מילית (Particle) > שם_עצם (Noun) > תוארי_הפועל (Adverb) >
מילות_יחס (Preposition) > פעלים (Verb)
Usage:
python3 scripts/assign_frequency.py # assign and save
python3 scripts/assign_frequency.py --dry-run # preview only
python3 scripts/assign_frequency.py --stats # show statistics only
"""
from __future__ import annotations
import argparse
import json
import logging
from collections import defaultdict
from pathlib import Path
logger = logging.getLogger(__name__)
PROJECT_ROOT = Path(__file__).parent.parent
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
CLEAN_CACHE = PROJECT_ROOT / "data" / "frequency_clean.json"
RAW_CACHE = PROJECT_ROOT / "data" / "frequency_cache.json"
# Function word PoS — these dominate content words in homograph groups
FUNCTION_POS = frozenset({"כינוייוף", "מילות_חיבור", "מילית", "מילות_יחס", "תוארי_הפועל"})
# Content PoS that loses frequency when a function word dominates
# Adjectives also lose (e.g. כן "honest" vs כן "yes") — they're rare collisions
CONTENT_POS = frozenset({"שם_עצם", "שם_תואר", "פעלים"})
# Manual overrides: at these corpus ranks, ALL homographs share frequency.
# These are cases where the content word is genuinely common enough to deserve it.
# e.g. rank 15: עם "people" (NN) alongside עם "with" (PREP)
# Manual overrides: at these ktiv_male forms, ALL homographs share frequency.
# These are cases where the content word is genuinely common enough to deserve it.
SHARE_ALL_WORDS = frozenset(
{
"עם", # "people" (NN) + "with" (PREP)
"שם", # "name" (NN) + "there" (ADV)
"אל", # "god" (NN) + "to" (PREP) + "don't" (PART)
"עד", # "witness"/"eternity" (NN) + "until" (PREP)
"פה", # "mouth" (NN) + "here" (ADV)
"לאחר", # "to be late" (VB) + "after" (PREP)
"יופי", # "beauty" (NN) + "great!" (ADV)
"המון", # "crowd" (NN) + "lots of" (ADV)
"חבל", # "rope" (NN) + "it's a pity" (ADV)
"ראשית", # "beginning" (NN) + "firstly" (ADV)
"עקב", # "heel"/"footprint" (NN) + "due to" (CONJ)
"אולם", # "hall" (NN) + "however" (ADV)
}
)
def _get_pos_tag(entry: dict) -> str:
"""Extract primary PoS tag from entry's tags field."""
tags = (entry.get("tags") or "").split()
for t in tags:
if not t.startswith("שורש"):
return t
return "unknown"
def _build_form_index(words: dict) -> dict[str, list[tuple[str, str]]]:
"""Build reverse index: ktiv_male_form -> [(unique_key, match_type), ...]"""
index: dict[str, list[tuple[str, str]]] = defaultdict(list)
for key, entry in words.items():
w = entry.get("word") or {}
if km := w.get("ktiv_male"):
index[km].append((key, "headword"))
# Verb conjugations: indexed for new-assignment-only matching (no upgrades).
# Conjugated forms collide with unrelated headwords, so tier 2 only uses
# these for entries that have NO existing frequency.
conj = entry.get("conjugation") or {}
for form in conj.get("active_forms") or []:
if isinstance(form, dict):
form_data = form.get("form") or {}
if km2 := form_data.get("ktiv_male"):
km2 = km2.rstrip("!\u200f ")
index[km2].append((key, "conjugation"))
for hp in conj.get("hufal_pual_forms") or []:
if isinstance(hp, dict):
hp_data = hp.get("form") or {}
if km3 := hp_data.get("ktiv_male"):
km3 = km3.rstrip("!\u200f ")
index[km3].append((key, "conjugation"))
for field in ("noun_inflection", "preposition_inflection", "adjective_inflection"):
for inf_data in (entry.get(field) or {}).values():
if isinstance(inf_data, dict) and (km4 := inf_data.get("ktiv_male")):
index[km4].append((key, "inflection"))
return dict(index)
def _should_get_frequency(
entry: dict,
all_headword_entries: list[tuple[str, str]],
corpus_word: str,
words: dict,
) -> bool:
"""Decide if an entry should get frequency in a homograph group.
Rules:
- If only one entry matches, it always gets frequency.
- If SHARE_ALL_WORDS includes this corpus word, all entries share.
- If the group has function words AND content words, content words lose.
- Otherwise all entries share.
"""
if len(all_headword_entries) <= 1:
return True
if corpus_word in SHARE_ALL_WORDS:
return True
pos = _get_pos_tag(entry)
has_function = any(_get_pos_tag(words[k]) in FUNCTION_POS for k, _ in all_headword_entries)
return not (has_function and pos in CONTENT_POS)
def assign_frequencies(
words: dict,
freq_corpus: dict[str, int],
raw_corpus: dict[str, int] | None = None,
upgrade: bool = False,
) -> dict[str, dict]:
"""Assign frequency ranks to words.json entries. Returns assignment details.
freq_corpus controls which words are valid (cleaned corpus).
raw_corpus provides original rank numbers (with gaps). If not provided,
uses freq_corpus ranks (re-ranked, no gaps).
upgrade: if True, tier 2 can upgrade an entry's rank when a conjugated/inflected
form has a better (lower) rank than the headword match.
"""
rank_source = raw_corpus if raw_corpus is not None else freq_corpus
form_index = _build_form_index(words)
# Track which corpus words have been claimed by tier 1
tier1_claimed: set[str] = set()
# Results tracking
assignments: dict[str, dict] = {} # unique_key -> {rank, source, corpus_word}
# --- Tier 1: headword matches ---
# For each corpus word, find all headword matches and assign to eligible entries.
# Homograph groups: function words get frequency, content words don't (unless overridden).
corpus_by_rank = sorted(freq_corpus.items(), key=lambda x: x[1])
for corpus_word, _clean_rank in corpus_by_rank:
matches = form_index.get(corpus_word, [])
headword_matches = [(k, t) for k, t in matches if t == "headword"]
if not headword_matches:
continue
original_rank = rank_source.get(corpus_word, _clean_rank)
assigned_any = False
for entry_key, _ in headword_matches:
if entry_key in assignments:
continue
if _should_get_frequency(words[entry_key], headword_matches, corpus_word, words):
assignments[entry_key] = {
"rank": original_rank,
"source": "headword",
"corpus_word": corpus_word,
}
assigned_any = True
if assigned_any:
tier1_claimed.add(corpus_word)
tier1_count = len(assignments)
logger.info("Tier 1 (headword): %d entries assigned", tier1_count)
# --- Tier 2: conjugation/inflection matches ---
# Only use corpus words NOT claimed in tier 1.
# A corpus word that matches an inflection is "owned" by that headword —
# it cannot also upgrade an unrelated verb via conjugation.
# Upgrades (when enabled) only apply within the same match type priority.
for corpus_word, _clean_rank in corpus_by_rank:
if corpus_word in tier1_claimed:
continue
matches = form_index.get(corpus_word, [])
secondary_matches = [(k, t) for k, t in matches if t in ("conjugation", "inflection")]
if not secondary_matches:
continue
original_rank = rank_source.get(corpus_word, _clean_rank)
# Split by type: inflections take priority over conjugations
inflection_matches = [(k, t) for k, t in secondary_matches if t == "inflection"]
conjugation_matches = [(k, t) for k, t in secondary_matches if t == "conjugation"]
# If any inflection matches exist, this corpus word belongs to inflection.
# Don't let conjugations claim it.
active_matches = inflection_matches if inflection_matches else conjugation_matches
for entry_key, match_type in active_matches:
existing = assignments.get(entry_key)
if existing is None:
# New assignment — conjugations only allowed for rank > 5000
# (too many false positives in the important tiers)
if match_type == "conjugation" and original_rank <= 5000:
continue
assignments[entry_key] = {
"rank": original_rank,
"source": match_type,
"corpus_word": corpus_word,
}
break
if upgrade and match_type == "inflection" and original_rank < existing["rank"]:
# Upgrade — only allowed for inflections (conjugations collide too much)
assignments[entry_key] = {
"rank": original_rank,
"source": f"upgrade:{match_type}",
"corpus_word": corpus_word,
}
break
tier2_count = len(assignments) - tier1_count
logger.info("Tier 2 (conjugation/inflection): %d entries assigned", tier2_count)
return assignments
def print_stats(words: dict, assignments: dict, freq_corpus: dict) -> None:
"""Print detailed statistics about frequency assignment."""
total = len(words)
assigned = len(assignments)
previously_had = sum(1 for e in words.values() if e.get("frequency") is not None)
print(f"\n{'=' * 60}")
print("Frequency Assignment Statistics")
print(f"{'=' * 60}")
print(f"Words.json entries: {total}")
print(f"Clean corpus size: {len(freq_corpus)}")
print(f"Previously had freq: {previously_had}")
print(f"Now assigned: {assigned}")
print(f"Newly gained: {assigned - previously_had}")
print(f"Still unlisted: {total - assigned}")
# By tier
tier1 = sum(1 for a in assignments.values() if a["source"] == "headword")
tier2_conj = sum(1 for a in assignments.values() if a["source"] == "conjugation")
tier2_inf = sum(1 for a in assignments.values() if a["source"] == "inflection")
print("\nBy assignment tier:")
print(f" Tier 1 (headword): {tier1}")
print(f" Tier 2 (conjugation): {tier2_conj}")
print(f" Tier 2 (inflection): {tier2_inf}")
# By PoS
print("\nBy PoS:")
from collections import Counter
pos_assigned = Counter()
pos_total = Counter()
for k, v in words.items():
pos = _get_pos_tag(v)
pos_total[pos] += 1
if k in assignments:
pos_assigned[pos] += 1
pos_order = [
"כינוייוף",
"מילות_חיבור",
"שם_תואר",
"מילית",
"שם_עצם",
"תוארי_הפועל",
"מילות_יחס",
"פעלים",
"unknown",
]
for pos in sorted(pos_total, key=lambda p: pos_order.index(p) if p in pos_order else 99):
a = pos_assigned[pos]
t = pos_total[pos]
pct = a / t * 100 if t else 0
print(f" {pos:20s}: {a:5d}/{t:5d} ({pct:.0f}%)")
# By frequency tier (using apkg_builder tiers)
print("\nBy frequency tier:")
tiers = {
"Core (1-500)": (1, 500),
"Essential (501-1500)": (501, 1500),
"Intermediate (1501-3000)": (1501, 3000),
"Upper-intermediate (3001-5000)": (3001, 5000),
"Advanced (5001-10000)": (5001, 10000),
"Rare (10001+)": (10001, 999999),
}
for label, (lo, hi) in tiers.items():
count = sum(1 for a in assignments.values() if lo <= a["rank"] <= hi)
print(f" {label:35s}: {count}")
# Top 20 newly assigned (entries that didn't have frequency before)
newly = []
for k, a in assignments.items():
if words[k].get("frequency") is None:
w = words[k].get("word", {})
newly.append((a["rank"], k, w.get("ktiv_male", ""), a["source"], a["corpus_word"]))
newly.sort()
if newly:
print("\nTop 20 newly assigned entries:")
for rank, _key, ktiv, source, corpus_word in newly[:20]:
print(f" rank {rank:5d}: {ktiv:15s} via {source:12s} (corpus: {corpus_word})")
# Entries that LOST frequency (had it before, not assigned now)
lost = []
for k, v in words.items():
old_freq = v.get("frequency")
if old_freq is not None and k not in assignments:
w = v.get("word", {})
lost.append((old_freq, k, w.get("ktiv_male", "")))
lost.sort()
if lost:
print(f"\nEntries that would LOSE frequency ({len(lost)} total):")
for rank, _key, ktiv in lost[:20]:
print(f" was rank {rank:5d}: {ktiv}")
def main() -> None:
parser = argparse.ArgumentParser(description="Assign frequency to words.json")
parser.add_argument("--dry-run", action="store_true", help="Preview without saving")
parser.add_argument("--stats", action="store_true", help="Show statistics only")
parser.add_argument(
"--upgrade", action="store_true", help="Allow tier 2 to upgrade headword rank from conjugated forms"
)
args = parser.parse_args()
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
# Load data
freq_path = CLEAN_CACHE if CLEAN_CACHE.exists() else RAW_CACHE
logger.info("Loading frequency corpus: %s", freq_path)
with open(freq_path, encoding="utf-8") as f:
freq_corpus: dict[str, int] = json.load(f)
# Load raw corpus for original rank numbers (with gaps)
raw_corpus: dict[str, int] | None = None
if RAW_CACHE.exists() and freq_path != RAW_CACHE:
with open(RAW_CACHE, encoding="utf-8") as f:
raw_corpus = json.load(f)
logger.info("Using original ranks from %s", RAW_CACHE)
with open(WORDS_JSON, encoding="utf-8") as f:
words: dict = json.load(f)
logger.info("Corpus: %d entries, Words.json: %d entries", len(freq_corpus), len(words))
# Run assignment
assignments = assign_frequencies(words, freq_corpus, raw_corpus, upgrade=args.upgrade)
# Stats
print_stats(words, assignments, freq_corpus)
if args.stats or args.dry_run:
if args.dry_run:
logger.info("Dry run — no changes saved")
return
# Apply to words.json
changed = 0
for key, entry in words.items():
if key in assignments:
new_rank = assignments[key]["rank"]
if entry.get("frequency") != new_rank:
entry["frequency"] = new_rank
changed += 1
else:
if entry.get("frequency") is not None:
entry["frequency"] = None
changed += 1
with open(WORDS_JSON, "w", encoding="utf-8") as f:
json.dump(words, f, ensure_ascii=False, indent=2)
logger.info("Updated %d entries in words.json", changed)
if __name__ == "__main__":
main()

View file

@ -0,0 +1,400 @@
#!/usr/bin/env python3
"""Clean the Hebrew frequency corpus by removing prefix+word combinations.
Two modes:
--mode yap (default) Use YAP morphological analyzer for accurate prefix detection.
Requires YAP API running at localhost:8000.
--mode heuristic Use rule-based prefix stripping (no external dependencies).
Both modes preserve words that exist as known dictionary forms in words.json.
Usage:
python3 scripts/clean_frequency_corpus.py # YAP mode
python3 scripts/clean_frequency_corpus.py --mode heuristic # heuristic fallback
python3 scripts/clean_frequency_corpus.py --dry-run # preview only
python3 scripts/clean_frequency_corpus.py --resume # resume YAP from checkpoint
python3 scripts/clean_frequency_corpus.py --limit 1000 # process first N entries
Input: data/frequency_cache.json (raw he_50k.txt, 49999 entries)
Output: data/frequency_clean.json (filtered, prefix combos removed)
data/frequency_discarded.json (discarded entries with reason)
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import sys
import time
from pathlib import Path
import requests
logger = logging.getLogger(__name__)
PROJECT_ROOT = Path(__file__).parent.parent
RAW_CACHE = PROJECT_ROOT / "data" / "frequency_cache.json"
CLEAN_CACHE = PROJECT_ROOT / "data" / "frequency_clean.json"
DISCARDED = PROJECT_ROOT / "data" / "frequency_discarded.json"
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
CHECKPOINT = PROJECT_ROOT / "data" / "_yap_checkpoint.json"
YAP_URL = os.environ.get("YAP_URL", "http://localhost:8000/yap/heb/joint")
YAP_TIMEOUT = 10
BATCH_SAVE_INTERVAL = 500
# --- YAP mode constants ---
# POS tags that indicate a prefix
PREFIX_POS = frozenset({"PREPOSITION", "CONJ", "DEF", "REL"})
# POS tags for the host word that make the combo a false positive
HOST_POS = frozenset({"NN", "NNP", "NNT", "PRP", "CD", "DT", "EX"})
# --- Heuristic mode constants ---
# Hebrew prefix combinations, longest first for greedy matching.
PREFIXES = [
# 4-char
"וכשמ",
"וכשב",
"וכשל",
"וכשה",
# 3-char
"וכש",
"ומה",
"ובה",
"וכה",
"ולה",
"ומש",
"ובש",
"וכב",
"ולב",
"ומב",
"וכל",
"ולכ",
"שבה",
"שמה",
# 2-char
"כש",
"מה",
"בה",
"כה",
"לה",
"מש",
"בש",
"וב",
"וה",
"וכ",
"ול",
"ומ",
"וש",
"כב",
"לב",
"מב",
"כל",
"לכ",
"שב",
"שה",
"שכ",
"של",
"שמ",
# 1-char
"ב",
"ה",
"ו",
"כ",
"ל",
"מ",
"ש",
]
MIN_REMAINDER_LEN = 2
def _load_known_forms(words_path: Path) -> set[str]:
"""Load all known ktiv_male forms from words.json."""
if not words_path.exists():
logger.warning("words.json not found at %s — no dictionary filter", words_path)
return set()
with open(words_path, encoding="utf-8") as f:
words = json.load(f)
known: set[str] = set()
for entry in words.values():
w = entry.get("word") or {}
if km := w.get("ktiv_male"):
known.add(km)
for form in entry.get("active_forms") or []:
if isinstance(form, dict) and (km2 := form.get("ktiv_male")):
known.add(km2)
for hp in entry.get("hufal_pual_forms") or []:
if isinstance(hp, dict) and (km3 := hp.get("ktiv_male")):
known.add(km3)
for field in ("noun_inflection", "preposition_inflection", "adjective_inflection"):
for inf_data in (entry.get(field) or {}).values():
if isinstance(inf_data, dict) and (km4 := inf_data.get("ktiv_male")):
known.add(km4)
logger.info("Loaded %d known dictionary forms from words.json", len(known))
return known
# ── YAP mode ──────────────────────────────────────────────────────────────
def query_yap(word: str) -> dict | None:
"""Send a single word to YAP and return the JSON response."""
payload = {"text": f"{word} "}
try:
resp = requests.post(YAP_URL, json=payload, timeout=YAP_TIMEOUT)
resp.raise_for_status()
return resp.json()
except requests.RequestException as e:
logger.warning("YAP request failed for '%s': %s", word, e)
return None
def is_prefix_combo_yap(yap_response: dict) -> tuple[bool, str]:
"""Check if any morphological analysis segments the word as prefix+host.
Conservative: if ANY analysis in the lattice shows prefix+host discard.
"""
lattice = yap_response.get("ma_lattice", "")
if not lattice:
return False, ""
arcs = []
for line in lattice.strip().split("\n"):
if not line.strip():
continue
parts = line.split("\t")
if len(parts) < 6:
continue
arcs.append(
{
"from": parts[0],
"to": parts[1],
"form": parts[2],
"lemma": parts[3],
"cpos": parts[4],
"pos": parts[5],
}
)
if len(arcs) < 2:
return False, ""
for a in arcs:
if a["cpos"] not in PREFIX_POS and a["pos"] not in PREFIX_POS:
continue
for b in arcs:
if b["from"] != a["to"]:
continue
if b["cpos"] in HOST_POS or b["pos"] in HOST_POS:
reason = f"{a['form']}({a['cpos']})+{b['form']}({b['cpos']})"
return True, reason
return False, ""
# ── Heuristic mode ────────────────────────────────────────────────────────
def find_prefix_decomposition(word: str, freq: dict[str, int]) -> tuple[str, str] | None:
"""Check if word is a prefix+higher-ranked-word combo (heuristic)."""
if len(word) <= MIN_REMAINDER_LEN:
return None
word_rank = freq.get(word, 999999)
for prefix in PREFIXES:
if not word.startswith(prefix):
continue
remainder = word[len(prefix) :]
if len(remainder) < MIN_REMAINDER_LEN:
continue
if remainder in freq and freq[remainder] < word_rank:
return prefix, remainder
return None
# ── Main ──────────────────────────────────────────────────────────────────
def main() -> None:
parser = argparse.ArgumentParser(description="Clean frequency corpus")
parser.add_argument("--mode", choices=["yap", "heuristic"], default="yap", help="Detection mode")
parser.add_argument("--dry-run", action="store_true", help="Show removals without saving")
parser.add_argument("--resume", action="store_true", help="Resume YAP mode from checkpoint")
parser.add_argument("--limit", type=int, default=0, help="Process only first N words (0=all)")
args = parser.parse_args()
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
if not RAW_CACHE.exists():
logger.error("Raw frequency cache not found: %s", RAW_CACHE)
sys.exit(1)
with open(RAW_CACHE, encoding="utf-8") as f:
raw_freq: dict[str, int] = json.load(f)
logger.info("Raw frequency corpus: %d entries", len(raw_freq))
# Sort by rank
words_by_rank = sorted(raw_freq.items(), key=lambda x: x[1])
if args.limit:
words_by_rank = words_by_rank[: args.limit]
if args.mode == "yap":
discarded_list = _run_yap_mode(words_by_rank, args)
else:
known_forms = _load_known_forms(WORDS_JSON)
discarded_list = _run_heuristic_mode(words_by_rank, raw_freq, known_forms)
kept_count = len(words_by_rank) - len(discarded_list)
logger.info("Done. Kept: %d, Discarded: %d", kept_count, len(discarded_list))
if args.dry_run:
logger.info("Dry run — no files written")
return
# Build clean frequency dict (re-ranked without gaps)
discarded_words = {d["word"] for d in discarded_list}
clean_freq: dict[str, int] = {}
new_rank = 1
for word, _rank in words_by_rank:
if word not in discarded_words:
clean_freq[word] = new_rank
new_rank += 1
with open(CLEAN_CACHE, "w", encoding="utf-8") as f:
json.dump(clean_freq, f, ensure_ascii=False)
logger.info("Clean frequency saved: %d entries → %s", len(clean_freq), CLEAN_CACHE)
with open(DISCARDED, "w", encoding="utf-8") as f:
json.dump(discarded_list, f, ensure_ascii=False, indent=2)
logger.info("Discarded entries saved: %d%s", len(discarded_list), DISCARDED)
def _run_yap_mode(
words_by_rank: list[tuple[str, int]],
args: argparse.Namespace,
) -> list[dict]:
"""Run YAP-based prefix detection."""
# Check YAP connectivity
test = query_yap("בדיקה")
if test is None:
logger.error("Cannot connect to YAP API at %s", YAP_URL)
sys.exit(1)
logger.info("YAP API connected")
# Load checkpoint if resuming
analyzed: dict[str, dict] = {}
if args.resume and CHECKPOINT.exists():
with open(CHECKPOINT, encoding="utf-8") as f:
analyzed = json.load(f)
logger.info("Resumed from checkpoint: %d words already analyzed", len(analyzed))
discarded_list: list[dict] = []
discarded_count = 0
kept_count = 0
error_count = 0
for i, (word, rank) in enumerate(words_by_rank):
# Already analyzed (from checkpoint)
if word in analyzed:
if analyzed[word]["discard"]:
discarded_count += 1
discarded_list.append({"word": word, "original_rank": rank, "reason": analyzed[word]["reason"]})
else:
kept_count += 1
continue
# Trivial: single char, ASCII, or too short
if len(word) <= 1 or word.isascii():
analyzed[word] = {"discard": False, "reason": ""}
kept_count += 1
continue
result = query_yap(word)
if result is None:
analyzed[word] = {"discard": False, "reason": "yap_error"}
error_count += 1
kept_count += 1
time.sleep(0.5)
continue
is_combo, reason = is_prefix_combo_yap(result)
analyzed[word] = {"discard": is_combo, "reason": reason}
if is_combo:
discarded_count += 1
discarded_list.append({"word": word, "original_rank": rank, "reason": reason})
if rank <= 500 or discarded_count <= 50:
logger.info(" DISCARD rank %5d: %s (%s)", rank, word, reason)
else:
kept_count += 1
# Rate limit
if i % 10 == 0:
time.sleep(0.01)
# Checkpoint
if (i + 1) % BATCH_SAVE_INTERVAL == 0:
if not args.dry_run:
with open(CHECKPOINT, "w", encoding="utf-8") as f:
json.dump(analyzed, f, ensure_ascii=False)
logger.info(
" [%d/%d] kept=%d discarded=%d errors=%d",
i + 1,
len(words_by_rank),
kept_count,
discarded_count,
error_count,
)
# Final checkpoint save
if not args.dry_run and CHECKPOINT.exists():
CHECKPOINT.unlink()
if error_count:
logger.warning("%d YAP errors encountered", error_count)
return discarded_list
def _run_heuristic_mode(
words_by_rank: list[tuple[str, int]],
raw_freq: dict[str, int],
known_forms: set[str],
) -> list[dict]:
"""Run heuristic prefix detection (no external dependencies)."""
discarded_list: list[dict] = []
discarded_count = 0
for word, rank in words_by_rank:
if len(word) <= 1 or word.isascii():
continue
# Known dictionary form → keep
if word in known_forms:
continue
result = find_prefix_decomposition(word, raw_freq)
if result is not None:
prefix, remainder = result
discarded_count += 1
reason = f"{prefix}+{remainder} (rank {raw_freq[remainder]})"
discarded_list.append({"word": word, "original_rank": rank, "reason": reason})
if rank <= 500 or discarded_count <= 50:
logger.info(" DISCARD rank %5d: %s = %s", rank, word, reason)
return discarded_list
if __name__ == "__main__":
main()

View file

@ -32,7 +32,7 @@ DATA_FILE = Path(__file__).parent.parent / "data" / "words.json"
HEBREW_CONSONANT_RANGE = (0x05D0, 0x05EA) # aleftav
VALID_PERSON_CODES: frozenset[str] = frozenset(
["1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp", "3p", "ms", "fs", "mp", "fp"]
["inf", "1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp", "3p", "ms", "fs", "mp", "fp"]
)
EMOJI_RE = re.compile(
@ -561,6 +561,7 @@ def test_conjugation_form_guids(data: dict[str, Any]) -> None:
"""
name = "conjugation_form_guids"
errors: list[str] = []
warnings: list[str] = []
for key, entry in data.items():
conj = entry.get("conjugation")
@ -580,7 +581,8 @@ def test_conjugation_form_guids(data: dict[str, Any]) -> None:
guid_candidates = form.get("guid_candidates")
if not guid and not guid_candidates:
errors.append(f"[{key}] {label}: missing or null 'guid' and no 'guid_candidates'")
# New forms from rescrape use deterministic fallback — warn, don't fail
warnings.append(f"[{key}] {label}: missing or null 'guid' and no 'guid_candidates'")
continue
if guid:
@ -597,6 +599,8 @@ def test_conjugation_form_guids(data: dict[str, Any]) -> None:
else:
seen_guids[candidate] = label
if warnings:
_warn(name + "_missing", [f"{len(warnings)} forms missing guid (deterministic fallback used)"])
if errors:
_fail(name, errors[:20] if not _verbose else errors)
if len(errors) > 20 and not _verbose:

486
tests/test_detail_scrape.py Normal file
View file

@ -0,0 +1,486 @@
"""Tests for adjective and preposition detail page parsing in pealim_detail_scrape.py."""
import sys
from pathlib import Path
import pytest
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from pealim_detail_scrape import (
_parse_adjective_table,
_parse_adjective_table_vl,
_parse_preposition_table,
_parse_preposition_table_vl,
_scrape_adjective_detail,
_scrape_preposition_detail,
)
# ---------------------------------------------------------------------------
# Fixtures — real HTML snippets from pealim.com
# ---------------------------------------------------------------------------
ADJECTIVE_MO_TABLE = """
<table class="table table-condensed conjugation-table">
<thead>
<tr>
<th class="column-header" colspan="2">Singular</th>
<th class="column-header" colspan="2">Plural</th>
</tr>
<tr>
<th class="column-header">Masculine</th>
<th class="column-header">Feminine</th>
<th class="column-header">Masculine</th>
<th class="column-header">Feminine</th>
</tr>
</thead>
<tbody>
<tr>
<td class="conj-td">
<div id="ms-a">
<div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/dn/dngfpnovmytc.mp3">&#128266;</span>
<span class="menukad">אֲבִיבִי</span>
</div></div>
<div class="meaning">spring-like, vernal</div>
</div>
</td>
<td class="conj-td">
<div id="fs-a">
<div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/1j/1j6srg3do7n5k.mp3">&#128266;</span>
<span class="menukad">אֲבִיבִית</span>
</div></div>
<div class="meaning">spring-like, vernal</div>
</div>
</td>
<td class="conj-td">
<div id="mp-a">
<div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/tj/tjrhw0b5dkhc.mp3">&#128266;</span>
<span class="menukad">אֲבִיבִיִּים</span>
</div></div>
<div class="meaning">spring-like, vernal</div>
</div>
</td>
<td class="conj-td">
<div id="fp-a">
<div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/h3/h3u1ml5a4xcf.mp3">&#128266;</span>
<span class="menukad">אֲבִיבִיּוֹת</span>
</div></div>
<div class="meaning">spring-like, vernal</div>
</div>
</td>
</tr>
</tbody>
</table>
"""
# VL version: menukad spans contain unvowelled text (hebstyle=vl)
ADJECTIVE_VL_TABLE = """
<table class="table table-condensed conjugation-table">
<tbody>
<tr>
<td class="conj-td">
<div id="ms-a"><div><div>
<span class="menukad">אביבי</span>
</div></div></div>
</td>
<td class="conj-td">
<div id="fs-a"><div><div>
<span class="menukad">אביבית</span>
</div></div></div>
</td>
<td class="conj-td">
<div id="mp-a"><div><div>
<span class="menukad">אביביים</span>
</div></div></div>
</td>
<td class="conj-td">
<div id="fp-a"><div><div>
<span class="menukad">אביביות</span>
</div></div></div>
</td>
</tr>
</tbody>
</table>
"""
PREPOSITION_MO_TABLE = """
<table class="table table-condensed conjugation-table">
<thead>
<tr>
<th rowspan="2">Person</th>
<th class="column-header" colspan="2">Singular</th>
<th class="column-header" colspan="2">Plural</th>
</tr>
<tr>
<th class="column-header">Masculine</th>
<th class="column-header">Feminine</th>
<th class="column-header">Masculine</th>
<th class="column-header">Feminine</th>
</tr>
</thead>
<tbody>
<tr>
<th>1st</th>
<td class="conj-td" colspan="2">
<div id="P-1s"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/fk/fkp5faeteecr.mp3">&#128266;</span>
<span class="menukad">שֶׁלִּי</span>
</div></div><div class="meaning"><strong>of mine</strong></div></div>
</td>
<td class="conj-td" colspan="2">
<div id="P-1p"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/13/13uvi0dz6tgcc.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּנוּ</span>
</div></div><div class="meaning"><strong>of ours</strong></div></div>
</td>
</tr>
<tr>
<th>2nd</th>
<td class="conj-td">
<div id="P-2ms"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/shbxafq8ietx.mp3">&#128266;</span>
<span class="menukad">שֶׁלְּךָ</span>
</div></div><div class="meaning"><strong>of yours</strong> <em>m. sg.</em></div></div>
</td>
<td class="conj-td">
<div id="P-2fs"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/sh9ue3a8buo3.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּךְ</span>
</div></div><div class="meaning"><strong>of yours</strong> <em>f. sg.</em></div></div>
</td>
<td class="conj-td">
<div id="P-2mp"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/ol/olx8vzsctlzn.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּכֶם</span>
</div></div><div class="meaning"><strong>of yours</strong> <em>m. pl.</em></div></div>
</td>
<td class="conj-td">
<div id="P-2fp"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/ol/olxrms6dl8eq.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּכֶן</span>
</div></div><div class="meaning"><strong>of yours</strong> <em>f. pl.</em></div></div>
</td>
</tr>
<tr>
<th>3rd</th>
<td class="conj-td">
<div id="P-3ms"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/fk/fkp5qigelthg.mp3">&#128266;</span>
<span class="menukad">שֶׁלּוֹ</span>
</div></div><div class="meaning"><strong>of his</strong></div></div>
</td>
<td class="conj-td">
<div id="P-3fs"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/sh9w36hojm5w.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּהּ</span>
</div></div><div class="meaning"><strong>of hers</strong></div></div>
</td>
<td class="conj-td">
<div id="P-3mp"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/n9/n99z0jr8pint.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּהֶם</span>
</div></div><div class="meaning"><strong>of theirs</strong> <em>m.</em></div></div>
</td>
<td class="conj-td">
<div id="P-3fp"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/n9/n9ahrc59h52w.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּהֶן</span>
</div></div><div class="meaning"><strong>of theirs</strong> <em>f.</em></div></div>
</td>
</tr>
</tbody>
</table>
"""
PREPOSITION_VL_TABLE = """
<table class="table table-condensed conjugation-table">
<tbody>
<tr>
<th>1st</th>
<td colspan="2"><div id="P-1s"><div><div>
<span class="menukad">שלי</span>
</div></div></div></td>
<td colspan="2"><div id="P-1p"><div><div>
<span class="menukad">שלנו</span>
</div></div></div></td>
</tr>
<tr>
<th>2nd</th>
<td><div id="P-2ms"><div><div>
<span class="menukad">שלך</span>
</div></div></div></td>
<td><div id="P-2fs"><div><div>
<span class="menukad">שלך</span>
</div></div></div></td>
<td><div id="P-2mp"><div><div>
<span class="menukad">שלכם</span>
</div></div></div></td>
<td><div id="P-2fp"><div><div>
<span class="menukad">שלכן</span>
</div></div></div></td>
</tr>
<tr>
<th>3rd</th>
<td><div id="P-3ms"><div><div>
<span class="menukad">שלו</span>
</div></div></div></td>
<td><div id="P-3fs"><div><div>
<span class="menukad">שלה</span>
</div></div></div></td>
<td><div id="P-3mp"><div><div>
<span class="menukad">שלהם</span>
</div></div></div></td>
<td><div id="P-3fp"><div><div>
<span class="menukad">שלהן</span>
</div></div></div></td>
</tr>
</tbody>
</table>
"""
# Minimal full-page wrappers so _scrape_*_detail() can parse them
_ADJECTIVE_MO_PAGE = f"<html><body>{ADJECTIVE_MO_TABLE}</body></html>"
_ADJECTIVE_VL_PAGE = f"<html><body>{ADJECTIVE_VL_TABLE}</body></html>"
_PREPOSITION_MO_PAGE = f"<html><body>{PREPOSITION_MO_TABLE}</body></html>"
_PREPOSITION_VL_PAGE = f"<html><body>{PREPOSITION_VL_TABLE}</body></html>"
# ---------------------------------------------------------------------------
# Adjective table tests
# ---------------------------------------------------------------------------
class TestParseAdjectiveTable:
"""Tests for _parse_adjective_table (mo/nikkud page)."""
def test_returns_four_form_keys(self) -> None:
result = _parse_adjective_table(__import__("bs4").BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml"))
assert set(result.keys()) == {"ms", "fs", "mp", "fp"}
def test_ms_nikkud(self) -> None:
result = _parse_adjective_table(__import__("bs4").BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml"))
assert result["ms"]["nikkud"] == "אֲבִיבִי"
def test_fs_nikkud(self) -> None:
result = _parse_adjective_table(__import__("bs4").BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml"))
assert result["fs"]["nikkud"] == "אֲבִיבִית"
def test_mp_nikkud(self) -> None:
result = _parse_adjective_table(__import__("bs4").BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml"))
assert result["mp"]["nikkud"] == "אֲבִיבִיִּים"
def test_fp_nikkud(self) -> None:
result = _parse_adjective_table(__import__("bs4").BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml"))
assert result["fp"]["nikkud"] == "אֲבִיבִיּוֹת"
def test_audio_url_present(self) -> None:
result = _parse_adjective_table(__import__("bs4").BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml"))
assert result["ms"]["audio_url"].startswith("https://audio.pealim.com/")
def test_empty_on_missing_table(self) -> None:
result = _parse_adjective_table(__import__("bs4").BeautifulSoup("<html><body></body></html>", "lxml"))
assert result == {}
class TestParseAdjectiveTableVl:
"""Tests for _parse_adjective_table_vl (ktiv male page)."""
def test_returns_four_form_keys(self) -> None:
result = _parse_adjective_table_vl(__import__("bs4").BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml"))
assert set(result.keys()) == {"ms", "fs", "mp", "fp"}
def test_ms_ktiv(self) -> None:
result = _parse_adjective_table_vl(__import__("bs4").BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml"))
assert result["ms"] == "אביבי"
def test_fs_ktiv(self) -> None:
result = _parse_adjective_table_vl(__import__("bs4").BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml"))
assert result["fs"] == "אביבית"
def test_mp_ktiv(self) -> None:
result = _parse_adjective_table_vl(__import__("bs4").BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml"))
assert result["mp"] == "אביביים"
def test_fp_ktiv(self) -> None:
result = _parse_adjective_table_vl(__import__("bs4").BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml"))
assert result["fp"] == "אביביות"
# ---------------------------------------------------------------------------
# _scrape_adjective_detail tests
# ---------------------------------------------------------------------------
class TestScrapeAdjectiveDetail:
"""Tests for _scrape_adjective_detail — schema compliance."""
@pytest.fixture()
def result(self) -> dict:
return _scrape_adjective_detail("9098-avivi", _ADJECTIVE_MO_PAGE, _ADJECTIVE_VL_PAGE)
def test_returns_non_empty_dict(self, result: dict) -> None:
assert result
def test_ms_has_nikkud_and_ktiv(self, result: dict) -> None:
assert result["ms"]["nikkud"] == "אֲבִיבִי"
assert result["ms"]["ktiv_male"] == "אביבי"
def test_fs_has_nikkud_and_ktiv(self, result: dict) -> None:
assert result["fs"]["nikkud"] == "אֲבִיבִית"
assert result["fs"]["ktiv_male"] == "אביבית"
def test_mp_has_nikkud_and_ktiv(self, result: dict) -> None:
assert result["mp"]["nikkud"] == "אֲבִיבִיִּים"
assert result["mp"]["ktiv_male"] == "אביביים"
def test_fp_has_nikkud_and_ktiv(self, result: dict) -> None:
assert result["fp"]["nikkud"] == "אֲבִיבִיּוֹת"
assert result["fp"]["ktiv_male"] == "אביביות"
def test_mishkal_key_present(self, result: dict) -> None:
# mishkal may be None since no PoS section is in our minimal fixture
assert "mishkal" in result
def test_mishkal_hebrew_key_present(self, result: dict) -> None:
assert "mishkal_hebrew" in result
def test_all_schema_keys_present(self, result: dict) -> None:
expected = {"ms", "fs", "mp", "fp", "mishkal", "mishkal_hebrew"}
assert expected.issubset(result.keys())
def test_empty_on_no_table(self) -> None:
result = _scrape_adjective_detail("missing", "<html><body></body></html>", "<html><body></body></html>")
assert result == {}
# ---------------------------------------------------------------------------
# Preposition table tests
# ---------------------------------------------------------------------------
class TestParsePrepositionTable:
"""Tests for _parse_preposition_table (mo/nikkud page)."""
@pytest.fixture()
def result(self) -> dict:
return _parse_preposition_table(__import__("bs4").BeautifulSoup(PREPOSITION_MO_TABLE, "lxml"))
def test_returns_ten_form_keys(self, result: dict) -> None:
expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
assert set(result.keys()) == expected
def test_1s_nikkud(self, result: dict) -> None:
assert result["1s"]["nikkud"] == "שֶׁלִּי"
def test_1p_nikkud(self, result: dict) -> None:
assert result["1p"]["nikkud"] == "שֶׁלָּנוּ"
def test_2ms_nikkud(self, result: dict) -> None:
assert result["2ms"]["nikkud"] == "שֶׁלְּךָ"
def test_2fs_nikkud(self, result: dict) -> None:
assert result["2fs"]["nikkud"] == "שֶׁלָּךְ"
def test_2mp_nikkud(self, result: dict) -> None:
assert result["2mp"]["nikkud"] == "שֶׁלָּכֶם"
def test_2fp_nikkud(self, result: dict) -> None:
assert result["2fp"]["nikkud"] == "שֶׁלָּכֶן"
def test_3ms_nikkud(self, result: dict) -> None:
assert result["3ms"]["nikkud"] == "שֶׁלּוֹ"
def test_3fs_nikkud(self, result: dict) -> None:
assert result["3fs"]["nikkud"] == "שֶׁלָּהּ"
def test_3mp_nikkud(self, result: dict) -> None:
assert result["3mp"]["nikkud"] == "שֶׁלָּהֶם"
def test_3fp_nikkud(self, result: dict) -> None:
assert result["3fp"]["nikkud"] == "שֶׁלָּהֶן"
def test_audio_url_present(self, result: dict) -> None:
assert result["1s"]["audio_url"].startswith("https://audio.pealim.com/")
def test_empty_on_missing_table(self) -> None:
result = _parse_preposition_table(__import__("bs4").BeautifulSoup("<html><body></body></html>", "lxml"))
assert result == {}
class TestParsePrepositionTableVl:
"""Tests for _parse_preposition_table_vl (ktiv male page)."""
@pytest.fixture()
def result(self) -> dict:
return _parse_preposition_table_vl(__import__("bs4").BeautifulSoup(PREPOSITION_VL_TABLE, "lxml"))
def test_returns_ten_form_keys(self, result: dict) -> None:
expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
assert set(result.keys()) == expected
def test_1s_ktiv(self, result: dict) -> None:
assert result["1s"] == "שלי"
def test_1p_ktiv(self, result: dict) -> None:
assert result["1p"] == "שלנו"
def test_2ms_ktiv(self, result: dict) -> None:
assert result["2ms"] == "שלך"
def test_3ms_ktiv(self, result: dict) -> None:
assert result["3ms"] == "שלו"
def test_3fp_ktiv(self, result: dict) -> None:
assert result["3fp"] == "שלהן"
# ---------------------------------------------------------------------------
# _scrape_preposition_detail tests
# ---------------------------------------------------------------------------
class TestScrapePrepositionDetail:
"""Tests for _scrape_preposition_detail — schema compliance."""
@pytest.fixture()
def result(self) -> dict:
return _scrape_preposition_detail("2643-shel", _PREPOSITION_MO_PAGE, _PREPOSITION_VL_PAGE)
def test_returns_non_empty_dict(self, result: dict) -> None:
assert result
def test_all_ten_person_keys_present(self, result: dict) -> None:
expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
assert expected.issubset(result.keys())
def test_1s_has_nikkud_and_ktiv(self, result: dict) -> None:
assert result["1s"]["nikkud"] == "שֶׁלִּי"
assert result["1s"]["ktiv_male"] == "שלי"
def test_1p_has_nikkud_and_ktiv(self, result: dict) -> None:
assert result["1p"]["nikkud"] == "שֶׁלָּנוּ"
assert result["1p"]["ktiv_male"] == "שלנו"
def test_2ms_has_nikkud_and_ktiv(self, result: dict) -> None:
assert result["2ms"]["nikkud"] == "שֶׁלְּךָ"
assert result["2ms"]["ktiv_male"] == "שלך"
def test_3ms_has_nikkud_and_ktiv(self, result: dict) -> None:
assert result["3ms"]["nikkud"] == "שֶׁלּוֹ"
assert result["3ms"]["ktiv_male"] == "שלו"
def test_3fs_has_nikkud_and_ktiv(self, result: dict) -> None:
assert result["3fs"]["nikkud"] == "שֶׁלָּהּ"
assert result["3fs"]["ktiv_male"] == "שלה"
def test_3fp_has_nikkud_and_ktiv(self, result: dict) -> None:
assert result["3fp"]["nikkud"] == "שֶׁלָּהֶן"
assert result["3fp"]["ktiv_male"] == "שלהן"
def test_empty_on_no_table(self) -> None:
result = _scrape_preposition_detail("missing", "<html><body></body></html>", "<html><body></body></html>")
assert result == {}

View file

@ -42,3 +42,17 @@ def test_strip_nikkud_all_marks():
nikkud = "הַמַּלְכָּה"
plain = strip_nikkud(nikkud)
assert all(ch < "\u0591" or ch > "\u05c7" for ch in plain), f"Residual nikkud in: {plain}"
def test_categorize_pos_no_substring_match():
"""Regression: 'Pronoun' must NOT match 'Noun' category."""
from apkg_builder import _categorize_pos
assert _categorize_pos("Noun") == "Noun"
assert _categorize_pos("Verb") == "Verb"
assert _categorize_pos("Adjective") == "Adjective"
assert _categorize_pos("Adverb") == "Adverb"
assert _categorize_pos("Pronoun") == "Other", "Pronoun must not match Noun"
assert _categorize_pos("Preposition") == "Other"
assert _categorize_pos("Conjunction") == "Other"
assert _categorize_pos("Cardinal numeral") == "Other"