Compare commits

..

No commits in common. "master" and "v0.14" have entirely different histories.

52 changed files with 64788 additions and 2454714 deletions

View file

@ -1,26 +0,0 @@
{
"hooks": {
"PostToolUse": [
{
"matcher": "Edit|Write",
"hooks": [
{
"type": "command",
"command": "file=\"$CLAUDE_FILE_PATH\"; if [ -n \"$file\" ] && echo \"$file\" | grep -q '\\.py$'; then ruff format --quiet \"$file\" && ruff check --fix --quiet \"$file\" 2>/dev/null; fi"
}
]
}
],
"PreToolUse": [
{
"matcher": "Edit|Write",
"hooks": [
{
"type": "command",
"command": "file=\"$CLAUDE_FILE_PATH\"; if echo \"$file\" | grep -qE '(legacy_guid_map\\.json|\\.env)$'; then echo 'BLOCKED: Protected file — legacy_guid_map.json and .env are read-only' >&2; exit 2; fi"
}
]
}
]
}
}

9
.gitignore vendored
View file

@ -15,7 +15,6 @@ __pycache__/
# Large generated cache files (rebuild locally)
data/benyehuda_index.json
data/colliding_forms.json
# Audio directories (large; rebuild locally)
data/audio/
@ -48,14 +47,6 @@ data/epubs/
# Stray deck files
Everything__*.apkg
*.apkg
# Legacy CSV files (replaced by data/words.json)
*.csv
data/*.csv
# Dead whitelist files
vulture_whitelist.py
# Release artifacts — distributed via Forgejo releases, not committed to tree
releases/

View file

@ -56,7 +56,7 @@ Fields on each card:
| Plural form | for nouns: רבים: שֻׁלְחָנוֹת |
| Disambiguation hint | for ambiguous Eng→Heb cards |
Cards are presented in **frequency order** — Anki will show you the most common words first. Note that because the frequency data is collected from text without nikkud, words that share the same letters but differ in nikkud are assigned the same frequency.
Cards are presented in **frequency order** — Anki will show you the most common words first.
### Eng→Heb disambiguation

View file

@ -1,192 +0,0 @@
# Hebrew Flash Cards — Unified Data Schema (words.json)
# Revised based on Nevo's feedback (2026-03-08)
#
# Top-level: dict keyed by unique_key
# Unique key: nikkud word for most entries (e.g. "אָב")
# For 146 homographs (same nikkud, different meaning): "word|pos" e.g. "אָח|Noun"
# For same nikkud AND same pos: "word|pos|meaning" e.g. "אָח|Noun|brother"
#
# Hebrew text fields use nikkud/ktiv_male subfields:
# field:
# nikkud: "אָב" # with nikkud (hebstyle=mo)
# ktiv_male: "אב" # plene spelling (hebstyle=vl)
# This pattern applies to: word, singular, plural, construct forms, conjugated forms, etc.
#
# Pronoun notation for conjugation forms uses grammatical codes:
# 1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp
# (not Hebrew pronoun strings, which are ambiguous for gender in some persons)
entry:
# --- Core Identity ---
word:
nikkud: "אָב"
ktiv_male: "אב"
slug: "6009-av" # Pealim URL slug (e.g. pealim.com/dict/6009-av/)
root: ["א", "ב"] # Shoresh as list of consonant chars
pos: "Noun" # Part of speech in English (as from pealim)
pos_hebrew: "שֵׁם עֶצֶם" # Part of speech in Hebrew (with nikkud)
meaning: "father" # English meaning (cleaned — no inline emoji, no Hebrew prepositions)
meaning_raw: "father 👨" # Original meaning as scraped (may contain emoji and/or Hebrew preps)
prep: "על" # Hebrew preposition(s) governing this word, extracted from meaning_raw (e.g. "(על)" → "על"); null if none
audio_url: "https://..." # Pealim audio URL
audio_file: "6009-av.mp3" # Local filename (slug-based for confusables, consonant-based otherwise)
tags: "" # Pealim tags if any
last_scrape_date: "2026-03-08" # ISO date of most recent pealim.com scrape for this entry
# --- Identity & Progress ---
vocab_legacy_guid: "abc123..." # Vocab note GUID from legacy_guid_map.json
# Other note GUIDs stored in their respective sections (cloze, plurals, conjugation)
# --- Frequency ---
frequency: 412 # Hebrew frequency rank from hermitdave/FrequencyWords he_50k (ktiv male based)
pseudo_frequency: null # Adjusted frequency for confusable homographs (deferred to future sprint)
# --- Display Enrichment ---
emoji: "👨"
emoji_source: "ai_vetted" # One of: ai_vetted, from_pealim, null
emoji_visible: false # Whether to show on cards (false until emoji vetting is done)
image: "father.jpg" # Wikipedia/Commons image filename, or null
image_source: "wikipedia" # One of: wikipedia, commons, null
hint: "" # Eng→Heb disambiguation hint (from refined_meanings.json)
# --- Shared Roots ---
shared_roots: [] # List of unique_keys of other words sharing the same root
# Computed by iterating all entries and grouping by root
# --- Confusables ---
confusable_group: null # List of unique_keys sharing same ktiv_male, or null
# e.g. ["אָח|Noun|brother", "אָח|Noun|fireplace"]
# --- Example Sentences ---
examples:
vetted: # AI-vetted sentences from Ben Yehuda / EPUB corpus
- text: "הָאָב הָלַךְ לַעֲבוֹדָה"
source: "ben_yehuda" # One of: ben_yehuda, epub_little_prince, epub_alice, ...
vetted: true
cloze: # Best sentence for cloze card, or null
text: "הָאָב הָלַךְ לַעֲבוֹדָה"
cloze_word_start: 0 # Character offset of the clozed word in text
cloze_word_end: 4 # End offset — enables exact extraction regardless of nikkud changes
cloze_hint: "family member"
cloze_guid: "def456..." # GUID for the cloze note
difficulty_score: 234 # Median frequency rank of context words (lower = easier); optional
rejected_count: 0
# --- Noun-specific: Inflection Forms ---
noun_inflection: null # null for non-nouns
# When populated:
# plurals_guid: "ghi789..." # GUID for plurals deck note
# singular: # null if noun is inherently plural (e.g. bicycle/אופניים)
# nikkud: "אָב"
# ktiv_male: "אב"
# plural:
# nikkud: "אָבוֹת"
# ktiv_male: "אבות"
# singular_audio: "6009-av.mp3"
# plural_audio: null # TODO: scrape from detail pages
# construct_singular:
# nikkud: "אֲבִי"
# ktiv_male: "אבי"
# construct_plural:
# nikkud: "אֲבוֹת"
# ktiv_male: "אבות"
# pronominal_suffixes: # Scraped from pealim "forms with pronominal affixes" section
# 1s:
# nikkud: "אָבִי"
# ktiv_male: "אבי"
# 1p:
# nikkud: "אָבִינוּ"
# ktiv_male: "אבינו"
# 2ms: ...
# 2fs: ...
# 2mp: ...
# 2fp: ...
# 3ms: ...
# 3fs: ...
# 3mp: ...
# 3fp: ...
# gender: "masculine"
# gender_hebrew:
# nikkud: "זָכָר"
# ktiv_male: "זכר"
# mishkal: "CaCaC" # English mishkal name (scraped from pealim PoS section)
# mishkal_hebrew: "קָטָל" # Hebrew mishkal name (computed via mapping)
# --- Verb-specific: Conjugation Data ---
conjugation: null # null for non-verbs
# When populated:
# in_conjugation_deck: true # Whether this verb is in the 71-verb conjugation deck
# infinitive:
# nikkud: "לִשְׁמֹר"
# ktiv_male: "לשמור"
# reference_form: # 3ms past (the citation form)
# nikkud: "שָׁמַר"
# ktiv_male: "שמר"
# binyan: "Pa'al" # English binyan name
# binyan_hebrew: "פָּעַל" # Hebrew binyan name (with nikkud)
# prep: "על" # Hebrew preposition the verb takes, or null
# active_forms:
# - person: "1s" # Grammatical code: 1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp
# tense: "עָבָר"
# form:
# nikkud: "שָׁמַרְתִּי"
# ktiv_male: "שמרתי"
# audio_url: "https://..."
# audio_file: null # For future use
# hufal_pual_forms: null # Same structure as active_forms; non-null only for hif'il/pi'el verbs
# # When non-null, binyan MUST be Hif'il or Pi'el (validated)
# reference_form_passive: # 3ms past of the huf'al/pu'al counterpart, or null
# nikkud: "שֻׁמַּר"
# ktiv_male: "שומר"
# --- Adjective-specific ---
adjective_inflection: null # null for non-adjectives
# When populated:
# ms:
# nikkud: "גָּדוֹל"
# ktiv_male: "גדול"
# fs:
# nikkud: "גְּדוֹלָה"
# ktiv_male: "גדולה"
# mp:
# nikkud: "גְּדוֹלִים"
# ktiv_male: "גדולים"
# fp:
# nikkud: "גְּדוֹלוֹת"
# ktiv_male: "גדולות"
# mishkal: "CaCaC" # English mishkal name (scraped from pealim PoS section)
# mishkal_hebrew: "קָטָל" # Hebrew mishkal name (computed via mapping)
# --- Preposition-specific ---
preposition_inflection: null # null for non-prepositions
# When populated:
# 1s:
# nikkud: "שֶׁלִּי"
# ktiv_male: "שלי"
# 1p:
# nikkud: "שֶׁלָּנוּ"
# ktiv_male: "שלנו"
# 2ms:
# nikkud: "שֶׁלְּךָ"
# ktiv_male: "שלך"
# 2fs:
# nikkud: "שֶׁלָּךְ"
# ktiv_male: "שלך"
# 2mp:
# nikkud: "שֶׁלָּכֶם"
# ktiv_male: "שלכם"
# 2fp:
# nikkud: "שֶׁלָּכֶן"
# ktiv_male: "שלכן"
# 3ms:
# nikkud: "שֶׁלּוֹ"
# ktiv_male: "שלו"
# 3fs:
# nikkud: "שֶׁלָּהּ"
# ktiv_male: "שלה"
# 3mp:
# nikkud: "שֶׁלָּהֶם"
# ktiv_male: "שלהם"
# 3fp:
# nikkud: "שֶׁלָּהֶן"
# ktiv_male: "שלהן"

File diff suppressed because it is too large Load diff

196
benyehuda.py Normal file
View file

@ -0,0 +1,196 @@
#!/usr/bin/env python3
"""
Ben Yehuda corpus example-sentence lookup (nikkud corpus).
Downloads the nikkud-bearing plaintext ZIP once, indexes sentences by nikkud word form,
then answers queries locally.
Exposed API:
load(force_rebuild=False)
get_examples(word_nikkud) -> list[str] (returns 0 or 1 examples)
save_examples_cache()
"""
import json
import logging
import re
import zipfile
from io import BytesIO
from pathlib import Path
import requests
from helpers import strip_nikkud as _strip_nikkud
logger = logging.getLogger(__name__)
# Nikkud-bearing corpus (txt.zip instead of txt_stripped.zip)
CORPUS_URL = "https://github.com/projectbenyehuda/public_domain_dump/releases/download/2025-10/txt.zip"
# On-disk JSON files, both stored in the "data" directory next to this module.
INDEX_PATH = Path(__file__).parent / "data" / "benyehuda_index.json"
EXAMPLES_CACHE_PATH = Path(__file__).parent / "data" / "examples_cache.json"
# Passed as the `timeout` argument of the corpus download request in load().
REQUEST_TIMEOUT = 120
# Inclusive length bounds (in characters) applied to sentences when splitting
# the corpus and again when selecting examples.
MIN_SENTENCE_LEN = 20
MAX_SENTENCE_LEN = 200
MAX_INDEX_ENTRIES = 500 # cap examples kept per word in index to limit memory
# Module-level state
_index: dict[str, list[str]] = {} # word (with nikkud) -> [sentence, ...]
# word -> cached get_examples() result; read from EXAMPLES_CACHE_PATH by load()
# and written back by save_examples_cache().
_examples_cache: dict[str, list[str]] = {}
def _split_sentences(text: str) -> list[str]:
    """
    Break *text* into candidate sentences, one per input line.

    Each line is trimmed of surrounding whitespace, then of surrounding
    quote/punctuation characters, then of whitespace once more. A line is kept
    only when its trimmed length lies within
    [MIN_SENTENCE_LEN, MAX_SENTENCE_LEN] (inclusive).
    """
    trimmed_lines = (
        line.strip().strip("\"'.,;:!?").strip() for line in text.split("\n")
    )
    return [
        candidate
        for candidate in trimmed_lines
        if MIN_SENTENCE_LEN <= len(candidate) <= MAX_SENTENCE_LEN
    ]
def _build_index(corpus_zip_bytes: bytes) -> None:
    """
    Rebuild the module-level index from the raw bytes of the corpus ZIP.

    Every ``.txt`` member is decoded as UTF-8 (undecodable bytes dropped) and
    split into sentences. Each distinct token of at least two characters found
    in a sentence becomes an index key pointing at that sentence, with at most
    MAX_INDEX_ENTRIES sentences stored per key. Members that cannot be read are
    skipped.
    """
    global _index
    _index = {}
    logger.info("Building Ben Yehuda index from nikkud corpus …")
    # Hebrew letters, nikkud marks, and quote characters form one token.
    token_re = re.compile(r"[\u05d0-\u05ea\u05b0-\u05c7'\"]+")
    with zipfile.ZipFile(BytesIO(corpus_zip_bytes)) as archive:
        members = [name for name in archive.namelist() if name.endswith(".txt")]
        logger.info(f" Corpus contains {len(members)} text files")
        for member in members:
            try:
                content = archive.read(member).decode("utf-8", errors="ignore")
            except Exception:  # noqa: S112
                continue
            for sentence in _split_sentences(content):
                # De-duplicate tokens so a sentence is stored once per word form.
                for token in set(token_re.findall(sentence)):
                    if len(token) < 2:
                        continue
                    stored = _index.setdefault(token, [])
                    if len(stored) < MAX_INDEX_ENTRIES:
                        stored.append(sentence)
    logger.info(f"Index built: {len(_index)} unique word forms")
def _save_index() -> None:
    """Write the in-memory index to INDEX_PATH as UTF-8 JSON, creating its directory if missing."""
    target_dir = INDEX_PATH.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with INDEX_PATH.open("w", encoding="utf-8") as out:
        json.dump(_index, out, ensure_ascii=False)
    logger.info(f"Ben Yehuda index saved → {INDEX_PATH}")
def _load_index() -> None:
    """Replace the module-level index with the JSON content of INDEX_PATH."""
    global _index
    with INDEX_PATH.open(encoding="utf-8") as src:
        loaded = json.load(src)
    _index = loaded
    logger.info(f"Ben Yehuda index loaded: {len(_index)} word forms")
def load(force_rebuild: bool = False) -> None:
    """
    Load or build the Ben Yehuda index, downloading the corpus if needed.

    Args:
        force_rebuild: When True, delete any index file on disk, discard the
            in-memory examples cache, then download the corpus and rebuild the
            index. When False, return immediately if an index is already in
            memory; otherwise read the persisted examples cache (if present)
            and the index file (if present), and only download and build when
            no index file exists.

    Raises:
        requests.RequestException: If the corpus download fails, including a
            non-success HTTP status reported by ``raise_for_status()``.
    """
    global _examples_cache
    if _index and not force_rebuild:
        return
    if force_rebuild:
        # Delete old index and discard examples cache
        if INDEX_PATH.exists():
            INDEX_PATH.unlink()
            logger.info("Deleted old Ben Yehuda index (force rebuild)")
        _examples_cache = {}
    else:
        # Load persisted examples cache (not needed on rebuild)
        if EXAMPLES_CACHE_PATH.exists():
            with open(EXAMPLES_CACHE_PATH, encoding="utf-8") as f:
                _examples_cache = json.load(f)
        if INDEX_PATH.exists():
            _load_index()
            return
    logger.info("Downloading Ben Yehuda nikkud corpus … (this may take 2-3 minutes)")
    # A streamed response holds its connection until the body is consumed or
    # the response is closed; the context manager guarantees the release even
    # when raise_for_status() raises before the body is read.
    with requests.get(CORPUS_URL, timeout=REQUEST_TIMEOUT, stream=True) as resp:
        resp.raise_for_status()
        data = resp.content
    logger.info(f"Corpus downloaded: {len(data) / 1e6:.1f} MB")
    _build_index(data)
    _save_index()
def save_examples_cache() -> None:
    """Persist the examples cache to EXAMPLES_CACHE_PATH as UTF-8 JSON, creating its directory if missing."""
    cache_dir = EXAMPLES_CACHE_PATH.parent
    cache_dir.mkdir(parents=True, exist_ok=True)
    with EXAMPLES_CACHE_PATH.open("w", encoding="utf-8") as out:
        json.dump(_examples_cache, out, ensure_ascii=False)
    logger.info(f"Examples cache saved: {len(_examples_cache)} entries → {EXAMPLES_CACHE_PATH}")
def get_examples(word_nikkud: str) -> list[str]:
    """
    Return zero or one example sentence for *word_nikkud*.

    Lookup strategy:
    1. Use the sentences stored under the exact nikkud form, if any.
    2. Otherwise use the sentences of the first index key whose
       nikkud-stripped form equals the stripped query.

    Candidates are kept only if the stripped word occurs as a whole token in
    the stripped sentence and the sentence length lies within
    [MIN_SENTENCE_LEN, MAX_SENTENCE_LEN]. The single longest sentence
    ≤ MAX_SENTENCE_LEN is returned in a one-element list; an empty list means
    nothing matched. Results are memoised per (whitespace-trimmed) word.
    """
    if not _index:
        load()

    word = word_nikkud.strip()
    bare_word = _strip_nikkud(word)
    if word in _examples_cache:
        return _examples_cache[word]

    # Exact nikkud hit first; fall back to the first key with the same bare form.
    pool = _index.get(word, [])
    if not pool and bare_word:
        pool = next(
            (sentences for key, sentences in _index.items() if _strip_nikkud(key) == bare_word),
            pool,
        )

    # Whole-token filter, done on stripped text so nikkud variants still match.
    if bare_word:
        token_re = re.compile(r"(?<!\w)" + re.escape(bare_word) + r"(?!\w)")
        hits = [sentence for sentence in pool if token_re.search(_strip_nikkud(sentence))]
    else:
        hits = pool[:]

    hits = [sentence for sentence in hits if MIN_SENTENCE_LEN <= len(sentence) <= MAX_SENTENCE_LEN]

    result = [max(hits, key=len)] if hits else []
    _examples_cache[word] = result
    return result
if __name__ == "__main__":
    # Manual smoke test: load the index, print the example found for a few
    # common words, then persist the examples cache.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    load()
    for sample in ("שָׁלוֹם", "בַּיִת", "סֵפֶר", "מַיִם", "אַהֲבָה", "יֶלֶד"):
        found = get_examples(sample)
        print(f"\n{sample}: {len(found)} example(s)")
        for sentence in found:
            print(f"{sentence[:100]}")
    save_examples_cache()

View file

@ -1,110 +0,0 @@
<!DOCTYPE html>
<html dir="rtl">
<head>
<meta charset="utf-8">
<style>
body { font-family: 'Heebo', 'Arial Hebrew', sans-serif; background: #fff; max-width: 600px; margin: 20px auto; }
.card-container { border: 1px solid #ccc; border-radius: 8px; margin: 20px 0; overflow: hidden; }
.card-label { background: #333; color: #fff; padding: 6px 12px; font-size: 14px; font-family: sans-serif; direction: ltr; }
.card-content { padding: 16px; text-align: center; }
.card-content hr { border: none; border-top: 1px solid #ccc; margin: 12px 0; }
.hebrew { font-size: 48px; font-weight: bold; color: #222; direction: rtl; text-align: center; }
.hebrew-sm { font-size: 28px; font-weight: normal; color: #222; direction: rtl; }
.meaning { font-size: 28px; color: #1a1a8c; text-align: center; direction: ltr; margin: 4px 0; }
.emoji-img { font-size: 48px; text-align: center; margin: 4px 0; }
.divider { border-top: 1px solid #ccc; margin: 8px 0; }
.sec-table { display: table; margin: 6px auto 0; direction: rtl; border-collapse: collapse; }
.sec-label { display: table-row; font-size: 28px; font-weight: normal; color: #222; direction: rtl; }
.sec-key { display: table-cell; font-size: 28px; color: #222; font-weight: bold; text-align: right; padding: 2px 0 2px 8px; white-space: nowrap; }
.sec-val { display: table-cell; font-size: 28px; color: #222; text-align: right; padding: 2px 0; }
.hint { font-size: 22px; color: #555; margin: 4px 0; direction: rtl; text-align: center; }
.example { font-size: 24px; color: #222; padding: 6px 8px; direction: rtl; text-align: center; border-left: 3px solid #ccc; font-style: italic; margin: 6px auto; max-width: 90%; }
.voice-label { font-size: 20px; color: #888; }
.more-toggle { text-align: center; direction: rtl; margin-top: 8px; }
.more-header {
display: inline-block; font-size: 18px; color: #555; cursor: pointer; list-style: none;
border: 1px solid #ccc; border-radius: 16px; padding: 4px 16px; margin: 4px 0; background: #f8f8f8;
}
.more-header::-webkit-details-marker { display: none; }
.more-header::before { content: "○ "; font-size: 14px; }
details[open] > .more-header::before { content: "● "; }
.related-header { font-size: 22px; color: #555; text-align: center; margin: 4px 0; }
.rw-word { display: table-cell; font-size: 28px; color: #222; font-weight: normal; text-align: right; padding: 2px 0 2px 8px; white-space: nowrap; }
.rw-meaning { display: table-cell; font-size: 24px; color: #555; text-align: left; direction: ltr; padding: 2px 0; }
</style>
</head>
<body>
<h2 style="font-family:sans-serif;direction:ltr;">Vocab: English → Hebrew (BACK) — collapsed</h2>
<div class="card-container">
<div class="card-label">English → Hebrew — Back (default: collapsed)</div>
<div class="card-content">
<div class="meaning">time (occasion), time round; once (when used as an adverb)</div>
<div class="emoji-img">📍</div>
<div class="divider"></div>
<div class="hebrew">פַּעַם</div>
<details class="more-toggle"><summary class="more-header">מידע נוסף</summary>
<div class="sec-table">
<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span><span class="sec-val">פעם</span></div>
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">פ.ע.ם</span></div>
<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span><span class="sec-val">שֵׁם עֶצֶם, נְקֵבָה</span></div>
<div class="sec-label"><span class="sec-key">רַבִּים:</span><span class="sec-val">פְּעָמִים</span></div>
</div>
<div class="divider" style="margin:6px 0;"></div>
<div class="related-header">מִילִים קְשׁוּרוֹת</div>
<div class="sec-table">
<div class="sec-label"><span class="rw-word">פַּעְמַיִם</span><span class="rw-meaning">twice, two times</span></div>
<div class="sec-label"><span class="rw-word">לְפַעֵם</span><span class="rw-meaning">to surge (feeling, emotion)</span></div>
<div class="sec-label"><span class="rw-word">פַּעֲמוֹן</span><span class="rw-meaning">bell</span></div>
<div class="sec-label"><span class="rw-word">פְּעִימָה</span><span class="rw-meaning">heartbeat; beat; stroke (technolo…</span></div>
<div class="sec-label"><span class="rw-word">לִפְעֹם</span><span class="rw-meaning">to beat, to pulse, to throb</span></div>
<div class="sec-label"><span class="rw-word">לְהִתְפַּעֵם</span><span class="rw-meaning">to be excited (emotionally)</span></div>
<div class="sec-label"><span class="rw-word">לְהַפְעִים</span><span class="rw-meaning">to excite, to agitate (lit.)</span></div>
<div class="sec-label"><span class="rw-word">לְהִפָּעֵם</span><span class="rw-meaning">to be excited, to be thrilled</span></div>
</div>
</details>
</div>
</div>
<h2 style="font-family:sans-serif;direction:ltr;">Same card — EXPANDED</h2>
<div class="card-container">
<div class="card-label">English → Hebrew — Back (expanded)</div>
<div class="card-content">
<div class="meaning">time (occasion), time round; once (when used as an adverb)</div>
<div class="emoji-img">📍</div>
<div class="divider"></div>
<div class="hebrew">פַּעַם</div>
<details class="more-toggle" open><summary class="more-header">מידע נוסף</summary>
<div class="sec-table">
<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span><span class="sec-val">פעם</span></div>
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">פ.ע.ם</span></div>
<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span><span class="sec-val">שֵׁם עֶצֶם, נְקֵבָה</span></div>
<div class="sec-label"><span class="sec-key">רַבִּים:</span><span class="sec-val">פְּעָמִים</span></div>
</div>
<div class="divider" style="margin:6px 0;"></div>
<div class="related-header">מִילִים קְשׁוּרוֹת</div>
<div class="sec-table">
<div class="sec-label"><span class="rw-word">פַּעְמַיִם</span><span class="rw-meaning">twice, two times</span></div>
<div class="sec-label"><span class="rw-word">לְפַעֵם</span><span class="rw-meaning">to surge (feeling, emotion)</span></div>
<div class="sec-label"><span class="rw-word">פַּעֲמוֹן</span><span class="rw-meaning">bell</span></div>
<div class="sec-label"><span class="rw-word">פְּעִימָה</span><span class="rw-meaning">heartbeat; beat; stroke (technolo…</span></div>
<div class="sec-label"><span class="rw-word">לִפְעֹם</span><span class="rw-meaning">to beat, to pulse, to throb</span></div>
<div class="sec-label"><span class="rw-word">לְהִתְפַּעֵם</span><span class="rw-meaning">to be excited (emotionally)</span></div>
<div class="sec-label"><span class="rw-word">לְהַפְעִים</span><span class="rw-meaning">to excite, to agitate (lit.)</span></div>
<div class="sec-label"><span class="rw-word">לְהִפָּעֵם</span><span class="rw-meaning">to be excited, to be thrilled</span></div>
</div>
</details>
</div>
</div>
</body>
</html>

View file

@ -1,114 +0,0 @@
<!DOCTYPE html>
<html dir="rtl">
<head>
<meta charset="utf-8">
<style>
body { font-family: 'Heebo', 'Arial Hebrew', sans-serif; background: #fff; max-width: 600px; margin: 20px auto; }
.card-container { border: 1px solid #ccc; border-radius: 8px; margin: 20px 0; overflow: hidden; }
.card-label { background: #333; color: #fff; padding: 6px 12px; font-size: 14px; font-family: sans-serif; direction: ltr; }
.card-content { padding: 16px; text-align: center; }
.card-content hr { border: none; border-top: 1px solid #ccc; margin: 12px 0; }
.hebrew { font-size: 48px; font-weight: bold; color: #222; direction: rtl; text-align: center; }
.hebrew-sm { font-size: 28px; font-weight: normal; color: #222; direction: rtl; }
.meaning { font-size: 28px; color: #1a1a8c; text-align: center; direction: ltr; margin: 4px 0; }
.hint { font-size: 22px; color: #555; margin: 4px 0; direction: rtl; text-align: center; }
.divider { border-top: 1px solid #ccc; margin: 8px 0; }
.sec-table { display: table; margin: 6px auto 0; direction: rtl; border-collapse: collapse; }
.sec-label { display: table-row; font-size: 28px; font-weight: normal; color: #222; direction: rtl; }
.sec-key { display: table-cell; font-size: 28px; color: #222; font-weight: bold; text-align: right; padding: 2px 0 2px 8px; white-space: nowrap; }
.sec-val { display: table-cell; font-size: 28px; color: #222; text-align: right; padding: 2px 0; }
.voice-label { font-size: 20px; color: #888; }
.more-toggle { text-align: center; direction: rtl; margin-top: 8px; }
.more-header {
display: inline-block; font-size: 18px; color: #555; cursor: pointer; list-style: none;
border: 1px solid #ccc; border-radius: 16px; padding: 4px 16px; margin: 4px 0; background: #f8f8f8;
}
.more-header::-webkit-details-marker { display: none; }
.more-header::before { content: "○ "; font-size: 14px; }
details[open] > .more-header::before { content: "● "; }
.related-header { font-size: 22px; color: #555; text-align: center; margin: 4px 0; }
.rw-word { display: table-cell; font-size: 28px; color: #222; font-weight: normal; text-align: right; padding: 2px 0 2px 8px; white-space: nowrap; }
.rw-meaning { display: table-cell; font-size: 24px; color: #555; text-align: left; direction: ltr; padding: 2px 0; }
</style>
</head>
<body>
<h2 style="font-family:sans-serif;direction:ltr;">Conjugation Card — FRONT</h2>
<div class="card-container">
<div class="card-label">Front</div>
<div class="card-content">
<div class="hint">אֵיךְ אוֹמְרִים</div>
<div class="hebrew">אַתָּה</div>
<div class="hebrew" style="color:#1a1a8c;">לִשְׁמֹר <span class="hebrew-sm">(על)</span></div>
<div class="hebrew">בַּהוֹוֶה</div>
</div>
</div>
<h2 style="font-family:sans-serif;direction:ltr;">Conjugation Card — BACK (collapsed)</h2>
<div class="card-container">
<div class="card-label">Back — default state</div>
<div class="card-content">
<div class="hint">אֵיךְ אוֹמְרִים</div>
<div class="hebrew">אַתָּה</div>
<div class="hebrew" style="color:#1a1a8c;">לִשְׁמֹר <span class="hebrew-sm">(על)</span></div>
<div class="hebrew">בַּהוֹוֶה</div>
<hr>
<div class="hebrew">שׁוֹמֵר (על)</div>
<details class="more-toggle"><summary class="more-header">מידע נוסף</summary>
<div class="sec-label" style="text-align:center;display:block;">to guard; to keep, to maintain</div>
<div class="sec-table">
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">שׁ.מ.ר</span></div>
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span><span class="sec-val">פָּעַל</span></div>
</div>
<div class="divider" style="margin:6px 0;"></div>
<div class="related-header">מִילִים קְשׁוּרוֹת</div>
<div class="sec-table">
<div class="sec-label"><span class="rw-word">מִשְׁמָר</span><span class="rw-meaning">guard, watch; shift</span></div>
<div class="sec-label"><span class="rw-word">שׁוֹמֵר</span><span class="rw-meaning">guard, watchman</span></div>
<div class="sec-label"><span class="rw-word">שְׁמִירָה</span><span class="rw-meaning">guarding, watching</span></div>
<div class="sec-label"><span class="rw-word">לְהִשָּׁמֵר</span><span class="rw-meaning">to beware, to be careful</span></div>
</div>
</details>
</div>
</div>
<h2 style="font-family:sans-serif;direction:ltr;">Conjugation Card — BACK (expanded)</h2>
<div class="card-container">
<div class="card-label">Back — expanded</div>
<div class="card-content">
<div class="hint">אֵיךְ אוֹמְרִים</div>
<div class="hebrew">אַתָּה</div>
<div class="hebrew" style="color:#1a1a8c;">לִשְׁמֹר <span class="hebrew-sm">(על)</span></div>
<div class="hebrew">בַּהוֹוֶה</div>
<hr>
<div class="hebrew">שׁוֹמֵר (על)</div>
<details class="more-toggle" open><summary class="more-header">מידע נוסף</summary>
<div class="sec-label" style="text-align:center;display:block;">to guard; to keep, to maintain</div>
<div class="sec-table">
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">שׁ.מ.ר</span></div>
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span><span class="sec-val">פָּעַל</span></div>
</div>
<div class="divider" style="margin:6px 0;"></div>
<div class="related-header">מִילִים קְשׁוּרוֹת</div>
<div class="sec-table">
<div class="sec-label"><span class="rw-word">מִשְׁמָר</span><span class="rw-meaning">guard, watch; shift</span></div>
<div class="sec-label"><span class="rw-word">שׁוֹמֵר</span><span class="rw-meaning">guard, watchman</span></div>
<div class="sec-label"><span class="rw-word">שְׁמִירָה</span><span class="rw-meaning">guarding, watching</span></div>
<div class="sec-label"><span class="rw-word">לְהִשָּׁמֵר</span><span class="rw-meaning">to beware, to be careful</span></div>
</div>
</details>
</div>
</div>
</body>
</html>

690
conjugation_extract.py Executable file
View file

@ -0,0 +1,690 @@
#!/usr/bin/env python3
"""
Extract Hebrew verb conjugations from pealim.com.
Input: verbs_input.txt (one Hebrew infinitive per line;
lines starting with '# 3ms:' search by 3ms past form for Pu'al/Huf'al)
Output: data/conjugations.json
For each verb:
1. Search pealim.com/search/?q=<verb> to find URL slug
2. Fetch /dict/<slug>/ with hebstyle=mo cookie
3. Parse conjugation table by row labels
4. Capture audio URLs per form
5. Parse passive (Pu'al/Huf'al) forms from the same page
Resume-safe: verbs already in conjugations.json are skipped.
"""
import json
import logging
import re
import time
import urllib.parse
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from helpers import strip_nikkud as _strip_nikkud
logger = logging.getLogger(__name__)
# Base URL used to build pealim.com search requests.
PEALIM_BASE = "https://www.pealim.com"
# NOTE(review): presumably the pause between requests, in seconds — its use is
# not visible in this part of the file; confirm against the caller.
REQUEST_DELAY = 1.5
# Passed as the `timeout` argument of session.get() calls.
REQUEST_TIMEOUT = 15
# Input/output files, resolved relative to this module's directory.
VERBS_INPUT = Path(__file__).parent / "verbs_input.txt"
CONJUGATIONS_PATH = Path(__file__).parent / "data" / "conjugations.json"
# Dictionary CSV used for the PoS lookup: the first of the two candidate paths
# that exists when the module is imported; falls back to
# data/hebrew_dict_for_anki.csv when neither exists.
DICT_CSV = next(
    (
        p
        for p in [
            Path(__file__).parent / "data" / "hebrew_dict_for_anki.csv",
            Path(__file__).parent / "data" / "pealim_dict_for_anki.csv",
        ]
        if p.exists()
    ),
    Path(__file__).parent / "data" / "hebrew_dict_for_anki.csv",
)
# Pronoun labels (for card front display)
# Keys are form keys of the shape "<tense>_<person>" plus "infinitive".
# Present-tense keys and "infinitive" map to an empty label. Past tense has a
# single third-person-plural key (past_3p) whose label covers both genders,
# whereas future tense has separate 3mp / 3fp keys.
PRONOUN_LABELS = {
    "present_ms": "",
    "present_fs": "",
    "present_mp": "",
    "present_fp": "",
    "past_1s": "אֲנִי",
    "past_1p": "אֲנַחְנוּ",
    "past_2ms": "אַתָּה",
    "past_2fs": "אַתְּ",
    "past_2mp": "אַתֶּם",
    "past_2fp": "אַתֶּן",
    "past_3ms": "הוּא",
    "past_3fs": "הִיא",
    "past_3p": "הֵם / הֵן",
    "future_1s": "אֲנִי",
    "future_1p": "אֲנַחְנוּ",
    "future_2ms": "אַתָּה",
    "future_2fs": "אַתְּ",
    "future_2mp": "אַתֶּם",
    "future_2fp": "אַתֶּן",
    "future_3ms": "הוּא",
    "future_3fs": "הִיא",
    "future_3mp": "הֵם",
    "future_3fp": "הֵן",
    "imperative_ms": "אַתָּה",
    "imperative_fs": "אַתְּ",
    "imperative_mp": "אַתֶּם",
    "imperative_fp": "אַתֶּן",
    "infinitive": "",
}
# Human-readable tense description for card front
# Uses the same form keys as PRONOUN_LABELS; every key of one tense maps to
# the same Hebrew tense name.
TENSE_DESCRIPTION = {
    "present_ms": "הוֹוֶה",
    "present_fs": "הוֹוֶה",
    "present_mp": "הוֹוֶה",
    "present_fp": "הוֹוֶה",
    "past_1s": "עָבָר",
    "past_1p": "עָבָר",
    "past_2ms": "עָבָר",
    "past_2fs": "עָבָר",
    "past_2mp": "עָבָר",
    "past_2fp": "עָבָר",
    "past_3ms": "עָבָר",
    "past_3fs": "עָבָר",
    "past_3p": "עָבָר",
    "future_1s": "עָתִיד",
    "future_1p": "עָתִיד",
    "future_2ms": "עָתִיד",
    "future_2fs": "עָתִיד",
    "future_2mp": "עָתִיד",
    "future_2fp": "עָתִיד",
    "future_3ms": "עָתִיד",
    "future_3fs": "עָתִיד",
    "future_3mp": "עָתִיד",
    "future_3fp": "עָתִיד",
    "imperative_ms": "צִוּוּי",
    "imperative_fs": "צִוּוּי",
    "imperative_mp": "צִוּוּי",
    "imperative_fp": "צִוּוּי",
    "infinitive": "מְקוֹר",
}
# Canonical English spellings of the seven binyan names.
BINYAN_NAMES: tuple[str, ...] = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al")
# One shared HTTP session for the module, sending a fixed User-Agent header.
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki/2.0)"})
def _build_pos_lookup() -> dict[str, str]:
    """
    Build a nikkud-stripped word → part-of-speech string lookup from DICT_CSV.

    The CSV is first read as semicolon-separated; if that parse fails or yields
    fewer than three columns, it is re-read with the default comma separator.
    The word comes from the "Word" column and the PoS text from the
    "Part of speech" (or "Part of Speech") column. Rows with a missing word or
    PoS cell are skipped, so an empty "Word" cell no longer ends up as a bogus
    "nan" key. When several rows share a stripped word, the last one wins.

    Returns:
        Mapping from the stripped word to the raw PoS cell text (the binyan is
        derived from that text elsewhere). Empty when DICT_CSV does not exist
        or cannot be loaded; load failures are logged at debug level.
    """
    lookup: dict[str, str] = {}
    if not DICT_CSV.exists():
        return lookup
    try:
        # Imported lazily so pandas is only needed when the CSV is present.
        import pandas as pd

        try:
            df = pd.read_csv(DICT_CSV, sep=";", index_col=0)
            if df.shape[1] < 3:
                raise ValueError("too few columns")
        except (ValueError, pd.errors.ParserError):
            df = pd.read_csv(DICT_CSV, index_col=0)
        for _, row in df.iterrows():
            raw_word = row.get("Word", "")
            raw_pos = row.get("Part of speech", row.get("Part of Speech", ""))
            # Empty CSV cells arrive as NaN; str(NaN) would be the text "nan".
            if pd.isna(raw_word) or pd.isna(raw_pos):
                continue
            word = str(raw_word).strip()
            pos = str(raw_pos).strip()
            if word and pos and "nan" not in pos.lower():
                lookup[_strip_nikkud(word)] = pos
    except Exception as e:
        # Best effort: the lookup is optional, so a load failure is not fatal.
        logger.debug(f"Could not load PoS lookup: {e}")
    return lookup
# Memoised PoS lookup; None until the first call to _get_pos_lookup().
_pos_lookup: dict[str, str] | None = None


def _get_pos_lookup() -> dict[str, str]:
    """Return the PoS lookup, building it on first use and reusing it afterwards."""
    global _pos_lookup
    if _pos_lookup is not None:
        return _pos_lookup
    _pos_lookup = _build_pos_lookup()
    return _pos_lookup
def _binyan_from_pos(word: str) -> str:
    """
    Return the canonical binyan name for *word*, or "" when it is unknown.

    The word's PoS text (e.g. 'Verb pa\'al' or 'Verb Pi\'el') is fetched from
    the PoS lookup by nikkud-stripped form, lower-cased, and searched for each
    binyan spelling, with or without the apostrophe. The first binyan whose
    spelling occurs in the text wins.
    """
    pos_text = _get_pos_lookup().get(_strip_nikkud(word), "")
    if not pos_text:
        return ""
    haystack = pos_text.lower()
    # Canonical name paired with the lowercase pealim.com spellings it covers.
    spellings = (
        ("Pa'al", ("pa'al", "paal")),
        ("Nif'al", ("nif'al", "nifal")),
        ("Pi'el", ("pi'el", "piel")),
        ("Pu'al", ("pu'al", "pual")),
        ("Hitpa'el", ("hitpa'el", "hitpael")),
        ("Hif'il", ("hif'il", "hifil")),
        ("Huf'al", ("huf'al", "hufal")),
    )
    return next(
        (
            canonical
            for canonical, forms in spellings
            if any(form in haystack for form in forms)
        ),
        "",
    )
def _find_slug(query: str) -> str | None:
    """
    Look up *query* via the pealim.com search page and return the first slug.

    The slug is the ``<digits>-<text>`` path segment of the first ``/dict/…/``
    link found in the response body. Returns None when no such link is found or
    when the request fails; failures are logged rather than raised.
    """
    search_url = f"{PEALIM_BASE}/search/?q={urllib.parse.quote(query)}"
    try:
        response = session.get(search_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        first_hit = re.search(r"/dict/(\d+[-][^/?\"<>\s]+)/", response.text)
        if first_hit is not None:
            slug = first_hit.group(1)
            logger.info(f" Slug: {slug}")
            return slug
    except Exception as e:
        logger.error(f" Error searching for '{query}': {e}")
    return None
def _is_passive_binyan(binyan: str) -> bool:
"""Return True if the binyan is a passive (Pu'al or Huf'al)."""
return any(marker in binyan for marker in ("פֻּעַל", "הֻפְעַל", "Pu'al", "Huf'al"))
def _get_menukad(cell) -> tuple[str, str]:
    """
    Pull the Hebrew form text and its audio URL out of one table cell.

    The audio URL is the ``data-audio`` attribute of the first span whose class
    contains "audio-play", or "" when there is none. The form text is the text
    of the first ``menukad`` span; failing that, the cell's own text if it
    contains at least one Hebrew letter, else "".

    Returns:
        (form_text, audio_url)
    """
    audio_tag = cell.find("span", class_=lambda c: c and "audio-play" in c)
    audio_url = audio_tag.get("data-audio", "") if audio_tag else ""

    menukad_tag = cell.find("span", class_="menukad")
    if menukad_tag:
        return menukad_tag.get_text(strip=True), audio_url

    # No menukad span: accept the raw cell text only if it looks Hebrew.
    cell_text = cell.get_text(strip=True)
    form_text = cell_text if re.search(r"[\u05d0-\u05ea]", cell_text) else ""
    return form_text, audio_url
def _parse_table(soup: BeautifulSoup, passive: bool = False, table_el=None) -> dict[str, dict]:
    """
    Read one pealim conjugation table into a form_key -> {form, audio_url} dict.

    Table selection:
      * passive=True   - the first ``table.conjugation-table`` that follows the
        first <h3> whose text contains "passive" (table_el is ignored);
      * table_el given - that table element is parsed directly;
      * otherwise      - the first ``table.conjugation-table`` in *soup*.

    Returns {} when no table is found or the table has fewer than 9 rows.
    Keys for which no Hebrew text was found are simply absent from the result.
    """
    if passive:
        heading = next(
            (h3 for h3 in soup.find_all("h3") if "passive" in h3.get_text(strip=True).lower()),
            None,
        )
        if not heading:
            return {}
        table = next(
            (
                node
                for node in heading.find_all_next()
                if node.name == "table" and "conjugation-table" in node.get("class", [])
            ),
            None,
        )
        if not table:
            return {}
    elif table_el is not None:
        table = table_el
    else:
        table = soup.find("table", class_="conjugation-table")
        if not table:
            return {}

    rows = table.find_all("tr")
    if len(rows) < 9:
        return {}

    def hebrew_cells(row_idx: int) -> list[tuple[str, str]]:
        """Return (text, audio_url) per Hebrew cell of a row, repeated once per colspan."""
        pairs: list[tuple[str, str]] = []
        for cell in rows[row_idx].find_all(["th", "td"]):
            text, audio = _get_menukad(cell)
            width = int(cell.get("colspan", 1))
            if text and re.search(r"[\u05d0-\u05ea]", text):
                pairs.extend([(text, audio)] * width)
        return pairs

    def drop_repeats(pairs: list[tuple[str, str]]) -> list[tuple[str, str]]:
        """Keep only the first pair for each distinct form text."""
        seen_texts: set[str] = set()
        kept: list[tuple[str, str]] = []
        for text, audio in pairs:
            if text in seen_texts:
                continue
            seen_texts.add(text)
            kept.append((text, audio))
        return kept

    # Index of the first row mentioning each tense.  A row is claimed by the
    # first tense (in this order) that appears in its text and is still unset.
    tense_order = ("present", "past", "future", "imperative", "infinitive")
    first_row: dict[str, int] = {}
    for idx, row in enumerate(rows):
        label = row.get_text(" ", strip=True).lower()
        for tense in tense_order:
            if tense in label and tense not in first_row:
                first_row[tense] = idx
                break

    # For every tense: one spec per consecutive table row, starting at the
    # tense's label row.  Each spec is (collapse duplicate texts?, keys to
    # assign left-to-right).  Collapsing undoes the colspan expansion for
    # cells that cover several persons/genders.
    layout = (
        ("present", ((False, ("present_ms", "present_fs", "present_mp", "present_fp")),)),
        (
            "past",
            (
                (True, ("past_1s", "past_1p")),
                (False, ("past_2ms", "past_2fs", "past_2mp", "past_2fp")),
                (True, ("past_3ms", "past_3fs", "past_3p")),
            ),
        ),
        (
            "future",
            (
                (True, ("future_1s", "future_1p")),
                (False, ("future_2ms", "future_2fs", "future_2mp", "future_2fp")),
                (False, ("future_3ms", "future_3fs", "future_3mp", "future_3fp")),
            ),
        ),
        ("imperative", ((False, ("imperative_ms", "imperative_fs", "imperative_mp", "imperative_fp")),)),
        ("infinitive", ((False, ("infinitive",)),)),
    )

    forms: dict[str, dict] = {}
    for tense, row_specs in layout:
        start = first_row.get(tense)
        if start is None:
            continue
        for offset, (collapse, keys) in enumerate(row_specs):
            if start + offset >= len(rows):
                break
            pairs = hebrew_cells(start + offset)
            if collapse:
                pairs = drop_repeats(pairs)
            # Surplus keys or surplus cells are ignored.
            for key, (text, audio) in zip(keys, pairs, strict=False):
                if text:
                    forms[key] = {"form": text, "audio_url": audio}
    return forms
def _extract_binyan_from_page(soup: BeautifulSoup) -> str:
    """Find the binyan named on a verb page.

    Each ``h3.page-header`` is checked for the first BINYAN_NAMES entry that
    occurs in its text; if none matches, the ``og:description`` meta tag is
    checked the same way.  Returns "" when neither mentions a known binyan.
    """

    def first_named(text: str):
        """Return the first BINYAN_NAMES entry contained in *text*, else None."""
        return next((name for name in BINYAN_NAMES if name in text), None)

    for header in soup.find_all("h3", class_="page-header"):
        hit = first_named(header.get_text(" ", strip=True))
        if hit is not None:
            return hit

    # Fall back to the og:description meta tag.
    description_tag = soup.find("meta", {"property": "og:description"})
    if description_tag:
        hit = first_named(description_tag.get("content", ""))
        if hit is not None:
            return hit
    return ""
def _extract_passive_binyan_from_page(soup: BeautifulSoup) -> str:
    """Read the passive binyan name ("Pu'al" or "Huf'al") from a passive heading.

    Only <h3> elements whose text contains "passive" (case-insensitive) are
    considered.  For each, the heading text is searched first, then the text
    of its ``span.small`` child if it has one.  Returns "" when no passive
    heading names either binyan.
    """

    def named_in(text: str) -> str:
        """Return the passive binyan found in *text*, or ""."""
        return next((name for name in ("Pu'al", "Huf'al") if name in text), "")

    for heading in soup.find_all("h3"):
        heading_text = heading.get_text(" ", strip=True)
        if "passive" not in heading_text.lower():
            continue
        found = named_in(heading_text)
        if found:
            return found
        # The name may also sit in a <span class="small"> inside the heading.
        small = heading.find("span", class_="small")
        if small:
            found = named_in(small.get_text(strip=True))
            if found:
                return found
    return ""
def _extract_conjugations(
    slug: str, search_term: str, is_3ms_search: bool = False, binyan_hint: str = ""
) -> dict | None:
    """Download ``/dict/<slug>/`` and build the conjugation record for a verb.

    Args:
        slug: pealim.com dictionary slug of the verb page.
        search_term: the form the caller looked up; stored under "infinitive"
            and used as the fallback reference form.
        is_3ms_search: when True, the PoS-table binyan lookup is skipped.
        binyan_hint: binyan to use if neither the PoS table nor the page
            names one.

    Returns:
        A dict with keys infinitive, slug, root, binyan, meaning, is_passive,
        reference_form and forms (plus alternate_forms when a second
        non-passive table has forms that differ from the first), or None if
        the page cannot be fetched or its first table yields no forms.
    """
    page_url = f"{PEALIM_BASE}/dict/{slug}/"
    try:
        resp = session.get(page_url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
    except Exception as e:
        logger.error(f" Error fetching {page_url}: {e}")
        return None
    soup = BeautifulSoup(resp.text, "lxml")

    # English translation lives in <div class="lead">.
    lead = soup.find("div", class_="lead")
    meaning = lead.get_text(strip=True) if lead else ""

    # Root: first menukad span whose text has a Hebrew letter and a hyphen.
    span_texts = (span.get_text(strip=True) for span in soup.find_all("span", class_="menukad"))
    root = next(
        (text for text in span_texts if text and re.search(r"[\u05d0-\u05ea]", text) and "-" in text),
        "",
    )

    # Binyan precedence: PoS lookup (unless 3ms search), page header, caller's hint.
    binyan = "" if is_3ms_search else _binyan_from_pos(search_term)
    binyan = binyan or _extract_binyan_from_page(soup) or binyan_hint

    forms_raw = _parse_table(soup, passive=False)
    if not forms_raw:
        logger.warning(f" No forms found for {slug}")
        return None

    # Passive binyanim are referenced by their past 3ms form, all others by
    # the infinitive; either falls back to the search term.
    is_passive = _is_passive_binyan(binyan)
    if is_passive:
        reference_form = forms_raw.get("past_3ms", {}).get("form", "") or search_term
    else:
        reference_form = forms_raw.get("infinitive", {}).get("form", "") or search_term

    result = {
        "infinitive": search_term,
        "slug": slug,
        "root": root,
        "binyan": binyan,
        "meaning": meaning,
        "is_passive": is_passive,
        "reference_form": reference_form,
        "forms": {
            key: {
                "form": entry["form"],
                "audio_url": entry.get("audio_url", ""),
                "pronoun": PRONOUN_LABELS[key],
                "tense": TENSE_DESCRIPTION.get(key, ""),
            }
            for key, entry in forms_raw.items()
            if key in PRONOUN_LABELS
        },
    }

    # A second non-passive table means an alternate paradigm (e.g. להתגלות).
    # Tables located after the "Passive" heading are excluded by identity.
    passive_heading = next(
        (h for h in soup.find_all("h3") if "passive" in h.get_text(strip=True).lower()),
        None,
    )
    tables_after_passive = (
        passive_heading.find_all_next("table", class_="conjugation-table") if passive_heading else []
    )
    passive_ids = {id(tbl) for tbl in tables_after_passive}
    active_tables = [
        tbl for tbl in soup.find_all("table", class_="conjugation-table") if id(tbl) not in passive_ids
    ]
    if len(active_tables) >= 2:
        alt_raw = _parse_table(soup, passive=False, table_el=active_tables[1])
        # Keep only the forms that actually differ from the primary table.
        alternate_forms = {
            key: entry["form"]
            for key, entry in alt_raw.items()
            if key in PRONOUN_LABELS
            and entry["form"]
            and entry["form"] != forms_raw.get(key, {}).get("form", "")
        }
        if alternate_forms:
            result["alternate_forms"] = alternate_forms
            logger.info(f" Found {len(alternate_forms)} alternate forms for {search_term}")

    logger.info(f" Extracted {len(result['forms'])} forms for {search_term}")
    return result
def _load_conjugations() -> dict:
    """Read the conjugations cache at CONJUGATIONS_PATH; return {} if it does not exist."""
    if not CONJUGATIONS_PATH.exists():
        return {}
    with open(CONJUGATIONS_PATH, encoding="utf-8") as cache_file:
        cached = json.load(cache_file)
    return cached
def _save_conjugations(data: dict) -> None:
    """Write *data* to CONJUGATIONS_PATH as indented UTF-8 JSON.

    Missing parent directories are created first; Hebrew text is written
    as-is rather than as ASCII escapes.
    """
    target = CONJUGATIONS_PATH
    target.parent.mkdir(parents=True, exist_ok=True)
    with open(target, "w", encoding="utf-8") as cache_file:
        json.dump(data, cache_file, ensure_ascii=False, indent=2)
def _extract_passive_from_active_slug(active_slug: str, search_term: str, binyan_hint: str = "") -> dict | None:
    """Fetch an active verb's page and return only its passive-section forms.

    Used for Pu'al/Huf'al 3ms entries where the active verb's slug is known.

    Args:
        active_slug: pealim.com slug of the active verb page.
        search_term: the passive form the caller looked up; stored under
            "infinitive" and used as the fallback reference form.
        binyan_hint: binyan to record if the page does not reveal one.

    Returns:
        A record shaped like the one from ``_extract_conjugations`` with
        is_passive=True, or None if the page cannot be fetched or has no
        passive table.
    """
    page_url = f"{PEALIM_BASE}/dict/{active_slug}/"
    try:
        resp = session.get(page_url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
    except Exception as e:
        logger.error(f" Error fetching {page_url}: {e}")
        return None
    soup = BeautifulSoup(resp.text, "lxml")

    # This is the active verb's meaning; kept as context for the passive entry.
    lead = soup.find("div", class_="lead")
    meaning = lead.get_text(strip=True) if lead else ""

    # Root: first menukad span whose text has a Hebrew letter and a hyphen.
    span_texts = (span.get_text(strip=True) for span in soup.find_all("span", class_="menukad"))
    root = next(
        (text for text in span_texts if text and re.search(r"[\u05d0-\u05ea]", text) and "-" in text),
        "",
    )

    active_binyan = _extract_binyan_from_page(soup)
    active_infinitive = _parse_table(soup, passive=False).get("infinitive", {}).get("form", "")

    passive_forms_raw = _parse_table(soup, passive=True)
    if not passive_forms_raw:
        logger.warning(f" No passive forms found on {active_slug} for {search_term}")
        return None

    # Binyan precedence: passive heading, then inference from the active
    # binyan (Pi'el -> Pu'al, Hif'il -> Huf'al), then the caller's hint.
    passive_of = {"Pi'el": "Pu'al", "Hif'il": "Huf'al"}
    passive_binyan = (
        _extract_passive_binyan_from_page(soup) or passive_of.get(active_binyan, "") or binyan_hint
    )

    result = {
        "infinitive": search_term,
        "slug": active_slug,
        "root": root,
        "binyan": passive_binyan,
        "meaning": meaning,
        "is_passive": True,
        "reference_form": active_infinitive or search_term,
        "forms": {
            key: {
                "form": entry["form"],
                "audio_url": entry.get("audio_url", ""),
                "pronoun": PRONOUN_LABELS[key],
                "tense": TENSE_DESCRIPTION.get(key, ""),
            }
            for key, entry in passive_forms_raw.items()
            if key in PRONOUN_LABELS
        },
    }
    logger.info(f" Extracted {len(result['forms'])} passive forms for {search_term} from {active_slug}")
    return result
def main(verbs_file: Path = VERBS_INPUT) -> dict:
    """Read the verbs file, scrape every uncached verb, and return all conjugations.

    File format, one entry per line:
      * ``# slug: VERB SLUG`` - use SLUG for VERB instead of searching;
      * ``# 3ms: FORM [ACTIVE_SLUG]`` - passive verb given by its 3ms form,
        optionally with the slug of the active verb's page;
      * any other ``#`` line - a comment; if it mentions a binyan name, that
        binyan becomes the hint for the entries that follow;
      * anything else - a verb to look up.

    The cache is saved after every verb.  A verb whose slug or page cannot be
    obtained is stored as None, so it is treated as cached on later runs.
    If *verbs_file* does not exist, the cached conjugations are returned as-is.
    """
    if not verbs_file.exists():
        logger.warning(f"verbs_input.txt not found at {verbs_file} — skipping")
        return _load_conjugations()

    # Section-header keyword -> binyan name, used as a fallback hint.
    section_binyan = {
        "pa'al": "Pa'al",
        "nif'al": "Nif'al",
        "pi'el": "Pi'el",
        "pu'al": "Pu'al",
        "hitpa'el": "Hitpa'el",
        "hif'il": "Hif'il",
        "huf'al": "Huf'al",
    }
    slug_marker = "# slug:"
    passive_marker = "# 3ms:"

    slug_overrides: dict[str, str] = {}
    # Each entry: (search_term, is_3ms_search, active_slug, binyan_hint)
    verbs: list[tuple[str, bool, str | None, str]] = []
    current_binyan_hint = ""
    for raw_line in verbs_file.read_text(encoding="utf-8").splitlines():
        entry = raw_line.strip()
        if not entry:
            continue
        if entry.startswith(slug_marker):
            fields = entry.removeprefix(slug_marker).split()
            if len(fields) >= 2:
                slug_overrides[fields[0]] = fields[1]
            continue
        if entry.startswith(passive_marker):
            fields = entry.removeprefix(passive_marker).split()
            if fields:
                known_slug = fields[1] if len(fields) >= 2 else None
                verbs.append((fields[0], True, known_slug, current_binyan_hint))
            continue
        if entry.startswith("#"):
            # Section header: the first binyan keyword found updates the hint.
            lowered = entry.lower()
            current_binyan_hint = next(
                (name for keyword, name in section_binyan.items() if keyword in lowered),
                current_binyan_hint,
            )
            continue
        verbs.append((entry, False, None, current_binyan_hint))

    logger.info(f"Loaded {len(verbs)} verbs from {verbs_file} ({sum(1 for _, p, _, _ in verbs if p)} passive 3ms)")
    if slug_overrides:
        logger.info(f" Slug overrides: {slug_overrides}")

    conjugations = _load_conjugations()
    new_count = 0
    for verb, is_3ms, active_slug, binyan_hint in verbs:
        if verb in conjugations:
            logger.info(f"Skipping {verb} (cached)")
            continue
        logger.info(f"Processing: {verb} {'(3ms search)' if is_3ms else ''}")
        time.sleep(REQUEST_DELAY)

        # A slug supplied in the file (active slug / override) skips the search.
        preset = active_slug if is_3ms else slug_overrides.get(verb)
        if preset:
            slug = preset
            if is_3ms:
                logger.info(f" Using active slug {slug} for passive extraction")
            else:
                logger.info(f" Slug override: {preset}")
        else:
            slug = _find_slug(verb)
            if not slug:
                logger.warning(f" No slug found for {verb}")
                conjugations[verb] = None
                _save_conjugations(conjugations)
                continue
            if is_3ms:
                logger.info(f" Found active slug {slug} for passive extraction")
            # The search was a request of its own; pause before the page fetch.
            time.sleep(REQUEST_DELAY)

        if is_3ms:
            data = _extract_passive_from_active_slug(slug, verb, binyan_hint=binyan_hint)
        else:
            data = _extract_conjugations(slug, verb, is_3ms_search=False, binyan_hint=binyan_hint)
        conjugations[verb] = data
        _save_conjugations(conjugations)
        new_count += 1

    logger.info(f"Done: {new_count} new verbs processed")
    return conjugations
if __name__ == "__main__":
    # Script entry point: scrape, then print a one-line summary per verb.
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
    result = main()
    for verb, data in result.items():
        if not data:
            print(f"{verb}: no data")
            continue
        forms = data.get("forms", {})
        print(f"{verb}: {len(forms)} forms, binyan={data.get('binyan')}")
        # Show the audio URL of the first form as a quick sanity check.
        sample_form = next(iter(forms.values()), {})
        print(f" sample audio_url: {sample_form.get('audio_url', 'MISSING')[:60]}")

File diff suppressed because it is too large Load diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load diff

9121
data/hebrew_dict.csv Normal file

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

9106
data/pealim_dict.csv Normal file

File diff suppressed because it is too large Load diff

12111
data/pealim_dict_for_anki.csv Normal file

File diff suppressed because it is too large Load diff

2297586
data/words.json

File diff suppressed because it is too large Load diff

View file

@ -1,150 +0,0 @@
# Adaptive Sentence Difficulty Cloze — v0.20 Design Spec
**Date:** 2026-03-15
**Status:** Approved
**Release:** v0.20
## Problem
Cloze cards currently select the example sentence closest to 9 words in length. This ignores whether the surrounding context words are familiar to the learner. A sentence full of rare words is harder than one with common words, regardless of length.
## Solution
Replace the length-based `_score()` function in `epub_examples.py` with a **frequency-based difficulty score**. The easiest sentence (most common context words) becomes the cloze. All vetted sentences remain on the card, ordered easy→hard.
## Scoring Pipeline
### Token Frequency Lookup (5-tier)
Given a nikkud sentence token, resolve its frequency rank:
1. **Known mapping** — look up token in the nikkud→ktiv_male map built from words.json headwords, conjugations, and inflections (94k mappings). If found, look up the ktiv_male in the frequency data.
2. **Nikkud prefix stripping** — use `_try_strip_prefix()` to strip validated Hebrew prefixes (בהוכלמש), then resolve the remainder via the known mapping.
3. **Academy rules converter** — apply `nikkud_to_ktiv_male.convert()` (91.6% accuracy) to produce ktiv_male, look up in frequency data.
4. **strip_nikkud fallback** — use `helpers.strip_nikkud()` as a lossy fallback.
5. **Ktiv_male prefix stripping** — strip 1-2 character Hebrew prefixes from the converted/stripped form and look up the stem.
Tokens not found in any tier are assigned a default high rank (50,000).
**Coverage:** ~93% of example sentence tokens resolve to a frequency rank (measured empirically on 7,588 sentences).
**Frequency data source:** Use `frequency_lookup.py` which auto-selects `frequency_clean.json` when available, falling back to `frequency_cache.json`.
### Sentence Difficulty Score
For a given word's candidate sentence:
1. Tokenize: split on whitespace, strip punctuation (.,!?;:"'"״׳–—()[]{}), split on maqaf (־).
2. Exclude the target word's token using `cloze_word_start`/`cloze_word_end` offsets from the matched sentence.
3. For each remaining token (length >= 2), resolve its frequency rank via the 5-tier pipeline.
4. **Score = median frequency rank of context tokens.**
Lower score = easier (context words are more common). Median resists outliers (one rare proper noun shouldn't dominate).
### Integration Point
The scoring integrates into `epub_examples.py`'s existing `_score()` closure inside `update_words_json()` (line ~677). Currently:
```python
def _score(s: dict) -> tuple[int,]:
wc = s["word_count"]
length_score = abs(wc - 9) if not (6 <= wc <= 12) else 0
return (length_score,)
```
New scoring replaces length with frequency-based difficulty. The `_score` function gains access to the frequency pipeline via closure over the nikkud_map, nikkud_index, and freq_data built once at the start of `update_words_json()`.
**Minimum sentence length:** Reduced from 4 words to 3 words (`MIN_WORDS = 3` in epub_examples.py). Hebrew is more concise than English — 3-word sentences are valid and common. This expands the candidate pool for cloze selection.
**Behavioral change:** Because `pool.sort(key=_score)` determines which 3 sentences are selected as `best = pool[:3]`, changing the scoring function changes **which sentences are selected**, not just their order. This is intentional — we want the easiest sentences as cloze candidates, not the closest-to-9-words ones. Existing cloze GUIDs will be preserved when the same sentence text is re-selected; entries where a different sentence wins will get new GUIDs.
## Data Model Changes
### words.json
The `examples.cloze` dict (single sentence) gains an optional `difficulty_score` field:
```json
{
"examples": {
"vetted": [
{"text": "...", "source": "...", "match_method": "..."},
{"text": "...", "source": "...", "match_method": "..."}
],
"cloze": {
"text": "...",
"cloze_word_start": 5,
"cloze_word_end": 10,
"cloze_hint": null,
"cloze_guid": "abc123",
"difficulty_score": 234
}
}
}
```
The vetted list is also sorted by difficulty (easiest first), so the card back shows sentences in pedagogically useful order.
### SCHEMA.yaml
Add `difficulty_score` as optional integer field under `examples.cloze`.
## Implementation Scope
### New file: `sentence_difficulty.py`
Standalone module for sentence scoring. No pipeline step — called by `epub_examples.py`.
- `score_sentence(sentence_text: str, target_start: int, target_end: int, nikkud_map: dict, nikkud_index: dict, freq_data: dict) -> int` — returns median context frequency rank. Uses `target_start`/`target_end` character offsets to exclude the cloze target token.
- `build_nikkud_map(words: dict) -> dict[str, str]` — builds nikkud→ktiv_male lookup from words.json (headwords + conjugation forms + noun inflections). Returns `{nikkud_form: ktiv_male_form}`. Implementation note: should share iteration logic with `epub_examples._build_nikkud_index()` or derive from its output to avoid duplicating the traversal of words.json forms.
- `_resolve_token_frequency(token: str, nikkud_map: dict, nikkud_index: dict, freq_data: dict) -> int` — the 5-tier lookup. Uses `_try_strip_prefix` from epub_examples (made importable by removing underscore or adding a public wrapper).
### Modified files
- **`epub_examples.py`**:
- Import `sentence_difficulty.score_sentence` and `sentence_difficulty.build_nikkud_map`
- In `update_words_json()`: build nikkud_map and load freq_data once at start (before per-word loop)
- Replace `_score()` closure with frequency-based scoring that calls `score_sentence()`
- Sort vetted list by difficulty score (easiest first)
- Store `difficulty_score` in the cloze dict
- Make `_try_strip_prefix` importable (rename to `try_strip_prefix` or add public alias)
- **`frequency_lookup.py`** — add `get_freq_data() -> dict` public accessor to expose the loaded frequency dict (avoids accessing private `_freq` directly)
- **`SCHEMA.yaml`** — add `difficulty_score` field
- **`run.py`** — no changes; scoring happens inside epub_examples step
### Not modified
- **`apkg_builder.py`** — reads cloze as-is; vetted order is already respected
- **`nikkud_to_ktiv_male.py`** — used as-is
- **Card templates** — no changes needed
## Dependencies
- `nikkud_to_ktiv_male.convert()` — Academy rules converter (already written)
- `epub_examples._try_strip_prefix()` / `_build_nikkud_index()` — nikkud prefix stripping and index
- `frequency_lookup.py` — loads frequency data (auto-selects clean vs cache)
- `helpers.strip_nikkud()` — fallback converter
## Validation
- **Unit tests** for `score_sentence()` with known easy/hard sentences
- **Unit tests** for `_resolve_token_frequency()` covering all 5 tiers
- **Integration test**: verify cloze selection picks easiest sentence, vetted list is sorted
- **Spot check**: manually review 10 words with 3+ sentences to confirm ordering
- **Regression**: existing tests pass, GUID coverage unchanged, deck validates
## Constraints
- `examples.cloze` remains a single dict (not converted to list)
- No new Anki card types or fields
- No runtime JS in Anki cards
- No network calls during scoring
- `difficulty_score` is informational metadata; card rendering doesn't depend on it
- Existing cloze GUIDs preserved when the same sentence is re-selected
## Scope Exclusions (Future Work)
- **Pronominal suffix stripping** — would improve the ~7% unscored token rate; deferred (PROJECT_NOTES.md)
- **Kamatz katan disambiguation** — requires morphological analysis; accepted limitation
- **Per-learner adaptive difficulty** — requires Anki plugin; out of scope for static deck
- **Multiple cloze sentences per card** — would require schema migration to list; deferred

File diff suppressed because it is too large Load diff

View file

@ -3,10 +3,6 @@
Hebrew word frequency lookup from hermitdave/FrequencyWords corpus.
Downloads he_50k.txt once; subsequent runs read from cache.
Exposed API: get_frequency_rank(word_no_nikkud) -> int | None
TODO: Rewrite to update words.json frequency field directly instead of
writing to a separate frequency_cache.json. Currently the migration script
bridges the gap. See Phase 5 in SPRINT_LOG.md.
"""
import json
@ -15,11 +11,12 @@ from pathlib import Path
import requests
from helpers import strip_nikkud as _strip_nikkud
logger = logging.getLogger(__name__)
FREQ_URL = "https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2016/he/he_50k.txt"
CACHE_PATH = Path(__file__).parent / "data" / "frequency_cache.json"
CLEAN_CACHE_PATH = Path(__file__).parent / "data" / "frequency_clean.json"
REQUEST_TIMEOUT = 30
# Module-level cache: word_no_nikkud -> rank (1 = most common)
@ -27,19 +24,12 @@ _freq: dict[str, int] = {}
def load(cache_path: Path = CACHE_PATH) -> None:
"""Load frequency data from cache, downloading if not present.
Prefers frequency_clean.json (YAP-filtered) over raw frequency_cache.json.
"""
"""Load frequency data from cache, downloading if not present."""
global _freq
# Prefer YAP-cleaned frequency data if available
clean_path = cache_path.parent / "frequency_clean.json" if cache_path == CACHE_PATH else None
load_path = clean_path if clean_path and clean_path.exists() else cache_path
if load_path.exists():
with open(load_path, encoding="utf-8") as f:
if cache_path.exists():
with open(cache_path, encoding="utf-8") as f:
_freq = json.load(f)
label = "clean" if load_path == clean_path else "raw"
logger.info(f"Frequency cache loaded ({label}): {len(_freq)} entries")
logger.info(f"Frequency cache loaded: {len(_freq)} entries")
return
logger.info("Downloading FrequencyWords he_50k.txt …")
@ -51,7 +41,7 @@ def load(cache_path: Path = CACHE_PATH) -> None:
line = line.strip()
if not line:
continue
word = line.split()[0]
word = _strip_nikkud(line.split()[0])
if word and word not in _freq:
_freq[word] = rank
rank += 1
@ -66,24 +56,14 @@ def get_frequency_rank(word_no_nikkud: str) -> int | None:
"""
Return the frequency rank of a word (1 = most common).
Returns None if not found in the corpus.
Expects ktiv male (no nikkud) input.
Strips nikkud from the input before lookup.
"""
if not _freq:
load()
clean = word_no_nikkud.strip()
clean = _strip_nikkud(word_no_nikkud.strip())
return _freq.get(clean)
def get_freq_data() -> dict[str, int]:
"""Return the full frequency dict (word -> rank).
Auto-loads from cache if not yet loaded.
"""
if not _freq:
load()
return _freq
if __name__ == "__main__":
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
load()

216
hebrew_extract.py Normal file
View file

@ -0,0 +1,216 @@
#!/usr/bin/env python3
"""
Extract Hebrew vocabulary from pealim.com dictionary.
Scrapes word entries, roots, parts of speech, and audio URLs for Anki flashcards.
"""
import logging
import time
import pandas as pd
import requests
from bs4 import BeautifulSoup
# Configure logging.
# NOTE: basicConfig is a top-level statement, so it runs whenever this module
# is imported, not only when it is executed as a script.
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)
# One shared HTTP session for all requests (connection pooling), sent with a
# custom User-Agent header.
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-scraper/1.0)"})
# Base URL of the paginated dictionary listing; pages are requested as ?page=N.
PEALIM_DICT_URL = "https://www.pealim.com/dict/"
REQUEST_DELAY = 1.5  # seconds between requests (respectful scraping)
REQUEST_TIMEOUT = 10  # seconds, passed as the timeout of each HTTP request
def get_total_pages() -> int:
    """Return the number of dictionary pages to scrape.

    The count is hard-coded (608).  The dictionary index is still requested,
    but only as a reachability check: the response body is not parsed, and a
    failed request merely logs an error before the same value is returned.
    """
    page_count = 608  # pealim.com has ~608 pages at ~15 words/page
    try:
        logger.info("Fetching total page count...")
        probe = session.get(
            PEALIM_DICT_URL,
            cookies={"translit": "none", "hebstyle": "mo"},
            timeout=REQUEST_TIMEOUT,
        )
        probe.raise_for_status()
    except Exception as e:
        logger.error(f"Error fetching page count: {e}. Using default (608).")
    return page_count
def _parse_page_with_audio(html_bytes: bytes) -> list[dict]:
    """
    Parse one dictionary listing page into word records, including audio URLs.

    Every table row with at least four <td> cells is read as
    (word, root, part of speech, meaning).  Rows whose word cell is empty are
    skipped.

    Returns list of dicts with keys: Word, Root, Part of Speech, Meaning, audio_url.
    An empty root is stored as "-"; a missing audio attribute as "".
    """
    soup = BeautifulSoup(html_bytes, "html.parser")
    entries = []
    for tr in soup.select("table tr"):
        cells = tr.find_all("td")
        if len(cells) < 4:
            continue
        word_td, root_td, pos_td, meaning_td = cells[:4]

        # Prefer the vocalised span; otherwise take the whole cell's text.
        vocalised = word_td.find("span", class_="menukad")
        if vocalised:
            word = vocalised.get_text(strip=True)
        else:
            word = word_td.get_text(strip=True)
        if not word:
            continue

        # The audio URL sits on whichever element carries a data-audio attribute.
        audio_tag = word_td.find(attrs={"data-audio": True})
        entries.append(
            {
                "Word": word,
                "Root": root_td.get_text(strip=True) or "-",
                "Part of Speech": pos_td.get_text(strip=True),
                "Meaning": meaning_td.get_text(strip=True),
                "audio_url": audio_tag["data-audio"] if audio_tag else "",
            }
        )
    return entries
def extract_from_website(max_pages: int | None = None, *, max_attempts: int = 3) -> pd.DataFrame:
    """
    Extract dictionary entries from pealim.com.
    Captures audio URLs from each word entry's data-audio attribute.

    Each page is requested twice: once with nikkud (parsed for the full
    record and audio URL) and once without nikkud (only the word column).

    A page that fails with a network/HTTP error is retried, up to
    *max_attempts* tries in total, with a doubled delay between tries.  A page
    that still fails, or that raises any other error, is logged and skipped.

    Args:
        max_pages: Maximum pages to scrape (None = all)
        max_attempts: Total tries per page on request errors (values below 1
            are treated as 1)
    Returns:
        DataFrame with Word, Root, Part of Speech, Meaning, Word Without Nikkud, audio_url columns
    """
    total_pages = max_pages or get_total_pages()
    attempts_allowed = max(1, max_attempts)
    logger.info(f"Starting extraction from {total_pages} pages...")

    def scrape_page(url: str) -> list[dict]:
        """Fetch one listing page in both spellings and merge them row by row."""
        # First request: with nikkud — parse with BeautifulSoup for audio URL
        cookies = {"translit": "none", "hebstyle": "mo"}
        response = session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        page_rows = _parse_page_with_audio(response.content)

        # Second request: without nikkud — just get the word column
        cookies_vl = {"translit": "none", "hebstyle": "vl", "showmeaning": "off"}
        resp_vl = session.get(url, cookies=cookies_vl, timeout=REQUEST_TIMEOUT)
        resp_vl.raise_for_status()
        soup_vl = BeautifulSoup(resp_vl.content, "html.parser")
        no_nik_words = []
        for tr in soup_vl.select("table tr"):
            tds = tr.find_all("td")
            if len(tds) < 4:
                continue
            menukad = tds[0].find("span", class_="menukad")
            no_nik_words.append(menukad.get_text(strip=True) if menukad else tds[0].get_text(strip=True))

        # Rows are paired by position; a missing counterpart becomes "".
        for i, row in enumerate(page_rows):
            row["Word Without Nikkud"] = no_nik_words[i] if i < len(no_nik_words) else ""
        return page_rows

    all_rows: list[dict] = []
    for page_num in range(1, total_pages + 1):
        url = f"{PEALIM_DICT_URL}?page={page_num}"
        for attempt in range(1, attempts_allowed + 1):
            try:
                page_rows = scrape_page(url)
            except requests.RequestException as e:
                if attempt < attempts_allowed:
                    logger.error(f"Error fetching page {page_num}: {e}. Retrying...")
                else:
                    logger.error(
                        f"Error fetching page {page_num}: {e}. Giving up after {attempts_allowed} attempts."
                    )
                # Back off before the next try (or the next page).
                time.sleep(REQUEST_DELAY * 2)
            except Exception as e:
                # Parsing problems will not be cured by retrying; skip the page.
                logger.error(f"Unexpected error on page {page_num}: {e}")
                break
            else:
                all_rows.extend(page_rows)
                if page_num % 50 == 0:
                    logger.info(f"Processed {page_num}/{total_pages} pages ({len(all_rows)} words so far)...")
                time.sleep(REQUEST_DELAY)
                break

    df = pd.DataFrame(all_rows)
    # With no rows at all the frame has no columns, hence the membership test.
    audio_count = (df["audio_url"] != "").sum() if "audio_url" in df.columns else 0
    logger.info(f"Extraction complete. Total words: {len(df)}, with audio URL: {audio_count}")
    return df
def modify_for_anki(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform dictionary DataFrame for Anki import.
    Adds shared root words and Hebrew tags. Preserves audio_url column.

    Two columns are added to *df* in place (the same object is returned):
      * "shared roots" - space-separated list of the other words that have
        the same root ("" for rows whose root is "-" or missing);
      * "tags" - a root tag plus a part-of-speech tag, space-separated.

    Args:
        df: frame with at least "Word", "Root" and "Part of Speech" columns.
    Returns:
        The same DataFrame, with the two columns added.
    """
    logger.info("Preparing data for Anki...")

    # English PoS keyword -> Hebrew tag; the first keyword contained in the
    # row's part-of-speech text wins.
    # NOTE(review): the Pronoun tag looks like a typo (possibly meant
    # "כינוי_גוף"); left unchanged because existing decks may rely on it.
    pos_tags = {
        "Adverb": "תוארי_הפועל",
        "Pronoun": "כינוייוף",
        "Noun": "שם_עצם",
        "Verb": "פעלים",
        "Adjective": "שם_תואר",
        "Preposition": "מילות_יחס",
        "Conjunction": "מילות_חיבור",
        "Particle": "מילית",
    }

    records = [(row["Root"], row["Word"], row["Part of Speech"]) for _idx, row in df.iterrows()]

    # Group the words by root once, in row order, instead of rescanning the
    # whole frame for every row.
    words_by_root: dict = {}
    for root, word, _pos in records:
        if root != "-" and pd.notna(root):
            words_by_root.setdefault(root, []).append(word)

    shared_root_words = []
    tags = []
    for root, word, pos_value in records:
        # Find shared root words
        if root != "-" and pd.notna(root):
            shared_root_words.append(" ".join(str(w) for w in words_by_root[root] if w != word))
        else:
            shared_root_words.append("")

        # Generate Hebrew tags
        tag_parts = []
        root_text = str(root).replace(" ", "").replace("-", "")
        # str() turns a missing root into "nan"; such rows get no root tag.
        if "nan" not in root_text and root_text:
            tag_parts.append(f"שורש::{root_text.replace('.', '')}")
        pos = str(pos_value)
        for key, value in pos_tags.items():
            if key in pos:
                tag_parts.append(value)
                break
        tags.append(" ".join(tag_parts))

    df["shared roots"] = shared_root_words
    df["tags"] = tags
    logger.info("Anki preparation complete.")
    return df
def main():
    """Scrape the dictionary and write both CSV exports to the working directory.

    Writes ``hebrew_dict.csv`` (raw scrape) and ``hebrew_dict_for_anki.csv``
    (semicolon-separated, with shared roots and tags).  Any exception is
    logged and then re-raised.
    """
    try:
        scraped = extract_from_website()
        scraped.to_csv("hebrew_dict.csv", index=True)
        logger.info("Saved: hebrew_dict.csv")

        anki_ready = modify_for_anki(scraped)
        anki_ready.to_csv("hebrew_dict_for_anki.csv", sep=";", index=True)
        logger.info("Saved: hebrew_dict_for_anki.csv")

        logger.info("Complete!")
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        raise


if __name__ == "__main__":
    main()

View file

@ -2,10 +2,6 @@
"""
Fetch images for concrete Hebrew nouns from Wikipedia / Wikimedia Commons.
TODO: Rewrite to update words.json image/image_source fields directly instead of
writing to a separate image_cache.json. Currently the migration script bridges
the gap. See Phase 5 in SPRINT_LOG.md.
Scope: Noun PoS entries only. Concreteness heuristic:
- English meaning has no abstract suffixes (-tion, -ity, -ness, -ment, -ance, -ism, -hood,
-ship, -ure, -al, -ing when not a gerund, -ence)
@ -31,6 +27,8 @@ from pathlib import Path
import requests
from helpers import strip_nikkud as _strip_nikkud
logger = logging.getLogger(__name__)
DATA_DIR = Path(__file__).parent / "data"
@ -61,6 +59,7 @@ session.headers.update(
)
def is_concrete(english_meaning: str) -> bool:
"""Return True if the English meaning looks like a concrete noun."""
meaning = english_meaning.strip().lower()
@ -76,7 +75,7 @@ def is_concrete(english_meaning: str) -> bool:
def _safe_name(word_no_nikkud: str) -> str:
"""Create a safe ASCII-ish filename from a Hebrew word (strip to Hebrew letters only)."""
hebrew_only = re.sub(r"[^\u05d0-\u05ea]", "", word_no_nikkud)
hebrew_only = re.sub(r"[^\u05d0-\u05ea]", "", _strip_nikkud(word_no_nikkud))
return hebrew_only if hebrew_only else "unknown"
@ -259,7 +258,7 @@ def run(limit: int | None = None, dry_run: bool = False, single_word: str | None
if single_word and word_plain != single_word:
continue
cache_key = word_plain
cache_key = word_plain or _strip_nikkud(word)
if cache_key in cache:
skipped_cached += 1

View file

@ -1,185 +0,0 @@
"""Convert nikkud (vocalized) Hebrew to ktiv male (plene spelling).
Implements Hebrew Academy rules for matres lectionis insertion:
- Rule A: U vowel (kubutz) → always insert vav
- Rule B: O vowel (holam on non-vav) → insert vav
- Rule C: I vowel (hiriq) → insert yod (conditionally)
- Rule D: E vowel (tsere) → insert yod (limited cases)
- Rule E/F: Consonantal vav/yod doubling
Reference: https://hebrew-academy.org.il/topic/hahlatot/missingvocalizationspelling/
"""
import unicodedata
# Hebrew nikkud code points
# Vowel signs: every name in this first group is a member of VOWELS below.
SHVA = "\u05b0"
HATAF_SEGOL = "\u05b1"
HATAF_PATAH = "\u05b2"
HATAF_KAMATZ = "\u05b3"
HIRIQ = "\u05b4"
TSERE = "\u05b5"
SEGOL = "\u05b6"
PATAH = "\u05b7"
KAMATZ = "\u05b8"
HOLAM = "\u05b9"
HOLAM_HASER = "\u05ba"
KUBUTZ = "\u05bb"
# Non-vowel diacritics: included in NIKKUD_MARKS but not in VOWELS.
DAGESH = "\u05bc"
METEG = "\u05bd"
RAFE = "\u05bf"
SHIN_DOT = "\u05c1"
SIN_DOT = "\u05c2"
# Base characters referenced by name in the conversion rules.
VAV = "ו"
YOD = "י"
MAQAF = "־"
# Marks that _get_vowel() treats as "the vowel" of a letter.
VOWELS = {SHVA, HATAF_SEGOL, HATAF_PATAH, HATAF_KAMATZ, HIRIQ, TSERE, SEGOL, PATAH, KAMATZ, HOLAM, HOLAM_HASER, KUBUTZ}
# Every diacritic named in this module: the vowels plus the non-vowel marks.
NIKKUD_MARKS = VOWELS | {DAGESH, METEG, RAFE, SHIN_DOT, SIN_DOT}
def _parse_segments(text: str) -> list[tuple[str, list[str]]]:
"""Parse nikkud text into (character, [marks]) segments."""
segments: list[tuple[str, list[str]]] = []
cur_char: str | None = None
cur_marks: list[str] = []
for ch in text:
if unicodedata.category(ch) == "Mn":
cur_marks.append(ch)
else:
if cur_char is not None:
segments.append((cur_char, cur_marks))
cur_char = ch
cur_marks = []
if cur_char is not None:
segments.append((cur_char, cur_marks))
return segments
def _get_vowel(marks: list[str]) -> str | None:
    """Return the first entry of *marks* that is in ``VOWELS``, or None if there is none."""
    return next((mark for mark in marks if mark in VOWELS), None)
def _has_dagesh(marks: list[str]) -> bool:
    """Report whether the dagesh point is among *marks*."""
    return any(mark == DAGESH for mark in marks)
def _is_hebrew_letter(ch: str) -> bool:
return "\u05d0" <= ch <= "\u05ea"
def convert(text: str) -> str:
    """Convert nikkud Hebrew text to ktiv male.

    Strips all nikkud marks and inserts matres lectionis (vav/yod)
    according to Hebrew Academy spelling rules.

    The position-sensitive checks (the short-suffix test in Rule C and the
    word-initial test in Rule D) are evaluated per word: whitespace and maqaf
    start a new word, so each word of a phrase is treated as it would be on
    its own. For a single word the result is the same as counting letters
    over the whole text.

    Args:
        text: Vocalized Hebrew. Non-Hebrew characters are copied through
            without their combining marks.

    Returns:
        The text with combining marks removed and vav/yod inserted.
    """
    segments = _parse_segments(text)
    count = len(segments)

    # letters_before[i]: Hebrew letters preceding segment i within its word.
    letters_before = [0] * count
    run = 0
    for idx, (seg_ch, _) in enumerate(segments):
        if seg_ch.isspace() or seg_ch == MAQAF:
            run = 0
        letters_before[idx] = run
        if _is_hebrew_letter(seg_ch):
            run += 1

    # letters_after[i]: Hebrew letters following segment i within its word.
    letters_after = [0] * count
    run = 0
    for idx in range(count - 1, -1, -1):
        seg_ch = segments[idx][0]
        if seg_ch.isspace() or seg_ch == MAQAF:
            run = 0
        letters_after[idx] = run
        if _is_hebrew_letter(seg_ch):
            run += 1

    result: list[str] = []

    for i, (ch, marks) in enumerate(segments):
        if not _is_hebrew_letter(ch):
            # Non-Hebrew character: output as-is (no marks)
            result.append(ch)
            continue

        vowel = _get_vowel(marks)
        has_dag = _has_dagesh(marks)

        # Output the base letter (strip all nikkud marks)
        result.append(ch)

        # --- Rule A: U vowel (kubutz) → always add vav ---
        if vowel == KUBUTZ:
            result.append(VAV)
            continue

        # --- Shuruk detection ---
        # Vav with dagesh and no vowel is shuruk: the vav itself is the mater
        # lectionis and has already been output. A vav that carries a vowel
        # falls through to the vowel rules below.
        if ch == VAV and has_dag and vowel is None:
            continue

        # --- Rule B: O vowel (holam) → add vav ---
        if vowel in (HOLAM, HOLAM_HASER):
            if ch != VAV:
                # Exception: holam before aleph (pe-aleph verbs) — no vav
                # e.g., תֹּאבַד→תאבד, יֹאבַד→יאבד, נֹאבַד→נאבד
                next_is_aleph = i + 1 < len(segments) and segments[i + 1][0] == "א"
                if not next_is_aleph:
                    result.append(VAV)
            # If ch IS vav (holam male), the vav has already been output
            continue

        # --- Rule C: I vowel (hiriq) → conditionally add yod ---
        if vowel == HIRIQ:
            if ch == YOD:
                # Yod already present, don't double
                continue

            # Don't insert yod if the next letter is already yod
            if i + 1 < len(segments) and segments[i + 1][0] == YOD:
                continue

            add_yod = True
            if i + 1 < len(segments):
                next_ch, next_marks = segments[i + 1]
                next_vowel = _get_vowel(next_marks)
                next_has_dagesh = _has_dagesh(next_marks)

                # Shva on the next consonant (shva nach) closes the syllable
                # → no yod, UNLESS that consonant also has dagesh.
                if next_vowel == SHVA and not next_has_dagesh:
                    add_yod = False
                # Unvocalized next letter close to the end of the word
                # (short suffix such as תי, נו) → no yod.
                elif next_vowel is None and _is_hebrew_letter(next_ch):
                    if letters_after[i] <= 2:
                        add_yod = False

            if add_yod:
                result.append(YOD)
            continue

        # --- Rule D: E vowel (tsere) → generally NO yod ---
        # Exception: tsere before guttural/resh gets yod ONLY at the start of
        # the word (first letter, or second letter after a one-letter prefix)
        # e.g., הֵחֵל→היחל, תֵּאָבֵד→תיאבד, הֵרִיעַ→היריע
        # but NOT mid-word: מְסַפֵּר→מספר, מְעַבֵּר→מעבר
        if vowel == TSERE:
            add_yod = False
            if i + 1 < len(segments):
                next_ch = segments[i + 1][0]
                if next_ch in "אהחער":
                    if letters_before[i] <= 1:
                        add_yod = True
            if add_yod:
                result.append(YOD)
            continue

        # All other vowels (patah, kamatz, segol, shva, hataf-*):
        # no mater lectionis insertion needed

    return "".join(result)

BIN
pealim.apkg Normal file

Binary file not shown.

View file

@ -1,348 +0,0 @@
#!/usr/bin/env python3
"""Download audio files from URLs stored in words.json.
Three audio categories are handled:
1. Vocab audio  → data/audio/{audio_file}
2. Noun plural  → data/audio/{slug}_plural.mp3
3. Conjugation  → data/audio_conj/{slug}_{form_key}.mp3
                  data/audio_conj/{slug}_passive_{form_key}.mp3
"""
import argparse
import json
import logging
import re
import time
from pathlib import Path
import requests
logger = logging.getLogger(__name__)

# All paths are resolved relative to this script's directory.
DATA_DIR = Path(__file__).parent / "data"
AUDIO_DIR = DATA_DIR / "audio"  # vocab and noun-plural audio
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"  # conjugation-form audio
WORDS_JSON = DATA_DIR / "words.json"

DOWNLOAD_DELAY = 0.3  # seconds slept after each successful download
MAX_RETRIES = 3  # attempts per URL in _download()

# Map Hebrew tense names to English prefixes for form_key construction.
# "מְקוֹר" (infinitive) is included for forward compatibility; it does not
# appear in the current dataset but the form_key collapses to bare "infinitive".
TENSE_TO_PREFIX = {
    "הוֹוֶה": "present",
    "עָבָר": "past",
    "עָתִיד": "future",
    "צִוּוּי": "imperative",
    "מְקוֹר": "infinitive",
}
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_audio_file(entry: dict) -> str:
"""Derive the vocab audio filename when audio_file is absent.
Slug-based for confusable entries (slug contains the disambiguating ID),
consonant-only for all others.
Args:
entry: A words.json entry dict.
Returns:
Filename string, e.g. ``"1234-shalom.mp3"`` or ``"שלום.mp3"``.
"""
audio_file = entry.get("audio_file", "")
if audio_file:
return audio_file
# Fallback: use slug for confusables, ktiv_male for others
slug = entry.get("slug", "")
if entry.get("confusable_group"):
return f"{slug}.mp3"
ktiv_male = entry.get("word", {}).get("ktiv_male", "")
safe_name = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male)
return f"{safe_name}.mp3"
def _form_key(person: str, tense: str) -> str:
    """Combine a tense and a person code into a filename-friendly form key.

    Args:
        person: Person code, e.g. ``"1s"``, ``"3fp"``, ``"ms"``.
        tense: Hebrew tense string from the conjugation form. Tenses missing
            from ``TENSE_TO_PREFIX`` are used verbatim as the prefix.

    Returns:
        ``"{prefix}_{person}"``, except that the infinitive always yields the
        bare string ``"infinitive"``.
    """
    prefix = TENSE_TO_PREFIX.get(tense, tense)
    return "infinitive" if prefix == "infinitive" else f"{prefix}_{person}"
def _download(url: str, dest: Path, session: requests.Session) -> bool:
    """Download *url* to *dest*, retrying up to MAX_RETRIES times.

    Skips the download silently if *dest* already exists. The body is staged
    in a sibling ``<name>.part`` file and renamed into place, so an
    interrupted write cannot leave a truncated *dest* that a later run would
    mistake for a cached download.

    Args:
        url: HTTP(S) URL to download.
        dest: Local path to write the file to.
        session: Shared requests session.

    Returns:
        ``True`` if the file was downloaded (or already existed),
        ``False`` if all retries were exhausted.

    Raises:
        OSError: If the file cannot be written or renamed.
    """
    if dest.exists():
        return True

    staging = dest.with_name(dest.name + ".part")
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = session.get(url, timeout=15)
            resp.raise_for_status()
            body = resp.content
        except requests.RequestException as exc:
            if attempt < MAX_RETRIES:
                wait = 2**attempt  # exponential backoff between attempts
                logger.warning(
                    "Attempt %d/%d failed for %s (%s) — retrying in %ds",
                    attempt,
                    MAX_RETRIES,
                    url,
                    exc,
                    wait,
                )
                time.sleep(wait)
            else:
                logger.error("All %d attempts failed for %s: %s", MAX_RETRIES, url, exc)
            continue

        try:
            staging.write_bytes(body)
            staging.replace(dest)
        except OSError:
            # Don't leave a stray partial file behind on disk errors.
            staging.unlink(missing_ok=True)
            raise
        logger.debug("Downloaded %s -> %s", url, dest.name)
        return True

    return False
# ---------------------------------------------------------------------------
# Per-category downloaders
# ---------------------------------------------------------------------------
def download_vocab_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int, int]:
    """Fetch the vocabulary audio file of every entry that has an ``audio_url``.

    Files are stored in ``AUDIO_DIR`` under the entry's ``audio_file`` name,
    or under the name derived by ``_make_audio_file`` when that is empty.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached, no_url) counts. Downloads that fail
        after all retries are counted in ``no_url`` as well.
    """
    n_downloaded = n_cached = n_unavailable = 0

    for entry in entries:
        url: str | None = entry.get("audio_url")
        if not url:
            n_unavailable += 1
            continue

        filename = entry.get("audio_file") or _make_audio_file(entry)
        target = AUDIO_DIR / filename

        if target.exists():
            n_cached += 1
        elif _download(url, target, session):
            n_downloaded += 1
            time.sleep(DOWNLOAD_DELAY)
        else:
            # Persistent failures share the "no URL" bucket.
            n_unavailable += 1

    return n_downloaded, n_cached, n_unavailable
def download_noun_plural_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int]:
    """Fetch plural-form audio for entries whose noun inflection provides one.

    Only entries with a non-empty dict under ``noun_inflection`` and a
    ``plural_audio`` value starting with ``"http"`` are considered.
    Destination: ``data/audio/{slug}_plural.mp3``. Failed downloads are not
    counted.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached) counts.
    """
    n_downloaded = n_cached = 0

    for entry in entries:
        inflection = entry.get("noun_inflection")
        if not (inflection and isinstance(inflection, dict)):
            continue

        url: str | None = inflection.get("plural_audio")
        if not (url and url.startswith("http")):
            continue

        slug: str = entry["slug"]
        target = AUDIO_DIR / f"{slug}_plural.mp3"

        if target.exists():
            n_cached += 1
        elif _download(url, target, session):
            n_downloaded += 1
            time.sleep(DOWNLOAD_DELAY)

    return n_downloaded, n_cached
def download_conjugation_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int, int]:
    """Fetch audio for every conjugation form that carries an ``audio_url``.

    Active forms are saved as ``data/audio_conj/{slug}_{form_key}.mp3`` and
    forms listed under ``hufal_pual_forms`` as
    ``data/audio_conj/{slug}_passive_{form_key}.mp3``.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached, failed) counts.
    """
    # (field in the conjugation dict, filename infix) for each form list.
    voice_fields = (("active_forms", ""), ("hufal_pual_forms", "passive_"))
    n_downloaded = n_cached = n_failed = 0

    for entry in entries:
        conj = entry.get("conjugation")
        if not conj:
            continue

        slug: str = entry["slug"]
        for field, infix in voice_fields:
            for form in conj.get(field) or []:
                url: str | None = form.get("audio_url")
                if not url:
                    continue

                key = _form_key(form.get("person", ""), form.get("tense", ""))
                target = AUDIO_CONJ_DIR / f"{slug}_{infix}{key}.mp3"

                if target.exists():
                    n_cached += 1
                elif _download(url, target, session):
                    n_downloaded += 1
                    time.sleep(DOWNLOAD_DELAY)
                else:
                    n_failed += 1

    return n_downloaded, n_cached, n_failed
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main() -> None:
    """Run the audio download pipeline from the command line.

    Loads ``words.json``, optionally truncates it to the first N entries
    (``--test N``), downloads each audio category and logs a summary.
    ``--skip-vocab`` and ``--skip-conj`` disable their categories; noun-plural
    audio is always processed.
    """
    parser = argparse.ArgumentParser(description="Download Pealim audio files from words.json URLs.")
    parser.add_argument("--skip-vocab", action="store_true", help="Skip vocabulary audio downloads.")
    parser.add_argument("--skip-conj", action="store_true", help="Skip conjugation audio downloads.")
    parser.add_argument(
        "--test",
        metavar="N",
        type=int,
        default=None,
        help="Limit processing to the first N words.json entries.",
    )
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(message)s")

    for directory in (AUDIO_DIR, AUDIO_CONJ_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    with open(WORDS_JSON, encoding="utf-8") as fh:
        raw: dict[str, dict] = json.load(fh)

    entries = list(raw.values())
    if args.test is not None:
        entries = entries[: args.test]

    logger.info("[4] Downloading audio files …")

    session = requests.Session()
    session.headers["User-Agent"] = "Mozilla/5.0 (compatible; PealimAnkiDeck/1.0; audio-fetch)"

    # Skipped categories report zeros so the unpacking below stays uniform.
    vocab_counts = (0, 0, 0) if args.skip_vocab else download_vocab_audio(entries, session)
    plural_counts = download_noun_plural_audio(entries, session)
    conj_counts = (0, 0, 0) if args.skip_conj else download_conjugation_audio(entries, session)

    # --- Summary ---
    if not args.skip_vocab:
        logger.info(" Vocab: %d downloaded, %d cached, %d no URL", *vocab_counts)

    logger.info(" Noun plural: %d downloaded, %d cached", *plural_counts)

    if not args.skip_conj:
        c_dl, c_cached, c_failed = conj_counts
        failed_msg = f", {c_failed} failed" if c_failed else ""
        logger.info(" Conjugation: %d downloaded, %d cached%s", c_dl, c_cached, failed_msg)


if __name__ == "__main__":
    main()

File diff suppressed because it is too large Load diff

9106
pealim_dict.csv Normal file

File diff suppressed because it is too large Load diff

12111
pealim_dict_for_anki.csv Normal file

File diff suppressed because it is too large Load diff

View file

@ -1,714 +0,0 @@
#!/usr/bin/env python3
"""
Consolidated list page scraper for pealim.com.
Scrapes /dict/?page=N with two cookie variants (hebstyle=mo for nikkud,
hebstyle=vl for ktiv male) and writes results directly to data/words.json.
Usage:
python3 pealim_list_scrape.py [--test N] [--force-refresh]
"""
import argparse
import json
import logging
import os
import re
import time
from datetime import date
from pathlib import Path
import requests
from bs4 import BeautifulSoup
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# All data files live in the "data" directory next to this script.
PROJECT_ROOT = Path(__file__).parent
DATA_DIR = PROJECT_ROOT / "data"
WORDS_JSON = DATA_DIR / "words.json"  # scrape output, read and rewritten in place
PROGRESS_JSON = DATA_DIR / "list_scrape_progress.json"  # completed page numbers, for resuming
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
PEALIM_DICT_URL = "https://www.pealim.com/dict/"
REQUEST_DELAY = 1.5  # seconds between requests
REQUEST_TIMEOUT = 15  # seconds
DEFAULT_TOTAL_PAGES = 608
SAVE_EVERY = 10  # pages between incremental saves
# Evaluated once at import time; stamped into last_scrape_date of every row merged in this run.
TODAY = date.today().isoformat()
# Prefer lxml if available; html.parser is the fallback
try:
    import lxml  # type: ignore[import-untyped] # noqa: F401

    BS4_PARSER = "lxml"
except ImportError:
    BS4_PARSER = "html.parser"
# ---------------------------------------------------------------------------
# Part-of-speech mappings
# ---------------------------------------------------------------------------
# English part-of-speech label → vocalized Hebrew label (stored as pos_hebrew).
POS_HEBREW: dict[str, str] = {
    "Noun": "שֵׁם עֶצֶם",
    "Verb": "פֹּעַל",
    "Adjective": "שֵׁם תֹּאַר",
    "Adverb": "תֹּאַר הַפֹּעַל",
    "Pronoun": "כִּנּוּי גּוּף",
    "Preposition": "מִילַּת יַחַס",
    "Conjunction": "מִילַּת חִבּוּר",
    "Interjection": "מִילַּת קְרִיאָה",
    "Numeral": "שֵׁם מִסְפָּר",
    "Cardinal numeral": "שֵׁם מִסְפָּר",
    "Particle": "מִילִּית",
    "Determiner": "מְגַדִּיר",
    "Existential": "מִילַּת קִיּוּם",
    "Interrogative": "מִילַּת שְׁאֵלָה",
}
# Same pairs sorted longest key first: _parse_pos() matches keys as prefixes
# of the raw PoS string, so longer keys must be tried before shorter ones.
POS_HEBREW_ORDERED: list[tuple[str, str]] = sorted(POS_HEBREW.items(), key=lambda x: -len(x[0]))
# Transliterated binyan name → vocalized Hebrew name (looked up case-insensitively by _parse_pos()).
BINYAN_HEBREW: dict[str, str] = {
    "Pa'al": "פָּעַל",
    "Nif'al": "נִפְעַל",
    "Pi'el": "פִּיעֵל",
    "Pu'al": "פֻּעַל",
    "Hif'il": "הִפְעִיל",
    "Huf'al": "הֻפְעַל",
    "Hitpa'el": "הִתְפַּעֵל",
}
# Regex for extracting emoji characters
# Matches one run of consecutive emoji-range characters; the class also
# includes the variation selectors (U+FE0E/U+FE0F) and the zero-width joiner.
EMOJI_RE = re.compile(
    r"[\U0001F300-\U0001FFFF\U00002600-\U000027BF\U0001F000-\U0001F9FF\u2600-\u26FF\u2700-\u27BF\uFE0E\uFE0F\u200D]+",
    re.UNICODE,
)
# Regex for extracting Hebrew prepositions wrapped in parentheses, e.g. "(על)" or "(ב-)"
# Group 1 is the text between the parentheses.
HBPAREN_RE = re.compile(r"\(([\u05b0-\u05ea\u05f0-\u05f4\-]+)\)")
# Fields that must never be overwritten when updating an existing entry
# NOTE(review): _merge_row() honours this by not assigning these keys when it
# updates an existing entry; it does not consult this set — confirm whether
# any other code relies on it.
PROTECTED_FIELDS = frozenset(
    [
        "vocab_legacy_guid",
        "confusables_guid",
        "frequency",
        "pseudo_frequency",
        "emoji",
        "emoji_source",
        "emoji_visible",
        "image",
        "image_source",
        "hint",
        "examples",
        "noun_inflection",
        "conjugation",
        "adjective_inflection",
        "preposition_inflection",
    ]
)
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Logging is configured at import time, so importing this module changes the
# root logger's configuration.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# HTTP session
# ---------------------------------------------------------------------------
# One shared session, used by _fetch_page() for every list-page request.
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki-scraper/1.0)"})
# ---------------------------------------------------------------------------
# Default entry template
# ---------------------------------------------------------------------------
def _default_entry() -> dict:
"""Return a fresh entry with all fields initialised to safe defaults."""
return {
"word": {"nikkud": "", "ktiv_male": ""},
"slug": "",
"root": [],
"pos": "",
"pos_hebrew": "",
"meaning": "",
"meaning_raw": "",
"audio_url": "",
"audio_file": "",
"tags": "",
"last_scrape_date": "",
"vocab_legacy_guid": None,
"frequency": None,
"pseudo_frequency": None,
"emoji": None,
"emoji_source": None,
"emoji_visible": False,
"image": None,
"image_source": None,
"hint": "",
"prep": None,
"shared_roots": [],
"confusable_group": None,
"confusables_guid": None,
"examples": None,
"noun_inflection": None,
"conjugation": None,
"adjective_inflection": None,
"preposition_inflection": None,
}
# ---------------------------------------------------------------------------
# Parsing helpers
# ---------------------------------------------------------------------------
def _extract_emoji(text: str) -> str | None:
    """Return the first run of characters in *text* matched by ``EMOJI_RE``, or None."""
    match = EMOJI_RE.search(text)
    if match is None:
        return None
    return match.group(0)
def _clean_meaning(raw: str) -> str:
    """Return *raw* without emoji or parenthesized Hebrew prepositions.

    Whitespace is normalized afterwards: runs collapse to a single space and
    leading/trailing whitespace is dropped.
    """
    text = raw
    for pattern in (EMOJI_RE, HBPAREN_RE):
        text = pattern.sub("", text)
    words = text.split()
    return " ".join(words)
def _parse_pos(pos_raw: str) -> tuple[str, str]:
    """
    Parse raw PoS string into (pos_en, pos_hebrew).

    The English label is the longest ``POS_HEBREW`` key that prefixes the
    string, falling back to its first space-separated word. For verbs, the
    binyan named after the first dash (ASCII hyphen, en dash or em dash) is
    appended to the Hebrew label, separated by " — ".

    Examples:
        "Noun – masculine"  -> ("Noun", "שֵׁם עֶצֶם")
        "Verb – pa'al"      -> ("Verb", "פֹּעַל — פָּעַל")
        "Cardinal numeral"  -> ("Cardinal numeral", "שֵׁם מִסְפָּר")
    """
    pos_clean = pos_raw.strip()

    # Longest-match strategy: POS_HEBREW_ORDERED is sorted longest key first.
    pos_en = next((key for key, _ in POS_HEBREW_ORDERED if pos_clean.startswith(key)), "")
    if not pos_en:
        # Unknown label: fall back to the first word of the string.
        pos_en = pos_clean.split(" ")[0].strip()

    # Unknown labels have no translation and are passed through unchanged.
    pos_heb = POS_HEBREW.get(pos_en, pos_en)

    if pos_en == "Verb":
        # Accept hyphen, en dash and em dash as the separator; matching only
        # the ASCII hyphen would miss "Verb – pa'al".
        dash_parts = re.split(r"\s*[-\u2013\u2014]\s*", pos_clean)
        if len(dash_parts) >= 2:
            binyan_raw = dash_parts[1].strip().lower()
            # Case-insensitive lookup, so "pa'al" and "Pa'al" both match.
            binyan_heb = next(
                (heb for name, heb in BINYAN_HEBREW.items() if name.lower() == binyan_raw),
                None,
            )
            if binyan_heb:
                pos_heb = f"{pos_heb} — {binyan_heb}"

    return pos_en, pos_heb
def _parse_root(root_raw: str) -> list[str]:
"""
Convert raw root text to a list of consonants.
Pealim shows roots as "פ - ע - ל" or "פ.ע.ל" or "" (no root).
"""
if not root_raw or root_raw in ("-", "", ""):
return []
# Split on " - " or "." separators
parts = re.split(r"\s*[-–—.]\s*", root_raw.strip())
return [p.strip() for p in parts if p.strip()]
def _build_tags(pos_en: str, root: list[str]) -> str:
"""
Generate Anki tags string matching the existing project convention.
Examples:
pos=Noun, root=[] "שם_עצם"
pos=Noun, root=["א","ב"] "שורש::אב שם_עצם"
pos=Verb, root=["שמר"] "שורש::שמר פעלים"
"""
pos_tag_map = {
"Noun": "שם_עצם",
"Verb": "פעלים",
"Adjective": "שם_תואר",
"Adverb": "תוארי_הפועל",
"Pronoun": "כינוייוף",
"Preposition": "מילות_יחס",
"Conjunction": "מילות_חיבור",
"Particle": "מילית",
"Numeral": "שם_מספר",
"Cardinal numeral": "שם_מספר",
"Determiner": "מגדיר",
"Existential": "מילת_קיום",
"Interrogative": "מילת_שאלה",
"Interjection": "מילת_קריאה",
}
parts: list[str] = []
if root:
root_str = "".join(root)
parts.append(f"שורש::{root_str}")
pos_heb_tag = pos_tag_map.get(pos_en, "")
if pos_heb_tag:
parts.append(pos_heb_tag)
return " ".join(parts)
def _compute_audio_file(slug: str, ktiv_male: str) -> str:
"""
Return the local audio filename for an entry.
The actual confusable detection happens later (after all pages are scraped);
here we store a placeholder that post_process() will correct.
We default to the consonant-based name; confusables get slug-based names.
"""
consonants = ktiv_male or ""
return f"{consonants}.mp3" if consonants else f"{slug}.mp3"
# ---------------------------------------------------------------------------
# Page parsing
# ---------------------------------------------------------------------------
def _parse_mo_page(html: bytes) -> list[dict]:
    """
    Extract the word rows from a hebstyle=mo (nikkud) list page.

    Table rows with fewer than four cells, and rows whose word text is empty,
    are skipped. Each remaining row becomes a dict with keys:
        nikkud, slug, root_raw, pos_raw, meaning_raw, audio_url
    """
    soup = BeautifulSoup(html, BS4_PARSER)
    parsed: list[dict] = []

    for tr in soup.select("table tr"):
        cells = tr.find_all("td")
        if len(cells) < 4:
            continue
        word_cell, root_cell, pos_cell, meaning_cell = cells[:4]

        # The word cell carries the audio URL, the detail-page link and the word.
        audio_tag = word_cell.find(attrs={"data-audio": True})
        audio_url: str = audio_tag["data-audio"] if audio_tag else ""

        anchor = word_cell.find("a", href=True)
        slug_match = re.search(r"/dict/([^/]+)/", anchor["href"]) if anchor else None
        slug = slug_match.group(1) if slug_match else ""

        # Prefer the dedicated "menukad" span; otherwise use the whole cell text.
        vocalized = word_cell.find("span", class_="menukad")
        nikkud = (vocalized if vocalized else word_cell).get_text(strip=True)
        if not nikkud:
            continue

        parsed.append(
            {
                "nikkud": nikkud,
                "slug": slug,
                "root_raw": root_cell.get_text(strip=True),
                "pos_raw": pos_cell.get_text(strip=True),
                "meaning_raw": meaning_cell.get_text(strip=True),
                "audio_url": audio_url,
            }
        )

    return parsed
def _parse_vl_words(html: bytes) -> list[str]:
    """
    Extract the ktiv male spellings from a hebstyle=vl list page.

    Returns one string per table row that has at least four cells, in page
    order. Rows whose word text is empty still contribute an (empty) string.
    """
    soup = BeautifulSoup(html, BS4_PARSER)
    spellings: list[str] = []

    for tr in soup.select("table tr"):
        cells = tr.find_all("td")
        if len(cells) >= 4:
            word_cell = cells[0]
            # Prefer the dedicated "menukad" span; otherwise use the whole cell text.
            span = word_cell.find("span", class_="menukad")
            spellings.append((span if span else word_cell).get_text(strip=True))

    return spellings
# ---------------------------------------------------------------------------
# words.json I/O
# ---------------------------------------------------------------------------
def _load_words() -> dict:
    """Read and return the contents of words.json, or an empty dict if the file is absent."""
    if WORDS_JSON.exists():
        return json.loads(WORDS_JSON.read_text(encoding="utf-8"))

    logger.info("data/words.json not found — starting fresh.")
    return {}
def _save_words(words: dict) -> None:
    """Write *words* to words.json by way of a temporary file.

    The JSON is first written to ``words.json.tmp`` and then moved over the
    real file with ``os.replace``.
    """
    staging = WORDS_JSON.with_suffix(".json.tmp")
    serialized = json.dumps(words, ensure_ascii=False, indent=2)
    staging.write_text(serialized, encoding="utf-8")
    os.replace(staging, WORDS_JSON)
    logger.info("Saved data/words.json (%d entries)", len(words))
# ---------------------------------------------------------------------------
# Progress tracking
# ---------------------------------------------------------------------------
def _load_progress() -> set[int]:
    """Return the page numbers recorded as completed in the progress file.

    A missing progress file, or one without a ``completed_pages`` key, yields
    an empty set.
    """
    if not PROGRESS_JSON.exists():
        return set()

    payload = json.loads(PROGRESS_JSON.read_text(encoding="utf-8"))
    return set(payload.get("completed_pages", []))
def _save_progress(completed: set[int]) -> None:
    """Persist *completed* (stored sorted) to the progress file via a temporary file."""
    staging = PROGRESS_JSON.with_suffix(".json.tmp")
    payload = {"completed_pages": sorted(completed)}
    staging.write_text(json.dumps(payload), encoding="utf-8")
    os.replace(staging, PROGRESS_JSON)
# ---------------------------------------------------------------------------
# Unique key generation
# ---------------------------------------------------------------------------
def _make_unique_key(nikkud: str, pos_en: str, meaning: str, existing_keys: set[str]) -> str:
"""
Generate a collision-free unique key for a new entry.
Escalation:
1. nikkud
2. nikkud|pos_en
3. nikkud|pos_en|meaning
4. nikkud|pos_en|meaning|N (N = 2, 3, )
"""
candidate = nikkud
if candidate not in existing_keys:
return candidate
candidate = f"{nikkud}|{pos_en}"
if candidate not in existing_keys:
return candidate
candidate = f"{nikkud}|{pos_en}|{meaning}"
if candidate not in existing_keys:
return candidate
n = 2
while True:
candidate = f"{nikkud}|{pos_en}|{meaning}|{n}"
if candidate not in existing_keys:
return candidate
n += 1
# ---------------------------------------------------------------------------
# Core: merge one scraped row into words dict
# ---------------------------------------------------------------------------
def _merge_row(
    words: dict,
    slug_index: dict[str, str],
    nikkud: str,
    ktiv_male: str,
    slug: str,
    root_raw: str,
    pos_raw: str,
    meaning_raw_raw: str,
    audio_url: str,
) -> None:
    """
    Upsert a single scraped row into *words* in-place.

    An existing entry is located through *slug_index* (slug → unique_key) and
    only its list-level fields are refreshed; enrichment fields, including the
    emoji fields, are left untouched. Otherwise a new entry is created from
    the default template, given its emoji fields, stored under a fresh unique
    key and registered in *slug_index*.

    Args:
        words: The words.json mapping, mutated in place.
        slug_index: slug → unique_key lookup, updated when an entry is created.
        nikkud: Vocalized spelling from the hebstyle=mo page.
        ktiv_male: Plene spelling from the hebstyle=vl page.
        slug: Pealim slug of the row; may be empty.
        root_raw: Raw root cell text.
        pos_raw: Raw part-of-speech cell text.
        meaning_raw_raw: Raw meaning cell text.
        audio_url: Audio URL of the row, or "".
    """
    # Derived fields
    pos_en, pos_heb = _parse_pos(pos_raw)
    root = _parse_root(root_raw)
    meaning_raw = meaning_raw_raw
    meaning = _clean_meaning(meaning_raw)

    # Extract Hebrew preposition(s) from the raw meaning (e.g. "(על)" → "על")
    prep_matches = HBPAREN_RE.findall(meaning_raw)
    prep: str | None = " ".join(prep_matches) if prep_matches else None

    # ---- locate or create the entry ----
    unique_key: str | None = slug_index.get(slug) if slug else None

    if unique_key and unique_key in words:
        entry = words[unique_key]
    else:
        # The keys view supports O(1) membership tests; copying it into a set
        # for every new row would make a fresh scrape quadratic.
        unique_key = _make_unique_key(nikkud, pos_en, meaning, words.keys())
        entry = _default_entry()
        # Emoji fields are protected: they are set on creation only.
        emoji = _extract_emoji(meaning_raw)
        entry["emoji"] = emoji
        entry["emoji_source"] = "from_pealim" if emoji else None
        words[unique_key] = entry
        if slug:
            slug_index[slug] = unique_key

    # ---- list-level fields, refreshed on every scrape ----
    entry["word"]["nikkud"] = nikkud
    entry["word"]["ktiv_male"] = ktiv_male
    entry["slug"] = slug
    entry["root"] = root
    entry["pos"] = pos_en
    entry["pos_hebrew"] = pos_heb
    entry["meaning"] = meaning
    entry["meaning_raw"] = meaning_raw
    entry["prep"] = prep
    entry["audio_url"] = audio_url
    entry["audio_file"] = _compute_audio_file(slug, ktiv_male)
    entry["tags"] = _build_tags(pos_en, root)
    entry["last_scrape_date"] = TODAY
# ---------------------------------------------------------------------------
# Post-processing: recompute confusable_group, shared_roots, audio_file
# ---------------------------------------------------------------------------
def _post_process(words: dict) -> None:
    """
    Recompute the fields that depend on the whole dictionary, in place.

    - confusable_group: sorted keys of all entries sharing a ktiv_male (2+);
      otherwise reset to None unless the entry has a confusables_guid
    - audio_file: slug-based for confusables (when a slug exists),
      ktiv_male-based otherwise, falling back to the slug
    - shared_roots: sorted keys of the other entries with the same root
    """
    logger.info("Post-processing: recomputing confusable groups and shared roots...")

    # --- confusable groups ---
    by_spelling: dict[str, list[str]] = {}
    for key, entry in words.items():
        spelling = entry.get("word", {}).get("ktiv_male", "")
        if spelling:
            by_spelling.setdefault(spelling, []).append(key)

    for entry in words.values():
        spelling = entry.get("word", {}).get("ktiv_male", "")
        slug = entry.get("slug", "")
        homographs = by_spelling.get(spelling, [])

        if len(homographs) >= 2:
            entry["confusable_group"] = sorted(homographs)
            # Confusable → slug-based audio filename
            if slug:
                entry["audio_file"] = f"{slug}.mp3"
            continue

        # A group accompanied by a confusables_guid is kept as it is.
        if not entry.get("confusables_guid"):
            entry["confusable_group"] = None
        # Non-confusable → consonant-based audio filename
        entry["audio_file"] = f"{spelling}.mp3" if spelling else f"{slug}.mp3"

    # --- shared roots ---
    by_root: dict[str, list[str]] = {}
    for key, entry in words.items():
        if root := entry.get("root"):
            # "|"-joined letters serve as the canonical grouping key.
            by_root.setdefault("|".join(root), []).append(key)

    for key, entry in words.items():
        root = entry.get("root")
        if not root:
            entry["shared_roots"] = []
            continue
        siblings = by_root.get("|".join(root), [])
        entry["shared_roots"] = sorted(k for k in siblings if k != key)

    logger.info("Post-processing complete.")
# ---------------------------------------------------------------------------
# Scraping loop
# ---------------------------------------------------------------------------
def _build_slug_index(words: dict) -> dict[str, str]:
"""Build slug → unique_key lookup from the current words dict."""
index: dict[str, str] = {}
for key, entry in words.items():
slug = entry.get("slug", "")
if slug and slug not in index:
index[slug] = key
return index
def _fetch_page(url: str, cookies: dict) -> bytes | None:
    """GET *url* with *cookies* through the shared session.

    Returns the response body as bytes. Any ``requests.RequestException``,
    including an HTTP error status, is logged and reported as None.
    """
    try:
        response = session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        body = response.content
        return body
    except requests.RequestException as exc:
        logger.error("Request failed for %s: %s", url, exc)
        return None
def run_scrape(total_pages: int | None, force_refresh: bool) -> None:
    """
    Main scrape loop.

    For every pending list page the page is fetched twice — once with the
    ``hebstyle=mo`` cookie and once with ``hebstyle=vl`` — the two results are
    parsed, and each row is merged into the words dict via ``_merge_row``.
    Words and progress are saved every ``SAVE_EVERY`` successfully merged
    pages, and once more after post-processing at the end.

    Args:
        total_pages: Number of list pages to scrape. ``None`` means
            ``DEFAULT_TOTAL_PAGES`` (callers without a page limit pass None).
        force_refresh: If True, ignore progress file and re-scrape all pages.
    """
    if total_pages is None:
        # Without this, range(1, None + 1) below raises TypeError.
        total_pages = DEFAULT_TOTAL_PAGES

    words = _load_words()
    slug_index = _build_slug_index(words)

    # Always read the stored progress so that --force-refresh can report how
    # many completed pages it is discarding (the message was unreachable when
    # the set was emptied before the check).
    # NOTE(review): assumes _load_progress() only reads state — confirm.
    completed = _load_progress()
    if force_refresh:
        if completed:
            logger.info("--force-refresh: ignoring %d completed pages.", len(completed))
        completed = set()

    pages_to_do = [p for p in range(1, total_pages + 1) if p not in completed]
    logger.info(
        "Pages to scrape: %d / %d (already done: %d)",
        len(pages_to_do),
        total_pages,
        len(completed),
    )

    pages_since_save = 0
    failed_pages: list[int] = []  # pages skipped because a fetch failed
    for page_num in pages_to_do:
        url = f"{PEALIM_DICT_URL}?page={page_num}"
        logger.info("Scraping page %d / %d", page_num, total_pages)

        # --- hebstyle=mo (nikkud + audio + slug) ---
        mo_html = _fetch_page(url, {"translit": "none", "hebstyle": "mo"})
        if mo_html is None:
            logger.warning("Skipping page %d (mo fetch failed).", page_num)
            failed_pages.append(page_num)
            time.sleep(REQUEST_DELAY * 2)  # back off longer after a failure
            continue
        time.sleep(REQUEST_DELAY)

        # --- hebstyle=vl (ktiv male) ---
        vl_html = _fetch_page(url, {"translit": "none", "hebstyle": "vl"})
        if vl_html is None:
            logger.warning("Skipping page %d (vl fetch failed).", page_num)
            failed_pages.append(page_num)
            time.sleep(REQUEST_DELAY * 2)
            continue

        # Parse
        mo_rows = _parse_mo_page(mo_html)
        vl_words = _parse_vl_words(vl_html)
        if not mo_rows:
            logger.warning("Page %d returned no rows — might be past end.", page_num)
            completed.add(page_num)
            _save_progress(completed)
            time.sleep(REQUEST_DELAY)
            continue

        # Rows are paired with ktiv male words purely by position, so a length
        # mismatch means some rows may get the wrong (or an empty) ktiv male.
        if len(vl_words) != len(mo_rows):
            logger.warning(
                "Page %d: %d mo rows but %d vl words — ktiv male pairing may be off.",
                page_num,
                len(mo_rows),
                len(vl_words),
            )

        # Merge each row
        for i, row in enumerate(mo_rows):
            ktiv_male = vl_words[i] if i < len(vl_words) else ""
            _merge_row(
                words=words,
                slug_index=slug_index,
                nikkud=row["nikkud"],
                ktiv_male=ktiv_male,
                slug=row["slug"],
                root_raw=row["root_raw"],
                pos_raw=row["pos_raw"],
                meaning_raw_raw=row["meaning_raw"],
                audio_url=row["audio_url"],
            )
        completed.add(page_num)
        pages_since_save += 1

        # Incremental save every SAVE_EVERY pages
        if pages_since_save >= SAVE_EVERY:
            _save_words(words)
            _save_progress(completed)
            pages_since_save = 0
        time.sleep(REQUEST_DELAY)

    if failed_pages:
        # These pages were never added to `completed`, so a later run retries them.
        logger.warning(
            "%d page(s) could not be fetched and remain pending: %s",
            len(failed_pages),
            failed_pages,
        )

    # Final save + post-processing
    logger.info("All pages scraped. Running post-processing…")
    _post_process(words)
    _save_words(words)
    _save_progress(completed)
    logger.info("Done. Total entries in words.json: %d", len(words))
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main() -> None:
    """Parse the command line and start the list-page scrape."""
    cli = argparse.ArgumentParser(description="Scrape pealim.com list pages into data/words.json.")
    cli.add_argument(
        "--test",
        metavar="N",
        type=int,
        default=None,
        help="Scrape only the first N pages (for testing).",
    )
    cli.add_argument(
        "--force-refresh",
        action="store_true",
        default=False,
        help="Re-scrape all pages, ignoring existing progress.",
    )
    options = cli.parse_args()

    # --test N caps the page count; without it the module default applies.
    if options.test is None:
        page_count = DEFAULT_TOTAL_PAGES
    else:
        page_count = options.test

    logger.info(
        "Starting pealim list scraper | pages=%d | force=%s | parser=%s",
        page_count,
        options.force_refresh,
        BS4_PARSER,
    )
    run_scrape(total_pages=page_count, force_refresh=options.force_refresh)
if __name__ == "__main__":
main()

View file

@ -25,9 +25,6 @@ dev = [
[tool.pytest.ini_options]
testpaths = ["tests"]
markers = [
"integration: marks tests that hit the real pealim.com network (deselect with -m 'not integration')",
]
[tool.ruff]
target-version = "py311"

183
rebuild_sentence_matches.py Normal file
View file

@ -0,0 +1,183 @@
#!/usr/bin/env python3
"""
Rebuild vocab_sentence_matches.json using both direct word matching
and ktiv male conjugated/declined form matching.
This dramatically improves sentence coverage by matching not just
dictionary forms but all conjugated verbs and declined nouns.
"""
import json
import logging
import re
from pathlib import Path
import pandas as pd
from helpers import strip_nikkud as _strip_nikkud
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)
DATA_DIR = Path(__file__).parent / "data"
def main():
    """Rebuild ``vocab_sentence_matches.json`` from the sentence index and the vocab CSV.

    Reads ``epub_sentence_index.json``, ``hebrew_dict_for_anki.csv`` and, when
    present, ``ktiv_male_forms.json`` from ``DATA_DIR``. Every sentence of 4–15
    tokens is matched token-by-token against the nikkud-stripped dictionary
    forms and the ktiv male forms. Up to three sentences are kept per word
    (6–12 word sentences preferred) and written out keyed by the word's
    "Word Without Nikkud" value. Coverage and match-type statistics are logged.
    """
    # Load sentences (explicit UTF-8: the files hold Hebrew text and must not
    # depend on the platform's default encoding)
    with open(DATA_DIR / "epub_sentence_index.json", encoding="utf-8") as f:
        sentences = json.load(f).get("sentences", [])
    logger.info(f"Loaded {len(sentences)} sentences")

    # Load vocab CSV: try ';' first, fall back to ',' if that yields < 3 columns
    csv_path = DATA_DIR / "hebrew_dict_for_anki.csv"
    try:
        df = pd.read_csv(csv_path, sep=";", index_col=0)
        if df.shape[1] < 3:
            raise ValueError
    except (ValueError, pd.errors.ParserError):
        df = pd.read_csv(csv_path, index_col=0)
    logger.info(f"Loaded {len(df)} vocab entries")

    # Build word lookup: stripped_form → [(word_nikkud, word_no_nikkud)]
    word_lookup: dict[str, list[tuple[str, str]]] = {}
    for _, row in df.iterrows():
        word = str(row.get("Word", "")).strip()
        wni = str(row.get("Word Without Nikkud", "")).strip()
        if not word or word in ("nan", "None"):
            continue
        stripped = _strip_nikkud(word)
        if stripped:
            word_lookup.setdefault(stripped, []).append((word, wni))

    # Load ktiv male forms: ktiv_male_form → [{word_nikkud, form_type, ...}]
    ktiv_path = DATA_DIR / "ktiv_male_forms.json"
    ktiv_forms: dict[str, list[dict]] = {}
    if ktiv_path.exists():
        with open(ktiv_path, encoding="utf-8") as f:
            ktiv_forms = json.load(f)
        logger.info(f"Loaded {len(ktiv_forms)} ktiv male forms")
    else:
        logger.warning("No ktiv_male_forms.json — only using direct matching")

    # Build reverse lookup: ktiv_male → set of dictionary words (nikkud)
    ktiv_to_word: dict[str, set[str]] = {}
    for ktiv, entries in ktiv_forms.items():
        for entry in entries:
            word_nikkud = entry.get("word_nikkud", "")
            if word_nikkud:
                ktiv_to_word.setdefault(ktiv, set()).add(word_nikkud)

    # Also add all vocab words' own stripped forms to ktiv_to_word
    for stripped, entries in word_lookup.items():
        for word_nikkud, _ in entries:
            ktiv_to_word.setdefault(stripped, set()).add(word_nikkud)
    logger.info(f"Total matchable forms: {len(ktiv_to_word)}")

    # Tokenize all sentences once (pattern compiled once, outside the loop)
    punctuation = re.compile(r'[.,!?;:"\'\u05be]')
    sentence_tokens: list[tuple[dict, list[str]]] = []
    for s in sentences:
        stripped = s.get("stripped", _strip_nikkud(s.get("text", "")))
        tokens = [punctuation.sub("", t) for t in stripped.split()]
        tokens = [t for t in tokens if t]  # remove empty
        sentence_tokens.append((s, tokens))

    # Match: for each sentence token, check ktiv_to_word lookup
    matches: dict[str, list[dict]] = {}  # word_nikkud → [sentences]
    for sent, tokens in sentence_tokens:
        text = sent.get("text", "")
        book = sent.get("book", "")
        word_len = len(tokens)
        # Skip sentences that are too short or too long
        if word_len < 4 or word_len > 15:
            continue
        for tok in tokens:
            if tok in ktiv_to_word:
                for word_nikkud in ktiv_to_word[tok]:
                    matches.setdefault(word_nikkud, []).append(
                        {
                            "text": text,
                            "book": book,
                            "matched_form": tok,
                            "word_count": word_len,
                        }
                    )
    logger.info(f"Words with at least 1 match: {len(matches)}")

    def score(s):
        """Sort key: 0 for the ideal 6–12 word range, else distance from 9 words."""
        wc = s["word_count"]
        if 6 <= wc <= 12:
            return 0  # ideal
        return abs(wc - 9)  # distance from ideal

    # Deduplicate and limit to 3 best sentences per word
    output: dict[str, dict] = {}
    # word_nikkud → token forms that produced its kept sentences. Tracked
    # separately because the saved sentence dicts carry only text and book, so
    # the match-type breakdown below cannot read matched_form from them.
    kept_forms: dict[str, set[str]] = {}
    for word_nikkud, sents in matches.items():
        # Deduplicate by text
        seen_texts = set()
        unique = []
        for s in sents:
            if s["text"] not in seen_texts:
                seen_texts.add(s["text"])
                unique.append(s)

        unique.sort(key=score)
        best = unique[:3]
        kept_forms[word_nikkud] = {s["matched_form"] for s in best}

        # Find the Word Without Nikkud for this word; default to the stripped form
        stripped = _strip_nikkud(word_nikkud)
        wni = stripped
        for wn, w_wni in word_lookup.get(stripped, []):
            if wn == word_nikkud:
                # A missing CSV cell stringifies to "nan"/"None"; keep the
                # stripped form as the key in that case.
                if w_wni and w_wni not in ("nan", "None"):
                    wni = w_wni
                break

        output[wni] = {
            "word_nikkud": word_nikkud,
            "sentences": [{"text": s["text"], "book": s["book"]} for s in best],
        }

    # Save
    out_path = DATA_DIR / "vocab_sentence_matches.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=1)
    total_sents = sum(len(v["sentences"]) for v in output.values())
    logger.info(f"Saved {len(output)} words with {total_sents} sentences → {out_path}")

    # Stats (guard against an empty vocab so the percentage cannot divide by zero)
    total_vocab = len(df)
    pct = len(output) * 100 / total_vocab if total_vocab else 0.0
    logger.info(f"Coverage: {len(output)}/{total_vocab} ({pct:.1f}%)")

    # Breakdown by match type: "direct" = a kept sentence contains the word's
    # own stripped form; "ktiv" = a kept sentence matched through another form.
    direct_only = 0
    ktiv_only = 0
    both = 0
    for info in output.values():
        word = info["word_nikkud"]
        stripped = _strip_nikkud(word)
        forms = kept_forms.get(word, set())
        has_direct = stripped in forms
        has_ktiv = any(form != stripped for form in forms)
        if has_direct and has_ktiv:
            both += 1
        elif has_ktiv:
            ktiv_only += 1
        else:
            direct_only += 1
    logger.info(f"  Direct matches only: {direct_only}")
    logger.info(f"  Ktiv male matches only: {ktiv_only}")
    logger.info(f"  Both: {both}")
if __name__ == "__main__":
main()

View file

@ -20,11 +20,8 @@ from pathlib import Path
import requests
sys.path.insert(0, "/home/node/projects")
import load_keeshare
REPO_API = "https://git.nevo.engineer/api/v1/repos/nevo/hebrew_flash_cards"
# The API token is read from the credential store at import time. A literal
# token must never be assigned here: it would override the lookup and publish
# the secret in version control.
# NOTE(review): a literal token was previously committed on the next line —
# treat it as leaked and rotate it.
FORGEJO_TOKEN: str = load_keeshare.get_entry("git.nevo.engineer")["password"]
OUTPUT_DIR = Path(__file__).parent / "output"
# All deck variants to include in release

505
run.py
View file

@ -7,23 +7,13 @@ Usage:
Options:
--only {vocab,conjugations,confusables,plurals,complete} Run only one deck
Pipeline steps:
1. List scrape scrape pealim.com list pages words.json (captures slugs)
2. Detail scrape scrape noun/verb detail pages using slugs words.json
3. Frequency load/download word frequency data
4. Examples extract example sentences from Hebrew EPUBs
5. Audio download download audio mp3 files
6. Fonts download Heebo font files
7. Images fetch noun images from Wikipedia
8. Build build all .apkg deck variants
Options:
--skip-scrape Skip list page scraping (use existing words.json)
--skip-detail Skip detail page scraping
--skip-scrape Use existing data/pealim_dict.csv (no pealim.com dict scraping)
--skip-audio Skip audio .mp3 downloads
--skip-examples Skip EPUB example extraction
--skip-examples Skip Ben Yehuda example fetching
--skip-conjugations Skip verb conjugation extraction
--skip-images Skip image fetching for concrete nouns
--test N Limit to first N words/pages
--refresh-examples Force rebuild of Ben Yehuda index (delete old, download nikkud corpus)
--test N Process only the first N dictionary words (for quick testing)
"""
import argparse
@ -31,8 +21,11 @@ import json
import logging
import re
import sys
import time
from pathlib import Path
from helpers import strip_nikkud
sys.path.insert(0, str(Path(__file__).parent))
logging.basicConfig(
@ -46,7 +39,6 @@ OUTPUT_DIR = Path(__file__).parent / "output"
AUDIO_DIR = DATA_DIR / "audio"
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
FONTS_DIR = DATA_DIR / "fonts"
WORDS_JSON = DATA_DIR / "words.json"
def parse_args():
@ -56,117 +48,282 @@ def parse_args():
choices=["vocab", "conjugations", "confusables", "plurals", "complete"],
help="Run only one deck (skips all unrelated steps)",
)
p.add_argument("--skip-scrape", action="store_true", help="Skip list page scraping")
p.add_argument("--skip-detail", action="store_true", help="Skip detail page scraping")
p.add_argument("--skip-scrape", action="store_true", help="Skip dict scraping; use cached CSV")
p.add_argument("--skip-audio", action="store_true", help="Skip audio downloads")
p.add_argument("--skip-examples", action="store_true", help="Skip EPUB example extraction")
p.add_argument("--skip-examples", action="store_true", help="Skip Ben Yehuda example lookup")
p.add_argument(
"--skip-conjugations",
action="store_true",
help="Skip verb conjugation extraction (deprecated: use --only vocab)",
)
p.add_argument("--skip-images", action="store_true", help="Skip image fetching")
p.add_argument("--refresh-examples", action="store_true", help="Force rebuild of Ben Yehuda index")
p.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
return p.parse_args()
def step_list_scrape(args):
"""Step 1 — scrape pealim.com list pages → words.json."""
def step_scrape(args):
"""Step 1 — scrape or load dictionary."""
dict_csv = DATA_DIR / "hebrew_dict.csv"
anki_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
# Legacy fallback names
legacy_dict = DATA_DIR / "pealim_dict.csv"
if args.skip_scrape:
if WORDS_JSON.exists():
logger.info("[1] Using existing words.json (--skip-scrape)")
if dict_csv.exists():
logger.info(f"[1] Using existing {dict_csv}")
elif legacy_dict.exists():
logger.info(f"[1] Using legacy {legacy_dict} (consider renaming)")
else:
logger.error(f"[1] --skip-scrape set but {WORDS_JSON} not found. Aborting.")
logger.error(f"[1] --skip-scrape set but {dict_csv} not found. Aborting.")
sys.exit(1)
return
logger.info("[1] Scraping dictionary list pages from pealim.com …")
import pealim_list_scrape
logger.info("[1] Scraping dictionary from pealim.com …")
total_pages = args.test if args.test else None
pealim_list_scrape.run_scrape(total_pages=total_pages, force_refresh=False)
import hebrew_extract
df = hebrew_extract.extract_from_website()
df.to_csv(dict_csv, index=True)
logger.info(f" Saved {len(df)} words → {dict_csv}")
df = hebrew_extract.modify_for_anki(df)
df.to_csv(anki_csv, sep=";", index=True)
logger.info(f" Saved Anki CSV → {anki_csv}")
def step_frequency() -> dict[str, int]:
"""Step 3 — load/download word frequency data."""
logger.info("[3] Loading word frequency data …")
"""Step 2 — load/download word frequency data."""
logger.info("[2] Loading word frequency data …")
import frequency_lookup
frequency_lookup.load()
return frequency_lookup._freq
def step_examples(args) -> dict:
"""Step 4 — extract example sentences from Hebrew EPUBs."""
def step_examples(args, freq_cache: dict):
"""Step 3 — load/build Ben Yehuda example index."""
if args.skip_examples:
logger.info("[4] Skipping examples (--skip-examples)")
logger.info("[3] Skipping examples (--skip-examples)")
examples_path = DATA_DIR / "examples_cache.json"
if examples_path.exists():
with open(examples_path) as f:
return json.load(f)
return {}
logger.info("[4] Extracting EPUB example sentences …")
import epub_examples
logger.info("[3] Loading Ben Yehuda example index")
import benyehuda
if not WORDS_JSON.exists():
logger.warning("[4] words.json not found, skipping examples")
return {}
benyehuda.load(force_rebuild=args.refresh_examples)
with open(WORDS_JSON, encoding="utf-8") as f:
words = json.load(f)
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "hebrew_dict.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict.csv"
stats = epub_examples.run(words)
try:
import pandas as pd
# Save updated words.json
with open(WORDS_JSON, "w", encoding="utf-8") as f:
json.dump(words, f, ensure_ascii=False, indent=2)
try:
df = pd.read_csv(dict_csv, sep=";", index_col=0)
if df.shape[1] < 3:
raise ValueError("too few columns")
except (ValueError, pd.errors.ParserError):
df = pd.read_csv(dict_csv, index_col=0)
logger.info(f" Coverage: {stats['matched']}/{stats['total_vocab']}")
return stats
if args.test:
df = df.head(args.test)
logger.info(f" Pre-fetching examples for {len(df)} words …")
for _, row in df.iterrows():
# Use nikkud word form as primary key (nikkud corpus)
word_nikkud = str(row.get("Word", "")).strip()
if word_nikkud:
benyehuda.get_examples(word_nikkud)
except Exception as e:
logger.warning(f" Could not pre-fetch all examples: {e}")
benyehuda.save_examples_cache()
return benyehuda._examples_cache
def step_detail_scrape(args):
    """Step 2 — scrape detail pages for nouns and verbs → update words.json.

    Does nothing beyond logging when ``--skip-detail`` is set. Otherwise the
    detail scraper is run with ``args.test`` as its limit (``None`` when
    ``args.test`` is unset or 0) and ``force_refresh=False``.
    """
    if not args.skip_detail:
        logger.info("[2] Scraping detail pages from pealim.com …")
        import pealim_detail_scrape

        pealim_detail_scrape.run(test=args.test or None, force_refresh=False)
    else:
        logger.info("[2] Skipping detail scrape (--skip-detail)")
def step_audio_download(args):
"""Step 5 — download audio .mp3 files from URLs in words.json."""
def step_audio(args):
"""Step 4 — download vocabulary audio .mp3 files from audio_url column in CSV."""
if args.skip_audio:
logger.info("[5] Skipping audio (--skip-audio)")
logger.info("[4] Skipping audio (--skip-audio)")
return
logger.info("[5] Downloading audio files …")
logger.info("[4] Downloading vocabulary audio files …")
import pealim_audio_download
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "hebrew_dict.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict.csv"
test_limit = args.test if args.test else None
pealim_audio_download.run(test=test_limit)
import pandas as pd
import requests
try:
try:
df = pd.read_csv(dict_csv, sep=";", index_col=0)
if df.shape[1] < 3:
raise ValueError("too few columns")
except (ValueError, pd.errors.ParserError):
df = pd.read_csv(dict_csv, index_col=0)
if "audio_url" not in df.columns:
logger.warning(" No audio_url column in CSV — re-scrape with hebrew_extract.py to capture audio URLs")
return
if args.test:
df = df.head(args.test)
AUDIO_DIR.mkdir(parents=True, exist_ok=True)
downloaded = 0
skipped = 0
no_url = 0
for _, row in df.iterrows():
word = str(row.get("Word", "")).strip()
word_plain = str(row.get("Word Without Nikkud", "")).strip()
audio_url = str(row.get("audio_url", "")).strip()
if not word:
continue
safe_name = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_plain or word))
if not safe_name:
continue
mp3_path = AUDIO_DIR / f"{safe_name}.mp3"
if mp3_path.exists():
skipped += 1
continue
if not audio_url or audio_url in ("nan", "None", ""):
no_url += 1
continue
try:
resp = requests.get(audio_url, timeout=10)
resp.raise_for_status()
mp3_path.write_bytes(resp.content)
downloaded += 1
time.sleep(0.3)
except Exception as e:
logger.debug(f" Audio download failed for {word}: {e}")
logger.info(f" Audio: {downloaded} downloaded, {skipped} already cached, {no_url} without URL")
except Exception as e:
logger.warning(f" Audio step failed: {e}")
def step_fonts(_args: argparse.Namespace):
"""Step 6 — download Heebo font files (one-time, cached)."""
def step_conj_audio(args, conjugations: dict):
    """Step 4b — download conjugation audio .mp3 files.

    For every value in ``conjugations`` that has both ``forms`` and a ``slug``,
    each form's ``audio_url`` is downloaded into ``AUDIO_CONJ_DIR`` as
    ``{slug}_{form_key}.mp3``. Forms of the verb's ``passive_partner`` are
    saved as ``{slug}_passive_{form_key}.mp3``. Files already on disk are not
    downloaded again. A failed download is logged at debug level and counted;
    it does not abort the step.

    Args:
        args: Parsed CLI namespace; only ``skip_audio`` is read here.
        conjugations: Mapping whose values are conjugation data dicts (or falsy
            placeholders). The keys are not used by this step.
    """
    if args.skip_audio:
        logger.info("[4b] Skipping conjugation audio (--skip-audio)")
        return

    logger.info("[4b] Downloading conjugation audio files …")
    AUDIO_CONJ_DIR.mkdir(parents=True, exist_ok=True)

    import requests

    counts = {"downloaded": 0, "skipped": 0, "failed": 0}

    def fetch_forms(forms: dict, prefix: str) -> None:
        """Download each form that has an audio URL to ``{prefix}_{form_key}.mp3``."""
        for form_key, form_data in forms.items():
            audio_url = form_data.get("audio_url", "")
            if not audio_url:
                continue
            filename = f"{prefix}_{form_key}.mp3"
            mp3_path = AUDIO_CONJ_DIR / filename
            if mp3_path.exists():
                counts["skipped"] += 1
                continue
            try:
                resp = requests.get(audio_url, timeout=10)
                resp.raise_for_status()
                mp3_path.write_bytes(resp.content)
            except (requests.RequestException, OSError) as e:
                # Network and disk errors are best-effort; anything else is a
                # programming error and should surface.
                logger.debug(f"  Conj audio failed {filename}: {e}")
                counts["failed"] += 1
                continue
            counts["downloaded"] += 1
            time.sleep(0.2)  # pause between successful downloads

    for data in conjugations.values():
        if not data or not data.get("forms"):
            continue
        slug = data.get("slug", "")
        if not slug:
            continue

        # Active forms
        fetch_forms(data["forms"], slug)

        # Passive partner forms
        passive = data.get("passive_partner")
        if passive and passive.get("forms"):
            fetch_forms(passive["forms"], f"{slug}_passive")

    logger.info(
        f"  Conjugation audio: {counts['downloaded']} downloaded, "
        f"{counts['skipped']} cached, {counts['failed']} failed"
    )
def step_fonts(args):
"""Step 4c — download Heebo font files (one-time, cached)."""
FONTS_DIR.mkdir(parents=True, exist_ok=True)
regular = FONTS_DIR / "_Heebo-Regular.ttf"
bold = FONTS_DIR / "_Heebo-Bold.ttf"
if regular.exists() and bold.exists():
logger.info("[6] Heebo fonts already cached")
logger.info("[4c] Heebo fonts already cached")
return
logger.info("[6] Downloading Heebo fonts from Google Fonts …")
logger.info("[4c] Downloading Heebo fonts from Google Fonts …")
# Fetch CSS to get actual TTF source URLs (static subset for Hebrew + Latin)
import requests as _req
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) Gecko/20100101 Firefox/120.0"}
headers = {
# Request TTF (not woff2) so Anki can embed them
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) Gecko/20100101 Firefox/120.0"
}
css_url = "https://fonts.googleapis.com/css2?family=Heebo:wght@400;700"
try:
css_resp = _req.get(css_url, headers=headers, timeout=15)
css_resp.raise_for_status()
css_text = css_resp.text
# Find all src: url(...) references (may be woff2 for modern UA)
font_urls = re.findall(r"src:\s*url\(([^)]+)\)", css_text)
logger.debug(f" Found {len(font_urls)} font URL(s) in CSS")
# Prefer TTF; if only woff2 available, download first two and note
downloaded = []
for i, fu in enumerate(font_urls[:2]):
fu = fu.strip("'\"")
dest = regular if i == 0 else bold
@ -175,74 +332,142 @@ def step_fonts(_args: argparse.Namespace):
fr = _req.get(fu, timeout=15)
fr.raise_for_status()
dest.write_bytes(fr.content)
downloaded.append(dest.name)
logger.info(f" Downloaded → {dest.name}")
if not downloaded:
logger.info(" All font files already present")
except Exception as e:
logger.warning(f" Heebo download failed: {e}")
logger.warning(" Cards will fall back to Arial Hebrew / David.")
logger.warning(
" To install manually: download Heebo-Regular.ttf and Heebo-Bold.ttf "
"from https://fonts.google.com/specimen/Heebo and rename with _ prefix "
f"into {FONTS_DIR}"
)
def step_images(args) -> dict:
"""Step 7 — fetch images for concrete nouns (resume-safe)."""
"""Step 4d — fetch images for concrete nouns (resume-safe)."""
if args.skip_images:
logger.info("[7] Skipping images (--skip-images)")
logger.info("[4d] Skipping images (--skip-images)")
cache_path = DATA_DIR / "image_cache.json"
if cache_path.exists():
with open(cache_path) as f:
return json.load(f)
return {}
limit = args.test
logger.info("[7] Fetching images for concrete nouns …")
limit = args.test # When in test mode, limit images too
logger.info("[4d] Fetching images for concrete nouns …")
import image_fetch
return image_fetch.run(limit=limit)
def step_build_all(args):
"""Step 8 — build all 12 release variants from the unified words.json."""
logger.info("[8] Building all deck variants …")
def step_build_all(
args, examples_cache: dict, freq_cache: dict, conjugations: dict | None, image_cache: dict | None = None
):
"""Step 5 — build all 6 release variants (4 vocab + 2 conj)."""
logger.info("[5] Building all deck variants …")
import apkg_builder
if not WORDS_JSON.exists():
logger.error(f"[8] {WORDS_JSON} not found. Run the data pipeline first.")
sys.exit(1)
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "hebrew_dict.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict.csv"
with open(WORDS_JSON, encoding="utf-8") as f:
words = json.load(f)
apkg_builder.build_all_variants(words, limit=args.test)
apkg_builder.build_all_variants(
dict_csv,
conjugations=conjugations or {},
examples_cache=examples_cache,
freq_cache=freq_cache,
image_cache=image_cache or {},
limit=args.test,
)
def print_summary(_args: argparse.Namespace, example_stats: dict, freq_cache: dict):
def step_conjugations(args):
    """Step 6 — extract conjugations (returns data; building handled by step_build_all).

    --skip-conjugations skips re-extraction from pealim.com but still loads
    from cache so conj deck variants are built correctly.

    Returns:
        The conjugations data loaded from ``data/conjugations.json`` or freshly
        extracted from ``verbs_input.txt``. ``None`` when ``--skip-conjugations``
        is set with no cache, or when ``verbs_input.txt`` does not exist.
        Conjugation audio is downloaded (via ``step_conj_audio``) only on the
        non-skip path.
    """
    conj_cache = DATA_DIR / "conjugations.json"

    def load_cache():
        """Read the cached conjugations file."""
        # JSON is UTF-8 by specification; do not rely on the locale default.
        # NOTE(review): assumes the cache was written as UTF-8/ASCII — confirm
        # against conjugation_extract.
        with open(conj_cache, encoding="utf-8") as f:
            return json.load(f)

    if args.skip_conjugations:
        if conj_cache.exists():
            logger.info("[6] --skip-conjugations: loading from cache …")
            return load_cache()
        logger.info("[6] --skip-conjugations: no cache found, skipping conj decks")
        return None

    verbs_file = Path(__file__).parent / "verbs_input.txt"
    if not verbs_file.exists():
        logger.info("[6] verbs_input.txt not found — skipping conjugation deck")
        return None

    if conj_cache.exists():
        logger.info("[6] Using cached conjugations.json …")
        conjugations = load_cache()
    else:
        logger.info("[6] Extracting verb conjugations …")
        import conjugation_extract

        conjugations = conjugation_extract.main(verbs_file)

    # Download conjugation audio
    step_conj_audio(args, conjugations)
    return conjugations
def print_summary(args, examples_cache, freq_cache, conjugations):
logger.info("")
logger.info("=" * 60)
logger.info("SUMMARY")
logger.info("=" * 60)
if WORDS_JSON.exists():
with open(WORDS_JSON, encoding="utf-8") as f:
words = json.load(f)
logger.info(f" Dictionary words: {len(words)}")
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "hebrew_dict.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict.csv"
if dict_csv.exists():
import pandas as pd
nouns = sum(1 for e in words.values() if e.get("pos", "").startswith("Noun"))
verbs = sum(1 for e in words.values() if e.get("pos", "").startswith("Verb"))
detail_scraped = sum(1 for e in words.values() if e.get("detail_scraped"))
logger.info(f" Nouns: {nouns}, Verbs: {verbs}, Detail-scraped: {detail_scraped}")
try:
df = pd.read_csv(dict_csv, sep=";", index_col=0)
if df.shape[1] < 3:
raise ValueError("too few columns")
except (ValueError, pd.errors.ParserError):
df = pd.read_csv(dict_csv, index_col=0)
logger.info(f" Dictionary words: {len(df)}")
logger.info(f" Frequency entries: {len(freq_cache)}")
matched = example_stats.get("matched", 0)
total = example_stats.get("total_vocab", 0)
if total:
logger.info(f" Example coverage: {matched}/{total} ({100 * matched // total}%)")
for book, count in example_stats.get("books", {}).items():
logger.info(f" {book}: {count} sentences")
logger.info(f" Example cache entries: {len(examples_cache)}")
covered = sum(1 for v in examples_cache.values() if v)
if examples_cache:
logger.info(f" Example coverage: {covered}/{len(examples_cache)} ({100 * covered // len(examples_cache)}%)")
if AUDIO_DIR.exists():
mp3s = list(AUDIO_DIR.glob("*.mp3"))
logger.info(f" Vocabulary audio files: {len(mp3s)}")
if AUDIO_CONJ_DIR.exists():
# Count only files that will be bundled: active non-infinitive forms
# (excludes {slug}_passive_* and {slug}_infinitive.mp3 on-disk extras)
mp3s = [
p for p in AUDIO_CONJ_DIR.glob("*.mp3") if not p.stem.endswith("_infinitive") and "_passive_" not in p.stem
]
@ -273,6 +498,9 @@ def print_summary(_args: argparse.Namespace, example_stats: dict, freq_cache: di
if apkg.exists():
size_mb = apkg.stat().st_size / 1e6
logger.info(f" {apkg.name}: {size_mb:.1f} MB")
if conjugations:
verb_count = sum(1 for v in conjugations.values() if v)
logger.info(f" Verbs in conjugation deck: {verb_count}")
logger.info("=" * 60)
logger.info("DONE")
@ -287,75 +515,92 @@ def main():
logger.info(f" MODE: --only {args.only}")
if args.test:
logger.info(f" TEST MODE: {args.test} words")
if args.refresh_examples:
logger.info(" REFRESH EXAMPLES: Ben Yehuda index will be rebuilt")
logger.info("=" * 60)
def _load_words_for_only() -> dict:
if not WORDS_JSON.exists():
logger.error(f"words.json not found at {WORDS_JSON}. Run the data pipeline first.")
sys.exit(1)
with open(WORDS_JSON, encoding="utf-8") as f:
return json.load(f)
if args.only == "conjugations":
step_fonts(args)
conjugations = step_conjugations(args)
if conjugations:
import apkg_builder
words = _load_words_for_only()
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "hebrew_dict.csv"
for audio, path in [(False, apkg_builder.CONJ_APKG), (True, apkg_builder.CONJ_APKG_AUDIO)]:
deck, media = apkg_builder.build_conj_deck(words, include_audio=audio)
deck, media = apkg_builder.build_conj_deck(
conjugations,
include_audio=audio,
dict_csv=dict_csv,
)
apkg_builder.write_conj_apkg(deck, media, out_path=path)
print_summary(args, {}, {})
print_summary(args, {}, {}, conjugations or {})
return
if args.only == "confusables":
step_fonts(args)
import apkg_builder
words = _load_words_for_only()
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
for audio, path in [(False, apkg_builder.CONF_APKG), (True, apkg_builder.CONF_APKG_AUDIO)]:
deck, media = apkg_builder.build_confusables_deck(words, include_audio=audio)
deck, media = apkg_builder.build_confusables_deck(dict_csv, include_audio=audio)
apkg_builder.write_conf_apkg(deck, media, out_path=path)
print_summary(args, {}, {})
print_summary(args, {}, {}, {})
return
if args.only == "plurals":
step_fonts(args)
import apkg_builder
words = _load_words_for_only()
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "hebrew_dict.csv"
for audio, path in [(False, apkg_builder.PLURAL_APKG), (True, apkg_builder.PLURAL_APKG_AUDIO)]:
deck, media = apkg_builder.build_plural_deck(words, include_audio=audio)
deck, media = apkg_builder.build_plural_deck(dict_csv=dict_csv, include_audio=audio)
apkg_builder.write_plural_apkg(deck, media, out_path=path)
print_summary(args, {}, {})
print_summary(args, {}, {}, {})
return
if args.only == "complete":
step_fonts(args)
freq_cache = step_frequency() if not args.skip_scrape else {}
examples_cache = step_examples(args, freq_cache) if not args.skip_examples else {}
image_cache = step_images(args) if not args.skip_images else {}
conjugations = step_conjugations(args)
import apkg_builder
words = _load_words_for_only()
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "hebrew_dict.csv"
emoji_lookup = apkg_builder._load_emoji_lookup()
for audio, path in [(False, apkg_builder.COMPLETE_APKG), (True, apkg_builder.COMPLETE_APKG_AUDIO)]:
decks, media = apkg_builder.build_complete_deck(
words,
include_audio=audio,
dict_csv,
conjugations=conjugations or {},
examples_cache=examples_cache,
freq_cache=freq_cache,
image_cache=image_cache,
emoji_lookup=emoji_lookup,
include_audio=audio,
)
apkg_builder.write_complete_apkg(decks, media, out_path=path)
print_summary(args, {}, {})
print_summary(args, examples_cache, freq_cache, conjugations or {})
return
# Full pipeline
step_list_scrape(args) # 1 — scrape list pages → words.json (captures slugs)
step_detail_scrape(args) # 2 — scrape detail pages using slugs → words.json
freq_cache = step_frequency() # 3 — word frequency data
example_stats = step_examples(args) # 4 — EPUB example sentences
step_audio_download(args) # 5 — download audio mp3s
step_fonts(args) # 6 — download Heebo fonts
step_images(args) # 7 — fetch noun images
step_build_all(args) # 8 — build all .apkg variants
if args.only == "vocab":
args.skip_conjugations = True
print_summary(args, example_stats, freq_cache)
step_scrape(args)
freq_cache = step_frequency()
examples_cache = step_examples(args, freq_cache)
step_audio(args)
step_fonts(args)
image_cache = step_images(args)
conjugations = step_conjugations(args)
step_build_all(args, examples_cache, freq_cache, conjugations, image_cache)
print_summary(args, examples_cache, freq_cache, conjugations or {})
if __name__ == "__main__":

View file

@ -1,392 +0,0 @@
#!/usr/bin/env python3
"""Assign frequency ranks from the cleaned corpus to words.json entries.
Two-tier assignment with PoS priority:
Tier 1: Match headword ktiv_male directly against corpus
Tier 2: Match conjugated/inflected forms (only if no other entry already
claimed that corpus word via tier 1)
PoS priority (based on standalone-word likelihood in Hebrew text):
כינוייוף (Pronoun) > מילות_חיבור (Conjunction) > שם_תואר (Adjective) >
מילית (Particle) > שם_עצם (Noun) > תוארי_הפועל (Adverb) >
מילות_יחס (Preposition) > פעלים (Verb)
Usage:
python3 scripts/assign_frequency.py # assign and save
python3 scripts/assign_frequency.py --dry-run # preview only
python3 scripts/assign_frequency.py --stats # show statistics only
"""
from __future__ import annotations
import argparse
import json
import logging
from collections import defaultdict
from pathlib import Path
logger = logging.getLogger(__name__)
PROJECT_ROOT = Path(__file__).parent.parent
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
CLEAN_CACHE = PROJECT_ROOT / "data" / "frequency_clean.json"
RAW_CACHE = PROJECT_ROOT / "data" / "frequency_cache.json"
# Function word PoS — these dominate content words in homograph groups
FUNCTION_POS = frozenset({"כינוייוף", "מילות_חיבור", "מילית", "מילות_יחס", "תוארי_הפועל"})
# Content PoS that loses frequency when a function word dominates
# Adjectives also lose (e.g. כן "honest" vs כן "yes") — they're rare collisions
CONTENT_POS = frozenset({"שם_עצם", "שם_תואר", "פעלים"})
# Manual overrides: for these ktiv_male forms, ALL homographs share frequency.
# These are cases where the content word is genuinely common enough to deserve it,
# e.g. עם "people" (NN) alongside עם "with" (PREP), corpus rank 15.
SHARE_ALL_WORDS = frozenset(
{
"עם", # "people" (NN) + "with" (PREP)
"שם", # "name" (NN) + "there" (ADV)
"אל", # "god" (NN) + "to" (PREP) + "don't" (PART)
"עד", # "witness"/"eternity" (NN) + "until" (PREP)
"פה", # "mouth" (NN) + "here" (ADV)
"לאחר", # "to be late" (VB) + "after" (PREP)
"יופי", # "beauty" (NN) + "great!" (ADV)
"המון", # "crowd" (NN) + "lots of" (ADV)
"חבל", # "rope" (NN) + "it's a pity" (ADV)
"ראשית", # "beginning" (NN) + "firstly" (ADV)
"עקב", # "heel"/"footprint" (NN) + "due to" (CONJ)
"אולם", # "hall" (NN) + "however" (ADV)
}
)
def _get_pos_tag(entry: dict) -> str:
"""Extract primary PoS tag from entry's tags field."""
tags = (entry.get("tags") or "").split()
for t in tags:
if not t.startswith("שורש"):
return t
return "unknown"
def _build_form_index(words: dict) -> dict[str, list[tuple[str, str]]]:
"""Build reverse index: ktiv_male_form -> [(unique_key, match_type), ...]"""
index: dict[str, list[tuple[str, str]]] = defaultdict(list)
for key, entry in words.items():
w = entry.get("word") or {}
if km := w.get("ktiv_male"):
index[km].append((key, "headword"))
# Verb conjugations: indexed for new-assignment-only matching (no upgrades).
# Conjugated forms collide with unrelated headwords, so tier 2 only uses
# these for entries that have NO existing frequency.
conj = entry.get("conjugation") or {}
for form in conj.get("active_forms") or []:
if isinstance(form, dict):
form_data = form.get("form") or {}
if km2 := form_data.get("ktiv_male"):
km2 = km2.rstrip("!\u200f ")
index[km2].append((key, "conjugation"))
for hp in conj.get("hufal_pual_forms") or []:
if isinstance(hp, dict):
hp_data = hp.get("form") or {}
if km3 := hp_data.get("ktiv_male"):
km3 = km3.rstrip("!\u200f ")
index[km3].append((key, "conjugation"))
for field in ("noun_inflection", "preposition_inflection", "adjective_inflection"):
for inf_data in (entry.get(field) or {}).values():
if isinstance(inf_data, dict) and (km4 := inf_data.get("ktiv_male")):
index[km4].append((key, "inflection"))
return dict(index)
def _should_get_frequency(
entry: dict,
all_headword_entries: list[tuple[str, str]],
corpus_word: str,
words: dict,
) -> bool:
"""Decide if an entry should get frequency in a homograph group.
Rules:
- If only one entry matches, it always gets frequency.
- If SHARE_ALL_WORDS includes this corpus word, all entries share.
- If the group has function words AND content words, content words lose.
- Otherwise all entries share.
"""
if len(all_headword_entries) <= 1:
return True
if corpus_word in SHARE_ALL_WORDS:
return True
pos = _get_pos_tag(entry)
has_function = any(_get_pos_tag(words[k]) in FUNCTION_POS for k, _ in all_headword_entries)
return not (has_function and pos in CONTENT_POS)
def assign_frequencies(
    words: dict,
    freq_corpus: dict[str, int],
    raw_corpus: dict[str, int] | None = None,
    upgrade: bool = False,
) -> dict[str, dict]:
    """Assign frequency ranks to words.json entries. Returns assignment details.

    freq_corpus controls which words are valid (cleaned corpus).
    raw_corpus provides original rank numbers (with gaps). If not provided,
    uses freq_corpus ranks (re-ranked, no gaps).
    upgrade: if True, tier 2 can replace an entry's existing rank when an
    INFLECTED form has a better (lower) rank. Conjugated forms never upgrade;
    they can only create a new assignment, and only for ranks above 5000.

    Returns a dict: unique_key -> {"rank", "source", "corpus_word"}, where
    "source" is "headword", "conjugation", "inflection" or "upgrade:inflection".
    ``words`` itself is not modified here.
    """
    rank_source = raw_corpus if raw_corpus is not None else freq_corpus
    form_index = _build_form_index(words)
    # Track which corpus words have been claimed by tier 1
    tier1_claimed: set[str] = set()
    # Results tracking
    assignments: dict[str, dict] = {}  # unique_key -> {rank, source, corpus_word}
    # --- Tier 1: headword matches ---
    # For each corpus word, find all headword matches and assign to eligible entries.
    # Homograph groups: function words get frequency, content words don't (unless overridden).
    # Corpus words are visited best rank first, so an entry keeps the first rank it receives.
    corpus_by_rank = sorted(freq_corpus.items(), key=lambda x: x[1])
    for corpus_word, _clean_rank in corpus_by_rank:
        matches = form_index.get(corpus_word, [])
        headword_matches = [(k, t) for k, t in matches if t == "headword"]
        if not headword_matches:
            continue
        # Prefer the raw (gapped) rank; fall back to the cleaned rank if the word is absent.
        original_rank = rank_source.get(corpus_word, _clean_rank)
        assigned_any = False
        for entry_key, _ in headword_matches:
            if entry_key in assignments:
                continue
            if _should_get_frequency(words[entry_key], headword_matches, corpus_word, words):
                assignments[entry_key] = {
                    "rank": original_rank,
                    "source": "headword",
                    "corpus_word": corpus_word,
                }
                assigned_any = True
        # Only mark the corpus word as claimed if at least one entry actually took it.
        if assigned_any:
            tier1_claimed.add(corpus_word)
    tier1_count = len(assignments)
    logger.info("Tier 1 (headword): %d entries assigned", tier1_count)
    # --- Tier 2: conjugation/inflection matches ---
    # Only use corpus words NOT claimed in tier 1.
    # A corpus word that matches an inflection is "owned" by that headword —
    # it cannot also upgrade an unrelated verb via conjugation.
    # Upgrades (when enabled) only apply to inflection matches.
    for corpus_word, _clean_rank in corpus_by_rank:
        if corpus_word in tier1_claimed:
            continue
        matches = form_index.get(corpus_word, [])
        secondary_matches = [(k, t) for k, t in matches if t in ("conjugation", "inflection")]
        if not secondary_matches:
            continue
        original_rank = rank_source.get(corpus_word, _clean_rank)
        # Split by type: inflections take priority over conjugations
        inflection_matches = [(k, t) for k, t in secondary_matches if t == "inflection"]
        conjugation_matches = [(k, t) for k, t in secondary_matches if t == "conjugation"]
        # If any inflection matches exist, this corpus word belongs to inflection.
        # Don't let conjugations claim it.
        active_matches = inflection_matches if inflection_matches else conjugation_matches
        # Each `break` below means at most one entry is assigned or upgraded per corpus word.
        for entry_key, match_type in active_matches:
            existing = assignments.get(entry_key)
            if existing is None:
                # New assignment — conjugations only allowed for rank > 5000
                # (too many false positives in the important tiers)
                if match_type == "conjugation" and original_rank <= 5000:
                    continue
                assignments[entry_key] = {
                    "rank": original_rank,
                    "source": match_type,
                    "corpus_word": corpus_word,
                }
                break
            if upgrade and match_type == "inflection" and original_rank < existing["rank"]:
                # Upgrade — only allowed for inflections (conjugations collide too much)
                assignments[entry_key] = {
                    "rank": original_rank,
                    "source": f"upgrade:{match_type}",
                    "corpus_word": corpus_word,
                }
                break
    # Upgrades overwrite existing keys, so they do not change this count.
    tier2_count = len(assignments) - tier1_count
    logger.info("Tier 2 (conjugation/inflection): %d entries assigned", tier2_count)
    return assignments
def print_stats(words: dict, assignments: dict, freq_corpus: dict) -> None:
    """Print detailed statistics about frequency assignment to stdout.

    Args:
        words: the words.json mapping (unique_key -> entry). Each entry's
            current ``frequency`` field is read to compute "previously had",
            "newly assigned" and "would lose" figures; nothing is modified.
        assignments: result of ``assign_frequencies``
            (unique_key -> {"rank", "source", "corpus_word"}).
        freq_corpus: the corpus used for assignment (only its size is shown).

    Entries whose ``word`` field or ``ktiv_male`` is null are printed with an
    empty form instead of raising.
    """
    from collections import Counter

    total = len(words)
    assigned = len(assignments)
    previously_had = sum(1 for e in words.values() if e.get("frequency") is not None)
    print(f"\n{'=' * 60}")
    print("Frequency Assignment Statistics")
    print(f"{'=' * 60}")
    print(f"Words.json entries: {total}")
    print(f"Clean corpus size: {len(freq_corpus)}")
    print(f"Previously had freq: {previously_had}")
    print(f"Now assigned: {assigned}")
    # Net change in the number of ranked entries (gains minus losses).
    print(f"Newly gained: {assigned - previously_had}")
    print(f"Still unlisted: {total - assigned}")

    # By tier
    tier1 = sum(1 for a in assignments.values() if a["source"] == "headword")
    tier2_conj = sum(1 for a in assignments.values() if a["source"] == "conjugation")
    tier2_inf = sum(1 for a in assignments.values() if a["source"] == "inflection")
    print("\nBy assignment tier:")
    print(f" Tier 1 (headword): {tier1}")
    print(f" Tier 2 (conjugation): {tier2_conj}")
    print(f" Tier 2 (inflection): {tier2_inf}")

    # By PoS
    print("\nBy PoS:")
    pos_assigned = Counter()
    pos_total = Counter()
    for k, v in words.items():
        pos = _get_pos_tag(v)
        pos_total[pos] += 1
        if k in assignments:
            pos_assigned[pos] += 1
    pos_order = [
        "כינוייוף",
        "מילות_חיבור",
        "שם_תואר",
        "מילית",
        "שם_עצם",
        "תוארי_הפועל",
        "מילות_יחס",
        "פעלים",
        "unknown",
    ]
    # Tags missing from pos_order sort last (key 99).
    for pos in sorted(pos_total, key=lambda p: pos_order.index(p) if p in pos_order else 99):
        a = pos_assigned[pos]
        t = pos_total[pos]
        pct = a / t * 100 if t else 0
        print(f" {pos:20s}: {a:5d}/{t:5d} ({pct:.0f}%)")

    # By frequency tier (using apkg_builder tiers)
    print("\nBy frequency tier:")
    tiers = {
        "Core (1-500)": (1, 500),
        "Essential (501-1500)": (501, 1500),
        "Intermediate (1501-3000)": (1501, 3000),
        "Upper-intermediate (3001-5000)": (3001, 5000),
        "Advanced (5001-10000)": (5001, 10000),
        "Rare (10001+)": (10001, 999999),
    }
    for label, (lo, hi) in tiers.items():
        count = sum(1 for a in assignments.values() if lo <= a["rank"] <= hi)
        print(f" {label:35s}: {count}")

    # Top 20 newly assigned (entries that didn't have frequency before)
    newly = []
    for k, a in assignments.items():
        if words[k].get("frequency") is None:
            # "word" / "ktiv_male" may be null in the data; fall back to empty values.
            w = words[k].get("word") or {}
            newly.append((a["rank"], k, w.get("ktiv_male") or "", a["source"], a["corpus_word"]))
    newly.sort()
    if newly:
        print("\nTop 20 newly assigned entries:")
        for rank, _key, ktiv, source, corpus_word in newly[:20]:
            print(f" rank {rank:5d}: {ktiv:15s} via {source:12s} (corpus: {corpus_word})")

    # Entries that LOST frequency (had it before, not assigned now)
    lost = []
    for k, v in words.items():
        old_freq = v.get("frequency")
        if old_freq is not None and k not in assignments:
            w = v.get("word") or {}
            lost.append((old_freq, k, w.get("ktiv_male") or ""))
    lost.sort()
    if lost:
        print(f"\nEntries that would LOSE frequency ({len(lost)} total):")
        for rank, _key, ktiv in lost[:20]:
            print(f" was rank {rank:5d}: {ktiv}")
def main() -> None:
    """CLI entry point: assign corpus frequency ranks to words.json.

    Loads the cleaned corpus (falling back to the raw cache when no cleaned
    file exists), runs ``assign_frequencies``, prints statistics, and — unless
    ``--stats`` or ``--dry-run`` is given — rewrites words.json so that every
    entry's ``frequency`` is either its assigned rank or ``None``.
    """
    parser = argparse.ArgumentParser(description="Assign frequency to words.json")
    parser.add_argument("--dry-run", action="store_true", help="Preview without saving")
    parser.add_argument("--stats", action="store_true", help="Show statistics only")
    parser.add_argument(
        "--upgrade",
        action="store_true",
        # Only inflection matches can upgrade; conjugation matches never do.
        help="Allow tier 2 to upgrade an entry's rank from a better-ranked inflected form",
    )
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    # Load data
    freq_path = CLEAN_CACHE if CLEAN_CACHE.exists() else RAW_CACHE
    logger.info("Loading frequency corpus: %s", freq_path)
    with open(freq_path, encoding="utf-8") as f:
        freq_corpus: dict[str, int] = json.load(f)

    # Load raw corpus for original rank numbers (with gaps)
    raw_corpus: dict[str, int] | None = None
    if RAW_CACHE.exists() and freq_path != RAW_CACHE:
        with open(RAW_CACHE, encoding="utf-8") as f:
            raw_corpus = json.load(f)
        logger.info("Using original ranks from %s", RAW_CACHE)

    with open(WORDS_JSON, encoding="utf-8") as f:
        words: dict = json.load(f)
    logger.info("Corpus: %d entries, Words.json: %d entries", len(freq_corpus), len(words))

    # Run assignment
    assignments = assign_frequencies(words, freq_corpus, raw_corpus, upgrade=args.upgrade)

    # Stats
    print_stats(words, assignments, freq_corpus)
    if args.stats or args.dry_run:
        if args.dry_run:
            logger.info("Dry run — no changes saved")
        return

    # Apply to words.json: assigned entries get their rank, all others are reset to None.
    changed = 0
    for key, entry in words.items():
        if key in assignments:
            new_rank = assignments[key]["rank"]
            if entry.get("frequency") != new_rank:
                entry["frequency"] = new_rank
                changed += 1
        else:
            if entry.get("frequency") is not None:
                entry["frequency"] = None
                changed += 1
    with open(WORDS_JSON, "w", encoding="utf-8") as f:
        json.dump(words, f, ensure_ascii=False, indent=2)
    logger.info("Updated %d entries in words.json", changed)
if __name__ == "__main__":
main()

View file

@ -1,269 +0,0 @@
#!/usr/bin/env python3
"""Assign pseudo-frequency to confusable groups using English word frequency.
Problem: Confusable entries share the same ktiv_male and thus the same Hebrew
frequency rank. This script uses English frequency to differentiate them so
Anki sorts more-common meanings first.
Algorithm:
1. For each confusable group where all entries share the same Hebrew frequency,
extract the first meaningful English keyword from each entry's meaning field.
2. Look up English frequency rank for each keyword.
3. Assign pseudo_frequency: the most frequent English meaning keeps the original
Hebrew rank; less frequent meanings get progressively higher (worse) ranks
by adding an offset (100 * position in group).
Usage:
python3 scripts/assign_pseudo_frequency.py # assign and save
python3 scripts/assign_pseudo_frequency.py --dry-run # preview only
"""
from __future__ import annotations
import argparse
import json
import logging
import re
from collections import defaultdict
from pathlib import Path
logger = logging.getLogger(__name__)
PROJECT_ROOT = Path(__file__).parent.parent
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
EN_FREQ_PATH = PROJECT_ROOT / "data" / "en_50k.txt"
# Words too common/vague to use as frequency signal
_EN_STOP = frozenset(
{
"to",
"be",
"a",
"an",
"the",
"of",
"in",
"on",
"at",
"for",
"and",
"with",
"by",
"or",
"but",
"not",
"as",
"its",
"it",
"is",
"was",
"are",
"from",
"that",
"this",
"have",
"has",
"had",
"do",
"does",
"did",
"will",
"would",
"can",
"could",
"may",
"might",
"shall",
"should",
"must",
"no",
"yes",
"very",
"too",
"also",
"just",
"only",
"so",
"up",
"out",
"into",
"over",
"after",
"before",
"about",
"more",
"than",
"other",
"some",
"any",
"all",
"each",
"every",
"both",
"few",
"many",
"much",
"most",
"such",
"own",
"same",
"well",
"still",
"even",
"how",
"what",
"when",
"where",
"which",
"who",
"whom",
"whose",
"why",
"because",
"if",
"then",
"else",
"while",
"until",
"though",
"whether",
}
)
def _load_en_freq() -> dict[str, int]:
    """Load English frequency data: word -> rank (1 = most common).

    Reads EN_FREQ_PATH line by line. The first whitespace-separated token of
    each non-blank line is taken as the word and lower-cased; any further
    tokens on the line are ignored. Earlier lines receive lower (better) ranks.
    """
    freq: dict[str, int] = {}
    rank = 1
    with open(EN_FREQ_PATH, encoding="utf-8") as f:
        for line in f:
            parts = line.strip().split()
            if parts:
                word = parts[0].lower()
                # A word that repeats after lower-casing keeps its first (best) rank.
                if word not in freq:
                    freq[word] = rank
                    # NOTE(review): rank advances only for newly seen words, giving
                    # dense ranks — confirm this nesting matches the intended behaviour.
                    rank += 1
    return freq
def _extract_keywords(meaning: str) -> list[str]:
"""Extract meaningful English keywords from a meaning string.
Returns list of lowercase words, filtered for stop words and short words.
"""
# Strip parenthesized content, punctuation
cleaned = re.sub(r"\([^)]*\)", " ", meaning)
cleaned = re.sub(r"[^\w\s]", " ", cleaned)
return [w.lower() for w in cleaned.split() if len(w) > 2 and w.lower() not in _EN_STOP]
def assign_pseudo_frequencies(
    words: dict,
    en_freq: dict[str, int],
    dry_run: bool = False,
) -> int:
    """Assign pseudo_frequency to confusable groups. Returns count of changes.

    Entries are grouped by ``confusables_guid``. A group is processed only when
    all of its entries share the same ``frequency`` (including all ``None``)
    and their English meanings yield at least two distinct English ranks.

    For each entry the English rank is that of the first of its first five
    keywords found in ``en_freq`` (999_999 when none is found). Entries are
    then ordered by (English rank, key) and receive
    ``base_freq + 100 * position``, or ``50000 + en_rank`` when the group has
    no Hebrew frequency.

    Args:
        words: words.json mapping; ``pseudo_frequency`` is written into its
            entries unless ``dry_run`` is set.
        en_freq: English word -> rank (1 = most common).
        dry_run: when True nothing is written; each would-be assignment is
            logged instead. The returned count is the same either way.
    """
    # Group by confusables_guid
    groups: dict[str, list[str]] = defaultdict(list)
    for key, entry in words.items():
        cg = entry.get("confusables_guid")
        if cg:
            groups[cg].append(key)

    changes = 0
    assigned_groups = 0
    skipped_diff = 0
    skipped_no_en = 0
    for keys in groups.values():
        freqs = [words[k].get("frequency") for k in keys]
        # Skip groups that are already differentiated
        if len(set(freqs)) > 1:
            skipped_diff += 1
            continue
        base_freq = freqs[0]  # All same (or all None)

        # Look up English frequency for each entry
        en_ranks: list[tuple[int, str]] = []  # (en_rank, key)
        for key in keys:
            # "meaning" may be present but null; treat that like a missing meaning.
            keywords = _extract_keywords(words[key].get("meaning") or "")
            en_rank = 999_999
            for kw in keywords[:5]:
                r = en_freq.get(kw)
                if r is not None:
                    en_rank = r
                    break
            en_ranks.append((en_rank, key))

        # Sort by English frequency (lower rank = more common)
        en_ranks.sort()
        # Check if all entries have the same English rank (no signal)
        if len({r for r, _ in en_ranks}) <= 1:
            skipped_no_en += 1
            continue
        assigned_groups += 1

        # Assign pseudo_frequency: most common gets base, others get offset
        for position, (en_rank, key) in enumerate(en_ranks):
            pseudo = base_freq + position * 100 if base_freq is not None else 50000 + en_rank
            if not dry_run:
                words[key]["pseudo_frequency"] = pseudo
            changes += 1
            if dry_run:
                meaning = (words[key].get("meaning") or "")[:40]
                logger.info(
                    " [en:%5d] pseudo=%6d %s",
                    en_rank,
                    pseudo,
                    meaning,
                )
    logger.info(
        "Pseudo-frequency: %d groups assigned, %d already differentiated, %d no English signal",
        assigned_groups,
        skipped_diff,
        skipped_no_en,
    )
    return changes
def main() -> None:
    """CLI entry point: compute pseudo-frequencies and save them to words.json.

    With ``--dry-run`` the assignments are only logged and words.json is left
    untouched.
    """
    cli = argparse.ArgumentParser(description="Assign pseudo-frequency to confusables")
    cli.add_argument("--dry-run", action="store_true", help="Preview without saving")
    options = cli.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
    )

    logger.info("Loading English frequency data: %s", EN_FREQ_PATH)
    english_ranks = _load_en_freq()
    logger.info("English frequency: %d entries", len(english_ranks))

    with open(WORDS_JSON, encoding="utf-8") as src:
        words: dict = json.load(src)

    change_count = assign_pseudo_frequencies(words, english_ranks, dry_run=options.dry_run)

    if not options.dry_run:
        with open(WORDS_JSON, "w", encoding="utf-8") as dst:
            json.dump(words, dst, ensure_ascii=False, indent=2)
        logger.info("Saved %d pseudo-frequency assignments to words.json", change_count)
    else:
        logger.info("Dry run — %d changes would be made", change_count)
if __name__ == "__main__":
main()

View file

@ -1,212 +0,0 @@
"""Check that every GUID in the last-release complete .apkg exists in words.json.
Extracts GUIDs from the Anki SQLite database inside the .apkg (zip) file,
then compares against all GUID fields stored in data/words.json.
Usage:
python3 scripts/check_guid_coverage.py
python3 scripts/check_guid_coverage.py --apkg output/hebrew_complete.apkg
python3 scripts/check_guid_coverage.py --verbose
"""
from __future__ import annotations
import argparse
import json
import os
import sqlite3
import sys
import tempfile
import zipfile
from pathlib import Path
from typing import Any
PROJECT_ROOT = Path(__file__).parent.parent
DEFAULT_APKG = PROJECT_ROOT / "output" / "hebrew_complete.apkg"
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
# Known model IDs (from apkg_builder.py)
MODEL_IDS = {
1701222017968: "vocab",
1234567893: "conjugation",
1234567897: "plurals",
1234567895: "confusables",
}
def extract_apkg_guids(apkg_path: Path) -> dict[int, set[str]]:
    """Extract note GUIDs from an .apkg file, grouped by model ID.

    Only the ``collection.anki2`` member is unpacked (into a temporary
    directory) — the media files in the archive are never written to disk.

    Args:
        apkg_path: path to the .apkg (zip) file.

    Returns:
        Mapping of model ID (``notes.mid``) to the set of ``notes.guid`` values.

    Raises:
        KeyError: the archive has no ``collection.anki2`` member.
        sqlite3.Error: the database cannot be read or has no ``notes`` table.
    """
    by_model: dict[int, set[str]] = {}
    with zipfile.ZipFile(apkg_path) as z, tempfile.TemporaryDirectory() as td:
        db_path = z.extract("collection.anki2", td)
        conn = sqlite3.connect(db_path)
        try:
            for guid, mid in conn.execute("SELECT guid, mid FROM notes"):
                by_model.setdefault(mid, set()).add(guid)
        finally:
            # Always release the file so the temporary directory can be removed.
            conn.close()
    return by_model
def collect_words_json_guids(data: dict[str, Any]) -> dict[str, set[str]]:
    """Collect all GUIDs from words.json grouped by deck type.

    GUID locations read from each entry:
      - ``vocab``:       ``vocab_legacy_guid``
      - ``cloze``:       ``examples.cloze.cloze_guid``
      - ``plurals``:     ``noun_inflection.plurals_guid``
      - ``confusables``: ``confusables_guid``
      - ``conjugation``: ``guid`` and every item of ``guid_candidates`` on each
        form in ``conjugation.active_forms`` / ``conjugation.hufal_pual_forms``

    Missing, null or empty fields are skipped, as are form-list items that are
    not dicts.

    Returns:
        Dict with exactly the keys ``vocab``, ``cloze``, ``conjugation``,
        ``plurals`` and ``confusables``.
    """
    vocab_guids: set[str] = set()
    cloze_guids: set[str] = set()
    conj_guids: set[str] = set()
    plurals_guids: set[str] = set()
    confusables_guids: set[str] = set()

    for entry in data.values():
        # Vocab legacy GUID
        g = entry.get("vocab_legacy_guid")
        if g:
            vocab_guids.add(g)

        # Cloze GUID (stored in examples.cloze.cloze_guid)
        cloze = (entry.get("examples") or {}).get("cloze") or {}
        g = cloze.get("cloze_guid")
        if g:
            cloze_guids.add(g)

        # Plurals GUID (stored inside noun_inflection)
        g = (entry.get("noun_inflection") or {}).get("plurals_guid")
        if g:
            plurals_guids.add(g)

        # Confusables GUID (top-level)
        g = entry.get("confusables_guid")
        if g:
            confusables_guids.add(g)

        # Conjugation form GUIDs
        conj = entry.get("conjugation") or {}
        for form_list_key in ("active_forms", "hufal_pual_forms"):
            for form in conj.get(form_list_key) or []:
                if not isinstance(form, dict):
                    continue  # tolerate null/placeholder items in the list
                g = form.get("guid")
                if g:
                    conj_guids.add(g)
                conj_guids.update(form.get("guid_candidates") or ())

    return {
        "vocab": vocab_guids,
        "cloze": cloze_guids,
        "conjugation": conj_guids,
        "plurals": plurals_guids,
        "confusables": confusables_guids,
    }
def main() -> None:
    """CLI entry point: compare GUIDs in an .apkg against words.json.

    For every known model ID the apkg GUIDs are compared with the matching
    GUID set collected from words.json. A deck FAILS when the apkg contains a
    GUID that words.json does not; GUIDs present only in words.json are
    reported as "extra" but do not fail the check.

    Exit codes: 0 = all known-model apkg GUIDs found, 1 = at least one missing,
    2 = an input file does not exist.
    """
    parser = argparse.ArgumentParser(description="Check GUID coverage between .apkg and words.json")
    parser.add_argument(
        "--apkg",
        type=Path,
        default=DEFAULT_APKG,
        help=f"Path to .apkg file (default: {DEFAULT_APKG})",
    )
    parser.add_argument("--verbose", "-v", action="store_true")
    args = parser.parse_args()

    if not args.apkg.exists():
        print(f"ERROR: apkg not found: {args.apkg}")
        sys.exit(2)
    if not WORDS_JSON.exists():
        print(f"ERROR: words.json not found: {WORDS_JSON}")
        sys.exit(2)

    print(f"Checking: {args.apkg}")
    print(f"Against: {WORDS_JSON}")
    print()

    apkg_by_model = extract_apkg_guids(args.apkg)
    # Use a context manager so the file handle is closed deterministically.
    with WORDS_JSON.open(encoding="utf-8") as f:
        data = json.load(f)
    wj = collect_words_json_guids(data)

    total_apkg = sum(len(s) for s in apkg_by_model.values())
    total_wj = sum(len(s) for s in wj.values())
    print(f"Total GUIDs in apkg: {total_apkg}")
    print(f"Total GUIDs in words.json: {total_wj}")
    print()

    # Map each apkg deck to the words.json GUID set it is checked against.
    # Vocab notes cover both vocab cards (ord 0,1) and cloze (ord 2)
    # They share the note GUID — vocab_legacy_guid IS the note guid
    wj_sets_by_deck = {
        "vocab": wj["vocab"] | wj["cloze"],
        "conjugation": wj["conjugation"],
        "plurals": wj["plurals"],
        "confusables": wj["confusables"],
    }

    all_missing = 0
    all_extra = 0
    for mid, deck_name in MODEL_IDS.items():
        apkg_set = apkg_by_model.get(mid, set())
        wj_set = wj_sets_by_deck.get(deck_name, set())
        missing = apkg_set - wj_set
        extra = wj_set - apkg_set
        matched = apkg_set & wj_set
        all_missing += len(missing)
        all_extra += len(extra)

        status = "PASS" if not missing else "FAIL"
        print(f" {status} {deck_name} (mid={mid})")
        print(
            f" apkg={len(apkg_set)}, words.json={len(wj_set)}, "
            f"matched={len(matched)}, missing={len(missing)}, extra={len(extra)}"
        )
        if missing and args.verbose:
            print(" Missing GUIDs (in apkg, not in words.json):")
            for g in sorted(missing)[:20]:
                print(f" {g!r}")
            if len(missing) > 20:
                print(f" ... ({len(missing) - 20} more)")
        if extra and args.verbose:
            print(" Extra GUIDs (in words.json, not in apkg):")
            for g in sorted(extra)[:10]:
                print(f" {g!r}")
            if len(extra) > 10:
                print(f" ... ({len(extra) - 10} more)")
        print()

    # Check for unknown model IDs in apkg (these notes are NOT part of the pass/fail check)
    unknown_mids = set(apkg_by_model.keys()) - set(MODEL_IDS.keys())
    if unknown_mids:
        print(f" WARNING: Unknown model IDs in apkg: {unknown_mids}")
        for mid in sorted(unknown_mids):  # stable output order
            print(f" mid={mid}: {len(apkg_by_model[mid])} notes")

    # Separator rule before the verdict (multiplying an empty string printed nothing).
    print("-" * 60)
    if all_missing:
        print(f" FAILED: {all_missing} apkg GUIDs not found in words.json")
        print(" (These notes would lose study progress on reimport)")
        sys.exit(1)
    else:
        print(f" All {total_apkg} apkg GUIDs accounted for in words.json.")
        sys.exit(0)
if __name__ == "__main__":
main()

View file

@ -1,400 +0,0 @@
#!/usr/bin/env python3
"""Clean the Hebrew frequency corpus by removing prefix+word combinations.
Two modes:
--mode yap (default) Use YAP morphological analyzer for accurate prefix detection.
Requires YAP API running at localhost:8000.
--mode heuristic Use rule-based prefix stripping (no external dependencies).
Heuristic mode preserves words that exist as known dictionary forms in words.json;
YAP mode relies on the analyzer alone and does not consult words.json.
Usage:
python3 scripts/clean_frequency_corpus.py # YAP mode
python3 scripts/clean_frequency_corpus.py --mode heuristic # heuristic fallback
python3 scripts/clean_frequency_corpus.py --dry-run # preview only
python3 scripts/clean_frequency_corpus.py --resume # resume YAP from checkpoint
python3 scripts/clean_frequency_corpus.py --limit 1000 # process first N entries
Input: data/frequency_cache.json (raw he_50k.txt, 49999 entries)
Output: data/frequency_clean.json (filtered, prefix combos removed)
data/frequency_discarded.json (discarded entries with reason)
"""
from __future__ import annotations
import argparse
import json
import logging
import os
import sys
import time
from pathlib import Path
import requests
logger = logging.getLogger(__name__)
PROJECT_ROOT = Path(__file__).parent.parent
RAW_CACHE = PROJECT_ROOT / "data" / "frequency_cache.json"
CLEAN_CACHE = PROJECT_ROOT / "data" / "frequency_clean.json"
DISCARDED = PROJECT_ROOT / "data" / "frequency_discarded.json"
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
CHECKPOINT = PROJECT_ROOT / "data" / "_yap_checkpoint.json"
YAP_URL = os.environ.get("YAP_URL", "http://localhost:8000/yap/heb/joint")
YAP_TIMEOUT = 10
BATCH_SAVE_INTERVAL = 500
# --- YAP mode constants ---
# POS tags that indicate a prefix
PREFIX_POS = frozenset({"PREPOSITION", "CONJ", "DEF", "REL"})
# POS tags for the host word that make the combo a false positive
HOST_POS = frozenset({"NN", "NNP", "NNT", "PRP", "CD", "DT", "EX"})
# --- Heuristic mode constants ---
# Hebrew prefix combinations, longest first for greedy matching.
PREFIXES = [
# 4-char
"וכשמ",
"וכשב",
"וכשל",
"וכשה",
# 3-char
"וכש",
"ומה",
"ובה",
"וכה",
"ולה",
"ומש",
"ובש",
"וכב",
"ולב",
"ומב",
"וכל",
"ולכ",
"שבה",
"שמה",
# 2-char
"כש",
"מה",
"בה",
"כה",
"לה",
"מש",
"בש",
"וב",
"וה",
"וכ",
"ול",
"ומ",
"וש",
"כב",
"לב",
"מב",
"כל",
"לכ",
"שב",
"שה",
"שכ",
"של",
"שמ",
# 1-char
"ב",
"ה",
"ו",
"כ",
"ל",
"מ",
"ש",
]
MIN_REMAINDER_LEN = 2
def _load_known_forms(words_path: Path) -> set[str]:
    """Load all known ktiv_male forms from words.json.

    Collected forms:
      - each entry's headword (``word.ktiv_male``)
      - verb forms in ``active_forms`` / ``hufal_pual_forms``, looked up both
        directly on the entry and under ``entry["conjugation"]``; each form's
        spelling is read from ``form["ktiv_male"]`` and from
        ``form["form"]["ktiv_male"]``
      - dict values of ``noun_inflection``, ``preposition_inflection`` and
        ``adjective_inflection``

    Verb forms have trailing ``!``, right-to-left marks and spaces removed so
    they compare equal to bare corpus tokens.

    Returns an empty set (after logging a warning) when the file is missing.
    """
    if not words_path.exists():
        logger.warning("words.json not found at %s — no dictionary filter", words_path)
        return set()
    with open(words_path, encoding="utf-8") as f:
        words = json.load(f)

    known: set[str] = set()
    for entry in words.values():
        w = entry.get("word") or {}
        if km := w.get("ktiv_male"):
            known.add(km)

        # Verb forms: accept both the flat layout and the nested
        # conjugation -> form -> ktiv_male layout.
        conj = entry.get("conjugation") or {}
        for list_name in ("active_forms", "hufal_pual_forms"):
            for container in (entry, conj):
                for form in container.get(list_name) or []:
                    if not isinstance(form, dict):
                        continue
                    nested = form.get("form")
                    spellings = [form.get("ktiv_male")]
                    if isinstance(nested, dict):
                        spellings.append(nested.get("ktiv_male"))
                    for spelling in spellings:
                        cleaned = (spelling or "").rstrip("!\u200f ")
                        if cleaned:
                            known.add(cleaned)

        for field in ("noun_inflection", "preposition_inflection", "adjective_inflection"):
            for inf_data in (entry.get(field) or {}).values():
                if isinstance(inf_data, dict) and (km := inf_data.get("ktiv_male")):
                    known.add(km)

    logger.info("Loaded %d known dictionary forms from words.json", len(known))
    return known
# ── YAP mode ──────────────────────────────────────────────────────────────
def query_yap(word: str) -> dict | None:
    """Send a single word to YAP and return the parsed JSON response.

    The word is posted to ``YAP_URL`` as ``{"text": "<word> "}`` (with a
    trailing space) using a ``YAP_TIMEOUT``-second timeout.

    Returns:
        The decoded JSON body, or ``None`` when the request fails, the server
        answers with an HTTP error status, or the body is not valid JSON. The
        failure is logged as a warning; no exception propagates for these cases.
    """
    payload = {"text": f"{word} "}
    try:
        resp = requests.post(YAP_URL, json=payload, timeout=YAP_TIMEOUT)
        resp.raise_for_status()
        return resp.json()
    # Response.json() signals a malformed body with a ValueError subclass which,
    # depending on the installed requests version, is not a RequestException.
    except (requests.RequestException, ValueError) as e:
        logger.warning("YAP request failed for '%s': %s", word, e)
        return None
def is_prefix_combo_yap(yap_response: dict) -> tuple[bool, str]:
    """Report whether any lattice path splits the word into prefix + host.

    The ``ma_lattice`` value is parsed as tab-separated lines; the first six
    columns are read as from, to, form, lemma, cpos and pos. Lines with fewer
    than six columns are ignored.

    Conservative: if ANY prefix arc (cpos or pos in PREFIX_POS) is directly
    followed by a host arc (cpos or pos in HOST_POS) -> discard.

    Returns:
        ``(True, "<prefix>(<cpos>)+<host>(<cpos>)")`` for the first such pair,
        otherwise ``(False, "")`` — including when the lattice is missing or
        has fewer than two usable arcs.
    """
    lattice = yap_response.get("ma_lattice", "")
    if not lattice:
        return False, ""

    column_names = ("from", "to", "form", "lemma", "cpos", "pos")
    arcs = []
    for raw_line in lattice.strip().split("\n"):
        if not raw_line.strip():
            continue
        columns = raw_line.split("\t")
        if len(columns) >= 6:
            # zip stops after the six named columns; extra columns are dropped.
            arcs.append(dict(zip(column_names, columns)))

    if len(arcs) < 2:
        return False, ""

    for prefix_arc in arcs:
        if prefix_arc["cpos"] in PREFIX_POS or prefix_arc["pos"] in PREFIX_POS:
            for host_arc in arcs:
                follows_prefix = host_arc["from"] == prefix_arc["to"]
                if follows_prefix and (host_arc["cpos"] in HOST_POS or host_arc["pos"] in HOST_POS):
                    return (
                        True,
                        f"{prefix_arc['form']}({prefix_arc['cpos']})+{host_arc['form']}({host_arc['cpos']})",
                    )
    return False, ""
# ── Heuristic mode ────────────────────────────────────────────────────────
def find_prefix_decomposition(word: str, freq: dict[str, int]) -> tuple[str, str] | None:
    """Heuristically split ``word`` into a prefix and a better-ranked remainder.

    PREFIXES is scanned in its listed order. The first prefix for which the
    remainder is at least MIN_REMAINDER_LEN characters long, is present in
    ``freq``, and ranks strictly better (lower) than ``word`` itself wins.
    A word missing from ``freq`` is treated as rank 999999.

    Returns:
        ``(prefix, remainder)`` for the first qualifying split, or ``None``
        when the word is too short or no split qualifies.
    """
    if len(word) <= MIN_REMAINDER_LEN:
        return None

    own_rank = freq.get(word, 999999)
    matching_prefixes = (p for p in PREFIXES if word.startswith(p))
    for candidate in matching_prefixes:
        rest = word[len(candidate) :]
        long_enough = len(rest) >= MIN_REMAINDER_LEN
        if long_enough and rest in freq and freq[rest] < own_rank:
            return candidate, rest
    return None
# ── Main ──────────────────────────────────────────────────────────────────
def main() -> None:
    """CLI entry point: filter prefix+word combinations out of the raw corpus.

    Reads RAW_CACHE, runs the selected detection mode over the words in rank
    order (optionally only the first ``--limit`` words), then writes:
      - CLEAN_CACHE: surviving words, re-ranked from 1 without gaps
      - DISCARDED:   removed words with their original rank and the reason

    Nothing is written with ``--dry-run``. Exits with status 1 when RAW_CACHE
    does not exist. Note that the known-forms dictionary filter from
    words.json is only loaded for heuristic mode.
    """
    parser = argparse.ArgumentParser(description="Clean frequency corpus")
    parser.add_argument("--mode", choices=["yap", "heuristic"], default="yap", help="Detection mode")
    parser.add_argument("--dry-run", action="store_true", help="Show removals without saving")
    parser.add_argument("--resume", action="store_true", help="Resume YAP mode from checkpoint")
    parser.add_argument("--limit", type=int, default=0, help="Process only first N words (0=all)")
    args = parser.parse_args()
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    if not RAW_CACHE.exists():
        logger.error("Raw frequency cache not found: %s", RAW_CACHE)
        sys.exit(1)
    with open(RAW_CACHE, encoding="utf-8") as f:
        raw_freq: dict[str, int] = json.load(f)
    logger.info("Raw frequency corpus: %d entries", len(raw_freq))

    # Sort by rank
    words_by_rank = sorted(raw_freq.items(), key=lambda x: x[1])
    if args.limit:
        words_by_rank = words_by_rank[: args.limit]

    if args.mode == "yap":
        discarded_list = _run_yap_mode(words_by_rank, args)
    else:
        known_forms = _load_known_forms(WORDS_JSON)
        discarded_list = _run_heuristic_mode(words_by_rank, raw_freq, known_forms)

    kept_count = len(words_by_rank) - len(discarded_list)
    logger.info("Done. Kept: %d, Discarded: %d", kept_count, len(discarded_list))
    if args.dry_run:
        logger.info("Dry run — no files written")
        return

    # Build clean frequency dict (re-ranked without gaps)
    discarded_words = {d["word"] for d in discarded_list}
    clean_freq: dict[str, int] = {}
    new_rank = 1
    for word, _rank in words_by_rank:
        if word not in discarded_words:
            clean_freq[word] = new_rank
            new_rank += 1

    with open(CLEAN_CACHE, "w", encoding="utf-8") as f:
        json.dump(clean_freq, f, ensure_ascii=False)
    logger.info("Clean frequency saved: %d entries → %s", len(clean_freq), CLEAN_CACHE)

    with open(DISCARDED, "w", encoding="utf-8") as f:
        json.dump(discarded_list, f, ensure_ascii=False, indent=2)
    # Separator added: the count and the path were previously fused in the message.
    logger.info("Discarded entries saved: %d → %s", len(discarded_list), DISCARDED)
def _run_yap_mode(
    words_by_rank: list[tuple[str, int]],
    args: argparse.Namespace,
) -> list[dict]:
    """Run YAP-based prefix detection over ``words_by_rank``.

    Args:
        words_by_rank: (word, rank) pairs, best rank first.
        args: parsed CLI options; ``resume`` and ``dry_run`` are read.

    Returns:
        One ``{"word", "original_rank", "reason"}`` dict per discarded word.

    Behaviour:
        - Exits with status 1 if a probe request to YAP fails.
        - Single-character and pure-ASCII words are kept without a query.
        - A word whose query fails is kept for this run but is NOT recorded as
          analyzed, so a later ``--resume`` run retries it.
        - Progress is checkpointed every BATCH_SAVE_INTERVAL words (never in
          dry-run). After the loop the checkpoint is deleted on a clean run,
          or saved and kept when any query failed.
    """
    # Check YAP connectivity
    if query_yap("בדיקה") is None:
        logger.error("Cannot connect to YAP API at %s", YAP_URL)
        sys.exit(1)
    logger.info("YAP API connected")

    # Load checkpoint if resuming
    analyzed: dict[str, dict] = {}
    if args.resume and CHECKPOINT.exists():
        with open(CHECKPOINT, encoding="utf-8") as f:
            loaded = json.load(f)
        # Checkpoints written by earlier versions stored failed lookups as
        # "kept"; drop those records so the words are queried again.
        analyzed = {w: v for w, v in loaded.items() if v.get("reason") != "yap_error"}
        logger.info("Resumed from checkpoint: %d words already analyzed", len(analyzed))

    def save_checkpoint() -> None:
        with open(CHECKPOINT, "w", encoding="utf-8") as f:
            json.dump(analyzed, f, ensure_ascii=False)

    discarded_list: list[dict] = []
    discarded_count = 0
    kept_count = 0
    error_count = 0
    for i, (word, rank) in enumerate(words_by_rank):
        if word in analyzed:
            # Already analyzed (from checkpoint)
            verdict = analyzed[word]
            if verdict["discard"]:
                discarded_count += 1
                discarded_list.append({"word": word, "original_rank": rank, "reason": verdict["reason"]})
            else:
                kept_count += 1
        elif len(word) <= 1 or word.isascii():
            # Trivial: single char or ASCII
            analyzed[word] = {"discard": False, "reason": ""}
            kept_count += 1
        else:
            result = query_yap(word)
            if result is None:
                # Keep the word for now, but leave it out of `analyzed` so a
                # transient failure is retried on --resume instead of sticking.
                error_count += 1
                kept_count += 1
                time.sleep(0.5)
            else:
                is_combo, reason = is_prefix_combo_yap(result)
                analyzed[word] = {"discard": is_combo, "reason": reason}
                if is_combo:
                    discarded_count += 1
                    discarded_list.append({"word": word, "original_rank": rank, "reason": reason})
                    if rank <= 500 or discarded_count <= 50:
                        logger.info(" DISCARD rank %5d: %s (%s)", rank, word, reason)
                else:
                    kept_count += 1
                # Rate limit
                if i % 10 == 0:
                    time.sleep(0.01)

        # Checkpoint — evaluated for every word, including skipped/trivial ones,
        # so an interval boundary is never missed.
        if (i + 1) % BATCH_SAVE_INTERVAL == 0:
            if not args.dry_run:
                save_checkpoint()
            logger.info(
                " [%d/%d] kept=%d discarded=%d errors=%d",
                i + 1,
                len(words_by_rank),
                kept_count,
                discarded_count,
                error_count,
            )

    if error_count:
        logger.warning("%d YAP errors encountered", error_count)
    if not args.dry_run:
        if error_count:
            # Keep a complete checkpoint so `--resume` only re-queries the failures.
            save_checkpoint()
            logger.warning("Checkpoint kept at %s; re-run with --resume to retry failed words", CHECKPOINT)
        elif CHECKPOINT.exists():
            # Clean run: the checkpoint is no longer needed.
            CHECKPOINT.unlink()
    return discarded_list
def _run_heuristic_mode(
words_by_rank: list[tuple[str, int]],
raw_freq: dict[str, int],
known_forms: set[str],
) -> list[dict]:
"""Run heuristic prefix detection (no external dependencies)."""
discarded_list: list[dict] = []
discarded_count = 0
for word, rank in words_by_rank:
if len(word) <= 1 or word.isascii():
continue
# Known dictionary form → keep
if word in known_forms:
continue
result = find_prefix_decomposition(word, raw_freq)
if result is not None:
prefix, remainder = result
discarded_count += 1
reason = f"{prefix}+{remainder} (rank {raw_freq[remainder]})"
discarded_list.append({"word": word, "original_rank": rank, "reason": reason})
if rank <= 500 or discarded_count <= 50:
logger.info(" DISCARD rank %5d: %s = %s", rank, word, reason)
return discarded_list
if __name__ == "__main__":
main()

View file

@ -0,0 +1,405 @@
#!/usr/bin/env python3
"""
Extract sentences from PDF books and match vocab words to sentences.
1. Extract sentences from alice.pdf and lion_strawberry.pdf
2. Merge into existing epub_sentence_index.json
3. Match vocab words to sentences, produce vocab_sentence_matches.json
"""
import json
import os
import re
import sys
# Use the venv with pymupdf
sys.path.insert(0, "/home/node/projects/pealim/venv_pdf/lib/python3.11/site-packages")
# Also need the main venv for pandas
sys.path.insert(0, "/home/node/projects/pealim/lib/python3.11/site-packages")
import fitz
import pandas as pd
# Project root. NOTE(review): absolute path tied to one machine; the sibling
# scraper scripts derive their data directory from ``__file__`` instead.
BASE_DIR = "/home/node/projects/pealim"
DATA_DIR = os.path.join(BASE_DIR, "data")
# Directory holding the source books (the PDFs are read from here as well).
EPUBS_DIR = os.path.join(DATA_DIR, "epubs")
# Sentence index that new PDF sentences are merged into.
SENTENCE_INDEX = os.path.join(DATA_DIR, "epub_sentence_index.json")
# Semicolon-separated vocabulary export read with pandas.
VOCAB_CSV = os.path.join(DATA_DIR, "hebrew_dict_for_anki.csv")
# Output: vocabulary word -> example sentences.
MATCHES_FILE = os.path.join(DATA_DIR, "vocab_sentence_matches.json")
# Hebrew points and cantillation marks (U+0591-U+05C7).
# NOTE(review): this range also covers punctuation such as maqaf (U+05BE) and
# sof pasuq (U+05C3), so stripping it joins maqaf-connected words.
NIKKUD_RE = re.compile(r"[\u0591-\u05C7]")
# Base Hebrew letters alef-tav, including final forms.
HEBREW_RE = re.compile(r"[\u05d0-\u05ea]")
# Base letters plus presentation forms; not referenced in the visible part of
# this script.
HEBREW_CHAR_RE = re.compile(r"[\u05d0-\u05ea\ufb20-\ufb4f]")
def strip_nikkud(text):
    """Return *text* with every character matched by ``NIKKUD_RE`` removed."""
    # Splitting on the marks and re-joining drops exactly the matched characters.
    return "".join(NIKKUD_RE.split(text))
def collapse_hebrew_spaces(text):
    """Collapse spaces between Hebrew letter fragments (for badly-encoded PDFs).

    The text is stripped of nikkud, presentation-form letters are folded back
    to their base letters, runs of spaces are squeezed to one, and then every
    remaining space is dropped unless it sits at a detectable word boundary:

    - after a final-form letter (ם ן ף ך ץ),
    - next to punctuation or any other non-Hebrew character,
    - at the very start or end of the text.

    Compared with the earlier version, the presentation-form table now also
    covers U+FB1D, U+FB20-U+FB28, U+FB2C-U+FB2F and U+FB4C-U+FB4F. Final kaf
    and final pe with dagesh (U+FB3A, U+FB43) fold to the *final* letters, so
    the final-form boundary rule still sees them.

    Args:
        text: Raw sentence text, possibly with nikkud and stray spaces.

    Returns:
        The nikkud-free text with intra-word spaces removed, stripped of
        leading and trailing whitespace.
    """
    # One table applied in a single pass, instead of rebuilding a dict and
    # calling str.replace once per code point.
    presentation_to_base = str.maketrans(
        {
            "\ufb1d": "י",  # yod with hiriq
            "\ufb20": "ע",  # alternative ayin
            "\ufb21": "א",  # wide alef
            "\ufb22": "ד",  # wide dalet
            "\ufb23": "ה",  # wide he
            "\ufb24": "כ",  # wide kaf
            "\ufb25": "ל",  # wide lamed
            "\ufb26": "ם",  # wide final mem
            "\ufb27": "ר",  # wide resh
            "\ufb28": "ת",  # wide tav
            "\ufb2a": "ש",  # shin with shin dot
            "\ufb2b": "ש",  # shin with sin dot
            "\ufb2c": "ש",  # shin with dagesh and shin dot
            "\ufb2d": "ש",  # shin with dagesh and sin dot
            "\ufb2e": "א",  # alef with patah
            "\ufb2f": "א",  # alef with qamats
            "\ufb30": "א",
            "\ufb31": "ב",
            "\ufb32": "ג",
            "\ufb33": "ד",
            "\ufb34": "ה",
            "\ufb35": "ו",
            "\ufb36": "ז",
            "\ufb38": "ט",
            "\ufb39": "י",
            "\ufb3a": "ך",  # final kaf with dagesh
            "\ufb3b": "כ",
            "\ufb3c": "ל",
            "\ufb3e": "מ",
            "\ufb40": "נ",
            "\ufb41": "ס",
            "\ufb43": "ף",  # final pe with dagesh
            "\ufb44": "פ",
            "\ufb46": "צ",
            "\ufb47": "ק",
            "\ufb48": "ר",
            "\ufb49": "ש",
            "\ufb4a": "ת",
            "\ufb4b": "ו",  # vav with holam
            "\ufb4c": "ב",  # bet with rafe
            "\ufb4d": "כ",  # kaf with rafe
            "\ufb4e": "פ",  # pe with rafe
            "\ufb4f": "אל",  # alef-lamed ligature
        }
    )
    stripped = strip_nikkud(text).translate(presentation_to_base)

    # After this substitution no two spaces are adjacent, so the neighbours of
    # any space are guaranteed to be non-space characters.
    stripped = re.sub(r" {2,}", " ", stripped)

    final_forms = set("םןףךץ")
    last_index = len(stripped) - 1
    result = []
    for i, ch in enumerate(stripped):
        if ch != " ":
            result.append(ch)
            continue

        prev_ch = stripped[i - 1] if i > 0 else None
        next_ch = stripped[i + 1] if i < last_index else None

        is_boundary = (
            prev_ch is None
            or next_ch is None
            or prev_ch in final_forms
            or not HEBREW_RE.match(prev_ch)
            or not HEBREW_RE.match(next_ch)
        )
        if is_boundary:
            result.append(" ")
        # Otherwise the space is an intra-word gap and is dropped.
    return "".join(result).strip()
def extract_pdf_sentences(pdf_path, book_name):
    """Extract candidate example sentences from a PDF file.

    Each page's text is split into lines and then on sentence-ending
    punctuation. A fragment is kept when it contains Hebrew letters, has
    between 4 and 15 Hebrew words after space-collapsing, and does not look
    like copyright/imprint metadata.

    Args:
        pdf_path: Path of the PDF to read.
        book_name: Title stored with every extracted sentence.

    Returns:
        A list of ``{"text", "book", "stripped"}`` dicts, where ``stripped``
        is the nikkud-free, space-collapsed form of ``text``.
    """
    metadata_keywords = ("copyright", "©", "isbn", "printed in")
    sentences = []
    doc = fitz.open(pdf_path)
    try:
        for page_num in range(len(doc)):
            text = doc[page_num].get_text()
            if not text.strip():
                continue

            # Split into lines first, then on sentence-ending punctuation.
            raw_sentences = []
            for line in text.split("\n"):
                line = line.strip()
                if not line:
                    continue
                raw_sentences.extend(re.split(r"(?<=[.?!])\s+", line))

            for sent in raw_sentences:
                sent = sent.strip()
                # Must contain Hebrew letters. This also rules out bare page
                # numbers, so no separate digits-only check is needed.
                if not sent or not HEBREW_RE.search(sent):
                    continue

                stripped = collapse_hebrew_spaces(sent)
                word_count = sum(1 for w in stripped.split() if HEBREW_RE.search(w))
                if word_count < 4 or word_count > 15:
                    continue

                lowered = sent.lower()
                if any(kw in lowered for kw in metadata_keywords):
                    continue

                sentences.append(
                    {
                        "text": sent,
                        "book": book_name,
                        "stripped": stripped,
                    }
                )
    finally:
        # Release the file handle even if a page fails to parse.
        doc.close()
    return sentences
def has_extractable_text(pdf_path):
    """Return True if any of the first 10 pages of the PDF yields text.

    Args:
        pdf_path: Path of the PDF to probe.

    Returns:
        True when at least one probed page has non-whitespace text.
    """
    doc = fitz.open(pdf_path)
    try:
        pages_to_check = min(len(doc), 10)
        return any(doc[i].get_text().strip() for i in range(pages_to_check))
    finally:
        # Close the document even when text extraction raises.
        doc.close()
def load_sentence_index():
    """Return the stored sentence index, or an empty one if the file is absent."""
    if not os.path.exists(SENTENCE_INDEX):
        return {"sentences": []}
    with open(SENTENCE_INDEX, encoding="utf-8") as handle:
        index = json.load(handle)
    return index
def save_sentence_index(data):
    """Write *data* to the sentence index file as indented UTF-8 JSON."""
    with open(SENTENCE_INDEX, "w", encoding="utf-8") as handle:
        handle.write(json.dumps(data, ensure_ascii=False, indent=2))
def match_vocab_to_sentences(sentences, vocab_df):
    """Match vocabulary words to example sentences.

    A word matches a sentence when it appears as a whitespace-delimited token
    of the sentence's ``stripped`` text. Words of three or more letters also
    match as a substring when the sentence comes from one of the PDF books
    (whose words may be run together), or when some occurrence has a
    non-Hebrew character, or the text edge, on at least one side.

    Args:
        sentences: Dicts with ``text``, ``book`` and (optionally) ``stripped``.
        vocab_df: DataFrame with "Word Without Nikkud" and "Word" columns.
            Rows where either cell is empty or missing are ignored.

    Returns:
        ``{word_without_nikkud: {"word_nikkud": ..., "sentences": [...]}}``
        with up to three sentences per word, best-scored first.
    """
    pdf_books = ("אליס בארץ הפלאות", "האריה שאהב תות")

    def clean_cell(value):
        # Empty CSV cells arrive as NaN/None; str() would turn them into the
        # bogus vocabulary words "nan"/"None".
        if value is None or (not isinstance(value, str) and pd.isna(value)):
            return ""
        return str(value).strip()

    def score(sd):
        # Sort key: 6-12 words is ideal (shorter first), then too-short
        # sentences (longer first), then too-long ones (shorter first).
        wc = sd["word_count"]
        if 6 <= wc <= 12:
            return (0, wc)
        if wc < 6:
            return (1, -wc)
        return (2, wc)

    def has_loose_boundary(word, stripped):
        # True if any occurrence of `word` has a non-Hebrew neighbour (or the
        # text edge) on at least one side. Every occurrence is inspected, not
        # only the first one.
        start = stripped.find(word)
        while start != -1:
            end = start + len(word)
            before_ok = start == 0 or not HEBREW_RE.match(stripped[start - 1])
            after_ok = end >= len(stripped) or not HEBREW_RE.match(stripped[end])
            if before_ok or after_ok:
                return True
            start = stripped.find(word, start + 1)
        return False

    vocab_words = []
    for _, row in vocab_df.iterrows():
        word_no_nik = clean_cell(row.get("Word Without Nikkud", ""))
        word_nik = clean_cell(row.get("Word", ""))
        if word_no_nik and word_nik:
            vocab_words.append((word_no_nik, word_nik))
    print(f"Matching {len(vocab_words)} vocab words against {len(sentences)} sentences...")

    # Precompute the per-sentence data used for matching and scoring.
    sent_data = []
    for s in sentences:
        stripped = s.get("stripped", "")
        sent_data.append(
            {
                "text": s["text"],
                "book": s["book"],
                "stripped": stripped,
                "word_count": len(stripped.split()),
            }
        )

    matches = {}
    matched_count = 0
    for word_no_nik, word_nik in vocab_words:
        if len(word_no_nik) < 2:
            continue

        pattern = re.compile(r"(?:^|\s)" + re.escape(word_no_nik) + r"(?:\s|$)")
        # Substring matching is limited to words of >= 3 letters to keep false
        # positives down.
        use_substring = len(word_no_nik) >= 3

        word_matches = []
        for sd in sent_data:
            stripped = sd["stripped"]
            if pattern.search(stripped):
                word_matches.append(sd)
            elif use_substring and word_no_nik in stripped:
                if sd["book"] in pdf_books or has_loose_boundary(word_no_nik, stripped):
                    word_matches.append(sd)

        if word_matches:
            matched_count += 1
            word_matches.sort(key=score)
            matches[word_no_nik] = {
                "word_nikkud": word_nik,
                "sentences": [{"text": m["text"], "book": m["book"]} for m in word_matches[:3]],
            }

    # Guard the percentage against an empty vocabulary.
    total_words = len(vocab_words)
    percent = 100 * matched_count / total_words if total_words else 0.0
    print(f"Words with at least 1 match: {matched_count}/{total_words} ({percent:.1f}%)")
    return matches
def main():
    """Extract PDF sentences, merge them into the index and match vocabulary.

    Steps: (1) extract sentences from the known PDFs, (2) merge the new ones
    into the sentence index, deduplicated by ``(stripped, book)``, (3) match
    the vocabulary CSV against all indexed sentences and write the matches,
    (4) print summary statistics.
    """
    # ── Step 1: Extract from PDFs ──
    pdfs = [
        ("alice.pdf", "אליס בארץ הפלאות"),
        ("lion_strawberry.pdf", "האריה שאהב תות"),
    ]
    all_new_sentences = []
    for filename, book_name in pdfs:
        pdf_path = os.path.join(EPUBS_DIR, filename)
        if not os.path.exists(pdf_path):
            print(f"SKIP: {filename} not found")
            continue
        if not has_extractable_text(pdf_path):
            print(f"SKIP: {filename} has no extractable text (likely scanned images)")
            continue
        print(f"Extracting from {filename} ({book_name})...")
        sentences = extract_pdf_sentences(pdf_path, book_name)
        print(f" Extracted {len(sentences)} sentences")
        all_new_sentences.extend(sentences)

    # ── Step 2: Merge with existing index ──
    index = load_sentence_index()
    # Tolerate an index file that lacks the "sentences" key.
    indexed = index.setdefault("sentences", [])
    existing_count = len(indexed)

    # Deduplicate by (stripped, book).
    existing_keys = {(s.get("stripped", ""), s.get("book", "")) for s in indexed}
    added = 0
    for s in all_new_sentences:
        key = (s["stripped"], s["book"])
        if key not in existing_keys:
            indexed.append(s)
            existing_keys.add(key)
            added += 1
    save_sentence_index(index)
    total = len(indexed)
    print(f"\nSentence index: {existing_count} existing + {added} new = {total} total")

    # ── Per-book stats ──
    book_counts = {}
    for s in indexed:
        book = s.get("book", "unknown")
        book_counts[book] = book_counts.get(book, 0) + 1
    books_by_size = sorted(book_counts.items(), key=lambda x: -x[1])
    print("\nSentences per book:")
    for book, count in books_by_size:
        print(f" {book}: {count}")

    # ── Step 3: Match vocab words to sentences ──
    print(f"\nLoading vocab from {VOCAB_CSV}...")
    vocab_df = pd.read_csv(VOCAB_CSV, sep=";", index_col=0)
    print(f" {len(vocab_df)} vocab words loaded")
    matches = match_vocab_to_sentences(indexed, vocab_df)
    with open(MATCHES_FILE, "w", encoding="utf-8") as f:
        json.dump(matches, f, ensure_ascii=False, indent=2)
    print(f"\nWrote {len(matches)} word matches to {MATCHES_FILE}")

    # ── Step 4: Summary stats ──
    total_words = len(vocab_df)
    matched_words = len(matches)
    # An empty vocabulary file must not crash the summary.
    percent = 100 * matched_words / total_words if total_words else 0.0
    print(f"\n{'=' * 50}")
    print("SUMMARY")
    print(f"{'=' * 50}")
    print(f"Total sentences: {total}")
    for book, count in books_by_size:
        print(f" {book}: {count}")
    print(f"Total vocab words: {total_words}")
    print(f"Words with sentences: {matched_words} ({percent:.1f}%)")
    print(f"Words without sentences: {total_words - matched_words}")
if __name__ == "__main__":
main()

237
scripts/scrape_ktiv_male.py Normal file
View file

@ -0,0 +1,237 @@
#!/usr/bin/env python3
"""
Scrape ktiv male (plene/vowelless) forms from pealim.com.
Uses hebstyle=vl cookie to get vowelless writing with matres lectionis.
Builds a lookup: ktiv_male_form [{word_nikkud, form_type, pos, slug}]
This enables matching Hebrew text (which is normally in ktiv male)
against our vocabulary, including conjugated verbs and noun plurals.
"""
import json
import logging
import sys
import time
from pathlib import Path
import requests
from bs4 import BeautifulSoup
# Timestamped INFO-level logging for the whole script.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)
# <repo>/data, resolved relative to this script's location.
DATA_DIR = Path(__file__).resolve().parent.parent / "data"
# Final lookup file; partial saves append ".partial" to this name.
OUTPUT_PATH = DATA_DIR / "ktiv_male_forms.json"
# hebstyle=vl requests the vowelless (ktiv male) rendering; translit=none
# presumably turns transliteration off.
COOKIES = {"translit": "none", "hebstyle": "vl"}
# Per-request timeout, in seconds.
REQUEST_TIMEOUT = 15
DELAY = 1.5  # seconds between requests
def fetch_verb_ktiv_male(slug: str, infinitive_nikkud: str) -> list[dict]:
    """Fetch all conjugated forms in ktiv male for a verb.

    Args:
        slug: The pealim.com dictionary slug of the verb.
        infinitive_nikkud: Vowelled infinitive stored with every form.

    Returns:
        One dict per form with ``ktiv_male``, ``word_nikkud``, ``form_type``
        ("infinitive" or "conjugation"), ``pos`` and ``slug``. Empty when the
        page has no conjugation table.

    Raises:
        requests.HTTPError: If the page request returns an error status.
    """
    url = f"https://www.pealim.com/dict/{slug}/"
    resp = requests.get(url, cookies=COOKIES, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    forms: list[dict] = []
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return forms

    def make_entry(ktiv: str, form_type: str) -> dict:
        return {
            "ktiv_male": ktiv,
            "word_nikkud": infinitive_nikkud,
            "form_type": form_type,
            "pos": "Verb",
            "slug": slug,
        }

    # Track spellings already emitted in a set, instead of rebuilding one from
    # `forms` for every table cell.
    seen: set[str] = set()

    # The infinitive is shown in the page lead rather than in the table.
    lead = soup.find("div", class_="lead")
    if lead:
        for span in lead.find_all("span", class_="menukad"):
            ktiv = span.text.strip()
            if ktiv:
                forms.append(make_entry(ktiv, "infinitive"))
                seen.add(ktiv)

    for row in table.find_all("tr"):
        for span in row.find_all("span", class_="menukad"):
            ktiv = span.text.strip()
            if ktiv and ktiv not in seen:
                forms.append(make_entry(ktiv, "conjugation"))
                seen.add(ktiv)
    return forms
def fetch_noun_ktiv_male(slug: str, singular_nikkud: str, gender: str) -> list[dict]:
    """Fetch noun declension forms in ktiv male.

    Each form is labelled from the table layout: the row header says whether
    the row holds the absolute or construct state, and the cell position
    within the row says singular (first cell) or plural (second cell). This
    replaces a per-row counter that gave both cells of a row the same label.
    Forms that cannot be placed this way are labelled "other".

    NOTE(review): the row/column layout is taken from how the noun-plural
    scraper parses the same detail pages; confirm against a live page.

    Args:
        slug: The pealim.com dictionary slug of the noun.
        singular_nikkud: Vowelled singular stored with every form.
        gender: Grammatical gender stored with every form.

    Returns:
        One dict per form with ``ktiv_male``, ``word_nikkud``, ``form_type``,
        ``pos``, ``slug`` and ``gender``. Empty when the page has no table.

    Raises:
        requests.HTTPError: If the page request returns an error status.
    """
    url = f"https://www.pealim.com/dict/{slug}/"
    resp = requests.get(url, cookies=COOKIES, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    forms: list[dict] = []
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return forms

    for row in table.find_all("tr"):
        header = row.find("th")
        label = header.get_text(strip=True).lower() if header else ""
        if "absolute" in label:
            state = "absolute"
        elif "construct" in label:
            state = "construct"
        else:
            state = None

        cells = row.find_all("td")
        for span in row.find_all("span", class_="menukad"):
            ktiv = span.text.strip()
            if not ktiv:
                continue
            # Locate the enclosing cell by identity: bs4 tags compare equal by
            # content, so list.index() could pick the wrong cell.
            cell = span.find_parent("td")
            column = next((i for i, c in enumerate(cells) if c is cell), None)
            if state is not None and column in (0, 1):
                number = "singular" if column == 0 else "plural"
                form_type = f"{state}_{number}"
            else:
                form_type = "other"
            forms.append(
                {
                    "ktiv_male": ktiv,
                    "word_nikkud": singular_nikkud,
                    "form_type": form_type,
                    "pos": "Noun",
                    "slug": slug,
                    "gender": gender,
                }
            )
    return forms
def scrape_verbs() -> list[dict]:
    """Scrape ktiv male forms for all verbs in conjugations.json.

    Each distinct slug is fetched once. A verb whose fetch fails is logged and
    skipped, so one bad page does not abort the run.

    Returns:
        All form dicts collected across verbs; empty if conjugations.json
        does not exist.
    """
    conj_path = DATA_DIR / "conjugations.json"
    if not conj_path.exists():
        logger.warning("No conjugations.json found")
        return []
    # Read explicitly as UTF-8: the data is Hebrew and the platform default
    # encoding is not guaranteed to be UTF-8.
    with open(conj_path, encoding="utf-8") as f:
        conjugations = json.load(f)

    all_forms: list[dict] = []
    slugs_done: set[str] = set()
    for verb, data in conjugations.items():
        if not data or not data.get("slug"):
            continue
        slug = data["slug"]
        if slug in slugs_done:
            continue
        slugs_done.add(slug)
        try:
            forms = fetch_verb_ktiv_male(slug, verb)
            all_forms.extend(forms)
            logger.info(" Verb %s (%s): %d forms", verb, slug, len(forms))
        except Exception as e:  # best effort: log and move on to the next verb
            logger.warning(" Verb %s (%s) failed: %s", verb, slug, e)
        time.sleep(DELAY)
    return all_forms
def scrape_nouns() -> list[dict]:
    """Scrape ktiv male forms for all nouns in noun_slug_map.json.

    The vowelled singular stored with each form comes from noun_plurals.json
    when it has a non-empty value for the word, otherwise from the slug map
    (falling back to the map key). A ``.partial`` snapshot of everything
    collected so far is written after every 50th processed noun, when that
    noun was fetched successfully.

    Returns:
        All form dicts collected across nouns; empty if the slug map file
        does not exist.
    """
    slug_path = DATA_DIR / "noun_slug_map.json"
    if not slug_path.exists():
        logger.warning("No noun_slug_map.json found")
        return []
    # Explicit UTF-8: both files hold Hebrew text.
    with open(slug_path, encoding="utf-8") as f:
        slug_map = json.load(f)

    plurals_path = DATA_DIR / "noun_plurals.json"
    plurals = {}
    if plurals_path.exists():
        with open(plurals_path, encoding="utf-8") as f:
            plurals = json.load(f)

    all_forms: list[dict] = []
    done = 0
    total = len(slug_map)
    for word, info in slug_map.items():
        slug = info.get("slug", "")
        if not slug:
            continue
        nikkud = info.get("word_nikkud", word)
        if word in plurals:
            # An empty "singular" must not override the fallback.
            nikkud = plurals[word].get("singular") or nikkud
        gender = info.get("gender", "")
        try:
            forms = fetch_noun_ktiv_male(slug, nikkud, gender)
            all_forms.extend(forms)
            done += 1
            if done % 50 == 0:
                logger.info(" Nouns: %d/%d (%d forms)", done, total, len(all_forms))
                # Save incrementally
                _save_forms(all_forms, partial=True)
        except Exception as e:  # best effort: log and move on to the next noun
            logger.warning(" Noun %s (%s) failed: %s", word, slug, e)
            done += 1
        time.sleep(DELAY)
    return all_forms
def _save_forms(all_forms: list[dict], partial: bool = False):
    """Build and save the ktiv male lookup dict.

    Args:
        all_forms: Form dicts, each carrying a ``ktiv_male`` spelling.
        partial: When True, write to ``<output name>.partial`` instead of the
            final output file.
    """
    lookup: dict[str, list[dict]] = {}
    for entry in all_forms:
        ktiv = entry["ktiv_male"]
        # The spelling is the key, so it is left out of the stored entry.
        stored = {k: v for k, v in entry.items() if k != "ktiv_male"}
        lookup.setdefault(ktiv, []).append(stored)

    suffix = ".partial" if partial else ""
    out = OUTPUT_PATH.parent / (OUTPUT_PATH.name + suffix)
    # ensure_ascii=False emits raw Hebrew, so the file must be opened as UTF-8
    # rather than with the platform default encoding.
    with open(out, "w", encoding="utf-8") as f:
        json.dump(lookup, f, ensure_ascii=False, indent=1)
    logger.info(f" Saved {len(lookup)} unique ktiv male forms → {out}")
def main():
    """Scrape verbs and/or nouns and write the ktiv male lookup.

    The optional first command-line argument selects the mode: "all"
    (default), "verbs" or "nouns". Any other value aborts before scraping, so
    that a typo does not overwrite the output file with an empty lookup.

    Note that the output file is rewritten from this run's results only, so
    "verbs" or "nouns" alone replaces the whole file.
    """
    valid_modes = ("all", "verbs", "nouns")
    mode = sys.argv[1] if len(sys.argv) > 1 else "all"
    if mode not in valid_modes:
        logger.error("Unknown mode %r; expected one of %s", mode, ", ".join(valid_modes))
        sys.exit(2)

    all_forms = []
    if mode in ("all", "verbs"):
        logger.info("=== Scraping verb ktiv male forms ===")
        verb_forms = scrape_verbs()
        all_forms.extend(verb_forms)
        logger.info(f"Verbs done: {len(verb_forms)} forms from {len({f['slug'] for f in verb_forms})} verbs")
    if mode in ("all", "nouns"):
        logger.info("=== Scraping noun ktiv male forms ===")
        noun_forms = scrape_nouns()
        all_forms.extend(noun_forms)
        logger.info(f"Nouns done: {len(noun_forms)} forms")
    _save_forms(all_forms)
    logger.info(f"Total: {len(all_forms)} forms → {OUTPUT_PATH}")
if __name__ == "__main__":
main()

View file

@ -0,0 +1,365 @@
#!/usr/bin/env python3
"""
Scrape pealim.com for noun plural and construct forms.
Step 1: Collect noun slugs from list pages (/dict/?pos=noun&page=N)
Step 2: Fetch detail pages for plural + construct forms
Step 3: Print summary statistics
"""
import json
import re
import time
from pathlib import Path
import requests
from bs4 import BeautifulSoup
BASE_URL = "https://www.pealim.com"
# Site display preferences sent with every request.
# NOTE(review): hebstyle=mo presumably selects the vowelled (nikkud)
# rendering — confirm against the site.
COOKIES = {"translit": "none", "hebstyle": "mo"}
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; PealimScraper/1.0)"}
# <repo>/data, resolved relative to this script's location.
DATA_DIR = Path(__file__).resolve().parent.parent / "data"
# Step 1 output: plain word -> slug, vowelled form, part of speech, gender, mishkal.
SLUG_MAP_FILE = DATA_DIR / "noun_slug_map.json"
# Step 1 resume state: sorted list of list-page numbers already scraped.
PROGRESS_FILE = DATA_DIR / "noun_slug_map_progress.json"
# Step 2 output: plain word -> plural/construct forms and audio URLs.
PLURALS_FILE = DATA_DIR / "noun_plurals.json"
DELAY = 1.5  # seconds between requests
def load_json(path, default=None):
    """Load JSON from *path*, or return a fallback when the file is missing.

    Args:
        path: A ``pathlib.Path`` to read.
        default: Value returned when the file does not exist; ``None`` means
            "return a new empty dict".

    Returns:
        The parsed JSON document, or the fallback.
    """
    if path.exists():
        # The data is Hebrew; do not rely on the platform default encoding.
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    return default if default is not None else {}
def save_json(path, data):
    """Write *data* to *path* as indented UTF-8 JSON with raw non-ASCII text."""
    # ensure_ascii=False emits raw Hebrew, so the encoding must be pinned to
    # UTF-8 instead of the platform default.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
def fetch_with_retry(url, max_retries=5):
    """Fetch *url*, retrying with exponential backoff.

    Waits 2, 4, 8, ... seconds (capped at 60) between attempts. No wait
    follows the final failed attempt, because nothing is retried after it.

    Args:
        url: Address to fetch with the module's cookies and headers.
        max_retries: Maximum number of attempts.

    Returns:
        The successful ``requests.Response``, or ``None`` if every attempt
        failed.
    """
    for attempt in range(max_retries):
        try:
            r = requests.get(url, cookies=COOKIES, headers=HEADERS, timeout=30)
            r.raise_for_status()
            return r
        except (requests.RequestException, ConnectionError) as e:
            if attempt + 1 >= max_retries:
                # Last attempt: report it and give up without sleeping.
                print(f" Retry {attempt + 1}/{max_retries} for {url}: {e}")
                break
            wait = min(2**attempt * 2, 60)
            print(f" Retry {attempt + 1}/{max_retries} for {url}: {e} (waiting {wait}s)")
            time.sleep(wait)
    print(f" FAILED after {max_retries} retries: {url}")
    return None
def get_total_pages():
    """Return the highest page number linked from the first noun list page.

    Returns 0 when the first page cannot be fetched, and 1 when it carries no
    pagination links.
    """
    response = fetch_with_retry(f"{BASE_URL}/dict/?pos=noun&page=1")
    if not response:
        return 0

    soup = BeautifulSoup(response.text, "lxml")
    found = (re.search(r"page=(\d+)", link.get("href", "")) for link in soup.select("ul.pagination li a"))
    page_numbers = [int(m.group(1)) for m in found if m]
    return max(page_numbers, default=1)
def parse_list_page(html):
    """Parse one noun list page into a list of entry dicts.

    Every usable table row yields a dict with ``word_plain``, ``slug``,
    ``word_nikkud``, ``pos``, ``gender`` and ``mishkal``. Rows with fewer than
    three cells, without a link, or without a ``/dict/<slug>/`` href are
    skipped. An empty list is returned when the page has no dictionary table.
    """
    table = BeautifulSoup(html, "lxml").select_one("table.dict-table")
    if not table:
        return []

    entries = []
    data_rows = table.select("tr")[1:]  # the first row is the header
    for row in data_rows:
        cells = row.select("td")
        if len(cells) < 3:
            continue

        # First cell: the word and its link.
        word_cell = cells[0]
        link = word_cell.select_one("a")
        if not link:
            continue
        slug_match = re.search(r"/dict/([^/]+)/", link.get("href", ""))
        if not slug_match:
            continue

        menukad = word_cell.select_one("span.menukad")
        word_nikkud = menukad.get_text(strip=True) if menukad else ""

        # Third cell: part-of-speech text, which also carries gender and the
        # mishkal pattern name.
        pos_text = cells[2].get_text(strip=True)
        pos_lower = pos_text.lower()
        gender = next((g for g in ("masculine", "feminine") if g in pos_lower), "")
        pattern_match = re.search(r"(\w+)\s*pattern", pos_lower)

        entries.append(
            {
                # Plain form: the vowelled word with combining marks removed.
                "word_plain": re.sub(r"[\u0591-\u05C7]", "", word_nikkud),
                "slug": slug_match.group(1),
                "word_nikkud": word_nikkud,
                "pos": pos_text,
                "gender": gender,
                "mishkal": pattern_match.group(1) if pattern_match else "",
            }
        )
    return entries
def step1_collect_slugs():
    """Step 1: Collect noun slugs from list pages.

    Resumes from the saved slug map and list of completed pages, scrapes the
    remaining list pages, and saves progress every 10 pages and at the end.
    Entries are keyed by the plain (unvowelled) word, so a later entry with
    the same plain spelling replaces an earlier one.

    Returns:
        The slug map: plain word -> slug, vowelled form, part of speech,
        gender and mishkal.
    """
    print("=" * 60)
    print("STEP 1: Collecting noun slugs from list pages")
    print("=" * 60)
    slug_map = load_json(SLUG_MAP_FILE, {})
    progress = load_json(PROGRESS_FILE, [])
    completed_pages = set(progress) if isinstance(progress, list) else set()

    total_pages = get_total_pages()
    if total_pages == 0:
        # 0 means the first list page could not be fetched; do not report
        # that as "everything already scraped".
        print("Could not fetch the first list page; keeping the existing slug map")
        return slug_map

    print(f"Total pages: {total_pages}")
    print(f"Already completed: {len(completed_pages)} pages, {len(slug_map)} nouns")
    remaining = [p for p in range(1, total_pages + 1) if p not in completed_pages]
    print(f"Remaining pages: {len(remaining)}")
    if not remaining:
        print("All pages already scraped!")
        return slug_map

    for i, page_num in enumerate(remaining):
        url = f"{BASE_URL}/dict/?pos=noun&page={page_num}"
        r = fetch_with_retry(url)
        if not r:
            print(f" Skipping page {page_num}")
            continue

        entries = parse_list_page(r.text)
        for entry in entries:
            slug_map[entry["word_plain"]] = {
                "slug": entry["slug"],
                "word_nikkud": entry["word_nikkud"],
                "pos": entry["pos"],
                "gender": entry["gender"],
                "mishkal": entry["mishkal"],
            }
        completed_pages.add(page_num)
        done = len(completed_pages)
        print(f" Page {page_num} ({done}/{total_pages}): {len(entries)} nouns (total: {len(slug_map)})")

        # Save progress every 10 pages and after the last page.
        if (i + 1) % 10 == 0 or page_num == remaining[-1]:
            save_json(SLUG_MAP_FILE, slug_map)
            save_json(PROGRESS_FILE, sorted(completed_pages))
            print(f" [Saved progress: {len(slug_map)} nouns, {done} pages]")
        time.sleep(DELAY)

    # Final save; this also covers a skipped last page.
    save_json(SLUG_MAP_FILE, slug_map)
    save_json(PROGRESS_FILE, sorted(completed_pages))
    print(f"\nStep 1 complete: {len(slug_map)} total nouns from {len(completed_pages)} pages")
    return slug_map
def parse_detail_page(html, slug, gender, mishkal):
    """Parse a noun detail page for plural/construct forms.

    Reads the first conjugation table on the page. In a row whose header
    mentions "absolute", the first two cells give the singular and plural
    forms with their audio URLs. In a row whose header mentions "construct",
    they give the construct singular and plural.

    Returns:
        A dict of the extracted forms together with *slug*, *gender* and
        *mishkal*, or ``None`` when the page has no conjugation table.
    """
    tables = BeautifulSoup(html, "lxml").select("table.conjugation-table")
    if not tables:
        return None

    def word_in(cell):
        # Text of the cell's vowelled-word span, or "" when there is none.
        span = cell.select_one("span.menukad")
        return span.get_text(strip=True) if span else ""

    def audio_in(cell):
        # Prefer a descendant carrying data-audio; fall back to the cell itself.
        holder = cell.select_one("[data-audio]")
        return holder.get("data-audio", "") if holder else cell.get("data-audio", "")

    result = {
        "slug": slug,
        "singular": "",
        "singular_audio": "",
        "plural": "",
        "plural_audio": "",
        "construct_singular": "",
        "construct_plural": "",
        "gender": gender,
        "mishkal": mishkal,
    }
    absolute_keys = (("singular", "singular_audio"), ("plural", "plural_audio"))
    construct_keys = ("construct_singular", "construct_plural")

    for row in tables[0].select("tr"):
        header = row.select_one("th")
        if not header:
            continue
        label = header.get_text(strip=True).lower()
        cells = row.select("td")
        # zip() stops at the shorter input, so missing cells leave the
        # defaults untouched and cells beyond the second are ignored.
        if "absolute" in label:
            for cell, (word_key, audio_key) in zip(cells, absolute_keys):
                result[word_key] = word_in(cell)
                result[audio_key] = audio_in(cell)
        elif "construct" in label:
            for cell, word_key in zip(cells, construct_keys):
                result[word_key] = word_in(cell)
    return result
def step2_fetch_plurals(slug_map):
    """Step 2: Fetch detail pages for plural + construct forms.

    Nouns already present in the plurals file are skipped. A noun whose page
    has no declension table gets a minimal entry flagged with
    ``no_declension_table``. Results are saved every 50 nouns and at the end.

    Returns:
        The plurals dict, keyed like *slug_map*.
    """
    print("\n" + "=" * 60)
    print("STEP 2: Fetching plural + construct forms from detail pages")
    print("=" * 60)
    plurals = load_json(PLURALS_FILE, {})
    already_done = set(plurals.keys())

    # Work list: nouns that have no plural data yet.
    work = [(word, info) for word, info in slug_map.items() if word not in already_done]
    print(f"Already have plural data: {len(already_done)}")
    print(f"Remaining to fetch: {len(work)}")
    if not work:
        print("All nouns already have plural data!")
        return plurals

    total = len(already_done) + len(work)
    last_index = len(work) - 1
    skipped = 0
    for i, (word, info) in enumerate(work):
        position = i + 1
        slug = info["slug"]
        response = fetch_with_retry(f"{BASE_URL}/dict/{slug}/")
        if not response:
            print(f" Skipping {word} ({slug})")
            skipped += 1
            continue

        entry = parse_detail_page(response.text, slug, info.get("gender", ""), info.get("mishkal", ""))
        if entry:
            plurals[word] = entry
        else:
            # No declension table - store minimal entry
            plurals[word] = {
                "slug": slug,
                "singular": info.get("word_nikkud", ""),
                "singular_audio": "",
                "plural": "",
                "plural_audio": "",
                "construct_singular": "",
                "construct_plural": "",
                "gender": info.get("gender", ""),
                "mishkal": info.get("mishkal", ""),
                "no_declension_table": True,
            }

        on_fifty = position % 50 == 0
        if on_fifty or i == 0:
            done = len(already_done) + position - skipped
            print(
                f" [{position}/{len(work)}] {word} ({slug}): "
                f"plural={entry['plural'] if entry else 'N/A'} "
                f"(total: {done}/{total})"
            )
        # Save every 50 entries and after the last one.
        if on_fifty or i == last_index:
            save_json(PLURALS_FILE, plurals)
            print(f" [Saved: {len(plurals)} entries]")
        time.sleep(DELAY)

    save_json(PLURALS_FILE, plurals)
    print(f"\nStep 2 complete: {len(plurals)} total noun entries with plural data")
    return plurals
def step3_summary(slug_map, plurals):
    """Step 3: Print summary statistics.

    Counts entries with a plural, with construct forms, with audio and
    without a declension table. Also counts "irregular" plurals: masculine
    nouns whose unvowelled plural ends in ות, and feminine nouns whose
    unvowelled plural ends in ים.
    """
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)

    entries = list(plurals.values())
    has_plural = sum(bool(v.get("plural")) for v in entries)
    has_construct = sum(bool(v.get("construct_singular") or v.get("construct_plural")) for v in entries)
    has_audio = sum(bool(v.get("singular_audio") or v.get("plural_audio")) for v in entries)
    no_table = sum(bool(v.get("no_declension_table")) for v in entries)

    # The plural ending that counts as irregular for each gender.
    unexpected_ending = {"masculine": "ות", "feminine": "ים"}
    irregular = 0
    for v in entries:
        plural = v.get("plural", "")
        gender = v.get("gender", "")
        if not plural or not gender:
            continue
        ending = unexpected_ending.get(gender)
        if ending is None:
            continue
        plain_plural = re.sub(r"[\u0591-\u05C7]", "", plural)
        if plain_plural.endswith(ending):
            irregular += 1

    print(f"Total nouns in slug map: {len(slug_map)}")
    print(f"Total nouns with plural data: {len(plurals)}")
    print(f" - With plural form: {has_plural}")
    print(f" - With construct forms: {has_construct}")
    print(f" - With audio URLs: {has_audio}")
    print(f" - No declension table: {no_table}")
    print(f" - Irregular plurals: {irregular}")
def main():
    """Run the three scraper steps in order: slugs, plurals, summary."""
    print("Pealim Noun Plural Scraper")
    print(f"Data directory: {DATA_DIR}")
    print()

    collected_slugs = step1_collect_slugs()
    collected_plurals = step2_fetch_plurals(collected_slugs)
    step3_summary(collected_slugs, collected_plurals)
if __name__ == "__main__":
main()

250
scripts/scrape_verb_ktiv.py Normal file
View file

@ -0,0 +1,250 @@
#!/usr/bin/env python3
"""Scrape ktiv male (vowelless plene) conjugation forms for top 500 verbs from pealim.com."""
import json
import os
import re
import sys
import time
sys.stdout.reconfigure(line_buffering=True)
import requests # noqa: E402
from bs4 import BeautifulSoup # noqa: E402
# <repo>/data, resolved relative to this script's location.
DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
# Input: list of verb records; the script reads their "wni" and "word" fields.
INPUT_FILE = os.path.join(DATA_DIR, "top_verbs_to_scrape.json")
# Output lookup: ktiv male form -> list of entries.
OUTPUT_FILE = os.path.join(DATA_DIR, "ktiv_male_forms.json")
# Merged snapshot written at each checkpoint and removed after a clean finish.
PARTIAL_FILE = os.path.join(DATA_DIR, "ktiv_male_forms_partial.json")
# Resume state: verbs already processed plus the wni -> slug cache.
PROGRESS_FILE = os.path.join(DATA_DIR, "ktiv_scrape_progress.json")
# hebstyle=vl requests the vowelless (ktiv male) rendering.
COOKIES = {"translit": "none", "hebstyle": "vl"}
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; PealimScraper/1.0)"}
# Seconds to wait after each request.
DELAY = 1.5
# One shared HTTP session so cookies and headers apply to every request.
session = requests.Session()
session.cookies.update(COOKIES)
session.headers.update(HEADERS)
def load_json(path):
    """Return the parsed JSON stored at *path*, or ``{}`` if it does not exist."""
    if not os.path.exists(path):
        return {}
    with open(path, encoding="utf-8") as handle:
        return json.loads(handle.read())
def save_json(data, path):
    """Serialize *data* to *path* as UTF-8 JSON (indent 1, raw non-ASCII)."""
    with open(path, "w", encoding="utf-8") as handle:
        handle.write(json.dumps(data, ensure_ascii=False, indent=1))
def search_slug(wni):
    """Search pealim for *wni* and return the first dictionary slug found.

    The slug is taken from the first link on the results page whose href
    starts with ``/dict/<digits>-<name>/``. Returns ``None`` when there is no
    such link.
    """
    resp = session.get("https://www.pealim.com/search/", params={"q": wni}, timeout=15)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    slug_pattern = re.compile(r"/dict/(\d+-[^/]+)/")
    candidates = (slug_pattern.match(link["href"]) for link in soup.select("a[href]"))
    first_hit = next((m for m in candidates if m), None)
    return first_hit.group(1) if first_hit else None
def scrape_verb_forms(slug):
    """Fetch a verb's detail page and collect its ktiv male forms.

    Returns:
        ``(forms, word_nikkud)``: ``forms`` is the set of non-empty texts of
        every ``span.menukad`` on the page, and ``word_nikkud`` is the text of
        the ``span.menukad`` inside the page's ``h1``, or ``None`` if absent.
    """
    resp = session.get(f"https://www.pealim.com/dict/{slug}/", timeout=15)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    # The page-wide selection already includes the spans inside div.lead (the
    # infinitive), so a separate pass over the lead would add nothing new.
    texts = (span.get_text(strip=True) for span in soup.select("span.menukad"))
    forms = {text for text in texts if text}

    word_nikkud = None
    title = soup.select_one("h1")
    if title:
        title_span = title.select_one("span.menukad")
        if title_span:
            word_nikkud = title_span.get_text(strip=True)
    return forms, word_nikkud
def _merge_forms(base, additions):
    """Return a new lookup combining *base* with entries from *additions*.

    For a form already in *base*, an added entry is skipped when an entry
    with the same slug is already stored. Neither input is mutated, and the
    result shares no list with them.
    """
    merged = {form: list(entries) for form, entries in base.items()}
    for form, entries in additions.items():
        bucket = merged.setdefault(form, [])
        known_slugs = {e["slug"] for e in bucket}
        bucket.extend(e for e in entries if e["slug"] not in known_slugs)
    return merged


def main():
    """Scrape ktiv male forms for the input verbs and merge them into the output.

    The run is resumable: verbs recorded as done in the progress file are
    skipped. If a partial file from an interrupted run exists, it is used as
    the starting point, so forms gathered before the interruption are not
    lost. Progress and a merged snapshot are written after every 50th loop
    position; the partial file is removed once the final output is written.

    Verbs that are not found or that raise during scraping are counted as
    skipped and are also marked done, so they are not retried on resume.
    """
    verbs = load_json(INPUT_FILE)
    if not verbs:
        print("ERROR: No verbs found in input file")
        sys.exit(1)

    # Prefer the checkpoint snapshot: it already holds the previous output
    # merged with whatever an interrupted run had scraped.
    existing_forms = load_json(PARTIAL_FILE) or load_json(OUTPUT_FILE)
    new_forms = {}  # merged into existing_forms at checkpoints and at the end

    # Load progress to resume
    progress = load_json(PROGRESS_FILE)
    done_wnis = set(progress.get("done_wnis", []))
    slug_cache = progress.get("slug_cache", {})

    # Pre-populate slug cache from conjugations.json
    conj_file = os.path.join(DATA_DIR, "conjugations.json")
    if os.path.exists(conj_file):
        conj_data = load_json(conj_file)
        for wni_key, cdata in conj_data.items():
            if isinstance(cdata, dict) and "slug" in cdata and wni_key not in slug_cache:
                slug_cache[wni_key] = cdata["slug"]
        print(f"Pre-populated {len(slug_cache)} slugs from conjugations.json")

    # Deduplicate verbs by wni, keeping the first record of each.
    seen_wni = set()
    unique_verbs = []
    for v in verbs:
        if v["wni"] not in seen_wni:
            seen_wni.add(v["wni"])
            unique_verbs.append(v)

    total = len(unique_verbs)
    to_scrape = [v for v in unique_verbs if v["wni"] not in done_wnis]
    print(f"Total unique verbs: {total}, already done: {total - len(to_scrape)}, to scrape: {len(to_scrape)}")

    scraped_count = 0
    skipped_count = 0
    total_new_forms = 0
    sample_verbs = {}  # for the summary: wni -> sorted list of forms

    for i, verb in enumerate(to_scrape):
        wni = verb["wni"]
        word_nikkud_input = verb["word"]
        try:
            # Step 1: Find slug
            if wni in slug_cache:
                slug = slug_cache[wni]
            else:
                slug = search_slug(wni)
                time.sleep(DELAY)
                if not slug:
                    print(f" [{i + 1}/{len(to_scrape)}] SKIP {wni} - not found on pealim")
                    skipped_count += 1
                    done_wnis.add(wni)
                    continue
                slug_cache[wni] = slug

            # Step 2: Scrape forms. The vowelled form read from the page is
            # ignored; the one from the input data is stored instead.
            forms, _page_nikkud = scrape_verb_forms(slug)
            time.sleep(DELAY)

            for form in forms:
                bucket = new_forms.setdefault(form, [])
                # One entry per (form, slug).
                if not any(e["slug"] == slug for e in bucket):
                    bucket.append(
                        {
                            "word_nikkud": word_nikkud_input,
                            "form_type": "conjugation",
                            "pos": "Verb",
                            "slug": slug,
                        }
                    )
                    total_new_forms += 1
            scraped_count += 1

            # Collect samples (first 3 completed)
            if len(sample_verbs) < 3:
                sample_verbs[wni] = sorted(forms)
            print(f" [{i + 1}/{len(to_scrape)}] {wni} -> {slug} ({len(forms)} forms)")
            done_wnis.add(wni)
        except Exception as e:  # best effort: report, mark done, keep going
            print(f" [{i + 1}/{len(to_scrape)}] ERROR {wni}: {e}")
            skipped_count += 1
            done_wnis.add(wni)

        # Save progress every 50 verbs
        if (i + 1) % 50 == 0:
            progress = {"done_wnis": list(done_wnis), "slug_cache": slug_cache}
            save_json(progress, PROGRESS_FILE)
            save_json(_merge_forms(existing_forms, new_forms), PARTIAL_FILE)
            print(f" -- Progress saved at {i + 1}/{len(to_scrape)} --")

    # Final merge
    merged = _merge_forms(existing_forms, new_forms)
    save_json(merged, OUTPUT_FILE)

    # Save final progress
    progress = {"done_wnis": list(done_wnis), "slug_cache": slug_cache}
    save_json(progress, PROGRESS_FILE)

    # The snapshot is now redundant with the final output.
    if os.path.exists(PARTIAL_FILE):
        os.remove(PARTIAL_FILE)

    # Summary
    print(f"\n{'=' * 50}")
    print("SUMMARY")
    print(f"{'=' * 50}")
    print(f"Verbs scraped: {scraped_count}")
    print(f"Verbs skipped: {skipped_count}")
    print(f"New forms added: {total_new_forms}")
    print(f"Total unique ktiv male forms: {len(merged)}")
    print(f"Previous forms count: {len(existing_forms)}")
    print(f"Net new form keys: {len(merged) - len(existing_forms)}")
    if sample_verbs:
        print("\nSample verbs:")
        for wni, forms in list(sample_verbs.items())[:3]:
            print(f"\n {wni} ({len(forms)} forms):")
            for f in forms[:8]:
                print(f" {f}")
            if len(forms) > 8:
                print(f" ... and {len(forms) - 8} more")
if __name__ == "__main__":
main()

View file

@ -1,919 +0,0 @@
"""Standalone integrity validator for data/words.json.
Validates the unified Hebrew Flash Cards data against the schema defined in
SCHEMA.yaml. Each test prints PASS/FAIL with details on failures.
Usage:
python3 scripts/validate_data.py
python3 scripts/validate_data.py --verbose
python3 scripts/validate_data.py --test confusable_symmetric
"""
from __future__ import annotations
import argparse
import json
import re
import sys
import unicodedata
from pathlib import Path
from typing import Any
# ---------------------------------------------------------------------------
# Bootstrap: make project root importable so helpers.py is accessible
# ---------------------------------------------------------------------------
sys.path.insert(0, str(Path(__file__).parent.parent))
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Location of the unified data file: data/words.json under the parent of this script's directory.
DATA_FILE = Path(__file__).parent.parent / "data" / "words.json"
# Inclusive (low, high) code-point bounds consulted by _is_hebrew_consonant.
HEBREW_CONSONANT_RANGE = (0x05D0, 0x05EA)  # alef..tav
# Person codes accepted by test_conjugation_person_codes for conjugation forms.
VALID_PERSON_CODES: frozenset[str] = frozenset(
    ["inf", "1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp", "3p", "ms", "fs", "mp", "fp"]
)
# Single character class built from several emoji-related code-point ranges;
# test_no_emoji_in_meaning searches each entry's meaning with it.
EMOJI_RE = re.compile(
    r"[\U0001f600-\U0001f64f"
    r"\U0001f300-\U0001f5ff"
    r"\U0001f680-\U0001f6ff"
    r"\U0001f1e0-\U0001f1ff"
    r"\U00002702-\U000027b0"
    r"\U0001f900-\U0001f9ff"
    r"\U0001fa00-\U0001fa6f"
    r"\U0001fa70-\U0001faff]"
)
# ---------------------------------------------------------------------------
# Result tracking
# ---------------------------------------------------------------------------
# Names of tests that failed; appended to by _fail and read by main for the exit status.
_failures: list[str] = []
# Individual warning detail lines; extended by _warn and counted by main in the summary.
_warnings: list[str] = []
# When True, reports print every detail instead of the first 20; set by main from --verbose.
_verbose: bool = False
def _pass(name: str) -> None:
    """Print the PASS line for the test called *name*."""
    line = f" PASS {name}"
    print(line)
def _fail(name: str, details: list[str]) -> None:
    """Record *name* as a failed test and print its FAIL line plus every detail.

    Args:
        name: Test name; appended to the module-level ``_failures`` list.
        details: Lines printed beneath the FAIL header, in order.
    """
    # In-place append on the module-level list; no ``global`` statement is
    # needed because the name is never rebound.
    _failures.append(name)
    print(f" FAIL {name}")
    for d in details:
        print(f" {d}")
def _warn(name: str, details: list[str]) -> None:
    """Record warning *details* and print a WARN line for *name* plus every detail.

    Args:
        name: Label printed on the WARN header line.
        details: Warning lines; each one is added to the module-level
            ``_warnings`` list and printed beneath the header.
    """
    # In-place extend on the module-level list; the name is never rebound.
    _warnings.extend(details)
    print(f" WARN {name}")
    for d in details:
        print(f" {d}")
def _verbose_print(msg: str) -> None:
    """Print *msg* only when the module-level ``_verbose`` flag is set."""
    if not _verbose:
        return
    print(f" {msg}")
# ---------------------------------------------------------------------------
# Helper: load data
# ---------------------------------------------------------------------------
def load_data() -> dict[str, Any]:
    """Parse ``DATA_FILE`` as JSON and return the result.

    Returns:
        The decoded content of the data file.

    Exits the process with status 2 (after printing an error line) when the
    data file does not exist.
    """
    if DATA_FILE.exists():
        with DATA_FILE.open(encoding="utf-8") as fh:
            parsed = json.load(fh)
        return parsed
    print(f"ERROR: data file not found: {DATA_FILE}")
    sys.exit(2)
def _is_hebrew_consonant(ch: str) -> bool:
    """Return True if *ch* starts with a Hebrew consonant (U+05D0..U+05EA).

    Multi-codepoint strings such as a shin followed by a shin dot are
    accepted: the string is NFD-decomposed and only its first code point
    (the base character) is compared against ``HEBREW_CONSONANT_RANGE``.

    Args:
        ch: The string to classify. An empty string is not a consonant.

    Returns:
        True when the first decomposed code point lies inside the inclusive
        range, False otherwise (including for the empty string).
    """
    if not ch:
        # Nothing to index; previously this raised IndexError.
        return False
    # After NFD the base character comes first; combining marks follow it.
    base = unicodedata.normalize("NFD", ch)[0]
    low, high = HEBREW_CONSONANT_RANGE
    return low <= ord(base) <= high
# ---------------------------------------------------------------------------
# Individual tests
# ---------------------------------------------------------------------------
def test_required_fields(data: dict[str, Any]) -> None:
    """Check that every entry carries its mandatory fields.

    An entry must have a dict ``word`` with non-empty ``nikkud`` and
    ``ktiv_male``, plus non-empty ``slug``, ``pos`` and ``meaning``.
    A missing/null ``frequency`` is only reported as a warning (under the
    name ``frequency_missing``) and never fails the test.
    """
    name = "required_fields"
    errors: list[str] = []
    warn_details: list[str] = []

    for key, entry in data.items():
        word = entry.get("word")
        if isinstance(word, dict):
            if not word.get("nikkud"):
                errors.append(f"[{key}] word.nikkud is missing or empty")
            if not word.get("ktiv_male"):
                errors.append(f"[{key}] word.ktiv_male is missing or empty")
        else:
            errors.append(f"[{key}] 'word' is missing or not a dict")

        if not entry.get("slug"):
            errors.append(f"[{key}] 'slug' is missing or empty")
        if not entry.get("pos"):
            errors.append(f"[{key}] 'pos' is missing or empty")
        if not entry.get("meaning"):
            errors.append(f"[{key}] 'meaning' is missing or empty")

        if entry.get("frequency") is None:
            warn_details.append(f"[{key}] 'frequency' is null/missing")

    # Warnings are reported first and do not influence pass/fail.
    if warn_details:
        _warn("frequency_missing", warn_details if _verbose else warn_details[:20])
        if not _verbose and len(warn_details) > 20:
            print(f" ... ({len(warn_details) - 20} more; use --verbose)")

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_root_format(data: dict[str, Any]) -> None:
    """Check that ``root`` is an empty list or a list of 2-5 Hebrew consonants.

    A missing/None ``root`` is an error (rootless words must use ``[]``).
    At most one error is reported per entry.
    """
    name = "root_format"
    errors: list[str] = []

    for key, entry in data.items():
        root = entry.get("root")
        if root is None:
            errors.append(f"[{key}] 'root' key is absent (should be [] for rootless words)")
        elif not isinstance(root, list):
            errors.append(f"[{key}] 'root' is not a list: {root!r}")
        elif root and not (2 <= len(root) <= 5):
            errors.append(f"[{key}] root has {len(root)} elements (expected 2-5): {root!r}")
        else:
            # Either an empty list (rootless word, nothing to iterate) or a
            # list of acceptable length whose elements still need checking.
            for ch in root:
                # An element may be multi-codepoint (consonant + combining mark);
                # _is_hebrew_consonant inspects the base character only.
                if isinstance(ch, str) and ch and _is_hebrew_consonant(ch):
                    continue
                errors.append(f"[{key}] root char {ch!r} is not a Hebrew consonant (U+05D0..U+05EA)")
                break

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_unique_slugs(data: dict[str, Any]) -> None:
    """Check that no non-empty ``slug`` value is used by more than one entry."""
    name = "unique_slugs"

    # slug -> keys of all entries carrying it (entries without a slug are ignored).
    seen: dict[str, list[str]] = {}
    for key, entry in data.items():
        slug = entry.get("slug")
        if not slug:
            continue
        seen.setdefault(slug, []).append(key)

    errors = [f"slug={slug!r} shared by: {keys}" for slug, keys in seen.items() if len(keys) > 1]

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_no_duplicate_keys(_data: dict[str, Any]) -> None:  # noqa: ARG001
    """Check that the raw JSON file contains no repeated object keys.

    ``json.load`` silently keeps the last value when a key is repeated, so the
    already-parsed ``_data`` cannot reveal duplicates and is ignored. Instead
    ``DATA_FILE`` is re-read with an ``object_pairs_hook`` that records every
    key seen twice within one JSON object. The hook is passed to ``json.load``
    as-is, so it applies to each object the decoder builds, not only the
    top-level one.
    """
    name = "no_duplicate_keys"
    duplicates: list[str] = []

    def _collect(pairs: list[tuple[str, Any]]) -> dict[str, Any]:
        """Rebuild one JSON object, noting any key that appears more than once."""
        rebuilt: dict[str, Any] = {}
        for k, v in pairs:
            if k in rebuilt:
                duplicates.append(k)
            rebuilt[k] = v
        return rebuilt

    with DATA_FILE.open(encoding="utf-8") as fh:
        json.load(fh, object_pairs_hook=_collect)

    if not duplicates:
        _pass(name)
        return
    _fail(name, [f"duplicate key: {k!r}" for k in duplicates])
def test_confusable_symmetric(data: dict[str, Any]) -> None:
    """Check that ``confusable_group`` references exist and are reciprocal.

    For every key B listed in entry A's ``confusable_group``: B must be a
    top-level key, and B's own ``confusable_group`` must contain A.
    """
    name = "confusable_symmetric"
    errors: list[str] = []

    for key, entry in data.items():
        # A missing/empty group simply yields no iterations.
        for other_key in entry.get("confusable_group") or ():
            other = data.get(other_key)
            if other is None:
                errors.append(f"[{key}] confusable_group references non-existent key {other_key!r}")
            elif key not in (other.get("confusable_group") or []):
                errors.append(f"[{key}] lists {other_key!r} as confusable, but {other_key!r} does not list {key!r}")

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_shared_roots_valid_keys(data: dict[str, Any]) -> None:
    """Check that every key listed in an entry's ``shared_roots`` is a top-level key."""
    name = "shared_roots_valid_keys"
    errors: list[str] = []

    for key, entry in data.items():
        # A missing/empty shared_roots yields no iterations.
        for ref_key in entry.get("shared_roots") or ():
            if ref_key in data:
                continue
            errors.append(f"[{key}] shared_roots references non-existent key {ref_key!r}")

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_unique_legacy_guids(data: dict[str, Any]) -> None:
    """Check that no two entries share a ``vocab_legacy_guid`` (null/empty ignored).

    Exception: when all entries sharing a GUID also share the same
    ``word.nikkud`` value, the duplicate is tolerated - it is only mentioned
    in verbose output. Such entries inherited the same legacy card (PoS
    homographs), a known artefact of legacy GUIDs having been generated from
    the nikkud word alone.
    """
    name = "unique_legacy_guids"

    # guid -> keys of all entries carrying it.
    seen: dict[str, list[str]] = {}
    for key, entry in data.items():
        guid = entry.get("vocab_legacy_guid")
        if not guid:
            continue
        seen.setdefault(guid, []).append(key)

    errors: list[str] = []
    for guid, keys in seen.items():
        if len(keys) < 2:
            continue
        nikkud_values = {(data[k].get("word") or {}).get("nikkud") for k in keys}
        if len(nikkud_values) != 1:
            errors.append(f"guid={guid!r} shared by entries with DIFFERENT nikkud: {keys}")
            continue
        # Same nikkud everywhere: tolerated, surfaced only with --verbose.
        _verbose_print(
            f"GUID {guid!r} shared by {len(keys)} entries with same nikkud ({next(iter(nikkud_values))!r}): {keys}"
        )

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_no_noun_inflection_on_non_nouns(data: dict[str, Any]) -> None:
    """Check that ``noun_inflection`` is None unless ``pos`` starts with 'Noun'.

    For example, an adjective entry must not carry ``noun_inflection``.
    """
    name = "no_noun_inflection_on_non_nouns"
    errors: list[str] = []

    for key, entry in data.items():
        pos = entry.get("pos") or ""
        if pos.startswith("Noun"):
            continue
        if entry.get("noun_inflection") is None:
            continue
        errors.append(f"[{key}] pos={pos!r} but noun_inflection is set")
        _verbose_print(f"offending entry: {key!r}")

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_no_emoji_in_meaning(data: dict[str, Any]) -> None:
    """Check that no entry's ``meaning`` contains a character matched by ``EMOJI_RE``."""
    name = "no_emoji_in_meaning"
    errors: list[str] = []

    for key, entry in data.items():
        meaning = entry.get("meaning") or ""
        if EMOJI_RE.search(meaning) is None:
            continue
        errors.append(f"[{key}] meaning contains emoji: {meaning!r}")

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_example_sentences_contain_word(data: dict[str, Any]) -> None:
    """Warn when an entry's ``word.nikkud`` appears in none of its vetted sentences.

    Matching is an exact substring test on the nikkud form (no stripping).
    Entries without vetted examples or without a nikkud word are skipped.
    Findings are reported as warnings only; the test always prints PASS.
    """
    name = "example_sentences_contain_word"
    errors: list[str] = []

    for key, entry in data.items():
        vetted = (entry.get("examples") or {}).get("vetted")
        if not vetted:
            continue
        nikkud_word = (entry.get("word") or {}).get("nikkud") or ""
        if not nikkud_word:
            continue
        if any(nikkud_word in (s.get("text") or "") for s in vetted):
            continue
        # Show at most the first two sentences to keep the line readable.
        sentences_preview = [s.get("text", "") for s in vetted[:2]]
        errors.append(
            f"[{key}] word {nikkud_word!r} not found in any vetted sentence. Sentences: {sentences_preview!r}"
        )

    if errors:
        _warn(name, errors if _verbose else errors[:20])
        if not _verbose and len(errors) > 20:
            print(f" ... ({len(errors) - 20} more; use --verbose)")
    # Warn-only check: PASS is printed regardless of findings.
    _pass(name)
def test_cloze_offsets_valid(data: dict[str, Any]) -> None:
    """Check that cloze offsets, when present, describe a valid span of the text.

    For each entry with ``examples.cloze``, ``cloze_word_start`` and
    ``cloze_word_end`` must be integers with ``0 <= start < end <= len(text)``.
    Null offsets are tolerated and reported separately as a warning (under
    ``cloze_offsets_valid_null_offsets``). At most one error is reported per
    entry.
    """
    name = "cloze_offsets_valid"
    errors: list[str] = []
    null_warn: list[str] = []

    for key, entry in data.items():
        cloze = (entry.get("examples") or {}).get("cloze")
        if not cloze:
            continue
        text = cloze.get("text") or ""
        start = cloze.get("cloze_word_start")
        end = cloze.get("cloze_word_end")

        if start is None or end is None:
            null_warn.append(f"[{key}] cloze present but cloze_word_start/end are null")
            continue

        text_len = len(text)
        if not (isinstance(start, int) and isinstance(end, int)):
            errors.append(f"[{key}] cloze_word_start/end are not integers: {start!r}, {end!r}")
        elif start < 0 or end < 0:
            errors.append(f"[{key}] cloze offsets are negative: start={start}, end={end}")
        elif start >= end:
            errors.append(f"[{key}] cloze start >= end: start={start}, end={end}")
        elif end > text_len:
            errors.append(f"[{key}] cloze end={end} exceeds text length={text_len}: {text!r}")

    if null_warn:
        _warn(f"{name}_null_offsets", null_warn if _verbose else null_warn[:20])
        if not _verbose and len(null_warn) > 20:
            print(f" ... ({len(null_warn) - 20} more; use --verbose)")

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_hufal_pual_only_on_hifil_piel(data: dict[str, Any]) -> None:
    """Check that ``hufal_pual_forms`` is set only for Hif'il or Pi'el verbs.

    The binyan is matched case-insensitively by substring: it must contain
    ``"hif"`` or ``"pi"``.
    """
    name = "hufal_pual_only_on_hifil_piel"
    errors: list[str] = []

    for key, entry in data.items():
        conj = entry.get("conjugation") or {}
        if conj.get("hufal_pual_forms") is None:
            continue
        binyan = conj.get("binyan") or ""
        binyan_lower = binyan.lower()
        if any(marker in binyan_lower for marker in ("hif", "pi")):
            continue
        errors.append(f"[{key}] hufal_pual_forms is set but binyan={binyan!r} (expected Hif'il or Pi'el)")

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_confusable_group_shares_ktiv_male(data: dict[str, Any]) -> None:
    """Check that an entry and its ``confusable_group`` members share ``word.ktiv_male``.

    Entries without a group or without their own ``ktiv_male`` are skipped, as
    are group members that do not exist or have no ``ktiv_male``.
    """
    name = "confusable_group_shares_ktiv_male"
    errors: list[str] = []

    for key, entry in data.items():
        group = entry.get("confusable_group")
        if not group:
            continue
        my_ktiv = (entry.get("word") or {}).get("ktiv_male")
        if not my_ktiv:
            continue
        for other_key in group:
            other = data.get(other_key)
            if not other:
                # Dangling references are reported by confusable_symmetric.
                continue
            other_ktiv = (other.get("word") or {}).get("ktiv_male")
            if not other_ktiv or other_ktiv == my_ktiv:
                continue
            errors.append(
                f"[{key}] ktiv_male={my_ktiv!r} but confusable member {other_key!r} has ktiv_male={other_ktiv!r}"
            )

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_confusables_guid(data: dict[str, Any]) -> None:
    """Check that ``confusables_guid`` is consistent with ``confusable_group``.

    Rules:
        - An entry with a non-empty ``confusable_group`` must have a
          ``confusables_guid``.
        - An entry without a ``confusable_group`` must have a None
          ``confusables_guid``.
        - Every existing member listed in an entry's group must carry the
          same ``confusables_guid`` as the entry.
    """
    name = "confusables_guid"
    errors: list[str] = []

    for key, entry in data.items():
        group = entry.get("confusable_group")
        guid = entry.get("confusables_guid")

        if group and guid:
            # Both present: compare against every member that exists.
            for other_key in group:
                other = data.get(other_key)
                if not other:
                    # Dangling references are reported by confusable_symmetric.
                    continue
                other_guid = other.get("confusables_guid")
                if other_guid != guid:
                    errors.append(
                        f"[{key}] confusables_guid={guid!r} but confusable member "
                        f"{other_key!r} has confusables_guid={other_guid!r}"
                    )
        elif group:
            errors.append(f"[{key}] has confusable_group but confusables_guid is null/missing")
        elif guid is not None:
            errors.append(f"[{key}] has confusables_guid={guid!r} but confusable_group is null")

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_conjugation_form_guids(data: dict[str, Any]) -> None:
    """Check GUID presence and per-verb uniqueness for conjugation forms.

    Rules:
        - Each form in ``active_forms`` and ``hufal_pual_forms`` should have a
          ``guid`` or a non-empty ``guid_candidates`` list. A form with
          neither is only counted towards a single summary warning.
        - Within one verb (across both form lists) no GUID may repeat. When a
          form has a ``guid``, only that value is checked; its
          ``guid_candidates`` are checked only when ``guid`` is absent.
    """
    name = "conjugation_form_guids"
    errors: list[str] = []
    warnings: list[str] = []

    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj:
            continue
        # guid -> label of the form that first used it, e.g. "active_forms[3ms]".
        seen_guids: dict[str, str] = {}
        for form_list_key in ("active_forms", "hufal_pual_forms"):
            for form in conj.get(form_list_key) or ():
                person = form.get("person", "?")
                label = f"{form_list_key}[{person}]"
                guid = form.get("guid")
                guid_candidates = form.get("guid_candidates")

                if guid:
                    if guid in seen_guids:
                        errors.append(f"[{key}] {label}: guid={guid!r} duplicates {seen_guids[guid]}")
                    else:
                        seen_guids[guid] = label
                elif guid_candidates:
                    for candidate in guid_candidates:
                        if candidate in seen_guids:
                            errors.append(
                                f"[{key}] {label}: guid_candidate={candidate!r} duplicates {seen_guids[candidate]}"
                            )
                        else:
                            seen_guids[candidate] = label
                else:
                    # Neither guid nor candidates: warn, don't fail.
                    warnings.append(f"[{key}] {label}: missing or null 'guid' and no 'guid_candidates'")

    if warnings:
        _warn(name + "_missing", [f"{len(warnings)} forms missing guid (deterministic fallback used)"])

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_conjugation_person_codes(data: dict[str, Any]) -> None:
    """Check that every form's ``person`` code is in ``VALID_PERSON_CODES``.

    Both ``active_forms`` and ``hufal_pual_forms`` are inspected.
    """
    name = "conjugation_person_codes"
    errors: list[str] = []

    for key, entry in data.items():
        conj = entry.get("conjugation") or {}
        for form_list_key in ("active_forms", "hufal_pual_forms"):
            for form in conj.get(form_list_key) or ():
                person = form.get("person")
                if person in VALID_PERSON_CODES:
                    continue
                errors.append(f"[{key}] {form_list_key}: invalid person code {person!r}")

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_no_stripped_form_sentence_collisions(data: dict[str, Any]) -> None:
    """Warn when an entry's vetted sentences contain a confusable member's nikkud form.

    For each entry with a ``confusable_group`` and vetted examples, every
    existing group member whose ``word.nikkud`` differs from the entry's own
    is checked: its nikkud form must not occur in any of the entry's vetted
    sentence texts. Findings are reported as warnings only; the test always
    prints PASS.
    """
    name = "no_stripped_form_sentence_collisions"
    errors: list[str] = []

    for key, entry in data.items():
        group = entry.get("confusable_group")
        if not group:
            continue
        vetted = (entry.get("examples") or {}).get("vetted")
        if not vetted:
            continue
        my_nikkud = (entry.get("word") or {}).get("nikkud") or ""
        my_texts = [s.get("text") or "" for s in vetted]

        for other_key in group:
            other = data.get(other_key)
            if not other:
                continue
            other_nikkud = (other.get("word") or {}).get("nikkud") or ""
            if not other_nikkud or other_nikkud == my_nikkud:
                # Identical nikkud cannot be told apart by nikkud matching.
                continue
            # Only the first offending sentence is reported per (key, other_key) pair.
            text = next((t for t in my_texts if other_nikkud in t), None)
            if text is None:
                continue
            errors.append(f"[{key}] sentence contains wrong homograph {other_nikkud!r}: {text!r}")
            _verbose_print(f" my word: {my_nikkud!r}, wrong form: {other_nikkud!r}")

    if errors:
        _warn(name, errors if _verbose else errors[:20])
        if not _verbose and len(errors) > 20:
            print(f" ... ({len(errors) - 20} more; use --verbose)")
    # Warn-only check: PASS is printed regardless of findings.
    _pass(name)
def test_no_shared_confusable_examples(data: dict[str, Any]) -> None:
    """Check that no two members of a confusable group have identical vetted texts.

    Entries are bucketed by group identity; within each bucket, two members
    whose sets of vetted sentence texts are equal (and non-empty) produce an
    error. Members with no vetted texts are ignored.

    The group identity is the sorted tuple of the entry's own key together
    with the keys in its ``confusable_group``. Including the entry's own key
    makes all members of one group derive the same identity whether or not
    each member's list mentions itself; without it, lists that name only the
    *other* members would give every member a different identity and the
    check could never fire.
    """
    from collections import defaultdict

    name = "no_shared_confusable_examples"
    errors: list[str] = []

    # group identity -> keys of the entries belonging to that group.
    group_map: dict[tuple[str, ...], list[str]] = defaultdict(list)
    for key, entry in data.items():
        cg = entry.get("confusable_group")
        if not cg:
            continue
        group_id = tuple(sorted({key, *cg}))
        group_map[group_id].append(key)

    for members in group_map.values():
        if len(members) < 2:
            continue
        # text set -> first member seen with exactly that set.
        seen: dict[frozenset[str], str] = {}
        for key in members:
            vetted = (data[key].get("examples") or {}).get("vetted") or []
            texts = frozenset(e.get("text", "") for e in vetted)
            if not texts:
                continue
            if texts not in seen:
                seen[texts] = key
                continue
            # Truncate meanings to keep the report line short.
            meaning_a = (data[seen[texts]].get("meaning") or "")[:30]
            meaning_b = (data[key].get("meaning") or "")[:30]
            errors.append(
                f"{seen[texts]} ({meaning_a}) and {key} ({meaning_b}) share {len(texts)} identical example(s)"
            )

    if errors:
        _fail(name, errors[:20] if not _verbose else errors)
        if len(errors) > 20 and not _verbose:
            print(f" ... ({len(errors) - 20} more; use --verbose)")
    else:
        _pass(name)
def test_no_hebrew_in_meaning(data: dict[str, Any]) -> None:
    """Check that no Hebrew letter remains in ``meaning`` after cleaning.

    Each meaning is cleaned by removing runs that start with a character in
    U+0590..U+05FF, collapsing repeated whitespace, and stripping leading or
    trailing ``, ;:`` characters; the result is then searched for a letter in
    U+05D0..U+05EA.

    NOTE(review): the first substitution removes every U+0590..U+05FF
    character, which includes the whole U+05D0..U+05EA range searched
    afterwards, so this check appears unable to fail as written - confirm
    the intended behaviour.
    """
    name = "no_hebrew_in_meaning"
    errors: list[str] = []
    hebrew_re = re.compile(r"[\u05D0-\u05EA]")
    # Compiled once instead of on every iteration; same patterns as before.
    hebrew_run_re = re.compile(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*")
    multi_space_re = re.compile(r"\s{2,}")

    for key, entry in data.items():
        meaning = entry.get("meaning") or ""
        without_hebrew = hebrew_run_re.sub("", meaning)
        cleaned = multi_space_re.sub(" ", without_hebrew).strip(", ;:")
        if hebrew_re.search(cleaned) is None:
            continue
        errors.append(f"[{key}] meaning still contains Hebrew after cleaning: {cleaned!r}")

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_mishkal_consistency(data: dict[str, Any]) -> None:
    """Check that ``mishkal_hebrew`` agrees with ``mishkal`` for inflection data.

    For ``noun_inflection`` and ``adjective_inflection`` of every entry:
        - when both values are present, ``_mishkal_to_hebrew(mishkal)`` (if it
          yields a non-empty result) must equal ``mishkal_hebrew``;
        - ``mishkal_hebrew`` must not be present without ``mishkal``.

    If ``pealim_detail_scrape._mishkal_to_hebrew`` cannot be imported, a
    warning is emitted and the check is skipped (neither PASS nor FAIL).
    """
    name = "mishkal_consistency"
    errors: list[str] = []
    try:
        from pealim_detail_scrape import _mishkal_to_hebrew
    except ImportError:
        _warn(name, ["Could not import _mishkal_to_hebrew — skipping"])
        return

    for key, entry in data.items():
        for infl_key in ("noun_inflection", "adjective_inflection"):
            infl = entry.get(infl_key)
            if not infl:
                continue
            mishkal_eng = infl.get("mishkal") or ""
            mishkal_heb = infl.get("mishkal_hebrew") or ""
            if mishkal_eng and mishkal_heb:
                expected = _mishkal_to_hebrew(mishkal_eng) or ""
                if expected and expected != mishkal_heb:
                    # Separator added: the two values used to be printed
                    # back-to-back, which made the message unreadable.
                    errors.append(f"[{key}] {infl_key}: {mishkal_eng} -> {mishkal_heb} (expected {expected})")
            if mishkal_heb and not mishkal_eng:
                errors.append(f"[{key}] {infl_key}: has mishkal_hebrew but no mishkal")

    if errors:
        _fail(name, errors[:20] if not _verbose else errors)
        if len(errors) > 20 and not _verbose:
            print(f" ... ({len(errors) - 20} more; use --verbose)")
    else:
        _pass(name)
# ---------------------------------------------------------------------------
# Stats summary
# ---------------------------------------------------------------------------
def print_stats(data: dict[str, Any]) -> None:
    """Print a summary of dataset coverage metrics.

    Counts how many entries have a truthy value for each tracked field and
    prints one right-aligned count per line beneath a "Stats Summary" header.

    Args:
        data: The parsed words data, keyed by entry key; only the values are
            inspected.
    """
    entries = list(data.values())
    total = len(data)
    with_conj = sum(1 for e in entries if e.get("conjugation"))
    with_noun_inf = sum(1 for e in entries if e.get("noun_inflection"))
    with_vetted = sum(1 for e in entries if (e.get("examples") or {}).get("vetted"))
    with_cloze = sum(1 for e in entries if (e.get("examples") or {}).get("cloze"))
    with_image = sum(1 for e in entries if e.get("image"))
    with_emoji = sum(1 for e in entries if e.get("emoji"))
    with_guid = sum(1 for e in entries if e.get("vocab_legacy_guid"))
    in_confusable = sum(1 for e in entries if e.get("confusable_group"))
    with_shared_roots = sum(1 for e in entries if e.get("shared_roots"))
    # An entry counts once if either its noun or its adjective inflection has a mishkal.
    with_mishkal = sum(
        1
        for e in entries
        if (e.get("noun_inflection") or {}).get("mishkal") or (e.get("adjective_inflection") or {}).get("mishkal")
    )
    print()
    print("Stats Summary")
    # Horizontal rule under the header. The previous `"" * 42` always
    # evaluated to an empty string, so no rule was ever printed.
    print("-" * 42)
    print(f" Total entries: {total:>6}")
    print(f" With conjugation data: {with_conj:>6}")
    print(f" With noun_inflection: {with_noun_inf:>6}")
    print(f" With mishkal: {with_mishkal:>6}")
    print(f" With vetted examples: {with_vetted:>6}")
    print(f" With cloze examples: {with_cloze:>6}")
    print(f" With images: {with_image:>6}")
    print(f" With emoji: {with_emoji:>6}")
    print(f" With legacy GUIDs: {with_guid:>6}")
    print(f" In confusable groups: {in_confusable:>6}")
    print(f" With shared roots: {with_shared_roots:>6}")
# ---------------------------------------------------------------------------
# Test registry
# ---------------------------------------------------------------------------
# Registry mapping each test's command-line name (as accepted by --test) to its
# function. main() runs the selected functions in this insertion order, calling
# each one with the parsed data dict.
ALL_TESTS: dict[str, Any] = {
    "required_fields": test_required_fields,
    "root_format": test_root_format,
    "unique_slugs": test_unique_slugs,
    "no_duplicate_keys": test_no_duplicate_keys,
    "confusable_symmetric": test_confusable_symmetric,
    "shared_roots_valid_keys": test_shared_roots_valid_keys,
    "unique_legacy_guids": test_unique_legacy_guids,
    "no_noun_inflection_on_non_nouns": test_no_noun_inflection_on_non_nouns,
    "no_emoji_in_meaning": test_no_emoji_in_meaning,
    "example_sentences_contain_word": test_example_sentences_contain_word,
    "cloze_offsets_valid": test_cloze_offsets_valid,
    "hufal_pual_only_on_hifil_piel": test_hufal_pual_only_on_hifil_piel,
    "confusable_group_shares_ktiv_male": test_confusable_group_shares_ktiv_male,
    "confusables_guid": test_confusables_guid,
    "conjugation_form_guids": test_conjugation_form_guids,
    "conjugation_person_codes": test_conjugation_person_codes,
    "no_stripped_form_sentence_collisions": test_no_stripped_form_sentence_collisions,
    "no_shared_confusable_examples": test_no_shared_confusable_examples,
    "no_hebrew_in_meaning": test_no_hebrew_in_meaning,
    "mishkal_consistency": test_mishkal_consistency,
}
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main() -> None:
    """Parse command-line options, run the selected tests, and exit.

    Options:
        --verbose / -v: print all failure details instead of the first 20.
        --test NAME: run only the named test from ``ALL_TESTS``.

    Exit status: 0 when no test failed, 1 when at least one test failed,
    2 for an unknown ``--test`` name (``load_data`` also exits with 2 when
    the data file is missing). The stats summary is printed only when the
    full suite is run.
    """
    global _verbose
    parser = argparse.ArgumentParser(description="Validate data/words.json against the Hebrew Flash Cards schema.")
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Print full details for all failures (not just first 20).",
    )
    parser.add_argument(
        "--test",
        metavar="NAME",
        help=f"Run a single test by name. Available: {', '.join(ALL_TESTS)}",
    )
    args = parser.parse_args()
    _verbose = args.verbose
    data = load_data()

    # Select tests to run
    if args.test:
        if args.test not in ALL_TESTS:
            print(f"ERROR: unknown test {args.test!r}. Available: {', '.join(ALL_TESTS)}")
            sys.exit(2)
        tests_to_run = {args.test: ALL_TESTS[args.test]}
    else:
        tests_to_run = ALL_TESTS

    print(f"Validating {DATA_FILE} ({len(data)} entries)")
    # Horizontal rule. The previous `"" * 60` always evaluated to an empty
    # string, so only a blank line was printed.
    print("-" * 60)

    # Every test receives the parsed dict; no_duplicate_keys ignores it and
    # re-reads the raw file itself.
    for test_fn in tests_to_run.values():
        test_fn(data)

    # Summary
    if not args.test:
        print_stats(data)
    print()
    print("-" * 60)
    if _warnings:
        print(f" Warnings : {len(_warnings)}")
    if _failures:
        print(f" FAILED: {len(_failures)} test(s): {', '.join(_failures)}")
        sys.exit(1)
    else:
        print(f" All {len(tests_to_run)} test(s) passed.")
        sys.exit(0)


if __name__ == "__main__":
    main()

View file

@ -1,198 +0,0 @@
"""Sentence difficulty scoring by context-word frequency.
Scores sentences by the median frequency rank of context words
(excluding the cloze target). Lower score = easier sentence.
Used by epub_examples.py to select the best cloze sentence.
"""
from __future__ import annotations
from statistics import median
import helpers
import nikkud_to_ktiv_male
# Rank returned when a token cannot be resolved to any frequency entry, and by
# score_sentence when a sentence has no scoreable context tokens.
DEFAULT_RANK = 50_000
# Hebrew prefix consonants for ktiv_male prefix stripping (tier 5)
_KM_PREFIX_CHARS = set("בהוכלמשע")
# Punctuation to strip from the edges of tokens
_PUNCT = set('.,!?;:"\'"״׳–—()[]{}')
# Maqaf (Hebrew hyphen) — splits tokens
_MAQAF = "־"
def build_nikkud_map(words: dict) -> dict[str, str]:
    """Build a nikkud -> ktiv_male lookup from the words data.

    Every ``{"nikkud": ..., "ktiv_male": ...}`` pair found in the following
    places is indexed, provided both values are non-empty: the headword,
    conjugation forms (active, hufal/pual, infinitive, reference form), noun
    inflections (singular, plural, both construct forms, pronominal suffixes)
    and adjective inflections (ms/fs/mp/fp). Noun forms ending in a maqaf are
    additionally indexed without the trailing maqaf.

    Args:
        words: The full words dict keyed by unique key; only values are read.

    Returns:
        Dict mapping nikkud form to ktiv_male string. On collisions the value
        written last wins.
    """
    nmap: dict[str, str] = {}

    def _add_pair(pair: dict) -> None:
        """Index one nikkud/ktiv_male pair when both parts are non-empty."""
        nikkud = pair.get("nikkud")
        ktiv_male = pair.get("ktiv_male")
        if nikkud and ktiv_male:
            nmap[nikkud] = ktiv_male

    for entry in words.values():
        # Headword
        _add_pair(entry.get("word") or {})

        # Conjugation: the two form lists first, then the two single forms.
        conj = entry.get("conjugation") or {}
        for list_key in ("active_forms", "hufal_pual_forms"):
            for form_entry in conj.get(list_key) or []:
                _add_pair(form_entry.get("form") or {})
        for single_key in ("infinitive", "reference_form"):
            _add_pair(conj.get(single_key) or {})

        # Noun inflection forms
        noun = entry.get("noun_inflection") or {}
        for field in ("singular", "plural", "construct_singular", "construct_plural"):
            sub = noun.get(field) or {}
            _add_pair(sub)
            nikkud_form = sub.get("nikkud")
            ktiv = sub.get("ktiv_male")
            # Also index the form without its trailing maqaf.
            if nikkud_form and ktiv and nikkud_form.endswith("־"):
                bare = nikkud_form[:-1]
                if bare:
                    nmap[bare] = ktiv
        for sub in (noun.get("pronominal_suffixes") or {}).values():
            if isinstance(sub, dict):
                _add_pair(sub)

        # Adjective inflection forms
        adj = entry.get("adjective_inflection") or {}
        for field in ("ms", "fs", "mp", "fp"):
            _add_pair(adj.get(field) or {})

    return nmap
def _resolve_token_frequency(
    token: str,
    nikkud_map: dict[str, str],
    nikkud_index: dict,
    freq_data: dict[str, int],
) -> int:
    """Resolve a nikkud sentence token to its frequency rank.

    Five tiers are tried in order; the first one that yields a key present in
    ``freq_data`` wins:
        1. Direct lookup of the token in ``nikkud_map``.
        2. ``epub_examples.try_strip_prefix`` on the token, resolving each
           matched remainder through ``nikkud_map``.
        3. ``nikkud_to_ktiv_male.convert`` applied to the token.
        4. ``helpers.strip_nikkud`` applied to the token (skipped when it
           equals the tier-3 result).
        5. Dropping a 1- or 2-character prefix made only of
           ``_KM_PREFIX_CHARS`` from the tier-3 and tier-4 forms.

    Args:
        token: The sentence token, with nikkud.
        nikkud_map: nikkud -> ktiv_male mapping.
        nikkud_index: Index handed through to ``try_strip_prefix``.
        freq_data: Mapping from written form to frequency rank.

    Returns:
        The rank found in ``freq_data``, or ``DEFAULT_RANK`` if no tier matches.
    """
    # Tier 1: direct lookup
    direct = nikkud_map.get(token)
    if direct and direct in freq_data:
        return freq_data[direct]

    # Tier 2: nikkud prefix stripping. Imported here, after tier 1, exactly as
    # in the original control flow, so a tier-1 hit never triggers the import.
    from epub_examples import try_strip_prefix

    for _unique_key, _match_type, matched_remainder in try_strip_prefix(token, nikkud_index):
        remainder_ktiv = nikkud_map.get(matched_remainder)
        if remainder_ktiv and remainder_ktiv in freq_data:
            return freq_data[remainder_ktiv]

    # Tier 3: rules-based converter
    converted = nikkud_to_ktiv_male.convert(token)
    if converted in freq_data:
        return freq_data[converted]

    # Tier 4: plain nikkud stripping (pointless if it equals the tier-3 form)
    stripped = helpers.strip_nikkud(token)
    if stripped != converted and stripped in freq_data:
        return freq_data[stripped]

    # Tier 5: strip a short prefix from either derived form
    for form in (converted, stripped):
        for prefix_len in (1, 2):
            # The stem that remains must be at least two characters long.
            if len(form) <= prefix_len + 1:
                continue
            prefix, stem = form[:prefix_len], form[prefix_len:]
            if all(c in _KM_PREFIX_CHARS for c in prefix) and stem in freq_data:
                return freq_data[stem]

    return DEFAULT_RANK
def score_sentence(
    text: str,
    target_start: int,
    target_end: int,
    nikkud_map: dict[str, str],
    nikkud_index: dict,
    freq_data: dict[str, int],
) -> int:
    """Rate a sentence by the median frequency rank of its context words.

    The sentence is split on whitespace and then on maqaf. Every piece that
    overlaps the ``[target_start, target_end)`` character range is dropped,
    edge punctuation is stripped, and pieces shorter than two characters are
    ignored. What remains is ranked with ``_resolve_token_frequency``.

    Args:
        text: The full sentence text (with nikkud).
        target_start: Character offset where the cloze target word starts.
        target_end: Character offset where the cloze target word ends.
        nikkud_map: nikkud -> ktiv male mapping.
        nikkud_index: Index forwarded to ``_resolve_token_frequency``.
        freq_data: ktiv male spelling -> frequency rank.

    Returns:
        The median rank of the context tokens truncated to ``int`` (lower
        means easier), or ``DEFAULT_RANK`` when nothing could be scored.
    """
    # Pass 1: record every maqaf-separated piece with its character span.
    spans: list[tuple[str, int, int]] = []
    cursor = 0
    for chunk in text.split():
        chunk_start = text.index(chunk, cursor)
        offset = chunk_start
        for piece in chunk.split(_MAQAF):
            if piece:
                spans.append((piece, offset, offset + len(piece)))
            offset += len(piece) + 1  # step over the piece and its maqaf
        cursor = chunk_start + len(chunk)

    # Pass 2: keep pieces outside the target span, trim punctuation, rank.
    edge_chars = "".join(_PUNCT)
    trimmed = (
        piece.strip(edge_chars)
        for piece, begin, end in spans
        if begin >= target_end or end <= target_start
    )
    ranks: list[int] = [
        _resolve_token_frequency(word, nikkud_map, nikkud_index, freq_data)
        for word in trimmed
        if len(word) >= 2
    ]

    return int(median(ranks)) if ranks else DEFAULT_RANK

View file

@ -1,246 +0,0 @@
"""Unit tests for apkg_builder — Sprint 15 learnings.
Tests cover: cloze prefix preservation, Hebrew spoiler stripping from English
meanings, PoS exact matching, gender field population, and mishkal data integrity.
"""
import json
import re
import sys
from pathlib import Path
import pytest
# Ensure project root is on path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from apkg_builder import _categorize_pos, _cloze_prefix_len
# ---------------------------------------------------------------------------
# Cloze prefix preservation
# ---------------------------------------------------------------------------
class TestClozePrefix:
    """Checks that _cloze_prefix_len finds Hebrew prefix letters in front of the word."""

    def test_single_prefix_bet(self):
        # Token is bet + patach attached to "tor".
        prefix_len = _cloze_prefix_len("בַּתּוֹר", "תּוֹר")
        assert prefix_len > 0

    def test_single_prefix_lamed(self):
        # Token is lamed + patach attached to "melech".
        prefix_len = _cloze_prefix_len("לַמֶּלֶךְ", "מֶּלֶךְ")
        assert prefix_len > 0

    def test_two_consonant_prefix(self):
        # Two stacked prefix letters (shin + bet) before "tor".
        full_token, target = "שֶׁבַּתּוֹר", "תּוֹר"
        cut = _cloze_prefix_len(full_token, target)
        assert cut > 0
        remainder = full_token[cut:]
        assert remainder.startswith(target)

    def test_no_prefix_direct_match(self):
        # Token and word are identical, so there is nothing to peel off.
        same = "תּוֹר"
        assert _cloze_prefix_len(same, "תּוֹר") == 0

    def test_empty_inputs(self):
        # Any empty argument yields a zero-length prefix.
        for full_token, target in (("", "תּוֹר"), ("בַּתּוֹר", ""), ("", "")):
            assert _cloze_prefix_len(full_token, target) == 0

    def test_non_prefix_letter_returns_zero(self):
        # Leading 'ת' is not one of the prefix letters (בהוכלמש),
        # so the extra leading material must not count as a prefix.
        prefix_len = _cloze_prefix_len("תַּתּוֹר", "תּוֹר")
        assert prefix_len == 0

    def test_prefix_preserves_nikkud(self):
        # The reported prefix slice must hold exactly one base letter: bet.
        full_token, target = "בַּתּוֹר", "תּוֹר"
        cut = _cloze_prefix_len(full_token, target)
        consonants = "".join(ch for ch in full_token[:cut] if "\u05d0" <= ch <= "\u05ea")
        assert consonants == "ב"
# ---------------------------------------------------------------------------
# PoS exact matching (no substring collisions)
# ---------------------------------------------------------------------------
class TestCategorizePos:
    """_categorize_pos must bucket labels without letting 'Pronoun' match 'Noun'."""

    def test_noun_exact(self):
        bucket = _categorize_pos("Noun")
        assert bucket == "Noun"

    def test_pronoun_is_other(self):
        # Contains the substring "noun" but is not a noun.
        bucket = _categorize_pos("Pronoun")
        assert bucket == "Other"

    def test_verb_exact(self):
        bucket = _categorize_pos("Verb")
        assert bucket == "Verb"

    def test_noun_with_dash(self):
        # A noun label carrying extra detail after the category word.
        bucket = _categorize_pos("Noun masculine")
        assert bucket == "Noun"

    def test_adjective(self):
        bucket = _categorize_pos("Adjective")
        assert bucket == "Adjective"

    def test_conjunction_is_other(self):
        bucket = _categorize_pos("Conjunction")
        assert bucket == "Other"
# ---------------------------------------------------------------------------
# Hebrew spoiler stripping from English meanings
# ---------------------------------------------------------------------------
class TestHebrewSpoilerStripping:
    """English meanings must not contain Hebrew text (spoils the card)."""

    # Runs of Hebrew-block characters (plus embedded whitespace/hyphens).
    # Intended to mirror the pattern in apkg_builder.py — NOTE(review): the
    # copy is maintained by hand, confirm it is still in sync.
    HEBREW_STRIP_RE = re.compile(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*")
    # Punctuation debris left behind once the Hebrew run is gone.
    _ORPHAN_DASH_RE = re.compile(r"[;:]\s*—")
    _EMPTY_CLAUSE_RE = re.compile(r";\s*:")
    _SPACE_RUN_RE = re.compile(r"\s{2,}")
    # Any Hebrew base letter; used to detect residue after cleaning.
    _HEBREW_LETTER_RE = re.compile(r"[\u05D0-\u05EA]")

    @staticmethod
    def _strip_hebrew(meaning: str) -> str:
        """Replicate the meaning cleaning pipeline from build_vocab_deck.

        Removes Hebrew runs, tidies the punctuation they leave behind,
        collapses whitespace runs to one space and trims separators from
        both ends.
        """
        patterns = TestHebrewSpoilerStripping
        meaning = patterns.HEBREW_STRIP_RE.sub("", meaning)
        meaning = patterns._ORPHAN_DASH_RE.sub("", meaning)
        meaning = patterns._EMPTY_CLAUSE_RE.sub(";", meaning)
        return patterns._SPACE_RUN_RE.sub(" ", meaning).strip(", ;:")

    def test_pure_english_unchanged(self):
        assert self._strip_hebrew("to eat, to consume") == "to eat, to consume"

    def test_hebrew_word_removed(self):
        result = self._strip_hebrew("to eat; אכל")
        assert "אכל" not in result

    def test_hebrew_with_nikkud_removed(self):
        result = self._strip_hebrew("tall; גָּבוֹהַּ")
        assert "גָּבוֹהַּ" not in result
        assert "tall" in result

    def test_no_residual_hebrew_in_real_data(self):
        """Scan actual words.json — no meaning should contain Hebrew after stripping."""
        words_path = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")
        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)
        spoilers = []
        for key, entry in words.items():
            meaning = entry.get("meaning") or ""
            cleaned = self._strip_hebrew(meaning)
            # Any surviving Hebrew base letter means the cleaner missed text.
            if self._HEBREW_LETTER_RE.search(cleaned):
                spoilers.append(f"{key}: {cleaned!r}")
        assert not spoilers, f"Hebrew found in {len(spoilers)} meanings after stripping: {spoilers[:5]}"
# ---------------------------------------------------------------------------
# Gender field for nouns (words.json data integrity)
# ---------------------------------------------------------------------------
class TestGenderDataIntegrity:
    """Data check: nouns that carry noun_inflection should have gender filled in."""

    @pytest.fixture()
    def words(self):
        """Load data/words.json two directories up from this file; skip if absent."""
        data_file = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not data_file.exists():
            pytest.skip("words.json not available")
        return json.loads(data_file.read_text(encoding="utf-8"))

    def test_nouns_have_gender(self, words):
        """Fewer than 7% of inflected nouns may lack a recognised gender."""
        accepted = ("masculine", "feminine", "masculine and feminine")

        # Every entry whose pos starts with "Noun" and has inflection data.
        inflected_nouns = [
            (key, entry.get("noun_inflection"))
            for key, entry in words.items()
            if (entry.get("pos") or "").startswith("Noun") and entry.get("noun_inflection")
        ]

        missing = []
        for key, inflection in inflected_nouns:
            gender = inflection.get("gender") or ""
            if gender not in accepted:
                missing.append(f"{key}: gender={gender!r}")

        # Tolerance covers loan words, compound words, etc.
        noun_count = len(inflected_nouns)
        if noun_count > 0:
            pct_missing = len(missing) / noun_count
            assert pct_missing < 0.07, f"{len(missing)}/{noun_count} nouns missing gender: {missing[:10]}"
# ---------------------------------------------------------------------------
# Mishkal data integrity
# ---------------------------------------------------------------------------
class TestMishkalIntegrity:
    """Consistency checks on the mishkal fields stored in words.json."""

    @pytest.fixture()
    def words(self):
        """Load data/words.json two directories up from this file; skip if absent."""
        data_file = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not data_file.exists():
            pytest.skip("words.json not available")
        return json.loads(data_file.read_text(encoding="utf-8"))

    @staticmethod
    def _inflection_tables(words):
        """Yield (key, table) for each non-empty noun/adjective inflection table."""
        for key, entry in words.items():
            for field in ("noun_inflection", "adjective_inflection"):
                table = entry.get(field)
                if table:
                    yield key, table

    def test_mishkal_hebrew_matches_english(self, words):
        """When both names are set, mishkal_hebrew must equal _mishkal_to_hebrew(mishkal)."""
        from pealim_detail_scrape import _mishkal_to_hebrew

        mismatches = []
        for key, table in self._inflection_tables(words):
            mishkal_eng = table.get("mishkal") or ""
            mishkal_heb = table.get("mishkal_hebrew") or ""
            if not (mishkal_eng and mishkal_heb):
                continue
            # An empty/None conversion means "no known mapping" and is not a mismatch.
            expected = _mishkal_to_hebrew(mishkal_eng) or ""
            if expected and expected != mishkal_heb:
                mismatches.append(f"{key}: {mishkal_eng}{mishkal_heb} (expected {expected})")
        assert not mismatches, f"{len(mismatches)} mishkal mismatches: {mismatches[:10]}"

    def test_mishkal_hebrew_is_hebrew(self, words):
        """A non-empty mishkal_hebrew must contain at least one Hebrew letter."""
        hebrew_letter = re.compile(r"[\u05D0-\u05EA]")
        bad = []
        for key, table in self._inflection_tables(words):
            mishkal_heb = table.get("mishkal_hebrew") or ""
            if mishkal_heb and hebrew_letter.search(mishkal_heb) is None:
                bad.append(f"{key}: mishkal_hebrew={mishkal_heb!r}")
        assert not bad, f"{len(bad)} non-Hebrew mishkal_hebrew values: {bad[:10]}"

    def test_no_orphaned_mishkal(self, words):
        """mishkal_hebrew may only be set when the English mishkal is set too."""
        orphans = []
        for key, table in self._inflection_tables(words):
            mishkal_heb = table.get("mishkal_hebrew") or ""
            mishkal_eng = table.get("mishkal") or ""
            if mishkal_heb and not mishkal_eng:
                orphans.append(f"{key}: has mishkal_hebrew but no mishkal")
        assert not orphans, f"{len(orphans)} orphaned mishkal_hebrew: {orphans[:10]}"

View file

@ -1,524 +0,0 @@
"""Tests for adjective and preposition detail page parsing in pealim_detail_scrape.py."""
import sys
from pathlib import Path
import pytest
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from pealim_detail_scrape import (
_parse_adjective_table,
_parse_adjective_table_vl,
_parse_preposition_table,
_parse_preposition_table_vl,
_scrape_adjective_detail,
_scrape_preposition_detail,
)
# ---------------------------------------------------------------------------
# Fixtures — real HTML snippets from pealim.com
# ---------------------------------------------------------------------------
ADJECTIVE_MO_TABLE = """
<table class="table table-condensed conjugation-table">
<thead>
<tr>
<th class="column-header" colspan="2">Singular</th>
<th class="column-header" colspan="2">Plural</th>
</tr>
<tr>
<th class="column-header">Masculine</th>
<th class="column-header">Feminine</th>
<th class="column-header">Masculine</th>
<th class="column-header">Feminine</th>
</tr>
</thead>
<tbody>
<tr>
<td class="conj-td">
<div id="ms-a">
<div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/dn/dngfpnovmytc.mp3">&#128266;</span>
<span class="menukad">אֲבִיבִי</span>
</div></div>
<div class="meaning">spring-like, vernal</div>
</div>
</td>
<td class="conj-td">
<div id="fs-a">
<div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/1j/1j6srg3do7n5k.mp3">&#128266;</span>
<span class="menukad">אֲבִיבִית</span>
</div></div>
<div class="meaning">spring-like, vernal</div>
</div>
</td>
<td class="conj-td">
<div id="mp-a">
<div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/tj/tjrhw0b5dkhc.mp3">&#128266;</span>
<span class="menukad">אֲבִיבִיִּים</span>
</div></div>
<div class="meaning">spring-like, vernal</div>
</div>
</td>
<td class="conj-td">
<div id="fp-a">
<div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/h3/h3u1ml5a4xcf.mp3">&#128266;</span>
<span class="menukad">אֲבִיבִיּוֹת</span>
</div></div>
<div class="meaning">spring-like, vernal</div>
</div>
</td>
</tr>
</tbody>
</table>
"""
# VL version: menukad spans contain unvowelled text (hebstyle=vl)
ADJECTIVE_VL_TABLE = """
<table class="table table-condensed conjugation-table">
<tbody>
<tr>
<td class="conj-td">
<div id="ms-a"><div><div>
<span class="menukad">אביבי</span>
</div></div></div>
</td>
<td class="conj-td">
<div id="fs-a"><div><div>
<span class="menukad">אביבית</span>
</div></div></div>
</td>
<td class="conj-td">
<div id="mp-a"><div><div>
<span class="menukad">אביביים</span>
</div></div></div>
</td>
<td class="conj-td">
<div id="fp-a"><div><div>
<span class="menukad">אביביות</span>
</div></div></div>
</td>
</tr>
</tbody>
</table>
"""
PREPOSITION_MO_TABLE = """
<table class="table table-condensed conjugation-table">
<thead>
<tr>
<th rowspan="2">Person</th>
<th class="column-header" colspan="2">Singular</th>
<th class="column-header" colspan="2">Plural</th>
</tr>
<tr>
<th class="column-header">Masculine</th>
<th class="column-header">Feminine</th>
<th class="column-header">Masculine</th>
<th class="column-header">Feminine</th>
</tr>
</thead>
<tbody>
<tr>
<th>1st</th>
<td class="conj-td" colspan="2">
<div id="P-1s"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/fk/fkp5faeteecr.mp3">&#128266;</span>
<span class="menukad">שֶׁלִּי</span>
</div></div><div class="meaning"><strong>of mine</strong></div></div>
</td>
<td class="conj-td" colspan="2">
<div id="P-1p"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/13/13uvi0dz6tgcc.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּנוּ</span>
</div></div><div class="meaning"><strong>of ours</strong></div></div>
</td>
</tr>
<tr>
<th>2nd</th>
<td class="conj-td">
<div id="P-2ms"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/shbxafq8ietx.mp3">&#128266;</span>
<span class="menukad">שֶׁלְּךָ</span>
</div></div><div class="meaning"><strong>of yours</strong> <em>m. sg.</em></div></div>
</td>
<td class="conj-td">
<div id="P-2fs"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/sh9ue3a8buo3.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּךְ</span>
</div></div><div class="meaning"><strong>of yours</strong> <em>f. sg.</em></div></div>
</td>
<td class="conj-td">
<div id="P-2mp"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/ol/olx8vzsctlzn.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּכֶם</span>
</div></div><div class="meaning"><strong>of yours</strong> <em>m. pl.</em></div></div>
</td>
<td class="conj-td">
<div id="P-2fp"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/ol/olxrms6dl8eq.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּכֶן</span>
</div></div><div class="meaning"><strong>of yours</strong> <em>f. pl.</em></div></div>
</td>
</tr>
<tr>
<th>3rd</th>
<td class="conj-td">
<div id="P-3ms"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/fk/fkp5qigelthg.mp3">&#128266;</span>
<span class="menukad">שֶׁלּוֹ</span>
</div></div><div class="meaning"><strong>of his</strong></div></div>
</td>
<td class="conj-td">
<div id="P-3fs"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/sh9w36hojm5w.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּהּ</span>
</div></div><div class="meaning"><strong>of hers</strong></div></div>
</td>
<td class="conj-td">
<div id="P-3mp"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/n9/n99z0jr8pint.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּהֶם</span>
</div></div><div class="meaning"><strong>of theirs</strong> <em>m.</em></div></div>
</td>
<td class="conj-td">
<div id="P-3fp"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/n9/n9ahrc59h52w.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּהֶן</span>
</div></div><div class="meaning"><strong>of theirs</strong> <em>f.</em></div></div>
</td>
</tr>
</tbody>
</table>
"""
PREPOSITION_VL_TABLE = """
<table class="table table-condensed conjugation-table">
<tbody>
<tr>
<th>1st</th>
<td colspan="2"><div id="P-1s"><div><div>
<span class="menukad">שלי</span>
</div></div></div></td>
<td colspan="2"><div id="P-1p"><div><div>
<span class="menukad">שלנו</span>
</div></div></div></td>
</tr>
<tr>
<th>2nd</th>
<td><div id="P-2ms"><div><div>
<span class="menukad">שלך</span>
</div></div></div></td>
<td><div id="P-2fs"><div><div>
<span class="menukad">שלך</span>
</div></div></div></td>
<td><div id="P-2mp"><div><div>
<span class="menukad">שלכם</span>
</div></div></div></td>
<td><div id="P-2fp"><div><div>
<span class="menukad">שלכן</span>
</div></div></div></td>
</tr>
<tr>
<th>3rd</th>
<td><div id="P-3ms"><div><div>
<span class="menukad">שלו</span>
</div></div></div></td>
<td><div id="P-3fs"><div><div>
<span class="menukad">שלה</span>
</div></div></div></td>
<td><div id="P-3mp"><div><div>
<span class="menukad">שלהם</span>
</div></div></div></td>
<td><div id="P-3fp"><div><div>
<span class="menukad">שלהן</span>
</div></div></div></td>
</tr>
</tbody>
</table>
"""
# Minimal full-page wrappers so _scrape_*_detail() can parse them.
# Each constant embeds one table fixture in a bare <html><body> document.
# The *_MO_PAGE / *_VL_PAGE pairs are the two page arguments the tests below
# hand to _scrape_adjective_detail and _scrape_preposition_detail.
_ADJECTIVE_MO_PAGE = f"<html><body>{ADJECTIVE_MO_TABLE}</body></html>"
_ADJECTIVE_VL_PAGE = f"<html><body>{ADJECTIVE_VL_TABLE}</body></html>"
_PREPOSITION_MO_PAGE = f"<html><body>{PREPOSITION_MO_TABLE}</body></html>"
_PREPOSITION_VL_PAGE = f"<html><body>{PREPOSITION_VL_TABLE}</body></html>"
# ---------------------------------------------------------------------------
# Adjective table tests
# ---------------------------------------------------------------------------
class TestParseAdjectiveTable:
    """Tests for _parse_adjective_table (mo/nikkud page)."""

    @staticmethod
    def _soup(markup: str):
        """Parse *markup* with the lxml backend, as the tests here expect."""
        from bs4 import BeautifulSoup

        return BeautifulSoup(markup, "lxml")

    @pytest.fixture()
    def result(self) -> dict:
        """Parsed forms of the vowelled adjective fixture table."""
        return _parse_adjective_table(self._soup(ADJECTIVE_MO_TABLE))

    def test_returns_four_form_keys(self, result: dict) -> None:
        assert set(result.keys()) == {"ms", "fs", "mp", "fp"}

    def test_ms_nikkud(self, result: dict) -> None:
        assert result["ms"]["nikkud"] == "אֲבִיבִי"

    def test_fs_nikkud(self, result: dict) -> None:
        assert result["fs"]["nikkud"] == "אֲבִיבִית"

    def test_mp_nikkud(self, result: dict) -> None:
        assert result["mp"]["nikkud"] == "אֲבִיבִיִּים"

    def test_fp_nikkud(self, result: dict) -> None:
        assert result["fp"]["nikkud"] == "אֲבִיבִיּוֹת"

    def test_audio_url_present(self, result: dict) -> None:
        assert result["ms"]["audio_url"].startswith("https://audio.pealim.com/")

    def test_empty_on_missing_table(self) -> None:
        # A page without any conjugation table must parse to an empty dict.
        result = _parse_adjective_table(self._soup("<html><body></body></html>"))
        assert result == {}
class TestParseAdjectiveTableVl:
    """Tests for _parse_adjective_table_vl (ktiv male page)."""

    @pytest.fixture()
    def result(self) -> dict:
        """Parsed forms of the unvowelled adjective fixture table."""
        from bs4 import BeautifulSoup

        return _parse_adjective_table_vl(BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml"))

    def test_returns_four_form_keys(self, result: dict) -> None:
        assert set(result.keys()) == {"ms", "fs", "mp", "fp"}

    def test_ms_ktiv(self, result: dict) -> None:
        assert result["ms"] == "אביבי"

    def test_fs_ktiv(self, result: dict) -> None:
        assert result["fs"] == "אביבית"

    def test_mp_ktiv(self, result: dict) -> None:
        assert result["mp"] == "אביביים"

    def test_fp_ktiv(self, result: dict) -> None:
        assert result["fp"] == "אביביות"
# ---------------------------------------------------------------------------
# _scrape_adjective_detail tests
# ---------------------------------------------------------------------------
class TestScrapeAdjectiveDetail:
    """Schema checks on what _scrape_adjective_detail returns for the fixture pages."""

    @pytest.fixture()
    def result(self) -> dict:
        # Slug first, then the vowelled (mo) page, then the ktiv male (vl) page.
        scraped = _scrape_adjective_detail("9098-avivi", _ADJECTIVE_MO_PAGE, _ADJECTIVE_VL_PAGE)
        return scraped

    def test_returns_non_empty_dict(self, result: dict) -> None:
        assert result

    def test_ms_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["ms"]
        assert form["nikkud"] == "אֲבִיבִי"
        assert form["ktiv_male"] == "אביבי"

    def test_fs_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["fs"]
        assert form["nikkud"] == "אֲבִיבִית"
        assert form["ktiv_male"] == "אביבית"

    def test_mp_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["mp"]
        assert form["nikkud"] == "אֲבִיבִיִּים"
        assert form["ktiv_male"] == "אביביים"

    def test_fp_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["fp"]
        assert form["nikkud"] == "אֲבִיבִיּוֹת"
        assert form["ktiv_male"] == "אביביות"

    def test_mishkal_key_present(self, result: dict) -> None:
        # Only the key is required: the minimal fixture has no PoS section,
        # so the value itself may be None.
        assert "mishkal" in result.keys()

    def test_mishkal_hebrew_key_present(self, result: dict) -> None:
        assert "mishkal_hebrew" in result.keys()

    def test_all_schema_keys_present(self, result: dict) -> None:
        required = {"ms", "fs", "mp", "fp", "mishkal", "mishkal_hebrew"}
        absent = required - set(result.keys())
        assert not absent

    def test_empty_on_no_table(self) -> None:
        # Pages without a conjugation table yield an empty dict.
        blank_page = "<html><body></body></html>"
        scraped = _scrape_adjective_detail("missing", blank_page, blank_page)
        assert scraped == {}
# ---------------------------------------------------------------------------
# Preposition table tests
# ---------------------------------------------------------------------------
class TestParsePrepositionTable:
    """Tests for _parse_preposition_table (mo/nikkud page)."""

    @staticmethod
    def _soup(markup: str):
        """Parse *markup* with the lxml backend, as the tests here expect."""
        from bs4 import BeautifulSoup

        return BeautifulSoup(markup, "lxml")

    @pytest.fixture()
    def result(self) -> dict:
        """Parsed forms of the vowelled preposition fixture table."""
        return _parse_preposition_table(self._soup(PREPOSITION_MO_TABLE))

    def test_returns_ten_form_keys(self, result: dict) -> None:
        expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
        assert set(result.keys()) == expected

    def test_1s_nikkud(self, result: dict) -> None:
        assert result["1s"]["nikkud"] == "שֶׁלִּי"

    def test_1p_nikkud(self, result: dict) -> None:
        assert result["1p"]["nikkud"] == "שֶׁלָּנוּ"

    def test_2ms_nikkud(self, result: dict) -> None:
        assert result["2ms"]["nikkud"] == "שֶׁלְּךָ"

    def test_2fs_nikkud(self, result: dict) -> None:
        assert result["2fs"]["nikkud"] == "שֶׁלָּךְ"

    def test_2mp_nikkud(self, result: dict) -> None:
        assert result["2mp"]["nikkud"] == "שֶׁלָּכֶם"

    def test_2fp_nikkud(self, result: dict) -> None:
        assert result["2fp"]["nikkud"] == "שֶׁלָּכֶן"

    def test_3ms_nikkud(self, result: dict) -> None:
        assert result["3ms"]["nikkud"] == "שֶׁלּוֹ"

    def test_3fs_nikkud(self, result: dict) -> None:
        assert result["3fs"]["nikkud"] == "שֶׁלָּהּ"

    def test_3mp_nikkud(self, result: dict) -> None:
        assert result["3mp"]["nikkud"] == "שֶׁלָּהֶם"

    def test_3fp_nikkud(self, result: dict) -> None:
        assert result["3fp"]["nikkud"] == "שֶׁלָּהֶן"

    def test_audio_url_present(self, result: dict) -> None:
        assert result["1s"]["audio_url"].startswith("https://audio.pealim.com/")

    def test_empty_on_missing_table(self) -> None:
        # A page without any conjugation table must parse to an empty dict.
        result = _parse_preposition_table(self._soup("<html><body></body></html>"))
        assert result == {}
class TestParsePrepositionTableVl:
    """Tests for _parse_preposition_table_vl (ktiv male page)."""

    @pytest.fixture()
    def result(self) -> dict:
        """Parsed forms of the unvowelled preposition fixture table."""
        from bs4 import BeautifulSoup

        return _parse_preposition_table_vl(BeautifulSoup(PREPOSITION_VL_TABLE, "lxml"))

    def test_returns_ten_form_keys(self, result: dict) -> None:
        expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
        assert set(result.keys()) == expected

    def test_1s_ktiv(self, result: dict) -> None:
        assert result["1s"] == "שלי"

    def test_1p_ktiv(self, result: dict) -> None:
        assert result["1p"] == "שלנו"

    def test_2ms_ktiv(self, result: dict) -> None:
        assert result["2ms"] == "שלך"

    def test_3ms_ktiv(self, result: dict) -> None:
        assert result["3ms"] == "שלו"

    def test_3fp_ktiv(self, result: dict) -> None:
        assert result["3fp"] == "שלהן"
# ---------------------------------------------------------------------------
# _scrape_preposition_detail tests
# ---------------------------------------------------------------------------
class TestScrapePrepositionDetail:
    """Schema checks on what _scrape_preposition_detail returns for the fixture pages."""

    @pytest.fixture()
    def result(self) -> dict:
        # Slug first, then the vowelled (mo) page, then the ktiv male (vl) page.
        scraped = _scrape_preposition_detail("2643-shel", _PREPOSITION_MO_PAGE, _PREPOSITION_VL_PAGE)
        return scraped

    def test_returns_non_empty_dict(self, result: dict) -> None:
        assert result

    def test_all_ten_person_keys_present(self, result: dict) -> None:
        required = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
        absent = required - set(result.keys())
        assert not absent

    def test_1s_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["1s"]
        assert form["nikkud"] == "שֶׁלִּי"
        assert form["ktiv_male"] == "שלי"

    def test_1p_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["1p"]
        assert form["nikkud"] == "שֶׁלָּנוּ"
        assert form["ktiv_male"] == "שלנו"

    def test_2ms_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["2ms"]
        assert form["nikkud"] == "שֶׁלְּךָ"
        assert form["ktiv_male"] == "שלך"

    def test_3ms_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["3ms"]
        assert form["nikkud"] == "שֶׁלּוֹ"
        assert form["ktiv_male"] == "שלו"

    def test_3fs_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["3fs"]
        assert form["nikkud"] == "שֶׁלָּהּ"
        assert form["ktiv_male"] == "שלה"

    def test_3fp_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["3fp"]
        assert form["nikkud"] == "שֶׁלָּהֶן"
        assert form["ktiv_male"] == "שלהן"

    def test_empty_on_no_table(self) -> None:
        # Pages without a conjugation table yield an empty dict.
        blank_page = "<html><body></body></html>"
        scraped = _scrape_preposition_detail("missing", blank_page, blank_page)
        assert scraped == {}
# ---------------------------------------------------------------------------
# Tests for _parse_noun_gender_mishkal mishkal extraction
# ---------------------------------------------------------------------------
from bs4 import BeautifulSoup # noqa: E402
from pealim_detail_scrape import _parse_noun_gender_mishkal # noqa: E402
class TestNounGenderMishkal:
    """Gender and mishkal extraction by _parse_noun_gender_mishkal on one-line snippets."""

    @staticmethod
    def _parse(snippet):
        """Run the parser on *snippet* using the stdlib html.parser backend."""
        return _parse_noun_gender_mishkal(BeautifulSoup(snippet, "html.parser"))

    def test_noun_with_mishkal(self):
        snippet = '<p>Noun <a href="/dict/?pos=noun&amp;nm=qetel"><i>ketel</i> pattern</a>, masculine</p>'
        gender, mishkal = self._parse(snippet)
        assert gender == "masculine"
        assert mishkal == "ketel"

    def test_noun_without_mishkal(self):
        # No pattern link: mishkal comes back as an empty string.
        snippet = "<p>Noun masculine</p>"
        gender, mishkal = self._parse(snippet)
        assert gender == "masculine"
        assert mishkal == ""

    def test_adjective_mishkal(self):
        # Only the pattern name is checked for adjectives.
        snippet = '<p>Adjective <a href="/dict/?pos=adjective&amp;am=qatul"><i>katul</i> pattern</a></p>'
        _, mishkal = self._parse(snippet)
        assert mishkal == "katul"

    def test_feminine_noun(self):
        snippet = '<p>Noun <a href="/dict/?pos=noun&amp;nm=qetel"><i>ketel</i> pattern</a>, feminine</p>'
        gender, mishkal = self._parse(snippet)
        assert gender == "feminine"
        assert mishkal == "ketel"

View file

@ -1,127 +0,0 @@
"""Tests for epub_examples deduplication of confusable group examples."""
from epub_examples import _deduplicate_confusable_examples
def _make_entry(meaning, confusable_group, vetted_texts=None, frequency_rank=None):
    """Return a minimal words.json-style entry for the deduplication tests.

    ``examples`` is added only when *vetted_texts* is given (an empty list
    still produces an empty ``vetted`` list), and ``frequency_rank`` only
    when a rank is given. *confusable_group* is stored as passed, not copied.
    """
    record = {"meaning": meaning, "confusable_group": confusable_group}

    if vetted_texts is not None:
        vetted = []
        for sentence in vetted_texts:
            vetted.append({"text": sentence, "source": "test", "match_method": "direct"})
        record["examples"] = {"vetted": vetted}

    if frequency_rank is not None:
        record["frequency_rank"] = frequency_rank

    return record
class TestDeduplicateConfusableExamples:
    """Behaviour of _deduplicate_confusable_examples() on small hand-built word maps."""

    def test_shared_examples_kept_on_higher_frequency(self):
        """Identical example sets: the member with the lower frequency_rank
        (the more common word) keeps them, the other is emptied."""
        members = ["key_a", "key_b"]
        common = _make_entry("brother", members, ["sent1", "sent2"], frequency_rank=500)
        rare = _make_entry("fireplace", members, ["sent1", "sent2"], frequency_rank=8000)
        words = {"key_a": common, "key_b": rare}

        cleared = _deduplicate_confusable_examples(words)

        assert cleared == 1
        assert len(words["key_a"]["examples"]["vetted"]) == 2
        assert words["key_b"]["examples"]["vetted"] == []

    def test_no_action_when_examples_differ(self):
        """Members with different example sets are left alone."""
        members = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("meaning1", members, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("meaning2", members, ["sent2"], frequency_rank=200),
        }

        cleared = _deduplicate_confusable_examples(words)

        assert cleared == 0
        for key in ("key_a", "key_b"):
            assert len(words[key]["examples"]["vetted"]) == 1

    def test_no_action_when_one_has_no_examples(self):
        """A single member with examples leaves nothing to deduplicate."""
        members = ["key_a", "key_b"]
        with_examples = _make_entry("meaning1", members, ["sent1"], frequency_rank=100)
        without_examples = _make_entry("meaning2", members, frequency_rank=200)
        words = {"key_a": with_examples, "key_b": without_examples}

        assert _deduplicate_confusable_examples(words) == 0

    def test_no_frequency_uses_alphabetical_tiebreak(self):
        """Without any frequency data the alphabetically first key wins."""
        members = ["alpha_key", "beta_key"]
        words = {
            "alpha_key": _make_entry("meaning1", members, ["sent1"]),
            "beta_key": _make_entry("meaning2", members, ["sent1"]),
        }

        cleared = _deduplicate_confusable_examples(words)

        assert cleared == 1
        assert len(words["alpha_key"]["examples"]["vetted"]) == 1
        assert words["beta_key"]["examples"]["vetted"] == []

    def test_three_way_group(self):
        """With three members the most common keeps the examples; two are cleared."""
        members = ["key_a", "key_b", "key_c"]
        shared = ["sent1", "sent2"]
        words = {
            "key_a": _make_entry("yes", members, shared, frequency_rank=50),
            "key_b": _make_entry("honest", members, shared, frequency_rank=3000),
            "key_c": _make_entry("pedestal", members, shared, frequency_rank=15000),
        }

        cleared = _deduplicate_confusable_examples(words)

        assert cleared == 2
        assert len(words["key_a"]["examples"]["vetted"]) == 2
        assert words["key_b"]["examples"]["vetted"] == []
        assert words["key_c"]["examples"]["vetted"] == []

    def test_cloze_removed_from_losers(self):
        """A losing entry also has its cloze data dropped."""
        members = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("common", members, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("rare", members, ["sent1"], frequency_rank=9000),
        }
        # Give the rarer entry (the expected loser) a cloze to be removed.
        words["key_b"]["examples"]["cloze"] = {"text": "sent1", "cloze_guid": "abc"}

        cleared = _deduplicate_confusable_examples(words)

        assert cleared == 1
        assert "cloze" not in words["key_b"]["examples"]

    def test_no_confusable_groups_returns_zero(self):
        """Entries lacking confusable_group are not considered at all."""
        words = {
            "key_a": {"meaning": "word1", "examples": {"vetted": [{"text": "s1"}]}},
            "key_b": {"meaning": "word2", "examples": {"vetted": [{"text": "s1"}]}},
        }

        assert _deduplicate_confusable_examples(words) == 0

    def test_mixed_frequency_and_none(self):
        """A member that has a frequency rank beats one that has none."""
        members = ["key_a", "key_b"]
        ranked = _make_entry("has_freq", members, ["sent1"], frequency_rank=5000)
        unranked = _make_entry("no_freq", members, ["sent1"])
        words = {"key_a": ranked, "key_b": unranked}

        cleared = _deduplicate_confusable_examples(words)

        assert cleared == 1
        assert len(words["key_a"]["examples"]["vetted"]) == 1
        assert words["key_b"]["examples"]["vetted"] == []

    def test_partial_overlap_not_deduplicated(self):
        """Overlapping but non-identical sentence sets are not touched."""
        members = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("m1", members, ["sent1", "sent2"], frequency_rank=100),
            "key_b": _make_entry("m2", members, ["sent1", "sent3"], frequency_rank=200),
        }

        assert _deduplicate_confusable_examples(words) == 0

View file

@ -1,83 +0,0 @@
"""Integration tests for frequency-based sentence scoring in update_words_json."""
def _make_sentence(text, source="test", match_method="direct", word_count=None, char_offset=0, char_end=3):
"""Build a minimal sentence dict as match_sentences would produce."""
if word_count is None:
word_count = len(text.split())
return {
"text": text,
"source": source,
"match_method": match_method,
"word_count": word_count,
"char_offset": char_offset,
"char_end": char_end,
}
class TestScoringIntegration:
    """Tests that update_words_json uses frequency scoring."""

    @staticmethod
    def _fresh_words():
        """Return a new single-entry words dict with an empty examples block."""
        return {
            "טוֹב": {
                "word": {"nikkud": "טוֹב", "ktiv_male": "טוב"},
                "examples": {},
            }
        }

    def test_cloze_has_difficulty_score(self):
        """Cloze dict includes an integer difficulty_score field."""
        # Imported lazily, as in the other tests of this class.
        from epub_examples import update_words_json

        words = self._fresh_words()
        matches = {
            "טוֹב": [
                _make_sentence("הוּא אָדָם טוֹב מְאוֹד", char_offset=10, char_end=13),
            ]
        }
        update_words_json(words, matches, confusable_keys=set())
        cloze = words["טוֹב"]["examples"].get("cloze")
        assert cloze is not None
        assert "difficulty_score" in cloze
        assert isinstance(cloze["difficulty_score"], int)

    def test_vetted_sorted_by_difficulty(self):
        """Vetted sentences are all kept and are sorted easiest first."""
        from epub_examples import update_words_json

        words = self._fresh_words()
        matches = {
            "טוֹב": [
                _make_sentence("הוּא טוֹב", char_offset=4, char_end=7),
                _make_sentence("הַתַּפְנִיט טוֹב בְּיוֹתֵר", char_offset=10, char_end=13),
                _make_sentence("אֲנִי טוֹב הַיּוֹם", char_offset=5, char_end=8),
            ]
        }
        update_words_json(words, matches, confusable_keys=set())
        vetted = words["טוֹב"]["examples"]["vetted"]
        assert len(vetted) == 3
        # The count alone does not verify the ordering this test is named for.
        # NOTE(review): assumes vetted entries expose "difficulty_score" the way
        # the cloze dict does - confirm. When they do not, the ordering check is
        # skipped rather than failing on a missing key.
        if all(isinstance(item, dict) and "difficulty_score" in item for item in vetted):
            scores = [item["difficulty_score"] for item in vetted]
            assert scores == sorted(scores), f"vetted not sorted easiest first: {scores}"

    def test_easiest_sentence_becomes_cloze(self):
        """The sentence with the lowest difficulty score becomes the cloze."""
        from epub_examples import update_words_json

        words = self._fresh_words()
        easy_text = "הוּא טוֹב מְאוֹד"
        hard_text = "הַפַּרְנָסִימוֹן טוֹב לְהַפְלִיא"
        # The harder sentence is listed first so input order cannot pick the cloze.
        matches = {
            "טוֹב": [
                _make_sentence(hard_text, char_offset=14, char_end=17),
                _make_sentence(easy_text, char_offset=4, char_end=7),
            ]
        }
        update_words_json(words, matches, confusable_keys=set())
        cloze = words["טוֹב"]["examples"]["cloze"]
        assert cloze["text"] == easy_text

View file

@ -1,441 +0,0 @@
#!/usr/bin/env python3
"""Integration tests: scrape real pealim.com pages and validate data.
These tests hit pealim.com directly. They are skipped when the environment
variable SKIP_INTEGRATION is set to any non-empty string.
Run with:
pytest tests/test_scraper_integration.py -v -m integration
"""
import json
import os
import re
import sys
import time
from pathlib import Path
import pytest
# Add project root to path so all sibling modules are importable
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
import pealim_detail_scrape
import pealim_list_scrape
# ---------------------------------------------------------------------------
# Skip marker
# ---------------------------------------------------------------------------
skip_integration = pytest.mark.skipif(
bool(os.environ.get("SKIP_INTEGRATION", "")),
reason="SKIP_INTEGRATION is set",
)
# A known Hif'il verb slug that is not page-1 dependent.
# לְהַגִּיד (to tell/say) — Hif'il, slug 1135-lehagid
HIFIL_VERB_SLUG = "1135-lehagid"
HIFIL_VERB_NIKKUD = "לְהַגִּיד"
HIFIL_VERB_MEANING = "to say, to tell"
# Minimum expected entries from a single list page
MIN_LIST_ENTRIES = 10
# Hebrew character regex (Unicode block U+05D0–U+05EA)
HEBREW_CHAR_RE = re.compile(r"[\u05d0-\u05ea]")
# Slug pattern: one or more digits, hyphen, one or more word chars
SLUG_RE = re.compile(r"^\d+-\w+$")
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _has_hebrew(text: str) -> bool:
    """Report whether *text* contains any character matched by HEBREW_CHAR_RE."""
    return HEBREW_CHAR_RE.search(text) is not None
def _words_from_file(path: Path) -> dict:
with path.open(encoding="utf-8") as fh:
return json.load(fh)
# ---------------------------------------------------------------------------
# Test class: list page scrape
# ---------------------------------------------------------------------------
@pytest.mark.integration
@skip_integration
class TestListScrape:
    """Validate pealim_list_scrape against a real /dict/?page=1 fetch.

    Every test runs its own one-page scrape into ``tmp_path`` so the tests
    stay independent of each other.
    """

    def _scrape_page_one(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
        """Scrape list page 1 into a temporary words.json and return its path.

        The scraper's WORDS_JSON and PROGRESS_JSON module attributes are
        redirected into *tmp_path* before the scrape runs.
        """
        words_path = tmp_path / "words.json"
        progress_path = tmp_path / "list_scrape_progress.json"
        monkeypatch.setattr(pealim_list_scrape, "WORDS_JSON", words_path)
        monkeypatch.setattr(pealim_list_scrape, "PROGRESS_JSON", progress_path)
        # Scrape exactly one page
        pealim_list_scrape.run_scrape(total_pages=1, force_refresh=True)
        return words_path

    def test_list_page_1_produces_entries(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Page 1 must yield at least MIN_LIST_ENTRIES entries in words.json."""
        words_path = self._scrape_page_one(tmp_path, monkeypatch)
        assert words_path.exists(), "words.json was not created after scrape"
        words = _words_from_file(words_path)
        assert len(words) >= MIN_LIST_ENTRIES, (
            f"Expected at least {MIN_LIST_ENTRIES} entries from page 1, got {len(words)}"
        )

    def test_list_entries_have_required_fields(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Every entry must have non-empty nikkud, ktiv_male, slug, pos, meaning."""
        words = _words_from_file(self._scrape_page_one(tmp_path, monkeypatch))
        for key, entry in words.items():
            word_block = entry.get("word", {})
            nikkud = word_block.get("nikkud", "")
            ktiv_male = word_block.get("ktiv_male", "")
            slug = entry.get("slug", "")
            pos = entry.get("pos", "")
            meaning = entry.get("meaning", "")
            assert nikkud, f"Entry '{key}': word.nikkud is empty"
            assert _has_hebrew(nikkud), f"Entry '{key}': word.nikkud has no Hebrew chars: {nikkud!r}"
            assert ktiv_male, f"Entry '{key}': word.ktiv_male is empty"
            assert slug, f"Entry '{key}': slug is empty"
            assert SLUG_RE.match(slug), f"Entry '{key}': slug does not match \\d+-\\w+ pattern: {slug!r}"
            assert pos, f"Entry '{key}': pos is empty"
            assert meaning, f"Entry '{key}': meaning is empty"

    def test_list_at_least_one_entry_has_root(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """At least one entry on page 1 must have a non-empty root list."""
        words = _words_from_file(self._scrape_page_one(tmp_path, monkeypatch))
        entries_with_root = [e for e in words.values() if e.get("root")]
        assert entries_with_root, "No entries on page 1 have a non-empty root list"

    def test_list_at_least_one_entry_has_audio(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """At least one entry on page 1 must have a non-empty audio_url."""
        words = _words_from_file(self._scrape_page_one(tmp_path, monkeypatch))
        entries_with_audio = [e for e in words.values() if e.get("audio_url")]
        assert entries_with_audio, "No entries on page 1 have a non-empty audio_url"

    def test_list_post_process_fields_exist(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """After scrape, every entry must have 'confusable_group' and 'shared_roots' keys (post-processed)."""
        words = _words_from_file(self._scrape_page_one(tmp_path, monkeypatch))
        for key, entry in words.items():
            assert "confusable_group" in entry, f"Entry '{key}' missing 'confusable_group' key"
            assert "shared_roots" in entry, f"Entry '{key}' missing 'shared_roots' key"
            assert isinstance(entry["shared_roots"], list), f"Entry '{key}': shared_roots is not a list"
# ---------------------------------------------------------------------------
# Test class: noun detail scrape
# ---------------------------------------------------------------------------
@pytest.mark.integration
@skip_integration
class TestDetailScrapeNoun:
    """Validate pealim_detail_scrape for a real noun detail page."""

    def _find_noun_with_root(self, words: dict) -> tuple[str, dict] | None:
        """Return the first (key, entry) pair that is a Noun with a non-empty root and slug."""
        for key, entry in words.items():
            if entry.get("pos", "").startswith("Noun") and entry.get("root") and entry.get("slug"):
                return key, entry
        return None

    def _prepare_words_json(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> tuple[Path, dict]:
        """
        Scrape page 1 into a fresh words.json and return (path, words).
        Uses list scraper monkeypatched to tmp_path.
        """
        words_path = tmp_path / "words.json"
        progress_path = tmp_path / "list_scrape_progress.json"
        monkeypatch.setattr(pealim_list_scrape, "WORDS_JSON", words_path)
        monkeypatch.setattr(pealim_list_scrape, "PROGRESS_JSON", progress_path)
        pealim_list_scrape.run_scrape(total_pages=1, force_refresh=True)
        words = _words_from_file(words_path)
        return words_path, words

    def _scrape_noun_detail(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> tuple[str, dict, dict]:
        """Run list scrape then noun detail scrape; return (noun_key, noun_entry, updated_words).

        *noun_entry* is the entry as it looked after the list scrape, and
        *updated_words* is words.json re-read after the detail scrape.
        """
        words_path, words = self._prepare_words_json(tmp_path, monkeypatch)
        pair = self._find_noun_with_root(words)
        assert pair is not None, "No noun with a root found on page 1"
        noun_key, noun_entry = pair
        # Point the detail scraper at the same temporary words.json.
        # NOTE(review): run() is called with nouns_only=True and no per-entry
        # filter, so it presumably processes every noun in the file, not only
        # noun_key - confirm against pealim_detail_scrape.run.
        monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
        # Small rate-limit delay between list scrape and detail scrape
        time.sleep(1.0)
        pealim_detail_scrape.run(force_refresh=True, nouns_only=True)
        return noun_key, noun_entry, _words_from_file(words_path)

    def test_noun_detail_inflection_not_null(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """After detail scrape, noun_inflection must not be null."""
        noun_key, noun_entry, updated_words = self._scrape_noun_detail(tmp_path, monkeypatch)
        entry = updated_words.get(noun_key, {})
        assert entry.get("noun_inflection") is not None, (
            f"noun_inflection is None after detail scrape for '{noun_key}' (slug={noun_entry.get('slug')})"
        )

    def test_noun_detail_singular_and_plural_forms(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Noun singular and plural forms must have non-empty nikkud and ktiv_male."""
        noun_key, _noun_entry, updated_words = self._scrape_noun_detail(tmp_path, monkeypatch)
        ni = updated_words[noun_key].get("noun_inflection", {}) or {}
        singular = ni.get("singular") or {}
        plural = ni.get("plural") or {}
        assert singular.get("nikkud"), f"noun_inflection.singular.nikkud is empty for '{noun_key}'"
        assert singular.get("ktiv_male"), f"noun_inflection.singular.ktiv_male is empty for '{noun_key}'"
        assert plural.get("nikkud"), f"noun_inflection.plural.nikkud is empty for '{noun_key}'"
        assert plural.get("ktiv_male"), f"noun_inflection.plural.ktiv_male is empty for '{noun_key}'"

    def test_noun_detail_gender(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Noun gender must be 'masculine' or 'feminine'."""
        noun_key, _noun_entry, updated_words = self._scrape_noun_detail(tmp_path, monkeypatch)
        ni = updated_words[noun_key].get("noun_inflection", {}) or {}
        gender = ni.get("gender", "")
        assert gender in ("masculine", "feminine"), (
            f"noun_inflection.gender is {gender!r} for '{noun_key}' (expected 'masculine' or 'feminine')"
        )

    def test_noun_detail_scraped_flag(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """detail_scraped must be True after a successful noun detail scrape."""
        noun_key, _, updated_words = self._scrape_noun_detail(tmp_path, monkeypatch)
        assert updated_words[noun_key].get("detail_scraped") is True, (
            f"detail_scraped is not True after scrape for '{noun_key}'"
        )
# ---------------------------------------------------------------------------
# Test class: verb detail scrape (Hif'il)
# ---------------------------------------------------------------------------
@pytest.mark.integration
@skip_integration
class TestDetailScrapeVerb:
    """Validate pealim_detail_scrape for a known Hif'il verb (lehagid, slug HIFIL_VERB_SLUG)."""

    def _build_test_words_json(self, tmp_path: Path) -> Path:
        """
        Write a minimal words.json containing only the known Hif'il verb entry.
        The detail scraper's run() is expected to pick it up because pos starts
        with 'Verb' and detail_scraped is absent.
        """
        words_path = tmp_path / "words.json"
        entry = {
            "word": {"nikkud": HIFIL_VERB_NIKKUD, "ktiv_male": "להגיד"},
            "slug": HIFIL_VERB_SLUG,
            "root": ["נ", "ג", "ד"],
            "pos": "Verb",
            "pos_hebrew": "פֹּעַל — הִפְעִיל",
            "meaning": HIFIL_VERB_MEANING,
            "meaning_raw": HIFIL_VERB_MEANING,
            "audio_url": "",
            "audio_file": "להגיד.mp3",
            "tags": "שורש::נגד פעלים",
            "last_scrape_date": "2026-03-08",
            "vocab_legacy_guid": None,
            "frequency": None,
            "pseudo_frequency": None,
            "emoji": None,
            "emoji_source": None,
            "emoji_visible": False,
            "image": None,
            "image_source": None,
            "hint": "",
            "shared_roots": [],
            "confusable_group": None,
            "confusables_guid": None,
            "examples": None,
            "noun_inflection": None,
            "conjugation": None,
            "adjective_inflection": None,
            "preposition_inflection": None,
            # Intentionally no detail_scraped key so the scraper processes it
        }
        words = {HIFIL_VERB_NIKKUD: entry}
        with words_path.open("w", encoding="utf-8") as fh:
            json.dump(words, fh, ensure_ascii=False, indent=2)
        return words_path

    def _scrape_verb(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> dict:
        """Run the verb detail scrape on the one-entry words.json and return the re-read file."""
        words_path = self._build_test_words_json(tmp_path)
        monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
        pealim_detail_scrape.run(test=1, force_refresh=True, verbs_only=True)
        return _words_from_file(words_path)

    def test_verb_detail_conjugation_not_null(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """After detail scrape, conjugation must not be null for the Hif'il verb."""
        words = self._scrape_verb(tmp_path, monkeypatch)
        entry = words.get(HIFIL_VERB_NIKKUD, {})
        assert entry.get("conjugation") is not None, f"conjugation is None after detail scrape for {HIFIL_VERB_SLUG}"

    def test_verb_detail_binyan(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """conjugation.binyan must be "Hif'il" and binyan_hebrew must be the correct nikkud."""
        words = self._scrape_verb(tmp_path, monkeypatch)
        conj = words[HIFIL_VERB_NIKKUD].get("conjugation") or {}
        assert conj.get("binyan") == "Hif'il", f"Expected binyan='Hif\\'il', got {conj.get('binyan')!r}"
        assert conj.get("binyan_hebrew") == "הִפְעִיל", (
            f"Expected binyan_hebrew='הִפְעִיל', got {conj.get('binyan_hebrew')!r}"
        )

    def test_verb_detail_infinitive_and_reference_form(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """infinitive.nikkud and reference_form.nikkud must be non-empty Hebrew strings."""
        words = self._scrape_verb(tmp_path, monkeypatch)
        conj = words[HIFIL_VERB_NIKKUD].get("conjugation") or {}
        infinitive = conj.get("infinitive") or {}
        reference_form = conj.get("reference_form") or {}
        inf_nikkud = infinitive.get("nikkud", "")
        ref_nikkud = reference_form.get("nikkud", "")
        assert inf_nikkud and _has_hebrew(inf_nikkud), (
            f"infinitive.nikkud is empty or has no Hebrew chars: {inf_nikkud!r}"
        )
        assert ref_nikkud and _has_hebrew(ref_nikkud), (
            f"reference_form.nikkud (3ms past) is empty or has no Hebrew chars: {ref_nikkud!r}"
        )

    def test_verb_detail_active_forms_count_and_structure(
        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """active_forms must be a list of at least 20 entries, each with required sub-fields."""
        words = self._scrape_verb(tmp_path, monkeypatch)
        conj = words[HIFIL_VERB_NIKKUD].get("conjugation") or {}
        active_forms = conj.get("active_forms")
        assert isinstance(active_forms, list), f"active_forms is not a list: {type(active_forms)}"
        assert len(active_forms) >= 20, f"Expected at least 20 active forms, got {len(active_forms)}"
        for i, form in enumerate(active_forms):
            assert form.get("person"), f"active_forms[{i}].person is empty"
            assert form.get("tense"), f"active_forms[{i}].tense is empty"
            form_block = form.get("form") or {}
            assert form_block.get("nikkud") and _has_hebrew(form_block["nikkud"]), (
                f"active_forms[{i}].form.nikkud is empty or has no Hebrew: {form_block.get('nikkud')!r}"
            )
            assert form_block.get("ktiv_male") and _has_hebrew(form_block["ktiv_male"]), (
                f"active_forms[{i}].form.ktiv_male is empty or has no Hebrew: {form_block.get('ktiv_male')!r}"
            )

    def test_verb_detail_hufal_passive_section(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Hif'il verb must have a non-null hufal_pual_forms list and reference_form_passive."""
        words = self._scrape_verb(tmp_path, monkeypatch)
        conj = words[HIFIL_VERB_NIKKUD].get("conjugation") or {}
        hufal_forms = conj.get("hufal_pual_forms")
        assert hufal_forms is not None, "hufal_pual_forms is None — expected Huf'al passive section for a Hif'il verb"
        assert isinstance(hufal_forms, list), f"hufal_pual_forms is not a list: {type(hufal_forms)}"
        assert len(hufal_forms) > 0, "hufal_pual_forms list is empty"
        ref_passive = conj.get("reference_form_passive")
        assert ref_passive is not None, "reference_form_passive is None — expected a Huf'al 3ms past form"
        passive_nikkud = (ref_passive or {}).get("nikkud", "")
        assert passive_nikkud and _has_hebrew(passive_nikkud), (
            f"reference_form_passive.nikkud is empty or has no Hebrew: {passive_nikkud!r}"
        )

    def test_verb_detail_scraped_flag(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """detail_scraped must be True after a successful verb detail scrape."""
        words = self._scrape_verb(tmp_path, monkeypatch)
        entry = words.get(HIFIL_VERB_NIKKUD, {})
        assert entry.get("detail_scraped") is True, f"detail_scraped is not True after scrape for {HIFIL_VERB_SLUG}"

View file

@ -1,207 +0,0 @@
"""Tests for sentence difficulty scoring."""
import json
from pathlib import Path
import pytest
import frequency_lookup
from sentence_difficulty import DEFAULT_RANK, _resolve_token_frequency, build_nikkud_map, score_sentence
class TestBuildNikkudMap:
    """Checks that build_nikkud_map maps vocalised forms to their ktiv male spelling."""

    def test_maps_direct_headwords(self):
        """The headword's own nikkud form maps to its ktiv_male."""
        mapping = build_nikkud_map({"אָב": {"word": {"nikkud": "אָב", "ktiv_male": "אב"}}})
        assert mapping["אָב"] == "אב"

    def test_maps_conjugation_forms(self):
        """Active forms and the infinitive of a verb are included in the map."""
        conjugation = {
            "active_forms": [
                {
                    "person": "1s",
                    "tense": "עָבָר",
                    "form": {"nikkud": "שָׁמַרְתִּי", "ktiv_male": "שמרתי"},
                },
            ],
            "infinitive": {"nikkud": "לִשְׁמֹר", "ktiv_male": "לשמור"},
            "reference_form": {"nikkud": "שָׁמַר", "ktiv_male": "שמר"},
        }
        verb_entry = {
            "word": {"nikkud": "שָׁמַר", "ktiv_male": "שמר"},
            "conjugation": conjugation,
        }
        mapping = build_nikkud_map({"שָׁמַר": verb_entry})
        assert mapping["שָׁמַרְתִּי"] == "שמרתי"
        assert mapping["לִשְׁמֹר"] == "לשמור"

    def test_maps_noun_inflections(self):
        """Plural and pronominal-suffix noun forms are included in the map."""
        inflection = {
            "singular": {"nikkud": "אָב", "ktiv_male": "אב"},
            "plural": {"nikkud": "אָבוֹת", "ktiv_male": "אבות"},
            "pronominal_suffixes": {"1s": {"nikkud": "אָבִי", "ktiv_male": "אבי"}},
        }
        noun_entry = {
            "word": {"nikkud": "אָב", "ktiv_male": "אב"},
            "noun_inflection": inflection,
        }
        mapping = build_nikkud_map({"אָב": noun_entry})
        assert mapping["אָבוֹת"] == "אבות"
        assert mapping["אָבִי"] == "אבי"

    def test_maps_adjective_inflections(self):
        """Inflected adjective forms are included in the map."""
        inflection = {
            "ms": {"nikkud": "גָּדוֹל", "ktiv_male": "גדול"},
            "fs": {"nikkud": "גְּדוֹלָה", "ktiv_male": "גדולה"},
            "mp": {"nikkud": "גְּדוֹלִים", "ktiv_male": "גדולים"},
            "fp": {"nikkud": "גְּדוֹלוֹת", "ktiv_male": "גדולות"},
        }
        adjective_entry = {
            "word": {"nikkud": "גָּדוֹל", "ktiv_male": "גדול"},
            "adjective_inflection": inflection,
        }
        mapping = build_nikkud_map({"גָּדוֹל": adjective_entry})
        assert mapping["גְּדוֹלָה"] == "גדולה"
        assert mapping["גְּדוֹלִים"] == "גדולים"

    def test_construct_forms_strip_maqaf(self):
        """A construct form is reachable both with and without its trailing maqaf."""
        construct_entry = {
            "word": {"nikkud": "בֵּית", "ktiv_male": "בית"},
            "noun_inflection": {
                "construct_singular": {"nikkud": "בֵּית־", "ktiv_male": "בית"},
            },
        }
        mapping = build_nikkud_map({"בֵּית": construct_entry})
        assert "בֵּית־" in mapping
        assert "בֵּית" in mapping

    def test_handles_missing_fields(self):
        """Inflection blocks set to None do not stop the headword from being mapped."""
        sparse_entry = {
            "word": {"nikkud": "טֶסְט", "ktiv_male": "טסט"},
            "conjugation": None,
            "noun_inflection": None,
            "adjective_inflection": None,
        }
        mapping = build_nikkud_map({"test": sparse_entry})
        assert mapping["טֶסְט"] == "טסט"

    def test_real_words_json_coverage(self):
        """With the real data/words.json the map holds more than 90,000 forms; skipped if the file is absent."""
        words_path = Path(__file__).parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")
        with open(words_path, encoding="utf-8") as handle:
            real_words = json.load(handle)
        assert len(build_nikkud_map(real_words)) > 90_000
class TestResolveTokenFrequency:
    """Checks _resolve_token_frequency against the real frequency data and words.json."""

    @pytest.fixture()
    def freq_setup(self):
        """Return (nikkud_map, nikkud_index, freq_data); skips when data/words.json is absent."""
        frequency_lookup.load()
        freq_data = frequency_lookup.get_freq_data()
        words_path = Path(__file__).parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")
        with open(words_path, encoding="utf-8") as handle:
            words = json.load(handle)
        from epub_examples import _build_nikkud_index

        return build_nikkud_map(words), _build_nikkud_index(words), freq_data

    def test_tier1_known_mapping(self, freq_setup):
        """A headword resolves to a rank below 50,000."""
        rank = _resolve_token_frequency("אָב", *freq_setup)
        assert rank is not None
        assert rank < 50_000

    def test_tier3_academy_converter(self, freq_setup):
        """This token resolves to a rank below 1,000."""
        rank = _resolve_token_frequency("שָׁלוֹם", *freq_setup)
        assert rank is not None
        assert rank < 1000

    def test_unknown_token_returns_default(self, freq_setup):
        """A token with no frequency entry resolves to 50,000."""
        rank = _resolve_token_frequency("קְסַנְתּוֹפּוּלוֹס", *freq_setup)
        # NOTE(review): 50_000 presumably equals the imported DEFAULT_RANK - confirm.
        assert rank == 50_000

    def test_tier5_ktiv_male_prefix_strip(self, freq_setup):
        """The unvocalised spelling is present in the frequency data."""
        # NOTE(review): this only looks the spelling up in freq_data; it never
        # calls _resolve_token_frequency, so prefix stripping is not exercised.
        _, _, freq_data = freq_setup
        assert freq_data.get("שלום") is not None
class TestScoreSentence:
    """Checks score_sentence against the real frequency data and words.json."""

    @pytest.fixture()
    def scoring_setup(self):
        """Return (nikkud_map, nikkud_index, freq_data); skips when data/words.json is absent."""
        frequency_lookup.load()
        freq_data = frequency_lookup.get_freq_data()
        words_path = Path(__file__).parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")
        with open(words_path, encoding="utf-8") as handle:
            words = json.load(handle)
        from epub_examples import _build_nikkud_index

        return build_nikkud_map(words), _build_nikkud_index(words), freq_data

    @staticmethod
    def _span(text, target):
        """Return the (start, end) character offsets of *target*'s first occurrence in *text*."""
        start = text.index(target)
        return start, start + len(target)

    def test_returns_integer(self, scoring_setup):
        """The score is an int."""
        text = "הוּא הָלַךְ הַבַּיְתָה"
        start, end = self._span(text, "הָלַךְ")
        score = score_sentence(text, start, end, *scoring_setup)
        assert isinstance(score, int)

    def test_easy_sentence_scores_lower(self, scoring_setup):
        """The sentence labelled easy scores below the one labelled hard."""
        easy = "הוּא אָמַר שָׁלוֹם"
        easy_start, easy_end = self._span(easy, "אָמַר")
        hard = "הַפַּרְדֵּס נִשְׁתַּטֵּחַ בַּדַּהֲרָה"
        hard_start, hard_end = self._span(hard, "נִשְׁתַּטֵּחַ")
        easy_score = score_sentence(easy, easy_start, easy_end, *scoring_setup)
        hard_score = score_sentence(hard, hard_start, hard_end, *scoring_setup)
        assert easy_score < hard_score

    def test_single_context_token(self, scoring_setup):
        """A two-word sentence whose first word is the target span still yields an int."""
        text = "הוּא טוֹב"
        end = len("הוּא")
        score = score_sentence(text, 0, end, *scoring_setup)
        assert isinstance(score, int)

    def test_handles_punctuation(self, scoring_setup):
        """Quotes and an exclamation mark around the words still yield an int."""
        text = '"הוּא טוֹב!"'
        start, end = self._span(text, "טוֹב")
        score = score_sentence(text, start, end, *scoring_setup)
        assert isinstance(score, int)

    def test_splits_on_maqaf(self, scoring_setup):
        """A sentence containing a maqaf-joined pair still yields an int."""
        text = "בֵּית־סֵפֶר גָּדוֹל"
        start, end = self._span(text, "גָּדוֹל")
        score = score_sentence(text, start, end, *scoring_setup)
        assert isinstance(score, int)

    def test_no_context_tokens_returns_default(self, scoring_setup):
        """This two-letter input with the first letter as target scores DEFAULT_RANK."""
        score = score_sentence("א ב", 0, 1, *scoring_setup)
        assert score == DEFAULT_RANK

View file

@ -25,7 +25,8 @@ def test_apkg_builder_imports():
def test_data_files_exist():
data_dir = Path(__file__).resolve().parent.parent / "data"
assert (data_dir / "words.json").exists(), "words.json missing"
assert (data_dir / "hebrew_dict_for_anki.csv").exists(), "vocab CSV missing"
assert (data_dir / "conjugations.json").exists(), "conjugations cache missing"
def test_strip_nikkud_idempotent():
@ -41,18 +42,4 @@ def test_strip_nikkud_all_marks():
# Comprehensive: patach, kamatz, segol, tsere, hiriq, holam, kubutz, shva, dagesh
nikkud = "הַמַּלְכָּה"
plain = strip_nikkud(nikkud)
assert all(ch < "\u0591" or ch > "\u05c7" for ch in plain), f"Residual nikkud in: {plain}"
def test_categorize_pos_no_substring_match():
"""Regression: 'Pronoun' must NOT match 'Noun' category."""
from apkg_builder import _categorize_pos
assert _categorize_pos("Noun") == "Noun"
assert _categorize_pos("Verb") == "Verb"
assert _categorize_pos("Adjective") == "Adjective"
assert _categorize_pos("Adverb") == "Adverb"
assert _categorize_pos("Pronoun") == "Other", "Pronoun must not match Noun"
assert _categorize_pos("Preposition") == "Other"
assert _categorize_pos("Conjunction") == "Other"
assert _categorize_pos("Cardinal numeral") == "Other"
assert all(ch < "\u0591" or ch > "\u05C7" for ch in plain), f"Residual nikkud in: {plain}"

256
validate_verb_list.py Normal file
View file

@ -0,0 +1,256 @@
#!/usr/bin/env python3
"""
Validate nevo_typed_verbs_from_modern_hebrew against pealim.com.
For each verb:
1. Classifies it by position in the file (Pa'al/Nif'al/Pi'el/Pu'al/Hitpa'el/Hif'il/Huf'al)
2. Searches pealim.com to find URL slug
3. Fetches the page to confirm the binyan
4. Flags known-problem entries and detects: not-found, binyan mismatch, suspected typos
Output:
verbs_input.txt cleaned verb list for conjugation_extract.py
Printed validation report table
Usage:
python3 validate_verb_list.py
After running, review verbs_input.txt (especially REVIEW-flagged entries) before
running conjugation extraction.
"""
import re
import sys
import time
import urllib.parse
from pathlib import Path
import requests
from bs4 import BeautifulSoup
# Base URL that the search and dictionary page URLs are built from.
PEALIM_BASE = "https://www.pealim.com"
# Seconds slept before each pealim.com request.
REQUEST_DELAY = 1.5
# Passed as timeout= to every session.get call.
REQUEST_TIMEOUT = 15
# Input verb list, one entry per non-empty line, located next to this script.
SOURCE_FILE = Path(__file__).parent / "nevo_typed_verbs_from_modern_hebrew"
# Cleaned verb list named as this script's output in the module docstring.
OUTPUT_FILE = Path(__file__).parent / "verbs_input.txt"
# Known problem entries: word → (action, note)
# action: "REVIEW" = comment out and flag, "3ms" = treat as 3ms past form
KNOWN_ISSUES: dict[str, tuple[str, str]] = {
"לגבוה": ("REVIEW", "not a standard infinitive form; likely defective spelling or wrong word"),
"לההרג": ("REVIEW", "extra ה; should probably be להיהרג (Nif'al of הרג)"),
"להתלקלח": ("REVIEW", "not a real word; likely typo for להתקלקל"),
"להקלל": ("REVIEW", "ambiguous: could be Hif'il לְהָקֵל (to ease) or Nif'al of קלל"),
"המציא": ("3ms", "Hif'il 3ms past form, not an infinitive"),
"קומם": ("3ms", "ambiguous: Pu'al 3ms past; Pi'el infinitive is לְקוֹמֵם"),
}
# Expected binyan by line range (1-indexed) per plan analysis
# Each range's stop value is exclusive, so range(1, 18) covers lines 1-17.
LINE_RANGES: list[tuple[range, str]] = [
(range(1, 18), "Pa'al"),
(range(18, 29), "Nif'al"),
(range(29, 37), "Pi'el"),
(range(37, 43), "Pu'al"),
(range(43, 53), "Hitpa'el"),
(range(53, 63), "Hif'il"),
(range(63, 71), "Huf'al"),
]
# Comment-style header line for each binyan; its keys also seed the per-binyan
# section lists in main().
SECTION_HEADERS: dict[str, str] = {
"Pa'al": "# Pa'al (פָּעַל)",
"Nif'al": "# Nif'al (נִפְעַל)",
"Pi'el": "# Pi'el (פִּעֵל)",
"Pu'al": "# Pu'al (פֻּעַל) — 3ms past, no infinitive",
"Hitpa'el": "# Hitpa'el (הִתְפַּעֵל)",
"Hif'il": "# Hif'il (הִפְעִיל)",
"Huf'al": "# Huf'al (הֻפְעַל) — 3ms past, no infinitive",
}
# One shared HTTP session, created at import time, with a custom User-Agent.
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki/3.0)"})
def classify_by_line(line_num: int) -> str:
    """Map a 1-indexed source line number to its expected binyan.

    Returns "Unknown" when the number falls outside every range in LINE_RANGES.
    """
    candidates = (binyan for span, binyan in LINE_RANGES if line_num in span)
    return next(candidates, "Unknown")
def find_slug(query: str) -> str | None:
    """Search pealim.com for *query* and return the first /dict/ slug in the response.

    Returns None when the response contains no slug or when the request
    fails; failures are printed to stderr instead of being raised.
    """
    search_url = f"{PEALIM_BASE}/search/?q={urllib.parse.quote(query)}"
    try:
        response = session.get(search_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        # Only the first link matters, so stop at the first match.
        first_hit = re.search(r"/dict/(\d+[-][^/?\"<>\s]+)/", response.text)
    except Exception as e:
        print(f" ERROR searching {query!r}: {e}", file=sys.stderr)
        return None
    if first_hit is None:
        return None
    return first_hit.group(1)
def get_page_binyan(slug: str) -> str:
    """Fetch /dict/<slug>/ and return the binyan named on the page, or "" if none is found.

    The h3.page-header elements are checked first, then the og:description
    meta tag. Any error while fetching or parsing is printed to stderr and
    also yields "".
    """
    page_url = f"{PEALIM_BASE}/dict/{slug}/"
    known_binyanim = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al")

    def first_binyan_in(text):
        # Earliest name in known_binyanim order that occurs as a substring.
        return next((name for name in known_binyanim if name in text), None)

    try:
        response = session.get(page_url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        soup = BeautifulSoup(response.text, "lxml")
        for header in soup.find_all("h3", class_="page-header"):
            found = first_binyan_in(header.get_text(" ", strip=True))
            if found is not None:
                return found
        meta = soup.find("meta", {"property": "og:description"})
        if meta:
            found = first_binyan_in(meta.get("content", ""))
            if found is not None:
                return found
    except Exception as e:
        print(f" ERROR fetching {slug}: {e}", file=sys.stderr)
    return ""
def _annotated_comment(tag: str, word: str, notes: str) -> str:
    """Format a ``# TAG: word — notes`` comment line; the note part is omitted when empty."""
    note_text = notes.strip()
    line = f"# {tag}: {word}"
    return f"{line} — {note_text}" if note_text else line


def _render_verb_list(results: list[dict], section_headers: dict[str, str]) -> list[str]:
    """Group validated entries by binyan and return the lines of the cleaned verb list."""
    sections: dict[str, list[str]] = {b: [] for b in section_headers}
    review_lines: list[str] = []
    for r in results:
        binyan = r["expected_binyan"]
        if binyan not in sections:
            # Entries whose position matched no range are filed under the
            # first section rather than dropped.
            binyan = next(iter(sections))
        status = r["status"]
        if status == "REVIEW":
            review_lines.append(_annotated_comment("REVIEW", r["word"], r["notes"]))
        elif status == "3ms":
            sections[binyan].append(f"# 3ms: {r['word']}")
        elif status in ("OK", "MISMATCH"):
            sections[binyan].append(r["word"])
        else:  # NOT_FOUND
            sections[binyan].append(_annotated_comment("NOT_FOUND", r["word"], r["notes"]))

    output_lines = [
        "# Verb list — validated against pealim.com from nevo_typed_verbs_from_modern_hebrew",
        "# Lines prefixed '# 3ms:' are searched by 3ms past form (Pu'al/Huf'al).",
        "# Lines prefixed '# REVIEW:' need manual correction before conjugation extraction.",
        "# Lines prefixed '# NOT_FOUND:' had no pealim.com result — check spelling.",
        "",
    ]
    for binyan, header in section_headers.items():
        if sections.get(binyan):
            output_lines.append(header)
            output_lines.extend(sections[binyan])
            output_lines.append("")
    if review_lines:
        output_lines.append("# ── Entries flagged for manual review ──────────────────────────────────────────")
        output_lines.extend(review_lines)
        output_lines.append("")
    return output_lines


def main() -> None:
    """Validate the source verb list against pealim.com and write a cleaned list.

    Each non-blank line of SOURCE_FILE is classified by its position
    (classify_by_line), checked against KNOWN_ISSUES, and, unless it is
    flagged REVIEW, looked up on pealim.com with REQUEST_DELAY seconds of
    sleep before every request. The outcome of each entry is one of
    OK / 3ms / MISMATCH / REVIEW / NOT_FOUND. The cleaned list is written
    to OUTPUT_FILE and a report table plus summary is printed.

    Exits with status 1 when SOURCE_FILE does not exist.
    """
    if not SOURCE_FILE.exists():
        print(f"ERROR: {SOURCE_FILE} not found", file=sys.stderr)
        sys.exit(1)

    lines = [line.strip() for line in SOURCE_FILE.read_text(encoding="utf-8").splitlines() if line.strip()]
    print(f"Loaded {len(lines)} entries from {SOURCE_FILE.name}")
    print(f"Querying pealim.com (delay {REQUEST_DELAY}s per request)…\n")

    results = []
    for line_num, word in enumerate(lines, start=1):
        expected_binyan = classify_by_line(line_num)
        issue_type, issue_note = KNOWN_ISSUES.get(word, (None, ""))
        # Positions 37-42 (Pu'al) and 63-70 (Huf'al) are 3ms past forms
        is_3ms_by_position = expected_binyan in ("Pu'al", "Huf'al")
        print(f"[{line_num:2d}/{len(lines)}] {word:<20}", end=" ", flush=True)

        if issue_type == "REVIEW":
            # Don't query pealim for known-bad entries
            print("REVIEW (skipping query)")
            results.append(
                {
                    "line": line_num,
                    "word": word,
                    "expected_binyan": expected_binyan,
                    "slug": "",
                    "page_binyan": "",
                    "status": "REVIEW",
                    "notes": issue_note,
                    "is_3ms": is_3ms_by_position,
                }
            )
            continue

        time.sleep(REQUEST_DELAY)
        slug = find_slug(word)
        if slug:
            time.sleep(REQUEST_DELAY)
            page_binyan = get_page_binyan(slug)
        else:
            page_binyan = ""

        # Determine status; the 3ms classification takes precedence over
        # NOT_FOUND and MISMATCH.
        if issue_type == "3ms" or is_3ms_by_position:
            status = "3ms"
            notes = issue_note or "Pu'al/Huf'al 3ms past form"
        elif not slug:
            status = "NOT_FOUND"
            notes = "no search result on pealim.com"
        elif page_binyan and expected_binyan and page_binyan != expected_binyan:
            status = "MISMATCH"
            notes = f"expected {expected_binyan}, page says {page_binyan}"
        else:
            status = "OK"
            notes = ""

        print(f"{status:<12} slug={slug or '-':<35} binyan={page_binyan or '-'}")
        results.append(
            {
                "line": line_num,
                "word": word,
                "expected_binyan": expected_binyan,
                "slug": slug or "",
                "page_binyan": page_binyan,
                "status": status,
                "notes": notes,
                "is_3ms": is_3ms_by_position or issue_type == "3ms",
            }
        )

    # ── Write cleaned verbs_input.txt ────────────────────────────────────────────
    output_lines = _render_verb_list(results, SECTION_HEADERS)
    OUTPUT_FILE.write_text("\n".join(output_lines), encoding="utf-8")
    print(f"\nWrote → {OUTPUT_FILE}")

    # ── Print summary table ──────────────────────────────────────────────────────
    print("\n" + "=" * 95)
    print("VALIDATION REPORT")
    print("=" * 95)
    print(f"{'#':>4} {'Verb':<22} {'Status':<14} {'Slug':<38} {'Binyan':<12} Notes")
    print("-" * 95)
    for r in results:
        print(
            f"{r['line']:>4} {r['word']:<22} {r['status']:<14} "
            f"{r['slug'][:36]:<38} {r['page_binyan'] or '-':<12} {r['notes']}"
        )
    print("=" * 95)

    counts = {s: sum(1 for r in results if r["status"] == s) for s in ("OK", "3ms", "MISMATCH", "REVIEW", "NOT_FOUND")}
    print(
        f"\nSummary: {counts['OK']} OK | {counts['3ms']} 3ms-past | "
        f"{counts['MISMATCH']} MISMATCH | {counts['REVIEW']} REVIEW | {counts['NOT_FOUND']} NOT_FOUND"
    )
    print(f"Total entries: {len(results)}")
    if counts["REVIEW"] > 0 or counts["NOT_FOUND"] > 0 or counts["MISMATCH"] > 0:
        print("\n⚠ Review flagged entries in verbs_input.txt before running:\n python3 conjugation_extract.py")
# Run the validation only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()

3
vulture_whitelist.py Normal file
View file

@ -0,0 +1,3 @@
# Vulture whitelist: suppress false positives for interface methods
# HTMLParser.handle_starttag requires (self, tag, attrs) signature
# The bare name below is an intentional reference so the otherwise-unused
# `attrs` parameter is not reported as dead code.
attrs # noqa