Sprint 14: deck template/CSS overhaul + Sprint 12 detail scrape
Template & CSS fixes (15 items from Mar 9 feedback):
- Fix conjugation front showing 3ms form instead of infinitive
- Rename conjugation model to "Hebrew Conjugation"
- Strip Hebrew parenthesized text from English meanings
- Shoresh separator: spaces → dots (א.כ.ל)
- Remove duplicate English meaning from cloze back
- Remove example sentences from vocab front/back (cloze only)
- Center-align audio buttons on all decks
- Fix parenthesis spacing: "you(feminine,singular)" → "you (feminine, singular)"
- Unify sec-key/sec-label fonts, make keys bold
- Size overhaul: bigger Hebrew (42px), meaning (34px), secondary (28px)
- Center-align related words groups
- Sort confusables by average frequency
- Plurals: show Gender (Hebrew) before Mishkal, strip emoji from meaning
- Clean duplicate quotation marks in cloze sentences

Sprint 12 carry-forward (detail scrape + EPUB):
- Adjective/preposition detail scraping in pealim_detail_scrape.py
- EPUB example matching rewrite in epub_examples.py
- Delete benyehuda.py and rebuild_sentence_matches.py (merged)
- 49 parser tests for detail scraping
- SCHEMA.yaml updates for new fields

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
3b0f9defa9
commit
efd0745ada
10 changed files with 1669 additions and 741 deletions
50
SCHEMA.yaml
50
SCHEMA.yaml
|
|
@ -138,11 +138,53 @@ entry:
|
||||||
# ktiv_male: "שומר"
|
# ktiv_male: "שומר"
|
||||||
|
|
||||||
# --- Adjective-specific ---
|
# --- Adjective-specific ---
|
||||||
adjective_inflection: null # Reserved for future use
|
adjective_inflection: null # null for non-adjectives
|
||||||
# When populated:
|
# When populated:
|
||||||
# ms/fs/mp/fp forms with nikkud/ktiv_male subfields
|
# ms:
|
||||||
|
# nikkud: "גָּדוֹל"
|
||||||
|
# ktiv_male: "גדול"
|
||||||
|
# fs:
|
||||||
|
# nikkud: "גְּדוֹלָה"
|
||||||
|
# ktiv_male: "גדולה"
|
||||||
|
# mp:
|
||||||
|
# nikkud: "גְּדוֹלִים"
|
||||||
|
# ktiv_male: "גדולים"
|
||||||
|
# fp:
|
||||||
|
# nikkud: "גְּדוֹלוֹת"
|
||||||
|
# ktiv_male: "גדולות"
|
||||||
|
# mishkal: "CaCaC" # English mishkal name (scraped from pealim PoS section)
|
||||||
|
# mishkal_hebrew: "קָטָל" # Hebrew mishkal name (computed via mapping)
|
||||||
|
|
||||||
# --- Preposition-specific ---
|
# --- Preposition-specific ---
|
||||||
preposition_inflection: null # Reserved for future use
|
preposition_inflection: null # null for non-prepositions
|
||||||
# When populated:
|
# When populated:
|
||||||
# Inflected forms with pronominal suffixes (e.g. שלי, שלך, שלו...)
|
# 1s:
|
||||||
|
# nikkud: "שֶׁלִּי"
|
||||||
|
# ktiv_male: "שלי"
|
||||||
|
# 1p:
|
||||||
|
# nikkud: "שֶׁלָּנוּ"
|
||||||
|
# ktiv_male: "שלנו"
|
||||||
|
# 2ms:
|
||||||
|
# nikkud: "שֶׁלְּךָ"
|
||||||
|
# ktiv_male: "שלך"
|
||||||
|
# 2fs:
|
||||||
|
# nikkud: "שֶׁלָּךְ"
|
||||||
|
# ktiv_male: "שלך"
|
||||||
|
# 2mp:
|
||||||
|
# nikkud: "שֶׁלָּכֶם"
|
||||||
|
# ktiv_male: "שלכם"
|
||||||
|
# 2fp:
|
||||||
|
# nikkud: "שֶׁלָּכֶן"
|
||||||
|
# ktiv_male: "שלכן"
|
||||||
|
# 3ms:
|
||||||
|
# nikkud: "שֶׁלּוֹ"
|
||||||
|
# ktiv_male: "שלו"
|
||||||
|
# 3fs:
|
||||||
|
# nikkud: "שֶׁלָּהּ"
|
||||||
|
# ktiv_male: "שלה"
|
||||||
|
# 3mp:
|
||||||
|
# nikkud: "שֶׁלָּהֶם"
|
||||||
|
# ktiv_male: "שלהם"
|
||||||
|
# 3fp:
|
||||||
|
# nikkud: "שֶׁלָּהֶן"
|
||||||
|
# ktiv_male: "שלהן"
|
||||||
|
|
|
||||||
138
apkg_builder.py
138
apkg_builder.py
|
|
@ -35,7 +35,7 @@ COMPLETE_PLURAL_DECK_ID = 1_234_567_903
|
||||||
|
|
||||||
# Release version tag added to all notes so users can identify which release
|
# Release version tag added to all notes so users can identify which release
|
||||||
# their cards come from (visible in Anki's Browse view and card info).
|
# their cards come from (visible in Anki's Browse view and card info).
|
||||||
RELEASE_TAG = "v0.15.1"
|
RELEASE_TAG = "v0.16"
|
||||||
|
|
||||||
# Regex for extracting emoji and Hebrew prepositions from meaning strings
|
# Regex for extracting emoji and Hebrew prepositions from meaning strings
|
||||||
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
|
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
|
||||||
|
|
@ -117,13 +117,15 @@ CARD_CSS = """
|
||||||
.card {
|
.card {
|
||||||
font-family: 'Heebo', 'Arial Hebrew', 'David', Arial, sans-serif;
|
font-family: 'Heebo', 'Arial Hebrew', 'David', Arial, sans-serif;
|
||||||
font-size: 20px;
|
font-size: 20px;
|
||||||
text-align: center;
|
text-align: right;
|
||||||
color: #222;
|
color: #222;
|
||||||
background: #fff;
|
background: #fff;
|
||||||
padding: 16px;
|
padding: 16px;
|
||||||
|
max-width: 600px;
|
||||||
|
margin: 0 auto;
|
||||||
}
|
}
|
||||||
.hebrew {
|
.hebrew {
|
||||||
font-size: 36px;
|
font-size: 42px;
|
||||||
font-weight: bold;
|
font-weight: bold;
|
||||||
direction: rtl;
|
direction: rtl;
|
||||||
text-align: center;
|
text-align: center;
|
||||||
|
|
@ -131,32 +133,34 @@ CARD_CSS = """
|
||||||
color: #222;
|
color: #222;
|
||||||
}
|
}
|
||||||
.hebrew-sm {
|
.hebrew-sm {
|
||||||
font-size: 24px;
|
font-size: 30px;
|
||||||
font-weight: normal;
|
font-weight: normal;
|
||||||
direction: rtl;
|
direction: rtl;
|
||||||
text-align: center;
|
text-align: center;
|
||||||
color: #333;
|
color: #222;
|
||||||
}
|
}
|
||||||
.meaning {
|
.meaning {
|
||||||
font-size: 28px;
|
font-size: 34px;
|
||||||
color: #1a1a8c;
|
color: #1a1a8c;
|
||||||
margin: 8px 0;
|
margin: 8px 0;
|
||||||
|
text-align: center;
|
||||||
}
|
}
|
||||||
.hint {
|
.hint {
|
||||||
font-size: 16px;
|
font-size: 22px;
|
||||||
color: #888;
|
color: #555;
|
||||||
margin: 4px 0;
|
margin: 4px 0;
|
||||||
direction: rtl;
|
direction: rtl;
|
||||||
|
text-align: center;
|
||||||
}
|
}
|
||||||
.root-info {
|
.root-info {
|
||||||
font-size: 18px;
|
font-size: 26px;
|
||||||
color: #555;
|
color: #222;
|
||||||
margin-top: 6px;
|
margin-top: 6px;
|
||||||
direction: rtl;
|
direction: rtl;
|
||||||
}
|
}
|
||||||
.example {
|
.example {
|
||||||
font-size: 18px;
|
font-size: 24px;
|
||||||
color: #444;
|
color: #222;
|
||||||
direction: rtl;
|
direction: rtl;
|
||||||
text-align: right;
|
text-align: right;
|
||||||
font-style: italic;
|
font-style: italic;
|
||||||
|
|
@ -182,16 +186,17 @@ CARD_CSS = """
|
||||||
color: #555;
|
color: #555;
|
||||||
}
|
}
|
||||||
.sec-label {
|
.sec-label {
|
||||||
font-size: 20px;
|
font-size: 28px;
|
||||||
font-weight: normal;
|
font-weight: normal;
|
||||||
color: #555;
|
color: #222;
|
||||||
direction: rtl;
|
direction: rtl;
|
||||||
text-align: center;
|
text-align: center;
|
||||||
margin-top: 6px;
|
margin-top: 6px;
|
||||||
}
|
}
|
||||||
.sec-key {
|
.sec-key {
|
||||||
font-size: 18px;
|
font-size: 28px;
|
||||||
color: #888;
|
color: #222;
|
||||||
|
font-weight: bold;
|
||||||
}
|
}
|
||||||
.definitions {
|
.definitions {
|
||||||
direction: rtl;
|
direction: rtl;
|
||||||
|
|
@ -199,32 +204,37 @@ CARD_CSS = """
|
||||||
}
|
}
|
||||||
.conf-entry {
|
.conf-entry {
|
||||||
margin: 8px 0;
|
margin: 8px 0;
|
||||||
font-size: 20px;
|
font-size: 28px;
|
||||||
direction: rtl;
|
direction: rtl;
|
||||||
}
|
}
|
||||||
.related-group {
|
.related-group {
|
||||||
direction: rtl;
|
direction: rtl;
|
||||||
text-align: right;
|
text-align: center;
|
||||||
margin: 2px 0;
|
margin: 2px 0;
|
||||||
font-size: 18px;
|
font-size: 26px;
|
||||||
}
|
}
|
||||||
.emoji-img {
|
.emoji-img {
|
||||||
font-size: 3.5em;
|
font-size: 3.5em;
|
||||||
text-align: center;
|
text-align: center;
|
||||||
margin: 0.3em 0;
|
margin: 0.3em 0;
|
||||||
}
|
}
|
||||||
|
.card [type="button"], .card button, .replay-button {
|
||||||
|
display: block !important;
|
||||||
|
margin: 4px auto !important;
|
||||||
|
text-align: center;
|
||||||
|
}
|
||||||
@media (prefers-color-scheme: dark) {
|
@media (prefers-color-scheme: dark) {
|
||||||
.card { color: #e8e8e8; background: #1c1c1e; }
|
.card { color: #e8e8e8; background: #1c1c1e; }
|
||||||
.hebrew { color: #f0f0f0; }
|
.hebrew { color: #f0f0f0; }
|
||||||
.hebrew-sm { color: #ddd; }
|
.hebrew-sm { color: #e0e0e0; }
|
||||||
.meaning { color: #82b0ff; }
|
.meaning { color: #82b0ff; }
|
||||||
.root-info { color: #aaa; }
|
.root-info { color: #e0e0e0; }
|
||||||
.sec-label { color: #aaa; }
|
.sec-label { color: #e0e0e0; }
|
||||||
.sec-key { color: #666; }
|
.sec-key { color: #e0e0e0; }
|
||||||
.conf-entry { color: #ddd; }
|
.conf-entry { color: #ddd; }
|
||||||
.hint { color: #777; }
|
.hint { color: #777; }
|
||||||
.voice-label { color: #888; }
|
.voice-label { color: #888; }
|
||||||
.example { color: #bbb; border-right-color: #555; }
|
.example { color: #e0e0e0; border-right-color: #555; }
|
||||||
.divider { border-top-color: #333; }
|
.divider { border-top-color: #333; }
|
||||||
.freq-badge { color: #888; border-color: #444; }
|
.freq-badge { color: #888; border-color: #444; }
|
||||||
}
|
}
|
||||||
|
|
@ -252,9 +262,6 @@ VOCAB_BACK_HEB = """
|
||||||
<div class="root-info">{{SharedRoots}}</div>
|
<div class="root-info">{{SharedRoots}}</div>
|
||||||
{{/SharedRoots}}
|
{{/SharedRoots}}
|
||||||
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
|
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
|
||||||
{{#Example}}
|
|
||||||
<div class="example">{{Example}}</div>
|
|
||||||
{{/Example}}
|
|
||||||
{{#Frequency}}<div class="freq-badge">#{{Frequency}}</div>{{/Frequency}}
|
{{#Frequency}}<div class="freq-badge">#{{Frequency}}</div>{{/Frequency}}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -273,14 +280,15 @@ VOCAB_BACK_ENG = """
|
||||||
{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span> {{WordNoNikkud}}</div>{{/WordNoNikkud}}
|
{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span> {{WordNoNikkud}}</div>{{/WordNoNikkud}}
|
||||||
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>{{/Root}}
|
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>{{/Root}}
|
||||||
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span> {{PoS}}</div>{{/PoS}}
|
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span> {{PoS}}</div>{{/PoS}}
|
||||||
|
{{#SharedRoots}}
|
||||||
|
<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
|
||||||
|
<div class="root-info">{{SharedRoots}}</div>
|
||||||
|
{{/SharedRoots}}
|
||||||
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
|
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
|
||||||
{{#Example}}
|
|
||||||
<div class="example">{{Example}}</div>
|
|
||||||
{{/Example}}
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
VOCAB_FRONT_CLOZE = """
|
VOCAB_FRONT_CLOZE = """
|
||||||
<div class="example" style="font-size:24px;font-style:normal;border:none;padding:0;text-align:center;">{{ClozeExample}}</div>
|
<div class="example" style="font-size:32px;font-style:normal;border:none;padding:0;text-align:center;">{{ClozeExample}}</div>
|
||||||
{{#ClozeHint}}<div class="hint">{{ClozeHint}}</div>{{/ClozeHint}}
|
{{#ClozeHint}}<div class="hint">{{ClozeHint}}</div>{{/ClozeHint}}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -289,7 +297,6 @@ VOCAB_BACK_CLOZE = """
|
||||||
<div class="divider"></div>
|
<div class="divider"></div>
|
||||||
<div class="hebrew">{{Word}}</div>
|
<div class="hebrew">{{Word}}</div>
|
||||||
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
|
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
|
||||||
<div class="meaning">{{Meaning}}</div>
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
VOCAB_MODEL = genanki.Model(
|
VOCAB_MODEL = genanki.Model(
|
||||||
|
|
@ -343,8 +350,8 @@ VOCAB_MODEL = genanki.Model(
|
||||||
|
|
||||||
CONJ_FRONT = """
|
CONJ_FRONT = """
|
||||||
<div class="sec-label">אֵיךְ אוֹמְרִים</div>
|
<div class="sec-label">אֵיךְ אוֹמְרִים</div>
|
||||||
<div class="hebrew" style="color:#1a1a8c;">{{ReferenceForm}}{{#Prep}} ({{Prep}}){{/Prep}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
|
|
||||||
<div class="hebrew">{{Pronoun}}</div>
|
<div class="hebrew">{{Pronoun}}</div>
|
||||||
|
<div class="hebrew" style="color:#1a1a8c;">{{Infinitive}}{{#Prep}} <span class="hebrew-sm">({{Prep}})</span>{{/Prep}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
|
||||||
<div class="hebrew">{{Tense}}</div>
|
<div class="hebrew">{{Tense}}</div>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -363,7 +370,7 @@ CONJ_CSS = CARD_CSS
|
||||||
|
|
||||||
CONJ_MODEL = genanki.Model(
|
CONJ_MODEL = genanki.Model(
|
||||||
CONJ_MODEL_ID,
|
CONJ_MODEL_ID,
|
||||||
"Pealim Conjugation",
|
"Hebrew Conjugation",
|
||||||
fields=[
|
fields=[
|
||||||
{"name": "Infinitive"},
|
{"name": "Infinitive"},
|
||||||
{"name": "ReferenceForm"},
|
{"name": "ReferenceForm"},
|
||||||
|
|
@ -666,8 +673,9 @@ def _load_emoji_lookup() -> dict[str, str]:
|
||||||
|
|
||||||
def _categorize_pos(pos_str: str) -> str:
|
def _categorize_pos(pos_str: str) -> str:
|
||||||
"""Return the canonical PoS category key for grouping."""
|
"""Return the canonical PoS category key for grouping."""
|
||||||
|
base = pos_str.split("–")[0].split("—")[0].strip()
|
||||||
for cat in POS_CATEGORY_LABELS:
|
for cat in POS_CATEGORY_LABELS:
|
||||||
if cat.lower() in pos_str.lower():
|
if base == cat:
|
||||||
return cat
|
return cat
|
||||||
return "Other"
|
return "Other"
|
||||||
|
|
||||||
|
|
@ -745,10 +753,14 @@ def build_vocab_deck(
|
||||||
word_nikkud = entry["word"]["nikkud"]
|
word_nikkud = entry["word"]["nikkud"]
|
||||||
word_no_nik = entry["word"].get("ktiv_male", "")
|
word_no_nik = entry["word"].get("ktiv_male", "")
|
||||||
root_list = entry.get("root") or []
|
root_list = entry.get("root") or []
|
||||||
root = " ".join(root_list)
|
root = ".".join(root_list)
|
||||||
pos_raw = entry.get("pos", "")
|
pos_raw = entry.get("pos", "")
|
||||||
pos_heb = entry.get("pos_hebrew", "")
|
pos_heb = entry.get("pos_hebrew", "")
|
||||||
meaning = entry.get("meaning", "") or ""
|
meaning = EMOJI_RE.sub("", entry.get("meaning", "") or "").strip()
|
||||||
|
meaning = HBPAREN_RE.sub("", meaning).strip()
|
||||||
|
meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ")
|
||||||
|
meaning = re.sub(r"(\w)\(", r"\1 (", meaning) # space before opening paren
|
||||||
|
meaning = re.sub(r",(\S)", r", \1", meaning) # space after comma
|
||||||
meaning_raw = entry.get("meaning_raw", "") or ""
|
meaning_raw = entry.get("meaning_raw", "") or ""
|
||||||
slug = entry.get("slug", "") or ""
|
slug = entry.get("slug", "") or ""
|
||||||
frequency = entry.get("frequency") or 999_999
|
frequency = entry.get("frequency") or 999_999
|
||||||
|
|
@ -839,6 +851,9 @@ def build_vocab_deck(
|
||||||
end = cloze_data.get("cloze_word_end")
|
end = cloze_data.get("cloze_word_end")
|
||||||
if cloze_text and start is not None and end is not None:
|
if cloze_text and start is not None and end is not None:
|
||||||
cloze_example = cloze_text[:start] + "_____" + cloze_text[end:]
|
cloze_example = cloze_text[:start] + "_____" + cloze_text[end:]
|
||||||
|
# Clean up duplicate/misplaced quotation marks
|
||||||
|
cloze_example = re.sub(r'["״]\s*["״]', '"', cloze_example)
|
||||||
|
cloze_example = re.sub(r'^\s*["״]\s*', "", cloze_example)
|
||||||
raw_hint = cloze_data.get("cloze_hint") or ""
|
raw_hint = cloze_data.get("cloze_hint") or ""
|
||||||
if raw_hint:
|
if raw_hint:
|
||||||
cloze_hint = raw_hint
|
cloze_hint = raw_hint
|
||||||
|
|
@ -871,11 +886,12 @@ def build_vocab_deck(
|
||||||
parts.append(f'<div class="related-group"><b>{label}:</b> {" ".join(rw_words)}</div>')
|
parts.append(f'<div class="related-group"><b>{label}:</b> {" ".join(rw_words)}</div>')
|
||||||
related_html = "\n".join(parts)
|
related_html = "\n".join(parts)
|
||||||
|
|
||||||
# Plural form (for nouns)
|
# Plural form (nouns only — guard against adjective/verb inflection bleed)
|
||||||
plural_str = ""
|
plural_str = ""
|
||||||
noun_inflection = entry.get("noun_inflection")
|
if pos_raw.startswith("Noun"):
|
||||||
if noun_inflection and noun_inflection.get("plural"):
|
noun_inflection = entry.get("noun_inflection")
|
||||||
plural_str = noun_inflection["plural"].get("nikkud", "")
|
if noun_inflection and noun_inflection.get("plural"):
|
||||||
|
plural_str = noun_inflection["plural"].get("nikkud", "")
|
||||||
|
|
||||||
# Image
|
# Image
|
||||||
image_tag = ""
|
image_tag = ""
|
||||||
|
|
@ -977,18 +993,28 @@ def build_conj_deck(
|
||||||
binyan_heb: str = conj.get("binyan_hebrew") or BINYAN_TO_HEBREW.get(binyan, binyan) or ""
|
binyan_heb: str = conj.get("binyan_hebrew") or BINYAN_TO_HEBREW.get(binyan, binyan) or ""
|
||||||
slug = entry.get("slug", "") or ""
|
slug = entry.get("slug", "") or ""
|
||||||
root_list = entry.get("root") or []
|
root_list = entry.get("root") or []
|
||||||
root = " ".join(root_list)
|
root = ".".join(root_list)
|
||||||
voice = VOICE_MAP.get(binyan, "")
|
voice = VOICE_MAP.get(binyan, "")
|
||||||
|
|
||||||
|
meaning_raw = entry.get("meaning_raw", "") or ""
|
||||||
meaning = entry.get("meaning", "") or ""
|
meaning = entry.get("meaning", "") or ""
|
||||||
# Extract Hebrew preposition from meaning_raw
|
# Extract Hebrew preposition — strip from meaning, show on Hebrew side
|
||||||
prep_str = ""
|
prep_str = ""
|
||||||
conj_prep = conj.get("prep")
|
conj_prep = conj.get("prep")
|
||||||
if conj_prep:
|
if conj_prep:
|
||||||
prep_str = f"({conj_prep})"
|
# Strip any parentheses from stored prep value
|
||||||
elif meaning:
|
prep_str = conj_prep.strip("() ")
|
||||||
preps = HBPAREN_RE.findall(entry.get("meaning_raw", "") or "")
|
elif meaning_raw:
|
||||||
prep_str = " ".join(f"({p})" for p in preps)
|
preps = HBPAREN_RE.findall(meaning_raw)
|
||||||
|
if preps:
|
||||||
|
prep_str = preps[0]
|
||||||
|
# Strip Hebrew prepositions from English meaning to avoid duplication
|
||||||
|
if prep_str:
|
||||||
|
meaning = HBPAREN_RE.sub("", meaning).strip()
|
||||||
|
# Also strip from meaning_raw patterns like "(על)"
|
||||||
|
meaning = re.sub(r"\(\s*" + re.escape(prep_str) + r"\s*-?\s*\)", "", meaning).strip()
|
||||||
|
# Clean up double spaces and trailing commas
|
||||||
|
meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ")
|
||||||
|
|
||||||
related = [w for w in root_words.get(root, []) if w != infinitive]
|
related = [w for w in root_words.get(root, []) if w != infinitive]
|
||||||
related_str = " ".join(related[:8]) if related else ""
|
related_str = " ".join(related[:8]) if related else ""
|
||||||
|
|
@ -1024,7 +1050,7 @@ def build_conj_deck(
|
||||||
elif guid_candidates:
|
elif guid_candidates:
|
||||||
note_guid = guid_candidates[0]
|
note_guid = guid_candidates[0]
|
||||||
else:
|
else:
|
||||||
note_guid = genanki.guid_for(_infinitive, pronoun, tense)
|
note_guid = genanki.guid_for(_infinitive, pronoun, tense, _binyan_heb)
|
||||||
note = genanki.Note(
|
note = genanki.Note(
|
||||||
model=CONJ_MODEL,
|
model=CONJ_MODEL,
|
||||||
guid=note_guid,
|
guid=note_guid,
|
||||||
|
|
@ -1213,8 +1239,10 @@ def build_conj_deck(
|
||||||
# ──────────────────────────────────────────────────────────────────────────────
|
# ──────────────────────────────────────────────────────────────────────────────
|
||||||
|
|
||||||
CONF_FRONT = """
|
CONF_FRONT = """
|
||||||
|
<div style="direction:rtl; text-align:center;">
|
||||||
<div class="hebrew" style="font-size:36px;">{{Words}}</div>
|
<div class="hebrew" style="font-size:36px;">{{Words}}</div>
|
||||||
<div class="meaning" style="font-size:32px; direction:rtl; text-align:center;">מה ההבדל?</div>
|
<div class="meaning" style="font-size:32px; direction:rtl; text-align:center;">מה ההבדל?</div>
|
||||||
|
</div>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
CONF_BACK = """
|
CONF_BACK = """
|
||||||
|
|
@ -1271,7 +1299,10 @@ def build_confusables_deck(
|
||||||
guid = genanki.guid_for("confusable", entry["word"].get("ktiv_male", unique_key))
|
guid = genanki.guid_for("confusable", entry["word"].get("ktiv_male", unique_key))
|
||||||
guid_to_entries.setdefault(guid, []).append(entry)
|
guid_to_entries.setdefault(guid, []).append(entry)
|
||||||
|
|
||||||
for guid, group_entries in sorted(guid_to_entries.items(), key=lambda x: x[0]):
|
for guid, group_entries in sorted(
|
||||||
|
guid_to_entries.items(),
|
||||||
|
key=lambda x: sum(e.get("frequency") or 999_999 for e in x[1]) / len(x[1]),
|
||||||
|
):
|
||||||
if guid in seen_guids:
|
if guid in seen_guids:
|
||||||
continue
|
continue
|
||||||
seen_guids.add(guid)
|
seen_guids.add(guid)
|
||||||
|
|
@ -1366,6 +1397,7 @@ PLURAL_BACK_SG = """
|
||||||
{{FrontSide}}<hr>
|
{{FrontSide}}<hr>
|
||||||
<div class="hebrew">{{Plural}}</div>
|
<div class="hebrew">{{Plural}}</div>
|
||||||
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
|
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
|
||||||
|
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span> {{Gender}}</div>{{/Gender}}
|
||||||
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
|
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -1380,6 +1412,7 @@ PLURAL_BACK_PL = """
|
||||||
<div class="hebrew">{{Singular}}</div>
|
<div class="hebrew">{{Singular}}</div>
|
||||||
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
|
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
|
||||||
<div class="sec-label">{{Meaning}}</div>
|
<div class="sec-label">{{Meaning}}</div>
|
||||||
|
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span> {{Gender}}</div>{{/Gender}}
|
||||||
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
|
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -1483,10 +1516,11 @@ def build_plural_deck(
|
||||||
plural = noun_inflection["plural"]["nikkud"]
|
plural = noun_inflection["plural"]["nikkud"]
|
||||||
plural_ktiv = noun_inflection["plural"].get("ktiv_male", "")
|
plural_ktiv = noun_inflection["plural"].get("ktiv_male", "")
|
||||||
gender = noun_inflection.get("gender") or ""
|
gender = noun_inflection.get("gender") or ""
|
||||||
|
gender_heb = {"masculine": "זָכָר", "feminine": "נְקֵבָה"}.get(gender, gender)
|
||||||
mishkal = noun_inflection.get("mishkal") or ""
|
mishkal = noun_inflection.get("mishkal") or ""
|
||||||
meaning = entry.get("meaning") or ""
|
meaning = EMOJI_RE.sub("", entry.get("meaning") or "").strip()
|
||||||
root_list = entry.get("root") or []
|
root_list = entry.get("root") or []
|
||||||
root = " ".join(root_list)
|
root = ".".join(root_list)
|
||||||
|
|
||||||
# GUID from noun_inflection
|
# GUID from noun_inflection
|
||||||
note_guid_raw = noun_inflection.get("plurals_guid")
|
note_guid_raw = noun_inflection.get("plurals_guid")
|
||||||
|
|
@ -1520,7 +1554,7 @@ def build_plural_deck(
|
||||||
meaning,
|
meaning,
|
||||||
root,
|
root,
|
||||||
mishkal,
|
mishkal,
|
||||||
gender,
|
gender_heb,
|
||||||
],
|
],
|
||||||
tags=tags,
|
tags=tags,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
202
benyehuda.py
202
benyehuda.py
|
|
@ -1,202 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Ben Yehuda corpus example-sentence lookup (nikkud corpus).
|
|
||||||
|
|
||||||
TODO: Rewrite to update words.json examples fields directly instead of
|
|
||||||
writing to a separate examples_cache.json. Currently the migration script
|
|
||||||
bridges the gap. See Phase 5 in SPRINT_LOG.md.
|
|
||||||
|
|
||||||
Downloads the nikkud-bearing plaintext ZIP once, indexes sentences by nikkud word form,
|
|
||||||
then answers queries locally.
|
|
||||||
|
|
||||||
Exposed API:
|
|
||||||
load(force_rebuild=False)
|
|
||||||
get_examples(word_nikkud) -> list[str] (returns 0 or 1 examples)
|
|
||||||
save_examples_cache()
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
import zipfile
|
|
||||||
from io import BytesIO
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import requests
|
|
||||||
|
|
||||||
from helpers import strip_nikkud as _strip_nikkud
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
# Nikkud-bearing corpus (txt.zip instead of txt_stripped.zip)
|
|
||||||
CORPUS_URL = "https://github.com/projectbenyehuda/public_domain_dump/releases/download/2025-10/txt.zip"
|
|
||||||
INDEX_PATH = Path(__file__).parent / "data" / "benyehuda_index.json"
|
|
||||||
EXAMPLES_CACHE_PATH = Path(__file__).parent / "data" / "examples_cache.json"
|
|
||||||
REQUEST_TIMEOUT = 120
|
|
||||||
MIN_SENTENCE_LEN = 20
|
|
||||||
MAX_SENTENCE_LEN = 200
|
|
||||||
MAX_INDEX_ENTRIES = 500 # cap examples kept per word in index to limit memory
|
|
||||||
|
|
||||||
# Module-level state
|
|
||||||
_index: dict[str, list[str]] = {} # word (with nikkud) -> [sentence, ...]
|
|
||||||
_examples_cache: dict[str, list[str]] = {} # word -> cached result for this run
|
|
||||||
|
|
||||||
|
|
||||||
def _split_sentences(text: str) -> list[str]:
|
|
||||||
"""
|
|
||||||
Split text into sentences on newlines only (Hebrew sentences don't have
|
|
||||||
mid-word period issues like English). Min 20 chars, max 200 chars.
|
|
||||||
"""
|
|
||||||
out = []
|
|
||||||
for line in text.split("\n"):
|
|
||||||
s = line.strip().strip("\"'.,;:!?")
|
|
||||||
s = s.strip()
|
|
||||||
if MIN_SENTENCE_LEN <= len(s) <= MAX_SENTENCE_LEN:
|
|
||||||
out.append(s)
|
|
||||||
return out
|
|
||||||
|
|
||||||
|
|
||||||
def _build_index(corpus_zip_bytes: bytes) -> None:
|
|
||||||
"""Parse corpus ZIP and build word (nikkud) → sentences index."""
|
|
||||||
global _index
|
|
||||||
_index = {}
|
|
||||||
logger.info("Building Ben Yehuda index from nikkud corpus …")
|
|
||||||
|
|
||||||
with zipfile.ZipFile(BytesIO(corpus_zip_bytes)) as zf:
|
|
||||||
txt_files = [n for n in zf.namelist() if n.endswith(".txt")]
|
|
||||||
logger.info(f" Corpus contains {len(txt_files)} text files")
|
|
||||||
for fname in txt_files:
|
|
||||||
try:
|
|
||||||
raw = zf.read(fname).decode("utf-8", errors="ignore")
|
|
||||||
except Exception: # noqa: S112
|
|
||||||
continue
|
|
||||||
for sentence in _split_sentences(raw):
|
|
||||||
# Index by each unique Hebrew token (with nikkud) in the sentence
|
|
||||||
words = re.findall(r"[\u05d0-\u05ea\u05b0-\u05c7'\"]+", sentence)
|
|
||||||
for w in set(words):
|
|
||||||
if len(w) >= 2:
|
|
||||||
bucket = _index.setdefault(w, [])
|
|
||||||
if len(bucket) < MAX_INDEX_ENTRIES:
|
|
||||||
bucket.append(sentence)
|
|
||||||
|
|
||||||
logger.info(f"Index built: {len(_index)} unique word forms")
|
|
||||||
|
|
||||||
|
|
||||||
def _save_index() -> None:
|
|
||||||
INDEX_PATH.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
with open(INDEX_PATH, "w", encoding="utf-8") as f:
|
|
||||||
json.dump(_index, f, ensure_ascii=False)
|
|
||||||
logger.info(f"Ben Yehuda index saved → {INDEX_PATH}")
|
|
||||||
|
|
||||||
|
|
||||||
def _load_index() -> None:
|
|
||||||
global _index
|
|
||||||
with open(INDEX_PATH, encoding="utf-8") as f:
|
|
||||||
_index = json.load(f)
|
|
||||||
logger.info(f"Ben Yehuda index loaded: {len(_index)} word forms")
|
|
||||||
|
|
||||||
|
|
||||||
def load(force_rebuild: bool = False) -> None:
|
|
||||||
"""Load or build the Ben Yehuda index. Downloads corpus if needed."""
|
|
||||||
global _index, _examples_cache
|
|
||||||
if _index and not force_rebuild:
|
|
||||||
return
|
|
||||||
|
|
||||||
if force_rebuild:
|
|
||||||
# Delete old index and discard examples cache
|
|
||||||
if INDEX_PATH.exists():
|
|
||||||
INDEX_PATH.unlink()
|
|
||||||
logger.info("Deleted old Ben Yehuda index (force rebuild)")
|
|
||||||
_examples_cache = {}
|
|
||||||
else:
|
|
||||||
# Load persisted examples cache (not needed on rebuild)
|
|
||||||
if EXAMPLES_CACHE_PATH.exists():
|
|
||||||
with open(EXAMPLES_CACHE_PATH, encoding="utf-8") as f:
|
|
||||||
_examples_cache = json.load(f)
|
|
||||||
|
|
||||||
if INDEX_PATH.exists():
|
|
||||||
_load_index()
|
|
||||||
return
|
|
||||||
|
|
||||||
logger.info("Downloading Ben Yehuda nikkud corpus … (this may take 2-3 minutes)")
|
|
||||||
resp = requests.get(CORPUS_URL, timeout=REQUEST_TIMEOUT, stream=True)
|
|
||||||
resp.raise_for_status()
|
|
||||||
data = resp.content
|
|
||||||
logger.info(f"Corpus downloaded: {len(data) / 1e6:.1f} MB")
|
|
||||||
|
|
||||||
_build_index(data)
|
|
||||||
_save_index()
|
|
||||||
|
|
||||||
|
|
||||||
def save_examples_cache() -> None:
|
|
||||||
EXAMPLES_CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
|
|
||||||
with open(EXAMPLES_CACHE_PATH, "w", encoding="utf-8") as f:
|
|
||||||
json.dump(_examples_cache, f, ensure_ascii=False)
|
|
||||||
logger.info(f"Examples cache saved: {len(_examples_cache)} entries → {EXAMPLES_CACHE_PATH}")
|
|
||||||
|
|
||||||
|
|
||||||
def get_examples(word_nikkud: str, confusable_consonants: set[str] | None = None) -> list[str]:
|
|
||||||
"""
|
|
||||||
Return 0 or 1 example sentences for the given word (nikkud form).
|
|
||||||
|
|
||||||
Lookup strategy:
|
|
||||||
1. Try exact nikkud match in index.
|
|
||||||
2. Fall back to stripped (no-nikkud) match against index keys.
|
|
||||||
Skipped when word's consonants are in confusable_consonants set
|
|
||||||
(to avoid returning sentences for the wrong homograph).
|
|
||||||
|
|
||||||
Returns the single longest sentence ≤ MAX_SENTENCE_LEN that contains
|
|
||||||
the word as a whole token.
|
|
||||||
"""
|
|
||||||
if not _index:
|
|
||||||
load()
|
|
||||||
|
|
||||||
word = word_nikkud.strip()
|
|
||||||
word_stripped = _strip_nikkud(word)
|
|
||||||
|
|
||||||
cache_key = word
|
|
||||||
|
|
||||||
if cache_key in _examples_cache:
|
|
||||||
return _examples_cache[cache_key]
|
|
||||||
|
|
||||||
# Lookup: try exact nikkud first, then stripped fallback
|
|
||||||
candidates = _index.get(word, [])
|
|
||||||
if not candidates and word_stripped and word_stripped not in (confusable_consonants or set()):
|
|
||||||
# Try looking up by stripped form across index keys
|
|
||||||
for k, v in _index.items():
|
|
||||||
if _strip_nikkud(k) == word_stripped:
|
|
||||||
candidates = v
|
|
||||||
break
|
|
||||||
|
|
||||||
# Filter: word must appear as a whole token
|
|
||||||
# Match the stripped form (for robustness with nikkud variants in sentence)
|
|
||||||
if word_stripped:
|
|
||||||
pattern = r"(?<!\w)" + re.escape(word_stripped) + r"(?!\w)"
|
|
||||||
matched = [s for s in candidates if re.search(pattern, _strip_nikkud(s))]
|
|
||||||
else:
|
|
||||||
matched = candidates[:]
|
|
||||||
|
|
||||||
# Filter by length
|
|
||||||
matched = [s for s in matched if MIN_SENTENCE_LEN <= len(s) <= MAX_SENTENCE_LEN]
|
|
||||||
|
|
||||||
# Return the single longest sentence ≤ MAX_SENTENCE_LEN
|
|
||||||
if matched:
|
|
||||||
best = max(matched, key=len)
|
|
||||||
result = [best]
|
|
||||||
else:
|
|
||||||
result = []
|
|
||||||
|
|
||||||
_examples_cache[cache_key] = result
|
|
||||||
return result
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
|
|
||||||
load()
|
|
||||||
tests = ["שָׁלוֹם", "בַּיִת", "סֵפֶר", "מַיִם", "אַהֲבָה", "יֶלֶד"]
|
|
||||||
for w in tests:
|
|
||||||
exs = get_examples(w)
|
|
||||||
print(f"\n{w}: {len(exs)} example(s)")
|
|
||||||
for ex in exs:
|
|
||||||
print(f" → {ex[:100]}")
|
|
||||||
save_examples_cache()
|
|
||||||
765
epub_examples.py
765
epub_examples.py
|
|
@ -1,18 +1,17 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
"""
|
"""
|
||||||
Extract example sentences from nikud'd Hebrew EPUBs (and PDFs where possible),
|
Extract example sentences from nikud'd Hebrew EPUB files, match them against
|
||||||
match them against the vocab list, and produce examples_cache.json.
|
the vocabulary list in data/words.json, and write matched examples back into
|
||||||
|
words.json.
|
||||||
|
|
||||||
Usage:
|
Usage (standalone):
|
||||||
python3 epub_examples.py
|
python3 epub_examples.py
|
||||||
|
|
||||||
Outputs:
|
Called from run.py via:
|
||||||
data/epub_sentence_index.json — full sentence corpus
|
run(words) — words dict is passed in and updated in place
|
||||||
data/examples_cache.json — best sentence(s) per vocab word
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import csv
|
import logging
|
||||||
import json
|
|
||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
import zipfile
|
import zipfile
|
||||||
|
|
@ -21,20 +20,38 @@ from pathlib import Path
|
||||||
|
|
||||||
from helpers import strip_nikkud
|
from helpers import strip_nikkud
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
DATA_DIR = Path(__file__).parent / "data"
|
DATA_DIR = Path(__file__).parent / "data"
|
||||||
EPUB_DIR = DATA_DIR / "epubs"
|
EPUB_DIR = DATA_DIR / "epubs"
|
||||||
DICT_CSV = DATA_DIR / "hebrew_dict_for_anki.csv"
|
WORDS_JSON = DATA_DIR / "words.json"
|
||||||
|
|
||||||
|
|
||||||
# Book metadata: filename -> display name
|
# Book metadata: filename -> display name
|
||||||
EPUB_BOOKS = {
|
def _discover_epubs() -> dict[str, str]:
|
||||||
"little_prince.epub": "הנסיך הקטן",
|
"""Auto-discover all .epub files in EPUB_DIR, returning {filepath: display_name}."""
|
||||||
"time_tunnel_82.epub": "מנהרת הזמן 82",
|
if not EPUB_DIR.exists():
|
||||||
}
|
return {}
|
||||||
|
books: dict[str, str] = {}
|
||||||
|
for path in sorted(EPUB_DIR.glob("*.epub")):
|
||||||
|
stem = path.stem
|
||||||
|
stem_stripped = strip_nikkud(stem).lower()
|
||||||
|
# Derive a brief English display name from the filename
|
||||||
|
parts = stem.split(" -- ")
|
||||||
|
title_part = strip_nikkud(parts[0]).strip().lower()
|
||||||
|
if "alice" in stem_stripped or "אליס" in title_part:
|
||||||
|
name = "alice_wonderland"
|
||||||
|
elif "little_prince" in stem_stripped or "נסיך" in title_part:
|
||||||
|
name = "little_prince"
|
||||||
|
elif "מנהרת" in title_part or "time_tunnel" in stem_stripped:
|
||||||
|
num_match = re.search(r"(\d+)", stem_stripped)
|
||||||
|
num = num_match.group(1) if num_match else stem_stripped.replace("time_tunnel_", "")
|
||||||
|
name = f"time_tunnel_{num}"
|
||||||
|
else:
|
||||||
|
name = stem_stripped[:40]
|
||||||
|
books[str(path)] = name
|
||||||
|
return books
|
||||||
|
|
||||||
# PDF books are excluded — pypdf produces garbled RTL text (reversed chars within
|
|
||||||
# words). If/when a proper EPUB version becomes available on Calibre, add it to
|
|
||||||
# EPUB_BOOKS above instead.
|
|
||||||
PDF_BOOKS: dict[str, str] = {}
|
|
||||||
|
|
||||||
# Sentence length bounds (word count)
|
# Sentence length bounds (word count)
|
||||||
MIN_WORDS = 4
|
MIN_WORDS = 4
|
||||||
|
|
@ -58,7 +75,7 @@ class _TextExtractor(HTMLParser):
|
||||||
_ = attrs # required by HTMLParser interface
|
_ = attrs # required by HTMLParser interface
|
||||||
if tag in self.SKIP_TAGS:
|
if tag in self.SKIP_TAGS:
|
||||||
self._skip_depth += 1
|
self._skip_depth += 1
|
||||||
# Insert space for block-level elements to avoid word concatenation
|
# Insert newline for block-level elements to avoid word concatenation
|
||||||
if tag in (
|
if tag in (
|
||||||
"p",
|
"p",
|
||||||
"div",
|
"div",
|
||||||
|
|
@ -102,7 +119,6 @@ def extract_text_from_html(html: str) -> str:
|
||||||
|
|
||||||
def _content_files_from_epub(zf: zipfile.ZipFile) -> list[str]:
|
def _content_files_from_epub(zf: zipfile.ZipFile) -> list[str]:
|
||||||
"""Get ordered list of content XHTML files from the OPF manifest."""
|
"""Get ordered list of content XHTML files from the OPF manifest."""
|
||||||
# Find the OPF file
|
|
||||||
opf_path = None
|
opf_path = None
|
||||||
for name in zf.namelist():
|
for name in zf.namelist():
|
||||||
if name.endswith(".opf"):
|
if name.endswith(".opf"):
|
||||||
|
|
@ -124,7 +140,7 @@ def _content_files_from_epub(zf: zipfile.ZipFile) -> list[str]:
|
||||||
opf_dir = os.path.dirname(opf_path)
|
opf_dir = os.path.dirname(opf_path)
|
||||||
|
|
||||||
# Extract manifest items: id -> href
|
# Extract manifest items: id -> href
|
||||||
manifest = {}
|
manifest: dict[str, str] = {}
|
||||||
for m in re.finditer(r'<item\s+[^>]*id="([^"]+)"[^>]*href="([^"]+)"', opf_content):
|
for m in re.finditer(r'<item\s+[^>]*id="([^"]+)"[^>]*href="([^"]+)"', opf_content):
|
||||||
manifest[m.group(1)] = m.group(2)
|
manifest[m.group(1)] = m.group(2)
|
||||||
# Also try reversed attribute order
|
# Also try reversed attribute order
|
||||||
|
|
@ -157,7 +173,12 @@ def _content_files_from_epub(zf: zipfile.ZipFile) -> list[str]:
|
||||||
def extract_sentences_from_epub(epub_path: Path, book_name: str) -> list[dict]:
|
def extract_sentences_from_epub(epub_path: Path, book_name: str) -> list[dict]:
|
||||||
"""Extract sentences from an EPUB file.
|
"""Extract sentences from an EPUB file.
|
||||||
|
|
||||||
Returns list of {"text": str, "book": str, "stripped": str}
|
Args:
|
||||||
|
epub_path: Path to the .epub file.
|
||||||
|
book_name: Human-readable book name used as the ``source`` field.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of ``{"text": str, "source": str}`` dicts.
|
||||||
"""
|
"""
|
||||||
zf = zipfile.ZipFile(epub_path)
|
zf = zipfile.ZipFile(epub_path)
|
||||||
content_files = _content_files_from_epub(zf)
|
content_files = _content_files_from_epub(zf)
|
||||||
|
|
@ -175,41 +196,6 @@ def extract_sentences_from_epub(epub_path: Path, book_name: str) -> list[dict]:
|
||||||
return _split_into_sentences(full_text, book_name)
|
return _split_into_sentences(full_text, book_name)
|
||||||
|
|
||||||
|
|
||||||
# ── PDF processing ───────────────────────────────────────────────
|
|
||||||
|
|
||||||
|
|
||||||
def extract_sentences_from_pdf(pdf_path: Path, book_name: str) -> list[dict]:
|
|
||||||
"""Extract sentences from a PDF file (best-effort, handles RTL reversal)."""
|
|
||||||
try:
|
|
||||||
import pypdf
|
|
||||||
except ImportError:
|
|
||||||
print(f" [SKIP] pypdf not installed, cannot process {pdf_path.name}")
|
|
||||||
return []
|
|
||||||
|
|
||||||
reader = pypdf.PdfReader(pdf_path)
|
|
||||||
all_text_parts = []
|
|
||||||
|
|
||||||
for page in reader.pages:
|
|
||||||
raw = page.extract_text()
|
|
||||||
if not raw:
|
|
||||||
continue
|
|
||||||
# pypdf often reverses word order for RTL text; fix it
|
|
||||||
fixed_lines = []
|
|
||||||
for line in raw.split("\n"):
|
|
||||||
words = line.split()
|
|
||||||
# Check if this line is predominantly Hebrew
|
|
||||||
hebrew_chars = sum(1 for c in line if "\u0590" <= c <= "\u05ff")
|
|
||||||
if hebrew_chars > len(line) * 0.3 and len(words) > 1:
|
|
||||||
# Reverse word order
|
|
||||||
fixed_lines.append(" ".join(reversed(words)))
|
|
||||||
else:
|
|
||||||
fixed_lines.append(line)
|
|
||||||
all_text_parts.append("\n".join(fixed_lines))
|
|
||||||
|
|
||||||
full_text = "\n".join(all_text_parts)
|
|
||||||
return _split_into_sentences(full_text, book_name)
|
|
||||||
|
|
||||||
|
|
||||||
# ── Sentence splitting ───────────────────────────────────────────
|
# ── Sentence splitting ───────────────────────────────────────────
|
||||||
|
|
||||||
# Hebrew sentence terminators: period, exclamation, question mark, sof pasuk
|
# Hebrew sentence terminators: period, exclamation, question mark, sof pasuk
|
||||||
|
|
@ -217,18 +203,27 @@ _SENT_SPLIT = re.compile(r"[.!?\u05C3]+")
|
||||||
|
|
||||||
# Punctuation to strip from word boundaries when matching
|
# Punctuation to strip from word boundaries when matching
|
||||||
_PUNCT = re.compile(
|
_PUNCT = re.compile(
|
||||||
r'^[\u0022\u0027\u05F4\u05F3,;:\-–—…\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+|[\u0022\u0027\u05F4\u05F3,;:\-–—…\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+$'
|
r'^[\u0022\u0027\u05F4\u05F3,;:\-\u2013\u2014\u2026\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+|'
|
||||||
|
r'[\u0022\u0027\u05F4\u05F3,;:\-\u2013\u2014\u2026\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+$'
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _split_into_sentences(text: str, book_name: str) -> list[dict]:
|
def _split_into_sentences(text: str, book_name: str) -> list[dict]:
|
||||||
"""Split text into sentences and filter by length."""
|
"""Split text into Hebrew sentences and filter by word count.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
text: Raw extracted text from an EPUB chapter.
|
||||||
|
book_name: Source label for each sentence dict.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of ``{"text": str, "source": str}`` dicts, deduplicated by exact text.
|
||||||
|
"""
|
||||||
# Normalize whitespace
|
# Normalize whitespace
|
||||||
text = re.sub(r"\s+", " ", text).strip()
|
text = re.sub(r"\s+", " ", text).strip()
|
||||||
|
|
||||||
raw_sentences = _SENT_SPLIT.split(text)
|
raw_sentences = _SENT_SPLIT.split(text)
|
||||||
results = []
|
results: list[dict] = []
|
||||||
seen = set()
|
seen: set[str] = set()
|
||||||
|
|
||||||
for sent in raw_sentences:
|
for sent in raw_sentences:
|
||||||
sent = sent.strip()
|
sent = sent.strip()
|
||||||
|
|
@ -242,205 +237,555 @@ def _split_into_sentences(text: str, book_name: str) -> list[dict]:
|
||||||
if len(hebrew_words) < MIN_WORDS or len(hebrew_words) > MAX_WORDS:
|
if len(hebrew_words) < MIN_WORDS or len(hebrew_words) > MAX_WORDS:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
# Skip duplicates
|
# Deduplicate by exact nikkud text
|
||||||
stripped = strip_nikkud(sent)
|
if sent in seen:
|
||||||
if stripped in seen:
|
|
||||||
continue
|
continue
|
||||||
seen.add(stripped)
|
seen.add(sent)
|
||||||
|
|
||||||
results.append(
|
results.append({"text": sent, "source": book_name})
|
||||||
{
|
|
||||||
"text": sent,
|
|
||||||
"book": book_name,
|
|
||||||
"stripped": stripped,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|
||||||
# ── Vocab loading ────────────────────────────────────────────────
|
# ── Nikkud index ─────────────────────────────────────────────────
|
||||||
|
|
||||||
|
# Unicode ranges for Hebrew combining marks
_NIKKUD_LOW = 0x05B0  # start of vowel points (shva)
_NIKKUD_HIGH = 0x05BD  # end of vowel range (meteg); 0x05BE is maqaf (punctuation)
_DAGESH = "\u05bc"
_SHIN_DOT = "\u05c1"
_SIN_DOT = "\u05c2"

# Valid prefix consonants
_PREFIX_CONSONANTS = set("בהוכלמש")

# Named vowel combining marks
_SHVA = "\u05b0"
_HIRIQ = "\u05b4"
_TSERE = "\u05b5"
_SEGOL = "\u05b6"
_PATACH = "\u05b7"
_QAMATZ = "\u05b8"

# Valid nikkud patterns on each prefix consonant.
# Key = consonant, Value = set of frozensets of combining marks valid for that prefix.
# Each frozenset is compared for equality against the full set of marks found on
# the token's first consonant, so a pattern must list every mark, not a subset.
_VALID_PREFIX_MARKS: dict[str, set[frozenset]] = {
    "ב": {
        frozenset({_SHVA, _DAGESH}),  # בְּ standard
        frozenset({_HIRIQ, _DAGESH}),  # בִּ before shva
        frozenset({_PATACH, _DAGESH}),  # בַּ with definite article
        frozenset({_QAMATZ, _DAGESH}),  # בָּ before chataf qamatz
        frozenset({_SEGOL, _DAGESH}),  # בֶּ before chataf segol
    },
    "כ": {
        frozenset({_SHVA, _DAGESH}),  # כְּ
        frozenset({_HIRIQ, _DAGESH}),  # כִּ
        frozenset({_PATACH, _DAGESH}),  # כַּ
        frozenset({_QAMATZ, _DAGESH}),  # כָּ
        frozenset({_SEGOL, _DAGESH}),  # כֶּ
    },
    "ל": {
        frozenset({_SHVA}),  # לְ standard
        frozenset({_HIRIQ}),  # לִ before shva
        frozenset({_PATACH}),  # לַ with definite article
        frozenset({_QAMATZ}),  # לָ demonstratives
        frozenset({_SEGOL}),  # לֶ before chataf segol
    },
    "ו": {
        frozenset({_SHVA}),  # וְ standard
        frozenset({_DAGESH}),  # וּ (shureq) before shva or a labial (the "bumaf" letters ב/ו/מ/פ)
        frozenset({_PATACH}),  # וַ before chataf patach
        frozenset({_QAMATZ}),  # וָ before chataf qamatz
        frozenset({_SEGOL}),  # וֶ before chataf segol
        frozenset({_HIRIQ}),  # וִ before yud-shva
    },
    "מ": {
        frozenset({_HIRIQ}),  # מִ standard
        frozenset({_TSERE}),  # מֵ before gutturals
    },
    "ש": {
        # The pointed prefix שֶׁ is shin + segol + shin dot; the dagesh it
        # triggers sits on the *following* letter, not on the shin itself.
        # Without these two dagesh-less patterns the ordinary prefix never
        # matched and could not be stripped.
        frozenset({_SEGOL}),  # segol only (shin dot omitted in the text)
        frozenset({_SEGOL, _SHIN_DOT}),  # שֶׁ standard
        # Dagesh-bearing variants are still accepted for backward compatibility.
        frozenset({_SEGOL, _DAGESH}),
        frozenset({_SEGOL, _DAGESH, _SHIN_DOT}),
    },
    "ה": {
        frozenset({_PATACH}),  # הַ standard definite article
        frozenset({_QAMATZ}),  # הָ before gutturals
        frozenset({_SEGOL}),  # הֶ before qamatz-bearing gutturals
    },
}
|
||||||
|
|
||||||
|
|
||||||
def load_vocab(csv_path: Path) -> dict:
|
def _is_combining_mark(ch: str) -> bool:
|
||||||
"""Load vocab CSV and return {stripped_form: nikkud_word} mapping.
|
"""Return True if ch is a Hebrew combining mark (nikkud, dagesh, or dots)."""
|
||||||
|
cp = ord(ch)
|
||||||
|
if _NIKKUD_LOW <= cp <= _NIKKUD_HIGH:
|
||||||
|
return True
|
||||||
|
return ch in (_DAGESH, _SHIN_DOT, _SIN_DOT)
|
||||||
|
|
||||||
Also returns reverse mapping for lookup.
|
|
||||||
Returns (word_to_nikkud, nikkud_words_set)
|
def _decompose_first_char(token: str) -> tuple[str, frozenset, str]:
|
||||||
|
"""Split token into (first_consonant, its_combining_marks, remainder).
|
||||||
|
|
||||||
|
Args:
|
||||||
|
token: A nikkud Hebrew token string.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A tuple of (consonant, marks, rest). Returns ("", frozenset(), token)
|
||||||
|
if the token does not start with a Hebrew consonant (alef–tav range).
|
||||||
"""
|
"""
|
||||||
words_by_stripped: dict[str, list[str]] = {} # stripped -> [nikkud words]
|
if not token:
|
||||||
|
return ("", frozenset(), token)
|
||||||
|
|
||||||
with open(csv_path, encoding="utf-8") as f:
|
first = token[0]
|
||||||
reader = csv.DictReader(f, delimiter=";")
|
# Check it's a Hebrew consonant (alef–tav)
|
||||||
for row in reader:
|
if not ("\u05d0" <= first <= "\u05ea"):
|
||||||
nikkud_word = row.get("Word", "").strip()
|
return ("", frozenset(), token)
|
||||||
word_no_nik = row.get("Word Without Nikkud", "").strip()
|
|
||||||
if not nikkud_word:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Method 1: strip nikkud from the Word column
|
# Collect all combining marks that follow the consonant
|
||||||
stripped_from_nikkud = strip_nikkud(nikkud_word)
|
marks: set[str] = set()
|
||||||
|
i = 1
|
||||||
|
while i < len(token):
|
||||||
|
ch = token[i]
|
||||||
|
if _is_combining_mark(ch):
|
||||||
|
marks.add(ch)
|
||||||
|
i += 1
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
|
||||||
# Add both forms for matching
|
return (first, frozenset(marks), token[i:])
|
||||||
for form in {stripped_from_nikkud, word_no_nik}:
|
|
||||||
if form:
|
|
||||||
words_by_stripped.setdefault(form, []).append(nikkud_word)
|
|
||||||
|
|
||||||
return words_by_stripped
|
|
||||||
|
def _is_valid_prefix(consonant: str, marks: frozenset) -> bool:
    """Check if consonant + marks form a valid Hebrew prefix combination.

    Args:
        consonant: The prefix consonant character.
        marks: Frozenset of combining mark characters on that consonant.

    Returns:
        True if ``marks`` equals one of the patterns listed for ``consonant``
        in ``_VALID_PREFIX_MARKS``; False otherwise, including when the
        consonant has no entry in that table.
    """
    allowed = _VALID_PREFIX_MARKS.get(consonant)
    if not allowed:
        return False
    if marks in allowed:
        return True
    # For ש the shin dot may be present or absent in the text, so retry the
    # lookup with the dot removed before giving up.
    return consonant == "ש" and (marks - {_SHIN_DOT}) in allowed
|
||||||
|
|
||||||
|
|
||||||
|
def _rebuild_token(consonant: str, marks: frozenset, rest: str) -> str:
    """Reassemble a token from its decomposed parts, sorting marks by codepoint.

    Args:
        consonant: The leading consonant.
        marks: Combining marks to attach to ``consonant``; their original
            order is not retained by the frozenset, so they are emitted in
            ascending codepoint order.
        rest: Everything that followed the consonant and its marks.

    Returns:
        ``consonant`` + sorted marks + ``rest`` as a single string.
    """
    ordered_marks = "".join(sorted(marks))
    return consonant + ordered_marks + rest
|
||||||
|
|
||||||
|
|
||||||
|
def _prefix_hits_for_remainder(remainder: str, nikkud_index: dict) -> list[tuple[str, str, str]]:
    """Look up a prefix-stripped remainder, then the same remainder without its leading dagesh.

    The second lookup handles an absorbed definite article, where the letter
    after the prefix carries a dagesh that the dictionary form does not have
    (לַמֶּלֶךְ → מֶּלֶךְ → מֶלֶךְ).

    Returns:
        ``(unique_key, match_type + "_prefix", matched_form)`` tuples: hits for
        the remainder as-is first, then hits for the dagesh-less variant.
    """
    hits: list[tuple[str, str, str]] = [
        (unique_key, match_type + "_prefix", remainder)
        for unique_key, match_type in nikkud_index.get(remainder, ())
    ]

    consonant, marks, tail = _decompose_first_char(remainder)
    if consonant and _DAGESH in marks:
        bare = _rebuild_token(consonant, marks - {_DAGESH}, tail)
        if bare != remainder:
            hits.extend(
                (unique_key, match_type + "_prefix", bare)
                for unique_key, match_type in nikkud_index.get(bare, ())
            )
    return hits


def _try_strip_prefix(token: str, nikkud_index: dict) -> list[tuple[str, str, str]]:
    """Try stripping 1 or 2 prefix letters from a nikkud token.

    Args:
        token: A cleaned nikkud word token.
        nikkud_index: Mapping from nikkud form to list of (unique_key, match_type).

    Returns:
        List of (unique_key, match_type, matched_remainder) for each hit found.
        The match_type will have ``"_prefix"`` appended to the base type.
        Empty when the token does not start with a valid prefix or nothing
        would remain after removing it.
    """
    # 1-letter prefix: the first consonant and its marks must be a known
    # prefix vocalization, and something must be left over to look up.
    first, first_marks, after_one = _decompose_first_char(token)
    if not (first and _is_valid_prefix(first, first_marks) and after_one):
        return []

    results = _prefix_hits_for_remainder(after_one, nikkud_index)

    # 2-letter prefix (ו and ש commonly stack with another prefix)
    if first in "וש":
        second, second_marks, after_two = _decompose_first_char(after_one)
        if (
            second
            and second in _PREFIX_CONSONANTS
            and _is_valid_prefix(second, second_marks)
            and after_two
        ):
            results.extend(_prefix_hits_for_remainder(after_two, nikkud_index))

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def _build_nikkud_index(words: dict) -> dict[str, list[tuple[str, str]]]:
|
||||||
|
"""Build a mapping from nikkud form to list of (unique_key, match_type).
|
||||||
|
|
||||||
|
Indexes the following sources per entry:
|
||||||
|
|
||||||
|
- ``word.nikkud`` → "direct"
|
||||||
|
- conjugation active/passive forms → "conjugated"
|
||||||
|
- conjugation infinitive and reference_form → "conjugated"
|
||||||
|
- noun inflection singular/plural/construct/pronominal → "inflected"
|
||||||
|
|
||||||
|
Args:
|
||||||
|
words: The full words.json dict keyed by unique_key.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict mapping each nikkud form to a list of (unique_key, match_type) tuples.
|
||||||
|
"""
|
||||||
|
index: dict[str, list[tuple[str, str]]] = {}
|
||||||
|
|
||||||
|
def _add(form: str | None, unique_key: str, match_type: str) -> None:
|
||||||
|
if form:
|
||||||
|
index.setdefault(form, []).append((unique_key, match_type))
|
||||||
|
|
||||||
|
for unique_key, entry in words.items():
|
||||||
|
# Direct word form
|
||||||
|
word = entry.get("word") or {}
|
||||||
|
_add(word.get("nikkud"), unique_key, "direct")
|
||||||
|
|
||||||
|
# Conjugation forms
|
||||||
|
conj = entry.get("conjugation") or {}
|
||||||
|
|
||||||
|
for form_entry in conj.get("active_forms") or []:
|
||||||
|
form = (form_entry.get("form") or {}).get("nikkud")
|
||||||
|
_add(form, unique_key, "conjugated")
|
||||||
|
|
||||||
|
for form_entry in conj.get("hufal_pual_forms") or []:
|
||||||
|
form = (form_entry.get("form") or {}).get("nikkud")
|
||||||
|
_add(form, unique_key, "conjugated")
|
||||||
|
|
||||||
|
inf = conj.get("infinitive") or {}
|
||||||
|
_add(inf.get("nikkud"), unique_key, "conjugated")
|
||||||
|
|
||||||
|
ref = conj.get("reference_form") or {}
|
||||||
|
_add(ref.get("nikkud"), unique_key, "conjugated")
|
||||||
|
|
||||||
|
# Noun inflection forms
|
||||||
|
noun = entry.get("noun_inflection") or {}
|
||||||
|
|
||||||
|
for field in ("singular", "plural", "construct_singular", "construct_plural"):
|
||||||
|
sub = noun.get(field) or {}
|
||||||
|
_add(sub.get("nikkud"), unique_key, "inflected")
|
||||||
|
|
||||||
|
pronominal = noun.get("pronominal_suffixes") or {}
|
||||||
|
for _person, sub in pronominal.items():
|
||||||
|
if isinstance(sub, dict):
|
||||||
|
_add(sub.get("nikkud"), unique_key, "inflected")
|
||||||
|
|
||||||
|
return index
|
||||||
|
|
||||||
|
|
||||||
|
def _filter_collision_forms(nikkud_index: dict) -> dict:
    """Remove colliding forms for entries that have other unique forms.

    A "colliding form" maps to 2+ unique_keys. For each unique_key that
    appears in a collision, check whether it also has at least one
    non-colliding form in the index. If so, remove it from the colliding
    form's entry list. If a unique_key's *only* indexed forms all collide,
    keep them (otherwise the entry would get zero matches). A colliding form
    whose entry list becomes empty is left out of the result altogether.

    Args:
        nikkud_index: Mapping from nikkud form to list of
            (unique_key, match_type) tuples. Not modified.

    Returns:
        A new index dict with the same structure; its lists are fresh objects,
        not shared with ``nikkud_index``.
    """
    # Identify collision forms and build reverse map (key → its forms)
    collision_forms: set[str] = set()
    key_to_forms: dict[str, set[str]] = {}
    for form, entries in nikkud_index.items():
        keys = {uk for uk, _ in entries}
        if len(keys) >= 2:
            collision_forms.add(form)
        for uk in keys:
            key_to_forms.setdefault(uk, set()).add(form)

    # Keys that own at least one form no other key shares
    keys_with_unique_forms: set[str] = {
        uk for uk, forms in key_to_forms.items() if forms - collision_forms
    }

    # Build filtered index
    filtered: dict[str, list[tuple[str, str]]] = {}
    removed = 0
    for form, entries in nikkud_index.items():
        if form not in collision_forms:
            # Copy so callers mutating the result cannot alter the input index.
            filtered[form] = list(entries)
            continue
        kept = [(uk, mt) for uk, mt in entries if uk not in keys_with_unique_forms]
        removed += len(entries) - len(kept)
        if kept:
            filtered[form] = kept

    logger.info(f" Filtered {removed} collision mappings from entries with unique forms")
    return filtered
|
||||||
|
|
||||||
|
|
||||||
# ── Matching ─────────────────────────────────────────────────────
|
# ── Matching ─────────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
def match_sentences(sentences: list[dict], words_by_stripped: dict) -> dict:
|
def match_sentences(
|
||||||
"""Match sentences against vocab words.
|
sentences: list[dict],
|
||||||
|
nikkud_index: dict,
|
||||||
|
confusable_keys: set[str],
|
||||||
|
) -> dict:
|
||||||
|
"""Match sentences to vocab words using the nikkud index.
|
||||||
|
|
||||||
Returns {nikkud_word: [sentences]} with best (shortest) first.
|
Args:
|
||||||
|
sentences: List of ``{"text": str, "source": str}`` dicts.
|
||||||
|
nikkud_index: Output of ``_build_nikkud_index``.
|
||||||
|
confusable_keys: Set of unique_keys that are in confusable groups.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Dict mapping unique_key → list of match dicts, each containing:
|
||||||
|
``text``, ``source``, ``match_method``, ``word_count``,
|
||||||
|
``matched_form``, ``char_offset``, ``char_end``.
|
||||||
"""
|
"""
|
||||||
# Build a set of all stripped forms for fast lookup
|
matches: dict[str, list[dict]] = {}
|
||||||
all_forms = set(words_by_stripped.keys())
|
|
||||||
|
|
||||||
# Hebrew single-letter prefixes: ב, ה, ו, כ, ל, מ, ש, ד (של)
|
|
||||||
_HEB_PREFIXES = set("בהוכלמשד")
|
|
||||||
|
|
||||||
# For each sentence, extract stripped words
|
|
||||||
matches: dict[str, list[tuple[int, str]]] = {} # nikkud_word -> [(word_count, sentence)]
|
|
||||||
|
|
||||||
for sent_info in sentences:
|
for sent_info in sentences:
|
||||||
sent_text = sent_info["text"]
|
text = sent_info["text"]
|
||||||
sent_stripped = sent_info["stripped"]
|
source = sent_info["source"]
|
||||||
word_count = len(sent_text.split())
|
words_in_sent = text.split()
|
||||||
|
word_count = len(words_in_sent)
|
||||||
|
|
||||||
# Get stripped words from the sentence
|
char_pos = 0
|
||||||
raw_words = sent_stripped.split()
|
for raw_word in words_in_sent:
|
||||||
# Map: candidate_form -> set of original cleaned words that produced it
|
cleaned = _PUNCT.sub("", raw_word)
|
||||||
# This lets us verify that prefix stripping is plausible
|
|
||||||
candidates: dict[str, str] = {} # form -> original_word
|
|
||||||
for w in raw_words:
|
|
||||||
cleaned = _PUNCT.sub("", w)
|
|
||||||
if not cleaned:
|
if not cleaned:
|
||||||
|
word_start = text.find(raw_word, char_pos)
|
||||||
|
char_pos = word_start + len(raw_word) if word_start >= 0 else char_pos
|
||||||
continue
|
continue
|
||||||
# Direct match (always try)
|
|
||||||
candidates[cleaned] = cleaned
|
|
||||||
# Prefix stripping: only if remaining stem is >= 2 chars
|
|
||||||
# and the prefix char is a known Hebrew prefix letter
|
|
||||||
for prefix_len in (1, 2):
|
|
||||||
if len(cleaned) > prefix_len + 1:
|
|
||||||
prefix = cleaned[:prefix_len]
|
|
||||||
stem = cleaned[prefix_len:]
|
|
||||||
if all(c in _HEB_PREFIXES for c in prefix) and len(stem) >= 2:
|
|
||||||
candidates[stem] = cleaned
|
|
||||||
|
|
||||||
# Check which vocab words appear in this sentence
|
# Locate positions within the sentence
|
||||||
matched_forms = set(candidates.keys()) & all_forms
|
word_start_in_sent = text.find(raw_word, char_pos)
|
||||||
for form in matched_forms:
|
if word_start_in_sent < 0:
|
||||||
# Skip spurious matches: very short vocab forms (1-2 chars)
|
word_start_in_sent = char_pos
|
||||||
# should only match via direct word match, not prefix stripping
|
clean_offset_in_raw = raw_word.find(cleaned)
|
||||||
if len(form) <= 2 and form not in {_PUNCT.sub("", w) for w in raw_words}:
|
if clean_offset_in_raw < 0:
|
||||||
continue
|
clean_offset_in_raw = 0
|
||||||
for nikkud_word in words_by_stripped[form]:
|
clean_start = word_start_in_sent + clean_offset_in_raw
|
||||||
matches.setdefault(nikkud_word, []).append((word_count, sent_text))
|
clean_end = clean_start + len(cleaned)
|
||||||
|
|
||||||
# Sort by word count (prefer shorter sentences) and deduplicate
|
found: list[tuple[str, str]] = []
|
||||||
result = {}
|
|
||||||
for nikkud_word, sent_list in matches.items():
|
|
||||||
sent_list.sort(key=lambda x: x[0])
|
|
||||||
seen = set()
|
|
||||||
unique = []
|
|
||||||
for _, sent in sent_list:
|
|
||||||
if sent not in seen:
|
|
||||||
seen.add(sent)
|
|
||||||
unique.append(sent)
|
|
||||||
if len(unique) >= 5: # Keep top 5 per word
|
|
||||||
break
|
|
||||||
result[nikkud_word] = unique
|
|
||||||
|
|
||||||
return result
|
# Direct nikkud match
|
||||||
|
if cleaned in nikkud_index:
|
||||||
|
for unique_key, match_type in nikkud_index[cleaned]:
|
||||||
|
found.append((unique_key, match_type))
|
||||||
|
|
||||||
|
# Prefix stripping — only if no direct match exists
|
||||||
|
if cleaned not in nikkud_index:
|
||||||
|
for unique_key, match_type, _remainder in _try_strip_prefix(cleaned, nikkud_index):
|
||||||
|
found.append((unique_key, match_type))
|
||||||
|
|
||||||
|
for unique_key, match_method in found:
|
||||||
|
matches.setdefault(unique_key, []).append(
|
||||||
|
{
|
||||||
|
"text": text,
|
||||||
|
"source": source,
|
||||||
|
"match_method": match_method,
|
||||||
|
"word_count": word_count,
|
||||||
|
"matched_form": cleaned,
|
||||||
|
"char_offset": clean_start,
|
||||||
|
"char_end": clean_end,
|
||||||
|
}
|
||||||
|
)
|
||||||
|
|
||||||
|
char_pos = word_start_in_sent + len(raw_word)
|
||||||
|
|
||||||
|
return matches
|
||||||
|
|
||||||
|
|
||||||
# ── Main ─────────────────────────────────────────────────────────
|
# ── Writing results ──────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) -> int:
|
||||||
print("=" * 60)
|
"""Update words dict entries with matched example sentences.
|
||||||
print("EPUB Example Sentence Extraction Pipeline")
|
|
||||||
print("=" * 60)
|
|
||||||
|
|
||||||
# Step 1: Extract sentences from all books
|
Selects up to 3 best sentences per word (scoring prefers 6–12 word
|
||||||
all_sentences = []
|
sentences and non-prefix matches). Also generates a cloze entry for
|
||||||
book_counts = {}
|
the top match, unless the word is in the confusable set.
|
||||||
|
|
||||||
for filename, book_name in EPUB_BOOKS.items():
|
Args:
|
||||||
path = EPUB_DIR / filename
|
words: The full words.json dict, modified in place.
|
||||||
if not path.exists():
|
matches: Output of ``match_sentences``.
|
||||||
print(f"\n[SKIP] {filename} not found")
|
confusable_keys: Set of unique_keys in confusable groups.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Count of words.json entries that were updated.
|
||||||
|
"""
|
||||||
|
import genanki # noqa: PLC0415 — import only where needed
|
||||||
|
|
||||||
|
updated = 0
|
||||||
|
|
||||||
|
for unique_key, sent_list in matches.items():
|
||||||
|
if unique_key not in words:
|
||||||
continue
|
continue
|
||||||
print(f"\n[EPUB] Extracting: {book_name} ({filename})")
|
|
||||||
|
entry = words[unique_key]
|
||||||
|
|
||||||
|
# Deduplicate by sentence text
|
||||||
|
seen_texts: set[str] = set()
|
||||||
|
unique: list[dict] = []
|
||||||
|
for s in sent_list:
|
||||||
|
if s["text"] not in seen_texts:
|
||||||
|
seen_texts.add(s["text"])
|
||||||
|
unique.append(s)
|
||||||
|
|
||||||
|
# Prefer direct matches; only fall back to prefix if none exist
|
||||||
|
direct = [s for s in unique if "prefix" not in s["match_method"]]
|
||||||
|
prefix_only = [s for s in unique if "prefix" in s["match_method"]]
|
||||||
|
pool = direct if direct else prefix_only
|
||||||
|
|
||||||
|
# Score: prefer 6–12 word sentences
|
||||||
|
def _score(s: dict) -> tuple[int,]:
|
||||||
|
wc = s["word_count"]
|
||||||
|
length_score = abs(wc - 9) if not (6 <= wc <= 12) else 0
|
||||||
|
return (length_score,)
|
||||||
|
|
||||||
|
pool.sort(key=_score)
|
||||||
|
best = pool[:3]
|
||||||
|
|
||||||
|
# Build vetted list
|
||||||
|
if not entry.get("examples"):
|
||||||
|
entry["examples"] = {}
|
||||||
|
examples: dict = entry["examples"]
|
||||||
|
examples["vetted"] = [
|
||||||
|
{
|
||||||
|
"text": s["text"],
|
||||||
|
"source": s["source"],
|
||||||
|
"match_method": s["match_method"],
|
||||||
|
}
|
||||||
|
for s in best
|
||||||
|
]
|
||||||
|
|
||||||
|
# Build cloze from best sentence (skip confusables)
|
||||||
|
is_confusable = unique_key in confusable_keys
|
||||||
|
if not is_confusable and best:
|
||||||
|
top = best[0]
|
||||||
|
# Preserve existing cloze_guid if sentence text unchanged
|
||||||
|
old_cloze = examples.get("cloze") or {}
|
||||||
|
if old_cloze.get("text") == top["text"]:
|
||||||
|
cloze_guid = old_cloze.get("cloze_guid")
|
||||||
|
else:
|
||||||
|
cloze_guid = genanki.guid_for("cloze", unique_key)
|
||||||
|
|
||||||
|
examples["cloze"] = {
|
||||||
|
"text": top["text"],
|
||||||
|
"cloze_word_start": top["char_offset"],
|
||||||
|
"cloze_word_end": top["char_end"],
|
||||||
|
"cloze_hint": None,
|
||||||
|
"cloze_guid": cloze_guid,
|
||||||
|
}
|
||||||
|
elif is_confusable:
|
||||||
|
examples.pop("cloze", None)
|
||||||
|
|
||||||
|
examples["rejected_count"] = 0
|
||||||
|
updated += 1
|
||||||
|
|
||||||
|
return updated
|
||||||
|
|
||||||
|
|
||||||
|
# ── Public API ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
def run(words: dict) -> dict:
|
||||||
|
"""Extract EPUB sentences, match against words, update words dict in place.
|
||||||
|
|
||||||
|
Called from run.py with the already-loaded words.json dict.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
words: The full words.json dict keyed by unique_key. Modified in place.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Summary stats dict with keys ``books``, ``matched``, ``total_vocab``.
|
||||||
|
"""
|
||||||
|
logger.info(" Extracting sentences from EPUBs ...")
|
||||||
|
all_sentences: list[dict] = []
|
||||||
|
book_counts: dict[str, int] = {}
|
||||||
|
|
||||||
|
for filepath, book_name in _discover_epubs().items():
|
||||||
|
path = Path(filepath)
|
||||||
sentences = extract_sentences_from_epub(path, book_name)
|
sentences = extract_sentences_from_epub(path, book_name)
|
||||||
book_counts[book_name] = len(sentences)
|
book_counts[book_name] = len(sentences)
|
||||||
all_sentences.extend(sentences)
|
all_sentences.extend(sentences)
|
||||||
print(f" -> {len(sentences)} sentences")
|
logger.info(f" {book_name}: {len(sentences)} sentences")
|
||||||
|
|
||||||
for filename, book_name in PDF_BOOKS.items():
|
if not all_sentences:
|
||||||
path = EPUB_DIR / filename
|
logger.warning(" No EPUB files found — skipping example extraction")
|
||||||
if not path.exists():
|
return {"books": {}, "matched": 0, "total_vocab": len(words)}
|
||||||
print(f"\n[SKIP] {filename} not found")
|
|
||||||
continue
|
|
||||||
print(f"\n[PDF] Extracting: {book_name} ({filename})")
|
|
||||||
sentences = extract_sentences_from_pdf(path, book_name)
|
|
||||||
book_counts[book_name] = len(sentences)
|
|
||||||
all_sentences.extend(sentences)
|
|
||||||
print(f" -> {len(sentences)} sentences")
|
|
||||||
|
|
||||||
print(f"\nTotal sentences: {len(all_sentences)}")
|
logger.info(f" Total sentences: {len(all_sentences)}")
|
||||||
|
|
||||||
# Step 2: Save sentence index
|
# Build nikkud index
|
||||||
index_path = DATA_DIR / "epub_sentence_index.json"
|
logger.info(" Building nikkud index from words.json ...")
|
||||||
with open(index_path, "w", encoding="utf-8") as f:
|
nikkud_index = _build_nikkud_index(words)
|
||||||
json.dump({"sentences": all_sentences}, f, ensure_ascii=False, indent=2)
|
logger.info(f" {len(nikkud_index)} unique nikkud forms indexed")
|
||||||
print(f"\nSaved sentence index: {index_path}")
|
|
||||||
|
|
||||||
# Step 3: Load vocab and match
|
# Filter out collision forms for entries that have unique forms
|
||||||
print(f"\nLoading vocab from {DICT_CSV} ...")
|
nikkud_index = _filter_collision_forms(nikkud_index)
|
||||||
words_by_stripped = load_vocab(DICT_CSV)
|
|
||||||
total_vocab = len({w for wlist in words_by_stripped.values() for w in wlist})
|
|
||||||
print(f" {total_vocab} unique vocab words ({len(words_by_stripped)} lookup forms)")
|
|
||||||
|
|
||||||
print("\nMatching sentences against vocab ...")
|
# Build confusable key set
|
||||||
examples_cache = match_sentences(all_sentences, words_by_stripped)
|
confusable_keys: set[str] = set()
|
||||||
|
for key, entry in words.items():
|
||||||
|
if entry.get("confusable_group"):
|
||||||
|
confusable_keys.add(key)
|
||||||
|
|
||||||
# Step 4: Save examples_cache
|
# Match sentences
|
||||||
cache_path = DATA_DIR / "examples_cache.json"
|
logger.info(" Matching sentences against vocab ...")
|
||||||
with open(cache_path, "w", encoding="utf-8") as f:
|
matches = match_sentences(all_sentences, nikkud_index, confusable_keys)
|
||||||
json.dump(examples_cache, f, ensure_ascii=False, indent=2)
|
logger.info(f" {len(matches)} words matched")
|
||||||
print(f"Saved examples cache: {cache_path}")
|
|
||||||
|
|
||||||
# Step 5: Summary stats
|
# Break down by match method
|
||||||
print("\n" + "=" * 60)
|
method_counts: dict[str, int] = {}
|
||||||
print("SUMMARY")
|
for sent_list in matches.values():
|
||||||
print("=" * 60)
|
for s in sent_list:
|
||||||
print("\nSentences per book:")
|
method = s["match_method"]
|
||||||
for book_name, count in book_counts.items():
|
method_counts[method] = method_counts.get(method, 0) + 1
|
||||||
print(f" {book_name}: {count}")
|
for method, count in sorted(method_counts.items()):
|
||||||
print(f" Total: {len(all_sentences)}")
|
logger.info(f" {method}: {count} sentence-word pairs")
|
||||||
|
|
||||||
print("\nVocab matching:")
|
# Update words dict in place
|
||||||
print(f" Total vocab words: {total_vocab}")
|
updated = update_words_json(words, matches, confusable_keys)
|
||||||
print(f" Words with examples: {len(examples_cache)}")
|
logger.info(f" Updated {updated} entries in words.json")
|
||||||
coverage = 100 * len(examples_cache) / total_vocab if total_vocab else 0
|
|
||||||
print(f" Coverage: {coverage:.1f}%")
|
|
||||||
|
|
||||||
# Show some sample matches
|
return {
|
||||||
print("\nSample matches:")
|
"books": book_counts,
|
||||||
count = 0
|
"matched": len(matches),
|
||||||
for word, sents in examples_cache.items():
|
"total_vocab": len(words),
|
||||||
if count >= 5:
|
}
|
||||||
break
|
|
||||||
print(f" {word} -> {sents[0][:60]}...")
|
|
||||||
count += 1
|
|
||||||
|
|
||||||
return examples_cache
|
|
||||||
|
|
||||||
|
# ── Standalone entry point ───────────────────────────────────────
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
import json
|
||||||
|
|
||||||
|
logging.basicConfig(level=logging.INFO, format="%(message)s")
|
||||||
|
|
||||||
|
words_path = DATA_DIR / "words.json"
|
||||||
|
with open(words_path, encoding="utf-8") as f:
|
||||||
|
words = json.load(f)
|
||||||
|
|
||||||
|
stats = run(words)
|
||||||
|
|
||||||
|
# Save updated words.json
|
||||||
|
with open(words_path, "w", encoding="utf-8") as f:
|
||||||
|
json.dump(words, f, ensure_ascii=False, indent=2)
|
||||||
|
|
||||||
|
coverage = stats["matched"] * 100 / stats["total_vocab"] if stats["total_vocab"] else 0
|
||||||
|
logger.info(f" Coverage: {stats['matched']}/{stats['total_vocab']} ({coverage:.1f}%)")
|
||||||
|
|
|
||||||
|
|
@ -2,7 +2,8 @@
|
||||||
"""
|
"""
|
||||||
Consolidated detail page scraper for pealim.com.
|
Consolidated detail page scraper for pealim.com.
|
||||||
|
|
||||||
Visits /dict/<slug>/ detail pages for nouns and verbs in data/words.json.
|
Visits /dict/<slug>/ detail pages for nouns, verbs, adjectives and prepositions
|
||||||
|
in data/words.json.
|
||||||
Makes two requests per slug:
|
Makes two requests per slug:
|
||||||
1. hebstyle=mo cookie → nikkud forms
|
1. hebstyle=mo cookie → nikkud forms
|
||||||
2. hebstyle=vl cookie → ktiv male forms
|
2. hebstyle=vl cookie → ktiv male forms
|
||||||
|
|
@ -11,7 +12,8 @@ Updates entries in data/words.json with scraped detail data.
|
||||||
|
|
||||||
Usage:
|
Usage:
|
||||||
python3 pealim_detail_scrape.py [--test N] [--force-refresh-detail]
|
python3 pealim_detail_scrape.py [--test N] [--force-refresh-detail]
|
||||||
[--nouns-only | --verbs-only]
|
[--nouns-only | --verbs-only |
|
||||||
|
--adjectives-only | --prepositions-only]
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import argparse
|
import argparse
|
||||||
|
|
@ -144,28 +146,128 @@ FORM_KEY_TO_PERSON: dict[str, str] = {
|
||||||
"infinitive": "inf",
|
"infinitive": "inf",
|
||||||
}
|
}
|
||||||
|
|
||||||
# Mishkal English name → Hebrew nikkud mapping (common patterns)
|
# Mishkal English name → Hebrew nikkud mapping
|
||||||
MISHKAL_HEBREW: dict[str, str] = {
|
# Pealim dropdown uses q-notation (qatal), detail pages return k-notation (katal).
|
||||||
"CaCaC": "קָטָל",
|
# We store q-notation keys; _mishkal_to_hebrew() handles the k→q conversion.
|
||||||
"CeCeC": "קֶטֶל",
|
# Source: https://www.pealim.com/he/dict/?pos={adjective,noun}&am=qatil dropdowns
|
||||||
"CiCeC": "קִטֶל",
|
_MISHKAL_HEBREW_Q: dict[str, str] = {
|
||||||
"CaCeC": "קָטֶל",
|
# --- a ---
|
||||||
"CoCeC": "קוֹטֵל",
|
"aqtal": "אַקְטָל",
|
||||||
"CaCiC": "קָטִיד",
|
"aqtala": "אַקְטָלָה",
|
||||||
"CaCuC": "קָטוּר",
|
# --- e ---
|
||||||
"miCCaC": "מִקְטָל",
|
"eqtal": "אֶקְטָל",
|
||||||
"miCCeC": "מִקְטֶל",
|
# --- h ---
|
||||||
"maCCeC": "מַקְטֶל",
|
"haqtala": "הַקְטָלָה",
|
||||||
"maCCiC": "מַקְטִיר",
|
"heqtel": "הֶקְטֵל",
|
||||||
"hiCCiC": "הִקְטִיל",
|
"hiqqatlut": "הִקָּטְלוּת",
|
||||||
"CiCCuC": "קִטּוּל",
|
"hitqattlut": "הִתְקַטְּלוּת",
|
||||||
"hitCaCCeC": "הִתְקַטֵּל",
|
# --- m ---
|
||||||
"CaCCan": "קַטְּלָן",
|
"maqtal": "מַקְטָל",
|
||||||
"CaCCaC": "קַטָּל",
|
"maqtel": "מַקְטֵל",
|
||||||
"CiCCon": "קִטְּרוֹן",
|
"maqtela": "מַקְטֵלָה",
|
||||||
"CaCCeC": "קַטֶּלֶת",
|
"maqtelet": "מַקְטֶלֶת",
|
||||||
|
"maqtil": "מַקְטִיל",
|
||||||
|
"maqtol": "מַקְטוֹל",
|
||||||
|
"maqtolet": "מַקְטֹלֶת",
|
||||||
|
"maqtul": "מַקְטוּל",
|
||||||
|
"meqattel": "מְקַטֵּל",
|
||||||
|
"meqila": "מְקִילָה",
|
||||||
|
"mequla": "מְקוּלָה",
|
||||||
|
"mequttal": "מְקֻטָּל",
|
||||||
|
"miqtal": "מִקְטָל",
|
||||||
|
"miqtala": "מִקְטָלָה",
|
||||||
|
"miqtelet": "מִקְטֶלֶת",
|
||||||
|
"miqtol": "מִקְטוֹל",
|
||||||
|
"miqtolet": "מִקְטֹלֶת",
|
||||||
|
"mitqattel": "מִתְקַטֵּל",
|
||||||
|
"muqtal": "מֻקְטָל",
|
||||||
|
# --- n ---
|
||||||
|
"niqtal": "נִקְטָל",
|
||||||
|
# --- q ---
|
||||||
|
"qal": "קַל",
|
||||||
|
"qatal": "קָטָל",
|
||||||
|
"qatel": "קָטֵל",
|
||||||
|
"qatil": "קָטִיל",
|
||||||
|
"qatla": "קַטְלָה",
|
||||||
|
"qatlan": "קַטְלָן",
|
||||||
|
"qatlut": "קַטְלוּת",
|
||||||
|
"qatol": "קָטוֹל",
|
||||||
|
"qaton": "קָטוֹן",
|
||||||
|
"qattal": "קַטָּל",
|
||||||
|
"qattala": "קַטָּלָה",
|
||||||
|
"qattelet": "קַטֶּלֶת",
|
||||||
|
"qattil": "קַטִּיל",
|
||||||
|
"qattila": "קַטִּילָה",
|
||||||
|
"qattolet": "קַטֹּלֶת",
|
||||||
|
"qattul": "קַטּוּל",
|
||||||
|
"qatul": "קָטוּל",
|
||||||
|
"qatut": "קָטוּת",
|
||||||
|
"qetel": "קֶטֶל",
|
||||||
|
"qeteh": "קֵטֶה",
|
||||||
|
"qitla": "קִטְלָה",
|
||||||
|
"qitlon": "קִטְלוֹן",
|
||||||
|
"qittalon": "קִטָּלוֹן",
|
||||||
|
"qittel": "קִטֵּל",
|
||||||
|
"qittelet": "קִטֶּלֶת",
|
||||||
|
"qittol": "קִטּוֹל",
|
||||||
|
"qittolet": "קִטֹּלֶת",
|
||||||
|
"qittul": "קִטּוּל",
|
||||||
|
"qol": "קֹל",
|
||||||
|
"qotal": "קוֹטָל",
|
||||||
|
"qotel": "קוֹטֵל",
|
||||||
|
"qotelet": "קוֹטֶלֶת",
|
||||||
|
"qotla": "קָטְלָה",
|
||||||
|
"qtal": "קְטָל",
|
||||||
|
"qtala": "קְטָלָה",
|
||||||
|
"qtaltal": "קְטַלְטַל",
|
||||||
|
"qtaltan": "קְטַלְתָּן",
|
||||||
|
"qtaltolet": "קְטַלְטֹלֶת",
|
||||||
|
"qtel": "קְטֵל",
|
||||||
|
"qtela": "קְטֵלָה",
|
||||||
|
"qtelet": "קְטֶלֶת",
|
||||||
|
"qtil": "קְטִיל",
|
||||||
|
"qtila": "קְטִילָה",
|
||||||
|
"qtili": "קְטִילִי",
|
||||||
|
"qtol": "קְטוֹל",
|
||||||
|
"qtola": "קְטוֹלָה",
|
||||||
|
"qtolet": "קְטֹלֶת",
|
||||||
|
"qtul": "קְטוּל",
|
||||||
|
"qtula": "קְטוּלָה",
|
||||||
|
"qtulla": "קְטֻלָּה",
|
||||||
|
"qtut": "קְטוּת",
|
||||||
|
"qutla": "קֻטְלָה",
|
||||||
|
"quttolet": "קֻטּוֹלֶת",
|
||||||
|
# --- t ---
|
||||||
|
"taqtela": "תַּקְטֵלָה",
|
||||||
|
"taqtil": "תַּקְטִיל",
|
||||||
|
"taqtit": "תַּקְטִית",
|
||||||
|
"taqtul": "תַּקְטוּל",
|
||||||
|
"taqtula": "תַּקְטוּלָה",
|
||||||
|
"taqtut": "תַּקְטוּת",
|
||||||
|
"tiqtal": "תִּקְטָל",
|
||||||
|
"tiqtala": "תִּקְטָלָה",
|
||||||
|
"tiqtelet": "תִּקְטֶלֶת",
|
||||||
|
"tiqtolet": "תִּקְטֹלֶת",
|
||||||
|
"tqilla": "תְּקִלָּה",
|
||||||
|
"tqula": "תְּקוּלָה",
|
||||||
|
# --- y ---
|
||||||
|
"yaqtul": "יַקְטוּל",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _mishkal_to_hebrew(mishkal: str) -> str | None:
|
||||||
|
"""Look up Hebrew mishkal, handling k-notation → q-notation conversion."""
|
||||||
|
if not mishkal:
|
||||||
|
return None
|
||||||
|
# Try as-is first (q-notation)
|
||||||
|
result = _MISHKAL_HEBREW_Q.get(mishkal)
|
||||||
|
if result:
|
||||||
|
return result
|
||||||
|
# Convert k-notation to q-notation and retry
|
||||||
|
q_form = mishkal.replace("k", "q")
|
||||||
|
return _MISHKAL_HEBREW_Q.get(q_form)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# HTTP session
|
# HTTP session
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -452,7 +554,7 @@ def _scrape_noun_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
|
||||||
|
|
||||||
if mishkal:
|
if mishkal:
|
||||||
result["mishkal"] = mishkal
|
result["mishkal"] = mishkal
|
||||||
result["mishkal_hebrew"] = MISHKAL_HEBREW.get(mishkal)
|
result["mishkal_hebrew"] = _mishkal_to_hebrew(mishkal)
|
||||||
|
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
@ -887,6 +989,228 @@ def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: di
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Adjective detail parsing
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_ADJECTIVE_CELL_IDS: tuple[str, ...] = ("ms-a", "fs-a", "mp-a", "fp-a")
|
||||||
|
_ADJECTIVE_FORM_KEYS: tuple[str, ...] = ("ms", "fs", "mp", "fp")
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_adjective_table(soup: BeautifulSoup) -> dict[str, dict]:
    """
    Collect the nikkud text and audio URL of each adjective form (mo page).

    Searches the ``conjugation-table`` table for the elements whose ids are
    listed in ``_ADJECTIVE_CELL_IDS`` (ms-a, fs-a, mp-a, fp-a) and reads each
    one with ``_get_menukad_and_audio``.

    Returns:
        Dict mapping form key ("ms", "fs", "mp", "fp") to
        {"nikkud": ..., "audio_url": ...}. A form is omitted when its cell is
        missing or yields no nikkud text; empty dict if the table is absent.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    forms: dict[str, dict] = {}
    for cell_id, form_key in zip(_ADJECTIVE_CELL_IDS, _ADJECTIVE_FORM_KEYS, strict=True):
        cell = table.find(id=cell_id)
        if cell:
            nikkud, audio_url = _get_menukad_and_audio(cell)
            if nikkud:
                forms[form_key] = {"nikkud": nikkud, "audio_url": audio_url}
    return forms
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_adjective_table_vl(soup: BeautifulSoup) -> dict[str, str]:
    """
    Collect the ktiv male spelling of each adjective form (vl page).

    Reads the same cells as the nikkud parser (ids in
    ``_ADJECTIVE_CELL_IDS``), but takes their text via ``_get_plain_text``.

    Returns:
        Dict mapping form key ("ms", "fs", "mp", "fp") to the ktiv male
        string. A form is omitted when its cell is missing or empty; empty
        dict if the table is absent.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    spellings: dict[str, str] = {}
    for cell_id, form_key in zip(_ADJECTIVE_CELL_IDS, _ADJECTIVE_FORM_KEYS, strict=True):
        cell = table.find(id=cell_id)
        if cell:
            ktiv = _get_plain_text(cell)
            if ktiv:
                spellings[form_key] = ktiv
    return spellings
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_adjective_mishkal(soup: BeautifulSoup) -> tuple[str, str]:
    """
    Read the mishkal of an adjective detail page.

    Delegates to ``_parse_noun_gender_mishkal`` and discards the gender it
    returns, then maps the mishkal name to Hebrew with ``_mishkal_to_hebrew``.

    Returns:
        Tuple of (mishkal_english, mishkal_hebrew). The Hebrew part is ""
        when the name has no known Hebrew pattern.
    """
    _gender, mishkal_english = _parse_noun_gender_mishkal(soup)
    return mishkal_english, _mishkal_to_hebrew(mishkal_english) or ""
|
||||||
|
|
||||||
|
|
||||||
|
def _scrape_adjective_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
    """
    Build the adjective inflection record from the two detail pages.

    Args:
        _slug: Unused; kept so all detail scrapers share a call shape.
        mo_html: HTML of the page fetched with nikkud spelling.
        vl_html: HTML of the page fetched with ktiv male spelling.

    Returns:
        Dict with keys ms, fs, mp, fp (each {nikkud, ktiv_male}, or None when
        the nikkud page has no such form) plus mishkal and mishkal_hebrew
        (None when unknown). Empty dict if the nikkud page yields no forms.
    """
    mo_soup = BeautifulSoup(mo_html, "lxml")
    vl_soup = BeautifulSoup(vl_html, "lxml")

    nikkud_forms = _parse_adjective_table(mo_soup)
    ktiv_forms = _parse_adjective_table_vl(vl_soup)
    mishkal, mishkal_hebrew = _parse_adjective_mishkal(mo_soup)

    if not nikkud_forms:
        return {}

    inflection: dict = {}
    for form_key in _ADJECTIVE_FORM_KEYS:
        mo_form = nikkud_forms.get(form_key)
        if not mo_form:
            inflection[form_key] = None
            continue
        nikkud = mo_form["nikkud"]
        ktiv = ktiv_forms.get(form_key, "")
        if not ktiv:
            # The form is still recorded, with an empty ktiv_male.
            logger.warning("No ktiv_male for adjective form %s: %s", form_key, nikkud)
        inflection[form_key] = {"nikkud": nikkud, "ktiv_male": ktiv}

    inflection.update(mishkal=mishkal or None, mishkal_hebrew=mishkal_hebrew or None)
    return inflection
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Preposition detail parsing
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
_PREPOSITION_CELL_IDS: tuple[str, ...] = (
|
||||||
|
"P-1s",
|
||||||
|
"P-1p",
|
||||||
|
"P-2ms",
|
||||||
|
"P-2fs",
|
||||||
|
"P-2mp",
|
||||||
|
"P-2fp",
|
||||||
|
"P-3ms",
|
||||||
|
"P-3fs",
|
||||||
|
"P-3mp",
|
||||||
|
"P-3fp",
|
||||||
|
)
|
||||||
|
_PREPOSITION_FORM_KEYS: tuple[str, ...] = (
|
||||||
|
"1s",
|
||||||
|
"1p",
|
||||||
|
"2ms",
|
||||||
|
"2fs",
|
||||||
|
"2mp",
|
||||||
|
"2fp",
|
||||||
|
"3ms",
|
||||||
|
"3fs",
|
||||||
|
"3mp",
|
||||||
|
"3fp",
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_preposition_table(soup: BeautifulSoup) -> dict[str, dict]:
    """
    Collect the nikkud text and audio URL of each suffixed preposition form (mo page).

    Searches the ``conjugation-table`` table for the elements whose ids are
    listed in ``_PREPOSITION_CELL_IDS`` (P-1s, P-1p, P-2ms, …, P-3fp) and
    reads each one with ``_get_menukad_and_audio``.

    Returns:
        Dict mapping person key ("1s", "1p", …, "3fp") to
        {"nikkud": ..., "audio_url": ...}. A form is omitted when its cell is
        missing or yields no nikkud text; empty dict if the table is absent.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    forms: dict[str, dict] = {}
    for cell_id, form_key in zip(_PREPOSITION_CELL_IDS, _PREPOSITION_FORM_KEYS, strict=True):
        cell = table.find(id=cell_id)
        if cell:
            nikkud, audio_url = _get_menukad_and_audio(cell)
            if nikkud:
                forms[form_key] = {"nikkud": nikkud, "audio_url": audio_url}
    return forms
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_preposition_table_vl(soup: BeautifulSoup) -> dict[str, str]:
    """
    Collect the ktiv male spelling of each suffixed preposition form (vl page).

    Reads the same cells as the nikkud parser (ids in
    ``_PREPOSITION_CELL_IDS``), but takes their text via ``_get_plain_text``.

    Returns:
        Dict mapping person key ("1s", "1p", …, "3fp") to the ktiv male
        string. A form is omitted when its cell is missing or empty; empty
        dict if the table is absent.
    """
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return {}

    spellings: dict[str, str] = {}
    for cell_id, form_key in zip(_PREPOSITION_CELL_IDS, _PREPOSITION_FORM_KEYS, strict=True):
        cell = table.find(id=cell_id)
        if cell:
            ktiv = _get_plain_text(cell)
            if ktiv:
                spellings[form_key] = ktiv
    return spellings
|
||||||
|
|
||||||
|
|
||||||
|
def _scrape_preposition_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
    """
    Build the preposition inflection record from the two detail pages.

    Args:
        _slug: Unused; kept so all detail scrapers share a call shape.
        mo_html: HTML of the page fetched with nikkud spelling.
        vl_html: HTML of the page fetched with ktiv male spelling.

    Returns:
        Dict with keys 1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp, each
        {nikkud, ktiv_male} or None when the nikkud page has no such form.
        Empty dict if the nikkud page yields no forms.
    """
    mo_soup = BeautifulSoup(mo_html, "lxml")
    vl_soup = BeautifulSoup(vl_html, "lxml")

    nikkud_forms = _parse_preposition_table(mo_soup)
    ktiv_forms = _parse_preposition_table_vl(vl_soup)

    if not nikkud_forms:
        return {}

    inflection: dict = {}
    for form_key in _PREPOSITION_FORM_KEYS:
        mo_form = nikkud_forms.get(form_key)
        if not mo_form:
            inflection[form_key] = None
            continue
        nikkud = mo_form["nikkud"]
        ktiv = ktiv_forms.get(form_key, "")
        if not ktiv:
            # The form is still recorded, with an empty ktiv_male.
            logger.warning("No ktiv_male for preposition form %s: %s", form_key, nikkud)
        inflection[form_key] = {"nikkud": nikkud, "ktiv_male": ktiv}

    return inflection
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Merging strategy
|
# Merging strategy
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -926,6 +1250,22 @@ def _merge_conjugation(_existing_conj: dict | None, scraped: dict) -> dict:
|
||||||
return scraped
|
return scraped
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_adjective_inflection(_existing_ai: dict | None, scraped: dict) -> dict:
|
||||||
|
"""
|
||||||
|
Merge scraped adjective data into existing adjective_inflection.
|
||||||
|
No GUIDs to preserve — simple overwrite with scraped data.
|
||||||
|
"""
|
||||||
|
return dict(scraped)
|
||||||
|
|
||||||
|
|
||||||
|
def _merge_preposition_inflection(_existing_pi: dict | None, scraped: dict) -> dict:
|
||||||
|
"""
|
||||||
|
Merge scraped preposition data into existing preposition_inflection.
|
||||||
|
No GUIDs to preserve — simple overwrite with scraped data.
|
||||||
|
"""
|
||||||
|
return dict(scraped)
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# I/O helpers
|
# I/O helpers
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -953,14 +1293,26 @@ def _save_words(data: dict) -> None:
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
def _should_process(entry: dict, pos: str, force: bool, nouns_only: bool, verbs_only: bool) -> bool:
|
def _should_process(
|
||||||
|
entry: dict,
|
||||||
|
pos: str,
|
||||||
|
force: bool,
|
||||||
|
nouns_only: bool,
|
||||||
|
verbs_only: bool,
|
||||||
|
adjectives_only: bool,
|
||||||
|
prepositions_only: bool,
|
||||||
|
) -> bool:
|
||||||
"""Return True if this entry should be scraped."""
|
"""Return True if this entry should be scraped."""
|
||||||
if not pos.startswith(("Noun", "Verb")):
|
if not pos.startswith(("Noun", "Verb", "Adjective", "Preposition")):
|
||||||
return False
|
return False
|
||||||
if nouns_only and not pos.startswith("Noun"):
|
if nouns_only and not pos.startswith("Noun"):
|
||||||
return False
|
return False
|
||||||
if verbs_only and not pos.startswith("Verb"):
|
if verbs_only and not pos.startswith("Verb"):
|
||||||
return False
|
return False
|
||||||
|
if adjectives_only and not pos.startswith("Adjective"):
|
||||||
|
return False
|
||||||
|
if prepositions_only and not pos.startswith("Preposition"):
|
||||||
|
return False
|
||||||
return force or not entry.get("detail_scraped")
|
return force or not entry.get("detail_scraped")
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -969,6 +1321,8 @@ def run(
|
||||||
force_refresh: bool = False,
|
force_refresh: bool = False,
|
||||||
nouns_only: bool = False,
|
nouns_only: bool = False,
|
||||||
verbs_only: bool = False,
|
verbs_only: bool = False,
|
||||||
|
adjectives_only: bool = False,
|
||||||
|
prepositions_only: bool = False,
|
||||||
) -> None:
|
) -> None:
|
||||||
"""
|
"""
|
||||||
Main scrape loop.
|
Main scrape loop.
|
||||||
|
|
@ -978,13 +1332,24 @@ def run(
|
||||||
force_refresh: Re-scrape entries where detail_scraped=True.
|
force_refresh: Re-scrape entries where detail_scraped=True.
|
||||||
nouns_only: Only scrape noun entries.
|
nouns_only: Only scrape noun entries.
|
||||||
verbs_only: Only scrape verb entries.
|
verbs_only: Only scrape verb entries.
|
||||||
|
adjectives_only: Only scrape adjective entries.
|
||||||
|
prepositions_only: Only scrape preposition entries.
|
||||||
"""
|
"""
|
||||||
words = _load_words()
|
words = _load_words()
|
||||||
|
|
||||||
candidates = [
|
candidates = [
|
||||||
(unique_key, entry)
|
(unique_key, entry)
|
||||||
for unique_key, entry in words.items()
|
for unique_key, entry in words.items()
|
||||||
if _should_process(entry, entry.get("pos", ""), force_refresh, nouns_only, verbs_only) and entry.get("slug")
|
if _should_process(
|
||||||
|
entry,
|
||||||
|
entry.get("pos", ""),
|
||||||
|
force_refresh,
|
||||||
|
nouns_only,
|
||||||
|
verbs_only,
|
||||||
|
adjectives_only,
|
||||||
|
prepositions_only,
|
||||||
|
)
|
||||||
|
and entry.get("slug")
|
||||||
]
|
]
|
||||||
|
|
||||||
total = len(candidates)
|
total = len(candidates)
|
||||||
|
|
@ -992,7 +1357,10 @@ def run(
|
||||||
candidates = candidates[:test]
|
candidates = candidates[:test]
|
||||||
logger.info("Test mode: processing %d of %d eligible entries", len(candidates), total)
|
logger.info("Test mode: processing %d of %d eligible entries", len(candidates), total)
|
||||||
else:
|
else:
|
||||||
logger.info("Processing %d eligible entries (nouns+verbs) from words.json", total)
|
logger.info(
|
||||||
|
"Processing %d eligible entries (nouns+verbs+adjectives+prepositions) from words.json",
|
||||||
|
total,
|
||||||
|
)
|
||||||
|
|
||||||
processed = 0
|
processed = 0
|
||||||
errors = 0
|
errors = 0
|
||||||
|
|
@ -1003,7 +1371,14 @@ def run(
|
||||||
word_nikkud = entry.get("word", {}).get("nikkud", unique_key)
|
word_nikkud = entry.get("word", {}).get("nikkud", unique_key)
|
||||||
url = f"{PEALIM_BASE}/dict/{slug}/"
|
url = f"{PEALIM_BASE}/dict/{slug}/"
|
||||||
|
|
||||||
label = "Noun" if pos.startswith("Noun") else "Verb"
|
if pos.startswith("Noun"):
|
||||||
|
label = "Noun"
|
||||||
|
elif pos.startswith("Verb"):
|
||||||
|
label = "Verb"
|
||||||
|
elif pos.startswith("Adjective"):
|
||||||
|
label = "Adjective"
|
||||||
|
else:
|
||||||
|
label = "Preposition"
|
||||||
logger.info("[%d/%d] %s: %s (%s)", idx, len(candidates), label, word_nikkud, slug)
|
logger.info("[%d/%d] %s: %s (%s)", idx, len(candidates), label, word_nikkud, slug)
|
||||||
|
|
||||||
# Fetch mo (nikkud) page
|
# Fetch mo (nikkud) page
|
||||||
|
|
@ -1042,7 +1417,7 @@ def run(
|
||||||
errors += 1
|
errors += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
else: # Verb
|
elif pos.startswith("Verb"):
|
||||||
existing_conj = entry.get("conjugation")
|
existing_conj = entry.get("conjugation")
|
||||||
scraped = _scrape_verb_detail(slug, mo_html, vl_html, existing_conj)
|
scraped = _scrape_verb_detail(slug, mo_html, vl_html, existing_conj)
|
||||||
if scraped:
|
if scraped:
|
||||||
|
|
@ -1059,6 +1434,41 @@ def run(
|
||||||
errors += 1
|
errors += 1
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
elif pos.startswith("Adjective"):
|
||||||
|
scraped = _scrape_adjective_detail(slug, mo_html, vl_html)
|
||||||
|
if scraped:
|
||||||
|
existing_ai = entry.get("adjective_inflection")
|
||||||
|
merged = _merge_adjective_inflection(existing_ai, scraped)
|
||||||
|
words[unique_key]["adjective_inflection"] = merged
|
||||||
|
ms = merged.get("ms", {}) or {}
|
||||||
|
fs = merged.get("fs", {}) or {}
|
||||||
|
logger.info(
|
||||||
|
" ms=%s fs=%s mishkal=%s",
|
||||||
|
ms.get("nikkud", "—"),
|
||||||
|
fs.get("nikkud", "—"),
|
||||||
|
merged.get("mishkal", "—"),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.warning(" No adjective data scraped for %s", slug)
|
||||||
|
errors += 1
|
||||||
|
continue
|
||||||
|
|
||||||
|
else: # Preposition
|
||||||
|
scraped = _scrape_preposition_detail(slug, mo_html, vl_html)
|
||||||
|
if scraped:
|
||||||
|
existing_pi = entry.get("preposition_inflection")
|
||||||
|
merged = _merge_preposition_inflection(existing_pi, scraped)
|
||||||
|
words[unique_key]["preposition_inflection"] = merged
|
||||||
|
form_1s = merged.get("1s", {}) or {}
|
||||||
|
logger.info(
|
||||||
|
" 1s=%s",
|
||||||
|
form_1s.get("nikkud", "—"),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
logger.warning(" No preposition data scraped for %s", slug)
|
||||||
|
errors += 1
|
||||||
|
continue
|
||||||
|
|
||||||
except Exception as exc: # noqa: BLE001
|
except Exception as exc: # noqa: BLE001
|
||||||
logger.error(" Parse error for %s (%s): %s", slug, word_nikkud, exc, exc_info=True)
|
logger.error(" Parse error for %s (%s): %s", slug, word_nikkud, exc, exc_info=True)
|
||||||
errors += 1
|
errors += 1
|
||||||
|
|
@ -1089,7 +1499,7 @@ def run(
|
||||||
|
|
||||||
def _build_parser() -> argparse.ArgumentParser:
|
def _build_parser() -> argparse.ArgumentParser:
|
||||||
parser = argparse.ArgumentParser(
|
parser = argparse.ArgumentParser(
|
||||||
description="Scrape pealim.com detail pages for nouns and verbs in data/words.json."
|
description=("Scrape pealim.com detail pages for nouns, verbs, adjectives and prepositions in data/words.json.")
|
||||||
)
|
)
|
||||||
parser.add_argument(
|
parser.add_argument(
|
||||||
"--test",
|
"--test",
|
||||||
|
|
@ -1117,6 +1527,18 @@ def _build_parser() -> argparse.ArgumentParser:
|
||||||
default=False,
|
default=False,
|
||||||
help="Only scrape Verb entries.",
|
help="Only scrape Verb entries.",
|
||||||
)
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--adjectives-only",
|
||||||
|
action="store_true",
|
||||||
|
default=False,
|
||||||
|
help="Only scrape Adjective entries.",
|
||||||
|
)
|
||||||
|
group.add_argument(
|
||||||
|
"--prepositions-only",
|
||||||
|
action="store_true",
|
||||||
|
default=False,
|
||||||
|
help="Only scrape Preposition entries.",
|
||||||
|
)
|
||||||
return parser
|
return parser
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1133,4 +1555,6 @@ if __name__ == "__main__":
|
||||||
force_refresh=args.force_refresh_detail,
|
force_refresh=args.force_refresh_detail,
|
||||||
nouns_only=args.nouns_only,
|
nouns_only=args.nouns_only,
|
||||||
verbs_only=args.verbs_only,
|
verbs_only=args.verbs_only,
|
||||||
|
adjectives_only=args.adjectives_only,
|
||||||
|
prepositions_only=args.prepositions_only,
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -1,183 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Rebuild vocab_sentence_matches.json using both direct word matching
|
|
||||||
and ktiv male conjugated/declined form matching.
|
|
||||||
|
|
||||||
This dramatically improves sentence coverage by matching not just
|
|
||||||
dictionary forms but all conjugated verbs and declined nouns.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import json
|
|
||||||
import logging
|
|
||||||
import re
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
from helpers import strip_nikkud as _strip_nikkud
|
|
||||||
|
|
||||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
|
||||||
logger = logging.getLogger(__name__)
|
|
||||||
|
|
||||||
DATA_DIR = Path(__file__).parent / "data"
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
# Load sentences
|
|
||||||
with open(DATA_DIR / "epub_sentence_index.json") as f:
|
|
||||||
sentences = json.load(f).get("sentences", [])
|
|
||||||
logger.info(f"Loaded {len(sentences)} sentences")
|
|
||||||
|
|
||||||
# Load vocab CSV
|
|
||||||
csv_path = DATA_DIR / "hebrew_dict_for_anki.csv"
|
|
||||||
try:
|
|
||||||
df = pd.read_csv(csv_path, sep=";", index_col=0)
|
|
||||||
if df.shape[1] < 3:
|
|
||||||
raise ValueError
|
|
||||||
except (ValueError, pd.errors.ParserError):
|
|
||||||
df = pd.read_csv(csv_path, index_col=0)
|
|
||||||
logger.info(f"Loaded {len(df)} vocab entries")
|
|
||||||
|
|
||||||
# Build word lookup: stripped_form → (word_nikkud, word_no_nikkud)
|
|
||||||
word_lookup: dict[str, list[tuple[str, str]]] = {}
|
|
||||||
for _, row in df.iterrows():
|
|
||||||
word = str(row.get("Word", "")).strip()
|
|
||||||
wni = str(row.get("Word Without Nikkud", "")).strip()
|
|
||||||
if not word or word in ("nan", "None"):
|
|
||||||
continue
|
|
||||||
stripped = _strip_nikkud(word)
|
|
||||||
if stripped:
|
|
||||||
word_lookup.setdefault(stripped, []).append((word, wni))
|
|
||||||
|
|
||||||
# Load ktiv male forms: ktiv_male_form → [{word_nikkud, form_type, ...}]
|
|
||||||
ktiv_path = DATA_DIR / "ktiv_male_forms.json"
|
|
||||||
ktiv_forms: dict[str, list[dict]] = {}
|
|
||||||
if ktiv_path.exists():
|
|
||||||
with open(ktiv_path) as f:
|
|
||||||
ktiv_forms = json.load(f)
|
|
||||||
logger.info(f"Loaded {len(ktiv_forms)} ktiv male forms")
|
|
||||||
else:
|
|
||||||
logger.warning("No ktiv_male_forms.json — only using direct matching")
|
|
||||||
|
|
||||||
# Build reverse lookup: ktiv_male → set of dictionary words (nikkud)
|
|
||||||
ktiv_to_word: dict[str, set[str]] = {}
|
|
||||||
for ktiv, entries in ktiv_forms.items():
|
|
||||||
for entry in entries:
|
|
||||||
word_nikkud = entry.get("word_nikkud", "")
|
|
||||||
if word_nikkud:
|
|
||||||
ktiv_to_word.setdefault(ktiv, set()).add(word_nikkud)
|
|
||||||
|
|
||||||
# Also add all vocab words' own stripped forms to ktiv_to_word
|
|
||||||
for stripped, entries in word_lookup.items():
|
|
||||||
for word_nikkud, _ in entries:
|
|
||||||
ktiv_to_word.setdefault(stripped, set()).add(word_nikkud)
|
|
||||||
|
|
||||||
logger.info(f"Total matchable forms: {len(ktiv_to_word)}")
|
|
||||||
|
|
||||||
# Tokenize all sentences once
|
|
||||||
sentence_tokens: list[tuple[dict, list[str]]] = []
|
|
||||||
for s in sentences:
|
|
||||||
stripped = s.get("stripped", _strip_nikkud(s.get("text", "")))
|
|
||||||
tokens = [re.sub(r'[.,!?;:"\'\u05be]', "", t) for t in stripped.split()]
|
|
||||||
tokens = [t for t in tokens if t] # remove empty
|
|
||||||
sentence_tokens.append((s, tokens))
|
|
||||||
|
|
||||||
# Match: for each sentence token, check ktiv_to_word lookup
|
|
||||||
# Build word_nikkud → [sentence_info]
|
|
||||||
matches: dict[str, list[dict]] = {} # word_nikkud → [sentences]
|
|
||||||
|
|
||||||
for sent, tokens in sentence_tokens:
|
|
||||||
text = sent.get("text", "")
|
|
||||||
book = sent.get("book", "")
|
|
||||||
word_len = len(tokens)
|
|
||||||
|
|
||||||
# Skip sentences that are too short or too long
|
|
||||||
if word_len < 4 or word_len > 15:
|
|
||||||
continue
|
|
||||||
|
|
||||||
for tok in tokens:
|
|
||||||
if tok in ktiv_to_word:
|
|
||||||
for word_nikkud in ktiv_to_word[tok]:
|
|
||||||
matches.setdefault(word_nikkud, []).append(
|
|
||||||
{
|
|
||||||
"text": text,
|
|
||||||
"book": book,
|
|
||||||
"matched_form": tok,
|
|
||||||
"word_count": word_len,
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
logger.info(f"Words with at least 1 match: {len(matches)}")
|
|
||||||
|
|
||||||
# Deduplicate and limit to 3 best sentences per word
|
|
||||||
# Prefer shorter sentences (6-12 words ideal)
|
|
||||||
output: dict[str, dict] = {}
|
|
||||||
for word_nikkud, sents in matches.items():
|
|
||||||
# Deduplicate by text
|
|
||||||
seen_texts = set()
|
|
||||||
unique = []
|
|
||||||
for s in sents:
|
|
||||||
if s["text"] not in seen_texts:
|
|
||||||
seen_texts.add(s["text"])
|
|
||||||
unique.append(s)
|
|
||||||
|
|
||||||
# Score: prefer 6-12 word sentences
|
|
||||||
def score(s):
|
|
||||||
wc = s["word_count"]
|
|
||||||
if 6 <= wc <= 12:
|
|
||||||
return 0 # ideal
|
|
||||||
return abs(wc - 9) # distance from ideal
|
|
||||||
|
|
||||||
unique.sort(key=score)
|
|
||||||
best = unique[:3]
|
|
||||||
|
|
||||||
# Find the Word Without Nikkud for this word
|
|
||||||
stripped = _strip_nikkud(word_nikkud)
|
|
||||||
wni = stripped # default
|
|
||||||
if stripped in word_lookup:
|
|
||||||
for wn, w_wni in word_lookup[stripped]:
|
|
||||||
if wn == word_nikkud:
|
|
||||||
wni = w_wni
|
|
||||||
break
|
|
||||||
|
|
||||||
output[wni] = {
|
|
||||||
"word_nikkud": word_nikkud,
|
|
||||||
"sentences": [{"text": s["text"], "book": s["book"]} for s in best],
|
|
||||||
}
|
|
||||||
|
|
||||||
# Save
|
|
||||||
out_path = DATA_DIR / "vocab_sentence_matches.json"
|
|
||||||
with open(out_path, "w") as f:
|
|
||||||
json.dump(output, f, ensure_ascii=False, indent=1)
|
|
||||||
|
|
||||||
total_sents = sum(len(v["sentences"]) for v in output.values())
|
|
||||||
logger.info(f"Saved {len(output)} words with {total_sents} sentences → {out_path}")
|
|
||||||
|
|
||||||
# Stats
|
|
||||||
total_vocab = len(df)
|
|
||||||
pct = len(output) * 100 / total_vocab
|
|
||||||
logger.info(f"Coverage: {len(output)}/{total_vocab} ({pct:.1f}%)")
|
|
||||||
|
|
||||||
# Breakdown by match type
|
|
||||||
direct_only = 0
|
|
||||||
ktiv_only = 0
|
|
||||||
both = 0
|
|
||||||
for _wni, info in output.items():
|
|
||||||
word = info["word_nikkud"]
|
|
||||||
stripped = _strip_nikkud(word)
|
|
||||||
has_direct = stripped in word_lookup
|
|
||||||
has_ktiv = any(s.get("matched_form", "") != stripped for s in info["sentences"])
|
|
||||||
if has_direct and has_ktiv:
|
|
||||||
both += 1
|
|
||||||
elif has_ktiv:
|
|
||||||
ktiv_only += 1
|
|
||||||
else:
|
|
||||||
direct_only += 1
|
|
||||||
|
|
||||||
logger.info(f" Direct matches only: {direct_only}")
|
|
||||||
logger.info(f" Ktiv male matches only: {ktiv_only}")
|
|
||||||
logger.info(f" Both: {both}")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
main()
|
|
||||||
80
run.py
80
run.py
|
|
@ -11,7 +11,7 @@ Pipeline steps:
|
||||||
1. List scrape — scrape pealim.com list pages → words.json (captures slugs)
|
1. List scrape — scrape pealim.com list pages → words.json (captures slugs)
|
||||||
2. Detail scrape — scrape noun/verb detail pages using slugs → words.json
|
2. Detail scrape — scrape noun/verb detail pages using slugs → words.json
|
||||||
3. Frequency — load/download word frequency data
|
3. Frequency — load/download word frequency data
|
||||||
4. Examples — fetch Ben Yehuda example sentences
|
4. Examples — extract example sentences from Hebrew EPUBs
|
||||||
5. Audio download — download audio mp3 files
|
5. Audio download — download audio mp3 files
|
||||||
6. Fonts — download Heebo font files
|
6. Fonts — download Heebo font files
|
||||||
7. Images — fetch noun images from Wikipedia
|
7. Images — fetch noun images from Wikipedia
|
||||||
|
|
@ -21,9 +21,8 @@ Options:
|
||||||
--skip-scrape Skip list page scraping (use existing words.json)
|
--skip-scrape Skip list page scraping (use existing words.json)
|
||||||
--skip-detail Skip detail page scraping
|
--skip-detail Skip detail page scraping
|
||||||
--skip-audio Skip audio .mp3 downloads
|
--skip-audio Skip audio .mp3 downloads
|
||||||
--skip-examples Skip Ben Yehuda example fetching
|
--skip-examples Skip EPUB example extraction
|
||||||
--skip-images Skip image fetching for concrete nouns
|
--skip-images Skip image fetching for concrete nouns
|
||||||
--refresh-examples Force rebuild of Ben Yehuda index
|
|
||||||
--test N Limit to first N words/pages
|
--test N Limit to first N words/pages
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
|
@ -60,9 +59,8 @@ def parse_args():
|
||||||
p.add_argument("--skip-scrape", action="store_true", help="Skip list page scraping")
|
p.add_argument("--skip-scrape", action="store_true", help="Skip list page scraping")
|
||||||
p.add_argument("--skip-detail", action="store_true", help="Skip detail page scraping")
|
p.add_argument("--skip-detail", action="store_true", help="Skip detail page scraping")
|
||||||
p.add_argument("--skip-audio", action="store_true", help="Skip audio downloads")
|
p.add_argument("--skip-audio", action="store_true", help="Skip audio downloads")
|
||||||
p.add_argument("--skip-examples", action="store_true", help="Skip Ben Yehuda example lookup")
|
p.add_argument("--skip-examples", action="store_true", help="Skip EPUB example extraction")
|
||||||
p.add_argument("--skip-images", action="store_true", help="Skip image fetching")
|
p.add_argument("--skip-images", action="store_true", help="Skip image fetching")
|
||||||
p.add_argument("--refresh-examples", action="store_true", help="Force rebuild of Ben Yehuda index")
|
|
||||||
p.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
|
p.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
|
||||||
return p.parse_args()
|
return p.parse_args()
|
||||||
|
|
||||||
|
|
@ -93,22 +91,15 @@ def step_frequency() -> dict[str, int]:
|
||||||
return frequency_lookup._freq
|
return frequency_lookup._freq
|
||||||
|
|
||||||
|
|
||||||
def step_examples(args, _freq_cache: dict):
|
def step_examples(args) -> dict:
|
||||||
"""Step 4 — load/build Ben Yehuda example index."""
|
"""Step 4 — extract example sentences from Hebrew EPUBs."""
|
||||||
if args.skip_examples:
|
if args.skip_examples:
|
||||||
logger.info("[4] Skipping examples (--skip-examples)")
|
logger.info("[4] Skipping examples (--skip-examples)")
|
||||||
examples_path = DATA_DIR / "examples_cache.json"
|
|
||||||
if examples_path.exists():
|
|
||||||
with open(examples_path) as f:
|
|
||||||
return json.load(f)
|
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
logger.info("[4] Loading Ben Yehuda example index …")
|
logger.info("[4] Extracting EPUB example sentences …")
|
||||||
import benyehuda
|
import epub_examples
|
||||||
|
|
||||||
benyehuda.load(force_rebuild=args.refresh_examples)
|
|
||||||
|
|
||||||
# Read word list from words.json instead of CSV
|
|
||||||
if not WORDS_JSON.exists():
|
if not WORDS_JSON.exists():
|
||||||
logger.warning("[4] words.json not found, skipping examples")
|
logger.warning("[4] words.json not found, skipping examples")
|
||||||
return {}
|
return {}
|
||||||
|
|
@ -116,41 +107,14 @@ def step_examples(args, _freq_cache: dict):
|
||||||
with open(WORDS_JSON, encoding="utf-8") as f:
|
with open(WORDS_JSON, encoding="utf-8") as f:
|
||||||
words = json.load(f)
|
words = json.load(f)
|
||||||
|
|
||||||
entries = list(words.values())
|
stats = epub_examples.run(words)
|
||||||
if args.test:
|
|
||||||
entries = entries[: args.test]
|
|
||||||
|
|
||||||
# Build confusable consonant set from words.json
|
# Save updated words.json
|
||||||
consonant_counts: dict[str, int] = {}
|
with open(WORDS_JSON, "w", encoding="utf-8") as f:
|
||||||
for entry in entries:
|
json.dump(words, f, ensure_ascii=False, indent=2)
|
||||||
ktiv_male = entry.get("word", {}).get("ktiv_male", "")
|
|
||||||
if ktiv_male:
|
|
||||||
safe = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male)
|
|
||||||
if safe:
|
|
||||||
consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
|
|
||||||
confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
|
|
||||||
|
|
||||||
# Delete stale cache entries for confusable words so they get re-fetched
|
logger.info(f" Coverage: {stats['matched']}/{stats['total_vocab']}")
|
||||||
stale_deleted = 0
|
return stats
|
||||||
for entry in entries:
|
|
||||||
word_nikkud = entry.get("word", {}).get("nikkud", "")
|
|
||||||
ktiv_male = entry.get("word", {}).get("ktiv_male", "")
|
|
||||||
if word_nikkud and ktiv_male:
|
|
||||||
safe = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male)
|
|
||||||
if safe in confusable_consonants and word_nikkud in benyehuda._examples_cache:
|
|
||||||
del benyehuda._examples_cache[word_nikkud]
|
|
||||||
stale_deleted += 1
|
|
||||||
if stale_deleted:
|
|
||||||
logger.info(f" Deleted {stale_deleted} stale confusable cache entries")
|
|
||||||
|
|
||||||
logger.info(f" Pre-fetching examples for {len(entries)} words …")
|
|
||||||
for entry in entries:
|
|
||||||
word_nikkud = entry.get("word", {}).get("nikkud", "")
|
|
||||||
if word_nikkud:
|
|
||||||
benyehuda.get_examples(word_nikkud, confusable_consonants=confusable_consonants)
|
|
||||||
|
|
||||||
benyehuda.save_examples_cache()
|
|
||||||
return benyehuda._examples_cache
|
|
||||||
|
|
||||||
|
|
||||||
def step_detail_scrape(args):
|
def step_detail_scrape(args):
|
||||||
|
|
@ -250,7 +214,7 @@ def step_build_all(args):
|
||||||
apkg_builder.build_all_variants(words, limit=args.test)
|
apkg_builder.build_all_variants(words, limit=args.test)
|
||||||
|
|
||||||
|
|
||||||
def print_summary(_args: argparse.Namespace, examples_cache: dict, freq_cache: dict):
|
def print_summary(_args: argparse.Namespace, example_stats: dict, freq_cache: dict):
|
||||||
logger.info("")
|
logger.info("")
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
logger.info("SUMMARY")
|
logger.info("SUMMARY")
|
||||||
|
|
@ -267,10 +231,12 @@ def print_summary(_args: argparse.Namespace, examples_cache: dict, freq_cache: d
|
||||||
logger.info(f" Nouns: {nouns}, Verbs: {verbs}, Detail-scraped: {detail_scraped}")
|
logger.info(f" Nouns: {nouns}, Verbs: {verbs}, Detail-scraped: {detail_scraped}")
|
||||||
|
|
||||||
logger.info(f" Frequency entries: {len(freq_cache)}")
|
logger.info(f" Frequency entries: {len(freq_cache)}")
|
||||||
logger.info(f" Example cache entries: {len(examples_cache)}")
|
matched = example_stats.get("matched", 0)
|
||||||
covered = sum(1 for v in examples_cache.values() if v)
|
total = example_stats.get("total_vocab", 0)
|
||||||
if examples_cache:
|
if total:
|
||||||
logger.info(f" Example coverage: {covered}/{len(examples_cache)} ({100 * covered // len(examples_cache)}%)")
|
logger.info(f" Example coverage: {matched}/{total} ({100 * matched // total}%)")
|
||||||
|
for book, count in example_stats.get("books", {}).items():
|
||||||
|
logger.info(f" {book}: {count} sentences")
|
||||||
|
|
||||||
if AUDIO_DIR.exists():
|
if AUDIO_DIR.exists():
|
||||||
mp3s = list(AUDIO_DIR.glob("*.mp3"))
|
mp3s = list(AUDIO_DIR.glob("*.mp3"))
|
||||||
|
|
@ -321,8 +287,6 @@ def main():
|
||||||
logger.info(f" MODE: --only {args.only}")
|
logger.info(f" MODE: --only {args.only}")
|
||||||
if args.test:
|
if args.test:
|
||||||
logger.info(f" TEST MODE: {args.test} words")
|
logger.info(f" TEST MODE: {args.test} words")
|
||||||
if args.refresh_examples:
|
|
||||||
logger.info(" REFRESH EXAMPLES: Ben Yehuda index will be rebuilt")
|
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
|
|
||||||
def _load_words_for_only() -> dict:
|
def _load_words_for_only() -> dict:
|
||||||
|
|
@ -385,13 +349,13 @@ def main():
|
||||||
step_list_scrape(args) # 1 — scrape list pages → words.json (captures slugs)
|
step_list_scrape(args) # 1 — scrape list pages → words.json (captures slugs)
|
||||||
step_detail_scrape(args) # 2 — scrape detail pages using slugs → words.json
|
step_detail_scrape(args) # 2 — scrape detail pages using slugs → words.json
|
||||||
freq_cache = step_frequency() # 3 — word frequency data
|
freq_cache = step_frequency() # 3 — word frequency data
|
||||||
examples_cache = step_examples(args, _freq_cache=freq_cache) # 4 — Ben Yehuda examples
|
example_stats = step_examples(args) # 4 — EPUB example sentences
|
||||||
step_audio_download(args) # 5 — download audio mp3s
|
step_audio_download(args) # 5 — download audio mp3s
|
||||||
step_fonts(args) # 6 — download Heebo fonts
|
step_fonts(args) # 6 — download Heebo fonts
|
||||||
step_images(args) # 7 — fetch noun images
|
step_images(args) # 7 — fetch noun images
|
||||||
step_build_all(args) # 8 — build all .apkg variants
|
step_build_all(args) # 8 — build all .apkg variants
|
||||||
|
|
||||||
print_summary(args, examples_cache, freq_cache)
|
print_summary(args, example_stats, freq_cache)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
|
|
@ -32,7 +32,7 @@ DATA_FILE = Path(__file__).parent.parent / "data" / "words.json"
|
||||||
HEBREW_CONSONANT_RANGE = (0x05D0, 0x05EA) # alef–tav
|
HEBREW_CONSONANT_RANGE = (0x05D0, 0x05EA) # alef–tav
|
||||||
|
|
||||||
VALID_PERSON_CODES: frozenset[str] = frozenset(
|
VALID_PERSON_CODES: frozenset[str] = frozenset(
|
||||||
["1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp", "3p", "ms", "fs", "mp", "fp"]
|
["inf", "1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp", "3p", "ms", "fs", "mp", "fp"]
|
||||||
)
|
)
|
||||||
|
|
||||||
EMOJI_RE = re.compile(
|
EMOJI_RE = re.compile(
|
||||||
|
|
@ -561,6 +561,7 @@ def test_conjugation_form_guids(data: dict[str, Any]) -> None:
|
||||||
"""
|
"""
|
||||||
name = "conjugation_form_guids"
|
name = "conjugation_form_guids"
|
||||||
errors: list[str] = []
|
errors: list[str] = []
|
||||||
|
warnings: list[str] = []
|
||||||
|
|
||||||
for key, entry in data.items():
|
for key, entry in data.items():
|
||||||
conj = entry.get("conjugation")
|
conj = entry.get("conjugation")
|
||||||
|
|
@ -580,7 +581,8 @@ def test_conjugation_form_guids(data: dict[str, Any]) -> None:
|
||||||
guid_candidates = form.get("guid_candidates")
|
guid_candidates = form.get("guid_candidates")
|
||||||
|
|
||||||
if not guid and not guid_candidates:
|
if not guid and not guid_candidates:
|
||||||
errors.append(f"[{key}] {label}: missing or null 'guid' and no 'guid_candidates'")
|
# New forms from rescrape use deterministic fallback — warn, don't fail
|
||||||
|
warnings.append(f"[{key}] {label}: missing or null 'guid' and no 'guid_candidates'")
|
||||||
continue
|
continue
|
||||||
|
|
||||||
if guid:
|
if guid:
|
||||||
|
|
@ -597,6 +599,8 @@ def test_conjugation_form_guids(data: dict[str, Any]) -> None:
|
||||||
else:
|
else:
|
||||||
seen_guids[candidate] = label
|
seen_guids[candidate] = label
|
||||||
|
|
||||||
|
if warnings:
|
||||||
|
_warn(name + "_missing", [f"{len(warnings)} forms missing guid (deterministic fallback used)"])
|
||||||
if errors:
|
if errors:
|
||||||
_fail(name, errors[:20] if not _verbose else errors)
|
_fail(name, errors[:20] if not _verbose else errors)
|
||||||
if len(errors) > 20 and not _verbose:
|
if len(errors) > 20 and not _verbose:
|
||||||
|
|
|
||||||
486
tests/test_detail_scrape.py
Normal file
486
tests/test_detail_scrape.py
Normal file
|
|
@ -0,0 +1,486 @@
|
||||||
|
"""Tests for adjective and preposition detail page parsing in pealim_detail_scrape.py."""
|
||||||
|
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||||
|
|
||||||
|
from pealim_detail_scrape import (
|
||||||
|
_parse_adjective_table,
|
||||||
|
_parse_adjective_table_vl,
|
||||||
|
_parse_preposition_table,
|
||||||
|
_parse_preposition_table_vl,
|
||||||
|
_scrape_adjective_detail,
|
||||||
|
_scrape_preposition_detail,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Fixtures — real HTML snippets from pealim.com
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
ADJECTIVE_MO_TABLE = """
|
||||||
|
<table class="table table-condensed conjugation-table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th class="column-header" colspan="2">Singular</th>
|
||||||
|
<th class="column-header" colspan="2">Plural</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th class="column-header">Masculine</th>
|
||||||
|
<th class="column-header">Feminine</th>
|
||||||
|
<th class="column-header">Masculine</th>
|
||||||
|
<th class="column-header">Feminine</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td class="conj-td">
|
||||||
|
<div id="ms-a">
|
||||||
|
<div><div>
|
||||||
|
<span class="audio-play" data-audio="https://audio.pealim.com/v0/dn/dngfpnovmytc.mp3">🔊</span>
|
||||||
|
<span class="menukad">אֲבִיבִי</span>
|
||||||
|
</div></div>
|
||||||
|
<div class="meaning">spring-like, vernal</div>
|
||||||
|
</div>
|
||||||
|
</td>
|
||||||
|
<td class="conj-td">
|
||||||
|
<div id="fs-a">
|
||||||
|
<div><div>
|
||||||
|
<span class="audio-play" data-audio="https://audio.pealim.com/v0/1j/1j6srg3do7n5k.mp3">🔊</span>
|
||||||
|
<span class="menukad">אֲבִיבִית</span>
|
||||||
|
</div></div>
|
||||||
|
<div class="meaning">spring-like, vernal</div>
|
||||||
|
</div>
|
||||||
|
</td>
|
||||||
|
<td class="conj-td">
|
||||||
|
<div id="mp-a">
|
||||||
|
<div><div>
|
||||||
|
<span class="audio-play" data-audio="https://audio.pealim.com/v0/tj/tjrhw0b5dkhc.mp3">🔊</span>
|
||||||
|
<span class="menukad">אֲבִיבִיִּים</span>
|
||||||
|
</div></div>
|
||||||
|
<div class="meaning">spring-like, vernal</div>
|
||||||
|
</div>
|
||||||
|
</td>
|
||||||
|
<td class="conj-td">
|
||||||
|
<div id="fp-a">
|
||||||
|
<div><div>
|
||||||
|
<span class="audio-play" data-audio="https://audio.pealim.com/v0/h3/h3u1ml5a4xcf.mp3">🔊</span>
|
||||||
|
<span class="menukad">אֲבִיבִיּוֹת</span>
|
||||||
|
</div></div>
|
||||||
|
<div class="meaning">spring-like, vernal</div>
|
||||||
|
</div>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
"""
|
||||||
|
|
||||||
|
# VL version: menukad spans contain unvowelled text (hebstyle=vl)
|
||||||
|
ADJECTIVE_VL_TABLE = """
|
||||||
|
<table class="table table-condensed conjugation-table">
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<td class="conj-td">
|
||||||
|
<div id="ms-a"><div><div>
|
||||||
|
<span class="menukad">אביבי</span>
|
||||||
|
</div></div></div>
|
||||||
|
</td>
|
||||||
|
<td class="conj-td">
|
||||||
|
<div id="fs-a"><div><div>
|
||||||
|
<span class="menukad">אביבית</span>
|
||||||
|
</div></div></div>
|
||||||
|
</td>
|
||||||
|
<td class="conj-td">
|
||||||
|
<div id="mp-a"><div><div>
|
||||||
|
<span class="menukad">אביביים</span>
|
||||||
|
</div></div></div>
|
||||||
|
</td>
|
||||||
|
<td class="conj-td">
|
||||||
|
<div id="fp-a"><div><div>
|
||||||
|
<span class="menukad">אביביות</span>
|
||||||
|
</div></div></div>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
"""
|
||||||
|
|
||||||
|
PREPOSITION_MO_TABLE = """
|
||||||
|
<table class="table table-condensed conjugation-table">
|
||||||
|
<thead>
|
||||||
|
<tr>
|
||||||
|
<th rowspan="2">Person</th>
|
||||||
|
<th class="column-header" colspan="2">Singular</th>
|
||||||
|
<th class="column-header" colspan="2">Plural</th>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th class="column-header">Masculine</th>
|
||||||
|
<th class="column-header">Feminine</th>
|
||||||
|
<th class="column-header">Masculine</th>
|
||||||
|
<th class="column-header">Feminine</th>
|
||||||
|
</tr>
|
||||||
|
</thead>
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<th>1st</th>
|
||||||
|
<td class="conj-td" colspan="2">
|
||||||
|
<div id="P-1s"><div><div>
|
||||||
|
<span class="audio-play" data-audio="https://audio.pealim.com/v0/fk/fkp5faeteecr.mp3">🔊</span>
|
||||||
|
<span class="menukad">שֶׁלִּי</span>
|
||||||
|
</div></div><div class="meaning"><strong>of mine</strong></div></div>
|
||||||
|
</td>
|
||||||
|
<td class="conj-td" colspan="2">
|
||||||
|
<div id="P-1p"><div><div>
|
||||||
|
<span class="audio-play" data-audio="https://audio.pealim.com/v0/13/13uvi0dz6tgcc.mp3">🔊</span>
|
||||||
|
<span class="menukad">שֶׁלָּנוּ</span>
|
||||||
|
</div></div><div class="meaning"><strong>of ours</strong></div></div>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>2nd</th>
|
||||||
|
<td class="conj-td">
|
||||||
|
<div id="P-2ms"><div><div>
|
||||||
|
<span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/shbxafq8ietx.mp3">🔊</span>
|
||||||
|
<span class="menukad">שֶׁלְּךָ</span>
|
||||||
|
</div></div><div class="meaning"><strong>of yours</strong> <em>m. sg.</em></div></div>
|
||||||
|
</td>
|
||||||
|
<td class="conj-td">
|
||||||
|
<div id="P-2fs"><div><div>
|
||||||
|
<span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/sh9ue3a8buo3.mp3">🔊</span>
|
||||||
|
<span class="menukad">שֶׁלָּךְ</span>
|
||||||
|
</div></div><div class="meaning"><strong>of yours</strong> <em>f. sg.</em></div></div>
|
||||||
|
</td>
|
||||||
|
<td class="conj-td">
|
||||||
|
<div id="P-2mp"><div><div>
|
||||||
|
<span class="audio-play" data-audio="https://audio.pealim.com/v0/ol/olx8vzsctlzn.mp3">🔊</span>
|
||||||
|
<span class="menukad">שֶׁלָּכֶם</span>
|
||||||
|
</div></div><div class="meaning"><strong>of yours</strong> <em>m. pl.</em></div></div>
|
||||||
|
</td>
|
||||||
|
<td class="conj-td">
|
||||||
|
<div id="P-2fp"><div><div>
|
||||||
|
<span class="audio-play" data-audio="https://audio.pealim.com/v0/ol/olxrms6dl8eq.mp3">🔊</span>
|
||||||
|
<span class="menukad">שֶׁלָּכֶן</span>
|
||||||
|
</div></div><div class="meaning"><strong>of yours</strong> <em>f. pl.</em></div></div>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>3rd</th>
|
||||||
|
<td class="conj-td">
|
||||||
|
<div id="P-3ms"><div><div>
|
||||||
|
<span class="audio-play" data-audio="https://audio.pealim.com/v0/fk/fkp5qigelthg.mp3">🔊</span>
|
||||||
|
<span class="menukad">שֶׁלּוֹ</span>
|
||||||
|
</div></div><div class="meaning"><strong>of his</strong></div></div>
|
||||||
|
</td>
|
||||||
|
<td class="conj-td">
|
||||||
|
<div id="P-3fs"><div><div>
|
||||||
|
<span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/sh9w36hojm5w.mp3">🔊</span>
|
||||||
|
<span class="menukad">שֶׁלָּהּ</span>
|
||||||
|
</div></div><div class="meaning"><strong>of hers</strong></div></div>
|
||||||
|
</td>
|
||||||
|
<td class="conj-td">
|
||||||
|
<div id="P-3mp"><div><div>
|
||||||
|
<span class="audio-play" data-audio="https://audio.pealim.com/v0/n9/n99z0jr8pint.mp3">🔊</span>
|
||||||
|
<span class="menukad">שֶׁלָּהֶם</span>
|
||||||
|
</div></div><div class="meaning"><strong>of theirs</strong> <em>m.</em></div></div>
|
||||||
|
</td>
|
||||||
|
<td class="conj-td">
|
||||||
|
<div id="P-3fp"><div><div>
|
||||||
|
<span class="audio-play" data-audio="https://audio.pealim.com/v0/n9/n9ahrc59h52w.mp3">🔊</span>
|
||||||
|
<span class="menukad">שֶׁלָּהֶן</span>
|
||||||
|
</div></div><div class="meaning"><strong>of theirs</strong> <em>f.</em></div></div>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
"""
|
||||||
|
|
||||||
|
PREPOSITION_VL_TABLE = """
|
||||||
|
<table class="table table-condensed conjugation-table">
|
||||||
|
<tbody>
|
||||||
|
<tr>
|
||||||
|
<th>1st</th>
|
||||||
|
<td colspan="2"><div id="P-1s"><div><div>
|
||||||
|
<span class="menukad">שלי</span>
|
||||||
|
</div></div></div></td>
|
||||||
|
<td colspan="2"><div id="P-1p"><div><div>
|
||||||
|
<span class="menukad">שלנו</span>
|
||||||
|
</div></div></div></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>2nd</th>
|
||||||
|
<td><div id="P-2ms"><div><div>
|
||||||
|
<span class="menukad">שלך</span>
|
||||||
|
</div></div></div></td>
|
||||||
|
<td><div id="P-2fs"><div><div>
|
||||||
|
<span class="menukad">שלך</span>
|
||||||
|
</div></div></div></td>
|
||||||
|
<td><div id="P-2mp"><div><div>
|
||||||
|
<span class="menukad">שלכם</span>
|
||||||
|
</div></div></div></td>
|
||||||
|
<td><div id="P-2fp"><div><div>
|
||||||
|
<span class="menukad">שלכן</span>
|
||||||
|
</div></div></div></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<th>3rd</th>
|
||||||
|
<td><div id="P-3ms"><div><div>
|
||||||
|
<span class="menukad">שלו</span>
|
||||||
|
</div></div></div></td>
|
||||||
|
<td><div id="P-3fs"><div><div>
|
||||||
|
<span class="menukad">שלה</span>
|
||||||
|
</div></div></div></td>
|
||||||
|
<td><div id="P-3mp"><div><div>
|
||||||
|
<span class="menukad">שלהם</span>
|
||||||
|
</div></div></div></td>
|
||||||
|
<td><div id="P-3fp"><div><div>
|
||||||
|
<span class="menukad">שלהן</span>
|
||||||
|
</div></div></div></td>
|
||||||
|
</tr>
|
||||||
|
</tbody>
|
||||||
|
</table>
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Minimal full-page wrappers so _scrape_*_detail() can parse them
|
||||||
|
_ADJECTIVE_MO_PAGE = f"<html><body>{ADJECTIVE_MO_TABLE}</body></html>"
|
||||||
|
_ADJECTIVE_VL_PAGE = f"<html><body>{ADJECTIVE_VL_TABLE}</body></html>"
|
||||||
|
_PREPOSITION_MO_PAGE = f"<html><body>{PREPOSITION_MO_TABLE}</body></html>"
|
||||||
|
_PREPOSITION_VL_PAGE = f"<html><body>{PREPOSITION_VL_TABLE}</body></html>"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Adjective table tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestParseAdjectiveTable:
|
||||||
|
"""Tests for _parse_adjective_table (mo/nikkud page)."""
|
||||||
|
|
||||||
|
def test_returns_four_form_keys(self) -> None:
|
||||||
|
result = _parse_adjective_table(__import__("bs4").BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml"))
|
||||||
|
assert set(result.keys()) == {"ms", "fs", "mp", "fp"}
|
||||||
|
|
||||||
|
def test_ms_nikkud(self) -> None:
|
||||||
|
result = _parse_adjective_table(__import__("bs4").BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml"))
|
||||||
|
assert result["ms"]["nikkud"] == "אֲבִיבִי"
|
||||||
|
|
||||||
|
def test_fs_nikkud(self) -> None:
|
||||||
|
result = _parse_adjective_table(__import__("bs4").BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml"))
|
||||||
|
assert result["fs"]["nikkud"] == "אֲבִיבִית"
|
||||||
|
|
||||||
|
def test_mp_nikkud(self) -> None:
|
||||||
|
result = _parse_adjective_table(__import__("bs4").BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml"))
|
||||||
|
assert result["mp"]["nikkud"] == "אֲבִיבִיִּים"
|
||||||
|
|
||||||
|
def test_fp_nikkud(self) -> None:
|
||||||
|
result = _parse_adjective_table(__import__("bs4").BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml"))
|
||||||
|
assert result["fp"]["nikkud"] == "אֲבִיבִיּוֹת"
|
||||||
|
|
||||||
|
def test_audio_url_present(self) -> None:
|
||||||
|
result = _parse_adjective_table(__import__("bs4").BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml"))
|
||||||
|
assert result["ms"]["audio_url"].startswith("https://audio.pealim.com/")
|
||||||
|
|
||||||
|
def test_empty_on_missing_table(self) -> None:
|
||||||
|
result = _parse_adjective_table(__import__("bs4").BeautifulSoup("<html><body></body></html>", "lxml"))
|
||||||
|
assert result == {}
|
||||||
|
|
||||||
|
|
||||||
|
class TestParseAdjectiveTableVl:
|
||||||
|
"""Tests for _parse_adjective_table_vl (ktiv male page)."""
|
||||||
|
|
||||||
|
def test_returns_four_form_keys(self) -> None:
|
||||||
|
result = _parse_adjective_table_vl(__import__("bs4").BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml"))
|
||||||
|
assert set(result.keys()) == {"ms", "fs", "mp", "fp"}
|
||||||
|
|
||||||
|
def test_ms_ktiv(self) -> None:
|
||||||
|
result = _parse_adjective_table_vl(__import__("bs4").BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml"))
|
||||||
|
assert result["ms"] == "אביבי"
|
||||||
|
|
||||||
|
def test_fs_ktiv(self) -> None:
|
||||||
|
result = _parse_adjective_table_vl(__import__("bs4").BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml"))
|
||||||
|
assert result["fs"] == "אביבית"
|
||||||
|
|
||||||
|
def test_mp_ktiv(self) -> None:
|
||||||
|
result = _parse_adjective_table_vl(__import__("bs4").BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml"))
|
||||||
|
assert result["mp"] == "אביביים"
|
||||||
|
|
||||||
|
def test_fp_ktiv(self) -> None:
|
||||||
|
result = _parse_adjective_table_vl(__import__("bs4").BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml"))
|
||||||
|
assert result["fp"] == "אביביות"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# _scrape_adjective_detail tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestScrapeAdjectiveDetail:
    """Schema-compliance checks for _scrape_adjective_detail."""

    @pytest.fixture()
    def result(self) -> dict:
        """Run the scraper on the two adjective fixture pages."""
        detail = _scrape_adjective_detail(
            "9098-avivi",
            _ADJECTIVE_MO_PAGE,
            _ADJECTIVE_VL_PAGE,
        )
        return detail

    def test_returns_non_empty_dict(self, result: dict) -> None:
        """The scraper yields a truthy (non-empty) result."""
        assert result

    def test_ms_has_nikkud_and_ktiv(self, result: dict) -> None:
        """The "ms" form exposes both spellings."""
        form = result["ms"]
        assert form["nikkud"] == "אֲבִיבִי"
        assert form["ktiv_male"] == "אביבי"

    def test_fs_has_nikkud_and_ktiv(self, result: dict) -> None:
        """The "fs" form exposes both spellings."""
        form = result["fs"]
        assert form["nikkud"] == "אֲבִיבִית"
        assert form["ktiv_male"] == "אביבית"

    def test_mp_has_nikkud_and_ktiv(self, result: dict) -> None:
        """The "mp" form exposes both spellings."""
        form = result["mp"]
        assert form["nikkud"] == "אֲבִיבִיִּים"
        assert form["ktiv_male"] == "אביביים"

    def test_fp_has_nikkud_and_ktiv(self, result: dict) -> None:
        """The "fp" form exposes both spellings."""
        form = result["fp"]
        assert form["nikkud"] == "אֲבִיבִיּוֹת"
        assert form["ktiv_male"] == "אביביות"

    def test_mishkal_key_present(self, result: dict) -> None:
        """The "mishkal" key exists even when its value is None."""
        # The minimal fixture has no PoS section, so only the key's presence
        # is checked here, not its value.
        assert "mishkal" in result

    def test_mishkal_hebrew_key_present(self, result: dict) -> None:
        """The "mishkal_hebrew" key exists in the result."""
        assert "mishkal_hebrew" in result

    def test_all_schema_keys_present(self, result: dict) -> None:
        """Every expected schema key appears in the result."""
        expected = {"ms", "fs", "mp", "fp", "mishkal", "mishkal_hebrew"}
        missing = expected.difference(result.keys())
        assert not missing

    def test_empty_on_no_table(self) -> None:
        """Pages without a table produce an empty dict."""
        blank_page = "<html><body></body></html>"
        result = _scrape_adjective_detail("missing", blank_page, blank_page)
        assert result == {}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Preposition table tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestParsePrepositionTable:
    """Tests for _parse_preposition_table (mo/nikkud page)."""

    @pytest.fixture()
    def result(self) -> dict:
        """Return the parse of PREPOSITION_MO_TABLE, rebuilt for each test."""
        # Imported locally: the original loaded bs4 lazily via __import__,
        # so a function-scope import keeps bs4 out of module import time.
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(PREPOSITION_MO_TABLE, "lxml")
        return _parse_preposition_table(soup)

    def test_returns_ten_form_keys(self, result: dict) -> None:
        """Exactly the ten person/gender/number keys are returned."""
        expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
        assert set(result.keys()) == expected

    # One test per person key is kept (rather than a parametrized test) so
    # the pytest node ids stay the same as before.

    def test_1s_nikkud(self, result: dict) -> None:
        """The "1s" form has the expected pointed spelling."""
        assert result["1s"]["nikkud"] == "שֶׁלִּי"

    def test_1p_nikkud(self, result: dict) -> None:
        """The "1p" form has the expected pointed spelling."""
        assert result["1p"]["nikkud"] == "שֶׁלָּנוּ"

    def test_2ms_nikkud(self, result: dict) -> None:
        """The "2ms" form has the expected pointed spelling."""
        assert result["2ms"]["nikkud"] == "שֶׁלְּךָ"

    def test_2fs_nikkud(self, result: dict) -> None:
        """The "2fs" form has the expected pointed spelling."""
        assert result["2fs"]["nikkud"] == "שֶׁלָּךְ"

    def test_2mp_nikkud(self, result: dict) -> None:
        """The "2mp" form has the expected pointed spelling."""
        assert result["2mp"]["nikkud"] == "שֶׁלָּכֶם"

    def test_2fp_nikkud(self, result: dict) -> None:
        """The "2fp" form has the expected pointed spelling."""
        assert result["2fp"]["nikkud"] == "שֶׁלָּכֶן"

    def test_3ms_nikkud(self, result: dict) -> None:
        """The "3ms" form has the expected pointed spelling."""
        assert result["3ms"]["nikkud"] == "שֶׁלּוֹ"

    def test_3fs_nikkud(self, result: dict) -> None:
        """The "3fs" form has the expected pointed spelling."""
        assert result["3fs"]["nikkud"] == "שֶׁלָּהּ"

    def test_3mp_nikkud(self, result: dict) -> None:
        """The "3mp" form has the expected pointed spelling."""
        assert result["3mp"]["nikkud"] == "שֶׁלָּהֶם"

    def test_3fp_nikkud(self, result: dict) -> None:
        """The "3fp" form has the expected pointed spelling."""
        assert result["3fp"]["nikkud"] == "שֶׁלָּהֶן"

    def test_audio_url_present(self, result: dict) -> None:
        """The "1s" form carries an audio URL on the pealim audio host."""
        assert result["1s"]["audio_url"].startswith("https://audio.pealim.com/")

    def test_empty_on_missing_table(self) -> None:
        """A page without any table parses to an empty dict."""
        from bs4 import BeautifulSoup

        soup = BeautifulSoup("<html><body></body></html>", "lxml")
        result = _parse_preposition_table(soup)
        assert result == {}
|
||||||
|
|
||||||
|
|
||||||
|
class TestParsePrepositionTableVl:
    """Tests for _parse_preposition_table_vl (ktiv male page)."""

    @pytest.fixture()
    def result(self) -> dict:
        """Return the parse of PREPOSITION_VL_TABLE, rebuilt for each test."""
        # Imported locally: the original loaded bs4 lazily via __import__,
        # so a function-scope import keeps bs4 out of module import time.
        from bs4 import BeautifulSoup

        soup = BeautifulSoup(PREPOSITION_VL_TABLE, "lxml")
        return _parse_preposition_table_vl(soup)

    def test_returns_ten_form_keys(self, result: dict) -> None:
        """Exactly the ten person/gender/number keys are returned."""
        expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
        assert set(result.keys()) == expected

    def test_1s_ktiv(self, result: dict) -> None:
        """The "1s" entry is the plain (unpointed) string."""
        assert result["1s"] == "שלי"

    def test_1p_ktiv(self, result: dict) -> None:
        """The "1p" entry is the plain (unpointed) string."""
        assert result["1p"] == "שלנו"

    def test_2ms_ktiv(self, result: dict) -> None:
        """The "2ms" entry is the plain (unpointed) string."""
        assert result["2ms"] == "שלך"

    def test_3ms_ktiv(self, result: dict) -> None:
        """The "3ms" entry is the plain (unpointed) string."""
        assert result["3ms"] == "שלו"

    def test_3fp_ktiv(self, result: dict) -> None:
        """The "3fp" entry is the plain (unpointed) string."""
        assert result["3fp"] == "שלהן"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# _scrape_preposition_detail tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestScrapePrepositionDetail:
    """Schema-compliance checks for _scrape_preposition_detail."""

    @pytest.fixture()
    def result(self) -> dict:
        """Run the scraper on the two preposition fixture pages."""
        detail = _scrape_preposition_detail(
            "2643-shel",
            _PREPOSITION_MO_PAGE,
            _PREPOSITION_VL_PAGE,
        )
        return detail

    def test_returns_non_empty_dict(self, result: dict) -> None:
        """The scraper yields a truthy (non-empty) result."""
        assert result

    def test_all_ten_person_keys_present(self, result: dict) -> None:
        """All ten person keys appear in the result; extra keys are allowed."""
        expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
        missing = expected.difference(result.keys())
        assert not missing

    def test_1s_has_nikkud_and_ktiv(self, result: dict) -> None:
        """The "1s" form exposes both spellings."""
        form = result["1s"]
        assert form["nikkud"] == "שֶׁלִּי"
        assert form["ktiv_male"] == "שלי"

    def test_1p_has_nikkud_and_ktiv(self, result: dict) -> None:
        """The "1p" form exposes both spellings."""
        form = result["1p"]
        assert form["nikkud"] == "שֶׁלָּנוּ"
        assert form["ktiv_male"] == "שלנו"

    def test_2ms_has_nikkud_and_ktiv(self, result: dict) -> None:
        """The "2ms" form exposes both spellings."""
        form = result["2ms"]
        assert form["nikkud"] == "שֶׁלְּךָ"
        assert form["ktiv_male"] == "שלך"

    def test_3ms_has_nikkud_and_ktiv(self, result: dict) -> None:
        """The "3ms" form exposes both spellings."""
        form = result["3ms"]
        assert form["nikkud"] == "שֶׁלּוֹ"
        assert form["ktiv_male"] == "שלו"

    def test_3fs_has_nikkud_and_ktiv(self, result: dict) -> None:
        """The "3fs" form exposes both spellings."""
        form = result["3fs"]
        assert form["nikkud"] == "שֶׁלָּהּ"
        assert form["ktiv_male"] == "שלה"

    def test_3fp_has_nikkud_and_ktiv(self, result: dict) -> None:
        """The "3fp" form exposes both spellings."""
        form = result["3fp"]
        assert form["nikkud"] == "שֶׁלָּהֶן"
        assert form["ktiv_male"] == "שלהן"

    def test_empty_on_no_table(self) -> None:
        """Pages without a table produce an empty dict."""
        blank_page = "<html><body></body></html>"
        result = _scrape_preposition_detail("missing", blank_page, blank_page)
        assert result == {}
|
||||||
|
|
@ -42,3 +42,17 @@ def test_strip_nikkud_all_marks():
|
||||||
nikkud = "הַמַּלְכָּה"
|
nikkud = "הַמַּלְכָּה"
|
||||||
plain = strip_nikkud(nikkud)
|
plain = strip_nikkud(nikkud)
|
||||||
assert all(ch < "\u0591" or ch > "\u05c7" for ch in plain), f"Residual nikkud in: {plain}"
|
assert all(ch < "\u0591" or ch > "\u05c7" for ch in plain), f"Residual nikkud in: {plain}"
|
||||||
|
|
||||||
|
|
||||||
|
def test_categorize_pos_no_substring_match():
    """Guard against substring matching: "Pronoun" must not land in "Noun"."""
    from apkg_builder import _categorize_pos

    # These four labels are expected to map to a category of the same name.
    for label in ("Noun", "Verb", "Adjective", "Adverb"):
        assert _categorize_pos(label) == label

    # The regression case: "Pronoun" contains "noun" but belongs in "Other".
    assert _categorize_pos("Pronoun") == "Other", "Pronoun must not match Noun"

    # The remaining labels are all expected to fall through to "Other".
    for label in ("Preposition", "Conjunction", "Cardinal numeral"):
        assert _categorize_pos(label) == "Other"
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue