Sprint 15: example sentence pipeline overhaul + corpus expansion + card improvements
- Regenerated all example sentences from scratch (deleted legacy + stale entries) - Added .txt file support to epub_examples.py for Ben Yehuda corpus - 7 Ben Yehuda nikkud'd children's texts + 3 new Time Tunnel EPUBs - Maqaf-stripped construct form indexing (+68% inflected matches) - Total: 3,598 words with examples, 3,289 with cloze (was ~2,900) - Cloze prefix preservation (_cloze_prefix_len) - Hebrew spoiler stripping from English meanings - Gender field (זָכָר/נְקֵבָה) on vocab cards - sec-table CSS layout for aligned key:value pairs - Mishkal uses mishkal_hebrew on plural cards - Improved mishkal extraction from pealim detail pages - 21 new pytest tests (cloze, PoS, Hebrew stripping, gender, mishkal) - 2 new validate_data.py tests + mishkal stats - Colliding forms tracking (local-only) - Release tag v0.17 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
efd0745ada
commit
c85063ee2f
7 changed files with 15634 additions and 3273 deletions
1
.gitignore
vendored
1
.gitignore
vendored
|
|
@ -15,6 +15,7 @@ __pycache__/
|
||||||
|
|
||||||
# Large generated cache files (rebuild locally)
|
# Large generated cache files (rebuild locally)
|
||||||
data/benyehuda_index.json
|
data/benyehuda_index.json
|
||||||
|
data/colliding_forms.json
|
||||||
|
|
||||||
# Audio directories (large; rebuild locally)
|
# Audio directories (large; rebuild locally)
|
||||||
data/audio/
|
data/audio/
|
||||||
|
|
|
||||||
174
apkg_builder.py
174
apkg_builder.py
|
|
@ -35,7 +35,7 @@ COMPLETE_PLURAL_DECK_ID = 1_234_567_903
|
||||||
|
|
||||||
# Release version tag added to all notes so users can identify which release
|
# Release version tag added to all notes so users can identify which release
|
||||||
# their cards come from (visible in Anki's Browse view and card info).
|
# their cards come from (visible in Anki's Browse view and card info).
|
||||||
RELEASE_TAG = "v0.16"
|
RELEASE_TAG = "v0.17"
|
||||||
|
|
||||||
# Regex for extracting emoji and Hebrew prepositions from meaning strings
|
# Regex for extracting emoji and Hebrew prepositions from meaning strings
|
||||||
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
|
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
|
||||||
|
|
@ -185,18 +185,34 @@ CARD_CSS = """
|
||||||
font-weight: normal;
|
font-weight: normal;
|
||||||
color: #555;
|
color: #555;
|
||||||
}
|
}
|
||||||
|
.sec-table {
|
||||||
|
display: table;
|
||||||
|
margin: 6px auto 0;
|
||||||
|
direction: rtl;
|
||||||
|
border-collapse: collapse;
|
||||||
|
}
|
||||||
.sec-label {
|
.sec-label {
|
||||||
|
display: table-row;
|
||||||
font-size: 28px;
|
font-size: 28px;
|
||||||
font-weight: normal;
|
font-weight: normal;
|
||||||
color: #222;
|
color: #222;
|
||||||
direction: rtl;
|
direction: rtl;
|
||||||
text-align: center;
|
|
||||||
margin-top: 6px;
|
|
||||||
}
|
}
|
||||||
.sec-key {
|
.sec-key {
|
||||||
|
display: table-cell;
|
||||||
font-size: 28px;
|
font-size: 28px;
|
||||||
color: #222;
|
color: #222;
|
||||||
font-weight: bold;
|
font-weight: bold;
|
||||||
|
text-align: right;
|
||||||
|
padding: 2px 0 2px 8px;
|
||||||
|
white-space: nowrap;
|
||||||
|
}
|
||||||
|
.sec-val {
|
||||||
|
display: table-cell;
|
||||||
|
font-size: 28px;
|
||||||
|
color: #222;
|
||||||
|
text-align: right;
|
||||||
|
padding: 2px 0;
|
||||||
}
|
}
|
||||||
.definitions {
|
.definitions {
|
||||||
direction: rtl;
|
direction: rtl;
|
||||||
|
|
@ -231,6 +247,7 @@ CARD_CSS = """
|
||||||
.root-info { color: #e0e0e0; }
|
.root-info { color: #e0e0e0; }
|
||||||
.sec-label { color: #e0e0e0; }
|
.sec-label { color: #e0e0e0; }
|
||||||
.sec-key { color: #e0e0e0; }
|
.sec-key { color: #e0e0e0; }
|
||||||
|
.sec-val { color: #e0e0e0; }
|
||||||
.conf-entry { color: #ddd; }
|
.conf-entry { color: #ddd; }
|
||||||
.hint { color: #777; }
|
.hint { color: #777; }
|
||||||
.voice-label { color: #888; }
|
.voice-label { color: #888; }
|
||||||
|
|
@ -255,14 +272,17 @@ VOCAB_BACK_HEB = """
|
||||||
<div class="meaning">{{Meaning}}</div>
|
<div class="meaning">{{Meaning}}</div>
|
||||||
{{#Emoji}}<div class="emoji-img">{{Emoji}}</div>{{/Emoji}}
|
{{#Emoji}}<div class="emoji-img">{{Emoji}}</div>{{/Emoji}}
|
||||||
{{^Emoji}}{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;" onerror="this.parentElement.style.display='none'"></div>{{/Image}}{{/Emoji}}
|
{{^Emoji}}{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;" onerror="this.parentElement.style.display='none'"></div>{{/Image}}{{/Emoji}}
|
||||||
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>{{/Root}}
|
<div class="sec-table">
|
||||||
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span> {{PoS}}</div>{{/PoS}}
|
{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span><span class="sec-val">{{WordNoNikkud}}</span></div>{{/WordNoNikkud}}
|
||||||
|
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">{{Root}}</span></div>{{/Root}}
|
||||||
|
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span><span class="sec-val">{{PoS}}{{#Gender}}, {{Gender}}{{/Gender}}</span></div>{{/PoS}}
|
||||||
|
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span><span class="sec-val">{{Plural}}</span></div>{{/Plural}}
|
||||||
|
</div>
|
||||||
{{#SharedRoots}}
|
{{#SharedRoots}}
|
||||||
<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
|
<div class="divider" style="margin:6px 0;"></div>
|
||||||
|
<div class="sec-label" style="text-align:center;display:block;"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
|
||||||
<div class="root-info">{{SharedRoots}}</div>
|
<div class="root-info">{{SharedRoots}}</div>
|
||||||
{{/SharedRoots}}
|
{{/SharedRoots}}
|
||||||
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
|
|
||||||
{{#Frequency}}<div class="freq-badge">#{{Frequency}}</div>{{/Frequency}}
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
VOCAB_FRONT_ENG = """
|
VOCAB_FRONT_ENG = """
|
||||||
|
|
@ -277,14 +297,17 @@ VOCAB_BACK_ENG = """
|
||||||
<div class="divider"></div>
|
<div class="divider"></div>
|
||||||
<div class="hebrew">{{Word}}{{#Prep}} <span class="hebrew-sm">{{Prep}}</span>{{/Prep}}</div>
|
<div class="hebrew">{{Word}}{{#Prep}} <span class="hebrew-sm">{{Prep}}</span>{{/Prep}}</div>
|
||||||
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
|
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
|
||||||
{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span> {{WordNoNikkud}}</div>{{/WordNoNikkud}}
|
<div class="sec-table">
|
||||||
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>{{/Root}}
|
{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span><span class="sec-val">{{WordNoNikkud}}</span></div>{{/WordNoNikkud}}
|
||||||
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span> {{PoS}}</div>{{/PoS}}
|
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">{{Root}}</span></div>{{/Root}}
|
||||||
|
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span><span class="sec-val">{{PoS}}{{#Gender}}, {{Gender}}{{/Gender}}</span></div>{{/PoS}}
|
||||||
|
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span><span class="sec-val">{{Plural}}</span></div>{{/Plural}}
|
||||||
|
</div>
|
||||||
{{#SharedRoots}}
|
{{#SharedRoots}}
|
||||||
<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
|
<div class="divider" style="margin:6px 0;"></div>
|
||||||
|
<div class="sec-label" style="text-align:center;display:block;"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
|
||||||
<div class="root-info">{{SharedRoots}}</div>
|
<div class="root-info">{{SharedRoots}}</div>
|
||||||
{{/SharedRoots}}
|
{{/SharedRoots}}
|
||||||
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
VOCAB_FRONT_CLOZE = """
|
VOCAB_FRONT_CLOZE = """
|
||||||
|
|
@ -318,6 +341,7 @@ VOCAB_MODEL = genanki.Model(
|
||||||
{"name": "Prep"},
|
{"name": "Prep"},
|
||||||
{"name": "Hint"},
|
{"name": "Hint"},
|
||||||
{"name": "Plural"},
|
{"name": "Plural"},
|
||||||
|
{"name": "Gender"},
|
||||||
{"name": "ClozeExample"},
|
{"name": "ClozeExample"},
|
||||||
{"name": "ClozeHint"},
|
{"name": "ClozeHint"},
|
||||||
],
|
],
|
||||||
|
|
@ -359,11 +383,16 @@ CONJ_BACK = """
|
||||||
{{FrontSide}}<hr>
|
{{FrontSide}}<hr>
|
||||||
<div class="hebrew">{{ConjugatedForm}}{{#Prep}} ({{Prep}}){{/Prep}}</div>
|
<div class="hebrew">{{ConjugatedForm}}{{#Prep}} ({{Prep}}){{/Prep}}</div>
|
||||||
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
|
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
|
||||||
{{#Meaning}}<div class="sec-label">{{Meaning}}</div>{{/Meaning}}
|
{{#Meaning}}<div class="sec-label" style="text-align:center;display:block;">{{Meaning}}</div>{{/Meaning}}
|
||||||
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>
|
<div class="sec-table">
|
||||||
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span> {{Binyan}}</div>
|
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">{{Root}}</span></div>
|
||||||
{{#RelatedVocab}}<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
|
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span><span class="sec-val">{{Binyan}}</span></div>
|
||||||
<div class="sec-label">{{RelatedVocab}}</div>{{/RelatedVocab}}
|
</div>
|
||||||
|
{{#RelatedVocab}}
|
||||||
|
<div class="divider" style="margin:6px 0;"></div>
|
||||||
|
<div class="sec-label" style="text-align:center;display:block;"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
|
||||||
|
<div class="root-info">{{RelatedVocab}}</div>
|
||||||
|
{{/RelatedVocab}}
|
||||||
"""
|
"""
|
||||||
|
|
||||||
CONJ_CSS = CARD_CSS
|
CONJ_CSS = CARD_CSS
|
||||||
|
|
@ -703,6 +732,32 @@ def _forms_list_to_dict(forms_list: list[dict]) -> dict[str, dict]:
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
# Hebrew prefix letters (אותיות השימוש): בהוכלמש
|
||||||
|
_PREFIX_LETTERS = frozenset("בהוכלמש")
|
||||||
|
|
||||||
|
|
||||||
|
def _cloze_prefix_len(cloze_token: str, word_nikkud: str) -> int:
|
||||||
|
"""Return the number of characters in the cloze token that are prefix (not part of the word).
|
||||||
|
|
||||||
|
For "בַּתּוֹר" with word_nikkud "תּוֹר", returns 2 (ב + patach = 2 chars).
|
||||||
|
Returns 0 if the token starts with the word directly.
|
||||||
|
"""
|
||||||
|
if not word_nikkud or not cloze_token:
|
||||||
|
return 0
|
||||||
|
# If the token starts with the word nikkud, no prefix
|
||||||
|
if cloze_token.startswith(word_nikkud):
|
||||||
|
return 0
|
||||||
|
# Check if word nikkud appears as a suffix of the token
|
||||||
|
idx = cloze_token.find(word_nikkud)
|
||||||
|
if idx > 0:
|
||||||
|
# Verify prefix chars are valid Hebrew prefix letters
|
||||||
|
prefix_part = cloze_token[:idx]
|
||||||
|
base_letters = [c for c in prefix_part if "\u05d0" <= c <= "\u05ea"]
|
||||||
|
if base_letters and all(c in _PREFIX_LETTERS for c in base_letters):
|
||||||
|
return idx
|
||||||
|
return 0
|
||||||
|
|
||||||
|
|
||||||
def build_vocab_deck(
|
def build_vocab_deck(
|
||||||
words: dict[str, dict],
|
words: dict[str, dict],
|
||||||
limit: int | None = None,
|
limit: int | None = None,
|
||||||
|
|
@ -758,7 +813,11 @@ def build_vocab_deck(
|
||||||
pos_heb = entry.get("pos_hebrew", "")
|
pos_heb = entry.get("pos_hebrew", "")
|
||||||
meaning = EMOJI_RE.sub("", entry.get("meaning", "") or "").strip()
|
meaning = EMOJI_RE.sub("", entry.get("meaning", "") or "").strip()
|
||||||
meaning = HBPAREN_RE.sub("", meaning).strip()
|
meaning = HBPAREN_RE.sub("", meaning).strip()
|
||||||
meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ")
|
# Strip remaining bare Hebrew text (spoiler: ktiv male visible in English meaning)
|
||||||
|
meaning = re.sub(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*", "", meaning)
|
||||||
|
meaning = re.sub(r"[;:]\s*—", " —", meaning) # clean "; —" → " —"
|
||||||
|
meaning = re.sub(r";\s*:", ";", meaning) # clean "; :" → ";"
|
||||||
|
meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ;:")
|
||||||
meaning = re.sub(r"(\w)\(", r"\1 (", meaning) # space before opening paren
|
meaning = re.sub(r"(\w)\(", r"\1 (", meaning) # space before opening paren
|
||||||
meaning = re.sub(r",(\S)", r", \1", meaning) # space after comma
|
meaning = re.sub(r",(\S)", r", \1", meaning) # space after comma
|
||||||
meaning_raw = entry.get("meaning_raw", "") or ""
|
meaning_raw = entry.get("meaning_raw", "") or ""
|
||||||
|
|
@ -850,10 +909,13 @@ def build_vocab_deck(
|
||||||
start = cloze_data.get("cloze_word_start")
|
start = cloze_data.get("cloze_word_start")
|
||||||
end = cloze_data.get("cloze_word_end")
|
end = cloze_data.get("cloze_word_end")
|
||||||
if cloze_text and start is not None and end is not None:
|
if cloze_text and start is not None and end is not None:
|
||||||
cloze_example = cloze_text[:start] + "_____" + cloze_text[end:]
|
# Preserve Hebrew prefix letters in the cloze blank
|
||||||
# Clean up duplicate/misplaced quotation marks
|
# e.g. "בַּתּוֹר" for word "תּוֹר" → "בַּ_____" not "_____"
|
||||||
|
cloze_token = cloze_text[start:end]
|
||||||
|
prefix_chars = _cloze_prefix_len(cloze_token, word_nikkud)
|
||||||
|
cloze_example = cloze_text[: start + prefix_chars] + "_____" + cloze_text[end:]
|
||||||
|
# Clean up duplicate adjacent quotation marks (e.g. "" → ")
|
||||||
cloze_example = re.sub(r'["״]\s*["״]', '"', cloze_example)
|
cloze_example = re.sub(r'["״]\s*["״]', '"', cloze_example)
|
||||||
cloze_example = re.sub(r'^\s*["״]\s*', "", cloze_example)
|
|
||||||
raw_hint = cloze_data.get("cloze_hint") or ""
|
raw_hint = cloze_data.get("cloze_hint") or ""
|
||||||
if raw_hint:
|
if raw_hint:
|
||||||
cloze_hint = raw_hint
|
cloze_hint = raw_hint
|
||||||
|
|
@ -886,12 +948,19 @@ def build_vocab_deck(
|
||||||
parts.append(f'<div class="related-group"><b>{label}:</b> {" ".join(rw_words)}</div>')
|
parts.append(f'<div class="related-group"><b>{label}:</b> {" ".join(rw_words)}</div>')
|
||||||
related_html = "\n".join(parts)
|
related_html = "\n".join(parts)
|
||||||
|
|
||||||
# Plural form (nouns only — guard against adjective/verb inflection bleed)
|
# Plural form and gender (nouns only)
|
||||||
plural_str = ""
|
plural_str = ""
|
||||||
|
gender_str = ""
|
||||||
if pos_raw.startswith("Noun"):
|
if pos_raw.startswith("Noun"):
|
||||||
noun_inflection = entry.get("noun_inflection")
|
noun_inflection = entry.get("noun_inflection")
|
||||||
if noun_inflection and noun_inflection.get("plural"):
|
if noun_inflection:
|
||||||
plural_str = noun_inflection["plural"].get("nikkud", "")
|
if noun_inflection.get("plural"):
|
||||||
|
plural_str = noun_inflection["plural"].get("nikkud", "")
|
||||||
|
gender_raw = noun_inflection.get("gender") or ""
|
||||||
|
if gender_raw == "masculine":
|
||||||
|
gender_str = "זָכָר"
|
||||||
|
elif gender_raw == "feminine":
|
||||||
|
gender_str = "נְקֵבָה"
|
||||||
|
|
||||||
# Image
|
# Image
|
||||||
image_tag = ""
|
image_tag = ""
|
||||||
|
|
@ -927,6 +996,7 @@ def build_vocab_deck(
|
||||||
prep_str,
|
prep_str,
|
||||||
hint_str,
|
hint_str,
|
||||||
plural_str,
|
plural_str,
|
||||||
|
gender_str,
|
||||||
cloze_example,
|
cloze_example,
|
||||||
cloze_hint,
|
cloze_hint,
|
||||||
],
|
],
|
||||||
|
|
@ -941,7 +1011,8 @@ def build_vocab_deck(
|
||||||
prep_count = sum(1 for n in deck.notes if n.fields[12])
|
prep_count = sum(1 for n in deck.notes if n.fields[12])
|
||||||
hint_count = sum(1 for n in deck.notes if n.fields[13])
|
hint_count = sum(1 for n in deck.notes if n.fields[13])
|
||||||
plural_count = sum(1 for n in deck.notes if n.fields[14])
|
plural_count = sum(1 for n in deck.notes if n.fields[14])
|
||||||
cloze_count = sum(1 for n in deck.notes if n.fields[15])
|
gender_count = sum(1 for n in deck.notes if n.fields[15])
|
||||||
|
cloze_count = sum(1 for n in deck.notes if n.fields[16])
|
||||||
unlisted = sum(1 for _, e in sorted_entries if (e.get("frequency") or 999_999) >= 999_999)
|
unlisted = sum(1 for _, e in sorted_entries if (e.get("frequency") or 999_999) >= 999_999)
|
||||||
if emoji_count:
|
if emoji_count:
|
||||||
logger.info(f" Emoji extracted: {emoji_count} words")
|
logger.info(f" Emoji extracted: {emoji_count} words")
|
||||||
|
|
@ -951,6 +1022,8 @@ def build_vocab_deck(
|
||||||
logger.info(f" Eng→Heb hints: {hint_count} words")
|
logger.info(f" Eng→Heb hints: {hint_count} words")
|
||||||
if plural_count:
|
if plural_count:
|
||||||
logger.info(f" Noun plurals on vocab cards: {plural_count} words")
|
logger.info(f" Noun plurals on vocab cards: {plural_count} words")
|
||||||
|
if gender_count:
|
||||||
|
logger.info(f" Noun gender on vocab cards: {gender_count} words")
|
||||||
if cloze_count:
|
if cloze_count:
|
||||||
logger.info(f" Sentence cloze cards: {cloze_count} words")
|
logger.info(f" Sentence cloze cards: {cloze_count} words")
|
||||||
logger.info(f" Unlisted words (not in frequency corpus): {unlisted}/{len(sorted_entries)}")
|
logger.info(f" Unlisted words (not in frequency corpus): {unlisted}/{len(sorted_entries)}")
|
||||||
|
|
@ -1330,10 +1403,12 @@ def build_confusables_deck(
|
||||||
w = e["word"]["nikkud"]
|
w = e["word"]["nikkud"]
|
||||||
m = e.get("meaning", "")
|
m = e.get("meaning", "")
|
||||||
p = e.get("pos_hebrew", "")
|
p = e.get("pos_hebrew", "")
|
||||||
pos_label = f" ({p})" if p else ""
|
pos_div = f'<div style="font-size:18px; color:#888;">{p}</div>' if p else ""
|
||||||
defs_parts.append(
|
defs_parts.append(
|
||||||
f'<div class="conf-entry"><span class="hebrew" style="font-size:24px;">{w}</span>'
|
f'<div class="conf-entry">'
|
||||||
f" = {m}{pos_label}</div>"
|
f'<span class="hebrew" style="font-size:24px;">{w}</span>'
|
||||||
|
f'<div style="direction:ltr; text-align:center; font-size:22px;">{m}</div>'
|
||||||
|
f"{pos_div}</div>"
|
||||||
)
|
)
|
||||||
if include_audio:
|
if include_audio:
|
||||||
af = e.get("audio_file", "") or ""
|
af = e.get("audio_file", "") or ""
|
||||||
|
|
@ -1397,8 +1472,10 @@ PLURAL_BACK_SG = """
|
||||||
{{FrontSide}}<hr>
|
{{FrontSide}}<hr>
|
||||||
<div class="hebrew">{{Plural}}</div>
|
<div class="hebrew">{{Plural}}</div>
|
||||||
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
|
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
|
||||||
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span> {{Gender}}</div>{{/Gender}}
|
<div class="sec-table">
|
||||||
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
|
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span><span class="sec-val">{{Gender}}</span></div>{{/Gender}}
|
||||||
|
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span><span class="sec-val">{{Mishkal}}</span></div>{{/Mishkal}}
|
||||||
|
</div>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
PLURAL_FRONT_PL = """
|
PLURAL_FRONT_PL = """
|
||||||
|
|
@ -1411,9 +1488,11 @@ PLURAL_BACK_PL = """
|
||||||
{{FrontSide}}<hr>
|
{{FrontSide}}<hr>
|
||||||
<div class="hebrew">{{Singular}}</div>
|
<div class="hebrew">{{Singular}}</div>
|
||||||
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
|
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
|
||||||
<div class="sec-label">{{Meaning}}</div>
|
<div class="sec-label" style="text-align:center;display:block;">{{Meaning}}</div>
|
||||||
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span> {{Gender}}</div>{{/Gender}}
|
<div class="sec-table">
|
||||||
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
|
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span><span class="sec-val">{{Gender}}</span></div>{{/Gender}}
|
||||||
|
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span><span class="sec-val">{{Mishkal}}</span></div>{{/Mishkal}}
|
||||||
|
</div>
|
||||||
"""
|
"""
|
||||||
|
|
||||||
PLURAL_CSS = CARD_CSS
|
PLURAL_CSS = CARD_CSS
|
||||||
|
|
@ -1501,13 +1580,25 @@ def build_plural_deck(
|
||||||
elif mishkal:
|
elif mishkal:
|
||||||
by_mishkal.setdefault(mishkal, []).append((unique_key, entry, noun_inflection))
|
by_mishkal.setdefault(mishkal, []).append((unique_key, entry, noun_inflection))
|
||||||
|
|
||||||
# Select exemplars per mishkal, preferring high-frequency words
|
# Select regular exemplars to achieve a 2:1 regular:irregular ratio.
|
||||||
per_mishkal = 6
|
# Distribute evenly across mishkal patterns, preferring high-frequency words.
|
||||||
|
irregular_count = len(irregulars)
|
||||||
|
target_regular = irregular_count * 2
|
||||||
|
mishkal_count = len(by_mishkal) or 1
|
||||||
|
per_mishkal = max(2, target_regular // mishkal_count)
|
||||||
|
|
||||||
selected: list[tuple[str, dict, dict]] = list(irregulars)
|
selected: list[tuple[str, dict, dict]] = list(irregulars)
|
||||||
|
regular_pool: list[tuple[str, dict, dict]] = []
|
||||||
for _mishkal, entries in sorted(by_mishkal.items()):
|
for _mishkal, entries in sorted(by_mishkal.items()):
|
||||||
entries.sort(key=lambda e: e[1].get("frequency") or 999_999)
|
entries.sort(key=lambda e: e[1].get("frequency") or 999_999)
|
||||||
selected.extend(entries[:per_mishkal])
|
regular_pool.extend(entries[:per_mishkal])
|
||||||
|
|
||||||
|
# If we overshot, trim to target (keeping highest-frequency across all mishkals)
|
||||||
|
if len(regular_pool) > target_regular:
|
||||||
|
regular_pool.sort(key=lambda e: e[1].get("frequency") or 999_999)
|
||||||
|
regular_pool = regular_pool[:target_regular]
|
||||||
|
|
||||||
|
selected.extend(regular_pool)
|
||||||
|
|
||||||
note_count = 0
|
note_count = 0
|
||||||
for _unique_key, entry, noun_inflection in selected:
|
for _unique_key, entry, noun_inflection in selected:
|
||||||
|
|
@ -1517,7 +1608,7 @@ def build_plural_deck(
|
||||||
plural_ktiv = noun_inflection["plural"].get("ktiv_male", "")
|
plural_ktiv = noun_inflection["plural"].get("ktiv_male", "")
|
||||||
gender = noun_inflection.get("gender") or ""
|
gender = noun_inflection.get("gender") or ""
|
||||||
gender_heb = {"masculine": "זָכָר", "feminine": "נְקֵבָה"}.get(gender, gender)
|
gender_heb = {"masculine": "זָכָר", "feminine": "נְקֵבָה"}.get(gender, gender)
|
||||||
mishkal = noun_inflection.get("mishkal") or ""
|
mishkal_heb = noun_inflection.get("mishkal_hebrew") or ""
|
||||||
meaning = EMOJI_RE.sub("", entry.get("meaning") or "").strip()
|
meaning = EMOJI_RE.sub("", entry.get("meaning") or "").strip()
|
||||||
root_list = entry.get("root") or []
|
root_list = entry.get("root") or []
|
||||||
root = ".".join(root_list)
|
root = ".".join(root_list)
|
||||||
|
|
@ -1537,9 +1628,10 @@ def build_plural_deck(
|
||||||
if mp3_path not in media_files:
|
if mp3_path not in media_files:
|
||||||
media_files.append(mp3_path)
|
media_files.append(mp3_path)
|
||||||
|
|
||||||
|
mishkal_eng = noun_inflection.get("mishkal") or ""
|
||||||
tags = [RELEASE_TAG]
|
tags = [RELEASE_TAG]
|
||||||
if mishkal:
|
if mishkal_eng:
|
||||||
tags.append(f"mishkal::{mishkal}")
|
tags.append(f"mishkal::{mishkal_eng}")
|
||||||
if _is_irregular_plural(gender, plural_ktiv):
|
if _is_irregular_plural(gender, plural_ktiv):
|
||||||
tags.append("irregular")
|
tags.append("irregular")
|
||||||
|
|
||||||
|
|
@ -1553,7 +1645,7 @@ def build_plural_deck(
|
||||||
pl_audio,
|
pl_audio,
|
||||||
meaning,
|
meaning,
|
||||||
root,
|
root,
|
||||||
mishkal,
|
mishkal_heb,
|
||||||
gender_heb,
|
gender_heb,
|
||||||
],
|
],
|
||||||
tags=tags,
|
tags=tags,
|
||||||
|
|
|
||||||
18348
data/words.json
18348
data/words.json
File diff suppressed because it is too large
Load diff
|
|
@ -29,7 +29,7 @@ WORDS_JSON = DATA_DIR / "words.json"
|
||||||
|
|
||||||
# Book metadata: filename -> display name
|
# Book metadata: filename -> display name
|
||||||
def _discover_epubs() -> dict[str, str]:
|
def _discover_epubs() -> dict[str, str]:
|
||||||
"""Auto-discover all .epub files in EPUB_DIR, returning {filepath: display_name}."""
|
"""Auto-discover all .epub and .txt files in EPUB_DIR, returning {filepath: display_name}."""
|
||||||
if not EPUB_DIR.exists():
|
if not EPUB_DIR.exists():
|
||||||
return {}
|
return {}
|
||||||
books: dict[str, str] = {}
|
books: dict[str, str] = {}
|
||||||
|
|
@ -50,6 +50,9 @@ def _discover_epubs() -> dict[str, str]:
|
||||||
else:
|
else:
|
||||||
name = stem_stripped[:40]
|
name = stem_stripped[:40]
|
||||||
books[str(path)] = name
|
books[str(path)] = name
|
||||||
|
# Also discover plain-text files (e.g. Ben Yehuda downloads)
|
||||||
|
for path in sorted(EPUB_DIR.glob("*.txt")):
|
||||||
|
books[str(path)] = path.stem
|
||||||
return books
|
return books
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -196,6 +199,20 @@ def extract_sentences_from_epub(epub_path: Path, book_name: str) -> list[dict]:
|
||||||
return _split_into_sentences(full_text, book_name)
|
return _split_into_sentences(full_text, book_name)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_sentences_from_text(text_path: Path, book_name: str) -> list[dict]:
    """Extract sentences from a plain-text file (e.g. Ben Yehuda downloads).

    The file is decoded as UTF-8. A leading byte-order mark, if the file has
    one, is dropped so it does not end up attached to the first word.

    Args:
        text_path: Path to the .txt file.
        book_name: Human-readable book name used as the ``source`` field.

    Returns:
        List of ``{"text": str, "source": str}`` dicts, as produced by
        ``_split_into_sentences``.
    """
    # "utf-8-sig" reads BOM-less UTF-8 unchanged and strips a leading U+FEFF
    # when present; plain "utf-8" would keep it as part of the text.
    full_text = text_path.read_text(encoding="utf-8-sig")
    return _split_into_sentences(full_text, book_name)
|
||||||
|
|
||||||
|
|
||||||
# ── Sentence splitting ───────────────────────────────────────────
|
# ── Sentence splitting ───────────────────────────────────────────
|
||||||
|
|
||||||
# Hebrew sentence terminators: period, exclamation, question mark, sof pasuk
|
# Hebrew sentence terminators: period, exclamation, question mark, sof pasuk
|
||||||
|
|
@ -480,7 +497,12 @@ def _build_nikkud_index(words: dict) -> dict[str, list[tuple[str, str]]]:
|
||||||
|
|
||||||
for field in ("singular", "plural", "construct_singular", "construct_plural"):
|
for field in ("singular", "plural", "construct_singular", "construct_plural"):
|
||||||
sub = noun.get(field) or {}
|
sub = noun.get(field) or {}
|
||||||
_add(sub.get("nikkud"), unique_key, "inflected")
|
form = sub.get("nikkud")
|
||||||
|
_add(form, unique_key, "inflected")
|
||||||
|
# Index construct forms without maqaf too — modern text often
|
||||||
|
# writes smichut as two space-separated words without maqaf
|
||||||
|
if form and form.endswith("־"):
|
||||||
|
_add(form[:-1], unique_key, "inflected")
|
||||||
|
|
||||||
pronominal = noun.get("pronominal_suffixes") or {}
|
pronominal = noun.get("pronominal_suffixes") or {}
|
||||||
for _person, sub in pronominal.items():
|
for _person, sub in pronominal.items():
|
||||||
|
|
@ -720,7 +742,10 @@ def run(words: dict) -> dict:
|
||||||
|
|
||||||
for filepath, book_name in _discover_epubs().items():
|
for filepath, book_name in _discover_epubs().items():
|
||||||
path = Path(filepath)
|
path = Path(filepath)
|
||||||
sentences = extract_sentences_from_epub(path, book_name)
|
if path.suffix == ".txt":
|
||||||
|
sentences = extract_sentences_from_text(path, book_name)
|
||||||
|
else:
|
||||||
|
sentences = extract_sentences_from_epub(path, book_name)
|
||||||
book_counts[book_name] = len(sentences)
|
book_counts[book_name] = len(sentences)
|
||||||
all_sentences.extend(sentences)
|
all_sentences.extend(sentences)
|
||||||
logger.info(f" {book_name}: {len(sentences)} sentences")
|
logger.info(f" {book_name}: {len(sentences)} sentences")
|
||||||
|
|
|
||||||
|
|
@ -459,15 +459,29 @@ def _parse_noun_gender_mishkal(soup: BeautifulSoup) -> tuple[str, str]:
|
||||||
"""
|
"""
|
||||||
Extract (gender, mishkal) from the PoS section of the detail page.
|
Extract (gender, mishkal) from the PoS section of the detail page.
|
||||||
Returns ("masculine"|"feminine"|"", mishkal_english|"").
|
Returns ("masculine"|"feminine"|"", mishkal_english|"").
|
||||||
|
|
||||||
|
Pealim HTML structure:
|
||||||
|
<p>Noun – <a href="/dict/?pos=noun&nm=qetel"><i>ketel</i> pattern</a>, masculine</p>
|
||||||
|
The mishkal is in the <i> tag (k-notation, e.g. "ketel") or the nm= URL param (q-notation).
|
||||||
|
Some nouns have no mishkal link: <p>Noun – masculine</p>
|
||||||
"""
|
"""
|
||||||
gender = ""
|
gender = ""
|
||||||
mishkal = ""
|
mishkal = ""
|
||||||
|
|
||||||
# Try various selectors that pealim uses for PoS info
|
# Find the PoS <p> tag — on pealim detail pages it's a bare <p> like
|
||||||
pos_section = soup.find("div", class_="pos") or soup.find("p", class_="pos")
|
# "Noun – ketel pattern, masculine" or "Adjective – katul pattern"
|
||||||
|
pos_section = None
|
||||||
|
for p in soup.find_all("p"):
|
||||||
|
text = p.get_text(" ", strip=True)
|
||||||
|
if re.match(r"^(Noun|Adjective)\b", text):
|
||||||
|
pos_section = p
|
||||||
|
break
|
||||||
|
|
||||||
|
# Fall back to older selectors (div.pos, p.pos, div.page-header)
|
||||||
if not pos_section:
|
if not pos_section:
|
||||||
# Look for it in the page header area
|
pos_section = (
|
||||||
pos_section = soup.find("div", class_="page-header")
|
soup.find("div", class_="pos") or soup.find("p", class_="pos") or soup.find("div", class_="page-header")
|
||||||
|
)
|
||||||
|
|
||||||
if pos_section:
|
if pos_section:
|
||||||
text = pos_section.get_text(" ", strip=True)
|
text = pos_section.get_text(" ", strip=True)
|
||||||
|
|
@ -476,13 +490,21 @@ def _parse_noun_gender_mishkal(soup: BeautifulSoup) -> tuple[str, str]:
|
||||||
if raw in text.lower():
|
if raw in text.lower():
|
||||||
gender = canonical
|
gender = canonical
|
||||||
break
|
break
|
||||||
# Mishkal detection: look for CaCaC-style patterns
|
|
||||||
mishkal_match = re.search(r"\b([A-Z][a-zA-Z\']+)\b", text)
|
# Mishkal detection: extract from <a href="...nm=XXXX"><i>YYYY</i> pattern</a>
|
||||||
if mishkal_match:
|
# Nouns use nm= param, adjectives use am= param
|
||||||
candidate = mishkal_match.group(1)
|
mishkal_link = pos_section.find("a", href=re.compile(r"[na]m="))
|
||||||
# Validate: mishkal names contain uppercase letters in CaCaC pattern
|
if mishkal_link:
|
||||||
if re.match(r"^[A-Za-z\']+$", candidate) and any(c.isupper() for c in candidate):
|
# Prefer <i> tag text (k-notation, matches _MISHKAL_HEBREW_Q after k→q)
|
||||||
mishkal = candidate
|
i_tag = mishkal_link.find("i")
|
||||||
|
if i_tag:
|
||||||
|
mishkal = i_tag.get_text(strip=True)
|
||||||
|
else:
|
||||||
|
# Fall back to nm= URL parameter (already q-notation)
|
||||||
|
href = mishkal_link.get("href", "")
|
||||||
|
nm_match = re.search(r"[na]m=([a-zA-Z']+)", href)
|
||||||
|
if nm_match:
|
||||||
|
mishkal = nm_match.group(1)
|
||||||
|
|
||||||
# Also check the og:description or breadcrumbs for gender
|
# Also check the og:description or breadcrumbs for gender
|
||||||
if not gender:
|
if not gender:
|
||||||
|
|
|
||||||
|
|
@ -685,6 +685,61 @@ def test_no_stripped_form_sentence_collisions(data: dict[str, Any]) -> None:
|
||||||
_pass(name)
|
_pass(name)
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_hebrew_in_meaning(data: dict[str, Any]) -> None:
    """Fail when an entry's English meaning still holds Hebrew after cleaning.

    Each entry's ``meaning`` is run through the Hebrew strip + whitespace
    collapse used for card meanings, then scanned for leftover Hebrew.

    Args:
        data: Mapping of entry key to entry dict; only ``meaning`` is read.

    Reports through ``_fail`` / ``_pass``. Without verbose mode at most 20
    errors are passed to ``_fail`` and the remainder is summarised on stdout.
    """
    name = "no_hebrew_in_meaning"
    errors: list[str] = []

    # Strip step (intended to mirror the meaning cleanup in apkg_builder —
    # confirm against that module). The former extra ``\u0591-\u05C7`` range
    # was a subset of ``\u0590-\u05FF`` and is dropped; matching is unchanged.
    strip_re = re.compile(r"[\u0590-\u05FF][\u0590-\u05FF\s\-]*")
    spaces_re = re.compile(r"\s{2,}")
    # Detection must be BROADER than the strip step, otherwise the check can
    # never fire: the strip already removes every U+0590-U+05FF code point.
    # Hebrew presentation forms (U+FB1D-U+FB4F) are not stripped, so they are
    # what this validation can actually catch.
    hebrew_re = re.compile(r"[\u0590-\u05FF\uFB1D-\uFB4F]")

    for key, entry in data.items():
        meaning = entry.get("meaning") or ""
        cleaned = strip_re.sub("", meaning)
        cleaned = spaces_re.sub(" ", cleaned).strip(", ;:")
        if hebrew_re.search(cleaned):
            errors.append(f"[{key}] meaning still contains Hebrew after cleaning: {cleaned!r}")

    if errors:
        _fail(name, errors[:20] if not _verbose else errors)
        if len(errors) > 20 and not _verbose:
            print(f" ... ({len(errors) - 20} more; use --verbose)")
    else:
        _pass(name)
|
||||||
|
|
||||||
|
|
||||||
|
def test_mishkal_consistency(data: dict[str, Any]) -> None:
    """Check that each stored ``mishkal_hebrew`` agrees with its ``mishkal``.

    For every entry's ``noun_inflection`` and ``adjective_inflection``:

    * when both fields are set, ``_mishkal_to_hebrew(mishkal)`` must equal
      ``mishkal_hebrew`` (conversions that return nothing are not compared);
    * ``mishkal_hebrew`` without ``mishkal`` is reported as an error.

    If ``pealim_detail_scrape`` cannot be imported the check only warns and
    returns. Without verbose mode at most 20 errors are passed to ``_fail``.
    """
    name = "mishkal_consistency"

    try:
        from pealim_detail_scrape import _mishkal_to_hebrew
    except ImportError:
        _warn(name, ["Could not import _mishkal_to_hebrew — skipping"])
        return

    errors: list[str] = []
    for key, entry in data.items():
        for infl_key in ("noun_inflection", "adjective_inflection"):
            infl = entry.get(infl_key)
            if not infl:
                continue
            english = infl.get("mishkal") or ""
            hebrew = infl.get("mishkal_hebrew") or ""
            if not hebrew:
                # Nothing to compare and nothing orphaned.
                continue
            if not english:
                errors.append(f"[{key}] {infl_key}: has mishkal_hebrew but no mishkal")
                continue
            expected = _mishkal_to_hebrew(english) or ""
            if expected and expected != hebrew:
                errors.append(f"[{key}] {infl_key}: {english}→{hebrew} (expected {expected})")

    if not errors:
        _pass(name)
        return

    shown = errors if _verbose else errors[:20]
    _fail(name, shown)
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||||
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Stats summary
|
# Stats summary
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -702,6 +757,11 @@ def print_stats(data: dict[str, Any]) -> None:
|
||||||
with_guid = sum(1 for e in data.values() if e.get("vocab_legacy_guid"))
|
with_guid = sum(1 for e in data.values() if e.get("vocab_legacy_guid"))
|
||||||
in_confusable = sum(1 for e in data.values() if e.get("confusable_group"))
|
in_confusable = sum(1 for e in data.values() if e.get("confusable_group"))
|
||||||
with_shared_roots = sum(1 for e in data.values() if e.get("shared_roots"))
|
with_shared_roots = sum(1 for e in data.values() if e.get("shared_roots"))
|
||||||
|
with_mishkal = sum(
|
||||||
|
1
|
||||||
|
for e in data.values()
|
||||||
|
if (e.get("noun_inflection") or {}).get("mishkal") or (e.get("adjective_inflection") or {}).get("mishkal")
|
||||||
|
)
|
||||||
|
|
||||||
print()
|
print()
|
||||||
print("Stats Summary")
|
print("Stats Summary")
|
||||||
|
|
@ -709,6 +769,7 @@ def print_stats(data: dict[str, Any]) -> None:
|
||||||
print(f" Total entries: {total:>6}")
|
print(f" Total entries: {total:>6}")
|
||||||
print(f" With conjugation data: {with_conj:>6}")
|
print(f" With conjugation data: {with_conj:>6}")
|
||||||
print(f" With noun_inflection: {with_noun_inf:>6}")
|
print(f" With noun_inflection: {with_noun_inf:>6}")
|
||||||
|
print(f" With mishkal: {with_mishkal:>6}")
|
||||||
print(f" With vetted examples: {with_vetted:>6}")
|
print(f" With vetted examples: {with_vetted:>6}")
|
||||||
print(f" With cloze examples: {with_cloze:>6}")
|
print(f" With cloze examples: {with_cloze:>6}")
|
||||||
print(f" With images: {with_image:>6}")
|
print(f" With images: {with_image:>6}")
|
||||||
|
|
@ -740,6 +801,8 @@ ALL_TESTS: dict[str, Any] = {
|
||||||
"conjugation_form_guids": test_conjugation_form_guids,
|
"conjugation_form_guids": test_conjugation_form_guids,
|
||||||
"conjugation_person_codes": test_conjugation_person_codes,
|
"conjugation_person_codes": test_conjugation_person_codes,
|
||||||
"no_stripped_form_sentence_collisions": test_no_stripped_form_sentence_collisions,
|
"no_stripped_form_sentence_collisions": test_no_stripped_form_sentence_collisions,
|
||||||
|
"no_hebrew_in_meaning": test_no_hebrew_in_meaning,
|
||||||
|
"mishkal_consistency": test_mishkal_consistency,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
246
tests/test_apkg_builder.py
Normal file
246
tests/test_apkg_builder.py
Normal file
|
|
@ -0,0 +1,246 @@
|
||||||
|
"""Unit tests for apkg_builder — Sprint 15 learnings.
|
||||||
|
|
||||||
|
Tests cover: cloze prefix preservation, Hebrew spoiler stripping from English
|
||||||
|
meanings, PoS exact matching, gender field population, and mishkal data integrity.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
# Ensure project root is on path
|
||||||
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||||
|
|
||||||
|
from apkg_builder import _categorize_pos, _cloze_prefix_len
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Cloze prefix preservation
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestClozePrefix:
    """Checks on the prefix length reported by ``_cloze_prefix_len(token, word)``."""

    def test_single_prefix_bet(self):
        # בַּתּוֹר = bet + patach + tor
        prefix_len = _cloze_prefix_len("בַּתּוֹר", "תּוֹר")
        assert prefix_len > 0

    def test_single_prefix_lamed(self):
        # לַמֶּלֶךְ = lamed + patach + melech
        prefix_len = _cloze_prefix_len("לַמֶּלֶךְ", "מֶּלֶךְ")
        assert prefix_len > 0

    def test_two_consonant_prefix(self):
        # שֶׁבַּתּוֹר = shin + bet + tor (two prefix letters)
        token, word = "שֶׁבַּתּוֹר", "תּוֹר"
        prefix_len = _cloze_prefix_len(token, word)
        assert prefix_len > 0
        # What remains after the reported prefix must begin with the word.
        remainder = token[prefix_len:]
        assert remainder.startswith(word)

    def test_no_prefix_direct_match(self):
        # Token and word are identical, so there is nothing in front.
        word = "תּוֹר"
        assert _cloze_prefix_len(word, word) == 0

    def test_empty_inputs(self):
        # An empty token, an empty word, or both all yield 0.
        for token, word in (("", "תּוֹר"), ("בַּתּוֹר", ""), ("", "")):
            assert _cloze_prefix_len(token, word) == 0

    def test_non_prefix_letter_returns_zero(self):
        # If the "prefix" chars aren't valid prefix letters, return 0
        # 'ת' is not in _PREFIX_LETTERS (בהוכלמש)
        prefix_len = _cloze_prefix_len("תַּתּוֹר", "תּוֹר")
        assert prefix_len == 0

    def test_prefix_preserves_nikkud(self):
        # The reported prefix may carry nikkud marks, but its only base
        # letter must be the bet.
        token, word = "בַּתּוֹר", "תּוֹר"
        prefix = token[: _cloze_prefix_len(token, word)]
        base_letters = []
        for ch in prefix:
            if "\u05d0" <= ch <= "\u05ea":
                base_letters.append(ch)
        assert base_letters == ["ב"]
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# PoS exact matching (no substring collisions)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestCategorizePos:
    """``_categorize_pos`` must bucket by the actual part of speech, so that
    e.g. 'Pronoun' is not mistaken for 'Noun'."""

    def test_noun_exact(self):
        category = _categorize_pos("Noun")
        assert category == "Noun"

    def test_pronoun_is_other(self):
        # Contains the substring "noun" but must not land in the Noun bucket.
        category = _categorize_pos("Pronoun")
        assert category == "Other"

    def test_verb_exact(self):
        category = _categorize_pos("Verb")
        assert category == "Verb"

    def test_noun_with_dash(self):
        # A qualifier after the dash does not change the bucket.
        category = _categorize_pos("Noun – masculine")
        assert category == "Noun"

    def test_adjective(self):
        category = _categorize_pos("Adjective")
        assert category == "Adjective"

    def test_conjunction_is_other(self):
        category = _categorize_pos("Conjunction")
        assert category == "Other"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Hebrew spoiler stripping from English meanings
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestHebrewSpoilerStripping:
    """English meanings must not contain Hebrew text (spoils the card)."""

    # Strip pattern; meant to be the same regex apkg_builder.py applies to
    # meanings (kept byte-identical here — confirm against that module).
    HEBREW_STRIP_RE = re.compile(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*")

    # Residual-Hebrew detector. It has to be broader than the strip pattern:
    # the strip removes every U+0590-U+05FF code point, so a detector limited
    # to that block could never match. Presentation forms (U+FB1D-U+FB4F) are
    # not stripped and are therefore what the scan can actually catch.
    HEBREW_DETECT_RE = re.compile(r"[\u0590-\u05FF\uFB1D-\uFB4F]")

    @staticmethod
    def _strip_hebrew(meaning: str) -> str:
        """Replicate the meaning cleaning pipeline from build_vocab_deck.

        Removes Hebrew runs, repairs punctuation left dangling by the
        removal, collapses repeated whitespace and trims separator characters.
        """
        # Use the shared class-level pattern instead of re-spelling it inline.
        meaning = TestHebrewSpoilerStripping.HEBREW_STRIP_RE.sub("", meaning)
        meaning = re.sub(r"[;:]\s*—", " —", meaning)
        meaning = re.sub(r";\s*:", ";", meaning)
        return re.sub(r"\s{2,}", " ", meaning).strip(", ;:")

    def test_pure_english_unchanged(self):
        assert self._strip_hebrew("to eat, to consume") == "to eat, to consume"

    def test_hebrew_word_removed(self):
        result = self._strip_hebrew("to eat; אכל")
        assert "אכל" not in result

    def test_hebrew_with_nikkud_removed(self):
        result = self._strip_hebrew("tall; גָּבוֹהַּ")
        assert "גָּבוֹהַּ" not in result
        assert "tall" in result

    def test_no_residual_hebrew_in_real_data(self):
        """Scan actual words.json — no meaning should contain Hebrew after stripping."""
        words_path = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")

        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)

        spoilers = []
        for key, entry in words.items():
            meaning = entry.get("meaning") or ""
            cleaned = self._strip_hebrew(meaning)
            if self.HEBREW_DETECT_RE.search(cleaned):
                spoilers.append(f"{key}: {cleaned!r}")

        assert not spoilers, f"Hebrew found in {len(spoilers)} meanings after stripping: {spoilers[:5]}"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Gender field for nouns (words.json data integrity)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestGenderDataIntegrity:
    """Data check: noun entries that carry ``noun_inflection`` should name a gender."""

    @pytest.fixture()
    def words(self):
        """Load ``data/words.json`` from the project root, or skip if absent."""
        data_file = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not data_file.exists():
            pytest.skip("words.json not available")
        with open(data_file, encoding="utf-8") as handle:
            return json.load(handle)

    def test_nouns_have_gender(self, words):
        """Fewer than 7% of inflected nouns may lack a recognised gender."""
        valid_genders = ("masculine", "feminine", "masculine and feminine")
        noun_count = 0
        missing = []

        # One pass: count the inflected nouns and collect the offenders.
        for key, entry in words.items():
            if not (entry.get("pos") or "").startswith("Noun"):
                continue
            inflection = entry.get("noun_inflection")
            if not inflection:
                continue
            noun_count += 1
            gender = inflection.get("gender") or ""
            if gender not in valid_genders:
                missing.append(f"{key}: gender={gender!r}")

        # Allow up to 7% missing (loan words, compound words, etc.)
        if noun_count > 0:
            pct_missing = len(missing) / noun_count
            assert pct_missing < 0.07, f"{len(missing)}/{noun_count} nouns missing gender: {missing[:10]}"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Mishkal data integrity
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
class TestMishkalIntegrity:
    """Consistency checks on the ``mishkal`` / ``mishkal_hebrew`` fields in words.json."""

    @pytest.fixture()
    def words(self):
        """Load ``data/words.json`` from the project root, or skip if absent."""
        data_file = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not data_file.exists():
            pytest.skip("words.json not available")
        with open(data_file, encoding="utf-8") as handle:
            return json.load(handle)

    @staticmethod
    def _inflections(words):
        """Yield ``(key, inflection)`` for every non-empty noun/adjective inflection."""
        for key, entry in words.items():
            for infl_key in ("noun_inflection", "adjective_inflection"):
                infl = entry.get(infl_key)
                if infl:
                    yield key, infl

    def test_mishkal_hebrew_matches_english(self, words):
        """When both fields are set, ``mishkal_hebrew`` must equal ``_mishkal_to_hebrew(mishkal)``."""
        from pealim_detail_scrape import _mishkal_to_hebrew

        mismatches = []
        for key, infl in self._inflections(words):
            mishkal_eng = infl.get("mishkal") or ""
            mishkal_heb = infl.get("mishkal_hebrew") or ""
            if not (mishkal_eng and mishkal_heb):
                continue
            expected = _mishkal_to_hebrew(mishkal_eng) or ""
            # A conversion that yields nothing is not treated as a mismatch.
            if expected and expected != mishkal_heb:
                mismatches.append(f"{key}: {mishkal_eng}→{mishkal_heb} (expected {expected})")

        assert not mismatches, f"{len(mismatches)} mishkal mismatches: {mismatches[:10]}"

    def test_mishkal_hebrew_is_hebrew(self, words):
        """A non-empty ``mishkal_hebrew`` must contain at least one Hebrew letter."""
        hebrew_re = re.compile(r"[\u05D0-\u05EA]")
        bad = []
        for key, infl in self._inflections(words):
            mishkal_heb = infl.get("mishkal_hebrew") or ""
            if mishkal_heb and not hebrew_re.search(mishkal_heb):
                bad.append(f"{key}: mishkal_hebrew={mishkal_heb!r}")

        assert not bad, f"{len(bad)} non-Hebrew mishkal_hebrew values: {bad[:10]}"

    def test_no_orphaned_mishkal(self, words):
        """``mishkal_hebrew`` may only be set when ``mishkal`` (English) is set too."""
        orphans = []
        for key, infl in self._inflections(words):
            mishkal_heb = infl.get("mishkal_hebrew") or ""
            mishkal_eng = infl.get("mishkal") or ""
            if mishkal_heb and not mishkal_eng:
                orphans.append(f"{key}: has mishkal_hebrew but no mishkal")

        assert not orphans, f"{len(orphans)} orphaned mishkal_hebrew: {orphans[:10]}"
|
||||||
Loading…
Reference in a new issue