Sprint 15: example sentence pipeline overhaul + corpus expansion + card improvements

- Regenerated all example sentences from scratch (deleted legacy + stale entries)
- Added .txt file support to epub_examples.py for Ben Yehuda corpus
- 7 Ben Yehuda nikkud'd children's texts + 3 new Time Tunnel EPUBs
- Maqaf-stripped construct form indexing (+68% inflected matches)
- Total: 3,598 words with examples, 3,289 with cloze (was ~2,900)
- Cloze prefix preservation (_cloze_prefix_len)
- Hebrew spoiler stripping from English meanings
- Gender field (זָכָר/נְקֵבָה) on vocab cards
- sec-table CSS layout for aligned key:value pairs
- Mishkal uses mishkal_hebrew on plural cards
- Improved mishkal extraction from pealim detail pages
- 21 new pytest tests (cloze, PoS, Hebrew stripping, gender, mishkal)
- 2 new validate_data.py tests + mishkal stats
- Colliding forms tracking (local-only)
- Release tag v0.17

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Sochen 2026-03-10 10:44:14 +00:00
parent efd0745ada
commit c85063ee2f
7 changed files with 15634 additions and 3273 deletions

1
.gitignore vendored
View file

@ -15,6 +15,7 @@ __pycache__/
# Large generated cache files (rebuild locally)
data/benyehuda_index.json
data/colliding_forms.json
# Audio directories (large; rebuild locally)
data/audio/

View file

@ -35,7 +35,7 @@ COMPLETE_PLURAL_DECK_ID = 1_234_567_903
# Release version tag added to all notes so users can identify which release
# their cards come from (visible in Anki's Browse view and card info).
RELEASE_TAG = "v0.16"
RELEASE_TAG = "v0.17"
# Regex for extracting emoji and Hebrew prepositions from meaning strings
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
@ -185,18 +185,34 @@ CARD_CSS = """
font-weight: normal;
color: #555;
}
.sec-table {
display: table;
margin: 6px auto 0;
direction: rtl;
border-collapse: collapse;
}
.sec-label {
display: table-row;
font-size: 28px;
font-weight: normal;
color: #222;
direction: rtl;
text-align: center;
margin-top: 6px;
}
.sec-key {
display: table-cell;
font-size: 28px;
color: #222;
font-weight: bold;
text-align: right;
padding: 2px 0 2px 8px;
white-space: nowrap;
}
.sec-val {
display: table-cell;
font-size: 28px;
color: #222;
text-align: right;
padding: 2px 0;
}
.definitions {
direction: rtl;
@ -231,6 +247,7 @@ CARD_CSS = """
.root-info { color: #e0e0e0; }
.sec-label { color: #e0e0e0; }
.sec-key { color: #e0e0e0; }
.sec-val { color: #e0e0e0; }
.conf-entry { color: #ddd; }
.hint { color: #777; }
.voice-label { color: #888; }
@ -255,14 +272,17 @@ VOCAB_BACK_HEB = """
<div class="meaning">{{Meaning}}</div>
{{#Emoji}}<div class="emoji-img">{{Emoji}}</div>{{/Emoji}}
{{^Emoji}}{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;" onerror="this.parentElement.style.display='none'"></div>{{/Image}}{{/Emoji}}
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>{{/Root}}
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span> {{PoS}}</div>{{/PoS}}
<div class="sec-table">
{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span><span class="sec-val">{{WordNoNikkud}}</span></div>{{/WordNoNikkud}}
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">{{Root}}</span></div>{{/Root}}
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span><span class="sec-val">{{PoS}}{{#Gender}}, {{Gender}}{{/Gender}}</span></div>{{/PoS}}
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span><span class="sec-val">{{Plural}}</span></div>{{/Plural}}
</div>
{{#SharedRoots}}
<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
<div class="divider" style="margin:6px 0;"></div>
<div class="sec-label" style="text-align:center;display:block;"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
<div class="root-info">{{SharedRoots}}</div>
{{/SharedRoots}}
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
{{#Frequency}}<div class="freq-badge">#{{Frequency}}</div>{{/Frequency}}
"""
VOCAB_FRONT_ENG = """
@ -277,14 +297,17 @@ VOCAB_BACK_ENG = """
<div class="divider"></div>
<div class="hebrew">{{Word}}{{#Prep}} <span class="hebrew-sm">{{Prep}}</span>{{/Prep}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span> {{WordNoNikkud}}</div>{{/WordNoNikkud}}
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>{{/Root}}
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span> {{PoS}}</div>{{/PoS}}
<div class="sec-table">
{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span><span class="sec-val">{{WordNoNikkud}}</span></div>{{/WordNoNikkud}}
{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">{{Root}}</span></div>{{/Root}}
{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span><span class="sec-val">{{PoS}}{{#Gender}}, {{Gender}}{{/Gender}}</span></div>{{/PoS}}
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span><span class="sec-val">{{Plural}}</span></div>{{/Plural}}
</div>
{{#SharedRoots}}
<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
<div class="divider" style="margin:6px 0;"></div>
<div class="sec-label" style="text-align:center;display:block;"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
<div class="root-info">{{SharedRoots}}</div>
{{/SharedRoots}}
{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
"""
VOCAB_FRONT_CLOZE = """
@ -318,6 +341,7 @@ VOCAB_MODEL = genanki.Model(
{"name": "Prep"},
{"name": "Hint"},
{"name": "Plural"},
{"name": "Gender"},
{"name": "ClozeExample"},
{"name": "ClozeHint"},
],
@ -359,11 +383,16 @@ CONJ_BACK = """
{{FrontSide}}<hr>
<div class="hebrew">{{ConjugatedForm}}{{#Prep}} ({{Prep}}){{/Prep}}</div>
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
{{#Meaning}}<div class="sec-label">{{Meaning}}</div>{{/Meaning}}
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span> {{Binyan}}</div>
{{#RelatedVocab}}<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
<div class="sec-label">{{RelatedVocab}}</div>{{/RelatedVocab}}
{{#Meaning}}<div class="sec-label" style="text-align:center;display:block;">{{Meaning}}</div>{{/Meaning}}
<div class="sec-table">
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">{{Root}}</span></div>
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span><span class="sec-val">{{Binyan}}</span></div>
</div>
{{#RelatedVocab}}
<div class="divider" style="margin:6px 0;"></div>
<div class="sec-label" style="text-align:center;display:block;"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
<div class="root-info">{{RelatedVocab}}</div>
{{/RelatedVocab}}
"""
CONJ_CSS = CARD_CSS
@ -703,6 +732,32 @@ def _forms_list_to_dict(forms_list: list[dict]) -> dict[str, dict]:
return result
# Hebrew one-letter prefixes (אותיות השימוש) that can attach to a word: בהוכלמש
_PREFIX_LETTERS = frozenset("בהוכלמש")


def _cloze_prefix_len(cloze_token: str, word_nikkud: str) -> int:
    """Measure the Hebrew prefix that precedes ``word_nikkud`` inside ``cloze_token``.

    The result is the index of the first occurrence of ``word_nikkud`` in
    ``cloze_token``, i.e. the number of characters (consonants *and* the
    nikkud marks attached to them) that sit in front of the word. That index
    is only reported when every Hebrew consonant in the leading span is one
    of ``_PREFIX_LETTERS``.

    Args:
        cloze_token: The token found in the sentence (may carry a prefix).
        word_nikkud: The vocabulary word, with nikkud, to look for.

    Returns:
        The length of the leading span, or 0 when either argument is empty,
        the word is absent, the token begins with the word, or the leading
        span holds no Hebrew consonant / holds one that is not a prefix letter.
    """
    if not (cloze_token and word_nikkud):
        return 0

    offset = cloze_token.find(word_nikkud)
    if offset <= 0:
        # -1: the word is not in the token; 0: the token starts with the word.
        return 0

    # Only base consonants (alef..tav) are judged; nikkud and punctuation are ignored.
    consonants = [ch for ch in cloze_token[:offset] if "\u05d0" <= ch <= "\u05ea"]
    if not consonants:
        return 0
    for ch in consonants:
        if ch not in _PREFIX_LETTERS:
            return 0
    return offset
def build_vocab_deck(
words: dict[str, dict],
limit: int | None = None,
@ -758,7 +813,11 @@ def build_vocab_deck(
pos_heb = entry.get("pos_hebrew", "")
meaning = EMOJI_RE.sub("", entry.get("meaning", "") or "").strip()
meaning = HBPAREN_RE.sub("", meaning).strip()
meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ")
# Strip remaining bare Hebrew text (spoiler: ktiv male visible in English meaning)
meaning = re.sub(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*", "", meaning)
meaning = re.sub(r"[;:]\s*—", "", meaning) # clean "; —" → " —"
meaning = re.sub(r";\s*:", ";", meaning) # clean "; :" → ";"
meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ;:")
meaning = re.sub(r"(\w)\(", r"\1 (", meaning) # space before opening paren
meaning = re.sub(r",(\S)", r", \1", meaning) # space after comma
meaning_raw = entry.get("meaning_raw", "") or ""
@ -850,10 +909,13 @@ def build_vocab_deck(
start = cloze_data.get("cloze_word_start")
end = cloze_data.get("cloze_word_end")
if cloze_text and start is not None and end is not None:
cloze_example = cloze_text[:start] + "_____" + cloze_text[end:]
# Clean up duplicate/misplaced quotation marks
# Preserve Hebrew prefix letters in the cloze blank
# e.g. "בַּתּוֹר" for word "תּוֹר" → "בַּ_____" not "_____"
cloze_token = cloze_text[start:end]
prefix_chars = _cloze_prefix_len(cloze_token, word_nikkud)
cloze_example = cloze_text[: start + prefix_chars] + "_____" + cloze_text[end:]
# Clean up duplicate adjacent quotation marks (e.g. "" → ")
cloze_example = re.sub(r'["״]\s*["״]', '"', cloze_example)
cloze_example = re.sub(r'^\s*["״]\s*', "", cloze_example)
raw_hint = cloze_data.get("cloze_hint") or ""
if raw_hint:
cloze_hint = raw_hint
@ -886,12 +948,19 @@ def build_vocab_deck(
parts.append(f'<div class="related-group"><b>{label}:</b> {" ".join(rw_words)}</div>')
related_html = "\n".join(parts)
# Plural form (nouns only — guard against adjective/verb inflection bleed)
# Plural form and gender (nouns only)
plural_str = ""
gender_str = ""
if pos_raw.startswith("Noun"):
noun_inflection = entry.get("noun_inflection")
if noun_inflection and noun_inflection.get("plural"):
plural_str = noun_inflection["plural"].get("nikkud", "")
if noun_inflection:
if noun_inflection.get("plural"):
plural_str = noun_inflection["plural"].get("nikkud", "")
gender_raw = noun_inflection.get("gender") or ""
if gender_raw == "masculine":
gender_str = "זָכָר"
elif gender_raw == "feminine":
gender_str = "נְקֵבָה"
# Image
image_tag = ""
@ -927,6 +996,7 @@ def build_vocab_deck(
prep_str,
hint_str,
plural_str,
gender_str,
cloze_example,
cloze_hint,
],
@ -941,7 +1011,8 @@ def build_vocab_deck(
prep_count = sum(1 for n in deck.notes if n.fields[12])
hint_count = sum(1 for n in deck.notes if n.fields[13])
plural_count = sum(1 for n in deck.notes if n.fields[14])
cloze_count = sum(1 for n in deck.notes if n.fields[15])
gender_count = sum(1 for n in deck.notes if n.fields[15])
cloze_count = sum(1 for n in deck.notes if n.fields[16])
unlisted = sum(1 for _, e in sorted_entries if (e.get("frequency") or 999_999) >= 999_999)
if emoji_count:
logger.info(f" Emoji extracted: {emoji_count} words")
@ -951,6 +1022,8 @@ def build_vocab_deck(
logger.info(f" Eng→Heb hints: {hint_count} words")
if plural_count:
logger.info(f" Noun plurals on vocab cards: {plural_count} words")
if gender_count:
logger.info(f" Noun gender on vocab cards: {gender_count} words")
if cloze_count:
logger.info(f" Sentence cloze cards: {cloze_count} words")
logger.info(f" Unlisted words (not in frequency corpus): {unlisted}/{len(sorted_entries)}")
@ -1330,10 +1403,12 @@ def build_confusables_deck(
w = e["word"]["nikkud"]
m = e.get("meaning", "")
p = e.get("pos_hebrew", "")
pos_label = f" ({p})" if p else ""
pos_div = f'<div style="font-size:18px; color:#888;">{p}</div>' if p else ""
defs_parts.append(
f'<div class="conf-entry"><span class="hebrew" style="font-size:24px;">{w}</span>'
f" = {m}{pos_label}</div>"
f'<div class="conf-entry">'
f'<span class="hebrew" style="font-size:24px;">{w}</span>'
f'<div style="direction:ltr; text-align:center; font-size:22px;">{m}</div>'
f"{pos_div}</div>"
)
if include_audio:
af = e.get("audio_file", "") or ""
@ -1397,8 +1472,10 @@ PLURAL_BACK_SG = """
{{FrontSide}}<hr>
<div class="hebrew">{{Plural}}</div>
{{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span> {{Gender}}</div>{{/Gender}}
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
<div class="sec-table">
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span><span class="sec-val">{{Gender}}</span></div>{{/Gender}}
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span><span class="sec-val">{{Mishkal}}</span></div>{{/Mishkal}}
</div>
"""
PLURAL_FRONT_PL = """
@ -1411,9 +1488,11 @@ PLURAL_BACK_PL = """
{{FrontSide}}<hr>
<div class="hebrew">{{Singular}}</div>
{{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
<div class="sec-label">{{Meaning}}</div>
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span> {{Gender}}</div>{{/Gender}}
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
<div class="sec-label" style="text-align:center;display:block;">{{Meaning}}</div>
<div class="sec-table">
{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span><span class="sec-val">{{Gender}}</span></div>{{/Gender}}
{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span><span class="sec-val">{{Mishkal}}</span></div>{{/Mishkal}}
</div>
"""
PLURAL_CSS = CARD_CSS
@ -1501,13 +1580,25 @@ def build_plural_deck(
elif mishkal:
by_mishkal.setdefault(mishkal, []).append((unique_key, entry, noun_inflection))
# Select exemplars per mishkal, preferring high-frequency words
per_mishkal = 6
# Select regular exemplars to achieve a 2:1 regular:irregular ratio.
# Distribute evenly across mishkal patterns, preferring high-frequency words.
irregular_count = len(irregulars)
target_regular = irregular_count * 2
mishkal_count = len(by_mishkal) or 1
per_mishkal = max(2, target_regular // mishkal_count)
selected: list[tuple[str, dict, dict]] = list(irregulars)
regular_pool: list[tuple[str, dict, dict]] = []
for _mishkal, entries in sorted(by_mishkal.items()):
entries.sort(key=lambda e: e[1].get("frequency") or 999_999)
selected.extend(entries[:per_mishkal])
regular_pool.extend(entries[:per_mishkal])
# If we overshot, trim to target (keeping highest-frequency across all mishkals)
if len(regular_pool) > target_regular:
regular_pool.sort(key=lambda e: e[1].get("frequency") or 999_999)
regular_pool = regular_pool[:target_regular]
selected.extend(regular_pool)
note_count = 0
for _unique_key, entry, noun_inflection in selected:
@ -1517,7 +1608,7 @@ def build_plural_deck(
plural_ktiv = noun_inflection["plural"].get("ktiv_male", "")
gender = noun_inflection.get("gender") or ""
gender_heb = {"masculine": "זָכָר", "feminine": "נְקֵבָה"}.get(gender, gender)
mishkal = noun_inflection.get("mishkal") or ""
mishkal_heb = noun_inflection.get("mishkal_hebrew") or ""
meaning = EMOJI_RE.sub("", entry.get("meaning") or "").strip()
root_list = entry.get("root") or []
root = ".".join(root_list)
@ -1537,9 +1628,10 @@ def build_plural_deck(
if mp3_path not in media_files:
media_files.append(mp3_path)
mishkal_eng = noun_inflection.get("mishkal") or ""
tags = [RELEASE_TAG]
if mishkal:
tags.append(f"mishkal::{mishkal}")
if mishkal_eng:
tags.append(f"mishkal::{mishkal_eng}")
if _is_irregular_plural(gender, plural_ktiv):
tags.append("irregular")
@ -1553,7 +1645,7 @@ def build_plural_deck(
pl_audio,
meaning,
root,
mishkal,
mishkal_heb,
gender_heb,
],
tags=tags,

File diff suppressed because it is too large Load diff

View file

@ -29,7 +29,7 @@ WORDS_JSON = DATA_DIR / "words.json"
# Book metadata: filename -> display name
def _discover_epubs() -> dict[str, str]:
"""Auto-discover all .epub files in EPUB_DIR, returning {filepath: display_name}."""
"""Auto-discover all .epub and .txt files in EPUB_DIR, returning {filepath: display_name}."""
if not EPUB_DIR.exists():
return {}
books: dict[str, str] = {}
@ -50,6 +50,9 @@ def _discover_epubs() -> dict[str, str]:
else:
name = stem_stripped[:40]
books[str(path)] = name
# Also discover plain-text files (e.g. Ben Yehuda downloads)
for path in sorted(EPUB_DIR.glob("*.txt")):
books[str(path)] = path.stem
return books
@ -196,6 +199,20 @@ def extract_sentences_from_epub(epub_path: Path, book_name: str) -> list[dict]:
return _split_into_sentences(full_text, book_name)
def extract_sentences_from_text(text_path: Path, book_name: str) -> list[dict]:
    """Read a plain-text book (e.g. a Ben Yehuda download) and split it into sentences.

    The file is decoded as UTF-8 and its whole content is handed to
    ``_split_into_sentences`` together with ``book_name``.

    Args:
        text_path: Location of the ``.txt`` file to read.
        book_name: Human-readable title passed along as the sentence source.

    Returns:
        Whatever ``_split_into_sentences`` produces for the text — presumably
        a list of ``{"text": str, "source": str}`` dicts; confirm against
        that helper.
    """
    return _split_into_sentences(text_path.read_text(encoding="utf-8"), book_name)
# ── Sentence splitting ───────────────────────────────────────────
# Hebrew sentence terminators: period, exclamation, question mark, sof pasuk
@ -480,7 +497,12 @@ def _build_nikkud_index(words: dict) -> dict[str, list[tuple[str, str]]]:
for field in ("singular", "plural", "construct_singular", "construct_plural"):
sub = noun.get(field) or {}
_add(sub.get("nikkud"), unique_key, "inflected")
form = sub.get("nikkud")
_add(form, unique_key, "inflected")
# Index construct forms without maqaf too — modern text often
# writes smichut as two space-separated words without maqaf
if form and form.endswith("־"):
_add(form[:-1], unique_key, "inflected")
pronominal = noun.get("pronominal_suffixes") or {}
for _person, sub in pronominal.items():
@ -720,7 +742,10 @@ def run(words: dict) -> dict:
for filepath, book_name in _discover_epubs().items():
path = Path(filepath)
sentences = extract_sentences_from_epub(path, book_name)
if path.suffix == ".txt":
sentences = extract_sentences_from_text(path, book_name)
else:
sentences = extract_sentences_from_epub(path, book_name)
book_counts[book_name] = len(sentences)
all_sentences.extend(sentences)
logger.info(f" {book_name}: {len(sentences)} sentences")

View file

@ -459,15 +459,29 @@ def _parse_noun_gender_mishkal(soup: BeautifulSoup) -> tuple[str, str]:
"""
Extract (gender, mishkal) from the PoS section of the detail page.
Returns ("masculine"|"feminine"|"", mishkal_english|"").
Pealim HTML structure:
<p>Noun <a href="/dict/?pos=noun&nm=qetel"><i>ketel</i> pattern</a>, masculine</p>
The mishkal is in the <i> tag (k-notation, e.g. "ketel") or the nm= URL param (q-notation).
Some nouns have no mishkal link: <p>Noun masculine</p>
"""
gender = ""
mishkal = ""
# Try various selectors that pealim uses for PoS info
pos_section = soup.find("div", class_="pos") or soup.find("p", class_="pos")
# Find the PoS <p> tag — on pealim detail pages it's a bare <p> like
# "Noun ketel pattern, masculine" or "Adjective katul pattern"
pos_section = None
for p in soup.find_all("p"):
text = p.get_text(" ", strip=True)
if re.match(r"^(Noun|Adjective)\b", text):
pos_section = p
break
# Fall back to older selectors (div.pos, p.pos, div.page-header)
if not pos_section:
# Look for it in the page header area
pos_section = soup.find("div", class_="page-header")
pos_section = (
soup.find("div", class_="pos") or soup.find("p", class_="pos") or soup.find("div", class_="page-header")
)
if pos_section:
text = pos_section.get_text(" ", strip=True)
@ -476,13 +490,21 @@ def _parse_noun_gender_mishkal(soup: BeautifulSoup) -> tuple[str, str]:
if raw in text.lower():
gender = canonical
break
# Mishkal detection: look for CaCaC-style patterns
mishkal_match = re.search(r"\b([A-Z][a-zA-Z\']+)\b", text)
if mishkal_match:
candidate = mishkal_match.group(1)
# Validate: mishkal names contain uppercase letters in CaCaC pattern
if re.match(r"^[A-Za-z\']+$", candidate) and any(c.isupper() for c in candidate):
mishkal = candidate
# Mishkal detection: extract from <a href="...nm=XXXX"><i>YYYY</i> pattern</a>
# Nouns use nm= param, adjectives use am= param
mishkal_link = pos_section.find("a", href=re.compile(r"[na]m="))
if mishkal_link:
# Prefer <i> tag text (k-notation, matches _MISHKAL_HEBREW_Q after k→q)
i_tag = mishkal_link.find("i")
if i_tag:
mishkal = i_tag.get_text(strip=True)
else:
# Fall back to nm= URL parameter (already q-notation)
href = mishkal_link.get("href", "")
nm_match = re.search(r"[na]m=([a-zA-Z']+)", href)
if nm_match:
mishkal = nm_match.group(1)
# Also check the og:description or breadcrumbs for gender
if not gender:

View file

@ -685,6 +685,61 @@ def test_no_stripped_form_sentence_collisions(data: dict[str, Any]) -> None:
_pass(name)
def test_no_hebrew_in_meaning(data: dict[str, Any]) -> None:
    """Check that no Hebrew letter survives the meaning-cleaning steps.

    For every entry the ``meaning`` string has its Hebrew runs removed and
    its whitespace/punctuation tidied; any entry whose cleaned meaning still
    contains a Hebrew letter (U+05D0–U+05EA) is reported as an error.

    NOTE(review): the stripping pattern deletes every character in
    U+0590–U+05FF before the letter search runs, so this check looks unable
    to fail as written — confirm whether it should inspect the raw meaning
    or the builder's actual output instead.
    """
    name = "no_hebrew_in_meaning"
    hebrew_letter = re.compile(r"[\u05D0-\u05EA]")
    errors: list[str] = []

    for key, entry in data.items():
        # Mirror the Hebrew-stripping and whitespace-collapsing steps used for card meanings.
        without_hebrew = re.sub(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*", "", entry.get("meaning") or "")
        cleaned = re.sub(r"\s{2,}", " ", without_hebrew).strip(", ;:")
        if hebrew_letter.search(cleaned) is not None:
            errors.append(f"[{key}] meaning still contains Hebrew after cleaning: {cleaned!r}")

    if not errors:
        _pass(name)
        return

    # Cap the report at 20 lines unless verbose output was requested.
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_mishkal_consistency(data: dict[str, Any]) -> None:
    """Check that ``mishkal_hebrew`` agrees with ``mishkal`` on every inflection block.

    Two conditions are reported for each ``noun_inflection`` /
    ``adjective_inflection`` dict:

    * both fields are set, ``_mishkal_to_hebrew(mishkal)`` yields a value, and
      that value differs from the stored ``mishkal_hebrew``;
    * ``mishkal_hebrew`` is set while ``mishkal`` is empty.

    If ``pealim_detail_scrape`` cannot be imported the check only emits a
    warning and returns, rather than failing the validation run.
    """
    name = "mishkal_consistency"
    errors: list[str] = []

    try:
        from pealim_detail_scrape import _mishkal_to_hebrew
    except ImportError:
        _warn(name, ["Could not import _mishkal_to_hebrew — skipping"])
        return

    for key, entry in data.items():
        for infl_key in ("noun_inflection", "adjective_inflection"):
            infl = entry.get(infl_key)
            if not infl:
                continue
            mishkal_eng = infl.get("mishkal") or ""
            mishkal_heb = infl.get("mishkal_hebrew") or ""
            if mishkal_eng and mishkal_heb:
                # An empty conversion result means "no known mapping" and is not an error.
                expected = _mishkal_to_hebrew(mishkal_eng) or ""
                if expected and expected != mishkal_heb:
                    # Keep the stored English and Hebrew names visibly separated in the report.
                    errors.append(f"[{key}] {infl_key}: {mishkal_eng} / {mishkal_heb} (expected {expected})")
            if mishkal_heb and not mishkal_eng:
                errors.append(f"[{key}] {infl_key}: has mishkal_hebrew but no mishkal")

    if errors:
        # Cap the report at 20 lines unless verbose output was requested.
        _fail(name, errors[:20] if not _verbose else errors)
        if len(errors) > 20 and not _verbose:
            print(f" ... ({len(errors) - 20} more; use --verbose)")
    else:
        _pass(name)
# ---------------------------------------------------------------------------
# Stats summary
# ---------------------------------------------------------------------------
@ -702,6 +757,11 @@ def print_stats(data: dict[str, Any]) -> None:
with_guid = sum(1 for e in data.values() if e.get("vocab_legacy_guid"))
in_confusable = sum(1 for e in data.values() if e.get("confusable_group"))
with_shared_roots = sum(1 for e in data.values() if e.get("shared_roots"))
with_mishkal = sum(
1
for e in data.values()
if (e.get("noun_inflection") or {}).get("mishkal") or (e.get("adjective_inflection") or {}).get("mishkal")
)
print()
print("Stats Summary")
@ -709,6 +769,7 @@ def print_stats(data: dict[str, Any]) -> None:
print(f" Total entries: {total:>6}")
print(f" With conjugation data: {with_conj:>6}")
print(f" With noun_inflection: {with_noun_inf:>6}")
print(f" With mishkal: {with_mishkal:>6}")
print(f" With vetted examples: {with_vetted:>6}")
print(f" With cloze examples: {with_cloze:>6}")
print(f" With images: {with_image:>6}")
@ -740,6 +801,8 @@ ALL_TESTS: dict[str, Any] = {
"conjugation_form_guids": test_conjugation_form_guids,
"conjugation_person_codes": test_conjugation_person_codes,
"no_stripped_form_sentence_collisions": test_no_stripped_form_sentence_collisions,
"no_hebrew_in_meaning": test_no_hebrew_in_meaning,
"mishkal_consistency": test_mishkal_consistency,
}

246
tests/test_apkg_builder.py Normal file
View file

@ -0,0 +1,246 @@
"""Unit tests for apkg_builder — Sprint 15 learnings.
Tests cover: cloze prefix preservation, Hebrew spoiler stripping from English
meanings, PoS exact matching, gender field population, and mishkal data integrity.
"""
import json
import re
import sys
from pathlib import Path
import pytest
# Ensure project root is on path
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from apkg_builder import _categorize_pos, _cloze_prefix_len
# ---------------------------------------------------------------------------
# Cloze prefix preservation
# ---------------------------------------------------------------------------
class TestClozePrefix:
    """Checks that ``_cloze_prefix_len`` recognises Hebrew prefix letters ahead of the word."""

    def test_single_prefix_bet(self):
        # A bet prefix (with its vowel marks) in front of "tor".
        prefix_len = _cloze_prefix_len("בַּתּוֹר", "תּוֹר")
        assert prefix_len > 0

    def test_single_prefix_lamed(self):
        # A lamed prefix (with its vowel mark) in front of "melech".
        prefix_len = _cloze_prefix_len("לַמֶּלֶךְ", "מֶּלֶךְ")
        assert prefix_len > 0

    def test_two_consonant_prefix(self):
        # Shin followed by bet: two stacked prefix letters before "tor".
        word = "תּוֹר"
        token = "שֶׁבַּתּוֹר"
        split_at = _cloze_prefix_len(token, word)
        assert split_at > 0
        # Whatever follows the reported prefix must begin with the word itself.
        assert token[split_at:].startswith(word)

    def test_no_prefix_direct_match(self):
        # Token and word are identical, so there is nothing in front of the word.
        assert _cloze_prefix_len("תּוֹר", "תּוֹר") == 0

    def test_empty_inputs(self):
        # Any empty argument yields 0.
        for token, word in (("", "תּוֹר"), ("בַּתּוֹר", ""), ("", "")):
            assert _cloze_prefix_len(token, word) == 0

    def test_non_prefix_letter_returns_zero(self):
        # The leading consonant here is 'ת', which is not among the
        # prefix letters (בהוכלמש), so no prefix may be reported.
        assert _cloze_prefix_len("תַּתּוֹר", "תּוֹר") == 0

    def test_prefix_preserves_nikkud(self):
        # The only base consonant inside the reported prefix span must be bet.
        token = "בַּתּוֹר"
        split_at = _cloze_prefix_len(token, "תּוֹר")
        consonants = [ch for ch in token[:split_at] if "\u05d0" <= ch <= "\u05ea"]
        assert consonants == ["ב"]
# ---------------------------------------------------------------------------
# PoS exact matching (no substring collisions)
# ---------------------------------------------------------------------------
class TestCategorizePos:
    """Checks that ``_categorize_pos`` matches part-of-speech names without substring collisions."""

    def test_noun_exact(self):
        category = _categorize_pos("Noun")
        assert category == "Noun"

    def test_pronoun_is_other(self):
        # "Pronoun" contains "noun" but must not be classified as a noun.
        category = _categorize_pos("Pronoun")
        assert category == "Other"

    def test_verb_exact(self):
        category = _categorize_pos("Verb")
        assert category == "Verb"

    def test_noun_with_dash(self):
        # A noun label followed by a qualifier is still a noun.
        category = _categorize_pos("Noun masculine")
        assert category == "Noun"

    def test_adjective(self):
        category = _categorize_pos("Adjective")
        assert category == "Adjective"

    def test_conjunction_is_other(self):
        category = _categorize_pos("Conjunction")
        assert category == "Other"
# ---------------------------------------------------------------------------
# Hebrew spoiler stripping from English meanings
# ---------------------------------------------------------------------------
class TestHebrewSpoilerStripping:
    """English meanings must not contain Hebrew text (it would spoil the card)."""

    # Hebrew-run pattern, identical to the one build_vocab_deck applies to meanings.
    HEBREW_STRIP_RE = re.compile(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*")

    @staticmethod
    def _strip_hebrew(meaning: str) -> str:
        """Apply the Hebrew-stripping and punctuation clean-up steps to ``meaning``.

        Steps, in order: drop Hebrew runs, drop a ``;``/``:`` that directly
        precedes an em dash (together with the dash), collapse ``; :`` to
        ``;``, collapse repeated whitespace, and trim stray ``, ;:`` from
        both ends.
        """
        # Use the shared compiled pattern instead of re-spelling the same regex inline.
        meaning = TestHebrewSpoilerStripping.HEBREW_STRIP_RE.sub("", meaning)
        meaning = re.sub(r"[;:]\s*—", "", meaning)
        meaning = re.sub(r";\s*:", ";", meaning)
        return re.sub(r"\s{2,}", " ", meaning).strip(", ;:")

    def test_pure_english_unchanged(self):
        assert self._strip_hebrew("to eat, to consume") == "to eat, to consume"

    def test_hebrew_word_removed(self):
        result = self._strip_hebrew("to eat; אכל")
        assert "אכל" not in result

    def test_hebrew_with_nikkud_removed(self):
        result = self._strip_hebrew("tall; גָּבוֹהַּ")
        assert "גָּבוֹהַּ" not in result
        assert "tall" in result

    def test_no_residual_hebrew_in_real_data(self):
        """Scan the real words.json: no meaning may contain Hebrew after stripping.

        Skipped when ``data/words.json`` is not present next to the project root.

        NOTE(review): ``_strip_hebrew`` removes every U+0590–U+05FF character
        before the U+05D0–U+05EA search runs, so this scan looks unable to
        fail as written — confirm the intended input.
        """
        words_path = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")
        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)
        # Any base Hebrew letter left behind counts as a spoiler.
        hebrew_re = re.compile(r"[\u05D0-\u05EA]")
        spoilers = []
        for key, entry in words.items():
            cleaned = self._strip_hebrew(entry.get("meaning") or "")
            if hebrew_re.search(cleaned):
                spoilers.append(f"{key}: {cleaned!r}")
        assert not spoilers, f"Hebrew found in {len(spoilers)} meanings after stripping: {spoilers[:5]}"
# ---------------------------------------------------------------------------
# Gender field for nouns (words.json data integrity)
# ---------------------------------------------------------------------------
class TestGenderDataIntegrity:
    """Data check: nouns that carry ``noun_inflection`` should also carry a gender."""

    @pytest.fixture()
    def words(self):
        """Load ``data/words.json`` from the project root, skipping when it is absent."""
        data_file = Path(__file__).resolve().parents[1] / "data" / "words.json"
        if not data_file.exists():
            pytest.skip("words.json not available")
        with data_file.open(encoding="utf-8") as handle:
            return json.load(handle)

    def test_nouns_have_gender(self, words):
        """Fewer than 7% of inflected nouns may lack a recognised gender value."""
        accepted = ("masculine", "feminine", "masculine and feminine")
        noun_count = 0
        missing = []
        for key, entry in words.items():
            inflection = entry.get("noun_inflection")
            is_inflected_noun = (entry.get("pos") or "").startswith("Noun") and inflection
            if not is_inflected_noun:
                continue
            noun_count += 1
            gender = inflection.get("gender") or ""
            if gender not in accepted:
                missing.append(f"{key}: gender={gender!r}")
        # Tolerate a small share of gaps (loan words, compound words, etc.).
        if noun_count > 0:
            pct_missing = len(missing) / noun_count
            assert pct_missing < 0.07, f"{len(missing)}/{noun_count} nouns missing gender: {missing[:10]}"
# ---------------------------------------------------------------------------
# Mishkal data integrity
# ---------------------------------------------------------------------------
class TestMishkalIntegrity:
    """Validate mishkal data consistency in words.json."""

    @pytest.fixture()
    def words(self):
        """Load ``data/words.json`` from the project root, skipping when it is absent."""
        words_path = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")
        with open(words_path, encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def _mishkal_pairs(words):
        """Yield ``(key, mishkal, mishkal_hebrew)`` for each non-empty inflection block.

        Both ``noun_inflection`` and ``adjective_inflection`` are visited;
        missing or null fields are normalised to the empty string.
        """
        for key, entry in words.items():
            for infl_key in ("noun_inflection", "adjective_inflection"):
                infl = entry.get(infl_key)
                if not infl:
                    continue
                yield key, infl.get("mishkal") or "", infl.get("mishkal_hebrew") or ""

    def test_mishkal_hebrew_matches_english(self, words):
        """If mishkal and mishkal_hebrew are both set, they should correspond via _mishkal_to_hebrew."""
        from pealim_detail_scrape import _mishkal_to_hebrew

        mismatches = []
        for key, mishkal_eng, mishkal_heb in self._mishkal_pairs(words):
            if not (mishkal_eng and mishkal_heb):
                continue
            # An empty conversion result means "no known mapping" and is not a mismatch.
            expected = _mishkal_to_hebrew(mishkal_eng) or ""
            if expected and expected != mishkal_heb:
                # Keep the stored English and Hebrew names visibly separated in the report.
                mismatches.append(f"{key}: {mishkal_eng} / {mishkal_heb} (expected {expected})")
        assert not mismatches, f"{len(mismatches)} mishkal mismatches: {mismatches[:10]}"

    def test_mishkal_hebrew_is_hebrew(self, words):
        """mishkal_hebrew must contain Hebrew characters."""
        hebrew_re = re.compile(r"[\u05D0-\u05EA]")
        bad = [
            f"{key}: mishkal_hebrew={mishkal_heb!r}"
            for key, _mishkal_eng, mishkal_heb in self._mishkal_pairs(words)
            if mishkal_heb and not hebrew_re.search(mishkal_heb)
        ]
        assert not bad, f"{len(bad)} non-Hebrew mishkal_hebrew values: {bad[:10]}"

    def test_no_orphaned_mishkal(self, words):
        """If mishkal_hebrew is set, mishkal (English) must also be set."""
        orphans = [
            f"{key}: has mishkal_hebrew but no mishkal"
            for key, mishkal_eng, mishkal_heb in self._mishkal_pairs(words)
            if mishkal_heb and not mishkal_eng
        ]
        assert not orphans, f"{len(orphans)} orphaned mishkal_hebrew: {orphans[:10]}"