- Regenerated all example sentences from scratch (deleted legacy + stale entries) - Added .txt file support to epub_examples.py for Ben Yehuda corpus - 7 Ben Yehuda nikkud'd children's texts + 3 new Time Tunnel EPUBs - Maqaf-stripped construct form indexing (+68% inflected matches) - Total: 3,598 words with examples, 3,289 with cloze (was ~2,900) - Cloze prefix preservation (_cloze_prefix_len) - Hebrew spoiler stripping from English meanings - Gender field (זָכָר/נְקֵבָה) on vocab cards - sec-table CSS layout for aligned key:value pairs - Mishkal uses mishkal_hebrew on plural cards - Improved mishkal extraction from pealim detail pages - 21 new pytest tests (cloze, PoS, Hebrew stripping, gender, mishkal) - 2 new validate_data.py tests + mishkal stats - Colliding forms tracking (local-only) - Release tag v0.17 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
246 lines
9.6 KiB
Python
246 lines
9.6 KiB
Python
"""Unit tests for apkg_builder — Sprint 15 learnings.
|
||
|
||
Tests cover: cloze prefix preservation, Hebrew spoiler stripping from English
|
||
meanings, PoS exact matching, gender field population, and mishkal data integrity.
|
||
"""
|
||
|
||
import json
|
||
import re
|
||
import sys
|
||
from pathlib import Path
|
||
|
||
import pytest
|
||
|
||
# Ensure project root is on path
|
||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||
|
||
from apkg_builder import _categorize_pos, _cloze_prefix_len
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Cloze prefix preservation
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestClozePrefix:
    """Checks on ``_cloze_prefix_len(token, word)``.

    Each test passes a sentence token and the dictionary word expected inside
    it, then inspects the returned length of the leading prefix (0 meaning
    "no prefix").
    """

    def test_single_prefix_bet(self):
        """A token with a bet prefix reports a non-zero prefix length."""
        # בַּתּוֹר = bet + patach + tor
        assert _cloze_prefix_len("בַּתּוֹר", "תּוֹר") > 0

    def test_single_prefix_lamed(self):
        """A token with a lamed prefix reports a non-zero prefix length."""
        # לַמֶּלֶךְ = lamed + patach + melech
        assert _cloze_prefix_len("לַמֶּלֶךְ", "מֶּלֶךְ") > 0

    def test_two_consonant_prefix(self):
        """Two stacked prefix letters are skipped so the remainder starts with the word."""
        # שֶׁבַּתּוֹר = shin + bet + tor (two prefix letters)
        token = "שֶׁבַּתּוֹר"
        word = "תּוֹר"
        prefix_len = _cloze_prefix_len(token, word)
        assert prefix_len > 0
        # Slicing at the reported length must land exactly on the word.
        assert token[prefix_len:].startswith(word)

    def test_no_prefix_direct_match(self):
        """When the token is the word itself, the prefix length is 0."""
        assert _cloze_prefix_len("תּוֹר", "תּוֹר") == 0

    def test_empty_inputs(self):
        """An empty token and/or an empty word yields 0."""
        assert _cloze_prefix_len("", "תּוֹר") == 0
        assert _cloze_prefix_len("בַּתּוֹר", "") == 0
        assert _cloze_prefix_len("", "") == 0

    def test_non_prefix_letter_returns_zero(self):
        """Leading characters that are not valid prefix letters yield 0."""
        # 'ת' is not in _PREFIX_LETTERS (בהוכלמש)
        assert _cloze_prefix_len("תַּתּוֹר", "תּוֹר") == 0

    def test_prefix_preserves_nikkud(self):
        """The reported prefix contains exactly one base letter: bet."""
        token = "בַּתּוֹר"
        word = "תּוֹר"
        prefix_len = _cloze_prefix_len(token, word)
        prefix = token[:prefix_len]
        # Keep only characters in alef..tav (U+05D0-U+05EA); anything else in
        # the prefix (e.g. nikkud marks) is ignored by this comparison.
        base_letters = [c for c in prefix if "\u05d0" <= c <= "\u05ea"]
        assert base_letters == ["ב"]
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# PoS exact matching (no substring collisions)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestCategorizePos:
    """Checks on ``_categorize_pos``: matching must be exact, not substring-based.

    In particular 'Pronoun' must not be categorised as 'Noun'.
    """

    def test_noun_exact(self):
        """A bare 'Noun' label maps to 'Noun'."""
        assert _categorize_pos("Noun") == "Noun"

    def test_pronoun_is_other(self):
        """'Pronoun' contains 'noun' as a substring but must map to 'Other'."""
        assert _categorize_pos("Pronoun") == "Other"

    def test_verb_exact(self):
        """A bare 'Verb' label maps to 'Verb'."""
        assert _categorize_pos("Verb") == "Verb"

    def test_noun_with_dash(self):
        """A 'Noun' label followed by an en-dash qualifier still maps to 'Noun'."""
        assert _categorize_pos("Noun – masculine") == "Noun"

    def test_adjective(self):
        """A bare 'Adjective' label maps to 'Adjective'."""
        assert _categorize_pos("Adjective") == "Adjective"

    def test_conjunction_is_other(self):
        """'Conjunction' maps to 'Other'."""
        assert _categorize_pos("Conjunction") == "Other"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Hebrew spoiler stripping from English meanings
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestHebrewSpoilerStripping:
    """English meanings must not contain Hebrew text (spoils the card)."""

    # Hand-copied from apkg_builder.py (it is not imported here, so the two
    # must be kept in sync manually). Matches one character in the Hebrew
    # block U+0590-U+05FF followed by any run of Hebrew-block characters,
    # whitespace, or hyphens.
    HEBREW_STRIP_RE = re.compile(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*")

    @staticmethod
    def _strip_hebrew(meaning: str) -> str:
        """Replicate the meaning cleaning pipeline from build_vocab_deck.

        Args:
            meaning: Raw meaning text, possibly containing Hebrew.

        Returns:
            The text with Hebrew runs removed, punctuation left dangling by
            the removal tidied up, whitespace runs collapsed to one space,
            and leading/trailing ``,``, space, ``;`` and ``:`` trimmed.
        """
        # Use the single class-level pattern so the helper and the constant
        # cannot drift apart.
        meaning = TestHebrewSpoilerStripping.HEBREW_STRIP_RE.sub("", meaning)
        # "; —" or ": —" left behind by a removed Hebrew run becomes " —".
        meaning = re.sub(r"[;:]\s*—", " —", meaning)
        # ";  :" left behind by a removed Hebrew run becomes ";".
        meaning = re.sub(r";\s*:", ";", meaning)
        return re.sub(r"\s{2,}", " ", meaning).strip(", ;:")

    def test_pure_english_unchanged(self):
        """Text without Hebrew passes through untouched."""
        assert self._strip_hebrew("to eat, to consume") == "to eat, to consume"

    def test_hebrew_word_removed(self):
        """An unpointed Hebrew word is removed."""
        result = self._strip_hebrew("to eat; אכל")
        assert "אכל" not in result

    def test_hebrew_with_nikkud_removed(self):
        """A pointed Hebrew word is removed and the English part is kept."""
        result = self._strip_hebrew("tall; גָּבוֹהַּ")
        assert "גָּבוֹהַּ" not in result
        assert "tall" in result

    def test_no_residual_hebrew_in_real_data(self):
        """Scan actual words.json — no meaning should contain Hebrew after stripping."""
        words_path = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")

        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)

        # Detects any Hebrew base letter, alef..tav (U+05D0-U+05EA). This is a
        # residual check, not the stripping pattern itself.
        # NOTE(review): HEBREW_STRIP_RE already removes every U+0590-U+05FF
        # character, so this check cannot fire by construction. Neither pattern
        # covers the Hebrew presentation forms (U+FB1D-U+FB4F) — confirm
        # whether those can appear in the data.
        hebrew_re = re.compile(r"[\u05D0-\u05EA]")
        spoilers = []
        for key, entry in words.items():
            meaning = entry.get("meaning") or ""
            cleaned = self._strip_hebrew(meaning)
            if hebrew_re.search(cleaned):
                spoilers.append(f"{key}: {cleaned!r}")

        assert not spoilers, f"Hebrew found in {len(spoilers)} meanings after stripping: {spoilers[:5]}"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Gender field for nouns (words.json data integrity)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestGenderDataIntegrity:
    """Nouns with noun_inflection should have gender populated."""

    @pytest.fixture()
    def words(self):
        """Load ``data/words.json`` from the parent of this file's directory.

        Skips the requesting test when the file does not exist.
        """
        words_path = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")
        with open(words_path, encoding="utf-8") as f:
            return json.load(f)

    def test_nouns_have_gender(self, words):
        """Nouns with noun_inflection should have a valid gender.

        An entry counts as a noun when its ``pos`` starts with ``"Noun"`` and
        it has a non-empty ``noun_inflection``. The test passes when fewer
        than 7% of those entries lack a recognised gender (loan words,
        compound words, etc.), and trivially when there are no such entries.
        """
        valid_genders = ("masculine", "feminine", "masculine and feminine")

        # Single pass: count the nouns and collect the offenders together so
        # the selection rule is written only once.
        noun_count = 0
        missing = []
        for key, entry in words.items():
            pos = entry.get("pos") or ""
            ni = entry.get("noun_inflection")
            if not (pos.startswith("Noun") and ni):
                continue
            noun_count += 1
            gender = ni.get("gender") or ""
            if gender not in valid_genders:
                missing.append(f"{key}: gender={gender!r}")

        if noun_count > 0:
            pct_missing = len(missing) / noun_count
            assert pct_missing < 0.07, f"{len(missing)}/{noun_count} nouns missing gender: {missing[:10]}"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Mishkal data integrity
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
class TestMishkalIntegrity:
    """Validate mishkal data consistency in words.json."""

    # Entry fields that may carry mishkal information.
    _INFLECTION_KEYS = ("noun_inflection", "adjective_inflection")

    @pytest.fixture()
    def words(self):
        """Load ``data/words.json`` from the parent of this file's directory.

        Skips the requesting test when the file does not exist.
        """
        words_path = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")
        with open(words_path, encoding="utf-8") as f:
            return json.load(f)

    @staticmethod
    def _iter_inflections(words):
        """Yield ``(key, inflection)`` for every non-empty noun/adjective inflection."""
        for key, entry in words.items():
            for infl_key in TestMishkalIntegrity._INFLECTION_KEYS:
                infl = entry.get(infl_key)
                if infl:
                    yield key, infl

    def test_mishkal_hebrew_matches_english(self, words):
        """If mishkal and mishkal_hebrew are both set, they should correspond via _mishkal_to_hebrew."""
        # Imported lazily so the other tests in this class do not depend on
        # the scraper module being importable.
        from pealim_detail_scrape import _mishkal_to_hebrew

        mismatches = []
        for key, infl in self._iter_inflections(words):
            mishkal_eng = infl.get("mishkal") or ""
            mishkal_heb = infl.get("mishkal_hebrew") or ""
            if not (mishkal_eng and mishkal_heb):
                continue
            expected = _mishkal_to_hebrew(mishkal_eng) or ""
            # An empty conversion result is not treated as a mismatch.
            if expected and expected != mishkal_heb:
                mismatches.append(f"{key}: {mishkal_eng}→{mishkal_heb} (expected {expected})")

        assert not mismatches, f"{len(mismatches)} mishkal mismatches: {mismatches[:10]}"

    def test_mishkal_hebrew_is_hebrew(self, words):
        """mishkal_hebrew must contain Hebrew characters."""
        # At least one base letter alef..tav (U+05D0-U+05EA) is required.
        hebrew_re = re.compile(r"[\u05D0-\u05EA]")
        bad = []
        for key, infl in self._iter_inflections(words):
            mishkal_heb = infl.get("mishkal_hebrew") or ""
            if mishkal_heb and not hebrew_re.search(mishkal_heb):
                bad.append(f"{key}: mishkal_hebrew={mishkal_heb!r}")

        assert not bad, f"{len(bad)} non-Hebrew mishkal_hebrew values: {bad[:10]}"

    def test_no_orphaned_mishkal(self, words):
        """If mishkal_hebrew is set, mishkal (English) must also be set."""
        orphans = []
        for key, infl in self._iter_inflections(words):
            mishkal_heb = infl.get("mishkal_hebrew") or ""
            mishkal_eng = infl.get("mishkal") or ""
            if mishkal_heb and not mishkal_eng:
                orphans.append(f"{key}: has mishkal_hebrew but no mishkal")

        assert not orphans, f"{len(orphans)} orphaned mishkal_hebrew: {orphans[:10]}"
|