hebrew_flash_cards/tests/test_epub_examples.py
Sochen af186e2030 Sprint 17: homograph example dedup + plural audio + prep extraction
- Homograph collision fix: _deduplicate_confusable_examples() clears
  shared examples from less-common confusable group members (36 entries
  fixed). Keeps examples only on highest-frequency meaning.
- Plural deck audio: wired up PluralAudio field in apkg_builder.py,
  downloaded 613 plural audio files from pealim.com for all deck entries.
- Prep extraction upstream: moved Hebrew preposition parsing from build
  time into list/detail scrapers (SCHEMA.yaml prep field added).
- Validation: new no_shared_confusable_examples check in validate_data.py
- Tests: 9 new unit tests for confusable deduplication (98 total)
- Release: v0.19

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-14 21:51:35 +00:00

127 lines
5.5 KiB
Python

"""Tests for epub_examples deduplication of confusable group examples."""
from epub_examples import _deduplicate_confusable_examples
def _make_entry(meaning, confusable_group, vetted_texts=None, frequency_rank=None):
"""Build a minimal words.json entry for testing."""
entry = {
"meaning": meaning,
"confusable_group": confusable_group,
}
if vetted_texts is not None:
entry["examples"] = {
"vetted": [{"text": t, "source": "test", "match_method": "direct"} for t in vetted_texts],
}
if frequency_rank is not None:
entry["frequency_rank"] = frequency_rank
return entry
class TestDeduplicateConfusableExamples:
    """Tests for _deduplicate_confusable_examples().

    Every test builds a small ``words`` mapping (key -> entry dict), passes it
    to the function, and checks both the returned count of cleared entries and
    the in-place state of each entry's ``examples`` afterwards.
    """

    def test_shared_examples_kept_on_higher_frequency(self):
        """When two confusables share identical examples, the one with
        lower frequency_rank (more common) keeps them."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("brother", group, ["sent1", "sent2"], frequency_rank=500),
            "key_b": _make_entry("fireplace", group, ["sent1", "sent2"], frequency_rank=8000),
        }

        cleared = _deduplicate_confusable_examples(words)

        assert cleared == 1
        assert len(words["key_a"]["examples"]["vetted"]) == 2
        assert words["key_b"]["examples"]["vetted"] == []

    def test_no_action_when_examples_differ(self):
        """Groups with different example sets are left untouched."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("meaning1", group, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("meaning2", group, ["sent2"], frequency_rank=200),
        }

        cleared = _deduplicate_confusable_examples(words)

        assert cleared == 0
        assert len(words["key_a"]["examples"]["vetted"]) == 1
        assert len(words["key_b"]["examples"]["vetted"]) == 1

    def test_no_action_when_one_has_no_examples(self):
        """If only one member has examples, nothing to deduplicate."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("meaning1", group, ["sent1"], frequency_rank=100),
            # No vetted_texts: this entry has no "examples" key at all.
            "key_b": _make_entry("meaning2", group, frequency_rank=200),
        }

        cleared = _deduplicate_confusable_examples(words)

        assert cleared == 0
        # The only member that had examples must still have them.
        assert len(words["key_a"]["examples"]["vetted"]) == 1

    def test_no_frequency_uses_alphabetical_tiebreak(self):
        """When no member has frequency data, first alphabetically wins."""
        group = ["alpha_key", "beta_key"]
        words = {
            "alpha_key": _make_entry("meaning1", group, ["sent1"]),
            "beta_key": _make_entry("meaning2", group, ["sent1"]),
        }

        cleared = _deduplicate_confusable_examples(words)

        assert cleared == 1
        assert len(words["alpha_key"]["examples"]["vetted"]) == 1
        assert words["beta_key"]["examples"]["vetted"] == []

    def test_three_way_group(self):
        """Three-member group: highest frequency wins, other two cleared."""
        group = ["key_a", "key_b", "key_c"]
        words = {
            "key_a": _make_entry("yes", group, ["sent1", "sent2"], frequency_rank=50),
            "key_b": _make_entry("honest", group, ["sent1", "sent2"], frequency_rank=3000),
            "key_c": _make_entry("pedestal", group, ["sent1", "sent2"], frequency_rank=15000),
        }

        cleared = _deduplicate_confusable_examples(words)

        assert cleared == 2
        assert len(words["key_a"]["examples"]["vetted"]) == 2
        assert words["key_b"]["examples"]["vetted"] == []
        assert words["key_c"]["examples"]["vetted"] == []

    def test_cloze_removed_from_losers(self):
        """Losing entries should have their cloze data removed too."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("common", group, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("rare", group, ["sent1"], frequency_rank=9000),
        }
        # Attach cloze data to the losing (higher frequency_rank) entry only;
        # the winner is built without any cloze.
        words["key_b"]["examples"]["cloze"] = {"text": "sent1", "cloze_guid": "abc"}

        cleared = _deduplicate_confusable_examples(words)

        assert cleared == 1
        assert "cloze" not in words["key_b"]["examples"]

    def test_no_confusable_groups_returns_zero(self):
        """Words without confusable_group are ignored."""
        # Built by hand rather than via _make_entry so that the
        # "confusable_group" key is absent altogether.
        words = {
            "key_a": {"meaning": "word1", "examples": {"vetted": [{"text": "s1"}]}},
            "key_b": {"meaning": "word2", "examples": {"vetted": [{"text": "s1"}]}},
        }

        cleared = _deduplicate_confusable_examples(words)

        assert cleared == 0
        # Identical sentences, but no group links them: both must survive.
        assert words["key_a"]["examples"]["vetted"] == [{"text": "s1"}]
        assert words["key_b"]["examples"]["vetted"] == [{"text": "s1"}]

    def test_mixed_frequency_and_none(self):
        """Member with frequency beats member without."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("has_freq", group, ["sent1"], frequency_rank=5000),
            "key_b": _make_entry("no_freq", group, ["sent1"]),
        }

        cleared = _deduplicate_confusable_examples(words)

        assert cleared == 1
        assert len(words["key_a"]["examples"]["vetted"]) == 1
        assert words["key_b"]["examples"]["vetted"] == []

    def test_partial_overlap_not_deduplicated(self):
        """Groups with overlapping but not identical sentence sets are not touched."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("m1", group, ["sent1", "sent2"], frequency_rank=100),
            "key_b": _make_entry("m2", group, ["sent1", "sent3"], frequency_rank=200),
        }

        cleared = _deduplicate_confusable_examples(words)

        assert cleared == 0
        # "sent1" is shared but the sets differ, so neither side is cleared.
        assert len(words["key_a"]["examples"]["vetted"]) == 2
        assert len(words["key_b"]["examples"]["vetted"]) == 2