Sprint 17: homograph example dedup + plural audio + prep extraction
- Homograph collision fix: _deduplicate_confusable_examples() clears shared examples from the less-common members of each confusable group (36 entries fixed), keeping examples only on the highest-frequency meaning.
- Plural deck audio: wired up the PluralAudio field in apkg_builder.py and downloaded 613 plural audio files from pealim.com for all deck entries.
- Prep extraction upstream: moved Hebrew preposition parsing from build time into the list/detail scrapers (prep field added to SCHEMA.yaml).
- Validation: new no_shared_confusable_examples check in validate_data.py.
- Tests: 9 new unit tests for confusable deduplication (98 total).
- Release: v0.19

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
0d92451271
commit
af186e2030
9 changed files with 29782 additions and 14386 deletions
|
|
@ -27,6 +27,7 @@ entry:
|
||||||
pos_hebrew: "שֵׁם עֶצֶם" # Part of speech in Hebrew (with nikkud)
|
pos_hebrew: "שֵׁם עֶצֶם" # Part of speech in Hebrew (with nikkud)
|
||||||
meaning: "father" # English meaning (cleaned — no inline emoji, no Hebrew prepositions)
|
meaning: "father" # English meaning (cleaned — no inline emoji, no Hebrew prepositions)
|
||||||
meaning_raw: "father 👨" # Original meaning as scraped (may contain emoji and/or Hebrew preps)
|
meaning_raw: "father 👨" # Original meaning as scraped (may contain emoji and/or Hebrew preps)
|
||||||
|
prep: "על" # Hebrew preposition(s) governing this word, extracted from meaning_raw (e.g. "(על)" → "על"); null if none
|
||||||
audio_url: "https://..." # Pealim audio URL
|
audio_url: "https://..." # Pealim audio URL
|
||||||
audio_file: "6009-av.mp3" # Local filename (slug-based for confusables, consonant-based otherwise)
|
audio_file: "6009-av.mp3" # Local filename (slug-based for confusables, consonant-based otherwise)
|
||||||
tags: "" # Pealim tags if any
|
tags: "" # Pealim tags if any
|
||||||
|
|
|
||||||
|
|
@ -35,7 +35,7 @@ COMPLETE_PLURAL_DECK_ID = 1_234_567_903
|
||||||
|
|
||||||
# Release version tag added to all notes so users can identify which release
|
# Release version tag added to all notes so users can identify which release
|
||||||
# their cards come from (visible in Anki's Browse view and card info).
|
# their cards come from (visible in Anki's Browse view and card info).
|
||||||
RELEASE_TAG = "v0.18"
|
RELEASE_TAG = "v0.19"
|
||||||
|
|
||||||
# Regex for extracting emoji and Hebrew prepositions from meaning strings
|
# Regex for extracting emoji and Hebrew prepositions from meaning strings
|
||||||
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
|
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
|
||||||
|
|
@ -906,9 +906,14 @@ def build_vocab_deck(
|
||||||
emoji_str = emoji_lookup[kw]
|
emoji_str = emoji_lookup[kw]
|
||||||
break
|
break
|
||||||
|
|
||||||
# Extract Hebrew prepositions from meaning_raw
|
# Extract Hebrew prepositions: prefer upstream-parsed prep field, fall back to meaning_raw scan
|
||||||
preps = HBPAREN_RE.findall(meaning_raw)
|
# (fallback covers entries scraped before prep was moved upstream)
|
||||||
prep_str = " ".join(f"({p})" for p in preps)
|
entry_prep = entry.get("prep")
|
||||||
|
if entry_prep:
|
||||||
|
prep_str = " ".join(f"({p})" for p in entry_prep.split())
|
||||||
|
else:
|
||||||
|
preps = HBPAREN_RE.findall(meaning_raw)
|
||||||
|
prep_str = " ".join(f"({p})" for p in preps)
|
||||||
|
|
||||||
# Audio — use audio_file from entry; for confusables it's already slug-based
|
# Audio — use audio_file from entry; for confusables it's already slug-based
|
||||||
audio_tag = ""
|
audio_tag = ""
|
||||||
|
|
@ -1682,12 +1687,20 @@ def build_plural_deck(
|
||||||
sg_audio = ""
|
sg_audio = ""
|
||||||
pl_audio = ""
|
pl_audio = ""
|
||||||
if include_audio:
|
if include_audio:
|
||||||
sg_tag = _audio_tag(singular_ktiv)
|
slug = entry.get("slug", "")
|
||||||
|
sg_tag = _audio_tag(singular_ktiv, slug=slug)
|
||||||
if sg_tag:
|
if sg_tag:
|
||||||
sg_audio = sg_tag
|
sg_audio = sg_tag
|
||||||
mp3_path = AUDIO_DIR / sg_tag.removeprefix("[sound:").removesuffix("]")
|
mp3_path = AUDIO_DIR / sg_tag.removeprefix("[sound:").removesuffix("]")
|
||||||
if mp3_path not in media_files:
|
if mp3_path not in media_files:
|
||||||
media_files.append(mp3_path)
|
media_files.append(mp3_path)
|
||||||
|
# Plural audio: {slug}_plural.mp3
|
||||||
|
if slug:
|
||||||
|
pl_mp3 = AUDIO_DIR / f"{slug}_plural.mp3"
|
||||||
|
if pl_mp3.exists():
|
||||||
|
pl_audio = f"[sound:{pl_mp3.name}]"
|
||||||
|
if pl_mp3 not in media_files:
|
||||||
|
media_files.append(pl_mp3)
|
||||||
|
|
||||||
mishkal_eng = noun_inflection.get("mishkal") or ""
|
mishkal_eng = noun_inflection.get("mishkal") or ""
|
||||||
tags = [RELEASE_TAG]
|
tags = [RELEASE_TAG]
|
||||||
|
|
|
||||||
43857
data/words.json
43857
data/words.json
File diff suppressed because it is too large
Load diff
|
|
@ -719,9 +719,87 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
|
||||||
examples["rejected_count"] = 0
|
examples["rejected_count"] = 0
|
||||||
updated += 1
|
updated += 1
|
||||||
|
|
||||||
|
# Deduplicate shared examples across confusable groups
|
||||||
|
cleared = _deduplicate_confusable_examples(words)
|
||||||
|
if cleared:
|
||||||
|
logger.info(f" Cleared shared examples from {cleared} confusable entries")
|
||||||
|
|
||||||
return updated
|
return updated
|
||||||
|
|
||||||
|
|
||||||
|
def _deduplicate_confusable_examples(words: dict) -> int:
    """Remove shared examples from less-common confusable group members.

    After example matching assigns sentences, confusable entries often share
    identical examples (matched via shared nikkud forms). This function keeps
    examples only on the highest-frequency member, clearing others.

    Only members whose sets of vetted sentence texts are identical and
    non-empty are deduplicated; partially overlapping sets are left untouched.

    Args:
        words: The full words.json dict, modified in place (examples already
            assigned).

    Returns:
        Count of entries whose examples were cleared.
    """
    from collections import defaultdict

    def _preference(key: str) -> tuple[bool, int, str]:
        """Sort key: ranked entries first (lower rank wins), then by unique_key.

        A missing frequency_rank is expressed through the leading boolean
        rather than a magic sentinel number, so an unranked entry can never
        tie with or beat an entry that has a real (however large) rank.
        """
        rank = words[key].get("frequency_rank")
        return (rank is None, 0 if rank is None else rank, key)

    # Build confusable group map: canonical (sorted) group tuple -> [unique_key, ...]
    group_map: dict[tuple[str, ...], list[str]] = defaultdict(list)
    for key, entry in words.items():
        cg = entry.get("confusable_group")
        if cg:
            group_map[tuple(sorted(cg))].append(key)

    cleared = 0

    for members in group_map.values():
        if len(members) < 2:
            continue

        # Bucket members by their exact set of vetted sentence texts;
        # members without any vetted sentences are ignored.
        text_groups: dict[frozenset[str], list[str]] = defaultdict(list)
        for key in members:
            vetted = (words[key].get("examples") or {}).get("vetted") or []
            texts = frozenset(e.get("text", "") for e in vetted)
            if texts:
                text_groups[texts].append(key)

        # For each set of members sharing identical examples, keep only the
        # highest-frequency one.
        for sharing_keys in text_groups.values():
            if len(sharing_keys) < 2:
                continue

            winner, *losers = sorted(sharing_keys, key=_preference)

            for loser_key in losers:
                entry = words[loser_key]
                examples = entry.get("examples") or {}
                examples["vetted"] = []
                # Cloze data is derived from the vetted sentences, so drop it too.
                examples.pop("cloze", None)
                entry["examples"] = examples
                cleared += 1
                logger.debug(f" Cleared examples from {loser_key} (kept on {winner})")

    return cleared
|
||||||
|
|
||||||
|
|
||||||
# ── Public API ───────────────────────────────────────────────────
|
# ── Public API ───────────────────────────────────────────────────
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -40,6 +40,9 @@ SAVE_INTERVAL = 50 # write words.json every N processed entries
|
||||||
|
|
||||||
WORDS_JSON = Path(__file__).parent / "data" / "words.json"
|
WORDS_JSON = Path(__file__).parent / "data" / "words.json"
|
||||||
|
|
||||||
|
# Regex for Hebrew prepositions wrapped in parentheses, e.g. "(על)" or "(ב-)"
|
||||||
|
HBPAREN_RE = re.compile(r"\(([\u05b0-\u05ea\u05f0-\u05f4\-]+)\)")
|
||||||
|
|
||||||
BINYAN_NAMES: tuple[str, ...] = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al")
|
BINYAN_NAMES: tuple[str, ...] = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al")
|
||||||
_BINYAN_NAMES_LOWER: tuple[str, ...] = tuple(b.lower() for b in BINYAN_NAMES)
|
_BINYAN_NAMES_LOWER: tuple[str, ...] = tuple(b.lower() for b in BINYAN_NAMES)
|
||||||
|
|
||||||
|
|
@ -948,9 +951,17 @@ def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: di
|
||||||
binyan = _extract_binyan_from_page(mo_soup)
|
binyan = _extract_binyan_from_page(mo_soup)
|
||||||
|
|
||||||
meaning = ""
|
meaning = ""
|
||||||
|
prep: str | None = None
|
||||||
lead_div = mo_soup.find("div", class_="lead")
|
lead_div = mo_soup.find("div", class_="lead")
|
||||||
if lead_div:
|
if lead_div:
|
||||||
meaning = lead_div.get_text(strip=True)
|
meaning = lead_div.get_text(strip=True)
|
||||||
|
# Extract preposition(s) from the lead text, e.g. "(על)" → "על"
|
||||||
|
prep_matches = HBPAREN_RE.findall(meaning)
|
||||||
|
if prep_matches:
|
||||||
|
prep = " ".join(prep_matches)
|
||||||
|
# Fall back to any prep already stored (e.g. from a previous manual edit)
|
||||||
|
if prep is None:
|
||||||
|
prep = existing.get("prep")
|
||||||
|
|
||||||
# Parse active forms
|
# Parse active forms
|
||||||
mo_active = _parse_conjugation_table(mo_soup, passive=False)
|
mo_active = _parse_conjugation_table(mo_soup, passive=False)
|
||||||
|
|
@ -1002,7 +1013,7 @@ def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: di
|
||||||
"binyan": binyan,
|
"binyan": binyan,
|
||||||
"binyan_hebrew": BINYAN_HEBREW.get(binyan, ""),
|
"binyan_hebrew": BINYAN_HEBREW.get(binyan, ""),
|
||||||
"meaning": meaning,
|
"meaning": meaning,
|
||||||
"prep": existing.get("prep"),
|
"prep": prep,
|
||||||
"active_forms": active_forms,
|
"active_forms": active_forms,
|
||||||
"hufal_pual_forms": hufal_pual_forms,
|
"hufal_pual_forms": hufal_pual_forms,
|
||||||
"reference_form_passive": reference_form_passive,
|
"reference_form_passive": reference_form_passive,
|
||||||
|
|
|
||||||
|
|
@ -86,6 +86,9 @@ EMOJI_RE = re.compile(
|
||||||
re.UNICODE,
|
re.UNICODE,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Regex for extracting Hebrew prepositions wrapped in parentheses, e.g. "(על)" or "(ב-)"
|
||||||
|
HBPAREN_RE = re.compile(r"\(([\u05b0-\u05ea\u05f0-\u05f4\-]+)\)")
|
||||||
|
|
||||||
# Fields that must never be overwritten when updating an existing entry
|
# Fields that must never be overwritten when updating an existing entry
|
||||||
PROTECTED_FIELDS = frozenset(
|
PROTECTED_FIELDS = frozenset(
|
||||||
[
|
[
|
||||||
|
|
@ -149,6 +152,7 @@ def _default_entry() -> dict:
|
||||||
"image": None,
|
"image": None,
|
||||||
"image_source": None,
|
"image_source": None,
|
||||||
"hint": "",
|
"hint": "",
|
||||||
|
"prep": None,
|
||||||
"shared_roots": [],
|
"shared_roots": [],
|
||||||
"confusable_group": None,
|
"confusable_group": None,
|
||||||
"confusables_guid": None,
|
"confusables_guid": None,
|
||||||
|
|
@ -170,8 +174,9 @@ def _extract_emoji(text: str) -> str | None:
|
||||||
|
|
||||||
|
|
||||||
def _clean_meaning(raw: str) -> str:
    """Return ``raw`` without emoji or parenthesized Hebrew prepositions.

    Emoji matches (``EMOJI_RE``) are removed first, then parenthesized Hebrew
    prepositions (``HBPAREN_RE``). Finally every run of whitespace is collapsed
    to a single space and leading/trailing whitespace is dropped.
    """
    without_markup = HBPAREN_RE.sub("", EMOJI_RE.sub("", raw))
    return " ".join(without_markup.split())
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -453,6 +458,9 @@ def _merge_row(
|
||||||
emoji = _extract_emoji(meaning_raw_raw)
|
emoji = _extract_emoji(meaning_raw_raw)
|
||||||
tags = _build_tags(pos_en, root)
|
tags = _build_tags(pos_en, root)
|
||||||
audio_file = _compute_audio_file(slug, ktiv_male)
|
audio_file = _compute_audio_file(slug, ktiv_male)
|
||||||
|
# Extract Hebrew preposition(s) from the raw meaning (e.g. "(על)" → "על")
|
||||||
|
prep_matches = HBPAREN_RE.findall(meaning_raw)
|
||||||
|
prep: str | None = " ".join(prep_matches) if prep_matches else None
|
||||||
|
|
||||||
# ---- locate existing entry ----
|
# ---- locate existing entry ----
|
||||||
unique_key: str | None = slug_index.get(slug) if slug else None
|
unique_key: str | None = slug_index.get(slug) if slug else None
|
||||||
|
|
@ -468,6 +476,7 @@ def _merge_row(
|
||||||
entry["pos_hebrew"] = pos_heb
|
entry["pos_hebrew"] = pos_heb
|
||||||
entry["meaning"] = meaning
|
entry["meaning"] = meaning
|
||||||
entry["meaning_raw"] = meaning_raw
|
entry["meaning_raw"] = meaning_raw
|
||||||
|
entry["prep"] = prep
|
||||||
entry["audio_url"] = audio_url
|
entry["audio_url"] = audio_url
|
||||||
entry["audio_file"] = audio_file
|
entry["audio_file"] = audio_file
|
||||||
entry["tags"] = tags
|
entry["tags"] = tags
|
||||||
|
|
@ -484,6 +493,7 @@ def _merge_row(
|
||||||
entry["pos_hebrew"] = pos_heb
|
entry["pos_hebrew"] = pos_heb
|
||||||
entry["meaning"] = meaning
|
entry["meaning"] = meaning
|
||||||
entry["meaning_raw"] = meaning_raw
|
entry["meaning_raw"] = meaning_raw
|
||||||
|
entry["prep"] = prep
|
||||||
entry["emoji"] = emoji
|
entry["emoji"] = emoji
|
||||||
entry["emoji_source"] = "from_pealim" if emoji else None
|
entry["emoji_source"] = "from_pealim" if emoji else None
|
||||||
entry["audio_url"] = audio_url
|
entry["audio_url"] = audio_url
|
||||||
|
|
|
||||||
|
|
@ -20,8 +20,11 @@ from pathlib import Path
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
|
sys.path.insert(0, "/home/node/projects")
|
||||||
|
import load_keeshare
|
||||||
|
|
||||||
REPO_API = "https://git.nevo.engineer/api/v1/repos/nevo/hebrew_flash_cards"
|
REPO_API = "https://git.nevo.engineer/api/v1/repos/nevo/hebrew_flash_cards"
|
||||||
FORGEJO_TOKEN = "f023bd4cfd4b77aac584647f2fa8481df3906578"
|
FORGEJO_TOKEN: str = load_keeshare.get_entry("git.nevo.engineer")["API_TOKEN"]
|
||||||
OUTPUT_DIR = Path(__file__).parent / "output"
|
OUTPUT_DIR = Path(__file__).parent / "output"
|
||||||
|
|
||||||
# All deck variants to include in release
|
# All deck variants to include in release
|
||||||
|
|
|
||||||
|
|
@ -685,6 +685,57 @@ def test_no_stripped_form_sentence_collisions(data: dict[str, Any]) -> None:
|
||||||
_pass(name)
|
_pass(name)
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_shared_confusable_examples(data: dict[str, Any]) -> None:
    """Verify that confusable-group members never carry identical example sets.

    Two entries of the same confusable group whose vetted sentence texts form
    exactly the same non-empty set indicate that the deduplication step in
    epub_examples.py did not leave the examples on a single member only.
    Every later member is reported against the first member seen with that set.
    """
    from collections import defaultdict

    name = "no_shared_confusable_examples"

    # Bucket entry keys by their canonical (sorted) confusable group.
    members_by_group: dict[tuple[str, ...], list[str]] = defaultdict(list)
    for entry_key, entry in data.items():
        group = entry.get("confusable_group")
        if not group:
            continue
        members_by_group[tuple(sorted(group))].append(entry_key)

    errors: list[str] = []
    for members in members_by_group.values():
        if len(members) < 2:
            continue

        # Maps each non-empty sentence set to the first member that owns it.
        first_owner: dict[frozenset[str], str] = {}
        for entry_key in members:
            vetted = (data[entry_key].get("examples") or {}).get("vetted") or []
            texts = frozenset(item.get("text", "") for item in vetted)
            if not texts:
                continue

            owner = first_owner.get(texts)
            if owner is None:
                first_owner[texts] = entry_key
                continue

            meaning_a = (data[owner].get("meaning") or "")[:30]
            meaning_b = (data[entry_key].get("meaning") or "")[:30]
            errors.append(
                f"{owner} ({meaning_a}) and {entry_key} ({meaning_b}) share {len(texts)} identical example(s)"
            )

    if not errors:
        _pass(name)
        return

    # Cap the report at 20 lines unless verbose output was requested.
    _fail(name, errors if _verbose else errors[:20])
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||||
|
|
||||||
|
|
||||||
def test_no_hebrew_in_meaning(data: dict[str, Any]) -> None:
|
def test_no_hebrew_in_meaning(data: dict[str, Any]) -> None:
|
||||||
"""English meanings must not contain bare Hebrew text (spoils the card)."""
|
"""English meanings must not contain bare Hebrew text (spoils the card)."""
|
||||||
name = "no_hebrew_in_meaning"
|
name = "no_hebrew_in_meaning"
|
||||||
|
|
@ -801,6 +852,7 @@ ALL_TESTS: dict[str, Any] = {
|
||||||
"conjugation_form_guids": test_conjugation_form_guids,
|
"conjugation_form_guids": test_conjugation_form_guids,
|
||||||
"conjugation_person_codes": test_conjugation_person_codes,
|
"conjugation_person_codes": test_conjugation_person_codes,
|
||||||
"no_stripped_form_sentence_collisions": test_no_stripped_form_sentence_collisions,
|
"no_stripped_form_sentence_collisions": test_no_stripped_form_sentence_collisions,
|
||||||
|
"no_shared_confusable_examples": test_no_shared_confusable_examples,
|
||||||
"no_hebrew_in_meaning": test_no_hebrew_in_meaning,
|
"no_hebrew_in_meaning": test_no_hebrew_in_meaning,
|
||||||
"mishkal_consistency": test_mishkal_consistency,
|
"mishkal_consistency": test_mishkal_consistency,
|
||||||
}
|
}
|
||||||
|
|
|
||||||
127
tests/test_epub_examples.py
Normal file
127
tests/test_epub_examples.py
Normal file
|
|
@ -0,0 +1,127 @@
|
||||||
|
"""Tests for epub_examples deduplication of confusable group examples."""
|
||||||
|
|
||||||
|
from epub_examples import _deduplicate_confusable_examples
|
||||||
|
|
||||||
|
|
||||||
|
def _make_entry(meaning, confusable_group, vetted_texts=None, frequency_rank=None):
|
||||||
|
"""Build a minimal words.json entry for testing."""
|
||||||
|
entry = {
|
||||||
|
"meaning": meaning,
|
||||||
|
"confusable_group": confusable_group,
|
||||||
|
}
|
||||||
|
if vetted_texts is not None:
|
||||||
|
entry["examples"] = {
|
||||||
|
"vetted": [{"text": t, "source": "test", "match_method": "direct"} for t in vetted_texts],
|
||||||
|
}
|
||||||
|
if frequency_rank is not None:
|
||||||
|
entry["frequency_rank"] = frequency_rank
|
||||||
|
return entry
|
||||||
|
|
||||||
|
|
||||||
|
class TestDeduplicateConfusableExamples:
    """Tests for _deduplicate_confusable_examples()."""

    def test_shared_examples_kept_on_higher_frequency(self):
        """When two confusables share identical examples, the one with
        lower frequency_rank (more common) keeps them."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("brother", group, ["sent1", "sent2"], frequency_rank=500),
            "key_b": _make_entry("fireplace", group, ["sent1", "sent2"], frequency_rank=8000),
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 1
        assert len(words["key_a"]["examples"]["vetted"]) == 2
        assert words["key_b"]["examples"]["vetted"] == []

    def test_no_action_when_examples_differ(self):
        """Groups with different example sets are left untouched."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("meaning1", group, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("meaning2", group, ["sent2"], frequency_rank=200),
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 0
        assert len(words["key_a"]["examples"]["vetted"]) == 1
        assert len(words["key_b"]["examples"]["vetted"]) == 1

    def test_no_action_when_one_has_no_examples(self):
        """If only one member has examples, nothing to deduplicate."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("meaning1", group, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("meaning2", group, frequency_rank=200),
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 0

    def test_no_frequency_uses_alphabetical_tiebreak(self):
        """When no member has frequency data, first alphabetically wins."""
        group = ["alpha_key", "beta_key"]
        words = {
            "alpha_key": _make_entry("meaning1", group, ["sent1"]),
            "beta_key": _make_entry("meaning2", group, ["sent1"]),
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 1
        assert len(words["alpha_key"]["examples"]["vetted"]) == 1
        assert words["beta_key"]["examples"]["vetted"] == []

    def test_three_way_group(self):
        """Three-member group: highest frequency wins, other two cleared."""
        group = ["key_a", "key_b", "key_c"]
        words = {
            "key_a": _make_entry("yes", group, ["sent1", "sent2"], frequency_rank=50),
            "key_b": _make_entry("honest", group, ["sent1", "sent2"], frequency_rank=3000),
            "key_c": _make_entry("pedestal", group, ["sent1", "sent2"], frequency_rank=15000),
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 2
        assert len(words["key_a"]["examples"]["vetted"]) == 2
        assert words["key_b"]["examples"]["vetted"] == []
        assert words["key_c"]["examples"]["vetted"] == []

    def test_cloze_removed_from_losers(self):
        """Losing entries lose their cloze data; the winner keeps its own."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("common", group, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("rare", group, ["sent1"], frequency_rank=9000),
        }
        # Add cloze to both, so the test also proves the winner is not touched
        winner_cloze = {"text": "sent1", "cloze_guid": "win"}
        words["key_a"]["examples"]["cloze"] = dict(winner_cloze)
        words["key_b"]["examples"]["cloze"] = {"text": "sent1", "cloze_guid": "abc"}
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 1
        assert "cloze" not in words["key_b"]["examples"]
        assert words["key_a"]["examples"]["cloze"] == winner_cloze

    def test_no_confusable_groups_returns_zero(self):
        """Words without confusable_group are ignored."""
        words = {
            "key_a": {"meaning": "word1", "examples": {"vetted": [{"text": "s1"}]}},
            "key_b": {"meaning": "word2", "examples": {"vetted": [{"text": "s1"}]}},
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 0

    def test_mixed_frequency_and_none(self):
        """Member with frequency beats member without."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("has_freq", group, ["sent1"], frequency_rank=5000),
            "key_b": _make_entry("no_freq", group, ["sent1"]),
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 1
        assert len(words["key_a"]["examples"]["vetted"]) == 1
        assert words["key_b"]["examples"]["vetted"] == []

    def test_partial_overlap_not_deduplicated(self):
        """Groups with overlapping but not identical sentence sets are not touched."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("m1", group, ["sent1", "sent2"], frequency_rank=100),
            "key_b": _make_entry("m2", group, ["sent1", "sent3"], frequency_rank=200),
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 0
|
||||||
Loading…
Reference in a new issue