Sprint 17: homograph example dedup + plural audio + prep extraction

- Homograph collision fix: _deduplicate_confusable_examples() clears
  shared examples from less-common confusable group members (36 entries
  fixed). Keeps examples only on highest-frequency meaning.
- Plural deck audio: wired up PluralAudio field in apkg_builder.py,
  downloaded 613 plural audio files from pealim.com for all deck entries.
- Prep extraction upstream: moved Hebrew preposition parsing from build
  time into list/detail scrapers (SCHEMA.yaml prep field added).
- Validation: new no_shared_confusable_examples check in validate_data.py
- Tests: 9 new unit tests for confusable deduplication (98 total)
- Release: v0.19

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Sochen 2026-03-14 21:51:35 +00:00
parent 0d92451271
commit af186e2030
9 changed files with 29782 additions and 14386 deletions

View file

@ -27,6 +27,7 @@ entry:
pos_hebrew: "שֵׁם עֶצֶם" # Part of speech in Hebrew (with nikkud) pos_hebrew: "שֵׁם עֶצֶם" # Part of speech in Hebrew (with nikkud)
meaning: "father" # English meaning (cleaned — no inline emoji, no Hebrew prepositions) meaning: "father" # English meaning (cleaned — no inline emoji, no Hebrew prepositions)
meaning_raw: "father 👨" # Original meaning as scraped (may contain emoji and/or Hebrew preps) meaning_raw: "father 👨" # Original meaning as scraped (may contain emoji and/or Hebrew preps)
prep: "על" # Hebrew preposition(s) governing this word, extracted from meaning_raw (e.g. "(על)" → "על"); null if none
audio_url: "https://..." # Pealim audio URL audio_url: "https://..." # Pealim audio URL
audio_file: "6009-av.mp3" # Local filename (slug-based for confusables, consonant-based otherwise) audio_file: "6009-av.mp3" # Local filename (slug-based for confusables, consonant-based otherwise)
tags: "" # Pealim tags if any tags: "" # Pealim tags if any

View file

@ -35,7 +35,7 @@ COMPLETE_PLURAL_DECK_ID = 1_234_567_903
# Release version tag added to all notes so users can identify which release # Release version tag added to all notes so users can identify which release
# their cards come from (visible in Anki's Browse view and card info). # their cards come from (visible in Anki's Browse view and card info).
RELEASE_TAG = "v0.18" RELEASE_TAG = "v0.19"
# Regex for extracting emoji and Hebrew prepositions from meaning strings # Regex for extracting emoji and Hebrew prepositions from meaning strings
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+") EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
@ -906,9 +906,14 @@ def build_vocab_deck(
emoji_str = emoji_lookup[kw] emoji_str = emoji_lookup[kw]
break break
# Extract Hebrew prepositions from meaning_raw # Extract Hebrew prepositions: prefer upstream-parsed prep field, fall back to meaning_raw scan
preps = HBPAREN_RE.findall(meaning_raw) # (fallback covers entries scraped before prep was moved upstream)
prep_str = " ".join(f"({p})" for p in preps) entry_prep = entry.get("prep")
if entry_prep:
prep_str = " ".join(f"({p})" for p in entry_prep.split())
else:
preps = HBPAREN_RE.findall(meaning_raw)
prep_str = " ".join(f"({p})" for p in preps)
# Audio — use audio_file from entry; for confusables it's already slug-based # Audio — use audio_file from entry; for confusables it's already slug-based
audio_tag = "" audio_tag = ""
@ -1682,12 +1687,20 @@ def build_plural_deck(
sg_audio = "" sg_audio = ""
pl_audio = "" pl_audio = ""
if include_audio: if include_audio:
sg_tag = _audio_tag(singular_ktiv) slug = entry.get("slug", "")
sg_tag = _audio_tag(singular_ktiv, slug=slug)
if sg_tag: if sg_tag:
sg_audio = sg_tag sg_audio = sg_tag
mp3_path = AUDIO_DIR / sg_tag.removeprefix("[sound:").removesuffix("]") mp3_path = AUDIO_DIR / sg_tag.removeprefix("[sound:").removesuffix("]")
if mp3_path not in media_files: if mp3_path not in media_files:
media_files.append(mp3_path) media_files.append(mp3_path)
# Plural audio: {slug}_plural.mp3
if slug:
pl_mp3 = AUDIO_DIR / f"{slug}_plural.mp3"
if pl_mp3.exists():
pl_audio = f"[sound:{pl_mp3.name}]"
if pl_mp3 not in media_files:
media_files.append(pl_mp3)
mishkal_eng = noun_inflection.get("mishkal") or "" mishkal_eng = noun_inflection.get("mishkal") or ""
tags = [RELEASE_TAG] tags = [RELEASE_TAG]

File diff suppressed because it is too large Load diff

View file

@ -719,9 +719,87 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
examples["rejected_count"] = 0 examples["rejected_count"] = 0
updated += 1 updated += 1
# Deduplicate shared examples across confusable groups
cleared = _deduplicate_confusable_examples(words)
if cleared:
logger.info(f" Cleared shared examples from {cleared} confusable entries")
return updated return updated
def _deduplicate_confusable_examples(words: dict) -> int:
    """Remove shared examples from less-common confusable group members.

    After example matching assigns sentences, confusable entries often share
    identical examples (matched via shared nikkud forms). Within each
    confusable group, members whose sets of vetted sentence texts are
    non-empty and exactly equal are treated as sharing examples, and only one
    of them keeps them. Members whose sets differ, or only partially overlap,
    are left untouched.

    The member that keeps its examples is chosen in this order:

    1. entries that have a ``frequency_rank`` beat entries that have none;
    2. the lower ``frequency_rank`` (more common word) wins;
    3. remaining ties are broken alphabetically by unique key.

    Every other member has ``examples["vetted"]`` reset to ``[]`` and its
    ``examples["cloze"]`` key removed.

    Args:
        words: The full words.json dict, modified in place (examples already
            assigned).

    Returns:
        Count of entries whose examples were cleared.
    """
    from collections import defaultdict

    def _rank_key(key: str) -> tuple[bool, int, str]:
        """Sort key: ranked before unranked, then by rank, then by key."""
        rank = words[key].get("frequency_rank")
        # The leading flag sorts unranked entries last for *any* rank value;
        # a numeric sentinel would be beaten by a sufficiently large real rank.
        return (rank is None, rank if rank is not None else 0, key)

    # Group entry keys by confusable group. The sorted tuple of the entry's
    # confusable_group value serves as an order-independent group id.
    group_map: dict[tuple[str, ...], list[str]] = defaultdict(list)
    for key, entry in words.items():
        cg = entry.get("confusable_group")
        if cg:
            group_map[tuple(sorted(cg))].append(key)

    cleared = 0
    for members in group_map.values():
        if len(members) < 2:
            continue

        # Within the group, bucket members by their exact set of vetted
        # sentence texts; entries with no examples are skipped.
        text_groups: dict[frozenset[str], list[str]] = defaultdict(list)
        for key in members:
            vetted = (words[key].get("examples") or {}).get("vetted") or []
            texts = frozenset(e.get("text", "") for e in vetted)
            if texts:
                text_groups[texts].append(key)

        # For each bucket shared by two or more members, keep the examples on
        # the best-ranked member only.
        for sharing_keys in text_groups.values():
            if len(sharing_keys) < 2:
                continue

            winner, *losers = sorted(sharing_keys, key=_rank_key)
            for loser_key in losers:
                entry = words[loser_key]
                examples = entry.get("examples") or {}
                examples["vetted"] = []
                # The cloze was built from the sentences being removed.
                examples.pop("cloze", None)
                entry["examples"] = examples
                cleared += 1
                logger.debug(f" Cleared examples from {loser_key} (kept on {winner})")

    return cleared
# ── Public API ─────────────────────────────────────────────────── # ── Public API ───────────────────────────────────────────────────

View file

@ -40,6 +40,9 @@ SAVE_INTERVAL = 50 # write words.json every N processed entries
WORDS_JSON = Path(__file__).parent / "data" / "words.json" WORDS_JSON = Path(__file__).parent / "data" / "words.json"
# Regex for Hebrew prepositions wrapped in parentheses, e.g. "(על)" or "(ב-)"
HBPAREN_RE = re.compile(r"\(([\u05b0-\u05ea\u05f0-\u05f4\-]+)\)")
BINYAN_NAMES: tuple[str, ...] = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al") BINYAN_NAMES: tuple[str, ...] = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al")
_BINYAN_NAMES_LOWER: tuple[str, ...] = tuple(b.lower() for b in BINYAN_NAMES) _BINYAN_NAMES_LOWER: tuple[str, ...] = tuple(b.lower() for b in BINYAN_NAMES)
@ -948,9 +951,17 @@ def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: di
binyan = _extract_binyan_from_page(mo_soup) binyan = _extract_binyan_from_page(mo_soup)
meaning = "" meaning = ""
prep: str | None = None
lead_div = mo_soup.find("div", class_="lead") lead_div = mo_soup.find("div", class_="lead")
if lead_div: if lead_div:
meaning = lead_div.get_text(strip=True) meaning = lead_div.get_text(strip=True)
# Extract preposition(s) from the lead text, e.g. "(על)" → "על"
prep_matches = HBPAREN_RE.findall(meaning)
if prep_matches:
prep = " ".join(prep_matches)
# Fall back to any prep already stored (e.g. from a previous manual edit)
if prep is None:
prep = existing.get("prep")
# Parse active forms # Parse active forms
mo_active = _parse_conjugation_table(mo_soup, passive=False) mo_active = _parse_conjugation_table(mo_soup, passive=False)
@ -1002,7 +1013,7 @@ def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: di
"binyan": binyan, "binyan": binyan,
"binyan_hebrew": BINYAN_HEBREW.get(binyan, ""), "binyan_hebrew": BINYAN_HEBREW.get(binyan, ""),
"meaning": meaning, "meaning": meaning,
"prep": existing.get("prep"), "prep": prep,
"active_forms": active_forms, "active_forms": active_forms,
"hufal_pual_forms": hufal_pual_forms, "hufal_pual_forms": hufal_pual_forms,
"reference_form_passive": reference_form_passive, "reference_form_passive": reference_form_passive,

View file

@ -86,6 +86,9 @@ EMOJI_RE = re.compile(
re.UNICODE, re.UNICODE,
) )
# Regex for extracting Hebrew prepositions wrapped in parentheses, e.g. "(על)" or "(ב-)"
HBPAREN_RE = re.compile(r"\(([\u05b0-\u05ea\u05f0-\u05f4\-]+)\)")
# Fields that must never be overwritten when updating an existing entry # Fields that must never be overwritten when updating an existing entry
PROTECTED_FIELDS = frozenset( PROTECTED_FIELDS = frozenset(
[ [
@ -149,6 +152,7 @@ def _default_entry() -> dict:
"image": None, "image": None,
"image_source": None, "image_source": None,
"hint": "", "hint": "",
"prep": None,
"shared_roots": [], "shared_roots": [],
"confusable_group": None, "confusable_group": None,
"confusables_guid": None, "confusables_guid": None,
@ -170,8 +174,9 @@ def _extract_emoji(text: str) -> str | None:
def _clean_meaning(raw: str) -> str:
    """Return *raw* without emoji or parenthesized Hebrew prepositions.

    Matches of ``EMOJI_RE`` are removed first, then matches of
    ``HBPAREN_RE``; finally every run of whitespace is collapsed to a single
    space and leading/trailing whitespace is dropped.
    """
    without_emoji = EMOJI_RE.sub("", raw)
    without_preps = HBPAREN_RE.sub("", without_emoji)
    # split() with no arguments discards empty fields, so the gaps left by
    # the removals above do not turn into double spaces.
    tokens = without_preps.split()
    return " ".join(tokens)
@ -453,6 +458,9 @@ def _merge_row(
emoji = _extract_emoji(meaning_raw_raw) emoji = _extract_emoji(meaning_raw_raw)
tags = _build_tags(pos_en, root) tags = _build_tags(pos_en, root)
audio_file = _compute_audio_file(slug, ktiv_male) audio_file = _compute_audio_file(slug, ktiv_male)
# Extract Hebrew preposition(s) from the raw meaning (e.g. "(על)" → "על")
prep_matches = HBPAREN_RE.findall(meaning_raw)
prep: str | None = " ".join(prep_matches) if prep_matches else None
# ---- locate existing entry ---- # ---- locate existing entry ----
unique_key: str | None = slug_index.get(slug) if slug else None unique_key: str | None = slug_index.get(slug) if slug else None
@ -468,6 +476,7 @@ def _merge_row(
entry["pos_hebrew"] = pos_heb entry["pos_hebrew"] = pos_heb
entry["meaning"] = meaning entry["meaning"] = meaning
entry["meaning_raw"] = meaning_raw entry["meaning_raw"] = meaning_raw
entry["prep"] = prep
entry["audio_url"] = audio_url entry["audio_url"] = audio_url
entry["audio_file"] = audio_file entry["audio_file"] = audio_file
entry["tags"] = tags entry["tags"] = tags
@ -484,6 +493,7 @@ def _merge_row(
entry["pos_hebrew"] = pos_heb entry["pos_hebrew"] = pos_heb
entry["meaning"] = meaning entry["meaning"] = meaning
entry["meaning_raw"] = meaning_raw entry["meaning_raw"] = meaning_raw
entry["prep"] = prep
entry["emoji"] = emoji entry["emoji"] = emoji
entry["emoji_source"] = "from_pealim" if emoji else None entry["emoji_source"] = "from_pealim" if emoji else None
entry["audio_url"] = audio_url entry["audio_url"] = audio_url

View file

@ -20,8 +20,11 @@ from pathlib import Path
import requests import requests
sys.path.insert(0, "/home/node/projects")
import load_keeshare
REPO_API = "https://git.nevo.engineer/api/v1/repos/nevo/hebrew_flash_cards" REPO_API = "https://git.nevo.engineer/api/v1/repos/nevo/hebrew_flash_cards"
FORGEJO_TOKEN = "f023bd4cfd4b77aac584647f2fa8481df3906578" FORGEJO_TOKEN: str = load_keeshare.get_entry("git.nevo.engineer")["API_TOKEN"]
OUTPUT_DIR = Path(__file__).parent / "output" OUTPUT_DIR = Path(__file__).parent / "output"
# All deck variants to include in release # All deck variants to include in release

View file

@ -685,6 +685,57 @@ def test_no_stripped_form_sentence_collisions(data: dict[str, Any]) -> None:
_pass(name) _pass(name)
def test_no_shared_confusable_examples(data: dict[str, Any]) -> None:
    """Check that confusable-group members never carry identical vetted examples.

    Entries are grouped by their ``confusable_group`` value. Within a group,
    each member's vetted examples are reduced to the set of their ``text``
    values; two members with the same non-empty set are reported as an error.
    Members with no vetted examples are ignored.

    Shared examples indicate the deduplication step in epub_examples.py
    failed to assign examples to only the highest-frequency member.

    The result is reported through ``_pass`` / ``_fail``. Unless ``_verbose``
    is set, at most 20 errors are passed on and a summary line with the
    number of omitted errors is printed.
    """
    from collections import defaultdict

    name = "no_shared_confusable_examples"

    # Bucket entry keys by group; sorting makes the group id independent of
    # the order in which an entry lists its confusables.
    groups: dict[tuple[str, ...], list[str]] = defaultdict(list)
    for entry_key, entry in data.items():
        confusables = entry.get("confusable_group")
        if not confusables:
            continue
        groups[tuple(sorted(confusables))].append(entry_key)

    errors: list[str] = []
    for group_members in groups.values():
        if len(group_members) < 2:
            continue

        # Maps each sentence set to the first member seen carrying it.
        first_owner: dict[frozenset[str], str] = {}
        for entry_key in group_members:
            examples = data[entry_key].get("examples") or {}
            vetted = examples.get("vetted") or []
            sentences = frozenset(item.get("text", "") for item in vetted)
            if not sentences:
                continue
            if sentences not in first_owner:
                first_owner[sentences] = entry_key
                continue

            owner_key = first_owner[sentences]
            meaning_a = (data[owner_key].get("meaning") or "")[:30]
            meaning_b = (data[entry_key].get("meaning") or "")[:30]
            errors.append(
                f"{owner_key} ({meaning_a}) and {entry_key} ({meaning_b}) share {len(sentences)} identical example(s)"
            )

    if not errors:
        _pass(name)
        return

    reported = errors if _verbose else errors[:20]
    _fail(name, reported)
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_no_hebrew_in_meaning(data: dict[str, Any]) -> None: def test_no_hebrew_in_meaning(data: dict[str, Any]) -> None:
"""English meanings must not contain bare Hebrew text (spoils the card).""" """English meanings must not contain bare Hebrew text (spoils the card)."""
name = "no_hebrew_in_meaning" name = "no_hebrew_in_meaning"
@ -801,6 +852,7 @@ ALL_TESTS: dict[str, Any] = {
"conjugation_form_guids": test_conjugation_form_guids, "conjugation_form_guids": test_conjugation_form_guids,
"conjugation_person_codes": test_conjugation_person_codes, "conjugation_person_codes": test_conjugation_person_codes,
"no_stripped_form_sentence_collisions": test_no_stripped_form_sentence_collisions, "no_stripped_form_sentence_collisions": test_no_stripped_form_sentence_collisions,
"no_shared_confusable_examples": test_no_shared_confusable_examples,
"no_hebrew_in_meaning": test_no_hebrew_in_meaning, "no_hebrew_in_meaning": test_no_hebrew_in_meaning,
"mishkal_consistency": test_mishkal_consistency, "mishkal_consistency": test_mishkal_consistency,
} }

127
tests/test_epub_examples.py Normal file
View file

@ -0,0 +1,127 @@
"""Tests for epub_examples deduplication of confusable group examples."""
from epub_examples import _deduplicate_confusable_examples
def _make_entry(meaning, confusable_group, vetted_texts=None, frequency_rank=None):
    """Create a stripped-down words.json entry for the deduplication tests.

    ``meaning`` and ``confusable_group`` are always present. An ``examples``
    dict (with one vetted record per text) is added only when ``vetted_texts``
    is not None, and ``frequency_rank`` only when it is not None.
    """
    extras = {}
    if vetted_texts is not None:
        vetted_records = [
            dict(text=sentence, source="test", match_method="direct")
            for sentence in vetted_texts
        ]
        extras["examples"] = {"vetted": vetted_records}
    if frequency_rank is not None:
        extras["frequency_rank"] = frequency_rank
    return {"meaning": meaning, "confusable_group": confusable_group, **extras}
class TestDeduplicateConfusableExamples:
    """Tests for _deduplicate_confusable_examples().

    Each test builds a tiny words dict with _make_entry(), runs the
    deduplication in place, and checks both the returned count of cleared
    entries and the resulting ``examples`` data.
    """

    def test_shared_examples_kept_on_higher_frequency(self):
        """When two confusables share identical examples, the one with
        lower frequency_rank (more common) keeps them."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("brother", group, ["sent1", "sent2"], frequency_rank=500),
            "key_b": _make_entry("fireplace", group, ["sent1", "sent2"], frequency_rank=8000),
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 1
        assert len(words["key_a"]["examples"]["vetted"]) == 2
        assert words["key_b"]["examples"]["vetted"] == []

    def test_no_action_when_examples_differ(self):
        """Groups with different example sets are left untouched."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("meaning1", group, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("meaning2", group, ["sent2"], frequency_rank=200),
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 0
        assert len(words["key_a"]["examples"]["vetted"]) == 1
        assert len(words["key_b"]["examples"]["vetted"]) == 1

    def test_no_action_when_one_has_no_examples(self):
        """If only one member has examples, nothing to deduplicate."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("meaning1", group, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("meaning2", group, frequency_rank=200),
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 0
        # The data itself must be untouched, not just the counter.
        assert len(words["key_a"]["examples"]["vetted"]) == 1
        assert "examples" not in words["key_b"]

    def test_no_frequency_uses_alphabetical_tiebreak(self):
        """When no member has frequency data, first alphabetically wins."""
        group = ["alpha_key", "beta_key"]
        words = {
            "alpha_key": _make_entry("meaning1", group, ["sent1"]),
            "beta_key": _make_entry("meaning2", group, ["sent1"]),
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 1
        assert len(words["alpha_key"]["examples"]["vetted"]) == 1
        assert words["beta_key"]["examples"]["vetted"] == []

    def test_three_way_group(self):
        """Three-member group: highest frequency wins, other two cleared."""
        group = ["key_a", "key_b", "key_c"]
        words = {
            "key_a": _make_entry("yes", group, ["sent1", "sent2"], frequency_rank=50),
            "key_b": _make_entry("honest", group, ["sent1", "sent2"], frequency_rank=3000),
            "key_c": _make_entry("pedestal", group, ["sent1", "sent2"], frequency_rank=15000),
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 2
        assert len(words["key_a"]["examples"]["vetted"]) == 2
        assert words["key_b"]["examples"]["vetted"] == []
        assert words["key_c"]["examples"]["vetted"] == []

    def test_cloze_removed_from_losers(self):
        """Losing entries lose their cloze data; the winner keeps its own."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("common", group, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("rare", group, ["sent1"], frequency_rank=9000),
        }
        # Add cloze to both, so the test can tell "removed from the loser"
        # apart from "removed from everyone".
        words["key_a"]["examples"]["cloze"] = {"text": "sent1", "cloze_guid": "winner"}
        words["key_b"]["examples"]["cloze"] = {"text": "sent1", "cloze_guid": "abc"}
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 1
        assert "cloze" not in words["key_b"]["examples"]
        assert words["key_a"]["examples"]["cloze"] == {"text": "sent1", "cloze_guid": "winner"}
        assert len(words["key_a"]["examples"]["vetted"]) == 1

    def test_no_confusable_groups_returns_zero(self):
        """Words without confusable_group are ignored."""
        words = {
            "key_a": {"meaning": "word1", "examples": {"vetted": [{"text": "s1"}]}},
            "key_b": {"meaning": "word2", "examples": {"vetted": [{"text": "s1"}]}},
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 0

    def test_mixed_frequency_and_none(self):
        """Member with frequency beats member without."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("has_freq", group, ["sent1"], frequency_rank=5000),
            "key_b": _make_entry("no_freq", group, ["sent1"]),
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 1
        assert len(words["key_a"]["examples"]["vetted"]) == 1
        assert words["key_b"]["examples"]["vetted"] == []

    def test_partial_overlap_not_deduplicated(self):
        """Groups with overlapping but not identical sentence sets are not touched."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("m1", group, ["sent1", "sent2"], frequency_rank=100),
            "key_b": _make_entry("m2", group, ["sent1", "sent3"], frequency_rank=200),
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 0