Sprint 17: homograph example dedup + plural audio + prep extraction
- Homograph collision fix: _deduplicate_confusable_examples() clears shared examples from less-common confusable group members (36 entries fixed). Keeps examples only on highest-frequency meaning.
- Plural deck audio: wired up PluralAudio field in apkg_builder.py, downloaded 613 plural audio files from pealim.com for all deck entries.
- Prep extraction upstream: moved Hebrew preposition parsing from build time into list/detail scrapers (SCHEMA.yaml prep field added).
- Validation: new no_shared_confusable_examples check in validate_data.py
- Tests: 9 new unit tests for confusable deduplication (98 total)
- Release: v0.19

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
0d92451271
commit
af186e2030
9 changed files with 29782 additions and 14386 deletions
|
|
@ -27,6 +27,7 @@ entry:
|
|||
pos_hebrew: "שֵׁם עֶצֶם" # Part of speech in Hebrew (with nikkud)
|
||||
meaning: "father" # English meaning (cleaned — no inline emoji, no Hebrew prepositions)
|
||||
meaning_raw: "father 👨" # Original meaning as scraped (may contain emoji and/or Hebrew preps)
|
||||
prep: "על" # Hebrew preposition(s) governing this word, extracted from meaning_raw (e.g. "(על)" → "על"); null if none
|
||||
audio_url: "https://..." # Pealim audio URL
|
||||
audio_file: "6009-av.mp3" # Local filename (slug-based for confusables, consonant-based otherwise)
|
||||
tags: "" # Pealim tags if any
|
||||
|
|
|
|||
|
|
@ -35,7 +35,7 @@ COMPLETE_PLURAL_DECK_ID = 1_234_567_903
|
|||
|
||||
# Release version tag added to all notes so users can identify which release
|
||||
# their cards come from (visible in Anki's Browse view and card info).
|
||||
RELEASE_TAG = "v0.18"
|
||||
RELEASE_TAG = "v0.19"
|
||||
|
||||
# Regex for extracting emoji and Hebrew prepositions from meaning strings
|
||||
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
|
||||
|
|
@ -906,7 +906,12 @@ def build_vocab_deck(
|
|||
emoji_str = emoji_lookup[kw]
|
||||
break
|
||||
|
||||
# Extract Hebrew prepositions from meaning_raw
|
||||
# Extract Hebrew prepositions: prefer upstream-parsed prep field, fall back to meaning_raw scan
|
||||
# (fallback covers entries scraped before prep was moved upstream)
|
||||
entry_prep = entry.get("prep")
|
||||
if entry_prep:
|
||||
prep_str = " ".join(f"({p})" for p in entry_prep.split())
|
||||
else:
|
||||
preps = HBPAREN_RE.findall(meaning_raw)
|
||||
prep_str = " ".join(f"({p})" for p in preps)
|
||||
|
||||
|
|
@ -1682,12 +1687,20 @@ def build_plural_deck(
|
|||
sg_audio = ""
|
||||
pl_audio = ""
|
||||
if include_audio:
|
||||
sg_tag = _audio_tag(singular_ktiv)
|
||||
slug = entry.get("slug", "")
|
||||
sg_tag = _audio_tag(singular_ktiv, slug=slug)
|
||||
if sg_tag:
|
||||
sg_audio = sg_tag
|
||||
mp3_path = AUDIO_DIR / sg_tag.removeprefix("[sound:").removesuffix("]")
|
||||
if mp3_path not in media_files:
|
||||
media_files.append(mp3_path)
|
||||
# Plural audio: {slug}_plural.mp3
|
||||
if slug:
|
||||
pl_mp3 = AUDIO_DIR / f"{slug}_plural.mp3"
|
||||
if pl_mp3.exists():
|
||||
pl_audio = f"[sound:{pl_mp3.name}]"
|
||||
if pl_mp3 not in media_files:
|
||||
media_files.append(pl_mp3)
|
||||
|
||||
mishkal_eng = noun_inflection.get("mishkal") or ""
|
||||
tags = [RELEASE_TAG]
|
||||
|
|
|
|||
43857
data/words.json
43857
data/words.json
File diff suppressed because it is too large
Load diff
|
|
@ -719,9 +719,87 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
|
|||
examples["rejected_count"] = 0
|
||||
updated += 1
|
||||
|
||||
# Deduplicate shared examples across confusable groups
|
||||
cleared = _deduplicate_confusable_examples(words)
|
||||
if cleared:
|
||||
logger.info(f" Cleared shared examples from {cleared} confusable entries")
|
||||
|
||||
return updated
|
||||
|
||||
|
||||
def _deduplicate_confusable_examples(words: dict) -> int:
    """Remove shared examples from less-common confusable group members.

    After example matching assigns sentences, confusable entries often share
    identical examples (matched via shared nikkud forms). This function keeps
    examples only on the highest-frequency member, clearing others.

    Only members whose *sets* of vetted sentence texts are identical and
    non-empty are treated as sharing examples; partially overlapping sets
    are left untouched.

    Args:
        words: The full words.json dict, modified in place (examples already
            assigned).

    Returns:
        Count of entries whose examples were cleared.
    """
    from collections import defaultdict

    def _priority(key: str) -> tuple[bool, int, str]:
        # Lower frequency_rank = more common = winner. Entries without a rank
        # always sort after ranked ones, however large the rank is (no magic
        # sentinel that a real rank could exceed). Ties break alphabetically
        # by unique_key.
        rank = words[key].get("frequency_rank")
        return (rank is None, rank if rank is not None else 0, key)

    # Build confusable group map: group_id -> [unique_key, ...]
    group_map: dict[tuple[str, ...], list[str]] = defaultdict(list)
    for key, entry in words.items():
        cg = entry.get("confusable_group")
        if cg:
            group_map[tuple(sorted(cg))].append(key)

    cleared = 0

    for members in group_map.values():
        if len(members) < 2:
            continue

        # Bucket members by their set of vetted sentence texts; members with
        # no examples cannot collide with anything and are skipped.
        text_groups: dict[frozenset[str], list[str]] = defaultdict(list)
        for key in members:
            vetted = (words[key].get("examples") or {}).get("vetted") or []
            texts = frozenset(e.get("text", "") for e in vetted)
            if texts:
                text_groups[texts].append(key)

        # For each bucket of members sharing identical examples, keep the
        # examples only on the highest-priority member.
        for sharing_keys in text_groups.values():
            if len(sharing_keys) < 2:
                continue

            winner, *losers = sorted(sharing_keys, key=_priority)

            for loser_key in losers:
                entry = words[loser_key]
                examples = entry.get("examples") or {}
                examples["vetted"] = []
                # The cloze is derived from the cleared sentences, so drop it too.
                examples.pop("cloze", None)
                entry["examples"] = examples
                cleared += 1
                logger.debug(f" Cleared examples from {loser_key} (kept on {winner})")

    return cleared
|
||||
|
||||
|
||||
# ── Public API ───────────────────────────────────────────────────
|
||||
|
||||
|
||||
|
|
|
|||
|
|
@ -40,6 +40,9 @@ SAVE_INTERVAL = 50 # write words.json every N processed entries
|
|||
|
||||
WORDS_JSON = Path(__file__).parent / "data" / "words.json"
|
||||
|
||||
# Regex for Hebrew prepositions wrapped in parentheses, e.g. "(על)" or "(ב-)"
|
||||
HBPAREN_RE = re.compile(r"\(([\u05b0-\u05ea\u05f0-\u05f4\-]+)\)")
|
||||
|
||||
BINYAN_NAMES: tuple[str, ...] = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al")
|
||||
_BINYAN_NAMES_LOWER: tuple[str, ...] = tuple(b.lower() for b in BINYAN_NAMES)
|
||||
|
||||
|
|
@ -948,9 +951,17 @@ def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: di
|
|||
binyan = _extract_binyan_from_page(mo_soup)
|
||||
|
||||
meaning = ""
|
||||
prep: str | None = None
|
||||
lead_div = mo_soup.find("div", class_="lead")
|
||||
if lead_div:
|
||||
meaning = lead_div.get_text(strip=True)
|
||||
# Extract preposition(s) from the lead text, e.g. "(על)" → "על"
|
||||
prep_matches = HBPAREN_RE.findall(meaning)
|
||||
if prep_matches:
|
||||
prep = " ".join(prep_matches)
|
||||
# Fall back to any prep already stored (e.g. from a previous manual edit)
|
||||
if prep is None:
|
||||
prep = existing.get("prep")
|
||||
|
||||
# Parse active forms
|
||||
mo_active = _parse_conjugation_table(mo_soup, passive=False)
|
||||
|
|
@ -1002,7 +1013,7 @@ def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: di
|
|||
"binyan": binyan,
|
||||
"binyan_hebrew": BINYAN_HEBREW.get(binyan, ""),
|
||||
"meaning": meaning,
|
||||
"prep": existing.get("prep"),
|
||||
"prep": prep,
|
||||
"active_forms": active_forms,
|
||||
"hufal_pual_forms": hufal_pual_forms,
|
||||
"reference_form_passive": reference_form_passive,
|
||||
|
|
|
|||
|
|
@ -86,6 +86,9 @@ EMOJI_RE = re.compile(
|
|||
re.UNICODE,
|
||||
)
|
||||
|
||||
# Regex for extracting Hebrew prepositions wrapped in parentheses, e.g. "(על)" or "(ב-)"
|
||||
HBPAREN_RE = re.compile(r"\(([\u05b0-\u05ea\u05f0-\u05f4\-]+)\)")
|
||||
|
||||
# Fields that must never be overwritten when updating an existing entry
|
||||
PROTECTED_FIELDS = frozenset(
|
||||
[
|
||||
|
|
@ -149,6 +152,7 @@ def _default_entry() -> dict:
|
|||
"image": None,
|
||||
"image_source": None,
|
||||
"hint": "",
|
||||
"prep": None,
|
||||
"shared_roots": [],
|
||||
"confusable_group": None,
|
||||
"confusables_guid": None,
|
||||
|
|
@ -170,8 +174,9 @@ def _extract_emoji(text: str) -> str | None:
|
|||
|
||||
|
||||
def _clean_meaning(raw: str) -> str:
    """Strip emoji, Hebrew parenthesized prepositions, and extra whitespace from a raw meaning string.

    Args:
        raw: The meaning text as scraped.

    Returns:
        ``raw`` with every ``EMOJI_RE`` and ``HBPAREN_RE`` match removed and
        all whitespace runs collapsed to single spaces (leading/trailing
        whitespace dropped).
    """
    cleaned = EMOJI_RE.sub("", raw)
    cleaned = HBPAREN_RE.sub("", cleaned)
    # split()/join also closes the gap left behind by a removed match.
    return " ".join(cleaned.split())
|
||||
|
||||
|
||||
|
|
@ -453,6 +458,9 @@ def _merge_row(
|
|||
emoji = _extract_emoji(meaning_raw_raw)
|
||||
tags = _build_tags(pos_en, root)
|
||||
audio_file = _compute_audio_file(slug, ktiv_male)
|
||||
# Extract Hebrew preposition(s) from the raw meaning (e.g. "(על)" → "על")
|
||||
prep_matches = HBPAREN_RE.findall(meaning_raw)
|
||||
prep: str | None = " ".join(prep_matches) if prep_matches else None
|
||||
|
||||
# ---- locate existing entry ----
|
||||
unique_key: str | None = slug_index.get(slug) if slug else None
|
||||
|
|
@ -468,6 +476,7 @@ def _merge_row(
|
|||
entry["pos_hebrew"] = pos_heb
|
||||
entry["meaning"] = meaning
|
||||
entry["meaning_raw"] = meaning_raw
|
||||
entry["prep"] = prep
|
||||
entry["audio_url"] = audio_url
|
||||
entry["audio_file"] = audio_file
|
||||
entry["tags"] = tags
|
||||
|
|
@ -484,6 +493,7 @@ def _merge_row(
|
|||
entry["pos_hebrew"] = pos_heb
|
||||
entry["meaning"] = meaning
|
||||
entry["meaning_raw"] = meaning_raw
|
||||
entry["prep"] = prep
|
||||
entry["emoji"] = emoji
|
||||
entry["emoji_source"] = "from_pealim" if emoji else None
|
||||
entry["audio_url"] = audio_url
|
||||
|
|
|
|||
|
|
@ -20,8 +20,11 @@ from pathlib import Path
|
|||
|
||||
import requests
|
||||
|
||||
sys.path.insert(0, "/home/node/projects")
|
||||
import load_keeshare
|
||||
|
||||
REPO_API = "https://git.nevo.engineer/api/v1/repos/nevo/hebrew_flash_cards"
|
||||
FORGEJO_TOKEN = "f023bd4cfd4b77aac584647f2fa8481df3906578"
|
||||
FORGEJO_TOKEN: str = load_keeshare.get_entry("git.nevo.engineer")["API_TOKEN"]
|
||||
OUTPUT_DIR = Path(__file__).parent / "output"
|
||||
|
||||
# All deck variants to include in release
|
||||
|
|
|
|||
|
|
@ -685,6 +685,57 @@ def test_no_stripped_form_sentence_collisions(data: dict[str, Any]) -> None:
|
|||
_pass(name)
|
||||
|
||||
|
||||
def test_no_shared_confusable_examples(data: dict[str, Any]) -> None:
    """Check that confusable-group members never carry identical vetted example sets.

    Two members of the same confusable group holding exactly the same set of
    vetted sentence texts means the deduplication step in epub_examples.py
    did not leave the examples on a single member only. Members without
    vetted examples are ignored.

    Args:
        data: Mapping of unique key to word entry.
    """
    from collections import defaultdict

    name = "no_shared_confusable_examples"
    errors: list[str] = []

    # Group entry keys by their (order-normalised) confusable group.
    groups: dict[tuple[str, ...], list[str]] = defaultdict(list)
    for word_key, word in data.items():
        confusables = word.get("confusable_group")
        if not confusables:
            continue
        groups[tuple(sorted(confusables))].append(word_key)

    for members in groups.values():
        if len(members) < 2:
            continue

        # Remember the first member seen for each distinct sentence set;
        # any later member with the same set is reported against it.
        first_owner: dict[frozenset[str], str] = {}
        for word_key in members:
            vetted = (data[word_key].get("examples") or {}).get("vetted") or []
            texts = frozenset(item.get("text", "") for item in vetted)
            if not texts:
                continue

            owner = first_owner.get(texts)
            if owner is None:
                first_owner[texts] = word_key
                continue

            meaning_a = (data[owner].get("meaning") or "")[:30]
            meaning_b = (data[word_key].get("meaning") or "")[:30]
            errors.append(
                f"{owner} ({meaning_a}) and {word_key} ({meaning_b}) share {len(texts)} identical example(s)"
            )

    if not errors:
        _pass(name)
        return

    # Cap the report at 20 lines unless verbose output was requested.
    shown = errors[:20] if not _verbose else errors
    _fail(name, shown)
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_no_hebrew_in_meaning(data: dict[str, Any]) -> None:
|
||||
"""English meanings must not contain bare Hebrew text (spoils the card)."""
|
||||
name = "no_hebrew_in_meaning"
|
||||
|
|
@ -801,6 +852,7 @@ ALL_TESTS: dict[str, Any] = {
|
|||
"conjugation_form_guids": test_conjugation_form_guids,
|
||||
"conjugation_person_codes": test_conjugation_person_codes,
|
||||
"no_stripped_form_sentence_collisions": test_no_stripped_form_sentence_collisions,
|
||||
"no_shared_confusable_examples": test_no_shared_confusable_examples,
|
||||
"no_hebrew_in_meaning": test_no_hebrew_in_meaning,
|
||||
"mishkal_consistency": test_mishkal_consistency,
|
||||
}
|
||||
|
|
|
|||
127
tests/test_epub_examples.py
Normal file
127
tests/test_epub_examples.py
Normal file
|
|
@ -0,0 +1,127 @@
|
|||
"""Tests for epub_examples deduplication of confusable group examples."""
|
||||
|
||||
from epub_examples import _deduplicate_confusable_examples
|
||||
|
||||
|
||||
def _make_entry(meaning, confusable_group, vetted_texts=None, frequency_rank=None):
    """Create the smallest words.json-style entry the dedup tests need.

    ``examples`` is only present when ``vetted_texts`` is given (an empty
    list still produces an empty ``vetted`` list), and ``frequency_rank`` is
    only present when a rank is given.
    """
    entry = dict(meaning=meaning, confusable_group=confusable_group)

    if vetted_texts is not None:
        vetted = []
        for sentence in vetted_texts:
            vetted.append({"text": sentence, "source": "test", "match_method": "direct"})
        entry["examples"] = {"vetted": vetted}

    if frequency_rank is not None:
        entry.update(frequency_rank=frequency_rank)

    return entry
|
||||
|
||||
|
||||
class TestDeduplicateConfusableExamples:
    """Tests for _deduplicate_confusable_examples().

    Every test builds a tiny ``words`` dict with ``_make_entry`` and checks
    both the returned cleared-count and the in-place mutation of ``words``.
    """

    def test_shared_examples_kept_on_higher_frequency(self):
        """When two confusables share identical examples, the one with
        lower frequency_rank (more common) keeps them."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("brother", group, ["sent1", "sent2"], frequency_rank=500),
            "key_b": _make_entry("fireplace", group, ["sent1", "sent2"], frequency_rank=8000),
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 1
        assert len(words["key_a"]["examples"]["vetted"]) == 2
        assert words["key_b"]["examples"]["vetted"] == []

    def test_no_action_when_examples_differ(self):
        """Groups with different example sets are left untouched."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("meaning1", group, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("meaning2", group, ["sent2"], frequency_rank=200),
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 0
        assert len(words["key_a"]["examples"]["vetted"]) == 1
        assert len(words["key_b"]["examples"]["vetted"]) == 1

    def test_no_action_when_one_has_no_examples(self):
        """If only one member has examples, nothing to deduplicate."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("meaning1", group, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("meaning2", group, frequency_rank=200),
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 0
        # The only member with examples must keep them.
        assert len(words["key_a"]["examples"]["vetted"]) == 1

    def test_no_frequency_uses_alphabetical_tiebreak(self):
        """When no member has frequency data, first alphabetically wins."""
        group = ["alpha_key", "beta_key"]
        words = {
            "alpha_key": _make_entry("meaning1", group, ["sent1"]),
            "beta_key": _make_entry("meaning2", group, ["sent1"]),
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 1
        assert len(words["alpha_key"]["examples"]["vetted"]) == 1
        assert words["beta_key"]["examples"]["vetted"] == []

    def test_three_way_group(self):
        """Three-member group: highest frequency wins, other two cleared."""
        group = ["key_a", "key_b", "key_c"]
        words = {
            "key_a": _make_entry("yes", group, ["sent1", "sent2"], frequency_rank=50),
            "key_b": _make_entry("honest", group, ["sent1", "sent2"], frequency_rank=3000),
            "key_c": _make_entry("pedestal", group, ["sent1", "sent2"], frequency_rank=15000),
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 2
        assert len(words["key_a"]["examples"]["vetted"]) == 2
        assert words["key_b"]["examples"]["vetted"] == []
        assert words["key_c"]["examples"]["vetted"] == []

    def test_cloze_removed_from_losers(self):
        """Losing entries lose their cloze data; the winner keeps its own."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("common", group, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("rare", group, ["sent1"], frequency_rank=9000),
        }
        # Add cloze to both so the test can tell "removed from the loser"
        # apart from "removed from everyone".
        words["key_a"]["examples"]["cloze"] = {"text": "sent1", "cloze_guid": "abc"}
        words["key_b"]["examples"]["cloze"] = {"text": "sent1", "cloze_guid": "def"}
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 1
        assert "cloze" not in words["key_b"]["examples"]
        assert words["key_a"]["examples"]["cloze"] == {"text": "sent1", "cloze_guid": "abc"}

    def test_no_confusable_groups_returns_zero(self):
        """Words without confusable_group are ignored."""
        words = {
            "key_a": {"meaning": "word1", "examples": {"vetted": [{"text": "s1"}]}},
            "key_b": {"meaning": "word2", "examples": {"vetted": [{"text": "s1"}]}},
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 0
        # Identical examples outside any confusable group must survive.
        assert words["key_a"]["examples"]["vetted"] == [{"text": "s1"}]
        assert words["key_b"]["examples"]["vetted"] == [{"text": "s1"}]

    def test_mixed_frequency_and_none(self):
        """Member with frequency beats member without."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("has_freq", group, ["sent1"], frequency_rank=5000),
            "key_b": _make_entry("no_freq", group, ["sent1"]),
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 1
        assert len(words["key_a"]["examples"]["vetted"]) == 1
        assert words["key_b"]["examples"]["vetted"] == []

    def test_partial_overlap_not_deduplicated(self):
        """Groups with overlapping but not identical sentence sets are not touched."""
        group = ["key_a", "key_b"]
        words = {
            "key_a": _make_entry("m1", group, ["sent1", "sent2"], frequency_rank=100),
            "key_b": _make_entry("m2", group, ["sent1", "sent3"], frequency_rank=200),
        }
        cleared = _deduplicate_confusable_examples(words)
        assert cleared == 0
|
||||
Loading…
Reference in a new issue