Sprint 17: homograph example dedup + plural audio + prep extraction
- Homograph collision fix: _deduplicate_confusable_examples() clears shared examples from less-common confusable group members (36 entries fixed). Keeps examples only on the highest-frequency meaning.
- Plural deck audio: wired up the PluralAudio field in apkg_builder.py and downloaded 613 plural audio files from pealim.com for all deck entries.
- Prep extraction upstream: moved Hebrew preposition parsing from build time into the list/detail scrapers (SCHEMA.yaml prep field added).
- Validation: new no_shared_confusable_examples check in validate_data.py.
- Tests: 9 new unit tests for confusable deduplication (98 total).
- Release: v0.19.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
0d92451271
commit
af186e2030
9 changed files with 29782 additions and 14386 deletions
|
|
@ -27,6 +27,7 @@ entry:
|
|||
pos_hebrew: "שֵׁם עֶצֶם" # Part of speech in Hebrew (with nikkud)
|
||||
meaning: "father" # English meaning (cleaned — no inline emoji, no Hebrew prepositions)
|
||||
meaning_raw: "father 👨" # Original meaning as scraped (may contain emoji and/or Hebrew preps)
|
||||
prep: "על" # Hebrew preposition(s) governing this word, extracted from meaning_raw (e.g. "(על)" → "על"); null if none
|
||||
audio_url: "https://..." # Pealim audio URL
|
||||
audio_file: "6009-av.mp3" # Local filename (slug-based for confusables, consonant-based otherwise)
|
||||
tags: "" # Pealim tags if any
|
||||
|
|
|
|||
|
|
@ -35,7 +35,7 @@ COMPLETE_PLURAL_DECK_ID = 1_234_567_903
|
|||
|
||||
# Release version tag added to all notes so users can identify which release
|
||||
# their cards come from (visible in Anki's Browse view and card info).
|
||||
RELEASE_TAG = "v0.18"
|
||||
RELEASE_TAG = "v0.19"
|
||||
|
||||
# Regex for extracting emoji and Hebrew prepositions from meaning strings
|
||||
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
|
||||
|
|
@ -906,9 +906,14 @@ def build_vocab_deck(
|
|||
emoji_str = emoji_lookup[kw]
|
||||
break
|
||||
|
||||
# Extract Hebrew prepositions from meaning_raw
|
||||
preps = HBPAREN_RE.findall(meaning_raw)
|
||||
prep_str = " ".join(f"({p})" for p in preps)
|
||||
# Extract Hebrew prepositions: prefer upstream-parsed prep field, fall back to meaning_raw scan
|
||||
# (fallback covers entries scraped before prep was moved upstream)
|
||||
entry_prep = entry.get("prep")
|
||||
if entry_prep:
|
||||
prep_str = " ".join(f"({p})" for p in entry_prep.split())
|
||||
else:
|
||||
preps = HBPAREN_RE.findall(meaning_raw)
|
||||
prep_str = " ".join(f"({p})" for p in preps)
|
||||
|
||||
# Audio — use audio_file from entry; for confusables it's already slug-based
|
||||
|
|
|
|||
Loading…
Reference in a new issue