fix: vet fallback emoji — verb gate + expanded stop list removes 852 bad matches
The fallback emoji system (keyword→Unicode char matching at build time) was producing 1,733 matches, many with wrong-sense emoji:

- "high, tall" → ⚡ (from "high voltage")
- "to cut" → 🥩 (cut of meat)
- "city" → 🇻🇦 (Vatican flag)

Two fixes:

1. Skip fallback for verbs (meanings starting "to ") — 476 removed
2. Expand _EMOJI_STOP with 100+ polysemous/abstract keywords — 376 more

Result: 1,733 → 881 fallback matches (49% reduction). The 114 from_pealim emojis (concrete nouns like 🍎 apple, 🐪 camel) are unaffected.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
5f617af4ba
commit
f978e5f39a
1 changed file with 114 additions and 2 deletions
116
apkg_builder.py
116
apkg_builder.py
|
|
@ -720,6 +720,116 @@ _EMOJI_STOP = frozenset(
|
|||
"bar",
|
||||
"wheel",
|
||||
"horizontal",
|
||||
# Polysemous keywords producing wrong-sense emoji (Sprint 17 audit)
|
||||
"high", # ⚡ high voltage, not "tall"
|
||||
"down", # 🫳 palm down, not "descend"
|
||||
"off", # 📴 phone off, not "remove"
|
||||
"away", # 💨 dashing away, not "depart"
|
||||
"together", # 🤲 palms together, not "unite"
|
||||
"top", # 🎩 top hat, not "upper"
|
||||
"low", # 🔈 low volume, not "short"
|
||||
"flat", # 🥿 ballet flat, not "apartment"
|
||||
"soft", # 🍦 soft serve, not "quiet"
|
||||
"broken", # 💔 broken heart, not "damaged"
|
||||
"round", # 📍 round pushpin, not "circular"
|
||||
"cool", # 🆒 COOL button, not "cold"
|
||||
"free", # 🆓 FREE button, not "liberated"
|
||||
"long", # 🪘 long drum, not "lengthy"
|
||||
"straight", # 📏 straight ruler, not "direct"
|
||||
"empty", # 🪹 empty nest, not "void"
|
||||
"hot", # 🥵 hot face, not "warm"
|
||||
"cross", # ✝️ latin cross, not "intersect"
|
||||
"bright", # 🔆 bright button, not "luminous"
|
||||
"old", # 👴 old man, not "aged"
|
||||
"head", # 🙂‍↔️ shaking head, not "leader"
|
||||
# Category words that match generic emoji
|
||||
"military", # 🎖️ military medal for any military term
|
||||
"sports", # 🏅 sports medal for any sports term
|
||||
"food", # 😋 yummy face for any food term
|
||||
"city", # 🇻🇦 Vatican flag for any city
|
||||
"china", # 🇨🇳 China flag for "porcelain"
|
||||
"polish", # 💅 nail polish for "to polish/shine"
|
||||
"aid", # 🦻 hearing aid for "to help"
|
||||
"office", # 🧑‍💼 office worker for "bureau"
|
||||
"construction", # 🏛️ classical building, not construction
|
||||
"cinema", # 🎦 cinema emoji for any film term
|
||||
"ceremony", # 🎑 moon ceremony for any ceremony
|
||||
"building", # 🏛️ classical building for any structure
|
||||
# Body parts / human features → wrong emoji
|
||||
"arm", # 🦾 mechanical arm for "to arm"
|
||||
"hair", # 👱 blond person for "hair"
|
||||
"nose", # 😤 steam from nose
|
||||
"tongue", # 😛 tongue-out face
|
||||
"chest", # not a chest
|
||||
"eyes", # 😃 face with eyes
|
||||
# Abstract/vague words
|
||||
"fear", # 😱 screaming face
|
||||
"anger", # 💢 anger symbol
|
||||
"angry", # 😠 angry face
|
||||
"tired", # 😫 tired face
|
||||
"sad", # 😥 sad face
|
||||
"joy", # 😂 tears of joy
|
||||
"love", # 💌 love letter
|
||||
"cold", # 🥶 cold face
|
||||
"pile", # 💩 pile of poo
|
||||
"man", # 👨 man
|
||||
"woman", # 👩 woman
|
||||
"boy", # 👦 boy
|
||||
"girl", # 👧 girl
|
||||
"baby", # 👶 baby
|
||||
"children", # 🚸 children crossing
|
||||
"student", # 🧑‍🎓 student
|
||||
"adult", # 🧑‍🧑‍🧒 family
|
||||
"name", # 📛 name badge
|
||||
"check", # ✅ check mark
|
||||
"line", # 🫥 dotted line face
|
||||
"floor", # 🤣 ROFL (rolling on floor)
|
||||
"room", # 🧖 person in steamy room
|
||||
"bubble", # 👁️‍🗨️ speech bubble
|
||||
"car", # 🚃 railway car, not automobile
|
||||
"bullet", # 🚅 bullet train
|
||||
"steam", # 😤 face with steam
|
||||
"fly", # 🪰 the insect, not the verb
|
||||
"plant", # 🪴 potted plant for all "X (plant)" entries
|
||||
"tree", # 🌲 evergreen for all "X (tree)" entries
|
||||
"ball", # ⛹️ person bouncing ball
|
||||
"bag", # 👝 clutch bag
|
||||
"fight", # not a fight
|
||||
"cloud", # not a cloud
|
||||
"video", # 🎮 video game, not video
|
||||
"rescue", # ⛑️ rescue worker helmet
|
||||
"exchange", # 💱 currency exchange
|
||||
"cut", # 🥩 cut of meat, not "to cut"
|
||||
"key", # 🔐 locked with key
|
||||
"walking", # 🚶 person walking
|
||||
"running", # 🏃 person running
|
||||
"climbing", # 🧗 person climbing
|
||||
"speaking", # 🗣️ speaking head
|
||||
"playing", # 🤽 person playing
|
||||
"feeding", # 👩‍🍼 person feeding
|
||||
"shooting", # 🌠 shooting star
|
||||
"clapping", # 👏 clapping hands
|
||||
"cooking", # 🍳 cooking emoji
|
||||
"holding", # 🥹 face holding back tears
|
||||
# More wrong-sense matches from remaining audit
|
||||
"paper", # 🏮 red lantern for "paper"
|
||||
"track", # 🛤️ railroad for "track record"
|
||||
"vertical", # 🚦 traffic light for "vertical"
|
||||
"speaker", # 🔇 muted speaker for "speaker (person)"
|
||||
"square", # 🟥 red square for "plaza"
|
||||
"wrapped", # 🎁 gift for "wrapped, bound"
|
||||
"volume", # 🔈 speaker for "volume (book)"
|
||||
"mobile", # 📱 phone for "mobile, moveable"
|
||||
"flash", # 📸 camera flash for "to shine"
|
||||
"identification", # 🪪 ID card for "locating"
|
||||
"service", # 🐕‍🦺 service dog for "service, term"
|
||||
"ground", # ⛱️ umbrella on ground
|
||||
"machine", # 🎰 slot machine for "mechanism"
|
||||
"liquid", # 🫗 pouring for "liquid, drop"
|
||||
"vehicle", # 🚙 SUV for any vehicle mention
|
||||
"window", # 🪟 window pane for "window, gap"
|
||||
"information", # ℹ️ info symbol
|
||||
"child", # 🧒 child emoji
|
||||
}
|
||||
)
|
||||
|
||||
|
|
@ -921,11 +1031,13 @@ def build_vocab_deck(
|
|||
else:
|
||||
freq_display = "Unlisted"
|
||||
|
||||
# Emoji: use entry's emoji if emoji_visible, else fall back to emoji_lookup
|
||||
# Emoji: use entry's emoji if emoji_visible, else fall back to emoji_lookup.
|
||||
# Skip fallback for verbs — keyword matching on verb definitions produces
|
||||
# wrong-sense emoji (e.g. "to cut" → 🥩, "to arm" → 🦾).
|
||||
emoji_str = ""
|
||||
if entry.get("emoji_visible") and entry.get("emoji"):
|
||||
emoji_str = entry["emoji"]
|
||||
elif not emoji_str and emoji_lookup:
|
||||
elif emoji_lookup and not meaning.startswith("to "):
|
||||
meaning_clean_for_emoji = EMOJI_RE.sub("", meaning).strip()
|
||||
for kw in re.sub(r"[^\w\s]", " ", meaning_clean_for_emoji.lower()).split()[:5]:
|
||||
if len(kw) > 2 and kw not in _EMOJI_STOP and kw in emoji_lookup:
|
||||
|
|
|
|||
Loading…
Reference in a new issue