feat: curated emoji denylist, vocab audio URLs in CSV
- Expanded _EMOJI_STOP from ~20 to ~80 keywords after manual review of all 2,261 emoji-word pairs. Removes false positives from polysemous words (french→🍟, water→🤽, rock→🪨, etc.)
- Emoji count: 2,261 → 1,820 (removed ~440 bad matches)
- hebrew_dict.csv now populated with audio_url from pealim.com scrape (8,727 words with audio URLs)
- Cached emoji_lookup.json (1,749 keywords from Unicode emoji-test.txt)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
607fd1a3bc
commit
419e952389
4 changed files with 18240 additions and 21220 deletions
|
|
@ -392,11 +392,35 @@ def _conj_audio_tag(slug: str, form_key: str) -> str:
|
|||
return ""
|
||||
|
||||
|
||||
# Stop-words excluded when matching emoji by meaning keyword
|
||||
# Keywords excluded when building emoji lookup AND matching meaning text.
|
||||
# Curated from manual review of all 2,261 emoji-word pairs (Sprint 8).
|
||||
_EMOJI_STOP = frozenset({
|
||||
# Basic stop words
|
||||
"to", "be", "a", "an", "the", "of", "in", "on", "at", "for", "and",
|
||||
"with", "by", "or", "but", "not", "as", "its", "face", "hand", "sign",
|
||||
"symbol", "button", "small", "large", "light", "dark", "open", "closed",
|
||||
"with", "by", "or", "but", "not", "as", "its",
|
||||
# Generic emoji description words (too vague)
|
||||
"face", "hand", "sign", "symbol", "button", "small", "large",
|
||||
"light", "dark", "open", "closed",
|
||||
# Numbers → clock emoji (🕐🕑🕒 etc.)
|
||||
"one", "two", "three", "four", "five", "six", "seven", "eight",
|
||||
"nine", "ten", "hundred", "thousand",
|
||||
# UI/media buttons (⏭️▶️⏪⏯️🔁🔚🔜⏺️)
|
||||
"next", "fast", "play", "pause", "repeat", "end", "soon", "record",
|
||||
# Abstract words → misleading object emoji
|
||||
"part", "place", "mark", "post", "department", "store", "note",
|
||||
"control", "level", "stop", "cover", "roll", "rolling", "pick",
|
||||
"over", "right", "way", "skin", "drop", "middle", "piece", "section",
|
||||
# Country/direction words → flag emoji (🇰🇵🇬🇸🇮🇴🇻🇦🇨🇫 etc.)
|
||||
"north", "south", "northern", "southern", "western", "eastern",
|
||||
"central", "territory", "kingdom", "united", "virgin",
|
||||
# Common words producing bad emoji matches
|
||||
"new", "big", "full", "last", "first", "double", "slightly",
|
||||
"without", "from", "behind", "people", "position", "status",
|
||||
"situation", "game", "call", "trade", "male", "female", "person",
|
||||
"letter",
|
||||
# Polysemous words → wrong emoji sense
|
||||
"french", "fried", "board", "bow", "water", "union", "rock",
|
||||
"left", "back", "crane", "dash", "bar", "wheel", "horizontal",
|
||||
})
|
||||
|
||||
|
||||
|
|
|
|||
1
data/emoji_lookup.json
Normal file
1
data/emoji_lookup.json
Normal file
File diff suppressed because one or more lines are too long
18212
data/hebrew_dict.csv
18212
data/hebrew_dict.csv
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
Loading…
Reference in a new issue