feat: emoji Unicode lookup, conj nikkud, fix summary metric
- Emoji: _load_emoji_lookup() fetches unicode.org emoji-test.txt, builds
{keyword: emoji_char} map cached in data/emoji_lookup.json. Falls back
to empty dict on network failure. build_all_variants() loads once and
passes to all build_vocab_deck() calls. For each word without a pealim
emoji, tries the first 5 words of the English meaning against the lookup
(stop-words and words shorter than 3 letters are skipped).
- Nikkud: זכר→זָכָר, נקבה→נְקֵבָה in PRESENT_EXPANSION constants and
build_conj_deck() 1st-person gender labels.
- Summary: conj audio file count now excludes _infinitive and _passive_
on-disk extras never bundled in .apkg (was 2235, now shows ~1765).
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
3fc3a21a33
commit
607fd1a3bc
2 changed files with 82 additions and 16 deletions
|
|
@ -320,24 +320,24 @@ CONJ_MODEL = genanki.Model(
|
|||
# Present-tense expansion: each form key → list of (pronoun, tense_label)
|
||||
PRESENT_EXPANSION = {
|
||||
"present_ms": [
|
||||
("אֲנִי (זכר)", "הוֹוֶה"),
|
||||
("אַתָּה", "הוֹוֶה"),
|
||||
("הוּא", "הוֹוֶה"),
|
||||
("אֲנִי (זָכָר)", "הוֹוֶה"),
|
||||
("אַתָּה", "הוֹוֶה"),
|
||||
("הוּא", "הוֹוֶה"),
|
||||
],
|
||||
"present_fs": [
|
||||
("אֲנִי (נקבה)", "הוֹוֶה"),
|
||||
("אַתְּ", "הוֹוֶה"),
|
||||
("הִיא", "הוֹוֶה"),
|
||||
("אֲנִי (נְקֵבָה)", "הוֹוֶה"),
|
||||
("אַתְּ", "הוֹוֶה"),
|
||||
("הִיא", "הוֹוֶה"),
|
||||
],
|
||||
"present_mp": [
|
||||
("אֲנַחְנוּ (זכר)", "הוֹוֶה"),
|
||||
("אַתֶּם", "הוֹוֶה"),
|
||||
("הֵם", "הוֹוֶה"),
|
||||
("אֲנַחְנוּ (זָכָר)", "הוֹוֶה"),
|
||||
("אַתֶּם", "הוֹוֶה"),
|
||||
("הֵם", "הוֹוֶה"),
|
||||
],
|
||||
"present_fp": [
|
||||
("אֲנַחְנוּ (נקבה)", "הוֹוֶה"),
|
||||
("אַתֶּן", "הוֹוֶה"),
|
||||
("הֵן", "הוֹוֶה"),
|
||||
("אֲנַחְנוּ (נְקֵבָה)", "הוֹוֶה"),
|
||||
("אַתֶּן", "הוֹוֶה"),
|
||||
("הֵן", "הוֹוֶה"),
|
||||
],
|
||||
}
|
||||
|
||||
|
|
@ -392,6 +392,56 @@ def _conj_audio_tag(slug: str, form_key: str) -> str:
|
|||
return ""
|
||||
|
||||
|
||||
# Stop-words skipped both when indexing emoji names and when matching a word's English meaning
|
||||
_EMOJI_STOP = frozenset({
|
||||
"to", "be", "a", "an", "the", "of", "in", "on", "at", "for", "and",
|
||||
"with", "by", "or", "but", "not", "as", "its", "face", "hand", "sign",
|
||||
"symbol", "button", "small", "large", "light", "dark", "open", "closed",
|
||||
})
|
||||
|
||||
|
||||
def _load_emoji_lookup() -> dict[str, str]:
    """Load or fetch the Unicode emoji keyword→character lookup.

    Parses unicode.org ``emoji-test.txt`` into a ``{keyword: emoji_char}``
    mapping. Only fully-qualified emoji are considered, and for each keyword
    the first emoji whose name contains it wins. The result is cached as
    ``emoji_lookup.json`` under ``DATA_DIR``.

    Returns:
        The keyword→emoji mapping, or an empty dict when the data cannot be
        fetched or yields no keywords (safe fallback: lookup is disabled).
    """
    cache_file = DATA_DIR / "emoji_lookup.json"
    if cache_file.exists():
        try:
            # Explicit UTF-8: the cache stores raw emoji (ensure_ascii=False),
            # which the locale default encoding may be unable to decode.
            with open(cache_file, encoding="utf-8") as f:
                return json.load(f)
        except (OSError, ValueError) as e:
            # A truncated or corrupt cache should not abort the build;
            # fall through and rebuild it from the network.
            logger.warning(f"Ignoring unreadable emoji cache {cache_file}: {e}")

    import requests

    try:
        resp = requests.get(
            "https://unicode.org/Public/emoji/latest/emoji-test.txt",
            timeout=30,
        )
        resp.raise_for_status()
    except Exception as e:  # deliberate best-effort: emoji are optional
        logger.warning(f"Could not fetch emoji data: {e}. Emoji lookup disabled.")
        return {}

    # The data file is UTF-8; without an explicit charset header requests
    # would decode text/* as ISO-8859-1 and garble every emoji.
    resp.encoding = "utf-8"

    entry_re = re.compile(r"#\s+(\S+)\s+E[\d.]+\s+(.+)")
    lookup: dict[str, str] = {}
    for line in resp.text.splitlines():
        if "fully-qualified" not in line:
            continue
        m = entry_re.search(line)
        if not m:
            continue
        emoji_char = m.group(1)
        desc = m.group(2).lower().strip()
        for word in desc.split():
            # Also strip ':' ("flag:", "keycap:"): callers remove punctuation
            # from their keywords, so such keys could never be matched.
            word = word.strip(".,:;'\"-")
            if len(word) > 2 and word not in _EMOJI_STOP:
                # First emoji mentioning the keyword wins.
                lookup.setdefault(word, emoji_char)

    if not lookup:
        # Do not cache an empty result (e.g. an unexpected response body):
        # it would be returned forever and silently disable the lookup.
        logger.warning("Emoji data contained no usable keywords. Emoji lookup disabled.")
        return {}

    try:
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        cache_file.write_text(
            json.dumps(lookup, ensure_ascii=False), encoding="utf-8"
        )
    except OSError as e:
        # The lookup itself is still usable for this run.
        logger.warning(f"Could not write emoji cache {cache_file}: {e}")
    else:
        logger.info(f"Built emoji lookup: {len(lookup)} keywords → {cache_file}")
    return lookup
|
||||
|
||||
|
||||
def _translate_pos(pos_str: str) -> str:
|
||||
"""Translate PoS string to Hebrew. Handles 'Verb – Pi'el' style."""
|
||||
for eng, heb in POS_TO_HEBREW.items():
|
||||
|
|
@ -413,6 +463,7 @@ def build_vocab_deck(
|
|||
examples_cache: Optional[dict] = None,
|
||||
freq_cache: Optional[dict] = None,
|
||||
image_cache: Optional[dict] = None,
|
||||
emoji_lookup: Optional[dict] = None,
|
||||
limit: Optional[int] = None,
|
||||
include_audio: bool = True,
|
||||
include_images: bool = True,
|
||||
|
|
@ -498,10 +549,17 @@ def build_vocab_deck(
|
|||
continue
|
||||
seen_words.add(word)
|
||||
|
||||
# Extract emoji from meaning
|
||||
# Extract emoji from meaning (pealim embeds emoji in meaning text)
|
||||
emoji_str = ''.join(EMOJI_RE.findall(meaning))
|
||||
meaning_clean = EMOJI_RE.sub('', meaning).strip()
|
||||
|
||||
# Fallback: look up emoji from Unicode standard by English keyword
|
||||
if not emoji_str and emoji_lookup:
|
||||
for kw in re.sub(r"[^\w\s]", " ", meaning_clean.lower()).split()[:5]:
|
||||
if len(kw) > 2 and kw not in _EMOJI_STOP and kw in emoji_lookup:
|
||||
emoji_str = emoji_lookup[kw]
|
||||
break
|
||||
|
||||
# Extract Hebrew parentheticals (prepositions) from meaning
|
||||
preps = HBPAREN_RE.findall(meaning_clean)
|
||||
prep_str = ' '.join(f'({p})' for p in preps)
|
||||
|
|
@ -706,7 +764,7 @@ def build_conj_deck(
|
|||
|
||||
# 1st-person forms get a randomly assigned gender label (deterministic per verb)
|
||||
if form_key in {"past_1s", "past_1p", "future_1s", "future_1p"}:
|
||||
gender = verb_rng.choice(["זכר", "נקבה"])
|
||||
gender = verb_rng.choice(["זָכָר", "נְקֵבָה"])
|
||||
pronoun = f"{pronoun} ({gender})"
|
||||
|
||||
add_note(pronoun, tense, conj_form, audio_tag)
|
||||
|
|
@ -775,6 +833,9 @@ def build_all_variants(
|
|||
"""Build all 6 release variants (4 vocab + 2 conj) into output/."""
|
||||
logger.info("Building all release variants …")
|
||||
|
||||
emoji_lookup = _load_emoji_lookup()
|
||||
logger.info(f" Emoji lookup: {len(emoji_lookup)} keywords loaded")
|
||||
|
||||
vocab_variants = [
|
||||
(False, False, VOCAB_APKG),
|
||||
(True, False, VOCAB_APKG_AUDIO),
|
||||
|
|
@ -789,6 +850,7 @@ def build_all_variants(
|
|||
examples_cache=examples_cache,
|
||||
freq_cache=freq_cache,
|
||||
image_cache=image_cache or {},
|
||||
emoji_lookup=emoji_lookup,
|
||||
limit=limit,
|
||||
include_audio=audio,
|
||||
include_images=images,
|
||||
|
|
|
|||
8
run.py
8
run.py
|
|
@ -453,8 +453,12 @@ def print_summary(args, examples_cache, freq_cache, conjugations):
|
|||
logger.info(f" Vocabulary audio files: {len(mp3s)}")
|
||||
|
||||
if AUDIO_CONJ_DIR.exists():
|
||||
mp3s = list(AUDIO_CONJ_DIR.glob("*.mp3"))
|
||||
logger.info(f" Conjugation audio files: {len(mp3s)}")
|
||||
# Count only files that will be bundled: active non-infinitive forms
|
||||
# (excludes {slug}_passive_* and {slug}_infinitive.mp3 on-disk extras)
|
||||
mp3s = [p for p in AUDIO_CONJ_DIR.glob("*.mp3")
|
||||
if not p.stem.endswith("_infinitive")
|
||||
and "_passive_" not in p.stem]
|
||||
logger.info(f" Conjugation audio files (bundled): {len(mp3s)}")
|
||||
|
||||
image_cache_path = DATA_DIR / "image_cache.json"
|
||||
if image_cache_path.exists():
|
||||
|
|
|
|||
Loading…
Reference in a new issue