feat: emoji Unicode lookup, conj nikkud, fix summary metric

- Emoji: _load_emoji_lookup() fetches unicode.org emoji-test.txt, builds
  {keyword: emoji_char} map cached in data/emoji_lookup.json. Falls back
  to empty dict on network failure. build_all_variants() loads once and
  passes to all build_vocab_deck() calls. For each word without pealim
  emoji, tries first 5 keywords from English meaning against lookup.
- Nikkud: זכר→זָכָר, נקבה→נְקֵבָה in PRESENT_EXPANSION constants and
  build_conj_deck() 1st-person gender labels.
- Summary: conj audio file count now excludes _infinitive and _passive_
  on-disk extras never bundled in .apkg (was 2235, now shows ~1765).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Sochen 2026-03-05 21:24:10 +00:00
parent 3fc3a21a33
commit 607fd1a3bc
2 changed files with 82 additions and 16 deletions

View file

@ -320,24 +320,24 @@ CONJ_MODEL = genanki.Model(
# Present-tense expansion: each form key → list of (pronoun, tense_label)
PRESENT_EXPANSION = {
"present_ms": [
("אֲנִי (זכר)", "הוֹוֶה"),
("אַתָּה", "הוֹוֶה"),
("הוּא", "הוֹוֶה"),
("אֲנִי (זָכָר)", "הוֹוֶה"),
("אַתָּה", "הוֹוֶה"),
("הוּא", "הוֹוֶה"),
],
"present_fs": [
("אֲנִי (נקבה)", "הוֹוֶה"),
("אַתְּ", "הוֹוֶה"),
("הִיא", "הוֹוֶה"),
("אֲנִי (נְקֵבָה)", "הוֹוֶה"),
("אַתְּ", "הוֹוֶה"),
("הִיא", "הוֹוֶה"),
],
"present_mp": [
("אֲנַחְנוּ (זכר)", "הוֹוֶה"),
("אַתֶּם", "הוֹוֶה"),
("הֵם", "הוֹוֶה"),
("אֲנַחְנוּ (זָכָר)", "הוֹוֶה"),
("אַתֶּם", "הוֹוֶה"),
("הֵם", "הוֹוֶה"),
],
"present_fp": [
("אֲנַחְנוּ (נקבה)", "הוֹוֶה"),
("אַתֶּן", "הוֹוֶה"),
("הֵן", "הוֹוֶה"),
("אֲנַחְנוּ (נְקֵבָה)", "הוֹוֶה"),
("אַתֶּן", "הוֹוֶה"),
("הֵן", "הוֹוֶה"),
],
}
@ -392,6 +392,56 @@ def _conj_audio_tag(slug: str, form_key: str) -> str:
return ""
# Stop-words excluded when matching emoji by meaning keyword: common English
# function words plus generic descriptors ("face", "hand", "button", ...).
# The set is consulted both when the keyword→emoji lookup is built and when
# the keywords of a word's English meaning are tested against that lookup,
# so a stop-word is never stored as a key and never used as a query.
_EMOJI_STOP = frozenset({
"to", "be", "a", "an", "the", "of", "in", "on", "at", "for", "and",
"with", "by", "or", "but", "not", "as", "its", "face", "hand", "sign",
"symbol", "button", "small", "large", "light", "dark", "open", "closed",
})
def _load_emoji_lookup() -> dict[str, str]:
    """Load or fetch the Unicode emoji keyword→character lookup.

    The lookup maps a lowercase English keyword taken from an emoji's name
    to the first fully-qualified emoji whose name contains that keyword.
    It is built from unicode.org's ``emoji-test.txt`` and cached as UTF-8
    JSON in ``data/emoji_lookup.json``.

    Returns:
        The ``{keyword: emoji_char}`` mapping.  An empty dict is returned
        when the data cannot be downloaded (safe fallback: emoji lookup is
        simply disabled).  Cache problems never abort the build: an
        unreadable cache is rebuilt, and a failed cache write only logs a
        warning.
    """
    cache_file = DATA_DIR / "emoji_lookup.json"
    if cache_file.exists():
        try:
            # Explicit UTF-8: the cache holds raw emoji (ensure_ascii=False),
            # which the locale default encoding may not be able to decode.
            with open(cache_file, encoding="utf-8") as f:
                cached = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both malformed JSON and undecodable bytes.
            logger.warning(f"Ignoring unreadable emoji cache {cache_file}: {e}")
        else:
            if isinstance(cached, dict):
                return cached
            logger.warning(f"Ignoring malformed emoji cache {cache_file}")

    # Imported lazily so the dependency is only needed on a cache miss.
    import requests

    try:
        resp = requests.get(
            "https://unicode.org/Public/emoji/latest/emoji-test.txt",
            timeout=30,
        )
        resp.raise_for_status()
    except Exception as e:
        # Deliberately broad: any failure here just disables the emoji fallback.
        logger.warning(f"Could not fetch emoji data: {e}. Emoji lookup disabled.")
        return {}

    # The file is UTF-8; without a charset header requests would decode
    # text/* responses as ISO-8859-1 and corrupt every emoji character.
    resp.encoding = "utf-8"

    # Data lines look like: "1F600 ; fully-qualified # 😀 E1.0 grinning face"
    entry_re = re.compile(r"#\s+(\S+)\s+E[\d.]+\s+(.+)")
    # Punctuation trimmed from both ends of a name token, so keys such as
    # "flag:" or "(blood" become plain words that a keyword can match.
    trim_chars = ".,'\"-:;!()\u2018\u2019\u201c\u201d"

    lookup: dict[str, str] = {}
    for line in resp.text.splitlines():
        if "fully-qualified" not in line:
            continue
        m = entry_re.search(line)
        if not m:
            continue
        emoji_char = m.group(1)
        desc = m.group(2).lower().strip()
        for word in desc.split():
            word = word.strip(trim_chars)
            # First emoji wins: earlier entries in the file keep the keyword.
            if len(word) > 2 and word not in _EMOJI_STOP and word not in lookup:
                lookup[word] = emoji_char

    try:
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        cache_file.write_text(
            json.dumps(lookup, ensure_ascii=False), encoding="utf-8"
        )
    except OSError as e:
        # The lookup is still usable for this run even if it cannot be cached.
        logger.warning(f"Could not write emoji cache {cache_file}: {e}")
    else:
        logger.info(f"Built emoji lookup: {len(lookup)} keywords → {cache_file}")
    return lookup
def _translate_pos(pos_str: str) -> str:
"""Translate PoS string to Hebrew. Handles 'Verb Pi'el' style."""
for eng, heb in POS_TO_HEBREW.items():
@ -413,6 +463,7 @@ def build_vocab_deck(
examples_cache: Optional[dict] = None,
freq_cache: Optional[dict] = None,
image_cache: Optional[dict] = None,
emoji_lookup: Optional[dict] = None,
limit: Optional[int] = None,
include_audio: bool = True,
include_images: bool = True,
@ -498,10 +549,17 @@ def build_vocab_deck(
continue
seen_words.add(word)
# Extract emoji from meaning
# Extract emoji from meaning (pealim embeds emoji in meaning text)
emoji_str = ''.join(EMOJI_RE.findall(meaning))
meaning_clean = EMOJI_RE.sub('', meaning).strip()
# Fallback: look up emoji from Unicode standard by English keyword
if not emoji_str and emoji_lookup:
for kw in re.sub(r"[^\w\s]", " ", meaning_clean.lower()).split()[:5]:
if len(kw) > 2 and kw not in _EMOJI_STOP and kw in emoji_lookup:
emoji_str = emoji_lookup[kw]
break
# Extract Hebrew parentheticals (prepositions) from meaning
preps = HBPAREN_RE.findall(meaning_clean)
prep_str = ' '.join(f'({p})' for p in preps)
@ -706,7 +764,7 @@ def build_conj_deck(
# 1st-person forms get a randomly assigned gender label (deterministic per verb)
if form_key in {"past_1s", "past_1p", "future_1s", "future_1p"}:
gender = verb_rng.choice(["זכר", "נקבה"])
gender = verb_rng.choice(["זָכָר", "נְקֵבָה"])
pronoun = f"{pronoun} ({gender})"
add_note(pronoun, tense, conj_form, audio_tag)
@ -775,6 +833,9 @@ def build_all_variants(
"""Build all 6 release variants (4 vocab + 2 conj) into output/."""
logger.info("Building all release variants …")
emoji_lookup = _load_emoji_lookup()
logger.info(f" Emoji lookup: {len(emoji_lookup)} keywords loaded")
vocab_variants = [
(False, False, VOCAB_APKG),
(True, False, VOCAB_APKG_AUDIO),
@ -789,6 +850,7 @@ def build_all_variants(
examples_cache=examples_cache,
freq_cache=freq_cache,
image_cache=image_cache or {},
emoji_lookup=emoji_lookup,
limit=limit,
include_audio=audio,
include_images=images,

8
run.py
View file

@ -453,8 +453,12 @@ def print_summary(args, examples_cache, freq_cache, conjugations):
logger.info(f" Vocabulary audio files: {len(mp3s)}")
if AUDIO_CONJ_DIR.exists():
mp3s = list(AUDIO_CONJ_DIR.glob("*.mp3"))
logger.info(f" Conjugation audio files: {len(mp3s)}")
# Count only files that will be bundled: active non-infinitive forms
# (excludes {slug}_passive_* and {slug}_infinitive.mp3 on-disk extras)
mp3s = [p for p in AUDIO_CONJ_DIR.glob("*.mp3")
if not p.stem.endswith("_infinitive")
and "_passive_" not in p.stem]
logger.info(f" Conjugation audio files (bundled): {len(mp3s)}")
image_cache_path = DATA_DIR / "image_cache.json"
if image_cache_path.exists():