feat: emoji Unicode lookup, conj nikkud, fix summary metric

- Emoji: _load_emoji_lookup() fetches unicode.org emoji-test.txt and builds a {keyword: emoji_char} map cached in data/emoji_lookup.json. Falls back to an empty dict on network failure. build_all_variants() loads it once and passes it to all build_vocab_deck() calls. For each word without a pealim emoji, the first 5 keywords of the English meaning are tried against the lookup.
- Nikkud: זכר→זָכָר, נקבה→נְקֵבָה in the PRESENT_EXPANSION constants and the build_conj_deck() 1st-person gender labels.
- Summary: the conj audio file count now excludes the _infinitive and _passive_ on-disk extras that are never bundled in the .apkg (was 2235, now shows ~1765).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-05 21:24:10 +00:00 · 2026-03-05 21:24:10 +00:00 · 607fd1a3bc
commit 607fd1a3bc
parent 3fc3a21a33
2 changed files with 82 additions and 16 deletions
--- a/apkg_builder.py
+++ b/apkg_builder.py
@ -320,24 +320,24 @@ CONJ_MODEL = genanki.Model(
 # Present-tense expansion: each form key → list of (pronoun, tense_label)
 PRESENT_EXPANSION = {
    "present_ms": [
-        ("אֲנִי (זכר)",    "הוֹוֶה"),
-        ("אַתָּה",          "הוֹוֶה"),
-        ("הוּא",            "הוֹוֶה"),
+        ("אֲנִי (זָכָר)",    "הוֹוֶה"),
+        ("אַתָּה",            "הוֹוֶה"),
+        ("הוּא",              "הוֹוֶה"),
    ],
    "present_fs": [
-        ("אֲנִי (נקבה)",   "הוֹוֶה"),
-        ("אַתְּ",           "הוֹוֶה"),
-        ("הִיא",            "הוֹוֶה"),
+        ("אֲנִי (נְקֵבָה)",  "הוֹוֶה"),
+        ("אַתְּ",             "הוֹוֶה"),
+        ("הִיא",              "הוֹוֶה"),
    ],
    "present_mp": [
-        ("אֲנַחְנוּ (זכר)",  "הוֹוֶה"),
-        ("אַתֶּם",           "הוֹוֶה"),
-        ("הֵם",              "הוֹוֶה"),
+        ("אֲנַחְנוּ (זָכָר)",  "הוֹוֶה"),
+        ("אַתֶּם",              "הוֹוֶה"),
+        ("הֵם",                "הוֹוֶה"),
    ],
    "present_fp": [
-        ("אֲנַחְנוּ (נקבה)", "הוֹוֶה"),
-        ("אַתֶּן",            "הוֹוֶה"),
-        ("הֵן",               "הוֹוֶה"),
+        ("אֲנַחְנוּ (נְקֵבָה)", "הוֹוֶה"),
+        ("אַתֶּן",               "הוֹוֶה"),
+        ("הֵן",                  "הוֹוֶה"),
    ],
 }

@ -392,6 +392,56 @@ def _conj_audio_tag(slug: str, form_key: str) -> str:
    return ""


+# Words skipped when matching emoji by meaning keyword: English function words plus generic emoji-name terms (face, hand, sign, …)
+_EMOJI_STOP = frozenset({
+    "to", "be", "a", "an", "the", "of", "in", "on", "at", "for", "and",
+    "with", "by", "or", "but", "not", "as", "its", "face", "hand", "sign",
+    "symbol", "button", "small", "large", "light", "dark", "open", "closed",
+})
+
+
+def _load_emoji_lookup() -> dict[str, str]:
+    """Return the {keyword: emoji_char} lookup, from cache or unicode.org.
+
+    Parses emoji-test.txt (first emoji wins per keyword) and caches the result
+    in data/emoji_lookup.json. All file/HTTP text is handled as UTF-8 so the
+    result does not depend on the platform's locale encoding.
+    Returns an empty dict if the download fails (safe fallback).
+    """
+    cache_file = DATA_DIR / "emoji_lookup.json"
+    if cache_file.exists():
+        try:
+            return json.loads(cache_file.read_text(encoding="utf-8"))
+        except ValueError as e:  # corrupt/partial cache: rebuild it
+            logger.warning(f"Ignoring unreadable emoji cache {cache_file}: {e}")
+
+    import requests
+    try:
+        resp = requests.get(
+            "https://unicode.org/Public/emoji/latest/emoji-test.txt",
+            timeout=30,
+        )
+        resp.raise_for_status()
+    except Exception as e:
+        logger.warning(f"Could not fetch emoji data: {e}. Emoji lookup disabled.")
+        return {}
+    resp.encoding = "utf-8"  # don't let requests guess a legacy charset
+
+    lookup: dict[str, str] = {}
+    for line in resp.text.splitlines():
+        m = re.search(r"#\s+(\S+)\s+E[\d.]+\s+(.+)", line)
+        if "fully-qualified" not in line or not m:
+            continue
+        # Tokenise like the build_vocab_deck() fallback so keys are matchable
+        for word in re.sub(r"[^\w\s]", " ", m.group(2).lower()).split():
+            if len(word) > 2 and word not in _EMOJI_STOP:
+                lookup.setdefault(word, m.group(1))
+
+    cache_file.write_text(json.dumps(lookup, ensure_ascii=False), encoding="utf-8")
+    logger.info(f"Built emoji lookup: {len(lookup)} keywords → {cache_file}")
+    return lookup
+
+
 def _translate_pos(pos_str: str) -> str:
    """Translate PoS string to Hebrew. Handles 'Verb – Pi'el' style."""
    for eng, heb in POS_TO_HEBREW.items():
@ -413,6 +463,7 @@ def build_vocab_deck(
    examples_cache: Optional[dict] = None,
    freq_cache: Optional[dict] = None,
    image_cache: Optional[dict] = None,
+    emoji_lookup: Optional[dict] = None,
    limit: Optional[int] = None,
    include_audio: bool = True,
    include_images: bool = True,
@ -498,10 +549,17 @@ def build_vocab_deck(
            continue
        seen_words.add(word)

-        # Extract emoji from meaning
+        # Extract emoji from meaning (pealim embeds emoji in meaning text)
        emoji_str   = ''.join(EMOJI_RE.findall(meaning))
        meaning_clean = EMOJI_RE.sub('', meaning).strip()

+        # Fallback: look up emoji from Unicode standard by English keyword
+        if not emoji_str and emoji_lookup:
+            for kw in re.sub(r"[^\w\s]", " ", meaning_clean.lower()).split()[:5]:
+                if len(kw) > 2 and kw not in _EMOJI_STOP and kw in emoji_lookup:
+                    emoji_str = emoji_lookup[kw]
+                    break
+
        # Extract Hebrew parentheticals (prepositions) from meaning
        preps = HBPAREN_RE.findall(meaning_clean)
        prep_str = ' '.join(f'({p})' for p in preps)
@ -706,7 +764,7 @@ def build_conj_deck(

            # 1st-person forms get a randomly assigned gender label (deterministic per verb)
            if form_key in {"past_1s", "past_1p", "future_1s", "future_1p"}:
-                gender = verb_rng.choice(["זכר", "נקבה"])
+                gender = verb_rng.choice(["זָכָר", "נְקֵבָה"])
                pronoun = f"{pronoun} ({gender})"

            add_note(pronoun, tense, conj_form, audio_tag)
@ -775,6 +833,9 @@ def build_all_variants(
    """Build all 6 release variants (4 vocab + 2 conj) into output/."""
    logger.info("Building all release variants …")

+    emoji_lookup = _load_emoji_lookup()
+    logger.info(f"  Emoji lookup: {len(emoji_lookup)} keywords loaded")
+
    vocab_variants = [
        (False, False, VOCAB_APKG),
        (True,  False, VOCAB_APKG_AUDIO),
@ -789,6 +850,7 @@ def build_all_variants(
            examples_cache=examples_cache,
            freq_cache=freq_cache,
            image_cache=image_cache or {},
+            emoji_lookup=emoji_lookup,
            limit=limit,
            include_audio=audio,
            include_images=images,
--- a/run.py
+++ b/run.py
@ -453,8 +453,12 @@ def print_summary(args, examples_cache, freq_cache, conjugations):
        logger.info(f"  Vocabulary audio files: {len(mp3s)}")

    if AUDIO_CONJ_DIR.exists():
-        mp3s = list(AUDIO_CONJ_DIR.glob("*.mp3"))
-        logger.info(f"  Conjugation audio files: {len(mp3s)}")
+        # Count only files that will be bundled: active non-infinitive forms
+        # (excludes {slug}_passive_* and {slug}_infinitive.mp3 on-disk extras)
+        mp3s = [p for p in AUDIO_CONJ_DIR.glob("*.mp3")
+                if not p.stem.endswith("_infinitive")
+                and "_passive_" not in p.stem]
+        logger.info(f"  Conjugation audio files (bundled): {len(mp3s)}")

    image_cache_path = DATA_DIR / "image_cache.json"
    if image_cache_path.exists():