# Present-tense expansion: each form key → list of (pronoun, tense_label)
PRESENT_EXPANSION = {
    "present_ms": [
        ("אֲנִי (זָכָר)", "הוֹוֶה"),
        ("אַתָּה", "הוֹוֶה"),
        ("הוּא", "הוֹוֶה"),
    ],
    "present_fs": [
        ("אֲנִי (נְקֵבָה)", "הוֹוֶה"),
        ("אַתְּ", "הוֹוֶה"),
        ("הִיא", "הוֹוֶה"),
    ],
    "present_mp": [
        ("אֲנַחְנוּ (זָכָר)", "הוֹוֶה"),
        ("אַתֶּם", "הוֹוֶה"),
        ("הֵם", "הוֹוֶה"),
    ],
    "present_fp": [
        ("אֲנַחְנוּ (נְקֵבָה)", "הוֹוֶה"),
        ("אַתֶּן", "הוֹוֶה"),
        ("הֵן", "הוֹוֶה"),
    ],
}


# Stop-words excluded when matching emoji by meaning keyword
_EMOJI_STOP = frozenset({
    "to", "be", "a", "an", "the", "of", "in", "on", "at", "for", "and",
    "with", "by", "or", "but", "not", "as", "its", "face", "hand", "sign",
    "symbol", "button", "small", "large", "light", "dark", "open", "closed",
})

# Trailing comment of an emoji-test.txt data line:
#   "# <emoji> E<version> <description>"
# Compiled once at import time instead of once per input line.
_EMOJI_LINE_RE = re.compile(r"#\s+(\S+)\s+E[\d.]+\s+(.+)")

# Punctuation trimmed from both ends of each description word.  The keyword
# matcher strips *all* punctuation from meanings before looking words up, so
# a key such as "flag:" or "(blood" could never be hit.
_EMOJI_WORD_TRIM = ".,'\"-:;!?()[]\u201c\u201d\u2018\u2019"


def _parse_emoji_test(text: str) -> dict[str, str]:
    """Build a {keyword: emoji_char} mapping from emoji-test.txt content.

    Only lines marked ``fully-qualified`` are considered.  Each word of the
    lower-cased description becomes a keyword unless it is shorter than three
    characters or listed in ``_EMOJI_STOP``.  The first emoji seen for a
    keyword wins; later emoji never overwrite it.

    Args:
        text: Full text of a Unicode ``emoji-test.txt`` file.

    Returns:
        Mapping of keyword to emoji string (empty if nothing matched).
    """
    lookup: dict[str, str] = {}
    for line in text.splitlines():
        if "fully-qualified" not in line:
            continue
        m = _EMOJI_LINE_RE.search(line)
        if not m:
            continue
        emoji_char = m.group(1)
        desc = m.group(2).lower().strip()
        for raw_word in desc.split():
            word = raw_word.strip(_EMOJI_WORD_TRIM)
            if len(word) > 2 and word not in _EMOJI_STOP:
                # setdefault keeps the first (most canonical) emoji per word.
                lookup.setdefault(word, emoji_char)
    return lookup


def _load_emoji_lookup() -> dict[str, str]:
    """Load or fetch Unicode emoji keyword→character lookup.

    Parses unicode.org emoji-test.txt to build {keyword: emoji_char} mapping.
    Result is cached in data/emoji_lookup.json (always UTF-8).

    The lookup is strictly best-effort: an unreadable or malformed cache is
    ignored and rebuilt, a network failure yields an empty dict, and a failed
    cache write still returns the freshly built mapping.

    Returns:
        Mapping of lower-case English keyword to emoji string; empty when the
        data could not be obtained.
    """
    cache_file = DATA_DIR / "emoji_lookup.json"
    if cache_file.exists():
        try:
            # Explicit UTF-8: the cache holds raw emoji (ensure_ascii=False),
            # which the platform default encoding may be unable to decode.
            with open(cache_file, encoding="utf-8") as f:
                cached = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both JSONDecodeError and UnicodeDecodeError.
            logger.warning(
                "Ignoring unreadable emoji cache %s: %s", cache_file, e
            )
        else:
            if isinstance(cached, dict):
                return cached
            logger.warning(
                "Ignoring malformed emoji cache %s (expected a JSON object)",
                cache_file,
            )

    # Imported lazily so the dependency is only needed on a cache miss.
    import requests
    try:
        resp = requests.get(
            "https://unicode.org/Public/emoji/latest/emoji-test.txt",
            timeout=30,
        )
        resp.raise_for_status()
    except requests.RequestException as e:
        logger.warning(
            "Could not fetch emoji data: %s. Emoji lookup disabled.", e
        )
        return {}

    # emoji-test.txt is UTF-8 by definition; do not let requests fall back to
    # ISO-8859-1 when the server omits the charset, which would garble emoji.
    resp.encoding = "utf-8"
    lookup = _parse_emoji_test(resp.text)

    if not lookup:
        # Caching an empty mapping would permanently disable the lookup,
        # because the cache is returned unconditionally on the next run.
        logger.warning("Emoji data contained no usable keywords; not caching.")
        return lookup

    try:
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        cache_file.write_text(
            json.dumps(lookup, ensure_ascii=False), encoding="utf-8"
        )
    except OSError as e:
        # The mapping itself is still good; only persistence failed.
        logger.warning("Could not write emoji cache %s: %s", cache_file, e)
    else:
        logger.info(
            "Built emoji lookup: %d keywords → %s", len(lookup), cache_file
        )
    return lookup
Handles 'Verb – Pi'el' style.""" for eng, heb in POS_TO_HEBREW.items(): @@ -413,6 +463,7 @@ def build_vocab_deck( examples_cache: Optional[dict] = None, freq_cache: Optional[dict] = None, image_cache: Optional[dict] = None, + emoji_lookup: Optional[dict] = None, limit: Optional[int] = None, include_audio: bool = True, include_images: bool = True, @@ -498,10 +549,17 @@ def build_vocab_deck( continue seen_words.add(word) - # Extract emoji from meaning + # Extract emoji from meaning (pealim embeds emoji in meaning text) emoji_str = ''.join(EMOJI_RE.findall(meaning)) meaning_clean = EMOJI_RE.sub('', meaning).strip() + # Fallback: look up emoji from Unicode standard by English keyword + if not emoji_str and emoji_lookup: + for kw in re.sub(r"[^\w\s]", " ", meaning_clean.lower()).split()[:5]: + if len(kw) > 2 and kw not in _EMOJI_STOP and kw in emoji_lookup: + emoji_str = emoji_lookup[kw] + break + # Extract Hebrew parentheticals (prepositions) from meaning preps = HBPAREN_RE.findall(meaning_clean) prep_str = ' '.join(f'({p})' for p in preps) @@ -706,7 +764,7 @@ def build_conj_deck( # 1st-person forms get a randomly assigned gender label (deterministic per verb) if form_key in {"past_1s", "past_1p", "future_1s", "future_1p"}: - gender = verb_rng.choice(["זכר", "נקבה"]) + gender = verb_rng.choice(["זָכָר", "נְקֵבָה"]) pronoun = f"{pronoun} ({gender})" add_note(pronoun, tense, conj_form, audio_tag) @@ -775,6 +833,9 @@ def build_all_variants( """Build all 6 release variants (4 vocab + 2 conj) into output/.""" logger.info("Building all release variants …") + emoji_lookup = _load_emoji_lookup() + logger.info(f" Emoji lookup: {len(emoji_lookup)} keywords loaded") + vocab_variants = [ (False, False, VOCAB_APKG), (True, False, VOCAB_APKG_AUDIO), @@ -789,6 +850,7 @@ def build_all_variants( examples_cache=examples_cache, freq_cache=freq_cache, image_cache=image_cache or {}, + emoji_lookup=emoji_lookup, limit=limit, include_audio=audio, include_images=images, diff --git a/run.py 
b/run.py index abf9ac0..78f56a9 100644 --- a/run.py +++ b/run.py @@ -453,8 +453,12 @@ def print_summary(args, examples_cache, freq_cache, conjugations): logger.info(f" Vocabulary audio files: {len(mp3s)}") if AUDIO_CONJ_DIR.exists(): - mp3s = list(AUDIO_CONJ_DIR.glob("*.mp3")) - logger.info(f" Conjugation audio files: {len(mp3s)}") + # Count only files that will be bundled: active non-infinitive forms + # (excludes {slug}_passive_* and {slug}_infinitive.mp3 on-disk extras) + mp3s = [p for p in AUDIO_CONJ_DIR.glob("*.mp3") + if not p.stem.endswith("_infinitive") + and "_passive_" not in p.stem] + logger.info(f" Conjugation audio files (bundled): {len(mp3s)}") image_cache_path = DATA_DIR / "image_cache.json" if image_cache_path.exists():