Sprint 15: example sentence pipeline overhaul + corpus expansion + card improvements

- Regenerated all example sentences from scratch (deleted legacy + stale entries) - Added .txt file support to epub_examples.py for Ben Yehuda corpus - 7 Ben Yehuda nikkud'd children's texts + 3 new Time Tunnel EPUBs - Maqaf-stripped construct form indexing (+68% inflected matches) - Total: 3,598 words with examples, 3,289 with cloze (was ~2,900) - Cloze prefix preservation (_cloze_prefix_len) - Hebrew spoiler stripping from English meanings - Gender field (זָכָר/נְקֵבָה) on vocab cards - sec-table CSS layout for aligned key:value pairs - Mishkal uses mishkal_hebrew on plural cards - Improved mishkal extraction from pealim detail pages - 21 new pytest tests (cloze, PoS, Hebrew stripping, gender, mishkal) - 2 new validate_data.py tests + mishkal stats - Colliding forms tracking (local-only) - Release tag v0.17 Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 10:44:14 +00:00 · 2026-03-10 10:44:14 +00:00 · c85063ee2f
commit c85063ee2f
parent efd0745ada
7 changed files with 15634 additions and 3273 deletions
--- a/.gitignore
+++ b/.gitignore
@ -15,6 +15,7 @@ __pycache__/

 # Large generated cache files (rebuild locally)
 data/benyehuda_index.json
+data/colliding_forms.json

 # Audio directories (large; rebuild locally)
 data/audio/
--- a/apkg_builder.py
+++ b/apkg_builder.py
@ -35,7 +35,7 @@ COMPLETE_PLURAL_DECK_ID = 1_234_567_903

 # Release version tag added to all notes so users can identify which release
 # their cards come from (visible in Anki's Browse view and card info).
-RELEASE_TAG = "v0.16"
+RELEASE_TAG = "v0.17"

 # Regex for extracting emoji and Hebrew prepositions from meaning strings
 EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
@ -185,18 +185,34 @@ CARD_CSS = """
  font-weight: normal;
  color: #555;
 }
+.sec-table {
+  display: table;
+  margin: 6px auto 0;
+  direction: rtl;
+  border-collapse: collapse;
+}
 .sec-label {
+  display: table-row;
  font-size: 28px;
  font-weight: normal;
  color: #222;
  direction: rtl;
-  text-align: center;
-  margin-top: 6px;
 }
 .sec-key {
+  display: table-cell;
  font-size: 28px;
  color: #222;
  font-weight: bold;
+  text-align: right;
+  padding: 2px 0 2px 8px;
+  white-space: nowrap;
+}
+.sec-val {
+  display: table-cell;
+  font-size: 28px;
+  color: #222;
+  text-align: right;
+  padding: 2px 0;
 }
 .definitions {
  direction: rtl;
@ -231,6 +247,7 @@ CARD_CSS = """
  .root-info   { color: #e0e0e0; }
  .sec-label   { color: #e0e0e0; }
  .sec-key     { color: #e0e0e0; }
+  .sec-val     { color: #e0e0e0; }
  .conf-entry  { color: #ddd; }
  .hint        { color: #777; }
  .voice-label { color: #888; }
@ -255,14 +272,17 @@ VOCAB_BACK_HEB = """
 <div class="meaning">{{Meaning}}</div>
 {{#Emoji}}<div class="emoji-img">{{Emoji}}</div>{{/Emoji}}
 {{^Emoji}}{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;" onerror="this.parentElement.style.display='none'"></div>{{/Image}}{{/Emoji}}
-{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>{{/Root}}
-{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span> {{PoS}}</div>{{/PoS}}
+<div class="sec-table">
+{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span><span class="sec-val">{{WordNoNikkud}}</span></div>{{/WordNoNikkud}}
+{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">{{Root}}</span></div>{{/Root}}
+{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span><span class="sec-val">{{PoS}}{{#Gender}}, {{Gender}}{{/Gender}}</span></div>{{/PoS}}
+{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span><span class="sec-val">{{Plural}}</span></div>{{/Plural}}
+</div>
 {{#SharedRoots}}
-<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
+<div class="divider" style="margin:6px 0;"></div>
+<div class="sec-label" style="text-align:center;display:block;"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
 <div class="root-info">{{SharedRoots}}</div>
 {{/SharedRoots}}
-{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
-{{#Frequency}}<div class="freq-badge">#{{Frequency}}</div>{{/Frequency}}
 """

 VOCAB_FRONT_ENG = """
@ -277,14 +297,17 @@ VOCAB_BACK_ENG = """
 <div class="divider"></div>
 <div class="hebrew">{{Word}}{{#Prep}} <span class="hebrew-sm">{{Prep}}</span>{{/Prep}}</div>
 {{#Audio}}<div>{{Audio}}</div>{{/Audio}}
-{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span> {{WordNoNikkud}}</div>{{/WordNoNikkud}}
-{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>{{/Root}}
-{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span> {{PoS}}</div>{{/PoS}}
+<div class="sec-table">
+{{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span><span class="sec-val">{{WordNoNikkud}}</span></div>{{/WordNoNikkud}}
+{{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">{{Root}}</span></div>{{/Root}}
+{{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span><span class="sec-val">{{PoS}}{{#Gender}}, {{Gender}}{{/Gender}}</span></div>{{/PoS}}
+{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span><span class="sec-val">{{Plural}}</span></div>{{/Plural}}
+</div>
 {{#SharedRoots}}
-<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
+<div class="divider" style="margin:6px 0;"></div>
+<div class="sec-label" style="text-align:center;display:block;"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
 <div class="root-info">{{SharedRoots}}</div>
 {{/SharedRoots}}
-{{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
 """

 VOCAB_FRONT_CLOZE = """
@ -318,6 +341,7 @@ VOCAB_MODEL = genanki.Model(
        {"name": "Prep"},
        {"name": "Hint"},
        {"name": "Plural"},
+        {"name": "Gender"},
        {"name": "ClozeExample"},
        {"name": "ClozeHint"},
    ],
@ -359,11 +383,16 @@ CONJ_BACK = """
 {{FrontSide}}<hr>
 <div class="hebrew">{{ConjugatedForm}}{{#Prep}} ({{Prep}}){{/Prep}}</div>
 {{#Audio}}<div>{{Audio}}</div>{{/Audio}}
-{{#Meaning}}<div class="sec-label">{{Meaning}}</div>{{/Meaning}}
-<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>
-<div class="sec-label"><span class="sec-key">בִּנְיָן:</span> {{Binyan}}</div>
-{{#RelatedVocab}}<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
-<div class="sec-label">{{RelatedVocab}}</div>{{/RelatedVocab}}
+{{#Meaning}}<div class="sec-label" style="text-align:center;display:block;">{{Meaning}}</div>{{/Meaning}}
+<div class="sec-table">
+<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">{{Root}}</span></div>
+<div class="sec-label"><span class="sec-key">בִּנְיָן:</span><span class="sec-val">{{Binyan}}</span></div>
+</div>
+{{#RelatedVocab}}
+<div class="divider" style="margin:6px 0;"></div>
+<div class="sec-label" style="text-align:center;display:block;"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
+<div class="root-info">{{RelatedVocab}}</div>
+{{/RelatedVocab}}
 """

 CONJ_CSS = CARD_CSS
@ -703,6 +732,32 @@ def _forms_list_to_dict(forms_list: list[dict]) -> dict[str, dict]:
    return result


+# Hebrew prefix letters (אותיות השימוש): בהוכלמש
+_PREFIX_LETTERS = frozenset("בהוכלמש")
+
+
+def _cloze_prefix_len(cloze_token: str, word_nikkud: str) -> int:
+    """Return the number of characters in the cloze token that are prefix (not part of the word).
+
+    For "בַּתּוֹר" with word_nikkud "תּוֹר", returns 2 (ב + patach = 2 chars).
+    Returns 0 if the token starts with the word directly.
+    """
+    if not word_nikkud or not cloze_token:
+        return 0
+    # If the token starts with the word nikkud, no prefix
+    if cloze_token.startswith(word_nikkud):
+        return 0
+    # Check if word nikkud appears as a suffix of the token
+    idx = cloze_token.find(word_nikkud)
+    if idx > 0:
+        # Verify prefix chars are valid Hebrew prefix letters
+        prefix_part = cloze_token[:idx]
+        base_letters = [c for c in prefix_part if "\u05d0" <= c <= "\u05ea"]
+        if base_letters and all(c in _PREFIX_LETTERS for c in base_letters):
+            return idx
+    return 0
+
+
 def build_vocab_deck(
    words: dict[str, dict],
    limit: int | None = None,
@ -758,7 +813,11 @@ def build_vocab_deck(
        pos_heb = entry.get("pos_hebrew", "")
        meaning = EMOJI_RE.sub("", entry.get("meaning", "") or "").strip()
        meaning = HBPAREN_RE.sub("", meaning).strip()
-        meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ")
+        # Strip remaining bare Hebrew text (spoiler: ktiv male visible in English meaning)
+        meaning = re.sub(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*", "", meaning)
+        meaning = re.sub(r"[;:]\s*—", " —", meaning)  # clean "; —" → " —"
+        meaning = re.sub(r";\s*:", ";", meaning)  # clean "; :" → ";"
+        meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ;:")
        meaning = re.sub(r"(\w)\(", r"\1 (", meaning)  # space before opening paren
        meaning = re.sub(r",(\S)", r", \1", meaning)  # space after comma
        meaning_raw = entry.get("meaning_raw", "") or ""
@ -850,10 +909,13 @@ def build_vocab_deck(
            start = cloze_data.get("cloze_word_start")
            end = cloze_data.get("cloze_word_end")
            if cloze_text and start is not None and end is not None:
-                cloze_example = cloze_text[:start] + "_____" + cloze_text[end:]
-                # Clean up duplicate/misplaced quotation marks
+                # Preserve Hebrew prefix letters in the cloze blank
+                # e.g. "בַּתּוֹר" for word "תּוֹר" → "בַּ_____" not "_____"
+                cloze_token = cloze_text[start:end]
+                prefix_chars = _cloze_prefix_len(cloze_token, word_nikkud)
+                cloze_example = cloze_text[: start + prefix_chars] + "_____" + cloze_text[end:]
+                # Clean up duplicate adjacent quotation marks (e.g. "" → ")
                cloze_example = re.sub(r'["״]\s*["״]', '"', cloze_example)
-                cloze_example = re.sub(r'^\s*["״]\s*', "", cloze_example)
                raw_hint = cloze_data.get("cloze_hint") or ""
                if raw_hint:
                    cloze_hint = raw_hint
@ -886,12 +948,19 @@ def build_vocab_deck(
                    parts.append(f'<div class="related-group"><b>{label}:</b> {" ".join(rw_words)}</div>')
            related_html = "\n".join(parts)

-        # Plural form (nouns only — guard against adjective/verb inflection bleed)
+        # Plural form and gender (nouns only)
        plural_str = ""
+        gender_str = ""
        if pos_raw.startswith("Noun"):
            noun_inflection = entry.get("noun_inflection")
-            if noun_inflection and noun_inflection.get("plural"):
-                plural_str = noun_inflection["plural"].get("nikkud", "")
+            if noun_inflection:
+                if noun_inflection.get("plural"):
+                    plural_str = noun_inflection["plural"].get("nikkud", "")
+                gender_raw = noun_inflection.get("gender") or ""
+                if gender_raw == "masculine":
+                    gender_str = "זָכָר"
+                elif gender_raw == "feminine":
+                    gender_str = "נְקֵבָה"

        # Image
        image_tag = ""
@ -927,6 +996,7 @@ def build_vocab_deck(
                prep_str,
                hint_str,
                plural_str,
+                gender_str,
                cloze_example,
                cloze_hint,
            ],
@ -941,7 +1011,8 @@ def build_vocab_deck(
    prep_count = sum(1 for n in deck.notes if n.fields[12])
    hint_count = sum(1 for n in deck.notes if n.fields[13])
    plural_count = sum(1 for n in deck.notes if n.fields[14])
-    cloze_count = sum(1 for n in deck.notes if n.fields[15])
+    gender_count = sum(1 for n in deck.notes if n.fields[15])
+    cloze_count = sum(1 for n in deck.notes if n.fields[16])
    unlisted = sum(1 for _, e in sorted_entries if (e.get("frequency") or 999_999) >= 999_999)
    if emoji_count:
        logger.info(f"  Emoji extracted: {emoji_count} words")
@ -951,6 +1022,8 @@ def build_vocab_deck(
        logger.info(f"  Eng→Heb hints: {hint_count} words")
    if plural_count:
        logger.info(f"  Noun plurals on vocab cards: {plural_count} words")
+    if gender_count:
+        logger.info(f"  Noun gender on vocab cards: {gender_count} words")
    if cloze_count:
        logger.info(f"  Sentence cloze cards: {cloze_count} words")
    logger.info(f"  Unlisted words (not in frequency corpus): {unlisted}/{len(sorted_entries)}")
@ -1330,10 +1403,12 @@ def build_confusables_deck(
            w = e["word"]["nikkud"]
            m = e.get("meaning", "")
            p = e.get("pos_hebrew", "")
-            pos_label = f" ({p})" if p else ""
+            pos_div = f'<div style="font-size:18px; color:#888;">{p}</div>' if p else ""
            defs_parts.append(
-                f'<div class="conf-entry"><span class="hebrew" style="font-size:24px;">{w}</span>'
-                f" = {m}{pos_label}</div>"
+                f'<div class="conf-entry">'
+                f'<span class="hebrew" style="font-size:24px;">{w}</span>'
+                f'<div style="direction:ltr; text-align:center; font-size:22px;">{m}</div>'
+                f"{pos_div}</div>"
            )
            if include_audio:
                af = e.get("audio_file", "") or ""
@ -1397,8 +1472,10 @@ PLURAL_BACK_SG = """
 {{FrontSide}}<hr>
 <div class="hebrew">{{Plural}}</div>
 {{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
-{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span> {{Gender}}</div>{{/Gender}}
-{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
+<div class="sec-table">
+{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span><span class="sec-val">{{Gender}}</span></div>{{/Gender}}
+{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span><span class="sec-val">{{Mishkal}}</span></div>{{/Mishkal}}
+</div>
 """

 PLURAL_FRONT_PL = """
@ -1411,9 +1488,11 @@ PLURAL_BACK_PL = """
 {{FrontSide}}<hr>
 <div class="hebrew">{{Singular}}</div>
 {{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
-<div class="sec-label">{{Meaning}}</div>
-{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span> {{Gender}}</div>{{/Gender}}
-{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
+<div class="sec-label" style="text-align:center;display:block;">{{Meaning}}</div>
+<div class="sec-table">
+{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span><span class="sec-val">{{Gender}}</span></div>{{/Gender}}
+{{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span><span class="sec-val">{{Mishkal}}</span></div>{{/Mishkal}}
+</div>
 """

 PLURAL_CSS = CARD_CSS
@ -1501,13 +1580,25 @@ def build_plural_deck(
        elif mishkal:
            by_mishkal.setdefault(mishkal, []).append((unique_key, entry, noun_inflection))

-    # Select exemplars per mishkal, preferring high-frequency words
-    per_mishkal = 6
+    # Select regular exemplars to achieve a 2:1 regular:irregular ratio.
+    # Distribute evenly across mishkal patterns, preferring high-frequency words.
+    irregular_count = len(irregulars)
+    target_regular = irregular_count * 2
+    mishkal_count = len(by_mishkal) or 1
+    per_mishkal = max(2, target_regular // mishkal_count)

    selected: list[tuple[str, dict, dict]] = list(irregulars)
+    regular_pool: list[tuple[str, dict, dict]] = []
    for _mishkal, entries in sorted(by_mishkal.items()):
        entries.sort(key=lambda e: e[1].get("frequency") or 999_999)
-        selected.extend(entries[:per_mishkal])
+        regular_pool.extend(entries[:per_mishkal])
+
+    # If we overshot, trim to target (keeping highest-frequency across all mishkals)
+    if len(regular_pool) > target_regular:
+        regular_pool.sort(key=lambda e: e[1].get("frequency") or 999_999)
+        regular_pool = regular_pool[:target_regular]
+
+    selected.extend(regular_pool)

    note_count = 0
    for _unique_key, entry, noun_inflection in selected:
@ -1517,7 +1608,7 @@ def build_plural_deck(
        plural_ktiv = noun_inflection["plural"].get("ktiv_male", "")
        gender = noun_inflection.get("gender") or ""
        gender_heb = {"masculine": "זָכָר", "feminine": "נְקֵבָה"}.get(gender, gender)
-        mishkal = noun_inflection.get("mishkal") or ""
+        mishkal_heb = noun_inflection.get("mishkal_hebrew") or ""
        meaning = EMOJI_RE.sub("", entry.get("meaning") or "").strip()
        root_list = entry.get("root") or []
        root = ".".join(root_list)
@ -1537,9 +1628,10 @@ def build_plural_deck(
                if mp3_path not in media_files:
                    media_files.append(mp3_path)

+        mishkal_eng = noun_inflection.get("mishkal") or ""
        tags = [RELEASE_TAG]
-        if mishkal:
-            tags.append(f"mishkal::{mishkal}")
+        if mishkal_eng:
+            tags.append(f"mishkal::{mishkal_eng}")
        if _is_irregular_plural(gender, plural_ktiv):
            tags.append("irregular")

@ -1553,7 +1645,7 @@ def build_plural_deck(
                pl_audio,
                meaning,
                root,
-                mishkal,
+                mishkal_heb,
                gender_heb,
            ],
            tags=tags,
--- a/data/words.json
+++ b/data/words.json
--- a/epub_examples.py
+++ b/epub_examples.py
@ -29,7 +29,7 @@ WORDS_JSON = DATA_DIR / "words.json"

 # Book metadata: filename -> display name
 def _discover_epubs() -> dict[str, str]:
-    """Auto-discover all .epub files in EPUB_DIR, returning {filepath: display_name}."""
+    """Auto-discover all .epub and .txt files in EPUB_DIR, returning {filepath: display_name}."""
    if not EPUB_DIR.exists():
        return {}
    books: dict[str, str] = {}
@ -50,6 +50,9 @@ def _discover_epubs() -> dict[str, str]:
        else:
            name = stem_stripped[:40]
        books[str(path)] = name
+    # Also discover plain-text files (e.g. Ben Yehuda downloads)
+    for path in sorted(EPUB_DIR.glob("*.txt")):
+        books[str(path)] = path.stem
    return books


@ -196,6 +199,20 @@ def extract_sentences_from_epub(epub_path: Path, book_name: str) -> list[dict]:
    return _split_into_sentences(full_text, book_name)


+def extract_sentences_from_text(text_path: Path, book_name: str) -> list[dict]:
+    """Extract sentences from a plain-text file (e.g. Ben Yehuda downloads).
+
+    Args:
+        text_path: Path to the .txt file.
+        book_name: Human-readable book name used as the ``source`` field.
+
+    Returns:
+        List of ``{"text": str, "source": str}`` dicts.
+    """
+    full_text = text_path.read_text(encoding="utf-8")
+    return _split_into_sentences(full_text, book_name)
+
+
 # ── Sentence splitting ───────────────────────────────────────────

 # Hebrew sentence terminators: period, exclamation, question mark, sof pasuk
@ -480,7 +497,12 @@ def _build_nikkud_index(words: dict) -> dict[str, list[tuple[str, str]]]:

        for field in ("singular", "plural", "construct_singular", "construct_plural"):
            sub = noun.get(field) or {}
-            _add(sub.get("nikkud"), unique_key, "inflected")
+            form = sub.get("nikkud")
+            _add(form, unique_key, "inflected")
+            # Index construct forms without maqaf too — modern text often
+            # writes smichut as two space-separated words without maqaf
+            if form and form.endswith("־"):
+                _add(form[:-1], unique_key, "inflected")

        pronominal = noun.get("pronominal_suffixes") or {}
        for _person, sub in pronominal.items():
@ -720,7 +742,10 @@ def run(words: dict) -> dict:

    for filepath, book_name in _discover_epubs().items():
        path = Path(filepath)
-        sentences = extract_sentences_from_epub(path, book_name)
+        if path.suffix == ".txt":
+            sentences = extract_sentences_from_text(path, book_name)
+        else:
+            sentences = extract_sentences_from_epub(path, book_name)
        book_counts[book_name] = len(sentences)
        all_sentences.extend(sentences)
        logger.info(f"    {book_name}: {len(sentences)} sentences")
--- a/pealim_detail_scrape.py
+++ b/pealim_detail_scrape.py
@ -459,15 +459,29 @@ def _parse_noun_gender_mishkal(soup: BeautifulSoup) -> tuple[str, str]:
    """
    Extract (gender, mishkal) from the PoS section of the detail page.
    Returns ("masculine"|"feminine"|"", mishkal_english|"").
+
+    Pealim HTML structure:
+      <p>Noun – <a href="/dict/?pos=noun&nm=qetel"><i>ketel</i> pattern</a>, masculine</p>
+    The mishkal is in the <i> tag (k-notation, e.g. "ketel") or the nm= URL param (q-notation).
+    Some nouns have no mishkal link: <p>Noun – masculine</p>
    """
    gender = ""
    mishkal = ""

-    # Try various selectors that pealim uses for PoS info
-    pos_section = soup.find("div", class_="pos") or soup.find("p", class_="pos")
+    # Find the PoS <p> tag — on pealim detail pages it's a bare <p> like
+    # "Noun – ketel pattern, masculine" or "Adjective – katul pattern"
+    pos_section = None
+    for p in soup.find_all("p"):
+        text = p.get_text(" ", strip=True)
+        if re.match(r"^(Noun|Adjective)\b", text):
+            pos_section = p
+            break
+
+    # Fall back to older selectors (div.pos, p.pos, div.page-header)
    if not pos_section:
-        # Look for it in the page header area
-        pos_section = soup.find("div", class_="page-header")
+        pos_section = (
+            soup.find("div", class_="pos") or soup.find("p", class_="pos") or soup.find("div", class_="page-header")
+        )

    if pos_section:
        text = pos_section.get_text(" ", strip=True)
@ -476,13 +490,21 @@ def _parse_noun_gender_mishkal(soup: BeautifulSoup) -> tuple[str, str]:
            if raw in text.lower():
                gender = canonical
                break
-        # Mishkal detection: look for CaCaC-style patterns
-        mishkal_match = re.search(r"\b([A-Z][a-zA-Z\']+)\b", text)
-        if mishkal_match:
-            candidate = mishkal_match.group(1)
-            # Validate: mishkal names contain uppercase letters in CaCaC pattern
-            if re.match(r"^[A-Za-z\']+$", candidate) and any(c.isupper() for c in candidate):
-                mishkal = candidate
+
+        # Mishkal detection: extract from <a href="...nm=XXXX"><i>YYYY</i> pattern</a>
+        # Nouns use nm= param, adjectives use am= param
+        mishkal_link = pos_section.find("a", href=re.compile(r"[na]m="))
+        if mishkal_link:
+            # Prefer <i> tag text (k-notation, matches _MISHKAL_HEBREW_Q after k→q)
+            i_tag = mishkal_link.find("i")
+            if i_tag:
+                mishkal = i_tag.get_text(strip=True)
+            else:
+                # Fall back to nm= URL parameter (already q-notation)
+                href = mishkal_link.get("href", "")
+                nm_match = re.search(r"[na]m=([a-zA-Z']+)", href)
+                if nm_match:
+                    mishkal = nm_match.group(1)

    # Also check the og:description or breadcrumbs for gender
    if not gender:
--- a/scripts/validate_data.py
+++ b/scripts/validate_data.py
@ -685,6 +685,61 @@ def test_no_stripped_form_sentence_collisions(data: dict[str, Any]) -> None:
    _pass(name)


+def test_no_hebrew_in_meaning(data: dict[str, Any]) -> None:
+    """English meanings must not contain bare Hebrew text (spoils the card)."""
+    name = "no_hebrew_in_meaning"
+    errors: list[str] = []
+    hebrew_re = re.compile(r"[\u05D0-\u05EA]")
+
+    for key, entry in data.items():
+        meaning = entry.get("meaning") or ""
+        # Apply same cleaning pipeline as apkg_builder
+        cleaned = re.sub(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*", "", meaning)
+        cleaned = re.sub(r"\s{2,}", " ", cleaned).strip(", ;:")
+        if hebrew_re.search(cleaned):
+            errors.append(f"[{key}] meaning still contains Hebrew after cleaning: {cleaned!r}")
+
+    if errors:
+        _fail(name, errors[:20] if not _verbose else errors)
+        if len(errors) > 20 and not _verbose:
+            print(f"          ... ({len(errors) - 20} more; use --verbose)")
+    else:
+        _pass(name)
+
+
+def test_mishkal_consistency(data: dict[str, Any]) -> None:
+    """mishkal_hebrew must match mishkal via _mishkal_to_hebrew conversion."""
+    name = "mishkal_consistency"
+    errors: list[str] = []
+
+    try:
+        from pealim_detail_scrape import _mishkal_to_hebrew
+    except ImportError:
+        _warn(name, ["Could not import _mishkal_to_hebrew — skipping"])
+        return
+
+    for key, entry in data.items():
+        for infl_key in ("noun_inflection", "adjective_inflection"):
+            infl = entry.get(infl_key)
+            if not infl:
+                continue
+            mishkal_eng = infl.get("mishkal") or ""
+            mishkal_heb = infl.get("mishkal_hebrew") or ""
+            if mishkal_eng and mishkal_heb:
+                expected = _mishkal_to_hebrew(mishkal_eng) or ""
+                if expected and expected != mishkal_heb:
+                    errors.append(f"[{key}] {infl_key}: {mishkal_eng}→{mishkal_heb} (expected {expected})")
+            if mishkal_heb and not mishkal_eng:
+                errors.append(f"[{key}] {infl_key}: has mishkal_hebrew but no mishkal")
+
+    if errors:
+        _fail(name, errors[:20] if not _verbose else errors)
+        if len(errors) > 20 and not _verbose:
+            print(f"          ... ({len(errors) - 20} more; use --verbose)")
+    else:
+        _pass(name)
+
+
 # ---------------------------------------------------------------------------
 # Stats summary
 # ---------------------------------------------------------------------------
@ -702,6 +757,11 @@ def print_stats(data: dict[str, Any]) -> None:
    with_guid = sum(1 for e in data.values() if e.get("vocab_legacy_guid"))
    in_confusable = sum(1 for e in data.values() if e.get("confusable_group"))
    with_shared_roots = sum(1 for e in data.values() if e.get("shared_roots"))
+    with_mishkal = sum(
+        1
+        for e in data.values()
+        if (e.get("noun_inflection") or {}).get("mishkal") or (e.get("adjective_inflection") or {}).get("mishkal")
+    )

    print()
    print("Stats Summary")
@ -709,6 +769,7 @@ def print_stats(data: dict[str, Any]) -> None:
    print(f"  Total entries:                {total:>6}")
    print(f"  With conjugation data:        {with_conj:>6}")
    print(f"  With noun_inflection:         {with_noun_inf:>6}")
+    print(f"  With mishkal:                 {with_mishkal:>6}")
    print(f"  With vetted examples:         {with_vetted:>6}")
    print(f"  With cloze examples:          {with_cloze:>6}")
    print(f"  With images:                  {with_image:>6}")
@ -740,6 +801,8 @@ ALL_TESTS: dict[str, Any] = {
    "conjugation_form_guids": test_conjugation_form_guids,
    "conjugation_person_codes": test_conjugation_person_codes,
    "no_stripped_form_sentence_collisions": test_no_stripped_form_sentence_collisions,
+    "no_hebrew_in_meaning": test_no_hebrew_in_meaning,
+    "mishkal_consistency": test_mishkal_consistency,
 }


--- a/tests/test_apkg_builder.py
+++ b/tests/test_apkg_builder.py
@ -0,0 +1,246 @@
+"""Unit tests for apkg_builder — Sprint 15 learnings.
+
+Tests cover: cloze prefix preservation, Hebrew spoiler stripping from English
+meanings, PoS exact matching, gender field population, and mishkal data integrity.
+"""
+
+import json
+import re
+import sys
+from pathlib import Path
+
+import pytest
+
+# Ensure project root is on path
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+
+from apkg_builder import _categorize_pos, _cloze_prefix_len
+
+# ---------------------------------------------------------------------------
+# Cloze prefix preservation
+# ---------------------------------------------------------------------------
+
+
+class TestClozePrefix:
+    """_cloze_prefix_len must detect Hebrew prefix letters before the word."""
+
+    def test_single_prefix_bet(self):
+        # בַּתּוֹר = bet + patach + tor
+        assert _cloze_prefix_len("בַּתּוֹר", "תּוֹר") > 0
+
+    def test_single_prefix_lamed(self):
+        # לַמֶּלֶךְ = lamed + patach + melech
+        assert _cloze_prefix_len("לַמֶּלֶךְ", "מֶּלֶךְ") > 0
+
+    def test_two_consonant_prefix(self):
+        # שֶׁבַּתּוֹר = shin + bet + tor (two prefix letters)
+        token = "שֶׁבַּתּוֹר"
+        word = "תּוֹר"
+        prefix_len = _cloze_prefix_len(token, word)
+        assert prefix_len > 0
+        assert token[prefix_len:].startswith(word)
+
+    def test_no_prefix_direct_match(self):
+        # Word appears at start — no prefix
+        assert _cloze_prefix_len("תּוֹר", "תּוֹר") == 0
+
+    def test_empty_inputs(self):
+        assert _cloze_prefix_len("", "תּוֹר") == 0
+        assert _cloze_prefix_len("בַּתּוֹר", "") == 0
+        assert _cloze_prefix_len("", "") == 0
+
+    def test_non_prefix_letter_returns_zero(self):
+        # If the "prefix" chars aren't valid prefix letters, return 0
+        # 'ת' is not in _PREFIX_LETTERS (בהוכלמש)
+        assert _cloze_prefix_len("תַּתּוֹר", "תּוֹר") == 0
+
+    def test_prefix_preserves_nikkud(self):
+        # Verify that prefix_len includes nikkud marks
+        token = "בַּתּוֹר"
+        word = "תּוֹר"
+        prefix_len = _cloze_prefix_len(token, word)
+        prefix = token[:prefix_len]
+        # Prefix should contain at least bet + nikkud mark(s)
+        base_letters = [c for c in prefix if "\u05d0" <= c <= "\u05ea"]
+        assert base_letters == ["ב"]
+
+
+# ---------------------------------------------------------------------------
+# PoS exact matching (no substring collisions)
+# ---------------------------------------------------------------------------
+
+
+class TestCategorizePos:
+    """_categorize_pos must not let 'Pronoun' match 'Noun'."""
+
+    def test_noun_exact(self):
+        assert _categorize_pos("Noun") == "Noun"
+
+    def test_pronoun_is_other(self):
+        assert _categorize_pos("Pronoun") == "Other"
+
+    def test_verb_exact(self):
+        assert _categorize_pos("Verb") == "Verb"
+
+    def test_noun_with_dash(self):
+        assert _categorize_pos("Noun – masculine") == "Noun"
+
+    def test_adjective(self):
+        assert _categorize_pos("Adjective") == "Adjective"
+
+    def test_conjunction_is_other(self):
+        assert _categorize_pos("Conjunction") == "Other"
+
+
+# ---------------------------------------------------------------------------
+# Hebrew spoiler stripping from English meanings
+# ---------------------------------------------------------------------------
+
+
+class TestHebrewSpoilerStripping:
+    """English meanings must not contain Hebrew text (spoils the card)."""
+
+    # Use the same regex from apkg_builder.py
+    HEBREW_STRIP_RE = re.compile(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*")
+
+    @staticmethod
+    def _strip_hebrew(meaning: str) -> str:
+        """Replicate the meaning cleaning pipeline from build_vocab_deck."""
+        meaning = re.sub(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*", "", meaning)
+        meaning = re.sub(r"[;:]\s*—", " —", meaning)
+        meaning = re.sub(r";\s*:", ";", meaning)
+        return re.sub(r"\s{2,}", " ", meaning).strip(", ;:")
+
+    def test_pure_english_unchanged(self):
+        assert self._strip_hebrew("to eat, to consume") == "to eat, to consume"
+
+    def test_hebrew_word_removed(self):
+        result = self._strip_hebrew("to eat; אכל")
+        assert "אכל" not in result
+
+    def test_hebrew_with_nikkud_removed(self):
+        result = self._strip_hebrew("tall; גָּבוֹהַּ")
+        assert "גָּבוֹהַּ" not in result
+        assert "tall" in result
+
+    def test_no_residual_hebrew_in_real_data(self):
+        """Scan actual words.json — no meaning should contain Hebrew after stripping."""
+        words_path = Path(__file__).resolve().parent.parent / "data" / "words.json"
+        if not words_path.exists():
+            pytest.skip("words.json not available")
+
+        with open(words_path, encoding="utf-8") as f:
+            words = json.load(f)
+
+        # The regex used in apkg_builder
+        hebrew_re = re.compile(r"[\u05D0-\u05EA]")
+        spoilers = []
+        for key, entry in words.items():
+            meaning = entry.get("meaning") or ""
+            cleaned = self._strip_hebrew(meaning)
+            if hebrew_re.search(cleaned):
+                spoilers.append(f"{key}: {cleaned!r}")
+
+        assert not spoilers, f"Hebrew found in {len(spoilers)} meanings after stripping: {spoilers[:5]}"
+
+
+# ---------------------------------------------------------------------------
+# Gender field for nouns (words.json data integrity)
+# ---------------------------------------------------------------------------
+
+
+class TestGenderDataIntegrity:
+    """Nouns with noun_inflection should have gender populated."""
+
+    @pytest.fixture()
+    def words(self):
+        words_path = Path(__file__).resolve().parent.parent / "data" / "words.json"
+        if not words_path.exists():
+            pytest.skip("words.json not available")
+        with open(words_path, encoding="utf-8") as f:
+            return json.load(f)
+
+    def test_nouns_have_gender(self, words):
+        """Nouns with noun_inflection should have a valid gender."""
+        missing = []
+        for key, entry in words.items():
+            pos = entry.get("pos") or ""
+            ni = entry.get("noun_inflection")
+            if pos.startswith("Noun") and ni:
+                gender = ni.get("gender") or ""
+                if gender not in ("masculine", "feminine", "masculine and feminine"):
+                    missing.append(f"{key}: gender={gender!r}")
+
+        # Allow up to 7% missing (loan words, compound words, etc.)
+        noun_count = sum(
+            1 for e in words.values() if (e.get("pos") or "").startswith("Noun") and e.get("noun_inflection")
+        )
+        if noun_count > 0:
+            pct_missing = len(missing) / noun_count
+            assert pct_missing < 0.07, f"{len(missing)}/{noun_count} nouns missing gender: {missing[:10]}"
+
+
+# ---------------------------------------------------------------------------
+# Mishkal data integrity
+# ---------------------------------------------------------------------------
+
+
+class TestMishkalIntegrity:
+    """Validate mishkal data consistency in words.json."""
+
+    @pytest.fixture()
+    def words(self):
+        words_path = Path(__file__).resolve().parent.parent / "data" / "words.json"
+        if not words_path.exists():
+            pytest.skip("words.json not available")
+        with open(words_path, encoding="utf-8") as f:
+            return json.load(f)
+
+    def test_mishkal_hebrew_matches_english(self, words):
+        """If mishkal and mishkal_hebrew are both set, they should correspond via _mishkal_to_hebrew."""
+        from pealim_detail_scrape import _mishkal_to_hebrew
+
+        mismatches = []
+        for key, entry in words.items():
+            for infl_key in ("noun_inflection", "adjective_inflection"):
+                infl = entry.get(infl_key)
+                if not infl:
+                    continue
+                mishkal_eng = infl.get("mishkal") or ""
+                mishkal_heb = infl.get("mishkal_hebrew") or ""
+                if mishkal_eng and mishkal_heb:
+                    expected = _mishkal_to_hebrew(mishkal_eng) or ""
+                    if expected and expected != mishkal_heb:
+                        mismatches.append(f"{key}: {mishkal_eng}→{mishkal_heb} (expected {expected})")
+
+        assert not mismatches, f"{len(mismatches)} mishkal mismatches: {mismatches[:10]}"
+
+    def test_mishkal_hebrew_is_hebrew(self, words):
+        """mishkal_hebrew must contain Hebrew characters."""
+        hebrew_re = re.compile(r"[\u05D0-\u05EA]")
+        bad = []
+        for key, entry in words.items():
+            for infl_key in ("noun_inflection", "adjective_inflection"):
+                infl = entry.get(infl_key)
+                if not infl:
+                    continue
+                mishkal_heb = infl.get("mishkal_hebrew") or ""
+                if mishkal_heb and not hebrew_re.search(mishkal_heb):
+                    bad.append(f"{key}: mishkal_hebrew={mishkal_heb!r}")
+
+        assert not bad, f"{len(bad)} non-Hebrew mishkal_hebrew values: {bad[:10]}"
+
+    def test_no_orphaned_mishkal(self, words):
+        """If mishkal_hebrew is set, mishkal (English) must also be set."""
+        orphans = []
+        for key, entry in words.items():
+            for infl_key in ("noun_inflection", "adjective_inflection"):
+                infl = entry.get(infl_key)
+                if not infl:
+                    continue
+                mishkal_heb = infl.get("mishkal_hebrew") or ""
+                mishkal_eng = infl.get("mishkal") or ""
+                if mishkal_heb and not mishkal_eng:
+                    orphans.append(f"{key}: has mishkal_hebrew but no mishkal")
+
+        assert not orphans, f"{len(orphans)} orphaned mishkal_hebrew: {orphans[:10]}"