Sprint 14: deck template/CSS overhaul + Sprint 12 detail scrape

Template & CSS fixes (15 items from Mar 9 feedback):
- Fix conjugation front showing 3ms form instead of infinitive
- Rename conjugation model to "Hebrew Conjugation"
- Strip Hebrew parenthesized text from English meanings
- Shoresh separator: spaces → dots (א.כ.ל)
- Remove duplicate English meaning from cloze back
- Remove example sentences from vocab front/back (cloze only)
- Center-align audio buttons on all decks
- Fix parenthesis spacing: "you(feminine,singular)" → "you (feminine, singular)"
- Unify sec-key/sec-label fonts, make keys bold
- Size overhaul: bigger Hebrew (42px), meaning (34px), secondary (28px)
- Center-align related words groups
- Sort confusables by average frequency
- Plurals: show Gender (Hebrew) before Mishkal, strip emoji from meaning
- Clean duplicate quotation marks in cloze sentences

Sprint 12 carry-forward (detail scrape + EPUB):
- Adjective/preposition detail scraping in pealim_detail_scrape.py
- EPUB example matching rewrite in epub_examples.py
- Delete benyehuda.py and rebuild_sentence_matches.py (merged)
- 49 parser tests for detail scraping
- SCHEMA.yaml updates for new fields

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
feat: YAP-cleaned frequency corpus + two-tier assignment pipeline
2026-03-10 07:44:47 +00:00 · 2026-03-10 06:22:55 +00:00 · 2026-03-09 04:33:32 +00:00
15 changed files with 1885939 additions and 65808 deletions
--- a/SCHEMA.yaml
+++ b/SCHEMA.yaml
@ -138,11 +138,53 @@ entry:
  #     ktiv_male: "שומר"

  # --- Adjective-specific ---
-  adjective_inflection: null       # Reserved for future use
+  adjective_inflection: null       # null for non-adjectives
  # When populated:
-  #   ms/fs/mp/fp forms with nikkud/ktiv_male subfields
+  #   ms:
+  #     nikkud: "גָּדוֹל"
+  #     ktiv_male: "גדול"
+  #   fs:
+  #     nikkud: "גְּדוֹלָה"
+  #     ktiv_male: "גדולה"
+  #   mp:
+  #     nikkud: "גְּדוֹלִים"
+  #     ktiv_male: "גדולים"
+  #   fp:
+  #     nikkud: "גְּדוֹלוֹת"
+  #     ktiv_male: "גדולות"
+  #   mishkal: "CaCaC"             # English mishkal name (scraped from pealim PoS section)
+  #   mishkal_hebrew: "קָטָל"      # Hebrew mishkal name (computed via mapping)

  # --- Preposition-specific ---
-  preposition_inflection: null     # Reserved for future use
+  preposition_inflection: null     # null for non-prepositions
  # When populated:
-  #   Inflected forms with pronominal suffixes (e.g. שלי, שלך, שלו...)
+  #   1s:
+  #     nikkud: "שֶׁלִּי"
+  #     ktiv_male: "שלי"
+  #   1p:
+  #     nikkud: "שֶׁלָּנוּ"
+  #     ktiv_male: "שלנו"
+  #   2ms:
+  #     nikkud: "שֶׁלְּךָ"
+  #     ktiv_male: "שלך"
+  #   2fs:
+  #     nikkud: "שֶׁלָּךְ"
+  #     ktiv_male: "שלך"
+  #   2mp:
+  #     nikkud: "שֶׁלָּכֶם"
+  #     ktiv_male: "שלכם"
+  #   2fp:
+  #     nikkud: "שֶׁלָּכֶן"
+  #     ktiv_male: "שלכן"
+  #   3ms:
+  #     nikkud: "שֶׁלּוֹ"
+  #     ktiv_male: "שלו"
+  #   3fs:
+  #     nikkud: "שֶׁלָּהּ"
+  #     ktiv_male: "שלה"
+  #   3mp:
+  #     nikkud: "שֶׁלָּהֶם"
+  #     ktiv_male: "שלהם"
+  #   3fp:
+  #     nikkud: "שֶׁלָּהֶן"
+  #     ktiv_male: "שלהן"
--- a/apkg_builder.py
+++ b/apkg_builder.py
@ -35,7 +35,7 @@ COMPLETE_PLURAL_DECK_ID = 1_234_567_903

 # Release version tag added to all notes so users can identify which release
 # their cards come from (visible in Anki's Browse view and card info).
-RELEASE_TAG = "v0.15.1"
+RELEASE_TAG = "v0.16"

 # Regex for extracting emoji and Hebrew prepositions from meaning strings
 EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
@ -117,13 +117,15 @@ CARD_CSS = """
 .card {
  font-family: 'Heebo', 'Arial Hebrew', 'David', Arial, sans-serif;
  font-size: 20px;
-  text-align: center;
+  text-align: right;
  color: #222;
  background: #fff;
  padding: 16px;
+  max-width: 600px;
+  margin: 0 auto;
 }
 .hebrew {
-  font-size: 36px;
+  font-size: 42px;
  font-weight: bold;
  direction: rtl;
  text-align: center;
@ -131,32 +133,34 @@ CARD_CSS = """
  color: #222;
 }
 .hebrew-sm {
-  font-size: 24px;
+  font-size: 30px;
  font-weight: normal;
  direction: rtl;
  text-align: center;
-  color: #333;
+  color: #222;
 }
 .meaning {
-  font-size: 28px;
+  font-size: 34px;
  color: #1a1a8c;
  margin: 8px 0;
+  text-align: center;
 }
 .hint {
-  font-size: 16px;
-  color: #888;
+  font-size: 22px;
+  color: #555;
  margin: 4px 0;
  direction: rtl;
+  text-align: center;
 }
 .root-info {
-  font-size: 18px;
-  color: #555;
+  font-size: 26px;
+  color: #222;
  margin-top: 6px;
  direction: rtl;
 }
 .example {
-  font-size: 18px;
-  color: #444;
+  font-size: 24px;
+  color: #222;
  direction: rtl;
  text-align: right;
  font-style: italic;
@ -182,16 +186,17 @@ CARD_CSS = """
  color: #555;
 }
 .sec-label {
-  font-size: 20px;
+  font-size: 28px;
  font-weight: normal;
-  color: #555;
+  color: #222;
  direction: rtl;
  text-align: center;
  margin-top: 6px;
 }
 .sec-key {
-  font-size: 18px;
-  color: #888;
+  font-size: 28px;
+  color: #222;
+  font-weight: bold;
 }
 .definitions {
  direction: rtl;
@ -199,32 +204,37 @@ CARD_CSS = """
 }
 .conf-entry {
  margin: 8px 0;
-  font-size: 20px;
+  font-size: 28px;
  direction: rtl;
 }
 .related-group {
  direction: rtl;
-  text-align: right;
+  text-align: center;
  margin: 2px 0;
-  font-size: 18px;
+  font-size: 26px;
 }
 .emoji-img {
  font-size: 3.5em;
  text-align: center;
  margin: 0.3em 0;
 }
+.card [type="button"], .card button, .replay-button {
+  display: block !important;
+  margin: 4px auto !important;
+  text-align: center;
+}
@media (prefers-color-scheme: dark) {
  .card        { color: #e8e8e8; background: #1c1c1e; }
  .hebrew      { color: #f0f0f0; }
-  .hebrew-sm   { color: #ddd; }
+  .hebrew-sm   { color: #e0e0e0; }
  .meaning     { color: #82b0ff; }
-  .root-info   { color: #aaa; }
-  .sec-label   { color: #aaa; }
-  .sec-key     { color: #666; }
+  .root-info   { color: #e0e0e0; }
+  .sec-label   { color: #e0e0e0; }
+  .sec-key     { color: #e0e0e0; }
  .conf-entry  { color: #ddd; }
  .hint        { color: #777; }
  .voice-label { color: #888; }
-  .example     { color: #bbb; border-right-color: #555; }
+  .example     { color: #e0e0e0; border-right-color: #555; }
  .divider     { border-top-color: #333; }
  .freq-badge  { color: #888; border-color: #444; }
 }
@ -252,9 +262,6 @@ VOCAB_BACK_HEB = """
 <div class="root-info">{{SharedRoots}}</div>
 {{/SharedRoots}}
 {{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
-{{#Example}}
-<div class="example">{{Example}}</div>
-{{/Example}}
 {{#Frequency}}<div class="freq-badge">#{{Frequency}}</div>{{/Frequency}}
 """

@ -273,14 +280,15 @@ VOCAB_BACK_ENG = """
 {{#WordNoNikkud}}<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span> {{WordNoNikkud}}</div>{{/WordNoNikkud}}
 {{#Root}}<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span> {{Root}}</div>{{/Root}}
 {{#PoS}}<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span> {{PoS}}</div>{{/PoS}}
+{{#SharedRoots}}
+<div class="sec-label"><span class="sec-key">מִילִים קְשׁוּרוֹת:</span></div>
+<div class="root-info">{{SharedRoots}}</div>
+{{/SharedRoots}}
 {{#Plural}}<div class="sec-label"><span class="sec-key">רַבִּים:</span> {{Plural}}</div>{{/Plural}}
-{{#Example}}
-<div class="example">{{Example}}</div>
-{{/Example}}
 """

 VOCAB_FRONT_CLOZE = """
-<div class="example" style="font-size:24px;font-style:normal;border:none;padding:0;text-align:center;">{{ClozeExample}}</div>
+<div class="example" style="font-size:32px;font-style:normal;border:none;padding:0;text-align:center;">{{ClozeExample}}</div>
 {{#ClozeHint}}<div class="hint">{{ClozeHint}}</div>{{/ClozeHint}}
 """

@ -289,7 +297,6 @@ VOCAB_BACK_CLOZE = """
 <div class="divider"></div>
 <div class="hebrew">{{Word}}</div>
 {{#Audio}}<div>{{Audio}}</div>{{/Audio}}
-<div class="meaning">{{Meaning}}</div>
 """

 VOCAB_MODEL = genanki.Model(
@ -343,8 +350,8 @@ VOCAB_MODEL = genanki.Model(

 CONJ_FRONT = """
 <div class="sec-label">אֵיךְ אוֹמְרִים</div>
-<div class="hebrew" style="color:#1a1a8c;">{{ReferenceForm}}{{#Prep}} ({{Prep}}){{/Prep}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
 <div class="hebrew">{{Pronoun}}</div>
+<div class="hebrew" style="color:#1a1a8c;">{{Infinitive}}{{#Prep}} <span class="hebrew-sm">({{Prep}})</span>{{/Prep}}{{#Voice}} <span class="voice-label">({{Voice}})</span>{{/Voice}}</div>
 <div class="hebrew">{{Tense}}</div>
 """

@ -363,7 +370,7 @@ CONJ_CSS = CARD_CSS

 CONJ_MODEL = genanki.Model(
    CONJ_MODEL_ID,
-    "Pealim Conjugation",
+    "Hebrew Conjugation",
    fields=[
        {"name": "Infinitive"},
        {"name": "ReferenceForm"},
@ -666,8 +673,9 @@ def _load_emoji_lookup() -> dict[str, str]:

 def _categorize_pos(pos_str: str) -> str:
    """Return the canonical PoS category key for grouping."""
+    base = pos_str.split("–")[0].split("—")[0].strip()
    for cat in POS_CATEGORY_LABELS:
-        if cat.lower() in pos_str.lower():
+        if base == cat:
            return cat
    return "Other"

@ -745,10 +753,14 @@ def build_vocab_deck(
        word_nikkud = entry["word"]["nikkud"]
        word_no_nik = entry["word"].get("ktiv_male", "")
        root_list = entry.get("root") or []
-        root = " ".join(root_list)
+        root = ".".join(root_list)
        pos_raw = entry.get("pos", "")
        pos_heb = entry.get("pos_hebrew", "")
-        meaning = entry.get("meaning", "") or ""
+        meaning = EMOJI_RE.sub("", entry.get("meaning", "") or "").strip()
+        meaning = HBPAREN_RE.sub("", meaning).strip()
+        meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ")
+        meaning = re.sub(r"(\w)\(", r"\1 (", meaning)  # space before opening paren
+        meaning = re.sub(r",(\S)", r", \1", meaning)  # space after comma
        meaning_raw = entry.get("meaning_raw", "") or ""
        slug = entry.get("slug", "") or ""
        frequency = entry.get("frequency") or 999_999
@ -839,6 +851,9 @@ def build_vocab_deck(
            end = cloze_data.get("cloze_word_end")
            if cloze_text and start is not None and end is not None:
                cloze_example = cloze_text[:start] + "_____" + cloze_text[end:]
+                # Clean up duplicate/misplaced quotation marks
+                cloze_example = re.sub(r'["״]\s*["״]', '"', cloze_example)
+                cloze_example = re.sub(r'^\s*["״]\s*', "", cloze_example)
                raw_hint = cloze_data.get("cloze_hint") or ""
                if raw_hint:
                    cloze_hint = raw_hint
@ -871,11 +886,12 @@ def build_vocab_deck(
                    parts.append(f'<div class="related-group"><b>{label}:</b> {" ".join(rw_words)}</div>')
            related_html = "\n".join(parts)

-        # Plural form (for nouns)
+        # Plural form (nouns only — guard against adjective/verb inflection bleed)
        plural_str = ""
-        noun_inflection = entry.get("noun_inflection")
-        if noun_inflection and noun_inflection.get("plural"):
-            plural_str = noun_inflection["plural"].get("nikkud", "")
+        if pos_raw.startswith("Noun"):
+            noun_inflection = entry.get("noun_inflection")
+            if noun_inflection and noun_inflection.get("plural"):
+                plural_str = noun_inflection["plural"].get("nikkud", "")

        # Image
        image_tag = ""
@ -977,18 +993,28 @@ def build_conj_deck(
        binyan_heb: str = conj.get("binyan_hebrew") or BINYAN_TO_HEBREW.get(binyan, binyan) or ""
        slug = entry.get("slug", "") or ""
        root_list = entry.get("root") or []
-        root = " ".join(root_list)
+        root = ".".join(root_list)
        voice = VOICE_MAP.get(binyan, "")

+        meaning_raw = entry.get("meaning_raw", "") or ""
        meaning = entry.get("meaning", "") or ""
-        # Extract Hebrew preposition from meaning_raw
+        # Extract Hebrew preposition — strip from meaning, show on Hebrew side
        prep_str = ""
        conj_prep = conj.get("prep")
        if conj_prep:
-            prep_str = f"({conj_prep})"
-        elif meaning:
-            preps = HBPAREN_RE.findall(entry.get("meaning_raw", "") or "")
-            prep_str = " ".join(f"({p})" for p in preps)
+            # Strip any parentheses from stored prep value
+            prep_str = conj_prep.strip("() ")
+        elif meaning_raw:
+            preps = HBPAREN_RE.findall(meaning_raw)
+            if preps:
+                prep_str = preps[0]
+        # Strip Hebrew prepositions from English meaning to avoid duplication
+        if prep_str:
+            meaning = HBPAREN_RE.sub("", meaning).strip()
+            # Also strip from meaning_raw patterns like "(על)"
+            meaning = re.sub(r"\(\s*" + re.escape(prep_str) + r"\s*-?\s*\)", "", meaning).strip()
+            # Clean up double spaces and trailing commas
+            meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ")

        related = [w for w in root_words.get(root, []) if w != infinitive]
        related_str = " ".join(related[:8]) if related else ""
@ -1024,7 +1050,7 @@ def build_conj_deck(
            elif guid_candidates:
                note_guid = guid_candidates[0]
            else:
-                note_guid = genanki.guid_for(_infinitive, pronoun, tense)
+                note_guid = genanki.guid_for(_infinitive, pronoun, tense, _binyan_heb)
            note = genanki.Note(
                model=CONJ_MODEL,
                guid=note_guid,
@ -1213,8 +1239,10 @@ def build_conj_deck(
 # ──────────────────────────────────────────────────────────────────────────────

 CONF_FRONT = """
+<div style="direction:rtl; text-align:center;">
 <div class="hebrew" style="font-size:36px;">{{Words}}</div>
 <div class="meaning" style="font-size:32px; direction:rtl; text-align:center;">מה ההבדל?</div>
+</div>
 """

 CONF_BACK = """
@ -1271,7 +1299,10 @@ def build_confusables_deck(
            guid = genanki.guid_for("confusable", entry["word"].get("ktiv_male", unique_key))
        guid_to_entries.setdefault(guid, []).append(entry)

-    for guid, group_entries in sorted(guid_to_entries.items(), key=lambda x: x[0]):
+    for guid, group_entries in sorted(
+        guid_to_entries.items(),
+        key=lambda x: sum(e.get("frequency") or 999_999 for e in x[1]) / len(x[1]),
+    ):
        if guid in seen_guids:
            continue
        seen_guids.add(guid)
@ -1366,6 +1397,7 @@ PLURAL_BACK_SG = """
 {{FrontSide}}<hr>
 <div class="hebrew">{{Plural}}</div>
 {{#PluralAudio}}<div>{{PluralAudio}}</div>{{/PluralAudio}}
+{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span> {{Gender}}</div>{{/Gender}}
 {{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
 """

@ -1380,6 +1412,7 @@ PLURAL_BACK_PL = """
 <div class="hebrew">{{Singular}}</div>
 {{#SingularAudio}}<div>{{SingularAudio}}</div>{{/SingularAudio}}
 <div class="sec-label">{{Meaning}}</div>
+{{#Gender}}<div class="sec-label"><span class="sec-key">מִין:</span> {{Gender}}</div>{{/Gender}}
 {{#Mishkal}}<div class="sec-label"><span class="sec-key">מִשְׁקָל:</span> {{Mishkal}}</div>{{/Mishkal}}
 """

@ -1483,10 +1516,11 @@ def build_plural_deck(
        plural = noun_inflection["plural"]["nikkud"]
        plural_ktiv = noun_inflection["plural"].get("ktiv_male", "")
        gender = noun_inflection.get("gender") or ""
+        gender_heb = {"masculine": "זָכָר", "feminine": "נְקֵבָה"}.get(gender, gender)
        mishkal = noun_inflection.get("mishkal") or ""
-        meaning = entry.get("meaning") or ""
+        meaning = EMOJI_RE.sub("", entry.get("meaning") or "").strip()
        root_list = entry.get("root") or []
-        root = " ".join(root_list)
+        root = ".".join(root_list)

        # GUID from noun_inflection
        note_guid_raw = noun_inflection.get("plurals_guid")
@ -1520,7 +1554,7 @@ def build_plural_deck(
                meaning,
                root,
                mishkal,
-                gender,
+                gender_heb,
            ],
            tags=tags,
        )
--- a/benyehuda.py
+++ b/benyehuda.py
@ -1,202 +0,0 @@
-#!/usr/bin/env python3
-"""
-Ben Yehuda corpus example-sentence lookup (nikkud corpus).
-
-TODO: Rewrite to update words.json examples fields directly instead of
-writing to a separate examples_cache.json. Currently the migration script
-bridges the gap. See Phase 5 in SPRINT_LOG.md.
-
-Downloads the nikkud-bearing plaintext ZIP once, indexes sentences by nikkud word form,
-then answers queries locally.
-
-Exposed API:
-  load(force_rebuild=False)
-  get_examples(word_nikkud) -> list[str]   (returns 0 or 1 examples)
-  save_examples_cache()
-"""
-
-import json
-import logging
-import re
-import zipfile
-from io import BytesIO
-from pathlib import Path
-
-import requests
-
-from helpers import strip_nikkud as _strip_nikkud
-
-logger = logging.getLogger(__name__)
-
-# Nikkud-bearing corpus (txt.zip instead of txt_stripped.zip)
-CORPUS_URL = "https://github.com/projectbenyehuda/public_domain_dump/releases/download/2025-10/txt.zip"
-INDEX_PATH = Path(__file__).parent / "data" / "benyehuda_index.json"
-EXAMPLES_CACHE_PATH = Path(__file__).parent / "data" / "examples_cache.json"
-REQUEST_TIMEOUT = 120
-MIN_SENTENCE_LEN = 20
-MAX_SENTENCE_LEN = 200
-MAX_INDEX_ENTRIES = 500  # cap examples kept per word in index to limit memory
-
-# Module-level state
-_index: dict[str, list[str]] = {}  # word (with nikkud) -> [sentence, ...]
-_examples_cache: dict[str, list[str]] = {}  # word -> cached result for this run
-
-
-def _split_sentences(text: str) -> list[str]:
-    """
-    Split text into sentences on newlines only (Hebrew sentences don't have
-    mid-word period issues like English).  Min 20 chars, max 200 chars.
-    """
-    out = []
-    for line in text.split("\n"):
-        s = line.strip().strip("\"'.,;:!?")
-        s = s.strip()
-        if MIN_SENTENCE_LEN <= len(s) <= MAX_SENTENCE_LEN:
-            out.append(s)
-    return out
-
-
-def _build_index(corpus_zip_bytes: bytes) -> None:
-    """Parse corpus ZIP and build word (nikkud) → sentences index."""
-    global _index
-    _index = {}
-    logger.info("Building Ben Yehuda index from nikkud corpus …")
-
-    with zipfile.ZipFile(BytesIO(corpus_zip_bytes)) as zf:
-        txt_files = [n for n in zf.namelist() if n.endswith(".txt")]
-        logger.info(f"  Corpus contains {len(txt_files)} text files")
-        for fname in txt_files:
-            try:
-                raw = zf.read(fname).decode("utf-8", errors="ignore")
-            except Exception:  # noqa: S112
-                continue
-            for sentence in _split_sentences(raw):
-                # Index by each unique Hebrew token (with nikkud) in the sentence
-                words = re.findall(r"[\u05d0-\u05ea\u05b0-\u05c7'\"]+", sentence)
-                for w in set(words):
-                    if len(w) >= 2:
-                        bucket = _index.setdefault(w, [])
-                        if len(bucket) < MAX_INDEX_ENTRIES:
-                            bucket.append(sentence)
-
-    logger.info(f"Index built: {len(_index)} unique word forms")
-
-
-def _save_index() -> None:
-    INDEX_PATH.parent.mkdir(parents=True, exist_ok=True)
-    with open(INDEX_PATH, "w", encoding="utf-8") as f:
-        json.dump(_index, f, ensure_ascii=False)
-    logger.info(f"Ben Yehuda index saved → {INDEX_PATH}")
-
-
-def _load_index() -> None:
-    global _index
-    with open(INDEX_PATH, encoding="utf-8") as f:
-        _index = json.load(f)
-    logger.info(f"Ben Yehuda index loaded: {len(_index)} word forms")
-
-
-def load(force_rebuild: bool = False) -> None:
-    """Load or build the Ben Yehuda index. Downloads corpus if needed."""
-    global _index, _examples_cache
-    if _index and not force_rebuild:
-        return
-
-    if force_rebuild:
-        # Delete old index and discard examples cache
-        if INDEX_PATH.exists():
-            INDEX_PATH.unlink()
-            logger.info("Deleted old Ben Yehuda index (force rebuild)")
-        _examples_cache = {}
-    else:
-        # Load persisted examples cache (not needed on rebuild)
-        if EXAMPLES_CACHE_PATH.exists():
-            with open(EXAMPLES_CACHE_PATH, encoding="utf-8") as f:
-                _examples_cache = json.load(f)
-
-    if INDEX_PATH.exists():
-        _load_index()
-        return
-
-    logger.info("Downloading Ben Yehuda nikkud corpus … (this may take 2-3 minutes)")
-    resp = requests.get(CORPUS_URL, timeout=REQUEST_TIMEOUT, stream=True)
-    resp.raise_for_status()
-    data = resp.content
-    logger.info(f"Corpus downloaded: {len(data) / 1e6:.1f} MB")
-
-    _build_index(data)
-    _save_index()
-
-
-def save_examples_cache() -> None:
-    EXAMPLES_CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
-    with open(EXAMPLES_CACHE_PATH, "w", encoding="utf-8") as f:
-        json.dump(_examples_cache, f, ensure_ascii=False)
-    logger.info(f"Examples cache saved: {len(_examples_cache)} entries → {EXAMPLES_CACHE_PATH}")
-
-
-def get_examples(word_nikkud: str, confusable_consonants: set[str] | None = None) -> list[str]:
-    """
-    Return 0 or 1 example sentences for the given word (nikkud form).
-
-    Lookup strategy:
-    1. Try exact nikkud match in index.
-    2. Fall back to stripped (no-nikkud) match against index keys.
-       Skipped when word's consonants are in confusable_consonants set
-       (to avoid returning sentences for the wrong homograph).
-
-    Returns the single longest sentence ≤ MAX_SENTENCE_LEN that contains
-    the word as a whole token.
-    """
-    if not _index:
-        load()
-
-    word = word_nikkud.strip()
-    word_stripped = _strip_nikkud(word)
-
-    cache_key = word
-
-    if cache_key in _examples_cache:
-        return _examples_cache[cache_key]
-
-    # Lookup: try exact nikkud first, then stripped fallback
-    candidates = _index.get(word, [])
-    if not candidates and word_stripped and word_stripped not in (confusable_consonants or set()):
-        # Try looking up by stripped form across index keys
-        for k, v in _index.items():
-            if _strip_nikkud(k) == word_stripped:
-                candidates = v
-                break
-
-    # Filter: word must appear as a whole token
-    # Match the stripped form (for robustness with nikkud variants in sentence)
-    if word_stripped:
-        pattern = r"(?<!\w)" + re.escape(word_stripped) + r"(?!\w)"
-        matched = [s for s in candidates if re.search(pattern, _strip_nikkud(s))]
-    else:
-        matched = candidates[:]
-
-    # Filter by length
-    matched = [s for s in matched if MIN_SENTENCE_LEN <= len(s) <= MAX_SENTENCE_LEN]
-
-    # Return the single longest sentence ≤ MAX_SENTENCE_LEN
-    if matched:
-        best = max(matched, key=len)
-        result = [best]
-    else:
-        result = []
-
-    _examples_cache[cache_key] = result
-    return result
-
-
-if __name__ == "__main__":
-    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
-    load()
-    tests = ["שָׁלוֹם", "בַּיִת", "סֵפֶר", "מַיִם", "אַהֲבָה", "יֶלֶד"]
-    for w in tests:
-        exs = get_examples(w)
-        print(f"\n{w}: {len(exs)} example(s)")
-        for ex in exs:
-            print(f"  → {ex[:100]}")
-    save_examples_cache()
--- a/data/frequency_clean.json
+++ b/data/frequency_clean.json
--- a/data/frequency_discarded.json
+++ b/data/frequency_discarded.json
--- a/data/words.json
+++ b/data/words.json
--- a/epub_examples.py
+++ b/epub_examples.py
@ -0,0 +1,791 @@
+#!/usr/bin/env python3
+"""
+Extract example sentences from nikud'd Hebrew EPUB files, match them against
+the vocabulary list in data/words.json, and write matched examples back into
+words.json.
+
+Usage (standalone):
+    python3 epub_examples.py
+
+Called from run.py via:
+    run(words)  — words dict is passed in and updated in place
+"""
+
+import logging
+import os
+import re
+import zipfile
+from html.parser import HTMLParser
+from pathlib import Path
+
+from helpers import strip_nikkud
+
+logger = logging.getLogger(__name__)
+
+# Filesystem layout, anchored at the directory that contains this module.
+DATA_DIR = Path(__file__).parent / "data"
+EPUB_DIR = DATA_DIR / "epubs"  # scanned for *.epub files by _discover_epubs()
+WORDS_JSON = DATA_DIR / "words.json"  # vocabulary file named in the module docstring
+
+
+# Book discovery: EPUB file path -> short display name
+def _discover_epubs() -> dict[str, str]:
+    """Scan EPUB_DIR for ``*.epub`` files and give each one a short label.
+
+    Returns:
+        Mapping of EPUB path (as a string) to a display name, built in sorted
+        path order. Empty when EPUB_DIR does not exist.
+    """
+    discovered: dict[str, str] = {}
+    if not EPUB_DIR.exists():
+        return discovered
+
+    for epub_path in sorted(EPUB_DIR.glob("*.epub")):
+        raw_stem = epub_path.stem
+        plain_stem = strip_nikkud(raw_stem).lower()
+        # Title = text before the first " -- " separator, nikkud removed.
+        title = strip_nikkud(raw_stem.split(" -- ")[0]).strip().lower()
+
+        if "alice" in plain_stem or "אליס" in title:
+            label = "alice_wonderland"
+        elif "little_prince" in plain_stem or "נסיך" in title:
+            label = "little_prince"
+        elif "מנהרת" in title or "time_tunnel" in plain_stem:
+            # Series volume: prefer the first run of digits in the stem;
+            # otherwise use the stem with the "time_tunnel_" text removed.
+            digits = re.search(r"(\d+)", plain_stem)
+            suffix = digits.group(1) if digits else plain_stem.replace("time_tunnel_", "")
+            label = f"time_tunnel_{suffix}"
+        else:
+            # Unknown book: fall back to a truncated, nikkud-free stem.
+            label = plain_stem[:40]
+        discovered[str(epub_path)] = label
+    return discovered
+
+
+# Sentence length bounds, counted in Hebrew words; both limits are inclusive
+# (see the range check in _split_into_sentences).
+MIN_WORDS = 4
+MAX_WORDS = 15
+
+
+# ── HTML text extraction ─────────────────────────────────────────
+
+
+class _TextExtractor(HTMLParser):
+    """Collect the text of an HTML document, ignoring script/style/head content.
+
+    Text chunks accumulate in ``parts``; a newline is recorded at every
+    block-level start tag so adjacent blocks do not run together.
+    """
+
+    SKIP_TAGS = {"script", "style", "head"}
+
+    # Start tags that introduce a line break in the extracted text.
+    _BLOCK_TAGS = frozenset(
+        {
+            "p",
+            "div",
+            "br",
+            "li",
+            "h1",
+            "h2",
+            "h3",
+            "h4",
+            "h5",
+            "h6",
+            "td",
+            "th",
+            "tr",
+            "blockquote",
+            "section",
+        }
+    )
+
+    def __init__(self):
+        super().__init__()
+        self.parts: list[str] = []
+        self._skip_depth = 0  # > 0 while inside any SKIP_TAGS element
+
+    def handle_starttag(self, tag, attrs):
+        del attrs  # part of the HTMLParser callback signature; unused here
+        if tag in self.SKIP_TAGS:
+            self._skip_depth += 1
+        # The separator is recorded regardless of the current skip depth.
+        if tag in self._BLOCK_TAGS:
+            self.parts.append("\n")
+
+    def handle_endtag(self, tag):
+        # Never let a stray closing tag push the depth below zero.
+        if tag in self.SKIP_TAGS and self._skip_depth > 0:
+            self._skip_depth -= 1
+
+    def handle_data(self, data):
+        if self._skip_depth:
+            return
+        self.parts.append(data)
+
+    def get_text(self) -> str:
+        """Return everything collected so far as one string."""
+        return "".join(self.parts)
+
+
+def extract_text_from_html(html: str) -> str:
+    """Parse HTML and return its plain text.
+
+    Args:
+        html: Markup of a single (X)HTML document.
+
+    Returns:
+        The text gathered by ``_TextExtractor``: script/style/head content is
+        left out and block-level start tags contribute newlines.
+    """
+    parser = _TextExtractor()
+    parser.feed(html)
+    # feed() may hold back trailing text (e.g. input ending in an unterminated
+    # "&..." reference) in case more data follows; close() flushes it.
+    parser.close()
+    return parser.get_text()
+
+
+# ── EPUB processing ──────────────────────────────────────────────
+
+
+def _content_files_from_epub(zf: zipfile.ZipFile) -> list[str]:
+    """Return the archive's content (X)HTML member names in reading order.
+
+    The first ``.opf`` member found is scanned with regexes for its manifest
+    (item id -> href) and spine (ordered idrefs). Each spine entry whose href
+    ends in ``.xhtml``/``.html`` is resolved relative to the OPF's directory
+    and kept if it names a member of the archive.
+
+    When there is no OPF, or the spine yields nothing, the result falls back
+    to all ``.xhtml``/``.html`` members sorted by name, excluding names that
+    contain "toc", "cover" or "nav" (the same filter for both fallbacks).
+
+    Args:
+        zf: An open EPUB archive.
+
+    Returns:
+        Member names suitable for ``zf.read()``.
+    """
+    import posixpath  # ZIP member names always use "/" separators
+
+    names = zf.namelist()
+    name_set = set(names)  # built once; membership is tested per spine item
+
+    def _fallback() -> list[str]:
+        """All (X)HTML members except TOC/cover/nav pages, sorted by name."""
+        return sorted(
+            n
+            for n in names
+            if n.endswith((".xhtml", ".html"))
+            and not any(marker in n.lower() for marker in ("toc", "cover", "nav"))
+        )
+
+    opf_path = next((n for n in names if n.endswith(".opf")), None)
+    if not opf_path:
+        return _fallback()
+
+    # Parse OPF to get spine order
+    opf_content = zf.read(opf_path).decode("utf-8")
+    opf_dir = posixpath.dirname(opf_path)
+
+    # Extract manifest items: id -> href
+    manifest: dict[str, str] = {}
+    for m in re.finditer(r'<item\s+[^>]*id="([^"]+)"[^>]*href="([^"]+)"', opf_content):
+        manifest[m.group(1)] = m.group(2)
+    # Also try reversed attribute order
+    for m in re.finditer(r'<item\s+[^>]*href="([^"]+)"[^>]*id="([^"]+)"', opf_content):
+        manifest[m.group(2)] = m.group(1)
+
+    # Extract spine order
+    spine_ids = re.findall(r'<itemref\s+[^>]*idref="([^"]+)"', opf_content)
+
+    result: list[str] = []
+    for sid in spine_ids:
+        href = manifest.get(sid, "")
+        if not href.endswith((".xhtml", ".html")):
+            continue
+        joined = posixpath.join(opf_dir, href) if opf_dir else href
+        joined = joined.replace("\\", "/")
+        # Try the literal path first, then the normalised one so hrefs that
+        # contain "./" or "../" segments still resolve to a real member.
+        for candidate in (joined, posixpath.normpath(joined)):
+            if candidate in name_set:
+                result.append(candidate)
+                break
+
+    return result or _fallback()
+
+
+def extract_sentences_from_epub(epub_path: Path, book_name: str) -> list[dict]:
+    """Extract sentences from an EPUB file.
+
+    Content documents are read in the order given by
+    ``_content_files_from_epub``; members that are missing or not valid
+    UTF-8 are skipped. The extracted text of all documents is joined with
+    newlines and split into sentences.
+
+    Args:
+        epub_path: Path to the .epub file.
+        book_name: Human-readable book name used as the ``source`` field.
+
+    Returns:
+        List of ``{"text": str, "source": str}`` dicts.
+    """
+    all_text: list[str] = []
+    # The context manager closes the archive handle even if reading fails.
+    with zipfile.ZipFile(epub_path) as zf:
+        for cf in _content_files_from_epub(zf):
+            try:
+                html = zf.read(cf).decode("utf-8")
+            except (KeyError, UnicodeDecodeError):
+                continue
+            all_text.append(extract_text_from_html(html))
+
+    full_text = "\n".join(all_text)
+    return _split_into_sentences(full_text, book_name)
+
+
+# ── Sentence splitting ───────────────────────────────────────────
+
+# Hebrew sentence terminators: period, exclamation, question mark, sof pasuk
+# (U+05C3). A run of consecutive terminators matches as one split point.
+_SENT_SPLIT = re.compile(r"[.!?\u05C3]+")
+
+# Punctuation to strip from word boundaries when matching.
+# Two anchored alternatives: a run of punctuation at the very start of the
+# string, or a run at the very end. The class covers ASCII and curly quotes,
+# gershayim/geresh (U+05F4/U+05F3), comma, semicolon, colon, hyphen, en/em
+# dash, ellipsis, brackets and guillemets.
+_PUNCT = re.compile(
+    r'^[\u0022\u0027\u05F4\u05F3,;:\-\u2013\u2014\u2026\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+|'
+    r'[\u0022\u0027\u05F4\u05F3,;:\-\u2013\u2014\u2026\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+$'
+)
+
+
+def _split_into_sentences(text: str, book_name: str) -> list[dict]:
+    """Break extracted text into sentence records of acceptable length.
+
+    Whitespace runs are collapsed to single spaces, the text is cut at every
+    match of ``_SENT_SPLIT``, and a fragment is kept only when its count of
+    Hebrew-containing tokens lies between MIN_WORDS and MAX_WORDS inclusive.
+
+    Args:
+        text: Raw text extracted from an EPUB.
+        book_name: Value stored under ``"source"`` in every returned dict.
+
+    Returns:
+        ``{"text": str, "source": str}`` dicts in first-seen order, with
+        exact-duplicate texts dropped.
+    """
+    collapsed = re.sub(r"\s+", " ", text).strip()
+
+    kept: list[dict] = []
+    emitted: set[str] = set()
+    for fragment in _SENT_SPLIT.split(collapsed):
+        candidate = fragment.strip()
+        # Blank fragments and repeats of an already-kept sentence are dropped.
+        if not candidate or candidate in emitted:
+            continue
+
+        # A token counts as Hebrew when any character is in U+0590..U+05FF,
+        # so numbers and Latin-only tokens do not affect the length check.
+        hebrew_count = sum(
+            1 for token in candidate.split() if any("\u0590" <= ch <= "\u05ff" for ch in token)
+        )
+        if not MIN_WORDS <= hebrew_count <= MAX_WORDS:
+            continue
+
+        emitted.add(candidate)
+        kept.append({"text": candidate, "source": book_name})
+    return kept
+
+
+# ── Nikkud index ─────────────────────────────────────────────────
+
+# Unicode ranges for Hebrew combining marks
+# The inclusive range U+05B0..U+05BD also contains the dagesh (U+05BC).
+_NIKKUD_LOW = 0x05B0  # start of vowel points (shva)
+_NIKKUD_HIGH = 0x05BD  # end of vowel range (meteg); 0x05BE is maqaf (punctuation)
+_DAGESH = "\u05bc"
+_SHIN_DOT = "\u05c1"
+_SIN_DOT = "\u05c2"
+
+# Valid prefix consonants (the same letters that key _VALID_PREFIX_MARKS)
+_PREFIX_CONSONANTS = set("בהוכלמש")
+
+# Named vowel combining marks
+_SHVA = "\u05b0"
+_HIRIQ = "\u05b4"
+_TSERE = "\u05b5"
+_SEGOL = "\u05b6"
+_PATACH = "\u05b7"
+_QAMATZ = "\u05b8"
+
+# Valid nikkud patterns on each prefix consonant.
+# Key = consonant, Value = set of frozensets of combining marks valid for that prefix.
+# Each frozenset is one complete, exact combination of marks.
+_VALID_PREFIX_MARKS: dict[str, set[frozenset]] = {
+    "ב": {
+        frozenset({_SHVA, _DAGESH}),  # בְּ standard
+        frozenset({_HIRIQ, _DAGESH}),  # בִּ before shva
+        frozenset({_PATACH, _DAGESH}),  # בַּ with definite article
+        frozenset({_QAMATZ, _DAGESH}),  # בָּ before chataf qamatz
+        frozenset({_SEGOL, _DAGESH}),  # בֶּ before chataf segol
+    },
+    "כ": {
+        frozenset({_SHVA, _DAGESH}),  # כְּ
+        frozenset({_HIRIQ, _DAGESH}),  # כִּ
+        frozenset({_PATACH, _DAGESH}),  # כַּ
+        frozenset({_QAMATZ, _DAGESH}),  # כָּ
+        frozenset({_SEGOL, _DAGESH}),  # כֶּ
+    },
+    "ל": {
+        frozenset({_SHVA}),  # לְ standard
+        frozenset({_HIRIQ}),  # לִ before shva
+        frozenset({_PATACH}),  # לַ with definite article
+        frozenset({_QAMATZ}),  # לָ demonstratives
+        frozenset({_SEGOL}),  # לֶ before chataf segol
+    },
+    "ו": {
+        frozenset({_SHVA}),  # וְ standard
+        frozenset({_DAGESH}),  # וּ (shuruq) before shva or BuMaF letters (ב, ו, מ, פ)
+        frozenset({_PATACH}),  # וַ before chataf patach
+        frozenset({_QAMATZ}),  # וָ before chataf qamatz
+        frozenset({_SEGOL}),  # וֶ before chataf segol
+        frozenset({_HIRIQ}),  # וִ before yud-shva
+    },
+    "מ": {
+        frozenset({_HIRIQ}),  # מִ standard
+        frozenset({_TSERE}),  # מֵ before gutturals
+    },
+    # NOTE(review): a pointed שֶׁ normally carries segol + shin dot on the shin
+    # itself, with the dagesh on the FOLLOWING letter. Neither set below is
+    # {segol, shin dot}; confirm these against how the prefix's marks are
+    # collected before relying on them.
+    "ש": {
+        frozenset({_SEGOL, _DAGESH}),  # שֶׁ standard
+        frozenset({_SEGOL, _DAGESH, _SHIN_DOT}),  # שֶׁ with explicit shin dot
+    },
+    "ה": {
+        frozenset({_PATACH}),  # הַ standard definite article
+        frozenset({_QAMATZ}),  # הָ before gutturals
+        frozenset({_SEGOL}),  # הֶ before qamatz-bearing gutturals
+    },
+}
+
+
+def _is_combining_mark(ch: str) -> bool:
+    """Tell whether ``ch`` is one of the Hebrew combining marks this module tracks.
+
+    A character qualifies when its code point lies in
+    ``_NIKKUD_LOW``..``_NIKKUD_HIGH`` or when it equals the dagesh, shin-dot
+    or sin-dot constant.
+    """
+    in_vowel_range = _NIKKUD_LOW <= ord(ch) <= _NIKKUD_HIGH
+    return in_vowel_range or ch in (_DAGESH, _SHIN_DOT, _SIN_DOT)
+
+
+def _decompose_first_char(token: str) -> tuple[str, frozenset, str]:
+    """Peel the leading consonant and its combining marks off a token.
+
+    Args:
+        token: A nikkud Hebrew token string.
+
+    Returns:
+        ``(consonant, marks, rest)`` where ``marks`` is the frozenset of
+        combining marks that directly follow the consonant and ``rest`` is
+        the untouched tail.  ``("", frozenset(), token)`` is returned when
+        the token is empty or does not begin with a letter in the
+        alef–tav range.
+    """
+    if not token or not ("\u05d0" <= token[0] <= "\u05ea"):
+        return ("", frozenset(), token)
+
+    # Advance past the run of combining marks attached to the first letter.
+    cut = 1
+    for ch in token[1:]:
+        if not _is_combining_mark(ch):
+            break
+        cut += 1
+
+    return (token[0], frozenset(token[1:cut]), token[cut:])
+
+
+def _is_valid_prefix(consonant: str, marks: frozenset) -> bool:
+    """Decide whether ``consonant`` carrying ``marks`` is a known prefix spelling.
+
+    Args:
+        consonant: The prefix consonant character.
+        marks: Frozenset of combining mark characters on that consonant.
+
+    Returns:
+        True when the mark set is listed for the consonant in
+        ``_VALID_PREFIX_MARKS``.  For ש the lookup is also attempted with
+        the shin dot removed.
+    """
+    allowed = _VALID_PREFIX_MARKS.get(consonant)
+    if not allowed:
+        return False
+    if marks in allowed:
+        return True
+    # Only ש gets a second chance: the shin dot may be present or absent.
+    return consonant == "ש" and (marks - {_SHIN_DOT}) in allowed
+
+
+def _rebuild_token(consonant: str, marks: frozenset, rest: str) -> str:
+    """Join consonant, marks (in ascending code-point order) and tail into one token."""
+    ordered_marks = sorted(marks)
+    return "".join([consonant, *ordered_marks, rest])
+
+
+def _try_strip_prefix(token: str, nikkud_index: dict) -> list[tuple[str, str, str]]:
+    """Look up a token in the index after removing one or two prefix letters.
+
+    For each candidate remainder the exact form is tried first, then the
+    same form with the dagesh removed from its first letter (the dagesh left
+    behind by an absorbed definite article, e.g. לַמֶּלֶךְ → מֶלֶךְ).  A second
+    prefix letter is only peeled when the first one is ו or ש.
+
+    Args:
+        token: A cleaned nikkud word token.
+        nikkud_index: Mapping from nikkud form to list of (unique_key, match_type).
+
+    Returns:
+        List of (unique_key, match_type, matched_remainder) tuples, with
+        ``"_prefix"`` appended to each base match type.  Empty when the
+        token does not start with a valid prefix or nothing is left after it.
+    """
+    hits: list[tuple[str, str, str]] = []
+
+    def _collect(remainder: str) -> None:
+        # Exact remainder first ...
+        for unique_key, match_type in nikkud_index.get(remainder, ()):
+            hits.append((unique_key, match_type + "_prefix", remainder))
+        # ... then the remainder with the dagesh dropped from its first letter.
+        head, head_marks, tail = _decompose_first_char(remainder)
+        if not head or _DAGESH not in head_marks:
+            return
+        bare = _rebuild_token(head, head_marks - {_DAGESH}, tail)
+        if bare == remainder:
+            return
+        for unique_key, match_type in nikkud_index.get(bare, ()):
+            hits.append((unique_key, match_type + "_prefix", bare))
+
+    first, first_marks, after_one = _decompose_first_char(token)
+    if not first or not _is_valid_prefix(first, first_marks) or not after_one:
+        return hits
+
+    _collect(after_one)
+
+    # ו and ש commonly stack in front of another prefix letter.
+    if first in "וש":
+        second, second_marks, after_two = _decompose_first_char(after_one)
+        stacked = (
+            second
+            and second in _PREFIX_CONSONANTS
+            and _is_valid_prefix(second, second_marks)
+            and after_two
+        )
+        if stacked:
+            _collect(after_two)
+
+    return hits
+
+
+def _build_nikkud_index(words: dict) -> dict[str, list[tuple[str, str]]]:
+    """Index every known nikkud spelling back to the entries that own it.
+
+    Per entry, forms are gathered in this order:
+
+    - ``word.nikkud`` → "direct"
+    - ``conjugation.active_forms[*].form.nikkud`` → "conjugated"
+    - ``conjugation.hufal_pual_forms[*].form.nikkud`` → "conjugated"
+    - ``conjugation.infinitive`` / ``reference_form`` → "conjugated"
+    - ``noun_inflection`` singular/plural/construct forms → "inflected"
+    - ``noun_inflection.pronominal_suffixes`` values that are dicts → "inflected"
+
+    Missing or empty forms are skipped.
+
+    Args:
+        words: The full words.json dict keyed by unique_key.
+
+    Returns:
+        Dict mapping each nikkud form to a list of (unique_key, match_type) tuples.
+    """
+    noun_slots = ("singular", "plural", "construct_singular", "construct_plural")
+    index: dict[str, list[tuple[str, str]]] = {}
+
+    for unique_key, entry in words.items():
+        conj = entry.get("conjugation") or {}
+        noun = entry.get("noun_inflection") or {}
+
+        # (form, match_type) pairs in the order they must land in the index.
+        candidates: list[tuple[str | None, str]] = [
+            ((entry.get("word") or {}).get("nikkud"), "direct"),
+        ]
+
+        for table_name in ("active_forms", "hufal_pual_forms"):
+            for row in conj.get(table_name) or []:
+                candidates.append(((row.get("form") or {}).get("nikkud"), "conjugated"))
+
+        for slot in ("infinitive", "reference_form"):
+            candidates.append(((conj.get(slot) or {}).get("nikkud"), "conjugated"))
+
+        for slot in noun_slots:
+            candidates.append(((noun.get(slot) or {}).get("nikkud"), "inflected"))
+
+        for suffix_form in (noun.get("pronominal_suffixes") or {}).values():
+            if isinstance(suffix_form, dict):
+                candidates.append((suffix_form.get("nikkud"), "inflected"))
+
+        for form, match_type in candidates:
+            if form:
+                index.setdefault(form, []).append((unique_key, match_type))
+
+    return index
+
+
+def _filter_collision_forms(nikkud_index: dict) -> dict:
+    """Drop ambiguous form→entry mappings where the entry has other options.
+
+    A form "collides" when it maps to two or more distinct unique_keys.  A
+    unique_key that owns at least one non-colliding form is removed from
+    every colliding form it appears under.  A unique_key whose indexed forms
+    *all* collide is left in place, so it can still be matched.  A colliding
+    form left with no owners is omitted from the result.
+
+    Returns a new index dict with the same structure; non-colliding forms
+    reuse the original entry lists.
+    """
+    shared_forms: set[str] = set()
+    forms_by_key: dict[str, set[str]] = {}
+
+    for form, entries in nikkud_index.items():
+        owners = {owner for owner, _ in entries}
+        if len(owners) > 1:
+            shared_forms.add(form)
+        for owner in owners:
+            forms_by_key.setdefault(owner, set()).add(form)
+
+    # Keys that own at least one form nobody else shares.
+    keys_with_private_form = {
+        owner for owner, forms in forms_by_key.items() if not forms <= shared_forms
+    }
+
+    filtered: dict[str, list[tuple[str, str]]] = {}
+    removed = 0
+    for form, entries in nikkud_index.items():
+        if form not in shared_forms:
+            filtered[form] = entries
+            continue
+        survivors = [
+            (owner, kind) for owner, kind in entries if owner not in keys_with_private_form
+        ]
+        removed += len(entries) - len(survivors)
+        if survivors:
+            filtered[form] = survivors
+
+    logger.info(f"    Filtered {removed} collision mappings from entries with unique forms")
+    return filtered
+
+
+# ── Matching ─────────────────────────────────────────────────────
+
+
+def match_sentences(
+    sentences: list[dict],
+    nikkud_index: dict,
+    confusable_keys: set[str],
+) -> dict:
+    """Find, for every sentence token, the vocab entries it spells.
+
+    Each whitespace token is stripped of edge punctuation with ``_PUNCT``
+    and looked up in the index.  Prefix stripping is attempted only when the
+    cleaned token itself is not an index key.
+
+    Args:
+        sentences: List of ``{"text": str, "source": str}`` dicts.
+        nikkud_index: Output of ``_build_nikkud_index``.
+        confusable_keys: Set of unique_keys that are in confusable groups
+            (accepted for interface symmetry; not consulted here).
+
+    Returns:
+        Dict mapping unique_key → list of match dicts, each containing:
+        ``text``, ``source``, ``match_method``, ``word_count``,
+        ``matched_form``, ``char_offset``, ``char_end``.  The offsets
+        delimit the cleaned token inside the sentence text.
+    """
+    hits_by_key: dict[str, list[dict]] = {}
+
+    for sentence in sentences:
+        text = sentence["text"]
+        source = sentence["source"]
+        tokens = text.split()
+        word_count = len(tokens)
+
+        cursor = 0  # search start, so repeated words resolve to successive positions
+        for raw_word in tokens:
+            located = text.find(raw_word, cursor)
+            cleaned = _PUNCT.sub("", raw_word)
+
+            if not cleaned:
+                # Pure punctuation: just move past it.
+                if located >= 0:
+                    cursor = located + len(raw_word)
+                continue
+
+            word_start = located if located >= 0 else cursor
+            clean_start = word_start + max(raw_word.find(cleaned), 0)
+            clean_end = clean_start + len(cleaned)
+
+            if cleaned in nikkud_index:
+                found = [(key, kind) for key, kind in nikkud_index[cleaned]]
+            else:
+                found = [
+                    (key, kind)
+                    for key, kind, _remainder in _try_strip_prefix(cleaned, nikkud_index)
+                ]
+
+            for unique_key, match_method in found:
+                hits_by_key.setdefault(unique_key, []).append(
+                    {
+                        "text": text,
+                        "source": source,
+                        "match_method": match_method,
+                        "word_count": word_count,
+                        "matched_form": cleaned,
+                        "char_offset": clean_start,
+                        "char_end": clean_end,
+                    }
+                )
+
+            cursor = word_start + len(raw_word)
+
+    return hits_by_key
+
+
+# ── Writing results ──────────────────────────────────────────────
+
+
+def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) -> int:
+    """Update words dict entries with matched example sentences.
+
+    For every matched key present in ``words``:
+
+    - sentences are deduplicated by text (first occurrence wins);
+    - non-prefix matches are preferred, prefix matches are used only when
+      there are no others;
+    - up to 3 sentences are kept, ranked so that 6–12 word sentences come
+      first and others by distance from 9 words;
+    - ``examples["vetted"]`` is replaced and ``examples["rejected_count"]``
+      reset to 0;
+    - a cloze is written from the top sentence, or removed when the key is
+      in ``confusable_keys``.
+
+    The existing ``cloze_guid`` is reused when the cloze sentence text is
+    unchanged; if it is missing (or the text changed) a GUID is minted with
+    ``genanki.guid_for("cloze", unique_key)`` so a cloze never ends up with a
+    null GUID.
+
+    Args:
+        words: The full words.json dict, modified in place.
+        matches: Output of ``match_sentences``.
+        confusable_keys: Set of unique_keys in confusable groups.
+
+    Returns:
+        Count of words.json entries that were updated.
+    """
+
+    def _length_penalty(sentence: dict) -> int:
+        # 0 inside the preferred 6–12 word band, otherwise distance from 9.
+        wc = sentence["word_count"]
+        return 0 if 6 <= wc <= 12 else abs(wc - 9)
+
+    updated = 0
+
+    for unique_key, sent_list in matches.items():
+        if unique_key not in words:
+            continue
+
+        entry = words[unique_key]
+
+        # Deduplicate by sentence text, keeping the first hit for each text.
+        first_by_text: dict[str, dict] = {}
+        for s in sent_list:
+            first_by_text.setdefault(s["text"], s)
+        unique = list(first_by_text.values())
+
+        # Prefer direct matches; only fall back to prefix matches if none exist.
+        direct = [s for s in unique if "prefix" not in s["match_method"]]
+        pool = direct or unique
+
+        # sorted() is stable, so ties keep their discovery order.
+        best = sorted(pool, key=_length_penalty)[:3]
+
+        if not entry.get("examples"):
+            entry["examples"] = {}
+        examples: dict = entry["examples"]
+        examples["vetted"] = [
+            {
+                "text": s["text"],
+                "source": s["source"],
+                "match_method": s["match_method"],
+            }
+            for s in best
+        ]
+
+        if unique_key in confusable_keys:
+            # Confusable words never get a cloze card.
+            examples.pop("cloze", None)
+        elif best:
+            top = best[0]
+            old_cloze = examples.get("cloze") or {}
+            cloze_guid = None
+            if old_cloze.get("text") == top["text"]:
+                cloze_guid = old_cloze.get("cloze_guid")
+            if not cloze_guid:
+                # Third-party dependency; imported only when a GUID must be minted.
+                import genanki  # noqa: PLC0415
+
+                cloze_guid = genanki.guid_for("cloze", unique_key)
+
+            examples["cloze"] = {
+                "text": top["text"],
+                "cloze_word_start": top["char_offset"],
+                "cloze_word_end": top["char_end"],
+                "cloze_hint": None,
+                "cloze_guid": cloze_guid,
+            }
+
+        examples["rejected_count"] = 0
+        updated += 1
+
+    return updated
+
+
+# ── Public API ───────────────────────────────────────────────────
+
+
+def run(words: dict) -> dict:
+    """Run the EPUB example pipeline against an already-loaded words dict.
+
+    Steps: extract sentences from every discovered EPUB, build and
+    collision-filter the nikkud index, match sentences to entries, log a
+    per-method breakdown, then write the results into ``words``.
+
+    Args:
+        words: The full words.json dict keyed by unique_key.  Modified in place.
+
+    Returns:
+        Summary stats dict with keys ``books`` (sentence count per book),
+        ``matched`` (number of matched unique_keys) and ``total_vocab``.
+        When no sentences were extracted, ``books`` is empty and
+        ``matched`` is 0.
+    """
+    logger.info("  Extracting sentences from EPUBs ...")
+    book_counts: dict[str, int] = {}
+    all_sentences: list[dict] = []
+
+    for filepath, book_name in _discover_epubs().items():
+        sentences = extract_sentences_from_epub(Path(filepath), book_name)
+        book_counts[book_name] = len(sentences)
+        all_sentences += sentences
+        logger.info(f"    {book_name}: {len(sentences)} sentences")
+
+    if not all_sentences:
+        logger.warning("  No EPUB files found — skipping example extraction")
+        return {"books": {}, "matched": 0, "total_vocab": len(words)}
+
+    logger.info(f"  Total sentences: {len(all_sentences)}")
+
+    logger.info("  Building nikkud index from words.json ...")
+    nikkud_index = _build_nikkud_index(words)
+    logger.info(f"    {len(nikkud_index)} unique nikkud forms indexed")
+
+    # Ambiguous forms are dropped for entries that can be matched another way.
+    nikkud_index = _filter_collision_forms(nikkud_index)
+
+    confusable_keys: set[str] = {
+        key for key, entry in words.items() if entry.get("confusable_group")
+    }
+
+    logger.info("  Matching sentences against vocab ...")
+    matches = match_sentences(all_sentences, nikkud_index, confusable_keys)
+    logger.info(f"    {len(matches)} words matched")
+
+    # Tally sentence-word pairs per match method for the log.
+    method_counts: dict[str, int] = {}
+    for hit in (s for sent_list in matches.values() for s in sent_list):
+        tag = hit["match_method"]
+        method_counts[tag] = method_counts.get(tag, 0) + 1
+    for method, count in sorted(method_counts.items()):
+        logger.info(f"      {method}: {count} sentence-word pairs")
+
+    updated = update_words_json(words, matches, confusable_keys)
+    logger.info(f"    Updated {updated} entries in words.json")
+
+    return {
+        "books": book_counts,
+        "matched": len(matches),
+        "total_vocab": len(words),
+    }
+
+
+# ── Standalone entry point ───────────────────────────────────────
+
+if __name__ == "__main__":
+    # Standalone mode: load words.json, run the pipeline, write it back.
+    import json
+
+    logging.basicConfig(level=logging.INFO, format="%(message)s")
+
+    words_path = DATA_DIR / "words.json"
+    with open(words_path, encoding="utf-8") as src:
+        words = json.load(src)
+
+    stats = run(words)
+
+    # run() mutated `words` in place; persist the result over the input file.
+    with open(words_path, "w", encoding="utf-8") as dst:
+        json.dump(words, dst, ensure_ascii=False, indent=2)
+
+    # Guard the percentage against an empty vocabulary.
+    if stats["total_vocab"]:
+        coverage = stats["matched"] * 100 / stats["total_vocab"]
+    else:
+        coverage = 0
+    logger.info(f"  Coverage: {stats['matched']}/{stats['total_vocab']} ({coverage:.1f}%)")
--- a/frequency_lookup.py
+++ b/frequency_lookup.py
@ -19,6 +19,7 @@ logger = logging.getLogger(__name__)

 FREQ_URL = "https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2016/he/he_50k.txt"
 CACHE_PATH = Path(__file__).parent / "data" / "frequency_cache.json"
+CLEAN_CACHE_PATH = Path(__file__).parent / "data" / "frequency_clean.json"
 REQUEST_TIMEOUT = 30

 # Module-level cache: word_no_nikkud -> rank (1 = most common)
@ -26,12 +27,19 @@ _freq: dict[str, int] = {}


 def load(cache_path: Path = CACHE_PATH) -> None:
-    """Load frequency data from cache, downloading if not present."""
+    """Load frequency data from cache, downloading if not present.
+
+    Prefers frequency_clean.json (YAP-filtered) over raw frequency_cache.json.
+    """
    global _freq
-    if cache_path.exists():
-        with open(cache_path, encoding="utf-8") as f:
+    # Prefer YAP-cleaned frequency data if available
+    clean_path = cache_path.parent / "frequency_clean.json" if cache_path == CACHE_PATH else None
+    load_path = clean_path if clean_path and clean_path.exists() else cache_path
+    if load_path.exists():
+        with open(load_path, encoding="utf-8") as f:
            _freq = json.load(f)
-        logger.info(f"Frequency cache loaded: {len(_freq)} entries")
+        label = "clean" if load_path == clean_path else "raw"
+        logger.info(f"Frequency cache loaded ({label}): {len(_freq)} entries")
        return

    logger.info("Downloading FrequencyWords he_50k.txt …")
--- a/pealim_detail_scrape.py
+++ b/pealim_detail_scrape.py
@ -2,7 +2,8 @@
 """
 Consolidated detail page scraper for pealim.com.

-Visits /dict/<slug>/ detail pages for nouns and verbs in data/words.json.
+Visits /dict/<slug>/ detail pages for nouns, verbs, adjectives and prepositions
+in data/words.json.
 Makes two requests per slug:
  1. hebstyle=mo cookie  → nikkud forms
  2. hebstyle=vl cookie  → ktiv male forms
@ -11,7 +12,8 @@ Updates entries in data/words.json with scraped detail data.

 Usage:
    python3 pealim_detail_scrape.py [--test N] [--force-refresh-detail]
-                                    [--nouns-only | --verbs-only]
+                                    [--nouns-only | --verbs-only |
+                                     --adjectives-only | --prepositions-only]
 """

 import argparse
@ -144,28 +146,128 @@ FORM_KEY_TO_PERSON: dict[str, str] = {
    "infinitive": "inf",
 }

-# Mishkal English name → Hebrew nikkud mapping (common patterns)
-MISHKAL_HEBREW: dict[str, str] = {
-    "CaCaC": "קָטָל",
-    "CeCeC": "קֶטֶל",
-    "CiCeC": "קִטֶל",
-    "CaCeC": "קָטֶל",
-    "CoCeC": "קוֹטֵל",
-    "CaCiC": "קָטִיד",
-    "CaCuC": "קָטוּר",
-    "miCCaC": "מִקְטָל",
-    "miCCeC": "מִקְטֶל",
-    "maCCeC": "מַקְטֶל",
-    "maCCiC": "מַקְטִיר",
-    "hiCCiC": "הִקְטִיל",
-    "CiCCuC": "קִטּוּל",
-    "hitCaCCeC": "הִתְקַטֵּל",
-    "CaCCan": "קַטְּלָן",
-    "CaCCaC": "קַטָּל",
-    "CiCCon": "קִטְּרוֹן",
-    "CaCCeC": "קַטֶּלֶת",
+# Mishkal English name → Hebrew nikkud mapping
+# Pealim dropdown uses q-notation (qatal), detail pages return k-notation (katal).
+# We store q-notation keys; _mishkal_to_hebrew() handles the k→q conversion.
+# Source: https://www.pealim.com/he/dict/?pos={adjective,noun}&am=qatil dropdowns
+_MISHKAL_HEBREW_Q: dict[str, str] = {
+    # --- a ---
+    "aqtal": "אַקְטָל",
+    "aqtala": "אַקְטָלָה",
+    # --- e ---
+    "eqtal": "אֶקְטָל",
+    # --- h ---
+    "haqtala": "הַקְטָלָה",
+    "heqtel": "הֶקְטֵל",
+    "hiqqatlut": "הִקָּטְלוּת",
+    "hitqattlut": "הִתְקַטְּלוּת",
+    # --- m ---
+    "maqtal": "מַקְטָל",
+    "maqtel": "מַקְטֵל",
+    "maqtela": "מַקְטֵלָה",
+    "maqtelet": "מַקְטֶלֶת",
+    "maqtil": "מַקְטִיל",
+    "maqtol": "מַקְטוֹל",
+    "maqtolet": "מַקְטֹלֶת",
+    "maqtul": "מַקְטוּל",
+    "meqattel": "מְקַטֵּל",
+    "meqila": "מְקִילָה",
+    "mequla": "מְקוּלָה",
+    "mequttal": "מְקֻטָּל",
+    "miqtal": "מִקְטָל",
+    "miqtala": "מִקְטָלָה",
+    "miqtelet": "מִקְטֶלֶת",
+    "miqtol": "מִקְטוֹל",
+    "miqtolet": "מִקְטֹלֶת",
+    "mitqattel": "מִתְקַטֵּל",
+    "muqtal": "מֻקְטָל",
+    # --- n ---
+    "niqtal": "נִקְטָל",
+    # --- q ---
+    "qal": "קַל",
+    "qatal": "קָטָל",
+    "qatel": "קָטֵל",
+    "qatil": "קָטִיל",
+    "qatla": "קַטְלָה",
+    "qatlan": "קַטְלָן",
+    "qatlut": "קַטְלוּת",
+    "qatol": "קָטוֹל",
+    "qaton": "קָטוֹן",
+    "qattal": "קַטָּל",
+    "qattala": "קַטָּלָה",
+    "qattelet": "קַטֶּלֶת",
+    "qattil": "קַטִּיל",
+    "qattila": "קַטִּילָה",
+    "qattolet": "קַטֹּלֶת",
+    "qattul": "קַטּוּל",
+    "qatul": "קָטוּל",
+    "qatut": "קָטוּת",
+    "qetel": "קֶטֶל",
+    "qeteh": "קֵטֶה",
+    "qitla": "קִטְלָה",
+    "qitlon": "קִטְלוֹן",
+    "qittalon": "קִטָּלוֹן",
+    "qittel": "קִטֵּל",
+    "qittelet": "קִטֶּלֶת",
+    "qittol": "קִטּוֹל",
+    "qittolet": "קִטֹּלֶת",
+    "qittul": "קִטּוּל",
+    "qol": "קֹל",
+    "qotal": "קוֹטָל",
+    "qotel": "קוֹטֵל",
+    "qotelet": "קוֹטֶלֶת",
+    "qotla": "קָטְלָה",
+    "qtal": "קְטָל",
+    "qtala": "קְטָלָה",
+    "qtaltal": "קְטַלְטַל",
+    "qtaltan": "קְטַלְתָּן",
+    "qtaltolet": "קְטַלְטֹלֶת",
+    "qtel": "קְטֵל",
+    "qtela": "קְטֵלָה",
+    "qtelet": "קְטֶלֶת",
+    "qtil": "קְטִיל",
+    "qtila": "קְטִילָה",
+    "qtili": "קְטִילִי",
+    "qtol": "קְטוֹל",
+    "qtola": "קְטוֹלָה",
+    "qtolet": "קְטֹלֶת",
+    "qtul": "קְטוּל",
+    "qtula": "קְטוּלָה",
+    "qtulla": "קְטֻלָּה",
+    "qtut": "קְטוּת",
+    "qutla": "קֻטְלָה",
+    "quttolet": "קֻטּוֹלֶת",
+    # --- t ---
+    "taqtela": "תַּקְטֵלָה",
+    "taqtil": "תַּקְטִיל",
+    "taqtit": "תַּקְטִית",
+    "taqtul": "תַּקְטוּל",
+    "taqtula": "תַּקְטוּלָה",
+    "taqtut": "תַּקְטוּת",
+    "tiqtal": "תִּקְטָל",
+    "tiqtala": "תִּקְטָלָה",
+    "tiqtelet": "תִּקְטֶלֶת",
+    "tiqtolet": "תִּקְטֹלֶת",
+    "tqilla": "תְּקִלָּה",
+    "tqula": "תְּקוּלָה",
+    # --- y ---
+    "yaqtul": "יַקְטוּל",
 }

+
+def _mishkal_to_hebrew(mishkal: str) -> str | None:
+    """Return the Hebrew pattern for a mishkal name, or None if unknown.
+
+    The name is looked up in ``_MISHKAL_HEBREW_Q`` as given; if that yields
+    nothing, every "k" is replaced by "q" and the lookup is retried.
+    A falsy ``mishkal`` returns None.
+    """
+    if not mishkal:
+        return None
+    return _MISHKAL_HEBREW_Q.get(mishkal) or _MISHKAL_HEBREW_Q.get(mishkal.replace("k", "q"))
+
+
 # ---------------------------------------------------------------------------
 # HTTP session
 # ---------------------------------------------------------------------------
@ -452,7 +554,7 @@ def _scrape_noun_detail(_slug: str, mo_html: str, vl_html: str) -> dict:

    if mishkal:
        result["mishkal"] = mishkal
-        result["mishkal_hebrew"] = MISHKAL_HEBREW.get(mishkal)
+        result["mishkal_hebrew"] = _mishkal_to_hebrew(mishkal)

    return result

@ -887,6 +989,228 @@ def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: di
    return result


+# ---------------------------------------------------------------------------
+# Adjective detail parsing
+# ---------------------------------------------------------------------------
+
+_ADJECTIVE_CELL_IDS: tuple[str, ...] = ("ms-a", "fs-a", "mp-a", "fp-a")
+_ADJECTIVE_FORM_KEYS: tuple[str, ...] = ("ms", "fs", "mp", "fp")
+
+
+def _parse_adjective_table(soup: BeautifulSoup) -> dict[str, dict]:
+    """
+    Read the four adjective forms from a nikkud (mo) detail page.
+
+    Inside the first ``table.conjugation-table`` each element with id
+    ms-a / fs-a / mp-a / fp-a is passed to ``_get_menukad_and_audio``.
+    Cells that are missing or yield no nikkud text are left out.
+
+    Returns:
+        Dict mapping form key ("ms", "fs", "mp", "fp") to
+        {"nikkud": str, "audio_url": str}, or empty dict if table not found.
+    """
+    forms: dict[str, dict] = {}
+
+    table = soup.find("table", class_="conjugation-table")
+    if not table:
+        return forms
+
+    for cell_id, form_key in zip(_ADJECTIVE_CELL_IDS, _ADJECTIVE_FORM_KEYS, strict=True):
+        cell = table.find(id=cell_id)
+        if cell:
+            text, audio = _get_menukad_and_audio(cell)
+            if text:
+                forms[form_key] = {"nikkud": text, "audio_url": audio}
+
+    return forms
+
+
+def _parse_adjective_table_vl(soup: BeautifulSoup) -> dict[str, str]:
+    """
+    Read the four adjective forms from a ktiv male (vl) detail page.
+
+    Uses the same cell ids as the nikkud parser but takes each cell's text
+    via ``_get_plain_text``; missing cells and empty texts are left out.
+
+    Returns:
+        Dict mapping form key ("ms", "fs", "mp", "fp") to ktiv male string.
+    """
+    spellings: dict[str, str] = {}
+
+    table = soup.find("table", class_="conjugation-table")
+    if not table:
+        return spellings
+
+    for cell_id, form_key in zip(_ADJECTIVE_CELL_IDS, _ADJECTIVE_FORM_KEYS, strict=True):
+        cell = table.find(id=cell_id)
+        if cell:
+            text = _get_plain_text(cell)
+            if text:
+                spellings[form_key] = text
+
+    return spellings
+
+
+def _parse_adjective_mishkal(soup: BeautifulSoup) -> tuple[str, str]:
+    """
+    Get the adjective's mishkal name and its Hebrew pattern.
+
+    Delegates to ``_parse_noun_gender_mishkal`` (discarding its first
+    result) and maps the name through ``_mishkal_to_hebrew``.
+
+    Returns:
+        Tuple of (mishkal_english, mishkal_hebrew); the Hebrew part is ""
+        when no mapping is found.
+    """
+    _gender, mishkal_name = _parse_noun_gender_mishkal(soup)
+    hebrew = _mishkal_to_hebrew(mishkal_name)
+    return mishkal_name, (hebrew or "")
+
+
+def _scrape_adjective_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
+    """
+    Combine the nikkud (mo) and ktiv male (vl) pages of one adjective.
+
+    Every form key is present in the result: a form found on the mo page
+    becomes {nikkud, ktiv_male} (ktiv_male is "" and a warning is logged
+    when the vl page lacks it); a form not found is None.
+
+    Returns:
+        Dict matching the adjective_inflection schema:
+        {ms, fs, mp, fp: {nikkud, ktiv_male}, mishkal, mishkal_hebrew}.
+        Empty dict if no forms found.
+    """
+    mo_soup = BeautifulSoup(mo_html, "lxml")
+    vl_soup = BeautifulSoup(vl_html, "lxml")
+
+    nikkud_forms = _parse_adjective_table(mo_soup)
+    ktiv_forms = _parse_adjective_table_vl(vl_soup)
+    mishkal, mishkal_hebrew = _parse_adjective_mishkal(mo_soup)
+
+    if not nikkud_forms:
+        return {}
+
+    # Start with every form absent, then fill in what the mo page provided.
+    inflection: dict = dict.fromkeys(_ADJECTIVE_FORM_KEYS)
+    for form_key in _ADJECTIVE_FORM_KEYS:
+        mo_form = nikkud_forms.get(form_key)
+        if not mo_form:
+            continue
+        nikkud = mo_form["nikkud"]
+        ktiv = ktiv_forms.get(form_key, "")
+        if not ktiv:
+            logger.warning("No ktiv_male for adjective form %s: %s", form_key, nikkud)
+        inflection[form_key] = {"nikkud": nikkud, "ktiv_male": ktiv}
+
+    # Empty strings are stored as None.
+    inflection["mishkal"] = mishkal if mishkal else None
+    inflection["mishkal_hebrew"] = mishkal_hebrew if mishkal_hebrew else None
+
+    return inflection
+
+
+# ---------------------------------------------------------------------------
+# Preposition detail parsing
+# ---------------------------------------------------------------------------
+
+_PREPOSITION_CELL_IDS: tuple[str, ...] = (
+    "P-1s",
+    "P-1p",
+    "P-2ms",
+    "P-2fs",
+    "P-2mp",
+    "P-2fp",
+    "P-3ms",
+    "P-3fs",
+    "P-3mp",
+    "P-3fp",
+)
+_PREPOSITION_FORM_KEYS: tuple[str, ...] = (
+    "1s",
+    "1p",
+    "2ms",
+    "2fs",
+    "2mp",
+    "2fp",
+    "3ms",
+    "3fs",
+    "3mp",
+    "3fp",
+)
+
+
+def _parse_preposition_table(soup: BeautifulSoup) -> dict[str, dict]:
+    """
+    Read the pronominal-suffix forms of a preposition from a nikkud (mo) page.
+
+    Inside the first ``table.conjugation-table`` each element with id
+    P-1s, P-1p, P-2ms, …, P-3fp is passed to ``_get_menukad_and_audio``.
+    Cells that are missing or yield no nikkud text are left out.
+
+    Returns:
+        Dict mapping person key ("1s", "1p", …, "3fp") to
+        {"nikkud": str, "audio_url": str}, or empty dict if table not found.
+    """
+    forms: dict[str, dict] = {}
+
+    table = soup.find("table", class_="conjugation-table")
+    if not table:
+        return forms
+
+    for cell_id, form_key in zip(_PREPOSITION_CELL_IDS, _PREPOSITION_FORM_KEYS, strict=True):
+        cell = table.find(id=cell_id)
+        if cell:
+            text, audio = _get_menukad_and_audio(cell)
+            if text:
+                forms[form_key] = {"nikkud": text, "audio_url": audio}
+
+    return forms
+
+
+def _parse_preposition_table_vl(soup: BeautifulSoup) -> dict[str, str]:
+    """
+    Read the pronominal-suffix forms of a preposition from a ktiv male (vl) page.
+
+    Uses the same cell ids as the nikkud parser but takes each cell's text
+    via ``_get_plain_text``; missing cells and empty texts are left out.
+
+    Returns:
+        Dict mapping person key ("1s", "1p", …, "3fp") to ktiv male string.
+    """
+    spellings: dict[str, str] = {}
+
+    table = soup.find("table", class_="conjugation-table")
+    if not table:
+        return spellings
+
+    for cell_id, form_key in zip(_PREPOSITION_CELL_IDS, _PREPOSITION_FORM_KEYS, strict=True):
+        cell = table.find(id=cell_id)
+        if cell:
+            text = _get_plain_text(cell)
+            if text:
+                spellings[form_key] = text
+
+    return spellings
+
+
+def _scrape_preposition_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
+    """
+    Combine the nikkud (mo) and ktiv male (vl) pages of one preposition.
+
+    Every person key is present in the result: a form found on the mo page
+    becomes {nikkud, ktiv_male} (ktiv_male is "" and a warning is logged
+    when the vl page lacks it); a form not found is None.
+
+    Returns:
+        Dict matching the preposition_inflection schema:
+        {1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp: {nikkud, ktiv_male}}.
+        Empty dict if no forms found.
+    """
+    mo_soup = BeautifulSoup(mo_html, "lxml")
+    vl_soup = BeautifulSoup(vl_html, "lxml")
+
+    nikkud_forms = _parse_preposition_table(mo_soup)
+    ktiv_forms = _parse_preposition_table_vl(vl_soup)
+
+    if not nikkud_forms:
+        return {}
+
+    # Start with every person absent, then fill in what the mo page provided.
+    inflection: dict = dict.fromkeys(_PREPOSITION_FORM_KEYS)
+    for form_key in _PREPOSITION_FORM_KEYS:
+        mo_form = nikkud_forms.get(form_key)
+        if not mo_form:
+            continue
+        nikkud = mo_form["nikkud"]
+        ktiv = ktiv_forms.get(form_key, "")
+        if not ktiv:
+            logger.warning("No ktiv_male for preposition form %s: %s", form_key, nikkud)
+        inflection[form_key] = {"nikkud": nikkud, "ktiv_male": ktiv}
+
+    return inflection
+
+
 # ---------------------------------------------------------------------------
 # Merging strategy
 # ---------------------------------------------------------------------------
@ -926,6 +1250,22 @@ def _merge_conjugation(_existing_conj: dict | None, scraped: dict) -> dict:
    return scraped


+def _merge_adjective_inflection(_existing_ai: dict | None, scraped: dict) -> dict:
+    """
+    Produce the adjective_inflection value to store after a scrape.
+
+    The previous value (``_existing_ai``) is ignored: there are no GUIDs to
+    carry over, so the result is a fresh shallow copy of ``scraped``.
+    """
+    merged: dict = {}
+    merged.update(scraped)
+    return merged
+
+
+def _merge_preposition_inflection(_existing_pi: dict | None, scraped: dict) -> dict:
+    """
+    Produce the preposition_inflection value to store after a scrape.
+
+    The previous value (``_existing_pi``) is ignored: there are no GUIDs to
+    carry over, so the result is a fresh shallow copy of ``scraped``.
+    """
+    merged: dict = {}
+    merged.update(scraped)
+    return merged
+
+
 # ---------------------------------------------------------------------------
 # I/O helpers
 # ---------------------------------------------------------------------------
@ -953,14 +1293,26 @@ def _save_words(data: dict) -> None:
 # ---------------------------------------------------------------------------


-def _should_process(entry: dict, pos: str, force: bool, nouns_only: bool, verbs_only: bool) -> bool:
+def _should_process(
+    entry: dict,
+    pos: str,
+    force: bool,
+    nouns_only: bool,
+    verbs_only: bool,
+    adjectives_only: bool,
+    prepositions_only: bool,
+) -> bool:
    """Return True if this entry should be scraped."""
-    if not pos.startswith(("Noun", "Verb")):
+    if not pos.startswith(("Noun", "Verb", "Adjective", "Preposition")):
        return False
    if nouns_only and not pos.startswith("Noun"):
        return False
    if verbs_only and not pos.startswith("Verb"):
        return False
+    if adjectives_only and not pos.startswith("Adjective"):
+        return False
+    if prepositions_only and not pos.startswith("Preposition"):
+        return False
    return force or not entry.get("detail_scraped")


@ -969,6 +1321,8 @@ def run(
    force_refresh: bool = False,
    nouns_only: bool = False,
    verbs_only: bool = False,
+    adjectives_only: bool = False,
+    prepositions_only: bool = False,
 ) -> None:
    """
    Main scrape loop.
@ -978,13 +1332,24 @@ def run(
        force_refresh: Re-scrape entries where detail_scraped=True.
        nouns_only: Only scrape noun entries.
        verbs_only: Only scrape verb entries.
+        adjectives_only: Only scrape adjective entries.
+        prepositions_only: Only scrape preposition entries.
    """
    words = _load_words()

    candidates = [
        (unique_key, entry)
        for unique_key, entry in words.items()
-        if _should_process(entry, entry.get("pos", ""), force_refresh, nouns_only, verbs_only) and entry.get("slug")
+        if _should_process(
+            entry,
+            entry.get("pos", ""),
+            force_refresh,
+            nouns_only,
+            verbs_only,
+            adjectives_only,
+            prepositions_only,
+        )
+        and entry.get("slug")
    ]

    total = len(candidates)
@ -992,7 +1357,10 @@ def run(
        candidates = candidates[:test]
        logger.info("Test mode: processing %d of %d eligible entries", len(candidates), total)
    else:
-        logger.info("Processing %d eligible entries (nouns+verbs) from words.json", total)
+        logger.info(
+            "Processing %d eligible entries (nouns+verbs+adjectives+prepositions) from words.json",
+            total,
+        )

    processed = 0
    errors = 0
@ -1003,7 +1371,14 @@ def run(
        word_nikkud = entry.get("word", {}).get("nikkud", unique_key)
        url = f"{PEALIM_BASE}/dict/{slug}/"

-        label = "Noun" if pos.startswith("Noun") else "Verb"
+        if pos.startswith("Noun"):
+            label = "Noun"
+        elif pos.startswith("Verb"):
+            label = "Verb"
+        elif pos.startswith("Adjective"):
+            label = "Adjective"
+        else:
+            label = "Preposition"
        logger.info("[%d/%d] %s: %s (%s)", idx, len(candidates), label, word_nikkud, slug)

        # Fetch mo (nikkud) page
@ -1042,7 +1417,7 @@ def run(
                    errors += 1
                    continue

-            else:  # Verb
+            elif pos.startswith("Verb"):
                existing_conj = entry.get("conjugation")
                scraped = _scrape_verb_detail(slug, mo_html, vl_html, existing_conj)
                if scraped:
@ -1059,6 +1434,41 @@ def run(
                    errors += 1
                    continue

+            elif pos.startswith("Adjective"):
+                scraped = _scrape_adjective_detail(slug, mo_html, vl_html)
+                if scraped:
+                    existing_ai = entry.get("adjective_inflection")
+                    merged = _merge_adjective_inflection(existing_ai, scraped)
+                    words[unique_key]["adjective_inflection"] = merged
+                    ms = merged.get("ms", {}) or {}
+                    fs = merged.get("fs", {}) or {}
+                    logger.info(
+                        "  ms=%s fs=%s mishkal=%s",
+                        ms.get("nikkud", "—"),
+                        fs.get("nikkud", "—"),
+                        merged.get("mishkal", "—"),
+                    )
+                else:
+                    logger.warning("  No adjective data scraped for %s", slug)
+                    errors += 1
+                    continue
+
+            else:  # Preposition
+                scraped = _scrape_preposition_detail(slug, mo_html, vl_html)
+                if scraped:
+                    existing_pi = entry.get("preposition_inflection")
+                    merged = _merge_preposition_inflection(existing_pi, scraped)
+                    words[unique_key]["preposition_inflection"] = merged
+                    form_1s = merged.get("1s", {}) or {}
+                    logger.info(
+                        "  1s=%s",
+                        form_1s.get("nikkud", "—"),
+                    )
+                else:
+                    logger.warning("  No preposition data scraped for %s", slug)
+                    errors += 1
+                    continue
+
        except Exception as exc:  # noqa: BLE001
            logger.error("  Parse error for %s (%s): %s", slug, word_nikkud, exc, exc_info=True)
            errors += 1
@ -1089,7 +1499,7 @@ def run(

 def _build_parser() -> argparse.ArgumentParser:
    parser = argparse.ArgumentParser(
-        description="Scrape pealim.com detail pages for nouns and verbs in data/words.json."
+        description=("Scrape pealim.com detail pages for nouns, verbs, adjectives and prepositions in data/words.json.")
    )
    parser.add_argument(
        "--test",
@ -1117,6 +1527,18 @@ def _build_parser() -> argparse.ArgumentParser:
        default=False,
        help="Only scrape Verb entries.",
    )
+    group.add_argument(
+        "--adjectives-only",
+        action="store_true",
+        default=False,
+        help="Only scrape Adjective entries.",
+    )
+    group.add_argument(
+        "--prepositions-only",
+        action="store_true",
+        default=False,
+        help="Only scrape Preposition entries.",
+    )
    return parser


@ -1133,4 +1555,6 @@ if __name__ == "__main__":
        force_refresh=args.force_refresh_detail,
        nouns_only=args.nouns_only,
        verbs_only=args.verbs_only,
+        adjectives_only=args.adjectives_only,
+        prepositions_only=args.prepositions_only,
    )
--- a/run.py
+++ b/run.py
@ -11,7 +11,7 @@ Pipeline steps:
  1. List scrape    — scrape pealim.com list pages → words.json (captures slugs)
  2. Detail scrape  — scrape noun/verb detail pages using slugs → words.json
  3. Frequency      — load/download word frequency data
-  4. Examples       — fetch Ben Yehuda example sentences
+  4. Examples       — extract example sentences from Hebrew EPUBs
  5. Audio download — download audio mp3 files
  6. Fonts          — download Heebo font files
  7. Images         — fetch noun images from Wikipedia
@ -21,9 +21,8 @@ Options:
  --skip-scrape        Skip list page scraping (use existing words.json)
  --skip-detail        Skip detail page scraping
  --skip-audio         Skip audio .mp3 downloads
-  --skip-examples      Skip Ben Yehuda example fetching
+  --skip-examples      Skip EPUB example extraction
  --skip-images        Skip image fetching for concrete nouns
-  --refresh-examples   Force rebuild of Ben Yehuda index
  --test N             Limit to first N words/pages
 """

@ -60,9 +59,8 @@ def parse_args():
    p.add_argument("--skip-scrape", action="store_true", help="Skip list page scraping")
    p.add_argument("--skip-detail", action="store_true", help="Skip detail page scraping")
    p.add_argument("--skip-audio", action="store_true", help="Skip audio downloads")
-    p.add_argument("--skip-examples", action="store_true", help="Skip Ben Yehuda example lookup")
+    p.add_argument("--skip-examples", action="store_true", help="Skip EPUB example extraction")
    p.add_argument("--skip-images", action="store_true", help="Skip image fetching")
-    p.add_argument("--refresh-examples", action="store_true", help="Force rebuild of Ben Yehuda index")
    p.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
    return p.parse_args()

@ -93,22 +91,15 @@ def step_frequency() -> dict[str, int]:
    return frequency_lookup._freq


-def step_examples(args, _freq_cache: dict):
-    """Step 4 — load/build Ben Yehuda example index."""
+def step_examples(args) -> dict:
+    """Step 4 — extract example sentences from Hebrew EPUBs."""
    if args.skip_examples:
        logger.info("[4] Skipping examples (--skip-examples)")
-        examples_path = DATA_DIR / "examples_cache.json"
-        if examples_path.exists():
-            with open(examples_path) as f:
-                return json.load(f)
        return {}

-    logger.info("[4] Loading Ben Yehuda example index …")
-    import benyehuda
+    logger.info("[4] Extracting EPUB example sentences …")
+    import epub_examples

-    benyehuda.load(force_rebuild=args.refresh_examples)
-
-    # Read word list from words.json instead of CSV
    if not WORDS_JSON.exists():
        logger.warning("[4] words.json not found, skipping examples")
        return {}
@ -116,41 +107,14 @@ def step_examples(args, _freq_cache: dict):
    with open(WORDS_JSON, encoding="utf-8") as f:
        words = json.load(f)

-    entries = list(words.values())
-    if args.test:
-        entries = entries[: args.test]
+    stats = epub_examples.run(words)

-    # Build confusable consonant set from words.json
-    consonant_counts: dict[str, int] = {}
-    for entry in entries:
-        ktiv_male = entry.get("word", {}).get("ktiv_male", "")
-        if ktiv_male:
-            safe = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male)
-            if safe:
-                consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
-    confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
+    # Save updated words.json
+    with open(WORDS_JSON, "w", encoding="utf-8") as f:
+        json.dump(words, f, ensure_ascii=False, indent=2)

-    # Delete stale cache entries for confusable words so they get re-fetched
-    stale_deleted = 0
-    for entry in entries:
-        word_nikkud = entry.get("word", {}).get("nikkud", "")
-        ktiv_male = entry.get("word", {}).get("ktiv_male", "")
-        if word_nikkud and ktiv_male:
-            safe = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male)
-            if safe in confusable_consonants and word_nikkud in benyehuda._examples_cache:
-                del benyehuda._examples_cache[word_nikkud]
-                stale_deleted += 1
-    if stale_deleted:
-        logger.info(f"    Deleted {stale_deleted} stale confusable cache entries")
-
-    logger.info(f"    Pre-fetching examples for {len(entries)} words …")
-    for entry in entries:
-        word_nikkud = entry.get("word", {}).get("nikkud", "")
-        if word_nikkud:
-            benyehuda.get_examples(word_nikkud, confusable_consonants=confusable_consonants)
-
-    benyehuda.save_examples_cache()
-    return benyehuda._examples_cache
+    logger.info(f"    Coverage: {stats['matched']}/{stats['total_vocab']}")
+    return stats


 def step_detail_scrape(args):
@ -250,7 +214,7 @@ def step_build_all(args):
    apkg_builder.build_all_variants(words, limit=args.test)


-def print_summary(_args: argparse.Namespace, examples_cache: dict, freq_cache: dict):
+def print_summary(_args: argparse.Namespace, example_stats: dict, freq_cache: dict):
    logger.info("")
    logger.info("=" * 60)
    logger.info("SUMMARY")
@ -267,10 +231,12 @@ def print_summary(_args: argparse.Namespace, examples_cache: dict, freq_cache: d
        logger.info(f"  Nouns: {nouns}, Verbs: {verbs}, Detail-scraped: {detail_scraped}")

    logger.info(f"  Frequency entries: {len(freq_cache)}")
-    logger.info(f"  Example cache entries: {len(examples_cache)}")
-    covered = sum(1 for v in examples_cache.values() if v)
-    if examples_cache:
-        logger.info(f"  Example coverage: {covered}/{len(examples_cache)} ({100 * covered // len(examples_cache)}%)")
+    matched = example_stats.get("matched", 0)
+    total = example_stats.get("total_vocab", 0)
+    if total:
+        logger.info(f"  Example coverage: {matched}/{total} ({100 * matched // total}%)")
+    for book, count in example_stats.get("books", {}).items():
+        logger.info(f"    {book}: {count} sentences")

    if AUDIO_DIR.exists():
        mp3s = list(AUDIO_DIR.glob("*.mp3"))
@ -321,8 +287,6 @@ def main():
        logger.info(f"  MODE: --only {args.only}")
    if args.test:
        logger.info(f"  TEST MODE: {args.test} words")
-    if args.refresh_examples:
-        logger.info("  REFRESH EXAMPLES: Ben Yehuda index will be rebuilt")
    logger.info("=" * 60)

    def _load_words_for_only() -> dict:
@ -385,13 +349,13 @@ def main():
    step_list_scrape(args)  # 1 — scrape list pages → words.json (captures slugs)
    step_detail_scrape(args)  # 2 — scrape detail pages using slugs → words.json
    freq_cache = step_frequency()  # 3 — word frequency data
-    examples_cache = step_examples(args, _freq_cache=freq_cache)  # 4 — Ben Yehuda examples
+    example_stats = step_examples(args)  # 4 — EPUB example sentences
    step_audio_download(args)  # 5 — download audio mp3s
    step_fonts(args)  # 6 — download Heebo fonts
    step_images(args)  # 7 — fetch noun images
    step_build_all(args)  # 8 — build all .apkg variants

-    print_summary(args, examples_cache, freq_cache)
+    print_summary(args, example_stats, freq_cache)


 if __name__ == "__main__":
--- a/scripts/assign_frequency.py
+++ b/scripts/assign_frequency.py
@ -0,0 +1,392 @@
+#!/usr/bin/env python3
+"""Assign frequency ranks from the cleaned corpus to words.json entries.
+
+Two-tier assignment with PoS priority:
+  Tier 1: Match headword ktiv_male directly against corpus
+  Tier 2: Match conjugated/inflected forms (only if no other entry already
+           claimed that corpus word via tier 1)
+
+PoS priority (based on standalone-word likelihood in Hebrew text):
+  כינויי_גוף (Pronoun) > מילות_חיבור (Conjunction) > שם_תואר (Adjective) >
+  מילית (Particle) > שם_עצם (Noun) > תוארי_הפועל (Adverb) >
+  מילות_יחס (Preposition) > פעלים (Verb)
+
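+Usage:
+    python3 scripts/assign_frequency.py              # assign and save
+    python3 scripts/assign_frequency.py --dry-run    # preview only
+    python3 scripts/assign_frequency.py --stats      # show statistics only
+    python3 scripts/assign_frequency.py --upgrade    # let a better-ranked inflected form replace a rank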
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+from collections import defaultdict
+from pathlib import Path
+
+logger = logging.getLogger(__name__)
+
+PROJECT_ROOT = Path(__file__).parent.parent
+WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
+CLEAN_CACHE = PROJECT_ROOT / "data" / "frequency_clean.json"
+RAW_CACHE = PROJECT_ROOT / "data" / "frequency_cache.json"
+
+# Function word PoS — these dominate content words in homograph groups
+FUNCTION_POS = frozenset({"כינויי_גוף", "מילות_חיבור", "מילית", "מילות_יחס", "תוארי_הפועל"})
+
+# Content PoS that loses frequency when a function word dominates
+# Adjectives also lose (e.g. כן "honest" vs כן "yes") — they're rare collisions
+CONTENT_POS = frozenset({"שם_עצם", "שם_תואר", "פעלים"})
+
+# Manual overrides: at these ktiv_male forms, ALL homographs share frequency.
+# These are cases where the content word is genuinely common enough to deserve it,
+# e.g. עם "people" (NN) alongside עם "with" (PREP), which sits at corpus rank 15.
+SHARE_ALL_WORDS = frozenset(
+    {
+        "עם",  # "people" (NN) + "with" (PREP)
+        "שם",  # "name" (NN) + "there" (ADV)
+        "אל",  # "god" (NN) + "to" (PREP) + "don't" (PART)
+        "עד",  # "witness"/"eternity" (NN) + "until" (PREP)
+        "פה",  # "mouth" (NN) + "here" (ADV)
+        "לאחר",  # "to be late" (VB) + "after" (PREP)
+        "יופי",  # "beauty" (NN) + "great!" (ADV)
+        "המון",  # "crowd" (NN) + "lots of" (ADV)
+        "חבל",  # "rope" (NN) + "it's a pity" (ADV)
+        "ראשית",  # "beginning" (NN) + "firstly" (ADV)
+        "עקב",  # "heel"/"footprint" (NN) + "due to" (CONJ)
+        "אולם",  # "hall" (NN) + "however" (ADV)
+    }
+)
+
+
+def _get_pos_tag(entry: dict) -> str:
+    """Return the first whitespace-separated tag that is not a שורש (root) tag.
+
+    A missing or empty "tags" field, or one that holds only root tags, gives
+    "unknown".
+    """
+    raw_tags = entry.get("tags") or ""
+    candidates = (tag for tag in raw_tags.split() if not tag.startswith("שורש"))
+    return next(candidates, "unknown")
+
+
+def _build_form_index(words: dict) -> dict[str, list[tuple[str, str]]]:
+    """Build reverse index: ktiv_male_form -> [(unique_key, match_type), ...]
+
+    match_type is "headword" for the entry's own word, "conjugation" for verb
+    forms (active and hufal/pual lists) and "inflection" for noun, preposition
+    and adjective inflection forms.
+    """
+    index: dict[str, list[tuple[str, str]]] = defaultdict(list)
+
+    def _add_verb_forms(owner: str, forms: list | None) -> None:
+        # Verb conjugations: indexed for new-assignment-only matching (no upgrades).
+        # Conjugated forms collide with unrelated headwords, so tier 2 only uses
+        # these for entries that have NO existing frequency.
+        for item in forms or []:
+            if not isinstance(item, dict):
+                continue
+            surface = (item.get("form") or {}).get("ktiv_male")
+            if surface:
+                # Drop trailing "!", right-to-left marks and spaces before indexing.
+                index[surface.rstrip("!\u200f ")].append((owner, "conjugation"))
+
+    for key, entry in words.items():
+        headword = (entry.get("word") or {}).get("ktiv_male")
+        if headword:
+            index[headword].append((key, "headword"))
+
+        conj = entry.get("conjugation") or {}
+        _add_verb_forms(key, conj.get("active_forms"))
+        _add_verb_forms(key, conj.get("hufal_pual_forms"))
+
+        for field in ("noun_inflection", "preposition_inflection", "adjective_inflection"):
+            for inf_data in (entry.get(field) or {}).values():
+                # Non-dict values (e.g. None placeholders) carry no form to index.
+                if not isinstance(inf_data, dict):
+                    continue
+                inflected = inf_data.get("ktiv_male")
+                if inflected:
+                    index[inflected].append((key, "inflection"))
+
+    return dict(index)
+
+
+def _should_get_frequency(
+    entry: dict,
+    all_headword_entries: list[tuple[str, str]],
+    corpus_word: str,
+    words: dict,
+) -> bool:
+    """Tell whether ``entry`` receives the frequency within its homograph group.
+
+    The answer is True when any of these holds:
+    - the group has at most one entry;
+    - ``corpus_word`` is listed in SHARE_ALL_WORDS;
+    - no entry in the group carries a function-word PoS;
+    - ``entry`` itself is not tagged with a content PoS.
+
+    It is False only for a content-PoS entry that shares its headword with at
+    least one function-word entry.
+    """
+    if len(all_headword_entries) <= 1 or corpus_word in SHARE_ALL_WORDS:
+        return True
+
+    own_pos = _get_pos_tag(entry)
+
+    group_has_function = False
+    for member_key, _match_type in all_headword_entries:
+        if _get_pos_tag(words[member_key]) in FUNCTION_POS:
+            group_has_function = True
+            break
+
+    return own_pos not in CONTENT_POS or not group_has_function
+
+
+def assign_frequencies(
+    words: dict,
+    freq_corpus: dict[str, int],
+    raw_corpus: dict[str, int] | None = None,
+    upgrade: bool = False,
+) -> dict[str, dict]:
+    """Assign frequency ranks to words.json entries. Returns assignment details.
+
+    Args:
+        words: words.json entries keyed by unique key.
+        freq_corpus: Corpus word -> rank. Decides which corpus words are
+            considered and the order they are visited in (ascending rank).
+        raw_corpus: Optional corpus word -> original rank (with gaps). When
+            given, the rank stored for a match is looked up here, falling back
+            to the freq_corpus rank for words it does not contain. When
+            omitted, freq_corpus ranks are stored.
+        upgrade: If True, tier 2 may replace an entry's existing rank when an
+            inflection match has a better (lower) rank. Conjugation matches
+            never upgrade an existing rank.
+
+    Returns:
+        Dict of unique key -> {"rank", "source", "corpus_word"}, where source
+        is "headword", "conjugation", "inflection" or "upgrade:<match type>".
+    """
+    rank_source = raw_corpus if raw_corpus is not None else freq_corpus
+    form_index = _build_form_index(words)
+
+    # Track which corpus words have been claimed by tier 1
+    tier1_claimed: set[str] = set()
+
+    # Results tracking
+    assignments: dict[str, dict] = {}  # unique_key -> {rank, source, corpus_word}
+
+    # --- Tier 1: headword matches ---
+    # For each corpus word, find all headword matches and assign to eligible entries.
+    # Homograph groups: function words get frequency, content words don't (unless overridden).
+    corpus_by_rank = sorted(freq_corpus.items(), key=lambda x: x[1])
+
+    for corpus_word, _clean_rank in corpus_by_rank:
+        matches = form_index.get(corpus_word, [])
+        headword_matches = [(k, t) for k, t in matches if t == "headword"]
+        if not headword_matches:
+            continue
+
+        original_rank = rank_source.get(corpus_word, _clean_rank)
+        assigned_any = False
+        for entry_key, _ in headword_matches:
+            # An entry keeps the first assignment it received.
+            if entry_key in assignments:
+                continue
+            if _should_get_frequency(words[entry_key], headword_matches, corpus_word, words):
+                assignments[entry_key] = {
+                    "rank": original_rank,
+                    "source": "headword",
+                    "corpus_word": corpus_word,
+                }
+                assigned_any = True
+
+        # Only corpus words that actually produced an assignment are closed to tier 2.
+        if assigned_any:
+            tier1_claimed.add(corpus_word)
+
+    tier1_count = len(assignments)
+    logger.info("Tier 1 (headword): %d entries assigned", tier1_count)
+
+    # --- Tier 2: conjugation/inflection matches ---
+    # Only use corpus words NOT claimed in tier 1.
+    # A corpus word that matches an inflection is "owned" by that headword —
+    # it cannot also be used by an unrelated verb via conjugation.
+    # Upgrades (when enabled) are only ever taken from inflection matches.
+    for corpus_word, _clean_rank in corpus_by_rank:
+        if corpus_word in tier1_claimed:
+            continue
+
+        matches = form_index.get(corpus_word, [])
+        secondary_matches = [(k, t) for k, t in matches if t in ("conjugation", "inflection")]
+        if not secondary_matches:
+            continue
+
+        original_rank = rank_source.get(corpus_word, _clean_rank)
+
+        # Split by type: inflections take priority over conjugations
+        inflection_matches = [(k, t) for k, t in secondary_matches if t == "inflection"]
+        conjugation_matches = [(k, t) for k, t in secondary_matches if t == "conjugation"]
+
+        # If any inflection matches exist, this corpus word belongs to inflection.
+        # Don't let conjugations claim it.
+        active_matches = inflection_matches if inflection_matches else conjugation_matches
+
+        # At most one entry is assigned or upgraded per corpus word: both
+        # branches below end with `break`.
+        for entry_key, match_type in active_matches:
+            existing = assignments.get(entry_key)
+            if existing is None:
+                # New assignment — conjugations only allowed for rank > 5000
+                # (too many false positives in the important tiers)
+                if match_type == "conjugation" and original_rank <= 5000:
+                    continue
+                assignments[entry_key] = {
+                    "rank": original_rank,
+                    "source": match_type,
+                    "corpus_word": corpus_word,
+                }
+                break
+            if upgrade and match_type == "inflection" and original_rank < existing["rank"]:
+                # Upgrade — only allowed for inflections (conjugations collide too much)
+                assignments[entry_key] = {
+                    "rank": original_rank,
+                    "source": f"upgrade:{match_type}",
+                    "corpus_word": corpus_word,
+                }
+                break
+
+    # Upgrades overwrite existing keys, so this counts only entries first assigned in tier 2.
+    tier2_count = len(assignments) - tier1_count
+    logger.info("Tier 2 (conjugation/inflection): %d entries assigned", tier2_count)
+
+    return assignments
+
+
+def print_stats(words: dict, assignments: dict, freq_corpus: dict) -> None:
+    """Print detailed statistics about frequency assignment.
+
+    Args:
+        words: words.json entries keyed by unique key. Each entry's current
+            "frequency" value is treated as the pre-assignment state.
+        assignments: unique key -> {"rank", "source", "corpus_word"} for every
+            entry that receives a rank.
+        freq_corpus: Corpus mapping used for the assignment (only its size is
+            reported).
+    """
+    from collections import Counter
+
+    total = len(words)
+    assigned = len(assignments)
+    previously_had = sum(1 for e in words.values() if e.get("frequency") is not None)
+
+    # Entries that had no frequency before and are assigned now. Counting them
+    # directly (instead of `assigned - previously_had`) keeps the number
+    # correct when other entries lose their frequency in the same run.
+    newly = []
+    for k, a in assignments.items():
+        if words[k].get("frequency") is None:
+            w = words[k].get("word") or {}
+            newly.append((a["rank"], k, w.get("ktiv_male") or "", a["source"], a["corpus_word"]))
+    newly.sort()
+
+    # Entries that LOST frequency (had it before, not assigned now)
+    lost = []
+    for k, v in words.items():
+        old_freq = v.get("frequency")
+        if old_freq is not None and k not in assignments:
+            w = v.get("word") or {}
+            lost.append((old_freq, k, w.get("ktiv_male") or ""))
+    lost.sort()
+
+    print(f"\n{'=' * 60}")
+    print("Frequency Assignment Statistics")
+    print(f"{'=' * 60}")
+    print(f"Words.json entries:      {total}")
+    print(f"Clean corpus size:       {len(freq_corpus)}")
+    print(f"Previously had freq:     {previously_had}")
+    print(f"Now assigned:            {assigned}")
+    print(f"Newly gained:            {len(newly)}")
+    print(f"Still unlisted:          {total - assigned}")
+
+    # By tier. Upgraded entries carry an "upgrade:<type>" source, so they get
+    # their own line; without it the tier lines would not add up to `assigned`.
+    sources = Counter(a["source"] for a in assignments.values())
+    upgraded = sum(n for src, n in sources.items() if src.startswith("upgrade:"))
+    print("\nBy assignment tier:")
+    print(f"  Tier 1 (headword):     {sources['headword']}")
+    print(f"  Tier 2 (conjugation):  {sources['conjugation']}")
+    print(f"  Tier 2 (inflection):   {sources['inflection']}")
+    print(f"  Tier 2 (upgraded):     {upgraded}")
+
+    # By PoS
+    print("\nBy PoS:")
+    pos_assigned: Counter = Counter()
+    pos_total: Counter = Counter()
+    for k, v in words.items():
+        pos = _get_pos_tag(v)
+        pos_total[pos] += 1
+        if k in assignments:
+            pos_assigned[pos] += 1
+    pos_order = [
+        "כינויי_גוף",
+        "מילות_חיבור",
+        "שם_תואר",
+        "מילית",
+        "שם_עצם",
+        "תוארי_הפועל",
+        "מילות_יחס",
+        "פעלים",
+        "unknown",
+    ]
+    # Tags outside pos_order sort after all listed ones.
+    pos_rank = {p: i for i, p in enumerate(pos_order)}
+    for pos in sorted(pos_total, key=lambda p: pos_rank.get(p, 99)):
+        a = pos_assigned[pos]
+        t = pos_total[pos]
+        pct = a / t * 100 if t else 0
+        print(f"  {pos:20s}: {a:5d}/{t:5d} ({pct:.0f}%)")
+
+    # By frequency tier (using apkg_builder tiers)
+    print("\nBy frequency tier:")
+    tiers = {
+        "Core (1-500)": (1, 500),
+        "Essential (501-1500)": (501, 1500),
+        "Intermediate (1501-3000)": (1501, 3000),
+        "Upper-intermediate (3001-5000)": (3001, 5000),
+        "Advanced (5001-10000)": (5001, 10000),
+        "Rare (10001+)": (10001, 999999),
+    }
+    for label, (lo, hi) in tiers.items():
+        count = sum(1 for a in assignments.values() if lo <= a["rank"] <= hi)
+        print(f"  {label:35s}: {count}")
+
+    # Top 20 newly assigned (entries that didn't have frequency before)
+    if newly:
+        print("\nTop 20 newly assigned entries:")
+        for rank, _key, ktiv, source, corpus_word in newly[:20]:
+            print(f"  rank {rank:5d}: {ktiv:15s} via {source:12s} (corpus: {corpus_word})")
+
+    if lost:
+        print(f"\nEntries that would LOSE frequency ({len(lost)} total):")
+        for rank, _key, ktiv in lost[:20]:
+            print(f"  was rank {rank:5d}: {ktiv}")
+
+
+def main() -> None:
+    """CLI entry point: load the corpora and words.json, assign ranks, report, save.
+
+    The cleaned corpus (CLEAN_CACHE) is used when it exists, otherwise the raw
+    cache. When the cleaned corpus is in use and the raw cache also exists, the
+    raw cache supplies the original rank numbers. Statistics are always
+    printed; with --stats or --dry-run nothing is written back to words.json.
+    """
+    parser = argparse.ArgumentParser(description="Assign frequency to words.json")
+    parser.add_argument("--dry-run", action="store_true", help="Preview without saving")
+    parser.add_argument("--stats", action="store_true", help="Show statistics only")
+    parser.add_argument(
+        "--upgrade",
+        action="store_true",
+        # Tier 2 only ever upgrades from inflection matches, never conjugations.
+        help="Allow tier 2 to upgrade an entry's rank from a better-ranked inflected form",
+    )
+    args = parser.parse_args()
+
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+
+    # Load data
+    freq_path = CLEAN_CACHE if CLEAN_CACHE.exists() else RAW_CACHE
+    logger.info("Loading frequency corpus: %s", freq_path)
+    with open(freq_path, encoding="utf-8") as f:
+        freq_corpus: dict[str, int] = json.load(f)
+
+    # Load raw corpus for original rank numbers (with gaps)
+    raw_corpus: dict[str, int] | None = None
+    if RAW_CACHE.exists() and freq_path != RAW_CACHE:
+        with open(RAW_CACHE, encoding="utf-8") as f:
+            raw_corpus = json.load(f)
+        logger.info("Using original ranks from %s", RAW_CACHE)
+
+    with open(WORDS_JSON, encoding="utf-8") as f:
+        words: dict = json.load(f)
+
+    logger.info("Corpus: %d entries, Words.json: %d entries", len(freq_corpus), len(words))
+
+    # Run assignment
+    assignments = assign_frequencies(words, freq_corpus, raw_corpus, upgrade=args.upgrade)
+
+    # Stats
+    print_stats(words, assignments, freq_corpus)
+
+    if args.stats or args.dry_run:
+        if args.dry_run:
+            logger.info("Dry run — no changes saved")
+        return
+
+    # Apply to words.json: assigned entries take their new rank, all others
+    # are reset to None. Entries already holding the target value are left
+    # untouched and not counted.
+    changed = 0
+    for key, entry in words.items():
+        assignment = assignments.get(key)
+        new_rank = assignment["rank"] if assignment is not None else None
+        if entry.get("frequency") != new_rank:
+            entry["frequency"] = new_rank
+            changed += 1
+
+    with open(WORDS_JSON, "w", encoding="utf-8") as f:
+        json.dump(words, f, ensure_ascii=False, indent=2)
+
+    logger.info("Updated %d entries in words.json", changed)
+
+if __name__ == "__main__":
+    main()
--- a/scripts/clean_frequency_corpus.py
+++ b/scripts/clean_frequency_corpus.py
@ -0,0 +1,400 @@
+#!/usr/bin/env python3
+"""Clean the Hebrew frequency corpus by removing prefix+word combinations.
+
+Two modes:
+  --mode yap       (default) Use YAP morphological analyzer for accurate prefix detection.
+                   Requires YAP API running at localhost:8000.
+  --mode heuristic Use rule-based prefix stripping (no external dependencies).
+
+Both modes preserve words that exist as known dictionary forms in words.json.
+
+Usage:
+    python3 scripts/clean_frequency_corpus.py                    # YAP mode
+    python3 scripts/clean_frequency_corpus.py --mode heuristic   # heuristic fallback
+    python3 scripts/clean_frequency_corpus.py --dry-run          # preview only
+    python3 scripts/clean_frequency_corpus.py --resume           # resume YAP from checkpoint
+    python3 scripts/clean_frequency_corpus.py --limit 1000       # process first N entries
+
+Input:  data/frequency_cache.json   (raw he_50k.txt, 49999 entries)
+Output: data/frequency_clean.json   (filtered, prefix combos removed)
+        data/frequency_discarded.json (discarded entries with reason)
+"""
+
+from __future__ import annotations
+
+import argparse
+import json
+import logging
+import os
+import sys
+import time
+from pathlib import Path
+
+import requests
+
+logger = logging.getLogger(__name__)
+
+PROJECT_ROOT = Path(__file__).parent.parent
+RAW_CACHE = PROJECT_ROOT / "data" / "frequency_cache.json"
+CLEAN_CACHE = PROJECT_ROOT / "data" / "frequency_clean.json"
+DISCARDED = PROJECT_ROOT / "data" / "frequency_discarded.json"
+WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
+CHECKPOINT = PROJECT_ROOT / "data" / "_yap_checkpoint.json"
+
+# Endpoint that query_yap() posts to; override with the YAP_URL environment variable.
+YAP_URL = os.environ.get("YAP_URL", "http://localhost:8000/yap/heb/joint")
+# Passed as the `timeout` argument of requests.post() in query_yap().
+YAP_TIMEOUT = 10
+# _run_yap_mode() writes its checkpoint and logs progress every this many words.
+BATCH_SAVE_INTERVAL = 500
+
+# --- YAP mode constants ---
+# An arc whose cpos or pos is in this set is treated as a prefix by is_prefix_combo_yap().
+PREFIX_POS = frozenset({"PREPOSITION", "CONJ", "DEF", "REL"})
+# When a prefix arc is directly followed by an arc with one of these tags, the
+# corpus entry is reported as a prefix+host combination (and therefore discarded).
+HOST_POS = frozenset({"NN", "NNP", "NNT", "PRP", "CD", "DT", "EX"})
+
+# --- Heuristic mode constants ---
+# Hebrew prefix combinations, longest first for greedy matching.
+PREFIXES = [
+    # 4-char
+    "וכשמ",
+    "וכשב",
+    "וכשל",
+    "וכשה",
+    # 3-char
+    "וכש",
+    "ומה",
+    "ובה",
+    "וכה",
+    "ולה",
+    "ומש",
+    "ובש",
+    "וכב",
+    "ולב",
+    "ומב",
+    "וכל",
+    "ולכ",
+    "שבה",
+    "שמה",
+    # 2-char
+    "כש",
+    "מה",
+    "בה",
+    "כה",
+    "לה",
+    "מש",
+    "בש",
+    "וב",
+    "וה",
+    "וכ",
+    "ול",
+    "ומ",
+    "וש",
+    "כב",
+    "לב",
+    "מב",
+    "כל",
+    "לכ",
+    "שב",
+    "שה",
+    "שכ",
+    "של",
+    "שמ",
+    # 1-char
+    "ב",
+    "ה",
+    "ו",
+    "כ",
+    "ל",
+    "מ",
+    "ש",
+]
+MIN_REMAINDER_LEN = 2
+
+
+def _load_known_forms(words_path: Path) -> set[str]:
+    """Collect every ``ktiv_male`` spelling recorded in words.json.
+
+    Spellings are gathered from each entry's ``word`` mapping, its
+    ``active_forms`` and ``hufal_pual_forms`` lists, and the values of its
+    noun/preposition/adjective inflection mappings.
+
+    Args:
+        words_path: Location of words.json.
+
+    Returns:
+        The set of spellings found, or an empty set (after a warning) when
+        the file does not exist.
+    """
+    if not words_path.exists():
+        logger.warning("words.json not found at %s — no dictionary filter", words_path)
+        return set()
+
+    with open(words_path, encoding="utf-8") as f:
+        words = json.load(f)
+
+    known: set[str] = set()
+
+    def _add_from(candidates) -> None:
+        # Items that are not dicts, or that carry no (non-empty) spelling, are skipped.
+        for cand in candidates:
+            if isinstance(cand, dict) and cand.get("ktiv_male"):
+                known.add(cand["ktiv_male"])
+
+    inflection_fields = ("noun_inflection", "preposition_inflection", "adjective_inflection")
+    for entry in words.values():
+        headword = entry.get("word") or {}
+        spelling = headword.get("ktiv_male")
+        if spelling:
+            known.add(spelling)
+
+        _add_from(entry.get("active_forms") or [])
+        _add_from(entry.get("hufal_pual_forms") or [])
+        for field in inflection_fields:
+            _add_from((entry.get(field) or {}).values())
+
+    logger.info("Loaded %d known dictionary forms from words.json", len(known))
+    return known
+
+
+# ── YAP mode ──────────────────────────────────────────────────────────────
+
+
+def query_yap(word: str) -> dict | None:
+    """Send a single word to YAP and return the decoded JSON object.
+
+    Args:
+        word: The surface form to analyze.
+
+    Returns:
+        The response body as a dict, or ``None`` when the request fails, the
+        server answers with an error status, or the body is not a JSON object.
+        Every failure is logged as a warning; nothing is raised to the caller.
+    """
+    # NOTE(review): the two trailing spaces look like YAP's expected input
+    # terminator — confirm against the YAP API before changing.
+    payload = {"text": f"{word}  "}
+    try:
+        resp = requests.post(YAP_URL, json=payload, timeout=YAP_TIMEOUT)
+        resp.raise_for_status()
+        data = resp.json()
+    except (requests.RequestException, ValueError) as e:
+        # ValueError covers JSON decode failures on requests versions where the
+        # decode error is not a RequestException subclass.
+        logger.warning("YAP request failed for '%s': %s", word, e)
+        return None
+
+    if not isinstance(data, dict):
+        # Callers use .get() on the result, so anything but an object is unusable.
+        logger.warning("YAP request failed for '%s': %s", word, "response is not a JSON object")
+        return None
+    return data
+
+
+def is_prefix_combo_yap(yap_response: dict) -> tuple[bool, str]:
+    """Report whether the YAP lattice contains a prefix arc followed by a host arc.
+
+    Conservative: a single prefix+host pairing anywhere in the lattice is
+    enough to flag the word.
+
+    Args:
+        yap_response: Decoded YAP response; only ``ma_lattice`` is read.
+
+    Returns:
+        ``(True, "<prefix form>(<cpos>)+<host form>(<cpos>)")`` for the first
+        pairing found, otherwise ``(False, "")``.
+    """
+    lattice = yap_response.get("ma_lattice", "")
+    if not lattice:
+        return False, ""
+
+    # Tab-separated rows; columns 0-5 are from, to, form, lemma, cpos, pos.
+    # Rows with fewer than six columns are ignored.
+    rows = [ln.split("\t") for ln in lattice.strip().split("\n") if ln.strip()]
+    arcs = [(cols[0], cols[1], cols[2], cols[4], cols[5]) for cols in rows if len(cols) >= 6]
+    if len(arcs) < 2:
+        return False, ""
+
+    # Group arcs by their start node, keeping lattice order inside each group.
+    successors: dict[str, list[tuple]] = {}
+    for arc in arcs:
+        successors.setdefault(arc[0], []).append(arc)
+
+    for _start, end, form, cpos, pos in arcs:
+        if not (cpos in PREFIX_POS or pos in PREFIX_POS):
+            continue
+        for _s, _e, host_form, host_cpos, host_pos in successors.get(end, []):
+            if host_cpos in HOST_POS or host_pos in HOST_POS:
+                return True, f"{form}({cpos})+{host_form}({host_cpos})"
+
+    return False, ""
+
+
+# ── Heuristic mode ────────────────────────────────────────────────────────
+
+
+def find_prefix_decomposition(word: str, freq: dict[str, int]) -> tuple[str, str] | None:
+    """Find a split of *word* into a known prefix plus a better-ranked corpus word.
+
+    Prefixes are tried in ``PREFIXES`` order and the first match wins.
+
+    Args:
+        word: Corpus entry to examine.
+        freq: Mapping of word to rank (lower rank = more frequent).
+
+    Returns:
+        ``(prefix, remainder)`` when the remainder is in *freq* with a rank
+        strictly lower than the word's own, otherwise ``None``.
+    """
+    if len(word) <= MIN_REMAINDER_LEN:
+        return None
+
+    # A word missing from the corpus is treated as ranked below everything.
+    own_rank = freq.get(word, 999999)
+
+    for prefix in PREFIXES:
+        rest = word[len(prefix) :]
+        usable = word.startswith(prefix) and len(rest) >= MIN_REMAINDER_LEN
+        if usable and rest in freq and freq[rest] < own_rank:
+            return prefix, rest
+
+    return None
+
+
+# ── Main ──────────────────────────────────────────────────────────────────
+
+
+def main() -> None:
+    """CLI entry point: load the raw corpus, detect prefix combos, write the results.
+
+    Reads RAW_CACHE, runs the selected detection mode, and (unless --dry-run)
+    writes the re-ranked corpus to CLEAN_CACHE and the discarded entries to
+    DISCARDED. Exits with status 1 when RAW_CACHE is missing.
+    """
+    cli = argparse.ArgumentParser(description="Clean frequency corpus")
+    cli.add_argument("--mode", choices=["yap", "heuristic"], default="yap", help="Detection mode")
+    cli.add_argument("--dry-run", action="store_true", help="Show removals without saving")
+    cli.add_argument("--resume", action="store_true", help="Resume YAP mode from checkpoint")
+    cli.add_argument("--limit", type=int, default=0, help="Process only first N words (0=all)")
+    args = cli.parse_args()
+
+    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+
+    if not RAW_CACHE.exists():
+        logger.error("Raw frequency cache not found: %s", RAW_CACHE)
+        sys.exit(1)
+
+    with open(RAW_CACHE, encoding="utf-8") as f:
+        raw_freq: dict[str, int] = json.load(f)
+
+    logger.info("Raw frequency corpus: %d entries", len(raw_freq))
+
+    # Most frequent first; --limit keeps only the head of that ordering.
+    words_by_rank = sorted(raw_freq.items(), key=lambda pair: pair[1])
+    if args.limit:
+        words_by_rank = words_by_rank[: args.limit]
+
+    if args.mode == "yap":
+        discarded_list = _run_yap_mode(words_by_rank, args)
+    else:
+        known_forms = _load_known_forms(WORDS_JSON)
+        discarded_list = _run_heuristic_mode(words_by_rank, raw_freq, known_forms)
+
+    logger.info("Done. Kept: %d, Discarded: %d", len(words_by_rank) - len(discarded_list), len(discarded_list))
+
+    if args.dry_run:
+        logger.info("Dry run — no files written")
+        return
+
+    # Survivors are renumbered 1..N so the clean corpus has no rank gaps.
+    discarded_words = {item["word"] for item in discarded_list}
+    survivors = [word for word, _rank in words_by_rank if word not in discarded_words]
+    clean_freq: dict[str, int] = {word: position for position, word in enumerate(survivors, start=1)}
+
+    with open(CLEAN_CACHE, "w", encoding="utf-8") as f:
+        json.dump(clean_freq, f, ensure_ascii=False)
+    logger.info("Clean frequency saved: %d entries → %s", len(clean_freq), CLEAN_CACHE)
+
+    with open(DISCARDED, "w", encoding="utf-8") as f:
+        json.dump(discarded_list, f, ensure_ascii=False, indent=2)
+    logger.info("Discarded entries saved: %d → %s", len(discarded_list), DISCARDED)
+
+
+def _run_yap_mode(
+    words_by_rank: list[tuple[str, int]],
+    args: argparse.Namespace,
+) -> list[dict]:
+    """Classify each corpus word with YAP and collect the prefix+host combinations.
+
+    Words of length <= 1, pure-ASCII words, and spellings found in words.json
+    are kept without querying YAP. Words whose YAP request fails are kept for
+    this run but are NOT recorded in the checkpoint, so ``--resume`` retries them.
+
+    Args:
+        words_by_rank: ``(word, rank)`` pairs in processing order.
+        args: Parsed CLI options; ``resume`` and ``dry_run`` are read here.
+
+    Returns:
+        One ``{"word", "original_rank", "reason"}`` dict per discarded word.
+
+    Exits the process with status 1 when the initial YAP connectivity probe fails.
+    """
+    # Check YAP connectivity
+    if query_yap("בדיקה") is None:
+        logger.error("Cannot connect to YAP API at %s", YAP_URL)
+        sys.exit(1)
+    logger.info("YAP API connected")
+
+    # The module contract says both modes preserve known dictionary forms.
+    known_forms = _load_known_forms(WORDS_JSON)
+
+    # Load checkpoint if resuming
+    analyzed: dict[str, dict] = {}
+    if args.resume and CHECKPOINT.exists():
+        with open(CHECKPOINT, encoding="utf-8") as f:
+            analyzed = json.load(f)
+        logger.info("Resumed from checkpoint: %d words already analyzed", len(analyzed))
+
+    def _save_checkpoint() -> None:
+        with open(CHECKPOINT, "w", encoding="utf-8") as f:
+            json.dump(analyzed, f, ensure_ascii=False)
+
+    discarded_list: list[dict] = []
+    discarded_count = 0
+    kept_count = 0
+    error_count = 0
+    dirty = False  # True when `analyzed` holds verdicts not yet written to disk
+
+    for i, (word, rank) in enumerate(words_by_rank):
+        cached = analyzed.get(word)
+        if len(word) <= 1 or word.isascii() or word in known_forms:
+            # Trivial or dictionary-backed entry: always kept, no YAP call needed.
+            kept_count += 1
+        elif cached is not None and cached.get("reason") != "yap_error":
+            # Verdict restored from the checkpoint. Entries that older
+            # checkpoints recorded as "yap_error" fall through and are retried.
+            if cached["discard"]:
+                discarded_count += 1
+                discarded_list.append({"word": word, "original_rank": rank, "reason": cached["reason"]})
+            else:
+                kept_count += 1
+        else:
+            result = query_yap(word)
+            if result is None:
+                # Keep the word for this run, but leave it out of `analyzed`
+                # so a later --resume asks YAP again.
+                analyzed.pop(word, None)
+                error_count += 1
+                kept_count += 1
+                time.sleep(0.5)
+            else:
+                is_combo, reason = is_prefix_combo_yap(result)
+                analyzed[word] = {"discard": is_combo, "reason": reason}
+                dirty = True
+
+                if is_combo:
+                    discarded_count += 1
+                    discarded_list.append({"word": word, "original_rank": rank, "reason": reason})
+                    if rank <= 500 or discarded_count <= 50:
+                        logger.info("  DISCARD rank %5d: %s (%s)", rank, word, reason)
+                else:
+                    kept_count += 1
+
+                # Rate limit
+                if i % 10 == 0:
+                    time.sleep(0.01)
+
+        # Checkpoint + progress. This runs for every word, whichever branch
+        # handled it, so an interval boundary is never skipped.
+        if (i + 1) % BATCH_SAVE_INTERVAL == 0:
+            if dirty and not args.dry_run:
+                _save_checkpoint()
+                dirty = False
+            logger.info(
+                "  [%d/%d] kept=%d discarded=%d errors=%d",
+                i + 1,
+                len(words_by_rank),
+                kept_count,
+                discarded_count,
+                error_count,
+            )
+
+    if not args.dry_run:
+        if error_count:
+            # Keep the successful verdicts so --resume only has to retry the failures.
+            _save_checkpoint()
+        elif CHECKPOINT.exists():
+            # Clean run: the checkpoint is no longer needed.
+            CHECKPOINT.unlink()
+
+    if error_count:
+        logger.warning("%d YAP errors encountered", error_count)
+
+    return discarded_list
+
+
+def _run_heuristic_mode(
+    words_by_rank: list[tuple[str, int]],
+    raw_freq: dict[str, int],
+    known_forms: set[str],
+) -> list[dict]:
+    """Detect prefix combinations with the rule-based splitter (no external services).
+
+    Args:
+        words_by_rank: ``(word, rank)`` pairs in processing order.
+        raw_freq: Full word-to-rank mapping used to look up remainders.
+        known_forms: Spellings that must never be discarded.
+
+    Returns:
+        One ``{"word", "original_rank", "reason"}`` dict per discarded word.
+    """
+    discarded_list: list[dict] = []
+
+    for word, rank in words_by_rank:
+        # Single characters, ASCII tokens and known dictionary forms are always kept.
+        exempt = len(word) <= 1 or word.isascii() or word in known_forms
+        split = None if exempt else find_prefix_decomposition(word, raw_freq)
+        if split is None:
+            continue
+
+        prefix, remainder = split
+        reason = f"{prefix}+{remainder} (rank {raw_freq[remainder]})"
+        discarded_list.append({"word": word, "original_rank": rank, "reason": reason})
+        # Log only high-ranked discards and the first fifty, to keep output readable.
+        if rank <= 500 or len(discarded_list) <= 50:
+            logger.info("  DISCARD rank %5d: %s = %s", rank, word, reason)
+
+    return discarded_list
+
+
+if __name__ == "__main__":
+    main()
--- a/scripts/validate_data.py
+++ b/scripts/validate_data.py
@ -32,7 +32,7 @@ DATA_FILE = Path(__file__).parent.parent / "data" / "words.json"
 HEBREW_CONSONANT_RANGE = (0x05D0, 0x05EA)  # alef–tav

 VALID_PERSON_CODES: frozenset[str] = frozenset(
-    ["1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp", "3p", "ms", "fs", "mp", "fp"]
+    ["inf", "1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp", "3p", "ms", "fs", "mp", "fp"]
 )

 EMOJI_RE = re.compile(
@ -561,6 +561,7 @@ def test_conjugation_form_guids(data: dict[str, Any]) -> None:
    """
    name = "conjugation_form_guids"
    errors: list[str] = []
+    warnings: list[str] = []

    for key, entry in data.items():
        conj = entry.get("conjugation")
@ -580,7 +581,8 @@ def test_conjugation_form_guids(data: dict[str, Any]) -> None:
                guid_candidates = form.get("guid_candidates")

                if not guid and not guid_candidates:
-                    errors.append(f"[{key}] {label}: missing or null 'guid' and no 'guid_candidates'")
+                    # New forms from rescrape use deterministic fallback — warn, don't fail
+                    warnings.append(f"[{key}] {label}: missing or null 'guid' and no 'guid_candidates'")
                    continue

                if guid:
@ -597,6 +599,8 @@ def test_conjugation_form_guids(data: dict[str, Any]) -> None:
                        else:
                            seen_guids[candidate] = label

+    if warnings:
+        _warn(name + "_missing", [f"{len(warnings)} forms missing guid (deterministic fallback used)"])
    if errors:
        _fail(name, errors[:20] if not _verbose else errors)
        if len(errors) > 20 and not _verbose:
--- a/tests/test_detail_scrape.py
+++ b/tests/test_detail_scrape.py
@ -0,0 +1,486 @@
+"""Tests for adjective and preposition detail page parsing in pealim_detail_scrape.py."""
+
+import sys
+from pathlib import Path
+
+import pytest
+
+sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
+
+from pealim_detail_scrape import (
+    _parse_adjective_table,
+    _parse_adjective_table_vl,
+    _parse_preposition_table,
+    _parse_preposition_table_vl,
+    _scrape_adjective_detail,
+    _scrape_preposition_detail,
+)
+
+# ---------------------------------------------------------------------------
+# Fixtures — real HTML snippets from pealim.com
+# ---------------------------------------------------------------------------
+
+ADJECTIVE_MO_TABLE = """
+<table class="table table-condensed conjugation-table">
+  <thead>
+    <tr>
+      <th class="column-header" colspan="2">Singular</th>
+      <th class="column-header" colspan="2">Plural</th>
+    </tr>
+    <tr>
+      <th class="column-header">Masculine</th>
+      <th class="column-header">Feminine</th>
+      <th class="column-header">Masculine</th>
+      <th class="column-header">Feminine</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <td class="conj-td">
+        <div id="ms-a">
+          <div><div>
+            <span class="audio-play" data-audio="https://audio.pealim.com/v0/dn/dngfpnovmytc.mp3">&#128266;</span>
+            <span class="menukad">אֲבִיבִי</span>
+          </div></div>
+          <div class="meaning">spring-like, vernal</div>
+        </div>
+      </td>
+      <td class="conj-td">
+        <div id="fs-a">
+          <div><div>
+            <span class="audio-play" data-audio="https://audio.pealim.com/v0/1j/1j6srg3do7n5k.mp3">&#128266;</span>
+            <span class="menukad">אֲבִיבִית</span>
+          </div></div>
+          <div class="meaning">spring-like, vernal</div>
+        </div>
+      </td>
+      <td class="conj-td">
+        <div id="mp-a">
+          <div><div>
+            <span class="audio-play" data-audio="https://audio.pealim.com/v0/tj/tjrhw0b5dkhc.mp3">&#128266;</span>
+            <span class="menukad">אֲבִיבִיִּים</span>
+          </div></div>
+          <div class="meaning">spring-like, vernal</div>
+        </div>
+      </td>
+      <td class="conj-td">
+        <div id="fp-a">
+          <div><div>
+            <span class="audio-play" data-audio="https://audio.pealim.com/v0/h3/h3u1ml5a4xcf.mp3">&#128266;</span>
+            <span class="menukad">אֲבִיבִיּוֹת</span>
+          </div></div>
+          <div class="meaning">spring-like, vernal</div>
+        </div>
+      </td>
+    </tr>
+  </tbody>
+</table>
+"""
+
+# VL version: menukad spans contain unvowelled text (hebstyle=vl)
+ADJECTIVE_VL_TABLE = """
+<table class="table table-condensed conjugation-table">
+  <tbody>
+    <tr>
+      <td class="conj-td">
+        <div id="ms-a"><div><div>
+          <span class="menukad">אביבי</span>
+        </div></div></div>
+      </td>
+      <td class="conj-td">
+        <div id="fs-a"><div><div>
+          <span class="menukad">אביבית</span>
+        </div></div></div>
+      </td>
+      <td class="conj-td">
+        <div id="mp-a"><div><div>
+          <span class="menukad">אביביים</span>
+        </div></div></div>
+      </td>
+      <td class="conj-td">
+        <div id="fp-a"><div><div>
+          <span class="menukad">אביביות</span>
+        </div></div></div>
+      </td>
+    </tr>
+  </tbody>
+</table>
+"""
+
+PREPOSITION_MO_TABLE = """
+<table class="table table-condensed conjugation-table">
+  <thead>
+    <tr>
+      <th rowspan="2">Person</th>
+      <th class="column-header" colspan="2">Singular</th>
+      <th class="column-header" colspan="2">Plural</th>
+    </tr>
+    <tr>
+      <th class="column-header">Masculine</th>
+      <th class="column-header">Feminine</th>
+      <th class="column-header">Masculine</th>
+      <th class="column-header">Feminine</th>
+    </tr>
+  </thead>
+  <tbody>
+    <tr>
+      <th>1st</th>
+      <td class="conj-td" colspan="2">
+        <div id="P-1s"><div><div>
+          <span class="audio-play" data-audio="https://audio.pealim.com/v0/fk/fkp5faeteecr.mp3">&#128266;</span>
+          <span class="menukad">שֶׁלִּי</span>
+        </div></div><div class="meaning"><strong>of mine</strong></div></div>
+      </td>
+      <td class="conj-td" colspan="2">
+        <div id="P-1p"><div><div>
+          <span class="audio-play" data-audio="https://audio.pealim.com/v0/13/13uvi0dz6tgcc.mp3">&#128266;</span>
+          <span class="menukad">שֶׁלָּנוּ</span>
+        </div></div><div class="meaning"><strong>of ours</strong></div></div>
+      </td>
+    </tr>
+    <tr>
+      <th>2nd</th>
+      <td class="conj-td">
+        <div id="P-2ms"><div><div>
+          <span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/shbxafq8ietx.mp3">&#128266;</span>
+          <span class="menukad">שֶׁלְּךָ</span>
+        </div></div><div class="meaning"><strong>of yours</strong> <em>m. sg.</em></div></div>
+      </td>
+      <td class="conj-td">
+        <div id="P-2fs"><div><div>
+          <span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/sh9ue3a8buo3.mp3">&#128266;</span>
+          <span class="menukad">שֶׁלָּךְ</span>
+        </div></div><div class="meaning"><strong>of yours</strong> <em>f. sg.</em></div></div>
+      </td>
+      <td class="conj-td">
+        <div id="P-2mp"><div><div>
+          <span class="audio-play" data-audio="https://audio.pealim.com/v0/ol/olx8vzsctlzn.mp3">&#128266;</span>
+          <span class="menukad">שֶׁלָּכֶם</span>
+        </div></div><div class="meaning"><strong>of yours</strong> <em>m. pl.</em></div></div>
+      </td>
+      <td class="conj-td">
+        <div id="P-2fp"><div><div>
+          <span class="audio-play" data-audio="https://audio.pealim.com/v0/ol/olxrms6dl8eq.mp3">&#128266;</span>
+          <span class="menukad">שֶׁלָּכֶן</span>
+        </div></div><div class="meaning"><strong>of yours</strong> <em>f. pl.</em></div></div>
+      </td>
+    </tr>
+    <tr>
+      <th>3rd</th>
+      <td class="conj-td">
+        <div id="P-3ms"><div><div>
+          <span class="audio-play" data-audio="https://audio.pealim.com/v0/fk/fkp5qigelthg.mp3">&#128266;</span>
+          <span class="menukad">שֶׁלּוֹ</span>
+        </div></div><div class="meaning"><strong>of his</strong></div></div>
+      </td>
+      <td class="conj-td">
+        <div id="P-3fs"><div><div>
+          <span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/sh9w36hojm5w.mp3">&#128266;</span>
+          <span class="menukad">שֶׁלָּהּ</span>
+        </div></div><div class="meaning"><strong>of hers</strong></div></div>
+      </td>
+      <td class="conj-td">
+        <div id="P-3mp"><div><div>
+          <span class="audio-play" data-audio="https://audio.pealim.com/v0/n9/n99z0jr8pint.mp3">&#128266;</span>
+          <span class="menukad">שֶׁלָּהֶם</span>
+        </div></div><div class="meaning"><strong>of theirs</strong> <em>m.</em></div></div>
+      </td>
+      <td class="conj-td">
+        <div id="P-3fp"><div><div>
+          <span class="audio-play" data-audio="https://audio.pealim.com/v0/n9/n9ahrc59h52w.mp3">&#128266;</span>
+          <span class="menukad">שֶׁלָּהֶן</span>
+        </div></div><div class="meaning"><strong>of theirs</strong> <em>f.</em></div></div>
+      </td>
+    </tr>
+  </tbody>
+</table>
+"""
+
+PREPOSITION_VL_TABLE = """
+<table class="table table-condensed conjugation-table">
+  <tbody>
+    <tr>
+      <th>1st</th>
+      <td colspan="2"><div id="P-1s"><div><div>
+        <span class="menukad">שלי</span>
+      </div></div></div></td>
+      <td colspan="2"><div id="P-1p"><div><div>
+        <span class="menukad">שלנו</span>
+      </div></div></div></td>
+    </tr>
+    <tr>
+      <th>2nd</th>
+      <td><div id="P-2ms"><div><div>
+        <span class="menukad">שלך</span>
+      </div></div></div></td>
+      <td><div id="P-2fs"><div><div>
+        <span class="menukad">שלך</span>
+      </div></div></div></td>
+      <td><div id="P-2mp"><div><div>
+        <span class="menukad">שלכם</span>
+      </div></div></div></td>
+      <td><div id="P-2fp"><div><div>
+        <span class="menukad">שלכן</span>
+      </div></div></div></td>
+    </tr>
+    <tr>
+      <th>3rd</th>
+      <td><div id="P-3ms"><div><div>
+        <span class="menukad">שלו</span>
+      </div></div></div></td>
+      <td><div id="P-3fs"><div><div>
+        <span class="menukad">שלה</span>
+      </div></div></div></td>
+      <td><div id="P-3mp"><div><div>
+        <span class="menukad">שלהם</span>
+      </div></div></div></td>
+      <td><div id="P-3fp"><div><div>
+        <span class="menukad">שלהן</span>
+      </div></div></div></td>
+    </tr>
+  </tbody>
+</table>
+"""
+
+# Minimal full-page wrappers so _scrape_*_detail() can parse them
+_ADJECTIVE_MO_PAGE = f"<html><body>{ADJECTIVE_MO_TABLE}</body></html>"
+_ADJECTIVE_VL_PAGE = f"<html><body>{ADJECTIVE_VL_TABLE}</body></html>"
+_PREPOSITION_MO_PAGE = f"<html><body>{PREPOSITION_MO_TABLE}</body></html>"
+_PREPOSITION_VL_PAGE = f"<html><body>{PREPOSITION_VL_TABLE}</body></html>"
+
+
+# ---------------------------------------------------------------------------
+# Adjective table tests
+# ---------------------------------------------------------------------------
+
+
+class TestParseAdjectiveTable:
+    """Tests for _parse_adjective_table (mo/nikkud page)."""
+
+    @pytest.fixture()
+    def result(self) -> dict:
+        # Parse the nikkud fixture once per test instead of repeating the call inline.
+        return _parse_adjective_table(__import__("bs4").BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml"))
+
+    def test_returns_four_form_keys(self, result: dict) -> None:
+        assert set(result.keys()) == {"ms", "fs", "mp", "fp"}
+
+    def test_ms_nikkud(self, result: dict) -> None:
+        assert result["ms"]["nikkud"] == "אֲבִיבִי"
+
+    def test_fs_nikkud(self, result: dict) -> None:
+        assert result["fs"]["nikkud"] == "אֲבִיבִית"
+
+    def test_mp_nikkud(self, result: dict) -> None:
+        assert result["mp"]["nikkud"] == "אֲבִיבִיִּים"
+
+    def test_fp_nikkud(self, result: dict) -> None:
+        assert result["fp"]["nikkud"] == "אֲבִיבִיּוֹת"
+
+    def test_audio_url_present(self, result: dict) -> None:
+        assert result["ms"]["audio_url"].startswith("https://audio.pealim.com/")
+
+    def test_empty_on_missing_table(self) -> None:
+        # Uses its own table-less document, so it does not take the shared fixture.
+        result = _parse_adjective_table(__import__("bs4").BeautifulSoup("<html><body></body></html>", "lxml"))
+        assert result == {}
+
+
+class TestParseAdjectiveTableVl:
+    """Tests for _parse_adjective_table_vl (ktiv male page)."""
+
+    @pytest.fixture()
+    def result(self) -> dict:
+        # Parse the unvowelled fixture once per test instead of repeating the call inline.
+        return _parse_adjective_table_vl(__import__("bs4").BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml"))
+
+    def test_returns_four_form_keys(self, result: dict) -> None:
+        assert set(result.keys()) == {"ms", "fs", "mp", "fp"}
+
+    def test_ms_ktiv(self, result: dict) -> None:
+        assert result["ms"] == "אביבי"
+
+    def test_fs_ktiv(self, result: dict) -> None:
+        assert result["fs"] == "אביבית"
+
+    def test_mp_ktiv(self, result: dict) -> None:
+        assert result["mp"] == "אביביים"
+
+    def test_fp_ktiv(self, result: dict) -> None:
+        assert result["fp"] == "אביביות"
+
+
+# ---------------------------------------------------------------------------
+# _scrape_adjective_detail tests
+# ---------------------------------------------------------------------------
+
+
+class TestScrapeAdjectiveDetail:
+    """Schema-compliance tests for _scrape_adjective_detail on the avivi fixture pages."""
+
+    @pytest.fixture()
+    def result(self) -> dict:
+        scraped = _scrape_adjective_detail("9098-avivi", _ADJECTIVE_MO_PAGE, _ADJECTIVE_VL_PAGE)
+        return scraped
+
+    def test_returns_non_empty_dict(self, result: dict) -> None:
+        assert result
+
+    def test_ms_has_nikkud_and_ktiv(self, result: dict) -> None:
+        form = result["ms"]
+        assert form["nikkud"] == "אֲבִיבִי"
+        assert form["ktiv_male"] == "אביבי"
+
+    def test_fs_has_nikkud_and_ktiv(self, result: dict) -> None:
+        form = result["fs"]
+        assert form["nikkud"] == "אֲבִיבִית"
+        assert form["ktiv_male"] == "אביבית"
+
+    def test_mp_has_nikkud_and_ktiv(self, result: dict) -> None:
+        form = result["mp"]
+        assert form["nikkud"] == "אֲבִיבִיִּים"
+        assert form["ktiv_male"] == "אביביים"
+
+    def test_fp_has_nikkud_and_ktiv(self, result: dict) -> None:
+        form = result["fp"]
+        assert form["nikkud"] == "אֲבִיבִיּוֹת"
+        assert form["ktiv_male"] == "אביביות"
+
+    def test_mishkal_key_present(self, result: dict) -> None:
+        # Only the key is checked: the minimal fixture has no PoS section,
+        # so the value itself may be None.
+        assert "mishkal" in result
+
+    def test_mishkal_hebrew_key_present(self, result: dict) -> None:
+        assert "mishkal_hebrew" in result
+
+    def test_all_schema_keys_present(self, result: dict) -> None:
+        required = {"ms", "fs", "mp", "fp", "mishkal", "mishkal_hebrew"}
+        assert required.issubset(result.keys())
+
+    def test_empty_on_no_table(self) -> None:
+        blank_page = "<html><body></body></html>"
+        assert _scrape_adjective_detail("missing", blank_page, blank_page) == {}
+
+
+# ---------------------------------------------------------------------------
+# Preposition table tests
+# ---------------------------------------------------------------------------
+
+
+class TestParsePrepositionTable:
+    """Tests for _parse_preposition_table against the vowelled (mo/nikkud) fixture."""
+
+    @pytest.fixture()
+    def result(self) -> dict:
+        soup = __import__("bs4").BeautifulSoup(PREPOSITION_MO_TABLE, "lxml")
+        return _parse_preposition_table(soup)
+
+    def test_returns_ten_form_keys(self, result: dict) -> None:
+        assert set(result.keys()) == {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
+
+    def test_1s_nikkud(self, result: dict) -> None:
+        nikkud = result["1s"]["nikkud"]
+        assert nikkud == "שֶׁלִּי"
+
+    def test_1p_nikkud(self, result: dict) -> None:
+        nikkud = result["1p"]["nikkud"]
+        assert nikkud == "שֶׁלָּנוּ"
+
+    def test_2ms_nikkud(self, result: dict) -> None:
+        nikkud = result["2ms"]["nikkud"]
+        assert nikkud == "שֶׁלְּךָ"
+
+    def test_2fs_nikkud(self, result: dict) -> None:
+        nikkud = result["2fs"]["nikkud"]
+        assert nikkud == "שֶׁלָּךְ"
+
+    def test_2mp_nikkud(self, result: dict) -> None:
+        nikkud = result["2mp"]["nikkud"]
+        assert nikkud == "שֶׁלָּכֶם"
+
+    def test_2fp_nikkud(self, result: dict) -> None:
+        nikkud = result["2fp"]["nikkud"]
+        assert nikkud == "שֶׁלָּכֶן"
+
+    def test_3ms_nikkud(self, result: dict) -> None:
+        nikkud = result["3ms"]["nikkud"]
+        assert nikkud == "שֶׁלּוֹ"
+
+    def test_3fs_nikkud(self, result: dict) -> None:
+        nikkud = result["3fs"]["nikkud"]
+        assert nikkud == "שֶׁלָּהּ"
+
+    def test_3mp_nikkud(self, result: dict) -> None:
+        nikkud = result["3mp"]["nikkud"]
+        assert nikkud == "שֶׁלָּהֶם"
+
+    def test_3fp_nikkud(self, result: dict) -> None:
+        nikkud = result["3fp"]["nikkud"]
+        assert nikkud == "שֶׁלָּהֶן"
+
+    def test_audio_url_present(self, result: dict) -> None:
+        audio_url = result["1s"]["audio_url"]
+        assert audio_url.startswith("https://audio.pealim.com/")
+
+    def test_empty_on_missing_table(self) -> None:
+        # A document without any table yields no forms.
+        empty_soup = __import__("bs4").BeautifulSoup("<html><body></body></html>", "lxml")
+        assert _parse_preposition_table(empty_soup) == {}
+
+
+class TestParsePrepositionTableVl:
+    """Tests for _parse_preposition_table_vl against the unvowelled (ktiv male) fixture."""
+
+    @pytest.fixture()
+    def result(self) -> dict:
+        soup = __import__("bs4").BeautifulSoup(PREPOSITION_VL_TABLE, "lxml")
+        return _parse_preposition_table_vl(soup)
+
+    def test_returns_ten_form_keys(self, result: dict) -> None:
+        assert set(result.keys()) == {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
+
+    def test_1s_ktiv(self, result: dict) -> None:
+        spelling = result["1s"]
+        assert spelling == "שלי"
+
+    def test_1p_ktiv(self, result: dict) -> None:
+        spelling = result["1p"]
+        assert spelling == "שלנו"
+
+    def test_2ms_ktiv(self, result: dict) -> None:
+        spelling = result["2ms"]
+        assert spelling == "שלך"
+
+    def test_3ms_ktiv(self, result: dict) -> None:
+        spelling = result["3ms"]
+        assert spelling == "שלו"
+
+    def test_3fp_ktiv(self, result: dict) -> None:
+        spelling = result["3fp"]
+        assert spelling == "שלהן"
+
+
+# ---------------------------------------------------------------------------
+# _scrape_preposition_detail tests
+# ---------------------------------------------------------------------------
+
+
+class TestScrapePrepositionDetail:
+    """Schema-compliance tests for _scrape_preposition_detail on the shel fixture pages."""
+
+    @pytest.fixture()
+    def result(self) -> dict:
+        scraped = _scrape_preposition_detail("2643-shel", _PREPOSITION_MO_PAGE, _PREPOSITION_VL_PAGE)
+        return scraped
+
+    def test_returns_non_empty_dict(self, result: dict) -> None:
+        assert result
+
+    def test_all_ten_person_keys_present(self, result: dict) -> None:
+        required = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
+        assert required.issubset(result.keys())
+
+    def test_1s_has_nikkud_and_ktiv(self, result: dict) -> None:
+        form = result["1s"]
+        assert form["nikkud"] == "שֶׁלִּי"
+        assert form["ktiv_male"] == "שלי"
+
+    def test_1p_has_nikkud_and_ktiv(self, result: dict) -> None:
+        form = result["1p"]
+        assert form["nikkud"] == "שֶׁלָּנוּ"
+        assert form["ktiv_male"] == "שלנו"
+
+    def test_2ms_has_nikkud_and_ktiv(self, result: dict) -> None:
+        form = result["2ms"]
+        assert form["nikkud"] == "שֶׁלְּךָ"
+        assert form["ktiv_male"] == "שלך"
+
+    def test_3ms_has_nikkud_and_ktiv(self, result: dict) -> None:
+        form = result["3ms"]
+        assert form["nikkud"] == "שֶׁלּוֹ"
+        assert form["ktiv_male"] == "שלו"
+
+    def test_3fs_has_nikkud_and_ktiv(self, result: dict) -> None:
+        form = result["3fs"]
+        assert form["nikkud"] == "שֶׁלָּהּ"
+        assert form["ktiv_male"] == "שלה"
+
+    def test_3fp_has_nikkud_and_ktiv(self, result: dict) -> None:
+        form = result["3fp"]
+        assert form["nikkud"] == "שֶׁלָּהֶן"
+        assert form["ktiv_male"] == "שלהן"
+
+    def test_empty_on_no_table(self) -> None:
+        blank_page = "<html><body></body></html>"
+        assert _scrape_preposition_detail("missing", blank_page, blank_page) == {}
--- a/tests/test_smoke.py
+++ b/tests/test_smoke.py
@ -42,3 +42,17 @@ def test_strip_nikkud_all_marks():
    nikkud = "הַמַּלְכָּה"
    plain = strip_nikkud(nikkud)
    assert all(ch < "\u0591" or ch > "\u05c7" for ch in plain), f"Residual nikkud in: {plain}"
+
+
+def test_categorize_pos_no_substring_match():
+    """Regression: 'Pronoun' must NOT match 'Noun' category."""
+    from apkg_builder import _categorize_pos
+
+    # The four named categories map to themselves.
+    for pos in ("Noun", "Verb", "Adjective", "Adverb"):
+        assert _categorize_pos(pos) == pos
+
+    # 'Pronoun' contains the substring 'noun' but belongs in the catch-all bucket.
+    assert _categorize_pos("Pronoun") == "Other", "Pronoun must not match Noun"
+
+    for pos in ("Preposition", "Conjunction", "Cardinal numeral"):
+        assert _categorize_pos(pos) == "Other"