diff --git a/SCHEMA.yaml b/SCHEMA.yaml index 80a4973..132eb38 100644 --- a/SCHEMA.yaml +++ b/SCHEMA.yaml @@ -138,11 +138,53 @@ entry: # ktiv_male: "שומר" # --- Adjective-specific --- - adjective_inflection: null # Reserved for future use + adjective_inflection: null # null for non-adjectives # When populated: - # ms/fs/mp/fp forms with nikkud/ktiv_male subfields + # ms: + # nikkud: "גָּדוֹל" + # ktiv_male: "גדול" + # fs: + # nikkud: "גְּדוֹלָה" + # ktiv_male: "גדולה" + # mp: + # nikkud: "גְּדוֹלִים" + # ktiv_male: "גדולים" + # fp: + # nikkud: "גְּדוֹלוֹת" + # ktiv_male: "גדולות" + # mishkal: "CaCaC" # English mishkal name (scraped from pealim PoS section) + # mishkal_hebrew: "קָטָל" # Hebrew mishkal name (computed via mapping) # --- Preposition-specific --- - preposition_inflection: null # Reserved for future use + preposition_inflection: null # null for non-prepositions # When populated: - # Inflected forms with pronominal suffixes (e.g. שלי, שלך, שלו...) + # 1s: + # nikkud: "שֶׁלִּי" + # ktiv_male: "שלי" + # 1p: + # nikkud: "שֶׁלָּנוּ" + # ktiv_male: "שלנו" + # 2ms: + # nikkud: "שֶׁלְּךָ" + # ktiv_male: "שלך" + # 2fs: + # nikkud: "שֶׁלָּךְ" + # ktiv_male: "שלך" + # 2mp: + # nikkud: "שֶׁלָּכֶם" + # ktiv_male: "שלכם" + # 2fp: + # nikkud: "שֶׁלָּכֶן" + # ktiv_male: "שלכן" + # 3ms: + # nikkud: "שֶׁלּוֹ" + # ktiv_male: "שלו" + # 3fs: + # nikkud: "שֶׁלָּהּ" + # ktiv_male: "שלה" + # 3mp: + # nikkud: "שֶׁלָּהֶם" + # ktiv_male: "שלהם" + # 3fp: + # nikkud: "שֶׁלָּהֶן" + # ktiv_male: "שלהן" diff --git a/apkg_builder.py b/apkg_builder.py index 74dd182..d038b9e 100644 --- a/apkg_builder.py +++ b/apkg_builder.py @@ -35,7 +35,7 @@ COMPLETE_PLURAL_DECK_ID = 1_234_567_903 # Release version tag added to all notes so users can identify which release # their cards come from (visible in Anki's Browse view and card info). -RELEASE_TAG = "v0.15.1" +RELEASE_TAG = "v0.16" # Regex for extracting emoji and Hebrew prepositions from meaning strings EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+") @@ -117,13 +117,15 @@ CARD_CSS = """ .card { font-family: 'Heebo', 'Arial Hebrew', 'David', Arial, sans-serif; font-size: 20px; - text-align: center; + text-align: right; color: #222; background: #fff; padding: 16px; + max-width: 600px; + margin: 0 auto; } .hebrew { - font-size: 36px; + font-size: 42px; font-weight: bold; direction: rtl; text-align: center; @@ -131,32 +133,34 @@ CARD_CSS = """ color: #222; } .hebrew-sm { - font-size: 24px; + font-size: 30px; font-weight: normal; direction: rtl; text-align: center; - color: #333; + color: #222; } .meaning { - font-size: 28px; + font-size: 34px; color: #1a1a8c; margin: 8px 0; + text-align: center; } .hint { - font-size: 16px; - color: #888; + font-size: 22px; + color: #555; margin: 4px 0; direction: rtl; + text-align: center; } .root-info { - font-size: 18px; - color: #555; + font-size: 26px; + color: #222; margin-top: 6px; direction: rtl; } .example { - font-size: 18px; - color: #444; + font-size: 24px; + color: #222; direction: rtl; text-align: right; font-style: italic; @@ -182,16 +186,17 @@ CARD_CSS = """ color: #555; } .sec-label { - font-size: 20px; + font-size: 28px; font-weight: normal; - color: #555; + color: #222; direction: rtl; text-align: center; margin-top: 6px; } .sec-key { - font-size: 18px; - color: #888; + font-size: 28px; + color: #222; + font-weight: bold; } .definitions { direction: rtl; @@ -199,32 +204,37 @@ CARD_CSS = """ } .conf-entry { margin: 8px 0; - font-size: 20px; + font-size: 28px; direction: rtl; } .related-group { direction: rtl; - text-align: right; + text-align: center; margin: 2px 0; - font-size: 18px; + font-size: 26px; } .emoji-img { font-size: 3.5em; text-align: center; margin: 0.3em 0; } +.card [type="button"], .card button, .replay-button { + display: block !important; + margin: 4px auto !important; + text-align: center; +} @media (prefers-color-scheme: dark) { .card { color: #e8e8e8; background: #1c1c1e; } .hebrew { color: #f0f0f0; } - .hebrew-sm { color: #ddd; } + .hebrew-sm { color: #e0e0e0; } .meaning { color: #82b0ff; } - .root-info { color: #aaa; } - .sec-label { color: #aaa; } - .sec-key { color: #666; } + .root-info { color: #e0e0e0; } + .sec-label { color: #e0e0e0; } + .sec-key { color: #e0e0e0; } .conf-entry { color: #ddd; } .hint { color: #777; } .voice-label { color: #888; } - .example { color: #bbb; border-right-color: #555; } + .example { color: #e0e0e0; border-right-color: #555; } .divider { border-top-color: #333; } .freq-badge { color: #888; border-color: #444; } } @@ -252,9 +262,6 @@ VOCAB_BACK_HEB = """
{{SharedRoots}}
{{/SharedRoots}} {{#Plural}}
רַבִּים: {{Plural}}
{{/Plural}} -{{#Example}} -
{{Example}}
-{{/Example}} {{#Frequency}}
#{{Frequency}}
{{/Frequency}} """ @@ -273,14 +280,15 @@ VOCAB_BACK_ENG = """ {{#WordNoNikkud}}
לְלֹא נִיקּוּד: {{WordNoNikkud}}
{{/WordNoNikkud}} {{#Root}}
שֹׁרֶשׁ: {{Root}}
{{/Root}} {{#PoS}}
חֵלֶק דִּיבּוּר: {{PoS}}
{{/PoS}} +{{#SharedRoots}} +
מִילִים קְשׁוּרוֹת:
+
{{SharedRoots}}
+{{/SharedRoots}} {{#Plural}}
רַבִּים: {{Plural}}
{{/Plural}} -{{#Example}} -
{{Example}}
-{{/Example}} """ VOCAB_FRONT_CLOZE = """ -
{{ClozeExample}}
+
{{ClozeExample}}
{{#ClozeHint}}
{{ClozeHint}}
{{/ClozeHint}} """ @@ -289,7 +297,6 @@ VOCAB_BACK_CLOZE = """
{{Word}}
{{#Audio}}
{{Audio}}
{{/Audio}} -
{{Meaning}}
""" VOCAB_MODEL = genanki.Model( @@ -343,8 +350,8 @@ VOCAB_MODEL = genanki.Model( CONJ_FRONT = """
אֵיךְ אוֹמְרִים
-
{{ReferenceForm}}{{#Prep}} ({{Prep}}){{/Prep}}{{#Voice}} ({{Voice}}){{/Voice}}
{{Pronoun}}
+
{{Infinitive}}{{#Prep}} ({{Prep}}){{/Prep}}{{#Voice}} ({{Voice}}){{/Voice}}
{{Tense}}
""" @@ -363,7 +370,7 @@ CONJ_CSS = CARD_CSS CONJ_MODEL = genanki.Model( CONJ_MODEL_ID, - "Pealim Conjugation", + "Hebrew Conjugation", fields=[ {"name": "Infinitive"}, {"name": "ReferenceForm"}, @@ -666,8 +673,9 @@ def _load_emoji_lookup() -> dict[str, str]: def _categorize_pos(pos_str: str) -> str: """Return the canonical PoS category key for grouping.""" + base = pos_str.split("–")[0].split("—")[0].strip() for cat in POS_CATEGORY_LABELS: - if cat.lower() in pos_str.lower(): + if base == cat: return cat return "Other" @@ -745,10 +753,14 @@ def build_vocab_deck( word_nikkud = entry["word"]["nikkud"] word_no_nik = entry["word"].get("ktiv_male", "") root_list = entry.get("root") or [] - root = " ".join(root_list) + root = ".".join(root_list) pos_raw = entry.get("pos", "") pos_heb = entry.get("pos_hebrew", "") - meaning = entry.get("meaning", "") or "" + meaning = EMOJI_RE.sub("", entry.get("meaning", "") or "").strip() + meaning = HBPAREN_RE.sub("", meaning).strip() + meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ") + meaning = re.sub(r"(\w)\(", r"\1 (", meaning) # space before opening paren + meaning = re.sub(r",(\S)", r", \1", meaning) # space after comma meaning_raw = entry.get("meaning_raw", "") or "" slug = entry.get("slug", "") or "" frequency = entry.get("frequency") or 999_999 @@ -839,6 +851,9 @@ def build_vocab_deck( end = cloze_data.get("cloze_word_end") if cloze_text and start is not None and end is not None: cloze_example = cloze_text[:start] + "_____" + cloze_text[end:] + # Clean up duplicate/misplaced quotation marks + cloze_example = re.sub(r'["״]\s*["״]', '"', cloze_example) + cloze_example = re.sub(r'^\s*["״]\s*', "", cloze_example) raw_hint = cloze_data.get("cloze_hint") or "" if raw_hint: cloze_hint = raw_hint @@ -871,11 +886,12 @@ def build_vocab_deck( parts.append(f'') related_html = "\n".join(parts) - # Plural form (for nouns) + # Plural form (nouns only — guard against adjective/verb inflection bleed) plural_str = "" - noun_inflection = entry.get("noun_inflection") - if noun_inflection and noun_inflection.get("plural"): - plural_str = noun_inflection["plural"].get("nikkud", "") + if pos_raw.startswith("Noun"): + noun_inflection = entry.get("noun_inflection") + if noun_inflection and noun_inflection.get("plural"): + plural_str = noun_inflection["plural"].get("nikkud", "") # Image image_tag = "" @@ -977,18 +993,28 @@ def build_conj_deck( binyan_heb: str = conj.get("binyan_hebrew") or BINYAN_TO_HEBREW.get(binyan, binyan) or "" slug = entry.get("slug", "") or "" root_list = entry.get("root") or [] - root = " ".join(root_list) + root = ".".join(root_list) voice = VOICE_MAP.get(binyan, "") + meaning_raw = entry.get("meaning_raw", "") or "" meaning = entry.get("meaning", "") or "" - # Extract Hebrew preposition from meaning_raw + # Extract Hebrew preposition — strip from meaning, show on Hebrew side prep_str = "" conj_prep = conj.get("prep") if conj_prep: - prep_str = f"({conj_prep})" - elif meaning: - preps = HBPAREN_RE.findall(entry.get("meaning_raw", "") or "") - prep_str = " ".join(f"({p})" for p in preps) + # Strip any parentheses from stored prep value + prep_str = conj_prep.strip("() ") + elif meaning_raw: + preps = HBPAREN_RE.findall(meaning_raw) + if preps: + prep_str = preps[0] + # Strip Hebrew prepositions from English meaning to avoid duplication + if prep_str: + meaning = HBPAREN_RE.sub("", meaning).strip() + # Also strip from meaning_raw patterns like "(על)" + meaning = re.sub(r"\(\s*" + re.escape(prep_str) + r"\s*-?\s*\)", "", meaning).strip() + # Clean up double spaces and trailing commas + meaning = re.sub(r"\s{2,}", " ", meaning).strip(", ") related = [w for w in root_words.get(root, []) if w != infinitive] related_str = " ".join(related[:8]) if related else "" @@ -1024,7 +1050,7 @@ def build_conj_deck( elif guid_candidates: note_guid = guid_candidates[0] else: - note_guid = genanki.guid_for(_infinitive, pronoun, tense) + note_guid = genanki.guid_for(_infinitive, pronoun, tense, _binyan_heb) note = genanki.Note( model=CONJ_MODEL, guid=note_guid, @@ -1213,8 +1239,10 @@ def build_conj_deck( # ────────────────────────────────────────────────────────────────────────────── CONF_FRONT = """ +
{{Words}}
מה ההבדל?
+
""" CONF_BACK = """ @@ -1271,7 +1299,10 @@ def build_confusables_deck( guid = genanki.guid_for("confusable", entry["word"].get("ktiv_male", unique_key)) guid_to_entries.setdefault(guid, []).append(entry) - for guid, group_entries in sorted(guid_to_entries.items(), key=lambda x: x[0]): + for guid, group_entries in sorted( + guid_to_entries.items(), + key=lambda x: sum(e.get("frequency") or 999_999 for e in x[1]) / len(x[1]), + ): if guid in seen_guids: continue seen_guids.add(guid) @@ -1366,6 +1397,7 @@ PLURAL_BACK_SG = """ {{FrontSide}}
{{Plural}}
{{#PluralAudio}}
{{PluralAudio}}
{{/PluralAudio}} +{{#Gender}}
מִין: {{Gender}}
{{/Gender}} {{#Mishkal}}
מִשְׁקָל: {{Mishkal}}
{{/Mishkal}} """ @@ -1380,6 +1412,7 @@ PLURAL_BACK_PL = """
{{Singular}}
{{#SingularAudio}}
{{SingularAudio}}
{{/SingularAudio}}
{{Meaning}}
+{{#Gender}}
מִין: {{Gender}}
{{/Gender}} {{#Mishkal}}
מִשְׁקָל: {{Mishkal}}
{{/Mishkal}} """ @@ -1483,10 +1516,11 @@ def build_plural_deck( plural = noun_inflection["plural"]["nikkud"] plural_ktiv = noun_inflection["plural"].get("ktiv_male", "") gender = noun_inflection.get("gender") or "" + gender_heb = {"masculine": "זָכָר", "feminine": "נְקֵבָה"}.get(gender, gender) mishkal = noun_inflection.get("mishkal") or "" - meaning = entry.get("meaning") or "" + meaning = EMOJI_RE.sub("", entry.get("meaning") or "").strip() root_list = entry.get("root") or [] - root = " ".join(root_list) + root = ".".join(root_list) # GUID from noun_inflection note_guid_raw = noun_inflection.get("plurals_guid") @@ -1520,7 +1554,7 @@ def build_plural_deck( meaning, root, mishkal, - gender, + gender_heb, ], tags=tags, ) diff --git a/benyehuda.py b/benyehuda.py deleted file mode 100644 index e3e94e8..0000000 --- a/benyehuda.py +++ /dev/null @@ -1,202 +0,0 @@ -#!/usr/bin/env python3 -""" -Ben Yehuda corpus example-sentence lookup (nikkud corpus). - -TODO: Rewrite to update words.json examples fields directly instead of -writing to a separate examples_cache.json. Currently the migration script -bridges the gap. See Phase 5 in SPRINT_LOG.md. - -Downloads the nikkud-bearing plaintext ZIP once, indexes sentences by nikkud word form, -then answers queries locally. - -Exposed API: - load(force_rebuild=False) - get_examples(word_nikkud) -> list[str] (returns 0 or 1 examples) - save_examples_cache() -""" - -import json -import logging -import re -import zipfile -from io import BytesIO -from pathlib import Path - -import requests - -from helpers import strip_nikkud as _strip_nikkud - -logger = logging.getLogger(__name__) - -# Nikkud-bearing corpus (txt.zip instead of txt_stripped.zip) -CORPUS_URL = "https://github.com/projectbenyehuda/public_domain_dump/releases/download/2025-10/txt.zip" -INDEX_PATH = Path(__file__).parent / "data" / "benyehuda_index.json" -EXAMPLES_CACHE_PATH = Path(__file__).parent / "data" / "examples_cache.json" -REQUEST_TIMEOUT = 120 -MIN_SENTENCE_LEN = 20 -MAX_SENTENCE_LEN = 200 -MAX_INDEX_ENTRIES = 500 # cap examples kept per word in index to limit memory - -# Module-level state -_index: dict[str, list[str]] = {} # word (with nikkud) -> [sentence, ...] -_examples_cache: dict[str, list[str]] = {} # word -> cached result for this run - - -def _split_sentences(text: str) -> list[str]: - """ - Split text into sentences on newlines only (Hebrew sentences don't have - mid-word period issues like English). Min 20 chars, max 200 chars. - """ - out = [] - for line in text.split("\n"): - s = line.strip().strip("\"'.,;:!?") - s = s.strip() - if MIN_SENTENCE_LEN <= len(s) <= MAX_SENTENCE_LEN: - out.append(s) - return out - - -def _build_index(corpus_zip_bytes: bytes) -> None: - """Parse corpus ZIP and build word (nikkud) → sentences index.""" - global _index - _index = {} - logger.info("Building Ben Yehuda index from nikkud corpus …") - - with zipfile.ZipFile(BytesIO(corpus_zip_bytes)) as zf: - txt_files = [n for n in zf.namelist() if n.endswith(".txt")] - logger.info(f" Corpus contains {len(txt_files)} text files") - for fname in txt_files: - try: - raw = zf.read(fname).decode("utf-8", errors="ignore") - except Exception: # noqa: S112 - continue - for sentence in _split_sentences(raw): - # Index by each unique Hebrew token (with nikkud) in the sentence - words = re.findall(r"[\u05d0-\u05ea\u05b0-\u05c7'\"]+", sentence) - for w in set(words): - if len(w) >= 2: - bucket = _index.setdefault(w, []) - if len(bucket) < MAX_INDEX_ENTRIES: - bucket.append(sentence) - - logger.info(f"Index built: {len(_index)} unique word forms") - - -def _save_index() -> None: - INDEX_PATH.parent.mkdir(parents=True, exist_ok=True) - with open(INDEX_PATH, "w", encoding="utf-8") as f: - json.dump(_index, f, ensure_ascii=False) - logger.info(f"Ben Yehuda index saved → {INDEX_PATH}") - - -def _load_index() -> None: - global _index - with open(INDEX_PATH, encoding="utf-8") as f: - _index = json.load(f) - logger.info(f"Ben Yehuda index loaded: {len(_index)} word forms") - - -def load(force_rebuild: bool = False) -> None: - """Load or build the Ben Yehuda index. Downloads corpus if needed.""" - global _index, _examples_cache - if _index and not force_rebuild: - return - - if force_rebuild: - # Delete old index and discard examples cache - if INDEX_PATH.exists(): - INDEX_PATH.unlink() - logger.info("Deleted old Ben Yehuda index (force rebuild)") - _examples_cache = {} - else: - # Load persisted examples cache (not needed on rebuild) - if EXAMPLES_CACHE_PATH.exists(): - with open(EXAMPLES_CACHE_PATH, encoding="utf-8") as f: - _examples_cache = json.load(f) - - if INDEX_PATH.exists(): - _load_index() - return - - logger.info("Downloading Ben Yehuda nikkud corpus … (this may take 2-3 minutes)") - resp = requests.get(CORPUS_URL, timeout=REQUEST_TIMEOUT, stream=True) - resp.raise_for_status() - data = resp.content - logger.info(f"Corpus downloaded: {len(data) / 1e6:.1f} MB") - - _build_index(data) - _save_index() - - -def save_examples_cache() -> None: - EXAMPLES_CACHE_PATH.parent.mkdir(parents=True, exist_ok=True) - with open(EXAMPLES_CACHE_PATH, "w", encoding="utf-8") as f: - json.dump(_examples_cache, f, ensure_ascii=False) - logger.info(f"Examples cache saved: {len(_examples_cache)} entries → {EXAMPLES_CACHE_PATH}") - - -def get_examples(word_nikkud: str, confusable_consonants: set[str] | None = None) -> list[str]: - """ - Return 0 or 1 example sentences for the given word (nikkud form). - - Lookup strategy: - 1. Try exact nikkud match in index. - 2. Fall back to stripped (no-nikkud) match against index keys. - Skipped when word's consonants are in confusable_consonants set - (to avoid returning sentences for the wrong homograph). - - Returns the single longest sentence ≤ MAX_SENTENCE_LEN that contains - the word as a whole token. - """ - if not _index: - load() - - word = word_nikkud.strip() - word_stripped = _strip_nikkud(word) - - cache_key = word - - if cache_key in _examples_cache: - return _examples_cache[cache_key] - - # Lookup: try exact nikkud first, then stripped fallback - candidates = _index.get(word, []) - if not candidates and word_stripped and word_stripped not in (confusable_consonants or set()): - # Try looking up by stripped form across index keys - for k, v in _index.items(): - if _strip_nikkud(k) == word_stripped: - candidates = v - break - - # Filter: word must appear as a whole token - # Match the stripped form (for robustness with nikkud variants in sentence) - if word_stripped: - pattern = r"(? display name -EPUB_BOOKS = { - "little_prince.epub": "הנסיך הקטן", - "time_tunnel_82.epub": "מנהרת הזמן 82", -} +def _discover_epubs() -> dict[str, str]: + """Auto-discover all .epub files in EPUB_DIR, returning {filepath: display_name}.""" + if not EPUB_DIR.exists(): + return {} + books: dict[str, str] = {} + for path in sorted(EPUB_DIR.glob("*.epub")): + stem = path.stem + stem_stripped = strip_nikkud(stem).lower() + # Derive a brief English display name from the filename + parts = stem.split(" -- ") + title_part = strip_nikkud(parts[0]).strip().lower() + if "alice" in stem_stripped or "אליס" in title_part: + name = "alice_wonderland" + elif "little_prince" in stem_stripped or "נסיך" in title_part: + name = "little_prince" + elif "מנהרת" in title_part or "time_tunnel" in stem_stripped: + num_match = re.search(r"(\d+)", stem_stripped) + num = num_match.group(1) if num_match else stem_stripped.replace("time_tunnel_", "") + name = f"time_tunnel_{num}" + else: + name = stem_stripped[:40] + books[str(path)] = name + return books -# PDF books are excluded — pypdf produces garbled RTL text (reversed chars within -# words). If/when a proper EPUB version becomes available on Calibre, add it to -# EPUB_BOOKS above instead. -PDF_BOOKS: dict[str, str] = {} # Sentence length bounds (word count) MIN_WORDS = 4 @@ -58,7 +75,7 @@ class _TextExtractor(HTMLParser): _ = attrs # required by HTMLParser interface if tag in self.SKIP_TAGS: self._skip_depth += 1 - # Insert space for block-level elements to avoid word concatenation + # Insert newline for block-level elements to avoid word concatenation if tag in ( "p", "div", @@ -102,7 +119,6 @@ def extract_text_from_html(html: str) -> str: def _content_files_from_epub(zf: zipfile.ZipFile) -> list[str]: """Get ordered list of content XHTML files from the OPF manifest.""" - # Find the OPF file opf_path = None for name in zf.namelist(): if name.endswith(".opf"): @@ -124,7 +140,7 @@ def _content_files_from_epub(zf: zipfile.ZipFile) -> list[str]: opf_dir = os.path.dirname(opf_path) # Extract manifest items: id -> href - manifest = {} + manifest: dict[str, str] = {} for m in re.finditer(r']*id="([^"]+)"[^>]*href="([^"]+)"', opf_content): manifest[m.group(1)] = m.group(2) # Also try reversed attribute order @@ -157,7 +173,12 @@ def _content_files_from_epub(zf: zipfile.ZipFile) -> list[str]: def extract_sentences_from_epub(epub_path: Path, book_name: str) -> list[dict]: """Extract sentences from an EPUB file. - Returns list of {"text": str, "book": str, "stripped": str} + Args: + epub_path: Path to the .epub file. + book_name: Human-readable book name used as the ``source`` field. + + Returns: + List of ``{"text": str, "source": str}`` dicts. """ zf = zipfile.ZipFile(epub_path) content_files = _content_files_from_epub(zf) @@ -175,41 +196,6 @@ def extract_sentences_from_epub(epub_path: Path, book_name: str) -> list[dict]: return _split_into_sentences(full_text, book_name) -# ── PDF processing ─────────────────────────────────────────────── - - -def extract_sentences_from_pdf(pdf_path: Path, book_name: str) -> list[dict]: - """Extract sentences from a PDF file (best-effort, handles RTL reversal).""" - try: - import pypdf - except ImportError: - print(f" [SKIP] pypdf not installed, cannot process {pdf_path.name}") - return [] - - reader = pypdf.PdfReader(pdf_path) - all_text_parts = [] - - for page in reader.pages: - raw = page.extract_text() - if not raw: - continue - # pypdf often reverses word order for RTL text; fix it - fixed_lines = [] - for line in raw.split("\n"): - words = line.split() - # Check if this line is predominantly Hebrew - hebrew_chars = sum(1 for c in line if "\u0590" <= c <= "\u05ff") - if hebrew_chars > len(line) * 0.3 and len(words) > 1: - # Reverse word order - fixed_lines.append(" ".join(reversed(words))) - else: - fixed_lines.append(line) - all_text_parts.append("\n".join(fixed_lines)) - - full_text = "\n".join(all_text_parts) - return _split_into_sentences(full_text, book_name) - - # ── Sentence splitting ─────────────────────────────────────────── # Hebrew sentence terminators: period, exclamation, question mark, sof pasuk @@ -217,18 +203,27 @@ _SENT_SPLIT = re.compile(r"[.!?\u05C3]+") # Punctuation to strip from word boundaries when matching _PUNCT = re.compile( - r'^[\u0022\u0027\u05F4\u05F3,;:\-–—…\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+|[\u0022\u0027\u05F4\u05F3,;:\-–—…\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+$' + r'^[\u0022\u0027\u05F4\u05F3,;:\-\u2013\u2014\u2026\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+|' + r'[\u0022\u0027\u05F4\u05F3,;:\-\u2013\u2014\u2026\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+$' ) def _split_into_sentences(text: str, book_name: str) -> list[dict]: - """Split text into sentences and filter by length.""" + """Split text into Hebrew sentences and filter by word count. + + Args: + text: Raw extracted text from an EPUB chapter. + book_name: Source label for each sentence dict. + + Returns: + List of ``{"text": str, "source": str}`` dicts, deduplicated by exact text. + """ # Normalize whitespace text = re.sub(r"\s+", " ", text).strip() raw_sentences = _SENT_SPLIT.split(text) - results = [] - seen = set() + results: list[dict] = [] + seen: set[str] = set() for sent in raw_sentences: sent = sent.strip() @@ -242,205 +237,555 @@ def _split_into_sentences(text: str, book_name: str) -> list[dict]: if len(hebrew_words) < MIN_WORDS or len(hebrew_words) > MAX_WORDS: continue - # Skip duplicates - stripped = strip_nikkud(sent) - if stripped in seen: + # Deduplicate by exact nikkud text + if sent in seen: continue - seen.add(stripped) + seen.add(sent) - results.append( - { - "text": sent, - "book": book_name, - "stripped": stripped, - } - ) + results.append({"text": sent, "source": book_name}) return results -# ── Vocab loading ──────────────────────────────────────────────── +# ── Nikkud index ───────────────────────────────────────────────── + +# Unicode ranges for Hebrew combining marks +_NIKKUD_LOW = 0x05B0 # start of vowel points (shva) +_NIKKUD_HIGH = 0x05BD # end of vowel range (meteg); 0x05BE is maqaf (punctuation) +_DAGESH = "\u05bc" +_SHIN_DOT = "\u05c1" +_SIN_DOT = "\u05c2" + +# Valid prefix consonants +_PREFIX_CONSONANTS = set("בהוכלמש") + +# Named vowel combining marks +_SHVA = "\u05b0" +_HIRIQ = "\u05b4" +_TSERE = "\u05b5" +_SEGOL = "\u05b6" +_PATACH = "\u05b7" +_QAMATZ = "\u05b8" + +# Valid nikkud patterns on each prefix consonant. +# Key = consonant, Value = set of frozensets of combining marks valid for that prefix. +_VALID_PREFIX_MARKS: dict[str, set[frozenset]] = { + "ב": { + frozenset({_SHVA, _DAGESH}), # בְּ standard + frozenset({_HIRIQ, _DAGESH}), # בִּ before shva + frozenset({_PATACH, _DAGESH}), # בַּ with definite article + frozenset({_QAMATZ, _DAGESH}), # בָּ before chataf qamatz + frozenset({_SEGOL, _DAGESH}), # בֶּ before chataf segol + }, + "כ": { + frozenset({_SHVA, _DAGESH}), # כְּ + frozenset({_HIRIQ, _DAGESH}), # כִּ + frozenset({_PATACH, _DAGESH}), # כַּ + frozenset({_QAMATZ, _DAGESH}), # כָּ + frozenset({_SEGOL, _DAGESH}), # כֶּ + }, + "ל": { + frozenset({_SHVA}), # לְ standard + frozenset({_HIRIQ}), # לִ before shva + frozenset({_PATACH}), # לַ with definite article + frozenset({_QAMATZ}), # לָ demonstratives + frozenset({_SEGOL}), # לֶ before chataf segol + }, + "ו": { + frozenset({_SHVA}), # וְ standard + frozenset({_DAGESH}), # וּ (shureq) before shva/bumf + frozenset({_PATACH}), # וַ before chataf patach + frozenset({_QAMATZ}), # וָ before chataf qamatz + frozenset({_SEGOL}), # וֶ before chataf segol + frozenset({_HIRIQ}), # וִ before yud-shva + }, + "מ": { + frozenset({_HIRIQ}), # מִ standard + frozenset({_TSERE}), # מֵ before gutturals + }, + "ש": { + frozenset({_SEGOL, _DAGESH}), # שֶׁ standard + frozenset({_SEGOL, _DAGESH, _SHIN_DOT}), # שֶׁ with explicit shin dot + }, + "ה": { + frozenset({_PATACH}), # הַ standard definite article + frozenset({_QAMATZ}), # הָ before gutturals + frozenset({_SEGOL}), # הֶ before qamatz-bearing gutturals + }, +} -def load_vocab(csv_path: Path) -> dict: - """Load vocab CSV and return {stripped_form: nikkud_word} mapping. +def _is_combining_mark(ch: str) -> bool: + """Return True if ch is a Hebrew combining mark (nikkud, dagesh, or dots).""" + cp = ord(ch) + if _NIKKUD_LOW <= cp <= _NIKKUD_HIGH: + return True + return ch in (_DAGESH, _SHIN_DOT, _SIN_DOT) - Also returns reverse mapping for lookup. - Returns (word_to_nikkud, nikkud_words_set) + +def _decompose_first_char(token: str) -> tuple[str, frozenset, str]: + """Split token into (first_consonant, its_combining_marks, remainder). + + Args: + token: A nikkud Hebrew token string. + + Returns: + A tuple of (consonant, marks, rest). Returns ("", frozenset(), token) + if the token does not start with a Hebrew consonant (alef–tav range). """ - words_by_stripped: dict[str, list[str]] = {} # stripped -> [nikkud words] + if not token: + return ("", frozenset(), token) - with open(csv_path, encoding="utf-8") as f: - reader = csv.DictReader(f, delimiter=";") - for row in reader: - nikkud_word = row.get("Word", "").strip() - word_no_nik = row.get("Word Without Nikkud", "").strip() - if not nikkud_word: - continue + first = token[0] + # Check it's a Hebrew consonant (alef–tav) + if not ("\u05d0" <= first <= "\u05ea"): + return ("", frozenset(), token) - # Method 1: strip nikkud from the Word column - stripped_from_nikkud = strip_nikkud(nikkud_word) + # Collect all combining marks that follow the consonant + marks: set[str] = set() + i = 1 + while i < len(token): + ch = token[i] + if _is_combining_mark(ch): + marks.add(ch) + i += 1 + else: + break - # Add both forms for matching - for form in {stripped_from_nikkud, word_no_nik}: - if form: - words_by_stripped.setdefault(form, []).append(nikkud_word) + return (first, frozenset(marks), token[i:]) - return words_by_stripped + +def _is_valid_prefix(consonant: str, marks: frozenset) -> bool: + """Check if consonant + marks form a valid Hebrew prefix combination. + + Args: + consonant: The prefix consonant character. + marks: Frozenset of combining mark characters on that consonant. + + Returns: + True if this is a recognised Hebrew prefix vocalization. + """ + valid = _VALID_PREFIX_MARKS.get(consonant) + if not valid: + return False + # For ש, allow shin dot to be present or absent + if consonant == "ש": + marks_without_shin = marks - {_SHIN_DOT} + return marks_without_shin in valid or marks in valid + return marks in valid + + +def _rebuild_token(consonant: str, marks: frozenset, rest: str) -> str: + """Reassemble a token from its decomposed parts, sorting marks by codepoint.""" + return consonant + "".join(sorted(marks)) + rest + + +def _try_strip_prefix(token: str, nikkud_index: dict) -> list[tuple[str, str, str]]: + """Try stripping 1 or 2 prefix letters from a nikkud token. + + Args: + token: A cleaned nikkud word token. + nikkud_index: Mapping from nikkud form to list of (unique_key, match_type). + + Returns: + List of (unique_key, match_type, matched_remainder) for each hit found. + The match_type will have ``"_prefix"`` appended to the base type. + """ + results: list[tuple[str, str, str]] = [] + + # Try 1-letter prefix + c1, m1, rest1 = _decompose_first_char(token) + if not (c1 and _is_valid_prefix(c1, m1) and rest1): + return results + + # Direct match on 1-prefix remainder + if rest1 in nikkud_index: + for unique_key, match_type in nikkud_index[rest1]: + results.append((unique_key, match_type + "_prefix", rest1)) + + # Try removing dagesh from first letter of remainder + # (handles absorbed definite article: לַמֶּלֶךְ → מֶּלֶךְ → מֶלֶךְ) + c2, m2, rest2_inner = _decompose_first_char(rest1) + if c2 and _DAGESH in m2: + without_dagesh = _rebuild_token(c2, m2 - {_DAGESH}, rest2_inner) + if without_dagesh != rest1 and without_dagesh in nikkud_index: + for unique_key, match_type in nikkud_index[without_dagesh]: + results.append((unique_key, match_type + "_prefix", without_dagesh)) + + # Try 2-letter prefix (ו and ש commonly stack with another prefix) + if c1 in "וש": + c2b, m2b, rest2b = _decompose_first_char(rest1) + if c2b and c2b in _PREFIX_CONSONANTS and _is_valid_prefix(c2b, m2b) and rest2b: + if rest2b in nikkud_index: + for unique_key, match_type in nikkud_index[rest2b]: + results.append((unique_key, match_type + "_prefix", rest2b)) + + # Also try dagesh removal on remainder of 2-letter prefix + c3, m3, rest3_inner = _decompose_first_char(rest2b) + if c3 and _DAGESH in m3: + without_dagesh2 = _rebuild_token(c3, m3 - {_DAGESH}, rest3_inner) + if without_dagesh2 != rest2b and without_dagesh2 in nikkud_index: + for unique_key, match_type in nikkud_index[without_dagesh2]: + results.append((unique_key, match_type + "_prefix", without_dagesh2)) + + return results + + +def _build_nikkud_index(words: dict) -> dict[str, list[tuple[str, str]]]: + """Build a mapping from nikkud form to list of (unique_key, match_type). + + Indexes the following sources per entry: + + - ``word.nikkud`` → "direct" + - conjugation active/passive forms → "conjugated" + - conjugation infinitive and reference_form → "conjugated" + - noun inflection singular/plural/construct/pronominal → "inflected" + + Args: + words: The full words.json dict keyed by unique_key. + + Returns: + Dict mapping each nikkud form to a list of (unique_key, match_type) tuples. + """ + index: dict[str, list[tuple[str, str]]] = {} + + def _add(form: str | None, unique_key: str, match_type: str) -> None: + if form: + index.setdefault(form, []).append((unique_key, match_type)) + + for unique_key, entry in words.items(): + # Direct word form + word = entry.get("word") or {} + _add(word.get("nikkud"), unique_key, "direct") + + # Conjugation forms + conj = entry.get("conjugation") or {} + + for form_entry in conj.get("active_forms") or []: + form = (form_entry.get("form") or {}).get("nikkud") + _add(form, unique_key, "conjugated") + + for form_entry in conj.get("hufal_pual_forms") or []: + form = (form_entry.get("form") or {}).get("nikkud") + _add(form, unique_key, "conjugated") + + inf = conj.get("infinitive") or {} + _add(inf.get("nikkud"), unique_key, "conjugated") + + ref = conj.get("reference_form") or {} + _add(ref.get("nikkud"), unique_key, "conjugated") + + # Noun inflection forms + noun = entry.get("noun_inflection") or {} + + for field in ("singular", "plural", "construct_singular", "construct_plural"): + sub = noun.get(field) or {} + _add(sub.get("nikkud"), unique_key, "inflected") + + pronominal = noun.get("pronominal_suffixes") or {} + for _person, sub in pronominal.items(): + if isinstance(sub, dict): + _add(sub.get("nikkud"), unique_key, "inflected") + + return index + + +def _filter_collision_forms(nikkud_index: dict) -> dict: + """Remove colliding forms for entries that have other unique forms. + + A "colliding form" maps to 2+ unique_keys. For each unique_key that + appears in a collision, check whether it also has at least one + non-colliding form in the index. If so, remove it from the colliding + form's entry list. If a unique_key's *only* indexed forms all collide, + keep them (otherwise the entry would get zero matches). + + Returns a new index dict with the same structure. + """ + # Identify collision forms and build reverse map (key → its forms) + collision_forms: set[str] = set() + key_to_forms: dict[str, set[str]] = {} + + for form, entries in nikkud_index.items(): + keys = {uk for uk, _ in entries} + if len(keys) >= 2: + collision_forms.add(form) + for uk, _ in entries: + key_to_forms.setdefault(uk, set()).add(form) + + # For each key, check if it has any non-colliding form + keys_with_unique_forms: set[str] = set() + for uk, forms in key_to_forms.items(): + if forms - collision_forms: + keys_with_unique_forms.add(uk) + + # Build filtered index + filtered: dict[str, list[tuple[str, str]]] = {} + removed = 0 + for form, entries in nikkud_index.items(): + if form in collision_forms: + kept = [(uk, mt) for uk, mt in entries if uk not in keys_with_unique_forms] + removed += len(entries) - len(kept) + if kept: + filtered[form] = kept + else: + filtered[form] = entries + + logger.info(f" Filtered {removed} collision mappings from entries with unique forms") + return filtered # ── Matching ───────────────────────────────────────────────────── -def match_sentences(sentences: list[dict], words_by_stripped: dict) -> dict: - """Match sentences against vocab words. +def match_sentences( + sentences: list[dict], + nikkud_index: dict, + confusable_keys: set[str], +) -> dict: + """Match sentences to vocab words using the nikkud index. - Returns {nikkud_word: [sentences]} with best (shortest) first. + Args: + sentences: List of ``{"text": str, "source": str}`` dicts. + nikkud_index: Output of ``_build_nikkud_index``. + confusable_keys: Set of unique_keys that are in confusable groups. + + Returns: + Dict mapping unique_key → list of match dicts, each containing: + ``text``, ``source``, ``match_method``, ``word_count``, + ``matched_form``, ``char_offset``, ``char_end``. """ - # Build a set of all stripped forms for fast lookup - all_forms = set(words_by_stripped.keys()) - - # Hebrew single-letter prefixes: ב, ה, ו, כ, ל, מ, ש, ד (של) - _HEB_PREFIXES = set("בהוכלמשד") - - # For each sentence, extract stripped words - matches: dict[str, list[tuple[int, str]]] = {} # nikkud_word -> [(word_count, sentence)] + matches: dict[str, list[dict]] = {} for sent_info in sentences: - sent_text = sent_info["text"] - sent_stripped = sent_info["stripped"] - word_count = len(sent_text.split()) + text = sent_info["text"] + source = sent_info["source"] + words_in_sent = text.split() + word_count = len(words_in_sent) - # Get stripped words from the sentence - raw_words = sent_stripped.split() - # Map: candidate_form -> set of original cleaned words that produced it - # This lets us verify that prefix stripping is plausible - candidates: dict[str, str] = {} # form -> original_word - for w in raw_words: - cleaned = _PUNCT.sub("", w) + char_pos = 0 + for raw_word in words_in_sent: + cleaned = _PUNCT.sub("", raw_word) if not cleaned: + word_start = text.find(raw_word, char_pos) + char_pos = word_start + len(raw_word) if word_start >= 0 else char_pos continue - # Direct match (always try) - candidates[cleaned] = cleaned - # Prefix stripping: only if remaining stem is >= 2 chars - # and the prefix char is a known Hebrew prefix letter - for prefix_len in (1, 2): - if len(cleaned) > prefix_len + 1: - prefix = cleaned[:prefix_len] - stem = cleaned[prefix_len:] - if all(c in _HEB_PREFIXES for c in prefix) and len(stem) >= 2: - candidates[stem] = cleaned - # Check which vocab words appear in this sentence - matched_forms = set(candidates.keys()) & all_forms - for form in matched_forms: - # Skip spurious matches: very short vocab forms (1-2 chars) - # should only match via direct word match, not prefix stripping - if len(form) <= 2 and form not in {_PUNCT.sub("", w) for w in raw_words}: - continue - for nikkud_word in words_by_stripped[form]: - matches.setdefault(nikkud_word, []).append((word_count, sent_text)) + # Locate positions within the sentence + word_start_in_sent = text.find(raw_word, char_pos) + if word_start_in_sent < 0: + word_start_in_sent = char_pos + clean_offset_in_raw = raw_word.find(cleaned) + if clean_offset_in_raw < 0: + clean_offset_in_raw = 0 + clean_start = word_start_in_sent + clean_offset_in_raw + clean_end = clean_start + len(cleaned) - # Sort by word count (prefer shorter sentences) and deduplicate - result = {} - for nikkud_word, sent_list in matches.items(): - sent_list.sort(key=lambda x: x[0]) - seen = set() - unique = [] - for _, sent in sent_list: - if sent not in seen: - seen.add(sent) - unique.append(sent) - if len(unique) >= 5: # Keep top 5 per word - break - result[nikkud_word] = unique + found: list[tuple[str, str]] = [] - return result + # Direct nikkud match + if cleaned in nikkud_index: + for unique_key, match_type in nikkud_index[cleaned]: + found.append((unique_key, match_type)) + + # Prefix stripping — only if no direct match exists + if cleaned not in nikkud_index: + for unique_key, match_type, _remainder in _try_strip_prefix(cleaned, nikkud_index): + found.append((unique_key, match_type)) + + for unique_key, match_method in found: + matches.setdefault(unique_key, []).append( + { + "text": text, + "source": source, + "match_method": match_method, + "word_count": word_count, + "matched_form": cleaned, + "char_offset": clean_start, + "char_end": clean_end, + } + ) + + char_pos = word_start_in_sent + len(raw_word) + + return matches -# ── Main ───────────────────────────────────────────────────────── +# ── Writing results ────────────────────────────────────────────── -def main(): - print("=" * 60) - print("EPUB Example Sentence Extraction Pipeline") - print("=" * 60) +def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) -> int: + """Update words dict entries with matched example sentences. - # Step 1: Extract sentences from all books - all_sentences = [] - book_counts = {} + Selects up to 3 best sentences per word (scoring prefers 6–12 word + sentences and non-prefix matches). Also generates a cloze entry for + the top match, unless the word is in the confusable set. - for filename, book_name in EPUB_BOOKS.items(): - path = EPUB_DIR / filename - if not path.exists(): - print(f"\n[SKIP] {filename} not found") + Args: + words: The full words.json dict, modified in place. + matches: Output of ``match_sentences``. + confusable_keys: Set of unique_keys in confusable groups. + + Returns: + Count of words.json entries that were updated. + """ + import genanki # noqa: PLC0415 — import only where needed + + updated = 0 + + for unique_key, sent_list in matches.items(): + if unique_key not in words: continue - print(f"\n[EPUB] Extracting: {book_name} ({filename})") + + entry = words[unique_key] + + # Deduplicate by sentence text + seen_texts: set[str] = set() + unique: list[dict] = [] + for s in sent_list: + if s["text"] not in seen_texts: + seen_texts.add(s["text"]) + unique.append(s) + + # Prefer direct matches; only fall back to prefix if none exist + direct = [s for s in unique if "prefix" not in s["match_method"]] + prefix_only = [s for s in unique if "prefix" in s["match_method"]] + pool = direct if direct else prefix_only + + # Score: prefer 6–12 word sentences + def _score(s: dict) -> tuple[int,]: + wc = s["word_count"] + length_score = abs(wc - 9) if not (6 <= wc <= 12) else 0 + return (length_score,) + + pool.sort(key=_score) + best = pool[:3] + + # Build vetted list + if not entry.get("examples"): + entry["examples"] = {} + examples: dict = entry["examples"] + examples["vetted"] = [ + { + "text": s["text"], + "source": s["source"], + "match_method": s["match_method"], + } + for s in best + ] + + # Build cloze from best sentence (skip confusables) + is_confusable = unique_key in confusable_keys + if not is_confusable and best: + top = best[0] + # Preserve existing cloze_guid if sentence text unchanged + old_cloze = examples.get("cloze") or {} + if old_cloze.get("text") == top["text"]: + cloze_guid = old_cloze.get("cloze_guid") + else: + cloze_guid = genanki.guid_for("cloze", unique_key) + + examples["cloze"] = { + "text": top["text"], + "cloze_word_start": top["char_offset"], + "cloze_word_end": top["char_end"], + "cloze_hint": None, + "cloze_guid": cloze_guid, + } + elif is_confusable: + examples.pop("cloze", None) + + examples["rejected_count"] = 0 + updated += 1 + + return updated + + +# ── Public API ─────────────────────────────────────────────────── + + +def run(words: dict) -> dict: + """Extract EPUB sentences, match against words, update words dict in place. + + Called from run.py with the already-loaded words.json dict. + + Args: + words: The full words.json dict keyed by unique_key. Modified in place. + + Returns: + Summary stats dict with keys ``books``, ``matched``, ``total_vocab``. + """ + logger.info(" Extracting sentences from EPUBs ...") + all_sentences: list[dict] = [] + book_counts: dict[str, int] = {} + + for filepath, book_name in _discover_epubs().items(): + path = Path(filepath) sentences = extract_sentences_from_epub(path, book_name) book_counts[book_name] = len(sentences) all_sentences.extend(sentences) - print(f" -> {len(sentences)} sentences") + logger.info(f" {book_name}: {len(sentences)} sentences") - for filename, book_name in PDF_BOOKS.items(): - path = EPUB_DIR / filename - if not path.exists(): - print(f"\n[SKIP] {filename} not found") - continue - print(f"\n[PDF] Extracting: {book_name} ({filename})") - sentences = extract_sentences_from_pdf(path, book_name) - book_counts[book_name] = len(sentences) - all_sentences.extend(sentences) - print(f" -> {len(sentences)} sentences") + if not all_sentences: + logger.warning(" No EPUB files found — skipping example extraction") + return {"books": {}, "matched": 0, "total_vocab": len(words)} - print(f"\nTotal sentences: {len(all_sentences)}") + logger.info(f" Total sentences: {len(all_sentences)}") - # Step 2: Save sentence index - index_path = DATA_DIR / "epub_sentence_index.json" - with open(index_path, "w", encoding="utf-8") as f: - json.dump({"sentences": all_sentences}, f, ensure_ascii=False, indent=2) - print(f"\nSaved sentence index: {index_path}") + # Build nikkud index + logger.info(" Building nikkud index from words.json ...") + nikkud_index = _build_nikkud_index(words) + logger.info(f" {len(nikkud_index)} unique nikkud forms indexed") - # Step 3: Load vocab and match - print(f"\nLoading vocab from {DICT_CSV} ...") - words_by_stripped = load_vocab(DICT_CSV) - total_vocab = len({w for wlist in words_by_stripped.values() for w in wlist}) - print(f" {total_vocab} unique vocab words ({len(words_by_stripped)} lookup forms)") + # Filter out collision forms for entries that have unique forms + nikkud_index = _filter_collision_forms(nikkud_index) - print("\nMatching sentences against vocab ...") - examples_cache = match_sentences(all_sentences, words_by_stripped) + # Build confusable key set + confusable_keys: set[str] = set() + for key, entry in words.items(): + if entry.get("confusable_group"): + confusable_keys.add(key) - # Step 4: Save examples_cache - cache_path = DATA_DIR / "examples_cache.json" - with open(cache_path, "w", encoding="utf-8") as f: - json.dump(examples_cache, f, ensure_ascii=False, indent=2) - print(f"Saved examples cache: {cache_path}") + # Match sentences + logger.info(" Matching sentences against vocab ...") + matches = match_sentences(all_sentences, nikkud_index, confusable_keys) + logger.info(f" {len(matches)} words matched") - # Step 5: Summary stats - print("\n" + "=" * 60) - print("SUMMARY") - print("=" * 60) - print("\nSentences per book:") - for book_name, count in book_counts.items(): - print(f" {book_name}: {count}") - print(f" Total: {len(all_sentences)}") + # Break down by match method + method_counts: dict[str, int] = {} + for sent_list in matches.values(): + for s in sent_list: + method = s["match_method"] + method_counts[method] = method_counts.get(method, 0) + 1 + for method, count in sorted(method_counts.items()): + logger.info(f" {method}: {count} sentence-word pairs") - print("\nVocab matching:") - print(f" Total vocab words: {total_vocab}") - print(f" Words with examples: {len(examples_cache)}") - coverage = 100 * len(examples_cache) / total_vocab if total_vocab else 0 - print(f" Coverage: {coverage:.1f}%") + # Update words dict in place + updated = update_words_json(words, matches, confusable_keys) + logger.info(f" Updated {updated} entries in words.json") - # Show some sample matches - print("\nSample matches:") - count = 0 - for word, sents in examples_cache.items(): - if count >= 5: - break - print(f" {word} -> {sents[0][:60]}...") - count += 1 + return { + "books": book_counts, + "matched": len(matches), + "total_vocab": len(words), + } - return examples_cache +# ── Standalone entry point ─────────────────────────────────────── if __name__ == "__main__": - main() + import json + + logging.basicConfig(level=logging.INFO, format="%(message)s") + + words_path = DATA_DIR / "words.json" + with open(words_path, encoding="utf-8") as f: + words = json.load(f) + + stats = run(words) + + # Save updated words.json + with open(words_path, "w", encoding="utf-8") as f: + json.dump(words, f, ensure_ascii=False, indent=2) + + coverage = stats["matched"] * 100 / stats["total_vocab"] if stats["total_vocab"] else 0 + logger.info(f" Coverage: {stats['matched']}/{stats['total_vocab']} ({coverage:.1f}%)") diff --git a/pealim_detail_scrape.py b/pealim_detail_scrape.py index 22bef58..36730ba 100644 --- a/pealim_detail_scrape.py +++ b/pealim_detail_scrape.py @@ -2,7 +2,8 @@ """ Consolidated detail page scraper for pealim.com. -Visits /dict// detail pages for nouns and verbs in data/words.json. +Visits /dict// detail pages for nouns, verbs, adjectives and prepositions +in data/words.json. Makes two requests per slug: 1. hebstyle=mo cookie → nikkud forms 2. hebstyle=vl cookie → ktiv male forms @@ -11,7 +12,8 @@ Updates entries in data/words.json with scraped detail data. Usage: python3 pealim_detail_scrape.py [--test N] [--force-refresh-detail] - [--nouns-only | --verbs-only] + [--nouns-only | --verbs-only | + --adjectives-only | --prepositions-only] """ import argparse @@ -144,28 +146,128 @@ FORM_KEY_TO_PERSON: dict[str, str] = { "infinitive": "inf", } -# Mishkal English name → Hebrew nikkud mapping (common patterns) -MISHKAL_HEBREW: dict[str, str] = { - "CaCaC": "קָטָל", - "CeCeC": "קֶטֶל", - "CiCeC": "קִטֶל", - "CaCeC": "קָטֶל", - "CoCeC": "קוֹטֵל", - "CaCiC": "קָטִיד", - "CaCuC": "קָטוּר", - "miCCaC": "מִקְטָל", - "miCCeC": "מִקְטֶל", - "maCCeC": "מַקְטֶל", - "maCCiC": "מַקְטִיר", - "hiCCiC": "הִקְטִיל", - "CiCCuC": "קִטּוּל", - "hitCaCCeC": "הִתְקַטֵּל", - "CaCCan": "קַטְּלָן", - "CaCCaC": "קַטָּל", - "CiCCon": "קִטְּרוֹן", - "CaCCeC": "קַטֶּלֶת", +# Mishkal English name → Hebrew nikkud mapping +# Pealim dropdown uses q-notation (qatal), detail pages return k-notation (katal). +# We store q-notation keys; _mishkal_to_hebrew() handles the k→q conversion. +# Source: https://www.pealim.com/he/dict/?pos={adjective,noun}&am=qatil dropdowns +_MISHKAL_HEBREW_Q: dict[str, str] = { + # --- a --- + "aqtal": "אַקְטָל", + "aqtala": "אַקְטָלָה", + # --- e --- + "eqtal": "אֶקְטָל", + # --- h --- + "haqtala": "הַקְטָלָה", + "heqtel": "הֶקְטֵל", + "hiqqatlut": "הִקָּטְלוּת", + "hitqattlut": "הִתְקַטְּלוּת", + # --- m --- + "maqtal": "מַקְטָל", + "maqtel": "מַקְטֵל", + "maqtela": "מַקְטֵלָה", + "maqtelet": "מַקְטֶלֶת", + "maqtil": "מַקְטִיל", + "maqtol": "מַקְטוֹל", + "maqtolet": "מַקְטֹלֶת", + "maqtul": "מַקְטוּל", + "meqattel": "מְקַטֵּל", + "meqila": "מְקִילָה", + "mequla": "מְקוּלָה", + "mequttal": "מְקֻטָּל", + "miqtal": "מִקְטָל", + "miqtala": "מִקְטָלָה", + "miqtelet": "מִקְטֶלֶת", + "miqtol": "מִקְטוֹל", + "miqtolet": "מִקְטֹלֶת", + "mitqattel": "מִתְקַטֵּל", + "muqtal": "מֻקְטָל", + # --- n --- + "niqtal": "נִקְטָל", + # --- q --- + "qal": "קַל", + "qatal": "קָטָל", + "qatel": "קָטֵל", + "qatil": "קָטִיל", + "qatla": "קַטְלָה", + "qatlan": "קַטְלָן", + "qatlut": "קַטְלוּת", + "qatol": "קָטוֹל", + "qaton": "קָטוֹן", + "qattal": "קַטָּל", + "qattala": "קַטָּלָה", + "qattelet": "קַטֶּלֶת", + "qattil": "קַטִּיל", + "qattila": "קַטִּילָה", + "qattolet": "קַטֹּלֶת", + "qattul": "קַטּוּל", + "qatul": "קָטוּל", + "qatut": "קָטוּת", + "qetel": "קֶטֶל", + "qeteh": "קֵטֶה", + "qitla": "קִטְלָה", + "qitlon": "קִטְלוֹן", + "qittalon": "קִטָּלוֹן", + "qittel": "קִטֵּל", + "qittelet": "קִטֶּלֶת", + "qittol": "קִטּוֹל", + "qittolet": "קִטֹּלֶת", + "qittul": "קִטּוּל", + "qol": "קֹל", + "qotal": "קוֹטָל", + "qotel": "קוֹטֵל", + "qotelet": "קוֹטֶלֶת", + "qotla": "קָטְלָה", + "qtal": "קְטָל", + "qtala": "קְטָלָה", + "qtaltal": "קְטַלְטַל", + "qtaltan": "קְטַלְתָּן", + "qtaltolet": "קְטַלְטֹלֶת", + "qtel": "קְטֵל", + "qtela": "קְטֵלָה", + "qtelet": "קְטֶלֶת", + "qtil": "קְטִיל", + "qtila": "קְטִילָה", + "qtili": "קְטִילִי", + "qtol": "קְטוֹל", + "qtola": "קְטוֹלָה", + "qtolet": "קְטֹלֶת", + "qtul": "קְטוּל", + "qtula": "קְטוּלָה", + "qtulla": "קְטֻלָּה", + "qtut": "קְטוּת", + "qutla": "קֻטְלָה", + "quttolet": "קֻטּוֹלֶת", + # --- t --- + "taqtela": "תַּקְטֵלָה", + "taqtil": "תַּקְטִיל", + "taqtit": "תַּקְטִית", + "taqtul": "תַּקְטוּל", + "taqtula": "תַּקְטוּלָה", + "taqtut": "תַּקְטוּת", + "tiqtal": "תִּקְטָל", + "tiqtala": "תִּקְטָלָה", + "tiqtelet": "תִּקְטֶלֶת", + "tiqtolet": "תִּקְטֹלֶת", + "tqilla": "תְּקִלָּה", + "tqula": "תְּקוּלָה", + # --- y --- + "yaqtul": "יַקְטוּל", } + +def _mishkal_to_hebrew(mishkal: str) -> str | None: + """Look up Hebrew mishkal, handling k-notation → q-notation conversion.""" + if not mishkal: + return None + # Try as-is first (q-notation) + result = _MISHKAL_HEBREW_Q.get(mishkal) + if result: + return result + # Convert k-notation to q-notation and retry + q_form = mishkal.replace("k", "q") + return _MISHKAL_HEBREW_Q.get(q_form) + + # --------------------------------------------------------------------------- # HTTP session # --------------------------------------------------------------------------- @@ -452,7 +554,7 @@ def _scrape_noun_detail(_slug: str, mo_html: str, vl_html: str) -> dict: if mishkal: result["mishkal"] = mishkal - result["mishkal_hebrew"] = MISHKAL_HEBREW.get(mishkal) + result["mishkal_hebrew"] = _mishkal_to_hebrew(mishkal) return result @@ -887,6 +989,228 @@ def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: di return result +# --------------------------------------------------------------------------- +# Adjective detail parsing +# --------------------------------------------------------------------------- + +_ADJECTIVE_CELL_IDS: tuple[str, ...] = ("ms-a", "fs-a", "mp-a", "fp-a") +_ADJECTIVE_FORM_KEYS: tuple[str, ...] = ("ms", "fs", "mp", "fp") + + +def _parse_adjective_table(soup: BeautifulSoup) -> dict[str, dict]: + """ + Parse the adjective inflection table from a pealim detail page (mo/nikkud). + + Locates cells by ID (ms-a, fs-a, mp-a, fp-a) and extracts nikkud text + and audio URL from each. + + Returns: + Dict mapping form key ("ms", "fs", "mp", "fp") to + {"nikkud": str, "audio_url": str}, or empty dict if table not found. + """ + table = soup.find("table", class_="conjugation-table") + if not table: + return {} + + result: dict[str, dict] = {} + for cell_id, form_key in zip(_ADJECTIVE_CELL_IDS, _ADJECTIVE_FORM_KEYS, strict=True): + div = table.find(id=cell_id) + if not div: + continue + nikkud, audio_url = _get_menukad_and_audio(div) + if nikkud: + result[form_key] = {"nikkud": nikkud, "audio_url": audio_url} + + return result + + +def _parse_adjective_table_vl(soup: BeautifulSoup) -> dict[str, str]: + """ + Parse the adjective inflection table from a vl (ktiv male) page. + + Returns: + Dict mapping form key ("ms", "fs", "mp", "fp") to ktiv male string. + """ + table = soup.find("table", class_="conjugation-table") + if not table: + return {} + + result: dict[str, str] = {} + for cell_id, form_key in zip(_ADJECTIVE_CELL_IDS, _ADJECTIVE_FORM_KEYS, strict=True): + div = table.find(id=cell_id) + if not div: + continue + ktiv = _get_plain_text(div) + if ktiv: + result[form_key] = ktiv + + return result + + +def _parse_adjective_mishkal(soup: BeautifulSoup) -> tuple[str, str]: + """ + Extract mishkal from the PoS section of an adjective detail page. + + Reuses the same extraction logic as _parse_noun_gender_mishkal. + + Returns: + Tuple of (mishkal_english, mishkal_hebrew) where either may be empty. + """ + _, mishkal = _parse_noun_gender_mishkal(soup) + mishkal_hebrew = _mishkal_to_hebrew(mishkal) or "" + return mishkal, mishkal_hebrew + + +def _scrape_adjective_detail(_slug: str, mo_html: str, vl_html: str) -> dict: + """ + Parse adjective detail pages (mo=nikkud, vl=ktiv male). + + Returns: + Dict matching the adjective_inflection schema: + {ms, fs, mp, fp: {nikkud, ktiv_male}, mishkal, mishkal_hebrew}. + Empty dict if no forms found. + """ + mo_soup = BeautifulSoup(mo_html, "lxml") + vl_soup = BeautifulSoup(vl_html, "lxml") + + mo_data = _parse_adjective_table(mo_soup) + vl_data = _parse_adjective_table_vl(vl_soup) + mishkal, mishkal_hebrew = _parse_adjective_mishkal(mo_soup) + + if not mo_data: + return {} + + result: dict = {} + for form_key in _ADJECTIVE_FORM_KEYS: + mo_form = mo_data.get(form_key) + if mo_form: + nikkud = mo_form["nikkud"] + ktiv = vl_data.get(form_key, "") + if not ktiv: + logger.warning("No ktiv_male for adjective form %s: %s", form_key, nikkud) + result[form_key] = {"nikkud": nikkud, "ktiv_male": ktiv} + else: + result[form_key] = None + + result["mishkal"] = mishkal or None + result["mishkal_hebrew"] = mishkal_hebrew or None + + return result + + +# --------------------------------------------------------------------------- +# Preposition detail parsing +# --------------------------------------------------------------------------- + +_PREPOSITION_CELL_IDS: tuple[str, ...] = ( + "P-1s", + "P-1p", + "P-2ms", + "P-2fs", + "P-2mp", + "P-2fp", + "P-3ms", + "P-3fs", + "P-3mp", + "P-3fp", +) +_PREPOSITION_FORM_KEYS: tuple[str, ...] = ( + "1s", + "1p", + "2ms", + "2fs", + "2mp", + "2fp", + "3ms", + "3fs", + "3mp", + "3fp", +) + + +def _parse_preposition_table(soup: BeautifulSoup) -> dict[str, dict]: + """ + Parse the preposition pronominal suffix table from a pealim detail page (mo/nikkud). + + Locates cells by ID (P-1s, P-1p, P-2ms, …, P-3fp) and extracts nikkud + text and audio URL from each. + + Returns: + Dict mapping person key ("1s", "1p", …, "3fp") to + {"nikkud": str, "audio_url": str}, or empty dict if table not found. + """ + table = soup.find("table", class_="conjugation-table") + if not table: + return {} + + result: dict[str, dict] = {} + for cell_id, form_key in zip(_PREPOSITION_CELL_IDS, _PREPOSITION_FORM_KEYS, strict=True): + div = table.find(id=cell_id) + if not div: + continue + nikkud, audio_url = _get_menukad_and_audio(div) + if nikkud: + result[form_key] = {"nikkud": nikkud, "audio_url": audio_url} + + return result + + +def _parse_preposition_table_vl(soup: BeautifulSoup) -> dict[str, str]: + """ + Parse the preposition pronominal suffix table from a vl (ktiv male) page. + + Returns: + Dict mapping person key ("1s", "1p", …, "3fp") to ktiv male string. + """ + table = soup.find("table", class_="conjugation-table") + if not table: + return {} + + result: dict[str, str] = {} + for cell_id, form_key in zip(_PREPOSITION_CELL_IDS, _PREPOSITION_FORM_KEYS, strict=True): + div = table.find(id=cell_id) + if not div: + continue + ktiv = _get_plain_text(div) + if ktiv: + result[form_key] = ktiv + + return result + + +def _scrape_preposition_detail(_slug: str, mo_html: str, vl_html: str) -> dict: + """ + Parse preposition detail pages (mo=nikkud, vl=ktiv male). + + Returns: + Dict matching the preposition_inflection schema: + {1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp: {nikkud, ktiv_male}}. + Empty dict if no forms found. + """ + mo_soup = BeautifulSoup(mo_html, "lxml") + vl_soup = BeautifulSoup(vl_html, "lxml") + + mo_data = _parse_preposition_table(mo_soup) + vl_data = _parse_preposition_table_vl(vl_soup) + + if not mo_data: + return {} + + result: dict = {} + for form_key in _PREPOSITION_FORM_KEYS: + mo_form = mo_data.get(form_key) + if mo_form: + nikkud = mo_form["nikkud"] + ktiv = vl_data.get(form_key, "") + if not ktiv: + logger.warning("No ktiv_male for preposition form %s: %s", form_key, nikkud) + result[form_key] = {"nikkud": nikkud, "ktiv_male": ktiv} + else: + result[form_key] = None + + return result + + # --------------------------------------------------------------------------- # Merging strategy # --------------------------------------------------------------------------- @@ -926,6 +1250,22 @@ def _merge_conjugation(_existing_conj: dict | None, scraped: dict) -> dict: return scraped +def _merge_adjective_inflection(_existing_ai: dict | None, scraped: dict) -> dict: + """ + Merge scraped adjective data into existing adjective_inflection. + No GUIDs to preserve — simple overwrite with scraped data. + """ + return dict(scraped) + + +def _merge_preposition_inflection(_existing_pi: dict | None, scraped: dict) -> dict: + """ + Merge scraped preposition data into existing preposition_inflection. + No GUIDs to preserve — simple overwrite with scraped data. + """ + return dict(scraped) + + # --------------------------------------------------------------------------- # I/O helpers # --------------------------------------------------------------------------- @@ -953,14 +1293,26 @@ def _save_words(data: dict) -> None: # --------------------------------------------------------------------------- -def _should_process(entry: dict, pos: str, force: bool, nouns_only: bool, verbs_only: bool) -> bool: +def _should_process( + entry: dict, + pos: str, + force: bool, + nouns_only: bool, + verbs_only: bool, + adjectives_only: bool, + prepositions_only: bool, +) -> bool: """Return True if this entry should be scraped.""" - if not pos.startswith(("Noun", "Verb")): + if not pos.startswith(("Noun", "Verb", "Adjective", "Preposition")): return False if nouns_only and not pos.startswith("Noun"): return False if verbs_only and not pos.startswith("Verb"): return False + if adjectives_only and not pos.startswith("Adjective"): + return False + if prepositions_only and not pos.startswith("Preposition"): + return False return force or not entry.get("detail_scraped") @@ -969,6 +1321,8 @@ def run( force_refresh: bool = False, nouns_only: bool = False, verbs_only: bool = False, + adjectives_only: bool = False, + prepositions_only: bool = False, ) -> None: """ Main scrape loop. @@ -978,13 +1332,24 @@ def run( force_refresh: Re-scrape entries where detail_scraped=True. nouns_only: Only scrape noun entries. verbs_only: Only scrape verb entries. + adjectives_only: Only scrape adjective entries. + prepositions_only: Only scrape preposition entries. """ words = _load_words() candidates = [ (unique_key, entry) for unique_key, entry in words.items() - if _should_process(entry, entry.get("pos", ""), force_refresh, nouns_only, verbs_only) and entry.get("slug") + if _should_process( + entry, + entry.get("pos", ""), + force_refresh, + nouns_only, + verbs_only, + adjectives_only, + prepositions_only, + ) + and entry.get("slug") ] total = len(candidates) @@ -992,7 +1357,10 @@ def run( candidates = candidates[:test] logger.info("Test mode: processing %d of %d eligible entries", len(candidates), total) else: - logger.info("Processing %d eligible entries (nouns+verbs) from words.json", total) + logger.info( + "Processing %d eligible entries (nouns+verbs+adjectives+prepositions) from words.json", + total, + ) processed = 0 errors = 0 @@ -1003,7 +1371,14 @@ def run( word_nikkud = entry.get("word", {}).get("nikkud", unique_key) url = f"{PEALIM_BASE}/dict/{slug}/" - label = "Noun" if pos.startswith("Noun") else "Verb" + if pos.startswith("Noun"): + label = "Noun" + elif pos.startswith("Verb"): + label = "Verb" + elif pos.startswith("Adjective"): + label = "Adjective" + else: + label = "Preposition" logger.info("[%d/%d] %s: %s (%s)", idx, len(candidates), label, word_nikkud, slug) # Fetch mo (nikkud) page @@ -1042,7 +1417,7 @@ def run( errors += 1 continue - else: # Verb + elif pos.startswith("Verb"): existing_conj = entry.get("conjugation") scraped = _scrape_verb_detail(slug, mo_html, vl_html, existing_conj) if scraped: @@ -1059,6 +1434,41 @@ def run( errors += 1 continue + elif pos.startswith("Adjective"): + scraped = _scrape_adjective_detail(slug, mo_html, vl_html) + if scraped: + existing_ai = entry.get("adjective_inflection") + merged = _merge_adjective_inflection(existing_ai, scraped) + words[unique_key]["adjective_inflection"] = merged + ms = merged.get("ms", {}) or {} + fs = merged.get("fs", {}) or {} + logger.info( + " ms=%s fs=%s mishkal=%s", + ms.get("nikkud", "—"), + fs.get("nikkud", "—"), + merged.get("mishkal", "—"), + ) + else: + logger.warning(" No adjective data scraped for %s", slug) + errors += 1 + continue + + else: # Preposition + scraped = _scrape_preposition_detail(slug, mo_html, vl_html) + if scraped: + existing_pi = entry.get("preposition_inflection") + merged = _merge_preposition_inflection(existing_pi, scraped) + words[unique_key]["preposition_inflection"] = merged + form_1s = merged.get("1s", {}) or {} + logger.info( + " 1s=%s", + form_1s.get("nikkud", "—"), + ) + else: + logger.warning(" No preposition data scraped for %s", slug) + errors += 1 + continue + except Exception as exc: # noqa: BLE001 logger.error(" Parse error for %s (%s): %s", slug, word_nikkud, exc, exc_info=True) errors += 1 @@ -1089,7 +1499,7 @@ def run( def _build_parser() -> argparse.ArgumentParser: parser = argparse.ArgumentParser( - description="Scrape pealim.com detail pages for nouns and verbs in data/words.json." + description=("Scrape pealim.com detail pages for nouns, verbs, adjectives and prepositions in data/words.json.") ) parser.add_argument( "--test", @@ -1117,6 +1527,18 @@ def _build_parser() -> argparse.ArgumentParser: default=False, help="Only scrape Verb entries.", ) + group.add_argument( + "--adjectives-only", + action="store_true", + default=False, + help="Only scrape Adjective entries.", + ) + group.add_argument( + "--prepositions-only", + action="store_true", + default=False, + help="Only scrape Preposition entries.", + ) return parser @@ -1133,4 +1555,6 @@ if __name__ == "__main__": force_refresh=args.force_refresh_detail, nouns_only=args.nouns_only, verbs_only=args.verbs_only, + adjectives_only=args.adjectives_only, + prepositions_only=args.prepositions_only, ) diff --git a/rebuild_sentence_matches.py b/rebuild_sentence_matches.py deleted file mode 100644 index 1d8b1cb..0000000 --- a/rebuild_sentence_matches.py +++ /dev/null @@ -1,183 +0,0 @@ -#!/usr/bin/env python3 -""" -Rebuild vocab_sentence_matches.json using both direct word matching -and ktiv male conjugated/declined form matching. - -This dramatically improves sentence coverage by matching not just -dictionary forms but all conjugated verbs and declined nouns. -""" - -import json -import logging -import re -from pathlib import Path - -import pandas as pd - -from helpers import strip_nikkud as _strip_nikkud - -logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") -logger = logging.getLogger(__name__) - -DATA_DIR = Path(__file__).parent / "data" - - -def main(): - # Load sentences - with open(DATA_DIR / "epub_sentence_index.json") as f: - sentences = json.load(f).get("sentences", []) - logger.info(f"Loaded {len(sentences)} sentences") - - # Load vocab CSV - csv_path = DATA_DIR / "hebrew_dict_for_anki.csv" - try: - df = pd.read_csv(csv_path, sep=";", index_col=0) - if df.shape[1] < 3: - raise ValueError - except (ValueError, pd.errors.ParserError): - df = pd.read_csv(csv_path, index_col=0) - logger.info(f"Loaded {len(df)} vocab entries") - - # Build word lookup: stripped_form → (word_nikkud, word_no_nikkud) - word_lookup: dict[str, list[tuple[str, str]]] = {} - for _, row in df.iterrows(): - word = str(row.get("Word", "")).strip() - wni = str(row.get("Word Without Nikkud", "")).strip() - if not word or word in ("nan", "None"): - continue - stripped = _strip_nikkud(word) - if stripped: - word_lookup.setdefault(stripped, []).append((word, wni)) - - # Load ktiv male forms: ktiv_male_form → [{word_nikkud, form_type, ...}] - ktiv_path = DATA_DIR / "ktiv_male_forms.json" - ktiv_forms: dict[str, list[dict]] = {} - if ktiv_path.exists(): - with open(ktiv_path) as f: - ktiv_forms = json.load(f) - logger.info(f"Loaded {len(ktiv_forms)} ktiv male forms") - else: - logger.warning("No ktiv_male_forms.json — only using direct matching") - - # Build reverse lookup: ktiv_male → set of dictionary words (nikkud) - ktiv_to_word: dict[str, set[str]] = {} - for ktiv, entries in ktiv_forms.items(): - for entry in entries: - word_nikkud = entry.get("word_nikkud", "") - if word_nikkud: - ktiv_to_word.setdefault(ktiv, set()).add(word_nikkud) - - # Also add all vocab words' own stripped forms to ktiv_to_word - for stripped, entries in word_lookup.items(): - for word_nikkud, _ in entries: - ktiv_to_word.setdefault(stripped, set()).add(word_nikkud) - - logger.info(f"Total matchable forms: {len(ktiv_to_word)}") - - # Tokenize all sentences once - sentence_tokens: list[tuple[dict, list[str]]] = [] - for s in sentences: - stripped = s.get("stripped", _strip_nikkud(s.get("text", ""))) - tokens = [re.sub(r'[.,!?;:"\'\u05be]', "", t) for t in stripped.split()] - tokens = [t for t in tokens if t] # remove empty - sentence_tokens.append((s, tokens)) - - # Match: for each sentence token, check ktiv_to_word lookup - # Build word_nikkud → [sentence_info] - matches: dict[str, list[dict]] = {} # word_nikkud → [sentences] - - for sent, tokens in sentence_tokens: - text = sent.get("text", "") - book = sent.get("book", "") - word_len = len(tokens) - - # Skip sentences that are too short or too long - if word_len < 4 or word_len > 15: - continue - - for tok in tokens: - if tok in ktiv_to_word: - for word_nikkud in ktiv_to_word[tok]: - matches.setdefault(word_nikkud, []).append( - { - "text": text, - "book": book, - "matched_form": tok, - "word_count": word_len, - } - ) - - logger.info(f"Words with at least 1 match: {len(matches)}") - - # Deduplicate and limit to 3 best sentences per word - # Prefer shorter sentences (6-12 words ideal) - output: dict[str, dict] = {} - for word_nikkud, sents in matches.items(): - # Deduplicate by text - seen_texts = set() - unique = [] - for s in sents: - if s["text"] not in seen_texts: - seen_texts.add(s["text"]) - unique.append(s) - - # Score: prefer 6-12 word sentences - def score(s): - wc = s["word_count"] - if 6 <= wc <= 12: - return 0 # ideal - return abs(wc - 9) # distance from ideal - - unique.sort(key=score) - best = unique[:3] - - # Find the Word Without Nikkud for this word - stripped = _strip_nikkud(word_nikkud) - wni = stripped # default - if stripped in word_lookup: - for wn, w_wni in word_lookup[stripped]: - if wn == word_nikkud: - wni = w_wni - break - - output[wni] = { - "word_nikkud": word_nikkud, - "sentences": [{"text": s["text"], "book": s["book"]} for s in best], - } - - # Save - out_path = DATA_DIR / "vocab_sentence_matches.json" - with open(out_path, "w") as f: - json.dump(output, f, ensure_ascii=False, indent=1) - - total_sents = sum(len(v["sentences"]) for v in output.values()) - logger.info(f"Saved {len(output)} words with {total_sents} sentences → {out_path}") - - # Stats - total_vocab = len(df) - pct = len(output) * 100 / total_vocab - logger.info(f"Coverage: {len(output)}/{total_vocab} ({pct:.1f}%)") - - # Breakdown by match type - direct_only = 0 - ktiv_only = 0 - both = 0 - for _wni, info in output.items(): - word = info["word_nikkud"] - stripped = _strip_nikkud(word) - has_direct = stripped in word_lookup - has_ktiv = any(s.get("matched_form", "") != stripped for s in info["sentences"]) - if has_direct and has_ktiv: - both += 1 - elif has_ktiv: - ktiv_only += 1 - else: - direct_only += 1 - - logger.info(f" Direct matches only: {direct_only}") - logger.info(f" Ktiv male matches only: {ktiv_only}") - logger.info(f" Both: {both}") - - -if __name__ == "__main__": - main() diff --git a/run.py b/run.py index 93142f5..b3d527c 100644 --- a/run.py +++ b/run.py @@ -11,7 +11,7 @@ Pipeline steps: 1. List scrape — scrape pealim.com list pages → words.json (captures slugs) 2. Detail scrape — scrape noun/verb detail pages using slugs → words.json 3. Frequency — load/download word frequency data - 4. Examples — fetch Ben Yehuda example sentences + 4. Examples — extract example sentences from Hebrew EPUBs 5. Audio download — download audio mp3 files 6. Fonts — download Heebo font files 7. Images — fetch noun images from Wikipedia @@ -21,9 +21,8 @@ Options: --skip-scrape Skip list page scraping (use existing words.json) --skip-detail Skip detail page scraping --skip-audio Skip audio .mp3 downloads - --skip-examples Skip Ben Yehuda example fetching + --skip-examples Skip EPUB example extraction --skip-images Skip image fetching for concrete nouns - --refresh-examples Force rebuild of Ben Yehuda index --test N Limit to first N words/pages """ @@ -60,9 +59,8 @@ def parse_args(): p.add_argument("--skip-scrape", action="store_true", help="Skip list page scraping") p.add_argument("--skip-detail", action="store_true", help="Skip detail page scraping") p.add_argument("--skip-audio", action="store_true", help="Skip audio downloads") - p.add_argument("--skip-examples", action="store_true", help="Skip Ben Yehuda example lookup") + p.add_argument("--skip-examples", action="store_true", help="Skip EPUB example extraction") p.add_argument("--skip-images", action="store_true", help="Skip image fetching") - p.add_argument("--refresh-examples", action="store_true", help="Force rebuild of Ben Yehuda index") p.add_argument("--test", type=int, metavar="N", help="Limit to first N words") return p.parse_args() @@ -93,22 +91,15 @@ def step_frequency() -> dict[str, int]: return frequency_lookup._freq -def step_examples(args, _freq_cache: dict): - """Step 4 — load/build Ben Yehuda example index.""" +def step_examples(args) -> dict: + """Step 4 — extract example sentences from Hebrew EPUBs.""" if args.skip_examples: logger.info("[4] Skipping examples (--skip-examples)") - examples_path = DATA_DIR / "examples_cache.json" - if examples_path.exists(): - with open(examples_path) as f: - return json.load(f) return {} - logger.info("[4] Loading Ben Yehuda example index …") - import benyehuda + logger.info("[4] Extracting EPUB example sentences …") + import epub_examples - benyehuda.load(force_rebuild=args.refresh_examples) - - # Read word list from words.json instead of CSV if not WORDS_JSON.exists(): logger.warning("[4] words.json not found, skipping examples") return {} @@ -116,41 +107,14 @@ def step_examples(args, _freq_cache: dict): with open(WORDS_JSON, encoding="utf-8") as f: words = json.load(f) - entries = list(words.values()) - if args.test: - entries = entries[: args.test] + stats = epub_examples.run(words) - # Build confusable consonant set from words.json - consonant_counts: dict[str, int] = {} - for entry in entries: - ktiv_male = entry.get("word", {}).get("ktiv_male", "") - if ktiv_male: - safe = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male) - if safe: - consonant_counts[safe] = consonant_counts.get(safe, 0) + 1 - confusable_consonants = {k for k, v in consonant_counts.items() if v > 1} + # Save updated words.json + with open(WORDS_JSON, "w", encoding="utf-8") as f: + json.dump(words, f, ensure_ascii=False, indent=2) - # Delete stale cache entries for confusable words so they get re-fetched - stale_deleted = 0 - for entry in entries: - word_nikkud = entry.get("word", {}).get("nikkud", "") - ktiv_male = entry.get("word", {}).get("ktiv_male", "") - if word_nikkud and ktiv_male: - safe = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male) - if safe in confusable_consonants and word_nikkud in benyehuda._examples_cache: - del benyehuda._examples_cache[word_nikkud] - stale_deleted += 1 - if stale_deleted: - logger.info(f" Deleted {stale_deleted} stale confusable cache entries") - - logger.info(f" Pre-fetching examples for {len(entries)} words …") - for entry in entries: - word_nikkud = entry.get("word", {}).get("nikkud", "") - if word_nikkud: - benyehuda.get_examples(word_nikkud, confusable_consonants=confusable_consonants) - - benyehuda.save_examples_cache() - return benyehuda._examples_cache + logger.info(f" Coverage: {stats['matched']}/{stats['total_vocab']}") + return stats def step_detail_scrape(args): @@ -250,7 +214,7 @@ def step_build_all(args): apkg_builder.build_all_variants(words, limit=args.test) -def print_summary(_args: argparse.Namespace, examples_cache: dict, freq_cache: dict): +def print_summary(_args: argparse.Namespace, example_stats: dict, freq_cache: dict): logger.info("") logger.info("=" * 60) logger.info("SUMMARY") @@ -267,10 +231,12 @@ def print_summary(_args: argparse.Namespace, examples_cache: dict, freq_cache: d logger.info(f" Nouns: {nouns}, Verbs: {verbs}, Detail-scraped: {detail_scraped}") logger.info(f" Frequency entries: {len(freq_cache)}") - logger.info(f" Example cache entries: {len(examples_cache)}") - covered = sum(1 for v in examples_cache.values() if v) - if examples_cache: - logger.info(f" Example coverage: {covered}/{len(examples_cache)} ({100 * covered // len(examples_cache)}%)") + matched = example_stats.get("matched", 0) + total = example_stats.get("total_vocab", 0) + if total: + logger.info(f" Example coverage: {matched}/{total} ({100 * matched // total}%)") + for book, count in example_stats.get("books", {}).items(): + logger.info(f" {book}: {count} sentences") if AUDIO_DIR.exists(): mp3s = list(AUDIO_DIR.glob("*.mp3")) @@ -321,8 +287,6 @@ def main(): logger.info(f" MODE: --only {args.only}") if args.test: logger.info(f" TEST MODE: {args.test} words") - if args.refresh_examples: - logger.info(" REFRESH EXAMPLES: Ben Yehuda index will be rebuilt") logger.info("=" * 60) def _load_words_for_only() -> dict: @@ -385,13 +349,13 @@ def main(): step_list_scrape(args) # 1 — scrape list pages → words.json (captures slugs) step_detail_scrape(args) # 2 — scrape detail pages using slugs → words.json freq_cache = step_frequency() # 3 — word frequency data - examples_cache = step_examples(args, _freq_cache=freq_cache) # 4 — Ben Yehuda examples + example_stats = step_examples(args) # 4 — EPUB example sentences step_audio_download(args) # 5 — download audio mp3s step_fonts(args) # 6 — download Heebo fonts step_images(args) # 7 — fetch noun images step_build_all(args) # 8 — build all .apkg variants - print_summary(args, examples_cache, freq_cache) + print_summary(args, example_stats, freq_cache) if __name__ == "__main__": diff --git a/scripts/validate_data.py b/scripts/validate_data.py index 69dfb9d..5ce760d 100644 --- a/scripts/validate_data.py +++ b/scripts/validate_data.py @@ -32,7 +32,7 @@ DATA_FILE = Path(__file__).parent.parent / "data" / "words.json" HEBREW_CONSONANT_RANGE = (0x05D0, 0x05EA) # alef–tav VALID_PERSON_CODES: frozenset[str] = frozenset( - ["1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp", "3p", "ms", "fs", "mp", "fp"] + ["inf", "1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp", "3p", "ms", "fs", "mp", "fp"] ) EMOJI_RE = re.compile( @@ -561,6 +561,7 @@ def test_conjugation_form_guids(data: dict[str, Any]) -> None: """ name = "conjugation_form_guids" errors: list[str] = [] + warnings: list[str] = [] for key, entry in data.items(): conj = entry.get("conjugation") @@ -580,7 +581,8 @@ def test_conjugation_form_guids(data: dict[str, Any]) -> None: guid_candidates = form.get("guid_candidates") if not guid and not guid_candidates: - errors.append(f"[{key}] {label}: missing or null 'guid' and no 'guid_candidates'") + # New forms from rescrape use deterministic fallback — warn, don't fail + warnings.append(f"[{key}] {label}: missing or null 'guid' and no 'guid_candidates'") continue if guid: @@ -597,6 +599,8 @@ def test_conjugation_form_guids(data: dict[str, Any]) -> None: else: seen_guids[candidate] = label + if warnings: + _warn(name + "_missing", [f"{len(warnings)} forms missing guid (deterministic fallback used)"]) if errors: _fail(name, errors[:20] if not _verbose else errors) if len(errors) > 20 and not _verbose: diff --git a/tests/test_detail_scrape.py b/tests/test_detail_scrape.py new file mode 100644 index 0000000..8a040c5 --- /dev/null +++ b/tests/test_detail_scrape.py @@ -0,0 +1,486 @@ +"""Tests for adjective and preposition detail page parsing in pealim_detail_scrape.py.""" + +import sys +from pathlib import Path + +import pytest + +sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) + +from pealim_detail_scrape import ( + _parse_adjective_table, + _parse_adjective_table_vl, + _parse_preposition_table, + _parse_preposition_table_vl, + _scrape_adjective_detail, + _scrape_preposition_detail, +) + +# --------------------------------------------------------------------------- +# Fixtures — real HTML snippets from pealim.com +# --------------------------------------------------------------------------- + +ADJECTIVE_MO_TABLE = """ + + + + + + + + + + + + + + + + + + + + + +
SingularPlural
MasculineFeminineMasculineFeminine
+
+
+ 🔊 + אֲבִיבִי +
+
spring-like, vernal
+
+
+
+
+ 🔊 + אֲבִיבִית +
+
spring-like, vernal
+
+
+
+
+ 🔊 + אֲבִיבִיִּים +
+
spring-like, vernal
+
+
+
+
+ 🔊 + אֲבִיבִיּוֹת +
+
spring-like, vernal
+
+
+""" + +# VL version: menukad spans contain unvowelled text (hebstyle=vl) +ADJECTIVE_VL_TABLE = """ + + + + + + + + + +
+
+ אביבי +
+
+
+ אביבית +
+
+
+ אביביים +
+
+
+ אביביות +
+
+""" + +PREPOSITION_MO_TABLE = """ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
PersonSingularPlural
MasculineFeminineMasculineFeminine
1st +
+ 🔊 + שֶׁלִּי +
of mine
+
+
+ 🔊 + שֶׁלָּנוּ +
of ours
+
2nd +
+ 🔊 + שֶׁלְּךָ +
of yours m. sg.
+
+
+ 🔊 + שֶׁלָּךְ +
of yours f. sg.
+
+
+ 🔊 + שֶׁלָּכֶם +
of yours m. pl.
+
+
+ 🔊 + שֶׁלָּכֶן +
of yours f. pl.
+
3rd +
+ 🔊 + שֶׁלּוֹ +
of his
+
+
+ 🔊 + שֶׁלָּהּ +
of hers
+
+
+ 🔊 + שֶׁלָּהֶם +
of theirs m.
+
+
+ 🔊 + שֶׁלָּהֶן +
of theirs f.
+
+""" + +PREPOSITION_VL_TABLE = """ + + + + + + + + + + + + + + + + + + + + + + +
1st
+ שלי +
+ שלנו +
2nd
+ שלך +
+ שלך +
+ שלכם +
+ שלכן +
3rd
+ שלו +
+ שלה +
+ שלהם +
+ שלהן +
+""" + +# Minimal full-page wrappers so _scrape_*_detail() can parse them +_ADJECTIVE_MO_PAGE = f"{ADJECTIVE_MO_TABLE}" +_ADJECTIVE_VL_PAGE = f"{ADJECTIVE_VL_TABLE}" +_PREPOSITION_MO_PAGE = f"{PREPOSITION_MO_TABLE}" +_PREPOSITION_VL_PAGE = f"{PREPOSITION_VL_TABLE}" + + +# --------------------------------------------------------------------------- +# Adjective table tests +# --------------------------------------------------------------------------- + + +class TestParseAdjectiveTable: + """Tests for _parse_adjective_table (mo/nikkud page).""" + + def test_returns_four_form_keys(self) -> None: + result = _parse_adjective_table(__import__("bs4").BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml")) + assert set(result.keys()) == {"ms", "fs", "mp", "fp"} + + def test_ms_nikkud(self) -> None: + result = _parse_adjective_table(__import__("bs4").BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml")) + assert result["ms"]["nikkud"] == "אֲבִיבִי" + + def test_fs_nikkud(self) -> None: + result = _parse_adjective_table(__import__("bs4").BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml")) + assert result["fs"]["nikkud"] == "אֲבִיבִית" + + def test_mp_nikkud(self) -> None: + result = _parse_adjective_table(__import__("bs4").BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml")) + assert result["mp"]["nikkud"] == "אֲבִיבִיִּים" + + def test_fp_nikkud(self) -> None: + result = _parse_adjective_table(__import__("bs4").BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml")) + assert result["fp"]["nikkud"] == "אֲבִיבִיּוֹת" + + def test_audio_url_present(self) -> None: + result = _parse_adjective_table(__import__("bs4").BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml")) + assert result["ms"]["audio_url"].startswith("https://audio.pealim.com/") + + def test_empty_on_missing_table(self) -> None: + result = _parse_adjective_table(__import__("bs4").BeautifulSoup("", "lxml")) + assert result == {} + + +class TestParseAdjectiveTableVl: + """Tests for _parse_adjective_table_vl (ktiv male page).""" + + def test_returns_four_form_keys(self) -> None: + result = _parse_adjective_table_vl(__import__("bs4").BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml")) + assert set(result.keys()) == {"ms", "fs", "mp", "fp"} + + def test_ms_ktiv(self) -> None: + result = _parse_adjective_table_vl(__import__("bs4").BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml")) + assert result["ms"] == "אביבי" + + def test_fs_ktiv(self) -> None: + result = _parse_adjective_table_vl(__import__("bs4").BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml")) + assert result["fs"] == "אביבית" + + def test_mp_ktiv(self) -> None: + result = _parse_adjective_table_vl(__import__("bs4").BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml")) + assert result["mp"] == "אביביים" + + def test_fp_ktiv(self) -> None: + result = _parse_adjective_table_vl(__import__("bs4").BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml")) + assert result["fp"] == "אביביות" + + +# --------------------------------------------------------------------------- +# _scrape_adjective_detail tests +# --------------------------------------------------------------------------- + + +class TestScrapeAdjectiveDetail: + """Tests for _scrape_adjective_detail — schema compliance.""" + + @pytest.fixture() + def result(self) -> dict: + return _scrape_adjective_detail("9098-avivi", _ADJECTIVE_MO_PAGE, _ADJECTIVE_VL_PAGE) + + def test_returns_non_empty_dict(self, result: dict) -> None: + assert result + + def test_ms_has_nikkud_and_ktiv(self, result: dict) -> None: + assert result["ms"]["nikkud"] == "אֲבִיבִי" + assert result["ms"]["ktiv_male"] == "אביבי" + + def test_fs_has_nikkud_and_ktiv(self, result: dict) -> None: + assert result["fs"]["nikkud"] == "אֲבִיבִית" + assert result["fs"]["ktiv_male"] == "אביבית" + + def test_mp_has_nikkud_and_ktiv(self, result: dict) -> None: + assert result["mp"]["nikkud"] == "אֲבִיבִיִּים" + assert result["mp"]["ktiv_male"] == "אביביים" + + def test_fp_has_nikkud_and_ktiv(self, result: dict) -> None: + assert result["fp"]["nikkud"] == "אֲבִיבִיּוֹת" + assert result["fp"]["ktiv_male"] == "אביביות" + + def test_mishkal_key_present(self, result: dict) -> None: + # mishkal may be None since no PoS section is in our minimal fixture + assert "mishkal" in result + + def test_mishkal_hebrew_key_present(self, result: dict) -> None: + assert "mishkal_hebrew" in result + + def test_all_schema_keys_present(self, result: dict) -> None: + expected = {"ms", "fs", "mp", "fp", "mishkal", "mishkal_hebrew"} + assert expected.issubset(result.keys()) + + def test_empty_on_no_table(self) -> None: + result = _scrape_adjective_detail("missing", "", "") + assert result == {} + + +# --------------------------------------------------------------------------- +# Preposition table tests +# --------------------------------------------------------------------------- + + +class TestParsePrepositionTable: + """Tests for _parse_preposition_table (mo/nikkud page).""" + + @pytest.fixture() + def result(self) -> dict: + return _parse_preposition_table(__import__("bs4").BeautifulSoup(PREPOSITION_MO_TABLE, "lxml")) + + def test_returns_ten_form_keys(self, result: dict) -> None: + expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"} + assert set(result.keys()) == expected + + def test_1s_nikkud(self, result: dict) -> None: + assert result["1s"]["nikkud"] == "שֶׁלִּי" + + def test_1p_nikkud(self, result: dict) -> None: + assert result["1p"]["nikkud"] == "שֶׁלָּנוּ" + + def test_2ms_nikkud(self, result: dict) -> None: + assert result["2ms"]["nikkud"] == "שֶׁלְּךָ" + + def test_2fs_nikkud(self, result: dict) -> None: + assert result["2fs"]["nikkud"] == "שֶׁלָּךְ" + + def test_2mp_nikkud(self, result: dict) -> None: + assert result["2mp"]["nikkud"] == "שֶׁלָּכֶם" + + def test_2fp_nikkud(self, result: dict) -> None: + assert result["2fp"]["nikkud"] == "שֶׁלָּכֶן" + + def test_3ms_nikkud(self, result: dict) -> None: + assert result["3ms"]["nikkud"] == "שֶׁלּוֹ" + + def test_3fs_nikkud(self, result: dict) -> None: + assert result["3fs"]["nikkud"] == "שֶׁלָּהּ" + + def test_3mp_nikkud(self, result: dict) -> None: + assert result["3mp"]["nikkud"] == "שֶׁלָּהֶם" + + def test_3fp_nikkud(self, result: dict) -> None: + assert result["3fp"]["nikkud"] == "שֶׁלָּהֶן" + + def test_audio_url_present(self, result: dict) -> None: + assert result["1s"]["audio_url"].startswith("https://audio.pealim.com/") + + def test_empty_on_missing_table(self) -> None: + result = _parse_preposition_table(__import__("bs4").BeautifulSoup("", "lxml")) + assert result == {} + + +class TestParsePrepositionTableVl: + """Tests for _parse_preposition_table_vl (ktiv male page).""" + + @pytest.fixture() + def result(self) -> dict: + return _parse_preposition_table_vl(__import__("bs4").BeautifulSoup(PREPOSITION_VL_TABLE, "lxml")) + + def test_returns_ten_form_keys(self, result: dict) -> None: + expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"} + assert set(result.keys()) == expected + + def test_1s_ktiv(self, result: dict) -> None: + assert result["1s"] == "שלי" + + def test_1p_ktiv(self, result: dict) -> None: + assert result["1p"] == "שלנו" + + def test_2ms_ktiv(self, result: dict) -> None: + assert result["2ms"] == "שלך" + + def test_3ms_ktiv(self, result: dict) -> None: + assert result["3ms"] == "שלו" + + def test_3fp_ktiv(self, result: dict) -> None: + assert result["3fp"] == "שלהן" + + +# --------------------------------------------------------------------------- +# _scrape_preposition_detail tests +# --------------------------------------------------------------------------- + + +class TestScrapePrepositionDetail: + """Tests for _scrape_preposition_detail — schema compliance.""" + + @pytest.fixture() + def result(self) -> dict: + return _scrape_preposition_detail("2643-shel", _PREPOSITION_MO_PAGE, _PREPOSITION_VL_PAGE) + + def test_returns_non_empty_dict(self, result: dict) -> None: + assert result + + def test_all_ten_person_keys_present(self, result: dict) -> None: + expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"} + assert expected.issubset(result.keys()) + + def test_1s_has_nikkud_and_ktiv(self, result: dict) -> None: + assert result["1s"]["nikkud"] == "שֶׁלִּי" + assert result["1s"]["ktiv_male"] == "שלי" + + def test_1p_has_nikkud_and_ktiv(self, result: dict) -> None: + assert result["1p"]["nikkud"] == "שֶׁלָּנוּ" + assert result["1p"]["ktiv_male"] == "שלנו" + + def test_2ms_has_nikkud_and_ktiv(self, result: dict) -> None: + assert result["2ms"]["nikkud"] == "שֶׁלְּךָ" + assert result["2ms"]["ktiv_male"] == "שלך" + + def test_3ms_has_nikkud_and_ktiv(self, result: dict) -> None: + assert result["3ms"]["nikkud"] == "שֶׁלּוֹ" + assert result["3ms"]["ktiv_male"] == "שלו" + + def test_3fs_has_nikkud_and_ktiv(self, result: dict) -> None: + assert result["3fs"]["nikkud"] == "שֶׁלָּהּ" + assert result["3fs"]["ktiv_male"] == "שלה" + + def test_3fp_has_nikkud_and_ktiv(self, result: dict) -> None: + assert result["3fp"]["nikkud"] == "שֶׁלָּהֶן" + assert result["3fp"]["ktiv_male"] == "שלהן" + + def test_empty_on_no_table(self) -> None: + result = _scrape_preposition_detail("missing", "", "") + assert result == {} diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 78851ef..d64223d 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -42,3 +42,17 @@ def test_strip_nikkud_all_marks(): nikkud = "הַמַּלְכָּה" plain = strip_nikkud(nikkud) assert all(ch < "\u0591" or ch > "\u05c7" for ch in plain), f"Residual nikkud in: {plain}" + + +def test_categorize_pos_no_substring_match(): + """Regression: 'Pronoun' must NOT match 'Noun' category.""" + from apkg_builder import _categorize_pos + + assert _categorize_pos("Noun") == "Noun" + assert _categorize_pos("Verb") == "Verb" + assert _categorize_pos("Adjective") == "Adjective" + assert _categorize_pos("Adverb") == "Adverb" + assert _categorize_pos("Pronoun") == "Other", "Pronoun must not match Noun" + assert _categorize_pos("Preposition") == "Other" + assert _categorize_pos("Conjunction") == "Other" + assert _categorize_pos("Cardinal numeral") == "Other"