hebrew_flash_cards/epub_examples.py
Sochen 00fba934fb feat(epub_examples): export try_strip_prefix as public alias
Exposes _try_strip_prefix under a public name so the upcoming
sentence_difficulty module can reuse Hebrew prefix stripping logic
without duplicating it.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-15 13:18:55 +00:00

898 lines
31 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Extract example sentences from vocalized (nikkud) Hebrew EPUB and plain-text
files, match them against the vocabulary list in data/words.json, and write
the matched examples back into words.json.

Usage (standalone):
    python3 epub_examples.py

Called from run.py via:
    run(words)  — the words dict is passed in and updated in place
"""
import logging
import os
import re
import zipfile
from html.parser import HTMLParser
from pathlib import Path
from helpers import strip_nikkud
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)
# Data locations, all resolved relative to the directory containing this file.
DATA_DIR = Path(__file__).parent / "data"
# Directory that is scanned for source books.
EPUB_DIR = DATA_DIR / "epubs"
# Path of the vocabulary file (data/words.json).
WORDS_JSON = DATA_DIR / "words.json"
# Book discovery: source file path -> short display name
def _discover_epubs() -> dict[str, str]:
    """Map every ``.epub`` and ``.txt`` file in EPUB_DIR to a display name.

    Returns:
        ``{file path as str: display name}``. EPUB files are listed first
        (sorted by path), followed by the plain-text files (also sorted).
        An empty dict is returned when EPUB_DIR does not exist.
    """

    def _display_name(stem: str) -> str:
        """Derive a short English label from an EPUB file stem."""
        # strip_nikkud comes from helpers; presumably it removes vowel points
        # so the Hebrew keywords below match pointed titles — TODO confirm.
        flat = strip_nikkud(stem).lower()
        # Only the part before the first " -- " is treated as the title
        title = strip_nikkud(stem.split(" -- ")[0]).strip().lower()

        if "alice" in flat or "אליס" in title:
            return "alice_wonderland"
        if "little_prince" in flat or "נסיך" in title:
            return "little_prince"
        if "מנהרת" in title or "time_tunnel" in flat:
            # Volume number = first run of digits in the stem; without one,
            # fall back to the stem minus its "time_tunnel_" marker.
            digits = re.search(r"(\d+)", flat)
            suffix = digits.group(1) if digits else flat.replace("time_tunnel_", "")
            return f"time_tunnel_{suffix}"
        # Unknown book: use (at most) the first 40 characters of the stem
        return flat[:40]

    if not EPUB_DIR.exists():
        return {}

    discovered: dict[str, str] = {
        str(epub): _display_name(epub.stem) for epub in sorted(EPUB_DIR.glob("*.epub"))
    }
    # Also discover plain-text files (e.g. Ben Yehuda downloads); these keep
    # their file stem as the display name.
    discovered.update((str(txt), txt.stem) for txt in sorted(EPUB_DIR.glob("*.txt")))
    return discovered
# Sentence length bounds, counted in Hebrew-script words; both ends inclusive
# (a sentence is rejected only when its count is < MIN_WORDS or > MAX_WORDS).
MIN_WORDS = 3
MAX_WORDS = 15
# ── HTML text extraction ─────────────────────────────────────────
class _TextExtractor(HTMLParser):
    """Collect the text nodes of an HTML document.

    Text inside ``script``, ``style`` and ``head`` elements is ignored. A
    newline is emitted whenever a block-level start tag is seen so that words
    from adjacent blocks do not run together.
    """

    SKIP_TAGS = {"script", "style", "head"}
    # Start tags that introduce a line break in the extracted text
    _BLOCK_TAGS = frozenset(
        {
            "p", "div", "br", "li",
            "h1", "h2", "h3", "h4", "h5", "h6",
            "td", "th", "tr",
            "blockquote", "section",
        }
    )

    def __init__(self):
        super().__init__()
        # Text fragments in document order; joined by get_text()
        self.parts: list[str] = []
        # Number of currently open SKIP_TAGS elements (0 = text is collected)
        self._skip_depth = 0

    def handle_starttag(self, tag, attrs):
        # ``attrs`` is part of the HTMLParser callback signature; it is not used.
        if tag in self.SKIP_TAGS:
            self._skip_depth += 1
        if tag in self._BLOCK_TAGS:
            self.parts.append("\n")

    def handle_endtag(self, tag):
        # Never let a stray closing tag push the depth below zero
        if tag in self.SKIP_TAGS and self._skip_depth > 0:
            self._skip_depth -= 1

    def handle_data(self, data):
        if self._skip_depth:
            return
        self.parts.append(data)

    def get_text(self) -> str:
        """Return everything collected so far as a single string."""
        return "".join(self.parts)
def extract_text_from_html(html: str) -> str:
    """Parse HTML and return its plain text.

    Args:
        html: Complete HTML/XHTML document (or fragment) as a string.

    Returns:
        The text collected by ``_TextExtractor``.
    """
    parser = _TextExtractor()
    parser.feed(html)
    # feed() can hold back trailing input that might still be incomplete
    # (e.g. an unterminated character reference at the very end). close()
    # tells the parser the document is finished so that text is not lost.
    parser.close()
    return parser.get_text()
# ── EPUB processing ──────────────────────────────────────────────
def _content_files_from_epub(zf: zipfile.ZipFile) -> list[str]:
    """Return the archive's (X)HTML content documents in reading order.

    The order is taken from the ``<spine>`` of the first ``.opf`` member found.
    When there is no OPF file, or the spine yields no usable member, every
    ``.xhtml``/``.html`` member whose name does not contain ``toc``, ``cover``
    or ``nav`` is returned instead, sorted by name.

    Args:
        zf: An open EPUB archive.

    Returns:
        ZIP member names, in reading order.
    """
    # ZIP member names always use "/" regardless of platform, hence posixpath.
    import posixpath  # noqa: PLC0415
    from urllib.parse import unquote  # noqa: PLC0415

    names = zf.namelist()
    known = set(names)  # built once: O(1) membership tests in the spine loop

    def _fallback() -> list[str]:
        # Used by both fallbacks so they apply the same exclusions
        return sorted(
            n
            for n in names
            if n.endswith((".xhtml", ".html"))
            and not any(marker in n.lower() for marker in ("toc", "cover", "nav"))
        )

    opf_path = next((n for n in names if n.endswith(".opf")), None)
    if not opf_path:
        return _fallback()

    # Parse OPF to get spine order
    opf_content = zf.read(opf_path).decode("utf-8")
    opf_dir = posixpath.dirname(opf_path)

    # Manifest items: id -> href (attributes may appear in either order)
    manifest: dict[str, str] = {}
    for m in re.finditer(r'<item\s+[^>]*id="([^"]+)"[^>]*href="([^"]+)"', opf_content):
        manifest[m.group(1)] = m.group(2)
    for m in re.finditer(r'<item\s+[^>]*href="([^"]+)"[^>]*id="([^"]+)"', opf_content):
        manifest[m.group(2)] = m.group(1)

    result: list[str] = []
    for sid in re.findall(r'<itemref\s+[^>]*idref="([^"]+)"', opf_content):
        href = manifest.get(sid, "")
        if not href.endswith((".xhtml", ".html")):
            continue
        # hrefs are relative to the OPF file and may be percent-encoded or
        # contain "../". Try the literal path first (original behaviour), then
        # the decoded and the normalized spellings.
        joined = [
            posixpath.join(opf_dir, candidate).replace("\\", "/")
            for candidate in (href, unquote(href))
        ]
        candidates = joined + [posixpath.normpath(path) for path in joined]
        member = next((path for path in candidates if path in known), None)
        if member is not None:
            result.append(member)

    return result or _fallback()
def extract_sentences_from_epub(epub_path: Path, book_name: str) -> list[dict]:
    """Extract sentences from an EPUB file.

    Args:
        epub_path: Path to the .epub file.
        book_name: Human-readable book name used as the ``source`` field.

    Returns:
        List of ``{"text": str, "source": str}`` dicts.
    """
    chapters: list[str] = []
    # The context manager closes the archive handle even if reading fails;
    # previously the ZipFile was opened and never closed.
    with zipfile.ZipFile(epub_path) as zf:
        for member in _content_files_from_epub(zf):
            try:
                html = zf.read(member).decode("utf-8")
            except (KeyError, UnicodeDecodeError):
                # Best effort: skip members that are missing or not valid UTF-8
                continue
            chapters.append(extract_text_from_html(html))
    # Chapters are joined with a newline so text never runs across chapters
    return _split_into_sentences("\n".join(chapters), book_name)
def extract_sentences_from_text(text_path: Path, book_name: str) -> list[dict]:
    """Extract sentences from a UTF-8 plain-text file (e.g. Ben Yehuda downloads).

    Args:
        text_path: Path to the .txt file.
        book_name: Human-readable book name used as the ``source`` field.

    Returns:
        List of ``{"text": str, "source": str}`` dicts.
    """
    # The whole file is read at once and handed to the shared sentence splitter
    return _split_into_sentences(text_path.read_text(encoding="utf-8"), book_name)
# ── Sentence splitting ───────────────────────────────────────────
# Sentence terminators: one or more of period, exclamation mark, question
# mark, or sof pasuk (U+05C3). Used with .split(), so the terminators
# themselves do not appear in the resulting sentences.
_SENT_SPLIT = re.compile(r"[.!?\u05C3]+")
# Punctuation to strip from word boundaries when matching. The pattern is
# anchored, so only a leading run (^...) or a trailing run (...$) is removed;
# punctuation inside a token is left alone. Covered: ASCII and typographic
# quotes, geresh/gershayim (U+05F3/U+05F4), comma, semicolon, colon, hyphen,
# en/em dash, ellipsis, brackets/braces/parentheses and guillemets.
_PUNCT = re.compile(
    r'^[\u0022\u0027\u05F4\u05F3,;:\-\u2013\u2014\u2026\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+|'
    r'[\u0022\u0027\u05F4\u05F3,;:\-\u2013\u2014\u2026\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+$'
)
def _split_into_sentences(text: str, book_name: str) -> list[dict]:
    """Cut text into sentences and keep those of acceptable length.

    Args:
        text: Raw extracted text (whitespace is normalized here).
        book_name: Source label attached to every returned sentence.

    Returns:
        List of ``{"text": str, "source": str}`` dicts in order of first
        appearance, deduplicated by exact (pointed) text.
    """

    def _has_hebrew(token: str) -> bool:
        # True when any character lies in the Hebrew block U+0590–U+05FF
        return any("\u0590" <= ch <= "\u05ff" for ch in token)

    # Collapse every whitespace run (including newlines) to a single space
    normalized = re.sub(r"\s+", " ", text).strip()

    results: list[dict] = []
    seen: set[str] = set()
    for chunk in _SENT_SPLIT.split(normalized):
        sentence = chunk.strip()
        # Skip empties and exact repeats (``seen`` only holds accepted sentences)
        if not sentence or sentence in seen:
            continue
        # Only Hebrew tokens count towards the length; numbers etc. are ignored
        hebrew_count = sum(1 for token in sentence.split() if _has_hebrew(token))
        if not (MIN_WORDS <= hebrew_count <= MAX_WORDS):
            continue
        seen.add(sentence)
        results.append({"text": sentence, "source": book_name})
    return results
# ── Nikkud index ─────────────────────────────────────────────────
# Unicode ranges for Hebrew combining marks
_NIKKUD_LOW = 0x05B0  # start of vowel points (shva)
_NIKKUD_HIGH = 0x05BD  # end of vowel range (meteg); 0x05BE is maqaf (punctuation)
_DAGESH = "\u05bc"
_SHIN_DOT = "\u05c1"
_SIN_DOT = "\u05c2"
# Valid prefix consonants
_PREFIX_CONSONANTS = set("בהוכלמש")
# Named vowel combining marks
_SHVA = "\u05b0"
_HIRIQ = "\u05b4"
_TSERE = "\u05b5"
_SEGOL = "\u05b6"
_PATACH = "\u05b7"
_QAMATZ = "\u05b8"
# Valid nikkud patterns on each prefix consonant.
# Key = consonant, Value = set of frozensets of combining marks valid for that prefix.
# A candidate prefix is accepted only when the exact set of marks found on its
# consonant is one of the frozensets listed here.
_VALID_PREFIX_MARKS: dict[str, set[frozenset]] = {
    "ב": {
        frozenset({_SHVA, _DAGESH}),  # בְּ standard
        frozenset({_HIRIQ, _DAGESH}),  # בִּ before shva
        frozenset({_PATACH, _DAGESH}),  # בַּ with definite article
        frozenset({_QAMATZ, _DAGESH}),  # בָּ before chataf qamatz
        frozenset({_SEGOL, _DAGESH}),  # בֶּ before chataf segol
    },
    "כ": {
        frozenset({_SHVA, _DAGESH}),  # כְּ
        frozenset({_HIRIQ, _DAGESH}),  # כִּ
        frozenset({_PATACH, _DAGESH}),  # כַּ
        frozenset({_QAMATZ, _DAGESH}),  # כָּ
        frozenset({_SEGOL, _DAGESH}),  # כֶּ
    },
    "ל": {
        frozenset({_SHVA}),  # לְ standard
        frozenset({_HIRIQ}),  # לִ before shva
        frozenset({_PATACH}),  # לַ with definite article
        frozenset({_QAMATZ}),  # לָ demonstratives
        frozenset({_SEGOL}),  # לֶ before chataf segol
    },
    "ו": {
        frozenset({_SHVA}),  # וְ standard
        frozenset({_DAGESH}),  # וּ (shureq) before shva or the labials ב/ו/מ/פ
        frozenset({_PATACH}),  # וַ before chataf patach
        frozenset({_QAMATZ}),  # וָ before chataf qamatz
        frozenset({_SEGOL}),  # וֶ before chataf segol
        frozenset({_HIRIQ}),  # וִ before yud-shva
    },
    "מ": {
        frozenset({_HIRIQ}),  # מִ standard
        frozenset({_TSERE}),  # מֵ before gutturals
    },
    "ש": {
        # The ordinary relative prefix שֶׁ is shin + segol (+ shin dot). The
        # dagesh it triggers sits on the FOLLOWING letter, not on the shin, so
        # the dagesh-free sets must be accepted or the prefix never matches.
        frozenset({_SEGOL}),  # segol only (shin dot absent or ignored)
        frozenset({_SEGOL, _SHIN_DOT}),  # שֶׁ standard
        # Variants in which the shin itself also carries a dagesh
        frozenset({_SEGOL, _DAGESH}),
        frozenset({_SEGOL, _DAGESH, _SHIN_DOT}),
    },
    "ה": {
        frozenset({_PATACH}),  # הַ standard definite article
        frozenset({_QAMATZ}),  # הָ before gutturals
        frozenset({_SEGOL}),  # הֶ before qamatz-bearing gutturals
    },
}
def _is_combining_mark(ch: str) -> bool:
    """Tell whether ``ch`` is a Hebrew combining mark.

    A character qualifies when its code point lies in the vowel-point range
    ``_NIKKUD_LOW``..``_NIKKUD_HIGH`` (inclusive) or when it is the dagesh,
    the shin dot or the sin dot.
    """
    in_vowel_range = _NIKKUD_LOW <= ord(ch) <= _NIKKUD_HIGH
    return in_vowel_range or ch in (_DAGESH, _SHIN_DOT, _SIN_DOT)
def _decompose_first_char(token: str) -> tuple[str, frozenset, str]:
    """Peel the first consonant and its marks off a pointed token.

    Args:
        token: A nikkud Hebrew token string.

    Returns:
        ``(consonant, marks, rest)`` where ``marks`` is the frozenset of
        combining marks directly following the consonant and ``rest`` is
        everything after them. ``("", frozenset(), token)`` is returned when
        the token is empty or does not start with a Hebrew consonant
        (alef–tav, U+05D0–U+05EA).
    """
    head = token[:1]
    if not head or not ("\u05d0" <= head <= "\u05ea"):
        return ("", frozenset(), token)

    # Advance past the uninterrupted run of combining marks after the consonant
    cut = 1
    while cut < len(token) and _is_combining_mark(token[cut]):
        cut += 1
    return (head, frozenset(token[1:cut]), token[cut:])
def _is_valid_prefix(consonant: str, marks: frozenset) -> bool:
    """Decide whether ``consonant`` + ``marks`` is a recognised prefix pointing.

    Args:
        consonant: The prefix consonant character.
        marks: Frozenset of combining mark characters on that consonant.

    Returns:
        True if the mark set is listed for this consonant in
        ``_VALID_PREFIX_MARKS``; False for consonants that have no entry.
    """
    allowed = _VALID_PREFIX_MARKS.get(consonant)
    if not allowed:
        return False
    if marks in allowed:
        return True
    # For ש the shin dot is optional: also accept the set with the dot removed
    return consonant == "ש" and (marks - {_SHIN_DOT}) in allowed
def _rebuild_token(consonant: str, marks: frozenset, rest: str) -> str:
    """Glue consonant, marks and remainder back together.

    The marks are emitted in ascending code-point order, so the result is
    deterministic regardless of the set's iteration order.
    """
    ordered_marks = sorted(marks)
    return "".join([consonant, *ordered_marks, rest])
def _try_strip_prefix(token: str, nikkud_index: dict) -> list[tuple[str, str, str]]:
    """Look a token up in the index after removing one or two prefix letters.

    Args:
        token: A cleaned nikkud word token.
        nikkud_index: Mapping from nikkud form to list of (unique_key, match_type).

    Returns:
        List of (unique_key, match_type, matched_remainder) for each hit found,
        in this order: one-letter remainder, one-letter remainder without its
        leading dagesh, two-letter remainder, two-letter remainder without its
        leading dagesh. Every match_type has ``"_prefix"`` appended. The list
        is empty when the token does not start with a valid prefix.
    """
    hits: list[tuple[str, str, str]] = []

    def _collect(remainder: str) -> None:
        """Record hits for ``remainder`` itself and for its dagesh-less variant."""
        if remainder in nikkud_index:
            hits.extend(
                (key, kind + "_prefix", remainder) for key, kind in nikkud_index[remainder]
            )
        # A prefix can put a dagesh on the next letter (absorbed definite
        # article: לַמֶּלֶךְ → מֶּלֶךְ → מֶלֶךְ), so also try without it.
        lead, lead_marks, tail = _decompose_first_char(remainder)
        if lead and _DAGESH in lead_marks:
            softened = _rebuild_token(lead, lead_marks - {_DAGESH}, tail)
            if softened != remainder and softened in nikkud_index:
                hits.extend(
                    (key, kind + "_prefix", softened) for key, kind in nikkud_index[softened]
                )

    # One-letter prefix: must be validly pointed and leave something behind
    outer, outer_marks, after_one = _decompose_first_char(token)
    if not outer or not _is_valid_prefix(outer, outer_marks) or not after_one:
        return hits
    _collect(after_one)

    # Two-letter prefix (ו and ש commonly stack with another prefix)
    if outer in "וש":
        inner, inner_marks, after_two = _decompose_first_char(after_one)
        if (
            inner
            and inner in _PREFIX_CONSONANTS
            and _is_valid_prefix(inner, inner_marks)
            and after_two
        ):
            _collect(after_two)
    return hits


# Public alias for use by sentence_difficulty module
try_strip_prefix = _try_strip_prefix
def _build_nikkud_index(words: dict) -> dict[str, list[tuple[str, str]]]:
"""Build a mapping from nikkud form to list of (unique_key, match_type).
Indexes the following sources per entry:
- ``word.nikkud`` → "direct"
- conjugation active/passive forms → "conjugated"
- conjugation infinitive and reference_form → "conjugated"
- noun inflection singular/plural/construct/pronominal → "inflected"
Args:
words: The full words.json dict keyed by unique_key.
Returns:
Dict mapping each nikkud form to a list of (unique_key, match_type) tuples.
"""
index: dict[str, list[tuple[str, str]]] = {}
def _add(form: str | None, unique_key: str, match_type: str) -> None:
if form:
index.setdefault(form, []).append((unique_key, match_type))
for unique_key, entry in words.items():
# Direct word form
word = entry.get("word") or {}
_add(word.get("nikkud"), unique_key, "direct")
# Conjugation forms
conj = entry.get("conjugation") or {}
for form_entry in conj.get("active_forms") or []:
form = (form_entry.get("form") or {}).get("nikkud")
_add(form, unique_key, "conjugated")
for form_entry in conj.get("hufal_pual_forms") or []:
form = (form_entry.get("form") or {}).get("nikkud")
_add(form, unique_key, "conjugated")
inf = conj.get("infinitive") or {}
_add(inf.get("nikkud"), unique_key, "conjugated")
ref = conj.get("reference_form") or {}
_add(ref.get("nikkud"), unique_key, "conjugated")
# Noun inflection forms
noun = entry.get("noun_inflection") or {}
for field in ("singular", "plural", "construct_singular", "construct_plural"):
sub = noun.get(field) or {}
form = sub.get("nikkud")
_add(form, unique_key, "inflected")
# Index construct forms without maqaf too — modern text often
# writes smichut as two space-separated words without maqaf
if form and form.endswith("־"):
_add(form[:-1], unique_key, "inflected")
pronominal = noun.get("pronominal_suffixes") or {}
for _person, sub in pronominal.items():
if isinstance(sub, dict):
_add(sub.get("nikkud"), unique_key, "inflected")
return index
def _filter_collision_forms(nikkud_index: dict) -> dict:
    """Drop ambiguous forms for entries that can be matched some other way.

    A form "collides" when it maps to two or more distinct unique_keys. A
    unique_key that also owns at least one non-colliding form is removed from
    every colliding form's list. Keys whose forms *all* collide are kept, so
    they can still be matched. A colliding form left with no keys is omitted.

    Args:
        nikkud_index: Mapping from nikkud form to list of (unique_key, match_type).

    Returns:
        A new index dict with the same structure; the input is not modified.
    """
    # Forms shared by 2+ distinct keys
    colliding: set[str] = {
        form
        for form, entries in nikkud_index.items()
        if len({uk for uk, _ in entries}) >= 2
    }

    # Reverse map: key → every form it is indexed under
    forms_by_key: dict[str, set[str]] = {}
    for form, entries in nikkud_index.items():
        for uk, _ in entries:
            forms_by_key.setdefault(uk, set()).add(form)

    # Keys that own at least one form outside the colliding set
    keys_with_unique_forms: set[str] = {
        uk for uk, forms in forms_by_key.items() if not forms <= colliding
    }

    filtered: dict[str, list[tuple[str, str]]] = {}
    removed = 0
    for form, entries in nikkud_index.items():
        if form not in colliding:
            filtered[form] = entries
            continue
        survivors = [pair for pair in entries if pair[0] not in keys_with_unique_forms]
        removed += len(entries) - len(survivors)
        if survivors:
            filtered[form] = survivors
    logger.info(f" Filtered {removed} collision mappings from entries with unique forms")
    return filtered
# ── Matching ─────────────────────────────────────────────────────
def match_sentences(
    sentences: list[dict],
    nikkud_index: dict,
    confusable_keys: set[str],
) -> dict:
    """Find vocabulary words inside sentences via the nikkud index.

    Each whitespace-separated token is stripped of boundary punctuation and
    looked up directly; only when the cleaned token is absent from the index
    is prefix stripping attempted.

    Args:
        sentences: List of ``{"text": str, "source": str}`` dicts.
        nikkud_index: Output of ``_build_nikkud_index``.
        confusable_keys: Set of unique_keys that are in confusable groups.
            Accepted for interface compatibility; not consulted here.

    Returns:
        Dict mapping unique_key → list of match dicts, each containing:
        ``text``, ``source``, ``match_method``, ``word_count``,
        ``matched_form``, ``char_offset``, ``char_end``. Offsets are character
        positions of the cleaned token within the sentence text.
    """
    matches: dict[str, list[dict]] = {}
    for sentence in sentences:
        text, source = sentence["text"], sentence["source"]
        tokens = text.split()
        word_count = len(tokens)
        # Search position in ``text``; moved past each token once located so
        # that repeated words resolve to successive occurrences.
        cursor = 0
        for token in tokens:
            located = text.find(token, cursor)
            cleaned = _PUNCT.sub("", token)
            if not cleaned:
                # Pure punctuation: nothing to match, just step over it
                if located >= 0:
                    cursor = located + len(token)
                continue

            token_start = located if located >= 0 else cursor
            # Offset of the cleaned core inside the raw token (0 if not found)
            clean_start = token_start + max(token.find(cleaned), 0)
            clean_end = clean_start + len(cleaned)

            if cleaned in nikkud_index:
                # Direct nikkud match
                hits = [(key, kind) for key, kind in nikkud_index[cleaned]]
            else:
                # Prefix stripping — only if no direct match exists
                hits = [
                    (key, kind)
                    for key, kind, _remainder in _try_strip_prefix(cleaned, nikkud_index)
                ]

            for unique_key, match_method in hits:
                matches.setdefault(unique_key, []).append(
                    {
                        "text": text,
                        "source": source,
                        "match_method": match_method,
                        "word_count": word_count,
                        "matched_form": cleaned,
                        "char_offset": clean_start,
                        "char_end": clean_end,
                    }
                )
            cursor = token_start + len(token)
    return matches
# ── Writing results ──────────────────────────────────────────────
def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) -> int:
    """Update words dict entries with matched example sentences.

    Selects up to 3 best sentences per word: direct (non-prefix) matches are
    preferred over prefix matches, and within the chosen pool sentences of
    6–12 words rank first. Also generates a cloze entry for the top match,
    unless the word is in the confusable set (its cloze is removed instead).

    Args:
        words: The full words.json dict, modified in place.
        matches: Output of ``match_sentences``.
        confusable_keys: Set of unique_keys in confusable groups.

    Returns:
        Count of words.json entries that were updated.
    """
    import genanki  # noqa: PLC0415 — import only where needed

    def _is_prefix_match(match: dict) -> bool:
        return "prefix" in match["match_method"]

    def _length_penalty(match: dict) -> int:
        # 0 inside the preferred 6–12 word window, else distance from 9 words
        wc = match["word_count"]
        return 0 if 6 <= wc <= 12 else abs(wc - 9)

    updated = 0
    for unique_key, sent_list in matches.items():
        if unique_key not in words:
            continue
        entry = words[unique_key]

        # One match per sentence text, in first-seen order. If the same
        # sentence matched this word both through a prefixed token and through
        # a direct form, keep the direct one; otherwise an earlier prefix hit
        # would hide it from the direct-over-prefix preference below.
        by_text: dict[str, dict] = {}
        for s in sent_list:
            kept = by_text.get(s["text"])
            if kept is None or (_is_prefix_match(kept) and not _is_prefix_match(s)):
                by_text[s["text"]] = s
        unique = list(by_text.values())

        # Prefer direct matches; only fall back to prefix if none exist
        direct = [s for s in unique if not _is_prefix_match(s)]
        pool = direct if direct else [s for s in unique if _is_prefix_match(s)]
        # sorted() is stable, so ties keep their first-seen order
        best = sorted(pool, key=_length_penalty)[:3]

        # Build vetted list
        if not entry.get("examples"):
            entry["examples"] = {}
        examples: dict = entry["examples"]
        examples["vetted"] = [
            {
                "text": s["text"],
                "source": s["source"],
                "match_method": s["match_method"],
            }
            for s in best
        ]

        # Build cloze from best sentence (skip confusables)
        if unique_key in confusable_keys:
            examples.pop("cloze", None)
        elif best:
            top = best[0]
            # Preserve existing cloze_guid if sentence text unchanged; generate
            # one when there is nothing to preserve (including a stored cloze
            # that lacks a guid, which previously produced a None guid).
            old_cloze = examples.get("cloze") or {}
            cloze_guid = None
            if old_cloze.get("text") == top["text"]:
                cloze_guid = old_cloze.get("cloze_guid")
            if not cloze_guid:
                cloze_guid = genanki.guid_for("cloze", unique_key)
            examples["cloze"] = {
                "text": top["text"],
                "cloze_word_start": top["char_offset"],
                "cloze_word_end": top["char_end"],
                "cloze_hint": None,
                "cloze_guid": cloze_guid,
            }
        examples["rejected_count"] = 0
        updated += 1

    # Deduplicate shared examples across confusable groups
    cleared = _deduplicate_confusable_examples(words)
    if cleared:
        logger.info(f" Cleared shared examples from {cleared} confusable entries")
    return updated
def _deduplicate_confusable_examples(words: dict) -> int:
    """Keep identical example sets on only one member of a confusable group.

    Entries are grouped by their ``confusable_group`` value (order-insensitive).
    Within a group, members whose vetted examples consist of exactly the same
    set of sentence texts compete: the member with the lowest
    ``frequency_rank`` keeps its examples, every other one gets an empty
    ``vetted`` list and loses its ``cloze``.

    Args:
        words: The full words.json dict, modified in place (examples already
            assigned).

    Returns:
        Count of entries whose examples were cleared.
    """
    from collections import defaultdict

    def _rank_then_key(k: str) -> tuple[int, str]:
        # Lower rank = more common = winner; entries without a rank sort last
        # (large sentinel). Ties are broken alphabetically by unique_key.
        rank = words[k].get("frequency_rank")
        return (999999 if rank is None else rank, k)

    # group id (sorted tuple of the confusable_group value) → member keys
    groups: dict[tuple[str, ...], list[str]] = defaultdict(list)
    for key, entry in words.items():
        cg = entry.get("confusable_group")
        if cg:
            groups[tuple(sorted(cg))].append(key)

    cleared = 0
    for members in groups.values():
        if len(members) < 2:
            continue

        # set of vetted sentence texts → members having exactly that set
        sharing: dict[frozenset[str], list[str]] = defaultdict(list)
        for key in members:
            vetted = (words[key].get("examples") or {}).get("vetted") or []
            texts = frozenset(e.get("text", "") for e in vetted)
            if texts:  # entries without examples never compete
                sharing[texts].append(key)

        for keys in sharing.values():
            if len(keys) < 2:
                continue
            winner, *losers = sorted(keys, key=_rank_then_key)
            for loser_key in losers:
                entry = words[loser_key]
                examples = entry.get("examples") or {}
                examples["vetted"] = []
                examples.pop("cloze", None)
                entry["examples"] = examples
                cleared += 1
                logger.debug(f" Cleared examples from {loser_key} (kept on {winner})")
    return cleared
# ── Public API ───────────────────────────────────────────────────
def run(words: dict) -> dict:
    """Extract EPUB sentences, match against words, update words dict in place.

    Called from run.py with the already-loaded words.json dict.

    Args:
        words: The full words.json dict keyed by unique_key. Modified in place.

    Returns:
        Summary stats dict with keys ``books`` (display name → sentence
        count), ``matched`` (number of vocabulary keys with at least one
        match) and ``total_vocab``.
    """
    logger.info(" Extracting sentences from EPUBs ...")
    all_sentences: list[dict] = []
    book_counts: dict[str, int] = {}
    for filepath, book_name in _discover_epubs().items():
        path = Path(filepath)
        if path.suffix == ".txt":
            sentences = extract_sentences_from_text(path, book_name)
        else:
            sentences = extract_sentences_from_epub(path, book_name)
        # Several files can resolve to the same display name; add to the
        # running total instead of overwriting the earlier file's count.
        book_counts[book_name] = book_counts.get(book_name, 0) + len(sentences)
        all_sentences.extend(sentences)
        logger.info(f" {book_name}: {len(sentences)} sentences")

    if not all_sentences:
        logger.warning(" No EPUB files found — skipping example extraction")
        return {"books": {}, "matched": 0, "total_vocab": len(words)}
    logger.info(f" Total sentences: {len(all_sentences)}")

    # Build nikkud index
    logger.info(" Building nikkud index from words.json ...")
    nikkud_index = _build_nikkud_index(words)
    logger.info(f" {len(nikkud_index)} unique nikkud forms indexed")
    # Filter out collision forms for entries that have unique forms
    nikkud_index = _filter_collision_forms(nikkud_index)

    # Keys of all entries that belong to a confusable group
    confusable_keys: set[str] = {
        key for key, entry in words.items() if entry.get("confusable_group")
    }

    # Match sentences
    logger.info(" Matching sentences against vocab ...")
    matches = match_sentences(all_sentences, nikkud_index, confusable_keys)
    logger.info(f" {len(matches)} words matched")

    # Break down by match method
    method_counts: dict[str, int] = {}
    for sent_list in matches.values():
        for s in sent_list:
            method = s["match_method"]
            method_counts[method] = method_counts.get(method, 0) + 1
    for method, count in sorted(method_counts.items()):
        logger.info(f" {method}: {count} sentence-word pairs")

    # Update words dict in place
    updated = update_words_json(words, matches, confusable_keys)
    logger.info(f" Updated {updated} entries in words.json")
    return {
        "books": book_counts,
        "matched": len(matches),
        "total_vocab": len(words),
    }
# ── Standalone entry point ───────────────────────────────────────
if __name__ == "__main__":
    # Standalone run: load words.json, add examples, write it back, report coverage.
    import json

    logging.basicConfig(level=logging.INFO, format="%(message)s")
    with open(WORDS_JSON, encoding="utf-8") as f:
        words = json.load(f)
    stats = run(words)
    # Serialize completely BEFORE opening the file for writing: opening with
    # "w" truncates words.json, so a serialization error must not be able to
    # happen after that point.
    payload = json.dumps(words, ensure_ascii=False, indent=2)
    with open(WORDS_JSON, "w", encoding="utf-8") as f:
        f.write(payload)
    # Share of vocabulary entries that received at least one match
    coverage = stats["matched"] * 100 / stats["total_vocab"] if stats["total_vocab"] else 0
    logger.info(f" Coverage: {stats['matched']}/{stats['total_vocab']} ({coverage:.1f}%)")