hebrew_flash_cards/rebuild_sentence_matches.py
Sochen b8b65442cb restore epub_examples.py and rebuild_sentence_matches.py
Accidentally removed in 6c2a0f8 — these are the EPUB sentence
extraction and matching scripts used to build vetted_sentences.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 04:33:32 +00:00

183 lines
6.1 KiB
Python

#!/usr/bin/env python3
"""
Rebuild vocab_sentence_matches.json using both direct word matching
and ktiv male conjugated/declined form matching.
This dramatically improves sentence coverage by matching not just
dictionary forms but all conjugated verbs and declined nouns.
"""
import json
import logging
import re
from pathlib import Path
import pandas as pd
from helpers import strip_nikkud as _strip_nikkud
# Configure logging once for the script: INFO and above, with each record
# prefixed by its timestamp and level name.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# All input and output files live in the "data" directory next to this script.
DATA_DIR = Path(__file__).parent.joinpath("data")
# Punctuation deleted from tokens before lookup: ASCII marks plus U+05BE
# (Hebrew maqaf). NOTE(review): the maqaf is deleted rather than split on, so a
# maqaf-joined compound becomes a single token — confirm this is intended.
_PUNCT_RE = re.compile(r'[.,!?;:"\'\u05be]')

# String forms a missing CSV cell takes after str(...).strip().
_MISSING_CELLS = ("", "nan", "None")

# Only sentences whose token count lies in this inclusive range are matched.
_MIN_WORDS = 4
_MAX_WORDS = 15

# Number of example sentences kept per vocabulary word.
_MAX_SENTENCES_PER_WORD = 3


def _load_vocab(csv_path: Path) -> pd.DataFrame:
    """Read the vocab CSV, trying ';' as separator first and ',' as fallback.

    A ';' parse that yields fewer than three columns is treated as a wrong
    separator guess and triggers the fallback.
    """
    try:
        df = pd.read_csv(csv_path, sep=";", index_col=0)
        if df.shape[1] < 3:
            raise ValueError
    except (ValueError, pd.errors.ParserError):
        df = pd.read_csv(csv_path, index_col=0)
    return df


def _build_word_lookup(df: pd.DataFrame) -> dict[str, list[tuple[str, str]]]:
    """Map each stripped dictionary form to its (word_nikkud, word_no_nikkud) pairs.

    Rows with a missing "Word" are skipped. A missing "Word Without Nikkud"
    falls back to the stripped form, so the placeholder string "nan" never
    ends up as an output key shared by unrelated words.
    """
    word_lookup: dict[str, list[tuple[str, str]]] = {}
    for _, row in df.iterrows():
        word = str(row.get("Word", "")).strip()
        if word in _MISSING_CELLS:
            continue
        stripped = _strip_nikkud(word)
        if not stripped:
            continue
        wni = str(row.get("Word Without Nikkud", "")).strip()
        if wni in _MISSING_CELLS:
            wni = stripped
        word_lookup.setdefault(stripped, []).append((word, wni))
    return word_lookup


def _load_ktiv_forms(ktiv_path: Path) -> dict[str, list[dict]]:
    """Load the ktiv male form table, or return {} (with a warning) if absent."""
    if not ktiv_path.exists():
        logger.warning("No ktiv_male_forms.json — only using direct matching")
        return {}
    with open(ktiv_path, encoding="utf-8") as f:
        ktiv_forms = json.load(f)
    logger.info(f"Loaded {len(ktiv_forms)} ktiv male forms")
    return ktiv_forms


def _build_form_index(
    ktiv_forms: dict[str, list[dict]],
    word_lookup: dict[str, list[tuple[str, str]]],
) -> dict[str, set[str]]:
    """Map every matchable surface form to the dictionary words (nikkud) it stands for.

    Combines the ktiv male forms with each vocab word's own stripped form.
    """
    form_to_words: dict[str, set[str]] = {}
    for form, entries in ktiv_forms.items():
        for entry in entries:
            word_nikkud = entry.get("word_nikkud", "")
            if word_nikkud:
                form_to_words.setdefault(form, set()).add(word_nikkud)
    for stripped, entries in word_lookup.items():
        for word_nikkud, _ in entries:
            form_to_words.setdefault(stripped, set()).add(word_nikkud)
    return form_to_words


def _tokenize(sentence: dict) -> list[str]:
    """Split a sentence record into punctuation-free, non-empty tokens.

    Uses the record's precomputed "stripped" text when present; otherwise the
    "text" field is passed through ``_strip_nikkud``.
    """
    stripped = sentence.get("stripped")
    if stripped is None:
        stripped = _strip_nikkud(sentence.get("text", ""))
    tokens = (_PUNCT_RE.sub("", t) for t in stripped.split())
    return [t for t in tokens if t]


def _collect_matches(
    sentences: list[dict],
    form_to_words: dict[str, set[str]],
) -> dict[str, list[dict]]:
    """Return word_nikkud → candidate sentence records for every token hit.

    Sentences with fewer than ``_MIN_WORDS`` or more than ``_MAX_WORDS``
    tokens are ignored.
    """
    matches: dict[str, list[dict]] = {}
    for sent in sentences:
        tokens = _tokenize(sent)
        word_count = len(tokens)
        if not _MIN_WORDS <= word_count <= _MAX_WORDS:
            continue
        text = sent.get("text", "")
        book = sent.get("book", "")
        for tok in tokens:
            for word_nikkud in form_to_words.get(tok, ()):
                matches.setdefault(word_nikkud, []).append(
                    {
                        "text": text,
                        "book": book,
                        "matched_form": tok,
                        "word_count": word_count,
                    }
                )
    return matches


def _sentence_score(candidate: dict) -> int:
    """Rank a candidate: 0 for 6-12 words, otherwise its distance from 9 words."""
    wc = candidate["word_count"]
    if 6 <= wc <= 12:
        return 0
    return abs(wc - 9)


def _best_sentences(candidates: list[dict]) -> list[dict]:
    """Drop duplicate texts (first occurrence wins) and keep the top-scoring few."""
    seen_texts: set[str] = set()
    unique = []
    for cand in candidates:
        if cand["text"] not in seen_texts:
            seen_texts.add(cand["text"])
            unique.append(cand)
    # list.sort is stable, so equally scored sentences keep corpus order.
    unique.sort(key=_sentence_score)
    return unique[:_MAX_SENTENCES_PER_WORD]


def main():
    """Rebuild ``vocab_sentence_matches.json`` from the sentence index and vocab.

    Reads ``epub_sentence_index.json``, ``hebrew_dict_for_anki.csv`` and, if it
    exists, ``ktiv_male_forms.json`` from ``DATA_DIR``. Matches sentence tokens
    against dictionary forms and ktiv male forms, keeps up to three sentences
    per word, writes the result to ``DATA_DIR`` and logs coverage statistics.

    All files are opened as UTF-8 explicitly: the data is Hebrew text and the
    output is written with ``ensure_ascii=False``, so relying on the locale's
    default encoding can corrupt or fail on non-UTF-8 systems.
    """
    # Load sentences
    with open(DATA_DIR / "epub_sentence_index.json", encoding="utf-8") as f:
        sentences = json.load(f).get("sentences", [])
    logger.info(f"Loaded {len(sentences)} sentences")

    # Load vocab CSV and index it by stripped form
    df = _load_vocab(DATA_DIR / "hebrew_dict_for_anki.csv")
    logger.info(f"Loaded {len(df)} vocab entries")
    word_lookup = _build_word_lookup(df)

    # Combine ktiv male forms with the plain dictionary forms
    ktiv_forms = _load_ktiv_forms(DATA_DIR / "ktiv_male_forms.json")
    form_to_words = _build_form_index(ktiv_forms, word_lookup)
    logger.info(f"Total matchable forms: {len(form_to_words)}")

    # Match every sentence token against the form index
    matches = _collect_matches(sentences, form_to_words)
    logger.info(f"Words with at least 1 match: {len(matches)}")

    # Deduplicate and limit to the best sentences per word
    output: dict[str, dict] = {}
    # Matched forms of the kept sentences, per output key. Held separately so
    # the saved JSON keeps its {"text", "book"} shape while the breakdown
    # below can still see how each sentence was matched.
    kept_forms: dict[str, set[str]] = {}
    for word_nikkud, candidates in matches.items():
        best = _best_sentences(candidates)
        # Find the Word Without Nikkud for this word; default to stripped form
        stripped = _strip_nikkud(word_nikkud)
        wni = next(
            (
                w_wni
                for wn, w_wni in word_lookup.get(stripped, ())
                if wn == word_nikkud
            ),
            stripped,
        )
        output[wni] = {
            "word_nikkud": word_nikkud,
            "sentences": [{"text": s["text"], "book": s["book"]} for s in best],
        }
        kept_forms[wni] = {s["matched_form"] for s in best}

    # Save
    out_path = DATA_DIR / "vocab_sentence_matches.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=1)
    total_sents = sum(len(v["sentences"]) for v in output.values())
    logger.info(f"Saved {len(output)} words with {total_sents} sentences → {out_path}")

    # Stats (guard against an empty vocab file)
    total_vocab = len(df)
    pct = len(output) * 100 / total_vocab if total_vocab else 0.0
    logger.info(f"Coverage: {len(output)}/{total_vocab} ({pct:.1f}%)")

    # Breakdown by match type, based on the sentences actually kept:
    # "direct" = the token equals the word's stripped dictionary form,
    # "ktiv"   = the token is some other form mapped to the word.
    direct_only = 0
    ktiv_only = 0
    both = 0
    for wni, info in output.items():
        stripped = _strip_nikkud(info["word_nikkud"])
        forms = kept_forms[wni]
        has_direct = stripped in forms
        has_ktiv = any(form != stripped for form in forms)
        if has_direct and has_ktiv:
            both += 1
        elif has_ktiv:
            ktiv_only += 1
        else:
            direct_only += 1
    logger.info(f" Direct matches only: {direct_only}")
    logger.info(f" Ktiv male matches only: {ktiv_only}")
    logger.info(f" Both: {both}")


if __name__ == "__main__":
    main()