- Cloze card pipeline: 924 cards from 2,296 AI-vetted Hebrew book sentences - Plurals deck: 375 notes (144 irregular + 231 regular from 86 mishkal patterns) - Ktiv male forms expanded to 20,711 entries for sentence matching - Project reorg: helpers.py (deduped strip_nikkud from 10 files), scripts/ for one-off tools, tests/ with smoke tests, deleted 3 dead files - Lint tooling: pyproject.toml with ruff/vulture/bandit/pytest config, .editorconfig, fixed all 129 ruff errors (B023 closure fix, SIM103, unused vars) - validate_apkg.py: card count range check for optional cloze template - Data caches committed: vetted_sentences, ktiv_male_forms, noun_plurals, noun_slug_map, vocab_sentence_matches, epub_sentence_index Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
183 lines
6.1 KiB
Python
183 lines
6.1 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Rebuild vocab_sentence_matches.json using both direct word matching
|
|
and ktiv male conjugated/declined form matching.
|
|
|
|
This dramatically improves sentence coverage by matching not just
|
|
dictionary forms but all conjugated verbs and declined nouns.
|
|
"""
|
|
|
|
import json
|
|
import logging
|
|
import re
|
|
from pathlib import Path
|
|
|
|
import pandas as pd
|
|
|
|
from helpers import strip_nikkud as _strip_nikkud
|
|
|
|
# Configure logging once at import time: INFO and above, each record
# prefixed with its timestamp and level name.
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Directory with the script's input and output data files; it sits next to
# this file rather than being resolved against the current working directory.
DATA_DIR = Path(__file__).parent.joinpath("data")
|
|
|
|
|
|
# Punctuation deleted from tokens before lookup (includes the Hebrew maqaf, U+05BE).
_TOKEN_PUNCT_RE = re.compile(r'[.,!?;:"\'\u05be]')

# Sentences outside this token-count range are never used as examples.
_MIN_SENTENCE_WORDS = 4
_MAX_SENTENCE_WORDS = 15

# Sentences inside the ideal range all rank equally; others rank by distance
# from the centre of that range.
_IDEAL_MIN_WORDS = 6
_IDEAL_MAX_WORDS = 12
_IDEAL_CENTER_WORDS = 9

_MAX_SENTENCES_PER_WORD = 3

# String forms that a missing CSV cell takes after ``str()`` conversion.
_MISSING_CELL = ("nan", "None")


def _read_json(path: Path):
    """Parse the JSON file at *path*, always decoding it as UTF-8."""
    # Explicit encoding: the data is Hebrew text and must not depend on the
    # platform's locale encoding.
    with open(path, encoding="utf-8") as f:
        return json.load(f)


def _read_vocab(csv_path: Path) -> pd.DataFrame:
    """Read the vocab CSV, trying ';' as the separator before falling back to ','."""
    try:
        df = pd.read_csv(csv_path, sep=";", index_col=0)
    except (ValueError, pd.errors.ParserError):
        df = None
    # Fewer than 3 columns after a ';' parse is treated as "wrong separator".
    if df is None or df.shape[1] < 3:
        df = pd.read_csv(csv_path, index_col=0)
    return df


def _build_word_lookup(df: pd.DataFrame) -> dict[str, list[tuple[str, str]]]:
    """Map each nikkud-stripped vocab word to its (word, word-without-nikkud) pairs."""
    lookup: dict[str, list[tuple[str, str]]] = {}
    for _, row in df.iterrows():
        word = str(row.get("Word", "")).strip()
        if not word or word in _MISSING_CELL:
            continue
        stripped = _strip_nikkud(word)
        if not stripped:
            continue
        wni = str(row.get("Word Without Nikkud", "")).strip()
        if not wni or wni in _MISSING_CELL:
            # A missing cell must not become the literal output key "nan";
            # use the same default the output stage uses for unknown words.
            wni = stripped
        lookup.setdefault(stripped, []).append((word, wni))
    return lookup


def _build_form_index(
    ktiv_forms: dict[str, list[dict]],
    word_lookup: dict[str, list[tuple[str, str]]],
) -> dict[str, set[str]]:
    """Map every matchable surface form to the dictionary words (with nikkud) it belongs to."""
    index: dict[str, set[str]] = {}
    for form, entries in ktiv_forms.items():
        for entry in entries:
            word_nikkud = entry.get("word_nikkud", "")
            if word_nikkud:
                index.setdefault(form, set()).add(word_nikkud)
    # Each vocab word is also matchable through its own nikkud-stripped form.
    for stripped, entries in word_lookup.items():
        for word_nikkud, _ in entries:
            index.setdefault(stripped, set()).add(word_nikkud)
    return index


def _tokenize(sentence: dict) -> list[str]:
    """Split a sentence record into punctuation-free, non-empty tokens."""
    if "stripped" in sentence:
        stripped = sentence["stripped"]
    else:
        stripped = _strip_nikkud(sentence.get("text", ""))
    cleaned = (_TOKEN_PUNCT_RE.sub("", tok) for tok in stripped.split())
    return [tok for tok in cleaned if tok]


def _collect_matches(
    sentences: list[dict], form_index: dict[str, set[str]]
) -> dict[str, list[dict]]:
    """Collect, per dictionary word, every usable sentence containing one of its forms."""
    matches: dict[str, list[dict]] = {}
    for sent in sentences:
        tokens = _tokenize(sent)
        word_count = len(tokens)
        if not _MIN_SENTENCE_WORDS <= word_count <= _MAX_SENTENCE_WORDS:
            continue
        text = sent.get("text", "")
        book = sent.get("book", "")
        for tok in tokens:
            # sorted(): set iteration order varies between runs, which would
            # make the output order (and the winner among homographs that
            # share an output key) non-reproducible.
            for word_nikkud in sorted(form_index.get(tok, ())):
                matches.setdefault(word_nikkud, []).append(
                    {
                        "text": text,
                        "book": book,
                        "matched_form": tok,
                        "word_count": word_count,
                    }
                )
    return matches


def _length_penalty(word_count: int) -> int:
    """Return 0 for ideally sized sentences, else the distance from the ideal centre."""
    if _IDEAL_MIN_WORDS <= word_count <= _IDEAL_MAX_WORDS:
        return 0
    return abs(word_count - _IDEAL_CENTER_WORDS)


def _pick_best(candidates: list[dict]) -> list[dict]:
    """Drop duplicate sentence texts and keep the best-sized few, in stable order."""
    first_by_text: dict[str, dict] = {}
    for cand in candidates:
        # setdefault keeps the first occurrence of each text.
        first_by_text.setdefault(cand["text"], cand)
    ranked = sorted(
        first_by_text.values(), key=lambda c: _length_penalty(c["word_count"])
    )
    return ranked[:_MAX_SENTENCES_PER_WORD]


def _lookup_wni(word_nikkud: str, word_lookup: dict[str, list[tuple[str, str]]]) -> str:
    """Return the vocab "Word Without Nikkud" for a word, or its stripped form if unknown."""
    stripped = _strip_nikkud(word_nikkud)
    for candidate, wni in word_lookup.get(stripped, ()):
        if candidate == word_nikkud:
            return wni
    return stripped


def main():
    """Rebuild ``vocab_sentence_matches.json`` from the sentence index and vocab list.

    Reads ``epub_sentence_index.json``, ``hebrew_dict_for_anki.csv`` and, when
    present, ``ktiv_male_forms.json`` from ``DATA_DIR``. Sentence tokens are
    matched against the dictionary forms and the ktiv male forms, and up to
    three sentences per word are written to ``vocab_sentence_matches.json``.
    Coverage and match-type statistics are logged.

    The output is keyed by the word without nikkud, so dictionary words that
    share that spelling overwrite each other; the last one processed wins.

    Raises:
        FileNotFoundError: if the sentence index or the vocab CSV is missing.
    """
    # Load sentences
    sentences = _read_json(DATA_DIR / "epub_sentence_index.json").get("sentences", [])
    logger.info(f"Loaded {len(sentences)} sentences")

    # Load vocab CSV
    df = _read_vocab(DATA_DIR / "hebrew_dict_for_anki.csv")
    logger.info(f"Loaded {len(df)} vocab entries")
    word_lookup = _build_word_lookup(df)

    # Load ktiv male forms (optional input)
    ktiv_path = DATA_DIR / "ktiv_male_forms.json"
    ktiv_forms: dict[str, list[dict]] = {}
    if ktiv_path.exists():
        ktiv_forms = _read_json(ktiv_path)
        logger.info(f"Loaded {len(ktiv_forms)} ktiv male forms")
    else:
        logger.warning("No ktiv_male_forms.json — only using direct matching")

    form_index = _build_form_index(ktiv_forms, word_lookup)
    logger.info(f"Total matchable forms: {len(form_index)}")

    matches = _collect_matches(sentences, form_index)
    logger.info(f"Words with at least 1 match: {len(matches)}")

    # Select the sentences to keep per word. The matched surface forms are
    # kept in a side table, not in the output, so the JSON schema is unchanged
    # while the statistics below can still tell direct from inflected matches.
    output: dict[str, dict] = {}
    kept_forms: dict[str, set[str]] = {}
    for word_nikkud, candidates in matches.items():
        best = _pick_best(candidates)
        wni = _lookup_wni(word_nikkud, word_lookup)
        output[wni] = {
            "word_nikkud": word_nikkud,
            "sentences": [{"text": s["text"], "book": s["book"]} for s in best],
        }
        kept_forms[wni] = {s["matched_form"] for s in best}

    # Save (UTF-8 explicitly, because ensure_ascii=False emits raw Hebrew)
    out_path = DATA_DIR / "vocab_sentence_matches.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=1)

    total_sents = sum(len(v["sentences"]) for v in output.values())
    logger.info(f"Saved {len(output)} words with {total_sents} sentences → {out_path}")

    # Stats (guard against an empty vocab file)
    total_vocab = len(df)
    pct = len(output) * 100 / total_vocab if total_vocab else 0.0
    logger.info(f"Coverage: {len(output)}/{total_vocab} ({pct:.1f}%)")

    # Breakdown by match type, based on the forms behind the kept sentences:
    # "direct" means the sentence token equals the nikkud-stripped dictionary
    # word, "ktiv male" means any other form matched.
    direct_only = 0
    ktiv_only = 0
    both = 0
    for wni, info in output.items():
        stripped = _strip_nikkud(info["word_nikkud"])
        forms = kept_forms[wni]
        has_direct = stripped in forms
        has_ktiv = any(form != stripped for form in forms)
        if has_direct and has_ktiv:
            both += 1
        elif has_ktiv:
            ktiv_only += 1
        else:
            direct_only += 1

    logger.info(f" Direct matches only: {direct_only}")
    logger.info(f" Ktiv male matches only: {ktiv_only}")
    logger.info(f" Both: {both}")
|
|
|
|
|
|
# Script entry point: run the rebuild only when this file is executed
# directly, not when it is imported.
if __name__ == "__main__":
    main()
|