# hebrew_flash_cards/rebuild_sentence_matches.py

#!/usr/bin/env python3
"""
Rebuild vocab_sentence_matches.json using both direct word matching
and ktiv male conjugated/declined form matching.

This dramatically improves sentence coverage by matching not just
dictionary forms but all conjugated verbs and declined nouns.
"""

import json
import logging
import re
from pathlib import Path

import pandas as pd

from helpers import strip_nikkud as _strip_nikkud

# Configure logging at import time: INFO level with timestamp and level name.
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
logger = logging.getLogger(__name__)

# The "data" directory next to this script; all input and output files live here.
DATA_DIR = Path(__file__).parent / "data"


def _length_penalty(sentence: dict) -> int:
    """Return the sort key used to rank a candidate sentence by its length.

    Sentences of 6-12 tokens are considered ideal and score 0; any other
    length scores its distance from 9 tokens. Lower is better.
    """
    word_count = sentence["word_count"]
    if 6 <= word_count <= 12:
        return 0
    return abs(word_count - 9)


def main() -> None:
    """Rebuild ``vocab_sentence_matches.json`` in ``DATA_DIR``.

    Reads ``epub_sentence_index.json``, ``hebrew_dict_for_anki.csv`` and, when
    present, ``ktiv_male_forms.json``. Every token of every 4-15 token
    sentence is looked up among the nikkud-stripped vocabulary headwords and
    the ktiv male forms; for each matched vocabulary word up to three
    sentences are kept, preferring lengths of 6-12 tokens. The result is
    written as ``{word_without_nikkud: {"word_nikkud": ..., "sentences":
    [{"text": ..., "book": ...}]}}`` and coverage statistics are logged.

    Raises:
        FileNotFoundError: if the sentence index or the vocab CSV is missing.
    """
    # All JSON files here hold Hebrew text, so the encoding is pinned to
    # UTF-8 instead of relying on the platform's locale default.
    with open(DATA_DIR / "epub_sentence_index.json", encoding="utf-8") as f:
        sentences = json.load(f).get("sentences", [])
    logger.info("Loaded %d sentences", len(sentences))

    # Load vocab CSV: try ';' as the separator first and fall back to the
    # default ',' when that fails to parse or yields fewer than 3 columns.
    csv_path = DATA_DIR / "hebrew_dict_for_anki.csv"
    try:
        df = pd.read_csv(csv_path, sep=";", index_col=0)
        if df.shape[1] < 3:
            raise ValueError("too few columns for a ';'-separated file")
    except (ValueError, pd.errors.ParserError):
        df = pd.read_csv(csv_path, index_col=0)
    logger.info("Loaded %d vocab entries", len(df))

    # Build word lookup: stripped_form → [(word_nikkud, word_no_nikkud)]
    missing_markers = ("nan", "None")  # how empty CSV cells look after str()
    word_lookup: dict[str, list[tuple[str, str]]] = {}
    for _, row in df.iterrows():
        word = str(row.get("Word", "")).strip()
        if not word or word in missing_markers:
            continue
        stripped = _strip_nikkud(word)
        if not stripped:
            continue
        wni = str(row.get("Word Without Nikkud", "")).strip()
        if not wni or wni in missing_markers:
            # An empty cell must not become the literal output key "nan".
            wni = stripped
        word_lookup.setdefault(stripped, []).append((word, wni))

    # Load ktiv male forms: ktiv_male_form → [{word_nikkud, form_type, ...}]
    ktiv_path = DATA_DIR / "ktiv_male_forms.json"
    ktiv_forms: dict[str, list[dict]] = {}
    if ktiv_path.exists():
        with open(ktiv_path, encoding="utf-8") as f:
            ktiv_forms = json.load(f)
        logger.info("Loaded %d ktiv male forms", len(ktiv_forms))
    else:
        logger.warning("No ktiv_male_forms.json — only using direct matching")

    # Build reverse lookup: matchable token → set of dictionary words (nikkud)
    ktiv_to_word: dict[str, set[str]] = {}
    for ktiv, entries in ktiv_forms.items():
        for entry in entries:
            word_nikkud = entry.get("word_nikkud", "")
            if word_nikkud:
                ktiv_to_word.setdefault(ktiv, set()).add(word_nikkud)

    # Also add all vocab words' own stripped forms so that the dictionary
    # form itself is matchable.
    for stripped, entries in word_lookup.items():
        for word_nikkud, _ in entries:
            ktiv_to_word.setdefault(stripped, set()).add(word_nikkud)

    logger.info("Total matchable forms: %d", len(ktiv_to_word))

    # Tokenize each sentence and match its tokens against ktiv_to_word.
    # Punctuation (including the maqaf, U+05BE) is deleted, not split on.
    punctuation = re.compile(r'[.,!?;:"\'\u05be]')
    matches: dict[str, list[dict]] = {}  # word_nikkud → [sentence_info]
    for sent in sentences:
        if "stripped" in sent:
            stripped = sent["stripped"]
        else:
            stripped = _strip_nikkud(sent.get("text", ""))
        tokens = [punctuation.sub("", raw) for raw in stripped.split()]
        tokens = [tok for tok in tokens if tok]  # drop tokens that were only punctuation
        word_len = len(tokens)

        # Skip sentences that are too short or too long
        if word_len < 4 or word_len > 15:
            continue

        text = sent.get("text", "")
        book = sent.get("book", "")
        for tok in tokens:
            for word_nikkud in ktiv_to_word.get(tok, ()):
                matches.setdefault(word_nikkud, []).append(
                    {
                        "text": text,
                        "book": book,
                        "matched_form": tok,
                        "word_count": word_len,
                    }
                )

    logger.info("Words with at least 1 match: %d", len(matches))

    # Deduplicate and limit to 3 best sentences per word.
    output: dict[str, dict] = {}
    # Matched forms of the kept sentences, keyed like `output`. Kept outside
    # the JSON payload so the file schema stays {"text", "book"} only.
    selected_forms: dict[str, list[str]] = {}
    for word_nikkud, sents in matches.items():
        # Deduplicate by text, keeping the first occurrence of each sentence.
        unique: dict[str, dict] = {}
        for s in sents:
            unique.setdefault(s["text"], s)

        # Stable sort: ties keep their original (corpus) order.
        best = sorted(unique.values(), key=_length_penalty)[:3]

        # Find the Word Without Nikkud for this word; default to the
        # stripped form when the word is not a CSV headword.
        stripped = _strip_nikkud(word_nikkud)
        wni = next(
            (w_wni for wn, w_wni in word_lookup.get(stripped, ()) if wn == word_nikkud),
            stripped,
        )

        output[wni] = {
            "word_nikkud": word_nikkud,
            "sentences": [{"text": s["text"], "book": s["book"]} for s in best],
        }
        selected_forms[wni] = [s["matched_form"] for s in best]

    # Save
    out_path = DATA_DIR / "vocab_sentence_matches.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=1)

    total_sents = sum(len(v["sentences"]) for v in output.values())
    logger.info("Saved %d words with %d sentences → %s", len(output), total_sents, out_path)

    # Stats
    total_vocab = len(df)
    pct = len(output) * 100 / total_vocab if total_vocab else 0.0
    logger.info("Coverage: %d/%d (%.1f%%)", len(output), total_vocab, pct)

    # Breakdown by match type, based on the sentences actually kept:
    # "direct" = the token equals the stripped dictionary form,
    # "ktiv"   = the token is some other (conjugated/declined) form.
    direct_only = 0
    ktiv_only = 0
    both = 0
    for wni, info in output.items():
        stripped = _strip_nikkud(info["word_nikkud"])
        forms = selected_forms[wni]
        has_direct = stripped in forms
        has_ktiv = any(form != stripped for form in forms)
        if has_direct and has_ktiv:
            both += 1
        elif has_ktiv:
            ktiv_only += 1
        else:
            direct_only += 1

    logger.info("  Direct matches only: %d", direct_only)
    logger.info("  Ktiv male matches only: %d", ktiv_only)
    logger.info("  Both: %d", both)


# Run the rebuild only when executed as a script, not when imported.
if __name__ == "__main__":
    main()