hebrew_flash_cards/scripts/extract_pdf_sentences.py

#!/usr/bin/env python3
"""
Extract sentences from PDF books and match vocab words to sentences.

1. Extract sentences from alice.pdf and lion_strawberry.pdf
2. Merge into existing epub_sentence_index.json
3. Match vocab words to sentences, produce vocab_sentence_matches.json
"""

import json
import os
import re
import sys

# Use the venv with pymupdf
sys.path.insert(0, "/home/node/projects/pealim/venv_pdf/lib/python3.11/site-packages")
# Also need the main venv for pandas
sys.path.insert(0, "/home/node/projects/pealim/lib/python3.11/site-packages")

import fitz
import pandas as pd

# Absolute project root; every data path below is derived from it.
BASE_DIR = "/home/node/projects/pealim"
DATA_DIR = os.path.join(BASE_DIR, "data")
# Directory holding the source books; main() also reads the PDFs from here.
EPUBS_DIR = os.path.join(DATA_DIR, "epubs")
# JSON sentence index that new PDF sentences are merged into.
SENTENCE_INDEX = os.path.join(DATA_DIR, "epub_sentence_index.json")
# Vocabulary list; main() reads it as a ';'-separated CSV.
VOCAB_CSV = os.path.join(DATA_DIR, "hebrew_dict_for_anki.csv")
# Output: vocab word -> example sentences.
MATCHES_FILE = os.path.join(DATA_DIR, "vocab_sentence_matches.json")

# Hebrew points and cantillation marks, U+0591-U+05C7.
# NOTE(review): this range also covers maqaf (U+05BE) and sof pasuq (U+05C3),
# so stripping with it joins maqaf-hyphenated words - confirm this is intended.
NIKKUD_RE = re.compile(r"[\u0591-\u05C7]")
# A single base Hebrew letter, alef through tav (final forms included).
HEBREW_RE = re.compile(r"[\u05d0-\u05ea]")
# Base letters plus the U+FB20-U+FB4F presentation forms.
# Not referenced anywhere in the code visible in this file.
HEBREW_CHAR_RE = re.compile(r"[\u05d0-\u05ea\ufb20-\ufb4f]")


def strip_nikkud(text):
    """Return *text* with every character matched by ``NIKKUD_RE`` deleted.

    ``NIKKUD_RE`` covers U+0591-U+05C7, i.e. the Hebrew vowel points and
    cantillation marks; all other characters are left untouched.
    """
    without_marks = re.sub(NIKKUD_RE, "", text)
    return without_marks


# Hebrew presentation forms (U+FB1D-U+FB4F) mapped to the plain letter(s) they
# are built on.  Final kaf/pe with dagesh (U+FB3A, U+FB43) must map to the
# *final* letters, because final forms are what marks a word boundary below.
_PRESENTATION_FORM_TABLE = str.maketrans(
    {
        "\ufb1d": "י",  # yod with hiriq
        "\ufb20": "ע",  # alternative ayin
        "\ufb21": "א",  # wide alef
        "\ufb22": "ד",  # wide dalet
        "\ufb23": "ה",  # wide he
        "\ufb24": "כ",  # wide kaf
        "\ufb25": "ל",  # wide lamed
        "\ufb26": "ם",  # wide final mem
        "\ufb27": "ר",  # wide resh
        "\ufb28": "ת",  # wide tav
        "\ufb2a": "ש",  # shin with shin dot
        "\ufb2b": "ש",  # shin with sin dot
        "\ufb2c": "ש",  # shin with dagesh and shin dot
        "\ufb2d": "ש",  # shin with dagesh and sin dot
        "\ufb2e": "א",  # alef with patah
        "\ufb2f": "א",  # alef with qamats
        "\ufb30": "א",  # alef with mapiq
        "\ufb31": "ב",  # bet with dagesh
        "\ufb32": "ג",  # gimel with dagesh
        "\ufb33": "ד",  # dalet with dagesh
        "\ufb34": "ה",  # he with mapiq
        "\ufb35": "ו",  # vav with dagesh
        "\ufb36": "ז",  # zayin with dagesh
        "\ufb38": "ט",  # tet with dagesh
        "\ufb39": "י",  # yod with dagesh
        "\ufb3a": "ך",  # FINAL kaf with dagesh
        "\ufb3b": "כ",  # kaf with dagesh
        "\ufb3c": "ל",  # lamed with dagesh
        "\ufb3e": "מ",  # mem with dagesh
        "\ufb40": "נ",  # nun with dagesh
        "\ufb41": "ס",  # samekh with dagesh
        "\ufb43": "ף",  # FINAL pe with dagesh
        "\ufb44": "פ",  # pe with dagesh
        "\ufb46": "צ",  # tsadi with dagesh
        "\ufb47": "ק",  # qof with dagesh
        "\ufb48": "ר",  # resh with dagesh
        "\ufb49": "ש",  # shin with dagesh
        "\ufb4a": "ת",  # tav with dagesh
        "\ufb4b": "ו",  # vav with holam
        "\ufb4c": "ב",  # bet with rafe
        "\ufb4d": "כ",  # kaf with rafe
        "\ufb4e": "פ",  # pe with rafe
        "\ufb4f": "אל",  # alef-lamed ligature
    }
)

# Letters that can only end a word; a space right after one is a real gap.
_FINAL_FORMS = frozenset("םןףךץ")


def collapse_hebrew_spaces(text):
    """Collapse spaces between Hebrew letter fragments (for badly-encoded PDFs).

    The text is first stripped of nikkud, presentation forms are replaced by
    their base letters, and runs of spaces are squeezed to one.  Each
    remaining space is then kept only when it looks like a real word
    boundary:

    - the character before it is a final-form letter (ם ן ף ך ץ), or
    - the character on either side is not a base Hebrew letter
      (punctuation, digits, Latin text, line breaks, ...), or
    - it is at the very start or end of the text.

    Every other space (between two Hebrew letters, the first of which is not
    a final form) is dropped.  This is a heuristic: a genuine gap after a
    word that does not end in a final-form letter is removed as well.

    Args:
        text: Raw text, possibly with nikkud and spurious intra-word spaces.

    Returns:
        The nikkud-free, space-collapsed text with surrounding whitespace
        stripped.
    """
    stripped = strip_nikkud(text).translate(_PRESENTATION_FORM_TABLE)
    stripped = re.sub(r" {2,}", " ", stripped)

    kept = []
    last = len(stripped) - 1
    for pos, ch in enumerate(stripped):
        if ch != " ":
            kept.append(ch)
            continue

        # Spaces are never adjacent after the squeeze above, so the direct
        # neighbours are the nearest non-space characters.
        if pos == 0 or pos == last:
            kept.append(" ")  # edge space; removed by the final strip()
            continue

        prev_ch = stripped[pos - 1]
        next_ch = stripped[pos + 1]
        is_boundary = (
            prev_ch in _FINAL_FORMS
            or not HEBREW_RE.match(prev_ch)
            or not HEBREW_RE.match(next_ch)
        )
        if is_boundary:
            kept.append(" ")
        # otherwise: intra-word gap, drop the space

    return "".join(kept).strip()


def extract_pdf_sentences(pdf_path, book_name):
    """Extract candidate example sentences from a PDF file.

    Each page's text is split into lines, and each line is split after
    sentence-ending punctuation (``.``, ``?``, ``!``) followed by whitespace.
    A fragment is kept when it

    - contains at least one Hebrew letter,
    - does not contain a metadata keyword (copyright / ISBN / print notice),
    - has between 4 and 15 Hebrew words after ``collapse_hebrew_spaces``.

    Args:
        pdf_path: Path of the PDF to read.
        book_name: Title stored in the ``"book"`` field of every result.

    Returns:
        A list of dicts with keys ``"text"`` (fragment as extracted),
        ``"book"`` and ``"stripped"`` (nikkud-free, space-collapsed text),
        in document order.
    """
    sentence_split = re.compile(r"(?<=[.?!])\s+")
    metadata_keywords = ("copyright", "©", "isbn", "printed in")
    sentences = []

    # The context manager closes the document even if extraction raises.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text()
            if not text.strip():
                continue

            for line in text.split("\n"):
                line = line.strip()
                if not line:
                    continue

                for sent in sentence_split.split(line):
                    sent = sent.strip()
                    # Must contain Hebrew letters.  This also rejects bare
                    # page numbers, which consist of digits only.
                    if not sent or not HEBREW_RE.search(sent):
                        continue

                    # Cheap metadata filter before the costlier collapse.
                    lowered = sent.lower()
                    if any(kw in lowered for kw in metadata_keywords):
                        continue

                    # Stripped version: no nikkud, PDF letter gaps collapsed.
                    stripped = collapse_hebrew_spaces(sent)

                    # Keep only fragments with 4-15 Hebrew words.
                    word_count = sum(
                        1 for w in stripped.split() if HEBREW_RE.search(w)
                    )
                    if not 4 <= word_count <= 15:
                        continue

                    sentences.append(
                        {
                            "text": sent,
                            "book": book_name,
                            "stripped": stripped,
                        }
                    )

    return sentences


def has_extractable_text(pdf_path, max_pages=10):
    """Check whether a PDF has extractable text near its beginning.

    Args:
        pdf_path: Path of the PDF to probe.
        max_pages: How many leading pages to inspect at most (default 10).

    Returns:
        True as soon as one inspected page yields non-whitespace text,
        False if none of them does (e.g. for scanned, image-only PDFs).
    """
    # The context manager closes the document even if a page fails to load.
    with fitz.open(pdf_path) as doc:
        pages_to_check = min(len(doc), max_pages)
        return any(doc[i].get_text().strip() for i in range(pages_to_check))


def load_sentence_index():
    """Return the parsed sentence index stored at ``SENTENCE_INDEX``.

    When the file does not exist yet, a fresh index with an empty
    ``"sentences"`` list is returned instead.
    """
    if not os.path.exists(SENTENCE_INDEX):
        return {"sentences": []}

    with open(SENTENCE_INDEX, encoding="utf-8") as handle:
        raw = handle.read()
    return json.loads(raw)


def save_sentence_index(data):
    """Write *data* to ``SENTENCE_INDEX`` as UTF-8 JSON.

    The output is indented by two spaces and keeps non-ASCII characters
    (the Hebrew text) unescaped. Any existing file is overwritten.
    """
    with open(SENTENCE_INDEX, mode="w", encoding="utf-8") as out:
        json.dump(data, out, indent=2, ensure_ascii=False)


# Books extracted from PDFs: their "stripped" text may have words run
# together, so any substring hit is accepted for them.
_PDF_BOOK_NAMES = ("אליס בארץ הפלאות", "האריה שאהב תות")


def _cell_text(value):
    """Return a vocab cell as stripped text; missing values (NaN/None) become ''."""
    if not isinstance(value, str) and pd.isna(value):
        return ""
    return str(value).strip()


def _sentence_rank(entry):
    """Sort key: 6-12 words first (shorter first), then <6 (longer first), then >12."""
    wc = entry["word_count"]
    if 6 <= wc <= 12:
        return (0, wc)  # ideal range, prefer shorter
    if wc < 6:
        return (1, -wc)  # too short
    return (2, wc)  # too long


def _has_clear_boundary(text, word):
    """Return True if some occurrence of *word* is not flanked by Hebrew letters on both sides."""
    start = text.find(word)
    while start != -1:
        end = start + len(word)
        if start == 0 or not HEBREW_RE.match(text[start - 1]):
            return True
        if end >= len(text) or not HEBREW_RE.match(text[end]):
            return True
        start = text.find(word, start + 1)
    return False


def match_vocab_to_sentences(sentences, vocab_df, pdf_books=_PDF_BOOK_NAMES):
    """Match vocab words to example sentences.

    A sentence matches a word when its ``"stripped"`` text contains the
    word delimited by whitespace or the string edges.  For words of three
    or more characters a plain substring hit is accepted as well, if the
    sentence comes from one of *pdf_books* or at least one occurrence has a
    non-Hebrew character (or the string edge) on one side.  Words shorter
    than two characters are ignored.

    Args:
        sentences: Iterable of dicts with ``"text"`` and ``"book"`` keys and
            an optional ``"stripped"`` key.
        vocab_df: DataFrame with ``"Word"`` (with nikkud) and
            ``"Word Without Nikkud"`` columns.  Rows where either cell is
            empty or missing are skipped.
        pdf_books: Book titles whose sentences accept any substring match.

    Returns:
        Dict mapping each matched nikkud-free word to
        ``{"word_nikkud": ..., "sentences": [{"text", "book"}, ...]}`` with
        at most three sentences, ordered best first (6-12 words preferred).
    """
    matches = {}

    # Collect (word_no_nikkud, word_nikkud) pairs, skipping incomplete rows.
    vocab_words = []
    for _, row in vocab_df.iterrows():
        word_no_nik = _cell_text(row.get("Word Without Nikkud", ""))
        word_nik = _cell_text(row.get("Word", ""))
        if word_no_nik and word_nik:
            vocab_words.append((word_no_nik, word_nik))

    print(f"Matching {len(vocab_words)} vocab words against {len(sentences)} sentences...")

    # Precompute the searchable text and its word count once per sentence.
    sent_data = []
    for s in sentences:
        stripped = s.get("stripped", "")
        sent_data.append(
            {
                "text": s["text"],
                "book": s["book"],
                "stripped": stripped,
                "word_count": len(stripped.split()),
            }
        )

    matched_count = 0

    for word_no_nik, word_nik in vocab_words:
        if len(word_no_nik) < 2:
            continue

        pattern = re.compile(r"(?:^|\s)" + re.escape(word_no_nik) + r"(?:\s|$)")
        # Substring matching only for words >= 3 chars to limit false positives.
        use_substring = len(word_no_nik) >= 3

        word_matches = []
        for sd in sent_data:
            stripped = sd["stripped"]
            # Neither kind of match is possible without the literal word.
            if word_no_nik not in stripped:
                continue

            if pattern.search(stripped):
                word_matches.append(sd)
            elif use_substring and (
                sd["book"] in pdf_books
                or _has_clear_boundary(stripped, word_no_nik)
            ):
                word_matches.append(sd)

        if word_matches:
            matched_count += 1
            word_matches.sort(key=_sentence_rank)
            best = word_matches[:3]

            matches[word_no_nik] = {
                "word_nikkud": word_nik,
                "sentences": [{"text": m["text"], "book": m["book"]} for m in best],
            }

    # Guard the percentage against an empty vocabulary.
    pct = 100 * matched_count / len(vocab_words) if vocab_words else 0.0
    print(
        f"Words with at least 1 match: {matched_count}/{len(vocab_words)} ({pct:.1f}%)"
    )
    return matches


def main():
    """Run the whole pipeline and print progress and summary statistics.

    Steps:
      1. Extract sentences from the configured PDFs (missing files and PDFs
         without extractable text are skipped with a message).
      2. Merge the new sentences into the sentence index, de-duplicating on
         ``(stripped, book)``, and save the index.
      3. Match the vocab CSV against all indexed sentences and write the
         result to ``MATCHES_FILE``.
      4. Print a summary.
    """
    # ── Step 1: Extract from PDFs ──
    pdfs = [
        ("alice.pdf", "אליס בארץ הפלאות"),
        ("lion_strawberry.pdf", "האריה שאהב תות"),
    ]

    all_new_sentences = []

    for filename, book_name in pdfs:
        pdf_path = os.path.join(EPUBS_DIR, filename)
        if not os.path.exists(pdf_path):
            print(f"SKIP: {filename} not found")
            continue

        if not has_extractable_text(pdf_path):
            print(f"SKIP: {filename} has no extractable text (likely scanned images)")
            continue

        print(f"Extracting from {filename} ({book_name})...")
        sentences = extract_pdf_sentences(pdf_path, book_name)
        print(f"  Extracted {len(sentences)} sentences")
        all_new_sentences.extend(sentences)

    # ── Step 2: Merge with existing index ──
    index = load_sentence_index()
    # Tolerate an index file that lacks the "sentences" list.
    indexed = index.setdefault("sentences", [])
    existing_count = len(indexed)

    # Deduplicate by (stripped, book)
    existing_keys = {(s.get("stripped", ""), s.get("book", "")) for s in indexed}

    added = 0
    for s in all_new_sentences:
        key = (s["stripped"], s["book"])
        if key not in existing_keys:
            indexed.append(s)
            existing_keys.add(key)
            added += 1

    save_sentence_index(index)
    total = len(indexed)
    print(f"\nSentence index: {existing_count} existing + {added} new = {total} total")

    # ── Per-book stats ──
    book_counts = {}
    for s in indexed:
        book = s.get("book", "unknown")
        book_counts[book] = book_counts.get(book, 0) + 1
    # Largest book first; reused for the final summary.
    book_stats = sorted(book_counts.items(), key=lambda x: -x[1])

    print("\nSentences per book:")
    for book, count in book_stats:
        print(f"  {book}: {count}")

    # ── Step 3: Match vocab words to sentences ──
    print(f"\nLoading vocab from {VOCAB_CSV}...")
    vocab_df = pd.read_csv(VOCAB_CSV, sep=";", index_col=0)
    print(f"  {len(vocab_df)} vocab words loaded")

    matches = match_vocab_to_sentences(indexed, vocab_df)

    with open(MATCHES_FILE, "w", encoding="utf-8") as f:
        json.dump(matches, f, ensure_ascii=False, indent=2)

    print(f"\nWrote {len(matches)} word matches to {MATCHES_FILE}")

    # ── Step 4: Summary stats ──
    total_words = len(vocab_df)
    matched_words = len(matches)
    # Guard the percentage against a vocab CSV with no rows.
    matched_pct = 100 * matched_words / total_words if total_words else 0.0
    print(f"\n{'=' * 50}")
    print("SUMMARY")
    print(f"{'=' * 50}")
    print(f"Total sentences: {total}")
    for book, count in book_stats:
        print(f"  {book}: {count}")
    print(f"Total vocab words: {total_words}")
    print(f"Words with sentences: {matched_words} ({matched_pct:.1f}%)")
    print(f"Words without sentences: {total_words - matched_words}")


# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()