#!/usr/bin/env python3
"""
Extract sentences from PDF books and match vocab words to sentences.

1. Extract sentences from alice.pdf and lion_strawberry.pdf
2. Merge into existing epub_sentence_index.json
3. Match vocab words to sentences, produce vocab_sentence_matches.json
"""

import json
import os
import re
import sys
from collections import Counter

# Use the venv with pymupdf
sys.path.insert(0, "/home/node/projects/pealim/venv_pdf/lib/python3.11/site-packages")
# Also need the main venv for pandas
sys.path.insert(0, "/home/node/projects/pealim/lib/python3.11/site-packages")

import fitz
import pandas as pd

BASE_DIR = "/home/node/projects/pealim"
DATA_DIR = os.path.join(BASE_DIR, "data")
EPUBS_DIR = os.path.join(DATA_DIR, "epubs")
SENTENCE_INDEX = os.path.join(DATA_DIR, "epub_sentence_index.json")
VOCAB_CSV = os.path.join(DATA_DIR, "hebrew_dict_for_anki.csv")
MATCHES_FILE = os.path.join(DATA_DIR, "vocab_sentence_matches.json")

NIKKUD_RE = re.compile(r"[\u0591-\u05C7]")
HEBREW_RE = re.compile(r"[\u05d0-\u05ea]")
HEBREW_CHAR_RE = re.compile(r"[\u05d0-\u05ea\ufb20-\ufb4f]")

# Splits a line after sentence-ending punctuation that is followed by whitespace.
SENTENCE_SPLIT_RE = re.compile(r"(?<=[.?!])\s+")

# A sentence is kept only if its Hebrew word count lies in this inclusive range.
MIN_HEBREW_WORDS = 4
MAX_HEBREW_WORDS = 15

# Sentences containing any of these (case-insensitive) are dropped as metadata.
METADATA_KEYWORDS = ("copyright", "©", "isbn", "printed in")

# PDF sources: (file name inside EPUBS_DIR, book title stored in the index).
PDF_BOOKS = [
    ("alice.pdf", "אליס בארץ הפלאות"),
    ("lion_strawberry.pdf", "האריה שאהב תות"),
]
# Titles of the PDF books; their text may have lost real word spacing, so the
# matcher accepts plain substring hits for them.
PDF_BOOK_NAMES = frozenset(title for _, title in PDF_BOOKS)

# Final-form letters: final mem, final nun, final pe, final kaf, final tsadi.
FINAL_FORMS = frozenset("\u05dd\u05df\u05e3\u05da\u05e5")

# Hebrew letters from the Unicode "Alphabetic Presentation Forms" block folded
# back to their base letters. Final-form variants (U+FB3A, U+FB43, U+FB26) map
# to *final* base letters so the word-boundary heuristic can still see them.
_PRESENTATION_FORMS = str.maketrans(
    {
        "\ufb1d": "\u05d9",  # yod with hiriq
        "\ufb20": "\u05e2",  # alternative ayin
        "\ufb21": "\u05d0",  # wide alef
        "\ufb22": "\u05d3",  # wide dalet
        "\ufb23": "\u05d4",  # wide he
        "\ufb24": "\u05db",  # wide kaf
        "\ufb25": "\u05dc",  # wide lamed
        "\ufb26": "\u05dd",  # wide final mem
        "\ufb27": "\u05e8",  # wide resh
        "\ufb28": "\u05ea",  # wide tav
        "\ufb2a": "\u05e9",  # shin with shin dot
        "\ufb2b": "\u05e9",  # shin with sin dot
        "\ufb2c": "\u05e9",  # shin with dagesh and shin dot
        "\ufb2d": "\u05e9",  # shin with dagesh and sin dot
        "\ufb2e": "\u05d0",  # alef with patah
        "\ufb2f": "\u05d0",  # alef with qamats
        "\ufb30": "\u05d0",  # alef with mapiq
        "\ufb31": "\u05d1",  # bet with dagesh
        "\ufb32": "\u05d2",  # gimel with dagesh
        "\ufb33": "\u05d3",  # dalet with dagesh
        "\ufb34": "\u05d4",  # he with mapiq
        "\ufb35": "\u05d5",  # vav with dagesh
        "\ufb36": "\u05d6",  # zayin with dagesh
        "\ufb38": "\u05d8",  # tet with dagesh
        "\ufb39": "\u05d9",  # yod with dagesh
        "\ufb3a": "\u05da",  # FINAL kaf with dagesh
        "\ufb3b": "\u05db",  # kaf with dagesh
        "\ufb3c": "\u05dc",  # lamed with dagesh
        "\ufb3e": "\u05de",  # mem with dagesh
        "\ufb40": "\u05e0",  # nun with dagesh
        "\ufb41": "\u05e1",  # samekh with dagesh
        "\ufb43": "\u05e3",  # FINAL pe with dagesh
        "\ufb44": "\u05e4",  # pe with dagesh
        "\ufb46": "\u05e6",  # tsadi with dagesh
        "\ufb47": "\u05e7",  # qof with dagesh
        "\ufb48": "\u05e8",  # resh with dagesh
        "\ufb49": "\u05e9",  # shin with dagesh
        "\ufb4a": "\u05ea",  # tav with dagesh
        "\ufb4b": "\u05d5",  # vav with holam
        "\ufb4c": "\u05d1",  # bet with rafe
        "\ufb4d": "\u05db",  # kaf with rafe
        "\ufb4e": "\u05e4",  # pe with rafe
        "\ufb4f": "\u05d0\u05dc",  # alef-lamed ligature
    }
)


def _percent(part, whole):
    """Return ``part`` as a percentage of ``whole`` (0.0 when ``whole`` is 0)."""
    return 100 * part / whole if whole else 0.0


def strip_nikkud(text):
    """Remove all Hebrew nikkud/cantillation marks."""
    return NIKKUD_RE.sub("", text)


def collapse_hebrew_spaces(text):
    """Collapse spaces between Hebrew letter fragments (for badly-encoded PDFs).

    The text is first stripped of nikkud, presentation-form letters are folded
    to base letters, and runs of spaces are reduced to one. Each remaining
    space is then kept only when it looks like a real word boundary:

    - the previous character is a final-form letter,
    - the character on either side is not a base Hebrew letter
      (punctuation, digits, Latin text, ...), or
    - the space is at the very start or end of the text.

    Every other space is treated as an intra-word gap and dropped.

    Returns the rebuilt text with leading/trailing whitespace removed.
    """
    stripped = strip_nikkud(text).translate(_PRESENTATION_FORMS)
    stripped = re.sub(r" {2,}", " ", stripped)

    rebuilt = []
    last_non_space = None  # most recent non-space character emitted so far
    length = len(stripped)
    for pos, ch in enumerate(stripped):
        if ch != " ":
            rebuilt.append(ch)
            last_non_space = ch
            continue

        # Runs of spaces were collapsed above, so the neighbour (if any) is
        # guaranteed to be a non-space character.
        next_ch = stripped[pos + 1] if pos + 1 < length else None

        is_boundary = (
            last_non_space is None
            or next_ch is None
            or last_non_space in FINAL_FORMS
            or not HEBREW_RE.match(last_non_space)
            or not HEBREW_RE.match(next_ch)
        )
        if is_boundary:
            rebuilt.append(" ")

    return "".join(rebuilt).strip()


def _build_sentence_record(sent, book_name):
    """Return the index record for ``sent``, or ``None`` if it is filtered out.

    A candidate is rejected when it is empty, contains no Hebrew letter, has
    fewer than ``MIN_HEBREW_WORDS`` or more than ``MAX_HEBREW_WORDS`` Hebrew
    words after space-collapsing, or contains a metadata keyword.
    """
    sent = sent.strip()
    if not sent or not HEBREW_RE.search(sent):
        return None

    # Stripped version: no nikkud, intra-word gaps collapsed.
    stripped = collapse_hebrew_spaces(sent)

    # The word-count floor also rejects bare page numbers and similar noise.
    word_count = sum(1 for word in stripped.split() if HEBREW_RE.search(word))
    if not MIN_HEBREW_WORDS <= word_count <= MAX_HEBREW_WORDS:
        return None

    lowered = sent.lower()
    if any(keyword in lowered for keyword in METADATA_KEYWORDS):
        return None

    return {"text": sent, "book": book_name, "stripped": stripped}


def extract_pdf_sentences(pdf_path, book_name):
    """Extract sentences from a PDF file.

    Each page's text is split into lines, each line is split after
    ``.``/``?``/``!`` followed by whitespace, and every fragment that passes
    the filters in ``_build_sentence_record`` is kept.

    Args:
        pdf_path: Path of the PDF to read.
        book_name: Title stored in each record's ``"book"`` field.

    Returns:
        A list of dicts with keys ``"text"``, ``"book"`` and ``"stripped"``,
        in page/line order.
    """
    sentences = []
    # Context manager guarantees the document is closed even on errors.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            text = page.get_text()
            if not text.strip():
                continue

            for line in text.split("\n"):
                line = line.strip()
                if not line:
                    continue
                for fragment in SENTENCE_SPLIT_RE.split(line):
                    record = _build_sentence_record(fragment, book_name)
                    if record is not None:
                        sentences.append(record)

    return sentences


def has_extractable_text(pdf_path):
    """Check if a PDF has extractable text.

    Only the first 10 pages (or fewer, for shorter documents) are inspected.
    """
    with fitz.open(pdf_path) as doc:
        pages_to_check = min(len(doc), 10)
        return any(doc[i].get_text().strip() for i in range(pages_to_check))


def load_sentence_index():
    """Load existing sentence index.

    Returns the parsed JSON from ``SENTENCE_INDEX``; when the file does not
    exist, a fresh ``{"sentences": []}`` is returned. A loaded dict that lacks
    the ``"sentences"`` key gets an empty list under it so callers can index
    it directly.
    """
    if not os.path.exists(SENTENCE_INDEX):
        return {"sentences": []}

    with open(SENTENCE_INDEX, encoding="utf-8") as f:
        data = json.load(f)
    if isinstance(data, dict):
        data.setdefault("sentences", [])
    return data


def save_sentence_index(data):
    """Save sentence index as pretty-printed UTF-8 JSON to ``SENTENCE_INDEX``."""
    with open(SENTENCE_INDEX, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)


def _cell_text(value):
    """Return a CSV cell as stripped text; missing values (None/NaN) become ''."""
    if value is None or pd.isna(value):
        return ""
    return str(value).strip()


def _has_clear_boundary(text, word):
    """Return True if any occurrence of ``word`` in ``text`` has a non-Hebrew edge.

    An occurrence qualifies when the character before it or the character after
    it is absent (start/end of text) or is not a base Hebrew letter.
    """
    start = text.find(word)
    while start != -1:
        end = start + len(word)
        before_ok = start == 0 or not HEBREW_RE.match(text[start - 1])
        after_ok = end >= len(text) or not HEBREW_RE.match(text[end])
        if before_ok or after_ok:
            return True
        start = text.find(word, start + 1)
    return False


def _sentence_score(sent):
    """Sort key: 6-12 words first (shorter preferred), then <6, then >12."""
    wc = sent["word_count"]
    if 6 <= wc <= 12:
        return (0, wc)  # ideal range, prefer shorter
    if wc < 6:
        return (1, -wc)  # too short; the longer of the short ones first
    return (2, wc)  # too long; shorter first


def match_vocab_to_sentences(sentences, vocab_df):
    """Match vocab words to sentences.

    Args:
        sentences: Iterable of dicts with ``"text"`` and ``"book"`` keys and an
            optional ``"stripped"`` key (nikkud-free text used for matching).
        vocab_df: DataFrame whose rows provide ``"Word Without Nikkud"`` and
            ``"Word"``; rows where either is missing or empty are ignored.

    Returns:
        A dict keyed by the nikkud-free word. Each value holds
        ``"word_nikkud"`` and ``"sentences"``: up to three best-scoring
        ``{"text", "book"}`` dicts. Words shorter than two characters and words
        with no matching sentence are omitted.
    """
    matches = {}

    # Build list of (word_no_nikkud, word_nikkud) pairs.
    vocab_words = []
    for _, row in vocab_df.iterrows():
        word_no_nik = _cell_text(row.get("Word Without Nikkud", ""))
        word_nik = _cell_text(row.get("Word", ""))
        if word_no_nik and word_nik:
            vocab_words.append((word_no_nik, word_nik))

    print(f"Matching {len(vocab_words)} vocab words against {len(sentences)} sentences...")

    # Precompute per-sentence data once.
    # For PDF sentences, stripped has collapsed spaces and words may be joined;
    # for EPUB sentences, stripped has proper word spacing.
    sent_data = []
    for s in sentences:
        stripped = s.get("stripped", "")
        sent_data.append(
            {
                "text": s["text"],
                "book": s["book"],
                "stripped": stripped,
                "word_count": len(stripped.split()),
            }
        )

    matched_count = 0
    for word_no_nik, word_nik in vocab_words:
        if len(word_no_nik) < 2:
            continue

        # Whole-word match: the word delimited by whitespace or string edges.
        pattern = re.compile(r"(?:^|\s)" + re.escape(word_no_nik) + r"(?:\s|$)")
        # Substring fallback only for words >= 3 chars to limit false positives.
        use_substring = len(word_no_nik) >= 3

        word_matches = []
        for sd in sent_data:
            stripped = sd["stripped"]
            if pattern.search(stripped):
                word_matches.append(sd)
            elif use_substring and word_no_nik in stripped:
                # PDF text often lacks real boundaries, so any substring hit is
                # accepted there; elsewhere at least one edge must be clear.
                if sd["book"] in PDF_BOOK_NAMES or _has_clear_boundary(stripped, word_no_nik):
                    word_matches.append(sd)

        if word_matches:
            matched_count += 1
            best = sorted(word_matches, key=_sentence_score)[:3]
            matches[word_no_nik] = {
                "word_nikkud": word_nik,
                "sentences": [{"text": m["text"], "book": m["book"]} for m in best],
            }

    print(
        f"Words with at least 1 match: {matched_count}/{len(vocab_words)} ({_percent(matched_count, len(vocab_words)):.1f}%)"
    )
    return matches


def main():
    """Run the full pipeline: extract, merge into the index, match, report."""
    # ── Step 1: Extract from PDFs ──
    all_new_sentences = []
    for filename, book_name in PDF_BOOKS:
        pdf_path = os.path.join(EPUBS_DIR, filename)
        if not os.path.exists(pdf_path):
            print(f"SKIP: {filename} not found")
            continue

        if not has_extractable_text(pdf_path):
            print(f"SKIP: {filename} has no extractable text (likely scanned images)")
            continue

        print(f"Extracting from {filename} ({book_name})...")
        sentences = extract_pdf_sentences(pdf_path, book_name)
        print(f" Extracted {len(sentences)} sentences")
        all_new_sentences.extend(sentences)

    # ── Step 2: Merge with existing index ──
    index = load_sentence_index()
    existing_count = len(index["sentences"])

    # Deduplicate by (stripped, book)
    existing_keys = {(s.get("stripped", ""), s.get("book", "")) for s in index["sentences"]}

    added = 0
    for s in all_new_sentences:
        key = (s["stripped"], s["book"])
        if key not in existing_keys:
            index["sentences"].append(s)
            existing_keys.add(key)
            added += 1

    save_sentence_index(index)
    total = len(index["sentences"])
    print(f"\nSentence index: {existing_count} existing + {added} new = {total} total")

    # ── Per-book stats ──
    book_counts = Counter(s.get("book", "unknown") for s in index["sentences"])
    books_by_size = sorted(book_counts.items(), key=lambda x: -x[1])
    print("\nSentences per book:")
    for book, count in books_by_size:
        print(f" {book}: {count}")

    # ── Step 3: Match vocab words to sentences ──
    print(f"\nLoading vocab from {VOCAB_CSV}...")
    vocab_df = pd.read_csv(VOCAB_CSV, sep=";", index_col=0)
    print(f" {len(vocab_df)} vocab words loaded")

    matches = match_vocab_to_sentences(index["sentences"], vocab_df)

    with open(MATCHES_FILE, "w", encoding="utf-8") as f:
        json.dump(matches, f, ensure_ascii=False, indent=2)
    print(f"\nWrote {len(matches)} word matches to {MATCHES_FILE}")

    # ── Step 4: Summary stats ──
    total_words = len(vocab_df)
    matched_words = len(matches)
    print(f"\n{'=' * 50}")
    print("SUMMARY")
    print(f"{'=' * 50}")
    print(f"Total sentences: {total}")
    for book, count in books_by_size:
        print(f" {book}: {count}")
    print(f"Total vocab words: {total_words}")
    print(f"Words with sentences: {matched_words} ({_percent(matched_words, total_words):.1f}%)")
    print(f"Words without sentences: {total_words - matched_words}")


if __name__ == "__main__":
    main()