hebrew_flash_cards/scripts/extract_pdf_sentences.py
Sochen 17f7458d19 Sprint 9: cloze cards, plurals deck, project reorg, lint tooling
- Cloze card pipeline: 924 cards from 2,296 AI-vetted Hebrew book sentences
- Plurals deck: 375 notes (144 irregular + 231 regular from 86 mishkal patterns)
- Ktiv male forms expanded to 20,711 entries for sentence matching
- Project reorg: helpers.py (deduped strip_nikkud from 10 files), scripts/ for
  one-off tools, tests/ with smoke tests, deleted 3 dead files
- Lint tooling: pyproject.toml with ruff/vulture/bandit/pytest config, .editorconfig,
  fixed all 129 ruff errors (B023 closure fix, SIM103, unused vars)
- validate_apkg.py: card count range check for optional cloze template
- Data caches committed: vetted_sentences, ktiv_male_forms, noun_plurals,
  noun_slug_map, vocab_sentence_matches, epub_sentence_index

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-07 08:09:39 +00:00

405 lines
13 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""
Extract sentences from PDF books and match vocab words to sentences.
1. Extract sentences from alice.pdf and lion_strawberry.pdf
2. Merge into existing epub_sentence_index.json
3. Match vocab words to sentences, produce vocab_sentence_matches.json
"""
import json
import os
import re
import sys
# Use the venv with pymupdf
sys.path.insert(0, "/home/node/projects/pealim/venv_pdf/lib/python3.11/site-packages")
# Also need the main venv for pandas
sys.path.insert(0, "/home/node/projects/pealim/lib/python3.11/site-packages")
import fitz
import pandas as pd
# Absolute project root; every data path below is derived from it.
BASE_DIR = "/home/node/projects/pealim"
DATA_DIR = os.path.join(BASE_DIR, "data")
# Directory main() looks in for the source PDF books.
EPUBS_DIR = os.path.join(DATA_DIR, "epubs")
# JSON sentence index read by load_sentence_index() and rewritten by
# save_sentence_index().
SENTENCE_INDEX = os.path.join(DATA_DIR, "epub_sentence_index.json")
# Vocabulary CSV; main() reads it with ";" as the separator.
VOCAB_CSV = os.path.join(DATA_DIR, "hebrew_dict_for_anki.csv")
# JSON output written by main(): vocab word -> example sentences.
MATCHES_FILE = os.path.join(DATA_DIR, "vocab_sentence_matches.json")
# Matches one code point in U+0591-U+05C7 (Hebrew cantillation marks and points).
# NOTE(review): this range also spans punctuation such as maqaf (U+05BE) and
# sof pasuq (U+05C3), so strip_nikkud() removes those too - confirm intended.
NIKKUD_RE = re.compile(r"[\u0591-\u05C7]")
# Matches one base Hebrew letter, alef (U+05D0) through tav (U+05EA).
HEBREW_RE = re.compile(r"[\u05d0-\u05ea]")
# Like HEBREW_RE but also accepts U+FB20-U+FB4F (presentation forms).
# Not referenced anywhere else in the visible source.
HEBREW_CHAR_RE = re.compile(r"[\u05d0-\u05ea\ufb20-\ufb4f]")
def strip_nikkud(text):
    """Return *text* with every character matched by NIKKUD_RE deleted."""
    without_marks = re.sub(NIKKUD_RE, "", text)
    return without_marks
# Precomposed Hebrew presentation forms -> plain base letter. Built once at
# import time instead of on every call. Final-form variants map to the FINAL
# base letter so the final-form word-boundary heuristic below still sees them.
_PRESENTATION_FORM_TABLE = str.maketrans(
    {
        "\ufb1d": "י",  # yod with hiriq
        "\ufb2a": "ש",  # shin with shin dot
        "\ufb2b": "ש",  # shin with sin dot
        "\ufb2c": "ש",  # shin with dagesh and shin dot
        "\ufb2d": "ש",  # shin with dagesh and sin dot
        "\ufb2e": "א",  # alef with patah
        "\ufb2f": "א",  # alef with qamats
        "\ufb30": "א",  # alef with mapiq
        "\ufb31": "ב",  # bet with dagesh
        "\ufb32": "ג",  # gimel with dagesh
        "\ufb33": "ד",  # dalet with dagesh
        "\ufb34": "ה",  # he with mapiq
        "\ufb35": "ו",  # vav with dagesh
        "\ufb36": "ז",  # zayin with dagesh
        "\ufb38": "ט",  # tet with dagesh
        "\ufb39": "י",  # yod with dagesh
        "\ufb3a": "ך",  # FINAL kaf with dagesh (was wrongly mapped to regular kaf)
        "\ufb3b": "כ",  # kaf with dagesh
        "\ufb3c": "ל",  # lamed with dagesh
        "\ufb3e": "מ",  # mem with dagesh
        "\ufb40": "נ",  # nun with dagesh
        "\ufb41": "ס",  # samekh with dagesh
        "\ufb43": "ף",  # FINAL pe with dagesh (was wrongly mapped to regular pe)
        "\ufb44": "פ",  # pe with dagesh
        "\ufb46": "צ",  # tsadi with dagesh
        "\ufb47": "ק",  # qof with dagesh
        "\ufb48": "ר",  # resh with dagesh
        "\ufb49": "ש",  # shin with dagesh
        "\ufb4a": "ת",  # tav with dagesh
        "\ufb4b": "ו",  # vav with holam
        "\ufb4c": "ב",  # bet with rafe
        "\ufb4d": "כ",  # kaf with rafe
        "\ufb4e": "פ",  # pe with rafe
    }
)

# Letters that only occur at the end of a word.
_FINAL_FORMS = frozenset("םןףךץ")


def collapse_hebrew_spaces(text):
    """Collapse spaces between Hebrew letter fragments (for badly-encoded PDFs).

    The text is stripped of nikkud, precomposed presentation forms are
    replaced by their base letters, and runs of spaces are squeezed to one.
    Each remaining space is then kept only if it looks like a real word
    boundary, i.e. when any of the following holds:

    - the character before it is a final-form letter (ם ן ף ך ץ),
    - the character before or after it is not a base Hebrew letter
      (punctuation, digits, Latin text, ...),
    - it is the first or last character of the text.

    Every other space is dropped. Note that this also joins genuinely
    separate words whenever the first one does not end in a final-form
    letter; callers compensate with substring matching.

    Args:
        text: Raw sentence text, possibly containing nikkud.

    Returns:
        The normalized text with leading/trailing whitespace removed.
    """
    stripped = strip_nikkud(text).translate(_PRESENTATION_FORM_TABLE)
    stripped = re.sub(r" {2,}", " ", stripped)

    kept = []
    last_pos = len(stripped) - 1
    for pos, ch in enumerate(stripped):
        if ch != " ":
            kept.append(ch)
            continue
        # Runs of spaces were squeezed above, so both neighbours of a space
        # (when they exist) are guaranteed to be non-space characters.
        prev_ch = stripped[pos - 1] if pos > 0 else None
        next_ch = stripped[pos + 1] if pos < last_pos else None
        is_boundary = (
            prev_ch is None
            or next_ch is None
            or prev_ch in _FINAL_FORMS
            or not HEBREW_RE.match(prev_ch)
            or not HEBREW_RE.match(next_ch)
        )
        if is_boundary:
            kept.append(" ")
        # Otherwise the space is an intra-word gap and is dropped.
    return "".join(kept).strip()
# Split point: whitespace that follows sentence-ending punctuation.
_SENTENCE_SPLIT_RE = re.compile(r"(?<=[.?!])\s+")
# Lower-case markers of copyright / imprint lines that must be dropped.
_METADATA_KEYWORDS = ("copyright", "©", "isbn", "printed in")


def _iter_raw_sentences(page_text):
    """Yield non-empty sentence candidates from one page of text, in order."""
    for line in page_text.split("\n"):
        line = line.strip()
        if not line:
            continue
        for part in _SENTENCE_SPLIT_RE.split(line):
            part = part.strip()
            if part:
                yield part


def extract_pdf_sentences(pdf_path, book_name, *, min_words=4, max_words=15):
    """Extract Hebrew sentences from a PDF file.

    Each page's text is split into lines and then on sentence-ending
    punctuation. A candidate is kept when it contains at least one base
    Hebrew letter, contains none of the copyright/imprint keywords, and its
    space-collapsed form has between ``min_words`` and ``max_words`` words
    that contain Hebrew letters.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF (``fitz``).
        book_name: Title stored in the ``"book"`` field of every result.
        min_words: Smallest accepted Hebrew word count (inclusive).
        max_words: Largest accepted Hebrew word count (inclusive).

    Returns:
        A list of dicts with keys ``"text"`` (sentence as extracted),
        ``"book"`` and ``"stripped"`` (output of collapse_hebrew_spaces).
    """
    sentences = []
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        for page in doc:
            for sent in _iter_raw_sentences(page.get_text()):
                # Requiring a Hebrew letter also rejects bare page numbers,
                # so no separate digits-only check is needed.
                if not HEBREW_RE.search(sent):
                    continue
                # Cheap metadata filter first, before the costlier collapse.
                lowered = sent.lower()
                if any(kw in lowered for kw in _METADATA_KEYWORDS):
                    continue
                stripped = collapse_hebrew_spaces(sent)
                word_count = sum(1 for w in stripped.split() if HEBREW_RE.search(w))
                if not min_words <= word_count <= max_words:
                    continue
                sentences.append(
                    {
                        "text": sent,
                        "book": book_name,
                        "stripped": stripped,
                    }
                )
    return sentences
def has_extractable_text(pdf_path, max_pages=10):
    """Check if a PDF has extractable text.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF (``fitz``).
        max_pages: How many leading pages to probe before giving up.

    Returns:
        True if any of the first ``max_pages`` pages yields non-whitespace
        text, False otherwise (including for an empty document).
    """
    # The context manager closes the document even if get_text() raises.
    with fitz.open(pdf_path) as doc:
        pages_to_probe = min(len(doc), max_pages)
        return any(doc[i].get_text().strip() for i in range(pages_to_probe))
def load_sentence_index():
    """Return the parsed sentence index, or an empty one if the file is absent."""
    if not os.path.exists(SENTENCE_INDEX):
        # Nothing on disk yet: start from an index with no sentences.
        return {"sentences": []}
    with open(SENTENCE_INDEX, encoding="utf-8") as index_file:
        loaded = json.load(index_file)
    return loaded
def save_sentence_index(data):
    """Write *data* to the sentence index file as indented UTF-8 JSON."""
    # ensure_ascii=False keeps the Hebrew text readable in the file.
    with open(SENTENCE_INDEX, mode="w", encoding="utf-8") as index_file:
        json.dump(data, index_file, indent=2, ensure_ascii=False)
def _cell_text(value):
    """Return a vocab cell as stripped text; missing cells (None/NaN) become ''."""
    if value is None or pd.isna(value):
        return ""
    return str(value).strip()


def _sentence_rank(sd):
    """Sort key: 6-12 words first (shorter wins), then short, then long sentences."""
    wc = sd["word_count"]
    if 6 <= wc <= 12:
        return (0, wc)  # ideal range, prefer shorter
    if wc < 6:
        return (1, -wc)  # too short, prefer the longer of these
    return (2, wc)  # too long, prefer shorter


def _has_substring_match(word, stripped, book, pdf_books):
    """Return True if *word* occurs in *stripped* as an acceptable substring.

    For books listed in *pdf_books* any occurrence counts, because their
    collapsed text has lost most word boundaries. For other books at least
    one occurrence must have a non-Hebrew character (or the text edge) on
    one side.
    """
    start = stripped.find(word)
    if start == -1:
        return False
    if book in pdf_books:
        return True
    # Inspect every occurrence, not just the first one.
    while start != -1:
        end = start + len(word)
        before_ok = start == 0 or not HEBREW_RE.match(stripped[start - 1])
        after_ok = end >= len(stripped) or not HEBREW_RE.match(stripped[end])
        if before_ok or after_ok:
            return True
        start = stripped.find(word, start + 1)
    return False


def match_vocab_to_sentences(
    sentences,
    vocab_df,
    *,
    pdf_books=("אליס בארץ הפלאות", "האריה שאהב תות"),
):
    """Match vocab words to sentences.

    Args:
        sentences: Iterable of dicts with ``"text"`` and ``"book"`` keys and
            an optional ``"stripped"`` key (nikkud-free text used for matching).
        vocab_df: DataFrame whose rows provide the columns ``"Word"`` (with
            nikkud) and ``"Word Without Nikkud"``. Rows where either cell is
            missing or blank are ignored.
        pdf_books: Book titles whose sentences accept plain substring matches.

    Returns:
        Dict mapping each matched word (without nikkud) to
        ``{"word_nikkud": ..., "sentences": [...]}`` holding up to three
        ``{"text", "book"}`` entries, best-ranked first. Words shorter than
        two characters are never matched; substring matching is only tried
        for words of three or more characters.
    """
    # Build lookup: word_no_nikkud -> word_nikkud. Missing cells must not
    # turn into the literal string "nan".
    vocab_words = []
    for _, row in vocab_df.iterrows():
        word_no_nik = _cell_text(row.get("Word Without Nikkud", ""))
        word_nik = _cell_text(row.get("Word", ""))
        if word_no_nik and word_nik:
            vocab_words.append((word_no_nik, word_nik))
    print(f"Matching {len(vocab_words)} vocab words against {len(sentences)} sentences...")

    # Precompute the matching text and its word count for each sentence.
    sent_data = []
    for s in sentences:
        stripped = s.get("stripped", "")
        sent_data.append(
            {
                "text": s["text"],
                "book": s["book"],
                "stripped": stripped,
                "word_count": len(stripped.split()),
            }
        )

    matches = {}
    matched_count = 0
    for word_no_nik, word_nik in vocab_words:
        if len(word_no_nik) < 2:
            continue
        # Whole-word match: the word delimited by whitespace or text edges.
        pattern = re.compile(r"(?:^|\s)" + re.escape(word_no_nik) + r"(?:\s|$)")
        # Substring fallback only for words >= 3 chars to limit false positives.
        use_substring = len(word_no_nik) >= 3
        word_matches = [
            sd
            for sd in sent_data
            if pattern.search(sd["stripped"])
            or (
                use_substring
                and _has_substring_match(word_no_nik, sd["stripped"], sd["book"], pdf_books)
            )
        ]
        if not word_matches:
            continue
        matched_count += 1
        best = sorted(word_matches, key=_sentence_rank)[:3]
        matches[word_no_nik] = {
            "word_nikkud": word_nik,
            "sentences": [{"text": m["text"], "book": m["book"]} for m in best],
        }

    # Guard the percentage against an empty vocabulary.
    share = 100 * matched_count / len(vocab_words) if vocab_words else 0.0
    print(f"Words with at least 1 match: {matched_count}/{len(vocab_words)} ({share:.1f}%)")
    return matches
def main():
    """Run the pipeline: extract PDF sentences, merge the index, match vocab.

    Steps:
        1. Extract sentences from each known PDF under EPUBS_DIR, skipping
           files that are missing or have no extractable text.
        2. Merge them into the sentence index, de-duplicating on
           ``(stripped, book)``, and save the index.
        3. Load the vocab CSV, match words to sentences and write the
           result to MATCHES_FILE.
        4. Print summary statistics.
    """
    # Step 1: extract from PDFs
    pdfs = [
        ("alice.pdf", "אליס בארץ הפלאות"),
        ("lion_strawberry.pdf", "האריה שאהב תות"),
    ]
    all_new_sentences = []
    for filename, book_name in pdfs:
        pdf_path = os.path.join(EPUBS_DIR, filename)
        if not os.path.exists(pdf_path):
            print(f"SKIP: {filename} not found")
            continue
        if not has_extractable_text(pdf_path):
            print(f"SKIP: {filename} has no extractable text (likely scanned images)")
            continue
        print(f"Extracting from {filename} ({book_name})...")
        sentences = extract_pdf_sentences(pdf_path, book_name)
        print(f"  Extracted {len(sentences)} sentences")
        all_new_sentences.extend(sentences)

    # Step 2: merge with the existing index
    index = load_sentence_index()
    # Tolerate an index file that has no "sentences" list yet.
    indexed = index.setdefault("sentences", [])
    existing_count = len(indexed)
    # Deduplicate by (stripped, book)
    existing_keys = {(s.get("stripped", ""), s.get("book", "")) for s in indexed}
    added = 0
    for s in all_new_sentences:
        key = (s["stripped"], s["book"])
        if key not in existing_keys:
            indexed.append(s)
            existing_keys.add(key)
            added += 1
    save_sentence_index(index)
    total = len(indexed)
    print(f"\nSentence index: {existing_count} existing + {added} new = {total} total")

    # Per-book stats, largest book first (ties keep first-seen order).
    book_counts = {}
    for s in indexed:
        book = s.get("book", "unknown")
        book_counts[book] = book_counts.get(book, 0) + 1
    ranked_books = sorted(book_counts.items(), key=lambda x: -x[1])
    print("\nSentences per book:")
    for book, count in ranked_books:
        print(f"  {book}: {count}")

    # Step 3: match vocab words to sentences
    print(f"\nLoading vocab from {VOCAB_CSV}...")
    vocab_df = pd.read_csv(VOCAB_CSV, sep=";", index_col=0)
    print(f"  {len(vocab_df)} vocab words loaded")
    matches = match_vocab_to_sentences(indexed, vocab_df)
    with open(MATCHES_FILE, "w", encoding="utf-8") as f:
        json.dump(matches, f, ensure_ascii=False, indent=2)
    print(f"\nWrote {len(matches)} word matches to {MATCHES_FILE}")

    # Step 4: summary stats
    total_words = len(vocab_df)
    matched_words = len(matches)
    # Guard the percentage against an empty vocab CSV.
    coverage = 100 * matched_words / total_words if total_words else 0.0
    print(f"\n{'=' * 50}")
    print("SUMMARY")
    print(f"{'=' * 50}")
    print(f"Total sentences: {total}")
    for book, count in ranked_books:
        print(f"  {book}: {count}")
    print(f"Total vocab words: {total_words}")
    print(f"Words with sentences: {matched_words} ({coverage:.1f}%)")
    print(f"Words without sentences: {total_words - matched_words}")


if __name__ == "__main__":
    main()