- Cloze card pipeline: 924 cards from 2,296 AI-vetted Hebrew book sentences - Plurals deck: 375 notes (144 irregular + 231 regular from 86 mishkal patterns) - Ktiv male forms expanded to 20,711 entries for sentence matching - Project reorg: helpers.py (deduped strip_nikkud from 10 files), scripts/ for one-off tools, tests/ with smoke tests, deleted 3 dead files - Lint tooling: pyproject.toml with ruff/vulture/bandit/pytest config, .editorconfig, fixed all 129 ruff errors (B023 closure fix, SIM103, unused vars) - validate_apkg.py: card count range check for optional cloze template - Data caches committed: vetted_sentences, ktiv_male_forms, noun_plurals, noun_slug_map, vocab_sentence_matches, epub_sentence_index Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
405 lines
13 KiB
Python
405 lines
13 KiB
Python
#!/usr/bin/env python3
|
||
"""
|
||
Extract sentences from PDF books and match vocab words to sentences.
|
||
|
||
1. Extract sentences from alice.pdf and lion_strawberry.pdf
|
||
2. Merge into existing epub_sentence_index.json
|
||
3. Match vocab words to sentences, produce vocab_sentence_matches.json
|
||
"""
|
||
|
||
import json
|
||
import os
|
||
import re
|
||
import sys
|
||
|
||
# Use the venv with pymupdf
|
||
sys.path.insert(0, "/home/node/projects/pealim/venv_pdf/lib/python3.11/site-packages")
|
||
# Also need the main venv for pandas
|
||
sys.path.insert(0, "/home/node/projects/pealim/lib/python3.11/site-packages")
|
||
|
||
import fitz
|
||
import pandas as pd
|
||
|
||
# Project root; every data path below is derived from it.
BASE_DIR = "/home/node/projects/pealim"
# Directory holding the input and output data files.
DATA_DIR = os.path.join(BASE_DIR, "data")
# Directory the source PDF books are looked up in (see main()).
EPUBS_DIR = os.path.join(DATA_DIR, "epubs")
# Sentence index JSON that newly extracted sentences are merged into.
SENTENCE_INDEX = os.path.join(DATA_DIR, "epub_sentence_index.json")
# Semicolon-separated vocabulary CSV providing the words to match.
VOCAB_CSV = os.path.join(DATA_DIR, "hebrew_dict_for_anki.csv")
# Output JSON mapping each matched vocab word to its example sentences.
MATCHES_FILE = os.path.join(DATA_DIR, "vocab_sentence_matches.json")

# Code points U+0591-U+05C7: Hebrew cantillation marks and vowel points.
# NOTE(review): the range also contains punctuation such as maqaf (U+05BE)
# and sof pasuq (U+05C3), so those are removed too -- confirm this is wanted.
NIKKUD_RE = re.compile(r"[\u0591-\u05C7]")
# Base Hebrew letters alef..tav (U+05D0-U+05EA), final forms included.
HEBREW_RE = re.compile(r"[\u05d0-\u05ea]")
# Base letters plus the U+FB20-U+FB4F presentation forms.
# Not referenced by any function in this file.
HEBREW_CHAR_RE = re.compile(r"[\u05d0-\u05ea\ufb20-\ufb4f]")
|
||
|
||
|
||
def strip_nikkud(text):
    """Return *text* with every character matched by ``NIKKUD_RE`` deleted."""
    return re.sub(NIKKUD_RE, "", text)
|
||
|
||
|
||
# Hebrew entries of the Unicode "Alphabetic Presentation Forms" block mapped
# to plain base letters.  Final forms must stay final (U+FB3A -> ך,
# U+FB43 -> ף) because collapse_hebrew_spaces() uses final letters as
# word-boundary evidence.
_PRESENTATION_TO_BASE = str.maketrans(
    {
        "\ufb1d": "י",  # yod with hiriq
        "\ufb1f": "יי",  # ligature yod yod patah
        "\ufb20": "ע",  # alternative ayin
        "\ufb21": "א",  # wide alef
        "\ufb22": "ד",  # wide dalet
        "\ufb23": "ה",  # wide he
        "\ufb24": "כ",  # wide kaf
        "\ufb25": "ל",  # wide lamed
        "\ufb26": "ם",  # wide final mem
        "\ufb27": "ר",  # wide resh
        "\ufb28": "ת",  # wide tav
        "\ufb2a": "ש",  # shin with shin dot
        "\ufb2b": "ש",  # shin with sin dot
        "\ufb2c": "ש",  # shin with dagesh and shin dot
        "\ufb2d": "ש",  # shin with dagesh and sin dot
        "\ufb2e": "א",  # alef with patah
        "\ufb2f": "א",  # alef with qamats
        "\ufb30": "א",  # alef with mapiq
        "\ufb31": "ב",  # bet with dagesh
        "\ufb32": "ג",  # gimel with dagesh
        "\ufb33": "ד",  # dalet with dagesh
        "\ufb34": "ה",  # he with mapiq
        "\ufb35": "ו",  # vav with dagesh
        "\ufb36": "ז",  # zayin with dagesh
        "\ufb38": "ט",  # tet with dagesh
        "\ufb39": "י",  # yod with dagesh
        "\ufb3a": "ך",  # FINAL kaf with dagesh
        "\ufb3b": "כ",  # kaf with dagesh
        "\ufb3c": "ל",  # lamed with dagesh
        "\ufb3e": "מ",  # mem with dagesh
        "\ufb40": "נ",  # nun with dagesh
        "\ufb41": "ס",  # samekh with dagesh
        "\ufb43": "ף",  # FINAL pe with dagesh
        "\ufb44": "פ",  # pe with dagesh
        "\ufb46": "צ",  # tsadi with dagesh
        "\ufb47": "ק",  # qof with dagesh
        "\ufb48": "ר",  # resh with dagesh
        "\ufb49": "ש",  # shin with dagesh
        "\ufb4a": "ת",  # tav with dagesh
        "\ufb4b": "ו",  # vav with holam
        "\ufb4c": "ב",  # bet with rafe
        "\ufb4d": "כ",  # kaf with rafe
        "\ufb4e": "פ",  # pe with rafe
        "\ufb4f": "אל",  # ligature alef lamed
    }
)


def collapse_hebrew_spaces(text):
    """Collapse spaces between Hebrew letter fragments (for badly-encoded PDFs).

    The text is first stripped of nikkud, presentation-form letters are
    replaced by their base letters, and runs of spaces are reduced to one.
    Each remaining space is then kept only when it looks like a real word
    boundary, i.e. when at least one of the following holds:

    - it is the first or last character of the text,
    - the character before it is a final-form letter (ם ן ף ך ץ),
    - the character before or after it is not a base Hebrew letter
      (punctuation, digits, Latin text, ...).

    Every other space (between two Hebrew letters, the first of which is not
    a final form) is dropped, so words that do not end in a final-form letter
    are joined to the following word.

    Args:
        text: Raw sentence text as extracted from the PDF.

    Returns:
        The normalised text with leading/trailing whitespace removed.
    """
    stripped = strip_nikkud(text).translate(_PRESENTATION_TO_BASE)
    # After this substitution no two U+0020 characters are adjacent, so the
    # neighbours of any space are guaranteed to be non-space characters.
    stripped = re.sub(r" {2,}", " ", stripped)

    final_forms = set("םןףךץ")
    last_index = len(stripped) - 1
    result = []

    for i, ch in enumerate(stripped):
        if ch != " ":
            result.append(ch)
            continue

        prev_ch = result[-1] if result else None
        next_ch = stripped[i + 1] if i < last_index else None

        is_boundary = (
            prev_ch is None
            or next_ch is None
            or prev_ch in final_forms
            or not HEBREW_RE.match(prev_ch)
            or not HEBREW_RE.match(next_ch)
        )
        if is_boundary:
            result.append(" ")
        # Otherwise the space is an intra-word gap and is skipped.

    return "".join(result).strip()
|
||
|
||
|
||
def _split_page_into_sentences(page_text):
    """Yield non-empty sentence candidates from one page of extracted text."""
    for line in page_text.split("\n"):
        line = line.strip()
        if not line:
            continue
        # Split after sentence-ending punctuation that is followed by whitespace.
        for part in re.split(r"(?<=[.?!])\s+", line):
            part = part.strip()
            if part:
                yield part


def extract_pdf_sentences(pdf_path, book_name):
    """Extract candidate example sentences from a PDF book.

    Each page's text is split into lines and then on sentence-ending
    punctuation.  A candidate is kept when it contains at least one Hebrew
    letter, does not look like copyright/imprint boilerplate, and its
    space-collapsed form has between 4 and 15 Hebrew words.

    Args:
        pdf_path: Path of the PDF file to read.
        book_name: Title stored in the ``"book"`` field of every result.

    Returns:
        A list of dicts with keys ``"text"`` (sentence as extracted),
        ``"book"`` and ``"stripped"`` (output of ``collapse_hebrew_spaces``),
        in document order.
    """
    boilerplate_markers = ("copyright", "©", "isbn", "printed in")
    sentences = []

    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            text = page.get_text()
            if not text.strip():
                continue

            for sent in _split_page_into_sentences(text):
                # Must contain Hebrew letters.  This also rules out bare page
                # numbers, so no separate digits-only check is needed.
                if not HEBREW_RE.search(sent):
                    continue

                # Cheap metadata filter first, before the costlier collapse.
                lowered = sent.lower()
                if any(marker in lowered for marker in boilerplate_markers):
                    continue

                # Nikkud-free version with PDF intra-word gaps collapsed.
                stripped = collapse_hebrew_spaces(sent)
                word_count = sum(1 for w in stripped.split() if HEBREW_RE.search(w))
                if not 4 <= word_count <= 15:
                    continue

                sentences.append(
                    {
                        "text": sent,
                        "book": book_name,
                        "stripped": stripped,
                    }
                )
    finally:
        # Release the file handle even if text extraction fails part-way.
        doc.close()

    return sentences
|
||
|
||
|
||
def has_extractable_text(pdf_path, max_pages=10):
    """Report whether a PDF yields any text from its leading pages.

    Args:
        pdf_path: Path of the PDF file to probe.
        max_pages: Number of pages, counted from the start, to inspect before
            giving up.  Defaults to 10.

    Returns:
        True as soon as one inspected page has non-whitespace text, otherwise
        False.
    """
    doc = fitz.open(pdf_path)
    try:
        pages_to_check = min(len(doc), max_pages)
        return any(doc[i].get_text().strip() for i in range(pages_to_check))
    finally:
        # Close the document even if a page fails to extract.
        doc.close()
|
||
|
||
|
||
def load_sentence_index():
    """Return the parsed ``SENTENCE_INDEX`` JSON, or an empty index if absent.

    When the file does not exist, a fresh ``{"sentences": []}`` dict is
    returned instead.
    """
    if not os.path.exists(SENTENCE_INDEX):
        return {"sentences": []}

    with open(SENTENCE_INDEX, encoding="utf-8") as index_file:
        return json.load(index_file)
|
||
|
||
|
||
def save_sentence_index(data):
    """Write *data* to ``SENTENCE_INDEX`` as indented, non-ASCII-escaped JSON."""
    with open(SENTENCE_INDEX, "w", encoding="utf-8") as index_file:
        json.dump(data, index_file, ensure_ascii=False, indent=2)
|
||
|
||
|
||
def _sentence_length_rank(sd):
    """Sort key: 6-12 words first (shorter first), then <6 (longer first), then >12."""
    wc = sd["word_count"]
    if 6 <= wc <= 12:
        return (0, wc)  # ideal range, prefer shorter
    if wc < 6:
        return (1, -wc)  # too short
    return (2, wc)  # too long


def _has_loose_boundary_match(word, stripped):
    """Return True if some occurrence of *word* has a non-Hebrew neighbour or text edge."""
    start = stripped.find(word)
    while start != -1:
        end = start + len(word)
        before_ok = start == 0 or not HEBREW_RE.match(stripped[start - 1])
        after_ok = end >= len(stripped) or not HEBREW_RE.match(stripped[end])
        if before_ok or after_ok:
            return True
        start = stripped.find(word, start + 1)
    return False


def match_vocab_to_sentences(sentences, vocab_df, pdf_books=("אליס בארץ הפלאות", "האריה שאהב תות")):
    """Match vocab words to sentences.

    A sentence matches a word when its ``"stripped"`` text contains the word
    delimited by whitespace or the text edges.  For words of three or more
    characters a looser substring match is also accepted:

    - for books listed in *pdf_books* any substring occurrence counts (their
      text has collapsed spaces, so boundaries are often missing);
    - for other books at least one side of some occurrence must be a
      non-Hebrew character or a text edge.

    Words shorter than two characters are never matched.  Up to three
    sentences are kept per word, preferring 6-12 word sentences.

    Args:
        sentences: Iterable of dicts with ``"text"`` and ``"book"`` keys and
            an optional ``"stripped"`` key.
        vocab_df: DataFrame with ``"Word"`` (with nikkud) and
            ``"Word Without Nikkud"`` columns.  Rows where either cell is
            missing (NaN) or blank are skipped.
        pdf_books: Book titles for which any substring match is accepted.

    Returns:
        Dict mapping each matched word (without nikkud) to
        ``{"word_nikkud": ..., "sentences": [{"text": ..., "book": ...}, ...]}``.
    """
    matches = {}

    # Build (word_without_nikkud, word_with_nikkud) pairs.
    vocab_words = []
    for _, row in vocab_df.iterrows():
        raw_plain = row.get("Word Without Nikkud", "")
        raw_pointed = row.get("Word", "")
        # Empty CSV cells arrive as NaN; str(NaN) would yield the bogus word "nan".
        if pd.isna(raw_plain) or pd.isna(raw_pointed):
            continue
        word_no_nik = str(raw_plain).strip()
        word_nik = str(raw_pointed).strip()
        if word_no_nik and word_nik:
            vocab_words.append((word_no_nik, word_nik))

    print(f"Matching {len(vocab_words)} vocab words against {len(sentences)} sentences...")

    # Precompute per-sentence data once; it is reused for every vocab word.
    sent_data = []
    for s in sentences:
        stripped = s.get("stripped", "")
        sent_data.append(
            {
                "text": s["text"],
                "book": s["book"],
                "stripped": stripped,
                "word_count": len(stripped.split()),
            }
        )

    matched_count = 0

    for word_no_nik, word_nik in vocab_words:
        if len(word_no_nik) < 2:
            continue

        # Strict match: the word delimited by whitespace or the text edges.
        pattern = re.compile(r"(?:^|\s)" + re.escape(word_no_nik) + r"(?:\s|$)")
        # Loose substring match only for words >= 3 chars to limit false positives.
        use_substring = len(word_no_nik) >= 3

        word_matches = []
        for sd in sent_data:
            stripped = sd["stripped"]
            if pattern.search(stripped):
                word_matches.append(sd)
            elif use_substring and word_no_nik in stripped:
                if sd["book"] in pdf_books or _has_loose_boundary_match(word_no_nik, stripped):
                    word_matches.append(sd)

        if word_matches:
            matched_count += 1
            best = sorted(word_matches, key=_sentence_length_rank)[:3]
            matches[word_no_nik] = {
                "word_nikkud": word_nik,
                "sentences": [{"text": m["text"], "book": m["book"]} for m in best],
            }

    # Guard the percentage against an empty vocabulary.
    pct = 100 * matched_count / len(vocab_words) if vocab_words else 0.0
    print(f"Words with at least 1 match: {matched_count}/{len(vocab_words)} ({pct:.1f}%)")
    return matches
|
||
|
||
|
||
def main():
    """Run the pipeline: extract PDF sentences, merge them into the index, match vocab.

    Reads the PDFs listed below from ``EPUBS_DIR``, appends sentences not yet
    present (by stripped text and book) to the sentence index file, then
    matches the vocabulary CSV against the whole index and writes the result
    to ``MATCHES_FILE``.  Progress and summary statistics are printed.
    """
    # ── Step 1: Extract from PDFs ──
    pdfs = [
        ("alice.pdf", "אליס בארץ הפלאות"),
        ("lion_strawberry.pdf", "האריה שאהב תות"),
    ]

    all_new_sentences = []

    for filename, book_name in pdfs:
        pdf_path = os.path.join(EPUBS_DIR, filename)
        if not os.path.exists(pdf_path):
            print(f"SKIP: {filename} not found")
            continue

        if not has_extractable_text(pdf_path):
            print(f"SKIP: {filename} has no extractable text (likely scanned images)")
            continue

        print(f"Extracting from {filename} ({book_name})...")
        sentences = extract_pdf_sentences(pdf_path, book_name)
        print(f"  Extracted {len(sentences)} sentences")
        all_new_sentences.extend(sentences)

    # ── Step 2: Merge with existing index ──
    index = load_sentence_index()
    # Tolerate an index file that lacks the "sentences" key.
    indexed = index.setdefault("sentences", [])
    existing_count = len(indexed)

    # Deduplicate by (stripped, book)
    existing_keys = {(s.get("stripped", ""), s.get("book", "")) for s in indexed}

    added = 0
    for s in all_new_sentences:
        key = (s["stripped"], s["book"])
        if key not in existing_keys:
            indexed.append(s)
            existing_keys.add(key)
            added += 1

    save_sentence_index(index)
    total = len(indexed)
    print(f"\nSentence index: {existing_count} existing + {added} new = {total} total")

    # ── Per-book stats ──
    book_counts = {}
    for s in indexed:
        book = s.get("book", "unknown")
        book_counts[book] = book_counts.get(book, 0) + 1

    print("\nSentences per book:")
    for book, count in sorted(book_counts.items(), key=lambda x: -x[1]):
        print(f"  {book}: {count}")

    # ── Step 3: Match vocab words to sentences ──
    print(f"\nLoading vocab from {VOCAB_CSV}...")
    vocab_df = pd.read_csv(VOCAB_CSV, sep=";", index_col=0)
    print(f"  {len(vocab_df)} vocab words loaded")

    matches = match_vocab_to_sentences(indexed, vocab_df)

    with open(MATCHES_FILE, "w", encoding="utf-8") as f:
        json.dump(matches, f, ensure_ascii=False, indent=2)

    print(f"\nWrote {len(matches)} word matches to {MATCHES_FILE}")

    # ── Step 4: Summary stats ──
    total_words = len(vocab_df)
    matched_words = len(matches)
    # Guard the percentage against an empty vocabulary file.
    matched_pct = 100 * matched_words / total_words if total_words else 0.0
    print(f"\n{'=' * 50}")
    print("SUMMARY")
    print(f"{'=' * 50}")
    print(f"Total sentences: {total}")
    for book, count in sorted(book_counts.items(), key=lambda x: -x[1]):
        print(f"  {book}: {count}")
    print(f"Total vocab words: {total_words}")
    print(f"Words with sentences: {matched_words} ({matched_pct:.1f}%)")
    print(f"Words without sentences: {total_words - matched_words}")
|
||
|
||
|
||
# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|