feat: Sprint 3 — verb validation, Heebo font, images, card fixes
Verb validation:
- Add validate_verb_list.py: queries pealim.com for all 70 entries in
nevo_typed_verbs_from_modern_hebrew, classifies as OK/3ms/REVIEW/NOT_FOUND,
writes cleaned verbs_input.txt with structured sections and REVIEW flags
- New verbs_input.txt: 51 verified verbs, 15 Pu'al/Huf'al 3ms-past forms,
4 REVIEW entries flagged for manual correction before conjugation re-run
Card fixes (apkg_builder.py):
- Conjugation card: binyan field now in Hebrew (פָּעַל/נִפְעַל etc.) via
BINYAN_TO_HEBREW map; root and binyan on separate lines in CONJ_BACK template
- Vocabulary card: remove "דוגמה:" label (keep right-border quote styling)
- Related-words: "Other" category shown unlabeled (no spurious Hebrew header)
- Frequency: unlisted words (not in 50k corpus) now display "50k+" badge
- Add Image field to VOCAB_MODEL and templates ({{#Image}}<img>{{/Image}})
- Diagnostic logging: unlisted word count and uncategorized related-words count
Hebrew font (Heebo):
- Download Heebo variable font TTF from Google Fonts GitHub → data/fonts/
- Add @font-face declarations to CARD_CSS for both decks
- Bundle _Heebo-Regular.ttf and _Heebo-Bold.ttf in every .apkg via
write_vocab_apkg() / write_conj_apkg() using _font_media_files() helper
Image infrastructure (image_fetch.py):
- New script: fetches Wikipedia pageimages + Wikimedia Commons thumbnails
for concrete Noun-PoS entries (concreteness heuristic: ≤4 words, no
abstract suffixes: -tion/-ity/-ness/-ment/-ance/-ence/-ism/-hood/-ship/-ure/-age)
- Caches results in data/image_cache.json; downloads to data/images/
- Resume-safe; supports --limit/--dry-run/--word flags
Pipeline (run.py):
- Add step_fonts(): downloads Heebo TTF files to data/fonts/ (cached)
- Add step_images(): calls image_fetch.run(), respects --skip-images
- Add --skip-images flag
- Pass image_cache to build_vocab_deck(); add image stats to print_summary()
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
78cc7f0ef1
commit
d9e2533166
8 changed files with 895 additions and 104 deletions
125
apkg_builder.py
125
apkg_builder.py
|
|
@ -33,6 +33,20 @@ OUTPUT_DIR = Path(__file__).parent / "output"
|
|||
VOCAB_APKG = OUTPUT_DIR / "pealim_vocabulary.apkg"
|
||||
CONJ_APKG = OUTPUT_DIR / "pealim_conjugations.apkg"
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# Binyan → Hebrew label mapping (for conjugation card display)
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
BINYAN_TO_HEBREW: dict[str, str] = {
|
||||
"Pa'al": "פָּעַל",
|
||||
"Nif'al": "נִפְעַל",
|
||||
"Pi'el": "פִּעֵל",
|
||||
"Pu'al": "פֻּעַל",
|
||||
"Hitpa'el": "הִתְפַּעֵל",
|
||||
"Hif'il": "הִפְעִיל",
|
||||
"Huf'al": "הֻפְעַל",
|
||||
}
|
||||
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
# PoS → Hebrew label mapping
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
|
@ -60,9 +74,21 @@ POS_CATEGORY_LABELS = {
|
|||
# Shared CSS
|
||||
# ──────────────────────────────────────────────────────────────────────────────
|
||||
|
||||
FONTS_DIR = DATA_DIR / "fonts"
|
||||
|
||||
CARD_CSS = """
|
||||
@font-face {
|
||||
font-family: 'Heebo';
|
||||
src: url('_Heebo-Regular.ttf');
|
||||
font-weight: normal;
|
||||
}
|
||||
@font-face {
|
||||
font-family: 'Heebo';
|
||||
src: url('_Heebo-Bold.ttf');
|
||||
font-weight: bold;
|
||||
}
|
||||
.card {
|
||||
font-family: Arial, sans-serif;
|
||||
font-family: 'Heebo', 'Arial Hebrew', 'David', Arial, sans-serif;
|
||||
font-size: 20px;
|
||||
text-align: center;
|
||||
color: #222;
|
||||
|
|
@ -149,8 +175,8 @@ VOCAB_BACK_HEB = """
|
|||
<div class="sec-label">מילים קשורות:</div>
|
||||
<div class="root-info">{{SharedRoots}}</div>
|
||||
{{/SharedRoots}}
|
||||
{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;"></div>{{/Image}}
|
||||
{{#Example}}
|
||||
<div class="sec-label">דוגמה:</div>
|
||||
<div class="example">{{Example}}</div>
|
||||
{{/Example}}
|
||||
{{#Frequency}}<div class="freq-badge">#{{Frequency}}</div>{{/Frequency}}
|
||||
|
|
@ -168,8 +194,8 @@ VOCAB_BACK_ENG = """
|
|||
{{#WordNoNikkud}}<div class="sec-label">ללא ניקוד: {{WordNoNikkud}}</div>{{/WordNoNikkud}}
|
||||
{{#Root}}<div class="sec-label">שורש: {{Root}}</div>{{/Root}}
|
||||
{{#PoS}}<div class="sec-label">חלק דיבור: {{PoS}}</div>{{/PoS}}
|
||||
{{#Image}}<div><img src="{{Image}}" style="max-height:150px;margin-top:8px;"></div>{{/Image}}
|
||||
{{#Example}}
|
||||
<div class="sec-label">דוגמה:</div>
|
||||
<div class="example">{{Example}}</div>
|
||||
{{/Example}}
|
||||
"""
|
||||
|
|
@ -188,6 +214,7 @@ VOCAB_MODEL = genanki.Model(
|
|||
{"name": "Audio"},
|
||||
{"name": "Example"},
|
||||
{"name": "Frequency"},
|
||||
{"name": "Image"},
|
||||
],
|
||||
templates=[
|
||||
{
|
||||
|
|
@ -219,7 +246,8 @@ CONJ_BACK = """
|
|||
{{FrontSide}}<hr>
|
||||
<div class="hebrew">{{ConjugatedForm}}</div>
|
||||
{{#Audio}}<div>{{Audio}}</div>{{/Audio}}
|
||||
<div class="sec-label">שורש: {{Root}} | בניין: {{Binyan}}</div>
|
||||
<div class="sec-label">שורש: {{Root}}</div>
|
||||
<div class="sec-label">בניין: {{Binyan}}</div>
|
||||
"""
|
||||
|
||||
CONJ_CSS = CARD_CSS
|
||||
|
|
@ -339,6 +367,7 @@ def build_vocab_deck(
|
|||
dict_csv: Path,
|
||||
examples_cache: Optional[dict] = None,
|
||||
freq_cache: Optional[dict] = None,
|
||||
image_cache: Optional[dict] = None,
|
||||
limit: Optional[int] = None,
|
||||
) -> tuple[genanki.Deck, list[Path]]:
|
||||
"""
|
||||
|
|
@ -360,6 +389,18 @@ def build_vocab_deck(
|
|||
|
||||
examples_cache = examples_cache or {}
|
||||
freq_cache = freq_cache or {}
|
||||
image_cache = image_cache or {}
|
||||
|
||||
# Load image cache from disk if not passed in
|
||||
image_cache_path = DATA_DIR / "image_cache.json"
|
||||
if not image_cache and image_cache_path.exists():
|
||||
try:
|
||||
with open(image_cache_path) as _f:
|
||||
image_cache = json.load(_f)
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
images_dir = DATA_DIR / "images"
|
||||
|
||||
# Build word_stripped → pos_category dict for related-words grouping
|
||||
word_to_pos_cat: dict[str, str] = {}
|
||||
|
|
@ -390,7 +431,8 @@ def build_vocab_deck(
|
|||
word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
|
||||
shared_roots = str(row.get("shared roots", row.get("SharedRoots", ""))).strip()
|
||||
tags_str = str(row.get("tags", row.get("Tags", ""))).strip()
|
||||
freq_rank = int(row["_freq_rank"]) if row["_freq_rank"] < 999_999 else ""
|
||||
freq_rank_raw = int(row["_freq_rank"])
|
||||
freq_display = str(freq_rank_raw) if freq_rank_raw < 999_999 else "50k+"
|
||||
|
||||
root = "" if root in ("nan", "None", "-") else root
|
||||
pos_raw = "" if pos_raw in ("nan", "None") else pos_raw
|
||||
|
|
@ -430,12 +472,26 @@ def build_vocab_deck(
|
|||
groups[cat].append(rw)
|
||||
parts = []
|
||||
for cat, words in groups.items():
|
||||
label = POS_CATEGORY_LABELS.get(cat, cat)
|
||||
parts.append(
|
||||
f'<div class="related-group"><b>{label}:</b> {" ".join(words)}</div>'
|
||||
)
|
||||
if cat == "Other":
|
||||
# No label for uncategorized words — just list them plain
|
||||
parts.append(f'<div class="related-group">{" ".join(words)}</div>')
|
||||
else:
|
||||
label = POS_CATEGORY_LABELS.get(cat, cat)
|
||||
parts.append(
|
||||
f'<div class="related-group"><b>{label}:</b> {" ".join(words)}</div>'
|
||||
)
|
||||
related_html = "\n".join(parts)
|
||||
|
||||
# Image: look up by stripped word (no-nikkud)
|
||||
image_filename = image_cache.get(word_no_nik or _strip_nikkud(word), None)
|
||||
image_tag = ""
|
||||
if image_filename:
|
||||
image_path = images_dir / image_filename
|
||||
if image_path.exists():
|
||||
image_tag = image_filename
|
||||
if image_path not in media_files:
|
||||
media_files.append(image_path)
|
||||
|
||||
note = genanki.Note(
|
||||
model=VOCAB_MODEL,
|
||||
fields=[
|
||||
|
|
@ -448,12 +504,23 @@ def build_vocab_deck(
|
|||
tags_str,
|
||||
audio_tag,
|
||||
example_html,
|
||||
str(freq_rank),
|
||||
freq_display,
|
||||
image_tag,
|
||||
],
|
||||
tags=tags_str.split() if tags_str else [],
|
||||
)
|
||||
deck.add_note(note)
|
||||
|
||||
# Diagnostic: count words without PoS coverage in shared_roots
|
||||
other_count = sum(
|
||||
1 for _, row in df.iterrows()
|
||||
for rw in str(row.get("shared roots", row.get("SharedRoots", ""))).split()
|
||||
if str(row.get("shared roots", row.get("SharedRoots", ""))) not in ("nan", "None", "")
|
||||
and word_to_pos_cat.get(_strip_nikkud(rw)) is None
|
||||
)
|
||||
unlisted = sum(1 for v in df["_freq_rank"] if int(v) >= 999_999)
|
||||
logger.info(f" Unlisted words (not in frequency corpus): {unlisted}/{len(df)}")
|
||||
logger.info(f" Related-words without PoS coverage: {other_count} (shown unlabeled)")
|
||||
logger.info(f"Vocabulary deck: {len(deck.notes)} notes")
|
||||
return deck, media_files
|
||||
|
||||
|
|
@ -471,11 +538,12 @@ def build_conj_deck(
|
|||
if not data or not data.get("forms"):
|
||||
continue
|
||||
|
||||
root = data.get("root", "")
|
||||
binyan = data.get("binyan", "")
|
||||
ref_form = data.get("reference_form", infinitive)
|
||||
slug = data.get("slug", "")
|
||||
voice = VOICE_MAP.get(binyan, "")
|
||||
root = data.get("root", "")
|
||||
binyan = data.get("binyan", "")
|
||||
binyan_heb = BINYAN_TO_HEBREW.get(binyan, binyan)
|
||||
ref_form = data.get("reference_form", infinitive)
|
||||
slug = data.get("slug", "")
|
||||
voice = VOICE_MAP.get(binyan, "")
|
||||
forms = data["forms"]
|
||||
|
||||
def add_note(pronoun: str, tense: str, conj_form: str, audio_tag: str) -> None:
|
||||
|
|
@ -491,7 +559,7 @@ def build_conj_deck(
|
|||
tense,
|
||||
conj_form,
|
||||
root,
|
||||
binyan,
|
||||
binyan_heb,
|
||||
voice,
|
||||
audio_tag,
|
||||
],
|
||||
|
|
@ -540,11 +608,12 @@ def build_conj_deck(
|
|||
# Also process passive partner forms if present
|
||||
passive = data.get("passive_partner")
|
||||
if passive and passive.get("forms"):
|
||||
passive_root = passive.get("root", root)
|
||||
passive_binyan = passive.get("binyan", "")
|
||||
passive_ref = passive.get("reference_form", ref_form)
|
||||
passive_voice = VOICE_MAP.get(passive_binyan, "")
|
||||
passive_slug = passive.get("slug", slug)
|
||||
passive_root = passive.get("root", root)
|
||||
passive_binyan = passive.get("binyan", "")
|
||||
passive_binyan_heb = BINYAN_TO_HEBREW.get(passive_binyan, passive_binyan)
|
||||
passive_ref = passive.get("reference_form", ref_form)
|
||||
passive_voice = VOICE_MAP.get(passive_binyan, "")
|
||||
passive_slug = passive.get("slug", slug)
|
||||
|
||||
for form_key, form_data in passive["forms"].items():
|
||||
conj_form = form_data.get("form", "")
|
||||
|
|
@ -573,7 +642,7 @@ def build_conj_deck(
|
|||
tense,
|
||||
conj_form,
|
||||
passive_root,
|
||||
passive_binyan,
|
||||
passive_binyan_heb,
|
||||
passive_voice,
|
||||
audio_tag,
|
||||
],
|
||||
|
|
@ -588,6 +657,12 @@ def build_conj_deck(
|
|||
return deck, media_files
|
||||
|
||||
|
||||
def _font_media_files() -> list[str]:
    """Return the bundled Heebo font files as path strings, for inclusion in an .apkg.

    Looks for ``_Heebo*.ttf`` directly inside ``FONTS_DIR``. The result is sorted
    so the media list is identical on every run; ``Path.glob`` yields entries in
    arbitrary, filesystem-dependent order. Returns an empty list when no matching
    font file is found.
    """
    # is_file() also drops directories and dangling symlinks that match the pattern.
    return sorted(str(p) for p in FONTS_DIR.glob("_Heebo*.ttf") if p.is_file())
|
||||
|
||||
|
||||
def write_vocab_apkg(
|
||||
deck: genanki.Deck,
|
||||
media_files: list[Path],
|
||||
|
|
@ -595,7 +670,7 @@ def write_vocab_apkg(
|
|||
) -> None:
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
pkg = genanki.Package(deck)
|
||||
pkg.media_files = [str(p) for p in media_files if p.exists()]
|
||||
pkg.media_files = [str(p) for p in media_files if p.exists()] + _font_media_files()
|
||||
pkg.write_to_file(str(out_path))
|
||||
logger.info(f"Vocabulary deck written → {out_path}")
|
||||
|
||||
|
|
@ -607,8 +682,8 @@ def write_conj_apkg(
|
|||
) -> None:
|
||||
out_path.parent.mkdir(parents=True, exist_ok=True)
|
||||
pkg = genanki.Package(deck)
|
||||
if media_files:
|
||||
pkg.media_files = [str(p) for p in media_files if p.exists()]
|
||||
base = [str(p) for p in (media_files or []) if p.exists()]
|
||||
pkg.media_files = base + _font_media_files()
|
||||
pkg.write_to_file(str(out_path))
|
||||
logger.info(f"Conjugation deck written → {out_path}")
|
||||
|
||||
|
|
|
|||
BIN
data/fonts/_Heebo-Bold.ttf
Normal file
BIN
data/fonts/_Heebo-Bold.ttf
Normal file
Binary file not shown.
BIN
data/fonts/_Heebo-Regular.ttf
Normal file
BIN
data/fonts/_Heebo-Regular.ttf
Normal file
Binary file not shown.
313
image_fetch.py
Normal file
313
image_fetch.py
Normal file
|
|
@ -0,0 +1,313 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fetch images for concrete Hebrew nouns from Wikipedia / Wikimedia Commons.
|
||||
|
||||
Scope: Noun PoS entries only. Concreteness heuristic:
|
||||
- English meaning has no abstract suffixes (-tion, -ity, -ness, -ment, -ance, -ism, -hood,
|
||||
-ship, -ure, -age, -ence); only the last word of the meaning is checked
|
||||
- Meaning is ≤ 4 words
|
||||
|
||||
Image sources (tried in order):
|
||||
1. Wikipedia page image via pageimages API
|
||||
2. Wikimedia Commons search (first image file result)
|
||||
|
||||
Cache: data/image_cache.json (word_no_nikkud → filename or null)
|
||||
Output: data/images/<safe_name>.jpg
|
||||
|
||||
Usage:
|
||||
python3 image_fetch.py [--limit N] [--word WORD] [--dry-run]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DATA_DIR = Path(__file__).parent / "data"
|
||||
IMAGES_DIR = DATA_DIR / "images"
|
||||
CACHE_PATH = DATA_DIR / "image_cache.json"
|
||||
|
||||
REQUEST_DELAY = 0.5
|
||||
REQUEST_TIMEOUT = 10
|
||||
|
||||
# Abstract noun suffixes — words whose English meaning ends in these are skipped.
# Each suffix appears exactly once; kept as a tuple so it can be handed to str.endswith().
ABSTRACT_SUFFIXES = (
    "tion", "ity", "ness", "ment", "ance", "ence", "ism",
    "hood", "ship", "ure", "age",
)
|
||||
|
||||
session = requests.Session()
|
||||
session.headers.update({
|
||||
"User-Agent": "pealim-anki/3.0 (educational Hebrew Anki deck builder; contact: anki@pealim.invalid)"
|
||||
})
|
||||
|
||||
|
||||
def _strip_nikkud(text: str) -> str:
|
||||
return "".join(
|
||||
ch for ch in unicodedata.normalize("NFD", text)
|
||||
if unicodedata.category(ch) != "Mn"
|
||||
)
|
||||
|
||||
|
||||
def is_concrete(english_meaning: str) -> bool:
    """Return True if the English meaning looks like a concrete noun.

    Heuristic: after lower-casing and dropping one leading article, the meaning
    must contain between one and four whitespace-separated words, and its last
    word must not end in any of ``ABSTRACT_SUFFIXES``. An empty or
    whitespace-only meaning is never concrete.
    """
    meaning = english_meaning.strip().lower()
    # Strip leading article
    meaning = re.sub(r"^(a|an|the)\s+", "", meaning)
    words = meaning.split()
    # No words at all means there is nothing to look an image up for.
    if not words or len(words) > 4:
        return False
    # Trailing punctuation ("happiness.", "friendship;") must not hide a suffix.
    last = words[-1].rstrip(".,;:!?)\"'")
    return not last.endswith(tuple(ABSTRACT_SUFFIXES))
|
||||
|
||||
|
||||
def _safe_name(word_no_nikkud: str) -> str:
    """Build a filename stem from a Hebrew word, keeping only the letters U+05D0–U+05EA.

    Nonspacing marks are stripped first, then every remaining character outside
    that range (spaces, punctuation, Latin letters, digits, …) is removed. The
    stem is therefore made of Hebrew letters, not ASCII. Falls back to
    "unknown" when nothing is left.
    """
    bare = _strip_nikkud(word_no_nikkud)
    letters = re.sub(r"[^\u05d0-\u05ea]", "", bare)
    return letters or "unknown"
|
||||
|
||||
|
||||
def _try_wikipedia(query: str) -> str | None:
    """Ask the English Wikipedia ``pageimages`` API for a thumbnail of the page titled *query*.

    Requests a 200px thumbnail and follows redirects. Returns the thumbnail URL
    of the first returned page that has one, or None when no page has a
    thumbnail. Any exception (network, HTTP status, JSON, unexpected payload) is
    logged at debug level and also reported as None.
    """
    endpoint = "https://en.wikipedia.org/w/api.php"
    request_args = {
        "action": "query",
        "titles": query,
        "prop": "pageimages",
        "format": "json",
        "pithumbsize": 200,
        "redirects": 1,
    }
    try:
        response = session.get(endpoint, params=request_args, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        page_map = response.json().get("query", {}).get("pages", {})
        thumbnails = (
            entry["thumbnail"]["source"]
            for entry in page_map.values()
            if "thumbnail" in entry
        )
        return next(thumbnails, None)
    except Exception as e:
        logger.debug(f"Wikipedia API error for {query!r}: {e}")
    return None
|
||||
|
||||
|
||||
def _try_commons(query: str) -> str | None:
    """Search Wikimedia Commons files for *query* and return a thumbnail URL, or None.

    Two requests are made: a search restricted to namespace 6 that keeps only
    the top hit, then an ``imageinfo`` lookup on that hit asking for a
    200px-wide thumbnail URL. Any exception along the way is logged at debug
    level and reported as None.
    """
    endpoint = "https://commons.wikimedia.org/w/api.php"
    search_args = {
        "action": "query",
        "list": "search",
        "srnamespace": 6,
        "srsearch": query,
        "format": "json",
        "srlimit": 1,
    }
    try:
        search_resp = session.get(endpoint, params=search_args, timeout=REQUEST_TIMEOUT)
        search_resp.raise_for_status()
        matches = search_resp.json().get("query", {}).get("search", [])
        if not matches:
            return None

        # Title of the top hit, e.g. "File:Cat_portrait.jpg"
        top_title = matches[0]["title"]

        # Second request: resolve the file title to a thumbnail URL.
        info_args = {
            "action": "query",
            "titles": top_title,
            "prop": "imageinfo",
            "iiprop": "url",
            "iiurlwidth": 200,
            "format": "json",
        }
        info_resp = session.get(endpoint, params=info_args, timeout=REQUEST_TIMEOUT)
        info_resp.raise_for_status()
        file_pages = info_resp.json().get("query", {}).get("pages", {})
        for entry in file_pages.values():
            details = entry.get("imageinfo", [])
            if details and "thumburl" in details[0]:
                return details[0]["thumburl"]
    except Exception as e:
        logger.debug(f"Commons API error for {query!r}: {e}")
    return None
|
||||
|
||||
|
||||
def _download_image(image_url: str, dest_path: Path) -> bool:
    """Download *image_url* to *dest_path*. Returns True on success, False otherwise.

    The response is rejected when its Content-Type header does not contain
    "image". The body is streamed into a sibling ``<name>.part`` file and only
    renamed onto *dest_path* once complete, so *dest_path* never holds a
    truncated download (callers treat an existing file as already fetched).
    All failures are logged at debug level; none are raised.
    """
    tmp_path = dest_path.with_name(dest_path.name + ".part")
    try:
        # The context manager closes the streamed response on every path,
        # including the early return below, so its connection is not left open.
        with session.get(image_url, timeout=REQUEST_TIMEOUT, stream=True) as resp:
            resp.raise_for_status()
            content_type = resp.headers.get("content-type", "")
            if "image" not in content_type:
                return False
            with open(tmp_path, "wb") as out:
                for chunk in resp.iter_content(chunk_size=65536):
                    out.write(chunk)
        tmp_path.replace(dest_path)
        return True
    except Exception as e:
        logger.debug(f"Download failed {image_url}: {e}")
        # Best-effort removal of the partial file; a failure here is not actionable.
        try:
            tmp_path.unlink(missing_ok=True)
        except OSError:
            pass
        return False
|
||||
|
||||
|
||||
def get_image(english_meaning: str, word_no_nikkud: str) -> str | None:
    """
    Find and download a thumbnail for one word.

    Returns the file name (relative to IMAGES_DIR) or None. The target is always
    IMAGES_DIR/<safe_name>.jpg; the ".jpg" extension is applied regardless of
    what the server actually sends. A file already on disk is reused without
    any network access. REQUEST_DELAY is slept after each lookup request.
    """
    if not is_concrete(english_meaning):
        return None

    dest = IMAGES_DIR / f"{_safe_name(word_no_nikkud)}.jpg"
    if dest.exists():
        return dest.name

    # Search term: lower-cased meaning without a leading article.
    search_term = re.sub(r"^(a|an|the)\s+", "", english_meaning.strip().lower())

    # Wikipedia first; Commons only when Wikipedia produced nothing.
    image_url = _try_wikipedia(search_term)
    time.sleep(REQUEST_DELAY)
    if not image_url:
        image_url = _try_commons(search_term)
        time.sleep(REQUEST_DELAY)
        if not image_url:
            return None

    IMAGES_DIR.mkdir(parents=True, exist_ok=True)
    if not _download_image(image_url, dest):
        return None

    logger.info(f" {word_no_nikkud!r} ({english_meaning!r}) → {dest.name}")
    return dest.name
|
||||
|
||||
|
||||
def load_cache() -> dict:
    """Load the image cache from CACHE_PATH.

    Returns an empty dict when the file does not exist. An unreadable,
    undecodable, or non-JSON-object file also yields an empty dict, but a
    warning is logged first: the next save will overwrite that file, so the
    loss should not go unnoticed.
    """
    try:
        with open(CACHE_PATH, encoding="utf-8") as f:
            cache = json.load(f)
    except FileNotFoundError:
        return {}
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        logger.warning(f"Ignoring unreadable image cache {CACHE_PATH}: {e}")
        return {}
    if not isinstance(cache, dict):
        logger.warning(f"Ignoring image cache {CACHE_PATH}: top-level JSON value is not an object")
        return {}
    return cache
|
||||
|
||||
|
||||
def save_cache(cache: dict) -> None:
    """Write *cache* to CACHE_PATH as pretty-printed, key-sorted UTF-8 JSON.

    The JSON is written to a sibling ``<name>.tmp`` file and then renamed over
    CACHE_PATH, so an interruption mid-write leaves the previous cache intact
    instead of a truncated file.
    """
    CACHE_PATH.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = CACHE_PATH.with_name(CACHE_PATH.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(cache, f, ensure_ascii=False, indent=2, sort_keys=True)
    tmp_path.replace(CACHE_PATH)
|
||||
|
||||
|
||||
def run(limit: int | None = None, dry_run: bool = False, single_word: str | None = None) -> dict:
    """
    Fetch images for all Noun-PoS words in pealim_dict_for_anki.csv.

    Args:
        limit: Stop after this many candidate nouns have been processed
            (None or 0 means no limit). Cached and abstract entries do not count.
        dry_run: Only log which words would be fetched. Nothing is downloaded,
            and the cache is neither modified nor written, so a later real run
            still fetches those words.
        single_word: When given, only rows whose "Word Without Nikkud" equals
            this value are considered.

    Returns:
        The image cache dict (key → filename, or None when no image was
        recorded). Empty when no dictionary CSV is found.
    """
    import pandas as pd

    dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
    if not dict_csv.exists():
        dict_csv = DATA_DIR / "pealim_dict.csv"
    if not dict_csv.exists():
        logger.error("Dictionary CSV not found")
        return {}

    # Try ';' as separator first; if that fails or yields fewer than three
    # columns, re-read with pandas' default separator.
    try:
        df = pd.read_csv(dict_csv, sep=";", index_col=0)
        if df.shape[1] < 3:
            raise ValueError
    except Exception:
        df = pd.read_csv(dict_csv, index_col=0)

    cache = load_cache()
    processed = 0
    hits = 0
    skipped_abstract = 0
    skipped_cached = 0

    for _, row in df.iterrows():
        if limit and processed >= limit:
            break

        word = str(row.get("Word", "")).strip()
        meaning = str(row.get("Meaning", "")).strip()
        word_plain = str(row.get("Word Without Nikkud", "")).strip()
        pos_raw = str(row.get("Part of speech", row.get("Part of Speech", ""))).strip()

        if not word or not meaning or meaning in ("nan", "None"):
            continue
        if "nan" in pos_raw.lower() or "Noun" not in pos_raw:
            continue

        if single_word and word_plain != single_word:
            continue

        cache_key = word_plain or _strip_nikkud(word)

        if cache_key in cache:
            skipped_cached += 1
            continue

        if not is_concrete(meaning):
            skipped_abstract += 1
            if not dry_run:
                cache[cache_key] = None
            continue

        processed += 1
        logger.info(f"[{processed}] {word_plain!r} ({meaning!r}) …")

        if dry_run:
            # Do not record a placeholder: a cached None would make every later
            # real run skip this word as "already cached".
            logger.info(" [dry-run] would fetch image")
            continue

        filename = get_image(meaning, cache_key)
        # NOTE(review): a None result is cached too, so a word whose lookup failed
        # for a transient reason is not retried on later runs — confirm intended.
        cache[cache_key] = filename
        if filename:
            hits += 1

        # Save cache periodically
        if processed % 10 == 0:
            save_cache(cache)

    if not dry_run:
        save_cache(cache)

    logger.info(
        f"Image fetch complete: {hits} found, "
        f"{processed - hits} not found, "
        f"{skipped_abstract} abstract (skipped), "
        f"{skipped_cached} cached"
    )
    return cache
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: parse flags, run the fetch, print a short summary."""
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    parser = argparse.ArgumentParser(description="Fetch images for concrete Hebrew nouns")
    parser.add_argument("--limit", type=int, metavar="N", help="Process at most N nouns (for testing)")
    parser.add_argument("--dry-run", action="store_true", help="Don't download, just check concreteness")
    parser.add_argument("--word", metavar="WORD", help="Fetch image for a specific word (no-nikkud form)")
    opts = parser.parse_args()

    cache = run(limit=opts.limit, dry_run=opts.dry_run, single_word=opts.word)

    # Keep only entries that actually resolved to a file name.
    with_images = [entry for entry in cache.items() if entry[1]]
    print(f"\n{len(with_images)} words with images (of {len(cache)} in cache)")
    sample = with_images[:5]
    if sample:
        print("Sample:", sample)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
70
nevo_typed_verbs_from_modern_hebrew
Normal file
70
nevo_typed_verbs_from_modern_hebrew
Normal file
|
|
@ -0,0 +1,70 @@
|
|||
לשמור
|
||||
ללמוד
|
||||
לאסוף
|
||||
לעבוד
|
||||
לחבוש
|
||||
לאכול
|
||||
לשאול
|
||||
לשלוח
|
||||
לגבוה
|
||||
לשבת
|
||||
לרשת
|
||||
לפול
|
||||
לקום
|
||||
לשים
|
||||
לחון
|
||||
לקרוא
|
||||
לקנות
|
||||
להיבדק
|
||||
להרדם
|
||||
לההרג
|
||||
להחקר
|
||||
להישאר
|
||||
להיפגע
|
||||
להיוולד
|
||||
להנצל
|
||||
להיסוג
|
||||
להימצא
|
||||
להיבנות
|
||||
לדבר
|
||||
לברך
|
||||
לנהל
|
||||
לנצח
|
||||
לקומם
|
||||
למלא
|
||||
לחכות
|
||||
לגלגל
|
||||
בותל
|
||||
תואם
|
||||
קומם
|
||||
דוכא
|
||||
זוכה
|
||||
פורסם
|
||||
להתלבש
|
||||
להסתלק
|
||||
להצטלם
|
||||
להזדקק
|
||||
להתנהג
|
||||
להתלקלח
|
||||
להתקומם
|
||||
להתפלא
|
||||
להתגלות
|
||||
להתקלקל
|
||||
להכניס
|
||||
להעסיק
|
||||
להחליט
|
||||
להבטיח
|
||||
להוריד
|
||||
להפיל
|
||||
להקים
|
||||
להקלל
|
||||
המציא
|
||||
להרשות
|
||||
הוגבל
|
||||
העבר
|
||||
הוזהר
|
||||
הופל
|
||||
הוקם
|
||||
הוחל
|
||||
הוקפא
|
||||
הופנה
|
||||
87
run.py
87
run.py
|
|
@ -10,6 +10,7 @@ Options:
|
|||
--skip-audio Skip audio .mp3 downloads
|
||||
--skip-examples Skip Ben Yehuda example fetching
|
||||
--skip-conjugations Skip verb conjugation extraction
|
||||
--skip-images Skip image fetching for concrete nouns
|
||||
--refresh-examples Force rebuild of Ben Yehuda index (delete old, download nikkud corpus)
|
||||
--test N Process only the first N dictionary words (for quick testing)
|
||||
"""
|
||||
|
|
@ -34,6 +35,7 @@ DATA_DIR = Path(__file__).parent / "data"
|
|||
OUTPUT_DIR = Path(__file__).parent / "output"
|
||||
AUDIO_DIR = DATA_DIR / "audio"
|
||||
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
|
||||
FONTS_DIR = DATA_DIR / "fonts"
|
||||
|
||||
|
||||
def parse_args():
|
||||
|
|
@ -42,6 +44,7 @@ def parse_args():
|
|||
p.add_argument("--skip-audio", action="store_true", help="Skip audio downloads")
|
||||
p.add_argument("--skip-examples", action="store_true", help="Skip Ben Yehuda example lookup")
|
||||
p.add_argument("--skip-conjugations", action="store_true", help="Skip verb conjugation extraction")
|
||||
p.add_argument("--skip-images", action="store_true", help="Skip image fetching")
|
||||
p.add_argument("--refresh-examples", action="store_true", help="Force rebuild of Ben Yehuda index")
|
||||
p.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
|
||||
return p.parse_args()
|
||||
|
|
@ -270,7 +273,77 @@ def step_conj_audio(args, conjugations: dict):
|
|||
)
|
||||
|
||||
|
||||
def step_build_vocab(args, examples_cache: dict, freq_cache: dict):
|
||||
def _select_font_urls(css_text: str) -> dict[str, str]:
    """Map each numeric font-weight declared in *css_text* to one font URL.

    When a weight has several @font-face rules (one per unicode-range subset),
    a rule with no unicode-range, or one whose range mentions the Hebrew block,
    is preferred over the first rule seen.
    """
    chosen: dict[str, tuple[str, bool]] = {}
    for rule in re.findall(r"@font-face\s*\{([^}]*)\}", css_text):
        weight_match = re.search(r"font-weight:\s*(\d+)", rule)
        src_match = re.search(r"src:\s*url\(([^)]+)\)", rule)
        if not weight_match or not src_match:
            continue
        weight = weight_match.group(1)
        url = src_match.group(1).strip("'\"")
        upper_rule = rule.upper()
        # NOTE(review): assumes the Hebrew subset advertises U+0590 or U+05D0 in its
        # unicode-range — confirm against the CSS Google Fonts actually serves.
        covers_hebrew = (
            "UNICODE-RANGE" not in upper_rule
            or "U+0590" in upper_rule
            or "U+05D0" in upper_rule
        )
        if weight not in chosen or (covers_hebrew and not chosen[weight][1]):
            chosen[weight] = (url, covers_hebrew)
    return {weight: url for weight, (url, _) in chosen.items()}


def step_fonts(args):
    """Step 4c — download Heebo font files (one-time, cached).

    Ensures FONTS_DIR contains _Heebo-Regular.ttf (weight 400) and
    _Heebo-Bold.ttf (weight 700). Files already present are left untouched.
    Missing ones are located through the Google Fonts CSS, matched by their
    declared font-weight rather than by position in the stylesheet, and written
    via a temporary file so an interrupted download is not mistaken for a
    cached font. Failures are logged as warnings and never raised. *args* is
    accepted for signature parity with the other steps and is not read.
    """
    FONTS_DIR.mkdir(parents=True, exist_ok=True)
    regular = FONTS_DIR / "_Heebo-Regular.ttf"
    bold = FONTS_DIR / "_Heebo-Bold.ttf"

    if regular.exists() and bold.exists():
        logger.info("[4c] Heebo fonts already cached")
        return

    logger.info("[4c] Downloading Heebo fonts from Google Fonts …")

    # Fetch CSS to get the actual font source URLs
    import requests as _req
    headers = {
        # NOTE(review): the intent is to be served TTF, but a modern browser
        # User-Agent may be served woff2 instead — confirm what this UA receives.
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64) Gecko/20100101 Firefox/120.0"
    }
    css_url = "https://fonts.googleapis.com/css2?family=Heebo:wght@400;700"
    try:
        css_resp = _req.get(css_url, headers=headers, timeout=15)
        css_resp.raise_for_status()
        css_text = css_resp.text

        # One URL per declared weight, Hebrew-capable subset preferred.
        font_urls = _select_font_urls(css_text)
        logger.debug(f" Found {len(font_urls)} font URL(s) in CSS")

        for weight, dest in (("400", regular), ("700", bold)):
            if dest.exists():
                continue
            fu = font_urls.get(weight)
            if fu is None:
                logger.warning(
                    f" No weight-{weight} Heebo face found in Google Fonts CSS; "
                    f"{dest.name} not downloaded"
                )
                continue
            if not fu.split("?")[0].lower().endswith(".ttf"):
                # Still downloaded, as before; the file just may not be TrueType.
                logger.warning(f" {dest.name}: source is not a .ttf file ({fu})")
            fr = _req.get(fu, timeout=15)
            fr.raise_for_status()
            tmp = dest.with_name(dest.name + ".part")
            tmp.write_bytes(fr.content)
            tmp.replace(dest)
            logger.info(f" Downloaded → {dest.name}")

    except Exception as e:
        logger.warning(f" Heebo download failed: {e}")
        logger.warning(" Cards will fall back to Arial Hebrew / David.")
        logger.warning(
            " To install manually: download Heebo-Regular.ttf and Heebo-Bold.ttf "
            "from https://fonts.google.com/specimen/Heebo and rename with _ prefix "
            f"into {FONTS_DIR}"
        )
|
||||
|
||||
|
||||
def step_images(args) -> dict:
    """Step 4d — fetch images for concrete nouns (resume-safe).

    With --skip-images nothing is fetched: the existing data/image_cache.json is
    returned if present, otherwise an empty dict. Without it,
    image_fetch.run() is called with the --test value as its limit, and its
    cache dict is returned.
    """
    if args.skip_images:
        logger.info("[4d] Skipping images (--skip-images)")
        cache_path = DATA_DIR / "image_cache.json"
        if not cache_path.exists():
            return {}
        # image_fetch saves this file as UTF-8 with unescaped Hebrew keys, so it
        # must be read as UTF-8 rather than with the platform default encoding.
        with open(cache_path, encoding="utf-8") as f:
            return json.load(f)

    limit = args.test  # When in test mode, limit images too
    logger.info("[4d] Fetching images for concrete nouns …")
    import image_fetch
    return image_fetch.run(limit=limit)
|
||||
|
||||
|
||||
def step_build_vocab(args, examples_cache: dict, freq_cache: dict, image_cache: dict = None):
|
||||
"""Step 5 — build vocabulary .apkg."""
|
||||
logger.info("[5] Building vocabulary deck …")
|
||||
import apkg_builder
|
||||
|
|
@ -283,6 +356,7 @@ def step_build_vocab(args, examples_cache: dict, freq_cache: dict):
|
|||
dict_csv,
|
||||
examples_cache=examples_cache,
|
||||
freq_cache=freq_cache,
|
||||
image_cache=image_cache or {},
|
||||
limit=args.test,
|
||||
)
|
||||
apkg_builder.write_vocab_apkg(deck, media)
|
||||
|
|
@ -349,6 +423,13 @@ def print_summary(args, examples_cache, freq_cache, conjugations):
|
|||
mp3s = list(AUDIO_CONJ_DIR.glob("*.mp3"))
|
||||
logger.info(f" Conjugation audio files: {len(mp3s)}")
|
||||
|
||||
image_cache_path = DATA_DIR / "image_cache.json"
|
||||
if image_cache_path.exists():
|
||||
with open(image_cache_path) as f:
|
||||
ic = json.load(f)
|
||||
found_imgs = sum(1 for v in ic.values() if v)
|
||||
logger.info(f" Images: {found_imgs}/{len(ic)} nouns with images")
|
||||
|
||||
vocab_apkg = OUTPUT_DIR / "pealim_vocabulary.apkg"
|
||||
conj_apkg = OUTPUT_DIR / "pealim_conjugations.apkg"
|
||||
if vocab_apkg.exists():
|
||||
|
|
@ -380,7 +461,9 @@ def main():
|
|||
freq_cache = step_frequency()
|
||||
examples_cache = step_examples(args, freq_cache)
|
||||
step_audio(args)
|
||||
step_build_vocab(args, examples_cache, freq_cache)
|
||||
step_fonts(args)
|
||||
image_cache = step_images(args)
|
||||
step_build_vocab(args, examples_cache, freq_cache, image_cache)
|
||||
conjugations = step_conjugations(args)
|
||||
|
||||
print_summary(args, examples_cache, freq_cache, conjugations or {})
|
||||
|
|
|
|||
251
validate_verb_list.py
Normal file
251
validate_verb_list.py
Normal file
|
|
@ -0,0 +1,251 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validate nevo_typed_verbs_from_modern_hebrew against pealim.com.
|
||||
|
||||
For each verb:
|
||||
1. Classifies it by position in the file (Pa'al/Nif'al/Pi'el/Pu'al/Hitpa'el/Hif'il/Huf'al)
|
||||
2. Searches pealim.com to find URL slug
|
||||
3. Fetches the page to confirm the binyan
|
||||
4. Flags known-problem entries and detects: not-found, binyan mismatch, suspected typos
|
||||
|
||||
Output:
|
||||
verbs_input.txt — cleaned verb list for conjugation_extract.py
|
||||
Printed validation report table
|
||||
|
||||
Usage:
|
||||
python3 validate_verb_list.py
|
||||
|
||||
After running, review verbs_input.txt (especially REVIEW-flagged entries) before
|
||||
running conjugation extraction.
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Root URL of pealim.com; the search and dictionary URLs are built from it.
PEALIM_BASE = "https://www.pealim.com"
# Seconds slept before each HTTP request issued from main().
REQUEST_DELAY = 1.5
# Timeout (seconds) passed to every session.get() call.
REQUEST_TIMEOUT = 15
# Input list and generated output both live next to this script.
SOURCE_FILE = Path(__file__).parent / "nevo_typed_verbs_from_modern_hebrew"
OUTPUT_FILE = Path(__file__).parent / "verbs_input.txt"

# Known problem entries: word → (action, note)
# action: "REVIEW" = comment out and flag, "3ms" = treat as 3ms past form
KNOWN_ISSUES: dict[str, tuple[str, str]] = {
    "לגבוה": ("REVIEW", "not a standard infinitive form; likely defective spelling or wrong word"),
    "לההרג": ("REVIEW", "extra ה; should probably be להיהרג (Nif'al of הרג)"),
    "להתלקלח": ("REVIEW", "not a real word; likely typo for להתקלקל"),
    "להקלל": ("REVIEW", "ambiguous: could be Hif'il לְהָקֵל (to ease) or Nif'al of קלל"),
    "המציא": ("3ms", "Hif'il 3ms past form, not an infinitive"),
    "קומם": ("3ms", "ambiguous: Pu'al 3ms past; Pi'el infinitive is לְקוֹמֵם"),
}

# Expected binyan by line range (1-indexed) per plan analysis
# Each range is half-open, e.g. range(1, 18) covers lines 1-17; together they span lines 1-70.
LINE_RANGES: list[tuple[range, str]] = [
    (range(1, 18), "Pa'al"),
    (range(18, 29), "Nif'al"),
    (range(29, 37), "Pi'el"),
    (range(37, 43), "Pu'al"),
    (range(43, 53), "Hitpa'el"),
    (range(53, 63), "Hif'il"),
    (range(63, 71), "Huf'al"),
]

# Comment header written above each binyan section of verbs_input.txt.
# main() iterates this dict, so its insertion order is the section order in the output.
SECTION_HEADERS: dict[str, str] = {
    "Pa'al": "# Pa'al (פָּעַל)",
    "Nif'al": "# Nif'al (נִפְעַל)",
    "Pi'el": "# Pi'el (פִּעֵל)",
    "Pu'al": "# Pu'al (פֻּעַל) — 3ms past, no infinitive",
    "Hitpa'el": "# Hitpa'el (הִתְפַּעֵל)",
    "Hif'il": "# Hif'il (הִפְעִיל)",
    "Huf'al": "# Huf'al (הֻפְעַל) — 3ms past, no infinitive",
}

# One HTTP session shared by find_slug() and get_page_binyan(), with a fixed User-Agent header.
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki/3.0)"})
|
||||
|
||||
|
||||
def classify_by_line(line_num: int) -> str:
    """Map a 1-indexed source line number to the binyan expected at that position.

    LINE_RANGES is scanned in order and the binyan of the first range that
    contains ``line_num`` is returned; ``"Unknown"`` is returned when no range
    contains it.
    """
    return next(
        (name for span, name in LINE_RANGES if line_num in span),
        "Unknown",
    )
|
||||
|
||||
|
||||
def find_slug(query: str) -> str | None:
    """Search pealim.com for *query* and return the first dictionary slug found.

    Args:
        query: The verb form to search for; it is URL-quoted before being sent.

    Returns:
        The first ``<digits>-<name>`` slug appearing in a ``/dict/<slug>/`` link
        of the search response, or ``None`` when the response contains no such
        link or the HTTP request failed (the failure is reported on stderr).
    """
    url = f"{PEALIM_BASE}/search/?q={urllib.parse.quote(query)}"
    # Only network/HTTP failures are treated as "no result"; any other exception
    # is a programming error and is allowed to propagate instead of being
    # silently reported as a missing verb.
    try:
        resp = session.get(url, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f" ERROR searching {query!r}: {e}", file=sys.stderr)
        return None

    # Only the first match is needed, so stop scanning at the first hit.
    match = re.search(r"/dict/(\d+-[^/?\"<>\s]+)/", resp.text)
    return match.group(1) if match else None
|
||||
|
||||
|
||||
def get_page_binyan(slug: str) -> str:
    """Fetch /dict/<slug>/ and extract the binyan name from the page.

    The ``h3.page-header`` elements are checked first; if none of them mentions
    a binyan, the ``og:description`` meta tag is checked.

    Args:
        slug: Dictionary slug as returned by ``find_slug``.

    Returns:
        The first binyan name (in the fixed order Pa'al … Huf'al) found in the
        page, or ``""`` when none is found or the HTTP request failed (the
        failure is reported on stderr).
    """
    url = f"{PEALIM_BASE}/dict/{slug}/"
    # Only network/HTTP failures map to "". Parsing problems (for example a
    # missing "lxml" parser) must surface: swallowing them would yield "" for
    # every verb, which the caller cannot distinguish from "no mismatch".
    try:
        # NOTE(review): the "hebstyle" cookie presumably selects a display style
        # on pealim.com — confirm what "mo" means.
        resp = session.get(url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f" ERROR fetching {slug}: {e}", file=sys.stderr)
        return ""

    soup = BeautifulSoup(resp.text, "lxml")
    binyan_names = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al")

    def first_binyan_in(text: str) -> str:
        # Case-sensitive substring match, in the fixed order of binyan_names.
        return next((name for name in binyan_names if name in text), "")

    for h3 in soup.find_all("h3", class_="page-header"):
        found = first_binyan_in(h3.get_text(" ", strip=True))
        if found:
            return found

    # Fallback: the Open Graph description of the page.
    meta = soup.find("meta", {"property": "og:description"})
    if meta:
        return first_binyan_in(meta.get("content", ""))
    return ""
|
||||
|
||||
|
||||
def _determine_status(
    issue_type: str | None,
    issue_note: str,
    is_3ms_by_position: bool,
    slug: str | None,
    page_binyan: str,
    expected_binyan: str,
) -> tuple[str, str]:
    """Return the (status, notes) pair for an entry that was looked up on pealim.com."""
    if issue_type == "3ms" or is_3ms_by_position:
        return "3ms", issue_note or "Pu'al/Huf'al 3ms past form"
    if not slug:
        # find_slug() also returns None when the request itself failed, so this
        # status can reflect a network error as well as a genuinely missing verb.
        return "NOT_FOUND", "no search result on pealim.com"
    if page_binyan and expected_binyan and page_binyan != expected_binyan:
        return "MISMATCH", f"expected {expected_binyan}, page says {page_binyan}"
    return "OK", ""


def _validate_entries(words: list[str]) -> list[dict]:
    """Look up every word on pealim.com, printing progress, and return one result dict per word."""
    results: list[dict] = []
    total = len(words)

    for line_num, word in enumerate(words, start=1):
        expected_binyan = classify_by_line(line_num)
        issue_type, issue_note = KNOWN_ISSUES.get(word, (None, ""))

        # Positions 37-42 (Pu'al) and 63-70 (Huf'al) are 3ms past forms
        is_3ms_by_position = expected_binyan in ("Pu'al", "Huf'al")

        print(f"[{line_num:2d}/{total}] {word:<20}", end=" ", flush=True)

        if issue_type == "REVIEW":
            # Don't query pealim for known-bad entries
            print("REVIEW (skipping query)")
            results.append({
                "line": line_num, "word": word,
                "expected_binyan": expected_binyan,
                "slug": "", "page_binyan": "",
                "status": "REVIEW", "notes": issue_note,
                "is_3ms": is_3ms_by_position,
            })
            continue

        # One delay before each HTTP request (search, then dictionary page).
        time.sleep(REQUEST_DELAY)
        slug = find_slug(word)

        if slug:
            time.sleep(REQUEST_DELAY)
            page_binyan = get_page_binyan(slug)
        else:
            page_binyan = ""

        status, notes = _determine_status(
            issue_type, issue_note, is_3ms_by_position, slug, page_binyan, expected_binyan
        )

        print(f"{status:<12} slug={slug or '-':<35} binyan={page_binyan or '-'}")
        results.append({
            "line": line_num, "word": word,
            "expected_binyan": expected_binyan,
            "slug": slug or "", "page_binyan": page_binyan,
            "status": status, "notes": notes,
            "is_3ms": is_3ms_by_position or issue_type == "3ms",
        })

    return results


def _build_output_lines(results: list[dict]) -> list[str]:
    """Render the validation results as the lines of verbs_input.txt."""
    sections: dict[str, list[str]] = {b: [] for b in SECTION_HEADERS}
    review_lines: list[str] = []

    for r in results:
        b = r["expected_binyan"]
        if b not in sections:
            # Entries outside every known line range ("Unknown") are filed
            # under the first section.
            b = next(iter(sections))

        status = r["status"]
        if status == "REVIEW":
            review_lines.append(f"# REVIEW: {r['word']} — {r['notes']}")
        elif status == "3ms":
            sections[b].append(f"# 3ms: {r['word']}")
        elif status in ("OK", "MISMATCH"):
            # MISMATCH entries stay active in the list; they are only flagged in the report.
            sections[b].append(r["word"])
        else:  # NOT_FOUND
            sections[b].append(f"# NOT_FOUND: {r['word']} — {r['notes']}")

    output_lines = [
        "# Verb list — validated against pealim.com from nevo_typed_verbs_from_modern_hebrew",
        "# Lines prefixed '# 3ms:' are searched by 3ms past form (Pu'al/Huf'al).",
        "# Lines prefixed '# REVIEW:' need manual correction before conjugation extraction.",
        "# Lines prefixed '# NOT_FOUND:' had no pealim.com result — check spelling.",
        "",
    ]
    for binyan, header in SECTION_HEADERS.items():
        if sections.get(binyan):
            output_lines.append(header)
            output_lines.extend(sections[binyan])
            output_lines.append("")

    if review_lines:
        output_lines.append("# ── Entries flagged for manual review ──────────────────────────────────────────")
        output_lines.extend(review_lines)
        output_lines.append("")

    return output_lines


def _print_report(results: list[dict]):
    """Print the validation table and summary line; return the per-status counts."""
    from collections import Counter

    print("\n" + "=" * 95)
    print("VALIDATION REPORT")
    print("=" * 95)
    print(f"{'#':>4} {'Verb':<22} {'Status':<14} {'Slug':<38} {'Binyan':<12} Notes")
    print("-" * 95)
    for r in results:
        print(
            f"{r['line']:>4} {r['word']:<22} {r['status']:<14} "
            f"{r['slug'][:36]:<38} {r['page_binyan'] or '-':<12} {r['notes']}"
        )
    print("=" * 95)

    # A Counter yields 0 for statuses that never occurred, in a single pass.
    counts = Counter(r["status"] for r in results)
    print(
        f"\nSummary: {counts['OK']} OK | {counts['3ms']} 3ms-past | "
        f"{counts['MISMATCH']} MISMATCH | {counts['REVIEW']} REVIEW | {counts['NOT_FOUND']} NOT_FOUND"
    )
    print(f"Total entries: {len(results)}")
    return counts


def main() -> None:
    """Validate the source verb list against pealim.com and write verbs_input.txt.

    Reads the non-blank lines of SOURCE_FILE, looks each one up (skipping
    entries marked REVIEW in KNOWN_ISSUES), writes the cleaned list to
    OUTPUT_FILE and prints a validation report. Exits with status 1 when
    SOURCE_FILE does not exist.
    """
    if not SOURCE_FILE.exists():
        print(f"ERROR: {SOURCE_FILE} not found", file=sys.stderr)
        sys.exit(1)

    # Blank lines are dropped, so "line numbers" below count non-blank entries.
    raw_lines = SOURCE_FILE.read_text(encoding="utf-8").splitlines()
    words = [stripped for raw in raw_lines if (stripped := raw.strip())]
    print(f"Loaded {len(words)} entries from {SOURCE_FILE.name}")
    print(f"Querying pealim.com (delay {REQUEST_DELAY}s per request)…\n")

    results = _validate_entries(words)

    # ── Write cleaned verbs_input.txt ────────────────────────────────────────────
    OUTPUT_FILE.write_text("\n".join(_build_output_lines(results)), encoding="utf-8")
    print(f"\nWrote → {OUTPUT_FILE}")

    # ── Print summary table ──────────────────────────────────────────────────────
    counts = _print_report(results)

    if counts["REVIEW"] > 0 or counts["NOT_FOUND"] > 0 or counts["MISMATCH"] > 0:
        print(
            "\n⚠ Review flagged entries in verbs_input.txt before running:\n"
            " python3 conjugation_extract.py"
        )
|
||||
|
||||
|
||||
# Run the validation only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|
||||
153
verbs_input.txt
153
verbs_input.txt
|
|
@ -1,91 +1,90 @@
|
|||
# Verb list from Coffin & Bolozky, A Reference Grammar of Modern Hebrew (2005), Appendix 1.
|
||||
# Citation: Coffin, Edna Amir and Shmuel Bolozky. A Reference Grammar of Modern Hebrew.
|
||||
# Cambridge University Press, 2005.
|
||||
# Lines prefixed '# 3ms:' are Pu'al/Huf'al verbs searched by 3ms past form.
|
||||
# Verb list — validated against pealim.com from nevo_typed_verbs_from_modern_hebrew
|
||||
# Lines prefixed '# 3ms:' are searched by 3ms past form (Pu'al/Huf'al).
|
||||
# Lines prefixed '# REVIEW:' need manual correction before conjugation extraction.
|
||||
# Lines prefixed '# NOT_FOUND:' had no pealim.com result — check spelling.
|
||||
|
||||
# Pa'al (פָּעַל)
|
||||
לָלֶכֶת
|
||||
לָבוֹא
|
||||
לָשֶׁבֶת
|
||||
לָקוּם
|
||||
לָשִׂים
|
||||
לָדַעַת
|
||||
לִרְאוֹת
|
||||
לוֹמַר
|
||||
לַעֲשׂוֹת
|
||||
לִתֵּן
|
||||
לִקְחַת
|
||||
לֶאֱכֹל
|
||||
לִשְׁתּוֹת
|
||||
לִכְתּוֹב
|
||||
לִקְרוֹא
|
||||
לִשְׁמוֹר
|
||||
לִשְׁמֹעַ
|
||||
לִפְתּוֹחַ
|
||||
לִסְגּוֹר
|
||||
לִנְסוֹעַ
|
||||
לִרְכּוֹב
|
||||
לִשְׁכַּב
|
||||
לַחְשׁוֹב
|
||||
לִבְכּוֹת
|
||||
לָרוּץ
|
||||
לִשְׁאֹל
|
||||
לַעֲנוֹת
|
||||
לִמְכּוֹר
|
||||
לִקְנוֹת
|
||||
לִלְמֹד
|
||||
לשמור
|
||||
ללמוד
|
||||
לאסוף
|
||||
לעבוד
|
||||
לחבוש
|
||||
לאכול
|
||||
לשאול
|
||||
לשלוח
|
||||
לשבת
|
||||
לרשת
|
||||
לפול
|
||||
לקום
|
||||
לשים
|
||||
לחון
|
||||
לקרוא
|
||||
לקנות
|
||||
|
||||
# Nif'al (נִפְעַל)
|
||||
לְהִכָּנֵס
|
||||
לְהִפָּתַח
|
||||
לְהִסָּגֵר
|
||||
לְהִשָּׁמֵר
|
||||
לְהִמָּצֵא
|
||||
לְהִרְאוֹת
|
||||
לְהִכָּתֵב
|
||||
לְהִשָּׁבֵר
|
||||
להיבדק
|
||||
להרדם
|
||||
להחקר
|
||||
להישאר
|
||||
להיפגע
|
||||
להיוולד
|
||||
להנצל
|
||||
להיסוג
|
||||
להימצא
|
||||
להיבנות
|
||||
|
||||
# Pi'el (פִּעֵל)
|
||||
לְדַבֵּר
|
||||
לְסַפֵּר
|
||||
לְבַקֵּשׁ
|
||||
לְקַבֵּל
|
||||
לְשַׁלֵּם
|
||||
לְצַלֵּם
|
||||
לְנַסּוֹת
|
||||
לְחַכּוֹת
|
||||
לְטַלְפֵן
|
||||
לְבַשֵּׁל
|
||||
לדבר
|
||||
לברך
|
||||
לנהל
|
||||
לנצח
|
||||
לקומם
|
||||
למלא
|
||||
לחכות
|
||||
לגלגל
|
||||
|
||||
# Pu'al (פֻּעַל) — 3ms past, no infinitive
|
||||
# 3ms: דֻּבַּר
|
||||
# 3ms: סֻפַּר
|
||||
# 3ms: בֻּקַּשׁ
|
||||
# 3ms: קֻבַּל
|
||||
# 3ms: בותל
|
||||
# 3ms: תואם
|
||||
# 3ms: קומם
|
||||
# 3ms: דוכא
|
||||
# 3ms: זוכה
|
||||
# 3ms: פורסם
|
||||
|
||||
# Hitpa'el (הִתְפַּעֵל)
|
||||
לְהִתְלַבֵּשׁ
|
||||
לְהִתְרַחֵץ
|
||||
לְהִתְנַהֵג
|
||||
לְהִתְחַתֵּן
|
||||
לְהִתְגּוֹרֵר
|
||||
לְהִתְכּוֹנֵן
|
||||
לְהִתְחִיל
|
||||
להתלבש
|
||||
להסתלק
|
||||
להצטלם
|
||||
להזדקק
|
||||
להתנהג
|
||||
להתקומם
|
||||
להתפלא
|
||||
להתגלות
|
||||
להתקלקל
|
||||
|
||||
# Hif'il (הִפְעִיל)
|
||||
לְהַגִּיד
|
||||
לְהַבִּין
|
||||
לְהַכִּיר
|
||||
לְהַרְגִּישׁ
|
||||
לְהַחְלִיט
|
||||
לְהַתְחִיל
|
||||
לְהַכְנִיס
|
||||
לְהוֹצִיא
|
||||
לְהוֹרִיד
|
||||
לְהַעְלוֹת
|
||||
להכניס
|
||||
להעסיק
|
||||
להחליט
|
||||
להבטיח
|
||||
להוריד
|
||||
להפיל
|
||||
להקים
|
||||
# 3ms: המציא
|
||||
להרשות
|
||||
|
||||
# Huf'al (הֻפְעַל) — 3ms past, no infinitive
|
||||
# 3ms: הוּגַד
|
||||
# 3ms: הוּבַן
|
||||
# 3ms: הוּכְנַס
|
||||
# 3ms: הוּצָא
|
||||
# 3ms: הוגבל
|
||||
# 3ms: העבר
|
||||
# 3ms: הוזהר
|
||||
# 3ms: הופל
|
||||
# 3ms: הוקם
|
||||
# 3ms: הוחל
|
||||
# 3ms: הוקפא
|
||||
# 3ms: הופנה
|
||||
|
||||
# ── Entries flagged for manual review ──────────────────────────────────────────
|
||||
# REVIEW: לגבוה — not a standard infinitive form; likely defective spelling or wrong word
|
||||
# REVIEW: לההרג — extra ה; should probably be להיהרג (Nif'al of הרג)
|
||||
# REVIEW: להתלקלח — not a real word; likely typo for להתקלקל
|
||||
# REVIEW: להקלל — ambiguous: could be Hif'il לְהָקֵל (to ease) or Nif'al of קלל
|
||||
|
|
|
|||
Loading…
Reference in a new issue