Sprint 11: unified JSON architecture + consolidated scraping pipeline

Migrate from fragmented CSV + 10 JSON files to a single data/words.json
(9,104 entries) as the unified data store. All GUIDs preserved for Anki
study progress continuity.

New files:
- SCHEMA.yaml: authoritative schema for words.json
- pealim_list_scrape.py: consolidated list page scraper → words.json
- pealim_detail_scrape.py: noun/verb detail scraper → words.json
- pealim_audio_download.py: audio downloader reading from words.json
- scripts/migrate_to_json.py: one-time CSV→JSON migration
- scripts/validate_data.py: 17 data integrity tests
- scripts/check_guid_coverage.py: GUID preservation checker
- scripts/repair_slugs.py: slug deduplication repair tool
- tests/test_scraper_integration.py: live scraper integration tests

Updated:
- apkg_builder.py: reads from words.json (no more pandas)
- run.py: 8-step pipeline (list scrape → frequency → examples →
  detail scrape → audio download → fonts → images → build)
- benyehuda.py, frequency_lookup.py, image_fetch.py: TODO markers
  for future words.json integration

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Sochen 2026-03-08 10:54:58 +00:00
parent 2e48109d7f
commit 08fb7009d8
20 changed files with 561420 additions and 10124 deletions

26
.claude/settings.json Normal file
View file

@ -0,0 +1,26 @@
{
"hooks": {
"PostToolUse": [
{
"matcher": "Edit|Write",
"hooks": [
{
"type": "command",
"command": "file=\"$CLAUDE_FILE_PATH\"; if [ -n \"$file\" ] && echo \"$file\" | grep -q '\\.py$'; then ruff format --quiet \"$file\" && ruff check --fix --quiet \"$file\" 2>/dev/null; fi"
}
]
}
],
"PreToolUse": [
{
"matcher": "Edit|Write",
"hooks": [
{
"type": "command",
"command": "file=\"$CLAUDE_FILE_PATH\"; if echo \"$file\" | grep -qE '(legacy_guid_map\\.json|\\.env)$'; then echo 'BLOCKED: Protected file — legacy_guid_map.json and .env are read-only' >&2; exit 2; fi"
}
]
}
]
}
}

View file

@ -56,7 +56,7 @@ Fields on each card:
| Plural form | for nouns: רבים: שֻׁלְחָנוֹת |
| Disambiguation hint | for ambiguous Eng→Heb cards |
Cards are presented in **frequency order** — Anki will show you the most common words first.
Cards are presented in **frequency order** — Anki will show you the most common words first. Note that the frequency data is collected from text without nikkud, so words that share the same letters but differ in nikkud are assigned the same frequency.
### Eng→Heb disambiguation

148
SCHEMA.yaml Normal file
View file

@ -0,0 +1,148 @@
# Hebrew Flash Cards — Unified Data Schema (words.json)
# Revised based on Nevo's feedback (2026-03-08)
#
# Top-level: dict keyed by unique_key
# Unique key: nikkud word for most entries (e.g. "אָב")
# For 146 homographs (same nikkud, different meaning): "word|pos" e.g. "אָח|Noun"
# For same nikkud AND same pos: "word|pos|meaning" e.g. "אָח|Noun|brother"
#
# Hebrew text fields use nikkud/ktiv_male subfields:
# field:
# nikkud: "אָב" # with nikkud (hebstyle=mo)
# ktiv_male: "אב" # plene spelling (hebstyle=vl)
# This pattern applies to: word, singular, plural, construct forms, conjugated forms, etc.
#
# Pronoun notation for conjugation forms uses grammatical codes:
# 1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp
# (not Hebrew pronoun strings, which are ambiguous for gender in some persons)
entry:
# --- Core Identity ---
word:
nikkud: "אָב"
ktiv_male: "אב"
slug: "6009-av" # Pealim URL slug (e.g. pealim.com/dict/6009-av/)
root: ["א", "ב"] # Shoresh as list of consonant chars
pos: "Noun" # Part of speech in English (as from pealim)
pos_hebrew: "שֵׁם עֶצֶם" # Part of speech in Hebrew (with nikkud)
meaning: "father" # English meaning (cleaned — no inline emoji, no Hebrew prepositions)
meaning_raw: "father 👨" # Original meaning as scraped (may contain emoji and/or Hebrew preps)
audio_url: "https://..." # Pealim audio URL
audio_file: "6009-av.mp3" # Local filename (slug-based for confusables, consonant-based otherwise)
tags: "" # Pealim tags if any
last_scrape_date: "2026-03-08" # ISO date of most recent pealim.com scrape for this entry
# --- Identity & Progress ---
vocab_legacy_guid: "abc123..." # Vocab note GUID from legacy_guid_map.json
# Other note GUIDs stored in their respective sections (cloze, plurals, conjugation)
# --- Frequency ---
frequency: 412 # Hebrew frequency rank from hermitdave/FrequencyWords he_50k (ktiv male based)
pseudo_frequency: null # Adjusted frequency for confusable homographs (deferred to future sprint)
# --- Display Enrichment ---
emoji: "👨"
emoji_source: "ai_vetted" # One of: ai_vetted, from_pealim, null
emoji_visible: false # Whether to show on cards (false until emoji vetting is done)
image: "father.jpg" # Wikipedia/Commons image filename, or null
image_source: "wikipedia" # One of: wikipedia, commons, null
hint: "" # Eng→Heb disambiguation hint (from refined_meanings.json)
# --- Shared Roots ---
shared_roots: [] # List of unique_keys of other words sharing the same root
# Computed by iterating all entries and grouping by root
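# --- Confusables ---
confusable_group: null # List of unique_keys sharing same ktiv_male, or null
# e.g. ["אָח|Noun|brother", "אָח|Noun|fireplace"]
# NOTE: pealim_list_scrape.py also initialises a confusables_guid field (default null) on every
# entry and lists it among its protected fields; that field is not yet described in this schema.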
# --- Example Sentences ---
examples:
vetted: # AI-vetted sentences from Ben Yehuda / EPUB corpus
- text: "הָאָב הָלַךְ לַעֲבוֹדָה"
source: "ben_yehuda" # One of: ben_yehuda, epub_little_prince, epub_alice, ...
vetted: true
cloze: # Best sentence for cloze card, or null
text: "הָאָב הָלַךְ לַעֲבוֹדָה"
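cloze_word_start: 0 # 0-based character offset of the clozed word's first character in text
cloze_word_end: 4 # End offset — enables exact extraction regardless of nikkud changes. In this example "הָאָב" spans 5 code points (nikkud included), so 0–4 reads as an inclusive range; TODO confirm inclusive vs. exclusive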
cloze_hint: "family member"
cloze_guid: "def456..." # GUID for the cloze note
rejected_count: 0
# --- Noun-specific: Inflection Forms ---
noun_inflection: null # null for non-nouns
# When populated:
# plurals_guid: "ghi789..." # GUID for plurals deck note
# singular: # null if noun is inherently plural (e.g. bicycle/אופניים)
# nikkud: "אָב"
# ktiv_male: "אב"
# plural:
# nikkud: "אָבוֹת"
# ktiv_male: "אבות"
# singular_audio: "6009-av.mp3"
# plural_audio: null # TODO: scrape from detail pages
# construct_singular:
# nikkud: "אֲבִי"
# ktiv_male: "אבי"
# construct_plural:
# nikkud: "אֲבוֹת"
# ktiv_male: "אבות"
# pronominal_suffixes: # Scraped from pealim "forms with pronominal affixes" section
# 1s:
# nikkud: "אָבִי"
# ktiv_male: "אבי"
# 1p:
# nikkud: "אָבִינוּ"
# ktiv_male: "אבינו"
# 2ms: ...
# 2fs: ...
# 2mp: ...
# 2fp: ...
# 3ms: ...
# 3fs: ...
# 3mp: ...
# 3fp: ...
# gender: "masculine"
# gender_hebrew:
# nikkud: "זָכָר"
# ktiv_male: "זכר"
# mishkal: "CaCaC" # English mishkal name (scraped from pealim PoS section)
# mishkal_hebrew: "קָטָל" # Hebrew mishkal name (computed via mapping)
# --- Verb-specific: Conjugation Data ---
conjugation: null # null for non-verbs
# When populated:
# in_conjugation_deck: true # Whether this verb is in the 71-verb conjugation deck
# infinitive:
# nikkud: "לִשְׁמֹר"
# ktiv_male: "לשמור"
# reference_form: # 3ms past (the citation form)
# nikkud: "שָׁמַר"
# ktiv_male: "שמר"
# binyan: "Pa'al" # English binyan name
# binyan_hebrew: "פָּעַל" # Hebrew binyan name (with nikkud)
# prep: "על" # Hebrew preposition the verb takes, or null
# active_forms:
# - person: "1s" # Grammatical code: 1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp
# tense: "עָבָר"
# form:
# nikkud: "שָׁמַרְתִּי"
# ktiv_male: "שמרתי"
# audio_url: "https://..."
# audio_file: null # For future use
# hufal_pual_forms: null # Same structure as active_forms; non-null only for hif'il/pi'el verbs
# # When non-null, binyan MUST be Hif'il or Pi'el (validated)
# reference_form_passive: # 3ms past of the huf'al/pu'al counterpart, or null
# nikkud: "שֻׁמַּר"
# ktiv_male: "שומר"
# --- Adjective-specific ---
adjective_inflection: null # Reserved for future use
# When populated:
# ms/fs/mp/fp forms with nikkud/ktiv_male subfields
# --- Preposition-specific ---
preposition_inflection: null # Reserved for future use
# When populated:
# Inflected forms with pronominal suffixes (e.g. שלי, שלך, שלו...)

File diff suppressed because it is too large Load diff

View file

@ -2,6 +2,10 @@
"""
Ben Yehuda corpus example-sentence lookup (nikkud corpus).
TODO: Rewrite to update words.json examples fields directly instead of
writing to a separate examples_cache.json. Currently the migration script
bridges the gap. See Phase 5 in SPRINT_LOG.md.
Downloads the nikkud-bearing plaintext ZIP once, indexes sentences by nikkud word form,
then answers queries locally.

File diff suppressed because it is too large Load diff

546420
data/words.json Normal file

File diff suppressed because it is too large Load diff

View file

@ -3,6 +3,10 @@
Hebrew word frequency lookup from hermitdave/FrequencyWords corpus.
Downloads he_50k.txt once; subsequent runs read from cache.
Exposed API: get_frequency_rank(word_no_nikkud) -> int | None
TODO: Rewrite to update words.json frequency field directly instead of
writing to a separate frequency_cache.json. Currently the migration script
bridges the gap. See Phase 5 in SPRINT_LOG.md.
"""
import json

View file

@ -2,6 +2,10 @@
"""
Fetch images for concrete Hebrew nouns from Wikipedia / Wikimedia Commons.
TODO: Rewrite to update words.json image/image_source fields directly instead of
writing to a separate image_cache.json. Currently the migration script bridges
the gap. See Phase 5 in SPRINT_LOG.md.
Scope: Noun PoS entries only. Concreteness heuristic:
- English meaning has no abstract suffixes (-tion, -ity, -ness, -ment, -ance, -ism, -hood,
-ship, -ure, -al, -ing when not a gerund, -ence)
@ -59,7 +63,6 @@ session.headers.update(
)
def is_concrete(english_meaning: str) -> bool:
"""Return True if the English meaning looks like a concrete noun."""
meaning = english_meaning.strip().lower()

346
pealim_audio_download.py Normal file
View file

@ -0,0 +1,346 @@
#!/usr/bin/env python3
"""Download audio files from URLs stored in words.json.
Three audio categories are handled:
1. Vocab audio data/audio/{audio_file}
2. Noun plural data/audio/{slug}_plural.mp3
3. Conjugation data/audio_conj/{slug}_{form_key}.mp3
data/audio_conj/{slug}_passive_{form_key}.mp3
"""
import argparse
import json
import logging
import re
import time
from pathlib import Path
import requests
from helpers import strip_nikkud
# Module-level logger; handlers/levels are configured in main().
logger = logging.getLogger(__name__)

# All paths are resolved relative to this script's directory.
DATA_DIR = Path(__file__).parent / "data"
AUDIO_DIR = DATA_DIR / "audio"  # vocab and noun-plural audio
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"  # conjugation-form audio
WORDS_JSON = DATA_DIR / "words.json"  # input: the unified data store

DOWNLOAD_DELAY = 0.3  # seconds slept after each successful download
MAX_RETRIES = 3  # attempts per URL before _download() gives up

# Map Hebrew tense names to English prefixes for form_key construction.
# "מְקוֹר" (infinitive) is included for forward compatibility; it does not
# appear in the current dataset but the form_key collapses to bare "infinitive".
# Tense strings that are not listed here are used verbatim by _form_key().
TENSE_TO_PREFIX = {
    "הוֹוֶה": "present",
    "עָבָר": "past",
    "עָתִיד": "future",
    "צִוּוּי": "imperative",
    "מְקוֹר": "infinitive",
}
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _make_audio_file(entry: dict) -> str:
"""Derive the vocab audio filename when audio_file is absent.
Slug-based for confusable entries (slug contains the disambiguating ID),
consonant-only for all others.
Args:
entry: A words.json entry dict.
Returns:
Filename string, e.g. ``"1234-shalom.mp3"`` or ``"שלום.mp3"``.
"""
slug: str = entry["slug"]
if entry.get("confusable_group"):
return f"{slug}.mp3"
word: str = entry.get("word", "")
safe_name = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word))
return f"{safe_name}.mp3"
def _form_key(person: str, tense: str) -> str:
    """Combine a tense and a person code into a filename-friendly key.

    The Hebrew tense name is translated through ``TENSE_TO_PREFIX``; a tense
    that is not in the table is used unchanged as the prefix.

    Args:
        person: Person code, e.g. ``"1s"``, ``"3fp"``, ``"ms"``.
        tense: Hebrew tense string from the conjugation form.

    Returns:
        ``"<prefix>_<person>"`` such as ``"past_1s"`` or ``"present_ms"``.
        The infinitive has no person, so it is always just ``"infinitive"``.
    """
    tense_prefix = TENSE_TO_PREFIX[tense] if tense in TENSE_TO_PREFIX else tense
    return "infinitive" if tense_prefix == "infinitive" else f"{tense_prefix}_{person}"
def _download(url: str, dest: Path, session: requests.Session) -> bool:
    """Download *url* to *dest*, retrying up to MAX_RETRIES times.

    Skips the download silently if *dest* already exists.  Because an
    existing *dest* is treated as a finished download, the payload is first
    written to a sibling ``<name>.part`` file and only renamed onto *dest*
    once it is complete; an interrupted write can therefore never leave a
    truncated file that later runs would mistake for a cached one.

    Failed attempts back off exponentially (2, 4, ... seconds) before the
    next try; no sleep follows the final attempt.

    Args:
        url: HTTP(S) URL to download.
        dest: Local path to write the file to.
        session: Shared requests session.

    Returns:
        ``True`` if the file was downloaded (or already existed),
        ``False`` if all retries were exhausted.

    Raises:
        OSError: If writing or renaming the file fails (not retried).
    """
    if dest.exists():
        return True

    partial = dest.with_name(dest.name + ".part")
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = session.get(url, timeout=15)
            resp.raise_for_status()
            payload = resp.content
        except requests.RequestException as exc:
            if attempt >= MAX_RETRIES:
                logger.error("All %d attempts failed for %s: %s", MAX_RETRIES, url, exc)
                break
            wait = 2**attempt
            logger.warning(
                "Attempt %d/%d failed for %s (%s) — retrying in %ds",
                attempt,
                MAX_RETRIES,
                url,
                exc,
                wait,
            )
            time.sleep(wait)
            continue

        partial.write_bytes(payload)
        partial.replace(dest)  # rename within the same directory
        logger.debug("Downloaded %s → %s", url, dest.name)
        return True
    return False
# ---------------------------------------------------------------------------
# Per-category downloaders
# ---------------------------------------------------------------------------
def download_vocab_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int, int]:
    """Fetch the main vocabulary audio clip of every entry.

    The target name is the entry's ``audio_file`` when set, otherwise one is
    derived with ``_make_audio_file``.  Files already present in ``AUDIO_DIR``
    are counted as cached and not requested again.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached, no_url) counts.  Downloads that fail
        after all retries are added to ``no_url`` as well.
    """
    fetched = 0
    already_present = 0
    skipped = 0
    for item in entries:
        source_url = item.get("audio_url")
        if not source_url:
            skipped += 1
            continue

        filename = item.get("audio_file") or _make_audio_file(item)
        target = AUDIO_DIR / filename
        if target.exists():
            already_present += 1
        elif _download(source_url, target, session):
            fetched += 1
            # Be polite to the server between real downloads.
            time.sleep(DOWNLOAD_DELAY)
        else:
            # Persistent failures share the bucket with missing URLs.
            skipped += 1
    return fetched, already_present, skipped
def download_noun_plural_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int]:
    """Fetch plural-form audio for nouns that carry a plural audio URL.

    Destination: ``data/audio/{slug}_plural.mp3``

    Only entries whose ``noun_inflection`` is a non-empty dict with a
    ``plural_audio`` value starting with ``"http"`` are considered.
    Downloads that fail are not reflected in either counter.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached) counts.
    """
    fetched = 0
    already_present = 0
    for item in entries:
        inflection = item.get("noun_inflection")
        if not isinstance(inflection, dict) or not inflection:
            continue

        plural_url = inflection.get("plural_audio")
        if not (plural_url and plural_url.startswith("http")):
            continue

        target = AUDIO_DIR / f"{item['slug']}_plural.mp3"
        if target.exists():
            already_present += 1
        elif _download(plural_url, target, session):
            fetched += 1
            time.sleep(DOWNLOAD_DELAY)
    return fetched, already_present
def download_conjugation_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int, int]:
    """Fetch the audio clip of every conjugated verb form that has a URL.

    Active forms  → ``data/audio_conj/{slug}_{form_key}.mp3``
    Passive forms → ``data/audio_conj/{slug}_passive_{form_key}.mp3``

    Entries without a ``conjugation`` section and forms without an
    ``audio_url`` are skipped.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached, failed) counts.
    """
    fetched = 0
    already_present = 0
    broken = 0
    # (filename infix, conjugation field) for the two form lists.
    form_fields = (("", "active_forms"), ("passive_", "hufal_pual_forms"))

    for item in entries:
        conjugation = item.get("conjugation")
        if not conjugation:
            continue
        slug: str = item["slug"]

        for infix, field in form_fields:
            for form in conjugation.get(field) or []:
                form_url = form.get("audio_url")
                if not form_url:
                    continue
                key = _form_key(form.get("person", ""), form.get("tense", ""))
                target = AUDIO_CONJ_DIR / f"{slug}_{infix}{key}.mp3"
                if target.exists():
                    already_present += 1
                elif _download(form_url, target, session):
                    fetched += 1
                    time.sleep(DOWNLOAD_DELAY)
                else:
                    broken += 1
    return fetched, already_present, broken
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main() -> None:
    """Run the audio download pipeline from the command line.

    Reads words.json, optionally truncates it to the first N entries
    (``--test N``), then downloads vocab, noun-plural and conjugation audio
    in that order and logs one summary line per category.  ``--skip-vocab``
    and ``--skip-conj`` disable the respective category; noun-plural audio
    is always processed.
    """
    cli = argparse.ArgumentParser(description="Download Pealim audio files from words.json URLs.")
    cli.add_argument("--skip-vocab", action="store_true", help="Skip vocabulary audio downloads.")
    cli.add_argument("--skip-conj", action="store_true", help="Skip conjugation audio downloads.")
    cli.add_argument(
        "--test",
        metavar="N",
        type=int,
        default=None,
        help="Limit processing to the first N words.json entries.",
    )
    options = cli.parse_args()
    want_vocab = not options.skip_vocab
    want_conj = not options.skip_conj

    logging.basicConfig(level=logging.INFO, format="%(message)s")

    for folder in (AUDIO_DIR, AUDIO_CONJ_DIR):
        folder.mkdir(parents=True, exist_ok=True)

    with open(WORDS_JSON, encoding="utf-8") as fh:
        raw: dict[str, dict] = json.load(fh)
    entries = list(raw.values())
    if options.test is not None:
        entries = entries[: options.test]

    logger.info("[4] Downloading audio files …")
    session = requests.Session()
    session.headers["User-Agent"] = "Mozilla/5.0 (compatible; PealimAnkiDeck/1.0; audio-fetch)"

    # Categories run in a fixed order; a skipped category reports all zeros.
    vocab_counts = download_vocab_audio(entries, session) if want_vocab else (0, 0, 0)
    plural_counts = download_noun_plural_audio(entries, session)
    conj_counts = download_conjugation_audio(entries, session) if want_conj else (0, 0, 0)

    # Summary: skipped categories are left out entirely.
    if want_vocab:
        logger.info(" Vocab: %d downloaded, %d cached, %d no URL", *vocab_counts)
    logger.info(" Noun plural: %d downloaded, %d cached", *plural_counts)
    if want_conj:
        c_dl, c_cached, c_failed = conj_counts
        failed_msg = f", {c_failed} failed" if c_failed else ""
        logger.info(" Conjugation: %d downloaded, %d cached%s", c_dl, c_cached, failed_msg)


if __name__ == "__main__":
    main()

1130
pealim_detail_scrape.py Normal file

File diff suppressed because it is too large Load diff

706
pealim_list_scrape.py Normal file
View file

@ -0,0 +1,706 @@
#!/usr/bin/env python3
"""
Consolidated list page scraper for pealim.com.
Scrapes /dict/?page=N with two cookie variants (hebstyle=mo for nikkud,
hebstyle=vl for ktiv male) and writes results directly to data/words.json.
Usage:
python3 pealim_list_scrape.py [--test N] [--force-refresh]
"""
import argparse
import json
import logging
import os
import re
import time
from datetime import date
from pathlib import Path
import requests
from bs4 import BeautifulSoup
from helpers import strip_nikkud
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# All paths are resolved relative to the directory containing this script.
PROJECT_ROOT = Path(__file__).parent
DATA_DIR = PROJECT_ROOT / "data"
WORDS_JSON = DATA_DIR / "words.json"  # unified data store this scraper reads and rewrites
PROGRESS_JSON = DATA_DIR / "list_scrape_progress.json"  # completed page numbers (see _load_progress)
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
PEALIM_DICT_URL = "https://www.pealim.com/dict/"
REQUEST_DELAY = 1.5  # seconds between requests
REQUEST_TIMEOUT = 15  # seconds
DEFAULT_TOTAL_PAGES = 608
SAVE_EVERY = 10  # pages between incremental saves
# ISO date stamped into last_scrape_date; evaluated once, when the module is imported.
TODAY = date.today().isoformat()
# Prefer lxml if available; html.parser is the fallback
try:
import lxml # type: ignore[import-untyped] # noqa: F401
BS4_PARSER = "lxml"
except ImportError:
BS4_PARSER = "html.parser"
# ---------------------------------------------------------------------------
# Part-of-speech mappings
# ---------------------------------------------------------------------------
# English part-of-speech label -> Hebrew label (with nikkud).
# Used by _parse_pos(); labels missing from this table are passed through unchanged there.
POS_HEBREW: dict[str, str] = {
    "Noun": "שֵׁם עֶצֶם",
    "Verb": "פֹּעַל",
    "Adjective": "שֵׁם תֹּאַר",
    "Adverb": "תֹּאַר הַפֹּעַל",
    "Pronoun": "כִּנּוּי גּוּף",
    "Preposition": "מִילַּת יַחַס",
    "Conjunction": "מִילַּת חִבּוּר",
    "Interjection": "מִילַּת קְרִיאָה",
    "Numeral": "שֵׁם מִסְפָּר",
    "Cardinal numeral": "שֵׁם מִסְפָּר",
    "Particle": "מִילִּית",
    "Determiner": "מְגַדִּיר",
    "Existential": "מִילַּת קִיּוּם",
    "Interrogative": "מִילַּת שְׁאֵלָה",
}
# The same pairs ordered by descending key length: _parse_pos() matches keys as a
# prefix of the raw PoS string, so longer keys must be tried before shorter ones.
POS_HEBREW_ORDERED: list[tuple[str, str]] = sorted(POS_HEBREW.items(), key=lambda x: -len(x[0]))
# English binyan (verb pattern) name -> Hebrew name with nikkud.
# _parse_pos() looks names up case-insensitively against these keys.
BINYAN_HEBREW: dict[str, str] = {
    "Pa'al": "פָּעַל",
    "Nif'al": "נִפְעַל",
    "Pi'el": "פִּיעֵל",
    "Pu'al": "פֻּעַל",
    "Hif'il": "הִפְעִיל",
    "Huf'al": "הֻפְעַל",
    "Hitpa'el": "הִתְפַּעֵל",
}
# Regex for extracting emoji characters.
# One match is a run of emoji base characters (U+1F000-U+1FFFF plus the
# Miscellaneous Symbols / Dingbats range U+2600-U+27BF), each optionally
# followed by variation selector-16 (U+FE0F) and/or zero-width joiner (U+200D).
# Including those trailers keeps sequences such as "❤️" or ZWJ families in a
# single match, so stripping an emoji does not leave invisible code points in
# the cleaned text.  A selector or joiner that does not follow an emoji base
# is left untouched.
EMOJI_RE = re.compile(
    r"(?:[\U0001F000-\U0001FFFF\u2600-\u27BF][\uFE0F\u200D]*)+",
    re.UNICODE,
)
# Fields that must never be overwritten when updating an existing entry.
# These hold GUIDs and enrichment data (frequency, emoji, images, hints,
# examples, inflection/conjugation sections) rather than list-page data.
# NOTE(review): in the part of this module reviewed here, _merge_row() respects
# this list implicitly by assigning only list-level fields; the constant itself
# is not referenced there — confirm whether later code enforces it.
PROTECTED_FIELDS = frozenset(
    [
        "vocab_legacy_guid",
        "confusables_guid",
        "frequency",
        "pseudo_frequency",
        "emoji",
        "emoji_source",
        "emoji_visible",
        "image",
        "image_source",
        "hint",
        "examples",
        "noun_inflection",
        "conjugation",
        "adjective_inflection",
        "preposition_inflection",
    ]
)
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Configured at import time, so importing this module sets up root logging
# (timestamp, level, message) for the importing process as well.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# HTTP session
# ---------------------------------------------------------------------------
# One module-wide session carrying an identifying User-Agent header.
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki-scraper/1.0)"})
# ---------------------------------------------------------------------------
# Default entry template
# ---------------------------------------------------------------------------
def _default_entry() -> dict:
"""Return a fresh entry with all fields initialised to safe defaults."""
return {
"word": {"nikkud": "", "ktiv_male": ""},
"slug": "",
"root": [],
"pos": "",
"pos_hebrew": "",
"meaning": "",
"meaning_raw": "",
"audio_url": "",
"audio_file": "",
"tags": "",
"last_scrape_date": "",
"vocab_legacy_guid": None,
"frequency": None,
"pseudo_frequency": None,
"emoji": None,
"emoji_source": None,
"emoji_visible": False,
"image": None,
"image_source": None,
"hint": "",
"shared_roots": [],
"confusable_group": None,
"confusables_guid": None,
"examples": None,
"noun_inflection": None,
"conjugation": None,
"adjective_inflection": None,
"preposition_inflection": None,
}
# ---------------------------------------------------------------------------
# Parsing helpers
# ---------------------------------------------------------------------------
def _extract_emoji(text: str) -> str | None:
    """Return the first run of emoji matched by EMOJI_RE in *text*, else None."""
    found = EMOJI_RE.search(text)
    if found is None:
        return None
    return found.group(0)
def _clean_meaning(raw: str) -> str:
    """Remove emoji from *raw* and collapse all whitespace runs to single spaces."""
    without_emoji = EMOJI_RE.sub("", raw)
    tokens = without_emoji.split()
    return " ".join(tokens)
def _parse_pos(pos_raw: str) -> tuple[str, str]:
    """
    Parse raw PoS string into (pos_en, pos_hebrew).

    The English PoS is the longest key of POS_HEBREW that prefixes the raw
    string; if none matches, the first space-delimited token is used and is
    also returned as the Hebrew label.

    For verbs the binyan is appended to the Hebrew label, separated by
    " — ".  The binyan may follow the word "Verb" after a hyphen, an en dash,
    an em dash or plain whitespace, and is matched case-insensitively against
    BINYAN_HEBREW.  A verb string without a recognisable binyan yields the
    bare Hebrew word for "verb".

    Examples:
        "Noun masculine"    → ("Noun", "שֵׁם עֶצֶם")
        "Verb pa'al"        → ("Verb", "פֹּעַל — פָּעַל")
        "Verb - pa'al"      → ("Verb", "פֹּעַל — פָּעַל")
        "Cardinal numeral"  → ("Cardinal numeral", "שֵׁם מִסְפָּר")
    """
    pos_clean = pos_raw.strip()

    # Longest-first prefix match against the known PoS labels.
    pos_en = next((key for key, _ in POS_HEBREW_ORDERED if pos_clean.startswith(key)), "")
    if not pos_en:
        # Unknown label: fall back to the first space-delimited token.
        pos_en = pos_clean.split(" ")[0]
    pos_heb = POS_HEBREW.get(pos_en, pos_en)

    if pos_en == "Verb":
        binyan_by_name = {name.lower(): heb for name, heb in BINYAN_HEBREW.items()}
        remainder = pos_clean[len(pos_en) :]
        # Split on hyphen / en dash / em dash; the first segment that names a
        # binyan wins.  With no dash at all the whole remainder is the only
        # segment, which covers the whitespace-separated form.
        for segment in re.split(r"[-\u2013\u2014]", remainder):
            binyan_heb = binyan_by_name.get(segment.strip().lower())
            if binyan_heb:
                pos_heb = f"{pos_heb} — {binyan_heb}"
                break
    return pos_en, pos_heb
def _parse_root(root_raw: str) -> list[str]:
"""
Convert raw root text to a list of consonants.
Pealim shows roots as "פ - ע - ל" or "פ.ע.ל" or "" (no root).
"""
if not root_raw or root_raw in ("-", "", ""):
return []
# Split on " - " or "." separators
parts = re.split(r"\s*[-–—.]\s*", root_raw.strip())
return [p.strip() for p in parts if p.strip()]
def _build_tags(pos_en: str, root: list[str]) -> str:
"""
Generate Anki tags string matching the existing project convention.
Examples:
pos=Noun, root=[] "שם_עצם"
pos=Noun, root=["א","ב"] "שורש::אב שם_עצם"
pos=Verb, root=["שמר"] "שורש::שמר פעלים"
"""
pos_tag_map = {
"Noun": "שם_עצם",
"Verb": "פעלים",
"Adjective": "שם_תואר",
"Adverb": "תוארי_הפועל",
"Pronoun": "כינוייוף",
"Preposition": "מילות_יחס",
"Conjunction": "מילות_חיבור",
"Particle": "מילית",
"Numeral": "שם_מספר",
"Cardinal numeral": "שם_מספר",
"Determiner": "מגדיר",
"Existential": "מילת_קיום",
"Interrogative": "מילת_שאלה",
"Interjection": "מילת_קריאה",
}
parts: list[str] = []
if root:
root_str = "".join(strip_nikkud(c) for c in root)
parts.append(f"שורש::{root_str}")
pos_heb_tag = pos_tag_map.get(pos_en, "")
if pos_heb_tag:
parts.append(pos_heb_tag)
return " ".join(parts)
def _compute_audio_file(slug: str, ktiv_male: str) -> str:
"""
Return the local audio filename for an entry.
The actual confusable detection happens later (after all pages are scraped);
here we store a placeholder that post_process() will correct.
We default to the consonant-based name; confusables get slug-based names.
"""
consonants = strip_nikkud(ktiv_male) if ktiv_male else ""
return f"{consonants}.mp3" if consonants else f"{slug}.mp3"
# ---------------------------------------------------------------------------
# Page parsing
# ---------------------------------------------------------------------------
def _parse_mo_page(html: bytes) -> list[dict]:
    """
    Parse a hebstyle=mo (nikkud) list page.

    Every table row with at least four <td> cells is read as:
    cell 0 = word (plus audio attribute and dictionary link), cell 1 = root,
    cell 2 = part of speech, cell 3 = meaning.

    Returns a list of raw row dicts with keys:
        nikkud, slug, root_raw, pos_raw, meaning_raw, audio_url

    Rows whose word text is empty are dropped.
    NOTE(review): _parse_vl_words() keeps such rows, so position-based
    pairing of the two results would shift after a dropped row — confirm
    against the caller.
    """
    document = BeautifulSoup(html, BS4_PARSER)
    parsed: list[dict] = []
    for table_row in document.select("table tr"):
        cells = table_row.find_all("td")
        if len(cells) < 4:
            continue
        word_cell = cells[0]

        # Audio URL lives in a data-audio attribute somewhere in the word cell.
        audio_node = word_cell.find(attrs={"data-audio": True})
        audio_url: str = audio_node["data-audio"] if audio_node else ""

        # The slug is the path segment after /dict/ in the entry link.
        anchor = word_cell.find("a", href=True)
        slug_match = re.search(r"/dict/([^/]+)/", anchor["href"]) if anchor else None
        slug = slug_match.group(1) if slug_match else ""

        # Prefer the dedicated "menukad" span; otherwise use the whole cell text.
        menukad_span = word_cell.find("span", class_="menukad")
        nikkud = (menukad_span or word_cell).get_text(strip=True)
        if not nikkud:
            continue

        parsed.append(
            {
                "nikkud": nikkud,
                "slug": slug,
                "root_raw": cells[1].get_text(strip=True),
                "pos_raw": cells[2].get_text(strip=True),
                "meaning_raw": cells[3].get_text(strip=True),
                "audio_url": audio_url,
            }
        )
    return parsed
def _parse_vl_words(html: bytes) -> list[str]:
    """
    Parse a hebstyle=vl (ktiv male) list page.

    Returns the word text of every table row that has at least four <td>
    cells, in page order.  Unlike _parse_mo_page(), rows with empty word text
    are kept (as empty strings).
    """
    document = BeautifulSoup(html, BS4_PARSER)
    spellings: list[str] = []
    for table_row in document.select("table tr"):
        cells = table_row.find_all("td")
        if len(cells) < 4:
            continue
        word_cell = cells[0]
        menukad_span = word_cell.find("span", class_="menukad")
        spellings.append((menukad_span or word_cell).get_text(strip=True))
    return spellings
# ---------------------------------------------------------------------------
# words.json I/O
# ---------------------------------------------------------------------------
def _load_words() -> dict:
    """Read and return the contents of words.json, or {} when the file is absent."""
    if WORDS_JSON.exists():
        with WORDS_JSON.open(encoding="utf-8") as fh:
            return json.load(fh)
    logger.info("data/words.json not found — starting fresh.")
    return {}
def _save_words(words: dict) -> None:
    """Write *words* to words.json.

    The JSON is first written to ``words.json.tmp`` and then moved over the
    real file with ``os.replace``, so readers never see a half-written file.
    Hebrew text is stored unescaped (``ensure_ascii=False``).
    """
    staging_path = WORDS_JSON.with_suffix(".json.tmp")
    with staging_path.open("w", encoding="utf-8") as fh:
        json.dump(words, fh, ensure_ascii=False, indent=2)
    os.replace(staging_path, WORDS_JSON)
    entry_count = len(words)
    logger.info("Saved data/words.json (%d entries)", entry_count)
# ---------------------------------------------------------------------------
# Progress tracking
# ---------------------------------------------------------------------------
def _load_progress() -> set[int]:
    """Return the page numbers recorded as completed in the progress file.

    A missing progress file, or one without a ``completed_pages`` key,
    gives an empty set.
    """
    if PROGRESS_JSON.exists():
        with PROGRESS_JSON.open(encoding="utf-8") as fh:
            payload = json.load(fh)
        return set(payload.get("completed_pages", []))
    return set()
def _save_progress(completed: set[int]) -> None:
    """Persist the completed page numbers (sorted) to the progress file.

    Written to a ``.json.tmp`` sibling first and then moved into place with
    ``os.replace``.
    """
    staging_path = PROGRESS_JSON.with_suffix(".json.tmp")
    payload = {"completed_pages": sorted(completed)}
    with staging_path.open("w", encoding="utf-8") as fh:
        json.dump(payload, fh)
    os.replace(staging_path, PROGRESS_JSON)
# ---------------------------------------------------------------------------
# Unique key generation
# ---------------------------------------------------------------------------
def _make_unique_key(nikkud: str, pos_en: str, meaning: str, existing_keys: set[str]) -> str:
"""
Generate a collision-free unique key for a new entry.
Escalation:
1. nikkud
2. nikkud|pos_en
3. nikkud|pos_en|meaning
4. nikkud|pos_en|meaning|N (N = 2, 3, )
"""
candidate = nikkud
if candidate not in existing_keys:
return candidate
candidate = f"{nikkud}|{pos_en}"
if candidate not in existing_keys:
return candidate
candidate = f"{nikkud}|{pos_en}|{meaning}"
if candidate not in existing_keys:
return candidate
n = 2
while True:
candidate = f"{nikkud}|{pos_en}|{meaning}|{n}"
if candidate not in existing_keys:
return candidate
n += 1
# ---------------------------------------------------------------------------
# Core: merge one scraped row into words dict
# ---------------------------------------------------------------------------
def _merge_row(
    words: dict,
    slug_index: dict[str, str],
    nikkud: str,
    ktiv_male: str,
    slug: str,
    root_raw: str,
    pos_raw: str,
    meaning_raw_raw: str,
    audio_url: str,
) -> None:
    """
    Insert or refresh one scraped list row in *words* (modified in place).

    An existing entry is found through *slug_index* (slug → unique_key).
    For an existing entry only the list-level fields are rewritten; emoji and
    all other enrichment fields are left alone.  Otherwise a new entry is
    created from the default template under a freshly generated unique key,
    its emoji is taken from the raw meaning, and *slug_index* is updated
    (rows without a slug are never indexed).
    """
    # Values derived from the raw cells.
    pos_en, pos_heb = _parse_pos(pos_raw)
    root = _parse_root(root_raw)
    meaning = _clean_meaning(meaning_raw_raw)
    emoji = _extract_emoji(meaning_raw_raw)
    tags = _build_tags(pos_en, root)
    audio_file = _compute_audio_file(slug, ktiv_male)

    # Decide between "refresh existing" and "create new".
    entry_key: str | None = slug_index.get(slug) if slug else None
    is_update = bool(entry_key) and entry_key in words
    if is_update:
        target = words[entry_key]
    else:
        entry_key = _make_unique_key(nikkud, pos_en, meaning, set(words.keys()))
        target = _default_entry()
        target["emoji"] = emoji
        target["emoji_source"] = "from_pealim" if emoji else None

    # List-level fields, written identically in both cases.
    target["word"]["nikkud"] = nikkud
    target["word"]["ktiv_male"] = ktiv_male
    target["slug"] = slug
    target["root"] = root
    target["pos"] = pos_en
    target["pos_hebrew"] = pos_heb
    target["meaning"] = meaning
    target["meaning_raw"] = meaning_raw_raw
    target["audio_url"] = audio_url
    target["audio_file"] = audio_file
    target["tags"] = tags
    target["last_scrape_date"] = TODAY

    if not is_update:
        words[entry_key] = target
        if slug:
            slug_index[slug] = entry_key
# ---------------------------------------------------------------------------
# Post-processing: recompute confusable_group, shared_roots, audio_file
# ---------------------------------------------------------------------------
def _post_process(words: dict) -> None:
    """
    Recompute the cross-entry derived fields once every page has been merged.

    Mutates each entry of *words* in place:
    - confusable_group: sorted keys of all entries with the same ktiv_male,
      set when two or more entries share it
    - audio_file: "<slug>.mp3" for confusables (when a slug exists), otherwise
      "<consonants>.mp3" with a slug-based fallback
    - shared_roots: sorted keys of the other entries with the same root
    """
    logger.info("Post-processing: recomputing confusable groups and shared roots...")

    # --- confusable groups ---
    keys_by_ktiv: dict[str, list[str]] = {}
    for unique_key, record in words.items():
        spelling = record.get("word", {}).get("ktiv_male", "")
        if spelling:
            keys_by_ktiv.setdefault(spelling, []).append(unique_key)

    for record in words.values():
        spelling = record.get("word", {}).get("ktiv_male", "")
        same_spelling = keys_by_ktiv.get(spelling, [])
        slug = record.get("slug", "")

        if len(same_spelling) >= 2:
            record["confusable_group"] = sorted(same_spelling)
            # Confusable → slug-based audio filename (left untouched without a slug)
            if slug:
                record["audio_file"] = f"{slug}.mp3"
            continue

        # A group that came from enrichment (marked by confusables_guid) is kept as-is.
        if not record.get("confusables_guid"):
            record["confusable_group"] = None
        # Non-confusable → consonant-based audio filename, slug-based as fallback
        consonants = strip_nikkud(spelling) if spelling else ""
        record["audio_file"] = f"{consonants}.mp3" if consonants else f"{slug}.mp3"

    # --- shared roots ---
    keys_by_root: dict[str, list[str]] = {}
    for unique_key, record in words.items():
        root = record.get("root")
        if root:
            # "|"-joined radicals act as the canonical grouping key
            keys_by_root.setdefault("|".join(root), []).append(unique_key)

    for unique_key, record in words.items():
        root = record.get("root")
        if not root:
            record["shared_roots"] = []
            continue
        siblings = keys_by_root.get("|".join(root), [])
        record["shared_roots"] = sorted(k for k in siblings if k != unique_key)

    logger.info("Post-processing complete.")
# ---------------------------------------------------------------------------
# Scraping loop
# ---------------------------------------------------------------------------
def _build_slug_index(words: dict) -> dict[str, str]:
"""Build slug → unique_key lookup from the current words dict."""
index: dict[str, str] = {}
for key, entry in words.items():
slug = entry.get("slug", "")
if slug and slug not in index:
index[slug] = key
return index
def _fetch_page(url: str, cookies: dict) -> bytes | None:
    """GET *url* with the given cookies and return the response body.

    Any ``requests.RequestException`` (including the HTTP error raised by
    ``raise_for_status``) is logged and reported to the caller as ``None``.
    """
    try:
        response = session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        payload = response.content
    except requests.RequestException as exc:
        logger.error("Request failed for %s: %s", url, exc)
        return None
    return payload
def run_scrape(total_pages: int | None, force_refresh: bool) -> None:
    """
    Main scrape loop.

    Fetches every pending list page twice (hebstyle=mo for nikkud/audio/slug,
    hebstyle=vl for ktiv male), merges the rows into the words dict, saves
    incrementally, and finally runs post-processing.

    Args:
        total_pages: Number of list pages to scrape. ``None`` means "all pages"
            and falls back to DEFAULT_TOTAL_PAGES, so callers that only have an
            optional test limit can pass it through unchanged.
        force_refresh: If True, ignore progress file and re-scrape all pages.
    """
    if total_pages is None:
        total_pages = DEFAULT_TOTAL_PAGES

    words = _load_words()
    slug_index = _build_slug_index(words)

    if force_refresh:
        # Read the old progress only to report how much is being discarded;
        # checking the freshly emptied set would never log anything.
        previously_done = _load_progress()
        if previously_done:
            logger.info("--force-refresh: ignoring %d completed pages.", len(previously_done))
        completed = set()
    else:
        completed = _load_progress()

    pages_to_do = [p for p in range(1, total_pages + 1) if p not in completed]
    logger.info(
        "Pages to scrape: %d / %d (already done: %d)",
        len(pages_to_do),
        total_pages,
        len(completed),
    )

    pages_since_save = 0
    for page_num in pages_to_do:
        url = f"{PEALIM_DICT_URL}?page={page_num}"
        logger.info("Scraping page %d / %d", page_num, total_pages)

        # --- hebstyle=mo (nikkud + audio + slug) ---
        mo_html = _fetch_page(url, {"translit": "none", "hebstyle": "mo"})
        if mo_html is None:
            logger.warning("Skipping page %d (mo fetch failed).", page_num)
            time.sleep(REQUEST_DELAY * 2)
            continue
        time.sleep(REQUEST_DELAY)

        # --- hebstyle=vl (ktiv male) ---
        vl_html = _fetch_page(url, {"translit": "none", "hebstyle": "vl"})
        if vl_html is None:
            logger.warning("Skipping page %d (vl fetch failed).", page_num)
            time.sleep(REQUEST_DELAY * 2)
            continue

        # Parse
        mo_rows = _parse_mo_page(mo_html)
        vl_words = _parse_vl_words(vl_html)

        if not mo_rows:
            logger.warning("Page %d returned no rows — might be past end.", page_num)
            completed.add(page_num)
            # Persist words together with progress: `completed` may contain
            # pages merged since the last incremental save, and recording them
            # as done without their data would lose them on a resumed run.
            _save_words(words)
            _save_progress(completed)
            pages_since_save = 0
            time.sleep(REQUEST_DELAY)
            continue

        # Rows are paired with ktiv male spellings purely by position, so a
        # length mismatch means some rows get a blank or shifted ktiv_male.
        if len(vl_words) != len(mo_rows):
            logger.warning(
                "Page %d: %d mo rows but %d vl words; ktiv_male may be missing or misaligned.",
                page_num,
                len(mo_rows),
                len(vl_words),
            )

        # Merge each row
        for i, row in enumerate(mo_rows):
            ktiv_male = vl_words[i] if i < len(vl_words) else ""
            _merge_row(
                words=words,
                slug_index=slug_index,
                nikkud=row["nikkud"],
                ktiv_male=ktiv_male,
                slug=row["slug"],
                root_raw=row["root_raw"],
                pos_raw=row["pos_raw"],
                meaning_raw_raw=row["meaning_raw"],
                audio_url=row["audio_url"],
            )

        completed.add(page_num)
        pages_since_save += 1

        # Incremental save every SAVE_EVERY pages
        if pages_since_save >= SAVE_EVERY:
            _save_words(words)
            _save_progress(completed)
            pages_since_save = 0

        time.sleep(REQUEST_DELAY)

    # Final save + post-processing
    logger.info("All pages scraped. Running post-processing…")
    _post_process(words)
    _save_words(words)
    _save_progress(completed)
    logger.info("Done. Total entries in words.json: %d", len(words))
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main() -> None:
    """Command-line entry point: parse the flags and start the list scrape."""
    cli = argparse.ArgumentParser(description="Scrape pealim.com list pages into data/words.json.")
    cli.add_argument(
        "--test",
        metavar="N",
        type=int,
        default=None,
        help="Scrape only the first N pages (for testing).",
    )
    cli.add_argument(
        "--force-refresh",
        action="store_true",
        default=False,
        help="Re-scrape all pages, ignoring existing progress.",
    )
    options = cli.parse_args()

    # Without --test, scrape the full default page range.
    if options.test is None:
        page_count = DEFAULT_TOTAL_PAGES
    else:
        page_count = options.test

    logger.info(
        "Starting pealim list scraper | pages=%d | force=%s | parser=%s",
        page_count,
        options.force_refresh,
        BS4_PARSER,
    )
    run_scrape(total_pages=page_count, force_refresh=options.force_refresh)


if __name__ == "__main__":
    main()

View file

@ -25,6 +25,9 @@ dev = [
[tool.pytest.ini_options]
testpaths = ["tests"]
markers = [
"integration: marks tests that hit the real pealim.com network (deselect with -m 'not integration')",
]
[tool.ruff]
target-version = "py311"

505
run.py
View file

@ -7,10 +7,10 @@ Usage:
Options:
--only {vocab,conjugations,confusables,plurals,complete} Run only one deck
--skip-scrape Use existing data/pealim_dict.csv (no pealim.com dict scraping)
--skip-scrape Skip list page scraping (use existing words.json)
--skip-detail Skip detail page scraping
--skip-audio Skip audio .mp3 downloads
--skip-examples Skip Ben Yehuda example fetching
--skip-conjugations Skip verb conjugation extraction
--skip-images Skip image fetching for concrete nouns
--refresh-examples Force rebuild of Ben Yehuda index (delete old, download nikkud corpus)
--test N Process only the first N dictionary words (for quick testing)
@ -21,7 +21,6 @@ import json
import logging
import re
import sys
import time
from pathlib import Path
from helpers import strip_nikkud
@ -39,6 +38,7 @@ OUTPUT_DIR = Path(__file__).parent / "output"
AUDIO_DIR = DATA_DIR / "audio"
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
FONTS_DIR = DATA_DIR / "fonts"
WORDS_JSON = DATA_DIR / "words.json"
def parse_args():
@ -48,47 +48,31 @@ def parse_args():
choices=["vocab", "conjugations", "confusables", "plurals", "complete"],
help="Run only one deck (skips all unrelated steps)",
)
p.add_argument("--skip-scrape", action="store_true", help="Skip dict scraping; use cached CSV")
p.add_argument("--skip-scrape", action="store_true", help="Skip list page scraping")
p.add_argument("--skip-detail", action="store_true", help="Skip detail page scraping")
p.add_argument("--skip-audio", action="store_true", help="Skip audio downloads")
p.add_argument("--skip-examples", action="store_true", help="Skip Ben Yehuda example lookup")
p.add_argument(
"--skip-conjugations",
action="store_true",
help="Skip verb conjugation extraction (deprecated: use --only vocab)",
)
p.add_argument("--skip-images", action="store_true", help="Skip image fetching")
p.add_argument("--refresh-examples", action="store_true", help="Force rebuild of Ben Yehuda index")
p.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
return p.parse_args()
def step_scrape(args):
"""Step 1 — scrape or load dictionary."""
dict_csv = DATA_DIR / "hebrew_dict.csv"
anki_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
# Legacy fallback names
legacy_dict = DATA_DIR / "pealim_dict.csv"
def step_list_scrape(args):
"""Step 1 — scrape pealim.com list pages → words.json."""
if args.skip_scrape:
if dict_csv.exists():
logger.info(f"[1] Using existing {dict_csv}")
elif legacy_dict.exists():
logger.info(f"[1] Using legacy {legacy_dict} (consider renaming)")
if WORDS_JSON.exists():
logger.info("[1] Using existing words.json (--skip-scrape)")
else:
logger.error(f"[1] --skip-scrape set but {dict_csv} not found. Aborting.")
logger.error(f"[1] --skip-scrape set but {WORDS_JSON} not found. Aborting.")
sys.exit(1)
return
logger.info("[1] Scraping dictionary from pealim.com …")
logger.info("[1] Scraping dictionary list pages from pealim.com …")
import pealim_list_scrape
import hebrew_extract
df = hebrew_extract.extract_from_website()
df.to_csv(dict_csv, index=True)
logger.info(f" Saved {len(df)} words → {dict_csv}")
df = hebrew_extract.modify_for_anki(df)
df.to_csv(anki_csv, sep=";", index=True)
logger.info(f" Saved Anki CSV → {anki_csv}")
total_pages = args.test if args.test else None
pealim_list_scrape.run_scrape(total_pages=total_pages, force_refresh=False)
def step_frequency() -> dict[str, int]:
@ -100,7 +84,7 @@ def step_frequency() -> dict[str, int]:
return frequency_lookup._freq
def step_examples(args, freq_cache: dict):
def step_examples(args, _freq_cache: dict):
"""Step 3 — load/build Ben Yehuda example index."""
if args.skip_examples:
logger.info("[3] Skipping examples (--skip-examples)")
@ -115,255 +99,100 @@ def step_examples(args, freq_cache: dict):
benyehuda.load(force_rebuild=args.refresh_examples)
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "hebrew_dict.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict.csv"
# Read word list from words.json instead of CSV
if not WORDS_JSON.exists():
logger.warning("[3] words.json not found, skipping examples")
return {}
try:
import pandas as pd
with open(WORDS_JSON, encoding="utf-8") as f:
words = json.load(f)
try:
df = pd.read_csv(dict_csv, sep=";", index_col=0)
if df.shape[1] < 3:
raise ValueError("too few columns")
except (ValueError, pd.errors.ParserError):
df = pd.read_csv(dict_csv, index_col=0)
entries = list(words.values())
if args.test:
entries = entries[: args.test]
if args.test:
df = df.head(args.test)
# Build confusable consonant set from words.json
consonant_counts: dict[str, int] = {}
for entry in entries:
ktiv_male = entry.get("word", {}).get("ktiv_male", "")
if ktiv_male:
safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(ktiv_male))
if safe:
consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
# Build confusable consonant set from CSV
consonant_counts: dict[str, int] = {}
for _, row in df.iterrows():
word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
if word_no_nik and word_no_nik not in ("nan", "None"):
safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_no_nik))
if safe:
consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
# Delete stale cache entries for confusable words so they get re-fetched
stale_deleted = 0
for entry in entries:
word_nikkud = entry.get("word", {}).get("nikkud", "")
ktiv_male = entry.get("word", {}).get("ktiv_male", "")
if word_nikkud and ktiv_male:
safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(ktiv_male))
if safe in confusable_consonants and word_nikkud in benyehuda._examples_cache:
del benyehuda._examples_cache[word_nikkud]
stale_deleted += 1
if stale_deleted:
logger.info(f" Deleted {stale_deleted} stale confusable cache entries")
# Delete stale cache entries for confusable words so they get re-fetched
stale_deleted = 0
for _, row in df.iterrows():
word_nikkud = str(row.get("Word", "")).strip()
word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
if word_nikkud and word_no_nik:
safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_no_nik))
if safe in confusable_consonants and word_nikkud in benyehuda._examples_cache:
del benyehuda._examples_cache[word_nikkud]
stale_deleted += 1
if stale_deleted:
logger.info(f" Deleted {stale_deleted} stale confusable cache entries")
logger.info(f" Pre-fetching examples for {len(df)} words …")
for _, row in df.iterrows():
# Use nikkud word form as primary key (nikkud corpus)
word_nikkud = str(row.get("Word", "")).strip()
if word_nikkud:
benyehuda.get_examples(word_nikkud, confusable_consonants=confusable_consonants)
except Exception as e:
logger.warning(f" Could not pre-fetch all examples: {e}")
logger.info(f" Pre-fetching examples for {len(entries)} words …")
for entry in entries:
word_nikkud = entry.get("word", {}).get("nikkud", "")
if word_nikkud:
benyehuda.get_examples(word_nikkud, confusable_consonants=confusable_consonants)
benyehuda.save_examples_cache()
return benyehuda._examples_cache
def step_audio(args):
"""Step 4 — download vocabulary audio .mp3 files from audio_url column in CSV."""
if args.skip_audio:
logger.info("[4] Skipping audio (--skip-audio)")
def step_detail_scrape(args):
"""Step 4 — scrape detail pages for nouns and verbs → update words.json."""
if args.skip_detail:
logger.info("[4] Skipping detail scrape (--skip-detail)")
return
logger.info("[4] Downloading vocabulary audio files …")
logger.info("[4] Scraping detail pages from pealim.com …")
import pealim_detail_scrape
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "hebrew_dict.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict.csv"
import pandas as pd
import requests
try:
try:
df = pd.read_csv(dict_csv, sep=";", index_col=0)
if df.shape[1] < 3:
raise ValueError("too few columns")
except (ValueError, pd.errors.ParserError):
df = pd.read_csv(dict_csv, index_col=0)
if "audio_url" not in df.columns:
logger.warning(" No audio_url column in CSV — re-scrape with hebrew_extract.py to capture audio URLs")
return
if args.test:
df = df.head(args.test)
# Build confusable set: consonant forms that appear more than once
confusable_consonants: set[str] = set()
consonant_counts: dict[str, int] = {}
for _, row in df.iterrows():
word_plain = str(row.get("Word Without Nikkud", "")).strip()
if word_plain and word_plain not in ("nan", "None"):
safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_plain))
if safe:
consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
AUDIO_DIR.mkdir(parents=True, exist_ok=True)
downloaded = 0
skipped = 0
no_url = 0
for _, row in df.iterrows():
word = str(row.get("Word", "")).strip()
word_plain = str(row.get("Word Without Nikkud", "")).strip()
audio_url = str(row.get("audio_url", "")).strip()
slug = str(row.get("slug", "")).strip()
if not word:
continue
safe_name = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_plain or word))
if not safe_name:
continue
# Confusable words: use slug-based filename to avoid collisions
if safe_name in confusable_consonants and slug and slug not in ("nan", "None"):
mp3_path = AUDIO_DIR / f"{slug}.mp3"
else:
mp3_path = AUDIO_DIR / f"{safe_name}.mp3"
if mp3_path.exists():
skipped += 1
continue
if not audio_url or audio_url in ("nan", "None", ""):
no_url += 1
continue
try:
resp = requests.get(audio_url, timeout=10)
resp.raise_for_status()
mp3_path.write_bytes(resp.content)
downloaded += 1
time.sleep(0.3)
except Exception as e:
logger.debug(f" Audio download failed for {word}: {e}")
logger.info(f" Audio: {downloaded} downloaded, {skipped} already cached, {no_url} without URL")
except Exception as e:
logger.warning(f" Audio step failed: {e}")
test_limit = args.test if args.test else None
pealim_detail_scrape.run(test=test_limit, force_refresh=False)
def step_conj_audio(args, conjugations: dict):
"""Step 4b — download conjugation audio .mp3 files."""
def step_audio_download(args):
"""Step 5 — download audio .mp3 files from URLs in words.json."""
if args.skip_audio:
logger.info("[4b] Skipping conjugation audio (--skip-audio)")
logger.info("[5] Skipping audio (--skip-audio)")
return
logger.info("[4b] Downloading conjugation audio files …")
AUDIO_CONJ_DIR.mkdir(parents=True, exist_ok=True)
logger.info("[5] Downloading audio files …")
import pealim_audio_download
import requests
downloaded = 0
skipped = 0
failed = 0
for _infinitive, data in conjugations.items():
if not data or not data.get("forms"):
continue
slug = data.get("slug", "")
if not slug:
continue
# Active forms
for form_key, form_data in data["forms"].items():
audio_url = form_data.get("audio_url", "")
if not audio_url:
continue
filename = f"{slug}_{form_key}.mp3"
mp3_path = AUDIO_CONJ_DIR / filename
if mp3_path.exists():
skipped += 1
continue
try:
resp = requests.get(audio_url, timeout=10)
resp.raise_for_status()
mp3_path.write_bytes(resp.content)
downloaded += 1
time.sleep(0.2)
except Exception as e:
logger.debug(f" Conj audio failed {filename}: {e}")
failed += 1
# Passive partner forms
passive = data.get("passive_partner")
if passive and passive.get("forms"):
for form_key, form_data in passive["forms"].items():
audio_url = form_data.get("audio_url", "")
if not audio_url:
continue
filename = f"{slug}_passive_{form_key}.mp3"
mp3_path = AUDIO_CONJ_DIR / filename
if mp3_path.exists():
skipped += 1
continue
try:
resp = requests.get(audio_url, timeout=10)
resp.raise_for_status()
mp3_path.write_bytes(resp.content)
downloaded += 1
time.sleep(0.2)
except Exception as e:
logger.debug(f" Conj audio failed {filename}: {e}")
failed += 1
logger.info(f" Conjugation audio: {downloaded} downloaded, {skipped} cached, {failed} failed")
test_limit = args.test if args.test else None
pealim_audio_download.run(test=test_limit)
def step_fonts(args):
"""Step 4c — download Heebo font files (one-time, cached)."""
def step_fonts(_args: argparse.Namespace):
"""Step 6 — download Heebo font files (one-time, cached)."""
FONTS_DIR.mkdir(parents=True, exist_ok=True)
regular = FONTS_DIR / "_Heebo-Regular.ttf"
bold = FONTS_DIR / "_Heebo-Bold.ttf"
if regular.exists() and bold.exists():
logger.info("[4c] Heebo fonts already cached")
logger.info("[6] Heebo fonts already cached")
return
logger.info("[4c] Downloading Heebo fonts from Google Fonts …")
logger.info("[6] Downloading Heebo fonts from Google Fonts …")
# Fetch CSS to get actual TTF source URLs (static subset for Hebrew + Latin)
import requests as _req
headers = {
# Request TTF (not woff2) so Anki can embed them
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) Gecko/20100101 Firefox/120.0"
}
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) Gecko/20100101 Firefox/120.0"}
css_url = "https://fonts.googleapis.com/css2?family=Heebo:wght@400;700"
try:
css_resp = _req.get(css_url, headers=headers, timeout=15)
css_resp.raise_for_status()
css_text = css_resp.text
# Find all src: url(...) references (may be woff2 for modern UA)
font_urls = re.findall(r"src:\s*url\(([^)]+)\)", css_text)
logger.debug(f" Found {len(font_urls)} font URL(s) in CSS")
# Prefer TTF; if only woff2 available, download first two and note
downloaded = []
for i, fu in enumerate(font_urls[:2]):
fu = fu.strip("'\"")
dest = regular if i == 0 else bold
@ -372,128 +201,60 @@ def step_fonts(args):
fr = _req.get(fu, timeout=15)
fr.raise_for_status()
dest.write_bytes(fr.content)
downloaded.append(dest.name)
logger.info(f" Downloaded → {dest.name}")
if not downloaded:
logger.info(" All font files already present")
except Exception as e:
logger.warning(f" Heebo download failed: {e}")
logger.warning(" Cards will fall back to Arial Hebrew / David.")
logger.warning(
" To install manually: download Heebo-Regular.ttf and Heebo-Bold.ttf "
"from https://fonts.google.com/specimen/Heebo and rename with _ prefix "
f"into {FONTS_DIR}"
)
def step_images(args) -> dict:
"""Step 4d — fetch images for concrete nouns (resume-safe)."""
"""Step 7 — fetch images for concrete nouns (resume-safe)."""
if args.skip_images:
logger.info("[4d] Skipping images (--skip-images)")
logger.info("[7] Skipping images (--skip-images)")
cache_path = DATA_DIR / "image_cache.json"
if cache_path.exists():
with open(cache_path) as f:
return json.load(f)
return {}
limit = args.test # When in test mode, limit images too
logger.info("[4d] Fetching images for concrete nouns …")
limit = args.test
logger.info("[7] Fetching images for concrete nouns …")
import image_fetch
return image_fetch.run(limit=limit)
def step_build_all(
args, examples_cache: dict, freq_cache: dict, conjugations: dict | None, image_cache: dict | None = None
):
"""Step 5 — build all 6 release variants (4 vocab + 2 conj)."""
logger.info("[5] Building all deck variants …")
def step_build_all(args):
"""Step 8 — build all 12 release variants from the unified words.json."""
logger.info("[8] Building all deck variants …")
import apkg_builder
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "hebrew_dict.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict.csv"
if not WORDS_JSON.exists():
logger.error(f"[8] {WORDS_JSON} not found. Run the data pipeline first.")
sys.exit(1)
apkg_builder.build_all_variants(
dict_csv,
conjugations=conjugations or {},
examples_cache=examples_cache,
freq_cache=freq_cache,
image_cache=image_cache or {},
limit=args.test,
)
with open(WORDS_JSON, encoding="utf-8") as f:
words = json.load(f)
apkg_builder.build_all_variants(words, limit=args.test)
def step_conjugations(args):
"""Step 6 — extract conjugations (returns data; building handled by step_build_all).
--skip-conjugations skips re-extraction from pealim.com but still loads
from cache so conj deck variants are built correctly.
"""
conj_cache = DATA_DIR / "conjugations.json"
if args.skip_conjugations:
if conj_cache.exists():
logger.info("[6] --skip-conjugations: loading from cache …")
with open(conj_cache) as f:
import json as _json
return _json.load(f)
logger.info("[6] --skip-conjugations: no cache found, skipping conj decks")
return None
verbs_file = Path(__file__).parent / "verbs_input.txt"
if not verbs_file.exists():
logger.info("[6] verbs_input.txt not found — skipping conjugation deck")
return None
if conj_cache.exists():
logger.info("[6] Using cached conjugations.json …")
with open(conj_cache) as f:
import json as _json
conjugations = _json.load(f)
else:
logger.info("[6] Extracting verb conjugations …")
import conjugation_extract
conjugations = conjugation_extract.main(verbs_file)
# Download conjugation audio
step_conj_audio(args, conjugations)
return conjugations
def print_summary(args, examples_cache, freq_cache, conjugations):
def print_summary(_args: argparse.Namespace, examples_cache: dict, freq_cache: dict):
logger.info("")
logger.info("=" * 60)
logger.info("SUMMARY")
logger.info("=" * 60)
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "hebrew_dict.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "pealim_dict.csv"
if dict_csv.exists():
import pandas as pd
if WORDS_JSON.exists():
with open(WORDS_JSON, encoding="utf-8") as f:
words = json.load(f)
logger.info(f" Dictionary words: {len(words)}")
try:
df = pd.read_csv(dict_csv, sep=";", index_col=0)
if df.shape[1] < 3:
raise ValueError("too few columns")
except (ValueError, pd.errors.ParserError):
df = pd.read_csv(dict_csv, index_col=0)
logger.info(f" Dictionary words: {len(df)}")
nouns = sum(1 for e in words.values() if e.get("pos", "").startswith("Noun"))
verbs = sum(1 for e in words.values() if e.get("pos", "").startswith("Verb"))
detail_scraped = sum(1 for e in words.values() if e.get("detail_scraped"))
logger.info(f" Nouns: {nouns}, Verbs: {verbs}, Detail-scraped: {detail_scraped}")
logger.info(f" Frequency entries: {len(freq_cache)}")
logger.info(f" Example cache entries: {len(examples_cache)}")
@ -506,8 +267,6 @@ def print_summary(args, examples_cache, freq_cache, conjugations):
logger.info(f" Vocabulary audio files: {len(mp3s)}")
if AUDIO_CONJ_DIR.exists():
# Count only files that will be bundled: active non-infinitive forms
# (excludes {slug}_passive_* and {slug}_infinitive.mp3 on-disk extras)
mp3s = [
p for p in AUDIO_CONJ_DIR.glob("*.mp3") if not p.stem.endswith("_infinitive") and "_passive_" not in p.stem
]
@ -538,9 +297,6 @@ def print_summary(args, examples_cache, freq_cache, conjugations):
if apkg.exists():
size_mb = apkg.stat().st_size / 1e6
logger.info(f" {apkg.name}: {size_mb:.1f} MB")
if conjugations:
verb_count = sum(1 for v in conjugations.values() if v)
logger.info(f" Verbs in conjugation deck: {verb_count}")
logger.info("=" * 60)
logger.info("DONE")
@ -559,88 +315,73 @@ def main():
logger.info(" REFRESH EXAMPLES: Ben Yehuda index will be rebuilt")
logger.info("=" * 60)
def _load_words_for_only() -> dict:
if not WORDS_JSON.exists():
logger.error(f"words.json not found at {WORDS_JSON}. Run the data pipeline first.")
sys.exit(1)
with open(WORDS_JSON, encoding="utf-8") as f:
return json.load(f)
if args.only == "conjugations":
step_fonts(args)
conjugations = step_conjugations(args)
if conjugations:
import apkg_builder
import apkg_builder
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "hebrew_dict.csv"
for audio, path in [(False, apkg_builder.CONJ_APKG), (True, apkg_builder.CONJ_APKG_AUDIO)]:
deck, media = apkg_builder.build_conj_deck(
conjugations,
include_audio=audio,
dict_csv=dict_csv,
)
apkg_builder.write_conj_apkg(deck, media, out_path=path)
print_summary(args, {}, {}, conjugations or {})
words = _load_words_for_only()
for audio, path in [(False, apkg_builder.CONJ_APKG), (True, apkg_builder.CONJ_APKG_AUDIO)]:
deck, media = apkg_builder.build_conj_deck(words, include_audio=audio)
apkg_builder.write_conj_apkg(deck, media, out_path=path)
print_summary(args, {}, {})
return
if args.only == "confusables":
step_fonts(args)
import apkg_builder
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
words = _load_words_for_only()
for audio, path in [(False, apkg_builder.CONF_APKG), (True, apkg_builder.CONF_APKG_AUDIO)]:
deck, media = apkg_builder.build_confusables_deck(dict_csv, include_audio=audio)
deck, media = apkg_builder.build_confusables_deck(words, include_audio=audio)
apkg_builder.write_conf_apkg(deck, media, out_path=path)
print_summary(args, {}, {}, {})
print_summary(args, {}, {})
return
if args.only == "plurals":
step_fonts(args)
import apkg_builder
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "hebrew_dict.csv"
words = _load_words_for_only()
for audio, path in [(False, apkg_builder.PLURAL_APKG), (True, apkg_builder.PLURAL_APKG_AUDIO)]:
deck, media = apkg_builder.build_plural_deck(dict_csv=dict_csv, include_audio=audio)
deck, media = apkg_builder.build_plural_deck(words, include_audio=audio)
apkg_builder.write_plural_apkg(deck, media, out_path=path)
print_summary(args, {}, {}, {})
print_summary(args, {}, {})
return
if args.only == "complete":
step_fonts(args)
freq_cache = step_frequency() if not args.skip_scrape else {}
examples_cache = step_examples(args, freq_cache) if not args.skip_examples else {}
image_cache = step_images(args) if not args.skip_images else {}
conjugations = step_conjugations(args)
import apkg_builder
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
if not dict_csv.exists():
dict_csv = DATA_DIR / "hebrew_dict.csv"
words = _load_words_for_only()
emoji_lookup = apkg_builder._load_emoji_lookup()
for audio, path in [(False, apkg_builder.COMPLETE_APKG), (True, apkg_builder.COMPLETE_APKG_AUDIO)]:
decks, media = apkg_builder.build_complete_deck(
dict_csv,
conjugations=conjugations or {},
examples_cache=examples_cache,
freq_cache=freq_cache,
image_cache=image_cache,
emoji_lookup=emoji_lookup,
words,
include_audio=audio,
emoji_lookup=emoji_lookup,
)
apkg_builder.write_complete_apkg(decks, media, out_path=path)
print_summary(args, examples_cache, freq_cache, conjugations or {})
print_summary(args, {}, {})
return
if args.only == "vocab":
args.skip_conjugations = True
step_scrape(args)
# Full pipeline
step_list_scrape(args)
freq_cache = step_frequency()
examples_cache = step_examples(args, freq_cache)
step_audio(args)
step_detail_scrape(args)
step_audio_download(args)
step_fonts(args)
image_cache = step_images(args)
conjugations = step_conjugations(args)
step_build_all(args, examples_cache, freq_cache, conjugations, image_cache)
step_images(args)
step_build_all(args)
print_summary(args, examples_cache, freq_cache, conjugations or {})
print_summary(args, examples_cache, freq_cache)
if __name__ == "__main__":

View file

@ -0,0 +1,212 @@
"""Check that every GUID in the last-release complete .apkg exists in words.json.
Extracts GUIDs from the Anki SQLite database inside the .apkg (zip) file,
then compares against all GUID fields stored in data/words.json.
Usage:
python3 scripts/check_guid_coverage.py
python3 scripts/check_guid_coverage.py --apkg output/hebrew_complete.apkg
python3 scripts/check_guid_coverage.py --verbose
"""
from __future__ import annotations
import argparse
import json
import os
import sqlite3
import sys
import tempfile
import zipfile
from pathlib import Path
from typing import Any
# Repository root: two levels up from this script file.
PROJECT_ROOT = Path(__file__).parent.parent
# .apkg inspected when --apkg is not given on the command line.
DEFAULT_APKG = PROJECT_ROOT / "output" / "hebrew_complete.apkg"
# Data store whose GUID fields are compared against the .apkg.
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
# Known model IDs (from apkg_builder.py)
# Keys are matched against the `mid` column of the .apkg notes table;
# values are the deck-type names used when comparing and reporting.
MODEL_IDS = {
1701222017968: "vocab",
1234567893: "conjugation",
1234567897: "plurals",
1234567895: "confusables",
}
def extract_apkg_guids(apkg_path: Path) -> dict[int, set[str]]:
    """Extract note GUIDs from an .apkg, grouped by model ID.

    The .apkg is a zip archive; its ``collection.anki2`` member is a SQLite
    database whose ``notes`` table is read for ``guid`` and ``mid``.

    Args:
        apkg_path: Path to the .apkg file.

    Returns:
        Mapping of model ID (``mid``) to the set of note GUIDs using it.

    Raises:
        sqlite3.OperationalError: If the archive has no ``collection.anki2``
            member or the database has no ``notes`` table.
    """
    by_model: dict[int, set[str]] = {}
    with zipfile.ZipFile(apkg_path) as z, tempfile.TemporaryDirectory() as td:
        if "collection.anki2" not in z.namelist():
            # Same exception family the missing "notes" table would produce,
            # but with a message that names the actual problem.
            raise sqlite3.OperationalError(f"collection.anki2 not found in {apkg_path}")
        # Unpack only the database: the archive also carries every media file,
        # which can be large and is irrelevant here.
        db_path = z.extract("collection.anki2", td)
        conn = sqlite3.connect(db_path)
        try:
            for guid, mid in conn.execute("SELECT guid, mid FROM notes"):
                by_model.setdefault(mid, set()).add(guid)
        finally:
            # Always release the file handle so the temp directory can be removed.
            conn.close()
    return by_model
def collect_words_json_guids(data: dict[str, Any]) -> dict[str, set[str]]:
    """Gather every GUID stored in words.json, bucketed by deck type.

    Returns a dict with the keys "vocab", "cloze", "conjugation", "plurals"
    and "confusables", each mapping to the set of GUIDs found for that deck.
    Missing or empty fields are skipped.
    """
    buckets: dict[str, set[str]] = {
        "vocab": set(),
        "cloze": set(),
        "conjugation": set(),
        "plurals": set(),
        "confusables": set(),
    }

    for record in data.values():
        # Vocab legacy GUID (top-level)
        legacy_guid = record.get("vocab_legacy_guid")
        if legacy_guid:
            buckets["vocab"].add(legacy_guid)

        # Cloze GUID lives at examples.cloze.cloze_guid
        cloze_block = (record.get("examples") or {}).get("cloze") or {}
        cloze_guid = cloze_block.get("cloze_guid")
        if cloze_guid:
            buckets["cloze"].add(cloze_guid)

        # Plurals GUID lives inside noun_inflection
        plurals_guid = (record.get("noun_inflection") or {}).get("plurals_guid")
        if plurals_guid:
            buckets["plurals"].add(plurals_guid)

        # Confusables GUID (top-level)
        confusables_guid = record.get("confusables_guid")
        if confusables_guid:
            buckets["confusables"].add(confusables_guid)

        # Conjugation: one GUID per form plus any alternative candidates
        conjugation = record.get("conjugation") or {}
        for list_key in ("active_forms", "hufal_pual_forms"):
            for form in conjugation.get(list_key) or ():
                form_guid = form.get("guid")
                if form_guid:
                    buckets["conjugation"].add(form_guid)
                buckets["conjugation"].update(form.get("guid_candidates") or ())

    return buckets
def main() -> None:
    """Compare note GUIDs in an .apkg against the GUIDs stored in words.json.

    For every model listed in ``MODEL_IDS`` the apkg GUIDs are compared with
    the matching words.json GUID set and a PASS/FAIL line is printed.  With
    ``--verbose`` a sample of the missing/extra GUIDs is listed as well.

    Exit codes:
        0 - every apkg GUID of a known model exists in words.json
        1 - at least one apkg GUID is missing from words.json
        2 - the .apkg or words.json file does not exist
    """
    parser = argparse.ArgumentParser(description="Check GUID coverage between .apkg and words.json")
    parser.add_argument(
        "--apkg",
        type=Path,
        default=DEFAULT_APKG,
        help=f"Path to .apkg file (default: {DEFAULT_APKG})",
    )
    parser.add_argument("--verbose", "-v", action="store_true")
    args = parser.parse_args()

    if not args.apkg.exists():
        print(f"ERROR: apkg not found: {args.apkg}")
        sys.exit(2)
    if not WORDS_JSON.exists():
        print(f"ERROR: words.json not found: {WORDS_JSON}")
        sys.exit(2)

    print(f"Checking: {args.apkg}")
    print(f"Against: {WORDS_JSON}")
    print()

    apkg_by_model = extract_apkg_guids(args.apkg)
    # Use a context manager so the file handle is closed deterministically.
    with WORDS_JSON.open(encoding="utf-8") as fh:
        data = json.load(fh)
    wj = collect_words_json_guids(data)

    total_apkg = sum(len(s) for s in apkg_by_model.values())
    total_wj = sum(len(s) for s in wj.values())
    print(f"Total GUIDs in apkg: {total_apkg}")
    print(f"Total GUIDs in words.json: {total_wj}")
    print()

    # Map each apkg deck to the words.json GUID set(s) that back it.
    # Vocab notes cover both vocab cards (ord 0,1) and cloze (ord 2).
    # They share the note GUID — vocab_legacy_guid IS the note guid.
    wj_sets_by_deck = {
        "vocab": wj["vocab"] | wj["cloze"],
        "conjugation": wj["conjugation"],
        "plurals": wj["plurals"],
        "confusables": wj["confusables"],
    }

    all_missing = 0
    all_extra = 0
    for mid, deck_name in MODEL_IDS.items():
        apkg_set = apkg_by_model.get(mid, set())
        wj_set = wj_sets_by_deck.get(deck_name, set())

        missing = apkg_set - wj_set
        extra = wj_set - apkg_set
        matched = apkg_set & wj_set
        all_missing += len(missing)
        all_extra += len(extra)

        # Only GUIDs missing from words.json fail a deck; extras are reported
        # but tolerated.
        status = "PASS" if not missing else "FAIL"
        print(f" {status} {deck_name} (mid={mid})")
        print(
            f" apkg={len(apkg_set)}, words.json={len(wj_set)}, "
            f"matched={len(matched)}, missing={len(missing)}, extra={len(extra)}"
        )
        if missing and args.verbose:
            print(" Missing GUIDs (in apkg, not in words.json):")
            for g in sorted(missing)[:20]:
                print(f" {g!r}")
            if len(missing) > 20:
                print(f" ... ({len(missing) - 20} more)")
        if extra and args.verbose:
            print(" Extra GUIDs (in words.json, not in apkg):")
            for g in sorted(extra)[:10]:
                print(f" {g!r}")
            if len(extra) > 10:
                print(f" ... ({len(extra) - 10} more)")
        print()

    # Check for unknown model IDs in apkg
    unknown_mids = set(apkg_by_model.keys()) - set(MODEL_IDS.keys())
    if unknown_mids:
        print(f" WARNING: Unknown model IDs in apkg: {unknown_mids}")
        for mid in unknown_mids:
            print(f" mid={mid}: {len(apkg_by_model[mid])} notes")

    # Visible separator before the verdict (an empty string repeated 60 times
    # would only print a blank line).
    print("-" * 60)
    if all_missing:
        print(f" FAILED: {all_missing} apkg GUIDs not found in words.json")
        print(" (These notes would lose study progress on reimport)")
        sys.exit(1)
    else:
        print(f" All {total_apkg} apkg GUIDs accounted for in words.json.")
        sys.exit(0)


if __name__ == "__main__":
    main()

1041
scripts/migrate_to_json.py Normal file

File diff suppressed because it is too large Load diff

420
scripts/repair_slugs.py Normal file
View file

@ -0,0 +1,420 @@
#!/usr/bin/env python3
"""
Repair duplicate slugs in data/words.json.
Homographs (words with identical spelling but different meanings) were
assigned the same slug by the scraper. This script fetches the pealim.com
search page for each affected word, matches entries by meaning (and nikkud),
and writes the corrected slugs back to words.json and the source CSV.
Usage:
python3 scripts/repair_slugs.py [--dry-run]
"""
from __future__ import annotations
import argparse
import json
import logging
import re
import sys
import time
from collections import defaultdict
from difflib import SequenceMatcher
from pathlib import Path
import pandas as pd
import requests
from bs4 import BeautifulSoup
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# Repository root: two directory levels above this script.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Unified data store that main() reads and rewrites.
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
# Semicolon-delimited source CSV kept in sync by update_csv().
CSV_PATH = PROJECT_ROOT / "data" / "hebrew_dict_for_anki.csv"
# ---------------------------------------------------------------------------
# HTTP session
# ---------------------------------------------------------------------------
# One shared session so every search request carries the same headers.
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-scraper/1.0)"})
# Cookies sent with every search request.
# NOTE(review): presumably these disable transliteration and pick a Hebrew
# display style on pealim.com — confirm against the site.
COOKIES: dict[str, str] = {"translit": "none", "hebstyle": "mo"}
REQUEST_DELAY = 1.5  # seconds between requests
REQUEST_TIMEOUT = 15  # seconds
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Configured at import time; main() raises the level to DEBUG for --verbose.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Similarity helpers
# ---------------------------------------------------------------------------
# Minimum _best_match() score repair_group() accepts; weaker matches are skipped.
FUZZY_THRESHOLD = 0.4
def _similarity(a: str, b: str) -> float:
"""Return SequenceMatcher ratio between two strings (both lowercased)."""
return SequenceMatcher(None, a.lower(), b.lower()).ratio()
def _best_match(
our_meaning: str,
candidates: list[dict],
our_nikkud: str,
) -> tuple[dict | None, float]:
"""
Return (best_candidate, ratio) by comparing our_meaning against each
candidate's meaning field. Nikkud exact-match gives a bonus to break ties.
"""
best: dict | None = None
best_score = -1.0
for cand in candidates:
ratio = _similarity(our_meaning, cand["meaning"])
# Nikkud exact match adds a small bonus so the right homograph wins
# even when meanings are very similar
if our_nikkud and cand["word"] == our_nikkud:
ratio = min(1.0, ratio + 0.05)
if ratio > best_score:
best_score = ratio
best = cand
return best, best_score
# ---------------------------------------------------------------------------
# Search-page parser
# ---------------------------------------------------------------------------
def _parse_search_results(html: bytes) -> list[dict]:
    """
    Turn a pealim.com search results page into a list of result records.

    For every ``div.verb-search-result`` block the following is read:
      - slug:    from the first ``a[href]`` inside ``div.verb-search-data``
                 (the path segment after ``/dict/``)
      - word:    text of ``span.menukad`` inside ``div.verb-search-lemma``,
                 falling back to the lemma div's own text
      - pos:     text of ``div.verb-search-binyan`` without the
                 "Part of speech:" label
      - meaning: text of ``div.verb-search-meaning``

    Blocks without a data div or without a recognisable slug are dropped.

    Returns a list of dicts with keys: slug, word, pos, meaning.
    """

    def _div_text(parent, css_class: str) -> str:
        # Stripped text of the first matching child div, or "" when absent.
        node = parent.find("div", class_=css_class)
        return node.get_text(strip=True) if node else ""

    parsed: list[dict] = []
    document = BeautifulSoup(html, "html.parser")
    for hit in document.find_all("div", class_="verb-search-result"):
        data_div = hit.find("div", class_="verb-search-data")
        if not data_div:
            continue

        # The slug lives in the detail-page link; no link or no match means
        # the block cannot be used.
        anchor = data_div.find("a", href=True)
        found = re.search(r"/dict/([^/#]+)/", anchor["href"]) if anchor else None
        if not found:
            continue

        lemma_div = hit.find("div", class_="verb-search-lemma")
        if lemma_div:
            vocalised = lemma_div.find("span", class_="menukad")
            word = (vocalised or lemma_div).get_text(strip=True)
        else:
            word = ""

        parsed.append(
            {
                "slug": found.group(1),
                "word": word,
                "pos": _div_text(hit, "verb-search-binyan").replace("Part of speech:", "").strip(),
                "meaning": _div_text(hit, "verb-search-meaning"),
            }
        )
    return parsed
def _fetch_search_results(ktiv_male: str) -> list[dict]:
    """Fetch and parse search results for a given consonant-only spelling.

    The query is passed through ``params`` so that the HTTP library escapes
    it; interpolating it into the URL would let characters such as ``&``,
    ``#`` or ``+`` in the spelling alter or truncate the query string.

    Raises:
        requests.RequestException: on connection problems, timeouts or a
            non-2xx response (via ``raise_for_status``).
    """
    url = "https://www.pealim.com/search/"
    logger.debug("GET %s?q=%s", url, ktiv_male)
    resp = SESSION.get(
        url,
        params={"q": ktiv_male},
        cookies=COOKIES,
        timeout=REQUEST_TIMEOUT,
    )
    resp.raise_for_status()
    return _parse_search_results(resp.content)
# ---------------------------------------------------------------------------
# Core logic
# ---------------------------------------------------------------------------
def find_duplicate_groups(data: dict) -> dict[str, list[str]]:
    """
    Return a mapping slug -> [word_key, ...] for every slug used by 2+ entries.

    The word_key is the top-level key in words.json (nikkud + PoS + meaning).
    Entries with a missing or empty slug are ignored.
    """
    keys_by_slug: dict[str, list[str]] = {}
    for word_key, entry in data.items():
        slug = entry.get("slug", "")
        if not slug:
            continue
        keys_by_slug.setdefault(slug, []).append(word_key)

    return {slug: members for slug, members in keys_by_slug.items() if len(members) >= 2}
def repair_group(
    slug: str,
    keys: list[str],
    data: dict,
    dry_run: bool,
) -> tuple[int, int]:
    """
    Attempt to repair one group of entries sharing *slug*.

    Homographs can have different ktiv_male spellings (e.g. אבידה vs אבדה for
    the two spellings of אֲבֵדָה). We therefore build a union of all search
    results obtained by querying each distinct ktiv_male in the group.

    Each entry is matched to the search result with the most similar meaning
    (see ``_best_match``); entries whose best score is below
    ``FUZZY_THRESHOLD`` are skipped.  When several entries pick the same
    search result, only the highest-scoring one (the earliest on a tie)
    receives that slug and the others are skipped, so two entries are never
    both reported as fixed with one and the same slug.

    With *dry_run* set, decisions are logged but *data* is left untouched.

    Returns (fixed_count, skipped_count).
    """
    # Distinct ktiv_male values in first-seen order (usually one, but
    # sometimes two when homographs have different consonant spellings).
    ktiv_values = list(dict.fromkeys(data[k]["word"]["ktiv_male"] for k in keys))

    nikkud_word = data[keys[0]]["word"]["nikkud"]
    logger.info(
        " Fetching search results for %s - %d entries share slug %s",
        nikkud_word,
        len(keys),
        slug,
    )

    # Fetch search results for every distinct ktiv_male and merge them,
    # keeping the first occurrence of each slug.
    all_candidates: list[dict] = []
    seen_slugs: set[str] = set()
    for ktiv in ktiv_values:
        try:
            results = _fetch_search_results(ktiv)
        except requests.RequestException as exc:
            # Best effort: a failed sub-query contributes no candidates.
            logger.warning(" HTTP error for %s: %s", ktiv, exc)
            results = []
        for r in results:
            if r["slug"] not in seen_slugs:
                seen_slugs.add(r["slug"])
                all_candidates.append(r)
        if len(ktiv_values) > 1:
            # Small delay between sub-queries within the same group
            time.sleep(REQUEST_DELAY)

    if not all_candidates:
        logger.warning(" No search results — skipping group")
        return 0, len(keys)

    # Filter candidates to those whose nikkud word matches an entry's nikkud.
    # This avoids accidentally matching a completely different word that shares
    # the same consonant spelling (e.g. different voweling entirely).
    group_nikkuds = {data[k]["word"]["nikkud"] for k in keys}
    filtered = [c for c in all_candidates if c["word"] in group_nikkuds]
    if not filtered:
        logger.warning(
            " Search results don't contain nikkud %s — candidates: %s — skipping",
            group_nikkuds,
            [c["word"] for c in all_candidates],
        )
        return 0, len(keys)

    skipped = 0
    # new slug -> (key, matched candidate, score) of the strongest claimant.
    claims: dict[str, tuple[str, dict, float]] = {}
    for key in keys:
        entry = data[key]
        our_meaning = entry.get("meaning", "")
        our_nikkud = entry["word"]["nikkud"]

        # Prefer candidates that match this entry's nikkud; fall back to the
        # whole filtered pool when none does.
        nikkud_filtered = [c for c in filtered if c["word"] == our_nikkud]
        pool = nikkud_filtered if nikkud_filtered else filtered
        best, score = _best_match(our_meaning, pool, our_nikkud)

        if best is None or score < FUZZY_THRESHOLD:
            logger.warning(
                " SKIP key=%s | meaning=%r | best_score=%.2f",
                key,
                our_meaning,
                score,
            )
            skipped += 1
            continue

        rival = claims.get(best["slug"])
        if rival is None:
            claims[best["slug"]] = (key, best, score)
            continue

        # Two entries want the same slug: the stronger match keeps it, the
        # other is skipped instead of recreating a duplicate.
        if score > rival[2]:
            loser_key, loser_score = rival[0], rival[2]
            winner_key, winner_score = key, score
            claims[best["slug"]] = (key, best, score)
        else:
            loser_key, loser_score = key, score
            winner_key, winner_score = rival[0], rival[2]
        logger.warning(
            " SKIP key=%s | slug %s already claimed by key=%s (score=%.2f vs %.2f)",
            loser_key,
            best["slug"],
            winner_key,
            loser_score,
            winner_score,
        )
        skipped += 1

    fixed = 0
    for new_slug, (key, best, score) in claims.items():
        old_slug = data[key]["slug"]
        if new_slug == old_slug:
            logger.info(" SAME key=%s | slug=%s (score=%.2f)", key, old_slug, score)
        else:
            logger.info(
                " FIX key=%s | %s -> %s | matched=%r (score=%.2f)",
                key,
                old_slug,
                new_slug,
                best["meaning"],
                score,
            )
            if not dry_run:
                data[key]["slug"] = new_slug
        fixed += 1

    return fixed, skipped
# ---------------------------------------------------------------------------
# CSV update
# ---------------------------------------------------------------------------
def update_csv(data: dict, dry_run: bool) -> None:
    """
    Re-write the CSV so every row's slug column matches words.json.

    The CSV is semicolon-delimited; the slug column is named 'slug'.
    We match rows by 'Word Without Nikkud' (ktiv_male) AND 'Meaning' because
    homographs share the same ktiv_male.

    Args:
        data: Parsed words.json content (top-level key -> entry dict).
        dry_run: When true, differences are logged but the CSV is not written.
    """
    # keep_default_na=False keeps literal cells such as "NA" or "null" as text;
    # with the default NA parsing they would be read as missing values and be
    # written back as empty cells.
    df = pd.read_csv(CSV_PATH, sep=";", dtype=str, keep_default_na=False)
    if "slug" not in df.columns:
        logger.warning("CSV has no 'slug' column — skipping CSV update")
        return

    # Build a lookup: (ktiv_male, meaning) → new_slug from words.json
    lookup: dict[tuple[str, str], str] = {}
    for entry in data.values():
        ktiv = entry["word"].get("ktiv_male", "")
        meaning = entry.get("meaning", "")
        slug = entry.get("slug", "")
        if ktiv and slug:
            lookup[(ktiv, meaning)] = slug

    changes = 0
    for idx, row in df.iterrows():
        ktiv = str(row.get("Word Without Nikkud", "")).strip()
        meaning = str(row.get("Meaning", "")).strip()
        key = (ktiv, meaning)
        if key in lookup:
            new_slug = lookup[key]
            old_slug = str(row["slug"]).strip()
            if new_slug != old_slug:
                logger.info(
                    " CSV row %d: %s -> %s (%s)",
                    idx,
                    old_slug,
                    new_slug,
                    ktiv,
                )
                if not dry_run:
                    df.at[idx, "slug"] = new_slug
                changes += 1

    logger.info("CSV: %d slug(s) to update", changes)
    if not dry_run and changes:
        # The file was read without an index column, so writing the frame's
        # RangeIndex would add a spurious leading column on every run.
        df.to_csv(CSV_PATH, sep=";", index=False)
        logger.info("CSV written to %s", CSV_PATH)
    elif dry_run:
        logger.info("DRY-RUN: CSV not written")
# ---------------------------------------------------------------------------
# Main
# ---------------------------------------------------------------------------
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point: repair duplicate slugs and sync the CSV.

    Loads words.json, repairs every duplicate-slug group via
    ``repair_group``, writes words.json back (unless ``--dry-run``) and then
    calls ``update_csv``.

    Returns 0 when no entry was skipped, 1 otherwise.
    """
    cli = argparse.ArgumentParser(description="Repair duplicate slugs in data/words.json")
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Preview changes without writing any files",
    )
    cli.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Enable debug logging",
    )
    options = cli.parse_args(argv)
    dry_run = options.dry_run

    if options.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
    if dry_run:
        logger.info("=== DRY-RUN mode — no files will be modified ===")

    # Load data
    logger.info("Loading %s", WORDS_JSON)
    with WORDS_JSON.open(encoding="utf-8") as source:
        data: dict = json.load(source)
    logger.info("Loaded %d entries", len(data))

    # Identify duplicate groups
    groups = find_duplicate_groups(data)
    total_groups = len(groups)
    total_entries = sum(map(len, groups.values()))
    logger.info(
        "Found %d duplicate-slug groups covering %d entries",
        total_groups,
        total_entries,
    )

    # Process each group in slug order
    total_fixed = total_skipped = 0
    ordered_groups = sorted(groups.items())
    for position, (slug, keys) in enumerate(ordered_groups, start=1):
        logger.info(
            "[%d/%d] slug=%s (%d entries)",
            position,
            total_groups,
            slug,
            len(keys),
        )
        fixed, skipped = repair_group(slug, keys, data, dry_run=dry_run)
        total_fixed += fixed
        total_skipped += skipped

        # Respectful delay between HTTP requests (not needed after the last group)
        is_last_group = position >= total_groups
        if not is_last_group:
            time.sleep(REQUEST_DELAY)

    logger.info(
        "Summary: %d fixed, %d skipped (out of %d entries in %d groups)",
        total_fixed,
        total_skipped,
        total_entries,
        total_groups,
    )

    # Write updated words.json
    if dry_run:
        logger.info("DRY-RUN: words.json not written")
    else:
        logger.info("Writing %s", WORDS_JSON)
        with WORDS_JSON.open("w", encoding="utf-8") as target:
            json.dump(data, target, ensure_ascii=False, indent=2)
        logger.info("words.json written")

    # Update CSV
    logger.info("Updating CSV %s", CSV_PATH)
    update_csv(data, dry_run=dry_run)

    return 1 if total_skipped else 0


if __name__ == "__main__":
    sys.exit(main())

800
scripts/validate_data.py Normal file
View file

@ -0,0 +1,800 @@
"""Standalone integrity validator for data/words.json.
Validates the unified Hebrew Flash Cards data against the schema defined in
SCHEMA.yaml. Each test prints PASS/FAIL with details on failures.
Usage:
python3 scripts/validate_data.py
python3 scripts/validate_data.py --verbose
python3 scripts/validate_data.py --test confusable_symmetric
"""
from __future__ import annotations
import argparse
import json
import re
import sys
import unicodedata
from pathlib import Path
from typing import Any
# ---------------------------------------------------------------------------
# Bootstrap: make project root importable so helpers.py is accessible
# ---------------------------------------------------------------------------
# Put the directory above scripts/ first on the import path.
sys.path.insert(0, str(Path(__file__).parent.parent))
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Data file validated by this script; read by load_data() and re-read by
# test_no_duplicate_keys().
DATA_FILE = Path(__file__).parent.parent / "data" / "words.json"
# Inclusive code-point bounds checked by _is_hebrew_consonant().
HEBREW_CONSONANT_RANGE = (0x05D0, 0x05EA)  # alef..tav
# Person codes accepted for conjugation forms.
VALID_PERSON_CODES: frozenset[str] = frozenset(
    ["1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp", "3p", "ms", "fs", "mp", "fp"]
)
# Single character class built from several code-point ranges; searched by
# test_no_emoji_in_meaning() to flag emoji in the 'meaning' field.
EMOJI_RE = re.compile(
    r"[\U0001f600-\U0001f64f"
    r"\U0001f300-\U0001f5ff"
    r"\U0001f680-\U0001f6ff"
    r"\U0001f1e0-\U0001f1ff"
    r"\U00002702-\U000027b0"
    r"\U0001f900-\U0001f9ff"
    r"\U0001fa00-\U0001fa6f"
    r"\U0001fa70-\U0001faff]"
)
# ---------------------------------------------------------------------------
# Result tracking
# ---------------------------------------------------------------------------
# Names of failed tests, appended by _fail().
_failures: list[str] = []
# Warning detail lines, collected by _warn().
_warnings: list[str] = []
# When true, tests print every detail line instead of the first 20.
# NOTE(review): presumably set from --verbose by the CLI entry point, which is
# outside this view — confirm.
_verbose: bool = False
def _pass(name: str) -> None:
print(f" PASS {name}")
def _fail(name: str, details: list[str]) -> None:
    """Record *name* as a failed test and print a FAIL line plus its details.

    The test name is appended to the module-level ``_failures`` list; each
    detail is printed on its own line.
    """
    # Mutating the module-level list in place needs no ``global`` statement.
    _failures.append(name)
    print(" FAIL", name)
    for line in details:
        print(f" {line}")
def _warn(name: str, details: list[str]) -> None:
    """Collect warning *details* and print a WARN line followed by them.

    All detail lines are added to the module-level ``_warnings`` list; the
    warning name itself is only printed.
    """
    # Mutating the module-level list in place needs no ``global`` statement.
    _warnings.extend(details)
    print(" WARN", name)
    for line in details:
        print(f" {line}")
def _verbose_print(msg: str) -> None:
    """Print *msg* (prefixed with a space) only when verbose mode is on."""
    if not _verbose:
        return
    print(f" {msg}")
# ---------------------------------------------------------------------------
# Helper: load data
# ---------------------------------------------------------------------------
def load_data() -> dict[str, Any]:
    """Parse words.json and return its content.

    Prints an error and exits with status 2 when the data file is missing.
    """
    if DATA_FILE.exists():
        return json.loads(DATA_FILE.read_text(encoding="utf-8"))

    print(f"ERROR: data file not found: {DATA_FILE}")
    sys.exit(2)
def _is_hebrew_consonant(ch: str) -> bool:
    """Return True if ch is a Hebrew consonant (U+05D0..U+05EA).

    Accepts multi-codepoint strings like 'שׁ' (shin + shin dot) by checking
    only the first base character after NFD decomposition.

    An empty string is not a consonant and yields False (indexing its first
    character would otherwise raise IndexError).
    """
    if not ch:
        return False
    # After NFD the first codepoint is the base character; any following
    # codepoints are combining marks.
    base = unicodedata.normalize("NFD", ch)[0]
    lowest, highest = HEBREW_CONSONANT_RANGE
    return lowest <= ord(base) <= highest
# ---------------------------------------------------------------------------
# Individual tests
# ---------------------------------------------------------------------------
def test_required_fields(data: dict[str, Any]) -> None:
    """Check that every entry has word.nikkud, word.ktiv_male, slug, pos and meaning.

    A null or missing 'frequency' is reported separately as a warning
    ("frequency_missing") and does not fail the test.
    """
    name = "required_fields"
    limit = 20
    errors: list[str] = []
    missing_frequency: list[str] = []

    for key, entry in data.items():
        word = entry.get("word")
        if isinstance(word, dict):
            errors.extend(
                f"[{key}] word.{part} is missing or empty"
                for part in ("nikkud", "ktiv_male")
                if not word.get(part)
            )
        else:
            errors.append(f"[{key}] 'word' is missing or not a dict")

        errors.extend(
            f"[{key}] '{field}' is missing or empty"
            for field in ("slug", "pos", "meaning")
            if not entry.get(field)
        )

        if entry.get("frequency") is None:
            missing_frequency.append(f"[{key}] 'frequency' is null/missing")

    if missing_frequency:
        _warn("frequency_missing", missing_frequency if _verbose else missing_frequency[:limit])
        if not _verbose and len(missing_frequency) > limit:
            print(f" ... ({len(missing_frequency) - limit} more; use --verbose)")

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:limit])
    if not _verbose and len(errors) > limit:
        print(f" ... ({len(errors) - limit} more; use --verbose)")
def test_root_format(data: dict[str, Any]) -> None:
    """Check that root is a list of 2-5 Hebrew consonants, or an empty list.

    Only the first offending root element of an entry is reported.
    """
    name = "root_format"
    errors: list[str] = []

    def _valid_letter(item: Any) -> bool:
        # A root element may be multi-codepoint (e.g. 'שׁ' = shin + shin dot);
        # _is_hebrew_consonant looks at the base consonant after NFD.
        return isinstance(item, str) and bool(item) and _is_hebrew_consonant(item)

    for key, entry in data.items():
        root = entry.get("root")
        if root is None:
            errors.append(f"[{key}] 'root' key is absent (should be [] for rootless words)")
        elif not isinstance(root, list):
            errors.append(f"[{key}] 'root' is not a list: {root!r}")
        elif not root:
            pass  # rootless word — valid
        elif len(root) < 2 or len(root) > 5:
            errors.append(f"[{key}] root has {len(root)} elements (expected 2-5): {root!r}")
        else:
            for ch in root:
                if not _valid_letter(ch):
                    errors.append(f"[{key}] root char {ch!r} is not a Hebrew consonant (U+05D0..U+05EA)")
                    break

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    hidden = len(errors) - 20
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
def test_unique_slugs(data: dict[str, Any]) -> None:
    """Check that no non-empty slug is used by more than one entry.

    Each pealim page is a distinct word, so a slug may appear only once.
    """
    name = "unique_slugs"
    keys_by_slug: dict[str, list[str]] = {}
    for key, entry in data.items():
        slug = entry.get("slug")
        if not slug:
            continue
        holders = keys_by_slug.get(slug)
        if holders is None:
            keys_by_slug[slug] = [key]
        else:
            holders.append(key)

    errors = [
        f"slug={slug!r} shared by: {keys}"
        for slug, keys in keys_by_slug.items()
        if len(keys) > 1
    ]
    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_no_duplicate_keys(_data: dict[str, Any]) -> None:  # noqa: ARG001
    """Check that the JSON file contains no repeated object keys.

    Python's json parser silently keeps the last value when a key is
    repeated, so the already-parsed ``_data`` cannot reveal the problem.
    The raw file is parsed again with an ``object_pairs_hook`` that sees
    every key/value pair before merging; the hook runs for each JSON object
    in the file, so repeats at any nesting level are recorded.
    """
    name = "no_duplicate_keys"
    repeated: list[str] = []

    def _record_repeats(pairs: list[tuple[str, Any]]) -> dict[str, Any]:
        already_seen: set[str] = set()
        for pair_key, _value in pairs:
            if pair_key in already_seen:
                repeated.append(pair_key)
            already_seen.add(pair_key)
        # Same merge result as the default parser: the last value wins.
        return dict(pairs)

    raw_text = DATA_FILE.read_text(encoding="utf-8")
    json.loads(raw_text, object_pairs_hook=_record_repeats)

    if not repeated:
        _pass(name)
        return
    _fail(name, [f"duplicate key: {k!r}" for k in repeated])
def test_confusable_symmetric(data: dict[str, Any]) -> None:
    """Check that confusable_group links are mutual: if A lists B, B lists A.

    References to keys that do not exist in the data are reported too.
    """
    name = "confusable_symmetric"
    errors: list[str] = []

    for key, entry in data.items():
        for other_key in entry.get("confusable_group") or ():
            other = data.get(other_key)
            if other is None:
                errors.append(f"[{key}] confusable_group references non-existent key {other_key!r}")
                continue
            back_links = other.get("confusable_group") or []
            if key in back_links:
                continue
            errors.append(f"[{key}] lists {other_key!r} as confusable, but {other_key!r} does not list {key!r}")

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    extra = len(errors) - 20
    if extra > 0 and not _verbose:
        print(f" ... ({extra} more; use --verbose)")
def test_shared_roots_valid_keys(data: dict[str, Any]) -> None:
    """Check that every key listed in shared_roots exists as a top-level key."""
    name = "shared_roots_valid_keys"
    errors: list[str] = []

    for key, entry in data.items():
        references = entry.get("shared_roots") or ()
        errors.extend(
            f"[{key}] shared_roots references non-existent key {ref_key!r}"
            for ref_key in references
            if ref_key not in data
        )

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_unique_legacy_guids(data: dict[str, Any]) -> None:
    """Check that no two entries share a vocab_legacy_guid (null values excluded).

    Exception: entries that share the same word.nikkud value inherited the
    same legacy Anki card (PoS homographs like חַד Particle vs Adjective).
    These are tolerated — the duplicate GUID is a known artefact of how
    legacy GUIDs were generated from the nikkud word alone.  Tolerated
    groups are only mentioned in verbose mode.
    """
    name = "unique_legacy_guids"
    keys_by_guid: dict[str, list[str]] = {}
    for key, entry in data.items():
        guid = entry.get("vocab_legacy_guid")
        if guid:
            keys_by_guid.setdefault(guid, []).append(key)

    errors: list[str] = []
    for guid, keys in keys_by_guid.items():
        if len(keys) < 2:
            continue
        nikkuds = {(data[k].get("word") or {}).get("nikkud") for k in keys}
        if len(nikkuds) != 1:
            errors.append(f"guid={guid!r} shared by entries with DIFFERENT nikkud: {keys}")
            continue
        # Same nikkud everywhere -> inherited from one legacy card; tolerable.
        (shared_nikkud,) = nikkuds
        _verbose_print(
            f"GUID {guid!r} shared by {len(keys)} entries with same nikkud ({shared_nikkud!r}): {keys}"
        )

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_no_noun_inflection_on_non_nouns(data: dict[str, Any]) -> None:
    """Check that noun_inflection is null unless pos starts with 'Noun'.

    Explicit test case: 'גָּבוֹהַּ' (adjective) must NOT have noun_inflection.
    """
    name = "no_noun_inflection_on_non_nouns"
    errors: list[str] = []

    for key, entry in data.items():
        pos = entry.get("pos") or ""
        inflection = entry.get("noun_inflection")
        if pos.startswith("Noun") or inflection is None:
            continue
        errors.append(f"[{key}] pos={pos!r} but noun_inflection is set")
        _verbose_print(f"offending entry: {key!r}")

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    surplus = len(errors) - 20
    if surplus > 0 and not _verbose:
        print(f" ... ({surplus} more; use --verbose)")
def test_no_emoji_in_meaning(data: dict[str, Any]) -> None:
    """Check that no meaning field contains a character matched by EMOJI_RE."""
    name = "no_emoji_in_meaning"

    # A missing or null meaning is treated as an empty string.
    meanings = ((key, entry.get("meaning") or "") for key, entry in data.items())
    errors: list[str] = [
        f"[{key}] meaning contains emoji: {meaning!r}"
        for key, meaning in meanings
        if EMOJI_RE.search(meaning)
    ]

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_example_sentences_contain_word(data: dict[str, Any]) -> None:
    """Check that word.nikkud occurs in at least one of the entry's vetted sentences.

    Only entries with a non-empty examples.vetted list and a non-empty
    word.nikkud are checked.  Matching is exact (nikkud included), not
    stripped.  Findings are reported as warnings; the test itself always
    prints PASS.
    """
    name = "example_sentences_contain_word"
    findings: list[str] = []

    for key, entry in data.items():
        vetted = (entry.get("examples") or {}).get("vetted")
        if not vetted:
            continue
        nikkud_word = (entry.get("word") or {}).get("nikkud") or ""
        if not nikkud_word:
            continue

        if any(nikkud_word in (sentence.get("text") or "") for sentence in vetted):
            continue
        # Show at most the first two sentences to keep the report readable.
        preview = [sentence.get("text", "") for sentence in vetted[:2]]
        findings.append(
            f"[{key}] word {nikkud_word!r} not found in any vetted sentence. Sentences: {preview!r}"
        )

    if findings:
        _warn(name, findings if _verbose else findings[:20])
        if not _verbose and len(findings) > 20:
            print(f" ... ({len(findings) - 20} more; use --verbose)")
    _pass(name)
def test_cloze_offsets_valid(data: dict[str, Any]) -> None:
    """Check that cloze_word_start/end lie within the cloze text when present.

    Null offsets are tolerated (and warned separately) because some sentences
    contain only inflected/construct/plural forms that cannot be matched back
    to the base nikkud or ktiv_male — this is a data quality issue in
    vetted_sentences.json, not a schema violation.

    For non-null offsets the first failing rule is reported: both must be
    integers, non-negative, start < end, and end <= len(text).
    """
    name = "cloze_offsets_valid"
    errors: list[str] = []
    null_offsets: list[str] = []

    for key, entry in data.items():
        cloze = (entry.get("examples") or {}).get("cloze")
        if not cloze:
            continue

        text = cloze.get("text") or ""
        start = cloze.get("cloze_word_start")
        end = cloze.get("cloze_word_end")

        if start is None or end is None:
            null_offsets.append(f"[{key}] cloze present but cloze_word_start/end are null")
        elif not (isinstance(start, int) and isinstance(end, int)):
            errors.append(f"[{key}] cloze_word_start/end are not integers: {start!r}, {end!r}")
        elif start < 0 or end < 0:
            errors.append(f"[{key}] cloze offsets are negative: start={start}, end={end}")
        elif start >= end:
            errors.append(f"[{key}] cloze start >= end: start={start}, end={end}")
        elif end > len(text):
            errors.append(f"[{key}] cloze end={end} exceeds text length={len(text)}: {text!r}")

    if null_offsets:
        _warn(f"{name}_null_offsets", null_offsets if _verbose else null_offsets[:20])
        if not _verbose and len(null_offsets) > 20:
            print(f" ... ({len(null_offsets) - 20} more; use --verbose)")

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_hufal_pual_only_on_hifil_piel(data: dict[str, Any]) -> None:
    """Check that hufal_pual_forms is set only on Hif'il or Pi'el verbs.

    The binyan is recognised by a case-insensitive substring match on
    "hif" or "pi"; a null hufal_pual_forms is always accepted.
    """
    name = "hufal_pual_only_on_hifil_piel"
    allowed_markers = ("hif", "pi")
    errors: list[str] = []

    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj or conj.get("hufal_pual_forms") is None:
            continue
        binyan = conj.get("binyan") or ""
        lowered = binyan.lower()
        if any(marker in lowered for marker in allowed_markers):
            continue
        errors.append(f"[{key}] hufal_pual_forms is set but binyan={binyan!r} (expected Hif'il or Pi'el)")

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_confusable_group_shares_ktiv_male(data: dict[str, Any]) -> None:
    """Check that every member of a confusable_group has the same word.ktiv_male.

    Entries or members without a ktiv_male, and members that do not exist in
    the data, are ignored here.
    """
    name = "confusable_group_shares_ktiv_male"
    errors: list[str] = []

    def _ktiv_of(item: dict[str, Any]) -> Any:
        return (item.get("word") or {}).get("ktiv_male")

    for key, entry in data.items():
        group = entry.get("confusable_group")
        if not group:
            continue
        my_ktiv = _ktiv_of(entry)
        if not my_ktiv:
            continue

        for other_key in group:
            other = data.get(other_key)
            if not other:
                continue  # already caught by confusable_symmetric
            other_ktiv = _ktiv_of(other)
            if other_ktiv and other_ktiv != my_ktiv:
                errors.append(
                    f"[{key}] ktiv_male={my_ktiv!r} but confusable member {other_key!r} has ktiv_male={other_ktiv!r}"
                )

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_confusables_guid(data: dict[str, Any]) -> None:
    """Check that confusables_guid is consistent with confusable_group.

    Rules:
    - An entry with a confusable_group must have a confusables_guid.
    - An entry without a confusable_group must have a null confusables_guid.
    - Every existing member of an entry's confusable_group must carry the
      same confusables_guid as the entry itself.
    """
    name = "confusables_guid"
    errors: list[str] = []

    for key, entry in data.items():
        group = entry.get("confusable_group")
        guid = entry.get("confusables_guid")

        if group and guid:
            for other_key in group:
                other = data.get(other_key)
                if not other:
                    continue  # already caught by confusable_symmetric
                other_guid = other.get("confusables_guid")
                if other_guid == guid:
                    continue
                errors.append(
                    f"[{key}] confusables_guid={guid!r} but confusable member "
                    f"{other_key!r} has confusables_guid={other_guid!r}"
                )
        elif group:
            errors.append(f"[{key}] has confusable_group but confusables_guid is null/missing")
        elif guid is not None:
            errors.append(f"[{key}] has confusables_guid={guid!r} but confusable_group is null")

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_conjugation_form_guids(data: dict[str, Any]) -> None:
    """Check that conjugation forms are identifiable and GUIDs do not repeat within a verb.

    Rules enforced, per entry that has a ``conjugation`` block:
      - Each form in ``active_forms`` and ``hufal_pual_forms`` needs a truthy
        ``guid`` or a non-empty ``guid_candidates`` list.
      - A GUID (or, when ``guid`` is absent, any GUID candidate) may appear on
        only one form across both lists of the same verb.

    Args:
        data: Mapping of entry key to entry dict, as loaded from words.json.
    """
    name = "conjugation_form_guids"
    errors: list[str] = []
    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj:
            continue
        # identifier -> label of the first form that claimed it (scoped to this verb)
        claimed: dict[str, str] = {}
        for form_list_key in ("active_forms", "hufal_pual_forms"):
            for form in conj.get(form_list_key) or ():
                label = f"{form_list_key}[{form.get('person', '?')}]"
                guid = form.get("guid")
                candidates = form.get("guid_candidates")
                if not guid and not candidates:
                    errors.append(f"[{key}] {label}: missing or null 'guid' and no 'guid_candidates'")
                    continue
                # A concrete guid takes precedence; candidates are only
                # examined when no guid is set.
                if guid:
                    field, values = "guid", [guid]
                else:
                    field, values = "guid_candidate", candidates
                for value in values:
                    if value in claimed:
                        errors.append(f"[{key}] {label}: {field}={value!r} duplicates {claimed[value]}")
                    else:
                        claimed[value] = label

    if not errors:
        _pass(name)
        return
    # Cap the report at 20 lines unless --verbose was requested.
    _fail(name, errors[:20] if not _verbose else errors)
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_conjugation_person_codes(data: dict[str, Any]) -> None:
    """Check that every conjugation form uses a person code from VALID_PERSON_CODES.

    Both ``active_forms`` and ``hufal_pual_forms`` are inspected; a form with
    no ``person`` key is reported with a code of ``None``.

    Args:
        data: Mapping of entry key to entry dict, as loaded from words.json.
    """
    name = "conjugation_person_codes"
    errors: list[str] = []
    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj:
            continue
        for form_list_key in ("active_forms", "hufal_pual_forms"):
            for form in conj.get(form_list_key) or ():
                code = form.get("person")
                if code in VALID_PERSON_CODES:
                    continue
                errors.append(f"[{key}] {form_list_key}: invalid person code {code!r}")

    if not errors:
        _pass(name)
        return
    # Cap the report at 20 lines unless --verbose was requested.
    _fail(name, errors[:20] if not _verbose else errors)
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_no_stripped_form_sentence_collisions(data: dict[str, Any]) -> None:
    """Warn when a confusable word's vetted sentence contains another member's nikkud form.

    For each entry that belongs to a confusable_group and has vetted example
    sentences, every other group member whose nikkud spelling differs from the
    entry's own is looked up; if that spelling occurs as a substring of any of
    the entry's vetted sentence texts, one finding is recorded for that pair.

    Findings are reported through ``_warn`` rather than ``_fail``, and the
    test is then reported through ``_pass`` regardless.

    Args:
        data: Mapping of entry key to entry dict, as loaded from words.json.
    """
    name = "no_stripped_form_sentence_collisions"
    errors: list[str] = []
    for key, entry in data.items():
        group = entry.get("confusable_group")
        if not group:
            continue
        vetted = (entry.get("examples") or {}).get("vetted")
        if not vetted:
            continue
        own_nikkud = (entry.get("word") or {}).get("nikkud") or ""
        sentence_texts = [sentence.get("text") or "" for sentence in vetted]

        for other_key in group:
            other = data.get(other_key)
            if not other:
                continue
            wrong_form = (other.get("word") or {}).get("nikkud") or ""
            if not wrong_form or wrong_form == own_nikkud:
                continue  # same nikkud homographs are ok (we can't distinguish by nikkud)
            # Only the first offending sentence is reported per (key, other_key) pair.
            offending = next((text for text in sentence_texts if wrong_form in text), None)
            if offending is None:
                continue
            errors.append(f"[{key}] sentence contains wrong homograph {wrong_form!r}: {offending!r}")
            _verbose_print(f" my word: {own_nikkud!r}, wrong form: {wrong_form!r}")

    if errors:
        # Cap the report at 20 lines unless --verbose was requested.
        _warn(name, errors[:20] if not _verbose else errors)
        if len(errors) > 20 and not _verbose:
            print(f" ... ({len(errors) - 20} more; use --verbose)")
    _pass(name)
# ---------------------------------------------------------------------------
# Stats summary
# ---------------------------------------------------------------------------
def print_stats(data: dict[str, Any]) -> None:
    """Print a summary of dataset coverage metrics to stdout.

    Each metric counts the entries whose corresponding field is truthy; the
    two example metrics look inside the (possibly null) ``examples`` block.

    Args:
        data: Mapping of entry key to entry dict, as loaded from words.json.
    """

    def count_field(field: str) -> int:
        """Count entries whose top-level *field* is truthy."""
        return sum(1 for e in data.values() if e.get(field))

    def count_example(kind: str) -> int:
        """Count entries whose ``examples[kind]`` is truthy (``examples`` may be None)."""
        return sum(1 for e in data.values() if (e.get("examples") or {}).get(kind))

    rows: list[tuple[str, int]] = [
        (" Total entries:", len(data)),
        (" With conjugation data:", count_field("conjugation")),
        (" With noun_inflection:", count_field("noun_inflection")),
        (" With vetted examples:", count_example("vetted")),
        (" With cloze examples:", count_example("cloze")),
        (" With images:", count_field("image")),
        (" With emoji:", count_field("emoji")),
        (" With legacy GUIDs:", count_field("vocab_legacy_guid")),
        (" In confusable groups:", count_field("confusable_group")),
        (" With shared roots:", count_field("shared_roots")),
    ]

    print()
    print("Stats Summary")
    # The separator previously multiplied an empty string, which printed a
    # blank line instead of a horizontal rule.
    print("─" * 42)
    for label, value in rows:
        print(f"{label} {value:>6}")
# ---------------------------------------------------------------------------
# Test registry
# ---------------------------------------------------------------------------
# Registry mapping each test's CLI name (accepted by --test) to the function
# that implements it. main() iterates this dict, so the order here is both the
# execution order and the order in which names are listed in --test help text.
ALL_TESTS: dict[str, Any] = {
    "required_fields": test_required_fields,
    "root_format": test_root_format,
    "unique_slugs": test_unique_slugs,
    "no_duplicate_keys": test_no_duplicate_keys,
    "confusable_symmetric": test_confusable_symmetric,
    "shared_roots_valid_keys": test_shared_roots_valid_keys,
    "unique_legacy_guids": test_unique_legacy_guids,
    "no_noun_inflection_on_non_nouns": test_no_noun_inflection_on_non_nouns,
    "no_emoji_in_meaning": test_no_emoji_in_meaning,
    "example_sentences_contain_word": test_example_sentences_contain_word,
    "cloze_offsets_valid": test_cloze_offsets_valid,
    "hufal_pual_only_on_hifil_piel": test_hufal_pual_only_on_hifil_piel,
    "confusable_group_shares_ktiv_male": test_confusable_group_shares_ktiv_male,
    "confusables_guid": test_confusables_guid,
    "conjugation_form_guids": test_conjugation_form_guids,
    "conjugation_person_codes": test_conjugation_person_codes,
    "no_stripped_form_sentence_collisions": test_no_stripped_form_sentence_collisions,
}
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main() -> None:
    """Command-line entry point: run the validation tests and exit with a status code.

    Options:
        --verbose / -v   Print every failure instead of only the first 20.
        --test NAME      Run a single test from ALL_TESTS; the stats summary
                         is skipped in this mode.

    Exit status:
        0 if no test recorded a failure, 1 if any did, 2 if --test names an
        unknown test.
    """
    global _verbose
    parser = argparse.ArgumentParser(description="Validate data/words.json against the Hebrew Flash Cards schema.")
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Print full details for all failures (not just first 20).",
    )
    parser.add_argument(
        "--test",
        metavar="NAME",
        help=f"Run a single test by name. Available: {', '.join(ALL_TESTS)}",
    )
    args = parser.parse_args()
    _verbose = args.verbose
    data = load_data()

    # Select tests to run
    if args.test:
        if args.test not in ALL_TESTS:
            print(f"ERROR: unknown test {args.test!r}. Available: {', '.join(ALL_TESTS)}")
            sys.exit(2)
        tests_to_run = {args.test: ALL_TESTS[args.test]}
    else:
        tests_to_run = ALL_TESTS

    print(f"Validating {DATA_FILE} ({len(data)} entries)")
    # The separators previously multiplied an empty string and printed blank
    # lines; emit an actual horizontal rule.
    print("─" * 60)

    # NOTE(review): every test, including no_duplicate_keys, is called with the
    # parsed dict here; presumably that test re-reads the raw file itself, since
    # duplicate keys cannot survive JSON parsing into a dict — confirm.
    for test_fn in tests_to_run.values():
        test_fn(data)

    # Summary
    if not args.test:
        print_stats(data)
    print()
    print("─" * 60)
    if _warnings:
        print(f" Warnings : {len(_warnings)}")
    if _failures:
        print(f" FAILED: {len(_failures)} test(s): {', '.join(_failures)}")
        sys.exit(1)
    else:
        print(f" All {len(tests_to_run)} test(s) passed.")
        sys.exit(0)


if __name__ == "__main__":
    main()

View file

@ -0,0 +1,441 @@
#!/usr/bin/env python3
"""Integration tests: scrape real pealim.com pages and validate data.
These tests hit pealim.com directly. They are skipped when the environment
variable SKIP_INTEGRATION is set to any non-empty string.
Run with:
pytest tests/test_scraper_integration.py -v -m integration
"""
import json
import os
import re
import sys
import time
from pathlib import Path
import pytest
# Add project root to path so all sibling modules are importable
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
import pealim_detail_scrape
import pealim_list_scrape
# ---------------------------------------------------------------------------
# Skip marker
# ---------------------------------------------------------------------------
# Applied to every test class below: skips the class when the SKIP_INTEGRATION
# environment variable is set to any non-empty string.
skip_integration = pytest.mark.skipif(
    bool(os.environ.get("SKIP_INTEGRATION", "")),
    reason="SKIP_INTEGRATION is set",
)
# A known Hif'il verb slug that is not page-1 dependent.
# לְהַגִּיד (to tell/say) — Hif'il, slug 4183-lehagid
HIFIL_VERB_SLUG = "4183-lehagid"
HIFIL_VERB_NIKKUD = "לְהַגִּיד"
HIFIL_VERB_MEANING = "to say, to tell"
# Minimum expected entries from a single list page
MIN_LIST_ENTRIES = 10
# Hebrew letter regex: matches one character in the range U+05D0–U+05EA
# (alef through tav); nikkud marks lie outside this range and do not match.
HEBREW_CHAR_RE = re.compile(r"[\u05d0-\u05ea]")
# Slug pattern: one or more digits, hyphen, one or more word chars
SLUG_RE = re.compile(r"^\d+-\w+$")
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _has_hebrew(text: str) -> bool:
    """Report whether *text* contains any character matched by HEBREW_CHAR_RE."""
    return HEBREW_CHAR_RE.search(text) is not None
def _words_from_file(path: Path) -> dict:
with path.open(encoding="utf-8") as fh:
return json.load(fh)
# ---------------------------------------------------------------------------
# Test class: list page scrape
# ---------------------------------------------------------------------------
@pytest.mark.integration
@skip_integration
class TestListScrape:
    """Checks on the output of pealim_list_scrape.run_scrape for a single list page.

    Every test redirects the scraper's WORDS_JSON and PROGRESS_JSON module
    attributes into pytest's tmp_path and runs a fresh one-page scrape.
    """

    def _scrape_page_one(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
        """Run a forced one-page list scrape into *tmp_path*; return the words.json path."""
        words_path = tmp_path / "words.json"
        progress_path = tmp_path / "list_scrape_progress.json"
        monkeypatch.setattr(pealim_list_scrape, "WORDS_JSON", words_path)
        monkeypatch.setattr(pealim_list_scrape, "PROGRESS_JSON", progress_path)
        pealim_list_scrape.run_scrape(total_pages=1, force_refresh=True)
        return words_path

    def test_list_page_1_produces_entries(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """The scrape creates words.json with no fewer than MIN_LIST_ENTRIES entries."""
        words_path = self._scrape_page_one(tmp_path, monkeypatch)
        assert words_path.exists(), "words.json was not created after scrape"
        words = _words_from_file(words_path)
        assert len(words) >= MIN_LIST_ENTRIES, (
            f"Expected at least {MIN_LIST_ENTRIES} entries from page 1, got {len(words)}"
        )

    def test_list_entries_have_required_fields(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Each entry carries non-empty word.nikkud, word.ktiv_male, slug, pos and meaning."""
        words = _words_from_file(self._scrape_page_one(tmp_path, monkeypatch))
        for key, entry in words.items():
            word_block = entry.get("word", {})
            nikkud = word_block.get("nikkud", "")
            assert nikkud, f"Entry '{key}': word.nikkud is empty"
            assert _has_hebrew(nikkud), f"Entry '{key}': word.nikkud has no Hebrew chars: {nikkud!r}"
            assert word_block.get("ktiv_male", ""), f"Entry '{key}': word.ktiv_male is empty"
            slug = entry.get("slug", "")
            assert slug, f"Entry '{key}': slug is empty"
            assert SLUG_RE.match(slug), f"Entry '{key}': slug does not match \\d+-\\w+ pattern: {slug!r}"
            assert entry.get("pos", ""), f"Entry '{key}': pos is empty"
            assert entry.get("meaning", ""), f"Entry '{key}': meaning is empty"

    def test_list_at_least_one_entry_has_root(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Some entry in the scraped page has a truthy ``root`` value."""
        words = _words_from_file(self._scrape_page_one(tmp_path, monkeypatch))
        has_root = any(entry.get("root") for entry in words.values())
        assert has_root, "No entries on page 1 have a non-empty root list"

    def test_list_at_least_one_entry_has_audio(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Some entry in the scraped page has a truthy ``audio_url`` value."""
        words = _words_from_file(self._scrape_page_one(tmp_path, monkeypatch))
        has_audio = any(entry.get("audio_url") for entry in words.values())
        assert has_audio, "No entries on page 1 have a non-empty audio_url"

    def test_list_post_process_fields_exist(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Each entry has ``confusable_group`` and ``shared_roots`` keys, the latter a list."""
        words = _words_from_file(self._scrape_page_one(tmp_path, monkeypatch))
        for key, entry in words.items():
            for required in ("confusable_group", "shared_roots"):
                assert required in entry, f"Entry '{key}' missing '{required}' key"
            assert isinstance(entry["shared_roots"], list), f"Entry '{key}': shared_roots is not a list"
# ---------------------------------------------------------------------------
# Test class: noun detail scrape
# ---------------------------------------------------------------------------
@pytest.mark.integration
@skip_integration
class TestDetailScrapeNoun:
    """Checks on pealim_detail_scrape output for a noun taken from a live list page.

    Each test scrapes list page 1 into tmp_path, picks the first noun that has
    a root and a slug, runs the detail scraper in nouns-only mode against the
    same words.json, and inspects that noun's entry afterwards.
    """

    def _find_noun_with_root(self, words: dict) -> tuple[str, dict] | None:
        """Return the first (key, entry) whose pos starts with 'Noun' and has root and slug, else None."""
        candidates = (
            (key, entry)
            for key, entry in words.items()
            if entry.get("pos", "").startswith("Noun") and entry.get("root") and entry.get("slug")
        )
        return next(candidates, None)

    def _prepare_words_json(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> tuple[Path, dict]:
        """Run a forced one-page list scrape into *tmp_path*; return (words.json path, parsed words)."""
        words_path = tmp_path / "words.json"
        monkeypatch.setattr(pealim_list_scrape, "WORDS_JSON", words_path)
        monkeypatch.setattr(pealim_list_scrape, "PROGRESS_JSON", tmp_path / "list_scrape_progress.json")
        pealim_list_scrape.run_scrape(total_pages=1, force_refresh=True)
        return words_path, _words_from_file(words_path)

    def _scrape_noun_detail(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> tuple[str, dict, dict]:
        """List-scrape, pick a noun, detail-scrape; return (noun key, pre-scrape entry, updated words).

        NOTE(review): the detail scraper is invoked with ``test=1`` and is not
        told which noun to process; this presumably relies on it picking the
        same noun that ``_find_noun_with_root`` selects — confirm against
        pealim_detail_scrape.run.
        """
        words_path, words = self._prepare_words_json(tmp_path, monkeypatch)
        pair = self._find_noun_with_root(words)
        assert pair is not None, "No noun with a root found on page 1"
        noun_key, noun_entry = pair
        monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
        # Small rate-limit delay between list scrape and detail scrape
        time.sleep(1.0)
        pealim_detail_scrape.run(test=1, force_refresh=True, nouns_only=True)
        return noun_key, noun_entry, _words_from_file(words_path)

    def test_noun_detail_inflection_not_null(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """The noun's ``noun_inflection`` is populated by the detail scrape."""
        noun_key, noun_entry, updated_words = self._scrape_noun_detail(tmp_path, monkeypatch)
        entry = updated_words.get(noun_key, {})
        assert entry.get("noun_inflection") is not None, (
            f"noun_inflection is None after detail scrape for '{noun_key}' (slug={noun_entry.get('slug')})"
        )

    def test_noun_detail_singular_and_plural_forms(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Singular and plural inflections each have non-empty nikkud and ktiv_male."""
        noun_key, _noun_entry, updated_words = self._scrape_noun_detail(tmp_path, monkeypatch)
        inflection = updated_words[noun_key].get("noun_inflection", {}) or {}
        singular = inflection.get("singular") or {}
        plural = inflection.get("plural") or {}
        assert singular.get("nikkud"), f"noun_inflection.singular.nikkud is empty for '{noun_key}'"
        assert singular.get("ktiv_male"), f"noun_inflection.singular.ktiv_male is empty for '{noun_key}'"
        assert plural.get("nikkud"), f"noun_inflection.plural.nikkud is empty for '{noun_key}'"
        assert plural.get("ktiv_male"), f"noun_inflection.plural.ktiv_male is empty for '{noun_key}'"

    def test_noun_detail_gender(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """The inflection's gender is either 'masculine' or 'feminine'."""
        noun_key, _noun_entry, updated_words = self._scrape_noun_detail(tmp_path, monkeypatch)
        inflection = updated_words[noun_key].get("noun_inflection", {}) or {}
        gender = inflection.get("gender", "")
        assert gender in ("masculine", "feminine"), (
            f"noun_inflection.gender is {gender!r} for '{noun_key}' (expected 'masculine' or 'feminine')"
        )

    def test_noun_detail_scraped_flag(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """The noun's ``detail_scraped`` flag is exactly True after the detail scrape."""
        noun_key, _noun_entry, updated_words = self._scrape_noun_detail(tmp_path, monkeypatch)
        assert updated_words[noun_key].get("detail_scraped") is True, (
            f"detail_scraped is not True after scrape for '{noun_key}'"
        )
# ---------------------------------------------------------------------------
# Test class: verb detail scrape (Hif'il)
# ---------------------------------------------------------------------------
@pytest.mark.integration
@skip_integration
class TestDetailScrapeVerb:
    """Checks on pealim_detail_scrape output for the fixed verb entry HIFIL_VERB_SLUG.

    Each test writes a one-entry words.json into tmp_path, points the detail
    scraper's WORDS_JSON at it, runs the scraper in verbs-only mode, and
    inspects the rewritten entry.
    """

    def _build_test_words_json(self, tmp_path: Path) -> Path:
        """Write a words.json holding only the known Hif'il verb and return its path.

        The entry has ``pos`` set to 'Verb' and deliberately omits the
        ``detail_scraped`` key, with the intent that the detail scraper treats
        it as not yet processed.
        """
        entry = {
            "word": {"nikkud": HIFIL_VERB_NIKKUD, "ktiv_male": "להגיד"},
            "slug": HIFIL_VERB_SLUG,
            "root": ["נ", "ג", "ד"],
            "pos": "Verb",
            "pos_hebrew": "פֹּעַל — הִפְעִיל",
            "meaning": HIFIL_VERB_MEANING,
            "meaning_raw": HIFIL_VERB_MEANING,
            "audio_url": "",
            "audio_file": "להגיד.mp3",
            "tags": "שורש::נגד פעלים",
            "last_scrape_date": "2026-03-08",
            "vocab_legacy_guid": None,
            "frequency": None,
            "pseudo_frequency": None,
            "emoji": None,
            "emoji_source": None,
            "emoji_visible": False,
            "image": None,
            "image_source": None,
            "hint": "",
            "shared_roots": [],
            "confusable_group": None,
            "confusables_guid": None,
            "examples": None,
            "noun_inflection": None,
            "conjugation": None,
            "adjective_inflection": None,
            "preposition_inflection": None,
            # Intentionally no detail_scraped key so the scraper processes it
        }
        words_path = tmp_path / "words.json"
        payload = json.dumps({HIFIL_VERB_NIKKUD: entry}, ensure_ascii=False, indent=2)
        words_path.write_text(payload, encoding="utf-8")
        return words_path

    def _scrape_verb(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> dict:
        """Build the one-verb words.json, run the verbs-only detail scrape, return the reloaded words."""
        words_path = self._build_test_words_json(tmp_path)
        monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
        pealim_detail_scrape.run(test=1, force_refresh=True, verbs_only=True)
        return _words_from_file(words_path)

    def _scrape_conjugation(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> dict:
        """Return the verb's ``conjugation`` block after a scrape, or {} when it is missing/null."""
        words = self._scrape_verb(tmp_path, monkeypatch)
        return words[HIFIL_VERB_NIKKUD].get("conjugation") or {}

    def test_verb_detail_conjugation_not_null(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """The verb's ``conjugation`` is populated by the detail scrape."""
        entry = self._scrape_verb(tmp_path, monkeypatch).get(HIFIL_VERB_NIKKUD, {})
        assert entry.get("conjugation") is not None, f"conjugation is None after detail scrape for {HIFIL_VERB_SLUG}"

    def test_verb_detail_binyan(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """``binyan`` is "Hif'il" and ``binyan_hebrew`` is its vocalised Hebrew name."""
        conj = self._scrape_conjugation(tmp_path, monkeypatch)
        assert conj.get("binyan") == "Hif'il", f"Expected binyan='Hif\\'il', got {conj.get('binyan')!r}"
        assert conj.get("binyan_hebrew") == "הִפְעִיל", (
            f"Expected binyan_hebrew='הִפְעִיל', got {conj.get('binyan_hebrew')!r}"
        )

    def test_verb_detail_infinitive_and_reference_form(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """``infinitive.nikkud`` and ``reference_form.nikkud`` are non-empty and contain Hebrew."""
        conj = self._scrape_conjugation(tmp_path, monkeypatch)
        inf_nikkud = (conj.get("infinitive") or {}).get("nikkud", "")
        ref_nikkud = (conj.get("reference_form") or {}).get("nikkud", "")
        assert inf_nikkud and _has_hebrew(inf_nikkud), (
            f"infinitive.nikkud is empty or has no Hebrew chars: {inf_nikkud!r}"
        )
        assert ref_nikkud and _has_hebrew(ref_nikkud), (
            f"reference_form.nikkud (3ms past) is empty or has no Hebrew chars: {ref_nikkud!r}"
        )

    def test_verb_detail_active_forms_count_and_structure(
        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """``active_forms`` is a list of 20+ items, each with person, tense and a Hebrew form."""
        conj = self._scrape_conjugation(tmp_path, monkeypatch)
        active_forms = conj.get("active_forms")
        assert isinstance(active_forms, list), f"active_forms is not a list: {type(active_forms)}"
        assert len(active_forms) >= 20, f"Expected at least 20 active forms, got {len(active_forms)}"
        for i, form in enumerate(active_forms):
            assert form.get("person"), f"active_forms[{i}].person is empty"
            assert form.get("tense"), f"active_forms[{i}].tense is empty"
            form_block = form.get("form") or {}
            assert form_block.get("nikkud") and _has_hebrew(form_block["nikkud"]), (
                f"active_forms[{i}].form.nikkud is empty or has no Hebrew: {form_block.get('nikkud')!r}"
            )
            assert form_block.get("ktiv_male") and _has_hebrew(form_block["ktiv_male"]), (
                f"active_forms[{i}].form.ktiv_male is empty or has no Hebrew: {form_block.get('ktiv_male')!r}"
            )

    def test_verb_detail_hufal_passive_section(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """``hufal_pual_forms`` is a non-empty list and ``reference_form_passive`` has Hebrew nikkud."""
        conj = self._scrape_conjugation(tmp_path, monkeypatch)
        hufal_forms = conj.get("hufal_pual_forms")
        assert hufal_forms is not None, "hufal_pual_forms is None — expected Huf'al passive section for a Hif'il verb"
        assert isinstance(hufal_forms, list), f"hufal_pual_forms is not a list: {type(hufal_forms)}"
        assert len(hufal_forms) > 0, "hufal_pual_forms list is empty"
        ref_passive = conj.get("reference_form_passive")
        assert ref_passive is not None, "reference_form_passive is None — expected a Huf'al 3ms past form"
        passive_nikkud = (ref_passive or {}).get("nikkud", "")
        assert passive_nikkud and _has_hebrew(passive_nikkud), (
            f"reference_form_passive.nikkud is empty or has no Hebrew: {passive_nikkud!r}"
        )

    def test_verb_detail_scraped_flag(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """The verb's ``detail_scraped`` flag is exactly True after the detail scrape."""
        entry = self._scrape_verb(tmp_path, monkeypatch).get(HIFIL_VERB_NIKKUD, {})
        assert entry.get("detail_scraped") is True, f"detail_scraped is not True after scrape for {HIFIL_VERB_SLUG}"

View file

@ -25,8 +25,7 @@ def test_apkg_builder_imports():
def test_data_files_exist():
data_dir = Path(__file__).resolve().parent.parent / "data"
assert (data_dir / "hebrew_dict_for_anki.csv").exists(), "vocab CSV missing"
assert (data_dir / "conjugations.json").exists(), "conjugations cache missing"
assert (data_dir / "words.json").exists(), "words.json missing"
def test_strip_nikkud_idempotent():
@ -42,4 +41,4 @@ def test_strip_nikkud_all_marks():
# Comprehensive: patach, kamatz, segol, tsere, hiriq, holam, kubutz, shva, dagesh
nikkud = "הַמַּלְכָּה"
plain = strip_nikkud(nikkud)
assert all(ch < "\u0591" or ch > "\u05C7" for ch in plain), f"Residual nikkud in: {plain}"
assert all(ch < "\u0591" or ch > "\u05c7" for ch in plain), f"Residual nikkud in: {plain}"