Sprint 11: unified JSON architecture + consolidated scraping pipeline
Migrate from fragmented CSV + 10 JSON files to a single data/words.json (9,104 entries) as the unified data store. All GUIDs preserved for Anki study progress continuity. New files: - SCHEMA.yaml: authoritative schema for words.json - pealim_list_scrape.py: consolidated list page scraper → words.json - pealim_detail_scrape.py: noun/verb detail scraper → words.json - pealim_audio_download.py: audio downloader reading from words.json - scripts/migrate_to_json.py: one-time CSV→JSON migration - scripts/validate_data.py: 17 data integrity tests - scripts/check_guid_coverage.py: GUID preservation checker - scripts/repair_slugs.py: slug deduplication repair tool - tests/test_scraper_integration.py: live scraper integration tests Updated: - apkg_builder.py: reads from words.json (no more pandas) - run.py: 8-step pipeline (list scrape → frequency → examples → detail scrape → audio download → fonts → images → build) - benyehuda.py, frequency_lookup.py, image_fetch.py: TODO markers for future words.json integration Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
2e48109d7f
commit
08fb7009d8
20 changed files with 561420 additions and 10124 deletions
26
.claude/settings.json
Normal file
26
.claude/settings.json
Normal file
|
|
@ -0,0 +1,26 @@
|
||||||
|
{
|
||||||
|
"hooks": {
|
||||||
|
"PostToolUse": [
|
||||||
|
{
|
||||||
|
"matcher": "Edit|Write",
|
||||||
|
"hooks": [
|
||||||
|
{
|
||||||
|
"type": "command",
|
||||||
|
"command": "file=\"$CLAUDE_FILE_PATH\"; if [ -n \"$file\" ] && echo \"$file\" | grep -q '\\.py$'; then ruff format --quiet \"$file\" && ruff check --fix --quiet \"$file\" 2>/dev/null; fi"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
],
|
||||||
|
"PreToolUse": [
|
||||||
|
{
|
||||||
|
"matcher": "Edit|Write",
|
||||||
|
"hooks": [
|
||||||
|
{
|
||||||
|
"type": "command",
|
||||||
|
"command": "file=\"$CLAUDE_FILE_PATH\"; if echo \"$file\" | grep -qE '(legacy_guid_map\\.json|\\.env)$'; then echo 'BLOCKED: Protected file — legacy_guid_map.json and .env are read-only' >&2; exit 2; fi"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
@ -56,7 +56,7 @@ Fields on each card:
|
||||||
| Plural form | for nouns: רבים: שֻׁלְחָנוֹת |
|
| Plural form | for nouns: רבים: שֻׁלְחָנוֹת |
|
||||||
| Disambiguation hint | for ambiguous Eng→Heb cards |
|
| Disambiguation hint | for ambiguous Eng→Heb cards |
|
||||||
|
|
||||||
Cards are presented in **frequency order** — Anki will show you the most common words first.
|
Cards are presented in **frequency order** — Anki will show you the most common words first. Note that the frequency corpus is unvocalized (written without nikkud), so words that share the same letters but differ in nikkud are assigned the same frequency rank.
|
||||||
|
|
||||||
### Eng→Heb disambiguation
|
### Eng→Heb disambiguation
|
||||||
|
|
||||||
|
|
|
||||||
148
SCHEMA.yaml
Normal file
148
SCHEMA.yaml
Normal file
|
|
@ -0,0 +1,148 @@
|
||||||
|
# Hebrew Flash Cards — Unified Data Schema (words.json)
|
||||||
|
# Revised based on Nevo's feedback (2026-03-08)
|
||||||
|
#
|
||||||
|
# Top-level: dict keyed by unique_key
|
||||||
|
# Unique key: nikkud word for most entries (e.g. "אָב")
|
||||||
|
# For 146 homographs (same nikkud, different meaning): "word|pos" e.g. "אָח|Noun"
|
||||||
|
# For same nikkud AND same pos: "word|pos|meaning" e.g. "אָח|Noun|brother"
|
||||||
|
#
|
||||||
|
# Hebrew text fields use nikkud/ktiv_male subfields:
|
||||||
|
# field:
|
||||||
|
# nikkud: "אָב" # with nikkud (hebstyle=mo)
|
||||||
|
# ktiv_male: "אב" # plene spelling (hebstyle=vl)
|
||||||
|
# This pattern applies to: word, singular, plural, construct forms, conjugated forms, etc.
|
||||||
|
#
|
||||||
|
# Pronoun notation for conjugation forms uses grammatical codes:
|
||||||
|
# 1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp
|
||||||
|
# (not Hebrew pronoun strings, which are ambiguous for gender in some persons)
|
||||||
|
|
||||||
|
entry:
|
||||||
|
# --- Core Identity ---
|
||||||
|
word:
|
||||||
|
nikkud: "אָב"
|
||||||
|
ktiv_male: "אב"
|
||||||
|
slug: "6009-av" # Pealim URL slug (e.g. pealim.com/dict/6009-av/)
|
||||||
|
root: ["א", "ב"] # Shoresh as list of consonant chars
|
||||||
|
pos: "Noun" # Part of speech in English (as from pealim)
|
||||||
|
pos_hebrew: "שֵׁם עֶצֶם" # Part of speech in Hebrew (with nikkud)
|
||||||
|
meaning: "father" # English meaning (cleaned — no inline emoji, no Hebrew prepositions)
|
||||||
|
meaning_raw: "father 👨" # Original meaning as scraped (may contain emoji and/or Hebrew preps)
|
||||||
|
audio_url: "https://..." # Pealim audio URL
|
||||||
|
audio_file: "6009-av.mp3" # Local filename (slug-based for confusables, consonant-based otherwise)
|
||||||
|
tags: "" # Pealim tags if any
|
||||||
|
last_scrape_date: "2026-03-08" # ISO date of most recent pealim.com scrape for this entry
|
||||||
|
|
||||||
|
# --- Identity & Progress ---
|
||||||
|
vocab_legacy_guid: "abc123..." # Vocab note GUID from legacy_guid_map.json
|
||||||
|
# Other note GUIDs stored in their respective sections (cloze, plurals, conjugation)
|
||||||
|
|
||||||
|
# --- Frequency ---
|
||||||
|
frequency: 412 # Hebrew frequency rank from hermitdave/FrequencyWords he_50k (ktiv male based)
|
||||||
|
pseudo_frequency: null # Adjusted frequency for confusable homographs (deferred to future sprint)
|
||||||
|
|
||||||
|
# --- Display Enrichment ---
|
||||||
|
emoji: "👨"
|
||||||
|
emoji_source: "ai_vetted" # One of: ai_vetted, from_pealim, null
|
||||||
|
emoji_visible: false # Whether to show on cards (false until emoji vetting is done)
|
||||||
|
image: "father.jpg" # Wikipedia/Commons image filename, or null
|
||||||
|
image_source: "wikipedia" # One of: wikipedia, commons, null
|
||||||
|
hint: "" # Eng→Heb disambiguation hint (from refined_meanings.json)
|
||||||
|
|
||||||
|
# --- Shared Roots ---
|
||||||
|
shared_roots: [] # List of unique_keys of other words sharing the same root
|
||||||
|
# Computed by iterating all entries and grouping by root
|
||||||
|
|
||||||
|
# --- Confusables ---
|
||||||
|
confusable_group: null # List of unique_keys sharing same ktiv_male, or null
|
||||||
|
# e.g. ["אָח|Noun|brother", "אָח|Noun|fireplace"]
|
||||||
|
|
||||||
|
# --- Example Sentences ---
|
||||||
|
examples:
|
||||||
|
vetted: # AI-vetted sentences from Ben Yehuda / EPUB corpus
|
||||||
|
- text: "הָאָב הָלַךְ לַעֲבוֹדָה"
|
||||||
|
source: "ben_yehuda" # One of: ben_yehuda, epub_little_prince, epub_alice, ...
|
||||||
|
vetted: true
|
||||||
|
cloze: # Best sentence for cloze card, or null
|
||||||
|
text: "הָאָב הָלַךְ לַעֲבוֹדָה"
|
||||||
|
cloze_word_start: 0 # Character offset of the clozed word in text
|
||||||
|
cloze_word_end: 4 # End offset — enables exact extraction regardless of nikkud changes
|
||||||
|
cloze_hint: "family member"
|
||||||
|
cloze_guid: "def456..." # GUID for the cloze note
|
||||||
|
rejected_count: 0
|
||||||
|
|
||||||
|
# --- Noun-specific: Inflection Forms ---
|
||||||
|
noun_inflection: null # null for non-nouns
|
||||||
|
# When populated:
|
||||||
|
# plurals_guid: "ghi789..." # GUID for plurals deck note
|
||||||
|
# singular: # null if noun is inherently plural (e.g. bicycle/אופניים)
|
||||||
|
# nikkud: "אָב"
|
||||||
|
# ktiv_male: "אב"
|
||||||
|
# plural:
|
||||||
|
# nikkud: "אָבוֹת"
|
||||||
|
# ktiv_male: "אבות"
|
||||||
|
# singular_audio: "6009-av.mp3"
|
||||||
|
# plural_audio: null # TODO: scrape from detail pages
|
||||||
|
# construct_singular:
|
||||||
|
# nikkud: "אֲבִי"
|
||||||
|
# ktiv_male: "אבי"
|
||||||
|
# construct_plural:
|
||||||
|
# nikkud: "אֲבוֹת"
|
||||||
|
# ktiv_male: "אבות"
|
||||||
|
# pronominal_suffixes: # Scraped from pealim "forms with pronominal affixes" section
|
||||||
|
# 1s:
|
||||||
|
# nikkud: "אָבִי"
|
||||||
|
# ktiv_male: "אבי"
|
||||||
|
# 1p:
|
||||||
|
# nikkud: "אָבִינוּ"
|
||||||
|
# ktiv_male: "אבינו"
|
||||||
|
# 2ms: ...
|
||||||
|
# 2fs: ...
|
||||||
|
# 2mp: ...
|
||||||
|
# 2fp: ...
|
||||||
|
# 3ms: ...
|
||||||
|
# 3fs: ...
|
||||||
|
# 3mp: ...
|
||||||
|
# 3fp: ...
|
||||||
|
# gender: "masculine"
|
||||||
|
# gender_hebrew:
|
||||||
|
# nikkud: "זָכָר"
|
||||||
|
# ktiv_male: "זכר"
|
||||||
|
# mishkal: "CaCaC" # English mishkal name (scraped from pealim PoS section)
|
||||||
|
# mishkal_hebrew: "קָטָל" # Hebrew mishkal name (computed via mapping)
|
||||||
|
|
||||||
|
# --- Verb-specific: Conjugation Data ---
|
||||||
|
conjugation: null # null for non-verbs
|
||||||
|
# When populated:
|
||||||
|
# in_conjugation_deck: true # Whether this verb is in the 71-verb conjugation deck
|
||||||
|
# infinitive:
|
||||||
|
# nikkud: "לִשְׁמֹר"
|
||||||
|
# ktiv_male: "לשמור"
|
||||||
|
# reference_form: # 3ms past (the citation form)
|
||||||
|
# nikkud: "שָׁמַר"
|
||||||
|
# ktiv_male: "שמר"
|
||||||
|
# binyan: "Pa'al" # English binyan name
|
||||||
|
# binyan_hebrew: "פָּעַל" # Hebrew binyan name (with nikkud)
|
||||||
|
# prep: "על" # Hebrew preposition the verb takes, or null
|
||||||
|
# active_forms:
|
||||||
|
# - person: "1s" # Grammatical code: 1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp
|
||||||
|
# tense: "עָבָר"
|
||||||
|
# form:
|
||||||
|
# nikkud: "שָׁמַרְתִּי"
|
||||||
|
# ktiv_male: "שמרתי"
|
||||||
|
# audio_url: "https://..."
|
||||||
|
# audio_file: null # For future use
|
||||||
|
# hufal_pual_forms: null # Same structure as active_forms; non-null only for hif'il/pi'el verbs
|
||||||
|
# # When non-null, binyan MUST be Hif'il or Pi'el (validated)
|
||||||
|
# reference_form_passive: # 3ms past of the huf'al/pu'al counterpart, or null
|
||||||
|
# nikkud: "שֻׁמַּר"
|
||||||
|
# ktiv_male: "שומר"
|
||||||
|
|
||||||
|
# --- Adjective-specific ---
|
||||||
|
adjective_inflection: null # Reserved for future use
|
||||||
|
# When populated:
|
||||||
|
# ms/fs/mp/fp forms with nikkud/ktiv_male subfields
|
||||||
|
|
||||||
|
# --- Preposition-specific ---
|
||||||
|
preposition_inflection: null # Reserved for future use
|
||||||
|
# When populated:
|
||||||
|
# Inflected forms with pronominal suffixes (e.g. שלי, שלך, שלו...)
|
||||||
1084
apkg_builder.py
1084
apkg_builder.py
File diff suppressed because it is too large
Load diff
|
|
@ -2,6 +2,10 @@
|
||||||
"""
|
"""
|
||||||
Ben Yehuda corpus example-sentence lookup (nikkud corpus).
|
Ben Yehuda corpus example-sentence lookup (nikkud corpus).
|
||||||
|
|
||||||
|
TODO: Rewrite to update words.json examples fields directly instead of
|
||||||
|
writing to a separate examples_cache.json. Currently the migration script
|
||||||
|
bridges the gap. See Phase 5 in SPRINT_LOG.md.
|
||||||
|
|
||||||
Downloads the nikkud-bearing plaintext ZIP once, indexes sentences by nikkud word form,
|
Downloads the nikkud-bearing plaintext ZIP once, indexes sentences by nikkud word form,
|
||||||
then answers queries locally.
|
then answers queries locally.
|
||||||
|
|
||||||
|
|
|
||||||
File diff suppressed because it is too large
Load diff
546420
data/words.json
Normal file
546420
data/words.json
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -3,6 +3,10 @@
|
||||||
Hebrew word frequency lookup from hermitdave/FrequencyWords corpus.
|
Hebrew word frequency lookup from hermitdave/FrequencyWords corpus.
|
||||||
Downloads he_50k.txt once; subsequent runs read from cache.
|
Downloads he_50k.txt once; subsequent runs read from cache.
|
||||||
Exposed API: get_frequency_rank(word_no_nikkud) -> int | None
|
Exposed API: get_frequency_rank(word_no_nikkud) -> int | None
|
||||||
|
|
||||||
|
TODO: Rewrite to update words.json frequency field directly instead of
|
||||||
|
writing to a separate frequency_cache.json. Currently the migration script
|
||||||
|
bridges the gap. See Phase 5 in SPRINT_LOG.md.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import json
|
import json
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,10 @@
|
||||||
"""
|
"""
|
||||||
Fetch images for concrete Hebrew nouns from Wikipedia / Wikimedia Commons.
|
Fetch images for concrete Hebrew nouns from Wikipedia / Wikimedia Commons.
|
||||||
|
|
||||||
|
TODO: Rewrite to update words.json image/image_source fields directly instead of
|
||||||
|
writing to a separate image_cache.json. Currently the migration script bridges
|
||||||
|
the gap. See Phase 5 in SPRINT_LOG.md.
|
||||||
|
|
||||||
Scope: Noun PoS entries only. Concreteness heuristic:
|
Scope: Noun PoS entries only. Concreteness heuristic:
|
||||||
- English meaning has no abstract suffixes (-tion, -ity, -ness, -ment, -ance, -ism, -hood,
|
- English meaning has no abstract suffixes (-tion, -ity, -ness, -ment, -ance, -ism, -hood,
|
||||||
-ship, -ure, -al, -ing when not a gerund, -ence)
|
-ship, -ure, -al, -ing when not a gerund, -ence)
|
||||||
|
|
@ -59,7 +63,6 @@ session.headers.update(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def is_concrete(english_meaning: str) -> bool:
|
def is_concrete(english_meaning: str) -> bool:
|
||||||
"""Return True if the English meaning looks like a concrete noun."""
|
"""Return True if the English meaning looks like a concrete noun."""
|
||||||
meaning = english_meaning.strip().lower()
|
meaning = english_meaning.strip().lower()
|
||||||
|
|
|
||||||
346
pealim_audio_download.py
Normal file
346
pealim_audio_download.py
Normal file
|
|
@ -0,0 +1,346 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Download audio files from URLs stored in words.json.
|
||||||
|
|
||||||
|
Three audio categories are handled:
|
||||||
|
1. Vocab audio → data/audio/{audio_file}
|
||||||
|
2. Noun plural → data/audio/{slug}_plural.mp3
|
||||||
|
3. Conjugation → data/audio_conj/{slug}_{form_key}.mp3
|
||||||
|
data/audio_conj/{slug}_passive_{form_key}.mp3
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
from helpers import strip_nikkud
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
DATA_DIR = Path(__file__).parent / "data"
|
||||||
|
AUDIO_DIR = DATA_DIR / "audio"
|
||||||
|
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
|
||||||
|
WORDS_JSON = DATA_DIR / "words.json"
|
||||||
|
|
||||||
|
DOWNLOAD_DELAY = 0.3
|
||||||
|
MAX_RETRIES = 3
|
||||||
|
|
||||||
|
# Map Hebrew tense names to English prefixes for form_key construction.
|
||||||
|
# "מְקוֹר" (infinitive) is included for forward compatibility; it does not
|
||||||
|
# appear in the current dataset but the form_key collapses to bare "infinitive".
|
||||||
|
TENSE_TO_PREFIX = {
|
||||||
|
"הוֹוֶה": "present",
|
||||||
|
"עָבָר": "past",
|
||||||
|
"עָתִיד": "future",
|
||||||
|
"צִוּוּי": "imperative",
|
||||||
|
"מְקוֹר": "infinitive",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _make_audio_file(entry: dict) -> str:
|
||||||
|
"""Derive the vocab audio filename when audio_file is absent.
|
||||||
|
|
||||||
|
Slug-based for confusable entries (slug contains the disambiguating ID),
|
||||||
|
consonant-only for all others.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
entry: A words.json entry dict.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
Filename string, e.g. ``"1234-shalom.mp3"`` or ``"שלום.mp3"``.
|
||||||
|
"""
|
||||||
|
slug: str = entry["slug"]
|
||||||
|
if entry.get("confusable_group"):
|
||||||
|
return f"{slug}.mp3"
|
||||||
|
word: str = entry.get("word", "")
|
||||||
|
safe_name = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word))
|
||||||
|
return f"{safe_name}.mp3"
|
||||||
|
|
||||||
|
|
||||||
|
def _form_key(person: str, tense: str) -> str:
    """Compose the form-key fragment used in conjugation audio filenames.

    The tense is translated through ``TENSE_TO_PREFIX``; a tense that is not in
    that table is used verbatim.

    Args:
        person: Person code of the conjugated form, e.g. ``"1s"`` or ``"3fp"``.
        tense: Tense string taken from the conjugation form.

    Returns:
        ``"<prefix>_<person>"``, except that a prefix of ``"infinitive"``
        yields the bare string ``"infinitive"`` with no person suffix.
    """
    tense_label = TENSE_TO_PREFIX[tense] if tense in TENSE_TO_PREFIX else tense
    return "infinitive" if tense_label == "infinitive" else f"{tense_label}_{person}"
|
||||||
|
|
||||||
|
|
||||||
|
def _download(url: str, dest: Path, session: requests.Session) -> bool:
    """Download *url* to *dest*, retrying up to ``MAX_RETRIES`` times.

    Skips the download silently if *dest* already exists. The payload is
    written to a sibling ``<name>.part`` file and then moved into place, so a
    write that is interrupted never leaves a truncated file at *dest*.

    Args:
        url: HTTP(S) URL to download.
        dest: Local path to write the file to.
        session: Shared requests session.

    Returns:
        ``True`` if the file was downloaded (or already existed),
        ``False`` if the request failed permanently or all retries were
        exhausted.
    """
    if dest.exists():
        return True

    # dest.exists() is what marks a file as already downloaded, so dest must
    # only ever appear once its content is complete.
    partial = dest.with_name(f"{dest.name}.part")

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = session.get(url, timeout=15)
            resp.raise_for_status()
            partial.write_bytes(resp.content)
            partial.replace(dest)
        except requests.RequestException as exc:
            response = exc.response
            status = response.status_code if response is not None else None
            # Client errors other than timeout (408) and rate limiting (429)
            # will not change on retry, so skip the backoff sleeps.
            if status is not None and 400 <= status < 500 and status not in (408, 429):
                logger.error("Giving up on %s: HTTP %d is not retryable", url, status)
                return False

            if attempt < MAX_RETRIES:
                wait = 2**attempt
                logger.warning(
                    "Attempt %d/%d failed for %s (%s) — retrying in %ds",
                    attempt,
                    MAX_RETRIES,
                    url,
                    exc,
                    wait,
                )
                time.sleep(wait)
            else:
                logger.error("All %d attempts failed for %s: %s", MAX_RETRIES, url, exc)
        else:
            logger.debug("Downloaded %s → %s", url, dest.name)
            return True

    return False
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Per-category downloaders
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def download_vocab_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int, int]:
    """Fetch the vocabulary audio file of every entry into ``AUDIO_DIR``.

    The target filename is the entry's ``audio_file`` when set, otherwise the
    name derived by ``_make_audio_file``. Files already on disk are not
    requested again.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached, no_url) counts. The last counter covers
        both entries without an ``audio_url`` and downloads that failed.
    """
    fetched = already_present = unavailable = 0

    for record in entries:
        source_url: str | None = record.get("audio_url")
        if source_url:
            filename: str = record.get("audio_file") or _make_audio_file(record)
            target = AUDIO_DIR / filename

            if target.exists():
                already_present += 1
            elif _download(source_url, target, session):
                fetched += 1
                # Pause only after a real network fetch.
                time.sleep(DOWNLOAD_DELAY)
            else:
                # Persistent failures are tallied together with missing URLs.
                unavailable += 1
        else:
            unavailable += 1

    return fetched, already_present, unavailable
|
||||||
|
|
||||||
|
|
||||||
|
def download_noun_plural_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int]:
    """Fetch plural-form audio for nouns into ``AUDIO_DIR``.

    Only entries whose ``noun_inflection`` is a non-empty dict carrying a
    ``plural_audio`` value that starts with ``"http"`` are considered. Each
    file is stored as ``{slug}_plural.mp3``.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached) counts. Failed downloads are not counted.
    """
    fetched = already_present = 0

    for record in entries:
        inflection = record.get("noun_inflection")
        if not (inflection and isinstance(inflection, dict)):
            continue

        plural_url: str | None = inflection.get("plural_audio")
        if not (plural_url and plural_url.startswith("http")):
            continue

        target = AUDIO_DIR / f"{record['slug']}_plural.mp3"

        if target.exists():
            already_present += 1
        elif _download(plural_url, target, session):
            fetched += 1
            time.sleep(DOWNLOAD_DELAY)

    return fetched, already_present
|
||||||
|
|
||||||
|
|
||||||
|
def download_conjugation_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int, int]:
    """Fetch audio for conjugated verb forms into ``AUDIO_CONJ_DIR``.

    Forms listed under ``active_forms`` are saved as
    ``{slug}_{form_key}.mp3``; forms listed under ``hufal_pual_forms`` are
    saved as ``{slug}_passive_{form_key}.mp3``. Forms without an
    ``audio_url`` are ignored.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached, failed) counts.
    """
    fetched = already_present = failures = 0

    for record in entries:
        conjugation = record.get("conjugation")
        if not conjugation:
            continue

        slug: str = record["slug"]

        # Pair every form with the filename infix of the list it came from.
        active = conjugation.get("active_forms") or []
        passive = conjugation.get("hufal_pual_forms") or []
        tagged_forms = [("", form) for form in active] + [("passive_", form) for form in passive]

        for infix, form in tagged_forms:
            form_url: str | None = form.get("audio_url")
            if not form_url:
                continue

            form_id = _form_key(form.get("person", ""), form.get("tense", ""))
            target = AUDIO_CONJ_DIR / f"{slug}_{infix}{form_id}.mp3"

            if target.exists():
                already_present += 1
            elif _download(form_url, target, session):
                fetched += 1
                time.sleep(DOWNLOAD_DELAY)
            else:
                failures += 1

    return fetched, already_present, failures
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Entry point
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Run the audio download pipeline from the command line.

    Loads ``WORDS_JSON``, optionally truncates it to the first N entries
    (``--test N``), then downloads vocab, noun-plural and conjugation audio
    and logs one summary line per category. ``--skip-vocab`` and
    ``--skip-conj`` disable their categories; the noun-plural pass always
    runs.
    """
    cli = argparse.ArgumentParser(description="Download Pealim audio files from words.json URLs.")
    cli.add_argument("--skip-vocab", action="store_true", help="Skip vocabulary audio downloads.")
    cli.add_argument("--skip-conj", action="store_true", help="Skip conjugation audio downloads.")
    cli.add_argument(
        "--test",
        metavar="N",
        type=int,
        default=None,
        help="Limit processing to the first N words.json entries.",
    )
    args = cli.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(message)s")

    for directory in (AUDIO_DIR, AUDIO_CONJ_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    with open(WORDS_JSON, encoding="utf-8") as fh:
        raw: dict[str, dict] = json.load(fh)

    entries = list(raw.values())
    if args.test is not None:
        entries = entries[: args.test]

    logger.info("[4] Downloading audio files …")

    session = requests.Session()
    session.headers["User-Agent"] = "Mozilla/5.0 (compatible; PealimAnkiDeck/1.0; audio-fetch)"

    want_vocab = not args.skip_vocab
    want_conj = not args.skip_conj

    # Download passes run in a fixed order: vocab, noun plural, conjugation.
    vocab_counts = download_vocab_audio(entries, session) if want_vocab else (0, 0, 0)
    plural_counts = download_noun_plural_audio(entries, session)
    conj_counts = download_conjugation_audio(entries, session) if want_conj else (0, 0, 0)

    # Summary: a skipped category produces no line at all.
    if want_vocab:
        logger.info(" Vocab: %d downloaded, %d cached, %d no URL", *vocab_counts)
    logger.info(" Noun plural: %d downloaded, %d cached", *plural_counts)
    if want_conj:
        conj_fetched, conj_cached, conj_failed = conj_counts
        failure_note = f", {conj_failed} failed" if conj_failed else ""
        logger.info(
            " Conjugation: %d downloaded, %d cached%s",
            conj_fetched,
            conj_cached,
            failure_note,
        )


if __name__ == "__main__":
    main()
|
||||||
1130
pealim_detail_scrape.py
Normal file
1130
pealim_detail_scrape.py
Normal file
File diff suppressed because it is too large
Load diff
706
pealim_list_scrape.py
Normal file
706
pealim_list_scrape.py
Normal file
|
|
@ -0,0 +1,706 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Consolidated list page scraper for pealim.com.
|
||||||
|
|
||||||
|
Scrapes /dict/?page=N with two cookie variants (hebstyle=mo for nikkud,
|
||||||
|
hebstyle=vl for ktiv male) and writes results directly to data/words.json.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 pealim_list_scrape.py [--test N] [--force-refresh]
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import time
|
||||||
|
from datetime import date
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
from helpers import strip_nikkud
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Paths
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
PROJECT_ROOT = Path(__file__).parent
|
||||||
|
DATA_DIR = PROJECT_ROOT / "data"
|
||||||
|
WORDS_JSON = DATA_DIR / "words.json"
|
||||||
|
PROGRESS_JSON = DATA_DIR / "list_scrape_progress.json"
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Constants
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
PEALIM_DICT_URL = "https://www.pealim.com/dict/"
|
||||||
|
REQUEST_DELAY = 1.5 # seconds between requests
|
||||||
|
REQUEST_TIMEOUT = 15 # seconds
|
||||||
|
DEFAULT_TOTAL_PAGES = 608
|
||||||
|
SAVE_EVERY = 10 # pages between incremental saves
|
||||||
|
TODAY = date.today().isoformat()
|
||||||
|
|
||||||
|
# Prefer lxml if available; html.parser is the fallback
|
||||||
|
try:
|
||||||
|
import lxml # type: ignore[import-untyped] # noqa: F401
|
||||||
|
|
||||||
|
BS4_PARSER = "lxml"
|
||||||
|
except ImportError:
|
||||||
|
BS4_PARSER = "html.parser"
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Part-of-speech mappings
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
POS_HEBREW: dict[str, str] = {
|
||||||
|
"Noun": "שֵׁם עֶצֶם",
|
||||||
|
"Verb": "פֹּעַל",
|
||||||
|
"Adjective": "שֵׁם תֹּאַר",
|
||||||
|
"Adverb": "תֹּאַר הַפֹּעַל",
|
||||||
|
"Pronoun": "כִּנּוּי גּוּף",
|
||||||
|
"Preposition": "מִילַּת יַחַס",
|
||||||
|
"Conjunction": "מִילַּת חִבּוּר",
|
||||||
|
"Interjection": "מִילַּת קְרִיאָה",
|
||||||
|
"Numeral": "שֵׁם מִסְפָּר",
|
||||||
|
"Cardinal numeral": "שֵׁם מִסְפָּר",
|
||||||
|
"Particle": "מִילִּית",
|
||||||
|
"Determiner": "מְגַדִּיר",
|
||||||
|
"Existential": "מִילַּת קִיּוּם",
|
||||||
|
"Interrogative": "מִילַּת שְׁאֵלָה",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Prefix-match against the raw PoS string, trying longer keys first so that a more specific key is never shadowed by a shorter one.
|
||||||
|
POS_HEBREW_ORDERED: list[tuple[str, str]] = sorted(POS_HEBREW.items(), key=lambda x: -len(x[0]))
|
||||||
|
|
||||||
|
BINYAN_HEBREW: dict[str, str] = {
|
||||||
|
"Pa'al": "פָּעַל",
|
||||||
|
"Nif'al": "נִפְעַל",
|
||||||
|
"Pi'el": "פִּיעֵל",
|
||||||
|
"Pu'al": "פֻּעַל",
|
||||||
|
"Hif'il": "הִפְעִיל",
|
||||||
|
"Huf'al": "הֻפְעַל",
|
||||||
|
"Hitpa'el": "הִתְפַּעֵל",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Regex matching one run of emoji characters.
# A base code point (U+1F000–U+1FFFF or U+2600–U+27BF) may be followed by the
# variation selector U+FE0F and/or the zero-width joiner U+200D. This keeps
# sequences such as U+2600 U+FE0F, or ZWJ-joined emoji, together as one match,
# so substitution leaves no invisible selector/joiner behind.
EMOJI_RE = re.compile(
    r"(?:[\U0001F000-\U0001FFFF\u2600-\u27BF][\uFE0F\u200D]*)+",
    re.UNICODE,
)
|
||||||
|
|
||||||
|
# Entry keys that an update of an existing entry must leave untouched.
PROTECTED_FIELDS = frozenset(
    {
        # note GUIDs
        "vocab_legacy_guid",
        "confusables_guid",
        # frequency data
        "frequency",
        "pseudo_frequency",
        # display enrichment
        "emoji",
        "emoji_source",
        "emoji_visible",
        "image",
        "image_source",
        "hint",
        # example sentences and per-PoS sections
        "examples",
        "noun_inflection",
        "conjugation",
        "adjective_inflection",
        "preposition_inflection",
    }
)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Logging
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
logging.basicConfig(
|
||||||
|
level=logging.INFO,
|
||||||
|
format="%(asctime)s %(levelname)s %(message)s",
|
||||||
|
)
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# HTTP session
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
session = requests.Session()
|
||||||
|
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki-scraper/1.0)"})
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Default entry template
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def _default_entry() -> dict:
|
||||||
|
"""Return a fresh entry with all fields initialised to safe defaults."""
|
||||||
|
return {
|
||||||
|
"word": {"nikkud": "", "ktiv_male": ""},
|
||||||
|
"slug": "",
|
||||||
|
"root": [],
|
||||||
|
"pos": "",
|
||||||
|
"pos_hebrew": "",
|
||||||
|
"meaning": "",
|
||||||
|
"meaning_raw": "",
|
||||||
|
"audio_url": "",
|
||||||
|
"audio_file": "",
|
||||||
|
"tags": "",
|
||||||
|
"last_scrape_date": "",
|
||||||
|
"vocab_legacy_guid": None,
|
||||||
|
"frequency": None,
|
||||||
|
"pseudo_frequency": None,
|
||||||
|
"emoji": None,
|
||||||
|
"emoji_source": None,
|
||||||
|
"emoji_visible": False,
|
||||||
|
"image": None,
|
||||||
|
"image_source": None,
|
||||||
|
"hint": "",
|
||||||
|
"shared_roots": [],
|
||||||
|
"confusable_group": None,
|
||||||
|
"confusables_guid": None,
|
||||||
|
"examples": None,
|
||||||
|
"noun_inflection": None,
|
||||||
|
"conjugation": None,
|
||||||
|
"adjective_inflection": None,
|
||||||
|
"preposition_inflection": None,
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Parsing helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def _extract_emoji(text: str) -> str | None:
    """Return the first substring of *text* matched by ``EMOJI_RE``, or None if there is no match."""
    match = EMOJI_RE.search(text)
    if match is None:
        return None
    return match.group(0)
|
||||||
|
|
||||||
|
|
||||||
|
def _clean_meaning(raw: str) -> str:
    """Remove everything ``EMOJI_RE`` matches from *raw* and collapse whitespace runs to single spaces."""
    without_emoji = EMOJI_RE.sub("", raw)
    # split() with no argument also drops leading/trailing whitespace.
    tokens = without_emoji.split()
    return " ".join(tokens)
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_pos(pos_raw: str) -> tuple[str, str]:
    """
    Split a raw part-of-speech label into ``(pos_en, pos_hebrew)``.

    The English base is the first key of ``POS_HEBREW_ORDERED`` that prefixes
    the label (the table is presumably ordered so longer names come first);
    if none matches, the text before the first " – " / " - " is used.
    The Hebrew label comes from ``POS_HEBREW`` and falls back to the English
    base. For verbs, the binyan after the dash is looked up in
    ``BINYAN_HEBREW`` (case-insensitively) and appended when found.

    Examples (given the project's lookup tables):
        "Noun – masculine"  → ("Noun", "שֵׁם עֶצֶם")
        "Verb – pa'al"      → ("Verb", "פֹּעַל — פָּעַל")
        "Cardinal numeral"  → ("Cardinal numeral", "שֵׁם מִסְפָּר")
    """
    label = pos_raw.strip()

    # First matching prefix wins; "" means no table key matched.
    base = next((key for key, _ in POS_HEBREW_ORDERED if label.startswith(key)), "")
    if not base:
        # Unknown PoS: keep whatever precedes the dash separator (or the whole label).
        base = label.split(" – ")[0].split(" - ")[0].strip()

    hebrew = POS_HEBREW.get(base, base)
    if base != "Verb":
        return base, hebrew

    # Verbs look like "Verb – pa'al": the second dash-separated piece is the binyan.
    pieces = re.split(r"\s*[–-]\s*", label)
    if len(pieces) < 2:
        return base, hebrew

    binyan_raw = pieces[1].strip()
    wanted = binyan_raw.lower()
    # Prefer the table's own spelling (e.g. "Nif'al"); otherwise try a capitalised guess.
    binyan_key = next(
        (known for known in BINYAN_HEBREW if known.lower() == wanted),
        binyan_raw.capitalize(),
    )
    binyan_heb = BINYAN_HEBREW.get(binyan_key)
    if binyan_heb:
        hebrew = f"{hebrew} — {binyan_heb}"
    return base, hebrew
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_root(root_raw: str) -> list[str]:
|
||||||
|
"""
|
||||||
|
Convert raw root text to a list of consonants.
|
||||||
|
|
||||||
|
Pealim shows roots as "פ - ע - ל" or "פ.ע.ל" or "—" (no root).
|
||||||
|
"""
|
||||||
|
if not root_raw or root_raw in ("-", "—", "–"):
|
||||||
|
return []
|
||||||
|
# Split on " - " or "." separators
|
||||||
|
parts = re.split(r"\s*[-–—.]\s*", root_raw.strip())
|
||||||
|
return [p.strip() for p in parts if p.strip()]
|
||||||
|
|
||||||
|
|
||||||
|
def _build_tags(pos_en: str, root: list[str]) -> str:
|
||||||
|
"""
|
||||||
|
Generate Anki tags string matching the existing project convention.
|
||||||
|
|
||||||
|
Examples:
|
||||||
|
pos=Noun, root=[] → "שם_עצם"
|
||||||
|
pos=Noun, root=["א","ב"] → "שורש::אב שם_עצם"
|
||||||
|
pos=Verb, root=["שמר"] → "שורש::שמר פעלים"
|
||||||
|
"""
|
||||||
|
pos_tag_map = {
|
||||||
|
"Noun": "שם_עצם",
|
||||||
|
"Verb": "פעלים",
|
||||||
|
"Adjective": "שם_תואר",
|
||||||
|
"Adverb": "תוארי_הפועל",
|
||||||
|
"Pronoun": "כינויי_גוף",
|
||||||
|
"Preposition": "מילות_יחס",
|
||||||
|
"Conjunction": "מילות_חיבור",
|
||||||
|
"Particle": "מילית",
|
||||||
|
"Numeral": "שם_מספר",
|
||||||
|
"Cardinal numeral": "שם_מספר",
|
||||||
|
"Determiner": "מגדיר",
|
||||||
|
"Existential": "מילת_קיום",
|
||||||
|
"Interrogative": "מילת_שאלה",
|
||||||
|
"Interjection": "מילת_קריאה",
|
||||||
|
}
|
||||||
|
|
||||||
|
parts: list[str] = []
|
||||||
|
if root:
|
||||||
|
root_str = "".join(strip_nikkud(c) for c in root)
|
||||||
|
parts.append(f"שורש::{root_str}")
|
||||||
|
|
||||||
|
pos_heb_tag = pos_tag_map.get(pos_en, "")
|
||||||
|
if pos_heb_tag:
|
||||||
|
parts.append(pos_heb_tag)
|
||||||
|
|
||||||
|
return " ".join(parts)
|
||||||
|
|
||||||
|
|
||||||
|
def _compute_audio_file(slug: str, ktiv_male: str) -> str:
|
||||||
|
"""
|
||||||
|
Return the local audio filename for an entry.
|
||||||
|
|
||||||
|
The actual confusable detection happens later (after all pages are scraped);
|
||||||
|
here we store a placeholder that post_process() will correct.
|
||||||
|
We default to the consonant-based name; confusables get slug-based names.
|
||||||
|
"""
|
||||||
|
consonants = strip_nikkud(ktiv_male) if ktiv_male else ""
|
||||||
|
return f"{consonants}.mp3" if consonants else f"{slug}.mp3"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Page parsing
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def _parse_mo_page(html: bytes) -> list[dict]:
    """
    Extract the word rows from a list page fetched with ``hebstyle=mo`` (nikkud).

    Every ``<tr>`` with at least four ``<td>`` cells is read as
    word / root / part of speech / meaning. Rows whose word text is empty are
    dropped.

    Returns one dict per kept row with keys:
        nikkud, slug, root_raw, pos_raw, meaning_raw, audio_url
    ``slug`` and ``audio_url`` are "" when the first cell has no matching
    link / ``data-audio`` element.
    """
    soup = BeautifulSoup(html, BS4_PARSER)
    parsed: list[dict] = []

    for table_row in soup.select("table tr"):
        cells = table_row.find_all("td")
        if len(cells) < 4:
            continue
        word_cell, root_cell, pos_cell, meaning_cell = cells[:4]

        # Audio URL lives in a data-audio attribute somewhere inside the word cell.
        audio_node = word_cell.find(attrs={"data-audio": True})
        audio_url: str = audio_node["data-audio"] if audio_node else ""

        # Slug is the path component after /dict/ in the word's link.
        slug = ""
        anchor = word_cell.find("a", href=True)
        if anchor:
            slug_match = re.search(r"/dict/([^/]+)/", anchor["href"])
            if slug_match:
                slug = slug_match.group(1)

        # Prefer the dedicated "menukad" span; otherwise use the whole cell text.
        menukad = word_cell.find("span", class_="menukad")
        nikkud = (menukad or word_cell).get_text(strip=True)
        if not nikkud:
            continue

        parsed.append(
            {
                "nikkud": nikkud,
                "slug": slug,
                "root_raw": root_cell.get_text(strip=True),
                "pos_raw": pos_cell.get_text(strip=True),
                "meaning_raw": meaning_cell.get_text(strip=True),
                "audio_url": audio_url,
            }
        )
    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
def _parse_vl_words(html: bytes) -> list[str]:
    """
    Extract the word column from a list page fetched with ``hebstyle=vl`` (ktiv male).

    Returns one string per ``<tr>`` that has at least four ``<td>`` cells, in
    document order. Unlike ``_parse_mo_page`` an empty word is still appended.
    """
    soup = BeautifulSoup(html, BS4_PARSER)
    spellings: list[str] = []
    for table_row in soup.select("table tr"):
        cells = table_row.find_all("td")
        if len(cells) >= 4:
            first_cell = cells[0]
            # Prefer the dedicated "menukad" span; otherwise use the whole cell text.
            span = first_cell.find("span", class_="menukad")
            spellings.append((span or first_cell).get_text(strip=True))
    return spellings
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# words.json I/O
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def _load_words() -> dict:
    """Read and return the parsed contents of ``WORDS_JSON``, or ``{}`` when the file does not exist."""
    if WORDS_JSON.exists():
        with WORDS_JSON.open(encoding="utf-8") as handle:
            return json.load(handle)

    logger.info("data/words.json not found — starting fresh.")
    return {}
|
||||||
|
|
||||||
|
|
||||||
|
def _save_words(words: dict) -> None:
    """
    Persist *words* to ``WORDS_JSON``.

    The JSON is written to a sibling ``.json.tmp`` file first and then moved
    over the real file with ``os.replace``, so an interrupted write never
    leaves a half-written words.json behind.
    """
    staging_path = WORDS_JSON.with_suffix(".json.tmp")
    with staging_path.open("w", encoding="utf-8") as handle:
        # ensure_ascii=False keeps Hebrew text readable instead of \u escapes.
        json.dump(words, handle, ensure_ascii=False, indent=2)
    os.replace(staging_path, WORDS_JSON)

    entry_count = len(words)
    logger.info("Saved data/words.json (%d entries)", entry_count)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Progress tracking
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def _load_progress() -> set[int]:
    """Return the page numbers recorded under ``completed_pages`` in ``PROGRESS_JSON`` (empty set if absent)."""
    if not PROGRESS_JSON.exists():
        return set()

    payload = json.loads(PROGRESS_JSON.read_text(encoding="utf-8"))
    return set(payload.get("completed_pages", []))
|
||||||
|
|
||||||
|
|
||||||
|
def _save_progress(completed: set[int]) -> None:
    """
    Persist the set of finished page numbers to ``PROGRESS_JSON``.

    Pages are stored sorted under the ``completed_pages`` key. The data goes
    to a sibling ``.json.tmp`` file first and is then swapped in with
    ``os.replace``.
    """
    payload = {"completed_pages": sorted(completed)}
    staging_path = PROGRESS_JSON.with_suffix(".json.tmp")
    with staging_path.open("w", encoding="utf-8") as handle:
        handle.write(json.dumps(payload))
    os.replace(staging_path, PROGRESS_JSON)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Unique key generation
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def _make_unique_key(nikkud: str, pos_en: str, meaning: str, existing_keys: set[str]) -> str:
|
||||||
|
"""
|
||||||
|
Generate a collision-free unique key for a new entry.
|
||||||
|
|
||||||
|
Escalation:
|
||||||
|
1. nikkud
|
||||||
|
2. nikkud|pos_en
|
||||||
|
3. nikkud|pos_en|meaning
|
||||||
|
4. nikkud|pos_en|meaning|N (N = 2, 3, …)
|
||||||
|
"""
|
||||||
|
candidate = nikkud
|
||||||
|
if candidate not in existing_keys:
|
||||||
|
return candidate
|
||||||
|
|
||||||
|
candidate = f"{nikkud}|{pos_en}"
|
||||||
|
if candidate not in existing_keys:
|
||||||
|
return candidate
|
||||||
|
|
||||||
|
candidate = f"{nikkud}|{pos_en}|{meaning}"
|
||||||
|
if candidate not in existing_keys:
|
||||||
|
return candidate
|
||||||
|
|
||||||
|
n = 2
|
||||||
|
while True:
|
||||||
|
candidate = f"{nikkud}|{pos_en}|{meaning}|{n}"
|
||||||
|
if candidate not in existing_keys:
|
||||||
|
return candidate
|
||||||
|
n += 1
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Core: merge one scraped row into words dict
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def _merge_row(
    words: dict,
    slug_index: dict[str, str],
    nikkud: str,
    ktiv_male: str,
    slug: str,
    root_raw: str,
    pos_raw: str,
    meaning_raw_raw: str,
    audio_url: str,
) -> None:
    """
    Insert or refresh one scraped list-page row in *words* (mutated in place).

    An existing entry is located through *slug_index* (slug → unique key).
    When found, only the list-level fields are overwritten; all other fields
    of the entry, including ``emoji``, are left alone. Otherwise a new entry
    is built from ``_default_entry()``, stored under a fresh unique key, and
    — if the row has a slug — registered in *slug_index*.
    """
    pos_en, pos_heb = _parse_pos(pos_raw)
    root = _parse_root(root_raw)
    meaning = _clean_meaning(meaning_raw_raw)

    # List-level fields, rewritten on every scrape for both new and existing entries.
    scraped_fields = {
        "slug": slug,
        "root": root,
        "pos": pos_en,
        "pos_hebrew": pos_heb,
        "meaning": meaning,
        "meaning_raw": meaning_raw_raw,
        "audio_url": audio_url,
        "audio_file": _compute_audio_file(slug, ktiv_male),
        "tags": _build_tags(pos_en, root),
        "last_scrape_date": TODAY,
    }

    # ---- existing entry: refresh and stop ----
    known_key: str | None = slug_index.get(slug) if slug else None
    if known_key and known_key in words:
        current = words[known_key]
        current["word"]["nikkud"] = nikkud
        current["word"]["ktiv_male"] = ktiv_male
        current.update(scraped_fields)
        return

    # ---- new entry ----
    emoji = _extract_emoji(meaning_raw_raw)
    fresh = _default_entry()
    fresh["word"]["nikkud"] = nikkud
    fresh["word"]["ktiv_male"] = ktiv_male
    fresh.update(scraped_fields)
    fresh["emoji"] = emoji
    fresh["emoji_source"] = "from_pealim" if emoji else None

    new_key = _make_unique_key(nikkud, pos_en, meaning, set(words.keys()))
    words[new_key] = fresh
    if slug:
        slug_index[slug] = new_key
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Post-processing: recompute confusable_group, shared_roots, audio_file
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def _post_process(words: dict) -> None:
    """
    Recompute the fields that depend on the whole word list (mutates *words*).

    - confusable_group: sorted keys of all entries with the same ktiv_male,
      set when two or more share it. Otherwise it is reset to None unless the
      entry carries a ``confusables_guid``.
    - audio_file: ``<slug>.mp3`` for confusables that have a slug; for
      non-confusables the stripped ktiv_male (or the slug when that is empty).
    - shared_roots: sorted keys of the other entries with an identical root.
    """
    logger.info("Post-processing: recomputing confusable groups and shared roots...")

    # --- confusable groups ---
    keys_by_ktiv: dict[str, list[str]] = {}
    for key, entry in words.items():
        spelling = entry.get("word", {}).get("ktiv_male", "")
        if spelling:
            keys_by_ktiv.setdefault(spelling, []).append(key)

    for entry in words.values():
        spelling = entry.get("word", {}).get("ktiv_male", "")
        slug = entry.get("slug", "")
        members = keys_by_ktiv.get(spelling, [])

        if len(members) >= 2:
            entry["confusable_group"] = sorted(members)
            # Same spelling would collide on a spelling-based filename, so use the slug.
            if slug:
                entry["audio_file"] = f"{slug}.mp3"
            continue

        # A confusables_guid marks a group set elsewhere (by enrichment); keep it.
        if not entry.get("confusables_guid"):
            entry["confusable_group"] = None
        stem = strip_nikkud(spelling) if spelling else ""
        entry["audio_file"] = f"{stem}.mp3" if stem else f"{slug}.mp3"

    # --- shared roots ---
    keys_by_root: dict[str, list[str]] = {}
    for key, entry in words.items():
        root = entry.get("root")
        if root:
            # "|"-joined letters serve as the grouping key for a root.
            keys_by_root.setdefault("|".join(root), []).append(key)

    for key, entry in words.items():
        root = entry.get("root")
        siblings = keys_by_root.get("|".join(root), []) if root else []
        entry["shared_roots"] = sorted(other for other in siblings if other != key)

    logger.info("Post-processing complete.")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Scraping loop
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def _build_slug_index(words: dict) -> dict[str, str]:
|
||||||
|
"""Build slug → unique_key lookup from the current words dict."""
|
||||||
|
index: dict[str, str] = {}
|
||||||
|
for key, entry in words.items():
|
||||||
|
slug = entry.get("slug", "")
|
||||||
|
if slug and slug not in index:
|
||||||
|
index[slug] = key
|
||||||
|
return index
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_page(url: str, cookies: dict) -> bytes | None:
    """
    GET *url* through the shared session with the given *cookies*.

    Returns the response body as bytes. Any ``requests.RequestException``
    (including the HTTP error raised by ``raise_for_status``) is logged and
    turned into a ``None`` return instead of propagating.
    """
    try:
        response = session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        body = response.content
    except requests.RequestException as exc:
        logger.error("Request failed for %s: %s", url, exc)
        return None
    return body
|
||||||
|
|
||||||
|
|
||||||
|
def run_scrape(total_pages: int | None = None, force_refresh: bool = False) -> None:
    """
    Scrape the pealim.com list pages and merge every row into words.json.

    Each page is fetched twice (``hebstyle=mo`` for nikkud/slug/audio and
    ``hebstyle=vl`` for ktiv male); rows are paired by position and upserted
    via ``_merge_row``. Words and progress are saved every ``SAVE_EVERY``
    pages, and once more after ``_post_process`` at the end.

    Args:
        total_pages:   Number of list pages to scrape. ``None`` means
                       ``DEFAULT_TOTAL_PAGES`` (run.py passes None when no
                       page limit was requested).
        force_refresh: If True, ignore the progress file and re-scrape all pages.
    """
    if total_pages is None:
        # Callers without an explicit limit pass None; range() below needs an int.
        total_pages = DEFAULT_TOTAL_PAGES

    words = _load_words()
    slug_index = _build_slug_index(words)

    if force_refresh:
        # The progress file is deliberately not read, so a stale or damaged one
        # cannot get in the way of a full re-scrape.
        completed: set[int] = set()
        logger.info("--force-refresh: ignoring saved progress; re-scraping every page.")
    else:
        completed = _load_progress()

    pages_to_do = [p for p in range(1, total_pages + 1) if p not in completed]
    logger.info(
        "Pages to scrape: %d / %d (already done: %d)",
        len(pages_to_do),
        total_pages,
        len(completed),
    )

    pages_since_save = 0
    failed_pages: list[int] = []

    for page_num in pages_to_do:
        url = f"{PEALIM_DICT_URL}?page={page_num}"
        logger.info("Scraping page %d / %d …", page_num, total_pages)

        # --- hebstyle=mo (nikkud + audio + slug) ---
        mo_html = _fetch_page(url, {"translit": "none", "hebstyle": "mo"})
        if mo_html is None:
            logger.warning("Skipping page %d (mo fetch failed).", page_num)
            failed_pages.append(page_num)
            time.sleep(REQUEST_DELAY * 2)  # back off a little longer after a failure
            continue

        time.sleep(REQUEST_DELAY)

        # --- hebstyle=vl (ktiv male) ---
        vl_html = _fetch_page(url, {"translit": "none", "hebstyle": "vl"})
        if vl_html is None:
            logger.warning("Skipping page %d (vl fetch failed).", page_num)
            failed_pages.append(page_num)
            time.sleep(REQUEST_DELAY * 2)
            continue

        # Parse
        mo_rows = _parse_mo_page(mo_html)
        vl_words = _parse_vl_words(vl_html)

        if not mo_rows:
            logger.warning("Page %d returned no rows — might be past end.", page_num)
            completed.add(page_num)
            _save_progress(completed)
            time.sleep(REQUEST_DELAY)
            continue

        if len(vl_words) != len(mo_rows):
            # Rows are paired purely by index, so a count mismatch means some
            # ktiv_male values may end up on the wrong word (or be left empty).
            logger.warning(
                "Page %d: %d nikkud rows vs %d ktiv-male rows; positional pairing may be off.",
                page_num,
                len(mo_rows),
                len(vl_words),
            )

        # Merge each row
        for i, row in enumerate(mo_rows):
            ktiv_male = vl_words[i] if i < len(vl_words) else ""
            _merge_row(
                words=words,
                slug_index=slug_index,
                nikkud=row["nikkud"],
                ktiv_male=ktiv_male,
                slug=row["slug"],
                root_raw=row["root_raw"],
                pos_raw=row["pos_raw"],
                meaning_raw_raw=row["meaning_raw"],
                audio_url=row["audio_url"],
            )

        completed.add(page_num)
        pages_since_save += 1

        # Incremental save every SAVE_EVERY pages
        if pages_since_save >= SAVE_EVERY:
            _save_words(words)
            _save_progress(completed)
            pages_since_save = 0

        time.sleep(REQUEST_DELAY)

    # Final save + post-processing
    if failed_pages:
        # Failed pages were never added to `completed`, so a later run without
        # --force-refresh picks them up again.
        logger.warning(
            "Could not fetch %d page(s); they remain pending for the next run: %s",
            len(failed_pages),
            failed_pages,
        )
    logger.info("Scrape loop finished. Running post-processing…")
    _post_process(words)
    _save_words(words)
    _save_progress(completed)
    logger.info("Done. Total entries in words.json: %d", len(words))
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# CLI
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def main() -> None:
    """Parse the command-line flags (``--test N``, ``--force-refresh``) and run the scrape."""
    cli = argparse.ArgumentParser(
        description="Scrape pealim.com list pages into data/words.json."
    )
    cli.add_argument(
        "--test",
        metavar="N",
        type=int,
        default=None,
        help="Scrape only the first N pages (for testing).",
    )
    cli.add_argument(
        "--force-refresh",
        action="store_true",
        default=False,
        help="Re-scrape all pages, ignoring existing progress.",
    )
    options = cli.parse_args()

    # Without --test the full default page count is scraped.
    if options.test is None:
        page_count = DEFAULT_TOTAL_PAGES
    else:
        page_count = options.test

    logger.info(
        "Starting pealim list scraper | pages=%d | force=%s | parser=%s",
        page_count,
        options.force_refresh,
        BS4_PARSER,
    )
    run_scrape(total_pages=page_count, force_refresh=options.force_refresh)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()
|
||||||
|
|
@ -25,6 +25,9 @@ dev = [
|
||||||
|
|
||||||
[tool.pytest.ini_options]
|
[tool.pytest.ini_options]
|
||||||
testpaths = ["tests"]
|
testpaths = ["tests"]
|
||||||
|
markers = [
|
||||||
|
"integration: marks tests that hit the real pealim.com network (deselect with -m 'not integration')",
|
||||||
|
]
|
||||||
|
|
||||||
[tool.ruff]
|
[tool.ruff]
|
||||||
target-version = "py311"
|
target-version = "py311"
|
||||||
|
|
|
||||||
505
run.py
505
run.py
|
|
@ -7,10 +7,10 @@ Usage:
|
||||||
|
|
||||||
Options:
|
Options:
|
||||||
--only {vocab,conjugations,confusables,plurals,complete} Run only one deck
|
--only {vocab,conjugations,confusables,plurals,complete} Run only one deck
|
||||||
--skip-scrape Use existing data/pealim_dict.csv (no pealim.com dict scraping)
|
--skip-scrape Skip list page scraping (use existing words.json)
|
||||||
|
--skip-detail Skip detail page scraping
|
||||||
--skip-audio Skip audio .mp3 downloads
|
--skip-audio Skip audio .mp3 downloads
|
||||||
--skip-examples Skip Ben Yehuda example fetching
|
--skip-examples Skip Ben Yehuda example fetching
|
||||||
--skip-conjugations Skip verb conjugation extraction
|
|
||||||
--skip-images Skip image fetching for concrete nouns
|
--skip-images Skip image fetching for concrete nouns
|
||||||
--refresh-examples Force rebuild of Ben Yehuda index (delete old, download nikkud corpus)
|
--refresh-examples Force rebuild of Ben Yehuda index (delete old, download nikkud corpus)
|
||||||
--test N Process only the first N dictionary words (for quick testing)
|
--test N Process only the first N dictionary words (for quick testing)
|
||||||
|
|
@ -21,7 +21,6 @@ import json
|
||||||
import logging
|
import logging
|
||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import time
|
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from helpers import strip_nikkud
|
from helpers import strip_nikkud
|
||||||
|
|
@ -39,6 +38,7 @@ OUTPUT_DIR = Path(__file__).parent / "output"
|
||||||
AUDIO_DIR = DATA_DIR / "audio"
|
AUDIO_DIR = DATA_DIR / "audio"
|
||||||
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
|
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
|
||||||
FONTS_DIR = DATA_DIR / "fonts"
|
FONTS_DIR = DATA_DIR / "fonts"
|
||||||
|
WORDS_JSON = DATA_DIR / "words.json"
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
|
def parse_args():
|
||||||
|
|
@ -48,47 +48,31 @@ def parse_args():
|
||||||
choices=["vocab", "conjugations", "confusables", "plurals", "complete"],
|
choices=["vocab", "conjugations", "confusables", "plurals", "complete"],
|
||||||
help="Run only one deck (skips all unrelated steps)",
|
help="Run only one deck (skips all unrelated steps)",
|
||||||
)
|
)
|
||||||
p.add_argument("--skip-scrape", action="store_true", help="Skip dict scraping; use cached CSV")
|
p.add_argument("--skip-scrape", action="store_true", help="Skip list page scraping")
|
||||||
|
p.add_argument("--skip-detail", action="store_true", help="Skip detail page scraping")
|
||||||
p.add_argument("--skip-audio", action="store_true", help="Skip audio downloads")
|
p.add_argument("--skip-audio", action="store_true", help="Skip audio downloads")
|
||||||
p.add_argument("--skip-examples", action="store_true", help="Skip Ben Yehuda example lookup")
|
p.add_argument("--skip-examples", action="store_true", help="Skip Ben Yehuda example lookup")
|
||||||
p.add_argument(
|
|
||||||
"--skip-conjugations",
|
|
||||||
action="store_true",
|
|
||||||
help="Skip verb conjugation extraction (deprecated: use --only vocab)",
|
|
||||||
)
|
|
||||||
p.add_argument("--skip-images", action="store_true", help="Skip image fetching")
|
p.add_argument("--skip-images", action="store_true", help="Skip image fetching")
|
||||||
p.add_argument("--refresh-examples", action="store_true", help="Force rebuild of Ben Yehuda index")
|
p.add_argument("--refresh-examples", action="store_true", help="Force rebuild of Ben Yehuda index")
|
||||||
p.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
|
p.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
|
||||||
return p.parse_args()
|
return p.parse_args()
|
||||||
|
|
||||||
|
|
||||||
def step_scrape(args):
|
def step_list_scrape(args):
|
||||||
"""Step 1 — scrape or load dictionary."""
|
"""Step 1 — scrape pealim.com list pages → words.json."""
|
||||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
|
||||||
anki_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
|
||||||
# Legacy fallback names
|
|
||||||
legacy_dict = DATA_DIR / "pealim_dict.csv"
|
|
||||||
if args.skip_scrape:
|
if args.skip_scrape:
|
||||||
if dict_csv.exists():
|
if WORDS_JSON.exists():
|
||||||
logger.info(f"[1] Using existing {dict_csv}")
|
logger.info("[1] Using existing words.json (--skip-scrape)")
|
||||||
elif legacy_dict.exists():
|
|
||||||
logger.info(f"[1] Using legacy {legacy_dict} (consider renaming)")
|
|
||||||
else:
|
else:
|
||||||
logger.error(f"[1] --skip-scrape set but {dict_csv} not found. Aborting.")
|
logger.error(f"[1] --skip-scrape set but {WORDS_JSON} not found. Aborting.")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
return
|
return
|
||||||
|
|
||||||
logger.info("[1] Scraping dictionary from pealim.com …")
|
logger.info("[1] Scraping dictionary list pages from pealim.com …")
|
||||||
|
import pealim_list_scrape
|
||||||
|
|
||||||
import hebrew_extract
|
total_pages = args.test if args.test else None
|
||||||
|
pealim_list_scrape.run_scrape(total_pages=total_pages, force_refresh=False)
|
||||||
df = hebrew_extract.extract_from_website()
|
|
||||||
df.to_csv(dict_csv, index=True)
|
|
||||||
logger.info(f" Saved {len(df)} words → {dict_csv}")
|
|
||||||
|
|
||||||
df = hebrew_extract.modify_for_anki(df)
|
|
||||||
df.to_csv(anki_csv, sep=";", index=True)
|
|
||||||
logger.info(f" Saved Anki CSV → {anki_csv}")
|
|
||||||
|
|
||||||
|
|
||||||
def step_frequency() -> dict[str, int]:
|
def step_frequency() -> dict[str, int]:
|
||||||
|
|
@ -100,7 +84,7 @@ def step_frequency() -> dict[str, int]:
|
||||||
return frequency_lookup._freq
|
return frequency_lookup._freq
|
||||||
|
|
||||||
|
|
||||||
def step_examples(args, freq_cache: dict):
|
def step_examples(args, _freq_cache: dict):
|
||||||
"""Step 3 — load/build Ben Yehuda example index."""
|
"""Step 3 — load/build Ben Yehuda example index."""
|
||||||
if args.skip_examples:
|
if args.skip_examples:
|
||||||
logger.info("[3] Skipping examples (--skip-examples)")
|
logger.info("[3] Skipping examples (--skip-examples)")
|
||||||
|
|
@ -115,255 +99,100 @@ def step_examples(args, freq_cache: dict):
|
||||||
|
|
||||||
benyehuda.load(force_rebuild=args.refresh_examples)
|
benyehuda.load(force_rebuild=args.refresh_examples)
|
||||||
|
|
||||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
# Read word list from words.json instead of CSV
|
||||||
if not dict_csv.exists():
|
if not WORDS_JSON.exists():
|
||||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
logger.warning("[3] words.json not found, skipping examples")
|
||||||
if not dict_csv.exists():
|
return {}
|
||||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
|
||||||
if not dict_csv.exists():
|
|
||||||
dict_csv = DATA_DIR / "pealim_dict.csv"
|
|
||||||
|
|
||||||
try:
|
with open(WORDS_JSON, encoding="utf-8") as f:
|
||||||
import pandas as pd
|
words = json.load(f)
|
||||||
|
|
||||||
try:
|
entries = list(words.values())
|
||||||
df = pd.read_csv(dict_csv, sep=";", index_col=0)
|
if args.test:
|
||||||
if df.shape[1] < 3:
|
entries = entries[: args.test]
|
||||||
raise ValueError("too few columns")
|
|
||||||
except (ValueError, pd.errors.ParserError):
|
|
||||||
df = pd.read_csv(dict_csv, index_col=0)
|
|
||||||
|
|
||||||
if args.test:
|
# Build confusable consonant set from words.json
|
||||||
df = df.head(args.test)
|
consonant_counts: dict[str, int] = {}
|
||||||
|
for entry in entries:
|
||||||
|
ktiv_male = entry.get("word", {}).get("ktiv_male", "")
|
||||||
|
if ktiv_male:
|
||||||
|
safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(ktiv_male))
|
||||||
|
if safe:
|
||||||
|
consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
|
||||||
|
confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
|
||||||
|
|
||||||
# Build confusable consonant set from CSV
|
# Delete stale cache entries for confusable words so they get re-fetched
|
||||||
consonant_counts: dict[str, int] = {}
|
stale_deleted = 0
|
||||||
for _, row in df.iterrows():
|
for entry in entries:
|
||||||
word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
|
word_nikkud = entry.get("word", {}).get("nikkud", "")
|
||||||
if word_no_nik and word_no_nik not in ("nan", "None"):
|
ktiv_male = entry.get("word", {}).get("ktiv_male", "")
|
||||||
safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_no_nik))
|
if word_nikkud and ktiv_male:
|
||||||
if safe:
|
safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(ktiv_male))
|
||||||
consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
|
if safe in confusable_consonants and word_nikkud in benyehuda._examples_cache:
|
||||||
confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
|
del benyehuda._examples_cache[word_nikkud]
|
||||||
|
stale_deleted += 1
|
||||||
|
if stale_deleted:
|
||||||
|
logger.info(f" Deleted {stale_deleted} stale confusable cache entries")
|
||||||
|
|
||||||
# Delete stale cache entries for confusable words so they get re-fetched
|
logger.info(f" Pre-fetching examples for {len(entries)} words …")
|
||||||
stale_deleted = 0
|
for entry in entries:
|
||||||
for _, row in df.iterrows():
|
word_nikkud = entry.get("word", {}).get("nikkud", "")
|
||||||
word_nikkud = str(row.get("Word", "")).strip()
|
if word_nikkud:
|
||||||
word_no_nik = str(row.get("Word Without Nikkud", "")).strip()
|
benyehuda.get_examples(word_nikkud, confusable_consonants=confusable_consonants)
|
||||||
if word_nikkud and word_no_nik:
|
|
||||||
safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_no_nik))
|
|
||||||
if safe in confusable_consonants and word_nikkud in benyehuda._examples_cache:
|
|
||||||
del benyehuda._examples_cache[word_nikkud]
|
|
||||||
stale_deleted += 1
|
|
||||||
if stale_deleted:
|
|
||||||
logger.info(f" Deleted {stale_deleted} stale confusable cache entries")
|
|
||||||
|
|
||||||
logger.info(f" Pre-fetching examples for {len(df)} words …")
|
|
||||||
for _, row in df.iterrows():
|
|
||||||
# Use nikkud word form as primary key (nikkud corpus)
|
|
||||||
word_nikkud = str(row.get("Word", "")).strip()
|
|
||||||
if word_nikkud:
|
|
||||||
benyehuda.get_examples(word_nikkud, confusable_consonants=confusable_consonants)
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f" Could not pre-fetch all examples: {e}")
|
|
||||||
|
|
||||||
benyehuda.save_examples_cache()
|
benyehuda.save_examples_cache()
|
||||||
return benyehuda._examples_cache
|
return benyehuda._examples_cache
|
||||||
|
|
||||||
|
|
||||||
def step_audio(args):
|
def step_detail_scrape(args):
|
||||||
"""Step 4 — download vocabulary audio .mp3 files from audio_url column in CSV."""
|
"""Step 4 — scrape detail pages for nouns and verbs → update words.json."""
|
||||||
if args.skip_audio:
|
if args.skip_detail:
|
||||||
logger.info("[4] Skipping audio (--skip-audio)")
|
logger.info("[4] Skipping detail scrape (--skip-detail)")
|
||||||
return
|
return
|
||||||
|
|
||||||
logger.info("[4] Downloading vocabulary audio files …")
|
logger.info("[4] Scraping detail pages from pealim.com …")
|
||||||
|
import pealim_detail_scrape
|
||||||
|
|
||||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
test_limit = args.test if args.test else None
|
||||||
if not dict_csv.exists():
|
pealim_detail_scrape.run(test=test_limit, force_refresh=False)
|
||||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
|
||||||
if not dict_csv.exists():
|
|
||||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
|
||||||
if not dict_csv.exists():
|
|
||||||
dict_csv = DATA_DIR / "pealim_dict.csv"
|
|
||||||
|
|
||||||
import pandas as pd
|
|
||||||
import requests
|
|
||||||
|
|
||||||
try:
|
|
||||||
try:
|
|
||||||
df = pd.read_csv(dict_csv, sep=";", index_col=0)
|
|
||||||
if df.shape[1] < 3:
|
|
||||||
raise ValueError("too few columns")
|
|
||||||
except (ValueError, pd.errors.ParserError):
|
|
||||||
df = pd.read_csv(dict_csv, index_col=0)
|
|
||||||
|
|
||||||
if "audio_url" not in df.columns:
|
|
||||||
logger.warning(" No audio_url column in CSV — re-scrape with hebrew_extract.py to capture audio URLs")
|
|
||||||
return
|
|
||||||
|
|
||||||
if args.test:
|
|
||||||
df = df.head(args.test)
|
|
||||||
|
|
||||||
# Build confusable set: consonant forms that appear more than once
|
|
||||||
confusable_consonants: set[str] = set()
|
|
||||||
consonant_counts: dict[str, int] = {}
|
|
||||||
for _, row in df.iterrows():
|
|
||||||
word_plain = str(row.get("Word Without Nikkud", "")).strip()
|
|
||||||
if word_plain and word_plain not in ("nan", "None"):
|
|
||||||
safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_plain))
|
|
||||||
if safe:
|
|
||||||
consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
|
|
||||||
confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
|
|
||||||
|
|
||||||
AUDIO_DIR.mkdir(parents=True, exist_ok=True)
|
|
||||||
downloaded = 0
|
|
||||||
skipped = 0
|
|
||||||
no_url = 0
|
|
||||||
|
|
||||||
for _, row in df.iterrows():
|
|
||||||
word = str(row.get("Word", "")).strip()
|
|
||||||
word_plain = str(row.get("Word Without Nikkud", "")).strip()
|
|
||||||
audio_url = str(row.get("audio_url", "")).strip()
|
|
||||||
slug = str(row.get("slug", "")).strip()
|
|
||||||
|
|
||||||
if not word:
|
|
||||||
continue
|
|
||||||
|
|
||||||
safe_name = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_plain or word))
|
|
||||||
if not safe_name:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Confusable words: use slug-based filename to avoid collisions
|
|
||||||
if safe_name in confusable_consonants and slug and slug not in ("nan", "None"):
|
|
||||||
mp3_path = AUDIO_DIR / f"{slug}.mp3"
|
|
||||||
else:
|
|
||||||
mp3_path = AUDIO_DIR / f"{safe_name}.mp3"
|
|
||||||
|
|
||||||
if mp3_path.exists():
|
|
||||||
skipped += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
if not audio_url or audio_url in ("nan", "None", ""):
|
|
||||||
no_url += 1
|
|
||||||
continue
|
|
||||||
|
|
||||||
try:
|
|
||||||
resp = requests.get(audio_url, timeout=10)
|
|
||||||
resp.raise_for_status()
|
|
||||||
mp3_path.write_bytes(resp.content)
|
|
||||||
downloaded += 1
|
|
||||||
time.sleep(0.3)
|
|
||||||
except Exception as e:
|
|
||||||
logger.debug(f" Audio download failed for {word}: {e}")
|
|
||||||
|
|
||||||
logger.info(f" Audio: {downloaded} downloaded, {skipped} already cached, {no_url} without URL")
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
logger.warning(f" Audio step failed: {e}")
|
|
||||||
|
|
||||||
|
|
||||||
def step_conj_audio(args, conjugations: dict):
|
def step_audio_download(args):
|
||||||
"""Step 4b — download conjugation audio .mp3 files."""
|
"""Step 5 — download audio .mp3 files from URLs in words.json."""
|
||||||
if args.skip_audio:
|
if args.skip_audio:
|
||||||
logger.info("[4b] Skipping conjugation audio (--skip-audio)")
|
logger.info("[5] Skipping audio (--skip-audio)")
|
||||||
return
|
return
|
||||||
|
|
||||||
logger.info("[4b] Downloading conjugation audio files …")
|
logger.info("[5] Downloading audio files …")
|
||||||
AUDIO_CONJ_DIR.mkdir(parents=True, exist_ok=True)
|
import pealim_audio_download
|
||||||
|
|
||||||
import requests
|
test_limit = args.test if args.test else None
|
||||||
|
pealim_audio_download.run(test=test_limit)
|
||||||
downloaded = 0
|
|
||||||
skipped = 0
|
|
||||||
failed = 0
|
|
||||||
|
|
||||||
for _infinitive, data in conjugations.items():
|
|
||||||
if not data or not data.get("forms"):
|
|
||||||
continue
|
|
||||||
|
|
||||||
slug = data.get("slug", "")
|
|
||||||
if not slug:
|
|
||||||
continue
|
|
||||||
|
|
||||||
# Active forms
|
|
||||||
for form_key, form_data in data["forms"].items():
|
|
||||||
audio_url = form_data.get("audio_url", "")
|
|
||||||
if not audio_url:
|
|
||||||
continue
|
|
||||||
filename = f"{slug}_{form_key}.mp3"
|
|
||||||
mp3_path = AUDIO_CONJ_DIR / filename
|
|
||||||
if mp3_path.exists():
|
|
||||||
skipped += 1
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
resp = requests.get(audio_url, timeout=10)
|
|
||||||
resp.raise_for_status()
|
|
||||||
mp3_path.write_bytes(resp.content)
|
|
||||||
downloaded += 1
|
|
||||||
time.sleep(0.2)
|
|
||||||
except Exception as e:
|
|
||||||
logger.debug(f" Conj audio failed {filename}: {e}")
|
|
||||||
failed += 1
|
|
||||||
|
|
||||||
# Passive partner forms
|
|
||||||
passive = data.get("passive_partner")
|
|
||||||
if passive and passive.get("forms"):
|
|
||||||
for form_key, form_data in passive["forms"].items():
|
|
||||||
audio_url = form_data.get("audio_url", "")
|
|
||||||
if not audio_url:
|
|
||||||
continue
|
|
||||||
filename = f"{slug}_passive_{form_key}.mp3"
|
|
||||||
mp3_path = AUDIO_CONJ_DIR / filename
|
|
||||||
if mp3_path.exists():
|
|
||||||
skipped += 1
|
|
||||||
continue
|
|
||||||
try:
|
|
||||||
resp = requests.get(audio_url, timeout=10)
|
|
||||||
resp.raise_for_status()
|
|
||||||
mp3_path.write_bytes(resp.content)
|
|
||||||
downloaded += 1
|
|
||||||
time.sleep(0.2)
|
|
||||||
except Exception as e:
|
|
||||||
logger.debug(f" Conj audio failed {filename}: {e}")
|
|
||||||
failed += 1
|
|
||||||
|
|
||||||
logger.info(f" Conjugation audio: {downloaded} downloaded, {skipped} cached, {failed} failed")
|
|
||||||
|
|
||||||
|
|
||||||
def step_fonts(args):
|
def step_fonts(_args: argparse.Namespace):
|
||||||
"""Step 4c — download Heebo font files (one-time, cached)."""
|
"""Step 6 — download Heebo font files (one-time, cached)."""
|
||||||
FONTS_DIR.mkdir(parents=True, exist_ok=True)
|
FONTS_DIR.mkdir(parents=True, exist_ok=True)
|
||||||
regular = FONTS_DIR / "_Heebo-Regular.ttf"
|
regular = FONTS_DIR / "_Heebo-Regular.ttf"
|
||||||
bold = FONTS_DIR / "_Heebo-Bold.ttf"
|
bold = FONTS_DIR / "_Heebo-Bold.ttf"
|
||||||
|
|
||||||
if regular.exists() and bold.exists():
|
if regular.exists() and bold.exists():
|
||||||
logger.info("[4c] Heebo fonts already cached")
|
logger.info("[6] Heebo fonts already cached")
|
||||||
return
|
return
|
||||||
|
|
||||||
logger.info("[4c] Downloading Heebo fonts from Google Fonts …")
|
logger.info("[6] Downloading Heebo fonts from Google Fonts …")
|
||||||
|
|
||||||
# Fetch CSS to get actual TTF source URLs (static subset for Hebrew + Latin)
|
|
||||||
import requests as _req
|
import requests as _req
|
||||||
|
|
||||||
headers = {
|
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) Gecko/20100101 Firefox/120.0"}
|
||||||
# Request TTF (not woff2) so Anki can embed them
|
|
||||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) Gecko/20100101 Firefox/120.0"
|
|
||||||
}
|
|
||||||
css_url = "https://fonts.googleapis.com/css2?family=Heebo:wght@400;700"
|
css_url = "https://fonts.googleapis.com/css2?family=Heebo:wght@400;700"
|
||||||
try:
|
try:
|
||||||
css_resp = _req.get(css_url, headers=headers, timeout=15)
|
css_resp = _req.get(css_url, headers=headers, timeout=15)
|
||||||
css_resp.raise_for_status()
|
css_resp.raise_for_status()
|
||||||
css_text = css_resp.text
|
css_text = css_resp.text
|
||||||
|
|
||||||
# Find all src: url(...) references (may be woff2 for modern UA)
|
|
||||||
font_urls = re.findall(r"src:\s*url\(([^)]+)\)", css_text)
|
font_urls = re.findall(r"src:\s*url\(([^)]+)\)", css_text)
|
||||||
logger.debug(f" Found {len(font_urls)} font URL(s) in CSS")
|
|
||||||
|
|
||||||
# Prefer TTF; if only woff2 available, download first two and note
|
|
||||||
downloaded = []
|
|
||||||
for i, fu in enumerate(font_urls[:2]):
|
for i, fu in enumerate(font_urls[:2]):
|
||||||
fu = fu.strip("'\"")
|
fu = fu.strip("'\"")
|
||||||
dest = regular if i == 0 else bold
|
dest = regular if i == 0 else bold
|
||||||
|
|
@ -372,128 +201,60 @@ def step_fonts(args):
|
||||||
fr = _req.get(fu, timeout=15)
|
fr = _req.get(fu, timeout=15)
|
||||||
fr.raise_for_status()
|
fr.raise_for_status()
|
||||||
dest.write_bytes(fr.content)
|
dest.write_bytes(fr.content)
|
||||||
downloaded.append(dest.name)
|
|
||||||
logger.info(f" Downloaded → {dest.name}")
|
logger.info(f" Downloaded → {dest.name}")
|
||||||
|
|
||||||
if not downloaded:
|
|
||||||
logger.info(" All font files already present")
|
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.warning(f" Heebo download failed: {e}")
|
logger.warning(f" Heebo download failed: {e}")
|
||||||
logger.warning(" Cards will fall back to Arial Hebrew / David.")
|
logger.warning(" Cards will fall back to Arial Hebrew / David.")
|
||||||
logger.warning(
|
|
||||||
" To install manually: download Heebo-Regular.ttf and Heebo-Bold.ttf "
|
|
||||||
"from https://fonts.google.com/specimen/Heebo and rename with _ prefix "
|
|
||||||
f"into {FONTS_DIR}"
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def step_images(args) -> dict:
|
def step_images(args) -> dict:
|
||||||
"""Step 4d — fetch images for concrete nouns (resume-safe)."""
|
"""Step 7 — fetch images for concrete nouns (resume-safe)."""
|
||||||
if args.skip_images:
|
if args.skip_images:
|
||||||
logger.info("[4d] Skipping images (--skip-images)")
|
logger.info("[7] Skipping images (--skip-images)")
|
||||||
cache_path = DATA_DIR / "image_cache.json"
|
cache_path = DATA_DIR / "image_cache.json"
|
||||||
if cache_path.exists():
|
if cache_path.exists():
|
||||||
with open(cache_path) as f:
|
with open(cache_path) as f:
|
||||||
return json.load(f)
|
return json.load(f)
|
||||||
return {}
|
return {}
|
||||||
|
|
||||||
limit = args.test # When in test mode, limit images too
|
limit = args.test
|
||||||
logger.info("[4d] Fetching images for concrete nouns …")
|
logger.info("[7] Fetching images for concrete nouns …")
|
||||||
import image_fetch
|
import image_fetch
|
||||||
|
|
||||||
return image_fetch.run(limit=limit)
|
return image_fetch.run(limit=limit)
|
||||||
|
|
||||||
|
|
||||||
def step_build_all(
|
def step_build_all(args):
|
||||||
args, examples_cache: dict, freq_cache: dict, conjugations: dict | None, image_cache: dict | None = None
|
"""Step 8 — build all 12 release variants from the unified words.json."""
|
||||||
):
|
logger.info("[8] Building all deck variants …")
|
||||||
"""Step 5 — build all 6 release variants (4 vocab + 2 conj)."""
|
|
||||||
logger.info("[5] Building all deck variants …")
|
|
||||||
import apkg_builder
|
import apkg_builder
|
||||||
|
|
||||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
if not WORDS_JSON.exists():
|
||||||
if not dict_csv.exists():
|
logger.error(f"[8] {WORDS_JSON} not found. Run the data pipeline first.")
|
||||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
sys.exit(1)
|
||||||
if not dict_csv.exists():
|
|
||||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
|
||||||
if not dict_csv.exists():
|
|
||||||
dict_csv = DATA_DIR / "pealim_dict.csv"
|
|
||||||
|
|
||||||
apkg_builder.build_all_variants(
|
with open(WORDS_JSON, encoding="utf-8") as f:
|
||||||
dict_csv,
|
words = json.load(f)
|
||||||
conjugations=conjugations or {},
|
|
||||||
examples_cache=examples_cache,
|
apkg_builder.build_all_variants(words, limit=args.test)
|
||||||
freq_cache=freq_cache,
|
|
||||||
image_cache=image_cache or {},
|
|
||||||
limit=args.test,
|
|
||||||
)
|
|
||||||
|
|
||||||
|
|
||||||
def step_conjugations(args):
|
def print_summary(_args: argparse.Namespace, examples_cache: dict, freq_cache: dict):
|
||||||
"""Step 6 — extract conjugations (returns data; building handled by step_build_all).
|
|
||||||
|
|
||||||
--skip-conjugations skips re-extraction from pealim.com but still loads
|
|
||||||
from cache so conj deck variants are built correctly.
|
|
||||||
"""
|
|
||||||
conj_cache = DATA_DIR / "conjugations.json"
|
|
||||||
|
|
||||||
if args.skip_conjugations:
|
|
||||||
if conj_cache.exists():
|
|
||||||
logger.info("[6] --skip-conjugations: loading from cache …")
|
|
||||||
with open(conj_cache) as f:
|
|
||||||
import json as _json
|
|
||||||
|
|
||||||
return _json.load(f)
|
|
||||||
logger.info("[6] --skip-conjugations: no cache found, skipping conj decks")
|
|
||||||
return None
|
|
||||||
|
|
||||||
verbs_file = Path(__file__).parent / "verbs_input.txt"
|
|
||||||
if not verbs_file.exists():
|
|
||||||
logger.info("[6] verbs_input.txt not found — skipping conjugation deck")
|
|
||||||
return None
|
|
||||||
|
|
||||||
if conj_cache.exists():
|
|
||||||
logger.info("[6] Using cached conjugations.json …")
|
|
||||||
with open(conj_cache) as f:
|
|
||||||
import json as _json
|
|
||||||
|
|
||||||
conjugations = _json.load(f)
|
|
||||||
else:
|
|
||||||
logger.info("[6] Extracting verb conjugations …")
|
|
||||||
import conjugation_extract
|
|
||||||
|
|
||||||
conjugations = conjugation_extract.main(verbs_file)
|
|
||||||
|
|
||||||
# Download conjugation audio
|
|
||||||
step_conj_audio(args, conjugations)
|
|
||||||
|
|
||||||
return conjugations
|
|
||||||
|
|
||||||
|
|
||||||
def print_summary(args, examples_cache, freq_cache, conjugations):
|
|
||||||
logger.info("")
|
logger.info("")
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
logger.info("SUMMARY")
|
logger.info("SUMMARY")
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
|
|
||||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
if WORDS_JSON.exists():
|
||||||
if not dict_csv.exists():
|
with open(WORDS_JSON, encoding="utf-8") as f:
|
||||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
words = json.load(f)
|
||||||
if not dict_csv.exists():
|
logger.info(f" Dictionary words: {len(words)}")
|
||||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
|
||||||
if not dict_csv.exists():
|
|
||||||
dict_csv = DATA_DIR / "pealim_dict.csv"
|
|
||||||
if dict_csv.exists():
|
|
||||||
import pandas as pd
|
|
||||||
|
|
||||||
try:
|
nouns = sum(1 for e in words.values() if e.get("pos", "").startswith("Noun"))
|
||||||
df = pd.read_csv(dict_csv, sep=";", index_col=0)
|
verbs = sum(1 for e in words.values() if e.get("pos", "").startswith("Verb"))
|
||||||
if df.shape[1] < 3:
|
detail_scraped = sum(1 for e in words.values() if e.get("detail_scraped"))
|
||||||
raise ValueError("too few columns")
|
logger.info(f" Nouns: {nouns}, Verbs: {verbs}, Detail-scraped: {detail_scraped}")
|
||||||
except (ValueError, pd.errors.ParserError):
|
|
||||||
df = pd.read_csv(dict_csv, index_col=0)
|
|
||||||
logger.info(f" Dictionary words: {len(df)}")
|
|
||||||
|
|
||||||
logger.info(f" Frequency entries: {len(freq_cache)}")
|
logger.info(f" Frequency entries: {len(freq_cache)}")
|
||||||
logger.info(f" Example cache entries: {len(examples_cache)}")
|
logger.info(f" Example cache entries: {len(examples_cache)}")
|
||||||
|
|
@ -506,8 +267,6 @@ def print_summary(args, examples_cache, freq_cache, conjugations):
|
||||||
logger.info(f" Vocabulary audio files: {len(mp3s)}")
|
logger.info(f" Vocabulary audio files: {len(mp3s)}")
|
||||||
|
|
||||||
if AUDIO_CONJ_DIR.exists():
|
if AUDIO_CONJ_DIR.exists():
|
||||||
# Count only files that will be bundled: active non-infinitive forms
|
|
||||||
# (excludes {slug}_passive_* and {slug}_infinitive.mp3 on-disk extras)
|
|
||||||
mp3s = [
|
mp3s = [
|
||||||
p for p in AUDIO_CONJ_DIR.glob("*.mp3") if not p.stem.endswith("_infinitive") and "_passive_" not in p.stem
|
p for p in AUDIO_CONJ_DIR.glob("*.mp3") if not p.stem.endswith("_infinitive") and "_passive_" not in p.stem
|
||||||
]
|
]
|
||||||
|
|
@ -538,9 +297,6 @@ def print_summary(args, examples_cache, freq_cache, conjugations):
|
||||||
if apkg.exists():
|
if apkg.exists():
|
||||||
size_mb = apkg.stat().st_size / 1e6
|
size_mb = apkg.stat().st_size / 1e6
|
||||||
logger.info(f" {apkg.name}: {size_mb:.1f} MB")
|
logger.info(f" {apkg.name}: {size_mb:.1f} MB")
|
||||||
if conjugations:
|
|
||||||
verb_count = sum(1 for v in conjugations.values() if v)
|
|
||||||
logger.info(f" Verbs in conjugation deck: {verb_count}")
|
|
||||||
|
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
logger.info("DONE")
|
logger.info("DONE")
|
||||||
|
|
@ -559,88 +315,73 @@ def main():
|
||||||
logger.info(" REFRESH EXAMPLES: Ben Yehuda index will be rebuilt")
|
logger.info(" REFRESH EXAMPLES: Ben Yehuda index will be rebuilt")
|
||||||
logger.info("=" * 60)
|
logger.info("=" * 60)
|
||||||
|
|
||||||
|
def _load_words_for_only() -> dict:
|
||||||
|
if not WORDS_JSON.exists():
|
||||||
|
logger.error(f"words.json not found at {WORDS_JSON}. Run the data pipeline first.")
|
||||||
|
sys.exit(1)
|
||||||
|
with open(WORDS_JSON, encoding="utf-8") as f:
|
||||||
|
return json.load(f)
|
||||||
|
|
||||||
if args.only == "conjugations":
|
if args.only == "conjugations":
|
||||||
step_fonts(args)
|
step_fonts(args)
|
||||||
conjugations = step_conjugations(args)
|
import apkg_builder
|
||||||
if conjugations:
|
|
||||||
import apkg_builder
|
|
||||||
|
|
||||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
words = _load_words_for_only()
|
||||||
if not dict_csv.exists():
|
for audio, path in [(False, apkg_builder.CONJ_APKG), (True, apkg_builder.CONJ_APKG_AUDIO)]:
|
||||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
deck, media = apkg_builder.build_conj_deck(words, include_audio=audio)
|
||||||
for audio, path in [(False, apkg_builder.CONJ_APKG), (True, apkg_builder.CONJ_APKG_AUDIO)]:
|
apkg_builder.write_conj_apkg(deck, media, out_path=path)
|
||||||
deck, media = apkg_builder.build_conj_deck(
|
print_summary(args, {}, {})
|
||||||
conjugations,
|
|
||||||
include_audio=audio,
|
|
||||||
dict_csv=dict_csv,
|
|
||||||
)
|
|
||||||
apkg_builder.write_conj_apkg(deck, media, out_path=path)
|
|
||||||
print_summary(args, {}, {}, conjugations or {})
|
|
||||||
return
|
return
|
||||||
|
|
||||||
if args.only == "confusables":
|
if args.only == "confusables":
|
||||||
step_fonts(args)
|
step_fonts(args)
|
||||||
import apkg_builder
|
import apkg_builder
|
||||||
|
|
||||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
words = _load_words_for_only()
|
||||||
for audio, path in [(False, apkg_builder.CONF_APKG), (True, apkg_builder.CONF_APKG_AUDIO)]:
|
for audio, path in [(False, apkg_builder.CONF_APKG), (True, apkg_builder.CONF_APKG_AUDIO)]:
|
||||||
deck, media = apkg_builder.build_confusables_deck(dict_csv, include_audio=audio)
|
deck, media = apkg_builder.build_confusables_deck(words, include_audio=audio)
|
||||||
apkg_builder.write_conf_apkg(deck, media, out_path=path)
|
apkg_builder.write_conf_apkg(deck, media, out_path=path)
|
||||||
print_summary(args, {}, {}, {})
|
print_summary(args, {}, {})
|
||||||
return
|
return
|
||||||
|
|
||||||
if args.only == "plurals":
|
if args.only == "plurals":
|
||||||
step_fonts(args)
|
step_fonts(args)
|
||||||
import apkg_builder
|
import apkg_builder
|
||||||
|
|
||||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
words = _load_words_for_only()
|
||||||
if not dict_csv.exists():
|
|
||||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
|
||||||
for audio, path in [(False, apkg_builder.PLURAL_APKG), (True, apkg_builder.PLURAL_APKG_AUDIO)]:
|
for audio, path in [(False, apkg_builder.PLURAL_APKG), (True, apkg_builder.PLURAL_APKG_AUDIO)]:
|
||||||
deck, media = apkg_builder.build_plural_deck(dict_csv=dict_csv, include_audio=audio)
|
deck, media = apkg_builder.build_plural_deck(words, include_audio=audio)
|
||||||
apkg_builder.write_plural_apkg(deck, media, out_path=path)
|
apkg_builder.write_plural_apkg(deck, media, out_path=path)
|
||||||
print_summary(args, {}, {}, {})
|
print_summary(args, {}, {})
|
||||||
return
|
return
|
||||||
|
|
||||||
if args.only == "complete":
|
if args.only == "complete":
|
||||||
step_fonts(args)
|
step_fonts(args)
|
||||||
freq_cache = step_frequency() if not args.skip_scrape else {}
|
|
||||||
examples_cache = step_examples(args, freq_cache) if not args.skip_examples else {}
|
|
||||||
image_cache = step_images(args) if not args.skip_images else {}
|
|
||||||
conjugations = step_conjugations(args)
|
|
||||||
import apkg_builder
|
import apkg_builder
|
||||||
|
|
||||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
words = _load_words_for_only()
|
||||||
if not dict_csv.exists():
|
|
||||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
|
||||||
emoji_lookup = apkg_builder._load_emoji_lookup()
|
emoji_lookup = apkg_builder._load_emoji_lookup()
|
||||||
for audio, path in [(False, apkg_builder.COMPLETE_APKG), (True, apkg_builder.COMPLETE_APKG_AUDIO)]:
|
for audio, path in [(False, apkg_builder.COMPLETE_APKG), (True, apkg_builder.COMPLETE_APKG_AUDIO)]:
|
||||||
decks, media = apkg_builder.build_complete_deck(
|
decks, media = apkg_builder.build_complete_deck(
|
||||||
dict_csv,
|
words,
|
||||||
conjugations=conjugations or {},
|
|
||||||
examples_cache=examples_cache,
|
|
||||||
freq_cache=freq_cache,
|
|
||||||
image_cache=image_cache,
|
|
||||||
emoji_lookup=emoji_lookup,
|
|
||||||
include_audio=audio,
|
include_audio=audio,
|
||||||
|
emoji_lookup=emoji_lookup,
|
||||||
)
|
)
|
||||||
apkg_builder.write_complete_apkg(decks, media, out_path=path)
|
apkg_builder.write_complete_apkg(decks, media, out_path=path)
|
||||||
print_summary(args, examples_cache, freq_cache, conjugations or {})
|
print_summary(args, {}, {})
|
||||||
return
|
return
|
||||||
|
|
||||||
if args.only == "vocab":
|
# Full pipeline
|
||||||
args.skip_conjugations = True
|
step_list_scrape(args)
|
||||||
|
|
||||||
step_scrape(args)
|
|
||||||
freq_cache = step_frequency()
|
freq_cache = step_frequency()
|
||||||
examples_cache = step_examples(args, freq_cache)
|
examples_cache = step_examples(args, freq_cache)
|
||||||
step_audio(args)
|
step_detail_scrape(args)
|
||||||
|
step_audio_download(args)
|
||||||
step_fonts(args)
|
step_fonts(args)
|
||||||
image_cache = step_images(args)
|
step_images(args)
|
||||||
conjugations = step_conjugations(args)
|
step_build_all(args)
|
||||||
step_build_all(args, examples_cache, freq_cache, conjugations, image_cache)
|
|
||||||
|
|
||||||
print_summary(args, examples_cache, freq_cache, conjugations or {})
|
print_summary(args, examples_cache, freq_cache)
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|
|
||||||
212
scripts/check_guid_coverage.py
Normal file
212
scripts/check_guid_coverage.py
Normal file
|
|
@ -0,0 +1,212 @@
|
||||||
|
"""Check that every GUID in the last-release complete .apkg exists in words.json.
|
||||||
|
|
||||||
|
Extracts GUIDs from the Anki SQLite database inside the .apkg (zip) file,
|
||||||
|
then compares against all GUID fields stored in data/words.json.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/check_guid_coverage.py
|
||||||
|
python3 scripts/check_guid_coverage.py --apkg output/hebrew_complete.apkg
|
||||||
|
python3 scripts/check_guid_coverage.py --verbose
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import sqlite3
|
||||||
|
import sys
|
||||||
|
import tempfile
|
||||||
|
import zipfile
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
PROJECT_ROOT = Path(__file__).parent.parent
|
||||||
|
DEFAULT_APKG = PROJECT_ROOT / "output" / "hebrew_complete.apkg"
|
||||||
|
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
|
||||||
|
|
||||||
|
# Known model IDs (from apkg_builder.py)
|
||||||
|
MODEL_IDS = {
|
||||||
|
1701222017968: "vocab",
|
||||||
|
1234567893: "conjugation",
|
||||||
|
1234567897: "plurals",
|
||||||
|
1234567895: "confusables",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def extract_apkg_guids(apkg_path: Path) -> dict[int, set[str]]:
    """Extract note GUIDs from an .apkg archive, grouped by model ID.

    An .apkg is a zip file; the notes live in the SQLite database stored
    under the member name ``collection.anki2``.

    Args:
        apkg_path: Path to the .apkg file to inspect.

    Returns:
        Mapping of model ID (``notes.mid``) to the set of note GUIDs
        (``notes.guid``) that use that model. Empty if the deck has no notes.

    Raises:
        KeyError: The archive has no ``collection.anki2`` member.
        sqlite3.Error: The database cannot be read or has no ``notes`` table.
    """
    by_model: dict[int, set[str]] = {}
    with zipfile.ZipFile(apkg_path) as z, tempfile.TemporaryDirectory() as td:
        # Only the collection database is needed; unpacking the whole archive
        # would also copy every bundled media file to disk for nothing.
        db_path = z.extract("collection.anki2", td)
        conn = sqlite3.connect(db_path)
        try:
            for guid, mid in conn.execute("SELECT guid, mid FROM notes"):
                by_model.setdefault(mid, set()).add(guid)
        finally:
            # Close even when the query fails so the temp directory can be
            # removed (an open handle blocks deletion on some platforms).
            conn.close()
    return by_model
|
||||||
|
|
||||||
|
|
||||||
|
def collect_words_json_guids(data: dict[str, Any]) -> dict[str, set[str]]:
    """Gather every GUID stored in words.json, bucketed by deck type.

    Args:
        data: The parsed words.json mapping; only its values (the per-word
            entries) are inspected.

    Returns:
        A dict with the keys ``vocab``, ``cloze``, ``conjugation``,
        ``plurals`` and ``confusables``, each holding the set of GUIDs found
        for that deck type. Missing or empty fields contribute nothing.
    """
    collected: dict[str, set[str]] = {
        "vocab": set(),
        "cloze": set(),
        "conjugation": set(),
        "plurals": set(),
        "confusables": set(),
    }

    for record in data.values():
        # Vocab note GUID lives at the top level of the entry.
        vocab_guid = record.get("vocab_legacy_guid")
        if vocab_guid:
            collected["vocab"].add(vocab_guid)

        # Cloze GUID is nested at examples.cloze.cloze_guid.
        cloze_info = (record.get("examples") or {}).get("cloze") or {}
        cloze_guid = cloze_info.get("cloze_guid")
        if cloze_guid:
            collected["cloze"].add(cloze_guid)

        # Plurals GUID is nested at noun_inflection.plurals_guid.
        plural_guid = (record.get("noun_inflection") or {}).get("plurals_guid")
        if plural_guid:
            collected["plurals"].add(plural_guid)

        # Confusables GUID lives at the top level of the entry.
        confusable_guid = record.get("confusables_guid")
        if confusable_guid:
            collected["confusables"].add(confusable_guid)

        # Each conjugation form carries a primary guid and, optionally, a
        # list of candidate GUIDs; all of them count for the conjugation deck.
        conjugation = record.get("conjugation") or {}
        for list_key in ("active_forms", "hufal_pual_forms"):
            for form in conjugation.get(list_key) or ():
                form_guid = form.get("guid")
                if form_guid:
                    collected["conjugation"].add(form_guid)
                candidates = form.get("guid_candidates")
                if candidates:
                    collected["conjugation"].update(candidates)

    return collected
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Compare note GUIDs in a released .apkg against those stored in words.json.

    Parses ``--apkg`` (default: ``DEFAULT_APKG``) and ``--verbose``/``-v``,
    prints a per-deck PASS/FAIL report and always terminates via ``sys.exit``:

    * ``0`` - every GUID of every known model in the apkg exists in words.json
    * ``1`` - at least one apkg GUID is missing from words.json
    * ``2`` - the apkg file or words.json does not exist

    GUIDs that exist only in words.json ("extra") are reported but never fail
    the check. Notes whose model ID is not in ``MODEL_IDS`` are only warned
    about; they are not compared.
    """
    parser = argparse.ArgumentParser(description="Check GUID coverage between .apkg and words.json")
    parser.add_argument(
        "--apkg",
        type=Path,
        default=DEFAULT_APKG,
        help=f"Path to .apkg file (default: {DEFAULT_APKG})",
    )
    parser.add_argument("--verbose", "-v", action="store_true")
    args = parser.parse_args()

    if not args.apkg.exists():
        print(f"ERROR: apkg not found: {args.apkg}")
        sys.exit(2)
    if not WORDS_JSON.exists():
        print(f"ERROR: words.json not found: {WORDS_JSON}")
        sys.exit(2)

    print(f"Checking: {args.apkg}")
    print(f"Against: {WORDS_JSON}")
    print()

    apkg_by_model = extract_apkg_guids(args.apkg)
    # Use a context manager so the file handle is closed deterministically.
    with WORDS_JSON.open(encoding="utf-8") as f:
        data = json.load(f)
    wj = collect_words_json_guids(data)

    total_apkg = sum(len(s) for s in apkg_by_model.values())
    total_wj = sum(len(s) for s in wj.values())
    print(f"Total GUIDs in apkg: {total_apkg}")
    print(f"Total GUIDs in words.json: {total_wj}")
    print()

    # words.json GUID set to compare against for each apkg model.
    # Vocab notes cover both vocab cards (ord 0,1) and cloze (ord 2); they
    # share the note GUID, so both words.json sets are valid matches.
    wj_by_deck = {
        "vocab": wj["vocab"] | wj["cloze"],
        "conjugation": wj["conjugation"],
        "plurals": wj["plurals"],
        "confusables": wj["confusables"],
    }

    all_missing = 0

    for mid, deck_name in MODEL_IDS.items():
        apkg_set = apkg_by_model.get(mid, set())
        wj_set = wj_by_deck.get(deck_name, set())

        missing = apkg_set - wj_set
        extra = wj_set - apkg_set
        matched = apkg_set & wj_set
        all_missing += len(missing)

        # Only GUIDs missing from words.json fail a deck; extras are informational.
        status = "PASS" if not missing else "FAIL"
        print(f" {status} {deck_name} (mid={mid})")
        print(
            f" apkg={len(apkg_set)}, words.json={len(wj_set)}, "
            f"matched={len(matched)}, missing={len(missing)}, extra={len(extra)}"
        )

        if missing and args.verbose:
            # Show at most the first 20 (sorted) to keep the report readable.
            print(" Missing GUIDs (in apkg, not in words.json):")
            for g in sorted(missing)[:20]:
                print(f" {g!r}")
            if len(missing) > 20:
                print(f" ... ({len(missing) - 20} more)")

        if extra and args.verbose:
            # Show at most the first 10 (sorted).
            print(" Extra GUIDs (in words.json, not in apkg):")
            for g in sorted(extra)[:10]:
                print(f" {g!r}")
            if len(extra) > 10:
                print(f" ... ({len(extra) - 10} more)")

        print()

    # Check for unknown model IDs in apkg
    unknown_mids = set(apkg_by_model.keys()) - set(MODEL_IDS.keys())
    if unknown_mids:
        print(f" WARNING: Unknown model IDs in apkg: {unknown_mids}")
        for mid in unknown_mids:
            print(f" mid={mid}: {len(apkg_by_model[mid])} notes")

    print("─" * 60)
    if all_missing:
        print(f" FAILED: {all_missing} apkg GUIDs not found in words.json")
        print(" (These notes would lose study progress on reimport)")
        sys.exit(1)

    # Notes of unknown models were never compared, so they must not be
    # counted as verified; this equals total_apkg when no unknown models exist.
    checked = sum(len(apkg_by_model.get(mid, ())) for mid in MODEL_IDS)
    print(f" All {checked} apkg GUIDs accounted for in words.json.")
    sys.exit(0)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
1041
scripts/migrate_to_json.py
Normal file
1041
scripts/migrate_to_json.py
Normal file
File diff suppressed because it is too large
Load diff
420
scripts/repair_slugs.py
Normal file
420
scripts/repair_slugs.py
Normal file
|
|
@ -0,0 +1,420 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Repair duplicate slugs in data/words.json.
|
||||||
|
|
||||||
|
Homographs (words with identical spelling but different meanings) were
|
||||||
|
assigned the same slug by the scraper. This script fetches the pealim.com
|
||||||
|
search page for each affected word, matches entries by meaning (and nikkud),
|
||||||
|
and writes the corrected slugs back to words.json and the source CSV.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/repair_slugs.py [--dry-run]
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from collections import defaultdict
|
||||||
|
from difflib import SequenceMatcher
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pandas as pd
|
||||||
|
import requests
|
||||||
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Paths
# ---------------------------------------------------------------------------
# Repository root: two levels up from this file's resolved path.
PROJECT_ROOT = Path(__file__).resolve().parent.parent
# Unified data store whose duplicate slugs are repaired.
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
# Source CSV whose 'slug' column is kept in sync by update_csv().
CSV_PATH = PROJECT_ROOT / "data" / "hebrew_dict_for_anki.csv"

# ---------------------------------------------------------------------------
# HTTP session
# ---------------------------------------------------------------------------
# One shared session so every request carries the same User-Agent header.
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-scraper/1.0)"})
# Cookies sent with every search request.
# NOTE(review): presumably these disable transliteration and select the Hebrew
# display style on pealim.com — confirm against the site.
COOKIES: dict[str, str] = {"translit": "none", "hebstyle": "mo"}
REQUEST_DELAY = 1.5  # seconds between requests
REQUEST_TIMEOUT = 15  # seconds

# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Configures the root logger at import time; main() raises the level to DEBUG
# when --verbose is given.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# Similarity helpers
# ---------------------------------------------------------------------------
# Minimum match score (see _best_match) a candidate needs before repair_group()
# accepts its slug; anything lower is logged as SKIP.
FUZZY_THRESHOLD = 0.4
|
||||||
|
|
||||||
|
|
||||||
|
def _similarity(a: str, b: str) -> float:
    """Compute a case-insensitive similarity ratio for two strings.

    Both inputs are lowercased and compared with ``difflib.SequenceMatcher``;
    the result ranges from 0.0 (nothing in common) to 1.0 (identical).
    """
    left, right = a.lower(), b.lower()
    matcher = SequenceMatcher(None, left, right)
    return matcher.ratio()
|
||||||
|
|
||||||
|
|
||||||
|
def _best_match(
    our_meaning: str,
    candidates: list[dict],
    our_nikkud: str,
) -> tuple[dict | None, float]:
    """
    Pick the candidate whose ``meaning`` is most similar to *our_meaning*.

    Each candidate is scored with ``_similarity``.  A candidate whose ``word``
    equals a non-empty *our_nikkud* gets a 0.05 bonus (capped at 1.0) so the
    correct homograph wins when meanings score almost the same.  On equal
    scores the earliest candidate is kept.

    Returns (best_candidate, score); ``(None, -1.0)`` when *candidates* is empty.
    """
    winner: dict | None = None
    top_score = -1.0

    for candidate in candidates:
        score = _similarity(our_meaning, candidate["meaning"])
        # The nikkud bonus only applies when we actually have a nikkud to compare.
        nikkud_matches = bool(our_nikkud) and candidate["word"] == our_nikkud
        if nikkud_matches:
            score = min(1.0, score + 0.05)
        # Strict comparison: a later candidate must beat, not tie, the leader.
        if score > top_score:
            winner, top_score = candidate, score

    return winner, top_score
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Search-page parser
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def _parse_search_results(html: bytes) -> list[dict]:
    """
    Extract the result entries from a pealim.com search results page.

    Every ``div.verb-search-result`` block is expected to hold:
      - div.verb-search-data > a[href]       → slug (taken from ``/dict/<slug>/``)
      - div.verb-search-lemma > span.menukad → nikkud word
      - div.verb-search-binyan               → part of speech
      - div.verb-search-meaning              → meaning text

    Blocks without a data div or without a recognisable slug are dropped.

    Returns a list of dicts with keys: slug, word, pos, meaning.
    """
    soup = BeautifulSoup(html, "html.parser")
    parsed: list[dict] = []

    for block in soup.find_all("div", class_="verb-search-result"):
        data_div = block.find("div", class_="verb-search-data")
        if not data_div:
            continue

        # The slug comes from the detail-page link; a block without one is useless.
        link = data_div.find("a", href=True)
        slug_match = re.search(r"/dict/([^/#]+)/", link["href"]) if link else None
        if not slug_match:
            continue

        # Nikkud word: prefer the span.menukad, fall back to the whole lemma div.
        word = ""
        lemma_div = block.find("div", class_="verb-search-lemma")
        if lemma_div:
            menukad = lemma_div.find("span", class_="menukad")
            word = (menukad or lemma_div).get_text(strip=True)

        # Part of speech, with the label prefix removed.
        pos = ""
        pos_div = block.find("div", class_="verb-search-binyan")
        if pos_div:
            pos = pos_div.get_text(strip=True).replace("Part of speech:", "").strip()

        # Meaning text.
        meaning = ""
        meaning_div = block.find("div", class_="verb-search-meaning")
        if meaning_div:
            meaning = meaning_div.get_text(strip=True)

        parsed.append({"slug": slug_match.group(1), "word": word, "pos": pos, "meaning": meaning})

    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
def _fetch_search_results(ktiv_male: str) -> list[dict]:
    """Fetch and parse the pealim.com search results for one spelling.

    Args:
        ktiv_male: The entry's ``word.ktiv_male`` spelling, used as the search query.

    Returns:
        The result dicts produced by ``_parse_search_results``.

    Raises:
        requests.RequestException: On connection problems, timeouts, or a
            non-success HTTP status (via ``raise_for_status``).
    """
    url = "https://www.pealim.com/search/"
    logger.debug("GET %s?q=%s", url, ktiv_male)
    # Hand the query to requests via ``params`` so it is percent-encoded.
    # Interpolating the raw word into the URL breaks as soon as it contains a
    # character with URL meaning such as '&', '#' or '+'.
    resp = SESSION.get(
        url,
        params={"q": ktiv_male},
        cookies=COOKIES,
        timeout=REQUEST_TIMEOUT,
    )
    resp.raise_for_status()
    return _parse_search_results(resp.content)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Core logic
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def find_duplicate_groups(data: dict) -> dict[str, list[str]]:
    """
    Group words.json keys by slug and keep only the slugs used more than once.

    Entries with a missing or empty ``slug`` are ignored.  Keys appear in the
    order they occur in *data*.

    Returns a mapping slug → [word_key, ...] for every slug shared by 2+ entries.
    """
    keys_by_slug: dict[str, list[str]] = defaultdict(list)
    for word_key, entry in data.items():
        entry_slug = entry.get("slug", "")
        if not entry_slug:
            continue
        keys_by_slug[entry_slug].append(word_key)

    duplicates: dict[str, list[str]] = {}
    for shared_slug, word_keys in keys_by_slug.items():
        if len(word_keys) >= 2:
            duplicates[shared_slug] = word_keys
    return duplicates
|
||||||
|
|
||||||
|
|
||||||
|
def repair_group(
    slug: str,
    keys: list[str],
    data: dict,
    dry_run: bool,
) -> tuple[int, int]:
    """
    Attempt to repair one group of entries sharing *slug*.

    Homographs can have different ktiv_male spellings (e.g. אבידה vs אבדה for
    the two spellings of אֲבֵדָה). We therefore build a union of all search
    results obtained by querying each distinct ktiv_male in the group.

    Args:
        slug: The slug currently shared by the group (only used in log output).
        keys: Top-level words.json keys of the entries sharing *slug*.
        data: The loaded words.json mapping. Matched entries get their
            ``slug`` rewritten in place unless *dry_run* is set.
        dry_run: When True, log what would change but leave *data* untouched.

    Returns (fixed_count, skipped_count).  An entry whose best match is the
    slug it already has counts as fixed, and matches are counted as fixed in
    dry-run mode too.  When no usable search results are found, every entry
    of the group counts as skipped.
    """
    # Collect distinct ktiv_male values across the group (usually one, but
    # sometimes two when homographs have different consonant spellings).
    ktiv_to_keys: dict[str, list[str]] = defaultdict(list)
    for k in keys:
        ktiv = data[k]["word"]["ktiv_male"]
        ktiv_to_keys[ktiv].append(k)

    # The first entry's nikkud is used purely as a label for the log line.
    nikkud_word = data[keys[0]]["word"]["nikkud"]
    logger.info(
        " Fetching search results for %s — %d entries share slug %s",
        nikkud_word,
        len(keys),
        slug,
    )

    # Fetch search results for every distinct ktiv_male and merge
    all_candidates: list[dict] = []
    seen_slugs: set[str] = set()  # de-duplicates candidates returned by several queries
    for ktiv in ktiv_to_keys:
        try:
            results = _fetch_search_results(ktiv)
        except requests.RequestException as exc:
            # Best effort: a failed query contributes no candidates.
            logger.warning(" HTTP error for %s: %s", ktiv, exc)
            results = []
        for r in results:
            if r["slug"] not in seen_slugs:
                seen_slugs.add(r["slug"])
                all_candidates.append(r)
        if len(ktiv_to_keys) > 1:
            # Small delay between sub-queries within the same group
            time.sleep(REQUEST_DELAY)

    if not all_candidates:
        logger.warning(" No search results — skipping group")
        return 0, len(keys)

    # Filter candidates to those whose nikkud word matches the entry's nikkud.
    # This avoids accidentally matching a completely different word that shares
    # the same consonant spelling (e.g. different voweling entirely).
    group_nikkuds = {data[k]["word"]["nikkud"] for k in keys}
    filtered = [c for c in all_candidates if c["word"] in group_nikkuds]

    if not filtered:
        logger.warning(
            " Search results don't contain nikkud %s — candidates: %s — skipping",
            group_nikkuds,
            [c["word"] for c in all_candidates],
        )
        return 0, len(keys)

    fixed = 0
    skipped = 0

    # NOTE(review): every entry picks its best candidate independently, so two
    # entries of the group can still end up with the same slug.
    for key in keys:
        entry = data[key]
        our_meaning = entry.get("meaning", "")
        our_nikkud = entry["word"]["nikkud"]

        # Only consider candidates that match this entry's nikkud
        nikkud_filtered = [c for c in filtered if c["word"] == our_nikkud]
        # Fall back to the group-wide pool when nothing matches this nikkud exactly.
        pool = nikkud_filtered if nikkud_filtered else filtered

        best, score = _best_match(our_meaning, pool, our_nikkud)

        # Reject weak matches rather than risk assigning a wrong slug.
        if best is None or score < FUZZY_THRESHOLD:
            logger.warning(
                " SKIP key=%s | meaning=%r | best_score=%.2f",
                key,
                our_meaning,
                score,
            )
            skipped += 1
            continue

        new_slug = best["slug"]
        old_slug = entry["slug"]

        # The entry already carries the matched slug: nothing to rewrite,
        # but it is still counted as resolved.
        if new_slug == old_slug:
            logger.info(" SAME key=%s | slug=%s (score=%.2f)", key, old_slug, score)
            fixed += 1
            continue

        logger.info(
            " FIX key=%s | %s → %s | matched=%r (score=%.2f)",
            key,
            old_slug,
            new_slug,
            best["meaning"],
            score,
        )

        if not dry_run:
            data[key]["slug"] = new_slug

        # Counted in dry-run mode as well, so the summary reflects what would change.
        fixed += 1

    return fixed, skipped
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# CSV update
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def update_csv(data: dict, dry_run: bool) -> None:
    """
    Re-write the CSV so every row's slug column matches words.json.

    The CSV is semicolon-delimited; the slug column is named 'slug'.
    We match rows by 'Word Without Nikkud' (ktiv_male) AND 'Meaning' because
    homographs share the same ktiv_male.

    Args:
        data: The words.json mapping (already repaired unless in dry-run mode).
        dry_run: When True, only log the rows that would change; the CSV is
            never written.

    The file is rewritten only when at least one slug differs.  If the CSV
    has no 'slug' column a warning is logged and nothing else happens.
    """
    df = pd.read_csv(CSV_PATH, sep=";", dtype=str)

    if "slug" not in df.columns:
        logger.warning("CSV has no 'slug' column — skipping CSV update")
        return

    # Build a lookup: (ktiv_male, meaning) → new_slug from words.json
    lookup: dict[tuple[str, str], str] = {}
    for entry in data.values():
        ktiv = entry["word"].get("ktiv_male", "")
        meaning = entry.get("meaning", "")
        slug = entry.get("slug", "")
        if ktiv and slug:
            lookup[(ktiv, meaning)] = slug

    changes = 0
    for idx, row in df.iterrows():
        ktiv = str(row.get("Word Without Nikkud", "")).strip()
        meaning = str(row.get("Meaning", "")).strip()
        new_slug = lookup.get((ktiv, meaning))
        if new_slug is None:
            # No words.json entry for this row — leave it untouched.
            continue
        old_slug = str(row["slug"]).strip()
        if new_slug == old_slug:
            continue
        logger.info(
            " CSV row %d: %s → %s (%s)",
            idx,
            old_slug,
            new_slug,
            ktiv,
        )
        if not dry_run:
            df.at[idx, "slug"] = new_slug
        changes += 1

    logger.info("CSV: %d slug(s) to update", changes)
    if not dry_run and changes:
        # index=False: the frame was read without an index column, so writing
        # the automatically generated row index would prepend a new unnamed
        # column to the file on every run.
        df.to_csv(CSV_PATH, sep=";", index=False)
        logger.info("CSV written to %s", CSV_PATH)
    elif dry_run:
        logger.info("DRY-RUN: CSV not written")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Main
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
def main(argv: list[str] | None = None) -> int:
    """Command-line entry point for the slug repair.

    Loads words.json, repairs every duplicate-slug group via ``repair_group``
    (pausing ``REQUEST_DELAY`` seconds between groups), writes words.json back
    unless ``--dry-run`` is given, and finally syncs the CSV via ``update_csv``.

    Args:
        argv: Argument list to parse; ``None`` lets argparse read ``sys.argv``.

    Returns:
        0 when no entry was skipped, 1 otherwise.
    """
    parser = argparse.ArgumentParser(description="Repair duplicate slugs in data/words.json")
    parser.add_argument("--dry-run", action="store_true", help="Preview changes without writing any files")
    parser.add_argument("--verbose", "-v", action="store_true", help="Enable debug logging")
    args = parser.parse_args(argv)
    dry_run: bool = args.dry_run

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)
    if dry_run:
        logger.info("=== DRY-RUN mode — no files will be modified ===")

    # Load data
    logger.info("Loading %s", WORDS_JSON)
    with WORDS_JSON.open(encoding="utf-8") as fh:
        data: dict = json.load(fh)
    logger.info("Loaded %d entries", len(data))

    # Identify duplicate groups
    groups = find_duplicate_groups(data)
    total_groups = len(groups)
    total_entries = sum(map(len, groups.values()))
    logger.info("Found %d duplicate-slug groups covering %d entries", total_groups, total_entries)

    # Process each group in slug order
    total_fixed = total_skipped = 0
    for position, (slug, keys) in enumerate(sorted(groups.items()), start=1):
        logger.info("[%d/%d] slug=%s (%d entries)", position, total_groups, slug, len(keys))
        fixed, skipped = repair_group(slug, keys, data, dry_run=dry_run)
        total_fixed += fixed
        total_skipped += skipped

        # Respectful delay between HTTP requests; pointless after the last group.
        if position != total_groups:
            time.sleep(REQUEST_DELAY)

    logger.info(
        "Summary: %d fixed, %d skipped (out of %d entries in %d groups)",
        total_fixed,
        total_skipped,
        total_entries,
        total_groups,
    )

    # Write updated words.json
    if dry_run:
        logger.info("DRY-RUN: words.json not written")
    else:
        logger.info("Writing %s", WORDS_JSON)
        with WORDS_JSON.open("w", encoding="utf-8") as fh:
            json.dump(data, fh, ensure_ascii=False, indent=2)
        logger.info("words.json written")

    # Update CSV
    logger.info("Updating CSV %s", CSV_PATH)
    update_csv(data, dry_run=dry_run)

    return 1 if total_skipped else 0
|
||||||
|
|
||||||
|
|
||||||
|
# Script entry point: main()'s return value becomes the process exit status
# (non-zero when at least one entry was skipped).
if __name__ == "__main__":
    sys.exit(main())
|
||||||
800
scripts/validate_data.py
Normal file
800
scripts/validate_data.py
Normal file
|
|
@ -0,0 +1,800 @@
|
||||||
|
"""Standalone integrity validator for data/words.json.
|
||||||
|
|
||||||
|
Validates the unified Hebrew Flash Cards data against the schema defined in
|
||||||
|
SCHEMA.yaml. Each test prints PASS/FAIL with details on failures.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python3 scripts/validate_data.py
|
||||||
|
python3 scripts/validate_data.py --verbose
|
||||||
|
python3 scripts/validate_data.py --test confusable_symmetric
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import unicodedata
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Bootstrap: make project root importable so helpers.py is accessible
# ---------------------------------------------------------------------------
sys.path.insert(0, str(Path(__file__).parent.parent))

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# The data file validated by every test: <parent of this script's directory>/data/words.json.
DATA_FILE = Path(__file__).parent.parent / "data" / "words.json"

# Inclusive codepoint bounds checked by _is_hebrew_consonant().
HEBREW_CONSONANT_RANGE = (0x05D0, 0x05EA)  # alef–tav

# NOTE(review): presumably the person/gender/number codes accepted in inflection
# tables — the code that consumes this set is outside this view; confirm.
VALID_PERSON_CODES: frozenset[str] = frozenset(
    ["1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp", "3p", "ms", "fs", "mp", "fp"]
)

# Single character class spanning several emoji codepoint ranges; used by
# test_no_emoji_in_meaning() to reject emoji in the 'meaning' field.
EMOJI_RE = re.compile(
    r"[\U0001f600-\U0001f64f"
    r"\U0001f300-\U0001f5ff"
    r"\U0001f680-\U0001f6ff"
    r"\U0001f1e0-\U0001f1ff"
    r"\U00002702-\U000027b0"
    r"\U0001f900-\U0001f9ff"
    r"\U0001fa00-\U0001fa6f"
    r"\U0001fa70-\U0001faff]"
)

# ---------------------------------------------------------------------------
# Result tracking
# ---------------------------------------------------------------------------
_failures: list[str] = []  # names of the tests that reported FAIL (appended by _fail)
_warnings: list[str] = []  # detail lines collected from WARN reports (extended by _warn)
_verbose: bool = False  # when True, reports are not truncated and _verbose_print() emits output
|
||||||
|
|
||||||
|
|
||||||
|
def _pass(name: str) -> None:
    """Report a passing test by printing its name behind a PASS tag."""
    line = f" PASS {name}"
    print(line)
|
||||||
|
|
||||||
|
|
||||||
|
def _fail(name: str, details: list[str]) -> None:
    """Record a failed test and print it with its detail lines.

    The test name is appended to the module-level ``_failures`` list; the
    FAIL header is printed first, followed by one line per entry in *details*.
    """
    _failures.append(name)
    print(f" FAIL {name}")
    lines = [f" {d}" for d in details]
    if lines:
        print("\n".join(lines))
|
||||||
|
|
||||||
|
|
||||||
|
def _warn(name: str, details: list[str]) -> None:
    """Record a non-fatal finding and print it with its detail lines.

    All *details* are added to the module-level ``_warnings`` list; the WARN
    header is printed first, followed by one line per detail.
    """
    _warnings.extend(details)
    print(f" WARN {name}")
    lines = [f" {d}" for d in details]
    if lines:
        print("\n".join(lines))
|
||||||
|
|
||||||
|
|
||||||
|
def _verbose_print(msg: str) -> None:
    """Print *msg* on its own line, but only when verbose mode is enabled."""
    if not _verbose:
        return
    print(f" {msg}")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helper: load data
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def load_data() -> dict[str, Any]:
    """Read ``DATA_FILE`` (words.json) and return the parsed JSON object.

    If the file does not exist an error is printed and the process exits
    with status 2.
    """
    if DATA_FILE.exists():
        with DATA_FILE.open(encoding="utf-8") as fh:
            return json.load(fh)

    print(f"ERROR: data file not found: {DATA_FILE}")
    sys.exit(2)
|
||||||
|
|
||||||
|
|
||||||
|
def _is_hebrew_consonant(ch: str) -> bool:
    """Return True if ch is a Hebrew consonant (U+05D0..U+05EA).

    Accepts multi-codepoint strings like 'שׁ' (shin + shin dot) by checking
    only the first base character after NFD decomposition.

    An empty string is not a consonant and yields False instead of raising
    IndexError.
    """
    if not ch:
        return False
    # After NFD the first codepoint is the base consonant; the rest are combining marks.
    base = unicodedata.normalize("NFD", ch)[0]
    low, high = HEBREW_CONSONANT_RANGE
    return low <= ord(base) <= high
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Individual tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def test_required_fields(data: dict[str, Any]) -> None:
    """Check that every entry has word.nikkud, word.ktiv_male, slug, pos and meaning.

    A missing or null ``frequency`` is not an error; it is reported as a
    separate ``frequency_missing`` warning.  Both reports are capped at 20
    lines unless verbose mode is on.
    """
    name = "required_fields"
    errors: list[str] = []
    warn_details: list[str] = []

    for key, entry in data.items():
        word = entry.get("word")
        if isinstance(word, dict):
            if not word.get("nikkud"):
                errors.append(f"[{key}] word.nikkud is missing or empty")
            if not word.get("ktiv_male"):
                errors.append(f"[{key}] word.ktiv_male is missing or empty")
        else:
            errors.append(f"[{key}] 'word' is missing or not a dict")

        if not entry.get("slug"):
            errors.append(f"[{key}] 'slug' is missing or empty")
        if not entry.get("pos"):
            errors.append(f"[{key}] 'pos' is missing or empty")
        if not entry.get("meaning"):
            errors.append(f"[{key}] 'meaning' is missing or empty")

        if entry.get("frequency") is None:
            warn_details.append(f"[{key}] 'frequency' is null/missing")

    # Warnings are reported first and never affect the PASS/FAIL outcome.
    if warn_details:
        warn_truncated = len(warn_details) > 20 and not _verbose
        _warn("frequency_missing", warn_details[:20] if warn_truncated else warn_details)
        if warn_truncated:
            print(f" ... ({len(warn_details) - 20} more; use --verbose)")

    if not errors:
        _pass(name)
        return

    truncated = len(errors) > 20 and not _verbose
    _fail(name, errors[:20] if truncated else errors)
    if truncated:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_root_format(data: dict[str, Any]) -> None:
    """Check that ``root`` is an empty list or a list of 2-5 Hebrew consonants.

    At most one error is recorded per entry: a missing key, a non-list value,
    a bad length, or the first element that is not a Hebrew consonant.
    """
    name = "root_format"
    errors: list[str] = []

    for key, entry in data.items():
        root = entry.get("root")
        if root is None:
            errors.append(f"[{key}] 'root' key is absent (should be [] for rootless words)")
        elif not isinstance(root, list):
            errors.append(f"[{key}] 'root' is not a list: {root!r}")
        elif not root:
            continue  # rootless word — valid
        elif len(root) < 2 or len(root) > 5:
            errors.append(f"[{key}] root has {len(root)} elements (expected 2-5): {root!r}")
        else:
            for ch in root:
                # A root element may be multi-codepoint (e.g. 'שׁ' = shin + shin dot),
                # so validity is decided by its base consonant.
                if isinstance(ch, str) and ch and _is_hebrew_consonant(ch):
                    continue
                errors.append(f"[{key}] root char {ch!r} is not a Hebrew consonant (U+05D0..U+05EA)")
                break

    if not errors:
        _pass(name)
        return

    truncated = len(errors) > 20 and not _verbose
    _fail(name, errors[:20] if truncated else errors)
    if truncated:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_unique_slugs(data: dict[str, Any]) -> None:
    """Check that no non-empty slug is used by more than one entry.

    Each pealim page is a distinct word, so a shared slug means two entries
    point at the same page.  Entries without a slug are ignored.
    """
    name = "unique_slugs"
    keys_by_slug: dict[str, list[str]] = {}

    for key, entry in data.items():
        slug = entry.get("slug")
        if not slug:
            continue
        if slug in keys_by_slug:
            keys_by_slug[slug].append(key)
        else:
            keys_by_slug[slug] = [key]

    errors = [f"slug={slug!r} shared by: {keys}" for slug, keys in keys_by_slug.items() if len(keys) > 1]

    if not errors:
        _pass(name)
        return

    truncated = len(errors) > 20 and not _verbose
    _fail(name, errors[:20] if truncated else errors)
    if truncated:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_duplicate_keys(_data: dict[str, Any]) -> None:  # noqa: ARG001
    """Check that the raw JSON file contains no repeated object keys.

    ``json.load`` silently keeps the last value when a key occurs twice, so
    the already-parsed ``_data`` cannot reveal the problem.  The file is
    therefore parsed again with an ``object_pairs_hook`` that sees every
    key/value pair before it is merged into a dict.  The hook runs for every
    JSON object in the file, so repeated keys inside nested objects are
    reported as well.
    """
    name = "no_duplicate_keys"
    duplicates: list[str] = []

    def _detect_dups(pairs: list[tuple[str, Any]]) -> dict[str, Any]:
        # Rebuild the object the way json would, noting each key seen before.
        merged: dict[str, Any] = {}
        for k, v in pairs:
            if k in merged:
                duplicates.append(k)
            merged[k] = v
        return merged

    with DATA_FILE.open(encoding="utf-8") as fh:
        json.load(fh, object_pairs_hook=_detect_dups)

    if not duplicates:
        _pass(name)
        return
    _fail(name, [f"duplicate key: {k!r}" for k in duplicates])
|
||||||
|
|
||||||
|
|
||||||
|
def test_confusable_symmetric(data: dict[str, Any]) -> None:
    """Check that ``confusable_group`` links are mutual and point at real entries.

    If A lists B, then B must exist as a top-level key and list A in return.
    """
    name = "confusable_symmetric"
    errors: list[str] = []

    for key, entry in data.items():
        # A missing or empty group simply yields nothing to check.
        for other_key in entry.get("confusable_group") or ():
            other = data.get(other_key)
            if other is None:
                errors.append(f"[{key}] confusable_group references non-existent key {other_key!r}")
            elif key not in (other.get("confusable_group") or []):
                errors.append(f"[{key}] lists {other_key!r} as confusable, but {other_key!r} does not list {key!r}")

    if not errors:
        _pass(name)
        return

    truncated = len(errors) > 20 and not _verbose
    _fail(name, errors[:20] if truncated else errors)
    if truncated:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_shared_roots_valid_keys(data: dict[str, Any]) -> None:
    """Check that every key listed in ``shared_roots`` exists as a top-level key."""
    name = "shared_roots_valid_keys"

    # Entries with a missing or empty shared_roots contribute nothing.
    errors: list[str] = [
        f"[{key}] shared_roots references non-existent key {ref_key!r}"
        for key, entry in data.items()
        for ref_key in (entry.get("shared_roots") or ())
        if ref_key not in data
    ]

    if not errors:
        _pass(name)
        return

    truncated = len(errors) > 20 and not _verbose
    _fail(name, errors[:20] if truncated else errors)
    if truncated:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_unique_legacy_guids(data: dict[str, Any]) -> None:
    """Check that a non-null ``vocab_legacy_guid`` is not shared across different words.

    Sharing is tolerated when every entry carrying the GUID has the same
    ``word.nikkud`` (PoS homographs like חַד Particle vs Adjective): legacy
    GUIDs were generated from the nikkud word alone, so such entries inherited
    the same legacy Anki card.  Tolerated cases are only mentioned in verbose
    mode; a GUID shared by entries with different nikkud is an error.
    """
    name = "unique_legacy_guids"
    seen: dict[str, list[str]] = {}

    for key, entry in data.items():
        guid = entry.get("vocab_legacy_guid")
        if not guid:
            continue
        if guid in seen:
            seen[guid].append(key)
        else:
            seen[guid] = [key]

    errors: list[str] = []
    for guid, keys in seen.items():
        if len(keys) < 2:
            continue
        nikkud_values = {(data[k].get("word") or {}).get("nikkud") for k in keys}
        if len(nikkud_values) != 1:
            errors.append(f"guid={guid!r} shared by entries with DIFFERENT nikkud: {keys}")
            continue
        # Same nikkud -> inherited from same legacy card; tolerable
        _verbose_print(
            f"GUID {guid!r} shared by {len(keys)} entries with same nikkud ({next(iter(nikkud_values))!r}): {keys}"
        )

    if not errors:
        _pass(name)
        return

    truncated = len(errors) > 20 and not _verbose
    _fail(name, errors[:20] if truncated else errors)
    if truncated:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_noun_inflection_on_non_nouns(data: dict[str, Any]) -> None:
    """Check that ``noun_inflection`` is null whenever ``pos`` doesn't start with 'Noun'.

    Explicit test case: 'גָּבוֹהַּ' (adjective) must NOT have noun_inflection.
    """
    name = "no_noun_inflection_on_non_nouns"
    errors: list[str] = []

    for key, entry in data.items():
        pos = entry.get("pos") or ""
        # Nouns may carry an inflection table; non-nouns are fine while it is null.
        if pos.startswith("Noun") or entry.get("noun_inflection") is None:
            continue
        errors.append(f"[{key}] pos={pos!r} but noun_inflection is set")
        _verbose_print(f"offending entry: {key!r}")

    if not errors:
        _pass(name)
        return

    truncated = len(errors) > 20 and not _verbose
    _fail(name, errors[:20] if truncated else errors)
    if truncated:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_emoji_in_meaning(data: dict[str, Any]) -> None:
    """Check that no entry's ``meaning`` contains a character matched by ``EMOJI_RE``."""
    name = "no_emoji_in_meaning"

    # A missing or null meaning is treated as an empty string.
    meanings = ((key, entry.get("meaning") or "") for key, entry in data.items())
    errors: list[str] = [
        f"[{key}] meaning contains emoji: {meaning!r}" for key, meaning in meanings if EMOJI_RE.search(meaning)
    ]

    if not errors:
        _pass(name)
        return

    truncated = len(errors) > 20 and not _verbose
    _fail(name, errors[:20] if truncated else errors)
    if truncated:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_example_sentences_contain_word(data: dict[str, Any]) -> None:
    """Check that ``word.nikkud`` occurs in at least one of the entry's vetted sentences.

    Only entries with a non-empty ``examples.vetted`` list and a non-empty
    nikkud word are checked.  Matching is an exact substring test on the
    nikkud form, not on a stripped form.

    Findings are reported as a warning only: the test always ends with PASS.
    """
    name = "example_sentences_contain_word"
    errors: list[str] = []

    for key, entry in data.items():
        vetted = (entry.get("examples") or {}).get("vetted")
        if not vetted:
            continue

        nikkud_word = (entry.get("word") or {}).get("nikkud") or ""
        if not nikkud_word:
            continue

        for s in vetted:
            if nikkud_word in (s.get("text") or ""):
                break
        else:
            # No sentence contained the word: show the first two for context.
            sentences_preview = [s.get("text", "") for s in vetted[:2]]
            errors.append(
                f"[{key}] word {nikkud_word!r} not found in any vetted sentence. Sentences: {sentences_preview!r}"
            )

    if errors:
        truncated = len(errors) > 20 and not _verbose
        _warn(name, errors[:20] if truncated else errors)
        if truncated:
            print(f" ... ({len(errors) - 20} more; use --verbose)")
    _pass(name)
|
||||||
|
|
||||||
|
|
||||||
|
def test_cloze_offsets_valid(data: dict[str, Any]) -> None:
    """Check that ``cloze_word_start``/``cloze_word_end`` lie within the cloze text.

    For every entry with ``examples.cloze`` the offsets must be integers
    (booleans are rejected), non-negative, satisfy ``start < end``, and
    ``end`` must not exceed ``len(text)``. Violations are reported through
    ``_fail``.

    Null offsets are tolerated and reported separately through ``_warn``
    under the name ``cloze_offsets_valid_null_offsets``: some sentences
    contain only inflected/construct/plural forms that cannot be matched back
    to the base nikkud or ktiv_male — a data quality issue in
    vetted_sentences.json, not a schema violation.
    """
    name = "cloze_offsets_valid"
    errors: list[str] = []
    null_warn: list[str] = []

    def _is_offset(value: object) -> bool:
        # bool is a subclass of int, so a JSON true/false would otherwise be
        # accepted as an offset and silently treated as 1/0.
        return isinstance(value, int) and not isinstance(value, bool)

    for key, entry in data.items():
        examples = entry.get("examples")
        if not examples:
            continue
        cloze = examples.get("cloze")
        if not cloze:
            continue

        text = cloze.get("text") or ""
        start = cloze.get("cloze_word_start")
        end = cloze.get("cloze_word_end")

        if start is None or end is None:
            null_warn.append(f"[{key}] cloze present but cloze_word_start/end are null")
            continue

        text_len = len(text)
        if not (_is_offset(start) and _is_offset(end)):
            errors.append(f"[{key}] cloze_word_start/end are not integers: {start!r}, {end!r}")
            continue
        if start < 0 or end < 0:
            errors.append(f"[{key}] cloze offsets are negative: start={start}, end={end}")
            continue
        if start >= end:
            errors.append(f"[{key}] cloze start >= end: start={start}, end={end}")
            continue
        if end > text_len:
            errors.append(f"[{key}] cloze end={end} exceeds text length={text_len}: {text!r}")

    if null_warn:
        _warn(f"{name}_null_offsets", null_warn[:20] if not _verbose else null_warn)
        if len(null_warn) > 20 and not _verbose:
            print(f" ... ({len(null_warn) - 20} more; use --verbose)")

    if errors:
        _fail(name, errors[:20] if not _verbose else errors)
        if len(errors) > 20 and not _verbose:
            print(f" ... ({len(errors) - 20} more; use --verbose)")
    else:
        _pass(name)
|
||||||
|
|
||||||
|
|
||||||
|
def test_hufal_pual_only_on_hifil_piel(data: dict[str, Any]) -> None:
    """Check that ``hufal_pual_forms`` appears only on Hif'il or Pi'el verbs.

    An entry is inspected when its ``conjugation.hufal_pual_forms`` is not
    ``None``. Its ``binyan`` is accepted when the lower-cased value contains
    ``"hif"`` or ``"pi"``; anything else is reported through ``_fail``.
    """
    name = "hufal_pual_only_on_hifil_piel"
    accepted_markers = ("hif", "pi")
    errors: list[str] = []

    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj or conj.get("hufal_pual_forms") is None:
            continue

        binyan = conj.get("binyan") or ""
        lowered = binyan.lower()
        if not any(marker in lowered for marker in accepted_markers):
            errors.append(f"[{key}] hufal_pual_forms is set but binyan={binyan!r} (expected Hif'il or Pi'el)")

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    hidden = len(errors) - 20
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_confusable_group_shares_ktiv_male(data: dict[str, Any]) -> None:
    """Check that confusable-group members have the same ``word.ktiv_male``.

    For each entry with a ``confusable_group`` and a non-empty ``ktiv_male``,
    every listed member that exists in ``data`` and has its own ``ktiv_male``
    must carry the same value. Members missing from ``data`` are skipped
    here. Mismatches are reported through ``_fail``.
    """
    name = "confusable_group_shares_ktiv_male"
    errors: list[str] = []

    for key, entry in data.items():
        group = entry.get("confusable_group")
        if not group:
            continue

        own_ktiv = (entry.get("word") or {}).get("ktiv_male")
        if not own_ktiv:
            continue

        for member_key in group:
            member = data.get(member_key)
            if not member:
                continue  # already caught by confusable_symmetric
            member_ktiv = (member.get("word") or {}).get("ktiv_male")
            if member_ktiv and member_ktiv != own_ktiv:
                errors.append(
                    f"[{key}] ktiv_male={own_ktiv!r} but confusable member {member_key!r} has ktiv_male={member_ktiv!r}"
                )

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    hidden = len(errors) - 20
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_confusables_guid(data: dict[str, Any]) -> None:
    """Check that ``confusables_guid`` agrees with ``confusable_group``.

    Rules enforced:
      - An entry with a ``confusable_group`` must have a ``confusables_guid``.
      - An entry without a ``confusable_group`` must have ``confusables_guid``
        equal to ``None``.
      - Every group member present in ``data`` must carry the same
        ``confusables_guid`` as the entry that lists it.

    Violations are reported through ``_fail``.
    """
    name = "confusables_guid"
    errors: list[str] = []

    for key, entry in data.items():
        group = entry.get("confusable_group")
        guid = entry.get("confusables_guid")

        if not group:
            # No group: any non-None guid is an orphan.
            if guid is not None:
                errors.append(f"[{key}] has confusables_guid={guid!r} but confusable_group is null")
            continue

        if not guid:
            errors.append(f"[{key}] has confusable_group but confusables_guid is null/missing")
            continue

        for member_key in group:
            member = data.get(member_key)
            if not member:
                continue  # already caught by confusable_symmetric
            member_guid = member.get("confusables_guid")
            if member_guid != guid:
                errors.append(
                    f"[{key}] confusables_guid={guid!r} but confusable member "
                    f"{member_key!r} has confusables_guid={member_guid!r}"
                )

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    hidden = len(errors) - 20
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_conjugation_form_guids(data: dict[str, Any]) -> None:
    """Check GUID presence and per-verb uniqueness on conjugation forms.

    Rules enforced for every form in ``active_forms`` and
    ``hufal_pual_forms``:
      - The form needs a truthy ``guid`` or a non-empty ``guid_candidates``.
      - Within one verb (across both lists) no GUID may be seen twice. When a
        form has a ``guid`` only that value is tracked; its
        ``guid_candidates`` are consulted only when ``guid`` is absent.

    Violations are reported through ``_fail``.
    """
    name = "conjugation_form_guids"
    errors: list[str] = []

    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj:
            continue

        # Maps each GUID seen for this verb to the label of the form that claimed it.
        claimed: dict[str, str] = {}

        for form_list_key in ("active_forms", "hufal_pual_forms"):
            for form in conj.get(form_list_key) or ():
                label = f"{form_list_key}[{form.get('person', '?')}]"
                guid = form.get("guid")
                guid_candidates = form.get("guid_candidates")

                if not guid and not guid_candidates:
                    errors.append(f"[{key}] {label}: missing or null 'guid' and no 'guid_candidates'")
                    continue

                # A definite guid takes precedence over the candidate list.
                field = "guid" if guid else "guid_candidate"
                values = (guid,) if guid else guid_candidates
                for value in values:
                    if value in claimed:
                        errors.append(f"[{key}] {label}: {field}={value!r} duplicates {claimed[value]}")
                    else:
                        claimed[value] = label

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    hidden = len(errors) - 20
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_conjugation_person_codes(data: dict[str, Any]) -> None:
    """Check that every conjugation form's ``person`` is in ``VALID_PERSON_CODES``.

    Both ``active_forms`` and ``hufal_pual_forms`` are inspected. A form with
    no ``person`` key is looked up as ``None``. Invalid codes are reported
    through ``_fail``.
    """
    name = "conjugation_person_codes"
    errors: list[str] = []

    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj:
            continue

        for form_list_key in ("active_forms", "hufal_pual_forms"):
            for form in conj.get(form_list_key) or ():
                person = form.get("person")
                if person in VALID_PERSON_CODES:
                    continue
                errors.append(f"[{key}] {form_list_key}: invalid person code {person!r}")

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    hidden = len(errors) - 20
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||||||
|
|
||||||
|
|
||||||
|
def test_no_stripped_form_sentence_collisions(data: dict[str, Any]) -> None:
    """Warn when a confusable word's sentences contain a different homograph.

    For each entry that has a ``confusable_group`` and vetted sentences, the
    nikkud spelling of every other group member is searched for (substring
    match) in the entry's sentence texts. Members whose nikkud is empty or
    identical to the entry's own are skipped, since they cannot be told apart
    by nikkud. At most one hit is recorded per (entry, member) pair.

    Hits are reported through ``_warn``; ``_pass`` is called afterwards
    regardless.
    """
    name = "no_stripped_form_sentence_collisions"
    errors: list[str] = []

    for key, entry in data.items():
        group = entry.get("confusable_group")
        if not group:
            continue

        vetted = (entry.get("examples") or {}).get("vetted")
        if not vetted:
            continue

        own_nikkud = (entry.get("word") or {}).get("nikkud") or ""
        sentence_texts = [sentence.get("text") or "" for sentence in vetted]

        for member_key in group:
            member = data.get(member_key)
            if not member:
                continue
            rival = (member.get("word") or {}).get("nikkud") or ""
            if not rival or rival == own_nikkud:
                continue  # same nikkud homographs are ok (we can't distinguish by nikkud)

            # First offending sentence only: one error per (key, member) pair is enough.
            hit = next((text for text in sentence_texts if rival in text), None)
            if hit is not None:
                errors.append(f"[{key}] sentence contains wrong homograph {rival!r}: {hit!r}")
                _verbose_print(f" my word: {own_nikkud!r}, wrong form: {rival!r}")

    if errors:
        _warn(name, errors if _verbose else errors[:20])
        hidden = len(errors) - 20
        if hidden > 0 and not _verbose:
            print(f" ... ({hidden} more; use --verbose)")
    _pass(name)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Stats summary
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def print_stats(data: dict[str, Any]) -> None:
    """Print how many entries populate each optional part of the dataset.

    Each count is the number of entries whose corresponding field is truthy;
    the example counts look inside the ``examples`` object. All counts are
    computed before anything is printed.
    """
    entries = list(data.values())

    def populated(field: str) -> int:
        # Number of entries with a truthy top-level field.
        return sum(1 for e in entries if e.get(field))

    def populated_example(part: str) -> int:
        # Number of entries with a truthy examples.<part>.
        return sum(1 for e in entries if (e.get("examples") or {}).get(part))

    rows = [
        (" Total entries: ", len(data)),
        (" With conjugation data: ", populated("conjugation")),
        (" With noun_inflection: ", populated("noun_inflection")),
        (" With vetted examples: ", populated_example("vetted")),
        (" With cloze examples: ", populated_example("cloze")),
        (" With images: ", populated("image")),
        (" With emoji: ", populated("emoji")),
        (" With legacy GUIDs: ", populated("vocab_legacy_guid")),
        (" In confusable groups: ", populated("confusable_group")),
        (" With shared roots: ", populated("shared_roots")),
    ]

    print()
    print("Stats Summary")
    print("─" * 42)
    for label, count in rows:
        print(f"{label}{count:>6}")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Test registry
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Registry of validation tests, keyed by the name accepted by ``--test``.
# ``main`` runs the selected functions in this insertion order and calls each
# one with the loaded data dict.
ALL_TESTS: dict[str, Any] = {
    "required_fields": test_required_fields,
    "root_format": test_root_format,
    "unique_slugs": test_unique_slugs,
    "no_duplicate_keys": test_no_duplicate_keys,
    "confusable_symmetric": test_confusable_symmetric,
    "shared_roots_valid_keys": test_shared_roots_valid_keys,
    "unique_legacy_guids": test_unique_legacy_guids,
    "no_noun_inflection_on_non_nouns": test_no_noun_inflection_on_non_nouns,
    "no_emoji_in_meaning": test_no_emoji_in_meaning,
    "example_sentences_contain_word": test_example_sentences_contain_word,
    "cloze_offsets_valid": test_cloze_offsets_valid,
    "hufal_pual_only_on_hifil_piel": test_hufal_pual_only_on_hifil_piel,
    "confusable_group_shares_ktiv_male": test_confusable_group_shares_ktiv_male,
    "confusables_guid": test_confusables_guid,
    "conjugation_form_guids": test_conjugation_form_guids,
    "conjugation_person_codes": test_conjugation_person_codes,
    "no_stripped_form_sentence_collisions": test_no_stripped_form_sentence_collisions,
}
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Entry point
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def main() -> None:
    """Parse CLI options, run the selected validation tests, and exit.

    Options:
      --verbose / -v  store the flag in the module-level ``_verbose``.
      --test NAME     run only the named entry of ``ALL_TESTS``.

    Exit status: 0 when ``_failures`` is empty after the run, 1 when it is
    not, 2 when ``--test`` names an unknown test. The stats summary is
    printed only when no single test was requested.
    """
    global _verbose

    available = ", ".join(ALL_TESTS)

    parser = argparse.ArgumentParser(description="Validate data/words.json against the Hebrew Flash Cards schema.")
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Print full details for all failures (not just first 20).",
    )
    parser.add_argument(
        "--test",
        metavar="NAME",
        help=f"Run a single test by name. Available: {available}",
    )
    args = parser.parse_args()
    _verbose = args.verbose

    data = load_data()

    # Select tests to run
    requested = args.test
    if not requested:
        tests_to_run = ALL_TESTS
    elif requested in ALL_TESTS:
        tests_to_run = {requested: ALL_TESTS[requested]}
    else:
        print(f"ERROR: unknown test {requested!r}. Available: {available}")
        sys.exit(2)

    print(f"Validating {DATA_FILE} ({len(data)} entries)")
    print("─" * 60)

    # NOTE(review): the original comment here says no_duplicate_keys needs the
    # file rather than the pre-parsed dict, yet every test, including that
    # one, is called with ``data`` — presumably it re-reads the file itself;
    # confirm.
    for run_test in tests_to_run.values():
        run_test(data)

    # Summary
    if not requested:
        print_stats(data)

    print()
    print("─" * 60)
    if _warnings:
        print(f" Warnings : {len(_warnings)}")

    if _failures:
        print(f" FAILED: {len(_failures)} test(s): {', '.join(_failures)}")
        sys.exit(1)

    print(f" All {len(tests_to_run)} test(s) passed.")
    sys.exit(0)
|
||||||
|
|
||||||
|
|
||||||
|
# Run the validator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||||
441
tests/test_scraper_integration.py
Normal file
441
tests/test_scraper_integration.py
Normal file
|
|
@ -0,0 +1,441 @@
|
||||||
|
#!/usr/bin/env python3
|
||||||
|
"""Integration tests: scrape real pealim.com pages and validate data.
|
||||||
|
|
||||||
|
These tests hit pealim.com directly. They are skipped when the environment
|
||||||
|
variable SKIP_INTEGRATION is set to any non-empty string.
|
||||||
|
|
||||||
|
Run with:
|
||||||
|
pytest tests/test_scraper_integration.py -v -m integration
|
||||||
|
"""
|
||||||
|
|
||||||
|
import json
|
||||||
|
import os
|
||||||
|
import re
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
# Add project root to path so all sibling modules are importable
|
||||||
|
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||||
|
|
||||||
|
import pealim_detail_scrape
|
||||||
|
import pealim_list_scrape
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Skip marker
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
# Decorator that skips the marked test or class when the SKIP_INTEGRATION
# environment variable is set to a non-empty value; unset or empty leaves the
# tests enabled. The condition is evaluated once, at import time.
skip_integration = pytest.mark.skipif(
    bool(os.environ.get("SKIP_INTEGRATION", "")),
    reason="SKIP_INTEGRATION is set",
)
|
||||||
|
|
||||||
|
# A known Hif'il verb slug that is not page-1 dependent.
|
||||||
|
# לְהַגִּיד (to tell/say) — Hif'il, slug 4183-lehagid
|
||||||
|
HIFIL_VERB_SLUG = "4183-lehagid"
|
||||||
|
HIFIL_VERB_NIKKUD = "לְהַגִּיד"
|
||||||
|
HIFIL_VERB_MEANING = "to say, to tell"
|
||||||
|
|
||||||
|
# Minimum expected entries from a single list page
|
||||||
|
MIN_LIST_ENTRIES = 10
|
||||||
|
|
||||||
|
# Hebrew letter regex: matches alef–tav (U+05D0–U+05EA) only, not nikkud or other marks in the Hebrew Unicode block
|
||||||
|
HEBREW_CHAR_RE = re.compile(r"[\u05d0-\u05ea]")
|
||||||
|
|
||||||
|
# Slug pattern: one or more digits, hyphen, one or more word chars
|
||||||
|
SLUG_RE = re.compile(r"^\d+-\w+$")
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Helpers
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
def _has_hebrew(text: str) -> bool:
    """Tell whether *text* has at least one character matched by ``HEBREW_CHAR_RE``."""
    match = HEBREW_CHAR_RE.search(text)
    return match is not None
|
||||||
|
|
||||||
|
|
||||||
|
def _words_from_file(path: Path) -> dict:
|
||||||
|
with path.open(encoding="utf-8") as fh:
|
||||||
|
return json.load(fh)
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Test class: list page scrape
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
@skip_integration
class TestListScrape:
    """Live checks of ``pealim_list_scrape`` using a one-page scrape.

    Each test redirects the scraper's ``WORDS_JSON`` and ``PROGRESS_JSON``
    module attributes into pytest's ``tmp_path`` and then calls
    ``run_scrape(total_pages=1, force_refresh=True)``, so every test performs
    its own scrape into its own temporary directory.
    """

    @staticmethod
    def _scrape_first_page(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
        """Run a one-page list scrape into *tmp_path* and return the words.json path."""
        words_path = tmp_path / "words.json"
        progress_path = tmp_path / "list_scrape_progress.json"

        monkeypatch.setattr(pealim_list_scrape, "WORDS_JSON", words_path)
        monkeypatch.setattr(pealim_list_scrape, "PROGRESS_JSON", progress_path)

        # Scrape exactly one page
        pealim_list_scrape.run_scrape(total_pages=1, force_refresh=True)
        return words_path

    def test_list_page_1_produces_entries(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """The scrape must create words.json with at least MIN_LIST_ENTRIES entries."""
        words_path = self._scrape_first_page(tmp_path, monkeypatch)

        assert words_path.exists(), "words.json was not created after scrape"
        scraped = _words_from_file(words_path)
        assert len(scraped) >= MIN_LIST_ENTRIES, (
            f"Expected at least {MIN_LIST_ENTRIES} entries from page 1, got {len(scraped)}"
        )

    def test_list_entries_have_required_fields(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Every entry needs non-empty nikkud, ktiv_male, slug, pos and meaning."""
        scraped = _words_from_file(self._scrape_first_page(tmp_path, monkeypatch))

        for key, entry in scraped.items():
            word_block = entry.get("word", {})
            nikkud = word_block.get("nikkud", "")
            ktiv_male = word_block.get("ktiv_male", "")
            slug = entry.get("slug", "")

            assert nikkud, f"Entry '{key}': word.nikkud is empty"
            assert _has_hebrew(nikkud), f"Entry '{key}': word.nikkud has no Hebrew chars: {nikkud!r}"
            assert ktiv_male, f"Entry '{key}': word.ktiv_male is empty"
            assert slug, f"Entry '{key}': slug is empty"
            assert SLUG_RE.match(slug), f"Entry '{key}': slug does not match \\d+-\\w+ pattern: {slug!r}"
            assert entry.get("pos", ""), f"Entry '{key}': pos is empty"
            assert entry.get("meaning", ""), f"Entry '{key}': meaning is empty"

    def test_list_at_least_one_entry_has_root(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """At least one scraped entry must have a non-empty root."""
        scraped = _words_from_file(self._scrape_first_page(tmp_path, monkeypatch))

        entries_with_root = [entry for entry in scraped.values() if entry.get("root")]
        assert entries_with_root, "No entries on page 1 have a non-empty root list"

    def test_list_at_least_one_entry_has_audio(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """At least one scraped entry must have a non-empty audio_url."""
        scraped = _words_from_file(self._scrape_first_page(tmp_path, monkeypatch))

        entries_with_audio = [entry for entry in scraped.values() if entry.get("audio_url")]
        assert entries_with_audio, "No entries on page 1 have a non-empty audio_url"

    def test_list_post_process_fields_exist(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Every entry must carry 'confusable_group' and a list-valued 'shared_roots'."""
        scraped = _words_from_file(self._scrape_first_page(tmp_path, monkeypatch))

        for key, entry in scraped.items():
            assert "confusable_group" in entry, f"Entry '{key}' missing 'confusable_group' key"
            assert "shared_roots" in entry, f"Entry '{key}' missing 'shared_roots' key"
            assert isinstance(entry["shared_roots"], list), f"Entry '{key}': shared_roots is not a list"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Test class: noun detail scrape
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
@skip_integration
class TestDetailScrapeNoun:
    """Live checks of ``pealim_detail_scrape`` on a noun taken from a one-page list scrape.

    Each test first runs a one-page list scrape into ``tmp_path``, picks the
    first noun entry that has a root and a slug, then points the detail
    scraper at the same words.json and runs it with
    ``test=1, force_refresh=True, nouns_only=True``.
    """

    def _find_noun_with_root(self, words: dict) -> tuple[str, dict] | None:
        """Return the first (key, entry) whose pos starts with 'Noun' and that has a root and slug."""
        candidates = (
            (key, entry)
            for key, entry in words.items()
            if entry.get("pos", "").startswith("Noun") and entry.get("root") and entry.get("slug")
        )
        return next(candidates, None)

    def _prepare_words_json(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> tuple[Path, dict]:
        """Run a one-page list scrape into *tmp_path* and return (words.json path, parsed words)."""
        words_path = tmp_path / "words.json"
        monkeypatch.setattr(pealim_list_scrape, "WORDS_JSON", words_path)
        monkeypatch.setattr(pealim_list_scrape, "PROGRESS_JSON", tmp_path / "list_scrape_progress.json")

        pealim_list_scrape.run_scrape(total_pages=1, force_refresh=True)
        return words_path, _words_from_file(words_path)

    def _scrape_noun_detail(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> tuple[str, dict, dict]:
        """List-scrape, pick a noun, detail-scrape, and return (noun key, list-scrape entry, updated words)."""
        words_path, words = self._prepare_words_json(tmp_path, monkeypatch)

        pair = self._find_noun_with_root(words)
        assert pair is not None, "No noun with a root found on page 1"
        noun_key, noun_entry = pair

        # Point the detail scraper at the same temporary words.json.
        monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)

        # Small rate-limit delay between list scrape and detail scrape
        time.sleep(1.0)

        # NOTE(review): run(test=1, nouns_only=True) presumably processes the
        # first eligible noun in the file, which may not be the noun selected
        # above when an earlier noun lacks a root — confirm against run().
        pealim_detail_scrape.run(test=1, force_refresh=True, nouns_only=True)

        return noun_key, noun_entry, _words_from_file(words_path)

    def test_noun_detail_inflection_not_null(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """After the detail scrape, noun_inflection must not be null."""
        noun_key, noun_entry, updated_words = self._scrape_noun_detail(tmp_path, monkeypatch)

        entry = updated_words.get(noun_key, {})
        assert entry.get("noun_inflection") is not None, (
            f"noun_inflection is None after detail scrape for '{noun_key}' (slug={noun_entry.get('slug')})"
        )

    def test_noun_detail_singular_and_plural_forms(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Singular and plural forms must each have non-empty nikkud and ktiv_male."""
        noun_key, _noun_entry, updated_words = self._scrape_noun_detail(tmp_path, monkeypatch)

        inflection = updated_words[noun_key].get("noun_inflection", {}) or {}
        singular = inflection.get("singular") or {}
        plural = inflection.get("plural") or {}

        assert singular.get("nikkud"), f"noun_inflection.singular.nikkud is empty for '{noun_key}'"
        assert singular.get("ktiv_male"), f"noun_inflection.singular.ktiv_male is empty for '{noun_key}'"
        assert plural.get("nikkud"), f"noun_inflection.plural.nikkud is empty for '{noun_key}'"
        assert plural.get("ktiv_male"), f"noun_inflection.plural.ktiv_male is empty for '{noun_key}'"

    def test_noun_detail_gender(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Noun gender must be 'masculine' or 'feminine'."""
        noun_key, _noun_entry, updated_words = self._scrape_noun_detail(tmp_path, monkeypatch)

        inflection = updated_words[noun_key].get("noun_inflection", {}) or {}
        gender = inflection.get("gender", "")
        assert gender in ("masculine", "feminine"), (
            f"noun_inflection.gender is {gender!r} for '{noun_key}' (expected 'masculine' or 'feminine')"
        )

    def test_noun_detail_scraped_flag(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """detail_scraped must be True after the noun detail scrape."""
        noun_key, _noun_entry, updated_words = self._scrape_noun_detail(tmp_path, monkeypatch)

        assert updated_words[noun_key].get("detail_scraped") is True, (
            f"detail_scraped is not True after scrape for '{noun_key}'"
        )
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Test class: verb detail scrape (Hif'il)
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.integration
@skip_integration
class TestDetailScrapeVerb:
    """Validate pealim_detail_scrape for a known Hif'il verb (lehagid, slug 4183-lehagid).

    Every test seeds its own temporary words.json with that single verb, runs
    the detail scraper against it, and inspects what was written back to disk.
    """

    def _build_test_words_json(self, tmp_path: Path) -> Path:
        """
        Write a minimal words.json containing only the known Hif'il verb entry.

        The detail scraper's run() will pick it up because pos starts with 'Verb'
        and detail_scraped is absent/False.

        Returns the path of the file that was written inside ``tmp_path``.
        """
        words_path = tmp_path / "words.json"
        entry = {
            "word": {"nikkud": HIFIL_VERB_NIKKUD, "ktiv_male": "להגיד"},
            "slug": HIFIL_VERB_SLUG,
            "root": ["נ", "ג", "ד"],
            "pos": "Verb",
            "pos_hebrew": "פֹּעַל — הִפְעִיל",
            "meaning": HIFIL_VERB_MEANING,
            "meaning_raw": HIFIL_VERB_MEANING,
            "audio_url": "",
            "audio_file": "להגיד.mp3",
            "tags": "שורש::נגד פעלים",
            "last_scrape_date": "2026-03-08",
            "vocab_legacy_guid": None,
            "frequency": None,
            "pseudo_frequency": None,
            "emoji": None,
            "emoji_source": None,
            "emoji_visible": False,
            "image": None,
            "image_source": None,
            "hint": "",
            "shared_roots": [],
            "confusable_group": None,
            "confusables_guid": None,
            "examples": None,
            "noun_inflection": None,
            "conjugation": None,
            "adjective_inflection": None,
            "preposition_inflection": None,
            # Intentionally no detail_scraped key so the scraper processes it
        }
        words = {HIFIL_VERB_NIKKUD: entry}
        with words_path.open("w", encoding="utf-8") as fh:
            json.dump(words, fh, ensure_ascii=False, indent=2)
        return words_path

    def _scrape_verb_entry(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> dict:
        """Run the verb detail scrape on a fresh fixture and return the stored verb entry.

        The scraper is pointed at the temporary words.json via monkeypatch, so the
        module-level WORDS_JSON is restored when the test finishes.

        Returns an empty dict when the entry is missing (or null) after the scrape,
        so each test fails on its own assertion message rather than a bare KeyError.
        """
        words_path = self._build_test_words_json(tmp_path)
        monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)

        pealim_detail_scrape.run(test=1, force_refresh=True, verbs_only=True)

        # Re-read from disk: the tests check what the scraper persisted.
        return _words_from_file(words_path).get(HIFIL_VERB_NIKKUD) or {}

    def _scrape_verb_conjugation(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> dict:
        """Return the scraped entry's ``conjugation`` block, or an empty dict if absent/null."""
        return self._scrape_verb_entry(tmp_path, monkeypatch).get("conjugation") or {}

    def test_verb_detail_conjugation_not_null(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """After detail scrape, conjugation must not be null for the Hif'il verb."""
        entry = self._scrape_verb_entry(tmp_path, monkeypatch)
        assert entry.get("conjugation") is not None, f"conjugation is None after detail scrape for {HIFIL_VERB_SLUG}"

    def test_verb_detail_binyan(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """conjugation.binyan must be "Hif'il" and binyan_hebrew must be the correct nikkud."""
        conj = self._scrape_verb_conjugation(tmp_path, monkeypatch)

        assert conj.get("binyan") == "Hif'il", f"Expected binyan=\"Hif'il\", got {conj.get('binyan')!r}"
        assert conj.get("binyan_hebrew") == "הִפְעִיל", (
            f"Expected binyan_hebrew='הִפְעִיל', got {conj.get('binyan_hebrew')!r}"
        )

    def test_verb_detail_infinitive_and_reference_form(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """infinitive.nikkud and reference_form.nikkud must be non-empty Hebrew strings."""
        conj = self._scrape_verb_conjugation(tmp_path, monkeypatch)

        # Null sub-blocks are treated like missing ones so the asserts below report them.
        infinitive = conj.get("infinitive") or {}
        reference_form = conj.get("reference_form") or {}

        inf_nikkud = infinitive.get("nikkud", "")
        ref_nikkud = reference_form.get("nikkud", "")

        assert inf_nikkud and _has_hebrew(inf_nikkud), (
            f"infinitive.nikkud is empty or has no Hebrew chars: {inf_nikkud!r}"
        )
        assert ref_nikkud and _has_hebrew(ref_nikkud), (
            f"reference_form.nikkud (3ms past) is empty or has no Hebrew chars: {ref_nikkud!r}"
        )

    def test_verb_detail_active_forms_count_and_structure(
        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """active_forms must be a list of at least 20 entries, each with required sub-fields."""
        conj = self._scrape_verb_conjugation(tmp_path, monkeypatch)
        active_forms = conj.get("active_forms")

        assert isinstance(active_forms, list), f"active_forms is not a list: {type(active_forms)}"
        assert len(active_forms) >= 20, f"Expected at least 20 active forms, got {len(active_forms)}"

        for i, form in enumerate(active_forms):
            assert form.get("person"), f"active_forms[{i}].person is empty"
            assert form.get("tense"), f"active_forms[{i}].tense is empty"
            form_block = form.get("form") or {}
            assert form_block.get("nikkud") and _has_hebrew(form_block["nikkud"]), (
                f"active_forms[{i}].form.nikkud is empty or has no Hebrew: {form_block.get('nikkud')!r}"
            )
            assert form_block.get("ktiv_male") and _has_hebrew(form_block["ktiv_male"]), (
                f"active_forms[{i}].form.ktiv_male is empty or has no Hebrew: {form_block.get('ktiv_male')!r}"
            )

    def test_verb_detail_hufal_passive_section(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Hif'il verb must have a non-null hufal_pual_forms list and reference_form_passive."""
        conj = self._scrape_verb_conjugation(tmp_path, monkeypatch)

        hufal_forms = conj.get("hufal_pual_forms")
        assert hufal_forms is not None, "hufal_pual_forms is None — expected Huf'al passive section for a Hif'il verb"
        assert isinstance(hufal_forms, list), f"hufal_pual_forms is not a list: {type(hufal_forms)}"
        assert len(hufal_forms) > 0, "hufal_pual_forms list is empty"

        ref_passive = conj.get("reference_form_passive")
        assert ref_passive is not None, "reference_form_passive is None — expected a Huf'al 3ms past form"
        # `or {}` keeps a falsy non-dict value from raising before the assert below reports it.
        passive_nikkud = (ref_passive or {}).get("nikkud", "")
        assert passive_nikkud and _has_hebrew(passive_nikkud), (
            f"reference_form_passive.nikkud is empty or has no Hebrew: {passive_nikkud!r}"
        )

    def test_verb_detail_scraped_flag(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """detail_scraped must be True after a successful verb detail scrape."""
        entry = self._scrape_verb_entry(tmp_path, monkeypatch)
        assert entry.get("detail_scraped") is True, f"detail_scraped is not True after scrape for {HIFIL_VERB_SLUG}"
|
||||||
|
|
@ -25,8 +25,7 @@ def test_apkg_builder_imports():
|
||||||
|
|
||||||
def test_data_files_exist():
|
def test_data_files_exist():
|
||||||
data_dir = Path(__file__).resolve().parent.parent / "data"
|
data_dir = Path(__file__).resolve().parent.parent / "data"
|
||||||
assert (data_dir / "hebrew_dict_for_anki.csv").exists(), "vocab CSV missing"
|
assert (data_dir / "words.json").exists(), "words.json missing"
|
||||||
assert (data_dir / "conjugations.json").exists(), "conjugations cache missing"
|
|
||||||
|
|
||||||
|
|
||||||
def test_strip_nikkud_idempotent():
|
def test_strip_nikkud_idempotent():
|
||||||
|
|
@ -42,4 +41,4 @@ def test_strip_nikkud_all_marks():
|
||||||
# Comprehensive: patach, kamatz, segol, tsere, hiriq, holam, kubutz, shva, dagesh
|
# Comprehensive: patach, kamatz, segol, tsere, hiriq, holam, kubutz, shva, dagesh
|
||||||
nikkud = "הַמַּלְכָּה"
|
nikkud = "הַמַּלְכָּה"
|
||||||
plain = strip_nikkud(nikkud)
|
plain = strip_nikkud(nikkud)
|
||||||
assert all(ch < "\u0591" or ch > "\u05C7" for ch in plain), f"Residual nikkud in: {plain}"
|
assert all(ch < "\u0591" or ch > "\u05c7" for ch in plain), f"Residual nikkud in: {plain}"
|
||||||
|
|
|
||||||
Loading…
Reference in a new issue