Migrate from fragmented CSV + 10 JSON files to a single data/words.json (9,104 entries) as the unified data store. All GUIDs preserved for Anki study progress continuity. New files: - SCHEMA.yaml: authoritative schema for words.json - pealim_list_scrape.py: consolidated list page scraper → words.json - pealim_detail_scrape.py: noun/verb detail scraper → words.json - pealim_audio_download.py: audio downloader reading from words.json - scripts/migrate_to_json.py: one-time CSV→JSON migration - scripts/validate_data.py: 17 data integrity tests - scripts/check_guid_coverage.py: GUID preservation checker - scripts/repair_slugs.py: slug deduplication repair tool - tests/test_scraper_integration.py: live scraper integration tests Updated: - apkg_builder.py: reads from words.json (no more pandas) - run.py: 8-step pipeline (list scrape → frequency → examples → detail scrape → audio download → fonts → images → build) - benyehuda.py, frequency_lookup.py, image_fetch.py: TODO markers for future words.json integration Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
44 lines
1.2 KiB
Python
44 lines
1.2 KiB
Python
"""Smoke tests for the Hebrew Flash Cards project."""
|
|
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
# Ensure the project root is importable: the root is taken to be the parent of
# the directory containing this file, and it is inserted at the front of
# sys.path so it is searched before the other entries.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
|
|
|
|
|
def test_helpers_strip_nikkud():
    """Smoke-test strip_nikkud on pointed Hebrew, plain ASCII and empty input."""
    from helpers import strip_nikkud

    # (input, expected output) pairs, checked in order.
    expectations = (
        ("שָׁלוֹם", "שלום"),
        ("hello", "hello"),
        ("", ""),
    )
    for given, wanted in expectations:
        assert strip_nikkud(given) == wanted
|
|
|
|
|
|
def test_apkg_builder_imports():
    """Check that apkg_builder imports and exposes its deck builders and model id."""
    import apkg_builder

    # Both deck-building entry points must be present on the module.
    for attr_name in ("build_vocab_deck", "build_conj_deck"):
        assert hasattr(apkg_builder, attr_name)

    # The vocabulary model id is pinned to a fixed value.
    expected_model_id = 1_701_222_017_968
    assert apkg_builder.VOCAB_MODEL_ID == expected_model_id
|
|
|
|
|
|
def test_data_files_exist():
    """Check that data/words.json exists under the project root."""
    this_file = Path(__file__).resolve()
    # Two levels up from this file: its directory, then that directory's parent.
    project_root = this_file.parent.parent
    words_path = project_root / "data" / "words.json"
    assert words_path.exists(), "words.json missing"
|
|
|
|
|
|
def test_strip_nikkud_idempotent():
    """Check that text without any nikkud passes through strip_nikkud unchanged."""
    from helpers import strip_nikkud

    unpointed = "שלום"
    result = strip_nikkud(unpointed)
    assert result == unpointed
|
|
|
|
|
|
def test_strip_nikkud_all_marks():
    """Check that strip_nikkud removes each common vowel point and the dagesh.

    Two inputs are exercised:

    * a real pointed word, for which no code point in U+0591..U+05C7 may
      survive and the remaining consonants must match exactly (so an
      implementation that returns too little, e.g. "", cannot pass);
    * one synthetic "letter + mark" string per mark listed below, so that
      every named mark is actually present in some input.
    """
    from helpers import strip_nikkud

    # This word only carries patach, kamatz, shva and dagesh.
    nikkud = "הַמַּלְכָּה"
    plain = strip_nikkud(nikkud)
    assert all(ch < "\u0591" or ch > "\u05c7" for ch in plain), f"Residual nikkud in: {plain}"
    assert plain == "המלכה", f"Letters altered in: {plain}"

    # Cover every mark by code point, independent of any particular word.
    marks = {
        "shva": "\u05b0",
        "hiriq": "\u05b4",
        "tsere": "\u05b5",
        "segol": "\u05b6",
        "patach": "\u05b7",
        "kamatz": "\u05b8",
        "holam": "\u05b9",
        "kubutz": "\u05bb",
        "dagesh": "\u05bc",
    }
    base = "\u05d1"  # the letter bet
    for mark_name, mark in marks.items():
        stripped = strip_nikkud(base + mark)
        assert stripped == base, (
            f"{mark_name} (U+{ord(mark):04X}) not stripped: {stripped!r}"
        )
|