hebrew_flash_cards/scripts/validate_data.py
Sochen efd0745ada Sprint 14: deck template/CSS overhaul + Sprint 12 detail scrape
Template & CSS fixes (15 items from Mar 9 feedback):
- Fix conjugation front showing 3ms form instead of infinitive
- Rename conjugation model to "Hebrew Conjugation"
- Strip Hebrew parenthesized text from English meanings
- Shoresh separator: spaces → dots (א.כ.ל)
- Remove duplicate English meaning from cloze back
- Remove example sentences from vocab front/back (cloze only)
- Center-align audio buttons on all decks
- Fix parenthesis spacing: "you(feminine,singular)" → "you (feminine, singular)"
- Unify sec-key/sec-label fonts, make keys bold
- Size overhaul: bigger Hebrew (42px), meaning (34px), secondary (28px)
- Center-align related words groups
- Sort confusables by average frequency
- Plurals: show Gender (Hebrew) before Mishkal, strip emoji from meaning
- Clean duplicate quotation marks in cloze sentences

Sprint 12 carry-forward (detail scrape + EPUB):
- Adjective/preposition detail scraping in pealim_detail_scrape.py
- EPUB example matching rewrite in epub_examples.py
- Delete benyehuda.py and rebuild_sentence_matches.py (merged)
- 49 parser tests for detail scraping
- SCHEMA.yaml updates for new fields

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 07:44:47 +00:00

804 lines
29 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Standalone integrity validator for data/words.json.

Validates the unified Hebrew Flash Cards data against the schema defined in
SCHEMA.yaml. Each test prints PASS/FAIL (or WARN for non-fatal findings) with
details on failures.

Usage:
    python3 scripts/validate_data.py
    python3 scripts/validate_data.py --verbose
    python3 scripts/validate_data.py --test confusable_symmetric
"""
from __future__ import annotations
import argparse
import json
import re
import sys
import unicodedata
from pathlib import Path
from typing import Any
# ---------------------------------------------------------------------------
# Bootstrap: make project root importable so helpers.py is accessible
# ---------------------------------------------------------------------------
# The project root is taken to be the parent of this script's directory.
# NOTE(review): no import of `helpers` is visible in this file — confirm the
# path tweak is still required.
sys.path.insert(0, str(Path(__file__).parent.parent))
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Location of the dataset under validation: <parent of this script's dir>/data/words.json.
DATA_FILE = Path(__file__).parent.parent / "data" / "words.json"
# Inclusive code point bounds of the Hebrew consonants.
HEBREW_CONSONANT_RANGE = (0x05D0, 0x05EA) # alef..tav
# Person codes accepted on conjugation forms (see test_conjugation_person_codes).
VALID_PERSON_CODES: frozenset[str] = frozenset(
    ["inf", "1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp", "3p", "ms", "fs", "mp", "fp"]
)
# Single character class matching any code point in the listed emoji ranges.
EMOJI_RE = re.compile(
    r"[\U0001f600-\U0001f64f"  # emoticons
    r"\U0001f300-\U0001f5ff"  # miscellaneous symbols and pictographs
    r"\U0001f680-\U0001f6ff"  # transport and map symbols
    r"\U0001f1e0-\U0001f1ff"  # regional indicator symbols (flags)
    r"\U00002702-\U000027b0"  # dingbats (subset)
    r"\U0001f900-\U0001f9ff"  # supplemental symbols and pictographs
    r"\U0001fa00-\U0001fa6f"  # chess symbols
    r"\U0001fa70-\U0001faff]"  # symbols and pictographs extended-A
)
# ---------------------------------------------------------------------------
# Result tracking
# ---------------------------------------------------------------------------
# Names of tests that reported FAIL; appended to by _fail() and summarised by main().
_failures: list[str] = []
# Warning detail lines accumulated by _warn(); main() reports only how many there are.
_warnings: list[str] = []
# Set from --verbose in main(); when True, reports are not truncated to 20 lines.
_verbose: bool = False
def _pass(name: str) -> None:
    """Print the PASS line for the test called *name*."""
    print(" PASS {}".format(name))
def _fail(name: str, details: list[str]) -> None:
    """Record *name* as a failed test and print its FAIL line plus each detail.

    The module-level ``_failures`` list is mutated in place, so no ``global``
    declaration is needed.
    """
    _failures.append(name)
    print(" FAIL {}".format(name))
    for line in details:
        print(" {}".format(line))
def _warn(name: str, details: list[str]) -> None:
    """Print a WARN line for *name* followed by each detail line.

    Every entry of *details* is also added to the module-level ``_warnings``
    list, which is mutated in place.
    """
    _warnings.extend(details)
    print(" WARN {}".format(name))
    for line in details:
        print(" {}".format(line))
def _verbose_print(msg: str) -> None:
    """Print *msg* only when the module-level ``_verbose`` flag is set."""
    if not _verbose:
        return
    print(" {}".format(msg))
# ---------------------------------------------------------------------------
# Helper: load data
# ---------------------------------------------------------------------------
def load_data() -> dict[str, Any]:
    """Load ``DATA_FILE`` and return its top-level JSON object.

    Returns:
        The parsed mapping of entry key -> entry.

    Exits the process with status 2, after printing an ``ERROR:`` line, when
    the file does not exist, cannot be decoded as UTF-8 JSON, or does not hold
    a JSON object at the top level. Every validation test iterates
    ``data.items()``, so anything other than an object could only crash later
    with a less helpful traceback.
    """
    if not DATA_FILE.exists():
        print(f"ERROR: data file not found: {DATA_FILE}")
        sys.exit(2)
    try:
        with DATA_FILE.open(encoding="utf-8") as fh:
            data = json.load(fh)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        print(f"ERROR: {DATA_FILE} is not valid UTF-8 JSON: {exc}")
        sys.exit(2)
    if not isinstance(data, dict):
        print(f"ERROR: {DATA_FILE} must hold a JSON object at the top level, got {type(data).__name__}")
        sys.exit(2)
    return data
def _is_hebrew_consonant(ch: str) -> bool:
    """Return True if *ch* starts with a Hebrew consonant (U+05D0..U+05EA).

    Multi-codepoint strings such as shin + shin dot are accepted: the string
    is NFD-normalized and only its first code point (the base character) is
    checked against ``HEBREW_CONSONANT_RANGE``.

    An empty string returns False rather than raising ``IndexError``.
    """
    if not ch:
        return False
    # After NFD the base consonant comes first; combining marks follow it.
    base = unicodedata.normalize("NFD", ch)[0]
    low, high = HEBREW_CONSONANT_RANGE
    return low <= ord(base) <= high
# ---------------------------------------------------------------------------
# Individual tests
# ---------------------------------------------------------------------------
def test_required_fields(data: dict[str, Any]) -> None:
    """Check that each entry has word.nikkud, word.ktiv_male, slug, pos and meaning.

    A missing or empty required field is an error. A null or missing
    ``frequency`` is only reported as a warning, under the name
    ``frequency_missing``.
    """
    name = "required_fields"
    problems: list[str] = []
    no_frequency: list[str] = []
    for key, entry in data.items():
        word = entry.get("word")
        if isinstance(word, dict):
            problems.extend(
                f"[{key}] word.{field} is missing or empty"
                for field in ("nikkud", "ktiv_male")
                if not word.get(field)
            )
        else:
            problems.append(f"[{key}] 'word' is missing or not a dict")
        problems.extend(
            f"[{key}] '{field}' is missing or empty"
            for field in ("slug", "pos", "meaning")
            if not entry.get(field)
        )
        if entry.get("frequency") is None:
            no_frequency.append(f"[{key}] 'frequency' is null/missing")

    if no_frequency:
        _warn("frequency_missing", no_frequency if _verbose else no_frequency[:20])
        if not _verbose and len(no_frequency) > 20:
            print(f" ... ({len(no_frequency) - 20} more; use --verbose)")

    if not problems:
        _pass(name)
        return
    _fail(name, problems if _verbose else problems[:20])
    if not _verbose and len(problems) > 20:
        print(f" ... ({len(problems) - 20} more; use --verbose)")
def test_root_format(data: dict[str, Any]) -> None:
    """Check that ``root`` is an empty list or a list of 2-5 Hebrew consonants.

    An absent ``root`` is an error (rootless words are expected to carry
    ``[]``). At most one error is recorded per entry.
    """
    name = "root_format"
    errors: list[str] = []
    for key, entry in data.items():
        root = entry.get("root")
        if root is None:
            errors.append(f"[{key}] 'root' key is absent (should be [] for rootless words)")
        elif not isinstance(root, list):
            errors.append(f"[{key}] 'root' is not a list: {root!r}")
        elif not root:
            # Empty list marks a rootless word, which is valid.
            continue
        elif len(root) < 2 or len(root) > 5:
            errors.append(f"[{key}] root has {len(root)} elements (expected 2-5): {root!r}")
        else:
            for element in root:
                # An element may span several code points (base letter plus
                # marks); _is_hebrew_consonant looks at the base letter only.
                acceptable = isinstance(element, str) and element and _is_hebrew_consonant(element)
                if not acceptable:
                    errors.append(f"[{key}] root char {element!r} is not a Hebrew consonant (U+05D0..U+05EA)")
                    break

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    overflow = len(errors) - 20
    if overflow > 0 and not _verbose:
        print(f" ... ({overflow} more; use --verbose)")
def test_unique_slugs(data: dict[str, Any]) -> None:
    """Fail when two or more entries carry the same non-empty ``slug``.

    Entries whose slug is missing or empty are ignored by this test.
    """
    name = "unique_slugs"
    owners: dict[str, list[str]] = {}
    for key, entry in data.items():
        slug = entry.get("slug")
        if not slug:
            continue
        owners.setdefault(slug, []).append(key)

    errors = [f"slug={slug!r} shared by: {keys}" for slug, keys in owners.items() if len(keys) > 1]
    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_no_duplicate_keys(_data: dict[str, Any]) -> None:  # noqa: ARG001
    """Fail when the raw JSON file repeats a key inside an object.

    ``json.load`` silently keeps the last value for a repeated key, so the
    already-parsed ``_data`` cannot reveal the problem and is ignored. The file
    is parsed again with an ``object_pairs_hook`` that sees every key/value
    pair before it is collapsed into a dict. The hook is invoked for every
    object the decoder builds, so repeats inside nested objects are reported
    as well as repeated top-level keys.
    """
    name = "no_duplicate_keys"
    repeated: list[str] = []

    def _collect_repeats(pairs: list[tuple[str, Any]]) -> dict[str, Any]:
        # Rebuild the object the way json would, noting keys seen twice.
        obj: dict[str, Any] = {}
        for pair_key, pair_value in pairs:
            if pair_key in obj:
                repeated.append(pair_key)
            obj[pair_key] = pair_value
        return obj

    with DATA_FILE.open(encoding="utf-8") as fh:
        json.load(fh, object_pairs_hook=_collect_repeats)

    if not repeated:
        _pass(name)
        return
    _fail(name, [f"duplicate key: {k!r}" for k in repeated])
def test_confusable_symmetric(data: dict[str, Any]) -> None:
    """Check that confusable_group links are reciprocal.

    For every key B listed by entry A, B must exist in the data and must list
    A in its own confusable_group.
    """
    name = "confusable_symmetric"
    errors: list[str] = []
    for key, entry in data.items():
        for other_key in entry.get("confusable_group") or ():
            partner = data.get(other_key)
            if partner is None:
                errors.append(f"[{key}] confusable_group references non-existent key {other_key!r}")
            elif key not in (partner.get("confusable_group") or []):
                errors.append(f"[{key}] lists {other_key!r} as confusable, but {other_key!r} does not list {key!r}")

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_shared_roots_valid_keys(data: dict[str, Any]) -> None:
    """Check that every key referenced in ``shared_roots`` is a top-level key."""
    name = "shared_roots_valid_keys"
    errors: list[str] = []
    for key, entry in data.items():
        errors.extend(
            f"[{key}] shared_roots references non-existent key {ref_key!r}"
            for ref_key in entry.get("shared_roots") or ()
            if ref_key not in data
        )

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    hidden = len(errors) - 20
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
def test_unique_legacy_guids(data: dict[str, Any]) -> None:
    """Check that non-null ``vocab_legacy_guid`` values are not shared.

    One exception is tolerated: when every entry sharing a GUID has the same
    ``word.nikkud`` (PoS homographs that inherited one legacy Anki card), the
    sharing is only mentioned in verbose mode. A GUID shared by entries with
    differing nikkud is an error.
    """
    name = "unique_legacy_guids"
    holders: dict[str, list[str]] = {}
    for key, entry in data.items():
        guid = entry.get("vocab_legacy_guid")
        if not guid:
            continue
        holders.setdefault(guid, []).append(key)

    errors: list[str] = []
    for guid, keys in holders.items():
        if len(keys) < 2:
            continue
        spellings = set()
        for member in keys:
            word = data[member].get("word") or {}
            spellings.add(word.get("nikkud"))
        if len(spellings) != 1:
            errors.append(f"guid={guid!r} shared by entries with DIFFERENT nikkud: {keys}")
            continue
        # All holders spell the word identically: a known legacy artefact.
        (common,) = spellings
        _verbose_print(f"GUID {guid!r} shared by {len(keys)} entries with same nikkud ({common!r}): {keys}")

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_no_noun_inflection_on_non_nouns(data: dict[str, Any]) -> None:
    """Check that ``noun_inflection`` is null unless ``pos`` starts with 'Noun'.

    A missing or empty ``pos`` counts as a non-noun.
    """
    name = "no_noun_inflection_on_non_nouns"
    errors: list[str] = []
    for key, entry in data.items():
        pos = entry.get("pos") or ""
        if pos.startswith("Noun"):
            continue
        if entry.get("noun_inflection") is None:
            continue
        errors.append(f"[{key}] pos={pos!r} but noun_inflection is set")
        _verbose_print(f"offending entry: {key!r}")

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    surplus = len(errors) - 20
    if surplus > 0 and not _verbose:
        print(f" ... ({surplus} more; use --verbose)")
def test_no_emoji_in_meaning(data: dict[str, Any]) -> None:
    """Check that no ``meaning`` contains a character matched by ``EMOJI_RE``."""
    name = "no_emoji_in_meaning"
    meanings = ((key, entry.get("meaning") or "") for key, entry in data.items())
    errors = [f"[{key}] meaning contains emoji: {meaning!r}" for key, meaning in meanings if EMOJI_RE.search(meaning)]

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_example_sentences_contain_word(data: dict[str, Any]) -> None:
    """Warn when no vetted example sentence contains the entry's exact nikkud form.

    Matching is a plain substring test on ``word.nikkud`` (no stripping of
    vowel points). Entries without vetted examples or without a nikkud form are
    skipped. Findings are warnings only: the test always reports PASS.
    """
    name = "example_sentences_contain_word"
    misses: list[str] = []
    for key, entry in data.items():
        vetted = (entry.get("examples") or {}).get("vetted")
        if not vetted:
            continue
        target = (entry.get("word") or {}).get("nikkud") or ""
        if not target:
            continue
        for sentence in vetted:
            if target in (sentence.get("text") or ""):
                break
        else:
            # No sentence matched; quote the first two for context.
            preview = [s.get("text", "") for s in vetted[:2]]
            misses.append(f"[{key}] word {target!r} not found in any vetted sentence. Sentences: {preview!r}")

    if misses:
        _warn(name, misses if _verbose else misses[:20])
        if not _verbose and len(misses) > 20:
            print(f" ... ({len(misses) - 20} more; use --verbose)")
    _pass(name)
def test_cloze_offsets_valid(data: dict[str, Any]) -> None:
    """Check that cloze_word_start/end, when present, lie within the cloze text.

    For each entry with ``examples.cloze`` the offsets must be integers with
    ``0 <= start < end <= len(text)``. Booleans are rejected even though
    ``bool`` is a subclass of ``int``, so a JSON ``true``/``false`` cannot pass
    as offset 1/0.

    Null offsets are tolerated and reported as a separate warning
    (``cloze_offsets_valid_null_offsets``): some sentences contain only
    inflected/construct/plural forms that cannot be matched back to the base
    nikkud or ktiv_male, which is a data quality issue in
    vetted_sentences.json rather than a schema violation.
    """
    name = "cloze_offsets_valid"
    errors: list[str] = []
    null_warn: list[str] = []
    for key, entry in data.items():
        examples = entry.get("examples")
        if not examples:
            continue
        cloze = examples.get("cloze")
        if not cloze:
            continue
        text = cloze.get("text") or ""
        start = cloze.get("cloze_word_start")
        end = cloze.get("cloze_word_end")
        if start is None or end is None:
            null_warn.append(f"[{key}] cloze present but cloze_word_start/end are null")
            continue
        # isinstance(True, int) is True, so bool must be excluded explicitly.
        if any(isinstance(offset, bool) or not isinstance(offset, int) for offset in (start, end)):
            errors.append(f"[{key}] cloze_word_start/end are not integers: {start!r}, {end!r}")
            continue
        if start < 0 or end < 0:
            errors.append(f"[{key}] cloze offsets are negative: start={start}, end={end}")
            continue
        if start >= end:
            errors.append(f"[{key}] cloze start >= end: start={start}, end={end}")
            continue
        text_len = len(text)
        if end > text_len:
            errors.append(f"[{key}] cloze end={end} exceeds text length={text_len}: {text!r}")

    if null_warn:
        _warn(f"{name}_null_offsets", null_warn[:20] if not _verbose else null_warn)
        if len(null_warn) > 20 and not _verbose:
            print(f" ... ({len(null_warn) - 20} more; use --verbose)")
    if errors:
        _fail(name, errors[:20] if not _verbose else errors)
        if len(errors) > 20 and not _verbose:
            print(f" ... ({len(errors) - 20} more; use --verbose)")
    else:
        _pass(name)
def test_hufal_pual_only_on_hifil_piel(data: dict[str, Any]) -> None:
    """Check that ``hufal_pual_forms`` is set only on Hif'il or Pi'el verbs.

    The binyan is recognised by substring: the lower-cased
    ``conjugation.binyan`` must contain "hif" or "pi".
    """
    name = "hufal_pual_only_on_hifil_piel"
    errors: list[str] = []
    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj:
            continue
        if conj.get("hufal_pual_forms") is None:
            continue
        binyan = conj.get("binyan") or ""
        lowered = binyan.lower()
        if not any(marker in lowered for marker in ("hif", "pi")):
            errors.append(f"[{key}] hufal_pual_forms is set but binyan={binyan!r} (expected Hif'il or Pi'el)")

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_confusable_group_shares_ktiv_male(data: dict[str, Any]) -> None:
    """Check that confusable_group members have the same ``word.ktiv_male``.

    Each entry is compared with the members it lists. Members that do not
    exist, and entries or members without a ktiv_male, are skipped here.
    """
    name = "confusable_group_shares_ktiv_male"
    errors: list[str] = []

    def _ktiv_of(item: dict[str, Any]) -> Any:
        # Tolerates a missing/null "word" by treating it as an empty dict.
        return (item.get("word") or {}).get("ktiv_male")

    for key, entry in data.items():
        group = entry.get("confusable_group")
        if not group:
            continue
        my_ktiv = _ktiv_of(entry)
        if not my_ktiv:
            continue
        for other_key in group:
            other = data.get(other_key)
            if not other:
                # Dangling references are reported by confusable_symmetric.
                continue
            other_ktiv = _ktiv_of(other)
            if not other_ktiv or other_ktiv == my_ktiv:
                continue
            errors.append(
                f"[{key}] ktiv_male={my_ktiv!r} but confusable member {other_key!r} has ktiv_male={other_ktiv!r}"
            )

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    extra = len(errors) - 20
    if extra > 0 and not _verbose:
        print(f" ... ({extra} more; use --verbose)")
def test_confusables_guid(data: dict[str, Any]) -> None:
    """Check that ``confusables_guid`` agrees with ``confusable_group``.

    Rules:
    - An entry with a confusable_group must have a confusables_guid.
    - An entry without a confusable_group must have a null confusables_guid.
    - Every existing member listed in an entry's confusable_group must carry
      the same confusables_guid as that entry.
    """
    name = "confusables_guid"
    errors: list[str] = []
    for key, entry in data.items():
        group = entry.get("confusable_group")
        guid = entry.get("confusables_guid")
        if not group:
            if guid is not None:
                errors.append(f"[{key}] has confusables_guid={guid!r} but confusable_group is null")
            continue
        if not guid:
            errors.append(f"[{key}] has confusable_group but confusables_guid is null/missing")
            continue
        # Both present: every listed member must agree on the GUID.
        for other_key in group:
            other = data.get(other_key)
            if not other:
                # Dangling references are reported by confusable_symmetric.
                continue
            other_guid = other.get("confusables_guid")
            if other_guid == guid:
                continue
            errors.append(
                f"[{key}] confusables_guid={guid!r} but confusable member "
                f"{other_key!r} has confusables_guid={other_guid!r}"
            )

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_conjugation_form_guids(data: dict[str, Any]) -> None:
    """Check GUID presence and per-verb uniqueness on conjugation forms.

    Rules:
    - Each form in ``active_forms`` and ``hufal_pual_forms`` should have a
      non-null ``guid`` or a non-empty ``guid_candidates`` list. A form with
      neither is counted and reported as a single summary warning
      (``conjugation_form_guids_missing``), not as a failure.
    - Within one verb, across both form lists, no GUID may appear twice.
      When a form has a ``guid`` its ``guid_candidates`` are not inspected;
      otherwise every candidate is checked.
    """
    name = "conjugation_form_guids"
    errors: list[str] = []
    missing_count = 0
    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj:
            continue
        owner_of: dict[str, str] = {}  # GUID -> label of the form that claimed it first
        for form_list_key in ("active_forms", "hufal_pual_forms"):
            for form in conj.get(form_list_key) or ():
                label = f"{form_list_key}[{form.get('person', '?')}]"
                guid = form.get("guid")
                candidates = form.get("guid_candidates")
                if guid:
                    identifiers, field = [guid], "guid"
                elif candidates:
                    identifiers, field = candidates, "guid_candidate"
                else:
                    # Forms from a rescrape get a deterministic fallback GUID
                    # downstream, so this is tolerated.
                    missing_count += 1
                    continue
                for identifier in identifiers:
                    if identifier in owner_of:
                        errors.append(f"[{key}] {label}: {field}={identifier!r} duplicates {owner_of[identifier]}")
                    else:
                        owner_of[identifier] = label

    if missing_count:
        _warn(name + "_missing", [f"{missing_count} forms missing guid (deterministic fallback used)"])

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
def test_conjugation_person_codes(data: dict[str, Any]) -> None:
    """Check that every conjugation form's ``person`` is in ``VALID_PERSON_CODES``.

    Both ``active_forms`` and ``hufal_pual_forms`` are checked. A form without
    a ``person`` value is reported as an invalid code of ``None``.
    """
    name = "conjugation_person_codes"
    errors: list[str] = []
    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj:
            continue
        for form_list_key in ("active_forms", "hufal_pual_forms"):
            for form in conj.get(form_list_key) or ():
                person = form.get("person")
                if person in VALID_PERSON_CODES:
                    continue
                errors.append(f"[{key}] {form_list_key}: invalid person code {person!r}")

    if not errors:
        _pass(name)
        return
    _fail(name, errors if _verbose else errors[:20])
    remainder = len(errors) - 20
    if remainder > 0 and not _verbose:
        print(f" ... ({remainder} more; use --verbose)")
def test_no_stripped_form_sentence_collisions(data: dict[str, Any]) -> None:
    """Warn when an entry's vetted sentences contain a confusable partner's nikkud form.

    For each entry with a confusable_group and vetted examples, every listed
    member whose nikkud differs from the entry's own is searched for, by
    substring, in the entry's sentences. At most one finding is recorded per
    (entry, member) pair. Findings are warnings only: the test always reports
    PASS.
    """
    name = "no_stripped_form_sentence_collisions"
    collisions: list[str] = []
    for key, entry in data.items():
        group = entry.get("confusable_group")
        if not group:
            continue
        vetted = (entry.get("examples") or {}).get("vetted")
        if not vetted:
            continue
        own_form = (entry.get("word") or {}).get("nikkud") or ""
        own_texts = [sentence.get("text") or "" for sentence in vetted]
        for other_key in group:
            other = data.get(other_key)
            if not other:
                continue
            rival_form = (other.get("word") or {}).get("nikkud") or ""
            if not rival_form:
                continue
            if rival_form == own_form:
                # Identically pointed homographs cannot be told apart by nikkud.
                continue
            hit = next((text for text in own_texts if rival_form in text), None)
            if hit is None:
                continue
            collisions.append(f"[{key}] sentence contains wrong homograph {rival_form!r}: {hit!r}")
            _verbose_print(f" my word: {own_form!r}, wrong form: {rival_form!r}")

    if collisions:
        _warn(name, collisions if _verbose else collisions[:20])
        if not _verbose and len(collisions) > 20:
            print(f" ... ({len(collisions) - 20} more; use --verbose)")
    _pass(name)
# ---------------------------------------------------------------------------
# Stats summary
# ---------------------------------------------------------------------------
def print_stats(data: dict[str, Any]) -> None:
    """Print a summary of dataset coverage metrics.

    Each metric counts the entries in *data* for which the named field is
    truthy; ``vetted`` and ``cloze`` are looked up inside ``examples``, which
    may be null or missing. Output is a blank line, a heading, a 42-character
    rule and one line per metric.
    """
    entries = list(data.values())

    def _count(field: str) -> int:
        # Number of entries whose top-level field is truthy.
        return sum(1 for e in entries if e.get(field))

    def _count_example(kind: str) -> int:
        # Number of entries with a truthy examples[kind].
        return sum(1 for e in entries if (e.get("examples") or {}).get(kind))

    total = len(data)
    with_conj = _count("conjugation")
    with_noun_inf = _count("noun_inflection")
    with_vetted = _count_example("vetted")
    with_cloze = _count_example("cloze")
    with_image = _count("image")
    with_emoji = _count("emoji")
    with_guid = _count("vocab_legacy_guid")
    in_confusable = _count("confusable_group")
    with_shared_roots = _count("shared_roots")

    print()
    print("Stats Summary")
    # The rule character was an empty string, so no separator was printed.
    print("-" * 42)
    print(f" Total entries: {total:>6}")
    print(f" With conjugation data: {with_conj:>6}")
    print(f" With noun_inflection: {with_noun_inf:>6}")
    print(f" With vetted examples: {with_vetted:>6}")
    print(f" With cloze examples: {with_cloze:>6}")
    print(f" With images: {with_image:>6}")
    print(f" With emoji: {with_emoji:>6}")
    print(f" With legacy GUIDs: {with_guid:>6}")
    print(f" In confusable groups: {in_confusable:>6}")
    print(f" With shared roots: {with_shared_roots:>6}")
# ---------------------------------------------------------------------------
# Test registry
# ---------------------------------------------------------------------------
# Maps each test's CLI name (as accepted by --test) to the function that runs it.
# main() iterates this dict's values, so insertion order here is execution order.
# Every function is called with the parsed data dict as its only argument.
ALL_TESTS: dict[str, Any] = {
    "required_fields": test_required_fields,
    "root_format": test_root_format,
    "unique_slugs": test_unique_slugs,
    "no_duplicate_keys": test_no_duplicate_keys,
    "confusable_symmetric": test_confusable_symmetric,
    "shared_roots_valid_keys": test_shared_roots_valid_keys,
    "unique_legacy_guids": test_unique_legacy_guids,
    "no_noun_inflection_on_non_nouns": test_no_noun_inflection_on_non_nouns,
    "no_emoji_in_meaning": test_no_emoji_in_meaning,
    "example_sentences_contain_word": test_example_sentences_contain_word,
    "cloze_offsets_valid": test_cloze_offsets_valid,
    "hufal_pual_only_on_hifil_piel": test_hufal_pual_only_on_hifil_piel,
    "confusable_group_shares_ktiv_male": test_confusable_group_shares_ktiv_male,
    "confusables_guid": test_confusables_guid,
    "conjugation_form_guids": test_conjugation_form_guids,
    "conjugation_person_codes": test_conjugation_person_codes,
    "no_stripped_form_sentence_collisions": test_no_stripped_form_sentence_collisions,
}
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main() -> None:
    """Parse the command line, run the selected tests and exit with a status code.

    Options:
        --verbose / -v: print every detail line instead of only the first 20.
        --test NAME: run only the named test from ``ALL_TESTS``; the stats
            summary is skipped in that case.

    Exit status: 0 when no test failed (warnings do not affect it), 1 when at
    least one test failed, 2 for an unknown test name (``load_data`` also
    exits with 2 when the data file is unusable).
    """
    global _verbose
    parser = argparse.ArgumentParser(description="Validate data/words.json against the Hebrew Flash Cards schema.")
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Print full details for all failures (not just first 20).",
    )
    parser.add_argument(
        "--test",
        metavar="NAME",
        help=f"Run a single test by name. Available: {', '.join(ALL_TESTS)}",
    )
    args = parser.parse_args()
    _verbose = args.verbose
    data = load_data()

    # Select tests to run
    if args.test:
        if args.test not in ALL_TESTS:
            print(f"ERROR: unknown test {args.test!r}. Available: {', '.join(ALL_TESTS)}")
            sys.exit(2)
        tests_to_run = {args.test: ALL_TESTS[args.test]}
    else:
        tests_to_run = ALL_TESTS

    # The rule character was an empty string, so both separators printed nothing.
    rule = "-" * 60
    print(f"Validating {DATA_FILE} ({len(data)} entries)")
    print(rule)
    # Every test receives the parsed dict; tests that need the raw file
    # (no_duplicate_keys) re-open DATA_FILE themselves.
    for test_fn in tests_to_run.values():
        test_fn(data)

    # Summary
    if not args.test:
        print_stats(data)
    print()
    print(rule)
    if _warnings:
        print(f" Warnings : {len(_warnings)}")
    if _failures:
        print(f" FAILED: {len(_failures)} test(s): {', '.join(_failures)}")
        sys.exit(1)
    else:
        print(f" All {len(tests_to_run)} test(s) passed.")
        sys.exit(0)
# Run the validator only when executed as a script, not when imported.
if __name__ == "__main__":
    main()