Template & CSS fixes (15 items from Mar 9 feedback): - Fix conjugation front showing 3ms form instead of infinitive - Rename conjugation model to "Hebrew Conjugation" - Strip Hebrew parenthesized text from English meanings - Shoresh separator: spaces → dots (א.כ.ל) - Remove duplicate English meaning from cloze back - Remove example sentences from vocab front/back (cloze only) - Center-align audio buttons on all decks - Fix parenthesis spacing: "you(feminine,singular)" → "you (feminine, singular)" - Unify sec-key/sec-label fonts, make keys bold - Size overhaul: bigger Hebrew (42px), meaning (34px), secondary (28px) - Center-align related words groups - Sort confusables by average frequency - Plurals: show Gender (Hebrew) before Mishkal, strip emoji from meaning - Clean duplicate quotation marks in cloze sentences Sprint 12 carry-forward (detail scrape + EPUB): - Adjective/preposition detail scraping in pealim_detail_scrape.py - EPUB example matching rewrite in epub_examples.py - Delete benyehuda.py and rebuild_sentence_matches.py (merged) - 49 parser tests for detail scraping - SCHEMA.yaml updates for new fields Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
804 lines
29 KiB
Python
804 lines
29 KiB
Python
"""Standalone integrity validator for data/words.json.
|
||
|
||
Validates the unified Hebrew Flash Cards data against the schema defined in
SCHEMA.yaml. Each test prints PASS or FAIL (or WARN for non-fatal findings)
with details on failures.

Exit status: 0 when every test passes, 1 when at least one test fails,
2 when the data file is missing or an unknown test name is requested.

Usage:
    python3 scripts/validate_data.py
    python3 scripts/validate_data.py --verbose
    python3 scripts/validate_data.py --test confusable_symmetric
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import json
|
||
import re
|
||
import sys
|
||
import unicodedata
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
# ---------------------------------------------------------------------------
# Bootstrap: make project root importable so helpers.py is accessible
# (the project root is the directory two levels above this file)
# ---------------------------------------------------------------------------
sys.path.insert(0, str(Path(__file__).parent.parent))

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Dataset under validation: <project root>/data/words.json.
DATA_FILE = Path(__file__).parent.parent / "data" / "words.json"

# Inclusive code point bounds of the Hebrew letters: alef (U+05D0) – tav (U+05EA).
HEBREW_CONSONANT_RANGE = (0x05D0, 0x05EA)

# Person codes accepted on conjugation forms.
VALID_PERSON_CODES: frozenset[str] = frozenset(
    {
        "inf",
        "1s",
        "1p",
        "2ms",
        "2fs",
        "2mp",
        "2fp",
        "3ms",
        "3fs",
        "3mp",
        "3fp",
        "3p",
        "ms",
        "fs",
        "mp",
        "fp",
    }
)

# One character class matching a single code point from any listed emoji range.
EMOJI_RE = re.compile(
    r"[\U0001f600-\U0001f64f"  # Emoticons
    r"\U0001f300-\U0001f5ff"  # Miscellaneous Symbols and Pictographs
    r"\U0001f680-\U0001f6ff"  # Transport and Map Symbols
    r"\U0001f1e0-\U0001f1ff"  # regional indicator symbols (flags)
    r"\U00002702-\U000027b0"  # part of the Dingbats block
    r"\U0001f900-\U0001f9ff"  # Supplemental Symbols and Pictographs
    r"\U0001fa00-\U0001fa6f"  # Chess Symbols
    r"\U0001fa70-\U0001faff]"  # Symbols and Pictographs Extended-A
)

# ---------------------------------------------------------------------------
# Result tracking
# ---------------------------------------------------------------------------
_failures: list[str] = []  # names of the checks that reported FAIL
_warnings: list[str] = []  # every warning detail line handed to _warn()
_verbose: bool = False  # set by main() from --verbose
|
||
|
||
|
||
def _pass(name: str) -> None:
    """Print the PASS line for the check called *name*."""
    line = f" PASS {name}"
    print(line)
|
||
|
||
|
||
def _fail(name: str, details: list[str]) -> None:
    """Record *name* as a failed check and print it with its detail lines.

    The name is appended to the module-level ``_failures`` list, which
    ``main`` reads to build the summary and choose the exit status.
    """
    # Appending mutates the existing list, so no ``global`` statement is needed.
    _failures.append(name)
    print(f" FAIL {name}")
    for detail in details:
        print(f" {detail}")
|
||
|
||
|
||
def _warn(name: str, details: list[str]) -> None:
    """Record non-fatal findings and print them under a WARN heading.

    Every line in *details* is added to the module-level ``_warnings`` list,
    so the warning total reported by ``main`` counts detail lines, not checks.
    """
    # Extending mutates the existing list, so no ``global`` statement is needed.
    _warnings.extend(details)
    print(f" WARN {name}")
    for detail in details:
        print(f" {detail}")
|
||
|
||
|
||
def _verbose_print(msg: str) -> None:
    """Print *msg* only when the module-level ``_verbose`` flag is set."""
    if not _verbose:
        return
    print(f" {msg}")
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helper: load data
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def load_data() -> dict[str, Any]:
    """Load words.json and return the parsed dict.

    Returns:
        The top-level JSON object of ``DATA_FILE``.

    Exits with status 2 (after printing an ``ERROR:`` line) when the file
    does not exist, is not valid UTF-8 JSON, or its top-level value is not a
    JSON object. Every check iterates ``data.items()``, so anything other
    than a dict cannot be validated.
    """
    try:
        with DATA_FILE.open(encoding="utf-8") as fh:
            parsed = json.load(fh)
    except FileNotFoundError:
        print(f"ERROR: data file not found: {DATA_FILE}")
        sys.exit(2)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        # A validator should report unreadable input, not die with a traceback.
        print(f"ERROR: data file is not valid UTF-8 JSON: {DATA_FILE} ({exc})")
        sys.exit(2)

    if not isinstance(parsed, dict):
        print(f"ERROR: top-level JSON value must be an object, got {type(parsed).__name__}: {DATA_FILE}")
        sys.exit(2)
    return parsed
|
||
|
||
|
||
def _is_hebrew_consonant(ch: str) -> bool:
    """Return True if ch starts with a Hebrew consonant (U+05D0..U+05EA).

    Accepts multi-codepoint strings like 'שׁ' (shin + shin dot): the string
    is NFD-decomposed and only its first code point is range-checked.
    Anything following that first code point is not inspected.

    An empty string is not a consonant and yields False (previously it
    raised IndexError).
    """
    if not ch:
        return False
    # After NFD decomposition the base letter comes first, ahead of any marks.
    base = unicodedata.normalize("NFD", ch)[0]
    lowest, highest = HEBREW_CONSONANT_RANGE
    return lowest <= ord(base) <= highest
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Individual tests
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def test_required_fields(data: dict[str, Any]) -> None:
    """Check that every entry carries word.nikkud, word.ktiv_male, slug, pos and meaning.

    A missing or falsy value for any of those is an error. A null or absent
    ``frequency`` is reported separately as a ``frequency_missing`` warning
    and never fails the check.
    """
    name = "required_fields"
    errors: list[str] = []
    missing_frequency: list[str] = []

    for key, entry in data.items():
        word = entry.get("word")
        if isinstance(word, dict):
            word_checks = (
                ("nikkud", f"[{key}] word.nikkud is missing or empty"),
                ("ktiv_male", f"[{key}] word.ktiv_male is missing or empty"),
            )
            errors.extend(message for field, message in word_checks if not word.get(field))
        else:
            errors.append(f"[{key}] 'word' is missing or not a dict")

        entry_checks = (
            ("slug", f"[{key}] 'slug' is missing or empty"),
            ("pos", f"[{key}] 'pos' is missing or empty"),
            ("meaning", f"[{key}] 'meaning' is missing or empty"),
        )
        errors.extend(message for field, message in entry_checks if not entry.get(field))

        if entry.get("frequency") is None:
            missing_frequency.append(f"[{key}] 'frequency' is null/missing")

    if missing_frequency:
        _warn("frequency_missing", missing_frequency if _verbose else missing_frequency[:20])
        unshown = len(missing_frequency) - 20
        if unshown > 0 and not _verbose:
            print(f" ... ({unshown} more; use --verbose)")

    if not errors:
        _pass(name)
        return

    # Only the first 20 details are shown unless --verbose was given.
    _fail(name, errors if _verbose else errors[:20])
    hidden = len(errors) - 20
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||
|
||
|
||
def test_root_format(data: dict[str, Any]) -> None:
    """Check that ``root`` is an empty list or a list of 2-5 Hebrew consonants.

    An absent/null root, a non-list root, a wrong length, or an element that
    is not a non-empty string starting with a Hebrew consonant each produce
    one error per entry.
    """
    name = "root_format"
    errors: list[str] = []

    for key, entry in data.items():
        root = entry.get("root")
        if root is None:
            errors.append(f"[{key}] 'root' key is absent (should be [] for rootless words)")
        elif not isinstance(root, list):
            errors.append(f"[{key}] 'root' is not a list: {root!r}")
        elif not root:
            pass  # rootless word — valid
        elif not 2 <= len(root) <= 5:
            errors.append(f"[{key}] root has {len(root)} elements (expected 2-5): {root!r}")
        else:
            for element in root:
                # An element may be multi-codepoint (e.g. 'שׁ' = shin + shin dot);
                # _is_hebrew_consonant looks at the base letter after NFD decomposition.
                if isinstance(element, str) and element and _is_hebrew_consonant(element):
                    continue
                errors.append(f"[{key}] root char {element!r} is not a Hebrew consonant (U+05D0..U+05EA)")
                break  # report only the first bad element of a root

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    hidden = len(errors) - 20
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||
|
||
|
||
def test_unique_slugs(data: dict[str, Any]) -> None:
    """Check that no two entries share the same non-empty slug.

    Entries with an empty or missing slug are ignored here.
    """
    name = "unique_slugs"
    keys_by_slug: dict[str, list[str]] = {}

    for key, entry in data.items():
        slug = entry.get("slug")
        if not slug:
            continue
        keys_by_slug.setdefault(slug, []).append(key)

    errors = [f"slug={slug!r} shared by: {keys}" for slug, keys in keys_by_slug.items() if len(keys) > 1]

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    hidden = len(errors) - 20
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||
|
||
|
||
def test_no_duplicate_keys(_data: dict[str, Any]) -> None:  # noqa: ARG001
    """Check that the raw JSON file contains no repeated object keys.

    ``json.load`` silently keeps the last value when a key is repeated, so
    the already-parsed ``_data`` cannot reveal the problem and is ignored.
    The file is read again with an ``object_pairs_hook`` that sees every
    key/value pair before the pairs are collapsed into a dict. The hook is
    invoked for each JSON object in the file, so repeated keys inside nested
    objects are reported as well as top-level ones.
    """
    name = "no_duplicate_keys"
    duplicates: list[str] = []

    def _collect(pairs: list[tuple[str, Any]]) -> dict[str, Any]:
        # Rebuild the object the way json would, noting every repeated key.
        rebuilt: dict[str, Any] = {}
        for pair_key, pair_value in pairs:
            if pair_key in rebuilt:
                duplicates.append(pair_key)
            rebuilt[pair_key] = pair_value
        return rebuilt

    with DATA_FILE.open(encoding="utf-8") as fh:
        json.load(fh, object_pairs_hook=_collect)

    if not duplicates:
        _pass(name)
        return
    _fail(name, [f"duplicate key: {k!r}" for k in duplicates])
|
||
|
||
|
||
def test_confusable_symmetric(data: dict[str, Any]) -> None:
    """Check that confusable_group links are mutual.

    Whenever entry A names B in its ``confusable_group``, B must exist as a
    top-level key and must name A in its own ``confusable_group``.
    """
    name = "confusable_symmetric"
    errors: list[str] = []

    for key, entry in data.items():
        # A null/empty group simply yields nothing to check.
        for other_key in entry.get("confusable_group") or ():
            other = data.get(other_key)
            if other is None:
                errors.append(f"[{key}] confusable_group references non-existent key {other_key!r}")
                continue
            back_links = other.get("confusable_group") or []
            if key not in back_links:
                errors.append(f"[{key}] lists {other_key!r} as confusable, but {other_key!r} does not list {key!r}")

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    hidden = len(errors) - 20
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||
|
||
|
||
def test_shared_roots_valid_keys(data: dict[str, Any]) -> None:
    """Check that every key listed in ``shared_roots`` is a top-level key of *data*."""
    name = "shared_roots_valid_keys"
    errors: list[str] = []

    for key, entry in data.items():
        # A null/empty shared_roots contributes no references.
        errors.extend(
            f"[{key}] shared_roots references non-existent key {ref_key!r}"
            for ref_key in entry.get("shared_roots") or ()
            if ref_key not in data
        )

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    hidden = len(errors) - 20
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||
|
||
|
||
def test_unique_legacy_guids(data: dict[str, Any]) -> None:
    """Check that non-null ``vocab_legacy_guid`` values are not shared.

    Exception: entries with the same word.nikkud inherited the same legacy
    Anki card (PoS homographs like חַד Particle vs Adjective). Legacy GUIDs
    were generated from the nikkud word alone, so a GUID shared only among
    entries with one common nikkud is tolerated and merely logged in
    verbose mode.
    """
    name = "unique_legacy_guids"
    keys_by_guid: dict[str, list[str]] = {}

    for key, entry in data.items():
        guid = entry.get("vocab_legacy_guid")
        if not guid:
            continue
        keys_by_guid.setdefault(guid, []).append(key)

    errors: list[str] = []
    for guid, keys in keys_by_guid.items():
        if len(keys) < 2:
            continue
        nikkud_values = {(data[k].get("word") or {}).get("nikkud") for k in keys}
        if len(nikkud_values) != 1:
            errors.append(f"guid={guid!r} shared by entries with DIFFERENT nikkud: {keys}")
            continue
        # All sharers have the same nikkud -> inherited from one legacy card.
        _verbose_print(
            f"GUID {guid!r} shared by {len(keys)} entries with same nikkud ({next(iter(nikkud_values))!r}): {keys}"
        )

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    hidden = len(errors) - 20
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||
|
||
|
||
def test_no_noun_inflection_on_non_nouns(data: dict[str, Any]) -> None:
    """Check that ``noun_inflection`` is null unless ``pos`` starts with 'Noun'.

    Explicit test case: 'גָּבוֹהַּ' (adjective) must NOT have noun_inflection.
    """
    name = "no_noun_inflection_on_non_nouns"
    errors: list[str] = []

    for key, entry in data.items():
        pos = entry.get("pos") or ""
        # Nouns may have inflection data; everything else must leave it null.
        if pos.startswith("Noun") or entry.get("noun_inflection") is None:
            continue
        errors.append(f"[{key}] pos={pos!r} but noun_inflection is set")
        _verbose_print(f"offending entry: {key!r}")

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    hidden = len(errors) - 20
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||
|
||
|
||
def test_no_emoji_in_meaning(data: dict[str, Any]) -> None:
    """Check that no ``meaning`` contains a character matched by ``EMOJI_RE``.

    A missing or null meaning is treated as the empty string.
    """
    name = "no_emoji_in_meaning"
    meanings = ((key, entry.get("meaning") or "") for key, entry in data.items())
    errors: list[str] = [
        f"[{key}] meaning contains emoji: {meaning!r}" for key, meaning in meanings if EMOJI_RE.search(meaning)
    ]

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    hidden = len(errors) - 20
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||
|
||
|
||
def test_example_sentences_contain_word(data: dict[str, Any]) -> None:
    """Warn when none of an entry's vetted sentences contains its word.nikkud.

    Matching is an exact substring test on the nikkud spelling; nothing is
    stripped first. Entries without vetted examples or without a nikkud
    word are skipped. Findings are warnings only: the check always ends
    with PASS.
    """
    name = "example_sentences_contain_word"
    errors: list[str] = []

    for key, entry in data.items():
        examples = entry.get("examples")
        if not examples:
            continue
        vetted = examples.get("vetted")
        if not vetted:
            continue

        nikkud_word = (entry.get("word") or {}).get("nikkud") or ""
        if not nikkud_word:
            continue

        if any(nikkud_word in (sentence.get("text") or "") for sentence in vetted):
            continue

        # Show at most two sentences to keep the message readable.
        sentences_preview = [sentence.get("text", "") for sentence in vetted[:2]]
        errors.append(
            f"[{key}] word {nikkud_word!r} not found in any vetted sentence. Sentences: {sentences_preview!r}"
        )

    if errors:
        _warn(name, errors if _verbose else errors[:20])
        hidden = len(errors) - 20
        if hidden > 0 and not _verbose:
            print(f" ... ({hidden} more; use --verbose)")
    _pass(name)
|
||
|
||
|
||
def test_cloze_offsets_valid(data: dict[str, Any]) -> None:
    """cloze_word_start/end must be within text bounds when present.

    For every entry with ``examples.cloze`` the offsets must be integers
    with ``0 <= start < end <= len(text)``. JSON booleans are rejected even
    though ``bool`` is a subclass of ``int`` in Python; otherwise ``true``
    would silently be accepted as offset 1.

    Null offsets are tolerated (and warned separately) because some sentences
    contain only inflected/construct/plural forms that cannot be matched back
    to the base nikkud or ktiv_male — this is a data quality issue in
    vetted_sentences.json, not a schema violation.
    """
    name = "cloze_offsets_valid"
    errors: list[str] = []
    null_warn: list[str] = []

    for key, entry in data.items():
        examples = entry.get("examples")
        if not examples:
            continue
        cloze = examples.get("cloze")
        if not cloze:
            continue

        text = cloze.get("text") or ""
        start = cloze.get("cloze_word_start")
        end = cloze.get("cloze_word_end")

        if start is None or end is None:
            null_warn.append(f"[{key}] cloze present but cloze_word_start/end are null")
            continue

        # isinstance(True, int) is True, so booleans must be excluded explicitly.
        if any(isinstance(offset, bool) or not isinstance(offset, int) for offset in (start, end)):
            errors.append(f"[{key}] cloze_word_start/end are not integers: {start!r}, {end!r}")
            continue
        if start < 0 or end < 0:
            errors.append(f"[{key}] cloze offsets are negative: start={start}, end={end}")
            continue
        if start >= end:
            errors.append(f"[{key}] cloze start >= end: start={start}, end={end}")
            continue
        text_len = len(text)
        if end > text_len:
            errors.append(f"[{key}] cloze end={end} exceeds text length={text_len}: {text!r}")

    if null_warn:
        _warn(f"{name}_null_offsets", null_warn[:20] if not _verbose else null_warn)
        if len(null_warn) > 20 and not _verbose:
            print(f" ... ({len(null_warn) - 20} more; use --verbose)")

    if errors:
        _fail(name, errors[:20] if not _verbose else errors)
        if len(errors) > 20 and not _verbose:
            print(f" ... ({len(errors) - 20} more; use --verbose)")
    else:
        _pass(name)
|
||
|
||
|
||
def test_hufal_pual_only_on_hifil_piel(data: dict[str, Any]) -> None:
    """Check that ``hufal_pual_forms`` is only set for Hif'il or Pi'el verbs.

    The binyan is recognised by a case-insensitive substring test: its name
    must contain "hif" or "pi".
    """
    name = "hufal_pual_only_on_hifil_piel"
    errors: list[str] = []

    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj or conj.get("hufal_pual_forms") is None:
            continue

        binyan = conj.get("binyan") or ""
        lowered = binyan.lower()
        if not ("hif" in lowered or "pi" in lowered):
            errors.append(f"[{key}] hufal_pual_forms is set but binyan={binyan!r} (expected Hif'il or Pi'el)")

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    hidden = len(errors) - 20
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||
|
||
|
||
def test_confusable_group_shares_ktiv_male(data: dict[str, Any]) -> None:
    """Check that an entry and its confusable_group members share one word.ktiv_male.

    Entries or members without a ktiv_male are skipped, as are group
    members that do not exist in *data*.
    """
    name = "confusable_group_shares_ktiv_male"
    errors: list[str] = []

    for key, entry in data.items():
        group = entry.get("confusable_group")
        if not group:
            continue

        my_ktiv = (entry.get("word") or {}).get("ktiv_male")
        if not my_ktiv:
            continue

        for other_key in group:
            other = data.get(other_key)
            if not other:
                continue  # already caught by confusable_symmetric
            other_ktiv = (other.get("word") or {}).get("ktiv_male")
            if not other_ktiv or other_ktiv == my_ktiv:
                continue
            errors.append(
                f"[{key}] ktiv_male={my_ktiv!r} but confusable member {other_key!r} has ktiv_male={other_ktiv!r}"
            )

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    hidden = len(errors) - 20
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||
|
||
|
||
def test_confusables_guid(data: dict[str, Any]) -> None:
    """Check that ``confusables_guid`` agrees with ``confusable_group``.

    Rules:
    - An entry with a non-empty confusable_group must have a confusables_guid.
    - An entry without a confusable_group must have a null confusables_guid.
    - Every existing member named in an entry's confusable_group must carry
      the same confusables_guid as that entry.
    """
    name = "confusables_guid"
    errors: list[str] = []

    for key, entry in data.items():
        group = entry.get("confusable_group")
        guid = entry.get("confusables_guid")

        if not group:
            if guid is not None:
                errors.append(f"[{key}] has confusables_guid={guid!r} but confusable_group is null")
            continue
        if not guid:
            errors.append(f"[{key}] has confusable_group but confusables_guid is null/missing")
            continue

        for other_key in group:
            other = data.get(other_key)
            if not other:
                continue  # already caught by confusable_symmetric
            other_guid = other.get("confusables_guid")
            if other_guid == guid:
                continue
            errors.append(
                f"[{key}] confusables_guid={guid!r} but confusable member "
                f"{other_key!r} has confusables_guid={other_guid!r}"
            )

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    hidden = len(errors) - 20
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||
|
||
|
||
def test_conjugation_form_guids(data: dict[str, Any]) -> None:
    """Check GUID presence and per-verb uniqueness on conjugation forms.

    Rules:
    - Each form in active_forms and hufal_pual_forms should have a non-null
      ``guid`` OR a non-empty ``guid_candidates`` list (used for present
      tense, past 3p, and 1st person forms where multiple GUIDs are
      possible). A form with neither is only counted towards a warning.
    - No two forms within the same verb (across both form lists) may share a
      GUID. When a form has a ``guid``, its candidates are not examined.
    """
    name = "conjugation_form_guids"
    errors: list[str] = []
    missing: list[str] = []

    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj:
            continue

        owner_of: dict[str, str] = {}  # guid -> "form_list_key[person]" label

        for form_list_key in ("active_forms", "hufal_pual_forms"):
            for form in conj.get(form_list_key) or ():
                person = form.get("person", "?")
                label = f"{form_list_key}[{person}]"
                guid = form.get("guid")
                guid_candidates = form.get("guid_candidates")

                if not guid and not guid_candidates:
                    # New forms from rescrape use deterministic fallback — warn, don't fail
                    missing.append(f"[{key}] {label}: missing or null 'guid' and no 'guid_candidates'")
                    continue

                if guid:
                    if guid in owner_of:
                        errors.append(f"[{key}] {label}: guid={guid!r} duplicates {owner_of[guid]}")
                    else:
                        owner_of[guid] = label
                    continue

                # No guid: each candidate is claimed on behalf of this form.
                for candidate in guid_candidates:
                    if candidate in owner_of:
                        errors.append(
                            f"[{key}] {label}: guid_candidate={candidate!r} duplicates {owner_of[candidate]}"
                        )
                    else:
                        owner_of[candidate] = label

    if missing:
        _warn(name + "_missing", [f"{len(missing)} forms missing guid (deterministic fallback used)"])

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    hidden = len(errors) - 20
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||
|
||
|
||
def test_conjugation_person_codes(data: dict[str, Any]) -> None:
    """Check that every conjugation form's person code is in ``VALID_PERSON_CODES``.

    Both ``active_forms`` and ``hufal_pual_forms`` are inspected; a form
    with no ``person`` at all is reported with code ``None``.
    """
    name = "conjugation_person_codes"
    errors: list[str] = []

    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj:
            continue

        for form_list_key in ("active_forms", "hufal_pual_forms"):
            persons = (form.get("person") for form in conj.get(form_list_key) or ())
            errors.extend(
                f"[{key}] {form_list_key}: invalid person code {person!r}"
                for person in persons
                if person not in VALID_PERSON_CODES
            )

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    hidden = len(errors) - 20
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||
|
||
|
||
def test_no_stripped_form_sentence_collisions(data: dict[str, Any]) -> None:
    """Warn when a confusable word's sentences contain the other homograph.

    Specifically: if A and B are confusable (same ktiv_male), A's vetted
    sentences must not contain B's nikkud form, and vice versa. Members
    whose nikkud equals the entry's own are skipped because they cannot be
    told apart by nikkud. Findings are warnings only: the check always ends
    with PASS.
    """
    name = "no_stripped_form_sentence_collisions"
    errors: list[str] = []

    for key, entry in data.items():
        group = entry.get("confusable_group")
        if not group:
            continue

        examples = entry.get("examples")
        if not examples:
            continue
        vetted = examples.get("vetted")
        if not vetted:
            continue

        my_nikkud = (entry.get("word") or {}).get("nikkud") or ""
        my_texts = [sentence.get("text") or "" for sentence in vetted]

        for other_key in group:
            other = data.get(other_key)
            if not other:
                continue
            other_nikkud = (other.get("word") or {}).get("nikkud") or ""
            if not other_nikkud or other_nikkud == my_nikkud:
                continue  # same nikkud homographs are ok (we can't distinguish by nikkud)

            for text in my_texts:
                if other_nikkud not in text:
                    continue
                errors.append(f"[{key}] sentence contains wrong homograph {other_nikkud!r}: {text!r}")
                _verbose_print(f" my word: {my_nikkud!r}, wrong form: {other_nikkud!r}")
                break  # one error per (key, other_key) pair is enough

    if errors:
        _warn(name, errors if _verbose else errors[:20])
        hidden = len(errors) - 20
        if hidden > 0 and not _verbose:
            print(f" ... ({hidden} more; use --verbose)")
    _pass(name)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Stats summary
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def print_stats(data: dict[str, Any]) -> None:
    """Print how many entries have each optional kind of data.

    An entry counts towards a line when the corresponding field (or, for
    examples, the ``vetted`` / ``cloze`` sub-field) is truthy.
    """
    entries = list(data.values())

    def having(field: str) -> int:
        # Number of entries whose top-level *field* is truthy.
        return len([entry for entry in entries if entry.get(field)])

    def having_example(kind: str) -> int:
        # Number of entries whose examples[kind] is truthy.
        return len([entry for entry in entries if (entry.get("examples") or {}).get(kind)])

    total = len(data)
    with_conj = having("conjugation")
    with_noun_inf = having("noun_inflection")
    with_vetted = having_example("vetted")
    with_cloze = having_example("cloze")
    with_image = having("image")
    with_emoji = having("emoji")
    with_guid = having("vocab_legacy_guid")
    in_confusable = having("confusable_group")
    with_shared_roots = having("shared_roots")

    print()
    print("Stats Summary")
    print("─" * 42)
    print(f" Total entries: {total:>6}")
    print(f" With conjugation data: {with_conj:>6}")
    print(f" With noun_inflection: {with_noun_inf:>6}")
    print(f" With vetted examples: {with_vetted:>6}")
    print(f" With cloze examples: {with_cloze:>6}")
    print(f" With images: {with_image:>6}")
    print(f" With emoji: {with_emoji:>6}")
    print(f" With legacy GUIDs: {with_guid:>6}")
    print(f" In confusable groups: {in_confusable:>6}")
    print(f" With shared roots: {with_shared_roots:>6}")
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test registry
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Registry of checks in execution order: the name accepted by --test mapped
# to the function that runs it. main() iterates this mapping as-is.
ALL_TESTS: dict[str, Any] = dict(
    (
        ("required_fields", test_required_fields),
        ("root_format", test_root_format),
        ("unique_slugs", test_unique_slugs),
        ("no_duplicate_keys", test_no_duplicate_keys),
        ("confusable_symmetric", test_confusable_symmetric),
        ("shared_roots_valid_keys", test_shared_roots_valid_keys),
        ("unique_legacy_guids", test_unique_legacy_guids),
        ("no_noun_inflection_on_non_nouns", test_no_noun_inflection_on_non_nouns),
        ("no_emoji_in_meaning", test_no_emoji_in_meaning),
        ("example_sentences_contain_word", test_example_sentences_contain_word),
        ("cloze_offsets_valid", test_cloze_offsets_valid),
        ("hufal_pual_only_on_hifil_piel", test_hufal_pual_only_on_hifil_piel),
        ("confusable_group_shares_ktiv_male", test_confusable_group_shares_ktiv_male),
        ("confusables_guid", test_confusables_guid),
        ("conjugation_form_guids", test_conjugation_form_guids),
        ("conjugation_person_codes", test_conjugation_person_codes),
        ("no_stripped_form_sentence_collisions", test_no_stripped_form_sentence_collisions),
    )
)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Entry point
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def main() -> None:
    """Parse the command line, run the selected checks and exit with a status.

    Exit status: 0 when no check failed, 1 when at least one check failed,
    2 when ``--test`` names an unknown check. The stats summary is printed
    only when the full suite runs (no ``--test``).
    """
    global _verbose

    parser = argparse.ArgumentParser(description="Validate data/words.json against the Hebrew Flash Cards schema.")
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Print full details for all failures (not just first 20).",
    )
    parser.add_argument(
        "--test",
        metavar="NAME",
        help=f"Run a single test by name. Available: {', '.join(ALL_TESTS)}",
    )
    args = parser.parse_args()
    _verbose = args.verbose

    data = load_data()

    # Select tests to run
    if not args.test:
        tests_to_run = ALL_TESTS
    elif args.test in ALL_TESTS:
        tests_to_run = {args.test: ALL_TESTS[args.test]}
    else:
        print(f"ERROR: unknown test {args.test!r}. Available: {', '.join(ALL_TESTS)}")
        sys.exit(2)

    print(f"Validating {DATA_FILE} ({len(data)} entries)")
    print("─" * 60)

    # Every check receives the parsed data; no_duplicate_keys ignores it and
    # re-reads the file itself.
    for test_fn in tests_to_run.values():
        test_fn(data)

    # Summary
    if not args.test:
        print_stats(data)

    print()
    print("─" * 60)
    if _warnings:
        print(f" Warnings : {len(_warnings)}")

    if _failures:
        print(f" FAILED: {len(_failures)} test(s): {', '.join(_failures)}")
        exit_status = 1
    else:
        print(f" All {len(tests_to_run)} test(s) passed.")
        exit_status = 0
    sys.exit(exit_status)


if __name__ == "__main__":
    main()
|