"""Standalone integrity validator for data/words.json. Validates the unified Hebrew Flash Cards data against the schema defined in SCHEMA.yaml. Each test prints PASS/FAIL with details on failures. Usage: python3 scripts/validate_data.py python3 scripts/validate_data.py --verbose python3 scripts/validate_data.py --test confusable_symmetric """ from __future__ import annotations import argparse import json import re import sys import unicodedata from pathlib import Path from typing import Any # --------------------------------------------------------------------------- # Bootstrap: make project root importable so helpers.py is accessible # --------------------------------------------------------------------------- sys.path.insert(0, str(Path(__file__).parent.parent)) # --------------------------------------------------------------------------- # Constants # --------------------------------------------------------------------------- DATA_FILE = Path(__file__).parent.parent / "data" / "words.json" HEBREW_CONSONANT_RANGE = (0x05D0, 0x05EA) # alef–tav VALID_PERSON_CODES: frozenset[str] = frozenset( ["1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp", "3p", "ms", "fs", "mp", "fp"] ) EMOJI_RE = re.compile( r"[\U0001f600-\U0001f64f" r"\U0001f300-\U0001f5ff" r"\U0001f680-\U0001f6ff" r"\U0001f1e0-\U0001f1ff" r"\U00002702-\U000027b0" r"\U0001f900-\U0001f9ff" r"\U0001fa00-\U0001fa6f" r"\U0001fa70-\U0001faff]" ) # --------------------------------------------------------------------------- # Result tracking # --------------------------------------------------------------------------- _failures: list[str] = [] _warnings: list[str] = [] _verbose: bool = False def _pass(name: str) -> None: print(f" PASS {name}") def _fail(name: str, details: list[str]) -> None: global _failures _failures.append(name) print(f" FAIL {name}") for d in details: print(f" {d}") def _warn(name: str, details: list[str]) -> None: global _warnings _warnings.extend(details) print(f" WARN {name}") for d in details: print(f" {d}") def _verbose_print(msg: str) -> None: if _verbose: print(f" {msg}") # --------------------------------------------------------------------------- # Helper: load data # --------------------------------------------------------------------------- def load_data() -> dict[str, Any]: """Load words.json and return the parsed dict.""" if not DATA_FILE.exists(): print(f"ERROR: data file not found: {DATA_FILE}") sys.exit(2) with DATA_FILE.open(encoding="utf-8") as fh: return json.load(fh) def _is_hebrew_consonant(ch: str) -> bool: """Return True if ch is a Hebrew consonant (U+05D0..U+05EA). Accepts multi-codepoint strings like 'שׁ' (shin + shin dot) by checking only the first base character after NFD decomposition. """ normalized = unicodedata.normalize("NFD", ch) # The first codepoint is the base consonant; the rest are combining marks. base = normalized[0] cp = ord(base) return HEBREW_CONSONANT_RANGE[0] <= cp <= HEBREW_CONSONANT_RANGE[1] # --------------------------------------------------------------------------- # Individual tests # --------------------------------------------------------------------------- def test_required_fields(data: dict[str, Any]) -> None: """Every entry has word.nikkud, word.ktiv_male, slug, pos, meaning.""" name = "required_fields" errors: list[str] = [] warn_details: list[str] = [] for key, entry in data.items(): word = entry.get("word") if not isinstance(word, dict): errors.append(f"[{key}] 'word' is missing or not a dict") else: if not word.get("nikkud"): errors.append(f"[{key}] word.nikkud is missing or empty") if not word.get("ktiv_male"): errors.append(f"[{key}] word.ktiv_male is missing or empty") if not entry.get("slug"): errors.append(f"[{key}] 'slug' is missing or empty") if not entry.get("pos"): errors.append(f"[{key}] 'pos' is missing or empty") if not entry.get("meaning"): errors.append(f"[{key}] 'meaning' is missing or empty") if entry.get("frequency") is None: warn_details.append(f"[{key}] 'frequency' is null/missing") if warn_details: _warn("frequency_missing", warn_details[:20] if not _verbose else warn_details) if len(warn_details) > 20 and not _verbose: print(f" ... ({len(warn_details) - 20} more; use --verbose)") if errors: _fail(name, errors[:20] if not _verbose else errors) if len(errors) > 20 and not _verbose: print(f" ... ({len(errors) - 20} more; use --verbose)") else: _pass(name) def test_root_format(data: dict[str, Any]) -> None: """root is a list of 2-5 Hebrew consonant chars, or an empty list.""" name = "root_format" errors: list[str] = [] for key, entry in data.items(): root = entry.get("root") if root is None: errors.append(f"[{key}] 'root' key is absent (should be [] for rootless words)") continue if not isinstance(root, list): errors.append(f"[{key}] 'root' is not a list: {root!r}") continue if len(root) == 0: continue # rootless word — valid if not (2 <= len(root) <= 5): errors.append(f"[{key}] root has {len(root)} elements (expected 2-5): {root!r}") continue for ch in root: # A root element may be multi-codepoint (e.g. 'שׁ' = shin + shin dot). # Validate by checking the base consonant after NFD decomposition. if not isinstance(ch, str) or not ch or not _is_hebrew_consonant(ch): errors.append(f"[{key}] root char {ch!r} is not a Hebrew consonant (U+05D0..U+05EA)") break if errors: _fail(name, errors[:20] if not _verbose else errors) if len(errors) > 20 and not _verbose: print(f" ... ({len(errors) - 20} more; use --verbose)") else: _pass(name) def test_unique_slugs(data: dict[str, Any]) -> None: """All non-empty slugs are unique across entries — each pealim page is a distinct word.""" name = "unique_slugs" seen: dict[str, list[str]] = {} for key, entry in data.items(): slug = entry.get("slug") if slug: seen.setdefault(slug, []).append(key) dups = {slug: keys for slug, keys in seen.items() if len(keys) > 1} if dups: errors = [f"slug={slug!r} shared by: {keys}" for slug, keys in dups.items()] _fail(name, errors[:20] if not _verbose else errors) if len(errors) > 20 and not _verbose: print(f" ... ({len(errors) - 20} more; use --verbose)") else: _pass(name) def test_no_duplicate_keys(_data: dict[str, Any]) -> None: # noqa: ARG001 """JSON loaded without top-level key collisions. Python's json.load silently keeps the last value on duplicate keys; we re-parse with a custom object_pairs_hook to detect them. The pre-parsed ``_data`` dict is not used here because we need to re-read the raw file to catch duplicate keys that json.load would silently merge. """ name = "no_duplicate_keys" duplicates: list[str] = [] def _detect_dups(pairs: list[tuple[str, Any]]) -> dict[str, Any]: d: dict[str, Any] = {} for k, v in pairs: if k in d: duplicates.append(k) d[k] = v return d with DATA_FILE.open(encoding="utf-8") as fh: json.load(fh, object_pairs_hook=_detect_dups) if duplicates: _fail(name, [f"duplicate key: {k!r}" for k in duplicates]) else: _pass(name) def test_confusable_symmetric(data: dict[str, Any]) -> None: """If A lists B in confusable_group, B must list A.""" name = "confusable_symmetric" errors: list[str] = [] for key, entry in data.items(): group = entry.get("confusable_group") if not group: continue for other_key in group: other = data.get(other_key) if other is None: errors.append(f"[{key}] confusable_group references non-existent key {other_key!r}") continue other_group = other.get("confusable_group") or [] if key not in other_group: errors.append(f"[{key}] lists {other_key!r} as confusable, but {other_key!r} does not list {key!r}") if errors: _fail(name, errors[:20] if not _verbose else errors) if len(errors) > 20 and not _verbose: print(f" ... ({len(errors) - 20} more; use --verbose)") else: _pass(name) def test_shared_roots_valid_keys(data: dict[str, Any]) -> None: """Every key in shared_roots must exist as a top-level key.""" name = "shared_roots_valid_keys" errors: list[str] = [] for key, entry in data.items(): shared = entry.get("shared_roots") if not shared: continue for ref_key in shared: if ref_key not in data: errors.append(f"[{key}] shared_roots references non-existent key {ref_key!r}") if errors: _fail(name, errors[:20] if not _verbose else errors) if len(errors) > 20 and not _verbose: print(f" ... ({len(errors) - 20} more; use --verbose)") else: _pass(name) def test_unique_legacy_guids(data: dict[str, Any]) -> None: """No two entries share the same vocab_legacy_guid (excluding null). Exception: entries that share the same word.nikkud value inherited the same legacy Anki card (PoS homographs like חַד Particle vs Adjective). These are tolerated — the duplicate GUID is a known artefact of how legacy GUIDs were generated from the nikkud word alone. """ name = "unique_legacy_guids" seen: dict[str, list[str]] = {} for key, entry in data.items(): guid = entry.get("vocab_legacy_guid") if guid: seen.setdefault(guid, []).append(key) errors: list[str] = [] for guid, keys in seen.items(): if len(keys) <= 1: continue # Tolerate sharing if ALL entries with this GUID share the same word.nikkud nikkud_values = {(data[k].get("word") or {}).get("nikkud") for k in keys} if len(nikkud_values) == 1: # Same nikkud -> inherited from same legacy card; tolerable _verbose_print( f"GUID {guid!r} shared by {len(keys)} entries with same nikkud ({next(iter(nikkud_values))!r}): {keys}" ) continue errors.append(f"guid={guid!r} shared by entries with DIFFERENT nikkud: {keys}") if errors: _fail(name, errors[:20] if not _verbose else errors) if len(errors) > 20 and not _verbose: print(f" ... ({len(errors) - 20} more; use --verbose)") else: _pass(name) def test_no_noun_inflection_on_non_nouns(data: dict[str, Any]) -> None: """noun_inflection must be null if pos doesn't start with 'Noun'. Explicit test case: 'גָּבוֹהַּ' (adjective) must NOT have noun_inflection. """ name = "no_noun_inflection_on_non_nouns" errors: list[str] = [] for key, entry in data.items(): pos = entry.get("pos") or "" noun_inf = entry.get("noun_inflection") if not pos.startswith("Noun") and noun_inf is not None: errors.append(f"[{key}] pos={pos!r} but noun_inflection is set") _verbose_print(f"offending entry: {key!r}") if errors: _fail(name, errors[:20] if not _verbose else errors) if len(errors) > 20 and not _verbose: print(f" ... ({len(errors) - 20} more; use --verbose)") else: _pass(name) def test_no_emoji_in_meaning(data: dict[str, Any]) -> None: """meaning field must not contain inline emoji characters.""" name = "no_emoji_in_meaning" errors: list[str] = [] for key, entry in data.items(): meaning = entry.get("meaning") or "" if EMOJI_RE.search(meaning): errors.append(f"[{key}] meaning contains emoji: {meaning!r}") if errors: _fail(name, errors[:20] if not _verbose else errors) if len(errors) > 20 and not _verbose: print(f" ... ({len(errors) - 20} more; use --verbose)") else: _pass(name) def test_example_sentences_contain_word(data: dict[str, Any]) -> None: """For entries with examples.vetted, the word.nikkud must appear in at least one sentence. Uses nikkud (exact) matching, not stripped matching. """ name = "example_sentences_contain_word" errors: list[str] = [] for key, entry in data.items(): examples = entry.get("examples") if not examples: continue vetted = examples.get("vetted") if not vetted: continue word_obj = entry.get("word") or {} nikkud_word = word_obj.get("nikkud") or "" if not nikkud_word: continue found = any(nikkud_word in (s.get("text") or "") for s in vetted) if not found: sentences_preview = [s.get("text", "") for s in vetted[:2]] errors.append( f"[{key}] word {nikkud_word!r} not found in any vetted sentence. Sentences: {sentences_preview!r}" ) if errors: _warn(name, errors[:20] if not _verbose else errors) if len(errors) > 20 and not _verbose: print(f" ... ({len(errors) - 20} more; use --verbose)") _pass(name) def test_cloze_offsets_valid(data: dict[str, Any]) -> None: """cloze_word_start/end must be within text bounds when present. Null offsets are tolerated (and warned separately) because some sentences contain only inflected/construct/plural forms that cannot be matched back to the base nikkud or ktiv_male — this is a data quality issue in vetted_sentences.json, not a schema violation. """ name = "cloze_offsets_valid" errors: list[str] = [] null_warn: list[str] = [] for key, entry in data.items(): examples = entry.get("examples") if not examples: continue cloze = examples.get("cloze") if not cloze: continue text = cloze.get("text") or "" start = cloze.get("cloze_word_start") end = cloze.get("cloze_word_end") if start is None or end is None: null_warn.append(f"[{key}] cloze present but cloze_word_start/end are null") continue text_len = len(text) if not isinstance(start, int) or not isinstance(end, int): errors.append(f"[{key}] cloze_word_start/end are not integers: {start!r}, {end!r}") continue if start < 0 or end < 0: errors.append(f"[{key}] cloze offsets are negative: start={start}, end={end}") continue if start >= end: errors.append(f"[{key}] cloze start >= end: start={start}, end={end}") continue if end > text_len: errors.append(f"[{key}] cloze end={end} exceeds text length={text_len}: {text!r}") if null_warn: _warn(f"{name}_null_offsets", null_warn[:20] if not _verbose else null_warn) if len(null_warn) > 20 and not _verbose: print(f" ... ({len(null_warn) - 20} more; use --verbose)") if errors: _fail(name, errors[:20] if not _verbose else errors) if len(errors) > 20 and not _verbose: print(f" ... ({len(errors) - 20} more; use --verbose)") else: _pass(name) def test_hufal_pual_only_on_hifil_piel(data: dict[str, Any]) -> None: """hufal_pual_forms must only be set for Hif'il or Pi'el verbs.""" name = "hufal_pual_only_on_hifil_piel" errors: list[str] = [] for key, entry in data.items(): conj = entry.get("conjugation") if not conj: continue hufal_pual = conj.get("hufal_pual_forms") if hufal_pual is None: continue binyan = conj.get("binyan") or "" binyan_lower = binyan.lower() if "hif" not in binyan_lower and "pi" not in binyan_lower: errors.append(f"[{key}] hufal_pual_forms is set but binyan={binyan!r} (expected Hif'il or Pi'el)") if errors: _fail(name, errors[:20] if not _verbose else errors) if len(errors) > 20 and not _verbose: print(f" ... ({len(errors) - 20} more; use --verbose)") else: _pass(name) def test_confusable_group_shares_ktiv_male(data: dict[str, Any]) -> None: """All entries in a confusable_group must share the same word.ktiv_male.""" name = "confusable_group_shares_ktiv_male" errors: list[str] = [] for key, entry in data.items(): group = entry.get("confusable_group") if not group: continue my_word = entry.get("word") or {} my_ktiv = my_word.get("ktiv_male") if not my_ktiv: continue for other_key in group: other = data.get(other_key) if not other: continue # already caught by confusable_symmetric other_word = other.get("word") or {} other_ktiv = other_word.get("ktiv_male") if other_ktiv and other_ktiv != my_ktiv: errors.append( f"[{key}] ktiv_male={my_ktiv!r} but confusable member {other_key!r} has ktiv_male={other_ktiv!r}" ) if errors: _fail(name, errors[:20] if not _verbose else errors) if len(errors) > 20 and not _verbose: print(f" ... ({len(errors) - 20} more; use --verbose)") else: _pass(name) def test_confusables_guid(data: dict[str, Any]) -> None: """confusables_guid must be consistent within each confusable_group. Rules: - If confusable_group is non-null, confusables_guid must be non-null. - If confusable_group is null, confusables_guid must be null. - All entries that share a confusable_group must share the same confusables_guid value. """ name = "confusables_guid" errors: list[str] = [] for key, entry in data.items(): group = entry.get("confusable_group") guid = entry.get("confusables_guid") if group and not guid: errors.append(f"[{key}] has confusable_group but confusables_guid is null/missing") elif not group and guid is not None: errors.append(f"[{key}] has confusables_guid={guid!r} but confusable_group is null") if not group or not guid: continue for other_key in group: other = data.get(other_key) if not other: continue # already caught by confusable_symmetric other_guid = other.get("confusables_guid") if other_guid != guid: errors.append( f"[{key}] confusables_guid={guid!r} but confusable member " f"{other_key!r} has confusables_guid={other_guid!r}" ) if errors: _fail(name, errors[:20] if not _verbose else errors) if len(errors) > 20 and not _verbose: print(f" ... ({len(errors) - 20} more; use --verbose)") else: _pass(name) def test_conjugation_form_guids(data: dict[str, Any]) -> None: """Every conjugation form must have a guid or guid_candidates, and GUIDs must be unique within a verb. Rules: - Each form in active_forms and hufal_pual_forms must have a non-null ``guid`` OR a non-empty ``guid_candidates`` list (used for present tense, past 3p, and 1st person forms where multiple GUIDs are possible). - No two forms within the same verb (across both form lists) may share a GUID. """ name = "conjugation_form_guids" errors: list[str] = [] for key, entry in data.items(): conj = entry.get("conjugation") if not conj: continue seen_guids: dict[str, str] = {} # guid -> "form_list_key[person]" label for form_list_key in ("active_forms", "hufal_pual_forms"): forms = conj.get(form_list_key) if not forms: continue for form in forms: person = form.get("person", "?") label = f"{form_list_key}[{person}]" guid = form.get("guid") guid_candidates = form.get("guid_candidates") if not guid and not guid_candidates: errors.append(f"[{key}] {label}: missing or null 'guid' and no 'guid_candidates'") continue if guid: if guid in seen_guids: errors.append(f"[{key}] {label}: guid={guid!r} duplicates {seen_guids[guid]}") else: seen_guids[guid] = label elif guid_candidates: for candidate in guid_candidates: if candidate in seen_guids: errors.append( f"[{key}] {label}: guid_candidate={candidate!r} duplicates {seen_guids[candidate]}" ) else: seen_guids[candidate] = label if errors: _fail(name, errors[:20] if not _verbose else errors) if len(errors) > 20 and not _verbose: print(f" ... ({len(errors) - 20} more; use --verbose)") else: _pass(name) def test_conjugation_person_codes(data: dict[str, Any]) -> None: """active_forms person codes must be from the defined valid set.""" name = "conjugation_person_codes" errors: list[str] = [] for key, entry in data.items(): conj = entry.get("conjugation") if not conj: continue for form_list_key in ("active_forms", "hufal_pual_forms"): forms = conj.get(form_list_key) if not forms: continue for form in forms: person = form.get("person") if person not in VALID_PERSON_CODES: errors.append(f"[{key}] {form_list_key}: invalid person code {person!r}") if errors: _fail(name, errors[:20] if not _verbose else errors) if len(errors) > 20 and not _verbose: print(f" ... ({len(errors) - 20} more; use --verbose)") else: _pass(name) def test_no_stripped_form_sentence_collisions(data: dict[str, Any]) -> None: """For confusable words, their example sentences must not contain the wrong homograph's nikkud word. Specifically: if A and B are confusable (same ktiv_male), A's vetted sentences must not contain B's nikkud form, and vice versa. """ name = "no_stripped_form_sentence_collisions" errors: list[str] = [] for key, entry in data.items(): group = entry.get("confusable_group") if not group: continue examples = entry.get("examples") if not examples: continue vetted = examples.get("vetted") if not vetted: continue my_word = entry.get("word") or {} my_nikkud = my_word.get("nikkud") or "" my_texts = [s.get("text") or "" for s in vetted] for other_key in group: other = data.get(other_key) if not other: continue other_word = other.get("word") or {} other_nikkud = other_word.get("nikkud") or "" if not other_nikkud or other_nikkud == my_nikkud: continue # same nikkud homographs are ok (we can't distinguish by nikkud) for text in my_texts: if other_nikkud in text: errors.append(f"[{key}] sentence contains wrong homograph {other_nikkud!r}: {text!r}") _verbose_print(f" my word: {my_nikkud!r}, wrong form: {other_nikkud!r}") break # one error per (key, other_key) pair is enough if errors: _warn(name, errors[:20] if not _verbose else errors) if len(errors) > 20 and not _verbose: print(f" ... ({len(errors) - 20} more; use --verbose)") _pass(name) # --------------------------------------------------------------------------- # Stats summary # --------------------------------------------------------------------------- def print_stats(data: dict[str, Any]) -> None: """Print a summary of dataset coverage metrics.""" total = len(data) with_conj = sum(1 for e in data.values() if e.get("conjugation")) with_noun_inf = sum(1 for e in data.values() if e.get("noun_inflection")) with_vetted = sum(1 for e in data.values() if (e.get("examples") or {}).get("vetted")) with_cloze = sum(1 for e in data.values() if (e.get("examples") or {}).get("cloze")) with_image = sum(1 for e in data.values() if e.get("image")) with_emoji = sum(1 for e in data.values() if e.get("emoji")) with_guid = sum(1 for e in data.values() if e.get("vocab_legacy_guid")) in_confusable = sum(1 for e in data.values() if e.get("confusable_group")) with_shared_roots = sum(1 for e in data.values() if e.get("shared_roots")) print() print("Stats Summary") print("─" * 42) print(f" Total entries: {total:>6}") print(f" With conjugation data: {with_conj:>6}") print(f" With noun_inflection: {with_noun_inf:>6}") print(f" With vetted examples: {with_vetted:>6}") print(f" With cloze examples: {with_cloze:>6}") print(f" With images: {with_image:>6}") print(f" With emoji: {with_emoji:>6}") print(f" With legacy GUIDs: {with_guid:>6}") print(f" In confusable groups: {in_confusable:>6}") print(f" With shared roots: {with_shared_roots:>6}") # --------------------------------------------------------------------------- # Test registry # --------------------------------------------------------------------------- ALL_TESTS: dict[str, Any] = { "required_fields": test_required_fields, "root_format": test_root_format, "unique_slugs": test_unique_slugs, "no_duplicate_keys": test_no_duplicate_keys, "confusable_symmetric": test_confusable_symmetric, "shared_roots_valid_keys": test_shared_roots_valid_keys, "unique_legacy_guids": test_unique_legacy_guids, "no_noun_inflection_on_non_nouns": test_no_noun_inflection_on_non_nouns, "no_emoji_in_meaning": test_no_emoji_in_meaning, "example_sentences_contain_word": test_example_sentences_contain_word, "cloze_offsets_valid": test_cloze_offsets_valid, "hufal_pual_only_on_hifil_piel": test_hufal_pual_only_on_hifil_piel, "confusable_group_shares_ktiv_male": test_confusable_group_shares_ktiv_male, "confusables_guid": test_confusables_guid, "conjugation_form_guids": test_conjugation_form_guids, "conjugation_person_codes": test_conjugation_person_codes, "no_stripped_form_sentence_collisions": test_no_stripped_form_sentence_collisions, } # --------------------------------------------------------------------------- # Entry point # --------------------------------------------------------------------------- def main() -> None: global _verbose parser = argparse.ArgumentParser(description="Validate data/words.json against the Hebrew Flash Cards schema.") parser.add_argument( "--verbose", "-v", action="store_true", help="Print full details for all failures (not just first 20).", ) parser.add_argument( "--test", metavar="NAME", help=f"Run a single test by name. Available: {', '.join(ALL_TESTS)}", ) args = parser.parse_args() _verbose = args.verbose data = load_data() # Select tests to run if args.test: if args.test not in ALL_TESTS: print(f"ERROR: unknown test {args.test!r}. Available: {', '.join(ALL_TESTS)}") sys.exit(2) tests_to_run = {args.test: ALL_TESTS[args.test]} else: tests_to_run = ALL_TESTS print(f"Validating {DATA_FILE} ({len(data)} entries)") print("─" * 60) # no_duplicate_keys needs the file, not the pre-parsed dict for test_fn in tests_to_run.values(): test_fn(data) # Summary if not args.test: print_stats(data) print() print("─" * 60) if _warnings: print(f" Warnings : {len(_warnings)}") if _failures: print(f" FAILED: {len(_failures)} test(s): {', '.join(_failures)}") sys.exit(1) else: print(f" All {len(tests_to_run)} test(s) passed.") sys.exit(0) if __name__ == "__main__": main()