# hebrew_flash_cards/scripts/validate_data.py

"""Standalone integrity validator for data/words.json.

Validates the unified Hebrew Flash Cards data against the schema defined in
SCHEMA.yaml. Each test prints PASS/FAIL with details on failures.

Usage:
    python3 scripts/validate_data.py
    python3 scripts/validate_data.py --verbose
    python3 scripts/validate_data.py --test confusable_symmetric
"""

from __future__ import annotations

import argparse
import json
import re
import sys
import unicodedata
from pathlib import Path
from typing import Any

# ---------------------------------------------------------------------------
# Bootstrap: make project root importable so helpers.py is accessible
# ---------------------------------------------------------------------------
sys.path.insert(0, str(Path(__file__).parent.parent))

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# Dataset location: two directories up from this script, then data/words.json.
DATA_FILE = Path(__file__).parent.parent.joinpath("data", "words.json")

# Inclusive code point bounds of the Hebrew letters, alef (U+05D0) to tav (U+05EA).
HEBREW_CONSONANT_RANGE = (0x05D0, 0x05EA)

# Person codes accepted on conjugation forms (see test_conjugation_person_codes).
VALID_PERSON_CODES: frozenset[str] = frozenset(
    {
        "inf",
        "1s",
        "1p",
        "2ms",
        "2fs",
        "2mp",
        "2fp",
        "3ms",
        "3fs",
        "3mp",
        "3fp",
        "3p",
        "ms",
        "fs",
        "mp",
        "fp",
    }
)

# Code point ranges treated as emoji, written as members of one regex character class.
_EMOJI_CLASS_RANGES = (
    r"\U0001f600-\U0001f64f",  # Emoticons
    r"\U0001f300-\U0001f5ff",  # Miscellaneous Symbols and Pictographs
    r"\U0001f680-\U0001f6ff",  # Transport and Map Symbols
    r"\U0001f1e0-\U0001f1ff",  # regional indicator symbols (flags)
    r"\U00002702-\U000027b0",  # part of the Dingbats block
    r"\U0001f900-\U0001f9ff",  # Supplemental Symbols and Pictographs
    r"\U0001fa00-\U0001fa6f",  # Chess Symbols
    r"\U0001fa70-\U0001faff",  # Symbols and Pictographs Extended-A
)
EMOJI_RE = re.compile("[" + "".join(_EMOJI_CLASS_RANGES) + "]")

# ---------------------------------------------------------------------------
# Result tracking
# ---------------------------------------------------------------------------
# Names of the tests that failed; _fail() appends here and main() uses it for the exit status.
_failures: list[str] = []
# Detail lines handed to _warn(); main() prints how many were collected.
_warnings: list[str] = []
# Set by main() from --verbose; read by the reporting helpers and the tests.
_verbose: bool = False


def _pass(name: str) -> None:
    """Print the PASS line for the test called ``name``."""
    print("  PASS  {}".format(name))


def _fail(name: str, details: list[str]) -> None:
    """Record ``name`` as failed and print its FAIL line followed by each detail."""
    # append() mutates the module-level list in place, so no ``global`` is required.
    _failures.append(name)
    print("  FAIL  {}".format(name))
    for line in details:
        print("          {}".format(line))


def _warn(name: str, details: list[str]) -> None:
    """Collect ``details`` as warnings and print a WARN line followed by each detail."""
    # extend() mutates the module-level list in place, so no ``global`` is required.
    _warnings.extend(details)
    print("  WARN  {}".format(name))
    for line in details:
        print("          {}".format(line))


def _verbose_print(msg: str) -> None:
    """Print ``msg`` (indented) only when verbose output is enabled."""
    if not _verbose:
        return
    print("        {}".format(msg))


# ---------------------------------------------------------------------------
# Helper: load data
# ---------------------------------------------------------------------------


def load_data() -> dict[str, Any]:
    """Load words.json and return the parsed top-level object.

    Returns:
        The decoded JSON object as a dict.

    Exits the process with status 2 (after printing an ``ERROR:`` line) when
    the file does not exist, cannot be decoded as UTF-8 JSON, or does not hold
    a JSON object at the top level. Every test iterates ``data.items()``, so
    anything other than a dict could not be validated anyway.
    """
    if not DATA_FILE.exists():
        print(f"ERROR: data file not found: {DATA_FILE}")
        sys.exit(2)

    try:
        with DATA_FILE.open(encoding="utf-8") as fh:
            data = json.load(fh)
    except (json.JSONDecodeError, UnicodeDecodeError) as exc:
        # Report a broken file the same way as a missing one instead of a traceback.
        print(f"ERROR: {DATA_FILE} is not valid UTF-8 JSON: {exc}")
        sys.exit(2)

    if not isinstance(data, dict):
        print(f"ERROR: {DATA_FILE} must contain a JSON object at the top level, got {type(data).__name__}")
        sys.exit(2)
    return data


def _is_hebrew_consonant(ch: str) -> bool:
    """Return True if ``ch`` starts with a Hebrew consonant (U+05D0..U+05EA).

    ``ch`` may span several code points, e.g. shin followed by a shin dot.
    Only the first code point of the NFD-decomposed string is examined; any
    code points after it are not inspected.

    Args:
        ch: The string to classify.

    Returns:
        True when the first decomposed code point lies inside
        HEBREW_CONSONANT_RANGE, False otherwise. An empty string yields False
        rather than raising IndexError.
    """
    normalized = unicodedata.normalize("NFD", ch)
    if not normalized:
        return False
    low, high = HEBREW_CONSONANT_RANGE
    # After decomposition the base character comes first; marks follow it.
    return low <= ord(normalized[0]) <= high


# ---------------------------------------------------------------------------
# Individual tests
# ---------------------------------------------------------------------------


def test_required_fields(data: dict[str, Any]) -> None:
    """Check that each entry carries its mandatory fields.

    An entry is reported when ``word`` is not a dict, when ``word.nikkud`` or
    ``word.ktiv_male`` is falsy, or when ``slug``, ``pos`` or ``meaning`` is
    falsy. A missing or null ``frequency`` is only reported as a warning,
    under the name ``frequency_missing``.
    """
    name = "required_fields"
    problems: list[str] = []
    no_frequency: list[str] = []

    for key, entry in data.items():
        word = entry.get("word")
        if isinstance(word, dict):
            if not word.get("nikkud"):
                problems.append(f"[{key}] word.nikkud is missing or empty")
            if not word.get("ktiv_male"):
                problems.append(f"[{key}] word.ktiv_male is missing or empty")
        else:
            problems.append(f"[{key}] 'word' is missing or not a dict")

        if not entry.get("slug"):
            problems.append(f"[{key}] 'slug' is missing or empty")
        if not entry.get("pos"):
            problems.append(f"[{key}] 'pos' is missing or empty")
        if not entry.get("meaning"):
            problems.append(f"[{key}] 'meaning' is missing or empty")

        if entry.get("frequency") is None:
            no_frequency.append(f"[{key}] 'frequency' is null/missing")

    # The warning block is printed ahead of the PASS/FAIL line.
    if no_frequency:
        _warn("frequency_missing", no_frequency if _verbose else no_frequency[:20])
        hidden = len(no_frequency) - 20
        if hidden > 0 and not _verbose:
            print(f"          ... ({hidden} more; use --verbose)")

    if not problems:
        _pass(name)
        return

    _fail(name, problems if _verbose else problems[:20])
    hidden = len(problems) - 20
    if hidden > 0 and not _verbose:
        print(f"          ... ({hidden} more; use --verbose)")


def test_root_format(data: dict[str, Any]) -> None:
    """Check that ``root`` is an empty list or a list of 2-5 Hebrew consonants.

    An absent (or null) ``root`` is an error: rootless words are expected to
    carry ``[]``. At most one bad element is reported per entry.
    """
    name = "root_format"
    problems: list[str] = []

    for key, entry in data.items():
        root = entry.get("root")
        if root is None:
            problems.append(f"[{key}] 'root' key is absent (should be [] for rootless words)")
        elif not isinstance(root, list):
            problems.append(f"[{key}] 'root' is not a list: {root!r}")
        elif not root:
            pass  # rootless word, which is valid
        elif len(root) < 2 or len(root) > 5:
            problems.append(f"[{key}] root has {len(root)} elements (expected 2-5): {root!r}")
        else:
            for ch in root:
                # An element may span several code points (consonant plus
                # marks); _is_hebrew_consonant looks at the base consonant.
                if not (isinstance(ch, str) and ch and _is_hebrew_consonant(ch)):
                    problems.append(f"[{key}] root char {ch!r} is not a Hebrew consonant (U+05D0..U+05EA)")
                    break

    if not problems:
        _pass(name)
        return

    _fail(name, problems if _verbose else problems[:20])
    hidden = len(problems) - 20
    if hidden > 0 and not _verbose:
        print(f"          ... ({hidden} more; use --verbose)")


def test_unique_slugs(data: dict[str, Any]) -> None:
    """Check that no two entries share the same non-empty ``slug``.

    Entries whose slug is missing or empty are ignored here.
    """
    name = "unique_slugs"
    owners: dict[str, list[str]] = {}

    for key, entry in data.items():
        slug = entry.get("slug")
        if not slug:
            continue
        bucket = owners.get(slug)
        if bucket is None:
            owners[slug] = [key]
        else:
            bucket.append(key)

    problems = [f"slug={slug!r} shared by: {keys}" for slug, keys in owners.items() if len(keys) > 1]
    if not problems:
        _pass(name)
        return

    _fail(name, problems if _verbose else problems[:20])
    hidden = len(problems) - 20
    if hidden > 0 and not _verbose:
        print(f"          ... ({hidden} more; use --verbose)")


def test_no_duplicate_keys(_data: dict[str, Any]) -> None:  # noqa: ARG001
    """Check that no JSON object in the data file repeats a key.

    ``json.load`` silently keeps the last value when a key is repeated, so the
    already-parsed ``_data`` cannot reveal the problem. The raw file is parsed
    again with an ``object_pairs_hook`` that sees every key/value pair. The
    hook runs for each JSON object that is decoded, so a repeated key inside a
    nested object is reported as well as one at the top level.
    """
    name = "no_duplicate_keys"
    repeated: list[str] = []

    def _collect(pairs: list[tuple[str, Any]]) -> dict[str, Any]:
        # Note each key that was already seen in this object, then hand back
        # the same mapping json.load would build (the last value wins).
        seen_keys: set[str] = set()
        for k, _ in pairs:
            if k in seen_keys:
                repeated.append(k)
            else:
                seen_keys.add(k)
        return dict(pairs)

    with DATA_FILE.open(encoding="utf-8") as fh:
        json.load(fh, object_pairs_hook=_collect)

    if not repeated:
        _pass(name)
        return
    _fail(name, [f"duplicate key: {k!r}" for k in repeated])


def test_confusable_symmetric(data: dict[str, Any]) -> None:
    """Check that confusable_group links point both ways.

    Every key listed in an entry's ``confusable_group`` must exist in ``data``
    and must list the entry's own key in its ``confusable_group`` in turn.
    """
    name = "confusable_symmetric"
    problems: list[str] = []

    for key, entry in data.items():
        for other_key in entry.get("confusable_group") or ():
            other = data.get(other_key)
            if other is None:
                problems.append(f"[{key}] confusable_group references non-existent key {other_key!r}")
                continue
            back_links = other.get("confusable_group") or []
            if key in back_links:
                continue
            problems.append(f"[{key}] lists {other_key!r} as confusable, but {other_key!r} does not list {key!r}")

    if not problems:
        _pass(name)
        return

    _fail(name, problems if _verbose else problems[:20])
    hidden = len(problems) - 20
    if hidden > 0 and not _verbose:
        print(f"          ... ({hidden} more; use --verbose)")


def test_shared_roots_valid_keys(data: dict[str, Any]) -> None:
    """Check that each ``shared_roots`` reference names an existing top-level key."""
    name = "shared_roots_valid_keys"
    problems: list[str] = []

    for key, entry in data.items():
        shared = entry.get("shared_roots")
        if not shared:
            continue
        problems.extend(
            f"[{key}] shared_roots references non-existent key {ref_key!r}"
            for ref_key in shared
            if ref_key not in data
        )

    if not problems:
        _pass(name)
        return

    _fail(name, problems if _verbose else problems[:20])
    hidden = len(problems) - 20
    if hidden > 0 and not _verbose:
        print(f"          ... ({hidden} more; use --verbose)")


def test_unique_legacy_guids(data: dict[str, Any]) -> None:
    """Check that a non-empty ``vocab_legacy_guid`` is not reused across different words.

    A GUID shared by several entries is tolerated when all of them have the
    same ``word.nikkud`` (part-of-speech homographs that inherited one legacy
    card); such cases are only mentioned in verbose output. A GUID shared by
    entries with differing nikkud is an error.
    """
    name = "unique_legacy_guids"
    by_guid: dict[str, list[str]] = {}

    for key, entry in data.items():
        guid = entry.get("vocab_legacy_guid")
        if guid:
            by_guid.setdefault(guid, []).append(key)

    problems: list[str] = []
    for guid, keys in by_guid.items():
        if len(keys) < 2:
            continue

        nikkud_values = set()
        for k in keys:
            word = data[k].get("word") or {}
            nikkud_values.add(word.get("nikkud"))

        if len(nikkud_values) != 1:
            problems.append(f"guid={guid!r} shared by entries with DIFFERENT nikkud: {keys}")
            continue

        # Exactly one distinct nikkud: the duplicate GUID is the tolerated case.
        (shared_nikkud,) = nikkud_values
        _verbose_print(f"GUID {guid!r} shared by {len(keys)} entries with same nikkud ({shared_nikkud!r}): {keys}")

    if not problems:
        _pass(name)
        return

    _fail(name, problems if _verbose else problems[:20])
    hidden = len(problems) - 20
    if hidden > 0 and not _verbose:
        print(f"          ... ({hidden} more; use --verbose)")


def test_no_noun_inflection_on_non_nouns(data: dict[str, Any]) -> None:
    """Check that only entries whose ``pos`` starts with 'Noun' carry ``noun_inflection``.

    Any other entry (an adjective, for instance) must have ``noun_inflection``
    absent or null.
    """
    name = "no_noun_inflection_on_non_nouns"
    problems: list[str] = []

    for key, entry in data.items():
        pos = entry.get("pos") or ""
        inflection = entry.get("noun_inflection")
        if pos.startswith("Noun") or inflection is None:
            continue
        problems.append(f"[{key}] pos={pos!r} but noun_inflection is set")
        _verbose_print(f"offending entry: {key!r}")

    if not problems:
        _pass(name)
        return

    _fail(name, problems if _verbose else problems[:20])
    hidden = len(problems) - 20
    if hidden > 0 and not _verbose:
        print(f"          ... ({hidden} more; use --verbose)")


def test_no_emoji_in_meaning(data: dict[str, Any]) -> None:
    """Check that no ``meaning`` contains a character matched by EMOJI_RE."""
    name = "no_emoji_in_meaning"

    # A missing or null meaning is scanned as the empty string.
    meanings = ((key, entry.get("meaning") or "") for key, entry in data.items())
    problems = [
        f"[{key}] meaning contains emoji: {meaning!r}"
        for key, meaning in meanings
        if EMOJI_RE.search(meaning)
    ]

    if not problems:
        _pass(name)
        return

    _fail(name, problems if _verbose else problems[:20])
    hidden = len(problems) - 20
    if hidden > 0 and not _verbose:
        print(f"          ... ({hidden} more; use --verbose)")


def test_example_sentences_contain_word(data: dict[str, Any]) -> None:
    """Warn when none of an entry's vetted sentences contains its ``word.nikkud``.

    The comparison is an exact substring match on the pointed (nikkud) form.
    Findings are reported as warnings only; the test itself always prints PASS.
    """
    name = "example_sentences_contain_word"
    findings: list[str] = []

    for key, entry in data.items():
        vetted = (entry.get("examples") or {}).get("vetted")
        if not vetted:
            continue

        nikkud_word = (entry.get("word") or {}).get("nikkud") or ""
        if not nikkud_word:
            continue

        if any(nikkud_word in (sentence.get("text") or "") for sentence in vetted):
            continue

        # Show at most the first two sentences to keep the message short.
        sentences_preview = [sentence.get("text", "") for sentence in vetted[:2]]
        findings.append(
            f"[{key}] word {nikkud_word!r} not found in any vetted sentence. Sentences: {sentences_preview!r}"
        )

    if findings:
        _warn(name, findings if _verbose else findings[:20])
        hidden = len(findings) - 20
        if hidden > 0 and not _verbose:
            print(f"          ... ({hidden} more; use --verbose)")
    _pass(name)


def test_cloze_offsets_valid(data: dict[str, Any]) -> None:
    """Check that cloze_word_start/end are integers lying within the cloze text.

    For each entry with ``examples.cloze`` the offsets must satisfy
    ``0 <= start < end <= len(text)``. Booleans are rejected as non-integers
    even though ``bool`` is a subclass of ``int``.

    Null offsets are tolerated (and warned about separately, under
    ``cloze_offsets_valid_null_offsets``) because some sentences contain only
    inflected/construct/plural forms that cannot be matched back to the base
    nikkud or ktiv_male. That is a data quality issue in
    vetted_sentences.json, not a schema violation.
    """
    name = "cloze_offsets_valid"
    errors: list[str] = []
    null_warn: list[str] = []

    for key, entry in data.items():
        examples = entry.get("examples")
        if not examples:
            continue
        cloze = examples.get("cloze")
        if not cloze:
            continue

        text = cloze.get("text") or ""
        start = cloze.get("cloze_word_start")
        end = cloze.get("cloze_word_end")

        if start is None or end is None:
            null_warn.append(f"[{key}] cloze present but cloze_word_start/end are null")
            continue

        # isinstance(True, int) is True, so JSON true/false would otherwise
        # slip through as the offsets 1/0.
        if any(isinstance(value, bool) or not isinstance(value, int) for value in (start, end)):
            errors.append(f"[{key}] cloze_word_start/end are not integers: {start!r}, {end!r}")
            continue
        if start < 0 or end < 0:
            errors.append(f"[{key}] cloze offsets are negative: start={start}, end={end}")
            continue
        if start >= end:
            errors.append(f"[{key}] cloze start >= end: start={start}, end={end}")
            continue
        text_len = len(text)
        if end > text_len:
            errors.append(f"[{key}] cloze end={end} exceeds text length={text_len}: {text!r}")

    if null_warn:
        _warn(f"{name}_null_offsets", null_warn[:20] if not _verbose else null_warn)
        if len(null_warn) > 20 and not _verbose:
            print(f"          ... ({len(null_warn) - 20} more; use --verbose)")

    if errors:
        _fail(name, errors[:20] if not _verbose else errors)
        if len(errors) > 20 and not _verbose:
            print(f"          ... ({len(errors) - 20} more; use --verbose)")
    else:
        _pass(name)


def test_hufal_pual_only_on_hifil_piel(data: dict[str, Any]) -> None:
    """Check that ``hufal_pual_forms`` appears only on Hif'il or Pi'el verbs.

    The binyan is recognised by a case-insensitive substring match on
    ``"hif"`` or ``"pi"`` in ``conjugation.binyan``.
    """
    name = "hufal_pual_only_on_hifil_piel"
    problems: list[str] = []

    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj or conj.get("hufal_pual_forms") is None:
            continue

        binyan = conj.get("binyan") or ""
        lowered = binyan.lower()
        if any(marker in lowered for marker in ("hif", "pi")):
            continue
        problems.append(f"[{key}] hufal_pual_forms is set but binyan={binyan!r} (expected Hif'il or Pi'el)")

    if not problems:
        _pass(name)
        return

    _fail(name, problems if _verbose else problems[:20])
    hidden = len(problems) - 20
    if hidden > 0 and not _verbose:
        print(f"          ... ({hidden} more; use --verbose)")


def test_confusable_group_shares_ktiv_male(data: dict[str, Any]) -> None:
    """Check that confusable entries agree on ``word.ktiv_male``.

    Each member listed in an entry's ``confusable_group`` is compared with the
    entry itself. Members that do not exist, and entries or members without a
    ktiv_male, are skipped.
    """
    name = "confusable_group_shares_ktiv_male"
    problems: list[str] = []

    for key, entry in data.items():
        members = entry.get("confusable_group")
        if not members:
            continue

        my_ktiv = (entry.get("word") or {}).get("ktiv_male")
        if not my_ktiv:
            continue

        for other_key in members:
            other = data.get(other_key)
            if not other:
                continue  # dangling references are reported by confusable_symmetric
            other_ktiv = (other.get("word") or {}).get("ktiv_male")
            if not other_ktiv or other_ktiv == my_ktiv:
                continue
            problems.append(
                f"[{key}] ktiv_male={my_ktiv!r} but confusable member {other_key!r} has ktiv_male={other_ktiv!r}"
            )

    if not problems:
        _pass(name)
        return

    _fail(name, problems if _verbose else problems[:20])
    hidden = len(problems) - 20
    if hidden > 0 and not _verbose:
        print(f"          ... ({hidden} more; use --verbose)")


def test_confusables_guid(data: dict[str, Any]) -> None:
    """Check that ``confusables_guid`` goes hand in hand with ``confusable_group``.

    Rules:
    - An entry with a non-empty confusable_group must have a confusables_guid.
    - An entry without a confusable_group must not have a confusables_guid.
    - Every existing member of an entry's confusable_group must carry the same
      confusables_guid as the entry.
    """
    name = "confusables_guid"
    problems: list[str] = []

    for key, entry in data.items():
        group = entry.get("confusable_group")
        guid = entry.get("confusables_guid")

        if group:
            if not guid:
                problems.append(f"[{key}] has confusable_group but confusables_guid is null/missing")
                continue
        else:
            if guid is not None:
                problems.append(f"[{key}] has confusables_guid={guid!r} but confusable_group is null")
            continue

        # Here both the group and the guid are present: compare with members.
        for other_key in group:
            other = data.get(other_key)
            if not other:
                continue  # dangling references are reported by confusable_symmetric
            other_guid = other.get("confusables_guid")
            if other_guid == guid:
                continue
            problems.append(
                f"[{key}] confusables_guid={guid!r} but confusable member "
                f"{other_key!r} has confusables_guid={other_guid!r}"
            )

    if not problems:
        _pass(name)
        return

    _fail(name, problems if _verbose else problems[:20])
    hidden = len(problems) - 20
    if hidden > 0 and not _verbose:
        print(f"          ... ({hidden} more; use --verbose)")


def test_conjugation_form_guids(data: dict[str, Any]) -> None:
    """Check the GUIDs attached to conjugation forms.

    Rules:
    - A form in ``active_forms`` or ``hufal_pual_forms`` that has neither a
      ``guid`` nor a non-empty ``guid_candidates`` list is counted, and the
      total is reported as a single warning (not a failure).
    - Within one verb, across both form lists, no GUID may occur twice. When a
      form has a ``guid`` only that value is checked; otherwise each of its
      ``guid_candidates`` is checked.
    """
    name = "conjugation_form_guids"
    problems: list[str] = []
    missing: list[str] = []

    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj:
            continue

        owner_of: dict[str, str] = {}  # guid -> "form_list_key[person]" label

        for form_list_key in ("active_forms", "hufal_pual_forms"):
            for form in conj.get(form_list_key) or ():
                person = form.get("person", "?")
                label = f"{form_list_key}[{person}]"
                guid = form.get("guid")
                guid_candidates = form.get("guid_candidates")

                if guid:
                    if guid in owner_of:
                        problems.append(f"[{key}] {label}: guid={guid!r} duplicates {owner_of[guid]}")
                    else:
                        owner_of[guid] = label
                elif guid_candidates:
                    for candidate in guid_candidates:
                        if candidate in owner_of:
                            problems.append(
                                f"[{key}] {label}: guid_candidate={candidate!r} duplicates {owner_of[candidate]}"
                            )
                        else:
                            owner_of[candidate] = label
                else:
                    # New forms from rescrape use a deterministic fallback: warn, don't fail.
                    missing.append(f"[{key}] {label}: missing or null 'guid' and no 'guid_candidates'")

    if missing:
        _warn(name + "_missing", [f"{len(missing)} forms missing guid (deterministic fallback used)"])

    if not problems:
        _pass(name)
        return

    _fail(name, problems if _verbose else problems[:20])
    hidden = len(problems) - 20
    if hidden > 0 and not _verbose:
        print(f"          ... ({hidden} more; use --verbose)")


def test_conjugation_person_codes(data: dict[str, Any]) -> None:
    """Check that each form's ``person`` code belongs to VALID_PERSON_CODES.

    Both ``active_forms`` and ``hufal_pual_forms`` are examined.
    """
    name = "conjugation_person_codes"
    problems: list[str] = []

    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj:
            continue

        for form_list_key in ("active_forms", "hufal_pual_forms"):
            for form in conj.get(form_list_key) or ():
                person = form.get("person")
                if person in VALID_PERSON_CODES:
                    continue
                problems.append(f"[{key}] {form_list_key}: invalid person code {person!r}")

    if not problems:
        _pass(name)
        return

    _fail(name, problems if _verbose else problems[:20])
    hidden = len(problems) - 20
    if hidden > 0 and not _verbose:
        print(f"          ... ({hidden} more; use --verbose)")


def test_no_stripped_form_sentence_collisions(data: dict[str, Any]) -> None:
    """Warn when a confusable word's sentences contain another member's nikkud form.

    For an entry A with vetted sentences and a confusable member B whose
    ``word.nikkud`` differs from A's, none of A's sentences should contain B's
    nikkud form. At most one finding is recorded per (A, B) pair. Findings are
    reported as warnings only; the test itself always prints PASS.
    """
    name = "no_stripped_form_sentence_collisions"
    findings: list[str] = []

    for key, entry in data.items():
        members = entry.get("confusable_group")
        if not members:
            continue

        vetted = (entry.get("examples") or {}).get("vetted")
        if not vetted:
            continue

        my_nikkud = (entry.get("word") or {}).get("nikkud") or ""
        my_texts = [sentence.get("text") or "" for sentence in vetted]

        for other_key in members:
            other = data.get(other_key)
            if not other:
                continue
            other_nikkud = (other.get("word") or {}).get("nikkud") or ""
            if not other_nikkud or other_nikkud == my_nikkud:
                continue  # identical nikkud cannot be told apart by this check

            # First sentence containing the other form, if there is one.
            hit = next((text for text in my_texts if other_nikkud in text), None)
            if hit is None:
                continue
            findings.append(f"[{key}] sentence contains wrong homograph {other_nikkud!r}: {hit!r}")
            _verbose_print(f"  my word: {my_nikkud!r}, wrong form: {other_nikkud!r}")

    if findings:
        _warn(name, findings if _verbose else findings[:20])
        hidden = len(findings) - 20
        if hidden > 0 and not _verbose:
            print(f"          ... ({hidden} more; use --verbose)")
    _pass(name)


# ---------------------------------------------------------------------------
# Stats summary
# ---------------------------------------------------------------------------


def print_stats(data: dict[str, Any]) -> None:
    """Print how many entries have each optional part of the dataset filled in.

    An entry counts towards a line when the corresponding field is truthy.
    """
    entries = list(data.values())

    def _having(field: str) -> int:
        # Entries whose top-level ``field`` is truthy.
        return sum(1 for e in entries if e.get(field))

    def _having_example(kind: str) -> int:
        # Entries whose ``examples[kind]`` is truthy; null examples count as none.
        return sum(1 for e in entries if (e.get("examples") or {}).get(kind))

    total = len(data)
    with_conj = _having("conjugation")
    with_noun_inf = _having("noun_inflection")
    with_vetted = _having_example("vetted")
    with_cloze = _having_example("cloze")
    with_image = _having("image")
    with_emoji = _having("emoji")
    with_guid = _having("vocab_legacy_guid")
    in_confusable = _having("confusable_group")
    with_shared_roots = _having("shared_roots")

    print()
    print("Stats Summary")
    print("─" * 42)
    print(f"  Total entries:                {total:>6}")
    print(f"  With conjugation data:        {with_conj:>6}")
    print(f"  With noun_inflection:         {with_noun_inf:>6}")
    print(f"  With vetted examples:         {with_vetted:>6}")
    print(f"  With cloze examples:          {with_cloze:>6}")
    print(f"  With images:                  {with_image:>6}")
    print(f"  With emoji:                   {with_emoji:>6}")
    print(f"  With legacy GUIDs:            {with_guid:>6}")
    print(f"  In confusable groups:         {in_confusable:>6}")
    print(f"  With shared roots:            {with_shared_roots:>6}")


# ---------------------------------------------------------------------------
# Test registry
# ---------------------------------------------------------------------------

# (name, function) pairs in the order the tests run; the name is what
# ``--test NAME`` accepts on the command line.
_REGISTERED_TESTS = (
    ("required_fields", test_required_fields),
    ("root_format", test_root_format),
    ("unique_slugs", test_unique_slugs),
    ("no_duplicate_keys", test_no_duplicate_keys),
    ("confusable_symmetric", test_confusable_symmetric),
    ("shared_roots_valid_keys", test_shared_roots_valid_keys),
    ("unique_legacy_guids", test_unique_legacy_guids),
    ("no_noun_inflection_on_non_nouns", test_no_noun_inflection_on_non_nouns),
    ("no_emoji_in_meaning", test_no_emoji_in_meaning),
    ("example_sentences_contain_word", test_example_sentences_contain_word),
    ("cloze_offsets_valid", test_cloze_offsets_valid),
    ("hufal_pual_only_on_hifil_piel", test_hufal_pual_only_on_hifil_piel),
    ("confusable_group_shares_ktiv_male", test_confusable_group_shares_ktiv_male),
    ("confusables_guid", test_confusables_guid),
    ("conjugation_form_guids", test_conjugation_form_guids),
    ("conjugation_person_codes", test_conjugation_person_codes),
    ("no_stripped_form_sentence_collisions", test_no_stripped_form_sentence_collisions),
)

# Name -> test function; insertion order is the execution order used by main().
ALL_TESTS: dict[str, Any] = dict(_REGISTERED_TESTS)


# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------


def main() -> None:
    """Parse the command line, run the selected tests and exit with a status code.

    Exit status: 0 when no test failed, 1 when at least one test failed, and
    2 when ``--test`` names an unknown test.
    """
    global _verbose

    parser = argparse.ArgumentParser(description="Validate data/words.json against the Hebrew Flash Cards schema.")
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Print full details for all failures (not just first 20).",
    )
    parser.add_argument(
        "--test",
        metavar="NAME",
        help=f"Run a single test by name. Available: {', '.join(ALL_TESTS)}",
    )
    args = parser.parse_args()
    _verbose = args.verbose

    data = load_data()

    # Run everything unless a single test was requested by name.
    selected = ALL_TESTS
    if args.test:
        if args.test not in ALL_TESTS:
            print(f"ERROR: unknown test {args.test!r}. Available: {', '.join(ALL_TESTS)}")
            sys.exit(2)
        selected = {args.test: ALL_TESTS[args.test]}

    print(f"Validating {DATA_FILE} ({len(data)} entries)")
    print("─" * 60)

    # Every test receives the parsed data; no_duplicate_keys ignores it and
    # re-reads the file itself.
    for run_test in selected.values():
        run_test(data)

    # The coverage statistics are only shown for a full run.
    if not args.test:
        print_stats(data)

    print()
    print("─" * 60)
    if _warnings:
        print(f"  Warnings : {len(_warnings)}")
    if _failures:
        print(f"  FAILED: {len(_failures)} test(s): {', '.join(_failures)}")
        sys.exit(1)

    print(f"  All {len(selected)} test(s) passed.")
    sys.exit(0)


if __name__ == "__main__":
    main()