# hebrew_flash_cards/tests/test_scraper_integration.py

#!/usr/bin/env python3
"""Integration tests: scrape real pealim.com pages and validate data.

These tests hit pealim.com directly. They are skipped when the environment
variable SKIP_INTEGRATION is set to any non-empty string.

Run with:
    pytest tests/test_scraper_integration.py -v -m integration
"""

import json
import os
import re
import sys
import time
from pathlib import Path

import pytest

# Add project root to path so all sibling modules are importable
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

import pealim_detail_scrape
import pealim_list_scrape

# ---------------------------------------------------------------------------
# Skip marker
# ---------------------------------------------------------------------------

# Decorator applied to every test class in this module: the tests are skipped
# whenever the SKIP_INTEGRATION environment variable holds a non-empty string.
skip_integration = pytest.mark.skipif(
    bool(os.environ.get("SKIP_INTEGRATION", "")),
    reason="SKIP_INTEGRATION is set",
)

# A known Hif'il verb slug that is not page-1 dependent.
# לְהַגִּיד (to tell/say) — Hif'il, slug 1135-lehagid
# These three values seed the one-entry words.json used by the verb tests.
HIFIL_VERB_SLUG = "1135-lehagid"
HIFIL_VERB_NIKKUD = "לְהַגִּיד"
HIFIL_VERB_MEANING = "to say, to tell"

# Minimum expected entries from a single list page
MIN_LIST_ENTRIES = 10

# Matches one Hebrew letter, alef (U+05D0) through tav (U+05EA). This is the
# letter range only, not the whole Unicode Hebrew block: vowel points and
# cantillation marks sit below U+05D0 and therefore do not match.
HEBREW_CHAR_RE = re.compile(r"[\u05d0-\u05ea]")

# Slug pattern: one or more digits, hyphen, one or more word chars.
# NOTE(review): `\w` does not match "-", so a slug containing a second hyphen
# would be rejected — confirm pealim never emits such slugs.
SLUG_RE = re.compile(r"^\d+-\w+$")


# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------


def _has_hebrew(text: str) -> bool:
    """Report whether *text* holds any character matched by HEBREW_CHAR_RE."""
    first_hit = HEBREW_CHAR_RE.search(text)
    return first_hit is not None


def _words_from_file(path: Path) -> dict:
    """Decode the UTF-8 JSON document stored at *path* and return the result."""
    raw_text = path.read_text(encoding="utf-8")
    return json.loads(raw_text)


# ---------------------------------------------------------------------------
# Test class: list page scrape
# ---------------------------------------------------------------------------


@pytest.mark.integration
@skip_integration
class TestListScrape:
    """Validate pealim_list_scrape against a real /dict/?page=1 fetch.

    Every test runs its own single-page scrape with the scraper's output
    paths redirected into ``tmp_path``, so the tests share no state.
    """

    def _scrape_page_1(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> dict:
        """Scrape list page 1 into *tmp_path* and return the parsed words.json.

        Points ``pealim_list_scrape.WORDS_JSON`` and ``PROGRESS_JSON`` at files
        under *tmp_path* (undone by *monkeypatch* at test teardown), runs the
        scraper for exactly one page, and asserts that words.json was written
        before loading it.
        """
        words_path = tmp_path / "words.json"
        progress_path = tmp_path / "list_scrape_progress.json"

        monkeypatch.setattr(pealim_list_scrape, "WORDS_JSON", words_path)
        monkeypatch.setattr(pealim_list_scrape, "PROGRESS_JSON", progress_path)

        # Scrape exactly one page
        pealim_list_scrape.run_scrape(total_pages=1, force_refresh=True)

        # Fail with a readable message instead of a FileNotFoundError on load.
        assert words_path.exists(), "words.json was not created after scrape"
        return _words_from_file(words_path)

    def test_list_page_1_produces_entries(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Page 1 must yield at least MIN_LIST_ENTRIES entries in words.json."""
        words = self._scrape_page_1(tmp_path, monkeypatch)

        assert len(words) >= MIN_LIST_ENTRIES, (
            f"Expected at least {MIN_LIST_ENTRIES} entries from page 1, got {len(words)}"
        )

    def test_list_entries_have_required_fields(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Every entry must have non-empty nikkud, ktiv_male, slug, pos, meaning."""
        words = self._scrape_page_1(tmp_path, monkeypatch)

        # Without this guard the loop below would pass vacuously on an empty scrape.
        assert words, "Scrape of page 1 produced no entries to validate"

        for key, entry in words.items():
            # `or {}` also covers an entry whose "word" value is an explicit null.
            word_block = entry.get("word") or {}
            nikkud = word_block.get("nikkud", "")
            ktiv_male = word_block.get("ktiv_male", "")
            slug = entry.get("slug", "")
            pos = entry.get("pos", "")
            meaning = entry.get("meaning", "")

            assert nikkud, f"Entry '{key}': word.nikkud is empty"
            assert _has_hebrew(nikkud), f"Entry '{key}': word.nikkud has no Hebrew chars: {nikkud!r}"
            assert ktiv_male, f"Entry '{key}': word.ktiv_male is empty"
            assert slug, f"Entry '{key}': slug is empty"
            assert SLUG_RE.match(slug), f"Entry '{key}': slug does not match \\d+-\\w+ pattern: {slug!r}"
            assert pos, f"Entry '{key}': pos is empty"
            assert meaning, f"Entry '{key}': meaning is empty"

    def test_list_at_least_one_entry_has_root(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """At least one entry on page 1 must have a non-empty root list."""
        words = self._scrape_page_1(tmp_path, monkeypatch)

        assert any(entry.get("root") for entry in words.values()), (
            "No entries on page 1 have a non-empty root list"
        )

    def test_list_at_least_one_entry_has_audio(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """At least one entry on page 1 must have a non-empty audio_url."""
        words = self._scrape_page_1(tmp_path, monkeypatch)

        assert any(entry.get("audio_url") for entry in words.values()), (
            "No entries on page 1 have a non-empty audio_url"
        )

    def test_list_post_process_fields_exist(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """After scrape, every entry must have 'confusable_group' and 'shared_roots' keys (post-processed)."""
        words = self._scrape_page_1(tmp_path, monkeypatch)

        # Without this guard the loop below would pass vacuously on an empty scrape.
        assert words, "Scrape of page 1 produced no entries to validate"

        for key, entry in words.items():
            assert "confusable_group" in entry, f"Entry '{key}' missing 'confusable_group' key"
            assert "shared_roots" in entry, f"Entry '{key}' missing 'shared_roots' key"
            assert isinstance(entry["shared_roots"], list), f"Entry '{key}': shared_roots is not a list"


# ---------------------------------------------------------------------------
# Test class: noun detail scrape
# ---------------------------------------------------------------------------


@pytest.mark.integration
@skip_integration
class TestDetailScrapeNoun:
    """Validate pealim_detail_scrape for a real noun detail page.

    The noun under test is the first entry from a live scrape of list page 1
    whose ``pos`` starts with "Noun" and that has a root and a slug.
    """

    def _find_noun_with_root(self, words: dict) -> tuple[str, dict] | None:
        """Return the first (key, entry) pair that is a Noun with a non-empty root and slug.

        Returns None when no entry in *words* qualifies.
        """
        for key, entry in words.items():
            if entry.get("pos", "").startswith("Noun") and entry.get("root") and entry.get("slug"):
                return key, entry
        return None

    def _prepare_words_json(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> tuple[Path, dict]:
        """
        Scrape page 1 into a fresh words.json and return (path, words).
        Uses list scraper monkeypatched to tmp_path.
        """
        words_path = tmp_path / "words.json"
        progress_path = tmp_path / "list_scrape_progress.json"

        monkeypatch.setattr(pealim_list_scrape, "WORDS_JSON", words_path)
        monkeypatch.setattr(pealim_list_scrape, "PROGRESS_JSON", progress_path)

        pealim_list_scrape.run_scrape(total_pages=1, force_refresh=True)
        words = _words_from_file(words_path)
        return words_path, words

    def _scrape_noun_detail(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> tuple[str, dict, dict]:
        """Run the detail scraper on one page-1 noun.

        Returns ``(noun_key, list_entry, detail_entry)``: the noun's key in
        words.json, its entry as produced by the list scrape, and its entry as
        re-read from words.json after the detail scrape (``{}`` if the key is
        no longer present).
        """
        words_path, words = self._prepare_words_json(tmp_path, monkeypatch)

        pair = self._find_noun_with_root(words)
        assert pair is not None, "No noun with a root found on page 1"
        noun_key, noun_entry = pair

        # Narrow words.json to the selected noun so the detail scraper is run on
        # just this noun rather than on every noun from page 1. This mirrors the
        # one-entry words.json used by the verb tests and keeps traffic low.
        with words_path.open("w", encoding="utf-8") as fh:
            json.dump({noun_key: noun_entry}, fh, ensure_ascii=False, indent=2)

        monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)

        # Small rate-limit delay between list scrape and detail scrape
        time.sleep(1.0)

        pealim_detail_scrape.run(force_refresh=True, nouns_only=True)

        updated_words = _words_from_file(words_path)
        return noun_key, noun_entry, updated_words.get(noun_key, {})

    def test_noun_detail_inflection_not_null(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """After detail scrape, noun_inflection must not be null."""
        noun_key, noun_entry, entry = self._scrape_noun_detail(tmp_path, monkeypatch)

        assert entry.get("noun_inflection") is not None, (
            f"noun_inflection is None after detail scrape for '{noun_key}' (slug={noun_entry.get('slug')})"
        )

    def test_noun_detail_singular_and_plural_forms(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Noun singular and plural forms must have non-empty nikkud and ktiv_male."""
        noun_key, _noun_entry, entry = self._scrape_noun_detail(tmp_path, monkeypatch)

        # A missing or null section becomes {} so the asserts below report the
        # empty field rather than raising on None.
        ni = entry.get("noun_inflection") or {}
        singular = ni.get("singular") or {}
        plural = ni.get("plural") or {}

        assert singular.get("nikkud"), f"noun_inflection.singular.nikkud is empty for '{noun_key}'"
        assert singular.get("ktiv_male"), f"noun_inflection.singular.ktiv_male is empty for '{noun_key}'"
        assert plural.get("nikkud"), f"noun_inflection.plural.nikkud is empty for '{noun_key}'"
        assert plural.get("ktiv_male"), f"noun_inflection.plural.ktiv_male is empty for '{noun_key}'"

    def test_noun_detail_gender(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Noun gender must be 'masculine' or 'feminine'."""
        noun_key, _noun_entry, entry = self._scrape_noun_detail(tmp_path, monkeypatch)

        ni = entry.get("noun_inflection") or {}
        gender = ni.get("gender", "")
        assert gender in ("masculine", "feminine"), (
            f"noun_inflection.gender is {gender!r} for '{noun_key}' (expected 'masculine' or 'feminine')"
        )

    def test_noun_detail_scraped_flag(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """detail_scraped must be True after a successful noun detail scrape."""
        noun_key, _noun_entry, entry = self._scrape_noun_detail(tmp_path, monkeypatch)

        assert entry.get("detail_scraped") is True, (
            f"detail_scraped is not True after scrape for '{noun_key}'"
        )


# ---------------------------------------------------------------------------
# Test class: verb detail scrape (Hif'il)
# ---------------------------------------------------------------------------


@pytest.mark.integration
@skip_integration
class TestDetailScrapeVerb:
    """Validate pealim_detail_scrape for a known Hif'il verb (lehagid, slug HIFIL_VERB_SLUG).

    Each test seeds a one-entry words.json in ``tmp_path``, runs the detail
    scraper against it, and inspects the entry that was written back.
    """

    def _build_test_words_json(self, tmp_path: Path) -> Path:
        """
        Write a minimal words.json containing only the known Hif'il verb entry.
        The detail scraper's run() is expected to pick it up because pos starts
        with 'Verb' and detail_scraped is absent/False.

        Returns the path of the written file.
        """
        words_path = tmp_path / "words.json"
        entry = {
            "word": {"nikkud": HIFIL_VERB_NIKKUD, "ktiv_male": "להגיד"},
            "slug": HIFIL_VERB_SLUG,
            "root": ["נ", "ג", "ד"],
            "pos": "Verb",
            "pos_hebrew": "פֹּעַל — הִפְעִיל",
            "meaning": HIFIL_VERB_MEANING,
            "meaning_raw": HIFIL_VERB_MEANING,
            "audio_url": "",
            "audio_file": "להגיד.mp3",
            "tags": "שורש::נגד פעלים",
            "last_scrape_date": "2026-03-08",
            "vocab_legacy_guid": None,
            "frequency": None,
            "pseudo_frequency": None,
            "emoji": None,
            "emoji_source": None,
            "emoji_visible": False,
            "image": None,
            "image_source": None,
            "hint": "",
            "shared_roots": [],
            "confusable_group": None,
            "confusables_guid": None,
            "examples": None,
            "noun_inflection": None,
            "conjugation": None,
            "adjective_inflection": None,
            "preposition_inflection": None,
            # Intentionally no detail_scraped key so the scraper processes it
        }
        words = {HIFIL_VERB_NIKKUD: entry}
        with words_path.open("w", encoding="utf-8") as fh:
            json.dump(words, fh, ensure_ascii=False, indent=2)
        return words_path

    def _scrape_verb_entry(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> dict:
        """Run the detail scraper on the seeded verb and return its updated entry.

        Points ``pealim_detail_scrape.WORDS_JSON`` at the one-entry file from
        ``_build_test_words_json`` (undone by *monkeypatch* at teardown), runs
        the scraper, and re-reads the file. Returns ``{}`` if the verb's key is
        missing afterwards, so callers fail on their own assertion message.
        """
        words_path = self._build_test_words_json(tmp_path)
        monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)

        pealim_detail_scrape.run(test=1, force_refresh=True, verbs_only=True)

        return _words_from_file(words_path).get(HIFIL_VERB_NIKKUD, {})

    def test_verb_detail_conjugation_not_null(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """After detail scrape, conjugation must not be null for the Hif'il verb."""
        entry = self._scrape_verb_entry(tmp_path, monkeypatch)

        assert entry.get("conjugation") is not None, f"conjugation is None after detail scrape for {HIFIL_VERB_SLUG}"

    def test_verb_detail_binyan(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """conjugation.binyan must be Hif'il and binyan_hebrew must be the correct nikkud."""
        entry = self._scrape_verb_entry(tmp_path, monkeypatch)
        conj = entry.get("conjugation") or {}

        # Build the message from the expected value so it renders without a
        # stray backslash before the apostrophe.
        expected_binyan = "Hif'il"
        assert conj.get("binyan") == expected_binyan, (
            f"Expected binyan={expected_binyan!r}, got {conj.get('binyan')!r}"
        )
        assert conj.get("binyan_hebrew") == "הִפְעִיל", (
            f"Expected binyan_hebrew='הִפְעִיל', got {conj.get('binyan_hebrew')!r}"
        )

    def test_verb_detail_infinitive_and_reference_form(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """infinitive.nikkud and reference_form.nikkud must be non-empty Hebrew strings."""
        entry = self._scrape_verb_entry(tmp_path, monkeypatch)
        conj = entry.get("conjugation") or {}

        infinitive = conj.get("infinitive") or {}
        reference_form = conj.get("reference_form") or {}

        inf_nikkud = infinitive.get("nikkud", "")
        ref_nikkud = reference_form.get("nikkud", "")

        assert inf_nikkud and _has_hebrew(inf_nikkud), (
            f"infinitive.nikkud is empty or has no Hebrew chars: {inf_nikkud!r}"
        )
        assert ref_nikkud and _has_hebrew(ref_nikkud), (
            f"reference_form.nikkud (3ms past) is empty or has no Hebrew chars: {ref_nikkud!r}"
        )

    def test_verb_detail_active_forms_count_and_structure(
        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """active_forms must be a list of at least 20 entries, each with required sub-fields."""
        entry = self._scrape_verb_entry(tmp_path, monkeypatch)
        conj = entry.get("conjugation") or {}
        active_forms = conj.get("active_forms")

        assert isinstance(active_forms, list), f"active_forms is not a list: {type(active_forms)}"
        assert len(active_forms) >= 20, f"Expected at least 20 active forms, got {len(active_forms)}"

        for i, form in enumerate(active_forms):
            assert form.get("person"), f"active_forms[{i}].person is empty"
            assert form.get("tense"), f"active_forms[{i}].tense is empty"
            form_block = form.get("form") or {}
            assert form_block.get("nikkud") and _has_hebrew(form_block["nikkud"]), (
                f"active_forms[{i}].form.nikkud is empty or has no Hebrew: {form_block.get('nikkud')!r}"
            )
            assert form_block.get("ktiv_male") and _has_hebrew(form_block["ktiv_male"]), (
                f"active_forms[{i}].form.ktiv_male is empty or has no Hebrew: {form_block.get('ktiv_male')!r}"
            )

    def test_verb_detail_hufal_passive_section(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Hif'il verb must have a non-null hufal_pual_forms list and reference_form_passive."""
        entry = self._scrape_verb_entry(tmp_path, monkeypatch)
        conj = entry.get("conjugation") or {}

        hufal_forms = conj.get("hufal_pual_forms")
        assert hufal_forms is not None, "hufal_pual_forms is None — expected Huf'al passive section for a Hif'il verb"
        assert isinstance(hufal_forms, list), f"hufal_pual_forms is not a list: {type(hufal_forms)}"
        assert len(hufal_forms) > 0, "hufal_pual_forms list is empty"

        ref_passive = conj.get("reference_form_passive")
        assert ref_passive is not None, "reference_form_passive is None — expected a Huf'al 3ms past form"
        passive_nikkud = (ref_passive or {}).get("nikkud", "")
        assert passive_nikkud and _has_hebrew(passive_nikkud), (
            f"reference_form_passive.nikkud is empty or has no Hebrew: {passive_nikkud!r}"
        )

    def test_verb_detail_scraped_flag(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """detail_scraped must be True after a successful verb detail scrape."""
        entry = self._scrape_verb_entry(tmp_path, monkeypatch)

        assert entry.get("detail_scraped") is True, f"detail_scraped is not True after scrape for {HIFIL_VERB_SLUG}"