hebrew_flash_cards/tests/test_scraper_integration.py
Sochen b2fef5aa8a Sprint 11.1: strip_nikkud cleanup, dead code removal, test fixes
Remove strip_nikkud from all pipeline files — use ktiv_male directly.
Fix case-insensitive binyan matching in detail scraper (og:description
uses UPPERCASE). Fix integration test slugs and test limits. Delete
legacy CSVs, stale .apkg, and dead scripts from git. Add vulture to
pre-commit hook.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-09 04:03:47 +00:00

441 lines
20 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env python3
"""Integration tests: scrape real pealim.com pages and validate data.
These tests hit pealim.com directly. They are skipped when the environment
variable SKIP_INTEGRATION is set to any non-empty string.
Run with:
pytest tests/test_scraper_integration.py -v -m integration
"""
import json
import os
import re
import sys
import time
from pathlib import Path
import pytest
# Add project root to path so all sibling modules are importable
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
import pealim_detail_scrape
import pealim_list_scrape
# ---------------------------------------------------------------------------
# Skip marker
# ---------------------------------------------------------------------------
# Applied to every test class below: setting SKIP_INTEGRATION to any
# non-empty string turns the decorated tests into skips.
skip_integration = pytest.mark.skipif(
    os.getenv("SKIP_INTEGRATION", "") != "",
    reason="SKIP_INTEGRATION is set",
)
# Fixed Hif'il verb used by the verb detail tests. It is seeded directly into
# a hand-built words.json, so it does not depend on what list page 1 returns.
# לְהַגִּיד (to tell/say) — Hif'il, slug 1135-lehagid
HIFIL_VERB_SLUG = "1135-lehagid"
HIFIL_VERB_NIKKUD = "לְהַגִּיד"
HIFIL_VERB_MEANING = "to say, to tell"
# Lower bound on the number of entries a single scraped list page must yield.
MIN_LIST_ENTRIES = 10
# Matches one Hebrew letter (U+05D0–U+05EA, alef through tav). Vowel points
# and other marks lie outside this range and do not match on their own.
HEBREW_CHAR_RE = re.compile(r"[\u05d0-\u05ea]")
# Slug pattern: one or more digits, a hyphen, then one or more word
# characters, anchored at both ends (e.g. "1135-lehagid").
SLUG_RE = re.compile(r"^\d+-\w+$")
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _has_hebrew(text: str) -> bool:
    """Tell whether *text* contains any character matched by HEBREW_CHAR_RE.

    Returns True as soon as one match exists anywhere in the string and
    False otherwise (including for the empty string).
    """
    return HEBREW_CHAR_RE.search(text) is not None
def _words_from_file(path: Path) -> dict:
    """Read *path* as UTF-8 text and return the JSON document it contains."""
    return json.loads(path.read_text(encoding="utf-8"))
# ---------------------------------------------------------------------------
# Test class: list page scrape
# ---------------------------------------------------------------------------
@pytest.mark.integration
@skip_integration
class TestListScrape:
    """Validate pealim_list_scrape against a real /dict/?page=1 fetch.

    Every test performs its own one-page scrape into ``tmp_path`` so that no
    test depends on files left behind by another.
    """

    def _scrape_first_page(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
        """Point the list scraper's output files at *tmp_path* and scrape one page.

        Returns the path that was installed as ``WORDS_JSON``. The file is not
        opened here, so a caller can still assert on its existence.
        """
        output = tmp_path / "words.json"
        monkeypatch.setattr(pealim_list_scrape, "WORDS_JSON", output)
        monkeypatch.setattr(pealim_list_scrape, "PROGRESS_JSON", tmp_path / "list_scrape_progress.json")
        pealim_list_scrape.run_scrape(total_pages=1, force_refresh=True)
        return output

    def test_list_page_1_produces_entries(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """The scrape must create words.json with at least MIN_LIST_ENTRIES entries."""
        output = self._scrape_first_page(tmp_path, monkeypatch)
        assert output.exists(), "words.json was not created after scrape"
        words = _words_from_file(output)
        assert len(words) >= MIN_LIST_ENTRIES, (
            f"Expected at least {MIN_LIST_ENTRIES} entries from page 1, got {len(words)}"
        )

    def test_list_entries_have_required_fields(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Each entry needs non-empty word.nikkud, word.ktiv_male, slug, pos and meaning."""
        words = _words_from_file(self._scrape_first_page(tmp_path, monkeypatch))
        for key, entry in words.items():
            # Pull every field up front with an empty-string fallback so a
            # missing key fails the matching assertion instead of raising.
            word_block = entry.get("word", {})
            nikkud = word_block.get("nikkud", "")
            ktiv_male = word_block.get("ktiv_male", "")
            slug = entry.get("slug", "")
            pos = entry.get("pos", "")
            meaning = entry.get("meaning", "")

            assert nikkud, f"Entry '{key}': word.nikkud is empty"
            assert _has_hebrew(nikkud), f"Entry '{key}': word.nikkud has no Hebrew chars: {nikkud!r}"
            assert ktiv_male, f"Entry '{key}': word.ktiv_male is empty"
            assert slug, f"Entry '{key}': slug is empty"
            assert SLUG_RE.match(slug), f"Entry '{key}': slug does not match \\d+-\\w+ pattern: {slug!r}"
            assert pos, f"Entry '{key}': pos is empty"
            assert meaning, f"Entry '{key}': meaning is empty"

    def test_list_at_least_one_entry_has_root(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Some entry on page 1 must carry a truthy ``root`` value."""
        words = _words_from_file(self._scrape_first_page(tmp_path, monkeypatch))
        assert any(e.get("root") for e in words.values()), "No entries on page 1 have a non-empty root list"

    def test_list_at_least_one_entry_has_audio(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Some entry on page 1 must carry a truthy ``audio_url`` value."""
        words = _words_from_file(self._scrape_first_page(tmp_path, monkeypatch))
        assert any(e.get("audio_url") for e in words.values()), "No entries on page 1 have a non-empty audio_url"

    def test_list_post_process_fields_exist(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Each entry must expose 'confusable_group' and a list-valued 'shared_roots'."""
        words = _words_from_file(self._scrape_first_page(tmp_path, monkeypatch))
        for key, entry in words.items():
            assert "confusable_group" in entry, f"Entry '{key}' missing 'confusable_group' key"
            assert "shared_roots" in entry, f"Entry '{key}' missing 'shared_roots' key"
            assert isinstance(entry["shared_roots"], list), f"Entry '{key}': shared_roots is not a list"
# ---------------------------------------------------------------------------
# Test class: noun detail scrape
# ---------------------------------------------------------------------------
@pytest.mark.integration
@skip_integration
class TestDetailScrapeNoun:
    """Validate pealim_detail_scrape for a real noun detail page.

    The noun under test is the first one found in a fresh list scrape, so
    every test starts with a list scrape and then runs the detail scraper.
    """

    def _find_noun_with_root(self, words: dict) -> tuple[str, dict] | None:
        """Return the first (key, entry) whose pos starts with 'Noun' and that has a root and a slug.

        Returns None when no entry in *words* qualifies.
        """
        candidates = (
            (key, entry)
            for key, entry in words.items()
            if entry.get("pos", "").startswith("Noun") and entry.get("root") and entry.get("slug")
        )
        return next(candidates, None)

    def _prepare_words_json(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> tuple[Path, dict]:
        """Scrape list page 1 into a fresh words.json under *tmp_path*.

        The list scraper's WORDS_JSON and PROGRESS_JSON are monkeypatched to
        files inside *tmp_path*. Returns ``(words_path, parsed_words)``.
        """
        words_path = tmp_path / "words.json"
        monkeypatch.setattr(pealim_list_scrape, "WORDS_JSON", words_path)
        monkeypatch.setattr(pealim_list_scrape, "PROGRESS_JSON", tmp_path / "list_scrape_progress.json")
        pealim_list_scrape.run_scrape(total_pages=1, force_refresh=True)
        return words_path, _words_from_file(words_path)

    def _scrape_noun_detail(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> tuple[str, dict, dict]:
        """Run list scrape, pick a noun, run the noun detail scrape, reload the file.

        Returns ``(noun_key, entry_before_detail_scrape, words_after_detail_scrape)``.
        Fails the calling test if page 1 has no suitable noun.
        """
        words_path, words = self._prepare_words_json(tmp_path, monkeypatch)
        pair = self._find_noun_with_root(words)
        assert pair is not None, "No noun with a root found on page 1"
        noun_key, noun_entry = pair
        # The detail scraper must read and write the same temporary file.
        monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
        # Small rate-limit delay between list scrape and detail scrape.
        time.sleep(1.0)
        pealim_detail_scrape.run(force_refresh=True, nouns_only=True)
        return noun_key, noun_entry, _words_from_file(words_path)

    def test_noun_detail_inflection_not_null(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """noun_inflection must be populated once the detail scrape has run."""
        noun_key, noun_entry, updated_words = self._scrape_noun_detail(tmp_path, monkeypatch)
        entry = updated_words.get(noun_key, {})
        assert entry.get("noun_inflection") is not None, (
            f"noun_inflection is None after detail scrape for '{noun_key}' (slug={noun_entry.get('slug')})"
        )

    def test_noun_detail_singular_and_plural_forms(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Both singular and plural forms need non-empty nikkud and ktiv_male."""
        noun_key, _noun_entry, updated_words = self._scrape_noun_detail(tmp_path, monkeypatch)
        # Collapse a null inflection (or null sub-form) to {} so the asserts
        # below report the missing field rather than raising.
        ni = updated_words[noun_key].get("noun_inflection", {}) or {}
        singular = ni.get("singular") or {}
        plural = ni.get("plural") or {}
        assert singular.get("nikkud"), f"noun_inflection.singular.nikkud is empty for '{noun_key}'"
        assert singular.get("ktiv_male"), f"noun_inflection.singular.ktiv_male is empty for '{noun_key}'"
        assert plural.get("nikkud"), f"noun_inflection.plural.nikkud is empty for '{noun_key}'"
        assert plural.get("ktiv_male"), f"noun_inflection.plural.ktiv_male is empty for '{noun_key}'"

    def test_noun_detail_gender(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """noun_inflection.gender must be either 'masculine' or 'feminine'."""
        noun_key, _noun_entry, updated_words = self._scrape_noun_detail(tmp_path, monkeypatch)
        ni = updated_words[noun_key].get("noun_inflection", {}) or {}
        gender = ni.get("gender", "")
        assert gender in ("masculine", "feminine"), (
            f"noun_inflection.gender is {gender!r} for '{noun_key}' (expected 'masculine' or 'feminine')"
        )

    def test_noun_detail_scraped_flag(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """detail_scraped must be exactly True after the noun detail scrape."""
        noun_key, _, updated_words = self._scrape_noun_detail(tmp_path, monkeypatch)
        assert updated_words[noun_key].get("detail_scraped") is True, (
            f"detail_scraped is not True after scrape for '{noun_key}'"
        )
# ---------------------------------------------------------------------------
# Test class: verb detail scrape (Hif'il)
# ---------------------------------------------------------------------------
@pytest.mark.integration
@skip_integration
class TestDetailScrapeVerb:
    """Validate pealim_detail_scrape for a known Hif'il verb.

    The verb is the one described by HIFIL_VERB_SLUG / HIFIL_VERB_NIKKUD; it
    is seeded into a hand-written words.json rather than taken from a list
    scrape.
    """

    def _build_test_words_json(self, tmp_path: Path) -> Path:
        """Write a words.json holding only the known Hif'il verb and return its path.

        The entry deliberately has no ``detail_scraped`` key so that the
        detail scraper treats it as not yet processed.
        """
        entry = {
            "word": {"nikkud": HIFIL_VERB_NIKKUD, "ktiv_male": "להגיד"},
            "slug": HIFIL_VERB_SLUG,
            "root": ["נ", "ג", "ד"],
            "pos": "Verb",
            "pos_hebrew": "פֹּעַל — הִפְעִיל",
            "meaning": HIFIL_VERB_MEANING,
            "meaning_raw": HIFIL_VERB_MEANING,
            "audio_url": "",
            "audio_file": "להגיד.mp3",
            "tags": "שורש::נגד פעלים",
            "last_scrape_date": "2026-03-08",
            "vocab_legacy_guid": None,
            "frequency": None,
            "pseudo_frequency": None,
            "emoji": None,
            "emoji_source": None,
            "emoji_visible": False,
            "image": None,
            "image_source": None,
            "hint": "",
            "shared_roots": [],
            "confusable_group": None,
            "confusables_guid": None,
            "examples": None,
            "noun_inflection": None,
            "conjugation": None,
            "adjective_inflection": None,
            "preposition_inflection": None,
            # Intentionally no detail_scraped key so the scraper processes it
        }
        words_path = tmp_path / "words.json"
        serialized = json.dumps({HIFIL_VERB_NIKKUD: entry}, ensure_ascii=False, indent=2)
        words_path.write_text(serialized, encoding="utf-8")
        return words_path

    def _scrape_verb_detail(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> dict:
        """Seed the verb, run the verb detail scrape on it, and return the reloaded words."""
        words_path = self._build_test_words_json(tmp_path)
        monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
        pealim_detail_scrape.run(test=1, force_refresh=True, verbs_only=True)
        return _words_from_file(words_path)

    def test_verb_detail_conjugation_not_null(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """conjugation must be populated for the Hif'il verb after the detail scrape."""
        words = self._scrape_verb_detail(tmp_path, monkeypatch)
        entry = words.get(HIFIL_VERB_NIKKUD, {})
        assert entry.get("conjugation") is not None, f"conjugation is None after detail scrape for {HIFIL_VERB_SLUG}"

    def test_verb_detail_binyan(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """conjugation.binyan must be "Hif'il" and binyan_hebrew its pointed Hebrew name."""
        words = self._scrape_verb_detail(tmp_path, monkeypatch)
        conj = words[HIFIL_VERB_NIKKUD].get("conjugation") or {}
        assert conj.get("binyan") == "Hif'il", f"Expected binyan='Hif\\'il', got {conj.get('binyan')!r}"
        assert conj.get("binyan_hebrew") == "הִפְעִיל", (
            f"Expected binyan_hebrew='הִפְעִיל', got {conj.get('binyan_hebrew')!r}"
        )

    def test_verb_detail_infinitive_and_reference_form(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """infinitive.nikkud and reference_form.nikkud must be non-empty and contain Hebrew."""
        words = self._scrape_verb_detail(tmp_path, monkeypatch)
        conj = words[HIFIL_VERB_NIKKUD].get("conjugation") or {}
        inf_nikkud = (conj.get("infinitive") or {}).get("nikkud", "")
        ref_nikkud = (conj.get("reference_form") or {}).get("nikkud", "")
        assert inf_nikkud and _has_hebrew(inf_nikkud), (
            f"infinitive.nikkud is empty or has no Hebrew chars: {inf_nikkud!r}"
        )
        assert ref_nikkud and _has_hebrew(ref_nikkud), (
            f"reference_form.nikkud (3ms past) is empty or has no Hebrew chars: {ref_nikkud!r}"
        )

    def test_verb_detail_active_forms_count_and_structure(
        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """active_forms must be a list of 20+ items, each with person, tense and a Hebrew form."""
        words = self._scrape_verb_detail(tmp_path, monkeypatch)
        conj = words[HIFIL_VERB_NIKKUD].get("conjugation") or {}
        active_forms = conj.get("active_forms")
        assert isinstance(active_forms, list), f"active_forms is not a list: {type(active_forms)}"
        assert len(active_forms) >= 20, f"Expected at least 20 active forms, got {len(active_forms)}"
        for i, form in enumerate(active_forms):
            assert form.get("person"), f"active_forms[{i}].person is empty"
            assert form.get("tense"), f"active_forms[{i}].tense is empty"
            form_block = form.get("form") or {}
            assert form_block.get("nikkud") and _has_hebrew(form_block["nikkud"]), (
                f"active_forms[{i}].form.nikkud is empty or has no Hebrew: {form_block.get('nikkud')!r}"
            )
            assert form_block.get("ktiv_male") and _has_hebrew(form_block["ktiv_male"]), (
                f"active_forms[{i}].form.ktiv_male is empty or has no Hebrew: {form_block.get('ktiv_male')!r}"
            )

    def test_verb_detail_hufal_passive_section(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """The verb must have a non-empty hufal_pual_forms list and a Hebrew reference_form_passive."""
        words = self._scrape_verb_detail(tmp_path, monkeypatch)
        conj = words[HIFIL_VERB_NIKKUD].get("conjugation") or {}

        hufal_forms = conj.get("hufal_pual_forms")
        assert hufal_forms is not None, "hufal_pual_forms is None — expected Huf'al passive section for a Hif'il verb"
        assert isinstance(hufal_forms, list), f"hufal_pual_forms is not a list: {type(hufal_forms)}"
        assert len(hufal_forms) > 0, "hufal_pual_forms list is empty"

        ref_passive = conj.get("reference_form_passive")
        assert ref_passive is not None, "reference_form_passive is None — expected a Huf'al 3ms past form"
        passive_nikkud = (ref_passive or {}).get("nikkud", "")
        assert passive_nikkud and _has_hebrew(passive_nikkud), (
            f"reference_form_passive.nikkud is empty or has no Hebrew: {passive_nikkud!r}"
        )

    def test_verb_detail_scraped_flag(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """detail_scraped must be exactly True after the verb detail scrape."""
        words = self._scrape_verb_detail(tmp_path, monkeypatch)
        entry = words.get(HIFIL_VERB_NIKKUD, {})
        assert entry.get("detail_scraped") is True, f"detail_scraped is not True after scrape for {HIFIL_VERB_SLUG}"