Remove strip_nikkud from all pipeline files — use ktiv_male directly. Fix case-insensitive binyan matching in detail scraper (og:description uses UPPERCASE). Fix integration test slugs and test limits. Delete legacy CSVs, stale .apkg, and dead scripts from git. Add vulture to pre-commit hook. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
441 lines
20 KiB
Python
441 lines
20 KiB
Python
#!/usr/bin/env python3
"""Integration tests: scrape real pealim.com pages and validate data.

These tests hit pealim.com directly. They are skipped when the environment
variable SKIP_INTEGRATION is set to any non-empty string.

Run with:
    pytest tests/test_scraper_integration.py -v -m integration
"""
|
||
|
||
import json
|
||
import os
|
||
import re
|
||
import sys
|
||
import time
|
||
from pathlib import Path
|
||
|
||
import pytest
|
||
|
||
# Add project root to path so all sibling modules are importable
|
||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||
|
||
import pealim_detail_scrape
|
||
import pealim_list_scrape
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Skip marker
|
||
# ---------------------------------------------------------------------------
|
||
|
||
# Opt-out switch: any non-empty SKIP_INTEGRATION value skips the decorated tests.
skip_integration = pytest.mark.skipif(
    os.getenv("SKIP_INTEGRATION", "") != "",
    reason="SKIP_INTEGRATION is set",
)
|
||
|
||
# Fixed Hif'il verb used by the verb detail tests. It is addressed by slug,
# so it does not depend on what the first list page happens to contain.
# לְהַגִּיד (to tell/say) — Hif'il, slug 1135-lehagid
HIFIL_VERB_SLUG = "1135-lehagid"
HIFIL_VERB_NIKKUD = "לְהַגִּיד"
HIFIL_VERB_MEANING = "to say, to tell"

# Lower bound on the number of entries a single list page must yield
MIN_LIST_ENTRIES = 10

# Matches one Hebrew letter in the range U+05D0–U+05EA (nikkud marks are outside it)
HEBREW_CHAR_RE = re.compile(r"[\u05d0-\u05ea]")

# Slug shape: one or more digits, a hyphen, then one or more word characters
SLUG_RE = re.compile(r"^\d+-\w+$")
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _has_hebrew(text: str) -> bool:
    """Tell whether *text* contains at least one character matched by HEBREW_CHAR_RE."""
    return HEBREW_CHAR_RE.search(text) is not None
|
||
|
||
|
||
def _words_from_file(path: Path) -> dict:
    """Read the UTF-8 JSON document at *path* and return the decoded object."""
    return json.loads(path.read_text(encoding="utf-8"))
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test class: list page scrape
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
@pytest.mark.integration
@skip_integration
class TestListScrape:
    """Exercise pealim_list_scrape against a live fetch of /dict/?page=1."""

    @staticmethod
    def _scrape_first_page(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
        """Scrape one list page into *tmp_path* and return the words.json path.

        The scraper's WORDS_JSON and PROGRESS_JSON module attributes are
        redirected into *tmp_path* before run_scrape() is called.
        """
        words_json = tmp_path / "words.json"
        monkeypatch.setattr(pealim_list_scrape, "WORDS_JSON", words_json)
        monkeypatch.setattr(pealim_list_scrape, "PROGRESS_JSON", tmp_path / "list_scrape_progress.json")

        # Scrape exactly one page
        pealim_list_scrape.run_scrape(total_pages=1, force_refresh=True)
        return words_json

    def test_list_page_1_produces_entries(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """The first page has to produce no fewer than MIN_LIST_ENTRIES records."""
        words_json = self._scrape_first_page(tmp_path, monkeypatch)

        assert words_json.exists(), "words.json was not created after scrape"
        scraped = _words_from_file(words_json)
        assert len(scraped) >= MIN_LIST_ENTRIES, (
            f"Expected at least {MIN_LIST_ENTRIES} entries from page 1, got {len(scraped)}"
        )

    def test_list_entries_have_required_fields(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Each record needs non-empty nikkud, ktiv_male, slug, pos and meaning values."""
        scraped = _words_from_file(self._scrape_first_page(tmp_path, monkeypatch))

        for key, entry in scraped.items():
            word = entry.get("word", {})
            nikkud, ktiv_male = word.get("nikkud", ""), word.get("ktiv_male", "")
            slug, pos, meaning = entry.get("slug", ""), entry.get("pos", ""), entry.get("meaning", "")

            assert nikkud, f"Entry '{key}': word.nikkud is empty"
            assert _has_hebrew(nikkud), f"Entry '{key}': word.nikkud has no Hebrew chars: {nikkud!r}"
            assert ktiv_male, f"Entry '{key}': word.ktiv_male is empty"
            assert slug, f"Entry '{key}': slug is empty"
            assert SLUG_RE.match(slug), f"Entry '{key}': slug does not match \\d+-\\w+ pattern: {slug!r}"
            assert pos, f"Entry '{key}': pos is empty"
            assert meaning, f"Entry '{key}': meaning is empty"

    def test_list_at_least_one_entry_has_root(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Some record on the first page has to carry a non-empty root."""
        scraped = _words_from_file(self._scrape_first_page(tmp_path, monkeypatch))

        assert any(e.get("root") for e in scraped.values()), "No entries on page 1 have a non-empty root list"

    def test_list_at_least_one_entry_has_audio(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Some record on the first page has to carry a non-empty audio_url."""
        scraped = _words_from_file(self._scrape_first_page(tmp_path, monkeypatch))

        assert any(e.get("audio_url") for e in scraped.values()), "No entries on page 1 have a non-empty audio_url"

    def test_list_post_process_fields_exist(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Each record has to expose 'confusable_group' and a list-valued 'shared_roots' after the scrape."""
        scraped = _words_from_file(self._scrape_first_page(tmp_path, monkeypatch))

        for key, entry in scraped.items():
            assert "confusable_group" in entry, f"Entry '{key}' missing 'confusable_group' key"
            assert "shared_roots" in entry, f"Entry '{key}' missing 'shared_roots' key"
            assert isinstance(entry["shared_roots"], list), f"Entry '{key}': shared_roots is not a list"
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test class: noun detail scrape
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
@pytest.mark.integration
@skip_integration
class TestDetailScrapeNoun:
    """Exercise pealim_detail_scrape on a noun taken from a live list page."""

    def _find_noun_with_root(self, words: dict) -> tuple[str, dict] | None:
        """Pick the first (key, entry) whose pos starts with 'Noun' and which has both a root and a slug."""
        candidates = (
            (key, entry)
            for key, entry in words.items()
            if entry.get("pos", "").startswith("Noun") and entry.get("root") and entry.get("slug")
        )
        return next(candidates, None)

    def _prepare_words_json(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> tuple[Path, dict]:
        """
        Run the list scraper for one page with its output redirected into
        tmp_path, then hand back (words.json path, loaded words).
        """
        target = tmp_path / "words.json"
        monkeypatch.setattr(pealim_list_scrape, "WORDS_JSON", target)
        monkeypatch.setattr(pealim_list_scrape, "PROGRESS_JSON", tmp_path / "list_scrape_progress.json")

        pealim_list_scrape.run_scrape(total_pages=1, force_refresh=True)
        return target, _words_from_file(target)

    def _scrape_first_noun(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> tuple[str, dict, dict]:
        """List-scrape page 1, choose a noun, run the detail scraper, return (key, original entry, reloaded words)."""
        target, listed = self._prepare_words_json(tmp_path, monkeypatch)

        chosen = self._find_noun_with_root(listed)
        assert chosen is not None, "No noun with a root found on page 1"
        noun_key, noun_entry = chosen

        # Redirect the detail scraper to the same temporary words.json.
        # NOTE(review): run() receives the whole page-1 file with no limit argument,
        # so it presumably visits every noun in it, not only the chosen one — confirm.
        monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", target)

        # Small rate-limit delay between list scrape and detail scrape
        time.sleep(1.0)

        pealim_detail_scrape.run(force_refresh=True, nouns_only=True)
        return noun_key, noun_entry, _words_from_file(target)

    def test_noun_detail_inflection_not_null(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """The chosen noun's noun_inflection has to be populated by the detail scrape."""
        noun_key, noun_entry, refreshed = self._scrape_first_noun(tmp_path, monkeypatch)
        entry = refreshed.get(noun_key, {})

        assert entry.get("noun_inflection") is not None, (
            f"noun_inflection is None after detail scrape for '{noun_key}' (slug={noun_entry.get('slug')})"
        )

    def test_noun_detail_singular_and_plural_forms(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Both the singular and plural forms need non-empty nikkud and ktiv_male."""
        noun_key, _noun_entry, refreshed = self._scrape_first_noun(tmp_path, monkeypatch)
        inflection = refreshed[noun_key].get("noun_inflection", {}) or {}

        singular = inflection.get("singular") or {}
        plural = inflection.get("plural") or {}

        assert singular.get("nikkud"), f"noun_inflection.singular.nikkud is empty for '{noun_key}'"
        assert singular.get("ktiv_male"), f"noun_inflection.singular.ktiv_male is empty for '{noun_key}'"
        assert plural.get("nikkud"), f"noun_inflection.plural.nikkud is empty for '{noun_key}'"
        assert plural.get("ktiv_male"), f"noun_inflection.plural.ktiv_male is empty for '{noun_key}'"

    def test_noun_detail_gender(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """The scraped gender has to be either 'masculine' or 'feminine'."""
        noun_key, _noun_entry, refreshed = self._scrape_first_noun(tmp_path, monkeypatch)
        inflection = refreshed[noun_key].get("noun_inflection", {}) or {}

        gender = inflection.get("gender", "")
        assert gender in ("masculine", "feminine"), (
            f"noun_inflection.gender is {gender!r} for '{noun_key}' (expected 'masculine' or 'feminine')"
        )

    def test_noun_detail_scraped_flag(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """The chosen noun has to be flagged detail_scraped=True once the detail scrape finishes."""
        noun_key, _, refreshed = self._scrape_first_noun(tmp_path, monkeypatch)

        assert refreshed[noun_key].get("detail_scraped") is True, (
            f"detail_scraped is not True after scrape for '{noun_key}'"
        )
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Test class: verb detail scrape (Hif'il)
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
@pytest.mark.integration
@skip_integration
class TestDetailScrapeVerb:
    """Validate pealim_detail_scrape for a known Hif'il verb (lehagid; slug given by HIFIL_VERB_SLUG)."""

    def _build_test_words_json(self, tmp_path: Path) -> Path:
        """
        Write a minimal words.json containing only the known Hif'il verb entry.

        The entry has pos "Verb" and deliberately carries no detail_scraped
        key, so that the detail scraper treats it as not yet processed.

        Returns the path of the file written under tmp_path.
        """
        words_path = tmp_path / "words.json"
        entry = {
            "word": {"nikkud": HIFIL_VERB_NIKKUD, "ktiv_male": "להגיד"},
            "slug": HIFIL_VERB_SLUG,
            "root": ["נ", "ג", "ד"],
            "pos": "Verb",
            "pos_hebrew": "פֹּעַל — הִפְעִיל",
            "meaning": HIFIL_VERB_MEANING,
            "meaning_raw": HIFIL_VERB_MEANING,
            "audio_url": "",
            "audio_file": "להגיד.mp3",
            "tags": "שורש::נגד פעלים",
            "last_scrape_date": "2026-03-08",
            "vocab_legacy_guid": None,
            "frequency": None,
            "pseudo_frequency": None,
            "emoji": None,
            "emoji_source": None,
            "emoji_visible": False,
            "image": None,
            "image_source": None,
            "hint": "",
            "shared_roots": [],
            "confusable_group": None,
            "confusables_guid": None,
            "examples": None,
            "noun_inflection": None,
            "conjugation": None,
            "adjective_inflection": None,
            "preposition_inflection": None,
            # Intentionally no detail_scraped key so the scraper processes it
        }
        words = {HIFIL_VERB_NIKKUD: entry}
        with words_path.open("w", encoding="utf-8") as fh:
            json.dump(words, fh, ensure_ascii=False, indent=2)
        return words_path

    def _scrape_verb_words(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> dict:
        """Build the one-verb words.json, run the detail scraper on it, and return the reloaded file contents."""
        words_path = self._build_test_words_json(tmp_path)
        monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)

        pealim_detail_scrape.run(test=1, force_refresh=True, verbs_only=True)

        return _words_from_file(words_path)

    def _scrape_verb_conjugation(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> dict:
        """Return the scraped conjugation block of the verb, or {} when it is missing or null."""
        words = self._scrape_verb_words(tmp_path, monkeypatch)
        return words[HIFIL_VERB_NIKKUD].get("conjugation") or {}

    def test_verb_detail_conjugation_not_null(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """After detail scrape, conjugation must not be null for the Hif'il verb."""
        words = self._scrape_verb_words(tmp_path, monkeypatch)
        entry = words.get(HIFIL_VERB_NIKKUD, {})
        assert entry.get("conjugation") is not None, f"conjugation is None after detail scrape for {HIFIL_VERB_SLUG}"

    def test_verb_detail_binyan(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """conjugation.binyan must be "Hif'il" and binyan_hebrew must be the correct nikkud."""
        conj = self._scrape_verb_conjugation(tmp_path, monkeypatch)

        # Expected value is formatted with !r so the message shows it the same way as the
        # actual value, without the stray backslash a hand-escaped quote would print.
        expected_binyan = "Hif'il"
        assert conj.get("binyan") == expected_binyan, (
            f"Expected binyan={expected_binyan!r}, got {conj.get('binyan')!r}"
        )
        assert conj.get("binyan_hebrew") == "הִפְעִיל", (
            f"Expected binyan_hebrew='הִפְעִיל', got {conj.get('binyan_hebrew')!r}"
        )

    def test_verb_detail_infinitive_and_reference_form(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """infinitive.nikkud and reference_form.nikkud must be non-empty Hebrew strings."""
        conj = self._scrape_verb_conjugation(tmp_path, monkeypatch)

        infinitive = conj.get("infinitive") or {}
        reference_form = conj.get("reference_form") or {}

        inf_nikkud = infinitive.get("nikkud", "")
        ref_nikkud = reference_form.get("nikkud", "")

        assert inf_nikkud and _has_hebrew(inf_nikkud), (
            f"infinitive.nikkud is empty or has no Hebrew chars: {inf_nikkud!r}"
        )
        assert ref_nikkud and _has_hebrew(ref_nikkud), (
            f"reference_form.nikkud (3ms past) is empty or has no Hebrew chars: {ref_nikkud!r}"
        )

    def test_verb_detail_active_forms_count_and_structure(
        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """active_forms must be a list of at least 20 entries, each with required sub-fields."""
        conj = self._scrape_verb_conjugation(tmp_path, monkeypatch)
        active_forms = conj.get("active_forms")

        assert isinstance(active_forms, list), f"active_forms is not a list: {type(active_forms)}"
        assert len(active_forms) >= 20, f"Expected at least 20 active forms, got {len(active_forms)}"

        for i, form in enumerate(active_forms):
            assert form.get("person"), f"active_forms[{i}].person is empty"
            assert form.get("tense"), f"active_forms[{i}].tense is empty"
            form_block = form.get("form") or {}
            assert form_block.get("nikkud") and _has_hebrew(form_block["nikkud"]), (
                f"active_forms[{i}].form.nikkud is empty or has no Hebrew: {form_block.get('nikkud')!r}"
            )
            assert form_block.get("ktiv_male") and _has_hebrew(form_block["ktiv_male"]), (
                f"active_forms[{i}].form.ktiv_male is empty or has no Hebrew: {form_block.get('ktiv_male')!r}"
            )

    def test_verb_detail_hufal_passive_section(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Hif'il verb must have a non-null hufal_pual_forms list and reference_form_passive."""
        conj = self._scrape_verb_conjugation(tmp_path, monkeypatch)

        hufal_forms = conj.get("hufal_pual_forms")
        assert hufal_forms is not None, "hufal_pual_forms is None — expected Huf'al passive section for a Hif'il verb"
        assert isinstance(hufal_forms, list), f"hufal_pual_forms is not a list: {type(hufal_forms)}"
        assert len(hufal_forms) > 0, "hufal_pual_forms list is empty"

        ref_passive = conj.get("reference_form_passive")
        assert ref_passive is not None, "reference_form_passive is None — expected a Huf'al 3ms past form"
        # ref_passive is known to be non-None here, so it can be read directly.
        passive_nikkud = ref_passive.get("nikkud", "")
        assert passive_nikkud and _has_hebrew(passive_nikkud), (
            f"reference_form_passive.nikkud is empty or has no Hebrew: {passive_nikkud!r}"
        )

    def test_verb_detail_scraped_flag(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """detail_scraped must be True after a successful verb detail scrape."""
        words = self._scrape_verb_words(tmp_path, monkeypatch)
        entry = words.get(HIFIL_VERB_NIKKUD, {})
        assert entry.get("detail_scraped") is True, f"detail_scraped is not True after scrape for {HIFIL_VERB_SLUG}"
|