hebrew_flash_cards/tests/test_detail_scrape.py
Sochen efd0745ada Sprint 14: deck template/CSS overhaul + Sprint 12 detail scrape
Template & CSS fixes (15 items from Mar 9 feedback):
- Fix conjugation front showing 3ms form instead of infinitive
- Rename conjugation model to "Hebrew Conjugation"
- Strip Hebrew parenthesized text from English meanings
- Shoresh separator: spaces → dots (א.כ.ל)
- Remove duplicate English meaning from cloze back
- Remove example sentences from vocab front/back (cloze only)
- Center-align audio buttons on all decks
- Fix parenthesis spacing: "you(feminine,singular)" → "you (feminine, singular)"
- Unify sec-key/sec-label fonts, make keys bold
- Size overhaul: bigger Hebrew (42px), meaning (34px), secondary (28px)
- Center-align related words groups
- Sort confusables by average frequency
- Plurals: show Gender (Hebrew) before Mishkal, strip emoji from meaning
- Clean duplicate quotation marks in cloze sentences

Sprint 12 carry-forward (detail scrape + EPUB):
- Adjective/preposition detail scraping in pealim_detail_scrape.py
- EPUB example matching rewrite in epub_examples.py
- Delete benyehuda.py and rebuild_sentence_matches.py (merged)
- 49 parser tests for detail scraping
- SCHEMA.yaml updates for new fields

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-10 07:44:47 +00:00

486 lines
18 KiB
Python

"""Tests for adjective and preposition detail page parsing in pealim_detail_scrape.py."""
import sys
from pathlib import Path
import pytest
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from pealim_detail_scrape import (
_parse_adjective_table,
_parse_adjective_table_vl,
_parse_preposition_table,
_parse_preposition_table_vl,
_scrape_adjective_detail,
_scrape_preposition_detail,
)
# ---------------------------------------------------------------------------
# Fixtures — real HTML snippets from pealim.com
# ---------------------------------------------------------------------------
# Adjective inflection table with vowel points. The single body row has four
# cells whose inner div ids are ms-a, fs-a, mp-a and fp-a. Each cell carries
# an audio-play span (data-audio URL), a menukad span holding the pointed
# Hebrew form, and a meaning div.
ADJECTIVE_MO_TABLE = """
<table class="table table-condensed conjugation-table">
<thead>
<tr>
<th class="column-header" colspan="2">Singular</th>
<th class="column-header" colspan="2">Plural</th>
</tr>
<tr>
<th class="column-header">Masculine</th>
<th class="column-header">Feminine</th>
<th class="column-header">Masculine</th>
<th class="column-header">Feminine</th>
</tr>
</thead>
<tbody>
<tr>
<td class="conj-td">
<div id="ms-a">
<div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/dn/dngfpnovmytc.mp3">&#128266;</span>
<span class="menukad">אֲבִיבִי</span>
</div></div>
<div class="meaning">spring-like, vernal</div>
</div>
</td>
<td class="conj-td">
<div id="fs-a">
<div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/1j/1j6srg3do7n5k.mp3">&#128266;</span>
<span class="menukad">אֲבִיבִית</span>
</div></div>
<div class="meaning">spring-like, vernal</div>
</div>
</td>
<td class="conj-td">
<div id="mp-a">
<div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/tj/tjrhw0b5dkhc.mp3">&#128266;</span>
<span class="menukad">אֲבִיבִיִּים</span>
</div></div>
<div class="meaning">spring-like, vernal</div>
</div>
</td>
<td class="conj-td">
<div id="fp-a">
<div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/h3/h3u1ml5a4xcf.mp3">&#128266;</span>
<span class="menukad">אֲבִיבִיּוֹת</span>
</div></div>
<div class="meaning">spring-like, vernal</div>
</div>
</td>
</tr>
</tbody>
</table>
"""
# VL version: menukad spans contain unvowelled text (hebstyle=vl).
# Same four div ids (ms-a, fs-a, mp-a, fp-a) as the vowelled fixture, but this
# snippet has no <thead>, no audio-play spans and no meaning divs.
ADJECTIVE_VL_TABLE = """
<table class="table table-condensed conjugation-table">
<tbody>
<tr>
<td class="conj-td">
<div id="ms-a"><div><div>
<span class="menukad">אביבי</span>
</div></div></div>
</td>
<td class="conj-td">
<div id="fs-a"><div><div>
<span class="menukad">אביבית</span>
</div></div></div>
</td>
<td class="conj-td">
<div id="mp-a"><div><div>
<span class="menukad">אביביים</span>
</div></div></div>
</td>
<td class="conj-td">
<div id="fp-a"><div><div>
<span class="menukad">אביביות</span>
</div></div></div>
</td>
</tr>
</tbody>
</table>
"""
# Preposition inflection table with vowel points: three body rows labelled
# 1st / 2nd / 3rd. The 1st-person row has two colspan="2" cells (P-1s, P-1p);
# the 2nd and 3rd rows have four cells each (P-2ms .. P-2fp, P-3ms .. P-3fp),
# for ten forms in total. Every cell holds an audio-play span (data-audio URL),
# a menukad span with the pointed form, and a meaning div.
PREPOSITION_MO_TABLE = """
<table class="table table-condensed conjugation-table">
<thead>
<tr>
<th rowspan="2">Person</th>
<th class="column-header" colspan="2">Singular</th>
<th class="column-header" colspan="2">Plural</th>
</tr>
<tr>
<th class="column-header">Masculine</th>
<th class="column-header">Feminine</th>
<th class="column-header">Masculine</th>
<th class="column-header">Feminine</th>
</tr>
</thead>
<tbody>
<tr>
<th>1st</th>
<td class="conj-td" colspan="2">
<div id="P-1s"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/fk/fkp5faeteecr.mp3">&#128266;</span>
<span class="menukad">שֶׁלִּי</span>
</div></div><div class="meaning"><strong>of mine</strong></div></div>
</td>
<td class="conj-td" colspan="2">
<div id="P-1p"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/13/13uvi0dz6tgcc.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּנוּ</span>
</div></div><div class="meaning"><strong>of ours</strong></div></div>
</td>
</tr>
<tr>
<th>2nd</th>
<td class="conj-td">
<div id="P-2ms"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/shbxafq8ietx.mp3">&#128266;</span>
<span class="menukad">שֶׁלְּךָ</span>
</div></div><div class="meaning"><strong>of yours</strong> <em>m. sg.</em></div></div>
</td>
<td class="conj-td">
<div id="P-2fs"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/sh9ue3a8buo3.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּךְ</span>
</div></div><div class="meaning"><strong>of yours</strong> <em>f. sg.</em></div></div>
</td>
<td class="conj-td">
<div id="P-2mp"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/ol/olx8vzsctlzn.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּכֶם</span>
</div></div><div class="meaning"><strong>of yours</strong> <em>m. pl.</em></div></div>
</td>
<td class="conj-td">
<div id="P-2fp"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/ol/olxrms6dl8eq.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּכֶן</span>
</div></div><div class="meaning"><strong>of yours</strong> <em>f. pl.</em></div></div>
</td>
</tr>
<tr>
<th>3rd</th>
<td class="conj-td">
<div id="P-3ms"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/fk/fkp5qigelthg.mp3">&#128266;</span>
<span class="menukad">שֶׁלּוֹ</span>
</div></div><div class="meaning"><strong>of his</strong></div></div>
</td>
<td class="conj-td">
<div id="P-3fs"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/sh9w36hojm5w.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּהּ</span>
</div></div><div class="meaning"><strong>of hers</strong></div></div>
</td>
<td class="conj-td">
<div id="P-3mp"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/n9/n99z0jr8pint.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּהֶם</span>
</div></div><div class="meaning"><strong>of theirs</strong> <em>m.</em></div></div>
</td>
<td class="conj-td">
<div id="P-3fp"><div><div>
<span class="audio-play" data-audio="https://audio.pealim.com/v0/n9/n9ahrc59h52w.mp3">&#128266;</span>
<span class="menukad">שֶׁלָּהֶן</span>
</div></div><div class="meaning"><strong>of theirs</strong> <em>f.</em></div></div>
</td>
</tr>
</tbody>
</table>
"""
# Unvowelled counterpart of the preposition table: the same ten P-* div ids,
# with no <thead>, audio-play spans or meaning divs. Note that the P-2ms and
# P-2fs cells contain the same unvowelled spelling.
PREPOSITION_VL_TABLE = """
<table class="table table-condensed conjugation-table">
<tbody>
<tr>
<th>1st</th>
<td colspan="2"><div id="P-1s"><div><div>
<span class="menukad">שלי</span>
</div></div></div></td>
<td colspan="2"><div id="P-1p"><div><div>
<span class="menukad">שלנו</span>
</div></div></div></td>
</tr>
<tr>
<th>2nd</th>
<td><div id="P-2ms"><div><div>
<span class="menukad">שלך</span>
</div></div></div></td>
<td><div id="P-2fs"><div><div>
<span class="menukad">שלך</span>
</div></div></div></td>
<td><div id="P-2mp"><div><div>
<span class="menukad">שלכם</span>
</div></div></div></td>
<td><div id="P-2fp"><div><div>
<span class="menukad">שלכן</span>
</div></div></div></td>
</tr>
<tr>
<th>3rd</th>
<td><div id="P-3ms"><div><div>
<span class="menukad">שלו</span>
</div></div></div></td>
<td><div id="P-3fs"><div><div>
<span class="menukad">שלה</span>
</div></div></div></td>
<td><div id="P-3mp"><div><div>
<span class="menukad">שלהם</span>
</div></div></div></td>
<td><div id="P-3fp"><div><div>
<span class="menukad">שלהן</span>
</div></div></div></td>
</tr>
</tbody>
</table>
"""
# Each table snippet wrapped in a bare <html><body> shell, giving the
# _scrape_*_detail() functions whole-page markup to parse.
_ADJECTIVE_MO_PAGE = "<html><body>" + ADJECTIVE_MO_TABLE + "</body></html>"
_ADJECTIVE_VL_PAGE = "<html><body>" + ADJECTIVE_VL_TABLE + "</body></html>"
_PREPOSITION_MO_PAGE = "<html><body>" + PREPOSITION_MO_TABLE + "</body></html>"
_PREPOSITION_VL_PAGE = "<html><body>" + PREPOSITION_VL_TABLE + "</body></html>"
# ---------------------------------------------------------------------------
# Adjective table tests
# ---------------------------------------------------------------------------
class TestParseAdjectiveTable:
    """Tests for _parse_adjective_table (mo/nikkud page).

    The fixture table is parsed once per test through the ``result`` fixture,
    so each test only states the expectation it checks.
    """

    @staticmethod
    def _soup(html: str):
        """Return *html* parsed by BeautifulSoup with the ``lxml`` backend."""
        # A regular function-scope import replaces the inline
        # ``__import__("bs4")`` calls; the file has no module-level bs4 import.
        from bs4 import BeautifulSoup

        return BeautifulSoup(html, "lxml")

    @pytest.fixture()
    def result(self) -> dict:
        """Output of _parse_adjective_table for ADJECTIVE_MO_TABLE."""
        return _parse_adjective_table(self._soup(ADJECTIVE_MO_TABLE))

    def test_returns_four_form_keys(self, result: dict) -> None:
        assert set(result.keys()) == {"ms", "fs", "mp", "fp"}

    def test_ms_nikkud(self, result: dict) -> None:
        assert result["ms"]["nikkud"] == "אֲבִיבִי"

    def test_fs_nikkud(self, result: dict) -> None:
        assert result["fs"]["nikkud"] == "אֲבִיבִית"

    def test_mp_nikkud(self, result: dict) -> None:
        assert result["mp"]["nikkud"] == "אֲבִיבִיִּים"

    def test_fp_nikkud(self, result: dict) -> None:
        assert result["fp"]["nikkud"] == "אֲבִיבִיּוֹת"

    def test_audio_url_present(self, result: dict) -> None:
        assert result["ms"]["audio_url"].startswith("https://audio.pealim.com/")

    def test_empty_on_missing_table(self) -> None:
        # A page without any table is expected to yield an empty dict.
        blank_page = self._soup("<html><body></body></html>")
        assert _parse_adjective_table(blank_page) == {}
class TestParseAdjectiveTableVl:
    """Tests for _parse_adjective_table_vl (ktiv male page).

    The fixture table is parsed once per test through the ``result`` fixture.
    """

    @pytest.fixture()
    def result(self) -> dict:
        """Output of _parse_adjective_table_vl for ADJECTIVE_VL_TABLE."""
        # A regular function-scope import replaces the inline
        # ``__import__("bs4")`` calls; the file has no module-level bs4 import.
        from bs4 import BeautifulSoup

        return _parse_adjective_table_vl(BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml"))

    def test_returns_four_form_keys(self, result: dict) -> None:
        assert set(result.keys()) == {"ms", "fs", "mp", "fp"}

    def test_ms_ktiv(self, result: dict) -> None:
        assert result["ms"] == "אביבי"

    def test_fs_ktiv(self, result: dict) -> None:
        assert result["fs"] == "אביבית"

    def test_mp_ktiv(self, result: dict) -> None:
        assert result["mp"] == "אביביים"

    def test_fp_ktiv(self, result: dict) -> None:
        assert result["fp"] == "אביביות"
# ---------------------------------------------------------------------------
# _scrape_adjective_detail tests
# ---------------------------------------------------------------------------
class TestScrapeAdjectiveDetail:
    """Schema-compliance checks for _scrape_adjective_detail."""

    @pytest.fixture()
    def result(self) -> dict:
        """Scrape output for the two adjective fixture pages."""
        mo_page, vl_page = _ADJECTIVE_MO_PAGE, _ADJECTIVE_VL_PAGE
        return _scrape_adjective_detail("9098-avivi", mo_page, vl_page)

    def test_returns_non_empty_dict(self, result: dict) -> None:
        assert bool(result) is True

    def test_ms_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["ms"]
        assert form["nikkud"] == "אֲבִיבִי"
        assert form["ktiv_male"] == "אביבי"

    def test_fs_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["fs"]
        assert form["nikkud"] == "אֲבִיבִית"
        assert form["ktiv_male"] == "אביבית"

    def test_mp_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["mp"]
        assert form["nikkud"] == "אֲבִיבִיִּים"
        assert form["ktiv_male"] == "אביביים"

    def test_fp_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["fp"]
        assert form["nikkud"] == "אֲבִיבִיּוֹת"
        assert form["ktiv_male"] == "אביביות"

    def test_mishkal_key_present(self, result: dict) -> None:
        # Only the key is checked: the minimal fixture has no PoS section,
        # so the value itself may be None.
        assert "mishkal" in result.keys()

    def test_mishkal_hebrew_key_present(self, result: dict) -> None:
        assert "mishkal_hebrew" in result.keys()

    def test_all_schema_keys_present(self, result: dict) -> None:
        required = {"ms", "fs", "mp", "fp", "mishkal", "mishkal_hebrew"}
        absent = required - set(result.keys())
        assert not absent

    def test_empty_on_no_table(self) -> None:
        blank = "<html><body></body></html>"
        assert _scrape_adjective_detail("missing", blank, blank) == {}
# ---------------------------------------------------------------------------
# Preposition table tests
# ---------------------------------------------------------------------------
class TestParsePrepositionTable:
    """Tests for _parse_preposition_table (mo/nikkud page)."""

    @staticmethod
    def _soup(html: str):
        """Return *html* parsed by BeautifulSoup with the ``lxml`` backend."""
        # A regular function-scope import replaces the inline
        # ``__import__("bs4")`` calls; the file has no module-level bs4 import.
        from bs4 import BeautifulSoup

        return BeautifulSoup(html, "lxml")

    @pytest.fixture()
    def result(self) -> dict:
        """Output of _parse_preposition_table for PREPOSITION_MO_TABLE."""
        return _parse_preposition_table(self._soup(PREPOSITION_MO_TABLE))

    def test_returns_ten_form_keys(self, result: dict) -> None:
        expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
        assert set(result.keys()) == expected

    def test_1s_nikkud(self, result: dict) -> None:
        assert result["1s"]["nikkud"] == "שֶׁלִּי"

    def test_1p_nikkud(self, result: dict) -> None:
        assert result["1p"]["nikkud"] == "שֶׁלָּנוּ"

    def test_2ms_nikkud(self, result: dict) -> None:
        assert result["2ms"]["nikkud"] == "שֶׁלְּךָ"

    def test_2fs_nikkud(self, result: dict) -> None:
        assert result["2fs"]["nikkud"] == "שֶׁלָּךְ"

    def test_2mp_nikkud(self, result: dict) -> None:
        assert result["2mp"]["nikkud"] == "שֶׁלָּכֶם"

    def test_2fp_nikkud(self, result: dict) -> None:
        assert result["2fp"]["nikkud"] == "שֶׁלָּכֶן"

    def test_3ms_nikkud(self, result: dict) -> None:
        assert result["3ms"]["nikkud"] == "שֶׁלּוֹ"

    def test_3fs_nikkud(self, result: dict) -> None:
        assert result["3fs"]["nikkud"] == "שֶׁלָּהּ"

    def test_3mp_nikkud(self, result: dict) -> None:
        assert result["3mp"]["nikkud"] == "שֶׁלָּהֶם"

    def test_3fp_nikkud(self, result: dict) -> None:
        assert result["3fp"]["nikkud"] == "שֶׁלָּהֶן"

    def test_audio_url_present(self, result: dict) -> None:
        assert result["1s"]["audio_url"].startswith("https://audio.pealim.com/")

    def test_empty_on_missing_table(self) -> None:
        # A page without any table is expected to yield an empty dict.
        blank_page = self._soup("<html><body></body></html>")
        assert _parse_preposition_table(blank_page) == {}
class TestParsePrepositionTableVl:
    """Tests for _parse_preposition_table_vl (ktiv male page)."""

    @pytest.fixture()
    def result(self) -> dict:
        """Output of _parse_preposition_table_vl for PREPOSITION_VL_TABLE."""
        # A regular function-scope import replaces the inline
        # ``__import__("bs4")`` call; the file has no module-level bs4 import.
        from bs4 import BeautifulSoup

        return _parse_preposition_table_vl(BeautifulSoup(PREPOSITION_VL_TABLE, "lxml"))

    def test_returns_ten_form_keys(self, result: dict) -> None:
        expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
        assert set(result.keys()) == expected

    def test_1s_ktiv(self, result: dict) -> None:
        assert result["1s"] == "שלי"

    def test_1p_ktiv(self, result: dict) -> None:
        assert result["1p"] == "שלנו"

    def test_2ms_ktiv(self, result: dict) -> None:
        assert result["2ms"] == "שלך"

    def test_3ms_ktiv(self, result: dict) -> None:
        assert result["3ms"] == "שלו"

    def test_3fp_ktiv(self, result: dict) -> None:
        assert result["3fp"] == "שלהן"
# ---------------------------------------------------------------------------
# _scrape_preposition_detail tests
# ---------------------------------------------------------------------------
class TestScrapePrepositionDetail:
    """Schema-compliance checks for _scrape_preposition_detail."""

    @pytest.fixture()
    def result(self) -> dict:
        """Scrape output for the two preposition fixture pages."""
        mo_page, vl_page = _PREPOSITION_MO_PAGE, _PREPOSITION_VL_PAGE
        return _scrape_preposition_detail("2643-shel", mo_page, vl_page)

    def test_returns_non_empty_dict(self, result: dict) -> None:
        assert bool(result) is True

    def test_all_ten_person_keys_present(self, result: dict) -> None:
        required = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
        absent = required - set(result.keys())
        assert not absent

    def test_1s_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["1s"]
        assert form["nikkud"] == "שֶׁלִּי"
        assert form["ktiv_male"] == "שלי"

    def test_1p_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["1p"]
        assert form["nikkud"] == "שֶׁלָּנוּ"
        assert form["ktiv_male"] == "שלנו"

    def test_2ms_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["2ms"]
        assert form["nikkud"] == "שֶׁלְּךָ"
        assert form["ktiv_male"] == "שלך"

    def test_3ms_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["3ms"]
        assert form["nikkud"] == "שֶׁלּוֹ"
        assert form["ktiv_male"] == "שלו"

    def test_3fs_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["3fs"]
        assert form["nikkud"] == "שֶׁלָּהּ"
        assert form["ktiv_male"] == "שלה"

    def test_3fp_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["3fp"]
        assert form["nikkud"] == "שֶׁלָּהֶן"
        assert form["ktiv_male"] == "שלהן"

    def test_empty_on_no_table(self) -> None:
        blank = "<html><body></body></html>"
        assert _scrape_preposition_detail("missing", blank, blank) == {}