Sprint 11.1: strip_nikkud cleanup, dead code removal, test fixes

Remove strip_nikkud from all pipeline files — use ktiv_male directly.
Make binyan matching in the detail scraper case-insensitive (og:description
uses UPPERCASE). Fix integration test slugs and test limits. Delete
legacy CSVs, stale .apkg, and dead scripts from git. Add vulture to
pre-commit hook.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Sochen 2026-03-09 04:03:47 +00:00
parent a1d970a782
commit b2fef5aa8a
18 changed files with 71 additions and 60993 deletions

8
.gitignore vendored
View file

@ -47,6 +47,14 @@ data/epubs/
# Stray deck files # Stray deck files
Everything__*.apkg Everything__*.apkg
*.apkg
# Legacy CSV files (replaced by data/words.json)
*.csv
data/*.csv
# Dead whitelist files
vulture_whitelist.py
# Release artifacts — distributed via Forgejo releases, not committed to tree # Release artifacts — distributed via Forgejo releases, not committed to tree
releases/ releases/

View file

@ -15,8 +15,6 @@ from pathlib import Path
import genanki import genanki
from helpers import strip_nikkud as _strip_nikkud
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# Stable deck/model IDs — do not change these # Stable deck/model IDs — do not change these
@ -1315,7 +1313,8 @@ def build_confusables_deck(
at = f"[sound:{af}]" at = f"[sound:{af}]"
if not at: if not at:
slug = e.get("slug", "") or "" slug = e.get("slug", "") or ""
at = _audio_tag(_strip_nikkud(w), slug=slug) ktiv_male = e.get("word", {}).get("ktiv_male", "") or ""
at = _audio_tag(ktiv_male, slug=slug)
if at and at not in audio_parts: if at and at not in audio_parts:
audio_parts.append(at) audio_parts.append(at)
mp3_name = at.removeprefix("[sound:").removesuffix("]") mp3_name = at.removeprefix("[sound:").removesuffix("]")
@ -1415,11 +1414,15 @@ PLURAL_MODEL = genanki.Model(
) )
def _is_irregular_plural(gender: str, plural: str) -> bool: def _is_irregular_plural(gender: str, plural_ktiv: str) -> bool:
"""Detect irregular plurals: masc nouns with ות- suffix, fem with ים- suffix.""" """Detect irregular plurals: masc nouns with ות- suffix, fem with ים- suffix.
plural_stripped = _strip_nikkud(plural)
return (gender == "masculine" and plural_stripped.endswith("ות")) or ( Args:
gender == "feminine" and plural_stripped.endswith("ים") gender: ``"masculine"`` or ``"feminine"``.
plural_ktiv: ktiv male (no nikkud) form of the plural.
"""
return (gender == "masculine" and plural_ktiv.endswith("ות")) or (
gender == "feminine" and plural_ktiv.endswith("ים")
) )
@ -1453,13 +1456,14 @@ def build_plural_deck(
continue continue
singular = singular_data.get("nikkud", "") singular = singular_data.get("nikkud", "")
plural = plural_data.get("nikkud", "") plural = plural_data.get("nikkud", "")
plural_ktiv = plural_data.get("ktiv_male", "")
if not singular or not plural: if not singular or not plural:
continue continue
gender = noun_inflection.get("gender", "") gender = noun_inflection.get("gender", "")
mishkal = noun_inflection.get("mishkal") or "" mishkal = noun_inflection.get("mishkal") or ""
if _is_irregular_plural(gender, plural): if _is_irregular_plural(gender, plural_ktiv):
irregulars.append((unique_key, entry, noun_inflection)) irregulars.append((unique_key, entry, noun_inflection))
elif mishkal: elif mishkal:
by_mishkal.setdefault(mishkal, []).append((unique_key, entry, noun_inflection)) by_mishkal.setdefault(mishkal, []).append((unique_key, entry, noun_inflection))
@ -1475,7 +1479,9 @@ def build_plural_deck(
note_count = 0 note_count = 0
for _unique_key, entry, noun_inflection in selected: for _unique_key, entry, noun_inflection in selected:
singular = noun_inflection["singular"]["nikkud"] singular = noun_inflection["singular"]["nikkud"]
singular_ktiv = noun_inflection["singular"].get("ktiv_male", "")
plural = noun_inflection["plural"]["nikkud"] plural = noun_inflection["plural"]["nikkud"]
plural_ktiv = noun_inflection["plural"].get("ktiv_male", "")
gender = noun_inflection.get("gender") or "" gender = noun_inflection.get("gender") or ""
mishkal = noun_inflection.get("mishkal") or "" mishkal = noun_inflection.get("mishkal") or ""
meaning = entry.get("meaning") or "" meaning = entry.get("meaning") or ""
@ -1490,8 +1496,7 @@ def build_plural_deck(
sg_audio = "" sg_audio = ""
pl_audio = "" pl_audio = ""
if include_audio: if include_audio:
sg_no_nik = _strip_nikkud(singular) sg_tag = _audio_tag(singular_ktiv)
sg_tag = _audio_tag(sg_no_nik)
if sg_tag: if sg_tag:
sg_audio = sg_tag sg_audio = sg_tag
mp3_path = AUDIO_DIR / sg_tag.removeprefix("[sound:").removesuffix("]") mp3_path = AUDIO_DIR / sg_tag.removeprefix("[sound:").removesuffix("]")
@ -1501,7 +1506,7 @@ def build_plural_deck(
tags = [RELEASE_TAG] tags = [RELEASE_TAG]
if mishkal: if mishkal:
tags.append(f"mishkal::{mishkal}") tags.append(f"mishkal::{mishkal}")
if _is_irregular_plural(gender, plural): if _is_irregular_plural(gender, plural_ktiv):
tags.append("irregular") tags.append("irregular")
note = genanki.Note( note = genanki.Note(

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -15,8 +15,6 @@ from pathlib import Path
import requests import requests
from helpers import strip_nikkud as _strip_nikkud
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
FREQ_URL = "https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2016/he/he_50k.txt" FREQ_URL = "https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2016/he/he_50k.txt"
@ -45,7 +43,7 @@ def load(cache_path: Path = CACHE_PATH) -> None:
line = line.strip() line = line.strip()
if not line: if not line:
continue continue
word = _strip_nikkud(line.split()[0]) word = line.split()[0]
if word and word not in _freq: if word and word not in _freq:
_freq[word] = rank _freq[word] = rank
rank += 1 rank += 1
@ -60,11 +58,11 @@ def get_frequency_rank(word_no_nikkud: str) -> int | None:
""" """
Return the frequency rank of a word (1 = most common). Return the frequency rank of a word (1 = most common).
Returns None if not found in the corpus. Returns None if not found in the corpus.
Strips nikkud from the input before lookup. Expects ktiv male (no nikkud) input.
""" """
if not _freq: if not _freq:
load() load()
clean = _strip_nikkud(word_no_nikkud.strip()) clean = word_no_nikkud.strip()
return _freq.get(clean) return _freq.get(clean)

View file

@ -31,8 +31,6 @@ from pathlib import Path
import requests import requests
from helpers import strip_nikkud as _strip_nikkud
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
DATA_DIR = Path(__file__).parent / "data" DATA_DIR = Path(__file__).parent / "data"
@ -78,7 +76,7 @@ def is_concrete(english_meaning: str) -> bool:
def _safe_name(word_no_nikkud: str) -> str: def _safe_name(word_no_nikkud: str) -> str:
"""Create a safe ASCII-ish filename from a Hebrew word (strip to Hebrew letters only).""" """Create a safe ASCII-ish filename from a Hebrew word (strip to Hebrew letters only)."""
hebrew_only = re.sub(r"[^\u05d0-\u05ea]", "", _strip_nikkud(word_no_nikkud)) hebrew_only = re.sub(r"[^\u05d0-\u05ea]", "", word_no_nikkud)
return hebrew_only if hebrew_only else "unknown" return hebrew_only if hebrew_only else "unknown"
@ -261,7 +259,7 @@ def run(limit: int | None = None, dry_run: bool = False, single_word: str | None
if single_word and word_plain != single_word: if single_word and word_plain != single_word:
continue continue
cache_key = word_plain or _strip_nikkud(word) cache_key = word_plain
if cache_key in cache: if cache_key in cache:
skipped_cached += 1 skipped_cached += 1

Binary file not shown.

View file

@ -17,8 +17,6 @@ from pathlib import Path
import requests import requests
from helpers import strip_nikkud
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
DATA_DIR = Path(__file__).parent / "data" DATA_DIR = Path(__file__).parent / "data"
@ -58,11 +56,15 @@ def _make_audio_file(entry: dict) -> str:
Returns: Returns:
Filename string, e.g. ``"1234-shalom.mp3"`` or ``"שלום.mp3"``. Filename string, e.g. ``"1234-shalom.mp3"`` or ``"שלום.mp3"``.
""" """
slug: str = entry["slug"] audio_file = entry.get("audio_file", "")
if audio_file:
return audio_file
# Fallback: use slug for confusables, ktiv_male for others
slug = entry.get("slug", "")
if entry.get("confusable_group"): if entry.get("confusable_group"):
return f"{slug}.mp3" return f"{slug}.mp3"
word: str = entry.get("word", "") ktiv_male = entry.get("word", {}).get("ktiv_male", "")
safe_name = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word)) safe_name = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male)
return f"{safe_name}.mp3" return f"{safe_name}.mp3"

View file

@ -25,8 +25,6 @@ from pathlib import Path
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from helpers import strip_nikkud
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -41,6 +39,7 @@ SAVE_INTERVAL = 50 # write words.json every N processed entries
WORDS_JSON = Path(__file__).parent / "data" / "words.json" WORDS_JSON = Path(__file__).parent / "data" / "words.json"
BINYAN_NAMES: tuple[str, ...] = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al") BINYAN_NAMES: tuple[str, ...] = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al")
_BINYAN_NAMES_LOWER: tuple[str, ...] = tuple(b.lower() for b in BINYAN_NAMES)
BINYAN_HEBREW: dict[str, str] = { BINYAN_HEBREW: dict[str, str] = {
"Pa'al": "פָּעַל", "Pa'al": "פָּעַל",
@ -422,8 +421,9 @@ def _scrape_noun_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
def form_or_null(nikkud: str, ktiv: str) -> dict | None: def form_or_null(nikkud: str, ktiv: str) -> dict | None:
if not nikkud: if not nikkud:
return None return None
ktiv_clean = ktiv if ktiv else strip_nikkud(nikkud) if not ktiv:
return {"nikkud": nikkud, "ktiv_male": ktiv_clean} logger.warning("No ktiv_male for noun form: %s", nikkud)
return {"nikkud": nikkud, "ktiv_male": ktiv}
singular_nikkud = str(mo_data.get("singular_nikkud", "")) singular_nikkud = str(mo_data.get("singular_nikkud", ""))
plural_nikkud = str(mo_data.get("plural_nikkud", "")) plural_nikkud = str(mo_data.get("plural_nikkud", ""))
@ -464,17 +464,15 @@ def _scrape_noun_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
def _extract_binyan_from_page(soup: BeautifulSoup) -> str: def _extract_binyan_from_page(soup: BeautifulSoup) -> str:
"""Extract binyan from page header span or og:description.""" """Extract binyan from page header span or og:description."""
for h3 in soup.find_all("h3", class_="page-header"): texts = [h3.get_text(" ", strip=True) for h3 in soup.find_all("h3", class_="page-header")]
text = h3.get_text(" ", strip=True)
for bname in BINYAN_NAMES:
if bname in text:
return bname
meta = soup.find("meta", {"property": "og:description"}) meta = soup.find("meta", {"property": "og:description"})
if meta: if meta:
desc = meta.get("content", "") texts.append(str(meta.get("content", "")))
for bname in BINYAN_NAMES: for text in texts:
if bname in desc: text_lower = text.lower()
return bname for i, bname_lower in enumerate(_BINYAN_NAMES_LOWER):
if bname_lower in text_lower:
return BINYAN_NAMES[i]
return "" return ""
@ -784,7 +782,9 @@ def _forms_to_active_list(
person = FORM_KEY_TO_PERSON.get(form_key, form_key) person = FORM_KEY_TO_PERSON.get(form_key, form_key)
tense = TENSE_DESCRIPTION.get(form_key, "") tense = TENSE_DESCRIPTION.get(form_key, "")
nikkud = form_data["form_nikkud"] nikkud = form_data["form_nikkud"]
ktiv = vl_forms.get(form_key, "") or strip_nikkud(nikkud) ktiv = vl_forms.get(form_key, "")
if not ktiv:
logger.warning("No ktiv_male for verb form %s: %s", form_key, nikkud)
audio_url = form_data.get("audio_url", "") audio_url = form_data.get("audio_url", "")
pronoun = PRONOUN_LABELS.get(form_key, "") pronoun = PRONOUN_LABELS.get(form_key, "")
@ -838,9 +838,13 @@ def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: di
# Determine infinitive and reference form # Determine infinitive and reference form
infinitive_nikkud = mo_active.get("infinitive", {}).get("form_nikkud", "") infinitive_nikkud = mo_active.get("infinitive", {}).get("form_nikkud", "")
infinitive_ktiv = vl_active.get("infinitive", "") or strip_nikkud(infinitive_nikkud) infinitive_ktiv = vl_active.get("infinitive", "")
if infinitive_nikkud and not infinitive_ktiv:
logger.warning("No ktiv_male for infinitive: %s (slug=%s)", infinitive_nikkud, slug)
past_3ms_nikkud = mo_active.get("past_3ms", {}).get("form_nikkud", "") past_3ms_nikkud = mo_active.get("past_3ms", {}).get("form_nikkud", "")
past_3ms_ktiv = vl_active.get("past_3ms", "") or strip_nikkud(past_3ms_nikkud) past_3ms_ktiv = vl_active.get("past_3ms", "")
if past_3ms_nikkud and not past_3ms_ktiv:
logger.warning("No ktiv_male for past_3ms: %s (slug=%s)", past_3ms_nikkud, slug)
# Build active forms list, preserving GUIDs # Build active forms list, preserving GUIDs
existing_active_forms = existing.get("active_forms") existing_active_forms = existing.get("active_forms")
@ -861,7 +865,9 @@ def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: di
existing_passive_forms = existing.get("hufal_pual_forms") existing_passive_forms = existing.get("hufal_pual_forms")
hufal_pual_forms = _forms_to_active_list(mo_passive, vl_passive, existing_passive_forms) hufal_pual_forms = _forms_to_active_list(mo_passive, vl_passive, existing_passive_forms)
passive_3ms_nikkud = mo_passive.get("past_3ms", {}).get("form_nikkud", "") passive_3ms_nikkud = mo_passive.get("past_3ms", {}).get("form_nikkud", "")
passive_3ms_ktiv = vl_passive.get("past_3ms", "") or strip_nikkud(passive_3ms_nikkud) passive_3ms_ktiv = vl_passive.get("past_3ms", "")
if passive_3ms_nikkud and not passive_3ms_ktiv:
logger.warning("No ktiv_male for passive past_3ms: %s (slug=%s)", passive_3ms_nikkud, slug)
if passive_3ms_nikkud: if passive_3ms_nikkud:
reference_form_passive = {"nikkud": passive_3ms_nikkud, "ktiv_male": passive_3ms_ktiv} reference_form_passive = {"nikkud": passive_3ms_nikkud, "ktiv_male": passive_3ms_ktiv}

File diff suppressed because it is too large Load diff

File diff suppressed because it is too large Load diff

View file

@ -21,8 +21,6 @@ from pathlib import Path
import requests import requests
from bs4 import BeautifulSoup from bs4 import BeautifulSoup
from helpers import strip_nikkud
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
# Paths # Paths
# --------------------------------------------------------------------------- # ---------------------------------------------------------------------------
@ -262,7 +260,7 @@ def _build_tags(pos_en: str, root: list[str]) -> str:
parts: list[str] = [] parts: list[str] = []
if root: if root:
root_str = "".join(strip_nikkud(c) for c in root) root_str = "".join(root)
parts.append(f"שורש::{root_str}") parts.append(f"שורש::{root_str}")
pos_heb_tag = pos_tag_map.get(pos_en, "") pos_heb_tag = pos_tag_map.get(pos_en, "")
@ -280,7 +278,7 @@ def _compute_audio_file(slug: str, ktiv_male: str) -> str:
here we store a placeholder that post_process() will correct. here we store a placeholder that post_process() will correct.
We default to the consonant-based name; confusables get slug-based names. We default to the consonant-based name; confusables get slug-based names.
""" """
consonants = strip_nikkud(ktiv_male) if ktiv_male else "" consonants = ktiv_male or ""
return f"{consonants}.mp3" if consonants else f"{slug}.mp3" return f"{consonants}.mp3" if consonants else f"{slug}.mp3"
@ -532,7 +530,7 @@ def _post_process(words: dict) -> None:
entry["confusable_group"] = None entry["confusable_group"] = None
# Non-confusable → consonant-based audio filename # Non-confusable → consonant-based audio filename
ktiv_male = entry.get("word", {}).get("ktiv_male", "") ktiv_male = entry.get("word", {}).get("ktiv_male", "")
consonants = strip_nikkud(ktiv_male) if ktiv_male else "" consonants = ktiv_male or ""
slug = entry.get("slug", "") slug = entry.get("slug", "")
entry["audio_file"] = f"{consonants}.mp3" if consonants else f"{slug}.mp3" entry["audio_file"] = f"{consonants}.mp3" if consonants else f"{slug}.mp3"

6
run.py
View file

@ -34,8 +34,6 @@ import re
import sys import sys
from pathlib import Path from pathlib import Path
from helpers import strip_nikkud
sys.path.insert(0, str(Path(__file__).parent)) sys.path.insert(0, str(Path(__file__).parent))
logging.basicConfig( logging.basicConfig(
@ -127,7 +125,7 @@ def step_examples(args, _freq_cache: dict):
for entry in entries: for entry in entries:
ktiv_male = entry.get("word", {}).get("ktiv_male", "") ktiv_male = entry.get("word", {}).get("ktiv_male", "")
if ktiv_male: if ktiv_male:
safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(ktiv_male)) safe = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male)
if safe: if safe:
consonant_counts[safe] = consonant_counts.get(safe, 0) + 1 consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
confusable_consonants = {k for k, v in consonant_counts.items() if v > 1} confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
@ -138,7 +136,7 @@ def step_examples(args, _freq_cache: dict):
word_nikkud = entry.get("word", {}).get("nikkud", "") word_nikkud = entry.get("word", {}).get("nikkud", "")
ktiv_male = entry.get("word", {}).get("ktiv_male", "") ktiv_male = entry.get("word", {}).get("ktiv_male", "")
if word_nikkud and ktiv_male: if word_nikkud and ktiv_male:
safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(ktiv_male)) safe = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male)
if safe in confusable_consonants and word_nikkud in benyehuda._examples_cache: if safe in confusable_consonants and word_nikkud in benyehuda._examples_cache:
del benyehuda._examples_cache[word_nikkud] del benyehuda._examples_cache[word_nikkud]
stale_deleted += 1 stale_deleted += 1

View file

@ -33,8 +33,8 @@ skip_integration = pytest.mark.skipif(
) )
# A known Hif'il verb slug that is not page-1 dependent. # A known Hif'il verb slug that is not page-1 dependent.
# לְהַגִּיד (to tell/say) — Hif'il, slug 4183-lehagid # לְהַגִּיד (to tell/say) — Hif'il, slug 1135-lehagid
HIFIL_VERB_SLUG = "4183-lehagid" HIFIL_VERB_SLUG = "1135-lehagid"
HIFIL_VERB_NIKKUD = "לְהַגִּיד" HIFIL_VERB_NIKKUD = "לְהַגִּיד"
HIFIL_VERB_MEANING = "to say, to tell" HIFIL_VERB_MEANING = "to say, to tell"
@ -208,7 +208,7 @@ class TestDetailScrapeNoun:
# Small rate-limit delay between list scrape and detail scrape # Small rate-limit delay between list scrape and detail scrape
time.sleep(1.0) time.sleep(1.0)
pealim_detail_scrape.run(test=1, force_refresh=True, nouns_only=True) pealim_detail_scrape.run(force_refresh=True, nouns_only=True)
updated_words = _words_from_file(words_path) updated_words = _words_from_file(words_path)
entry = updated_words.get(noun_key, {}) entry = updated_words.get(noun_key, {})
@ -227,7 +227,7 @@ class TestDetailScrapeNoun:
monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path) monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
time.sleep(1.0) time.sleep(1.0)
pealim_detail_scrape.run(test=1, force_refresh=True, nouns_only=True) pealim_detail_scrape.run(force_refresh=True, nouns_only=True)
updated_words = _words_from_file(words_path) updated_words = _words_from_file(words_path)
ni = updated_words[noun_key].get("noun_inflection", {}) or {} ni = updated_words[noun_key].get("noun_inflection", {}) or {}
@ -250,7 +250,7 @@ class TestDetailScrapeNoun:
monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path) monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
time.sleep(1.0) time.sleep(1.0)
pealim_detail_scrape.run(test=1, force_refresh=True, nouns_only=True) pealim_detail_scrape.run(force_refresh=True, nouns_only=True)
updated_words = _words_from_file(words_path) updated_words = _words_from_file(words_path)
ni = updated_words[noun_key].get("noun_inflection", {}) or {} ni = updated_words[noun_key].get("noun_inflection", {}) or {}
@ -270,7 +270,7 @@ class TestDetailScrapeNoun:
monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path) monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
time.sleep(1.0) time.sleep(1.0)
pealim_detail_scrape.run(test=1, force_refresh=True, nouns_only=True) pealim_detail_scrape.run(force_refresh=True, nouns_only=True)
updated_words = _words_from_file(words_path) updated_words = _words_from_file(words_path)
assert updated_words[noun_key].get("detail_scraped") is True, ( assert updated_words[noun_key].get("detail_scraped") is True, (

View file

@ -1,256 +0,0 @@
#!/usr/bin/env python3
"""
Validate nevo_typed_verbs_from_modern_hebrew against pealim.com.
For each verb:
1. Classifies it by position in the file (Pa'al/Nif'al/Pi'el/Pu'al/Hitpa'el/Hif'il/Huf'al)
2. Searches pealim.com to find URL slug
3. Fetches the page to confirm the binyan
4. Flags known-problem entries and detects: not-found, binyan mismatch, suspected typos
Output:
verbs_input.txt cleaned verb list for conjugation_extract.py
Printed validation report table
Usage:
python3 validate_verb_list.py
After running, review verbs_input.txt (especially REVIEW-flagged entries) before
running conjugation extraction.
"""
import re
import sys
import time
import urllib.parse
from pathlib import Path
import requests
from bs4 import BeautifulSoup
# Base URL that the search and dictionary page URLs are built from.
PEALIM_BASE = "https://www.pealim.com"
# Pause passed to time.sleep() before each pealim.com request.
REQUEST_DELAY = 1.5
# Passed as ``timeout=`` to every session.get() call.
REQUEST_TIMEOUT = 15
# Input verb list (one entry per non-blank line) and the cleaned output file,
# both located next to this script.
SOURCE_FILE = Path(__file__).parent / "nevo_typed_verbs_from_modern_hebrew"
OUTPUT_FILE = Path(__file__).parent / "verbs_input.txt"
# Known problem entries: word → (action, note)
# action: "REVIEW" = comment out and flag, "3ms" = treat as 3ms past form
KNOWN_ISSUES: dict[str, tuple[str, str]] = {
    "לגבוה": ("REVIEW", "not a standard infinitive form; likely defective spelling or wrong word"),
    "לההרג": ("REVIEW", "extra ה; should probably be להיהרג (Nif'al of הרג)"),
    "להתלקלח": ("REVIEW", "not a real word; likely typo for להתקלקל"),
    "להקלל": ("REVIEW", "ambiguous: could be Hif'il לְהָקֵל (to ease) or Nif'al of קלל"),
    "המציא": ("3ms", "Hif'il 3ms past form, not an infinitive"),
    "קומם": ("3ms", "ambiguous: Pu'al 3ms past; Pi'el infinitive is לְקוֹמֵם"),
}
# Expected binyan by line range (1-indexed) per plan analysis
# Each range is half-open, e.g. range(1, 18) covers entries 1 through 17.
LINE_RANGES: list[tuple[range, str]] = [
    (range(1, 18), "Pa'al"),
    (range(18, 29), "Nif'al"),
    (range(29, 37), "Pi'el"),
    (range(37, 43), "Pu'al"),
    (range(43, 53), "Hitpa'el"),
    (range(53, 63), "Hif'il"),
    (range(63, 71), "Huf'al"),
]
# Comment header written above each binyan's group of verbs in OUTPUT_FILE;
# the dict order is the order in which the sections are emitted.
SECTION_HEADERS: dict[str, str] = {
    "Pa'al": "# Pa'al (פָּעַל)",
    "Nif'al": "# Nif'al (נִפְעַל)",
    "Pi'el": "# Pi'el (פִּעֵל)",
    "Pu'al": "# Pu'al (פֻּעַל) — 3ms past, no infinitive",
    "Hitpa'el": "# Hitpa'el (הִתְפַּעֵל)",
    "Hif'il": "# Hif'il (הִפְעִיל)",
    "Huf'al": "# Huf'al (הֻפְעַל) — 3ms past, no infinitive",
}
# One shared HTTP session so every request carries the same User-Agent header.
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki/3.0)"})
def classify_by_line(line_num: int) -> str:
    """Map a 1-indexed entry number to the binyan expected at that position.

    Args:
        line_num: Position of the verb in the source list, starting at 1.

    Returns:
        The binyan paired with the first range in ``LINE_RANGES`` that
        contains ``line_num``, or ``"Unknown"`` when no range contains it.
    """
    return next(
        (label for span, label in LINE_RANGES if line_num in span),
        "Unknown",
    )
def find_slug(query: str) -> str | None:
    """Look a word up in the pealim.com search and return the first slug found.

    Args:
        query: Text to search for; it is URL-quoted before being sent.

    Returns:
        The first ``<digits>-<name>`` slug appearing in a ``/dict/<slug>/``
        link of the search response, or ``None`` when there is no such link
        or the request fails (the error is reported on stderr).
    """
    search_url = f"{PEALIM_BASE}/search/?q={urllib.parse.quote(query)}"
    try:
        response = session.get(search_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        # The earliest match in the page is the "first" slug.
        first_hit = re.search(r"/dict/(\d+[-][^/?\"<>\s]+)/", response.text)
    except Exception as exc:
        print(f" ERROR searching {query!r}: {exc}", file=sys.stderr)
        return None
    return first_hit.group(1) if first_hit else None
def get_page_binyan(slug: str) -> str:
    """Fetch ``/dict/<slug>/`` and return the binyan named on the page.

    The text of every ``h3.page-header`` element is examined first, followed
    by the ``og:description`` meta tag. Matching is case-insensitive, so a
    binyan spelled in a different case (for example all upper-case) is still
    recognised; the returned value always uses the canonical spelling.

    Args:
        slug: pealim.com dictionary slug, e.g. as returned by ``find_slug``.

    Returns:
        The canonical binyan name, or ``""`` when no binyan is mentioned or
        the request/parsing fails (the error is reported on stderr).
    """
    binyan_names = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al")
    url = f"{PEALIM_BASE}/dict/{slug}/"
    try:
        resp = session.get(url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
        soup = BeautifulSoup(resp.text, "lxml")

        # Page headers take priority; og:description is only a fallback,
        # so it is appended last.
        candidates = [h3.get_text(" ", strip=True) for h3 in soup.find_all("h3", class_="page-header")]
        meta = soup.find("meta", {"property": "og:description"})
        if meta:
            # str() guards against a non-string attribute value.
            candidates.append(str(meta.get("content", "")))

        for text in candidates:
            folded = text.lower()
            for bname in binyan_names:
                if bname.lower() in folded:
                    return bname
    except Exception as e:
        # Best effort: a failed lookup is reported and treated as "unknown".
        print(f" ERROR fetching {slug}: {e}", file=sys.stderr)
    return ""
def _validate_entry(line_num: int, word: str, total: int) -> dict:
    """Validate one verb against pealim.com and return its result record.

    Prints a progress line for the entry. Entries flagged ``"REVIEW"`` in
    ``KNOWN_ISSUES`` are recorded without querying the site.
    """
    expected_binyan = classify_by_line(line_num)
    issue_type, issue_note = KNOWN_ISSUES.get(word, (None, ""))
    # Positions 37-42 (Pu'al) and 63-70 (Huf'al) are 3ms past forms
    is_3ms_by_position = expected_binyan in ("Pu'al", "Huf'al")

    print(f"[{line_num:2d}/{total}] {word:<20}", end=" ", flush=True)

    if issue_type == "REVIEW":
        # Don't query pealim for known-bad entries
        print("REVIEW (skipping query)")
        return {
            "line": line_num,
            "word": word,
            "expected_binyan": expected_binyan,
            "slug": "",
            "page_binyan": "",
            "status": "REVIEW",
            "notes": issue_note,
            "is_3ms": is_3ms_by_position,
        }

    # Sleep before every request so the site is not hammered.
    time.sleep(REQUEST_DELAY)
    slug = find_slug(word)
    if slug:
        time.sleep(REQUEST_DELAY)
        page_binyan = get_page_binyan(slug)
    else:
        page_binyan = ""

    # Determine status
    if issue_type == "3ms" or is_3ms_by_position:
        status = "3ms"
        notes = issue_note or "Pu'al/Huf'al 3ms past form"
    elif not slug:
        status = "NOT_FOUND"
        notes = "no search result on pealim.com"
    elif page_binyan and expected_binyan and page_binyan != expected_binyan:
        status = "MISMATCH"
        notes = f"expected {expected_binyan}, page says {page_binyan}"
    else:
        status = "OK"
        notes = ""

    print(f"{status:<12} slug={slug or '-':<35} binyan={page_binyan or '-'}")
    return {
        "line": line_num,
        "word": word,
        "expected_binyan": expected_binyan,
        "slug": slug or "",
        "page_binyan": page_binyan,
        "status": status,
        "notes": notes,
        "is_3ms": is_3ms_by_position or issue_type == "3ms",
    }


def _write_verbs_input(results: list[dict]) -> None:
    """Write the cleaned, section-grouped verb list to ``OUTPUT_FILE``."""
    sections: dict[str, list[str]] = {b: [] for b in SECTION_HEADERS}
    review_lines: list[str] = []
    for rec in results:
        binyan = rec["expected_binyan"]
        if binyan not in sections:
            # Entries outside every known line range fall into the first section.
            binyan = next(iter(sections))
        if rec["status"] == "REVIEW":
            # " — " keeps the word and its note apart in the written comment.
            review_lines.append(f"# REVIEW: {rec['word']} — {rec['notes']}")
        elif rec["status"] == "3ms":
            sections[binyan].append(f"# 3ms: {rec['word']}")
        elif rec["status"] in ("OK", "MISMATCH"):
            sections[binyan].append(rec["word"])
        else:  # NOT_FOUND
            sections[binyan].append(f"# NOT_FOUND: {rec['word']} — {rec['notes']}")

    output_lines = [
        "# Verb list — validated against pealim.com from nevo_typed_verbs_from_modern_hebrew",
        "# Lines prefixed '# 3ms:' are searched by 3ms past form (Pu'al/Huf'al).",
        "# Lines prefixed '# REVIEW:' need manual correction before conjugation extraction.",
        "# Lines prefixed '# NOT_FOUND:' had no pealim.com result — check spelling.",
        "",
    ]
    for binyan, header in SECTION_HEADERS.items():
        if sections.get(binyan):
            output_lines.append(header)
            output_lines.extend(sections[binyan])
            output_lines.append("")
    if review_lines:
        output_lines.append("# ── Entries flagged for manual review ──────────────────────────────────────────")
        output_lines.extend(review_lines)
        output_lines.append("")

    OUTPUT_FILE.write_text("\n".join(output_lines), encoding="utf-8")


def _print_report(results: list[dict]) -> None:
    """Print the per-verb validation table followed by status totals."""
    print("\n" + "=" * 95)
    print("VALIDATION REPORT")
    print("=" * 95)
    print(f"{'#':>4} {'Verb':<22} {'Status':<14} {'Slug':<38} {'Binyan':<12} Notes")
    print("-" * 95)
    for rec in results:
        print(
            f"{rec['line']:>4} {rec['word']:<22} {rec['status']:<14} "
            f"{rec['slug'][:36]:<38} {rec['page_binyan'] or '-':<12} {rec['notes']}"
        )
    print("=" * 95)

    counts = {s: sum(1 for rec in results if rec["status"] == s) for s in ("OK", "3ms", "MISMATCH", "REVIEW", "NOT_FOUND")}
    print(
        f"\nSummary: {counts['OK']} OK | {counts['3ms']} 3ms-past | "
        f"{counts['MISMATCH']} MISMATCH | {counts['REVIEW']} REVIEW | {counts['NOT_FOUND']} NOT_FOUND"
    )
    print(f"Total entries: {len(results)}")
    if counts["REVIEW"] > 0 or counts["NOT_FOUND"] > 0 or counts["MISMATCH"] > 0:
        print("\n⚠ Review flagged entries in verbs_input.txt before running:\n python3 conjugation_extract.py")


def main() -> None:
    """Validate every verb in ``SOURCE_FILE`` and write ``OUTPUT_FILE``.

    Reads the non-blank lines of the source list, validates each one against
    pealim.com, writes the cleaned verb list and prints a validation report.
    Exits with status 1 when the source file does not exist.
    """
    if not SOURCE_FILE.exists():
        print(f"ERROR: {SOURCE_FILE} not found", file=sys.stderr)
        sys.exit(1)

    lines = [line.strip() for line in SOURCE_FILE.read_text(encoding="utf-8").splitlines() if line.strip()]
    print(f"Loaded {len(lines)} entries from {SOURCE_FILE.name}")
    print(f"Querying pealim.com (delay {REQUEST_DELAY}s per request)…\n")

    results = [_validate_entry(line_num, word, len(lines)) for line_num, word in enumerate(lines, start=1)]

    _write_verbs_input(results)
    print(f"\nWrote → {OUTPUT_FILE}")

    _print_report(results)
if __name__ == "__main__":
main()

View file

@ -1,3 +0,0 @@
# Vulture whitelist: suppress false positives for interface methods
# HTMLParser.handle_starttag requires (self, tag, attrs) signature
attrs # noqa