Sprint 11.1: strip_nikkud cleanup, dead code removal, test fixes
Remove strip_nikkud from all pipeline files — use ktiv_male directly. Make binyan matching in the detail scraper case-insensitive (og:description uses UPPERCASE). Fix integration test slugs and test limits. Delete legacy CSVs, the stale .apkg, and dead scripts from git. Add vulture to the pre-commit hook. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
a1d970a782
commit
b2fef5aa8a
18 changed files with 71 additions and 60993 deletions
8
.gitignore
vendored
8
.gitignore
vendored
|
|
@ -47,6 +47,14 @@ data/epubs/
|
||||||
|
|
||||||
# Stray deck files
|
# Stray deck files
|
||||||
Everything__*.apkg
|
Everything__*.apkg
|
||||||
|
*.apkg
|
||||||
|
|
||||||
|
# Legacy CSV files (replaced by data/words.json)
|
||||||
|
*.csv
|
||||||
|
data/*.csv
|
||||||
|
|
||||||
|
# Dead whitelist files
|
||||||
|
vulture_whitelist.py
|
||||||
|
|
||||||
# Release artifacts — distributed via Forgejo releases, not committed to tree
|
# Release artifacts — distributed via Forgejo releases, not committed to tree
|
||||||
releases/
|
releases/
|
||||||
|
|
|
||||||
|
|
@ -15,8 +15,6 @@ from pathlib import Path
|
||||||
|
|
||||||
import genanki
|
import genanki
|
||||||
|
|
||||||
from helpers import strip_nikkud as _strip_nikkud
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# Stable deck/model IDs — do not change these
|
# Stable deck/model IDs — do not change these
|
||||||
|
|
@ -1315,7 +1313,8 @@ def build_confusables_deck(
|
||||||
at = f"[sound:{af}]"
|
at = f"[sound:{af}]"
|
||||||
if not at:
|
if not at:
|
||||||
slug = e.get("slug", "") or ""
|
slug = e.get("slug", "") or ""
|
||||||
at = _audio_tag(_strip_nikkud(w), slug=slug)
|
ktiv_male = e.get("word", {}).get("ktiv_male", "") or ""
|
||||||
|
at = _audio_tag(ktiv_male, slug=slug)
|
||||||
if at and at not in audio_parts:
|
if at and at not in audio_parts:
|
||||||
audio_parts.append(at)
|
audio_parts.append(at)
|
||||||
mp3_name = at.removeprefix("[sound:").removesuffix("]")
|
mp3_name = at.removeprefix("[sound:").removesuffix("]")
|
||||||
|
|
@ -1415,11 +1414,15 @@ PLURAL_MODEL = genanki.Model(
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def _is_irregular_plural(gender: str, plural: str) -> bool:
|
def _is_irregular_plural(gender: str, plural_ktiv: str) -> bool:
|
||||||
"""Detect irregular plurals: masc nouns with ות- suffix, fem with ים- suffix."""
|
"""Detect irregular plurals: masc nouns with ות- suffix, fem with ים- suffix.
|
||||||
plural_stripped = _strip_nikkud(plural)
|
|
||||||
return (gender == "masculine" and plural_stripped.endswith("ות")) or (
|
Args:
|
||||||
gender == "feminine" and plural_stripped.endswith("ים")
|
gender: ``"masculine"`` or ``"feminine"``.
|
||||||
|
plural_ktiv: ktiv male (no nikkud) form of the plural.
|
||||||
|
"""
|
||||||
|
return (gender == "masculine" and plural_ktiv.endswith("ות")) or (
|
||||||
|
gender == "feminine" and plural_ktiv.endswith("ים")
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -1453,13 +1456,14 @@ def build_plural_deck(
|
||||||
continue
|
continue
|
||||||
singular = singular_data.get("nikkud", "")
|
singular = singular_data.get("nikkud", "")
|
||||||
plural = plural_data.get("nikkud", "")
|
plural = plural_data.get("nikkud", "")
|
||||||
|
plural_ktiv = plural_data.get("ktiv_male", "")
|
||||||
if not singular or not plural:
|
if not singular or not plural:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
gender = noun_inflection.get("gender", "")
|
gender = noun_inflection.get("gender", "")
|
||||||
mishkal = noun_inflection.get("mishkal") or ""
|
mishkal = noun_inflection.get("mishkal") or ""
|
||||||
|
|
||||||
if _is_irregular_plural(gender, plural):
|
if _is_irregular_plural(gender, plural_ktiv):
|
||||||
irregulars.append((unique_key, entry, noun_inflection))
|
irregulars.append((unique_key, entry, noun_inflection))
|
||||||
elif mishkal:
|
elif mishkal:
|
||||||
by_mishkal.setdefault(mishkal, []).append((unique_key, entry, noun_inflection))
|
by_mishkal.setdefault(mishkal, []).append((unique_key, entry, noun_inflection))
|
||||||
|
|
@ -1475,7 +1479,9 @@ def build_plural_deck(
|
||||||
note_count = 0
|
note_count = 0
|
||||||
for _unique_key, entry, noun_inflection in selected:
|
for _unique_key, entry, noun_inflection in selected:
|
||||||
singular = noun_inflection["singular"]["nikkud"]
|
singular = noun_inflection["singular"]["nikkud"]
|
||||||
|
singular_ktiv = noun_inflection["singular"].get("ktiv_male", "")
|
||||||
plural = noun_inflection["plural"]["nikkud"]
|
plural = noun_inflection["plural"]["nikkud"]
|
||||||
|
plural_ktiv = noun_inflection["plural"].get("ktiv_male", "")
|
||||||
gender = noun_inflection.get("gender") or ""
|
gender = noun_inflection.get("gender") or ""
|
||||||
mishkal = noun_inflection.get("mishkal") or ""
|
mishkal = noun_inflection.get("mishkal") or ""
|
||||||
meaning = entry.get("meaning") or ""
|
meaning = entry.get("meaning") or ""
|
||||||
|
|
@ -1490,8 +1496,7 @@ def build_plural_deck(
|
||||||
sg_audio = ""
|
sg_audio = ""
|
||||||
pl_audio = ""
|
pl_audio = ""
|
||||||
if include_audio:
|
if include_audio:
|
||||||
sg_no_nik = _strip_nikkud(singular)
|
sg_tag = _audio_tag(singular_ktiv)
|
||||||
sg_tag = _audio_tag(sg_no_nik)
|
|
||||||
if sg_tag:
|
if sg_tag:
|
||||||
sg_audio = sg_tag
|
sg_audio = sg_tag
|
||||||
mp3_path = AUDIO_DIR / sg_tag.removeprefix("[sound:").removesuffix("]")
|
mp3_path = AUDIO_DIR / sg_tag.removeprefix("[sound:").removesuffix("]")
|
||||||
|
|
@ -1501,7 +1506,7 @@ def build_plural_deck(
|
||||||
tags = [RELEASE_TAG]
|
tags = [RELEASE_TAG]
|
||||||
if mishkal:
|
if mishkal:
|
||||||
tags.append(f"mishkal::{mishkal}")
|
tags.append(f"mishkal::{mishkal}")
|
||||||
if _is_irregular_plural(gender, plural):
|
if _is_irregular_plural(gender, plural_ktiv):
|
||||||
tags.append("irregular")
|
tags.append("irregular")
|
||||||
|
|
||||||
note = genanki.Note(
|
note = genanki.Note(
|
||||||
|
|
|
||||||
9121
data/hebrew_dict.csv
9121
data/hebrew_dict.csv
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
9106
data/pealim_dict.csv
9106
data/pealim_dict.csv
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
|
|
@ -15,8 +15,6 @@ from pathlib import Path
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from helpers import strip_nikkud as _strip_nikkud
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
FREQ_URL = "https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2016/he/he_50k.txt"
|
FREQ_URL = "https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2016/he/he_50k.txt"
|
||||||
|
|
@ -45,7 +43,7 @@ def load(cache_path: Path = CACHE_PATH) -> None:
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
if not line:
|
if not line:
|
||||||
continue
|
continue
|
||||||
word = _strip_nikkud(line.split()[0])
|
word = line.split()[0]
|
||||||
if word and word not in _freq:
|
if word and word not in _freq:
|
||||||
_freq[word] = rank
|
_freq[word] = rank
|
||||||
rank += 1
|
rank += 1
|
||||||
|
|
@ -60,11 +58,11 @@ def get_frequency_rank(word_no_nikkud: str) -> int | None:
|
||||||
"""
|
"""
|
||||||
Return the frequency rank of a word (1 = most common).
|
Return the frequency rank of a word (1 = most common).
|
||||||
Returns None if not found in the corpus.
|
Returns None if not found in the corpus.
|
||||||
Strips nikkud from the input before lookup.
|
Expects ktiv male (no nikkud) input.
|
||||||
"""
|
"""
|
||||||
if not _freq:
|
if not _freq:
|
||||||
load()
|
load()
|
||||||
clean = _strip_nikkud(word_no_nikkud.strip())
|
clean = word_no_nikkud.strip()
|
||||||
return _freq.get(clean)
|
return _freq.get(clean)
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -31,8 +31,6 @@ from pathlib import Path
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from helpers import strip_nikkud as _strip_nikkud
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
DATA_DIR = Path(__file__).parent / "data"
|
DATA_DIR = Path(__file__).parent / "data"
|
||||||
|
|
@ -78,7 +76,7 @@ def is_concrete(english_meaning: str) -> bool:
|
||||||
|
|
||||||
def _safe_name(word_no_nikkud: str) -> str:
|
def _safe_name(word_no_nikkud: str) -> str:
|
||||||
"""Create a safe ASCII-ish filename from a Hebrew word (strip to Hebrew letters only)."""
|
"""Create a safe ASCII-ish filename from a Hebrew word (strip to Hebrew letters only)."""
|
||||||
hebrew_only = re.sub(r"[^\u05d0-\u05ea]", "", _strip_nikkud(word_no_nikkud))
|
hebrew_only = re.sub(r"[^\u05d0-\u05ea]", "", word_no_nikkud)
|
||||||
return hebrew_only if hebrew_only else "unknown"
|
return hebrew_only if hebrew_only else "unknown"
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -261,7 +259,7 @@ def run(limit: int | None = None, dry_run: bool = False, single_word: str | None
|
||||||
if single_word and word_plain != single_word:
|
if single_word and word_plain != single_word:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
cache_key = word_plain or _strip_nikkud(word)
|
cache_key = word_plain
|
||||||
|
|
||||||
if cache_key in cache:
|
if cache_key in cache:
|
||||||
skipped_cached += 1
|
skipped_cached += 1
|
||||||
|
|
|
||||||
BIN
pealim.apkg
BIN
pealim.apkg
Binary file not shown.
|
|
@ -17,8 +17,6 @@ from pathlib import Path
|
||||||
|
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
from helpers import strip_nikkud
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
DATA_DIR = Path(__file__).parent / "data"
|
DATA_DIR = Path(__file__).parent / "data"
|
||||||
|
|
@ -58,11 +56,15 @@ def _make_audio_file(entry: dict) -> str:
|
||||||
Returns:
|
Returns:
|
||||||
Filename string, e.g. ``"1234-shalom.mp3"`` or ``"שלום.mp3"``.
|
Filename string, e.g. ``"1234-shalom.mp3"`` or ``"שלום.mp3"``.
|
||||||
"""
|
"""
|
||||||
slug: str = entry["slug"]
|
audio_file = entry.get("audio_file", "")
|
||||||
|
if audio_file:
|
||||||
|
return audio_file
|
||||||
|
# Fallback: use slug for confusables, ktiv_male for others
|
||||||
|
slug = entry.get("slug", "")
|
||||||
if entry.get("confusable_group"):
|
if entry.get("confusable_group"):
|
||||||
return f"{slug}.mp3"
|
return f"{slug}.mp3"
|
||||||
word: str = entry.get("word", "")
|
ktiv_male = entry.get("word", {}).get("ktiv_male", "")
|
||||||
safe_name = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word))
|
safe_name = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male)
|
||||||
return f"{safe_name}.mp3"
|
return f"{safe_name}.mp3"
|
||||||
|
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -25,8 +25,6 @@ from pathlib import Path
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from helpers import strip_nikkud
|
|
||||||
|
|
||||||
logger = logging.getLogger(__name__)
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -41,6 +39,7 @@ SAVE_INTERVAL = 50 # write words.json every N processed entries
|
||||||
WORDS_JSON = Path(__file__).parent / "data" / "words.json"
|
WORDS_JSON = Path(__file__).parent / "data" / "words.json"
|
||||||
|
|
||||||
BINYAN_NAMES: tuple[str, ...] = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al")
|
BINYAN_NAMES: tuple[str, ...] = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al")
|
||||||
|
_BINYAN_NAMES_LOWER: tuple[str, ...] = tuple(b.lower() for b in BINYAN_NAMES)
|
||||||
|
|
||||||
BINYAN_HEBREW: dict[str, str] = {
|
BINYAN_HEBREW: dict[str, str] = {
|
||||||
"Pa'al": "פָּעַל",
|
"Pa'al": "פָּעַל",
|
||||||
|
|
@ -422,8 +421,9 @@ def _scrape_noun_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
|
||||||
def form_or_null(nikkud: str, ktiv: str) -> dict | None:
|
def form_or_null(nikkud: str, ktiv: str) -> dict | None:
|
||||||
if not nikkud:
|
if not nikkud:
|
||||||
return None
|
return None
|
||||||
ktiv_clean = ktiv if ktiv else strip_nikkud(nikkud)
|
if not ktiv:
|
||||||
return {"nikkud": nikkud, "ktiv_male": ktiv_clean}
|
logger.warning("No ktiv_male for noun form: %s", nikkud)
|
||||||
|
return {"nikkud": nikkud, "ktiv_male": ktiv}
|
||||||
|
|
||||||
singular_nikkud = str(mo_data.get("singular_nikkud", ""))
|
singular_nikkud = str(mo_data.get("singular_nikkud", ""))
|
||||||
plural_nikkud = str(mo_data.get("plural_nikkud", ""))
|
plural_nikkud = str(mo_data.get("plural_nikkud", ""))
|
||||||
|
|
@ -464,17 +464,15 @@ def _scrape_noun_detail(_slug: str, mo_html: str, vl_html: str) -> dict:
|
||||||
|
|
||||||
def _extract_binyan_from_page(soup: BeautifulSoup) -> str:
|
def _extract_binyan_from_page(soup: BeautifulSoup) -> str:
|
||||||
"""Extract binyan from page header span or og:description."""
|
"""Extract binyan from page header span or og:description."""
|
||||||
for h3 in soup.find_all("h3", class_="page-header"):
|
texts = [h3.get_text(" ", strip=True) for h3 in soup.find_all("h3", class_="page-header")]
|
||||||
text = h3.get_text(" ", strip=True)
|
|
||||||
for bname in BINYAN_NAMES:
|
|
||||||
if bname in text:
|
|
||||||
return bname
|
|
||||||
meta = soup.find("meta", {"property": "og:description"})
|
meta = soup.find("meta", {"property": "og:description"})
|
||||||
if meta:
|
if meta:
|
||||||
desc = meta.get("content", "")
|
texts.append(str(meta.get("content", "")))
|
||||||
for bname in BINYAN_NAMES:
|
for text in texts:
|
||||||
if bname in desc:
|
text_lower = text.lower()
|
||||||
return bname
|
for i, bname_lower in enumerate(_BINYAN_NAMES_LOWER):
|
||||||
|
if bname_lower in text_lower:
|
||||||
|
return BINYAN_NAMES[i]
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -784,7 +782,9 @@ def _forms_to_active_list(
|
||||||
person = FORM_KEY_TO_PERSON.get(form_key, form_key)
|
person = FORM_KEY_TO_PERSON.get(form_key, form_key)
|
||||||
tense = TENSE_DESCRIPTION.get(form_key, "")
|
tense = TENSE_DESCRIPTION.get(form_key, "")
|
||||||
nikkud = form_data["form_nikkud"]
|
nikkud = form_data["form_nikkud"]
|
||||||
ktiv = vl_forms.get(form_key, "") or strip_nikkud(nikkud)
|
ktiv = vl_forms.get(form_key, "")
|
||||||
|
if not ktiv:
|
||||||
|
logger.warning("No ktiv_male for verb form %s: %s", form_key, nikkud)
|
||||||
audio_url = form_data.get("audio_url", "")
|
audio_url = form_data.get("audio_url", "")
|
||||||
pronoun = PRONOUN_LABELS.get(form_key, "")
|
pronoun = PRONOUN_LABELS.get(form_key, "")
|
||||||
|
|
||||||
|
|
@ -838,9 +838,13 @@ def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: di
|
||||||
|
|
||||||
# Determine infinitive and reference form
|
# Determine infinitive and reference form
|
||||||
infinitive_nikkud = mo_active.get("infinitive", {}).get("form_nikkud", "")
|
infinitive_nikkud = mo_active.get("infinitive", {}).get("form_nikkud", "")
|
||||||
infinitive_ktiv = vl_active.get("infinitive", "") or strip_nikkud(infinitive_nikkud)
|
infinitive_ktiv = vl_active.get("infinitive", "")
|
||||||
|
if infinitive_nikkud and not infinitive_ktiv:
|
||||||
|
logger.warning("No ktiv_male for infinitive: %s (slug=%s)", infinitive_nikkud, slug)
|
||||||
past_3ms_nikkud = mo_active.get("past_3ms", {}).get("form_nikkud", "")
|
past_3ms_nikkud = mo_active.get("past_3ms", {}).get("form_nikkud", "")
|
||||||
past_3ms_ktiv = vl_active.get("past_3ms", "") or strip_nikkud(past_3ms_nikkud)
|
past_3ms_ktiv = vl_active.get("past_3ms", "")
|
||||||
|
if past_3ms_nikkud and not past_3ms_ktiv:
|
||||||
|
logger.warning("No ktiv_male for past_3ms: %s (slug=%s)", past_3ms_nikkud, slug)
|
||||||
|
|
||||||
# Build active forms list, preserving GUIDs
|
# Build active forms list, preserving GUIDs
|
||||||
existing_active_forms = existing.get("active_forms")
|
existing_active_forms = existing.get("active_forms")
|
||||||
|
|
@ -861,7 +865,9 @@ def _scrape_verb_detail(slug: str, mo_html: str, vl_html: str, existing_conj: di
|
||||||
existing_passive_forms = existing.get("hufal_pual_forms")
|
existing_passive_forms = existing.get("hufal_pual_forms")
|
||||||
hufal_pual_forms = _forms_to_active_list(mo_passive, vl_passive, existing_passive_forms)
|
hufal_pual_forms = _forms_to_active_list(mo_passive, vl_passive, existing_passive_forms)
|
||||||
passive_3ms_nikkud = mo_passive.get("past_3ms", {}).get("form_nikkud", "")
|
passive_3ms_nikkud = mo_passive.get("past_3ms", {}).get("form_nikkud", "")
|
||||||
passive_3ms_ktiv = vl_passive.get("past_3ms", "") or strip_nikkud(passive_3ms_nikkud)
|
passive_3ms_ktiv = vl_passive.get("past_3ms", "")
|
||||||
|
if passive_3ms_nikkud and not passive_3ms_ktiv:
|
||||||
|
logger.warning("No ktiv_male for passive past_3ms: %s (slug=%s)", passive_3ms_nikkud, slug)
|
||||||
if passive_3ms_nikkud:
|
if passive_3ms_nikkud:
|
||||||
reference_form_passive = {"nikkud": passive_3ms_nikkud, "ktiv_male": passive_3ms_ktiv}
|
reference_form_passive = {"nikkud": passive_3ms_nikkud, "ktiv_male": passive_3ms_ktiv}
|
||||||
|
|
||||||
|
|
|
||||||
9106
pealim_dict.csv
9106
pealim_dict.csv
File diff suppressed because it is too large
Load diff
12111
pealim_dict_for_anki.csv
12111
pealim_dict_for_anki.csv
File diff suppressed because it is too large
Load diff
|
|
@ -21,8 +21,6 @@ from pathlib import Path
|
||||||
import requests
|
import requests
|
||||||
from bs4 import BeautifulSoup
|
from bs4 import BeautifulSoup
|
||||||
|
|
||||||
from helpers import strip_nikkud
|
|
||||||
|
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
# Paths
|
# Paths
|
||||||
# ---------------------------------------------------------------------------
|
# ---------------------------------------------------------------------------
|
||||||
|
|
@ -262,7 +260,7 @@ def _build_tags(pos_en: str, root: list[str]) -> str:
|
||||||
|
|
||||||
parts: list[str] = []
|
parts: list[str] = []
|
||||||
if root:
|
if root:
|
||||||
root_str = "".join(strip_nikkud(c) for c in root)
|
root_str = "".join(root)
|
||||||
parts.append(f"שורש::{root_str}")
|
parts.append(f"שורש::{root_str}")
|
||||||
|
|
||||||
pos_heb_tag = pos_tag_map.get(pos_en, "")
|
pos_heb_tag = pos_tag_map.get(pos_en, "")
|
||||||
|
|
@ -280,7 +278,7 @@ def _compute_audio_file(slug: str, ktiv_male: str) -> str:
|
||||||
here we store a placeholder that post_process() will correct.
|
here we store a placeholder that post_process() will correct.
|
||||||
We default to the consonant-based name; confusables get slug-based names.
|
We default to the consonant-based name; confusables get slug-based names.
|
||||||
"""
|
"""
|
||||||
consonants = strip_nikkud(ktiv_male) if ktiv_male else ""
|
consonants = ktiv_male or ""
|
||||||
return f"{consonants}.mp3" if consonants else f"{slug}.mp3"
|
return f"{consonants}.mp3" if consonants else f"{slug}.mp3"
|
||||||
|
|
||||||
|
|
||||||
|
|
@ -532,7 +530,7 @@ def _post_process(words: dict) -> None:
|
||||||
entry["confusable_group"] = None
|
entry["confusable_group"] = None
|
||||||
# Non-confusable → consonant-based audio filename
|
# Non-confusable → consonant-based audio filename
|
||||||
ktiv_male = entry.get("word", {}).get("ktiv_male", "")
|
ktiv_male = entry.get("word", {}).get("ktiv_male", "")
|
||||||
consonants = strip_nikkud(ktiv_male) if ktiv_male else ""
|
consonants = ktiv_male or ""
|
||||||
slug = entry.get("slug", "")
|
slug = entry.get("slug", "")
|
||||||
entry["audio_file"] = f"{consonants}.mp3" if consonants else f"{slug}.mp3"
|
entry["audio_file"] = f"{consonants}.mp3" if consonants else f"{slug}.mp3"
|
||||||
|
|
||||||
|
|
|
||||||
6
run.py
6
run.py
|
|
@ -34,8 +34,6 @@ import re
|
||||||
import sys
|
import sys
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
from helpers import strip_nikkud
|
|
||||||
|
|
||||||
sys.path.insert(0, str(Path(__file__).parent))
|
sys.path.insert(0, str(Path(__file__).parent))
|
||||||
|
|
||||||
logging.basicConfig(
|
logging.basicConfig(
|
||||||
|
|
@ -127,7 +125,7 @@ def step_examples(args, _freq_cache: dict):
|
||||||
for entry in entries:
|
for entry in entries:
|
||||||
ktiv_male = entry.get("word", {}).get("ktiv_male", "")
|
ktiv_male = entry.get("word", {}).get("ktiv_male", "")
|
||||||
if ktiv_male:
|
if ktiv_male:
|
||||||
safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(ktiv_male))
|
safe = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male)
|
||||||
if safe:
|
if safe:
|
||||||
consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
|
consonant_counts[safe] = consonant_counts.get(safe, 0) + 1
|
||||||
confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
|
confusable_consonants = {k for k, v in consonant_counts.items() if v > 1}
|
||||||
|
|
@ -138,7 +136,7 @@ def step_examples(args, _freq_cache: dict):
|
||||||
word_nikkud = entry.get("word", {}).get("nikkud", "")
|
word_nikkud = entry.get("word", {}).get("nikkud", "")
|
||||||
ktiv_male = entry.get("word", {}).get("ktiv_male", "")
|
ktiv_male = entry.get("word", {}).get("ktiv_male", "")
|
||||||
if word_nikkud and ktiv_male:
|
if word_nikkud and ktiv_male:
|
||||||
safe = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(ktiv_male))
|
safe = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male)
|
||||||
if safe in confusable_consonants and word_nikkud in benyehuda._examples_cache:
|
if safe in confusable_consonants and word_nikkud in benyehuda._examples_cache:
|
||||||
del benyehuda._examples_cache[word_nikkud]
|
del benyehuda._examples_cache[word_nikkud]
|
||||||
stale_deleted += 1
|
stale_deleted += 1
|
||||||
|
|
|
||||||
|
|
@ -33,8 +33,8 @@ skip_integration = pytest.mark.skipif(
|
||||||
)
|
)
|
||||||
|
|
||||||
# A known Hif'il verb slug that is not page-1 dependent.
|
# A known Hif'il verb slug that is not page-1 dependent.
|
||||||
# לְהַגִּיד (to tell/say) — Hif'il, slug 4183-lehagid
|
# לְהַגִּיד (to tell/say) — Hif'il, slug 1135-lehagid
|
||||||
HIFIL_VERB_SLUG = "4183-lehagid"
|
HIFIL_VERB_SLUG = "1135-lehagid"
|
||||||
HIFIL_VERB_NIKKUD = "לְהַגִּיד"
|
HIFIL_VERB_NIKKUD = "לְהַגִּיד"
|
||||||
HIFIL_VERB_MEANING = "to say, to tell"
|
HIFIL_VERB_MEANING = "to say, to tell"
|
||||||
|
|
||||||
|
|
@ -208,7 +208,7 @@ class TestDetailScrapeNoun:
|
||||||
# Small rate-limit delay between list scrape and detail scrape
|
# Small rate-limit delay between list scrape and detail scrape
|
||||||
time.sleep(1.0)
|
time.sleep(1.0)
|
||||||
|
|
||||||
pealim_detail_scrape.run(test=1, force_refresh=True, nouns_only=True)
|
pealim_detail_scrape.run(force_refresh=True, nouns_only=True)
|
||||||
|
|
||||||
updated_words = _words_from_file(words_path)
|
updated_words = _words_from_file(words_path)
|
||||||
entry = updated_words.get(noun_key, {})
|
entry = updated_words.get(noun_key, {})
|
||||||
|
|
@ -227,7 +227,7 @@ class TestDetailScrapeNoun:
|
||||||
|
|
||||||
monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
|
monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
|
||||||
time.sleep(1.0)
|
time.sleep(1.0)
|
||||||
pealim_detail_scrape.run(test=1, force_refresh=True, nouns_only=True)
|
pealim_detail_scrape.run(force_refresh=True, nouns_only=True)
|
||||||
|
|
||||||
updated_words = _words_from_file(words_path)
|
updated_words = _words_from_file(words_path)
|
||||||
ni = updated_words[noun_key].get("noun_inflection", {}) or {}
|
ni = updated_words[noun_key].get("noun_inflection", {}) or {}
|
||||||
|
|
@ -250,7 +250,7 @@ class TestDetailScrapeNoun:
|
||||||
|
|
||||||
monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
|
monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
|
||||||
time.sleep(1.0)
|
time.sleep(1.0)
|
||||||
pealim_detail_scrape.run(test=1, force_refresh=True, nouns_only=True)
|
pealim_detail_scrape.run(force_refresh=True, nouns_only=True)
|
||||||
|
|
||||||
updated_words = _words_from_file(words_path)
|
updated_words = _words_from_file(words_path)
|
||||||
ni = updated_words[noun_key].get("noun_inflection", {}) or {}
|
ni = updated_words[noun_key].get("noun_inflection", {}) or {}
|
||||||
|
|
@ -270,7 +270,7 @@ class TestDetailScrapeNoun:
|
||||||
|
|
||||||
monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
|
monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
|
||||||
time.sleep(1.0)
|
time.sleep(1.0)
|
||||||
pealim_detail_scrape.run(test=1, force_refresh=True, nouns_only=True)
|
pealim_detail_scrape.run(force_refresh=True, nouns_only=True)
|
||||||
|
|
||||||
updated_words = _words_from_file(words_path)
|
updated_words = _words_from_file(words_path)
|
||||||
assert updated_words[noun_key].get("detail_scraped") is True, (
|
assert updated_words[noun_key].get("detail_scraped") is True, (
|
||||||
|
|
|
||||||
|
|
@ -1,256 +0,0 @@
|
||||||
#!/usr/bin/env python3
|
|
||||||
"""
|
|
||||||
Validate nevo_typed_verbs_from_modern_hebrew against pealim.com.
|
|
||||||
|
|
||||||
For each verb:
|
|
||||||
1. Classifies it by position in the file (Pa'al/Nif'al/Pi'el/Pu'al/Hitpa'el/Hif'il/Huf'al)
|
|
||||||
2. Searches pealim.com to find URL slug
|
|
||||||
3. Fetches the page to confirm the binyan
|
|
||||||
4. Flags known-problem entries and detects: not-found, binyan mismatch, suspected typos
|
|
||||||
|
|
||||||
Output:
|
|
||||||
verbs_input.txt — cleaned verb list for conjugation_extract.py
|
|
||||||
Printed validation report table
|
|
||||||
|
|
||||||
Usage:
|
|
||||||
python3 validate_verb_list.py
|
|
||||||
|
|
||||||
After running, review verbs_input.txt (especially REVIEW-flagged entries) before
|
|
||||||
running conjugation extraction.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import re
|
|
||||||
import sys
|
|
||||||
import time
|
|
||||||
import urllib.parse
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import requests
|
|
||||||
from bs4 import BeautifulSoup
|
|
||||||
|
|
||||||
PEALIM_BASE = "https://www.pealim.com"
|
|
||||||
REQUEST_DELAY = 1.5
|
|
||||||
REQUEST_TIMEOUT = 15
|
|
||||||
SOURCE_FILE = Path(__file__).parent / "nevo_typed_verbs_from_modern_hebrew"
|
|
||||||
OUTPUT_FILE = Path(__file__).parent / "verbs_input.txt"
|
|
||||||
|
|
||||||
# Known problem entries: word → (action, note)
|
|
||||||
# action: "REVIEW" = comment out and flag, "3ms" = treat as 3ms past form
|
|
||||||
KNOWN_ISSUES: dict[str, tuple[str, str]] = {
|
|
||||||
"לגבוה": ("REVIEW", "not a standard infinitive form; likely defective spelling or wrong word"),
|
|
||||||
"לההרג": ("REVIEW", "extra ה; should probably be להיהרג (Nif'al of הרג)"),
|
|
||||||
"להתלקלח": ("REVIEW", "not a real word; likely typo for להתקלקל"),
|
|
||||||
"להקלל": ("REVIEW", "ambiguous: could be Hif'il לְהָקֵל (to ease) or Nif'al of קלל"),
|
|
||||||
"המציא": ("3ms", "Hif'il 3ms past form, not an infinitive"),
|
|
||||||
"קומם": ("3ms", "ambiguous: Pu'al 3ms past; Pi'el infinitive is לְקוֹמֵם"),
|
|
||||||
}
|
|
||||||
|
|
||||||
# Expected binyan by line range (1-indexed) per plan analysis
|
|
||||||
LINE_RANGES: list[tuple[range, str]] = [
|
|
||||||
(range(1, 18), "Pa'al"),
|
|
||||||
(range(18, 29), "Nif'al"),
|
|
||||||
(range(29, 37), "Pi'el"),
|
|
||||||
(range(37, 43), "Pu'al"),
|
|
||||||
(range(43, 53), "Hitpa'el"),
|
|
||||||
(range(53, 63), "Hif'il"),
|
|
||||||
(range(63, 71), "Huf'al"),
|
|
||||||
]
|
|
||||||
|
|
||||||
SECTION_HEADERS: dict[str, str] = {
|
|
||||||
"Pa'al": "# Pa'al (פָּעַל)",
|
|
||||||
"Nif'al": "# Nif'al (נִפְעַל)",
|
|
||||||
"Pi'el": "# Pi'el (פִּעֵל)",
|
|
||||||
"Pu'al": "# Pu'al (פֻּעַל) — 3ms past, no infinitive",
|
|
||||||
"Hitpa'el": "# Hitpa'el (הִתְפַּעֵל)",
|
|
||||||
"Hif'il": "# Hif'il (הִפְעִיל)",
|
|
||||||
"Huf'al": "# Huf'al (הֻפְעַל) — 3ms past, no infinitive",
|
|
||||||
}
|
|
||||||
|
|
||||||
session = requests.Session()
|
|
||||||
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki/3.0)"})
|
|
||||||
|
|
||||||
|
|
||||||
def classify_by_line(line_num: int) -> str:
|
|
||||||
"""Return expected binyan for a 1-indexed line number."""
|
|
||||||
for r, binyan in LINE_RANGES:
|
|
||||||
if line_num in r:
|
|
||||||
return binyan
|
|
||||||
return "Unknown"
|
|
||||||
|
|
||||||
|
|
||||||
def find_slug(query: str) -> str | None:
|
|
||||||
"""Search pealim.com and return first URL slug found."""
|
|
||||||
url = f"{PEALIM_BASE}/search/?q={urllib.parse.quote(query)}"
|
|
||||||
try:
|
|
||||||
resp = session.get(url, timeout=REQUEST_TIMEOUT)
|
|
||||||
resp.raise_for_status()
|
|
||||||
slugs = re.findall(r"/dict/(\d+[-][^/?\"<>\s]+)/", resp.text)
|
|
||||||
return slugs[0] if slugs else None
|
|
||||||
except Exception as e:
|
|
||||||
print(f" ERROR searching {query!r}: {e}", file=sys.stderr)
|
|
||||||
return None
|
|
||||||
|
|
||||||
|
|
||||||
def get_page_binyan(slug: str) -> str:
|
|
||||||
"""Fetch /dict/<slug>/ and extract binyan from page header."""
|
|
||||||
url = f"{PEALIM_BASE}/dict/{slug}/"
|
|
||||||
try:
|
|
||||||
resp = session.get(url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
|
|
||||||
resp.raise_for_status()
|
|
||||||
soup = BeautifulSoup(resp.text, "lxml")
|
|
||||||
binyan_names = ["Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al"]
|
|
||||||
for h3 in soup.find_all("h3", class_="page-header"):
|
|
||||||
text = h3.get_text(" ", strip=True)
|
|
||||||
for bname in binyan_names:
|
|
||||||
if bname in text:
|
|
||||||
return bname
|
|
||||||
meta = soup.find("meta", {"property": "og:description"})
|
|
||||||
if meta:
|
|
||||||
desc = meta.get("content", "")
|
|
||||||
for bname in binyan_names:
|
|
||||||
if bname in desc:
|
|
||||||
return bname
|
|
||||||
except Exception as e:
|
|
||||||
print(f" ERROR fetching {slug}: {e}", file=sys.stderr)
|
|
||||||
return ""
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
    """Validate each verb in SOURCE_FILE against pealim.com and write OUTPUT_FILE.

    Steps, in order:
      1. Read the non-blank lines of SOURCE_FILE (exits with status 1 when
         the file does not exist).
      2. For every entry, look up its slug and page binyan and assign a
         status; entries marked "REVIEW" in KNOWN_ISSUES are not queried.
      3. Write the entries, grouped under SECTION_HEADERS, to OUTPUT_FILE.
      4. Print a per-entry report table followed by a status summary.
    """
    if not SOURCE_FILE.exists():
        print(f"ERROR: {SOURCE_FILE} not found", file=sys.stderr)
        sys.exit(1)

    raw_text = SOURCE_FILE.read_text(encoding="utf-8")
    lines = [entry for entry in map(str.strip, raw_text.splitlines()) if entry]
    total = len(lines)
    print(f"Loaded {total} entries from {SOURCE_FILE.name}")
    print(f"Querying pealim.com (delay {REQUEST_DELAY}s per request)…\n")

    results = []

    for line_num, word in enumerate(lines, start=1):
        expected_binyan = classify_by_line(line_num)
        issue_type, issue_note = KNOWN_ISSUES.get(word, (None, ""))

        # Positions 37-42 (Pu'al) and 63-70 (Huf'al) are 3ms past forms
        is_3ms_by_position = expected_binyan in ("Pu'al", "Huf'al")
        is_3ms = is_3ms_by_position or issue_type == "3ms"

        print(f"[{line_num:2d}/{total}] {word:<20}", end=" ", flush=True)

        if issue_type == "REVIEW":
            # Known-bad entries are recorded without querying pealim.
            slug, page_binyan = "", ""
            status, notes = "REVIEW", issue_note
            print("REVIEW (skipping query)")
        else:
            time.sleep(REQUEST_DELAY)
            slug = find_slug(word)

            page_binyan = ""
            if slug:
                time.sleep(REQUEST_DELAY)
                page_binyan = get_page_binyan(slug)

            # Status precedence: 3ms, then missing slug, then binyan mismatch.
            if is_3ms:
                status = "3ms"
                notes = issue_note or "Pu'al/Huf'al 3ms past form"
            elif not slug:
                status = "NOT_FOUND"
                notes = "no search result on pealim.com"
            elif page_binyan and expected_binyan and page_binyan != expected_binyan:
                status = "MISMATCH"
                notes = f"expected {expected_binyan}, page says {page_binyan}"
            else:
                status = "OK"
                notes = ""

            print(f"{status:<12} slug={slug or '-':<35} binyan={page_binyan or '-'}")

        results.append(
            {
                "line": line_num,
                "word": word,
                "expected_binyan": expected_binyan,
                "slug": slug or "",
                "page_binyan": page_binyan,
                "status": status,
                "notes": notes,
                "is_3ms": is_3ms,
            }
        )

    # ── Write cleaned verbs_input.txt ────────────────────────────────────────────
    sections: dict[str, list[str]] = {name: [] for name in SECTION_HEADERS}
    review_lines: list[str] = []

    for row in results:
        section_key = row["expected_binyan"]
        if section_key not in sections:
            # Entries with an unrecognised binyan are filed under the first section.
            section_key = [*sections][0]

        verb, outcome = row["word"], row["status"]
        if outcome == "REVIEW":
            review_lines.append(f"# REVIEW: {verb} — {row['notes']}")
            continue
        if outcome == "3ms":
            entry_text = f"# 3ms: {verb}"
        elif outcome in ("OK", "MISMATCH"):
            entry_text = verb
        else:  # NOT_FOUND
            entry_text = f"# NOT_FOUND: {verb} — {row['notes']}"
        sections[section_key].append(entry_text)

    output_lines = [
        "# Verb list — validated against pealim.com from nevo_typed_verbs_from_modern_hebrew",
        "# Lines prefixed '# 3ms:' are searched by 3ms past form (Pu'al/Huf'al).",
        "# Lines prefixed '# REVIEW:' need manual correction before conjugation extraction.",
        "# Lines prefixed '# NOT_FOUND:' had no pealim.com result — check spelling.",
        "",
    ]
    for binyan, header in SECTION_HEADERS.items():
        members = sections.get(binyan)
        if members:
            output_lines += [header, *members, ""]

    if review_lines:
        output_lines += [
            "# ── Entries flagged for manual review ──────────────────────────────────────────",
            *review_lines,
            "",
        ]

    OUTPUT_FILE.write_text("\n".join(output_lines), encoding="utf-8")
    print(f"\nWrote → {OUTPUT_FILE}")

    # ── Print summary table ──────────────────────────────────────────────────────
    heavy_rule = "=" * 95
    print("\n" + heavy_rule)
    print("VALIDATION REPORT")
    print(heavy_rule)
    print(f"{'#':>4} {'Verb':<22} {'Status':<14} {'Slug':<38} {'Binyan':<12} Notes")
    print("-" * 95)
    for row in results:
        print(
            f"{row['line']:>4} {row['word']:<22} {row['status']:<14} "
            f"{row['slug'][:36]:<38} {row['page_binyan'] or '-':<12} {row['notes']}"
        )
    print(heavy_rule)

    # Tally the statuses in a single pass over the results.
    counts = dict.fromkeys(("OK", "3ms", "MISMATCH", "REVIEW", "NOT_FOUND"), 0)
    for row in results:
        if row["status"] in counts:
            counts[row["status"]] += 1

    print(
        f"\nSummary: {counts['OK']} OK | {counts['3ms']} 3ms-past | "
        f"{counts['MISMATCH']} MISMATCH | {counts['REVIEW']} REVIEW | {counts['NOT_FOUND']} NOT_FOUND"
    )
    print(f"Total entries: {len(results)}")

    if any(counts[key] > 0 for key in ("REVIEW", "NOT_FOUND", "MISMATCH")):
        print("\n⚠ Review flagged entries in verbs_input.txt before running:\n python3 conjugation_extract.py")
|
|
||||||
|
|
||||||
|
|
||||||
# Run the validation only when this file is executed as a script.
if __name__ == "__main__":
    main()
|
|
||||||
|
|
@ -1,3 +0,0 @@
|
||||||
# Vulture whitelist: suppress false positives for interface methods
|
|
||||||
# HTMLParser.handle_starttag requires (self, tag, attrs) signature
|
|
||||||
attrs # noqa
|
|
||||||
Loading…
Reference in a new issue