Sprint 4: fix insertion order, skip infinitive cards, split past_3p, fix empty binyan

- vocab deck keeps insertion order, i.e. frequency rank (genanki.Package); conjugation deck stays in random order (_RandomOrderPackage)
- skip infinitive form_key in conjugation deck build (reference only, not a quiz target)
- PAST_3P_EXPANSION: split past_3p into separate הֵם and הֵן cards
- SECTION_BINYAN parsing: read section headers from verbs_input.txt as binyan hints
- add binyan_hint param to _extract_conjugations and _extract_passive_from_active_slug
- patch 20 cached entries that had an empty binyan (12 Pa'al, 8 Nif'al) using section hints
- result: 2428 notes across 69 verbs, all with populated binyan

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Sochen 2026-03-04 06:42:32 +00:00
parent 58dc1b8d9b
commit 0db11b1aa1
3 changed files with 65 additions and 32 deletions

View file

@ -311,6 +311,12 @@ FP_MODERN_FALLBACK = {
"imperative_fp": "imperative_mp",
}
# 3rd person plural past: the verb form is the same for m/f, so the single
# past_3p form is expanded into two separate pronoun cards.
# Each entry is (pronoun, tense_label), unpacked in that order by build_conj_deck.
PAST_3P_EXPANSION = [
("הֵם", "עָבָר"),
("הֵן", "עָבָר"),
]
# Voice field: passive label only (shown inline on card front for Pu'al/Huf'al)
VOICE_MAP = {
"Pu'al": "סָבִיל",
@ -573,6 +579,10 @@ def build_conj_deck(
conj_form = form_data.get("form", "")
audio_url_for_key = form_data.get("audio_url", "")
# Infinitive: shown on card front as reference — skip as a quiz form
if form_key == "infinitive":
continue
# Audio tag: use downloaded file if present
audio_tag = ""
if slug:
@ -588,6 +598,12 @@ def build_conj_deck(
add_note(pronoun, tense_label, conj_form, audio_tag)
continue
# Past 3rd plural: same form for m/f → two separate pronoun cards
if form_key == "past_3p":
for pronoun, tense_label in PAST_3P_EXPANSION:
add_note(pronoun, tense_label, conj_form, audio_tag)
continue
# 2fp/3fp future and imperative: show modern (mp) form + classical (fp) in parens
if form_key in FP_MODERN_FALLBACK:
mp_key = FP_MODERN_FALLBACK[form_key]
@ -641,7 +657,7 @@ def write_vocab_apkg(
out_path: Path = VOCAB_APKG,
) -> None:
out_path.parent.mkdir(parents=True, exist_ok=True)
pkg = _RandomOrderPackage(deck)
pkg = genanki.Package(deck) # insertion order = frequency rank (new.order=1 default)
pkg.media_files = [str(p) for p in media_files if p.exists()] + _font_media_files()
pkg.write_to_file(str(out_path))
logger.info(f"Vocabulary deck written → {out_path}")

View file

@ -414,7 +414,7 @@ def _extract_passive_binyan_from_page(soup: BeautifulSoup) -> str:
return ""
def _extract_conjugations(slug: str, search_term: str, is_3ms_search: bool = False) -> dict | None:
def _extract_conjugations(slug: str, search_term: str, is_3ms_search: bool = False, binyan_hint: str = "") -> dict | None:
"""Fetch /dict/<slug>/ and parse conjugation table (active + passive)."""
url = f"{PEALIM_BASE}/dict/{slug}/"
try:
@ -434,10 +434,12 @@ def _extract_conjugations(slug: str, search_term: str, is_3ms_search: bool = Fal
root = txt
break
# Extract binyan: try PoS lookup first, then page header
# Extract binyan: try PoS lookup first, then page header, then section hint
binyan = _binyan_from_pos(search_term) if not is_3ms_search else ""
if not binyan:
binyan = _extract_binyan_from_page(soup)
if not binyan:
binyan = binyan_hint
# Parse active forms table
forms_raw = _parse_table(soup, passive=False)
@ -495,7 +497,7 @@ def _save_conjugations(data: dict) -> None:
json.dump(data, f, ensure_ascii=False, indent=2)
def _extract_passive_from_active_slug(active_slug: str, search_term: str) -> dict | None:
def _extract_passive_from_active_slug(active_slug: str, search_term: str, binyan_hint: str = "") -> dict | None:
"""Fetch active verb page and extract only the passive section forms.
Used for Pu'al/Huf'al 3ms entries where we know the active verb's slug."""
url = f"{PEALIM_BASE}/dict/{active_slug}/"
@ -527,6 +529,8 @@ def _extract_passive_from_active_slug(active_slug: str, search_term: str) -> dic
passive_binyan = _extract_passive_binyan_from_page(soup)
if not passive_binyan:
passive_binyan = "Pu'al" if active_binyan == "Pi'el" else "Huf'al" if active_binyan == "Hif'il" else ""
if not passive_binyan:
passive_binyan = binyan_hint
result = {
"infinitive": search_term,
@ -567,8 +571,16 @@ def main(verbs_file: Path = VERBS_INPUT) -> dict:
if len(parts) >= 2:
slug_overrides[parts[0]] = parts[1]
# Map section header keywords → binyan name (for binyan_hint fallback).
# Keys are lowercase because they are substring-matched against the lowercased
# text of "#" comment lines in the verbs file; the first key found wins.
SECTION_BINYAN = {
"pa'al": "Pa'al", "nif'al": "Nif'al", "pi'el": "Pi'el",
"pu'al": "Pu'al", "hitpa'el": "Hitpa'el", "hif'il": "Hif'il", "huf'al": "Huf'al",
}
# Parse: regular verbs and # 3ms: lines (optional active slug on 3ms lines)
verbs: list[tuple[str, bool, str | None]] = [] # (search_term, is_3ms_search, active_slug)
# Track current section binyan from comment headers for use as a hint
verbs: list[tuple[str, bool, str | None, str]] = [] # (search_term, is_3ms_search, active_slug, binyan_hint)
current_binyan_hint = ""
for line in raw_lines:
stripped = line.strip()
if not stripped or stripped.startswith("# slug:"):
@ -578,21 +590,26 @@ def main(verbs_file: Path = VERBS_INPUT) -> dict:
if parts:
form = parts[0]
active_slug = parts[1] if len(parts) >= 2 else None
verbs.append((form, True, active_slug))
verbs.append((form, True, active_slug, current_binyan_hint))
elif stripped.startswith("#"):
continue
# Check if this is a section header setting the binyan context
low = stripped.lower()
for key, bname in SECTION_BINYAN.items():
if key in low:
current_binyan_hint = bname
break
else:
verbs.append((stripped, False, None))
verbs.append((stripped, False, None, current_binyan_hint))
logger.info(f"Loaded {len(verbs)} verbs from {verbs_file} "
f"({sum(1 for _, p, _ in verbs if p)} passive 3ms)")
f"({sum(1 for _, p, _, _ in verbs if p)} passive 3ms)")
if slug_overrides:
logger.info(f" Slug overrides: {slug_overrides}")
conjugations = _load_conjugations()
new_count = 0
for verb, is_3ms, active_slug in verbs:
for verb, is_3ms, active_slug, binyan_hint in verbs:
if verb in conjugations:
logger.info(f"Skipping {verb} (cached)")
continue
@ -614,7 +631,7 @@ def main(verbs_file: Path = VERBS_INPUT) -> dict:
continue
logger.info(f" Found active slug {slug} for passive extraction")
time.sleep(REQUEST_DELAY)
data = _extract_passive_from_active_slug(slug, verb)
data = _extract_passive_from_active_slug(slug, verb, binyan_hint=binyan_hint)
else:
override = slug_overrides.get(verb)
if override:
@ -628,7 +645,7 @@ def main(verbs_file: Path = VERBS_INPUT) -> dict:
_save_conjugations(conjugations)
continue
time.sleep(REQUEST_DELAY)
data = _extract_conjugations(slug, verb, is_3ms_search=False)
data = _extract_conjugations(slug, verb, is_3ms_search=False, binyan_hint=binyan_hint)
conjugations[verb] = data
_save_conjugations(conjugations)

View file

@ -3,7 +3,7 @@
"infinitive": "לשמור",
"slug": "2255-lishmor",
"root": "שׁ - מ - ר",
"binyan": "",
"binyan": "Pa'al",
"is_passive": false,
"reference_form": "לִשְׁמֹר",
"forms": {
@ -181,7 +181,7 @@
"infinitive": "ללמוד",
"slug": "41-lilmod",
"root": "ל - מ - ד",
"binyan": "",
"binyan": "Pa'al",
"is_passive": false,
"reference_form": "לִלְמֹד",
"forms": {
@ -359,7 +359,7 @@
"infinitive": "לאסוף",
"slug": "128-leesof",
"root": "א - ס - ף",
"binyan": "",
"binyan": "Pa'al",
"is_passive": false,
"reference_form": "לֶאֱסֹף",
"forms": {
@ -537,7 +537,7 @@
"infinitive": "לעבוד",
"slug": "51-laavod",
"root": "ע - ב - ד",
"binyan": "",
"binyan": "Pa'al",
"is_passive": false,
"reference_form": "לַעֲבֹד",
"forms": {
@ -715,7 +715,7 @@
"infinitive": "לחבוש",
"slug": "553-lachbosh",
"root": "ח - ב - שׁ",
"binyan": "",
"binyan": "Pa'al",
"is_passive": false,
"reference_form": "לַחְבֹּשׁ",
"forms": {
@ -893,7 +893,7 @@
"infinitive": "לאכול",
"slug": "30-leechol",
"root": "א - כ - ל",
"binyan": "",
"binyan": "Pa'al",
"is_passive": false,
"reference_form": "לֶאֱכֹל",
"forms": {
@ -1071,7 +1071,7 @@
"infinitive": "לשאול",
"slug": "39-lishol",
"root": "שׁ - א - ל",
"binyan": "",
"binyan": "Pa'al",
"is_passive": false,
"reference_form": "לִשְׁאֹל",
"forms": {
@ -1249,7 +1249,7 @@
"infinitive": "לשלוח",
"slug": "2220-lishloach",
"root": "שׁ - ל - ח",
"binyan": "",
"binyan": "Pa'al",
"is_passive": false,
"reference_form": "לִשְׁלֹחַ",
"forms": {
@ -1427,7 +1427,7 @@
"infinitive": "לגבוה",
"slug": "286-ligboah",
"root": "ג - ב - הּ",
"binyan": "",
"binyan": "Pa'al",
"is_passive": false,
"reference_form": "לִגְבֹּהַּ",
"forms": {
@ -1961,7 +1961,7 @@
"infinitive": "לִיפּוֹל",
"slug": "1230-lipol",
"root": "נ - פ - ל",
"binyan": "",
"binyan": "Pa'al",
"is_passive": false,
"reference_form": "לִפֹּל",
"forms": {
@ -2317,7 +2317,7 @@
"infinitive": "לחון",
"slug": "634-lachon",
"root": "ח - נ - ן",
"binyan": "",
"binyan": "Pa'al",
"is_passive": false,
"reference_form": "לָחֹן",
"forms": {
@ -2495,7 +2495,7 @@
"infinitive": "לקרוא",
"slug": "13-likro",
"root": "ק - ר - א",
"binyan": "",
"binyan": "Pa'al",
"is_passive": false,
"reference_form": "לִקְרֹא",
"forms": {
@ -2851,7 +2851,7 @@
"infinitive": "להיבדק",
"slug": "178-lehibadek",
"root": "ב - ד - ק",
"binyan": "",
"binyan": "Nif'al",
"is_passive": false,
"reference_form": "לְהִבָּדֵק",
"forms": {
@ -3207,7 +3207,7 @@
"infinitive": "להיהרג",
"slug": "474-lehehareg",
"root": "ה - ר - ג",
"binyan": "",
"binyan": "Nif'al",
"is_passive": false,
"reference_form": "לְהֵהָרֵג",
"forms": {
@ -3563,7 +3563,7 @@
"infinitive": "להישאר",
"slug": "47-lehishaer",
"root": "שׁ - א - ר",
"binyan": "",
"binyan": "Nif'al",
"is_passive": false,
"reference_form": "לְהִשָּׁאֵר",
"forms": {
@ -3741,7 +3741,7 @@
"infinitive": "להיפגע",
"slug": "1591-lehipagea",
"root": "פ - ג - ע",
"binyan": "",
"binyan": "Nif'al",
"is_passive": false,
"reference_form": "לְהִפָּגֵעַ",
"forms": {
@ -3919,7 +3919,7 @@
"infinitive": "להיוולד",
"slug": "800-lehivaled",
"root": "י - ל - ד",
"binyan": "",
"binyan": "Nif'al",
"is_passive": false,
"reference_form": "לְהִוָּלֵד",
"forms": {
@ -4275,7 +4275,7 @@
"infinitive": "להיסוג",
"slug": "1323-laseget",
"root": "ס - ו - ג",
"binyan": "",
"binyan": "Nif'al",
"is_passive": false,
"reference_form": "לָסֶגֶת",
"forms": {
@ -4453,7 +4453,7 @@
"infinitive": "להימצא",
"slug": "1084-lehimatze",
"root": "מ - צ - א",
"binyan": "",
"binyan": "Nif'al",
"is_passive": false,
"reference_form": "לְהִמָּצֵא",
"forms": {
@ -4631,7 +4631,7 @@
"infinitive": "להיבנות",
"slug": "235-lehibanot",
"root": "ב - נ - ה",
"binyan": "",
"binyan": "Nif'al",
"is_passive": false,
"reference_form": "לְהִבָּנוֹת",
"forms": {