From 0db11b1aa190c562fdba0771fc1cff4465ddcb85 Mon Sep 17 00:00:00 2001 From: Sochen Date: Wed, 4 Mar 2026 06:42:32 +0000 Subject: [PATCH] Sprint 4: fix insertion order, skip infinitive cards, split past_3p, fix empty binyan MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - vocab deck uses frequency insertion order (genanki.Package); conjugation deck random (_RandomOrderPackage) - skip infinitive form_key in conjugation deck build (reference only, not a quiz target) - PAST_3P_EXPANSION: split past_3p into separate הֵם and הֵן cards - SECTION_BINYAN parsing: read section headers from verbs_input.txt as binyan hints - add binyan_hint param to _extract_conjugations and _extract_passive_from_active_slug - patch 20 cached entries with empty binyan (Pa'al, Nif'al) using section hints - result: 2428 notes across 69 verbs, all with populated binyan Co-Authored-By: Claude Sonnet 4.6 --- apkg_builder.py | 18 +++++++++++++++++- conjugation_extract.py | 39 ++++++++++++++++++++++++++++----------- data/conjugations.json | 40 ++++++++++++++++++++-------------------- 3 files changed, 65 insertions(+), 32 deletions(-) diff --git a/apkg_builder.py b/apkg_builder.py index c482cce..c075211 100644 --- a/apkg_builder.py +++ b/apkg_builder.py @@ -311,6 +311,12 @@ FP_MODERN_FALLBACK = { "imperative_fp": "imperative_mp", } +# 3rd person plural past: same form for m/f — generate two separate pronoun cards +PAST_3P_EXPANSION = [ + ("הֵם", "עָבָר"), + ("הֵן", "עָבָר"), +] + # Voice field: passive label only (shown inline on card front for Pu'al/Huf'al) VOICE_MAP = { "Pu'al": "סָבִיל", @@ -573,6 +579,10 @@ def build_conj_deck( conj_form = form_data.get("form", "") audio_url_for_key = form_data.get("audio_url", "") + # Infinitive: shown on card front as reference — skip as a quiz form + if form_key == "infinitive": + continue + # Audio tag: use downloaded file if present audio_tag = "" if slug: @@ -588,6 +598,12 @@ def build_conj_deck( add_note(pronoun, tense_label, conj_form, audio_tag) continue + # Past 3rd plural: same form for m/f → two separate pronoun cards + if form_key == "past_3p": + for pronoun, tense_label in PAST_3P_EXPANSION: + add_note(pronoun, tense_label, conj_form, audio_tag) + continue + # 2fp/3fp future and imperative: show modern (mp) form + classical (fp) in parens if form_key in FP_MODERN_FALLBACK: mp_key = FP_MODERN_FALLBACK[form_key] @@ -641,7 +657,7 @@ def write_vocab_apkg( out_path: Path = VOCAB_APKG, ) -> None: out_path.parent.mkdir(parents=True, exist_ok=True) - pkg = _RandomOrderPackage(deck) + pkg = genanki.Package(deck) # insertion order = frequency rank (new.order=1 default) pkg.media_files = [str(p) for p in media_files if p.exists()] + _font_media_files() pkg.write_to_file(str(out_path)) logger.info(f"Vocabulary deck written → {out_path}") diff --git a/conjugation_extract.py b/conjugation_extract.py index 7b4656f..9e3434a 100755 --- a/conjugation_extract.py +++ b/conjugation_extract.py @@ -414,7 +414,7 @@ def _extract_passive_binyan_from_page(soup: BeautifulSoup) -> str: return "" -def _extract_conjugations(slug: str, search_term: str, is_3ms_search: bool = False) -> dict | None: +def _extract_conjugations(slug: str, search_term: str, is_3ms_search: bool = False, binyan_hint: str = "") -> dict | None: """Fetch /dict// and parse conjugation table (active + passive).""" url = f"{PEALIM_BASE}/dict/{slug}/" try: @@ -434,10 +434,12 @@ def _extract_conjugations(slug: str, search_term: str, is_3ms_search: bool = Fal root = txt break - # Extract binyan: try PoS lookup first, then page header + # Extract binyan: try PoS lookup first, then page header, then section hint binyan = _binyan_from_pos(search_term) if not is_3ms_search else "" if not binyan: binyan = _extract_binyan_from_page(soup) + if not binyan: + binyan = binyan_hint # Parse active forms table forms_raw = _parse_table(soup, passive=False) @@ -495,7 +497,7 @@ def _save_conjugations(data: dict) -> None: json.dump(data, f, ensure_ascii=False, indent=2) -def _extract_passive_from_active_slug(active_slug: str, search_term: str) -> dict | None: +def _extract_passive_from_active_slug(active_slug: str, search_term: str, binyan_hint: str = "") -> dict | None: """Fetch active verb page and extract only the passive section forms. Used for Pu'al/Huf'al 3ms entries where we know the active verb's slug.""" url = f"{PEALIM_BASE}/dict/{active_slug}/" @@ -527,6 +529,8 @@ def _extract_passive_from_active_slug(active_slug: str, search_term: str) -> dic passive_binyan = _extract_passive_binyan_from_page(soup) if not passive_binyan: passive_binyan = "Pu'al" if active_binyan == "Pi'el" else "Huf'al" if active_binyan == "Hif'il" else "" + if not passive_binyan: + passive_binyan = binyan_hint result = { "infinitive": search_term, @@ -567,8 +571,16 @@ def main(verbs_file: Path = VERBS_INPUT) -> dict: if len(parts) >= 2: slug_overrides[parts[0]] = parts[1] + # Map section header keywords → binyan name (for binyan_hint fallback) + SECTION_BINYAN = { + "pa'al": "Pa'al", "nif'al": "Nif'al", "pi'el": "Pi'el", + "pu'al": "Pu'al", "hitpa'el": "Hitpa'el", "hif'il": "Hif'il", "huf'al": "Huf'al", + } + # Parse: regular verbs and # 3ms: lines (optional active slug on 3ms lines) - verbs: list[tuple[str, bool, str | None]] = [] # (search_term, is_3ms_search, active_slug) + # Track current section binyan from comment headers for use as a hint + verbs: list[tuple[str, bool, str | None, str]] = [] # (search_term, is_3ms_search, active_slug, binyan_hint) + current_binyan_hint = "" for line in raw_lines: stripped = line.strip() if not stripped or stripped.startswith("# slug:"): @@ -578,21 +590,26 @@ def main(verbs_file: Path = VERBS_INPUT) -> dict: if parts: form = parts[0] active_slug = parts[1] if len(parts) >= 2 else None - verbs.append((form, True, active_slug)) + verbs.append((form, True, active_slug, current_binyan_hint)) elif stripped.startswith("#"): - continue + # Check if this is a section header setting the binyan context + low = stripped.lower() + for key, bname in SECTION_BINYAN.items(): + if key in low: + current_binyan_hint = bname + break else: - verbs.append((stripped, False, None)) + verbs.append((stripped, False, None, current_binyan_hint)) logger.info(f"Loaded {len(verbs)} verbs from {verbs_file} " - f"({sum(1 for _, p, _ in verbs if p)} passive 3ms)") + f"({sum(1 for _, p, _, _ in verbs if p)} passive 3ms)") if slug_overrides: logger.info(f" Slug overrides: {slug_overrides}") conjugations = _load_conjugations() new_count = 0 - for verb, is_3ms, active_slug in verbs: + for verb, is_3ms, active_slug, binyan_hint in verbs: if verb in conjugations: logger.info(f"Skipping {verb} (cached)") continue @@ -614,7 +631,7 @@ def main(verbs_file: Path = VERBS_INPUT) -> dict: continue logger.info(f" Found active slug {slug} for passive extraction") time.sleep(REQUEST_DELAY) - data = _extract_passive_from_active_slug(slug, verb) + data = _extract_passive_from_active_slug(slug, verb, binyan_hint=binyan_hint) else: override = slug_overrides.get(verb) if override: @@ -628,7 +645,7 @@ def main(verbs_file: Path = VERBS_INPUT) -> dict: _save_conjugations(conjugations) continue time.sleep(REQUEST_DELAY) - data = _extract_conjugations(slug, verb, is_3ms_search=False) + data = _extract_conjugations(slug, verb, is_3ms_search=False, binyan_hint=binyan_hint) conjugations[verb] = data _save_conjugations(conjugations) diff --git a/data/conjugations.json b/data/conjugations.json index c194f78..255d3a8 100644 --- a/data/conjugations.json +++ b/data/conjugations.json @@ -3,7 +3,7 @@ "infinitive": "לשמור", "slug": "2255-lishmor", "root": "שׁ - מ - ר", - "binyan": "", + "binyan": "Pa'al", "is_passive": false, "reference_form": "לִשְׁמֹר", "forms": { @@ -181,7 +181,7 @@ "infinitive": "ללמוד", "slug": "41-lilmod", "root": "ל - מ - ד", - "binyan": "", + "binyan": "Pa'al", "is_passive": false, "reference_form": "לִלְמֹד", "forms": { @@ -359,7 +359,7 @@ "infinitive": "לאסוף", "slug": "128-leesof", "root": "א - ס - ף", - "binyan": "", + "binyan": "Pa'al", "is_passive": false, "reference_form": "לֶאֱסֹף", "forms": { @@ -537,7 +537,7 @@ "infinitive": "לעבוד", "slug": "51-laavod", "root": "ע - ב - ד", - "binyan": "", + "binyan": "Pa'al", "is_passive": false, "reference_form": "לַעֲבֹד", "forms": { @@ -715,7 +715,7 @@ "infinitive": "לחבוש", "slug": "553-lachbosh", "root": "ח - ב - שׁ", - "binyan": "", + "binyan": "Pa'al", "is_passive": false, "reference_form": "לַחְבֹּשׁ", "forms": { @@ -893,7 +893,7 @@ "infinitive": "לאכול", "slug": "30-leechol", "root": "א - כ - ל", - "binyan": "", + "binyan": "Pa'al", "is_passive": false, "reference_form": "לֶאֱכֹל", "forms": { @@ -1071,7 +1071,7 @@ "infinitive": "לשאול", "slug": "39-lishol", "root": "שׁ - א - ל", - "binyan": "", + "binyan": "Pa'al", "is_passive": false, "reference_form": "לִשְׁאֹל", "forms": { @@ -1249,7 +1249,7 @@ "infinitive": "לשלוח", "slug": "2220-lishloach", "root": "שׁ - ל - ח", - "binyan": "", + "binyan": "Pa'al", "is_passive": false, "reference_form": "לִשְׁלֹחַ", "forms": { @@ -1427,7 +1427,7 @@ "infinitive": "לגבוה", "slug": "286-ligboah", "root": "ג - ב - הּ", - "binyan": "", + "binyan": "Pa'al", "is_passive": false, "reference_form": "לִגְבֹּהַּ", "forms": { @@ -1961,7 +1961,7 @@ "infinitive": "לִיפּוֹל", "slug": "1230-lipol", "root": "נ - פ - ל", - "binyan": "", + "binyan": "Pa'al", "is_passive": false, "reference_form": "לִפֹּל", "forms": { @@ -2317,7 +2317,7 @@ "infinitive": "לחון", "slug": "634-lachon", "root": "ח - נ - ן", - "binyan": "", + "binyan": "Pa'al", "is_passive": false, "reference_form": "לָחֹן", "forms": { @@ -2495,7 +2495,7 @@ "infinitive": "לקרוא", "slug": "13-likro", "root": "ק - ר - א", - "binyan": "", + "binyan": "Pa'al", "is_passive": false, "reference_form": "לִקְרֹא", "forms": { @@ -2851,7 +2851,7 @@ "infinitive": "להיבדק", "slug": "178-lehibadek", "root": "ב - ד - ק", - "binyan": "", + "binyan": "Nif'al", "is_passive": false, "reference_form": "לְהִבָּדֵק", "forms": { @@ -3207,7 +3207,7 @@ "infinitive": "להיהרג", "slug": "474-lehehareg", "root": "ה - ר - ג", - "binyan": "", + "binyan": "Nif'al", "is_passive": false, "reference_form": "לְהֵהָרֵג", "forms": { @@ -3563,7 +3563,7 @@ "infinitive": "להישאר", "slug": "47-lehishaer", "root": "שׁ - א - ר", - "binyan": "", + "binyan": "Nif'al", "is_passive": false, "reference_form": "לְהִשָּׁאֵר", "forms": { @@ -3741,7 +3741,7 @@ "infinitive": "להיפגע", "slug": "1591-lehipagea", "root": "פ - ג - ע", - "binyan": "", + "binyan": "Nif'al", "is_passive": false, "reference_form": "לְהִפָּגֵעַ", "forms": { @@ -3919,7 +3919,7 @@ "infinitive": "להיוולד", "slug": "800-lehivaled", "root": "י - ל - ד", - "binyan": "", + "binyan": "Nif'al", "is_passive": false, "reference_form": "לְהִוָּלֵד", "forms": { @@ -4275,7 +4275,7 @@ "infinitive": "להיסוג", "slug": "1323-laseget", "root": "ס - ו - ג", - "binyan": "", + "binyan": "Nif'al", "is_passive": false, "reference_form": "לָסֶגֶת", "forms": { @@ -4453,7 +4453,7 @@ "infinitive": "להימצא", "slug": "1084-lehimatze", "root": "מ - צ - א", - "binyan": "", + "binyan": "Nif'al", "is_passive": false, "reference_form": "לְהִמָּצֵא", "forms": { @@ -4631,7 +4631,7 @@ "infinitive": "להיבנות", "slug": "235-lehibanot", "root": "ב - נ - ה", - "binyan": "", + "binyan": "Nif'al", "is_passive": false, "reference_form": "לְהִבָּנוֹת", "forms": {