From bb79725a7fe27dcaad553c20c82e1afc14c64da1 Mon Sep 17 00:00:00 2001 From: Sochen Date: Wed, 4 Mar 2026 07:33:14 +0000 Subject: [PATCH] Python code cleanup (python-pro review) - Type annotations: dict|None defaults, return types, nested func annotations - Dead code: removed unused row_forms_with_audio(), duplicate _strip_nikkud defs, redundant guards, duplicate 'ism' in ABSTRACT_SUFFIXES - Exceptions: narrowed bare except to (ValueError, pd.errors.ParserError) and (json.JSONDecodeError, OSError) throughout; all raise ValueError given messages - Deduplication: extracted deduplicate() helper in _parse_table; setdefault() for dict building in benyehuda and apkg_builder; list comprehension in benyehuda - Correctness: limit=0 guard fixed (is not None); audio tag parsing uses removeprefix/removesuffix instead of magic offsets; vectorized pandas sum - Constants: BINYAN_NAMES extracted; unicodedata imports moved to top level - benyehuda load(): removed wasted cache read on force_rebuild; word-boundary regex simplified from double-negative to \w Co-Authored-By: Claude Sonnet 4.6 --- apkg_builder.py | 34 +++++++-------- benyehuda.py | 29 ++++++------- conjugation_extract.py | 98 ++++++++++++++++-------------------------- frequency_lookup.py | 10 ++--- image_fetch.py | 13 +++--- run.py | 20 ++++----- 6 files changed, 85 insertions(+), 119 deletions(-) diff --git a/apkg_builder.py b/apkg_builder.py index 537d6eb..dbd6421 100644 --- a/apkg_builder.py +++ b/apkg_builder.py @@ -399,7 +399,7 @@ def build_vocab_deck( df = pd.read_csv(dict_csv, sep=";", index_col=0) if df.shape[1] < 3: raise ValueError("too few columns") - except Exception: + except (ValueError, pd.errors.ParserError): df = pd.read_csv(dict_csv, index_col=0) if limit: @@ -417,8 +417,8 @@ def build_vocab_deck( try: with open(image_cache_path) as _f: image_cache = json.load(_f) - except Exception: - pass + except (json.JSONDecodeError, OSError) as e: + logger.debug(f"Could not load image cache from disk: {e}") images_dir = DATA_DIR / "images" @@ -451,7 +451,7 @@ def build_vocab_deck( word_no_nik = str(row.get("Word Without Nikkud", "")).strip() shared_roots = str(row.get("shared roots", row.get("SharedRoots", ""))).strip() tags_str = str(row.get("tags", row.get("Tags", ""))).strip() - freq_rank_raw = int(row["_freq_rank"]) + freq_rank_raw = row["_freq_rank"] freq_display = str(freq_rank_raw) if freq_rank_raw < 999_999 else "50k+" root = "" if root in ("nan", "None", "-") else root @@ -470,7 +470,7 @@ def build_vocab_deck( # Audio audio_tag = _audio_tag(word_no_nik) if audio_tag: - mp3_name = audio_tag[7:-1] # strip [sound: and ] + mp3_name = audio_tag.removeprefix("[sound:").removesuffix("]") mp3_path = AUDIO_DIR / mp3_name if mp3_path not in media_files: media_files.append(mp3_path) @@ -482,14 +482,12 @@ def build_vocab_deck( # Related words grouped by PoS category related_html = "" - if shared_roots and shared_roots not in ("nan", "None", ""): + if shared_roots: related_words = shared_roots.split() groups: dict[str, list[str]] = {} for rw in related_words: cat = word_to_pos_cat.get(_strip_nikkud(rw), "Other") - if cat not in groups: - groups[cat] = [] - groups[cat].append(rw) + groups.setdefault(cat, []).append(rw) parts = [] for cat, words in groups.items(): if cat == "Other": @@ -532,13 +530,15 @@ def build_vocab_deck( deck.add_note(note) # Diagnostic: count words without PoS coverage in shared_roots - other_count = sum( - 1 for _, row in df.iterrows() - for rw in str(row.get("shared roots", row.get("SharedRoots", ""))).split() - if str(row.get("shared roots", row.get("SharedRoots", ""))) not in ("nan", "None", "") - and word_to_pos_cat.get(_strip_nikkud(rw)) is None - ) - unlisted = sum(1 for v in df["_freq_rank"] if int(v) >= 999_999) + other_count = 0 + for _, row in df.iterrows(): + sr = str(row.get("shared roots", row.get("SharedRoots", ""))).strip() + if sr and sr not in ("nan", "None"): + other_count += sum( + 1 for rw in sr.split() + if word_to_pos_cat.get(_strip_nikkud(rw)) is None + ) + unlisted = int((df["_freq_rank"] >= 999_999).sum()) logger.info(f" Unlisted words (not in frequency corpus): {unlisted}/{len(df)}") logger.info(f" Related-words without PoS coverage: {other_count} (shown unlabeled)") logger.info(f"Vocabulary deck: {len(deck.notes)} notes") @@ -681,7 +681,7 @@ def write_vocab_apkg( def write_conj_apkg( deck: genanki.Deck, - media_files: list[Path] = None, + media_files: list[Path] | None = None, out_path: Path = CONJ_APKG, ) -> None: out_path.parent.mkdir(parents=True, exist_ok=True) diff --git a/benyehuda.py b/benyehuda.py index 229ec1e..abe5c56 100644 --- a/benyehuda.py +++ b/benyehuda.py @@ -80,10 +80,9 @@ def _build_index(corpus_zip_bytes: bytes) -> None: words = re.findall(r"[\u05d0-\u05ea\u05b0-\u05c7'\"]+", sentence) for w in set(words): if len(w) >= 2: - if w not in _index: - _index[w] = [] - if len(_index[w]) < MAX_INDEX_ENTRIES: - _index[w].append(sentence) + bucket = _index.setdefault(w, []) + if len(bucket) < MAX_INDEX_ENTRIES: + bucket.append(sentence) logger.info(f"Index built: {len(_index)} unique word forms") @@ -108,19 +107,19 @@ def load(force_rebuild: bool = False) -> None: if _index and not force_rebuild: return - # Load persisted examples cache - if EXAMPLES_CACHE_PATH.exists(): - with open(EXAMPLES_CACHE_PATH, encoding="utf-8") as f: - _examples_cache = json.load(f) - if force_rebuild: - # Delete old index so it rebuilds + # Delete old index and discard examples cache if INDEX_PATH.exists(): INDEX_PATH.unlink() logger.info("Deleted old Ben Yehuda index (force rebuild)") _examples_cache = {} + else: + # Load persisted examples cache (not needed on rebuild) + if EXAMPLES_CACHE_PATH.exists(): + with open(EXAMPLES_CACHE_PATH, encoding="utf-8") as f: + _examples_cache = json.load(f) - if INDEX_PATH.exists() and not force_rebuild: + if INDEX_PATH.exists(): _load_index() return @@ -175,12 +174,8 @@ def get_examples(word_nikkud: str) -> list[str]: # Filter: word must appear as a whole token # Match the stripped form (for robustness with nikkud variants in sentence) if word_stripped: - pattern = r"(? str: + """Remove Hebrew nikkud (diacritics) from a string.""" + return "".join( + ch for ch in unicodedata.normalize("NFD", text) + if unicodedata.category(ch) != "Mn" + ) + + def _build_pos_lookup() -> dict[str, str]: """Build word_stripped → binyan dict from pealim_dict_for_anki.csv.""" - import unicodedata - - def strip_nikkud(t: str) -> str: - return "".join(c for c in unicodedata.normalize("NFD", t) - if unicodedata.category(c) != "Mn") - lookup: dict[str, str] = {} if not DICT_CSV.exists(): return lookup @@ -119,15 +126,15 @@ def _build_pos_lookup() -> dict[str, str]: try: df = pd.read_csv(DICT_CSV, sep=";", index_col=0) if df.shape[1] < 3: - raise ValueError - except Exception: + raise ValueError("too few columns") + except (ValueError, pd.errors.ParserError): df = pd.read_csv(DICT_CSV, index_col=0) for _, row in df.iterrows(): word = str(row.get("Word", "")).strip() pos = str(row.get("Part of speech", row.get("Part of Speech", ""))).strip() if word and pos and "nan" not in pos.lower(): - lookup[strip_nikkud(word)] = pos + lookup[_strip_nikkud(word)] = pos except Exception as e: logger.debug(f"Could not load PoS lookup: {e}") @@ -147,14 +154,8 @@ def _get_pos_lookup() -> dict[str, str]: def _binyan_from_pos(word: str) -> str: """Look up binyan from PoS field: 'Verb – pa\'al' or 'Verb – Pi\'el' → canonical name.""" - import unicodedata - - def strip_nikkud(t: str) -> str: - return "".join(c for c in unicodedata.normalize("NFD", t) - if unicodedata.category(c) != "Mn") - lookup = _get_pos_lookup() - pos_str = lookup.get(strip_nikkud(word), "") + pos_str = lookup.get(_strip_nikkud(word), "") if not pos_str: return "" @@ -191,10 +192,8 @@ def _find_slug(query: str) -> str | None: def _is_passive_binyan(binyan: str) -> bool: - for marker in ["פֻּעַל", "הֻפְעַל", "Pu'al", "Huf'al", "pual", "hufal"]: - if marker.lower() in binyan.lower(): - return True - return False + """Return True if the binyan is a passive (Pu'al or Huf'al).""" + return any(marker in binyan for marker in ("פֻּעַל", "הֻפְעַל", "Pu'al", "Huf'al")) def _get_menukad(cell) -> tuple[str, str]: @@ -255,21 +254,6 @@ def _parse_table(soup: BeautifulSoup, passive: bool = False, table_el=None) -> d forms: dict[str, dict] = {} - def row_forms_with_audio(row_idx: int) -> list[tuple[str, str]]: - """Extract all (Hebrew form, audio_url) pairs from a row (expanding colspans).""" - cells = rows[row_idx].find_all(["th", "td"]) - result = [] - for cell in cells: - txt, audio_url = _get_menukad(cell) - colspan = int(cell.get("colspan", 1)) - if txt: - for _ in range(colspan): - result.append((txt, audio_url)) - else: - for _ in range(colspan): - result.append(("", "")) - return result - def first_heb_forms(row_idx: int) -> list[tuple[str, str]]: """Get only the Hebrew-text cells from a row (skip label cells).""" cells = rows[row_idx].find_all(["th", "td"]) @@ -282,6 +266,16 @@ def _parse_table(soup: BeautifulSoup, passive: bool = False, table_el=None) -> d result.append((txt, audio_url)) return result + def deduplicate(pairs: list[tuple[str, str]]) -> list[tuple[str, str]]: + """Return pairs with duplicate form-text entries removed (first occurrence kept).""" + seen: set[str] = set() + out: list[tuple[str, str]] = [] + for pair in pairs: + if pair[0] not in seen: + seen.add(pair[0]) + out.append(pair) + return out + # Find rows by tense label present_row = past_row = future_row = imp_row = inf_row = -1 for i, row in enumerate(rows): @@ -310,13 +304,7 @@ def _parse_table(soup: BeautifulSoup, passive: bool = False, table_el=None) -> d # Past tense if past_row >= 0: - hf = first_heb_forms(past_row) - unique: list[tuple[str, str]] = [] - seen_forms: set[str] = set() - for pair in hf: - if pair[0] not in seen_forms: - seen_forms.add(pair[0]) - unique.append(pair) + unique = deduplicate(first_heb_forms(past_row)) if len(unique) >= 1: store("past_1s", unique[0][0], unique[0][1]) if len(unique) >= 2: @@ -329,26 +317,14 @@ def _parse_table(soup: BeautifulSoup, passive: bool = False, table_el=None) -> d store(k, v, au) if past_row + 2 < len(rows): - hf3 = first_heb_forms(past_row + 2) - unique3: list[tuple[str, str]] = [] - seen3: set[str] = set() - for pair in hf3: - if pair[0] not in seen3: - seen3.add(pair[0]) - unique3.append(pair) + unique3 = deduplicate(first_heb_forms(past_row + 2)) keys3 = ["past_3ms", "past_3fs", "past_3p"] for k, (v, au) in zip(keys3, unique3): store(k, v, au) # Future tense if future_row >= 0: - hf = first_heb_forms(future_row) - unique_f: list[tuple[str, str]] = [] - seen_f: set[str] = set() - for pair in hf: - if pair[0] not in seen_f: - seen_f.add(pair[0]) - unique_f.append(pair) + unique_f = deduplicate(first_heb_forms(future_row)) if len(unique_f) >= 1: store("future_1s", unique_f[0][0], unique_f[0][1]) if len(unique_f) >= 2: @@ -386,14 +362,14 @@ def _extract_binyan_from_page(soup: BeautifulSoup) -> str: """Extract binyan from page header span.""" for h3 in soup.find_all("h3", class_="page-header"): text = h3.get_text(" ", strip=True) - for bname in ["Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al"]: + for bname in BINYAN_NAMES: if bname in text: return bname # Also try og:description meta = soup.find("meta", {"property": "og:description"}) if meta: desc = meta.get("content", "") - for bname in ["Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al"]: + for bname in BINYAN_NAMES: if bname in desc: return bname return "" @@ -403,15 +379,15 @@ def _extract_passive_binyan_from_page(soup: BeautifulSoup) -> str: """Extract passive binyan name from passive section heading.""" for h3 in soup.find_all("h3"): text = h3.get_text(" ", strip=True) - if "passive" in text.lower() or "Passive" in text: - for bname in ["Pu'al", "Huf'al"]: + if "passive" in text.lower(): + for bname in ("Pu'al", "Huf'al"): if bname in text: return bname # Infer: Pa'al/Pi'el → Pu'al; Hif'il → Huf'al (stored as span text) span = h3.find("span", class_="small") if span: span_text = span.get_text(strip=True) - for bname in ["Pu'al", "Huf'al"]: + for bname in ("Pu'al", "Huf'al"): if bname in span_text: return bname return "" diff --git a/frequency_lookup.py b/frequency_lookup.py index cdb2d90..bbdc8dc 100644 --- a/frequency_lookup.py +++ b/frequency_lookup.py @@ -52,12 +52,10 @@ def load(cache_path: Path = CACHE_PATH) -> None: line = line.strip() if not line: continue - parts = line.split() - if len(parts) >= 1: - word = _strip_nikkud(parts[0]) - if word and word not in _freq: - _freq[word] = rank - rank += 1 + word = _strip_nikkud(line.split()[0]) + if word and word not in _freq: + _freq[word] = rank + rank += 1 cache_path.parent.mkdir(parents=True, exist_ok=True) with open(cache_path, "w", encoding="utf-8") as f: diff --git a/image_fetch.py b/image_fetch.py index 35ff814..f20a763 100644 --- a/image_fetch.py +++ b/image_fetch.py @@ -41,7 +41,7 @@ REQUEST_TIMEOUT = 10 # Abstract noun suffixes — words whose English meaning ends in these are skipped ABSTRACT_SUFFIXES = ( "tion", "ity", "ness", "ment", "ance", "ence", "ism", - "hood", "ship", "ure", "ism", "age", + "hood", "ship", "ure", "age", ) session = requests.Session() @@ -67,10 +67,7 @@ def is_concrete(english_meaning: str) -> bool: return False # Check last word for abstract suffixes last = words[-1] if words else "" - for suffix in ABSTRACT_SUFFIXES: - if last.endswith(suffix): - return False - return True + return not any(last.endswith(suffix) for suffix in ABSTRACT_SUFFIXES) def _safe_name(word_no_nikkud: str) -> str: @@ -227,8 +224,8 @@ def run(limit: int | None = None, dry_run: bool = False, single_word: str | None try: df = pd.read_csv(dict_csv, sep=";", index_col=0) if df.shape[1] < 3: - raise ValueError - except Exception: + raise ValueError("too few columns") + except (ValueError, pd.errors.ParserError): df = pd.read_csv(dict_csv, index_col=0) cache = load_cache() @@ -238,7 +235,7 @@ def run(limit: int | None = None, dry_run: bool = False, single_word: str | None skipped_cached = 0 for _, row in df.iterrows(): - if limit and processed >= limit: + if limit is not None and processed >= limit: break word = str(row.get("Word", "")).strip() diff --git a/run.py b/run.py index c49611c..a268c7e 100644 --- a/run.py +++ b/run.py @@ -22,6 +22,7 @@ import logging import re import sys import time +import unicodedata from pathlib import Path sys.path.insert(0, str(Path(__file__).parent)) @@ -78,7 +79,7 @@ def step_scrape(args): logger.info(f" Saved Anki CSV → {anki_csv}") -def step_frequency(): +def step_frequency() -> dict[str, int]: """Step 2 — load/download word frequency data.""" logger.info("[2] Loading word frequency data …") import frequency_lookup @@ -109,8 +110,8 @@ def step_examples(args, freq_cache: dict): try: df = pd.read_csv(dict_csv, sep=";", index_col=0) if df.shape[1] < 3: - raise ValueError - except Exception: + raise ValueError("too few columns") + except (ValueError, pd.errors.ParserError): df = pd.read_csv(dict_csv, index_col=0) if args.test: @@ -152,13 +153,12 @@ def step_audio(args): import pandas as pd import requests - import unicodedata try: try: df = pd.read_csv(dict_csv, sep=";", index_col=0) if df.shape[1] < 3: - raise ValueError - except Exception: + raise ValueError("too few columns") + except (ValueError, pd.errors.ParserError): df = pd.read_csv(dict_csv, index_col=0) if args.test: @@ -168,7 +168,7 @@ def step_audio(args): downloaded = 0 skipped = 0 - def strip_nik(t): + def strip_nik(t: str) -> str: return "".join(c for c in unicodedata.normalize("NFD", t) if unicodedata.category(c) != "Mn") @@ -345,7 +345,7 @@ def step_images(args) -> dict: return image_fetch.run(limit=limit) -def step_build_vocab(args, examples_cache: dict, freq_cache: dict, image_cache: dict = None): +def step_build_vocab(args, examples_cache: dict, freq_cache: dict, image_cache: dict | None = None): """Step 5 — build vocabulary .apkg.""" logger.info("[5] Building vocabulary deck …") import apkg_builder @@ -406,8 +406,8 @@ def print_summary(args, examples_cache, freq_cache, conjugations): try: df = pd.read_csv(dict_csv, sep=";", index_col=0) if df.shape[1] < 3: - raise ValueError - except Exception: + raise ValueError("too few columns") + except (ValueError, pd.errors.ParserError): df = pd.read_csv(dict_csv, index_col=0) logger.info(f" Dictionary words: {len(df)}")