hebrew_flash_cards/scripts/migrate_to_json.py
Sochen 08fb7009d8 Sprint 11: unified JSON architecture + consolidated scraping pipeline
Migrate from fragmented CSV + 10 JSON files to a single data/words.json
(9,104 entries) as the unified data store. All GUIDs preserved for Anki
study progress continuity.

New files:
- SCHEMA.yaml: authoritative schema for words.json
- pealim_list_scrape.py: consolidated list page scraper → words.json
- pealim_detail_scrape.py: noun/verb detail scraper → words.json
- pealim_audio_download.py: audio downloader reading from words.json
- scripts/migrate_to_json.py: one-time CSV→JSON migration
- scripts/validate_data.py: 17 data integrity tests
- scripts/check_guid_coverage.py: GUID preservation checker
- scripts/repair_slugs.py: slug deduplication repair tool
- tests/test_scraper_integration.py: live scraper integration tests

Updated:
- apkg_builder.py: reads from words.json (no more pandas)
- run.py: 8-step pipeline (list scrape → frequency → examples →
  detail scrape → audio download → fonts → images → build)
- benyehuda.py, frequency_lookup.py, image_fetch.py: TODO markers
  for future words.json integration

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 10:54:58 +00:00

1041 lines
41 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Migration script: builds data/words.json from all existing data sources.
Run:
python3 scripts/migrate_to_json.py
python3 scripts/migrate_to_json.py --dry-run
"""
from __future__ import annotations
import argparse
import csv
import json
import logging
import re
import sys
import unicodedata
from collections import defaultdict
from pathlib import Path
from typing import Any
import genanki
# ---------------------------------------------------------------------------
# Bootstrap: parent package helpers
# ---------------------------------------------------------------------------
sys.path.insert(0, str(Path(__file__).parent.parent))
from helpers import strip_nikkud # noqa: E402
# ---------------------------------------------------------------------------
# Logging
# ---------------------------------------------------------------------------
# Configure the root logger once at import time: "<LEVEL> <message>" lines,
# INFO and above.
_LOG_FORMAT = "%(levelname)s %(message)s"
logging.basicConfig(level=logging.INFO, format=_LOG_FORMAT)

# Module-level logger used by every function in this script.
log = logging.getLogger(__name__)
# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
# This script sits one directory below the project root; the data files live
# in the sibling "data" directory.
_SCRIPT_DIR = Path(__file__).parent
DATA_DIR = _SCRIPT_DIR.parent / "data"
OUTPUT_FILE = DATA_DIR.joinpath("words.json")

# Written into every entry's "last_scrape_date" field.
MIGRATION_DATE = "2026-03-08"

# Code-point ranges treated as emoji when splitting them out of a meaning.
_EMOJI_RANGES = (
    r"\U0001F300-\U0001FFFF",
    r"\U00002600-\U000027BF",
    r"\U0001F000-\U0001F9FF",
    r"\u2600-\u26FF",
    r"\u2700-\u27BF",
)
# One or more consecutive characters from any of the ranges above.
EMOJI_RE = re.compile("[" + "".join(_EMOJI_RANGES) + "]+", re.UNICODE)
# NFC-normalise once; used throughout for consistent Unicode comparisons.
def _nfc(s: str) -> str:
return unicodedata.normalize("NFC", s)
# ---------------------------------------------------------------------------
# PoS → Hebrew mapping
# ---------------------------------------------------------------------------
# Canonical English part-of-speech label → Hebrew display name (with nikkud).
# Read by _parse_pos via .get(), so a label missing here produces an empty
# Hebrew name rather than an error.  "Numeral" and "Cardinal numeral" share
# the same Hebrew name.
POS_HEBREW: dict[str, str] = {
    "Noun": "שֵׁם עֶצֶם",
    "Verb": "פֹּעַל",
    "Adjective": "שֵׁם תֹּאַר",
    "Adverb": "תֹּאַר הַפֹּעַל",
    "Pronoun": "כִּנּוּי גּוּף",
    "Preposition": "מִילַּת יַחַס",
    "Conjunction": "מִילַּת חִבּוּר",
    "Interjection": "מִילַּת קְרִיאָה",
    "Numeral": "שֵׁם מִסְפָּר",
    "Cardinal numeral": "שֵׁם מִסְפָּר",
    "Particle": "מִילִּית",
    "Determiner": "מְגַדִּיר",
    "Existential": "מִילַּת קִיּוּם",
    "Interrogative": "מִילַּת שְׁאֵלָה",
}
# English binyan name → Hebrew binyan name (with nikkud).
# _parse_pos matches these keys case-insensitively and appends the Hebrew
# name to pos_hebrew for verbs; migrate() looks the key up verbatim to fill
# the conjugation block's "binyan_hebrew" (unknown names give "").
BINYAN_HEBREW: dict[str, str] = {
    "Pa'al": "פָּעַל",
    "Nif'al": "נִפְעַל",
    "Pi'el": "פִּיעֵל",
    "Pu'al": "פֻּעַל",
    "Hif'il": "הִפְעִיל",
    "Huf'al": "הֻפְעַל",
    "Hitpa'el": "הִתְפַּעֵל",
}
# Conjugation form key ("<tense>_<person>") → bare person code.
# The tense prefix is dropped, so e.g. "past_1s" and "future_1s" both map to
# "1s".  build_conjugation_forms logs a warning for, and skips, any form key
# that is not listed here.
FORM_KEY_TO_PERSON: dict[str, str] = {
    "present_ms": "ms",
    "present_fs": "fs",
    "present_mp": "mp",
    "present_fp": "fp",
    "past_1s": "1s",
    "past_1p": "1p",
    "past_2ms": "2ms",
    "past_2fs": "2fs",
    "past_2mp": "2mp",
    "past_2fp": "2fp",
    "past_3ms": "3ms",
    "past_3fs": "3fs",
    "past_3p": "3p",
    "future_1s": "1s",
    "future_1p": "1p",
    "future_2ms": "2ms",
    "future_2fs": "2fs",
    "future_2mp": "2mp",
    "future_2fp": "2fp",
    "future_3ms": "3ms",
    "future_3fs": "3fs",
    "future_3mp": "3mp",
    "future_3fp": "3fp",
    "imperative_ms": "ms",
    "imperative_fs": "fs",
    "imperative_mp": "mp",
    "imperative_fp": "fp",
    # Present in the table, but build_conjugation_forms skips this key before
    # the lookup because the infinitive is stored separately.
    "infinitive": "inf",
}
# Mirrors apkg_builder.PRESENT_EXPANSION — all pronoun/tense choices per present form key.
# The builder uses a per-verb seeded RNG to pick one; we store all possible GUIDs.
# Each value is a list of (pronoun, tense) pairs; _conj_guids derives one GUID
# per pair, so every present-tense form gets three candidate GUIDs.
PRESENT_EXPANSION: dict[str, list[tuple[str, str]]] = {
    "present_ms": [
        ("אֲנִי (זָכָר)", "הוֹוֶה"),
        ("אַתָּה", "הוֹוֶה"),
        ("הוּא", "הוֹוֶה"),
    ],
    "present_fs": [
        ("אֲנִי (נְקֵבָה)", "הוֹוֶה"),
        ("אַתְּ", "הוֹוֶה"),
        ("הִיא", "הוֹוֶה"),
    ],
    "present_mp": [
        ("אֲנַחְנוּ (זָכָר)", "הוֹוֶה"),
        ("אַתֶּם", "הוֹוֶה"),
        ("הֵם", "הוֹוֶה"),
    ],
    "present_fp": [
        ("אֲנַחְנוּ (נְקֵבָה)", "הוֹוֶה"),
        ("אַתֶּן", "הוֹוֶה"),
        ("הֵן", "הוֹוֶה"),
    ],
}
# Mirrors apkg_builder.PAST_3P_EXPANSION
# The two (pronoun, tense) pairs for "past_3p"; _conj_guids derives one
# candidate GUID from each.
PAST_3P_EXPANSION: list[tuple[str, str]] = [
    ("הֵם", "עָבָר"),
    ("הֵן", "עָבָר"),
]
# Mirrors apkg_builder.FP_MODERN_FALLBACK
# Feminine-plural form key → masculine-plural form key.  In this script only
# the keys are consulted (membership test in _conj_guids); the mapped values
# are not read here.
FP_MODERN_FALLBACK: dict[str, str] = {
    "future_2fp": "future_2mp",
    "future_3fp": "future_3mp",
    "imperative_fp": "imperative_mp",
}
# 1st-person forms that get a randomly assigned gender label in the builder
# _conj_guids stores two candidate GUIDs for each of these (one per gender label).
_FIRST_PERSON_GENDERED: set[str] = {"past_1s", "past_1p", "future_1s", "future_1p"}
# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def _strip(text: str) -> str:
    """Return *text* without nikkud, delegating to ``helpers.strip_nikkud``."""
    without_nikkud = strip_nikkud(text)
    return without_nikkud
def _hebrew_word(nikkud: str) -> dict[str, str]:
    """Wrap a vocalised form as a ``{"nikkud", "ktiv_male"}`` sub-object.

    ``ktiv_male`` is the same form passed through ``_strip``.
    """
    return dict(nikkud=nikkud, ktiv_male=_strip(nikkud))
def _parse_root(raw: str) -> list[str]:
"""Parse root string like 'שׁ - מ - ר' into list of consonants.
Returns empty list for '-' or empty input.
"""
raw = raw.strip()
if not raw or raw == "-":
return []
parts = [p.strip() for p in raw.split(" - ")]
return [p for p in parts if p]
def _extract_emoji(meaning: str) -> tuple[str, str | None]:
    """Separate emoji from a meaning string.

    Returns:
        ``(clean_meaning, emoji)``.  ``emoji`` is the first run of emoji
        characters found, or ``None`` when there is none.  ``clean_meaning``
        has every emoji run removed, whitespace runs collapsed to a single
        space and the ends trimmed.
    """
    first_run = EMOJI_RE.search(meaning)
    if first_run is None:
        return meaning.strip(), None

    without_emoji = EMOJI_RE.sub("", meaning).strip()
    # Removing an emoji from the middle of the text leaves a double space.
    tidy = re.sub(r"\s{2,}", " ", without_emoji).strip()
    return tidy, first_run.group(0)
def _parse_pos(raw_pos: str) -> tuple[str, str]:
    """Return ``(pos_english, pos_hebrew)`` for a raw part-of-speech string.

    The PoS label comes first; any details follow a dash separator (en-dash
    U+2013 or em-dash U+2014, surrounding spaces optional).  When no dash is
    present, the first word is the label and the remaining words are the
    details.

    Examples (the separator shown as ``–`` is U+2013):
        - "Noun – masculine"               → ("Noun", "שֵׁם עֶצֶם")
        - "Verb – pa'al"                   → ("Verb", "פֹּעַל — פָּעַל")
        - "Noun – ketel pattern, feminine" → ("Noun", "שֵׁם עֶצֶם")
        - "–" or ""                        → ("Existential", "מִילַּת קִיּוּם")
        - "Cardinal numeral – masculine"   → ("Cardinal numeral", "שֵׁם מִסְפָּר")

    Args:
        raw_pos: Part-of-speech text as found in the source data.

    Returns:
        The canonical English label and its Hebrew name.  The Hebrew name is
        ``""`` when the label is not in ``POS_HEBREW``.  For verbs with a
        recognised binyan it is ``"<verb> — <binyan>"``.
    """
    # Dash characters are spelled as escapes so they stay visible in the
    # source and cannot be silently dropped or confused with a hyphen.
    bare_dashes = ("", "-", "\u2013", "\u2014")
    dash_separator = r"\s*[\u2013\u2014]\s*"

    raw_pos = raw_pos.strip()
    # A bare dash (or nothing at all) is the label carried by יש / אין.
    if raw_pos in bare_dashes:
        return "Existential", POS_HEBREW["Existential"]

    head, *rest = re.split(dash_separator, raw_pos, maxsplit=1)
    first_part = head.strip()
    label_words = first_part.split()

    # "Cardinal numeral" is the only label that needs two words.
    if first_part.lower().startswith("cardinal"):
        pos_en = "Cardinal numeral"
    elif label_words:
        pos_en = label_words[0].capitalize()
    else:
        pos_en = raw_pos

    # Details: the text after the dash, or the words after the label when
    # the string carries no dash at all.
    details = rest[0] if rest else " ".join(label_words[1:])

    binyan_hebrew: str | None = None
    if pos_en == "Verb" and details:
        # Only the part before the first comma names the binyan.  A
        # typographic apostrophe is folded to ASCII to match the table keys.
        binyan_slug = details.split(",")[0].strip().replace("\u2019", "'").lower()
        binyan_hebrew = next(
            (hebrew for name, hebrew in BINYAN_HEBREW.items() if name.lower() == binyan_slug),
            None,
        )

    base_hebrew = POS_HEBREW.get(pos_en, "")
    if not binyan_hebrew:
        return pos_en, base_hebrew
    if not base_hebrew:
        return pos_en, binyan_hebrew
    # Join with " — " (U+2014), as in the documented example above.
    return pos_en, f"{base_hebrew} \u2014 {binyan_hebrew}"
def _strip_construct_hyphen(form: str) -> str:
"""Remove trailing maqqef hyphen from construct form (e.g. 'אֲבִי־''אֲבִי')."""
return form.rstrip("־").rstrip("-").strip()
# ---------------------------------------------------------------------------
# Data loaders
# ---------------------------------------------------------------------------
def load_csv(path: Path) -> list[dict[str, str]]:
    """Read a semicolon-delimited CSV file with a header row.

    Args:
        path: Location of the UTF-8 encoded CSV file.

    Returns:
        One dict per data row, keyed by the header names.
    """
    # newline="" hands newline handling to the csv module, so quoted fields
    # that contain line breaks are parsed correctly (see the csv docs).
    with path.open(encoding="utf-8", newline="") as f:
        rows = [dict(row) for row in csv.DictReader(f, delimiter=";")]
    log.info("Loaded %d rows from %s", len(rows), path.name)
    return rows
def load_json(path: Path) -> Any:
    """Parse the UTF-8 JSON file at *path* and return the decoded value.

    The number of top-level entries is logged, so the decoded value must
    support ``len()``.
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    log.info("Loaded %s (%d entries)", path.name, len(payload))
    return payload
# ---------------------------------------------------------------------------
# Build legacy GUID lookup
# ---------------------------------------------------------------------------
def build_guid_lookup(
    guid_map: dict[str, str],
) -> tuple[dict[str, str], dict[tuple[str, str], str]]:
    """Partition the legacy GUID map into two lookup tables.

    Keys of the form ``"word||meaning"`` go into a table keyed by the
    ``(word, meaning)`` pair; every other key goes into a plain word table.
    Keys are NFC-normalised before they are split or stored.

    Returns:
        ``(plain_word_table, word_and_meaning_table)``.
    """
    plain: dict[str, str] = {}
    by_word_and_meaning: dict[tuple[str, str], str] = {}
    for raw_key, guid in guid_map.items():
        key = _nfc(raw_key)
        # Only the first "||" separates the word from its meaning.
        word, separator, meaning = key.partition("||")
        if separator:
            by_word_and_meaning[(word, meaning)] = guid
        else:
            plain[key] = guid
    return plain, by_word_and_meaning
def resolve_guid(
    word_nikkud: str,
    meaning: str,
    base: dict[str, str],
    disambig: dict[tuple[str, str], str],
) -> str | None:
    """Look up the legacy GUID for a word/meaning pair.

    Resolution order:
      1. exact ``(word, meaning)`` match in *disambig*;
      2. first *disambig* entry for the same word whose stored meaning is a
         prefix of *meaning*, or which starts with the first 20 characters
         of *meaning* (copes with truncated meanings);
      3. the plain word entry in *base*.

    Both inputs are NFC-normalised before comparison.  Returns ``None`` when
    nothing matches.
    """
    word = _nfc(word_nikkud)
    gloss = _nfc(meaning)

    exact_key = (word, gloss)
    if exact_key in disambig:
        return disambig[exact_key]

    gloss_prefix = gloss[:20]
    for (candidate_word, candidate_gloss), guid in disambig.items():
        if candidate_word != word:
            continue
        if gloss.startswith(candidate_gloss) or candidate_gloss.startswith(gloss_prefix):
            return guid

    return base.get(word)
# ---------------------------------------------------------------------------
# Unique key generation
# ---------------------------------------------------------------------------
def build_unique_keys(
    rows: list[dict[str, str]],
) -> tuple[dict[int, str], list[str]]:
    """Assign a unique_key to each CSV row (by index).

    Escalation, applied only as far as needed to make a key unique:
        1. nikkud word
        2. "word|pos"             (if the nikkud word collides)
        3. "word|pos|meaning"     (if word+pos collides)
        4. "word|pos|meaning|N"   (N=2,3,… for exact duplicates; the first
           duplicate keeps the un-suffixed key)

    "pos" is the short PoS label: the text before the first dash (en-dash,
    em-dash or hyphen) of the "Part of Speech" column, or "Existential" when
    that column is empty or a bare dash.

    Args:
        rows: CSV rows; each must have "Word", "Part of Speech" and
            "Meaning" columns.

    Returns:
        idx_to_key — map from CSV row index to unique_key
        collisions — human-readable description of every collision found
    """
    collisions: list[str] = []
    idx_to_key: dict[int, str] = {}

    def _pos_short(pos: str) -> str:
        """Canonical short PoS label for key construction."""
        # Dashes are written as escapes so they cannot be silently lost.
        if pos.strip() in ("", "-", "\u2013", "\u2014"):
            return "Existential"
        return re.split(r"\s*[\u2013\u2014-]", pos, maxsplit=1)[0].strip()

    # Pass 1: plain nikkud key.
    by_word: dict[str, list[int]] = defaultdict(list)
    for i, row in enumerate(rows):
        by_word[row["Word"]].append(i)

    for word, word_rows in by_word.items():
        if len(word_rows) == 1:
            idx_to_key[word_rows[0]] = word
            continue
        collisions.append(f"Nikkud collision '{word}' ({len(word_rows)} rows) — escalating to word|pos")

        # Pass 2: word|pos.
        by_pos: dict[str, list[int]] = defaultdict(list)
        for i in word_rows:
            by_pos[f"{word}|{_pos_short(rows[i]['Part of Speech'])}"].append(i)

        for pos_key, pos_rows in by_pos.items():
            if len(pos_rows) == 1:
                idx_to_key[pos_rows[0]] = pos_key
                continue
            collisions.append(
                f" Nikkud+PoS collision '{pos_key}' ({len(pos_rows)} rows) — escalating to word|pos|meaning"
            )

            # Pass 3: word|pos|meaning.
            by_meaning: dict[str, list[int]] = defaultdict(list)
            for i in pos_rows:
                by_meaning[f"{pos_key}|{rows[i]['Meaning']}"].append(i)

            for full_key, same_rows in by_meaning.items():
                if len(same_rows) > 1:
                    # Exact duplicates: the first row keeps the key, the
                    # rest get a numeric suffix |2, |3, …
                    collisions.append(
                        f" Exact duplicate '{full_key}' ({len(same_rows)} rows, same slug) "
                        f"— appending numeric suffix"
                    )
                first, *others = same_rows
                idx_to_key[first] = full_key
                for n, i in enumerate(others, start=2):
                    idx_to_key[i] = f"{full_key}|{n}"

    # Defensive completeness check: every row index must have received a key.
    unkeyed = [i for i in range(len(rows)) if i not in idx_to_key]
    if unkeyed:
        log.error("BUG: %d rows have no unique_key assigned!", len(unkeyed))
    return idx_to_key, collisions
# ---------------------------------------------------------------------------
# Conjugation builder
# ---------------------------------------------------------------------------
def _conj_guids(
    infinitive_nikkud: str,
    form_key: str,
    form_data: dict,
) -> list[str]:
    """List every GUID a conjugation form may carry.

    Each GUID is ``genanki.guid_for(infinitive_nikkud, pronoun, tense)``.
    Which (pronoun, tense) pairs are used depends on the form key:

    - key in ``PRESENT_EXPANSION``: one GUID per listed choice;
    - ``"past_3p"``: one GUID per ``PAST_3P_EXPANSION`` choice;
    - key in ``FP_MODERN_FALLBACK``: a single GUID from the form's own
      pronoun and tense;
    - key in ``_FIRST_PERSON_GENDERED``: two GUIDs, the form's pronoun with
      " (זָכָר)" and with " (נְקֵבָה)" appended;
    - anything else: a single GUID from the form's own pronoun and tense.

    All candidates are returned so a later reader can work out which one is
    actually in use.
    """

    def _guid(pronoun: str, tense: str) -> str:
        return genanki.guid_for(infinitive_nikkud, pronoun, tense)

    # Forms whose (pronoun, tense) choices come from a fixed expansion table.
    if form_key in PRESENT_EXPANSION:
        expansion = PRESENT_EXPANSION[form_key]
    elif form_key == "past_3p":
        expansion = PAST_3P_EXPANSION
    else:
        expansion = None
    if expansion is not None:
        return [_guid(pronoun, tense) for pronoun, tense in expansion]

    own_pronoun = form_data.get("pronoun", "")
    own_tense = form_data.get("tense", "")

    # The feminine-plural fallback keys are checked before the gendered set,
    # so they always produce a single GUID.
    gendered = form_key not in FP_MODERN_FALLBACK and form_key in _FIRST_PERSON_GENDERED
    if not gendered:
        return [_guid(own_pronoun, own_tense)]

    return [
        _guid(f"{own_pronoun} (זָכָר)", own_tense),
        _guid(f"{own_pronoun} (נְקֵבָה)", own_tense),
    ]
def build_conjugation_forms(forms_dict: dict, infinitive_nikkud: str = "") -> list[dict]:
    """Turn a raw forms dict into a list of conjugation-form records.

    Args:
        forms_dict: Raw ``form_key → form_data`` mapping from conjugations.json.
        infinitive_nikkud: Vocalised infinitive used to derive GUIDs.  When
            empty, no GUIDs are generated.

    Returns:
        One record per usable form, in input order.  The "infinitive" key,
        unknown form keys (logged as a warning) and forms with an empty
        "form" value are skipped.  A record carries ``guid`` when exactly
        one candidate GUID exists and ``guid_candidates`` when several do.
    """
    records: list[dict] = []
    for key, data in forms_dict.items():
        # The infinitive is stored separately at conjugation.infinitive.
        if key == "infinitive":
            continue

        person_code = FORM_KEY_TO_PERSON.get(key)
        if person_code is None:
            log.warning("Unknown form key: %s", key)
            continue

        vocalised = data.get("form", "")
        if not vocalised:
            continue

        # All candidate GUIDs are kept rather than picking one.
        if infinitive_nikkud:
            candidates = _conj_guids(infinitive_nikkud, key, data)
            single_guid = candidates[0] if len(candidates) == 1 else None
            many_guids = candidates if len(candidates) > 1 else None
        else:
            single_guid = None
            many_guids = None

        record = {
            "person": person_code,
            "tense": data.get("tense", ""),
            "pronoun_hebrew": data.get("pronoun", ""),
            "form": _hebrew_word(vocalised),
            "audio_url": data.get("audio_url") or None,
            "audio_file": None,
            "guid": single_guid,
            "guid_candidates": many_guids,
        }
        records.append(record)
    return records
# ---------------------------------------------------------------------------
# Main migration
# ---------------------------------------------------------------------------
def migrate(dry_run: bool = False) -> None:  # noqa: C901 (complex but linear)
    """Build ``data/words.json`` from the legacy CSV and JSON data files.

    Steps:
      1. load every source file under ``DATA_DIR``;
      2. index the JSON sources for lookup by slug / nikkud / stripped word;
      3. assign a unique key to every CSV row and mark exact-duplicate rows
         (numeric ``|N`` suffix, same slug as the base row) to be skipped;
      4. group slugs that share a "Word Without Nikkud" value (confusables);
      5. build one entry per remaining CSV row;
      6. second pass: fill ``shared_roots``, ``confusable_group``,
         ``confusables_guid`` and the slug-based ``audio_file``;
      7. log summary statistics;
      8. write ``OUTPUT_FILE`` (skipped on a dry run).

    Args:
        dry_run: When true, log the statistics but do not write the output file.
    """
    # ------------------------------------------------------------------
    # 1. Load all sources
    # ------------------------------------------------------------------
    csv_rows = load_csv(DATA_DIR / "hebrew_dict_for_anki.csv")
    conjugations: dict = load_json(DATA_DIR / "conjugations.json")
    noun_plurals: dict = load_json(DATA_DIR / "noun_plurals.json")
    vetted_sentences: dict = load_json(DATA_DIR / "vetted_sentences.json")
    guid_map_raw: dict = load_json(DATA_DIR / "legacy_guid_map.json")
    refined_meanings: dict = load_json(DATA_DIR / "refined_meanings.json")
    image_cache: dict = load_json(DATA_DIR / "image_cache.json")
    frequency_cache: dict = load_json(DATA_DIR / "frequency_cache.json")
    # ------------------------------------------------------------------
    # 2. Pre-process lookups
    # ------------------------------------------------------------------
    guid_base, guid_disambig = build_guid_lookup(guid_map_raw)
    # noun_plurals: two lookup maps — by slug (primary), by nikkud singular (fallback)
    plurals_by_slug: dict[str, dict] = {}
    plurals_by_nikkud: dict[str, dict] = {}
    for pdata in noun_plurals.values():
        slug = pdata.get("slug", "")
        if slug:
            plurals_by_slug[slug] = pdata
        sing = _nfc(pdata.get("singular", ""))
        if sing:
            plurals_by_nikkud[sing] = pdata
    # vetted_sentences: keyed by stripped word; build NFC lookup of word_nikkud too
    # (later entries with the same stripped word overwrite earlier ones)
    sentences_by_stripped: dict[str, dict] = {}
    for sdata in vetted_sentences.values():
        wn = sdata.get("word_nikkud", "")
        if wn:
            sentences_by_stripped[_strip(wn)] = sdata
    # conjugations: indexed by slug (100% coverage) and by stripped infinitive
    # Some active/passive pairs share the same slug (e.g. הופל/להפיל → 1231-lehapil).
    # When slug collides, always prefer the ACTIVE verb in conj_by_slug so the
    # entry is correctly associated with its active conjugation data.
    conj_by_slug: dict[str, dict] = {}
    conj_by_stripped_inf: dict[str, dict] = {}
    for cdata in conjugations.values():
        slug = cdata.get("slug", "")
        if slug:
            existing = conj_by_slug.get(slug)
            if existing is None:
                conj_by_slug[slug] = cdata
            elif cdata.get("is_passive") and not existing.get("is_passive"):
                # Keep the active verb; skip overwriting with passive
                pass
            elif existing.get("is_passive") and not cdata.get("is_passive"):
                # Replace passive with active
                conj_by_slug[slug] = cdata
            else:
                # Same voice on both sides: the later record wins.
                conj_by_slug[slug] = cdata
        inf = cdata.get("infinitive", "")
        if inf:
            conj_by_stripped_inf[_strip(inf)] = cdata
    # Build passive→active link:
    # passive verbs store reference_form = nikkud infinitive of the ACTIVE verb
    # We need: active_slug → passive_conj_data
    passive_by_active_slug: dict[str, dict] = {}
    for cdata in conjugations.values():
        if not cdata.get("is_passive"):
            continue
        ref_nikkud = cdata.get("reference_form", "")
        ref_stripped = _strip(ref_nikkud)
        # find the active verb's slug
        active_cdata = conj_by_stripped_inf.get(ref_stripped)
        if active_cdata:
            active_slug = active_cdata.get("slug", "")
            if active_slug:
                passive_by_active_slug[active_slug] = cdata
        else:
            log.warning(
                "Passive verb '%s' references active '%s' (stripped='%s') — no match in conjugations",
                cdata.get("infinitive"),
                ref_nikkud,
                ref_stripped,
            )
    # refined_meanings: NFC-keyed
    refined_nfc: dict[str, str] = {_nfc(k): v for k, v in refined_meanings.items()}
    # image_cache: stripped-word keyed
    image_stripped: dict[str, str | None] = dict(image_cache)
    # frequency_cache: stripped-word keyed (entries whose value is None are dropped)
    freq_stripped: dict[str, int] = {k: int(v) for k, v in frequency_cache.items() if v is not None}
    # ------------------------------------------------------------------
    # 3. Assign unique keys
    # ------------------------------------------------------------------
    idx_to_key, collisions = build_unique_keys(csv_rows)
    for msg in collisions:
        log.info("KEY COLLISION: %s", msg)
    log.info("Collision summary: %d collision events", len(collisions))
    # ------------------------------------------------------------------
    # 3b. Identify exact-duplicate |N suffix rows to skip
    # ------------------------------------------------------------------
    # |N suffix rows (N=2,3,…) are true CSV exact-duplicates that share the
    # same slug as the base entry. We drop them entirely so the unique_key
    # space stays clean and no GUID collisions are emitted.
    # NOTE(review): ``re`` is already imported at module level; this local
    # alias is redundant.
    import re as _re
    _dup_indices: set[int] = set()
    for _i, _k in idx_to_key.items():
        if _re.search(r"\|\d+$", _k):
            _base_k = _re.sub(r"\|\d+$", "", _k)
            # Linear scan for the row that owns the un-suffixed key.
            _base_i = next((j for j, kk in idx_to_key.items() if kk == _base_k), None)
            # A suffixed row is only dropped when its slug matches the base row's.
            if _base_i is not None and csv_rows[_i]["slug"] == csv_rows[_base_i]["slug"]:
                _dup_indices.add(_i)
    if _dup_indices:
        log.info(
            "Skipping %d exact-duplicate |N suffix rows (same slug as base entry)",
            len(_dup_indices),
        )
    # ------------------------------------------------------------------
    # 4. Confusable groups: group by ktiv_male (from ktiv_male_forms)
    # ------------------------------------------------------------------
    # Build: stripped_word → set of slugs sharing that ktiv_male form
    # We care about the *base* form (absolute_singular or absolute form of the word).
    # Strategy: use "Word Without Nikkud" from CSV as ktiv_male, then group slugs.
    # A confusable group = multiple *different* slugs sharing the same ktiv_male.
    slug_to_ktiv_male: dict[str, str] = {}
    for row in csv_rows:
        slug_to_ktiv_male[row["slug"]] = row["Word Without Nikkud"]
    ktiv_male_to_slugs: dict[str, set[str]] = defaultdict(set)
    for slug, km in slug_to_ktiv_male.items():
        ktiv_male_to_slugs[km].add(slug)
    # Only keep those with >1 distinct slug
    confusable_slug_groups: dict[str, set[str]] = {
        km: slugs for km, slugs in ktiv_male_to_slugs.items() if len(slugs) > 1
    }
    log.info("Confusable ktiv_male groups: %d", len(confusable_slug_groups))
    # Build reverse: slug → list of co-confusable slugs
    slug_to_confusable_slugs: dict[str, set[str]] = {}
    for _km, slugs in confusable_slug_groups.items():
        for slug in slugs:
            slug_to_confusable_slugs[slug] = slugs - {slug}
    # We need to map slug → unique_key(s) for the confusable_group field
    # But unique_key is per-row; one slug may map to multiple keys (duplicate entries with same slug).
    # Exclude exact-duplicate rows so dropped entries don't pollute confusable groups.
    slug_to_unique_keys: dict[str, list[str]] = defaultdict(list)
    for i, row in enumerate(csv_rows):
        if i not in _dup_indices:
            slug_to_unique_keys[row["slug"]].append(idx_to_key[i])
    # ------------------------------------------------------------------
    # 5. Build entries
    # ------------------------------------------------------------------
    words: dict[str, dict] = {}
    stats = {
        "total": 0,
        "has_conjugation": 0,
        "has_noun_inflection": 0,
        "has_examples": 0,
        "has_guid": 0,
        "has_image": 0,
        "has_frequency": 0,
        "has_hint": 0,
        "has_emoji": 0,
        "key_collisions": len(collisions),
    }
    for i, row in enumerate(csv_rows):
        if i in _dup_indices:
            continue
        unique_key = idx_to_key[i]
        word_nikkud = row["Word"]
        word_ktiv = row["Word Without Nikkud"]
        slug = row["slug"]
        raw_pos = row["Part of Speech"]
        meaning_raw = row["Meaning"]
        audio_url = row["audio_url"] or None
        tags = row["tags"] or ""
        # -- PoS
        pos_en, pos_hebrew = _parse_pos(raw_pos)
        # -- Root
        root = _parse_root(row["Root"])
        # -- Meaning + emoji
        meaning_clean, emoji_char = _extract_emoji(meaning_raw)
        # -- GUID
        guid = resolve_guid(word_nikkud, meaning_raw, guid_base, guid_disambig)
        if guid:
            stats["has_guid"] += 1
        # -- Frequency (keyed by ktiv_male / stripped)
        frequency = freq_stripped.get(word_ktiv)
        # NOTE(review): a stored frequency of 0 is kept in the entry but is
        # not counted in this statistic.
        if frequency:
            stats["has_frequency"] += 1
        # -- Image
        image_filename = image_stripped.get(word_ktiv)
        if image_filename:
            stats["has_image"] += 1
        # -- Hint (refined_meanings, NFC-keyed by nikkud)
        hint = refined_nfc.get(_nfc(word_nikkud), "")
        if hint:
            stats["has_hint"] += 1
        # -- Examples (vetted_sentences keyed by stripped word)
        examples_block: dict | None = None
        s_data = sentences_by_stripped.get(word_ktiv)
        if s_data:
            good = s_data.get("good_sentences", [])
            if good:
                vetted_list = [
                    {
                        "text": s["text"],
                        "source": s.get("book", "unknown"),
                        "vetted": True,
                    }
                    for s in good
                ]
                # Pick best cloze sentence (first good one)
                cloze_sent = good[0]
                # cloze_guid: deterministic ID for the cloze card on this vocab note.
                # Pattern: guid_for(word_nikkud, "cloze") — unique per word.
                _cloze_guid = genanki.guid_for(word_nikkud, "cloze")
                _cloze_text = cloze_sent["text"]
                # Compute cloze_word_start / cloze_word_end from the text.
                # Strategy (in order):
                # 1. Use stored offsets if already present in source data.
                # 2. Exact nikkud form search.
                # 3. Exact ktiv_male (plain consonants) search in the raw text.
                # 4. Scan each Hebrew word token in the text; match by stripped consonants.
                # This handles inflected/construct/plural forms with different nikkud.
                _cw_start: int | None = cloze_sent.get("cloze_word_start")
                _cw_end: int | None = cloze_sent.get("cloze_word_end")
                if _cw_start is None or _cw_end is None:
                    _idx = _cloze_text.find(word_nikkud)
                    if _idx >= 0:
                        _cw_start = _idx
                        _cw_end = _idx + len(word_nikkud)
                    else:
                        # Try exact ktiv_male substring
                        _idx2 = _cloze_text.find(word_ktiv)
                        if _idx2 >= 0:
                            _cw_start = _idx2
                            _cw_end = _idx2 + len(word_ktiv)
                        else:
                            # Scan Hebrew word tokens; find one whose stripped form
                            # matches word_ktiv (handles inflected/construct/plural).
                            # NOTE(review): re-created for every word that reaches
                            # this branch; could be hoisted to module level.
                            _HEBREW_TOK = re.compile(
                                r"[\u05D0-\u05FA\u05B0-\u05BD\u05BF\u05C1\u05C2\u05C7"
                                r"\uFB1D-\uFB4E]+"
                            )
                            for _m in _HEBREW_TOK.finditer(_cloze_text):
                                if _strip(_m.group(0)) == word_ktiv:
                                    _cw_start = _m.start()
                                    _cw_end = _m.end()
                                    break
                            # else leave both as None
                cloze_block = {
                    "text": _cloze_text,
                    "cloze_word_start": _cw_start,
                    "cloze_word_end": _cw_end,
                    "cloze_hint": cloze_sent.get("cloze_hint"),
                    "cloze_guid": _cloze_guid,
                }
                examples_block = {
                    "vetted": vetted_list,
                    "cloze": cloze_block,
                    "rejected_count": s_data.get("rejected_count", 0),
                }
                stats["has_examples"] += 1
        # -- Noun inflection
        noun_inflection: dict | None = None
        # Slug match first; fall back to the NFC-normalised nikkud singular.
        pdata = plurals_by_slug.get(slug) or plurals_by_nikkud.get(_nfc(word_nikkud))
        if pdata and pos_en.startswith("Noun"):
            def _hw_or_null(nk: str) -> dict | None:
                # Drop the trailing construct hyphen, then wrap; empty → None.
                nk = _strip_construct_hyphen(nk)
                return _hebrew_word(nk) if nk else None
            gender = pdata.get("gender") or None
            gender_hebrew_map = {
                "masculine": {"nikkud": "זָכָר", "ktiv_male": "זכר"},
                "feminine": {"nikkud": "נְקֵבָה", "ktiv_male": "נקבה"},
            }
            # Plural GUID mirrors apkg_builder line 1609: guid_for("plural", singular_nikkud)
            _plural_singular_nikkud = pdata.get("singular", "")
            _plurals_guid = genanki.guid_for("plural", _plural_singular_nikkud) if _plural_singular_nikkud else None
            noun_inflection = {
                "plurals_guid": _plurals_guid,
                "singular": _hw_or_null(pdata.get("singular", "")),
                "plural": _hw_or_null(pdata.get("plural", "")),
                "singular_audio": pdata.get("singular_audio") or None,
                "plural_audio": pdata.get("plural_audio") or None,
                "construct_singular": _hw_or_null(pdata.get("construct_singular", "")),
                "construct_plural": _hw_or_null(pdata.get("construct_plural", "")),
                "pronominal_suffixes": None,
                "gender": gender,
                "gender_hebrew": gender_hebrew_map.get(gender) if gender else None,
                "mishkal": pdata.get("mishkal") or None,
                "mishkal_hebrew": None,
            }
            stats["has_noun_inflection"] += 1
        # -- Verb conjugation
        conjugation_block: dict | None = None
        cdata = conj_by_slug.get(slug)
        if cdata and not cdata.get("is_passive"):
            # This entry is an active verb with conjugation data
            forms_dict = cdata.get("forms", {})
            # Resolve infinitive nikkud for GUID generation (prefer forms dict, fall back to cdata key)
            _inf_data = forms_dict.get("infinitive", {})
            _inf_nikkud_for_guid = _inf_data.get("form", "") or cdata.get("infinitive", "")
            active_forms = build_conjugation_forms(forms_dict, _inf_nikkud_for_guid)
            # Passive counterpart, if any
            passive_cdata = passive_by_active_slug.get(slug)
            hufal_pual_forms: list | None = None
            reference_form_passive: dict | None = None
            if passive_cdata:
                passive_forms_dict = passive_cdata.get("forms", {})
                _passive_inf_data = passive_forms_dict.get("infinitive", {})
                _passive_inf_nikkud = _passive_inf_data.get("form", "") or passive_cdata.get("infinitive", "")
                hufal_pual_forms = build_conjugation_forms(passive_forms_dict, _passive_inf_nikkud)
                # reference_form of passive = active infinitive; 3ms past is in its forms
                rf_passive_nikkud = passive_cdata.get("forms", {}).get("past_3ms", {}).get("form", "")
                if rf_passive_nikkud:
                    reference_form_passive = _hebrew_word(rf_passive_nikkud)
            # Infinitive form (from forms dict)
            inf_form_data = forms_dict.get("infinitive", {})
            inf_nikkud = inf_form_data.get("form", "") or cdata.get("infinitive", "")
            infinitive_hw = _hebrew_word(inf_nikkud) if inf_nikkud else None
            # Reference form
            ref_nikkud = cdata.get("reference_form", "")
            reference_form_hw = _hebrew_word(ref_nikkud) if ref_nikkud else None
            binyan = cdata.get("binyan", "")
            binyan_hebrew = BINYAN_HEBREW.get(binyan, "")
            conjugation_block = {
                "in_conjugation_deck": True,
                "infinitive": infinitive_hw,
                "reference_form": reference_form_hw,
                "binyan": binyan,
                "binyan_hebrew": binyan_hebrew,
                "prep": None,
                "active_forms": active_forms,
                "hufal_pual_forms": hufal_pual_forms,
                "reference_form_passive": reference_form_passive,
            }
            stats["has_conjugation"] += 1
        elif cdata and cdata.get("is_passive"):
            # Passive-only entry: store a minimal conjugation block referencing the active verb
            # (the passive forms are stored under "active_forms" in this case)
            binyan = cdata.get("binyan", "")
            binyan_hebrew = BINYAN_HEBREW.get(binyan, "")
            forms_dict = cdata.get("forms", {})
            _passive_only_inf_data = forms_dict.get("infinitive", {})
            _passive_only_inf_nikkud = _passive_only_inf_data.get("form", "") or cdata.get("infinitive", "")
            passive_forms = build_conjugation_forms(forms_dict, _passive_only_inf_nikkud)
            inf_form_data = forms_dict.get("infinitive", {})
            inf_nikkud = inf_form_data.get("form", "") or cdata.get("infinitive", "")
            infinitive_hw = _hebrew_word(inf_nikkud) if inf_nikkud else None
            ref_nikkud = cdata.get("reference_form", "")
            reference_form_hw = _hebrew_word(ref_nikkud) if ref_nikkud else None
            conjugation_block = {
                "in_conjugation_deck": True,
                "infinitive": infinitive_hw,
                "reference_form": reference_form_hw,
                "binyan": binyan,
                "binyan_hebrew": binyan_hebrew,
                "prep": None,
                "active_forms": passive_forms,
                "hufal_pual_forms": None,
                "reference_form_passive": None,
            }
            stats["has_conjugation"] += 1
        # -- Confusable group (filled in pass 2 below)
        # -- Shared roots (filled in pass 2 below)
        # -- Audio filename: slug-based for confusables, word-based otherwise
        # (the word-based name is set here; pass 2 overrides it for confusables)
        audio_file = f"{word_ktiv}.mp3"
        entry: dict = {
            "word": {"nikkud": word_nikkud, "ktiv_male": word_ktiv},
            "slug": slug,
            "root": root,
            "pos": pos_en,
            "pos_hebrew": pos_hebrew,
            "meaning": meaning_clean,
            "meaning_raw": meaning_raw,
            "audio_url": audio_url,
            "audio_file": audio_file,
            "tags": tags,
            "last_scrape_date": MIGRATION_DATE,
            # Identity
            "vocab_legacy_guid": guid,
            # Frequency
            "frequency": frequency,
            "pseudo_frequency": None,
            # Display
            "emoji": emoji_char,
            "emoji_source": "from_pealim" if emoji_char else None,
            "emoji_visible": False,
            "image": image_filename,
            "image_source": "wikipedia" if image_filename else None,
            "hint": hint,
            # Populated in pass 2
            "shared_roots": [],
            "confusable_group": None,
            "confusables_guid": None,
            # Sub-sections
            "examples": examples_block,
            "noun_inflection": noun_inflection,
            "conjugation": conjugation_block,
            "adjective_inflection": None,
            "preposition_inflection": None,
        }
        if emoji_char:
            stats["has_emoji"] += 1
        if unique_key in words:
            # NOTE(review): the second number logged is the position of the key
            # within ``words``, which presumably differs from the CSV row index
            # once duplicate rows have been skipped — confirm intent.
            log.warning(
                "DUPLICATE unique_key '%s' — row %d would overwrite row %d",
                unique_key,
                i,
                list(words.keys()).index(unique_key),
            )
        words[unique_key] = entry
        stats["total"] += 1
    # ------------------------------------------------------------------
    # 6. Pass 2 — shared_roots and confusable_group
    # ------------------------------------------------------------------
    # shared_roots: group unique_keys by root tuple
    root_to_keys: dict[tuple, list[str]] = defaultdict(list)
    for uk, entry in words.items():
        r = entry["root"]
        if r:
            root_to_keys[tuple(r)].append(uk)
    for uks in root_to_keys.values():
        if len(uks) > 1:
            for uk in uks:
                # Every other key that has the same root, excluding the entry itself.
                words[uk]["shared_roots"] = [k for k in uks if k != uk]
    # confusable_group: update audio_file to slug-based for confusable words
    # Also set confusables_guid: genanki.guid_for("confusable", ktiv_male)
    # where ktiv_male is the shared stripped form (key in confusable_slug_groups).
    # Build reverse: slug → ktiv_male (for GUID generation)
    slug_to_confusable_ktiv_male: dict[str, str] = {}
    for km, slugs in confusable_slug_groups.items():
        for slug_in_group in slugs:
            slug_to_confusable_ktiv_male[slug_in_group] = km
    for i, row in enumerate(csv_rows):
        if i in _dup_indices:
            continue
        slug = row["slug"]
        uk = idx_to_key[i]
        co_slugs = slug_to_confusable_slugs.get(slug, set())
        if co_slugs:
            # Gather all unique_keys for co-confusable slugs
            group_keys: list[str] = []
            for co_slug in co_slugs:
                group_keys.extend(slug_to_unique_keys.get(co_slug, []))
            group_keys.append(uk)
            # De-duplicate and sort so the group list is deterministic.
            group_keys = sorted(set(group_keys))
            words[uk]["confusable_group"] = group_keys
            # confusables_guid: mirrors apkg_builder line 1401
            ktiv_male_key = slug_to_confusable_ktiv_male.get(slug, "")
            if ktiv_male_key:
                words[uk]["confusables_guid"] = genanki.guid_for("confusable", ktiv_male_key)
            # Use slug-based audio file for confusables to disambiguate
            words[uk]["audio_file"] = f"{slug}.mp3"
    # ------------------------------------------------------------------
    # 7. Stats report
    # ------------------------------------------------------------------
    log.info("=" * 60)
    log.info("MIGRATION COMPLETE — summary stats:")
    log.info(" Total entries: %d", stats["total"])
    log.info(" Key collision events: %d", stats["key_collisions"])
    log.info(" Has conjugation: %d", stats["has_conjugation"])
    log.info(" Has noun_inflection: %d", stats["has_noun_inflection"])
    log.info(" Has examples: %d", stats["has_examples"])
    log.info(" Has legacy GUID: %d", stats["has_guid"])
    log.info(" Has image: %d", stats["has_image"])
    log.info(" Has frequency: %d", stats["has_frequency"])
    log.info(" Has hint: %d", stats["has_hint"])
    log.info(" Has emoji: %d", stats["has_emoji"])
    # Confusable entries
    confusable_entries = sum(1 for e in words.values() if e["confusable_group"])
    log.info(" In confusable group: %d", confusable_entries)
    # Entries with shared roots
    with_shared_roots = sum(1 for e in words.values() if e["shared_roots"])
    log.info(" Has shared roots: %d", with_shared_roots)
    if dry_run:
        log.info("DRY RUN — output file NOT written.")
        return
    # ------------------------------------------------------------------
    # 8. Write output
    # ------------------------------------------------------------------
    # ensure_ascii=False keeps the Hebrew text readable in the output file.
    with OUTPUT_FILE.open("w", encoding="utf-8") as f:
        json.dump(words, f, ensure_ascii=False, indent=2)
        f.write("\n")
    log.info("Wrote %d entries to %s", len(words), OUTPUT_FILE)
# ---------------------------------------------------------------------------
# Entry point
# ---------------------------------------------------------------------------
def main() -> None:
    """Command-line entry point: parse the flags and run the migration."""
    cli = argparse.ArgumentParser(description="Migrate all pealim data sources into data/words.json")
    cli.add_argument("--dry-run", action="store_true", help="Print stats without writing the output file.")
    options = cli.parse_args()
    migrate(dry_run=options.dry_run)


# Run the migration only when executed as a script, not when imported.
if __name__ == "__main__":
    main()