Migrate from fragmented CSV + 10 JSON files to a single data/words.json (9,104 entries) as the unified data store. All GUIDs preserved for Anki study progress continuity. New files: - SCHEMA.yaml: authoritative schema for words.json - pealim_list_scrape.py: consolidated list page scraper → words.json - pealim_detail_scrape.py: noun/verb detail scraper → words.json - pealim_audio_download.py: audio downloader reading from words.json - scripts/migrate_to_json.py: one-time CSV→JSON migration - scripts/validate_data.py: 17 data integrity tests - scripts/check_guid_coverage.py: GUID preservation checker - scripts/repair_slugs.py: slug deduplication repair tool - tests/test_scraper_integration.py: live scraper integration tests Updated: - apkg_builder.py: reads from words.json (no more pandas) - run.py: 8-step pipeline (list scrape → frequency → examples → detail scrape → audio download → fonts → images → build) - benyehuda.py, frequency_lookup.py, image_fetch.py: TODO markers for future words.json integration Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1041 lines
41 KiB
Python
1041 lines
41 KiB
Python
"""Migration script: builds data/words.json from all existing data sources.
|
||
|
||
Run:
|
||
python3 scripts/migrate_to_json.py
|
||
python3 scripts/migrate_to_json.py --dry-run
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import argparse
|
||
import csv
|
||
import json
|
||
import logging
|
||
import re
|
||
import sys
|
||
import unicodedata
|
||
from collections import defaultdict
|
||
from pathlib import Path
|
||
from typing import Any
|
||
|
||
import genanki
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Bootstrap: parent package helpers
|
||
# ---------------------------------------------------------------------------
|
||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||
from helpers import strip_nikkud # noqa: E402
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Logging
|
||
# ---------------------------------------------------------------------------
|
||
# Configure logging at import time: INFO threshold, one "LEVEL message" line
# per record (no timestamp or logger name in the format).
logging.basicConfig(
    format="%(levelname)s %(message)s",
    level=logging.INFO,
)
# Module-level logger used by the loaders and migration steps in this script.
log = logging.getLogger(__name__)
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Constants
|
||
# ---------------------------------------------------------------------------
|
||
# "data" directory located next to this script's parent directory
# (i.e. <dir containing this file>/../data).
DATA_DIR = Path(__file__).parent.parent / "data"
# Path of the unified JSON store inside DATA_DIR.
OUTPUT_FILE = DATA_DIR / "words.json"
# Fixed date stamp associated with this migration run.
# NOTE(review): where this value is written is outside the visible chunk — confirm.
MIGRATION_DATE = "2026-03-08"
|
||
|
||
# Matches one contiguous run of emoji.
#
# Base code points: U+2600–U+27BF (Miscellaneous Symbols + Dingbats) and
# U+1F000–U+1FFFF (the pictograph planes).  This is exactly the union of the
# five overlapping ranges the pattern used to list, written once.
#
# Each base code point may be followed by variation selector-16 (U+FE0F) and/or
# a zero-width joiner (U+200D).  Without this, an emoji-presentation sequence
# such as U+2600 U+FE0F ("☀️") matched only its first code point and left an
# invisible U+FE0F behind in the surrounding text, and ZWJ sequences were split
# into separate runs.  A bare U+FE0F / U+200D that does not follow an emoji is
# deliberately NOT matched.
EMOJI_RE = re.compile(
    r"(?:[\u2600-\u27BF\U0001F000-\U0001FFFF][\uFE0F\u200D]*)+"
)
|
||
|
||
|
||
# NFC-normalise once; used throughout for consistent Unicode comparisons.
|
||
def _nfc(s: str) -> str:
|
||
return unicodedata.normalize("NFC", s)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# PoS → Hebrew mapping
|
||
# ---------------------------------------------------------------------------
|
||
# Canonical English part-of-speech label → Hebrew (nikkud) label.
# Keys are the values _parse_pos produces for its English PoS; a label that is
# missing here makes _parse_pos fall back to an empty Hebrew string.
# "Numeral" and "Cardinal numeral" intentionally share one Hebrew label.
POS_HEBREW: dict[str, str] = {
    "Noun": "שֵׁם עֶצֶם",
    "Verb": "פֹּעַל",
    "Adjective": "שֵׁם תֹּאַר",
    "Adverb": "תֹּאַר הַפֹּעַל",
    "Pronoun": "כִּנּוּי גּוּף",
    "Preposition": "מִילַּת יַחַס",
    "Conjunction": "מִילַּת חִבּוּר",
    "Interjection": "מִילַּת קְרִיאָה",
    "Numeral": "שֵׁם מִסְפָּר",
    "Cardinal numeral": "שֵׁם מִסְפָּר",
    "Particle": "מִילִּית",
    "Determiner": "מְגַדִּיר",
    "Existential": "מִילַּת קִיּוּם",
    "Interrogative": "מִילַּת שְׁאֵלָה",
}
|
||
|
||
# Binyan name (English transliteration) → Hebrew (nikkud) name.
# _parse_pos matches these keys case-insensitively against the text after the
# dash in a verb's PoS string and appends the Hebrew name to pos_hebrew.
BINYAN_HEBREW: dict[str, str] = {
    "Pa'al": "פָּעַל",
    "Nif'al": "נִפְעַל",
    "Pi'el": "פִּיעֵל",
    "Pu'al": "פֻּעַל",
    "Hif'il": "הִפְעִיל",
    "Huf'al": "הֻפְעַל",
    "Hitpa'el": "הִתְפַּעֵל",
}
|
||
|
||
# Conjugation form key ("<tense>_<person>") → short person code.
# The tense prefix is dropped, so e.g. "past_1s" and "future_1s" both map to "1s".
# build_conjugation_forms skips (with a warning) any form key missing from this table.
# Note the past tense has a single 3rd-person-plural key ("past_3p") while the
# future tense has separate "3mp" / "3fp" keys.
FORM_KEY_TO_PERSON: dict[str, str] = {
    "present_ms": "ms",
    "present_fs": "fs",
    "present_mp": "mp",
    "present_fp": "fp",
    "past_1s": "1s",
    "past_1p": "1p",
    "past_2ms": "2ms",
    "past_2fs": "2fs",
    "past_2mp": "2mp",
    "past_2fp": "2fp",
    "past_3ms": "3ms",
    "past_3fs": "3fs",
    "past_3p": "3p",
    "future_1s": "1s",
    "future_1p": "1p",
    "future_2ms": "2ms",
    "future_2fs": "2fs",
    "future_2mp": "2mp",
    "future_2fp": "2fp",
    "future_3ms": "3ms",
    "future_3fs": "3fs",
    "future_3mp": "3mp",
    "future_3fp": "3fp",
    "imperative_ms": "ms",
    "imperative_fs": "fs",
    "imperative_mp": "mp",
    "imperative_fp": "fp",
    # Present in the table, but build_conjugation_forms skips this key explicitly.
    "infinitive": "inf",
}
|
||
|
||
# Mirrors apkg_builder.PRESENT_EXPANSION — all pronoun/tense choices per present form key.
# The builder uses a per-verb seeded RNG to pick one; we store all possible GUIDs.
# Each value is a list of (pronoun, tense) pairs; _conj_guids feeds every pair,
# together with the infinitive, to genanki.guid_for.  These strings are GUID
# inputs: changing a single code point changes the resulting GUIDs.
PRESENT_EXPANSION: dict[str, list[tuple[str, str]]] = {
    "present_ms": [
        ("אֲנִי (זָכָר)", "הוֹוֶה"),
        ("אַתָּה", "הוֹוֶה"),
        ("הוּא", "הוֹוֶה"),
    ],
    "present_fs": [
        ("אֲנִי (נְקֵבָה)", "הוֹוֶה"),
        ("אַתְּ", "הוֹוֶה"),
        ("הִיא", "הוֹוֶה"),
    ],
    "present_mp": [
        ("אֲנַחְנוּ (זָכָר)", "הוֹוֶה"),
        ("אַתֶּם", "הוֹוֶה"),
        ("הֵם", "הוֹוֶה"),
    ],
    "present_fp": [
        ("אֲנַחְנוּ (נְקֵבָה)", "הוֹוֶה"),
        ("אַתֶּן", "הוֹוֶה"),
        ("הֵן", "הוֹוֶה"),
    ],
}
|
||
|
||
# Mirrors apkg_builder.PAST_3P_EXPANSION
# The two (pronoun, tense) pairs used by _conj_guids for the single "past_3p"
# form key; one GUID candidate is generated per pair.
PAST_3P_EXPANSION: list[tuple[str, str]] = [
    ("הֵם", "עָבָר"),
    ("הֵן", "עָבָר"),
]
|
||
|
||
# Mirrors apkg_builder.FP_MODERN_FALLBACK
# Feminine-plural form key → the masculine-plural form key of the same tense.
# In the visible part of this script only key membership is used (_conj_guids
# takes pronoun/tense straight from form_data for these keys); the mapped
# values are not read here.
FP_MODERN_FALLBACK: dict[str, str] = {
    "future_2fp": "future_2mp",
    "future_3fp": "future_3mp",
    "imperative_fp": "imperative_mp",
}
|
||
|
||
# 1st-person forms that get a randomly assigned gender label in the builder.
# For these keys _conj_guids emits two GUID candidates, one per gender suffix
# appended to the pronoun.
_FIRST_PERSON_GENDERED: set[str] = {"past_1s", "past_1p", "future_1s", "future_1p"}
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Helpers
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _strip(text: str) -> str:
    """Return *text* with nikkud removed, delegating to ``helpers.strip_nikkud``."""
    without_nikkud = strip_nikkud(text)
    return without_nikkud
|
||
|
||
|
||
def _hebrew_word(nikkud: str) -> dict[str, str]:
    """Pair a vocalised form with its nikkud-free spelling.

    Returns a dict with two keys, in this order: ``nikkud`` (the input,
    unchanged) and ``ktiv_male`` (the input passed through ``_strip``).
    """
    plain = _strip(nikkud)
    return dict(nikkud=nikkud, ktiv_male=plain)
|
||
|
||
|
||
def _parse_root(raw: str) -> list[str]:
|
||
"""Parse root string like 'שׁ - מ - ר' into list of consonants.
|
||
|
||
Returns empty list for '-' or empty input.
|
||
"""
|
||
raw = raw.strip()
|
||
if not raw or raw == "-":
|
||
return []
|
||
parts = [p.strip() for p in raw.split(" - ")]
|
||
return [p for p in parts if p]
|
||
|
||
|
||
def _extract_emoji(meaning: str) -> tuple[str, str | None]:
    """Separate emoji from a meaning string.

    Returns:
        ``(clean_meaning, emoji)``.  When ``EMOJI_RE`` finds nothing, the
        meaning is returned stripped and ``emoji`` is ``None``.  Otherwise
        *every* emoji run is removed from the text and runs of whitespace are
        collapsed to one space, while ``emoji`` holds only the first run found.
    """
    first_hit = EMOJI_RE.search(meaning)
    if first_hit is None:
        return meaning.strip(), None

    without_emoji = EMOJI_RE.sub("", meaning).strip()
    # Removing an emoji from the middle of the text can leave a double space.
    tidy = re.sub(r"\s{2,}", " ", without_emoji).strip()
    return tidy, first_hit.group(0)
|
||
|
||
|
||
def _parse_pos(raw_pos: str) -> tuple[str, str]:
    """Derive ``(pos_english, pos_hebrew)`` from a raw part-of-speech string.

    The raw string is "<PoS> – <details>", where the separator is an en dash
    (U+2013) with optional whitespace around it.  Examples:

      - "Noun – masculine"              → ("Noun", "שֵׁם עֶצֶם")
      - "Verb –pa'al"                   → ("Verb", "פֹּעַל — פָּעַל")
      - "Noun –ketelpattern, feminine"  → ("Noun", "שֵׁם עֶצֶם")
      - "–"                             → ("Existential", "מִילַּת קִיּוּם")
      - "Cardinal numeral – masculine"  → ("Cardinal numeral", "שֵׁם מִסְפָּר")

    The English label is the first word before the dash, capitalised
    ("Cardinal numeral" is the one two-word label).  For verbs, the text
    between the dash and the first comma is matched case-insensitively against
    ``BINYAN_HEBREW`` and, when found, appended to the Hebrew label.  A label
    missing from ``POS_HEBREW`` yields an empty Hebrew string.
    """
    raw_pos = raw_pos.strip()

    # A bare dash is how the source marks the existential words (יש, אין).
    if raw_pos == "–":
        return "Existential", POS_HEBREW["Existential"]

    head, dash, tail = raw_pos.partition("–")
    head = head.strip()

    if head.lower().startswith("cardinal"):
        pos_en = "Cardinal numeral"
    elif head:
        pos_en = head.split()[0].capitalize()
    else:
        # Nothing before the dash: keep the raw text as the label.
        pos_en = raw_pos

    binyan_hebrew: str | None = None
    if pos_en == "Verb" and dash:
        wanted = tail.split(",")[0].strip().lower()
        binyan_hebrew = next(
            (heb for name, heb in BINYAN_HEBREW.items() if name.lower() == wanted),
            None,
        )

    base_hebrew = POS_HEBREW.get(pos_en, "")
    if not binyan_hebrew:
        return pos_en, base_hebrew
    if base_hebrew:
        return pos_en, f"{base_hebrew} — {binyan_hebrew}"
    return pos_en, binyan_hebrew
|
||
|
||
|
||
def _strip_construct_hyphen(form: str) -> str:
|
||
"""Remove trailing maqqef hyphen from construct form (e.g. 'אֲבִי־' → 'אֲבִי')."""
|
||
return form.rstrip("־").rstrip("-").strip()
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Data loaders
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def load_csv(path: Path) -> list[dict[str, str]]:
    """Read a semicolon-delimited CSV file into a list of row dicts.

    Args:
        path: CSV file to read.  Decoded as UTF-8; the first row is the header.

    Returns:
        One plain ``dict`` per data row, keyed by header name, in file order.

    Raises:
        OSError: If the file cannot be opened.
    """
    # newline="" hands line-ending handling to the csv module, as its
    # documentation requires; without it, newlines embedded in quoted fields
    # may be altered by universal-newline translation.
    with path.open(encoding="utf-8", newline="") as f:
        rows = [dict(row) for row in csv.DictReader(f, delimiter=";")]
    log.info("Loaded %d rows from %s", len(rows), path.name)
    return rows
|
||
|
||
|
||
def load_json(path: Path) -> Any:
    """Parse the UTF-8 JSON file at *path* and return the decoded value.

    Logs the file name together with ``len()`` of the decoded value, so the
    top-level JSON value must be sized (object, array or string).
    """
    payload = json.loads(path.read_text(encoding="utf-8"))
    log.info("Loaded %s (%d entries)", path.name, len(payload))
    return payload
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Build legacy GUID lookup
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def build_guid_lookup(
    guid_map: dict[str, str],
) -> tuple[dict[str, str], dict[tuple[str, str], str]]:
    """Partition the legacy GUID map by key shape.

    Keys of the form ``"word||meaning"`` go into a dict keyed by the
    ``(word, meaning)`` tuple (split at the first ``"||"``); every other key
    goes into a dict keyed by the word alone.  Keys are NFC-normalised before
    they are inspected, so lookups must normalise the same way.

    Returns:
        ``(plain_word_lookup, word_and_meaning_lookup)``.
    """
    plain: dict[str, str] = {}
    by_word_and_meaning: dict[tuple[str, str], str] = {}
    for original_key, guid in guid_map.items():
        key = _nfc(original_key)
        word, separator, meaning = key.partition("||")
        if separator:
            by_word_and_meaning[(word, meaning)] = guid
        else:
            plain[key] = guid
    return plain, by_word_and_meaning
|
||
|
||
|
||
def resolve_guid(
    word_nikkud: str,
    meaning: str,
    base: dict[str, str],
    disambig: dict[tuple[str, str], str],
) -> str | None:
    """Look up the legacy GUID for a word, using its meaning to disambiguate.

    Resolution order:
      1. exact ``(word, meaning)`` entry in *disambig*;
      2. first *disambig* entry for the same word whose stored meaning is a
         prefix of *meaning*, or which starts with the first 20 characters of
         *meaning* (covers meanings truncated on either side);
      3. plain-word entry in *base*.

    Args:
        word_nikkud: Vocalised word; NFC-normalised before comparison.
        meaning: Raw meaning text; NFC-normalised before comparison.
        base: Plain-word → GUID map (NFC keys).
        disambig: ``(word, meaning)`` → GUID map (NFC keys).

    Returns:
        The GUID, or ``None`` when nothing matches.
    """
    w = _nfc(word_nikkud)
    m = _nfc(meaning)

    # Prefer explicit disambiguation.
    if (w, m) in disambig:
        return disambig[(w, m)]

    # Prefix matching is only meaningful for non-empty strings: every string
    # starts with "", so an empty meaning (or an empty stored meaning) used to
    # match the first disambiguation entry for the word regardless of its
    # meaning and return an unrelated GUID.
    if m:
        prefix = m[:20]
        for (dw, dm), g in disambig.items():
            if dw == w and dm and (m.startswith(dm) or dm.startswith(prefix)):
                return g

    return base.get(w)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Unique key generation
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def build_unique_keys(
    rows: list[dict[str, str]],
) -> tuple[dict[int, str], list[str]]:
    """Give every CSV row (addressed by its index) a key no other row shares.

    A key starts as the row's "Word" value and is lengthened only on a clash:

      1. "word"                — the nikkud word is unique
      2. "word|pos"            — several rows share the word
      3. "word|pos|meaning"    — several rows share word and short PoS
      4. "word|pos|meaning|N"  — rows equal on all three; the first keeps the
                                 level-3 key, later ones get N = 2, 3, …

    Returns:
        idx_to_key — map from CSV row index to its unique key
        collisions — one description per escalation that was needed
    """
    collisions: list[str] = []
    idx_to_key: dict[int, str] = {}

    def short_pos(pos: str) -> str:
        """Reduce a raw PoS string to the label in front of its first dash."""
        if pos == "–":
            return "Existential"
        return re.split(r"\s*[–-]", pos)[0].strip()

    def bucket(indices, label_for) -> dict[str, list[int]]:
        """Group row indices by label; labels and rows keep first-seen order."""
        grouped: dict[str, list[int]] = defaultdict(list)
        for idx in indices:
            grouped[label_for(idx)].append(idx)
        return grouped

    # Level 1: the plain nikkud word.
    by_word = bucket(range(len(rows)), lambda idx: rows[idx]["Word"])
    for word, same_word in by_word.items():
        if len(same_word) == 1:
            idx_to_key[same_word[0]] = word
            continue
        collisions.append(f"Nikkud collision '{word}' ({len(same_word)} rows) — escalating to word|pos")

        # Level 2: word|pos.
        by_pos = bucket(
            same_word,
            lambda idx, word=word: f"{word}|{short_pos(rows[idx]['Part of Speech'])}",
        )
        for pos_key, same_pos in by_pos.items():
            if len(same_pos) == 1:
                idx_to_key[same_pos[0]] = pos_key
                continue
            collisions.append(
                f" Nikkud+PoS collision '{pos_key}' ({len(same_pos)} rows) — escalating to word|pos|meaning"
            )

            # Level 3: word|pos|meaning, with |N suffixes for exact duplicates.
            by_meaning = bucket(
                same_pos,
                lambda idx, pos_key=pos_key: f"{pos_key}|{rows[idx]['Meaning']}",
            )
            for full_key, same_all in by_meaning.items():
                if len(same_all) > 1:
                    collisions.append(
                        f" Exact duplicate '{full_key}' ({len(same_all)} rows, same slug) "
                        f"— appending numeric suffix"
                    )
                first, *extras = same_all
                idx_to_key[first] = full_key
                for n, idx in enumerate(extras, start=2):
                    idx_to_key[idx] = f"{full_key}|{n}"

    # Sanity check: every row index must have received a key.
    missing = [i for i in range(len(rows)) if i not in idx_to_key]
    if missing:
        log.error("BUG: %d rows have no unique_key assigned!", len(missing))

    return idx_to_key, collisions
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Conjugation builder
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def _conj_guids(
    infinitive_nikkud: str,
    form_key: str,
    form_data: dict,
) -> list[str]:
    """List every GUID the builder could assign to one conjugation form.

    Each GUID is ``genanki.guid_for(infinitive_nikkud, pronoun, tense)``; the
    (pronoun, tense) pairs depend on the form key:

      - present-tense keys: every pair listed in ``PRESENT_EXPANSION``;
      - ``past_3p``: both pairs of ``PAST_3P_EXPANSION``;
      - ``FP_MODERN_FALLBACK`` keys: the single pair stored in *form_data*;
      - 1st-person gendered keys: the *form_data* pair twice, once with
        " (זָכָר)" and once with " (נְקֵבָה)" appended to the pronoun;
      - anything else: the single pair stored in *form_data*.

    Per the file's own notes, the builder picks one candidate with a seeded
    RNG; all are returned here so the live GUID can be identified later.
    """
    if form_key in PRESENT_EXPANSION:
        pairs = PRESENT_EXPANSION[form_key]
    elif form_key == "past_3p":
        pairs = PAST_3P_EXPANSION
    else:
        pronoun = form_data.get("pronoun", "")
        tense = form_data.get("tense", "")
        # The fallback check takes precedence over the gendered expansion,
        # matching the order in which the builder tests these cases.
        gendered = form_key not in FP_MODERN_FALLBACK and form_key in _FIRST_PERSON_GENDERED
        if gendered:
            pairs = [
                (f"{pronoun} (זָכָר)", tense),
                (f"{pronoun} (נְקֵבָה)", tense),
            ]
        else:
            pairs = [(pronoun, tense)]

    return [genanki.guid_for(infinitive_nikkud, who, when) for who, when in pairs]
|
||
|
||
|
||
def build_conjugation_forms(forms_dict: dict, infinitive_nikkud: str = "") -> list[dict]:
    """Turn a raw forms dict into a list of conjugation-form records.

    Args:
        forms_dict: Raw forms dict from conjugations.json, keyed by form key.
        infinitive_nikkud: Nikkud infinitive used for GUID generation; when
            empty, no GUIDs are generated.

    Returns:
        One dict per usable form, in input order.  Skipped: the "infinitive"
        key (stored separately at conjugation.infinitive), keys missing from
        ``FORM_KEY_TO_PERSON`` (a warning is logged) and entries with an empty
        "form".  ``guid`` is set when exactly one GUID candidate exists,
        ``guid_candidates`` when there are several; the other one is ``None``.
    """
    records: list[dict] = []
    for form_key, form_data in forms_dict.items():
        if form_key == "infinitive":
            continue

        person = FORM_KEY_TO_PERSON.get(form_key)
        if person is None:
            log.warning("Unknown form key: %s", form_key)
            continue

        nikkud_form = form_data.get("form", "")
        if not nikkud_form:
            continue

        candidates: list[str] = []
        if infinitive_nikkud:
            candidates = _conj_guids(infinitive_nikkud, form_key, form_data)

        # All candidates are kept rather than one being selected here.
        single_guid = None
        several_guids = None
        if len(candidates) == 1:
            single_guid = candidates[0]
        elif len(candidates) > 1:
            several_guids = candidates

        record = {
            "person": person,
            "tense": form_data.get("tense", ""),
            "pronoun_hebrew": form_data.get("pronoun", ""),
            "form": _hebrew_word(nikkud_form),
            "audio_url": form_data.get("audio_url") or None,
            "audio_file": None,
            "guid": single_guid,
            "guid_candidates": several_guids,
        }
        records.append(record)
    return records
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Main migration
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def migrate(dry_run: bool = False) -> None: # noqa: C901 (complex but linear)
|
||
# ------------------------------------------------------------------
|
||
# 1. Load all sources
|
||
# ------------------------------------------------------------------
|
||
csv_rows = load_csv(DATA_DIR / "hebrew_dict_for_anki.csv")
|
||
conjugations: dict = load_json(DATA_DIR / "conjugations.json")
|
||
noun_plurals: dict = load_json(DATA_DIR / "noun_plurals.json")
|
||
vetted_sentences: dict = load_json(DATA_DIR / "vetted_sentences.json")
|
||
guid_map_raw: dict = load_json(DATA_DIR / "legacy_guid_map.json")
|
||
refined_meanings: dict = load_json(DATA_DIR / "refined_meanings.json")
|
||
image_cache: dict = load_json(DATA_DIR / "image_cache.json")
|
||
frequency_cache: dict = load_json(DATA_DIR / "frequency_cache.json")
|
||
# ------------------------------------------------------------------
|
||
# 2. Pre-process lookups
|
||
# ------------------------------------------------------------------
|
||
guid_base, guid_disambig = build_guid_lookup(guid_map_raw)
|
||
|
||
# noun_plurals: two lookup maps — by slug (primary), by nikkud singular (fallback)
|
||
plurals_by_slug: dict[str, dict] = {}
|
||
plurals_by_nikkud: dict[str, dict] = {}
|
||
for pdata in noun_plurals.values():
|
||
slug = pdata.get("slug", "")
|
||
if slug:
|
||
plurals_by_slug[slug] = pdata
|
||
sing = _nfc(pdata.get("singular", ""))
|
||
if sing:
|
||
plurals_by_nikkud[sing] = pdata
|
||
|
||
# vetted_sentences: keyed by stripped word; build NFC lookup of word_nikkud too
|
||
sentences_by_stripped: dict[str, dict] = {}
|
||
for sdata in vetted_sentences.values():
|
||
wn = sdata.get("word_nikkud", "")
|
||
if wn:
|
||
sentences_by_stripped[_strip(wn)] = sdata
|
||
|
||
# conjugations: indexed by slug (100% coverage) and by stripped infinitive
|
||
# Some active/passive pairs share the same slug (e.g. הופל/להפיל → 1231-lehapil).
|
||
# When slug collides, always prefer the ACTIVE verb in conj_by_slug so the
|
||
# entry is correctly associated with its active conjugation data.
|
||
conj_by_slug: dict[str, dict] = {}
|
||
conj_by_stripped_inf: dict[str, dict] = {}
|
||
for cdata in conjugations.values():
|
||
slug = cdata.get("slug", "")
|
||
if slug:
|
||
existing = conj_by_slug.get(slug)
|
||
if existing is None:
|
||
conj_by_slug[slug] = cdata
|
||
elif cdata.get("is_passive") and not existing.get("is_passive"):
|
||
# Keep the active verb; skip overwriting with passive
|
||
pass
|
||
elif existing.get("is_passive") and not cdata.get("is_passive"):
|
||
# Replace passive with active
|
||
conj_by_slug[slug] = cdata
|
||
else:
|
||
conj_by_slug[slug] = cdata
|
||
inf = cdata.get("infinitive", "")
|
||
if inf:
|
||
conj_by_stripped_inf[_strip(inf)] = cdata
|
||
|
||
# Build passive→active link:
|
||
# passive verbs store reference_form = nikkud infinitive of the ACTIVE verb
|
||
# We need: active_slug → passive_conj_data
|
||
passive_by_active_slug: dict[str, dict] = {}
|
||
for cdata in conjugations.values():
|
||
if not cdata.get("is_passive"):
|
||
continue
|
||
ref_nikkud = cdata.get("reference_form", "")
|
||
ref_stripped = _strip(ref_nikkud)
|
||
# find the active verb's slug
|
||
active_cdata = conj_by_stripped_inf.get(ref_stripped)
|
||
if active_cdata:
|
||
active_slug = active_cdata.get("slug", "")
|
||
if active_slug:
|
||
passive_by_active_slug[active_slug] = cdata
|
||
else:
|
||
log.warning(
|
||
"Passive verb '%s' references active '%s' (stripped='%s') — no match in conjugations",
|
||
cdata.get("infinitive"),
|
||
ref_nikkud,
|
||
ref_stripped,
|
||
)
|
||
|
||
# refined_meanings: NFC-keyed
|
||
refined_nfc: dict[str, str] = {_nfc(k): v for k, v in refined_meanings.items()}
|
||
|
||
# image_cache: stripped-word keyed
|
||
image_stripped: dict[str, str | None] = dict(image_cache)
|
||
|
||
# frequency_cache: stripped-word keyed
|
||
freq_stripped: dict[str, int] = {k: int(v) for k, v in frequency_cache.items() if v is not None}
|
||
|
||
# ------------------------------------------------------------------
|
||
# 3. Assign unique keys
|
||
# ------------------------------------------------------------------
|
||
idx_to_key, collisions = build_unique_keys(csv_rows)
|
||
for msg in collisions:
|
||
log.info("KEY COLLISION: %s", msg)
|
||
log.info("Collision summary: %d collision events", len(collisions))
|
||
|
||
# ------------------------------------------------------------------
|
||
# 3b. Identify exact-duplicate |N suffix rows to skip
|
||
# ------------------------------------------------------------------
|
||
# |N suffix rows (N=2,3,…) are true CSV exact-duplicates that share the
|
||
# same slug as the base entry. We drop them entirely so the unique_key
|
||
# space stays clean and no GUID collisions are emitted.
|
||
import re as _re
|
||
|
||
_dup_indices: set[int] = set()
|
||
for _i, _k in idx_to_key.items():
|
||
if _re.search(r"\|\d+$", _k):
|
||
_base_k = _re.sub(r"\|\d+$", "", _k)
|
||
_base_i = next((j for j, kk in idx_to_key.items() if kk == _base_k), None)
|
||
if _base_i is not None and csv_rows[_i]["slug"] == csv_rows[_base_i]["slug"]:
|
||
_dup_indices.add(_i)
|
||
if _dup_indices:
|
||
log.info(
|
||
"Skipping %d exact-duplicate |N suffix rows (same slug as base entry)",
|
||
len(_dup_indices),
|
||
)
|
||
|
||
# ------------------------------------------------------------------
|
||
# 4. Confusable groups: group by ktiv_male (from ktiv_male_forms)
|
||
# ------------------------------------------------------------------
|
||
# Build: stripped_word → set of slugs sharing that ktiv_male form
|
||
# We care about the *base* form (absolute_singular or absolute form of the word).
|
||
# Strategy: use "Word Without Nikkud" from CSV as ktiv_male, then group slugs.
|
||
# A confusable group = multiple *different* slugs sharing the same ktiv_male.
|
||
slug_to_ktiv_male: dict[str, str] = {}
|
||
for row in csv_rows:
|
||
slug_to_ktiv_male[row["slug"]] = row["Word Without Nikkud"]
|
||
|
||
ktiv_male_to_slugs: dict[str, set[str]] = defaultdict(set)
|
||
for slug, km in slug_to_ktiv_male.items():
|
||
ktiv_male_to_slugs[km].add(slug)
|
||
|
||
# Only keep those with >1 distinct slug
|
||
confusable_slug_groups: dict[str, set[str]] = {
|
||
km: slugs for km, slugs in ktiv_male_to_slugs.items() if len(slugs) > 1
|
||
}
|
||
log.info("Confusable ktiv_male groups: %d", len(confusable_slug_groups))
|
||
|
||
# Build reverse: slug → list of co-confusable slugs
|
||
slug_to_confusable_slugs: dict[str, set[str]] = {}
|
||
for _km, slugs in confusable_slug_groups.items():
|
||
for slug in slugs:
|
||
slug_to_confusable_slugs[slug] = slugs - {slug}
|
||
|
||
# We need to map slug → unique_key(s) for the confusable_group field
|
||
# But unique_key is per-row; one slug may map to multiple keys (duplicate entries with same slug).
|
||
# Exclude exact-duplicate rows so dropped entries don't pollute confusable groups.
|
||
slug_to_unique_keys: dict[str, list[str]] = defaultdict(list)
|
||
for i, row in enumerate(csv_rows):
|
||
if i not in _dup_indices:
|
||
slug_to_unique_keys[row["slug"]].append(idx_to_key[i])
|
||
|
||
# ------------------------------------------------------------------
|
||
# 5. Build entries
|
||
# ------------------------------------------------------------------
|
||
words: dict[str, dict] = {}
|
||
stats = {
|
||
"total": 0,
|
||
"has_conjugation": 0,
|
||
"has_noun_inflection": 0,
|
||
"has_examples": 0,
|
||
"has_guid": 0,
|
||
"has_image": 0,
|
||
"has_frequency": 0,
|
||
"has_hint": 0,
|
||
"has_emoji": 0,
|
||
"key_collisions": len(collisions),
|
||
}
|
||
|
||
for i, row in enumerate(csv_rows):
|
||
if i in _dup_indices:
|
||
continue
|
||
unique_key = idx_to_key[i]
|
||
word_nikkud = row["Word"]
|
||
word_ktiv = row["Word Without Nikkud"]
|
||
slug = row["slug"]
|
||
raw_pos = row["Part of Speech"]
|
||
meaning_raw = row["Meaning"]
|
||
audio_url = row["audio_url"] or None
|
||
tags = row["tags"] or ""
|
||
|
||
# -- PoS
|
||
pos_en, pos_hebrew = _parse_pos(raw_pos)
|
||
|
||
# -- Root
|
||
root = _parse_root(row["Root"])
|
||
|
||
# -- Meaning + emoji
|
||
meaning_clean, emoji_char = _extract_emoji(meaning_raw)
|
||
|
||
# -- GUID
|
||
guid = resolve_guid(word_nikkud, meaning_raw, guid_base, guid_disambig)
|
||
if guid:
|
||
stats["has_guid"] += 1
|
||
|
||
# -- Frequency (keyed by ktiv_male / stripped)
|
||
frequency = freq_stripped.get(word_ktiv)
|
||
if frequency:
|
||
stats["has_frequency"] += 1
|
||
|
||
# -- Image
|
||
image_filename = image_stripped.get(word_ktiv)
|
||
if image_filename:
|
||
stats["has_image"] += 1
|
||
|
||
# -- Hint (refined_meanings, NFC-keyed by nikkud)
|
||
hint = refined_nfc.get(_nfc(word_nikkud), "")
|
||
if hint:
|
||
stats["has_hint"] += 1
|
||
|
||
# -- Examples (vetted_sentences keyed by stripped word)
|
||
examples_block: dict | None = None
|
||
s_data = sentences_by_stripped.get(word_ktiv)
|
||
if s_data:
|
||
good = s_data.get("good_sentences", [])
|
||
if good:
|
||
vetted_list = [
|
||
{
|
||
"text": s["text"],
|
||
"source": s.get("book", "unknown"),
|
||
"vetted": True,
|
||
}
|
||
for s in good
|
||
]
|
||
# Pick best cloze sentence (first good one)
|
||
cloze_sent = good[0]
|
||
# cloze_guid: deterministic ID for the cloze card on this vocab note.
|
||
# Pattern: guid_for(word_nikkud, "cloze") — unique per word.
|
||
_cloze_guid = genanki.guid_for(word_nikkud, "cloze")
|
||
_cloze_text = cloze_sent["text"]
|
||
|
||
# Compute cloze_word_start / cloze_word_end from the text.
|
||
# Strategy (in order):
|
||
# 1. Use stored offsets if already present in source data.
|
||
# 2. Exact nikkud form search.
|
||
# 3. Exact ktiv_male (plain consonants) search in the raw text.
|
||
# 4. Scan each Hebrew word token in the text; match by stripped consonants.
|
||
# This handles inflected/construct/plural forms with different nikkud.
|
||
_cw_start: int | None = cloze_sent.get("cloze_word_start")
|
||
_cw_end: int | None = cloze_sent.get("cloze_word_end")
|
||
if _cw_start is None or _cw_end is None:
|
||
_idx = _cloze_text.find(word_nikkud)
|
||
if _idx >= 0:
|
||
_cw_start = _idx
|
||
_cw_end = _idx + len(word_nikkud)
|
||
else:
|
||
# Try exact ktiv_male substring
|
||
_idx2 = _cloze_text.find(word_ktiv)
|
||
if _idx2 >= 0:
|
||
_cw_start = _idx2
|
||
_cw_end = _idx2 + len(word_ktiv)
|
||
else:
|
||
# Scan Hebrew word tokens; find one whose stripped form
|
||
# matches word_ktiv (handles inflected/construct/plural).
|
||
_HEBREW_TOK = re.compile(
|
||
r"[\u05D0-\u05FA\u05B0-\u05BD\u05BF\u05C1\u05C2\u05C7"
|
||
r"\uFB1D-\uFB4E]+"
|
||
)
|
||
for _m in _HEBREW_TOK.finditer(_cloze_text):
|
||
if _strip(_m.group(0)) == word_ktiv:
|
||
_cw_start = _m.start()
|
||
_cw_end = _m.end()
|
||
break
|
||
# else leave both as None
|
||
|
||
cloze_block = {
|
||
"text": _cloze_text,
|
||
"cloze_word_start": _cw_start,
|
||
"cloze_word_end": _cw_end,
|
||
"cloze_hint": cloze_sent.get("cloze_hint"),
|
||
"cloze_guid": _cloze_guid,
|
||
}
|
||
examples_block = {
|
||
"vetted": vetted_list,
|
||
"cloze": cloze_block,
|
||
"rejected_count": s_data.get("rejected_count", 0),
|
||
}
|
||
stats["has_examples"] += 1
|
||
|
||
# -- Noun inflection
|
||
noun_inflection: dict | None = None
|
||
pdata = plurals_by_slug.get(slug) or plurals_by_nikkud.get(_nfc(word_nikkud))
|
||
if pdata and pos_en.startswith("Noun"):
|
||
|
||
def _hw_or_null(nk: str) -> dict | None:
|
||
nk = _strip_construct_hyphen(nk)
|
||
return _hebrew_word(nk) if nk else None
|
||
|
||
gender = pdata.get("gender") or None
|
||
gender_hebrew_map = {
|
||
"masculine": {"nikkud": "זָכָר", "ktiv_male": "זכר"},
|
||
"feminine": {"nikkud": "נְקֵבָה", "ktiv_male": "נקבה"},
|
||
}
|
||
# Plural GUID mirrors apkg_builder line 1609: guid_for("plural", singular_nikkud)
|
||
_plural_singular_nikkud = pdata.get("singular", "")
|
||
_plurals_guid = genanki.guid_for("plural", _plural_singular_nikkud) if _plural_singular_nikkud else None
|
||
noun_inflection = {
|
||
"plurals_guid": _plurals_guid,
|
||
"singular": _hw_or_null(pdata.get("singular", "")),
|
||
"plural": _hw_or_null(pdata.get("plural", "")),
|
||
"singular_audio": pdata.get("singular_audio") or None,
|
||
"plural_audio": pdata.get("plural_audio") or None,
|
||
"construct_singular": _hw_or_null(pdata.get("construct_singular", "")),
|
||
"construct_plural": _hw_or_null(pdata.get("construct_plural", "")),
|
||
"pronominal_suffixes": None,
|
||
"gender": gender,
|
||
"gender_hebrew": gender_hebrew_map.get(gender) if gender else None,
|
||
"mishkal": pdata.get("mishkal") or None,
|
||
"mishkal_hebrew": None,
|
||
}
|
||
stats["has_noun_inflection"] += 1
|
||
|
||
# -- Verb conjugation
|
||
conjugation_block: dict | None = None
|
||
cdata = conj_by_slug.get(slug)
|
||
if cdata and not cdata.get("is_passive"):
|
||
# This entry is an active verb with conjugation data
|
||
forms_dict = cdata.get("forms", {})
|
||
# Resolve infinitive nikkud for GUID generation (prefer forms dict, fall back to cdata key)
|
||
_inf_data = forms_dict.get("infinitive", {})
|
||
_inf_nikkud_for_guid = _inf_data.get("form", "") or cdata.get("infinitive", "")
|
||
active_forms = build_conjugation_forms(forms_dict, _inf_nikkud_for_guid)
|
||
|
||
# Passive counterpart, if any
|
||
passive_cdata = passive_by_active_slug.get(slug)
|
||
hufal_pual_forms: list | None = None
|
||
reference_form_passive: dict | None = None
|
||
if passive_cdata:
|
||
passive_forms_dict = passive_cdata.get("forms", {})
|
||
_passive_inf_data = passive_forms_dict.get("infinitive", {})
|
||
_passive_inf_nikkud = _passive_inf_data.get("form", "") or passive_cdata.get("infinitive", "")
|
||
hufal_pual_forms = build_conjugation_forms(passive_forms_dict, _passive_inf_nikkud)
|
||
# reference_form of passive = active infinitive; 3ms past is in its forms
|
||
rf_passive_nikkud = passive_cdata.get("forms", {}).get("past_3ms", {}).get("form", "")
|
||
if rf_passive_nikkud:
|
||
reference_form_passive = _hebrew_word(rf_passive_nikkud)
|
||
|
||
# Infinitive form (from forms dict)
|
||
inf_form_data = forms_dict.get("infinitive", {})
|
||
inf_nikkud = inf_form_data.get("form", "") or cdata.get("infinitive", "")
|
||
infinitive_hw = _hebrew_word(inf_nikkud) if inf_nikkud else None
|
||
|
||
# Reference form
|
||
ref_nikkud = cdata.get("reference_form", "")
|
||
reference_form_hw = _hebrew_word(ref_nikkud) if ref_nikkud else None
|
||
|
||
binyan = cdata.get("binyan", "")
|
||
binyan_hebrew = BINYAN_HEBREW.get(binyan, "")
|
||
|
||
conjugation_block = {
|
||
"in_conjugation_deck": True,
|
||
"infinitive": infinitive_hw,
|
||
"reference_form": reference_form_hw,
|
||
"binyan": binyan,
|
||
"binyan_hebrew": binyan_hebrew,
|
||
"prep": None,
|
||
"active_forms": active_forms,
|
||
"hufal_pual_forms": hufal_pual_forms,
|
||
"reference_form_passive": reference_form_passive,
|
||
}
|
||
stats["has_conjugation"] += 1
|
||
|
||
elif cdata and cdata.get("is_passive"):
|
||
# Passive-only entry: build the block from this passive verb's own forms (stored under "active_forms"; no hufal/pual counterpart)
|
||
binyan = cdata.get("binyan", "")
|
||
binyan_hebrew = BINYAN_HEBREW.get(binyan, "")
|
||
forms_dict = cdata.get("forms", {})
|
||
_passive_only_inf_data = forms_dict.get("infinitive", {})
|
||
_passive_only_inf_nikkud = _passive_only_inf_data.get("form", "") or cdata.get("infinitive", "")
|
||
passive_forms = build_conjugation_forms(forms_dict, _passive_only_inf_nikkud)
|
||
|
||
inf_form_data = forms_dict.get("infinitive", {})
|
||
inf_nikkud = inf_form_data.get("form", "") or cdata.get("infinitive", "")
|
||
infinitive_hw = _hebrew_word(inf_nikkud) if inf_nikkud else None
|
||
|
||
ref_nikkud = cdata.get("reference_form", "")
|
||
reference_form_hw = _hebrew_word(ref_nikkud) if ref_nikkud else None
|
||
|
||
conjugation_block = {
|
||
"in_conjugation_deck": True,
|
||
"infinitive": infinitive_hw,
|
||
"reference_form": reference_form_hw,
|
||
"binyan": binyan,
|
||
"binyan_hebrew": binyan_hebrew,
|
||
"prep": None,
|
||
"active_forms": passive_forms,
|
||
"hufal_pual_forms": None,
|
||
"reference_form_passive": None,
|
||
}
|
||
stats["has_conjugation"] += 1
|
||
|
||
# -- Confusable group (filled in pass 2 below)
|
||
# -- Shared roots (filled in pass 2 below)
|
||
|
||
# -- Audio filename: word-based here; switched to slug-based for confusables in pass 2
|
||
audio_file = f"{word_ktiv}.mp3"
|
||
|
||
entry: dict = {
|
||
"word": {"nikkud": word_nikkud, "ktiv_male": word_ktiv},
|
||
"slug": slug,
|
||
"root": root,
|
||
"pos": pos_en,
|
||
"pos_hebrew": pos_hebrew,
|
||
"meaning": meaning_clean,
|
||
"meaning_raw": meaning_raw,
|
||
"audio_url": audio_url,
|
||
"audio_file": audio_file,
|
||
"tags": tags,
|
||
"last_scrape_date": MIGRATION_DATE,
|
||
# Identity
|
||
"vocab_legacy_guid": guid,
|
||
# Frequency
|
||
"frequency": frequency,
|
||
"pseudo_frequency": None,
|
||
# Display
|
||
"emoji": emoji_char,
|
||
"emoji_source": "from_pealim" if emoji_char else None,
|
||
"emoji_visible": False,
|
||
"image": image_filename,
|
||
"image_source": "wikipedia" if image_filename else None,
|
||
"hint": hint,
|
||
# Populated in pass 2
|
||
"shared_roots": [],
|
||
"confusable_group": None,
|
||
"confusables_guid": None,
|
||
# Sub-sections
|
||
"examples": examples_block,
|
||
"noun_inflection": noun_inflection,
|
||
"conjugation": conjugation_block,
|
||
"adjective_inflection": None,
|
||
"preposition_inflection": None,
|
||
}
|
||
|
||
if emoji_char:
|
||
stats["has_emoji"] += 1
|
||
|
||
if unique_key in words:
|
||
log.warning(
|
||
"DUPLICATE unique_key '%s' — row %d would overwrite row %d",
|
||
unique_key,
|
||
i,
|
||
list(words.keys()).index(unique_key),
|
||
)
|
||
words[unique_key] = entry
|
||
stats["total"] += 1
|
||
|
||
# ------------------------------------------------------------------
|
||
# 6. Pass 2 — shared_roots and confusable_group
|
||
# ------------------------------------------------------------------
|
||
|
||
# shared_roots: group unique_keys by root tuple
|
||
root_to_keys: dict[tuple, list[str]] = defaultdict(list)
|
||
for uk, entry in words.items():
|
||
r = entry["root"]
|
||
if r:
|
||
root_to_keys[tuple(r)].append(uk)
|
||
|
||
for uks in root_to_keys.values():
|
||
if len(uks) > 1:
|
||
for uk in uks:
|
||
words[uk]["shared_roots"] = [k for k in uks if k != uk]
|
||
|
||
# confusable_group: update audio_file to slug-based for confusable words
|
||
# Also set confusables_guid: genanki.guid_for("confusable", ktiv_male)
|
||
# where ktiv_male is the shared stripped form (key in confusable_slug_groups).
|
||
# Build reverse: slug → ktiv_male (for GUID generation)
|
||
slug_to_confusable_ktiv_male: dict[str, str] = {}
|
||
for km, slugs in confusable_slug_groups.items():
|
||
for slug_in_group in slugs:
|
||
slug_to_confusable_ktiv_male[slug_in_group] = km
|
||
|
||
for i, row in enumerate(csv_rows):
|
||
if i in _dup_indices:
|
||
continue
|
||
slug = row["slug"]
|
||
uk = idx_to_key[i]
|
||
co_slugs = slug_to_confusable_slugs.get(slug, set())
|
||
if co_slugs:
|
||
# Gather all unique_keys for co-confusable slugs
|
||
group_keys: list[str] = []
|
||
for co_slug in co_slugs:
|
||
group_keys.extend(slug_to_unique_keys.get(co_slug, []))
|
||
group_keys.append(uk)
|
||
group_keys = sorted(set(group_keys))
|
||
words[uk]["confusable_group"] = group_keys
|
||
# confusables_guid: mirrors apkg_builder line 1401
|
||
ktiv_male_key = slug_to_confusable_ktiv_male.get(slug, "")
|
||
if ktiv_male_key:
|
||
words[uk]["confusables_guid"] = genanki.guid_for("confusable", ktiv_male_key)
|
||
# Use slug-based audio file for confusables to disambiguate
|
||
words[uk]["audio_file"] = f"{slug}.mp3"
|
||
|
||
# ------------------------------------------------------------------
|
||
# 7. Stats report
|
||
# ------------------------------------------------------------------
|
||
log.info("=" * 60)
|
||
log.info("MIGRATION COMPLETE — summary stats:")
|
||
log.info(" Total entries: %d", stats["total"])
|
||
log.info(" Key collision events: %d", stats["key_collisions"])
|
||
log.info(" Has conjugation: %d", stats["has_conjugation"])
|
||
log.info(" Has noun_inflection: %d", stats["has_noun_inflection"])
|
||
log.info(" Has examples: %d", stats["has_examples"])
|
||
log.info(" Has legacy GUID: %d", stats["has_guid"])
|
||
log.info(" Has image: %d", stats["has_image"])
|
||
log.info(" Has frequency: %d", stats["has_frequency"])
|
||
log.info(" Has hint: %d", stats["has_hint"])
|
||
log.info(" Has emoji: %d", stats["has_emoji"])
|
||
# Confusable entries
|
||
confusable_entries = sum(1 for e in words.values() if e["confusable_group"])
|
||
log.info(" In confusable group: %d", confusable_entries)
|
||
# Entries with shared roots
|
||
with_shared_roots = sum(1 for e in words.values() if e["shared_roots"])
|
||
log.info(" Has shared roots: %d", with_shared_roots)
|
||
|
||
if dry_run:
|
||
log.info("DRY RUN — output file NOT written.")
|
||
return
|
||
|
||
# ------------------------------------------------------------------
|
||
# 8. Write output
|
||
# ------------------------------------------------------------------
|
||
with OUTPUT_FILE.open("w", encoding="utf-8") as f:
|
||
json.dump(words, f, ensure_ascii=False, indent=2)
|
||
f.write("\n")
|
||
|
||
log.info("Wrote %d entries to %s", len(words), OUTPUT_FILE)
|
||
|
||
|
||
# ---------------------------------------------------------------------------
|
||
# Entry point
|
||
# ---------------------------------------------------------------------------
|
||
|
||
|
||
def main() -> None:
    """Command-line entry point: parse flags and run the migration.

    Recognises a single optional flag, ``--dry-run``; its boolean value is
    forwarded to ``migrate`` as the ``dry_run`` keyword argument.
    """
    cli = argparse.ArgumentParser(
        description="Migrate all pealim data sources into data/words.json",
    )
    cli.add_argument(
        "--dry-run",
        action="store_true",
        help="Print stats without writing the output file.",
    )
    options = cli.parse_args()
    migrate(dry_run=options.dry_run)
|
||
|
||
|
||
# Run the CLI only when this file is executed directly, not when imported.
if __name__ == "__main__":
    main()
|