hebrew_flash_cards/SCHEMA.yaml
Sochen 08fb7009d8 Sprint 11: unified JSON architecture + consolidated scraping pipeline
Migrate from fragmented CSV + 10 JSON files to a single data/words.json
(9,104 entries) as the unified data store. All GUIDs preserved for Anki
study progress continuity.

New files:
- SCHEMA.yaml: authoritative schema for words.json
- pealim_list_scrape.py: consolidated list page scraper → words.json
- pealim_detail_scrape.py: noun/verb detail scraper → words.json
- pealim_audio_download.py: audio downloader reading from words.json
- scripts/migrate_to_json.py: one-time CSV→JSON migration
- scripts/validate_data.py: 17 data integrity tests
- scripts/check_guid_coverage.py: GUID preservation checker
- scripts/repair_slugs.py: slug deduplication repair tool
- tests/test_scraper_integration.py: live scraper integration tests

Updated:
- apkg_builder.py: reads from words.json (no more pandas)
- run.py: 8-step pipeline (list scrape → frequency → examples →
  detail scrape → audio download → fonts → images → build)
- benyehuda.py, frequency_lookup.py, image_fetch.py: TODO markers
  for future words.json integration

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-08 10:54:58 +00:00

148 lines
6.6 KiB
YAML

# Hebrew Flash Cards — Unified Data Schema (words.json)
# Revised based on Nevo's feedback (2026-03-08)
#
# Top-level: dict keyed by unique_key
#   Unique key: nikkud word for most entries (e.g. "אָב")
#   For 146 homographs (same nikkud, different meaning): "word|pos" e.g. "אָח|Noun"
#   For same nikkud AND same pos: "word|pos|meaning" e.g. "אָח|Noun|brother"
#
# Hebrew text fields use nikkud/ktiv_male subfields:
#   field:
#     nikkud: "אָב"  # with nikkud (hebstyle=mo)
#     ktiv_male: "אב"  # plene spelling (hebstyle=vl)
# This pattern applies to: word, singular, plural, construct forms, conjugated forms, etc.
#
# Pronoun notation for conjugation forms uses grammatical codes:
#   1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp
#   (not Hebrew pronoun strings, which are ambiguous for gender in some persons)
# Shape of one words.json entry (the value stored under a unique_key).
# Values shown are examples. Sections that are null by default list their
# populated shape in the commented template that follows them.
entry:
  # --- Core Identity ---
  word:
    nikkud: "אָב"
    ktiv_male: "אב"
  slug: "6009-av"  # Pealim URL slug (e.g. pealim.com/dict/6009-av/)
  root: ["א", "ב"]  # Shoresh as list of consonant chars
  pos: "Noun"  # Part of speech in English (as from pealim)
  pos_hebrew: "שֵׁם עֶצֶם"  # Part of speech in Hebrew (with nikkud)
  meaning: "father"  # English meaning (cleaned — no inline emoji, no Hebrew prepositions)
  meaning_raw: "father 👨"  # Original meaning as scraped (may contain emoji and/or Hebrew preps)
  audio_url: "https://..."  # Pealim audio URL
  audio_file: "6009-av.mp3"  # Local filename (slug-based for confusables, consonant-based otherwise)
  tags: ""  # Pealim tags if any
  last_scrape_date: "2026-03-08"  # ISO date of most recent pealim.com scrape for this entry

  # --- Identity & Progress ---
  vocab_legacy_guid: "abc123..."  # Vocab note GUID from legacy_guid_map.json
  # Other note GUIDs stored in their respective sections (cloze, plurals, conjugation)

  # --- Frequency ---
  frequency: 412  # Hebrew frequency rank from hermitdave/FrequencyWords he_50k (ktiv male based)
  pseudo_frequency: null  # Adjusted frequency for confusable homographs (deferred to future sprint)

  # --- Display Enrichment ---
  emoji: "👨"
  emoji_source: "ai_vetted"  # One of: ai_vetted, from_pealim, null
  emoji_visible: false  # Whether to show on cards (false until emoji vetting is done)
  image: "father.jpg"  # Wikipedia/Commons image filename, or null
  image_source: "wikipedia"  # One of: wikipedia, commons, null
  hint: ""  # Eng→Heb disambiguation hint (from refined_meanings.json)

  # --- Shared Roots ---
  shared_roots: []  # List of unique_keys of other words sharing the same root
  # Computed by iterating all entries and grouping by root

  # --- Confusables ---
  confusable_group: null  # List of unique_keys sharing same ktiv_male, or null
  # e.g. ["אָח|Noun|brother", "אָח|Noun|fireplace"]

  # --- Example Sentences ---
  examples:
    vetted:  # AI-vetted sentences from Ben Yehuda / EPUB corpus
      - text: "הָאָב הָלַךְ לַעֲבוֹדָה"
        source: "ben_yehuda"  # One of: ben_yehuda, epub_little_prince, epub_alice, ...
        vetted: true
    cloze:  # Best sentence for cloze card, or null
      text: "הָאָב הָלַךְ לַעֲבוֹדָה"
      # NOTE(review): this schema does not state whether cloze_word_end is
      # inclusive or exclusive, nor the offset unit — confirm against the builder.
      cloze_word_start: 0  # Character offset of the clozed word in text
      cloze_word_end: 4  # End offset — enables exact extraction regardless of nikkud changes
      cloze_hint: "family member"
      cloze_guid: "def456..."  # GUID for the cloze note
    # NOTE(review): nesting assumed (sibling of vetted/cloze) — confirm.
    rejected_count: 0

  # --- Noun-specific: Inflection Forms ---
  noun_inflection: null  # null for non-nouns
  # When populated:
  #   plurals_guid: "ghi789..."  # GUID for plurals deck note
  #   singular:  # null if noun is inherently plural (e.g. bicycle/אופניים)
  #     nikkud: "אָב"
  #     ktiv_male: "אב"
  #   plural:
  #     nikkud: "אָבוֹת"
  #     ktiv_male: "אבות"
  #   singular_audio: "6009-av.mp3"
  #   plural_audio: null  # TODO: scrape from detail pages
  #   construct_singular:
  #     nikkud: "אֲבִי"
  #     ktiv_male: "אבי"
  #   construct_plural:
  #     nikkud: "אֲבוֹת"
  #     ktiv_male: "אבות"
  #   pronominal_suffixes:  # Scraped from pealim "forms with pronominal affixes" section
  #     1s:
  #       nikkud: "אָבִי"
  #       ktiv_male: "אבי"
  #     1p:
  #       nikkud: "אָבִינוּ"
  #       ktiv_male: "אבינו"
  #     2ms: ...
  #     2fs: ...
  #     2mp: ...
  #     2fp: ...
  #     3ms: ...
  #     3fs: ...
  #     3mp: ...
  #     3fp: ...
  #   gender: "masculine"
  #   gender_hebrew:
  #     nikkud: "זָכָר"
  #     ktiv_male: "זכר"
  #   mishkal: "CaCaC"  # English mishkal name (scraped from pealim PoS section)
  #   mishkal_hebrew: "קָטָל"  # Hebrew mishkal name (computed via mapping)

  # --- Verb-specific: Conjugation Data ---
  conjugation: null  # null for non-verbs
  # When populated:
  #   in_conjugation_deck: true  # Whether this verb is in the 71-verb conjugation deck
  #   infinitive:
  #     nikkud: "לִשְׁמֹר"
  #     ktiv_male: "לשמור"
  #   reference_form:  # 3ms past (the citation form)
  #     nikkud: "שָׁמַר"
  #     ktiv_male: "שמר"
  #   binyan: "Pa'al"  # English binyan name
  #   binyan_hebrew: "פָּעַל"  # Hebrew binyan name (with nikkud)
  #   prep: "על"  # Hebrew preposition the verb takes, or null
  #   active_forms:
  #     - person: "1s"  # Grammatical code: 1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp
  #       tense: "עָבָר"
  #       form:
  #         nikkud: "שָׁמַרְתִּי"
  #         ktiv_male: "שמרתי"
  #       audio_url: "https://..."
  #       audio_file: null  # For future use
  #   hufal_pual_forms: null  # Same structure as active_forms; non-null only for hif'il/pi'el verbs
  #                           # When non-null, binyan MUST be Hif'il or Pi'el (validated)
  #   reference_form_passive:  # 3ms past of the huf'al/pu'al counterpart, or null
  #     nikkud: "שֻׁמַּר"
  #     ktiv_male: "שומר"

  # --- Adjective-specific ---
  adjective_inflection: null  # Reserved for future use
  # When populated:
  #   ms/fs/mp/fp forms with nikkud/ktiv_male subfields

  # --- Preposition-specific ---
  preposition_inflection: null  # Reserved for future use
  # When populated:
  #   Inflected forms with pronominal suffixes (e.g. שלי, שלך, שלו...)