# hebrew_flash_cards/SCHEMA.yaml
# Sochen 14d567a261 schema: add difficulty_score field + update spec with MIN_WORDS=3
# Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
# 2026-03-15 13:30:13 +00:00
#
# 192 lines
# 8 KiB
# YAML

# Hebrew Flash Cards — Unified Data Schema (words.json)
# Revised based on Nevo's feedback (2026-03-08)
#
# Top-level: dict keyed by unique_key
# Unique key: nikkud word for most entries (e.g. "אָב")
# For 146 homographs (same nikkud, different meaning): "word|pos" e.g. "אָח|Noun"
# For same nikkud AND same pos: "word|pos|meaning" e.g. "אָח|Noun|brother"
#
# Hebrew text fields use nikkud/ktiv_male subfields:
#   field:
#     nikkud: "אָב"  # with nikkud (hebstyle=mo)
#     ktiv_male: "אב"  # plene spelling (hebstyle=vl)
# This pattern applies to: word, singular, plural, construct forms, conjugated forms, etc.
#
# Pronoun notation for conjugation forms uses grammatical codes:
# 1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp
# (not Hebrew pronoun strings, which are ambiguous for gender in some persons)
# Annotated sample of a single entry (one value of the top-level dict).
# All values are illustrative; commented-out blocks show the populated shape
# of fields whose sample value here is null.
entry:
  # --- Core Identity ---
  word:
    nikkud: "אָב"
    ktiv_male: "אב"
  slug: "6009-av"  # Pealim URL slug (e.g. pealim.com/dict/6009-av/)
  root: ["א", "ב"]  # Shoresh as list of consonant chars
  pos: "Noun"  # Part of speech in English (as from pealim)
  pos_hebrew: "שֵׁם עֶצֶם"  # Part of speech in Hebrew (with nikkud)
  # English meaning (cleaned — no inline emoji, no Hebrew prepositions)
  meaning: "father"
  # Original meaning as scraped (may contain emoji and/or Hebrew preps)
  meaning_raw: "father 👨"
  # Hebrew preposition(s) governing this word, extracted from meaning_raw
  # (e.g. "(על)" → "על"); null if none.
  # NOTE(review): the sample meaning_raw above contains no preposition, so the
  # value below only illustrates the string form — real data for this sample
  # would presumably be null; confirm.
  prep: "על"
  audio_url: "https://..."  # Pealim audio URL
  # Local filename (slug-based for confusables, consonant-based otherwise)
  audio_file: "6009-av.mp3"
  tags: ""  # Pealim tags if any
  # ISO date of most recent pealim.com scrape for this entry
  last_scrape_date: "2026-03-08"

  # --- Identity & Progress ---
  vocab_legacy_guid: "abc123..."  # Vocab note GUID from legacy_guid_map.json
  # Other note GUIDs stored in their respective sections (cloze, plurals, conjugation)

  # --- Frequency ---
  # Hebrew frequency rank from hermitdave/FrequencyWords he_50k (ktiv male based)
  frequency: 412
  # Adjusted frequency for confusable homographs (deferred to future sprint)
  pseudo_frequency: null

  # --- Display Enrichment ---
  emoji: "👨"
  emoji_source: "ai_vetted"  # One of: ai_vetted, from_pealim, null
  # Whether to show on cards (false until emoji vetting is done)
  emoji_visible: false
  image: "father.jpg"  # Wikipedia/Commons image filename, or null
  image_source: "wikipedia"  # One of: wikipedia, commons, null
  hint: ""  # Eng→Heb disambiguation hint (from refined_meanings.json)

  # --- Shared Roots ---
  # List of unique_keys of other words sharing the same root.
  # Computed by iterating all entries and grouping by root.
  shared_roots: []

  # --- Confusables ---
  # List of unique_keys sharing same ktiv_male, or null,
  # e.g. ["אָח|Noun|brother", "אָח|Noun|fireplace"]
  confusable_group: null

  # --- Example Sentences ---
  examples:
    vetted:  # AI-vetted sentences from Ben Yehuda / EPUB corpus
      - text: "הָאָב הָלַךְ לַעֲבוֹדָה"
        source: "ben_yehuda"  # One of: ben_yehuda, epub_little_prince, epub_alice, ...
        vetted: true
    cloze:  # Best sentence for cloze card, or null
      text: "הָאָב הָלַךְ לַעֲבוֹדָה"
      cloze_word_start: 0  # Character offset of the clozed word in text
      # End offset — enables exact extraction regardless of nikkud changes.
      # NOTE(review): the first word of the sample text is 5 code points long
      # (3 letters + 2 nikkud marks), so 4 matches only if this end offset is
      # inclusive — confirm and state the convention (inclusive vs. exclusive,
      # and the unit being counted).
      cloze_word_end: 4
      cloze_hint: "family member"
      cloze_guid: "def456..."  # GUID for the cloze note
      # Median frequency rank of context words (lower = easier); optional
      difficulty_score: 234
    # NOTE(review): nesting level inferred — confirm this belongs to `examples`
    # rather than to `cloze`.
    rejected_count: 0

  # --- Noun-specific: Inflection Forms ---
  noun_inflection: null  # null for non-nouns
  # When populated:
  #   plurals_guid: "ghi789..."  # GUID for plurals deck note
  #   singular:  # null if noun is inherently plural (e.g. bicycle/אופניים)
  #     nikkud: "אָב"
  #     ktiv_male: "אב"
  #   plural:
  #     nikkud: "אָבוֹת"
  #     ktiv_male: "אבות"
  #   singular_audio: "6009-av.mp3"
  #   plural_audio: null  # TODO: scrape from detail pages
  #   construct_singular:
  #     nikkud: "אֲבִי"
  #     ktiv_male: "אבי"
  #   construct_plural:
  #     nikkud: "אֲבוֹת"
  #     ktiv_male: "אבות"
  #   pronominal_suffixes:  # Scraped from pealim "forms with pronominal affixes" section
  #     1s:
  #       nikkud: "אָבִי"
  #       ktiv_male: "אבי"
  #     1p:
  #       nikkud: "אָבִינוּ"
  #       ktiv_male: "אבינו"
  #     2ms: ...
  #     2fs: ...
  #     2mp: ...
  #     2fp: ...
  #     3ms: ...
  #     3fs: ...
  #     3mp: ...
  #     3fp: ...
  #   gender: "masculine"
  #   gender_hebrew:
  #     nikkud: "זָכָר"
  #     ktiv_male: "זכר"
  #   mishkal: "CaCaC"  # English mishkal name (scraped from pealim PoS section)
  #   mishkal_hebrew: "קָטָל"  # Hebrew mishkal name (computed via mapping)

  # --- Verb-specific: Conjugation Data ---
  conjugation: null  # null for non-verbs
  # When populated:
  #   in_conjugation_deck: true  # Whether this verb is in the 71-verb conjugation deck
  #   infinitive:
  #     nikkud: "לִשְׁמֹר"
  #     ktiv_male: "לשמור"
  #   reference_form:  # 3ms past (the citation form)
  #     nikkud: "שָׁמַר"
  #     ktiv_male: "שמר"
  #   binyan: "Pa'al"  # English binyan name
  #   binyan_hebrew: "פָּעַל"  # Hebrew binyan name (with nikkud)
  #   prep: "על"  # Hebrew preposition the verb takes, or null
  #   active_forms:
  #     # person is a grammatical code: 1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp
  #     - person: "1s"
  #       tense: "עָבָר"
  #       form:
  #         nikkud: "שָׁמַרְתִּי"
  #         ktiv_male: "שמרתי"
  #       audio_url: "https://..."
  #       audio_file: null  # For future use
  #   # Same structure as active_forms; non-null only for hif'il/pi'el verbs.
  #   # When non-null, binyan MUST be Hif'il or Pi'el (validated)
  #   hufal_pual_forms: null
  #   # 3ms past of the huf'al/pu'al counterpart, or null.
  #   # NOTE(review): the sample binyan above is Pa'al, so this would presumably
  #   # be null in real data; it is populated here only to show the shape — confirm.
  #   reference_form_passive:
  #     nikkud: "שֻׁמַּר"
  #     ktiv_male: "שומר"

  # --- Adjective-specific ---
  adjective_inflection: null  # null for non-adjectives
  # When populated:
  #   ms:
  #     nikkud: "גָּדוֹל"
  #     ktiv_male: "גדול"
  #   fs:
  #     nikkud: "גְּדוֹלָה"
  #     ktiv_male: "גדולה"
  #   mp:
  #     nikkud: "גְּדוֹלִים"
  #     ktiv_male: "גדולים"
  #   fp:
  #     nikkud: "גְּדוֹלוֹת"
  #     ktiv_male: "גדולות"
  #   mishkal: "CaCaC"  # English mishkal name (scraped from pealim PoS section)
  #   mishkal_hebrew: "קָטָל"  # Hebrew mishkal name (computed via mapping)

  # --- Preposition-specific ---
  preposition_inflection: null  # null for non-prepositions
  # When populated:
  #   1s:
  #     nikkud: "שֶׁלִּי"
  #     ktiv_male: "שלי"
  #   1p:
  #     nikkud: "שֶׁלָּנוּ"
  #     ktiv_male: "שלנו"
  #   2ms:
  #     nikkud: "שֶׁלְּךָ"
  #     ktiv_male: "שלך"
  #   2fs:
  #     nikkud: "שֶׁלָּךְ"
  #     ktiv_male: "שלך"
  #   2mp:
  #     nikkud: "שֶׁלָּכֶם"
  #     ktiv_male: "שלכם"
  #   2fp:
  #     nikkud: "שֶׁלָּכֶן"
  #     ktiv_male: "שלכן"
  #   3ms:
  #     nikkud: "שֶׁלּוֹ"
  #     ktiv_male: "שלו"
  #   3fs:
  #     nikkud: "שֶׁלָּהּ"
  #     ktiv_male: "שלה"
  #   3mp:
  #     nikkud: "שֶׁלָּהֶם"
  #     ktiv_male: "שלהם"
  #   3fp:
  #     nikkud: "שֶׁלָּהֶן"
  #     ktiv_male: "שלהן"