# NOTE(review): the lines below appear to be a commit message and file-viewer
# metadata captured together with the file; they are kept as comments so the
# document parses. Remove if they are not meant to live in the schema file.
#
# Template & CSS fixes (15 items from Mar 9 feedback):
#   - Fix conjugation front showing 3ms form instead of infinitive
#   - Rename conjugation model to "Hebrew Conjugation"
#   - Strip Hebrew parenthesized text from English meanings
#   - Shoresh separator: spaces → dots (א.כ.ל)
#   - Remove duplicate English meaning from cloze back
#   - Remove example sentences from vocab front/back (cloze only)
#   - Center-align audio buttons on all decks
#   - Fix parenthesis spacing: "you(feminine,singular)" → "you (feminine, singular)"
#   - Unify sec-key/sec-label fonts, make keys bold
#   - Size overhaul: bigger Hebrew (42px), meaning (34px), secondary (28px)
#   - Center-align related words groups
#   - Sort confusables by average frequency
#   - Plurals: show Gender (Hebrew) before Mishkal, strip emoji from meaning
#   - Clean duplicate quotation marks in cloze sentences
#
# Sprint 12 carry-forward (detail scrape + EPUB):
#   - Adjective/preposition detail scraping in pealim_detail_scrape.py
#   - EPUB example matching rewrite in epub_examples.py
#   - Delete benyehuda.py and rebuild_sentence_matches.py (merged)
#   - 49 parser tests for detail scraping
#   - SCHEMA.yaml updates for new fields
#
# Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
#
# File viewer metadata: 190 lines, 7.7 KiB, YAML
# Hebrew Flash Cards — Unified Data Schema (words.json)
# Revised based on Nevo's feedback (2026-03-08)
#
# Top-level: dict keyed by unique_key
#   Unique key: nikkud word for most entries (e.g. "אָב")
#   For 146 homographs (same nikkud, different meaning): "word|pos" e.g. "אָח|Noun"
#   For same nikkud AND same pos: "word|pos|meaning" e.g. "אָח|Noun|brother"
#
# Hebrew text fields use nikkud/ktiv_male subfields:
#   field:
#     nikkud: "אָב"  # with nikkud (hebstyle=mo)
#     ktiv_male: "אב"  # plene spelling (hebstyle=vl)
# This pattern applies to: word, singular, plural, construct forms, conjugated forms, etc.
#
# Pronoun notation for conjugation forms uses grammatical codes:
#   1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp
#   (not Hebrew pronoun strings, which are ambiguous for gender in some persons)

# Template for a single words.json entry (one value of the top-level dict).
# Placeholder values such as "abc123..." and "https://..." are illustrative.
# Part-of-speech-specific sections (noun_inflection, conjugation,
# adjective_inflection, preposition_inflection) are null when they do not
# apply; their populated shape is documented in the commented templates below.
entry:
  # --- Core Identity ---
  word:
    nikkud: "אָב"
    ktiv_male: "אב"
  slug: "6009-av"  # Pealim URL slug (e.g. pealim.com/dict/6009-av/)
  root: ["א", "ב"]  # Shoresh as list of consonant chars
  pos: "Noun"  # Part of speech in English (as from pealim)
  pos_hebrew: "שֵׁם עֶצֶם"  # Part of speech in Hebrew (with nikkud)
  meaning: "father"  # English meaning (cleaned — no inline emoji, no Hebrew prepositions)
  meaning_raw: "father 👨"  # Original meaning as scraped (may contain emoji and/or Hebrew preps)
  audio_url: "https://..."  # Pealim audio URL
  audio_file: "6009-av.mp3"  # Local filename (slug-based for confusables, consonant-based otherwise)
  tags: ""  # Pealim tags if any
  last_scrape_date: "2026-03-08"  # ISO date of most recent pealim.com scrape for this entry

  # --- Identity & Progress ---
  vocab_legacy_guid: "abc123..."  # Vocab note GUID from legacy_guid_map.json
  # Other note GUIDs stored in their respective sections (cloze, plurals, conjugation)

  # --- Frequency ---
  frequency: 412  # Hebrew frequency rank from hermitdave/FrequencyWords he_50k (ktiv male based)
  pseudo_frequency: null  # Adjusted frequency for confusable homographs (deferred to future sprint)

  # --- Display Enrichment ---
  emoji: "👨"
  emoji_source: "ai_vetted"  # One of: ai_vetted, from_pealim, null
  emoji_visible: false  # Whether to show on cards (false until emoji vetting is done)
  image: "father.jpg"  # Wikipedia/Commons image filename, or null
  image_source: "wikipedia"  # One of: wikipedia, commons, null
  hint: ""  # Eng→Heb disambiguation hint (from refined_meanings.json)

  # --- Shared Roots ---
  # List of unique_keys of other words sharing the same root.
  # Computed by iterating all entries and grouping by root.
  shared_roots: []

  # --- Confusables ---
  # List of unique_keys sharing same ktiv_male, or null,
  # e.g. ["אָח|Noun|brother", "אָח|Noun|fireplace"]
  confusable_group: null

  # --- Example Sentences ---
  examples:
    vetted:  # AI-vetted sentences from Ben Yehuda / EPUB corpus
      - text: "הָאָב הָלַךְ לַעֲבוֹדָה"
        source: "ben_yehuda"  # One of: ben_yehuda, epub_little_prince, epub_alice, ...
        vetted: true
    cloze:  # Best sentence for cloze card, or null
      text: "הָאָב הָלַךְ לַעֲבוֹדָה"
      cloze_word_start: 0  # Character offset of the clozed word in text
      cloze_word_end: 4  # End offset — enables exact extraction regardless of nikkud changes
      # NOTE(review): the first word of `text` spans 5 code points (indices 0-4),
      # so end=4 implies an inclusive end index — confirm, or use 5 if the end
      # offset is meant to be exclusive.
      cloze_hint: "family member"
      cloze_guid: "def456..."  # GUID for the cloze note
    # NOTE(review): nesting reconstructed — rejected_count is placed under
    # `examples` (not `cloze`) so it can exist when cloze is null; confirm.
    rejected_count: 0

  # --- Noun-specific: Inflection Forms ---
  noun_inflection: null  # null for non-nouns
  # When populated:
  #   plurals_guid: "ghi789..."  # GUID for plurals deck note
  #   singular:  # null if noun is inherently plural (e.g. bicycle/אופניים)
  #     nikkud: "אָב"
  #     ktiv_male: "אב"
  #   plural:
  #     nikkud: "אָבוֹת"
  #     ktiv_male: "אבות"
  #   singular_audio: "6009-av.mp3"
  #   plural_audio: null  # TODO: scrape from detail pages
  #   construct_singular:
  #     nikkud: "אֲבִי"
  #     ktiv_male: "אבי"
  #   construct_plural:
  #     nikkud: "אֲבוֹת"
  #     ktiv_male: "אבות"
  #   pronominal_suffixes:  # Scraped from pealim "forms with pronominal affixes" section
  #     1s:
  #       nikkud: "אָבִי"
  #       ktiv_male: "אבי"
  #     1p:
  #       nikkud: "אָבִינוּ"
  #       ktiv_male: "אבינו"
  #     2ms: ...
  #     2fs: ...
  #     2mp: ...
  #     2fp: ...
  #     3ms: ...
  #     3fs: ...
  #     3mp: ...
  #     3fp: ...
  #   gender: "masculine"
  #   gender_hebrew:
  #     nikkud: "זָכָר"
  #     ktiv_male: "זכר"
  #   mishkal: "CaCaC"  # English mishkal name (scraped from pealim PoS section)
  #   mishkal_hebrew: "קָטָל"  # Hebrew mishkal name (computed via mapping)

  # --- Verb-specific: Conjugation Data ---
  conjugation: null  # null for non-verbs
  # When populated:
  #   in_conjugation_deck: true  # Whether this verb is in the 71-verb conjugation deck
  #   infinitive:
  #     nikkud: "לִשְׁמֹר"
  #     ktiv_male: "לשמור"
  #   reference_form:  # 3ms past (the citation form)
  #     nikkud: "שָׁמַר"
  #     ktiv_male: "שמר"
  #   binyan: "Pa'al"  # English binyan name
  #   binyan_hebrew: "פָּעַל"  # Hebrew binyan name (with nikkud)
  #   prep: "על"  # Hebrew preposition the verb takes, or null
  #   active_forms:
  #     - person: "1s"  # Grammatical code: 1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp
  #       tense: "עָבָר"
  #       form:
  #         nikkud: "שָׁמַרְתִּי"
  #         ktiv_male: "שמרתי"
  #       audio_url: "https://..."
  #       audio_file: null  # For future use
  #   hufal_pual_forms: null  # Same structure as active_forms; non-null only for hif'il/pi'el verbs
  #   # When non-null, binyan MUST be Hif'il or Pi'el (validated)
  #   reference_form_passive:  # 3ms past of the huf'al/pu'al counterpart, or null
  #     nikkud: "שֻׁמַּר"
  #     ktiv_male: "שומר"
  #   # NOTE(review): the example verb above is Pa'al with hufal_pual_forms null,
  #   # so reference_form_passive would presumably be null for it; the populated
  #   # value here looks illustrative of the shape only — confirm.

  # --- Adjective-specific ---
  adjective_inflection: null  # null for non-adjectives
  # When populated:
  #   ms:
  #     nikkud: "גָּדוֹל"
  #     ktiv_male: "גדול"
  #   fs:
  #     nikkud: "גְּדוֹלָה"
  #     ktiv_male: "גדולה"
  #   mp:
  #     nikkud: "גְּדוֹלִים"
  #     ktiv_male: "גדולים"
  #   fp:
  #     nikkud: "גְּדוֹלוֹת"
  #     ktiv_male: "גדולות"
  #   mishkal: "CaCaC"  # English mishkal name (scraped from pealim PoS section)
  #   mishkal_hebrew: "קָטָל"  # Hebrew mishkal name (computed via mapping)

  # --- Preposition-specific ---
  preposition_inflection: null  # null for non-prepositions
  # When populated:
  #   1s:
  #     nikkud: "שֶׁלִּי"
  #     ktiv_male: "שלי"
  #   1p:
  #     nikkud: "שֶׁלָּנוּ"
  #     ktiv_male: "שלנו"
  #   2ms:
  #     nikkud: "שֶׁלְּךָ"
  #     ktiv_male: "שלך"
  #   2fs:
  #     nikkud: "שֶׁלָּךְ"
  #     ktiv_male: "שלך"
  #   2mp:
  #     nikkud: "שֶׁלָּכֶם"
  #     ktiv_male: "שלכם"
  #   2fp:
  #     nikkud: "שֶׁלָּכֶן"
  #     ktiv_male: "שלכן"
  #   3ms:
  #     nikkud: "שֶׁלּוֹ"
  #     ktiv_male: "שלו"
  #   3fs:
  #     nikkud: "שֶׁלָּהּ"
  #     ktiv_male: "שלה"
  #   3mp:
  #     nikkud: "שֶׁלָּהֶם"
  #     ktiv_male: "שלהם"
  #   3fp:
  #     nikkud: "שֶׁלָּהֶן"
  #     ktiv_male: "שלהן"