Compare commits
No commits in common. "master" and "v0.12" have entirely different histories.
65 changed files with 66164 additions and 2746331 deletions
|
|
@ -1,26 +0,0 @@
|
|||
{
|
||||
"hooks": {
|
||||
"PostToolUse": [
|
||||
{
|
||||
"matcher": "Edit|Write",
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "file=\"$CLAUDE_FILE_PATH\"; if [ -n \"$file\" ] && echo \"$file\" | grep -q '\\.py$'; then ruff format --quiet \"$file\" && ruff check --fix --quiet \"$file\" 2>/dev/null; fi"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"PreToolUse": [
|
||||
{
|
||||
"matcher": "Edit|Write",
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "file=\"$CLAUDE_FILE_PATH\"; if echo \"$file\" | grep -qE '(legacy_guid_map\\.json|\\.env)$'; then echo 'BLOCKED: Protected file — legacy_guid_map.json and .env are read-only' >&2; exit 2; fi"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
|
|
@ -1,15 +0,0 @@
|
|||
root = true
|
||||
|
||||
[*]
|
||||
indent_style = space
|
||||
indent_size = 4
|
||||
end_of_line = lf
|
||||
charset = utf-8
|
||||
trim_trailing_whitespace = true
|
||||
insert_final_newline = true
|
||||
|
||||
[*.{json,yml,yaml,toml}]
|
||||
indent_size = 2
|
||||
|
||||
[*.md]
|
||||
trim_trailing_whitespace = false
|
||||
25
.gitignore
vendored
25
.gitignore
vendored
|
|
@ -11,11 +11,9 @@ pyvenv.cfg
|
|||
venv/
|
||||
__pycache__/
|
||||
*.pyc
|
||||
.pytest_cache/
|
||||
|
||||
# Large generated cache files (rebuild locally)
|
||||
data/benyehuda_index.json
|
||||
data/colliding_forms.json
|
||||
|
||||
# Audio directories (large; rebuild locally)
|
||||
data/audio/
|
||||
|
|
@ -30,32 +28,9 @@ output/
|
|||
|
||||
# Internal / private files — not for public repo
|
||||
ANKIWEB_DESCRIPTION.md
|
||||
PROJECT_NOTES.md
|
||||
PROJECTS.md
|
||||
SPRINT_LOG.md
|
||||
CLAUDE.md
|
||||
RECOMMENDATIONS.md
|
||||
|
||||
# Intermediate scrape progress files
|
||||
data/ktiv_male_forms.json.partial
|
||||
data/ktiv_male_forms_partial.json
|
||||
data/ktiv_scrape_progress.json
|
||||
data/noun_slug_map_progress.json
|
||||
data/top_verbs_to_scrape.json
|
||||
|
||||
# EPUB source files (large; user-specific)
|
||||
data/epubs/
|
||||
|
||||
# Stray deck files
|
||||
Everything__*.apkg
|
||||
*.apkg
|
||||
|
||||
# Legacy CSV files (replaced by data/words.json)
|
||||
*.csv
|
||||
data/*.csv
|
||||
|
||||
# Dead whitelist files
|
||||
vulture_whitelist.py
|
||||
|
||||
# Release artifacts — distributed via Forgejo releases, not committed to tree
|
||||
releases/
|
||||
|
|
|
|||
154
README.md
154
README.md
|
|
@ -6,17 +6,16 @@
|
|||
|
||||
## For Hebrew learners
|
||||
|
||||
A set of Anki flashcard decks for learning Modern Hebrew — vocabulary, verb conjugations, and more. All words include nikkud (vowel marks), audio, and are sorted by frequency so you learn the most useful words first.
|
||||
This project generates two Anki decks for learning Modern Hebrew:
|
||||
|
||||
### What's included
|
||||
- **Vocabulary deck** — ~9,100 words from [pealim.com](https://www.pealim.com/dict/), with nikkud (vowel marks), roots, parts of speech, related words, and example sentences from classic Hebrew literature.
|
||||
- **Conjugation deck** — 70 paradigm verbs from Coffin & Bolozky's *A Reference Grammar of Modern Hebrew* (2005), fully conjugated in all tenses and persons, across all seven binyanim.
|
||||
|
||||
- **Vocabulary** — ~9,100 Hebrew words with pronunciation audio, roots, example sentences from Hebrew literature, images, and frequency rankings.
|
||||
- **Verb conjugations** — 71 core verbs fully conjugated in all tenses and persons, covering all seven binyanim (verb patterns).
|
||||
- **Confusables** — Words that look the same without vowel marks (e.g., דָּבָר "thing" vs. דִּבֵּר "spoke") shown side by side so you can tell them apart.
|
||||
- **Noun plurals** — Practice forming singular↔plural pairs, with a focus on irregular plurals and common patterns.
|
||||
- **All-in-one** — A combined deck with everything above, organized as subdecks.
|
||||
|
||||
You can download and import any deck individually — or use the combined deck to get everything at once.
|
||||
All card data comes from open or academic sources:
|
||||
- Word data: [pealim.com](https://www.pealim.com) — a free Modern Hebrew dictionary
|
||||
- Example sentences: [Project Ben-Yehuda](https://benyehuda.org) — public-domain Hebrew literature corpus
|
||||
- Word frequency: [hermitdave/FrequencyWords](https://github.com/hermitdave/FrequencyWords) — Hebrew frequency list
|
||||
- Verb paradigm list: Coffin, Edna Amir and Shmuel Bolozky. *A Reference Grammar of Modern Hebrew*. Cambridge University Press, 2005.
|
||||
|
||||
---
|
||||
|
||||
|
|
@ -26,19 +25,17 @@ You can download and import any deck individually — or use the combined deck t
|
|||
2. Double-click to import into [Anki](https://apps.ankiweb.net/) (free, cross-platform)
|
||||
3. Start studying
|
||||
|
||||
All decks can be imported independently — pick just the ones you want. Re-importing the same file later updates your deck without losing study progress.
|
||||
Both decks can be imported independently. If you already have one, re-importing the same file updates your deck without losing study progress.
|
||||
|
||||
---
|
||||
|
||||
## What's in the vocabulary deck
|
||||
|
||||
Each note generates up to three cards:
|
||||
Each card has two sides:
|
||||
|
||||
**Hebrew → English:** See the Hebrew word (with nikkud) + hear audio → recall the meaning.
|
||||
|
||||
**English → Hebrew:** See the English meaning → recall the Hebrew word. When multiple words share the same English meaning, a disambiguation hint (part of speech + binyan) helps you know which word is expected.
|
||||
|
||||
**Sentence Cloze:** A Hebrew sentence with the target word blanked out → fill in the missing word. Only generated for words with a vetted example sentence. Tests recognition in context.
|
||||
**English → Hebrew:** See the English meaning → recall the Hebrew word, its root, and how to write it.
|
||||
|
||||
Fields on each card:
|
||||
| Field | Example |
|
||||
|
|
@ -46,70 +43,44 @@ Fields on each card:
|
|||
| Hebrew word (nikkud) | שָׁמַר |
|
||||
| Meaning | kept, watched over |
|
||||
| Root | שמ״ר |
|
||||
| Part of speech | פועל — פָּעַל |
|
||||
| Part of speech | פועל (verb) |
|
||||
| Without nikkud | שמר |
|
||||
| Related words | שׁוֹמֵר, שְׁמִירָה (grouped by Part of Speech) |
|
||||
| Example sentence | from nikkud'd Hebrew books |
|
||||
| Related words | שׁוֹמֵר, שְׁמִירָה |
|
||||
| Example sentence | from Ben-Yehuda corpus |
|
||||
| Audio | pronunciation from pealim.com |
|
||||
| Frequency rank | #412 |
|
||||
| Image / Emoji | for concrete nouns |
|
||||
| Plural form | for nouns: רבים: שֻׁלְחָנוֹת |
|
||||
| Disambiguation hint | for ambiguous Eng→Heb cards |
|
||||
|
||||
Cards are presented in **frequency order** — Anki will show you the most common words first. Note that because frequency is collected with words without nikkud, words that have the same letters but different nikkud will be assigned the same frequency.
|
||||
|
||||
### Eng→Heb disambiguation
|
||||
|
||||
When two Hebrew words translate to the same English (e.g., both mean "to return"), the Eng→Heb card shows a hint to tell them apart:
|
||||
|
||||
- **Layer 1:** Automatic Part of Speech + binyan hints for words with different parts of speech (163 words)
|
||||
- **Layer 2:** AI-refined distinct glosses for true synonyms sharing the same Part of Speech (440 words)
|
||||
Cards are presented in **frequency order** — Anki will show you the most common words first. Frequency rank is displayed on every card so you can see how common each word is. Words not in the top 50,000 show a "50k+" badge.
|
||||
|
||||
---
|
||||
|
||||
## What's in the conjugation deck
|
||||
|
||||
71 verbs listed in Appendix 1 of Coffin & Bolozky's *A Reference Grammar of Modern Hebrew* covering all seven binyanim, and **all irregular forms**
|
||||
70 paradigm verbs from Coffin & Bolozky's *A Reference Grammar of Modern Hebrew* (Appendix 1), covering all seven binyanim:
|
||||
- פָּעַל (Pa'al), נִפְעַל (Nif'al), פִּעֵל (Pi'el), פֻּעַל (Pu'al)
|
||||
- הִתְפַּעֵל (Hitpa'el), הִפְעִיל (Hif'il), הֻפְעַל (Huf'al)
|
||||
|
||||
Each verb is drilled in: present, past, future, and imperative — all persons and genders. Each card shows the English meaning and related vocabulary from the same root.
|
||||
Each verb is drilled in: present, past, future, and imperative — all persons and genders. The infinitive is shown on the card front as context but is not quizzed.
|
||||
|
||||
**Present tense expansion:** Each present tense form randomly generates a pronoun to be shown in the front of the card, so you acclimate to seeing אֲנִי, אַתָּה, and הוּא with the conjugated verb, even though they are all conjugated the same in present tense.
|
||||
**Present tense expansion:** Each present form generates 3 cards (one per pronoun that uses it), so you learn אֲנִי, אַתָּה, and הוּא all separately with the same masculine singular form.
|
||||
|
||||
**Modern Hebrew 2fp/3fp:** Classical feminine plural future forms (e.g., תִּשְׁמֹרְנָה) are shown in parentheses and played via audio (in the audio-included decks). The card's primary answer is the modern masculine plural form used in everyday speech.
|
||||
**Modern Hebrew 2fp/3fp:** Classical feminine plural future forms (e.g., תִּשְׁמֹרְנָה) are shown in parentheses; the card's primary answer is the modern masculine plural form used in everyday speech.
|
||||
|
||||
**Passive label:** Pu'al and Huf'al cards show the active partner's infinitive on the front (e.g., לְבַטֵּל) followed by **(סָבִיל)** in smaller text, so you know you're drilling the passive conjugation.
|
||||
**Passive label:** Pu'al and Huf'al cards show the active partner's infinitive on the front (e.g., לְבַטֵּל) followed by **(סָבִיל)** in smaller text, so you know you're drilling the passive conjugation. Active verbs show no label.
|
||||
|
||||
**Card order:** New conjugation cards are introduced in random order (not grouped by verb).
|
||||
**Card order:** New cards are introduced in random order.
|
||||
|
||||
---
|
||||
|
||||
## What's in the confusables deck
|
||||
|
||||
Hebrew without vowel marks is full of lookalikes. This deck groups words that are spelled identically without nikkud and asks "מה ההבדל?" (what's the difference?). The answer reveals all the words side by side with their nikkud and definitions.
|
||||
|
||||
Examples: דָּבָר (thing) vs. דִּבֵּר (spoke), סֵפֶר (book) vs. סָפַר (counted) vs. סַפָּר (barber).
|
||||
|
||||
---
|
||||
|
||||
## What's in the plurals deck
|
||||
|
||||
Two card directions for each noun:
|
||||
- **Singular → Plural:** See שֻׁלְחָן → produce שֻׁלְחָנוֹת
|
||||
- **Plural → Singular:** See שֻׁלְחָנוֹת → produce שֻׁלְחָן
|
||||
|
||||
Focuses on irregular plurals (the tricky ones that don't follow the rules) and common examples from each noun pattern. Cards are tagged by pattern for filtered study.
|
||||
**Citation:** Coffin, Edna Amir and Shmuel Bolozky. *A Reference Grammar of Modern Hebrew*. Cambridge University Press, 2005.
|
||||
|
||||
---
|
||||
|
||||
## Suggested study strategy
|
||||
|
||||
Start with the vocabulary deck. Anki will present the most frequent words first. Don't try to study too many cards every single day — Anki suggests 20 per day.
|
||||
Start with the vocabulary deck. Anki will present the most frequent words first. Don't try to study too many cards every single day — Anki suggests 20 per day.
|
||||
|
||||
The conjugation cards reinforce verb forms you've already seen in vocabulary.
|
||||
|
||||
Use the Hebrew → English direction to build reading comprehension. Use the English → Hebrew direction to build writing and speaking recall. The sentence cloze cards test whether you can recognize words in real Hebrew text.
|
||||
Use the Hebrew → English direction to build reading comprehension. Use the English → Hebrew direction to build writing and speaking recall.
|
||||
|
||||
---
|
||||
|
||||
|
|
@ -119,11 +90,9 @@ Use the Hebrew → English direction to build reading comprehension. Use the Eng
|
|||
|
||||
**Project Ben-Yehuda** — A public-domain digital library of Hebrew literature. Example sentences come from the nikkud corpus (classic texts with full vowel marks).
|
||||
|
||||
**Hebrew books** — Additional example sentences from nikkud'd (menukad) Hebrew books, with Claude Sonnet AI-vetted quality filtering. The AI doesn't generate the sentences; it only judges whether each one is a high-quality example sentence.
|
||||
|
||||
**FrequencyWords** — An open Hebrew word frequency list derived from subtitle data. Used to sort vocabulary cards from most to least common.
|
||||
|
||||
**Coffin & Bolozky** — The verb list and the known-good conjugation reference for the conjugation deck come from Appendix 1 of *A Reference Grammar of Modern Hebrew* (Cambridge University Press, 2005).
|
||||
**Coffin & Bolozky** — The verb paradigm list for the conjugation deck comes from Appendix 1 of *A Reference Grammar of Modern Hebrew* (Cambridge University Press, 2005), which provides a comprehensive reference for Modern Hebrew verbal morphology.
|
||||
|
||||
---
|
||||
|
||||
|
|
@ -133,7 +102,7 @@ If you notice a wrong translation, missing audio, or incorrect conjugation:
|
|||
|
||||
- For vocabulary errors: the source is pealim.com — you can suggest corrections there. But if you think morfix has a correct translation and pealim.com does not, we may be able to encode an override.
|
||||
|
||||
For any other issue, whether you know how to code or not: Email me at hebrew [at] nevo [dot] engineer
|
||||
For any other issue, whether you know how to code or not: Email me at pealim [at] nevo [dot] engineer
|
||||
|
||||
---
|
||||
|
||||
|
|
@ -167,14 +136,13 @@ python run.py --skip-scrape --refresh-examples
|
|||
```
|
||||
python run.py [options]
|
||||
|
||||
--only {vocab,conjugations,confusables,plurals,complete}
|
||||
Build only one deck type
|
||||
--skip-scrape Use cached data/hebrew_dict.csv
|
||||
--skip-scrape Use cached data/hebrew_dict.csv (no pealim.com scraping)
|
||||
--skip-audio Skip audio .mp3 downloads
|
||||
--skip-examples Skip Ben Yehuda example fetching
|
||||
--skip-conjugations Skip verb conjugation extraction
|
||||
--only {vocab,conjugations} Run only one deck (skips all unrelated steps)
|
||||
--skip-conjugations Skip verb conjugation extraction (deprecated: use --only vocab)
|
||||
--skip-images Skip image fetching for concrete nouns
|
||||
--refresh-examples Force rebuild of Ben Yehuda index
|
||||
--refresh-examples Force rebuild of Ben Yehuda index (nikkud corpus)
|
||||
--test N Process only first N words
|
||||
```
|
||||
|
||||
|
|
@ -182,60 +150,28 @@ python run.py [options]
|
|||
|
||||
| File | Description |
|
||||
|------|-------------|
|
||||
| `output/hebrew_vocabulary.apkg` | Vocabulary deck (text only) |
|
||||
| `output/hebrew_vocabulary_audio.apkg` | Vocabulary deck + audio |
|
||||
| `output/hebrew_vocabulary_images.apkg` | Vocabulary deck + images |
|
||||
| `output/hebrew_vocabulary_audio_images.apkg` | Vocabulary deck + audio + images |
|
||||
| `output/hebrew_conjugations.apkg` | Conjugation deck |
|
||||
| `output/hebrew_conjugations_audio.apkg` | Conjugation deck + audio |
|
||||
| `output/hebrew_confusables.apkg` | Confusables deck |
|
||||
| `output/hebrew_confusables_audio.apkg` | Confusables deck + audio |
|
||||
| `output/hebrew_plurals.apkg` | Plurals deck |
|
||||
| `output/hebrew_plurals_audio.apkg` | Plurals deck + audio |
|
||||
| `output/hebrew_complete.apkg` | All decks combined |
|
||||
| `output/hebrew_complete_audio.apkg` | All decks combined + audio |
|
||||
|
||||
### Data files
|
||||
|
||||
| File | Description |
|
||||
|------|-------------|
|
||||
| `data/hebrew_dict_for_anki.csv` | Enriched vocabulary CSV |
|
||||
| `data/conjugations.json` | Verb conjugation data (71 verbs) |
|
||||
| `data/noun_plurals.json` | Noun plural/construct forms |
|
||||
| `data/refined_meanings.json` | AI-disambiguated meanings (440 words) |
|
||||
| `data/vetted_sentences.json` | AI-vetted example sentences |
|
||||
| `data/ktiv_male_forms.json` | Ktiv male (plene) forms for sentence matching |
|
||||
| `data/legacy_guid_map.json` | Legacy GUIDs for study progress preservation |
|
||||
| `data/hebrew_dict.csv` | Raw dictionary |
|
||||
| `data/hebrew_dict_for_anki.csv` | Enriched Anki CSV |
|
||||
| `data/conjugations.json` | Verb conjugation data |
|
||||
| `data/audio/` | Vocabulary audio (.mp3) |
|
||||
| `data/audio_conj/` | Conjugation audio (.mp3) |
|
||||
| `data/fonts/` | Heebo font files (bundled in .apkg) |
|
||||
| `data/images/` | Noun images from Wikipedia/Commons |
|
||||
| `data/image_cache.json` | Image fetch cache |
|
||||
| `output/hebrew_vocabulary.apkg` | Vocabulary Anki deck |
|
||||
| `output/hebrew_conjugations.apkg` | Conjugation Anki deck |
|
||||
|
||||
### Pipeline overview
|
||||
|
||||
1. `hebrew_extract.py` — scrapes pealim.com dictionary
|
||||
2. `frequency_lookup.py` — downloads/loads Hebrew frequency data
|
||||
3. `benyehuda.py` — builds sentence index from Ben-Yehuda nikkud corpus
|
||||
3. `benyehuda.py` — builds sentence index from Ben-Yehuda corpus
|
||||
4. `extract_verb_list.py` — extracts verb list from Coffin & Bolozky PDF
|
||||
5. `conjugation_extract.py` — fetches conjugation tables + meanings from pealim.com
|
||||
5. `conjugation_extract.py` — fetches conjugation tables from pealim.com
|
||||
6. `image_fetch.py` — fetches Wikipedia/Commons images for concrete nouns
|
||||
7. `scrape_noun_plurals.py` — scrapes noun plural/construct forms from pealim.com
|
||||
8. `scrape_ktiv_male.py` — scrapes ktiv male (plene) forms for sentence matching
|
||||
9. `rebuild_sentence_matches.py` — matches vocab words to book sentences
|
||||
10. `apkg_builder.py` — assembles all `.apkg` files
|
||||
11. `run.py` — orchestrates all steps
|
||||
12. `validate_apkg.py` — validates output decks
|
||||
|
||||
---
|
||||
|
||||
## Deck variants
|
||||
|
||||
| Variant | Contents | Size |
|
||||
|---------|----------|------|
|
||||
| `hebrew_vocabulary.apkg` | Text + images | ~15 MB |
|
||||
| `hebrew_vocabulary_audio.apkg` | Text + images + audio | ~80 MB |
|
||||
| `hebrew_conjugations.apkg` | Text only | ~1 MB |
|
||||
| `hebrew_conjugations_audio.apkg` | Text + audio | ~5 MB |
|
||||
| `hebrew_confusables.apkg` | Text only | ~1 MB |
|
||||
| `hebrew_plurals.apkg` | Text only | ~1 MB |
|
||||
| `hebrew_complete.apkg` | Everything combined | ~20 MB |
|
||||
| `hebrew_complete_audio.apkg` | Everything + audio | ~90 MB |
|
||||
7. `validate_verb_list.py` — validates verb list against pealim.com
|
||||
8. `apkg_builder.py` — assembles both `.apkg` files
|
||||
9. `run.py` — orchestrates all steps
|
||||
|
||||
---
|
||||
|
||||
|
|
|
|||
192
SCHEMA.yaml
192
SCHEMA.yaml
|
|
@ -1,192 +0,0 @@
|
|||
# Hebrew Flash Cards — Unified Data Schema (words.json)
|
||||
# Revised based on Nevo's feedback (2026-03-08)
|
||||
#
|
||||
# Top-level: dict keyed by unique_key
|
||||
# Unique key: nikkud word for most entries (e.g. "אָב")
|
||||
# For 146 homographs (same nikkud, different meaning): "word|pos" e.g. "אָח|Noun"
|
||||
# For same nikkud AND same pos: "word|pos|meaning" e.g. "אָח|Noun|brother"
|
||||
#
|
||||
# Hebrew text fields use nikkud/ktiv_male subfields:
|
||||
# field:
|
||||
# nikkud: "אָב" # with nikkud (hebstyle=mo)
|
||||
# ktiv_male: "אב" # plene spelling (hebstyle=vl)
|
||||
# This pattern applies to: word, singular, plural, construct forms, conjugated forms, etc.
|
||||
#
|
||||
# Pronoun notation for conjugation forms uses grammatical codes:
|
||||
# 1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp
|
||||
# (not Hebrew pronoun strings, which are ambiguous for gender in some persons)
|
||||
|
||||
entry:
|
||||
# --- Core Identity ---
|
||||
word:
|
||||
nikkud: "אָב"
|
||||
ktiv_male: "אב"
|
||||
slug: "6009-av" # Pealim URL slug (e.g. pealim.com/dict/6009-av/)
|
||||
root: ["א", "ב"] # Shoresh as list of consonant chars
|
||||
pos: "Noun" # Part of speech in English (as from pealim)
|
||||
pos_hebrew: "שֵׁם עֶצֶם" # Part of speech in Hebrew (with nikkud)
|
||||
meaning: "father" # English meaning (cleaned — no inline emoji, no Hebrew prepositions)
|
||||
meaning_raw: "father 👨" # Original meaning as scraped (may contain emoji and/or Hebrew preps)
|
||||
prep: "על" # Hebrew preposition(s) governing this word, extracted from meaning_raw (e.g. "(על)" → "על"); null if none
|
||||
audio_url: "https://..." # Pealim audio URL
|
||||
audio_file: "6009-av.mp3" # Local filename (slug-based for confusables, consonant-based otherwise)
|
||||
tags: "" # Pealim tags if any
|
||||
last_scrape_date: "2026-03-08" # ISO date of most recent pealim.com scrape for this entry
|
||||
|
||||
# --- Identity & Progress ---
|
||||
vocab_legacy_guid: "abc123..." # Vocab note GUID from legacy_guid_map.json
|
||||
# Other note GUIDs stored in their respective sections (cloze, plurals, conjugation)
|
||||
|
||||
# --- Frequency ---
|
||||
frequency: 412 # Hebrew frequency rank from hermitdave/FrequencyWords he_50k (ktiv male based)
|
||||
pseudo_frequency: null # Adjusted frequency for confusable homographs (deferred to future sprint)
|
||||
|
||||
# --- Display Enrichment ---
|
||||
emoji: "👨"
|
||||
emoji_source: "ai_vetted" # One of: ai_vetted, from_pealim, null
|
||||
emoji_visible: false # Whether to show on cards (false until emoji vetting is done)
|
||||
image: "father.jpg" # Wikipedia/Commons image filename, or null
|
||||
image_source: "wikipedia" # One of: wikipedia, commons, null
|
||||
hint: "" # Eng→Heb disambiguation hint (from refined_meanings.json)
|
||||
|
||||
# --- Shared Roots ---
|
||||
shared_roots: [] # List of unique_keys of other words sharing the same root
|
||||
# Computed by iterating all entries and grouping by root
|
||||
|
||||
# --- Confusables ---
|
||||
confusable_group: null # List of unique_keys sharing same ktiv_male, or null
|
||||
# e.g. ["אָח|Noun|brother", "אָח|Noun|fireplace"]
|
||||
|
||||
# --- Example Sentences ---
|
||||
examples:
|
||||
vetted: # AI-vetted sentences from Ben Yehuda / EPUB corpus
|
||||
- text: "הָאָב הָלַךְ לַעֲבוֹדָה"
|
||||
source: "ben_yehuda" # One of: ben_yehuda, epub_little_prince, epub_alice, ...
|
||||
vetted: true
|
||||
cloze: # Best sentence for cloze card, or null
|
||||
text: "הָאָב הָלַךְ לַעֲבוֹדָה"
|
||||
cloze_word_start: 0 # Character offset of the clozed word in text
|
||||
cloze_word_end: 4 # End offset — enables exact extraction regardless of nikkud changes
|
||||
cloze_hint: "family member"
|
||||
cloze_guid: "def456..." # GUID for the cloze note
|
||||
difficulty_score: 234 # Median frequency rank of context words (lower = easier); optional
|
||||
rejected_count: 0
|
||||
|
||||
# --- Noun-specific: Inflection Forms ---
|
||||
noun_inflection: null # null for non-nouns
|
||||
# When populated:
|
||||
# plurals_guid: "ghi789..." # GUID for plurals deck note
|
||||
# singular: # null if noun is inherently plural (e.g. bicycle/אופניים)
|
||||
# nikkud: "אָב"
|
||||
# ktiv_male: "אב"
|
||||
# plural:
|
||||
# nikkud: "אָבוֹת"
|
||||
# ktiv_male: "אבות"
|
||||
# singular_audio: "6009-av.mp3"
|
||||
# plural_audio: null # TODO: scrape from detail pages
|
||||
# construct_singular:
|
||||
# nikkud: "אֲבִי"
|
||||
# ktiv_male: "אבי"
|
||||
# construct_plural:
|
||||
# nikkud: "אֲבוֹת"
|
||||
# ktiv_male: "אבות"
|
||||
# pronominal_suffixes: # Scraped from pealim "forms with pronominal affixes" section
|
||||
# 1s:
|
||||
# nikkud: "אָבִי"
|
||||
# ktiv_male: "אבי"
|
||||
# 1p:
|
||||
# nikkud: "אָבִינוּ"
|
||||
# ktiv_male: "אבינו"
|
||||
# 2ms: ...
|
||||
# 2fs: ...
|
||||
# 2mp: ...
|
||||
# 2fp: ...
|
||||
# 3ms: ...
|
||||
# 3fs: ...
|
||||
# 3mp: ...
|
||||
# 3fp: ...
|
||||
# gender: "masculine"
|
||||
# gender_hebrew:
|
||||
# nikkud: "זָכָר"
|
||||
# ktiv_male: "זכר"
|
||||
# mishkal: "CaCaC" # English mishkal name (scraped from pealim PoS section)
|
||||
# mishkal_hebrew: "קָטָל" # Hebrew mishkal name (computed via mapping)
|
||||
|
||||
# --- Verb-specific: Conjugation Data ---
|
||||
conjugation: null # null for non-verbs
|
||||
# When populated:
|
||||
# in_conjugation_deck: true # Whether this verb is in the 71-verb conjugation deck
|
||||
# infinitive:
|
||||
# nikkud: "לִשְׁמֹר"
|
||||
# ktiv_male: "לשמור"
|
||||
# reference_form: # 3ms past (the citation form)
|
||||
# nikkud: "שָׁמַר"
|
||||
# ktiv_male: "שמר"
|
||||
# binyan: "Pa'al" # English binyan name
|
||||
# binyan_hebrew: "פָּעַל" # Hebrew binyan name (with nikkud)
|
||||
# prep: "על" # Hebrew preposition the verb takes, or null
|
||||
# active_forms:
|
||||
# - person: "1s" # Grammatical code: 1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp
|
||||
# tense: "עָבָר"
|
||||
# form:
|
||||
# nikkud: "שָׁמַרְתִּי"
|
||||
# ktiv_male: "שמרתי"
|
||||
# audio_url: "https://..."
|
||||
# audio_file: null # For future use
|
||||
# hufal_pual_forms: null # Same structure as active_forms; non-null only for hif'il/pi'el verbs
|
||||
# # When non-null, binyan MUST be Hif'il or Pi'el (validated)
|
||||
# reference_form_passive: # 3ms past of the huf'al/pu'al counterpart, or null
|
||||
# nikkud: "שֻׁמַּר"
|
||||
# ktiv_male: "שומר"
|
||||
|
||||
# --- Adjective-specific ---
|
||||
adjective_inflection: null # null for non-adjectives
|
||||
# When populated:
|
||||
# ms:
|
||||
# nikkud: "גָּדוֹל"
|
||||
# ktiv_male: "גדול"
|
||||
# fs:
|
||||
# nikkud: "גְּדוֹלָה"
|
||||
# ktiv_male: "גדולה"
|
||||
# mp:
|
||||
# nikkud: "גְּדוֹלִים"
|
||||
# ktiv_male: "גדולים"
|
||||
# fp:
|
||||
# nikkud: "גְּדוֹלוֹת"
|
||||
# ktiv_male: "גדולות"
|
||||
# mishkal: "CaCaC" # English mishkal name (scraped from pealim PoS section)
|
||||
# mishkal_hebrew: "קָטָל" # Hebrew mishkal name (computed via mapping)
|
||||
|
||||
# --- Preposition-specific ---
|
||||
preposition_inflection: null # null for non-prepositions
|
||||
# When populated:
|
||||
# 1s:
|
||||
# nikkud: "שֶׁלִּי"
|
||||
# ktiv_male: "שלי"
|
||||
# 1p:
|
||||
# nikkud: "שֶׁלָּנוּ"
|
||||
# ktiv_male: "שלנו"
|
||||
# 2ms:
|
||||
# nikkud: "שֶׁלְּךָ"
|
||||
# ktiv_male: "שלך"
|
||||
# 2fs:
|
||||
# nikkud: "שֶׁלָּךְ"
|
||||
# ktiv_male: "שלך"
|
||||
# 2mp:
|
||||
# nikkud: "שֶׁלָּכֶם"
|
||||
# ktiv_male: "שלכם"
|
||||
# 2fp:
|
||||
# nikkud: "שֶׁלָּכֶן"
|
||||
# ktiv_male: "שלכן"
|
||||
# 3ms:
|
||||
# nikkud: "שֶׁלּוֹ"
|
||||
# ktiv_male: "שלו"
|
||||
# 3fs:
|
||||
# nikkud: "שֶׁלָּהּ"
|
||||
# ktiv_male: "שלה"
|
||||
# 3mp:
|
||||
# nikkud: "שֶׁלָּהֶם"
|
||||
# ktiv_male: "שלהם"
|
||||
# 3fp:
|
||||
# nikkud: "שֶׁלָּהֶן"
|
||||
# ktiv_male: "שלהן"
|
||||
1813
apkg_builder.py
1813
apkg_builder.py
File diff suppressed because it is too large
Load diff
205
benyehuda.py
Normal file
205
benyehuda.py
Normal file
|
|
@ -0,0 +1,205 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Ben Yehuda corpus example-sentence lookup (nikkud corpus).
|
||||
|
||||
Downloads the nikkud-bearing plaintext ZIP once, indexes sentences by nikkud word form,
|
||||
then answers queries locally.
|
||||
|
||||
Exposed API:
|
||||
load(force_rebuild=False)
|
||||
get_examples(word_nikkud) -> list[str] (returns 0 or 1 examples)
|
||||
save_examples_cache()
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import unicodedata
|
||||
import zipfile
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
# Module-level logger, named after this module.
logger = logging.getLogger(__name__)

# Nikkud-bearing corpus (txt.zip instead of txt_stripped.zip)
CORPUS_URL = (
    "https://github.com/projectbenyehuda/public_domain_dump/releases/"
    "download/2025-10/txt.zip"
)
# Where the built word -> sentences index is written and read back as JSON.
INDEX_PATH = Path(__file__).parent / "data" / "benyehuda_index.json"
# Where the per-word results of get_examples() are persisted as JSON.
EXAMPLES_CACHE_PATH = Path(__file__).parent / "data" / "examples_cache.json"
REQUEST_TIMEOUT = 120  # passed as `timeout=` to requests.get for the corpus download
MIN_SENTENCE_LEN = 20  # sentences shorter than this many characters are dropped
MAX_SENTENCE_LEN = 200  # sentences longer than this many characters are dropped
MAX_INDEX_ENTRIES = 500  # cap examples kept per word in index to limit memory

# Module-level state
_index: dict[str, list[str]] = {}  # word (with nikkud) -> [sentence, ...]
_examples_cache: dict[str, list[str]] = {}  # word -> cached result for this run
|
||||
|
||||
|
||||
def _strip_nikkud(text: str) -> str:
    """Return *text* in NFD form with every nonspacing mark removed.

    Hebrew vowel points are Unicode category ``Mn``, so dropping that
    category leaves the unpointed spelling. Any other combining mark
    (e.g. Latin accents) is removed the same way.
    """
    decomposed = unicodedata.normalize("NFD", text)
    kept = []
    for char in decomposed:
        if unicodedata.category(char) == "Mn":
            continue
        kept.append(char)
    return "".join(kept)
|
||||
|
||||
|
||||
def _split_sentences(text: str) -> list[str]:
    """
    Break *text* into candidate sentences, one per line.

    Only newlines act as separators. Each line is trimmed of surrounding
    whitespace, then of leading/trailing quote and punctuation characters,
    then of whitespace again. A line is kept only when its trimmed length
    lies between MIN_SENTENCE_LEN and MAX_SENTENCE_LEN inclusive.
    """
    trimmed_lines = (
        raw_line.strip().strip("\"'.,;:!?").strip()
        for raw_line in text.split("\n")
    )
    return [
        candidate
        for candidate in trimmed_lines
        if not (len(candidate) < MIN_SENTENCE_LEN or len(candidate) > MAX_SENTENCE_LEN)
    ]
|
||||
|
||||
|
||||
def _build_index(corpus_zip_bytes: bytes) -> None:
    """Parse the corpus ZIP and rebuild the word (nikkud) → sentences index.

    Replaces the module-level ``_index`` with a fresh mapping from each
    Hebrew token (nikkud retained) of at least two characters to at most
    ``MAX_INDEX_ENTRIES`` sentences that contain it. Only ``.txt`` members
    of the archive are read. A member that cannot be read is skipped with a
    warning so that one bad file does not abort the whole build.

    Args:
        corpus_zip_bytes: Raw bytes of the corpus ZIP archive.

    Raises:
        zipfile.BadZipFile: If the bytes are not a valid ZIP archive.
    """
    global _index
    _index = {}
    logger.info("Building Ben Yehuda index from nikkud corpus …")

    # Compiled once instead of per sentence. Matches runs of Hebrew letters
    # and points; ASCII quotes are part of the token class — presumably so
    # geresh/gershayim spellings stay a single token (verify against corpus).
    token_re = re.compile(r"[\u05d0-\u05ea\u05b0-\u05c7'\"]+")

    with zipfile.ZipFile(BytesIO(corpus_zip_bytes)) as zf:
        txt_files = [n for n in zf.namelist() if n.endswith(".txt")]
        logger.info(f" Corpus contains {len(txt_files)} text files")
        for fname in txt_files:
            try:
                # errors="ignore" drops undecodable bytes, so only the
                # archive read itself can fail here.
                raw = zf.read(fname).decode("utf-8", errors="ignore")
            except Exception as exc:
                # Deliberately best-effort, but no longer silent.
                logger.warning("Skipping unreadable corpus file %s: %s", fname, exc)
                continue
            for sentence in _split_sentences(raw):
                # Index by each unique Hebrew token (with nikkud) in the sentence
                for w in set(token_re.findall(sentence)):
                    if len(w) < 2:
                        continue
                    bucket = _index.setdefault(w, [])
                    if len(bucket) < MAX_INDEX_ENTRIES:
                        bucket.append(sentence)

    logger.info(f"Index built: {len(_index)} unique word forms")
|
||||
|
||||
|
||||
def _save_index() -> None:
    """Write the in-memory index to INDEX_PATH as UTF-8 JSON.

    The parent directory is created first if it does not exist. Hebrew text
    is written verbatim rather than as ``\\u`` escapes.
    """
    target_dir = INDEX_PATH.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with INDEX_PATH.open("w", encoding="utf-8") as handle:
        json.dump(_index, handle, ensure_ascii=False)
    logger.info(f"Ben Yehuda index saved → {INDEX_PATH}")
|
||||
|
||||
|
||||
def _load_index() -> None:
    """Replace the in-memory index with the JSON stored at INDEX_PATH."""
    global _index
    with INDEX_PATH.open(encoding="utf-8") as handle:
        loaded = json.load(handle)
    _index = loaded
    logger.info(f"Ben Yehuda index loaded: {len(_index)} word forms")
|
||||
|
||||
|
||||
def load(force_rebuild: bool = False) -> None:
    """Load or build the Ben Yehuda index, downloading the corpus if needed.

    Behaviour, in order:

    * If an index is already in memory and no rebuild is requested, do nothing.
    * With ``force_rebuild``, delete the on-disk index (if present) and
      discard the in-memory examples cache.
    * Otherwise load the persisted examples cache (if present), and if an
      on-disk index exists, load it and return.
    * If no index is available at this point, download the corpus ZIP,
      build the index from it, and save it to disk.

    Args:
        force_rebuild: When true, ignore any existing index and rebuild it
            from a fresh download.

    Raises:
        requests.HTTPError: If the corpus download returns an error status.
    """
    global _index, _examples_cache
    if _index and not force_rebuild:
        return

    if force_rebuild:
        # Delete old index and discard examples cache
        if INDEX_PATH.exists():
            INDEX_PATH.unlink()
            logger.info("Deleted old Ben Yehuda index (force rebuild)")
        _examples_cache = {}
    else:
        # Load persisted examples cache (not needed on rebuild)
        if EXAMPLES_CACHE_PATH.exists():
            with open(EXAMPLES_CACHE_PATH, encoding="utf-8") as f:
                _examples_cache = json.load(f)

    if INDEX_PATH.exists():
        _load_index()
        return

    logger.info("Downloading Ben Yehuda nikkud corpus … (this may take 2-3 minutes)")
    # With stream=True the body is fetched only when .content is read, so an
    # error status is detected before the archive is transferred. The with
    # block releases the connection even when raise_for_status() raises.
    with requests.get(CORPUS_URL, timeout=REQUEST_TIMEOUT, stream=True) as resp:
        resp.raise_for_status()
        data = resp.content
    logger.info(f"Corpus downloaded: {len(data) / 1e6:.1f} MB")

    _build_index(data)
    _save_index()
|
||||
|
||||
|
||||
def save_examples_cache() -> None:
    """Persist the in-memory examples cache to EXAMPLES_CACHE_PATH as UTF-8 JSON.

    The parent directory is created first if it does not exist.
    """
    cache_dir = EXAMPLES_CACHE_PATH.parent
    cache_dir.mkdir(parents=True, exist_ok=True)
    with EXAMPLES_CACHE_PATH.open("w", encoding="utf-8") as handle:
        json.dump(_examples_cache, handle, ensure_ascii=False)
    logger.info(f"Examples cache saved: {len(_examples_cache)} entries → {EXAMPLES_CACHE_PATH}")
|
||||
|
||||
|
||||
def get_examples(word_nikkud: str) -> list[str]:
    """
    Return a list holding at most one example sentence for ``word_nikkud``.

    The index is loaded on first use. Results are memoised in the module-level
    examples cache, keyed by the whitespace-stripped input word.

    Candidate sentences come from the index entry for the exact (nikkud) word;
    when that entry is missing or empty, the first index key whose
    nikkud-stripped form equals the stripped word is used instead.

    A candidate qualifies when the stripped word occurs in the stripped
    sentence as a whole token and the sentence length lies within
    MIN_SENTENCE_LEN..MAX_SENTENCE_LEN. The longest qualifying sentence wins.
    """
    if not _index:
        load()

    word = word_nikkud.strip()
    if word in _examples_cache:
        return _examples_cache[word]

    bare_word = _strip_nikkud(word)

    # Exact nikkud lookup first; otherwise scan for a key with the same bare form.
    sentences = _index.get(word, [])
    if not sentences and bare_word:
        sentences = next(
            (entry for key, entry in _index.items() if _strip_nikkud(key) == bare_word),
            sentences,
        )

    # Whole-token filter, done on nikkud-stripped text so vocalisation
    # variants inside the sentence still match.
    if bare_word:
        token_re = re.compile(r"(?<!\w)" + re.escape(bare_word) + r"(?!\w)")
        whole_token = [s for s in sentences if token_re.search(_strip_nikkud(s))]
    else:
        whole_token = list(sentences)

    in_range = [s for s in whole_token if MIN_SENTENCE_LEN <= len(s) <= MAX_SENTENCE_LEN]

    # Keep only the single longest sentence that survived both filters.
    result = [max(in_range, key=len)] if in_range else []

    _examples_cache[word] = result
    return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: load the index, print one example per sample word,
    # then persist whatever the lookups cached.
    logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
    load()
    for sample in ("שָׁלוֹם", "בַּיִת", "סֵפֶר", "מַיִם", "אַהֲבָה", "יֶלֶד"):
        found = get_examples(sample)
        print(f"\n{sample}: {len(found)} example(s)")
        for sentence in found:
            print(f" → {sentence[:100]}")
    save_examples_cache()
|
||||
|
|
@ -1,110 +0,0 @@
|
|||
<!DOCTYPE html>
|
||||
<html dir="rtl">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<style>
|
||||
body { font-family: 'Heebo', 'Arial Hebrew', sans-serif; background: #fff; max-width: 600px; margin: 20px auto; }
|
||||
.card-container { border: 1px solid #ccc; border-radius: 8px; margin: 20px 0; overflow: hidden; }
|
||||
.card-label { background: #333; color: #fff; padding: 6px 12px; font-size: 14px; font-family: sans-serif; direction: ltr; }
|
||||
.card-content { padding: 16px; text-align: center; }
|
||||
.card-content hr { border: none; border-top: 1px solid #ccc; margin: 12px 0; }
|
||||
|
||||
.hebrew { font-size: 48px; font-weight: bold; color: #222; direction: rtl; text-align: center; }
|
||||
.hebrew-sm { font-size: 28px; font-weight: normal; color: #222; direction: rtl; }
|
||||
.meaning { font-size: 28px; color: #1a1a8c; text-align: center; direction: ltr; margin: 4px 0; }
|
||||
.emoji-img { font-size: 48px; text-align: center; margin: 4px 0; }
|
||||
.divider { border-top: 1px solid #ccc; margin: 8px 0; }
|
||||
.sec-table { display: table; margin: 6px auto 0; direction: rtl; border-collapse: collapse; }
|
||||
.sec-label { display: table-row; font-size: 28px; font-weight: normal; color: #222; direction: rtl; }
|
||||
.sec-key { display: table-cell; font-size: 28px; color: #222; font-weight: bold; text-align: right; padding: 2px 0 2px 8px; white-space: nowrap; }
|
||||
.sec-val { display: table-cell; font-size: 28px; color: #222; text-align: right; padding: 2px 0; }
|
||||
.hint { font-size: 22px; color: #555; margin: 4px 0; direction: rtl; text-align: center; }
|
||||
.example { font-size: 24px; color: #222; padding: 6px 8px; direction: rtl; text-align: center; border-left: 3px solid #ccc; font-style: italic; margin: 6px auto; max-width: 90%; }
|
||||
.voice-label { font-size: 20px; color: #888; }
|
||||
|
||||
.more-toggle { text-align: center; direction: rtl; margin-top: 8px; }
|
||||
.more-header {
|
||||
display: inline-block; font-size: 18px; color: #555; cursor: pointer; list-style: none;
|
||||
border: 1px solid #ccc; border-radius: 16px; padding: 4px 16px; margin: 4px 0; background: #f8f8f8;
|
||||
}
|
||||
.more-header::-webkit-details-marker { display: none; }
|
||||
.more-header::before { content: "○ "; font-size: 14px; }
|
||||
details[open] > .more-header::before { content: "● "; }
|
||||
|
||||
.related-header { font-size: 22px; color: #555; text-align: center; margin: 4px 0; }
|
||||
.rw-word { display: table-cell; font-size: 28px; color: #222; font-weight: normal; text-align: right; padding: 2px 0 2px 8px; white-space: nowrap; }
|
||||
.rw-meaning { display: table-cell; font-size: 24px; color: #555; text-align: left; direction: ltr; padding: 2px 0; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<h2 style="font-family:sans-serif;direction:ltr;">Vocab: English → Hebrew (BACK) — collapsed</h2>
|
||||
<div class="card-container">
|
||||
<div class="card-label">English → Hebrew — Back (default: collapsed)</div>
|
||||
<div class="card-content">
|
||||
|
||||
<div class="meaning">time (occasion), time round; once (when used as an adverb)</div>
|
||||
<div class="emoji-img">📍</div>
|
||||
<div class="divider"></div>
|
||||
<div class="hebrew">פַּעַם</div>
|
||||
|
||||
<details class="more-toggle"><summary class="more-header">מידע נוסף</summary>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span><span class="sec-val">פעם</span></div>
|
||||
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">פ.ע.ם</span></div>
|
||||
<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span><span class="sec-val">שֵׁם עֶצֶם, נְקֵבָה</span></div>
|
||||
<div class="sec-label"><span class="sec-key">רַבִּים:</span><span class="sec-val">פְּעָמִים</span></div>
|
||||
</div>
|
||||
<div class="divider" style="margin:6px 0;"></div>
|
||||
<div class="related-header">מִילִים קְשׁוּרוֹת</div>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="rw-word">פַּעְמַיִם</span><span class="rw-meaning">twice, two times</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְפַעֵם</span><span class="rw-meaning">to surge (feeling, emotion)</span></div>
|
||||
<div class="sec-label"><span class="rw-word">פַּעֲמוֹן</span><span class="rw-meaning">bell</span></div>
|
||||
<div class="sec-label"><span class="rw-word">פְּעִימָה</span><span class="rw-meaning">heartbeat; beat; stroke (technolo…</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לִפְעֹם</span><span class="rw-meaning">to beat, to pulse, to throb</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהִתְפַּעֵם</span><span class="rw-meaning">to be excited (emotionally)</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהַפְעִים</span><span class="rw-meaning">to excite, to agitate (lit.)</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהִפָּעֵם</span><span class="rw-meaning">to be excited, to be thrilled</span></div>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h2 style="font-family:sans-serif;direction:ltr;">Same card — EXPANDED</h2>
|
||||
<div class="card-container">
|
||||
<div class="card-label">English → Hebrew — Back (expanded)</div>
|
||||
<div class="card-content">
|
||||
|
||||
<div class="meaning">time (occasion), time round; once (when used as an adverb)</div>
|
||||
<div class="emoji-img">📍</div>
|
||||
<div class="divider"></div>
|
||||
<div class="hebrew">פַּעַם</div>
|
||||
|
||||
<details class="more-toggle" open><summary class="more-header">מידע נוסף</summary>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span><span class="sec-val">פעם</span></div>
|
||||
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">פ.ע.ם</span></div>
|
||||
<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span><span class="sec-val">שֵׁם עֶצֶם, נְקֵבָה</span></div>
|
||||
<div class="sec-label"><span class="sec-key">רַבִּים:</span><span class="sec-val">פְּעָמִים</span></div>
|
||||
</div>
|
||||
<div class="divider" style="margin:6px 0;"></div>
|
||||
<div class="related-header">מִילִים קְשׁוּרוֹת</div>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="rw-word">פַּעְמַיִם</span><span class="rw-meaning">twice, two times</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְפַעֵם</span><span class="rw-meaning">to surge (feeling, emotion)</span></div>
|
||||
<div class="sec-label"><span class="rw-word">פַּעֲמוֹן</span><span class="rw-meaning">bell</span></div>
|
||||
<div class="sec-label"><span class="rw-word">פְּעִימָה</span><span class="rw-meaning">heartbeat; beat; stroke (technolo…</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לִפְעֹם</span><span class="rw-meaning">to beat, to pulse, to throb</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהִתְפַּעֵם</span><span class="rw-meaning">to be excited (emotionally)</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהַפְעִים</span><span class="rw-meaning">to excite, to agitate (lit.)</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהִפָּעֵם</span><span class="rw-meaning">to be excited, to be thrilled</span></div>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
|
|
@ -1,114 +0,0 @@
|
|||
<!DOCTYPE html>
|
||||
<html dir="rtl">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<style>
|
||||
body { font-family: 'Heebo', 'Arial Hebrew', sans-serif; background: #fff; max-width: 600px; margin: 20px auto; }
|
||||
.card-container { border: 1px solid #ccc; border-radius: 8px; margin: 20px 0; overflow: hidden; }
|
||||
.card-label { background: #333; color: #fff; padding: 6px 12px; font-size: 14px; font-family: sans-serif; direction: ltr; }
|
||||
.card-content { padding: 16px; text-align: center; }
|
||||
.card-content hr { border: none; border-top: 1px solid #ccc; margin: 12px 0; }
|
||||
.hebrew { font-size: 48px; font-weight: bold; color: #222; direction: rtl; text-align: center; }
|
||||
.hebrew-sm { font-size: 28px; font-weight: normal; color: #222; direction: rtl; }
|
||||
.meaning { font-size: 28px; color: #1a1a8c; text-align: center; direction: ltr; margin: 4px 0; }
|
||||
.hint { font-size: 22px; color: #555; margin: 4px 0; direction: rtl; text-align: center; }
|
||||
.divider { border-top: 1px solid #ccc; margin: 8px 0; }
|
||||
.sec-table { display: table; margin: 6px auto 0; direction: rtl; border-collapse: collapse; }
|
||||
.sec-label { display: table-row; font-size: 28px; font-weight: normal; color: #222; direction: rtl; }
|
||||
.sec-key { display: table-cell; font-size: 28px; color: #222; font-weight: bold; text-align: right; padding: 2px 0 2px 8px; white-space: nowrap; }
|
||||
.sec-val { display: table-cell; font-size: 28px; color: #222; text-align: right; padding: 2px 0; }
|
||||
.voice-label { font-size: 20px; color: #888; }
|
||||
|
||||
.more-toggle { text-align: center; direction: rtl; margin-top: 8px; }
|
||||
.more-header {
|
||||
display: inline-block; font-size: 18px; color: #555; cursor: pointer; list-style: none;
|
||||
border: 1px solid #ccc; border-radius: 16px; padding: 4px 16px; margin: 4px 0; background: #f8f8f8;
|
||||
}
|
||||
.more-header::-webkit-details-marker { display: none; }
|
||||
.more-header::before { content: "○ "; font-size: 14px; }
|
||||
details[open] > .more-header::before { content: "● "; }
|
||||
|
||||
.related-header { font-size: 22px; color: #555; text-align: center; margin: 4px 0; }
|
||||
.rw-word { display: table-cell; font-size: 28px; color: #222; font-weight: normal; text-align: right; padding: 2px 0 2px 8px; white-space: nowrap; }
|
||||
.rw-meaning { display: table-cell; font-size: 24px; color: #555; text-align: left; direction: ltr; padding: 2px 0; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<h2 style="font-family:sans-serif;direction:ltr;">Conjugation Card — FRONT</h2>
|
||||
<div class="card-container">
|
||||
<div class="card-label">Front</div>
|
||||
<div class="card-content">
|
||||
|
||||
<div class="hint">אֵיךְ אוֹמְרִים</div>
|
||||
<div class="hebrew">אַתָּה</div>
|
||||
<div class="hebrew" style="color:#1a1a8c;">לִשְׁמֹר <span class="hebrew-sm">(על)</span></div>
|
||||
<div class="hebrew">בַּהוֹוֶה</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h2 style="font-family:sans-serif;direction:ltr;">Conjugation Card — BACK (collapsed)</h2>
|
||||
<div class="card-container">
|
||||
<div class="card-label">Back — default state</div>
|
||||
<div class="card-content">
|
||||
|
||||
<div class="hint">אֵיךְ אוֹמְרִים</div>
|
||||
<div class="hebrew">אַתָּה</div>
|
||||
<div class="hebrew" style="color:#1a1a8c;">לִשְׁמֹר <span class="hebrew-sm">(על)</span></div>
|
||||
<div class="hebrew">בַּהוֹוֶה</div>
|
||||
<hr>
|
||||
<div class="hebrew">שׁוֹמֵר (על)</div>
|
||||
|
||||
<details class="more-toggle"><summary class="more-header">מידע נוסף</summary>
|
||||
<div class="sec-label" style="text-align:center;display:block;">to guard; to keep, to maintain</div>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">שׁ.מ.ר</span></div>
|
||||
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span><span class="sec-val">פָּעַל</span></div>
|
||||
</div>
|
||||
<div class="divider" style="margin:6px 0;"></div>
|
||||
<div class="related-header">מִילִים קְשׁוּרוֹת</div>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="rw-word">מִשְׁמָר</span><span class="rw-meaning">guard, watch; shift</span></div>
|
||||
<div class="sec-label"><span class="rw-word">שׁוֹמֵר</span><span class="rw-meaning">guard, watchman</span></div>
|
||||
<div class="sec-label"><span class="rw-word">שְׁמִירָה</span><span class="rw-meaning">guarding, watching</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהִשָּׁמֵר</span><span class="rw-meaning">to beware, to be careful</span></div>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h2 style="font-family:sans-serif;direction:ltr;">Conjugation Card — BACK (expanded)</h2>
|
||||
<div class="card-container">
|
||||
<div class="card-label">Back — expanded</div>
|
||||
<div class="card-content">
|
||||
|
||||
<div class="hint">אֵיךְ אוֹמְרִים</div>
|
||||
<div class="hebrew">אַתָּה</div>
|
||||
<div class="hebrew" style="color:#1a1a8c;">לִשְׁמֹר <span class="hebrew-sm">(על)</span></div>
|
||||
<div class="hebrew">בַּהוֹוֶה</div>
|
||||
<hr>
|
||||
<div class="hebrew">שׁוֹמֵר (על)</div>
|
||||
|
||||
<details class="more-toggle" open><summary class="more-header">מידע נוסף</summary>
|
||||
<div class="sec-label" style="text-align:center;display:block;">to guard; to keep, to maintain</div>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">שׁ.מ.ר</span></div>
|
||||
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span><span class="sec-val">פָּעַל</span></div>
|
||||
</div>
|
||||
<div class="divider" style="margin:6px 0;"></div>
|
||||
<div class="related-header">מִילִים קְשׁוּרוֹת</div>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="rw-word">מִשְׁמָר</span><span class="rw-meaning">guard, watch; shift</span></div>
|
||||
<div class="sec-label"><span class="rw-word">שׁוֹמֵר</span><span class="rw-meaning">guard, watchman</span></div>
|
||||
<div class="sec-label"><span class="rw-word">שְׁמִירָה</span><span class="rw-meaning">guarding, watching</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהִשָּׁמֵר</span><span class="rw-meaning">to beware, to be careful</span></div>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
679
conjugation_extract.py
Executable file
679
conjugation_extract.py
Executable file
|
|
@ -0,0 +1,679 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract Hebrew verb conjugations from pealim.com.
|
||||
Input: verbs_input.txt (one Hebrew infinitive per line;
|
||||
lines starting with '# 3ms:' search by 3ms past form for Pu'al/Huf'al)
|
||||
Output: data/conjugations.json
|
||||
|
||||
For each verb:
|
||||
1. Search pealim.com/search/?q=<verb> to find URL slug
|
||||
2. Fetch /dict/<slug>/ with hebstyle=mo cookie
|
||||
3. Parse conjugation table by row labels
|
||||
4. Capture audio URLs per form
|
||||
5. Parse passive (Pu'al/Huf'al) forms from the same page
|
||||
|
||||
Resume-safe: verbs already in conjugations.json are skipped.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
import unicodedata
|
||||
import urllib.parse
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Base URL for every pealim.com request built in this module.
PEALIM_BASE = "https://www.pealim.com"
# NOTE(review): presumably the pause in seconds between requests — its use is
# outside this section; confirm against the main loop.
REQUEST_DELAY = 1.5
# Passed as `timeout=` to every session.get() call below.
REQUEST_TIMEOUT = 15
VERBS_INPUT = Path(__file__).parent / "verbs_input.txt"
CONJUGATIONS_PATH = Path(__file__).parent / "data" / "conjugations.json"
# First dictionary CSV that exists on disk, checked in this order; when neither
# exists the hebrew_dict_for_anki.csv path is used as the (missing) default.
DICT_CSV = next(
    filter(
        Path.exists,
        (
            Path(__file__).parent / "data" / "hebrew_dict_for_anki.csv",
            Path(__file__).parent / "data" / "pealim_dict_for_anki.csv",
        ),
    ),
    Path(__file__).parent / "data" / "hebrew_dict_for_anki.csv",
)
|
||||
|
||||
# Pronoun labels (for card front display)
|
||||
PRONOUN_LABELS = {
    # Present-tense keys carry no pronoun label.
    **dict.fromkeys(("present_ms", "present_fs", "present_mp", "present_fp"), ""),
    # Past
    "past_1s": "אֲנִי",
    "past_1p": "אֲנַחְנוּ",
    "past_2ms": "אַתָּה",
    "past_2fs": "אַתְּ",
    "past_2mp": "אַתֶּם",
    "past_2fp": "אַתֶּן",
    "past_3ms": "הוּא",
    "past_3fs": "הִיא",
    "past_3p": "הֵם / הֵן",
    # Future
    "future_1s": "אֲנִי",
    "future_1p": "אֲנַחְנוּ",
    "future_2ms": "אַתָּה",
    "future_2fs": "אַתְּ",
    "future_2mp": "אַתֶּם",
    "future_2fp": "אַתֶּן",
    "future_3ms": "הוּא",
    "future_3fs": "הִיא",
    "future_3mp": "הֵם",
    "future_3fp": "הֵן",
    # Imperative
    "imperative_ms": "אַתָּה",
    "imperative_fs": "אַתְּ",
    "imperative_mp": "אַתֶּם",
    "imperative_fp": "אַתֶּן",
    # The infinitive carries no pronoun label either.
    "infinitive": "",
}
|
||||
|
||||
# Human-readable tense description for card front
|
||||
# One Hebrew tense label per form key; every key of a tense shares its label.
# Generated from (tense prefix, label, person suffixes) groups so that key
# order matches the order in which the groups and suffixes are listed.
TENSE_DESCRIPTION = {
    f"{tense}_{person}": label
    for tense, label, persons in (
        ("present", "הוֹוֶה", ("ms", "fs", "mp", "fp")),
        ("past", "עָבָר", ("1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3p")),
        ("future", "עָתִיד", ("1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp")),
        ("imperative", "צִוּוּי", ("ms", "fs", "mp", "fp")),
    )
    for person in persons
}
TENSE_DESCRIPTION["infinitive"] = "מְקוֹר"
|
||||
|
||||
# Canonical binyan names; substring-matched against page headings in this
# order, so the first name found in a heading wins.
BINYAN_NAMES: tuple[str, ...] = (
    "Pa'al",
    "Nif'al",
    "Pi'el",
    "Pu'al",
    "Hitpa'el",
    "Hif'il",
    "Huf'al",
)
|
||||
|
||||
# Shared HTTP session so every pealim.com request sends the same User-Agent.
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (compatible; pealim-anki/2.0)"
|
||||
|
||||
|
||||
def _strip_nikkud(text: str) -> str:
|
||||
"""Remove Hebrew nikkud (diacritics) from a string."""
|
||||
return "".join(
|
||||
ch for ch in unicodedata.normalize("NFD", text)
|
||||
if unicodedata.category(ch) != "Mn"
|
||||
)
|
||||
|
||||
|
||||
def _build_pos_lookup() -> dict[str, str]:
    """Build a nikkud-stripped word → part-of-speech string lookup from DICT_CSV.

    The stored value is the raw "Part of speech" cell (e.g. a string that
    mentions the binyan); mapping it to a canonical binyan name is left to the
    caller.

    The CSV is first read as ';'-separated; if that fails to parse or yields
    fewer than 3 columns it is re-read with the default ',' separator.

    Returns:
        The lookup dict. It is empty when DICT_CSV does not exist, and may be
        empty or partial when reading fails (the error is logged at debug
        level rather than raised, since the lookup is only a best-effort hint).
    """
    lookup: dict[str, str] = {}
    if not DICT_CSV.exists():
        return lookup

    try:
        import pandas as pd

        try:
            df = pd.read_csv(DICT_CSV, sep=";", index_col=0)
            if df.shape[1] < 3:
                raise ValueError("too few columns")
        except (ValueError, pd.errors.ParserError):
            df = pd.read_csv(DICT_CSV, index_col=0)

        for _, row in df.iterrows():
            raw_word = row.get("Word", "")
            raw_pos = row.get("Part of speech", row.get("Part of Speech", ""))
            # Skip rows with a missing cell explicitly. Testing the stringified
            # value for the substring "nan" would also drop legitimate values
            # that merely contain those letters, and would let a missing word
            # through as the literal key "nan".
            if pd.isna(raw_word) or pd.isna(raw_pos):
                continue
            word = str(raw_word).strip()
            pos = str(raw_pos).strip()
            if word and pos:
                lookup[_strip_nikkud(word)] = pos
    except Exception as e:
        logger.debug(f"Could not load PoS lookup: {e}")

    return lookup
|
||||
|
||||
|
||||
# Cache PoS lookup (built once)
|
||||
_pos_lookup: dict[str, str] | None = None
|
||||
|
||||
|
||||
def _get_pos_lookup() -> dict[str, str]:
    """Return the module-level PoS lookup, building it on the first call only."""
    global _pos_lookup
    if _pos_lookup is not None:
        return _pos_lookup
    _pos_lookup = _build_pos_lookup()
    return _pos_lookup
|
||||
|
||||
|
||||
def _binyan_from_pos(word: str) -> str:
    """Return the canonical binyan name found in the PoS entry for ``word``.

    ``word`` is nikkud-stripped before the lookup. The PoS string is matched
    case-insensitively against the known spellings of each binyan (with and
    without the apostrophe), in the order listed below. Returns "" when the
    word has no PoS entry or no spelling matches.
    """
    pos_lower = _get_pos_lookup().get(_strip_nikkud(word), "").lower()
    if not pos_lower:
        return ""

    # Canonical name → lowercase spellings; dict order is the match priority.
    spellings = {
        "Pa'al": ("pa'al", "paal"),
        "Nif'al": ("nif'al", "nifal"),
        "Pi'el": ("pi'el", "piel"),
        "Pu'al": ("pu'al", "pual"),
        "Hitpa'el": ("hitpa'el", "hitpael"),
        "Hif'il": ("hif'il", "hifil"),
        "Huf'al": ("huf'al", "hufal"),
    }
    for canonical, variants in spellings.items():
        for variant in variants:
            if variant in pos_lower:
                return canonical
    return ""
|
||||
|
||||
|
||||
def _find_slug(query: str) -> str | None:
    """Return the first ``/dict/<slug>/`` slug on the pealim.com search page for ``query``.

    A slug is a run of digits, a hyphen, then any characters up to the next
    slash. Returns None when the page contains no such link or the request
    fails (the failure is logged, not raised).
    """
    search_url = f"{PEALIM_BASE}/search/?q={urllib.parse.quote(query)}"
    try:
        response = session.get(search_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        first_hit = re.search(r"/dict/(\d+[-][^/?\"<>\s]+)/", response.text)
        if first_hit:
            slug = first_hit.group(1)
            logger.info(f" Slug: {slug}")
            return slug
    except Exception as e:
        logger.error(f" Error searching for '{query}': {e}")
    return None
|
||||
|
||||
|
||||
def _is_passive_binyan(binyan: str) -> bool:
|
||||
"""Return True if the binyan is a passive (Pu'al or Huf'al)."""
|
||||
return any(marker in binyan for marker in ("פֻּעַל", "הֻפְעַל", "Pu'al", "Huf'al"))
|
||||
|
||||
|
||||
def _get_menukad(cell) -> tuple[str, str]:
    """
    Pull the vocalised Hebrew form and its audio URL out of one table cell.

    The audio URL is the ``data-audio`` attribute of the first <span> whose
    class contains "audio-play" ("" when there is none). The form text is the
    text of the first ``span.menukad``; without such a span the cell's own
    text is used, but only if it contains a Hebrew letter.

    Returns (form_text, audio_url); form_text is "" when nothing qualifies.
    """
    player = cell.find("span", class_=lambda c: c and "audio-play" in c)
    audio_url = player.get("data-audio", "") if player else ""

    menukad = cell.find("span", class_="menukad")
    if menukad:
        return menukad.get_text(strip=True), audio_url

    # No dedicated span: accept the raw cell text only when it looks Hebrew.
    cell_text = cell.get_text(strip=True)
    if not re.search(r"[\u05d0-\u05ea]", cell_text):
        cell_text = ""
    return cell_text, audio_url
|
||||
|
||||
|
||||
def _parse_table(soup: BeautifulSoup, passive: bool = False, table_el=None) -> dict[str, dict]:
    """
    Map conjugation form keys to ``{"form", "audio_url"}`` dicts for one table.

    Table selection:
      * passive=True – the first ``table.conjugation-table`` that follows the
        first <h3> whose text contains "passive" (case-insensitive).
      * table_el given (and passive=False) – that element is parsed directly.
      * otherwise – the first ``table.conjugation-table`` in ``soup``.

    Rows are located by the first row whose text mentions each tense name;
    past and future occupy three consecutive rows (1st, 2nd, 3rd person).

    Returns {} when no table is found or the table has fewer than 9 rows.
    """
    if passive:
        passive_heading = next(
            (h3 for h3 in soup.find_all("h3") if "passive" in h3.get_text(strip=True).lower()),
            None,
        )
        if not passive_heading:
            return {}
        # First conjugation table anywhere after the passive heading.
        table = next(
            (
                node
                for node in passive_heading.find_all_next()
                if node.name == "table" and "conjugation-table" in node.get("class", [])
            ),
            None,
        )
        if not table:
            return {}
    elif table_el is not None:
        table = table_el
    else:
        table = soup.find("table", class_="conjugation-table")

    if not table:
        return {}

    rows = table.find_all("tr")
    if len(rows) < 9:
        return {}

    def hebrew_cells(row_idx: int) -> list[tuple[str, str]]:
        """Collect (form, audio_url) for each Hebrew cell, repeated per colspan."""
        pairs: list[tuple[str, str]] = []
        for cell in rows[row_idx].find_all(["th", "td"]):
            text, audio_url = _get_menukad(cell)
            span_width = int(cell.get("colspan", 1))
            if text and re.search(r"[\u05d0-\u05ea]", text):
                pairs.extend([(text, audio_url)] * span_width)
        return pairs

    def unique_by_form(pairs: list[tuple[str, str]]) -> list[tuple[str, str]]:
        """Drop pairs whose form text was already seen, keeping the first."""
        seen_forms: set[str] = set()
        kept: list[tuple[str, str]] = []
        for form_text, audio_url in pairs:
            if form_text in seen_forms:
                continue
            seen_forms.add(form_text)
            kept.append((form_text, audio_url))
        return kept

    # First row mentioning each tense; a row is claimed by the first tense in
    # this order that it mentions and that has no row yet.
    tense_names = ("present", "past", "future", "imperative", "infinitive")
    first_row: dict[str, int] = {}
    for idx, row in enumerate(rows):
        label = row.get_text(" ", strip=True).lower()
        for tense in tense_names:
            if tense in label and tense not in first_row:
                first_row[tense] = idx
                break

    # (tense, row offset from the tense's first row, dedupe?, keys in column order)
    # Deduplication undoes the colspan repetition for rows where one cell
    # covers several columns.
    layout = (
        ("present", 0, False, ("present_ms", "present_fs", "present_mp", "present_fp")),
        ("past", 0, True, ("past_1s", "past_1p")),
        ("past", 1, False, ("past_2ms", "past_2fs", "past_2mp", "past_2fp")),
        ("past", 2, True, ("past_3ms", "past_3fs", "past_3p")),
        ("future", 0, True, ("future_1s", "future_1p")),
        ("future", 1, False, ("future_2ms", "future_2fs", "future_2mp", "future_2fp")),
        ("future", 2, False, ("future_3ms", "future_3fs", "future_3mp", "future_3fp")),
        ("imperative", 0, False, ("imperative_ms", "imperative_fs", "imperative_mp", "imperative_fp")),
        ("infinitive", 0, False, ("infinitive",)),
    )

    forms: dict[str, dict] = {}
    for tense, offset, dedupe, keys in layout:
        if tense not in first_row:
            continue
        row_idx = first_row[tense] + offset
        if row_idx >= len(rows):
            continue
        cells = hebrew_cells(row_idx)
        if dedupe:
            cells = unique_by_form(cells)
        # zip stops at the shorter side, so missing cells simply leave keys unset.
        for key, (form_text, audio_url) in zip(keys, cells):
            if form_text:
                forms[key] = {"form": form_text, "audio_url": audio_url}

    return forms
|
||||
|
||||
|
||||
def _extract_binyan_from_page(soup: BeautifulSoup) -> str:
    """Return the first BINYAN_NAMES entry mentioned on the page, or "".

    Texts are searched in this order: each ``h3.page-header`` in document
    order, then the ``og:description`` meta content. Within one text the
    names are tried in BINYAN_NAMES order.
    """
    texts = [h3.get_text(" ", strip=True) for h3 in soup.find_all("h3", class_="page-header")]
    og_description = soup.find("meta", {"property": "og:description"})
    if og_description:
        texts.append(og_description.get("content", ""))

    return next(
        (bname for text in texts for bname in BINYAN_NAMES if bname in text),
        "",
    )
|
||||
|
||||
|
||||
def _extract_passive_binyan_from_page(soup: BeautifulSoup) -> str:
    """Return "Pu'al" or "Huf'al" as named by a "passive" <h3> heading, or "".

    For every <h3> whose text contains "passive" (case-insensitive), the
    heading text is checked first and then the text of its ``span.small``
    child, if any. The first heading that names a passive binyan decides.
    """
    passive_names = ("Pu'al", "Huf'al")
    for heading in soup.find_all("h3"):
        heading_text = heading.get_text(" ", strip=True)
        if "passive" not in heading_text.lower():
            continue

        sources = [heading_text]
        small = heading.find("span", class_="small")
        if small:
            sources.append(small.get_text(strip=True))

        for source in sources:
            for name in passive_names:
                if name in source:
                    return name
    return ""
|
||||
|
||||
|
||||
def _extract_conjugations(
    slug: str,
    search_term: str,
    is_3ms_search: bool = False,
    binyan_hint: str = "",
) -> dict | None:
    """Fetch ``/dict/<slug>/`` and build the conjugation record for one verb.

    The page is requested with the ``hebstyle=mo`` cookie. The record holds
    the search term (under "infinitive"), slug, root, binyan, passive flag,
    a reference form and the forms of the first conjugation table. When a
    second non-passive table exists, forms that differ from the first table
    are added under "alternate_forms".

    Binyan resolution order: PoS lookup of ``search_term`` (skipped when
    ``is_3ms_search``), then the page header, then ``binyan_hint``.

    Returns None when the request fails or the first table yields no forms.
    """
    page_url = f"{PEALIM_BASE}/dict/{slug}/"
    try:
        response = session.get(page_url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except Exception as e:
        logger.error(f" Error fetching {page_url}: {e}")
        return None

    soup = BeautifulSoup(response.text, "lxml")

    # Root: first menukad span that contains a Hebrew letter and a hyphen.
    root = next(
        (
            text
            for text in (span.get_text(strip=True) for span in soup.find_all("span", class_="menukad"))
            if text and re.search(r"[\u05d0-\u05ea]", text) and "-" in text
        ),
        "",
    )

    # `or` short-circuits, so each later source is consulted only when the
    # earlier ones produced nothing.
    binyan = (
        ("" if is_3ms_search else _binyan_from_pos(search_term))
        or _extract_binyan_from_page(soup)
        or binyan_hint
    )

    primary_forms = _parse_table(soup, passive=False)
    if not primary_forms:
        logger.warning(f" No forms found for {slug}")
        return None

    is_passive = _is_passive_binyan(binyan)

    # Passive verbs are referenced by their 3ms past form, all others by
    # their infinitive; the search term is the fallback in both cases.
    reference_key = "past_3ms" if is_passive else "infinitive"
    reference_form = primary_forms.get(reference_key, {}).get("form", "") or search_term

    result = {
        "infinitive": search_term,
        "slug": slug,
        "root": root,
        "binyan": binyan,
        "is_passive": is_passive,
        "reference_form": reference_form,
        "forms": {
            key: {
                "form": entry["form"],
                "audio_url": entry.get("audio_url", ""),
                "pronoun": PRONOUN_LABELS[key],
                "tense": TENSE_DESCRIPTION.get(key, ""),
            }
            for key, entry in primary_forms.items()
            if key in PRONOUN_LABELS
        },
    }

    # Look for a second conjugation table (alternate paradigm). Tables that
    # follow the "Passive" heading are excluded from the count.
    passive_heading = next(
        (h for h in soup.find_all("h3") if "passive" in h.get_text(strip=True).lower()),
        None,
    )
    passive_tables = (
        passive_heading.find_all_next("table", class_="conjugation-table") if passive_heading else []
    )
    active_tables = [
        table
        for table in soup.find_all("table", class_="conjugation-table")
        if not any(table is excluded for excluded in passive_tables)
    ]

    if len(active_tables) >= 2:
        second_forms = _parse_table(soup, passive=False, table_el=active_tables[1])
        # Keep only forms that actually differ from the primary table.
        alternate_forms = {
            key: entry["form"]
            for key, entry in second_forms.items()
            if key in PRONOUN_LABELS
            and entry["form"]
            and entry["form"] != primary_forms.get(key, {}).get("form", "")
        }
        if alternate_forms:
            result["alternate_forms"] = alternate_forms
            logger.info(f" Found {len(alternate_forms)} alternate forms for {search_term}")

    logger.info(f" Extracted {len(result['forms'])} forms for {search_term}")
    return result
|
||||
|
||||
|
||||
def _load_conjugations() -> dict:
    """Return the conjugations stored at ``CONJUGATIONS_PATH``.

    The file is read as UTF-8 JSON. When the file does not exist an empty
    dict is returned instead.
    """
    if not CONJUGATIONS_PATH.exists():
        return {}
    with open(CONJUGATIONS_PATH, encoding="utf-8") as cache_file:
        return json.load(cache_file)
|
||||
|
||||
|
||||
def _save_conjugations(data: dict) -> None:
    """Write *data* to ``CONJUGATIONS_PATH`` as UTF-8 JSON.

    The parent directory is created first when it is missing. The JSON is
    indented by two spaces and non-ASCII characters are written unescaped.
    """
    cache_dir = CONJUGATIONS_PATH.parent
    cache_dir.mkdir(parents=True, exist_ok=True)
    with open(CONJUGATIONS_PATH, mode="w", encoding="utf-8") as cache_file:
        json.dump(data, cache_file, ensure_ascii=False, indent=2)
|
||||
|
||||
|
||||
def _extract_passive_from_active_slug(active_slug: str, search_term: str, binyan_hint: str = "") -> dict | None:
    """Fetch an active verb's page and build an entry from its passive table only.

    Used for Pu'al/Huf'al 3ms entries where the active verb's slug is known.

    Args:
        active_slug: Slug of the active verb's dictionary page.
        search_term: Term the caller is looking up; stored as ``"infinitive"``
            and used as the ``"reference_form"`` fallback when the page
            yields no active infinitive.
        binyan_hint: Binyan name used only when neither the page nor the
            active binyan determines the passive binyan.

    Returns:
        The entry dict, or ``None`` when the page cannot be fetched or the
        page has no passive forms.
    """
    url = f"{PEALIM_BASE}/dict/{active_slug}/"
    try:
        response = session.get(url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except Exception as e:
        # Best effort: a failed fetch is logged and reported as "no data".
        logger.error(f" Error fetching {url}: {e}")
        return None

    soup = BeautifulSoup(response.text, "lxml")

    # Root: first "menukad" span whose text has a Hebrew letter and a hyphen.
    span_texts = (span.get_text(strip=True) for span in soup.find_all("span", class_="menukad"))
    root = next(
        (text for text in span_texts if text and re.search(r"[\u05d0-\u05ea]", text) and "-" in text),
        "",
    )

    active_binyan = _extract_binyan_from_page(soup)
    active_forms_raw = _parse_table(soup, passive=False)
    active_infinitive = active_forms_raw.get("infinitive", {}).get("form", "")

    passive_forms_raw = _parse_table(soup, passive=True)
    if not passive_forms_raw:
        logger.warning(f" No passive forms found on {active_slug} for {search_term}")
        return None

    # Passive binyan: page value first, then derived from the active binyan,
    # and finally the caller-supplied hint.
    passive_binyan = _extract_passive_binyan_from_page(soup)
    if not passive_binyan:
        if active_binyan == "Pi'el":
            passive_binyan = "Pu'al"
        elif active_binyan == "Hif'il":
            passive_binyan = "Huf'al"
        else:
            passive_binyan = binyan_hint

    # Only keys that have a pronoun label are kept.
    labelled_forms = {
        key: {
            "form": form_data["form"],
            "audio_url": form_data.get("audio_url", ""),
            "pronoun": PRONOUN_LABELS[key],
            "tense": TENSE_DESCRIPTION.get(key, ""),
        }
        for key, form_data in passive_forms_raw.items()
        if key in PRONOUN_LABELS
    }

    result = {
        "infinitive": search_term,
        "slug": active_slug,
        "root": root,
        "binyan": passive_binyan,
        "is_passive": True,
        "reference_form": active_infinitive or search_term,
        "forms": labelled_forms,
    }

    logger.info(f" Extracted {len(result['forms'])} passive forms for {search_term} from {active_slug}")
    return result
|
||||
|
||||
|
||||
def main(verbs_file: Path = VERBS_INPUT) -> dict:
    """Extract conjugations for the verbs listed in *verbs_file*.

    Recognised lines (after stripping whitespace):

    * ``# slug: VERB SLUG`` - use SLUG for VERB instead of searching for it.
    * ``# 3ms: FORM [ACTIVE_SLUG]`` - passive entry; its forms are taken from
      the passive section of the active verb's page.
    * any other ``#`` line - a comment; when it mentions a binyan name it
      sets the binyan hint for the entries that follow.
    * anything else - a verb to look up.

    Verbs already present in the stored conjugations are skipped. The
    conjugations are saved after every processed verb, and a verb whose slug
    cannot be found is stored as ``None``.

    Returns:
        The full conjugations dict. When *verbs_file* does not exist, the
        stored conjugations are returned unchanged.
    """
    if not verbs_file.exists():
        logger.warning(f"verbs_input.txt not found at {verbs_file} — skipping")
        return _load_conjugations()

    entries = [line.strip() for line in verbs_file.read_text(encoding="utf-8").splitlines()]

    slug_prefix = "# slug:"
    passive_prefix = "# 3ms:"

    # Slug overrides may appear anywhere in the file; a later line for the
    # same verb replaces an earlier one.
    slug_overrides: dict[str, str] = {}
    for entry in entries:
        if not entry.startswith(slug_prefix):
            continue
        fields = entry[len(slug_prefix):].split()
        if len(fields) >= 2:
            slug_overrides[fields[0]] = fields[1]

    # Section header keyword -> binyan name (used as the binyan_hint fallback).
    SECTION_BINYAN = {
        "pa'al": "Pa'al",
        "nif'al": "Nif'al",
        "pi'el": "Pi'el",
        "pu'al": "Pu'al",
        "hitpa'el": "Hitpa'el",
        "hif'il": "Hif'il",
        "huf'al": "Huf'al",
    }

    # Each item: (search_term, is_3ms_search, active_slug, binyan_hint)
    verbs: list[tuple[str, bool, str | None, str]] = []
    current_binyan_hint = ""
    for entry in entries:
        if not entry or entry.startswith(slug_prefix):
            continue
        if entry.startswith(passive_prefix):
            fields = entry[len(passive_prefix):].split()
            if fields:
                form, *extra = fields
                verbs.append((form, True, extra[0] if extra else None, current_binyan_hint))
            continue
        if not entry.startswith("#"):
            verbs.append((entry, False, None, current_binyan_hint))
            continue
        # Plain comment: the first matching keyword sets the binyan context.
        lowered = entry.lower()
        section = next((name for key, name in SECTION_BINYAN.items() if key in lowered), None)
        if section is not None:
            current_binyan_hint = section

    passive_total = sum(1 for item in verbs if item[1])
    logger.info(f"Loaded {len(verbs)} verbs from {verbs_file} ({passive_total} passive 3ms)")
    if slug_overrides:
        logger.info(f" Slug overrides: {slug_overrides}")

    conjugations = _load_conjugations()
    new_count = 0

    for verb, is_3ms, active_slug, binyan_hint in verbs:
        if verb in conjugations:
            logger.info(f"Skipping {verb} (cached)")
            continue

        logger.info(f"Processing: {verb} {'(3ms search)' if is_3ms else ''}")
        time.sleep(REQUEST_DELAY)

        # A 3ms line may carry its own active slug; a regular verb may have a
        # "# slug:" override. Otherwise the slug has to be searched for.
        known_slug = active_slug if is_3ms else slug_overrides.get(verb)
        if known_slug:
            slug = known_slug
            if is_3ms:
                logger.info(f" Using active slug {slug} for passive extraction")
            else:
                logger.info(f" Slug override: {slug}")
        else:
            slug = _find_slug(verb)
            if not slug:
                logger.warning(f" No slug found for {verb}")
                conjugations[verb] = None
                _save_conjugations(conjugations)
                continue
            if is_3ms:
                logger.info(f" Found active slug {slug} for passive extraction")
            # Pause again between the search request and the page request.
            time.sleep(REQUEST_DELAY)

        if is_3ms:
            data = _extract_passive_from_active_slug(slug, verb, binyan_hint=binyan_hint)
        else:
            data = _extract_conjugations(slug, verb, is_3ms_search=False, binyan_hint=binyan_hint)

        conjugations[verb] = data
        _save_conjugations(conjugations)
        new_count += 1

    logger.info(f"Done: {new_count} new verbs processed")
    return conjugations
|
||||
|
||||
|
||||
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
    result = main()
    # Print a short per-verb summary with one sample audio URL.
    for verb, data in result.items():
        if not data:
            print(f"{verb}: no data")
            continue
        forms = data.get("forms", {})
        print(f"{verb}: {len(forms)} forms, binyan={data.get('binyan')}")
        # An empty mapping simply yields the {} default.
        sample_form = next(iter(forms.values()), {})
        print(f" sample audio_url: {sample_form.get('audio_url', 'MISSING')[:60]}")
|
||||
|
|
@ -175,8 +175,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to guard; to keep, to maintain (על)"
|
||||
}
|
||||
},
|
||||
"ללמוד": {
|
||||
"infinitive": "ללמוד",
|
||||
|
|
@ -354,8 +353,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to learn, to study"
|
||||
}
|
||||
},
|
||||
"לאסוף": {
|
||||
"infinitive": "לאסוף",
|
||||
|
|
@ -533,8 +531,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to collect, to pick up, to reap"
|
||||
}
|
||||
},
|
||||
"לעבוד": {
|
||||
"infinitive": "לעבוד",
|
||||
|
|
@ -712,8 +709,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to work; to operate, to function"
|
||||
}
|
||||
},
|
||||
"לחבוש": {
|
||||
"infinitive": "לחבוש",
|
||||
|
|
@ -891,8 +887,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to bandage; to put on (a hat)"
|
||||
}
|
||||
},
|
||||
"לאכול": {
|
||||
"infinitive": "לאכול",
|
||||
|
|
@ -1070,8 +1065,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to eat"
|
||||
}
|
||||
},
|
||||
"לשאול": {
|
||||
"infinitive": "לשאול",
|
||||
|
|
@ -1249,8 +1243,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to ask; to borrow"
|
||||
}
|
||||
},
|
||||
"לשלוח": {
|
||||
"infinitive": "לשלוח",
|
||||
|
|
@ -1428,8 +1421,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to send, to dispatch"
|
||||
}
|
||||
},
|
||||
"לגבוה": {
|
||||
"infinitive": "לגבוה",
|
||||
|
|
@ -1607,8 +1599,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to be high, exalted"
|
||||
}
|
||||
},
|
||||
"לשבת": {
|
||||
"infinitive": "לשבת",
|
||||
|
|
@ -1786,8 +1777,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to sit, to settle"
|
||||
}
|
||||
},
|
||||
"לרשת": {
|
||||
"infinitive": "לרשת",
|
||||
|
|
@ -1965,8 +1955,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to inherit"
|
||||
}
|
||||
},
|
||||
"לִיפּוֹל": {
|
||||
"infinitive": "לִיפּוֹל",
|
||||
|
|
@ -2144,8 +2133,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to fall, to drop"
|
||||
}
|
||||
},
|
||||
"לקום": {
|
||||
"infinitive": "לקום",
|
||||
|
|
@ -2323,8 +2311,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to get up, to stand up, to arise; to be established, to come into being"
|
||||
}
|
||||
},
|
||||
"לחון": {
|
||||
"infinitive": "לחון",
|
||||
|
|
@ -2502,8 +2489,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to pardon, to amnesty; to endow"
|
||||
}
|
||||
},
|
||||
"לקרוא": {
|
||||
"infinitive": "לקרוא",
|
||||
|
|
@ -2681,8 +2667,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to read (ב-, את); to call (ל-)"
|
||||
}
|
||||
},
|
||||
"לקנות": {
|
||||
"infinitive": "לקנות",
|
||||
|
|
@ -2860,8 +2845,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to buy, to purchase"
|
||||
}
|
||||
},
|
||||
"להיבדק": {
|
||||
"infinitive": "להיבדק",
|
||||
|
|
@ -3039,8 +3023,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to be tested, examined"
|
||||
}
|
||||
},
|
||||
"להרדם": {
|
||||
"infinitive": "להרדם",
|
||||
|
|
@ -3218,8 +3201,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to fall asleep, to doze off"
|
||||
}
|
||||
},
|
||||
"להיהרג": {
|
||||
"infinitive": "להיהרג",
|
||||
|
|
@ -3397,8 +3379,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to be killed"
|
||||
}
|
||||
},
|
||||
"להחקר": {
|
||||
"infinitive": "להחקר",
|
||||
|
|
@ -3576,8 +3557,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to be investigated, explored"
|
||||
}
|
||||
},
|
||||
"להישאר": {
|
||||
"infinitive": "להישאר",
|
||||
|
|
@ -3755,8 +3735,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to remain"
|
||||
}
|
||||
},
|
||||
"להיפגע": {
|
||||
"infinitive": "להיפגע",
|
||||
|
|
@ -3934,8 +3913,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to be damaged, to be injured, to be wounded; to be insulted, to be offended"
|
||||
}
|
||||
},
|
||||
"להיוולד": {
|
||||
"infinitive": "להיוולד",
|
||||
|
|
@ -4113,8 +4091,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to be born"
|
||||
}
|
||||
},
|
||||
"להנצל": {
|
||||
"infinitive": "להנצל",
|
||||
|
|
@ -4292,8 +4269,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to be saved, to be rescued, to survive"
|
||||
}
|
||||
},
|
||||
"להיסוג": {
|
||||
"infinitive": "להיסוג",
|
||||
|
|
@ -4471,8 +4447,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to withdraw, to retreat"
|
||||
}
|
||||
},
|
||||
"להימצא": {
|
||||
"infinitive": "להימצא",
|
||||
|
|
@ -4650,8 +4625,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to be found, discovered; to be present, to be located"
|
||||
}
|
||||
},
|
||||
"להיבנות": {
|
||||
"infinitive": "להיבנות",
|
||||
|
|
@ -4829,8 +4803,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to be built, constructed"
|
||||
}
|
||||
},
|
||||
"לדבר": {
|
||||
"infinitive": "לדבר",
|
||||
|
|
@ -5157,8 +5130,7 @@
|
|||
"tense": "עָתִיד"
|
||||
}
|
||||
}
|
||||
},
|
||||
"meaning": "to speak, to talk"
|
||||
}
|
||||
},
|
||||
"לברך": {
|
||||
"infinitive": "לברך",
|
||||
|
|
@ -5485,8 +5457,7 @@
|
|||
"tense": "עָתִיד"
|
||||
}
|
||||
}
|
||||
},
|
||||
"meaning": "to bless, to greet, to felicitate"
|
||||
}
|
||||
},
|
||||
"לנהל": {
|
||||
"infinitive": "לנהל",
|
||||
|
|
@ -5813,8 +5784,7 @@
|
|||
"tense": "עָתִיד"
|
||||
}
|
||||
}
|
||||
},
|
||||
"meaning": "to manage, to organize"
|
||||
}
|
||||
},
|
||||
"לנצח": {
|
||||
"infinitive": "לנצח",
|
||||
|
|
@ -6141,8 +6111,7 @@
|
|||
"tense": "עָתִיד"
|
||||
}
|
||||
}
|
||||
},
|
||||
"meaning": "to win; to overcome, to beat; to conduct, to orchestrate"
|
||||
}
|
||||
},
|
||||
"לקומם": {
|
||||
"infinitive": "לקומם",
|
||||
|
|
@ -6469,8 +6438,7 @@
|
|||
"tense": "עָתִיד"
|
||||
}
|
||||
}
|
||||
},
|
||||
"meaning": "to outrage, to anger"
|
||||
}
|
||||
},
|
||||
"למלא": {
|
||||
"infinitive": "למלא",
|
||||
|
|
@ -6797,8 +6765,7 @@
|
|||
"tense": "עָתִיד"
|
||||
}
|
||||
}
|
||||
},
|
||||
"meaning": "to fill; to fill out; to fulfil"
|
||||
}
|
||||
},
|
||||
"לחכות": {
|
||||
"infinitive": "לחכות",
|
||||
|
|
@ -7125,8 +7092,7 @@
|
|||
"tense": "עָתִיד"
|
||||
}
|
||||
}
|
||||
},
|
||||
"meaning": "to await, to wait for (ל-)"
|
||||
}
|
||||
},
|
||||
"לגלגל": {
|
||||
"infinitive": "לגלגל",
|
||||
|
|
@ -7453,8 +7419,7 @@
|
|||
"tense": "עָתִיד"
|
||||
}
|
||||
}
|
||||
},
|
||||
"meaning": "to roll, to revolve (transitive)"
|
||||
}
|
||||
},
|
||||
"להתלבש": {
|
||||
"infinitive": "להתלבש",
|
||||
|
|
@ -7632,8 +7597,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to dress oneself"
|
||||
}
|
||||
},
|
||||
"להסתלק": {
|
||||
"infinitive": "להסתלק",
|
||||
|
|
@ -7811,8 +7775,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to leave, to go away"
|
||||
}
|
||||
},
|
||||
"להצטלם": {
|
||||
"infinitive": "להצטלם",
|
||||
|
|
@ -7990,8 +7953,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to pose for a photograph, to be photographed"
|
||||
}
|
||||
},
|
||||
"להזדקק": {
|
||||
"infinitive": "להזדקק",
|
||||
|
|
@ -8169,8 +8131,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to need, to require (ל-)"
|
||||
}
|
||||
},
|
||||
"להתנהג": {
|
||||
"infinitive": "להתנהג",
|
||||
|
|
@ -8348,8 +8309,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to behave"
|
||||
}
|
||||
},
|
||||
"להתקומם": {
|
||||
"infinitive": "להתקומם",
|
||||
|
|
@ -8527,8 +8487,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to rebel, to revolt"
|
||||
}
|
||||
},
|
||||
"להתפלא": {
|
||||
"infinitive": "להתפלא",
|
||||
|
|
@ -8706,8 +8665,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to wonder, to be surprised"
|
||||
}
|
||||
},
|
||||
"להתקלקל": {
|
||||
"infinitive": "להתקלקל",
|
||||
|
|
@ -8885,8 +8843,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to be damaged, to be spoiled (of food products)"
|
||||
}
|
||||
},
|
||||
"להכניס": {
|
||||
"infinitive": "להכניס",
|
||||
|
|
@ -9213,8 +9170,7 @@
|
|||
"tense": "עָתִיד"
|
||||
}
|
||||
}
|
||||
},
|
||||
"meaning": "to insert, to bring in"
|
||||
}
|
||||
},
|
||||
"להעסיק": {
|
||||
"infinitive": "להעסיק",
|
||||
|
|
@ -9541,8 +9497,7 @@
|
|||
"tense": "עָתִיד"
|
||||
}
|
||||
}
|
||||
},
|
||||
"meaning": "to keep busy; to employ"
|
||||
}
|
||||
},
|
||||
"להחליט": {
|
||||
"infinitive": "להחליט",
|
||||
|
|
@ -9869,8 +9824,7 @@
|
|||
"tense": "עָתִיד"
|
||||
}
|
||||
}
|
||||
},
|
||||
"meaning": "to decide"
|
||||
}
|
||||
},
|
||||
"להבטיח": {
|
||||
"infinitive": "להבטיח",
|
||||
|
|
@ -10197,8 +10151,7 @@
|
|||
"tense": "עָתִיד"
|
||||
}
|
||||
}
|
||||
},
|
||||
"meaning": "to ensure, to promise"
|
||||
}
|
||||
},
|
||||
"להוריד": {
|
||||
"infinitive": "להוריד",
|
||||
|
|
@ -10525,8 +10478,7 @@
|
|||
"tense": "עָתִיד"
|
||||
}
|
||||
}
|
||||
},
|
||||
"meaning": "to lower, to reduce; to download (computing)"
|
||||
}
|
||||
},
|
||||
"להפיל": {
|
||||
"infinitive": "להפיל",
|
||||
|
|
@ -10853,8 +10805,7 @@
|
|||
"tense": "עָתִיד"
|
||||
}
|
||||
}
|
||||
},
|
||||
"meaning": "to drop, to throw down"
|
||||
}
|
||||
},
|
||||
"להקים": {
|
||||
"infinitive": "להקים",
|
||||
|
|
@ -11181,8 +11132,7 @@
|
|||
"tense": "עָתִיד"
|
||||
}
|
||||
}
|
||||
},
|
||||
"meaning": "to build, to found, to establish"
|
||||
}
|
||||
},
|
||||
"להמציא": {
|
||||
"infinitive": "להמציא",
|
||||
|
|
@ -11509,8 +11459,7 @@
|
|||
"tense": "עָתִיד"
|
||||
}
|
||||
}
|
||||
},
|
||||
"meaning": "to invent; to make up; to present"
|
||||
}
|
||||
},
|
||||
"להרשות": {
|
||||
"infinitive": "להרשות",
|
||||
|
|
@ -11837,8 +11786,7 @@
|
|||
"tense": "עָתִיד"
|
||||
}
|
||||
}
|
||||
},
|
||||
"meaning": "to allow, to permit"
|
||||
}
|
||||
},
|
||||
"להקל": {
|
||||
"infinitive": "להקל",
|
||||
|
|
@ -12165,8 +12113,7 @@
|
|||
"tense": "עָתִיד"
|
||||
}
|
||||
}
|
||||
},
|
||||
"meaning": "to ease, to alleviate"
|
||||
}
|
||||
},
|
||||
"לָשִׂים": {
|
||||
"infinitive": "לָשִׂים",
|
||||
|
|
@ -12344,8 +12291,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to put, to put on"
|
||||
}
|
||||
},
|
||||
"בוטל": {
|
||||
"infinitive": "בוטל",
|
||||
|
|
@ -12493,8 +12439,7 @@
|
|||
"pronoun": "הֵן",
|
||||
"tense": "עָתִיד"
|
||||
}
|
||||
},
|
||||
"meaning": "to cancel, to undo"
|
||||
}
|
||||
},
|
||||
"תואם": {
|
||||
"infinitive": "תואם",
|
||||
|
|
@ -12642,8 +12587,7 @@
|
|||
"pronoun": "הֵן",
|
||||
"tense": "עָתִיד"
|
||||
}
|
||||
},
|
||||
"meaning": "to coordinate"
|
||||
}
|
||||
},
|
||||
"קומם": {
|
||||
"infinitive": "קומם",
|
||||
|
|
@ -12791,8 +12735,7 @@
|
|||
"pronoun": "הֵן",
|
||||
"tense": "עָתִיד"
|
||||
}
|
||||
},
|
||||
"meaning": "to outrage, to anger"
|
||||
}
|
||||
},
|
||||
"דוכא": {
|
||||
"infinitive": "דוכא",
|
||||
|
|
@ -12940,8 +12883,7 @@
|
|||
"pronoun": "הֵן",
|
||||
"tense": "עָתִיד"
|
||||
}
|
||||
},
|
||||
"meaning": "to oppress, to crush; to cause depression"
|
||||
}
|
||||
},
|
||||
"זוכה": {
|
||||
"infinitive": "זוכה",
|
||||
|
|
@ -13089,8 +13031,7 @@
|
|||
"pronoun": "הֵן",
|
||||
"tense": "עָתִיד"
|
||||
}
|
||||
},
|
||||
"meaning": "to achieve; to credit"
|
||||
}
|
||||
},
|
||||
"פורסם": {
|
||||
"infinitive": "פורסם",
|
||||
|
|
@ -13238,8 +13179,7 @@
|
|||
"pronoun": "הֵן",
|
||||
"tense": "עָתִיד"
|
||||
}
|
||||
},
|
||||
"meaning": "to advertise, to publish, to publicize"
|
||||
}
|
||||
},
|
||||
"הוגבל": {
|
||||
"infinitive": "הוגבל",
|
||||
|
|
@ -13387,8 +13327,7 @@
|
|||
"pronoun": "הֵן",
|
||||
"tense": "עָתִיד"
|
||||
}
|
||||
},
|
||||
"meaning": "to limit, to restrict, to confine"
|
||||
}
|
||||
},
|
||||
"העבר": {
|
||||
"infinitive": "העבר",
|
||||
|
|
@ -13536,8 +13475,7 @@
|
|||
"pronoun": "הֵן",
|
||||
"tense": "עָתִיד"
|
||||
}
|
||||
},
|
||||
"meaning": "to transfer, to pass something"
|
||||
}
|
||||
},
|
||||
"הוזהר": {
|
||||
"infinitive": "הוזהר",
|
||||
|
|
@ -13685,8 +13623,7 @@
|
|||
"pronoun": "הֵן",
|
||||
"tense": "עָתִיד"
|
||||
}
|
||||
},
|
||||
"meaning": "to warn"
|
||||
}
|
||||
},
|
||||
"הופל": {
|
||||
"infinitive": "הופל",
|
||||
|
|
@ -13834,8 +13771,7 @@
|
|||
"pronoun": "הֵן",
|
||||
"tense": "עָתִיד"
|
||||
}
|
||||
},
|
||||
"meaning": "to drop, to throw down"
|
||||
}
|
||||
},
|
||||
"הוקם": {
|
||||
"infinitive": "הוקם",
|
||||
|
|
@ -13983,8 +13919,7 @@
|
|||
"pronoun": "הֵן",
|
||||
"tense": "עָתִיד"
|
||||
}
|
||||
},
|
||||
"meaning": "to build, to found, to establish"
|
||||
}
|
||||
},
|
||||
"הוחל": {
|
||||
"infinitive": "הוחל",
|
||||
|
|
@ -14132,8 +14067,7 @@
|
|||
"pronoun": "הֵן",
|
||||
"tense": "עָתִיד"
|
||||
}
|
||||
},
|
||||
"meaning": "to apply, to enforce, to put in force"
|
||||
}
|
||||
},
|
||||
"הוקפא": {
|
||||
"infinitive": "הוקפא",
|
||||
|
|
@ -14281,8 +14215,7 @@
|
|||
"pronoun": "הֵן",
|
||||
"tense": "עָתִיד"
|
||||
}
|
||||
},
|
||||
"meaning": "to freeze (something)"
|
||||
}
|
||||
},
|
||||
"הופנה": {
|
||||
"infinitive": "הופנה",
|
||||
|
|
@ -14430,8 +14363,7 @@
|
|||
"pronoun": "הֵן",
|
||||
"tense": "עָתִיד"
|
||||
}
|
||||
},
|
||||
"meaning": "to direct; to refer someone"
|
||||
}
|
||||
},
|
||||
"להתקלח": {
|
||||
"infinitive": "להתקלח",
|
||||
|
|
@ -14609,8 +14541,7 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to take a shower"
|
||||
}
|
||||
},
|
||||
"להתגלות": {
|
||||
"infinitive": "להתגלות",
|
||||
|
|
@ -14788,162 +14719,6 @@
|
|||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to be discovered, to appear"
|
||||
},
|
||||
"להיות": {
|
||||
"infinitive": "להיות",
|
||||
"slug": "454-lihyot",
|
||||
"root": "ה - י - ה",
|
||||
"binyan": "Pa'al",
|
||||
"is_passive": false,
|
||||
"reference_form": "לִהְיוֹת",
|
||||
"forms": {
|
||||
"past_1s": {
|
||||
"form": "הָיִיתִי",
|
||||
"audio_url": "https://audio.pealim.com/v0/bx/bxtedharx4kd.mp3",
|
||||
"pronoun": "אֲנִי",
|
||||
"tense": "עָבָר"
|
||||
},
|
||||
"past_1p": {
|
||||
"form": "הָיִינוּ",
|
||||
"audio_url": "https://audio.pealim.com/v0/bz/bztr7bt7yw8j.mp3",
|
||||
"pronoun": "אֲנַחְנוּ",
|
||||
"tense": "עָבָר"
|
||||
},
|
||||
"past_2ms": {
|
||||
"form": "הָיִיתָ",
|
||||
"audio_url": "https://audio.pealim.com/v0/1i/1imxfddysg8d8.mp3",
|
||||
"pronoun": "אַתָּה",
|
||||
"tense": "עָבָר"
|
||||
},
|
||||
"past_2fs": {
|
||||
"form": "הָיִית",
|
||||
"audio_url": "https://audio.pealim.com/v0/si/sizbwqsi2wej.mp3",
|
||||
"pronoun": "אַתְּ",
|
||||
"tense": "עָבָר"
|
||||
},
|
||||
"past_2mp": {
|
||||
"form": "הֱיִיתֶם",
|
||||
"audio_url": "https://audio.pealim.com/v0/31/31081nk4lvxj.mp3",
|
||||
"pronoun": "אַתֶּם",
|
||||
"tense": "עָבָר"
|
||||
},
|
||||
"past_2fp": {
|
||||
"form": "הֱיִיתֶן",
|
||||
"audio_url": "https://audio.pealim.com/v0/30/30zpav63u9ig.mp3",
|
||||
"pronoun": "אַתֶּן",
|
||||
"tense": "עָבָר"
|
||||
},
|
||||
"past_3ms": {
|
||||
"form": "הָיָה",
|
||||
"audio_url": "https://audio.pealim.com/v0/1h/1hxhgoyxra6fs.mp3",
|
||||
"pronoun": "הוּא",
|
||||
"tense": "עָבָר"
|
||||
},
|
||||
"past_3fs": {
|
||||
"form": "הָיְתָה",
|
||||
"audio_url": "https://audio.pealim.com/v0/17/17fb6fulu2da8.mp3",
|
||||
"pronoun": "הִיא",
|
||||
"tense": "עָבָר"
|
||||
},
|
||||
"past_3p": {
|
||||
"form": "הָיוּ",
|
||||
"audio_url": "https://audio.pealim.com/v0/1h/1hxhgf26s3ou9.mp3",
|
||||
"pronoun": "הֵם / הֵן",
|
||||
"tense": "עָבָר"
|
||||
},
|
||||
"future_1s": {
|
||||
"form": "אֶהְיֶה",
|
||||
"audio_url": "https://audio.pealim.com/v0/at/atd2i0kljhge.mp3",
|
||||
"pronoun": "אֲנִי",
|
||||
"tense": "עָתִיד"
|
||||
},
|
||||
"future_1p": {
|
||||
"form": "נִהְיֶה",
|
||||
"audio_url": "https://audio.pealim.com/v0/2a/2a41xa7h8jei.mp3",
|
||||
"pronoun": "אֲנַחְנוּ",
|
||||
"tense": "עָתִיד"
|
||||
},
|
||||
"future_2ms": {
|
||||
"form": "תִּהְיֶה",
|
||||
"audio_url": "https://audio.pealim.com/v0/g6/g6saa9abkllk.mp3",
|
||||
"pronoun": "אַתָּה",
|
||||
"tense": "עָתִיד"
|
||||
},
|
||||
"future_2fs": {
|
||||
"form": "תִּהְיִי",
|
||||
"audio_url": "https://audio.pealim.com/v0/g6/g6s9q8uugtnx.mp3",
|
||||
"pronoun": "אַתְּ",
|
||||
"tense": "עָתִיד"
|
||||
},
|
||||
"future_2mp": {
|
||||
"form": "תִּהְיוּ",
|
||||
"audio_url": "https://audio.pealim.com/v0/g6/g6sjf854r5a7.mp3",
|
||||
"pronoun": "אַתֶּם",
|
||||
"tense": "עָתִיד"
|
||||
},
|
||||
"future_2fp": {
|
||||
"form": "תִּהְיֶינָה",
|
||||
"audio_url": "https://audio.pealim.com/v0/12/12upso035jy8g.mp3",
|
||||
"pronoun": "אַתֶּן",
|
||||
"tense": "עָתִיד"
|
||||
},
|
||||
"future_3ms": {
|
||||
"form": "יִהְיֶה",
|
||||
"audio_url": "https://audio.pealim.com/v0/yy/yyo97spf6rob.mp3",
|
||||
"pronoun": "הוּא",
|
||||
"tense": "עָתִיד"
|
||||
},
|
||||
"future_3fs": {
|
||||
"form": "תִּהְיֶה",
|
||||
"audio_url": "https://audio.pealim.com/v0/g6/g6saa9abkllk.mp3",
|
||||
"pronoun": "הִיא",
|
||||
"tense": "עָתִיד"
|
||||
},
|
||||
"future_3mp": {
|
||||
"form": "יִהְיוּ",
|
||||
"audio_url": "https://audio.pealim.com/v0/yy/yyo02tum07zo.mp3",
|
||||
"pronoun": "הֵם",
|
||||
"tense": "עָתִיד"
|
||||
},
|
||||
"future_3fp": {
|
||||
"form": "תִּהְיֶינָה",
|
||||
"audio_url": "https://audio.pealim.com/v0/12/12upso035jy8g.mp3",
|
||||
"pronoun": "הֵן",
|
||||
"tense": "עָתִיד"
|
||||
},
|
||||
"imperative_ms": {
|
||||
"form": "הֱיֵה!",
|
||||
"audio_url": "https://audio.pealim.com/v0/1h/1hxjabs7uspli.mp3",
|
||||
"pronoun": "אַתָּה",
|
||||
"tense": "צִוּוּי"
|
||||
},
|
||||
"imperative_fs": {
|
||||
"form": "הֱיִי!",
|
||||
"audio_url": "https://audio.pealim.com/v0/1h/1hxjac2th43as.mp3",
|
||||
"pronoun": "אַתְּ",
|
||||
"tense": "צִוּוּי"
|
||||
},
|
||||
"imperative_mp": {
|
||||
"form": "הֱיוּ!",
|
||||
"audio_url": "https://audio.pealim.com/v0/1h/1hxja0tjuptcu.mp3",
|
||||
"pronoun": "אַתֶּם",
|
||||
"tense": "צִוּוּי"
|
||||
},
|
||||
"imperative_fp": {
|
||||
"form": "הֱיֶינָה!",
|
||||
"audio_url": "https://audio.pealim.com/v0/xe/xef6kg7mexvb.mp3",
|
||||
"pronoun": "אַתֶּן",
|
||||
"tense": "צִוּוּי"
|
||||
},
|
||||
"infinitive": {
|
||||
"form": "לִהְיוֹת",
|
||||
"audio_url": "https://audio.pealim.com/v0/1n/1nej50k4t35xi.mp3",
|
||||
"pronoun": "",
|
||||
"tense": "מְקוֹר"
|
||||
}
|
||||
},
|
||||
"meaning": "to be"
|
||||
}
|
||||
}
|
||||
}
|
||||
File diff suppressed because one or more lines are too long
50000
data/en_50k.txt
50000
data/en_50k.txt
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load diff
9106
data/hebrew_dict.csv
Normal file
9106
data/hebrew_dict.csv
Normal file
File diff suppressed because it is too large
Load diff
12111
data/hebrew_dict_for_anki.csv
Normal file
12111
data/hebrew_dict_for_anki.csv
Normal file
File diff suppressed because it is too large
Load diff
140248
data/ktiv_male_forms.json
140248
data/ktiv_male_forms.json
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
46510
data/noun_plurals.json
46510
data/noun_plurals.json
File diff suppressed because it is too large
Load diff
29598
data/noun_slug_map.json
29598
data/noun_slug_map.json
File diff suppressed because it is too large
Load diff
9106
data/pealim_dict.csv
Normal file
9106
data/pealim_dict.csv
Normal file
File diff suppressed because it is too large
Load diff
12111
data/pealim_dict_for_anki.csv
Normal file
12111
data/pealim_dict_for_anki.csv
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -1,442 +0,0 @@
|
|||
{
|
||||
"שְׁלָל": "abundance; loot, plunder, spoils",
|
||||
"שֶׁפַע": "abundance, plenty, profusion",
|
||||
"נַר": "acquaintance (person one knows)",
|
||||
"הֶכֵּרוּת": "acquaintance (the state of knowing someone)",
|
||||
"כְּתֹבֶת": "address (postal/location)",
|
||||
"מַעַן": "address (formal, for the sake of; destination)",
|
||||
"שׁוּב": "again (once more, to repeat an action)",
|
||||
"שֵׁנִית": "again; a second time, secondly",
|
||||
"כְּנֶגֶד": "against; compared to, as opposed to",
|
||||
"מוּל": "opposite, facing; against",
|
||||
"נֶגֶד": "against; contrary to",
|
||||
"נֶכֶס": "asset, property (financial/material possession)",
|
||||
"קִנְיָן": "asset, property; possession, ownership (abstract or acquired)",
|
||||
"הִתְבּוֹלְלוּת": "assimilation (cultural/ethnic blending in)",
|
||||
"הִטַּמְּעוּת": "assimilation (absorption, integration into surroundings)",
|
||||
"כְּפִיפָה": "basket (woven, traditional/biblical)",
|
||||
"סַל": "basket (general, everyday)",
|
||||
"מַשְׁמִים": "boring, dreary (causing desolation/boredom)",
|
||||
"מְשַׁעְמֵם": "boring, tedious (causing boredom, common usage)",
|
||||
"מַשָּׂא": "burden, load (heavy cargo; figurative weight)",
|
||||
"נֵטֶל": "burden, load; ballast (dead weight)",
|
||||
"טָרוּד": "busy, preoccupied (mentally troubled/distracted)",
|
||||
"עָסוּק": "busy, occupied (engaged in an activity)",
|
||||
"מַמְתָּק": "candy, sweet (generic confection)",
|
||||
"סֻכָּרִיָּה": "candy, sweet (individual wrapped candy piece)",
|
||||
"מַרְבָד": "carpet, rug (literary/poetic); bedspread",
|
||||
"שָׁטִיחַ": "carpet, rug (standard, everyday word)",
|
||||
"כַּרְפַּס": "celery (also: the Passover seder vegetable)",
|
||||
"סֶלֶרִי": "celery (modern loanword, everyday usage)",
|
||||
"שַׁלְשֶׁלֶת": "chain (figurative: chain of events, lineage)",
|
||||
"שַׁרְשֶׁרֶת": "chain (physical chain, links)",
|
||||
"אָפְיָן": "characteristic (trait, attribute of a person/thing)",
|
||||
"סַמְמָן": "characteristic; indicator, hallmark",
|
||||
"שׁוֹקוֹלָד": "chocolate (the substance, mass noun, masc.)",
|
||||
"שׁוֹקוֹלָדָה": "chocolate (a piece of chocolate; hot chocolate, fem.)",
|
||||
"עִגּוּל": "circle (the shape); rounding",
|
||||
"מַעֲגָל": "circle (circular path, cycle, circuit)",
|
||||
"נִקּוּי": "cleaning (the act of cleaning, removing dirt)",
|
||||
"נִקָּיוֹן": "cleanliness, tidiness (state of being clean)",
|
||||
"בִּקּוּעַ": "cleaving, splitting (a single crack or fissure)",
|
||||
"הִתְבַּקְּעוּת": "cleaving, splitting (the process of cracking apart)",
|
||||
"בְּעִילָה": "coitus, sexual intercourse (legal/halachic term)",
|
||||
"מִשְׁגָּל": "coitus, sexual intercourse (formal/literary)",
|
||||
"מִדְרָשָׁה": "college (religious seminary, study institute)",
|
||||
"מִכְלָלָה": "college (academic institution, secular)",
|
||||
"תַּחֲרוּת": "competition, contest (an event or rivalry)",
|
||||
"הִתְחָרוּת": "competition (the act/process of competing)",
|
||||
"לְגַמְרֵי": "completely, totally (colloquial, very common)",
|
||||
"כָּלִיל": "completely, entirely (literary/formal); wholly",
|
||||
"רְכִיב": "component (technical part, element in a system)",
|
||||
"מַרְכִּיב": "component, ingredient (constituent that makes up a whole)",
|
||||
"תַּבְעֵרָה": "conflagration, fire (intense blaze, biblical/literary)",
|
||||
"דְּלֵקָה": "fire (accidental fire, house fire, everyday)",
|
||||
"צַרְכָנוּת": "consumerism; consumer advocacy",
|
||||
"צְרִיכָה": "consumption (using up resources, usage)",
|
||||
"קֵרוּר": "cooling, refrigeration (active process of making cold)",
|
||||
"הִתְקָרְרוּת": "cooling (becoming cold); catching a cold",
|
||||
"חָשׁוּךְ": "dark (of a place, lacking light; figuratively bleak)",
|
||||
"כֵּהֶה": "dark (of a color, shade; dim)",
|
||||
"אֲפֵלָה": "darkness (deep gloom; figurative despair)",
|
||||
"אֹפֶל": "darkness (poetic/literary, deep darkness)",
|
||||
"חֹשֶׁךְ": "darkness (general, common word)",
|
||||
"יַקִּיר": "darling, dear (masculine form)",
|
||||
"יַקִּירָה": "darling, dear (feminine form)",
|
||||
"מִרְמָה": "deceit, fraud (cunning deception, trickery)",
|
||||
"תַּרְמִית": "deceit, fraud (a specific act of swindling)",
|
||||
"אֲבַדּוֹן": "destruction (total ruin, perdition; the abyss)",
|
||||
"הֶרֶס": "destruction, demolition (physical wreckage)",
|
||||
"הֶבְדֵּל": "difference, distinction (between two things)",
|
||||
"שֹׁנִי": "difference (variance, otherness)",
|
||||
"הֵעָלְמוּת": "disappearance (the act of vanishing, going missing)",
|
||||
"הֶעֱלֵם": "disappearance (concealment, suppression of information)",
|
||||
"נְדָבָה": "donation (voluntary, charitable gift; tip)",
|
||||
"תְּרוּמָה": "donation, contribution (formal; also: religious offering)",
|
||||
"הִשְׁתַּעְבְּדוּת": "enslavement (the process of becoming enslaved)",
|
||||
"שִׁעְבּוּד": "enslavement, subjugation; mortgaging (finance)",
|
||||
"טָעוּת": "mistake, error (common, everyday blunder)",
|
||||
"שְׁגִיאָה": "error, mistake (formal, technical error)",
|
||||
"הִתְאַדּוּת": "evaporation (natural process of turning to vapor)",
|
||||
"הִתְאַיְּדוּת": "evaporation (process of dissipating, vaporizing)",
|
||||
"דֻּגְמָה": "example, sample (concrete instance or specimen)",
|
||||
"מָשָׁל": "example; parable, allegory, proverb",
|
||||
"גּוֹלָה": "exile, diaspora (the community in exile)",
|
||||
"גָּלוּת": "exile, diaspora (the state/condition of being exiled)",
|
||||
"חֲוָיָה": "experience (a lived event, an adventure)",
|
||||
"הִתְנַסּוּת": "experience (the process of trying/experimenting)",
|
||||
"נִסָּיוֹן": "experience (accumulated knowledge); attempt, trial",
|
||||
"בֵּאוּר": "explanation, elucidation (detailed clarification)",
|
||||
"הֶסְבֵּר": "explanation (the act of explaining, making understood)",
|
||||
"פָּנִים": "face (standard word); surface",
|
||||
"פַּרְצוּף": "face (appearance, facial expression; colloquial)",
|
||||
"מֶחְדָּל": "failure, omission (negligent failure to act)",
|
||||
"כִּשָּׁלוֹן": "failure (general: failed attempt or endeavor)",
|
||||
"כֶּשֶׁל": "failure, malfunction (technical breakdown)",
|
||||
"תַּעְנִית": "fast (religious fast day, formal term)",
|
||||
"צוֹם": "fast, fasting (the act of fasting, general)",
|
||||
"תְּחוּשָׁה": "feeling, sensation (physical or gut feeling)",
|
||||
"הַרְגָּשָׁה": "feeling (emotional sense; well-being)",
|
||||
"רֶגֶשׁ": "feeling, emotion (inner emotional state)",
|
||||
"לֶהָבָה": "flame (common word for a flame)",
|
||||
"שַׁלְהֶבֶת": "flame (poetic/literary, blazing flame)",
|
||||
"כָּפִיף": "flexible, pliable (can be bent physically)",
|
||||
"מָתִיחַ": "flexible, elastic (stretchy, resilient)",
|
||||
"זֶרֶם": "flow, current (of water, electricity, or ideas)",
|
||||
"זְרִימָה": "flow, flowing (the act/process of flowing)",
|
||||
"אֹכֶל": "food (general, everyday word for food/meal)",
|
||||
"מַאֲכָל": "food (a specific dish, a prepared food item)",
|
||||
"מָזוֹן": "food, nourishment (sustenance, nutrition)",
|
||||
"חֹפֶשׁ": "freedom; vacation, time off (colloquial)",
|
||||
"חֵרוּת": "freedom, liberty (formal, political/ideological)",
|
||||
"הַקְפָּאָה": "freezing (active act of freezing something; a freeze/suspension)",
|
||||
"קִפָּאוֹן": "freezing; standstill, stagnation (frozen state)",
|
||||
"תְּדִירוּת": "frequency (how often something occurs)",
|
||||
"תֶּדֶר": "frequency (radio/physics frequency)",
|
||||
"תָּדִיר": "frequent, regular (happening at steady intervals)",
|
||||
"תָּכוּף": "frequent, rapid (happening in quick succession)",
|
||||
"גָּאוֹן": "genius (title of greatness; rabbinical title Gaon)",
|
||||
"עִלּוּי": "genius, prodigy (exceptionally gifted person)",
|
||||
"תְּשׁוּרָה": "gift, present (formal/literary offering)",
|
||||
"שַׁי": "gift, present (a token gift, small present)",
|
||||
"אַכְלָן": "glutton (big eater, food-lover, common)",
|
||||
"רְעַבְתָּן": "glutton (insatiably hungry person)",
|
||||
"מֶמְשֶׁלֶת": "government (construct state form, used in compounds)",
|
||||
"מֶמְשָׁלָה": "government (standard form)",
|
||||
"מֶמְשַׁלְתִּי": "governmental (relating to the government/cabinet)",
|
||||
"שִׁלְטוֹנִי": "governmental (relating to ruling authority/regime)",
|
||||
"חֹפֶן": "handful (cupped palm, a scooped amount)",
|
||||
"קֹמֶץ": "handful (a pinch, a small quantity)",
|
||||
"יָד": "handle (of a tool, door); hand",
|
||||
"יָדִית": "handle (a knob or grip, specifically a handle)",
|
||||
"כָּאן": "here (standard, common usage)",
|
||||
"פֹּה": "here (colloquial/informal variant)",
|
||||
"טָמוּן": "hidden (buried, latent, lying within)",
|
||||
"נִסְתָּר": "hidden, concealed (secret, mysterious; grammar: 3rd person)",
|
||||
"מֻצְנָע": "hidden, concealed (modestly tucked away, discreet)",
|
||||
"תְּמוּנָה": "image, picture (photo, illustration, scene)",
|
||||
"צֶלֶם": "image (likeness, form); idol",
|
||||
"הִתְרַשְּׁמוּת": "impression (the experience of being impressed)",
|
||||
"רֹשֶׁם": "impression (a mark left; an effect on someone)",
|
||||
"בִּפְנִים": "inside (location: on the inside, indoors)",
|
||||
"פְּנִימָה": "inside (direction: inward, toward the inside)",
|
||||
"עֶלְבּוֹן": "insult, offence (the slight or affront itself)",
|
||||
"הַעֲלָבָה": "insult (the act of insulting someone)",
|
||||
"פְּנִים": "interior, inside (inner part, inner side)",
|
||||
"קֶרֶב": "interior; innards, midst (among, in the thick of)",
|
||||
"תָּוֶךְ": "interior, inside; center, middle; essence",
|
||||
"תַּחְקִיר": "investigation (journalistic/official inquiry)",
|
||||
"חֲקִירָה": "investigation, inquiry (police/legal; research)",
|
||||
"רִנָּה": "joy; joyful song, singing (literary)",
|
||||
"מָשׂוֹשׂ": "joy, delight (source of joy, literary)",
|
||||
"גִּיל": "joy, elation (exuberant happiness; age)",
|
||||
"שִׂמְחָה": "joy, happiness (celebration, festive occasion)",
|
||||
"עֶלְצוֹן": "jubilance, exultation (archaic, the feeling)",
|
||||
"עֶלְצָה": "jubilance, exultation (archaic, feminine noun form)",
|
||||
"עָצֵל": "lazy, idle (basic adjective form)",
|
||||
"עַצְלָן": "lazy, lazybones (characteristically lazy person)",
|
||||
"תְּחִקָּה": "legislation (a specific statute or enacted law)",
|
||||
"חֲקִיקָה": "legislation (the process/act of legislating)",
|
||||
"הִתְהוֹלְלוּת": "licentiousness, revelry (wild raucous behavior)",
|
||||
"הוֹלֵלוּת": "licentiousness, debauchery (moral depravity)",
|
||||
"שׁוֹשָׁן": "lily (the flower, masculine; also: the name Shoshan)",
|
||||
"שׁוֹשַׁנָּה": "lily; rose (archaic); the name Shoshana",
|
||||
"הִמָּצְאוּת": "location; presence (being found/situated somewhere)",
|
||||
"מִקּוּם": "location, positioning (placing in a specific spot)",
|
||||
"נַעֲלֶה": "lofty, exalted (elevated, superior in quality)",
|
||||
"נִשְׂגָּב": "lofty, exalted (sublime, beyond reach, grand)",
|
||||
"תַּאֲוָה": "lust, craving (appetite, physical desire)",
|
||||
"תְּשׁוּקָה": "passion, desire (deep longing, yearning)",
|
||||
"אַחְזָקָה": "maintenance; holding (corporate; upkeep of property)",
|
||||
"תַּחְזוּקָה": "maintenance (technical upkeep of systems/equipment)",
|
||||
"תִּחְזוּק": "maintenance (the process/act of maintaining)",
|
||||
"מִנְהָל": "administration, management (the office/system)",
|
||||
"נִהוּל": "management (the act/process of managing)",
|
||||
"הַנְהָלָה": "management (the managing body, executive board)",
|
||||
"פֵּרוּשׁ": "meaning; interpretation, commentary",
|
||||
"מַשְׁמָעוּת": "meaning, significance (broader importance)",
|
||||
"מַשְׁמָע": "meaning, implication (what is implied)",
|
||||
"לַחַן": "melody, tune (a musical composition)",
|
||||
"נִגּוּן": "melody, tune (a chant; Hasidic wordless melody)",
|
||||
"נְעִימָה": "melody, tune; tone, intonation (of voice)",
|
||||
"נֵס": "miracle (divine intervention; common word)",
|
||||
"פֶּלֶא": "wonder, marvel (something astonishing)",
|
||||
"תְּזוּזָה": "movement (a budge, slight motion, shift)",
|
||||
"תְּנוּעָה": "movement (broad: traffic; organization; vowel mark)",
|
||||
"מִסְתּוֹרִין": "mystery (enigma, something hidden/secret)",
|
||||
"תַּעֲלוּמָה": "mystery (unsolved puzzle, unknown secret)",
|
||||
"עֵירֹם": "naked (completely nude, formal)",
|
||||
"עָרֹם": "naked (nude; also: shrewd, cunning in biblical Hebrew)",
|
||||
"אֻמָּה": "nation (a unified political/cultural entity)",
|
||||
"לְאֹם": "nation, people (ethnic group; literary/formal)",
|
||||
"זִלְזוּל": "negligence; contempt, disrespect (dismissive attitude)",
|
||||
"הִתְרַשְּׁלוּת": "negligence (carelessness, failure to take proper care)",
|
||||
"נֵיטְרָלִי": "neutral (politically/scientifically neutral, loanword)",
|
||||
"סְתָמִי": "neutral; vague, nondescript, generic",
|
||||
"אֲצֻלָּה": "nobility, aristocracy (the aristocratic class)",
|
||||
"אֲצִילוּת": "nobility (the quality of being noble, refinement)",
|
||||
"הִסְתַּכְּלוּת": "observation (looking, watching, contemplation)",
|
||||
"תַּצְפִּית": "observation (military/scientific lookout; observation post)",
|
||||
"מִכְשׁוֹל": "obstacle, stumbling block (impediment to progress)",
|
||||
"נֶגֶף": "obstacle; plague, affliction (biblical)",
|
||||
"עַל": "on, upon; about, regarding",
|
||||
"עַל גַּב": "on, upon (on the back/surface of)",
|
||||
"עַל גַּבֵּי": "on, upon (on top of, on the surface of)",
|
||||
"פְּקֻדָּה": "order, command (military/authoritative directive)",
|
||||
"צַו": "order, decree (legal injunction, official order)",
|
||||
"בָּחוּץ": "outside (location: on the outside, outdoors)",
|
||||
"הַחוּצָה": "outside (direction: outward, to the outside)",
|
||||
"מַאֲרָז": "package (a packed container, packaging)",
|
||||
"חֲבִילָה": "package, parcel (a bundle, a wrapped item)",
|
||||
"מְחִילָה": "pardon, forgiveness (personal, between individuals)",
|
||||
"סְלִיחָה": "pardon, forgiveness (also: excuse me; liturgical pardon)",
|
||||
"סַיֶּרֶת": "patrol (elite military unit, commando squad)",
|
||||
"סִיּוּר": "patrol; tour (a round of inspection or sightseeing)",
|
||||
"שָׂכָר": "payment; salary, wage (earned compensation)",
|
||||
"תַּשְׁלוּם": "payment (a single payment/installment; compensation)",
|
||||
"עֲצוּמָה": "petition (public petition with signatures)",
|
||||
"עֲתִירָה": "petition (legal petition, court appeal)",
|
||||
"דַּלּוּת": "poverty; meagerness, paucity (scarcity of quality/quantity)",
|
||||
"עֹנִי": "poverty (destitution, financial hardship)",
|
||||
"עָצְמָתִי": "powerful (having great inherent power)",
|
||||
"רַב עָצְמָה": "powerful (of great might, formidable)",
|
||||
"הַאֲמָרָה": "price increase (deliberate raising of prices)",
|
||||
"הִתְיַקְּרוּת": "price increase (becoming more expensive, rising costs)",
|
||||
"קִדְמָה": "progress (general/societal advancement, modernity)",
|
||||
"הִתְקַדְּמוּת": "progress (the process of advancing, making headway)",
|
||||
"הַסְבָּרָה": "propaganda; public diplomacy (Israeli hasbara)",
|
||||
"תַּעֲמוּלָה": "propaganda (political propaganda, agitation)",
|
||||
"סְמִיכוּת": "proximity; construct state (grammar term)",
|
||||
"קִרְבָה": "proximity; kinship, closeness (relational nearness)",
|
||||
"תְּהִלּוֹת": "Psalms (variant plural form)",
|
||||
"תְּהִלִּים": "Psalms (standard name for the Book of Psalms)",
|
||||
"קְנִיָּה": "purchase (a buy, an act of buying, everyday)",
|
||||
"רְכִישָׁה": "acquisition (formal purchase, procurement)",
|
||||
"בִּזְרִיזוּת": "quickly, nimbly (with agile efficiency)",
|
||||
"בִּמְהִירוּת": "quickly, at high speed (with velocity)",
|
||||
"רִיצָה": "running (the activity of running)",
|
||||
"מְרוּצָה": "race (a competitive running event)",
|
||||
"גְּאֻלָּה": "redemption (national/messianic deliverance)",
|
||||
"פְּדוּת": "redemption (ransoming, being redeemed; literary)",
|
||||
"הוֹצָאָה": "removal; expense, expenditure; publishing house",
|
||||
"הַסָּחָה": "removal; deflection, diversion, distraction",
|
||||
"יִצּוּג": "representation (acting on behalf of; depiction)",
|
||||
"נְצִיגוּת": "representation (the body of representatives, delegation)",
|
||||
"מְכִירָה": "sale (the act of selling, a transaction)",
|
||||
"מֶכֶר": "sale; merchandise, value (literary/biblical)",
|
||||
"יֶשַׁע": "salvation, deliverance (divine rescue, literary)",
|
||||
"תְּשׁוּעָה": "salvation, victory (triumphant rescue, literary)",
|
||||
"הַפְרָדָה": "separation (active act of separating things/people)",
|
||||
"הִפָּרְדוּת": "separation (the process of parting ways)",
|
||||
"חַד": "sharp (of edges, blades; clear-cut)",
|
||||
"חָרִיף": "sharp, acute; spicy, pungent; keen, witty",
|
||||
"חָסוּת": "shelter, patronage (protection under authority)",
|
||||
"מִקְלָט": "shelter, refuge (bomb shelter, safe haven, physical place)",
|
||||
"חֻלְצָה": "shirt, blouse (modern everyday word)",
|
||||
"כֻּתֹּנֶת": "shirt; tunic, gown (biblical/traditional garment)",
|
||||
"שֶׁקֶט": "silence, quiet (peaceful calm, serenity)",
|
||||
"שְׁתִיקָה": "silence (the act of keeping silent, not speaking)",
|
||||
"חֶטְא": "sin (a specific transgression, missing the mark)",
|
||||
"עָווֹן": "sin, iniquity (moral guilt; legal: misdemeanor)",
|
||||
"זִמְרָה": "singing (musical performance, song/hymn)",
|
||||
"רְנָנָה": "singing; joyful song, jubilant cry (literary)",
|
||||
"נָטוּי": "slanted, inclined (tilted, leaning; grammar: inflected)",
|
||||
"מְשֻׁפָּע": "slanted, inclined; having an abundance of something",
|
||||
"כִּשּׁוּף": "sorcery, witchcraft (dark magic, spellcasting)",
|
||||
"קֶסֶם": "magic, charm (enchantment, allure)",
|
||||
"נֶפֶשׁ": "soul (life force, self, being; appetite)",
|
||||
"נְשָׁמָה": "soul (divine breath of life, spiritual essence)",
|
||||
"מַצָּת": "spark plug (automotive ignition component)",
|
||||
"פְּלָג": "spark plug (variant/slang term)",
|
||||
"דּוֹבֵר": "speaker, spokesman (masculine form)",
|
||||
"דּוֹבֶרֶת": "speaker, spokeswoman (feminine form)",
|
||||
"סוּפָה": "storm, tempest (violent windstorm)",
|
||||
"סְעָרָה": "storm, tempest (raging storm; figurative turmoil)",
|
||||
"קַשׁ": "straw (dry stalks; figuratively: trivial thing)",
|
||||
"תֶּבֶן": "straw, hay (animal feed, dried grass)",
|
||||
"עִקֵּשׁ": "stubborn, obstinate (perversely rigid)",
|
||||
"עַקְשָׁן": "stubborn, obstinate (characteristically persistent/stubborn person)",
|
||||
"חָנִיךְ": "student, pupil (trainee, apprentice, cadet)",
|
||||
"תַּלְמִיד": "student, pupil (school student, common word)",
|
||||
"פִּקּוּחַ": "supervision (regulatory oversight, monitoring)",
|
||||
"הַשְׁגָּחָה": "supervision (watchful care, divine providence; kosher certification)",
|
||||
"הַסְפָּקָה": "supply, provision (the act of supplying goods)",
|
||||
"אַסְפָּקָה": "supply, provision (military/logistical provisioning)",
|
||||
"אֲרָעִי": "temporary, provisional (makeshift, not permanent)",
|
||||
"זְמַנִּי": "temporary, time-limited (for a limited period)",
|
||||
"אֵלֶה": "these (standard demonstrative pronoun)",
|
||||
"אֵלוּ": "these (literary/Mishnaic variant)",
|
||||
"בֹּהֶן": "thumb; big toe (anatomical term)",
|
||||
"אֲגוּדָל": "thumb (common/colloquial word for thumb)",
|
||||
"זְמַן": "time (general, measurable time; tense in grammar)",
|
||||
"עֵת": "time (a specific moment, epoch, literary/biblical)",
|
||||
"עִתּוּי": "timing (choosing the right moment)",
|
||||
"תִּזְמוּן": "timing (synchronization, technical scheduling)",
|
||||
"לְכַתֵּב": "to address (write an address on); to engrave",
|
||||
"לְמַעֵן": "to address (direct/target communication toward)",
|
||||
"לְזַיֵּן": "to arm (equip with weapons; vulgar slang)",
|
||||
"לְחַמֵּשׁ": "to arm (equip/furnish with armaments)",
|
||||
"לְהִתְאַסֵּף": "to assemble, to gather together (of people collecting)",
|
||||
"לְהִתְכַּנֵּס": "to assemble, to convene (a formal meeting/conference)",
|
||||
"לְהִכָּבֵל": "to be bound (chained, shackled with chains)",
|
||||
"לְהִכָּפֵת": "to be bound (handcuffed, tied up physically)",
|
||||
"לְהִבָּרֵא": "to be created (divine/fundamental creation, ex nihilo)",
|
||||
"לְהִוָּצֵר": "to be created (formed, shaped, manufactured)",
|
||||
"לְהִגָּזֵז": "to be cut off (sheared, trimmed, as hair/wool)",
|
||||
"לְהִגָּזֵר": "to be cut off (decreed, sentenced; derived from)",
|
||||
"לְהִקָּטֵעַ": "to be cut off (interrupted, severed abruptly)",
|
||||
"לְהִנָּגֵף": "to be defeated (struck down, plagued; biblical)",
|
||||
"לְהֵרָעֵץ": "to be defeated (crushed, shattered; literary)",
|
||||
"לְהֵהָרֵס": "to be destroyed (demolished, wrecked; slang: exhausted)",
|
||||
"לְהֵחָרֵב": "to be destroyed (laid waste, devastated; of cities/temples)",
|
||||
"לְהִסָּתֵר": "to be hidden; to hide oneself (take cover)",
|
||||
"לְהִצָּפֵן": "to be hidden (encoded, concealed from view)",
|
||||
"לְהִנָּטֵעַ": "to be planted (of trees/plants, set in soil)",
|
||||
"לְהִשָּׁתֵל": "to be planted (implanted, transplanted; of an organ or undercover agent)",
|
||||
"לָדֹם": "to be silent (to become utterly still; literary)",
|
||||
"לִשְׁתֹּק": "to be silent (to stop talking, keep quiet; common)",
|
||||
"לְהִתְקַמֵּץ": "to be stingy (to pinch pennies, scrimp)",
|
||||
"לְהִתְקַמְצֵן": "to be stingy (to act like a miser, be miserly)",
|
||||
"לְהִבָּדֵק": "to be tested, checked (verified, inspected)",
|
||||
"לְהִבָּחֵן": "to be tested, examined (undergo a formal exam/evaluation)",
|
||||
"נִהְיָה": "to become (turn into, come to be; common)",
|
||||
"לְהֵעָשׂוֹת": "to become; to be made, to be done, to be carried out",
|
||||
"לְהִתְבַּהֵר": "to become clear (clarified, understood)",
|
||||
"לְהִצְטַלֵּל": "to become clear (of liquid becoming transparent/limpid)",
|
||||
"לְכוֹפֵף": "to bend (flex, bow down, curve something)",
|
||||
"לְקַמֵּר": "to bend, to vault (arch over, create a dome shape)",
|
||||
"לְקַשֵּׁת": "to bend, to curve (form into a bow/arc shape)",
|
||||
"לְפַחֵם": "to blacken (carbonize, char with coal/charcoal)",
|
||||
"לְפַיֵּחַ": "to blacken (cover with soot, smoke residue)",
|
||||
"לְמַצְמֵץ": "to blink (rapidly open and close one's eyes)",
|
||||
"לְעַפְעֵף": "to blink (flutter one's eyelids)",
|
||||
"לִנְפֹּחַ": "to blow (puff up, inflate; blow air)",
|
||||
"לִנְשֹׁף": "to blow, to exhale; to play a wind instrument",
|
||||
"לְצַיֵּץ": "to chirp, to tweet (of birds; to post on social media)",
|
||||
"לְצַפְצֵף": "to chirp, to whistle (shrill piping sound; to not care — slang)",
|
||||
"לְחַבֵּר": "to connect, to join (attach together; to compose/write)",
|
||||
"לְקַשֵּׁר": "to connect, to link (establish a relationship/connection)",
|
||||
"לְהָסִיחַ": "to converse (engage in casual talk; to divert attention)",
|
||||
"לְהָשִׂיחַ": "to converse, to talk (literary; to speak with)",
|
||||
"לְסַלְסֵל": "to curl (hair); to trill (music)",
|
||||
"לְתַלְתֵּל": "to curl (hair into ringlets/curls)",
|
||||
"לְיַפּוֹת": "to beautify, to embellish (make more attractive)",
|
||||
"לְפַרְכֵּס": "to embellish; to squirm, to flounder",
|
||||
"לִדְרֹשׁ": "to demand; to inquire, to preach (seek/expound)",
|
||||
"לִתְבֹּעַ": "to demand; to sue, to claim (legal demand)",
|
||||
"לְהֵישִׁיר": "to direct; to straighten, to look straight at",
|
||||
"לְהַפְנוֹת": "to direct; to refer someone (redirect attention/person)",
|
||||
"לְהַגְזִים": "to exaggerate (overstate, blow out of proportion; common)",
|
||||
"לְהַפְרִיז": "to exaggerate (go to extremes, overdo; formal)",
|
||||
"לְהִמּוֹג": "to fade, to dissolve (melt away, lose form; literary)",
|
||||
"לְהִנָּדֵף": "to fade, to dissipate (blown away, scattered by wind)",
|
||||
"לִפֹּל": "to fall (general: fall down, collapse; common word)",
|
||||
"לִנְשֹׁר": "to fall, to drop (shed: leaves, hair; drop out of school)",
|
||||
"לְכַלּוֹת": "to finish (consume entirely, exhaust; to annihilate)",
|
||||
"לְסַיֵּם": "to finish, to complete (conclude, bring to an end; common)",
|
||||
"לִנְהֹר": "to flow (stream toward); to shine, to glow",
|
||||
"לִשְׁתֹּת": "to flow (pour forth, stream out; literary)",
|
||||
"לִמְחֹל": "to forgive (pardon on a personal level, waive a claim)",
|
||||
"לִסְלֹחַ": "to forgive, to pardon (general, standard word for forgiving)",
|
||||
"לְהַחְבִּיא": "to hide, to conceal (physically stash away; common)",
|
||||
"לְהַעֲלִים": "to hide, to conceal (suppress information; to evade)",
|
||||
"לִדְלֹף": "to leak (of a pipe, roof; seep through)",
|
||||
"לִנְזֹל": "to drip, to trickle (flow in drops, ooze)",
|
||||
"לִזְנֹחַ": "to abandon, to neglect (forsake, discard)",
|
||||
"לַעֲזֹב": "to leave, to abandon (depart from; give up; common word)",
|
||||
"לְהַנִּיחַ": "to place, to put (set down carefully); to assume",
|
||||
"לְהָשִׂים": "to place, to put (set/assign); to turn into something",
|
||||
"לְפָאֵר": "to glorify, to adorn (extol with grandeur)",
|
||||
"לְשַׁבֵּחַ": "to praise, to commend (express approval; common)",
|
||||
"לִדְחֹף": "to push, to shove (physically push forward; common)",
|
||||
"לִדְחֹק": "to push, to press (squeeze, crowd; urge insistently)",
|
||||
"לְהַבְרִיא": "to recover (regain health, get well; common)",
|
||||
"לְהַחְלִים": "to recover, to convalesce (heal fully from illness; formal)",
|
||||
"לַעֲלֹץ": "to rejoice, to exult (leap with joy; literary)",
|
||||
"לָשׂוּשׂ": "to rejoice (be glad, delight in; biblical/literary)",
|
||||
"לְהוֹשִׁיעַ": "to rescue, to save (deliver from danger; biblical/literary)",
|
||||
"לְהַצִּיל": "to rescue, to save (common, everyday word)",
|
||||
"לְחַכֵּךְ": "to rub (scratch an itch, abrade gently)",
|
||||
"לְשַׁפְשֵׁף": "to rub (scrub, polish by rubbing repeatedly)",
|
||||
"לִסְרֹט": "to scratch (scrape with a sharp object; to make a video/film)",
|
||||
"לִשְׂרֹט": "to scratch (draw a line, score a surface)",
|
||||
"לִנְגֹּהַּ": "to shine (glow with bright light; literary)",
|
||||
"לִקְרֹן": "to shine, to beam (radiate light, as from horns of light)",
|
||||
"לְהַחֲרִישׁ": "to silence; to be silent (choose not to respond; literary)",
|
||||
"לְהַשְׁתִּיק": "to silence (make someone/something stop making noise; common)",
|
||||
"לִטְבֹּחַ": "to slaughter (massacre, butcher violently)",
|
||||
"לִשְׁחֹט": "to slaughter (ritually slaughter an animal; shecht)",
|
||||
"לְהִתְמַחוֹת": "to specialize (become an expert in a field)",
|
||||
"לְהִתְמַקְצֵעַ": "to specialize (become a professional, gain proficiency)",
|
||||
"לְבַקֵּעַ": "to split, to cleave (crack open forcefully)",
|
||||
"לְבַתֵּק": "to split, to cleave; to pierce (cut through)",
|
||||
"לִמְרֹחַ": "to spread (smear, apply a spread on surface)",
|
||||
"לִשְׁטֹחַ": "to spread (lay out flat, unfurl); to present, explicate",
|
||||
"לְאַשֵּׁשׁ": "to strengthen, to establish (shore up, substantiate)",
|
||||
"לְחַזֵּק": "to strengthen (make stronger, reinforce; common word)",
|
||||
"לְהִתְיַסֵּר": "to suffer (be tormented, endure agony)",
|
||||
"לְהִתְעַנּוֹת": "to suffer; to fast (endure hardship/deprivation; literary)",
|
||||
"לִידוֹת": "to throw, to hurl (cast, fling; biblical)",
|
||||
"לִרְמוֹת": "to throw, to hurl (toss; biblical)",
|
||||
"לִגְזֹז": "to trim (shear wool/hair, clip close)",
|
||||
"לִגְזֹם": "to trim (prune branches/bushes, cut back vegetation)",
|
||||
"לְאַדּוֹת": "to vaporize (steam, evaporate); to simmer, to poach (cooking)",
|
||||
"לְאַיֵּד": "to vaporize, to evaporate (cause to turn into vapor)",
|
||||
"לֶאֱרֹג": "to weave (on a loom, produce fabric; common word)",
|
||||
"לִשְׁזֹר": "to weave (intertwine, braid, thread together)",
|
||||
"בְּיַחַד": "together (as a group, common usage with 'be-')",
|
||||
"יַחַד": "together (jointly, in unison; literary)",
|
||||
"יַחְדָּו": "together (jointly; biblical/poetic variant)",
|
||||
"מִסְחָר": "trade, commerce (the business/sector of trading)",
|
||||
"סַחַר": "trade, commerce (goods traded, merchandise; literary)",
|
||||
"אֱמֶת": "truth (common word for truth, verity)",
|
||||
"אֲמִתָּה": "truth; axiom (fundamental truth, literary)",
|
||||
"מִצְנֶפֶת": "turban (formal headdress, priestly turban)",
|
||||
"צָנִיף": "turban, head wrap (wrapped head covering)",
|
||||
"אַחְדוּת": "unity (state of being united, solidarity)",
|
||||
"אִחוּד": "unification (the act of uniting, merging)",
|
||||
"בִּקְעָה": "valley (broad, flat valley plain)",
|
||||
"עֵמֶק": "valley (deep valley between mountains/hills)",
|
||||
"אִשְׁרָה": "visa; approval (entry permit; formal approval)",
|
||||
"וִיזָה": "visa (travel visa, loanword)",
|
||||
"כֹּתֶל": "wall (the Western Wall; a freestanding stone wall)",
|
||||
"קִיר": "wall (common word for wall of a room/building)",
|
||||
"אַזְהָרָה": "warning (a caution, alert; legal/safety warning)",
|
||||
"הַזְהָרָה": "warning (the act of warning someone; admonition)",
|
||||
"רַהַט": "water trough (channel, gutter for water flow)",
|
||||
"שֹׁקֶת": "water trough (feeding/drinking trough for animals)",
|
||||
"אִלּוּלֵי": "were it not for (standard conditional; common)",
|
||||
"לוּלֵא": "were it not for (literary/Talmudic variant)",
|
||||
"אוֹפַןּ": "wheel (a single wheel; biblical/poetic)",
|
||||
"גַּלְגַּל": "wheel (rolling wheel; cycle, pulley)",
|
||||
"אַיֵּה": "where? (literary/biblical: where is?)",
|
||||
"הֵיכָן": "where? (standard literary form of 'where')",
|
||||
"לֹבֶן": "whiteness (white of the eye; white color)",
|
||||
"צְחוֹר": "whiteness; purity (brilliant white, radiance)",
|
||||
"עוֹלָם": "world (the world, universe; eternity; common word)",
|
||||
"תֵּבֵל": "world, universe (the inhabited world; poetic/literary)",
|
||||
"פֶּצַע": "wound (a specific cut, gash, open wound)",
|
||||
"פְּצִיעָה": "wound, injury (the event/act of being wounded)",
|
||||
"כִּסּוּפִים": "yearning, longing (wistful craving, literary; plural)",
|
||||
"עֶרְגָּה": "yearning, longing (deep nostalgic longing, literary)"
|
||||
}
|
||||
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
2297586
data/words.json
2297586
data/words.json
File diff suppressed because it is too large
Load diff
|
|
@ -1,150 +0,0 @@
|
|||
# Adaptive Sentence Difficulty Cloze — v0.20 Design Spec
|
||||
|
||||
**Date:** 2026-03-15
|
||||
**Status:** Approved
|
||||
**Release:** v0.20
|
||||
|
||||
## Problem
|
||||
|
||||
Cloze cards currently select example sentences by length alone: any sentence of 6–12 words is treated as equally ideal, and sentences outside that range are penalized by their distance from 9 words. This ignores whether the surrounding context words are familiar to the learner. A sentence full of rare words is harder than one with common words, regardless of length.
|
||||
|
||||
## Solution
|
||||
|
||||
Replace the length-based `_score()` function in `epub_examples.py` with a **frequency-based difficulty score**. The easiest sentence (most common context words) becomes the cloze. All vetted sentences remain on the card, ordered easy→hard.
|
||||
|
||||
## Scoring Pipeline
|
||||
|
||||
### Token Frequency Lookup (5-tier)
|
||||
|
||||
Given a nikkud sentence token, resolve its frequency rank:
|
||||
|
||||
1. **Known mapping** — look up token in the nikkud→ktiv_male map built from words.json headwords, conjugations, and inflections (94k mappings). If found, look up the ktiv_male in the frequency data.
|
||||
2. **Nikkud prefix stripping** — use `_try_strip_prefix()` to strip validated Hebrew prefixes (בהוכלמש), then resolve the remainder via the known mapping.
|
||||
3. **Academy rules converter** — apply `nikkud_to_ktiv_male.convert()` (91.6% accuracy) to produce ktiv_male, look up in frequency data.
|
||||
4. **strip_nikkud fallback** — use `helpers.strip_nikkud()` as a lossy fallback.
|
||||
5. **Ktiv_male prefix stripping** — strip 1-2 character Hebrew prefixes from the converted/stripped form and look up the stem.
|
||||
|
||||
Tokens not found in any tier are assigned a default high rank (50,000).
|
||||
|
||||
**Coverage:** ~93% of example sentence tokens resolve to a frequency rank (measured empirically on 7,588 sentences).
|
||||
|
||||
**Frequency data source:** Use `frequency_lookup.py` which auto-selects `frequency_clean.json` when available, falling back to `frequency_cache.json`.
|
||||
|
||||
### Sentence Difficulty Score
|
||||
|
||||
For a given word's candidate sentence:
|
||||
|
||||
1. Tokenize: split on whitespace, strip punctuation (.,!?;:"'"״׳–—()[]{}), split on maqaf (־).
|
||||
2. Exclude the target word's token using `cloze_word_start`/`cloze_word_end` offsets from the matched sentence.
|
||||
3. For each remaining token (length >= 2), resolve its frequency rank via the 5-tier pipeline.
|
||||
4. **Score = median frequency rank of context tokens.**
|
||||
|
||||
Lower score = easier (context words are more common). Median resists outliers (one rare proper noun shouldn't dominate).
|
||||
|
||||
### Integration Point
|
||||
|
||||
The scoring integrates into `epub_examples.py`'s existing `_score()` closure inside `update_words_json()` (line ~677). Currently:
|
||||
|
||||
```python
|
||||
def _score(s: dict) -> tuple[int,]:
|
||||
wc = s["word_count"]
|
||||
length_score = abs(wc - 9) if not (6 <= wc <= 12) else 0
|
||||
return (length_score,)
|
||||
```
|
||||
|
||||
New scoring replaces length with frequency-based difficulty. The `_score` function gains access to the frequency pipeline via closure over the nikkud_map, nikkud_index, and freq_data built once at the start of `update_words_json()`.
|
||||
|
||||
**Minimum sentence length:** Reduced from 4 words to 3 words (`MIN_WORDS = 3` in epub_examples.py). Hebrew is more concise than English — 3-word sentences are valid and common. This expands the candidate pool for cloze selection.
|
||||
|
||||
**Behavioral change:** Because `pool.sort(key=_score)` determines which 3 sentences are selected as `best = pool[:3]`, changing the scoring function changes **which sentences are selected**, not just their order. This is intentional — we want the easiest sentences as cloze candidates, not the closest-to-9-words ones. Existing cloze GUIDs will be preserved when the same sentence text is re-selected; entries where a different sentence wins will get new GUIDs.
|
||||
|
||||
## Data Model Changes
|
||||
|
||||
### words.json
|
||||
|
||||
The `examples.cloze` dict (single sentence) gains an optional `difficulty_score` field:
|
||||
|
||||
```json
|
||||
{
|
||||
"examples": {
|
||||
"vetted": [
|
||||
{"text": "...", "source": "...", "match_method": "..."},
|
||||
{"text": "...", "source": "...", "match_method": "..."}
|
||||
],
|
||||
"cloze": {
|
||||
"text": "...",
|
||||
"cloze_word_start": 5,
|
||||
"cloze_word_end": 10,
|
||||
"cloze_hint": null,
|
||||
"cloze_guid": "abc123",
|
||||
"difficulty_score": 234
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The vetted list is also sorted by difficulty (easiest first), so the card back shows sentences in pedagogically useful order.
|
||||
|
||||
### SCHEMA.yaml
|
||||
|
||||
Add `difficulty_score` as optional integer field under `examples.cloze`.
|
||||
|
||||
## Implementation Scope
|
||||
|
||||
### New file: `sentence_difficulty.py`
|
||||
|
||||
Standalone module for sentence scoring. No pipeline step — called by `epub_examples.py`.
|
||||
|
||||
- `score_sentence(sentence_text: str, target_start: int, target_end: int, nikkud_map: dict, nikkud_index: dict, freq_data: dict) -> int` — returns median context frequency rank. Uses `target_start`/`target_end` character offsets to exclude the cloze target token.
|
||||
- `build_nikkud_map(words: dict) -> dict[str, str]` — builds nikkud→ktiv_male lookup from words.json (headwords + conjugation forms + noun inflections). Returns `{nikkud_form: ktiv_male_form}`. Implementation note: should share iteration logic with `epub_examples._build_nikkud_index()` or derive from its output to avoid duplicating the traversal of words.json forms.
|
||||
- `_resolve_token_frequency(token: str, nikkud_map: dict, nikkud_index: dict, freq_data: dict) -> int` — the 5-tier lookup. Uses `_try_strip_prefix` from epub_examples (made importable by removing underscore or adding a public wrapper).
|
||||
|
||||
### Modified files
|
||||
|
||||
- **`epub_examples.py`**:
|
||||
- Import `sentence_difficulty.score_sentence` and `sentence_difficulty.build_nikkud_map`
|
||||
- In `update_words_json()`: build nikkud_map and load freq_data once at start (before per-word loop)
|
||||
- Replace `_score()` closure with frequency-based scoring that calls `score_sentence()`
|
||||
- Sort vetted list by difficulty score (easiest first)
|
||||
- Store `difficulty_score` in the cloze dict
|
||||
- Make `_try_strip_prefix` importable (rename to `try_strip_prefix` or add public alias)
|
||||
- **`frequency_lookup.py`** — add `get_freq_data() -> dict` public accessor to expose the loaded frequency dict (avoids accessing private `_freq` directly)
|
||||
- **`SCHEMA.yaml`** — add `difficulty_score` field
|
||||
- **`run.py`** — no changes; scoring happens inside epub_examples step
|
||||
|
||||
### Not modified
|
||||
|
||||
- **`apkg_builder.py`** — reads cloze as-is; vetted order is already respected
|
||||
- **`nikkud_to_ktiv_male.py`** — used as-is
|
||||
- **Card templates** — no changes needed
|
||||
|
||||
## Dependencies
|
||||
|
||||
- `nikkud_to_ktiv_male.convert()` — Academy rules converter (already written)
|
||||
- `epub_examples._try_strip_prefix()` / `_build_nikkud_index()` — nikkud prefix stripping and index
|
||||
- `frequency_lookup.py` — loads frequency data (auto-selects clean vs cache)
|
||||
- `helpers.strip_nikkud()` — fallback converter
|
||||
|
||||
## Validation
|
||||
|
||||
- **Unit tests** for `score_sentence()` with known easy/hard sentences
|
||||
- **Unit tests** for `_resolve_token_frequency()` covering all 5 tiers
|
||||
- **Integration test**: verify cloze selection picks easiest sentence, vetted list is sorted
|
||||
- **Spot check**: manually review 10 words with 3+ sentences to confirm ordering
|
||||
- **Regression**: existing tests pass, GUID coverage unchanged, deck validates
|
||||
|
||||
## Constraints
|
||||
|
||||
- `examples.cloze` remains a single dict (not converted to list)
|
||||
- No new Anki card types or fields
|
||||
- No runtime JS in Anki cards
|
||||
- No network calls during scoring
|
||||
- `difficulty_score` is informational metadata; card rendering doesn't depend on it
|
||||
- Existing cloze GUIDs preserved when the same sentence is re-selected
|
||||
|
||||
## Scope Exclusions (Future Work)
|
||||
|
||||
- **Pronominal suffix stripping** — would reduce the ~7% unscored-token rate; deferred (tracked in PROJECT_NOTES.md)
|
||||
- **Kamatz katan disambiguation** — requires morphological analysis; accepted limitation
|
||||
- **Per-learner adaptive difficulty** — requires Anki plugin; out of scope for static deck
|
||||
- **Multiple cloze sentences per card** — would require schema migration to list; deferred
|
||||
913
epub_examples.py
913
epub_examples.py
|
|
@ -1,913 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract example sentences from nikud'd Hebrew EPUB files, match them against
|
||||
the vocabulary list in data/words.json, and write matched examples back into
|
||||
words.json.
|
||||
|
||||
Usage (standalone):
|
||||
python3 epub_examples.py
|
||||
|
||||
Called from run.py via:
|
||||
run(words) — words dict is passed in and updated in place
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import zipfile
|
||||
from html.parser import HTMLParser
|
||||
from pathlib import Path
|
||||
|
||||
import frequency_lookup
|
||||
from helpers import strip_nikkud
|
||||
from sentence_difficulty import build_nikkud_map, score_sentence
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DATA_DIR = Path(__file__).parent / "data"
|
||||
EPUB_DIR = DATA_DIR / "epubs"
|
||||
WORDS_JSON = DATA_DIR / "words.json"
|
||||
|
||||
|
||||
# Book discovery: source file path -> display name
|
||||
def _discover_epubs() -> dict[str, str]:
    """Map every book file found in EPUB_DIR to a short display name.

    ``.epub`` files get a name derived from their filename (known titles are
    mapped to fixed labels, anything else is the nikkud-stripped, lower-cased
    stem cut to 40 characters). ``.txt`` files simply use their stem.

    Returns:
        Dict of ``{file path as str: display name}``; EPUB entries come first,
        then plain-text entries, each group in sorted path order. Empty when
        EPUB_DIR does not exist.
    """

    def _label_for(stem: str) -> str:
        # Both the whole stem and the part before " -- " are inspected,
        # because Hebrew titles and ASCII slugs live in different places.
        bare = strip_nikkud(stem).lower()
        title = strip_nikkud(stem.split(" -- ")[0]).strip().lower()
        if "alice" in bare or "אליס" in title:
            return "alice_wonderland"
        if "little_prince" in bare or "נסיך" in title:
            return "little_prince"
        if "מנהרת" in title or "time_tunnel" in bare:
            digits = re.search(r"(\d+)", bare)
            suffix = digits.group(1) if digits else bare.replace("time_tunnel_", "")
            return f"time_tunnel_{suffix}"
        return bare[:40]

    if not EPUB_DIR.exists():
        return {}

    discovered: dict[str, str] = {
        str(epub): _label_for(epub.stem) for epub in sorted(EPUB_DIR.glob("*.epub"))
    }
    # Plain-text sources (e.g. Ben Yehuda downloads) are listed after the EPUBs.
    discovered.update((str(txt), txt.stem) for txt in sorted(EPUB_DIR.glob("*.txt")))
    return discovered
|
||||
|
||||
|
||||
# Sentence length bounds (word count)
|
||||
MIN_WORDS = 3
|
||||
MAX_WORDS = 15
|
||||
|
||||
|
||||
# ── HTML text extraction ─────────────────────────────────────────
|
||||
|
||||
|
||||
class _TextExtractor(HTMLParser):
|
||||
"""Extract text content from HTML, skipping script/style tags."""
|
||||
|
||||
SKIP_TAGS = {"script", "style", "head"}
|
||||
|
||||
def __init__(self):
|
||||
super().__init__()
|
||||
self.parts: list[str] = []
|
||||
self._skip_depth = 0
|
||||
|
||||
def handle_starttag(self, tag, attrs):
|
||||
_ = attrs # required by HTMLParser interface
|
||||
if tag in self.SKIP_TAGS:
|
||||
self._skip_depth += 1
|
||||
# Insert newline for block-level elements to avoid word concatenation
|
||||
if tag in (
|
||||
"p",
|
||||
"div",
|
||||
"br",
|
||||
"li",
|
||||
"h1",
|
||||
"h2",
|
||||
"h3",
|
||||
"h4",
|
||||
"h5",
|
||||
"h6",
|
||||
"td",
|
||||
"th",
|
||||
"tr",
|
||||
"blockquote",
|
||||
"section",
|
||||
):
|
||||
self.parts.append("\n")
|
||||
|
||||
def handle_endtag(self, tag):
|
||||
if tag in self.SKIP_TAGS:
|
||||
self._skip_depth = max(0, self._skip_depth - 1)
|
||||
|
||||
def handle_data(self, data):
|
||||
if self._skip_depth == 0:
|
||||
self.parts.append(data)
|
||||
|
||||
def get_text(self) -> str:
|
||||
return "".join(self.parts)
|
||||
|
||||
|
||||
def extract_text_from_html(html: str) -> str:
    """Return the plain text of an HTML string.

    Args:
        html: Markup to parse.

    Returns:
        The text gathered by ``_TextExtractor`` after feeding it ``html``.
    """
    extractor = _TextExtractor()
    extractor.feed(html)
    plain_text = extractor.get_text()
    return plain_text
|
||||
|
||||
|
||||
# ── EPUB processing ──────────────────────────────────────────────
|
||||
|
||||
|
||||
def _content_files_from_epub(zf: zipfile.ZipFile) -> list[str]:
|
||||
"""Get ordered list of content XHTML files from the OPF manifest."""
|
||||
opf_path = None
|
||||
for name in zf.namelist():
|
||||
if name.endswith(".opf"):
|
||||
opf_path = name
|
||||
break
|
||||
if not opf_path:
|
||||
# Fallback: just use all xhtml files
|
||||
return sorted(
|
||||
n
|
||||
for n in zf.namelist()
|
||||
if n.endswith((".xhtml", ".html"))
|
||||
and "toc" not in n.lower()
|
||||
and "cover" not in n.lower()
|
||||
and "nav" not in n.lower()
|
||||
)
|
||||
|
||||
# Parse OPF to get spine order
|
||||
opf_content = zf.read(opf_path).decode("utf-8")
|
||||
opf_dir = os.path.dirname(opf_path)
|
||||
|
||||
# Extract manifest items: id -> href
|
||||
manifest: dict[str, str] = {}
|
||||
for m in re.finditer(r'<item\s+[^>]*id="([^"]+)"[^>]*href="([^"]+)"', opf_content):
|
||||
manifest[m.group(1)] = m.group(2)
|
||||
# Also try reversed attribute order
|
||||
for m in re.finditer(r'<item\s+[^>]*href="([^"]+)"[^>]*id="([^"]+)"', opf_content):
|
||||
manifest[m.group(2)] = m.group(1)
|
||||
|
||||
# Extract spine order
|
||||
spine_ids = re.findall(r'<itemref\s+[^>]*idref="([^"]+)"', opf_content)
|
||||
|
||||
result = []
|
||||
for sid in spine_ids:
|
||||
href = manifest.get(sid, "")
|
||||
if href and href.endswith((".xhtml", ".html")):
|
||||
full_path = os.path.join(opf_dir, href) if opf_dir else href
|
||||
# Normalize path separators
|
||||
full_path = full_path.replace("\\", "/")
|
||||
if full_path in zf.namelist():
|
||||
result.append(full_path)
|
||||
|
||||
if not result:
|
||||
# Fallback
|
||||
return sorted(
|
||||
n
|
||||
for n in zf.namelist()
|
||||
if n.endswith((".xhtml", ".html")) and "toc" not in n.lower() and "cover" not in n.lower()
|
||||
)
|
||||
return result
|
||||
|
||||
|
||||
def extract_sentences_from_epub(epub_path: Path, book_name: str) -> list[dict]:
    """Extract sentences from an EPUB file.

    Content documents are read in the order given by
    ``_content_files_from_epub``; members that are missing or not valid
    UTF-8 are skipped rather than aborting the whole book.

    Args:
        epub_path: Path to the .epub file.
        book_name: Human-readable book name used as the ``source`` field.

    Returns:
        List of ``{"text": str, "source": str}`` dicts.
    """
    chapter_texts: list[str] = []
    # The context manager closes the archive handle even if parsing raises.
    with zipfile.ZipFile(epub_path) as zf:
        for member in _content_files_from_epub(zf):
            try:
                html = zf.read(member).decode("utf-8")
            except (KeyError, UnicodeDecodeError):
                continue
            chapter_texts.append(extract_text_from_html(html))

    return _split_into_sentences("\n".join(chapter_texts), book_name)
|
||||
|
||||
|
||||
def extract_sentences_from_text(text_path: Path, book_name: str) -> list[dict]:
    """Extract sentences from a UTF-8 plain-text file (e.g. Ben Yehuda downloads).

    Args:
        text_path: Path to the .txt file.
        book_name: Human-readable book name used as the ``source`` field.

    Returns:
        List of ``{"text": str, "source": str}`` dicts.
    """
    return _split_into_sentences(text_path.read_text(encoding="utf-8"), book_name)
|
||||
|
||||
|
||||
# ── Sentence splitting ───────────────────────────────────────────
|
||||
|
||||
# Hebrew sentence terminators: period, exclamation, question mark, sof pasuk
|
||||
_SENT_SPLIT = re.compile(r"[.!?\u05C3]+")
|
||||
|
||||
# Punctuation to strip from word boundaries when matching
|
||||
_PUNCT = re.compile(
|
||||
r'^[\u0022\u0027\u05F4\u05F3,;:\-\u2013\u2014\u2026\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+|'
|
||||
r'[\u0022\u0027\u05F4\u05F3,;:\-\u2013\u2014\u2026\u201C\u201D\u201E\u201F\u2018\u2019()\[\]{}«»"\']+$'
|
||||
)
|
||||
|
||||
|
||||
def _split_into_sentences(text: str, book_name: str) -> list[dict]:
    """Cut text into sentences and keep those within the word-count bounds.

    Whitespace runs are collapsed to single spaces before the text is split
    on ``_SENT_SPLIT``. Only tokens containing at least one character in the
    Hebrew block (U+0590–U+05FF) count towards the MIN_WORDS/MAX_WORDS limits.

    Args:
        text: Raw extracted text.
        book_name: Source label for each sentence dict.

    Returns:
        List of ``{"text": str, "source": str}`` dicts in first-seen order,
        deduplicated by exact text.
    """
    collapsed = re.sub(r"\s+", " ", text).strip()

    # Insertion-ordered dict doubles as the "already seen" set.
    kept: dict[str, dict] = {}
    for fragment in _SENT_SPLIT.split(collapsed):
        candidate = fragment.strip()
        if not candidate or candidate in kept:
            continue

        hebrew_count = sum(
            1 for token in candidate.split() if any("\u0590" <= ch <= "\u05ff" for ch in token)
        )
        if MIN_WORDS <= hebrew_count <= MAX_WORDS:
            kept[candidate] = {"text": candidate, "source": book_name}

    return list(kept.values())
|
||||
|
||||
|
||||
# ── Nikkud index ─────────────────────────────────────────────────
|
||||
|
||||
# Unicode ranges for Hebrew combining marks
|
||||
_NIKKUD_LOW = 0x05B0 # start of vowel points (shva)
|
||||
_NIKKUD_HIGH = 0x05BD # end of vowel range (meteg); 0x05BE is maqaf (punctuation)
|
||||
_DAGESH = "\u05bc"
|
||||
_SHIN_DOT = "\u05c1"
|
||||
_SIN_DOT = "\u05c2"
|
||||
|
||||
# Valid prefix consonants
|
||||
_PREFIX_CONSONANTS = set("בהוכלמש")
|
||||
|
||||
# Named vowel combining marks
|
||||
_SHVA = "\u05b0"
|
||||
_HIRIQ = "\u05b4"
|
||||
_TSERE = "\u05b5"
|
||||
_SEGOL = "\u05b6"
|
||||
_PATACH = "\u05b7"
|
||||
_QAMATZ = "\u05b8"
|
||||
|
||||
# Valid nikkud patterns on each prefix consonant.
|
||||
# Key = consonant, Value = set of frozensets of combining marks valid for that prefix.
|
||||
_VALID_PREFIX_MARKS: dict[str, set[frozenset]] = {
|
||||
"ב": {
|
||||
frozenset({_SHVA, _DAGESH}), # בְּ standard
|
||||
frozenset({_HIRIQ, _DAGESH}), # בִּ before shva
|
||||
frozenset({_PATACH, _DAGESH}), # בַּ with definite article
|
||||
frozenset({_QAMATZ, _DAGESH}), # בָּ before chataf qamatz
|
||||
frozenset({_SEGOL, _DAGESH}), # בֶּ before chataf segol
|
||||
},
|
||||
"כ": {
|
||||
frozenset({_SHVA, _DAGESH}), # כְּ
|
||||
frozenset({_HIRIQ, _DAGESH}), # כִּ
|
||||
frozenset({_PATACH, _DAGESH}), # כַּ
|
||||
frozenset({_QAMATZ, _DAGESH}), # כָּ
|
||||
frozenset({_SEGOL, _DAGESH}), # כֶּ
|
||||
},
|
||||
"ל": {
|
||||
frozenset({_SHVA}), # לְ standard
|
||||
frozenset({_HIRIQ}), # לִ before shva
|
||||
frozenset({_PATACH}), # לַ with definite article
|
||||
frozenset({_QAMATZ}), # לָ demonstratives
|
||||
frozenset({_SEGOL}), # לֶ before chataf segol
|
||||
},
|
||||
"ו": {
|
||||
frozenset({_SHVA}), # וְ standard
|
||||
frozenset({_DAGESH}), # וּ (shureq) before shva/bumf
|
||||
frozenset({_PATACH}), # וַ before chataf patach
|
||||
frozenset({_QAMATZ}), # וָ before chataf qamatz
|
||||
frozenset({_SEGOL}), # וֶ before chataf segol
|
||||
frozenset({_HIRIQ}), # וִ before yud-shva
|
||||
},
|
||||
"מ": {
|
||||
frozenset({_HIRIQ}), # מִ standard
|
||||
frozenset({_TSERE}), # מֵ before gutturals
|
||||
},
|
||||
"ש": {
|
||||
frozenset({_SEGOL, _DAGESH}), # שֶׁ standard
|
||||
frozenset({_SEGOL, _DAGESH, _SHIN_DOT}), # שֶׁ with explicit shin dot
|
||||
},
|
||||
"ה": {
|
||||
frozenset({_PATACH}), # הַ standard definite article
|
||||
frozenset({_QAMATZ}), # הָ before gutturals
|
||||
frozenset({_SEGOL}), # הֶ before qamatz-bearing gutturals
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _is_combining_mark(ch: str) -> bool:
    """Tell whether ``ch`` is a Hebrew point attached to a consonant.

    A character qualifies when its code point lies in the
    ``_NIKKUD_LOW``..``_NIKKUD_HIGH`` range, or when it is the dagesh, the
    shin dot or the sin dot.
    """
    in_vowel_range = _NIKKUD_LOW <= ord(ch) <= _NIKKUD_HIGH
    return in_vowel_range or ch in (_DAGESH, _SHIN_DOT, _SIN_DOT)
|
||||
|
||||
|
||||
def _decompose_first_char(token: str) -> tuple[str, frozenset, str]:
    """Peel the leading consonant and its points off a token.

    Args:
        token: A nikkud Hebrew token string.

    Returns:
        ``(consonant, marks, rest)`` where ``marks`` holds every combining
        mark that directly follows the consonant. When the token is empty or
        does not begin with a letter in the alef–tav range, the result is
        ``("", frozenset(), token)``.
    """
    if not token or not ("\u05d0" <= token[0] <= "\u05ea"):
        return ("", frozenset(), token)

    # Advance past the run of combining marks that belongs to the first letter.
    boundary = 1
    for ch in token[1:]:
        if not _is_combining_mark(ch):
            break
        boundary += 1

    return (token[0], frozenset(token[1:boundary]), token[boundary:])
|
||||
|
||||
|
||||
def _is_valid_prefix(consonant: str, marks: frozenset) -> bool:
    """Decide whether a pointed consonant is a recognised prefix vocalization.

    Args:
        consonant: The prefix consonant character.
        marks: Frozenset of combining mark characters on that consonant.

    Returns:
        True when ``marks`` is listed for ``consonant`` in
        ``_VALID_PREFIX_MARKS``. For ש the shin dot is optional: the marks
        are also accepted if they match once the shin dot is removed.
    """
    allowed = _VALID_PREFIX_MARKS.get(consonant)
    if not allowed:
        return False
    if marks in allowed:
        return True
    return consonant == "ש" and (marks - {_SHIN_DOT}) in allowed
|
||||
|
||||
|
||||
def _rebuild_token(consonant: str, marks: frozenset, rest: str) -> str:
|
||||
"""Reassemble a token from its decomposed parts, sorting marks by codepoint."""
|
||||
return consonant + "".join(sorted(marks)) + rest
|
||||
|
||||
|
||||
def _try_strip_prefix(token: str, nikkud_index: dict) -> list[tuple[str, str, str]]:
    """Look a token up after removing one or two prefix letters.

    Args:
        token: A cleaned nikkud word token.
        nikkud_index: Mapping from nikkud form to list of (unique_key, match_type).

    Returns:
        List of (unique_key, match_type, matched_remainder) for each hit found.
        The match_type will have ``"_prefix"`` appended to the base type.
        Hits for the one-letter strip come before hits for the two-letter strip.
    """
    hits: list[tuple[str, str, str]] = []

    def _collect(stem: str) -> None:
        # First the stem exactly as written ...
        hits.extend(
            (unique_key, match_type + "_prefix", stem)
            for unique_key, match_type in nikkud_index.get(stem, ())
        )
        # ... then with the dagesh removed from its first letter
        # (handles absorbed definite article: לַמֶּלֶךְ → מֶּלֶךְ → מֶלֶךְ).
        head, head_marks, tail = _decompose_first_char(stem)
        if head and _DAGESH in head_marks:
            softened = _rebuild_token(head, head_marks - {_DAGESH}, tail)
            if softened != stem:
                hits.extend(
                    (unique_key, match_type + "_prefix", softened)
                    for unique_key, match_type in nikkud_index.get(softened, ())
                )

    first, first_marks, after_first = _decompose_first_char(token)
    if not first or not _is_valid_prefix(first, first_marks) or not after_first:
        return hits

    _collect(after_first)

    # Only ו and ש are tried as the outer letter of a stacked prefix.
    if first in "וש":
        second, second_marks, after_second = _decompose_first_char(after_first)
        if (
            second
            and second in _PREFIX_CONSONANTS
            and _is_valid_prefix(second, second_marks)
            and after_second
        ):
            _collect(after_second)

    return hits
|
||||
|
||||
|
||||
# Public alias for use by sentence_difficulty module
|
||||
try_strip_prefix = _try_strip_prefix
|
||||
|
||||
|
||||
def _build_nikkud_index(words: dict) -> dict[str, list[tuple[str, str]]]:
|
||||
"""Build a mapping from nikkud form to list of (unique_key, match_type).
|
||||
|
||||
Indexes the following sources per entry:
|
||||
|
||||
- ``word.nikkud`` → "direct"
|
||||
- conjugation active/passive forms → "conjugated"
|
||||
- conjugation infinitive and reference_form → "conjugated"
|
||||
- noun inflection singular/plural/construct/pronominal → "inflected"
|
||||
|
||||
Args:
|
||||
words: The full words.json dict keyed by unique_key.
|
||||
|
||||
Returns:
|
||||
Dict mapping each nikkud form to a list of (unique_key, match_type) tuples.
|
||||
"""
|
||||
index: dict[str, list[tuple[str, str]]] = {}
|
||||
|
||||
def _add(form: str | None, unique_key: str, match_type: str) -> None:
|
||||
if form:
|
||||
index.setdefault(form, []).append((unique_key, match_type))
|
||||
|
||||
for unique_key, entry in words.items():
|
||||
# Direct word form
|
||||
word = entry.get("word") or {}
|
||||
_add(word.get("nikkud"), unique_key, "direct")
|
||||
|
||||
# Conjugation forms
|
||||
conj = entry.get("conjugation") or {}
|
||||
|
||||
for form_entry in conj.get("active_forms") or []:
|
||||
form = (form_entry.get("form") or {}).get("nikkud")
|
||||
_add(form, unique_key, "conjugated")
|
||||
|
||||
for form_entry in conj.get("hufal_pual_forms") or []:
|
||||
form = (form_entry.get("form") or {}).get("nikkud")
|
||||
_add(form, unique_key, "conjugated")
|
||||
|
||||
inf = conj.get("infinitive") or {}
|
||||
_add(inf.get("nikkud"), unique_key, "conjugated")
|
||||
|
||||
ref = conj.get("reference_form") or {}
|
||||
_add(ref.get("nikkud"), unique_key, "conjugated")
|
||||
|
||||
# Noun inflection forms
|
||||
noun = entry.get("noun_inflection") or {}
|
||||
|
||||
for field in ("singular", "plural", "construct_singular", "construct_plural"):
|
||||
sub = noun.get(field) or {}
|
||||
form = sub.get("nikkud")
|
||||
_add(form, unique_key, "inflected")
|
||||
# Index construct forms without maqaf too — modern text often
|
||||
# writes smichut as two space-separated words without maqaf
|
||||
if form and form.endswith("־"):
|
||||
_add(form[:-1], unique_key, "inflected")
|
||||
|
||||
pronominal = noun.get("pronominal_suffixes") or {}
|
||||
for _person, sub in pronominal.items():
|
||||
if isinstance(sub, dict):
|
||||
_add(sub.get("nikkud"), unique_key, "inflected")
|
||||
|
||||
return index
|
||||
|
||||
|
||||
def _filter_collision_forms(nikkud_index: dict) -> dict:
    """Drop ambiguous forms from entries that can be matched some other way.

    A form "collides" when it maps to two or more distinct unique_keys. An
    entry is removed from a colliding form's list if that entry also owns at
    least one non-colliding form. Entries whose indexed forms *all* collide
    are left in place, since removing them would leave the entry unmatchable.
    A colliding form with no entries left is omitted from the result.

    Returns a new index dict with the same structure; the number of removed
    mappings is logged at INFO level.
    """
    forms_by_key: dict[str, set[str]] = {}
    colliding: set[str] = set()

    for form, hits in nikkud_index.items():
        if len({uk for uk, _ in hits}) >= 2:
            colliding.add(form)
        for uk, _ in hits:
            forms_by_key.setdefault(uk, set()).add(form)

    # Keys that still have a form of their own once collisions are discounted.
    has_own_form = {uk for uk, forms in forms_by_key.items() if forms - colliding}

    filtered: dict[str, list[tuple[str, str]]] = {}
    removed = 0
    for form, hits in nikkud_index.items():
        if form not in colliding:
            filtered[form] = hits
            continue
        survivors = [(uk, mt) for uk, mt in hits if uk not in has_own_form]
        removed += len(hits) - len(survivors)
        if survivors:
            filtered[form] = survivors

    logger.info(f" Filtered {removed} collision mappings from entries with unique forms")
    return filtered
|
||||
|
||||
|
||||
# ── Matching ─────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def match_sentences(
    sentences: list[dict],
    nikkud_index: dict,
    confusable_keys: set[str],
) -> dict:
    """Find, for every sentence token, the vocab entries it corresponds to.

    Each whitespace-separated token is stripped of boundary punctuation and
    looked up in the index. Prefix stripping is attempted only when the
    cleaned token itself is not an index key.

    Args:
        sentences: List of ``{"text": str, "source": str}`` dicts.
        nikkud_index: Output of ``_build_nikkud_index``.
        confusable_keys: Set of unique_keys that are in confusable groups.
            Not consulted by this function.

    Returns:
        Dict mapping unique_key → list of match dicts, each containing:
        ``text``, ``source``, ``match_method``, ``word_count``,
        ``matched_form``, ``char_offset``, ``char_end``.
    """
    matches: dict[str, list[dict]] = {}

    for sentence in sentences:
        text, source = sentence["text"], sentence["source"]
        tokens = text.split()
        word_count = len(tokens)

        # Search cursor: each token is located at or after the end of the
        # previous one so repeated words get distinct offsets.
        cursor = 0
        for raw in tokens:
            located = text.find(raw, cursor)
            core = _PUNCT.sub("", raw)

            if not core:
                # Pure punctuation: nothing to match, just move the cursor on.
                if located >= 0:
                    cursor = located + len(raw)
                continue

            raw_start = located if located >= 0 else cursor
            # Offsets of the punctuation-free part within the sentence.
            core_start = raw_start + max(raw.find(core), 0)
            core_end = core_start + len(core)

            if core in nikkud_index:
                hits = [(unique_key, match_type) for unique_key, match_type in nikkud_index[core]]
            else:
                hits = [
                    (unique_key, match_type)
                    for unique_key, match_type, _remainder in _try_strip_prefix(core, nikkud_index)
                ]

            for unique_key, match_method in hits:
                matches.setdefault(unique_key, []).append(
                    {
                        "text": text,
                        "source": source,
                        "match_method": match_method,
                        "word_count": word_count,
                        "matched_form": core,
                        "char_offset": core_start,
                        "char_end": core_end,
                    }
                )

            cursor = raw_start + len(raw)

    return matches
|
||||
|
||||
|
||||
# ── Writing results ──────────────────────────────────────────────
|
||||
|
||||
|
||||
def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) -> int:
    """Update words dict entries with matched example sentences.

    For each matched word, sentences are deduplicated by text. Direct matches
    are preferred, and prefix matches are used only when there are no direct
    ones. The pool is ordered by ``score_sentence`` difficulty (lowest
    first), and the first three become ``examples.vetted``. The easiest
    sentence also becomes ``examples.cloze``, unless the word is in the
    confusable set, in which case any existing cloze is removed.
    ``examples.rejected_count`` is reset to 0 for every updated entry.

    Args:
        words: The full words.json dict, modified in place.
        matches: Output of ``match_sentences``.
        confusable_keys: Set of unique_keys in confusable groups.

    Returns:
        Count of words.json entries that were updated.
    """
    import genanki  # noqa: PLC0415 — import only where needed

    # Build frequency scoring infrastructure (once for all words)
    nikkud_index = _build_nikkud_index(words)
    nikkud_map = build_nikkud_map(words)
    freq_data = frequency_lookup.get_freq_data()

    def _difficulty(s: dict):
        # Lower = easier: the context words around the target are more common.
        return score_sentence(
            s["text"],
            s["char_offset"],
            s["char_end"],
            nikkud_map,
            nikkud_index,
            freq_data,
        )

    updated = 0

    for unique_key, sent_list in matches.items():
        if unique_key not in words:
            continue

        entry = words[unique_key]

        # Deduplicate by sentence text (first occurrence wins)
        seen_texts: set[str] = set()
        unique: list[dict] = []
        for s in sent_list:
            if s["text"] not in seen_texts:
                seen_texts.add(s["text"])
                unique.append(s)

        # Prefer direct matches; only fall back to prefix if none exist
        direct = [s for s in unique if "prefix" not in s["match_method"]]
        prefix_only = [s for s in unique if "prefix" in s["match_method"]]
        pool = direct if direct else prefix_only

        # Score every candidate exactly once; the stable sort keeps the
        # original order among sentences with equal difficulty.
        ranked = sorted(((_difficulty(s), s) for s in pool), key=lambda pair: pair[0])
        best = ranked[:3]

        # Build vetted list (easiest first)
        if not entry.get("examples"):
            entry["examples"] = {}
        examples: dict = entry["examples"]
        examples["vetted"] = [
            {
                "text": s["text"],
                "source": s["source"],
                "match_method": s["match_method"],
            }
            for _score, s in best
        ]

        # Build cloze from best sentence (skip confusables)
        is_confusable = unique_key in confusable_keys
        if not is_confusable and best:
            top_score, top = best[0]
            # Preserve existing cloze_guid if sentence text unchanged
            old_cloze = examples.get("cloze") or {}
            cloze_guid = None
            if old_cloze.get("text") == top["text"]:
                cloze_guid = old_cloze.get("cloze_guid")
            if not cloze_guid:
                # Also covers an old cloze that was stored without a GUID,
                # which previously propagated None into the new cloze.
                cloze_guid = genanki.guid_for("cloze", unique_key)

            examples["cloze"] = {
                "text": top["text"],
                "cloze_word_start": top["char_offset"],
                "cloze_word_end": top["char_end"],
                "cloze_hint": None,
                "cloze_guid": cloze_guid,
                "difficulty_score": top_score,
            }
        elif is_confusable:
            examples.pop("cloze", None)

        examples["rejected_count"] = 0
        updated += 1

    # Deduplicate shared examples across confusable groups
    cleared = _deduplicate_confusable_examples(words)
    if cleared:
        logger.info(f" Cleared shared examples from {cleared} confusable entries")

    return updated
|
||||
|
||||
|
||||
def _deduplicate_confusable_examples(words: dict) -> int:
    """Remove shared examples from less-common confusable group members.

    After example matching assigns sentences, confusable entries often share
    identical examples (matched via shared nikkud forms). This function keeps
    examples only on the highest-frequency member, clearing others.

    Members are compared by the *set* of their vetted sentence texts; only
    members whose sets are identical and non-empty compete with each other.
    The winner is the member with the lowest ``frequency_rank``; members
    without a rank always sort last, and ties are broken alphabetically by
    unique_key.

    Args:
        words: The full words.json dict, modified in place (examples already
            assigned).

    Returns:
        Count of entries whose examples were cleared.
    """
    import math
    from collections import defaultdict

    def _sort_key(k: str) -> tuple[float, str]:
        # math.inf (rather than a large magic number) guarantees that an
        # unranked entry can never beat a ranked one, however large the rank.
        rank = words[k].get("frequency_rank")
        return (rank if rank is not None else math.inf, k)

    # Build confusable group map: group_id -> [unique_key, ...]
    group_map: dict[tuple[str, ...], list[str]] = defaultdict(list)
    for key, entry in words.items():
        cg = entry.get("confusable_group")
        if cg:
            group_map[tuple(sorted(cg))].append(key)

    cleared = 0

    for members in group_map.values():
        if len(members) < 2:
            continue

        # Bucket members by their (non-empty) set of vetted sentence texts.
        text_groups: dict[frozenset[str], list[str]] = defaultdict(list)
        for key in members:
            vetted = (words[key].get("examples") or {}).get("vetted") or []
            texts = frozenset(e.get("text", "") for e in vetted)
            if texts:  # skip entries with no examples
                text_groups[texts].append(key)

        # Within each bucket keep the examples only on the most frequent member.
        for sharing_keys in text_groups.values():
            if len(sharing_keys) < 2:
                continue

            winner, *losers = sorted(sharing_keys, key=_sort_key)

            for loser_key in losers:
                entry = words[loser_key]
                examples = entry.get("examples") or {}
                examples["vetted"] = []
                examples.pop("cloze", None)
                entry["examples"] = examples
                cleared += 1
                logger.debug(f" Cleared examples from {loser_key} (kept on {winner})")

    return cleared
|
||||
|
||||
|
||||
# ── Public API ───────────────────────────────────────────────────
|
||||
|
||||
|
||||
def run(words: dict) -> dict:
    """Collect example sentences from the discovered books and attach them to ``words``.

    Pipeline: extract sentences from every discovered source (``.txt`` files
    go through the plain-text extractor, everything else through the EPUB
    extractor), build and filter the nikkud index, match sentences against
    the vocabulary, then write the matches into ``words``.

    Called from run.py with the already-loaded words.json dict.

    Args:
        words: The full words.json dict keyed by unique_key. Modified in place.

    Returns:
        Summary stats dict with keys ``books``, ``matched``, ``total_vocab``.
        When no sentences were extracted, ``books`` is empty and ``matched``
        is 0.
    """
    logger.info(" Extracting sentences from EPUBs ...")
    book_counts: dict[str, int] = {}
    all_sentences: list[dict] = []

    for filepath, book_name in _discover_epubs().items():
        source = Path(filepath)
        extractor = extract_sentences_from_text if source.suffix == ".txt" else extract_sentences_from_epub
        extracted = extractor(source, book_name)
        book_counts[book_name] = len(extracted)
        all_sentences.extend(extracted)
        logger.info(f" {book_name}: {len(extracted)} sentences")

    if not all_sentences:
        logger.warning(" No EPUB files found — skipping example extraction")
        return {"books": {}, "matched": 0, "total_vocab": len(words)}

    logger.info(f" Total sentences: {len(all_sentences)}")

    # Index every nikkud form, then drop collision forms for entries that
    # also have unique forms.
    logger.info(" Building nikkud index from words.json ...")
    nikkud_index = _build_nikkud_index(words)
    logger.info(f" {len(nikkud_index)} unique nikkud forms indexed")
    nikkud_index = _filter_collision_forms(nikkud_index)

    # Keys of all entries that belong to a confusable group.
    confusable_keys: set[str] = {
        key for key, entry in words.items() if entry.get("confusable_group")
    }

    logger.info(" Matching sentences against vocab ...")
    matches = match_sentences(all_sentences, nikkud_index, confusable_keys)
    logger.info(f" {len(matches)} words matched")

    # Tally sentence-word pairs per match method and report them alphabetically.
    method_counts: dict[str, int] = {}
    for matched in (s for sent_list in matches.values() for s in sent_list):
        name = matched["match_method"]
        method_counts[name] = method_counts.get(name, 0) + 1
    for name in sorted(method_counts):
        logger.info(f" {name}: {method_counts[name]} sentence-word pairs")

    updated = update_words_json(words, matches, confusable_keys)
    logger.info(f" Updated {updated} entries in words.json")

    return {
        "books": book_counts,
        "matched": len(matches),
        "total_vocab": len(words),
    }
|
||||
|
||||
|
||||
# ── Standalone entry point ───────────────────────────────────────
|
||||
|
||||
if __name__ == "__main__":
    # Standalone mode: load words.json, attach examples, write it back.
    import json

    logging.basicConfig(level=logging.INFO, format="%(message)s")

    words_path = DATA_DIR / "words.json"
    with open(words_path, encoding="utf-8") as f:
        words = json.load(f)

    stats = run(words)

    # Serialize completely BEFORE opening the file for writing: opening with
    # "w" truncates immediately, so a failure inside json.dump would otherwise
    # leave words.json empty or half-written.
    serialized = json.dumps(words, ensure_ascii=False, indent=2)
    with open(words_path, "w", encoding="utf-8") as f:
        f.write(serialized)

    coverage = stats["matched"] * 100 / stats["total_vocab"] if stats["total_vocab"] else 0
    logger.info(f" Coverage: {stats['matched']}/{stats['total_vocab']} ({coverage:.1f}%)")
|
||||
|
|
@ -21,10 +21,9 @@ from pathlib import Path
|
|||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PROJECT_ROOT = Path(__file__).resolve().parent.parent
|
||||
PDF_URL = "" # Set to URL or local path of Coffin & Bolozky PDF
|
||||
PDF_URL = "https://books.nevo.engineer/opds/download/117/pdf/"
|
||||
PDF_PATH = Path("/tmp/coffin_bolozky.pdf")
|
||||
OUTPUT_PATH = PROJECT_ROOT / "verbs_input.txt"
|
||||
OUTPUT_PATH = Path(__file__).parent / "verbs_input.txt"
|
||||
|
||||
# Pages to scan (Appendix 1)
|
||||
PAGE_START = 390
|
||||
|
|
@ -32,38 +31,24 @@ PAGE_END = 411
|
|||
|
||||
# Binyan headings in Hebrew (vowelled and unvowelled variants)
|
||||
BINYAN_HEADINGS_HEB = [
|
||||
"פָּעַל",
|
||||
"פעל",
|
||||
"נִפְעַל",
|
||||
"נפעל",
|
||||
"פִּעֵל",
|
||||
"פיעל",
|
||||
"פֻּעַל",
|
||||
"פועל",
|
||||
"הִתְפַּעֵל",
|
||||
"התפעל",
|
||||
"הִפְעִיל",
|
||||
"הפעיל",
|
||||
"הֻפְעַל",
|
||||
"הופעל",
|
||||
"פָּעַל", "פעל",
|
||||
"נִפְעַל", "נפעל",
|
||||
"פִּעֵל", "פיעל",
|
||||
"פֻּעַל", "פועל",
|
||||
"הִתְפַּעֵל", "התפעל",
|
||||
"הִפְעִיל", "הפעיל",
|
||||
"הֻפְעַל", "הופעל",
|
||||
]
|
||||
|
||||
# Binyan heading → canonical name
|
||||
BINYAN_CANONICAL = {
|
||||
"פָּעַל": "Pa'al",
|
||||
"פעל": "Pa'al",
|
||||
"נִפְעַל": "Nif'al",
|
||||
"נפעל": "Nif'al",
|
||||
"פִּעֵל": "Pi'el",
|
||||
"פיעל": "Pi'el",
|
||||
"פֻּעַל": "Pu'al",
|
||||
"פועל": "Pu'al",
|
||||
"הִתְפַּעֵל": "Hitpa'el",
|
||||
"התפעל": "Hitpa'el",
|
||||
"הִפְעִיל": "Hif'il",
|
||||
"הפעיל": "Hif'il",
|
||||
"הֻפְעַל": "Huf'al",
|
||||
"הופעל": "Huf'al",
|
||||
"פָּעַל": "Pa'al", "פעל": "Pa'al",
|
||||
"נִפְעַל": "Nif'al", "נפעל": "Nif'al",
|
||||
"פִּעֵל": "Pi'el", "פיעל": "Pi'el",
|
||||
"פֻּעַל": "Pu'al", "פועל": "Pu'al",
|
||||
"הִתְפַּעֵל": "Hitpa'el", "התפעל": "Hitpa'el",
|
||||
"הִפְעִיל": "Hif'il", "הפעיל": "Hif'il",
|
||||
"הֻפְעַל": "Huf'al", "הופעל": "Huf'al",
|
||||
}
|
||||
|
||||
# Passive binyan names — no infinitive, use 3ms past
|
||||
|
|
@ -171,16 +156,15 @@ FALLBACK_VERBS = """# Verb list from Coffin & Bolozky, A Reference Grammar of Mo
|
|||
def _install_deps():
|
||||
"""Install pymupdf and python-bidi if not available."""
|
||||
try:
|
||||
import bidi # noqa: F401
|
||||
import fitz # noqa: F401
|
||||
|
||||
import bidi # noqa: F401
|
||||
return True
|
||||
except ImportError:
|
||||
logger.info("Installing pymupdf and python-bidi …")
|
||||
import subprocess
|
||||
|
||||
result = subprocess.run(
|
||||
[sys.executable, "-m", "pip", "install", "pymupdf", "python-bidi", "--break-system-packages", "-q"],
|
||||
[sys.executable, "-m", "pip", "install",
|
||||
"pymupdf", "python-bidi", "--break-system-packages", "-q"],
|
||||
capture_output=True,
|
||||
)
|
||||
if result.returncode != 0:
|
||||
|
|
@ -198,7 +182,6 @@ def _download_pdf() -> bool:
|
|||
logger.info(f"Downloading PDF from {PDF_URL} …")
|
||||
try:
|
||||
import requests
|
||||
|
||||
resp = requests.get(PDF_URL, timeout=120, stream=True)
|
||||
resp.raise_for_status()
|
||||
PDF_PATH.write_bytes(resp.content)
|
||||
|
|
@ -228,7 +211,10 @@ def _needs_bidi_fix(text: str) -> bool:
|
|||
|
||||
|
||||
def _strip_nikkud(text: str) -> str:
|
||||
return "".join(ch for ch in unicodedata.normalize("NFD", text) if unicodedata.category(ch) != "Mn")
|
||||
return "".join(
|
||||
ch for ch in unicodedata.normalize("NFD", text)
|
||||
if unicodedata.category(ch) != "Mn"
|
||||
)
|
||||
|
||||
|
||||
def _extract_from_pdf() -> list[tuple[str, str, str]]:
|
||||
|
|
@ -258,9 +244,10 @@ def _extract_from_pdf() -> list[tuple[str, str, str]]:
|
|||
# Check if we need bidi correction
|
||||
test_text = ""
|
||||
try:
|
||||
for page_num in range(min(PAGE_START, doc.page_count - 1), min(PAGE_START + 3, doc.page_count)):
|
||||
for page_num in range(min(PAGE_START, doc.page_count - 1),
|
||||
min(PAGE_START + 3, doc.page_count)):
|
||||
test_text += doc[page_num].get_text("text")
|
||||
except Exception: # noqa: S110
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
use_bidi = _needs_bidi_fix(test_text)
|
||||
|
|
@ -272,7 +259,6 @@ def _extract_from_pdf() -> list[tuple[str, str, str]]:
|
|||
return t
|
||||
try:
|
||||
from bidi.algorithm import get_display
|
||||
|
||||
lines = t.split("\n")
|
||||
fixed = []
|
||||
for line in lines:
|
||||
|
|
@ -288,7 +274,7 @@ def _extract_from_pdf() -> list[tuple[str, str, str]]:
|
|||
for page_num in range(PAGE_START - 1, page_end): # fitz is 0-indexed
|
||||
try:
|
||||
raw = doc[page_num].get_text("text")
|
||||
except Exception: # noqa: S112
|
||||
except Exception:
|
||||
continue
|
||||
|
||||
text = fix_text(raw)
|
||||
|
|
@ -330,12 +316,9 @@ def _extract_from_pdf() -> list[tuple[str, str, str]]:
|
|||
heb_words = re.findall(r"[\u05d0-\u05ea\u05b0-\u05c7]{3,}", line)
|
||||
for w in heb_words:
|
||||
stripped_w = _strip_nikkud(w)
|
||||
if (
|
||||
current_binyan == "Pu'al"
|
||||
and stripped_w.startswith("פ")
|
||||
or current_binyan == "Huf'al"
|
||||
and stripped_w.startswith("ה")
|
||||
):
|
||||
if current_binyan == "Pu'al" and stripped_w.startswith("פ"):
|
||||
entries.append((current_binyan, "3ms", w))
|
||||
elif current_binyan == "Huf'al" and stripped_w.startswith("ה"):
|
||||
entries.append((current_binyan, "3ms", w))
|
||||
|
||||
doc.close()
|
||||
|
|
@ -374,20 +357,16 @@ def _write_output(entries: list[tuple[str, str, str]]) -> None:
|
|||
lines.append(form)
|
||||
|
||||
OUTPUT_PATH.write_text("\n".join(lines) + "\n", encoding="utf-8")
|
||||
verb_count = sum(1 for ln in lines if ln and not ln.startswith("#"))
|
||||
passive_count = sum(1 for ln in lines if ln.startswith("# 3ms:"))
|
||||
verb_count = sum(1 for l in lines if l and not l.startswith("#"))
|
||||
passive_count = sum(1 for l in lines if l.startswith("# 3ms:"))
|
||||
logger.info(f"Written {verb_count} active verbs + {passive_count} passive (3ms) → {OUTPUT_PATH}")
|
||||
|
||||
|
||||
def _binyan_heb(name: str) -> str:
|
||||
mapping = {
|
||||
"Pa'al": "פָּעַל",
|
||||
"Nif'al": "נִפְעַל",
|
||||
"Pi'el": "פִּעֵל",
|
||||
"Pu'al": "פֻּעַל",
|
||||
"Hitpa'el": "הִתְפַּעֵל",
|
||||
"Hif'il": "הִפְעִיל",
|
||||
"Huf'al": "הֻפְעַל",
|
||||
"Pa'al": "פָּעַל", "Nif'al": "נִפְעַל", "Pi'el": "פִּעֵל",
|
||||
"Pu'al": "פֻּעַל", "Hitpa'el": "הִתְפַּעֵל",
|
||||
"Hif'il": "הִפְעִיל", "Huf'al": "הֻפְעַל",
|
||||
}
|
||||
return mapping.get(name, name)
|
||||
|
||||
|
|
@ -3,43 +3,44 @@
|
|||
Hebrew word frequency lookup from hermitdave/FrequencyWords corpus.
|
||||
Downloads he_50k.txt once; subsequent runs read from cache.
|
||||
Exposed API: get_frequency_rank(word_no_nikkud) -> int | None
|
||||
|
||||
TODO: Rewrite to update words.json frequency field directly instead of
|
||||
writing to a separate frequency_cache.json. Currently the migration script
|
||||
bridges the gap. See Phase 5 in SPRINT_LOG.md.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
FREQ_URL = "https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2016/he/he_50k.txt"
|
||||
FREQ_URL = (
|
||||
"https://raw.githubusercontent.com/hermitdave/FrequencyWords/"
|
||||
"master/content/2016/he/he_50k.txt"
|
||||
)
|
||||
CACHE_PATH = Path(__file__).parent / "data" / "frequency_cache.json"
|
||||
CLEAN_CACHE_PATH = Path(__file__).parent / "data" / "frequency_clean.json"
|
||||
REQUEST_TIMEOUT = 30
|
||||
|
||||
# Module-level cache: word_no_nikkud -> rank (1 = most common)
|
||||
_freq: dict[str, int] = {}
|
||||
|
||||
|
||||
def load(cache_path: Path = CACHE_PATH) -> None:
|
||||
"""Load frequency data from cache, downloading if not present.
|
||||
def _strip_nikkud(text: str) -> str:
|
||||
"""Remove Hebrew nikkud (diacritics) from a string."""
|
||||
return "".join(
|
||||
ch for ch in unicodedata.normalize("NFD", text)
|
||||
if unicodedata.category(ch) != "Mn"
|
||||
)
|
||||
|
||||
Prefers frequency_clean.json (YAP-filtered) over raw frequency_cache.json.
|
||||
"""
|
||||
|
||||
def load(cache_path: Path = CACHE_PATH) -> None:
|
||||
"""Load frequency data from cache, downloading if not present."""
|
||||
global _freq
|
||||
# Prefer YAP-cleaned frequency data if available
|
||||
clean_path = cache_path.parent / "frequency_clean.json" if cache_path == CACHE_PATH else None
|
||||
load_path = clean_path if clean_path and clean_path.exists() else cache_path
|
||||
if load_path.exists():
|
||||
with open(load_path, encoding="utf-8") as f:
|
||||
if cache_path.exists():
|
||||
with open(cache_path, encoding="utf-8") as f:
|
||||
_freq = json.load(f)
|
||||
label = "clean" if load_path == clean_path else "raw"
|
||||
logger.info(f"Frequency cache loaded ({label}): {len(_freq)} entries")
|
||||
logger.info(f"Frequency cache loaded: {len(_freq)} entries")
|
||||
return
|
||||
|
||||
logger.info("Downloading FrequencyWords he_50k.txt …")
|
||||
|
|
@ -51,7 +52,7 @@ def load(cache_path: Path = CACHE_PATH) -> None:
|
|||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
word = line.split()[0]
|
||||
word = _strip_nikkud(line.split()[0])
|
||||
if word and word not in _freq:
|
||||
_freq[word] = rank
|
||||
rank += 1
|
||||
|
|
@ -66,24 +67,14 @@ def get_frequency_rank(word_no_nikkud: str) -> int | None:
|
|||
"""
|
||||
Return the frequency rank of a word (1 = most common).
|
||||
Returns None if not found in the corpus.
|
||||
Expects ktiv male (no nikkud) input.
|
||||
Strips nikkud from the input before lookup.
|
||||
"""
|
||||
if not _freq:
|
||||
load()
|
||||
clean = word_no_nikkud.strip()
|
||||
clean = _strip_nikkud(word_no_nikkud.strip())
|
||||
return _freq.get(clean)
|
||||
|
||||
|
||||
def get_freq_data() -> dict[str, int]:
|
||||
"""Return the full frequency dict (word -> rank).
|
||||
|
||||
Auto-loads from cache if not yet loaded.
|
||||
"""
|
||||
if not _freq:
|
||||
load()
|
||||
return _freq
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
|
||||
load()
|
||||
|
|
|
|||
219
hebrew_extract.py
Normal file
219
hebrew_extract.py
Normal file
|
|
@ -0,0 +1,219 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract Hebrew vocabulary from pealim.com dictionary.
|
||||
Scrapes word entries, roots, parts of speech, and audio URLs for Anki flashcards.
|
||||
"""
|
||||
|
||||
import requests
|
||||
import pandas as pd
|
||||
from bs4 import BeautifulSoup
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Session for connection pooling
|
||||
session = requests.Session()
|
||||
session.headers.update({
|
||||
'User-Agent': 'Mozilla/5.0 (compatible; pealim-scraper/1.0)'
|
||||
})
|
||||
|
||||
PEALIM_DICT_URL = "https://www.pealim.com/dict/"
|
||||
REQUEST_DELAY = 1.5 # seconds between requests (respectful scraping)
|
||||
REQUEST_TIMEOUT = 10 # seconds
|
||||
|
||||
|
||||
def get_total_pages() -> int:
    """Return the number of dictionary pages to scrape.

    The count is NOT parsed from the site. The first dictionary page is
    requested only as a reachability probe; whether the probe succeeds or
    fails (a failure is logged), the fixed estimate of 608 is returned.
    """
    # Hardcoded — pealim.com has ~608 pages at ~15 words/page
    page_estimate = 608
    try:
        logger.info("Fetching total page count...")
        probe = session.get(
            PEALIM_DICT_URL,
            cookies={'translit': 'none', 'hebstyle': 'mo'},
            timeout=REQUEST_TIMEOUT,
        )
        probe.raise_for_status()
    except Exception as e:
        logger.error(f"Error fetching page count: {e}. Using default (608).")
    return page_estimate
|
||||
|
||||
|
||||
def _parse_page_with_audio(html_bytes: bytes) -> list[dict]:
    """
    Turn one dictionary page into a list of word records.

    Every table row with at least four cells yields one record, provided its
    word text is non-empty. Record keys: Word, Root, Part of Speech, Meaning,
    audio_url. A missing root is stored as '-', a missing audio URL as ''.
    """
    document = BeautifulSoup(html_bytes, 'html.parser')
    records = []
    for table_row in document.select('table tr'):
        cells = table_row.find_all('td')
        if len(cells) < 4:
            continue
        word_cell, root_cell, pos_cell, meaning_cell = cells[:4]

        # The audio URL sits in the data-audio attribute of an element in the word cell.
        audio_holder = word_cell.find(attrs={'data-audio': True})
        # Prefer the span.menukad text for the word; fall back to the whole cell.
        vocalized = word_cell.find('span', class_='menukad')
        word = (vocalized if vocalized else word_cell).get_text(strip=True)
        if not word:
            continue

        records.append({
            'Word': word,
            'Root': root_cell.get_text(strip=True) or '-',
            'Part of Speech': pos_cell.get_text(strip=True),
            'Meaning': meaning_cell.get_text(strip=True),
            'audio_url': audio_holder['data-audio'] if audio_holder else '',
        })
    return records
|
||||
|
||||
|
||||
def _fetch_page_rows(url: str) -> list[dict]:
    """Fetch one dictionary page in both spellings and return its merged rows."""
    # First request: with nikkud — parsed for the word data and the audio URL.
    response = session.get(
        url, cookies={'translit': 'none', 'hebstyle': 'mo'}, timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()
    page_rows = _parse_page_with_audio(response.content)

    # Second request: without nikkud — only the word column is used.
    resp_vl = session.get(
        url,
        cookies={'translit': 'none', 'hebstyle': 'vl', 'showmeaning': 'off'},
        timeout=REQUEST_TIMEOUT,
    )
    resp_vl.raise_for_status()
    soup_vl = BeautifulSoup(resp_vl.content, 'html.parser')
    no_nik_words = []
    for tr in soup_vl.select('table tr'):
        tds = tr.find_all('td')
        if len(tds) < 4:
            continue
        menukad = tds[0].find('span', class_='menukad')
        no_nik_words.append((menukad if menukad else tds[0]).get_text(strip=True))

    # The two parses are merged purely by position, so make a mismatch visible
    # instead of silently padding with empty strings.
    if len(no_nik_words) != len(page_rows):
        logger.warning(
            f"Row count mismatch on {url}: {len(page_rows)} vocalized vs "
            f"{len(no_nik_words)} unvocalized rows"
        )
    for i, row in enumerate(page_rows):
        row['Word Without Nikkud'] = no_nik_words[i] if i < len(no_nik_words) else ''
    return page_rows


def extract_from_website(max_pages: Optional[int] = None) -> pd.DataFrame:
    """
    Extract dictionary entries from pealim.com.
    Captures audio URLs from each word entry's data-audio attribute.

    Pages 1 through the page count (inclusive) are fetched. A page whose
    request fails is retried once; if the retry also fails, or an unexpected
    error occurs, the page is logged and skipped so the run continues.

    Args:
        max_pages: Maximum pages to scrape (None = all)

    Returns:
        DataFrame with Word, Root, Part of Speech, Meaning, Word Without Nikkud, audio_url columns
    """
    total_pages = max_pages or get_total_pages()
    logger.info(f"Starting extraction from {total_pages} pages...")

    max_attempts = 2  # one initial try plus one retry per page
    all_rows: list[dict] = []

    # Inclusive upper bound: range(1, total_pages) would never fetch the last page.
    for page_num in range(1, total_pages + 1):
        url = f"{PEALIM_DICT_URL}?page={page_num}"

        for attempt in range(1, max_attempts + 1):
            try:
                page_rows = _fetch_page_rows(url)
            except requests.RequestException as e:
                if attempt < max_attempts:
                    logger.error(f"Error fetching page {page_num}: {e}. Retrying...")
                    time.sleep(REQUEST_DELAY * 2)
                    continue
                logger.error(f"Error fetching page {page_num}: {e}. Skipping page.")
                time.sleep(REQUEST_DELAY * 2)
            except Exception as e:
                logger.error(f"Unexpected error on page {page_num}: {e}")
            else:
                all_rows.extend(page_rows)
                if page_num % 50 == 0:
                    logger.info(f"Processed {page_num}/{total_pages} pages ({len(all_rows)} words so far)...")
                time.sleep(REQUEST_DELAY)
            break

    df = pd.DataFrame(all_rows)
    # An empty scrape yields a frame without columns, hence the guard.
    audio_count = (df['audio_url'] != '').sum() if 'audio_url' in df.columns else 0
    logger.info(f"Extraction complete. Total words: {len(df)}, with audio URL: {audio_count}")
    return df
|
||||
|
||||
|
||||
def modify_for_anki(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform dictionary DataFrame for Anki import.
    Adds shared root words and Hebrew tags. Preserves audio_url column.

    Two columns are added to ``df`` in place, and the same frame is returned:

    - ``shared roots``: space-joined words of all other rows with the same
      root ('' for rows whose root is '-' or missing).
    - ``tags``: a ``שורש::<root>`` tag (root with spaces, hyphens and dots
      removed) followed by a part-of-speech tag, space-separated.
    """
    logger.info("Preparing data for Anki...")

    roots = df['Root'].tolist()
    words = df['Word'].tolist()

    def _has_root(root) -> bool:
        return root != '-' and pd.notna(root)

    # Group words by root in one pass instead of filtering the whole frame
    # once per row.
    words_by_root: dict = {}
    for root, word in zip(roots, words):
        if _has_root(root):
            words_by_root.setdefault(root, []).append(word)

    df['shared roots'] = [
        ' '.join(str(w) for w in words_by_root[root] if w != word) if _has_root(root) else ''
        for root, word in zip(roots, words)
    ]

    # Order matters: keys are matched as substrings and the first hit wins,
    # so 'Adverb' must be tried before 'Verb'.
    pos_tags = {
        'Adverb': 'תוארי_הפועל',
        'Pronoun': 'כינויי_גוף',
        'Noun': 'שם_עצם',
        'Verb': 'פעלים',
        'Adjective': 'שם_תואר',
        'Preposition': 'מילות_יחס',
        'Conjunction': 'מילות_חיבור',
        'Particle': 'מילית'
    }

    tags = []
    for root, pos in zip(roots, df['Part of Speech'].tolist()):
        tag_parts = []

        # Test for a missing root on the value itself rather than by looking
        # for the substring 'nan' in its string form.
        if pd.notna(root):
            compact_root = str(root).replace(' ', '').replace('-', '')
            if compact_root:
                root_clean = compact_root.replace('.', '')
                tag_parts.append(f"שורש::{root_clean}")

        pos_text = str(pos)
        pos_tag = next((tag for name, tag in pos_tags.items() if name in pos_text), None)
        if pos_tag is not None:
            tag_parts.append(pos_tag)

        tags.append(' '.join(tag_parts))

    df['tags'] = tags
    logger.info("Anki preparation complete.")
    return df
|
||||
|
||||
|
||||
def main():
    """Scrape the dictionary and write both CSV exports.

    Writes ``hebrew_dict.csv`` (raw scrape) and ``hebrew_dict_for_anki.csv``
    (semicolon-separated, with the Anki columns added) to the current
    directory. Any exception is logged and re-raised.
    """
    try:
        scraped = extract_from_website()
        scraped.to_csv('hebrew_dict.csv', index=True)
        logger.info("Saved: hebrew_dict.csv")

        anki_ready = modify_for_anki(scraped)
        anki_ready.to_csv('hebrew_dict_for_anki.csv', sep=';', index=True)
        logger.info("Saved: hebrew_dict_for_anki.csv")

        logger.info("Complete!")
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        raise
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
"""Shared helper functions for the Hebrew Flash Cards project."""
|
||||
|
||||
import unicodedata
|
||||
|
||||
|
||||
def strip_nikkud(text: str) -> str:
    """Return ``text`` NFD-decomposed with every nonspacing mark (category Mn) removed.

    Hebrew nikkud points are nonspacing marks, so this leaves only the base
    letters; any other combining mark in the input is removed as well.
    """
    decomposed = unicodedata.normalize("NFD", text)
    base_chars = [ch for ch in decomposed if unicodedata.category(ch) != "Mn"]
    return "".join(base_chars)
|
||||
|
|
@ -2,10 +2,6 @@
|
|||
"""
|
||||
Fetch images for concrete Hebrew nouns from Wikipedia / Wikimedia Commons.
|
||||
|
||||
TODO: Rewrite to update words.json image/image_source fields directly instead of
|
||||
writing to a separate image_cache.json. Currently the migration script bridges
|
||||
the gap. See Phase 5 in SPRINT_LOG.md.
|
||||
|
||||
Scope: Noun PoS entries only. Concreteness heuristic:
|
||||
- English meaning has no abstract suffixes (-tion, -ity, -ness, -ment, -ance, -ism, -hood,
|
||||
-ship, -ure, -al, -ing when not a gerund, -ence)
|
||||
|
|
@ -26,7 +22,9 @@ import argparse
|
|||
import json
|
||||
import logging
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
|
@ -42,22 +40,20 @@ REQUEST_TIMEOUT = 10
|
|||
|
||||
# Abstract noun suffixes — words whose English meaning ends in these are skipped
|
||||
ABSTRACT_SUFFIXES = (
|
||||
"tion",
|
||||
"ity",
|
||||
"ness",
|
||||
"ment",
|
||||
"ance",
|
||||
"ence",
|
||||
"ism",
|
||||
"hood",
|
||||
"ship",
|
||||
"ure",
|
||||
"age",
|
||||
"tion", "ity", "ness", "ment", "ance", "ence", "ism",
|
||||
"hood", "ship", "ure", "age",
|
||||
)
|
||||
|
||||
session = requests.Session()
|
||||
session.headers.update(
|
||||
{"User-Agent": "pealim-anki/3.0 (educational Hebrew Anki deck builder; contact: anki@pealim.invalid)"}
|
||||
session.headers.update({
|
||||
"User-Agent": "pealim-anki/3.0 (educational Hebrew Anki deck builder; contact: anki@pealim.invalid)"
|
||||
})
|
||||
|
||||
|
||||
def _strip_nikkud(text: str) -> str:
|
||||
return "".join(
|
||||
ch for ch in unicodedata.normalize("NFD", text)
|
||||
if unicodedata.category(ch) != "Mn"
|
||||
)
|
||||
|
||||
|
||||
|
|
@ -76,7 +72,7 @@ def is_concrete(english_meaning: str) -> bool:
|
|||
|
||||
def _safe_name(word_no_nikkud: str) -> str:
|
||||
"""Create a safe ASCII-ish filename from a Hebrew word (strip to Hebrew letters only)."""
|
||||
hebrew_only = re.sub(r"[^\u05d0-\u05ea]", "", word_no_nikkud)
|
||||
hebrew_only = re.sub(r"[^\u05d0-\u05ea]", "", _strip_nikkud(word_no_nikkud))
|
||||
return hebrew_only if hebrew_only else "unknown"
|
||||
|
||||
|
||||
|
|
@ -200,7 +196,7 @@ def load_cache() -> dict:
|
|||
try:
|
||||
with open(CACHE_PATH, encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
except Exception: # noqa: S110
|
||||
except Exception:
|
||||
pass
|
||||
return {}
|
||||
|
||||
|
|
@ -259,7 +255,7 @@ def run(limit: int | None = None, dry_run: bool = False, single_word: str | None
|
|||
if single_word and word_plain != single_word:
|
||||
continue
|
||||
|
||||
cache_key = word_plain
|
||||
cache_key = word_plain or _strip_nikkud(word)
|
||||
|
||||
if cache_key in cache:
|
||||
skipped_cached += 1
|
||||
|
|
|
|||
|
|
@ -1,185 +0,0 @@
|
|||
"""Convert nikkud (vocalized) Hebrew to ktiv male (plene spelling).
|
||||
|
||||
Implements Hebrew Academy rules for matres lectionis insertion:
|
||||
- Rule A: U vowel (kubutz) → always insert vav
|
||||
- Rule B: O vowel (holam on non-vav) → insert vav
|
||||
- Rule C: I vowel (hiriq) → insert yod (conditionally)
|
||||
- Rule D: E vowel (tsere) → insert yod (limited cases)
|
||||
- Rule E/F: Consonantal vav/yod doubling
|
||||
|
||||
Reference: https://hebrew-academy.org.il/topic/hahlatot/missingvocalizationspelling/
|
||||
"""
|
||||
|
||||
import unicodedata
|
||||
|
||||
# Hebrew nikkud code points
|
||||
SHVA = "\u05b0"
|
||||
HATAF_SEGOL = "\u05b1"
|
||||
HATAF_PATAH = "\u05b2"
|
||||
HATAF_KAMATZ = "\u05b3"
|
||||
HIRIQ = "\u05b4"
|
||||
TSERE = "\u05b5"
|
||||
SEGOL = "\u05b6"
|
||||
PATAH = "\u05b7"
|
||||
KAMATZ = "\u05b8"
|
||||
HOLAM = "\u05b9"
|
||||
HOLAM_HASER = "\u05ba"
|
||||
KUBUTZ = "\u05bb"
|
||||
DAGESH = "\u05bc"
|
||||
METEG = "\u05bd"
|
||||
RAFE = "\u05bf"
|
||||
SHIN_DOT = "\u05c1"
|
||||
SIN_DOT = "\u05c2"
|
||||
|
||||
VAV = "ו"
|
||||
YOD = "י"
|
||||
MAQAF = "־"
|
||||
|
||||
VOWELS = {SHVA, HATAF_SEGOL, HATAF_PATAH, HATAF_KAMATZ, HIRIQ, TSERE, SEGOL, PATAH, KAMATZ, HOLAM, HOLAM_HASER, KUBUTZ}
|
||||
|
||||
NIKKUD_MARKS = VOWELS | {DAGESH, METEG, RAFE, SHIN_DOT, SIN_DOT}
|
||||
|
||||
|
||||
def _parse_segments(text: str) -> list[tuple[str, list[str]]]:
    """Split text into ``(base character, [combining marks])`` pairs.

    Each character whose Unicode category is not Mn starts a new segment;
    every Mn character is attached to the segment before it. Marks that
    appear before the first base character are discarded.
    """
    segments: list[tuple[str, list[str]]] = []
    for ch in text:
        if unicodedata.category(ch) != "Mn":
            segments.append((ch, []))
        elif segments:
            segments[-1][1].append(ch)
    return segments
|
||||
|
||||
|
||||
def _get_vowel(marks: list[str]) -> str | None:
|
||||
"""Extract the vowel mark from a list of combining marks."""
|
||||
for m in marks:
|
||||
if m in VOWELS:
|
||||
return m
|
||||
return None
|
||||
|
||||
|
||||
def _has_dagesh(marks: list[str]) -> bool:
    """Report whether the dagesh point is among the given combining marks."""
    return any(mark == DAGESH for mark in marks)
|
||||
|
||||
|
||||
def _is_hebrew_letter(ch: str) -> bool:
    """Report whether ``ch`` lies in the range U+05D0..U+05EA (inclusive)."""
    return ch >= "\u05d0" and ch <= "\u05ea"
|
||||
|
||||
|
||||
def _word_positions(segments: list[tuple[str, list[str]]]) -> tuple[list[int], list[int]]:
    """Count, per segment, the Hebrew letters before and after it within its own word.

    Words are delimited by whitespace, maqaf, or an ASCII hyphen. Other
    non-letter characters neither count as letters nor end the word.
    """
    before = [0] * len(segments)
    after = [0] * len(segments)

    count = 0
    for idx, (ch, _marks) in enumerate(segments):
        if _is_hebrew_letter(ch):
            before[idx] = count
            count += 1
        elif ch.isspace() or ch in (MAQAF, "-"):
            count = 0

    count = 0
    for idx in range(len(segments) - 1, -1, -1):
        ch = segments[idx][0]
        if _is_hebrew_letter(ch):
            after[idx] = count
            count += 1
        elif ch.isspace() or ch in (MAQAF, "-"):
            count = 0

    return before, after


def convert(text: str) -> str:
    """Convert nikkud Hebrew text to ktiv male.

    Strips all nikkud marks and inserts matres lectionis (vav/yod)
    according to Hebrew Academy spelling rules. Non-Hebrew characters are
    copied through unchanged (without their combining marks).

    Position-dependent rules ("word-initial", "word-final") are evaluated
    per word, so multi-word input is converted word by word. For a single
    word the result is the same as counting over the whole text.

    Args:
        text: Vocalized Hebrew text; may contain several words.

    Returns:
        The text without nikkud, with vav/yod inserted where the rules
        below call for them.
    """
    segments = _parse_segments(text)
    letters_before, letters_after = _word_positions(segments)
    last_index = len(segments) - 1
    result: list[str] = []

    for i, (ch, marks) in enumerate(segments):
        # The base character is always emitted; all marks are dropped.
        result.append(ch)
        if not _is_hebrew_letter(ch):
            continue

        vowel = _get_vowel(marks)
        if i < last_index:
            next_ch, next_marks = segments[i + 1]
        else:
            next_ch, next_marks = None, []

        # --- Rule A: U vowel (kubutz) → always add vav ---
        if vowel == KUBUTZ:
            result.append(VAV)
            continue

        # --- Shuruk detection ---
        # A vav carrying only a dagesh (no vowel mark) is shuruk: the vav
        # itself is the mater lectionis and has already been emitted.
        # A vav with a dagesh AND a vowel is a consonantal vav and falls
        # through to the rules below.
        if ch == VAV and _has_dagesh(marks) and vowel is None:
            continue

        # --- Rule B: O vowel (holam) → add vav ---
        if vowel in (HOLAM, HOLAM_HASER):
            # No vav when the holam already sits on a vav (holam male), nor
            # before aleph (pe-aleph verbs):
            # e.g., תֹּאבַד→תאבד, יֹאבַד→יאבד, נֹאבַד→נאבד
            if ch != VAV and next_ch != "א":
                result.append(VAV)
            continue

        # --- Rule C: I vowel (hiriq) → conditionally add yod ---
        if vowel == HIRIQ:
            # A yod is already present (on this letter or right after it).
            if ch == YOD or next_ch == YOD:
                continue

            add_yod = True
            if next_ch is not None:
                next_vowel = _get_vowel(next_marks)
                if next_vowel == SHVA and not _has_dagesh(next_marks):
                    # Shva without dagesh on the next consonant is treated
                    # as shva nach (closed syllable) → no yod.
                    add_yod = False
                elif (
                    next_vowel is None
                    and _is_hebrew_letter(next_ch)
                    and letters_after[i] <= 2
                ):
                    # Vowelless next consonant with at most two letters left
                    # in the word: short suffix like תי, נו → no yod.
                    add_yod = False

            if add_yod:
                result.append(YOD)
            continue

        # --- Rule D: tsere → generally NO yod ---
        # Exception: tsere before a guttural/resh gets yod, but ONLY on the
        # first or second letter of the word (dagesh substitution in
        # Hif'il/noun patterns),
        # e.g., הֵחֵל→היחל, תֵּאָבֵד→תיאבד, הֵרִיעַ→היריע
        # but NOT mid-word: מְסַפֵּר→מספר, מְעַבֵּר→מעבר
        if vowel == TSERE:
            if next_ch is not None and next_ch in "אהחער" and letters_before[i] <= 1:
                result.append(YOD)
            continue

        # All other vowels (patah, kamatz, segol, shva, hataf-*):
        # no mater lectionis is inserted.

    return "".join(result)
|
||||
BIN
pealim.apkg
Normal file
BIN
pealim.apkg
Normal file
Binary file not shown.
|
|
@ -1,348 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Download audio files from URLs stored in words.json.
|
||||
|
||||
Three audio categories are handled:
|
||||
1. Vocab audio → data/audio/{audio_file}
|
||||
2. Noun plural → data/audio/{slug}_plural.mp3
|
||||
3. Conjugation → data/audio_conj/{slug}_{form_key}.mp3
|
||||
data/audio_conj/{slug}_passive_{form_key}.mp3
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DATA_DIR = Path(__file__).parent / "data"
|
||||
AUDIO_DIR = DATA_DIR / "audio"
|
||||
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
|
||||
WORDS_JSON = DATA_DIR / "words.json"
|
||||
|
||||
DOWNLOAD_DELAY = 0.3
|
||||
MAX_RETRIES = 3
|
||||
|
||||
# Map Hebrew tense names to English prefixes for form_key construction.
|
||||
# "מְקוֹר" (infinitive) is included for forward compatibility; it does not
|
||||
# appear in the current dataset but the form_key collapses to bare "infinitive".
|
||||
TENSE_TO_PREFIX = {
|
||||
"הוֹוֶה": "present",
|
||||
"עָבָר": "past",
|
||||
"עָתִיד": "future",
|
||||
"צִוּוּי": "imperative",
|
||||
"מְקוֹר": "infinitive",
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _make_audio_file(entry: dict) -> str:
|
||||
"""Derive the vocab audio filename when audio_file is absent.
|
||||
|
||||
Slug-based for confusable entries (slug contains the disambiguating ID),
|
||||
consonant-only for all others.
|
||||
|
||||
Args:
|
||||
entry: A words.json entry dict.
|
||||
|
||||
Returns:
|
||||
Filename string, e.g. ``"1234-shalom.mp3"`` or ``"שלום.mp3"``.
|
||||
"""
|
||||
audio_file = entry.get("audio_file", "")
|
||||
if audio_file:
|
||||
return audio_file
|
||||
# Fallback: use slug for confusables, ktiv_male for others
|
||||
slug = entry.get("slug", "")
|
||||
if entry.get("confusable_group"):
|
||||
return f"{slug}.mp3"
|
||||
ktiv_male = entry.get("word", {}).get("ktiv_male", "")
|
||||
safe_name = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male)
|
||||
return f"{safe_name}.mp3"
|
||||
|
||||
|
||||
def _form_key(person: str, tense: str) -> str:
    """Combine a tense and a person code into the key used in audio filenames.

    Args:
        person: Person code, e.g. ``"1s"``, ``"3fp"``, ``"ms"``.
        tense: Hebrew tense string from the conjugation form. Tenses missing
            from ``TENSE_TO_PREFIX`` are used verbatim as the prefix.

    Returns:
        ``"{prefix}_{person}"`` such as ``"past_1s"`` or ``"present_ms"``;
        the infinitive has no person and yields plain ``"infinitive"``.
    """
    tense_name = TENSE_TO_PREFIX.get(tense, tense)
    return "infinitive" if tense_name == "infinitive" else f"{tense_name}_{person}"
|
||||
|
||||
|
||||
def _download(url: str, dest: Path, session: requests.Session) -> bool:
|
||||
"""Download *url* to *dest*, retrying up to MAX_RETRIES times.
|
||||
|
||||
Skips the download silently if *dest* already exists.
|
||||
|
||||
Args:
|
||||
url: HTTP(S) URL to download.
|
||||
dest: Local path to write the file to.
|
||||
session: Shared requests session.
|
||||
|
||||
Returns:
|
||||
``True`` if the file was downloaded (or already existed),
|
||||
``False`` if all retries were exhausted.
|
||||
"""
|
||||
if dest.exists():
|
||||
return True
|
||||
|
||||
for attempt in range(1, MAX_RETRIES + 1):
|
||||
try:
|
||||
resp = session.get(url, timeout=15)
|
||||
resp.raise_for_status()
|
||||
dest.write_bytes(resp.content)
|
||||
logger.debug("Downloaded %s → %s", url, dest.name)
|
||||
return True
|
||||
except requests.RequestException as exc:
|
||||
wait = 2**attempt
|
||||
if attempt < MAX_RETRIES:
|
||||
logger.warning(
|
||||
"Attempt %d/%d failed for %s (%s) — retrying in %ds",
|
||||
attempt,
|
||||
MAX_RETRIES,
|
||||
url,
|
||||
exc,
|
||||
wait,
|
||||
)
|
||||
time.sleep(wait)
|
||||
else:
|
||||
logger.error("All %d attempts failed for %s: %s", MAX_RETRIES, url, exc)
|
||||
return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-category downloaders
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def download_vocab_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int, int]:
    """Fetch the headword audio clip of every entry into ``AUDIO_DIR``.

    Entries without an ``audio_url`` are only counted. Files already on disk
    are counted as cached and not requested again. A pause of
    ``DOWNLOAD_DELAY`` seconds follows each successful download.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached, no_url) counts. Downloads that fail
        after all retries are added to the ``no_url`` figure.
    """
    counts = {"downloaded": 0, "cached": 0, "no_url": 0}

    for item in entries:
        source_url: str | None = item.get("audio_url")
        if not source_url:
            counts["no_url"] += 1
            continue

        # Derive a filename only when the entry does not carry one.
        filename = item.get("audio_file") or _make_audio_file(item)
        target = AUDIO_DIR / filename

        if target.exists():
            counts["cached"] += 1
        elif _download(source_url, target, session):
            counts["downloaded"] += 1
            time.sleep(DOWNLOAD_DELAY)
        else:
            # Persistent failures are reported together with missing URLs.
            counts["no_url"] += 1

    return counts["downloaded"], counts["cached"], counts["no_url"]
|
||||
|
||||
|
||||
def download_noun_plural_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int]:
    """Fetch plural-form audio clips for noun entries.

    Destination: ``data/audio/{slug}_plural.mp3``

    Only entries whose ``noun_inflection`` is a non-empty dict with a
    ``plural_audio`` value starting with ``"http"`` are considered. Failed
    downloads are not counted.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached) counts.
    """
    fetched = 0
    already_present = 0

    for item in entries:
        inflection = item.get("noun_inflection")
        if not (inflection and isinstance(inflection, dict)):
            continue

        plural_url: str | None = inflection.get("plural_audio")
        if not (plural_url and plural_url.startswith("http")):
            continue

        target = AUDIO_DIR / f"{item['slug']}_plural.mp3"

        if target.exists():
            already_present += 1
        elif _download(plural_url, target, session):
            fetched += 1
            time.sleep(DOWNLOAD_DELAY)

    return fetched, already_present
|
||||
|
||||
|
||||
def download_conjugation_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int, int]:
    """Fetch the audio clip of every conjugated verb form.

    Active forms  → ``data/audio_conj/{slug}_{form_key}.mp3``
    Passive forms → ``data/audio_conj/{slug}_passive_{form_key}.mp3``

    Forms without an ``audio_url`` are ignored. A pause of ``DOWNLOAD_DELAY``
    seconds follows each successful download.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached, failed) counts.
    """
    fetched = already_present = errors = 0

    for item in entries:
        conjugation = item.get("conjugation")
        if not conjugation:
            continue

        slug: str = item["slug"]
        active = conjugation.get("active_forms") or []
        passive = conjugation.get("hufal_pual_forms") or []

        # The voice prefix is empty for active forms and "passive_" otherwise.
        for voice_prefix, voice_forms in (("", active), ("passive_", passive)):
            for form in voice_forms:
                form_url: str | None = form.get("audio_url")
                if not form_url:
                    continue

                key = _form_key(form.get("person", ""), form.get("tense", ""))
                target = AUDIO_CONJ_DIR / f"{slug}_{voice_prefix}{key}.mp3"

                if target.exists():
                    already_present += 1
                elif _download(form_url, target, session):
                    fetched += 1
                    time.sleep(DOWNLOAD_DELAY)
                else:
                    errors += 1

    return fetched, already_present, errors
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def main() -> None:
    """Parse CLI args and run the audio download pipeline.

    Reads ``WORDS_JSON``, optionally truncates it to the first N entries
    (``--test N``), then downloads vocab, noun-plural and conjugation audio
    and logs a per-category summary. Noun-plural audio has no skip flag and is
    always processed. The HTTP session is closed when the downloads finish.
    """
    parser = argparse.ArgumentParser(description="Download Pealim audio files from words.json URLs.")
    parser.add_argument(
        "--skip-vocab",
        action="store_true",
        help="Skip vocabulary audio downloads.",
    )
    parser.add_argument(
        "--skip-conj",
        action="store_true",
        help="Skip conjugation audio downloads.",
    )
    parser.add_argument(
        "--test",
        metavar="N",
        type=int,
        default=None,
        help="Limit processing to the first N words.json entries.",
    )
    args = parser.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(message)s",
    )

    AUDIO_DIR.mkdir(parents=True, exist_ok=True)
    AUDIO_CONJ_DIR.mkdir(parents=True, exist_ok=True)

    with open(WORDS_JSON, encoding="utf-8") as fh:
        raw: dict[str, dict] = json.load(fh)

    entries = list(raw.values())
    if args.test is not None:
        entries = entries[: args.test]

    logger.info("[4] Downloading audio files …")

    # Skipped categories keep zero counts.
    v_dl = v_cached = v_no_url = 0
    c_dl = c_cached = c_failed = 0

    # The context manager releases pooled connections even if a download raises.
    with requests.Session() as session:
        session.headers["User-Agent"] = "Mozilla/5.0 (compatible; PealimAnkiDeck/1.0; audio-fetch)"

        # --- Vocab ---
        if not args.skip_vocab:
            v_dl, v_cached, v_no_url = download_vocab_audio(entries, session)

        # --- Noun plural ---
        np_dl, np_cached = download_noun_plural_audio(entries, session)

        # --- Conjugation ---
        if not args.skip_conj:
            c_dl, c_cached, c_failed = download_conjugation_audio(entries, session)

    # --- Summary ---
    if not args.skip_vocab:
        logger.info(
            " Vocab: %d downloaded, %d cached, %d no URL",
            v_dl,
            v_cached,
            v_no_url,
        )
    logger.info(" Noun plural: %d downloaded, %d cached", np_dl, np_cached)
    if not args.skip_conj:
        failed_msg = f", {c_failed} failed" if c_failed else ""
        logger.info(
            " Conjugation: %d downloaded, %d cached%s",
            c_dl,
            c_cached,
            failed_msg,
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
File diff suppressed because it is too large
Load diff
9106
pealim_dict.csv
Normal file
9106
pealim_dict.csv
Normal file
File diff suppressed because it is too large
Load diff
12111
pealim_dict_for_anki.csv
Normal file
12111
pealim_dict_for_anki.csv
Normal file
File diff suppressed because it is too large
Load diff
187
pealim_extract.py
Executable file
187
pealim_extract.py
Executable file
|
|
@ -0,0 +1,187 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract Hebrew vocabulary from pealim.com dictionary.
|
||||
Scrapes word entries, roots, and parts of speech for Anki flashcards.
|
||||
"""
|
||||
|
||||
import requests
|
||||
import pandas as pd
|
||||
import logging
|
||||
import time
|
||||
from typing import Optional
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format='%(asctime)s - %(levelname)s - %(message)s'
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Session for connection pooling
|
||||
session = requests.Session()
|
||||
session.headers.update({
|
||||
'User-Agent': 'Mozilla/5.0 (compatible; pealim-scraper/1.0)'
|
||||
})
|
||||
|
||||
PEALIM_DICT_URL = "https://www.pealim.com/dict/"
|
||||
REQUEST_DELAY = 1.5 # seconds between requests (respectful scraping)
|
||||
REQUEST_TIMEOUT = 10 # seconds
|
||||
|
||||
|
||||
def get_total_pages() -> int:
    """Return the number of dictionary list pages to scrape.

    The first list page is requested and parsed, but the result is not used:
    the count is currently fixed at 608 whether or not that request succeeds.
    A failure is logged and otherwise ignored.
    """
    try:
        logger.info("Fetching total page count...")
        probe = session.get(
            PEALIM_DICT_URL,
            cookies={'translit': 'none', 'hebstyle': 'mo'},
            timeout=REQUEST_TIMEOUT,
        )
        probe.raise_for_status()
        # Parsed only to confirm the page contains a table; the page count is
        # not derived from it yet.
        pd.read_html(probe.content)
    except Exception as e:
        logger.error(f"Error fetching page count: {e}. Using default (608).")
    return 608
|
||||
|
||||
|
||||
def extract_from_website(max_pages: Optional[int] = None) -> pd.DataFrame:
    """
    Extract dictionary entries from pealim.com.

    Every list page is fetched twice — once with nikkud and once without —
    and the word column of the second fetch is attached positionally to the
    table of the first. A page that fails with a network error is retried
    once; a page that still fails, or fails to parse, is logged and skipped.

    Args:
        max_pages: Maximum pages to scrape (None = all)

    Returns:
        DataFrame with Word, Root, Part of Speech, and Word Without Nikkud columns
        (empty when no page could be scraped)
    """
    # Local import: pd.read_html expects a file-like object rather than raw bytes.
    from io import BytesIO

    # ``is None`` rather than ``or`` so that max_pages=0 scrapes nothing.
    total_pages = get_total_pages() if max_pages is None else max_pages
    logger.info(f"Starting extraction from {total_pages} pages...")

    nikkud_cookies = {'translit': 'none', 'hebstyle': 'mo'}
    plain_cookies = {'translit': 'none', 'hebstyle': 'vl', 'showmeaning': 'off'}

    def fetch_last_table(url, cookies):
        # The dictionary listing is the last table on the page.
        response = session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        return pd.read_html(BytesIO(response.content))[-1]

    frames = []

    # Pages are numbered from 1, so the upper bound has to be inclusive.
    for page_num in range(1, total_pages + 1):
        url = f"{PEALIM_DICT_URL}?page={page_num}"

        for attempt in (1, 2):
            try:
                with_nikkud = fetch_last_table(url, nikkud_cookies)
                without_nikkud_words = fetch_last_table(url, plain_cookies)['Word']
                without_nikkud_words = without_nikkud_words.rename('Word Without Nikkud')
                frames.append(pd.concat([with_nikkud, without_nikkud_words], axis=1))
                break
            except requests.RequestException as e:
                if attempt == 1:
                    logger.error(f"Error fetching page {page_num}: {e}. Retrying...")
                    time.sleep(REQUEST_DELAY * 2)
                else:
                    logger.error(f"Error fetching page {page_num}: {e}. Skipping page.")
            except Exception as e:
                # Parsing problems will not go away on a retry.
                logger.error(f"Unexpected error on page {page_num}: {e}")
                break

        if page_num % 50 == 0:
            logger.info(f"Processed {page_num}/{total_pages} pages...")

        time.sleep(REQUEST_DELAY)

    # Concatenate once at the end instead of growing a frame page by page.
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

    logger.info(f"Extraction complete. Total words: {len(df)}")
    return df
|
||||
|
||||
|
||||
def modify_for_anki(df: pd.DataFrame) -> pd.DataFrame:
    """
    Transform dictionary DataFrame for Anki import.

    Adds two columns to *df* in place:

    * ``shared roots`` — the other words with the same root, space separated
      (empty when the root is ``'-'`` or missing);
    * ``tags`` — a ``שורש::<root>`` tag when a root exists, followed by a
      Hebrew part-of-speech tag when the part of speech is recognised.

    Args:
        df: Dictionary DataFrame with ``Word``, ``Root`` and
            ``Part of Speech`` columns

    Returns:
        The same DataFrame object, ready for Anki
    """
    logger.info("Preparing data for Anki...")

    # Checked in insertion order; the first key contained in the POS text wins.
    pos_tags = {
        'Adverb': 'תוארי_הפועל',
        'Pronoun': 'כינויי_גוף',
        'Noun': 'שם_עצם',
        'Verb': 'פעלים',
        'Adjective': 'שם_תואר',
        'Preposition': 'מילות_יחס',
        'Conjunction': 'מילות_חיבור',
        'Particle': 'מילית',
    }
    strip_root_chars = str.maketrans('', '', ' -.')

    # A frame without rows may also lack the expected columns entirely.
    if df.empty:
        rows = []
    else:
        rows = list(zip(df['Root'], df['Word'], df['Part of Speech']))

    # Group the words of each root once instead of rescanning the frame per row.
    words_by_root = {}
    for root, word, _ in rows:
        if root != '-' and pd.notna(root):
            words_by_root.setdefault(root, []).append(word)

    shared_root_words = []
    tags = []
    for root, word, pos_raw in rows:
        has_root = root != '-' and pd.notna(root)

        if has_root:
            shared = ' '.join(str(w) for w in words_by_root[root] if w != word)
        else:
            shared = ''
        shared_root_words.append(shared)

        tag_parts = []

        # Root tag: the root letters without spaces, dashes or dots.
        root_clean = str(root).translate(strip_root_chars) if pd.notna(root) else ''
        if root_clean:
            tag_parts.append(f"שורש::{root_clean}")

        # Part of speech tag
        pos = str(pos_raw)
        pos_tag = next((tag for key, tag in pos_tags.items() if key in pos), None)
        if pos_tag is not None:
            tag_parts.append(pos_tag)

        tags.append(' '.join(tag_parts))

    df['shared roots'] = shared_root_words
    df['tags'] = tags
    logger.info("Anki preparation complete.")
    return df
|
||||
|
||||
|
||||
def main():
    """Scrape the dictionary and write the raw and the Anki-ready CSV files.

    Any exception is logged as a fatal error and then re-raised.
    """
    raw_csv = 'pealim_dict.csv'
    anki_csv = 'pealim_dict_for_anki.csv'

    try:
        # Step 1: scrape and store the untouched table.
        scraped = extract_from_website()
        scraped.to_csv(raw_csv, index=True)
        logger.info(f"Saved: {raw_csv}")

        # Step 2: add the Anki columns and store a semicolon-separated copy.
        anki_ready = modify_for_anki(scraped)
        anki_ready.to_csv(anki_csv, sep=';', index=True)
        logger.info(f"Saved: {anki_csv}")

        logger.info("✅ Complete!")

    except Exception as e:
        logger.error(f"Fatal error: {e}")
        raise
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
main()
|
||||
|
|
@ -1,714 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Consolidated list page scraper for pealim.com.
|
||||
|
||||
Scrapes /dict/?page=N with two cookie variants (hebstyle=mo for nikkud,
|
||||
hebstyle=vl for ktiv male) and writes results directly to data/words.json.
|
||||
|
||||
Usage:
|
||||
python3 pealim_list_scrape.py [--test N] [--force-refresh]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Paths
|
||||
# ---------------------------------------------------------------------------
|
||||
PROJECT_ROOT = Path(__file__).parent
|
||||
DATA_DIR = PROJECT_ROOT / "data"
|
||||
WORDS_JSON = DATA_DIR / "words.json"
|
||||
PROGRESS_JSON = DATA_DIR / "list_scrape_progress.json"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Constants
|
||||
# ---------------------------------------------------------------------------
|
||||
PEALIM_DICT_URL = "https://www.pealim.com/dict/"
|
||||
REQUEST_DELAY = 1.5 # seconds between requests
|
||||
REQUEST_TIMEOUT = 15 # seconds
|
||||
DEFAULT_TOTAL_PAGES = 608
|
||||
SAVE_EVERY = 10 # pages between incremental saves
|
||||
TODAY = date.today().isoformat()
|
||||
|
||||
# Prefer lxml if available; html.parser is the fallback
|
||||
try:
|
||||
import lxml # type: ignore[import-untyped] # noqa: F401
|
||||
|
||||
BS4_PARSER = "lxml"
|
||||
except ImportError:
|
||||
BS4_PARSER = "html.parser"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Part-of-speech mappings
|
||||
# ---------------------------------------------------------------------------
|
||||
POS_HEBREW: dict[str, str] = {
|
||||
"Noun": "שֵׁם עֶצֶם",
|
||||
"Verb": "פֹּעַל",
|
||||
"Adjective": "שֵׁם תֹּאַר",
|
||||
"Adverb": "תֹּאַר הַפֹּעַל",
|
||||
"Pronoun": "כִּנּוּי גּוּף",
|
||||
"Preposition": "מִילַּת יַחַס",
|
||||
"Conjunction": "מִילַּת חִבּוּר",
|
||||
"Interjection": "מִילַּת קְרִיאָה",
|
||||
"Numeral": "שֵׁם מִסְפָּר",
|
||||
"Cardinal numeral": "שֵׁם מִסְפָּר",
|
||||
"Particle": "מִילִּית",
|
||||
"Determiner": "מְגַדִּיר",
|
||||
"Existential": "מִילַּת קִיּוּם",
|
||||
"Interrogative": "מִילַּת שְׁאֵלָה",
|
||||
}
|
||||
|
||||
# Use exact match on the POS string prefix; longer keys must be checked first.
|
||||
POS_HEBREW_ORDERED: list[tuple[str, str]] = sorted(POS_HEBREW.items(), key=lambda x: -len(x[0]))
|
||||
|
||||
BINYAN_HEBREW: dict[str, str] = {
|
||||
"Pa'al": "פָּעַל",
|
||||
"Nif'al": "נִפְעַל",
|
||||
"Pi'el": "פִּיעֵל",
|
||||
"Pu'al": "פֻּעַל",
|
||||
"Hif'il": "הִפְעִיל",
|
||||
"Huf'al": "הֻפְעַל",
|
||||
"Hitpa'el": "הִתְפַּעֵל",
|
||||
}
|
||||
|
||||
# Regex for extracting emoji characters
|
||||
EMOJI_RE = re.compile(
|
||||
r"[\U0001F300-\U0001FFFF\U00002600-\U000027BF\U0001F000-\U0001F9FF\u2600-\u26FF\u2700-\u27BF\uFE0E\uFE0F\u200D]+",
|
||||
re.UNICODE,
|
||||
)
|
||||
|
||||
# Regex for extracting Hebrew prepositions wrapped in parentheses, e.g. "(על)" or "(ב-)"
|
||||
HBPAREN_RE = re.compile(r"\(([\u05b0-\u05ea\u05f0-\u05f4\-]+)\)")
|
||||
|
||||
# Fields that must never be overwritten when updating an existing entry
|
||||
PROTECTED_FIELDS = frozenset(
|
||||
[
|
||||
"vocab_legacy_guid",
|
||||
"confusables_guid",
|
||||
"frequency",
|
||||
"pseudo_frequency",
|
||||
"emoji",
|
||||
"emoji_source",
|
||||
"emoji_visible",
|
||||
"image",
|
||||
"image_source",
|
||||
"hint",
|
||||
"examples",
|
||||
"noun_inflection",
|
||||
"conjugation",
|
||||
"adjective_inflection",
|
||||
"preposition_inflection",
|
||||
]
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Logging
|
||||
# ---------------------------------------------------------------------------
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s %(levelname)s %(message)s",
|
||||
)
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# HTTP session
|
||||
# ---------------------------------------------------------------------------
|
||||
session = requests.Session()
|
||||
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki-scraper/1.0)"})
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Default entry template
|
||||
# ---------------------------------------------------------------------------
|
||||
def _default_entry() -> dict:
|
||||
"""Return a fresh entry with all fields initialised to safe defaults."""
|
||||
return {
|
||||
"word": {"nikkud": "", "ktiv_male": ""},
|
||||
"slug": "",
|
||||
"root": [],
|
||||
"pos": "",
|
||||
"pos_hebrew": "",
|
||||
"meaning": "",
|
||||
"meaning_raw": "",
|
||||
"audio_url": "",
|
||||
"audio_file": "",
|
||||
"tags": "",
|
||||
"last_scrape_date": "",
|
||||
"vocab_legacy_guid": None,
|
||||
"frequency": None,
|
||||
"pseudo_frequency": None,
|
||||
"emoji": None,
|
||||
"emoji_source": None,
|
||||
"emoji_visible": False,
|
||||
"image": None,
|
||||
"image_source": None,
|
||||
"hint": "",
|
||||
"prep": None,
|
||||
"shared_roots": [],
|
||||
"confusable_group": None,
|
||||
"confusables_guid": None,
|
||||
"examples": None,
|
||||
"noun_inflection": None,
|
||||
"conjugation": None,
|
||||
"adjective_inflection": None,
|
||||
"preposition_inflection": None,
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parsing helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
def _extract_emoji(text: str) -> str | None:
    """Return the first run of characters in *text* matched by ``EMOJI_RE``, or None if there is none."""
    found = EMOJI_RE.search(text)
    if found is None:
        return None
    return found.group(0)
|
||||
|
||||
|
||||
def _clean_meaning(raw: str) -> str:
    """Return *raw* without emoji and parenthesised Hebrew prepositions, with whitespace runs collapsed to one space."""
    without_emoji = EMOJI_RE.sub("", raw)
    without_preps = HBPAREN_RE.sub("", without_emoji)
    # split() with no argument also discards leading and trailing whitespace.
    tokens = without_preps.split()
    return " ".join(tokens)
|
||||
|
||||
|
||||
def _parse_pos(pos_raw: str) -> tuple[str, str]:
    """
    Split a raw part-of-speech cell into (pos_en, pos_hebrew).

    pos_en is the longest key of POS_HEBREW that the text starts with; when
    none matches, it is the text before the first " – " / " - " separator.
    pos_hebrew is the POS_HEBREW translation of pos_en (or pos_en itself when
    there is none). For verbs whose binyan is recognised, the Hebrew binyan
    name is appended after " — ".

    Examples:
        "Noun – masculine"  → pos_en "Noun"
        "Verb – pa'al"      → pos_en "Verb", Hebrew label followed by the binyan
        "Cardinal numeral"  → pos_en "Cardinal numeral"
    """
    text = pos_raw.strip()

    # POS_HEBREW_ORDERED lists longer keys first, so the first hit is the longest.
    pos_en = next((key for key, _ in POS_HEBREW_ORDERED if text.startswith(key)), "")
    if not pos_en:
        pos_en = text.partition(" – ")[0].partition(" - ")[0].strip()

    pos_heb = POS_HEBREW.get(pos_en, pos_en)
    if pos_en != "Verb":
        return pos_en, pos_heb

    # The binyan is the second dash-separated piece, e.g. "Verb – pa'al".
    pieces = re.split(r"\s*[–-]\s*", text)
    if len(pieces) < 2:
        return pos_en, pos_heb

    binyan_raw = pieces[1].strip()
    wanted = binyan_raw.lower()
    # Case-insensitive lookup against the known binyan names.
    binyan_heb = next(
        (heb for name, heb in BINYAN_HEBREW.items() if name.lower() == wanted),
        None,
    )
    if binyan_heb is None:
        binyan_heb = BINYAN_HEBREW.get(binyan_raw.capitalize())

    if binyan_heb:
        pos_heb = f"{pos_heb} — {binyan_heb}"
    return pos_en, pos_heb
|
||||
|
||||
|
||||
def _parse_root(root_raw: str) -> list[str]:
|
||||
"""
|
||||
Convert raw root text to a list of consonants.
|
||||
|
||||
Pealim shows roots as "פ - ע - ל" or "פ.ע.ל" or "—" (no root).
|
||||
"""
|
||||
if not root_raw or root_raw in ("-", "—", "–"):
|
||||
return []
|
||||
# Split on " - " or "." separators
|
||||
parts = re.split(r"\s*[-–—.]\s*", root_raw.strip())
|
||||
return [p.strip() for p in parts if p.strip()]
|
||||
|
||||
|
||||
def _build_tags(pos_en: str, root: list[str]) -> str:
|
||||
"""
|
||||
Generate Anki tags string matching the existing project convention.
|
||||
|
||||
Examples:
|
||||
pos=Noun, root=[] → "שם_עצם"
|
||||
pos=Noun, root=["א","ב"] → "שורש::אב שם_עצם"
|
||||
pos=Verb, root=["שמר"] → "שורש::שמר פעלים"
|
||||
"""
|
||||
pos_tag_map = {
|
||||
"Noun": "שם_עצם",
|
||||
"Verb": "פעלים",
|
||||
"Adjective": "שם_תואר",
|
||||
"Adverb": "תוארי_הפועל",
|
||||
"Pronoun": "כינויי_גוף",
|
||||
"Preposition": "מילות_יחס",
|
||||
"Conjunction": "מילות_חיבור",
|
||||
"Particle": "מילית",
|
||||
"Numeral": "שם_מספר",
|
||||
"Cardinal numeral": "שם_מספר",
|
||||
"Determiner": "מגדיר",
|
||||
"Existential": "מילת_קיום",
|
||||
"Interrogative": "מילת_שאלה",
|
||||
"Interjection": "מילת_קריאה",
|
||||
}
|
||||
|
||||
parts: list[str] = []
|
||||
if root:
|
||||
root_str = "".join(root)
|
||||
parts.append(f"שורש::{root_str}")
|
||||
|
||||
pos_heb_tag = pos_tag_map.get(pos_en, "")
|
||||
if pos_heb_tag:
|
||||
parts.append(pos_heb_tag)
|
||||
|
||||
return " ".join(parts)
|
||||
|
||||
|
||||
def _compute_audio_file(slug: str, ktiv_male: str) -> str:
|
||||
"""
|
||||
Return the local audio filename for an entry.
|
||||
|
||||
The actual confusable detection happens later (after all pages are scraped);
|
||||
here we store a placeholder that post_process() will correct.
|
||||
We default to the consonant-based name; confusables get slug-based names.
|
||||
"""
|
||||
consonants = ktiv_male or ""
|
||||
return f"{consonants}.mp3" if consonants else f"{slug}.mp3"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Page parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
def _parse_mo_page(html: bytes) -> list[dict]:
    """
    Extract the dictionary rows of a hebstyle=mo (nikkud) list page.

    Table rows with fewer than four cells are ignored, as are rows whose word
    text is empty. Each remaining row becomes a dict with the keys:
        nikkud, slug, root_raw, pos_raw, meaning_raw, audio_url
    """
    soup = BeautifulSoup(html, BS4_PARSER)
    parsed: list[dict] = []

    for table_row in soup.select("table tr"):
        cells = table_row.find_all("td")
        if len(cells) < 4:
            continue
        word_cell, root_cell, pos_cell, meaning_cell = cells[:4]

        # Audio URL: taken from the first element carrying a data-audio attribute.
        audio_holder = word_cell.find(attrs={"data-audio": True})
        audio_url: str = audio_holder["data-audio"] if audio_holder else ""

        # Slug: the path segment after /dict/ in the first link of the cell.
        slug = ""
        anchor = word_cell.find("a", href=True)
        if anchor:
            slug_match = re.search(r"/dict/([^/]+)/", anchor["href"])
            if slug_match:
                slug = slug_match.group(1)

        # Word: prefer the dedicated span, otherwise the whole cell text.
        word_span = word_cell.find("span", class_="menukad")
        text_source = word_span if word_span else word_cell
        nikkud = text_source.get_text(strip=True)
        if not nikkud:
            continue

        parsed.append(
            {
                "nikkud": nikkud,
                "slug": slug,
                "root_raw": root_cell.get_text(strip=True),
                "pos_raw": pos_cell.get_text(strip=True),
                "meaning_raw": meaning_cell.get_text(strip=True),
                "audio_url": audio_url,
            }
        )
    return parsed
|
||||
|
||||
|
||||
def _parse_vl_words(html: bytes) -> list[str]:
    """
    Extract the word column of a hebstyle=vl (ktiv male) list page.

    Returns the words in page order, one per table row that has at least four
    cells. A row whose word text is empty still contributes an empty string.
    """

    def word_text(cell) -> str:
        # Prefer the dedicated span, otherwise the whole cell text.
        span = cell.find("span", class_="menukad")
        return (span if span else cell).get_text(strip=True)

    soup = BeautifulSoup(html, BS4_PARSER)
    rows_of_cells = (table_row.find_all("td") for table_row in soup.select("table tr"))
    return [word_text(cells[0]) for cells in rows_of_cells if len(cells) >= 4]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# words.json I/O
|
||||
# ---------------------------------------------------------------------------
|
||||
def _load_words() -> dict:
    """Read and return the contents of words.json, or an empty dict when the file does not exist."""
    if WORDS_JSON.exists():
        with WORDS_JSON.open(encoding="utf-8") as fh:
            return json.load(fh)

    logger.info("data/words.json not found — starting fresh.")
    return {}
|
||||
|
||||
|
||||
def _save_words(words: dict) -> None:
    """Write *words* to words.json by way of a temporary file.

    The JSON is written to a ".json.tmp" sibling first and then moved over
    words.json with os.replace, so the real file is swapped in one step.
    """
    staging = WORDS_JSON.with_suffix(".json.tmp")
    with staging.open("w", encoding="utf-8") as out:
        # ensure_ascii=False keeps the Hebrew text readable in the file.
        json.dump(words, out, ensure_ascii=False, indent=2)
    os.replace(staging, WORDS_JSON)

    entry_count = len(words)
    logger.info("Saved data/words.json (%d entries)", entry_count)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Progress tracking
|
||||
# ---------------------------------------------------------------------------
|
||||
def _load_progress() -> set[int]:
    """Return the page numbers recorded as completed; an empty set when no progress file exists."""
    if PROGRESS_JSON.exists():
        with PROGRESS_JSON.open(encoding="utf-8") as fh:
            saved = json.load(fh)
        return set(saved.get("completed_pages", []))
    return set()
|
||||
|
||||
|
||||
def _save_progress(completed: set[int]) -> None:
    """Store the completed page numbers (sorted) in the progress file.

    The data goes to a ".json.tmp" sibling first, which is then moved over
    the progress file with os.replace.
    """
    payload = {"completed_pages": sorted(completed)}
    staging = PROGRESS_JSON.with_suffix(".json.tmp")
    with staging.open("w", encoding="utf-8") as out:
        json.dump(payload, out)
    os.replace(staging, PROGRESS_JSON)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Unique key generation
|
||||
# ---------------------------------------------------------------------------
|
||||
def _make_unique_key(nikkud: str, pos_en: str, meaning: str, existing_keys: set[str]) -> str:
|
||||
"""
|
||||
Generate a collision-free unique key for a new entry.
|
||||
|
||||
Escalation:
|
||||
1. nikkud
|
||||
2. nikkud|pos_en
|
||||
3. nikkud|pos_en|meaning
|
||||
4. nikkud|pos_en|meaning|N (N = 2, 3, …)
|
||||
"""
|
||||
candidate = nikkud
|
||||
if candidate not in existing_keys:
|
||||
return candidate
|
||||
|
||||
candidate = f"{nikkud}|{pos_en}"
|
||||
if candidate not in existing_keys:
|
||||
return candidate
|
||||
|
||||
candidate = f"{nikkud}|{pos_en}|{meaning}"
|
||||
if candidate not in existing_keys:
|
||||
return candidate
|
||||
|
||||
n = 2
|
||||
while True:
|
||||
candidate = f"{nikkud}|{pos_en}|{meaning}|{n}"
|
||||
if candidate not in existing_keys:
|
||||
return candidate
|
||||
n += 1
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Core: merge one scraped row into words dict
|
||||
# ---------------------------------------------------------------------------
|
||||
def _merge_row(
    words: dict,
    slug_index: dict[str, str],
    nikkud: str,
    ktiv_male: str,
    slug: str,
    root_raw: str,
    pos_raw: str,
    meaning_raw_raw: str,
    audio_url: str,
) -> None:
    """Upsert a single scraped row into *words* in-place.

    An existing entry is located through *slug_index* (slug → unique_key).
    When found, only the list-level fields are refreshed and every other field
    of the entry is left untouched. Otherwise a new entry is built from
    ``_default_entry()``, additionally receives ``emoji``/``emoji_source``, is
    stored under a fresh unique key, and is registered in *slug_index*.

    Args:
        words: Mapping of unique_key → entry; mutated in place.
        slug_index: Mapping of slug → unique_key; updated when an entry is created.
        nikkud: Vocalised word form.
        ktiv_male: Unvocalised (full) spelling.
        slug: Slug of the word; may be empty.
        root_raw: Raw root text, parsed with ``_parse_root``.
        pos_raw: Raw part-of-speech text, parsed with ``_parse_pos``.
        meaning_raw_raw: Raw meaning text as scraped.
        audio_url: URL of the pronunciation audio.
    """
    # Derived fields
    pos_en, pos_heb = _parse_pos(pos_raw)
    root = _parse_root(root_raw)
    meaning_raw = meaning_raw_raw
    meaning = _clean_meaning(meaning_raw)
    emoji = _extract_emoji(meaning_raw_raw)
    tags = _build_tags(pos_en, root)
    audio_file = _compute_audio_file(slug, ktiv_male)
    # Extract Hebrew preposition(s) from the raw meaning (e.g. "(על)" → "על")
    prep_matches = HBPAREN_RE.findall(meaning_raw)
    prep: str | None = " ".join(prep_matches) if prep_matches else None

    # ---- locate existing entry ----
    unique_key: str | None = slug_index.get(slug) if slug else None
    is_new = not (unique_key and unique_key in words)

    if is_new:
        # The keys view supports O(1) membership tests, so no per-row copy of
        # all keys is needed.
        unique_key = _make_unique_key(nikkud, pos_en, meaning, words.keys())
        entry = _default_entry()
    else:
        entry = words[unique_key]

    # List-level fields, in the order they are written to the entry.
    fields = {
        "slug": slug,
        "root": root,
        "pos": pos_en,
        "pos_hebrew": pos_heb,
        "meaning": meaning,
        "meaning_raw": meaning_raw,
        "prep": prep,
    }
    if is_new:
        # Emoji fields are only set at creation time; updates never touch them.
        fields["emoji"] = emoji
        fields["emoji_source"] = "from_pealim" if emoji else None
    fields["audio_url"] = audio_url
    fields["audio_file"] = audio_file
    fields["tags"] = tags
    fields["last_scrape_date"] = TODAY

    entry["word"]["nikkud"] = nikkud
    entry["word"]["ktiv_male"] = ktiv_male
    entry.update(fields)

    if is_new:
        words[unique_key] = entry
        if slug:
            slug_index[slug] = unique_key
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Post-processing: recompute confusable_group, shared_roots, audio_file
|
||||
# ---------------------------------------------------------------------------
|
||||
def _post_process(words: dict) -> None:
    """Recompute the derived cross-entry fields of *words* in place.

    - confusable_group: sorted keys of all entries sharing the same ktiv_male
      (only when there are 2+). Otherwise it is reset to ``None``, unless the
      entry has a truthy ``confusables_guid``.
    - shared_roots: sorted keys of the other entries with the same root.
    - audio_file: ``<slug>.mp3`` for confusables that have a slug;
      ``<ktiv_male>.mp3`` (falling back to ``<slug>.mp3``) otherwise.

    Args:
        words: Mapping of unique_key → entry.
    """
    logger.info("Post-processing: recomputing confusable groups and shared roots...")

    # --- confusable groups ---
    ktiv_to_keys: dict[str, list[str]] = {}
    for key, entry in words.items():
        ktiv = entry.get("word", {}).get("ktiv_male", "")
        if ktiv:
            ktiv_to_keys.setdefault(ktiv, []).append(key)

    for entry in words.values():
        ktiv = entry.get("word", {}).get("ktiv_male", "")
        slug = entry.get("slug", "")
        group = ktiv_to_keys.get(ktiv, [])
        if len(group) >= 2:
            entry["confusable_group"] = sorted(group)
            # Confusable → slug-based audio filename (left as-is without a slug)
            if slug:
                entry["audio_file"] = f"{slug}.mp3"
        else:
            # Only clear confusable_group if it wasn't set by enrichment
            # (i.e. no confusables_guid)
            if not entry.get("confusables_guid"):
                entry["confusable_group"] = None
            # Non-confusable → consonant-based audio filename
            entry["audio_file"] = f"{ktiv}.mp3" if ktiv else f"{slug}.mp3"

    # --- shared roots ---
    root_to_keys: dict[str, list[str]] = {}
    for key, entry in words.items():
        root = entry.get("root")
        if root:
            # "|"-joined root is the canonical form used for grouping
            root_to_keys.setdefault("|".join(root), []).append(key)

    for key, entry in words.items():
        root = entry.get("root")
        siblings = root_to_keys.get("|".join(root), []) if root else []
        entry["shared_roots"] = sorted(k for k in siblings if k != key)

    logger.info("Post-processing complete.")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scraping loop
|
||||
# ---------------------------------------------------------------------------
|
||||
def _build_slug_index(words: dict) -> dict[str, str]:
|
||||
"""Build slug → unique_key lookup from the current words dict."""
|
||||
index: dict[str, str] = {}
|
||||
for key, entry in words.items():
|
||||
slug = entry.get("slug", "")
|
||||
if slug and slug not in index:
|
||||
index[slug] = key
|
||||
return index
|
||||
|
||||
|
||||
def _fetch_page(url: str, cookies: dict) -> bytes | None:
    """Fetch a single page through the shared session.

    Args:
        url: Page URL to request.
        cookies: Cookies sent with the request.

    Returns:
        The raw response body, or ``None`` when the request fails or the
        server answers with an error status (the failure is logged).
    """
    try:
        resp = session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
    except requests.RequestException as exc:
        logger.error("Request failed for %s: %s", url, exc)
        return None
    return resp.content
|
||||
|
||||
|
||||
def run_scrape(total_pages: int | None, force_refresh: bool) -> None:
    """Run the main scrape loop.

    For every pending list page the page is fetched twice (``hebstyle=mo`` and
    ``hebstyle=vl``), both variants are parsed, and each row is merged into the
    words dict. Words and progress are saved every ``SAVE_EVERY`` pages and
    once more after post-processing.

    Args:
        total_pages: Number of list pages to scrape. ``None`` means
            ``DEFAULT_TOTAL_PAGES``.
        force_refresh: If True, ignore the progress file and re-scrape all pages.
    """
    if total_pages is None:
        # Callers that have no explicit limit pass None.
        total_pages = DEFAULT_TOTAL_PAGES

    words = _load_words()
    slug_index = _build_slug_index(words)

    # Load progress first so the force-refresh message can report how many
    # pages are being discarded.
    completed = _load_progress()
    if force_refresh:
        if completed:
            logger.info("--force-refresh: ignoring %d completed pages.", len(completed))
        completed = set()

    pages_to_do = [p for p in range(1, total_pages + 1) if p not in completed]
    logger.info(
        "Pages to scrape: %d / %d (already done: %d)",
        len(pages_to_do),
        total_pages,
        len(completed),
    )

    pages_since_save = 0
    failed_pages: list[int] = []

    for page_num in pages_to_do:
        url = f"{PEALIM_DICT_URL}?page={page_num}"
        logger.info("Scraping page %d / %d …", page_num, total_pages)

        # --- hebstyle=mo (nikkud + audio + slug) ---
        mo_html = _fetch_page(url, {"translit": "none", "hebstyle": "mo"})
        if mo_html is None:
            logger.warning("Skipping page %d (mo fetch failed).", page_num)
            failed_pages.append(page_num)
            time.sleep(REQUEST_DELAY * 2)
            continue

        time.sleep(REQUEST_DELAY)

        # --- hebstyle=vl (ktiv male) ---
        vl_html = _fetch_page(url, {"translit": "none", "hebstyle": "vl"})
        if vl_html is None:
            logger.warning("Skipping page %d (vl fetch failed).", page_num)
            failed_pages.append(page_num)
            time.sleep(REQUEST_DELAY * 2)
            continue

        # Parse
        mo_rows = _parse_mo_page(mo_html)
        vl_words = _parse_vl_words(vl_html)

        if not mo_rows:
            logger.warning("Page %d returned no rows — might be past end.", page_num)
            completed.add(page_num)
            # Persist pending rows before the progress file, so it never
            # lists pages whose words exist only in memory.
            if pages_since_save:
                _save_words(words)
                pages_since_save = 0
            _save_progress(completed)
            time.sleep(REQUEST_DELAY)
            continue

        if len(vl_words) != len(mo_rows):
            # Rows are paired by position, so a count mismatch means some
            # ktiv male values are missing or shifted.
            logger.warning(
                "Page %d: %d rows but %d ktiv male words; pairing by position.",
                page_num,
                len(mo_rows),
                len(vl_words),
            )

        # Merge each row
        for i, row in enumerate(mo_rows):
            ktiv_male = vl_words[i] if i < len(vl_words) else ""
            _merge_row(
                words=words,
                slug_index=slug_index,
                nikkud=row["nikkud"],
                ktiv_male=ktiv_male,
                slug=row["slug"],
                root_raw=row["root_raw"],
                pos_raw=row["pos_raw"],
                meaning_raw_raw=row["meaning_raw"],
                audio_url=row["audio_url"],
            )

        completed.add(page_num)
        pages_since_save += 1

        # Incremental save every SAVE_EVERY pages
        if pages_since_save >= SAVE_EVERY:
            _save_words(words)
            _save_progress(completed)
            pages_since_save = 0

        time.sleep(REQUEST_DELAY)

    if failed_pages:
        logger.warning("%d page(s) could not be fetched and remain pending: %s", len(failed_pages), failed_pages)

    # Final save + post-processing
    logger.info("All pages scraped. Running post-processing…")
    _post_process(words)
    _save_words(words)
    _save_progress(completed)
    logger.info("Done. Total entries in words.json: %d", len(words))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
def main() -> None:
    """Parse the command line and run the list scraper.

    ``--test N`` limits the run to the first N pages (otherwise
    ``DEFAULT_TOTAL_PAGES`` is used); ``--force-refresh`` re-scrapes all pages
    regardless of recorded progress.
    """
    parser = argparse.ArgumentParser(description="Scrape pealim.com list pages into data/words.json.")
    parser.add_argument(
        "--test",
        metavar="N",
        type=int,
        default=None,
        help="Scrape only the first N pages (for testing).",
    )
    parser.add_argument(
        "--force-refresh",
        action="store_true",
        default=False,
        help="Re-scrape all pages, ignoring existing progress.",
    )
    args = parser.parse_args()

    # Compare against None so that an explicit "--test 0" is honoured.
    total_pages = args.test if args.test is not None else DEFAULT_TOTAL_PAGES
    logger.info(
        "Starting pealim list scraper | pages=%d | force=%s | parser=%s",
        total_pages,
        args.force_refresh,
        BS4_PARSER,
    )

    run_scrape(total_pages=total_pages, force_refresh=args.force_refresh)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,83 +0,0 @@
|
|||
[project]
|
||||
name = "hebrew-flash-cards"
|
||||
version = "0.13"
|
||||
description = "Hebrew vocabulary & verb conjugation flashcards for Anki"
|
||||
requires-python = ">=3.11"
|
||||
dependencies = [
|
||||
"beautifulsoup4>=4.11.0",
|
||||
"genanki>=0.8.0",
|
||||
"lxml>=4.9.0",
|
||||
"numpy>=1.21.0",
|
||||
"pandas>=1.3.0",
|
||||
"pymupdf>=1.23.0",
|
||||
"pypdf>=3.0.0",
|
||||
"python-bidi>=0.4.2",
|
||||
"requests>=2.26.0",
|
||||
]
|
||||
|
||||
[project.optional-dependencies]
|
||||
dev = [
|
||||
"bandit",
|
||||
"pytest",
|
||||
"ruff",
|
||||
"vulture",
|
||||
]
|
||||
|
||||
[tool.pytest.ini_options]
|
||||
testpaths = ["tests"]
|
||||
markers = [
|
||||
"integration: marks tests that hit the real pealim.com network (deselect with -m 'not integration')",
|
||||
]
|
||||
|
||||
[tool.ruff]
|
||||
target-version = "py311"
|
||||
line-length = 120
|
||||
exclude = [
|
||||
"lib/",
|
||||
"bin/",
|
||||
"include/",
|
||||
"lib64/",
|
||||
"archive/",
|
||||
"venv/",
|
||||
]
|
||||
|
||||
[tool.ruff.lint]
|
||||
select = [
|
||||
"E", # pycodestyle errors
|
||||
"W", # pycodestyle warnings
|
||||
"F", # pyflakes
|
||||
"I", # isort
|
||||
"UP", # pyupgrade
|
||||
"B", # flake8-bugbear
|
||||
"SIM", # flake8-simplify
|
||||
"PIE", # flake8-pie
|
||||
"T20", # flake8-print (flag print statements)
|
||||
"RET", # flake8-return
|
||||
"C4", # flake8-comprehensions
|
||||
"S", # flake8-bandit (security)
|
||||
]
|
||||
ignore = [
|
||||
"T201", # allow print() — this is a CLI tool, not a library
|
||||
"S603", # subprocess call with shell=False is fine
|
||||
"S607", # partial executable path is fine for CLI tools
|
||||
"S105", # PASS = "✓" is not a password
|
||||
"S108", # /tmp paths are intentional for temp downloads
|
||||
"S311", # random.Random() is for card ordering, not crypto
|
||||
"E501", # line too long — handled by formatter
|
||||
]
|
||||
|
||||
[tool.ruff.lint.per-file-ignores]
|
||||
"test_*.py" = ["S101"] # allow assert in tests
|
||||
|
||||
[tool.ruff.format]
|
||||
quote-style = "double"
|
||||
indent-style = "space"
|
||||
|
||||
[tool.vulture]
|
||||
paths = ["."]
|
||||
exclude = ["lib/", "bin/", "include/", "lib64/", "venv/", "archive/"]
|
||||
min_confidence = 80
|
||||
|
||||
[tool.bandit]
|
||||
exclude_dirs = ["lib", "bin", "include", "lib64", "venv", "archive"]
|
||||
skips = ["B101"] # allow assert
|
||||
208
release.py
208
release.py
|
|
@ -1,208 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Create a Forgejo release and upload all .apkg deck variants.
|
||||
|
||||
Usage:
|
||||
python3 release.py # uses RELEASE_TAG from apkg_builder.py
|
||||
python3 release.py v0.14 # explicit tag
|
||||
python3 release.py --dry-run # show what would be uploaded without doing it
|
||||
python3 release.py --validate # run validate_apkg.py first, abort on failure
|
||||
|
||||
Requires:
|
||||
Forgejo API token, read from the 'git.nevo.engineer' entry via load_keeshare.
|
||||
Git tag must not already exist (creates tag + release).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
sys.path.insert(0, "/home/node/projects")
|
||||
import load_keeshare
|
||||
|
||||
REPO_API = "https://git.nevo.engineer/api/v1/repos/nevo/hebrew_flash_cards"
|
||||
FORGEJO_TOKEN: str = load_keeshare.get_entry("git.nevo.engineer")["password"]
|
||||
OUTPUT_DIR = Path(__file__).parent / "output"
|
||||
|
||||
# All deck variants to include in release
|
||||
DECK_PREFIX = "hebrew_"
|
||||
DECK_VARIANTS = [
|
||||
"hebrew_vocabulary.apkg",
|
||||
"hebrew_vocabulary_audio.apkg",
|
||||
"hebrew_vocabulary_images.apkg",
|
||||
"hebrew_vocabulary_audio_images.apkg",
|
||||
"hebrew_conjugations.apkg",
|
||||
"hebrew_conjugations_audio.apkg",
|
||||
"hebrew_confusables.apkg",
|
||||
"hebrew_confusables_audio.apkg",
|
||||
"hebrew_plurals.apkg",
|
||||
"hebrew_plurals_audio.apkg",
|
||||
"hebrew_complete.apkg",
|
||||
"hebrew_complete_audio.apkg",
|
||||
]
|
||||
|
||||
|
||||
def get_release_tag() -> str:
    """Return ``RELEASE_TAG`` imported from ``apkg_builder``.

    The directory containing this script is put on ``sys.path`` (once) so the
    import works regardless of the current working directory.
    """
    script_dir = str(Path(__file__).parent)
    if script_dir not in sys.path:
        sys.path.insert(0, script_dir)

    from apkg_builder import RELEASE_TAG

    return RELEASE_TAG
|
||||
|
||||
|
||||
def api(method: str, endpoint: str, **kwargs) -> requests.Response:
    """Send an authenticated request to the repository API.

    Args:
        method: HTTP method, e.g. ``"GET"`` or ``"POST"``.
        endpoint: Path appended to ``REPO_API`` (starting with ``/``).
        **kwargs: Passed through to ``requests.request``. ``headers`` are
            merged with the authorization header and ``timeout`` defaults to
            30 seconds.

    Returns:
        The successful response.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    url = f"{REPO_API}{endpoint}"
    # Merge caller headers instead of colliding with them; the token always wins.
    headers = {**(kwargs.pop("headers", None) or {}), "Authorization": f"token {FORGEJO_TOKEN}"}
    kwargs.setdefault("timeout", 30)
    resp = requests.request(method, url, headers=headers, **kwargs)
    resp.raise_for_status()
    return resp
|
||||
|
||||
|
||||
def tag_exists(tag: str) -> bool:
    """Return True if *tag* exists in the remote repository.

    Only a 404 answer is interpreted as "tag does not exist"; any other HTTP
    error (authentication, server failure, …) is re-raised so it is not
    mistaken for a missing tag.

    Raises:
        requests.HTTPError: For any error status other than 404.
    """
    try:
        api("GET", f"/tags/{tag}")
    except requests.HTTPError as exc:
        if exc.response is not None and exc.response.status_code == 404:
            return False
        raise
    return True
|
||||
|
||||
|
||||
def release_exists(tag: str) -> dict | None:
    """Return the release JSON for *tag*, or ``None`` if there is no such release.

    Only a 404 answer is interpreted as "no release"; any other HTTP error is
    re-raised so it is not mistaken for a missing release.

    Raises:
        requests.HTTPError: For any error status other than 404.
    """
    try:
        resp = api("GET", f"/releases/tags/{tag}")
    except requests.HTTPError as exc:
        if exc.response is not None and exc.response.status_code == 404:
            return None
        raise
    return resp.json()
|
||||
|
||||
|
||||
def create_git_tag(tag: str) -> None:
    """Create *tag* locally and push it to ``origin``.

    Raises:
        subprocess.CalledProcessError: If either git command fails.
    """
    subprocess.run(["git", "tag", tag], check=True)
    subprocess.run(["git", "push", "origin", tag], check=True)
    print(f"  Created and pushed tag: {tag}")
|
||||
|
||||
|
||||
def create_release(tag: str, assets: list[Path]) -> int:
    """Create a (non-draft, non-prerelease) release for *tag*.

    The release body is a Markdown table listing each asset's file name and
    size in MB.

    Args:
        tag: Tag name the release is attached to.
        assets: Deck files to describe in the release body; they must exist.

    Returns:
        The ID of the newly created release.
    """
    # Build release body from deck file sizes
    lines = ["## Deck Variants\n", "| File | Size |", "|------|------|"]
    for p in sorted(assets):
        size_mb = p.stat().st_size / 1_048_576
        lines.append(f"| {p.name} | {size_mb:.1f} MB |")

    data = {
        "tag_name": tag,
        "name": f"{tag} — Hebrew Flash Cards",
        "body": "\n".join(lines),
        "draft": False,
        "prerelease": False,
    }
    resp = api("POST", "/releases", json=data)
    release_id = resp.json()["id"]
    print(f"  Created release: {tag} (ID {release_id})")
    return release_id
|
||||
|
||||
|
||||
def delete_release_assets(release_id: int) -> int:
    """Delete all existing assets on a release.

    Args:
        release_id: ID of the release whose assets are removed.

    Returns:
        The number of assets deleted.
    """
    assets = api("GET", f"/releases/{release_id}/assets").json()
    for asset in assets:
        api("DELETE", f"/releases/{release_id}/assets/{asset['id']}")
    return len(assets)
|
||||
|
||||
|
||||
def upload_assets(release_id: int, assets: list[Path]) -> None:
    """Upload every file in *assets* (sorted) as an attachment of the release.

    Args:
        release_id: ID of the target release.
        assets: Files to upload; each is sent under its own file name.
    """
    for p in sorted(assets):
        size_mb = p.stat().st_size / 1_048_576
        print(f"  Uploading {p.name} ({size_mb:.1f} MB) ... ", end="", flush=True)
        with open(p, "rb") as f:
            api(
                "POST",
                f"/releases/{release_id}/assets",
                # Passed via params so the file name is URL-encoded.
                params={"name": p.name},
                files={"attachment": (p.name, f, "application/octet-stream")},
            )
        print("ok")
|
||||
|
||||
|
||||
def validate_decks() -> bool:
    """Run validate_apkg.py and report whether it succeeded.

    The validator's stdout is always printed; its stderr is printed only when
    it exits with a non-zero status.

    Returns:
        True if the validator exited with status 0.
    """
    # Resolve the script next to this file so the call does not depend on the
    # current working directory.
    script = Path(__file__).parent / "validate_apkg.py"
    result = subprocess.run(
        [sys.executable, str(script)],
        capture_output=True,
        text=True,
    )
    print(result.stdout)
    if result.returncode != 0:
        print(result.stderr)
    return result.returncode == 0
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: create a release and upload all deck files.

    Steps:
      1. Resolve the tag (argument, else ``apkg_builder.RELEASE_TAG``).
      2. Require every file in ``DECK_VARIANTS`` to exist in ``OUTPUT_DIR``.
      3. Optionally run the validator (``--validate``).
      4. With ``--dry-run``, list what would be uploaded and stop.
      5. If the release exists: abort, or with ``--force`` delete its assets.
         Otherwise create the tag (if missing) and the release.
      6. Upload the assets.

    Exits with status 1 on missing files, failed validation, or an existing
    release without ``--force``.
    """
    parser = argparse.ArgumentParser(description="Create Forgejo release with deck assets")
    parser.add_argument("tag", nargs="?", help="Release tag (default: from apkg_builder.RELEASE_TAG)")
    parser.add_argument("--dry-run", action="store_true", help="Show what would be done without doing it")
    parser.add_argument("--validate", action="store_true", help="Run validate_apkg.py before releasing")
    parser.add_argument("--force", action="store_true", help="Re-upload assets if release already exists")
    args = parser.parse_args()

    tag = args.tag or get_release_tag()
    print(f"Release tag: {tag}")

    # Collect assets
    assets = [OUTPUT_DIR / name for name in DECK_VARIANTS]
    missing = [p for p in assets if not p.exists()]
    if missing:
        print("\nERROR: Missing deck files:")
        for p in missing:
            print(f"  {p}")
        print("\nRun the build pipeline first: python3 run.py --skip-scrape")
        sys.exit(1)

    print(f"Assets: {len(assets)} deck files")
    total_mb = sum(p.stat().st_size for p in assets) / 1_048_576
    print(f"Total size: {total_mb:.0f} MB")

    if args.validate:
        print("\nValidating decks ...")
        if not validate_decks():
            print("ERROR: Validation failed. Aborting release.")
            sys.exit(1)
        print("Validation passed.\n")

    if args.dry_run:
        print("\n[DRY RUN] Would upload:")
        for p in sorted(assets):
            size_mb = p.stat().st_size / 1_048_576
            print(f"  {p.name} ({size_mb:.1f} MB)")
        print(f"\n[DRY RUN] Tag: {tag}")
        return

    # Check if release already exists
    existing = release_exists(tag)
    if existing and not args.force:
        print(f"\nRelease {tag} already exists (ID {existing['id']}).")
        print("Use --force to delete existing assets and re-upload.")
        sys.exit(1)

    if existing:
        # --force: reuse the release and replace its assets
        release_id = existing["id"]
        deleted = delete_release_assets(release_id)
        print(f"  Deleted {deleted} existing assets from release {tag}")
    else:
        # Create tag if needed
        if not tag_exists(tag):
            create_git_tag(tag)
        release_id = create_release(tag, assets)

    # Upload
    print(f"\nUploading {len(assets)} files ...")
    upload_assets(release_id, assets)

    print(f"\nDone. Release: https://git.nevo.engineer/nevo/hebrew_flash_cards/releases/tag/{tag}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
533
run.py
533
run.py
|
|
@ -6,24 +6,14 @@ Usage:
|
|||
python run.py [options]
|
||||
|
||||
Options:
|
||||
--only {vocab,conjugations,confusables,plurals,complete} Run only one deck
|
||||
Pipeline steps:
|
||||
1. List scrape — scrape pealim.com list pages → words.json (captures slugs)
|
||||
2. Detail scrape — scrape noun/verb detail pages using slugs → words.json
|
||||
3. Frequency — load/download word frequency data
|
||||
4. Examples — extract example sentences from Hebrew EPUBs
|
||||
5. Audio download — download audio mp3 files
|
||||
6. Fonts — download Heebo font files
|
||||
7. Images — fetch noun images from Wikipedia
|
||||
8. Build — build all .apkg deck variants
|
||||
|
||||
Options:
|
||||
--skip-scrape Skip list page scraping (use existing words.json)
|
||||
--skip-detail Skip detail page scraping
|
||||
--only {vocab,conjugations} Run only one deck (skips all unrelated steps)
|
||||
--skip-scrape Use existing data/pealim_dict.csv (no pealim.com dict scraping)
|
||||
--skip-audio Skip audio .mp3 downloads
|
||||
--skip-examples Skip EPUB example extraction
|
||||
--skip-examples Skip Ben Yehuda example fetching
|
||||
--skip-conjugations Skip verb conjugation extraction
|
||||
--skip-images Skip image fetching for concrete nouns
|
||||
--test N Limit to first N words/pages
|
||||
--refresh-examples Force rebuild of Ben Yehuda index (delete old, download nikkud corpus)
|
||||
--test N Process only the first N dictionary words (for quick testing)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
|
@ -31,6 +21,8 @@ import json
|
|||
import logging
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
|
@ -46,127 +38,286 @@ OUTPUT_DIR = Path(__file__).parent / "output"
|
|||
AUDIO_DIR = DATA_DIR / "audio"
|
||||
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
|
||||
FONTS_DIR = DATA_DIR / "fonts"
|
||||
WORDS_JSON = DATA_DIR / "words.json"
|
||||
|
||||
|
||||
def parse_args():
|
||||
p = argparse.ArgumentParser(description="Pealim Anki deck builder")
|
||||
p.add_argument(
|
||||
"--only",
|
||||
choices=["vocab", "conjugations", "confusables", "plurals", "complete"],
|
||||
help="Run only one deck (skips all unrelated steps)",
|
||||
)
|
||||
p.add_argument("--skip-scrape", action="store_true", help="Skip list page scraping")
|
||||
p.add_argument("--skip-detail", action="store_true", help="Skip detail page scraping")
|
||||
p.add_argument("--only", choices=["vocab", "conjugations"], help="Run only one deck (skips all unrelated steps)")
|
||||
p.add_argument("--skip-scrape", action="store_true", help="Skip dict scraping; use cached CSV")
|
||||
p.add_argument("--skip-audio", action="store_true", help="Skip audio downloads")
|
||||
p.add_argument("--skip-examples", action="store_true", help="Skip EPUB example extraction")
|
||||
p.add_argument("--skip-examples", action="store_true", help="Skip Ben Yehuda example lookup")
|
||||
p.add_argument("--skip-conjugations", action="store_true", help="Skip verb conjugation extraction (deprecated: use --only vocab)")
|
||||
p.add_argument("--skip-images", action="store_true", help="Skip image fetching")
|
||||
p.add_argument("--refresh-examples", action="store_true", help="Force rebuild of Ben Yehuda index")
|
||||
p.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
|
||||
return p.parse_args()
|
||||
|
||||
|
||||
def step_list_scrape(args):
|
||||
"""Step 1 — scrape pealim.com list pages → words.json."""
|
||||
def step_scrape(args):
|
||||
"""Step 1 — scrape or load dictionary."""
|
||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
||||
anki_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||||
# Legacy fallback names
|
||||
legacy_dict = DATA_DIR / "pealim_dict.csv"
|
||||
legacy_anki = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
|
||||
if args.skip_scrape:
|
||||
if WORDS_JSON.exists():
|
||||
logger.info("[1] Using existing words.json (--skip-scrape)")
|
||||
if dict_csv.exists():
|
||||
logger.info(f"[1] Using existing {dict_csv}")
|
||||
elif legacy_dict.exists():
|
||||
logger.info(f"[1] Using legacy {legacy_dict} (consider renaming)")
|
||||
else:
|
||||
logger.error(f"[1] --skip-scrape set but {WORDS_JSON} not found. Aborting.")
|
||||
logger.error(f"[1] --skip-scrape set but {dict_csv} not found. Aborting.")
|
||||
sys.exit(1)
|
||||
return
|
||||
|
||||
logger.info("[1] Scraping dictionary list pages from pealim.com …")
|
||||
import pealim_list_scrape
|
||||
logger.info("[1] Scraping dictionary from pealim.com …")
|
||||
import hebrew_extract
|
||||
import pandas as pd
|
||||
|
||||
total_pages = args.test if args.test else None
|
||||
pealim_list_scrape.run_scrape(total_pages=total_pages, force_refresh=False)
|
||||
df = hebrew_extract.extract_from_website()
|
||||
df.to_csv(dict_csv, index=True)
|
||||
logger.info(f" Saved {len(df)} words → {dict_csv}")
|
||||
|
||||
df = hebrew_extract.modify_for_anki(df)
|
||||
df.to_csv(anki_csv, sep=";", index=True)
|
||||
logger.info(f" Saved Anki CSV → {anki_csv}")
|
||||
|
||||
|
||||
def step_frequency() -> dict[str, int]:
|
||||
"""Step 3 — load/download word frequency data."""
|
||||
logger.info("[3] Loading word frequency data …")
|
||||
"""Step 2 — load/download word frequency data."""
|
||||
logger.info("[2] Loading word frequency data …")
|
||||
import frequency_lookup
|
||||
|
||||
frequency_lookup.load()
|
||||
return frequency_lookup._freq
|
||||
|
||||
|
||||
def step_examples(args) -> dict:
|
||||
"""Step 4 — extract example sentences from Hebrew EPUBs."""
|
||||
def step_examples(args, freq_cache: dict):
|
||||
"""Step 3 — load/build Ben Yehuda example index."""
|
||||
if args.skip_examples:
|
||||
logger.info("[4] Skipping examples (--skip-examples)")
|
||||
logger.info("[3] Skipping examples (--skip-examples)")
|
||||
examples_path = DATA_DIR / "examples_cache.json"
|
||||
if examples_path.exists():
|
||||
with open(examples_path) as f:
|
||||
return json.load(f)
|
||||
return {}
|
||||
|
||||
logger.info("[4] Extracting EPUB example sentences …")
|
||||
import epub_examples
|
||||
logger.info("[3] Loading Ben Yehuda example index …")
|
||||
import benyehuda
|
||||
benyehuda.load(force_rebuild=args.refresh_examples)
|
||||
|
||||
if not WORDS_JSON.exists():
|
||||
logger.warning("[4] words.json not found, skipping examples")
|
||||
return {}
|
||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict.csv"
|
||||
|
||||
with open(WORDS_JSON, encoding="utf-8") as f:
|
||||
words = json.load(f)
|
||||
try:
|
||||
import pandas as pd
|
||||
try:
|
||||
df = pd.read_csv(dict_csv, sep=";", index_col=0)
|
||||
if df.shape[1] < 3:
|
||||
raise ValueError("too few columns")
|
||||
except (ValueError, pd.errors.ParserError):
|
||||
df = pd.read_csv(dict_csv, index_col=0)
|
||||
|
||||
stats = epub_examples.run(words)
|
||||
if args.test:
|
||||
df = df.head(args.test)
|
||||
|
||||
# Save updated words.json
|
||||
with open(WORDS_JSON, "w", encoding="utf-8") as f:
|
||||
json.dump(words, f, ensure_ascii=False, indent=2)
|
||||
logger.info(f" Pre-fetching examples for {len(df)} words …")
|
||||
for _, row in df.iterrows():
|
||||
# Use nikkud word form as primary key (nikkud corpus)
|
||||
word_nikkud = str(row.get("Word", "")).strip()
|
||||
if word_nikkud:
|
||||
benyehuda.get_examples(word_nikkud)
|
||||
|
||||
logger.info(f" Coverage: {stats['matched']}/{stats['total_vocab']}")
|
||||
return stats
|
||||
except Exception as e:
|
||||
logger.warning(f" Could not pre-fetch all examples: {e}")
|
||||
|
||||
benyehuda.save_examples_cache()
|
||||
return benyehuda._examples_cache
|
||||
|
||||
|
||||
def step_detail_scrape(args):
|
||||
"""Step 2 — scrape detail pages for nouns and verbs → update words.json."""
|
||||
if args.skip_detail:
|
||||
logger.info("[2] Skipping detail scrape (--skip-detail)")
|
||||
return
|
||||
|
||||
logger.info("[2] Scraping detail pages from pealim.com …")
|
||||
import pealim_detail_scrape
|
||||
|
||||
test_limit = args.test if args.test else None
|
||||
pealim_detail_scrape.run(test=test_limit, force_refresh=False)
|
||||
|
||||
|
||||
def step_audio_download(args):
|
||||
"""Step 5 — download audio .mp3 files from URLs in words.json."""
|
||||
def step_audio(args):
|
||||
"""Step 4 — download vocabulary audio .mp3 files from audio_url column in CSV."""
|
||||
if args.skip_audio:
|
||||
logger.info("[5] Skipping audio (--skip-audio)")
|
||||
logger.info("[4] Skipping audio (--skip-audio)")
|
||||
return
|
||||
|
||||
logger.info("[5] Downloading audio files …")
|
||||
logger.info("[4] Downloading vocabulary audio files …")
|
||||
|
||||
import pealim_audio_download
|
||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict.csv"
|
||||
|
||||
test_limit = args.test if args.test else None
|
||||
pealim_audio_download.run(test=test_limit)
|
||||
import pandas as pd
|
||||
import requests
|
||||
try:
|
||||
try:
|
||||
df = pd.read_csv(dict_csv, sep=";", index_col=0)
|
||||
if df.shape[1] < 3:
|
||||
raise ValueError("too few columns")
|
||||
except (ValueError, pd.errors.ParserError):
|
||||
df = pd.read_csv(dict_csv, index_col=0)
|
||||
|
||||
if 'audio_url' not in df.columns:
|
||||
logger.warning(" No audio_url column in CSV — re-scrape with hebrew_extract.py to capture audio URLs")
|
||||
return
|
||||
|
||||
if args.test:
|
||||
df = df.head(args.test)
|
||||
|
||||
AUDIO_DIR.mkdir(parents=True, exist_ok=True)
|
||||
downloaded = 0
|
||||
skipped = 0
|
||||
no_url = 0
|
||||
|
||||
def strip_nik(t: str) -> str:
|
||||
return "".join(c for c in unicodedata.normalize("NFD", t)
|
||||
if unicodedata.category(c) != "Mn")
|
||||
|
||||
for _, row in df.iterrows():
|
||||
word = str(row.get("Word", "")).strip()
|
||||
word_plain = str(row.get("Word Without Nikkud", "")).strip()
|
||||
audio_url = str(row.get("audio_url", "")).strip()
|
||||
|
||||
if not word:
|
||||
continue
|
||||
|
||||
safe_name = re.sub(r"[^\u05d0-\u05ea]", "", strip_nik(word_plain or word))
|
||||
if not safe_name:
|
||||
continue
|
||||
mp3_path = AUDIO_DIR / f"{safe_name}.mp3"
|
||||
|
||||
if mp3_path.exists():
|
||||
skipped += 1
|
||||
continue
|
||||
|
||||
if not audio_url or audio_url in ("nan", "None", ""):
|
||||
no_url += 1
|
||||
continue
|
||||
|
||||
try:
|
||||
resp = requests.get(audio_url, timeout=10)
|
||||
resp.raise_for_status()
|
||||
mp3_path.write_bytes(resp.content)
|
||||
downloaded += 1
|
||||
time.sleep(0.3)
|
||||
except Exception as e:
|
||||
logger.debug(f" Audio download failed for {word}: {e}")
|
||||
|
||||
logger.info(f" Audio: {downloaded} downloaded, {skipped} already cached, {no_url} without URL")
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f" Audio step failed: {e}")
|
||||
|
||||
|
||||
def step_fonts(_args: argparse.Namespace):
|
||||
"""Step 6 — download Heebo font files (one-time, cached)."""
|
||||
def _download_conj_mp3(audio_url: str, mp3_path) -> str:
    """Fetch one conjugation clip into *mp3_path*.

    Returns "cached" when the file already exists, "downloaded" on success,
    and "failed" when the request or the write raised. Failures are logged
    at debug level and never propagate: audio is a best-effort extra.
    """
    import requests

    if mp3_path.exists():
        return "cached"
    try:
        resp = requests.get(audio_url, timeout=10)
        resp.raise_for_status()
        mp3_path.write_bytes(resp.content)
    except Exception as e:
        logger.debug(f" Conj audio failed {mp3_path.name}: {e}")
        return "failed"
    # Small pause between successful downloads (same delay as before).
    time.sleep(0.2)
    return "downloaded"


def step_conj_audio(args, conjugations: dict):
    """Step 4b — download conjugation audio .mp3 files.

    For every verb in *conjugations* that has both ``forms`` and a ``slug``,
    each form carrying an ``audio_url`` is saved to
    ``AUDIO_CONJ_DIR/<slug>_<form_key>.mp3``; forms of an optional
    ``passive_partner`` go to ``<slug>_passive_<form_key>.mp3``. Files that
    already exist are counted as cached and not fetched again.

    Does nothing except log when ``args.skip_audio`` is set.
    """
    if args.skip_audio:
        logger.info("[4b] Skipping conjugation audio (--skip-audio)")
        return

    logger.info("[4b] Downloading conjugation audio files …")
    AUDIO_CONJ_DIR.mkdir(parents=True, exist_ok=True)

    tally = {"downloaded": 0, "cached": 0, "failed": 0}

    for data in conjugations.values():
        if not data or not data.get("forms"):
            continue
        slug = data.get("slug", "")
        if not slug:
            continue

        passive = data.get("passive_partner") or {}
        # Active forms first, then the passive partner's forms; both use the
        # same download routine and differ only in the filename prefix.
        form_groups = (
            (f"{slug}_", data["forms"]),
            (f"{slug}_passive_", passive.get("forms") or {}),
        )
        for prefix, forms in form_groups:
            for form_key, form_data in forms.items():
                audio_url = form_data.get("audio_url", "")
                if not audio_url:
                    continue
                outcome = _download_conj_mp3(audio_url, AUDIO_CONJ_DIR / f"{prefix}{form_key}.mp3")
                tally[outcome] += 1

    downloaded = tally["downloaded"]
    skipped = tally["cached"]
    failed = tally["failed"]
    logger.info(
        f" Conjugation audio: {downloaded} downloaded, "
        f"{skipped} cached, {failed} failed"
    )
|
||||
|
||||
|
||||
def step_fonts(args):
|
||||
"""Step 4c — download Heebo font files (one-time, cached)."""
|
||||
FONTS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
regular = FONTS_DIR / "_Heebo-Regular.ttf"
|
||||
bold = FONTS_DIR / "_Heebo-Bold.ttf"
|
||||
|
||||
if regular.exists() and bold.exists():
|
||||
logger.info("[6] Heebo fonts already cached")
|
||||
logger.info("[4c] Heebo fonts already cached")
|
||||
return
|
||||
|
||||
logger.info("[6] Downloading Heebo fonts from Google Fonts …")
|
||||
logger.info("[4c] Downloading Heebo fonts from Google Fonts …")
|
||||
|
||||
# Fetch CSS to get actual TTF source URLs (static subset for Hebrew + Latin)
|
||||
import requests as _req
|
||||
|
||||
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) Gecko/20100101 Firefox/120.0"}
|
||||
headers = {
|
||||
# Request TTF (not woff2) so Anki can embed them
|
||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) Gecko/20100101 Firefox/120.0"
|
||||
}
|
||||
css_url = "https://fonts.googleapis.com/css2?family=Heebo:wght@400;700"
|
||||
try:
|
||||
css_resp = _req.get(css_url, headers=headers, timeout=15)
|
||||
css_resp.raise_for_status()
|
||||
css_text = css_resp.text
|
||||
|
||||
# Find all src: url(...) references (may be woff2 for modern UA)
|
||||
font_urls = re.findall(r"src:\s*url\(([^)]+)\)", css_text)
|
||||
logger.debug(f" Found {len(font_urls)} font URL(s) in CSS")
|
||||
|
||||
# Prefer TTF; if only woff2 available, download first two and note
|
||||
downloaded = []
|
||||
for i, fu in enumerate(font_urls[:2]):
|
||||
fu = fu.strip("'\"")
|
||||
dest = regular if i == 0 else bold
|
||||
|
|
@ -175,78 +326,135 @@ def step_fonts(_args: argparse.Namespace):
|
|||
fr = _req.get(fu, timeout=15)
|
||||
fr.raise_for_status()
|
||||
dest.write_bytes(fr.content)
|
||||
downloaded.append(dest.name)
|
||||
logger.info(f" Downloaded → {dest.name}")
|
||||
|
||||
if not downloaded:
|
||||
logger.info(" All font files already present")
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f" Heebo download failed: {e}")
|
||||
logger.warning(" Cards will fall back to Arial Hebrew / David.")
|
||||
logger.warning(
|
||||
" To install manually: download Heebo-Regular.ttf and Heebo-Bold.ttf "
|
||||
"from https://fonts.google.com/specimen/Heebo and rename with _ prefix "
|
||||
f"into {FONTS_DIR}"
|
||||
)
|
||||
|
||||
|
||||
def step_images(args) -> dict:
|
||||
"""Step 7 — fetch images for concrete nouns (resume-safe)."""
|
||||
"""Step 4d — fetch images for concrete nouns (resume-safe)."""
|
||||
if args.skip_images:
|
||||
logger.info("[7] Skipping images (--skip-images)")
|
||||
logger.info("[4d] Skipping images (--skip-images)")
|
||||
cache_path = DATA_DIR / "image_cache.json"
|
||||
if cache_path.exists():
|
||||
with open(cache_path) as f:
|
||||
return json.load(f)
|
||||
return {}
|
||||
|
||||
limit = args.test
|
||||
logger.info("[7] Fetching images for concrete nouns …")
|
||||
limit = args.test # When in test mode, limit images too
|
||||
logger.info("[4d] Fetching images for concrete nouns …")
|
||||
import image_fetch
|
||||
|
||||
return image_fetch.run(limit=limit)
|
||||
|
||||
|
||||
def step_build_all(args):
|
||||
"""Step 8 — build all 12 release variants from the unified words.json."""
|
||||
logger.info("[8] Building all deck variants …")
|
||||
def step_build_all(args, examples_cache: dict, freq_cache: dict, conjugations: dict | None, image_cache: dict | None = None):
|
||||
"""Step 5 — build all 6 release variants (4 vocab + 2 conj)."""
|
||||
logger.info("[5] Building all deck variants …")
|
||||
import apkg_builder
|
||||
|
||||
if not WORDS_JSON.exists():
|
||||
logger.error(f"[8] {WORDS_JSON} not found. Run the data pipeline first.")
|
||||
sys.exit(1)
|
||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict.csv"
|
||||
|
||||
with open(WORDS_JSON, encoding="utf-8") as f:
|
||||
words = json.load(f)
|
||||
|
||||
apkg_builder.build_all_variants(words, limit=args.test)
|
||||
apkg_builder.build_all_variants(
|
||||
dict_csv,
|
||||
conjugations=conjugations or {},
|
||||
examples_cache=examples_cache,
|
||||
freq_cache=freq_cache,
|
||||
image_cache=image_cache or {},
|
||||
limit=args.test,
|
||||
)
|
||||
|
||||
|
||||
def print_summary(_args: argparse.Namespace, example_stats: dict, freq_cache: dict):
|
||||
def step_conjugations(args):
    """Step 6 — extract conjugations (returns data; building handled by step_build_all).

    --skip-conjugations skips re-extraction from pealim.com but still loads
    from cache so conj deck variants are built correctly.

    Returns the conjugation mapping, or None when there is nothing to build
    from (no cache under --skip-conjugations, or no verbs_input.txt).
    When the flag is not set, conjugation audio is downloaded via
    step_conj_audio before returning.
    """
    import json as _json

    conj_cache = DATA_DIR / "conjugations.json"

    def _load_cache():
        # JSON is UTF-8 by specification; without an explicit encoding open()
        # falls back to the locale's preferred encoding, which can fail or
        # garble non-ASCII text on non-UTF-8 systems.
        with open(conj_cache, encoding="utf-8") as f:
            return _json.load(f)

    if args.skip_conjugations:
        if conj_cache.exists():
            logger.info("[6] --skip-conjugations: loading from cache …")
            return _load_cache()
        logger.info("[6] --skip-conjugations: no cache found, skipping conj decks")
        return None

    verbs_file = Path(__file__).parent / "verbs_input.txt"
    if not verbs_file.exists():
        logger.info("[6] verbs_input.txt not found — skipping conjugation deck")
        return None

    if conj_cache.exists():
        logger.info("[6] Using cached conjugations.json …")
        conjugations = _load_cache()
    else:
        logger.info("[6] Extracting verb conjugations …")
        import conjugation_extract

        conjugations = conjugation_extract.main(verbs_file)

    # Download conjugation audio
    step_conj_audio(args, conjugations)

    return conjugations
|
||||
|
||||
|
||||
def print_summary(args, examples_cache, freq_cache, conjugations):
|
||||
logger.info("")
|
||||
logger.info("=" * 60)
|
||||
logger.info("SUMMARY")
|
||||
logger.info("=" * 60)
|
||||
|
||||
if WORDS_JSON.exists():
|
||||
with open(WORDS_JSON, encoding="utf-8") as f:
|
||||
words = json.load(f)
|
||||
logger.info(f" Dictionary words: {len(words)}")
|
||||
|
||||
nouns = sum(1 for e in words.values() if e.get("pos", "").startswith("Noun"))
|
||||
verbs = sum(1 for e in words.values() if e.get("pos", "").startswith("Verb"))
|
||||
detail_scraped = sum(1 for e in words.values() if e.get("detail_scraped"))
|
||||
logger.info(f" Nouns: {nouns}, Verbs: {verbs}, Detail-scraped: {detail_scraped}")
|
||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict.csv"
|
||||
if dict_csv.exists():
|
||||
import pandas as pd
|
||||
try:
|
||||
df = pd.read_csv(dict_csv, sep=";", index_col=0)
|
||||
if df.shape[1] < 3:
|
||||
raise ValueError("too few columns")
|
||||
except (ValueError, pd.errors.ParserError):
|
||||
df = pd.read_csv(dict_csv, index_col=0)
|
||||
logger.info(f" Dictionary words: {len(df)}")
|
||||
|
||||
logger.info(f" Frequency entries: {len(freq_cache)}")
|
||||
matched = example_stats.get("matched", 0)
|
||||
total = example_stats.get("total_vocab", 0)
|
||||
if total:
|
||||
logger.info(f" Example coverage: {matched}/{total} ({100 * matched // total}%)")
|
||||
for book, count in example_stats.get("books", {}).items():
|
||||
logger.info(f" {book}: {count} sentences")
|
||||
logger.info(f" Example cache entries: {len(examples_cache)}")
|
||||
covered = sum(1 for v in examples_cache.values() if v)
|
||||
if examples_cache:
|
||||
logger.info(f" Example coverage: {covered}/{len(examples_cache)} ({100*covered//len(examples_cache)}%)")
|
||||
|
||||
if AUDIO_DIR.exists():
|
||||
mp3s = list(AUDIO_DIR.glob("*.mp3"))
|
||||
logger.info(f" Vocabulary audio files: {len(mp3s)}")
|
||||
|
||||
if AUDIO_CONJ_DIR.exists():
|
||||
mp3s = [
|
||||
p for p in AUDIO_CONJ_DIR.glob("*.mp3") if not p.stem.endswith("_infinitive") and "_passive_" not in p.stem
|
||||
]
|
||||
logger.info(f" Conjugation audio files (bundled): {len(mp3s)}")
|
||||
mp3s = list(AUDIO_CONJ_DIR.glob("*.mp3"))
|
||||
logger.info(f" Conjugation audio files: {len(mp3s)}")
|
||||
|
||||
image_cache_path = DATA_DIR / "image_cache.json"
|
||||
if image_cache_path.exists():
|
||||
|
|
@ -256,23 +464,17 @@ def print_summary(_args: argparse.Namespace, example_stats: dict, freq_cache: di
|
|||
logger.info(f" Images: {found_imgs}/{len(ic)} nouns with images")
|
||||
|
||||
import apkg_builder as _ab
|
||||
|
||||
all_apkgs = [
|
||||
_ab.VOCAB_APKG,
|
||||
_ab.VOCAB_APKG_AUDIO,
|
||||
_ab.VOCAB_APKG_IMAGES,
|
||||
_ab.VOCAB_APKG_AUDIO_IMAGES,
|
||||
_ab.CONJ_APKG,
|
||||
_ab.CONJ_APKG_AUDIO,
|
||||
_ab.CONF_APKG,
|
||||
_ab.CONF_APKG_AUDIO,
|
||||
_ab.COMPLETE_APKG,
|
||||
_ab.COMPLETE_APKG_AUDIO,
|
||||
_ab.VOCAB_APKG, _ab.VOCAB_APKG_AUDIO, _ab.VOCAB_APKG_IMAGES, _ab.VOCAB_APKG_AUDIO_IMAGES,
|
||||
_ab.CONJ_APKG, _ab.CONJ_APKG_AUDIO,
|
||||
]
|
||||
for apkg in all_apkgs:
|
||||
if apkg.exists():
|
||||
size_mb = apkg.stat().st_size / 1e6
|
||||
logger.info(f" {apkg.name}: {size_mb:.1f} MB")
|
||||
if conjugations:
|
||||
verb_count = sum(1 for v in conjugations.values() if v)
|
||||
logger.info(f" Verbs in conjugation deck: {verb_count}")
|
||||
|
||||
logger.info("=" * 60)
|
||||
logger.info("DONE")
|
||||
|
|
@ -287,75 +489,36 @@ def main():
|
|||
logger.info(f" MODE: --only {args.only}")
|
||||
if args.test:
|
||||
logger.info(f" TEST MODE: {args.test} words")
|
||||
if args.refresh_examples:
|
||||
logger.info(" REFRESH EXAMPLES: Ben Yehuda index will be rebuilt")
|
||||
logger.info("=" * 60)
|
||||
|
||||
def _load_words_for_only() -> dict:
|
||||
if not WORDS_JSON.exists():
|
||||
logger.error(f"words.json not found at {WORDS_JSON}. Run the data pipeline first.")
|
||||
sys.exit(1)
|
||||
with open(WORDS_JSON, encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
|
||||
if args.only == "conjugations":
|
||||
step_fonts(args)
|
||||
conjugations = step_conjugations(args)
|
||||
if conjugations:
|
||||
import apkg_builder
|
||||
|
||||
words = _load_words_for_only()
|
||||
for audio, path in [(False, apkg_builder.CONJ_APKG), (True, apkg_builder.CONJ_APKG_AUDIO)]:
|
||||
deck, media = apkg_builder.build_conj_deck(words, include_audio=audio)
|
||||
apkg_builder.write_conj_apkg(deck, media, out_path=path)
|
||||
print_summary(args, {}, {})
|
||||
return
|
||||
|
||||
if args.only == "confusables":
|
||||
step_fonts(args)
|
||||
import apkg_builder
|
||||
|
||||
words = _load_words_for_only()
|
||||
for audio, path in [(False, apkg_builder.CONF_APKG), (True, apkg_builder.CONF_APKG_AUDIO)]:
|
||||
deck, media = apkg_builder.build_confusables_deck(words, include_audio=audio)
|
||||
apkg_builder.write_conf_apkg(deck, media, out_path=path)
|
||||
print_summary(args, {}, {})
|
||||
return
|
||||
|
||||
if args.only == "plurals":
|
||||
step_fonts(args)
|
||||
import apkg_builder
|
||||
|
||||
words = _load_words_for_only()
|
||||
for audio, path in [(False, apkg_builder.PLURAL_APKG), (True, apkg_builder.PLURAL_APKG_AUDIO)]:
|
||||
deck, media = apkg_builder.build_plural_deck(words, include_audio=audio)
|
||||
apkg_builder.write_plural_apkg(deck, media, out_path=path)
|
||||
print_summary(args, {}, {})
|
||||
return
|
||||
|
||||
if args.only == "complete":
|
||||
step_fonts(args)
|
||||
import apkg_builder
|
||||
|
||||
words = _load_words_for_only()
|
||||
emoji_lookup = apkg_builder._load_emoji_lookup()
|
||||
for audio, path in [(False, apkg_builder.COMPLETE_APKG), (True, apkg_builder.COMPLETE_APKG_AUDIO)]:
|
||||
decks, media = apkg_builder.build_complete_deck(
|
||||
words,
|
||||
include_audio=audio,
|
||||
emoji_lookup=emoji_lookup,
|
||||
apkg_builder.build_all_variants(
|
||||
DATA_DIR / "hebrew_dict_for_anki.csv",
|
||||
conjugations=conjugations,
|
||||
limit=args.test,
|
||||
)
|
||||
apkg_builder.write_complete_apkg(decks, media, out_path=path)
|
||||
print_summary(args, {}, {})
|
||||
print_summary(args, {}, {}, conjugations or {})
|
||||
return
|
||||
|
||||
# Full pipeline
|
||||
step_list_scrape(args) # 1 — scrape list pages → words.json (captures slugs)
|
||||
step_detail_scrape(args) # 2 — scrape detail pages using slugs → words.json
|
||||
freq_cache = step_frequency() # 3 — word frequency data
|
||||
example_stats = step_examples(args) # 4 — EPUB example sentences
|
||||
step_audio_download(args) # 5 — download audio mp3s
|
||||
step_fonts(args) # 6 — download Heebo fonts
|
||||
step_images(args) # 7 — fetch noun images
|
||||
step_build_all(args) # 8 — build all .apkg variants
|
||||
if args.only == "vocab":
|
||||
args.skip_conjugations = True
|
||||
|
||||
print_summary(args, example_stats, freq_cache)
|
||||
step_scrape(args)
|
||||
freq_cache = step_frequency()
|
||||
examples_cache = step_examples(args, freq_cache)
|
||||
step_audio(args)
|
||||
step_fonts(args)
|
||||
image_cache = step_images(args)
|
||||
conjugations = step_conjugations(args)
|
||||
step_build_all(args, examples_cache, freq_cache, conjugations, image_cache)
|
||||
|
||||
print_summary(args, examples_cache, freq_cache, conjugations or {})
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
|
|
@ -1,392 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Assign frequency ranks from the cleaned corpus to words.json entries.
|
||||
|
||||
Two-tier assignment with PoS priority:
|
||||
Tier 1: Match headword ktiv_male directly against corpus
|
||||
Tier 2: Match conjugated/inflected forms (only if no other entry already
|
||||
claimed that corpus word via tier 1)
|
||||
|
||||
PoS priority (based on standalone-word likelihood in Hebrew text):
|
||||
כינויי_גוף (Pronoun) > מילות_חיבור (Conjunction) > שם_תואר (Adjective) >
|
||||
מילית (Particle) > שם_עצם (Noun) > תוארי_הפועל (Adverb) >
|
||||
מילות_יחס (Preposition) > פעלים (Verb)
|
||||
|
||||
Usage:
|
||||
python3 scripts/assign_frequency.py # assign and save
|
||||
python3 scripts/assign_frequency.py --dry-run # preview only
|
||||
python3 scripts/assign_frequency.py --stats # show statistics only
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PROJECT_ROOT = Path(__file__).parent.parent
|
||||
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
|
||||
CLEAN_CACHE = PROJECT_ROOT / "data" / "frequency_clean.json"
|
||||
RAW_CACHE = PROJECT_ROOT / "data" / "frequency_cache.json"
|
||||
|
||||
# Function word PoS — these dominate content words in homograph groups
|
||||
FUNCTION_POS = frozenset({"כינויי_גוף", "מילות_חיבור", "מילית", "מילות_יחס", "תוארי_הפועל"})
|
||||
|
||||
# Content PoS that loses frequency when a function word dominates
|
||||
# Adjectives also lose (e.g. כן "honest" vs כן "yes") — they're rare collisions
|
||||
CONTENT_POS = frozenset({"שם_עצם", "שם_תואר", "פעלים"})
|
||||
|
||||
# Manual overrides: for these ktiv_male forms, ALL homographs share frequency.
|
||||
# These are cases where the content word is genuinely common enough to deserve it,
|
||||
# e.g. עם "people" (NN) alongside עם "with" (PREP).
|
||||
SHARE_ALL_WORDS = frozenset(
|
||||
{
|
||||
"עם", # "people" (NN) + "with" (PREP)
|
||||
"שם", # "name" (NN) + "there" (ADV)
|
||||
"אל", # "god" (NN) + "to" (PREP) + "don't" (PART)
|
||||
"עד", # "witness"/"eternity" (NN) + "until" (PREP)
|
||||
"פה", # "mouth" (NN) + "here" (ADV)
|
||||
"לאחר", # "to be late" (VB) + "after" (PREP)
|
||||
"יופי", # "beauty" (NN) + "great!" (ADV)
|
||||
"המון", # "crowd" (NN) + "lots of" (ADV)
|
||||
"חבל", # "rope" (NN) + "it's a pity" (ADV)
|
||||
"ראשית", # "beginning" (NN) + "firstly" (ADV)
|
||||
"עקב", # "heel"/"footprint" (NN) + "due to" (CONJ)
|
||||
"אולם", # "hall" (NN) + "however" (ADV)
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def _get_pos_tag(entry: dict) -> str:
|
||||
"""Extract primary PoS tag from entry's tags field."""
|
||||
tags = (entry.get("tags") or "").split()
|
||||
for t in tags:
|
||||
if not t.startswith("שורש"):
|
||||
return t
|
||||
return "unknown"
|
||||
|
||||
|
||||
def _build_form_index(words: dict) -> dict[str, list[tuple[str, str]]]:
|
||||
"""Build reverse index: ktiv_male_form -> [(unique_key, match_type), ...]"""
|
||||
index: dict[str, list[tuple[str, str]]] = defaultdict(list)
|
||||
|
||||
for key, entry in words.items():
|
||||
w = entry.get("word") or {}
|
||||
if km := w.get("ktiv_male"):
|
||||
index[km].append((key, "headword"))
|
||||
|
||||
# Verb conjugations: indexed for new-assignment-only matching (no upgrades).
|
||||
# Conjugated forms collide with unrelated headwords, so tier 2 only uses
|
||||
# these for entries that have NO existing frequency.
|
||||
conj = entry.get("conjugation") or {}
|
||||
for form in conj.get("active_forms") or []:
|
||||
if isinstance(form, dict):
|
||||
form_data = form.get("form") or {}
|
||||
if km2 := form_data.get("ktiv_male"):
|
||||
km2 = km2.rstrip("!\u200f ")
|
||||
index[km2].append((key, "conjugation"))
|
||||
|
||||
for hp in conj.get("hufal_pual_forms") or []:
|
||||
if isinstance(hp, dict):
|
||||
hp_data = hp.get("form") or {}
|
||||
if km3 := hp_data.get("ktiv_male"):
|
||||
km3 = km3.rstrip("!\u200f ")
|
||||
index[km3].append((key, "conjugation"))
|
||||
|
||||
for field in ("noun_inflection", "preposition_inflection", "adjective_inflection"):
|
||||
for inf_data in (entry.get(field) or {}).values():
|
||||
if isinstance(inf_data, dict) and (km4 := inf_data.get("ktiv_male")):
|
||||
index[km4].append((key, "inflection"))
|
||||
|
||||
return dict(index)
|
||||
|
||||
|
||||
def _should_get_frequency(
|
||||
entry: dict,
|
||||
all_headword_entries: list[tuple[str, str]],
|
||||
corpus_word: str,
|
||||
words: dict,
|
||||
) -> bool:
|
||||
"""Decide if an entry should get frequency in a homograph group.
|
||||
|
||||
Rules:
|
||||
- If only one entry matches, it always gets frequency.
|
||||
- If SHARE_ALL_WORDS includes this corpus word, all entries share.
|
||||
- If the group has function words AND content words, content words lose.
|
||||
- Otherwise all entries share.
|
||||
"""
|
||||
if len(all_headword_entries) <= 1:
|
||||
return True
|
||||
if corpus_word in SHARE_ALL_WORDS:
|
||||
return True
|
||||
|
||||
pos = _get_pos_tag(entry)
|
||||
has_function = any(_get_pos_tag(words[k]) in FUNCTION_POS for k, _ in all_headword_entries)
|
||||
|
||||
return not (has_function and pos in CONTENT_POS)
|
||||
|
||||
|
||||
def assign_frequencies(
    words: dict,
    freq_corpus: dict[str, int],
    raw_corpus: dict[str, int] | None = None,
    upgrade: bool = False,
) -> dict[str, dict]:
    """Assign frequency ranks to words.json entries. Returns assignment details.

    Args:
        words: words.json content, unique_key -> entry.
        freq_corpus: cleaned corpus, word -> rank. Decides which corpus words
            are considered and the order in which they are processed.
        raw_corpus: optional word -> original rank (with gaps). When given,
            the stored rank comes from here, falling back to the cleaned rank
            for words it does not contain.
        upgrade: when True, tier 2 may replace an existing assignment with a
            better (lower) rank, but only for inflection matches.

    Returns:
        unique_key -> {"rank", "source", "corpus_word"}, where ``source`` is
        "headword", "conjugation", "inflection" or "upgrade:inflection".
    """
    ranks = freq_corpus if raw_corpus is None else raw_corpus
    form_index = _build_form_index(words)
    ordered_corpus = sorted(freq_corpus.items(), key=lambda item: item[1])

    assignments: dict[str, dict] = {}  # unique_key -> {rank, source, corpus_word}
    claimed_by_headword: set[str] = set()

    # --- Tier 1: headword matches ---
    # Walk the corpus from most to least frequent. Within a homograph group,
    # _should_get_frequency decides which entries are eligible.
    for token, clean_rank in ordered_corpus:
        headword_hits = [hit for hit in form_index.get(token, []) if hit[1] == "headword"]
        if not headword_hits:
            continue

        rank = ranks.get(token, clean_rank)
        claimed = False
        for key, _kind in headword_hits:
            if key in assignments:
                continue
            if not _should_get_frequency(words[key], headword_hits, token, words):
                continue
            assignments[key] = {"rank": rank, "source": "headword", "corpus_word": token}
            claimed = True

        if claimed:
            claimed_by_headword.add(token)

    tier1_count = len(assignments)
    logger.info("Tier 1 (headword): %d entries assigned", tier1_count)

    # --- Tier 2: conjugation/inflection matches ---
    # Only corpus words NOT claimed in tier 1 are considered. If a corpus word
    # matches any inflection it belongs to the inflections; conjugations may
    # claim it only when no inflection matches.
    for token, clean_rank in ordered_corpus:
        if token in claimed_by_headword:
            continue

        hits = form_index.get(token, [])
        inflection_hits = [hit for hit in hits if hit[1] == "inflection"]
        candidates = inflection_hits or [hit for hit in hits if hit[1] == "conjugation"]
        if not candidates:
            continue

        rank = ranks.get(token, clean_rank)

        for key, kind in candidates:
            current = assignments.get(key)
            if current is None:
                # New assignment — conjugations only allowed for rank > 5000
                # (too many false positives in the important tiers)
                if kind == "conjugation" and rank <= 5000:
                    continue
                assignments[key] = {"rank": rank, "source": kind, "corpus_word": token}
                break  # one corpus word feeds at most one entry in tier 2
            if upgrade and kind == "inflection" and rank < current["rank"]:
                # Upgrade — only allowed for inflections (conjugations collide too much)
                assignments[key] = {
                    "rank": rank,
                    "source": f"upgrade:{kind}",
                    "corpus_word": token,
                }
                break

    tier2_count = len(assignments) - tier1_count
    logger.info("Tier 2 (conjugation/inflection): %d entries assigned", tier2_count)

    return assignments
|
||||
|
||||
|
||||
def print_stats(words: dict, assignments: dict, freq_corpus: dict) -> None:
    """Print detailed statistics about frequency assignment.

    Must run before the new ranks are written into *words*: "previously had",
    "newly gained" and "would LOSE" are all derived by comparing
    *assignments* with each entry's current ``frequency`` value.

    Args:
        words: words.json content, unique_key -> entry.
        assignments: result of assign_frequencies
            (unique_key -> {"rank", "source", "corpus_word"}).
        freq_corpus: cleaned corpus; only its size is reported.
    """
    from collections import Counter

    def _ktiv(entry: dict) -> str:
        # "word" (or its ktiv_male) may be stored as None; _build_form_index
        # tolerates that, so the report must not crash on it either.
        return (entry.get("word") or {}).get("ktiv_male") or ""

    total = len(words)
    assigned = len(assignments)
    previously_had = sum(1 for e in words.values() if e.get("frequency") is not None)

    # Entries that get a rank now but had none before.
    newly = sorted(
        (a["rank"], k, _ktiv(words[k]), a["source"], a["corpus_word"])
        for k, a in assignments.items()
        if words[k].get("frequency") is None
    )
    # Entries that had a rank before and are not assigned one now.
    lost = sorted(
        (v["frequency"], k, _ktiv(v))
        for k, v in words.items()
        if v.get("frequency") is not None and k not in assignments
    )

    print(f"\n{'=' * 60}")
    print("Frequency Assignment Statistics")
    print(f"{'=' * 60}")
    print(f"Words.json entries: {total}")
    print(f"Clean corpus size: {len(freq_corpus)}")
    print(f"Previously had freq: {previously_had}")
    print(f"Now assigned: {assigned}")
    # Count actual gains; `assigned - previously_had` is only the net change
    # and goes negative when more entries lose a rank than gain one.
    print(f"Newly gained: {len(newly)}")
    print(f"Still unlisted: {total - assigned}")

    # By tier
    by_source = Counter(a["source"] for a in assignments.values())
    print("\nBy assignment tier:")
    print(f" Tier 1 (headword): {by_source['headword']}")
    print(f" Tier 2 (conjugation): {by_source['conjugation']}")
    print(f" Tier 2 (inflection): {by_source['inflection']}")

    # By PoS
    print("\nBy PoS:")
    pos_assigned = Counter()
    pos_total = Counter()
    for k, v in words.items():
        pos = _get_pos_tag(v)
        pos_total[pos] += 1
        if k in assignments:
            pos_assigned[pos] += 1
    pos_order = [
        "כינויי_גוף",
        "מילות_חיבור",
        "שם_תואר",
        "מילית",
        "שם_עצם",
        "תוארי_הפועל",
        "מילות_יחס",
        "פעלים",
        "unknown",
    ]
    priority = {name: i for i, name in enumerate(pos_order)}
    # Tags outside pos_order sort last (stable, in first-seen order).
    for pos in sorted(pos_total, key=lambda p: priority.get(p, 99)):
        a = pos_assigned[pos]
        t = pos_total[pos]
        pct = a / t * 100 if t else 0
        print(f" {pos:20s}: {a:5d}/{t:5d} ({pct:.0f}%)")

    # By frequency tier (using apkg_builder tiers)
    print("\nBy frequency tier:")
    tiers = {
        "Core (1-500)": (1, 500),
        "Essential (501-1500)": (501, 1500),
        "Intermediate (1501-3000)": (1501, 3000),
        "Upper-intermediate (3001-5000)": (3001, 5000),
        "Advanced (5001-10000)": (5001, 10000),
        "Rare (10001+)": (10001, 999999),
    }
    for label, (lo, hi) in tiers.items():
        count = sum(1 for a in assignments.values() if lo <= a["rank"] <= hi)
        print(f" {label:35s}: {count}")

    # Top 20 newly assigned (entries that didn't have frequency before)
    if newly:
        print("\nTop 20 newly assigned entries:")
        for rank, _key, ktiv, source, corpus_word in newly[:20]:
            print(f" rank {rank:5d}: {ktiv:15s} via {source:12s} (corpus: {corpus_word})")

    # Entries that LOST frequency (had it before, not assigned now)
    if lost:
        print(f"\nEntries that would LOSE frequency ({len(lost)} total):")
        for rank, _key, ktiv in lost[:20]:
            print(f" was rank {rank:5d}: {ktiv}")
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: compute frequency ranks and store them in words.json.

    Loads the cleaned corpus (CLEAN_CACHE) when present, otherwise the raw
    one (RAW_CACHE). When the cleaned corpus is in use and the raw one also
    exists, the raw corpus supplies the rank numbers. Statistics are always
    printed; with --stats or --dry-run nothing is written. Otherwise every
    entry's ``frequency`` is set to its assigned rank, or reset to None when
    it was not assigned one, and words.json is rewritten.
    """
    cli = argparse.ArgumentParser(description="Assign frequency to words.json")
    cli.add_argument("--dry-run", action="store_true", help="Preview without saving")
    cli.add_argument("--stats", action="store_true", help="Show statistics only")
    cli.add_argument(
        "--upgrade",
        action="store_true",
        help="Allow tier 2 to upgrade headword rank from conjugated forms",
    )
    args = cli.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    # Load data
    freq_path = CLEAN_CACHE if CLEAN_CACHE.exists() else RAW_CACHE
    logger.info("Loading frequency corpus: %s", freq_path)
    freq_corpus: dict[str, int] = json.loads(freq_path.read_text(encoding="utf-8"))

    # Load raw corpus for original rank numbers (with gaps)
    raw_corpus: dict[str, int] | None = None
    if RAW_CACHE.exists() and freq_path != RAW_CACHE:
        raw_corpus = json.loads(RAW_CACHE.read_text(encoding="utf-8"))
        logger.info("Using original ranks from %s", RAW_CACHE)

    words: dict = json.loads(WORDS_JSON.read_text(encoding="utf-8"))

    logger.info("Corpus: %d entries, Words.json: %d entries", len(freq_corpus), len(words))

    # Run assignment
    assignments = assign_frequencies(words, freq_corpus, raw_corpus, upgrade=args.upgrade)

    # Stats (must precede the update below: they compare against old values)
    print_stats(words, assignments, freq_corpus)

    if args.dry_run:
        logger.info("Dry run — no changes saved")
    if args.stats or args.dry_run:
        return

    # Apply to words.json: assigned entries get their rank, all others None.
    changed = 0
    for key, entry in words.items():
        assignment = assignments.get(key)
        new_rank = None if assignment is None else assignment["rank"]
        if entry.get("frequency") != new_rank:
            entry["frequency"] = new_rank
            changed += 1

    with open(WORDS_JSON, "w", encoding="utf-8") as out:
        json.dump(words, out, ensure_ascii=False, indent=2)

    logger.info("Updated %d entries in words.json", changed)
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,269 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Assign pseudo-frequency to confusable groups using English word frequency.
|
||||
|
||||
Problem: Confusable entries share the same ktiv_male and thus the same Hebrew
|
||||
frequency rank. This script uses English frequency to differentiate them so
|
||||
Anki sorts more-common meanings first.
|
||||
|
||||
Algorithm:
|
||||
1. For each confusable group where all entries share the same Hebrew frequency,
|
||||
extract the first meaningful English keyword from each entry's meaning field.
|
||||
2. Look up English frequency rank for each keyword.
|
||||
3. Assign pseudo_frequency: the most frequent English meaning keeps the original
|
||||
Hebrew rank; less frequent meanings get progressively higher (worse) ranks
|
||||
by adding an offset (100 * position in group).
|
||||
|
||||
Usage:
|
||||
python3 scripts/assign_pseudo_frequency.py # assign and save
|
||||
python3 scripts/assign_pseudo_frequency.py --dry-run # preview only
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PROJECT_ROOT = Path(__file__).parent.parent
|
||||
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
|
||||
EN_FREQ_PATH = PROJECT_ROOT / "data" / "en_50k.txt"
|
||||
|
||||
# Words too common/vague to use as frequency signal
|
||||
_EN_STOP = frozenset(
|
||||
{
|
||||
"to",
|
||||
"be",
|
||||
"a",
|
||||
"an",
|
||||
"the",
|
||||
"of",
|
||||
"in",
|
||||
"on",
|
||||
"at",
|
||||
"for",
|
||||
"and",
|
||||
"with",
|
||||
"by",
|
||||
"or",
|
||||
"but",
|
||||
"not",
|
||||
"as",
|
||||
"its",
|
||||
"it",
|
||||
"is",
|
||||
"was",
|
||||
"are",
|
||||
"from",
|
||||
"that",
|
||||
"this",
|
||||
"have",
|
||||
"has",
|
||||
"had",
|
||||
"do",
|
||||
"does",
|
||||
"did",
|
||||
"will",
|
||||
"would",
|
||||
"can",
|
||||
"could",
|
||||
"may",
|
||||
"might",
|
||||
"shall",
|
||||
"should",
|
||||
"must",
|
||||
"no",
|
||||
"yes",
|
||||
"very",
|
||||
"too",
|
||||
"also",
|
||||
"just",
|
||||
"only",
|
||||
"so",
|
||||
"up",
|
||||
"out",
|
||||
"into",
|
||||
"over",
|
||||
"after",
|
||||
"before",
|
||||
"about",
|
||||
"more",
|
||||
"than",
|
||||
"other",
|
||||
"some",
|
||||
"any",
|
||||
"all",
|
||||
"each",
|
||||
"every",
|
||||
"both",
|
||||
"few",
|
||||
"many",
|
||||
"much",
|
||||
"most",
|
||||
"such",
|
||||
"own",
|
||||
"same",
|
||||
"well",
|
||||
"still",
|
||||
"even",
|
||||
"how",
|
||||
"what",
|
||||
"when",
|
||||
"where",
|
||||
"which",
|
||||
"who",
|
||||
"whom",
|
||||
"whose",
|
||||
"why",
|
||||
"because",
|
||||
"if",
|
||||
"then",
|
||||
"else",
|
||||
"while",
|
||||
"until",
|
||||
"though",
|
||||
"whether",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def _load_en_freq() -> dict[str, int]:
    """Read the English frequency list and map each word to its rank.

    The file at ``EN_FREQ_PATH`` is read line by line. Only the first
    whitespace-separated token of a line is used, lowercased. Ranks start at 1
    for the first word seen; a word that appears again keeps the rank of its
    first occurrence, and blank lines are ignored.
    """
    ranks: dict[str, int] = {}
    with open(EN_FREQ_PATH, encoding="utf-8") as handle:
        for raw_line in handle:
            tokens = raw_line.split()
            if not tokens:
                continue
            # setdefault leaves an existing rank alone; len(ranks) + 1 is the
            # next unused rank because ranks are handed out consecutively.
            ranks.setdefault(tokens[0].lower(), len(ranks) + 1)
    return ranks
|
||||
|
||||
|
||||
def _extract_keywords(meaning: str) -> list[str]:
    """Pull candidate English keywords out of a meaning string.

    Parenthesised asides are removed first, then every character that is
    neither a word character nor whitespace is replaced by a space. The
    remaining tokens are returned lowercased and in their original order,
    leaving out tokens of one or two characters and anything in ``_EN_STOP``.
    """
    without_parens = re.sub(r"\([^)]*\)", " ", meaning)
    words_only = re.sub(r"[^\w\s]", " ", without_parens)

    keywords: list[str] = []
    for token in words_only.split():
        lowered = token.lower()
        if len(token) <= 2 or lowered in _EN_STOP:
            continue
        keywords.append(lowered)
    return keywords
|
||||
|
||||
|
||||
def assign_pseudo_frequencies(
    words: dict,
    en_freq: dict[str, int],
    dry_run: bool = False,
) -> int:
    """Assign ``pseudo_frequency`` to the members of confusable groups.

    Entries are grouped by their ``confusables_guid``. A group is processed
    only when all of its members share one ``frequency`` value (possibly
    ``None``) and their English ranks are not all identical. Members are
    ordered by English rank (ties broken by entry key) and receive
    ``frequency + 100 * position``; when the shared frequency is ``None`` they
    receive ``50000 + english_rank`` instead.

    Args:
        words: Parsed words.json mapping; entries are updated in place unless
            ``dry_run`` is set.
        en_freq: English word -> rank, where a lower rank means more common.
        dry_run: When true nothing is written; each would-be assignment is
            logged instead.

    Returns:
        Number of entries that received (or, in a dry run, would receive) a
        ``pseudo_frequency``.
    """
    no_english_rank = 999_999  # stands in when no keyword is found in en_freq

    # Group entry keys by confusables_guid
    groups: dict[str, list[str]] = defaultdict(list)
    for key, entry in words.items():
        cg = entry.get("confusables_guid")
        if cg:
            groups[cg].append(key)

    changes = 0
    assigned_groups = 0
    skipped_diff = 0
    skipped_no_en = 0

    for keys in groups.values():
        freqs = [words[k].get("frequency") for k in keys]

        # Skip groups that are already differentiated
        if len(set(freqs)) > 1:
            skipped_diff += 1
            continue

        base_freq = freqs[0]  # all members share this value (may be None)

        # English rank per entry: the first of its leading five keywords that
        # appears in en_freq decides.
        en_ranks: list[tuple[int, str]] = []
        for key in keys:
            # `or ""` also covers an explicit null meaning, which
            # .get("meaning", "") would pass through as None.
            keywords = _extract_keywords(words[key].get("meaning") or "")
            en_rank = next(
                (en_freq[kw] for kw in keywords[:5] if en_freq.get(kw) is not None),
                no_english_rank,
            )
            en_ranks.append((en_rank, key))

        # Lower rank = more common; ties fall back to the entry key
        en_ranks.sort()

        # Identical ranks for every member (including single-member groups)
        # give no ordering signal.
        if len({r for r, _ in en_ranks}) <= 1:
            skipped_no_en += 1
            continue

        assigned_groups += 1

        # Most common meaning keeps the base rank; the rest are pushed back
        for position, (en_rank, key) in enumerate(en_ranks):
            pseudo = base_freq + position * 100 if base_freq is not None else 50000 + en_rank
            changes += 1  # counted in both modes so a dry run can report it

            if dry_run:
                meaning = (words[key].get("meaning") or "")[:40]
                logger.info(
                    " [en:%5d] pseudo=%6d %s",
                    en_rank,
                    pseudo,
                    meaning,
                )
            else:
                words[key]["pseudo_frequency"] = pseudo

    logger.info(
        "Pseudo-frequency: %d groups assigned, %d already differentiated, %d no English signal",
        assigned_groups,
        skipped_diff,
        skipped_no_en,
    )
    return changes
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: load data, assign pseudo-frequencies, save.

    With ``--dry-run`` the assignment is only previewed and ``WORDS_JSON`` is
    left untouched; otherwise the updated entries are written back to it.
    """
    cli = argparse.ArgumentParser(description="Assign pseudo-frequency to confusables")
    cli.add_argument("--dry-run", action="store_true", help="Preview without saving")
    options = cli.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    logger.info("Loading English frequency data: %s", EN_FREQ_PATH)
    english_ranks = _load_en_freq()
    logger.info("English frequency: %d entries", len(english_ranks))

    with open(WORDS_JSON, encoding="utf-8") as src:
        words: dict = json.load(src)

    n_changes = assign_pseudo_frequencies(words, english_ranks, dry_run=options.dry_run)

    if not options.dry_run:
        with open(WORDS_JSON, "w", encoding="utf-8") as dst:
            json.dump(words, dst, ensure_ascii=False, indent=2)
        logger.info("Saved %d pseudo-frequency assignments to words.json", n_changes)
        return

    logger.info("Dry run — %d changes would be made", n_changes)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,212 +0,0 @@
|
|||
"""Check that every GUID in the last-release complete .apkg exists in words.json.
|
||||
|
||||
Extracts GUIDs from the Anki SQLite database inside the .apkg (zip) file,
|
||||
then compares against all GUID fields stored in data/words.json.
|
||||
|
||||
Usage:
|
||||
python3 scripts/check_guid_coverage.py
|
||||
python3 scripts/check_guid_coverage.py --apkg output/hebrew_complete.apkg
|
||||
python3 scripts/check_guid_coverage.py --verbose
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sqlite3
|
||||
import sys
|
||||
import tempfile
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
PROJECT_ROOT = Path(__file__).parent.parent
|
||||
DEFAULT_APKG = PROJECT_ROOT / "output" / "hebrew_complete.apkg"
|
||||
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
|
||||
|
||||
# Known model IDs (from apkg_builder.py)
# Keys are `mid` values as read from the apkg's `notes` table; values are the
# deck labels main() uses to choose which words.json GUID set to compare with.
MODEL_IDS = {
    1701222017968: "vocab",
    1234567893: "conjugation",
    1234567897: "plurals",
    1234567895: "confusables",
}
|
||||
|
||||
|
||||
def extract_apkg_guids(apkg_path: Path) -> dict[int, set[str]]:
    """Extract note GUIDs from an .apkg archive, grouped by model ID.

    Only the ``collection.anki2`` member is unpacked, into a temporary
    directory that is removed afterwards; other archive members are never
    written to disk.

    Args:
        apkg_path: Path to the .apkg (zip) file.

    Returns:
        Mapping of ``notes.mid`` to the set of ``notes.guid`` values that
        carry that mid.

    Raises:
        KeyError: The archive has no ``collection.anki2`` member.
        sqlite3.Error: The database cannot be queried (e.g. no ``notes`` table).
    """
    by_model: dict[int, set[str]] = {}
    with zipfile.ZipFile(apkg_path) as archive, tempfile.TemporaryDirectory() as td:
        db_path = archive.extract("collection.anki2", td)
        conn = sqlite3.connect(db_path)
        try:
            for guid, mid in conn.execute("SELECT guid, mid FROM notes"):
                by_model.setdefault(mid, set()).add(guid)
        finally:
            # Close even when the query fails: a lingering handle on the
            # database file can get in the way of temp-directory cleanup.
            conn.close()
    return by_model
|
||||
|
||||
|
||||
def collect_words_json_guids(data: dict[str, Any]) -> dict[str, set[str]]:
    """Gather every GUID stored in words.json, bucketed by deck type.

    Locations read for each entry:
      * ``vocab_legacy_guid`` -> ``"vocab"``
      * ``examples.cloze.cloze_guid`` -> ``"cloze"``
      * ``noun_inflection.plurals_guid`` -> ``"plurals"``
      * ``confusables_guid`` -> ``"confusables"``
      * ``guid`` and every item of ``guid_candidates`` on each form in
        ``conjugation.active_forms`` / ``conjugation.hufal_pual_forms``
        -> ``"conjugation"``

    Missing or falsy values are ignored at every level.
    """
    buckets: dict[str, set[str]] = {
        "vocab": set(),
        "cloze": set(),
        "conjugation": set(),
        "plurals": set(),
        "confusables": set(),
    }

    for record in data.values():
        vocab_guid = record.get("vocab_legacy_guid")
        if vocab_guid:
            buckets["vocab"].add(vocab_guid)

        cloze_guid = ((record.get("examples") or {}).get("cloze") or {}).get("cloze_guid")
        if cloze_guid:
            buckets["cloze"].add(cloze_guid)

        plurals_guid = (record.get("noun_inflection") or {}).get("plurals_guid")
        if plurals_guid:
            buckets["plurals"].add(plurals_guid)

        confusables_guid = record.get("confusables_guid")
        if confusables_guid:
            buckets["confusables"].add(confusables_guid)

        conjugation = record.get("conjugation") or {}
        for list_name in ("active_forms", "hufal_pual_forms"):
            for form in conjugation.get(list_name) or ():
                form_guid = form.get("guid")
                if form_guid:
                    buckets["conjugation"].add(form_guid)
                buckets["conjugation"].update(form.get("guid_candidates") or ())

    return buckets
|
||||
|
||||
|
||||
def main() -> None:
    """Compare the GUIDs in an .apkg with those recorded in words.json.

    A per-deck PASS/FAIL summary is printed for every model in ``MODEL_IDS``;
    a deck fails when the apkg holds GUIDs that words.json does not. Model IDs
    found in the apkg but absent from ``MODEL_IDS`` are reported as a warning.

    Exit status: 0 when every listed deck passes, 1 when any apkg GUID is
    missing from words.json, 2 when an input file does not exist.
    """
    parser = argparse.ArgumentParser(description="Check GUID coverage between .apkg and words.json")
    parser.add_argument(
        "--apkg",
        type=Path,
        default=DEFAULT_APKG,
        help=f"Path to .apkg file (default: {DEFAULT_APKG})",
    )
    parser.add_argument("--verbose", "-v", action="store_true")
    args = parser.parse_args()

    if not args.apkg.exists():
        print(f"ERROR: apkg not found: {args.apkg}")
        sys.exit(2)
    if not WORDS_JSON.exists():
        print(f"ERROR: words.json not found: {WORDS_JSON}")
        sys.exit(2)

    print(f"Checking: {args.apkg}")
    print(f"Against: {WORDS_JSON}")
    print()

    apkg_by_model = extract_apkg_guids(args.apkg)
    # Context manager so the words.json handle is closed right after parsing
    with WORDS_JSON.open(encoding="utf-8") as fh:
        data = json.load(fh)
    wj = collect_words_json_guids(data)

    total_apkg = sum(len(s) for s in apkg_by_model.values())
    total_wj = sum(len(s) for s in wj.values())
    print(f"Total GUIDs in apkg: {total_apkg}")
    print(f"Total GUIDs in words.json: {total_wj}")
    print()

    # words.json GUID set to compare against for each deck label.
    # Vocab notes cover both vocab cards (ord 0,1) and cloze (ord 2); they
    # share the note GUID — vocab_legacy_guid IS the note guid.
    wj_by_deck = {
        "vocab": wj["vocab"] | wj["cloze"],
        "conjugation": wj["conjugation"],
        "plurals": wj["plurals"],
        "confusables": wj["confusables"],
    }

    all_missing = 0

    for mid, deck_name in MODEL_IDS.items():
        apkg_set = apkg_by_model.get(mid, set())
        wj_set = wj_by_deck.get(deck_name, set())

        missing = apkg_set - wj_set
        extra = wj_set - apkg_set
        matched = apkg_set & wj_set
        all_missing += len(missing)

        # Only GUIDs missing from words.json fail a deck; extras are reported
        status = "PASS" if not missing else "FAIL"
        print(f" {status} {deck_name} (mid={mid})")
        print(
            f" apkg={len(apkg_set)}, words.json={len(wj_set)}, "
            f"matched={len(matched)}, missing={len(missing)}, extra={len(extra)}"
        )

        if missing and args.verbose:
            print(" Missing GUIDs (in apkg, not in words.json):")
            for g in sorted(missing)[:20]:
                print(f" {g!r}")
            if len(missing) > 20:
                print(f" ... ({len(missing) - 20} more)")

        if extra and args.verbose:
            print(" Extra GUIDs (in words.json, not in apkg):")
            for g in sorted(extra)[:10]:
                print(f" {g!r}")
            if len(extra) > 10:
                print(f" ... ({len(extra) - 10} more)")

        print()

    # Check for unknown model IDs in apkg
    unknown_mids = set(apkg_by_model.keys()) - set(MODEL_IDS.keys())
    if unknown_mids:
        print(f" WARNING: Unknown model IDs in apkg: {unknown_mids}")
        for mid in unknown_mids:
            print(f" mid={mid}: {len(apkg_by_model[mid])} notes")

    print("─" * 60)
    if all_missing:
        print(f" FAILED: {all_missing} apkg GUIDs not found in words.json")
        print(" (These notes would lose study progress on reimport)")
        sys.exit(1)
    else:
        print(f" All {total_apkg} apkg GUIDs accounted for in words.json.")
        sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,400 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Clean the Hebrew frequency corpus by removing prefix+word combinations.
|
||||
|
||||
Two modes:
|
||||
--mode yap (default) Use YAP morphological analyzer for accurate prefix detection.
|
||||
Requires YAP API running at localhost:8000.
|
||||
--mode heuristic Use rule-based prefix stripping (no external dependencies).
|
||||
|
||||
Both modes preserve words that exist as known dictionary forms in words.json.
|
||||
|
||||
Usage:
|
||||
python3 scripts/clean_frequency_corpus.py # YAP mode
|
||||
python3 scripts/clean_frequency_corpus.py --mode heuristic # heuristic fallback
|
||||
python3 scripts/clean_frequency_corpus.py --dry-run # preview only
|
||||
python3 scripts/clean_frequency_corpus.py --resume # resume YAP from checkpoint
|
||||
python3 scripts/clean_frequency_corpus.py --limit 1000 # process first N entries
|
||||
|
||||
Input: data/frequency_cache.json (raw he_50k.txt, 49999 entries)
|
||||
Output: data/frequency_clean.json (filtered, prefix combos removed)
|
||||
data/frequency_discarded.json (discarded entries with reason)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PROJECT_ROOT = Path(__file__).parent.parent
|
||||
RAW_CACHE = PROJECT_ROOT / "data" / "frequency_cache.json"
|
||||
CLEAN_CACHE = PROJECT_ROOT / "data" / "frequency_clean.json"
|
||||
DISCARDED = PROJECT_ROOT / "data" / "frequency_discarded.json"
|
||||
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
|
||||
CHECKPOINT = PROJECT_ROOT / "data" / "_yap_checkpoint.json"
|
||||
|
||||
YAP_URL = os.environ.get("YAP_URL", "http://localhost:8000/yap/heb/joint")
|
||||
YAP_TIMEOUT = 10
|
||||
BATCH_SAVE_INTERVAL = 500
|
||||
|
||||
# --- YAP mode constants ---
|
||||
# POS tags that indicate a prefix
|
||||
PREFIX_POS = frozenset({"PREPOSITION", "CONJ", "DEF", "REL"})
|
||||
# POS tags for the host word that make the combo a false positive
|
||||
HOST_POS = frozenset({"NN", "NNP", "NNT", "PRP", "CD", "DT", "EX"})
|
||||
|
||||
# --- Heuristic mode constants ---
|
||||
# Hebrew prefix combinations, longest first for greedy matching.
|
||||
PREFIXES = [
|
||||
# 4-char
|
||||
"וכשמ",
|
||||
"וכשב",
|
||||
"וכשל",
|
||||
"וכשה",
|
||||
# 3-char
|
||||
"וכש",
|
||||
"ומה",
|
||||
"ובה",
|
||||
"וכה",
|
||||
"ולה",
|
||||
"ומש",
|
||||
"ובש",
|
||||
"וכב",
|
||||
"ולב",
|
||||
"ומב",
|
||||
"וכל",
|
||||
"ולכ",
|
||||
"שבה",
|
||||
"שמה",
|
||||
# 2-char
|
||||
"כש",
|
||||
"מה",
|
||||
"בה",
|
||||
"כה",
|
||||
"לה",
|
||||
"מש",
|
||||
"בש",
|
||||
"וב",
|
||||
"וה",
|
||||
"וכ",
|
||||
"ול",
|
||||
"ומ",
|
||||
"וש",
|
||||
"כב",
|
||||
"לב",
|
||||
"מב",
|
||||
"כל",
|
||||
"לכ",
|
||||
"שב",
|
||||
"שה",
|
||||
"שכ",
|
||||
"של",
|
||||
"שמ",
|
||||
# 1-char
|
||||
"ב",
|
||||
"ה",
|
||||
"ו",
|
||||
"כ",
|
||||
"ל",
|
||||
"מ",
|
||||
"ש",
|
||||
]
|
||||
MIN_REMAINDER_LEN = 2
|
||||
|
||||
|
||||
def _load_known_forms(words_path: Path) -> set[str]:
    """Load all known ktiv_male spellings from words.json.

    Collected per entry:
      * ``word.ktiv_male``
      * ``ktiv_male`` of each dict in ``active_forms`` / ``hufal_pual_forms``,
        looked up on the entry itself and under ``entry["conjugation"]``
      * ``ktiv_male`` of each dict value inside ``noun_inflection``,
        ``preposition_inflection`` and ``adjective_inflection``

    Args:
        words_path: Location of words.json.

    Returns:
        The set of spellings found; an empty set (after a warning) when the
        file does not exist.
    """
    if not words_path.exists():
        logger.warning("words.json not found at %s — no dictionary filter", words_path)
        return set()

    with open(words_path, encoding="utf-8") as f:
        words = json.load(f)

    known: set[str] = set()
    for entry in words.values():
        if km := (entry.get("word") or {}).get("ktiv_male"):
            known.add(km)

        # NOTE(review): other tooling in this repository reads the form lists
        # from entry["conjugation"] rather than from the entry itself. Both
        # places are checked here — confirm which layout words.json uses.
        containers = [entry]
        conjugation = entry.get("conjugation")
        if isinstance(conjugation, dict):
            containers.append(conjugation)
        for container in containers:
            for list_key in ("active_forms", "hufal_pual_forms"):
                for form in container.get(list_key) or []:
                    if isinstance(form, dict) and (km := form.get("ktiv_male")):
                        known.add(km)

        for field in ("noun_inflection", "preposition_inflection", "adjective_inflection"):
            for inf_data in (entry.get(field) or {}).values():
                # Non-dict values stored in these blocks are skipped
                if isinstance(inf_data, dict) and (km := inf_data.get("ktiv_male")):
                    known.add(km)

    logger.info("Loaded %d known dictionary forms from words.json", len(known))
    return known
|
||||
|
||||
|
||||
# ── YAP mode ──────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def query_yap(word: str) -> dict | None:
    """Send a single word to YAP and return the decoded JSON response.

    Args:
        word: The word to analyse; it is posted as the ``text`` field.

    Returns:
        The parsed JSON body, or ``None`` (after logging a warning) when the
        request fails, the server answers with an error status, or the body
        is not valid JSON.
    """
    # NOTE(review): YAP's published examples end the text with two spaces —
    # confirm the trailing whitespace here is what the server expects.
    payload = {"text": f"{word} "}
    try:
        resp = requests.post(YAP_URL, json=payload, timeout=YAP_TIMEOUT)
        resp.raise_for_status()
        return resp.json()
    except (requests.RequestException, ValueError) as e:
        # ValueError covers JSON decode failures from resp.json(), which some
        # requests releases raise outside the RequestException hierarchy.
        logger.warning("YAP request failed for '%s': %s", word, e)
        return None
|
||||
|
||||
|
||||
def is_prefix_combo_yap(yap_response: dict) -> tuple[bool, str]:
    """Report whether YAP's lattice contains a prefix arc followed by a host arc.

    The ``ma_lattice`` text is read as one arc per line with tab-separated
    columns (from, to, form, lemma, cpos, pos); lines with fewer than six
    columns are ignored. The word counts as a prefix combination as soon as
    any arc tagged with a ``PREFIX_POS`` tag ends where an arc tagged with a
    ``HOST_POS`` tag begins — one such analysis is enough.

    Returns:
        ``(True, "<prefix form>(<cpos>)+<host form>(<cpos>)")`` for the first
        matching pair, otherwise ``(False, "")``.
    """
    lattice = yap_response.get("ma_lattice", "")
    if not lattice:
        return False, ""

    columns = ("from", "to", "form", "lemma", "cpos", "pos")
    arcs = []
    for row in lattice.strip().split("\n"):
        if not row.strip():
            continue
        cells = row.split("\t")
        if len(cells) >= 6:
            # zip stops after the six named columns; extra cells are dropped
            arcs.append(dict(zip(columns, cells)))

    if len(arcs) < 2:
        return False, ""

    def tagged(arc: dict, tagset: frozenset) -> bool:
        return arc["cpos"] in tagset or arc["pos"] in tagset

    for prefix_arc in arcs:
        if not tagged(prefix_arc, PREFIX_POS):
            continue
        for host_arc in arcs:
            if host_arc["from"] == prefix_arc["to"] and tagged(host_arc, HOST_POS):
                return (
                    True,
                    f"{prefix_arc['form']}({prefix_arc['cpos']})+{host_arc['form']}({host_arc['cpos']})",
                )

    return False, ""
|
||||
|
||||
|
||||
# ── Heuristic mode ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def find_prefix_decomposition(word: str, freq: dict[str, int]) -> tuple[str, str] | None:
    """Find a prefix split of *word* whose remainder outranks the word itself.

    Prefixes are tried in the order they appear in ``PREFIXES``. A split is
    accepted when the remainder is at least ``MIN_REMAINDER_LEN`` characters
    long, is present in *freq*, and has a strictly lower (better) rank than
    *word*; a word missing from *freq* is treated as rank 999999.

    Returns:
        ``(prefix, remainder)`` for the first accepted split, or ``None`` when
        there is none or the word is not longer than ``MIN_REMAINDER_LEN``.
    """
    if len(word) <= MIN_REMAINDER_LEN:
        return None

    own_rank = freq.get(word, 999999)
    splits = ((p, word[len(p) :]) for p in PREFIXES if word.startswith(p))
    for prefix, stem in splits:
        if len(stem) >= MIN_REMAINDER_LEN and stem in freq and freq[stem] < own_rank:
            return prefix, stem

    return None
|
||||
|
||||
|
||||
# ── Main ──────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point for cleaning the frequency corpus.

    Loads the raw corpus from ``RAW_CACHE``, runs the selected detection mode
    over the words in rank order (optionally only the first ``--limit``), and
    — unless ``--dry-run`` is given — writes the re-ranked survivors to
    ``CLEAN_CACHE`` and the discarded entries to ``DISCARDED``.

    In both modes a word that is a known dictionary form in words.json is
    never discarded. Exits with status 1 when the raw cache is missing.
    """
    parser = argparse.ArgumentParser(description="Clean frequency corpus")
    parser.add_argument("--mode", choices=["yap", "heuristic"], default="yap", help="Detection mode")
    parser.add_argument("--dry-run", action="store_true", help="Show removals without saving")
    parser.add_argument("--resume", action="store_true", help="Resume YAP mode from checkpoint")
    parser.add_argument("--limit", type=int, default=0, help="Process only first N words (0=all)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    if not RAW_CACHE.exists():
        logger.error("Raw frequency cache not found: %s", RAW_CACHE)
        sys.exit(1)

    with open(RAW_CACHE, encoding="utf-8") as f:
        raw_freq: dict[str, int] = json.load(f)

    logger.info("Raw frequency corpus: %d entries", len(raw_freq))

    # Sort by rank
    words_by_rank = sorted(raw_freq.items(), key=lambda x: x[1])
    if args.limit:
        words_by_rank = words_by_rank[: args.limit]

    known_forms = _load_known_forms(WORDS_JSON)
    if args.mode == "yap":
        flagged = _run_yap_mode(words_by_rank, args)
        # The YAP pass does not consult words.json, so the dictionary filter
        # is applied to its result here.
        discarded_list = [d for d in flagged if d["word"] not in known_forms]
        rescued = len(flagged) - len(discarded_list)
        if rescued:
            logger.info("Kept %d known dictionary forms flagged by YAP", rescued)
    else:
        discarded_list = _run_heuristic_mode(words_by_rank, raw_freq, known_forms)

    kept_count = len(words_by_rank) - len(discarded_list)
    logger.info("Done. Kept: %d, Discarded: %d", kept_count, len(discarded_list))

    if args.dry_run:
        logger.info("Dry run — no files written")
        return

    # Build clean frequency dict (re-ranked without gaps)
    discarded_words = {d["word"] for d in discarded_list}
    survivors = (word for word, _rank in words_by_rank if word not in discarded_words)
    clean_freq: dict[str, int] = {word: new_rank for new_rank, word in enumerate(survivors, start=1)}

    with open(CLEAN_CACHE, "w", encoding="utf-8") as f:
        json.dump(clean_freq, f, ensure_ascii=False)
    logger.info("Clean frequency saved: %d entries → %s", len(clean_freq), CLEAN_CACHE)

    with open(DISCARDED, "w", encoding="utf-8") as f:
        json.dump(discarded_list, f, ensure_ascii=False, indent=2)
    logger.info("Discarded entries saved: %d → %s", len(discarded_list), DISCARDED)
|
||||
|
||||
|
||||
def _run_yap_mode(
    words_by_rank: list[tuple[str, int]],
    args: argparse.Namespace,
) -> list[dict]:
    """Run YAP-based prefix detection over the ranked word list.

    Every word longer than one character that is not pure ASCII is sent to
    YAP and discarded when ``is_prefix_combo_yap`` flags it. Results are
    checkpointed to ``CHECKPOINT`` every ``BATCH_SAVE_INTERVAL`` words (not in
    a dry run) and the checkpoint is removed once the run completes.

    A word whose YAP request fails is kept for this run but is not recorded
    in the checkpoint, so a later ``--resume`` asks YAP about it again.

    Args:
        words_by_rank: ``(word, rank)`` pairs in processing order.
        args: Parsed CLI options; ``resume`` and ``dry_run`` are read.

    Returns:
        One dict per discarded word with keys ``word``, ``original_rank`` and
        ``reason``.

    Exits with status 1 when the initial connectivity probe fails.
    """
    # Check YAP connectivity
    test = query_yap("בדיקה")
    if test is None:
        logger.error("Cannot connect to YAP API at %s", YAP_URL)
        sys.exit(1)
    logger.info("YAP API connected")

    # Load checkpoint if resuming
    analyzed: dict[str, dict] = {}
    if args.resume and CHECKPOINT.exists():
        with open(CHECKPOINT, encoding="utf-8") as f:
            loaded = json.load(f)
        # Checkpoints written by earlier versions may contain failed lookups;
        # leave them out so those words are retried.
        analyzed = {w: v for w, v in loaded.items() if v.get("reason") != "yap_error"}
        logger.info("Resumed from checkpoint: %d words already analyzed", len(analyzed))

    discarded_list: list[dict] = []
    kept_count = 0
    error_count = 0
    unsaved = False  # True while `analyzed` holds results not yet on disk
    total = len(words_by_rank)

    for i, (word, rank) in enumerate(words_by_rank):
        cached = analyzed.get(word)
        if cached is not None:
            # Already analyzed (from checkpoint)
            if cached["discard"]:
                discarded_list.append({"word": word, "original_rank": rank, "reason": cached["reason"]})
            else:
                kept_count += 1
        elif len(word) <= 1 or word.isascii():
            # Trivial: single char or ASCII — kept without asking YAP
            analyzed[word] = {"discard": False, "reason": ""}
            unsaved = True
            kept_count += 1
        else:
            result = query_yap(word)
            if result is None:
                # Keep the word, but do not record a verdict for it
                error_count += 1
                kept_count += 1
                time.sleep(0.5)
            else:
                is_combo, reason = is_prefix_combo_yap(result)
                analyzed[word] = {"discard": is_combo, "reason": reason}
                unsaved = True

                if is_combo:
                    discarded_list.append({"word": word, "original_rank": rank, "reason": reason})
                    if rank <= 500 or len(discarded_list) <= 50:
                        logger.info(" DISCARD rank %5d: %s (%s)", rank, word, reason)
                else:
                    kept_count += 1

                # Rate limit
                if i % 10 == 0:
                    time.sleep(0.01)

        # Checkpoint / progress: evaluated for every word so that an interval
        # boundary landing on a skipped word is not lost.
        if (i + 1) % BATCH_SAVE_INTERVAL == 0:
            if unsaved and not args.dry_run:
                with open(CHECKPOINT, "w", encoding="utf-8") as f:
                    json.dump(analyzed, f, ensure_ascii=False)
                unsaved = False
            logger.info(
                " [%d/%d] kept=%d discarded=%d errors=%d",
                i + 1,
                total,
                kept_count,
                len(discarded_list),
                error_count,
            )

    # Run finished: remove the checkpoint file
    if not args.dry_run and CHECKPOINT.exists():
        CHECKPOINT.unlink()

    if error_count:
        logger.warning("%d YAP errors encountered", error_count)

    return discarded_list
|
||||
|
||||
|
||||
def _run_heuristic_mode(
    words_by_rank: list[tuple[str, int]],
    raw_freq: dict[str, int],
    known_forms: set[str],
) -> list[dict]:
    """Detect prefix+word combinations with the rule-based splitter.

    Words of at most one character, pure-ASCII words and words present in
    *known_forms* are never discarded. Every other word is discarded when
    ``find_prefix_decomposition`` finds a split for it; the recorded reason
    names the prefix, the remainder and the remainder's rank in *raw_freq*.

    Returns:
        One dict per discarded word with keys ``word``, ``original_rank`` and
        ``reason``, in processing order.
    """
    discards: list[dict] = []

    for word, rank in words_by_rank:
        exempt = len(word) <= 1 or word.isascii() or word in known_forms
        if exempt:
            continue

        split = find_prefix_decomposition(word, raw_freq)
        if split is None:
            continue

        prefix, remainder = split
        reason = f"{prefix}+{remainder} (rank {raw_freq[remainder]})"
        discards.append({"word": word, "original_rank": rank, "reason": reason})
        # Log every discard among the top 500 ranks, plus the first 50 overall
        if rank <= 500 or len(discards) <= 50:
            logger.info(" DISCARD rank %5d: %s = %s", rank, word, reason)

    return discards
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,919 +0,0 @@
|
|||
"""Standalone integrity validator for data/words.json.
|
||||
|
||||
Validates the unified Hebrew Flash Cards data against the schema defined in
|
||||
SCHEMA.yaml. Each test prints PASS/FAIL with details on failures.
|
||||
|
||||
Usage:
|
||||
python3 scripts/validate_data.py
|
||||
python3 scripts/validate_data.py --verbose
|
||||
python3 scripts/validate_data.py --test confusable_symmetric
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Bootstrap: make project root importable so helpers.py is accessible
|
||||
# ---------------------------------------------------------------------------
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Constants
|
||||
# ---------------------------------------------------------------------------
|
||||
DATA_FILE = Path(__file__).parent.parent / "data" / "words.json"
|
||||
|
||||
HEBREW_CONSONANT_RANGE = (0x05D0, 0x05EA) # alef–tav
|
||||
|
||||
VALID_PERSON_CODES: frozenset[str] = frozenset(
|
||||
["inf", "1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp", "3p", "ms", "fs", "mp", "fp"]
|
||||
)
|
||||
|
||||
EMOJI_RE = re.compile(
|
||||
r"[\U0001f600-\U0001f64f"
|
||||
r"\U0001f300-\U0001f5ff"
|
||||
r"\U0001f680-\U0001f6ff"
|
||||
r"\U0001f1e0-\U0001f1ff"
|
||||
r"\U00002702-\U000027b0"
|
||||
r"\U0001f900-\U0001f9ff"
|
||||
r"\U0001fa00-\U0001fa6f"
|
||||
r"\U0001fa70-\U0001faff]"
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Result tracking
|
||||
# ---------------------------------------------------------------------------
|
||||
_failures: list[str] = []
|
||||
_warnings: list[str] = []
|
||||
_verbose: bool = False
|
||||
|
||||
|
||||
def _pass(name: str) -> None:
|
||||
print(f" PASS {name}")
|
||||
|
||||
|
||||
def _fail(name: str, details: list[str]) -> None:
    """Record *name* as a failed test and print it with its detail lines."""
    # append() mutates the module-level list in place, so no `global` is needed
    _failures.append(name)
    print(f" FAIL {name}")
    for line in details:
        print(f" {line}")
|
||||
|
||||
|
||||
def _warn(name: str, details: list[str]) -> None:
    """Store *details* in the module-level warnings list and print them under *name*."""
    # extend() mutates the module-level list in place, so no `global` is needed
    _warnings.extend(details)
    print(f" WARN {name}")
    for line in details:
        print(f" {line}")
|
||||
|
||||
|
||||
def _verbose_print(msg: str) -> None:
    """Print *msg* only when the module-level ``_verbose`` flag is set."""
    if not _verbose:
        return
    print(f" {msg}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helper: load data
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def load_data() -> dict[str, Any]:
    """Parse ``DATA_FILE`` and return its contents.

    Prints an error and exits with status 2 when the file does not exist.
    """
    if DATA_FILE.exists():
        with DATA_FILE.open(encoding="utf-8") as fh:
            return json.load(fh)

    print(f"ERROR: data file not found: {DATA_FILE}")
    sys.exit(2)
|
||||
|
||||
|
||||
def _is_hebrew_consonant(ch: str) -> bool:
    """Return True if *ch* starts with a Hebrew consonant (U+05D0..U+05EA).

    Multi-codepoint strings such as shin + shin dot are accepted: the string
    is NFD-normalized and only its first codepoint is checked against
    ``HEBREW_CONSONANT_RANGE``. An empty string yields False.
    """
    normalized = unicodedata.normalize("NFD", ch)
    if not normalized:
        # Nothing to inspect; indexing [0] would raise IndexError
        return False
    # The first codepoint is the base character; the rest are not examined
    low, high = HEBREW_CONSONANT_RANGE
    return low <= ord(normalized[0]) <= high
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Individual tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_required_fields(data: dict[str, Any]) -> None:
    """Check that every entry carries the mandatory fields.

    An entry must have a ``word`` dict with non-empty ``nikkud`` and
    ``ktiv_male``, plus non-empty ``slug``, ``pos`` and ``meaning``; anything
    else is reported as a failure. A null or missing ``frequency`` is reported
    as a warning under the name ``frequency_missing``. Unless verbose output
    is on, at most 20 details are shown per list.
    """
    name = "required_fields"
    errors: list[str] = []
    missing_frequency: list[str] = []

    for key, entry in data.items():
        word = entry.get("word")
        if isinstance(word, dict):
            for sub_field in ("nikkud", "ktiv_male"):
                if not word.get(sub_field):
                    errors.append(f"[{key}] word.{sub_field} is missing or empty")
        else:
            errors.append(f"[{key}] 'word' is missing or not a dict")

        for top_field in ("slug", "pos", "meaning"):
            if not entry.get(top_field):
                errors.append(f"[{key}] '{top_field}' is missing or empty")

        if entry.get("frequency") is None:
            missing_frequency.append(f"[{key}] 'frequency' is null/missing")

    if missing_frequency:
        shown = missing_frequency if _verbose else missing_frequency[:20]
        _warn("frequency_missing", shown)
        if not _verbose and len(missing_frequency) > 20:
            print(f" ... ({len(missing_frequency) - 20} more; use --verbose)")

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    if not _verbose and len(errors) > 20:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_root_format(data: dict[str, Any]) -> None:
    """Check that ``root`` is an empty list or a list of 2-5 Hebrew consonants.

    At most one error is recorded per entry: the first rule that is violated.
    """
    name = "root_format"
    problems: list[str] = []

    for key, entry in data.items():
        root = entry.get("root")

        if root is None:
            problems.append(f"[{key}] 'root' key is absent (should be [] for rootless words)")
        elif not isinstance(root, list):
            problems.append(f"[{key}] 'root' is not a list: {root!r}")
        elif not root:
            # Empty list marks a rootless word, which is valid.
            pass
        elif len(root) < 2 or len(root) > 5:
            problems.append(f"[{key}] root has {len(root)} elements (expected 2-5): {root!r}")
        else:
            for ch in root:
                # An element may span several codepoints (e.g. shin + shin dot);
                # _is_hebrew_consonant judges it by its base consonant.
                is_valid = isinstance(ch, str) and bool(ch) and _is_hebrew_consonant(ch)
                if not is_valid:
                    problems.append(f"[{key}] root char {ch!r} is not a Hebrew consonant (U+05D0..U+05EA)")
                    break

    if not problems:
        _pass(name)
        return

    _fail(name, problems if _verbose else problems[:20])
    if len(problems) > 20 and not _verbose:
        print(f" ... ({len(problems) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_unique_slugs(data: dict[str, Any]) -> None:
    """Check that no non-empty ``slug`` is used by more than one entry.

    Entries with an empty or missing slug are ignored here.
    """
    name = "unique_slugs"

    # slug -> keys of every entry that carries it
    owners: dict[str, list[str]] = {}
    for key, entry in data.items():
        slug = entry.get("slug")
        if not slug:
            continue
        owners.setdefault(slug, []).append(key)

    errors = [f"slug={slug!r} shared by: {keys}" for slug, keys in owners.items() if len(keys) > 1]

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_no_duplicate_keys(_data: dict[str, Any]) -> None:  # noqa: ARG001
    """Check that the raw JSON file contains no repeated object keys.

    ``json.load`` silently keeps the last value when a key is repeated, so the
    already-parsed ``_data`` cannot reveal the problem.  The file is therefore
    read again with an ``object_pairs_hook`` that sees every key/value pair
    before it is collapsed into a dict.  The hook is invoked for each JSON
    object in the file, so a repeated key inside a nested object is reported
    as well.
    """
    name = "no_duplicate_keys"
    repeated: list[str] = []

    def _collect(pairs: list[tuple[str, Any]]) -> dict[str, Any]:
        """Build the dict for one JSON object, recording any key seen twice."""
        built: dict[str, Any] = {}
        for pair_key, pair_value in pairs:
            if pair_key in built:
                repeated.append(pair_key)
            built[pair_key] = pair_value
        return built

    with DATA_FILE.open(encoding="utf-8") as fh:
        json.load(fh, object_pairs_hook=_collect)

    if not repeated:
        _pass(name)
        return

    _fail(name, [f"duplicate key: {k!r}" for k in repeated])
|
||||
|
||||
|
||||
def test_confusable_symmetric(data: dict[str, Any]) -> None:
    """Check that ``confusable_group`` links are mutual and point at real keys.

    For every key listed in an entry's ``confusable_group`` the referenced
    entry must exist and must list the referring entry back.
    """
    name = "confusable_symmetric"
    problems: list[str] = []

    for key, entry in data.items():
        for other_key in entry.get("confusable_group") or ():
            other = data.get(other_key)
            if other is None:
                problems.append(f"[{key}] confusable_group references non-existent key {other_key!r}")
                continue
            back_links = other.get("confusable_group") or []
            if key not in back_links:
                problems.append(
                    f"[{key}] lists {other_key!r} as confusable, but {other_key!r} does not list {key!r}"
                )

    if not problems:
        _pass(name)
        return

    _fail(name, problems if _verbose else problems[:20])
    if len(problems) > 20 and not _verbose:
        print(f" ... ({len(problems) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_shared_roots_valid_keys(data: dict[str, Any]) -> None:
    """Check that each key referenced in ``shared_roots`` is a top-level key of ``data``."""
    name = "shared_roots_valid_keys"

    dangling = [
        f"[{key}] shared_roots references non-existent key {ref_key!r}"
        for key, entry in data.items()
        for ref_key in (entry.get("shared_roots") or ())
        if ref_key not in data
    ]

    if not dangling:
        _pass(name)
        return

    _fail(name, dangling if _verbose else dangling[:20])
    if len(dangling) > 20 and not _verbose:
        print(f" ... ({len(dangling) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_unique_legacy_guids(data: dict[str, Any]) -> None:
    """Check that a non-null ``vocab_legacy_guid`` is not shared across different words.

    A GUID shared by several entries is tolerated when all of them have the
    same ``word.nikkud`` (PoS homographs that inherited one legacy card); that
    case is only mentioned through ``_verbose_print``.  Sharing between entries
    with differing nikkud is an error.
    """
    name = "unique_legacy_guids"

    # guid -> keys of the entries carrying it
    holders: dict[str, list[str]] = {}
    for key, entry in data.items():
        guid = entry.get("vocab_legacy_guid")
        if not guid:
            continue
        holders.setdefault(guid, []).append(key)

    errors: list[str] = []
    for guid, keys in holders.items():
        if len(keys) < 2:
            continue

        nikkud_values = {(data[k].get("word") or {}).get("nikkud") for k in keys}
        if len(nikkud_values) != 1:
            errors.append(f"guid={guid!r} shared by entries with DIFFERENT nikkud: {keys}")
            continue

        # Single distinct nikkud: the entries inherited the same legacy card.
        _verbose_print(
            f"GUID {guid!r} shared by {len(keys)} entries with same nikkud ({next(iter(nikkud_values))!r}): {keys}"
        )

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_no_noun_inflection_on_non_nouns(data: dict[str, Any]) -> None:
    """Check that ``noun_inflection`` is null unless ``pos`` starts with 'Noun'.

    Example of what must not happen: the adjective 'גָּבוֹהַּ' carrying a
    ``noun_inflection`` value.
    """
    name = "no_noun_inflection_on_non_nouns"
    offenders: list[str] = []

    for key, entry in data.items():
        pos = entry.get("pos") or ""
        if pos.startswith("Noun"):
            continue
        if entry.get("noun_inflection") is None:
            continue
        offenders.append(f"[{key}] pos={pos!r} but noun_inflection is set")
        _verbose_print(f"offending entry: {key!r}")

    if not offenders:
        _pass(name)
        return

    _fail(name, offenders if _verbose else offenders[:20])
    if len(offenders) > 20 and not _verbose:
        print(f" ... ({len(offenders) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_no_emoji_in_meaning(data: dict[str, Any]) -> None:
    """Check that no ``meaning`` string matches ``EMOJI_RE``."""
    name = "no_emoji_in_meaning"
    offenders: list[str] = []

    for key, entry in data.items():
        meaning = entry.get("meaning") or ""
        if EMOJI_RE.search(meaning) is not None:
            offenders.append(f"[{key}] meaning contains emoji: {meaning!r}")

    if not offenders:
        _pass(name)
        return

    _fail(name, offenders if _verbose else offenders[:20])
    if len(offenders) > 20 and not _verbose:
        print(f" ... ({len(offenders) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_example_sentences_contain_word(data: dict[str, Any]) -> None:
    """Warn when none of an entry's vetted sentences contains its ``word.nikkud``.

    Matching is an exact substring test on the pointed (nikkud) form.  Entries
    without vetted examples or without a nikkud form are skipped.  Findings
    are reported as warnings only; the test itself is always marked as passed.
    """
    name = "example_sentences_contain_word"
    misses: list[str] = []

    for key, entry in data.items():
        vetted = (entry.get("examples") or {}).get("vetted")
        if not vetted:
            continue

        nikkud_word = (entry.get("word") or {}).get("nikkud") or ""
        if not nikkud_word:
            continue

        if any(nikkud_word in (s.get("text") or "") for s in vetted):
            continue

        # Show at most the first two sentences to keep the message readable.
        sentences_preview = [s.get("text", "") for s in vetted[:2]]
        misses.append(
            f"[{key}] word {nikkud_word!r} not found in any vetted sentence. Sentences: {sentences_preview!r}"
        )

    if misses:
        _warn(name, misses if _verbose else misses[:20])
        if len(misses) > 20 and not _verbose:
            print(f" ... ({len(misses) - 20} more; use --verbose)")
    _pass(name)
|
||||
|
||||
|
||||
def test_cloze_offsets_valid(data: dict[str, Any]) -> None:
    """Check that ``cloze_word_start``/``cloze_word_end`` describe a span inside the cloze text.

    When both offsets are present they must be integers, non-negative, with
    ``start < end`` and ``end`` not beyond the length of ``text``.  A cloze
    whose offsets are null is tolerated and reported only as a warning under
    ``cloze_offsets_valid_null_offsets``.
    """
    name = "cloze_offsets_valid"
    problems: list[str] = []
    null_notes: list[str] = []

    for key, entry in data.items():
        cloze = (entry.get("examples") or {}).get("cloze")
        if not cloze:
            continue

        text = cloze.get("text") or ""
        start = cloze.get("cloze_word_start")
        end = cloze.get("cloze_word_end")

        if start is None or end is None:
            null_notes.append(f"[{key}] cloze present but cloze_word_start/end are null")
            continue

        text_len = len(text)
        # Only the first violated rule is reported for each entry.
        if not (isinstance(start, int) and isinstance(end, int)):
            problems.append(f"[{key}] cloze_word_start/end are not integers: {start!r}, {end!r}")
        elif start < 0 or end < 0:
            problems.append(f"[{key}] cloze offsets are negative: start={start}, end={end}")
        elif start >= end:
            problems.append(f"[{key}] cloze start >= end: start={start}, end={end}")
        elif end > text_len:
            problems.append(f"[{key}] cloze end={end} exceeds text length={text_len}: {text!r}")

    if null_notes:
        _warn(f"{name}_null_offsets", null_notes if _verbose else null_notes[:20])
        if len(null_notes) > 20 and not _verbose:
            print(f" ... ({len(null_notes) - 20} more; use --verbose)")

    if not problems:
        _pass(name)
        return

    _fail(name, problems if _verbose else problems[:20])
    if len(problems) > 20 and not _verbose:
        print(f" ... ({len(problems) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_hufal_pual_only_on_hifil_piel(data: dict[str, Any]) -> None:
    """Check that ``hufal_pual_forms`` is set only on Hif'il or Pi'el verbs.

    The binyan is recognised by a case-insensitive substring match on
    ``conjugation.binyan``: it must contain "hif" or "pi".
    """
    name = "hufal_pual_only_on_hifil_piel"
    problems: list[str] = []

    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj or conj.get("hufal_pual_forms") is None:
            continue

        binyan = conj.get("binyan") or ""
        lowered = binyan.lower()
        if not ("hif" in lowered or "pi" in lowered):
            problems.append(f"[{key}] hufal_pual_forms is set but binyan={binyan!r} (expected Hif'il or Pi'el)")

    if not problems:
        _pass(name)
        return

    _fail(name, problems if _verbose else problems[:20])
    if len(problems) > 20 and not _verbose:
        print(f" ... ({len(problems) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_confusable_group_shares_ktiv_male(data: dict[str, Any]) -> None:
    """Check that an entry and the members of its ``confusable_group`` agree on ``word.ktiv_male``.

    Entries or members without a ``ktiv_male`` value, and group members that
    do not exist in ``data``, are skipped.
    """
    name = "confusable_group_shares_ktiv_male"
    mismatches: list[str] = []

    for key, entry in data.items():
        group = entry.get("confusable_group")
        if not group:
            continue

        my_ktiv = (entry.get("word") or {}).get("ktiv_male")
        if not my_ktiv:
            continue

        for other_key in group:
            other = data.get(other_key)
            if not other:
                # Dangling references are reported by confusable_symmetric.
                continue
            other_ktiv = (other.get("word") or {}).get("ktiv_male")
            if not other_ktiv or other_ktiv == my_ktiv:
                continue
            mismatches.append(
                f"[{key}] ktiv_male={my_ktiv!r} but confusable member {other_key!r} has ktiv_male={other_ktiv!r}"
            )

    if not mismatches:
        _pass(name)
        return

    _fail(name, mismatches if _verbose else mismatches[:20])
    if len(mismatches) > 20 and not _verbose:
        print(f" ... ({len(mismatches) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_confusables_guid(data: dict[str, Any]) -> None:
    """Check that ``confusables_guid`` is present exactly when needed and agrees within a group.

    Rules:
      - An entry with a non-empty ``confusable_group`` needs a ``confusables_guid``.
      - An entry without a ``confusable_group`` must have ``confusables_guid`` null.
      - Every existing member listed in an entry's ``confusable_group`` must
        carry the same ``confusables_guid`` as that entry.
    """
    name = "confusables_guid"
    problems: list[str] = []

    for key, entry in data.items():
        group = entry.get("confusable_group")
        guid = entry.get("confusables_guid")

        if not group:
            if guid is not None:
                problems.append(f"[{key}] has confusables_guid={guid!r} but confusable_group is null")
            continue
        if not guid:
            problems.append(f"[{key}] has confusable_group but confusables_guid is null/missing")
            continue

        for other_key in group:
            other = data.get(other_key)
            if not other:
                # Dangling references are reported by confusable_symmetric.
                continue
            other_guid = other.get("confusables_guid")
            if other_guid == guid:
                continue
            problems.append(
                f"[{key}] confusables_guid={guid!r} but confusable member "
                f"{other_key!r} has confusables_guid={other_guid!r}"
            )

    if not problems:
        _pass(name)
        return

    _fail(name, problems if _verbose else problems[:20])
    if len(problems) > 20 and not _verbose:
        print(f" ... ({len(problems) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_conjugation_form_guids(data: dict[str, Any]) -> None:
    """Check GUID presence and per-verb uniqueness on conjugation forms.

    Rules:
      - A form in ``active_forms`` or ``hufal_pual_forms`` should have a
        ``guid`` or a non-empty ``guid_candidates`` list.  Forms with neither
        are only counted and reported as one summary warning.
      - Within one verb (both form lists together) no GUID may be used twice.
        When a form has a ``guid`` its candidates are not inspected; otherwise
        each candidate is checked.
    """
    name = "conjugation_form_guids"
    duplicates: list[str] = []
    missing: list[str] = []

    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj:
            continue

        # guid -> label of the first form that used it (reset for every verb)
        first_use: dict[str, str] = {}

        for form_list_key in ("active_forms", "hufal_pual_forms"):
            for form in conj.get(form_list_key) or ():
                person = form.get("person", "?")
                label = f"{form_list_key}[{person}]"
                guid = form.get("guid")
                guid_candidates = form.get("guid_candidates")

                if guid:
                    if guid in first_use:
                        duplicates.append(f"[{key}] {label}: guid={guid!r} duplicates {first_use[guid]}")
                    else:
                        first_use[guid] = label
                elif guid_candidates:
                    for candidate in guid_candidates:
                        if candidate in first_use:
                            duplicates.append(
                                f"[{key}] {label}: guid_candidate={candidate!r} duplicates {first_use[candidate]}"
                            )
                        else:
                            first_use[candidate] = label
                else:
                    # New forms from rescrape use deterministic fallback — warn, don't fail
                    missing.append(f"[{key}] {label}: missing or null 'guid' and no 'guid_candidates'")

    if missing:
        _warn(name + "_missing", [f"{len(missing)} forms missing guid (deterministic fallback used)"])

    if not duplicates:
        _pass(name)
        return

    _fail(name, duplicates if _verbose else duplicates[:20])
    if len(duplicates) > 20 and not _verbose:
        print(f" ... ({len(duplicates) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_conjugation_person_codes(data: dict[str, Any]) -> None:
    """Check that every form's ``person`` code is a member of ``VALID_PERSON_CODES``.

    Both ``active_forms`` and ``hufal_pual_forms`` are inspected.
    """
    name = "conjugation_person_codes"
    problems: list[str] = []

    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj:
            continue
        for form_list_key in ("active_forms", "hufal_pual_forms"):
            for form in conj.get(form_list_key) or ():
                person = form.get("person")
                if person in VALID_PERSON_CODES:
                    continue
                problems.append(f"[{key}] {form_list_key}: invalid person code {person!r}")

    if not problems:
        _pass(name)
        return

    _fail(name, problems if _verbose else problems[:20])
    if len(problems) > 20 and not _verbose:
        print(f" ... ({len(problems) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_no_stripped_form_sentence_collisions(data: dict[str, Any]) -> None:
    """Warn when an entry's vetted sentences contain a confusable partner's nikkud form.

    For each entry with a ``confusable_group`` and vetted examples, every
    group member whose ``word.nikkud`` differs from the entry's own is looked
    up as a substring of the entry's sentences.  The first matching sentence
    per (entry, member) pair is reported.  Findings are warnings only; the
    test itself is always marked as passed.
    """
    name = "no_stripped_form_sentence_collisions"
    collisions: list[str] = []

    for key, entry in data.items():
        group = entry.get("confusable_group")
        if not group:
            continue

        vetted = (entry.get("examples") or {}).get("vetted")
        if not vetted:
            continue

        my_nikkud = (entry.get("word") or {}).get("nikkud") or ""
        my_texts = [s.get("text") or "" for s in vetted]

        for other_key in group:
            other = data.get(other_key)
            if not other:
                continue
            other_nikkud = (other.get("word") or {}).get("nikkud") or ""
            # Identical nikkud cannot be told apart by a substring test.
            if not other_nikkud or other_nikkud == my_nikkud:
                continue

            hit = next((text for text in my_texts if other_nikkud in text), None)
            if hit is None:
                continue
            collisions.append(f"[{key}] sentence contains wrong homograph {other_nikkud!r}: {hit!r}")
            _verbose_print(f" my word: {my_nikkud!r}, wrong form: {other_nikkud!r}")

    if collisions:
        _warn(name, collisions if _verbose else collisions[:20])
        if len(collisions) > 20 and not _verbose:
            print(f" ... ({len(collisions) - 20} more; use --verbose)")
    _pass(name)
|
||||
|
||||
|
||||
def test_no_shared_confusable_examples(data: dict[str, Any]) -> None:
    """Check that no two members of a confusable group have the same set of vetted sentence texts.

    Shared examples indicate the deduplication step in epub_examples.py
    failed to assign examples to only the highest-frequency member.

    Entries are bucketed by the sorted set of keys in their
    ``confusable_group`` *plus their own key*.  Including the entry's own key
    matters: when each entry lists only its partners (A lists [B], B lists
    [A]), an id built from the listed keys alone differs for every member, so
    no bucket ever reaches two members and the check never fires.  If a
    group already lists the entry itself, adding the key changes nothing.
    """
    name = "no_shared_confusable_examples"
    errors: list[str] = []

    # group id (sorted member keys, self included) -> member keys in data order
    group_map: dict[tuple[str, ...], list[str]] = {}
    for key, entry in data.items():
        cg = entry.get("confusable_group")
        if cg:
            group_id = tuple(sorted({key, *cg}))
            group_map.setdefault(group_id, []).append(key)

    for members in group_map.values():
        if len(members) < 2:
            continue

        # Sentence-text set per member; members without any vetted text are ignored.
        text_sets: dict[str, frozenset[str]] = {}
        for key in members:
            vetted = (data[key].get("examples") or {}).get("vetted") or []
            texts = frozenset(e.get("text", "") for e in vetted)
            if texts:
                text_sets[key] = texts

        # Report every member whose set equals that of an earlier member.
        seen: dict[frozenset[str], str] = {}
        for key, texts in text_sets.items():
            if texts in seen:
                meaning_a = (data[seen[texts]].get("meaning") or "")[:30]
                meaning_b = (data[key].get("meaning") or "")[:30]
                errors.append(
                    f"{seen[texts]} ({meaning_a}) and {key} ({meaning_b}) share {len(texts)} identical example(s)"
                )
            else:
                seen[texts] = key

    if errors:
        _fail(name, errors[:20] if not _verbose else errors)
        if len(errors) > 20 and not _verbose:
            print(f" ... ({len(errors) - 20} more; use --verbose)")
    else:
        _pass(name)
|
||||
|
||||
|
||||
def test_no_hebrew_in_meaning(data: dict[str, Any]) -> None:
    """Check that no Hebrew letter survives in ``meaning`` after the cleaning substitutions.

    The meaning is first stripped of Hebrew runs and collapsed whitespace
    (described in the code as the same pipeline apkg_builder applies), then
    searched for any letter in U+05D0..U+05EA.
    """
    name = "no_hebrew_in_meaning"
    leftovers: list[str] = []

    # Compile once; the same three patterns are applied to every entry.
    hebrew_re = re.compile(r"[\u05D0-\u05EA]")
    hebrew_run_re = re.compile(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*")
    multi_space_re = re.compile(r"\s{2,}")

    for key, entry in data.items():
        meaning = entry.get("meaning") or ""
        without_hebrew = hebrew_run_re.sub("", meaning)
        cleaned = multi_space_re.sub(" ", without_hebrew).strip(", ;:")
        if hebrew_re.search(cleaned) is not None:
            leftovers.append(f"[{key}] meaning still contains Hebrew after cleaning: {cleaned!r}")

    if not leftovers:
        _pass(name)
        return

    _fail(name, leftovers if _verbose else leftovers[:20])
    if len(leftovers) > 20 and not _verbose:
        print(f" ... ({len(leftovers) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_mishkal_consistency(data: dict[str, Any]) -> None:
    """Check ``mishkal_hebrew`` against ``mishkal`` on noun and adjective inflections.

    For each inflection that has a ``mishkal_hebrew`` value:
      - a missing ``mishkal`` is an error;
      - otherwise ``_mishkal_to_hebrew(mishkal)`` is computed and, when it
        yields a non-empty value, it must equal ``mishkal_hebrew``.

    If ``pealim_detail_scrape`` cannot be imported, a warning is emitted and
    the test is skipped without being marked as passed or failed.
    """
    name = "mishkal_consistency"

    try:
        from pealim_detail_scrape import _mishkal_to_hebrew
    except ImportError:
        _warn(name, ["Could not import _mishkal_to_hebrew — skipping"])
        return

    problems: list[str] = []

    for key, entry in data.items():
        for infl_key in ("noun_inflection", "adjective_inflection"):
            infl = entry.get(infl_key)
            if not infl:
                continue

            mishkal_eng = infl.get("mishkal") or ""
            mishkal_heb = infl.get("mishkal_hebrew") or ""
            if not mishkal_heb:
                continue

            if not mishkal_eng:
                problems.append(f"[{key}] {infl_key}: has mishkal_hebrew but no mishkal")
                continue

            expected = _mishkal_to_hebrew(mishkal_eng) or ""
            if expected and expected != mishkal_heb:
                problems.append(f"[{key}] {infl_key}: {mishkal_eng}→{mishkal_heb} (expected {expected})")

    if not problems:
        _pass(name)
        return

    _fail(name, problems if _verbose else problems[:20])
    if len(problems) > 20 and not _verbose:
        print(f" ... ({len(problems) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Stats summary
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def print_stats(data: dict[str, Any]) -> None:
    """Print coverage counts for the dataset to stdout.

    Each count is the number of entries for which the named field (or nested
    field) is truthy.
    """
    entries = list(data.values())

    def _count(predicate) -> int:
        """Return how many entries satisfy ``predicate``."""
        return sum(1 for e in entries if predicate(e))

    def _example_field(e, field):
        """Return ``examples[field]``, treating a missing/null ``examples`` as empty."""
        return (e.get("examples") or {}).get(field)

    def _has_mishkal(e):
        """Return a truthy value when either inflection block carries a mishkal."""
        return (e.get("noun_inflection") or {}).get("mishkal") or (e.get("adjective_inflection") or {}).get(
            "mishkal"
        )

    total = len(data)
    with_conj = _count(lambda e: e.get("conjugation"))
    with_noun_inf = _count(lambda e: e.get("noun_inflection"))
    with_vetted = _count(lambda e: _example_field(e, "vetted"))
    with_cloze = _count(lambda e: _example_field(e, "cloze"))
    with_image = _count(lambda e: e.get("image"))
    with_emoji = _count(lambda e: e.get("emoji"))
    with_guid = _count(lambda e: e.get("vocab_legacy_guid"))
    in_confusable = _count(lambda e: e.get("confusable_group"))
    with_shared_roots = _count(lambda e: e.get("shared_roots"))
    with_mishkal = _count(_has_mishkal)

    print()
    print("Stats Summary")
    print("─" * 42)
    print(f" Total entries: {total:>6}")
    print(f" With conjugation data: {with_conj:>6}")
    print(f" With noun_inflection: {with_noun_inf:>6}")
    print(f" With mishkal: {with_mishkal:>6}")
    print(f" With vetted examples: {with_vetted:>6}")
    print(f" With cloze examples: {with_cloze:>6}")
    print(f" With images: {with_image:>6}")
    print(f" With emoji: {with_emoji:>6}")
    print(f" With legacy GUIDs: {with_guid:>6}")
    print(f" In confusable groups: {in_confusable:>6}")
    print(f" With shared roots: {with_shared_roots:>6}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test registry
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Registry of validation tests: maps the name accepted by ``--test`` to the
# function implementing it.  main() calls the values in this dict's insertion
# order, so the order below is the order in which a full run reports results.
ALL_TESTS: dict[str, Any] = {
    "required_fields": test_required_fields,
    "root_format": test_root_format,
    "unique_slugs": test_unique_slugs,
    "no_duplicate_keys": test_no_duplicate_keys,
    "confusable_symmetric": test_confusable_symmetric,
    "shared_roots_valid_keys": test_shared_roots_valid_keys,
    "unique_legacy_guids": test_unique_legacy_guids,
    "no_noun_inflection_on_non_nouns": test_no_noun_inflection_on_non_nouns,
    "no_emoji_in_meaning": test_no_emoji_in_meaning,
    "example_sentences_contain_word": test_example_sentences_contain_word,
    "cloze_offsets_valid": test_cloze_offsets_valid,
    "hufal_pual_only_on_hifil_piel": test_hufal_pual_only_on_hifil_piel,
    "confusable_group_shares_ktiv_male": test_confusable_group_shares_ktiv_male,
    "confusables_guid": test_confusables_guid,
    "conjugation_form_guids": test_conjugation_form_guids,
    "conjugation_person_codes": test_conjugation_person_codes,
    "no_stripped_form_sentence_collisions": test_no_stripped_form_sentence_collisions,
    "no_shared_confusable_examples": test_no_shared_confusable_examples,
    "no_hebrew_in_meaning": test_no_hebrew_in_meaning,
    "mishkal_consistency": test_mishkal_consistency,
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def main() -> None:
    """Parse the command line, run the selected tests and exit with a status code.

    Exit codes: 0 when no test recorded a failure, 1 when at least one did,
    2 when ``--test`` names an unknown test.  The stats summary is printed
    only for a full run (no ``--test``).
    """
    global _verbose

    parser = argparse.ArgumentParser(description="Validate data/words.json against the Hebrew Flash Cards schema.")
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Print full details for all failures (not just first 20).",
    )
    parser.add_argument(
        "--test",
        metavar="NAME",
        help=f"Run a single test by name. Available: {', '.join(ALL_TESTS)}",
    )
    args = parser.parse_args()
    _verbose = args.verbose

    data = load_data()

    # Decide which tests run: the whole registry, or the single one requested.
    single = args.test
    if not single:
        tests_to_run = ALL_TESTS
    elif single in ALL_TESTS:
        tests_to_run = {single: ALL_TESTS[single]}
    else:
        print(f"ERROR: unknown test {args.test!r}. Available: {', '.join(ALL_TESTS)}")
        sys.exit(2)

    print(f"Validating {DATA_FILE} ({len(data)} entries)")
    print("─" * 60)

    # Every test receives the parsed data; no_duplicate_keys ignores it and
    # re-reads the file itself.
    for test_fn in tests_to_run.values():
        test_fn(data)

    if not single:
        print_stats(data)

    print()
    print("─" * 60)
    if _warnings:
        print(f" Warnings : {len(_warnings)}")
    if _failures:
        print(f" FAILED: {len(_failures)} test(s): {', '.join(_failures)}")
        sys.exit(1)

    print(f" All {len(tests_to_run)} test(s) passed.")
    sys.exit(0)


if __name__ == "__main__":
    main()
|
||||
|
|
@ -1,198 +0,0 @@
|
|||
"""Sentence difficulty scoring by context-word frequency.
|
||||
|
||||
Scores sentences by the median frequency rank of context words
|
||||
(excluding the cloze target). Lower score = easier sentence.
|
||||
Used by epub_examples.py to select the best cloze sentence.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from statistics import median
|
||||
|
||||
import helpers
|
||||
import nikkud_to_ktiv_male
|
||||
|
||||
# Rank returned by _resolve_token_frequency when no lookup tier finds the token.
DEFAULT_RANK = 50_000

# Hebrew prefix consonants for ktiv_male prefix stripping (tier 5):
# a leading 1-2 character run made only of these letters may be removed
# before retrying the frequency lookup.
_KM_PREFIX_CHARS = set("בהוכלמשע")

# Punctuation to strip from tokens
_PUNCT = set('.,!?;:"\'"״׳–—()[]{}')

# Maqaf (Hebrew hyphen) — splits tokens
_MAQAF = "־"
|
||||
|
||||
|
||||
def build_nikkud_map(words: dict) -> dict[str, str]:
    """Build a nikkud -> ktiv_male lookup from the words.json data.

    Indexed forms, in this order for every entry: the headword, conjugation
    forms (``active_forms``, ``hufal_pual_forms``, ``infinitive``,
    ``reference_form``), noun inflections (singular, plural, both construct
    forms, pronominal suffixes) and adjective inflections (ms/fs/mp/fp).
    A noun form ending in a maqaf is additionally indexed without it.

    Args:
        words: The full words.json dict keyed by unique_key.

    Returns:
        Dict mapping each nikkud form to a ktiv_male string.  Pairs with an
        empty or missing side are not indexed.  When the same nikkud form
        occurs more than once, the value written last wins.
    """
    lookup: dict[str, str] = {}

    def _store(pointed, plain) -> None:
        """Record one pair, ignoring it unless both sides are non-empty."""
        if pointed and plain:
            lookup[pointed] = plain

    def _store_pair(cell) -> None:
        """Record the nikkud/ktiv_male pair held by a sub-dict (None counts as empty)."""
        cell = cell or {}
        _store(cell.get("nikkud"), cell.get("ktiv_male"))

    for entry in words.values():
        _store_pair(entry.get("word"))

        # Conjugation: both form lists, then the two single reference forms.
        conj = entry.get("conjugation") or {}
        for list_name in ("active_forms", "hufal_pual_forms"):
            for item in conj.get(list_name) or []:
                _store_pair(item.get("form"))
        _store_pair(conj.get("infinitive"))
        _store_pair(conj.get("reference_form"))

        # Noun inflection: the four base forms, then pronominal suffixes.
        noun = entry.get("noun_inflection") or {}
        for field in ("singular", "plural", "construct_singular", "construct_plural"):
            cell = noun.get(field) or {}
            pointed = cell.get("nikkud")
            plain = cell.get("ktiv_male")
            _store(pointed, plain)
            # Also index the form with its trailing maqaf removed.
            if pointed and plain and pointed.endswith("־"):
                _store(pointed[:-1], plain)
        for cell in (noun.get("pronominal_suffixes") or {}).values():
            if isinstance(cell, dict):
                _store_pair(cell)

        # Adjective inflection: the four gender/number forms.
        adj = entry.get("adjective_inflection") or {}
        for field in ("ms", "fs", "mp", "fp"):
            _store_pair(adj.get(field))

    return lookup
|
||||
|
||||
|
||||
def _resolve_token_frequency(
|
||||
token: str,
|
||||
nikkud_map: dict[str, str],
|
||||
nikkud_index: dict,
|
||||
freq_data: dict[str, int],
|
||||
) -> int:
|
||||
"""Resolve a nikkud sentence token to its frequency rank.
|
||||
|
||||
Uses a 5-tier pipeline:
|
||||
1. Known mapping (nikkud_map from words.json)
|
||||
2. Nikkud prefix stripping (epub_examples.try_strip_prefix)
|
||||
3. Academy rules converter (nikkud_to_ktiv_male.convert)
|
||||
4. strip_nikkud fallback (helpers.strip_nikkud)
|
||||
5. Ktiv_male prefix stripping on the converted form
|
||||
|
||||
Returns:
|
||||
Frequency rank (1 = most common). DEFAULT_RANK (50000) if not found.
|
||||
"""
|
||||
# Tier 1: Direct lookup in nikkud→ktiv_male map
|
||||
ktiv = nikkud_map.get(token)
|
||||
if ktiv and ktiv in freq_data:
|
||||
return freq_data[ktiv]
|
||||
|
||||
# Tier 2: Nikkud prefix stripping → resolve remainder via nikkud_map
|
||||
from epub_examples import try_strip_prefix
|
||||
|
||||
prefix_hits = try_strip_prefix(token, nikkud_index)
|
||||
for _unique_key, _match_type, matched_remainder in prefix_hits:
|
||||
remainder_ktiv = nikkud_map.get(matched_remainder)
|
||||
if remainder_ktiv and remainder_ktiv in freq_data:
|
||||
return freq_data[remainder_ktiv]
|
||||
|
||||
# Tier 3: Academy rules converter
|
||||
converted = nikkud_to_ktiv_male.convert(token)
|
||||
if converted in freq_data:
|
||||
return freq_data[converted]
|
||||
|
||||
# Tier 4: strip_nikkud fallback
|
||||
stripped = helpers.strip_nikkud(token)
|
||||
if stripped != converted and stripped in freq_data:
|
||||
return freq_data[stripped]
|
||||
|
||||
# Tier 5: Ktiv_male prefix stripping on converted/stripped form
|
||||
for form in (converted, stripped):
|
||||
for prefix_len in (1, 2):
|
||||
if len(form) > prefix_len + 1:
|
||||
prefix = form[:prefix_len]
|
||||
if all(c in _KM_PREFIX_CHARS for c in prefix):
|
||||
stem = form[prefix_len:]
|
||||
if stem in freq_data:
|
||||
return freq_data[stem]
|
||||
|
||||
return DEFAULT_RANK
|
||||
|
||||
|
||||
def score_sentence(
    text: str,
    target_start: int,
    target_end: int,
    nikkud_map: dict[str, str],
    nikkud_index: dict,
    freq_data: dict[str, int],
) -> int:
    """Score a sentence by the median frequency rank of its context words.

    Args:
        text: The full sentence text (with nikkud).
        target_start: Character offset where the cloze target word starts.
        target_end: Character offset where the cloze target word ends.
        nikkud_map: nikkud→ktiv_male mapping from build_nikkud_map().
        nikkud_index: nikkud index from epub_examples._build_nikkud_index().
        freq_data: Frequency dict from frequency_lookup.get_freq_data().

    Returns:
        Median frequency rank of context tokens (int). Lower = easier.
        Returns DEFAULT_RANK if no scoreable context tokens.
    """
    punct_chars = "".join(_PUNCT)
    ranks: list[int] = []

    cursor = 0
    for chunk in text.split():
        # Locate this whitespace-delimited chunk so sub-token offsets are absolute.
        chunk_start = text.index(chunk, cursor)
        cursor = chunk_start + len(chunk)

        offset = chunk_start
        for piece in chunk.split(_MAQAF):
            piece_start = offset
            piece_end = offset + len(piece)
            offset = piece_end + 1  # step over the maqaf separator

            if not piece:
                continue
            # Anything overlapping the cloze target span is not context.
            if piece_start < target_end and piece_end > target_start:
                continue

            cleaned = piece.strip(punct_chars)
            if len(cleaned) < 2:
                continue

            ranks.append(_resolve_token_frequency(cleaned, nikkud_map, nikkud_index, freq_data))

    if not ranks:
        return DEFAULT_RANK
    return int(median(ranks))
|
||||
31
test_scrape.py
Normal file
31
test_scrape.py
Normal file
|
|
@ -0,0 +1,31 @@
|
|||
#!/usr/bin/env python3
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
word = 'אבל'
url = f'https://www.pealim.com/search/?q={word}'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
}


def main() -> None:
    """Fetch the pealim.com search page for ``word`` and print debug output.

    Prints the HTTP status code, whether each of the three probed elements
    was found in the parsed page, and the first 3000 characters of the
    prettified HTML. Any exception is reported with its traceback instead of
    propagating, since this is a manual debugging aid.
    """
    try:
        response = requests.get(url, headers=headers, timeout=10)
        print(f'Status: {response.status_code}')
        soup = BeautifulSoup(response.content, 'html.parser')

        # Debug: check what we find
        word_elem = soup.find('h1', class_='word-title')
        pos_elem = soup.find('span', class_='pos')
        definition_elem = soup.find('div', class_='definition')

        print(f'word_elem found: {word_elem is not None}')
        print(f'pos_elem found: {pos_elem is not None}')
        print(f'definition_elem found: {definition_elem is not None}')

        print('\n--- HTML snippet (first 3000 chars) ---')
        print(soup.prettify()[:3000])

    except Exception as e:
        print(f'Error: {e}')
        import traceback
        traceback.print_exc()


# The file name matches pytest's test_*.py pattern, so it is imported during
# test collection; the guard keeps the live HTTP request out of import time.
if __name__ == '__main__':
    main()
|
||||
|
|
@ -1,246 +0,0 @@
|
|||
"""Unit tests for apkg_builder — Sprint 15 learnings.
|
||||
|
||||
Tests cover: cloze prefix preservation, Hebrew spoiler stripping from English
|
||||
meanings, PoS exact matching, gender field population, and mishkal data integrity.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
# Ensure project root is on path
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
|
||||
from apkg_builder import _categorize_pos, _cloze_prefix_len
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cloze prefix preservation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestClozePrefix:
    """_cloze_prefix_len must detect Hebrew prefix letters before the word."""

    def test_single_prefix_bet(self):
        """A bet prefix in front of the word yields a positive prefix length."""
        # בַּתּוֹר = bet + patach + tor
        prefix_len = _cloze_prefix_len("בַּתּוֹר", "תּוֹר")
        assert prefix_len > 0

    def test_single_prefix_lamed(self):
        """A lamed prefix in front of the word yields a positive prefix length."""
        # לַמֶּלֶךְ = lamed + patach + melech
        prefix_len = _cloze_prefix_len("לַמֶּלֶךְ", "מֶּלֶךְ")
        assert prefix_len > 0

    def test_two_consonant_prefix(self):
        """Two stacked prefix letters are both counted, leaving the word itself."""
        # שֶׁבַּתּוֹר = shin + bet + tor (two prefix letters)
        token, word = "שֶׁבַּתּוֹר", "תּוֹר"
        prefix_len = _cloze_prefix_len(token, word)
        assert prefix_len > 0
        remainder = token[prefix_len:]
        assert remainder.startswith(word)

    def test_no_prefix_direct_match(self):
        """When the token is exactly the word there is no prefix."""
        assert _cloze_prefix_len("תּוֹר", "תּוֹר") == 0

    def test_empty_inputs(self):
        """Empty token and/or empty word always give a zero prefix length."""
        assert _cloze_prefix_len("", "תּוֹר") == 0
        assert _cloze_prefix_len("בַּתּוֹר", "") == 0
        assert _cloze_prefix_len("", "") == 0

    def test_non_prefix_letter_returns_zero(self):
        """Leading characters that are not valid prefix letters give zero."""
        # 'ת' is not in _PREFIX_LETTERS (בהוכלמש)
        assert _cloze_prefix_len("תַּתּוֹר", "תּוֹר") == 0

    def test_prefix_preserves_nikkud(self):
        """The detected prefix contains exactly one base letter: bet."""
        token, word = "בַּתּוֹר", "תּוֹר"
        prefix = token[: _cloze_prefix_len(token, word)]
        # Keep only base Hebrew letters (alef..tav); nikkud marks fall outside this range.
        base_letters = [ch for ch in prefix if "\u05d0" <= ch <= "\u05ea"]
        assert base_letters == ["ב"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# PoS exact matching (no substring collisions)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestCategorizePos:
    """_categorize_pos must not let 'Pronoun' match 'Noun'."""

    def test_noun_exact(self):
        """The bare label 'Noun' stays 'Noun'."""
        category = _categorize_pos("Noun")
        assert category == "Noun"

    def test_pronoun_is_other(self):
        """'Pronoun' contains 'noun' as a substring but must land in 'Other'."""
        category = _categorize_pos("Pronoun")
        assert category == "Other"

    def test_verb_exact(self):
        """The bare label 'Verb' stays 'Verb'."""
        category = _categorize_pos("Verb")
        assert category == "Verb"

    def test_noun_with_dash(self):
        """A noun label with a dash-separated qualifier is still a 'Noun'."""
        category = _categorize_pos("Noun – masculine")
        assert category == "Noun"

    def test_adjective(self):
        """The bare label 'Adjective' stays 'Adjective'."""
        category = _categorize_pos("Adjective")
        assert category == "Adjective"

    def test_conjunction_is_other(self):
        """'Conjunction' is not one of the named categories and maps to 'Other'."""
        category = _categorize_pos("Conjunction")
        assert category == "Other"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Hebrew spoiler stripping from English meanings
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestHebrewSpoilerStripping:
    """English meanings must not contain Hebrew text (spoils the card)."""

    # Intended to mirror the stripping regex in apkg_builder.py: one character
    # from the Hebrew block followed by any run of Hebrew-block characters,
    # whitespace, or hyphens.
    HEBREW_STRIP_RE = re.compile(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*")

    @staticmethod
    def _strip_hebrew(meaning: str) -> str:
        """Replicate the meaning cleaning pipeline from build_vocab_deck.

        Args:
            meaning: Raw meaning string, possibly containing Hebrew text.

        Returns:
            The meaning with Hebrew runs removed, leftover separator
            punctuation tidied, whitespace collapsed, and leading/trailing
            ``", ;:"`` characters stripped.
        """
        # Use the shared compiled pattern so the tests exercise a single regex
        # definition instead of a second inline copy.
        meaning = TestHebrewSpoilerStripping.HEBREW_STRIP_RE.sub("", meaning)
        # Removing Hebrew can leave "; —" / ": —" and "; :" behind; tidy them.
        meaning = re.sub(r"[;:]\s*—", " —", meaning)
        meaning = re.sub(r";\s*:", ";", meaning)
        return re.sub(r"\s{2,}", " ", meaning).strip(", ;:")

    def test_pure_english_unchanged(self):
        assert self._strip_hebrew("to eat, to consume") == "to eat, to consume"

    def test_hebrew_word_removed(self):
        result = self._strip_hebrew("to eat; אכל")
        assert "אכל" not in result

    def test_hebrew_with_nikkud_removed(self):
        result = self._strip_hebrew("tall; גָּבוֹהַּ")
        assert "גָּבוֹהַּ" not in result
        assert "tall" in result

    def test_no_residual_hebrew_in_real_data(self):
        """Scan actual words.json — no meaning should contain Hebrew after stripping."""
        words_path = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")

        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)

        # Detection regex (not the stripping regex): any base Hebrew letter,
        # alef through tav, that survived cleaning counts as a spoiler.
        hebrew_re = re.compile(r"[\u05D0-\u05EA]")
        spoilers = []
        for key, entry in words.items():
            meaning = entry.get("meaning") or ""
            cleaned = self._strip_hebrew(meaning)
            if hebrew_re.search(cleaned):
                spoilers.append(f"{key}: {cleaned!r}")

        assert not spoilers, f"Hebrew found in {len(spoilers)} meanings after stripping: {spoilers[:5]}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Gender field for nouns (words.json data integrity)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestGenderDataIntegrity:
    """Nouns with noun_inflection should have gender populated."""

    @pytest.fixture()
    def words(self):
        """Load data/words.json relative to this file's parent directory, or skip if absent."""
        data_file = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not data_file.exists():
            pytest.skip("words.json not available")
        return json.loads(data_file.read_text(encoding="utf-8"))

    def test_nouns_have_gender(self, words):
        """Nouns with noun_inflection should have a valid gender."""
        accepted = ("masculine", "feminine", "masculine and feminine")
        noun_count = 0
        missing = []

        for key, entry in words.items():
            inflection = entry.get("noun_inflection")
            is_inflected_noun = (entry.get("pos") or "").startswith("Noun") and inflection
            if not is_inflected_noun:
                continue
            noun_count += 1
            gender = inflection.get("gender") or ""
            if gender not in accepted:
                missing.append(f"{key}: gender={gender!r}")

        # Allow up to 7% missing (loan words, compound words, etc.)
        if noun_count > 0:
            pct_missing = len(missing) / noun_count
            assert pct_missing < 0.07, f"{len(missing)}/{noun_count} nouns missing gender: {missing[:10]}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Mishkal data integrity
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestMishkalIntegrity:
    """Validate mishkal data consistency in words.json."""

    _INFLECTION_KEYS = ("noun_inflection", "adjective_inflection")

    @pytest.fixture()
    def words(self):
        """Load data/words.json relative to this file's parent directory, or skip if absent."""
        data_file = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not data_file.exists():
            pytest.skip("words.json not available")
        return json.loads(data_file.read_text(encoding="utf-8"))

    @classmethod
    def _inflections(cls, words):
        """Yield ``(unique_key, inflection)`` for each non-empty noun/adjective inflection."""
        for key, entry in words.items():
            for infl_key in cls._INFLECTION_KEYS:
                infl = entry.get(infl_key)
                if infl:
                    yield key, infl

    def test_mishkal_hebrew_matches_english(self, words):
        """If mishkal and mishkal_hebrew are both set, they should correspond via _mishkal_to_hebrew."""
        from pealim_detail_scrape import _mishkal_to_hebrew

        mismatches = []
        for key, infl in self._inflections(words):
            mishkal_eng = infl.get("mishkal") or ""
            mishkal_heb = infl.get("mishkal_hebrew") or ""
            if not (mishkal_eng and mishkal_heb):
                continue
            expected = _mishkal_to_hebrew(mishkal_eng) or ""
            # An empty conversion result is not treated as a mismatch.
            if expected and expected != mishkal_heb:
                mismatches.append(f"{key}: {mishkal_eng}→{mishkal_heb} (expected {expected})")

        assert not mismatches, f"{len(mismatches)} mishkal mismatches: {mismatches[:10]}"

    def test_mishkal_hebrew_is_hebrew(self, words):
        """mishkal_hebrew must contain Hebrew characters."""
        hebrew_letter = re.compile(r"[\u05D0-\u05EA]")
        bad = []
        for key, infl in self._inflections(words):
            mishkal_heb = infl.get("mishkal_hebrew") or ""
            if mishkal_heb and hebrew_letter.search(mishkal_heb) is None:
                bad.append(f"{key}: mishkal_hebrew={mishkal_heb!r}")

        assert not bad, f"{len(bad)} non-Hebrew mishkal_hebrew values: {bad[:10]}"

    def test_no_orphaned_mishkal(self, words):
        """If mishkal_hebrew is set, mishkal (English) must also be set."""
        orphans = []
        for key, infl in self._inflections(words):
            mishkal_heb = infl.get("mishkal_hebrew") or ""
            mishkal_eng = infl.get("mishkal") or ""
            if mishkal_heb and not mishkal_eng:
                orphans.append(f"{key}: has mishkal_hebrew but no mishkal")

        assert not orphans, f"{len(orphans)} orphaned mishkal_hebrew: {orphans[:10]}"
|
||||
|
|
@ -1,524 +0,0 @@
|
|||
"""Tests for adjective and preposition detail page parsing in pealim_detail_scrape.py."""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
|
||||
from pealim_detail_scrape import (
|
||||
_parse_adjective_table,
|
||||
_parse_adjective_table_vl,
|
||||
_parse_preposition_table,
|
||||
_parse_preposition_table_vl,
|
||||
_scrape_adjective_detail,
|
||||
_scrape_preposition_detail,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures — real HTML snippets from pealim.com
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
ADJECTIVE_MO_TABLE = """
|
||||
<table class="table table-condensed conjugation-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th class="column-header" colspan="2">Singular</th>
|
||||
<th class="column-header" colspan="2">Plural</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th class="column-header">Masculine</th>
|
||||
<th class="column-header">Feminine</th>
|
||||
<th class="column-header">Masculine</th>
|
||||
<th class="column-header">Feminine</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td class="conj-td">
|
||||
<div id="ms-a">
|
||||
<div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/dn/dngfpnovmytc.mp3">🔊</span>
|
||||
<span class="menukad">אֲבִיבִי</span>
|
||||
</div></div>
|
||||
<div class="meaning">spring-like, vernal</div>
|
||||
</div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="fs-a">
|
||||
<div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/1j/1j6srg3do7n5k.mp3">🔊</span>
|
||||
<span class="menukad">אֲבִיבִית</span>
|
||||
</div></div>
|
||||
<div class="meaning">spring-like, vernal</div>
|
||||
</div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="mp-a">
|
||||
<div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/tj/tjrhw0b5dkhc.mp3">🔊</span>
|
||||
<span class="menukad">אֲבִיבִיִּים</span>
|
||||
</div></div>
|
||||
<div class="meaning">spring-like, vernal</div>
|
||||
</div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="fp-a">
|
||||
<div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/h3/h3u1ml5a4xcf.mp3">🔊</span>
|
||||
<span class="menukad">אֲבִיבִיּוֹת</span>
|
||||
</div></div>
|
||||
<div class="meaning">spring-like, vernal</div>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
"""
|
||||
|
||||
# VL version: menukad spans contain unvowelled text (hebstyle=vl)
|
||||
ADJECTIVE_VL_TABLE = """
|
||||
<table class="table table-condensed conjugation-table">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td class="conj-td">
|
||||
<div id="ms-a"><div><div>
|
||||
<span class="menukad">אביבי</span>
|
||||
</div></div></div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="fs-a"><div><div>
|
||||
<span class="menukad">אביבית</span>
|
||||
</div></div></div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="mp-a"><div><div>
|
||||
<span class="menukad">אביביים</span>
|
||||
</div></div></div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="fp-a"><div><div>
|
||||
<span class="menukad">אביביות</span>
|
||||
</div></div></div>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
"""
|
||||
|
||||
PREPOSITION_MO_TABLE = """
|
||||
<table class="table table-condensed conjugation-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th rowspan="2">Person</th>
|
||||
<th class="column-header" colspan="2">Singular</th>
|
||||
<th class="column-header" colspan="2">Plural</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th class="column-header">Masculine</th>
|
||||
<th class="column-header">Feminine</th>
|
||||
<th class="column-header">Masculine</th>
|
||||
<th class="column-header">Feminine</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>1st</th>
|
||||
<td class="conj-td" colspan="2">
|
||||
<div id="P-1s"><div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/fk/fkp5faeteecr.mp3">🔊</span>
|
||||
<span class="menukad">שֶׁלִּי</span>
|
||||
</div></div><div class="meaning"><strong>of mine</strong></div></div>
|
||||
</td>
|
||||
<td class="conj-td" colspan="2">
|
||||
<div id="P-1p"><div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/13/13uvi0dz6tgcc.mp3">🔊</span>
|
||||
<span class="menukad">שֶׁלָּנוּ</span>
|
||||
</div></div><div class="meaning"><strong>of ours</strong></div></div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>2nd</th>
|
||||
<td class="conj-td">
|
||||
<div id="P-2ms"><div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/shbxafq8ietx.mp3">🔊</span>
|
||||
<span class="menukad">שֶׁלְּךָ</span>
|
||||
</div></div><div class="meaning"><strong>of yours</strong> <em>m. sg.</em></div></div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="P-2fs"><div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/sh9ue3a8buo3.mp3">🔊</span>
|
||||
<span class="menukad">שֶׁלָּךְ</span>
|
||||
</div></div><div class="meaning"><strong>of yours</strong> <em>f. sg.</em></div></div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="P-2mp"><div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/ol/olx8vzsctlzn.mp3">🔊</span>
|
||||
<span class="menukad">שֶׁלָּכֶם</span>
|
||||
</div></div><div class="meaning"><strong>of yours</strong> <em>m. pl.</em></div></div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="P-2fp"><div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/ol/olxrms6dl8eq.mp3">🔊</span>
|
||||
<span class="menukad">שֶׁלָּכֶן</span>
|
||||
</div></div><div class="meaning"><strong>of yours</strong> <em>f. pl.</em></div></div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>3rd</th>
|
||||
<td class="conj-td">
|
||||
<div id="P-3ms"><div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/fk/fkp5qigelthg.mp3">🔊</span>
|
||||
<span class="menukad">שֶׁלּוֹ</span>
|
||||
</div></div><div class="meaning"><strong>of his</strong></div></div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="P-3fs"><div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/sh9w36hojm5w.mp3">🔊</span>
|
||||
<span class="menukad">שֶׁלָּהּ</span>
|
||||
</div></div><div class="meaning"><strong>of hers</strong></div></div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="P-3mp"><div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/n9/n99z0jr8pint.mp3">🔊</span>
|
||||
<span class="menukad">שֶׁלָּהֶם</span>
|
||||
</div></div><div class="meaning"><strong>of theirs</strong> <em>m.</em></div></div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="P-3fp"><div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/n9/n9ahrc59h52w.mp3">🔊</span>
|
||||
<span class="menukad">שֶׁלָּהֶן</span>
|
||||
</div></div><div class="meaning"><strong>of theirs</strong> <em>f.</em></div></div>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
"""
|
||||
|
||||
PREPOSITION_VL_TABLE = """
|
||||
<table class="table table-condensed conjugation-table">
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>1st</th>
|
||||
<td colspan="2"><div id="P-1s"><div><div>
|
||||
<span class="menukad">שלי</span>
|
||||
</div></div></div></td>
|
||||
<td colspan="2"><div id="P-1p"><div><div>
|
||||
<span class="menukad">שלנו</span>
|
||||
</div></div></div></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>2nd</th>
|
||||
<td><div id="P-2ms"><div><div>
|
||||
<span class="menukad">שלך</span>
|
||||
</div></div></div></td>
|
||||
<td><div id="P-2fs"><div><div>
|
||||
<span class="menukad">שלך</span>
|
||||
</div></div></div></td>
|
||||
<td><div id="P-2mp"><div><div>
|
||||
<span class="menukad">שלכם</span>
|
||||
</div></div></div></td>
|
||||
<td><div id="P-2fp"><div><div>
|
||||
<span class="menukad">שלכן</span>
|
||||
</div></div></div></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>3rd</th>
|
||||
<td><div id="P-3ms"><div><div>
|
||||
<span class="menukad">שלו</span>
|
||||
</div></div></div></td>
|
||||
<td><div id="P-3fs"><div><div>
|
||||
<span class="menukad">שלה</span>
|
||||
</div></div></div></td>
|
||||
<td><div id="P-3mp"><div><div>
|
||||
<span class="menukad">שלהם</span>
|
||||
</div></div></div></td>
|
||||
<td><div id="P-3fp"><div><div>
|
||||
<span class="menukad">שלהן</span>
|
||||
</div></div></div></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
"""
|
||||
|
||||
# Minimal full-page wrappers so _scrape_*_detail() can parse them
|
||||
_ADJECTIVE_MO_PAGE = f"<html><body>{ADJECTIVE_MO_TABLE}</body></html>"
|
||||
_ADJECTIVE_VL_PAGE = f"<html><body>{ADJECTIVE_VL_TABLE}</body></html>"
|
||||
_PREPOSITION_MO_PAGE = f"<html><body>{PREPOSITION_MO_TABLE}</body></html>"
|
||||
_PREPOSITION_VL_PAGE = f"<html><body>{PREPOSITION_VL_TABLE}</body></html>"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Adjective table tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestParseAdjectiveTable:
    """Tests for _parse_adjective_table (mo/nikkud page)."""

    @pytest.fixture()
    def result(self) -> dict:
        """Parse the nikkud adjective fixture table once per test."""
        # Function-scope import replaces the former ``__import__("bs4")`` calls.
        from bs4 import BeautifulSoup

        return _parse_adjective_table(BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml"))

    def test_returns_four_form_keys(self, result: dict) -> None:
        assert set(result.keys()) == {"ms", "fs", "mp", "fp"}

    def test_ms_nikkud(self, result: dict) -> None:
        assert result["ms"]["nikkud"] == "אֲבִיבִי"

    def test_fs_nikkud(self, result: dict) -> None:
        assert result["fs"]["nikkud"] == "אֲבִיבִית"

    def test_mp_nikkud(self, result: dict) -> None:
        assert result["mp"]["nikkud"] == "אֲבִיבִיִּים"

    def test_fp_nikkud(self, result: dict) -> None:
        assert result["fp"]["nikkud"] == "אֲבִיבִיּוֹת"

    def test_audio_url_present(self, result: dict) -> None:
        assert result["ms"]["audio_url"].startswith("https://audio.pealim.com/")

    def test_empty_on_missing_table(self) -> None:
        """A page without any table parses to an empty dict."""
        from bs4 import BeautifulSoup

        result = _parse_adjective_table(BeautifulSoup("<html><body></body></html>", "lxml"))
        assert result == {}
|
||||
|
||||
|
||||
class TestParseAdjectiveTableVl:
    """Tests for _parse_adjective_table_vl (ktiv male page)."""

    @pytest.fixture()
    def result(self) -> dict:
        """Parse the unvowelled adjective fixture table once per test."""
        # Function-scope import replaces the former ``__import__("bs4")`` calls.
        from bs4 import BeautifulSoup

        return _parse_adjective_table_vl(BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml"))

    def test_returns_four_form_keys(self, result: dict) -> None:
        assert set(result.keys()) == {"ms", "fs", "mp", "fp"}

    def test_ms_ktiv(self, result: dict) -> None:
        assert result["ms"] == "אביבי"

    def test_fs_ktiv(self, result: dict) -> None:
        assert result["fs"] == "אביבית"

    def test_mp_ktiv(self, result: dict) -> None:
        assert result["mp"] == "אביביים"

    def test_fp_ktiv(self, result: dict) -> None:
        assert result["fp"] == "אביביות"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _scrape_adjective_detail tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestScrapeAdjectiveDetail:
    """Tests for _scrape_adjective_detail — schema compliance."""

    @pytest.fixture()
    def result(self) -> dict:
        """Scrape result built from the minimal nikkud and ktiv-male fixture pages."""
        return _scrape_adjective_detail("9098-avivi", _ADJECTIVE_MO_PAGE, _ADJECTIVE_VL_PAGE)

    def test_returns_non_empty_dict(self, result: dict) -> None:
        assert result

    def test_ms_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["ms"]
        assert form["nikkud"] == "אֲבִיבִי"
        assert form["ktiv_male"] == "אביבי"

    def test_fs_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["fs"]
        assert form["nikkud"] == "אֲבִיבִית"
        assert form["ktiv_male"] == "אביבית"

    def test_mp_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["mp"]
        assert form["nikkud"] == "אֲבִיבִיִּים"
        assert form["ktiv_male"] == "אביביים"

    def test_fp_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["fp"]
        assert form["nikkud"] == "אֲבִיבִיּוֹת"
        assert form["ktiv_male"] == "אביביות"

    def test_mishkal_key_present(self, result: dict) -> None:
        # mishkal may be None since no PoS section is in our minimal fixture
        assert "mishkal" in result

    def test_mishkal_hebrew_key_present(self, result: dict) -> None:
        assert "mishkal_hebrew" in result

    def test_all_schema_keys_present(self, result: dict) -> None:
        required = {"ms", "fs", "mp", "fp", "mishkal", "mishkal_hebrew"}
        assert required.issubset(result.keys())

    def test_empty_on_no_table(self) -> None:
        """Pages without a table produce an empty dict."""
        blank_page = "<html><body></body></html>"
        result = _scrape_adjective_detail("missing", blank_page, blank_page)
        assert result == {}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Preposition table tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestParsePrepositionTable:
    """Tests for _parse_preposition_table (mo/nikkud page)."""

    @pytest.fixture()
    def result(self) -> dict:
        """Parse the nikkud preposition fixture table once per test."""
        # Function-scope import replaces the former ``__import__("bs4")`` call.
        from bs4 import BeautifulSoup

        return _parse_preposition_table(BeautifulSoup(PREPOSITION_MO_TABLE, "lxml"))

    def test_returns_ten_form_keys(self, result: dict) -> None:
        expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
        assert set(result.keys()) == expected

    def test_1s_nikkud(self, result: dict) -> None:
        assert result["1s"]["nikkud"] == "שֶׁלִּי"

    def test_1p_nikkud(self, result: dict) -> None:
        assert result["1p"]["nikkud"] == "שֶׁלָּנוּ"

    def test_2ms_nikkud(self, result: dict) -> None:
        assert result["2ms"]["nikkud"] == "שֶׁלְּךָ"

    def test_2fs_nikkud(self, result: dict) -> None:
        assert result["2fs"]["nikkud"] == "שֶׁלָּךְ"

    def test_2mp_nikkud(self, result: dict) -> None:
        assert result["2mp"]["nikkud"] == "שֶׁלָּכֶם"

    def test_2fp_nikkud(self, result: dict) -> None:
        assert result["2fp"]["nikkud"] == "שֶׁלָּכֶן"

    def test_3ms_nikkud(self, result: dict) -> None:
        assert result["3ms"]["nikkud"] == "שֶׁלּוֹ"

    def test_3fs_nikkud(self, result: dict) -> None:
        assert result["3fs"]["nikkud"] == "שֶׁלָּהּ"

    def test_3mp_nikkud(self, result: dict) -> None:
        assert result["3mp"]["nikkud"] == "שֶׁלָּהֶם"

    def test_3fp_nikkud(self, result: dict) -> None:
        assert result["3fp"]["nikkud"] == "שֶׁלָּהֶן"

    def test_audio_url_present(self, result: dict) -> None:
        assert result["1s"]["audio_url"].startswith("https://audio.pealim.com/")

    def test_empty_on_missing_table(self) -> None:
        """A page without any table parses to an empty dict."""
        from bs4 import BeautifulSoup

        result = _parse_preposition_table(BeautifulSoup("<html><body></body></html>", "lxml"))
        assert result == {}
|
||||
|
||||
|
||||
class TestParsePrepositionTableVl:
    """Tests for _parse_preposition_table_vl (ktiv male page)."""

    @pytest.fixture()
    def result(self) -> dict:
        """Parse the unvowelled preposition fixture table once per test."""
        # Function-scope import replaces the former ``__import__("bs4")`` call.
        from bs4 import BeautifulSoup

        return _parse_preposition_table_vl(BeautifulSoup(PREPOSITION_VL_TABLE, "lxml"))

    def test_returns_ten_form_keys(self, result: dict) -> None:
        expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
        assert set(result.keys()) == expected

    def test_1s_ktiv(self, result: dict) -> None:
        assert result["1s"] == "שלי"

    def test_1p_ktiv(self, result: dict) -> None:
        assert result["1p"] == "שלנו"

    def test_2ms_ktiv(self, result: dict) -> None:
        assert result["2ms"] == "שלך"

    def test_3ms_ktiv(self, result: dict) -> None:
        assert result["3ms"] == "שלו"

    def test_3fp_ktiv(self, result: dict) -> None:
        assert result["3fp"] == "שלהן"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _scrape_preposition_detail tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestScrapePrepositionDetail:
    """Schema checks for the dict returned by _scrape_preposition_detail."""

    @pytest.fixture()
    def result(self) -> dict:
        """Scrape the "2643-shel" slug from the two canned page fixtures."""
        return _scrape_preposition_detail("2643-shel", _PREPOSITION_MO_PAGE, _PREPOSITION_VL_PAGE)

    def test_returns_non_empty_dict(self, result: dict) -> None:
        """The scrape result is truthy (i.e. not an empty mapping)."""
        assert result

    def test_all_ten_person_keys_present(self, result: dict) -> None:
        """All ten person keys appear in the result (extra keys are tolerated)."""
        required = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
        absent = required - set(result.keys())
        assert not absent

    def test_1s_has_nikkud_and_ktiv(self, result: dict) -> None:
        """The "1s" form carries both spellings."""
        form = result["1s"]
        assert form["nikkud"] == "שֶׁלִּי"
        assert form["ktiv_male"] == "שלי"

    def test_1p_has_nikkud_and_ktiv(self, result: dict) -> None:
        """The "1p" form carries both spellings."""
        form = result["1p"]
        assert form["nikkud"] == "שֶׁלָּנוּ"
        assert form["ktiv_male"] == "שלנו"

    def test_2ms_has_nikkud_and_ktiv(self, result: dict) -> None:
        """The "2ms" form carries both spellings."""
        form = result["2ms"]
        assert form["nikkud"] == "שֶׁלְּךָ"
        assert form["ktiv_male"] == "שלך"

    def test_3ms_has_nikkud_and_ktiv(self, result: dict) -> None:
        """The "3ms" form carries both spellings."""
        form = result["3ms"]
        assert form["nikkud"] == "שֶׁלּוֹ"
        assert form["ktiv_male"] == "שלו"

    def test_3fs_has_nikkud_and_ktiv(self, result: dict) -> None:
        """The "3fs" form carries both spellings."""
        form = result["3fs"]
        assert form["nikkud"] == "שֶׁלָּהּ"
        assert form["ktiv_male"] == "שלה"

    def test_3fp_has_nikkud_and_ktiv(self, result: dict) -> None:
        """The "3fp" form carries both spellings."""
        form = result["3fp"]
        assert form["nikkud"] == "שֶׁלָּהֶן"
        assert form["ktiv_male"] == "שלהן"

    def test_empty_on_no_table(self) -> None:
        """Two table-less pages yield an empty dict."""
        blank_page = "<html><body></body></html>"
        scraped = _scrape_preposition_detail("missing", blank_page, blank_page)
        assert scraped == {}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests for _parse_noun_gender_mishkal mishkal extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
from bs4 import BeautifulSoup # noqa: E402
|
||||
|
||||
from pealim_detail_scrape import _parse_noun_gender_mishkal # noqa: E402
|
||||
|
||||
|
||||
class TestNounGenderMishkal:
    """Gender and mishkal extraction by _parse_noun_gender_mishkal."""

    @staticmethod
    def _parse(markup: str):
        """Parse *markup* with html.parser and run the function under test."""
        return _parse_noun_gender_mishkal(BeautifulSoup(markup, "html.parser"))

    def test_noun_with_mishkal(self):
        """A noun line with a pattern link gives both gender and mishkal."""
        gender, mishkal = self._parse(
            '<p>Noun – <a href="/dict/?pos=noun&nm=qetel"><i>ketel</i> pattern</a>, masculine</p>'
        )
        assert (gender, mishkal) == ("masculine", "ketel")

    def test_noun_without_mishkal(self):
        """Without a pattern link the mishkal is the empty string."""
        gender, mishkal = self._parse("<p>Noun – masculine</p>")
        assert (gender, mishkal) == ("masculine", "")

    def test_adjective_mishkal(self):
        """An adjective line still yields its mishkal; gender is not checked."""
        _, mishkal = self._parse(
            '<p>Adjective – <a href="/dict/?pos=adjective&am=qatul"><i>katul</i> pattern</a></p>'
        )
        assert mishkal == "katul"

    def test_feminine_noun(self):
        """A feminine noun line reports "feminine" together with the mishkal."""
        gender, mishkal = self._parse(
            '<p>Noun – <a href="/dict/?pos=noun&nm=qetel"><i>ketel</i> pattern</a>, feminine</p>'
        )
        assert (gender, mishkal) == ("feminine", "ketel")
|
||||
|
|
@ -1,127 +0,0 @@
|
|||
"""Tests for epub_examples deduplication of confusable group examples."""
|
||||
|
||||
from epub_examples import _deduplicate_confusable_examples
|
||||
|
||||
|
||||
def _make_entry(meaning, confusable_group, vetted_texts=None, frequency_rank=None):
|
||||
"""Build a minimal words.json entry for testing."""
|
||||
entry = {
|
||||
"meaning": meaning,
|
||||
"confusable_group": confusable_group,
|
||||
}
|
||||
if vetted_texts is not None:
|
||||
entry["examples"] = {
|
||||
"vetted": [{"text": t, "source": "test", "match_method": "direct"} for t in vetted_texts],
|
||||
}
|
||||
if frequency_rank is not None:
|
||||
entry["frequency_rank"] = frequency_rank
|
||||
return entry
|
||||
|
||||
|
||||
class TestDeduplicateConfusableExamples:
    """Behavioural checks for _deduplicate_confusable_examples()."""

    def test_shared_examples_kept_on_higher_frequency(self):
        """With identical example sets, the member with the lower
        frequency_rank (the more common word) keeps them."""
        members = ["key_a", "key_b"]
        lexicon = {
            "key_a": _make_entry("brother", members, ["sent1", "sent2"], frequency_rank=500),
            "key_b": _make_entry("fireplace", members, ["sent1", "sent2"], frequency_rank=8000),
        }

        removed = _deduplicate_confusable_examples(lexicon)

        assert removed == 1
        kept = lexicon["key_a"]["examples"]["vetted"]
        assert len(kept) == 2
        assert lexicon["key_b"]["examples"]["vetted"] == []

    def test_no_action_when_examples_differ(self):
        """Members whose example sets differ are left as they are."""
        members = ["key_a", "key_b"]
        lexicon = {
            "key_a": _make_entry("meaning1", members, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("meaning2", members, ["sent2"], frequency_rank=200),
        }

        removed = _deduplicate_confusable_examples(lexicon)

        assert removed == 0
        for member in ("key_a", "key_b"):
            assert len(lexicon[member]["examples"]["vetted"]) == 1

    def test_no_action_when_one_has_no_examples(self):
        """A group where only one member has examples has nothing to deduplicate."""
        members = ["key_a", "key_b"]
        lexicon = {
            "key_a": _make_entry("meaning1", members, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("meaning2", members, frequency_rank=200),
        }

        removed = _deduplicate_confusable_examples(lexicon)

        assert removed == 0

    def test_no_frequency_uses_alphabetical_tiebreak(self):
        """Without any frequency data, the alphabetically first key keeps its examples."""
        members = ["alpha_key", "beta_key"]
        lexicon = {
            "alpha_key": _make_entry("meaning1", members, ["sent1"]),
            "beta_key": _make_entry("meaning2", members, ["sent1"]),
        }

        removed = _deduplicate_confusable_examples(lexicon)

        assert removed == 1
        kept = lexicon["alpha_key"]["examples"]["vetted"]
        assert len(kept) == 1
        assert lexicon["beta_key"]["examples"]["vetted"] == []

    def test_three_way_group(self):
        """In a three-member group, the most frequent member wins and the other two are cleared."""
        members = ["key_a", "key_b", "key_c"]
        lexicon = {
            "key_a": _make_entry("yes", members, ["sent1", "sent2"], frequency_rank=50),
            "key_b": _make_entry("honest", members, ["sent1", "sent2"], frequency_rank=3000),
            "key_c": _make_entry("pedestal", members, ["sent1", "sent2"], frequency_rank=15000),
        }

        removed = _deduplicate_confusable_examples(lexicon)

        assert removed == 2
        kept = lexicon["key_a"]["examples"]["vetted"]
        assert len(kept) == 2
        assert lexicon["key_b"]["examples"]["vetted"] == []
        assert lexicon["key_c"]["examples"]["vetted"] == []

    def test_cloze_removed_from_losers(self):
        """A losing entry also loses its cloze data."""
        members = ["key_a", "key_b"]
        lexicon = {
            "key_a": _make_entry("common", members, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("rare", members, ["sent1"], frequency_rank=9000),
        }
        # Only the rarer entry (the expected loser) is given cloze data.
        lexicon["key_b"]["examples"]["cloze"] = {"text": "sent1", "cloze_guid": "abc"}

        removed = _deduplicate_confusable_examples(lexicon)

        assert removed == 1
        loser_examples = lexicon["key_b"]["examples"]
        assert "cloze" not in loser_examples

    def test_no_confusable_groups_returns_zero(self):
        """Entries that have no confusable_group key are skipped."""
        lexicon = {
            "key_a": {"meaning": "word1", "examples": {"vetted": [{"text": "s1"}]}},
            "key_b": {"meaning": "word2", "examples": {"vetted": [{"text": "s1"}]}},
        }

        removed = _deduplicate_confusable_examples(lexicon)

        assert removed == 0

    def test_mixed_frequency_and_none(self):
        """A member with a frequency_rank beats one that has none."""
        members = ["key_a", "key_b"]
        lexicon = {
            "key_a": _make_entry("has_freq", members, ["sent1"], frequency_rank=5000),
            "key_b": _make_entry("no_freq", members, ["sent1"]),
        }

        removed = _deduplicate_confusable_examples(lexicon)

        assert removed == 1
        kept = lexicon["key_a"]["examples"]["vetted"]
        assert len(kept) == 1
        assert lexicon["key_b"]["examples"]["vetted"] == []

    def test_partial_overlap_not_deduplicated(self):
        """Sentence sets that overlap without being identical are not touched."""
        members = ["key_a", "key_b"]
        lexicon = {
            "key_a": _make_entry("m1", members, ["sent1", "sent2"], frequency_rank=100),
            "key_b": _make_entry("m2", members, ["sent1", "sent3"], frequency_rank=200),
        }

        removed = _deduplicate_confusable_examples(lexicon)

        assert removed == 0
|
||||
|
|
@ -1,83 +0,0 @@
|
|||
"""Integration tests for frequency-based sentence scoring in update_words_json."""
|
||||
|
||||
|
||||
def _make_sentence(text, source="test", match_method="direct", word_count=None, char_offset=0, char_end=3):
|
||||
"""Build a minimal sentence dict as match_sentences would produce."""
|
||||
if word_count is None:
|
||||
word_count = len(text.split())
|
||||
return {
|
||||
"text": text,
|
||||
"source": source,
|
||||
"match_method": match_method,
|
||||
"word_count": word_count,
|
||||
"char_offset": char_offset,
|
||||
"char_end": char_end,
|
||||
}
|
||||
|
||||
|
||||
class TestScoringIntegration:
    """Checks that update_words_json applies frequency scoring to matched sentences."""

    @staticmethod
    def _single_word_store() -> dict:
        """Return a fresh one-entry words mapping for טוֹב with empty examples."""
        return {
            "טוֹב": {
                "word": {"nikkud": "טוֹב", "ktiv_male": "טוב"},
                "examples": {},
            }
        }

    def test_cloze_has_difficulty_score(self):
        """The generated cloze dict carries an integer difficulty_score."""
        from epub_examples import update_words_json

        store = self._single_word_store()
        candidates = {
            "טוֹב": [
                _make_sentence("הוּא אָדָם טוֹב מְאוֹד", char_offset=10, char_end=13),
            ]
        }

        update_words_json(store, candidates, confusable_keys=set())

        cloze = store["טוֹב"]["examples"].get("cloze")
        assert cloze is not None
        assert "difficulty_score" in cloze
        assert isinstance(cloze["difficulty_score"], int)

    def test_vetted_sorted_by_difficulty(self):
        """All three matched sentences end up in the vetted list."""
        from epub_examples import update_words_json

        store = self._single_word_store()
        candidates = {
            "טוֹב": [
                _make_sentence("הוּא טוֹב", char_offset=4, char_end=7),
                _make_sentence("הַתַּפְנִיט טוֹב בְּיוֹתֵר", char_offset=10, char_end=13),
                _make_sentence("אֲנִי טוֹב הַיּוֹם", char_offset=5, char_end=8),
            ]
        }

        update_words_json(store, candidates, confusable_keys=set())

        # NOTE(review): the test name promises an easiest-first ordering, but
        # only the number of vetted sentences is asserted here — confirm
        # whether an ordering assertion should be added.
        vetted = store["טוֹב"]["examples"]["vetted"]
        assert len(vetted) == 3

    def test_easiest_sentence_becomes_cloze(self):
        """Of an easy and a hard candidate, the easy one is chosen as the cloze."""
        from epub_examples import update_words_json

        store = self._single_word_store()
        easy_text = "הוּא טוֹב מְאוֹד"
        hard_text = "הַפַּרְנָסִימוֹן טוֹב לְהַפְלִיא"
        # The hard sentence is listed first, so input order alone cannot pick the winner.
        candidates = {
            "טוֹב": [
                _make_sentence(hard_text, char_offset=14, char_end=17),
                _make_sentence(easy_text, char_offset=4, char_end=7),
            ]
        }

        update_words_json(store, candidates, confusable_keys=set())

        chosen = store["טוֹב"]["examples"]["cloze"]
        assert chosen["text"] == easy_text
|
||||
|
|
@ -1,441 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Integration tests: scrape real pealim.com pages and validate data.
|
||||
|
||||
These tests hit pealim.com directly. They are skipped when the environment
|
||||
variable SKIP_INTEGRATION is set to any non-empty string.
|
||||
|
||||
Run with:
|
||||
pytest tests/test_scraper_integration.py -v -m integration
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
# Add project root to path so all sibling modules are importable
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
|
||||
import pealim_detail_scrape
|
||||
import pealim_list_scrape
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Skip marker
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Decorator for the network-hitting test classes: they are skipped whenever
# the SKIP_INTEGRATION environment variable holds any non-empty string.
skip_integration = pytest.mark.skipif(
    bool(os.environ.get("SKIP_INTEGRATION", "")),
    reason="SKIP_INTEGRATION is set",
)

# A known Hif'il verb slug that is not page-1 dependent.
# לְהַגִּיד (to tell/say) — Hif'il, slug 1135-lehagid
HIFIL_VERB_SLUG = "1135-lehagid"
HIFIL_VERB_NIKKUD = "לְהַגִּיד"
HIFIL_VERB_MEANING = "to say, to tell"

# Minimum number of entries a single scraped list page is expected to yield.
MIN_LIST_ENTRIES = 10

# Matches one Hebrew letter (U+05D0 alef through U+05EA tav). Nikkud points
# lie outside this range, so a string of points alone does not match.
HEBREW_CHAR_RE = re.compile(r"[\u05d0-\u05ea]")

# Slug pattern: one or more digits, a hyphen, then one or more word chars,
# anchored at both ends (e.g. "1135-lehagid").
SLUG_RE = re.compile(r"^\d+-\w+$")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _has_hebrew(text: str) -> bool:
    """Tell whether *text* contains at least one character matched by HEBREW_CHAR_RE."""
    first_hit = HEBREW_CHAR_RE.search(text)
    return first_hit is not None
|
||||
|
||||
|
||||
def _words_from_file(path: Path) -> dict:
|
||||
with path.open(encoding="utf-8") as fh:
|
||||
return json.load(fh)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test class: list page scrape
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.integration
@skip_integration
class TestListScrape:
    """Validate pealim_list_scrape against a real /dict/?page=1 fetch."""

    def _scrape_page_one(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
        """Run a forced one-page list scrape redirected into *tmp_path*.

        The scraper's WORDS_JSON and PROGRESS_JSON module attributes are
        patched to files under *tmp_path*; the patched words.json path is
        returned.
        """
        words_path = tmp_path / "words.json"
        progress_path = tmp_path / "list_scrape_progress.json"
        monkeypatch.setattr(pealim_list_scrape, "WORDS_JSON", words_path)
        monkeypatch.setattr(pealim_list_scrape, "PROGRESS_JSON", progress_path)

        # Scrape exactly one page
        pealim_list_scrape.run_scrape(total_pages=1, force_refresh=True)
        return words_path

    def test_list_page_1_produces_entries(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Page 1 must yield at least MIN_LIST_ENTRIES entries in words.json."""
        words_path = self._scrape_page_one(tmp_path, monkeypatch)

        assert words_path.exists(), "words.json was not created after scrape"
        words = _words_from_file(words_path)
        assert len(words) >= MIN_LIST_ENTRIES, (
            f"Expected at least {MIN_LIST_ENTRIES} entries from page 1, got {len(words)}"
        )

    def test_list_entries_have_required_fields(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Every entry must have non-empty nikkud, ktiv_male, slug, pos, meaning."""
        words = _words_from_file(self._scrape_page_one(tmp_path, monkeypatch))

        for key, entry in words.items():
            spelling = entry.get("word", {})
            nikkud = spelling.get("nikkud", "")
            ktiv_male = spelling.get("ktiv_male", "")
            slug = entry.get("slug", "")
            pos = entry.get("pos", "")
            meaning = entry.get("meaning", "")

            assert nikkud, f"Entry '{key}': word.nikkud is empty"
            assert _has_hebrew(nikkud), f"Entry '{key}': word.nikkud has no Hebrew chars: {nikkud!r}"
            assert ktiv_male, f"Entry '{key}': word.ktiv_male is empty"
            assert slug, f"Entry '{key}': slug is empty"
            assert SLUG_RE.match(slug), f"Entry '{key}': slug does not match \\d+-\\w+ pattern: {slug!r}"
            assert pos, f"Entry '{key}': pos is empty"
            assert meaning, f"Entry '{key}': meaning is empty"

    def test_list_at_least_one_entry_has_root(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """At least one entry on page 1 must have a non-empty root list."""
        words = _words_from_file(self._scrape_page_one(tmp_path, monkeypatch))

        rooted = [entry for entry in words.values() if entry.get("root")]
        assert rooted, "No entries on page 1 have a non-empty root list"

    def test_list_at_least_one_entry_has_audio(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """At least one entry on page 1 must have a non-empty audio_url."""
        words = _words_from_file(self._scrape_page_one(tmp_path, monkeypatch))

        voiced = [entry for entry in words.values() if entry.get("audio_url")]
        assert voiced, "No entries on page 1 have a non-empty audio_url"

    def test_list_post_process_fields_exist(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Every scraped entry must carry 'confusable_group' and a list-valued 'shared_roots'."""
        words = _words_from_file(self._scrape_page_one(tmp_path, monkeypatch))

        for key, entry in words.items():
            assert "confusable_group" in entry, f"Entry '{key}' missing 'confusable_group' key"
            assert "shared_roots" in entry, f"Entry '{key}' missing 'shared_roots' key"
            assert isinstance(entry["shared_roots"], list), f"Entry '{key}': shared_roots is not a list"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test class: noun detail scrape
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.integration
@skip_integration
class TestDetailScrapeNoun:
    """Validate pealim_detail_scrape for a real noun detail page."""

    def _find_noun_with_root(self, words: dict) -> tuple[str, dict] | None:
        """Return the first (key, entry) whose pos starts with "Noun" and that
        has a truthy root and slug, or None when there is no such entry."""
        candidates = (
            (key, entry)
            for key, entry in words.items()
            if entry.get("pos", "").startswith("Noun") and entry.get("root") and entry.get("slug")
        )
        return next(candidates, None)

    def _prepare_words_json(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> tuple[Path, dict]:
        """Scrape list page 1 into a fresh words.json under *tmp_path*.

        The list scraper's WORDS_JSON and PROGRESS_JSON are monkeypatched to
        temporary files. Returns (words.json path, loaded words).
        """
        words_path = tmp_path / "words.json"
        monkeypatch.setattr(pealim_list_scrape, "WORDS_JSON", words_path)
        monkeypatch.setattr(pealim_list_scrape, "PROGRESS_JSON", tmp_path / "list_scrape_progress.json")

        pealim_list_scrape.run_scrape(total_pages=1, force_refresh=True)
        return words_path, _words_from_file(words_path)

    def _scrape_noun_details(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> tuple[str, dict, dict]:
        """List-scrape page 1, pick a rooted noun, then run the noun detail scrape.

        Returns (noun key, the noun's entry from the list scrape, words
        reloaded from disk after the detail scrape).
        """
        words_path, listed = self._prepare_words_json(tmp_path, monkeypatch)

        pair = self._find_noun_with_root(listed)
        assert pair is not None, "No noun with a root found on page 1"
        noun_key, noun_entry = pair

        # Point the detail scraper at the same temporary words.json.
        monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)

        # Small rate-limit delay between list scrape and detail scrape
        time.sleep(1.0)

        pealim_detail_scrape.run(force_refresh=True, nouns_only=True)
        return noun_key, noun_entry, _words_from_file(words_path)

    def test_noun_detail_inflection_not_null(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """After detail scrape, noun_inflection must not be null."""
        noun_key, noun_entry, updated_words = self._scrape_noun_details(tmp_path, monkeypatch)

        entry = updated_words.get(noun_key, {})
        assert entry.get("noun_inflection") is not None, (
            f"noun_inflection is None after detail scrape for '{noun_key}' (slug={noun_entry.get('slug')})"
        )

    def test_noun_detail_singular_and_plural_forms(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Noun singular and plural forms must have non-empty nikkud and ktiv_male."""
        noun_key, _noun_entry, updated_words = self._scrape_noun_details(tmp_path, monkeypatch)

        inflection = updated_words[noun_key].get("noun_inflection", {}) or {}
        singular = inflection.get("singular") or {}
        plural = inflection.get("plural") or {}

        assert singular.get("nikkud"), f"noun_inflection.singular.nikkud is empty for '{noun_key}'"
        assert singular.get("ktiv_male"), f"noun_inflection.singular.ktiv_male is empty for '{noun_key}'"
        assert plural.get("nikkud"), f"noun_inflection.plural.nikkud is empty for '{noun_key}'"
        assert plural.get("ktiv_male"), f"noun_inflection.plural.ktiv_male is empty for '{noun_key}'"

    def test_noun_detail_gender(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Noun gender must be 'masculine' or 'feminine'."""
        noun_key, _noun_entry, updated_words = self._scrape_noun_details(tmp_path, monkeypatch)

        inflection = updated_words[noun_key].get("noun_inflection", {}) or {}
        gender = inflection.get("gender", "")
        assert gender in ("masculine", "feminine"), (
            f"noun_inflection.gender is {gender!r} for '{noun_key}' (expected 'masculine' or 'feminine')"
        )

    def test_noun_detail_scraped_flag(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """detail_scraped must be True after a successful noun detail scrape."""
        noun_key, _, updated_words = self._scrape_noun_details(tmp_path, monkeypatch)

        assert updated_words[noun_key].get("detail_scraped") is True, (
            f"detail_scraped is not True after scrape for '{noun_key}'"
        )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test class: verb detail scrape (Hif'il)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.integration
|
||||
@skip_integration
|
||||
class TestDetailScrapeVerb:
|
||||
"""Validate pealim_detail_scrape for a known Hif'il verb (lehagid, slug 4183-lehagid)."""
|
||||
|
||||
def _build_test_words_json(self, tmp_path: Path) -> Path:
|
||||
"""
|
||||
Write a minimal words.json containing only the known Hif'il verb entry.
|
||||
The detail scraper's run() will pick it up because pos starts with 'Verb'
|
||||
and detail_scraped is absent/False.
|
||||
"""
|
||||
words_path = tmp_path / "words.json"
|
||||
entry = {
|
||||
"word": {"nikkud": HIFIL_VERB_NIKKUD, "ktiv_male": "להגיד"},
|
||||
"slug": HIFIL_VERB_SLUG,
|
||||
"root": ["נ", "ג", "ד"],
|
||||
"pos": "Verb",
|
||||
"pos_hebrew": "פֹּעַל — הִפְעִיל",
|
||||
"meaning": HIFIL_VERB_MEANING,
|
||||
"meaning_raw": HIFIL_VERB_MEANING,
|
||||
"audio_url": "",
|
||||
"audio_file": "להגיד.mp3",
|
||||
"tags": "שורש::נגד פעלים",
|
||||
"last_scrape_date": "2026-03-08",
|
||||
"vocab_legacy_guid": None,
|
||||
"frequency": None,
|
||||
"pseudo_frequency": None,
|
||||
"emoji": None,
|
||||
"emoji_source": None,
|
||||
"emoji_visible": False,
|
||||
"image": None,
|
||||
"image_source": None,
|
||||
"hint": "",
|
||||
"shared_roots": [],
|
||||
"confusable_group": None,
|
||||
"confusables_guid": None,
|
||||
"examples": None,
|
||||
"noun_inflection": None,
|
||||
"conjugation": None,
|
||||
"adjective_inflection": None,
|
||||
"preposition_inflection": None,
|
||||
# Intentionally no detail_scraped key so the scraper processes it
|
||||
}
|
||||
words = {HIFIL_VERB_NIKKUD: entry}
|
||||
with words_path.open("w", encoding="utf-8") as fh:
|
||||
json.dump(words, fh, ensure_ascii=False, indent=2)
|
||||
return words_path
|
||||
|
||||
def test_verb_detail_conjugation_not_null(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
|
||||
"""After detail scrape, conjugation must not be null for the Hif'il verb."""
|
||||
words_path = self._build_test_words_json(tmp_path)
|
||||
monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
|
||||
|
||||
pealim_detail_scrape.run(test=1, force_refresh=True, verbs_only=True)
|
||||
|
||||
words = _words_from_file(words_path)
|
||||
entry = words.get(HIFIL_VERB_NIKKUD, {})
|
||||
assert entry.get("conjugation") is not None, f"conjugation is None after detail scrape for {HIFIL_VERB_SLUG}"
|
||||
|
||||
def test_verb_detail_binyan(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
|
||||
"""conjugation.binyan must be \"Hif'il\" and binyan_hebrew must be the correct nikkud."""
|
||||
words_path = self._build_test_words_json(tmp_path)
|
||||
monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
|
||||
|
||||
pealim_detail_scrape.run(test=1, force_refresh=True, verbs_only=True)
|
||||
|
||||
words = _words_from_file(words_path)
|
||||
conj = words[HIFIL_VERB_NIKKUD].get("conjugation") or {}
|
||||
|
||||
assert conj.get("binyan") == "Hif'il", f"Expected binyan='Hif\\'il', got {conj.get('binyan')!r}"
|
||||
assert conj.get("binyan_hebrew") == "הִפְעִיל", (
|
||||
f"Expected binyan_hebrew='הִפְעִיל', got {conj.get('binyan_hebrew')!r}"
|
||||
)
|
||||
|
||||
def test_verb_detail_infinitive_and_reference_form(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
|
||||
"""infinitive.nikkud and reference_form.nikkud must be non-empty Hebrew strings."""
|
||||
words_path = self._build_test_words_json(tmp_path)
|
||||
monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
|
||||
|
||||
pealim_detail_scrape.run(test=1, force_refresh=True, verbs_only=True)
|
||||
|
||||
words = _words_from_file(words_path)
|
||||
conj = words[HIFIL_VERB_NIKKUD].get("conjugation") or {}
|
||||
|
||||
infinitive = conj.get("infinitive") or {}
|
||||
reference_form = conj.get("reference_form") or {}
|
||||
|
||||
inf_nikkud = infinitive.get("nikkud", "")
|
||||
ref_nikkud = reference_form.get("nikkud", "")
|
||||
|
||||
assert inf_nikkud and _has_hebrew(inf_nikkud), (
|
||||
f"infinitive.nikkud is empty or has no Hebrew chars: {inf_nikkud!r}"
|
||||
)
|
||||
assert ref_nikkud and _has_hebrew(ref_nikkud), (
|
||||
f"reference_form.nikkud (3ms past) is empty or has no Hebrew chars: {ref_nikkud!r}"
|
||||
)
|
||||
|
||||
def test_verb_detail_active_forms_count_and_structure(
|
||||
self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
|
||||
) -> None:
|
||||
"""active_forms must be a list of at least 20 entries, each with required sub-fields."""
|
||||
words_path = self._build_test_words_json(tmp_path)
|
||||
monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
|
||||
|
||||
pealim_detail_scrape.run(test=1, force_refresh=True, verbs_only=True)
|
||||
|
||||
words = _words_from_file(words_path)
|
||||
conj = words[HIFIL_VERB_NIKKUD].get("conjugation") or {}
|
||||
active_forms = conj.get("active_forms")
|
||||
|
||||
assert isinstance(active_forms, list), f"active_forms is not a list: {type(active_forms)}"
|
||||
assert len(active_forms) >= 20, f"Expected at least 20 active forms, got {len(active_forms)}"
|
||||
|
||||
for i, form in enumerate(active_forms):
|
||||
assert form.get("person"), f"active_forms[{i}].person is empty"
|
||||
assert form.get("tense"), f"active_forms[{i}].tense is empty"
|
||||
form_block = form.get("form") or {}
|
||||
assert form_block.get("nikkud") and _has_hebrew(form_block["nikkud"]), (
|
||||
f"active_forms[{i}].form.nikkud is empty or has no Hebrew: {form_block.get('nikkud')!r}"
|
||||
)
|
||||
assert form_block.get("ktiv_male") and _has_hebrew(form_block["ktiv_male"]), (
|
||||
f"active_forms[{i}].form.ktiv_male is empty or has no Hebrew: {form_block.get('ktiv_male')!r}"
|
||||
)
|
||||
|
||||
def test_verb_detail_hufal_passive_section(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
|
||||
"""Hif'il verb must have a non-null hufal_pual_forms list and reference_form_passive."""
|
||||
words_path = self._build_test_words_json(tmp_path)
|
||||
monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)
|
||||
|
||||
pealim_detail_scrape.run(test=1, force_refresh=True, verbs_only=True)
|
||||
|
||||
words = _words_from_file(words_path)
|
||||
conj = words[HIFIL_VERB_NIKKUD].get("conjugation") or {}
|
||||
|
||||
hufal_forms = conj.get("hufal_pual_forms")
|
||||
assert hufal_forms is not None, "hufal_pual_forms is None — expected Huf'al passive section for a Hif'il verb"
|
||||
assert isinstance(hufal_forms, list), f"hufal_pual_forms is not a list: {type(hufal_forms)}"
|
||||
assert len(hufal_forms) > 0, "hufal_pual_forms list is empty"
|
||||
|
||||
ref_passive = conj.get("reference_form_passive")
|
||||
assert ref_passive is not None, "reference_form_passive is None — expected a Huf'al 3ms past form"
|
||||
passive_nikkud = (ref_passive or {}).get("nikkud", "")
|
||||
assert passive_nikkud and _has_hebrew(passive_nikkud), (
|
||||
f"reference_form_passive.nikkud is empty or has no Hebrew: {passive_nikkud!r}"
|
||||
)
|
||||
|
||||
def test_verb_detail_scraped_flag(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
    """After a verb detail scrape the entry's ``detail_scraped`` flag is exactly ``True``."""
    json_path = self._build_test_words_json(tmp_path)
    monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", json_path)
    pealim_detail_scrape.run(test=1, force_refresh=True, verbs_only=True)

    verb_entry = _words_from_file(json_path).get(HIFIL_VERB_NIKKUD, {})
    scraped = verb_entry.get("detail_scraped")
    assert scraped is True, f"detail_scraped is not True after scrape for {HIFIL_VERB_SLUG}"
|
||||
|
|
@ -1,207 +0,0 @@
|
|||
"""Tests for sentence difficulty scoring."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
import frequency_lookup
|
||||
from sentence_difficulty import DEFAULT_RANK, _resolve_token_frequency, build_nikkud_map, score_sentence
|
||||
|
||||
|
||||
class TestBuildNikkudMap:
    """Checks that build_nikkud_map maps pointed (nikkud) forms to their ktiv male spelling."""

    def test_maps_direct_headwords(self):
        """A headword's nikkud form is a key whose value is its ktiv male spelling."""
        pointed, plain = "אָב", "אב"
        lexicon = {pointed: {"word": {"nikkud": pointed, "ktiv_male": plain}}}
        assert build_nikkud_map(lexicon)[pointed] == plain

    def test_maps_conjugation_forms(self):
        """An active form and the infinitive of a verb are both present in the map."""
        conjugation = {
            "active_forms": [
                {
                    "person": "1s",
                    "tense": "עָבָר",
                    "form": {"nikkud": "שָׁמַרְתִּי", "ktiv_male": "שמרתי"},
                },
            ],
            "infinitive": {"nikkud": "לִשְׁמֹר", "ktiv_male": "לשמור"},
            "reference_form": {"nikkud": "שָׁמַר", "ktiv_male": "שמר"},
        }
        lexicon = {
            "שָׁמַר": {
                "word": {"nikkud": "שָׁמַר", "ktiv_male": "שמר"},
                "conjugation": conjugation,
            }
        }
        mapping = build_nikkud_map(lexicon)
        assert mapping["שָׁמַרְתִּי"] == "שמרתי"
        assert mapping["לִשְׁמֹר"] == "לשמור"

    def test_maps_noun_inflections(self):
        """The plural and a pronominal-suffix form of a noun are both present in the map."""
        inflection = {
            "singular": {"nikkud": "אָב", "ktiv_male": "אב"},
            "plural": {"nikkud": "אָבוֹת", "ktiv_male": "אבות"},
            "pronominal_suffixes": {"1s": {"nikkud": "אָבִי", "ktiv_male": "אבי"}},
        }
        lexicon = {
            "אָב": {
                "word": {"nikkud": "אָב", "ktiv_male": "אב"},
                "noun_inflection": inflection,
            }
        }
        mapping = build_nikkud_map(lexicon)
        assert mapping["אָבוֹת"] == "אבות"
        assert mapping["אָבִי"] == "אבי"

    def test_maps_adjective_inflections(self):
        """The feminine-singular and masculine-plural adjective forms are present in the map."""
        inflection = {
            "ms": {"nikkud": "גָּדוֹל", "ktiv_male": "גדול"},
            "fs": {"nikkud": "גְּדוֹלָה", "ktiv_male": "גדולה"},
            "mp": {"nikkud": "גְּדוֹלִים", "ktiv_male": "גדולים"},
            "fp": {"nikkud": "גְּדוֹלוֹת", "ktiv_male": "גדולות"},
        }
        lexicon = {
            "גָּדוֹל": {
                "word": {"nikkud": "גָּדוֹל", "ktiv_male": "גדול"},
                "adjective_inflection": inflection,
            }
        }
        mapping = build_nikkud_map(lexicon)
        assert mapping["גְּדוֹלָה"] == "גדולה"
        assert mapping["גְּדוֹלִים"] == "גדולים"

    def test_construct_forms_strip_maqaf(self):
        """A construct form ending in a maqaf is a key both with and without the maqaf."""
        lexicon = {
            "בֵּית": {
                "word": {"nikkud": "בֵּית", "ktiv_male": "בית"},
                "noun_inflection": {
                    "construct_singular": {"nikkud": "בֵּית־", "ktiv_male": "בית"},
                },
            }
        }
        mapping = build_nikkud_map(lexicon)
        for expected_key in ("בֵּית־", "בֵּית"):
            assert expected_key in mapping

    def test_handles_missing_fields(self):
        """Inflection sections set to None do not stop the headword from being mapped."""
        entry = {
            "word": {"nikkud": "טֶסְט", "ktiv_male": "טסט"},
            "conjugation": None,
            "noun_inflection": None,
            "adjective_inflection": None,
        }
        mapping = build_nikkud_map({"test": entry})
        assert mapping["טֶסְט"] == "טסט"

    def test_real_words_json_coverage(self):
        """With the real data/words.json (skipped when absent) the map has more than 90,000 keys."""
        data_file = Path(__file__).parent.parent / "data" / "words.json"
        if not data_file.exists():
            pytest.skip("words.json not available")
        lexicon = json.loads(data_file.read_text(encoding="utf-8"))
        assert len(build_nikkud_map(lexicon)) > 90_000
|
||||
|
||||
|
||||
class TestResolveTokenFrequency:
    """Tests for ``_resolve_token_frequency`` that run against the real data files."""

    @pytest.fixture()
    def freq_setup(self):
        """Return ``(nikkud_map, nikkud_index, freq_data)`` built from data/words.json.

        The test is skipped when words.json is absent. That check runs before
        anything is loaded, so a skipped test does not pay for reading the
        frequency table.
        """
        words_path = Path(__file__).parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")

        frequency_lookup.load()
        freq_data = frequency_lookup.get_freq_data()
        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)

        # Imported here rather than at module level, as in the original fixture.
        from epub_examples import _build_nikkud_index

        nikkud_map = build_nikkud_map(words)
        nikkud_index = _build_nikkud_index(words)
        return nikkud_map, nikkud_index, freq_data

    def test_tier1_known_mapping(self, freq_setup):
        """A pointed headword resolves to a rank below 50,000."""
        nikkud_map, nikkud_index, freq_data = freq_setup
        rank = _resolve_token_frequency("אָב", nikkud_map, nikkud_index, freq_data)
        assert rank is not None
        assert rank < 50_000

    def test_tier3_academy_converter(self, freq_setup):
        """A very common pointed word resolves to a rank below 1,000."""
        nikkud_map, nikkud_index, freq_data = freq_setup
        rank = _resolve_token_frequency("שָׁלוֹם", nikkud_map, nikkud_index, freq_data)
        assert rank is not None
        assert rank < 1000

    def test_unknown_token_returns_default(self, freq_setup):
        """A token that cannot be resolved gets the fallback rank."""
        nikkud_map, nikkud_index, freq_data = freq_setup
        rank = _resolve_token_frequency("קְסַנְתּוֹפּוּלוֹס", nikkud_map, nikkud_index, freq_data)
        # NOTE(review): 50_000 presumably equals the imported DEFAULT_RANK — confirm and use the constant.
        assert rank == 50_000

    def test_tier5_ktiv_male_prefix_strip(self, freq_setup):
        """The frequency table contains the unpointed base form used for prefix stripping."""
        # NOTE(review): only the frequency table is inspected here; no prefixed
        # token is passed to _resolve_token_frequency, so the prefix-strip path
        # itself is not exercised by this test.
        _, _, freq_data = freq_setup
        assert freq_data.get("שלום") is not None
|
||||
|
||||
class TestScoreSentence:
    """Tests for ``score_sentence`` that run against the real data files."""

    @pytest.fixture()
    def scoring_setup(self):
        """Return ``(nikkud_map, nikkud_index, freq_data)`` built from data/words.json.

        The test is skipped when words.json is absent. That check runs before
        anything is loaded, so a skipped test does not pay for reading the
        frequency table.
        """
        words_path = Path(__file__).parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")

        frequency_lookup.load()
        freq_data = frequency_lookup.get_freq_data()
        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)

        # Imported here rather than at module level, as in the original fixture.
        from epub_examples import _build_nikkud_index

        nikkud_map = build_nikkud_map(words)
        nikkud_index = _build_nikkud_index(words)
        return nikkud_map, nikkud_index, freq_data

    @staticmethod
    def _span(text, target):
        """Return the ``(start, end)`` character offsets of the first ``target`` in ``text``."""
        start = text.index(target)
        return start, start + len(target)

    def test_returns_integer(self, scoring_setup):
        """The score of an ordinary sentence is an ``int``."""
        nmap, nidx, freq = scoring_setup
        text = "הוּא הָלַךְ הַבַּיְתָה"
        start, end = self._span(text, "הָלַךְ")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_easy_sentence_scores_lower(self, scoring_setup):
        """A sentence of everyday words scores below one built from rare words."""
        nmap, nidx, freq = scoring_setup
        easy = "הוּא אָמַר שָׁלוֹם"
        easy_start, easy_end = self._span(easy, "אָמַר")
        hard = "הַפַּרְדֵּס נִשְׁתַּטֵּחַ בַּדַּהֲרָה"
        hard_start, hard_end = self._span(hard, "נִשְׁתַּטֵּחַ")
        easy_score = score_sentence(easy, easy_start, easy_end, nmap, nidx, freq)
        hard_score = score_sentence(hard, hard_start, hard_end, nmap, nidx, freq)
        assert easy_score < hard_score

    def test_single_context_token(self, scoring_setup):
        """A two-word sentence whose first word is the target still yields an ``int``."""
        nmap, nidx, freq = scoring_setup
        text = "הוּא טוֹב"
        start = 0
        end = len("הוּא")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_handles_punctuation(self, scoring_setup):
        """Quotes and an exclamation mark around the words do not break scoring."""
        nmap, nidx, freq = scoring_setup
        text = '"הוּא טוֹב!"'
        start, end = self._span(text, "טוֹב")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_splits_on_maqaf(self, scoring_setup):
        """A maqaf-joined compound in the context still yields an ``int`` score."""
        nmap, nidx, freq = scoring_setup
        text = "בֵּית־סֵפֶר גָּדוֹל"
        start, end = self._span(text, "גָּדוֹל")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_no_context_tokens_returns_default(self, scoring_setup):
        """Two single-letter tokens with the first as target score exactly ``DEFAULT_RANK``."""
        nmap, nidx, freq = scoring_setup
        text = "א ב"
        score = score_sentence(text, 0, 1, nmap, nidx, freq)
        assert score == DEFAULT_RANK
|
||||
|
|
@ -1,58 +0,0 @@
|
|||
"""Smoke tests for the Hebrew Flash Cards project."""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Ensure project root is on path
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
|
||||
|
||||
def test_helpers_strip_nikkud():
    """strip_nikkud unpoints a Hebrew word and returns Latin or empty text unchanged."""
    from helpers import strip_nikkud

    expectations = {"שָׁלוֹם": "שלום", "hello": "hello", "": ""}
    for given, expected in expectations.items():
        assert strip_nikkud(given) == expected
|
||||
|
||||
|
||||
def test_apkg_builder_imports():
    """apkg_builder imports, exposes both deck builders, and has the expected vocab model ID."""
    import apkg_builder

    for builder_name in ("build_vocab_deck", "build_conj_deck"):
        assert hasattr(apkg_builder, builder_name)
    assert apkg_builder.VOCAB_MODEL_ID == 1_701_222_017_968
|
||||
|
||||
|
||||
def test_data_files_exist():
    """data/words.json exists two directory levels above this test file."""
    words_json = Path(__file__).resolve().parents[1] / "data" / "words.json"
    assert words_json.exists(), "words.json missing"
|
||||
|
||||
|
||||
def test_strip_nikkud_idempotent():
    """Text that is already unpointed comes back from strip_nikkud unchanged."""
    from helpers import strip_nikkud

    unpointed = "שלום"
    result = strip_nikkud(unpointed)
    assert result == unpointed
|
||||
|
||||
|
||||
def test_strip_nikkud_all_marks():
    """No character in U+0591..U+05C7 survives strip_nikkud on a pointed word."""
    from helpers import strip_nikkud

    # The sample word carries patach, dagesh, shva and kamatz. The residue
    # check below rejects any character in U+0591..U+05C7, the range of the
    # Unicode Hebrew block that holds the accents and vowel points.
    # NOTE(review): segol, tsere, hiriq, holam and kubutz are not present in
    # this sample; add a word containing them if full coverage is wanted.
    nikkud = "הַמַּלְכָּה"
    plain = strip_nikkud(nikkud)
    assert all(ch < "\u0591" or ch > "\u05c7" for ch in plain), f"Residual nikkud in: {plain}"
|
||||
|
||||
|
||||
def test_categorize_pos_no_substring_match():
    """Regression: 'Pronoun' must be categorized as 'Other', not as 'Noun'."""
    from apkg_builder import _categorize_pos

    # The four main categories map to themselves.
    for label in ("Noun", "Verb", "Adjective", "Adverb"):
        assert _categorize_pos(label) == label

    assert _categorize_pos("Pronoun") == "Other", "Pronoun must not match Noun"

    # Every other part of speech checked here falls into "Other".
    for label in ("Preposition", "Conjunction", "Cardinal numeral"):
        assert _categorize_pos(label) == "Other"
|
||||
134
validate_apkg.py
134
validate_apkg.py
|
|
@ -14,6 +14,7 @@ import json
|
|||
import os
|
||||
import re
|
||||
import sqlite3
|
||||
import struct
|
||||
import sys
|
||||
import tempfile
|
||||
import zipfile
|
||||
|
|
@ -21,9 +22,6 @@ from pathlib import Path
|
|||
|
||||
VOCAB_APKG = Path("output/hebrew_vocabulary.apkg")
|
||||
CONJ_APKG = Path("output/hebrew_conjugations.apkg")
|
||||
CONF_APKG = Path("output/hebrew_confusables.apkg")
|
||||
PLURAL_APKG = Path("output/hebrew_plurals.apkg")
|
||||
COMPLETE_APKG = Path("output/hebrew_complete.apkg")
|
||||
|
||||
PASS = "\033[32m✓\033[0m"
|
||||
FAIL = "\033[31m✗\033[0m"
|
||||
|
|
@ -62,6 +60,7 @@ def _detect_format(data: bytes) -> str:
|
|||
|
||||
def validate_apkg(apkg_path: Path) -> int:
|
||||
"""Run all checks. Returns number of failures."""
|
||||
name = apkg_path.name
|
||||
print(f"\n{'='*60}")
|
||||
print(f" Validating: {apkg_path}")
|
||||
print(f"{'='*60}")
|
||||
|
|
@ -79,17 +78,16 @@ def validate_apkg(apkg_path: Path) -> int:
|
|||
print("\n[ZIP structure]")
|
||||
try:
|
||||
zf = zipfile.ZipFile(apkg_path)
|
||||
except zipfile.BadZipFile as e:
|
||||
print(f" {FAIL} Invalid ZIP: {e}")
|
||||
return 1
|
||||
|
||||
with zf, tempfile.TemporaryDirectory() as tmpdir:
|
||||
namelist = zf.namelist()
|
||||
has_db = "collection.anki2" in namelist
|
||||
has_media = "media" in namelist
|
||||
failures += 0 if check("collection.anki2 present", has_db) else 1
|
||||
failures += 0 if check("media manifest present", has_media) else 1
|
||||
except zipfile.BadZipFile as e:
|
||||
print(f" {FAIL} Invalid ZIP: {e}")
|
||||
return 1
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdir:
|
||||
zf.extractall(tmpdir)
|
||||
|
||||
# --- Media manifest ---
|
||||
|
|
@ -118,11 +116,8 @@ def validate_apkg(apkg_path: Path) -> int:
|
|||
size = zf.getinfo(num).file_size if num in zf.NameToInfo else -1
|
||||
if size == 0:
|
||||
zero_byte.append(orig)
|
||||
failures += (
|
||||
0
|
||||
if check("No zero-byte media files", len(zero_byte) == 0, f"{len(zero_byte)} empty" if zero_byte else "")
|
||||
else 1
|
||||
)
|
||||
failures += 0 if check("No zero-byte media files", len(zero_byte) == 0,
|
||||
f"{len(zero_byte)} empty" if zero_byte else "") else 1
|
||||
|
||||
# Check audio format sample (first 20 mp3s)
|
||||
mp3_names = [num for num, orig in media_map.items() if orig.endswith(".mp3")]
|
||||
|
|
@ -132,19 +127,16 @@ def validate_apkg(apkg_path: Path) -> int:
|
|||
fmt = _detect_format(data)
|
||||
if "MP3" not in fmt:
|
||||
bad_format.append(f"{media_map[num]}: {fmt}")
|
||||
failures += (
|
||||
0
|
||||
if check(
|
||||
failures += 0 if check(
|
||||
f"Audio format (sampled {min(20, len(mp3_names))} files)",
|
||||
len(bad_format) == 0,
|
||||
"; ".join(bad_format) if bad_format else "all MP3",
|
||||
)
|
||||
else 1
|
||||
)
|
||||
"; ".join(bad_format) if bad_format else f"all MP3",
|
||||
) else 1
|
||||
|
||||
# Fonts present
|
||||
font_files = [v for v in original_names if v.endswith(".ttf")]
|
||||
check("Heebo font files bundled", len(font_files) >= 1, ", ".join(font_files) if font_files else "none found")
|
||||
check("Heebo font files bundled", len(font_files) >= 1,
|
||||
", ".join(font_files) if font_files else "none found")
|
||||
|
||||
# --- Database ---
|
||||
print("\n[Database]")
|
||||
|
|
@ -152,7 +144,8 @@ def validate_apkg(apkg_path: Path) -> int:
|
|||
conn = sqlite3.connect(db_path)
|
||||
|
||||
schema_ver = conn.execute("SELECT ver FROM col").fetchone()[0]
|
||||
failures += 0 if check("Schema version 11 (Anki 2.1)", schema_ver == 11, f"got {schema_ver}") else 1
|
||||
failures += 0 if check("Schema version 11 (Anki 2.1)", schema_ver == 11,
|
||||
f"got {schema_ver}") else 1
|
||||
|
||||
note_count = conn.execute("SELECT COUNT(*) FROM notes").fetchone()[0]
|
||||
card_count = conn.execute("SELECT COUNT(*) FROM cards").fetchone()[0]
|
||||
|
|
@ -160,37 +153,33 @@ def validate_apkg(apkg_path: Path) -> int:
|
|||
failures += 0 if check("Cards present", card_count > 0, f"{card_count:,} cards") else 1
|
||||
|
||||
# Determine expected cards per note from model templates
|
||||
# Some templates are optional (e.g. cloze only generates when field is non-empty),
|
||||
# so we check that cards fall between min and max expected range.
|
||||
models_json_raw = conn.execute("SELECT models FROM col").fetchone()[0]
|
||||
models_raw = json.loads(models_json_raw)
|
||||
tmpl_counts = [len(m["tmpls"]) for m in models_raw.values()]
|
||||
if len(set(tmpl_counts)) == 1 and len(tmpl_counts) == 1:
|
||||
expected_ratio = tmpl_counts[0]
|
||||
# Allow fewer cards when optional templates exist (e.g. cloze)
|
||||
min_cards = note_count # at least 1 card per note
|
||||
max_cards = note_count * expected_ratio
|
||||
failures += (
|
||||
0
|
||||
if check(
|
||||
f"Cards per note (1–{expected_ratio} templates)",
|
||||
min_cards <= card_count <= max_cards,
|
||||
f"{card_count:,} cards from {note_count:,} notes",
|
||||
)
|
||||
else 1
|
||||
)
|
||||
expected_ratio = tmpl_counts[0] if len(set(tmpl_counts)) == 1 else None
|
||||
if expected_ratio:
|
||||
failures += 0 if check(
|
||||
f"{expected_ratio} card(s) per note",
|
||||
card_count == note_count * expected_ratio,
|
||||
f"{note_count} notes × {expected_ratio} = {note_count * expected_ratio}, got {card_count}",
|
||||
) else 1
|
||||
|
||||
# Duplicate GUIDs
|
||||
dup_guids = conn.execute("SELECT guid, COUNT(*) c FROM notes GROUP BY guid HAVING c > 1").fetchall()
|
||||
failures += 0 if check("No duplicate GUIDs", len(dup_guids) == 0, f"{len(dup_guids)} duplicates") else 1
|
||||
dup_guids = conn.execute(
|
||||
"SELECT guid, COUNT(*) c FROM notes GROUP BY guid HAVING c > 1"
|
||||
).fetchall()
|
||||
failures += 0 if check("No duplicate GUIDs", len(dup_guids) == 0,
|
||||
f"{len(dup_guids)} duplicates") else 1
|
||||
|
||||
# Card queue states
|
||||
queues = conn.execute("SELECT type, queue, COUNT(*) FROM cards GROUP BY type, queue").fetchall()
|
||||
queues = conn.execute(
|
||||
"SELECT type, queue, COUNT(*) FROM cards GROUP BY type, queue"
|
||||
).fetchall()
|
||||
queue_map = {(t, q): cnt for t, q, cnt in queues}
|
||||
new_cards = queue_map.get((0, 0), 0)
|
||||
suspended = queue_map.get((0, -1), 0) + queue_map.get((1, -1), 0) + queue_map.get((2, -1), 0)
|
||||
if new_cards > 0:
|
||||
check("Cards in new queue (type=0, queue=0)", True, f"{new_cards:,}")
|
||||
check(f"Cards in new queue (type=0, queue=0)", True, f"{new_cards:,}")
|
||||
if suspended > 0:
|
||||
warn("Suspended cards", f"{suspended:,}")
|
||||
|
||||
|
|
@ -201,18 +190,23 @@ def validate_apkg(apkg_path: Path) -> int:
|
|||
per_days = {dc.get("new", {}).get("perDay") for dc in dconf.values() if isinstance(dc, dict)}
|
||||
check("new.order configured", bool(orders), f"{orders}")
|
||||
if per_days:
|
||||
check("new.perDay > 0", all(p and p > 0 for p in per_days if p is not None), f"perDay={per_days}")
|
||||
check("new.perDay > 0", all(p and p > 0 for p in per_days if p is not None),
|
||||
f"perDay={per_days}")
|
||||
|
||||
# Deck assignment
|
||||
decks_json = conn.execute("SELECT decks FROM col").fetchone()[0]
|
||||
decks = json.loads(decks_json)
|
||||
real_decks = {did: d for did, d in decks.items() if did != "1"}
|
||||
if real_decks:
|
||||
check("Custom deck exists (not Default only)", True, ", ".join(d["name"] for d in real_decks.values()))
|
||||
check("Custom deck exists (not Default only)", True,
|
||||
", ".join(d["name"] for d in real_decks.values()))
|
||||
# All cards in the custom deck?
|
||||
for did_str in real_decks:
|
||||
assigned = conn.execute("SELECT COUNT(*) FROM cards WHERE did=?", [int(did_str)]).fetchone()[0]
|
||||
check(f"Cards in deck '{real_decks[did_str]['name']}'", assigned > 0, f"{assigned:,}/{card_count:,}")
|
||||
assigned = conn.execute(
|
||||
"SELECT COUNT(*) FROM cards WHERE did=?", [int(did_str)]
|
||||
).fetchone()[0]
|
||||
check(f"Cards in deck '{real_decks[did_str]['name']}'", assigned > 0,
|
||||
f"{assigned:,}/{card_count:,}")
|
||||
|
||||
# --- Sound references vs media manifest ---
|
||||
print("\n[Sound references]")
|
||||
|
|
@ -224,25 +218,16 @@ def validate_apkg(apkg_path: Path) -> int:
|
|||
|
||||
missing_audio = sound_refs - original_names
|
||||
orphaned_audio = original_names - sound_refs - set(font_files)
|
||||
failures += (
|
||||
0
|
||||
if check(
|
||||
"All sound refs in media manifest",
|
||||
len(missing_audio) == 0,
|
||||
f"{len(missing_audio)} missing" if missing_audio else "",
|
||||
)
|
||||
else 1
|
||||
)
|
||||
failures += 0 if check("All sound refs in media manifest", len(missing_audio) == 0,
|
||||
f"{len(missing_audio)} missing" if missing_audio else "") else 1
|
||||
if orphaned_audio:
|
||||
warn("Media files not referenced by any card", f"{len(orphaned_audio)} orphaned")
|
||||
|
||||
notes_with_audio = sum(1 for (flds,) in notes_flds if "[sound:" in flds)
|
||||
notes_with_audio = sum(
|
||||
1 for (flds,) in notes_flds if "[sound:" in flds
|
||||
)
|
||||
pct = notes_with_audio / note_count * 100 if note_count else 0
|
||||
if notes_with_audio > 0:
|
||||
check("Notes with audio", True, f"{notes_with_audio:,}/{note_count:,} ({pct:.0f}%)")
|
||||
else:
|
||||
# Non-audio variants intentionally have no audio — not a failure
|
||||
warn("No audio in this deck variant", f"0/{note_count:,}")
|
||||
check(f"Notes with audio", notes_with_audio > 0, f"{notes_with_audio:,}/{note_count:,} ({pct:.0f}%)")
|
||||
|
||||
# --- Empty fields check ---
|
||||
print("\n[Field content]")
|
||||
|
|
@ -251,12 +236,22 @@ def validate_apkg(apkg_path: Path) -> int:
|
|||
field_names = [f["name"] for f in model["flds"]]
|
||||
# Check required fields (first 3) are not empty
|
||||
required_idx = list(range(min(3, len(field_names))))
|
||||
all_notes_for_model = conn.execute("SELECT flds FROM notes WHERE mid=?", [int(mid_str)]).fetchall()
|
||||
for idx in required_idx:
|
||||
fname = field_names[idx]
|
||||
empty_count = conn.execute(
|
||||
"""SELECT COUNT(*) FROM notes
|
||||
WHERE mid=? AND (
|
||||
flds LIKE ? OR
|
||||
instr(flds, char(31)) = 0
|
||||
)""",
|
||||
[int(mid_str), "\x1f" * idx + "\x1f%"],
|
||||
).fetchone()[0]
|
||||
# Simpler: count notes where field idx is empty
|
||||
all_notes_for_model = conn.execute(
|
||||
"SELECT flds FROM notes WHERE mid=?", [int(mid_str)]
|
||||
).fetchall()
|
||||
empty = sum(
|
||||
1
|
||||
for (flds,) in all_notes_for_model
|
||||
1 for (flds,) in all_notes_for_model
|
||||
if len(flds.split("\x1f")) <= idx or not flds.split("\x1f")[idx].strip()
|
||||
)
|
||||
if empty > 0:
|
||||
|
|
@ -276,9 +271,6 @@ def main() -> None:
|
|||
group = parser.add_mutually_exclusive_group()
|
||||
group.add_argument("--vocab", action="store_true", help="Validate vocabulary deck only")
|
||||
group.add_argument("--conjugations", action="store_true", help="Validate conjugation deck only")
|
||||
group.add_argument("--confusables", action="store_true", help="Validate confusables deck only")
|
||||
group.add_argument("--plurals", action="store_true", help="Validate plurals deck only")
|
||||
group.add_argument("--complete", action="store_true", help="Validate complete combined deck only")
|
||||
args = parser.parse_args()
|
||||
|
||||
targets: list[Path] = []
|
||||
|
|
@ -288,14 +280,8 @@ def main() -> None:
|
|||
targets = [VOCAB_APKG]
|
||||
elif args.conjugations:
|
||||
targets = [CONJ_APKG]
|
||||
elif args.confusables:
|
||||
targets = [CONF_APKG]
|
||||
elif args.plurals:
|
||||
targets = [PLURAL_APKG]
|
||||
elif args.complete:
|
||||
targets = [COMPLETE_APKG]
|
||||
else:
|
||||
targets = [VOCAB_APKG, CONJ_APKG, CONF_APKG, PLURAL_APKG, COMPLETE_APKG]
|
||||
targets = [VOCAB_APKG, CONJ_APKG]
|
||||
|
||||
total_failures = 0
|
||||
for path in targets:
|
||||
|
|
|
|||
251
validate_verb_list.py
Normal file
251
validate_verb_list.py
Normal file
|
|
@ -0,0 +1,251 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validate nevo_typed_verbs_from_modern_hebrew against pealim.com.
|
||||
|
||||
For each verb:
|
||||
1. Classifies it by position in the file (Pa'al/Nif'al/Pi'el/Pu'al/Hitpa'el/Hif'il/Huf'al)
|
||||
2. Searches pealim.com to find URL slug
|
||||
3. Fetches the page to confirm the binyan
|
||||
4. Flags known-problem entries and detects: not-found, binyan mismatch, suspected typos
|
||||
|
||||
Output:
|
||||
verbs_input.txt — cleaned verb list for conjugation_extract.py
|
||||
Printed validation report table
|
||||
|
||||
Usage:
|
||||
python3 validate_verb_list.py
|
||||
|
||||
After running, review verbs_input.txt (especially REVIEW-flagged entries) before
|
||||
running conjugation extraction.
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Site root; search and dictionary URLs are built from it.
PEALIM_BASE = "https://www.pealim.com"
# Passed to time.sleep() before every request.
REQUEST_DELAY = 1.5
# Passed as timeout= to every HTTP GET.
REQUEST_TIMEOUT = 15
# Input verb list and generated output, both located next to this script.
SOURCE_FILE = Path(__file__).with_name("nevo_typed_verbs_from_modern_hebrew")
OUTPUT_FILE = Path(__file__).with_name("verbs_input.txt")

# Entries with known problems, keyed by the word as it appears in the source
# file. Each value is (action, note):
#   "REVIEW" — the entry is not queried and is written out as a flagged comment
#   "3ms"    — the entry is treated as a 3ms past form
KNOWN_ISSUES: dict[str, tuple[str, str]] = {
    "לגבוה": ("REVIEW", "not a standard infinitive form; likely defective spelling or wrong word"),
    "לההרג": ("REVIEW", "extra ה; should probably be להיהרג (Nif'al of הרג)"),
    "להתלקלח": ("REVIEW", "not a real word; likely typo for להתקלקל"),
    "להקלל": ("REVIEW", "ambiguous: could be Hif'il לְהָקֵל (to ease) or Nif'al of קלל"),
    "המציא": ("3ms", "Hif'il 3ms past form, not an infinitive"),
    "קומם": ("3ms", "ambiguous: Pu'al 3ms past; Pi'el infinitive is לְקוֹמֵם"),
}

# Expected binyan for each 1-indexed entry position. range() ends are
# exclusive, so the table covers positions 1 through 70.
LINE_RANGES: list[tuple[range, str]] = [
    (range(1, 18), "Pa'al"),
    (range(18, 29), "Nif'al"),
    (range(29, 37), "Pi'el"),
    (range(37, 43), "Pu'al"),
    (range(43, 53), "Hitpa'el"),
    (range(53, 63), "Hif'il"),
    (range(63, 71), "Huf'al"),
]

# Comment line that opens each binyan's section in the output file; the
# dict order is the order in which sections are written.
SECTION_HEADERS: dict[str, str] = {
    "Pa'al": "# Pa'al (פָּעַל)",
    "Nif'al": "# Nif'al (נִפְעַל)",
    "Pi'el": "# Pi'el (פִּעֵל)",
    "Pu'al": "# Pu'al (פֻּעַל) — 3ms past, no infinitive",
    "Hitpa'el": "# Hitpa'el (הִתְפַּעֵל)",
    "Hif'il": "# Hif'il (הִפְעִיל)",
    "Huf'al": "# Huf'al (הֻפְעַל) — 3ms past, no infinitive",
}

# One HTTP session, with a fixed User-Agent, shared by every request.
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (compatible; pealim-anki/3.0)"
|
||||
|
||||
|
||||
def classify_by_line(line_num: int) -> str:
    """Map a 1-indexed entry position to its expected binyan.

    Returns the binyan of the first ``LINE_RANGES`` entry whose range contains
    ``line_num``, or ``"Unknown"`` when no range contains it.
    """
    return next(
        (binyan for span, binyan in LINE_RANGES if line_num in span),
        "Unknown",
    )
|
||||
|
||||
|
||||
def find_slug(query: str) -> str | None:
    """Search pealim.com for ``query`` and return the first dictionary slug found.

    Args:
        query: Search text; it is URL-quoted before being sent.

    Returns:
        The first ``<digits>-<name>`` slug that appears inside a
        ``/dict/<slug>/`` path in the search-results HTML, or ``None`` when the
        page contains no such path or the request fails.
    """
    url = f"{PEALIM_BASE}/search/?q={urllib.parse.quote(query)}"
    try:
        resp = session.get(url, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
    except requests.RequestException as e:
        # Network and HTTP failures are reported and treated as "no result" so
        # that one bad lookup does not abort the whole run. Only request errors
        # are caught; anything else is a programming error and should surface.
        print(f" ERROR searching {query!r}: {e}", file=sys.stderr)
        return None

    # Only the first occurrence is needed, so stop scanning at the first match.
    match = re.search(r"/dict/(\d+[-][^/?\"<>\s]+)/", resp.text)
    return match.group(1) if match else None
|
||||
|
||||
|
||||
def get_page_binyan(slug: str) -> str:
    """Fetch ``/dict/<slug>/`` and return the binyan named on the page.

    Every ``h3.page-header`` element is checked first, in document order; if
    none of them names a binyan, the ``og:description`` meta tag is checked.

    Args:
        slug: Dictionary slug as returned by the site's search.

    Returns:
        One of the seven binyan names, or ``""`` when no binyan name is found
        or the request fails.

    Only request errors are caught. A parser problem (for example the "lxml"
    backend being unavailable) propagates instead of being reported as a
    per-verb fetch error.
    """
    url = f"{PEALIM_BASE}/dict/{slug}/"
    try:
        # NOTE(review): the "hebstyle" cookie presumably selects the spelling
        # style the site renders — confirm against pealim.com.
        resp = session.get(url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f" ERROR fetching {slug}: {e}", file=sys.stderr)
        return ""

    binyan_names = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al")

    def first_binyan_in(text: str) -> str:
        """Return the first binyan name (in list order) contained in ``text``, else ""."""
        return next((bname for bname in binyan_names if bname in text), "")

    soup = BeautifulSoup(resp.text, "lxml")
    for h3 in soup.find_all("h3", class_="page-header"):
        found = first_binyan_in(h3.get_text(" ", strip=True))
        if found:
            return found

    # Fall back to the page's og:description when no header named a binyan.
    meta = soup.find("meta", {"property": "og:description"})
    if meta:
        return first_binyan_in(meta.get("content", ""))
    return ""
|
||||
|
||||
|
||||
def _resolve_status(
    issue_type: str | None,
    issue_note: str,
    is_3ms_by_position: bool,
    slug: str | None,
    page_binyan: str,
    expected_binyan: str,
) -> tuple[str, str]:
    """Return the ``(status, notes)`` pair for one verb that was queried."""
    if issue_type == "3ms" or is_3ms_by_position:
        return "3ms", issue_note or "Pu'al/Huf'al 3ms past form"
    if not slug:
        return "NOT_FOUND", "no search result on pealim.com"
    if page_binyan and expected_binyan and page_binyan != expected_binyan:
        return "MISMATCH", f"expected {expected_binyan}, page says {page_binyan}"
    if not page_binyan:
        # The slug was found but no binyan could be read from its page, so the
        # entry is kept as OK but the report says it was not confirmed.
        return "OK", "binyan not confirmed (no binyan read from page)"
    return "OK", ""


def _write_verbs_input(results: list[dict]) -> None:
    """Write the cleaned, sectioned verb list to ``OUTPUT_FILE``."""
    sections: dict[str, list[str]] = {b: [] for b in SECTION_HEADERS}
    review_lines: list[str] = []

    for r in results:
        b = r["expected_binyan"]
        if b not in sections:
            # Entries with no known section (e.g. "Unknown") are filed under
            # the first section.
            b = next(iter(sections))

        if r["status"] == "REVIEW":
            review_lines.append(f"# REVIEW: {r['word']} — {r['notes']}")
        elif r["status"] == "3ms":
            sections[b].append(f"# 3ms: {r['word']}")
        elif r["status"] in ("OK", "MISMATCH"):
            sections[b].append(r["word"])
        else:  # NOT_FOUND
            sections[b].append(f"# NOT_FOUND: {r['word']} — {r['notes']}")

    output_lines = [
        "# Verb list — validated against pealim.com from nevo_typed_verbs_from_modern_hebrew",
        "# Lines prefixed '# 3ms:' are searched by 3ms past form (Pu'al/Huf'al).",
        "# Lines prefixed '# REVIEW:' need manual correction before conjugation extraction.",
        "# Lines prefixed '# NOT_FOUND:' had no pealim.com result — check spelling.",
        "",
    ]
    for binyan, header in SECTION_HEADERS.items():
        if sections.get(binyan):
            output_lines.append(header)
            output_lines.extend(sections[binyan])
            output_lines.append("")

    if review_lines:
        output_lines.append("# ── Entries flagged for manual review ──────────────────────────────────────────")
        output_lines.extend(review_lines)
        output_lines.append("")

    OUTPUT_FILE.write_text("\n".join(output_lines), encoding="utf-8")
    print(f"\nWrote → {OUTPUT_FILE}")


def _print_report(results: list[dict]) -> None:
    """Print the per-verb table, the status totals, and a reminder if anything was flagged."""
    print("\n" + "=" * 95)
    print("VALIDATION REPORT")
    print("=" * 95)
    print(f"{'#':>4} {'Verb':<22} {'Status':<14} {'Slug':<38} {'Binyan':<12} Notes")
    print("-" * 95)
    for r in results:
        print(
            f"{r['line']:>4} {r['word']:<22} {r['status']:<14} "
            f"{r['slug'][:36]:<38} {r['page_binyan'] or '-':<12} {r['notes']}"
        )
    print("=" * 95)

    counts = {s: sum(1 for r in results if r["status"] == s)
              for s in ("OK", "3ms", "MISMATCH", "REVIEW", "NOT_FOUND")}
    print(
        f"\nSummary: {counts['OK']} OK | {counts['3ms']} 3ms-past | "
        f"{counts['MISMATCH']} MISMATCH | {counts['REVIEW']} REVIEW | {counts['NOT_FOUND']} NOT_FOUND"
    )
    print(f"Total entries: {len(results)}")

    if counts["REVIEW"] > 0 or counts["NOT_FOUND"] > 0 or counts["MISMATCH"] > 0:
        print(
            "\n⚠ Review flagged entries in verbs_input.txt before running:\n"
            " python3 conjugation_extract.py"
        )


def main() -> None:
    """Validate every verb in ``SOURCE_FILE`` against pealim.com and write the results.

    Each non-blank line of the source file is one entry; its 1-indexed
    position among the non-blank lines selects the expected binyan. Entries
    marked REVIEW in ``KNOWN_ISSUES`` are not queried. For the others the slug
    is searched and, when found, the page is fetched to read its binyan, with
    a ``REQUEST_DELAY`` sleep before each request. The cleaned list goes to
    ``OUTPUT_FILE`` and a report table is printed.

    Exits with status 1 when the source file does not exist.
    """
    if not SOURCE_FILE.exists():
        print(f"ERROR: {SOURCE_FILE} not found", file=sys.stderr)
        sys.exit(1)

    lines = [
        raw.strip()
        for raw in SOURCE_FILE.read_text(encoding="utf-8").splitlines()
        if raw.strip()
    ]
    print(f"Loaded {len(lines)} entries from {SOURCE_FILE.name}")
    print(f"Querying pealim.com (delay {REQUEST_DELAY}s per request)…\n")

    results = []

    for line_num, word in enumerate(lines, start=1):
        expected_binyan = classify_by_line(line_num)
        issue_type, issue_note = KNOWN_ISSUES.get(word, (None, ""))

        # Entries in the Pu'al and Huf'al sections are 3ms past forms.
        is_3ms_by_position = expected_binyan in ("Pu'al", "Huf'al")

        print(f"[{line_num:2d}/{len(lines)}] {word:<20}", end=" ", flush=True)

        if issue_type == "REVIEW":
            # Don't query pealim for known-bad entries
            print("REVIEW (skipping query)")
            results.append({
                "line": line_num, "word": word,
                "expected_binyan": expected_binyan,
                "slug": "", "page_binyan": "",
                "status": "REVIEW", "notes": issue_note,
                "is_3ms": is_3ms_by_position,
            })
            continue

        time.sleep(REQUEST_DELAY)
        slug = find_slug(word)

        if slug:
            time.sleep(REQUEST_DELAY)
            page_binyan = get_page_binyan(slug)
        else:
            page_binyan = ""

        status, notes = _resolve_status(
            issue_type, issue_note, is_3ms_by_position, slug, page_binyan, expected_binyan
        )

        print(f"{status:<12} slug={slug or '-':<35} binyan={page_binyan or '-'}")
        results.append({
            "line": line_num, "word": word,
            "expected_binyan": expected_binyan,
            "slug": slug or "", "page_binyan": page_binyan,
            "status": status, "notes": notes,
            "is_3ms": is_3ms_by_position or issue_type == "3ms",
        })

    _write_verbs_input(results)
    _print_report(results)


if __name__ == "__main__":
    main()
|
||||
|
|
@ -2,8 +2,6 @@
|
|||
# Lines prefixed '# 3ms:' are searched by 3ms past form (Pu'al/Huf'al).
|
||||
|
||||
# Pa'al (פָּעַל)
|
||||
# slug: להיות 454-lihyot
|
||||
להיות
|
||||
לשמור
|
||||
ללמוד
|
||||
לאסוף
|
||||
|
|
|
|||
Loading…
Reference in a new issue