Compare commits
31 commits
| Author | SHA1 | Date | |
|---|---|---|---|
| 6d2d446ed5 | |||
| f978e5f39a | |||
| 5f617af4ba | |||
| f3496998f5 | |||
| 138acb06d8 | |||
| 0a85291975 | |||
| 14d567a261 | |||
| 8b24d0fd26 | |||
| 272a2a080d | |||
| fb12f806a8 | |||
| 00fba934fb | |||
| d2a7c9d483 | |||
| d0f4aea58d | |||
| b3ea086e85 | |||
| af186e2030 | |||
| 0d92451271 | |||
| c85063ee2f | |||
| efd0745ada | |||
| 3b0f9defa9 | |||
| b8b65442cb | |||
| 04a4b52113 | |||
| f6af714e22 | |||
| b2fef5aa8a | |||
| a1d970a782 | |||
| 6c2a0f8eed | |||
| 08fb7009d8 | |||
| 2e48109d7f | |||
| 802c369365 | |||
| def2fc1aca | |||
| 5685270dfa | |||
| 34bec8f4ce |
55 changed files with 2581703 additions and 228292 deletions
26
.claude/settings.json
Normal file
26
.claude/settings.json
Normal file
|
|
@ -0,0 +1,26 @@
|
|||
{
|
||||
"hooks": {
|
||||
"PostToolUse": [
|
||||
{
|
||||
"matcher": "Edit|Write",
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "file=\"$CLAUDE_FILE_PATH\"; if [ -n \"$file\" ] && echo \"$file\" | grep -q '\\.py$'; then ruff format --quiet \"$file\" && ruff check --fix --quiet \"$file\" 2>/dev/null; fi"
|
||||
}
|
||||
]
|
||||
}
|
||||
],
|
||||
"PreToolUse": [
|
||||
{
|
||||
"matcher": "Edit|Write",
|
||||
"hooks": [
|
||||
{
|
||||
"type": "command",
|
||||
"command": "file=\"$CLAUDE_FILE_PATH\"; if echo \"$file\" | grep -qE '(legacy_guid_map\\.json|\\.env)$'; then echo 'BLOCKED: Protected file — legacy_guid_map.json and .env are read-only' >&2; exit 2; fi"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
10
.gitignore
vendored
10
.gitignore
vendored
|
|
@ -15,6 +15,7 @@ __pycache__/
|
|||
|
||||
# Large generated cache files (rebuild locally)
|
||||
data/benyehuda_index.json
|
||||
data/colliding_forms.json
|
||||
|
||||
# Audio directories (large; rebuild locally)
|
||||
data/audio/
|
||||
|
|
@ -29,6 +30,7 @@ output/
|
|||
|
||||
# Internal / private files — not for public repo
|
||||
ANKIWEB_DESCRIPTION.md
|
||||
PROJECT_NOTES.md
|
||||
PROJECTS.md
|
||||
SPRINT_LOG.md
|
||||
CLAUDE.md
|
||||
|
|
@ -46,6 +48,14 @@ data/epubs/
|
|||
|
||||
# Stray deck files
|
||||
Everything__*.apkg
|
||||
*.apkg
|
||||
|
||||
# Legacy CSV files (replaced by data/words.json)
|
||||
*.csv
|
||||
data/*.csv
|
||||
|
||||
# Dead whitelist files
|
||||
vulture_whitelist.py
|
||||
|
||||
# Release artifacts — distributed via Forgejo releases, not committed to tree
|
||||
releases/
|
||||
|
|
|
|||
|
|
@ -56,7 +56,7 @@ Fields on each card:
|
|||
| Plural form | for nouns: רבים: שֻׁלְחָנוֹת |
|
||||
| Disambiguation hint | for ambiguous Eng→Heb cards |
|
||||
|
||||
Cards are presented in **frequency order** — Anki will show you the most common words first.
|
||||
Cards are presented in **frequency order** — Anki will show you the most common words first. Note that the frequency data is collected from text written without nikkud, so words that share the same letters but differ in nikkud are assigned the same frequency.
|
||||
|
||||
### Eng→Heb disambiguation
|
||||
|
||||
|
|
|
|||
192
SCHEMA.yaml
Normal file
192
SCHEMA.yaml
Normal file
|
|
@ -0,0 +1,192 @@
|
|||
# Hebrew Flash Cards — Unified Data Schema (words.json)
|
||||
# Revised based on Nevo's feedback (2026-03-08)
|
||||
#
|
||||
# Top-level: dict keyed by unique_key
|
||||
# Unique key: nikkud word for most entries (e.g. "אָב")
|
||||
# For 146 homographs (same nikkud, different meaning): "word|pos" e.g. "אָח|Noun"
|
||||
# For same nikkud AND same pos: "word|pos|meaning" e.g. "אָח|Noun|brother"
|
||||
#
|
||||
# Hebrew text fields use nikkud/ktiv_male subfields:
|
||||
# field:
|
||||
# nikkud: "אָב" # with nikkud (hebstyle=mo)
|
||||
# ktiv_male: "אב" # plene spelling (hebstyle=vl)
|
||||
# This pattern applies to: word, singular, plural, construct forms, conjugated forms, etc.
|
||||
#
|
||||
# Pronoun notation for conjugation forms uses grammatical codes:
|
||||
# 1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp
|
||||
# (not Hebrew pronoun strings, which are ambiguous for gender in some persons)
|
||||
|
||||
entry:
|
||||
# --- Core Identity ---
|
||||
word:
|
||||
nikkud: "אָב"
|
||||
ktiv_male: "אב"
|
||||
slug: "6009-av" # Pealim URL slug (e.g. pealim.com/dict/6009-av/)
|
||||
root: ["א", "ב"] # Shoresh as list of consonant chars
|
||||
pos: "Noun" # Part of speech in English (as from pealim)
|
||||
pos_hebrew: "שֵׁם עֶצֶם" # Part of speech in Hebrew (with nikkud)
|
||||
meaning: "father" # English meaning (cleaned — no inline emoji, no Hebrew prepositions)
|
||||
meaning_raw: "father 👨" # Original meaning as scraped (may contain emoji and/or Hebrew preps)
|
||||
prep: "על" # Hebrew preposition(s) governing this word, extracted from meaning_raw (e.g. "(על)" → "על"); null if none
|
||||
audio_url: "https://..." # Pealim audio URL
|
||||
audio_file: "6009-av.mp3" # Local filename (slug-based for confusables, consonant-based otherwise)
|
||||
tags: "" # Pealim tags if any
|
||||
last_scrape_date: "2026-03-08" # ISO date of most recent pealim.com scrape for this entry
|
||||
|
||||
# --- Identity & Progress ---
|
||||
vocab_legacy_guid: "abc123..." # Vocab note GUID from legacy_guid_map.json
|
||||
# Other note GUIDs stored in their respective sections (cloze, plurals, conjugation)
|
||||
|
||||
# --- Frequency ---
|
||||
frequency: 412 # Hebrew frequency rank from hermitdave/FrequencyWords he_50k (ktiv male based)
|
||||
pseudo_frequency: null # Adjusted frequency for confusable homographs (deferred to future sprint)
|
||||
|
||||
# --- Display Enrichment ---
|
||||
emoji: "👨"
|
||||
emoji_source: "ai_vetted" # One of: ai_vetted, from_pealim, null
|
||||
emoji_visible: false # Whether to show on cards (false until emoji vetting is done)
|
||||
image: "father.jpg" # Wikipedia/Commons image filename, or null
|
||||
image_source: "wikipedia" # One of: wikipedia, commons, null
|
||||
hint: "" # Eng→Heb disambiguation hint (from refined_meanings.json)
|
||||
|
||||
# --- Shared Roots ---
|
||||
shared_roots: [] # List of unique_keys of other words sharing the same root
|
||||
# Computed by iterating all entries and grouping by root
|
||||
|
||||
# --- Confusables ---
|
||||
confusable_group: null # List of unique_keys sharing same ktiv_male, or null
|
||||
# e.g. ["אָח|Noun|brother", "אָח|Noun|fireplace"]
|
||||
|
||||
# --- Example Sentences ---
|
||||
examples:
|
||||
vetted: # AI-vetted sentences from Ben Yehuda / EPUB corpus
|
||||
- text: "הָאָב הָלַךְ לַעֲבוֹדָה"
|
||||
source: "ben_yehuda" # One of: ben_yehuda, epub_little_prince, epub_alice, ...
|
||||
vetted: true
|
||||
cloze: # Best sentence for cloze card, or null
|
||||
text: "הָאָב הָלַךְ לַעֲבוֹדָה"
|
||||
cloze_word_start: 0 # Character offset of the clozed word in text
|
||||
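cloze_word_end: 4 # End offset — enables exact extraction regardless of nikkud changes (TODO confirm whether the end is inclusive: "הָאָב" spans 5 code points, so 4 only fits an inclusive end)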
|
||||
cloze_hint: "family member"
|
||||
cloze_guid: "def456..." # GUID for the cloze note
|
||||
difficulty_score: 234 # Median frequency rank of context words (lower = easier); optional
|
||||
rejected_count: 0
|
||||
|
||||
# --- Noun-specific: Inflection Forms ---
|
||||
noun_inflection: null # null for non-nouns
|
||||
# When populated:
|
||||
# plurals_guid: "ghi789..." # GUID for plurals deck note
|
||||
# singular: # null if noun is inherently plural (e.g. bicycle/אופניים)
|
||||
# nikkud: "אָב"
|
||||
# ktiv_male: "אב"
|
||||
# plural:
|
||||
# nikkud: "אָבוֹת"
|
||||
# ktiv_male: "אבות"
|
||||
# singular_audio: "6009-av.mp3"
|
||||
# plural_audio: null # TODO: scrape from detail pages
|
||||
# construct_singular:
|
||||
# nikkud: "אֲבִי"
|
||||
# ktiv_male: "אבי"
|
||||
# construct_plural:
|
||||
# nikkud: "אֲבוֹת"
|
||||
# ktiv_male: "אבות"
|
||||
# pronominal_suffixes: # Scraped from pealim "forms with pronominal affixes" section
|
||||
# 1s:
|
||||
# nikkud: "אָבִי"
|
||||
# ktiv_male: "אבי"
|
||||
# 1p:
|
||||
# nikkud: "אָבִינוּ"
|
||||
# ktiv_male: "אבינו"
|
||||
# 2ms: ...
|
||||
# 2fs: ...
|
||||
# 2mp: ...
|
||||
# 2fp: ...
|
||||
# 3ms: ...
|
||||
# 3fs: ...
|
||||
# 3mp: ...
|
||||
# 3fp: ...
|
||||
# gender: "masculine"
|
||||
# gender_hebrew:
|
||||
# nikkud: "זָכָר"
|
||||
# ktiv_male: "זכר"
|
||||
# mishkal: "CaCaC" # English mishkal name (scraped from pealim PoS section)
|
||||
# mishkal_hebrew: "קָטָל" # Hebrew mishkal name (computed via mapping)
|
||||
|
||||
# --- Verb-specific: Conjugation Data ---
|
||||
conjugation: null # null for non-verbs
|
||||
# When populated:
|
||||
# in_conjugation_deck: true # Whether this verb is in the 71-verb conjugation deck
|
||||
# infinitive:
|
||||
# nikkud: "לִשְׁמֹר"
|
||||
# ktiv_male: "לשמור"
|
||||
# reference_form: # 3ms past (the citation form)
|
||||
# nikkud: "שָׁמַר"
|
||||
# ktiv_male: "שמר"
|
||||
# binyan: "Pa'al" # English binyan name
|
||||
# binyan_hebrew: "פָּעַל" # Hebrew binyan name (with nikkud)
|
||||
# prep: "על" # Hebrew preposition the verb takes, or null
|
||||
# active_forms:
|
||||
# - person: "1s" # Grammatical code: 1s, 1p, 2ms, 2fs, 2mp, 2fp, 3ms, 3fs, 3mp, 3fp
|
||||
# tense: "עָבָר"
|
||||
# form:
|
||||
# nikkud: "שָׁמַרְתִּי"
|
||||
# ktiv_male: "שמרתי"
|
||||
# audio_url: "https://..."
|
||||
# audio_file: null # For future use
|
||||
# hufal_pual_forms: null # Same structure as active_forms; non-null only for hif'il/pi'el verbs
|
||||
# # When non-null, binyan MUST be Hif'il or Pi'el (validated)
|
||||
# reference_form_passive: # 3ms past of the huf'al/pu'al counterpart, or null
|
||||
# nikkud: "שֻׁמַּר"
|
||||
# ktiv_male: "שומר"
|
||||
|
||||
# --- Adjective-specific ---
|
||||
adjective_inflection: null # null for non-adjectives
|
||||
# When populated:
|
||||
# ms:
|
||||
# nikkud: "גָּדוֹל"
|
||||
# ktiv_male: "גדול"
|
||||
# fs:
|
||||
# nikkud: "גְּדוֹלָה"
|
||||
# ktiv_male: "גדולה"
|
||||
# mp:
|
||||
# nikkud: "גְּדוֹלִים"
|
||||
# ktiv_male: "גדולים"
|
||||
# fp:
|
||||
# nikkud: "גְּדוֹלוֹת"
|
||||
# ktiv_male: "גדולות"
|
||||
# mishkal: "CaCaC" # English mishkal name (scraped from pealim PoS section)
|
||||
# mishkal_hebrew: "קָטָל" # Hebrew mishkal name (computed via mapping)
|
||||
|
||||
# --- Preposition-specific ---
|
||||
preposition_inflection: null # null for non-prepositions
|
||||
# When populated:
|
||||
# 1s:
|
||||
# nikkud: "שֶׁלִּי"
|
||||
# ktiv_male: "שלי"
|
||||
# 1p:
|
||||
# nikkud: "שֶׁלָּנוּ"
|
||||
# ktiv_male: "שלנו"
|
||||
# 2ms:
|
||||
# nikkud: "שֶׁלְּךָ"
|
||||
# ktiv_male: "שלך"
|
||||
# 2fs:
|
||||
# nikkud: "שֶׁלָּךְ"
|
||||
# ktiv_male: "שלך"
|
||||
# 2mp:
|
||||
# nikkud: "שֶׁלָּכֶם"
|
||||
# ktiv_male: "שלכם"
|
||||
# 2fp:
|
||||
# nikkud: "שֶׁלָּכֶן"
|
||||
# ktiv_male: "שלכן"
|
||||
# 3ms:
|
||||
# nikkud: "שֶׁלּוֹ"
|
||||
# ktiv_male: "שלו"
|
||||
# 3fs:
|
||||
# nikkud: "שֶׁלָּהּ"
|
||||
# ktiv_male: "שלה"
|
||||
# 3mp:
|
||||
# nikkud: "שֶׁלָּהֶם"
|
||||
# ktiv_male: "שלהם"
|
||||
# 3fp:
|
||||
# nikkud: "שֶׁלָּהֶן"
|
||||
# ktiv_male: "שלהן"
|
||||
1648
apkg_builder.py
1648
apkg_builder.py
File diff suppressed because it is too large
Load diff
196
benyehuda.py
196
benyehuda.py
|
|
@ -1,196 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Ben Yehuda corpus example-sentence lookup (nikkud corpus).
|
||||
|
||||
Downloads the nikkud-bearing plaintext ZIP once, indexes sentences by nikkud word form,
|
||||
then answers queries locally.
|
||||
|
||||
Exposed API:
|
||||
load(force_rebuild=False)
|
||||
get_examples(word_nikkud) -> list[str] (returns 0 or 1 examples)
|
||||
save_examples_cache()
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import zipfile
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
from helpers import strip_nikkud as _strip_nikkud
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Nikkud-bearing corpus (txt.zip instead of txt_stripped.zip)
|
||||
CORPUS_URL = "https://github.com/projectbenyehuda/public_domain_dump/releases/download/2025-10/txt.zip"
|
||||
INDEX_PATH = Path(__file__).parent / "data" / "benyehuda_index.json"
|
||||
EXAMPLES_CACHE_PATH = Path(__file__).parent / "data" / "examples_cache.json"
|
||||
REQUEST_TIMEOUT = 120
|
||||
MIN_SENTENCE_LEN = 20
|
||||
MAX_SENTENCE_LEN = 200
|
||||
MAX_INDEX_ENTRIES = 500 # cap examples kept per word in index to limit memory
|
||||
|
||||
# Module-level state
|
||||
_index: dict[str, list[str]] = {} # word (with nikkud) -> [sentence, ...]
|
||||
_examples_cache: dict[str, list[str]] = {} # word -> cached result for this run
|
||||
|
||||
|
||||
def _split_sentences(text: str) -> list[str]:
|
||||
"""
|
||||
Split text into sentences on newlines only (Hebrew sentences don't have
|
||||
mid-word period issues like English). Min 20 chars, max 200 chars.
|
||||
"""
|
||||
out = []
|
||||
for line in text.split("\n"):
|
||||
s = line.strip().strip("\"'.,;:!?")
|
||||
s = s.strip()
|
||||
if MIN_SENTENCE_LEN <= len(s) <= MAX_SENTENCE_LEN:
|
||||
out.append(s)
|
||||
return out
|
||||
|
||||
|
||||
def _build_index(corpus_zip_bytes: bytes) -> None:
    """Parse the corpus ZIP and build the word (nikkud) → sentences index.

    Every ``.txt`` member of the archive is decoded as UTF-8 (undecodable
    bytes are dropped), split into sentences with ``_split_sentences``, and
    each sentence is filed under every distinct token of length >= 2 that it
    contains, up to MAX_INDEX_ENTRIES sentences per token. The result replaces
    the module-level ``_index``.

    Args:
        corpus_zip_bytes: Raw bytes of the corpus ZIP archive.

    Raises:
        zipfile.BadZipFile: If the bytes are not a readable ZIP archive.
    """
    global _index
    _index = {}
    logger.info("Building Ben Yehuda index from nikkud corpus …")

    # A token is a run of Hebrew letters (U+05D0–U+05EA), characters from the
    # U+05B0–U+05C7 point range, and ASCII quote characters. Compiled once
    # instead of being re-parsed for every sentence.
    token_re = re.compile(r"[\u05d0-\u05ea\u05b0-\u05c7'\"]+")
    skipped = 0

    with zipfile.ZipFile(BytesIO(corpus_zip_bytes)) as zf:
        txt_files = [n for n in zf.namelist() if n.endswith(".txt")]
        logger.info(f" Corpus contains {len(txt_files)} text files")
        for fname in txt_files:
            try:
                raw = zf.read(fname).decode("utf-8", errors="ignore")
            except Exception as exc:
                # Best effort: one unreadable member must not abort the whole
                # build, but it is reported rather than silently dropped.
                skipped += 1
                logger.warning("Skipping unreadable corpus file %s: %s", fname, exc)
                continue
            for sentence in _split_sentences(raw):
                # set() so a sentence is filed once per distinct token
                for w in set(token_re.findall(sentence)):
                    if len(w) >= 2:
                        bucket = _index.setdefault(w, [])
                        if len(bucket) < MAX_INDEX_ENTRIES:
                            bucket.append(sentence)

    if skipped:
        logger.warning("Skipped %d unreadable corpus file(s)", skipped)
    logger.info(f"Index built: {len(_index)} unique word forms")
|
||||
|
||||
|
||||
def _save_index() -> None:
    """Write the in-memory index to INDEX_PATH as UTF-8 JSON.

    The parent directory is created first if it does not exist yet.
    """
    target_dir = INDEX_PATH.parent
    target_dir.mkdir(parents=True, exist_ok=True)
    with INDEX_PATH.open("w", encoding="utf-8") as fh:
        # ensure_ascii=False keeps Hebrew text readable in the file
        json.dump(_index, fh, ensure_ascii=False)
    logger.info(f"Ben Yehuda index saved → {INDEX_PATH}")
|
||||
|
||||
|
||||
def _load_index() -> None:
    """Replace the module-level index with the JSON stored at INDEX_PATH."""
    global _index
    serialized = INDEX_PATH.read_text(encoding="utf-8")
    _index = json.loads(serialized)
    logger.info(f"Ben Yehuda index loaded: {len(_index)} word forms")
|
||||
|
||||
|
||||
def load(force_rebuild: bool = False) -> None:
    """Load or build the Ben Yehuda index, downloading the corpus if needed.

    Order of operations:
      1. If an index is already in memory and no rebuild is forced, return.
      2. On a forced rebuild, delete the on-disk index and clear the examples
         cache; otherwise load the persisted examples cache if present.
      3. If an index file exists, load it and return.
      4. Otherwise download the corpus ZIP, build the index and save it.

    Args:
        force_rebuild: When True, discard the stored index and the examples
            cache and rebuild from a fresh download.

    Raises:
        requests.RequestException: If the corpus download fails (including
            ``HTTPError`` raised by ``raise_for_status``).
    """
    global _index, _examples_cache
    if _index and not force_rebuild:
        return

    if force_rebuild:
        # Delete old index and discard examples cache
        if INDEX_PATH.exists():
            INDEX_PATH.unlink()
            logger.info("Deleted old Ben Yehuda index (force rebuild)")
        _examples_cache = {}
    elif EXAMPLES_CACHE_PATH.exists():
        # Load persisted examples cache (not needed on rebuild). A damaged
        # cache file only costs recomputation, so it must not abort loading.
        try:
            with open(EXAMPLES_CACHE_PATH, encoding="utf-8") as f:
                _examples_cache = json.load(f)
        except (OSError, ValueError) as exc:
            logger.warning("Ignoring unreadable examples cache %s: %s", EXAMPLES_CACHE_PATH, exc)

    if INDEX_PATH.exists():
        try:
            _load_index()
        except (OSError, ValueError) as exc:
            # e.g. a truncated file left by an interrupted save: fall through
            # and rebuild from the corpus instead of crashing.
            logger.warning("Ben Yehuda index %s is unreadable (%s); rebuilding", INDEX_PATH, exc)
        else:
            return

    logger.info("Downloading Ben Yehuda nikkud corpus … (this may take 2-3 minutes)")
    # With stream=True the connection is only released once the body has been
    # consumed or the response closed; the context manager guarantees the
    # close even when raise_for_status() raises.
    with requests.get(CORPUS_URL, timeout=REQUEST_TIMEOUT, stream=True) as resp:
        resp.raise_for_status()
        data = resp.content
    logger.info(f"Corpus downloaded: {len(data) / 1e6:.1f} MB")

    _build_index(data)
    _save_index()
|
||||
|
||||
|
||||
def save_examples_cache() -> None:
    """Persist the per-word examples cache to EXAMPLES_CACHE_PATH as UTF-8 JSON.

    The parent directory is created first if it does not exist yet.
    """
    cache_dir = EXAMPLES_CACHE_PATH.parent
    cache_dir.mkdir(parents=True, exist_ok=True)
    with EXAMPLES_CACHE_PATH.open("w", encoding="utf-8") as fh:
        json.dump(_examples_cache, fh, ensure_ascii=False)
    logger.info(f"Examples cache saved: {len(_examples_cache)} entries → {EXAMPLES_CACHE_PATH}")
|
||||
|
||||
|
||||
def get_examples(word_nikkud: str) -> list[str]:
    """Return at most one example sentence for a word given in nikkud form.

    The index is loaded on first use. Results are memoised in the examples
    cache under the whitespace-trimmed input word.

    Candidate sentences come from the index entry for the exact nikkud form;
    if that entry is missing or empty, the first index key whose
    nikkud-stripped form equals the stripped word is used instead. Candidates
    are kept only when the stripped word occurs in the stripped sentence as a
    whole token and the sentence length lies within
    [MIN_SENTENCE_LEN, MAX_SENTENCE_LEN]. The longest survivor is returned.

    Args:
        word_nikkud: The word to look up, with nikkud.

    Returns:
        A one-element list holding the chosen sentence, or an empty list.
    """
    if not _index:
        load()

    lookup = word_nikkud.strip()
    bare = _strip_nikkud(lookup)

    if lookup in _examples_cache:
        return _examples_cache[lookup]

    # Exact nikkud form first; otherwise the first key with the same
    # consonant skeleton.
    sentences = _index.get(lookup, [])
    if not sentences and bare:
        sentences = next(
            (bucket for key, bucket in _index.items() if _strip_nikkud(key) == bare),
            sentences,
        )

    # Whole-token match on the stripped forms, so nikkud variants inside the
    # sentence still count.
    if bare:
        token_re = re.compile(r"(?<!\w)" + re.escape(bare) + r"(?!\w)")
        hits = [s for s in sentences if token_re.search(_strip_nikkud(s))]
    else:
        hits = list(sentences)

    in_range = [s for s in hits if MIN_SENTENCE_LEN <= len(s) <= MAX_SENTENCE_LEN]

    # Longest acceptable sentence wins.
    result = [max(in_range, key=len)] if in_range else []

    _examples_cache[lookup] = result
    return result
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: load the index, print the example found for a few
    # common words, then persist the examples cache.
    logging.basicConfig(format="%(levelname)s %(message)s", level=logging.INFO)
    load()
    for sample in ("שָׁלוֹם", "בַּיִת", "סֵפֶר", "מַיִם", "אַהֲבָה", "יֶלֶד"):
        found = get_examples(sample)
        print(f"\n{sample}: {len(found)} example(s)")
        for sentence in found:
            # Truncated to 100 characters to keep the output compact
            print(f" → {sentence[:100]}")
    save_examples_cache()
|
||||
110
card_preview.html
Normal file
110
card_preview.html
Normal file
|
|
@ -0,0 +1,110 @@
|
|||
<!DOCTYPE html>
|
||||
<html dir="rtl">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<style>
|
||||
body { font-family: 'Heebo', 'Arial Hebrew', sans-serif; background: #fff; max-width: 600px; margin: 20px auto; }
|
||||
.card-container { border: 1px solid #ccc; border-radius: 8px; margin: 20px 0; overflow: hidden; }
|
||||
.card-label { background: #333; color: #fff; padding: 6px 12px; font-size: 14px; font-family: sans-serif; direction: ltr; }
|
||||
.card-content { padding: 16px; text-align: center; }
|
||||
.card-content hr { border: none; border-top: 1px solid #ccc; margin: 12px 0; }
|
||||
|
||||
.hebrew { font-size: 48px; font-weight: bold; color: #222; direction: rtl; text-align: center; }
|
||||
.hebrew-sm { font-size: 28px; font-weight: normal; color: #222; direction: rtl; }
|
||||
.meaning { font-size: 28px; color: #1a1a8c; text-align: center; direction: ltr; margin: 4px 0; }
|
||||
.emoji-img { font-size: 48px; text-align: center; margin: 4px 0; }
|
||||
.divider { border-top: 1px solid #ccc; margin: 8px 0; }
|
||||
.sec-table { display: table; margin: 6px auto 0; direction: rtl; border-collapse: collapse; }
|
||||
.sec-label { display: table-row; font-size: 28px; font-weight: normal; color: #222; direction: rtl; }
|
||||
.sec-key { display: table-cell; font-size: 28px; color: #222; font-weight: bold; text-align: right; padding: 2px 0 2px 8px; white-space: nowrap; }
|
||||
.sec-val { display: table-cell; font-size: 28px; color: #222; text-align: right; padding: 2px 0; }
|
||||
.hint { font-size: 22px; color: #555; margin: 4px 0; direction: rtl; text-align: center; }
|
||||
.example { font-size: 24px; color: #222; padding: 6px 8px; direction: rtl; text-align: center; border-left: 3px solid #ccc; font-style: italic; margin: 6px auto; max-width: 90%; }
|
||||
.voice-label { font-size: 20px; color: #888; }
|
||||
|
||||
.more-toggle { text-align: center; direction: rtl; margin-top: 8px; }
|
||||
.more-header {
|
||||
display: inline-block; font-size: 18px; color: #555; cursor: pointer; list-style: none;
|
||||
border: 1px solid #ccc; border-radius: 16px; padding: 4px 16px; margin: 4px 0; background: #f8f8f8;
|
||||
}
|
||||
.more-header::-webkit-details-marker { display: none; }
|
||||
.more-header::before { content: "○ "; font-size: 14px; }
|
||||
details[open] > .more-header::before { content: "● "; }
|
||||
|
||||
.related-header { font-size: 22px; color: #555; text-align: center; margin: 4px 0; }
|
||||
.rw-word { display: table-cell; font-size: 28px; color: #222; font-weight: normal; text-align: right; padding: 2px 0 2px 8px; white-space: nowrap; }
|
||||
.rw-meaning { display: table-cell; font-size: 24px; color: #555; text-align: left; direction: ltr; padding: 2px 0; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<h2 style="font-family:sans-serif;direction:ltr;">Vocab: English → Hebrew (BACK) — collapsed</h2>
|
||||
<div class="card-container">
|
||||
<div class="card-label">English → Hebrew — Back (default: collapsed)</div>
|
||||
<div class="card-content">
|
||||
|
||||
<div class="meaning">time (occasion), time round; once (when used as an adverb)</div>
|
||||
<div class="emoji-img">📍</div>
|
||||
<div class="divider"></div>
|
||||
<div class="hebrew">פַּעַם</div>
|
||||
|
||||
<details class="more-toggle"><summary class="more-header">מידע נוסף</summary>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span><span class="sec-val">פעם</span></div>
|
||||
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">פ.ע.ם</span></div>
|
||||
<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span><span class="sec-val">שֵׁם עֶצֶם, נְקֵבָה</span></div>
|
||||
<div class="sec-label"><span class="sec-key">רַבִּים:</span><span class="sec-val">פְּעָמִים</span></div>
|
||||
</div>
|
||||
<div class="divider" style="margin:6px 0;"></div>
|
||||
<div class="related-header">מִילִים קְשׁוּרוֹת</div>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="rw-word">פַּעְמַיִם</span><span class="rw-meaning">twice, two times</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְפַעֵם</span><span class="rw-meaning">to surge (feeling, emotion)</span></div>
|
||||
<div class="sec-label"><span class="rw-word">פַּעֲמוֹן</span><span class="rw-meaning">bell</span></div>
|
||||
<div class="sec-label"><span class="rw-word">פְּעִימָה</span><span class="rw-meaning">heartbeat; beat; stroke (technolo…</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לִפְעֹם</span><span class="rw-meaning">to beat, to pulse, to throb</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהִתְפַּעֵם</span><span class="rw-meaning">to be excited (emotionally)</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהַפְעִים</span><span class="rw-meaning">to excite, to agitate (lit.)</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהִפָּעֵם</span><span class="rw-meaning">to be excited, to be thrilled</span></div>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h2 style="font-family:sans-serif;direction:ltr;">Same card — EXPANDED</h2>
|
||||
<div class="card-container">
|
||||
<div class="card-label">English → Hebrew — Back (expanded)</div>
|
||||
<div class="card-content">
|
||||
|
||||
<div class="meaning">time (occasion), time round; once (when used as an adverb)</div>
|
||||
<div class="emoji-img">📍</div>
|
||||
<div class="divider"></div>
|
||||
<div class="hebrew">פַּעַם</div>
|
||||
|
||||
<details class="more-toggle" open><summary class="more-header">מידע נוסף</summary>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="sec-key">לְלֹא נִיקּוּד:</span><span class="sec-val">פעם</span></div>
|
||||
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">פ.ע.ם</span></div>
|
||||
<div class="sec-label"><span class="sec-key">חֵלֶק דִּיבּוּר:</span><span class="sec-val">שֵׁם עֶצֶם, נְקֵבָה</span></div>
|
||||
<div class="sec-label"><span class="sec-key">רַבִּים:</span><span class="sec-val">פְּעָמִים</span></div>
|
||||
</div>
|
||||
<div class="divider" style="margin:6px 0;"></div>
|
||||
<div class="related-header">מִילִים קְשׁוּרוֹת</div>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="rw-word">פַּעְמַיִם</span><span class="rw-meaning">twice, two times</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְפַעֵם</span><span class="rw-meaning">to surge (feeling, emotion)</span></div>
|
||||
<div class="sec-label"><span class="rw-word">פַּעֲמוֹן</span><span class="rw-meaning">bell</span></div>
|
||||
<div class="sec-label"><span class="rw-word">פְּעִימָה</span><span class="rw-meaning">heartbeat; beat; stroke (technolo…</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לִפְעֹם</span><span class="rw-meaning">to beat, to pulse, to throb</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהִתְפַּעֵם</span><span class="rw-meaning">to be excited (emotionally)</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהַפְעִים</span><span class="rw-meaning">to excite, to agitate (lit.)</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהִפָּעֵם</span><span class="rw-meaning">to be excited, to be thrilled</span></div>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
114
card_preview_conj.html
Normal file
114
card_preview_conj.html
Normal file
|
|
@ -0,0 +1,114 @@
|
|||
<!DOCTYPE html>
|
||||
<html dir="rtl">
|
||||
<head>
|
||||
<meta charset="utf-8">
|
||||
<style>
|
||||
body { font-family: 'Heebo', 'Arial Hebrew', sans-serif; background: #fff; max-width: 600px; margin: 20px auto; }
|
||||
.card-container { border: 1px solid #ccc; border-radius: 8px; margin: 20px 0; overflow: hidden; }
|
||||
.card-label { background: #333; color: #fff; padding: 6px 12px; font-size: 14px; font-family: sans-serif; direction: ltr; }
|
||||
.card-content { padding: 16px; text-align: center; }
|
||||
.card-content hr { border: none; border-top: 1px solid #ccc; margin: 12px 0; }
|
||||
.hebrew { font-size: 48px; font-weight: bold; color: #222; direction: rtl; text-align: center; }
|
||||
.hebrew-sm { font-size: 28px; font-weight: normal; color: #222; direction: rtl; }
|
||||
.meaning { font-size: 28px; color: #1a1a8c; text-align: center; direction: ltr; margin: 4px 0; }
|
||||
.hint { font-size: 22px; color: #555; margin: 4px 0; direction: rtl; text-align: center; }
|
||||
.divider { border-top: 1px solid #ccc; margin: 8px 0; }
|
||||
.sec-table { display: table; margin: 6px auto 0; direction: rtl; border-collapse: collapse; }
|
||||
.sec-label { display: table-row; font-size: 28px; font-weight: normal; color: #222; direction: rtl; }
|
||||
.sec-key { display: table-cell; font-size: 28px; color: #222; font-weight: bold; text-align: right; padding: 2px 0 2px 8px; white-space: nowrap; }
|
||||
.sec-val { display: table-cell; font-size: 28px; color: #222; text-align: right; padding: 2px 0; }
|
||||
.voice-label { font-size: 20px; color: #888; }
|
||||
|
||||
.more-toggle { text-align: center; direction: rtl; margin-top: 8px; }
|
||||
.more-header {
|
||||
display: inline-block; font-size: 18px; color: #555; cursor: pointer; list-style: none;
|
||||
border: 1px solid #ccc; border-radius: 16px; padding: 4px 16px; margin: 4px 0; background: #f8f8f8;
|
||||
}
|
||||
.more-header::-webkit-details-marker { display: none; }
|
||||
.more-header::before { content: "○ "; font-size: 14px; }
|
||||
details[open] > .more-header::before { content: "● "; }
|
||||
|
||||
.related-header { font-size: 22px; color: #555; text-align: center; margin: 4px 0; }
|
||||
.rw-word { display: table-cell; font-size: 28px; color: #222; font-weight: normal; text-align: right; padding: 2px 0 2px 8px; white-space: nowrap; }
|
||||
.rw-meaning { display: table-cell; font-size: 24px; color: #555; text-align: left; direction: ltr; padding: 2px 0; }
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
|
||||
<h2 style="font-family:sans-serif;direction:ltr;">Conjugation Card — FRONT</h2>
|
||||
<div class="card-container">
|
||||
<div class="card-label">Front</div>
|
||||
<div class="card-content">
|
||||
|
||||
<div class="hint">אֵיךְ אוֹמְרִים</div>
|
||||
<div class="hebrew">אַתָּה</div>
|
||||
<div class="hebrew" style="color:#1a1a8c;">לִשְׁמֹר <span class="hebrew-sm">(על)</span></div>
|
||||
<div class="hebrew">בַּהוֹוֶה</div>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h2 style="font-family:sans-serif;direction:ltr;">Conjugation Card — BACK (collapsed)</h2>
|
||||
<div class="card-container">
|
||||
<div class="card-label">Back — default state</div>
|
||||
<div class="card-content">
|
||||
|
||||
<div class="hint">אֵיךְ אוֹמְרִים</div>
|
||||
<div class="hebrew">אַתָּה</div>
|
||||
<div class="hebrew" style="color:#1a1a8c;">לִשְׁמֹר <span class="hebrew-sm">(על)</span></div>
|
||||
<div class="hebrew">בַּהוֹוֶה</div>
|
||||
<hr>
|
||||
<div class="hebrew">שׁוֹמֵר (על)</div>
|
||||
|
||||
<details class="more-toggle"><summary class="more-header">מידע נוסף</summary>
|
||||
<div class="sec-label" style="text-align:center;display:block;">to guard; to keep, to maintain</div>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">שׁ.מ.ר</span></div>
|
||||
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span><span class="sec-val">פָּעַל</span></div>
|
||||
</div>
|
||||
<div class="divider" style="margin:6px 0;"></div>
|
||||
<div class="related-header">מִילִים קְשׁוּרוֹת</div>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="rw-word">מִשְׁמָר</span><span class="rw-meaning">guard, watch; shift</span></div>
|
||||
<div class="sec-label"><span class="rw-word">שׁוֹמֵר</span><span class="rw-meaning">guard, watchman</span></div>
|
||||
<div class="sec-label"><span class="rw-word">שְׁמִירָה</span><span class="rw-meaning">guarding, watching</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהִשָּׁמֵר</span><span class="rw-meaning">to beware, to be careful</span></div>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<h2 style="font-family:sans-serif;direction:ltr;">Conjugation Card — BACK (expanded)</h2>
|
||||
<div class="card-container">
|
||||
<div class="card-label">Back — expanded</div>
|
||||
<div class="card-content">
|
||||
|
||||
<div class="hint">אֵיךְ אוֹמְרִים</div>
|
||||
<div class="hebrew">אַתָּה</div>
|
||||
<div class="hebrew" style="color:#1a1a8c;">לִשְׁמֹר <span class="hebrew-sm">(על)</span></div>
|
||||
<div class="hebrew">בַּהוֹוֶה</div>
|
||||
<hr>
|
||||
<div class="hebrew">שׁוֹמֵר (על)</div>
|
||||
|
||||
<details class="more-toggle" open><summary class="more-header">מידע נוסף</summary>
|
||||
<div class="sec-label" style="text-align:center;display:block;">to guard; to keep, to maintain</div>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="sec-key">שֹׁרֶשׁ:</span><span class="sec-val">שׁ.מ.ר</span></div>
|
||||
<div class="sec-label"><span class="sec-key">בִּנְיָן:</span><span class="sec-val">פָּעַל</span></div>
|
||||
</div>
|
||||
<div class="divider" style="margin:6px 0;"></div>
|
||||
<div class="related-header">מִילִים קְשׁוּרוֹת</div>
|
||||
<div class="sec-table">
|
||||
<div class="sec-label"><span class="rw-word">מִשְׁמָר</span><span class="rw-meaning">guard, watch; shift</span></div>
|
||||
<div class="sec-label"><span class="rw-word">שׁוֹמֵר</span><span class="rw-meaning">guard, watchman</span></div>
|
||||
<div class="sec-label"><span class="rw-word">שְׁמִירָה</span><span class="rw-meaning">guarding, watching</span></div>
|
||||
<div class="sec-label"><span class="rw-word">לְהִשָּׁמֵר</span><span class="rw-meaning">to beware, to be careful</span></div>
|
||||
</div>
|
||||
</details>
|
||||
|
||||
</div>
|
||||
</div>
|
||||
|
||||
</body>
|
||||
</html>
|
||||
|
|
@ -1,690 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract Hebrew verb conjugations from pealim.com.
|
||||
Input: verbs_input.txt (one Hebrew infinitive per line;
|
||||
lines starting with '# 3ms:' search by 3ms past form for Pu'al/Huf'al)
|
||||
Output: data/conjugations.json
|
||||
|
||||
For each verb:
|
||||
1. Search pealim.com/search/?q=<verb> to find URL slug
|
||||
2. Fetch /dict/<slug>/ with hebstyle=mo cookie
|
||||
3. Parse conjugation table by row labels
|
||||
4. Capture audio URLs per form
|
||||
5. Parse passive (Pu'al/Huf'al) forms from the same page
|
||||
|
||||
Resume-safe: verbs already in conjugations.json are skipped.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
import urllib.parse
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
from helpers import strip_nikkud as _strip_nikkud
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Root URL that every search and dictionary request is built from
PEALIM_BASE = "https://www.pealim.com"
# Pause passed to time.sleep() between consecutive requests
REQUEST_DELAY = 1.5
# Timeout handed to every session.get() call
REQUEST_TIMEOUT = 15
# Input list of verbs and the JSON cache the results are written to
VERBS_INPUT = Path(__file__).parent / "verbs_input.txt"
CONJUGATIONS_PATH = Path(__file__).parent / "data" / "conjugations.json"
# First dictionary CSV that exists on disk; when neither candidate exists the
# first candidate's path is used anyway (callers check .exists() themselves)
DICT_CSV = next(
    filter(
        Path.exists,
        (
            Path(__file__).parent / "data" / "hebrew_dict_for_anki.csv",
            Path(__file__).parent / "data" / "pealim_dict_for_anki.csv",
        ),
    ),
    Path(__file__).parent / "data" / "hebrew_dict_for_anki.csv",
)
|
||||
|
||||
# Pronoun shown on the card front for each form key.
# Present-tense forms and the infinitive carry no pronoun (empty string).
PRONOUN_LABELS = {
    # Present
    "present_ms": "",
    "present_fs": "",
    "present_mp": "",
    "present_fp": "",
    # Past
    "past_1s": "אֲנִי",
    "past_1p": "אֲנַחְנוּ",
    "past_2ms": "אַתָּה",
    "past_2fs": "אַתְּ",
    "past_2mp": "אַתֶּם",
    "past_2fp": "אַתֶּן",
    "past_3ms": "הוּא",
    "past_3fs": "הִיא",
    "past_3p": "הֵם / הֵן",
    # Future
    "future_1s": "אֲנִי",
    "future_1p": "אֲנַחְנוּ",
    "future_2ms": "אַתָּה",
    "future_2fs": "אַתְּ",
    "future_2mp": "אַתֶּם",
    "future_2fp": "אַתֶּן",
    "future_3ms": "הוּא",
    "future_3fs": "הִיא",
    "future_3mp": "הֵם",
    "future_3fp": "הֵן",
    # Imperative
    "imperative_ms": "אַתָּה",
    "imperative_fs": "אַתְּ",
    "imperative_mp": "אַתֶּם",
    "imperative_fp": "אַתֶּן",
    # Infinitive
    "infinitive": "",
}
|
||||
|
||||
# Hebrew tense/mood name shown on the card front, keyed by form key.
# Every form key of a tense shares one label, so the table is assembled per tense.
TENSE_DESCRIPTION = {
    **dict.fromkeys(
        ("present_ms", "present_fs", "present_mp", "present_fp"),
        "הוֹוֶה",
    ),
    **dict.fromkeys(
        (
            "past_1s",
            "past_1p",
            "past_2ms",
            "past_2fs",
            "past_2mp",
            "past_2fp",
            "past_3ms",
            "past_3fs",
            "past_3p",
        ),
        "עָבָר",
    ),
    **dict.fromkeys(
        (
            "future_1s",
            "future_1p",
            "future_2ms",
            "future_2fs",
            "future_2mp",
            "future_2fp",
            "future_3ms",
            "future_3fs",
            "future_3mp",
            "future_3fp",
        ),
        "עָתִיד",
    ),
    **dict.fromkeys(
        ("imperative_ms", "imperative_fs", "imperative_mp", "imperative_fp"),
        "צִוּוּי",
    ),
    "infinitive": "מְקוֹר",
}
|
||||
|
||||
# Canonical binyan names, in the order they are searched for in page text
BINYAN_NAMES: tuple[str, ...] = (
    "Pa'al",
    "Nif'al",
    "Pi'el",
    "Pu'al",
    "Hitpa'el",
    "Hif'il",
    "Huf'al",
)
|
||||
|
||||
# One HTTP session shared by every request in this module, with a fixed User-Agent
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (compatible; pealim-anki/2.0)"
|
||||
|
||||
|
||||
|
||||
def _build_pos_lookup() -> dict[str, str]:
    """Build a ``nikkud-stripped word -> part-of-speech text`` lookup from ``DICT_CSV``.

    The CSV is read as a ``;``-separated file first; if that parse fails or
    yields fewer than three data columns it is re-read as a comma-separated
    file. Each row contributes ``strip_nikkud(Word) -> Part of speech`` (the
    ``Part of Speech`` spelling of the column is accepted as a fallback).

    Rows whose word or part-of-speech cell is empty are skipped. Emptiness is
    detected with ``pd.isna`` rather than by searching for the text "nan", so
    a part of speech that merely contains those letters is kept and a missing
    word never ends up stored under the key "nan".

    Returns:
        The lookup dict. It is empty when the CSV does not exist and holds
        whatever was collected so far when loading fails (the failure is
        logged as a warning); the lookup is best-effort.
    """
    lookup: dict[str, str] = {}
    if not DICT_CSV.exists():
        return lookup

    try:
        # Imported lazily so the module still imports when pandas is absent
        import pandas as pd

        try:
            df = pd.read_csv(DICT_CSV, sep=";", index_col=0)
            if df.shape[1] < 3:
                raise ValueError("too few columns")
        except (ValueError, pd.errors.ParserError):
            df = pd.read_csv(DICT_CSV, index_col=0)

        for _, row in df.iterrows():
            raw_word = row.get("Word")
            raw_pos = row.get("Part of speech", row.get("Part of Speech"))
            # Empty cells arrive as NaN and absent columns as None
            if pd.isna(raw_word) or pd.isna(raw_pos):
                continue
            word = str(raw_word).strip()
            pos = str(raw_pos).strip()
            if word and pos:
                lookup[_strip_nikkud(word)] = pos
    except Exception as e:
        # Best-effort: callers fall back to other binyan sources, but make the
        # failure visible instead of hiding it at debug level
        logger.warning(f"Could not load PoS lookup: {e}")

    return lookup
|
||||
|
||||
|
||||
# Cache PoS lookup (built once)
|
||||
_pos_lookup: dict[str, str] | None = None
|
||||
|
||||
|
||||
def _get_pos_lookup() -> dict[str, str]:
|
||||
global _pos_lookup
|
||||
if _pos_lookup is None:
|
||||
_pos_lookup = _build_pos_lookup()
|
||||
return _pos_lookup
|
||||
|
||||
|
||||
def _binyan_from_pos(word: str) -> str:
    """Return the canonical binyan name found in *word*'s part-of-speech text.

    The word is stripped of nikkud and looked up in the PoS lookup; the
    lower-cased PoS text is then searched for each binyan spelling (with or
    without the apostrophe), in the order listed below.

    Returns:
        The canonical name (e.g. ``"Pi'el"``), or ``""`` when the word is not
        in the lookup or its PoS text names no binyan.
    """
    pos_text = _get_pos_lookup().get(_strip_nikkud(word), "")
    if not pos_text:
        return ""

    haystack = pos_text.lower()
    # Canonical name -> lower-case spellings accepted in the PoS text
    spellings = {
        "Pa'al": ("pa'al", "paal"),
        "Nif'al": ("nif'al", "nifal"),
        "Pi'el": ("pi'el", "piel"),
        "Pu'al": ("pu'al", "pual"),
        "Hitpa'el": ("hitpa'el", "hitpael"),
        "Hif'il": ("hif'il", "hifil"),
        "Huf'al": ("huf'al", "hufal"),
    }
    for canonical, variants in spellings.items():
        for variant in variants:
            if variant in haystack:
                return canonical
    return ""
|
||||
|
||||
|
||||
def _find_slug(query: str) -> str | None:
    """Search pealim.com for *query* and return the first dictionary slug found.

    Fetches ``/search/?q=<query>`` (query URL-quoted) and scans the response
    body for the first ``/dict/<digits>-<name>/`` link.

    Returns:
        The ``<digits>-<name>`` slug, or ``None`` when the page contains no
        such link or the request fails (the error is logged).
    """
    search_url = f"{PEALIM_BASE}/search/?q={urllib.parse.quote(query)}"
    try:
        response = session.get(search_url, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        hit = re.search(r"/dict/(\d+[-][^/?\"<>\s]+)/", response.text)
        if hit is not None:
            found = hit.group(1)
            logger.info(f" Slug: {found}")
            return found
    except Exception as e:
        logger.error(f" Error searching for '{query}': {e}")
    return None
|
||||
|
||||
|
||||
def _is_passive_binyan(binyan: str) -> bool:
    """Tell whether *binyan* names Pu'al or Huf'al (Hebrew or transliterated)."""
    for marker in ("פֻּעַל", "הֻפְעַל", "Pu'al", "Huf'al"):
        if marker in binyan:
            return True
    return False
|
||||
|
||||
|
||||
def _get_menukad(cell) -> tuple[str, str]:
    """Pull the Hebrew form and its audio URL out of one table cell.

    The audio URL is the ``data-audio`` attribute of the first ``<span>``
    whose class contains ``audio-play`` (``""`` when there is none). The form
    is the text of the first ``<span class="menukad">``; without such a span
    the whole cell text is used, but only if it contains a Hebrew letter.

    Returns:
        ``(form_text, audio_url)``; ``form_text`` is ``""`` for cells with no
        Hebrew content.
    """
    player = cell.find("span", class_=lambda c: c and "audio-play" in c)
    audio_url = player.get("data-audio", "") if player else ""

    vocalised = cell.find("span", class_="menukad")
    if vocalised:
        return vocalised.get_text(strip=True), audio_url

    plain = cell.get_text(strip=True)
    form = plain if re.search(r"[\u05d0-\u05ea]", plain) else ""
    return form, audio_url
|
||||
|
||||
|
||||
def _parse_table(soup: BeautifulSoup, passive: bool = False, table_el=None) -> dict[str, dict]:
    """Parse one pealim conjugation table into ``form_key -> {form, audio_url}``.

    Table selection:
      * ``passive=True`` - the first ``conjugation-table`` that follows the
        first ``<h3>`` whose text contains "passive";
      * otherwise ``table_el`` when given;
      * otherwise the first ``conjugation-table`` on the page.

    Rows are located by the first row whose text mentions each tense name
    ("present", "past", "future", "imperative", "infinitive"). Past and
    future also read the two rows that follow their label row.

    Returns:
        The parsed forms, or ``{}`` when no suitable table is found or the
        table has fewer than nine rows.
    """
    if passive:
        heading = next(
            (h3 for h3 in soup.find_all("h3") if "passive" in h3.get_text(strip=True).lower()),
            None,
        )
        if not heading:
            return {}
        table = next(
            (
                node
                for node in heading.find_all_next()
                if node.name == "table" and "conjugation-table" in node.get("class", [])
            ),
            None,
        )
        if not table:
            return {}
    elif table_el is not None:
        table = table_el
    else:
        table = soup.find("table", class_="conjugation-table")

    if not table:
        return {}

    rows = table.find_all("tr")
    if len(rows) < 9:
        return {}

    forms: dict[str, dict] = {}

    def hebrew_cells(row_idx: int) -> list[tuple[str, str]]:
        """Return (form, audio_url) for each Hebrew cell, repeated per colspan."""
        collected: list[tuple[str, str]] = []
        for cell in rows[row_idx].find_all(["th", "td"]):
            text, audio = _get_menukad(cell)
            width = int(cell.get("colspan", 1))
            if text and re.search(r"[\u05d0-\u05ea]", text):
                collected.extend([(text, audio)] * width)
        return collected

    def unique_forms(pairs: list[tuple[str, str]]) -> list[tuple[str, str]]:
        """Keep only the first pair seen for each form text, in original order."""
        first_seen: dict[str, tuple[str, str]] = {}
        for pair in pairs:
            first_seen.setdefault(pair[0], pair)
        return list(first_seen.values())

    def assign(keys: tuple[str, ...], pairs: list[tuple[str, str]]) -> None:
        """Store pairs under keys positionally; surplus keys or pairs are ignored."""
        for key, (form, audio) in zip(keys, pairs, strict=False):
            if form:
                forms[key] = {"form": form, "audio_url": audio}

    # First row mentioning each tense; a row is claimed by at most one tense,
    # namely the first still-unassigned tense whose name appears in its text
    tense_names = ("present", "past", "future", "imperative", "infinitive")
    row_of: dict[str, int] = {}
    for idx, tr in enumerate(rows):
        row_text = tr.get_text(" ", strip=True).lower()
        for tense in tense_names:
            if tense in row_text and tense not in row_of:
                row_of[tense] = idx
                break

    # tense -> [(row offset from the label row, collapse repeated forms?, keys)]
    # Rows where one form spans several persons are collapsed before assignment.
    layout = {
        "present": [
            (0, False, ("present_ms", "present_fs", "present_mp", "present_fp")),
        ],
        "past": [
            (0, True, ("past_1s", "past_1p")),
            (1, False, ("past_2ms", "past_2fs", "past_2mp", "past_2fp")),
            (2, True, ("past_3ms", "past_3fs", "past_3p")),
        ],
        "future": [
            (0, True, ("future_1s", "future_1p")),
            (1, False, ("future_2ms", "future_2fs", "future_2mp", "future_2fp")),
            (2, False, ("future_3ms", "future_3fs", "future_3mp", "future_3fp")),
        ],
        "imperative": [
            (0, False, ("imperative_ms", "imperative_fs", "imperative_mp", "imperative_fp")),
        ],
        "infinitive": [
            (0, False, ("infinitive",)),
        ],
    }
    for tense, segments in layout.items():
        base = row_of.get(tense)
        if base is None:
            continue
        for offset, collapse, keys in segments:
            idx = base + offset
            if idx >= len(rows):
                continue
            pairs = hebrew_cells(idx)
            assign(keys, unique_forms(pairs) if collapse else pairs)

    return forms
|
||||
|
||||
|
||||
def _extract_binyan_from_page(soup: BeautifulSoup) -> str:
    """Return the first binyan name mentioned in the page's headers.

    Looks through every ``<h3 class="page-header">`` in document order, then
    falls back to the ``og:description`` meta tag. Names are tried in
    ``BINYAN_NAMES`` order. Returns ``""`` when none is found.
    """

    def first_binyan_in(text: str) -> str:
        """Return the first BINYAN_NAMES entry contained in *text*, else ''."""
        return next((name for name in BINYAN_NAMES if name in text), "")

    for header in soup.find_all("h3", class_="page-header"):
        found = first_binyan_in(header.get_text(" ", strip=True))
        if found:
            return found

    og_description = soup.find("meta", {"property": "og:description"})
    if og_description:
        return first_binyan_in(og_description.get("content", ""))
    return ""
|
||||
|
||||
|
||||
def _extract_passive_binyan_from_page(soup: BeautifulSoup) -> str:
    """Return "Pu'al" or "Huf'al" as named by a "passive" section heading.

    Every ``<h3>`` whose text contains "passive" is checked: first its full
    text, then the text of its ``<span class="small">`` child, if any.
    Returns ``""`` when no such heading names a passive binyan.
    """
    passive_names = ("Pu'al", "Huf'al")
    for heading in soup.find_all("h3"):
        heading_text = heading.get_text(" ", strip=True)
        if "passive" not in heading_text.lower():
            continue

        for name in passive_names:
            if name in heading_text:
                return name

        # The binyan may sit in a nested <span class="small"> inside the heading
        small = heading.find("span", class_="small")
        if not small:
            continue
        small_text = small.get_text(strip=True)
        for name in passive_names:
            if name in small_text:
                return name
    return ""
|
||||
|
||||
|
||||
def _extract_conjugations(
    slug: str, search_term: str, is_3ms_search: bool = False, binyan_hint: str = ""
) -> dict | None:
    """Fetch ``/dict/<slug>/`` and build the conjugation record for one verb.

    Args:
        slug: Dictionary page slug on pealim.com.
        search_term: The verb as written in the input file; stored as the
            record's ``infinitive`` and used as the reference-form fallback.
        is_3ms_search: When true the PoS lookup is skipped for the binyan.
        binyan_hint: Binyan used when neither the PoS lookup nor the page
            yields one.

    Returns:
        A dict with ``infinitive``, ``slug``, ``root``, ``binyan``,
        ``meaning``, ``is_passive``, ``reference_form`` and ``forms`` (plus
        ``alternate_forms`` when a second active table differs from the
        first), or ``None`` when the request fails or no forms are parsed.
    """
    page_url = f"{PEALIM_BASE}/dict/{slug}/"
    try:
        response = session.get(page_url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except Exception as e:
        logger.error(f" Error fetching {page_url}: {e}")
        return None

    soup = BeautifulSoup(response.text, "lxml")

    # English translation lives in <div class="lead">
    lead = soup.find("div", class_="lead")
    meaning = lead.get_text(strip=True) if lead else ""

    # Root: first menukad span holding Hebrew letters and a hyphen
    root = next(
        (
            text
            for text in (node.get_text(strip=True) for node in soup.find_all("span", class_="menukad"))
            if text and re.search(r"[\u05d0-\u05ea]", text) and "-" in text
        ),
        "",
    )

    # Binyan sources in priority order: PoS lookup, page header, section hint
    binyan = (
        ("" if is_3ms_search else _binyan_from_pos(search_term))
        or _extract_binyan_from_page(soup)
        or binyan_hint
    )

    forms_raw = _parse_table(soup, passive=False)
    if not forms_raw:
        logger.warning(f" No forms found for {slug}")
        return None

    is_passive = _is_passive_binyan(binyan)

    # Passive binyanim are referenced by their 3ms past form, others by the infinitive
    reference_key = "past_3ms" if is_passive else "infinitive"
    reference_form = forms_raw.get(reference_key, {}).get("form", "") or search_term

    result = {
        "infinitive": search_term,
        "slug": slug,
        "root": root,
        "binyan": binyan,
        "meaning": meaning,
        "is_passive": is_passive,
        "reference_form": reference_form,
        "forms": {
            key: {
                "form": entry["form"],
                "audio_url": entry.get("audio_url", ""),
                "pronoun": PRONOUN_LABELS[key],
                "tense": TENSE_DESCRIPTION.get(key, ""),
            }
            for key, entry in forms_raw.items()
            if key in PRONOUN_LABELS
        },
    }

    # A second active table is an alternate paradigm (e.g. להתגלות). Tables that
    # follow the "Passive" heading are excluded from the active set.
    passive_heading = next(
        (h for h in soup.find_all("h3") if "passive" in h.get_text(strip=True).lower()),
        None,
    )
    passive_tables = passive_heading.find_all_next("table", class_="conjugation-table") if passive_heading else []
    excluded_ids = set(map(id, passive_tables))
    active_tables = [t for t in soup.find_all("table", class_="conjugation-table") if id(t) not in excluded_ids]

    if len(active_tables) >= 2:
        alt_raw = _parse_table(soup, passive=False, table_el=active_tables[1])
        # Keep only the alternate forms that differ from the primary table
        alternate_forms = {
            key: entry["form"]
            for key, entry in alt_raw.items()
            if key in PRONOUN_LABELS
            and entry["form"]
            and entry["form"] != forms_raw.get(key, {}).get("form", "")
        }
        if alternate_forms:
            result["alternate_forms"] = alternate_forms
            logger.info(f" Found {len(alternate_forms)} alternate forms for {search_term}")

    logger.info(f" Extracted {len(result['forms'])} forms for {search_term}")
    return result
|
||||
|
||||
|
||||
def _load_conjugations() -> dict:
    """Return the cached conjugations from ``CONJUGATIONS_PATH``, or ``{}`` if the file is absent."""
    if not CONJUGATIONS_PATH.exists():
        return {}
    with open(CONJUGATIONS_PATH, encoding="utf-8") as cache_file:
        return json.load(cache_file)
|
||||
|
||||
|
||||
def _save_conjugations(data: dict) -> None:
    """Write *data* to ``CONJUGATIONS_PATH`` as indented UTF-8 JSON.

    The cache is rewritten after every verb and is what makes a run
    resumable, so it must never be left half-written. The JSON is therefore
    dumped to a sibling ``<name>.tmp`` file first and only then moved over the
    real path with ``Path.replace``; an interruption or serialisation error
    leaves the previous cache untouched.

    Args:
        data: Mapping of verb -> conjugation record (or ``None``); must be
            JSON-serialisable.

    Raises:
        OSError: If the directory or file cannot be written.
        TypeError: If *data* contains values ``json.dump`` cannot serialise.
    """
    CONJUGATIONS_PATH.parent.mkdir(parents=True, exist_ok=True)
    tmp_path = CONJUGATIONS_PATH.with_name(CONJUGATIONS_PATH.name + ".tmp")
    try:
        with open(tmp_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)
        tmp_path.replace(CONJUGATIONS_PATH)
    finally:
        # Gone already after a successful replace; removes the leftover on failure
        tmp_path.unlink(missing_ok=True)
|
||||
|
||||
|
||||
def _extract_passive_from_active_slug(active_slug: str, search_term: str, binyan_hint: str = "") -> dict | None:
    """Fetch an active verb's page and build a record from its passive table only.

    Used for Pu'al/Huf'al entries given by their 3ms form, where the passive
    conjugation is listed on the page of the matching active verb.

    Args:
        active_slug: Slug of the active verb's dictionary page.
        search_term: The passive form from the input file; stored as the
            record's ``infinitive``.
        binyan_hint: Binyan used when neither the passive heading nor the
            active binyan determines one.

    Returns:
        A record shaped like the one from ``_extract_conjugations`` with
        ``is_passive`` set to ``True`` and ``reference_form`` set to the
        active infinitive (or *search_term*), or ``None`` when the request
        fails or the page has no passive forms.
    """
    page_url = f"{PEALIM_BASE}/dict/{active_slug}/"
    try:
        response = session.get(page_url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
    except Exception as e:
        logger.error(f" Error fetching {page_url}: {e}")
        return None

    soup = BeautifulSoup(response.text, "lxml")

    # Meaning of the active verb, kept as context for the passive record
    lead = soup.find("div", class_="lead")
    meaning = lead.get_text(strip=True) if lead else ""

    # Root: first menukad span holding Hebrew letters and a hyphen
    root = next(
        (
            text
            for text in (node.get_text(strip=True) for node in soup.find_all("span", class_="menukad"))
            if text and re.search(r"[\u05d0-\u05ea]", text) and "-" in text
        ),
        "",
    )

    active_binyan = _extract_binyan_from_page(soup)
    active_infinitive = _parse_table(soup, passive=False).get("infinitive", {}).get("form", "")

    passive_forms_raw = _parse_table(soup, passive=True)
    if not passive_forms_raw:
        logger.warning(f" No passive forms found on {active_slug} for {search_term}")
        return None

    # Heading text first, then inference from the active binyan, then the hint
    passive_binyan = (
        _extract_passive_binyan_from_page(soup)
        or {"Pi'el": "Pu'al", "Hif'il": "Huf'al"}.get(active_binyan, "")
        or binyan_hint
    )

    result = {
        "infinitive": search_term,
        "slug": active_slug,
        "root": root,
        "binyan": passive_binyan,
        "meaning": meaning,
        "is_passive": True,
        "reference_form": active_infinitive or search_term,
        "forms": {
            key: {
                "form": entry["form"],
                "audio_url": entry.get("audio_url", ""),
                "pronoun": PRONOUN_LABELS[key],
                "tense": TENSE_DESCRIPTION.get(key, ""),
            }
            for key, entry in passive_forms_raw.items()
            if key in PRONOUN_LABELS
        },
    }

    logger.info(f" Extracted {len(result['forms'])} passive forms for {search_term} from {active_slug}")
    return result
|
||||
|
||||
|
||||
def main(verbs_file: Path = VERBS_INPUT) -> dict:
    """Extract conjugations for every verb in *verbs_file* and return the full cache.

    Input file format (one entry per line, blank lines ignored):
      * ``<infinitive>`` - a regular verb to search for;
      * ``# slug: <verb> <slug>`` - use ``<slug>`` for ``<verb>`` instead of searching;
      * ``# 3ms: <form> [<active slug>]`` - passive verb given by its 3ms form,
        optionally with the slug of the active verb's page;
      * any other ``#`` line - a comment; if it mentions a binyan name it sets
        the binyan hint for the entries that follow.

    Verbs already present in the cache are skipped. The cache is saved after
    every processed verb, and verbs with no slug are stored as ``None``.

    Returns:
        The conjugations dict (the cache as loaded when the file is missing).
    """
    if not verbs_file.exists():
        logger.warning(f"verbs_input.txt not found at {verbs_file} — skipping")
        return _load_conjugations()

    # Section-header keyword -> canonical binyan name (binyan_hint fallback)
    section_binyan = {
        "pa'al": "Pa'al",
        "nif'al": "Nif'al",
        "pi'el": "Pi'el",
        "pu'al": "Pu'al",
        "hitpa'el": "Hitpa'el",
        "hif'il": "Hif'il",
        "huf'al": "Huf'al",
    }

    slug_overrides: dict[str, str] = {}
    # Each entry: (search_term, is_3ms_search, active_slug, binyan_hint)
    verbs: list[tuple[str, bool, str | None, str]] = []
    current_hint = ""
    for raw_line in verbs_file.read_text(encoding="utf-8").splitlines():
        text = raw_line.strip()
        if not text:
            continue

        if text.startswith("# slug:"):
            # Overrides are only consulted after parsing, so their position is irrelevant
            fields = text.removeprefix("# slug:").strip().split()
            if len(fields) >= 2:
                slug_overrides[fields[0]] = fields[1]
            continue

        if text.startswith("# 3ms:"):
            fields = text.removeprefix("# 3ms:").strip().split()
            if fields:
                known_slug = fields[1] if len(fields) >= 2 else None
                verbs.append((fields[0], True, known_slug, current_hint))
            continue

        if text.startswith("#"):
            # A comment naming a binyan switches the hint for the following entries
            lowered = text.lower()
            current_hint = next(
                (name for keyword, name in section_binyan.items() if keyword in lowered),
                current_hint,
            )
            continue

        verbs.append((text, False, None, current_hint))

    passive_total = sum(1 for entry in verbs if entry[1])
    logger.info(f"Loaded {len(verbs)} verbs from {verbs_file} ({passive_total} passive 3ms)")
    if slug_overrides:
        logger.info(f" Slug overrides: {slug_overrides}")

    conjugations = _load_conjugations()
    new_count = 0

    for verb, is_3ms, active_slug, binyan_hint in verbs:
        if verb in conjugations:
            logger.info(f"Skipping {verb} (cached)")
            continue

        logger.info(f"Processing: {verb} {'(3ms search)' if is_3ms else ''}")
        time.sleep(REQUEST_DELAY)

        # A slug supplied in the input file wins over a site search
        preset = active_slug if is_3ms else slug_overrides.get(verb)
        if preset:
            slug = preset
            if is_3ms:
                logger.info(f" Using active slug {slug} for passive extraction")
            else:
                logger.info(f" Slug override: {preset}")
        else:
            slug = _find_slug(verb)
            if not slug:
                logger.warning(f" No slug found for {verb}")
                # Record the miss so the verb is not searched again on the next run
                conjugations[verb] = None
                _save_conjugations(conjugations)
                continue
            if is_3ms:
                logger.info(f" Found active slug {slug} for passive extraction")
            time.sleep(REQUEST_DELAY)

        if is_3ms:
            data = _extract_passive_from_active_slug(slug, verb, binyan_hint=binyan_hint)
        else:
            data = _extract_conjugations(slug, verb, is_3ms_search=False, binyan_hint=binyan_hint)

        conjugations[verb] = data
        _save_conjugations(conjugations)
        new_count += 1

    logger.info(f"Done: {new_count} new verbs processed")
    return conjugations
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: run the extraction, then print a one-verb-per-entry summary
    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
    result = main()
    for verb, data in result.items():
        if not data:
            print(f"{verb}: no data")
            continue
        forms = data.get("forms", {})
        print(f"{verb}: {len(forms)} forms, binyan={data.get('binyan')}")
        # Show the audio URL of the first stored form as a quick sanity check
        sample_form = next(iter(forms.values()), {})
        print(f" sample audio_url: {sample_form.get('audio_url', 'MISSING')[:60]}")
|
||||
50000
data/en_50k.txt
Normal file
50000
data/en_50k.txt
Normal file
File diff suppressed because it is too large
Load diff
1
data/frequency_clean.json
Normal file
1
data/frequency_clean.json
Normal file
File diff suppressed because one or more lines are too long
97847
data/frequency_discarded.json
Normal file
97847
data/frequency_discarded.json
Normal file
File diff suppressed because it is too large
Load diff
9106
data/hebrew_dict.csv
9106
data/hebrew_dict.csv
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
281266
data/ktiv_male_forms.json
281266
data/ktiv_male_forms.json
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
9106
data/pealim_dict.csv
9106
data/pealim_dict.csv
File diff suppressed because it is too large
Load diff
File diff suppressed because it is too large
Load diff
2297586
data/words.json
Normal file
2297586
data/words.json
Normal file
File diff suppressed because it is too large
Load diff
|
|
@ -0,0 +1,150 @@
|
|||
# Adaptive Sentence Difficulty Cloze — v0.20 Design Spec
|
||||
|
||||
**Date:** 2026-03-15
|
||||
**Status:** Approved
|
||||
**Release:** v0.20
|
||||
|
||||
## Problem
|
||||
|
||||
Cloze cards currently select the example sentence closest to 9 words in length. This ignores whether the surrounding context words are familiar to the learner. A sentence full of rare words is harder than one with common words, regardless of length.
|
||||
|
||||
## Solution
|
||||
|
||||
Replace the length-based `_score()` function in `epub_examples.py` with a **frequency-based difficulty score**. The easiest sentence (most common context words) becomes the cloze. All vetted sentences remain on the card, ordered easy→hard.
|
||||
|
||||
## Scoring Pipeline
|
||||
|
||||
### Token Frequency Lookup (5-tier)
|
||||
|
||||
Given a nikkud sentence token, resolve its frequency rank:
|
||||
|
||||
1. **Known mapping** — look up token in the nikkud→ktiv_male map built from words.json headwords, conjugations, and inflections (94k mappings). If found, look up the ktiv_male in the frequency data.
|
||||
2. **Nikkud prefix stripping** — use `_try_strip_prefix()` to strip validated Hebrew prefixes (בהוכלמש), then resolve the remainder via the known mapping.
|
||||
3. **Academy rules converter** — apply `nikkud_to_ktiv_male.convert()` (91.6% accuracy) to produce ktiv_male, look up in frequency data.
|
||||
4. **strip_nikkud fallback** — use `helpers.strip_nikkud()` as a lossy fallback.
|
||||
5. **Ktiv_male prefix stripping** — strip 1-2 character Hebrew prefixes from the converted/stripped form and look up the stem.
|
||||
|
||||
Tokens not found in any tier are assigned a default high rank (50,000).
|
||||
|
||||
**Coverage:** ~93% of example sentence tokens resolve to a frequency rank (measured empirically on 7,588 sentences).
|
||||
|
||||
**Frequency data source:** Use `frequency_lookup.py`, which auto-selects `frequency_clean.json` when available and otherwise falls back to `frequency_cache.json`.
|
||||
|
||||
### Sentence Difficulty Score
|
||||
|
||||
For a given word's candidate sentence:
|
||||
|
||||
1. Tokenize: split on whitespace, strip punctuation (.,!?;:"'"״׳–—()[]{}), split on maqaf (־).
|
||||
2. Exclude the target word's token using `cloze_word_start`/`cloze_word_end` offsets from the matched sentence.
|
||||
3. For each remaining token (length >= 2), resolve its frequency rank via the 5-tier pipeline.
|
||||
4. **Score = median frequency rank of context tokens.**
|
||||
|
||||
Lower score = easier (context words are more common). Median resists outliers (one rare proper noun shouldn't dominate).
|
||||
|
||||
### Integration Point
|
||||
|
||||
The scoring integrates into `epub_examples.py`'s existing `_score()` closure inside `update_words_json()` (line ~677). Currently:
|
||||
|
||||
```python
|
||||
def _score(s: dict) -> tuple[int,]:
|
||||
wc = s["word_count"]
|
||||
length_score = abs(wc - 9) if not (6 <= wc <= 12) else 0
|
||||
return (length_score,)
|
||||
```
|
||||
|
||||
New scoring replaces length with frequency-based difficulty. The `_score` function gains access to the frequency pipeline via closure over the nikkud_map, nikkud_index, and freq_data built once at the start of `update_words_json()`.
|
||||
|
||||
**Minimum sentence length:** Reduced from 4 words to 3 words (`MIN_WORDS = 3` in epub_examples.py). Hebrew is more concise than English — 3-word sentences are valid and common. This expands the candidate pool for cloze selection.
|
||||
|
||||
**Behavioral change:** Because `pool.sort(key=_score)` determines which 3 sentences are selected as `best = pool[:3]`, changing the scoring function changes **which sentences are selected**, not just their order. This is intentional — we want the easiest sentences as cloze candidates, not the closest-to-9-words ones. Existing cloze GUIDs will be preserved when the same sentence text is re-selected; entries where a different sentence wins will get new GUIDs.
|
||||
|
||||
## Data Model Changes
|
||||
|
||||
### words.json
|
||||
|
||||
The `examples.cloze` dict (single sentence) gains an optional `difficulty_score` field:
|
||||
|
||||
```json
|
||||
{
|
||||
"examples": {
|
||||
"vetted": [
|
||||
{"text": "...", "source": "...", "match_method": "..."},
|
||||
{"text": "...", "source": "...", "match_method": "..."}
|
||||
],
|
||||
"cloze": {
|
||||
"text": "...",
|
||||
"cloze_word_start": 5,
|
||||
"cloze_word_end": 10,
|
||||
"cloze_hint": null,
|
||||
"cloze_guid": "abc123",
|
||||
"difficulty_score": 234
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
The vetted list is also sorted by difficulty (easiest first), so the card back shows sentences in pedagogically useful order.
|
||||
|
||||
### SCHEMA.yaml
|
||||
|
||||
Add `difficulty_score` as optional integer field under `examples.cloze`.
|
||||
|
||||
## Implementation Scope
|
||||
|
||||
### New file: `sentence_difficulty.py`
|
||||
|
||||
Standalone module for sentence scoring. No pipeline step — called by `epub_examples.py`.
|
||||
|
||||
- `score_sentence(sentence_text: str, target_start: int, target_end: int, nikkud_map: dict, nikkud_index: dict, freq_data: dict) -> int` — returns median context frequency rank. Uses `target_start`/`target_end` character offsets to exclude the cloze target token.
|
||||
- `build_nikkud_map(words: dict) -> dict[str, str]` — builds nikkud→ktiv_male lookup from words.json (headwords + conjugation forms + noun inflections). Returns `{nikkud_form: ktiv_male_form}`. Implementation note: should share iteration logic with `epub_examples._build_nikkud_index()` or derive from its output to avoid duplicating the traversal of words.json forms.
|
||||
- `_resolve_token_frequency(token: str, nikkud_map: dict, nikkud_index: dict, freq_data: dict) -> int` — the 5-tier lookup. Uses `_try_strip_prefix` from epub_examples (made importable by removing underscore or adding a public wrapper).
|
||||
|
||||
### Modified files
|
||||
|
||||
- **`epub_examples.py`**:
|
||||
- Import `sentence_difficulty.score_sentence` and `sentence_difficulty.build_nikkud_map`
|
||||
- In `update_words_json()`: build nikkud_map and load freq_data once at start (before per-word loop)
|
||||
- Replace `_score()` closure with frequency-based scoring that calls `score_sentence()`
|
||||
- Sort vetted list by difficulty score (easiest first)
|
||||
- Store `difficulty_score` in the cloze dict
|
||||
- Make `_try_strip_prefix` importable (rename to `try_strip_prefix` or add public alias)
|
||||
- **`frequency_lookup.py`** — add `get_freq_data() -> dict` public accessor to expose the loaded frequency dict (avoids accessing private `_freq` directly)
|
||||
- **`SCHEMA.yaml`** — add `difficulty_score` field
|
||||
- **`run.py`** — no changes; scoring happens inside epub_examples step
|
||||
|
||||
### Not modified
|
||||
|
||||
- **`apkg_builder.py`** — reads cloze as-is; vetted order is already respected
|
||||
- **`nikkud_to_ktiv_male.py`** — used as-is
|
||||
- **Card templates** — no changes needed
|
||||
|
||||
## Dependencies
|
||||
|
||||
- `nikkud_to_ktiv_male.convert()` — Academy rules converter (already written)
|
||||
- `epub_examples._try_strip_prefix()` / `_build_nikkud_index()` — nikkud prefix stripping and index
|
||||
- `frequency_lookup.py` — loads frequency data (auto-selects clean vs cache)
|
||||
- `helpers.strip_nikkud()` — fallback converter
|
||||
|
||||
## Validation
|
||||
|
||||
- **Unit tests** for `score_sentence()` with known easy/hard sentences
|
||||
- **Unit tests** for `_resolve_token_frequency()` covering all 5 tiers
|
||||
- **Integration test**: verify cloze selection picks easiest sentence, vetted list is sorted
|
||||
- **Spot check**: manually review 10 words with 3+ sentences to confirm ordering
|
||||
- **Regression**: existing tests pass, GUID coverage unchanged, deck validates
|
||||
|
||||
## Constraints
|
||||
|
||||
- `examples.cloze` remains a single dict (not converted to list)
|
||||
- No new Anki card types or fields
|
||||
- No runtime JS in Anki cards
|
||||
- No network calls during scoring
|
||||
- `difficulty_score` is informational metadata; card rendering doesn't depend on it
|
||||
- Existing cloze GUIDs preserved when the same sentence is re-selected
|
||||
|
||||
## Scope Exclusions (Future Work)
|
||||
|
||||
- **Pronominal suffix stripping** — would improve the ~7% unscored token rate; deferred (see PROJECT_NOTES.md)
|
||||
- **Kamatz katan disambiguation** — requires morphological analysis; accepted limitation
|
||||
- **Per-learner adaptive difficulty** — requires Anki plugin; out of scope for static deck
|
||||
- **Multiple cloze sentences per card** — would require schema migration to list; deferred
|
||||
883
epub_examples.py
883
epub_examples.py
File diff suppressed because it is too large
Load diff
|
|
@ -3,6 +3,10 @@
|
|||
Hebrew word frequency lookup from hermitdave/FrequencyWords corpus.
|
||||
Downloads he_50k.txt once; subsequent runs read from cache.
|
||||
Exposed API: get_frequency_rank(word_no_nikkud) -> int | None
|
||||
|
||||
TODO: Rewrite to update words.json frequency field directly instead of
|
||||
writing to a separate frequency_cache.json. Currently the migration script
|
||||
bridges the gap. See Phase 5 in SPRINT_LOG.md.
|
||||
"""
|
||||
|
||||
import json
|
||||
|
|
@ -11,12 +15,11 @@ from pathlib import Path
|
|||
|
||||
import requests
|
||||
|
||||
from helpers import strip_nikkud as _strip_nikkud
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
FREQ_URL = "https://raw.githubusercontent.com/hermitdave/FrequencyWords/master/content/2016/he/he_50k.txt"
|
||||
CACHE_PATH = Path(__file__).parent / "data" / "frequency_cache.json"
|
||||
CLEAN_CACHE_PATH = Path(__file__).parent / "data" / "frequency_clean.json"
|
||||
REQUEST_TIMEOUT = 30
|
||||
|
||||
# Module-level cache: word_no_nikkud -> rank (1 = most common)
|
||||
|
|
@ -24,12 +27,19 @@ _freq: dict[str, int] = {}
|
|||
|
||||
|
||||
def load(cache_path: Path = CACHE_PATH) -> None:
|
||||
"""Load frequency data from cache, downloading if not present."""
|
||||
"""Load frequency data from cache, downloading if not present.
|
||||
|
||||
Prefers frequency_clean.json (YAP-filtered) over raw frequency_cache.json.
|
||||
"""
|
||||
global _freq
|
||||
if cache_path.exists():
|
||||
with open(cache_path, encoding="utf-8") as f:
|
||||
# Prefer YAP-cleaned frequency data if available
|
||||
clean_path = cache_path.parent / "frequency_clean.json" if cache_path == CACHE_PATH else None
|
||||
load_path = clean_path if clean_path and clean_path.exists() else cache_path
|
||||
if load_path.exists():
|
||||
with open(load_path, encoding="utf-8") as f:
|
||||
_freq = json.load(f)
|
||||
logger.info(f"Frequency cache loaded: {len(_freq)} entries")
|
||||
label = "clean" if load_path == clean_path else "raw"
|
||||
logger.info(f"Frequency cache loaded ({label}): {len(_freq)} entries")
|
||||
return
|
||||
|
||||
logger.info("Downloading FrequencyWords he_50k.txt …")
|
||||
|
|
@ -41,7 +51,7 @@ def load(cache_path: Path = CACHE_PATH) -> None:
|
|||
line = line.strip()
|
||||
if not line:
|
||||
continue
|
||||
word = _strip_nikkud(line.split()[0])
|
||||
word = line.split()[0]
|
||||
if word and word not in _freq:
|
||||
_freq[word] = rank
|
||||
rank += 1
|
||||
|
|
@ -56,14 +66,24 @@ def get_frequency_rank(word_no_nikkud: str) -> int | None:
|
|||
"""
|
||||
Return the frequency rank of a word (1 = most common).
|
||||
Returns None if not found in the corpus.
|
||||
Strips nikkud from the input before lookup.
|
||||
Expects ktiv male (no nikkud) input.
|
||||
"""
|
||||
if not _freq:
|
||||
load()
|
||||
clean = _strip_nikkud(word_no_nikkud.strip())
|
||||
clean = word_no_nikkud.strip()
|
||||
return _freq.get(clean)
|
||||
|
||||
|
||||
def get_freq_data() -> dict[str, int]:
    """Return the module-level frequency table (word -> rank).

    When the table is still empty, ``load()`` is called first to populate it.
    The returned dict is the shared module cache itself, not a copy.
    """
    if _freq:
        return _freq
    load()
    return _freq
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
|
||||
load()
|
||||
|
|
|
|||
|
|
@ -1,216 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract Hebrew vocabulary from pealim.com dictionary.
|
||||
Scrapes word entries, roots, parts of speech, and audio URLs for Anki flashcards.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Configure logging
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
# Session for connection pooling
|
||||
session = requests.Session()
|
||||
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-scraper/1.0)"})
|
||||
|
||||
PEALIM_DICT_URL = "https://www.pealim.com/dict/"
|
||||
REQUEST_DELAY = 1.5 # seconds between requests (respectful scraping)
|
||||
REQUEST_TIMEOUT = 10 # seconds
|
||||
|
||||
|
||||
def get_total_pages() -> int:
    """Return the number of dictionary list pages to scrape.

    A probe request is sent to the dictionary index, but the page count is
    not parsed from the response: the fixed value 608 is returned whether the
    probe succeeds or fails (a failure is only logged).
    """
    # Hardcoded — pealim.com has ~608 pages at ~15 words/page
    page_count = 608
    try:
        logger.info("Fetching total page count...")
        probe = session.get(
            PEALIM_DICT_URL,
            cookies={"translit": "none", "hebstyle": "mo"},
            timeout=REQUEST_TIMEOUT,
        )
        probe.raise_for_status()
    except Exception as e:
        logger.error(f"Error fetching page count: {e}. Using default (608).")
    return page_count
|
||||
|
||||
|
||||
def _parse_page_with_audio(html_bytes: bytes) -> list[dict]:
    """Extract word records (including the audio URL) from one list page.

    Every table row with at least four cells is read as
    word / root / part of speech / meaning. Rows whose word cell is empty
    are dropped.

    Args:
        html_bytes: Raw HTML of a dictionary list page.

    Returns:
        One dict per kept row with keys ``Word``, ``Root``,
        ``Part of Speech``, ``Meaning`` and ``audio_url``.
    """
    parsed: list[dict] = []
    document = BeautifulSoup(html_bytes, "html.parser")

    for table_row in document.select("table tr"):
        cells = table_row.find_all("td")
        if len(cells) < 4:
            continue
        word_cell, root_cell, pos_cell, meaning_cell = cells[:4]

        # The audio link lives on an element carrying a data-audio attribute.
        audio_holder = word_cell.find(attrs={"data-audio": True})
        if audio_holder:
            audio_url = audio_holder["data-audio"]
        else:
            audio_url = ""

        # Prefer the vocalized span; fall back to the whole cell text.
        vocalized = word_cell.find("span", class_="menukad")
        if vocalized:
            word = vocalized.get_text(strip=True)
        else:
            word = word_cell.get_text(strip=True)

        if not word:
            continue

        root_text = root_cell.get_text(strip=True)
        parsed.append(
            {
                "Word": word,
                "Root": root_text or "-",
                "Part of Speech": pos_cell.get_text(strip=True),
                "Meaning": meaning_cell.get_text(strip=True),
                "audio_url": audio_url,
            }
        )

    return parsed
|
||||
|
||||
|
||||
def _scrape_page(url: str) -> list[dict]:
    """Fetch one list page in both spelling variants and merge them into row dicts."""
    # First request: with nikkud — parsed for word data and audio URL.
    cookies = {"translit": "none", "hebstyle": "mo"}
    response = session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    page_rows = _parse_page_with_audio(response.content)

    # Second request: without nikkud — only the word column is used.
    cookies_vl = {"translit": "none", "hebstyle": "vl", "showmeaning": "off"}
    resp_vl = session.get(url, cookies=cookies_vl, timeout=REQUEST_TIMEOUT)
    resp_vl.raise_for_status()
    soup_vl = BeautifulSoup(resp_vl.content, "html.parser")

    no_nik_words = []
    for tr in soup_vl.select("table tr"):
        tds = tr.find_all("td")
        if len(tds) < 4:
            continue
        menukad = tds[0].find("span", class_="menukad")
        no_nik_words.append(menukad.get_text(strip=True) if menukad else tds[0].get_text(strip=True))

    # Rows are paired by position; a row with no counterpart gets "".
    for i, row in enumerate(page_rows):
        row["Word Without Nikkud"] = no_nik_words[i] if i < len(no_nik_words) else ""

    return page_rows


def extract_from_website(max_pages: int | None = None) -> pd.DataFrame:
    """Extract dictionary entries from pealim.com.

    Each page is fetched twice (with and without nikkud) and the two variants
    are merged row by row. A page that fails with a network error is retried
    once before being skipped; any other error skips the page immediately.

    Args:
        max_pages: Maximum pages to scrape. ``None`` means all pages as
            reported by ``get_total_pages()``; ``0`` scrapes nothing.

    Returns:
        DataFrame with Word, Root, Part of Speech, Meaning,
        Word Without Nikkud, audio_url columns (no columns at all when
        nothing was scraped).
    """
    # `max_pages or ...` would silently turn an explicit 0 into "all pages".
    total_pages = get_total_pages() if max_pages is None else max_pages
    logger.info(f"Starting extraction from {total_pages} pages...")

    max_attempts = 2  # one initial try plus the retry the log message promises
    all_rows: list[dict] = []

    for page_num in range(1, total_pages + 1):
        url = f"{PEALIM_DICT_URL}?page={page_num}"

        for attempt in range(1, max_attempts + 1):
            try:
                page_rows = _scrape_page(url)
            except requests.RequestException as e:
                outcome = "Retrying..." if attempt < max_attempts else "Giving up on this page."
                logger.error(f"Error fetching page {page_num}: {e}. {outcome}")
                # Back off longer than the normal delay before touching the site again.
                time.sleep(REQUEST_DELAY * 2)
                continue
            except Exception as e:
                logger.error(f"Unexpected error on page {page_num}: {e}")
                break

            all_rows.extend(page_rows)
            if page_num % 50 == 0:
                logger.info(f"Processed {page_num}/{total_pages} pages ({len(all_rows)} words so far)...")
            time.sleep(REQUEST_DELAY)
            break

    df = pd.DataFrame(all_rows)
    # An empty scrape yields a frame without columns, hence the guard.
    audio_count = (df["audio_url"] != "").sum() if "audio_url" in df.columns else 0
    logger.info(f"Extraction complete. Total words: {len(df)}, with audio URL: {audio_count}")
    return df
|
||||
|
||||
|
||||
def modify_for_anki(df: pd.DataFrame) -> pd.DataFrame:
    """Transform dictionary DataFrame for Anki import.

    Adds two columns to *df* in place and returns the same object:

    * ``shared roots`` — the other words with the same root, space-separated
      in table order (empty when the root is ``"-"`` or missing).
    * ``tags`` — a root tag plus at most one part-of-speech tag.

    All existing columns, including ``audio_url``, are left untouched.

    Args:
        df: Frame with at least ``Word``, ``Root`` and ``Part of Speech``
            columns (an empty frame is accepted).

    Returns:
        The same DataFrame, with the two columns added.
    """
    logger.info("Preparing data for Anki...")

    # English POS keyword -> Hebrew tag; the first keyword found in the POS text wins.
    pos_tags = {
        "Adverb": "תוארי_הפועל",
        "Pronoun": "כינויי_גוף",
        "Noun": "שם_עצם",
        "Verb": "פעלים",
        "Adjective": "שם_תואר",
        "Preposition": "מילות_יחס",
        "Conjunction": "מילות_חיבור",
        "Particle": "מילית",
    }

    # Index the words of each root once, instead of filtering the whole frame
    # for every row (which made the original quadratic in the table size).
    words_by_root: dict = {}
    for _idx, row in df.iterrows():
        root = row["Root"]
        if root != "-" and pd.notna(root):
            words_by_root.setdefault(root, []).append(row["Word"])

    shared_root_words = []
    tags = []
    for _idx, row in df.iterrows():
        root = row["Root"]
        word = row["Word"]

        if root != "-" and pd.notna(root):
            shared_root_words.append(" ".join(str(w) for w in words_by_root[root] if w != word))
        else:
            shared_root_words.append("")

        tag_parts = []

        # Missing roots stringify to "nan"; those and "-" produce no root tag.
        root_text = str(root).replace(" ", "").replace("-", "")
        if "nan" not in root_text and root_text:
            root_clean = root_text.replace(".", "")
            tag_parts.append(f"שורש::{root_clean}")

        pos = str(row["Part of Speech"])
        for key, value in pos_tags.items():
            if key in pos:
                tag_parts.append(value)
                break

        tags.append(" ".join(tag_parts))

    df["shared roots"] = shared_root_words
    df["tags"] = tags

    logger.info("Anki preparation complete.")
    return df
|
||||
|
||||
|
||||
def main():
    """Scrape the dictionary and write the raw and Anki-ready CSV files.

    Any exception is logged as fatal and then re-raised.
    """
    try:
        scraped = extract_from_website()
        scraped.to_csv("hebrew_dict.csv", index=True)
        logger.info("Saved: hebrew_dict.csv")

        anki_ready = modify_for_anki(scraped)
        anki_ready.to_csv("hebrew_dict_for_anki.csv", sep=";", index=True)
        logger.info("Saved: hebrew_dict_for_anki.csv")

        logger.info("Complete!")
    except Exception as e:
        logger.error(f"Fatal error: {e}")
        raise
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -2,6 +2,10 @@
|
|||
"""
|
||||
Fetch images for concrete Hebrew nouns from Wikipedia / Wikimedia Commons.
|
||||
|
||||
TODO: Rewrite to update words.json image/image_source fields directly instead of
|
||||
writing to a separate image_cache.json. Currently the migration script bridges
|
||||
the gap. See Phase 5 in SPRINT_LOG.md.
|
||||
|
||||
Scope: Noun PoS entries only. Concreteness heuristic:
|
||||
- English meaning has no abstract suffixes (-tion, -ity, -ness, -ment, -ance, -ism, -hood,
|
||||
-ship, -ure, -al, -ing when not a gerund, -ence)
|
||||
|
|
@ -27,8 +31,6 @@ from pathlib import Path
|
|||
|
||||
import requests
|
||||
|
||||
from helpers import strip_nikkud as _strip_nikkud
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DATA_DIR = Path(__file__).parent / "data"
|
||||
|
|
@ -59,7 +61,6 @@ session.headers.update(
|
|||
)
|
||||
|
||||
|
||||
|
||||
def is_concrete(english_meaning: str) -> bool:
|
||||
"""Return True if the English meaning looks like a concrete noun."""
|
||||
meaning = english_meaning.strip().lower()
|
||||
|
|
@ -75,7 +76,7 @@ def is_concrete(english_meaning: str) -> bool:
|
|||
|
||||
def _safe_name(word_no_nikkud: str) -> str:
|
||||
"""Create a safe ASCII-ish filename from a Hebrew word (strip to Hebrew letters only)."""
|
||||
hebrew_only = re.sub(r"[^\u05d0-\u05ea]", "", _strip_nikkud(word_no_nikkud))
|
||||
hebrew_only = re.sub(r"[^\u05d0-\u05ea]", "", word_no_nikkud)
|
||||
return hebrew_only if hebrew_only else "unknown"
|
||||
|
||||
|
||||
|
|
@ -258,7 +259,7 @@ def run(limit: int | None = None, dry_run: bool = False, single_word: str | None
|
|||
if single_word and word_plain != single_word:
|
||||
continue
|
||||
|
||||
cache_key = word_plain or _strip_nikkud(word)
|
||||
cache_key = word_plain
|
||||
|
||||
if cache_key in cache:
|
||||
skipped_cached += 1
|
||||
|
|
|
|||
185
nikkud_to_ktiv_male.py
Normal file
185
nikkud_to_ktiv_male.py
Normal file
|
|
@ -0,0 +1,185 @@
|
|||
"""Convert nikkud (vocalized) Hebrew to ktiv male (plene spelling).
|
||||
|
||||
Implements Hebrew Academy rules for matres lectionis insertion:
|
||||
- Rule A: U vowel (kubutz) → always insert vav
|
||||
- Rule B: O vowel (holam on non-vav) → insert vav
|
||||
- Rule C: I vowel (hiriq) → insert yod (conditionally)
|
||||
- Rule D: E vowel (tsere) → insert yod (limited cases)
|
||||
- Rule E/F: Consonantal vav/yod doubling
|
||||
|
||||
Reference: https://hebrew-academy.org.il/topic/hahlatot/missingvocalizationspelling/
|
||||
"""
|
||||
|
||||
import unicodedata
|
||||
|
||||
# Hebrew nikkud code points
|
||||
SHVA = "\u05b0"
HATAF_SEGOL = "\u05b1"
HATAF_PATAH = "\u05b2"
HATAF_KAMATZ = "\u05b3"
HIRIQ = "\u05b4"
TSERE = "\u05b5"
SEGOL = "\u05b6"
PATAH = "\u05b7"
KAMATZ = "\u05b8"
HOLAM = "\u05b9"
HOLAM_HASER = "\u05ba"
KUBUTZ = "\u05bb"
DAGESH = "\u05bc"
METEG = "\u05bd"
RAFE = "\u05bf"
SHIN_DOT = "\u05c1"
SIN_DOT = "\u05c2"
# Dedicated qamats-qatan point. Some vocalized texts use it instead of the
# plain kamatz; without it a letter carrying this mark would look vowel-less.
KAMATZ_KATAN = "\u05c7"

VAV = "ו"
YOD = "י"
MAQAF = "־"

# Marks that give a letter its vowel (shva and the hataf vowels included).
VOWELS = {
    SHVA,
    HATAF_SEGOL,
    HATAF_PATAH,
    HATAF_KAMATZ,
    HIRIQ,
    TSERE,
    SEGOL,
    PATAH,
    KAMATZ,
    HOLAM,
    HOLAM_HASER,
    KUBUTZ,
    KAMATZ_KATAN,
}

# Every pointing mark: the vowels plus the non-vowel diacritics.
NIKKUD_MARKS = VOWELS | {DAGESH, METEG, RAFE, SHIN_DOT, SIN_DOT}
|
||||
|
||||
|
||||
def _parse_segments(text: str) -> list[tuple[str, list[str]]]:
|
||||
"""Parse nikkud text into (character, [marks]) segments."""
|
||||
segments: list[tuple[str, list[str]]] = []
|
||||
cur_char: str | None = None
|
||||
cur_marks: list[str] = []
|
||||
|
||||
for ch in text:
|
||||
if unicodedata.category(ch) == "Mn":
|
||||
cur_marks.append(ch)
|
||||
else:
|
||||
if cur_char is not None:
|
||||
segments.append((cur_char, cur_marks))
|
||||
cur_char = ch
|
||||
cur_marks = []
|
||||
|
||||
if cur_char is not None:
|
||||
segments.append((cur_char, cur_marks))
|
||||
|
||||
return segments
|
||||
|
||||
|
||||
def _get_vowel(marks: list[str]) -> str | None:
    """Return the first mark in *marks* that is a vowel point, or ``None`` if there is none."""
    return next((mark for mark in marks if mark in VOWELS), None)
|
||||
|
||||
|
||||
def _has_dagesh(marks: list[str]) -> bool:
    """Return True when the dagesh point is among *marks*."""
    return any(mark == DAGESH for mark in marks)
|
||||
|
||||
|
||||
def _is_hebrew_letter(ch: str) -> bool:
|
||||
return "\u05d0" <= ch <= "\u05ea"
|
||||
|
||||
|
||||
def _is_word_break(ch: str) -> bool:
    """Return True when *ch* separates words: any whitespace or the Hebrew maqaf."""
    return ch.isspace() or ch == MAQAF


def _letters_before_in_word(segments: list[tuple[str, list[str]]], index: int) -> int:
    """Count Hebrew letters between the start of the current word and *index* (exclusive)."""
    count = 0
    for seg_ch, _marks in reversed(segments[:index]):
        if _is_word_break(seg_ch):
            break
        if _is_hebrew_letter(seg_ch):
            count += 1
    return count


def _letters_after_in_word(segments: list[tuple[str, list[str]]], index: int) -> int:
    """Count Hebrew letters after *index* up to the end of the current word."""
    count = 0
    for seg_ch, _marks in segments[index + 1 :]:
        if _is_word_break(seg_ch):
            break
        if _is_hebrew_letter(seg_ch):
            count += 1
    return count


def convert(text: str) -> str:
    """Convert nikkud Hebrew text to ktiv male.

    Strips all nikkud marks and inserts matres lectionis (vav/yod)
    according to Hebrew Academy spelling rules. Non-Hebrew characters are
    copied through unchanged (without any combining marks attached to them).

    Position-sensitive rules ("word-initial", "short suffix") are evaluated
    per word, where words are separated by whitespace or maqaf, so a
    multi-word string converts the same way as its words would individually.

    Args:
        text: Vocalized Hebrew text (a single word or several words).

    Returns:
        The unvocalized plene spelling of *text*.
    """
    segments = _parse_segments(text)
    result: list[str] = []

    for i, (ch, marks) in enumerate(segments):
        if not _is_hebrew_letter(ch):
            # Non-Hebrew character: output as-is (no marks)
            result.append(ch)
            continue

        vowel = _get_vowel(marks)
        has_dag = _has_dagesh(marks)
        has_next = i + 1 < len(segments)

        # Output the base letter (all nikkud marks are dropped)
        result.append(ch)

        # --- Rule A: U vowel (kubutz) → always add vav ---
        if vowel == KUBUTZ:
            result.append(VAV)
            continue

        # --- Shuruk detection ---
        # A vav carrying a dagesh and no vowel is a shuruk: the vav itself is
        # the mater lectionis and has already been output. A vav with a dagesh
        # AND a vowel is a consonantal vav and falls through to the rules below.
        if ch == VAV and has_dag and vowel is None:
            continue

        # --- Rule B: O vowel (holam) → add vav ---
        if vowel in (HOLAM, HOLAM_HASER):
            if ch != VAV:
                # Exception: holam before aleph (pe-aleph verbs) — no vav
                # e.g., תֹּאבַד→תאבד, יֹאבַד→יאבד, נֹאבַד→נאבד
                next_is_aleph = has_next and segments[i + 1][0] == "א"
                if not next_is_aleph:
                    result.append(VAV)
            # If ch IS vav (holam male), the vav has already been output
            continue

        # --- Rule C: I vowel (hiriq) → conditionally add yod ---
        if vowel == HIRIQ:
            if ch == YOD:
                # Yod already present, don't double
                continue

            # Don't insert yod if the next letter is already yod
            if has_next and segments[i + 1][0] == YOD:
                continue

            add_yod = True

            if has_next:
                next_ch, next_marks = segments[i + 1]
                next_vowel = _get_vowel(next_marks)

                # Shva on the next consonant = shva nach → don't add yod,
                # UNLESS that consonant also has a dagesh (= shva na / doubled)
                if next_vowel == SHVA and not _has_dagesh(next_marks):
                    add_yod = False
                # No vowel on the next consonant = closed syllable near the end
                # of the word (short suffix such as תי, נו) → don't add yod.
                # Counted within the current word only.
                elif next_vowel is None and _is_hebrew_letter(next_ch):
                    if _letters_after_in_word(segments, i) <= 2:
                        add_yod = False

            if add_yod:
                result.append(YOD)
            continue

        # --- Rule D: E vowel (tsere) → generally NO yod ---
        # Exception: tsere before a guttural/resh gets yod ONLY at the start
        # of a word (dagesh substitution in Hif'il/noun patterns)
        # e.g., הֵחֵל→היחל, תֵּאָבֵד→תיאבד, הֵרִיעַ→היריע
        # but NOT mid-word: מְסַפֵּר→מספר, מְעַבֵּר→מעבר
        if vowel == TSERE:
            add_yod = False

            if has_next and segments[i + 1][0] in "אהחער":
                # Only at word-initial (pos 0) or after one prefix letter (pos 1),
                # measured from the start of the current word.
                if _letters_before_in_word(segments, i) <= 1:
                    add_yod = True

            if add_yod:
                result.append(YOD)
            continue

        # All other vowels (patah, kamatz, segol, shva, hataf-*):
        # no mater lectionis insertion needed

    return "".join(result)
|
||||
BIN
pealim.apkg
BIN
pealim.apkg
Binary file not shown.
348
pealim_audio_download.py
Normal file
348
pealim_audio_download.py
Normal file
|
|
@ -0,0 +1,348 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Download audio files from URLs stored in words.json.
|
||||
|
||||
Three audio categories are handled:
|
||||
1. Vocab audio → data/audio/{audio_file}
|
||||
2. Noun plural → data/audio/{slug}_plural.mp3
|
||||
3. Conjugation → data/audio_conj/{slug}_{form_key}.mp3
|
||||
data/audio_conj/{slug}_passive_{form_key}.mp3
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DATA_DIR = Path(__file__).parent / "data"
|
||||
AUDIO_DIR = DATA_DIR / "audio"
|
||||
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
|
||||
WORDS_JSON = DATA_DIR / "words.json"
|
||||
|
||||
DOWNLOAD_DELAY = 0.3
|
||||
MAX_RETRIES = 3
|
||||
|
||||
# Map Hebrew tense names to English prefixes for form_key construction.
|
||||
# "מְקוֹר" (infinitive) is included for forward compatibility; it does not
|
||||
# appear in the current dataset but the form_key collapses to bare "infinitive".
|
||||
TENSE_TO_PREFIX = {
|
||||
"הוֹוֶה": "present",
|
||||
"עָבָר": "past",
|
||||
"עָתִיד": "future",
|
||||
"צִוּוּי": "imperative",
|
||||
"מְקוֹר": "infinitive",
|
||||
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _make_audio_file(entry: dict) -> str:
|
||||
"""Derive the vocab audio filename when audio_file is absent.
|
||||
|
||||
Slug-based for confusable entries (slug contains the disambiguating ID),
|
||||
consonant-only for all others.
|
||||
|
||||
Args:
|
||||
entry: A words.json entry dict.
|
||||
|
||||
Returns:
|
||||
Filename string, e.g. ``"1234-shalom.mp3"`` or ``"שלום.mp3"``.
|
||||
"""
|
||||
audio_file = entry.get("audio_file", "")
|
||||
if audio_file:
|
||||
return audio_file
|
||||
# Fallback: use slug for confusables, ktiv_male for others
|
||||
slug = entry.get("slug", "")
|
||||
if entry.get("confusable_group"):
|
||||
return f"{slug}.mp3"
|
||||
ktiv_male = entry.get("word", {}).get("ktiv_male", "")
|
||||
safe_name = re.sub(r"[^\u05d0-\u05ea]", "", ktiv_male)
|
||||
return f"{safe_name}.mp3"
|
||||
|
||||
|
||||
def _form_key(person: str, tense: str) -> str:
    """Combine a tense and a person code into the key used in audio filenames.

    The Hebrew tense name is mapped through ``TENSE_TO_PREFIX``; a tense that
    is not in the table is used verbatim as the prefix.

    Args:
        person: Person code, e.g. ``"1s"``, ``"3fp"``, ``"ms"``.
        tense: Hebrew tense string from the conjugation form.

    Returns:
        Form key such as ``"past_1s"`` or ``"present_ms"``. The infinitive
        has no person suffix and always yields ``"infinitive"``.
    """
    tense_label = TENSE_TO_PREFIX.get(tense, tense)
    return tense_label if tense_label == "infinitive" else f"{tense_label}_{person}"
|
||||
|
||||
|
||||
def _download(url: str, dest: Path, session: requests.Session) -> bool:
    """Download *url* to *dest*, retrying up to MAX_RETRIES times.

    Skips the download silently if *dest* already exists. The payload is
    first written to a sibling ``<name>.part`` file and then moved into
    place, so an interrupted write can never leave a truncated *dest* that a
    later run would treat as already downloaded.

    Args:
        url: HTTP(S) URL to download.
        dest: Local path to write the file to.
        session: Shared requests session.

    Returns:
        ``True`` if the file was downloaded (or already existed),
        ``False`` if all retries were exhausted (or no attempt was allowed).

    Raises:
        OSError: If the file cannot be written; network errors are retried
            and never propagate.
    """
    if dest.exists():
        return True

    staging = dest.with_name(dest.name + ".part")

    for attempt in range(1, MAX_RETRIES + 1):
        try:
            resp = session.get(url, timeout=15)
            resp.raise_for_status()
            payload = resp.content
        except requests.RequestException as exc:
            if attempt >= MAX_RETRIES:
                logger.error("All %d attempts failed for %s: %s", MAX_RETRIES, url, exc)
                return False
            # Exponential back-off: 2s, 4s, 8s, …
            wait = 2**attempt
            logger.warning(
                "Attempt %d/%d failed for %s (%s) — retrying in %ds",
                attempt,
                MAX_RETRIES,
                url,
                exc,
                wait,
            )
            time.sleep(wait)
            continue

        try:
            staging.write_bytes(payload)
            staging.replace(dest)
        finally:
            # After a successful move the staging file is gone; after a failed
            # write this removes the partial data.
            staging.unlink(missing_ok=True)
        logger.debug("Downloaded %s → %s", url, dest.name)
        return True

    # Only reached when MAX_RETRIES allows no attempt at all.
    return False
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Per-category downloaders
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def download_vocab_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int, int]:
    """Fetch the main vocabulary recording of every entry into ``AUDIO_DIR``.

    Entries without an ``audio_url`` are counted as "no URL"; downloads that
    fail after all retries are added to that same counter. Files already on
    disk are counted as cached and not requested again.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached, no_url) counts.
    """
    fetched = 0
    already_present = 0
    missing = 0

    for item in entries:
        source_url: str | None = item.get("audio_url")
        if not source_url:
            missing += 1
            continue

        filename: str = item.get("audio_file") or _make_audio_file(item)
        target = AUDIO_DIR / filename

        if target.exists():
            already_present += 1
        elif _download(source_url, target, session):
            fetched += 1
            # Pause only after a real request, to stay polite to the server.
            time.sleep(DOWNLOAD_DELAY)
        else:
            missing += 1  # count persistent failures alongside missing URLs

    return fetched, already_present, missing
|
||||
|
||||
|
||||
def download_noun_plural_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int]:
    """Fetch plural-form recordings for nouns.

    Only entries whose ``noun_inflection`` is a non-empty dict with a
    ``plural_audio`` value starting with ``"http"`` are considered. Failed
    downloads are not counted.

    Destination: ``data/audio/{slug}_plural.mp3``

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached) counts.
    """
    fetched = 0
    already_present = 0

    for item in entries:
        inflection = item.get("noun_inflection")
        if not (inflection and isinstance(inflection, dict)):
            continue

        source_url: str | None = inflection.get("plural_audio")
        if not (source_url and source_url.startswith("http")):
            continue

        slug: str = item["slug"]
        target = AUDIO_DIR / f"{slug}_plural.mp3"

        if target.exists():
            already_present += 1
        elif _download(source_url, target, session):
            fetched += 1
            time.sleep(DOWNLOAD_DELAY)

    return fetched, already_present
|
||||
|
||||
|
||||
def download_conjugation_audio(
    entries: list[dict],
    session: requests.Session,
) -> tuple[int, int, int]:
    """Fetch the recordings of individual conjugated verb forms.

    Active forms  → ``data/audio_conj/{slug}_{form_key}.mp3``
    Passive forms → ``data/audio_conj/{slug}_passive_{form_key}.mp3``

    Forms without an ``audio_url`` are ignored; files already on disk are
    counted as cached.

    Args:
        entries: List of words.json entry dicts.
        session: Shared requests session.

    Returns:
        Tuple of (downloaded, cached, failed) counts.
    """
    fetched = 0
    already_present = 0
    errors = 0

    for item in entries:
        conjugation = item.get("conjugation")
        if not conjugation:
            continue

        slug: str = item["slug"]

        # (filename infix, forms) for the active and the passive paradigm.
        paradigms: list[tuple[str, list]] = [
            ("", conjugation.get("active_forms") or []),
            ("passive_", conjugation.get("hufal_pual_forms") or []),
        ]

        for infix, forms in paradigms:
            for form in forms:
                source_url: str | None = form.get("audio_url")
                if not source_url:
                    continue

                form_key = _form_key(form.get("person", ""), form.get("tense", ""))
                target = AUDIO_CONJ_DIR / f"{slug}_{infix}{form_key}.mp3"

                if target.exists():
                    already_present += 1
                elif _download(source_url, target, session):
                    fetched += 1
                    time.sleep(DOWNLOAD_DELAY)
                else:
                    errors += 1

    return fetched, already_present, errors
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def main() -> None:
    """Run the audio download pipeline from the command line.

    Loads words.json, optionally truncates it to the first N entries
    (``--test N``), downloads vocab, noun-plural and conjugation audio, and
    logs a per-category summary. ``--skip-vocab`` and ``--skip-conj`` disable
    their categories; noun-plural audio is always processed.
    """
    parser = argparse.ArgumentParser(description="Download Pealim audio files from words.json URLs.")
    parser.add_argument("--skip-vocab", action="store_true", help="Skip vocabulary audio downloads.")
    parser.add_argument("--skip-conj", action="store_true", help="Skip conjugation audio downloads.")
    parser.add_argument(
        "--test",
        metavar="N",
        type=int,
        default=None,
        help="Limit processing to the first N words.json entries.",
    )
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(message)s")

    for directory in (AUDIO_DIR, AUDIO_CONJ_DIR):
        directory.mkdir(parents=True, exist_ok=True)

    with open(WORDS_JSON, encoding="utf-8") as fh:
        raw: dict[str, dict] = json.load(fh)

    entries = list(raw.values())
    if args.test is not None:
        entries = entries[: args.test]

    logger.info("[4] Downloading audio files …")

    session = requests.Session()
    session.headers["User-Agent"] = "Mozilla/5.0 (compatible; PealimAnkiDeck/1.0; audio-fetch)"

    # Run the three categories in order; a skipped category reports zeros.
    run_vocab = not args.skip_vocab
    run_conj = not args.skip_conj

    v_dl, v_cached, v_no_url = download_vocab_audio(entries, session) if run_vocab else (0, 0, 0)
    np_dl, np_cached = download_noun_plural_audio(entries, session)
    c_dl, c_cached, c_failed = download_conjugation_audio(entries, session) if run_conj else (0, 0, 0)

    # --- Summary ---
    if run_vocab:
        logger.info(" Vocab: %d downloaded, %d cached, %d no URL", v_dl, v_cached, v_no_url)
    logger.info(" Noun plural: %d downloaded, %d cached", np_dl, np_cached)
    if run_conj:
        # The failure count is only mentioned when something actually failed.
        failed_msg = f", {c_failed} failed" if c_failed else ""
        logger.info(" Conjugation: %d downloaded, %d cached%s", c_dl, c_cached, failed_msg)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
1593
pealim_detail_scrape.py
Normal file
1593
pealim_detail_scrape.py
Normal file
File diff suppressed because it is too large
Load diff
9106
pealim_dict.csv
9106
pealim_dict.csv
File diff suppressed because it is too large
Load diff
12111
pealim_dict_for_anki.csv
12111
pealim_dict_for_anki.csv
File diff suppressed because it is too large
Load diff
714
pealim_list_scrape.py
Normal file
714
pealim_list_scrape.py
Normal file
|
|
@ -0,0 +1,714 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Consolidated list page scraper for pealim.com.
|
||||
|
||||
Scrapes /dict/?page=N with two cookie variants (hebstyle=mo for nikkud,
|
||||
hebstyle=vl for ktiv male) and writes results directly to data/words.json.
|
||||
|
||||
Usage:
|
||||
python3 pealim_list_scrape.py [--test N] [--force-refresh]
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import re
|
||||
import time
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Paths
|
||||
# ---------------------------------------------------------------------------
|
||||
PROJECT_ROOT = Path(__file__).parent
DATA_DIR = PROJECT_ROOT / "data"
WORDS_JSON = DATA_DIR / "words.json"  # merged scrape output
PROGRESS_JSON = DATA_DIR / "list_scrape_progress.json"  # completed-page checkpoint

# ---------------------------------------------------------------------------
# Constants
# ---------------------------------------------------------------------------
PEALIM_DICT_URL = "https://www.pealim.com/dict/"
REQUEST_DELAY = 1.5  # seconds between requests
REQUEST_TIMEOUT = 15  # seconds
DEFAULT_TOTAL_PAGES = 608
SAVE_EVERY = 10  # pages between incremental saves
TODAY = date.today().isoformat()  # evaluated once, at import time

# Prefer lxml if available; html.parser is the fallback
try:
    import lxml  # type: ignore[import-untyped]  # noqa: F401
except ImportError:
    BS4_PARSER = "html.parser"
else:
    BS4_PARSER = "lxml"
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Part-of-speech mappings
|
||||
# ---------------------------------------------------------------------------
|
||||
# English part-of-speech label (as it appears on the list page) -> vocalised
# Hebrew label. "Numeral" and "Cardinal numeral" deliberately share a label.
POS_HEBREW: dict[str, str] = {
    "Noun": "שֵׁם עֶצֶם",
    "Verb": "פֹּעַל",
    "Adjective": "שֵׁם תֹּאַר",
    "Adverb": "תֹּאַר הַפֹּעַל",
    "Pronoun": "כִּנּוּי גּוּף",
    "Preposition": "מִילַּת יַחַס",
    "Conjunction": "מִילַּת חִבּוּר",
    "Interjection": "מִילַּת קְרִיאָה",
    "Numeral": "שֵׁם מִסְפָּר",
    "Cardinal numeral": "שֵׁם מִסְפָּר",
    "Particle": "מִילִּית",
    "Determiner": "מְגַדִּיר",
    "Existential": "מִילַּת קִיּוּם",
    "Interrogative": "מִילַּת שְׁאֵלָה",
}

# Prefix matching must try longer labels first. The sort is stable (also with
# reverse=True), so labels of equal length keep their declaration order.
POS_HEBREW_ORDERED: list[tuple[str, str]] = sorted(
    POS_HEBREW.items(),
    key=lambda pair: len(pair[0]),
    reverse=True,
)

# Verb pattern (binyan) name -> vocalised Hebrew name.
BINYAN_HEBREW: dict[str, str] = {
    "Pa'al": "פָּעַל",
    "Nif'al": "נִפְעַל",
    "Pi'el": "פִּיעֵל",
    "Pu'al": "פֻּעַל",
    "Hif'il": "הִפְעִיל",
    "Huf'al": "הֻפְעַל",
    "Hitpa'el": "הִתְפַּעֵל",
}
|
||||
|
||||
# Regex for extracting emoji characters
|
||||
# One or more consecutive emoji code points. The class is the de-duplicated
# union of the ranges previously listed (several overlapped or were repeated):
#   U+1F000-U+1FFFF  supplementary-plane symbols / pictographs
#   U+2600-U+27BF    miscellaneous symbols and dingbats
#   U+FE0E, U+FE0F   variation selectors
#   U+200D           zero-width joiner (keeps joined sequences in one match)
# re.UNICODE is the default for str patterns, so no flag is needed.
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27BF\uFE0E\uFE0F\u200D]+")

# Regex for extracting Hebrew prepositions wrapped in parentheses, e.g. "(על)" or "(ב-)"
HBPAREN_RE = re.compile(r"\(([\u05b0-\u05ea\u05f0-\u05f4\-]+)\)")
|
||||
|
||||
# Fields that must never be overwritten when updating an existing entry
|
||||
# Entry fields that a list-page re-scrape must leave untouched.
# NOTE(review): nothing in the code visible here consults this set; the update
# path in _merge_row simply never assigns these keys. Confirm before relying
# on it as an enforcement mechanism.
PROTECTED_FIELDS = frozenset(
    {
        # identity
        "vocab_legacy_guid",
        "confusables_guid",
        # frequency data
        "frequency",
        "pseudo_frequency",
        # illustrations
        "emoji",
        "emoji_source",
        "emoji_visible",
        "image",
        "image_source",
        # free-form and detail data
        "hint",
        "examples",
        "noun_inflection",
        "conjugation",
        "adjective_inflection",
        "preposition_inflection",
    }
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Logging
|
||||
# ---------------------------------------------------------------------------
|
||||
logging.basicConfig(
    format="%(asctime)s %(levelname)s %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# ---------------------------------------------------------------------------
# HTTP session
# ---------------------------------------------------------------------------
# One shared session for all page fetches; it identifies the scraper through
# its User-Agent header.
session = requests.Session()
session.headers["User-Agent"] = "Mozilla/5.0 (compatible; pealim-anki-scraper/1.0)"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Default entry template
|
||||
# ---------------------------------------------------------------------------
|
||||
def _default_entry() -> dict:
    """Build a brand-new entry with every known field set to a neutral default.

    A new dict (with new nested containers) is created on every call, so
    callers may mutate the result freely. Key order is the order in which the
    fields are serialised to words.json.
    """
    # Fields filled from the list pages.
    list_fields = {
        "word": {"nikkud": "", "ktiv_male": ""},
        "slug": "",
        "root": [],
        "pos": "",
        "pos_hebrew": "",
        "meaning": "",
        "meaning_raw": "",
        "audio_url": "",
        "audio_file": "",
        "tags": "",
        "last_scrape_date": "",
    }
    # Remaining fields, left at their defaults unless something else fills them.
    other_fields = {
        "vocab_legacy_guid": None,
        "frequency": None,
        "pseudo_frequency": None,
        "emoji": None,
        "emoji_source": None,
        "emoji_visible": False,
        "image": None,
        "image_source": None,
        "hint": "",
        "prep": None,
        "shared_roots": [],
        "confusable_group": None,
        "confusables_guid": None,
        "examples": None,
        "noun_inflection": None,
        "conjugation": None,
        "adjective_inflection": None,
        "preposition_inflection": None,
    }
    return {**list_fields, **other_fields}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Parsing helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
def _extract_emoji(text: str) -> str | None:
    """Return the first contiguous emoji run in *text*, or None if there is none."""
    found = EMOJI_RE.search(text)
    if found is None:
        return None
    return found.group(0)
|
||||
|
||||
|
||||
def _clean_meaning(raw: str) -> str:
    """Return *raw* without emoji or parenthesised Hebrew prepositions.

    Whitespace is normalised afterwards: runs collapse to a single space and
    leading/trailing whitespace is removed.
    """
    without_emoji = EMOJI_RE.sub("", raw)
    without_preps = HBPAREN_RE.sub("", without_emoji)
    return " ".join(without_preps.split())
|
||||
|
||||
|
||||
def _parse_pos(pos_raw: str) -> tuple[str, str]:
    """
    Split a raw part-of-speech cell into (English base label, Hebrew label).

    Examples:
        "Noun – masculine"   → ("Noun", "שֵׁם עֶצֶם")
        "Verb – pa'al"       → ("Verb", "פֹּעַל — פָּעַל")
        "Cardinal numeral"   → ("Cardinal numeral", "שֵׁם מִסְפָּר")

    Labels missing from POS_HEBREW are returned as-is in both positions.
    """
    label = pos_raw.strip()

    # Known labels are tried longest-first so that a longer label is never
    # shadowed by a shorter one that is also a prefix of the text.
    base = next((known for known, _ in POS_HEBREW_ORDERED if label.startswith(known)), "")
    if not base:
        # Unknown label: keep whatever precedes the first " – " / " - ".
        base = label.split(" – ")[0].split(" - ")[0].strip()

    hebrew = POS_HEBREW.get(base, base)
    if base != "Verb":
        return base, hebrew

    # Verbs carry the binyan after the dash, e.g. "Verb – pa'al".
    segments = re.split(r"\s*[–-]\s*", label)
    if len(segments) < 2:
        return base, hebrew

    # Case-insensitive lookup: the page writes "pa'al", the table "Pa'al".
    wanted = segments[1].strip().lower()
    binyan = next((heb for name, heb in BINYAN_HEBREW.items() if name.lower() == wanted), None)
    if binyan:
        hebrew = f"{hebrew} — {binyan}"
    return base, hebrew
|
||||
|
||||
|
||||
def _parse_root(root_raw: str) -> list[str]:
    """
    Split a raw root cell into its individual letters.

    Separators are hyphens/dashes or dots with optional surrounding
    whitespace ("פ - ע - ל", "פ.ע.ל"). An empty cell or a lone dash
    (the "no root" marker) yields an empty list.
    """
    if not root_raw:
        return []
    # A bare "-", "–" or "—" splits into empty strings only, which the filter
    # below drops, so the "no root" marker needs no special case.
    pieces = (piece.strip() for piece in re.split(r"\s*[-–—.]\s*", root_raw.strip()))
    return [piece for piece in pieces if piece]
|
||||
|
||||
|
||||
def _build_tags(pos_en: str, root: list[str]) -> str:
    """
    Build the space-separated Anki tag string for an entry.

    The root tag ("שורש::" followed by the joined root letters) comes first
    when a root is present, then the part-of-speech tag when *pos_en* is known.

    Examples:
        pos=Noun, root=[]                → "שם_עצם"
        pos=Noun, root=["א","ב"]         → "שורש::אב שם_עצם"
        pos=Verb, root=["ש","מ","ר"]     → "שורש::שמר פעלים"
    """
    pos_labels = {
        "Noun": "שם_עצם",
        "Verb": "פעלים",
        "Adjective": "שם_תואר",
        "Adverb": "תוארי_הפועל",
        "Pronoun": "כינויי_גוף",
        "Preposition": "מילות_יחס",
        "Conjunction": "מילות_חיבור",
        "Particle": "מילית",
        "Numeral": "שם_מספר",
        "Cardinal numeral": "שם_מספר",
        "Determiner": "מגדיר",
        "Existential": "מילת_קיום",
        "Interrogative": "מילת_שאלה",
        "Interjection": "מילת_קריאה",
    }

    root_tags = [f"שורש::{''.join(root)}"] if root else []
    pos_label = pos_labels.get(pos_en, "")
    pos_tags = [pos_label] if pos_label else []
    return " ".join(root_tags + pos_tags)
|
||||
|
||||
|
||||
def _compute_audio_file(slug: str, ktiv_male: str) -> str:
    """
    Return the provisional local audio filename for an entry.

    The name is "<ktiv_male>.mp3" when a ktiv male spelling is available and
    "<slug>.mp3" otherwise. This is only a placeholder: confusables can only be
    detected once every page has been scraped, and _post_process() then
    switches them to slug-based names.
    """
    stem = ktiv_male or slug
    return f"{stem}.mp3"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Page parsing
|
||||
# ---------------------------------------------------------------------------
|
||||
def _parse_mo_page(html: bytes) -> list[dict]:
    """
    Extract the word rows from a hebstyle=mo (nikkud) list page.

    Each returned dict has the keys
        nikkud, slug, root_raw, pos_raw, meaning_raw, audio_url
    with missing audio/slug represented by "". Rows with fewer than four
    <td> cells and rows whose word cell is empty are left out.
    """
    soup = BeautifulSoup(html, BS4_PARSER)
    parsed: list[dict] = []

    for tr in soup.select("table tr"):
        cells = tr.find_all("td")
        if len(cells) < 4:
            continue
        word_cell, root_cell, pos_cell, meaning_cell = cells[:4]

        # Audio URL lives in a data-audio attribute somewhere in the word cell.
        audio_holder = word_cell.find(attrs={"data-audio": True})
        audio_url: str = audio_holder["data-audio"] if audio_holder else ""

        # The slug is the path segment after /dict/ in the word's link.
        anchor = word_cell.find("a", href=True)
        slug_match = re.search(r"/dict/([^/]+)/", anchor["href"]) if anchor else None
        slug = slug_match.group(1) if slug_match else ""

        # Prefer the dedicated "menukad" span; fall back to the whole cell text.
        vocalised = word_cell.find("span", class_="menukad")
        nikkud = (vocalised if vocalised else word_cell).get_text(strip=True)
        if not nikkud:
            continue

        parsed.append(
            {
                "nikkud": nikkud,
                "slug": slug,
                "root_raw": root_cell.get_text(strip=True),
                "pos_raw": pos_cell.get_text(strip=True),
                "meaning_raw": meaning_cell.get_text(strip=True),
                "audio_url": audio_url,
            }
        )
    return parsed
|
||||
|
||||
|
||||
def _parse_vl_words(html: bytes) -> list[str]:
    """
    Extract the ktiv male spellings from a hebstyle=vl list page.

    Returns one string per table row that has at least four <td> cells, in
    page order. Unlike _parse_mo_page, rows with an empty word are kept (as
    ""), so the result is positional.
    """
    soup = BeautifulSoup(html, BS4_PARSER)
    spellings: list[str] = []
    for tr in soup.select("table tr"):
        cells = tr.find_all("td")
        if len(cells) < 4:
            continue
        word_cell = cells[0]
        # Prefer the dedicated "menukad" span; fall back to the whole cell text.
        span = word_cell.find("span", class_="menukad")
        spellings.append((span if span else word_cell).get_text(strip=True))
    return spellings
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# words.json I/O
|
||||
# ---------------------------------------------------------------------------
|
||||
def _load_words() -> dict:
    """Return the parsed contents of words.json, or {} when the file is absent."""
    if WORDS_JSON.exists():
        return json.loads(WORDS_JSON.read_text(encoding="utf-8"))
    logger.info("data/words.json not found — starting fresh.")
    return {}
|
||||
|
||||
|
||||
def _save_words(words: dict) -> None:
    """Atomically write *words* to words.json via a sibling .tmp file.

    The data directory is created first if it is missing: _load_words()
    supports starting without an existing words.json, and without this the
    first save of a fresh checkout would fail with FileNotFoundError.
    """
    WORDS_JSON.parent.mkdir(parents=True, exist_ok=True)
    tmp = WORDS_JSON.with_suffix(".json.tmp")
    with tmp.open("w", encoding="utf-8") as fh:
        # ensure_ascii=False keeps the Hebrew text readable in the file.
        json.dump(words, fh, ensure_ascii=False, indent=2)
    # os.replace swaps the file in one step, so readers never see a partial file.
    os.replace(tmp, WORDS_JSON)
    logger.info("Saved data/words.json (%d entries)", len(words))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Progress tracking
|
||||
# ---------------------------------------------------------------------------
|
||||
def _load_progress() -> set[int]:
    """Return the page numbers recorded as completed, or an empty set if no progress file exists."""
    if PROGRESS_JSON.exists():
        recorded = json.loads(PROGRESS_JSON.read_text(encoding="utf-8"))
        return set(recorded.get("completed_pages", []))
    return set()
|
||||
|
||||
|
||||
def _save_progress(completed: set[int]) -> None:
    """Atomically write the set of completed page numbers to the progress file.

    Pages are stored sorted under the "completed_pages" key. The data
    directory is created if missing so a first run on a fresh checkout does
    not fail with FileNotFoundError.
    """
    PROGRESS_JSON.parent.mkdir(parents=True, exist_ok=True)
    tmp = PROGRESS_JSON.with_suffix(".json.tmp")
    with tmp.open("w", encoding="utf-8") as fh:
        json.dump({"completed_pages": sorted(completed)}, fh)
    # Swap in one step so an interrupted run never leaves a truncated file.
    os.replace(tmp, PROGRESS_JSON)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Unique key generation
|
||||
# ---------------------------------------------------------------------------
|
||||
def _make_unique_key(nikkud: str, pos_en: str, meaning: str, existing_keys: set[str]) -> str:
    """
    Return the shortest key for a new entry that is not yet in *existing_keys*.

    Candidates are tried in this order:
      1. nikkud
      2. nikkud|pos_en
      3. nikkud|pos_en|meaning
      4. nikkud|pos_en|meaning|N    (N = 2, 3, …)

    *existing_keys* is only used for membership tests and is not modified.
    """
    qualified = f"{nikkud}|{pos_en}"
    fully_qualified = f"{qualified}|{meaning}"
    for candidate in (nikkud, qualified, fully_qualified):
        if candidate not in existing_keys:
            return candidate

    # Everything descriptive is taken: append the first free counter.
    counter = 2
    while f"{fully_qualified}|{counter}" in existing_keys:
        counter += 1
    return f"{fully_qualified}|{counter}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Core: merge one scraped row into words dict
|
||||
# ---------------------------------------------------------------------------
|
||||
def _merge_row(
    words: dict,
    slug_index: dict[str, str],
    nikkud: str,
    ktiv_male: str,
    slug: str,
    root_raw: str,
    pos_raw: str,
    meaning_raw_raw: str,
    audio_url: str,
) -> None:
    """
    Upsert a single scraped row into *words* in-place.

    An existing entry is located through *slug_index* (slug → unique_key). If
    one is found, only the list-level fields are refreshed; enrichment fields
    such as emoji, frequency or inflection tables are never written. Otherwise
    a new entry is created from _default_entry(), keyed by _make_unique_key(),
    and registered in *slug_index* when it has a slug.

    Args:
        words: The words.json mapping (unique_key → entry); mutated.
        slug_index: slug → unique_key lookup; updated when an entry is created.
        nikkud: Vocalised spelling from the hebstyle=mo page.
        ktiv_male: Spelling from the hebstyle=vl page ("" if unavailable).
        slug: pealim slug of the word ("" if the row had no link).
        root_raw: Raw text of the root cell.
        pos_raw: Raw text of the part-of-speech cell.
        meaning_raw_raw: Raw text of the meaning cell (may contain emoji and
            parenthesised Hebrew prepositions).
        audio_url: Audio URL from the row ("" if none).
    """
    pos_en, pos_heb = _parse_pos(pos_raw)
    root = _parse_root(root_raw)
    meaning = _clean_meaning(meaning_raw_raw)

    # Hebrew preposition(s) from the raw meaning, e.g. "(על)" → "על".
    prep_matches = HBPAREN_RE.findall(meaning_raw_raw)
    prep: str | None = " ".join(prep_matches) if prep_matches else None

    # List-level fields, in the order they are written to the entry.
    scraped = {
        "slug": slug,
        "root": root,
        "pos": pos_en,
        "pos_hebrew": pos_heb,
        "meaning": meaning,
        "meaning_raw": meaning_raw_raw,
        "prep": prep,
        "audio_url": audio_url,
        "audio_file": _compute_audio_file(slug, ktiv_male),
        "tags": _build_tags(pos_en, root),
        "last_scrape_date": TODAY,
    }

    # ---- locate existing entry ----
    unique_key: str | None = slug_index.get(slug) if slug else None

    if unique_key and unique_key in words:
        entry = words[unique_key]
    else:
        # words.keys() is a live set-like view: membership tests are O(1)
        # and no copy of all keys is built for every new row.
        unique_key = _make_unique_key(nikkud, pos_en, meaning, words.keys())
        entry = _default_entry()
        # The scraped emoji is only a seed value for brand-new entries.
        emoji = _extract_emoji(meaning_raw_raw)
        entry["emoji"] = emoji
        entry["emoji_source"] = "from_pealim" if emoji else None
        words[unique_key] = entry
        if slug:
            slug_index[slug] = unique_key

    # setdefault tolerates existing entries that lack the "word" sub-dict.
    spelling = entry.setdefault("word", {})
    spelling["nikkud"] = nikkud
    spelling["ktiv_male"] = ktiv_male
    entry.update(scraped)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Post-processing: recompute confusable_group, shared_roots, audio_file
|
||||
# ---------------------------------------------------------------------------
|
||||
def _post_process(words: dict) -> None:
    """
    Recompute the fields that depend on the whole vocabulary, in place.

    - confusable_group: sorted keys of all entries sharing a ktiv_male
      spelling when there are two or more of them; otherwise reset to None
      unless the entry carries a confusables_guid.
    - audio_file: "<slug>.mp3" for confusables that have a slug; for
      non-confusables "<ktiv_male>.mp3", falling back to "<slug>.mp3".
    - shared_roots: sorted keys of the other entries with the same root
      ([] for rootless entries).
    """
    logger.info("Post-processing: recomputing confusable groups and shared roots...")

    def spelling_of(entry: dict) -> str:
        return entry.get("word", {}).get("ktiv_male", "")

    # --- confusable groups ---
    keys_by_spelling: dict[str, list[str]] = {}
    for key, entry in words.items():
        ktiv = spelling_of(entry)
        if ktiv:
            keys_by_spelling.setdefault(ktiv, []).append(key)

    for entry in words.values():
        ktiv = spelling_of(entry)
        slug = entry.get("slug", "")
        homographs = keys_by_spelling.get(ktiv, [])

        if len(homographs) >= 2:
            entry["confusable_group"] = sorted(homographs)
            # Confusable → slug-based audio filename (left as-is without a slug)
            if slug:
                entry["audio_file"] = f"{slug}.mp3"
            continue

        # A group that came with a confusables_guid is left alone.
        if not entry.get("confusables_guid"):
            entry["confusable_group"] = None
        # Non-confusable → consonant-based audio filename
        entry["audio_file"] = f"{ktiv}.mp3" if ktiv else f"{slug}.mp3"

    # --- shared roots ---
    keys_by_root: dict[str, list[str]] = {}
    for key, entry in words.items():
        root = entry.get("root")
        if root:
            # "|"-joined letters act as the canonical grouping key.
            keys_by_root.setdefault("|".join(root), []).append(key)

    for key, entry in words.items():
        root = entry.get("root")
        if not root:
            entry["shared_roots"] = []
            continue
        relatives = keys_by_root.get("|".join(root), [])
        entry["shared_roots"] = sorted(other for other in relatives if other != key)

    logger.info("Post-processing complete.")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Scraping loop
|
||||
# ---------------------------------------------------------------------------
|
||||
def _build_slug_index(words: dict) -> dict[str, str]:
    """Map each non-empty slug in *words* to the key of the first entry that uses it."""
    lookup: dict[str, str] = {}
    for unique_key, entry in words.items():
        slug = entry.get("slug", "")
        if slug:
            # setdefault keeps the first key seen for a slug.
            lookup.setdefault(slug, unique_key)
    return lookup
|
||||
|
||||
|
||||
def _fetch_page(url: str, cookies: dict) -> bytes | None:
    """GET *url* with *cookies* through the shared session.

    Returns the raw response body, or None (after logging the error) when the
    request fails or the server answers with an error status.
    """
    try:
        response = session.get(url, cookies=cookies, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        body = response.content
    except requests.RequestException as exc:
        logger.error("Request failed for %s: %s", url, exc)
        return None
    return body
|
||||
|
||||
|
||||
def run_scrape(total_pages: int, force_refresh: bool) -> None:
    """
    Main scrape loop.

    Every pending page is fetched twice (hebstyle=mo for nikkud, audio and
    slug; hebstyle=vl for ktiv male); the two row lists are paired by position
    and merged into words.json. Words and progress are saved every SAVE_EVERY
    pages and once more at the end, after post-processing.

    Pages whose fetch fails are skipped without being marked complete, so a
    later run retries them.

    Args:
        total_pages: Number of list pages to scrape.
        force_refresh: If True, ignore progress file and re-scrape all pages.
    """
    words = _load_words()
    slug_index = _build_slug_index(words)

    if force_refresh:
        # Read the old progress only to report what is being discarded; an
        # unreadable progress file must not block a forced refresh.
        try:
            ignored = len(_load_progress())
        except (OSError, ValueError):
            ignored = 0
        if ignored:
            logger.info("--force-refresh: ignoring %d completed pages.", ignored)
        completed: set[int] = set()
    else:
        completed = _load_progress()

    pages_to_do = [p for p in range(1, total_pages + 1) if p not in completed]
    logger.info(
        "Pages to scrape: %d / %d (already done: %d)",
        len(pages_to_do),
        total_pages,
        len(completed),
    )

    pages_since_save = 0
    failed_pages: list[int] = []

    for page_num in pages_to_do:
        url = f"{PEALIM_DICT_URL}?page={page_num}"
        logger.info("Scraping page %d / %d …", page_num, total_pages)

        # --- hebstyle=mo (nikkud + audio + slug) ---
        mo_html = _fetch_page(url, {"translit": "none", "hebstyle": "mo"})
        if mo_html is None:
            logger.warning("Skipping page %d (mo fetch failed).", page_num)
            failed_pages.append(page_num)
            time.sleep(REQUEST_DELAY * 2)  # back off a little longer after an error
            continue

        time.sleep(REQUEST_DELAY)

        # --- hebstyle=vl (ktiv male) ---
        vl_html = _fetch_page(url, {"translit": "none", "hebstyle": "vl"})
        if vl_html is None:
            logger.warning("Skipping page %d (vl fetch failed).", page_num)
            failed_pages.append(page_num)
            time.sleep(REQUEST_DELAY * 2)
            continue

        # Parse
        mo_rows = _parse_mo_page(mo_html)
        vl_words = _parse_vl_words(vl_html)

        if not mo_rows:
            logger.warning("Page %d returned no rows — might be past end.", page_num)
            completed.add(page_num)
            _save_progress(completed)
            time.sleep(REQUEST_DELAY)
            continue

        # Rows are paired by position, so differing lengths mean at least some
        # ktiv male spellings will end up on the wrong word.
        if len(vl_words) != len(mo_rows):
            logger.warning(
                "Page %d: %d nikkud rows but %d ktiv male rows — spellings may be misaligned.",
                page_num,
                len(mo_rows),
                len(vl_words),
            )

        # Merge each row
        for i, row in enumerate(mo_rows):
            ktiv_male = vl_words[i] if i < len(vl_words) else ""
            _merge_row(
                words=words,
                slug_index=slug_index,
                nikkud=row["nikkud"],
                ktiv_male=ktiv_male,
                slug=row["slug"],
                root_raw=row["root_raw"],
                pos_raw=row["pos_raw"],
                meaning_raw_raw=row["meaning_raw"],
                audio_url=row["audio_url"],
            )

        completed.add(page_num)
        pages_since_save += 1

        # Incremental save every SAVE_EVERY pages
        if pages_since_save >= SAVE_EVERY:
            _save_words(words)
            _save_progress(completed)
            pages_since_save = 0

        time.sleep(REQUEST_DELAY)

    # Final save + post-processing
    if failed_pages:
        logger.warning(
            "%d page(s) could not be fetched and remain pending: %s",
            len(failed_pages),
            failed_pages,
        )
        logger.info("Running post-processing…")
    else:
        logger.info("All pages scraped. Running post-processing…")
    _post_process(words)
    _save_words(words)
    _save_progress(completed)
    logger.info("Done. Total entries in words.json: %d", len(words))
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CLI
|
||||
# ---------------------------------------------------------------------------
|
||||
def main() -> None:
    """Parse the command line and run the scrape."""
    cli = argparse.ArgumentParser(description="Scrape pealim.com list pages into data/words.json.")
    cli.add_argument(
        "--test",
        metavar="N",
        type=int,
        default=None,
        help="Scrape only the first N pages (for testing).",
    )
    cli.add_argument(
        "--force-refresh",
        action="store_true",
        default=False,
        help="Re-scrape all pages, ignoring existing progress.",
    )
    options = cli.parse_args()

    # --test N caps the run at the first N pages; otherwise scrape them all.
    page_count = DEFAULT_TOTAL_PAGES if options.test is None else options.test
    logger.info(
        "Starting pealim list scraper | pages=%d | force=%s | parser=%s",
        page_count,
        options.force_refresh,
        BS4_PARSER,
    )

    run_scrape(total_pages=page_count, force_refresh=options.force_refresh)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -25,6 +25,9 @@ dev = [
|
|||
|
||||
[tool.pytest.ini_options]
|
||||
testpaths = ["tests"]
|
||||
markers = [
|
||||
"integration: marks tests that hit the real pealim.com network (deselect with -m 'not integration')",
|
||||
]
|
||||
|
||||
[tool.ruff]
|
||||
target-version = "py311"
|
||||
|
|
|
|||
|
|
@ -1,183 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Rebuild vocab_sentence_matches.json using both direct word matching
|
||||
and ktiv male conjugated/declined form matching.
|
||||
|
||||
This dramatically improves sentence coverage by matching not just
|
||||
dictionary forms but all conjugated verbs and declined nouns.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
from helpers import strip_nikkud as _strip_nikkud
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DATA_DIR = Path(__file__).parent / "data"
|
||||
|
||||
|
||||
def main():
    """Rebuild vocab_sentence_matches.json from the sentence index and vocabulary.

    A sentence matches a vocabulary word when one of its tokens equals either
    the word's nikkud-stripped form or one of its forms listed in
    ktiv_male_forms.json. Only sentences of 4-15 tokens are considered and at
    most three are kept per word, preferring 6-12 tokens.
    """

    def score(sentence: dict) -> int:
        """Rank a candidate sentence: 0 for 6-12 words, else its distance from 9."""
        wc = sentence["word_count"]
        if 6 <= wc <= 12:
            return 0  # ideal
        return abs(wc - 9)  # distance from ideal

    # Load sentences (explicit UTF-8: the files hold Hebrew text and the
    # platform default encoding is not guaranteed to be UTF-8).
    with open(DATA_DIR / "epub_sentence_index.json", encoding="utf-8") as f:
        sentences = json.load(f).get("sentences", [])
    logger.info("Loaded %d sentences", len(sentences))

    # Load vocab CSV; try ";" first and fall back to "," when that yields too
    # few columns or cannot be parsed.
    csv_path = DATA_DIR / "hebrew_dict_for_anki.csv"
    try:
        df = pd.read_csv(csv_path, sep=";", index_col=0)
        if df.shape[1] < 3:
            raise ValueError
    except (ValueError, pd.errors.ParserError):
        df = pd.read_csv(csv_path, index_col=0)
    logger.info("Loaded %d vocab entries", len(df))

    # Build word lookup: stripped_form → (word_nikkud, word_no_nikkud)
    word_lookup: dict[str, list[tuple[str, str]]] = {}
    for _, row in df.iterrows():
        word = str(row.get("Word", "")).strip()
        wni = str(row.get("Word Without Nikkud", "")).strip()
        if not word or word in ("nan", "None"):
            continue
        stripped = _strip_nikkud(word)
        if stripped:
            word_lookup.setdefault(stripped, []).append((word, wni))

    # Load ktiv male forms: ktiv_male_form → [{word_nikkud, form_type, ...}]
    ktiv_path = DATA_DIR / "ktiv_male_forms.json"
    ktiv_forms: dict[str, list[dict]] = {}
    if ktiv_path.exists():
        with open(ktiv_path, encoding="utf-8") as f:
            ktiv_forms = json.load(f)
        logger.info("Loaded %d ktiv male forms", len(ktiv_forms))
    else:
        logger.warning("No ktiv_male_forms.json — only using direct matching")

    # Build reverse lookup: ktiv_male → set of dictionary words (nikkud)
    ktiv_to_word: dict[str, set[str]] = {}
    for ktiv, entries in ktiv_forms.items():
        for entry in entries:
            word_nikkud = entry.get("word_nikkud", "")
            if word_nikkud:
                ktiv_to_word.setdefault(ktiv, set()).add(word_nikkud)

    # Also add all vocab words' own stripped forms to ktiv_to_word
    for stripped, entries in word_lookup.items():
        for word_nikkud, _ in entries:
            ktiv_to_word.setdefault(stripped, set()).add(word_nikkud)

    logger.info("Total matchable forms: %d", len(ktiv_to_word))

    # Tokenize all sentences once
    sentence_tokens: list[tuple[dict, list[str]]] = []
    for s in sentences:
        stripped = s.get("stripped", _strip_nikkud(s.get("text", "")))
        tokens = [re.sub(r'[.,!?;:"\'\u05be]', "", t) for t in stripped.split()]
        tokens = [t for t in tokens if t]  # remove empty
        sentence_tokens.append((s, tokens))

    # Match: for each sentence token, check ktiv_to_word lookup
    matches: dict[str, list[dict]] = {}  # word_nikkud → [sentences]

    for sent, tokens in sentence_tokens:
        text = sent.get("text", "")
        book = sent.get("book", "")
        word_len = len(tokens)

        # Skip sentences that are too short or too long
        if word_len < 4 or word_len > 15:
            continue

        for tok in tokens:
            if tok in ktiv_to_word:
                for word_nikkud in ktiv_to_word[tok]:
                    matches.setdefault(word_nikkud, []).append(
                        {
                            "text": text,
                            "book": book,
                            "matched_form": tok,
                            "word_count": word_len,
                        }
                    )

    logger.info("Words with at least 1 match: %d", len(matches))

    # Deduplicate and limit to 3 best sentences per word
    output: dict[str, dict] = {}
    # The saved records drop "matched_form", so the forms behind each word's
    # chosen sentences are kept here for the statistics below.
    matched_forms: dict[str, set[str]] = {}
    for word_nikkud, sents in matches.items():
        # Deduplicate by text
        seen_texts = set()
        unique = []
        for s in sents:
            if s["text"] not in seen_texts:
                seen_texts.add(s["text"])
                unique.append(s)

        unique.sort(key=score)
        best = unique[:3]

        # Find the Word Without Nikkud for this word
        stripped = _strip_nikkud(word_nikkud)
        wni = stripped  # default
        if stripped in word_lookup:
            for wn, w_wni in word_lookup[stripped]:
                if wn == word_nikkud:
                    wni = w_wni
                    break

        # NOTE: words sharing the same unvocalised form share one key here, so
        # a later word replaces an earlier one.
        output[wni] = {
            "word_nikkud": word_nikkud,
            "sentences": [{"text": s["text"], "book": s["book"]} for s in best],
        }
        matched_forms[wni] = {s["matched_form"] for s in best}

    # Save
    out_path = DATA_DIR / "vocab_sentence_matches.json"
    with open(out_path, "w", encoding="utf-8") as f:
        json.dump(output, f, ensure_ascii=False, indent=1)

    total_sents = sum(len(v["sentences"]) for v in output.values())
    logger.info("Saved %d words with %d sentences → %s", len(output), total_sents, out_path)

    # Stats (guard against an empty vocabulary file)
    total_vocab = len(df)
    pct = len(output) * 100 / total_vocab if total_vocab else 0.0
    logger.info("Coverage: %d/%d (%.1f%%)", len(output), total_vocab, pct)

    # Breakdown by match type
    direct_only = 0
    ktiv_only = 0
    both = 0
    for wni, info in output.items():
        stripped = _strip_nikkud(info["word_nikkud"])
        has_direct = stripped in word_lookup
        # A ktiv male match is a chosen sentence whose matching token differs
        # from the word's own stripped dictionary form.
        has_ktiv = any(form != stripped for form in matched_forms[wni])
        if has_direct and has_ktiv:
            both += 1
        elif has_ktiv:
            ktiv_only += 1
        else:
            direct_only += 1

    logger.info(" Direct matches only: %d", direct_only)
    logger.info(" Ktiv male matches only: %d", ktiv_only)
    logger.info(" Both: %d", both)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
208
release.py
Normal file
208
release.py
Normal file
|
|
@ -0,0 +1,208 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Create a Forgejo release and upload all .apkg deck variants.
|
||||
|
||||
Usage:
|
||||
python3 release.py # uses RELEASE_TAG from apkg_builder.py
|
||||
python3 release.py v0.14 # explicit tag
|
||||
python3 release.py --dry-run # show what would be uploaded without doing it
|
||||
python3 release.py --validate # run validate_apkg.py first, abort on failure
|
||||
|
||||
Requires:
|
||||
A "git.nevo.engineer" entry readable through load_keeshare (its password is used as the Forgejo API token).
|
||||
Git tag must not already exist (creates tag + release).
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
sys.path.insert(0, "/home/node/projects")
|
||||
import load_keeshare
|
||||
|
||||
# Base URL of this repository's Forgejo REST API; api() appends endpoints to it.
REPO_API = "https://git.nevo.engineer/api/v1/repos/nevo/hebrew_flash_cards"
# API token, looked up at import time: importing this module fails if the
# load_keeshare lookup does.
FORGEJO_TOKEN: str = load_keeshare.get_entry("git.nevo.engineer")["password"]
# Directory next to this script.
# NOTE(review): presumably where the built .apkg files are read from; confirm in the upload code.
OUTPUT_DIR = Path(__file__).parent / "output"

# All deck variants to include in release
# NOTE(review): DECK_PREFIX is not referenced in the part of the file visible here.
DECK_PREFIX = "hebrew_"
DECK_VARIANTS = [
    "hebrew_vocabulary.apkg",
    "hebrew_vocabulary_audio.apkg",
    "hebrew_vocabulary_images.apkg",
    "hebrew_vocabulary_audio_images.apkg",
    "hebrew_conjugations.apkg",
    "hebrew_conjugations_audio.apkg",
    "hebrew_confusables.apkg",
    "hebrew_confusables_audio.apkg",
    "hebrew_plurals.apkg",
    "hebrew_plurals_audio.apkg",
    "hebrew_complete.apkg",
    "hebrew_complete_audio.apkg",
]
|
||||
|
||||
|
||||
def get_release_tag() -> str:
    """Return RELEASE_TAG as defined in apkg_builder.

    The script's own directory is put at the front of sys.path first so the
    import works regardless of the current working directory.
    """
    script_dir = Path(__file__).parent
    sys.path.insert(0, str(script_dir))
    import apkg_builder

    return apkg_builder.RELEASE_TAG
|
||||
|
||||
|
||||
def api(method: str, endpoint: str, **kwargs) -> requests.Response:
    """Send an authenticated request to the repository API.

    Args:
        method: HTTP verb, e.g. ``"GET"`` or ``"POST"``.
        endpoint: Path appended verbatim to ``REPO_API`` (should start with ``/``).
        **kwargs: Forwarded to ``requests.request``. ``headers`` are merged
            with the ``Authorization`` header (which always wins) and
            ``timeout`` defaults to 30 seconds when not supplied.

    Returns:
        The response object, after ``raise_for_status()`` has succeeded.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
    """
    url = f"{REPO_API}{endpoint}"
    # Merge instead of passing a second ``headers=`` keyword, which would
    # raise TypeError if a caller ever supplied its own headers.
    headers = {**(kwargs.pop("headers", None) or {}), "Authorization": f"token {FORGEJO_TOKEN}"}
    kwargs.setdefault("timeout", 30)
    resp = requests.request(method, url, headers=headers, **kwargs)
    resp.raise_for_status()
    return resp
|
||||
|
||||
|
||||
def tag_exists(tag: str) -> bool:
    """Return True if the repository API knows ``tag``, False if it answers 404.

    Any other HTTP failure (authentication, server error, ...) is re-raised
    instead of being reported as "tag does not exist", because the caller
    reacts to False by creating and pushing the tag.

    Raises:
        requests.HTTPError: For any non-404 error status.
    """
    try:
        api("GET", f"/tags/{tag}")
    except requests.HTTPError as exc:
        if getattr(exc.response, "status_code", None) == 404:
            return False
        raise
    return True
|
||||
|
||||
|
||||
def release_exists(tag: str) -> dict | None:
    """Return the release JSON for ``tag``, or None if the API answers 404.

    Any other HTTP failure is re-raised rather than being mistaken for
    "no such release", which would make the caller try to create a
    duplicate release.

    Raises:
        requests.HTTPError: For any non-404 error status.
    """
    try:
        resp = api("GET", f"/releases/tags/{tag}")
    except requests.HTTPError as exc:
        if getattr(exc.response, "status_code", None) == 404:
            return None
        raise
    return resp.json()
|
||||
|
||||
|
||||
def create_git_tag(tag: str) -> None:
    """Create ``tag`` locally and push it to the ``origin`` remote.

    Both git commands run in the directory containing this script so the
    tag lands in this repository even when the script is started from
    another working directory.

    Raises:
        subprocess.CalledProcessError: If either git command exits non-zero.
    """
    repo_dir = Path(__file__).parent
    subprocess.run(["git", "tag", tag], check=True, cwd=repo_dir)
    subprocess.run(["git", "push", "origin", tag], check=True, cwd=repo_dir)
    print(f" Created and pushed tag: {tag}")
|
||||
|
||||
|
||||
def create_release(tag: str, assets: list[Path]) -> int:
    """Create a published (non-draft, non-prerelease) release for ``tag``.

    The release body is a Markdown table listing every asset file name with
    its size in MB, sorted by path.

    Returns:
        The ``id`` field of the release returned by the API.
    """
    header = ["## Deck Variants\n", "| File | Size |", "|------|------|"]
    rows = [f"| {deck.name} | {deck.stat().st_size / 1_048_576:.1f} MB |" for deck in sorted(assets)]

    payload = {
        "tag_name": tag,
        "name": f"{tag} — Hebrew Flash Cards",
        "body": "\n".join(header + rows),
        "draft": False,
        "prerelease": False,
    }
    release_id = api("POST", "/releases", json=payload).json()["id"]
    print(f" Created release: {tag} (ID {release_id})")
    return release_id
|
||||
|
||||
|
||||
def delete_release_assets(release_id: int) -> int:
    """Remove every asset attached to the given release.

    Lists the release's assets, issues one DELETE per asset ``id``, and
    returns how many assets were listed.
    """
    attached = api("GET", f"/releases/{release_id}/assets").json()
    for entry in attached:
        asset_id = entry["id"]
        api("DELETE", f"/releases/{release_id}/assets/{asset_id}")
    return len(attached)
|
||||
|
||||
|
||||
def upload_assets(release_id: int, assets: list[Path]) -> None:
    """Upload each file in ``assets`` (sorted by path) to the release.

    Prints a progress line per file. The asset name is sent through
    ``params`` so it is URL-encoded by ``requests`` instead of being spliced
    into the query string verbatim.

    Raises:
        requests.HTTPError: If any upload is rejected by the server.
    """
    for p in sorted(assets):
        size_mb = p.stat().st_size / 1_048_576
        print(f" Uploading {p.name} ({size_mb:.1f} MB) ... ", end="", flush=True)
        with open(p, "rb") as f:
            api(
                "POST",
                f"/releases/{release_id}/assets",
                params={"name": p.name},
                files={"attachment": (p.name, f, "application/octet-stream")},
            )
        print("ok")
|
||||
|
||||
|
||||
def validate_decks() -> bool:
    """Run ``validate_apkg.py`` and return True if it exits with status 0.

    The validator is located next to this script and executed with that
    directory as its working directory, so the check does not depend on
    where the release script was launched from. The validator's stdout is
    always echoed; its stderr is echoed only on failure.
    """
    script_dir = Path(__file__).parent
    result = subprocess.run(
        [sys.executable, str(script_dir / "validate_apkg.py")],
        capture_output=True,
        text=True,
        cwd=script_dir,
    )
    print(result.stdout)
    if result.returncode != 0:
        print(result.stderr)
    return result.returncode == 0
|
||||
|
||||
|
||||
def main():
    """Command-line entry point: check the deck files, then publish them.

    Flow: resolve the tag, make sure every file in ``DECK_VARIANTS`` exists
    under ``OUTPUT_DIR``, optionally validate, stop early on ``--dry-run``,
    then either reuse an existing release (``--force``) or create the tag and
    release, and finally upload all assets. Exits with status 1 on missing
    files, failed validation, or an existing release without ``--force``.
    """
    cli = argparse.ArgumentParser(description="Create Forgejo release with deck assets")
    cli.add_argument("tag", nargs="?", help="Release tag (default: from apkg_builder.RELEASE_TAG)")
    cli.add_argument("--dry-run", action="store_true", help="Show what would be done without doing it")
    cli.add_argument("--validate", action="store_true", help="Run validate_apkg.py before releasing")
    cli.add_argument("--force", action="store_true", help="Re-upload assets if release already exists")
    args = cli.parse_args()

    tag = args.tag if args.tag else get_release_tag()
    print(f"Release tag: {tag}")

    # Every expected deck must already be built before anything is published.
    assets = [OUTPUT_DIR / filename for filename in DECK_VARIANTS]
    absent = [deck for deck in assets if not deck.exists()]
    if absent:
        print("\nERROR: Missing deck files:")
        for deck in absent:
            print(f" {deck}")
        print("\nRun the build pipeline first: python3 run.py --skip-scrape")
        sys.exit(1)

    print(f"Assets: {len(assets)} deck files")
    total_bytes = sum(deck.stat().st_size for deck in assets)
    total_mb = total_bytes / 1_048_576
    print(f"Total size: {total_mb:.0f} MB")

    if args.validate:
        print("\nValidating decks ...")
        passed = validate_decks()
        if not passed:
            print("ERROR: Validation failed. Aborting release.")
            sys.exit(1)
        print("Validation passed.\n")

    if args.dry_run:
        print("\n[DRY RUN] Would upload:")
        for deck in sorted(assets):
            print(f" {deck.name} ({deck.stat().st_size / 1_048_576:.1f} MB)")
        print(f"\n[DRY RUN] Tag: {tag}")
        return

    found = release_exists(tag)
    if found:
        if not args.force:
            print(f"\nRelease {tag} already exists (ID {found['id']}).")
            print("Use --force to delete existing assets and re-upload.")
            sys.exit(1)
        # --force: keep the release, replace its attachments.
        release_id = found["id"]
        deleted = delete_release_assets(release_id)
        print(f" Deleted {deleted} existing assets from release {tag}")
    else:
        # Fresh release: make sure the tag exists first.
        if not tag_exists(tag):
            create_git_tag(tag)
        release_id = create_release(tag, assets)

    print(f"\nUploading {len(assets)} files ...")
    upload_assets(release_id, assets)

    print(f"\nDone. Release: https://git.nevo.engineer/nevo/hebrew_flash_cards/releases/tag/{tag}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
501
run.py
501
run.py
|
|
@ -7,13 +7,23 @@ Usage:
|
|||
|
||||
Options:
|
||||
--only {vocab,conjugations,confusables,plurals,complete} Run only one deck
|
||||
--skip-scrape Use existing data/pealim_dict.csv (no pealim.com dict scraping)
|
||||
Pipeline steps:
|
||||
1. List scrape — scrape pealim.com list pages → words.json (captures slugs)
|
||||
2. Detail scrape — scrape noun/verb detail pages using slugs → words.json
|
||||
3. Frequency — load/download word frequency data
|
||||
4. Examples — extract example sentences from Hebrew EPUBs
|
||||
5. Audio download — download audio mp3 files
|
||||
6. Fonts — download Heebo font files
|
||||
7. Images — fetch noun images from Wikipedia
|
||||
8. Build — build all .apkg deck variants
|
||||
|
||||
Options:
|
||||
--skip-scrape Skip list page scraping (use existing words.json)
|
||||
--skip-detail Skip detail page scraping
|
||||
--skip-audio Skip audio .mp3 downloads
|
||||
--skip-examples Skip Ben Yehuda example fetching
|
||||
--skip-conjugations Skip verb conjugation extraction
|
||||
--skip-examples Skip EPUB example extraction
|
||||
--skip-images Skip image fetching for concrete nouns
|
||||
--refresh-examples Force rebuild of Ben Yehuda index (delete old, download nikkud corpus)
|
||||
--test N Process only the first N dictionary words (for quick testing)
|
||||
--test N Limit to first N words/pages
|
||||
"""
|
||||
|
||||
import argparse
|
||||
|
|
@ -21,11 +31,8 @@ import json
|
|||
import logging
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
from helpers import strip_nikkud
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent))
|
||||
|
||||
logging.basicConfig(
|
||||
|
|
@ -39,6 +46,7 @@ OUTPUT_DIR = Path(__file__).parent / "output"
|
|||
AUDIO_DIR = DATA_DIR / "audio"
|
||||
AUDIO_CONJ_DIR = DATA_DIR / "audio_conj"
|
||||
FONTS_DIR = DATA_DIR / "fonts"
|
||||
WORDS_JSON = DATA_DIR / "words.json"
|
||||
|
||||
|
||||
def parse_args():
|
||||
|
|
@ -48,282 +56,117 @@ def parse_args():
|
|||
choices=["vocab", "conjugations", "confusables", "plurals", "complete"],
|
||||
help="Run only one deck (skips all unrelated steps)",
|
||||
)
|
||||
p.add_argument("--skip-scrape", action="store_true", help="Skip dict scraping; use cached CSV")
|
||||
p.add_argument("--skip-scrape", action="store_true", help="Skip list page scraping")
|
||||
p.add_argument("--skip-detail", action="store_true", help="Skip detail page scraping")
|
||||
p.add_argument("--skip-audio", action="store_true", help="Skip audio downloads")
|
||||
p.add_argument("--skip-examples", action="store_true", help="Skip Ben Yehuda example lookup")
|
||||
p.add_argument(
|
||||
"--skip-conjugations",
|
||||
action="store_true",
|
||||
help="Skip verb conjugation extraction (deprecated: use --only vocab)",
|
||||
)
|
||||
p.add_argument("--skip-examples", action="store_true", help="Skip EPUB example extraction")
|
||||
p.add_argument("--skip-images", action="store_true", help="Skip image fetching")
|
||||
p.add_argument("--refresh-examples", action="store_true", help="Force rebuild of Ben Yehuda index")
|
||||
p.add_argument("--test", type=int, metavar="N", help="Limit to first N words")
|
||||
return p.parse_args()
|
||||
|
||||
|
||||
def step_scrape(args):
|
||||
"""Step 1 — scrape or load dictionary."""
|
||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
||||
anki_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||||
# Legacy fallback names
|
||||
legacy_dict = DATA_DIR / "pealim_dict.csv"
|
||||
def step_list_scrape(args):
|
||||
"""Step 1 — scrape pealim.com list pages → words.json."""
|
||||
if args.skip_scrape:
|
||||
if dict_csv.exists():
|
||||
logger.info(f"[1] Using existing {dict_csv}")
|
||||
elif legacy_dict.exists():
|
||||
logger.info(f"[1] Using legacy {legacy_dict} (consider renaming)")
|
||||
if WORDS_JSON.exists():
|
||||
logger.info("[1] Using existing words.json (--skip-scrape)")
|
||||
else:
|
||||
logger.error(f"[1] --skip-scrape set but {dict_csv} not found. Aborting.")
|
||||
logger.error(f"[1] --skip-scrape set but {WORDS_JSON} not found. Aborting.")
|
||||
sys.exit(1)
|
||||
return
|
||||
|
||||
logger.info("[1] Scraping dictionary from pealim.com …")
|
||||
logger.info("[1] Scraping dictionary list pages from pealim.com …")
|
||||
import pealim_list_scrape
|
||||
|
||||
import hebrew_extract
|
||||
|
||||
df = hebrew_extract.extract_from_website()
|
||||
df.to_csv(dict_csv, index=True)
|
||||
logger.info(f" Saved {len(df)} words → {dict_csv}")
|
||||
|
||||
df = hebrew_extract.modify_for_anki(df)
|
||||
df.to_csv(anki_csv, sep=";", index=True)
|
||||
logger.info(f" Saved Anki CSV → {anki_csv}")
|
||||
total_pages = args.test if args.test else None
|
||||
pealim_list_scrape.run_scrape(total_pages=total_pages, force_refresh=False)
|
||||
|
||||
|
||||
def step_frequency() -> dict[str, int]:
|
||||
"""Step 2 — load/download word frequency data."""
|
||||
logger.info("[2] Loading word frequency data …")
|
||||
"""Step 3 — load/download word frequency data."""
|
||||
logger.info("[3] Loading word frequency data …")
|
||||
import frequency_lookup
|
||||
|
||||
frequency_lookup.load()
|
||||
return frequency_lookup._freq
|
||||
|
||||
|
||||
def step_examples(args, freq_cache: dict):
|
||||
"""Step 3 — load/build Ben Yehuda example index."""
|
||||
def step_examples(args) -> dict:
|
||||
"""Step 4 — extract example sentences from Hebrew EPUBs."""
|
||||
if args.skip_examples:
|
||||
logger.info("[3] Skipping examples (--skip-examples)")
|
||||
examples_path = DATA_DIR / "examples_cache.json"
|
||||
if examples_path.exists():
|
||||
with open(examples_path) as f:
|
||||
return json.load(f)
|
||||
logger.info("[4] Skipping examples (--skip-examples)")
|
||||
return {}
|
||||
|
||||
logger.info("[3] Loading Ben Yehuda example index …")
|
||||
import benyehuda
|
||||
logger.info("[4] Extracting EPUB example sentences …")
|
||||
import epub_examples
|
||||
|
||||
benyehuda.load(force_rebuild=args.refresh_examples)
|
||||
if not WORDS_JSON.exists():
|
||||
logger.warning("[4] words.json not found, skipping examples")
|
||||
return {}
|
||||
|
||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict.csv"
|
||||
with open(WORDS_JSON, encoding="utf-8") as f:
|
||||
words = json.load(f)
|
||||
|
||||
try:
|
||||
import pandas as pd
|
||||
stats = epub_examples.run(words)
|
||||
|
||||
try:
|
||||
df = pd.read_csv(dict_csv, sep=";", index_col=0)
|
||||
if df.shape[1] < 3:
|
||||
raise ValueError("too few columns")
|
||||
except (ValueError, pd.errors.ParserError):
|
||||
df = pd.read_csv(dict_csv, index_col=0)
|
||||
# Save updated words.json
|
||||
with open(WORDS_JSON, "w", encoding="utf-8") as f:
|
||||
json.dump(words, f, ensure_ascii=False, indent=2)
|
||||
|
||||
if args.test:
|
||||
df = df.head(args.test)
|
||||
|
||||
logger.info(f" Pre-fetching examples for {len(df)} words …")
|
||||
for _, row in df.iterrows():
|
||||
# Use nikkud word form as primary key (nikkud corpus)
|
||||
word_nikkud = str(row.get("Word", "")).strip()
|
||||
if word_nikkud:
|
||||
benyehuda.get_examples(word_nikkud)
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f" Could not pre-fetch all examples: {e}")
|
||||
|
||||
benyehuda.save_examples_cache()
|
||||
return benyehuda._examples_cache
|
||||
logger.info(f" Coverage: {stats['matched']}/{stats['total_vocab']}")
|
||||
return stats
|
||||
|
||||
|
||||
def step_audio(args):
|
||||
"""Step 4 — download vocabulary audio .mp3 files from audio_url column in CSV."""
|
||||
if args.skip_audio:
|
||||
logger.info("[4] Skipping audio (--skip-audio)")
|
||||
def step_detail_scrape(args):
|
||||
"""Step 2 — scrape detail pages for nouns and verbs → update words.json."""
|
||||
if args.skip_detail:
|
||||
logger.info("[2] Skipping detail scrape (--skip-detail)")
|
||||
return
|
||||
|
||||
logger.info("[4] Downloading vocabulary audio files …")
|
||||
logger.info("[2] Scraping detail pages from pealim.com …")
|
||||
import pealim_detail_scrape
|
||||
|
||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict.csv"
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
|
||||
try:
|
||||
try:
|
||||
df = pd.read_csv(dict_csv, sep=";", index_col=0)
|
||||
if df.shape[1] < 3:
|
||||
raise ValueError("too few columns")
|
||||
except (ValueError, pd.errors.ParserError):
|
||||
df = pd.read_csv(dict_csv, index_col=0)
|
||||
|
||||
if "audio_url" not in df.columns:
|
||||
logger.warning(" No audio_url column in CSV — re-scrape with hebrew_extract.py to capture audio URLs")
|
||||
return
|
||||
|
||||
if args.test:
|
||||
df = df.head(args.test)
|
||||
|
||||
AUDIO_DIR.mkdir(parents=True, exist_ok=True)
|
||||
downloaded = 0
|
||||
skipped = 0
|
||||
no_url = 0
|
||||
|
||||
for _, row in df.iterrows():
|
||||
word = str(row.get("Word", "")).strip()
|
||||
word_plain = str(row.get("Word Without Nikkud", "")).strip()
|
||||
audio_url = str(row.get("audio_url", "")).strip()
|
||||
|
||||
if not word:
|
||||
continue
|
||||
|
||||
safe_name = re.sub(r"[^\u05d0-\u05ea]", "", strip_nikkud(word_plain or word))
|
||||
if not safe_name:
|
||||
continue
|
||||
mp3_path = AUDIO_DIR / f"{safe_name}.mp3"
|
||||
|
||||
if mp3_path.exists():
|
||||
skipped += 1
|
||||
continue
|
||||
|
||||
if not audio_url or audio_url in ("nan", "None", ""):
|
||||
no_url += 1
|
||||
continue
|
||||
|
||||
try:
|
||||
resp = requests.get(audio_url, timeout=10)
|
||||
resp.raise_for_status()
|
||||
mp3_path.write_bytes(resp.content)
|
||||
downloaded += 1
|
||||
time.sleep(0.3)
|
||||
except Exception as e:
|
||||
logger.debug(f" Audio download failed for {word}: {e}")
|
||||
|
||||
logger.info(f" Audio: {downloaded} downloaded, {skipped} already cached, {no_url} without URL")
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f" Audio step failed: {e}")
|
||||
test_limit = args.test if args.test else None
|
||||
pealim_detail_scrape.run(test=test_limit, force_refresh=False)
|
||||
|
||||
|
||||
def step_conj_audio(args, conjugations: dict):
|
||||
"""Step 4b — download conjugation audio .mp3 files."""
|
||||
def step_audio_download(args):
|
||||
"""Step 5 — download audio .mp3 files from URLs in words.json."""
|
||||
if args.skip_audio:
|
||||
logger.info("[4b] Skipping conjugation audio (--skip-audio)")
|
||||
logger.info("[5] Skipping audio (--skip-audio)")
|
||||
return
|
||||
|
||||
logger.info("[4b] Downloading conjugation audio files …")
|
||||
AUDIO_CONJ_DIR.mkdir(parents=True, exist_ok=True)
|
||||
logger.info("[5] Downloading audio files …")
|
||||
|
||||
import requests
|
||||
import pealim_audio_download
|
||||
|
||||
downloaded = 0
|
||||
skipped = 0
|
||||
failed = 0
|
||||
|
||||
for _infinitive, data in conjugations.items():
|
||||
if not data or not data.get("forms"):
|
||||
continue
|
||||
|
||||
slug = data.get("slug", "")
|
||||
if not slug:
|
||||
continue
|
||||
|
||||
# Active forms
|
||||
for form_key, form_data in data["forms"].items():
|
||||
audio_url = form_data.get("audio_url", "")
|
||||
if not audio_url:
|
||||
continue
|
||||
filename = f"{slug}_{form_key}.mp3"
|
||||
mp3_path = AUDIO_CONJ_DIR / filename
|
||||
if mp3_path.exists():
|
||||
skipped += 1
|
||||
continue
|
||||
try:
|
||||
resp = requests.get(audio_url, timeout=10)
|
||||
resp.raise_for_status()
|
||||
mp3_path.write_bytes(resp.content)
|
||||
downloaded += 1
|
||||
time.sleep(0.2)
|
||||
except Exception as e:
|
||||
logger.debug(f" Conj audio failed {filename}: {e}")
|
||||
failed += 1
|
||||
|
||||
# Passive partner forms
|
||||
passive = data.get("passive_partner")
|
||||
if passive and passive.get("forms"):
|
||||
for form_key, form_data in passive["forms"].items():
|
||||
audio_url = form_data.get("audio_url", "")
|
||||
if not audio_url:
|
||||
continue
|
||||
filename = f"{slug}_passive_{form_key}.mp3"
|
||||
mp3_path = AUDIO_CONJ_DIR / filename
|
||||
if mp3_path.exists():
|
||||
skipped += 1
|
||||
continue
|
||||
try:
|
||||
resp = requests.get(audio_url, timeout=10)
|
||||
resp.raise_for_status()
|
||||
mp3_path.write_bytes(resp.content)
|
||||
downloaded += 1
|
||||
time.sleep(0.2)
|
||||
except Exception as e:
|
||||
logger.debug(f" Conj audio failed {filename}: {e}")
|
||||
failed += 1
|
||||
|
||||
logger.info(f" Conjugation audio: {downloaded} downloaded, {skipped} cached, {failed} failed")
|
||||
test_limit = args.test if args.test else None
|
||||
pealim_audio_download.run(test=test_limit)
|
||||
|
||||
|
||||
def step_fonts(args):
|
||||
"""Step 4c — download Heebo font files (one-time, cached)."""
|
||||
def step_fonts(_args: argparse.Namespace):
|
||||
"""Step 6 — download Heebo font files (one-time, cached)."""
|
||||
FONTS_DIR.mkdir(parents=True, exist_ok=True)
|
||||
regular = FONTS_DIR / "_Heebo-Regular.ttf"
|
||||
bold = FONTS_DIR / "_Heebo-Bold.ttf"
|
||||
|
||||
if regular.exists() and bold.exists():
|
||||
logger.info("[4c] Heebo fonts already cached")
|
||||
logger.info("[6] Heebo fonts already cached")
|
||||
return
|
||||
|
||||
logger.info("[4c] Downloading Heebo fonts from Google Fonts …")
|
||||
logger.info("[6] Downloading Heebo fonts from Google Fonts …")
|
||||
|
||||
# Fetch CSS to get actual TTF source URLs (static subset for Hebrew + Latin)
|
||||
import requests as _req
|
||||
|
||||
headers = {
|
||||
# Request TTF (not woff2) so Anki can embed them
|
||||
"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) Gecko/20100101 Firefox/120.0"
|
||||
}
|
||||
headers = {"User-Agent": "Mozilla/5.0 (X11; Linux x86_64) Gecko/20100101 Firefox/120.0"}
|
||||
css_url = "https://fonts.googleapis.com/css2?family=Heebo:wght@400;700"
|
||||
try:
|
||||
css_resp = _req.get(css_url, headers=headers, timeout=15)
|
||||
css_resp.raise_for_status()
|
||||
css_text = css_resp.text
|
||||
|
||||
# Find all src: url(...) references (may be woff2 for modern UA)
|
||||
font_urls = re.findall(r"src:\s*url\(([^)]+)\)", css_text)
|
||||
logger.debug(f" Found {len(font_urls)} font URL(s) in CSS")
|
||||
|
||||
# Prefer TTF; if only woff2 available, download first two and note
|
||||
downloaded = []
|
||||
for i, fu in enumerate(font_urls[:2]):
|
||||
fu = fu.strip("'\"")
|
||||
dest = regular if i == 0 else bold
|
||||
|
|
@ -332,142 +175,74 @@ def step_fonts(args):
|
|||
fr = _req.get(fu, timeout=15)
|
||||
fr.raise_for_status()
|
||||
dest.write_bytes(fr.content)
|
||||
downloaded.append(dest.name)
|
||||
logger.info(f" Downloaded → {dest.name}")
|
||||
|
||||
if not downloaded:
|
||||
logger.info(" All font files already present")
|
||||
|
||||
except Exception as e:
|
||||
logger.warning(f" Heebo download failed: {e}")
|
||||
logger.warning(" Cards will fall back to Arial Hebrew / David.")
|
||||
logger.warning(
|
||||
" To install manually: download Heebo-Regular.ttf and Heebo-Bold.ttf "
|
||||
"from https://fonts.google.com/specimen/Heebo and rename with _ prefix "
|
||||
f"into {FONTS_DIR}"
|
||||
)
|
||||
|
||||
|
||||
def step_images(args) -> dict:
|
||||
"""Step 4d — fetch images for concrete nouns (resume-safe)."""
|
||||
"""Step 7 — fetch images for concrete nouns (resume-safe)."""
|
||||
if args.skip_images:
|
||||
logger.info("[4d] Skipping images (--skip-images)")
|
||||
logger.info("[7] Skipping images (--skip-images)")
|
||||
cache_path = DATA_DIR / "image_cache.json"
|
||||
if cache_path.exists():
|
||||
with open(cache_path) as f:
|
||||
return json.load(f)
|
||||
return {}
|
||||
|
||||
limit = args.test # When in test mode, limit images too
|
||||
logger.info("[4d] Fetching images for concrete nouns …")
|
||||
limit = args.test
|
||||
logger.info("[7] Fetching images for concrete nouns …")
|
||||
import image_fetch
|
||||
|
||||
return image_fetch.run(limit=limit)
|
||||
|
||||
|
||||
def step_build_all(
|
||||
args, examples_cache: dict, freq_cache: dict, conjugations: dict | None, image_cache: dict | None = None
|
||||
):
|
||||
"""Step 5 — build all 6 release variants (4 vocab + 2 conj)."""
|
||||
logger.info("[5] Building all deck variants …")
|
||||
def step_build_all(args):
|
||||
"""Step 8 — build all 12 release variants from the unified words.json."""
|
||||
logger.info("[8] Building all deck variants …")
|
||||
import apkg_builder
|
||||
|
||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict.csv"
|
||||
if not WORDS_JSON.exists():
|
||||
logger.error(f"[8] {WORDS_JSON} not found. Run the data pipeline first.")
|
||||
sys.exit(1)
|
||||
|
||||
apkg_builder.build_all_variants(
|
||||
dict_csv,
|
||||
conjugations=conjugations or {},
|
||||
examples_cache=examples_cache,
|
||||
freq_cache=freq_cache,
|
||||
image_cache=image_cache or {},
|
||||
limit=args.test,
|
||||
)
|
||||
with open(WORDS_JSON, encoding="utf-8") as f:
|
||||
words = json.load(f)
|
||||
|
||||
apkg_builder.build_all_variants(words, limit=args.test)
|
||||
|
||||
|
||||
def step_conjugations(args):
|
||||
"""Step 6 — extract conjugations (returns data; building handled by step_build_all).
|
||||
|
||||
--skip-conjugations skips re-extraction from pealim.com but still loads
|
||||
from cache so conj deck variants are built correctly.
|
||||
"""
|
||||
conj_cache = DATA_DIR / "conjugations.json"
|
||||
|
||||
if args.skip_conjugations:
|
||||
if conj_cache.exists():
|
||||
logger.info("[6] --skip-conjugations: loading from cache …")
|
||||
with open(conj_cache) as f:
|
||||
import json as _json
|
||||
|
||||
return _json.load(f)
|
||||
logger.info("[6] --skip-conjugations: no cache found, skipping conj decks")
|
||||
return None
|
||||
|
||||
verbs_file = Path(__file__).parent / "verbs_input.txt"
|
||||
if not verbs_file.exists():
|
||||
logger.info("[6] verbs_input.txt not found — skipping conjugation deck")
|
||||
return None
|
||||
|
||||
if conj_cache.exists():
|
||||
logger.info("[6] Using cached conjugations.json …")
|
||||
with open(conj_cache) as f:
|
||||
import json as _json
|
||||
|
||||
conjugations = _json.load(f)
|
||||
else:
|
||||
logger.info("[6] Extracting verb conjugations …")
|
||||
import conjugation_extract
|
||||
|
||||
conjugations = conjugation_extract.main(verbs_file)
|
||||
|
||||
# Download conjugation audio
|
||||
step_conj_audio(args, conjugations)
|
||||
|
||||
return conjugations
|
||||
|
||||
|
||||
def print_summary(args, examples_cache, freq_cache, conjugations):
|
||||
def print_summary(_args: argparse.Namespace, example_stats: dict, freq_cache: dict):
|
||||
logger.info("")
|
||||
logger.info("=" * 60)
|
||||
logger.info("SUMMARY")
|
||||
logger.info("=" * 60)
|
||||
|
||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "pealim_dict.csv"
|
||||
if dict_csv.exists():
|
||||
import pandas as pd
|
||||
if WORDS_JSON.exists():
|
||||
with open(WORDS_JSON, encoding="utf-8") as f:
|
||||
words = json.load(f)
|
||||
logger.info(f" Dictionary words: {len(words)}")
|
||||
|
||||
try:
|
||||
df = pd.read_csv(dict_csv, sep=";", index_col=0)
|
||||
if df.shape[1] < 3:
|
||||
raise ValueError("too few columns")
|
||||
except (ValueError, pd.errors.ParserError):
|
||||
df = pd.read_csv(dict_csv, index_col=0)
|
||||
logger.info(f" Dictionary words: {len(df)}")
|
||||
nouns = sum(1 for e in words.values() if e.get("pos", "").startswith("Noun"))
|
||||
verbs = sum(1 for e in words.values() if e.get("pos", "").startswith("Verb"))
|
||||
detail_scraped = sum(1 for e in words.values() if e.get("detail_scraped"))
|
||||
logger.info(f" Nouns: {nouns}, Verbs: {verbs}, Detail-scraped: {detail_scraped}")
|
||||
|
||||
logger.info(f" Frequency entries: {len(freq_cache)}")
|
||||
logger.info(f" Example cache entries: {len(examples_cache)}")
|
||||
covered = sum(1 for v in examples_cache.values() if v)
|
||||
if examples_cache:
|
||||
logger.info(f" Example coverage: {covered}/{len(examples_cache)} ({100 * covered // len(examples_cache)}%)")
|
||||
matched = example_stats.get("matched", 0)
|
||||
total = example_stats.get("total_vocab", 0)
|
||||
if total:
|
||||
logger.info(f" Example coverage: {matched}/{total} ({100 * matched // total}%)")
|
||||
for book, count in example_stats.get("books", {}).items():
|
||||
logger.info(f" {book}: {count} sentences")
|
||||
|
||||
if AUDIO_DIR.exists():
|
||||
mp3s = list(AUDIO_DIR.glob("*.mp3"))
|
||||
logger.info(f" Vocabulary audio files: {len(mp3s)}")
|
||||
|
||||
if AUDIO_CONJ_DIR.exists():
|
||||
# Count only files that will be bundled: active non-infinitive forms
|
||||
# (excludes {slug}_passive_* and {slug}_infinitive.mp3 on-disk extras)
|
||||
mp3s = [
|
||||
p for p in AUDIO_CONJ_DIR.glob("*.mp3") if not p.stem.endswith("_infinitive") and "_passive_" not in p.stem
|
||||
]
|
||||
|
|
@ -498,9 +273,6 @@ def print_summary(args, examples_cache, freq_cache, conjugations):
|
|||
if apkg.exists():
|
||||
size_mb = apkg.stat().st_size / 1e6
|
||||
logger.info(f" {apkg.name}: {size_mb:.1f} MB")
|
||||
if conjugations:
|
||||
verb_count = sum(1 for v in conjugations.values() if v)
|
||||
logger.info(f" Verbs in conjugation deck: {verb_count}")
|
||||
|
||||
logger.info("=" * 60)
|
||||
logger.info("DONE")
|
||||
|
|
@ -515,92 +287,75 @@ def main():
|
|||
logger.info(f" MODE: --only {args.only}")
|
||||
if args.test:
|
||||
logger.info(f" TEST MODE: {args.test} words")
|
||||
if args.refresh_examples:
|
||||
logger.info(" REFRESH EXAMPLES: Ben Yehuda index will be rebuilt")
|
||||
logger.info("=" * 60)
|
||||
|
||||
def _load_words_for_only() -> dict:
|
||||
if not WORDS_JSON.exists():
|
||||
logger.error(f"words.json not found at {WORDS_JSON}. Run the data pipeline first.")
|
||||
sys.exit(1)
|
||||
with open(WORDS_JSON, encoding="utf-8") as f:
|
||||
return json.load(f)
|
||||
|
||||
if args.only == "conjugations":
|
||||
step_fonts(args)
|
||||
conjugations = step_conjugations(args)
|
||||
if conjugations:
|
||||
import apkg_builder
|
||||
import apkg_builder
|
||||
|
||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
||||
for audio, path in [(False, apkg_builder.CONJ_APKG), (True, apkg_builder.CONJ_APKG_AUDIO)]:
|
||||
deck, media = apkg_builder.build_conj_deck(
|
||||
conjugations,
|
||||
include_audio=audio,
|
||||
dict_csv=dict_csv,
|
||||
)
|
||||
apkg_builder.write_conj_apkg(deck, media, out_path=path)
|
||||
print_summary(args, {}, {}, conjugations or {})
|
||||
words = _load_words_for_only()
|
||||
for audio, path in [(False, apkg_builder.CONJ_APKG), (True, apkg_builder.CONJ_APKG_AUDIO)]:
|
||||
deck, media = apkg_builder.build_conj_deck(words, include_audio=audio)
|
||||
apkg_builder.write_conj_apkg(deck, media, out_path=path)
|
||||
print_summary(args, {}, {})
|
||||
return
|
||||
|
||||
if args.only == "confusables":
|
||||
step_fonts(args)
|
||||
import apkg_builder
|
||||
|
||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||||
words = _load_words_for_only()
|
||||
for audio, path in [(False, apkg_builder.CONF_APKG), (True, apkg_builder.CONF_APKG_AUDIO)]:
|
||||
deck, media = apkg_builder.build_confusables_deck(dict_csv, include_audio=audio)
|
||||
deck, media = apkg_builder.build_confusables_deck(words, include_audio=audio)
|
||||
apkg_builder.write_conf_apkg(deck, media, out_path=path)
|
||||
print_summary(args, {}, {}, {})
|
||||
print_summary(args, {}, {})
|
||||
return
|
||||
|
||||
if args.only == "plurals":
|
||||
step_fonts(args)
|
||||
import apkg_builder
|
||||
|
||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
||||
words = _load_words_for_only()
|
||||
for audio, path in [(False, apkg_builder.PLURAL_APKG), (True, apkg_builder.PLURAL_APKG_AUDIO)]:
|
||||
deck, media = apkg_builder.build_plural_deck(dict_csv=dict_csv, include_audio=audio)
|
||||
deck, media = apkg_builder.build_plural_deck(words, include_audio=audio)
|
||||
apkg_builder.write_plural_apkg(deck, media, out_path=path)
|
||||
print_summary(args, {}, {}, {})
|
||||
print_summary(args, {}, {})
|
||||
return
|
||||
|
||||
if args.only == "complete":
|
||||
step_fonts(args)
|
||||
freq_cache = step_frequency() if not args.skip_scrape else {}
|
||||
examples_cache = step_examples(args, freq_cache) if not args.skip_examples else {}
|
||||
image_cache = step_images(args) if not args.skip_images else {}
|
||||
conjugations = step_conjugations(args)
|
||||
import apkg_builder
|
||||
|
||||
dict_csv = DATA_DIR / "hebrew_dict_for_anki.csv"
|
||||
if not dict_csv.exists():
|
||||
dict_csv = DATA_DIR / "hebrew_dict.csv"
|
||||
words = _load_words_for_only()
|
||||
emoji_lookup = apkg_builder._load_emoji_lookup()
|
||||
for audio, path in [(False, apkg_builder.COMPLETE_APKG), (True, apkg_builder.COMPLETE_APKG_AUDIO)]:
|
||||
decks, media = apkg_builder.build_complete_deck(
|
||||
dict_csv,
|
||||
conjugations=conjugations or {},
|
||||
examples_cache=examples_cache,
|
||||
freq_cache=freq_cache,
|
||||
image_cache=image_cache,
|
||||
emoji_lookup=emoji_lookup,
|
||||
words,
|
||||
include_audio=audio,
|
||||
emoji_lookup=emoji_lookup,
|
||||
)
|
||||
apkg_builder.write_complete_apkg(decks, media, out_path=path)
|
||||
print_summary(args, examples_cache, freq_cache, conjugations or {})
|
||||
print_summary(args, {}, {})
|
||||
return
|
||||
|
||||
if args.only == "vocab":
|
||||
args.skip_conjugations = True
|
||||
# Full pipeline
|
||||
step_list_scrape(args) # 1 — scrape list pages → words.json (captures slugs)
|
||||
step_detail_scrape(args) # 2 — scrape detail pages using slugs → words.json
|
||||
freq_cache = step_frequency() # 3 — word frequency data
|
||||
example_stats = step_examples(args) # 4 — EPUB example sentences
|
||||
step_audio_download(args) # 5 — download audio mp3s
|
||||
step_fonts(args) # 6 — download Heebo fonts
|
||||
step_images(args) # 7 — fetch noun images
|
||||
step_build_all(args) # 8 — build all .apkg variants
|
||||
|
||||
step_scrape(args)
|
||||
freq_cache = step_frequency()
|
||||
examples_cache = step_examples(args, freq_cache)
|
||||
step_audio(args)
|
||||
step_fonts(args)
|
||||
image_cache = step_images(args)
|
||||
conjugations = step_conjugations(args)
|
||||
step_build_all(args, examples_cache, freq_cache, conjugations, image_cache)
|
||||
|
||||
print_summary(args, examples_cache, freq_cache, conjugations or {})
|
||||
print_summary(args, example_stats, freq_cache)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
|
|
|||
392
scripts/assign_frequency.py
Normal file
392
scripts/assign_frequency.py
Normal file
|
|
@ -0,0 +1,392 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Assign frequency ranks from the cleaned corpus to words.json entries.
|
||||
|
||||
Two-tier assignment with PoS priority:
|
||||
Tier 1: Match headword ktiv_male directly against corpus
|
||||
Tier 2: Match conjugated/inflected forms (only if no other entry already
|
||||
claimed that corpus word via tier 1)
|
||||
|
||||
PoS priority (based on standalone-word likelihood in Hebrew text):
|
||||
כינויי_גוף (Pronoun) > מילות_חיבור (Conjunction) > שם_תואר (Adjective) >
|
||||
מילית (Particle) > שם_עצם (Noun) > תוארי_הפועל (Adverb) >
|
||||
מילות_יחס (Preposition) > פעלים (Verb)
|
||||
|
||||
Usage:
|
||||
python3 scripts/assign_frequency.py # assign and save
|
||||
python3 scripts/assign_frequency.py --dry-run # preview only
|
||||
python3 scripts/assign_frequency.py --stats # show statistics only
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PROJECT_ROOT = Path(__file__).parent.parent
_DATA_DIR = PROJECT_ROOT / "data"
WORDS_JSON = _DATA_DIR / "words.json"
CLEAN_CACHE = _DATA_DIR / "frequency_clean.json"
RAW_CACHE = _DATA_DIR / "frequency_cache.json"

# Function-word PoS tags. When a homograph group contains one of these, the
# function word keeps the corpus frequency and content words in the group lose it.
FUNCTION_POS = frozenset(
    (
        "כינויי_גוף",  # pronoun
        "מילות_חיבור",  # conjunction
        "מילית",  # particle
        "מילות_יחס",  # preposition
        "תוארי_הפועל",  # adverb
    )
)

# Content PoS tags that lose frequency when a function word dominates the group.
# Adjectives are included (e.g. כן "honest" vs כן "yes"); such collisions are rare.
CONTENT_POS = frozenset(
    (
        "שם_עצם",  # noun
        "שם_תואר",  # adjective
        "פעלים",  # verb
    )
)

# Manual overrides, keyed by ktiv_male form: for these spellings ALL homographs
# share the frequency, because the content word is genuinely common enough to
# deserve it.
SHARE_ALL_WORDS = frozenset(
    (
        "עם",  # "people" (NN) + "with" (PREP)
        "שם",  # "name" (NN) + "there" (ADV)
        "אל",  # "god" (NN) + "to" (PREP) + "don't" (PART)
        "עד",  # "witness"/"eternity" (NN) + "until" (PREP)
        "פה",  # "mouth" (NN) + "here" (ADV)
        "לאחר",  # "to be late" (VB) + "after" (PREP)
        "יופי",  # "beauty" (NN) + "great!" (ADV)
        "המון",  # "crowd" (NN) + "lots of" (ADV)
        "חבל",  # "rope" (NN) + "it's a pity" (ADV)
        "ראשית",  # "beginning" (NN) + "firstly" (ADV)
        "עקב",  # "heel"/"footprint" (NN) + "due to" (CONJ)
        "אולם",  # "hall" (NN) + "however" (ADV)
    )
)
|
||||
|
||||
|
||||
def _get_pos_tag(entry: dict) -> str:
|
||||
"""Extract primary PoS tag from entry's tags field."""
|
||||
tags = (entry.get("tags") or "").split()
|
||||
for t in tags:
|
||||
if not t.startswith("שורש"):
|
||||
return t
|
||||
return "unknown"
|
||||
|
||||
|
||||
def _build_form_index(words: dict) -> dict[str, list[tuple[str, str]]]:
|
||||
"""Build reverse index: ktiv_male_form -> [(unique_key, match_type), ...]"""
|
||||
index: dict[str, list[tuple[str, str]]] = defaultdict(list)
|
||||
|
||||
for key, entry in words.items():
|
||||
w = entry.get("word") or {}
|
||||
if km := w.get("ktiv_male"):
|
||||
index[km].append((key, "headword"))
|
||||
|
||||
# Verb conjugations: indexed for new-assignment-only matching (no upgrades).
|
||||
# Conjugated forms collide with unrelated headwords, so tier 2 only uses
|
||||
# these for entries that have NO existing frequency.
|
||||
conj = entry.get("conjugation") or {}
|
||||
for form in conj.get("active_forms") or []:
|
||||
if isinstance(form, dict):
|
||||
form_data = form.get("form") or {}
|
||||
if km2 := form_data.get("ktiv_male"):
|
||||
km2 = km2.rstrip("!\u200f ")
|
||||
index[km2].append((key, "conjugation"))
|
||||
|
||||
for hp in conj.get("hufal_pual_forms") or []:
|
||||
if isinstance(hp, dict):
|
||||
hp_data = hp.get("form") or {}
|
||||
if km3 := hp_data.get("ktiv_male"):
|
||||
km3 = km3.rstrip("!\u200f ")
|
||||
index[km3].append((key, "conjugation"))
|
||||
|
||||
for field in ("noun_inflection", "preposition_inflection", "adjective_inflection"):
|
||||
for inf_data in (entry.get(field) or {}).values():
|
||||
if isinstance(inf_data, dict) and (km4 := inf_data.get("ktiv_male")):
|
||||
index[km4].append((key, "inflection"))
|
||||
|
||||
return dict(index)
|
||||
|
||||
|
||||
def _should_get_frequency(
    entry: dict,
    all_headword_entries: list[tuple[str, str]],
    corpus_word: str,
    words: dict,
) -> bool:
    """Decide whether ``entry`` receives the frequency of ``corpus_word``.

    ``all_headword_entries`` is the homograph group: every
    ``(unique_key, match_type)`` pair whose headword equals ``corpus_word``.

    Rules, in order:
      - A group of at most one entry always gets the frequency.
      - Words listed in SHARE_ALL_WORDS share the frequency among all entries.
      - If the group contains a function word, content-word entries lose.
      - Otherwise every entry shares the frequency.
    """
    if len(all_headword_entries) <= 1 or corpus_word in SHARE_ALL_WORDS:
        return True

    entry_pos = _get_pos_tag(entry)

    group_has_function_word = False
    for member_key, _match_type in all_headword_entries:
        if _get_pos_tag(words[member_key]) in FUNCTION_POS:
            group_has_function_word = True
            break

    if not group_has_function_word:
        return True
    return entry_pos not in CONTENT_POS
|
||||
|
||||
|
||||
def assign_frequencies(
    words: dict,
    freq_corpus: dict[str, int],
    raw_corpus: dict[str, int] | None = None,
    upgrade: bool = False,
) -> dict[str, dict]:
    """Assign frequency ranks to words.json entries and return the details.

    Args:
        words: words.json entries keyed by unique key.
        freq_corpus: Cleaned corpus (word -> rank); decides which corpus words
            are valid and the order in which they are processed.
        raw_corpus: Optional original corpus (word -> rank, with gaps). When
            given, its ranks are the ones recorded; a word missing from it
            falls back to the cleaned rank.
        upgrade: When True, tier 2 may replace an entry's existing rank if an
            inflected form has a better (lower) rank. Conjugation matches
            never upgrade.

    Returns:
        Mapping of unique key -> ``{"rank", "source", "corpus_word"}``.
    """
    ranks = freq_corpus if raw_corpus is None else raw_corpus
    form_index = _build_form_index(words)
    ordered_corpus = sorted(freq_corpus.items(), key=lambda item: item[1])

    results: dict[str, dict] = {}  # unique_key -> {rank, source, corpus_word}
    claimed_by_headword: set[str] = set()  # corpus words consumed by tier 1

    # --- Tier 1: headword matches ---
    # Within a homograph group, function words get the frequency and content
    # words do not (unless overridden); see _should_get_frequency.
    for corpus_word, clean_rank in ordered_corpus:
        headword_hits = [hit for hit in form_index.get(corpus_word, []) if hit[1] == "headword"]
        if not headword_hits:
            continue

        rank = ranks.get(corpus_word, clean_rank)
        claimed = False
        for entry_key, _match_type in headword_hits:
            if entry_key in results:
                continue
            if not _should_get_frequency(words[entry_key], headword_hits, corpus_word, words):
                continue
            results[entry_key] = {
                "rank": rank,
                "source": "headword",
                "corpus_word": corpus_word,
            }
            claimed = True

        if claimed:
            claimed_by_headword.add(corpus_word)

    headword_total = len(results)
    logger.info("Tier 1 (headword): %d entries assigned", headword_total)

    # --- Tier 2: conjugation/inflection matches ---
    # Only corpus words NOT claimed in tier 1 are considered. A corpus word
    # that matches an inflection is "owned" by that headword, so it cannot
    # also be used by an unrelated verb via conjugation.
    for corpus_word, clean_rank in ordered_corpus:
        if corpus_word in claimed_by_headword:
            continue

        hits = form_index.get(corpus_word, [])
        inflection_hits = [hit for hit in hits if hit[1] == "inflection"]
        conjugation_hits = [hit for hit in hits if hit[1] == "conjugation"]
        # Inflections take priority: conjugations are used only when there
        # is no inflection match at all for this corpus word.
        candidates = inflection_hits or conjugation_hits
        if not candidates:
            continue

        rank = ranks.get(corpus_word, clean_rank)

        for entry_key, match_type in candidates:
            current = results.get(entry_key)

            if current is not None:
                # Upgrade: only for inflections (conjugations collide too much).
                if upgrade and match_type == "inflection" and rank < current["rank"]:
                    results[entry_key] = {
                        "rank": rank,
                        "source": f"upgrade:{match_type}",
                        "corpus_word": corpus_word,
                    }
                    break
                continue

            # New assignment: conjugations are only allowed for rank > 5000
            # (too many false positives in the important tiers).
            if match_type == "conjugation" and rank <= 5000:
                continue
            results[entry_key] = {
                "rank": rank,
                "source": match_type,
                "corpus_word": corpus_word,
            }
            break

    logger.info(
        "Tier 2 (conjugation/inflection): %d entries assigned",
        len(results) - headword_total,
    )

    return results
|
||||
|
||||
|
||||
def print_stats(words: dict, assignments: dict, freq_corpus: dict) -> None:
    """Print detailed statistics about a frequency assignment run.

    The ``frequency`` values currently stored in ``words`` are treated as the
    "before" state and ``assignments`` as the "after" state.

    Args:
        words: words.json entries keyed by unique key.
        assignments: Mapping of unique key to a dict with ``"rank"``,
            ``"source"`` and ``"corpus_word"`` keys.
        freq_corpus: Corpus used for the assignment; only its size is reported.
    """
    from collections import Counter

    def ktiv_male_of(entry: dict) -> str:
        # "word" or "ktiv_male" may be missing or null; fall back to "" so the
        # width-formatted prints below never receive None.
        return (entry.get("word") or {}).get("ktiv_male") or ""

    total = len(words)
    assigned = len(assignments)
    previously_had = sum(1 for e in words.values() if e.get("frequency") is not None)

    # Entries that had no frequency before and have one now.
    newly = sorted(
        (a["rank"], k, ktiv_male_of(words[k]), a["source"], a["corpus_word"])
        for k, a in assignments.items()
        if words[k].get("frequency") is None
    )
    # Entries that had a frequency before and are not assigned now.
    lost = sorted(
        (v["frequency"], k, ktiv_male_of(v))
        for k, v in words.items()
        if v.get("frequency") is not None and k not in assignments
    )

    print(f"\n{'=' * 60}")
    print("Frequency Assignment Statistics")
    print(f"{'=' * 60}")
    print(f"Words.json entries: {total}")
    print(f"Clean corpus size: {len(freq_corpus)}")
    print(f"Previously had freq: {previously_had}")
    print(f"Now assigned: {assigned}")
    # Count real gains; "assigned - previously_had" is only the net change and
    # under-reports whenever some entries lose their frequency in the same run.
    print(f"Newly gained: {len(newly)}")
    print(f"Still unlisted: {total - assigned}")

    # By tier
    by_source = Counter(a["source"] for a in assignments.values())
    print("\nBy assignment tier:")
    print(f" Tier 1 (headword): {by_source['headword']}")
    print(f" Tier 2 (conjugation): {by_source['conjugation']}")
    print(f" Tier 2 (inflection): {by_source['inflection']}")

    # By PoS
    print("\nBy PoS:")
    pos_assigned: Counter = Counter()
    pos_total: Counter = Counter()
    for k, v in words.items():
        pos = _get_pos_tag(v)
        pos_total[pos] += 1
        if k in assignments:
            pos_assigned[pos] += 1

    pos_order = [
        "כינויי_גוף",
        "מילות_חיבור",
        "שם_תואר",
        "מילית",
        "שם_עצם",
        "תוארי_הפועל",
        "מילות_יחס",
        "פעלים",
        "unknown",
    ]
    pos_position = {pos: i for i, pos in enumerate(pos_order)}
    # Tags outside pos_order sort last (key 99), in first-seen order.
    for pos in sorted(pos_total, key=lambda p: pos_position.get(p, 99)):
        a = pos_assigned[pos]
        t = pos_total[pos]
        pct = a / t * 100 if t else 0
        print(f" {pos:20s}: {a:5d}/{t:5d} ({pct:.0f}%)")

    # By frequency tier (using apkg_builder tiers)
    print("\nBy frequency tier:")
    tiers = {
        "Core (1-500)": (1, 500),
        "Essential (501-1500)": (501, 1500),
        "Intermediate (1501-3000)": (1501, 3000),
        "Upper-intermediate (3001-5000)": (3001, 5000),
        "Advanced (5001-10000)": (5001, 10000),
        "Rare (10001+)": (10001, 999999),
    }
    for label, (lo, hi) in tiers.items():
        count = sum(1 for a in assignments.values() if lo <= a["rank"] <= hi)
        print(f" {label:35s}: {count}")

    # Top 20 newly assigned (entries that didn't have frequency before)
    if newly:
        print("\nTop 20 newly assigned entries:")
        for rank, _key, ktiv, source, corpus_word in newly[:20]:
            print(f" rank {rank:5d}: {ktiv:15s} via {source:12s} (corpus: {corpus_word})")

    # Entries that LOST frequency (had it before, not assigned now)
    if lost:
        print(f"\nEntries that would LOSE frequency ({len(lost)} total):")
        for rank, _key, ktiv in lost[:20]:
            print(f" was rank {rank:5d}: {ktiv}")
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: load the corpus and words.json, assign ranks, report, save.

    With ``--stats`` or ``--dry-run`` the statistics are printed and nothing
    is written. Otherwise every entry's ``frequency`` is set to its assigned
    rank (or to None when it received none) and words.json is rewritten.
    """
    cli = argparse.ArgumentParser(description="Assign frequency to words.json")
    cli.add_argument("--dry-run", action="store_true", help="Preview without saving")
    cli.add_argument("--stats", action="store_true", help="Show statistics only")
    cli.add_argument(
        "--upgrade",
        action="store_true",
        help="Allow tier 2 to upgrade headword rank from conjugated forms",
    )
    args = cli.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    # Prefer the cleaned corpus; fall back to the raw cache when it is absent.
    corpus_path = CLEAN_CACHE if CLEAN_CACHE.exists() else RAW_CACHE
    logger.info("Loading frequency corpus: %s", corpus_path)
    with open(corpus_path, encoding="utf-8") as corpus_file:
        freq_corpus: dict[str, int] = json.load(corpus_file)

    # The raw corpus supplies the original rank numbers (with gaps); it is
    # only needed when the corpus loaded above is the cleaned one.
    raw_corpus: dict[str, int] | None = None
    if RAW_CACHE.exists() and corpus_path != RAW_CACHE:
        with open(RAW_CACHE, encoding="utf-8") as raw_file:
            raw_corpus = json.load(raw_file)
        logger.info("Using original ranks from %s", RAW_CACHE)

    with open(WORDS_JSON, encoding="utf-8") as words_file:
        words: dict = json.load(words_file)

    logger.info("Corpus: %d entries, Words.json: %d entries", len(freq_corpus), len(words))

    assignments = assign_frequencies(words, freq_corpus, raw_corpus, upgrade=args.upgrade)
    print_stats(words, assignments, freq_corpus)

    if args.dry_run:
        logger.info("Dry run — no changes saved")
        return
    if args.stats:
        return

    # Apply: assigned entries get their rank, all others are reset to None.
    changed = 0
    for key, entry in words.items():
        target = assignments[key]["rank"] if key in assignments else None
        if entry.get("frequency") != target:
            entry["frequency"] = target
            changed += 1

    with open(WORDS_JSON, "w", encoding="utf-8") as out_file:
        json.dump(words, out_file, ensure_ascii=False, indent=2)

    logger.info("Updated %d entries in words.json", changed)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
269
scripts/assign_pseudo_frequency.py
Normal file
269
scripts/assign_pseudo_frequency.py
Normal file
|
|
@ -0,0 +1,269 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Assign pseudo-frequency to confusable groups using English word frequency.
|
||||
|
||||
Problem: Confusable entries share the same ktiv_male and thus the same Hebrew
|
||||
frequency rank. This script uses English frequency to differentiate them so
|
||||
Anki sorts more-common meanings first.
|
||||
|
||||
Algorithm:
|
||||
1. For each confusable group where all entries share the same Hebrew frequency,
|
||||
extract the first meaningful English keyword from each entry's meaning field.
|
||||
2. Look up English frequency rank for each keyword.
|
||||
3. Assign pseudo_frequency: the most frequent English meaning keeps the original
|
||||
Hebrew rank; less frequent meanings get progressively higher (worse) ranks
|
||||
by adding an offset (100 * position in group).
|
||||
|
||||
Usage:
|
||||
python3 scripts/assign_pseudo_frequency.py # assign and save
|
||||
python3 scripts/assign_pseudo_frequency.py --dry-run # preview only
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PROJECT_ROOT = Path(__file__).parent.parent
|
||||
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
|
||||
EN_FREQ_PATH = PROJECT_ROOT / "data" / "en_50k.txt"
|
||||
|
||||
# Words too common/vague to use as frequency signal
|
||||
_EN_STOP = frozenset(
|
||||
{
|
||||
"to",
|
||||
"be",
|
||||
"a",
|
||||
"an",
|
||||
"the",
|
||||
"of",
|
||||
"in",
|
||||
"on",
|
||||
"at",
|
||||
"for",
|
||||
"and",
|
||||
"with",
|
||||
"by",
|
||||
"or",
|
||||
"but",
|
||||
"not",
|
||||
"as",
|
||||
"its",
|
||||
"it",
|
||||
"is",
|
||||
"was",
|
||||
"are",
|
||||
"from",
|
||||
"that",
|
||||
"this",
|
||||
"have",
|
||||
"has",
|
||||
"had",
|
||||
"do",
|
||||
"does",
|
||||
"did",
|
||||
"will",
|
||||
"would",
|
||||
"can",
|
||||
"could",
|
||||
"may",
|
||||
"might",
|
||||
"shall",
|
||||
"should",
|
||||
"must",
|
||||
"no",
|
||||
"yes",
|
||||
"very",
|
||||
"too",
|
||||
"also",
|
||||
"just",
|
||||
"only",
|
||||
"so",
|
||||
"up",
|
||||
"out",
|
||||
"into",
|
||||
"over",
|
||||
"after",
|
||||
"before",
|
||||
"about",
|
||||
"more",
|
||||
"than",
|
||||
"other",
|
||||
"some",
|
||||
"any",
|
||||
"all",
|
||||
"each",
|
||||
"every",
|
||||
"both",
|
||||
"few",
|
||||
"many",
|
||||
"much",
|
||||
"most",
|
||||
"such",
|
||||
"own",
|
||||
"same",
|
||||
"well",
|
||||
"still",
|
||||
"even",
|
||||
"how",
|
||||
"what",
|
||||
"when",
|
||||
"where",
|
||||
"which",
|
||||
"who",
|
||||
"whom",
|
||||
"whose",
|
||||
"why",
|
||||
"because",
|
||||
"if",
|
||||
"then",
|
||||
"else",
|
||||
"while",
|
||||
"until",
|
||||
"though",
|
||||
"whether",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
def _load_en_freq() -> dict[str, int]:
    """Load English frequency data from EN_FREQ_PATH as word -> rank (1 = most common).

    Only the first whitespace-separated token of each line is used,
    lower-cased. Blank lines are ignored, and a word that appears again keeps
    the rank of its first occurrence.
    """
    ranks: dict[str, int] = {}
    with open(EN_FREQ_PATH, encoding="utf-8") as source:
        for raw_line in source:
            tokens = raw_line.split()
            if not tokens:
                continue
            # The next rank is one past the number of words recorded so far.
            ranks.setdefault(tokens[0].lower(), len(ranks) + 1)
    return ranks
|
||||
|
||||
|
||||
def _extract_keywords(meaning: str) -> list[str]:
    """Return the lower-cased keywords found in a meaning string, in order.

    Parenthesized text is removed and every character that is neither a word
    character nor whitespace becomes a space. Tokens of one or two characters
    and tokens listed in _EN_STOP are dropped.
    """
    without_parens = re.sub(r"\([^)]*\)", " ", meaning)
    tokens = re.sub(r"[^\w\s]", " ", without_parens).split()

    keywords: list[str] = []
    for token in tokens:
        lowered = token.lower()
        if len(token) <= 2 or lowered in _EN_STOP:
            continue
        keywords.append(lowered)
    return keywords
|
||||
|
||||
|
||||
def assign_pseudo_frequencies(
    words: dict,
    en_freq: dict[str, int],
    dry_run: bool = False,
) -> int:
    """Assign ``pseudo_frequency`` to confusable groups.

    Entries are grouped by ``confusables_guid``. A group is processed only
    when all of its entries have the same ``frequency`` (possibly None) and
    their English keywords yield at least two distinct English ranks.

    Args:
        words: words.json entries keyed by unique key; mutated in place
            unless ``dry_run`` is True.
        en_freq: English word -> rank (lower = more common).
        dry_run: When True, log the planned values instead of storing them.

    Returns:
        Number of pseudo-frequency values assigned, or that would be assigned
        in a dry run.
    """
    # Group entry keys by confusables_guid.
    groups: dict[str, list[str]] = defaultdict(list)
    for key, entry in words.items():
        guid = entry.get("confusables_guid")
        if guid:
            groups[guid].append(key)

    changes = 0
    assigned_groups = 0
    skipped_diff = 0
    skipped_no_en = 0

    for keys in groups.values():
        freqs = [words[k].get("frequency") for k in keys]

        # Skip groups that are already differentiated.
        if len(set(freqs)) > 1:
            skipped_diff += 1
            continue

        base_freq = freqs[0]  # All same (or all None)

        # English rank per entry: the first of its first five keywords that
        # is present in en_freq, or 999_999 when none is.
        en_ranks: list[tuple[int, str]] = []  # (en_rank, key)
        for key in keys:
            # A null "meaning" is treated as empty rather than crashing re.sub.
            keywords = _extract_keywords(words[key].get("meaning") or "")
            en_rank = 999_999
            for kw in keywords[:5]:
                found = en_freq.get(kw)
                if found is not None:
                    en_rank = found
                    break
            en_ranks.append((en_rank, key))

        # Sort by English frequency (lower rank = more common).
        en_ranks.sort()

        # All entries share one English rank: no signal to differentiate on.
        if len({rank for rank, _ in en_ranks}) <= 1:
            skipped_no_en += 1
            continue

        assigned_groups += 1

        # The most common meaning keeps the base rank; each following one is
        # pushed back by 100. Groups without a Hebrew rank use 50000 + en_rank.
        for position, (en_rank, key) in enumerate(en_ranks):
            if base_freq is not None:
                pseudo = base_freq + position * 100
            else:
                pseudo = 50000 + en_rank

            # Counted in both modes so a dry run reports what a real run would do.
            changes += 1

            if dry_run:
                meaning = (words[key].get("meaning") or "")[:40]
                logger.info(
                    " [en:%5d] pseudo=%6d %s",
                    en_rank,
                    pseudo,
                    meaning,
                )
            else:
                words[key]["pseudo_frequency"] = pseudo

    logger.info(
        "Pseudo-frequency: %d groups assigned, %d already differentiated, %d no English signal",
        assigned_groups,
        skipped_diff,
        skipped_no_en,
    )
    return changes
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: compute pseudo-frequencies and save them to words.json.

    With ``--dry-run`` the planned values are only logged and words.json is
    left untouched.
    """
    cli = argparse.ArgumentParser(description="Assign pseudo-frequency to confusables")
    cli.add_argument("--dry-run", action="store_true", help="Preview without saving")
    options = cli.parse_args()

    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s %(levelname)s %(message)s",
    )

    logger.info("Loading English frequency data: %s", EN_FREQ_PATH)
    english_ranks = _load_en_freq()
    logger.info("English frequency: %d entries", len(english_ranks))

    with open(WORDS_JSON, encoding="utf-8") as words_file:
        words: dict = json.load(words_file)

    change_count = assign_pseudo_frequencies(words, english_ranks, dry_run=options.dry_run)

    if options.dry_run:
        logger.info("Dry run — %d changes would be made", change_count)
        return

    with open(WORDS_JSON, "w", encoding="utf-8") as out_file:
        json.dump(words, out_file, ensure_ascii=False, indent=2)

    logger.info("Saved %d pseudo-frequency assignments to words.json", change_count)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
212
scripts/check_guid_coverage.py
Normal file
212
scripts/check_guid_coverage.py
Normal file
|
|
@ -0,0 +1,212 @@
|
|||
"""Check that every GUID in the last-release complete .apkg exists in words.json.
|
||||
|
||||
Extracts GUIDs from the Anki SQLite database inside the .apkg (zip) file,
|
||||
then compares against all GUID fields stored in data/words.json.
|
||||
|
||||
Usage:
|
||||
python3 scripts/check_guid_coverage.py
|
||||
python3 scripts/check_guid_coverage.py --apkg output/hebrew_complete.apkg
|
||||
python3 scripts/check_guid_coverage.py --verbose
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import sqlite3
|
||||
import sys
|
||||
import tempfile
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
PROJECT_ROOT = Path(__file__).parent.parent
DEFAULT_APKG = PROJECT_ROOT.joinpath("output", "hebrew_complete.apkg")
WORDS_JSON = PROJECT_ROOT.joinpath("data", "words.json")

# Known model IDs (from apkg_builder.py), mapped to the deck type they carry.
# Insertion order is the order in which decks are reported.
MODEL_IDS = dict(
    (
        (1701222017968, "vocab"),
        (1234567893, "conjugation"),
        (1234567897, "plurals"),
        (1234567895, "confusables"),
    )
)
|
||||
|
||||
|
||||
def extract_apkg_guids(apkg_path: Path) -> dict[int, set[str]]:
    """Extract note GUIDs from an .apkg archive, grouped by model ID.

    Only the ``collection.anki2`` member is unpacked (into a temporary
    directory); the other archive members are never written to disk.

    Args:
        apkg_path: Path to the .apkg (zip) file.

    Returns:
        Mapping of model ID (``notes.mid``) to the set of note GUIDs
        (``notes.guid``) using that model.

    Raises:
        KeyError: If the archive has no ``collection.anki2`` member.
        sqlite3.Error: If the database cannot be read or has no ``notes`` table.
    """
    by_model: dict[int, set[str]] = {}
    with zipfile.ZipFile(apkg_path) as archive, tempfile.TemporaryDirectory() as workdir:
        db_path = archive.extract("collection.anki2", workdir)
        conn = sqlite3.connect(db_path)
        try:
            for guid, mid in conn.execute("SELECT guid, mid FROM notes"):
                by_model.setdefault(mid, set()).add(guid)
        finally:
            # Close even when the query fails, so the handle is not leaked and
            # the temporary directory can always be removed.
            conn.close()
    return by_model
|
||||
|
||||
|
||||
def collect_words_json_guids(data: dict[str, Any]) -> dict[str, set[str]]:
    """Collect every GUID stored in words.json, grouped by deck type.

    Returns a dict with the keys ``"vocab"``, ``"cloze"``, ``"conjugation"``,
    ``"plurals"`` and ``"confusables"``, each mapping to a set of GUIDs.
    Missing, null or empty fields contribute nothing.
    """
    collected: dict[str, set[str]] = {
        deck: set() for deck in ("vocab", "cloze", "conjugation", "plurals", "confusables")
    }

    for entry in data.values():
        # Vocab legacy GUID (top-level).
        vocab_guid = entry.get("vocab_legacy_guid")
        if vocab_guid:
            collected["vocab"].add(vocab_guid)

        # Cloze GUID, stored in examples.cloze.cloze_guid.
        cloze = (entry.get("examples") or {}).get("cloze") or {}
        cloze_guid = cloze.get("cloze_guid")
        if cloze_guid:
            collected["cloze"].add(cloze_guid)

        # Plurals GUID, stored inside noun_inflection.
        plurals_guid = (entry.get("noun_inflection") or {}).get("plurals_guid")
        if plurals_guid:
            collected["plurals"].add(plurals_guid)

        # Confusables GUID (top-level).
        confusables_guid = entry.get("confusables_guid")
        if confusables_guid:
            collected["confusables"].add(confusables_guid)

        # Conjugation form GUIDs: each form's own guid plus its guid_candidates.
        conjugation = entry.get("conjugation") or {}
        for list_name in ("active_forms", "hufal_pual_forms"):
            for form in conjugation.get(list_name) or ():
                form_guid = form.get("guid")
                if form_guid:
                    collected["conjugation"].add(form_guid)
                collected["conjugation"].update(form.get("guid_candidates") or ())

    return collected
|
||||
|
||||
|
||||
def main() -> None:
    """CLI entry point: compare the GUIDs in an .apkg with those in words.json.

    Prints a per-deck report and terminates the process via ``sys.exit``:
      - 0 when every GUID of a known model in the .apkg is present in words.json,
      - 1 when at least one such GUID is missing,
      - 2 when the .apkg or words.json file does not exist.
    Notes belonging to model IDs not listed in MODEL_IDS are reported as a
    warning but are not checked.
    """
    parser = argparse.ArgumentParser(description="Check GUID coverage between .apkg and words.json")
    parser.add_argument(
        "--apkg",
        type=Path,
        default=DEFAULT_APKG,
        help=f"Path to .apkg file (default: {DEFAULT_APKG})",
    )
    parser.add_argument("--verbose", "-v", action="store_true")
    args = parser.parse_args()

    if not args.apkg.exists():
        print(f"ERROR: apkg not found: {args.apkg}")
        sys.exit(2)
    if not WORDS_JSON.exists():
        print(f"ERROR: words.json not found: {WORDS_JSON}")
        sys.exit(2)

    print(f"Checking: {args.apkg}")
    print(f"Against: {WORDS_JSON}")
    print()

    apkg_by_model = extract_apkg_guids(args.apkg)
    # Use a context manager so the words.json handle is closed promptly.
    with WORDS_JSON.open(encoding="utf-8") as words_file:
        data = json.load(words_file)
    wj = collect_words_json_guids(data)

    total_apkg = sum(len(s) for s in apkg_by_model.values())
    total_wj = sum(len(s) for s in wj.values())
    print(f"Total GUIDs in apkg: {total_apkg}")
    print(f"Total GUIDs in words.json: {total_wj}")
    print()

    # words.json GUID set to compare against, per deck type.
    # Vocab notes cover both vocab cards (ord 0,1) and cloze (ord 2); they
    # share the note GUID, so both sets are accepted for the vocab model.
    wj_sets_by_deck = {
        "vocab": wj["vocab"] | wj["cloze"],
        "conjugation": wj["conjugation"],
        "plurals": wj["plurals"],
        "confusables": wj["confusables"],
    }

    all_missing = 0
    all_extra = 0

    for mid, deck_name in MODEL_IDS.items():
        apkg_set = apkg_by_model.get(mid, set())
        wj_set = wj_sets_by_deck.get(deck_name, set())

        missing = apkg_set - wj_set
        extra = wj_set - apkg_set
        matched = apkg_set & wj_set
        all_missing += len(missing)
        all_extra += len(extra)

        # Only GUIDs missing from words.json fail a deck; extras are informational.
        status = "PASS" if not missing else "FAIL"
        print(f" {status} {deck_name} (mid={mid})")
        print(
            f" apkg={len(apkg_set)}, words.json={len(wj_set)}, "
            f"matched={len(matched)}, missing={len(missing)}, extra={len(extra)}"
        )

        if missing and args.verbose:
            print(" Missing GUIDs (in apkg, not in words.json):")
            for g in sorted(missing)[:20]:
                print(f" {g!r}")
            if len(missing) > 20:
                print(f" ... ({len(missing) - 20} more)")

        if extra and args.verbose:
            print(" Extra GUIDs (in words.json, not in apkg):")
            for g in sorted(extra)[:10]:
                print(f" {g!r}")
            if len(extra) > 10:
                print(f" ... ({len(extra) - 10} more)")

        print()

    # Check for unknown model IDs in apkg
    unknown_mids = set(apkg_by_model.keys()) - set(MODEL_IDS.keys())
    if unknown_mids:
        print(f" WARNING: Unknown model IDs in apkg: {unknown_mids}")
        for mid in unknown_mids:
            print(f" mid={mid}: {len(apkg_by_model[mid])} notes")

    print("─" * 60)
    if all_missing:
        print(f" FAILED: {all_missing} apkg GUIDs not found in words.json")
        print(" (These notes would lose study progress on reimport)")
        sys.exit(1)
    else:
        print(f" All {total_apkg} apkg GUIDs accounted for in words.json.")
        sys.exit(0)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
400
scripts/clean_frequency_corpus.py
Normal file
400
scripts/clean_frequency_corpus.py
Normal file
|
|
@ -0,0 +1,400 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Clean the Hebrew frequency corpus by removing prefix+word combinations.
|
||||
|
||||
Two modes:
|
||||
--mode yap (default) Use YAP morphological analyzer for accurate prefix detection.
|
||||
Requires YAP API running at localhost:8000.
|
||||
--mode heuristic Use rule-based prefix stripping (no external dependencies).
|
||||
|
||||
Both modes preserve words that exist as known dictionary forms in words.json.
|
||||
|
||||
Usage:
|
||||
python3 scripts/clean_frequency_corpus.py # YAP mode
|
||||
python3 scripts/clean_frequency_corpus.py --mode heuristic # heuristic fallback
|
||||
python3 scripts/clean_frequency_corpus.py --dry-run # preview only
|
||||
python3 scripts/clean_frequency_corpus.py --resume # resume YAP from checkpoint
|
||||
python3 scripts/clean_frequency_corpus.py --limit 1000 # process first N entries
|
||||
|
||||
Input: data/frequency_cache.json (raw he_50k.txt, 49999 entries)
|
||||
Output: data/frequency_clean.json (filtered, prefix combos removed)
|
||||
data/frequency_discarded.json (discarded entries with reason)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
PROJECT_ROOT = Path(__file__).parent.parent
|
||||
RAW_CACHE = PROJECT_ROOT / "data" / "frequency_cache.json"
|
||||
CLEAN_CACHE = PROJECT_ROOT / "data" / "frequency_clean.json"
|
||||
DISCARDED = PROJECT_ROOT / "data" / "frequency_discarded.json"
|
||||
WORDS_JSON = PROJECT_ROOT / "data" / "words.json"
|
||||
CHECKPOINT = PROJECT_ROOT / "data" / "_yap_checkpoint.json"
|
||||
|
||||
YAP_URL = os.environ.get("YAP_URL", "http://localhost:8000/yap/heb/joint")
|
||||
YAP_TIMEOUT = 10
|
||||
BATCH_SAVE_INTERVAL = 500
|
||||
|
||||
# --- YAP mode constants ---
|
||||
# POS tags that indicate a prefix
|
||||
PREFIX_POS = frozenset({"PREPOSITION", "CONJ", "DEF", "REL"})
|
||||
# POS tags for the host word that make the combo a false positive
|
||||
HOST_POS = frozenset({"NN", "NNP", "NNT", "PRP", "CD", "DT", "EX"})
|
||||
|
||||
# --- Heuristic mode constants ---
|
||||
# Hebrew prefix combinations, longest first for greedy matching.
|
||||
PREFIXES = [
|
||||
# 4-char
|
||||
"וכשמ",
|
||||
"וכשב",
|
||||
"וכשל",
|
||||
"וכשה",
|
||||
# 3-char
|
||||
"וכש",
|
||||
"ומה",
|
||||
"ובה",
|
||||
"וכה",
|
||||
"ולה",
|
||||
"ומש",
|
||||
"ובש",
|
||||
"וכב",
|
||||
"ולב",
|
||||
"ומב",
|
||||
"וכל",
|
||||
"ולכ",
|
||||
"שבה",
|
||||
"שמה",
|
||||
# 2-char
|
||||
"כש",
|
||||
"מה",
|
||||
"בה",
|
||||
"כה",
|
||||
"לה",
|
||||
"מש",
|
||||
"בש",
|
||||
"וב",
|
||||
"וה",
|
||||
"וכ",
|
||||
"ול",
|
||||
"ומ",
|
||||
"וש",
|
||||
"כב",
|
||||
"לב",
|
||||
"מב",
|
||||
"כל",
|
||||
"לכ",
|
||||
"שב",
|
||||
"שה",
|
||||
"שכ",
|
||||
"של",
|
||||
"שמ",
|
||||
# 1-char
|
||||
"ב",
|
||||
"ה",
|
||||
"ו",
|
||||
"כ",
|
||||
"ל",
|
||||
"מ",
|
||||
"ש",
|
||||
]
|
||||
MIN_REMAINDER_LEN = 2
|
||||
|
||||
|
||||
def _load_known_forms(words_path: Path) -> set[str]:
    """Collect every ``ktiv_male`` spelling recorded in words.json.

    For each entry this reads the headword (``word.ktiv_male``), the items of
    ``active_forms`` and ``hufal_pual_forms``, and the values of the noun,
    preposition and adjective inflection tables.

    Returns:
        The set of spellings found, or an empty set (after logging a warning)
        when *words_path* does not exist.
    """
    if not words_path.exists():
        logger.warning("words.json not found at %s — no dictionary filter", words_path)
        return set()

    with open(words_path, encoding="utf-8") as handle:
        entries = json.load(handle)

    list_fields = ("active_forms", "hufal_pual_forms")
    table_fields = ("noun_inflection", "preposition_inflection", "adjective_inflection")

    forms: set[str] = set()
    for record in entries.values():
        headword = (record.get("word") or {}).get("ktiv_male")
        if headword:
            forms.add(headword)

        # List-shaped fields: each item is expected to be a dict with ktiv_male.
        for name in list_fields:
            for item in record.get(name) or []:
                if isinstance(item, dict) and item.get("ktiv_male"):
                    forms.add(item["ktiv_male"])

        # Table-shaped fields: the dict values carry the ktiv_male spelling.
        for name in table_fields:
            for item in (record.get(name) or {}).values():
                if isinstance(item, dict) and item.get("ktiv_male"):
                    forms.add(item["ktiv_male"])

    logger.info("Loaded %d known dictionary forms from words.json", len(forms))
    return forms
|
||||
|
||||
|
||||
# ── YAP mode ──────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def query_yap(word: str) -> dict | None:
    """Send a single word to YAP and return the decoded JSON reply.

    The word is posted to ``YAP_URL`` with a trailing space appended, using
    ``YAP_TIMEOUT`` as the request timeout.

    Returns:
        The JSON object returned by YAP, or ``None`` (after logging a warning)
        when the request fails, the server answers with an error status, the
        body is not valid JSON, or the JSON value is not an object.
    """
    payload = {"text": f"{word} "}
    try:
        resp = requests.post(YAP_URL, json=payload, timeout=YAP_TIMEOUT)
        resp.raise_for_status()
        data = resp.json()
    except requests.RequestException as e:
        logger.warning("YAP request failed for '%s': %s", word, e)
        return None
    except ValueError as e:
        # Older requests releases raise a plain ValueError (not a
        # RequestException) for a non-JSON body; treat it like any failure.
        logger.warning("YAP returned invalid JSON for '%s': %s", word, e)
        return None

    if not isinstance(data, dict):
        # Callers use dict methods on the result, so reject other JSON values.
        logger.warning("YAP returned unexpected JSON for '%s': %r", word, type(data).__name__)
        return None
    return data
|
||||
|
||||
|
||||
def is_prefix_combo_yap(yap_response: dict) -> tuple[bool, str]:
    """Report whether the YAP lattice contains a prefix arc followed by a host arc.

    Every tab-separated row of ``ma_lattice`` with at least six columns is read
    as (from, to, form, lemma, cpos, pos). The word counts as a prefix combo as
    soon as one arc tagged with a PREFIX_POS tag ends on the node where an arc
    tagged with a HOST_POS tag begins. A single such reading is enough, so the
    check deliberately leans towards discarding.

    Returns:
        ``(True, "prefix(cpos)+host(cpos)")`` for the first pairing found,
        otherwise ``(False, "")``.
    """
    lattice = yap_response.get("ma_lattice", "")
    if not lattice:
        return False, ""

    rows = [raw.split("\t") for raw in lattice.strip().split("\n") if raw.strip()]
    arcs = [cols[:6] for cols in rows if len(cols) >= 6]
    if len(arcs) < 2:
        return False, ""

    # Group arcs by start node, keeping lattice order inside each group.
    outgoing: dict[str, list] = {}
    for arc in arcs:
        outgoing.setdefault(arc[0], []).append(arc)

    for _src, dst, form, _lemma, cpos, pos in arcs:
        if not (cpos in PREFIX_POS or pos in PREFIX_POS):
            continue
        for _, _, host_form, _, host_cpos, host_pos in outgoing.get(dst, []):
            if host_cpos in HOST_POS or host_pos in HOST_POS:
                return True, f"{form}({cpos})+{host_form}({host_cpos})"

    return False, ""
|
||||
|
||||
|
||||
# ── Heuristic mode ────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def find_prefix_decomposition(word: str, freq: dict[str, int]) -> tuple[str, str] | None:
    """Try to split *word* into a known prefix plus a better-ranked remainder.

    Prefixes are tried in the order given by PREFIXES. A split is accepted
    when the remainder has at least MIN_REMAINDER_LEN characters, appears in
    *freq*, and has a smaller rank number than *word* itself (a word missing
    from *freq* is treated as rank 999999).

    Returns:
        ``(prefix, remainder)`` for the first accepted split, else ``None``.
    """
    if len(word) <= MIN_REMAINDER_LEN:
        return None

    own_rank = freq.get(word, 999999)

    for candidate in PREFIXES:
        if word.startswith(candidate):
            stem = word[len(candidate) :]
            long_enough = len(stem) >= MIN_REMAINDER_LEN
            if long_enough and stem in freq and freq[stem] < own_rank:
                return candidate, stem

    return None
|
||||
|
||||
|
||||
# ── Main ──────────────────────────────────────────────────────────────────
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: filter the raw frequency corpus and save results.

    Loads RAW_CACHE, orders the words by rank, runs the selected detection
    mode, and (unless ``--dry-run``) writes the re-ranked clean corpus to
    CLEAN_CACHE and the discarded entries to DISCARDED.

    Words that are known dictionary forms in words.json are never discarded,
    in either mode. The heuristic mode applies that filter itself; for YAP
    mode it is applied here to the list the analyzer returns.
    """
    parser = argparse.ArgumentParser(description="Clean frequency corpus")
    parser.add_argument("--mode", choices=["yap", "heuristic"], default="yap", help="Detection mode")
    parser.add_argument("--dry-run", action="store_true", help="Show removals without saving")
    parser.add_argument("--resume", action="store_true", help="Resume YAP mode from checkpoint")
    parser.add_argument("--limit", type=int, default=0, help="Process only first N words (0=all)")
    args = parser.parse_args()

    logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")

    if not RAW_CACHE.exists():
        logger.error("Raw frequency cache not found: %s", RAW_CACHE)
        sys.exit(1)

    with open(RAW_CACHE, encoding="utf-8") as f:
        raw_freq: dict[str, int] = json.load(f)

    logger.info("Raw frequency corpus: %d entries", len(raw_freq))

    # Sort by rank
    words_by_rank = sorted(raw_freq.items(), key=lambda x: x[1])
    if args.limit > 0:
        # Only a positive limit truncates; a negative value would otherwise
        # silently drop entries from the end of the list.
        words_by_rank = words_by_rank[: args.limit]

    known_forms = _load_known_forms(WORDS_JSON)

    if args.mode == "yap":
        yap_discards = _run_yap_mode(words_by_rank, args)
        # Keep anything the dictionary already knows, as the heuristic mode does.
        discarded_list = [d for d in yap_discards if d["word"] not in known_forms]
        rescued = len(yap_discards) - len(discarded_list)
        if rescued:
            logger.info("Kept %d YAP-flagged words that are known dictionary forms", rescued)
    else:
        discarded_list = _run_heuristic_mode(words_by_rank, raw_freq, known_forms)

    kept_count = len(words_by_rank) - len(discarded_list)
    logger.info("Done. Kept: %d, Discarded: %d", kept_count, len(discarded_list))

    if args.dry_run:
        logger.info("Dry run — no files written")
        return

    # Build clean frequency dict (re-ranked without gaps)
    discarded_words = {d["word"] for d in discarded_list}
    survivors = (word for word, _rank in words_by_rank if word not in discarded_words)
    clean_freq: dict[str, int] = {word: new_rank for new_rank, word in enumerate(survivors, start=1)}

    with open(CLEAN_CACHE, "w", encoding="utf-8") as f:
        json.dump(clean_freq, f, ensure_ascii=False)
    logger.info("Clean frequency saved: %d entries → %s", len(clean_freq), CLEAN_CACHE)

    with open(DISCARDED, "w", encoding="utf-8") as f:
        json.dump(discarded_list, f, ensure_ascii=False, indent=2)
    logger.info("Discarded entries saved: %d → %s", len(discarded_list), DISCARDED)
|
||||
|
||||
|
||||
def _write_checkpoint(analyzed: dict[str, dict]) -> None:
    """Persist the per-word verdicts atomically (temp file, then rename)."""
    tmp_path = CHECKPOINT.with_name(CHECKPOINT.name + ".tmp")
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(analyzed, f, ensure_ascii=False)
    # Renaming over the old file means an interrupted write cannot leave a
    # truncated checkpoint behind.
    tmp_path.replace(CHECKPOINT)


def _run_yap_mode(
    words_by_rank: list[tuple[str, int]],
    args: argparse.Namespace,
) -> list[dict]:
    """Run YAP-based prefix detection over *words_by_rank*.

    Exits with status 1 when the YAP API cannot be reached. With
    ``args.resume`` the verdicts stored in CHECKPOINT are reused instead of
    querying YAP again. Single-character and pure-ASCII words are kept without
    a query.

    A word whose YAP request fails is kept for this run but is NOT recorded as
    analyzed, so a later ``--resume`` asks YAP about it again; ``yap_error``
    entries found in an older checkpoint are retried for the same reason.

    Unless ``args.dry_run`` is set, the checkpoint is written every
    BATCH_SAVE_INTERVAL words (when there is something new to save). At the
    end it is removed after an error-free run and kept otherwise.

    Returns:
        One ``{"word", "original_rank", "reason"}`` dict per discarded word,
        in rank order.
    """
    # Check YAP connectivity
    test = query_yap("בדיקה")
    if test is None:
        logger.error("Cannot connect to YAP API at %s", YAP_URL)
        sys.exit(1)
    logger.info("YAP API connected")

    # Load checkpoint if resuming
    analyzed: dict[str, dict] = {}
    if args.resume and CHECKPOINT.exists():
        with open(CHECKPOINT, encoding="utf-8") as f:
            analyzed = json.load(f)
        logger.info("Resumed from checkpoint: %d words already analyzed", len(analyzed))

    discarded_list: list[dict] = []
    kept_count = 0
    error_count = 0
    unsaved = 0  # verdicts added since the last checkpoint write
    total = len(words_by_rank)

    for i, (word, rank) in enumerate(words_by_rank):
        cached = analyzed.get(word)
        if cached is not None and cached.get("reason") == "yap_error":
            # A failure stored by an earlier run is not a real verdict.
            del analyzed[word]
            cached = None

        if cached is not None:
            verdict = cached
        elif len(word) <= 1 or word.isascii():
            # Trivial: single char or ASCII — nothing for YAP to segment.
            verdict = {"discard": False, "reason": ""}
        else:
            result = query_yap(word)
            if result is None:
                verdict = None
                error_count += 1
                time.sleep(0.5)  # back off a little after a failure
            else:
                is_combo, reason = is_prefix_combo_yap(result)
                verdict = {"discard": is_combo, "reason": reason}
                # Rate limit
                if i % 10 == 0:
                    time.sleep(0.01)

        if verdict is not None and cached is None:
            analyzed[word] = verdict
            unsaved += 1

        if verdict is not None and verdict["discard"]:
            discarded_list.append({"word": word, "original_rank": rank, "reason": verdict["reason"]})
            # Only log fresh verdicts, and only the first few / top-ranked ones.
            if cached is None and (rank <= 500 or len(discarded_list) <= 50):
                logger.info(" DISCARD rank %5d: %s (%s)", rank, word, verdict["reason"])
        else:
            kept_count += 1

        # Checkpoint — reached on every iteration, whatever path the word took.
        if (i + 1) % BATCH_SAVE_INTERVAL == 0:
            if unsaved and not args.dry_run:
                _write_checkpoint(analyzed)
                unsaved = 0
            logger.info(
                " [%d/%d] kept=%d discarded=%d errors=%d",
                i + 1,
                total,
                kept_count,
                len(discarded_list),
                error_count,
            )

    if not args.dry_run:
        if error_count:
            # Keep the progress so --resume only has to retry the failed words.
            _write_checkpoint(analyzed)
        elif CHECKPOINT.exists():
            # Clean run: the checkpoint is no longer needed.
            CHECKPOINT.unlink()

    if error_count:
        logger.warning("%d YAP errors encountered", error_count)

    return discarded_list
|
||||
|
||||
|
||||
def _run_heuristic_mode(
    words_by_rank: list[tuple[str, int]],
    raw_freq: dict[str, int],
    known_forms: set[str],
) -> list[dict]:
    """Flag prefix+word combinations using the rule-based splitter only.

    Single-character words, pure-ASCII words and anything in *known_forms* are
    always kept. Every other word is discarded when
    ``find_prefix_decomposition`` finds a split for it.

    Returns:
        One ``{"word", "original_rank", "reason"}`` dict per discarded word,
        in the order of *words_by_rank*.
    """
    removed: list[dict] = []

    for word, rank in words_by_rank:
        exempt = len(word) <= 1 or word.isascii() or word in known_forms
        if exempt:
            continue

        split = find_prefix_decomposition(word, raw_freq)
        if split is None:
            continue

        prefix, remainder = split
        reason = f"{prefix}+{remainder} (rank {raw_freq[remainder]})"
        removed.append({"word": word, "original_rank": rank, "reason": reason})
        # Log only top-ranked words and the first fifty removals.
        if rank <= 500 or len(removed) <= 50:
            logger.info(" DISCARD rank %5d: %s = %s", rank, word, reason)

    return removed
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,405 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Extract sentences from PDF books and match vocab words to sentences.
|
||||
|
||||
1. Extract sentences from alice.pdf and lion_strawberry.pdf
|
||||
2. Merge into existing epub_sentence_index.json
|
||||
3. Match vocab words to sentences, produce vocab_sentence_matches.json
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
# Use the venv with pymupdf
|
||||
sys.path.insert(0, "/home/node/projects/pealim/venv_pdf/lib/python3.11/site-packages")
|
||||
# Also need the main venv for pandas
|
||||
sys.path.insert(0, "/home/node/projects/pealim/lib/python3.11/site-packages")
|
||||
|
||||
import fitz
|
||||
import pandas as pd
|
||||
|
||||
BASE_DIR = "/home/node/projects/pealim"
|
||||
DATA_DIR = os.path.join(BASE_DIR, "data")
|
||||
EPUBS_DIR = os.path.join(DATA_DIR, "epubs")
|
||||
SENTENCE_INDEX = os.path.join(DATA_DIR, "epub_sentence_index.json")
|
||||
VOCAB_CSV = os.path.join(DATA_DIR, "hebrew_dict_for_anki.csv")
|
||||
MATCHES_FILE = os.path.join(DATA_DIR, "vocab_sentence_matches.json")
|
||||
|
||||
NIKKUD_RE = re.compile(r"[\u0591-\u05C7]")
|
||||
HEBREW_RE = re.compile(r"[\u05d0-\u05ea]")
|
||||
HEBREW_CHAR_RE = re.compile(r"[\u05d0-\u05ea\ufb20-\ufb4f]")
|
||||
|
||||
|
||||
def strip_nikkud(text):
    """Return *text* with every character matched by NIKKUD_RE deleted."""
    without_marks = re.sub(NIKKUD_RE, "", text)
    return without_marks
|
||||
|
||||
|
||||
def collapse_hebrew_spaces(text):
    """Rejoin Hebrew letters that a badly-encoded PDF split apart with spaces.

    Steps:
      1. Strip nikkud and replace the listed presentation-form code points
         with their base letters.
      2. Squeeze runs of spaces down to one.
      3. Drop each remaining space unless it looks like a real word boundary:
         it follows a final-form letter (ם ן ף ך ץ), it touches a character
         that HEBREW_RE does not match (punctuation, digits, Latin, ...), or
         it sits at the very start or end of the text.

    Returns the rebuilt text with leading/trailing whitespace removed.
    """
    # Presentation forms (dagesh / shin-dot variants) mapped to base letters.
    presentation_to_base = str.maketrans(
        {
            "\ufb2a": "ש",
            "\ufb2b": "ש",
            "\ufb35": "ו",
            "\ufb4b": "ו",
            "\ufb30": "א",
            "\ufb31": "ב",
            "\ufb32": "ג",
            "\ufb33": "ד",
            "\ufb34": "ה",
            "\ufb36": "ז",
            "\ufb38": "ט",
            "\ufb39": "י",
            "\ufb3a": "כ",
            "\ufb3b": "כ",
            "\ufb3c": "ל",
            "\ufb3e": "מ",
            "\ufb40": "נ",
            "\ufb41": "ס",
            "\ufb43": "פ",
            "\ufb44": "פ",
            "\ufb46": "צ",
            "\ufb47": "ק",
            "\ufb48": "ר",
            "\ufb49": "ש",
            "\ufb4a": "ת",
        }
    )
    cleaned = strip_nikkud(text).translate(presentation_to_base)
    cleaned = re.sub(r" {2,}", " ", cleaned)

    word_final = set("םןףךץ")
    last_index = len(cleaned) - 1
    pieces = []

    for pos, ch in enumerate(cleaned):
        if ch != " ":
            pieces.append(ch)
            continue

        # Runs of spaces were squeezed above, so the neighbours of a space
        # are always non-space characters (or missing at the text edges).
        before = cleaned[pos - 1] if pos > 0 else None
        after = cleaned[pos + 1] if pos < last_index else None

        keep_space = (
            before is None
            or after is None
            or before in word_final
            or not HEBREW_RE.match(before)
            or not HEBREW_RE.match(after)
        )
        if keep_space:
            pieces.append(" ")
        # otherwise the space is an intra-word gap and is dropped

    return "".join(pieces).strip()
|
||||
|
||||
|
||||
def _iter_raw_sentences(page_text):
    """Yield stripped sentence candidates from one page's text, line by line."""
    for line in page_text.split("\n"):
        line = line.strip()
        if not line:
            continue
        # Split on sentence-ending punctuation followed by whitespace.
        for part in re.split(r"(?<=[.?!])\s+", line):
            part = part.strip()
            if part:
                yield part


def extract_pdf_sentences(pdf_path, book_name):
    """Extract usable Hebrew sentences from a PDF file.

    A candidate is kept when it contains Hebrew letters, has between 4 and 15
    Hebrew words after space-collapsing, and does not look like copyright
    boilerplate.

    Args:
        pdf_path: Path of the PDF to read.
        book_name: Title stored with every extracted sentence.

    Returns:
        A list of ``{"text", "book", "stripped"}`` dicts, where ``stripped``
        is the nikkud-free, space-collapsed form of ``text``.
    """
    metadata_markers = ("copyright", "©", "isbn", "printed in")
    sentences = []

    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            text = page.get_text()
            if not text.strip():
                continue

            for sent in _iter_raw_sentences(text):
                # Must contain Hebrew characters. This also rules out bare
                # page numbers, so no separate digits-only check is needed.
                if not HEBREW_RE.search(sent):
                    continue

                # Create stripped version (no nikkud, collapsed spaces for PDF)
                stripped = collapse_hebrew_spaces(sent)

                # Filter: 4-15 Hebrew words
                word_count = sum(1 for w in stripped.split() if HEBREW_RE.search(w))
                if not 4 <= word_count <= 15:
                    continue

                # Drop copyright / imprint lines
                lowered = sent.lower()
                if any(marker in lowered for marker in metadata_markers):
                    continue

                sentences.append(
                    {
                        "text": sent,
                        "book": book_name,
                        "stripped": stripped,
                    }
                )
    finally:
        # Release the document even when a page fails to parse.
        doc.close()

    return sentences
|
||||
|
||||
|
||||
def has_extractable_text(pdf_path, max_pages=10):
    """Check whether a PDF has an extractable text layer.

    Args:
        pdf_path: Path of the PDF to probe.
        max_pages: How many leading pages to inspect before giving up.

    Returns:
        True as soon as one of the inspected pages yields non-blank text,
        False otherwise.
    """
    doc = fitz.open(pdf_path)
    try:
        pages_to_check = min(len(doc), max_pages)
        return any(doc[i].get_text().strip() for i in range(pages_to_check))
    finally:
        # Close the document on every path, including extraction errors.
        doc.close()
|
||||
|
||||
|
||||
def load_sentence_index():
    """Return the stored sentence index, or an empty one if the file is absent."""
    if not os.path.exists(SENTENCE_INDEX):
        return {"sentences": []}

    with open(SENTENCE_INDEX, encoding="utf-8") as handle:
        stored = json.load(handle)
    return stored
|
||||
|
||||
|
||||
def save_sentence_index(data):
    """Write *data* to SENTENCE_INDEX as indented, non-ASCII-escaped JSON."""
    with open(SENTENCE_INDEX, mode="w", encoding="utf-8") as out:
        json.dump(data, out, indent=2, ensure_ascii=False)
|
||||
|
||||
|
||||
def _cell_text(value):
    """Return a CSV cell as a stripped string, or "" when the cell is missing."""
    # Missing cells arrive as None/NaN; str() would turn them into real
    # "None"/"nan" vocabulary words.
    if pd.isna(value):
        return ""
    return str(value).strip()


def _sentence_preference(sd):
    """Sort key: 6-12 words first (shorter wins), then too short, then too long."""
    wc = sd["word_count"]
    if 6 <= wc <= 12:
        return (0, wc)  # ideal range, prefer shorter
    if wc < 6:
        return (1, -wc)  # too short
    return (2, wc)  # too long


def match_vocab_to_sentences(sentences, vocab_df):
    """Match vocab words to sentences.

    Args:
        sentences: Dicts with ``text``, ``book`` and (optionally) ``stripped``.
        vocab_df: DataFrame with "Word Without Nikkud" and "Word" columns;
            rows where either cell is missing or blank are ignored.

    Returns:
        ``{word_without_nikkud: {"word_nikkud": ..., "sentences": [...]}}``
        holding up to three best sentences per matched word.
    """
    matches = {}

    # Build lookup: word_no_nikkud -> word_nikkud
    vocab_words = []
    for _, row in vocab_df.iterrows():
        word_no_nik = _cell_text(row.get("Word Without Nikkud", ""))
        word_nik = _cell_text(row.get("Word", ""))
        if word_no_nik and word_nik:
            vocab_words.append((word_no_nik, word_nik))

    print(f"Matching {len(vocab_words)} vocab words against {len(sentences)} sentences...")

    # Precompute the stripped text and its word count for every sentence.
    sent_data = []
    for s in sentences:
        stripped = s.get("stripped", "")
        sent_data.append(
            {
                "text": s["text"],
                "book": s["book"],
                "stripped": stripped,
                "word_count": len(stripped.split()),
            }
        )

    # Books extracted from PDFs: their collapsed text often lacks word
    # boundaries, so plain substring hits are accepted for them.
    pdf_books = ("אליס בארץ הפלאות", "האריה שאהב תות")

    matched_count = 0

    for word_no_nik, word_nik in vocab_words:
        if len(word_no_nik) < 2:
            continue

        pattern = re.compile(r"(?:^|\s)" + re.escape(word_no_nik) + r"(?:\s|$)")
        # Substring fallback only for words >= 3 chars to limit false positives.
        use_substring = len(word_no_nik) >= 3

        word_matches = []

        for sd in sent_data:
            stripped = sd["stripped"]

            # Try word-boundary match first
            if pattern.search(stripped):
                word_matches.append(sd)
            elif use_substring and word_no_nik in stripped:
                if sd["book"] in pdf_books:
                    word_matches.append(sd)
                    continue
                # Elsewhere require at least one clear boundary around the
                # first occurrence.
                idx = stripped.find(word_no_nik)
                after_idx = idx + len(word_no_nik)
                before_ok = idx == 0 or not HEBREW_RE.match(stripped[idx - 1])
                after_ok = after_idx >= len(stripped) or not HEBREW_RE.match(stripped[after_idx])
                if before_ok or after_ok:
                    word_matches.append(sd)

        if word_matches:
            matched_count += 1

            word_matches.sort(key=_sentence_preference)
            best = word_matches[:3]

            matches[word_no_nik] = {
                "word_nikkud": word_nik,
                "sentences": [{"text": m["text"], "book": m["book"]} for m in best],
            }

    # Guard the percentage against an empty vocabulary.
    percent = 100 * matched_count / len(vocab_words) if vocab_words else 0.0
    print(f"Words with at least 1 match: {matched_count}/{len(vocab_words)} ({percent:.1f}%)")
    return matches
|
||||
|
||||
|
||||
def main():
    """Extract PDF sentences, merge them into the index, and match the vocab.

    Steps: (1) extract sentences from the configured PDFs, (2) merge them into
    the stored sentence index without duplicates, (3) match vocabulary words
    against all indexed sentences and write MATCHES_FILE, (4) print a summary.
    """
    # ── Step 1: Extract from PDFs ──
    pdfs = [
        ("alice.pdf", "אליס בארץ הפלאות"),
        ("lion_strawberry.pdf", "האריה שאהב תות"),
    ]

    all_new_sentences = []

    for filename, book_name in pdfs:
        pdf_path = os.path.join(EPUBS_DIR, filename)
        if not os.path.exists(pdf_path):
            print(f"SKIP: {filename} not found")
            continue

        if not has_extractable_text(pdf_path):
            print(f"SKIP: {filename} has no extractable text (likely scanned images)")
            continue

        print(f"Extracting from {filename} ({book_name})...")
        sentences = extract_pdf_sentences(pdf_path, book_name)
        print(f" Extracted {len(sentences)} sentences")
        all_new_sentences.extend(sentences)

    # ── Step 2: Merge with existing index ──
    index = load_sentence_index()
    # Tolerate an index file that lacks the "sentences" key.
    indexed = index.setdefault("sentences", [])
    existing_count = len(indexed)

    # Deduplicate by (stripped, book)
    existing_keys = {(s.get("stripped", ""), s.get("book", "")) for s in indexed}

    added = 0
    for s in all_new_sentences:
        key = (s["stripped"], s["book"])
        if key not in existing_keys:
            indexed.append(s)
            existing_keys.add(key)
            added += 1

    save_sentence_index(index)
    total = len(indexed)
    print(f"\nSentence index: {existing_count} existing + {added} new = {total} total")

    # ── Per-book stats ──
    book_counts = {}
    for s in indexed:
        book = s.get("book", "unknown")
        book_counts[book] = book_counts.get(book, 0) + 1
    books_by_size = sorted(book_counts.items(), key=lambda x: -x[1])

    print("\nSentences per book:")
    for book, count in books_by_size:
        print(f" {book}: {count}")

    # ── Step 3: Match vocab words to sentences ──
    print(f"\nLoading vocab from {VOCAB_CSV}...")
    vocab_df = pd.read_csv(VOCAB_CSV, sep=";", index_col=0)
    print(f" {len(vocab_df)} vocab words loaded")

    matches = match_vocab_to_sentences(indexed, vocab_df)

    with open(MATCHES_FILE, "w", encoding="utf-8") as f:
        json.dump(matches, f, ensure_ascii=False, indent=2)

    print(f"\nWrote {len(matches)} word matches to {MATCHES_FILE}")

    # ── Step 4: Summary stats ──
    total_words = len(vocab_df)
    matched_words = len(matches)
    # An empty vocabulary must not crash the summary with a division by zero.
    coverage = 100 * matched_words / total_words if total_words else 0.0
    print(f"\n{'=' * 50}")
    print("SUMMARY")
    print(f"{'=' * 50}")
    print(f"Total sentences: {total}")
    for book, count in books_by_size:
        print(f" {book}: {count}")
    print(f"Total vocab words: {total_words}")
    print(f"Words with sentences: {matched_words} ({coverage:.1f}%)")
    print(f"Words without sentences: {total_words - matched_words}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,237 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Scrape ktiv male (plene/vowelless) forms from pealim.com.
|
||||
|
||||
Uses hebstyle=vl cookie to get vowelless writing with matres lectionis.
|
||||
Builds a lookup: ktiv_male_form → [{word_nikkud, form_type, pos, slug}]
|
||||
|
||||
This enables matching Hebrew text (which is normally in ktiv male)
|
||||
against our vocabulary, including conjugated verbs and noun plurals.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
DATA_DIR = Path(__file__).resolve().parent.parent / "data"
|
||||
OUTPUT_PATH = DATA_DIR / "ktiv_male_forms.json"
|
||||
COOKIES = {"translit": "none", "hebstyle": "vl"}
|
||||
REQUEST_TIMEOUT = 15
|
||||
DELAY = 1.5 # seconds between requests
|
||||
|
||||
|
||||
def fetch_verb_ktiv_male(slug: str, infinitive_nikkud: str) -> list[dict]:
    """Fetch all conjugated forms in ktiv male for a verb.

    Requests the verb's dictionary page with the module's COOKIES and collects
    the text of every ``span.menukad``: first those in the ``div.lead`` block
    (recorded as ``"infinitive"``), then those in the conjugation table
    (recorded as ``"conjugation"``, skipping spellings already collected).

    Args:
        slug: Dictionary slug of the verb page.
        infinitive_nikkud: Vocalized infinitive stored with every form.

    Returns:
        A list of form dicts; empty when the page has no conjugation table.

    Raises:
        requests.RequestException: On network failure or an HTTP error status.
    """
    url = f"https://www.pealim.com/dict/{slug}/"
    resp = requests.get(url, cookies=COOKIES, timeout=REQUEST_TIMEOUT)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, "html.parser")

    forms: list[dict] = []
    table = soup.find("table", class_="conjugation-table")
    if not table:
        return forms

    def _entry(ktiv: str, form_type: str) -> dict:
        return {
            "ktiv_male": ktiv,
            "word_nikkud": infinitive_nikkud,
            "form_type": form_type,
            "pos": "Verb",
            "slug": slug,
        }

    # Also get the infinitive from the page
    lead = soup.find("div", class_="lead")
    if lead:
        for span in lead.find_all("span", class_="menukad"):
            ktiv = span.text.strip()
            if ktiv:
                forms.append(_entry(ktiv, "infinitive"))

    # Track collected spellings in one set instead of rebuilding it per span.
    seen = {f["ktiv_male"] for f in forms}
    for row in table.find_all("tr"):
        for span in row.find_all("span", class_="menukad"):
            ktiv = span.text.strip()
            if not ktiv or ktiv in seen:
                continue
            seen.add(ktiv)
            forms.append(_entry(ktiv, "conjugation"))

    return forms
|
||||
|
||||
|
||||
def fetch_noun_ktiv_male(slug: str, singular_nikkud: str, gender: str) -> list[dict]:
    """Download a noun page and list its declension forms in ktiv male.

    The non-empty ``span.menukad`` texts of the conjugation table are labelled
    by position: the first four become absolute singular, absolute plural,
    construct singular and construct plural; any further ones are "other".
    Returns an empty list when the page has no conjugation table.
    """
    page = requests.get(
        f"https://www.pealim.com/dict/{slug}/",
        cookies=COOKIES,
        timeout=REQUEST_TIMEOUT,
    )
    page.raise_for_status()
    soup = BeautifulSoup(page.text, "html.parser")

    table = soup.find("table", class_="conjugation-table")
    if not table:
        return []

    form_labels = ["absolute_singular", "absolute_plural", "construct_singular", "construct_plural"]

    # Flatten the table into the ordered list of non-empty spellings.
    spellings = []
    for row in table.find_all("tr"):
        for span in row.find_all("span", class_="menukad"):
            text = span.text.strip()
            if text:
                spellings.append(text)

    results = []
    for position, ktiv in enumerate(spellings):
        label = form_labels[position] if position < len(form_labels) else "other"
        results.append(
            {
                "ktiv_male": ktiv,
                "word_nikkud": singular_nikkud,
                "form_type": label,
                "pos": "Noun",
                "slug": slug,
                "gender": gender,
            }
        )
    return results
|
||||
|
||||
|
||||
def scrape_verbs() -> list[dict]:
    """Scrape ktiv male forms for all verbs in conjugations.json.

    Each distinct slug is fetched once, with a DELAY-second pause after every
    request. A verb whose page cannot be fetched or parsed is logged and
    skipped so one bad page does not abort the whole scrape.

    Returns:
        All collected form dicts; an empty list when conjugations.json is
        missing.
    """
    conj_path = DATA_DIR / "conjugations.json"
    if not conj_path.exists():
        logger.warning("No conjugations.json found")
        return []

    # The file holds Hebrew text: read it as UTF-8 regardless of the locale.
    with open(conj_path, encoding="utf-8") as f:
        conjugations = json.load(f)

    all_forms = []
    slugs_done = set()

    for verb, data in conjugations.items():
        if not data or not data.get("slug"):
            continue
        slug = data["slug"]
        if slug in slugs_done:
            continue
        slugs_done.add(slug)

        try:
            forms = fetch_verb_ktiv_male(slug, verb)
        except Exception as e:  # deliberate best-effort: keep scraping the rest
            logger.warning(" Verb %s (%s) failed: %s", verb, slug, e)
        else:
            all_forms.extend(forms)
            logger.info(" Verb %s (%s): %d forms", verb, slug, len(forms))

        time.sleep(DELAY)

    return all_forms
|
||||
|
||||
|
||||
def scrape_nouns() -> list[dict]:
    """Scrape ktiv male forms for all nouns in noun_slug_map.json.

    The vocalized singular comes from noun_plurals.json when the word is
    listed there, otherwise from the slug map (falling back to the key
    itself). After every 50 processed nouns that ended in a successful fetch,
    progress is logged and a ``.partial`` snapshot is saved. A noun whose page
    fails is logged and skipped.

    Returns:
        All collected form dicts; an empty list when noun_slug_map.json is
        missing.
    """
    slug_path = DATA_DIR / "noun_slug_map.json"
    if not slug_path.exists():
        logger.warning("No noun_slug_map.json found")
        return []

    # Both files hold Hebrew text: read them as UTF-8 regardless of the locale.
    with open(slug_path, encoding="utf-8") as f:
        slug_map = json.load(f)

    # Also load existing plurals to get nikkud singular form
    plurals_path = DATA_DIR / "noun_plurals.json"
    plurals = {}
    if plurals_path.exists():
        with open(plurals_path, encoding="utf-8") as f:
            plurals = json.load(f)

    all_forms = []
    done = 0
    total = len(slug_map)

    for word, info in slug_map.items():
        slug = info.get("slug", "")
        if not slug:
            continue

        # Get nikkud form from plurals data or slug map
        nikkud = info.get("word_nikkud", word)
        if word in plurals:
            nikkud = plurals[word].get("singular", nikkud)
        gender = info.get("gender", "")

        try:
            forms = fetch_noun_ktiv_male(slug, nikkud, gender)
        except Exception as e:  # deliberate best-effort: keep scraping the rest
            logger.warning(" Noun %s (%s) failed: %s", word, slug, e)
            done += 1
        else:
            all_forms.extend(forms)
            done += 1
            if done % 50 == 0:
                logger.info(" Nouns: %d/%d (%d forms)", done, total, len(all_forms))
                # Save incrementally; a failed snapshot must not be mistaken
                # for a failed noun (and must not count the noun twice).
                try:
                    _save_forms(all_forms, partial=True)
                except OSError as e:
                    logger.warning(" Partial save failed: %s", e)

        time.sleep(DELAY)

    return all_forms
|
||||
|
||||
|
||||
def _save_forms(all_forms: list[dict], partial: bool = False):
    """Build and save the ktiv male lookup dict.

    Groups *all_forms* by their ``ktiv_male`` spelling; each stored entry is
    the form dict without that key, since the spelling is the lookup key.

    Args:
        all_forms: Form dicts, each containing a ``ktiv_male`` key.
        partial: When True, write to ``<output name>.partial`` instead of the
            final output file.
    """
    lookup: dict[str, list[dict]] = {}
    for entry in all_forms:
        stored = {k: v for k, v in entry.items() if k != "ktiv_male"}
        lookup.setdefault(entry["ktiv_male"], []).append(stored)

    suffix = ".partial" if partial else ""
    out = OUTPUT_PATH.with_name(OUTPUT_PATH.name + suffix)
    # ensure_ascii=False emits raw Hebrew, so the file must be opened as UTF-8
    # rather than with the platform's default encoding.
    with open(out, "w", encoding="utf-8") as f:
        json.dump(lookup, f, ensure_ascii=False, indent=1)

    logger.info(" Saved %d unique ktiv male forms → %s", len(lookup), out)
|
||||
|
||||
|
||||
def main():
    """Scrape ktiv male forms and save the combined lookup.

    The optional first command-line argument selects what to scrape:
    ``"all"`` (default), ``"verbs"`` or ``"nouns"``. Any other value aborts
    before anything is scraped or written; previously an unknown mode scraped
    nothing and then saved an empty lookup.

    Raises:
        SystemExit: If the mode argument is not one of the accepted values.
    """
    mode = sys.argv[1] if len(sys.argv) > 1 else "all"
    if mode not in ("all", "verbs", "nouns"):
        sys.exit(f"Unknown mode {mode!r}; expected one of: all, verbs, nouns")

    all_forms = []

    if mode in ("all", "verbs"):
        logger.info("=== Scraping verb ktiv male forms ===")
        verb_forms = scrape_verbs()
        all_forms.extend(verb_forms)
        logger.info(f"Verbs done: {len(verb_forms)} forms from {len({f['slug'] for f in verb_forms})} verbs")

    if mode in ("all", "nouns"):
        logger.info("=== Scraping noun ktiv male forms ===")
        noun_forms = scrape_nouns()
        all_forms.extend(noun_forms)
        logger.info(f"Nouns done: {len(noun_forms)} forms")

    _save_forms(all_forms)
    logger.info(f"Total: {len(all_forms)} forms → {OUTPUT_PATH}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,365 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Scrape pealim.com for noun plural and construct forms.
|
||||
|
||||
Step 1: Collect noun slugs from list pages (/dict/?pos=noun&page=N)
|
||||
Step 2: Fetch detail pages for plural + construct forms
|
||||
Step 3: Print summary statistics
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
BASE_URL = "https://www.pealim.com"
|
||||
COOKIES = {"translit": "none", "hebstyle": "mo"}
|
||||
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; PealimScraper/1.0)"}
|
||||
DATA_DIR = Path(__file__).resolve().parent.parent / "data"
|
||||
SLUG_MAP_FILE = DATA_DIR / "noun_slug_map.json"
|
||||
PROGRESS_FILE = DATA_DIR / "noun_slug_map_progress.json"
|
||||
PLURALS_FILE = DATA_DIR / "noun_plurals.json"
|
||||
DELAY = 1.5 # seconds between requests
|
||||
|
||||
|
||||
def load_json(path, default=None):
    """Load JSON from *path*, or return a fallback when the file is absent.

    Args:
        path: ``pathlib.Path`` of the JSON file.
        default: Value returned when *path* does not exist. ``None`` means
            "return a new empty dict".

    Returns:
        The parsed JSON document, or the fallback value.
    """
    if not path.exists():
        return {} if default is None else default
    # Read as UTF-8 explicitly instead of relying on the platform default,
    # which is not UTF-8 everywhere.
    with open(path, encoding="utf-8") as f:
        return json.load(f)
|
||||
|
||||
|
||||
def save_json(path, data):
    """Write *data* to *path* as indented JSON, keeping non-ASCII text readable.

    Args:
        path: Destination file path; an existing file is overwritten.
        data: JSON-serialisable object.
    """
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding is pinned to UTF-8 instead of the platform default.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(data, f, ensure_ascii=False, indent=2)
|
||||
|
||||
|
||||
def fetch_with_retry(url, max_retries=5):
    """Fetch *url* with exponential backoff between failed attempts.

    Each attempt sends ``COOKIES`` and ``HEADERS`` with a 30 second timeout
    and treats an HTTP error status as a failure. The wait before the next
    attempt doubles each time (2s, 4s, 8s, ...) and is capped at 60s. No wait
    is performed after the final attempt, since nothing follows it.

    Args:
        url: Address to fetch.
        max_retries: Maximum number of attempts.

    Returns:
        The response of the first successful attempt, or ``None`` when every
        attempt failed.
    """
    for attempt in range(max_retries):
        try:
            r = requests.get(url, cookies=COOKIES, headers=HEADERS, timeout=30)
            r.raise_for_status()
            return r
        except (requests.RequestException, ConnectionError) as e:
            if attempt == max_retries - 1:
                # Last attempt: report it, but do not sleep before giving up.
                print(f" Retry {attempt + 1}/{max_retries} for {url}: {e} (giving up)")
                break
            wait = min(2**attempt * 2, 60)
            print(f" Retry {attempt + 1}/{max_retries} for {url}: {e} (waiting {wait}s)")
            time.sleep(wait)
    print(f" FAILED after {max_retries} retries: {url}")
    return None
|
||||
|
||||
|
||||
def get_total_pages():
    """Return the highest page number linked from the first noun list page.

    Returns 0 when the first page cannot be fetched, and 1 when the page has
    no pagination link carrying a ``page=N`` parameter.
    """
    response = fetch_with_retry(f"{BASE_URL}/dict/?pos=noun&page=1")
    if not response:
        return 0

    soup = BeautifulSoup(response.text, "lxml")
    page_matches = (
        re.search(r"page=(\d+)", link.get("href", ""))
        for link in soup.select("ul.pagination li a")
    )
    page_numbers = {int(match.group(1)) for match in page_matches if match}
    if not page_numbers:
        return 1
    return max(page_numbers)
|
||||
|
||||
|
||||
def parse_list_page(html):
    """Extract noun entries from the HTML of one noun list page.

    Rows are read from ``table.dict-table`` (the first row is skipped as the
    header). A row is ignored when it has fewer than three cells, no link in
    its first cell, or a link that does not match ``/dict/<slug>/``.

    Returns:
        A list of dicts with keys ``word_plain``, ``slug``, ``word_nikkud``,
        ``pos``, ``gender`` and ``mishkal``; empty when the table is missing.
    """
    table = BeautifulSoup(html, "lxml").select_one("table.dict-table")
    if not table:
        return []

    nouns = []
    for tr in table.select("tr")[1:]:  # skip header
        cells = tr.select("td")
        if len(cells) < 3:
            continue

        # First cell carries the word and the link to its detail page.
        word_cell = cells[0]
        link = word_cell.select_one("a")
        if not link:
            continue
        slug_match = re.search(r"/dict/([^/]+)/", link.get("href", ""))
        if not slug_match:
            continue

        menukad = word_cell.select_one("span.menukad")
        word_nikkud = menukad.get_text(strip=True) if menukad else ""

        # Third cell is the part-of-speech description; gender and mishkal
        # are picked out of its lower-cased text.
        pos_text = cells[2].get_text(strip=True)
        pos_lower = pos_text.lower()
        gender = next((g for g in ("masculine", "feminine") if g in pos_lower), "")
        pattern_match = re.search(r"(\w+)\s*pattern", pos_lower)

        nouns.append(
            {
                # Same word with the U+0591..U+05C7 marks removed.
                "word_plain": re.sub(r"[\u0591-\u05C7]", "", word_nikkud),
                "slug": slug_match.group(1),
                "word_nikkud": word_nikkud,
                "pos": pos_text,
                "gender": gender,
                "mishkal": pattern_match.group(1) if pattern_match else "",
            }
        )

    return nouns
|
||||
|
||||
|
||||
def step1_collect_slugs():
    """Step 1: Collect noun slugs from list pages.

    Resumes from ``SLUG_MAP_FILE`` and ``PROGRESS_FILE``: pages recorded as
    completed are not fetched again. The slug map is keyed by the word
    without nikkud, so a later entry with the same plain spelling replaces an
    earlier one. Progress is saved every 10 processed pages and once more at
    the end.

    Returns:
        The slug map (plain word -> dict with ``slug``, ``word_nikkud``,
        ``pos``, ``gender``, ``mishkal``).
    """
    print("=" * 60)
    print("STEP 1: Collecting noun slugs from list pages")
    print("=" * 60)

    slug_map = load_json(SLUG_MAP_FILE, {})
    progress = load_json(PROGRESS_FILE, [])
    completed_pages = set(progress) if isinstance(progress, list) else set()

    # Get total pages
    total_pages = get_total_pages()
    if total_pages == 0:
        # get_total_pages() returns 0 only when the first list page could not
        # be fetched; that must not be reported as "everything is scraped".
        print("Could not determine the number of list pages; skipping step 1")
        return slug_map

    print(f"Total pages: {total_pages}")
    print(f"Already completed: {len(completed_pages)} pages, {len(slug_map)} nouns")

    remaining = [p for p in range(1, total_pages + 1) if p not in completed_pages]
    print(f"Remaining pages: {len(remaining)}")

    if not remaining:
        print("All pages already scraped!")
        return slug_map

    for i, page_num in enumerate(remaining):
        url = f"{BASE_URL}/dict/?pos=noun&page={page_num}"
        r = fetch_with_retry(url)
        if not r:
            # Not marked as completed, so a later run will try this page again.
            print(f" Skipping page {page_num}")
            continue

        entries = parse_list_page(r.text)
        for entry in entries:
            word = entry["word_plain"]
            slug_map[word] = {
                "slug": entry["slug"],
                "word_nikkud": entry["word_nikkud"],
                "pos": entry["pos"],
                "gender": entry["gender"],
                "mishkal": entry["mishkal"],
            }

        completed_pages.add(page_num)
        done = len(completed_pages)
        print(f" Page {page_num} ({done}/{total_pages}): {len(entries)} nouns (total: {len(slug_map)})")

        # Save progress every 10 pages
        if (i + 1) % 10 == 0 or page_num == remaining[-1]:
            save_json(SLUG_MAP_FILE, slug_map)
            save_json(PROGRESS_FILE, sorted(completed_pages))
            print(f" [Saved progress: {len(slug_map)} nouns, {done} pages]")

        time.sleep(DELAY)

    # Final save
    save_json(SLUG_MAP_FILE, slug_map)
    save_json(PROGRESS_FILE, sorted(completed_pages))
    print(f"\nStep 1 complete: {len(slug_map)} total nouns from {len(completed_pages)} pages")
    return slug_map
|
||||
|
||||
|
||||
def parse_detail_page(html, slug, gender, mishkal):
    """Extract singular/plural and construct forms from a noun detail page.

    Only the first ``table.conjugation-table`` is read. In a row whose header
    contains "absolute", the first two cells give the singular and plural
    forms and their audio URLs. In a row whose header contains "construct",
    they give the construct singular and plural forms.

    Args:
        html: HTML of the detail page.
        slug: Slug of the noun, copied into the result.
        gender: Gender label, copied into the result.
        mishkal: Mishkal label, copied into the result.

    Returns:
        A dict of forms (missing values are empty strings), or ``None`` when
        the page has no conjugation table.
    """
    tables = BeautifulSoup(html, "lxml").select("table.conjugation-table")
    if not tables:
        return None

    def menukad_text(cell):
        span = cell.select_one("span.menukad")
        return span.get_text(strip=True) if span else ""

    def audio_url(cell):
        # Prefer a descendant carrying data-audio; otherwise read the cell's own attribute.
        holder = cell.select_one("[data-audio]")
        return holder.get("data-audio", "") if holder else cell.get("data-audio", "")

    parsed = {
        "slug": slug,
        "singular": "",
        "singular_audio": "",
        "plural": "",
        "plural_audio": "",
        "construct_singular": "",
        "construct_plural": "",
        "gender": gender,
        "mishkal": mishkal,
    }

    absolute_keys = (("singular", "singular_audio"), ("plural", "plural_audio"))
    construct_keys = ("construct_singular", "construct_plural")

    for row in tables[0].select("tr"):
        header = row.select_one("th")
        if not header:
            continue
        label = header.get_text(strip=True).lower()
        cells = row.select("td")

        # zip() stops at the shorter sequence, so only the cells that exist
        # (at most the first two) are consumed.
        if "absolute" in label:
            for cell, (form_key, audio_key) in zip(cells, absolute_keys):
                parsed[form_key] = menukad_text(cell)
                parsed[audio_key] = audio_url(cell)
        elif "construct" in label:
            for cell, form_key in zip(cells, construct_keys):
                parsed[form_key] = menukad_text(cell)

    return parsed
|
||||
|
||||
|
||||
def step2_fetch_plurals(slug_map):
    """Step 2: Fetch detail pages for plural + construct forms.

    Words already present in ``PLURALS_FILE`` are not fetched again. A word
    whose page cannot be fetched is skipped and left out of the result. A
    page without a declension table is stored as a minimal entry flagged
    ``no_declension_table``. Results are saved every 50 processed words and
    once more at the end.

    Args:
        slug_map: Mapping of plain word -> info dict containing at least
            ``slug``.

    Returns:
        The plurals mapping (plain word -> forms dict).
    """
    print("\n" + "=" * 60)
    print("STEP 2: Fetching plural + construct forms from detail pages")
    print("=" * 60)

    plurals = load_json(PLURALS_FILE, {})
    already_done = set(plurals.keys())

    # Work list: nouns not yet in plurals
    pending = [(word, info) for word, info in slug_map.items() if word not in already_done]

    print(f"Already have plural data: {len(already_done)}")
    print(f"Remaining to fetch: {len(pending)}")

    if not pending:
        print("All nouns already have plural data!")
        return plurals

    grand_total = len(already_done) + len(pending)
    last_index = len(pending) - 1
    skipped = 0

    for index, (word, info) in enumerate(pending):
        ordinal = index + 1
        slug = info["slug"]
        response = fetch_with_retry(f"{BASE_URL}/dict/{slug}/")
        if not response:
            print(f" Skipping {word} ({slug})")
            skipped += 1
            continue

        gender = info.get("gender", "")
        mishkal = info.get("mishkal", "")
        entry = parse_detail_page(response.text, slug, gender, mishkal)
        if entry:
            plurals[word] = entry
        else:
            # No declension table - store minimal entry
            plurals[word] = {
                "slug": slug,
                "singular": info.get("word_nikkud", ""),
                "singular_audio": "",
                "plural": "",
                "plural_audio": "",
                "construct_singular": "",
                "construct_plural": "",
                "gender": gender,
                "mishkal": mishkal,
                "no_declension_table": True,
            }

        # Progress line for the first word and then every 50th.
        if index == 0 or ordinal % 50 == 0:
            done = len(already_done) + ordinal - skipped
            print(
                f" [{ordinal}/{len(pending)}] {word} ({slug}): "
                f"plural={entry['plural'] if entry else 'N/A'} "
                f"(total: {done}/{grand_total})"
            )

        # Save every 50 entries
        if ordinal % 50 == 0 or index == last_index:
            save_json(PLURALS_FILE, plurals)
            print(f" [Saved: {len(plurals)} entries]")

        time.sleep(DELAY)

    save_json(PLURALS_FILE, plurals)
    print(f"\nStep 2 complete: {len(plurals)} total noun entries with plural data")
    return plurals
|
||||
|
||||
|
||||
def step3_summary(slug_map, plurals):
    """Step 3: Print summary statistics.

    Args:
        slug_map: Mapping of plain word -> slug info; only its size is used.
        plurals: Mapping of plain word -> forms dict, as built in step 2.
    """
    print("\n" + "=" * 60)
    print("SUMMARY")
    print("=" * 60)

    entries = list(plurals.values())

    def count(predicate):
        return sum(1 for item in entries if predicate(item))

    has_plural = count(lambda v: v.get("plural"))
    has_construct = count(lambda v: v.get("construct_singular") or v.get("construct_plural"))
    has_audio = count(lambda v: v.get("singular_audio") or v.get("plural_audio"))
    no_table = count(lambda v: v.get("no_declension_table"))

    # Irregular plurals: masculine with ות- ending, feminine with ים- ending
    unexpected_ending = {"masculine": "ות", "feminine": "ים"}
    irregular = 0
    for item in entries:
        plural = item.get("plural", "")
        gender = item.get("gender", "")
        if not plural or not gender:
            continue
        ending = unexpected_ending.get(gender)
        if ending is None:
            continue
        # Compare on the spelling with the U+0591..U+05C7 marks removed.
        if re.sub(r"[\u0591-\u05C7]", "", plural).endswith(ending):
            irregular += 1

    print(f"Total nouns in slug map: {len(slug_map)}")
    print(f"Total nouns with plural data: {len(plurals)}")
    print(f" - With plural form: {has_plural}")
    print(f" - With construct forms: {has_construct}")
    print(f" - With audio URLs: {has_audio}")
    print(f" - No declension table: {no_table}")
    print(f" - Irregular plurals: {irregular}")
|
||||
|
||||
|
||||
def main():
    """Run the three scraper steps in order: slugs, plural forms, summary."""
    for banner_line in ("Pealim Noun Plural Scraper", f"Data directory: {DATA_DIR}"):
        print(banner_line)
    print()

    collected_slugs = step1_collect_slugs()
    plural_forms = step2_fetch_plurals(collected_slugs)
    step3_summary(collected_slugs, plural_forms)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,250 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Scrape ktiv male (vowelless plene) conjugation forms for top 500 verbs from pealim.com."""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
|
||||
sys.stdout.reconfigure(line_buffering=True)
|
||||
import requests # noqa: E402
|
||||
from bs4 import BeautifulSoup # noqa: E402
|
||||
|
||||
DATA_DIR = os.path.join(os.path.dirname(os.path.dirname(__file__)), "data")
|
||||
INPUT_FILE = os.path.join(DATA_DIR, "top_verbs_to_scrape.json")
|
||||
OUTPUT_FILE = os.path.join(DATA_DIR, "ktiv_male_forms.json")
|
||||
PARTIAL_FILE = os.path.join(DATA_DIR, "ktiv_male_forms_partial.json")
|
||||
PROGRESS_FILE = os.path.join(DATA_DIR, "ktiv_scrape_progress.json")
|
||||
|
||||
COOKIES = {"translit": "none", "hebstyle": "vl"}
|
||||
HEADERS = {"User-Agent": "Mozilla/5.0 (compatible; PealimScraper/1.0)"}
|
||||
DELAY = 1.5
|
||||
|
||||
session = requests.Session()
|
||||
session.cookies.update(COOKIES)
|
||||
session.headers.update(HEADERS)
|
||||
|
||||
|
||||
def load_json(path):
    """Return the parsed UTF-8 JSON document at *path*, or ``{}`` if *path* does not exist."""
    if not os.path.exists(path):
        return {}
    with open(path, encoding="utf-8") as handle:
        parsed = json.load(handle)
    return parsed
|
||||
|
||||
|
||||
def save_json(data, path):
    """Write *data* to *path* as UTF-8 JSON (indent 1, non-ASCII kept verbatim)."""
    dump_options = {"ensure_ascii": False, "indent": 1}
    with open(path, "w", encoding="utf-8") as handle:
        json.dump(data, handle, **dump_options)
|
||||
|
||||
|
||||
def search_slug(wni):
    """Search pealim for *wni* and return the slug of the first matching link.

    The first ``<a href>`` on the results page whose target starts with
    ``/dict/<digits>-<text>/`` wins.

    Returns:
        The slug string, or ``None`` when no link matches.

    Raises:
        requests.HTTPError: If the search request returns an error status.
    """
    response = session.get("https://www.pealim.com/search/", params={"q": wni}, timeout=15)
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")

    # Lazily test each link target; stop at the first /dict/SLUG/ match.
    link_matches = (re.match(r"/dict/(\d+-[^/]+)/", anchor["href"]) for anchor in soup.select("a[href]"))
    return next((match.group(1) for match in link_matches if match), None)
|
||||
|
||||
|
||||
def scrape_verb_forms(slug):
    """Fetch a verb's detail page and collect the text of its ``span.menukad`` elements.

    Every non-empty ``span.menukad`` text on the page is collected, including
    those inside ``div.lead``.

    Args:
        slug: Slug of the verb's detail page.

    Returns:
        A ``(forms, word_nikkud)`` tuple: the set of collected texts, and the
        text of the ``span.menukad`` inside the page's ``h1`` (``None`` when
        there is no such element).

    Raises:
        requests.HTTPError: If the page request returns an error status.
    """
    page = session.get(f"https://www.pealim.com/dict/{slug}/", timeout=15)
    page.raise_for_status()
    soup = BeautifulSoup(page.text, "html.parser")

    def non_empty_texts(spans):
        stripped = (span.get_text(strip=True) for span in spans)
        return {text for text in stripped if text}

    # Forms shown in the lead block (the infinitive).
    lead = soup.select_one("div.lead")
    forms = non_empty_texts(lead.select("span.menukad")) if lead else set()

    # Heading text; callers may prefer the nikkud form from their input data.
    word_nikkud = None
    heading = soup.select_one("h1")
    if heading:
        heading_span = heading.select_one("span.menukad")
        if heading_span:
            word_nikkud = heading_span.get_text(strip=True)

    # All span.menukad elements on the page.
    forms |= non_empty_texts(soup.select("span.menukad"))

    return forms, word_nikkud
|
||||
|
||||
|
||||
def _merge_forms(existing_forms, new_forms):
    """Return a new mapping of *existing_forms* plus *new_forms*, skipping entries whose slug a form already lists."""
    merged = {form: list(entries) for form, entries in existing_forms.items()}
    for form, entries in new_forms.items():
        bucket = merged.setdefault(form, [])
        known_slugs = {e["slug"] for e in bucket}
        bucket.extend(e for e in entries if e["slug"] not in known_slugs)
    return merged


def main():
    """Scrape ktiv male forms for the verbs in ``INPUT_FILE`` and merge them into ``OUTPUT_FILE``.

    The run is resumable: verbs recorded in ``PROGRESS_FILE`` are not scraped
    again, and known slugs are reused from the progress file and from
    ``conjugations.json``. Every 50 processed verbs the progress file and a
    partial merged result are written. At the end the merged result replaces
    ``OUTPUT_FILE``, the partial file is removed, and a summary is printed.

    Raises:
        SystemExit: With status 1 when the input file yields no verbs.
    """
    verbs = load_json(INPUT_FILE)
    if not verbs:
        print("ERROR: No verbs found in input file")
        sys.exit(1)

    # Load existing forms
    existing_forms = load_json(OUTPUT_FILE)
    new_forms = {}  # Will be merged into existing at the end

    # Load progress to resume
    progress = load_json(PROGRESS_FILE)
    done_wnis = set(progress.get("done_wnis", []))
    slug_cache = progress.get("slug_cache", {})

    # Pre-populate slug cache from conjugations.json
    conj_file = os.path.join(DATA_DIR, "conjugations.json")
    if os.path.exists(conj_file):
        conj_data = load_json(conj_file)
        for wni_key, cdata in conj_data.items():
            if isinstance(cdata, dict) and "slug" in cdata and wni_key not in slug_cache:
                slug_cache[wni_key] = cdata["slug"]
        print(f"Pre-populated {len(slug_cache)} slugs from conjugations.json")

    # Deduplicate verbs by wni, keeping the first occurrence
    seen_wni = set()
    unique_verbs = []
    for v in verbs:
        if v["wni"] not in seen_wni:
            seen_wni.add(v["wni"])
            unique_verbs.append(v)

    total = len(unique_verbs)
    to_scrape = [v for v in unique_verbs if v["wni"] not in done_wnis]
    print(f"Total unique verbs: {total}, already done: {total - len(to_scrape)}, to scrape: {len(to_scrape)}")

    scraped_count = 0
    skipped_count = 0
    total_new_forms = 0
    sample_verbs = {}  # For summary: wni -> list of forms

    for i, verb in enumerate(to_scrape):
        wni = verb["wni"]
        word_nikkud_input = verb["word"]

        try:
            # Step 1: Find slug
            if wni in slug_cache:
                slug = slug_cache[wni]
            else:
                slug = search_slug(wni)
                time.sleep(DELAY)

            if not slug:
                print(f" [{i + 1}/{len(to_scrape)}] SKIP {wni} - not found on pealim")
                skipped_count += 1
            else:
                slug_cache[wni] = slug

                # Step 2: Scrape forms. The nikkud read from the page is not
                # used; the form from the input data is stored instead.
                forms, _page_nikkud = scrape_verb_forms(slug)
                time.sleep(DELAY)

                # Build entries for each form, one entry per slug per form
                for form in forms:
                    bucket = new_forms.setdefault(form, [])
                    if not any(e["slug"] == slug for e in bucket):
                        bucket.append(
                            {
                                "word_nikkud": word_nikkud_input,
                                "form_type": "conjugation",
                                "pos": "Verb",
                                "slug": slug,
                            }
                        )
                        total_new_forms += 1

                scraped_count += 1
                # Collect samples (first 3 completed)
                if len(sample_verbs) < 3:
                    sample_verbs[wni] = sorted(forms)

                print(f" [{i + 1}/{len(to_scrape)}] {wni} -> {slug} ({len(forms)} forms)")

        except Exception as e:
            print(f" [{i + 1}/{len(to_scrape)}] ERROR {wni}: {e}")
            skipped_count += 1

        # Found, not found and failed verbs are all recorded as done.
        # NOTE(review): this means a verb that failed on a transient network
        # error is not retried on the next run — confirm that is intended.
        done_wnis.add(wni)

        # Save progress every 50 verbs. This runs for every verb, including
        # ones that were not found, so a checkpoint is never skipped.
        if (i + 1) % 50 == 0:
            save_json({"done_wnis": list(done_wnis), "slug_cache": slug_cache}, PROGRESS_FILE)
            save_json(_merge_forms(existing_forms, new_forms), PARTIAL_FILE)
            print(f" -- Progress saved at {i + 1}/{len(to_scrape)} --")

    # Final merge
    merged = _merge_forms(existing_forms, new_forms)
    save_json(merged, OUTPUT_FILE)

    # Save final progress
    progress = {"done_wnis": list(done_wnis), "slug_cache": slug_cache}
    save_json(progress, PROGRESS_FILE)

    # Clean up partial file
    if os.path.exists(PARTIAL_FILE):
        os.remove(PARTIAL_FILE)

    # Summary
    print(f"\n{'=' * 50}")
    print("SUMMARY")
    print(f"{'=' * 50}")
    print(f"Verbs scraped: {scraped_count}")
    print(f"Verbs skipped: {skipped_count}")
    print(f"New forms added: {total_new_forms}")
    print(f"Total unique ktiv male forms: {len(merged)}")
    print(f"Previous forms count: {len(existing_forms)}")
    print(f"Net new form keys: {len(merged) - len(existing_forms)}")

    if sample_verbs:
        print("\nSample verbs:")
        for wni, forms in list(sample_verbs.items())[:3]:
            print(f"\n {wni} ({len(forms)} forms):")
            for f in forms[:8]:
                print(f" {f}")
            if len(forms) > 8:
                print(f" ... and {len(forms) - 8} more")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
919
scripts/validate_data.py
Normal file
919
scripts/validate_data.py
Normal file
|
|
@ -0,0 +1,919 @@
|
|||
"""Standalone integrity validator for data/words.json.
|
||||
|
||||
Validates the unified Hebrew Flash Cards data against the schema defined in
|
||||
SCHEMA.yaml. Each test prints PASS/FAIL with details on failures.
|
||||
|
||||
Usage:
|
||||
python3 scripts/validate_data.py
|
||||
python3 scripts/validate_data.py --verbose
|
||||
python3 scripts/validate_data.py --test confusable_symmetric
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
import unicodedata
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Bootstrap: make project root importable so helpers.py is accessible
|
||||
# ---------------------------------------------------------------------------
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Constants
|
||||
# ---------------------------------------------------------------------------
|
||||
DATA_FILE = Path(__file__).parent.parent / "data" / "words.json"
|
||||
|
||||
HEBREW_CONSONANT_RANGE = (0x05D0, 0x05EA) # alef–tav
|
||||
|
||||
VALID_PERSON_CODES: frozenset[str] = frozenset(
|
||||
["inf", "1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp", "3p", "ms", "fs", "mp", "fp"]
|
||||
)
|
||||
|
||||
EMOJI_RE = re.compile(
|
||||
r"[\U0001f600-\U0001f64f"
|
||||
r"\U0001f300-\U0001f5ff"
|
||||
r"\U0001f680-\U0001f6ff"
|
||||
r"\U0001f1e0-\U0001f1ff"
|
||||
r"\U00002702-\U000027b0"
|
||||
r"\U0001f900-\U0001f9ff"
|
||||
r"\U0001fa00-\U0001fa6f"
|
||||
r"\U0001fa70-\U0001faff]"
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Result tracking
|
||||
# ---------------------------------------------------------------------------
|
||||
_failures: list[str] = []
|
||||
_warnings: list[str] = []
|
||||
_verbose: bool = False
|
||||
|
||||
|
||||
def _pass(name: str) -> None:
|
||||
print(f" PASS {name}")
|
||||
|
||||
|
||||
def _fail(name: str, details: list[str]) -> None:
    """Record the check *name* as failed and print it with one line per detail."""
    # The module-level list is mutated in place, which needs no ``global`` statement.
    _failures.append(name)
    print(f" FAIL {name}")
    for detail in details:
        print(f" {detail}")
|
||||
|
||||
|
||||
def _warn(name: str, details: list[str]) -> None:
    """Record *details* as warnings and print them under a WARN line for *name*."""
    # The module-level list is mutated in place, which needs no ``global`` statement.
    _warnings.extend(details)
    print(f" WARN {name}")
    for detail in details:
        print(f" {detail}")
|
||||
|
||||
|
||||
def _verbose_print(msg: str) -> None:
    """Print *msg* only when the module-level ``_verbose`` flag is set."""
    if not _verbose:
        return
    print(f" {msg}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helper: load data
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def load_data() -> dict[str, Any]:
    """Parse ``DATA_FILE`` as UTF-8 JSON and return the result.

    Prints an error and exits with status 2 when the file does not exist.
    """
    if DATA_FILE.exists():
        with DATA_FILE.open(encoding="utf-8") as handle:
            parsed = json.load(handle)
        return parsed

    print(f"ERROR: data file not found: {DATA_FILE}")
    sys.exit(2)
|
||||
|
||||
|
||||
def _is_hebrew_consonant(ch: str) -> bool:
    """Return True if *ch* starts with a Hebrew consonant (U+05D0..U+05EA).

    Multi-codepoint strings such as 'שׁ' (shin + shin dot) are accepted: the
    string is NFD-normalised and only its first codepoint is checked against
    ``HEBREW_CONSONANT_RANGE``.

    An empty string is not a consonant and yields False; it previously raised
    IndexError.
    """
    if not ch:
        return False
    # After NFD the base character comes first; any combining marks follow it.
    base = unicodedata.normalize("NFD", ch)[0]
    lowest, highest = HEBREW_CONSONANT_RANGE
    return lowest <= ord(base) <= highest
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Individual tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def test_required_fields(data: dict[str, Any]) -> None:
    """Check that every entry has word.nikkud, word.ktiv_male, slug, pos and meaning.

    A missing or null ``frequency`` is reported as a warning under the name
    ``frequency_missing`` rather than as a failure. Unless verbose mode is on,
    at most 20 warnings and 20 errors are listed.
    """
    name = "required_fields"
    shown_limit = 20
    errors: list[str] = []
    missing_frequency: list[str] = []

    for key, entry in data.items():
        word = entry.get("word")
        if isinstance(word, dict):
            for field in ("nikkud", "ktiv_male"):
                if not word.get(field):
                    errors.append(f"[{key}] word.{field} is missing or empty")
        else:
            errors.append(f"[{key}] 'word' is missing or not a dict")

        for field in ("slug", "pos", "meaning"):
            if not entry.get(field):
                errors.append(f"[{key}] '{field}' is missing or empty")

        if entry.get("frequency") is None:
            missing_frequency.append(f"[{key}] 'frequency' is null/missing")

    if missing_frequency:
        _warn("frequency_missing", missing_frequency if _verbose else missing_frequency[:shown_limit])
        hidden = len(missing_frequency) - shown_limit
        if hidden > 0 and not _verbose:
            print(f" ... ({hidden} more; use --verbose)")

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:shown_limit])
    hidden = len(errors) - shown_limit
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||||
|
||||
|
||||
def test_root_format(data: dict[str, Any]) -> None:
    """Check that each ``root`` is an empty list or a list of 2-5 Hebrew consonants.

    At most one error is reported per entry. Unless verbose mode is on, at
    most 20 errors are listed.
    """
    name = "root_format"
    shown_limit = 20
    errors: list[str] = []

    for key, entry in data.items():
        root = entry.get("root")
        if root is None:
            errors.append(f"[{key}] 'root' key is absent (should be [] for rootless words)")
        elif not isinstance(root, list):
            errors.append(f"[{key}] 'root' is not a list: {root!r}")
        elif len(root) == 0:
            pass  # rootless word — valid
        elif len(root) < 2 or len(root) > 5:
            errors.append(f"[{key}] root has {len(root)} elements (expected 2-5): {root!r}")
        else:
            for ch in root:
                # A root element may be multi-codepoint (e.g. 'שׁ' = shin + shin dot);
                # _is_hebrew_consonant is given the whole element to judge.
                if isinstance(ch, str) and ch and _is_hebrew_consonant(ch):
                    continue
                errors.append(f"[{key}] root char {ch!r} is not a Hebrew consonant (U+05D0..U+05EA)")
                break

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:shown_limit])
    hidden = len(errors) - shown_limit
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||||
|
||||
|
||||
def test_unique_slugs(data: dict[str, Any]) -> None:
    """Check that no non-empty slug is used by more than one entry.

    Entries with a missing or empty slug are ignored here. Unless verbose
    mode is on, at most 20 errors are listed.
    """
    name = "unique_slugs"
    shown_limit = 20
    keys_by_slug: dict[str, list[str]] = {}

    for key, entry in data.items():
        slug = entry.get("slug")
        if not slug:
            continue
        keys_by_slug.setdefault(slug, []).append(key)

    errors = [f"slug={slug!r} shared by: {keys}" for slug, keys in keys_by_slug.items() if len(keys) > 1]
    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:shown_limit])
    hidden = len(errors) - shown_limit
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||||
|
||||
|
||||
def test_no_duplicate_keys(_data: dict[str, Any]) -> None:  # noqa: ARG001
    """Check that the raw JSON file contains no repeated object keys.

    ``json.load`` silently keeps the last value when a key is repeated, so
    the already-parsed ``_data`` cannot reveal duplicates and is unused.
    The file is re-read with an ``object_pairs_hook`` that sees every
    key/value pair. The hook runs for every JSON object in the file, so
    repeated keys inside nested objects are reported as well as top-level
    ones.
    """
    name = "no_duplicate_keys"
    repeated: list[str] = []

    def _collect_repeats(pairs: list[tuple[str, Any]]) -> dict[str, Any]:
        built: dict[str, Any] = {}
        for pair_key, pair_value in pairs:
            if pair_key in built:
                repeated.append(pair_key)
            # Last value wins, matching what json.load does by default.
            built[pair_key] = pair_value
        return built

    with DATA_FILE.open(encoding="utf-8") as handle:
        json.load(handle, object_pairs_hook=_collect_repeats)

    if not repeated:
        _pass(name)
        return
    _fail(name, [f"duplicate key: {k!r}" for k in repeated])
|
||||
|
||||
|
||||
def test_confusable_symmetric(data: dict[str, Any]) -> None:
    """Check that ``confusable_group`` links are mutual and point at existing keys.

    For every key B listed by entry A, B must exist in *data* and must list A
    in its own ``confusable_group``. Unless verbose mode is on, at most 20
    errors are listed.
    """
    name = "confusable_symmetric"
    shown_limit = 20
    errors: list[str] = []

    for key, entry in data.items():
        for other_key in entry.get("confusable_group") or ():
            other = data.get(other_key)
            if other is None:
                errors.append(f"[{key}] confusable_group references non-existent key {other_key!r}")
            elif key not in (other.get("confusable_group") or []):
                errors.append(f"[{key}] lists {other_key!r} as confusable, but {other_key!r} does not list {key!r}")

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:shown_limit])
    hidden = len(errors) - shown_limit
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||||
|
||||
|
||||
def test_shared_roots_valid_keys(data: dict[str, Any]) -> None:
    """Check that every key listed in an entry's ``shared_roots`` exists in *data*.

    Unless verbose mode is on, at most 20 errors are listed.
    """
    name = "shared_roots_valid_keys"
    shown_limit = 20
    errors: list[str] = []

    for key, entry in data.items():
        references = entry.get("shared_roots")
        if not references:
            continue
        errors.extend(
            f"[{key}] shared_roots references non-existent key {ref_key!r}"
            for ref_key in references
            if ref_key not in data
        )

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:shown_limit])
    hidden = len(errors) - shown_limit
    if hidden > 0 and not _verbose:
        print(f" ... ({hidden} more; use --verbose)")
|
||||
|
||||
|
||||
def test_unique_legacy_guids(data: dict[str, Any]) -> None:
    """Check that non-empty ``vocab_legacy_guid`` values are not shared.

    A GUID shared by several entries is tolerated when all of those entries
    have the same ``word.nikkud`` value (the case is only reported through
    ``_verbose_print``). A GUID shared by entries whose ``word.nikkud``
    values differ is an error.

    Args:
        data: The parsed dataset, keyed by entry key.
    """
    name = "unique_legacy_guids"

    # guid -> keys of the entries carrying it, in dataset order.
    owners: dict[str, list[str]] = {}
    for key, entry in data.items():
        guid = entry.get("vocab_legacy_guid")
        if not guid:
            continue
        if guid in owners:
            owners[guid].append(key)
        else:
            owners[guid] = [key]

    errors: list[str] = []
    for guid, keys in owners.items():
        if len(keys) < 2:
            continue
        nikkud_values = {(data[k].get("word") or {}).get("nikkud") for k in keys}
        if len(nikkud_values) != 1:
            errors.append(f"guid={guid!r} shared by entries with DIFFERENT nikkud: {keys}")
            continue
        # Every holder has the same nikkud: tolerated, only mentioned verbosely.
        _verbose_print(
            f"GUID {guid!r} shared by {len(keys)} entries with same nikkud ({next(iter(nikkud_values))!r}): {keys}"
        )

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_no_noun_inflection_on_non_nouns(data: dict[str, Any]) -> None:
    """Check that only entries whose ``pos`` starts with 'Noun' carry ``noun_inflection``.

    An entry fails when ``noun_inflection`` is anything other than ``None``
    while its ``pos`` (treated as "" when missing/empty) does not start with
    the literal prefix ``"Noun"``.

    Args:
        data: The parsed dataset, keyed by entry key.
    """
    name = "no_noun_inflection_on_non_nouns"
    errors: list[str] = []

    for key, entry in data.items():
        pos = entry.get("pos") or ""
        noun_inf = entry.get("noun_inflection")
        if pos.startswith("Noun") or noun_inf is None:
            continue
        errors.append(f"[{key}] pos={pos!r} but noun_inflection is set")
        _verbose_print(f"offending entry: {key!r}")

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_no_emoji_in_meaning(data: dict[str, Any]) -> None:
    """Check that no entry's ``meaning`` matches ``EMOJI_RE``.

    Args:
        data: The parsed dataset, keyed by entry key.
    """
    name = "no_emoji_in_meaning"
    errors: list[str] = []

    for key, entry in data.items():
        meaning = entry.get("meaning") or ""
        if EMOJI_RE.search(meaning) is None:
            continue
        errors.append(f"[{key}] meaning contains emoji: {meaning!r}")

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_example_sentences_contain_word(data: dict[str, Any]) -> None:
    """Warn when an entry's ``word.nikkud`` appears in none of its vetted sentences.

    Matching is an exact substring test on the pointed (nikkud) form. Entries
    without vetted examples or without a nikkud form are skipped. Findings
    are reported with ``_warn`` only; the test is always marked as passed.

    Args:
        data: The parsed dataset, keyed by entry key.
    """
    name = "example_sentences_contain_word"
    errors: list[str] = []

    for key, entry in data.items():
        vetted = (entry.get("examples") or {}).get("vetted")
        if not vetted:
            continue

        nikkud_word = (entry.get("word") or {}).get("nikkud") or ""
        if not nikkud_word:
            continue

        for sentence in vetted:
            if nikkud_word in (sentence.get("text") or ""):
                break
        else:
            # No sentence contained the word: quote the first two for context.
            sentences_preview = [s.get("text", "") for s in vetted[:2]]
            errors.append(
                f"[{key}] word {nikkud_word!r} not found in any vetted sentence. Sentences: {sentences_preview!r}"
            )

    if errors:
        _warn(name, errors if _verbose else errors[:20])
        if len(errors) > 20 and not _verbose:
            print(f" ... ({len(errors) - 20} more; use --verbose)")
    # Warn-only check: it is recorded as passed regardless of findings.
    _pass(name)
|
||||
|
||||
|
||||
def test_cloze_offsets_valid(data: dict[str, Any]) -> None:
    """Check ``examples.cloze`` offsets against the cloze text.

    When both ``cloze_word_start`` and ``cloze_word_end`` are present they
    must be integers, non-negative, satisfy ``start < end``, and ``end`` must
    not exceed the length of the cloze text. If either offset is ``None`` the
    entry is only reported as a warning (under ``cloze_offsets_valid_null_offsets``)
    and does not fail the test.

    Args:
        data: The parsed dataset, keyed by entry key.
    """
    name = "cloze_offsets_valid"
    errors: list[str] = []
    null_warn: list[str] = []

    def _offset_problem(text: str, start: Any, end: Any) -> "str | None":
        """Return the first rule violated by (start, end), or None if valid."""
        text_len = len(text)
        if not (isinstance(start, int) and isinstance(end, int)):
            return f"cloze_word_start/end are not integers: {start!r}, {end!r}"
        if start < 0 or end < 0:
            return f"cloze offsets are negative: start={start}, end={end}"
        if start >= end:
            return f"cloze start >= end: start={start}, end={end}"
        if end > text_len:
            return f"cloze end={end} exceeds text length={text_len}: {text!r}"
        return None

    for key, entry in data.items():
        cloze = (entry.get("examples") or {}).get("cloze")
        if not cloze:
            continue

        text = cloze.get("text") or ""
        start = cloze.get("cloze_word_start")
        end = cloze.get("cloze_word_end")

        if start is None or end is None:
            null_warn.append(f"[{key}] cloze present but cloze_word_start/end are null")
            continue

        problem = _offset_problem(text, start, end)
        if problem is not None:
            errors.append(f"[{key}] {problem}")

    # Null offsets are surfaced first, as warnings only.
    if null_warn:
        _warn(f"{name}_null_offsets", null_warn if _verbose else null_warn[:20])
        if len(null_warn) > 20 and not _verbose:
            print(f" ... ({len(null_warn) - 20} more; use --verbose)")

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_hufal_pual_only_on_hifil_piel(data: dict[str, Any]) -> None:
    """Check that ``hufal_pual_forms`` is only set on Hif'il or Pi'el verbs.

    The binyan is recognised by a case-insensitive substring test: the
    lower-cased ``conjugation.binyan`` must contain ``"hif"`` or ``"pi"``.

    Args:
        data: The parsed dataset, keyed by entry key.
    """
    name = "hufal_pual_only_on_hifil_piel"
    errors: list[str] = []

    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj or conj.get("hufal_pual_forms") is None:
            continue

        binyan = conj.get("binyan") or ""
        lowered = binyan.lower()
        if not any(marker in lowered for marker in ("hif", "pi")):
            errors.append(f"[{key}] hufal_pual_forms is set but binyan={binyan!r} (expected Hif'il or Pi'el)")

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_confusable_group_shares_ktiv_male(data: dict[str, Any]) -> None:
    """Check that confusable partners have the same ``word.ktiv_male``.

    For each entry with a ``confusable_group`` and a non-empty ``ktiv_male``,
    every listed partner that exists and has its own non-empty ``ktiv_male``
    must match. Partners that are missing from ``data`` are ignored here.

    Args:
        data: The parsed dataset, keyed by entry key.
    """
    name = "confusable_group_shares_ktiv_male"
    errors: list[str] = []

    for key, entry in data.items():
        group = entry.get("confusable_group")
        if not group:
            continue

        my_ktiv = (entry.get("word") or {}).get("ktiv_male")
        if not my_ktiv:
            continue

        for other_key in group:
            # A missing partner degrades to "no ktiv_male", i.e. nothing to compare.
            partner = data.get(other_key) or {}
            other_ktiv = (partner.get("word") or {}).get("ktiv_male")
            if other_ktiv and other_ktiv != my_ktiv:
                errors.append(
                    f"[{key}] ktiv_male={my_ktiv!r} but confusable member {other_key!r} has ktiv_male={other_ktiv!r}"
                )

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_confusables_guid(data: dict[str, Any]) -> None:
    """Check ``confusables_guid`` against ``confusable_group``.

    Rules enforced:
      - an entry with a non-empty ``confusable_group`` must have a non-empty
        ``confusables_guid``;
      - an entry without a ``confusable_group`` must have ``confusables_guid``
        equal to ``None``;
      - every existing partner listed in the group must carry the same
        ``confusables_guid`` as the entry itself.

    Args:
        data: The parsed dataset, keyed by entry key.
    """
    name = "confusables_guid"
    errors: list[str] = []

    for key, entry in data.items():
        group = entry.get("confusable_group")
        guid = entry.get("confusables_guid")

        if not group:
            if guid is not None:
                errors.append(f"[{key}] has confusables_guid={guid!r} but confusable_group is null")
            continue
        if not guid:
            errors.append(f"[{key}] has confusable_group but confusables_guid is null/missing")
            continue

        for other_key in group:
            partner = data.get(other_key)
            if not partner:
                continue  # nothing to compare against for a missing partner
            other_guid = partner.get("confusables_guid")
            if other_guid == guid:
                continue
            errors.append(
                f"[{key}] confusables_guid={guid!r} but confusable member "
                f"{other_key!r} has confusables_guid={other_guid!r}"
            )

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_conjugation_form_guids(data: dict[str, Any]) -> None:
    """Check GUIDs on conjugation forms.

    For every form in ``active_forms`` and ``hufal_pual_forms``:
      - a form with neither ``guid`` nor ``guid_candidates`` is counted and
        reported as a single summary warning (it does not fail the test);
      - a form's ``guid`` (or, when ``guid`` is empty, each of its
        ``guid_candidates``) must not repeat an identifier already seen on
        another form of the same verb, across both form lists.

    Args:
        data: The parsed dataset, keyed by entry key.
    """
    name = "conjugation_form_guids"
    errors: list[str] = []
    warnings: list[str] = []

    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj:
            continue

        # identifier -> label of the first form that used it (per verb).
        seen_guids: dict[str, str] = {}

        for form_list_key in ("active_forms", "hufal_pual_forms"):
            for form in conj.get(form_list_key) or ():
                label = f"{form_list_key}[{form.get('person', '?')}]"
                guid = form.get("guid")
                guid_candidates = form.get("guid_candidates")

                # An explicit guid takes precedence; candidates are only
                # examined when no guid is present.
                if guid:
                    kind, identifiers = "guid", [guid]
                elif guid_candidates:
                    kind, identifiers = "guid_candidate", guid_candidates
                else:
                    warnings.append(f"[{key}] {label}: missing or null 'guid' and no 'guid_candidates'")
                    continue

                for identifier in identifiers:
                    if identifier in seen_guids:
                        errors.append(
                            f"[{key}] {label}: {kind}={identifier!r} duplicates {seen_guids[identifier]}"
                        )
                    else:
                        seen_guids[identifier] = label

    if warnings:
        _warn(name + "_missing", [f"{len(warnings)} forms missing guid (deterministic fallback used)"])

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_conjugation_person_codes(data: dict[str, Any]) -> None:
    """Check that every form's ``person`` is a member of ``VALID_PERSON_CODES``.

    Both ``active_forms`` and ``hufal_pual_forms`` are inspected.

    Args:
        data: The parsed dataset, keyed by entry key.
    """
    name = "conjugation_person_codes"
    errors: list[str] = []

    for key, entry in data.items():
        conj = entry.get("conjugation")
        if not conj:
            continue
        for form_list_key in ("active_forms", "hufal_pual_forms"):
            for form in conj.get(form_list_key) or ():
                person = form.get("person")
                if person in VALID_PERSON_CODES:
                    continue
                errors.append(f"[{key}] {form_list_key}: invalid person code {person!r}")

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_no_stripped_form_sentence_collisions(data: dict[str, Any]) -> None:
    """Warn when an entry's vetted sentences contain a confusable partner's nikkud form.

    For each entry with a ``confusable_group`` and vetted sentences, every
    existing partner whose ``word.nikkud`` is non-empty and differs from the
    entry's own is searched for (as a substring) in the entry's sentence
    texts. At most one finding is recorded per (entry, partner) pair.
    Findings are reported with ``_warn`` only; the test is always marked as
    passed.

    Args:
        data: The parsed dataset, keyed by entry key.
    """
    name = "no_stripped_form_sentence_collisions"
    errors: list[str] = []

    for key, entry in data.items():
        group = entry.get("confusable_group")
        if not group:
            continue

        vetted = (entry.get("examples") or {}).get("vetted")
        if not vetted:
            continue

        my_nikkud = (entry.get("word") or {}).get("nikkud") or ""
        my_texts = [sentence.get("text") or "" for sentence in vetted]

        for other_key in group:
            partner = data.get(other_key)
            if not partner:
                continue
            other_nikkud = (partner.get("word") or {}).get("nikkud") or ""
            # Identical pointed forms cannot be told apart by nikkud, so skip them.
            if not other_nikkud or other_nikkud == my_nikkud:
                continue

            # First sentence containing the partner's form, if any.
            offending = next((text for text in my_texts if other_nikkud in text), None)
            if offending is None:
                continue
            errors.append(f"[{key}] sentence contains wrong homograph {other_nikkud!r}: {offending!r}")
            _verbose_print(f" my word: {my_nikkud!r}, wrong form: {other_nikkud!r}")

    if errors:
        _warn(name, errors if _verbose else errors[:20])
        if len(errors) > 20 and not _verbose:
            print(f" ... ({len(errors) - 20} more; use --verbose)")
    # Warn-only check: it is recorded as passed regardless of findings.
    _pass(name)
|
||||
|
||||
|
||||
def test_no_shared_confusable_examples(data: dict[str, Any]) -> None:
    """Check that no two members of a confusable group have identical vetted sentences.

    Entries are bucketed into groups, and within each group the set of vetted
    sentence texts of every member is compared; two members with the same
    non-empty set are reported as an error.

    The group identity is built from the entry's own key together with the
    keys in its ``confusable_group``. Using the list alone would give the
    members of a group different identities whenever an entry's list names
    only its partners (A -> [B], B -> [A]), so they would never be compared;
    including the entry's own key yields the same identity for every member
    and is a no-op when the list already contains the entry itself.

    Args:
        data: The parsed dataset, keyed by entry key.
    """
    name = "no_shared_confusable_examples"
    errors: list[str] = []

    from collections import defaultdict

    # group identity (sorted member keys) -> members, in dataset order
    group_map: dict[tuple[str, ...], list[str]] = defaultdict(list)
    for key, entry in data.items():
        cg = entry.get("confusable_group")
        if cg:
            group_id = tuple(sorted({key, *cg}))
            group_map[group_id].append(key)

    for members in group_map.values():
        if len(members) < 2:
            continue

        # Sentence-text set per member; members without sentences are ignored.
        text_sets: dict[str, frozenset[str]] = {}
        for key in members:
            vetted = (data[key].get("examples") or {}).get("vetted") or []
            texts = frozenset(e.get("text", "") for e in vetted)
            if texts:
                text_sets[key] = texts

        # Report each member whose set equals that of an earlier member.
        seen: dict[frozenset[str], str] = {}
        for key, texts in text_sets.items():
            if texts in seen:
                meaning_a = (data[seen[texts]].get("meaning") or "")[:30]
                meaning_b = (data[key].get("meaning") or "")[:30]
                errors.append(
                    f"{seen[texts]} ({meaning_a}) and {key} ({meaning_b}) share {len(texts)} identical example(s)"
                )
            else:
                seen[texts] = key

    if errors:
        _fail(name, errors[:20] if not _verbose else errors)
        if len(errors) > 20 and not _verbose:
            print(f" ... ({len(errors) - 20} more; use --verbose)")
    else:
        _pass(name)
|
||||
|
||||
|
||||
def test_no_hebrew_in_meaning(data: dict[str, Any]) -> None:
    """Check that no Hebrew letters remain in ``meaning`` after cleaning.

    Each meaning is cleaned by deleting runs that start with a character in
    U+0590-U+05FF, collapsing repeated whitespace and stripping ``", ;:"``
    from the ends; the cleaned text is then searched for U+05D0-U+05EA.

    Args:
        data: The parsed dataset, keyed by entry key.
    """
    name = "no_hebrew_in_meaning"
    errors: list[str] = []

    hebrew_re = re.compile(r"[\u05D0-\u05EA]")
    hebrew_run_re = re.compile(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*")
    multi_space_re = re.compile(r"\s{2,}")

    # NOTE(review): the cleaning pattern deletes every character in
    # U+0590-U+05FF, which contains the U+05D0-U+05EA range searched below,
    # so as written this check cannot report an error. Confirm whether it
    # should instead look for Hebrew outside that block (e.g. presentation
    # forms) or inspect the uncleaned meaning.
    for key, entry in data.items():
        meaning = entry.get("meaning") or ""
        without_hebrew = hebrew_run_re.sub("", meaning)
        cleaned = multi_space_re.sub(" ", without_hebrew).strip(", ;:")
        if hebrew_re.search(cleaned):
            errors.append(f"[{key}] meaning still contains Hebrew after cleaning: {cleaned!r}")

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
def test_mishkal_consistency(data: dict[str, Any]) -> None:
    """Check ``mishkal_hebrew`` against ``mishkal`` in noun/adjective inflections.

    For each of ``noun_inflection`` and ``adjective_inflection``:
      - ``mishkal_hebrew`` without ``mishkal`` is an error;
      - when both are set and ``_mishkal_to_hebrew(mishkal)`` returns a
        non-empty value, that value must equal ``mishkal_hebrew``.

    If ``pealim_detail_scrape`` cannot be imported the check is skipped with
    a warning.

    Args:
        data: The parsed dataset, keyed by entry key.
    """
    name = "mishkal_consistency"
    errors: list[str] = []

    try:
        from pealim_detail_scrape import _mishkal_to_hebrew
    except ImportError:
        _warn(name, ["Could not import _mishkal_to_hebrew — skipping"])
        return

    for key, entry in data.items():
        for infl_key in ("noun_inflection", "adjective_inflection"):
            infl = entry.get(infl_key)
            if not infl:
                continue

            mishkal_eng = infl.get("mishkal") or ""
            mishkal_heb = infl.get("mishkal_hebrew") or ""
            if not mishkal_heb:
                continue  # nothing to validate without a Hebrew value
            if not mishkal_eng:
                errors.append(f"[{key}] {infl_key}: has mishkal_hebrew but no mishkal")
                continue

            expected = _mishkal_to_hebrew(mishkal_eng) or ""
            # An empty conversion result is treated as "no expectation".
            if expected and expected != mishkal_heb:
                errors.append(f"[{key}] {infl_key}: {mishkal_eng}→{mishkal_heb} (expected {expected})")

    if not errors:
        _pass(name)
        return

    _fail(name, errors if _verbose else errors[:20])
    if len(errors) > 20 and not _verbose:
        print(f" ... ({len(errors) - 20} more; use --verbose)")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Stats summary
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def print_stats(data: dict[str, Any]) -> None:
    """Print how many entries populate each optional part of the dataset.

    A field counts as populated when its value is truthy. "With mishkal"
    counts entries whose noun or adjective inflection has a truthy
    ``mishkal``.

    Args:
        data: The parsed dataset, keyed by entry key.
    """
    entries = list(data.values())

    def _count(predicate) -> int:
        """Number of entries for which *predicate* returns a truthy value."""
        return sum(1 for e in entries if predicate(e))

    def _has(field: str):
        """Predicate: the top-level *field* of an entry is truthy."""
        return lambda e: e.get(field)

    def _has_example(kind: str):
        """Predicate: ``examples[kind]`` of an entry is truthy."""
        return lambda e: (e.get("examples") or {}).get(kind)

    def _has_mishkal(e: dict[str, Any]):
        return (e.get("noun_inflection") or {}).get("mishkal") or (e.get("adjective_inflection") or {}).get(
            "mishkal"
        )

    # (label, count) rows in the order they are printed.
    rows = [
        (" Total entries:", len(data)),
        (" With conjugation data:", _count(_has("conjugation"))),
        (" With noun_inflection:", _count(_has("noun_inflection"))),
        (" With mishkal:", _count(_has_mishkal)),
        (" With vetted examples:", _count(_has_example("vetted"))),
        (" With cloze examples:", _count(_has_example("cloze"))),
        (" With images:", _count(_has("image"))),
        (" With emoji:", _count(_has("emoji"))),
        (" With legacy GUIDs:", _count(_has("vocab_legacy_guid"))),
        (" In confusable groups:", _count(_has("confusable_group"))),
        (" With shared roots:", _count(_has("shared_roots"))),
    ]

    print()
    print("Stats Summary")
    print("─" * 42)
    for label, value in rows:
        print(f"{label} {value:>6}")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test registry
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Registry of validation tests: maps the name accepted by the --test option
# to the function implementing it. When no single test is requested, main()
# runs the functions in this insertion order, passing each the loaded data.
ALL_TESTS: dict[str, Any] = {
    "required_fields": test_required_fields,
    "root_format": test_root_format,
    "unique_slugs": test_unique_slugs,
    "no_duplicate_keys": test_no_duplicate_keys,
    "confusable_symmetric": test_confusable_symmetric,
    "shared_roots_valid_keys": test_shared_roots_valid_keys,
    "unique_legacy_guids": test_unique_legacy_guids,
    "no_noun_inflection_on_non_nouns": test_no_noun_inflection_on_non_nouns,
    "no_emoji_in_meaning": test_no_emoji_in_meaning,
    "example_sentences_contain_word": test_example_sentences_contain_word,
    "cloze_offsets_valid": test_cloze_offsets_valid,
    "hufal_pual_only_on_hifil_piel": test_hufal_pual_only_on_hifil_piel,
    "confusable_group_shares_ktiv_male": test_confusable_group_shares_ktiv_male,
    "confusables_guid": test_confusables_guid,
    "conjugation_form_guids": test_conjugation_form_guids,
    "conjugation_person_codes": test_conjugation_person_codes,
    "no_stripped_form_sentence_collisions": test_no_stripped_form_sentence_collisions,
    "no_shared_confusable_examples": test_no_shared_confusable_examples,
    "no_hebrew_in_meaning": test_no_hebrew_in_meaning,
    "mishkal_consistency": test_mishkal_consistency,
}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Entry point
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def main() -> None:
    """Command-line entry point: run the validation tests and exit.

    Parses ``--verbose``/``-v`` and ``--test NAME``, loads the data, runs
    either the named test or every test in ``ALL_TESTS``, prints the stats
    summary when the full suite was run, and exits with status 1 if any test
    recorded a failure, 0 otherwise. An unknown ``--test`` name exits with
    status 2.
    """
    global _verbose

    available = ", ".join(ALL_TESTS)
    rule = "─" * 60

    parser = argparse.ArgumentParser(description="Validate data/words.json against the Hebrew Flash Cards schema.")
    parser.add_argument(
        "--verbose",
        "-v",
        action="store_true",
        help="Print full details for all failures (not just first 20).",
    )
    parser.add_argument(
        "--test",
        metavar="NAME",
        help=f"Run a single test by name. Available: {available}",
    )
    args = parser.parse_args()
    _verbose = args.verbose

    data = load_data()

    # Decide which tests run: the whole registry, or just the one requested.
    requested = args.test
    if not requested:
        tests_to_run = ALL_TESTS
    elif requested in ALL_TESTS:
        tests_to_run = {requested: ALL_TESTS[requested]}
    else:
        print(f"ERROR: unknown test {requested!r}. Available: {available}")
        sys.exit(2)

    print(f"Validating {DATA_FILE} ({len(data)} entries)")
    print(rule)

    # Every test receives the pre-parsed dict.
    for run_test in tests_to_run.values():
        run_test(data)

    # The stats table is only shown for a full run.
    if not requested:
        print_stats(data)

    print()
    print(rule)
    if _warnings:
        print(f" Warnings : {len(_warnings)}")

    if _failures:
        print(f" FAILED: {len(_failures)} test(s): {', '.join(_failures)}")
        exit_code = 1
    else:
        print(f" All {len(tests_to_run)} test(s) passed.")
        exit_code = 0
    sys.exit(exit_code)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
198
sentence_difficulty.py
Normal file
198
sentence_difficulty.py
Normal file
|
|
@ -0,0 +1,198 @@
|
|||
"""Sentence difficulty scoring by context-word frequency.
|
||||
|
||||
Scores sentences by the median frequency rank of context words
|
||||
(excluding the cloze target). Lower score = easier sentence.
|
||||
Used by epub_examples.py to select the best cloze sentence.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from statistics import median
|
||||
|
||||
import helpers
|
||||
import nikkud_to_ktiv_male
|
||||
|
||||
# Rank returned for tokens that cannot be resolved to any frequency entry,
# and for sentences without scoreable context tokens.
DEFAULT_RANK = 50_000

# Hebrew prefix consonants for ktiv_male prefix stripping (tier 5).
# Only the one-letter prefix particles are listed (bet, he, vav, kaf, lamed,
# mem, shin). Ayin is not a prefix particle: treating it as one would let an
# ayin-initial word be cut down to an unrelated stem and pick up that stem's
# frequency rank.
_KM_PREFIX_CHARS = set("בהוכלמש")

# Punctuation to strip from the edges of tokens
_PUNCT = set('.,!?;:"\'"״׳–—()[]{}')

# Maqaf (Hebrew hyphen) — splits tokens
_MAQAF = "־"
|
||||
|
||||
|
||||
def build_nikkud_map(words: dict) -> dict[str, str]:
    """Build a nikkud -> ktiv_male lookup from the words dataset.

    For every entry the following forms are indexed, in this order: the
    headword; conjugation ``active_forms`` and ``hufal_pual_forms``, then the
    ``infinitive`` and ``reference_form``; noun inflection ``singular``,
    ``plural``, ``construct_singular`` and ``construct_plural`` (a form
    ending in a maqaf is additionally indexed without it), then the
    ``pronominal_suffixes`` values that are dicts; adjective inflection
    ``ms``/``fs``/``mp``/``fp``.

    Args:
        words: The dataset dict; only its values are read.

    Returns:
        Dict mapping each non-empty nikkud form to its non-empty ktiv_male.
        When the same nikkud form occurs more than once, the last occurrence
        wins.
    """

    def _pair(form: dict) -> tuple:
        """(nikkud, ktiv_male) of one form dict."""
        return form.get("nikkud"), form.get("ktiv_male")

    def _candidates(entry: dict):
        """Yield every (nikkud, ktiv_male) candidate of one entry, in indexing order."""
        yield _pair(entry.get("word") or {})

        conj = entry.get("conjugation") or {}
        for list_key in ("active_forms", "hufal_pual_forms"):
            for item in conj.get(list_key) or []:
                yield _pair(item.get("form") or {})
        for single_key in ("infinitive", "reference_form"):
            yield _pair(conj.get(single_key) or {})

        noun = entry.get("noun_inflection") or {}
        for field in ("singular", "plural", "construct_singular", "construct_plural"):
            pointed, plain = _pair(noun.get(field) or {})
            yield pointed, plain
            # Also index a maqaf-terminated form without its trailing maqaf.
            if pointed and pointed.endswith("־") and plain:
                yield pointed[:-1], plain
        for suffix_form in (noun.get("pronominal_suffixes") or {}).values():
            if isinstance(suffix_form, dict):
                yield _pair(suffix_form)

        adj = entry.get("adjective_inflection") or {}
        for field in ("ms", "fs", "mp", "fp"):
            yield _pair(adj.get(field) or {})

    # Later duplicates overwrite earlier ones; incomplete pairs are dropped.
    return {
        pointed: plain
        for entry in words.values()
        for pointed, plain in _candidates(entry)
        if pointed and plain
    }
|
||||
|
||||
|
||||
def _resolve_token_frequency(
    token: str,
    nikkud_map: dict[str, str],
    nikkud_index: dict,
    freq_data: dict[str, int],
) -> int:
    """Resolve a pointed sentence token to a frequency rank.

    The first tier that produces a form present in ``freq_data`` wins:
      1. direct lookup of the token in ``nikkud_map``;
      2. ``epub_examples.try_strip_prefix`` on the token, with each matched
         remainder looked up in ``nikkud_map``;
      3. ``nikkud_to_ktiv_male.convert`` on the token;
      4. ``helpers.strip_nikkud`` on the token;
      5. the tier-3 and tier-4 forms with a one- or two-letter prefix
         (letters from ``_KM_PREFIX_CHARS``) removed.

    Args:
        token: The token to resolve.
        nikkud_map: nikkud -> ktiv_male mapping.
        nikkud_index: Passed through to ``try_strip_prefix``.
        freq_data: Mapping from written form to frequency rank.

    Returns:
        The rank found in ``freq_data``, or ``DEFAULT_RANK`` if no tier hits.
    """
    # Tier 1: the token itself is a known pointed form.
    known = nikkud_map.get(token)
    if known and known in freq_data:
        return freq_data[known]

    # Tier 2: strip a pointed prefix, then resolve what remains.
    # Imported here rather than at module level, as in the original code.
    from epub_examples import try_strip_prefix

    for _unique_key, _match_type, remainder in try_strip_prefix(token, nikkud_index):
        mapped = nikkud_map.get(remainder)
        if mapped and mapped in freq_data:
            return freq_data[mapped]

    # Tier 3: rule-based conversion of the pointed token.
    converted = nikkud_to_ktiv_male.convert(token)
    if converted in freq_data:
        return freq_data[converted]

    # Tier 4: plain removal of the pointing (skipped when it equals tier 3).
    stripped = helpers.strip_nikkud(token)
    if stripped != converted and stripped in freq_data:
        return freq_data[stripped]

    # Tier 5: drop a 1- or 2-letter prefix from the unpointed forms; the
    # remaining stem must keep at least two characters.
    for candidate in (converted, stripped):
        for cut in (1, 2):
            if len(candidate) <= cut + 1:
                continue
            if not all(letter in _KM_PREFIX_CHARS for letter in candidate[:cut]):
                continue
            stem = candidate[cut:]
            if stem in freq_data:
                return freq_data[stem]

    return DEFAULT_RANK
|
||||
|
||||
|
||||
def score_sentence(
    text: str,
    target_start: int,
    target_end: int,
    nikkud_map: dict[str, str],
    nikkud_index: dict,
    freq_data: dict[str, int],
) -> int:
    """Score a sentence by the median frequency rank of its context tokens.

    The text is split on whitespace and then on the maqaf. Tokens whose
    character span overlaps ``[target_start, target_end)`` are excluded, as
    are tokens shorter than two characters after edge punctuation is
    stripped. Each remaining token is ranked with
    ``_resolve_token_frequency``.

    Args:
        text: The full sentence text.
        target_start: Character offset where the target word starts.
        target_end: Character offset where the target word ends.
        nikkud_map: nikkud -> ktiv_male mapping (see ``build_nikkud_map``).
        nikkud_index: Passed through to ``_resolve_token_frequency``.
        freq_data: Mapping from written form to frequency rank.

    Returns:
        The median rank of the context tokens, truncated to ``int``; or
        ``DEFAULT_RANK`` when there are no context tokens to score.
    """
    # Locate every maqaf-separated piece together with its character span.
    spans: list[tuple[str, int, int]] = []
    cursor = 0
    for chunk in text.split():
        chunk_start = text.index(chunk, cursor)
        cursor = chunk_start + len(chunk)

        offset = chunk_start
        for piece in chunk.split(_MAQAF):
            piece_end = offset + len(piece)
            if piece:
                spans.append((piece, offset, piece_end))
            offset = piece_end + 1  # step over the maqaf itself

    edge_chars = "".join(_PUNCT)
    context_ranks: list[int] = []
    for piece, lo, hi in spans:
        # Keep only pieces lying entirely before or entirely after the target.
        if not (lo >= target_end or hi <= target_start):
            continue
        core = piece.strip(edge_chars)
        if len(core) >= 2:
            context_ranks.append(_resolve_token_frequency(core, nikkud_map, nikkud_index, freq_data))

    return int(median(context_ranks)) if context_ranks else DEFAULT_RANK
|
||||
246
tests/test_apkg_builder.py
Normal file
246
tests/test_apkg_builder.py
Normal file
|
|
@ -0,0 +1,246 @@
|
|||
"""Unit tests for apkg_builder — Sprint 15 learnings.
|
||||
|
||||
Tests cover: cloze prefix preservation, Hebrew spoiler stripping from English
|
||||
meanings, PoS exact matching, gender field population, and mishkal data integrity.
|
||||
"""
|
||||
|
||||
import json
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
# Ensure project root is on path
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
|
||||
from apkg_builder import _categorize_pos, _cloze_prefix_len
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Cloze prefix preservation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestClozePrefix:
    """Checks how _cloze_prefix_len reports Hebrew prefix letters in front of a word."""

    def test_single_prefix_bet(self):
        # Token is the bare word with a single bet prefix attached.
        prefixed, bare = "בַּתּוֹר", "תּוֹר"
        assert _cloze_prefix_len(prefixed, bare) > 0

    def test_single_prefix_lamed(self):
        # Token is the bare word with a single lamed prefix attached.
        prefixed, bare = "לַמֶּלֶךְ", "מֶּלֶךְ"
        assert _cloze_prefix_len(prefixed, bare) > 0

    def test_two_consonant_prefix(self):
        # Two stacked prefix letters (shin + bet) precede the word.
        prefixed = "שֶׁבַּתּוֹר"
        bare = "תּוֹר"
        cut = _cloze_prefix_len(prefixed, bare)
        assert cut > 0
        # Whatever follows the reported prefix must begin with the bare word.
        assert prefixed[cut:].startswith(bare)

    def test_no_prefix_direct_match(self):
        # Token and word are identical, so nothing precedes the word.
        bare = "תּוֹר"
        assert _cloze_prefix_len(bare, bare) == 0

    def test_empty_inputs(self):
        # An empty token, an empty word, or both must all yield zero.
        for token, word in (("", "תּוֹר"), ("בַּתּוֹר", ""), ("", "")):
            assert _cloze_prefix_len(token, word) == 0

    def test_non_prefix_letter_returns_zero(self):
        # The leading tav is expected to be rejected as a prefix letter
        # (per the original note, _PREFIX_LETTERS is בהוכלמש — verify against apkg_builder).
        assert _cloze_prefix_len("תַּתּוֹר", "תּוֹר") == 0

    def test_prefix_preserves_nikkud(self):
        token = "בַּתּוֹר"
        word = "תּוֹר"
        cut = _cloze_prefix_len(token, word)
        # Only base letters (U+05D0–U+05EA) are compared here, so any nikkud
        # marks inside the reported prefix are ignored by this check.
        consonants = [ch for ch in token[:cut] if "\u05d0" <= ch <= "\u05ea"]
        assert consonants == ["ב"]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# PoS exact matching (no substring collisions)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestCategorizePos:
    """_categorize_pos must match part-of-speech labels exactly, so 'Pronoun' is not read as 'Noun'."""

    def test_noun_exact(self):
        category = _categorize_pos("Noun")
        assert category == "Noun"

    def test_pronoun_is_other(self):
        # 'Pronoun' contains the substring 'noun' but must not be classed as a noun.
        category = _categorize_pos("Pronoun")
        assert category == "Other"

    def test_verb_exact(self):
        category = _categorize_pos("Verb")
        assert category == "Verb"

    def test_noun_with_dash(self):
        # A label carrying a dash-separated qualifier still counts as a noun.
        category = _categorize_pos("Noun – masculine")
        assert category == "Noun"

    def test_adjective(self):
        category = _categorize_pos("Adjective")
        assert category == "Adjective"

    def test_conjunction_is_other(self):
        category = _categorize_pos("Conjunction")
        assert category == "Other"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Hebrew spoiler stripping from English meanings
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestHebrewSpoilerStripping:
    """English meanings must not contain Hebrew text (spoils the card)."""

    # Stripping pattern: one Hebrew-block character followed by any run of
    # Hebrew-block characters, whitespace, or hyphens. Per the original note it
    # is meant to be the same regex as in apkg_builder.py — keep the two in sync.
    # NOTE(review): \u0591-\u05C7 is already inside \u0590-\u05FF; the redundant
    # range is kept verbatim so the pattern stays textually identical.
    HEBREW_STRIP_RE = re.compile(r"[\u0590-\u05FF][\u0590-\u05FF\u0591-\u05C7\s\-]*")

    # Detector for leftover Hebrew letters (alef through tav) after stripping.
    _HEBREW_LETTER_RE = re.compile(r"[\u05D0-\u05EA]")

    @staticmethod
    def _strip_hebrew(meaning: str) -> str:
        """Replicate the meaning cleaning pipeline from build_vocab_deck.

        Args:
            meaning: Raw meaning text, possibly containing Hebrew fragments.

        Returns:
            The text with Hebrew runs removed, punctuation left behind by the
            removal tidied up, whitespace collapsed, and leading/trailing
            commas, spaces, semicolons, and colons trimmed.
        """
        # Use the shared compiled pattern rather than a second inline copy,
        # so the class constant and this helper cannot drift apart.
        meaning = TestHebrewSpoilerStripping.HEBREW_STRIP_RE.sub("", meaning)
        # A semicolon/colon directly before an em dash collapses to " —".
        meaning = re.sub(r"[;:]\s*—", " —", meaning)
        # "; :" left over after a removal collapses to a single semicolon.
        meaning = re.sub(r";\s*:", ";", meaning)
        return re.sub(r"\s{2,}", " ", meaning).strip(", ;:")

    def test_pure_english_unchanged(self):
        assert self._strip_hebrew("to eat, to consume") == "to eat, to consume"

    def test_hebrew_word_removed(self):
        result = self._strip_hebrew("to eat; אכל")
        assert "אכל" not in result
        # The English gloss must survive the stripping.
        assert "to eat" in result

    def test_hebrew_with_nikkud_removed(self):
        result = self._strip_hebrew("tall; גָּבוֹהַּ")
        assert "גָּבוֹהַּ" not in result
        assert "tall" in result

    def test_no_residual_hebrew_in_real_data(self):
        """Scan actual words.json — no meaning should contain Hebrew after stripping."""
        words_path = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")

        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)

        spoilers = []
        for key, entry in words.items():
            meaning = entry.get("meaning") or ""
            cleaned = self._strip_hebrew(meaning)
            if self._HEBREW_LETTER_RE.search(cleaned):
                spoilers.append(f"{key}: {cleaned!r}")

        assert not spoilers, f"Hebrew found in {len(spoilers)} meanings after stripping: {spoilers[:5]}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Gender field for nouns (words.json data integrity)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestGenderDataIntegrity:
    """Data check: noun entries that carry a noun_inflection should have a gender."""

    @pytest.fixture()
    def words(self):
        """Load data/words.json from the project root, skipping when it is absent."""
        json_file = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not json_file.exists():
            pytest.skip("words.json not available")
        with open(json_file, encoding="utf-8") as handle:
            return json.load(handle)

    def test_nouns_have_gender(self, words):
        """Fewer than 7% of inflected nouns may lack a recognised gender."""
        accepted = ("masculine", "feminine", "masculine and feminine")
        noun_count = 0
        missing = []
        for key, entry in words.items():
            is_noun = (entry.get("pos") or "").startswith("Noun")
            inflection = entry.get("noun_inflection")
            if not (is_noun and inflection):
                continue
            noun_count += 1
            gender = inflection.get("gender") or ""
            if gender not in accepted:
                missing.append(f"{key}: gender={gender!r}")

        # Nothing to measure when there are no inflected nouns at all.
        if noun_count == 0:
            return

        # Tolerance of up to 7% (loan words, compound words, etc.).
        pct_missing = len(missing) / noun_count
        assert pct_missing < 0.07, f"{len(missing)}/{noun_count} nouns missing gender: {missing[:10]}"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Mishkal data integrity
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestMishkalIntegrity:
    """Consistency checks on the mishkal fields stored in words.json."""

    @pytest.fixture()
    def words(self):
        """Load data/words.json from the project root, skipping when it is absent."""
        json_file = Path(__file__).resolve().parent.parent / "data" / "words.json"
        if not json_file.exists():
            pytest.skip("words.json not available")
        with open(json_file, encoding="utf-8") as handle:
            return json.load(handle)

    @staticmethod
    def _inflections(words):
        """Yield (entry key, inflection dict) for each non-empty noun/adjective inflection."""
        for key, entry in words.items():
            for field in ("noun_inflection", "adjective_inflection"):
                inflection = entry.get(field)
                if inflection:
                    yield key, inflection

    def test_mishkal_hebrew_matches_english(self, words):
        """When both mishkal fields are set, mishkal_hebrew must equal _mishkal_to_hebrew(mishkal)."""
        from pealim_detail_scrape import _mishkal_to_hebrew

        mismatches = []
        for key, inflection in TestMishkalIntegrity._inflections(words):
            mishkal_eng = inflection.get("mishkal") or ""
            mishkal_heb = inflection.get("mishkal_hebrew") or ""
            if not (mishkal_eng and mishkal_heb):
                continue
            # An empty conversion result is not treated as a mismatch.
            expected = _mishkal_to_hebrew(mishkal_eng) or ""
            if expected and expected != mishkal_heb:
                mismatches.append(f"{key}: {mishkal_eng}→{mishkal_heb} (expected {expected})")

        assert not mismatches, f"{len(mismatches)} mishkal mismatches: {mismatches[:10]}"

    def test_mishkal_hebrew_is_hebrew(self, words):
        """A non-empty mishkal_hebrew must contain at least one Hebrew letter."""
        hebrew_letter = re.compile(r"[\u05D0-\u05EA]")
        bad = []
        for key, inflection in TestMishkalIntegrity._inflections(words):
            mishkal_heb = inflection.get("mishkal_hebrew") or ""
            if not mishkal_heb:
                continue
            if hebrew_letter.search(mishkal_heb) is None:
                bad.append(f"{key}: mishkal_hebrew={mishkal_heb!r}")

        assert not bad, f"{len(bad)} non-Hebrew mishkal_hebrew values: {bad[:10]}"

    def test_no_orphaned_mishkal(self, words):
        """A mishkal_hebrew value requires the English mishkal to be set as well."""
        orphans = []
        for key, inflection in TestMishkalIntegrity._inflections(words):
            mishkal_heb = inflection.get("mishkal_hebrew") or ""
            mishkal_eng = inflection.get("mishkal") or ""
            if mishkal_heb and not mishkal_eng:
                orphans.append(f"{key}: has mishkal_hebrew but no mishkal")

        assert not orphans, f"{len(orphans)} orphaned mishkal_hebrew: {orphans[:10]}"
|
||||
524
tests/test_detail_scrape.py
Normal file
524
tests/test_detail_scrape.py
Normal file
|
|
@ -0,0 +1,524 @@
|
|||
"""Tests for adjective and preposition detail page parsing in pealim_detail_scrape.py."""
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
|
||||
from pealim_detail_scrape import (
|
||||
_parse_adjective_table,
|
||||
_parse_adjective_table_vl,
|
||||
_parse_preposition_table,
|
||||
_parse_preposition_table_vl,
|
||||
_scrape_adjective_detail,
|
||||
_scrape_preposition_detail,
|
||||
)
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Fixtures — real HTML snippets from pealim.com
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
ADJECTIVE_MO_TABLE = """
|
||||
<table class="table table-condensed conjugation-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th class="column-header" colspan="2">Singular</th>
|
||||
<th class="column-header" colspan="2">Plural</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th class="column-header">Masculine</th>
|
||||
<th class="column-header">Feminine</th>
|
||||
<th class="column-header">Masculine</th>
|
||||
<th class="column-header">Feminine</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<td class="conj-td">
|
||||
<div id="ms-a">
|
||||
<div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/dn/dngfpnovmytc.mp3">🔊</span>
|
||||
<span class="menukad">אֲבִיבִי</span>
|
||||
</div></div>
|
||||
<div class="meaning">spring-like, vernal</div>
|
||||
</div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="fs-a">
|
||||
<div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/1j/1j6srg3do7n5k.mp3">🔊</span>
|
||||
<span class="menukad">אֲבִיבִית</span>
|
||||
</div></div>
|
||||
<div class="meaning">spring-like, vernal</div>
|
||||
</div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="mp-a">
|
||||
<div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/tj/tjrhw0b5dkhc.mp3">🔊</span>
|
||||
<span class="menukad">אֲבִיבִיִּים</span>
|
||||
</div></div>
|
||||
<div class="meaning">spring-like, vernal</div>
|
||||
</div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="fp-a">
|
||||
<div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/h3/h3u1ml5a4xcf.mp3">🔊</span>
|
||||
<span class="menukad">אֲבִיבִיּוֹת</span>
|
||||
</div></div>
|
||||
<div class="meaning">spring-like, vernal</div>
|
||||
</div>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
"""
|
||||
|
||||
# VL version: menukad spans contain unvowelled text (hebstyle=vl)
|
||||
ADJECTIVE_VL_TABLE = """
|
||||
<table class="table table-condensed conjugation-table">
|
||||
<tbody>
|
||||
<tr>
|
||||
<td class="conj-td">
|
||||
<div id="ms-a"><div><div>
|
||||
<span class="menukad">אביבי</span>
|
||||
</div></div></div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="fs-a"><div><div>
|
||||
<span class="menukad">אביבית</span>
|
||||
</div></div></div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="mp-a"><div><div>
|
||||
<span class="menukad">אביביים</span>
|
||||
</div></div></div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="fp-a"><div><div>
|
||||
<span class="menukad">אביביות</span>
|
||||
</div></div></div>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
"""
|
||||
|
||||
PREPOSITION_MO_TABLE = """
|
||||
<table class="table table-condensed conjugation-table">
|
||||
<thead>
|
||||
<tr>
|
||||
<th rowspan="2">Person</th>
|
||||
<th class="column-header" colspan="2">Singular</th>
|
||||
<th class="column-header" colspan="2">Plural</th>
|
||||
</tr>
|
||||
<tr>
|
||||
<th class="column-header">Masculine</th>
|
||||
<th class="column-header">Feminine</th>
|
||||
<th class="column-header">Masculine</th>
|
||||
<th class="column-header">Feminine</th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>1st</th>
|
||||
<td class="conj-td" colspan="2">
|
||||
<div id="P-1s"><div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/fk/fkp5faeteecr.mp3">🔊</span>
|
||||
<span class="menukad">שֶׁלִּי</span>
|
||||
</div></div><div class="meaning"><strong>of mine</strong></div></div>
|
||||
</td>
|
||||
<td class="conj-td" colspan="2">
|
||||
<div id="P-1p"><div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/13/13uvi0dz6tgcc.mp3">🔊</span>
|
||||
<span class="menukad">שֶׁלָּנוּ</span>
|
||||
</div></div><div class="meaning"><strong>of ours</strong></div></div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>2nd</th>
|
||||
<td class="conj-td">
|
||||
<div id="P-2ms"><div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/shbxafq8ietx.mp3">🔊</span>
|
||||
<span class="menukad">שֶׁלְּךָ</span>
|
||||
</div></div><div class="meaning"><strong>of yours</strong> <em>m. sg.</em></div></div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="P-2fs"><div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/sh9ue3a8buo3.mp3">🔊</span>
|
||||
<span class="menukad">שֶׁלָּךְ</span>
|
||||
</div></div><div class="meaning"><strong>of yours</strong> <em>f. sg.</em></div></div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="P-2mp"><div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/ol/olx8vzsctlzn.mp3">🔊</span>
|
||||
<span class="menukad">שֶׁלָּכֶם</span>
|
||||
</div></div><div class="meaning"><strong>of yours</strong> <em>m. pl.</em></div></div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="P-2fp"><div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/ol/olxrms6dl8eq.mp3">🔊</span>
|
||||
<span class="menukad">שֶׁלָּכֶן</span>
|
||||
</div></div><div class="meaning"><strong>of yours</strong> <em>f. pl.</em></div></div>
|
||||
</td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>3rd</th>
|
||||
<td class="conj-td">
|
||||
<div id="P-3ms"><div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/fk/fkp5qigelthg.mp3">🔊</span>
|
||||
<span class="menukad">שֶׁלּוֹ</span>
|
||||
</div></div><div class="meaning"><strong>of his</strong></div></div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="P-3fs"><div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/sh/sh9w36hojm5w.mp3">🔊</span>
|
||||
<span class="menukad">שֶׁלָּהּ</span>
|
||||
</div></div><div class="meaning"><strong>of hers</strong></div></div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="P-3mp"><div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/n9/n99z0jr8pint.mp3">🔊</span>
|
||||
<span class="menukad">שֶׁלָּהֶם</span>
|
||||
</div></div><div class="meaning"><strong>of theirs</strong> <em>m.</em></div></div>
|
||||
</td>
|
||||
<td class="conj-td">
|
||||
<div id="P-3fp"><div><div>
|
||||
<span class="audio-play" data-audio="https://audio.pealim.com/v0/n9/n9ahrc59h52w.mp3">🔊</span>
|
||||
<span class="menukad">שֶׁלָּהֶן</span>
|
||||
</div></div><div class="meaning"><strong>of theirs</strong> <em>f.</em></div></div>
|
||||
</td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
"""
|
||||
|
||||
PREPOSITION_VL_TABLE = """
|
||||
<table class="table table-condensed conjugation-table">
|
||||
<tbody>
|
||||
<tr>
|
||||
<th>1st</th>
|
||||
<td colspan="2"><div id="P-1s"><div><div>
|
||||
<span class="menukad">שלי</span>
|
||||
</div></div></div></td>
|
||||
<td colspan="2"><div id="P-1p"><div><div>
|
||||
<span class="menukad">שלנו</span>
|
||||
</div></div></div></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>2nd</th>
|
||||
<td><div id="P-2ms"><div><div>
|
||||
<span class="menukad">שלך</span>
|
||||
</div></div></div></td>
|
||||
<td><div id="P-2fs"><div><div>
|
||||
<span class="menukad">שלך</span>
|
||||
</div></div></div></td>
|
||||
<td><div id="P-2mp"><div><div>
|
||||
<span class="menukad">שלכם</span>
|
||||
</div></div></div></td>
|
||||
<td><div id="P-2fp"><div><div>
|
||||
<span class="menukad">שלכן</span>
|
||||
</div></div></div></td>
|
||||
</tr>
|
||||
<tr>
|
||||
<th>3rd</th>
|
||||
<td><div id="P-3ms"><div><div>
|
||||
<span class="menukad">שלו</span>
|
||||
</div></div></div></td>
|
||||
<td><div id="P-3fs"><div><div>
|
||||
<span class="menukad">שלה</span>
|
||||
</div></div></div></td>
|
||||
<td><div id="P-3mp"><div><div>
|
||||
<span class="menukad">שלהם</span>
|
||||
</div></div></div></td>
|
||||
<td><div id="P-3fp"><div><div>
|
||||
<span class="menukad">שלהן</span>
|
||||
</div></div></div></td>
|
||||
</tr>
|
||||
</tbody>
|
||||
</table>
|
||||
"""
|
||||
|
||||
# Minimal full-page wrappers so _scrape_*_detail() can parse them
|
||||
_ADJECTIVE_MO_PAGE = f"<html><body>{ADJECTIVE_MO_TABLE}</body></html>"
|
||||
_ADJECTIVE_VL_PAGE = f"<html><body>{ADJECTIVE_VL_TABLE}</body></html>"
|
||||
_PREPOSITION_MO_PAGE = f"<html><body>{PREPOSITION_MO_TABLE}</body></html>"
|
||||
_PREPOSITION_VL_PAGE = f"<html><body>{PREPOSITION_VL_TABLE}</body></html>"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Adjective table tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestParseAdjectiveTable:
    """Tests for _parse_adjective_table (mo/nikkud page)."""

    @pytest.fixture()
    def result(self) -> dict:
        """Parse the vowelled adjective fixture table for the requesting test."""
        # Plain function-scope import instead of __import__("bs4"): same module,
        # without the dynamic-import indirection repeated in every test.
        from bs4 import BeautifulSoup

        return _parse_adjective_table(BeautifulSoup(ADJECTIVE_MO_TABLE, "lxml"))

    def test_returns_four_form_keys(self, result: dict) -> None:
        assert set(result.keys()) == {"ms", "fs", "mp", "fp"}

    def test_ms_nikkud(self, result: dict) -> None:
        assert result["ms"]["nikkud"] == "אֲבִיבִי"

    def test_fs_nikkud(self, result: dict) -> None:
        assert result["fs"]["nikkud"] == "אֲבִיבִית"

    def test_mp_nikkud(self, result: dict) -> None:
        assert result["mp"]["nikkud"] == "אֲבִיבִיִּים"

    def test_fp_nikkud(self, result: dict) -> None:
        assert result["fp"]["nikkud"] == "אֲבִיבִיּוֹת"

    def test_audio_url_present(self, result: dict) -> None:
        assert result["ms"]["audio_url"].startswith("https://audio.pealim.com/")

    def test_empty_on_missing_table(self) -> None:
        # A page without any table must produce an empty mapping.
        from bs4 import BeautifulSoup

        parsed = _parse_adjective_table(BeautifulSoup("<html><body></body></html>", "lxml"))
        assert parsed == {}
|
||||
|
||||
|
||||
class TestParseAdjectiveTableVl:
    """Tests for _parse_adjective_table_vl (ktiv male page)."""

    @pytest.fixture()
    def result(self) -> dict:
        """Parse the unvowelled adjective fixture table for the requesting test."""
        # Plain function-scope import instead of __import__("bs4"): same module,
        # without the dynamic-import indirection repeated in every test.
        from bs4 import BeautifulSoup

        return _parse_adjective_table_vl(BeautifulSoup(ADJECTIVE_VL_TABLE, "lxml"))

    def test_returns_four_form_keys(self, result: dict) -> None:
        assert set(result.keys()) == {"ms", "fs", "mp", "fp"}

    def test_ms_ktiv(self, result: dict) -> None:
        assert result["ms"] == "אביבי"

    def test_fs_ktiv(self, result: dict) -> None:
        assert result["fs"] == "אביבית"

    def test_mp_ktiv(self, result: dict) -> None:
        assert result["mp"] == "אביביים"

    def test_fp_ktiv(self, result: dict) -> None:
        assert result["fp"] == "אביביות"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _scrape_adjective_detail tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestScrapeAdjectiveDetail:
    """Schema checks on the merged output of _scrape_adjective_detail."""

    @pytest.fixture()
    def result(self) -> dict:
        """Scrape the adjective fixture from its vowelled and unvowelled pages."""
        scraped = _scrape_adjective_detail("9098-avivi", _ADJECTIVE_MO_PAGE, _ADJECTIVE_VL_PAGE)
        return scraped

    def test_returns_non_empty_dict(self, result: dict) -> None:
        assert result

    def test_ms_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["ms"]
        assert form["nikkud"] == "אֲבִיבִי"
        assert form["ktiv_male"] == "אביבי"

    def test_fs_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["fs"]
        assert form["nikkud"] == "אֲבִיבִית"
        assert form["ktiv_male"] == "אביבית"

    def test_mp_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["mp"]
        assert form["nikkud"] == "אֲבִיבִיִּים"
        assert form["ktiv_male"] == "אביביים"

    def test_fp_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["fp"]
        assert form["nikkud"] == "אֲבִיבִיּוֹת"
        assert form["ktiv_male"] == "אביביות"

    def test_mishkal_key_present(self, result: dict) -> None:
        # Only the key is required: the minimal fixture has no PoS section,
        # so the value itself may be None.
        assert "mishkal" in result

    def test_mishkal_hebrew_key_present(self, result: dict) -> None:
        assert "mishkal_hebrew" in result

    def test_all_schema_keys_present(self, result: dict) -> None:
        required = {"ms", "fs", "mp", "fp", "mishkal", "mishkal_hebrew"}
        absent = required.difference(result.keys())
        assert not absent

    def test_empty_on_no_table(self) -> None:
        # Pages with no table at all must give an empty result.
        blank_page = "<html><body></body></html>"
        scraped = _scrape_adjective_detail("missing", blank_page, "<html><body></body></html>")
        assert scraped == {}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Preposition table tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestParsePrepositionTable:
    """Tests for _parse_preposition_table (mo/nikkud page)."""

    @pytest.fixture()
    def result(self) -> dict:
        """Parse the vowelled preposition fixture table for the requesting test."""
        # Plain function-scope import instead of the __import__("bs4") indirection.
        from bs4 import BeautifulSoup

        return _parse_preposition_table(BeautifulSoup(PREPOSITION_MO_TABLE, "lxml"))

    def test_returns_ten_form_keys(self, result: dict) -> None:
        expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
        assert set(result.keys()) == expected

    def test_1s_nikkud(self, result: dict) -> None:
        assert result["1s"]["nikkud"] == "שֶׁלִּי"

    def test_1p_nikkud(self, result: dict) -> None:
        assert result["1p"]["nikkud"] == "שֶׁלָּנוּ"

    def test_2ms_nikkud(self, result: dict) -> None:
        assert result["2ms"]["nikkud"] == "שֶׁלְּךָ"

    def test_2fs_nikkud(self, result: dict) -> None:
        assert result["2fs"]["nikkud"] == "שֶׁלָּךְ"

    def test_2mp_nikkud(self, result: dict) -> None:
        assert result["2mp"]["nikkud"] == "שֶׁלָּכֶם"

    def test_2fp_nikkud(self, result: dict) -> None:
        assert result["2fp"]["nikkud"] == "שֶׁלָּכֶן"

    def test_3ms_nikkud(self, result: dict) -> None:
        assert result["3ms"]["nikkud"] == "שֶׁלּוֹ"

    def test_3fs_nikkud(self, result: dict) -> None:
        assert result["3fs"]["nikkud"] == "שֶׁלָּהּ"

    def test_3mp_nikkud(self, result: dict) -> None:
        assert result["3mp"]["nikkud"] == "שֶׁלָּהֶם"

    def test_3fp_nikkud(self, result: dict) -> None:
        assert result["3fp"]["nikkud"] == "שֶׁלָּהֶן"

    def test_audio_url_present(self, result: dict) -> None:
        assert result["1s"]["audio_url"].startswith("https://audio.pealim.com/")

    def test_empty_on_missing_table(self) -> None:
        # A page without any table must produce an empty mapping.
        from bs4 import BeautifulSoup

        result = _parse_preposition_table(BeautifulSoup("<html><body></body></html>", "lxml"))
        assert result == {}
|
||||
|
||||
|
||||
class TestParsePrepositionTableVl:
    """Tests for _parse_preposition_table_vl (ktiv male page)."""

    @pytest.fixture()
    def result(self) -> dict:
        """Parse the unvowelled preposition fixture table for the requesting test."""
        # Plain function-scope import instead of the __import__("bs4") indirection.
        from bs4 import BeautifulSoup

        return _parse_preposition_table_vl(BeautifulSoup(PREPOSITION_VL_TABLE, "lxml"))

    def test_returns_ten_form_keys(self, result: dict) -> None:
        expected = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
        assert set(result.keys()) == expected

    def test_1s_ktiv(self, result: dict) -> None:
        assert result["1s"] == "שלי"

    def test_1p_ktiv(self, result: dict) -> None:
        assert result["1p"] == "שלנו"

    def test_2ms_ktiv(self, result: dict) -> None:
        assert result["2ms"] == "שלך"

    def test_3ms_ktiv(self, result: dict) -> None:
        assert result["3ms"] == "שלו"

    def test_3fp_ktiv(self, result: dict) -> None:
        assert result["3fp"] == "שלהן"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# _scrape_preposition_detail tests
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
class TestScrapePrepositionDetail:
    """Schema checks on the merged output of _scrape_preposition_detail."""

    @pytest.fixture()
    def result(self) -> dict:
        """Scrape the preposition fixture from its vowelled and unvowelled pages."""
        scraped = _scrape_preposition_detail("2643-shel", _PREPOSITION_MO_PAGE, _PREPOSITION_VL_PAGE)
        return scraped

    def test_returns_non_empty_dict(self, result: dict) -> None:
        assert result

    def test_all_ten_person_keys_present(self, result: dict) -> None:
        required = {"1s", "1p", "2ms", "2fs", "2mp", "2fp", "3ms", "3fs", "3mp", "3fp"}
        absent = required.difference(result.keys())
        assert not absent

    def test_1s_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["1s"]
        assert form["nikkud"] == "שֶׁלִּי"
        assert form["ktiv_male"] == "שלי"

    def test_1p_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["1p"]
        assert form["nikkud"] == "שֶׁלָּנוּ"
        assert form["ktiv_male"] == "שלנו"

    def test_2ms_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["2ms"]
        assert form["nikkud"] == "שֶׁלְּךָ"
        assert form["ktiv_male"] == "שלך"

    def test_3ms_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["3ms"]
        assert form["nikkud"] == "שֶׁלּוֹ"
        assert form["ktiv_male"] == "שלו"

    def test_3fs_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["3fs"]
        assert form["nikkud"] == "שֶׁלָּהּ"
        assert form["ktiv_male"] == "שלה"

    def test_3fp_has_nikkud_and_ktiv(self, result: dict) -> None:
        form = result["3fp"]
        assert form["nikkud"] == "שֶׁלָּהֶן"
        assert form["ktiv_male"] == "שלהן"

    def test_empty_on_no_table(self) -> None:
        # Pages with no table at all must give an empty result.
        blank_page = "<html><body></body></html>"
        scraped = _scrape_preposition_detail("missing", blank_page, "<html><body></body></html>")
        assert scraped == {}
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests for _parse_noun_gender_mishkal mishkal extraction
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
from bs4 import BeautifulSoup # noqa: E402
|
||||
|
||||
from pealim_detail_scrape import _parse_noun_gender_mishkal # noqa: E402
|
||||
|
||||
|
||||
class TestNounGenderMishkal:
    """Checks the (gender, mishkal) pair returned by _parse_noun_gender_mishkal."""

    @staticmethod
    def _parse(html):
        """Run _parse_noun_gender_mishkal on *html* parsed with html.parser."""
        return _parse_noun_gender_mishkal(BeautifulSoup(html, "html.parser"))

    def test_noun_with_mishkal(self):
        snippet = '<p>Noun – <a href="/dict/?pos=noun&nm=qetel"><i>ketel</i> pattern</a>, masculine</p>'
        gender, mishkal = TestNounGenderMishkal._parse(snippet)
        assert gender == "masculine"
        assert mishkal == "ketel"

    def test_noun_without_mishkal(self):
        # No pattern link in the paragraph: mishkal comes back as an empty string.
        snippet = "<p>Noun – masculine</p>"
        gender, mishkal = TestNounGenderMishkal._parse(snippet)
        assert gender == "masculine"
        assert mishkal == ""

    def test_adjective_mishkal(self):
        # Only the mishkal is checked for adjectives; the gender slot is ignored.
        snippet = '<p>Adjective – <a href="/dict/?pos=adjective&am=qatul"><i>katul</i> pattern</a></p>'
        _, mishkal = TestNounGenderMishkal._parse(snippet)
        assert mishkal == "katul"

    def test_feminine_noun(self):
        snippet = '<p>Noun – <a href="/dict/?pos=noun&nm=qetel"><i>ketel</i> pattern</a>, feminine</p>'
        gender, mishkal = TestNounGenderMishkal._parse(snippet)
        assert gender == "feminine"
        assert mishkal == "ketel"
|
||||
127
tests/test_epub_examples.py
Normal file
127
tests/test_epub_examples.py
Normal file
|
|
@ -0,0 +1,127 @@
|
|||
"""Tests for epub_examples deduplication of confusable group examples."""
|
||||
|
||||
from epub_examples import _deduplicate_confusable_examples
|
||||
|
||||
|
||||
def _make_entry(meaning, confusable_group, vetted_texts=None, frequency_rank=None):
|
||||
"""Build a minimal words.json entry for testing."""
|
||||
entry = {
|
||||
"meaning": meaning,
|
||||
"confusable_group": confusable_group,
|
||||
}
|
||||
if vetted_texts is not None:
|
||||
entry["examples"] = {
|
||||
"vetted": [{"text": t, "source": "test", "match_method": "direct"} for t in vetted_texts],
|
||||
}
|
||||
if frequency_rank is not None:
|
||||
entry["frequency_rank"] = frequency_rank
|
||||
return entry
|
||||
|
||||
|
||||
class TestDeduplicateConfusableExamples:
    """Behavioural checks for _deduplicate_confusable_examples()."""

    def test_shared_examples_kept_on_higher_frequency(self):
        """Identical example sets stay with the lower (more common) frequency_rank."""
        members = ["key_a", "key_b"]
        entries = {
            "key_a": _make_entry("brother", members, ["sent1", "sent2"], frequency_rank=500),
            "key_b": _make_entry("fireplace", members, ["sent1", "sent2"], frequency_rank=8000),
        }

        removed = _deduplicate_confusable_examples(entries)

        assert removed == 1
        assert len(entries["key_a"]["examples"]["vetted"]) == 2
        assert entries["key_b"]["examples"]["vetted"] == []

    def test_no_action_when_examples_differ(self):
        """Members whose example sets differ are both left as they were."""
        members = ["key_a", "key_b"]
        entries = {
            "key_a": _make_entry("meaning1", members, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("meaning2", members, ["sent2"], frequency_rank=200),
        }

        removed = _deduplicate_confusable_examples(entries)

        assert removed == 0
        assert len(entries["key_a"]["examples"]["vetted"]) == 1
        assert len(entries["key_b"]["examples"]["vetted"]) == 1

    def test_no_action_when_one_has_no_examples(self):
        """A group where only one member carries examples reports nothing cleared."""
        members = ["key_a", "key_b"]
        entries = {
            "key_a": _make_entry("meaning1", members, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("meaning2", members, frequency_rank=200),
        }

        assert _deduplicate_confusable_examples(entries) == 0

    def test_no_frequency_uses_alphabetical_tiebreak(self):
        """Without any frequency_rank, the alphabetically first key keeps the examples."""
        members = ["alpha_key", "beta_key"]
        entries = {
            "alpha_key": _make_entry("meaning1", members, ["sent1"]),
            "beta_key": _make_entry("meaning2", members, ["sent1"]),
        }

        removed = _deduplicate_confusable_examples(entries)

        assert removed == 1
        assert len(entries["alpha_key"]["examples"]["vetted"]) == 1
        assert entries["beta_key"]["examples"]["vetted"] == []

    def test_three_way_group(self):
        """In a three-member group the most common member wins; the other two are cleared."""
        members = ["key_a", "key_b", "key_c"]
        entries = {
            "key_a": _make_entry("yes", members, ["sent1", "sent2"], frequency_rank=50),
            "key_b": _make_entry("honest", members, ["sent1", "sent2"], frequency_rank=3000),
            "key_c": _make_entry("pedestal", members, ["sent1", "sent2"], frequency_rank=15000),
        }

        removed = _deduplicate_confusable_examples(entries)

        assert removed == 2
        assert len(entries["key_a"]["examples"]["vetted"]) == 2
        assert entries["key_b"]["examples"]["vetted"] == []
        assert entries["key_c"]["examples"]["vetted"] == []

    def test_cloze_removed_from_losers(self):
        """A losing entry has its cloze data dropped along with its examples."""
        members = ["key_a", "key_b"]
        entries = {
            "key_a": _make_entry("common", members, ["sent1"], frequency_rank=100),
            "key_b": _make_entry("rare", members, ["sent1"], frequency_rank=9000),
        }
        # Only the rarer entry (key_b, the expected loser) is given cloze data.
        entries["key_b"]["examples"]["cloze"] = {"text": "sent1", "cloze_guid": "abc"}

        removed = _deduplicate_confusable_examples(entries)

        assert removed == 1
        assert "cloze" not in entries["key_b"]["examples"]

    def test_no_confusable_groups_returns_zero(self):
        """Entries that have no confusable_group key are not touched."""
        entries = {
            "key_a": {"meaning": "word1", "examples": {"vetted": [{"text": "s1"}]}},
            "key_b": {"meaning": "word2", "examples": {"vetted": [{"text": "s1"}]}},
        }

        assert _deduplicate_confusable_examples(entries) == 0

    def test_mixed_frequency_and_none(self):
        """A member that has a frequency_rank wins over one that has none."""
        members = ["key_a", "key_b"]
        entries = {
            "key_a": _make_entry("has_freq", members, ["sent1"], frequency_rank=5000),
            "key_b": _make_entry("no_freq", members, ["sent1"]),
        }

        removed = _deduplicate_confusable_examples(entries)

        assert removed == 1
        assert len(entries["key_a"]["examples"]["vetted"]) == 1
        assert entries["key_b"]["examples"]["vetted"] == []

    def test_partial_overlap_not_deduplicated(self):
        """Sentence sets that overlap without being identical are not deduplicated."""
        members = ["key_a", "key_b"]
        entries = {
            "key_a": _make_entry("m1", members, ["sent1", "sent2"], frequency_rank=100),
            "key_b": _make_entry("m2", members, ["sent1", "sent3"], frequency_rank=200),
        }

        assert _deduplicate_confusable_examples(entries) == 0
|
||||
83
tests/test_scoring_integration.py
Normal file
83
tests/test_scoring_integration.py
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
"""Integration tests for frequency-based sentence scoring in update_words_json."""
|
||||
|
||||
|
||||
def _make_sentence(text, source="test", match_method="direct", word_count=None, char_offset=0, char_end=3):
|
||||
"""Build a minimal sentence dict as match_sentences would produce."""
|
||||
if word_count is None:
|
||||
word_count = len(text.split())
|
||||
return {
|
||||
"text": text,
|
||||
"source": source,
|
||||
"match_method": match_method,
|
||||
"word_count": word_count,
|
||||
"char_offset": char_offset,
|
||||
"char_end": char_end,
|
||||
}
|
||||
|
||||
|
||||
class TestScoringIntegration:
    """Checks that update_words_json applies frequency-based scoring."""

    @staticmethod
    def _fresh_words():
        """Return a new single-entry words dict with an empty ``examples`` block."""
        return {
            "טוֹב": {
                "word": {"nikkud": "טוֹב", "ktiv_male": "טוב"},
                "examples": {},
            }
        }

    def test_cloze_has_difficulty_score(self):
        """The generated cloze record carries an integer difficulty_score."""
        from epub_examples import update_words_json

        words = self._fresh_words()
        matches = {
            "טוֹב": [
                _make_sentence("הוּא אָדָם טוֹב מְאוֹד", char_offset=10, char_end=13),
            ]
        }

        update_words_json(words, matches, confusable_keys=set())

        cloze = words["טוֹב"]["examples"].get("cloze")
        assert cloze is not None
        assert "difficulty_score" in cloze
        assert isinstance(cloze["difficulty_score"], int)

    def test_vetted_sorted_by_difficulty(self):
        """All three matched sentences end up in the vetted list."""
        from epub_examples import update_words_json

        words = self._fresh_words()
        matches = {
            "טוֹב": [
                _make_sentence("הוּא טוֹב", char_offset=4, char_end=7),
                _make_sentence("הַתַּפְנִיט טוֹב בְּיוֹתֵר", char_offset=10, char_end=13),
                _make_sentence("אֲנִי טוֹב הַיּוֹם", char_offset=5, char_end=8),
            ]
        }

        update_words_json(words, matches, confusable_keys=set())

        # NOTE(review): despite the test name, only the count is asserted here;
        # the easiest-first ordering itself is not checked.
        assert len(words["טוֹב"]["examples"]["vetted"]) == 3

    def test_easiest_sentence_becomes_cloze(self):
        """Of two candidates, the one expected to score as easier is chosen as the cloze."""
        from epub_examples import update_words_json

        words = self._fresh_words()
        easy_text = "הוּא טוֹב מְאוֹד"
        hard_text = "הַפַּרְנָסִימוֹן טוֹב לְהַפְלִיא"
        # The harder sentence is listed first, so input order alone cannot make the test pass.
        matches = {
            "טוֹב": [
                _make_sentence(hard_text, char_offset=14, char_end=17),
                _make_sentence(easy_text, char_offset=4, char_end=7),
            ]
        }

        update_words_json(words, matches, confusable_keys=set())

        assert words["טוֹב"]["examples"]["cloze"]["text"] == easy_text
|
||||
441
tests/test_scraper_integration.py
Normal file
441
tests/test_scraper_integration.py
Normal file
|
|
@ -0,0 +1,441 @@
|
|||
#!/usr/bin/env python3
|
||||
"""Integration tests: scrape real pealim.com pages and validate data.
|
||||
|
||||
These tests hit pealim.com directly. They are skipped when the environment
|
||||
variable SKIP_INTEGRATION is set to any non-empty string.
|
||||
|
||||
Run with:
|
||||
pytest tests/test_scraper_integration.py -v -m integration
|
||||
"""
|
||||
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
# Add project root to path so all sibling modules are importable
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
|
||||
import pealim_detail_scrape
|
||||
import pealim_list_scrape
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Skip marker
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Marker used as a class decorator below: skips the decorated tests whenever
# the SKIP_INTEGRATION environment variable holds a non-empty string.
skip_integration = pytest.mark.skipif(
    bool(os.environ.get("SKIP_INTEGRATION", "")),
    reason="SKIP_INTEGRATION is set",
)

# A known Hif'il verb slug that is not page-1 dependent.
# לְהַגִּיד (to tell/say) — Hif'il, slug 1135-lehagid
HIFIL_VERB_SLUG = "1135-lehagid"
HIFIL_VERB_NIKKUD = "לְהַגִּיד"
HIFIL_VERB_MEANING = "to say, to tell"

# Minimum expected entries from a single list page
MIN_LIST_ENTRIES = 10

# Hebrew character regex (Unicode block U+05D0–U+05EA); matches any single
# character in that range.
HEBREW_CHAR_RE = re.compile(r"[\u05d0-\u05ea]")

# Slug pattern: one or more digits, hyphen, one or more word chars.
# Anchored at both ends, so a slug containing a second hyphen does not match.
SLUG_RE = re.compile(r"^\d+-\w+$")
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Helpers
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
def _has_hebrew(text: str) -> bool:
    """Tell whether *text* has at least one character matched by HEBREW_CHAR_RE."""
    first_hit = HEBREW_CHAR_RE.search(text)
    return first_hit is not None
|
||||
|
||||
|
||||
def _words_from_file(path: Path) -> dict:
|
||||
with path.open(encoding="utf-8") as fh:
|
||||
return json.load(fh)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test class: list page scrape
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.integration
@skip_integration
class TestListScrape:
    """Checks pealim_list_scrape against a live fetch of /dict/?page=1."""

    @staticmethod
    def _scrape_first_page(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
        """Point the scraper's output files at tmp_path, scrape one page, return the words.json path."""
        words_path = tmp_path / "words.json"
        progress_path = tmp_path / "list_scrape_progress.json"

        monkeypatch.setattr(pealim_list_scrape, "WORDS_JSON", words_path)
        monkeypatch.setattr(pealim_list_scrape, "PROGRESS_JSON", progress_path)

        # Scrape exactly one page
        pealim_list_scrape.run_scrape(total_pages=1, force_refresh=True)
        return words_path

    def test_list_page_1_produces_entries(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Scraping page 1 creates words.json with at least MIN_LIST_ENTRIES entries."""
        words_path = self._scrape_first_page(tmp_path, monkeypatch)

        assert words_path.exists(), "words.json was not created after scrape"
        scraped = _words_from_file(words_path)
        assert len(scraped) >= MIN_LIST_ENTRIES, (
            f"Expected at least {MIN_LIST_ENTRIES} entries from page 1, got {len(scraped)}"
        )

    def test_list_entries_have_required_fields(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Each entry needs non-empty nikkud, ktiv_male, slug, pos and meaning."""
        scraped = _words_from_file(self._scrape_first_page(tmp_path, monkeypatch))

        for key, entry in scraped.items():
            headword = entry.get("word", {})
            nikkud = headword.get("nikkud", "")
            ktiv_male = headword.get("ktiv_male", "")
            slug = entry.get("slug", "")
            pos = entry.get("pos", "")
            meaning = entry.get("meaning", "")

            assert nikkud, f"Entry '{key}': word.nikkud is empty"
            assert _has_hebrew(nikkud), f"Entry '{key}': word.nikkud has no Hebrew chars: {nikkud!r}"
            assert ktiv_male, f"Entry '{key}': word.ktiv_male is empty"
            assert slug, f"Entry '{key}': slug is empty"
            assert SLUG_RE.match(slug), f"Entry '{key}': slug does not match \\d+-\\w+ pattern: {slug!r}"
            assert pos, f"Entry '{key}': pos is empty"
            assert meaning, f"Entry '{key}': meaning is empty"

    def test_list_at_least_one_entry_has_root(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Some entry on page 1 carries a non-empty root list."""
        scraped = _words_from_file(self._scrape_first_page(tmp_path, monkeypatch))

        rooted = [entry for entry in scraped.values() if entry.get("root")]
        assert rooted, "No entries on page 1 have a non-empty root list"

    def test_list_at_least_one_entry_has_audio(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Some entry on page 1 carries a non-empty audio_url."""
        scraped = _words_from_file(self._scrape_first_page(tmp_path, monkeypatch))

        voiced = [entry for entry in scraped.values() if entry.get("audio_url")]
        assert voiced, "No entries on page 1 have a non-empty audio_url"

    def test_list_post_process_fields_exist(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Each scraped entry has 'confusable_group' and a list-valued 'shared_roots'."""
        scraped = _words_from_file(self._scrape_first_page(tmp_path, monkeypatch))

        for key, entry in scraped.items():
            assert "confusable_group" in entry, f"Entry '{key}' missing 'confusable_group' key"
            assert "shared_roots" in entry, f"Entry '{key}' missing 'shared_roots' key"
            assert isinstance(entry["shared_roots"], list), f"Entry '{key}': shared_roots is not a list"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test class: noun detail scrape
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.integration
@skip_integration
class TestDetailScrapeNoun:
    """Checks pealim_detail_scrape output for a noun taken from a live page-1 scrape."""

    def _find_noun_with_root(self, words: dict) -> tuple[str, dict] | None:
        """Return the first (key, entry) whose pos starts with 'Noun' and that has a root and slug."""
        candidates = (
            (key, entry)
            for key, entry in words.items()
            if entry.get("pos", "").startswith("Noun") and entry.get("root") and entry.get("slug")
        )
        return next(candidates, None)

    def _prepare_words_json(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> tuple[Path, dict]:
        """Scrape list page 1 into tmp_path and return (words.json path, loaded words).

        The list scraper's WORDS_JSON and PROGRESS_JSON are monkeypatched so
        the scrape writes into tmp_path.
        """
        words_path = tmp_path / "words.json"
        monkeypatch.setattr(pealim_list_scrape, "WORDS_JSON", words_path)
        monkeypatch.setattr(pealim_list_scrape, "PROGRESS_JSON", tmp_path / "list_scrape_progress.json")

        pealim_list_scrape.run_scrape(total_pages=1, force_refresh=True)
        return words_path, _words_from_file(words_path)

    def _detail_scrape_first_noun(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> tuple[str, dict, dict]:
        """Run list scrape then noun detail scrape; return (noun key, its list entry, reloaded words)."""
        words_path, words = self._prepare_words_json(tmp_path, monkeypatch)

        pair = self._find_noun_with_root(words)
        assert pair is not None, "No noun with a root found on page 1"
        noun_key, noun_entry = pair

        # Point the detail scraper at the same temporary words.json.
        monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)

        # Small rate-limit delay between list scrape and detail scrape
        time.sleep(1.0)

        pealim_detail_scrape.run(force_refresh=True, nouns_only=True)
        return noun_key, noun_entry, _words_from_file(words_path)

    def test_noun_detail_inflection_not_null(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """The chosen noun has a non-null noun_inflection once the detail scrape ran."""
        noun_key, noun_entry, updated_words = self._detail_scrape_first_noun(tmp_path, monkeypatch)

        entry = updated_words.get(noun_key, {})
        assert entry.get("noun_inflection") is not None, (
            f"noun_inflection is None after detail scrape for '{noun_key}' (slug={noun_entry.get('slug')})"
        )

    def test_noun_detail_singular_and_plural_forms(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Singular and plural forms each carry non-empty nikkud and ktiv_male."""
        noun_key, _noun_entry, updated_words = self._detail_scrape_first_noun(tmp_path, monkeypatch)

        inflection = updated_words[noun_key].get("noun_inflection", {}) or {}
        singular = inflection.get("singular") or {}
        plural = inflection.get("plural") or {}

        assert singular.get("nikkud"), f"noun_inflection.singular.nikkud is empty for '{noun_key}'"
        assert singular.get("ktiv_male"), f"noun_inflection.singular.ktiv_male is empty for '{noun_key}'"
        assert plural.get("nikkud"), f"noun_inflection.plural.nikkud is empty for '{noun_key}'"
        assert plural.get("ktiv_male"), f"noun_inflection.plural.ktiv_male is empty for '{noun_key}'"

    def test_noun_detail_gender(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """The scraped gender is either 'masculine' or 'feminine'."""
        noun_key, _noun_entry, updated_words = self._detail_scrape_first_noun(tmp_path, monkeypatch)

        inflection = updated_words[noun_key].get("noun_inflection", {}) or {}
        gender = inflection.get("gender", "")
        assert gender in ("masculine", "feminine"), (
            f"noun_inflection.gender is {gender!r} for '{noun_key}' (expected 'masculine' or 'feminine')"
        )

    def test_noun_detail_scraped_flag(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """The chosen noun is flagged detail_scraped=True after the detail scrape."""
        noun_key, _, updated_words = self._detail_scrape_first_noun(tmp_path, monkeypatch)

        assert updated_words[noun_key].get("detail_scraped") is True, (
            f"detail_scraped is not True after scrape for '{noun_key}'"
        )
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Test class: verb detail scrape (Hif'il)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
|
||||
@pytest.mark.integration
@skip_integration
class TestDetailScrapeVerb:
    """Checks pealim_detail_scrape output for the known Hif'il verb named by HIFIL_VERB_SLUG."""

    def _build_test_words_json(self, tmp_path: Path) -> Path:
        """Write a words.json holding only the known Hif'il verb entry and return its path.

        The entry has pos 'Verb' and no detail_scraped key; presumably that is
        what makes the detail scraper's run() select it (not verifiable here).
        """
        words_path = tmp_path / "words.json"
        verb_entry = {
            "word": {"nikkud": HIFIL_VERB_NIKKUD, "ktiv_male": "להגיד"},
            "slug": HIFIL_VERB_SLUG,
            "root": ["נ", "ג", "ד"],
            "pos": "Verb",
            "pos_hebrew": "פֹּעַל — הִפְעִיל",
            "meaning": HIFIL_VERB_MEANING,
            "meaning_raw": HIFIL_VERB_MEANING,
            "audio_url": "",
            "audio_file": "להגיד.mp3",
            "tags": "שורש::נגד פעלים",
            "last_scrape_date": "2026-03-08",
            "vocab_legacy_guid": None,
            "frequency": None,
            "pseudo_frequency": None,
            "emoji": None,
            "emoji_source": None,
            "emoji_visible": False,
            "image": None,
            "image_source": None,
            "hint": "",
            "shared_roots": [],
            "confusable_group": None,
            "confusables_guid": None,
            "examples": None,
            "noun_inflection": None,
            "conjugation": None,
            "adjective_inflection": None,
            "preposition_inflection": None,
            # Intentionally no detail_scraped key so the scraper processes it
        }
        payload = json.dumps({HIFIL_VERB_NIKKUD: verb_entry}, ensure_ascii=False, indent=2)
        words_path.write_text(payload, encoding="utf-8")
        return words_path

    def _scrape_verb(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> dict:
        """Run the verb detail scrape against the one-entry words.json and return the reloaded words."""
        words_path = self._build_test_words_json(tmp_path)
        monkeypatch.setattr(pealim_detail_scrape, "WORDS_JSON", words_path)

        pealim_detail_scrape.run(test=1, force_refresh=True, verbs_only=True)

        return _words_from_file(words_path)

    def test_verb_detail_conjugation_not_null(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """The Hif'il verb has a non-null conjugation once the detail scrape ran."""
        words = self._scrape_verb(tmp_path, monkeypatch)

        entry = words.get(HIFIL_VERB_NIKKUD, {})
        assert entry.get("conjugation") is not None, f"conjugation is None after detail scrape for {HIFIL_VERB_SLUG}"

    def test_verb_detail_binyan(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """conjugation.binyan is "Hif'il" and binyan_hebrew is its vocalised Hebrew name."""
        words = self._scrape_verb(tmp_path, monkeypatch)
        conj = words[HIFIL_VERB_NIKKUD].get("conjugation") or {}

        assert conj.get("binyan") == "Hif'il", f"Expected binyan='Hif\\'il', got {conj.get('binyan')!r}"
        assert conj.get("binyan_hebrew") == "הִפְעִיל", (
            f"Expected binyan_hebrew='הִפְעִיל', got {conj.get('binyan_hebrew')!r}"
        )

    def test_verb_detail_infinitive_and_reference_form(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """Both infinitive.nikkud and reference_form.nikkud are non-empty and contain Hebrew."""
        words = self._scrape_verb(tmp_path, monkeypatch)
        conj = words[HIFIL_VERB_NIKKUD].get("conjugation") or {}

        inf_nikkud = (conj.get("infinitive") or {}).get("nikkud", "")
        ref_nikkud = (conj.get("reference_form") or {}).get("nikkud", "")

        assert inf_nikkud and _has_hebrew(inf_nikkud), (
            f"infinitive.nikkud is empty or has no Hebrew chars: {inf_nikkud!r}"
        )
        assert ref_nikkud and _has_hebrew(ref_nikkud), (
            f"reference_form.nikkud (3ms past) is empty or has no Hebrew chars: {ref_nikkud!r}"
        )

    def test_verb_detail_active_forms_count_and_structure(
        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch
    ) -> None:
        """active_forms is a list of 20+ items, each with person, tense and a Hebrew form."""
        words = self._scrape_verb(tmp_path, monkeypatch)
        conj = words[HIFIL_VERB_NIKKUD].get("conjugation") or {}
        active_forms = conj.get("active_forms")

        assert isinstance(active_forms, list), f"active_forms is not a list: {type(active_forms)}"
        assert len(active_forms) >= 20, f"Expected at least 20 active forms, got {len(active_forms)}"

        for i, item in enumerate(active_forms):
            assert item.get("person"), f"active_forms[{i}].person is empty"
            assert item.get("tense"), f"active_forms[{i}].tense is empty"
            form_block = item.get("form") or {}
            assert form_block.get("nikkud") and _has_hebrew(form_block["nikkud"]), (
                f"active_forms[{i}].form.nikkud is empty or has no Hebrew: {form_block.get('nikkud')!r}"
            )
            assert form_block.get("ktiv_male") and _has_hebrew(form_block["ktiv_male"]), (
                f"active_forms[{i}].form.ktiv_male is empty or has no Hebrew: {form_block.get('ktiv_male')!r}"
            )

    def test_verb_detail_hufal_passive_section(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """The verb has a non-empty hufal_pual_forms list and a Hebrew reference_form_passive."""
        words = self._scrape_verb(tmp_path, monkeypatch)
        conj = words[HIFIL_VERB_NIKKUD].get("conjugation") or {}

        hufal_forms = conj.get("hufal_pual_forms")
        assert hufal_forms is not None, "hufal_pual_forms is None — expected Huf'al passive section for a Hif'il verb"
        assert isinstance(hufal_forms, list), f"hufal_pual_forms is not a list: {type(hufal_forms)}"
        assert len(hufal_forms) > 0, "hufal_pual_forms list is empty"

        ref_passive = conj.get("reference_form_passive")
        assert ref_passive is not None, "reference_form_passive is None — expected a Huf'al 3ms past form"
        passive_nikkud = (ref_passive or {}).get("nikkud", "")
        assert passive_nikkud and _has_hebrew(passive_nikkud), (
            f"reference_form_passive.nikkud is empty or has no Hebrew: {passive_nikkud!r}"
        )

    def test_verb_detail_scraped_flag(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> None:
        """The verb entry is flagged detail_scraped=True after the detail scrape."""
        words = self._scrape_verb(tmp_path, monkeypatch)

        entry = words.get(HIFIL_VERB_NIKKUD, {})
        assert entry.get("detail_scraped") is True, f"detail_scraped is not True after scrape for {HIFIL_VERB_SLUG}"
|
||||
207
tests/test_sentence_difficulty.py
Normal file
207
tests/test_sentence_difficulty.py
Normal file
|
|
@ -0,0 +1,207 @@
|
|||
"""Tests for sentence difficulty scoring."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
import frequency_lookup
|
||||
from sentence_difficulty import DEFAULT_RANK, _resolve_token_frequency, build_nikkud_map, score_sentence
|
||||
|
||||
|
||||
class TestBuildNikkudMap:
    """Checks that build_nikkud_map maps vocalised forms to their ktiv_male spelling."""

    def test_maps_direct_headwords(self):
        """A headword's nikkud form maps to its ktiv_male."""
        mapping = build_nikkud_map({"אָב": {"word": {"nikkud": "אָב", "ktiv_male": "אב"}}})
        assert mapping["אָב"] == "אב"

    def test_maps_conjugation_forms(self):
        """Active conjugation forms and the infinitive are included in the map."""
        conjugation = {
            "active_forms": [
                {
                    "person": "1s",
                    "tense": "עָבָר",
                    "form": {"nikkud": "שָׁמַרְתִּי", "ktiv_male": "שמרתי"},
                },
            ],
            "infinitive": {"nikkud": "לִשְׁמֹר", "ktiv_male": "לשמור"},
            "reference_form": {"nikkud": "שָׁמַר", "ktiv_male": "שמר"},
        }
        words = {
            "שָׁמַר": {
                "word": {"nikkud": "שָׁמַר", "ktiv_male": "שמר"},
                "conjugation": conjugation,
            }
        }

        mapping = build_nikkud_map(words)

        assert mapping["שָׁמַרְתִּי"] == "שמרתי"
        assert mapping["לִשְׁמֹר"] == "לשמור"

    def test_maps_noun_inflections(self):
        """Plural and pronominal-suffix noun forms are included in the map."""
        inflection = {
            "singular": {"nikkud": "אָב", "ktiv_male": "אב"},
            "plural": {"nikkud": "אָבוֹת", "ktiv_male": "אבות"},
            "pronominal_suffixes": {"1s": {"nikkud": "אָבִי", "ktiv_male": "אבי"}},
        }
        words = {
            "אָב": {
                "word": {"nikkud": "אָב", "ktiv_male": "אב"},
                "noun_inflection": inflection,
            }
        }

        mapping = build_nikkud_map(words)

        assert mapping["אָבוֹת"] == "אבות"
        assert mapping["אָבִי"] == "אבי"

    def test_maps_adjective_inflections(self):
        """Feminine-singular and masculine-plural adjective forms are included in the map."""
        inflection = {
            "ms": {"nikkud": "גָּדוֹל", "ktiv_male": "גדול"},
            "fs": {"nikkud": "גְּדוֹלָה", "ktiv_male": "גדולה"},
            "mp": {"nikkud": "גְּדוֹלִים", "ktiv_male": "גדולים"},
            "fp": {"nikkud": "גְּדוֹלוֹת", "ktiv_male": "גדולות"},
        }
        words = {
            "גָּדוֹל": {
                "word": {"nikkud": "גָּדוֹל", "ktiv_male": "גדול"},
                "adjective_inflection": inflection,
            }
        }

        mapping = build_nikkud_map(words)

        assert mapping["גְּדוֹלָה"] == "גדולה"
        assert mapping["גְּדוֹלִים"] == "גדולים"

    def test_construct_forms_strip_maqaf(self):
        """A construct form ending in a maqaf is present both with and without it."""
        words = {
            "בֵּית": {
                "word": {"nikkud": "בֵּית", "ktiv_male": "בית"},
                "noun_inflection": {
                    "construct_singular": {"nikkud": "בֵּית־", "ktiv_male": "בית"},
                },
            }
        }

        mapping = build_nikkud_map(words)

        assert "בֵּית־" in mapping
        # NOTE(review): the headword's own nikkud is the same maqaf-less string,
        # so this assertion may pass even if stripping did not happen — confirm.
        assert "בֵּית" in mapping

    def test_handles_missing_fields(self):
        """None-valued inflection/conjugation blocks do not prevent the headword mapping."""
        words = {
            "test": {
                "word": {"nikkud": "טֶסְט", "ktiv_male": "טסט"},
                "conjugation": None,
                "noun_inflection": None,
                "adjective_inflection": None,
            }
        }

        assert build_nikkud_map(words)["טֶסְט"] == "טסט"

    def test_real_words_json_coverage(self):
        """With the real data/words.json present, the map holds more than 90,000 forms."""
        words_path = Path(__file__).parents[1] / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")

        words = json.loads(words_path.read_text(encoding="utf-8"))

        assert len(build_nikkud_map(words)) > 90_000
|
||||
|
||||
|
||||
class TestResolveTokenFrequency:
    """Checks for _resolve_token_frequency() against the real frequency data and words.json."""

    @pytest.fixture()
    def freq_setup(self):
        """Provide (nikkud_map, nikkud_index, freq_data) built from the project data.

        Skips the requesting test when data/words.json is not available.
        """
        frequency_lookup.load()
        freq_data = frequency_lookup.get_freq_data()

        words_path = Path(__file__).parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")
        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)

        from epub_examples import _build_nikkud_index

        nikkud_map = build_nikkud_map(words)
        nikkud_index = _build_nikkud_index(words)
        return nikkud_map, nikkud_index, freq_data

    def test_tier1_known_mapping(self, freq_setup):
        """A token expected to be in the nikkud map resolves to a rank below 50,000."""
        nikkud_map, nikkud_index, freq_data = freq_setup
        rank = _resolve_token_frequency("אָב", nikkud_map, nikkud_index, freq_data)
        assert rank is not None
        assert rank < 50_000

    def test_tier3_academy_converter(self, freq_setup):
        """A very common word resolves to a rank below 1,000."""
        nikkud_map, nikkud_index, freq_data = freq_setup
        rank = _resolve_token_frequency("שָׁלוֹם", nikkud_map, nikkud_index, freq_data)
        assert rank is not None
        assert rank < 1000

    def test_unknown_token_returns_default(self, freq_setup):
        """A token that cannot be resolved gets the module's DEFAULT_RANK."""
        nikkud_map, nikkud_index, freq_data = freq_setup
        rank = _resolve_token_frequency("קְסַנְתּוֹפּוּלוֹס", nikkud_map, nikkud_index, freq_data)
        # Compare against the imported constant rather than a duplicated literal,
        # so the test follows sentence_difficulty if the default is ever retuned.
        assert rank == DEFAULT_RANK

    def test_tier5_ktiv_male_prefix_strip(self, freq_setup):
        """The frequency data contains the unprefixed ktiv_male word."""
        nikkud_map, nikkud_index, freq_data = freq_setup
        # NOTE(review): this only checks the frequency table itself; it never
        # calls _resolve_token_frequency with a prefixed token, so the prefix
        # stripping named in the test title is not exercised — confirm intent.
        assert freq_data.get("שלום") is not None
|
||||
|
||||
|
||||
class TestScoreSentence:
    """Tests for ``score_sentence`` run against the real project data."""

    @pytest.fixture()
    def scoring_setup(self):
        """Return ``(nikkud_map, nikkud_index, freq_data)`` built from words.json.

        The requesting test is skipped when ``data/words.json`` is absent.
        """
        words_path = Path(__file__).parent.parent / "data" / "words.json"
        # Check for the data file first so a skipped test never pays for
        # (or fails inside) the frequency-data load.
        if not words_path.exists():
            pytest.skip("words.json not available")

        frequency_lookup.load()
        freq_data = frequency_lookup.get_freq_data()

        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)

        # Imported lazily, as in the original fixture.
        from epub_examples import _build_nikkud_index

        nikkud_map = build_nikkud_map(words)
        nikkud_index = _build_nikkud_index(words)
        return nikkud_map, nikkud_index, freq_data

    @staticmethod
    def _span(text, target):
        """Return ``(start, end)`` character offsets of *target* within *text*."""
        start = text.index(target)
        return start, start + len(target)

    def test_returns_integer(self, scoring_setup):
        """The score of an ordinary sentence is an ``int``."""
        nmap, nidx, freq = scoring_setup
        text = "הוּא הָלַךְ הַבַּיְתָה"
        start, end = self._span(text, "הָלַךְ")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_easy_sentence_scores_lower(self, scoring_setup):
        """The "easy" sentence scores strictly lower than the "hard" one."""
        nmap, nidx, freq = scoring_setup
        easy = "הוּא אָמַר שָׁלוֹם"
        easy_start, easy_end = self._span(easy, "אָמַר")
        hard = "הַפַּרְדֵּס נִשְׁתַּטֵּחַ בַּדַּהֲרָה"
        hard_start, hard_end = self._span(hard, "נִשְׁתַּטֵּחַ")
        easy_score = score_sentence(easy, easy_start, easy_end, nmap, nidx, freq)
        hard_score = score_sentence(hard, hard_start, hard_end, nmap, nidx, freq)
        assert easy_score < hard_score

    def test_single_context_token(self, scoring_setup):
        """A two-word sentence with the target at offset 0 still yields an ``int``."""
        nmap, nidx, freq = scoring_setup
        text = "הוּא טוֹב"
        start = 0
        end = len("הוּא")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_handles_punctuation(self, scoring_setup):
        """Quotes and an exclamation mark in the text do not break scoring."""
        nmap, nidx, freq = scoring_setup
        text = '"הוּא טוֹב!"'
        start, end = self._span(text, "טוֹב")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_splits_on_maqaf(self, scoring_setup):
        """A maqaf-joined compound in the context still yields an ``int`` score."""
        nmap, nidx, freq = scoring_setup
        text = "בֵּית־סֵפֶר גָּדוֹל"
        start, end = self._span(text, "גָּדוֹל")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_no_context_tokens_returns_default(self, scoring_setup):
        """For this two-letter input the score equals ``DEFAULT_RANK``."""
        nmap, nidx, freq = scoring_setup
        text = "א ב"
        score = score_sentence(text, 0, 1, nmap, nidx, freq)
        assert score == DEFAULT_RANK
|
||||
|
|
@ -25,8 +25,7 @@ def test_apkg_builder_imports():
|
|||
|
||||
def test_data_files_exist():
    """Each required data file is present in the repository's ``data`` directory."""
    data_dir = Path(__file__).resolve().parent.parent / "data"
    # (file name, assertion message) pairs, checked in this order.
    required = (
        ("hebrew_dict_for_anki.csv", "vocab CSV missing"),
        ("conjugations.json", "conjugations cache missing"),
        ("words.json", "words.json missing"),
    )
    for file_name, failure_message in required:
        assert (data_dir / file_name).exists(), failure_message
|
||||
|
||||
|
||||
def test_strip_nikkud_idempotent():
|
||||
|
|
@ -42,4 +41,18 @@ def test_strip_nikkud_all_marks():
|
|||
# Comprehensive: patach, kamatz, segol, tsere, hiriq, holam, kubutz, shva, dagesh
|
||||
nikkud = "הַמַּלְכָּה"
|
||||
plain = strip_nikkud(nikkud)
|
||||
assert all(ch < "\u0591" or ch > "\u05C7" for ch in plain), f"Residual nikkud in: {plain}"
|
||||
assert all(ch < "\u0591" or ch > "\u05c7" for ch in plain), f"Residual nikkud in: {plain}"
|
||||
|
||||
|
||||
def test_categorize_pos_no_substring_match():
    """Regression: 'Pronoun' must NOT match 'Noun' category."""
    from apkg_builder import _categorize_pos

    # The four main categories map to themselves.
    for pos in ("Noun", "Verb", "Adjective", "Adverb"):
        assert _categorize_pos(pos) == pos

    # The regression case: "Pronoun" contains "noun" but is not a Noun.
    assert _categorize_pos("Pronoun") == "Other", "Pronoun must not match Noun"

    # Everything else falls into the catch-all bucket.
    for pos in ("Preposition", "Conjunction", "Cardinal numeral"):
        assert _categorize_pos(pos) == "Other"
|
||||
|
|
|
|||
|
|
@ -238,7 +238,11 @@ def validate_apkg(apkg_path: Path) -> int:
|
|||
|
||||
notes_with_audio = sum(1 for (flds,) in notes_flds if "[sound:" in flds)
|
||||
pct = notes_with_audio / note_count * 100 if note_count else 0
|
||||
check("Notes with audio", notes_with_audio > 0, f"{notes_with_audio:,}/{note_count:,} ({pct:.0f}%)")
|
||||
if notes_with_audio > 0:
|
||||
check("Notes with audio", True, f"{notes_with_audio:,}/{note_count:,} ({pct:.0f}%)")
|
||||
else:
|
||||
# Non-audio variants intentionally have no audio — not a failure
|
||||
warn("No audio in this deck variant", f"0/{note_count:,}")
|
||||
|
||||
# --- Empty fields check ---
|
||||
print("\n[Field content]")
|
||||
|
|
|
|||
|
|
@ -1,256 +0,0 @@
|
|||
#!/usr/bin/env python3
|
||||
"""
|
||||
Validate nevo_typed_verbs_from_modern_hebrew against pealim.com.
|
||||
|
||||
For each verb:
|
||||
1. Classifies it by position in the file (Pa'al/Nif'al/Pi'el/Pu'al/Hitpa'el/Hif'il/Huf'al)
|
||||
2. Searches pealim.com to find URL slug
|
||||
3. Fetches the page to confirm the binyan
|
||||
4. Flags known-problem entries and detects: not-found, binyan mismatch, suspected typos
|
||||
|
||||
Output:
|
||||
verbs_input.txt — cleaned verb list for conjugation_extract.py
|
||||
Printed validation report table
|
||||
|
||||
Usage:
|
||||
python3 validate_verb_list.py
|
||||
|
||||
After running, review verbs_input.txt (especially REVIEW-flagged entries) before
|
||||
running conjugation extraction.
|
||||
"""
|
||||
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
import urllib.parse
|
||||
from pathlib import Path
|
||||
|
||||
import requests
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Site root from which the search and dictionary URLs are built.
PEALIM_BASE = "https://www.pealim.com"
# Seconds slept before each HTTP request issued from main().
REQUEST_DELAY = 1.5
# Timeout (seconds) passed to every session.get() call.
REQUEST_TIMEOUT = 15
# Input verb list, located next to this script.
SOURCE_FILE = Path(__file__).parent / "nevo_typed_verbs_from_modern_hebrew"
# Cleaned verb list written by main(), located next to this script.
OUTPUT_FILE = Path(__file__).parent / "verbs_input.txt"

# Known problem entries: word → (action, note)
# action: "REVIEW" = comment out and flag, "3ms" = treat as 3ms past form
KNOWN_ISSUES: dict[str, tuple[str, str]] = {
    "לגבוה": ("REVIEW", "not a standard infinitive form; likely defective spelling or wrong word"),
    "לההרג": ("REVIEW", "extra ה; should probably be להיהרג (Nif'al of הרג)"),
    "להתלקלח": ("REVIEW", "not a real word; likely typo for להתקלקל"),
    "להקלל": ("REVIEW", "ambiguous: could be Hif'il לְהָקֵל (to ease) or Nif'al of קלל"),
    "המציא": ("3ms", "Hif'il 3ms past form, not an infinitive"),
    "קומם": ("3ms", "ambiguous: Pu'al 3ms past; Pi'el infinitive is לְקוֹמֵם"),
}

# Expected binyan by line range (1-indexed) per plan analysis.
# range() ends are exclusive, so e.g. range(1, 18) covers lines 1-17.
LINE_RANGES: list[tuple[range, str]] = [
    (range(1, 18), "Pa'al"),
    (range(18, 29), "Nif'al"),
    (range(29, 37), "Pi'el"),
    (range(37, 43), "Pu'al"),
    (range(43, 53), "Hitpa'el"),
    (range(53, 63), "Hif'il"),
    (range(63, 71), "Huf'al"),
]

# Comment header emitted above each binyan section of the output file.
# Iteration order of this dict is the section order in the output.
SECTION_HEADERS: dict[str, str] = {
    "Pa'al": "# Pa'al (פָּעַל)",
    "Nif'al": "# Nif'al (נִפְעַל)",
    "Pi'el": "# Pi'el (פִּעֵל)",
    "Pu'al": "# Pu'al (פֻּעַל) — 3ms past, no infinitive",
    "Hitpa'el": "# Hitpa'el (הִתְפַּעֵל)",
    "Hif'il": "# Hif'il (הִפְעִיל)",
    "Huf'al": "# Huf'al (הֻפְעַל) — 3ms past, no infinitive",
}

# Shared HTTP session; every request made through it sends this User-Agent.
session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (compatible; pealim-anki/3.0)"})
|
||||
|
||||
|
||||
def classify_by_line(line_num: int) -> str:
    """Map a 1-indexed source line number to its expected binyan.

    Returns the label of the first ``LINE_RANGES`` entry whose range contains
    *line_num*, or ``"Unknown"`` when no range matches.
    """
    matching_labels = (label for span, label in LINE_RANGES if line_num in span)
    return next(matching_labels, "Unknown")
|
||||
|
||||
|
||||
def find_slug(query: str) -> str | None:
    """Search pealim.com for *query* and return the first dictionary slug found.

    The slug is the ``<digits>-<name>`` path segment of the first
    ``/dict/<slug>/`` link in the search-result HTML.

    Returns ``None`` when the page contains no such link, or when the HTTP
    request fails (the failure is reported on stderr).
    """
    url = f"{PEALIM_BASE}/search/?q={urllib.parse.quote(query)}"
    try:
        # Only the network calls can fail here, so only they are guarded.
        resp = session.get(url, timeout=REQUEST_TIMEOUT)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f" ERROR searching {query!r}: {e}", file=sys.stderr)
        return None

    # Only the first hit is needed, so stop scanning at the first match.
    match = re.search(r"/dict/(\d+-[^/?\"<>\s]+)/", resp.text)
    return match.group(1) if match else None
|
||||
|
||||
|
||||
def get_page_binyan(slug: str) -> str:
    """Fetch ``/dict/<slug>/`` and return the binyan name found on the page.

    The ``h3.page-header`` elements are searched first, then the
    ``og:description`` meta tag. Returns ``""`` when no binyan name is found
    or when fetching/parsing fails (the failure is reported on stderr).
    """
    page_url = f"{PEALIM_BASE}/dict/{slug}/"
    known_binyanim = ("Pa'al", "Nif'al", "Pi'el", "Pu'al", "Hitpa'el", "Hif'il", "Huf'al")

    def first_binyan_in(text):
        # First known name (in list order) occurring in text, else "".
        return next((name for name in known_binyanim if name in text), "")

    try:
        response = session.get(page_url, cookies={"hebstyle": "mo"}, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        document = BeautifulSoup(response.text, "lxml")

        for heading in document.find_all("h3", class_="page-header"):
            found = first_binyan_in(heading.get_text(" ", strip=True))
            if found:
                return found

        # Fall back to the page's Open Graph description.
        og_tag = document.find("meta", {"property": "og:description"})
        if og_tag:
            found = first_binyan_in(og_tag.get("content", ""))
            if found:
                return found
    except Exception as e:
        print(f" ERROR fetching {slug}: {e}", file=sys.stderr)
    return ""
|
||||
|
||||
|
||||
def _validate_entry(line_num: int, total: int, word: str) -> dict:
    """Validate one verb against pealim.com and return its result record.

    Prints a one-line progress message. Entries flagged ``REVIEW`` in
    ``KNOWN_ISSUES`` are recorded without any network request.
    """
    expected_binyan = classify_by_line(line_num)
    issue_type, issue_note = KNOWN_ISSUES.get(word, (None, ""))

    # Positions 37-42 (Pu'al) and 63-70 (Huf'al) are 3ms past forms
    is_3ms_by_position = expected_binyan in ("Pu'al", "Huf'al")

    print(f"[{line_num:2d}/{total}] {word:<20}", end=" ", flush=True)

    if issue_type == "REVIEW":
        # Don't query pealim for known-bad entries
        print("REVIEW (skipping query)")
        return {
            "line": line_num,
            "word": word,
            "expected_binyan": expected_binyan,
            "slug": "",
            "page_binyan": "",
            "status": "REVIEW",
            "notes": issue_note,
            "is_3ms": is_3ms_by_position,
        }

    time.sleep(REQUEST_DELAY)
    slug = find_slug(word)

    if slug:
        time.sleep(REQUEST_DELAY)
        page_binyan = get_page_binyan(slug)
    else:
        page_binyan = ""

    # Determine status. The 3ms check comes first, so a 3ms entry keeps that
    # status even when the search found nothing.
    if issue_type == "3ms" or is_3ms_by_position:
        status = "3ms"
        notes = issue_note or "Pu'al/Huf'al 3ms past form"
    elif not slug:
        status = "NOT_FOUND"
        notes = "no search result on pealim.com"
    elif page_binyan and expected_binyan and page_binyan != expected_binyan:
        status = "MISMATCH"
        notes = f"expected {expected_binyan}, page says {page_binyan}"
    else:
        status = "OK"
        notes = ""

    print(f"{status:<12} slug={slug or '-':<35} binyan={page_binyan or '-'}")
    return {
        "line": line_num,
        "word": word,
        "expected_binyan": expected_binyan,
        "slug": slug or "",
        "page_binyan": page_binyan,
        "status": status,
        "notes": notes,
        "is_3ms": is_3ms_by_position or issue_type == "3ms",
    }


def _build_output_lines(results: list[dict]) -> list[str]:
    """Render the result records as the lines of the cleaned verb list."""
    sections: dict[str, list[str]] = {b: [] for b in SECTION_HEADERS}
    review_lines: list[str] = []

    for r in results:
        b = r["expected_binyan"]
        if b not in sections:
            # Entries outside every known line range go into the first section.
            b = next(iter(sections))

        if r["status"] == "REVIEW":
            review_lines.append(f"# REVIEW: {r['word']} — {r['notes']}")
        elif r["status"] == "3ms":
            sections[b].append(f"# 3ms: {r['word']}")
        elif r["status"] in ("OK", "MISMATCH"):
            sections[b].append(r["word"])
        else:  # NOT_FOUND
            sections[b].append(f"# NOT_FOUND: {r['word']} — {r['notes']}")

    output_lines = [
        "# Verb list — validated against pealim.com from nevo_typed_verbs_from_modern_hebrew",
        "# Lines prefixed '# 3ms:' are searched by 3ms past form (Pu'al/Huf'al).",
        "# Lines prefixed '# REVIEW:' need manual correction before conjugation extraction.",
        "# Lines prefixed '# NOT_FOUND:' had no pealim.com result — check spelling.",
        "",
    ]
    # Empty sections are omitted entirely (no header).
    for binyan, header in SECTION_HEADERS.items():
        if sections.get(binyan):
            output_lines.append(header)
            output_lines.extend(sections[binyan])
            output_lines.append("")

    if review_lines:
        output_lines.append("# ── Entries flagged for manual review ──────────────────────────────────────────")
        output_lines.extend(review_lines)
        output_lines.append("")

    return output_lines


def _print_report(results: list[dict]) -> None:
    """Print the per-entry validation table followed by status totals."""
    print("\n" + "=" * 95)
    print("VALIDATION REPORT")
    print("=" * 95)
    print(f"{'#':>4} {'Verb':<22} {'Status':<14} {'Slug':<38} {'Binyan':<12} Notes")
    print("-" * 95)
    for r in results:
        print(
            f"{r['line']:>4} {r['word']:<22} {r['status']:<14} "
            f"{r['slug'][:36]:<38} {r['page_binyan'] or '-':<12} {r['notes']}"
        )
    print("=" * 95)

    counts = {s: sum(1 for r in results if r["status"] == s) for s in ("OK", "3ms", "MISMATCH", "REVIEW", "NOT_FOUND")}
    print(
        f"\nSummary: {counts['OK']} OK | {counts['3ms']} 3ms-past | "
        f"{counts['MISMATCH']} MISMATCH | {counts['REVIEW']} REVIEW | {counts['NOT_FOUND']} NOT_FOUND"
    )
    print(f"Total entries: {len(results)}")

    if counts["REVIEW"] > 0 or counts["NOT_FOUND"] > 0 or counts["MISMATCH"] > 0:
        print("\n⚠ Review flagged entries in verbs_input.txt before running:\n python3 conjugation_extract.py")


def main() -> None:
    """Validate every verb in ``SOURCE_FILE`` and write ``OUTPUT_FILE``.

    Exits with status 1 when the source file is missing. Otherwise each
    non-blank line is validated in order, the cleaned list is written, and a
    report is printed.
    """
    if not SOURCE_FILE.exists():
        print(f"ERROR: {SOURCE_FILE} not found", file=sys.stderr)
        sys.exit(1)

    # Blank lines are dropped, so line numbers count non-blank entries only.
    lines = [line.strip() for line in SOURCE_FILE.read_text(encoding="utf-8").splitlines() if line.strip()]
    print(f"Loaded {len(lines)} entries from {SOURCE_FILE.name}")
    print(f"Querying pealim.com (delay {REQUEST_DELAY}s per request)…\n")

    results = [_validate_entry(line_num, len(lines), word) for line_num, word in enumerate(lines, start=1)]

    # ── Write cleaned verbs_input.txt ────────────────────────────────────────────
    OUTPUT_FILE.write_text("\n".join(_build_output_lines(results)), encoding="utf-8")
    print(f"\nWrote → {OUTPUT_FILE}")

    # ── Print summary table ──────────────────────────────────────────────────────
    _print_report(results)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
|
|
@ -1,3 +0,0 @@
|
|||
# Vulture whitelist: suppress false positives for interface methods
|
||||
# HTMLParser.handle_starttag requires (self, tag, attrs) signature
|
||||
attrs # noqa
|
||||
Loading…
Reference in a new issue