Compare commits
9 commits
b3ea086e85
...
138acb06d8
| Author | SHA1 | Date | |
|---|---|---|---|
| 138acb06d8 | |||
| 0a85291975 | |||
| 14d567a261 | |||
| 8b24d0fd26 | |||
| 272a2a080d | |||
| fb12f806a8 | |||
| 00fba934fb | |||
| d2a7c9d483 | |||
| d0f4aea58d |
9 changed files with 22481 additions and 17734 deletions
|
|
@ -69,6 +69,7 @@ entry:
|
|||
cloze_word_end: 4 # End offset — enables exact extraction regardless of nikkud changes
|
||||
cloze_hint: "family member"
|
||||
cloze_guid: "def456..." # GUID for the cloze note
|
||||
difficulty_score: 234 # Median frequency rank of context words (lower = easier); optional
|
||||
rejected_count: 0
|
||||
|
||||
# --- Noun-specific: Inflection Forms ---
|
||||
|
|
|
|||
|
|
@ -35,7 +35,7 @@ COMPLETE_PLURAL_DECK_ID = 1_234_567_903
|
|||
|
||||
# Release version tag added to all notes so users can identify which release
|
||||
# their cards come from (visible in Anki's Browse view and card info).
|
||||
RELEASE_TAG = "v0.19"
|
||||
RELEASE_TAG = "v0.20"
|
||||
|
||||
# Regex for extracting emoji and Hebrew prepositions from meaning strings
|
||||
EMOJI_RE = re.compile(r"[\U0001F000-\U0001FFFF\u2600-\u27FF\u2300-\u23FF\uFE00-\uFE0F]+")
|
||||
|
|
|
|||
39683
data/words.json
39683
data/words.json
File diff suppressed because it is too large
Load diff
|
|
@ -54,6 +54,8 @@ def _score(s: dict) -> tuple[int,]:
|
|||
|
||||
The new scoring replaces sentence length with frequency-based difficulty as the ranking criterion. The `_score` function gains access to the frequency pipeline via closure over the `nikkud_map`, `nikkud_index`, and `freq_data` objects, which are built once at the start of `update_words_json()`.
|
||||
|
||||
**Minimum sentence length:** Reduced from 4 words to 3 words (`MIN_WORDS = 3` in epub_examples.py). Hebrew is more concise than English — 3-word sentences are valid and common. This expands the candidate pool for cloze selection.
|
||||
|
||||
**Behavioral change:** Because `pool.sort(key=_score)` determines which 3 sentences are selected as `best = pool[:3]`, changing the scoring function changes **which sentences are selected**, not just their order. This is intentional — we want the easiest sentences as cloze candidates, not the closest-to-9-words ones. Existing cloze GUIDs will be preserved when the same sentence text is re-selected; entries where a different sentence wins will get new GUIDs.
|
||||
|
||||
## Data Model Changes
|
||||
|
|
|
|||
|
|
@ -18,7 +18,9 @@ import zipfile
|
|||
from html.parser import HTMLParser
|
||||
from pathlib import Path
|
||||
|
||||
import frequency_lookup
|
||||
from helpers import strip_nikkud
|
||||
from sentence_difficulty import build_nikkud_map, score_sentence
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
|
@ -57,7 +59,7 @@ def _discover_epubs() -> dict[str, str]:
|
|||
|
||||
|
||||
# Sentence length bounds (word count)
|
||||
MIN_WORDS = 4
|
||||
MIN_WORDS = 3
|
||||
MAX_WORDS = 15
|
||||
|
||||
|
||||
|
|
@ -448,6 +450,10 @@ def _try_strip_prefix(token: str, nikkud_index: dict) -> list[tuple[str, str, st
|
|||
return results
|
||||
|
||||
|
||||
# Public alias for use by sentence_difficulty module
|
||||
try_strip_prefix = _try_strip_prefix
|
||||
|
||||
|
||||
def _build_nikkud_index(words: dict) -> dict[str, list[tuple[str, str]]]:
|
||||
"""Build a mapping from nikkud form to list of (unique_key, match_type).
|
||||
|
||||
|
|
@ -654,6 +660,11 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
|
|||
|
||||
updated = 0
|
||||
|
||||
# Build frequency scoring infrastructure (once for all words)
|
||||
nikkud_index = _build_nikkud_index(words)
|
||||
nikkud_map = build_nikkud_map(words)
|
||||
freq_data = frequency_lookup.get_freq_data()
|
||||
|
||||
for unique_key, sent_list in matches.items():
|
||||
if unique_key not in words:
|
||||
continue
|
||||
|
|
@ -673,11 +684,18 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
|
|||
prefix_only = [s for s in unique if "prefix" in s["match_method"]]
|
||||
pool = direct if direct else prefix_only
|
||||
|
||||
# Score: prefer 6–12 word sentences
|
||||
# Score: prefer sentences with easier (more common) context words
|
||||
def _score(s: dict) -> tuple[int,]:
|
||||
wc = s["word_count"]
|
||||
length_score = abs(wc - 9) if not (6 <= wc <= 12) else 0
|
||||
return (length_score,)
|
||||
return (
|
||||
score_sentence(
|
||||
s["text"],
|
||||
s["char_offset"],
|
||||
s["char_end"],
|
||||
nikkud_map,
|
||||
nikkud_index,
|
||||
freq_data,
|
||||
),
|
||||
)
|
||||
|
||||
pool.sort(key=_score)
|
||||
best = pool[:3]
|
||||
|
|
@ -712,6 +730,7 @@ def update_words_json(words: dict, matches: dict, confusable_keys: set[str]) ->
|
|||
"cloze_word_end": top["char_end"],
|
||||
"cloze_hint": None,
|
||||
"cloze_guid": cloze_guid,
|
||||
"difficulty_score": _score(top)[0],
|
||||
}
|
||||
elif is_confusable:
|
||||
examples.pop("cloze", None)
|
||||
|
|
|
|||
|
|
@ -74,6 +74,16 @@ def get_frequency_rank(word_no_nikkud: str) -> int | None:
|
|||
return _freq.get(clean)
|
||||
|
||||
|
||||
def get_freq_data() -> dict[str, int]:
    """Return the module-level frequency table (word -> rank).

    When the table is still empty, ``load()`` is called first so callers do
    not have to initialise the module themselves. The table object itself is
    returned, not a copy.
    """
    if _freq:
        return _freq
    # Table is empty: populate it, then hand back whatever load() produced.
    load()
    return _freq
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
logging.basicConfig(level=logging.INFO, format="%(levelname)s %(message)s")
|
||||
load()
|
||||
|
|
|
|||
198
sentence_difficulty.py
Normal file
198
sentence_difficulty.py
Normal file
|
|
@ -0,0 +1,198 @@
|
|||
"""Sentence difficulty scoring by context-word frequency.
|
||||
|
||||
Scores sentences by the median frequency rank of context words
|
||||
(excluding the cloze target). Lower score = easier sentence.
|
||||
Used by epub_examples.py to select the best cloze sentence.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from statistics import median
|
||||
|
||||
import helpers
|
||||
import nikkud_to_ktiv_male
|
||||
|
||||
# Rank returned for tokens that none of the lookup tiers can resolve.
DEFAULT_RANK = 50_000

# Hebrew one-letter prefixes (ב, ה, ו, כ, ל, מ, ש) used for ktiv_male prefix
# stripping (tier 5). ע is deliberately absent: it is never a prefix, and
# treating it as one would strip the first root letter of ע-initial words and
# return the rank of an unrelated stem.
_KM_PREFIX_CHARS = frozenset("בהוכלמש")

# Punctuation stripped from the edges of tokens before lookup.
_PUNCT = frozenset('.,!?;:"\'"״׳–—()[]{}')

# Maqaf (Hebrew hyphen) — joins words that are scored as separate tokens.
_MAQAF = "־"
|
||||
|
||||
|
||||
def build_nikkud_map(words: dict) -> dict[str, str]:
    """Collect every vocalized form in words.json with its ktiv_male spelling.

    For each entry the following are indexed, in this order: the headword,
    conjugation forms (active forms, hufal/pual forms, infinitive, reference
    form), noun inflections (singular, plural, both construct forms and the
    pronominal suffixes) and adjective inflections (ms/fs/mp/fp).

    Args:
        words: The full words.json dict keyed by unique_key.

    Returns:
        Dict mapping a nikkud form to its ktiv_male string. If the same
        nikkud form occurs more than once, the pair written last wins.
    """
    mapping: dict[str, str] = {}

    def _put(vocalized: str | None, plene: str | None) -> None:
        # Pairs with a missing or empty side carry no information; skip them.
        if not (vocalized and plene):
            return
        mapping[vocalized] = plene

    def _put_form(form: dict | None) -> None:
        # A form is a {"nikkud": ..., "ktiv_male": ...} dict, or None/absent.
        pair = form or {}
        _put(pair.get("nikkud"), pair.get("ktiv_male"))

    for entry in words.values():
        _put_form(entry.get("word"))

        # Conjugation forms
        conjugation = entry.get("conjugation") or {}
        for list_key in ("active_forms", "hufal_pual_forms"):
            for item in conjugation.get(list_key) or []:
                _put_form(item.get("form"))
        _put_form(conjugation.get("infinitive"))
        _put_form(conjugation.get("reference_form"))

        # Noun inflection forms
        noun = entry.get("noun_inflection") or {}
        for slot in ("singular", "plural", "construct_singular", "construct_plural"):
            cell = noun.get(slot) or {}
            vocalized = cell.get("nikkud")
            plene = cell.get("ktiv_male")
            _put(vocalized, plene)
            # Forms stored with a trailing maqaf are also indexed without it.
            if vocalized and vocalized.endswith("־") and plene:
                _put(vocalized[:-1], plene)

        suffixes = noun.get("pronominal_suffixes") or {}
        for cell in suffixes.values():
            # Only dict-valued suffix cells carry a nikkud/ktiv_male pair.
            if isinstance(cell, dict):
                _put(cell.get("nikkud"), cell.get("ktiv_male"))

        # Adjective inflection forms
        adjective = entry.get("adjective_inflection") or {}
        for slot in ("ms", "fs", "mp", "fp"):
            _put_form(adjective.get(slot))

    return mapping
|
||||
|
||||
|
||||
def _resolve_token_frequency(
    token: str,
    nikkud_map: dict[str, str],
    nikkud_index: dict,
    freq_data: dict[str, int],
) -> int:
    """Look up the frequency rank of one vocalized sentence token.

    The tiers below are tried in order and the first hit is returned:
      1. Known mapping (nikkud_map from words.json)
      2. Nikkud prefix stripping (epub_examples.try_strip_prefix)
      3. Academy rules converter (nikkud_to_ktiv_male.convert)
      4. strip_nikkud fallback (helpers.strip_nikkud)
      5. Ktiv_male prefix stripping on the converted/stripped form

    Args:
        token: A single token, already stripped of edge punctuation.
        nikkud_map: nikkud→ktiv_male mapping from build_nikkud_map().
        nikkud_index: Index passed through to try_strip_prefix().
        freq_data: Mapping of ktiv_male spelling to frequency rank.

    Returns:
        The rank found in freq_data, or DEFAULT_RANK when every tier misses.
    """
    # Tier 1: the token itself is a known vocalized form.
    known = nikkud_map.get(token)
    if known and known in freq_data:
        return freq_data[known]

    # Tier 2: strip a vocalized prefix and resolve what is left.
    # Imported here rather than at module top because epub_examples imports
    # this module, so a top-level import would be circular.
    from epub_examples import try_strip_prefix

    for _key, _kind, remainder in try_strip_prefix(token, nikkud_index):
        mapped = nikkud_map.get(remainder)
        if mapped and mapped in freq_data:
            return freq_data[mapped]

    # Tier 3: rule-based conversion to ktiv_male.
    converted = nikkud_to_ktiv_male.convert(token)
    if converted in freq_data:
        return freq_data[converted]

    # Tier 4: plain removal of the vowel marks (skipped when it yields the
    # same string tier 3 already tried).
    stripped = helpers.strip_nikkud(token)
    if stripped != converted and stripped in freq_data:
        return freq_data[stripped]

    # Tier 5: peel one or two prefix letters off either unvocalized form,
    # always leaving a stem of at least two characters.
    for candidate in (converted, stripped):
        for cut in (1, 2):
            if len(candidate) <= cut + 1:
                continue
            head, stem = candidate[:cut], candidate[cut:]
            if _KM_PREFIX_CHARS.issuperset(head) and stem in freq_data:
                return freq_data[stem]

    return DEFAULT_RANK
|
||||
|
||||
|
||||
def score_sentence(
    text: str,
    target_start: int,
    target_end: int,
    nikkud_map: dict[str, str],
    nikkud_index: dict,
    freq_data: dict[str, int],
) -> int:
    """Rate a sentence by the median frequency rank of its context words.

    The sentence is split on whitespace and then on maqaf. Any token that
    overlaps the target span is left out, edge punctuation is stripped, and
    tokens shorter than two characters after stripping are ignored.

    Args:
        text: The full sentence text (with nikkud).
        target_start: Character offset where the cloze target word starts.
        target_end: Character offset where the cloze target word ends.
        nikkud_map: nikkud→ktiv_male mapping from build_nikkud_map().
        nikkud_index: nikkud index from epub_examples._build_nikkud_index().
        freq_data: Frequency dict from frequency_lookup.get_freq_data().

    Returns:
        Median rank of the context tokens, truncated to int. Lower = easier.
        DEFAULT_RANK when no context token qualifies.
    """
    strip_chars = "".join(_PUNCT)
    ranks: list[int] = []

    cursor = 0
    for chunk in text.split():
        # Search from the cursor so repeated words map to successive offsets.
        chunk_start = text.index(chunk, cursor)
        cursor = chunk_start + len(chunk)

        piece_start = chunk_start
        for piece in chunk.split(_MAQAF):
            piece_end = piece_start + len(piece)
            begin, end = piece_start, piece_end
            piece_start = piece_end + 1  # step over the maqaf itself

            if not piece:
                continue
            # Leave out anything overlapping the cloze target span.
            if begin < target_end and end > target_start:
                continue

            word = piece.strip(strip_chars)
            if len(word) < 2:
                continue

            ranks.append(_resolve_token_frequency(word, nikkud_map, nikkud_index, freq_data))

    if ranks:
        return int(median(ranks))
    return DEFAULT_RANK
|
||||
83
tests/test_scoring_integration.py
Normal file
83
tests/test_scoring_integration.py
Normal file
|
|
@ -0,0 +1,83 @@
|
|||
"""Integration tests for frequency-based sentence scoring in update_words_json."""
|
||||
|
||||
|
||||
def _make_sentence(text, source="test", match_method="direct", word_count=None, char_offset=0, char_end=3):
    """Return a minimal sentence dict shaped like match_sentences output.

    ``word_count`` falls back to the number of whitespace-separated tokens
    in ``text`` when it is not supplied.
    """
    count = len(text.split()) if word_count is None else word_count
    return dict(
        text=text,
        source=source,
        match_method=match_method,
        word_count=count,
        char_offset=char_offset,
        char_end=char_end,
    )
|
||||
|
||||
|
||||
class TestScoringIntegration:
    """Tests that update_words_json uses frequency scoring."""

    KEY = "טוֹב"

    @classmethod
    def _fresh_words(cls):
        """Return a new single-entry words dict with empty examples."""
        return {
            cls.KEY: {
                "word": {"nikkud": "טוֹב", "ktiv_male": "טוב"},
                "examples": {},
            }
        }

    def test_cloze_has_difficulty_score(self):
        """Cloze dict includes an integer difficulty_score field."""
        from epub_examples import update_words_json

        words = self._fresh_words()
        sentence = _make_sentence("הוּא אָדָם טוֹב מְאוֹד", char_offset=10, char_end=13)

        update_words_json(words, {self.KEY: [sentence]}, confusable_keys=set())

        cloze = words[self.KEY]["examples"].get("cloze")
        assert cloze is not None
        assert "difficulty_score" in cloze
        assert isinstance(cloze["difficulty_score"], int)

    def test_vetted_sorted_by_difficulty(self):
        """Three candidate sentences yield three vetted examples.

        NOTE(review): the name promises an ordering check, but only the
        count is asserted here.
        """
        from epub_examples import update_words_json

        words = self._fresh_words()
        candidates = [
            _make_sentence("הוּא טוֹב", char_offset=4, char_end=7),
            _make_sentence("הַתַּפְנִיט טוֹב בְּיוֹתֵר", char_offset=10, char_end=13),
            _make_sentence("אֲנִי טוֹב הַיּוֹם", char_offset=5, char_end=8),
        ]

        update_words_json(words, {self.KEY: candidates}, confusable_keys=set())

        vetted = words[self.KEY]["examples"]["vetted"]
        assert len(vetted) == 3

    def test_easiest_sentence_becomes_cloze(self):
        """The sentence with the lowest difficulty score becomes the cloze."""
        from epub_examples import update_words_json

        words = self._fresh_words()
        easy_text = "הוּא טוֹב מְאוֹד"
        hard_text = "הַפַּרְנָסִימוֹן טוֹב לְהַפְלִיא"
        # The hard sentence is listed first so input order cannot pick the winner.
        candidates = [
            _make_sentence(hard_text, char_offset=14, char_end=17),
            _make_sentence(easy_text, char_offset=4, char_end=7),
        ]

        update_words_json(words, {self.KEY: candidates}, confusable_keys=set())

        cloze = words[self.KEY]["examples"]["cloze"]
        assert cloze["text"] == easy_text
|
||||
207
tests/test_sentence_difficulty.py
Normal file
207
tests/test_sentence_difficulty.py
Normal file
|
|
@ -0,0 +1,207 @@
|
|||
"""Tests for sentence difficulty scoring."""
|
||||
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
import frequency_lookup
|
||||
from sentence_difficulty import DEFAULT_RANK, _resolve_token_frequency, build_nikkud_map, score_sentence
|
||||
|
||||
|
||||
class TestBuildNikkudMap:
    """Unit tests for build_nikkud_map using small hand-built entries."""

    def test_maps_direct_headwords(self):
        """The headword's nikkud form maps to its ktiv_male."""
        words = {"אָב": {"word": {"nikkud": "אָב", "ktiv_male": "אב"}}}
        nmap = build_nikkud_map(words)
        assert nmap["אָב"] == "אב"

    def test_maps_conjugation_forms(self):
        """Active forms and the infinitive are indexed."""
        words = {
            "שָׁמַר": {
                "word": {"nikkud": "שָׁמַר", "ktiv_male": "שמר"},
                "conjugation": {
                    "active_forms": [
                        {
                            "person": "1s",
                            "tense": "עָבָר",
                            "form": {"nikkud": "שָׁמַרְתִּי", "ktiv_male": "שמרתי"},
                        },
                    ],
                    "infinitive": {"nikkud": "לִשְׁמֹר", "ktiv_male": "לשמור"},
                    "reference_form": {"nikkud": "שָׁמַר", "ktiv_male": "שמר"},
                },
            }
        }
        nmap = build_nikkud_map(words)
        assert nmap["שָׁמַרְתִּי"] == "שמרתי"
        assert nmap["לִשְׁמֹר"] == "לשמור"

    def test_maps_noun_inflections(self):
        """Plural and pronominal-suffix noun forms are indexed."""
        words = {
            "אָב": {
                "word": {"nikkud": "אָב", "ktiv_male": "אב"},
                "noun_inflection": {
                    "singular": {"nikkud": "אָב", "ktiv_male": "אב"},
                    "plural": {"nikkud": "אָבוֹת", "ktiv_male": "אבות"},
                    "pronominal_suffixes": {"1s": {"nikkud": "אָבִי", "ktiv_male": "אבי"}},
                },
            }
        }
        nmap = build_nikkud_map(words)
        assert nmap["אָבוֹת"] == "אבות"
        assert nmap["אָבִי"] == "אבי"

    def test_maps_adjective_inflections(self):
        """Feminine and plural adjective forms are indexed."""
        words = {
            "גָּדוֹל": {
                "word": {"nikkud": "גָּדוֹל", "ktiv_male": "גדול"},
                "adjective_inflection": {
                    "ms": {"nikkud": "גָּדוֹל", "ktiv_male": "גדול"},
                    "fs": {"nikkud": "גְּדוֹלָה", "ktiv_male": "גדולה"},
                    "mp": {"nikkud": "גְּדוֹלִים", "ktiv_male": "גדולים"},
                    "fp": {"nikkud": "גְּדוֹלוֹת", "ktiv_male": "גדולות"},
                },
            }
        }
        nmap = build_nikkud_map(words)
        assert nmap["גְּדוֹלָה"] == "גדולה"
        assert nmap["גְּדוֹלִים"] == "גדולים"

    def test_construct_forms_strip_maqaf(self):
        """A construct form ending in maqaf is indexed with and without it."""
        # The headword must differ from the maqaf-less construct form;
        # otherwise the headword alone would supply the key and the test
        # would pass even if maqaf stripping were broken.
        words = {
            "בַּיִת": {
                "word": {"nikkud": "בַּיִת", "ktiv_male": "בית"},
                "noun_inflection": {
                    "construct_singular": {"nikkud": "בֵּית־", "ktiv_male": "בית"},
                },
            }
        }
        nmap = build_nikkud_map(words)
        assert nmap["בֵּית־"] == "בית"
        assert nmap["בֵּית"] == "בית"

    def test_handles_missing_fields(self):
        """Sections explicitly set to None are tolerated."""
        words = {
            "test": {
                "word": {"nikkud": "טֶסְט", "ktiv_male": "טסט"},
                "conjugation": None,
                "noun_inflection": None,
                "adjective_inflection": None,
            }
        }
        nmap = build_nikkud_map(words)
        assert nmap["טֶסְט"] == "טסט"

    def test_real_words_json_coverage(self):
        """The real words.json yields more than 90,000 mapped forms."""
        words_path = Path(__file__).parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")
        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)
        nmap = build_nikkud_map(words)
        assert len(nmap) > 90_000
|
||||
|
||||
|
||||
class TestResolveTokenFrequency:
    """Tests for _resolve_token_frequency against the real project data."""

    @pytest.fixture()
    def freq_setup(self):
        """Return (nikkud_map, nikkud_index, freq_data) built from words.json.

        Skips the test when words.json is missing. The check runs before the
        frequency data is loaded so a skipped test does no loading work.
        """
        words_path = Path(__file__).parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")

        frequency_lookup.load()
        freq_data = frequency_lookup.get_freq_data()
        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)
        from epub_examples import _build_nikkud_index

        nikkud_map = build_nikkud_map(words)
        nikkud_index = _build_nikkud_index(words)
        return nikkud_map, nikkud_index, freq_data

    def test_tier1_known_mapping(self, freq_setup):
        """A headword present in words.json resolves to a real rank."""
        nikkud_map, nikkud_index, freq_data = freq_setup
        rank = _resolve_token_frequency("אָב", nikkud_map, nikkud_index, freq_data)
        assert isinstance(rank, int)
        assert rank < DEFAULT_RANK

    def test_tier3_academy_converter(self, freq_setup):
        """A very common word resolves to a rank below 1000.

        NOTE(review): if this word is itself a words.json headword it is
        answered by tier 1, not the converter — confirm which tier is hit.
        """
        nikkud_map, nikkud_index, freq_data = freq_setup
        rank = _resolve_token_frequency("שָׁלוֹם", nikkud_map, nikkud_index, freq_data)
        assert isinstance(rank, int)
        assert rank < 1000

    def test_unknown_token_returns_default(self, freq_setup):
        """A token no tier can resolve falls back to DEFAULT_RANK."""
        nikkud_map, nikkud_index, freq_data = freq_setup
        rank = _resolve_token_frequency("קְסַנְתּוֹפּוּלוֹס", nikkud_map, nikkud_index, freq_data)
        assert rank == DEFAULT_RANK

    def test_tier5_ktiv_male_prefix_strip(self, freq_setup):
        """A prefixed form missing from freq_data resolves via its stem."""
        _nikkud_map, _nikkud_index, freq_data = freq_setup
        # Precondition on the real data: the stem is a known word.
        assert freq_data.get("שלום") is not None

        # Empty map/index make tiers 1-2 miss, and a one-word frequency table
        # makes tiers 3-4 miss for the prefixed form, so only tier 5 can
        # produce the stem's rank.
        # Assumes try_strip_prefix accepts an empty index and that the
        # converter leaves unvocalized text usable — TODO confirm.
        synthetic_freq = {"שלום": 7}
        rank = _resolve_token_frequency("ושלום", {}, {}, synthetic_freq)
        assert rank == 7
|
||||
|
||||
|
||||
class TestScoreSentence:
    """Tests for score_sentence against the real project data."""

    @pytest.fixture()
    def scoring_setup(self):
        """Return (nikkud_map, nikkud_index, freq_data); skip without words.json."""
        frequency_lookup.load()
        freq_data = frequency_lookup.get_freq_data()
        words_path = Path(__file__).parents[1] / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")
        words = json.loads(words_path.read_text(encoding="utf-8"))
        from epub_examples import _build_nikkud_index

        return build_nikkud_map(words), _build_nikkud_index(words), freq_data

    @staticmethod
    def _span(text, word):
        """Return (start, end) character offsets of the first ``word`` in ``text``."""
        begin = text.index(word)
        return begin, begin + len(word)

    def test_returns_integer(self, scoring_setup):
        """The score is an int."""
        text = "הוּא הָלַךְ הַבַּיְתָה"
        begin, end = self._span(text, "הָלַךְ")
        assert isinstance(score_sentence(text, begin, end, *scoring_setup), int)

    def test_easy_sentence_scores_lower(self, scoring_setup):
        """A sentence of common words scores lower than one of rare words."""
        easy = "הוּא אָמַר שָׁלוֹם"
        hard = "הַפַּרְדֵּס נִשְׁתַּטֵּחַ בַּדַּהֲרָה"
        easy_score = score_sentence(easy, *self._span(easy, "אָמַר"), *scoring_setup)
        hard_score = score_sentence(hard, *self._span(hard, "נִשְׁתַּטֵּחַ"), *scoring_setup)
        assert easy_score < hard_score

    def test_single_context_token(self, scoring_setup):
        """A sentence with a single context word still yields an int."""
        text = "הוּא טוֹב"
        begin, end = self._span(text, "הוּא")
        assert isinstance(score_sentence(text, begin, end, *scoring_setup), int)

    def test_handles_punctuation(self, scoring_setup):
        """Quotes and exclamation marks around tokens do not break scoring."""
        text = '"הוּא טוֹב!"'
        begin, end = self._span(text, "טוֹב")
        assert isinstance(score_sentence(text, begin, end, *scoring_setup), int)

    def test_splits_on_maqaf(self, scoring_setup):
        """A maqaf-joined compound in the context still yields an int."""
        text = "בֵּית־סֵפֶר גָּדוֹל"
        begin, end = self._span(text, "גָּדוֹל")
        assert isinstance(score_sentence(text, begin, end, *scoring_setup), int)

    def test_no_context_tokens_returns_default(self, scoring_setup):
        """When every context token is filtered out, DEFAULT_RANK is returned."""
        text = "א ב"
        assert score_sentence(text, 0, 1, *scoring_setup) == DEFAULT_RANK
|
||||
Loading…
Reference in a new issue