Implements build_nikkud_map(), _resolve_token_frequency(), and score_sentence() for v0.20 adaptive cloze sentence selection. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
207 lines
8.2 KiB
Python
207 lines
8.2 KiB
Python
"""Tests for sentence difficulty scoring."""
|
||
|
||
import json
|
||
from pathlib import Path
|
||
|
||
import pytest
|
||
|
||
import frequency_lookup
|
||
from sentence_difficulty import DEFAULT_RANK, _resolve_token_frequency, build_nikkud_map, score_sentence
|
||
|
||
|
||
class TestBuildNikkudMap:
    """Tests for ``build_nikkud_map``.

    Each test passes a small hand-built ``words`` dictionary and checks that
    the returned mapping is keyed by vocalized (nikkud) forms and yields the
    matching ``ktiv_male`` spelling.
    """

    def test_maps_direct_headwords(self):
        """The headword's own nikkud form is mapped to its ktiv male."""
        words = {"אָב": {"word": {"nikkud": "אָב", "ktiv_male": "אב"}}}
        nmap = build_nikkud_map(words)
        assert nmap["אָב"] == "אב"

    def test_maps_conjugation_forms(self):
        """Active conjugation forms and the infinitive are both mapped."""
        words = {
            "שָׁמַר": {
                "word": {"nikkud": "שָׁמַר", "ktiv_male": "שמר"},
                "conjugation": {
                    "active_forms": [
                        {
                            "person": "1s",
                            "tense": "עָבָר",
                            "form": {"nikkud": "שָׁמַרְתִּי", "ktiv_male": "שמרתי"},
                        },
                    ],
                    "infinitive": {"nikkud": "לִשְׁמֹר", "ktiv_male": "לשמור"},
                    "reference_form": {"nikkud": "שָׁמַר", "ktiv_male": "שמר"},
                },
            }
        }
        nmap = build_nikkud_map(words)
        assert nmap["שָׁמַרְתִּי"] == "שמרתי"
        assert nmap["לִשְׁמֹר"] == "לשמור"

    def test_maps_noun_inflections(self):
        """Plural and pronominal-suffix noun forms are mapped."""
        words = {
            "אָב": {
                "word": {"nikkud": "אָב", "ktiv_male": "אב"},
                "noun_inflection": {
                    "singular": {"nikkud": "אָב", "ktiv_male": "אב"},
                    "plural": {"nikkud": "אָבוֹת", "ktiv_male": "אבות"},
                    "pronominal_suffixes": {"1s": {"nikkud": "אָבִי", "ktiv_male": "אבי"}},
                },
            }
        }
        nmap = build_nikkud_map(words)
        assert nmap["אָבוֹת"] == "אבות"
        assert nmap["אָבִי"] == "אבי"

    def test_maps_adjective_inflections(self):
        """Gender/number adjective forms are mapped."""
        words = {
            "גָּדוֹל": {
                "word": {"nikkud": "גָּדוֹל", "ktiv_male": "גדול"},
                "adjective_inflection": {
                    "ms": {"nikkud": "גָּדוֹל", "ktiv_male": "גדול"},
                    "fs": {"nikkud": "גְּדוֹלָה", "ktiv_male": "גדולה"},
                    "mp": {"nikkud": "גְּדוֹלִים", "ktiv_male": "גדולים"},
                    "fp": {"nikkud": "גְּדוֹלוֹת", "ktiv_male": "גדולות"},
                },
            }
        }
        nmap = build_nikkud_map(words)
        assert nmap["גְּדוֹלָה"] == "גדולה"
        assert nmap["גְּדוֹלִים"] == "גדולים"

    def test_construct_forms_strip_maqaf(self):
        """A construct form ending in maqaf is reachable with and without it.

        The headword deliberately differs from the construct form: if the
        headword were itself the maqaf-less spelling, the second assertion
        would pass even when no maqaf stripping takes place.
        """
        words = {
            "בַּיִת": {
                "word": {"nikkud": "בַּיִת", "ktiv_male": "בית"},
                "noun_inflection": {
                    "construct_singular": {"nikkud": "בֵּית־", "ktiv_male": "בית"},
                },
            }
        }
        nmap = build_nikkud_map(words)
        assert "בֵּית־" in nmap
        # Only the construct form can supply this key, so it proves stripping.
        assert "בֵּית" in nmap

    def test_handles_missing_fields(self):
        """``None`` inflection/conjugation sections do not break the headword."""
        words = {
            "test": {
                "word": {"nikkud": "טֶסְט", "ktiv_male": "טסט"},
                "conjugation": None,
                "noun_inflection": None,
                "adjective_inflection": None,
            }
        }
        nmap = build_nikkud_map(words)
        assert nmap["טֶסְט"] == "טסט"

    def test_real_words_json_coverage(self):
        """The real ``data/words.json`` yields more than 90,000 mapped forms.

        Skipped when the data file is not present next to the test tree.
        """
        words_path = Path(__file__).parent.parent / "data" / "words.json"
        if not words_path.exists():
            pytest.skip("words.json not available")
        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)
        nmap = build_nikkud_map(words)
        assert len(nmap) > 90_000
|
||
|
||
|
||
class TestResolveTokenFrequency:
    """Tests for ``_resolve_token_frequency`` using the real project data."""

    @pytest.fixture(scope="class")
    def freq_setup(self):
        """Return ``(nikkud_map, nikkud_index, freq_data)`` for the tests.

        Class-scoped so that ``words.json`` is parsed and both lookup
        structures are built once per class rather than once per test; the
        tests only read from them. The whole class is skipped when
        ``data/words.json`` is missing.
        """
        words_path = Path(__file__).parent.parent / "data" / "words.json"
        # Decide whether to skip before doing any of the expensive loading.
        if not words_path.exists():
            pytest.skip("words.json not available")

        frequency_lookup.load()
        freq_data = frequency_lookup.get_freq_data()
        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)

        # Imported here rather than at module level, so collecting this
        # module does not require importing epub_examples.
        from epub_examples import _build_nikkud_index

        nikkud_map = build_nikkud_map(words)
        nikkud_index = _build_nikkud_index(words)
        return nikkud_map, nikkud_index, freq_data

    def test_tier1_known_mapping(self, freq_setup):
        """A vocalized token is resolved to a rank better than the default."""
        nikkud_map, nikkud_index, freq_data = freq_setup
        rank = _resolve_token_frequency("אָב", nikkud_map, nikkud_index, freq_data)
        assert rank is not None
        assert rank < DEFAULT_RANK

    def test_tier3_academy_converter(self, freq_setup):
        """A very common word resolves to a rank below 1000."""
        nikkud_map, nikkud_index, freq_data = freq_setup
        rank = _resolve_token_frequency("שָׁלוֹם", nikkud_map, nikkud_index, freq_data)
        assert rank is not None
        assert rank < 1000

    def test_unknown_token_returns_default(self, freq_setup):
        """A token that cannot be resolved falls back to ``DEFAULT_RANK``."""
        nikkud_map, nikkud_index, freq_data = freq_setup
        rank = _resolve_token_frequency("קְסַנְתּוֹפּוּלוֹס", nikkud_map, nikkud_index, freq_data)
        assert rank == DEFAULT_RANK

    def test_tier5_ktiv_male_prefix_strip(self, freq_setup):
        """The frequency data contains the unvocalized word used by tier 5.

        NOTE(review): this only checks a precondition on ``freq_data``; it
        never calls ``_resolve_token_frequency``, so the prefix-stripping
        path named in the test is not exercised yet.
        """
        _, _, freq_data = freq_setup
        assert freq_data.get("שלום") is not None
|
||
|
||
|
||
class TestScoreSentence:
    """Tests for ``score_sentence`` using the real project data."""

    @pytest.fixture(scope="class")
    def scoring_setup(self):
        """Return ``(nikkud_map, nikkud_index, freq_data)`` for the tests.

        Class-scoped so that ``words.json`` is parsed and both lookup
        structures are built once per class rather than once per test; the
        tests only read from them. The whole class is skipped when
        ``data/words.json`` is missing.
        """
        words_path = Path(__file__).parent.parent / "data" / "words.json"
        # Decide whether to skip before doing any of the expensive loading.
        if not words_path.exists():
            pytest.skip("words.json not available")

        frequency_lookup.load()
        freq_data = frequency_lookup.get_freq_data()
        with open(words_path, encoding="utf-8") as f:
            words = json.load(f)

        # Imported here rather than at module level, so collecting this
        # module does not require importing epub_examples.
        from epub_examples import _build_nikkud_index

        nikkud_map = build_nikkud_map(words)
        nikkud_index = _build_nikkud_index(words)
        return nikkud_map, nikkud_index, freq_data

    @staticmethod
    def _span(text, target):
        """Return ``(start, end)`` offsets of the first *target* in *text*.

        Raises ``ValueError`` (from ``str.index``) if *target* is absent.
        """
        start = text.index(target)
        return start, start + len(target)

    def test_returns_integer(self, scoring_setup):
        """The score of an ordinary sentence is an ``int``."""
        nmap, nidx, freq = scoring_setup
        text = "הוּא הָלַךְ הַבַּיְתָה"
        start, end = self._span(text, "הָלַךְ")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_easy_sentence_scores_lower(self, scoring_setup):
        """A sentence of common words scores below one of rare words."""
        nmap, nidx, freq = scoring_setup
        easy = "הוּא אָמַר שָׁלוֹם"
        easy_start, easy_end = self._span(easy, "אָמַר")
        hard = "הַפַּרְדֵּס נִשְׁתַּטֵּחַ בַּדַּהֲרָה"
        hard_start, hard_end = self._span(hard, "נִשְׁתַּטֵּחַ")
        easy_score = score_sentence(easy, easy_start, easy_end, nmap, nidx, freq)
        hard_score = score_sentence(hard, hard_start, hard_end, nmap, nidx, freq)
        assert easy_score < hard_score

    def test_single_context_token(self, scoring_setup):
        """Scoring works when only one token lies outside the target span."""
        nmap, nidx, freq = scoring_setup
        text = "הוּא טוֹב"
        # The target is the first word, so the span starts at offset 0.
        start, end = self._span(text, "הוּא")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_handles_punctuation(self, scoring_setup):
        """Quotes and an exclamation mark in the text do not break scoring."""
        nmap, nidx, freq = scoring_setup
        text = '"הוּא טוֹב!"'
        start, end = self._span(text, "טוֹב")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_splits_on_maqaf(self, scoring_setup):
        """A maqaf-joined compound in the context still yields an ``int``."""
        nmap, nidx, freq = scoring_setup
        text = "בֵּית־סֵפֶר גָּדוֹל"
        start, end = self._span(text, "גָּדוֹל")
        score = score_sentence(text, start, end, nmap, nidx, freq)
        assert isinstance(score, int)

    def test_no_context_tokens_returns_default(self, scoring_setup):
        """With no usable context tokens the score is ``DEFAULT_RANK``."""
        nmap, nidx, freq = scoring_setup
        text = "א ב"
        score = score_sentence(text, 0, 1, nmap, nidx, freq)
        assert score == DEFAULT_RANK
|