Add Academy-rules-based nikkud→ktiv male converter (91.6% accuracy vs 77.2% for strip_nikkud) and v0.20 adaptive sentence difficulty cloze design spec. The converter enables frequency-based sentence scoring by properly resolving nikkud tokens to their ktiv male forms for frequency corpus lookup. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
185 lines
6.3 KiB
Python
185 lines
6.3 KiB
Python
"""Convert nikkud (vocalized) Hebrew to ktiv male (plene spelling).
|
||
|
||
Implements Hebrew Academy rules for matres lectionis insertion:
|
||
- Rule A: U vowel (kubutz) → always insert vav
|
||
- Rule B: O vowel (holam on non-vav) → insert vav
|
||
- Rule C: I vowel (hiriq) → insert yod (conditionally)
|
||
- Rule D: E vowel (tsere) → insert yod (limited cases)
|
||
- Rule E/F: Consonantal vav/yod doubling (listed for reference; not currently applied by convert())
|
||
|
||
Reference: https://hebrew-academy.org.il/topic/hahlatot/missingvocalizationspelling/
|
||
"""
|
||
|
||
import unicodedata
|
||
|
||
# Hebrew combining marks (nikkud), listed in code-point order.

# Vowel points
SHVA = "\u05b0"
HATAF_SEGOL = "\u05b1"
HATAF_PATAH = "\u05b2"
HATAF_KAMATZ = "\u05b3"
HIRIQ = "\u05b4"
TSERE = "\u05b5"
SEGOL = "\u05b6"
PATAH = "\u05b7"
KAMATZ = "\u05b8"
HOLAM = "\u05b9"
HOLAM_HASER = "\u05ba"
KUBUTZ = "\u05bb"

# Non-vowel points
DAGESH = "\u05bc"
METEG = "\u05bd"
RAFE = "\u05bf"
SHIN_DOT = "\u05c1"
SIN_DOT = "\u05c2"

# Base characters the converter inserts or recognises
VAV = "ו"
YOD = "י"
MAQAF = "־"

# Every mark that counts as "the vowel" of a letter.
VOWELS = {
    SHVA,
    HATAF_SEGOL,
    HATAF_PATAH,
    HATAF_KAMATZ,
    HIRIQ,
    TSERE,
    SEGOL,
    PATAH,
    KAMATZ,
    HOLAM,
    HOLAM_HASER,
    KUBUTZ,
}

# All pointing marks named in this module: the vowels plus the non-vowel points.
NIKKUD_MARKS = {*VOWELS, DAGESH, METEG, RAFE, SHIN_DOT, SIN_DOT}
|
||
|
||
|
||
def _parse_segments(text: str) -> list[tuple[str, list[str]]]:
    """Split *text* into ``(base_character, combining_marks)`` pairs.

    Every character whose Unicode category is ``Mn`` (non-spacing mark) is
    attached to the closest preceding base character. Marks that appear
    before any base character have nothing to attach to and are discarded.
    """
    parsed: list[tuple[str, list[str]]] = []

    for symbol in text:
        is_combining = unicodedata.category(symbol) == "Mn"
        if not is_combining:
            # A new base character opens a new segment with its own mark list.
            parsed.append((symbol, []))
        elif parsed:
            parsed[-1][1].append(symbol)

    return parsed
|
||
|
||
|
||
def _get_vowel(marks: list[str]) -> str | None:
    """Return the first mark in *marks* that is a vowel point, or ``None``.

    "Vowel point" means membership in ``VOWELS``; other marks are skipped.
    """
    return next((mark for mark in marks if mark in VOWELS), None)
|
||
|
||
|
||
def _has_dagesh(marks: list[str]) -> bool:
    """Tell whether a dagesh point is among the given combining marks."""
    return any(mark == DAGESH for mark in marks)
|
||
|
||
|
||
def _is_hebrew_letter(ch: str) -> bool:
    """Return True when *ch* is a single Hebrew letter (U+05D0 alef .. U+05EA tav).

    The range includes the final letter forms. Points, maqaf and other
    Hebrew-block punctuation fall outside it.

    The length check matters because the range test is a lexicographic string
    comparison: without it a longer string that merely starts with a Hebrew
    letter (e.g. "אב") would be accepted, and so would be mistaken for a letter.
    """
    return len(ch) == 1 and "\u05d0" <= ch <= "\u05ea"
|
||
|
||
|
||
# Letters that trigger yod insertion after a word-initial tsere (Rule D).
_GUTTURALS_AND_RESH = frozenset("אהחער")


def _letters_before_in_word(segments: list[tuple[str, list[str]]], index: int) -> int:
    """Count the Hebrew letters between the start of the current word and *index*."""
    count = 0
    j = index - 1
    while j >= 0 and _is_hebrew_letter(segments[j][0]):
        count += 1
        j -= 1
    return count


def _letters_after_in_word(segments: list[tuple[str, list[str]]], index: int) -> int:
    """Count the Hebrew letters between *index* and the end of the current word."""
    count = 0
    j = index + 1
    while j < len(segments) and _is_hebrew_letter(segments[j][0]):
        count += 1
        j += 1
    return count


def _is_bare_vav(segment: tuple[str, list[str]] | None) -> bool:
    """Return True for a vav that carries neither a vowel point nor a dagesh."""
    if segment is None:
        return False
    ch, marks = segment
    return ch == VAV and _get_vowel(marks) is None and not _has_dagesh(marks)


def _hiriq_takes_yod(segments: list[tuple[str, list[str]]], index: int) -> bool:
    """Decide whether the hiriq on ``segments[index]`` gets a yod (Rule C)."""
    if index + 1 >= len(segments):
        return True

    next_ch, next_marks = segments[index + 1]
    next_vowel = _get_vowel(next_marks)

    if next_vowel == SHVA:
        # A plain shva on the next consonant is treated as closing the
        # syllable (no yod); shva together with dagesh still gets the yod.
        return _has_dagesh(next_marks)

    if next_vowel is None and _is_hebrew_letter(next_ch):
        # Unvocalized next letter with at most two letters left in the word:
        # treated as a short closed ending (e.g. suffixes such as תי, נו).
        return _letters_after_in_word(segments, index) > 2

    return True


def convert(text: str) -> str:
    """Convert nikkud (pointed) Hebrew text to ktiv male.

    All combining marks are dropped and a vav or yod is inserted after a
    letter according to the vowel it carried:

    - kubutz: always add vav.
    - holam / holam haser on a non-vav letter: add vav, unless the next
      character is alef or is already an unpointed vav.
    - hiriq on a non-yod letter: add yod, unless the next character is yod,
      the next letter has a shva without dagesh, or the next letter is
      unvocalized and at most two letters remain in the word.
    - tsere: add yod only when the next character is one of אהחער and at
      most one Hebrew letter precedes it in the word.

    Word positions are measured inside the current word, where a word is a
    maximal run of Hebrew letters. Multi-word input is therefore converted
    the same way as each word on its own. Consonantal vav/yod are not
    doubled by this function.

    Args:
        text: Pointed Hebrew text; may contain non-Hebrew characters.

    Returns:
        The text with every combining mark removed and matres lectionis
        inserted. Non-Hebrew base characters are copied through unchanged.
    """
    segments = _parse_segments(text)
    total = len(segments)
    result: list[str] = []

    for i, (ch, marks) in enumerate(segments):
        # The base character is always kept; its marks are never emitted.
        result.append(ch)
        if not _is_hebrew_letter(ch):
            continue

        vowel = _get_vowel(marks)
        following = segments[i + 1] if i + 1 < total else None
        next_ch = following[0] if following is not None else None

        # --- Rule A: kubutz -> always add vav ---
        if vowel == KUBUTZ:
            result.append(VAV)
            continue

        # --- Shuruk: vav with dagesh and no vowel is itself the mater ---
        if ch == VAV and vowel is None and _has_dagesh(marks):
            continue

        # --- Rule B: holam -> add vav ---
        if vowel in (HOLAM, HOLAM_HASER):
            # No vav when the letter is vav itself (holam male), before alef
            # (e.g. תֹּאבַד -> תאבד), or when an unpointed vav already follows;
            # inserting one there would double the mater.
            if ch != VAV and next_ch != "א" and not _is_bare_vav(following):
                result.append(VAV)
            continue

        # --- Rule C: hiriq -> conditionally add yod ---
        if vowel == HIRIQ:
            # Never double a yod that is already written.
            if ch != YOD and next_ch != YOD and _hiriq_takes_yod(segments, i):
                result.append(YOD)
            continue

        # --- Rule D: tsere -> yod only near the start of the word ---
        # e.g. הֵחֵל -> היחל, but not mid-word: מְסַפֵּר -> מספר
        if vowel == TSERE:
            if next_ch in _GUTTURALS_AND_RESH and _letters_before_in_word(segments, i) <= 1:
                result.append(YOD)
            continue

        # Patah, kamatz, segol, shva, hataf-* and unpointed letters:
        # nothing is inserted.

    return "".join(result)
|