hebrew_flash_cards/nikkud_to_ktiv_male.py
Sochen b3ea086e85 v0.20 design spec + nikkud-to-ktiv-male converter
Add Academy-rules-based nikkud→ktiv male converter (91.6% accuracy
vs 77.2% for strip_nikkud) and v0.20 adaptive sentence difficulty
cloze design spec. The converter enables frequency-based sentence
scoring by properly resolving nikkud tokens to their ktiv male forms
for frequency corpus lookup.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-15 12:57:14 +00:00

185 lines
6.3 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

"""Convert nikkud (vocalized) Hebrew to ktiv male (plene spelling).
Implements Hebrew Academy rules for matres lectionis insertion:
- Rule A: U vowel (kubutz) → always insert vav
- Rule B: O vowel (holam on non-vav) → insert vav
- Rule C: I vowel (hiriq) → insert yod (conditionally)
- Rule D: E vowel (tsere) → insert yod (limited cases)
- Rule E/F: Consonantal vav/yod doubling (not yet implemented by convert())
Reference: https://hebrew-academy.org.il/topic/hahlatot/missingvocalizationspelling/
"""
import unicodedata
# ---------------------------------------------------------------------------
# Unicode code points used by the converter (Hebrew block, U+0590-U+05FF).
# ---------------------------------------------------------------------------

# Vowel points (shva and the hataf vowels are included on purpose).
SHVA = "\u05B0"
HATAF_SEGOL = "\u05B1"
HATAF_PATAH = "\u05B2"
HATAF_KAMATZ = "\u05B3"
HIRIQ = "\u05B4"
TSERE = "\u05B5"
SEGOL = "\u05B6"
PATAH = "\u05B7"
KAMATZ = "\u05B8"
HOLAM = "\u05B9"
HOLAM_HASER = "\u05BA"
KUBUTZ = "\u05BB"

# Non-vowel diacritics.
DAGESH = "\u05BC"
METEG = "\u05BD"
RAFE = "\u05BF"
SHIN_DOT = "\u05C1"
SIN_DOT = "\u05C2"

# Base letters that double as matres lectionis, plus the maqaf word joiner.
VAV = "\u05D5"
YOD = "\u05D9"
MAQAF = "\u05BE"

# Every mark that counts as "the vowel" of a consonant.
VOWELS = {
    SHVA,
    HATAF_SEGOL,
    HATAF_PATAH,
    HATAF_KAMATZ,
    HIRIQ,
    TSERE,
    SEGOL,
    PATAH,
    KAMATZ,
    HOLAM,
    HOLAM_HASER,
    KUBUTZ,
}

# All nikkud marks known to this module: the vowels plus the other diacritics.
NIKKUD_MARKS = VOWELS.union((DAGESH, METEG, RAFE, SHIN_DOT, SIN_DOT))
def _parse_segments(text: str) -> list[tuple[str, list[str]]]:
    """Split text into (base character, [combining marks]) pairs.

    Every character whose Unicode category is not "Mn" starts a new pair;
    each "Mn" character is attached to the most recent base character.
    Marks that appear before any base character are discarded.
    """
    parsed: list[tuple[str, list[str]]] = []
    for symbol in text:
        if unicodedata.category(symbol) != "Mn":
            # A base character opens a fresh segment with no marks yet.
            parsed.append((symbol, []))
        elif parsed:
            # Non-spacing mark: belongs to the segment opened last.
            parsed[-1][1].append(symbol)
        # A mark with no preceding base character has nothing to attach to
        # and is dropped.
    return parsed
def _get_vowel(marks: list[str]) -> str | None:
    """Return the first mark in ``marks`` that is in VOWELS, or None."""
    return next((mark for mark in marks if mark in VOWELS), None)
def _has_dagesh(marks: list[str]) -> bool:
    """Tell whether the DAGESH point is among the given combining marks."""
    return any(mark == DAGESH for mark in marks)
def _is_hebrew_letter(ch: str) -> bool:
    """Tell whether ``ch`` sorts within U+05D0 (alef) .. U+05EA (tav)."""
    first_letter = "\u05d0"
    last_letter = "\u05ea"
    # Same string comparison as a chained range test, written as its negation.
    return not (ch < first_letter or ch > last_letter)
def _word_letter_offsets(
    segments: list[tuple[str, list[str]]],
) -> tuple[list[int], list[int]]:
    """Count, per segment, the Hebrew letters before/after it in its word.

    A "word" here is a maximal run of consecutive segments whose base
    character is a Hebrew letter; any other character (space, maqaf,
    punctuation, digit, ...) ends the run.

    Returns:
        ``(before, after)``: two lists parallel to ``segments``.
        ``before[i]`` is the number of letters preceding segment ``i`` inside
        its word and ``after[i]`` the number following it.  Entries for
        non-letter segments are 0.
    """
    letter_flags = [_is_hebrew_letter(base) for base, _ in segments]
    before = [0] * len(segments)
    after = [0] * len(segments)

    # Left-to-right pass: position of each letter inside its word.
    run = 0
    for idx, is_letter in enumerate(letter_flags):
        if is_letter:
            before[idx] = run
            run += 1
        else:
            run = 0

    # Right-to-left pass: letters remaining after each letter in its word.
    run = 0
    for idx in range(len(segments) - 1, -1, -1):
        if letter_flags[idx]:
            after[idx] = run
            run += 1
        else:
            run = 0

    return before, after


def convert(text: str) -> str:
    """Convert nikkud Hebrew text to ktiv male.

    Strips all nikkud marks and inserts matres lectionis (vav/yod)
    according to Hebrew Academy spelling rules A-D.  Non-Hebrew characters
    are copied through without their combining marks.

    Position-dependent rules (the short-suffix exception of Rule C and the
    word-initial condition of Rule D) are evaluated relative to the word
    that contains the letter, where a word is an unbroken run of Hebrew
    letters.  Each word of a multi-word text is therefore converted the
    same way it would be on its own.

    NOTE: consonantal vav/yod doubling (Rules E/F) is not performed here.

    Args:
        text: Vocalized Hebrew text; may contain several words and
            non-Hebrew characters.

    Returns:
        The text without combining marks and with vav/yod inserted.
    """
    segments = _parse_segments(text)
    # Precompute word-relative positions once instead of rescanning the
    # whole text for every hiriq/tsere.
    letters_before, letters_after = _word_letter_offsets(segments)
    result: list[str] = []

    for i, (ch, marks) in enumerate(segments):
        if not _is_hebrew_letter(ch):
            # Non-Hebrew character: output as-is (no marks)
            result.append(ch)
            continue

        vowel = _get_vowel(marks)
        has_dag = _has_dagesh(marks)

        # Output the base letter (strip all nikkud marks)
        result.append(ch)

        # --- Rule A: U vowel (kubutz) -> always add vav ---
        if vowel == KUBUTZ:
            result.append(VAV)
            continue

        # --- Shuruk detection ---
        # Vav carrying a dagesh and no vowel point is a shuruk: the vav
        # itself is the mater lectionis and has already been output.
        # (A vav with dagesh AND a vowel is a consonantal, doubled vav and
        # falls through to the vowel rules below.)
        if ch == VAV and has_dag and vowel is None:
            continue

        # --- Rule B: O vowel (holam) -> add vav ---
        if vowel in (HOLAM, HOLAM_HASER):
            if ch != VAV:
                # Exception: holam before aleph (pe-aleph verbs) - no vav
                # e.g., תֹּאבַד→תאבד, יֹאבַד→יאבד, נֹאבַד→נאבד
                next_is_aleph = i + 1 < len(segments) and segments[i + 1][0] == "א"
                if not next_is_aleph:
                    result.append(VAV)
            # If ch IS vav (holam male), the vav is already in the output.
            continue

        # --- Rule C: I vowel (hiriq) -> conditionally add yod ---
        if vowel == HIRIQ:
            if ch == YOD:
                # Yod already present, don't double
                continue
            # Don't insert yod if next letter is already yod
            if i + 1 < len(segments) and segments[i + 1][0] == YOD:
                continue

            add_yod = True
            if i + 1 < len(segments):
                next_ch, next_marks = segments[i + 1]
                next_vowel = _get_vowel(next_marks)
                if next_vowel == SHVA and not _has_dagesh(next_marks):
                    # Shva on the next consonant without dagesh is treated
                    # as shva nach (closed syllable) -> no yod.  With a
                    # dagesh the consonant is doubled and the yod stays.
                    add_yod = False
                elif next_vowel is None and _is_hebrew_letter(next_ch):
                    # Unvocalized next letter close to the end of THIS word
                    # (short suffix like תי, נו) -> closed syllable, no yod.
                    if letters_after[i] <= 2:
                        add_yod = False
            if add_yod:
                result.append(YOD)
            continue

        # --- Rule D: E vowel (tsere) -> generally NO yod ---
        # Exception: tsere before a guttural/resh gets yod ONLY near the
        # start of the word (dagesh substitution in Hif'il/noun patterns),
        # e.g., הֵחֵל→היחל, תֵּאָבֵד→תיאבד, הֵרִיעַ→היריע
        # but NOT mid-word: מְסַפֵּר→מספר, מְעַבֵּר→מעבר
        if vowel == TSERE:
            add_yod = False
            if i + 1 < len(segments):
                next_ch = segments[i + 1][0]
                # Word-initial (pos 0) or right after a one-letter prefix
                # (pos 1), counted from the start of the current word.
                if next_ch in "אהחער" and letters_before[i] <= 1:
                    add_yod = True
            if add_yod:
                result.append(YOD)
            continue

        # All other vowels (patah, kamatz, segol, shva, hataf-*):
        # no mater lectionis insertion needed.

    return "".join(result)