"""Convert nikkud (vocalized) Hebrew to ktiv male (plene spelling). Implements Hebrew Academy rules for matres lectionis insertion: - Rule A: U vowel (kubutz) → always insert vav - Rule B: O vowel (holam on non-vav) → insert vav - Rule C: I vowel (hiriq) → insert yod (conditionally) - Rule D: E vowel (tsere) → insert yod (limited cases) - Rule E/F: Consonantal vav/yod doubling Reference: https://hebrew-academy.org.il/topic/hahlatot/missingvocalizationspelling/ """ import unicodedata # Hebrew nikkud code points SHVA = "\u05b0" HATAF_SEGOL = "\u05b1" HATAF_PATAH = "\u05b2" HATAF_KAMATZ = "\u05b3" HIRIQ = "\u05b4" TSERE = "\u05b5" SEGOL = "\u05b6" PATAH = "\u05b7" KAMATZ = "\u05b8" HOLAM = "\u05b9" HOLAM_HASER = "\u05ba" KUBUTZ = "\u05bb" DAGESH = "\u05bc" METEG = "\u05bd" RAFE = "\u05bf" SHIN_DOT = "\u05c1" SIN_DOT = "\u05c2" VAV = "ו" YOD = "י" MAQAF = "־" VOWELS = {SHVA, HATAF_SEGOL, HATAF_PATAH, HATAF_KAMATZ, HIRIQ, TSERE, SEGOL, PATAH, KAMATZ, HOLAM, HOLAM_HASER, KUBUTZ} NIKKUD_MARKS = VOWELS | {DAGESH, METEG, RAFE, SHIN_DOT, SIN_DOT} def _parse_segments(text: str) -> list[tuple[str, list[str]]]: """Parse nikkud text into (character, [marks]) segments.""" segments: list[tuple[str, list[str]]] = [] cur_char: str | None = None cur_marks: list[str] = [] for ch in text: if unicodedata.category(ch) == "Mn": cur_marks.append(ch) else: if cur_char is not None: segments.append((cur_char, cur_marks)) cur_char = ch cur_marks = [] if cur_char is not None: segments.append((cur_char, cur_marks)) return segments def _get_vowel(marks: list[str]) -> str | None: """Extract the vowel mark from a list of combining marks.""" for m in marks: if m in VOWELS: return m return None def _has_dagesh(marks: list[str]) -> bool: return DAGESH in marks def _is_hebrew_letter(ch: str) -> bool: return "\u05d0" <= ch <= "\u05ea" def convert(text: str) -> str: """Convert nikkud Hebrew text to ktiv male. Strips all nikkud marks and inserts matres lectionis (vav/yod) according to Hebrew Academy spelling rules. """ segments = _parse_segments(text) result: list[str] = [] for i, (ch, marks) in enumerate(segments): if not _is_hebrew_letter(ch): # Non-Hebrew character: output as-is (no marks) result.append(ch) continue vowel = _get_vowel(marks) has_dag = _has_dagesh(marks) # Output the base letter (strip all nikkud marks) result.append(ch) # --- Rule A: U vowel (kubutz) → always add vav --- if vowel == KUBUTZ: result.append(VAV) continue # --- Shuruk detection --- # Vav with dagesh and no other vowel = shuruk (already a mater) # Vav with dagesh AND a vowel = consonantal vav (ב with dagesh) # If letter is vav with dagesh only → it's shuruk, already output if ch == VAV and has_dag and vowel is None: # Shuruk: vav IS the mater lectionis, already output continue # --- Rule B: O vowel (holam) → add vav --- if vowel in (HOLAM, HOLAM_HASER): if ch != VAV: # Exception: holam before aleph (pe-aleph verbs) — no vav # e.g., תֹּאבַד→תאבד, יֹאבַד→יאבד, נֹאבַד→נאבד next_is_aleph = i + 1 < len(segments) and segments[i + 1][0] == "א" if not next_is_aleph: result.append(VAV) # If ch IS vav (holam male), vav already output continue # --- Rule C: I vowel (hiriq) → conditionally add yod --- if vowel == HIRIQ: if ch == YOD: # Yod already present, don't double continue # Don't insert yod if next letter is already yod if i + 1 < len(segments) and segments[i + 1][0] == YOD: continue # Rule C Section 3: Don't add yod if the NEXT consonant # has shva (indicating shva nach on that consonant) add_yod = True if i + 1 < len(segments): next_ch, next_marks = segments[i + 1] next_vowel = _get_vowel(next_marks) # Shva on next consonant = shva nach → don't add yod # UNLESS next consonant also has dagesh (= shva na / doubled) next_has_dagesh = _has_dagesh(next_marks) if next_vowel == SHVA and not next_has_dagesh: add_yod = False # No vowel on next consonant (word-final) = closed syllable # → don't add yod (e.g., suffix -תי -נו -תם) elif next_vowel is None and _is_hebrew_letter(next_ch): # Check if this is truly word-final or next-to-last remaining_letters = sum(1 for j in range(i + 1, len(segments)) if _is_hebrew_letter(segments[j][0])) if remaining_letters <= 2: # Short suffix like תי, נו — don't add yod add_yod = False if add_yod: result.append(YOD) continue # --- Rule D: E vowel (tsere/segol) → generally NO yod --- # Exception (b): tsere before guttural/resh gets yod ONLY # in word-initial position (dagesh substitution in Hif'il/noun patterns) # e.g., הֵחֵל→היחל, תֵּאָבֵד→תיאבד, הֵרִיעַ→היריע # but NOT mid-word: מְסַפֵּר→מספר, מְעַבֵּר→מעבר if vowel == TSERE: add_yod = False if i + 1 < len(segments): next_ch = segments[i + 1][0] if next_ch in "אהחער": # Only at word-initial (pos 0) or after prefix (pos 1) # where dagesh substitution applies hebrew_pos = sum(1 for j in range(i) if _is_hebrew_letter(segments[j][0])) if hebrew_pos <= 1: add_yod = True if add_yod: result.append(YOD) continue # All other vowels (patah, kamatz, segol, shva, hataf-*): # No mater lectionis insertion needed return "".join(result)