# hebrew_flash_cards/nikkud_to_ktiv_male.py

"""Convert nikkud (vocalized) Hebrew to ktiv male (plene spelling).

Implements Hebrew Academy rules for matres lectionis insertion:
- Rule A: U vowel (kubutz) → always insert vav
- Rule B: O vowel (holam on non-vav) → insert vav
- Rule C: I vowel (hiriq) → insert yod (conditionally)
- Rule D: E vowel (tsere) → insert yod (limited cases)
- Rule E/F: Consonantal vav/yod doubling (not currently applied by convert())

Reference: https://hebrew-academy.org.il/topic/hahlatot/missingvocalizationspelling/
"""

import unicodedata

# Hebrew nikkud code points (Unicode Hebrew block, U+05B0..U+05C2).
# Vowel points:
SHVA = "\u05b0"          # U+05B0
HATAF_SEGOL = "\u05b1"   # U+05B1
HATAF_PATAH = "\u05b2"   # U+05B2
HATAF_KAMATZ = "\u05b3"  # U+05B3
HIRIQ = "\u05b4"         # U+05B4
TSERE = "\u05b5"         # U+05B5
SEGOL = "\u05b6"         # U+05B6
PATAH = "\u05b7"         # U+05B7
KAMATZ = "\u05b8"        # U+05B8
HOLAM = "\u05b9"         # U+05B9
HOLAM_HASER = "\u05ba"   # U+05BA
KUBUTZ = "\u05bb"        # U+05BB
# Non-vowel diacritics:
DAGESH = "\u05bc"        # U+05BC
METEG = "\u05bd"         # U+05BD
RAFE = "\u05bf"          # U+05BF
SHIN_DOT = "\u05c1"      # U+05C1
SIN_DOT = "\u05c2"       # U+05C2

# Letters used as matres lectionis, and the Hebrew hyphen.
VAV = "ו"
YOD = "י"
MAQAF = "־"

# Every mark that _get_vowel() treats as a vowel.
VOWELS = {
    SHVA,
    HATAF_SEGOL,
    HATAF_PATAH,
    HATAF_KAMATZ,
    HIRIQ,
    TSERE,
    SEGOL,
    PATAH,
    KAMATZ,
    HOLAM,
    HOLAM_HASER,
    KUBUTZ,
}

# Vowels plus the remaining pointing marks declared above.
NIKKUD_MARKS = VOWELS.union({DAGESH, METEG, RAFE, SHIN_DOT, SIN_DOT})


def _parse_segments(text: str) -> list[tuple[str, list[str]]]:
    """Parse nikkud text into (character, [marks]) segments."""
    segments: list[tuple[str, list[str]]] = []
    cur_char: str | None = None
    cur_marks: list[str] = []

    for ch in text:
        if unicodedata.category(ch) == "Mn":
            cur_marks.append(ch)
        else:
            if cur_char is not None:
                segments.append((cur_char, cur_marks))
            cur_char = ch
            cur_marks = []

    if cur_char is not None:
        segments.append((cur_char, cur_marks))

    return segments


def _get_vowel(marks: list[str]) -> str | None:
    """Return the first entry of *marks* that is in ``VOWELS``, or ``None``."""
    return next((mark for mark in marks if mark in VOWELS), None)


def _has_dagesh(marks: list[str]) -> bool:
    """Return True when the dagesh point is among *marks*."""
    return any(mark == DAGESH for mark in marks)


def _is_hebrew_letter(ch: str) -> bool:
    return "\u05d0" <= ch <= "\u05ea"


def _word_letter_counts(
    segments: list[tuple[str, list[str]]],
) -> tuple[list[int], list[int]]:
    """Compute each segment's letter position relative to its own word.

    A "word" is a maximal run of consecutive Hebrew-letter segments; any
    other character (space, maqaf, punctuation, ...) ends the run.

    Returns two lists parallel to *segments*:

    - ``before[k]``: Hebrew letters preceding segment ``k`` inside its word.
    - ``to_end[k]``: Hebrew letters from segment ``k`` through the end of its
      word, counting segment ``k`` itself.

    Both entries are 0 for non-Hebrew segments.
    """
    total = len(segments)
    before = [0] * total
    to_end = [0] * total

    # Forward pass: letters seen so far in the current word.
    run = 0
    for idx, (ch, _marks) in enumerate(segments):
        if _is_hebrew_letter(ch):
            before[idx] = run
            run += 1
        else:
            run = 0

    # Backward pass: letters left until the current word ends.
    run = 0
    for idx in range(total - 1, -1, -1):
        if _is_hebrew_letter(segments[idx][0]):
            run += 1
            to_end[idx] = run
        else:
            run = 0

    return before, to_end


def convert(text: str) -> str:
    """Convert nikkud Hebrew text to ktiv male.

    Strips all combining marks and inserts matres lectionis (vav/yod)
    according to the rules implemented below. Non-Hebrew characters are
    copied through without their combining marks.

    Position-dependent rules (the short-suffix check for hiriq and the
    word-initial check for tsere) are evaluated within the current word,
    i.e. the surrounding run of Hebrew letters, so a word is converted the
    same way whether it stands alone or inside a longer phrase.

    Args:
        text: Vocalized Hebrew text; may contain several words and
            non-Hebrew characters.

    Returns:
        The text without nikkud and with vav/yod inserted where the rules
        call for them.
    """
    segments = _parse_segments(text)
    # Precomputed once (O(n)) so the per-letter rules below never have to
    # rescan the text, and never look past the current word's boundaries.
    letters_before, letters_to_end = _word_letter_counts(segments)
    result: list[str] = []

    for i, (ch, marks) in enumerate(segments):
        if not _is_hebrew_letter(ch):
            # Non-Hebrew character: output as-is (no marks)
            result.append(ch)
            continue

        vowel = _get_vowel(marks)
        has_dag = _has_dagesh(marks)
        # Following segment (may be a non-Hebrew character), if any.
        nxt = segments[i + 1] if i + 1 < len(segments) else None

        # Output the base letter (strip all nikkud marks)
        result.append(ch)

        # --- Rule A: U vowel (kubutz) → always add vav ---
        if vowel == KUBUTZ:
            result.append(VAV)
            continue

        # --- Shuruk detection ---
        # Vav with dagesh and no vowel is shuruk: the vav itself is the
        # mater lectionis and has already been output.
        if ch == VAV and has_dag and vowel is None:
            continue

        # --- Rule B: O vowel (holam) → add vav ---
        if vowel in (HOLAM, HOLAM_HASER):
            # If ch IS vav (holam male), the vav has already been output.
            if ch != VAV:
                skip_vav = False
                if nxt is not None:
                    next_ch, next_marks = nxt
                    if next_ch == "א":
                        # Exception: holam before aleph (pe-aleph verbs)
                        # e.g., תֹּאבַד→תאבד, יֹאבַד→יאבד, נֹאבַד→נאבד
                        skip_vav = True
                    elif (
                        next_ch == VAV
                        and _get_vowel(next_marks) is None
                        and not _has_dagesh(next_marks)
                    ):
                        # Holam written on the consonant with an unpointed
                        # vav right after it: that vav is already the
                        # mater, so adding another would double it.
                        skip_vav = True
                if not skip_vav:
                    result.append(VAV)
            continue

        # --- Rule C: I vowel (hiriq) → conditionally add yod ---
        if vowel == HIRIQ:
            if ch == YOD:
                # Yod already present, don't double
                continue

            # Don't insert yod if next letter is already yod
            if nxt is not None and nxt[0] == YOD:
                continue

            add_yod = True

            if nxt is not None:
                next_ch, next_marks = nxt
                next_vowel = _get_vowel(next_marks)

                # Shva on next consonant = shva nach → don't add yod
                # UNLESS next consonant also has dagesh (= shva na / doubled)
                if next_vowel == SHVA and not _has_dagesh(next_marks):
                    add_yod = False
                # No vowel on the next letter = closed syllable. Skip the
                # yod only when at most two letters remain in THIS word
                # (short suffix such as תי, נו).
                elif next_vowel is None and _is_hebrew_letter(next_ch):
                    if letters_to_end[i + 1] <= 2:
                        add_yod = False

            if add_yod:
                result.append(YOD)
            continue

        # --- Rule D: E vowel (tsere) → generally NO yod ---
        # Exception: tsere before guttural/resh gets yod ONLY when the
        # letter is first or second in its word (word-initial or after a
        # one-letter prefix).
        # e.g., הֵחֵל→היחל, תֵּאָבֵד→תיאבד, הֵרִיעַ→היריע
        # but NOT mid-word: מְסַפֵּר→מספר, מְעַבֵּר→מעבר
        if vowel == TSERE:
            if nxt is not None and nxt[0] in "אהחער" and letters_before[i] <= 1:
                result.append(YOD)
            continue

        # All other vowels (patah, kamatz, segol, shva, hataf-*):
        # No mater lectionis insertion needed

    return "".join(result)