Merge pull request #1060 from PyThaiNLP/add-spelling

wannaphong · web-flow · commit ef73fdc57d21 · 2025-01-14T19:05:56.000+07:00
Add pythainlp.util.spelling
diff --git a/docs/api/util.rst b/docs/api/util.rst
@@ -258,6 +258,15 @@ Modules
 
     The `words_to_num` function is a numeral conversion utility that translates Thai word numerals into numerical form. It is important for numerical data extraction and computation.
 
+.. autofunction:: spelling
+    :noindex:
+
+    The `spelling` function is a text processing tool that spells out a Thai word into its consonants, vowel, and tone mark.
+
+.. autofunction:: thai_consonant_to_spelling
+
+.. autofunction:: tone_to_spelling
+
 .. autofunction:: pythainlp.util.spell_words.spell_syllable
     :noindex:
 
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py
@@ -44,13 +44,15 @@
     "reorder_vowels",
     "rhyme",
     "sound_syllable",
+    "spelling",
     "spell_words",
     "syllable_length",
     "syllable_open_close_detector",
     "text_to_arabic_digit",
     "text_to_num",
     "text_to_thai_digit",
     "th_zodiac",
+    "thai_consonant_to_spelling",
     "thai_digit_to_arabic_digit",
     "thai_keyboard_dist",
     "thai_strptime",
@@ -65,6 +67,7 @@
     "to_idna",
     "to_lunar_date",
     "tone_detector",
+    "tone_to_spelling",
     "words_to_num",
 ]
 
@@ -134,4 +137,9 @@
     syllable_open_close_detector,
     tone_detector,
 )
-from pythainlp.util.pronounce import rhyme
+from pythainlp.util.pronounce import (
+    rhyme,
+    spelling,
+    tone_to_spelling,
+    thai_consonant_to_spelling,
+)
diff --git a/pythainlp/util/pronounce.py b/pythainlp/util/pronounce.py
@@ -3,10 +3,14 @@
 # SPDX-FileType: SOURCE
 # SPDX-License-Identifier: Apache-2.0
 from typing import List
+import re
 
 from pythainlp.corpus import thai_words
 from pythainlp.khavee import KhaveeVerifier
 from pythainlp.tokenize import syllable_tokenize
+from pythainlp.tokenize import Tokenizer
+from pythainlp import thai_consonants, thai_tonemarks
+from pythainlp.util import remove_tonemark
 
 kv = KhaveeVerifier()
 all_thai_words_dict = None
@@ -30,11 +34,149 @@ def rhyme(word: str) -> List[str]:
     """
     global all_thai_words_dict
     list_sumpus = []
-    if all_thai_words_dict == None:
+    if all_thai_words_dict is None:
         all_thai_words_dict = [
             i for i in list(thai_words()) if len(syllable_tokenize(i)) == 1
         ]
     for i in all_thai_words_dict:
         if kv.is_sumpus(word, i) and i != word:
             list_sumpus.append(i)
     return sorted(list_sumpus)
+
+
+thai_vowel = ''.join((
+    "อะ,อา,อิ,อี,อึ,อื,อุ,อู,เอะ,เอ,แอะ,แอ,เอียะ,เอีย,เอือะ,เอือ,อัวะ,อัว,โอะ,",
+    "โอ,เอาะ,ออ,เออะ,เออ,อำ,ใอ,ไอ,เอา,ฤ,ฤๅ,ฦ,ฦๅ"
+)).split(",")
+thai_vowel_all = [
+    ("([ก-ฮ])ะ", "\\1อะ"),
+    ("([ก-ฮ])า", "\\1อา"),
+    ("อิ".replace("อ", "([ก-ฮ])"), "อิ".replace("อ", "\\1อ")),
+    ("อี".replace("อ", "([ก-ฮ])"), "อี".replace("อ", "\\1อ")),
+    ("อึ".replace("อ", "([ก-ฮ])", 1), "อึ".replace("อ", "\\1อ", 1)),
+    ("อื".replace("อ", "([ก-ฮ])", 1), "อื".replace("อ", "\\1อ", 1)),
+    ("อุ".replace("อ", "([ก-ฮ])", 1), "อุ".replace("อ", "\\1อ", 1)),
+    ("อู".replace("อ", "([ก-ฮ])", 1), "อู".replace("อ", "\\1อ", 1)),
+    ("เอะ".replace("อ", "([ก-ฮ])", 1), "\\1เอะ"),
+    ("เอ".replace("อ", "([ก-ฮ])", 1), "\\1เอ"),
+    ("แอะ".replace("อ", "([ก-ฮ])", 1), "\\1แอะ"),
+    ("แอ".replace("อ", "([ก-ฮ])", 1), "\\1แอ"),
+    ("เอียะ".replace("อ", "([ก-ฮ])", 1), "\\1เอียะ"),
+    ("เอีย".replace("อ", "([ก-ฮ])", 1), "\\1เอีย"),
+    ("เอือะ".replace("อ", "([ก-ฮ])", 1), "\\1เอือะ"),
+    ("เอือ".replace("อ", "([ก-ฮ])", 1), "\\1เอือ"),
+    ("อัวะ".replace("อ", "([ก-ฮ])", 1), "\\1อัวะ"),
+    ("อัว".replace("อ", "([ก-ฮ])", 1), "\\1อัว"),
+    ("โอะ".replace("อ", "([ก-ฮ])", 1), "\\1โอะ"),
+    ("โอ".replace("อ", "([ก-ฮ])", 1), "\\1โอ"),
+    ("เอาะ".replace("อ", "([ก-ฮ])", 1), "\\1เอาะ"),
+    ("ออ".replace("อ", "([ก-ฮ])", 1), "\\1ออ"),
+    ("เออะ".replace("อ", "([ก-ฮ])", 1), "\\1เออะ"),
+    ("เออ".replace("อ", "([ก-ฮ])", 1), "\\1เออ"),
+    ("อำ".replace("อ", "([ก-ฮ])", 1), "\\1อำ"),
+    ("ใอ".replace("อ", "([ก-ฮ])", 1), "\\1ใอ"),
+    ("ไอ".replace("อ", "([ก-ฮ])", 1), "\\1ไอ"),
+    ("เอา".replace("อ", "([ก-ฮ])", 1), "\\1เอา"),
+    ("อั".replace("อ", "([ก-ฮ])", 1), "\\1อะ"),
+]
+thai_vowel_all.sort(key=lambda t: len(t[0]), reverse=True)
+
+
+def thai_consonant_to_spelling(c: str) -> str:
+    """
+    Thai consonants to spelling
+
+    :param str c: A Thai consonant
+    :return: spelling
+    :rtype: str
+
+    :Example:
+    ::
+
+        from pythainlp.util import thai_consonant_to_spelling
+
+        print(thai_consonant_to_spelling("ก"))
+        # output: กอ
+    """
+    if len(c) == 1 and c in thai_consonants:
+        return c + "อ"
+    return c
+
+
+def tone_to_spelling(t: str) -> str:
+    """
+    Thai tonemarks to spelling
+
+    :param str t: A Thai tone mark
+    :return: spelling
+    :rtype: str
+
+    :Example:
+    ::
+
+        from pythainlp.util import tone_to_spelling
+
+        print(tone_to_spelling("่")) # ไม้เอก
+        # output: ไม้เอก
+    """
+    if t == "่":
+        return "ไม้เอก"
+    elif t == "้":
+        return "ไม้โท"
+    elif t == "๊":
+        return "ไม้ตรี"
+    elif t == "๋":
+        return "ไม้จัตวา"
+    return t
+
+
+def spelling(word: str) -> List[str]:
+    """
+    Thai word to spelling
+
+    This function supports Thai root words only.
+
+    :param str word: A Thai word
+    :return: spelling
+    :rtype: List[str]
+
+    :Example:
+    ::
+
+        from pythainlp.util import spelling
+
+        print(spelling("เรียน"))
+        # output: ['รอ', 'เอีย', 'นอ', 'เรียน']
+
+        print(spelling("เฝ้า"))
+        # output: ['ฝอ', 'เอา', 'เฝา', 'ไม้โท', 'เฝ้า']
+    """
+    if not word or not isinstance(word, str):
+        return []
+    thai_vowel_tokenizer = Tokenizer(
+        custom_dict=thai_vowel + list(thai_consonants),
+        engine="longest"
+    )
+    word_pre = remove_tonemark(word).replace("็", "")
+    tone = [tone_to_spelling(i) for i in word if i in thai_tonemarks]
+    word_output = word_pre
+    for i, j in thai_vowel_all:
+        if len(re.findall(i, word_pre, re.U)) > 0:
+            if "็" in word and i == "เ([ก-ฮ])":
+                word_output = re.sub(i, "\\1เอะ", word_pre)
+            else:
+                word_output = re.sub(i, j, word_pre)
+            break
+    list_word_output = thai_vowel_tokenizer.word_tokenize(word_output)
+    output = [
+        i for i in [thai_consonant_to_spelling(i) for i in list_word_output]
+        if '์' not in i
+    ]
+    if word_pre == word:
+        return output + [word]
+    elif tone != []:
+        return output + [word_pre, tone[0], word]
+    elif "็" in word:
+        return output + [word]
+    else:
+        return output + [word_pre, word]
diff --git a/tests/core/test_util.py b/tests/core/test_util.py
@@ -66,6 +66,7 @@
     to_lunar_date,
     tone_detector,
     words_to_num,
+    spelling,
 )
 from pythainlp.util.morse import morse_decode, morse_encode
 
@@ -844,6 +845,18 @@ def test_th_zodiac(self):
     # def test_abbreviation_to_full_text(self):
     #     self.assertIsInstance(abbreviation_to_full_text("รร.ของเราน่าอยู่", list))
 
+    def test_spelling(self):
+        self.assertEqual(spelling([]), [])
+        self.assertEqual(spelling("เรียน"), ['รอ', 'เอีย', 'นอ', 'เรียน'])
+        self.assertEqual(
+            spelling("เฝ้า"), ['ฝอ', 'เอา', 'เฝา', 'ไม้โท', 'เฝ้า']
+        )
+        self.assertEqual(spelling("คน"), ['คอ', 'นอ', 'คน'])
+        self.assertEqual(spelling("กัน"), ['กอ', 'อะ', 'นอ', 'กัน'])
+        self.assertEqual(
+            spelling("กั้น"), ['กอ', 'อะ', 'นอ', 'กัน', 'ไม้โท', 'กั้น']
+        )
+
     def test_longest_common_subsequence(self):
         self.assertEqual(longest_common_subsequence("ABCBDAB", "BDCAB"), "BDAB")
         self.assertEqual(longest_common_subsequence("AGGTAB", "GXTXAYB"), "GTAB")