Skip to content

Commit ef73fdc

Browse files
authored
Merge pull request #1060 from PyThaiNLP/add-spelling
Add pythainlp.util.spelling
2 parents 1271452 + 343c7a1 commit ef73fdc

File tree

4 files changed

+173
-2
lines changed

4 files changed

+173
-2
lines changed

docs/api/util.rst

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -258,6 +258,14 @@ Modules
258258

259259
The `words_to_num` function is a numeral conversion utility that translates Thai word numerals into numerical form. It is important for numerical data extraction and computation.
260260

261+
.. autofunction:: spelling
262+
:noindex:
263+
The `spelling` function is a text-processing tool that spells out a Thai word, listing its consonant, vowel, and tone-mark components followed by the word itself.
264+
265+
.. autofunction:: thai_consonant_to_spelling
266+
267+
.. autofunction:: tone_to_spelling
268+
261269
.. autofunction:: pythainlp.util.spell_words.spell_syllable
262270
:noindex:
263271

pythainlp/util/__init__.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -44,13 +44,15 @@
4444
"reorder_vowels",
4545
"rhyme",
4646
"sound_syllable",
47+
"spelling",
4748
"spell_words",
4849
"syllable_length",
4950
"syllable_open_close_detector",
5051
"text_to_arabic_digit",
5152
"text_to_num",
5253
"text_to_thai_digit",
5354
"th_zodiac",
55+
"thai_consonant_to_spelling",
5456
"thai_digit_to_arabic_digit",
5557
"thai_keyboard_dist",
5658
"thai_strptime",
@@ -65,6 +67,7 @@
6567
"to_idna",
6668
"to_lunar_date",
6769
"tone_detector",
70+
"tone_to_spelling",
6871
"words_to_num",
6972
]
7073

@@ -134,4 +137,9 @@
134137
syllable_open_close_detector,
135138
tone_detector,
136139
)
137-
from pythainlp.util.pronounce import rhyme
140+
from pythainlp.util.pronounce import (
141+
rhyme,
142+
spelling,
143+
tone_to_spelling,
144+
thai_consonant_to_spelling,
145+
)

pythainlp/util/pronounce.py

Lines changed: 143 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,10 +3,14 @@
33
# SPDX-FileType: SOURCE
44
# SPDX-License-Identifier: Apache-2.0
55
from typing import List
6+
import re
67

78
from pythainlp.corpus import thai_words
89
from pythainlp.khavee import KhaveeVerifier
910
from pythainlp.tokenize import syllable_tokenize
11+
from pythainlp.tokenize import Tokenizer
12+
from pythainlp import thai_consonants, thai_tonemarks
13+
from pythainlp.util import remove_tonemark
1014

1115
kv = KhaveeVerifier()
1216
all_thai_words_dict = None
@@ -30,11 +34,149 @@ def rhyme(word: str) -> List[str]:
3034
"""
3135
global all_thai_words_dict
3236
list_sumpus = []
33-
if all_thai_words_dict == None:
37+
if all_thai_words_dict is None:
3438
all_thai_words_dict = [
3539
i for i in list(thai_words()) if len(syllable_tokenize(i)) == 1
3640
]
3741
for i in all_thai_words_dict:
3842
if kv.is_sumpus(word, i) and i != word:
3943
list_sumpus.append(i)
4044
return sorted(list_sumpus)
45+
46+
47+
# Written forms of the Thai vowels, each shown on the placeholder consonant
# "อ" (the last four entries are the independent vowel letters and carry no
# placeholder). ``spelling`` also uses this list as tokenizer vocabulary.
thai_vowel = (
    "อะ,อา,อิ,อี,อึ,อื,อุ,อู,เอะ,เอ,แอะ,แอ,เอียะ,เอีย,เอือะ,เอือ,อัวะ,อัว,โอะ,"
    "โอ,เอาะ,ออ,เออะ,เออ,อำ,ใอ,ไอ,เอา,ฤ,ฤๅ,ฦ,ฦๅ"
).split(",")

# (regex pattern, substitution) pairs. The pattern is a vowel form whose first
# placeholder "อ" is replaced by a capture group matching any consonant; the
# substitution emits the captured consonant followed by the canonical vowel
# form. The final extra pair maps the "อั" form onto the vowel "อะ".
# Longer patterns come first so the most specific vowel form is tried before
# its shorter prefixes; the sort is stable, so ties keep their listed order.
thai_vowel_all = sorted(
    [
        (form.replace("อ", "([ก-ฮ])", 1), "\\1" + form)
        for form in thai_vowel
        if "อ" in form
    ]
    + [("อั".replace("อ", "([ก-ฮ])", 1), "\\1อะ")],
    key=lambda pair: len(pair[0]),
    reverse=True,
)
83+
84+
85+
def thai_consonant_to_spelling(c: str) -> str:
    """
    Thai consonants to spelling

    A single character found in ``thai_consonants`` is returned with "อ"
    appended. Any other input (empty, longer than one character, or not a
    Thai consonant) is returned unchanged.

    :param str c: A Thai consonant
    :return: spelling
    :rtype: str

    :Example:
    ::

        from pythainlp.util import thai_consonant_to_spelling

        print(thai_consonant_to_spelling("ก"))
        # output: กอ
    """
    # The length check comes first so multi-character tokens (e.g. vowel
    # forms) are passed through without a membership lookup.
    if len(c) == 1 and c in thai_consonants:
        return c + "อ"
    return c
104+
105+
106+
def tone_to_spelling(t: str) -> str:
    """
    Thai tonemarks to spelling

    Look up the Thai name of one of the four tone marks. Anything that is
    not one of those four marks is handed back unchanged.

    :param str t: A Thai tone mark
    :return: spelling
    :rtype: str

    :Example:
    ::

        from pythainlp.util import tone_to_spelling

        print(tone_to_spelling("่")) # ไม้เอก
        # output: ไม้เอก
    """
    mark_names = (
        ("่", "ไม้เอก"),
        ("้", "ไม้โท"),
        ("๊", "ไม้ตรี"),
        ("๋", "ไม้จัตวา"),
    )
    for mark, name in mark_names:
        if t == mark:
            return name
    return t
131+
132+
133+
# Lazily-built tokenizer shared by every ``spelling`` call (same lazy-global
# approach as ``all_thai_words_dict``); building it per call repeats the
# dictionary construction for no benefit.
_spelling_tokenizer = None


def spelling(word: str) -> List[str]:
    """
    Thai word to spelling

    This function supports Thai root words only.

    The result lists the spelled-out components of the word (consonants via
    ``thai_consonant_to_spelling``, vowels in their canonical form), then,
    when the word carries a tone mark, the word without marks and the name
    of its first tone mark, and finally the word itself.

    :param str word: A Thai word
    :return: spelling; an empty list if *word* is empty or not a string
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import spelling

        print(spelling("เรียน"))
        # output: ['รอ', 'เอีย', 'นอ', 'เรียน']

        print(spelling("เฝ้า"))
        # output: ['ฝอ', 'เอา', 'เฝา', 'ไม้โท', 'เฝ้า']
    """
    global _spelling_tokenizer
    if not word or not isinstance(word, str):
        return []
    if _spelling_tokenizer is None:
        _spelling_tokenizer = Tokenizer(
            custom_dict=thai_vowel + list(thai_consonants),
            engine="longest",
        )

    has_short_mark = "็" in word
    # Work on the bare word: tone marks and the "็" mark are stripped first
    # and reported separately below.
    word_pre = remove_tonemark(word).replace("็", "")
    tone = [tone_to_spelling(ch) for ch in word if ch in thai_tonemarks]

    # Rewrite the word with the first (longest) matching vowel pattern so the
    # vowel appears in canonical form next to its consonant.
    word_output = word_pre
    for pattern, replacement in thai_vowel_all:
        if re.search(pattern, word_pre):
            if has_short_mark and pattern == "เ([ก-ฮ])":
                # With "็" the bare "เ-" form is spelled as the vowel "เอะ".
                replacement = "\\1เอะ"
            word_output = re.sub(pattern, replacement, word_pre)
            break

    tokens = _spelling_tokenizer.word_tokenize(word_output)
    # Tokens containing "์" are dropped from the spelled-out list.
    output = [
        part
        for part in map(thai_consonant_to_spelling, tokens)
        if "์" not in part
    ]

    if word_pre == word:
        return output + [word]
    if tone:
        return output + [word_pre, tone[0], word]
    if has_short_mark:
        return output + [word]
    # Stripping changed the word, but neither a tone mark from
    # ``thai_tonemarks`` nor "็" was found in it.
    return output + [word_pre, word]

tests/core/test_util.py

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@
6666
to_lunar_date,
6767
tone_detector,
6868
words_to_num,
69+
spelling,
6970
)
7071
from pythainlp.util.morse import morse_decode, morse_encode
7172

@@ -844,6 +845,18 @@ def test_th_zodiac(self):
844845
# def test_abbreviation_to_full_text(self):
845846
# self.assertIsInstance(abbreviation_to_full_text("รร.ของเราน่าอยู่", list))
846847

848+
def test_spelling(self):
    """Check ``spelling`` on an empty input and on sample Thai words."""
    expectations = (
        ([], []),
        ("เรียน", ['รอ', 'เอีย', 'นอ', 'เรียน']),
        ("เฝ้า", ['ฝอ', 'เอา', 'เฝา', 'ไม้โท', 'เฝ้า']),
        ("คน", ['คอ', 'นอ', 'คน']),
        ("กัน", ['กอ', 'อะ', 'นอ', 'กัน']),
        ("กั้น", ['กอ', 'อะ', 'นอ', 'กัน', 'ไม้โท', 'กั้น']),
    )
    # Cases run in the listed order; the first mismatch fails the test.
    for given, wanted in expectations:
        self.assertEqual(spelling(given), wanted)
859+
847860
def test_longest_common_subsequence(self):
848861
self.assertEqual(longest_common_subsequence("ABCBDAB", "BDCAB"), "BDAB")
849862
self.assertEqual(longest_common_subsequence("AGGTAB", "GXTXAYB"), "GTAB")

0 commit comments

Comments
 (0)