33# SPDX-FileType: SOURCE
44# SPDX-License-Identifier: Apache-2.0
55from typing import List
6+ import re
67
78from pythainlp .corpus import thai_words
89from pythainlp .khavee import KhaveeVerifier
910from pythainlp .tokenize import syllable_tokenize
11+ from pythainlp .tokenize import Tokenizer
12+ from pythainlp import thai_consonants , thai_tonemarks
13+ from pythainlp .util import remove_tonemark
1014
# Shared verifier instance; rhyme() calls kv.is_sumpus() on it.
kv = KhaveeVerifier()
# Cache of single-syllable dictionary words; rhyme() fills it on first use.
all_thai_words_dict = None
@@ -30,11 +34,149 @@ def rhyme(word: str) -> List[str]:
3034 """
3135 global all_thai_words_dict
3236 list_sumpus = []
33- if all_thai_words_dict == None :
37+ if all_thai_words_dict is None :
3438 all_thai_words_dict = [
3539 i for i in list (thai_words ()) if len (syllable_tokenize (i )) == 1
3640 ]
3741 for i in all_thai_words_dict :
3842 if kv .is_sumpus (word , i ) and i != word :
3943 list_sumpus .append (i )
4044 return sorted (list_sumpus )
45+
46+
# Thai vowel forms written on the placeholder consonant "อ".
thai_vowel = "".join((
    "อะ,อา,อิ,อี,อึ,อื,อุ,อู,เอะ,เอ,แอะ,แอ,เอียะ,เอีย,เอือะ,เอือ,อัวะ,อัว,โอะ,",
    "โอ,เอาะ,ออ,เออะ,เออ,อำ,ใอ,ไอ,เอา,ฤ,ฤๅ,ฦ,ฦๅ"
)).split(",")

# (regex pattern, replacement template) pairs. Each pattern captures the
# consonant a vowel is written on; the template re-emits that consonant
# (backreference \1) followed by the vowel on the placeholder "อ".
thai_vowel_all = [
    ("([ก-ฮ])ะ", "\\1อะ"),
    ("([ก-ฮ])า", "\\1อา"),
    ("อิ".replace("อ", "([ก-ฮ])"), "อิ".replace("อ", "\\1อ")),
    ("อี".replace("อ", "([ก-ฮ])"), "อี".replace("อ", "\\1อ")),
    ("อึ".replace("อ", "([ก-ฮ])", 1), "อึ".replace("อ", "\\1อ", 1)),
    ("อื".replace("อ", "([ก-ฮ])", 1), "อื".replace("อ", "\\1อ", 1)),
    ("อุ".replace("อ", "([ก-ฮ])", 1), "อุ".replace("อ", "\\1อ", 1)),
    ("อู".replace("อ", "([ก-ฮ])", 1), "อู".replace("อ", "\\1อ", 1)),
    ("เอะ".replace("อ", "([ก-ฮ])", 1), "\\1เอะ"),
    ("เอ".replace("อ", "([ก-ฮ])", 1), "\\1เอ"),
    ("แอะ".replace("อ", "([ก-ฮ])", 1), "\\1แอะ"),
    ("แอ".replace("อ", "([ก-ฮ])", 1), "\\1แอ"),
    ("เอียะ".replace("อ", "([ก-ฮ])", 1), "\\1เอียะ"),
    ("เอีย".replace("อ", "([ก-ฮ])", 1), "\\1เอีย"),
    ("เอือะ".replace("อ", "([ก-ฮ])", 1), "\\1เอือะ"),
    ("เอือ".replace("อ", "([ก-ฮ])", 1), "\\1เอือ"),
    ("อัวะ".replace("อ", "([ก-ฮ])", 1), "\\1อัวะ"),
    ("อัว".replace("อ", "([ก-ฮ])", 1), "\\1อัว"),
    ("โอะ".replace("อ", "([ก-ฮ])", 1), "\\1โอะ"),
    ("โอ".replace("อ", "([ก-ฮ])", 1), "\\1โอ"),
    ("เอาะ".replace("อ", "([ก-ฮ])", 1), "\\1เอาะ"),
    ("ออ".replace("อ", "([ก-ฮ])", 1), "\\1ออ"),
    ("เออะ".replace("อ", "([ก-ฮ])", 1), "\\1เออะ"),
    ("เออ".replace("อ", "([ก-ฮ])", 1), "\\1เออ"),
    ("อำ".replace("อ", "([ก-ฮ])", 1), "\\1อำ"),
    ("ใอ".replace("อ", "([ก-ฮ])", 1), "\\1ใอ"),
    ("ไอ".replace("อ", "([ก-ฮ])", 1), "\\1ไอ"),
    ("เอา".replace("อ", "([ก-ฮ])", 1), "\\1เอา"),
    ("อั".replace("อ", "([ก-ฮ])", 1), "\\1อะ"),
]
# Longest pattern first, so a compound vowel is tried before any shorter
# vowel pattern that would also match part of it.
thai_vowel_all.sort(key=lambda t: len(t[0]), reverse=True)
83+
84+
def thai_consonant_to_spelling(c: str) -> str:
    """
    Thai consonant to spelling

    A single Thai consonant is returned with "อ" appended; any other
    input is returned unchanged.

    :param str c: A Thai consonant
    :return: spelling
    :rtype: str

    :Example:
    ::

        from pythainlp.util import thai_consonant_to_spelling

        print(thai_consonant_to_spelling("ก"))
        # output: กอ
    """
    # The length check keeps multi-character tokens (e.g. vowel forms)
    # from matching as substrings of the consonant set.
    if len(c) == 1 and c in thai_consonants:
        return c + "อ"
    return c
104+
105+
def tone_to_spelling(t: str) -> str:
    """
    Thai tone mark to spelling

    Returns the Thai name of a tone mark; any other input is returned
    unchanged.

    :param str t: A Thai tone mark
    :return: spelling
    :rtype: str

    :Example:
    ::

        from pythainlp.util import tone_to_spelling

        print(tone_to_spelling("่"))  # ไม้เอก
        # output: ไม้เอก
    """
    tone_names = {
        "่": "ไม้เอก",
        "้": "ไม้โท",
        "๊": "ไม้ตรี",
        "๋": "ไม้จัตวา",
    }
    # Non-string (possibly unhashable) input falls through unchanged,
    # exactly as an equality-comparison chain would treat it.
    if not isinstance(t, str):
        return t
    return tone_names.get(t, t)
131+
132+
def spelling(word: str) -> List[str]:
    """
    Thai word to spelling

    This function supports Thai root words only.

    :param str word: A Thai word
    :return: spelling; an empty list for empty or non-string input
    :rtype: List[str]

    :Example:
    ::

        from pythainlp.util import spelling

        print(spelling("เรียน"))
        # output: ['รอ', 'เอีย', 'นอ', 'เรียน']

        print(spelling("เฝ้า"))
        # output: ['ฝอ', 'เอา', 'เฝา', 'ไม้โท', 'เฝ้า']
    """
    # Check the type first so truth-testing never runs on a non-string.
    if not isinstance(word, str) or not word:
        return []
    thai_vowel_tokenizer = Tokenizer(
        custom_dict=thai_vowel + list(thai_consonants),
        engine="longest",
    )
    has_maitaikhu = "็" in word
    # Work on the word with tone marks and "็" stripped.
    word_pre = remove_tonemark(word).replace("็", "")
    tone = [tone_to_spelling(ch) for ch in word if ch in thai_tonemarks]
    word_output = word_pre
    # Only the first matching vowel pattern is applied.
    for pattern, replacement in thai_vowel_all:
        if re.search(pattern, word_pre):
            if has_maitaikhu and pattern == "เ([ก-ฮ])":
                # With "็" present, the "เ-" pattern is rewritten to
                # "เอะ" instead of its table replacement.
                replacement = "\\1เอะ"
            word_output = re.sub(pattern, replacement, word_pre)
            break
    tokens = thai_vowel_tokenizer.word_tokenize(word_output)
    # Drop any token that contains "์".
    output = [
        part
        for part in map(thai_consonant_to_spelling, tokens)
        if "์" not in part
    ]
    if word_pre == word:
        return output + [word]
    if tone:
        # Only the first tone mark found in the word is reported.
        return output + [word_pre, tone[0], word]
    if has_maitaikhu:
        return output + [word]
    return output + [word_pre, word]
0 commit comments