Skip to content

Commit 7332984

Browse files
authored
Merge pull request #1056 from PyThaiNLP/fixed-1055
Fixed #1055 bug: Tone detector + syllable sound bug
2 parents 37f6ba8 + f3e772e commit 7332984

File tree

2 files changed

+69
-30
lines changed

2 files changed

+69
-30
lines changed

pythainlp/util/syllable.py

Lines changed: 57 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
"""
66
Syllable tools
77
"""
8+
89
import re
910

1011
from pythainlp import thai_consonants, thai_tonemarks
@@ -23,9 +24,7 @@
2324
thai_consonants_all = list(thai_consonants)
2425
thai_consonants_all.remove("อ")
2526

26-
_temp = list(
27-
"".join(["".join(v) for v in spelling_class.values()])
28-
)
27+
_temp = list("".join(["".join(v) for v in spelling_class.values()]))
2928
not_spelling_class = [j for j in thai_consonants_all if j not in _temp]
3029

3130
# vowel's short sound
@@ -37,6 +36,7 @@
3736
# These spelling consonants are live syllables.
3837
for i in ["กง", "กน", "กม", "เกย", "เกอว"]:
3938
_check_1.extend(spelling_class[i])
39+
4040
# These spelling consonants are dead syllables.
4141
_check_2 = spelling_class["กก"] + spelling_class["กบ"] + spelling_class["กด"]
4242

@@ -54,6 +54,7 @@
5454
"high": thai_high_aspirates + thai_high_irregular,
5555
}
5656
thai_initial_consonant_to_type = {}
57+
5758
for k, v in thai_initial_consonant_type.items():
5859
for i in v:
5960
thai_initial_consonant_to_type[i] = k
@@ -67,7 +68,7 @@ def sound_syllable(syllable: str) -> str:
6768
The syllable is a live syllable or dead syllable.
6869
6970
:param str syllable: Thai syllable
70-
:return: syllable's type (live or dead)
71+
:return: syllable's type ("live" or "dead")
7172
:rtype: str
7273
7374
:Example:
@@ -81,56 +82,78 @@ def sound_syllable(syllable: str) -> str:
8182
print(sound_syllable("เลข"))
8283
# output: dead
8384
"""
85+
# if len of syllable < 2
86+
if len(syllable) < 2:
87+
return "dead"
88+
8489
# get consonants
8590
consonants = [i for i in syllable if i in list(thai_consonants_all)]
91+
if (
92+
(len(consonants) == 0)
93+
and ("อ" in syllable)
94+
and any((c in set("เ")) for c in syllable)
95+
and (len(syllable) == 2)
96+
):
97+
return "live"
98+
8699
# get spelling consonants
87100
spelling_consonant = consonants[-1]
88-
# if len of syllable < 2
89-
if len(syllable) < 2:
90-
return "dead"
91-
elif (spelling_consonant in _check_2) and (
101+
if (spelling_consonant in _check_2) and (
92102
any((c in set("าีืแูาเโ")) for c in syllable) is False
93103
and any((c in set("ำใไ")) for c in syllable) is False
94104
and bool(pattern.search(syllable)) is not True
95105
):
96106
return "dead"
97-
elif any((c in set("าีืแูาโ")) for c in syllable): # in syllable:
107+
108+
if any((c in set("าีืแูาโ")) for c in syllable): # in syllable:
98109
if (
99110
spelling_consonant in _check_1
100111
and bool(re_short.search(syllable)) is not True
101112
):
102113
return "live"
103-
elif (
114+
115+
if (
104116
spelling_consonant != syllable[-1]
105117
and bool(re_short.search(syllable)) is not True
106118
):
107119
return "live"
108-
elif spelling_consonant in _check_2:
120+
121+
if spelling_consonant in _check_2:
109122
return "dead"
110-
elif bool(re_short.search(syllable)) or any(
123+
124+
if bool(re_short.search(syllable)) or any(
111125
(c in set(short)) for c in syllable
112126
):
113127
return "dead"
128+
114129
return "live"
115-
elif any((c in set("ำใไ")) for c in syllable):
130+
131+
if any((c in set("ำใไ")) for c in syllable):
116132
return "live" # if these vowel's long sounds are live syllables
117-
elif bool(pattern.search(syllable)): # if it is เ-า
133+
134+
if bool(pattern.search(syllable)): # if it is เ-า
118135
return "live"
119-
elif spelling_consonant in _check_1:
136+
137+
if spelling_consonant in _check_1:
120138
if (
121139
bool(re_short.search(syllable))
122140
or any((c in set(short)) for c in syllable)
123141
) and len(consonants) < 2:
124142
return "dead"
143+
144+
if syllable[-1] in set(short):
145+
return "dead"
146+
125147
return "live"
126-
elif bool(
148+
149+
if bool(
127150
re_short.search(syllable)
128151
) or any( # if vowel's short sound is found
129152
(c in set(short)) for c in syllable
130153
): # consonant in short
131154
return "dead"
132-
else:
133-
return "dead"
155+
156+
return "dead"
134157

135158

136159
def syllable_open_close_detector(syllable: str) -> str:
@@ -155,10 +178,13 @@ def syllable_open_close_detector(syllable: str) -> str:
155178
# output: open
156179
"""
157180
consonants = [i for i in syllable if i in list(thai_consonants)]
181+
158182
if len(consonants) < 2:
159183
return "open"
160-
elif len(consonants) == 2 and consonants[-1] == "อ":
184+
185+
if len(consonants) == 2 and consonants[-1] == "อ":
161186
return "open"
187+
162188
return "close"
163189

164190

@@ -186,27 +212,31 @@ def syllable_length(syllable: str) -> str:
186212
consonants = [i for i in syllable if i in list(thai_consonants)]
187213
if len(consonants) <= 3 and any((c in set(short)) for c in syllable):
188214
return "short"
189-
elif bool(re_short.search(syllable)):
215+
216+
if bool(re_short.search(syllable)):
190217
return "short"
191-
else:
192-
return "long"
218+
219+
return "long"
193220

194221

195222
def _tone_mark_detector(syllable: str) -> str:
196223
tone_mark = [i for i in syllable if i in list(thai_tonemarks)]
197224
if tone_mark == []:
198225
return ""
199-
else:
200-
return tone_mark[0]
226+
227+
return tone_mark[0]
201228

202229

203230
def _check_sonorant_syllable(syllable: str) -> bool:
204231
_sonorant = [i for i in syllable if i in thai_low_sonorants]
205232
consonants = [i for i in syllable if i in list(thai_consonants)]
233+
206234
if _sonorant[-1] == consonants[-2]:
207235
return True
208-
elif _sonorant[-1] == consonants[-1]:
236+
237+
if _sonorant[-1] == consonants[-1]:
209238
return True
239+
210240
return False
211241

212242

@@ -248,9 +278,7 @@ def tone_detector(syllable: str) -> str:
248278
initial_consonant_type = thai_initial_consonant_to_type[initial_consonant]
249279
# r for store value
250280
r = ""
251-
if len(consonants) > 1 and (
252-
initial_consonant in ("อ", "ห")
253-
):
281+
if len(consonants) > 1 and (initial_consonant in ("อ", "ห")):
254282
consonant_ending = _check_sonorant_syllable(syllable)
255283
if (
256284
initial_consonant == "อ"
@@ -325,4 +353,5 @@ def tone_detector(syllable: str) -> str:
325353
r = "m"
326354
elif initial_consonant_type == "high" and s == "live":
327355
r = "r"
356+
328357
return r

tests/core/test_util.py

Lines changed: 12 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -680,9 +680,14 @@ def test_sound_syllable(self):
680680
("เพราะ", "dead"),
681681
("เกาะ", "dead"),
682682
("แคะ", "dead"),
683+
("ประ", "dead"),
683684
]
684685
for i, j in test:
685-
self.assertEqual(sound_syllable(i), j)
686+
self.assertEqual(
687+
sound_syllable(i),
688+
j,
689+
f"{i} should be determined to be a '{j}' syllable."
690+
)
686691

687692
def test_tone_detector(self):
688693
data = [
@@ -710,9 +715,14 @@ def test_tone_detector(self):
710715
("f", "ผู้"),
711716
("h", "ครับ"),
712717
("f", "ค่ะ"),
718+
("m", "เอ"),
713719
]
714720
for i, j in data:
715-
self.assertEqual(tone_detector(j), i)
721+
self.assertEqual(
722+
tone_detector(j),
723+
i,
724+
f"{j} should be determined to be a '{i}' tone."
725+
)
716726

717727
def test_syllable_length(self):
718728
self.assertEqual(syllable_length("มาก"), "long")

0 commit comments

Comments
 (0)