# test_transcribe.py -- upstream repository archived by its owner on Nov 30, 2023 (read-only).
# Copyright © 2018-2019 Joseph Lorimer <[email protected]>
#
# This file is part of Chinese Support Redux.
#
# Chinese Support Redux is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by the Free
# Software Foundation, either version 3 of the License, or (at your option) any
# later version.
#
# Chinese Support Redux is distributed in the hope that it will be useful, but
# WITHOUT ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
# more details.
#
# You should have received a copy of the GNU General Public License along with
# Chinese Support Redux. If not, see <https://www.gnu.org/licenses/>.
from unittest import skip
from chinese.transcribe import (
accentuate,
get_tone_number_pinyin,
is_sentence,
no_tone,
replace_tone_marks,
split_transcript,
tone_number,
transcribe,
)
from tests import Base
class Accentuate(Base):
    """Checks for accentuate() with the 'pinyin' and 'jyutping' types."""

    def test_pinyin(self):
        # Tone-numbered pinyin should come back with tone marks, keeping
        # the grouping of syllables within each list element.
        cases = (
            (['xian4'], ['xiàn']),
            (['xian4 zai4'], ['xiàn zài']),
            (['hen3', 'gao1 xing4'], ['hěn', 'gāo xìng']),
        )
        for syllables, expected in cases:
            self.assertEqual(accentuate(syllables, 'pinyin'), expected)

    def test_cantonese(self):
        # With the 'jyutping' type the input is expected back unchanged.
        result = accentuate(['xian4'], 'jyutping')
        self.assertEqual(result, ['xian4'])
class SplitTranscript(Base):
    """Tests for split_transcript() on pinyin input.

    By default the expected result has one list element per word, with the
    word's syllables separated by spaces; with ``grouped=False`` the
    expected result has one element per syllable or punctuation mark.
    """

    def test_tone_mark(self):
        self.assertEqual(split_transcript('xiànzài', 'pinyin'), ['xiàn zài'])

    def test_tone_number(self):
        self.assertEqual(
            split_transcript('xian4zai4', 'pinyin'), ['xian4 zai4']
        )

    def test_multiple_words(self):
        # Space-separated words stay separate list elements.
        self.assertEqual(
            split_transcript('hěn gāoxìng', 'pinyin'), ['hěn', 'gāo xìng']
        )

    def test_multisyllabic_words(self):
        self.assertEqual(
            split_transcript('túshūguǎn', 'pinyin'), ['tú shū guǎn']
        )

    def test_ungrouped(self):
        # grouped=False flattens every syllable into its own element.
        self.assertEqual(
            split_transcript('hěn gāoxìng', 'pinyin', grouped=False),
            ['hěn', 'gāo', 'xìng'],
        )

    def test_apostrophe(self):
        # An apostrophe marks a syllable boundary and is not kept in the
        # expected output.
        self.assertEqual(
            split_transcript("yīlù píng'ān", 'pinyin'), ['yī lù', 'píng ān']
        )
        self.assertEqual(
            split_transcript("yòu'éryuán", 'pinyin'), ['yòu ér yuán']
        )

    def test_punctuation(self):
        # Punctuation marks are expected as separate elements.
        self.assertEqual(
            split_transcript('Méiyǒu, méiyǒu.', 'pinyin'),
            ['Méi yǒu', ',', 'méi yǒu', '.'],
        )
        self.assertEqual(
            split_transcript('Méi yǒu, méi yǒu.', 'pinyin', grouped=False),
            ['Méi', 'yǒu', ',', 'méi', 'yǒu', '.'],
        )
        self.assertEqual(
            split_transcript('(méi) yǒu', 'pinyin', grouped=False),
            ['(', 'méi', ')', 'yǒu'],
        )

    def test_issue_79(self):
        # Same two syllables written spaced, with an apostrophe, and joined.
        self.assertEqual(split_transcript("xiá ài", 'pinyin'), ['xiá', 'ài'])
        self.assertEqual(split_transcript("xiá'ài", 'pinyin'), ['xiá ài'])
        self.assertEqual(split_transcript("xiáài", 'pinyin'), ['xiá ài'])

    def test_regression_1(self):
        self.assertEqual(
            split_transcript('chuángdān', 'pinyin'), ['chuáng dān']
        )
class Transcribe(Base):
    """Checks for transcribe() with pinyin, bopomofo and jyutping targets."""

    def test_single_word(self):
        result = transcribe(['你'], 'pinyin', 'simp')
        self.assertEqual(result, ['nǐ'])

    def test_multiple_words(self):
        result = transcribe(['图书', '馆'], 'pinyin', 'simp')
        self.assertEqual(result, ['tú shū', 'guǎn'])

    def test_single_polyphone(self):
        result = transcribe(['说'], 'pinyin', 'simp')
        self.assertEqual(result, ['shuō'])

    def test_single_zici_polyphone(self):
        result = transcribe(['分子'], 'pinyin', 'simp')
        self.assertEqual(result, ['fēn zǐ'])

    def test_multiple_polyphones(self):
        words = ['你', '要', '说', '什么']
        expected = ['nǐ', 'yào', 'shuō', 'shén me']
        self.assertEqual(transcribe(words, 'pinyin', 'simp'), expected)

    def test_multiple_zici_polyphones(self):
        words = ['重点', '分子', '便宜']
        expected = ['zhòng diǎn', 'fēn zǐ', 'pián yi']
        self.assertEqual(transcribe(words, 'pinyin', 'simp'), expected)

    def test_no_chinese(self):
        # Input without any Chinese is expected to give an empty list.
        result = transcribe(['foo'], 'pinyin', 'simp')
        self.assertEqual(result, [])

    def test_mixed_english_chinese(self):
        # Latin text is expected to pass through next to transcribed hanzi.
        cases = (
            (['foo', '你'], ['foo', 'nǐ']),
            (['Brian的'], ['Brian de']),
        )
        for words, expected in cases:
            self.assertEqual(transcribe(words, 'pinyin', 'simp'), expected)

    def test_bopomofo(self):
        result = transcribe(['你'], 'bopomofo', 'trad')
        self.assertEqual(result, ['ㄋㄧˇ'])

    def test_punctuation_retained_converted(self):
        # Full-width punctuation in the input, ASCII in the expected output.
        words = ['没有', ',', '没有', '。']
        expected = ['méi yǒu', ',', 'méi yǒu', '.']
        self.assertEqual(transcribe(words, 'pinyin', 'simp'), expected)

    def test_grouped_chars(self):
        words = ['你', '什么', '时候', '能', '来', '?']
        expected = ['nǐ', 'shén me', 'shí hou', 'néng', 'lái', '?']
        self.assertEqual(transcribe(words, 'pinyin', 'simp'), expected)

    def test_jyutping_words(self):
        # The first word is expected to yield None rather than a reading.
        cases = (
            (['上海'], [None]),
            (['上海人'], ['soeng6 hoi2 jan4']),
        )
        for words, expected in cases:
            self.assertEqual(transcribe(words, 'jyutping', 'trad'), expected)

    def test_jyutping_sentence(self):
        words = ['對唔住', ',', '我', '唔係', '李', '太']
        expected = ['deoi3 m4 zyu6', ',', None, 'm4 hai6', 'lei5', 'taai3']
        self.assertEqual(transcribe(words, 'jyutping', 'trad'), expected)
class ReplaceToneMarks(Base):
    """Tests for replace_tone_marks(): tone-marked pinyin in, numbers out.

    Input that already carries tone numbers or superscripts is expected
    back unchanged.
    """

    def test_split_syllables(self):
        self.assertEqual(
            replace_tone_marks(['hàn', 'yǔ', 'pīn', 'yīn']),
            ['han4', 'yu3', 'pin1', 'yin1'],
        )

    def test_joined_syllables_spaced(self):
        self.assertEqual(
            replace_tone_marks(['hàn yǔ', 'pīn yīn']),
            ['han4 yu3', 'pin1 yin1'],
        )

    def test_joined_syllables_unspaced(self):
        # Unspaced syllables are expected to be separated in the output.
        self.assertEqual(
            replace_tone_marks(['hànyǔ', 'pīnyīn']), ['han4 yu3', 'pin1 yin1']
        )

    def test_tone_number(self):
        self.assertEqual(
            replace_tone_marks(['pin1', 'yin1']), ['pin1', 'yin1']
        )

    def test_tone_superscript(self):
        self.assertEqual(
            replace_tone_marks(['pin¹', 'yin¹']), ['pin¹', 'yin¹']
        )

    def test_neutral_tone(self):
        # An unmarked syllable is expected to get tone number 5.
        self.assertEqual(replace_tone_marks(['ne']), ['ne5'])

    def test_umlaut(self):
        # The umlaut is kept; only the tone mark becomes a number.
        self.assertEqual(replace_tone_marks(['lǘ']), ['lü2'])

    def test_ruby(self):
        self.assertEqual(replace_tone_marks(['你[nǐ]']), ['你[ni3]'])

    def test_composed_diacritics(self):
        from unicodedata import normalize

        # Normalize explicitly to NFC (precomposed code points) so this case
        # cannot silently turn into the decomposed one if an editor or copy
        # step re-normalizes the literals.
        syllables = [normalize('NFC', s) for s in ('shén', 'yùn')]
        self.assertEqual(replace_tone_marks(syllables), ['shen2', 'yun4'])

    def test_decomposed_diacritics(self):
        from unicodedata import normalize

        # Normalize explicitly to NFD (base letter + combining mark); the
        # literals look identical to the composed case on screen.
        syllables = [normalize('NFD', s) for s in ('shén', 'yùn')]
        self.assertEqual(replace_tone_marks(syllables), ['shen2', 'yun4'])

    def test_issue_79(self):
        self.assertEqual(replace_tone_marks(['xiá', 'ài']), ['xia2', 'ai4'])
        self.assertEqual(replace_tone_marks(['xiáài']), ['xia2 ai4'])
class NoTone(Base):
    """Tests for no_tone(): the expected output carries no tone information.

    Covers tone numbers, superscripts, tone marks, tone-styled HTML spans
    and ruby-style bracket annotations.
    """

    def test_tone_number(self):
        self.assertEqual(no_tone('ni3'), 'ni')

    def test_tone_superscript(self):
        self.assertEqual(no_tone('ni³'), 'ni')

    def test_tone_mark(self):
        self.assertEqual(no_tone('má'), 'ma')

    def test_tone_styling_spaced(self):
        # The span markup is expected to be dropped along with the tones.
        self.assertEqual(
            no_tone(
                '<span class="tone2">méi</span> <span class="tone3">yǒu</span>'
            ),
            'mei you',
        )

    # unittest.skip() is documented as taking a reason; passing one keeps
    # the skip report informative instead of showing an empty reason.
    @skip('no_tone() on unspaced tone-styled spans; '
          'skipped upstream without a stated reason')
    def test_tone_styling_unspaced(self):
        self.assertEqual(
            no_tone(
                '<span class="tone2">méi</span><span class="tone3">yǒu</span>'
            ),
            'meiyou',
        )

    def test_ruby(self):
        # Hanzi outside the brackets are expected to be left as they are.
        self.assertEqual(no_tone('你[nǐ]'), '你[ni]')
class ToneNumber(Base):
    """Checks that tone_number() returns the tone as a one-digit string."""

    def test_tone_number(self):
        tone = tone_number('ni3')
        self.assertEqual(tone, '3')

    def test_tone_superscript(self):
        tone = tone_number('ni³')
        self.assertEqual(tone, '3')

    def test_tone_mark(self):
        tone = tone_number('nǐ')
        self.assertEqual(tone, '3')

    def test_tone_styling(self):
        # The span class says tone2 while the syllable carries a third-tone
        # mark; the expected answer follows the mark.
        styled = '<span class="tone2">nǐ</span>'
        self.assertEqual(tone_number(styled), '3')

    def test_bopomofo(self):
        # One bopomofo syllable per tone; the unmarked one is expected to
        # count as tone 1.
        cases = (
            ('ㄧㄣ', '1'),
            ('ㄖㄨˊ', '2'),
            ('ㄋㄧˇ', '3'),
            ('ㄓㄨˋ', '4'),
            ('ㄋㄜ˙', '5'),
        )
        for syllable, expected in cases:
            self.assertEqual(tone_number(syllable), expected)
class IsSentence(Base):
    """Checks for is_sentence() based on length and on punctuation."""

    def test_length(self):
        # Six characters are expected to be rejected, seven accepted.
        six_chars = '你' * 6
        seven_chars = '你' * 7
        self.assertFalse(is_sentence(six_chars))
        self.assertTrue(is_sentence(seven_chars))

    def test_punc(self):
        # A single character qualifies only with the trailing full stop.
        bare = '你'
        punctuated = '你。'
        self.assertFalse(is_sentence(bare))
        self.assertTrue(is_sentence(punctuated))
class GetToneNumberPinyin(Base):
    """Checks for get_tone_number_pinyin() on tone-marked pinyin strings."""

    def test_get_tone_number_pinyin(self):
        # Each pair is (tone-marked input, expected tone-numbered output).
        cases = (
            ('hàn yǔ', 'han yu3'),
            ('hànyǔ', 'hanyu3'),
            ('hàn', 'han4'),
        )
        for marked, expected in cases:
            self.assertEqual(get_tone_number_pinyin(marked), expected)