diff --git a/chinese/data/db/polyphones.tsv b/chinese/data/db/polyphones.tsv
new file mode 100644
index 0000000..0fb048d
--- /dev/null
+++ b/chinese/data/db/polyphones.tsv
@@ -0,0 +1,126 @@
+# hanzi	pinyin
+地	de5
+把	ba4
+长	chang2
+难	nan4
+率	shuai4
+勒	le4
+差	cha1
+子	zi3
+耶	ye1
+尽	jin4
+倒	dao3
+丽	li2
+佛	fo2
+载	zai3
+幢	zhuang4
+背	bei1
+划	hua2
+担	dan4
+咱	za2
+弹	tan2
+甚	shen4
+薄	bao2
+撒	sa3
+斗	dou3
+钻	zuan4
+挣	zheng4
+似	si4
+沈	shen3
+夹	jia2
+档	dang3
+拚	pin1
+脏	zang1
+识	shi2
+仔	zai3
+晃	huang4
+缝	feng2
+削	xiao1
+掺	chan1
+杠	gang4
+揣	chuai3
+漂	piao1
+殷	yin3
+楞	leng4
+陂	po1
+不是	bu2 shi5
+起来	qi3 lai5
+出来	chu1 lai2
+东西	dong1 xi5
+地方	di4 fang1
+告诉	gao4 su4
+当时	dang1 shi2
+女人	nü3 ren2
+过去	guo4 qu4
+结果	jie1 guo3
+多少	duo1 shao3
+过来	guo4 lai2
+故事	gu4 shi4
+精神	jing1 shen2
+人家	ren2 jia1
+不了	bu4 liao3
+当年	dang1 nian2
+妻子	qi1 zi3
+说道	shuo1 dao4
+便宜	pian2 yi5
+重点	zhong4 dian3
+土地	tu3 di4
+高中	gao1 zhong1
+说法	shuo1 fa3
+生意	sheng1 yi4
+老公	lao3 gong1
+尽量	jin3 liang4
+得了	de2 le5
+当天	dang1 tian1
+小子	xiao3 zi5
+好处	hao3 chu5
+好吃	hao3 chi1
+分子	fen1 zi3
+为人	wei2 ren2
+同行	tong2 hang2
+老子	lao3 zi5
+好玩	hao3 wan2
+大都	da4 dou1
+正当	zheng4 dang1
+所长	suo3 zhang3
+言语	yan2 yu3
+本事	ben3 shi4
+孙子	sun1 zi5
+恶心	e3 xin1
+重重	chong2 chong2
+跟前	gen1 qian2
+琢磨	zhuo2 mo2
+乖乖	guai1 guai1
+大方	da4 fang1
+个头	ge4 tou2
+温和	wen1 he2
+狮子	shi1 zi5
+当晚	dang1 wan3
+教会	jiao1 hui4
+开通	kai1 tong1
+看好	kan4 hao3
+大爷	da4 ye2
+工夫	gong1 fu1
+口音	kou3 yin1
+当日	dang1 ri4
+大王	da4 wang2
+得罪	de2 zui4
+转动	zhuan3 dong4
+结实	jie1 shi2
+转头	zhuan3 tou2
+空地	kong1 di4
+款式	kuan3 shi4
+扎实	zha1 shi5
+下场	xia4 chang5
+公道	gong1 dao4
+明朝	ming2 chao2
+澄清	cheng2 qing1
+分量	fen4 liang5
+小儿	xiao3 er2
+上头	shang4 tou2
+本色	ben3 se4
+单子	dan1 zi5
+下水	xia4 shui3
+冷战	leng3 zhan4
+端详	duan1 xiang2
+丁丁	ding1 ding1
diff --git a/chinese/database.py b/chinese/database.py
index 09f1d22..4cbf1e6 100644
--- a/chinese/database.py
+++ b/chinese/database.py
@@ -19,6 +19,7 @@
 
 from os.path import dirname, join, realpath
 from sqlite3 import connect
+import csv
 
 from .util import add_with_space
 
@@ -28,6 +29,17 @@ def __init__(self):
         db_path = join(dirname(realpath(__file__)), 'data', 'db', 'chinese.db')
         self.conn = connect(db_path)
         self.c = self.conn.cursor()
+        # Load the manual pinyin overrides for polyphonic characters and words.
+        # Each data row is "<simplified hanzi><TAB><numbered pinyin syllables>".
+        polyphone_map_path = join(dirname(realpath(__file__)), 'data', 'db', 'polyphones.tsv')
+        self.polyphone_map = {}
+        with open(polyphone_map_path, encoding="utf-8") as file:
+            for row in csv.reader(file, delimiter="\t"):
+                # csv.reader yields [] for blank lines; skip those and '#'
+                # comment rows instead of raising IndexError on row[0].
+                if not row or row[0].startswith("#"):
+                    continue
+                self.polyphone_map[row[0]] = row[1]
 
     def create_indices(self):
         self.c.execute(
@@ -42,17 +54,31 @@ def create_indices(self):
     def _get_word_pinyin(self, word, type_, prefer_tw=False, no_variants=True):
         from .transcribe import accentuate
 
-        if type_ == 'trad':
-            query = 'SELECT pinyin, pinyin_tw FROM cidian WHERE traditional=?'
-        elif type_ == 'simp':
-            query = 'SELECT pinyin, pinyin_tw FROM cidian WHERE simplified=?'
-        else:
-            raise ValueError(type_)
+        # Validate the script up front so that overrides and single-character
+        # lookups reject an unknown type_ exactly like cidian lookups do.
+        if type_ not in ('trad', 'simp'):
+            raise ValueError(type_)
+
+        # First consult the polyphone override map (simplified input only).
+        if type_ == 'simp' and word in self.polyphone_map:
+            return ' '.join(accentuate(list(map(str.lower, self.polyphone_map[word].split())), 'pinyin'))
 
-        if no_variants:
-            query += """AND (english NOT LIKE '%variant%' OR english IS NULL)
-                        AND (german NOT LIKE '%variant%' OR german IS NULL)
-                        AND (french NOT LIKE '%variant%' OR french IS NULL)"""
+        # Single characters are looked up in the hanzi table rather than
+        # cidian. kMandarin is selected twice, presumably to keep the
+        # two-column (pinyin, pinyin_tw) row shape -- confirm against the
+        # code that consumes `res`.
+        if len(word) == 1:
+            query = 'SELECT kMandarin, kMandarin FROM hanzi WHERE cp=?'
+        else:
+            if type_ == 'trad':
+                query = 'SELECT pinyin, pinyin_tw FROM cidian WHERE traditional=?'
+            else:
+                query = 'SELECT pinyin, pinyin_tw FROM cidian WHERE simplified=?'
+
+            if no_variants:
+                query += """AND (english NOT LIKE '%variant%' OR english IS NULL)
+                            AND (german NOT LIKE '%variant%' OR german IS NULL)
+                            AND (french NOT LIKE '%variant%' OR french IS NULL)"""
 
         self.c.execute(query, (word,))
         res = self.c.fetchone()
@@ -248,4 +274,4 @@ def get_sentences(self, word):
         try:
             return self.c.fetchone()
         except:
-            return []
\ No newline at end of file
+            return []
diff --git a/tests/test_ruby.py b/tests/test_ruby.py
index 1c9770a..f37c95e 100644
--- a/tests/test_ruby.py
+++ b/tests/test_ruby.py
@@ -43,7 +43,7 @@ def test_ruby_bottom(self):
 
     def test_bopomofo(self):
         self.assertEqual(ruby(['機場'], 'bopomofo'), ['機[ㄐㄧ]場[ㄔㄤˇ]'])
-        self.assertEqual(ruby(['機', '場'], 'bopomofo'), ['機[ㄐㄧ]', '場[ㄔㄤˊ]'])
+        self.assertEqual(ruby(['機', '場'], 'bopomofo'), ['機[ㄐㄧ]', '場[ㄔㄤˇ]'])
         self.assertEqual(
             ruby(['加拿大人'], 'bopomofo'), ['加[ㄐㄧㄚ]拿[ㄋㄚˊ]大[ㄉㄚˋ]人[ㄖㄣˊ]']
         )
@@ -51,7 +51,7 @@ def test_bopomofo(self):
     def test_bopomofo_punc(self):
         self.assertEqual(ruby(['機場。'], 'bopomofo'), ['機[ㄐㄧ]場[ㄔㄤˇ]。'])
         self.assertEqual(
-            ruby(['機', '場', '。'], 'bopomofo'), ['機[ㄐㄧ]', '場[ㄔㄤˊ]', '。']
+            ruby(['機', '場', '。'], 'bopomofo'), ['機[ㄐㄧ]', '場[ㄔㄤˇ]', '。']
         )
 
     def test_jyutping_available(self):
diff --git a/tests/test_transcribe.py b/tests/test_transcribe.py
index ec8c027..c05f561 100644
--- a/tests/test_transcribe.py
+++ b/tests/test_transcribe.py
@@ -109,6 +109,22 @@ def test_multiple_words(self):
             transcribe(['图书', '馆'], 'pinyin', 'simp'), ['tú shū', 'guǎn']
         )
 
+    def test_single_polyphone(self):
+        self.assertEqual(transcribe(['说'], 'pinyin', 'simp'), ['shuō'])
+
+    def test_single_zici_polyphone(self):
+        self.assertEqual(transcribe(['分子'], 'pinyin', 'simp'), ['fēn zǐ'])
+
+    def test_multiple_polyphones(self):
+        self.assertEqual(
+            transcribe(['你', '要', '说', '什么'], 'pinyin', 'simp'), ['nǐ', 'yào', 'shuō', 'shén me']
+        )
+
+    def test_multiple_zici_polyphones(self):
+        self.assertEqual(
+            transcribe(['重点', '分子', '便宜'], 'pinyin', 'simp'), ['zhòng diǎn', 'fēn zǐ', 'pián yi']
+        )
+
     def test_no_chinese(self):
         self.assertEqual(transcribe(['foo'], 'pinyin', 'simp'), [])
 