Skip to content
This repository was archived by the owner on Nov 30, 2023. It is now read-only.

Add polyphones map to override tones for common polyphonic hanzi #210

Open
wants to merge 6 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
126 changes: 126 additions & 0 deletions chinese/data/db/polyphones.tsv
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@
# hanzi pinyin
地 de5
把 ba4
长 chang2
难 nan4
率 shuai4
勒 le4
差 cha1
子 zi3
耶 ye1
尽 jin4
倒 dao3
丽 li2
佛 fo2
载 zai3
幢 zhuang4
背 bei1
划 hua2
担 dan4
咱 za2
弹 tan2
甚 shen4
薄 bao2
撒 sa3
斗 dou3
钻 zuan4
挣 zheng4
似 si4
沈 shen3
夹 jia2
档 dang3
拚 pin1
脏 zang1
识 shi2
仔 zai3
晃 huang4
缝 feng2
削 xiao1
掺 chan1
杠 gang4
揣 chuai3
漂 piao1
殷 yin3
楞 leng4
陂 po1
不是 bu2 shi5
起来 qi3 lai5
出来 chu1 lai2
东西 dong1 xi5
地方 di4 fang1
告诉 gao4 su4
当时 dang1 shi2
女人 nü3 ren2
过去 guo4 qu4
结果 jie1 guo3
多少 duo1 shao3
过来 guo4 lai2
故事 gu4 shi4
精神 jing1 shen2
人家 ren2 jia1
不了 bu4 liao3
当年 dang1 nian2
妻子 qi1 zi3
说道 shuo1 dao4
便宜 pian2 yi5
重点 zhong4 dian3
土地 tu3 di4
高中 gao1 zhong1
说法 shuo1 fa3
生意 sheng1 yi4
老公 lao3 gong1
尽量 jin3 liang4
得了 de2 le5
当天 dang1 tian1
小子 xiao3 zi5
好处 hao3 chu5
好吃 hao3 chi1
分子 fen1 zi3
为人 wei2 ren2
同行 tong2 hang2
老子 lao3 zi5
好玩 hao3 wan2
大都 da4 dou1
正当 zheng4 dang1
所长 suo3 zhang3
言语 yan2 yu3
本事 ben3 shi4
孙子 sun1 zi5
恶心 e3 xin1
重重 chong2 chong2
跟前 gen1 qian2
琢磨 zhuo2 mo2
乖乖 guai1 guai1
大方 da4 fang1
个头 ge4 tou2
温和 wen1 he2
狮子 shi1 zi5
当晚 dang1 wan3
教会 jiao1 hui4
开通 kai1 tong1
看好 kan4 hao3
大爷 da4 ye2
工夫 gong1 fu1
口音 kou3 yin1
当日 dang1 ri4
大王 da4 wang2
得罪 de2 zui4
转动 zhuan3 dong4
结实 jie1 shi2
转头 zhuan3 tou2
空地 kong1 di4
款式 kuan3 shi4
扎实 zha1 shi5
下场 xia4 chang5
公道 gong1 dao4
明朝 ming2 chao2
澄清 cheng2 qing1
分量 fen4 liang5
小儿 xiao3 er2
上头 shang4 tou2
本色 ben3 se4
单子 dan1 zi5
下水 xia4 shui3
冷战 leng3 zhan4
端详 duan1 xiang2
丁丁 ding1 ding1
37 changes: 26 additions & 11 deletions chinese/database.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@

from os.path import dirname, join, realpath
from sqlite3 import connect
import csv

from .util import add_with_space

Expand All @@ -28,6 +29,12 @@ def __init__(self):
# The SQLite dictionary and the polyphone override table are both read
# from the data/db directory that sits next to this module.
db_dir = join(dirname(realpath(__file__)), 'data', 'db')
db_path = join(db_dir, 'chinese.db')
self.conn = connect(db_path)
self.c = self.conn.cursor()
polyphone_map_path = join(db_dir, 'polyphones.tsv')
# Maps a hanzi string (first TSV column) to its numbered-pinyin override
# (second TSV column).
self.polyphone_map = {}
# newline="" lets the csv module do its own line-ending handling.
with open(polyphone_map_path, encoding="utf-8", newline="") as file:
    for row in csv.reader(file, delimiter="\t"):
        # csv.reader yields [] for a blank line and a single-item row for
        # a line without a tab; indexing those would raise IndexError, so
        # skip them together with '#' comment lines.
        if len(row) < 2 or row[0].startswith("#"):
            continue
        self.polyphone_map[row[0]] = row[1]

def create_indices(self):
self.c.execute(
Expand All @@ -42,17 +49,25 @@ def create_indices(self):
def _get_word_pinyin(self, word, type_, prefer_tw=False, no_variants=True):
from .transcribe import accentuate

if type_ == 'trad':
query = 'SELECT pinyin, pinyin_tw FROM cidian WHERE traditional=?'
elif type_ == 'simp':
query = 'SELECT pinyin, pinyin_tw FROM cidian WHERE simplified=?'
else:
raise ValueError(type_)
# first check polyphones override map
if type_ == 'simp' and word in self.polyphone_map:
return ' '.join(accentuate(list(map(str.lower, self.polyphone_map[word].split())), 'pinyin'))

if no_variants:
query += """AND (english NOT LIKE '%variant%' OR english IS NULL)
AND (german NOT LIKE '%variant%' OR german IS NULL)
AND (french NOT LIKE '%variant%' OR french IS NULL)"""
# second use zidian for single characters instead of cidian
if len(word) == 1:
query = 'SELECT kMandarin, kMandarin FROM hanzi WHERE cp=?'
else:
if type_ == 'trad':
query = 'SELECT pinyin, pinyin_tw FROM cidian WHERE traditional=?'
elif type_ == 'simp':
query = 'SELECT pinyin, pinyin_tw FROM cidian WHERE simplified=?'
else:
raise ValueError(type_)

if no_variants:
query += """AND (english NOT LIKE '%variant%' OR english IS NULL)
AND (german NOT LIKE '%variant%' OR german IS NULL)
AND (french NOT LIKE '%variant%' OR french IS NULL)"""

self.c.execute(query, (word,))
res = self.c.fetchone()
Expand Down Expand Up @@ -248,4 +263,4 @@ def get_sentences(self, word):
try:
return self.c.fetchone()
except:
return []
return []
4 changes: 2 additions & 2 deletions tests/test_ruby.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,15 +43,15 @@ def test_ruby_bottom(self):

def test_bopomofo(self):
self.assertEqual(ruby(['機場'], 'bopomofo'), ['機[ㄐㄧ]場[ㄔㄤˇ]'])
self.assertEqual(ruby(['機', '場'], 'bopomofo'), ['機[ㄐㄧ]', '場[ㄔㄤˊ]'])
self.assertEqual(ruby(['機', '場'], 'bopomofo'), ['機[ㄐㄧ]', '場[ㄔㄤˇ]'])
self.assertEqual(
ruby(['加拿大人'], 'bopomofo'), ['加[ㄐㄧㄚ]拿[ㄋㄚˊ]大[ㄉㄚˋ]人[ㄖㄣˊ]']
)

def test_bopomofo_punc(self):
self.assertEqual(ruby(['機場。'], 'bopomofo'), ['機[ㄐㄧ]場[ㄔㄤˇ]。'])
self.assertEqual(
ruby(['機', '場', '。'], 'bopomofo'), ['機[ㄐㄧ]', '場[ㄔㄤˊ]', '。']
ruby(['機', '場', '。'], 'bopomofo'), ['機[ㄐㄧ]', '場[ㄔㄤˇ]', '。']
)

def test_jyutping_available(self):
Expand Down
16 changes: 16 additions & 0 deletions tests/test_transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,6 +109,22 @@ def test_multiple_words(self):
transcribe(['图书', '馆'], 'pinyin', 'simp'), ['tú shū', 'guǎn']
)

def test_single_polyphone(self):
    # The single simplified hanzi 说 must come back as accented pinyin.
    actual = transcribe(['说'], 'pinyin', 'simp')
    self.assertEqual(actual, ['shuō'])

def test_single_zici_polyphone(self):
    # A single two-character word yields one space-separated pinyin string.
    actual = transcribe(['分子'], 'pinyin', 'simp')
    self.assertEqual(actual, ['fēn zǐ'])

def test_multiple_polyphones(self):
    # One output entry is expected per input token, in input order.
    tokens = ['你', '要', '说', '什么']
    expected = ['nǐ', 'yào', 'shuō', 'shén me']
    self.assertEqual(transcribe(tokens, 'pinyin', 'simp'), expected)

def test_multiple_zici_polyphones(self):
    # Several multi-character words, each expected as its own pinyin string.
    words = ['重点', '分子', '便宜']
    expected = ['zhòng diǎn', 'fēn zǐ', 'pián yi']
    self.assertEqual(transcribe(words, 'pinyin', 'simp'), expected)

def test_no_chinese(self):
self.assertEqual(transcribe(['foo'], 'pinyin', 'simp'), [])

Expand Down