# chineseSegment.py -- 235 lines (188 loc), 8.61 KB
import config
import dataProcessing
class Segment:
    """Dictionary-driven Chinese word segmenter scored with word-pair counts.

    A sentence is expanded into every dictionary word that can start at each
    position (the candidate lattice).  Each candidate is linked to the left
    neighbour that gives it the highest accumulated score, and the best chain
    is read back from the end of the sentence.

    All segmentation methods are static and can be called either on the class
    or on an instance.  Each accepts an optional ``candidateLen``; when it is
    omitted the value is read from ``config.candidateLen``.
    """

    def __init__(self):
        self.candidateLen = config.candidateLen
        self.wordList_1998_path = config.wordList_1998_path
        self.wordPairList_path_1998 = config.wordPairList_path_1998

    @staticmethod
    def getCandidateWords(sentence, words_dict, candidateLen=None):
        """Collect every dictionary word that occurs in ``sentence``.

        Args:
            sentence: Text to scan.
            words_dict: Mapping of word -> count.  It is updated in place:
                a character at which no dictionary word starts is inserted
                with a count of 1.
            candidateLen: Maximum candidate length in characters; defaults
                to ``config.candidateLen``.

        Returns:
            ``(candidate_words_list, words_dict)`` where each candidate is
            ``[word, start, end]`` with inclusive indices, ordered by start
            index and then by length.
        """
        if candidateLen is None:
            candidateLen = config.candidateLen
        candidate_words_list = []
        sentence_length = len(sentence)
        for index in range(sentence_length):
            found = False
            # Only forward look-ups are needed: a word that ends at `index`
            # was already emitted when the scan visited its start index.
            for i in range(candidateLen):
                end = index + i
                if end >= sentence_length:
                    break
                word = sentence[index:end + 1]
                if word in words_dict:
                    candidate_words_list.append([word, index, end])
                    found = True
            if not found:
                # No dictionary word starts here: fall back to the single
                # character with a count of 1, so that every position has at
                # least one outgoing candidate and the lattice stays connected.
                w = sentence[index]
                words_dict[w] = 1
                candidate_words_list.append([w, index, index])
        return candidate_words_list, words_dict

    @staticmethod
    def findCandidateLeftWords(sentence, words_dict, candidateLen=None):
        """Build, for every candidate, the list of words that may precede it.

        Args:
            sentence: Text to scan.
            words_dict: Mapping of word -> count (updated in place by
                ``getCandidateWords``).
            candidateLen: Maximum word length; defaults to
                ``config.candidateLen``.

        Returns:
            ``(candidate_words_list, words_dict, left_word_dict)``.
            ``left_word_dict`` maps ``"word start end"`` to a list of
            ``[left_word, start, end]`` entries.  Candidates starting at
            index 0 get ``['start', -1, -1]``; the extra key ``'end'`` lists
            every candidate that finishes on the last character.
        """
        if candidateLen is None:
            candidateLen = config.candidateLen
        candidate_words_list, words_dict = Segment.getCandidateWords(
            sentence, words_dict, candidateLen)
        left_word_dict = {'end': []}
        last_index = len(sentence) - 1
        for word, start, end in candidate_words_list:
            word_str = word + ' ' + str(start) + ' ' + str(end)
            left_words = []
            left_word_dict[word_str] = left_words
            if start == 0:
                left_words.append(['start', -1, -1])
            if end == last_index:
                left_word_dict['end'].append([word, start, end])
            # Try every dictionary word that ends immediately before `start`.
            for i in range(candidateLen):
                left_start = start - i - 1
                if left_start < 0:
                    break
                tmp_word = sentence[left_start:start]
                if tmp_word in words_dict:
                    left_words.append([tmp_word, left_start, start - 1])
        return candidate_words_list, words_dict, left_word_dict

    @staticmethod
    def findBestLeftWord(sentence, words_dict, words_pair_dict,
                         candidateLen=None):
        """Pick the best-scoring left neighbour for every reachable candidate.

        Args:
            sentence: Text to segment.
            words_dict: Mapping of word -> count (updated in place by
                ``getCandidateWords``).
            words_pair_dict: Mapping of ``"left right"`` -> pair count.
            candidateLen: Maximum word length; defaults to
                ``config.candidateLen``.

        Returns:
            Dict mapping ``"word start end"`` to ``[best_left_key, score]``.
            Sentence-initial candidates point at ``'start -1 -1'``.  The key
            ``'end'`` points at the best candidate finishing the sentence and
            is absent when no candidate chain covers the whole sentence.
            Candidates that no scored chain can reach are left out.
        """
        candidate_words_list, words_dict, left_word_dict = \
            Segment.findCandidateLeftWords(sentence, words_dict, candidateLen)
        words_length = len(words_dict)

        # Number of distinct pairs whose second word is a given word.  This
        # depends only on the candidate, so it is computed in a single pass
        # instead of rescanning the pair table for every (candidate, left).
        follower_counts = {}
        for key in words_pair_dict:
            parts = key.split()
            if len(parts) > 1:
                follower_counts[parts[1]] = follower_counts.get(parts[1], 0) + 1

        best_left_word_dict = {}
        max_end_pro = float('-inf')
        last_index = len(sentence) - 1
        for candidate_word, s1, s2 in candidate_words_list:
            candidate_word_str = candidate_word + ' ' + str(s1) + ' ' + str(s2)
            if s1 == 0:
                # Sentence-initial word: scored by its own count.
                pro = float(words_dict[candidate_word]) / words_length
                best_left_word_dict[candidate_word_str] = ['start -1 -1', pro]
            else:
                sum_pair_count = follower_counts.get(candidate_word, 0)
                best_left_word = None
                max_pro = float('-inf')
                for left_word, s1_left, s2_left in left_word_dict[candidate_word_str]:
                    left_word_str = (left_word + ' ' + str(s1_left)
                                     + ' ' + str(s2_left))
                    if left_word_str not in best_left_word_dict:
                        # The left word is in the dictionary but is not a
                        # scored candidate at this position.
                        continue
                    # Direct lookup; a missing pair counts as 0.  float()
                    # mirrors how word counts are converted above.
                    pair_count = float(
                        words_pair_dict.get(left_word + ' ' + candidate_word, 0))
                    # Add-one smoothing.
                    pro = (best_left_word_dict[left_word_str][1]
                           * (pair_count + 1) / (sum_pair_count + words_length))
                    if pro > max_pro:
                        max_pro = pro
                        best_left_word = left_word_str
                if best_left_word is None:
                    # Unreachable candidate: do not record a stale predecessor.
                    continue
                best_left_word_dict[candidate_word_str] = [best_left_word, max_pro]
            # Runs for sentence-initial candidates too, so a single word that
            # spans the whole sentence can be selected.
            if s2 == last_index:
                pro = best_left_word_dict[candidate_word_str][1]
                if pro > max_end_pro:
                    max_end_pro = pro
                    best_left_word_dict['end'] = [candidate_word_str, max_end_pro]
        return best_left_word_dict

    @staticmethod
    def sentenceCut(sentence):
        """Split ``sentence`` into runs of Chinese characters, runs of digits
        and single other characters.

        Characters in the range U+4E00..U+9FFF are grouped into Chinese runs,
        consecutive ``str.isdigit`` characters into digit runs, and every
        other character becomes its own element.

        Returns:
            List of substrings in original order; ``[]`` for an empty string.
        """
        def is_Chinese(char):
            return '\u4e00' <= char <= '\u9fff'

        sentence_list = []
        length = len(sentence)
        i = 0
        while i < length:
            char = sentence[i]
            if is_Chinese(char):
                belongs = is_Chinese
            elif char.isdigit():
                belongs = str.isdigit
            else:
                sentence_list.append(char)
                i += 1
                continue
            run_start = i
            while i < length and belongs(sentence[i]):
                i += 1
            sentence_list.append(sentence[run_start:i])
        return sentence_list

    @staticmethod
    def getChineseSegment(sentence, words_dict, words_pair_dict,
                          candidateLen=None):
        """Segment ``sentence`` and return the words separated by spaces.

        Args:
            sentence: Text to segment.
            words_dict: Mapping of word -> count (updated in place by
                ``getCandidateWords``).
            words_pair_dict: Mapping of ``"left right"`` -> pair count.
            candidateLen: Maximum word length; defaults to
                ``config.candidateLen``.

        Returns:
            The chosen words, each followed by one space (so the result ends
            with a space).  An empty sentence yields ``''``.  If no chain of
            candidates covers the sentence, the sentence is returned
            unsegmented followed by a space.
        """
        print('正在切分.......')
        best_left_word_dict = Segment.findBestLeftWord(
            sentence, words_dict, words_pair_dict, candidateLen)
        if 'end' not in best_left_word_dict:
            return sentence + ' ' if sentence else ''
        # Walk the back-pointers from the end marker to the start marker.
        words = []
        key = best_left_word_dict['end'][0]
        while key != 'start -1 -1':
            # Keys look like "word start end"; strip the two trailing indices.
            words.append(key.rsplit(' ', 2)[0])
            key = best_left_word_dict[key][0]
        words.reverse()
        return ''.join(word + ' ' for word in words)

    def saveResult(self, result_list, path, encoding=None):
        """Append every entry of ``result_list`` to ``path``, one per line.

        Args:
            result_list: Strings to write.
            path: Destination file, opened in append mode.
            encoding: Text encoding passed to ``open``; ``None`` keeps the
                platform default.

        Nothing is opened or created when ``result_list`` is empty.
        """
        if not result_list:
            return
        # Open once for the whole batch instead of once per line.
        with open(path, 'a', encoding=encoding) as f:
            f.writelines(line + '\n' for line in result_list)
if __name__ == '__main__':
    # Load the test sentences and show how each one is split into runs.
    test_sentences = dataProcessing.dataProcess().getTestData()
    for raw_sentence in test_sentences:
        print(Segment.sentenceCut(raw_sentence))

    # Load the word and word-pair tables from the paths named in config.
    _, words_dict = dataProcessing.dataProcess().readWordsDict(
        config.wordList_1998_path)
    words_pair_dict = dataProcessing.dataProcess().readWordsPairDict(
        config.wordPairList_path_1998)

    # Quick demonstration on one fixed sentence.
    print(Segment.getChineseSegment(
        '欢乐热闹的气氛已悄悄降临', words_dict, words_pair_dict))

    # Segment every test sentence: Chinese runs longer than one character go
    # through the segmenter, every other run is copied followed by a space.
    result_list = []
    for raw_sentence in test_sentences:
        pieces = []
        for run in Segment.sentenceCut(raw_sentence):
            starts_with_chinese = '\u4e00' <= run[0] <= '\u9fff'
            if starts_with_chinese and len(run) > 1:
                pieces.append(Segment.getChineseSegment(
                    run, words_dict, words_pair_dict))
            else:
                pieces.append(run + ' ')
        result = ''.join(pieces)
        print(result)
        result_list.append(result)

    Segment().saveResult(result_list, config.test_result_path)