# postagger.py
import codecs
import os
import sys
from character_tagging import character_tagging
from compute_maxent import load_data, generate_test, test_maxent, save_label
from feature_extract import feature_extract, gather_feature
from pos2word import character_2_word
from test_split import character_split
def main():
    """Run the tagging pipeline end to end, then exit with status 0.

    Steps, in order:

    1. ``character_tagging`` on the MSR training file, writing
       ``output/train_tagging.txt``.
    2. ``feature_extract`` over that tagged file with ``gather_feature``.
    3. ``character_split`` on the MSR test file, writing
       ``output/test_tagging.txt``.
    4. ``load_data`` on ``output/context.txt``, ``generate_test`` on the
       split test file, ``test_maxent('IIS', ...)`` and ``save_label``.
    5. ``character_2_word`` from ``output/pos_tagging.txt`` to
       ``output/result.txt``.

    All paths are relative to the current working directory and are built
    with ``os.path.join`` so the script is not tied to Windows separators.

    Raises:
        SystemExit: always, with code 0, once the pipeline has finished.
    """
    data_dir = 'icwb2-data'
    output_dir = 'output'

    # process msr_training file
    print('reading msr_training file ...')
    train_source = os.path.join(data_dir, 'training', 'msr_training1.utf8')
    train_tagged = os.path.join(output_dir, 'train_tagging.txt')
    # exist_ok avoids the check-then-create race of exists() + mkdir().
    os.makedirs(output_dir, exist_ok=True)
    character_tagging(train_source, train_tagged)

    # extract feature
    print('extract feature ...')
    with codecs.open(train_tagged, 'r', 'utf-8') as f:
        feature_extract(f, gather_feature)

    # split test file
    print('split test file ...')
    test_source = os.path.join(data_dir, 'testing', 'msr_test2.utf8')
    test_tagged = os.path.join(output_dir, 'test_tagging.txt')
    character_split(test_source, test_tagged)

    # compute maxent
    print('compute maxent ...')
    # NOTE(review): context.txt is presumably produced by the feature
    # extraction step above -- confirm against feature_extract.
    train = load_data(os.path.join(output_dir, 'context.txt'))

    # The handle stays open until the labels are saved, in case
    # generate_test/test_maxent consume the file lazily; it is then closed
    # deterministically instead of being leaked.
    with codecs.open(test_tagged, 'r', 'utf-8') as file_test:
        # predict label
        print('predict label')
        test = generate_test(file_test)
        label = test_maxent('IIS', test, train)
        save_label(label)

    # NOTE(review): pos_tagging.txt is presumably written by save_label --
    # confirm against compute_maxent.
    input_file = os.path.join(output_dir, 'pos_tagging.txt')
    output_file = os.path.join(output_dir, 'result.txt')
    character_2_word(input_file, output_file)

    print('\nfinish')
    sys.exit(0)
# Script entry point: run the pipeline only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    main()