-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathph6_add_nfkc_entry4lex_csv.py
63 lines (48 loc) · 1.59 KB
/
ph6_add_nfkc_entry4lex_csv.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
#! /bin/python
# coding: utf-8
import codecs
import unicodedata as ud
from toolbox import *
import sys
# Python 2 only: make UTF-8 the interpreter-wide default for implicit
# str <-> unicode conversions. sys.setdefaultencoding is removed from the
# sys namespace at interpreter startup, which is why sys must be reloaded
# before it can be called. Python 3 has no reload builtin and already
# defaults to UTF-8, so the hack is skipped there instead of raising
# NameError at import time.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf-8')
def main(input_file, output_file):
    """Copy a lexicon CSV, adding NFKC-normalized variants of its entries.

    Every non-empty line of ``input_file`` that does not start with a comma
    is kept. The first field of each kept line (as returned by the project
    helper ``csv_splitter``) is treated as the surface form. When the
    NFKC-normalized surface differs from the original, has the same length,
    and is not a surface already present in the input, a copy of the line
    with the normalized surface (rebuilt with ``csv_joinner``) is emitted
    directly after the original line and echoed to stdout.

    On output, lines that start with a comma or a space are dropped.
    Both files are read/written as UTF-8.

    Args:
        input_file: Path of the source lexicon CSV.
        output_file: Path of the CSV to write; an existing file is
            overwritten.
    """
    # Function-scope import so this block does not depend on the file's
    # import header; io.open behaves the same on Python 2.6+ and 3.x.
    import io

    # Load the input, remembering each kept line with its parsed fields
    # and the set of all surfaces that already exist.
    entries = []
    surface_set = set()
    # io.open rather than codecs.open: line iteration over a codecs reader
    # also breaks on Unicode line boundaries (U+2028, U+0085, FS/GS/RS, ...),
    # which would cut a record in two if a field contained one of them.
    with io.open(input_file, 'r', encoding='utf-8') as fin:
        for line in fin:
            # Drop a BOM (either byte order) and the line terminator.
            line = line.lstrip(u'\ufeff\ufffe')
            line = line.rstrip(u'\r\n')
            if not line or line.startswith(u','):
                continue
            fields = csv_splitter(line)
            entries.append((line, fields))
            surface_set.add(fields[0])

    # Add NFKC entries.
    output_line_list = []
    for line, fields in entries:
        output_line_list.append(line)
        surface = fields[0]
        nfkc_surface = ud.normalize('NFKC', surface)
        # NOTE(review): the equal-length condition presumably excludes
        # normalizations that expand or contract the surface -- confirm.
        if (surface != nfkc_surface
                and len(surface) == len(nfkc_surface)
                and nfkc_surface not in surface_set):
            # Single-argument print() produces the same output on Python 2
            # as the old print statement and also parses on Python 3.
            print(u'%s %s' % (surface, nfkc_surface))
            fields[0] = nfkc_surface
            nfkc_line = csv_joinner(fields)
            print(nfkc_line)
            output_line_list.append(nfkc_line)

    # Write out. newline='' keeps '\n' untranslated on every platform,
    # matching the binary-mode behaviour of the previous codecs.open.
    with io.open(output_file, 'w', encoding='utf-8', newline='') as fout:
        for line in output_line_list:
            if line and not line.startswith((u',', u' ')):
                fout.write(line + u'\n')
if __name__ == '__main__':
    # Command-line entry point: exactly two arguments are expected, the
    # input lexicon CSV and the output CSV path.
    argvs = sys.argv
    if len(argvs) == 3:
        main(argvs[1], argvs[2])
    else:
        # Print the usage line padded by blank lines. The script name comes
        # from argv[0] so the message cannot go stale when the file is
        # renamed. Single-argument print() is valid on Python 2 and 3.
        print('')
        print('python %s inputfile(src_lex.csv) outputfile(nfkc_added_lex.csv)' % argvs[0])
        print('')