gsw_style_tokenization.py
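"""Merge CoNLL-U word pairs into single GSW-style surface tokens.

Reads a CoNLL-U file, collapses multi-word token ranges (e.g. "4-5") and
adjacent non-punctuation tokens marked SpaceAfter=No into one token each,
records the original forms and UPOS tags in the MISC column (Orig=...),
renumbers the remaining IDs and heads, and writes the result back out.
"""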
import sys

def read_conllu(filename):
    """Read a CoNLL-U file into a list of (comments, tokens) per sentence."""
    sentences = []
    with open(filename, encoding="utf8") as f_in:
        comments = []
        tokens = []
        for line in f_in:
            line = line.strip()
            if not line:
                # Blank line: sentence boundary
                if tokens:
                    sentences.append((comments, tokens))
                    comments = []
                    tokens = []
                continue
            if line.startswith("#"):
                comments.append(line)
                continue
            tokens.append(line.split("\t"))
        # Flush the last sentence if the file has no trailing blank line
        if tokens:
            sentences.append((comments, tokens))
    return sentences
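

# For reference, read_conllu() returns one (comments, tokens) pair per
# sentence, where each entry of tokens holds the ten CoNLL-U columns
# (ID, FORM, LEMMA, UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC).
# A minimal, hypothetical input sentence with a multi-word token range line
# (columns are tab-separated in the real file):
#
#   # text = z'Huus
#   1-2   z'Huus   _   _      _   _   _   _      _   _
#   1     z'       _   ADP    _   _   2   case   _   _
#   2     Huus     _   NOUN   _   _   0   root   _   _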

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: python3 gsw_style_tokenization.py IN_FILE OUT_FILE")
        sys.exit(1)
    _, in_file, out_file = sys.argv
    sentences = read_conllu(in_file)
    with open(out_file, "w", encoding="utf8") as f_out:
        for (comments, tokens) in sentences:
            for comment in comments:
                f_out.write(comment + "\n")
            # Find multi-word tokens that should be merged and keep track
            # of resulting ID changes
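            # For example (hypothetical rows), a determiner glued to its noun,
            #   3   de      _   DET    _   _   4   det   _   SpaceAfter=No
            #   4   Maa     _   NOUN   _   _   2   obj   _   _
            # becomes one row whose MISC records the original parts,
            #   3   deMaa   _   NOUN   _   _   2   obj   _   Orig=de_DET+Maa_NOUN
            # and every later ID shifts down by one (tracked via cur_id_offset).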
            id_map = {}      # old ID -> renumbered ID
            deprel_map = {}  # absorbed ID -> its own (head, deprel), for re-attaching dependents
            cur_id_offset = 0
            i = 0
            while i < len(tokens):
                if not tokens[i]:
                    # Row already absorbed by an earlier merge
                    i += 1
                    continue
                id_ = tokens[i][0]
                i1, i2 = None, None
if "-" in id_:
# Multi-word tokens
start, end = id_.split("-")
start = int(start)
end = int(end)
if end - start > 1:
print("!!!!!! TODO long MWT")
print(id_)
print(tokens)
sys.exit(1)
tokens[i] = None
i1 = i + 1
i2 = i + 2
elif "SpaceAfter=No" in tokens[i][9] and\
"CorrectSpaceAfter=Yes" not in tokens[i][9] and\
tokens[i][3] != "PUNCT" and\
tokens[i + 1][3] != "PUNCT" and\
tokens[i + 1][1] != "–":
# Determiner-noun combinations etc.
i1 = i
i2 = i + 1
                if i1 is not None and i2 is not None:  # compare to None: i1 can legitimately be 0
                    id1, form1, _, upos1, _, _, head1, deprel1, _, _ = tokens[i1]
                    id2, form2, _, upos2, _, _, head2, deprel2, _, misc2 = tokens[i2]
                    if id2 == head1:
                        # The first token depends on the second: keep the second
                        # row and re-attach dependents of the first
                        # id_map[id1] = int(id2) - cur_id_offset
                        deprel_map[id1] = (head1, deprel1)
                        # The merged row takes over the first token's (shifted) slot
                        id_map[id2] = int(id1) - cur_id_offset
                        tokens[i1] = None
                        tokens[i2][1] = form1 + form2
                        # tokens[i2][3] = upos1  # per NOAH paper / UD-GSW-UZH
                        if misc2 == "_":
                            tokens[i2][9] = f"Orig={form1}_{upos1}+{form2}_{upos2}"
                        else:
                            tokens[i2][9] += f"|Orig={form1}_{upos1}+{form2}_{upos2}"
                    else:
                        # Otherwise keep the first row and re-attach dependents
                        # of the second
                        # id_map[id2] = int(id1) - cur_id_offset
                        deprel_map[id2] = (head2, deprel2)
                        if cur_id_offset > 0:
                            id_map[id1] = int(id1) - cur_id_offset
                        tokens[i2] = None
                        tokens[i1][1] = form1 + form2
                        tokens[i1][9] = f"Orig={form1}_{upos1}+{form2}_{upos2}"
                        if "SpaceAfter=No" in misc2:
                            tokens[i1][9] += "|SpaceAfter=No"
                            if i2 + 1 < len(tokens) and tokens[i2 + 1][3] != "PUNCT":
                                # Triple token: absorb the following row as well
                                form3 = tokens[i2 + 1][1]
                                upos3 = tokens[i2 + 1][3]
                                misc3 = tokens[i2 + 1][9]
                                # Key by the absorbed row's ID (column 0), not its
                                # list index, so head remapping below can find it
                                id_map[tokens[i2 + 1][0]] = int(id1) - cur_id_offset
                                tokens[i1][1] = form1 + form2 + form3
                                tokens[i1][9] = f"Orig={form1}_{upos1}+{form2}_{upos2}+{form3}_{upos3}"
                                tokens[i2 + 1] = None
                                i += 1
                                cur_id_offset += 1
                                if "SpaceAfter=No" in misc3:
                                    # A fourth glued token would need another pass
                                    print("!!!! Quadruple tokens currently unaccounted for")
                                    print(id_)
                                    print(tokens)
                    cur_id_offset += 1
                    i = i2  # jump past the merge; cleared rows are skipped above
                else:
                    # No merge at this position: just renumber if needed
                    if cur_id_offset > 0:
                        id_map[id_] = int(id_) - cur_id_offset
                    i += 1
            # Apply ID changes and write output
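            # For instance (hypothetical IDs): if row "3" was absorbed into
            # row "2", any token whose HEAD was "3" is re-attached via
            # deprel_map, and later IDs and HEADs shift down via id_map.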
            for token in tokens:
                if not token:
                    continue
                id_, form, _, upos, _, _, head, deprel, _, misc = token
                if head in deprel_map:
                    head, deprel = deprel_map[head]
                id_ = id_map.get(id_, id_)
                head = id_map.get(head, head)
                f_out.write(f"{id_}\t{form}\t_\t{upos}\t_\t_\t{head}\t{deprel}\t_\t{misc}\n")
            f_out.write("\n")