-
Notifications
You must be signed in to change notification settings - Fork 46
/
Copy pathconllu-w2t.py
executable file
·70 lines (63 loc) · 2.95 KB
/
conllu-w2t.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
#!/usr/bin/env python
import sys
import re
import file_util
from file_util import ID,HEAD,DEPS #column index for the columns we'll need
import argparse
# Matches the ID of a multiword-token range line such as "2-3"; group 1 is the
# first word index of the range and group 2 is the last one.
# Written with a plain u"" prefix: the pattern contains no backslashes, so a
# raw literal is not needed, and the combined ur"" prefix is rejected as a
# syntax error by Python 3.
interval_re=re.compile(u"^([0-9]+)-([0-9]+)$",re.U)
def get_tokens(wtree):
    """
    Collect the tokens of a tree as inclusive integer intervals over word IDs,
    e.g. [(1,1),(2,3),(4,4),...]

    `wtree` is a tree (as produced by trees()) in the word-indexed format:
    a sequence of column lists whose ID column is either a plain integer
    (a word) or a range like "2-3" (a token spanning several words).

    A word that falls inside the most recently recorded interval produces no
    entry of its own; every other word becomes a single-word interval.
    """
    spans=[]
    for row in wtree:
        ident=row[ID]
        if not ident.isdigit():
            # Range line: record the interval it declares. No validation is
            # done here, so an ID that is neither an integer nor a range
            # fails on the attribute access below.
            m=interval_re.match(ident)
            spans.append((int(m.group(1)),int(m.group(2))))
            continue
        word_id=int(ident)
        if spans:
            first,last=spans[-1]
            if first<=word_id<=last:
                continue #already covered by the preceding interval
        spans.append((word_id,word_id)) #stand-alone word, gets its own interval
    return spans
def w2t(wtree):
    """
    Renumber a word-indexed tree, in place, into the token-indexed format.

    `wtree` is a list of column lists in the word-indexed format. Tokens are
    numbered 1,2,3,... in order of appearance. A word that is a token of its
    own receives the token number as its new ID; the words inside a multiword
    token N receive the IDs "N.1", "N.2", ... and the range line itself
    receives the ID "N". The HEAD column and the head part of every DEPS entry
    are then rewritten to the new IDs. Head 0 (the root) stays "0".

    Nothing is returned; the ID, HEAD and DEPS columns of `wtree` are
    modified directly. The input is not validated: heads are converted with
    int() and used as indices into the list of words, so they must be plain
    word numbers present in the tree.
    """
    tokens=get_tokens(wtree)
    word_ids=[u"0"] #root remains 0
    line_idx=0 #index of the line in wtree we are editing
    for token_no,(b,e) in enumerate(tokens,1): #go over all token ranges and produce new IDs for the words involved
        # The line at line_idx is either a stand-alone word or the range line
        # of a multiword token; both get the plain token number as their ID.
        wtree[line_idx][ID]=u"%d"%token_no
        line_idx+=1
        if b==e: #token==word
            word_ids.append(u"%d"%token_no)
            continue
        #We have a range, renumber the words
        for word_no in range(1,e-b+2): #consume as many lines as there are words in the token
            new_id=u"%d.%d"%(token_no,word_no)
            word_ids.append(new_id)
            wtree[line_idx][ID]=new_id
            line_idx+=1
    #word_ids is now a list with 1-based indexing which has the new ID for every single word
    #the ID column has been renumbered by now
    #now we can renumber all of the HEAD columns
    for cols in wtree:
        if cols[HEAD]==u"_": #token range line, has no head of its own
            continue
        cols[HEAD]=word_ids[int(cols[HEAD])]
        if cols[DEPS]!=u"_": #need to renumber secondary deps
            new_pairs=[]
            for head_deprel in cols[DEPS].split(u"|"):
                # Split on the first colon only: the relation itself may
                # contain colons (subtypes such as "nmod:poss").
                head,deprel=head_deprel.split(u":",1)
                new_pairs.append(word_ids[int(head)]+u":"+deprel)
            cols[DEPS]=u"|".join(new_pairs)
def main():
    """
    Command-line entry point: read trees from the input, renumber each one
    with w2t() and write it to the output together with its comments.
    """
    parser = argparse.ArgumentParser(description='Conversion script from word-based CoNLL-U to token-based CoNLL-U. This script assumes that the input is validated and does no checking on its own.')
    parser.add_argument('input', nargs='?', help='Input file name, or "-" or nothing for standard input.')
    parser.add_argument('output', nargs='?', help='Output file name, or "-" or nothing for standard output.')
    cli_args = parser.parse_args() #Parsed command-line arguments
    source,sink=file_util.in_out(cli_args)
    for comments,tree in file_util.trees(source):
        w2t(tree) #renumbers the tree in place
        file_util.print_tree(comments,tree,sink)


if __name__=="__main__":
    main()