-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathtquery.py
executable file
·288 lines (239 loc) · 11.8 KB
/
tquery.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
#!/usr/bin/env python3
"""
Query by token, filtering by matching against one or more fields of the input,
and optionally display fields as columns of output alongside the token in context.
By default, the first column is the sentence ID, the second column is the token offset(s),
and the third column is the token highlighted in context.
By default, 2 header rows are printed.
A sample of lines output by calling
./tquery.py streusle.json +ss=Manner +ss2!=Locus
reviews-116821-0011 3 p.Manner p.ComparisonRef It tasted >> like << I just flew back home .
reviews-371300-0005 8-9 p.Manner p.Manner I was told to take my coffee >> to go << if I wanted to finish it .
reviews-048363-0016 7,9 p.Manner p.Manner Here I am now driving confidently >> on << my >> own << .
Interface: ./tquery.py [OPTIONS] streusle.json [+]<fldname>[<op><pattern>] [[+]<fldname2>[<op2><pattern2>] ...]
OPTIONS:
-H: omit header lines giving commit hash and call info, and column headers (these lines start with "#")
-I: case-sensitive filtering (case-insensitive by default)
-L i..[j]: filter to sentences with at least i and no more than j tokens,
where i and j are positive integers (j is optional)
-S: omit sentence IDs in output
-T: omit token numbers (offsets within the sentence) in output
fldname: one of the column names: w(ord), l(emma), upos, xpos, feats, head, deprel, edeps, misc, smwe, wmwe, lt (lextag)
or lc (lexcat), ll (lexlemma), ss = r (role), ss2 = f (function),
g (governor lemma), o (object lemma), config (syntactic configuration). Can also specify a token-level property of the governor or object:
g.upos, o.lt, etc. (not currently supported for role/function/lexlemma/lexcat, which are stored at the lexical level; cannot be used recursively).
+fldname to print the value of the field in a column of output
op: if filtering on the field, one of: = (regex partial match), == (regex full match), != (inverse regex partial match), !== (inverse regex full match)
pattern: if filtering on the field, regex to match against the field's value; case-insensitive unless -I is specified
Note that for prepositions/possessives, lextag contains the full supersense labeling in "role|function" notation.
So lextag can be used to search for a supersense without specifying whether it occurs as role or function.
Properties of the governor and object cannot be referenced unless the govobj.py script has been run to add the information to the JSON.
An example use of this script to query preposition tokens:
./conllulex2json.py streusle.conllulex > streusle.json
./govobj.py streusle.json > streusle.go.json
./tquery.py streusle.go.json lc==PP? +ll +r +f +config!=subordinating +g +g.upos +o=. +o.upos > streusle.ptoks.tsv
Outputs the tokens annotated as P or (idiomatic) PP along with their role/function supersenses,
syntactic configuration (default, predicative, or stranded), and lemmas and POSes of the governor and (non-empty) object.
@author: Nathan Schneider (@nschneid)
@since: 2018-06-13
"""
import sys, json, fileinput, re
import shlex, subprocess
from itertools import chain
# Each table maps a user-facing field name (short alias or full name) to the
# canonical key used in the JSON data.

# Fields stored on individual tokens.
TKN_LEVEL_FIELDS = {
    'w': 'word', 'word': 'word',
    'l': 'lemma', 'lemma': 'lemma',
    'upos': 'upos',
    'xpos': 'xpos',
    'feats': 'feats',
    'head': 'head',
    'deprel': 'deprel',
    'edeps': 'edeps',
    'misc': 'misc',
    'smwe': 'smwe',
    'wmwe': 'wmwe',
    'lextag': 'lextag', 'lt': 'lextag',
}

# Fields stored on lexical expressions (single-word or multiword).
LEX_LEVEL_FIELDS = {
    'lexcat': 'lexcat', 'lc': 'lexcat',
    'lexlemma': 'lexlemma', 'll': 'lexlemma',
    'r': 'ss', 'ss': 'ss',
    'f': 'ss2', 'ss2': 'ss2',
}

# Fields stored in a lexical expression's governor/object relation.
GOVOBJ_FIELDS = {
    'g': 'govlemma', 'govlemma': 'govlemma',
    'o': 'objlemma', 'objlemma': 'objlemma',
    'config': 'config',
}

# Union of the three tables above (their key sets are disjoint).
ALL_FIELDS = {**TKN_LEVEL_FIELDS, **LEX_LEVEL_FIELDS, **GOVOBJ_FIELDS}

# Regex flags applied to query patterns: case-insensitive by default.
RE_FLAGS = re.IGNORECASE
def tselect(jsonPath, fields, tknconstraints=(), lexconstraints=(), govobjconstraints=(), minlen=0, maxlen=float('inf')):
    """Yield one dict of output values per lexical expression that satisfies all constraints.

    The JSON file at ``jsonPath`` is loaded in full; it must contain a list of
    sentences, each with "sent_id", "toks", "swes" and "smwes" entries.

    Each constraint is a ``(field, matchX)`` pair. When ``matchX`` is a callable,
    it is applied to the field's value and the expression is skipped unless it
    returns a true value. When ``matchX`` is ``None``, the pair is a request to
    copy the field's value into the output dict rather than a filter.

    Args:
        jsonPath: path of the JSON file to read (UTF-8).
        fields: names used to pre-populate each output dict with ``None``.
        tknconstraints: constraints on token-level fields. For a multi-token
            expression the values of all its tokens are combined into a tuple;
            the value is converted with ``str()`` before matching.
        lexconstraints: constraints on fields of the lexical expression itself.
        govobjconstraints: constraints on the expression's "heuristic_relation"
            entry. A field of the form ``g.X`` / ``o.X`` refers to property X of
            the governor / object token. If any such constraint is given,
            expressions lacking "heuristic_relation" are skipped.
        minlen: minimum sentence length in tokens (inclusive).
        maxlen: maximum sentence length in tokens (inclusive).

    Yields:
        dict with a key for every name in ``fields`` plus "_sentid", "_context"
        (the sentence with the expression's tokens bracketed by ``>>``/``<<``)
        and "_tokoffset" (``i-j`` for a contiguous multi-token expression,
        otherwise a comma-separated list of token numbers).

    Raises:
        ValueError: if a dotted gov/obj field does not start with ``g.`` or ``o.``.
    """
    with open(jsonPath, encoding='utf-8') as inF:
        data = json.load(inF)

    for sent in data:
        sent_toks = sent["toks"]
        if not minlen <= len(sent_toks) <= maxlen:
            continue

        for lexe in chain(sent["swes"].values(), sent["smwes"].values()):
            fail = False
            myprints = {k: None for k in fields}

            # lexical expression level: lexcat, lexlemma, ss (role), ss2 (function)
            for fld, matchX in lexconstraints:
                if matchX is None:
                    myprints[fld] = lexe[fld]
                elif not matchX(lexe[fld]):
                    fail = True
                    break

            # governor/object level: heuristic_relation["govlemma", "objlemma", "config"]
            # and token properties of the governor/object (g.X, o.X)
            if govobjconstraints and not fail:
                if "heuristic_relation" not in lexe:
                    fail = True
                else:
                    govobj = lexe["heuristic_relation"]
                    for fld, matchX in govobjconstraints:
                        if '.' in fld:
                            which, _, prop = fld.partition('.')
                            if which not in ('g', 'o'):
                                raise ValueError(f'Invalid governor/object field: {fld}')
                            i = govobj["gov"] if which == 'g' else govobj["obj"]
                            if i is None:
                                if matchX:
                                    # no governor/object token: a filter on it cannot succeed
                                    fail = True
                                    break
                                value = ''  # print an empty cell
                            else:
                                value = sent_toks[i-1][prop]
                            tested = value
                            # Token properties need not be strings; stringify them as the
                            # token-level path below does, so regex matchers can handle them.
                            if tested is not None and not isinstance(tested, str):
                                tested = str(tested)
                        else:
                            value = tested = govobj[fld]

                        if matchX is None:
                            myprints[fld] = value
                        elif not matchX(tested):
                            fail = True
                            break

            # token level
            if tknconstraints and not fail:
                toks = [sent_toks[i-1] for i in lexe["toknums"]]
                for fld, matchX in tknconstraints:
                    combined = tuple(tok[fld] for tok in toks)
                    if len(combined)==1:
                        combined = combined[0]
                    # combined is a tuple if this is a multi-token expression,
                    # and just a single field value otherwise
                    if matchX is None:
                        myprints[fld] = combined
                    elif not matchX(str(combined)):
                        fail = True
                        break

            if fail:
                continue

            myprints['_sentid'] = sent["sent_id"]

            # Render the sentence, wrapping each maximal run of the expression's
            # tokens in '>> ' ... '<< '.
            toknums = lexe["toknums"]
            pieces = []
            inmatch = False
            for tok in sent_toks:
                hit = tok["#"] in toknums
                if hit != inmatch:
                    pieces.append('>> ' if hit else '<< ')
                    inmatch = hit
                pieces.append(tok["word"] + ' ')
            if inmatch:
                pieces.append('<< ')
            myprints['_context'] = ''.join(pieces)

            if 1 < len(toknums) == max(toknums)-min(toknums)+1:
                # contiguous multi-token expression: show as a range
                myprints['_tokoffset'] = f'{min(toknums)}-{max(toknums)}'
            else:
                myprints['_tokoffset'] = ','.join(map(str, toknums))

            yield myprints
def make_matcher(pattern, flags=0, negate=False, full=False):
    """Compile a regex into a predicate over a field value (a string or None).

    Args:
        pattern: regular expression source.
        flags: ``re`` flags used to compile the pattern.
        negate: if true, the predicate succeeds when the pattern does NOT match;
            a None value then counts as success.
        full: if true, the whole value must match (``fullmatch``); otherwise a
            partial match anywhere in the value suffices (``search``).

    Returns:
        A one-argument callable returning a bool.
    """
    regex = re.compile(pattern, flags)
    # fullmatch, unlike wrapping the source in ^...$, stays correct when the
    # pattern contains a top-level alternation such as "P|PP".
    find = regex.fullmatch if full else regex.search
    if negate:
        return lambda s: s is None or find(s) is None
    return lambda s: s is not None and find(s) is not None


def parse_constraint(spec, flags=0):
    """Split a query term ``<fld><op><pattern>`` into ``(fld, matcher)``.

    ``op`` is one of ``=`` (partial match), ``==`` (full match),
    ``!=`` (inverse partial match), ``!==`` (inverse full match).
    The returned field name is as typed by the user (not yet canonicalized).
    """
    fld, pattern = spec.split('=', 1)
    negate = fld.endswith('!')
    if negate:
        fld = fld[:-1]
    full = pattern.startswith('=')
    if full:
        pattern = pattern[1:]
    return fld, make_matcher(pattern, flags, negate=negate, full=full)


def canonical_field(name, aliases):
    """Map a user-supplied field name to its canonical form using ``aliases``.

    Text before the first '.' (e.g. the ``g`` in ``g.upos``) is kept as a
    prefix and only the remainder is looked up.

    Raises:
        ValueError: if the (unprefixed) name is not a key of ``aliases``.
    """
    head, dot, tail = name.partition('.')
    prefix, base = (head + '.', tail) if dot else ('', name)
    try:
        return prefix + aliases[base]
    except KeyError:
        raise ValueError(f'Unknown field: {base}') from None


if __name__=='__main__':
    printHeader = True
    printSentId = True
    printTokOffset = True
    lowerb = 0
    upperb = float('inf')

    args = sys.argv[1:]
    if len(args) < 2:
        raise ValueError('Too few arguments. Interface: ./tquery.py [OPTIONS] streusle.json '
                         '[+]<fldname>[<op><pattern>] ...')

    # consume leading option flags
    while args and args[0].startswith('-'):
        flag = args.pop(0)
        if flag=='-H':  # no header info
            printHeader = False
        elif flag=='-I':    # case-sensitive
            RE_FLAGS = 0
        elif flag=='-S':    # no sentence IDs
            printSentId = False
        elif flag=='-T':    # no token offsets
            printTokOffset = False
        elif flag=='-L':    # sentence length range
            if not args:
                raise ValueError('-L option requires a range, e.g. "1..10" or "1.."')
            v = args.pop(0)
            lo, sep, hi = v.partition('..')
            if not sep or not lo:
                raise ValueError(f'-L {v} option is invalid (minimum length required, e.g. "1..10" or "1..")')
            lowerb = int(lo)
            upperb = int(hi) if hi else float('inf')
            if not 0 < lowerb <= upperb:
                raise ValueError(f'-L {v} option is invalid')
        else:
            raise ValueError(f'Invalid flag: {flag}')

    if not args:
        raise ValueError('Missing path to the input JSON file')
    inFP = args.pop(0)

    tknconstraints, lexconstraints, govobjconstraints = [], [], []

    def constraints_for(fld):
        """Return the constraint list that a canonical field name belongs to."""
        if fld in TKN_LEVEL_FIELDS:
            return tknconstraints
        if fld in LEX_LEVEL_FIELDS:
            return lexconstraints
        # govobj fields, including g./o.-prefixed token properties
        return govobjconstraints

    prints = [] # fields whose values are to be printed
    if printSentId:
        prints.append('_sentid')
    if printTokOffset:
        prints.append('_tokoffset')

    # parse the query (fields and constraints)
    for arg in args:
        printme = arg.startswith('+')
        if printme:
            arg = arg[1:]

        if '=' in arg:
            rawfld, matchX = parse_constraint(arg, RE_FLAGS)
            fld = canonical_field(rawfld, ALL_FIELDS)
            constraints_for(fld).append((fld, matchX))
        else:
            if not printme:
                raise ValueError(f'Query term "{arg}" neither filters (no operator) nor prints (no "+")')
            fld = canonical_field(arg, ALL_FIELDS)

        if printme and fld not in prints:
            prints.append(fld)
            # to the "constraints", add a dummy item to indicate that the
            # field should be looked up for printing
            constraints_for(fld).append((fld, None))

    prints.append('_context')

    if printHeader:
        # for reproducibility, the git commit hash and the command line call to this script
        try:
            # check_output returns bytes: decode so the header shows the bare hash
            commitHash = subprocess.check_output(['git', 'rev-parse', '--short', 'HEAD'],
                                                 stderr=subprocess.DEVNULL).decode('utf-8', 'replace').strip()
        except (OSError, subprocess.CalledProcessError):
            # git unavailable or not inside a repository: still produce output
            commitHash = 'unknown'
        sysCall = sys.argv[0] + " " + " ".join(map(shlex.quote, sys.argv[1:]))
        print(f'# {commitHash} ~ {sysCall}')

        # column headers
        print('# ' + '\t'.join(prints))

    n = 0
    for myprints in tselect(inFP, prints, tknconstraints=tknconstraints,
                            lexconstraints=lexconstraints, govobjconstraints=govobjconstraints,
                            minlen=lowerb, maxlen=upperb):
        print(*[myprints[f] for f in prints],
            #lexe["ss"]+('|'+lexe["ss2"] if lexe["ss2"] and lexe["ss2"]!=lexe["ss"] else ''), # TODO: make a field for this
            sep='\t')
        n += 1
    print(f'{n} match' + ('es' if n!=1 else ''), file=sys.stderr)