-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathhelper.py
188 lines (153 loc) · 4.88 KB
/
helper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# English modal auxiliary verbs, all lower case.
# NOTE(review): presumably meant to be passed as the `aux` list of
# is_sub_aux_inv; nothing in this file references it directly - confirm.
modals = ['can', 'could', 'may', 'might', 'shall', 'should', 'will', 'would', 'must']
def getWh(tagged)->str:
    """Return the first wh-word detected in a tagged sentence.

    Items are scanned in order. For every item the lower-cased text of its
    first element is tested for the substrings ``'who'``, ``'where'`` and
    ``'how'`` (in that priority), and the first hit is returned.

    Args:
        tagged: Iterable of indexable items whose first element is the
            token text, e.g. ``(word, pos)`` pairs.

    Returns:
        ``'who'``, ``'where'`` or ``'how'``, or ``None`` if no token matches.
    """
    # NOTE(review): this is a substring test, so tokens such as "show" or
    # "whoever" also match. Kept as-is; confirm whether that is intended.
    wh_candidates = ('who', 'where', 'how')
    for item in tagged:
        lowered = item[0].lower()
        for wh_word in wh_candidates:
            if wh_word in lowered:
                return wh_word
    return None
def start_to_wh(tagged, wh):
    """Return the leading part of ``tagged`` up to and including the wh-word.

    Args:
        tagged: Iterable of indexable items whose first element is the
            token text, e.g. ``(word, pos)`` pairs.
        wh: The wh-word to stop at; compared case-insensitively against the
            token text.

    Returns:
        A new list holding every item from the start through the first item
        whose text equals ``wh``. If no item matches, all items are returned.
    """
    prefix = []
    for item in tagged:
        prefix.append(item)
        reached_wh = item[0].lower() == wh.lower()
        if reached_wh:
            return prefix
    return prefix
def x_in_set(x, pos_set:list, is_pos=True):
    """Tell whether ``x`` (or any element of ``x``) matches a pair in ``pos_set``.

    Args:
        x: A single string, or a ``list`` of strings, to look for.
        pos_set: List of ``(word, pos)`` pairs to search.
        is_pos: When truthy, compare part-of-speech tags: only the FIRST
            character of the pair's lower-cased tag is compared with the
            lower-cased first character of the candidate. When falsy,
            compare the pair's lower-cased word with the whole lower-cased
            candidate.

    Returns:
        ``True`` on the first match, ``False`` if nothing matches.
    """
    # Treat a lone value like a one-element list so both cases share a loop.
    candidates = x if type(x) is list else [x]
    for pair in pos_set:
        word = pair[0].lower()
        pos = pair[1].lower()
        for candidate in candidates:
            if is_pos:
                if pos[0] == candidate[0].lower():
                    return True
            elif word == candidate.lower():
                return True
    return False
def is_sub_aux_inv(tagged:list, wh:str, aux:list, y:list) -> bool:
    """Heuristically detect the sequence wh-word -> auxiliary -> ``y``-tagged token.

    The tokens are scanned left to right:

    * nothing counts until the wh-word has been seen;
    * after the wh-word and before any auxiliary, a token whose tag is in
      the nominal/determiner tag list below makes the result ``False``;
    * a token whose word is in ``aux`` marks the auxiliary as seen;
    * after the auxiliary, a tag containing ``'V'`` makes the result
      ``False``, while a tag listed in ``y`` makes it ``True``.

    Args:
        tagged: List of ``(word, pos)`` pairs.
        wh: The wh-word. A string is matched against whole tokens,
            case-insensitively; any other container is tested with ``in``.
        aux: Words treated as auxiliaries (matched exactly).
        y: POS tags that complete the pattern once the auxiliary was seen.

    Returns:
        ``True`` when the pattern is found, otherwise ``False``.
    """
    rel_clause_heur = frozenset(
        ('NN', 'NNS', 'NNP', 'NNPS', 'DT', 'JJ', 'PDT', 'POS', 'PRP', 'PRP$', 'CD')
    )

    # ``word in wh`` on a str is a substring test, which wrongly treated
    # tokens like "he" or "here" as the wh-word "where". Compare whole tokens.
    if isinstance(wh, str):
        wh_lower = wh.lower()

        def is_wh(token):
            return token.lower() == wh_lower
    else:
        def is_wh(token):
            return token in wh

    hit_wh = False
    hit_aux = False
    for word, pos in tagged:
        if is_wh(word):
            hit_wh = True
            continue
        if not hit_wh:
            continue
        if not hit_aux:
            # A nominal right after the wh-word means the auxiliary was not
            # fronted, so this is not the pattern being looked for.
            if pos in rel_clause_heur:
                return False
            if word in aux:
                hit_aux = True
            continue
        if word in aux:
            # A further auxiliary changes nothing.
            continue
        if 'V' in pos:
            return False
        if pos in y:
            return True
    return False
def rel_clause_seq(start_wh:list, rel_clause) -> bool:
    """Report whether a verb is later followed by a ``rel_clause`` tag.

    Args:
        start_wh: List of ``(word, pos)`` pairs.
        rel_clause: Container of POS tags to look for after a verb.

    Returns:
        ``True`` if some tag containing ``'V'`` is followed (not necessarily
        immediately) by a non-verb tag that is in ``rel_clause``; ``False``
        otherwise.
    """
    seen_verb = False
    for _, tag in start_wh:
        if 'V' in tag:
            seen_verb = True
            continue
        if seen_verb and tag in rel_clause:
            return True
    return False
def emb_seq(start_wh:list, rel_clause) -> bool:
    """Report whether a ``rel_clause`` tag is followed by a verb and nothing more.

    The whole sequence is scanned. The result is ``True`` only when a tag
    from ``rel_clause`` occurs, a tag containing ``'V'`` occurs after it, and
    no further ``rel_clause`` tag occurs after that verb.

    Args:
        start_wh: List of ``(word, pos)`` pairs.
        rel_clause: Container of POS tags.

    Returns:
        ``True`` if the pattern above holds, ``False`` otherwise.
    """
    seen_rel = False
    verb_after_rel = False
    rel_after_verb = False
    for _, tag in start_wh:
        if tag in rel_clause:
            seen_rel = True
            # A rel_clause tag arriving after the verb cancels the match.
            if verb_after_rel:
                rel_after_verb = True
        elif seen_rel and 'V' in tag:
            verb_after_rel = True
    return verb_after_rel and not rel_after_verb
def collect_json(directory:str, endfname:str):
    """Merge every ``*.json`` file in ``directory`` into one JSON array file.

    Each input file is parsed and its contents are appended to a single list
    with ``list.extend``, so each file is expected to hold a JSON array.
    The combined list is written to ``<endfname>.json``.

    Args:
        directory: Directory whose ``.json`` files are merged (not recursive).
        endfname: Output path WITHOUT the ``.json`` extension.

    Returns:
        ``True`` once the output file has been written.

    Raises:
        OSError: If the directory or a file cannot be read or written.
        json.JSONDecodeError: If an input file is not valid JSON.
    """
    import json
    import os

    collected = []
    # os.listdir() returns entries in arbitrary order; sort them so the
    # merged output is reproducible across runs and platforms.
    for entry in sorted(os.listdir(directory)):
        filename = os.fsdecode(entry)
        if not filename.endswith(".json"):
            continue
        # Read as UTF-8 explicitly instead of relying on the locale default.
        with open(os.path.join(directory, filename), encoding="utf-8") as json_data:
            collected.extend(json.load(json_data))
    with open(f"{endfname}.json", 'w', encoding="utf-8") as outfile:
        json.dump(collected, outfile, indent=4)
    print("successfully made complete json")
    return True
def get_v_before_wh(tagged:list, wh:str) -> str:
    """Return the closest verb or modal that precedes the wh-word.

    The sentence is walked from its end towards its start. Once a token
    equal to ``wh`` (case-insensitive) has been passed, the next token whose
    tag starts with ``'V'`` or contains ``"MD"`` is returned.

    Args:
        tagged: List of ``(word, pos)`` pairs.
        wh: The wh-word to search backwards from.

    Returns:
        The matching word, or ``""`` if there is none.
    """
    passed_wh = False
    for token, tag in reversed(tagged):
        if token.lower() == wh.lower():
            passed_wh = True
            continue
        is_verbal = tag[0] == 'V' or "MD" in tag
        if is_verbal and passed_wh:
            return token
    return ""
def get_three_v_after_wh(tagged:list, wh:str) -> tuple:
    """Return up to three verbs/modals that follow the wh-word.

    A token counts as a verb when its tag starts with ``'V'`` or contains
    ``"MD"``. The verbs need not be adjacent to the wh-word or to each other.
    If the wh-word occurs again, collection restarts at the first slot;
    slots that are not overwritten keep their earlier value.

    Args:
        tagged: List of ``(word, pos)`` pairs.
        wh: The wh-word, matched case-insensitively against token text.

    Returns:
        A 3-tuple ``(verb1, verb2, verb3)``; missing verbs are ``""``.
        (The previous ``-> str`` annotation was inaccurate.)
    """
    verbs = ["", "", ""]
    # Index of the next slot to fill; "full" until the wh-word is seen so
    # verbs before it are ignored.
    slot = len(verbs)
    for token, tag in tagged:
        if token.lower() == wh.lower():
            slot = 0
        elif (tag[0] == 'V' or "MD" in tag) and slot < len(verbs):
            verbs[slot] = token
            slot += 1
    return tuple(verbs)
def get_set_wh_v1(tagged_sent:list, wh:str):
    """Return the tokens from the wh-word through the first verb after it.

    Args:
        tagged_sent: List of ``(word, pos)`` pairs.
        wh: The wh-word, matched case-insensitively against token text.

    Returns:
        A list of ``(word, pos)`` tuples starting at the wh-word and ending
        with the first token whose tag starts with ``"V"`` or contains
        ``"MD"``. If no such token follows, the list runs to the end of the
        sentence; if the wh-word never occurs, the list is empty.
    """
    collecting = False
    span = []
    for token, tag in tagged_sent:
        if wh.lower() == token.lower():
            collecting = True
        is_verbal = tag[0] == "V" or "MD" in tag
        if not collecting:
            continue
        span.append((token, tag))
        if is_verbal:
            return span
    return span
def modded_lemma(verb:str):
    """Lemmatize a verb, with special handling for clitic forms of "be".

    One trailing ``.``, ``?``, ``-`` or ``!`` is removed first. The clitics
    ``'re`` and ``'m`` map to ``"be"``; anything else is passed to NLTK's
    ``WordNetLemmatizer`` with the verb part of speech.

    Args:
        verb: The verb token to lemmatize.

    Returns:
        The lemma, or ``None`` when ``verb`` is empty or ``None``.
    """
    if not verb:
        return None
    if verb.endswith((".", "?", "-", "!")):
        verb = verb[:-1]
    if verb in ("'re", "'m"):
        return "be"
    # Import lazily, only when WordNet is actually needed, so the trivial
    # cases above neither pay for nor depend on nltk being importable.
    from nltk.stem import WordNetLemmatizer
    return WordNetLemmatizer().lemmatize(verb, "v")
# Script entry point: merges the .json files found in the 'unread_split'
# directory and writes the combined list to 'corpus.json'.
if __name__ == '__main__':
    # Disabled scratch code that POS-tags a sample sentence with nltk and
    # lower-cases the words:
    # import nltk
    # from nltk import word_tokenize
    # from nltk import pos_tag
    # sent = 'how can I get'
    # lowered_pos = [(x[0].lower(), x[1]) for x in pos_tag(word_tokenize(sent))]
    collect_json('unread_split', 'corpus')