Commit 7488950 (parent c500611): 3 changed files with 468 additions and 0 deletions.
@@ -0,0 +1,204 @@
import json
import logging
import os
from collections import defaultdict  # needed by StanfordNLP.tokens_to_dict

from POSTree import POSTree
from nltk.parse import stanford
from nltk.tree import *
from stanfordcorenlp import StanfordCoreNLP

class StanfordNLP:
    def __init__(self, host='http://localhost', port=9000):
        self.nlp = StanfordCoreNLP(host, port=port,
                                   timeout=30000)  # , quiet=False, logging_level=logging.DEBUG
        self.props = {
            'annotators': 'parse',
            'pipelineLanguage': 'en',
            'outputFormat': 'json'
        }

    def parse(self, sentence):
        return self.nlp.parse(sentence)

    @staticmethod
    def tokens_to_dict(_tokens):
        tokens = defaultdict(dict)
        for token in _tokens:
            tokens[int(token['index'])] = {
                'word': token['word'],
                'lemma': token['lemma'],
                'pos': token['pos'],
                'ner': token['ner']
            }
        return tokens
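# Note: tokens_to_dict expects CoreNLP token dicts (keys 'index', 'word',
# 'lemma', 'pos', 'ner'), e.g.
#   [{'index': 1, 'word': 'cats', 'lemma': 'cat', 'pos': 'NNS', 'ner': 'O'}]
# and re-keys them by their 1-based token index. It is not called below.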

sNLP = StanfordNLP()
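# NB (assumption): this requires a CoreNLP server already listening on
# localhost:9000, e.g. started from the CoreNLP distribution directory with:
#   java -mx4g -cp "*" edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000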
# Convert all the questions to fill-in-the-blank sentences;
# the blank is represented by '________' ('_' * 8).

def dump_json(data, outpath):
    print('Saving to', outpath)
    with open(outpath, 'w') as out:
        json.dump(data, out, indent=4, separators=(',', ': '))

def question_to_sentence(sentence):
    """Turn a question into a statement containing ' **blank** '."""
    parse_res = sNLP.parse(sentence)
    tree = POSTree(parse_res)
    try:
        res = tree.adjust_order()
    except Exception:
        # POSTree could not restructure the question: fall back to replacing
        # the wh-word with the blank, or appending a blank at the end.
        flag = False
        res = sentence
        if res[-1] == '?':
            for mark in ['what', 'where', 'which', 'when', 'Where', 'When',
                         'What', 'Which', 'how', 'How']:
                if mark in res:
                    flag = True
                    res = res.replace(mark, ' **blank** ')
                    res = res[:-1]
                    break
        if not flag:
            if res[-1] == '.':
                res = res[:-1]
            res += ' **blank** '
    return res
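# Illustrative behaviour (assumed; actual output depends on the POSTree grammar):
#   question_to_sentence("What is the capital of France?")
#       -> something like "the capital of France is **blank**"
# If POSTree raises, the fallback above blanks out the wh-word instead:
#   "How do plants make food?" -> " **blank**  do plants make food"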

def convert_gre_data(filename):
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)
    print("*" * 30)
    print("start, ", filename)
    length = len(data)
    print("# of MCQs before", length)
    for i in range(length):
        if data[i]['sentence'].find('________') != -1:
            data[i]['sentence'] = data[i]['sentence'].replace('________', ' **blank** ')
        else:
            # NB: this replaces a bare-space placeholder; in the source data
            # the gap may be encoded as a special space-like character
            data[i]['sentence'] = data[i]['sentence'].replace(' ', ' **blank** ')
    outpath = filename[:-5] + "_converted.json"
    print("processing done, ", outpath)
    print("# of MCQs after", len(data))
    dump_json(data, outpath)

def convert_mcq_data(filename):
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)
    print("*" * 30)
    print("start, ", filename)
    length = len(data)
    print("# of MCQs before", length)
    for i in range(length):
        if data[i]['sentence'].find('________') != -1:
            data[i]['sentence'] = data[i]['sentence'].replace('________', ' **blank** ')
        elif data[i]['sentence'][-1] == '?':
            data[i]['sentence'] = question_to_sentence(data[i]['sentence'])
        else:
            data[i]['sentence'] += ' **blank** '
        # flag = False
        # for mark in ['what', 'where', 'which', 'when', 'Where', 'When', 'What', 'Which']:
        #     if mark in data[i]['sentence']:
        #         flag = True
        #         data[i]['sentence'] = data[i]['sentence'].replace(mark, '**blank**')
        #         break
        # if not flag:
        #     data[i]['sentence'] += ' **blank**'
    outpath = filename[:-5] + "_converted.json"
    print("processing done, ", outpath)
    print("# of MCQs after", len(data))
    dump_json(data, outpath)

def convert_trivia_data(filename):
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)
    print("*" * 30)
    print("start, ", filename)
    length = len(data)
    print("# of MCQs before", length)
    for i in range(length):
        # skip true/false questions
        if data[i]['answer'] == 'True' or data[i]['answer'] == 'False':
            continue
        if '?' in data[i]['sentence']:
            data[i]['sentence'] = question_to_sentence(data[i]['sentence'])
            # for mark in ['what', 'where', 'which', 'when', 'Where', 'When', 'What', 'Which']:
            #     if mark in data[i]['sentence']:
            #         flag = True
            #         data[i]['sentence'] = data[i]['sentence'].replace(mark, '**blank**')
            #         break
            # if not flag:
            #     print(data[i]['sentence'])
        elif data[i]['sentence'].find('________') != -1:
            data[i]['sentence'] = data[i]['sentence'].replace('________', ' **blank** ')
        else:
            data[i]['sentence'] = data[i]['sentence'][:-1] + ' **blank** '
    outpath = filename[:-5] + "_converted.json"
    print("processing done, ", outpath)
    print("# of MCQs after", len(data))
    dump_json(data, outpath)

def convert_mcql_data(filename):
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)
    print("*" * 30)
    print("start, ", filename)
    length = len(data)
    print("# of MCQs before", length)
    for i in range(length):
        data[i]['sentence'] = data[i]['sentence'] + ' **blank** '
        # print(data[i]['sentence'])
    outpath = filename[:-5] + "_converted.json"
    print("processing done, ", outpath)
    print("# of MCQs after", len(data))
    dump_json(data, outpath)

def convert_sciq_data(filename):
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)
    print("*" * 30)
    print("start, ", filename)
    length = len(data)
    print("# of MCQs before", length)
    for i in range(length):
        if '?' in data[i]['sentence']:
            data[i]['sentence'] = question_to_sentence(data[i]['sentence'])
            # flag = False
            # for mark in ['What', 'what', 'which', 'Which', 'where', 'when', 'Where', 'When', 'Who', 'who', 'How many', 'How do', 'this']:
            #     if mark in data[i]['sentence']:
            #         flag = True
            #         data[i]['sentence'] = data[i]['sentence'].replace(mark, '**blank**')
            #         break
            # if not flag:
            #     data[i]['sentence'] = data[i]['sentence'][:-1] + '**blank**'
    outpath = filename[:-5] + "_converted.json"
    print("processing done, ", outpath)
    print("# of MCQs after", len(data))
    try:
        dump_json(data, outpath)
    except Exception:
        # dumping can fail on non-serializable content; print it for debugging
        print(data)


if __name__ == "__main__":
    filenames = [
        'MCQ/mcq_total_filtered.json',
        'Gre/gre_total_filtered.json',
        "OpenTriviaQA/trivia_total_filtered.json"
    ]
    for x in ['LTR-DG/data/mcql_processed/', 'LTR-DG/data/sciq_processed/']:
        for i in ['test_neg_filtered.json', 'test_filtered.json',
                  'train_neg_filtered.json', 'train_filtered.json',
                  'valid_neg_filtered.json', 'valid_filtered.json']:
            path = x + i
            filenames.append(path)
    sentence = "If a cat pushes its face against your head, this means what?"
    question_to_sentence(sentence)
    convert_gre_data(filenames[1])
    convert_mcq_data(filenames[0])
    convert_trivia_data(filenames[2])
    for i in range(3, 9):
        convert_mcql_data(filenames[i])
    for i in range(9, 15):
        convert_sciq_data(filenames[i])

print(question_to_sentence("What is the opportunity cost of purchasing the factory for the first year of operation?"))
@@ -0,0 +1,138 @@
import json
import logging
import os
from collections import defaultdict  # needed by StanfordNLP.tokens_to_dict

from nltk.parse import stanford
from nltk.tree import *
from stanfordcorenlp import StanfordCoreNLP

# Optional NLTK-based setup for the Stanford parser (unused when the
# CoreNLP server is available):
# os.environ['STANFORD_PARSER'] = '/mnt/e/Course/NLP/Toolkits/jars/stanford-parser.jar'
# os.environ['STANFORD_MODELS'] = '/mnt/e/Course/NLP/Toolkits/jars/stanford-parser-3.9.2-models.jar'
# java_path = "/mnt/c/Program Files/Java/jdk1.8.0_111/bin/java.exe"
# os.environ['JAVAHOME'] = java_path
# parser = stanford.StanfordParser(model_path="/mnt/e/Course/NLP/Toolkits/jars/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

class StanfordNLP:
    def __init__(self, host='http://localhost', port=9000):
        self.nlp = StanfordCoreNLP(host, port=port,
                                   timeout=30000)  # , quiet=False, logging_level=logging.DEBUG
        self.props = {
            'annotators': 'parse',
            'pipelineLanguage': 'en',
            'outputFormat': 'json'
        }

    def parse(self, sentence):
        return self.nlp.parse(sentence)

    @staticmethod
    def tokens_to_dict(_tokens):
        tokens = defaultdict(dict)
        for token in _tokens:
            tokens[int(token['index'])] = {
                'word': token['word'],
                'lemma': token['lemma'],
                'pos': token['pos'],
                'ner': token['ner']
            }
        return tokens

sNLP = StanfordNLP()

def dump_json(data, outpath):
    print('Saving to', outpath)
    with open(outpath, 'w') as out:
        json.dump(data, out, indent=4, separators=(',', ': '))

def normalize_string(s):
    def is_number(s):
        try:
            float(s)
            return True
        except ValueError:
            pass
        try:
            import unicodedata
            unicodedata.numeric(s)  # also catches numeral characters such as '½'
            return True
        except (TypeError, ValueError):
            pass

        return False

    # trim surrounding whitespace and trailing periods
    s = s.strip().strip(".")
    if is_number(s):
        s = ''
    return s
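# Expected behaviour (illustrative):
#   normalize_string("3.14")       -> ''        (numeric, dropped by the caller)
#   normalize_string(" oxygen. ")  -> 'oxygen'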

def conv_json(filename):
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)
    results = []
    print("*" * 30)
    print("start, ", filename)
    print("# of MCQs before", len(data))
    for item in data:

        item['answer'] = normalize_string(item['answer'])

        # delete questions whose answer is a number or "all of the above"
        if item['answer'] == '' or "all of the above" in item['answer'] or "All of the above" in item['answer']:
            continue
        # delete distractors that are numbers or "all of the above"
        i = 0
        while i < len(item['distractors']):
            item['distractors'][i] = normalize_string(item['distractors'][i])
            if item['distractors'][i] == '' or "all of the above" in item['distractors'][i] or "All of the above" in item['distractors'][i]:
                del item['distractors'][i]
            else:
                i += 1

        phrases = list(item['distractors'])
        phrases.append(item['answer'])
        flag = True

        for p in phrases:
            L = p.split()
            if len(L) == 1:
                # single words always pass
                continue
            elif len(L) > 5:
                # drop items containing any phrase longer than five words
                flag = False
                break
            else:
                # parse the phrase with the Stanford parser and keep the item
                # only if the top constituent of every multi-word phrase is a VP
                res = sNLP.parse(p)
                res = res[1:].replace(' ', '').replace('\n', '').split('(')
                if res[1] != 'VP':
                    flag = False
                    break
        if flag:
            results.append(item)
    # save each file
    outpath = filename[:-5] + "_filtered.json"
    print("processing done, ", outpath)
    print("# of MCQs after", len(results))
    dump_json(results, outpath)
    return results


if __name__ == "__main__":
    filenames = [
        # 'MCQ/mcq_total.json',
        # 'Gre/gre_total.json',
        # "OpenTriviaQA/trivia_total.json"
    ]
    for i in ['test_neg.json', 'test.json', 'train_neg.json', 'train.json', 'valid_neg.json', 'valid.json']:
        path = 'LTR-DG/data/mcql_processed/' + i
        filenames.append(path)
        # path = 'LTR-DG/data/sciq_processed/' + i
        # filenames.append(path)
    results = []
    for file in filenames:
        conv_json(file)
        # results.extend()
    # dump_json(results, 'total_filtered.json')
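To make the VP check in conv_json concrete, here is a minimal sketch of how the bracketed parse string is reduced to constituent labels; the parse string is an assumed example of the shape CoreNLP returns, and only the string manipulation mirrors the code above.

    # assumed CoreNLP-style parse of the phrase "ran the race"
    parse = "(ROOT\n  (VP (VBD ran) (NP (DT the) (NN race))))"
    labels = parse[1:].replace(' ', '').replace('\n', '').split('(')
    # labels == ['ROOT', 'VP', 'VBDran)', 'NP', 'DTthe)', 'NNrace))))']
    # labels[1] is the top constituent label: 'VP' here, so the phrase is kept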