Skip to content

Commit

Permalink
Add files via upload
Browse files Browse the repository at this point in the history
  • Loading branch information
xinzhu-cai authored Aug 28, 2019
1 parent c500611 commit 7488950
Show file tree
Hide file tree
Showing 3 changed files with 468 additions and 0 deletions.
204 changes: 204 additions & 0 deletions Preprocessing/questionConversion.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,204 @@
import json
import os
from POSTree import POSTree
from nltk.parse import stanford
from stanfordcorenlp import StanfordCoreNLP
import logging
import json
from nltk.tree import *

class StanfordNLP:
    """Thin wrapper around a running Stanford CoreNLP server."""

    def __init__(self, host='http://localhost', port=9000):
        # timeout is in milliseconds (30 s).
        self.nlp = StanfordCoreNLP(host, port=port,
                                   timeout=30000)  # , quiet=False, logging_level=logging.DEBUG)
        self.props = {
            'annotators': 'parse',
            'pipelineLanguage': 'en',
            'outputFormat': 'json'
        }

    def parse(self, sentence):
        """Return the constituency parse of *sentence* as an S-expression string."""
        return self.nlp.parse(sentence)

    @staticmethod
    def tokens_to_dict(_tokens):
        """Index CoreNLP token dicts by their 1-based 'index' field.

        Returns a defaultdict so a lookup of a missing index yields {}.
        """
        # Fix: `defaultdict` was referenced without ever being imported in
        # this module, so calling this method raised NameError.
        from collections import defaultdict
        tokens = defaultdict(dict)
        for token in _tokens:
            tokens[int(token['index'])] = {
                'word': token['word'],
                'lemma': token['lemma'],
                'pos': token['pos'],
                'ner': token['ner']
            }
        return tokens

# Module-level CoreNLP client; requires a StanfordCoreNLP server listening
# on localhost:9000 (started separately).
sNLP = StanfordNLP()
# convert all the questions to sentences
# the fillin part is represented with '_'*4(____)
# NOTE(review): the code below actually matches an 8-underscore run
# ('________'), not 4 — confirm against the data files.

def dump_json(data, outpath):
    """Serialize *data* to *outpath* as pretty-printed (4-space indented) JSON."""
    print('Saving to', outpath)
    with open(outpath, 'w') as sink:
        json.dump(data, sink, indent=4, separators=(',', ': '))

def question_to_sentence(sentence):
    """Convert an interrogative *sentence* into a declarative sentence whose
    asked-for constituent is marked with ' **blank** '.

    First tries POSTree.adjust_order() on the CoreNLP constituency parse.
    If that fails, falls back to replacing the first wh-word found with
    ' **blank** ' (dropping the trailing '?'), or, when no wh-word is
    present, appending ' **blank** ' at the end.
    """
    parse_res = sNLP.parse(sentence)
    tree = POSTree(parse_res)
    try:
        return tree.adjust_order()
    except Exception:
        # Fix: previously this fallback ran unconditionally after the
        # try/except and read a `flag` variable that was only bound in the
        # except branch, so the success path raised NameError. The wh-word
        # replacement belongs to the failure path only.
        res = sentence
        if res[-1] == '?':
            for mark in ['what','where','which','when','Where','When','What','Which','how','How']:
                if mark in res:
                    res = res.replace(mark,' **blank** ')
                    return res[:-1]
        # No wh-word replaced: drop a trailing period and append the marker.
        if res[-1] == '.':
            res = res[:-1]
        return res + ' **blank** '

def convert_gre_data(filename):
    """Normalize the fill-in blank of every GRE MCQ in *filename* to
    ' **blank** ' and write the result next to it as *_converted.json."""
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)
    print("*"*30)
    print("start, ",filename)
    print("# of MCQs before",len(data))
    for item in data:
        if item['sentence'].find('________') != -1:
            item['sentence'] = item['sentence'].replace('________',' **blank** ')
        else:
            # NOTE(review): this replaces every space in the sentence with
            # ' **blank** ' — presumably the raw GRE data marks the gap with
            # a distinctive whitespace run; confirm against the source file.
            item['sentence'] = item['sentence'].replace(' ',' **blank** ')
    outpath = filename[:-5]+"_converted.json"
    print("processing done, ",outpath)
    print("# of MCQs after",len(data))
    dump_json(data,outpath)

def convert_mcq_data(filename):
    """Insert ' **blank** ' markers into every MCQ sentence in *filename*
    and write the result next to it as *_converted.json.

    Underscore-run blanks are replaced in place; questions ending in '?'
    are rewritten via question_to_sentence(); otherwise the marker is
    appended at the end.
    """
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)
    print("*"*30)
    print("start, ",filename)
    print("# of MCQs before",len(data))
    for item in data:
        sent = item['sentence']
        if '________' in sent:
            item['sentence'] = sent.replace('________',' **blank** ')
        elif sent[-1] == '?':
            # Interrogative form: rewrite as a declarative with a blank.
            item['sentence'] = question_to_sentence(sent)
        else:
            item['sentence'] = sent + ' **blank** '
    outpath = filename[:-5]+"_converted.json"
    print("processing done, ",outpath)
    print("# of MCQs after",len(data))
    dump_json(data,outpath)

def convert_trivia_data(filename):
    """Insert ' **blank** ' markers into OpenTrivia sentences in *filename*
    and write the result next to it as *_converted.json.

    True/False questions are left untouched; questions containing '?' are
    rewritten via question_to_sentence(); underscore-run blanks are replaced
    in place; otherwise the last character is dropped and a marker appended.
    """
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)
    print("*"*30)
    print("start, ",filename)
    print("# of MCQs before",len(data))
    for item in data:
        if item['answer'] in ('True', 'False'):
            continue  # boolean questions have no blank to fill
        sent = item['sentence']
        if '?' in sent:
            item['sentence'] = question_to_sentence(sent)
        elif '________' in sent:
            item['sentence'] = sent.replace('________',' **blank** ')
        else:
            item['sentence'] = sent[:-1] + ' **blank** '
    outpath = filename[:-5]+"_converted.json"
    print("processing done, ",outpath)
    print("# of MCQs after",len(data))
    dump_json(data,outpath)

def convert_mcql_data(filename):
    """Append a trailing ' **blank** ' marker to every MCQL sentence in
    *filename* and write the result next to it as *_converted.json."""
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)
    print("*"*30)
    print("start, ",filename)
    print("# of MCQs before",len(data))
    for item in data:
        item['sentence'] = item['sentence'] + ' **blank** '
    outpath = filename[:-5]+"_converted.json"
    print("processing done, ",outpath)
    print("# of MCQs after",len(data))
    dump_json(data,outpath)

def convert_sciq_data(filename):
    """Rewrite SciQ questions in *filename* as declarative sentences with a
    ' **blank** ' marker and write the result next to it as *_converted.json.

    Only sentences containing '?' are rewritten (via question_to_sentence);
    all other sentences are left untouched.
    """
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)
    print("*"*30)
    print("start, ",filename)
    print("# of MCQs before",len(data))
    for item in data:
        # (removed an unused `flag` local left over from a commented-out
        # wh-word replacement fallback)
        if '?' in item['sentence']:
            item['sentence'] = question_to_sentence(item['sentence'])
    outpath = filename[:-5]+"_converted.json"
    print("processing done, ",outpath)
    print("# of MCQs after",len(data))
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are not
    # swallowed; still best-effort — dump the data for inspection on failure.
    try:
        dump_json(data,outpath)
    except Exception:
        print(data)



if __name__=="__main__":
    # Pre-filtered inputs: indices 0-2 are the MCQ / GRE / OpenTrivia files,
    # 3-8 the six MCQL splits, 9-14 the six SciQ splits.
    filenames = [
        'MCQ/mcq_total_filtered.json',
        'Gre/gre_total_filtered.json',
        "OpenTriviaQA/trivia_total_filtered.json"
    ]
    splits = ['test_neg_filtered.json','test_filtered.json','train_neg_filtered.json','train_filtered.json','valid_neg_filtered.json','valid_filtered.json']
    filenames.extend(base + split
                     for base in ('LTR-DG/data/mcql_processed/', 'LTR-DG/data/sciq_processed/')
                     for split in splits)
    # Smoke-test the question rewriter before touching the data files.
    sentence = "If a cat pushes its face against your head, this means what?"
    question_to_sentence(sentence)
    convert_gre_data(filenames[1])
    convert_mcq_data(filenames[0])
    convert_trivia_data(filenames[2])
    for path in filenames[3:9]:
        convert_mcql_data(path)
    for path in filenames[9:15]:
        convert_sciq_data(path)

    print(question_to_sentence("What is the opportunity cost of purchasing the factory for the first year of operation?"))
138 changes: 138 additions & 0 deletions Preprocessing/questionFiltering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
import json
import os
from nltk.parse import stanford

# os.environ['STANFORD_PARSER'] = '/mnt/e/Course/NLP/Toolkits/jars/stanford-parser.jar'
# os.environ['STANFORD_MODELS'] = '/mnt/e/Course/NLP/Toolkits/jars/stanford-parser-3.9.2-models.jar'

# java_path = "/mnt/c/Program Files/Java/jdk1.8.0_111/bin/java.exe"
# os.environ['JAVAHOME'] = java_path

# parser = stanford.StanfordParser(model_path="/mnt/e/Course/NLP/Toolkits/jars/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")

from stanfordcorenlp import StanfordCoreNLP
import logging
import json
from nltk.tree import *

class StanfordNLP:
    """Thin wrapper around a running Stanford CoreNLP server."""

    def __init__(self, host='http://localhost', port=9000):
        # timeout is in milliseconds (30 s).
        self.nlp = StanfordCoreNLP(host, port=port,
                                   timeout=30000)  # , quiet=False, logging_level=logging.DEBUG)
        self.props = {
            'annotators': 'parse',
            'pipelineLanguage': 'en',
            'outputFormat': 'json'
        }

    def parse(self, sentence):
        """Return the constituency parse of *sentence* as an S-expression string."""
        return self.nlp.parse(sentence)

    @staticmethod
    def tokens_to_dict(_tokens):
        """Index CoreNLP token dicts by their 1-based 'index' field.

        Returns a defaultdict so a lookup of a missing index yields {}.
        """
        # Fix: `defaultdict` was referenced without ever being imported in
        # this module, so calling this method raised NameError.
        from collections import defaultdict
        tokens = defaultdict(dict)
        for token in _tokens:
            tokens[int(token['index'])] = {
                'word': token['word'],
                'lemma': token['lemma'],
                'pos': token['pos'],
                'ner': token['ner']
            }
        return tokens

sNLP = StanfordNLP()

def dump_json(data, outpath):
    """Serialize *data* to *outpath* as pretty-printed (4-space indented) JSON."""
    print('Saving to', outpath)
    with open(outpath, 'w') as sink:
        json.dump(data, sink, indent=4, separators=(',', ': '))

def normalize_string(s):
    """Trim surrounding whitespace and dots from *s*; return '' if the
    trimmed result is numeric (callers drop empty answers/distractors).
    """
    def is_number(s):
        # float() covers '3.14', '1e3', '-2'; unicodedata.numeric covers
        # single numeric characters such as '½'.
        try:
            float(s)
            return True
        except ValueError:
            pass
        try:
            import unicodedata
            unicodedata.numeric(s)
            return True
        except (TypeError, ValueError):
            pass

        return False
    # Fix: the original `s.strip('')` was a no-op (empty strip set), so
    # whitespace was never removed and the subsequent strip(".") could not
    # reach a dot preceded/followed by spaces. Intent: trim whitespace first.
    s = s.strip().strip(".")
    if is_number(s):
        s = ''
    return s

def conv_json(filename):
    """Filter one MCQ JSON file and write the survivors to *_filtered.json.

    Drops items whose answer normalizes to '' (numeric) or contains
    'all of the above'; removes such distractors; then keeps only items
    whose every remaining phrase (distractors + answer) is either a single
    token or a 2-5 token phrase that the CoreNLP parser labels as a VP.
    Returns the list of kept items.
    """
    with open(filename, encoding="utf-8") as f:
        data = json.load(f)
    print("*"*30)
    print("start, ",filename)
    print("# of MCQs before",len(data))

    def acceptable(phrase):
        # Single tokens always pass; phrases over 5 tokens never do; 2-5
        # token phrases must have VP as the first constituent under ROOT.
        words = phrase.split()
        if len(words) == 1:
            return True
        if len(words) > 5:
            return False
        parsed = sNLP.parse(phrase)
        parts = parsed[1:].replace(' ', '').replace('\n', '').split('(')
        return parts[1] == 'VP'

    results = []
    for item in data:
        item['answer'] = normalize_string(item['answer'])
        # delete questions whose answer is a number or "all of the above"
        if item['answer'] == '' or "all of the above" in item['answer'] or "All of the above" in item['answer']:
            continue
        # delete distractors that are numbers or "all of the above";
        # slice-assign to keep mutating the same list object in place
        cleaned = [normalize_string(d) for d in item['distractors']]
        item['distractors'][:] = [d for d in cleaned
                                  if d != ''
                                  and "all of the above" not in d
                                  and "All of the above" not in d]
        # distractors are checked before the answer, stopping at the first
        # unacceptable phrase (all() short-circuits like the original break)
        if all(acceptable(p) for p in item['distractors'] + [item['answer']]):
            results.append(item)
    # save each file
    outpath = filename[:-5]+"_filtered.json"
    print("processing done, ",outpath)
    print("# of MCQs after",len(results))
    dump_json(results,outpath)
    return results



if __name__=="__main__":
    filenames = [
        #'MCQ/mcq_total.json',
        #'Gre/gre_total.json',
        #"OpenTriviaQA/trivia_total.json"
    ]
    # Only the MCQL splits are filtered for now; SciQ is disabled.
    splits = ['test_neg.json','test.json','train_neg.json','train.json','valid_neg.json','valid.json']
    filenames.extend('LTR-DG/data/mcql_processed/' + split for split in splits)
    #filenames.extend('LTR-DG/data/sciq_processed/' + split for split in splits)
    results = []
    for path in filenames:
        conv_json(path)
        #results.extend()
    #dump_json(results,'total_filtered.json')
Loading

0 comments on commit 7488950

Please sign in to comment.