From 7488950e664a7c1d93759ecb135d2a04fdda02f5 Mon Sep 17 00:00:00 2001
From: Xinzhu Cai
Date: Wed, 28 Aug 2019 08:55:33 -0400
Subject: [PATCH] Add files via upload

---
 Preprocessing/questionConversion.py | 204 ++++++++++++++++++++++++++++
 Preprocessing/questionFiltering.py  | 138 +++++++++++++++++++
 Preprocessing/stanfordCoreNLP.py    | 126 +++++++++++++++++
 3 files changed, 468 insertions(+)
 create mode 100644 Preprocessing/questionConversion.py
 create mode 100644 Preprocessing/questionFiltering.py
 create mode 100644 Preprocessing/stanfordCoreNLP.py

diff --git a/Preprocessing/questionConversion.py b/Preprocessing/questionConversion.py
new file mode 100644
index 0000000..0719bdc
--- /dev/null
+++ b/Preprocessing/questionConversion.py
@@ -0,0 +1,204 @@
+import json
+import os
+from collections import defaultdict
+from POSTree import POSTree
+from nltk.parse import stanford
+from stanfordcorenlp import StanfordCoreNLP
+import logging
+from nltk.tree import *
+
+
+class StanfordNLP:
+    """Thin wrapper around a running Stanford CoreNLP server (default localhost:9000)."""
+    def __init__(self, host='http://localhost', port=9000):
+        self.nlp = StanfordCoreNLP(host, port=port,
+                                   timeout=30000)  # , quiet=False, logging_level=logging.DEBUG
+        self.props = {
+            'annotators': 'parse',
+            'pipelineLanguage': 'en',
+            'outputFormat': 'json'
+        }
+
+    def parse(self, sentence):
+        return self.nlp.parse(sentence)
+
+    @staticmethod
+    def tokens_to_dict(_tokens):
+        tokens = defaultdict(dict)
+        for token in _tokens:
+            tokens[int(token['index'])] = {
+                'word': token['word'],
+                'lemma': token['lemma'],
+                'pos': token['pos'],
+                'ner': token['ner']
+            }
+        return tokens
+
+
+sNLP = StanfordNLP()
+
+# Convert all questions to declarative sentences. The fill-in part is marked in the
+# source data by a run of underscores ('________') and is rewritten here as ' **blank** '.
+
+
+def dump_json(data, outpath):
+    print('Saving to', outpath)
+    with open(outpath, 'w') as out:
+        json.dump(data, out, indent=4, separators=(',', ': '))
+
+
+def question_to_sentence(sentence):
+    """Turn a question into a cloze-style sentence containing a ' **blank** ' marker."""
+    parse_res = sNLP.parse(sentence)
+    #print(parse_res)
+    tree = POSTree(parse_res)
+    try:
+        # Preferred path: let POSTree rearrange the constituency parse into a statement.
+        res = tree.adjust_order()
+        #print(res)
+    except Exception:
+        # Fallback: POSTree failed, so replace the wh-word (or append a blank) instead.
+        #print ("*****************************")
+        #print (sentence)
+        flag = False
+        res = sentence
+        if res[-1] == '?':
+            for mark in ['what','where','which','when','Where','When','What','Which','how','How']:
+                if mark in res:
+                    flag = True
+                    res = res.replace(mark, ' **blank** ')
+                    res = res[:-1]  # drop the trailing '?'
+                    break
+        if not flag:
+            if res[-1] == '.':
+                res = res[:-1]
+            res += ' **blank** '
+    return res
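+
+# Example (illustrative): if POSTree raises and the fallback branch above runs, a question
+# such as
+#     "If a cat pushes its face against your head, this means what?"
+# becomes roughly
+#     "If a cat pushes its face against your head, this means **blank**"
+# i.e. the wh-word is swapped for the blank marker and the trailing '?' is dropped.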
+
+def convert_gre_data(filename):
+    with open(filename, encoding="utf-8") as f:
+        data = json.load(f)
+    print("*"*30)
+    print("start, ", filename)
+    length = len(data)
+    print("# of MCQs before", length)
+    for i in range(length):
+        #print(data[i]['sentence'].find(' '))
+        if data[i]['sentence'].find('________') != -1:
+            data[i]['sentence'] = data[i]['sentence'].replace('________', ' **blank** ')
+        else:
+            data[i]['sentence'] = data[i]['sentence'].replace(' ', ' **blank** ')
+        #print(data[i]['sentence'])
+    outpath = filename[:-5] + "_converted.json"
+    print("processing done, ", outpath)
+    print("# of MCQs after", len(data))
+    dump_json(data, outpath)
+
+
+def convert_mcq_data(filename):
+    with open(filename, encoding="utf-8") as f:
+        data = json.load(f)
+    print("*"*30)
+    print("start, ", filename)
+    length = len(data)
+    print("# of MCQs before", length)
+    for i in range(length):
+        if data[i]['sentence'].find('________') != -1:
+            data[i]['sentence'] = data[i]['sentence'].replace('________', ' **blank** ')
+        elif data[i]['sentence'][-1] == '?':
+            data[i]['sentence'] = question_to_sentence(data[i]['sentence'])
+        else:
+            data[i]['sentence'] += ' **blank** '
+        # flag = False
+        # for mark in ['what','where','which','when','Where','When','What','Which']:
+        #     if mark in data[i]['sentence']:
+        #         flag = True
+        #         data[i]['sentence'] = data[i]['sentence'].replace(mark, '**blank**')
+        #         break
+        # if not flag:
+        #     data[i]['sentence'] += ' **blank**'
+    outpath = filename[:-5] + "_converted.json"
+    print("processing done, ", outpath)
+    print("# of MCQs after", len(data))
+    dump_json(data, outpath)
+
+
+def convert_trivia_data(filename):
+    with open(filename, encoding="utf-8") as f:
+        data = json.load(f)
+    print("*"*30)
+    print("start, ", filename)
+    length = len(data)
+    print("# of MCQs before", length)
+    for i in range(length):
+        # skip true/false questions, which cannot be turned into a blank
+        if data[i]['answer'] == 'True' or data[i]['answer'] == 'False':
+            continue
+        if '?' in data[i]['sentence']:
+            data[i]['sentence'] = question_to_sentence(data[i]['sentence'])
+            # for mark in ['what','where','which','when','Where','When','What','Which']:
+            #     if mark in data[i]['sentence']:
+            #         flag = True
+            #         data[i]['sentence'] = data[i]['sentence'].replace(mark, '**blank**')
+            #         break
+            # if not flag:
+            #     print (data[i]['sentence'])
+        elif data[i]['sentence'].find('________') != -1:
+            data[i]['sentence'] = data[i]['sentence'].replace('________', ' **blank** ')
+        else:
+            data[i]['sentence'] = data[i]['sentence'][:-1] + ' **blank** '
+    outpath = filename[:-5] + "_converted.json"
+    print("processing done, ", outpath)
+    print("# of MCQs after", len(data))
+    dump_json(data, outpath)
+
+
+def convert_mcql_data(filename):
+    with open(filename, encoding="utf-8") as f:
+        data = json.load(f)
+    print("*"*30)
+    print("start, ", filename)
+    length = len(data)
+    print("# of MCQs before", length)
+    for i in range(length):
+        # append the blank marker to every MCQL sentence stem
+        data[i]['sentence'] = data[i]['sentence'] + ' **blank** '
+        #print(data[i]['sentence'])
+    outpath = filename[:-5] + "_converted.json"
+    print("processing done, ", outpath)
+    print("# of MCQs after", len(data))
+    dump_json(data, outpath)
+
+
+def convert_sciq_data(filename):
+    with open(filename, encoding="utf-8") as f:
+        data = json.load(f)
+    print("*"*30)
+    print("start, ", filename)
+    length = len(data)
+    print("# of MCQs before", length)
+    for i in range(length):
+        flag = False
+        if '?' in data[i]['sentence']:
+            data[i]['sentence'] = question_to_sentence(data[i]['sentence'])
+            # for mark in ['What','what','which','Which','where','when','Where','When','Who','who','How many','How do','this']:
+            #     if mark in data[i]['sentence']:
+            #         flag = True
+            #         data[i]['sentence'] = data[i]['sentence'].replace(mark, '**blank**')
+            #         break
+            # if not flag:
+            #     data[i]['sentence'] = data[i]['sentence'][:-1] + '**blank**'
+    outpath = filename[:-5] + "_converted.json"
+    print("processing done, ", outpath)
+    print("# of MCQs after", len(data))
+    try:
+        dump_json(data, outpath)
+    except Exception:
+        print(data)
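+
+# All converters above read the same JSON layout (inferred from the fields they access):
+# a list of records, each with at least a 'sentence' key, plus 'answer' and 'distractors'
+# where those are used. A hypothetical record for illustration:
+#     {
+#         "sentence": "Mitochondria are the ________ of the cell.",
+#         "answer": "powerhouse",
+#         "distractors": ["nucleus", "ribosome", "cell wall"]
+#     }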
+
+
+if __name__ == "__main__":
+    filenames = [
+        'MCQ/mcq_total_filtered.json',
+        'Gre/gre_total_filtered.json',
+        "OpenTriviaQA/trivia_total_filtered.json"
+    ]
+    for x in ['LTR-DG/data/mcql_processed/', 'LTR-DG/data/sciq_processed/']:
+        for i in ['test_neg_filtered.json', 'test_filtered.json', 'train_neg_filtered.json',
+                  'train_filtered.json', 'valid_neg_filtered.json', 'valid_filtered.json']:
+            path = x + i
+            filenames.append(path)
+    sentence = "If a cat pushes its face against your head, this means what?"
+    question_to_sentence(sentence)
+    convert_gre_data(filenames[1])
+    convert_mcq_data(filenames[0])
+    convert_trivia_data(filenames[2])
+    for i in range(3, 9):
+        convert_mcql_data(filenames[i])
+    for i in range(9, 15):
+        convert_sciq_data(filenames[i])
+
+print(question_to_sentence("What is the opportunity cost of purchasing the factory for the first year of operation?"))
\ No newline at end of file
diff --git a/Preprocessing/questionFiltering.py b/Preprocessing/questionFiltering.py
new file mode 100644
index 0000000..83b2b41
--- /dev/null
+++ b/Preprocessing/questionFiltering.py
@@ -0,0 +1,138 @@
+import json
+import os
+from collections import defaultdict
+from nltk.parse import stanford
+
+# os.environ['STANFORD_PARSER'] = '/mnt/e/Course/NLP/Toolkits/jars/stanford-parser.jar'
+# os.environ['STANFORD_MODELS'] = '/mnt/e/Course/NLP/Toolkits/jars/stanford-parser-3.9.2-models.jar'
+
+# java_path = "/mnt/c/Program Files/Java/jdk1.8.0_111/bin/java.exe"
+# os.environ['JAVAHOME'] = java_path
+
+# parser = stanford.StanfordParser(model_path="/mnt/e/Course/NLP/Toolkits/jars/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
+
+from stanfordcorenlp import StanfordCoreNLP
+import logging
+from nltk.tree import *
+
+
+class StanfordNLP:
+    """Thin wrapper around a running Stanford CoreNLP server (default localhost:9000)."""
+    def __init__(self, host='http://localhost', port=9000):
+        self.nlp = StanfordCoreNLP(host, port=port,
+                                   timeout=30000)  # , quiet=False, logging_level=logging.DEBUG
+        self.props = {
+            'annotators': 'parse',
+            'pipelineLanguage': 'en',
+            'outputFormat': 'json'
+        }
+
+    def parse(self, sentence):
+        return self.nlp.parse(sentence)
+
+    @staticmethod
+    def tokens_to_dict(_tokens):
+        tokens = defaultdict(dict)
+        for token in _tokens:
+            tokens[int(token['index'])] = {
+                'word': token['word'],
+                'lemma': token['lemma'],
+                'pos': token['pos'],
+                'ner': token['ner']
+            }
+        return tokens
+
+
+sNLP = StanfordNLP()
+
+
+def dump_json(data, outpath):
+    print('Saving to', outpath)
+    with open(outpath, 'w') as out:
+        json.dump(data, out, indent=4, separators=(',', ': '))
+
+
+def normalize_string(s):
+    """Strip whitespace and trailing periods; map purely numeric strings to ''."""
+    def is_number(s):
+        try:
+            float(s)
+            return True
+        except ValueError:
+            pass
+        try:
+            import unicodedata
+            unicodedata.numeric(s)
+            return True
+        except (TypeError, ValueError):
+            pass
+        return False
+
+    s = s.strip().strip(".")
+    if is_number(s):
+        s = ''
+    return s
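+
+# Illustrative behaviour of normalize_string given the checks above:
+#     normalize_string(" mitochondria. ")  ->  "mitochondria"
+#     normalize_string("3.14")             ->  ""   (numeric strings are blanked out)
+#     normalize_string("42")               ->  ""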
+
+
+def conv_json(filename):
+    """Filter one MCQ file: keep items whose answer and distractors survive the checks below."""
+    with open(filename, encoding="utf-8") as f:
+        data = json.load(f)
+    results = []
+    print("*"*30)
+    print("start, ", filename)
+    print("# of MCQs before", len(data))
+    for item in data:
+
+        item['answer'] = normalize_string(item['answer'])
+
+        # delete questions whose answer is a number or "all of the above"
+        if item['answer'] == '' or "all of the above" in item['answer'] or "All of the above" in item['answer']:
+            continue
+        # delete distractors which are numbers or "all of the above"
+        i = 0
+        while i < len(item['distractors']):
+            item['distractors'][i] = normalize_string(item['distractors'][i])
+            if item['distractors'][i] == '' or "all of the above" in item['distractors'][i] or "All of the above" in item['distractors'][i]:
+                del item['distractors'][i]
+            else:
+                i += 1
+
+        phrases = list(item['distractors'])
+        phrases.append(item['answer'])
+        flag = True
+
+        for p in phrases:
+            L = p.split()
+            #print("length, ", len(L))
+            if len(L) == 1:
+                continue
+            elif len(L) > 5:
+                # phrases longer than five words are rejected outright
+                flag = False
+                break
+            else:
+                #print ("start parsing, ", p)
+                # run the Stanford parser and read off the label of the top constituent
+                res = sNLP.parse(p)
+                res = res[1:].replace(' ', '').replace('\n', '').split('(')
+                # print("parse result, ", res)
+                if res[1] != 'VP':
+                    flag = False
+                    break
+        if flag:
+            results.append(item)
+    # save each file
+    outpath = filename[:-5] + "_filtered.json"
+    print("processing done, ", outpath)
+    print("# of MCQs after", len(results))
+    dump_json(results, outpath)
+    return results
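+
+# Note on the parse check in conv_json (illustrative): the wrapper returns a bracketed
+# constituency tree, e.g. for "eat an apple" something like
+#     "(ROOT (VP (VB eat) (NP (DT an) (NN apple))))"
+# Stripping the leading '(' and all whitespace, then splitting on '(', yields
+#     ['ROOT', 'VP', 'VBeat)', ...]
+# so res[1] is the label of the top constituent: 2-5 word phrases are kept only when that
+# label is 'VP'; single-word phrases always pass and longer phrases are always dropped.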
+
+
+if __name__ == "__main__":
+    filenames = [
+        #'MCQ/mcq_total.json',
+        #'Gre/gre_total.json',
+        #"OpenTriviaQA/trivia_total.json"
+    ]
+    for i in ['test_neg.json', 'test.json', 'train_neg.json', 'train.json', 'valid_neg.json', 'valid.json']:
+        path = 'LTR-DG/data/mcql_processed/' + i
+        filenames.append(path)
+        #path = 'LTR-DG/data/sciq_processed/' + i
+        #filenames.append(path)
+    results = []
+    for file in filenames:
+        conv_json(file)
+        #results.extend()
+    #dump_json(results, 'total_filtered.json')
\ No newline at end of file
diff --git a/Preprocessing/stanfordCoreNLP.py b/Preprocessing/stanfordCoreNLP.py
new file mode 100644
index 0000000..e5ef766
--- /dev/null
+++ b/Preprocessing/stanfordCoreNLP.py
@@ -0,0 +1,126 @@
+#-*- coding: utf-8 -*-
+#!python3
+
+import os
+import sys
+import langdetect
+import corenlp
+
+sys.path.append('/mnt/e/Course/NLP/Toolkits/jdk1.8.0_191/bin')
+# Set the CoreNLP root folder as an environment variable
+corenlp_home = '/mnt/e/Course/NLP/Toolkits/stanford-corenlp-full-2018-10-05'
+os.environ['CORENLP_HOME'] = corenlp_home
+
+# Properties from StanfordCoreNLP-chinese.properties.
+# When interacting with the server, lists of strings are handled in parentheses,
+# split by spaces and not commas.
+StanfordCoreNLP_chinese_properties = {
+    'annotators': ('tokenize' 'ssplit' 'pos' 'lemma' 'ner' 'parse' 'mention' 'coref'),
+    'tokenize.language': 'zh',
+    'segment.model': 'edu/stanford/nlp/models/segmenter/chinese/ctb.gz',
+    'segment.sighanCorporaDict': 'edu/stanford/nlp/models/segmenter/chinese',
+    'segment.serDictionary': 'edu/stanford/nlp/models/segmenter/chinese/dict-chris6.ser.gz',
+    'segment.sighanPostProcessing': True,
+    'ssplit.boundaryTokenRegex': '[.。]|[!?!?]+',
+    'pos.model': 'edu/stanford/nlp/models/pos-tagger/chinese-distsim/chinese-distsim.tagger',
+    'ner.language': 'chinese',
+    'ner.model': 'edu/stanford/nlp/models/ner/chinese.misc.distsim.crf.ser.gz',
+    'ner.applyNumericClassifiers': True,
+    'ner.useSUTime': False,
+    'regexner.mapping': 'edu/stanford/nlp/models/kbp/cn_regexner_mapping.tab',
+    'regexner.validpospattern': '^(NR|NN|JJ).*',
+    'regexner.ignorecase': True,
+    'regexner.noDefaultOverwriteLabels': 'CITY',
+    'parse.model': 'edu/stanford/nlp/models/srparser/chineseSR.ser.gz',
+    'depparse.model': 'edu/stanford/nlp/models/parser/nndep/UD_Chinese.gz',
+    'depparse.language': 'chinese',
+    'coref.sieves': ('ChineseHeadMatch' 'ExactStringMatch' 'PreciseConstructs' 'StrictHeadMatch1'
+                     'StrictHeadMatch2' 'StrictHeadMatch3' 'StrictHeadMatch4' 'PronounMatch'),
+    'coref.input.type': 'raw',
+    'coref.postprocessing': True,
+    'coref.calculateFeatureImportance': False,
+    'coref.useConstituencyTree': True,
+    'coref.useSemantics': False,
+    'coref.algorithm': 'hybrid',
+    'coref.path.word2vec': '',
+    'coref.language': 'zh',
+    'coref.defaultPronounAgreement': True,
+    'coref.zh.dict': 'edu/stanford/nlp/models/dcoref/zh-attributes.txt.gz',
+    'coref.print.md.log': False,
+    'coref.md.type': 'RULE',
+    'coref.md.liberalChineseMD': False,
+    'kbp.semgrex': 'edu/stanford/nlp/models/kbp/chinese/semgrex',
+    'kbp.tokensregex': 'edu/stanford/nlp/models/kbp/chinese/tokensregex',
+    'kbp.model': None,
+    'entitylink.wikidict': 'edu/stanford/nlp/models/kbp/wikidict_chinese.tsv.gz'
+}
+
+
+def English_CoreNLP_test(text=None, annotators=None):
+    if text is None:
+        text = 'This is a test sentence for the server to handle. I wonder what it will do.'
+    ####
+    # The full annotator set is: ['tokenize', 'ssplit', 'lemma', 'pos', 'ner', 'depparse']
+    # tokenize: splits the text into words
+    # ssplit: splits the text into a list of sentences
+    # lemma: lemmatizes each word to a basic conjugation / dictionary form
+    # pos: part-of-speech tagging
+    # ner: named entity recognition
+    # depparse: dependency parsing
+    if annotators is None:
+        annotators = ["tokenize", "ssplit"]
+        # annotators = ['tokenize', 'ssplit', 'lemma', 'pos', 'ner', 'depparse']
+    with corenlp.CoreNLPClient(annotators=annotators, timeout=15000) as client:
+        ann = client.annotate(text)
+        # sent_list = [token.word for token in ann.sentence[0].token]
+        # ['This', 'is', 'a', 'test', 'sentence', 'for', 'the', 'server', 'to', 'handle', '.']
+        # sent_list = [token.word for token in ann.sentence[1].token]
+        # ['I', 'wonder', 'what', 'it', 'will', 'do', '.']
+    return ann
+
+
+def Chinese_CoreNLP_test(text=None, annotators=None):
+    if text is None:
+        text = ("国务院日前发出紧急通知,要求各地切实落实保证市场供应的各项政策,维护副食品价格稳定。")
+    ####
+    if annotators is None:
+        annotators = ['tokenize', 'ssplit', 'pos']
+        # annotators = ['tokenize', 'ssplit', 'lemma', 'pos', 'ner', 'depparse']
+    with corenlp.CoreNLPClient(annotators=annotators, properties=StanfordCoreNLP_chinese_properties, timeout=15000) as client:
+        ann = client.annotate(text)
+        # sent_list = [token.word for token in ann.sentence[0].token]
+        # ['国务院', '日前', '发出', '紧急', '通知', ',', '要求', '各地', '切实', '落实', '保证', '市场', '供应', '的', '各', '项', '政策', ',', '维护', '副食品', '价格', '稳定', '。']
+    return ann
+
+
+# Takes a Chinese string and returns a list of words nested in a list of sentences
+def Segment(text, sent_split=True, tolist=True):
+    words = []
+    if text != '':
+        try:
+            lang = langdetect.detect(text)
+        except langdetect.lang_detect_exception.LangDetectException:
+            lang = "undetermined"
+        if lang == "zh-cn":  # if the text is Chinese, segment it; otherwise leave it as is
+            #########
+            if sent_split:
+                annotators = ['tokenize', 'ssplit']
+                with corenlp.CoreNLPClient(annotators=annotators, properties=StanfordCoreNLP_chinese_properties, timeout=15000) as client:
+                    ann = client.annotate(text)
+                    words = [[token.word for token in sent.token] for sent in ann.sentence]
+                    segmented_list = [' '.join(wordlist) for wordlist in words]
+                    segmented = '\n'.join(segmented_list)
+            else:
+                annotators = ['tokenize']
+                with corenlp.CoreNLPClient(annotators=annotators, properties=StanfordCoreNLP_chinese_properties, timeout=15000) as client:
+                    ann = client.annotate(text)
+                    words = [token.word for token in ann.sentencelessToken]
+                    segmented = ' '.join(words)
+        else:
+            segmented = text
+            words = segmented.split()
+    else:
+        segmented = text
+    if tolist:
+        return words      # list
+    else:
+        return segmented  # string
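+
+# Example usage (illustrative; assumes the CoreNLP install pointed to by CORENLP_HOME
+# above is available). With the sentence from Chinese_CoreNLP_test:
+#     Segment('国务院日前发出紧急通知,...')
+# would return a nested list along the lines of
+#     [['国务院', '日前', '发出', '紧急', '通知', ...]]
+# while tolist=False returns the same tokens joined by spaces, one sentence per line.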
+
+
+def POSTag(text, sent_split=True, tolist=True):
+    words = []
+    if text != '':
+        try:
+            lang = langdetect.detect(text)
+        except langdetect.lang_detect_exception.LangDetectException:
+            lang = "undetermined"
+        if lang == "zh-cn":  # if the text is Chinese, tag it; otherwise leave it as is
+            #########
+            if sent_split:
+                annotators = ['tokenize', 'ssplit', 'pos']
+                with corenlp.CoreNLPClient(annotators=annotators, properties=StanfordCoreNLP_chinese_properties, timeout=15000) as client:
+                    ann = client.annotate(text)
+                    words = [[(token.word, token.pos) for token in sent.token] for sent in ann.sentence]
+                    segmented_list = [' '.join(['#'.join(posted) for posted in wordlist]) for wordlist in words]
+                    segmented = '\n'.join(segmented_list)
+            else:
+                annotators = ['tokenize', 'pos']
+                with corenlp.CoreNLPClient(annotators=annotators, properties=StanfordCoreNLP_chinese_properties, timeout=15000) as client:
+                    ann = client.annotate(text)
+                    words = [(token.word, token.pos) for token in ann.sentencelessToken]
+                    segmented = ' '.join(['#'.join(posted) for posted in words])
+        else:
+            segmented = text
+            words = segmented.split()
+    else:
+        segmented = text
+    if tolist:
+        return words      # list of (word, pos) pairs
+    else:
+        return segmented  # string of word#POS tokens
+
+
+def Parse(text, annotators=None):
+    if annotators is None:
+        # annotators = ['tokenize', 'ssplit', 'lemma', 'pos', 'ner', 'parse', 'depparse', 'regexner', 'coref']
+        annotators = ['tokenize', 'ssplit', 'lemma', 'pos', 'parse']
+    with corenlp.CoreNLPClient(annotators=annotators, properties=StanfordCoreNLP_chinese_properties, timeout=15000) as client:
+        ann = client.annotate(text)
+    return ann
+
+
+if __name__ == '__main__':
+    English_CoreNLP_test("hello world!")
\ No newline at end of file