From 7488950e664a7c1d93759ecb135d2a04fdda02f5 Mon Sep 17 00:00:00 2001
From: Xinzhu Cai
Date: Wed, 28 Aug 2019 08:55:33 -0400
Subject: [PATCH] Add files via upload

---
 Preprocessing/questionConversion.py | 204 ++++++++++++++++++++++++++++
 Preprocessing/questionFiltering.py  | 138 +++++++++++++++++++
 Preprocessing/stanfordCoreNLP.py    | 126 +++++++++++++++++
 3 files changed, 468 insertions(+)
 create mode 100644 Preprocessing/questionConversion.py
 create mode 100644 Preprocessing/questionFiltering.py
 create mode 100644 Preprocessing/stanfordCoreNLP.py

diff --git a/Preprocessing/questionConversion.py b/Preprocessing/questionConversion.py
new file mode 100644
index 0000000..0719bdc
--- /dev/null
+++ b/Preprocessing/questionConversion.py
@@ -0,0 +1,204 @@
+import json
+import os
+from collections import defaultdict
+from POSTree import POSTree
+from nltk.parse import stanford
+from stanfordcorenlp import StanfordCoreNLP
+import logging
+from nltk.tree import *
+
+
+class StanfordNLP:
+    """Thin wrapper around a running Stanford CoreNLP server (default localhost:9000)."""
+    def __init__(self, host='http://localhost', port=9000):
+        self.nlp = StanfordCoreNLP(host, port=port,
+                                   timeout=30000)  # , quiet=False, logging_level=logging.DEBUG
+        self.props = {
+            'annotators': 'parse',
+            'pipelineLanguage': 'en',
+            'outputFormat': 'json'
+        }
+
+    def parse(self, sentence):
+        return self.nlp.parse(sentence)
+
+    @staticmethod
+    def tokens_to_dict(_tokens):
+        tokens = defaultdict(dict)
+        for token in _tokens:
+            tokens[int(token['index'])] = {
+                'word': token['word'],
+                'lemma': token['lemma'],
+                'pos': token['pos'],
+                'ner': token['ner']
+            }
+        return tokens
+
+
+sNLP = StanfordNLP()
+
+# Convert all questions to declarative sentences. The fill-in part is marked in the
+# source data by a run of underscores ('________') and is rewritten here as ' **blank** '.
+
+
+def dump_json(data, outpath):
+    print('Saving to', outpath)
+    with open(outpath, 'w') as out:
+        json.dump(data, out, indent=4, separators=(',', ': '))
+
+
+def question_to_sentence(sentence):
+    """Turn a question into a cloze-style sentence containing a ' **blank** ' marker."""
+    parse_res = sNLP.parse(sentence)
+    #print(parse_res)
+    tree = POSTree(parse_res)
+    try:
+        # Preferred path: let POSTree rearrange the constituency parse into a statement.
+        res = tree.adjust_order()
+        #print(res)
+    except Exception:
+        # Fallback: POSTree failed, so replace the wh-word (or append a blank) instead.
+        #print ("*****************************")
+        #print (sentence)
+        flag = False
+        res = sentence
+        if res[-1] == '?':
+            for mark in ['what','where','which','when','Where','When','What','Which','how','How']:
+                if mark in res:
+                    flag = True
+                    res = res.replace(mark, ' **blank** ')
+                    res = res[:-1]  # drop the trailing '?'
+                    break
+        if not flag:
+            if res[-1] == '.':
+                res = res[:-1]
+            res += ' **blank** '
+    return res
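+
+# Example (illustrative): if POSTree raises and the fallback branch above runs, a question
+# such as
+#     "If a cat pushes its face against your head, this means what?"
+# becomes roughly
+#     "If a cat pushes its face against your head, this means **blank**"
+# i.e. the wh-word is swapped for the blank marker and the trailing '?' is dropped.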
+
+def convert_gre_data(filename):
+    with open(filename, encoding="utf-8") as f:
+        data = json.load(f)
+    print("*"*30)
+    print("start, ", filename)
+    length = len(data)
+    print("# of MCQs before", length)
+    for i in range(length):
+        #print(data[i]['sentence'].find(' '))
+        if data[i]['sentence'].find('________') != -1:
+            data[i]['sentence'] = data[i]['sentence'].replace('________', ' **blank** ')
+        else:
+            data[i]['sentence'] = data[i]['sentence'].replace(' ', ' **blank** ')
+        #print(data[i]['sentence'])
+    outpath = filename[:-5] + "_converted.json"
+    print("processing done, ", outpath)
+    print("# of MCQs after", len(data))
+    dump_json(data, outpath)
+
+
+def convert_mcq_data(filename):
+    with open(filename, encoding="utf-8") as f:
+        data = json.load(f)
+    print("*"*30)
+    print("start, ", filename)
+    length = len(data)
+    print("# of MCQs before", length)
+    for i in range(length):
+        if data[i]['sentence'].find('________') != -1:
+            data[i]['sentence'] = data[i]['sentence'].replace('________', ' **blank** ')
+        elif data[i]['sentence'][-1] == '?':
+            data[i]['sentence'] = question_to_sentence(data[i]['sentence'])
+        else:
+            data[i]['sentence'] += ' **blank** '
+        # flag = False
+        # for mark in ['what','where','which','when','Where','When','What','Which']:
+        #     if mark in data[i]['sentence']:
+        #         flag = True
+        #         data[i]['sentence'] = data[i]['sentence'].replace(mark, '**blank**')
+        #         break
+        # if not flag:
+        #     data[i]['sentence'] += ' **blank**'
+    outpath = filename[:-5] + "_converted.json"
+    print("processing done, ", outpath)
+    print("# of MCQs after", len(data))
+    dump_json(data, outpath)
+
+
+def convert_trivia_data(filename):
+    with open(filename, encoding="utf-8") as f:
+        data = json.load(f)
+    print("*"*30)
+    print("start, ", filename)
+    length = len(data)
+    print("# of MCQs before", length)
+    for i in range(length):
+        # skip true/false questions, which cannot be turned into a blank
+        if data[i]['answer'] == 'True' or data[i]['answer'] == 'False':
+            continue
+        if '?' in data[i]['sentence']:
+            data[i]['sentence'] = question_to_sentence(data[i]['sentence'])
+            # for mark in ['what','where','which','when','Where','When','What','Which']:
+            #     if mark in data[i]['sentence']:
+            #         flag = True
+            #         data[i]['sentence'] = data[i]['sentence'].replace(mark, '**blank**')
+            #         break
+            # if not flag:
+            #     print (data[i]['sentence'])
+        elif data[i]['sentence'].find('________') != -1:
+            data[i]['sentence'] = data[i]['sentence'].replace('________', ' **blank** ')
+        else:
+            data[i]['sentence'] = data[i]['sentence'][:-1] + ' **blank** '
+    outpath = filename[:-5] + "_converted.json"
+    print("processing done, ", outpath)
+    print("# of MCQs after", len(data))
+    dump_json(data, outpath)
+
+
+def convert_mcql_data(filename):
+    with open(filename, encoding="utf-8") as f:
+        data = json.load(f)
+    print("*"*30)
+    print("start, ", filename)
+    length = len(data)
+    print("# of MCQs before", length)
+    for i in range(length):
+        # append the blank marker to every MCQL sentence stem
+        data[i]['sentence'] = data[i]['sentence'] + ' **blank** '
+        #print(data[i]['sentence'])
+    outpath = filename[:-5] + "_converted.json"
+    print("processing done, ", outpath)
+    print("# of MCQs after", len(data))
+    dump_json(data, outpath)
+
+
+def convert_sciq_data(filename):
+    with open(filename, encoding="utf-8") as f:
+        data = json.load(f)
+    print("*"*30)
+    print("start, ", filename)
+    length = len(data)
+    print("# of MCQs before", length)
+    for i in range(length):
+        flag = False
+        if '?' in data[i]['sentence']:
+            data[i]['sentence'] = question_to_sentence(data[i]['sentence'])
+            # for mark in ['What','what','which','Which','where','when','Where','When','Who','who','How many','How do','this']:
+            #     if mark in data[i]['sentence']:
+            #         flag = True
+            #         data[i]['sentence'] = data[i]['sentence'].replace(mark, '**blank**')
+            #         break
+            # if not flag:
+            #     data[i]['sentence'] = data[i]['sentence'][:-1] + '**blank**'
+    outpath = filename[:-5] + "_converted.json"
+    print("processing done, ", outpath)
+    print("# of MCQs after", len(data))
+    try:
+        dump_json(data, outpath)
+    except Exception:
+        print(data)
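+
+# All converters above read the same JSON layout (inferred from the fields they access):
+# a list of records, each with at least a 'sentence' key, plus 'answer' and 'distractors'
+# where those are used. A hypothetical record for illustration:
+#     {
+#         "sentence": "Mitochondria are the ________ of the cell.",
+#         "answer": "powerhouse",
+#         "distractors": ["nucleus", "ribosome", "cell wall"]
+#     }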
+
+
+if __name__ == "__main__":
+    filenames = [
+        'MCQ/mcq_total_filtered.json',
+        'Gre/gre_total_filtered.json',
+        "OpenTriviaQA/trivia_total_filtered.json"
+    ]
+    for x in ['LTR-DG/data/mcql_processed/', 'LTR-DG/data/sciq_processed/']:
+        for i in ['test_neg_filtered.json', 'test_filtered.json', 'train_neg_filtered.json',
+                  'train_filtered.json', 'valid_neg_filtered.json', 'valid_filtered.json']:
+            path = x + i
+            filenames.append(path)
+    sentence = "If a cat pushes its face against your head, this means what?"
+    question_to_sentence(sentence)
+    convert_gre_data(filenames[1])
+    convert_mcq_data(filenames[0])
+    convert_trivia_data(filenames[2])
+    for i in range(3, 9):
+        convert_mcql_data(filenames[i])
+    for i in range(9, 15):
+        convert_sciq_data(filenames[i])
+
+print(question_to_sentence("What is the opportunity cost of purchasing the factory for the first year of operation?"))
\ No newline at end of file
diff --git a/Preprocessing/questionFiltering.py b/Preprocessing/questionFiltering.py
new file mode 100644
index 0000000..83b2b41
--- /dev/null
+++ b/Preprocessing/questionFiltering.py
@@ -0,0 +1,138 @@
+import json
+import os
+from collections import defaultdict
+from nltk.parse import stanford
+
+# os.environ['STANFORD_PARSER'] = '/mnt/e/Course/NLP/Toolkits/jars/stanford-parser.jar'
+# os.environ['STANFORD_MODELS'] = '/mnt/e/Course/NLP/Toolkits/jars/stanford-parser-3.9.2-models.jar'
+
+# java_path = "/mnt/c/Program Files/Java/jdk1.8.0_111/bin/java.exe"
+# os.environ['JAVAHOME'] = java_path
+
+# parser = stanford.StanfordParser(model_path="/mnt/e/Course/NLP/Toolkits/jars/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz")
+
+from stanfordcorenlp import StanfordCoreNLP
+import logging
+from nltk.tree import *
+
+
+class StanfordNLP:
+    """Thin wrapper around a running Stanford CoreNLP server (default localhost:9000)."""
+    def __init__(self, host='http://localhost', port=9000):
+        self.nlp = StanfordCoreNLP(host, port=port,
+                                   timeout=30000)  # , quiet=False, logging_level=logging.DEBUG
+        self.props = {
+            'annotators': 'parse',
+            'pipelineLanguage': 'en',
+            'outputFormat': 'json'
+        }
+
+    def parse(self, sentence):
+        return self.nlp.parse(sentence)
+
+    @staticmethod
+    def tokens_to_dict(_tokens):
+        tokens = defaultdict(dict)
+        for token in _tokens:
+            tokens[int(token['index'])] = {
+                'word': token['word'],
+                'lemma': token['lemma'],
+                'pos': token['pos'],
+                'ner': token['ner']
+            }
+        return tokens
+
+
+sNLP = StanfordNLP()
+
+
+def dump_json(data, outpath):
+    print('Saving to', outpath)
+    with open(outpath, 'w') as out:
+        json.dump(data, out, indent=4, separators=(',', ': '))
+
+
+def normalize_string(s):
+    """Strip whitespace and trailing periods; map purely numeric strings to ''."""
+    def is_number(s):
+        try:
+            float(s)
+            return True
+        except ValueError:
+            pass
+        try:
+            import unicodedata
+            unicodedata.numeric(s)
+            return True
+        except (TypeError, ValueError):
+            pass
+        return False
+
+    s = s.strip().strip(".")
+    if is_number(s):
+        s = ''
+    return s
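+
+# Illustrative behaviour of normalize_string given the checks above:
+#     normalize_string(" mitochondria. ")  ->  "mitochondria"
+#     normalize_string("3.14")             ->  ""   (numeric strings are blanked out)
+#     normalize_string("42")               ->  ""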
+
+
+def conv_json(filename):
+    """Filter one MCQ file: keep items whose answer and distractors survive the checks below."""
+    with open(filename, encoding="utf-8") as f:
+        data = json.load(f)
+    results = []
+    print("*"*30)
+    print("start, ", filename)
+    print("# of MCQs before", len(data))
+    for item in data:
+
+        item['answer'] = normalize_string(item['answer'])
+
+        # delete questions whose answer is a number or "all of the above"
+        if item['answer'] == '' or "all of the above" in item['answer'] or "All of the above" in item['answer']:
+            continue
+        # delete distractors which are numbers or "all of the above"
+        i = 0
+        while i < len(item['distractors']):
+            item['distractors'][i] = normalize_string(item['distractors'][i])
+            if item['distractors'][i] == '' or "all of the above" in item['distractors'][i] or "All of the above" in item['distractors'][i]:
+                del item['distractors'][i]
+            else:
+                i += 1
+
+        phrases = list(item['distractors'])
+        phrases.append(item['answer'])
+        flag = True
+
+        for p in phrases:
+            L = p.split()
+            #print("length, ", len(L))
+            if len(L) == 1:
+                continue
+            elif len(L) > 5:
+                # phrases longer than five words are rejected outright
+                flag = False
+                break
+            else:
+                #print ("start parsing, ", p)
+                # run the Stanford parser and read off the label of the top constituent
+                res = sNLP.parse(p)
+                res = res[1:].replace(' ', '').replace('\n', '').split('(')
+                # print("parse result, ", res)
+                if res[1] != 'VP':
+                    flag = False
+                    break
+        if flag:
+            results.append(item)
+    # save each file
+    outpath = filename[:-5] + "_filtered.json"
+    print("processing done, ", outpath)
+    print("# of MCQs after", len(results))
+    dump_json(results, outpath)
+    return results
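+
+# Note on the parse check in conv_json (illustrative): the wrapper returns a bracketed
+# constituency tree, e.g. for "eat an apple" something like
+#     "(ROOT (VP (VB eat) (NP (DT an) (NN apple))))"
+# Stripping the leading '(' and all whitespace, then splitting on '(', yields
+#     ['ROOT', 'VP', 'VBeat)', ...]
+# so res[1] is the label of the top constituent: 2-5 word phrases are kept only when that
+# label is 'VP'; single-word phrases always pass and longer phrases are always dropped.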
+
+
+if __name__ == "__main__":
+    filenames = [
+        #'MCQ/mcq_total.json',
+        #'Gre/gre_total.json',
+        #"OpenTriviaQA/trivia_total.json"
+    ]
+    for i in ['test_neg.json', 'test.json', 'train_neg.json', 'train.json', 'valid_neg.json', 'valid.json']:
+        path = 'LTR-DG/data/mcql_processed/' + i
+        filenames.append(path)
+        #path = 'LTR-DG/data/sciq_processed/' + i
+        #filenames.append(path)
+    results = []
+    for file in filenames:
+        conv_json(file)
+        #results.extend()
+    #dump_json(results, 'total_filtered.json')
\ No newline at end of file
diff --git a/Preprocessing/stanfordCoreNLP.py b/Preprocessing/stanfordCoreNLP.py
new file mode 100644
index 0000000..e5ef766
--- /dev/null
+++ b/Preprocessing/stanfordCoreNLP.py
@@ -0,0 +1,126 @@
+#-*- coding: utf-8 -*-
+#!python3
+
+import os
+import sys
+import langdetect
+import corenlp
+
+sys.path.append('/mnt/e/Course/NLP/Toolkits/jdk1.8.0_191/bin')
+# Set the CoreNLP root folder as an environment variable
+corenlp_home = '/mnt/e/Course/NLP/Toolkits/stanford-corenlp-full-2018-10-05'
+os.environ['CORENLP_HOME'] = corenlp_home
+
+# Properties from StanfordCoreNLP-chinese.properties.
+# When interacting with the server, lists of strings are handled in parentheses,
+# split by spaces and not commas.
+StanfordCoreNLP_chinese_properties = {
+    'annotators': ('tokenize' 'ssplit' 'pos' 'lemma' 'ner' 'parse' 'mention' 'coref'),
+    'tokenize.language': 'zh',
+    'segment.model': 'edu/stanford/nlp/models/segmenter/chinese/ctb.gz',
+    'segment.sighanCorporaDict': 'edu/stanford/nlp/models/segmenter/chinese',
+    'segment.serDictionary': 'edu/stanford/nlp/models/segmenter/chinese/dict-chris6.ser.gz',
+    'segment.sighanPostProcessing': True,
+    'ssplit.boundaryTokenRegex': '[.。]|[!?!?]+',
+    'pos.model': 'edu/stanford/nlp/models/pos-tagger/chinese-distsim/chinese-distsim.tagger',
+    'ner.language': 'chinese',
+    'ner.model': 'edu/stanford/nlp/models/ner/chinese.misc.distsim.crf.ser.gz',
+    'ner.applyNumericClassifiers': True,
+    'ner.useSUTime': False,
+    'regexner.mapping': 'edu/stanford/nlp/models/kbp/cn_regexner_mapping.tab',
+    'regexner.validpospattern': '^(NR|NN|JJ).*',
+    'regexner.ignorecase': True,
+    'regexner.noDefaultOverwriteLabels': 'CITY',
+    'parse.model': 'edu/stanford/nlp/models/srparser/chineseSR.ser.gz',
+    'depparse.model': 'edu/stanford/nlp/models/parser/nndep/UD_Chinese.gz',
+    'depparse.language': 'chinese',
+    'coref.sieves': ('ChineseHeadMatch' 'ExactStringMatch' 'PreciseConstructs' 'StrictHeadMatch1'
+                     'StrictHeadMatch2' 'StrictHeadMatch3' 'StrictHeadMatch4' 'PronounMatch'),
+    'coref.input.type': 'raw',
+    'coref.postprocessing': True,
+    'coref.calculateFeatureImportance': False,
+    'coref.useConstituencyTree': True,
+    'coref.useSemantics': False,
+    'coref.algorithm': 'hybrid',
+    'coref.path.word2vec': '',
+    'coref.language': 'zh',
+    'coref.defaultPronounAgreement': True,
+    'coref.zh.dict': 'edu/stanford/nlp/models/dcoref/zh-attributes.txt.gz',
+    'coref.print.md.log': False,
+    'coref.md.type': 'RULE',
+    'coref.md.liberalChineseMD': False,
+    'kbp.semgrex': 'edu/stanford/nlp/models/kbp/chinese/semgrex',
+    'kbp.tokensregex': 'edu/stanford/nlp/models/kbp/chinese/tokensregex',
+    'kbp.model': None,
+    'entitylink.wikidict': 'edu/stanford/nlp/models/kbp/wikidict_chinese.tsv.gz'
+}
+
+
+def English_CoreNLP_test(text=None, annotators=None):
+    if text is None:
+        text = 'This is a test sentence for the server to handle. I wonder what it will do.'
+    ####
+    # The full annotator set is: ['tokenize', 'ssplit', 'lemma', 'pos', 'ner', 'depparse']
+    # tokenize: splits the text into words
+    # ssplit: splits the text into a list of sentences
+    # lemma: lemmatizes each word to a basic conjugation / dictionary form
+    # pos: part-of-speech tagging
+    # ner: named entity recognition
+    # depparse: dependency parsing
+    if annotators is None:
+        annotators = ["tokenize", "ssplit"]
+        # annotators = ['tokenize', 'ssplit', 'lemma', 'pos', 'ner', 'depparse']
+    with corenlp.CoreNLPClient(annotators=annotators, timeout=15000) as client:
+        ann = client.annotate(text)
+        # sent_list = [token.word for token in ann.sentence[0].token]
+        # ['This', 'is', 'a', 'test', 'sentence', 'for', 'the', 'server', 'to', 'handle', '.']
+        # sent_list = [token.word for token in ann.sentence[1].token]
+        # ['I', 'wonder', 'what', 'it', 'will', 'do', '.']
+    return ann
+
+
+def Chinese_CoreNLP_test(text=None, annotators=None):
+    if text is None:
+        text = ("国务院日前发出紧急通知,要求各地切实落实保证市场供应的各项政策,维护副食品价格稳定。")
+    ####
+    if annotators is None:
+        annotators = ['tokenize', 'ssplit', 'pos']
+        # annotators = ['tokenize', 'ssplit', 'lemma', 'pos', 'ner', 'depparse']
+    with corenlp.CoreNLPClient(annotators=annotators, properties=StanfordCoreNLP_chinese_properties, timeout=15000) as client:
+        ann = client.annotate(text)
+        # sent_list = [token.word for token in ann.sentence[0].token]
+        # ['国务院', '日前', '发出', '紧急', '通知', ',', '要求', '各地', '切实', '落实', '保证', '市场', '供应', '的', '各', '项', '政策', ',', '维护', '副食品', '价格', '稳定', '。']
+    return ann
+
+
+# Takes a Chinese string and returns a list of words nested in a list of sentences
+def Segment(text, sent_split=True, tolist=True):
+    words = []
+    if text != '':
+        try:
+            lang = langdetect.detect(text)
+        except langdetect.lang_detect_exception.LangDetectException:
+            lang = "undetermined"
+        if lang == "zh-cn":  # if the text is Chinese, segment it; otherwise leave it as is
+            #########
+            if sent_split:
+                annotators = ['tokenize', 'ssplit']
+                with corenlp.CoreNLPClient(annotators=annotators, properties=StanfordCoreNLP_chinese_properties, timeout=15000) as client:
+                    ann = client.annotate(text)
+                    words = [[token.word for token in sent.token] for sent in ann.sentence]
+                    segmented_list = [' '.join(wordlist) for wordlist in words]
+                    segmented = '\n'.join(segmented_list)
+            else:
+                annotators = ['tokenize']
+                with corenlp.CoreNLPClient(annotators=annotators, properties=StanfordCoreNLP_chinese_properties, timeout=15000) as client:
+                    ann = client.annotate(text)
+                    words = [token.word for token in ann.sentencelessToken]
+                    segmented = ' '.join(words)
+        else:
+            segmented = text
+            words = segmented.split()
+    else:
+        segmented = text
+    if tolist:
+        return words      # list
+    else:
+        return segmented  # string
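+
+# Example usage (illustrative; assumes the CoreNLP install pointed to by CORENLP_HOME
+# above is available). With the sentence from Chinese_CoreNLP_test:
+#     Segment('国务院日前发出紧急通知,...')
+# would return a nested list along the lines of
+#     [['国务院', '日前', '发出', '紧急', '通知', ...]]
+# while tolist=False returns the same tokens joined by spaces, one sentence per line.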
+
+
+def POSTag(text, sent_split=True, tolist=True):
+    words = []
+    if text != '':
+        try:
+            lang = langdetect.detect(text)
+        except langdetect.lang_detect_exception.LangDetectException:
+            lang = "undetermined"
+        if lang == "zh-cn":  # if the text is Chinese, tag it; otherwise leave it as is
+            #########
+            if sent_split:
+                annotators = ['tokenize', 'ssplit', 'pos']
+                with corenlp.CoreNLPClient(annotators=annotators, properties=StanfordCoreNLP_chinese_properties, timeout=15000) as client:
+                    ann = client.annotate(text)
+                    words = [[(token.word, token.pos) for token in sent.token] for sent in ann.sentence]
+                    segmented_list = [' '.join(['#'.join(posted) for posted in wordlist]) for wordlist in words]
+                    segmented = '\n'.join(segmented_list)
+            else:
+                annotators = ['tokenize', 'pos']
+                with corenlp.CoreNLPClient(annotators=annotators, properties=StanfordCoreNLP_chinese_properties, timeout=15000) as client:
+                    ann = client.annotate(text)
+                    words = [(token.word, token.pos) for token in ann.sentencelessToken]
+                    segmented = ' '.join(['#'.join(posted) for posted in words])
+        else:
+            segmented = text
+            words = segmented.split()
+    else:
+        segmented = text
+    if tolist:
+        return words      # list of (word, pos) pairs
+    else:
+        return segmented  # string of word#POS tokens
+
+
+def Parse(text, annotators=None):
+    if annotators is None:
+        # annotators = ['tokenize', 'ssplit', 'lemma', 'pos', 'ner', 'parse', 'depparse', 'regexner', 'coref']
+        annotators = ['tokenize', 'ssplit', 'lemma', 'pos', 'parse']
+    with corenlp.CoreNLPClient(annotators=annotators, properties=StanfordCoreNLP_chinese_properties, timeout=15000) as client:
+        ann = client.annotate(text)
+    return ann
+
+
+if __name__ == '__main__':
+    English_CoreNLP_test("hello world!")
\ No newline at end of file