From f7eff699826b75a2e29b011630a2ac1401d0f7e3 Mon Sep 17 00:00:00 2001 From: Xinzhu Cai Date: Wed, 28 Aug 2019 08:57:11 -0400 Subject: [PATCH] Add files via upload --- Ranker/POSTree.py | 595 +++++++++++++++++++++++++++++++ Ranker/loadGlove.py | 77 ++++ Ranker/loadVocab.py | 32 ++ Ranker/multiCalculateFeatures.py | 233 ++++++++++++ Ranker/prepareData.py | 78 ++++ Ranker/train.py | 176 +++++++++ 6 files changed, 1191 insertions(+) create mode 100644 Ranker/POSTree.py create mode 100644 Ranker/loadGlove.py create mode 100644 Ranker/loadVocab.py create mode 100644 Ranker/multiCalculateFeatures.py create mode 100644 Ranker/prepareData.py create mode 100644 Ranker/train.py diff --git a/Ranker/POSTree.py b/Ranker/POSTree.py new file mode 100644 index 0000000..d3c89c0 --- /dev/null +++ b/Ranker/POSTree.py @@ -0,0 +1,595 @@ +from operator import itemgetter + +class POSTree(object): + + class Node(object): + def init(self, token): + self.token = token + self.first_child = None + self.next_sibling = None + + def repr(self): + return '<%s>' % (self.token,) + + def init(self, text): + """Create a Penn Treebacnk style tree from plaint text. + """ + + self.raw_text = text + self.text = text.replace('\n', '') + self.text_length = len(self.text) + self.text_pointer = 0 + self.words = [] + self.root = self.create_tree() + self.question = ' '.join(self.gather_word(self.root)) + self.VB_TAG = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'MD') + self.VB_WORD = ('do', 'does', 'can', 'could', 'would', 'should', + 'might', 'has', 'have', "'ve", 'is', "'s", 'are', "'re", 'was', 'were') + + def create_tree(self): + parent = None + token = self.next_token() + if token == '(': + token = self.next_token() + parent = self.Node(token) + parent.first_child = self.create_tree() + child = parent.first_child + if child != None: + while True: + child.next_sibling = self.create_tree() + child = child.next_sibling + if child == None: + break + elif token != ')': + parent = self.Node(token.lower()) + self.words.append(token.lower()) + + return parent + + def next_token(self): + end = self.text_pointer + while end < self.text_length and self.text[end] == ' ': + end += 1 + + if end == self.text_length: + return None + + if self.text[end] in ('(', ')'): + token = self.text[end] + end += 1 + else: + start = end + end += 1 + while end < self.text_length and self.text[end] not in ('(', ')', ' '): + end += 1 + token = self.text[start:end] + self.text_pointer = end + return token + + def first_order_traverse(self): + self.first_order_traverse(self.root) + + def first_order_traverse(self, tree): + if tree != None: + print(tree.token) + self.first_order_traverse(tree.first_child) + if tree.first_child != None: + child = tree.first_child.next_sibling + while child != None: + self.first_order_traverse(child) + child = child.next_sibling + + def delete_period(self): + child = self.root.first_child.first_child + assert(child.token != '.') + while child.next_sibling.token != '.': + child = child.next_sibling + child.next_sibling = None + + def check_PP(self, prenode, node): + while node != None and node.token in ('PP', ',', 'SBAR'): + prenode = node + node = node.next_sibling + return prenode, node + + def adjust_order(self): + try: + child = self.root.first_child + if child.token == 'FRAG' and ' '.join(self.words[:2]) == 'how many': + words = ['there', 'are', '**blank**'] + self.words[2:-1] + return ' '.join(words) + + self.delete_period() + assert(child.next_sibling == None) + if child.token == 'SQ': + self.adjust_SQ_question(child) + elif 
child.token == 'SBARQ': + prefirst = child + first = child.first_child + second = first.next_sibling + if first.token == 'SQ' and second == None: + self.adjust_SQ_question(first) + elif (first.token in ('WHADJP', 'WHNP', 'WHADVP', 'WHPP') + and second.token == 'SQ'): + WH = self.delete_tree(prefirst, first) + self.adjust_SBARQ_question(WH, second) + else: + raise ValueError('Unknown question structure!') + elif child.token == 'SBAR': + if (child.first_child.token == 'WHADJP' + and child.first_child.next_sibling.token == 'S' + and ' '.join(self.words[:2]) == 'how many'): + SQ = child.first_child.next_sibling + WH = self.delete_tree(child, child.first_child) + self.adjust_SBARQ_question(WH, SQ) + else: + raise ValueError('Unknown question structure!') + else: + raise ValueError('Unknown question structure!') + words = self.gather_word(self.root) + words = filter(lambda w: w != '', words) + statement = ' '.join(words) + except Exception as e: + if DEBUG: + print(self.question) + print(self.raw_text) + raise e + return statement + + def create_answer_node(self, before_text='', after_text=''): + node = self.Node('A') + answer = '**blank**' + if before_text != '': + answer = '%s %s' % (before_text, answer) + if after_text != '': + answer = '%s %s' % (answer, after_text) + node.first_child = self.Node(answer) + return node + + def check_VB(self, node): + if node.token in self.VB_TAG: + return True + if node.first_child.token in self.VB_WORD: + node.token = 'VB' + return True + return False + + def adjust_SQ_question(self, SQ): + VB = SQ.first_child + assert(self.check_VB(VB)) + auxiliary = VB.first_child.token + if auxiliary not in ('do', 'did', 'does'): + answer = self.create_answer_node(before_text=auxiliary) + else: + answer = self.create_answer_node() + + # move answer after first NP + NP = VB.next_sibling + while NP.token != 'NP': + NP = NP.next_sibling + self.insert_after(answer, NP) + self.delete_tree(SQ, VB) + return SQ + + def gather_word(self, tree): + words = [] + def recursor(t): + if t == None: + return + if t.first_child == None: + words.append(t.token) + else: + recursor(t.first_child) + sibling = t.first_child.next_sibling + while sibling != None: + recursor(sibling) + sibling = sibling.next_sibling + recursor(tree) + return words + + def tree_to_text(self, tree): + words = [] + def recursor(t): + if t == None: + return + if t.first_child == None: + words.append(' '+t.token) + else: + words.append('('+t.token) + recursor(t.first_child) + sibling = t.first_child.next_sibling + while sibling != None: + recursor(sibling) + sibling = sibling.next_sibling + words.append(')') + recursor(tree) + return ''.join(words) + + def convert_WH_to_answer(self, WH): + words = self.gather_word(WH) + WH_text = ' '.join(words) + if WH_text == 'how old': + WH.first_child = self.create_answer_node(after_text='years old') + elif WH_text == 'why': + WH.first_child = self.create_answer_node(before_text='because') + elif WH.token in ('WHADJP', 'WHADVP'): + WH.first_child = self.create_answer_node() + elif WH.token == 'WHNP' or WH.token == 'WHPP' and WH.first_child.next_sibling.token == 'WHNP': + parent = WH if WH.token == 'WHNP' else WH.first_child.next_sibling + first = WH.first_child + while first.token == 'WHNP': + parent = first + first = first.first_child + if first.token == 'WHADJP': + first.first_child = self.create_answer_node() + elif self.tree_to_text(parent).startswith('(WHNP(WDT what)(NN color)(NN'): + after_text = ' '.join(self.gather_word(parent)).replace('what color ', '', 1) + 
parent.first_child = self.create_answer_node(after_text=after_text) + else: + parent.first_child = self.create_answer_node() + else: + raise ValueError('Unknown WH structure!') + return WH + + def check_ADVP(self, prenode, node): + while node != None and node.token == 'ADVP': + prenode = node + node = node.next_sibling + return prenode, node + + def delete_tree(self, prenode, node): + if node == None: + return node + if prenode.first_child == node: + prenode.first_child = node.next_sibling + else: + prenode.next_sibling = node.next_sibling + node.next_sibling = None + return node + + def delete_node(self, prenode, node): + if node == None: + return node + if prenode.first_child == node: + if node.first_child == None: + prenode.first_child = node.next_sibling + else: + prenode.first_child = node.first_child + lc = node.first_child + while lc.next_sibling != None: + lc = lc.next_sibling + lc.next_sibling = node.next_sibling + node.first_child = None + else: + if node.first_child == None: + prenode.next_sibling = node.next_sibling + else: + prenode.next_sibling = node.first_child + lc = node.first_child + while lc.next_sibling != None: + lc = lc.next_sibling + lc.next_sibling = node.next_sibling + node.first_child = None + node.next_sibling = None + return node + + def insert_after(self, srcnode, dstnode): + assert(srcnode != None and dstnode != None) + srcnode.next_sibling = dstnode.next_sibling + dstnode.next_sibling = srcnode + return srcnode + + def insert_as_first_child(self, srcnode, dstnode): + assert(srcnode != None and dstnode != None) + srcnode.next_sibling = dstnode.first_child + dstnode.first_child = srcnode + return srcnode + + def insert_as_last_child(self, srcnode, dstnode): + assert(srcnode != None and dstnode != None) + lc = dstnode.first_child + if lc == None: + self.insert_as_first_child(srcnode, dstnode) + else: + while lc.next_sibling != None: + lc = lc.next_sibling + self.insert_after(srcnode, lc) + return srcnode + + def adjust_SQ_in_SBARQ(self, SQ, WH): + prefirst, first = self.check_ADVP(SQ, SQ.first_child) + + # SQ = VP + if first.token == 'VP': + return SQ + + # SQ = NP + VP + if (first.token == 'NP' and first.next_sibling != None + and first.next_sibling.token == 'VP' and first.next_sibling.next_sibling == None): + return SQ + + if not self.check_VB(first): + raise ValueError('First child of SQ in SBARQ is not VB*/MD') + + # process 's 're 've + if first.first_child.token == "'s": + first.first_child.token = 'is' + elif first.first_child.token == "'re": + first.first_child.token = 'are' + elif first.first_child.token == "'ve": + first.first_child.token = 'have' + + presecond, second = self.check_ADVP(first, first.next_sibling) + + # SQ = VB* + [ADVP] + if second == None: + return SQ + + # process RB(not) and auxiliary do/does/did + if second.token == 'RB' and second.first_child.token in ("n't", "not"): + if first.first_child.token == 'ca': + first.first_child.token = 'can not' + else: + first.first_child.token += ' not' + self.delete_tree(presecond, second) + presecond, second = self.check_ADVP(first, first.next_sibling) + else: + if first.first_child.token in ('do', 'does', 'did'): + first.first_child.token = '' + + # SQ = VB*+PP/ADJP/VP + if second.next_sibling == None and second.token in ('PP', 'ADJP', 'VP'): + return SQ + + # SQ = VB* + NP + # | | + # first second + if second.token == 'NP' and second.next_sibling == None: + fc = second.first_child + + # second = NP + ? 
+ # | | + # fc sc + if (fc.token == 'NP' and fc.next_sibling != None + and fc.next_sibling.next_sibling == None): + sc = fc.next_sibling + if ((sc.token == 'PP' and WH.token == 'WHADVP') + or (sc.token == 'PP' and sc.first_child.token == 'IN' + and sc.first_child.next_sibling == None) + or (sc.token == 'NP' and ' '.join(self.gather_word(fc)) == 'there') + or (sc.token == 'ADJP') + or (sc.token == 'SBAR' and sc.first_child.token == 'WHADVP')): + self.delete_node(presecond, second) + VB = self.delete_tree(prefirst, first) + self.insert_after(VB, fc) + return SQ + VB = self.delete_tree(prefirst, first) + self.insert_after(VB, second) + return SQ + + # SQ = VB* + NP + ? + # | | | + # first second third + if second.token == 'NP' and second.next_sibling != None: + prethird, third = self.check_ADVP(second, second.next_sibling) + # SQ = VB* + NP + ADVP + if third == None: + VB = self.delete_tree(prefirst, first) + self.insert_after(VB, second) + return SQ + + if third.next_sibling == None: + if ((third.token in ('ADJP', 'PP', 'NP', 'VP')) + or (third.token == 'S' + and self.tree_to_text(third).startswith('(S(VP(TO to)(VP(VB'))): + VB = self.delete_tree(prefirst, first) + self.insert_after(VB, second) + return SQ + + raise ValueError('Unknown SQ structure in SBARQ!') + + def prefix_by_to_WH(self, WH): + BY = self.Node('BY') + BY.first_child = self.Node('by') + self.insert_as_first_child(BY, WH) + return WH + + def insert_WH_into_SQ(self, WH, SQ): + if self.words[0] == 'why': + self.insert_as_last_child(WH, SQ) + return SQ + + prefirst, first = self.check_ADVP(SQ, SQ.first_child) + + if first.next_sibling == None: + # SQ = VP + if first.token == 'VP': + self.insert_as_first_child(WH, SQ) + return SQ + + # SQ = NP + if first.token == 'NP': + self.insert_after(WH, first) + return SQ + + # SQ = VB* + if self.check_VB(first): + self.insert_as_first_child(WH, SQ) + return SQ + + raise ValueError('Unknown SQ structure!') + + presecond, second = self.check_ADVP(first, first.next_sibling) + + # SQ = VB* + ADVP + if self.check_VB(first) and second == None: + self.insert_as_first_child(WH, SQ) + return SQ + + # SQ = VB* + VP/PP/ADJP + # | | + # first second + if (self.check_VB(first) and second.next_sibling == None + and second.token in ('VP', 'PP', 'ADJP')): + self.insert_as_first_child(WH, SQ) + return SQ + + prethird, third = self.check_ADVP(second, second.next_sibling) + + # SQ = NP + VB* + [ADVP] + # | | + # first second + if (first.token == 'NP' and self.check_VB(second) and + (second.next_sibling == None or third == None)): + self.insert_after(WH, second) + return SQ + + # SQ = NP + VP + # | | + # first second + if (first.token == 'NP' and second.token == 'VP' + and second.next_sibling == None): + if WH.token in ('WHNP', 'WHADJP'): #Wh-noun Phrase. who, which book, whose daughter, none of which, or how many leopards. + self.insert_as_first_child(WH, SQ) + return SQ + if WH.token == 'WHPP': #Wh-prepositional Phrase. by which + self.insert_after(WH, second) + return SQ + + if third == None: + raise ValueError('Unknown SQ structure!') + + # SQ = NP + VB* + ? + # | | | + # first second third + if first.token == 'NP' and self.check_VB(second) and third.next_sibling == None: + + # SQ = NP + VB* + VP + if third.token == 'VP': + VB = second + VP = third + while (self.check_VB(VP.first_child) and VP.first_child.next_sibling != None + and VP.first_child.next_sibling.token == 'VP'): + VB = VP.first_child + VP = VB.next_sibling + # VP = VBN + [...] 
+ # | + # fc + _, fc = self.check_ADVP(VP, VP.first_child) + if ((VB.first_child.token != '' + and VB.first_child.token.split()[0] in ('is', 'are', 'was', 'were')) + and fc.token == 'VBN'): + if WH.token == 'WHADVP' and self.words[0] == 'how': + WH = self.prefix_by_to_WH(WH) + self.insert_after(WH, VP) + return SQ + if WH.token == 'WHADVP' and self.words[0] in ('why', 'where'): + self.insert_after(WH, VP) + return SQ + # VP = VB* + # | + # fc + if self.check_VB(fc) and fc.next_sibling == None: + self.insert_after(WH, VP) + return SQ + # VP = VB* + ? + # | | + # fc sc + if (self.check_VB(fc) and fc.next_sibling != None + and fc.next_sibling.next_sibling == None): + sc = fc.next_sibling + # VP = VB* + PRT + if sc.token == 'PRT': + self.insert_after(WH, VP) + return SQ + # VP = VB* + PP + if sc.token == 'PP': + ffc = sc.first_child + if ffc.token == 'IN' and ffc.next_sibling == None: + self.insert_after(WH, VP) + return SQ + if (ffc.token == 'IN' and ffc.next_sibling != None + and ffc.next_sibling.next_sibling == None): + ssc = ffc.next_sibling + if ssc.token in ('NP', 'ADJP'): + self.insert_after(WH, fc) + return SQ + # VP = VB* + SBAR + if sc.token == 'SBAR': + if fc.first_child.token in ('know', 'think'): + if WH.token == 'WHADVP' and self.words[0] == 'how': + WH = self.prefix_by_to_WH(WH) + self.insert_after(WH, VP) + return SQ + self.insert_after(WH, VP) + return SQ + self.insert_after(WH, fc) + return SQ + # VP = VB* + S + if sc.token == 'S' and self.tree_to_text(sc).startswith('(S(VP(TO to)(VP(VB'): + VB_S = sc.first_child.first_child.next_sibling.first_child + if VB_S.next_sibling == None: + self.insert_after(WH, VP) + return SQ + if (VB_S.next_sibling.token == 'SBAR' + and VB_S.next_sibling.first_child.token == 'WHADVP'): + self.insert_after(WH, VB_S) + return SQ + self.insert_after(WH, fc) + return SQ + # VP = VB* + ADVP + if sc.token == 'ADVP': + self.insert_after(WH, fc) + return SQ + + if WH.token == 'WHADVP' and self.words[0] == 'how': + WH = self.prefix_by_to_WH(WH) + self.insert_after(WH, VP) + return SQ + self.insert_after(WH, VP) + return SQ + + # SQ = NP + VB* + NP + if third.token == 'NP': + self.insert_after(WH, third) + return SQ + # SQ = NP + VB* + S + if third.token == 'S' and self.tree_to_text(third).startswith('(S(VP(TO to)(VP(VB'): + VB_S = third.first_child.first_child.next_sibling.first_child + if VB_S.next_sibling == None and WH.token == 'WHNP': + self.insert_after(WH, VB_S) + return SQ + self.insert_after(WH, second) + return SQ + # SQ = NP + VB* + SBAR + if third.token == 'SBAR' and third.first_child.token == 'WHADVP': + self.insert_after(WH, second) + return SQ + # SQ = NP + VB* + PP + if third.token == 'PP': + self.insert_after(WH, third) + return SQ + # SQ = NP + VB* + ADJP + if third.token == 'ADJP': + if WH.token == 'WHADVP' and self.words[0] == 'how': + WH = self.prefix_by_to_WH(WH) + self.insert_after(WH, third) + return SQ + self.insert_after(WH, third) + return SQ + + raise ValueError('Unknown SQ structure!') + + def adjust_SBARQ_question(self, WH, SQ): + """ + Adjust word order of SBARQ question. + convert_WH_to_answer() -> adjust_SQ_in_SBARQ() -> insert_WH_into_SQ(). 
+ """ + #WH = self.root.first_child.first_child + #SQ = WH.next_sibling + + WH = self.convert_WH_to_answer(WH) + SQ = self.adjust_SQ_in_SBARQ(SQ, WH) + SQ = self.insert_WH_into_SQ(WH, SQ) + + self.root.first_child.first_child = SQ \ No newline at end of file diff --git a/Ranker/loadGlove.py b/Ranker/loadGlove.py new file mode 100644 index 0000000..b1e901f --- /dev/null +++ b/Ranker/loadGlove.py @@ -0,0 +1,77 @@ +import random +import os +import pickle +import math +import numpy as np +import pickle + +def loadGloVe(filename, vocab_exist=None): + vocab = [] + vocab_dict = {} + embd = [] + with open(filename, 'r') as fin: + for line in fin: + row = line.strip().split(' ') + if vocab_exist is None or row[0] in vocab_exist: + vocab.append(row[0]) + vocab_dict[row[0]] = len(vocab) - 1 + embd.append(row[1:]) + print('Loaded GloVe!') + embd = np.array(embd) + return vocab, vocab_dict, embd + +def build_vocab(dataset, pretrained_embeddings_path): + vocab = None + if os.path.isfile('{}/vocab_python2.pkl'.format(dataset)): + print('loading saved vocab...') + with open('{}/vocab_python2.pkl'.format(dataset), 'rb') as fin: + vocab = pickle.load(fin) + else: + code = int(0) + vocab = {} + vocab['UNKNOWN'] = code + code += 1 + filenames = ['{}/train.data'.format(dataset), '{}/valid.data'.format(dataset), '{}/test.data'.format(dataset)] + for filename in filenames: + for line in open(filename): + items = line.strip().split(' ') + for i in range(1, len(items)): + words = items[i].split('_') + for word in words: + if word not in vocab: + vocab[word] = code + code += 1 + embd = None + print("#vocab,",len(vocab)) + print(vocab['isotopy']) + if os.path.isfile('{}/embd_python2.pkl'.format(dataset)): + print('loading saved embd...') + with open('{}/embd_python2.pkl'.format(dataset), 'rb') as fin: + embd = pickle.load(fin) + elif len(pretrained_embeddings_path) > 0: + vocab_all, vocab_dict_all, embd_all = loadGloVe(pretrained_embeddings_path, vocab) + embd = [] + for k, v in vocab.items(): + try: + index = vocab_dict_all[k] + embd.append(embd_all[index]) + except: + embd.append(np.random.uniform(-0.05, 0.05, (embd_all.shape[1]))) + embd = np.array(embd) + return vocab, embd + +if __name__=="__main__": + vocab, embd = build_vocab("/home/xinzhu/Code/Mydata/data","/home/xinzhu/Code/model/data/data_embeddings/glove.840B.300d.txt") + output = open('/home/xinzhu/Code/Mydata/data/vocab_python2.pkl', 'wb') + pickle.dump(vocab, output) + output.close() + print("dump to vocab.pkl!") + output = open('/home/xinzhu/Code/Mydata/data/embd_python2.pkl', 'wb') + pickle.dump(embd, output) + output.close() + print("dump to embd.pkl!") + print(len(vocab)) + # L = "If we observe a pebble in a pool".lower().split() + # for i in L: + # print("find, ",i) + # print(embd[vocab[i]]) \ No newline at end of file diff --git a/Ranker/loadVocab.py b/Ranker/loadVocab.py new file mode 100644 index 0000000..fb65ea1 --- /dev/null +++ b/Ranker/loadVocab.py @@ -0,0 +1,32 @@ +# add neg_samples to total.json, get total_neg.json +import json +import random +import io + +def dump_json(data, outpath): + print ('Saving to', outpath) + with open(outpath, 'w') as out: + json.dump(data, out, indent=4, separators=(',', ': ')) + +if __name__=="__main__": + voc_file = 'data/vocab.txt' + of = io.open(voc_file,'w') + with io.open('data/total.json',encoding="utf-8") as f: + data = json.load(f) + vocab = set() + for item in data: + vocab.add(item['answer']) + for d in item['distractors']: + vocab.add(d) + for i in vocab: + of.write(i+'\n') + of.close() + 
print("load vocab done!") + results = [] + for item in data: + num = len(item['distractors']) + item['neg_samples'] = random.sample(vocab,num) + results.append(item) + + dump_json(results,"data/total_neg.json") + print("output total_neg.json done!") diff --git a/Ranker/multiCalculateFeatures.py b/Ranker/multiCalculateFeatures.py new file mode 100644 index 0000000..4576494 --- /dev/null +++ b/Ranker/multiCalculateFeatures.py @@ -0,0 +1,233 @@ +import nltk +import csv +import re +import numpy as np +import inflect +import pickle +from difflib import SequenceMatcher +from gensim.models import Word2Vec +p = inflect.engine() +model = Word2Vec.load("/home/xinzhu/Dataset/Word2Vec-on-Wikipedia-Corpus/model/word2vec_gensim") +print("Prepare Word2Vec model done!") + +prefix = '/home/xinzhu/Code/model/feature/' +infile = open(prefix+'unigram_freq.csv', mode='r') +reader = csv.reader(infile) +freq_dict = {row[0]:row[1] for row in reader} + +fin = open('/home/xinzhu/Code/model/data/mcql_processed/vocab_python2.pkl', 'rb') +vocab = pickle.load(fin) +print('loading saved vocab...') +fin.close() + +fin = open('/home/xinzhu/Code/model/data/mcql_processed/embd_python2.pkl', 'rb') +embd = pickle.load(fin) +print('loading saved embd...') +fin.close() + +cnt = 0 + +def emb_sim(a,d): + aL = a.split(' ') + dL = d.split(' ') + avec = np.array([0.0]*300) + dvec = np.array([0.0]*300) + for word in aL: + try: + emres = [float(x) for x in embd[vocab[word]]] + avec += emres + except: + pass + for word in dL: + try: + emres = [float(x) for x in embd[vocab[word]]] + dvec += emres + except: + pass + avec /= len(aL) + dvec /= len(dL) + upnum = 0 + downnum = 0 + for i in range(len(avec)): + upnum += avec[i]*dvec[i] + downnum += avec[i]*avec[i] + downnum += dvec[i]*dvec[i] + if downnum == 0: + return 0 + return upnum/downnum + +def pos_sim(a,d): +#"""POS similarity a is answer, d is distractor""" + apos = nltk.pos_tag(nltk.word_tokenize(a)) + dpos = nltk.pos_tag(nltk.word_tokenize(d)) + aset = set() + dset = set() + for tag in apos: + aset.add(tag[1]) + for tag in dpos: + dset.add(tag[1]) + M11 = len(aset & dset) + M10 = len(aset - dset) + M01 = len(dset - aset) + similarity = M11/(M11+M10+M01) + #print("POS_sim, ",similarity) + return similarity + +def edit_distance(s1, s2): +#"""levenshteinDistance""" + return nltk.edit_distance(s1,s2) + +def token_sim(s1,s2): +#""" jaccard similarity between two strings""" + aset = set(nltk.word_tokenize(s1)) + dset = set(nltk.word_tokenize(s2)) + return nltk.jaccard_distance(aset,dset) + +def length_sim(a,d): +#"""calculate a and d's character and token lengths and the difference of lengths""" + acharlen = len(a) + dcharlen = len(d) + atokenlen = len(nltk.word_tokenize(a)) + dtokenlen = len(nltk.word_tokenize(d)) + diffcharlen = abs(acharlen-dcharlen) + difftokenlen = abs(atokenlen-dtokenlen) + return [acharlen,dcharlen,atokenlen,dtokenlen,diffcharlen,difftokenlen] + +# Function to find Longest Common Sub-string +def suffix(str1,str2): + # initialize SequenceMatcher object with + # input string + seqMatch = SequenceMatcher(None,str1,str2) + # find match of longest sub-string + # output will be like Match(a=0, b=0, size=5) + match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) + # print longest substring + if (match.size!=0): + res = str1[match.a: match.a + match.size] + abs_len = len(res) + return [abs_len,float(abs_len)/len(str1),float(abs_len)/len(str2)] + else: + return [0,0.0,0.0] + +def freq(a,d): +#"""average word frequency in a and d""" + aL = a.split() + dL = 
d.split() + afreqs = [] + dfreqs = [] + for word in aL: + afreqs.append(int(freq_dict.get(word,0))) + for word in dL: + dfreqs.append(int(freq_dict.get(word,0))) + return [sum(afreqs)/len(afreqs),sum(dfreqs)/len(dfreqs)] + + +def is_plural( noun): + return p.singular_noun(noun) is not False + +def singlar_or_plural(a,d): + a = nltk.word_tokenize(a) + d = nltk.word_tokenize(d) + aflag = False + dflag = False + for x in a: + if is_plural(x): + aflag = True + for x in d: + if is_plural(x): + dflag = True + return aflag == dflag + +def num(s): +# whether numbers appear in a and d + if re.search(r'\d', s): + return True + _known = { + 'zero': 0, + 'one': 1, + 'two': 2, + 'three': 3, + 'four': 4, + 'five': 5, + 'six': 6, + 'seven': 7, + 'eight': 8, + 'nine': 9, + 'ten': 10, + 'eleven': 11, + 'twelve': 12, + 'thirteen': 13, + 'fourteen': 14, + 'fifteen': 15, + 'sixteen': 16, + 'seventeen': 17, + 'eighteen': 18, + 'nineteen': 19, + 'twenty': 20, + 'thirty': 30, + 'forty': 40, + 'fifty': 50, + 'sixty': 60, + 'seventy': 70, + 'eighty': 80, + 'ninety': 90 + } + for n in _known.keys(): + if n in s: + return True + return False + +def wiki_sim(a,d): + res = 0 + try: + res = model.similarity(a,d) + except: + pass + return res + +def cal_10_feature_vec(params): + q = params[0].replace('_',' ') + a = params[1].replace('_',' ') + d = params[2].replace('_',' ') + y = params[3] + features = [] + features.extend([emb_sim(q,d),emb_sim(a,d)]) #2 + features.append(pos_sim(a,d))#1 + features.append(edit_distance(a,d))#1 + features.extend([token_sim(q,d),token_sim(a,d),token_sim(q,a)])#3 + features.extend(length_sim(a,d)) #6 + features.extend(suffix(a,d)) #3 + features.extend(freq(a,d)) #2 + global cnt + cnt += 1 + if cnt%10000 == 0: + print(cnt) + return [features,y,q,a,d] + +def cal_26_feature_vec(params): +#"""26-dimensional feature vector""" + q = params[0].replace('_',' ') + a = params[1].replace('_',' ') + d = params[2].replace('_',' ') + y = params[3] + features = [] + features.extend([emb_sim(q,d),emb_sim(a,d)]) #2 + features.append(pos_sim(a,d)) #1 + features.append(edit_distance(a,d)) #1 + features.extend([token_sim(q,d),token_sim(a,d),token_sim(q,a)]) #3 + features.extend(length_sim(a,d)) #6 + features.extend(suffix(a,d)) #3 + features.extend(freq(a,d)) #2 + features.append(singlar_or_plural(a,d)) #1 + features.extend([int(num(a)),int(num(d))]) #2 + features.append(wiki_sim(a,d)) #1 + #print("total features, ",features) + global cnt + cnt += 1 + if cnt%10000 == 0: + print(cnt) + return [features,y,q,a,d] + +#print(singlar_or_plural("many things","here you are")) +#if __name__=="__main__": +# print(cal_10_feature_vec("Economics deals primarily with the concept of","scarcity","change")) \ No newline at end of file diff --git a/Ranker/prepareData.py b/Ranker/prepareData.py new file mode 100644 index 0000000..52455fe --- /dev/null +++ b/Ranker/prepareData.py @@ -0,0 +1,78 @@ +import json +import sys +import pandas as pd +sys.path.append('./') +sys.path.append('.') +import multi_calculate_features +import io +import random +from multiprocessing import Pool +import multiprocessing + +def extract_content(s): + index = s.find("") + if index-1 > 0: + return s[:index-1] + else: + return s + +def prepare_training_data(infile,vocfile,outfile): + voc_file = io.open(vocfile,'r',encoding='utf-8') + vocab = [] + for line in voc_file: + vocab.append(line.strip('\n').lower().replace(' ','_')) + features = [] + cpu_count = multiprocessing.cpu_count() + pool = Pool(processes=cpu_count) + params = [] + print("="*50) + f = 
io.open(infile,'r',encoding='utf-8') + L = f.readlines() + length = len(L) + print("#total, ",length) + lastques = "" + lastlabel = 1 + dislist = [] + for line in range(length): + items = L[line].strip().split(' ') + ques = extract_content(items[1]) + ans = extract_content(items[2]) + dis = extract_content(items[3]) + label = int(items[0]) + dislist.append(dis) + if ques == lastques and label == 0: + if lastlabel == 1: + sslice = random.sample(vocab,10) + for v in sslice: + while v in dislist: + v = random.sample(vocab,1)[0] + params.append([ques,ans,v,0]) + del dislist[:] + lastlabel = label + else: + continue + params.append([ques,ans,dis,label]) + lastques = ques + lastlabel = label + features = pool.map(multi_calculate_features.cal_26_feature_vec,params) + with open(outfile,'w') as t: + for i in range(len(features)): + for x in features[i][0]: + t.write(str(x)) + t.write(' ') + t.write('\t') + t.write(str(features[i][1])) + t.write('\t') + t.write(features[i][2].encode('utf-8')) + t.write('\t') + t.write(features[i][3].encode('utf-8')) + t.write('\t') + t.write(features[i][4].encode('utf-8')) + t.write('\n') + print("finish!") + +if __name__=="__main__": + prepare_training_data(\ + './data/train.data',\ + './data/vocab.txt',\ + './data/L2_train_features.txt') \ No newline at end of file diff --git a/Ranker/train.py b/Ranker/train.py new file mode 100644 index 0000000..052457b --- /dev/null +++ b/Ranker/train.py @@ -0,0 +1,176 @@ +import numpy as np +import json +import matplotlib +matplotlib.use('Agg') +from sklearn.linear_model import LogisticRegression +from sklearn import datasets +import matplotlib.pyplot as plt +from sklearn.externals import joblib +from sklearn.ensemble import RandomForestClassifier +from sklearn.svm import SVC +import argparse +import pyltr +import xgboost as xgb +from xgboost import plot_importance + + +def train_LR(X,Y,model): + #delete 16,17,4 is the best score: 0.8237416251503178 ('score:', 0.8625927837128107) + X = np.asarray(X, dtype=np.float64) + mask = [True]*X.shape[1] + mask[16] = False + mask[17] = False + mask[14] = False + #mask[1] = False + X = X[:, mask] + Y = np.asarray(Y, dtype=np.int32) + logreg = LogisticRegression(C=1.0, solver='liblinear', multi_class='ovr') + clf = logreg.fit(X, Y) + #joblib.dump(clf, "models/L1_LR_train_model.m") + print("LR score:",clf.score(X,Y)) + joblib.dump(clf,model) + scores = clf.predict_proba(X) + labels = clf.predict(X) + return scores,labels + +def train_RF(X,Y,model): + X = np.asarray(X, dtype=np.float64) + Y = np.asarray(Y, dtype=np.int32) + clf = RandomForestClassifier(n_estimators=500, max_depth=2, random_state=0) + clf.fit(X, Y) + print("feature importance,",clf.feature_importances_) + #joblib.dump(clf, "models/L1_RF_train_model.m") + joblib.dump(clf, model) +# ('feature importance,', array([7.82722123e-04, 3.39524151e-02, 9.80291102e-02, +# 1.46344023e-01, +# 2.09017260e-04, 5.37960842e-02, 2.04428460e-04, 4.39473404e-03, +# 5.12405964e-03, 5.58617809e-03, 2.12143594e-02, 4.38502621e-02, +# 8.26976723e-02, 1.46414209e-01, 1.13165187e-01, 2.41856736e-01, +# 5.99338024e-04, 1.77946434e-03])) + # n_estimators=100 max_depth = 2, score: 0.8234266735383382 + print("RF score,",clf.score(X,Y)) + scores = clf.predict_proba(X) + labels = clf.predict(X) + return scores,labels + +def train_SVM(X,Y,model): + X = np.asarray(X, dtype=np.float64) + Y = np.asarray(Y, dtype=np.int32) + clf = SVC(gamma='auto',probability=True) + clf.fit(X, Y) + #joblib.dump(clf, "models/L1_SVM_train_model.m") + joblib.dump(clf, model) + 
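The trainers above each persist the fitted classifier with joblib.dump. A minimal sketch of reloading one to rank unseen distractor candidates (hypothetical path and feature rows; assumes the binary 0/1 labels used here, with feature rows built by multiCalculateFeatures.cal_26_feature_vec):

    import numpy as np
    from sklearn.externals import joblib   # same import style as this patch (older scikit-learn)

    clf = joblib.load('models/train_model.m')

    def rank_candidates(feature_rows, candidates):
        # Sort candidate distractors by P(label=1), the probability column
        # that write_result() below reports as scores[i][1].
        X = np.asarray(feature_rows, dtype=np.float64)
        probs = clf.predict_proba(X)[:, 1]
        return sorted(zip(candidates, probs), key=lambda p: p[1], reverse=True)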
#score 0.9969936437038309 + print("SVM score,",clf.score(X,Y)) + scores = clf.predict_proba(X) + labels = clf.predict(X) + return scores,labels + +def train_LM(X,Y,model): + Tqids = 0 + metric = pyltr.metrics.NDCG(k=10) + clf = pyltr.models.LambdaMART( + metric=metric, + n_estimators=1000, + learning_rate=0.02, + max_features=0.5, + query_subsample=0.5, + max_leaf_nodes=10, + min_samples_leaf=64, + verbose=1, + ) + clf.fit(X, Y, Tqids) + Epred = clf.predict(EX) + joblib.dump(clf, model) + print 'Random ranking:', metric.calc_mean_random(Eqids, Y) + print 'Our model:', metric.calc_mean(Eqids, Y, Epred) + +def train_xgboost(X,Y,model): + params = { + 'booster': 'gbtree', + 'objective': 'multi:softmax', + 'num_class': 3, + 'gamma': 0.1, + 'max_depth': 6, + 'lambda': 2, + 'subsample': 0.7, + 'colsample_bytree': 0.7, + 'min_child_weight': 3, + 'silent': 1, + 'eta': 0.1, + 'seed': 1000, + 'nthread': 4, + } + plst = params.items() + + dtrain = xgb.DMatrix(X, Y) + num_rounds = 500 + clf = xgb.train(plst, dtrain, num_rounds) + dtrain = xgb.DMatrix(X) + ans = clf.predict(dtrain) + cnt1 = 0 + cnt2 = 0 + for i in range(len(Y)): + if ans[i] == Y[i]: + cnt1 += 1 + else: + cnt2 += 1 + print("Score: %.2f %% " % (100 * cnt1 / (cnt1 + cnt2))) + joblib.dump(clf, model) + + +def write_result(scores,labels,outfile): + outf = open(outfile,"w") + for i in range(len(ques)): + # print("scores, ", scores[i]) + # print("predict label, ", clf.predict([X[i],])) + # print("label,", Y[i]) + for x in [Y[i],"\t",labels[i],"\t",ques[i],"\t",ans[i] \ + ,"\t",dis[i],"\t",scores[i][0],"\t",scores[i][1],'\n']: + if type(x)!=type(""): + try: + outf.write('{}'.format(x)) + except: + outf.write(x.encode('utf-8')) + pass + else: + outf.write(x) + outf.close() + +if __name__=="__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--json', type=str, default="mcql_train_new.json", help='path to json') + parser.add_argument('--outfile', type=str, default="result.txt", help='path to output result') + parser.add_argument('--model', type=str, default="models/train_model.m", help='path to model') + parser.add_argument('--type', type=str, default="SVM", help='model type') + parser.add_argument('--saveresult',type=bool, default="False",help='save predict result or not') + args = parser.parse_args() + inputfile = args.json + model = args.model + model_type = args.type + outfile = args.outfile + f = open(inputfile,'r') + data = json.load(f) + X = [] + Y = [] + ques = [] + ans = [] + dis = [] + for item in data: + X.append(item[0]) + Y.append(item[1]) + ques.append(item[2]) + ans.append(item[3]) + dis.append(item[4]) + if model_type == 'SVM': + scores,labels = train_SVM(X,Y,model) + elif model_type == 'LR': + scores,labels = train_LR(X,Y,model) + elif model_type == 'RF': + scores,labels = train_RF(X,Y,model) + elif model_type == 'LM': + train_LM(X,Y,model) + else: + train_xgboost(X,Y,model) + if args.saveresult: + write_result(scores,labels,outfile) \ No newline at end of file
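Taken together, the Ranker scripts appear to form a pipeline: loadGlove.py caches the word vocabulary and GloVe embeddings, loadVocab.py adds random negative samples, multiCalculateFeatures.py turns each (question, answer, distractor) triple into a feature vector, prepareData.py runs that over the training data, and train.py fits and saves a ranking classifier. train.py's --json input is a list of [features, label, question, answer, distractor] records, i.e. exactly what cal_26_feature_vec returns. A minimal sketch of producing such a file (illustrative triples; importing multiCalculateFeatures assumes its hard-coded Word2Vec and pickle paths exist on the machine):

    import json
    from multiCalculateFeatures import cal_26_feature_vec

    triples = [
        # (question, answer, distractor, label): 1 for a real distractor, 0 for a random negative
        ("economics deals primarily with the concept of", "scarcity", "poverty", 1),
        ("economics deals primarily with the concept of", "scarcity", "change", 0),
    ]
    records = [cal_26_feature_vec(t) for t in triples]
    with open("mcql_train_new.json", "w") as out:
        # default=float converts any numpy scalars left in the feature vectors
        json.dump(records, out, indent=4, default=float)

    # then, for example:
    #   python train.py --json mcql_train_new.json --type LR --model models/train_model.m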