From f7eff699826b75a2e29b011630a2ac1401d0f7e3 Mon Sep 17 00:00:00 2001 From: Xinzhu Cai Date: Wed, 28 Aug 2019 08:57:11 -0400 Subject: [PATCH] Add files via upload --- Ranker/POSTree.py | 595 +++++++++++++++++++++++++++++++ Ranker/loadGlove.py | 77 ++++ Ranker/loadVocab.py | 32 ++ Ranker/multiCalculateFeatures.py | 233 ++++++++++++ Ranker/prepareData.py | 78 ++++ Ranker/train.py | 176 +++++++++ 6 files changed, 1191 insertions(+) create mode 100644 Ranker/POSTree.py create mode 100644 Ranker/loadGlove.py create mode 100644 Ranker/loadVocab.py create mode 100644 Ranker/multiCalculateFeatures.py create mode 100644 Ranker/prepareData.py create mode 100644 Ranker/train.py diff --git a/Ranker/POSTree.py b/Ranker/POSTree.py new file mode 100644 index 0000000..d3c89c0 --- /dev/null +++ b/Ranker/POSTree.py @@ -0,0 +1,595 @@ +from operator import itemgetter + +class POSTree(object): + + class Node(object): + def init(self, token): + self.token = token + self.first_child = None + self.next_sibling = None + + def repr(self): + return '<%s>' % (self.token,) + + def init(self, text): + """Create a Penn Treebacnk style tree from plaint text. + """ + + self.raw_text = text + self.text = text.replace('\n', '') + self.text_length = len(self.text) + self.text_pointer = 0 + self.words = [] + self.root = self.create_tree() + self.question = ' '.join(self.gather_word(self.root)) + self.VB_TAG = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'MD') + self.VB_WORD = ('do', 'does', 'can', 'could', 'would', 'should', + 'might', 'has', 'have', "'ve", 'is', "'s", 'are', "'re", 'was', 'were') + + def create_tree(self): + parent = None + token = self.next_token() + if token == '(': + token = self.next_token() + parent = self.Node(token) + parent.first_child = self.create_tree() + child = parent.first_child + if child != None: + while True: + child.next_sibling = self.create_tree() + child = child.next_sibling + if child == None: + break + elif token != ')': + parent = self.Node(token.lower()) + self.words.append(token.lower()) + + return parent + + def next_token(self): + end = self.text_pointer + while end < self.text_length and self.text[end] == ' ': + end += 1 + + if end == self.text_length: + return None + + if self.text[end] in ('(', ')'): + token = self.text[end] + end += 1 + else: + start = end + end += 1 + while end < self.text_length and self.text[end] not in ('(', ')', ' '): + end += 1 + token = self.text[start:end] + self.text_pointer = end + return token + + def first_order_traverse(self): + self.first_order_traverse(self.root) + + def first_order_traverse(self, tree): + if tree != None: + print(tree.token) + self.first_order_traverse(tree.first_child) + if tree.first_child != None: + child = tree.first_child.next_sibling + while child != None: + self.first_order_traverse(child) + child = child.next_sibling + + def delete_period(self): + child = self.root.first_child.first_child + assert(child.token != '.') + while child.next_sibling.token != '.': + child = child.next_sibling + child.next_sibling = None + + def check_PP(self, prenode, node): + while node != None and node.token in ('PP', ',', 'SBAR'): + prenode = node + node = node.next_sibling + return prenode, node + + def adjust_order(self): + try: + child = self.root.first_child + if child.token == 'FRAG' and ' '.join(self.words[:2]) == 'how many': + words = ['there', 'are', '**blank**'] + self.words[2:-1] + return ' '.join(words) + + self.delete_period() + assert(child.next_sibling == None) + if child.token == 'SQ': + self.adjust_SQ_question(child) + elif 
child.token == 'SBARQ': + prefirst = child + first = child.first_child + second = first.next_sibling + if first.token == 'SQ' and second == None: + self.adjust_SQ_question(first) + elif (first.token in ('WHADJP', 'WHNP', 'WHADVP', 'WHPP') + and second.token == 'SQ'): + WH = self.delete_tree(prefirst, first) + self.adjust_SBARQ_question(WH, second) + else: + raise ValueError('Unknown question structure!') + elif child.token == 'SBAR': + if (child.first_child.token == 'WHADJP' + and child.first_child.next_sibling.token == 'S' + and ' '.join(self.words[:2]) == 'how many'): + SQ = child.first_child.next_sibling + WH = self.delete_tree(child, child.first_child) + self.adjust_SBARQ_question(WH, SQ) + else: + raise ValueError('Unknown question structure!') + else: + raise ValueError('Unknown question structure!') + words = self.gather_word(self.root) + words = filter(lambda w: w != '', words) + statement = ' '.join(words) + except Exception as e: + if DEBUG: + print(self.question) + print(self.raw_text) + raise e + return statement + + def create_answer_node(self, before_text='', after_text=''): + node = self.Node('A') + answer = '**blank**' + if before_text != '': + answer = '%s %s' % (before_text, answer) + if after_text != '': + answer = '%s %s' % (answer, after_text) + node.first_child = self.Node(answer) + return node + + def check_VB(self, node): + if node.token in self.VB_TAG: + return True + if node.first_child.token in self.VB_WORD: + node.token = 'VB' + return True + return False + + def adjust_SQ_question(self, SQ): + VB = SQ.first_child + assert(self.check_VB(VB)) + auxiliary = VB.first_child.token + if auxiliary not in ('do', 'did', 'does'): + answer = self.create_answer_node(before_text=auxiliary) + else: + answer = self.create_answer_node() + + # move answer after first NP + NP = VB.next_sibling + while NP.token != 'NP': + NP = NP.next_sibling + self.insert_after(answer, NP) + self.delete_tree(SQ, VB) + return SQ + + def gather_word(self, tree): + words = [] + def recursor(t): + if t == None: + return + if t.first_child == None: + words.append(t.token) + else: + recursor(t.first_child) + sibling = t.first_child.next_sibling + while sibling != None: + recursor(sibling) + sibling = sibling.next_sibling + recursor(tree) + return words + + def tree_to_text(self, tree): + words = [] + def recursor(t): + if t == None: + return + if t.first_child == None: + words.append(' '+t.token) + else: + words.append('('+t.token) + recursor(t.first_child) + sibling = t.first_child.next_sibling + while sibling != None: + recursor(sibling) + sibling = sibling.next_sibling + words.append(')') + recursor(tree) + return ''.join(words) + + def convert_WH_to_answer(self, WH): + words = self.gather_word(WH) + WH_text = ' '.join(words) + if WH_text == 'how old': + WH.first_child = self.create_answer_node(after_text='years old') + elif WH_text == 'why': + WH.first_child = self.create_answer_node(before_text='because') + elif WH.token in ('WHADJP', 'WHADVP'): + WH.first_child = self.create_answer_node() + elif WH.token == 'WHNP' or WH.token == 'WHPP' and WH.first_child.next_sibling.token == 'WHNP': + parent = WH if WH.token == 'WHNP' else WH.first_child.next_sibling + first = WH.first_child + while first.token == 'WHNP': + parent = first + first = first.first_child + if first.token == 'WHADJP': + first.first_child = self.create_answer_node() + elif self.tree_to_text(parent).startswith('(WHNP(WDT what)(NN color)(NN'): + after_text = ' '.join(self.gather_word(parent)).replace('what color ', '', 1) + 
parent.first_child = self.create_answer_node(after_text=after_text) + else: + parent.first_child = self.create_answer_node() + else: + raise ValueError('Unknown WH structure!') + return WH + + def check_ADVP(self, prenode, node): + while node != None and node.token == 'ADVP': + prenode = node + node = node.next_sibling + return prenode, node + + def delete_tree(self, prenode, node): + if node == None: + return node + if prenode.first_child == node: + prenode.first_child = node.next_sibling + else: + prenode.next_sibling = node.next_sibling + node.next_sibling = None + return node + + def delete_node(self, prenode, node): + if node == None: + return node + if prenode.first_child == node: + if node.first_child == None: + prenode.first_child = node.next_sibling + else: + prenode.first_child = node.first_child + lc = node.first_child + while lc.next_sibling != None: + lc = lc.next_sibling + lc.next_sibling = node.next_sibling + node.first_child = None + else: + if node.first_child == None: + prenode.next_sibling = node.next_sibling + else: + prenode.next_sibling = node.first_child + lc = node.first_child + while lc.next_sibling != None: + lc = lc.next_sibling + lc.next_sibling = node.next_sibling + node.first_child = None + node.next_sibling = None + return node + + def insert_after(self, srcnode, dstnode): + assert(srcnode != None and dstnode != None) + srcnode.next_sibling = dstnode.next_sibling + dstnode.next_sibling = srcnode + return srcnode + + def insert_as_first_child(self, srcnode, dstnode): + assert(srcnode != None and dstnode != None) + srcnode.next_sibling = dstnode.first_child + dstnode.first_child = srcnode + return srcnode + + def insert_as_last_child(self, srcnode, dstnode): + assert(srcnode != None and dstnode != None) + lc = dstnode.first_child + if lc == None: + self.insert_as_first_child(srcnode, dstnode) + else: + while lc.next_sibling != None: + lc = lc.next_sibling + self.insert_after(srcnode, lc) + return srcnode + + def adjust_SQ_in_SBARQ(self, SQ, WH): + prefirst, first = self.check_ADVP(SQ, SQ.first_child) + + # SQ = VP + if first.token == 'VP': + return SQ + + # SQ = NP + VP + if (first.token == 'NP' and first.next_sibling != None + and first.next_sibling.token == 'VP' and first.next_sibling.next_sibling == None): + return SQ + + if not self.check_VB(first): + raise ValueError('First child of SQ in SBARQ is not VB*/MD') + + # process 's 're 've + if first.first_child.token == "'s": + first.first_child.token = 'is' + elif first.first_child.token == "'re": + first.first_child.token = 'are' + elif first.first_child.token == "'ve": + first.first_child.token = 'have' + + presecond, second = self.check_ADVP(first, first.next_sibling) + + # SQ = VB* + [ADVP] + if second == None: + return SQ + + # process RB(not) and auxiliary do/does/did + if second.token == 'RB' and second.first_child.token in ("n't", "not"): + if first.first_child.token == 'ca': + first.first_child.token = 'can not' + else: + first.first_child.token += ' not' + self.delete_tree(presecond, second) + presecond, second = self.check_ADVP(first, first.next_sibling) + else: + if first.first_child.token in ('do', 'does', 'did'): + first.first_child.token = '' + + # SQ = VB*+PP/ADJP/VP + if second.next_sibling == None and second.token in ('PP', 'ADJP', 'VP'): + return SQ + + # SQ = VB* + NP + # | | + # first second + if second.token == 'NP' and second.next_sibling == None: + fc = second.first_child + + # second = NP + ? 
+ # | | + # fc sc + if (fc.token == 'NP' and fc.next_sibling != None + and fc.next_sibling.next_sibling == None): + sc = fc.next_sibling + if ((sc.token == 'PP' and WH.token == 'WHADVP') + or (sc.token == 'PP' and sc.first_child.token == 'IN' + and sc.first_child.next_sibling == None) + or (sc.token == 'NP' and ' '.join(self.gather_word(fc)) == 'there') + or (sc.token == 'ADJP') + or (sc.token == 'SBAR' and sc.first_child.token == 'WHADVP')): + self.delete_node(presecond, second) + VB = self.delete_tree(prefirst, first) + self.insert_after(VB, fc) + return SQ + VB = self.delete_tree(prefirst, first) + self.insert_after(VB, second) + return SQ + + # SQ = VB* + NP + ? + # | | | + # first second third + if second.token == 'NP' and second.next_sibling != None: + prethird, third = self.check_ADVP(second, second.next_sibling) + # SQ = VB* + NP + ADVP + if third == None: + VB = self.delete_tree(prefirst, first) + self.insert_after(VB, second) + return SQ + + if third.next_sibling == None: + if ((third.token in ('ADJP', 'PP', 'NP', 'VP')) + or (third.token == 'S' + and self.tree_to_text(third).startswith('(S(VP(TO to)(VP(VB'))): + VB = self.delete_tree(prefirst, first) + self.insert_after(VB, second) + return SQ + + raise ValueError('Unknown SQ structure in SBARQ!') + + def prefix_by_to_WH(self, WH): + BY = self.Node('BY') + BY.first_child = self.Node('by') + self.insert_as_first_child(BY, WH) + return WH + + def insert_WH_into_SQ(self, WH, SQ): + if self.words[0] == 'why': + self.insert_as_last_child(WH, SQ) + return SQ + + prefirst, first = self.check_ADVP(SQ, SQ.first_child) + + if first.next_sibling == None: + # SQ = VP + if first.token == 'VP': + self.insert_as_first_child(WH, SQ) + return SQ + + # SQ = NP + if first.token == 'NP': + self.insert_after(WH, first) + return SQ + + # SQ = VB* + if self.check_VB(first): + self.insert_as_first_child(WH, SQ) + return SQ + + raise ValueError('Unknown SQ structure!') + + presecond, second = self.check_ADVP(first, first.next_sibling) + + # SQ = VB* + ADVP + if self.check_VB(first) and second == None: + self.insert_as_first_child(WH, SQ) + return SQ + + # SQ = VB* + VP/PP/ADJP + # | | + # first second + if (self.check_VB(first) and second.next_sibling == None + and second.token in ('VP', 'PP', 'ADJP')): + self.insert_as_first_child(WH, SQ) + return SQ + + prethird, third = self.check_ADVP(second, second.next_sibling) + + # SQ = NP + VB* + [ADVP] + # | | + # first second + if (first.token == 'NP' and self.check_VB(second) and + (second.next_sibling == None or third == None)): + self.insert_after(WH, second) + return SQ + + # SQ = NP + VP + # | | + # first second + if (first.token == 'NP' and second.token == 'VP' + and second.next_sibling == None): + if WH.token in ('WHNP', 'WHADJP'): #Wh-noun Phrase. who, which book, whose daughter, none of which, or how many leopards. + self.insert_as_first_child(WH, SQ) + return SQ + if WH.token == 'WHPP': #Wh-prepositional Phrase. by which + self.insert_after(WH, second) + return SQ + + if third == None: + raise ValueError('Unknown SQ structure!') + + # SQ = NP + VB* + ? + # | | | + # first second third + if first.token == 'NP' and self.check_VB(second) and third.next_sibling == None: + + # SQ = NP + VB* + VP + if third.token == 'VP': + VB = second + VP = third + while (self.check_VB(VP.first_child) and VP.first_child.next_sibling != None + and VP.first_child.next_sibling.token == 'VP'): + VB = VP.first_child + VP = VB.next_sibling + # VP = VBN + [...] 
+ # | + # fc + _, fc = self.check_ADVP(VP, VP.first_child) + if ((VB.first_child.token != '' + and VB.first_child.token.split()[0] in ('is', 'are', 'was', 'were')) + and fc.token == 'VBN'): + if WH.token == 'WHADVP' and self.words[0] == 'how': + WH = self.prefix_by_to_WH(WH) + self.insert_after(WH, VP) + return SQ + if WH.token == 'WHADVP' and self.words[0] in ('why', 'where'): + self.insert_after(WH, VP) + return SQ + # VP = VB* + # | + # fc + if self.check_VB(fc) and fc.next_sibling == None: + self.insert_after(WH, VP) + return SQ + # VP = VB* + ? + # | | + # fc sc + if (self.check_VB(fc) and fc.next_sibling != None + and fc.next_sibling.next_sibling == None): + sc = fc.next_sibling + # VP = VB* + PRT + if sc.token == 'PRT': + self.insert_after(WH, VP) + return SQ + # VP = VB* + PP + if sc.token == 'PP': + ffc = sc.first_child + if ffc.token == 'IN' and ffc.next_sibling == None: + self.insert_after(WH, VP) + return SQ + if (ffc.token == 'IN' and ffc.next_sibling != None + and ffc.next_sibling.next_sibling == None): + ssc = ffc.next_sibling + if ssc.token in ('NP', 'ADJP'): + self.insert_after(WH, fc) + return SQ + # VP = VB* + SBAR + if sc.token == 'SBAR': + if fc.first_child.token in ('know', 'think'): + if WH.token == 'WHADVP' and self.words[0] == 'how': + WH = self.prefix_by_to_WH(WH) + self.insert_after(WH, VP) + return SQ + self.insert_after(WH, VP) + return SQ + self.insert_after(WH, fc) + return SQ + # VP = VB* + S + if sc.token == 'S' and self.tree_to_text(sc).startswith('(S(VP(TO to)(VP(VB'): + VB_S = sc.first_child.first_child.next_sibling.first_child + if VB_S.next_sibling == None: + self.insert_after(WH, VP) + return SQ + if (VB_S.next_sibling.token == 'SBAR' + and VB_S.next_sibling.first_child.token == 'WHADVP'): + self.insert_after(WH, VB_S) + return SQ + self.insert_after(WH, fc) + return SQ + # VP = VB* + ADVP + if sc.token == 'ADVP': + self.insert_after(WH, fc) + return SQ + + if WH.token == 'WHADVP' and self.words[0] == 'how': + WH = self.prefix_by_to_WH(WH) + self.insert_after(WH, VP) + return SQ + self.insert_after(WH, VP) + return SQ + + # SQ = NP + VB* + NP + if third.token == 'NP': + self.insert_after(WH, third) + return SQ + # SQ = NP + VB* + S + if third.token == 'S' and self.tree_to_text(third).startswith('(S(VP(TO to)(VP(VB'): + VB_S = third.first_child.first_child.next_sibling.first_child + if VB_S.next_sibling == None and WH.token == 'WHNP': + self.insert_after(WH, VB_S) + return SQ + self.insert_after(WH, second) + return SQ + # SQ = NP + VB* + SBAR + if third.token == 'SBAR' and third.first_child.token == 'WHADVP': + self.insert_after(WH, second) + return SQ + # SQ = NP + VB* + PP + if third.token == 'PP': + self.insert_after(WH, third) + return SQ + # SQ = NP + VB* + ADJP + if third.token == 'ADJP': + if WH.token == 'WHADVP' and self.words[0] == 'how': + WH = self.prefix_by_to_WH(WH) + self.insert_after(WH, third) + return SQ + self.insert_after(WH, third) + return SQ + + raise ValueError('Unknown SQ structure!') + + def adjust_SBARQ_question(self, WH, SQ): + """ + Adjust word order of SBARQ question. + convert_WH_to_answer() -> adjust_SQ_in_SBARQ() -> insert_WH_into_SQ(). 
+ """ + #WH = self.root.first_child.first_child + #SQ = WH.next_sibling + + WH = self.convert_WH_to_answer(WH) + SQ = self.adjust_SQ_in_SBARQ(SQ, WH) + SQ = self.insert_WH_into_SQ(WH, SQ) + + self.root.first_child.first_child = SQ \ No newline at end of file diff --git a/Ranker/loadGlove.py b/Ranker/loadGlove.py new file mode 100644 index 0000000..b1e901f --- /dev/null +++ b/Ranker/loadGlove.py @@ -0,0 +1,77 @@ +import random +import os +import pickle +import math +import numpy as np +import pickle + +def loadGloVe(filename, vocab_exist=None): + vocab = [] + vocab_dict = {} + embd = [] + with open(filename, 'r') as fin: + for line in fin: + row = line.strip().split(' ') + if vocab_exist is None or row[0] in vocab_exist: + vocab.append(row[0]) + vocab_dict[row[0]] = len(vocab) - 1 + embd.append(row[1:]) + print('Loaded GloVe!') + embd = np.array(embd) + return vocab, vocab_dict, embd + +def build_vocab(dataset, pretrained_embeddings_path): + vocab = None + if os.path.isfile('{}/vocab_python2.pkl'.format(dataset)): + print('loading saved vocab...') + with open('{}/vocab_python2.pkl'.format(dataset), 'rb') as fin: + vocab = pickle.load(fin) + else: + code = int(0) + vocab = {} + vocab['UNKNOWN'] = code + code += 1 + filenames = ['{}/train.data'.format(dataset), '{}/valid.data'.format(dataset), '{}/test.data'.format(dataset)] + for filename in filenames: + for line in open(filename): + items = line.strip().split(' ') + for i in range(1, len(items)): + words = items[i].split('_') + for word in words: + if word not in vocab: + vocab[word] = code + code += 1 + embd = None + print("#vocab,",len(vocab)) + print(vocab['isotopy']) + if os.path.isfile('{}/embd_python2.pkl'.format(dataset)): + print('loading saved embd...') + with open('{}/embd_python2.pkl'.format(dataset), 'rb') as fin: + embd = pickle.load(fin) + elif len(pretrained_embeddings_path) > 0: + vocab_all, vocab_dict_all, embd_all = loadGloVe(pretrained_embeddings_path, vocab) + embd = [] + for k, v in vocab.items(): + try: + index = vocab_dict_all[k] + embd.append(embd_all[index]) + except: + embd.append(np.random.uniform(-0.05, 0.05, (embd_all.shape[1]))) + embd = np.array(embd) + return vocab, embd + +if __name__=="__main__": + vocab, embd = build_vocab("/home/xinzhu/Code/Mydata/data","/home/xinzhu/Code/model/data/data_embeddings/glove.840B.300d.txt") + output = open('/home/xinzhu/Code/Mydata/data/vocab_python2.pkl', 'wb') + pickle.dump(vocab, output) + output.close() + print("dump to vocab.pkl!") + output = open('/home/xinzhu/Code/Mydata/data/embd_python2.pkl', 'wb') + pickle.dump(embd, output) + output.close() + print("dump to embd.pkl!") + print(len(vocab)) + # L = "If we observe a pebble in a pool".lower().split() + # for i in L: + # print("find, ",i) + # print(embd[vocab[i]]) \ No newline at end of file diff --git a/Ranker/loadVocab.py b/Ranker/loadVocab.py new file mode 100644 index 0000000..fb65ea1 --- /dev/null +++ b/Ranker/loadVocab.py @@ -0,0 +1,32 @@ +# add neg_samples to total.json, get total_neg.json +import json +import random +import io + +def dump_json(data, outpath): + print ('Saving to', outpath) + with open(outpath, 'w') as out: + json.dump(data, out, indent=4, separators=(',', ': ')) + +if __name__=="__main__": + voc_file = 'data/vocab.txt' + of = io.open(voc_file,'w') + with io.open('data/total.json',encoding="utf-8") as f: + data = json.load(f) + vocab = set() + for item in data: + vocab.add(item['answer']) + for d in item['distractors']: + vocab.add(d) + for i in vocab: + of.write(i+'\n') + of.close() + 
print("load vocab done!") + results = [] + for item in data: + num = len(item['distractors']) + item['neg_samples'] = random.sample(vocab,num) + results.append(item) + + dump_json(results,"data/total_neg.json") + print("output total_neg.json done!") diff --git a/Ranker/multiCalculateFeatures.py b/Ranker/multiCalculateFeatures.py new file mode 100644 index 0000000..4576494 --- /dev/null +++ b/Ranker/multiCalculateFeatures.py @@ -0,0 +1,233 @@ +import nltk +import csv +import re +import numpy as np +import inflect +import pickle +from difflib import SequenceMatcher +from gensim.models import Word2Vec +p = inflect.engine() +model = Word2Vec.load("/home/xinzhu/Dataset/Word2Vec-on-Wikipedia-Corpus/model/word2vec_gensim") +print("Prepare Word2Vec model done!") + +prefix = '/home/xinzhu/Code/model/feature/' +infile = open(prefix+'unigram_freq.csv', mode='r') +reader = csv.reader(infile) +freq_dict = {row[0]:row[1] for row in reader} + +fin = open('/home/xinzhu/Code/model/data/mcql_processed/vocab_python2.pkl', 'rb') +vocab = pickle.load(fin) +print('loading saved vocab...') +fin.close() + +fin = open('/home/xinzhu/Code/model/data/mcql_processed/embd_python2.pkl', 'rb') +embd = pickle.load(fin) +print('loading saved embd...') +fin.close() + +cnt = 0 + +def emb_sim(a,d): + aL = a.split(' ') + dL = d.split(' ') + avec = np.array([0.0]*300) + dvec = np.array([0.0]*300) + for word in aL: + try: + emres = [float(x) for x in embd[vocab[word]]] + avec += emres + except: + pass + for word in dL: + try: + emres = [float(x) for x in embd[vocab[word]]] + dvec += emres + except: + pass + avec /= len(aL) + dvec /= len(dL) + upnum = 0 + downnum = 0 + for i in range(len(avec)): + upnum += avec[i]*dvec[i] + downnum += avec[i]*avec[i] + downnum += dvec[i]*dvec[i] + if downnum == 0: + return 0 + return upnum/downnum + +def pos_sim(a,d): +#"""POS similarity a is answer, d is distractor""" + apos = nltk.pos_tag(nltk.word_tokenize(a)) + dpos = nltk.pos_tag(nltk.word_tokenize(d)) + aset = set() + dset = set() + for tag in apos: + aset.add(tag[1]) + for tag in dpos: + dset.add(tag[1]) + M11 = len(aset & dset) + M10 = len(aset - dset) + M01 = len(dset - aset) + similarity = M11/(M11+M10+M01) + #print("POS_sim, ",similarity) + return similarity + +def edit_distance(s1, s2): +#"""levenshteinDistance""" + return nltk.edit_distance(s1,s2) + +def token_sim(s1,s2): +#""" jaccard similarity between two strings""" + aset = set(nltk.word_tokenize(s1)) + dset = set(nltk.word_tokenize(s2)) + return nltk.jaccard_distance(aset,dset) + +def length_sim(a,d): +#"""calculate a and d's character and token lengths and the difference of lengths""" + acharlen = len(a) + dcharlen = len(d) + atokenlen = len(nltk.word_tokenize(a)) + dtokenlen = len(nltk.word_tokenize(d)) + diffcharlen = abs(acharlen-dcharlen) + difftokenlen = abs(atokenlen-dtokenlen) + return [acharlen,dcharlen,atokenlen,dtokenlen,diffcharlen,difftokenlen] + +# Function to find Longest Common Sub-string +def suffix(str1,str2): + # initialize SequenceMatcher object with + # input string + seqMatch = SequenceMatcher(None,str1,str2) + # find match of longest sub-string + # output will be like Match(a=0, b=0, size=5) + match = seqMatch.find_longest_match(0, len(str1), 0, len(str2)) + # print longest substring + if (match.size!=0): + res = str1[match.a: match.a + match.size] + abs_len = len(res) + return [abs_len,float(abs_len)/len(str1),float(abs_len)/len(str2)] + else: + return [0,0.0,0.0] + +def freq(a,d): +#"""average word frequency in a and d""" + aL = a.split() + dL = 
d.split() + afreqs = [] + dfreqs = [] + for word in aL: + afreqs.append(int(freq_dict.get(word,0))) + for word in dL: + dfreqs.append(int(freq_dict.get(word,0))) + return [sum(afreqs)/len(afreqs),sum(dfreqs)/len(dfreqs)] + + +def is_plural( noun): + return p.singular_noun(noun) is not False + +def singlar_or_plural(a,d): + a = nltk.word_tokenize(a) + d = nltk.word_tokenize(d) + aflag = False + dflag = False + for x in a: + if is_plural(x): + aflag = True + for x in d: + if is_plural(x): + dflag = True + return aflag == dflag + +def num(s): +# whether numbers appear in a and d + if re.search(r'\d', s): + return True + _known = { + 'zero': 0, + 'one': 1, + 'two': 2, + 'three': 3, + 'four': 4, + 'five': 5, + 'six': 6, + 'seven': 7, + 'eight': 8, + 'nine': 9, + 'ten': 10, + 'eleven': 11, + 'twelve': 12, + 'thirteen': 13, + 'fourteen': 14, + 'fifteen': 15, + 'sixteen': 16, + 'seventeen': 17, + 'eighteen': 18, + 'nineteen': 19, + 'twenty': 20, + 'thirty': 30, + 'forty': 40, + 'fifty': 50, + 'sixty': 60, + 'seventy': 70, + 'eighty': 80, + 'ninety': 90 + } + for n in _known.keys(): + if n in s: + return True + return False + +def wiki_sim(a,d): + res = 0 + try: + res = model.similarity(a,d) + except: + pass + return res + +def cal_10_feature_vec(params): + q = params[0].replace('_',' ') + a = params[1].replace('_',' ') + d = params[2].replace('_',' ') + y = params[3] + features = [] + features.extend([emb_sim(q,d),emb_sim(a,d)]) #2 + features.append(pos_sim(a,d))#1 + features.append(edit_distance(a,d))#1 + features.extend([token_sim(q,d),token_sim(a,d),token_sim(q,a)])#3 + features.extend(length_sim(a,d)) #6 + features.extend(suffix(a,d)) #3 + features.extend(freq(a,d)) #2 + global cnt + cnt += 1 + if cnt%10000 == 0: + print(cnt) + return [features,y,q,a,d] + +def cal_26_feature_vec(params): +#"""26-dimensional feature vector""" + q = params[0].replace('_',' ') + a = params[1].replace('_',' ') + d = params[2].replace('_',' ') + y = params[3] + features = [] + features.extend([emb_sim(q,d),emb_sim(a,d)]) #2 + features.append(pos_sim(a,d)) #1 + features.append(edit_distance(a,d)) #1 + features.extend([token_sim(q,d),token_sim(a,d),token_sim(q,a)]) #3 + features.extend(length_sim(a,d)) #6 + features.extend(suffix(a,d)) #3 + features.extend(freq(a,d)) #2 + features.append(singlar_or_plural(a,d)) #1 + features.extend([int(num(a)),int(num(d))]) #2 + features.append(wiki_sim(a,d)) #1 + #print("total features, ",features) + global cnt + cnt += 1 + if cnt%10000 == 0: + print(cnt) + return [features,y,q,a,d] + +#print(singlar_or_plural("many things","here you are")) +#if __name__=="__main__": +# print(cal_10_feature_vec("Economics deals primarily with the concept of","scarcity","change")) \ No newline at end of file diff --git a/Ranker/prepareData.py b/Ranker/prepareData.py new file mode 100644 index 0000000..52455fe --- /dev/null +++ b/Ranker/prepareData.py @@ -0,0 +1,78 @@ +import json +import sys +import pandas as pd +sys.path.append('./') +sys.path.append('.') +import multi_calculate_features +import io +import random +from multiprocessing import Pool +import multiprocessing + +def extract_content(s): + index = s.find("") + if index-1 > 0: + return s[:index-1] + else: + return s + +def prepare_training_data(infile,vocfile,outfile): + voc_file = io.open(vocfile,'r',encoding='utf-8') + vocab = [] + for line in voc_file: + vocab.append(line.strip('\n').lower().replace(' ','_')) + features = [] + cpu_count = multiprocessing.cpu_count() + pool = Pool(processes=cpu_count) + params = [] + print("="*50) + f = 
io.open(infile,'r',encoding='utf-8') + L = f.readlines() + length = len(L) + print("#total, ",length) + lastques = "" + lastlabel = 1 + dislist = [] + for line in range(length): + items = L[line].strip().split(' ') + ques = extract_content(items[1]) + ans = extract_content(items[2]) + dis = extract_content(items[3]) + label = int(items[0]) + dislist.append(dis) + if ques == lastques and label == 0: + if lastlabel == 1: + sslice = random.sample(vocab,10) + for v in sslice: + while v in dislist: + v = random.sample(vocab,1)[0] + params.append([ques,ans,v,0]) + del dislist[:] + lastlabel = label + else: + continue + params.append([ques,ans,dis,label]) + lastques = ques + lastlabel = label + features = pool.map(multi_calculate_features.cal_26_feature_vec,params) + with open(outfile,'w') as t: + for i in range(len(features)): + for x in features[i][0]: + t.write(str(x)) + t.write(' ') + t.write('\t') + t.write(str(features[i][1])) + t.write('\t') + t.write(features[i][2].encode('utf-8')) + t.write('\t') + t.write(features[i][3].encode('utf-8')) + t.write('\t') + t.write(features[i][4].encode('utf-8')) + t.write('\n') + print("finish!") + +if __name__=="__main__": + prepare_training_data(\ + './data/train.data',\ + './data/vocab.txt',\ + './data/L2_train_features.txt') \ No newline at end of file diff --git a/Ranker/train.py b/Ranker/train.py new file mode 100644 index 0000000..052457b --- /dev/null +++ b/Ranker/train.py @@ -0,0 +1,176 @@ +import numpy as np +import json +import matplotlib +matplotlib.use('Agg') +from sklearn.linear_model import LogisticRegression +from sklearn import datasets +import matplotlib.pyplot as plt +from sklearn.externals import joblib +from sklearn.ensemble import RandomForestClassifier +from sklearn.svm import SVC +import argparse +import pyltr +import xgboost as xgb +from xgboost import plot_importance + + +def train_LR(X,Y,model): + #delete 16,17,4 is the best score: 0.8237416251503178 ('score:', 0.8625927837128107) + X = np.asarray(X, dtype=np.float64) + mask = [True]*X.shape[1] + mask[16] = False + mask[17] = False + mask[14] = False + #mask[1] = False + X = X[:, mask] + Y = np.asarray(Y, dtype=np.int32) + logreg = LogisticRegression(C=1.0, solver='liblinear', multi_class='ovr') + clf = logreg.fit(X, Y) + #joblib.dump(clf, "models/L1_LR_train_model.m") + print("LR score:",clf.score(X,Y)) + joblib.dump(clf,model) + scores = clf.predict_proba(X) + labels = clf.predict(X) + return scores,labels + +def train_RF(X,Y,model): + X = np.asarray(X, dtype=np.float64) + Y = np.asarray(Y, dtype=np.int32) + clf = RandomForestClassifier(n_estimators=500, max_depth=2, random_state=0) + clf.fit(X, Y) + print("feature importance,",clf.feature_importances_) + #joblib.dump(clf, "models/L1_RF_train_model.m") + joblib.dump(clf, model) +# ('feature importance,', array([7.82722123e-04, 3.39524151e-02, 9.80291102e-02, +# 1.46344023e-01, +# 2.09017260e-04, 5.37960842e-02, 2.04428460e-04, 4.39473404e-03, +# 5.12405964e-03, 5.58617809e-03, 2.12143594e-02, 4.38502621e-02, +# 8.26976723e-02, 1.46414209e-01, 1.13165187e-01, 2.41856736e-01, +# 5.99338024e-04, 1.77946434e-03])) + # n_estimators=100 max_depth = 2, score: 0.8234266735383382 + print("RF score,",clf.score(X,Y)) + scores = clf.predict_proba(X) + labels = clf.predict(X) + return scores,labels + +def train_SVM(X,Y,model): + X = np.asarray(X, dtype=np.float64) + Y = np.asarray(Y, dtype=np.int32) + clf = SVC(gamma='auto',probability=True) + clf.fit(X, Y) + #joblib.dump(clf, "models/L1_SVM_train_model.m") + joblib.dump(clf, model) + 
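The trainers above each persist the fitted classifier with joblib.dump. A minimal sketch of reloading one to rank unseen distractor candidates (hypothetical path and feature rows; assumes the binary 0/1 labels used here, with feature rows built by multiCalculateFeatures.cal_26_feature_vec):

    import numpy as np
    from sklearn.externals import joblib   # same import style as this patch (older scikit-learn)

    clf = joblib.load('models/train_model.m')

    def rank_candidates(feature_rows, candidates):
        # Sort candidate distractors by P(label=1), the probability column
        # that write_result() below reports as scores[i][1].
        X = np.asarray(feature_rows, dtype=np.float64)
        probs = clf.predict_proba(X)[:, 1]
        return sorted(zip(candidates, probs), key=lambda p: p[1], reverse=True)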
#score 0.9969936437038309 + print("SVM score,",clf.score(X,Y)) + scores = clf.predict_proba(X) + labels = clf.predict(X) + return scores,labels + +def train_LM(X,Y,model): + Tqids = 0 + metric = pyltr.metrics.NDCG(k=10) + clf = pyltr.models.LambdaMART( + metric=metric, + n_estimators=1000, + learning_rate=0.02, + max_features=0.5, + query_subsample=0.5, + max_leaf_nodes=10, + min_samples_leaf=64, + verbose=1, + ) + clf.fit(X, Y, Tqids) + Epred = clf.predict(EX) + joblib.dump(clf, model) + print 'Random ranking:', metric.calc_mean_random(Eqids, Y) + print 'Our model:', metric.calc_mean(Eqids, Y, Epred) + +def train_xgboost(X,Y,model): + params = { + 'booster': 'gbtree', + 'objective': 'multi:softmax', + 'num_class': 3, + 'gamma': 0.1, + 'max_depth': 6, + 'lambda': 2, + 'subsample': 0.7, + 'colsample_bytree': 0.7, + 'min_child_weight': 3, + 'silent': 1, + 'eta': 0.1, + 'seed': 1000, + 'nthread': 4, + } + plst = params.items() + + dtrain = xgb.DMatrix(X, Y) + num_rounds = 500 + clf = xgb.train(plst, dtrain, num_rounds) + dtrain = xgb.DMatrix(X) + ans = clf.predict(dtrain) + cnt1 = 0 + cnt2 = 0 + for i in range(len(Y)): + if ans[i] == Y[i]: + cnt1 += 1 + else: + cnt2 += 1 + print("Score: %.2f %% " % (100 * cnt1 / (cnt1 + cnt2))) + joblib.dump(clf, model) + + +def write_result(scores,labels,outfile): + outf = open(outfile,"w") + for i in range(len(ques)): + # print("scores, ", scores[i]) + # print("predict label, ", clf.predict([X[i],])) + # print("label,", Y[i]) + for x in [Y[i],"\t",labels[i],"\t",ques[i],"\t",ans[i] \ + ,"\t",dis[i],"\t",scores[i][0],"\t",scores[i][1],'\n']: + if type(x)!=type(""): + try: + outf.write('{}'.format(x)) + except: + outf.write(x.encode('utf-8')) + pass + else: + outf.write(x) + outf.close() + +if __name__=="__main__": + parser = argparse.ArgumentParser() + parser.add_argument('--json', type=str, default="mcql_train_new.json", help='path to json') + parser.add_argument('--outfile', type=str, default="result.txt", help='path to output result') + parser.add_argument('--model', type=str, default="models/train_model.m", help='path to model') + parser.add_argument('--type', type=str, default="SVM", help='model type') + parser.add_argument('--saveresult',type=bool, default="False",help='save predict result or not') + args = parser.parse_args() + inputfile = args.json + model = args.model + model_type = args.type + outfile = args.outfile + f = open(inputfile,'r') + data = json.load(f) + X = [] + Y = [] + ques = [] + ans = [] + dis = [] + for item in data: + X.append(item[0]) + Y.append(item[1]) + ques.append(item[2]) + ans.append(item[3]) + dis.append(item[4]) + if model_type == 'SVM': + scores,labels = train_SVM(X,Y,model) + elif model_type == 'LR': + scores,labels = train_LR(X,Y,model) + elif model_type == 'RF': + scores,labels = train_RF(X,Y,model) + elif model_type == 'LM': + train_LM(X,Y,model) + else: + train_xgboost(X,Y,model) + if args.saveresult: + write_result(scores,labels,outfile) \ No newline at end of file
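Taken together, the Ranker scripts appear to form a pipeline: loadGlove.py caches the word vocabulary and GloVe embeddings, loadVocab.py adds random negative samples, multiCalculateFeatures.py turns each (question, answer, distractor) triple into a feature vector, prepareData.py runs that over the training data, and train.py fits and saves a ranking classifier. train.py's --json input is a list of [features, label, question, answer, distractor] records, i.e. exactly what cal_26_feature_vec returns. A minimal sketch of producing such a file (illustrative triples; importing multiCalculateFeatures assumes its hard-coded Word2Vec and pickle paths exist on the machine):

    import json
    from multiCalculateFeatures import cal_26_feature_vec

    triples = [
        # (question, answer, distractor, label): 1 for a real distractor, 0 for a random negative
        ("economics deals primarily with the concept of", "scarcity", "poverty", 1),
        ("economics deals primarily with the concept of", "scarcity", "change", 0),
    ]
    records = [cal_26_feature_vec(t) for t in triples]
    with open("mcql_train_new.json", "w") as out:
        # default=float converts any numpy scalars left in the feature vectors
        json.dump(records, out, indent=4, default=float)

    # then, for example:
    #   python train.py --json mcql_train_new.json --type LR --model models/train_model.m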