Commit 38a98c5 (1 parent: ded1532)
Showing 7 changed files with 1,719 additions and 0 deletions.
@@ -0,0 +1,124 @@
import itertools
import os

import gensim
from gensim import corpora
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import smart_open, simple_preprocess
from stop_words import get_stop_words


def tokenize(text):
    """
    Preprocess and then tokenize a given text.
    :param text: the text which should be tokenized.
    :return: the tokens of the given text, after preprocessing.
    """
    return [token for token in simple_preprocess(text) if token not in STOPWORDS]


def iter_over_dump_file(dump_file, min_length_of_article=50, ignore_namespaces=None):
    """
    Iterator over a wiki dump file.
    Yields title and tokens for the next article in the dump file.
    Ignores short articles.
    Ignores meta articles, based on the given namespaces.
    Default namespaces are 'Wikipedia', 'Category', 'File', 'Portal', 'Template', 'MediaWiki', 'User', 'Help', 'Book', 'Draft'.
    :param dump_file: the dump file
    :param min_length_of_article: the minimum number of words in an article. Default = 50
    :param ignore_namespaces: list of namespaces which should be ignored.
    :return: title, tokens
    """
    if ignore_namespaces is None:
        ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        text = filter_wiki(text)
        tokens = tokenize(text)
        if len(tokens) < min_length_of_article or any(
                title.startswith(namespace + ':') for namespace in ignore_namespaces):
            continue  # ignore short articles and various meta-articles
        yield title, tokens
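

# Hypothetical helper, not part of the original file: a quick way to eyeball the
# (title, tokens) stream produced by iter_over_dump_file. The dump path is whatever
# the caller passes in (e.g. a bz2-compressed Wikipedia dump).
def preview_dump(dump_file, n=3):
    """Print the titles and first tokens of the first `n` articles in `dump_file`."""
    for title, tokens in itertools.islice(iter_over_dump_file(dump_file), n):
        print(title, tokens[:10])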


class LDA:
    def __init__(self):
        self.stop_words = get_stop_words('en')

    def load(self, model_file):
        """
        Loads an LDA model from a given file.
        :param model_file: the file which contains the model that should be loaded
        """
        from gensim.models.ldamodel import LdaModel
        # self.ldamodel = LdaModel.load(model_file)
        self.ldamodel = gensim.models.ldamulticore.LdaMulticore.load(model_file)
        # print(self.ldamodel.print_topics(num_topics=100))

        # self.ldamodel = gensim.models.wrappers.LdaMallet.load(model_file)
        # from gensim.models.wrappers.ldamallet import malletmodel2ldamodel
        # self.ldamodel.show_topics(num_topics=5, num_words=10)
        # self.ldamodel = malletmodel2ldamodel(self.ldamodel)
        # print(self.ldamodel.__dict__)

    def generate_bow_of_dump_file(self, dump_file, bow_output_file, dict_output_file):
        """
        Builds a dictionary and a bag-of-words corpus from a dump file and saves both.
        :param dump_file: the dump file
        :param bow_output_file: the file in which the bag-of-words corpus should be saved
        :param dict_output_file: the file in which the dictionary should be saved
        """
        doc_stream = (tokens for _, tokens in iter_over_dump_file(dump_file))
        id2word_dict = gensim.corpora.Dictionary(doc_stream)  # obtain: (word_id: word)
        print(id2word_dict)
        # keep words that appear in at least 20 documents and in no more than 10% of all documents
        id2word_dict.filter_extremes(no_below=20, no_above=0.1, keep_n=250000)
        print(id2word_dict)
        dump_corpus = DumpCorpus(dump_file, id2word_dict)  # from dictionary to bag of words
        print("save bow...")
        # Iterate through the document stream, saving the documents to the output file
        # and recording the byte offset of each document.
        gensim.corpora.MmCorpus.serialize(bow_output_file, dump_corpus)
        print("save dict")
        id2word_dict.save(dict_output_file)

    def train_on_dump_file(self, num_topics, bow_path, dict_path, model_outputfile, training_iterations=20,
                           max_docs=None):
        """
        Trains a new LDA model based on a Wikipedia dump or any other dump in the same format.
        The dump may be zipped.
        :param num_topics: the number of topics which should be generated
        :param bow_path: the path incl. filename where the bag-of-words corpus is stored
        :param dict_path: the path incl. filename where the dictionary is stored
        :param model_outputfile: the file in which the trained model should be stored
        :param training_iterations: the number of LDA training passes
        :param max_docs: how many docs should be used for training; if None, all docs are used
        """
        print("load bow...")
        mm_corpus = gensim.corpora.MmCorpus(bow_path)
        print("load dict...")
        id2word_dict = gensim.corpora.Dictionary.load(dict_path)
        clipped_corpus = gensim.utils.ClippedCorpus(mm_corpus, max_docs)
        print("start training")
        # train LDA on the bag-of-words corpus
        self.ldamodel = gensim.models.ldamulticore.LdaMulticore(clipped_corpus, num_topics=num_topics,
                                                                id2word=id2word_dict, passes=training_iterations,
                                                                minimum_probability=0)
        print("save model")
        self.ldamodel.save(model_outputfile)


class DumpCorpus(object):
    def __init__(self, dump_file, dictionary, clip_docs=None):
        """
        Parse the first `clip_docs` documents from file `dump_file`.
        Yield each document in turn, as a list of tokens (unicode strings).
        """
        self.dump_file = dump_file
        self.dictionary = dictionary
        self.clip_docs = clip_docs

    def __iter__(self):
        """
        Iterator over the wiki corpus.
        :return: bag-of-words format = list of `(token_id, token_count)` 2-tuples
        """
        self.titles = []
        for title, tokens in itertools.islice(iter_over_dump_file(self.dump_file), self.clip_docs):
            self.titles.append(title)
            yield self.dictionary.doc2bow(tokens)  # tokens to (token_id, token_count) tuples

    def __len__(self):
        return self.clip_docs
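
For context, the class above is typically driven in two steps: build and serialize the bag-of-words corpus and dictionary once, then train (or later reload) the model from those artifacts. A minimal sketch, assuming hypothetical file paths, a hypothetical module name for the file above, and a topic count of 100:

from lda_module import LDA  # hypothetical module name for the file above

lda = LDA()
# one-off preprocessing: tokenize the dump, build the dictionary and the MM corpus
lda.generate_bow_of_dump_file('enwiki-latest-pages-articles.xml.bz2',
                              'wiki_bow.mm', 'wiki.dictionary')
# train a 100-topic model on the serialized corpus and store it
lda.train_on_dump_file(100, 'wiki_bow.mm', 'wiki.dictionary', 'wiki_lda.model')
# later: reload the trained model
lda.load('wiki_lda.model')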
@@ -0,0 +1,266 @@
import nltk
import csv
import re
import numpy as np
import inflect
import pickle
from difflib import SequenceMatcher
from gensim.models import Word2Vec

p = inflect.engine()
model = Word2Vec.load("/home/xinzhu/Dataset/Word2Vec-on-Wikipedia-Corpus/model/word2vec_gensim")
print("Prepare Word2Vec model done!")

prefix = '/home/xinzhu/Code/model/feature/'
infile = open(prefix + 'unigram_freq.csv', mode='r')
reader = csv.reader(infile)
# word -> count; cast counts to int so they can be averaged later (non-numeric rows, e.g. a header, are skipped)
freq_dict = {row[0]: int(row[1]) for row in reader if len(row) > 1 and row[1].isdigit()}
infile.close()

fin = open('/home/xinzhu/Code/Mydata/data/vocab_python2.pkl', 'rb')
vocab = pickle.load(fin)
print('loading saved vocab...')
fin.close()

fin = open('/home/xinzhu/Code/Mydata/data/embd_python2.pkl', 'rb')
embd = pickle.load(fin)
print('loading saved embd...')
fin.close()

cnt = 0

def emb_sim(a, d):
    """Embedding similarity between answer a and distractor d, based on averaged word vectors."""
    avec = np.array([0.0] * 300)
    dvec = np.array([0.0] * 300)
    try:
        aL = a.split(' ')
        dL = d.split(' ')
        for word in aL:
            try:
                emres = [float(x) for x in embd[vocab[word]]]
                avec += emres
            except:
                pass
        for word in dL:
            try:
                emres = [float(x) for x in embd[vocab[word]]]
                dvec += emres
            except:
                pass
        avec /= len(aL)
        dvec /= len(dL)
    except:
        try:
            avec = [float(x) for x in embd[vocab[a]]]
            dvec = [float(x) for x in embd[vocab[d]]]
        except:
            pass
    upnum = 0
    downnum = 0
    try:
        for i in range(len(avec)):
            upnum += avec[i] * dvec[i]
            downnum += avec[i] * avec[i]
            downnum += dvec[i] * dvec[i]
        if downnum == 0:
            return 0
        return upnum / downnum
    except:
        return 0
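

# Note (not in the original code): the ratio returned above is
#     sum_i a_i * d_i / (||a||^2 + ||d||^2),
# which is related to, but not the same as, cosine similarity (that would divide by
# ||a|| * ||d||). For two identical non-zero vectors it evaluates to 0.5 rather than 1.0.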


def pos_sim(a, d):
    """POS-tag set similarity; a is the answer, d is the distractor."""
    try:
        apos = nltk.pos_tag(nltk.word_tokenize(a))
        dpos = nltk.pos_tag(nltk.word_tokenize(d))
        aset = set()
        dset = set()
        for tag in apos:
            aset.add(tag[1])
        for tag in dpos:
            dset.add(tag[1])
        M11 = len(aset & dset)
        M10 = len(aset - dset)
        M01 = len(dset - aset)
        similarity = M11 / (M11 + M10 + M01) if (M11 + M10 + M01) > 0 else 0
        # print("POS_sim, ", similarity)
        return similarity
    except:
        return 0


def edit_distance(s1, s2):
    """Levenshtein distance between two strings."""
    try:
        return nltk.edit_distance(s1, s2)
    except:
        return 0


def token_sim(s1, s2):
    """Jaccard distance (1 - Jaccard similarity) between the token sets of two strings."""
    try:
        aset = set(nltk.word_tokenize(s1))
        dset = set(nltk.word_tokenize(s2))
        return nltk.jaccard_distance(aset, dset)
    except:
        return 0


def length_sim(a, d):
    """Character and token lengths of a and d, plus the absolute differences of those lengths."""
    try:
        acharlen = len(a)
        dcharlen = len(d)
        atokenlen = len(nltk.word_tokenize(a))
        dtokenlen = len(nltk.word_tokenize(d))
        diffcharlen = abs(acharlen - dcharlen)
        difftokenlen = abs(atokenlen - dtokenlen)
        return [acharlen, dcharlen, atokenlen, dtokenlen, diffcharlen, difftokenlen]
    except:
        # tokenization failed: fall back to character lengths and dummy token lengths
        return [len(a), len(d), 1, 1, abs(len(a) - len(d)), 0]


# Function to find the longest common substring
def suffix(str1, str2):
    try:
        # initialize a SequenceMatcher object with the input strings
        seqMatch = SequenceMatcher(None, str1, str2)
        # find the longest matching substring;
        # the result looks like Match(a=0, b=0, size=5)
        match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))
        if match.size != 0:
            res = str1[match.a: match.a + match.size]
            abs_len = len(res)
            return [abs_len,
                    float(abs_len) / len(str1) if len(str1) > 0 else 0.0,
                    float(abs_len) / len(str2) if len(str2) > 0 else 0.0]
        else:
            return [0, 0.0, 0.0]
    except:
        return [0, 0.0, 0.0]
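

# For illustration (not in the original code): suffix("neural network", "neural nets")
# finds the longest common substring "neural net" (10 characters) and returns
# [10, 10/14, 10/11], i.e. the absolute length plus its share of each input string.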


def freq(a, d):
    """Average word frequency of the words in a and in d, based on freq_dict."""
    try:
        aL = a.split()
        dL = d.split()
        afreqs = []
        dfreqs = []
        for word in aL:
            afreqs.append(freq_dict.get(word, 0))
        for word in dL:
            dfreqs.append(freq_dict.get(word, 0))
        return [sum(afreqs) / len(afreqs) if len(afreqs) > 0 else 0,
                sum(dfreqs) / len(dfreqs) if len(dfreqs) > 0 else 0]
    except:
        return [0.0, 0.0]


def is_plural(noun):
    try:
        return p.singular_noun(noun) is not False
    except:
        return False


def singlar_or_plural(a, d):
    """Returns 1 if a and d agree in grammatical number (both contain a plural token or neither does), else 0."""
    try:
        a = nltk.word_tokenize(a)
        d = nltk.word_tokenize(d)
        aflag = False
        dflag = False
        for x in a:
            if is_plural(x):
                aflag = True
        for x in d:
            if is_plural(x):
                dflag = True
        if aflag == dflag:
            return 1
        else:
            return 0
    except:
        return 0


def num(s):
    # whether a number appears in s, either as digits or as a spelled-out number word
    if re.search(r'\d', s):
        return True
    _known = {
        'zero': 0,
        'one': 1,
        'two': 2,
        'three': 3,
        'four': 4,
        'five': 5,
        'six': 6,
        'seven': 7,
        'eight': 8,
        'nine': 9,
        'ten': 10,
        'eleven': 11,
        'twelve': 12,
        'thirteen': 13,
        'fourteen': 14,
        'fifteen': 15,
        'sixteen': 16,
        'seventeen': 17,
        'eighteen': 18,
        'nineteen': 19,
        'twenty': 20,
        'thirty': 30,
        'forty': 40,
        'fifty': 50,
        'sixty': 60,
        'seventy': 70,
        'eighty': 80,
        'ninety': 90
    }
    # substring match, so e.g. 'one' also matches inside 'money'
    for n in _known.keys():
        if n in s:
            return True
    return False


def wiki_sim(a, d):
    """Word2Vec similarity between a and d; 0 if either is missing from the model vocabulary."""
    res = 0
    try:
        res = model.similarity(a, d)
    except:
        pass
    return res


def cal_10_feature_vec(params):
    """Feature vector for a (question, answer, distractor, label) tuple; returns [features, y, q, a, d]."""
    q = params[0].replace('_', ' ')
    a = params[1].replace('_', ' ')
    d = params[2].replace('_', ' ')
    y = params[3]
    features = []
    features.extend([emb_sim(q, d), emb_sim(a, d)])
    features.append(pos_sim(a, d))
    features.append(edit_distance(a, d))
    features.extend([token_sim(q, d), token_sim(a, d), token_sim(q, a)])
    features.extend(length_sim(a, d))
    features.extend(suffix(a, d))
    features.extend(freq(a, d))
    global cnt
    cnt += 1
    if cnt % 10000 == 0:
        print(cnt)
    return [features, y, q, a, d]


def cal_26_feature_vec(params):
    """26-dimensional feature vector."""
    q = params[0]
    a = params[1]
    d = params[2]
    features = []
    features.extend([emb_sim(q, d), emb_sim(a, d)])  # 2
    features.append(pos_sim(a, d))  # 1
    features.append(edit_distance(a, d))  # 1
    features.extend([token_sim(q, d), token_sim(a, d), token_sim(q, a)])  # 3
    features.extend(length_sim(a, d))  # 6
    features.extend(suffix(a, d))  # 3
    features.extend(freq(a, d))  # 2
    features.append(singlar_or_plural(a, d))  # 1
    features.extend([int(num(a)), int(num(d))])  # 2
    features.append(wiki_sim(a, d))  # 1
    # print("total features, ", features)
    global cnt
    cnt += 1
    if cnt % 10000 == 0:
        print(cnt)
    # print(features)
    return features
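
For context, a minimal sketch of how the feature extractor above might be driven. The module name and the example triple are hypothetical, and the hard-coded model, vocabulary, and frequency-file paths at the top of the file must exist for the import to succeed:

from features import cal_26_feature_vec  # hypothetical module name for the file above

# question, answer, distractor (underscores are kept as-is by cal_26_feature_vec)
sample = ("Which planet is closest to the sun?", "Mercury", "Venus")
vec = cal_26_feature_vec(sample)
print(len(vec), vec)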