Commit

Add files via upload

xinzhu-cai authored Aug 28, 2019
1 parent ded1532 commit 38a98c5
Showing 7 changed files with 1,719 additions and 0 deletions.
124 changes: 124 additions & 0 deletions ConceptGenerator/LDA.py
import itertools
import os

import gensim
from gensim import corpora
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import smart_open, simple_preprocess
from stop_words import get_stop_words


def tokenize(text):
"""
Preprocess and then tokenize a given text
:param text: the text which should be tokenized.
    :return: the tokens of the given text after preprocessing, with stop words removed
"""
return [token for token in simple_preprocess(text) if token not in STOPWORDS]


def iter_over_dump_file(dump_file, min_length_of_article=50, ignore_namespaces=None):
"""
Iterator over wiki_dump_file.
Returns title and tokens for next article in dump file.
Ignores short articles.
    Ignores meta articles, through the given namespaces.
Default namespaces are 'Wikipedia', 'Category', 'File', 'Portal', 'Template', 'MediaWiki', 'User', 'Help', 'Book', 'Draft'
:param dump_file: the dump file
    :param min_length_of_article: the minimum number of tokens an article must contain. Default = 50
:param ignore_namespaces: list of namespaces which should be ignored.
:return: title, tokens
"""
if ignore_namespaces is None:
ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
for title, text, pageid in _extract_pages(smart_open(dump_file)):
text = filter_wiki(text)
tokens = tokenize(text)
if len(tokens) < min_length_of_article or any(
title.startswith(namespace + ':') for namespace in ignore_namespaces):
continue # ignore short articles and various meta-articles
yield title, tokens


class LDA():
def __init__(self):
self.stop_words = get_stop_words('en')

def load(self, model_file):
"""
Loads a LDA model from a given file
:param model_file: the file which contains the model, which should be loaded
"""
from gensim.models.ldamodel import LdaModel
# self.ldamodel = LdaModel.load(model_file)
self.ldamodel = gensim.models.ldamulticore.LdaMulticore.load(model_file)
# print(self.ldamodel.print_topics(num_topics=100))

# self.ldamodel = gensim.models.wrappers.LdaMallet.load(model_file)
# from gensim.models.wrappers.ldamallet import malletmodel2ldamodel
# self.ldamodel.show_topics(num_topics=5, num_words=10)
# self.ldamodel = malletmodel2ldamodel(self.ldamodel)
# print(self.ldamodel.__dict__)

def generate_bow_of_dump_file(self, dump_file, bow_output_file, dict_output_file):
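        """
        Builds the dictionary and bag-of-words corpus from a (possibly zipped) wiki dump file and serializes both to disk.
        :param dump_file: the dump file to read articles from
        :param bow_output_file: the file in which the bag-of-words corpus should be saved
        :param dict_output_file: the file in which the dictionary should be saved
        """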
doc_stream = (tokens for _, tokens in iter_over_dump_file(dump_file))
        id2word_dict = gensim.corpora.Dictionary(doc_stream)  # build the (word_id -> word) mapping
        print(id2word_dict)
        id2word_dict.filter_extremes(no_below=20, no_above=0.1, keep_n=250000)  # keep words appearing in at least 20 documents and in no more than 10% of documents
print(id2word_dict)
        dump_corpus = DumpCorpus(dump_file, id2word_dict)  # stream the dump as bag-of-words vectors using the dictionary
print("save bow...")
#Iterate through the document stream corpus, saving the documents to fname and recording byte offset of each document.
gensim.corpora.MmCorpus.serialize(bow_output_file, dump_corpus)
print("save dict")
id2word_dict.save(dict_output_file)

def train_on_dump_file(self, num_topics, bow_path, dict_path, model_outputfile, training_iterations=20,
max_docs=None):
"""
        Trains a new LDA model on a bag-of-words corpus and dictionary previously built from a
        Wikipedia dump (or any other dump in the same format, possibly zipped) by generate_bow_of_dump_file.
        :param num_topics: the number of topics to generate
        :param bow_path: the path (including filename) of the saved bag-of-words corpus
        :param dict_path: the path (including filename) of the saved dictionary
        :param model_outputfile: the file in which the trained model should be stored
        :param training_iterations: the number of LDA training passes
        :param max_docs: how many documents to use for training; if None, all documents are used
"""
print("load bow...")
mm_corpus = gensim.corpora.MmCorpus(bow_path)
print("load dict...")
id2word_dict = gensim.corpora.Dictionary.load(dict_path)
clipped_corpus = gensim.utils.ClippedCorpus(mm_corpus, max_docs)
print("start training")
#train LDA on bag of word corpus
self.ldamodel = gensim.models.ldamulticore.LdaMulticore(clipped_corpus, num_topics=num_topics,
id2word=id2word_dict, passes=training_iterations,
minimum_probability=0)
print("save model")
self.ldamodel.save(model_outputfile)


class DumpCorpus(object):
def __init__(self, dump_file, dictionary, clip_docs=None):
"""
Parse the first `clip_docs` documents from file `dump_file`.
Yield each document in turn, as a list of tokens (unicode strings).
"""
self.dump_file = dump_file
self.dictionary = dictionary
self.clip_docs = clip_docs

def __iter__(self):
"""
Iterator over wiki corpus
:return: bag-of-words format = list of `(token_id, token_count)` 2-tuples
"""
self.titles = []
for title, tokens in itertools.islice(iter_over_dump_file(self.dump_file), self.clip_docs):
self.titles.append(title)
yield self.dictionary.doc2bow(tokens) # tokens to (token_id, token_count) tuples

def __len__(self):
return self.clip_docs
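
A minimal usage sketch for the class above, assuming hypothetical paths for the Wikipedia dump, the intermediate files and the model output, and a placeholder topic count:

# hypothetical paths and parameters, for illustration only
lda = LDA()
lda.generate_bow_of_dump_file('enwiki-latest-pages-articles.xml.bz2', 'wiki_bow.mm', 'wiki.dict')
lda.train_on_dump_file(num_topics=100, bow_path='wiki_bow.mm', dict_path='wiki.dict',
                       model_outputfile='lda_wiki.model')
lda.load('lda_wiki.model')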
266 changes: 266 additions & 0 deletions ConceptGenerator/layer1_calculate_features.py
import nltk
import csv
import re
import numpy as np
import inflect
import pickle
from difflib import SequenceMatcher
from gensim.models import Word2Vec

p = inflect.engine()
model = Word2Vec.load("/home/xinzhu/Dataset/Word2Vec-on-Wikipedia-Corpus/model/word2vec_gensim")
print("Prepare Word2Vec model done!")

prefix = '/home/xinzhu/Code/model/feature/'
infile = open(prefix+'unigram_freq.csv', mode='r')
reader = csv.reader(infile)
freq_dict = {row[0]:row[1] for row in reader}

fin = open('/home/xinzhu/Code/Mydata/data/vocab_python2.pkl', 'rb')
vocab = pickle.load(fin)
print('loading saved vocab...')
fin.close()

fin = open('/home/xinzhu/Code/Mydata/data/embd_python2.pkl', 'rb')
embd = pickle.load(fin)
print('loading saved embd...')
fin.close()

cnt = 0

def emb_sim(a, d):
    """Cosine similarity between the averaged word embeddings of a and d."""
    avec = np.array([0.0]*300)
    dvec = np.array([0.0]*300)
    try:
        aL = a.split(' ')
        dL = d.split(' ')
        for word in aL:
            try:
                avec += [float(x) for x in embd[vocab[word]]]
            except:
                pass
        for word in dL:
            try:
                dvec += [float(x) for x in embd[vocab[word]]]
            except:
                pass
        avec /= len(aL)
        dvec /= len(dL)
    except:
        try:
            avec = [float(x) for x in embd[vocab[a]]]
            dvec = [float(x) for x in embd[vocab[d]]]
        except:
            pass
    try:
        avec = np.asarray(avec)
        dvec = np.asarray(dvec)
        # cosine similarity: dot product divided by the product of the vector norms
        denom = np.linalg.norm(avec) * np.linalg.norm(dvec)
        if denom == 0:
            return 0
        return float(np.dot(avec, dvec) / denom)
    except:
        return 0

def pos_sim(a,d):
#"""POS similarity a is answer, d is distractor"""
try:
apos = nltk.pos_tag(nltk.word_tokenize(a))
dpos = nltk.pos_tag(nltk.word_tokenize(d))
aset = set()
dset = set()
for tag in apos:
aset.add(tag[1])
for tag in dpos:
dset.add(tag[1])
M11 = len(aset & dset)
M10 = len(aset - dset)
M01 = len(dset - aset)
similarity = M11/(M11+M10+M01) if (M11+M10+M01)>0 else 0
#print("POS_sim, ",similarity)
return similarity
except:
return 0

def edit_distance(s1, s2):
#"""levenshteinDistance"""
try:
return nltk.edit_distance(s1,s2)
except:
return 0

def token_sim(s1,s2):
#""" jaccard similarity between two strings"""
try:
aset = set(nltk.word_tokenize(s1))
dset = set(nltk.word_tokenize(s2))
return nltk.jaccard_distance(aset,dset)
except:
return 0

def length_sim(a, d):
    """Character and token lengths of a and d, plus the absolute differences of those lengths."""
    acharlen = len(a)
    dcharlen = len(d)
    diffcharlen = abs(acharlen - dcharlen)
    try:
        atokenlen = len(nltk.word_tokenize(a))
        dtokenlen = len(nltk.word_tokenize(d))
    except:
        # fall back to a token length of 1 if tokenization fails
        atokenlen, dtokenlen = 1, 1
    difftokenlen = abs(atokenlen - dtokenlen)
    return [acharlen, dcharlen, atokenlen, dtokenlen, diffcharlen, difftokenlen]

# Longest common substring features
def suffix(str1, str2):
    """Length of the longest common substring of str1 and str2, and that length relative to each string's length."""
    try:
        # SequenceMatcher.find_longest_match returns a Match(a, b, size) tuple
        seqMatch = SequenceMatcher(None, str1, str2)
        match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))
if (match.size!=0):
res = str1[match.a: match.a + match.size]
abs_len = len(res)
return [abs_len,float(abs_len)/len(str1) if len(str1)>0 else 0.0,float(abs_len)/len(str2) if len(str2)>0 else 0.0]
else:
return [0,0.0,0.0]
except:
return [0,0.0,0.0]

def freq(a, d):
    """Average unigram frequency of the words in a and in d (frequencies come from unigram_freq.csv)."""
    try:
        aL = a.split()
        dL = d.split()
        # the CSV is read as strings, so convert frequencies to float before averaging
        afreqs = [float(freq_dict.get(word, 0)) for word in aL]
        dfreqs = [float(freq_dict.get(word, 0)) for word in dL]
        return [sum(afreqs)/len(afreqs) if len(afreqs) > 0 else 0,
                sum(dfreqs)/len(dfreqs) if len(dfreqs) > 0 else 0]
    except:
        return [0.0, 0.0]

def is_plural(noun):
try:
return p.singular_noun(noun) is not False
except:
return False

def singlar_or_plural(a,d):
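    """1 if a and d agree on grammatical number (both contain a plural noun or neither does), else 0."""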
try:
a = nltk.word_tokenize(a)
d = nltk.word_tokenize(d)
aflag = False
dflag = False
for x in a:
if is_plural(x):
aflag = True
for x in d:
if is_plural(x):
dflag = True
if aflag == dflag:
return 1
else:
return 0
except:
return 0

def num(s):
    """True if s contains a digit or a spelled-out number word."""
if re.search(r'\d', s):
return True
_known = {
'zero': 0,
'one': 1,
'two': 2,
'three': 3,
'four': 4,
'five': 5,
'six': 6,
'seven': 7,
'eight': 8,
'nine': 9,
'ten': 10,
'eleven': 11,
'twelve': 12,
'thirteen': 13,
'fourteen': 14,
'fifteen': 15,
'sixteen': 16,
'seventeen': 17,
'eighteen': 18,
'nineteen': 19,
'twenty': 20,
'thirty': 30,
'forty': 40,
'fifty': 50,
'sixty': 60,
'seventy': 70,
'eighty': 80,
'ninety': 90
}
    # match whole words so that e.g. 'one' inside 'money' does not count as a number
    words = s.lower().split()
    for n in _known:
        if n in words:
            return True
    return False

def wiki_sim(a,d):
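    """Word2Vec similarity between a and d under the Wikipedia-trained model; 0 if it cannot be computed (e.g. out-of-vocabulary words)."""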
res = 0
try:
res = model.similarity(a,d)
except:
pass
return res

def cal_10_feature_vec(params):
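    """Feature vector (embedding, POS, edit-distance, token, length, suffix and frequency features) plus the label and the raw strings for a (question, answer, distractor, label) tuple."""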
q = params[0].replace('_',' ')
a = params[1].replace('_',' ')
d = params[2].replace('_',' ')
y = params[3]
features = []
features.extend([emb_sim(q,d),emb_sim(a,d)])
features.append(pos_sim(a,d))
features.append(edit_distance(a,d))
features.extend([token_sim(q,d),token_sim(a,d),token_sim(q,a)])
features.extend(length_sim(a,d))
features.extend(suffix(a,d))
features.extend(freq(a,d))
global cnt
cnt += 1
if cnt%10000 == 0:
print(cnt)
return [features,y,q,a,d]

def cal_26_feature_vec(params):
#"""26-dimensional feature vector"""
q = params[0]
a = params[1]
d = params[2]
features = []
features.extend([emb_sim(q,d),emb_sim(a,d)]) #2
features.append(pos_sim(a,d)) #1
features.append(edit_distance(a,d)) #1
features.extend([token_sim(q,d),token_sim(a,d),token_sim(q,a)]) #3
features.extend(length_sim(a,d)) #6
features.extend(suffix(a,d)) #3
features.extend(freq(a,d)) #2
features.append(singlar_or_plural(a,d)) #1
features.extend([int(num(a)),int(num(d))]) #2
features.append(wiki_sim(a,d)) #1
# print("total features, ",features)
global cnt
cnt += 1
if cnt%10000 == 0:
print(cnt)
# print(features)
return features
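
A minimal usage sketch for the feature extractor above; the question, answer and distractor strings are hypothetical placeholders:

# hypothetical concept strings, for illustration only
question = 'binary_search_tree'
answer = 'red_black_tree'
distractor = 'linked_list'
print(cal_26_feature_vec([question, answer, distractor]))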