Commit 38a98c5 (1 parent: ded1532)
Showing 7 changed files with 1,719 additions and 0 deletions.
@@ -0,0 +1,124 @@
import itertools
import os

import gensim
from gensim import corpora
from gensim.corpora.wikicorpus import _extract_pages, filter_wiki
from gensim.parsing.preprocessing import STOPWORDS
from gensim.utils import smart_open, simple_preprocess
from stop_words import get_stop_words


def tokenize(text):
    """
    Preprocess and then tokenize a given text.
    :param text: the text which should be tokenized.
    :return: the tokens of the given text, after preprocessing.
    """
    return [token for token in simple_preprocess(text) if token not in STOPWORDS]


def iter_over_dump_file(dump_file, min_length_of_article=50, ignore_namespaces=None):
    """
    Iterator over a wiki dump file.
    Yields title and tokens for the next article in the dump file.
    Ignores short articles.
    Ignores meta articles, based on the given namespaces.
    Default namespaces are 'Wikipedia', 'Category', 'File', 'Portal', 'Template', 'MediaWiki', 'User', 'Help', 'Book', 'Draft'.
    :param dump_file: the dump file
    :param min_length_of_article: the minimum number of words in an article. Default = 50
    :param ignore_namespaces: list of namespaces which should be ignored.
    :return: title, tokens
    """
    if ignore_namespaces is None:
        ignore_namespaces = 'Wikipedia Category File Portal Template MediaWiki User Help Book Draft'.split()
    for title, text, pageid in _extract_pages(smart_open(dump_file)):
        text = filter_wiki(text)
        tokens = tokenize(text)
        if len(tokens) < min_length_of_article or any(
                title.startswith(namespace + ':') for namespace in ignore_namespaces):
            continue  # ignore short articles and various meta-articles
        yield title, tokens
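

# Hypothetical helper, not part of the original file: a quick way to eyeball the
# (title, tokens) stream produced by iter_over_dump_file. The dump path is whatever
# the caller passes in (e.g. a bz2-compressed Wikipedia dump).
def preview_dump(dump_file, n=3):
    """Print the titles and first tokens of the first `n` articles in `dump_file`."""
    for title, tokens in itertools.islice(iter_over_dump_file(dump_file), n):
        print(title, tokens[:10])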


class LDA:
    def __init__(self):
        self.stop_words = get_stop_words('en')

    def load(self, model_file):
        """
        Loads an LDA model from a given file.
        :param model_file: the file which contains the model that should be loaded
        """
        from gensim.models.ldamodel import LdaModel
        # self.ldamodel = LdaModel.load(model_file)
        self.ldamodel = gensim.models.ldamulticore.LdaMulticore.load(model_file)
        # print(self.ldamodel.print_topics(num_topics=100))

        # self.ldamodel = gensim.models.wrappers.LdaMallet.load(model_file)
        # from gensim.models.wrappers.ldamallet import malletmodel2ldamodel
        # self.ldamodel.show_topics(num_topics=5, num_words=10)
        # self.ldamodel = malletmodel2ldamodel(self.ldamodel)
        # print(self.ldamodel.__dict__)

    def generate_bow_of_dump_file(self, dump_file, bow_output_file, dict_output_file):
        """
        Builds a dictionary and a bag-of-words corpus from a dump file and saves both.
        :param dump_file: the dump file
        :param bow_output_file: the file in which the bag-of-words corpus should be saved
        :param dict_output_file: the file in which the dictionary should be saved
        """
        doc_stream = (tokens for _, tokens in iter_over_dump_file(dump_file))
        id2word_dict = gensim.corpora.Dictionary(doc_stream)  # obtain: (word_id: word)
        print(id2word_dict)
        # keep words that appear in at least 20 documents and in no more than 10% of all documents
        id2word_dict.filter_extremes(no_below=20, no_above=0.1, keep_n=250000)
        print(id2word_dict)
        dump_corpus = DumpCorpus(dump_file, id2word_dict)  # from dictionary to bag of words
        print("save bow...")
        # Iterate through the document stream, saving the documents to the output file
        # and recording the byte offset of each document.
        gensim.corpora.MmCorpus.serialize(bow_output_file, dump_corpus)
        print("save dict")
        id2word_dict.save(dict_output_file)

    def train_on_dump_file(self, num_topics, bow_path, dict_path, model_outputfile, training_iterations=20,
                           max_docs=None):
        """
        Trains a new LDA model based on a Wikipedia dump or any other dump in the same format.
        The dump may be zipped.
        :param num_topics: the number of topics which should be generated
        :param bow_path: the path incl. filename where the bag-of-words corpus is stored
        :param dict_path: the path incl. filename where the dictionary is stored
        :param model_outputfile: the file in which the trained model should be stored
        :param training_iterations: the number of LDA training passes
        :param max_docs: how many docs should be used for training; if None, all docs are used
        """
        print("load bow...")
        mm_corpus = gensim.corpora.MmCorpus(bow_path)
        print("load dict...")
        id2word_dict = gensim.corpora.Dictionary.load(dict_path)
        clipped_corpus = gensim.utils.ClippedCorpus(mm_corpus, max_docs)
        print("start training")
        # train LDA on the bag-of-words corpus
        self.ldamodel = gensim.models.ldamulticore.LdaMulticore(clipped_corpus, num_topics=num_topics,
                                                                id2word=id2word_dict, passes=training_iterations,
                                                                minimum_probability=0)
        print("save model")
        self.ldamodel.save(model_outputfile)


class DumpCorpus(object):
    def __init__(self, dump_file, dictionary, clip_docs=None):
        """
        Parse the first `clip_docs` documents from file `dump_file`.
        Yield each document in turn, as a list of tokens (unicode strings).
        """
        self.dump_file = dump_file
        self.dictionary = dictionary
        self.clip_docs = clip_docs

    def __iter__(self):
        """
        Iterator over the wiki corpus.
        :return: bag-of-words format = list of `(token_id, token_count)` 2-tuples
        """
        self.titles = []
        for title, tokens in itertools.islice(iter_over_dump_file(self.dump_file), self.clip_docs):
            self.titles.append(title)
            yield self.dictionary.doc2bow(tokens)  # tokens to (token_id, token_count) tuples

    def __len__(self):
        return self.clip_docs
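
For context, the class above is typically driven in two steps: build and serialize the bag-of-words corpus and dictionary once, then train (or later reload) the model from those artifacts. A minimal sketch, assuming hypothetical file paths, a hypothetical module name for the file above, and a topic count of 100:

from lda_module import LDA  # hypothetical module name for the file above

lda = LDA()
# one-off preprocessing: tokenize the dump, build the dictionary and the MM corpus
lda.generate_bow_of_dump_file('enwiki-latest-pages-articles.xml.bz2',
                              'wiki_bow.mm', 'wiki.dictionary')
# train a 100-topic model on the serialized corpus and store it
lda.train_on_dump_file(100, 'wiki_bow.mm', 'wiki.dictionary', 'wiki_lda.model')
# later: reload the trained model
lda.load('wiki_lda.model')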
@@ -0,0 +1,266 @@
import nltk
import csv
import re
import numpy as np
import inflect
import pickle
from difflib import SequenceMatcher
from gensim.models import Word2Vec

p = inflect.engine()
model = Word2Vec.load("/home/xinzhu/Dataset/Word2Vec-on-Wikipedia-Corpus/model/word2vec_gensim")
print("Prepare Word2Vec model done!")

prefix = '/home/xinzhu/Code/model/feature/'
infile = open(prefix + 'unigram_freq.csv', mode='r')
reader = csv.reader(infile)
# word -> count; cast counts to int so they can be averaged later (non-numeric rows, e.g. a header, are skipped)
freq_dict = {row[0]: int(row[1]) for row in reader if len(row) > 1 and row[1].isdigit()}
infile.close()

fin = open('/home/xinzhu/Code/Mydata/data/vocab_python2.pkl', 'rb')
vocab = pickle.load(fin)
print('loading saved vocab...')
fin.close()

fin = open('/home/xinzhu/Code/Mydata/data/embd_python2.pkl', 'rb')
embd = pickle.load(fin)
print('loading saved embd...')
fin.close()

cnt = 0

def emb_sim(a, d):
    """Embedding similarity between answer a and distractor d, based on averaged word vectors."""
    avec = np.array([0.0] * 300)
    dvec = np.array([0.0] * 300)
    try:
        aL = a.split(' ')
        dL = d.split(' ')
        for word in aL:
            try:
                emres = [float(x) for x in embd[vocab[word]]]
                avec += emres
            except:
                pass
        for word in dL:
            try:
                emres = [float(x) for x in embd[vocab[word]]]
                dvec += emres
            except:
                pass
        avec /= len(aL)
        dvec /= len(dL)
    except:
        try:
            avec = [float(x) for x in embd[vocab[a]]]
            dvec = [float(x) for x in embd[vocab[d]]]
        except:
            pass
    upnum = 0
    downnum = 0
    try:
        for i in range(len(avec)):
            upnum += avec[i] * dvec[i]
            downnum += avec[i] * avec[i]
            downnum += dvec[i] * dvec[i]
        if downnum == 0:
            return 0
        return upnum / downnum
    except:
        return 0
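

# Note (not in the original code): the ratio returned above is
#     sum_i a_i * d_i / (||a||^2 + ||d||^2),
# which is related to, but not the same as, cosine similarity (that would divide by
# ||a|| * ||d||). For two identical non-zero vectors it evaluates to 0.5 rather than 1.0.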


def pos_sim(a, d):
    """POS-tag set similarity; a is the answer, d is the distractor."""
    try:
        apos = nltk.pos_tag(nltk.word_tokenize(a))
        dpos = nltk.pos_tag(nltk.word_tokenize(d))
        aset = set()
        dset = set()
        for tag in apos:
            aset.add(tag[1])
        for tag in dpos:
            dset.add(tag[1])
        M11 = len(aset & dset)
        M10 = len(aset - dset)
        M01 = len(dset - aset)
        similarity = M11 / (M11 + M10 + M01) if (M11 + M10 + M01) > 0 else 0
        # print("POS_sim, ", similarity)
        return similarity
    except:
        return 0


def edit_distance(s1, s2):
    """Levenshtein distance between two strings."""
    try:
        return nltk.edit_distance(s1, s2)
    except:
        return 0


def token_sim(s1, s2):
    """Jaccard distance (1 - Jaccard similarity) between the token sets of two strings."""
    try:
        aset = set(nltk.word_tokenize(s1))
        dset = set(nltk.word_tokenize(s2))
        return nltk.jaccard_distance(aset, dset)
    except:
        return 0


def length_sim(a, d):
    """Character and token lengths of a and d, plus the absolute differences of those lengths."""
    try:
        acharlen = len(a)
        dcharlen = len(d)
        atokenlen = len(nltk.word_tokenize(a))
        dtokenlen = len(nltk.word_tokenize(d))
        diffcharlen = abs(acharlen - dcharlen)
        difftokenlen = abs(atokenlen - dtokenlen)
        return [acharlen, dcharlen, atokenlen, dtokenlen, diffcharlen, difftokenlen]
    except:
        # tokenization failed: fall back to character lengths and dummy token lengths
        return [len(a), len(d), 1, 1, abs(len(a) - len(d)), 0]


# Function to find the longest common substring
def suffix(str1, str2):
    try:
        # initialize a SequenceMatcher object with the input strings
        seqMatch = SequenceMatcher(None, str1, str2)
        # find the longest matching substring;
        # the result looks like Match(a=0, b=0, size=5)
        match = seqMatch.find_longest_match(0, len(str1), 0, len(str2))
        if match.size != 0:
            res = str1[match.a: match.a + match.size]
            abs_len = len(res)
            return [abs_len,
                    float(abs_len) / len(str1) if len(str1) > 0 else 0.0,
                    float(abs_len) / len(str2) if len(str2) > 0 else 0.0]
        else:
            return [0, 0.0, 0.0]
    except:
        return [0, 0.0, 0.0]
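

# For illustration (not in the original code): suffix("neural network", "neural nets")
# finds the longest common substring "neural net" (10 characters) and returns
# [10, 10/14, 10/11], i.e. the absolute length plus its share of each input string.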


def freq(a, d):
    """Average word frequency of the words in a and in d, based on freq_dict."""
    try:
        aL = a.split()
        dL = d.split()
        afreqs = []
        dfreqs = []
        for word in aL:
            afreqs.append(freq_dict.get(word, 0))
        for word in dL:
            dfreqs.append(freq_dict.get(word, 0))
        return [sum(afreqs) / len(afreqs) if len(afreqs) > 0 else 0,
                sum(dfreqs) / len(dfreqs) if len(dfreqs) > 0 else 0]
    except:
        return [0.0, 0.0]


def is_plural(noun):
    try:
        return p.singular_noun(noun) is not False
    except:
        return False


def singlar_or_plural(a, d):
    """Returns 1 if a and d agree in grammatical number (both contain a plural token or neither does), else 0."""
    try:
        a = nltk.word_tokenize(a)
        d = nltk.word_tokenize(d)
        aflag = False
        dflag = False
        for x in a:
            if is_plural(x):
                aflag = True
        for x in d:
            if is_plural(x):
                dflag = True
        if aflag == dflag:
            return 1
        else:
            return 0
    except:
        return 0


def num(s):
    # whether a number appears in s, either as digits or as a spelled-out number word
    if re.search(r'\d', s):
        return True
    _known = {
        'zero': 0,
        'one': 1,
        'two': 2,
        'three': 3,
        'four': 4,
        'five': 5,
        'six': 6,
        'seven': 7,
        'eight': 8,
        'nine': 9,
        'ten': 10,
        'eleven': 11,
        'twelve': 12,
        'thirteen': 13,
        'fourteen': 14,
        'fifteen': 15,
        'sixteen': 16,
        'seventeen': 17,
        'eighteen': 18,
        'nineteen': 19,
        'twenty': 20,
        'thirty': 30,
        'forty': 40,
        'fifty': 50,
        'sixty': 60,
        'seventy': 70,
        'eighty': 80,
        'ninety': 90
    }
    # substring match, so e.g. 'one' also matches inside 'money'
    for n in _known.keys():
        if n in s:
            return True
    return False


def wiki_sim(a, d):
    """Word2Vec similarity between a and d; 0 if either is missing from the model vocabulary."""
    res = 0
    try:
        res = model.similarity(a, d)
    except:
        pass
    return res


def cal_10_feature_vec(params):
    """Feature vector for a (question, answer, distractor, label) tuple; returns [features, y, q, a, d]."""
    q = params[0].replace('_', ' ')
    a = params[1].replace('_', ' ')
    d = params[2].replace('_', ' ')
    y = params[3]
    features = []
    features.extend([emb_sim(q, d), emb_sim(a, d)])
    features.append(pos_sim(a, d))
    features.append(edit_distance(a, d))
    features.extend([token_sim(q, d), token_sim(a, d), token_sim(q, a)])
    features.extend(length_sim(a, d))
    features.extend(suffix(a, d))
    features.extend(freq(a, d))
    global cnt
    cnt += 1
    if cnt % 10000 == 0:
        print(cnt)
    return [features, y, q, a, d]


def cal_26_feature_vec(params):
    """26-dimensional feature vector."""
    q = params[0]
    a = params[1]
    d = params[2]
    features = []
    features.extend([emb_sim(q, d), emb_sim(a, d)])  # 2
    features.append(pos_sim(a, d))  # 1
    features.append(edit_distance(a, d))  # 1
    features.extend([token_sim(q, d), token_sim(a, d), token_sim(q, a)])  # 3
    features.extend(length_sim(a, d))  # 6
    features.extend(suffix(a, d))  # 3
    features.extend(freq(a, d))  # 2
    features.append(singlar_or_plural(a, d))  # 1
    features.extend([int(num(a)), int(num(d))])  # 2
    features.append(wiki_sim(a, d))  # 1
    # print("total features, ", features)
    global cnt
    cnt += 1
    if cnt % 10000 == 0:
        print(cnt)
    # print(features)
    return features
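
For context, a minimal sketch of how the feature extractor above might be driven. The module name and the example triple are hypothetical, and the hard-coded model, vocabulary, and frequency-file paths at the top of the file must exist for the import to succeed:

from features import cal_26_feature_vec  # hypothetical module name for the file above

# question, answer, distractor (underscores are kept as-is by cal_26_feature_vec)
sample = ("Which planet is closest to the sun?", "Mercury", "Venus")
vec = cal_26_feature_vec(sample)
print(len(vec), vec)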