#doc2vec_optimization.py
#grid search over Doc2Vec hyperparameters (size, window, min_count)
#import necessary packages
from __future__ import division
from os import listdir
from os.path import isfile, join
import multiprocessing
import gensim
from gensim.models.doc2vec import LabeledSentence
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import re
import time
from itertools import product
#gather classified documents: the name of every .txt file in the corpus
#directory (these names are later used as the document tags)
classifiedDocLabels = [f for f in listdir("Documents/ClassifiedDocuments") if f.endswith('.txt')]
#NOTE: this rebinds the imported nltk `stopwords` corpus reader to the plain
#list of English stop words; process_text reads this list
stopwords = stopwords.words("english") #stopwords list
#tokenizer that keeps runs of word characters only
tokenizer = RegexpTokenizer(r'\w+')
p_stemmer = PorterStemmer()
#function to preprocess text
def process_text(openFile, numbers, stemming):
    """Read a document and return its lower-cased, stop-word-free text.

    Args:
        openFile: readable file-like object; its whole content is consumed.
        numbers: when true, digits are removed from every token.
        stemming: when true, every token is replaced by its Porter stem.

    Returns:
        The cleaned text as one space-separated string. When both flags are
        false the text is only lower-cased and stripped of stop words;
        otherwise it is additionally re-tokenized with the module-level
        ``tokenizer`` before the requested steps are applied.
    """
    raw = openFile.read().lower()
    #decode byte strings once, right after reading; text that is already
    #unicode (e.g. from a Python 3 text-mode file) is used as it is
    if isinstance(raw, bytes):
        raw = raw.decode(errors='replace')
    #set membership is O(1) per word instead of scanning the stop word list
    stop_set = set(stopwords)
    cleanedText = ' '.join([word for word in raw.split() if word not in stop_set])
    if not (numbers or stemming):
        return cleanedText
    tokens = tokenizer.tokenize(cleanedText)
    if numbers:
        #blank out digits, then re-split so that tokens which become empty
        #disappear and tokens cut in two by a digit become separate words
        tokens = ' '.join([re.sub(r'[\d]', ' ', i) for i in tokens]).split()
    if stemming:
        tokens = [p_stemmer.stem(i) for i in tokens]
    #return the cleaned text
    return ' '.join(tokens)
#gather classified document text only
labeledData = []
for doc in classifiedDocLabels:
    path = 'Documents/ClassifiedDocuments/' + doc
    #the with-block closes the file even when preprocessing raises
    with open(path, 'r') as f:
        #no digit removal and no stemming for the labeled corpus
        cleanedText = process_text(f, False, False)
    labeledData.append(cleanedText)
#class needed for doc2vec model
class DocIterator(object):
    """Iterable that pairs each document string with its label.

    ``doc_list`` holds the document texts and ``labels_list`` holds, at the
    same positions, the labels used as tags.
    """

    def __init__(self, doc_list, labels_list):
        self.doc_list = doc_list
        self.labels_list = labels_list

    def __iter__(self):
        """Yield one LabeledSentence per document, tagged with its label."""
        for position, text in enumerate(self.doc_list):
            #whitespace-split words of the document
            words = text.split()
            label = self.labels_list[position]
            yield LabeledSentence(words=words, tags=[label])
#iterator object for the labeled data doc2vec model
labeledIt = DocIterator(doc_list=labeledData, labels_list=classifiedDocLabels)
#construct the set of hyperparameters to optimize: candidate values for the
#Doc2Vec `size`, `window` and `min_count` arguments
params = dict(
    size=[10, 50, 100, 500, 1000, 5000],
    window=[5, 6, 7, 8, 9, 10],
    min_count=[1, 2, 3, 4, 5, 10],
)
#define my own scoring function for gridsearch
def scorer(estimator, X):
    """Train ``estimator`` on ``X`` and count correct nearest-document hits.

    For the documents ``True1.txt`` .. ``True10.txt`` and
    ``Misinformed1.txt`` .. ``Misinformed10.txt`` one point is scored when
    the tag of the most similar document contains the same class name.

    Args:
        estimator: doc2vec model offering ``build_vocab``, ``train``,
            ``alpha``, ``min_alpha`` and ``docvecs.most_similar``.
        X: corpus passed to ``build_vocab`` and ``train``.

    Returns:
        The number of hits as an int (at most 20).
    """
    #build vocab from sequence of sentence
    estimator.build_vocab(X)
    #training the model on the text corpus
    #NOTE(review): nesting was reconstructed from an unindented copy of the
    #file; both train() calls are assumed to belong to the loop - confirm
    for _ in range(10):
        estimator.train(X)
        estimator.alpha -= 0.002  # decrease the learning rate
        estimator.min_alpha = estimator.alpha  # fix the learning rate, no decay
        estimator.train(X)
    #accuracy to return
    hits = 0
    for number in range(1, 11):
        for prefix in ('True', 'Misinformed'):
            #tag of the single most similar document
            nearest_tag = estimator.docvecs.most_similar(prefix + str(number) + '.txt')[0][0]
            if prefix in nearest_tag:
                hits += 1
    return hits
#define function to do hyperparameter search
def hyperparameter_search(params, dociterator):
    """Score every size/window/min_count combination and keep the best.

    Args:
        params: dict with 'size', 'window' and 'min_count' keys, each an
            iterable of candidate values.
        dociterator: corpus handed unchanged to ``scorer`` for every model.

    Returns:
        A ``(maxAccuracy, maxAccuracyParams)`` tuple: the highest accuracy
        seen and the list of ``(size, window, min_count)`` tuples that
        reached it (ties are all kept).
    """
    combos = list(product(params['size'], params['window'], params['min_count']))
    print('Testing ' + str(len(combos)) + ' different combinations')
    #the worker count does not change between models, so query it once
    workers = multiprocessing.cpu_count()
    maxAccuracyParams = []
    maxAccuracy = 0
    for combo in combos:
        size, window, min_count = combo
        model = gensim.models.Doc2Vec(size=size, window=window,
                                      min_count=min_count,
                                      workers=workers,
                                      alpha=0.025, min_alpha=0.025)
        #scorer counts hits out of 20; the float divisor keeps this a true
        #division regardless of the module's __future__ imports
        accuracy = scorer(model, dociterator) / 20.0
        if accuracy > maxAccuracy:
            #new best: forget the combinations tied at the old best
            maxAccuracy = accuracy
            maxAccuracyParams = [combo]
        elif accuracy == maxAccuracy:
            maxAccuracyParams.append(combo)
    return maxAccuracy, maxAccuracyParams
#tune the hyperparameters by scoring over all possible parameter combinations
print("[INFO] tuning hyperparameters")
start = time.time()
maxAcc, bestParams = hyperparameter_search(params, labeledIt)
#evaluate the results of the hyperparameter search
print("[INFO] parameter space search took {:.2f} seconds".format(
    time.time() - start))
#single-argument parenthesised prints behave the same on Python 2 and 3
print("[INFO] parameter search max accuracy: " + str(maxAcc))
print("[INFO] parameter search best parameters: " + str(bestParams))
print("[INFO] There are " + str(len(bestParams)) + " best parameter combos")