# doc2vec.py
from os import listdir
from os.path import isfile, join
import multiprocessing
import gensim
from gensim.models.doc2vec import LabeledSentence
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
import re
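# NOTE: this script assumes Python 2 (it calls unicode()) and an older gensim
# release whose API exposes LabeledSentence and Doc2Vec(size=...); sketches of
# the newer-gensim equivalents appear in comments further down.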
# gather classified documents
classifiedDocLabels = [f for f in listdir("Documents/ClassifiedDocuments") if f.endswith('.txt')]
# gather unlabeled documents
unlabeledDocLabels = [f for f in listdir("Documents/UnlabeledDocuments") if f.endswith('.txt')]
# combined list of document labels (classified first, then unlabeled)
docLabels = classifiedDocLabels + unlabeledDocLabels
stop_words = stopwords.words("english")  # stopword list; renamed so it does not shadow nltk's stopwords module
tokenizer = RegexpTokenizer(r'\w+')  # keeps alphanumeric tokens, drops punctuation
p_stemmer = PorterStemmer()
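# For illustration: tokenizer.tokenize("hello, world 42!") returns
# ['hello', 'world', '42'], and p_stemmer.stem('running') returns 'run'.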
# function to preprocess text
def process_text(openFile, numbers, stemming):
    # clean the raw document string and drop stopwords
    raw = openFile.read().lower()
    raw = unicode(raw, errors='replace')
    cleanedText = ' '.join([word for word in raw.split() if word not in stop_words])
    if numbers or stemming:
        tokens = tokenizer.tokenize(cleanedText)
        if numbers:
            # remove digits
            number_tokens = [re.sub(r'[\d]', ' ', i) for i in tokens]
            number_tokens = ' '.join(number_tokens).split()
        if stemming:
            # stem the tokens
            if numbers:
                stemmed_tokens = [p_stemmer.stem(i) for i in number_tokens]
            else:
                stemmed_tokens = [p_stemmer.stem(i) for i in tokens]
            cleanedText = ' '.join(stemmed_tokens)
        else:
            cleanedText = ' '.join(number_tokens)
    # return the cleaned text
    return cleanedText
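# Illustrative call ('sample.txt' is a hypothetical file name):
#   with open('Documents/ClassifiedDocuments/sample.txt', 'r') as sample:
#       print(process_text(sample, numbers=True, stemming=True))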
i = 0  # counter to switch between the two directories
data = []  # list to hold all of the preprocessed documents
for doc in docLabels:
    # classified documents come first in docLabels
    if i < len(classifiedDocLabels):
        path = 'Documents/ClassifiedDocuments/' + doc
        i = i + 1
    else:
        path = 'Documents/UnlabeledDocuments/' + doc
    f = open(path, 'r')
    cleanedText = process_text(f, False, False)
    data.append(cleanedText)
    f.close()
labeledData = []
for doc in classifiedDocLabels:
    path = 'Documents/ClassifiedDocuments/' + doc
    f = open(path, 'r')
    cleanedText = process_text(f, False, False)
    labeledData.append(cleanedText)
    f.close()
# iterator class needed for the doc2vec model
class DocIterator(object):
    def __init__(self, doc_list, labels_list):
        self.labels_list = labels_list
        self.doc_list = doc_list

    def __iter__(self):
        for idx, doc in enumerate(self.doc_list):
            yield LabeledSentence(words=doc.split(), tags=[self.labels_list[idx]])

# iterator object for the doc2vec model over all documents
it = DocIterator(data, docLabels)
# iterator object for the doc2vec model over the labeled data only
labeledIt = DocIterator(labeledData, classifiedDocLabels)
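# NOTE: newer gensim releases replace LabeledSentence with TaggedDocument;
# the equivalent yield there would be:
#   from gensim.models.doc2vec import TaggedDocument
#   yield TaggedDocument(words=doc.split(), tags=[self.labels_list[idx]])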
# build the Doc2Vec model for the labeled data, starting at a fixed learning rate
model = gensim.models.Doc2Vec(size=1000, window=8, min_count=4,
                              workers=multiprocessing.cpu_count(),
                              alpha=0.025, min_alpha=0.025)
# build the vocabulary from the sequence of sentences
model.build_vocab(labeledIt)
# train the model on the text corpus, manually decaying the learning rate
for epoch in range(10):
    model.train(labeledIt)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay
    model.train(labeledIt)
# save the model
model.save('Models/labeledDoc2Vec2.model')
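# NOTE: in gensim 2.x and later, train() requires explicit counts and handles
# the epochs itself; a minimal sketch of the equivalent call would be:
#   model.train(labeledIt, total_examples=model.corpus_count, epochs=10)
# which replaces the manual decay loop above.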
# build the Doc2Vec model for the full corpus, starting at a fixed learning rate
model = gensim.models.Doc2Vec(size=1000, window=8, min_count=4,
                              workers=multiprocessing.cpu_count(),
                              alpha=0.025, min_alpha=0.025)
# build the vocabulary from the sequence of sentences
model.build_vocab(it)
# train the model on the text corpus, manually decaying the learning rate
for epoch in range(10):
    model.train(it)
    model.alpha -= 0.002  # decrease the learning rate
    model.min_alpha = model.alpha  # fix the learning rate, no decay
    model.train(it)
# save the model
model.save('Models/Doc2Vec2.model')
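# A minimal usage sketch, assuming the old-gensim API used above; the tags are
# the filenames collected in docLabels.
loaded = gensim.models.Doc2Vec.load('Models/Doc2Vec2.model')
if docLabels:
    # documents most similar to the first training document, looked up by tag
    print(loaded.docvecs.most_similar(docLabels[0]))
# infer a vector for unseen, already-tokenized text
print(loaded.infer_vector(['some', 'new', 'document', 'text']))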