#!/usr/bin/env python

import settings

from copy import deepcopy
from numpy import exp, dot, zeros, outer, random, sqrt, sum as np_sum

FAST_VERSION = -1

IS_DOUBLE = settings.use_double
if IS_DOUBLE:
    from numpy import float64 as REAL
else:
    from numpy import float32 as REAL


def train_sent_vec(model, sent_vec, sentence, alpha, work=None, neu1=None, sent_vec_grad=None):
    if model.sg:
        return train_sent_vec_sg(model, sent_vec, sentence, alpha, work, neu1, sent_vec_grad)
    else:
        return train_sent_vec_cbow(model, sent_vec, sentence, alpha, work, neu1, sent_vec_grad)


def train_sent_vec_sg(model, sent_vec, sentence, alpha, work=None, neu1=None, sent_vec_grad=None):
    """
    Update skip-gram model by training on a single sentence.

    The sentence is a list of Vocab objects (or None, where the corresponding
    word is not in the vocabulary). Called internally from `Sent2Vec.train()`.

    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from word2vec_inner instead.
    """
    w2vmodel = model.w2v
    if model.negative:
        # precompute negative labels
        labels = zeros(model.negative + 1)
        labels[0] = 1.0

    for pos, word in enumerate(sentence):
        if word is None:
            continue  # OOV word in the input sentence => skip
        reduced_window = random.randint(w2vmodel.window)  # `b` in the original word2vec code
        # now go over all words from the (reduced) window, predicting each one in turn
        start = max(0, pos - w2vmodel.window + reduced_window)
        for pos2, word2 in enumerate(sentence[start : pos + w2vmodel.window + 1 - reduced_window], start):
            # don't train on OOV words
            if word2:
                # l1 = w2vmodel.syn0[word.index]
                l1 = sent_vec  # the input layer is the sentence vector, not the word vector
                neu1e = zeros(l1.shape)

                if model.hs:
                    # work on the entire tree at once, to push as much work into numpy's C routines as possible (performance)
                    l2a = deepcopy(w2vmodel.syn1[word2.point])  # 2d matrix, codelen x layer1_size
                    fa = 1.0 / (1.0 + exp(-dot(l1, l2a.T)))  # propagate hidden -> output
                    ga = (1 - word2.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
                    if model.word_learn == 1:
                        w2vmodel.syn1[word2.point] += outer(ga, l1)  # learn hidden -> output
                    neu1e += dot(ga, l2a)  # save error

                if model.negative:
                    # use this word (label = 1) + `negative` other random words not from this sentence (label = 0)
                    word_indices = [word2.index]
                    while len(word_indices) < model.negative + 1:  # match the size of `labels`
                        w = w2vmodel.table[random.randint(w2vmodel.table.shape[0])]
                        if w != word2.index:
                            word_indices.append(w)
                    l2b = w2vmodel.syn1neg[word_indices]  # 2d matrix, k+1 x layer1_size
                    fb = 1. / (1. + exp(-dot(l1, l2b.T)))  # propagate hidden -> output
                    gb = (labels - fb) * alpha  # vector of error gradients multiplied by the learning rate
                    if model.word_learn == 1:
                        w2vmodel.syn1neg[word_indices] += outer(gb, l1)  # learn hidden -> output
                    neu1e += dot(gb, l2b)  # save error

                sent_vec += neu1e  # learn input -> hidden

    return len([word for word in sentence if word is not None])


def train_sent_vec_cbow(model, sent_vec, sentence, alpha, work=None, neu1=None, sent_vec_grad=None):
    """
    Update CBOW model by training on a single sentence.

    The sentence is a list of Vocab objects (or None, where the corresponding
    word is not in the vocabulary). Called internally from `Sent2Vec.train()`.

    This is the non-optimized, Python version. If you have cython installed, gensim
    will use the optimized version from word2vec_inner instead.
    """
    w2vmodel = model.w2v
    if model.negative:
        # precompute negative labels
        labels = zeros(model.negative + 1)
        labels[0] = 1.

    for pos, word in enumerate(sentence):
        if word is None:
            continue  # OOV word in the input sentence => skip
        reduced_window = random.randint(model.window)  # `b` in the original word2vec code
        start = max(0, pos - model.window + reduced_window)
        window_pos = enumerate(sentence[start : pos + model.window + 1 - reduced_window], start)
        word2_indices = [word2.index for pos2, word2 in window_pos if (word2 is not None and pos2 != pos)]
        l1 = np_sum(w2vmodel.syn0[word2_indices], axis=0)  # 1 x layer1_size
        l1 += sent_vec
        if word2_indices and model.cbow_mean:
            l1 /= (len(word2_indices) + 1)  # average over context words + the sentence vector (modified by jmarui)
        neu1e = zeros(l1.shape)

        if model.hs:
            l2a = w2vmodel.syn1[word.point]  # 2d matrix, codelen x layer1_size
            fa = 1. / (1. + exp(-dot(l1, l2a.T)))  # propagate hidden -> output
            ga = (1. - word.code - fa) * alpha  # vector of error gradients multiplied by the learning rate
            if model.word_learn == 1:
                w2vmodel.syn1[word.point] += outer(ga, l1)  # learn hidden -> output
            neu1e += dot(ga, l2a)  # save error

        if model.negative:
            # use this word (label = 1) + `negative` other random words not from this sentence (label = 0)
            word_indices = [word.index]
            while len(word_indices) < model.negative + 1:
                w = w2vmodel.table[random.randint(w2vmodel.table.shape[0])]
                if w != word.index:
                    word_indices.append(w)
            l2b = w2vmodel.syn1neg[word_indices]  # 2d matrix, k+1 x layer1_size
            fb = 1. / (1. + exp(-dot(l1, l2b.T)))  # propagate hidden -> output
            gb = (labels - fb) * alpha  # vector of error gradients multiplied by the learning rate
            if model.word_learn == 1:
                w2vmodel.syn1neg[word_indices] += outer(gb, l1)  # learn hidden -> output
            neu1e += dot(gb, l2b)  # save error

        if model.word_learn == 1:
            w2vmodel.syn0[word2_indices] += neu1e  # learn input -> hidden, for all words in the window separately
        sent_vec += neu1e  # the same error also updates the sentence vector

    return len([word for word in sentence if word is not None])
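

# A hedged sketch of how these routines are typically driven; the loop details
# below (`vocabized_sentences`, `start_alpha`, `min_alpha`, `epochs`) are
# illustrative assumptions, not part of this module:
#
#     total = epochs * len(vocabized_sentences)
#     for epoch in xrange(epochs):
#         for sent_no, sentence in enumerate(vocabized_sentences):
#             progress = float(epoch * len(vocabized_sentences) + sent_no) / total
#             alpha = max(min_alpha, start_alpha * (1.0 - progress))  # linearly decaying learning rate
#             train_sent_vec(model, model.sents[sent_no], sentence, alpha)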


def sentvec_sim(model, vec, num, sims):
    """
    Fill `sims` with the cosine similarity between `vec` and each of the first
    `num` sentence vectors in `model.sents`.
    """
    vec_len_r = 1.0 / sqrt(dot(vec, vec))
    for i in xrange(num):  # iterate over the `num` sentences, not the output array
        vec2 = model.sents[i]
        vec2_len_r = 1.0 / sqrt(dot(vec2, vec2))
        sims[i] = dot(vec2, vec) * vec2_len_r
    sims *= vec_len_r  # fold in the query norm once, outside the loop
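

# Illustrative usage (assumptions: a trained model exposing `sents`, a 2d array
# with one row per sentence; `query` is any vector of the same dimensionality):
#
#     from numpy import empty, argsort
#     sims = empty(len(model.sents), dtype=REAL)
#     sentvec_sim(model, query, len(model.sents), sims)
#     top10 = argsort(sims)[::-1][:10]  # indices of the most similar sentences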