-
Notifications
You must be signed in to change notification settings - Fork 3
Expand file tree
/
Copy pathembeding_generator.py
More file actions
123 lines (90 loc) · 2.96 KB
/
embeding_generator.py
File metadata and controls
123 lines (90 loc) · 2.96 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import numpy as np
# Path of the GloVe embedding text file that is passed to loadGloVe() below.
filename = 'glove.6B.50d.txt'
def loadGloVe(filename):
    """Read a GloVe-style text embedding file.

    Each non-blank line is expected to hold a token followed by its vector
    components, all separated by single spaces.

    Args:
        filename: Path of the embedding file to read.

    Returns:
        A ``(vocab, embd)`` tuple: ``vocab`` is the list of tokens in file
        order and ``embd`` is the matching list of component lists. The
        components are left as strings; converting them is up to the caller.
    """
    vocab = []
    embd = []
    # The context manager closes the handle even if reading fails, and the
    # explicit encoding keeps non-ASCII tokens from depending on the
    # platform's default codec.
    with open(filename, 'r', encoding='utf-8') as file:
        # Stream line by line instead of materializing the whole file.
        for line in file:
            row = line.strip().split(' ')
            if not row[0]:
                # Blank line: it would add an empty token with no vector.
                continue
            vocab.append(row[0])
            embd.append(row[1:])
    print('GloVe Loaded.')
    return vocab, embd
# Parse the GloVe file, then turn the string components into a float32
# matrix with one row per vocabulary entry.
vocab, embd = loadGloVe(filename)
embedding = np.asarray(embd).astype(np.float32)
# The vector dimensionality is the component count of the first entry.
word_vec_dim = len(embd[0])
import csv
import nltk as nlp
from nltk import word_tokenize
import string
# Tokenized review summaries and review bodies; both start out empty.
summaries, texts = [], []
# Built once at import time rather than on every clean() call.
_PRINTABLE_CHARS = frozenset(string.printable)


def clean(text):
    """Lower-case *text* and drop every character outside ``string.printable``.

    Args:
        text: The raw string to normalise.

    Returns:
        A new ``str``. The characters are joined explicitly because on
        Python 3 ``filter`` yields a lazy iterator, not a string, and the
        result of this function is handed to a tokenizer that needs text.
    """
    return ''.join(ch for ch in text.lower() if ch in _PRINTABLE_CHARS)
# Tokenize the summary and body of the first 999 rows of Reviews.csv.
# On Python 3 the csv module needs a text-mode handle opened with newline='';
# a binary ('rb') handle makes DictReader raise on the first row.
# NOTE(review): assumes the CSV is UTF-8 -- confirm. Undecodable bytes are
# replaced instead of raising, since clean() discards everything outside
# string.printable anyway.
with open('Reviews.csv', 'r', newline='', encoding='utf-8',
          errors='replace') as csvfile:
    Reviews = csv.DictReader(csvfile)
    i = 0
    for i, row in enumerate(Reviews, start=1):
        # The counter is checked before the row is used, so row 1000 itself
        # is never processed.
        if i == 1000:
            break
        clean_text = clean(row['Text'])
        clean_summary = clean(row['Summary'])
        print(i)  # progress indicator
        summaries.append(word_tokenize(clean_summary))
        texts.append(word_tokenize(clean_text))
def np_nearest_neighbour(x):
    """Return the row of ``embedding`` most cosine-similar to *x*."""
    # Numerator: dot product of x with every embedding row.
    dot_products = np.sum(np.multiply(embedding, x), axis=1)
    # Denominator: Euclidean norm of x times the norm of each row.
    query_norm = np.sqrt(np.sum(np.square(x), axis=0))
    row_norms = np.sqrt(np.sum(np.square(embedding), axis=1))
    similarities = np.divide(dot_products, np.multiply(query_norm, row_norms))
    best_row = np.argmax(similarities)
    return embedding[best_row]
# Token -> row index, filled from ``vocab`` on the first word2vec() call.
_VOCAB_INDEX = {}


def word2vec(word):
    """Return the embedding row for *word*, or the row for 'unk' if unknown.

    The lookup goes through a dict built once from ``vocab`` instead of
    scanning the list twice (``in`` then ``.index``) on every call. Changes
    made to ``vocab`` after the first call are not picked up.

    Args:
        word: Token to look up.

    Returns:
        The row of ``embedding`` at the token's index in ``vocab``.

    Raises:
        ValueError: If neither *word* nor 'unk' is present in ``vocab``.
    """
    if not _VOCAB_INDEX:
        for position, token in enumerate(vocab):
            # setdefault keeps the first occurrence of a duplicated token,
            # which is what list.index returned.
            _VOCAB_INDEX.setdefault(token, position)
    position = _VOCAB_INDEX.get(word)
    if position is None:
        position = _VOCAB_INDEX.get('unk')
    if position is None:
        raise ValueError(
            "neither %r nor the 'unk' fallback is in vocab" % (word,))
    return embedding[position]
def vec2word(vec):
    """Return the vocabulary token whose embedding row matches *vec*.

    If no row of ``embedding`` equals *vec* exactly, the lookup is repeated
    with the row returned by ``np_nearest_neighbour``.

    Args:
        vec: Array-like vector to translate back into a token.

    Returns:
        The entry of ``vocab`` at the index of the first matching row.
    """
    # Convert once instead of once per row.
    target = np.asarray(vec)
    # np.array_equal is False on a shape mismatch, so rows are only compared
    # when the shapes agree; this also stops the comparison from broadcasting.
    if embedding.ndim == 2 and target.shape == embedding.shape[1:]:
        # One vectorized comparison replaces a Python-level loop over rows.
        hits = np.flatnonzero((embedding == target).all(axis=1))
        if hits.size:
            return vocab[hits[0]]
    return vec2word(np_nearest_neighbour(target))
# word = "king"
# print("Vector representation of '"+str(vec2word(word2vec("kingdom")))+"':\n")
# Print the dot product of the "king" vector with the vector returned for
# "King", and then with itself.
king_vec = np.array(word2vec("king"))
print(np.dot(king_vec, np.array(word2vec("King"))))
print(np.dot(king_vec, king_vec))
# print(np.dot(np.array(word2vec("king")), np.array(word2vec("queen"))))
# Serialize every tokenized summary: one stringified vector per token (with
# the surrounding brackets stripped), an 'eos' vector appended at the end,
# and the pieces joined with ":\n".
vec_summaries = []
for summary in summaries:
    pieces = [str(word2vec(token)).strip("[]") for token in summary]
    pieces.append(str(word2vec('eos')).strip("[]"))
    vec_summaries.append(":\n".join(pieces))
# Serialize every tokenized review body: one stringified vector per token
# (with the surrounding brackets stripped), an 'eos' vector appended at the
# end, and the pieces joined with ":".
vec_texts = []
for text in texts:
    pieces = [str(word2vec(token)).strip('[]') for token in text]
    pieces.append(str(word2vec('eos')).strip("[]"))
    vec_texts.append(":".join(pieces))
import pickle
# Pickle both serialized collections, each to a file named after it.
for out_path, payload in (('vec_summaries', vec_summaries),
                          ('vec_texts', vec_texts)):
    with open(out_path, 'wb') as fp:
        pickle.dump(payload, fp)