Abstractive-Text-Summarization-using-Seq2Seq-RNN/embeding_generator.py at master · nikhibdg/Abstractive-Text-Summarization-using-Seq2Seq-RNN · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
import numpy as np
# Path of the embedding text file; it is handed to loadGloVe() at module level.
filename = 'glove.6B.50d.txt'


def loadGloVe(filename):
    """Read a GloVe-style text embedding file into two parallel lists.

    Every non-blank line is split on single spaces: the first field is the
    token and the remaining fields are its vector components.

    Args:
        filename: Path of the embedding file to read.

    Returns:
        A ``(vocab, embd)`` tuple. ``vocab`` holds the tokens in file order
        and ``embd`` holds, at the same position, the list of component
        strings for that token (conversion to numbers is left to the caller).

    Raises:
        IOError / OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # io.open takes ``encoding`` on both Python 2 and Python 3, so the file
    # is decoded the same way regardless of the platform default encoding.
    import io

    vocab = []
    embd = []
    # The context manager closes the handle even if a line fails to parse,
    # and iterating the handle avoids holding the whole file in memory.
    with io.open(filename, 'r', encoding='utf-8') as glove_file:
        for line in glove_file:
            row = line.strip().split(' ')
            if not row[0]:
                # Blank line: skipping it avoids an empty token paired with
                # an empty vector, which would make the rows ragged.
                continue
            vocab.append(row[0])
            embd.append(row[1:])
    print('GloVe Loaded.')
    return vocab, embd

# Load the tokens and their raw (string) vector components once, at import.
vocab, embd = loadGloVe(filename)

# Turn the nested lists of strings into a float32 array; row i goes with
# vocab[i] because both lists were filled in the same order.
embedding = np.asarray(embd).astype(np.float32)

# Number of components per vector, taken from the first loaded row.
word_vec_dim = len(embd[0])

import csv
import nltk as nlp
from nltk import word_tokenize
import string

# Tokenised summaries and review texts; both are filled by the CSV loop
# further down, one entry per processed row.
summaries, texts = [], []

def clean(text):
    """Lower-case *text* and drop every character not in ``string.printable``.

    Args:
        text: The string to normalise.

    Returns:
        A new string containing the lower-cased characters of *text* that
        belong to ``string.printable``, in their original order.
    """
    printable = set(string.printable)
    # Join explicitly: on Python 3 ``filter`` returns a lazy iterator rather
    # than a string, so callers expecting text would get a filter object.
    return ''.join(ch for ch in text.lower() if ch in printable)

# Read Reviews.csv and collect tokenised 'Summary' and 'Text' columns.
import sys

# The csv module expects a binary-mode file on Python 2 but a text-mode file
# opened with newline='' on Python 3; 'rb' on Python 3 makes DictReader fail
# because it receives bytes instead of strings.
if sys.version_info[0] >= 3:
    # NOTE(review): assumes Reviews.csv is UTF-8 encoded -- confirm.
    csvfile = open('Reviews.csv', 'r', newline='', encoding='utf-8')
else:
    csvfile = open('Reviews.csv', 'rb')

with csvfile:
    Reviews = csv.DictReader(csvfile)
    i = 0
    for row in Reviews:
        i += 1
        # The counter is bumped before this check, so the loop stops on the
        # 1000th row without processing it: at most 999 reviews are kept.
        if i == 1000:
            break

        clean_text = clean(row['Text'])
        clean_summary = clean(row['Summary'])
        print(i)  # progress indicator: 1-based index of the row just cleaned
        summaries.append(word_tokenize(clean_summary))
        texts.append(word_tokenize(clean_text))

def np_nearest_neighbour(x):
    """Return the row of ``embedding`` with the highest cosine similarity to *x*.

    Args:
        x: Query vector. Presumably 1-D with as many components as an
            ``embedding`` row -- verify against callers.

    Returns:
        The ``embedding`` row at the index where the cosine similarity with
        *x* is largest (as selected by ``np.argmax``).
    """
    # Numerator: dot product of x with every embedding row.
    dot_products = np.sum(np.multiply(embedding, x), 1)

    # Denominator: Euclidean length of x times the length of each row.
    query_norm = np.sqrt(np.sum(np.square(x), 0))
    row_norms = np.sqrt(np.sum(np.square(embedding), 1))
    norm_products = np.multiply(query_norm, row_norms)

    similarity = np.divide(dot_products, norm_products)
    best_row = np.argmax(similarity)
    return embedding[best_row]


# Token -> row index map for word2vec(), filled on its first call so that
# lookups do not rescan the ``vocab`` list every time.
_word2vec_index = {}


def word2vec(word):
    """Return the embedding row for *word*, or the row of 'unk' if unknown.

    Args:
        word: Token to look up in ``vocab``.

    Returns:
        The ``embedding`` row at the position of *word* in ``vocab``; when
        *word* is not in ``vocab``, the row at the position of ``'unk'``.

    Raises:
        ValueError: If *word* is unknown and ``'unk'`` is not in ``vocab``.

    Note:
        The index is built from ``vocab`` once; changes made to ``vocab``
        after the first call are not picked up.
    """
    if not _word2vec_index:
        # Walk backwards so that, for a token listed more than once, the
        # earliest position wins -- the same answer list.index() gives.
        for position in range(len(vocab) - 1, -1, -1):
            _word2vec_index[vocab[position]] = position

    position = _word2vec_index.get(word)
    if position is None:
        position = _word2vec_index.get('unk')
        if position is None:
            # Same exception type the list.index('unk') fallback raised.
            raise ValueError("'unk' is not in list")
    return embedding[position]

def vec2word(vec):
    """Return the ``vocab`` token whose embedding row equals *vec*.

    Rows are compared with ``np.array_equal``. When no row matches, the
    function calls itself on the row returned by ``np_nearest_neighbour``
    for *vec*.

    Args:
        vec: Vector (array-like) to translate back into a token.

    Returns:
        The token stored in ``vocab`` at the index of the matching row.
    """
    target = np.asarray(vec)
    for index, row in enumerate(embedding):
        if np.array_equal(row, target):
            return vocab[index]
    # No exact match: retry with the most similar stored row.
    return vec2word(np_nearest_neighbour(target))

# Quick sanity check of the lookup: print the dot product of the vector for
# "king" with the vector for "King", then with itself. word2vec() falls back
# to the 'unk' row for tokens missing from vocab, so the first value shows
# how a capitalised token is treated.
print(np.dot(word2vec("king"), word2vec("King")))
print(np.dot(word2vec("king"), word2vec("king")))

# One string per summary: every token's vector rendered with str() (outer
# brackets stripped), followed by the vector for 'eos', joined with ":\n".
# NOTE(review): the texts loop joins with ":" only -- confirm the differing
# separator is intended.
vec_summaries = []

for summary in summaries:
    rendered = [str(word2vec(token)).strip("[]") for token in summary]
    rendered.append(str(word2vec('eos')).strip("[]"))
    vec_summary = ":\n".join(rendered)
    vec_summaries.append(vec_summary)


# One string per review text: every token's vector rendered with str()
# (outer brackets stripped), followed by the vector for 'eos', joined
# with ":".
vec_texts = []

for text in texts:
    rendered = [str(word2vec(token)).strip('[]') for token in text]
    rendered.append(str(word2vec('eos')).strip("[]"))
    vec_text = ":".join(rendered)
    vec_texts.append(vec_text)


import pickle
# Write each list of rendered vectors to a pickle file of the same name.
for out_path, payload in (('vec_summaries', vec_summaries),
                          ('vec_texts', vec_texts)):
    with open(out_path, 'wb') as fp:
        pickle.dump(payload, fp)