'''
Written by Austin Walters
Last Edit: January 2, 2019
For use on https://austingwalters.com
Methods to ingest and encode sentence-type data
'''
from __future__ import print_function
import os
import re
import numpy as np
import nltk
import json
import random
from bs4 import BeautifulSoup
def encode_phrases(comments, word_encoding=None, add_pos_tags_flag=False):
    '''
    Encodes comments into a vectorized list
    INPUTS:
    :param comments: Array of strings containing the contents of comments
    :param word_encoding: Prior hashmap mapping words to encodings
    :param add_pos_tags_flag: Flag to add parts-of-speech tags after each word
    RETURNS:
    :return encoded_comments: Comments as a vectorized list of lists,
                              with words and punctuation vectorized
    :return word_encoding: hashmap of word to embedded value
    :return word_decoding: hashmap of embedded value to word
    '''
    word_decoding = {0: ' '}  # maps embedded value -> word
    count = 1
    encoded_comments = []
    if word_encoding is None:
        word_encoding = {' ': 0}  # maps word -> embedded value
else:
# Will preload word_decoding if word_encoding exists
for word in word_encoding:
word_decoding[word_encoding[word]] = word
for comment in comments:
encoded_comment = []
comment = nltk.word_tokenize(comment)
# Create a POS sentence: word POS_tag word POS_tag, etc.
if add_pos_tags_flag:
comment = nltk.pos_tag(comment)
comment = [ele for word_tuple in comment for ele in word_tuple]
for word in comment:
word = word.lower() # Lowercase word for mapping
if word not in word_encoding:
word_encoding[word] = count
count += 1
            if word_encoding[word] not in word_decoding:
                word_decoding[word_encoding[word]] = word
encoded_comment.append(word_encoding[word])
encoded_comments.append(encoded_comment)
return encoded_comments, word_encoding, word_decoding
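
# Illustrative usage of encode_phrases (a sketch, not part of the original
# pipeline; assumes NLTK's 'punkt' tokenizer data is available via
# nltk.download('punkt')):
#
#   encoded, w_enc, w_dec = encode_phrases(["The sky is blue."])
#   # encoded -> [[1, 2, 3, 4, 5]], with w_enc mapping 'the' -> 1,
#   # 'sky' -> 2, 'is' -> 3, 'blue' -> 4, '.' -> 5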
def decode_phrases(encoded_comments, word_decoding):
    '''
    Decodes comments back to an array of words
    INPUTS:
    :param encoded_comments: Vectorized list of embedded words
                             and punctuation, created by the
                             encode_phrases function
    :param word_decoding: Mapping from word embedding to words
    RETURNS:
    :return decoded_comments: Comments as English words (still in a list)
    '''
decoded_comments = []
for encoded_comment in encoded_comments:
decoded_comment = []
for encoded_word in encoded_comment:
word = word_decoding[encoded_word]
decoded_comment.append(word)
decoded_comments.append(" ".join(decoded_comment))
return decoded_comments
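
# Round-trip sketch (illustrative; decoded tokens come back lowercased and
# space-separated, so punctuation is detached from words):
#
#   encoded, w_enc, w_dec = encode_phrases(["Is the sky blue?"])
#   decode_phrases(encoded, w_dec)  # -> ['is the sky blue ?']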
def get_custom_test_comments():
'''
    Returns a small hand-labeled test set which is easy to validate manually
'''
print('\nCreating Manual Test...')
test_comments = [
"This is a stupid example.",
"This is another statement, perhaps this will trick the network",
"I don't understand",
"What's up?",
"open the app",
"This is another example",
"Do what I tell you",
"come over here and listen",
"how do you know what to look for",
"Remember how good the concert was?",
"Who is the greatest basketball player of all time?",
"Eat your cereal.",
"Usually the prior sentence is not classified properly.",
"Don't forget about your homework!",
"Can the model identify a sentence without a question mark",
"Everything speculated here is VC money and financial bubble with unrelaible financial values. Zomato, uber, paytm, flipkart throw discounts at the rate of losses. May be few can survive at the end. This hurts a lot for SMB too.",
"I am trying to keep tabs on electric two-wheeler startup industry in India. Ather energy is emerging as a big name. Anyone knows how they are doing?",
"generally a pretty intuitive way to accomplish a task. Want to trash an app Drag it to the trash Want to print a PDF",
"Make sure ownership is clear and minimizing opportunities for such problematic outcomes in the second place",
"Stop the video and walk away."
]
test_comments_category = [
"statement",
"statement",
"statement",
"question",
"command",
"statement",
"command",
"command",
"question",
"question",
"question",
"command",
"statement",
"command",
"question",
"statement",
"question",
"question",
"statement",
"command"
]
return test_comments, test_comments_category
def gen_test_comments(max_samples=999999999):
"""
Generates sample dataset from parsing
the SQuAD dataset and combining it
with the SPAADIA dataset.
The data is then shuffled, and two
arrays are returned. One contains
    comments, the other categories.
The indexes for classification
in both arrays match, such that
comment[i] correlates with category[i].
Types of sentences:
Statement (Declarative Sentence)
Question (Interrogative Sentence)
Exclamation (Exclamatory Sentence)
Command (Imperative Sentence)
Current Counts:
Command: 1264
Statement: 81104
Question: 131219
Ideally, we will improve the command and
exclamation data samples to be at least
10% of the overall dataset.
"""
tagged_comments = {}
with open('data/train-v2.0.json', 'r') as qa:
parsed = json.load(qa)
statement_count = 0
question_count = 0
command_count = 0
# Pulls all data from the SQuAD 2.0 Dataset, adds to our dataset
for i in range(len(parsed["data"])):
for j in range(len(parsed["data"][i]["paragraphs"])):
statements = parsed["data"][i]["paragraphs"][j]["context"]
if random.randint(0,9) % 4 == 0:
statement = statements
if statement_count < max_samples and statement not in tagged_comments:
tagged_comments[statement] = "statement"
statement_count += 1
else:
for statement in statements.split("."):
if len(statement) <= 2:
continue
if random.randint(0,9) % 3 == 0:
statement += "."
if statement_count < max_samples and statement not in tagged_comments:
tagged_comments[statement] = "statement"
statement_count += 1
for k in range(len(parsed["data"][i]["paragraphs"][j]["qas"])):
question = parsed["data"][i]["paragraphs"][j]["qas"][k]["question"]
if random.randint(0,9) % 2 == 0:
question = question.replace("?", "")
if random.randint(0,9) % 2 == 0:
question = statements.split(".")[0]+". "+question
if question_count < max_samples and question not in tagged_comments:
tagged_comments[question] = "question"
question_count += 1
# Pulls all data from the SPAADIA dataset, adds to our dataset
for doc in os.listdir('data/SPAADIA'):
with open('data/SPAADIA/' + doc, 'r') as handle:
conversations = BeautifulSoup(handle, features="xml")
for imperative in conversations.findAll("imp"):
imperative = imperative.get_text().replace("\n", "")
if command_count < max_samples and imperative not in tagged_comments:
tagged_comments[imperative] = "command"
command_count += 1
for declarative in conversations.findAll("decl"):
declarative = declarative.get_text().replace("\n", "")
if statement_count < max_samples and declarative not in tagged_comments:
tagged_comments[declarative] = "statement"
statement_count += 1
for question in conversations.findAll("q-yn"):
question = question.get_text().replace("\n", "")
if question_count < max_samples and question not in tagged_comments:
tagged_comments[question] = "question"
question_count += 1
for question in conversations.findAll("q-wh"):
question = question.get_text().replace("\n", "")
if question_count < max_samples and question not in tagged_comments:
tagged_comments[question] = "question"
question_count += 1
    # Pulls all the data from the manually generated imperatives dataset
with open('data/imperatives.csv', 'r') as imperative_file:
for row in imperative_file:
imperative = row.replace("\n", "")
if command_count < max_samples and imperative not in tagged_comments:
tagged_comments[imperative] = "command"
command_count += 1
            # Also add a variant with punctuation (other than periods) stripped
            imperative = re.sub(r'[^a-zA-Z0-9 .]', '', row)
if command_count < max_samples and imperative not in tagged_comments:
tagged_comments[imperative] = "command"
command_count += 1
test_comments = []
test_comments_category = []
# Ensure random ordering
comments = list(tagged_comments.items())
random.shuffle(comments)
###
### Balance the dataset
###
local_statement_count = 0
local_question_count = 0
local_command_count = 0
min_count = min([question_count, statement_count, command_count])
for comment, category in comments:
        # Dataset balancing (currently disabled); kept for reference
        '''
        if category == "statement":
            if local_statement_count > min_count:
                continue
            local_statement_count += 1
        elif category == "question":
            if local_question_count > min_count:
                continue
            local_question_count += 1
        elif category == "command":
            if local_command_count > min_count:
                continue
            local_command_count += 1
        '''
test_comments.append(comment.rstrip())
test_comments_category.append(category)
print("\n-------------------------")
print("command", command_count)
print("statement", statement_count)
print("question", question_count)
print("-------------------------\n")
return test_comments, test_comments_category
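
# Illustrative call (a sketch; requires data/train-v2.0.json, the
# data/SPAADIA directory, and data/imperatives.csv to exist locally):
#
#   comments, categories = gen_test_comments(max_samples=10000)
#   # comments[i] is a raw sentence; categories[i] is one of
#   # "statement", "question", or "command"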
def import_embedding(embedding_name="data/default"):
'''
    Import word embedding from a giant json document
'''
if not embedding_name:
return None, None
file_flag = os.path.isfile(embedding_name+"_word_encoding.json")
file_flag &= os.path.isfile(embedding_name+"_cat_encoding.json")
if not file_flag:
return None, None
word_encoding = {}
with open(embedding_name+"_word_encoding.json") as word_embedding:
word_encoding = json.load(word_embedding)
category_encoding = {}
with open(embedding_name+"_cat_encoding.json") as cat_embedding:
category_encoding = json.load(cat_embedding)
return word_encoding, category_encoding
def export_embedding(word_encoding, category_encoding,
embedding_name="data/default"):
'''
Export word embedding to a giant json document
'''
if not embedding_name \
or (not word_encoding) or 2 > len(word_encoding) \
or (not category_encoding) or 2 > len(category_encoding):
return
with open(embedding_name+"_word_encoding.json", "w") as embedding:
embedding.write(json.dumps(word_encoding))
with open(embedding_name+"_cat_encoding.json", "w") as embedding:
embedding.write(json.dumps(category_encoding))
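
# Round-trip sketch for the embedding cache (illustrative; writes
# data/default_word_encoding.json and data/default_cat_encoding.json,
# then reads them back):
#
#   export_embedding({' ': 0, 'hello': 1}, {' ': 0, 'question': 1})
#   w_enc, c_enc = import_embedding()  # -> the two dicts above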
def encode_data(test_comments, test_comments_category,
data_split=0.8, embedding_name=None, add_pos_tags_flag=False):
print("Encoding Data...")
# Import prior mapping
word_encoding, category_encoding = None, None
if embedding_name:
word_encoding, category_encoding = import_embedding(embedding_name)
# Encode comments word + punc, using prior mapping or make new
encoded_comments, word_encoding, \
word_decoding = encode_phrases(test_comments, word_encoding,
add_pos_tags_flag=add_pos_tags_flag)
encoded_categories, categories_encoding, \
categories_decoding = encode_phrases([" ".join(test_comments_category)],
category_encoding,
add_pos_tags_flag=False)
print("Embedding Name", embedding_name)
if embedding_name:
export_embedding(word_encoding, categories_encoding,
embedding_name=embedding_name)
training_sample = int(len(encoded_comments) * data_split)
print(np.array(encoded_categories[0]))
x_train = np.array(encoded_comments[:training_sample])
x_test = np.array(encoded_comments[training_sample:])
y_train = np.array(encoded_categories[0][:training_sample])
y_test = np.array(encoded_categories[0][training_sample:])
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
return x_train, x_test, y_train, y_test
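
# Illustrative usage of encode_data (a sketch; note the returned y values
# are integer category embeddings, not one-hot vectors):
#
#   comments, categories = get_custom_test_comments()
#   x_train, x_test, y_train, y_test = encode_data(comments, categories,
#                                                  data_split=0.8,
#                                                  embedding_name=None)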
def load_encoded_data(data_split=0.8, embedding_name="data/default", pos_tags=False):
'''
Loads and encodes embeddings
If data has already been encoded, uses the pre-encoded data
If data has not been encoded, encodes, then saves for future use
returns split training and testing data
'''
file_name = embedding_name+"_pre_encoded_comments.csv"
if pos_tags:
file_name = embedding_name+"_pre_encoded_pos_tagged_comments.csv"
encoded_comments = []
encoded_categories = []
# If the encoded results are not available for quick load,
# create a cache in the data folder of encoded data to pull quickly
if not os.path.isfile(file_name):
print("No Cached Data Found...")
print("Loading Data...")
test_comments, test_comments_category = gen_test_comments()
x_train, x_test, y_train, y_test = encode_data(test_comments,
test_comments_category,
data_split=data_split,
                                                       embedding_name=embedding_name,
add_pos_tags_flag=pos_tags)
for i in range(len(y_train)):
encoded_comments.append([y_train[i], x_train[i]])
for i in range(len(y_test)):
encoded_comments.append([y_test[i], x_test[i]])
with open(file_name, 'w') as encoding_file:
for row in encoded_comments:
encoding_file.write(str(row[0]) + "|||||" + str(row[1]))
encoding_file.write('\n')
else:
print("Loading Data...")
encoded_comments = []
encoded_categories = []
with open(file_name, 'r') as encoding_file:
for row in encoding_file:
row = row.split("|||||")
row[0] = int(row[0])
row[1] = row[1].replace("\n", "").replace("[", "").replace("]", "")
row[1] = np.array(row[1].split(",")).astype('int').tolist()
encoded_categories.append(row[0])
encoded_comments.append(row[1])
training_sample = int(len(encoded_comments) * data_split)
x_train = np.array(encoded_comments[:training_sample])
x_test = np.array(encoded_comments[training_sample:])
y_train = np.array(encoded_categories[:training_sample])
y_test = np.array(encoded_categories[training_sample:])
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
return x_train, x_test, y_train, y_test
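
if __name__ == '__main__':
    # Minimal smoke test (an illustrative sketch, not part of the original
    # module): round-trip the hand-labeled sample set through the encoder
    # and decoder. Assumes NLTK's 'punkt' tokenizer data is available.
    sample_comments, sample_categories = get_custom_test_comments()
    encoded, w_enc, w_dec = encode_phrases(sample_comments)
    decoded = decode_phrases(encoded, w_dec)
    for original, restored in zip(sample_comments[:3], decoded[:3]):
        print(original, '->', restored)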