-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathprepare_yelp_char.py
115 lines (86 loc) · 3.93 KB
/
prepare_yelp_char.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
"""
prepare_yelp_char.py
description: convert the Yelp review datasets ("useful", "funny", "cool") to character-level representations and save them as .npy files for training convolutional recurrent architectures over characters
"""
from nlpdatahandlers import YelpDataHandler
import cPickle as pickle
import logging
import numpy as np
from textclf.wordvectors.char import CharMapper
# Format applied to every message emitted through log().
LOGGER_PREFIX = ' %s'
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


def log(msg, logger=logger):
    """Emit ``msg`` at INFO level, rendered through ``LOGGER_PREFIX``.

    Args:
        msg: object to log; it is rendered with ``%s``.
        logger: logger to write to; defaults to this module's logger.
    """
    # Hand msg to the logging machinery as an argument instead of
    # pre-formatting with ``LOGGER_PREFIX % msg``: the eager form raises
    # TypeError when msg is a tuple (the tuple is unpacked as the format
    # arguments) and builds the string even when INFO is disabled.
    logger.info(LOGGER_PREFIX, msg)
# Directory that holds the train / dev / test files for every vote type.
_YELP_DIR = '../yelp-dataset/'

# NOTE(review): the numeric suffix in each file name presumably is the number
# of reviews in that dataset -- confirm against the data.

# "useful" votes
YELP_USEFUL_TRAIN = _YELP_DIR + 'TrainSet_useful_185292'
YELP_USEFUL_DEV = _YELP_DIR + 'DevSet_useful_185292'
YELP_USEFUL_TEST = _YELP_DIR + 'TestSet_useful_185292'

# "funny" votes
YELP_FUNNY_TRAIN = _YELP_DIR + 'TrainSet_funny_75064'
YELP_FUNNY_DEV = _YELP_DIR + 'DevSet_funny_75064'
YELP_FUNNY_TEST = _YELP_DIR + 'TestSet_funny_75064'

# "cool" votes
YELP_COOL_TRAIN = _YELP_DIR + 'TrainSet_cool_88698'
YELP_COOL_DEV = _YELP_DIR + 'DevSet_cool_88698'
YELP_COOL_TEST = _YELP_DIR + 'TestSet_cool_88698'

# Settings forwarded to the character-level conversion as chars_per_word,
# words_per_document and prepend respectively.
CHARACTERS_PER_WORD = 15
WORDS_PER_DOCUMENT = 300
PREPEND = False
def main():
    """Build and save character-level Yelp datasets for every vote type.

    For each of the "useful", "funny" and "cool" datasets this loads the
    reviews and labels through ``YelpDataHandler.get_data``, converts the
    reviews with ``to_char_level_idx`` and writes four ``.npy`` files
    (train/test X and y) relative to the current working directory.
    """
    log('Initializing CharMapper')
    cm = CharMapper()
    yelp = YelpDataHandler()

    def to_char_level(reviews):
        # Single place for the conversion settings so the train and test
        # splits are always converted with identical parameters.
        return yelp.to_char_level_idx(reviews,
                                      char_container=cm,
                                      chars_per_word=CHARACTERS_PER_WORD,
                                      words_per_document=WORDS_PER_DOCUMENT,
                                      prepend=PREPEND)

    def get_yelp_char(train_reviews, test_reviews):
        """Return the (train, test) reviews converted to character level."""
        log('Converting to character level representations')
        log(' --> Starting Training Data...')
        train_reviews = to_char_level(train_reviews)
        log(' --> Training Data Complete')
        log(' --> Starting Testing Data...')
        test_reviews = to_char_level(test_reviews)
        log(' --> Testing Data Complete')
        return train_reviews, test_reviews

    # (tag, train path, dev path, test path); the tag is used both in the log
    # message and in the output file names. Processing order is preserved:
    # useful, funny, cool.
    datasets = (
        ('useful', YELP_USEFUL_TRAIN, YELP_USEFUL_DEV, YELP_USEFUL_TEST),
        ('funny', YELP_FUNNY_TRAIN, YELP_FUNNY_DEV, YELP_FUNNY_TEST),
        ('cool', YELP_COOL_TRAIN, YELP_COOL_DEV, YELP_COOL_TEST),
    )

    for tag, train_path, dev_path, test_path in datasets:
        log('Creating "%s" reviews sentence-datasets' % tag)
        (train_reviews, train_labels, test_reviews, test_labels) = \
            yelp.get_data(train_path, dev_path, test_path)
        train_reviews, test_reviews = get_yelp_char(train_reviews, test_reviews)

        # -- training data save
        np.save('Yelp_%s_sentences_train_char_X.npy' % tag, train_reviews)
        np.save('Yelp_%s_sentences_train_char_y.npy' % tag, train_labels)

        # -- testing data save
        np.save('Yelp_%s_sentences_test_char_X.npy' % tag, test_reviews)
        np.save('Yelp_%s_sentences_test_char_y.npy' % tag, test_labels)


if __name__ == '__main__':
    main()