import numpy as np
import nltk
import sklearn
import datetime
from bs4 import BeautifulSoup
import re
import random
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.feature_selection import chi2
from sklearn.feature_selection import SelectKBest
from sklearn.base import clone
imdb="IMDb"
# record the running time of the program
start = datetime.datetime.now()
nltk.download('stopwords') # If needed
# Preprocessor for the vectorizers: strip HTML markup and lowercase the text.
# (A custom preprocessor replaces the vectorizers' built-in lowercasing step,
# so the lowercasing has to happen here.)
def remove_html(s):
    # BeautifulSoup(s, 'html.parser').get_text() is an alternative that gives
    # the same result on this dataset.
    return re.sub('<[^>]*>', '', s).lower()
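# A quick sanity check of the preprocessor (illustrative, not part of the
# original pipeline):
assert remove_html("a great <br />movie") == "a great movie"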
# Read the positive and negative review files into one labelled dataset
# (1 = positive, 0 = negative)
def preprocess(path_pos, path_neg):
    with open(path_pos, encoding='utf-8') as f:
        dataset_file_pos = f.read().split("\n")
    with open(path_neg, encoding='utf-8') as f:
        dataset_file_neg = f.read().split("\n")
    dataset_file_full = []
    for pos_review in dataset_file_pos:
        dataset_file_full.append((pos_review, 1))
    for neg_review in dataset_file_neg:
        dataset_file_full.append((neg_review, 0))
    return dataset_file_full
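# The review files are assumed to hold one review per line (implied by the
# split("\n") above); e.g. a two-line positive file yields
# [(review_1, 1), (review_2, 1)].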
# Randomly shuffle the labelled dataset, then split it into texts and labels
def random_shuffle(dataset_file_full):
    random.shuffle(dataset_file_full)
    dataset = []
    Y = []
    for instance in dataset_file_full:
        dataset.append(instance[0])
        Y.append(instance[1])
    return dataset, Y
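# For example (illustrative), [("good film", 1), ("bad film", 0)] could come
# back as (["bad film", "good film"], [0, 1]) after shuffling.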
# Read in the training data set
path_pos = './' + imdb + '/train/imdb_train_pos.txt'
path_neg = './' + imdb + '/train/imdb_train_neg.txt'
training_full = preprocess(path_pos, path_neg)
# Read in the test data set
path_pos = './' + imdb + '/test/imdb_test_pos.txt'
path_neg = './' + imdb + '/test/imdb_test_neg.txt'
test_full = preprocess(path_pos, path_neg)
# Read in the dev data set
path_pos = './' + imdb + '/dev/imdb_dev_pos.txt'
path_neg = './' + imdb + '/dev/imdb_dev_neg.txt'
dev_full = preprocess(path_pos, path_neg)
# Build the stopword set from NLTK's English list plus a few punctuation tokens
stopwords = set(nltk.corpus.stopwords.words('english'))
stopwords.add(".")
stopwords.add(",")
stopwords.add("--")
stopwords.add("``")
# Fit classifier clf on the k best features chosen by a chi-squared test
def train_classifier(clf, X_train, Y_train, k=500):
    clf = clone(clf)  # fit a fresh copy so repeated calls do not share one model
    chi2_analysis = SelectKBest(chi2, k=k).fit(X_train, Y_train)
    X_train_new = chi2_analysis.transform(X_train)
    clf.fit(np.asarray(X_train_new), np.asarray(Y_train))
    return clf, chi2_analysis
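# For example, with num_features=1000 and k=500 the chi-squared step reduces
# X_train from shape (n_reviews, 1000) to (n_reviews, 500) before fitting.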
# Print the classification report and confusion matrix for the test set
# (uses the module-level test_set and Y_test; show_report=1 prints the report)
def get_res_test(vectorizer, classifier, chi2_analysis, show_report):
    X_test = vectorizer.transform(test_set).toarray()
    Y_test_gold = np.asarray(Y_test)
    X_test = np.asarray(X_test)
    Y_test_predictions = classifier.predict(chi2_analysis.transform(X_test))
    if show_report == 1:
        print(classification_report(Y_test_gold, Y_test_predictions, digits=4))
        # Confusion matrix layout: rows are gold labels, columns are predictions
        print('pred_0 pred_1')
        print('gold_0')
        print('gold_1')
        print(confusion_matrix(Y_test_gold, Y_test_predictions))
    # print(datetime.datetime.now() - start)
    return X_test
# Compute and print the accuracy on the development set
def get_res_dev(vectorizer, classifier, chi2_analysis):
    X_dev = vectorizer.transform(dev_set).toarray()
    Y_dev_gold = np.asarray(Y_dev)
    X_dev = np.asarray(X_dev)
    Y_dev_predictions = classifier.predict(chi2_analysis.transform(X_dev))
    accuracy = accuracy_score(Y_dev_gold, Y_dev_predictions)
    print(prefix + "Accuracy" + ',' + str(round(accuracy, 5)) + ','
          + vectorizer.__class__.__name__ + ',' + classifier.__class__.__name__)
    # print(datetime.datetime.now() - start)
    return accuracy
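# Each call prints one CSV-like line, e.g. (accuracy value illustrative):
#   Accuracy,0.85123,TfidfVectorizer,SVC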
training_set, Y_train = random_shuffle(training_full)
test_set, Y_test = random_shuffle(test_full)
dev_set, Y_dev = random_shuffle(dev_full)
# Number of vectorizer features and number kept by the chi-squared test
num_features = 1000
num_features_chi2 = 500
svm_clf = sklearn.svm.SVC(kernel="linear", gamma='auto')
prefix = ''
# Create the vectorizers for the three feature sets.
# (Lowercasing happens inside remove_html: a custom preprocessor replaces the
# vectorizers' own lowercase step, so passing lowercase=True would be ignored.)
count_vectorizer_wobs = CountVectorizer(
    preprocessor=remove_html,
    stop_words=stopwords,
    ngram_range=(1, 1),
    max_features=num_features)
count_vectorizer_bi = CountVectorizer(
    preprocessor=remove_html,
    stop_words=stopwords,
    ngram_range=(2, 2),
    max_features=num_features)
tfidf_vectorizer = TfidfVectorizer(
    preprocessor=remove_html,
    stop_words=stopwords,
    ngram_range=(1, 1),
    max_features=num_features)
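# Note: the .toarray() calls below densify the document-term matrices. chi2,
# SelectKBest and these classifiers also accept scipy sparse input directly,
# which would save considerable memory at larger max_features.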
# Feature 1: bag of words (wobs)
X_train1 = count_vectorizer_wobs.fit_transform(training_set).toarray()
svm_clf1, chi2_analysis = train_classifier(svm_clf, X_train1, Y_train, num_features_chi2)
get_res_dev(count_vectorizer_wobs, svm_clf1, chi2_analysis)
# Feature 2: bigrams
X_train2 = count_vectorizer_bi.fit_transform(training_set).toarray()
svm_clf2, chi2_analysis = train_classifier(svm_clf, X_train2, Y_train, num_features_chi2)
get_res_dev(count_vectorizer_bi, svm_clf2, chi2_analysis)
# Feature 3: tf-idf
X_train3 = tfidf_vectorizer.fit_transform(training_set).toarray()
svm_clf3, chi2_analysis = train_classifier(svm_clf, X_train3, Y_train, num_features_chi2)
get_res_dev(tfidf_vectorizer, svm_clf3, chi2_analysis)
# Combine the wobs and tf-idf feature vectors (evaluated on the test set here)
svm_clf, chi2_analysis = train_classifier(svm_clf, np.hstack((X_train1, X_train3)), Y_train, num_features_chi2)
X_test = np.hstack((count_vectorizer_wobs.transform(test_set).toarray(),
                    tfidf_vectorizer.transform(test_set).toarray()))
Y_test_gold = np.asarray(Y_test)
X_test = np.asarray(X_test)
Y_test_predictions = svm_clf.predict(chi2_analysis.transform(X_test))
accuracy = accuracy_score(Y_test_gold, Y_test_predictions)
print(prefix + "Accuracy" + ',' + str(round(accuracy, 5)) + ',' + 'wobs+tf-idf')
# Get the wobs counts and re-weight them with a tf-idf transform
X_train = count_vectorizer_wobs.fit_transform(training_set).toarray()
trans = TfidfTransformer()
X_train = trans.fit_transform(X_train).toarray()
svm_clf, chi2_analysis = train_classifier(svm_clf, X_train, Y_train, num_features_chi2)
# Caveat: get_res_dev re-vectorizes the dev set with raw counts only, so the
# TfidfTransformer weighting is not applied on the dev side here
get_res_dev(count_vectorizer_wobs, svm_clf, chi2_analysis)
# tf-idf with (1, 3)-grams
tfidf_vectorizer.ngram_range = (1, 3)
X_train3 = tfidf_vectorizer.fit_transform(training_set).toarray()
svm_clf3, chi2_analysis = train_classifier(svm_clf, X_train3, Y_train, num_features_chi2)
get_res_dev(tfidf_vectorizer, svm_clf3, chi2_analysis)
# The best feature set above is tf-idf with ngram_range (1, 3), so keep it
tfidf_vectorizer.ngram_range = (1, 3)
# Compare different models on the tf-idf features using the development set
log_clf = LogisticRegression(solver="liblinear")
rnd_clf = RandomForestClassifier(n_estimators=50)
svm_clf = sklearn.svm.SVC(kernel="linear", gamma='auto')
voting_clf = VotingClassifier(
    estimators=[('1', svm_clf), ('2', log_clf), ('3', rnd_clf)],
    voting='hard')
voting_clf_svc = VotingClassifier(
    estimators=[('1', svm_clf1), ('2', svm_clf2), ('3', svm_clf3)],
    voting='hard')
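# With voting='hard' the ensemble predicts the majority label of its members:
# e.g. member predictions (1, 0, 1) yield 1. Note that VotingClassifier clones
# and refits its estimators, so voting_clf_svc does not reuse the earlier fits
# of svm_clf1-3.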
X_train = tfidf_vectorizer.fit_transform(training_set).toarray()
chi2_analysis = SelectKBest(chi2, k=num_features_chi2).fit(X_train, Y_train)
X_train_new = chi2_analysis.transform(X_train)
for clf in (log_clf, rnd_clf, svm_clf, voting_clf, voting_clf_svc):
    clf.fit(np.asarray(X_train_new), np.asarray(Y_train))
    get_res_dev(tfidf_vectorizer, clf, chi2_analysis)
# The best model above is the voting_clf ensemble.
# Tune voting_clf by trying different num_features / num_features_chi2 values
# with tf-idf (1, 3)-gram features.
best_result = 0
num_features_array = list(range(500, 5001, 500)) + list(range(6000, 10001, 1000)) + [20000, 30000]
for num_features in num_features_array:
    num_features_chi2_array = [100] + list(range(500, num_features + 1, 500))
    tfidf_vectorizer.max_features = num_features
    X_train = tfidf_vectorizer.fit_transform(training_set).toarray()
    for num_features_chi2 in num_features_chi2_array:
        # Above 5000 features, only try keeping all of them (for speed)
        if (num_features > 5000) and (num_features_chi2 != num_features):
            continue
        voting_clf, chi2_analysis = train_classifier(voting_clf, X_train, Y_train, num_features_chi2)
        prefix = 'num_features' + ',' + str(num_features) + ',' + 'num_chi2' + ',' + str(num_features_chi2) + ','
        result = get_res_dev(tfidf_vectorizer, voting_clf, chi2_analysis)
        if result > best_result:
            best_result = result
            best_num_features = num_features
            best_num_features_chi2 = num_features_chi2
print('best result,num_features,' + str(best_num_features) + ',num_chi2,' + str(best_num_features_chi2))
# Best result: tf-idf, voting_clf, num_features 30000, num_chi2 30000.
# Use voting_clf and tf-idf with the best settings to get the test-set report.
tfidf_vectorizer.max_features = best_num_features
X_train = tfidf_vectorizer.fit_transform(training_set).toarray()
voting_clf, chi2_analysis = train_classifier(voting_clf, X_train, Y_train, best_num_features_chi2)
get_res_test(tfidf_vectorizer, voting_clf, chi2_analysis, 1)
# Use the smaller setting (1500, 1000) for the learning curve, for efficiency
best_num_features = 1500
best_num_features_chi2 = 1000
tfidf_vectorizer.max_features = best_num_features
X_train = tfidf_vectorizer.fit_transform(training_set).toarray()
voting_clf, chi2_analysis = train_classifier(voting_clf, X_train, Y_train, best_num_features_chi2)
X_test = get_res_test(tfidf_vectorizer, voting_clf, chi2_analysis, 0)
# Plot learning curves (train vs. test RMSE as the training set grows) to
# analyse the model
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
def plot_learning_curves(clf):
    train_errors, test_errors = [], []
    for m in range(5, len(X_train), 100):
        clf, chi2_analysis = train_classifier(clf, X_train[:m], Y_train[:m], best_num_features_chi2)
        Y_train_predict = clf.predict(chi2_analysis.transform(X_train[:m]))
        Y_test_predict = clf.predict(chi2_analysis.transform(X_test))
        train_errors.append(mean_squared_error(Y_train[:m], Y_train_predict))
        test_errors.append(mean_squared_error(Y_test, Y_test_predict))
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(test_errors), "b-", linewidth=3, label="test")
    plt.legend(loc="upper right", fontsize=14)
    plt.xlabel("Training set size", fontsize=14)
    plt.ylabel("RMSE", fontsize=14)
plot_learning_curves(voting_clf)
plt.show()