-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathmodel_analysis.py
71 lines (58 loc) · 2.68 KB
/
model_analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from __future__ import division
import gensim
import matplotlib.pyplot as plt
import scipy.stats as stats
import math
# Load the two saved Doc2Vec models from disk. `models` is the list the
# analysis loop below iterates over; the individual names are kept as well.
models = list(map(gensim.models.Doc2Vec.load,
                  ['Models/labeledDoc2Vec.model',
                   'Models/labeledDoc2Vec2.model']))
labeledModel1, labeledModel2 = models
for model in models:
    # Per-model evaluation. For each of the 20 documents tagged
    # 'True<i>.txt' / 'Misinformed<i>.txt' (i = 1..10), look up the single
    # most similar document vector and count the prediction as correct when
    # the neighbour's tag contains the same class label.
    data = {'Cosine Similarity': [],
            'Prediction': []}
    correct_cosines = []
    incorrect_cosines = []
    for i in range(1, 11):
        # 'True' is handled before 'Misinformed' for every i so that the
        # order of the collected values is True1, Misinformed1, True2, ...
        for label in ('True', 'Misinformed'):
            doc_tag = label + str(i) + '.txt'
            # Query once and reuse the result instead of repeating the same
            # most_similar lookup for the similarity, the label check and
            # the per-group list.
            nearest_tag, cosine = model.docvecs.most_similar(doc_tag)[0]
            data['Cosine Similarity'].append(cosine)
            if label in nearest_tag:
                data['Prediction'].append(1)
                correct_cosines.append(cosine)
            else:
                data['Prediction'].append(0)
                incorrect_cosines.append(cosine)

    # Compare the similarity distributions of correct vs. incorrect
    # predictions.
    # NOTE(review): if every prediction falls in one group the other list is
    # empty -- confirm how the installed SciPy handles an empty sample.
    stat, pval = stats.kruskal(correct_cosines, incorrect_cosines)
    # Single-argument parenthesised print behaves the same on Python 2 and 3.
    print('H-statistic of Kruskal-Wallis Test: ' + str(stat))
    print('P-value of Kruskal-Wallis Test: ' + str(pval))

    # Square-root rule for the number of bins. math.ceil returns a float on
    # Python 2, so convert explicitly to hand hist an integer bin count.
    num_bins = int(math.ceil(math.sqrt(len(correct_cosines) + len(incorrect_cosines))))
    # NOTE(review): newer Matplotlib releases replace `normed` with
    # `density`; switch the keyword when the plotting stack is upgraded.
    plt.hist([correct_cosines, incorrect_cosines], bins=num_bins, stacked=True, normed=True, histtype='stepfilled')
    plt.xlabel('Pairwise Cosine Similarity')
    plt.ylabel('Density')
    # Clamp the x-axis to the observed similarity range; y-axis fixed to 0..5.
    low = min(data['Cosine Similarity'])
    high = max(data['Cosine Similarity'])
    plt.axis([low, high, 0, 5])
    plt.grid(True)
    plt.show()
# Disabled earlier analysis, kept for reference. It is a bare string literal
# (never assigned or executed). For labeledModel1 only, it prints the top
# most_similar entry of each of the 20 documents, sums the top similarities,
# counts label matches, and reports both divided by 20 (true division via
# `from __future__ import division`).
'''
total = 0
accuracy = 0
for i in range(1, 11):
string1 = 'True' + str(i) + '.txt'
string2 = 'Misinformed' + str(i) + '.txt'
print string1 + ': ' + str(labeledModel1.docvecs.most_similar(string1)[0])
print string2 + ':' + str(labeledModel1.docvecs.most_similar(string2)[0])
total = total + labeledModel1.docvecs.most_similar(string1)[0][1] + labeledModel1.docvecs.most_similar(string2)[0][1]
if('True' in labeledModel1.docvecs.most_similar(string1)[0][0]):
accuracy = accuracy + 1
if('Misinformed' in labeledModel1.docvecs.most_similar(string2)[0][0]):
accuracy = accuracy + 1
print 'Average cosine similarity: ' + str(total / 20)
print 'Classification Accurary: ' + str(accuracy / 20)
'''