-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfalse_predictions.py
67 lines (53 loc) · 2.63 KB
/
false_predictions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
from utilities.data_management import open_w_pandas, make_path, split_sets, check_existence
from utilities.plotting import bar_plot
from numpy import all, sum, concatenate
from pandas import concat
# from matplotlib.pyplot import show
import config
# Build every path used by this script from the dataset name set in config
dataset_name = config.dataset

# Inputs: the prepared dataset CSV and the predictions CSV
dataset_path = make_path('data/prepared_data/') / (dataset_name + '_partial.csv')
prediction_path = make_path('data/predictions/') / dataset_name / 'test.csv'

# Outputs: where the analysis CSVs and the figures are written
analysis_dir = make_path('data/processed_data/') / dataset_name / 'analysis'
fig_dir = make_path('figures') / dataset_name / 'analysis'

# Run the existence check on both input files before anything is loaded
for required_input in (dataset_path, prediction_path):
    check_existence(required_input)
# Load data. split_sets yields two parts and only the second one is analysed
# (presumably the held-out split, since predictions come from test.csv - confirm).
_unused_split, dataset = split_sets(open_w_pandas(dataset_path))
predictions = open_w_pandas(prediction_path)

# The rows of both tables are compared against each other further down,
# so refuse to continue when their lengths differ
same_length = len(dataset) == len(predictions)
if not same_length:
    raise ValueError('Dataset and predictions are not the same length')
# Flag the misclassified rows. `all` is numpy.all (imported at the top of the
# file), so each indicator is the element-wise AND of the two comparisons.
true_label = dataset['is_abusive']
stacked_label = predictions['stacked']
fn_indicator = all([true_label == 1, stacked_label == 0], axis=0)
fp_indicator = all([true_label == 0, stacked_label == 1], axis=0)


def _rows_with_content(indicator):
    """Return the flagged prediction rows with their document text as the last column."""
    flagged_predictions = predictions[indicator].reset_index(drop=True)
    flagged_content = dataset['document_content'][indicator].reset_index(drop=True)
    return concat([flagged_predictions, flagged_content], axis=1)


# Extract false negatives and false positives
false_negatives = _rows_with_content(fn_indicator)
false_positives = _rows_with_content(fp_indicator)
# Calculate the per-row sum over the prediction columns.
# document_content is the last column of both frames, so dropping the final
# column name leaves exactly the columns that came from `predictions`.
pred_cols = false_negatives.columns.values[:-1]

# Vectorized row sum; equivalent to applying numpy's sum to each row, without
# the per-row Python call.
# NOTE(review): for false positives a value of 1 disagrees with the true label,
# so 'good_count' presumably counts "abusive" votes rather than correct ones - confirm.
false_negatives['good_count'] = false_negatives[pred_cols].sum(axis=1)
false_positives['good_count'] = false_positives[pred_cols].sum(axis=1)

# Re-order columns: predictions first, then the count, then the document text
cols = concatenate([pred_cols, ['good_count', 'document_content']])
false_negatives = false_negatives[cols]
false_positives = false_positives[cols]
# Save data: the misclassified rows themselves plus summary statistics.
# NOTE(review): to_csv does not create missing directories; this assumes
# analysis_dir already exists by the time the script reaches here - confirm.
false_negatives.to_csv(analysis_dir / 'false_negatives.csv')
false_positives.to_csv(analysis_dir / 'false_positives.csv')
false_negatives.describe().to_csv(analysis_dir / 'fn_description.csv')
false_positives.describe().to_csv(analysis_dir / 'fp_description.csv')
predictions.describe().to_csv(analysis_dir / 'full_description.csv')

# Mean of each prediction column over all rows and over each error group.
# The means are taken over pred_cols explicitly: calling mean() on the whole
# frame would include the text column document_content, which raises TypeError
# on pandas >= 2.0 (older versions silently dropped it, and the trailing
# good_count then had to be sliced off by position).
pred_mean = predictions.mean()
fn_mean = false_negatives[pred_cols].mean()
fp_mean = false_positives[pred_cols].mean()

# Plot how far each error group's mean prediction sits from the overall mean
bar_plot(fn_mean - pred_mean, pred_mean.index.values, 'False negative prediction residuals',
         filename=fig_dir / 'false_negative.png')
bar_plot(fp_mean - pred_mean, pred_mean.index.values, 'False positive prediction residuals',
         filename=fig_dir / 'false_positive.png')
# show()