-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.py
135 lines (106 loc) · 4.04 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
#%% imports
from knn_sklearn import *
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from scipy import stats
from functools import partial
from itertools import combinations
#%% load data
# df is the frame the classifiers are fitted on; df_val is a separate sample
# used later as the validation set.
df = pd.read_csv('data/cleveland.csv')
df_val = pd.read_csv('data/cleveland-test-sample.csv')
# Drop rows where thal or ca hold the '?' placeholder. Take an explicit copy so
# the column assignments below write to an independent frame rather than to a
# slice of the frame returned by read_csv.
df = df[(df['thal'] != '?') & (df['ca'] != '?')].copy()
# convert thal and ca to float
df['thal'] = df['thal'].astype('float64')
df['ca'] = df['ca'].astype('float64')
# Binary target: 0 when num == 0, otherwise 1.
df['disease'] = df['num'].apply(lambda x: 0 if x == 0 else 1)
# Convert the validation frame's OWN 'ca' column. Assigning df['ca'] here would
# align on the row index and replace the validation values with training ones.
# NOTE(review): this raises ValueError if the validation file contains '?'
# placeholders - confirm the sample is clean.
df_val['ca'] = df_val['ca'].astype('float64')
# Candidate feature columns for the search over column subsets.
columns = [
    'age', 'sex', 'cp', 'trestbps', 'chol', 'fbs', 'restecg',
    'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']
# print(df_val.dtypes)
#%% k nearest neighbors all column combinations
import time

# Every non-empty subset of the feature columns, smallest subsets first.
combos = [
    list(subset)
    for size in range(1, len(columns) + 1)
    for subset in combinations(columns, size)
]
print(len(combos), 'combinations')

# One tuple per (column subset, k) evaluation.
results = []
best_f1 = 0
best_name = ''
# The label vector does not depend on the chosen columns, so build it once.
y = df['disease'].to_numpy()
for i, cols in enumerate(combos):
    name = '_'.join(cols)
    # Column selection with a list yields a new frame, so X is a plain 2-D
    # array of the selected features (same values as a row-by-row build).
    X = df[cols].to_numpy()
    start = time.time()
    # Odd k from 1 to 21.
    for k in range(1, 23, 2):
        precision, recall, f1, support = \
            k_fold_validation(10, k, X, y)
        results.append((name, k, precision, recall, f1, support))
        if f1 > best_f1:
            best_f1 = f1
            best_name = name
    s = time.time() - start
    # Progress line: subset name, seconds spent on this subset, estimated hours
    # left (remaining subsets x this subset's time), fraction done, best so far.
    # The subset just finished counts as done, hence i + 1.
    done = i + 1
    remaining = len(combos) - done
    print('%50s'%name, "\t%.5ss"%s, '%f'%(remaining*s/3600), '%f'%(done/len(combos)), best_name, best_f1)
#%% turn results into data frame
# One row per evaluation tuple collected in `results`.
resultdf = pd.DataFrame(
    results,
    columns=['columns', 'k', 'precision', 'recall', 'f1', 'support'],
)
# 'dims' is the number of '_'-separated names in the 'columns' label.
name_parts = resultdf['columns'].str.split('_')
resultdf['dims'] = name_parts.apply(len)
# Persist the table so the search does not have to be rerun.
resultdf.to_csv('data/AllCombinations.csv', index=False)
#%% load all combinations results
# Read the saved results table back in and report how many rows it holds.
resultdf = pd.read_csv('data/AllCombinations.csv')
row_count = resultdf.shape[0]
print(row_count)
#%% get top performing parameters
# Metric column used for ranking.
col = 'f1'
# Series.max() skips NaN. sort_values() places NaN last, so taking the last
# sorted row would return NaN whenever any score is NaN, and the equality
# filter below would then match nothing. max() also avoids a full sort.
best_accuracy = resultdf[col].max()
# NOTE(review): `display` is not defined in this file; presumably it comes from
# the interactive (IPython/Jupyter) session - confirm before running as a script.
# Ten highest-scoring rows.
display(resultdf.sort_values(col, ascending=False).head(10))
# Every row that ties for the best score.
display(resultdf[resultdf[col] == best_accuracy])
#%% precision vs recall
# Scatter of precision against recall, limited to rows with f1 of at least .8.
fig, ax = plt.subplots()
sns.scatterplot(
    x='precision',
    y='recall',
    data=resultdf.loc[resultdf['f1'] >= .8],
    ax=ax,
)
ax.set_title('Precision vs Recall of classifiers with F1 >= .8')
fig.tight_layout()
plt.show()
#%%
# Scatter of precision against recall for every row; also written to a PDF.
fig, ax = plt.subplots()
sns.scatterplot(data=resultdf, x='precision', y='recall', ax=ax)
ax.set_title('Precision vs Recall')
fig.tight_layout()
fig.savefig('images/precision_vs_recall.pdf')
plt.show()
#%%
# Precision vs recall drawn as one scatter series per f1 band, so each band
# gets its own legend entry labelled with the band's lower edge.
top_f1 = resultdf[col].max()
n=10
# Start a fresh figure, as the other plotting cells do.
plt.figure()
# n evenly spaced lower edges from 0 up to and including top_f1. Each band ends
# at the next edge; the last band (starting at top_f1) is open-ended so the
# best-scoring rows are still drawn. Using consecutive edges replaces the
# previously undefined `step` and avoids gaps/overlaps from float rounding.
lower_edges = np.linspace(0, top_f1, n, endpoint=True)
upper_edges = np.append(lower_edges[1:], np.inf)
for i, upper in zip(lower_edges, upper_edges):
    sns.scatterplot(
        x='precision',
        y='recall',
        data=resultdf[(resultdf.f1 >= i) & (resultdf.f1 < upper)],
        label='{s:.2f}'.format(s=i))
plt.legend(bbox_to_anchor=(0, 1), loc='upper left', ncol=1)
plt.title('Precision vs Recall')
plt.tight_layout()
plt.savefig('images/precision_vs_recall_colored.pdf')
plt.show()
#%% best performer - k fold validation
# Two candidate (columns, k) settings; the second pair overrides the first.
cols = ['cp','fbs','exang','ca','thal']
k = 13
cols = ['ca', 'cp', 'thal']
k = 11
# Feature matrix and labels for the selected columns. test_columns below is
# handed df itself, so X and y are not passed to it.
X = np.array([record.to_numpy() for _, record in df[cols].iterrows()])
y = df['disease'].to_numpy()
print('precision recall f1 support')
precision, recall, f1, support = test_columns(
    df, cols, 'disease', k, k_fold_validation)
# '%5f' is a minimum field width of 5 with the default six decimal places.
score_fields = ['%5f'%precision, ' %5f'%recall, ' %5f'%f1, ' %5f'%support]
print(*score_fields)
#%% Best performer - validation set
# Two candidate (columns, k) settings; the second pair overrides the first.
cols = ['cp','fbs','exang','ca','thal']
k = 13
cols = ['ca', 'cp', 'thal']
k = 11
# Fit on df, evaluate on df_val, using the same columns for both.
X_train = np.array([record.to_numpy() for _, record in df[cols].iterrows()])
y_train = df['disease'].to_numpy()
X_validate = np.array(
    [record.to_numpy() for _, record in df_val[cols].iterrows()])
y_validate = df_val['disease'].to_numpy()
classify = build_classifier(k, X_train, y_train)
precision, recall, f1, support = validate(classify, X_validate, y_validate)
print('precision recall f1 support')
# '%5f' is a minimum field width of 5 with the default six decimal places.
score_fields = ['%5f'%precision, ' %5f'%recall, ' %5f'%f1, ' %5f'%support]
print(*score_fields)