-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathanalyse_bayes_factors.py
170 lines (143 loc) · 9.21 KB
/
analyse_bayes_factors.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
"""
【Analyze】
Compare with Feature Importance Analysis
`python analyse_bayes_factors.py --log True --data XXX --model_type gaussian --model_name gaussian_e --eps XXX --interpret_method XXX --algorithm p_s`
`python analyse_bayes_factors.py --log True --data XXX --model_type nn --model_name nn_1 --eps XXX --interpret_method XXX --algorithm mean`
"""
import time
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from datasets.regdata import build_reg_dataset
from utils.utils_file import generate_bayes_factors_filename, generate_bayes_factors_thresholds_curve_filename, \
generate_bayes_factors_thresholds_excel_filename, generate_local_roc_curve_filename, \
generate_global_roc_curve_filename, generate_local_auc_excel_filename, \
generate_global_auc_excel_filename, generate_auc_curve_filename, generate_bayes_factors_thresholds_area_filename, \
generate_binary_global_label_filename, generate_binary_local_label_filename
from utils.utils_parser import DefaultArgumentParser, init_config
from utils.utils_plot import plot_curve, plot_roc_curve, plot_area
def compute_insignificant_rates(values, thresholds):
    """Return, per feature column, the fraction of samples not exceeding each threshold.

    Args:
        values: 2-D array-like of shape (n_samples, n_features).
        thresholds: 1-D array-like of cut-off values.

    Returns:
        Float array of shape (n_features, n_thresholds); entry ``[j, k]`` is the
        share of rows whose value in column ``j`` is ``<= thresholds[k]``.
    """
    values = np.asarray(values)
    thresholds = np.asarray(thresholds)
    n_samples, n_features = values.shape
    rates = np.empty((n_features, thresholds.shape[0]), dtype=float)
    for j in range(n_features):
        # One (n_samples, n_thresholds) comparison per feature keeps peak memory small.
        below = values[:, j, None] <= thresholds[None, :]
        rates[j] = np.count_nonzero(below, axis=0) / n_samples
    return rates


def build_sub_analyse_thresholds(n_samples):
    """Build the non-uniform threshold grid used for the global analysis.

    Args:
        n_samples: number of samples (rows) in the Bayes-factor array.

    Returns:
        Tuple ``(rates, nums)``: ``rates`` are threshold values in [0, 1] and
        ``nums`` are the matching integer row indices into a per-feature sorted
        array of ``n_samples`` values.
    """
    if n_samples == 10000:
        # Built from integers to avoid rounding problems,
        # e.g. np.arange(0.991, 1, 0.001) may not give the expected values.
        nums = np.concatenate([
            np.arange(0, 10000, 1000),
            np.arange(9100, 10000, 100),
            np.arange(9910, 10000, 10),
            np.arange(9991, 10000, 1),
            [10000],
        ])
        # [0, 1000, ..., 9000, 9100, ..., 9900, 9910, ..., 9990, 9991, ..., 9999, 10000]
        rates = nums / 10000  # [0, 0.1, ..., 0.9, 0.91, ..., 0.99, 0.991, ..., 0.999, 0.9991, ..., 0.9999, 1.]
        nums = nums - 1
        nums[0] = 0  # [0, 999, ..., 8999, 9099, ..., 9899, 9909, ..., 9989, 9990, ..., 9998, 9999]
    else:
        rates = np.concatenate([
            np.linspace(0, 1., 10, endpoint=False),
            np.linspace(0.91, 1., 9, endpoint=False),
            np.linspace(0.991, 1., 10, endpoint=True),
        ])
        # NOTE(review): unlike the 10000-sample branch, only the last index is shifted
        # by one here; kept as in the original so results do not change - confirm intent.
        nums = np.floor(rates * n_samples).astype(int)
        nums[-1] -= 1
    return rates, nums


def analyse_global_strategy(opt, strategy, global_labels, probs, rates, features):
    """Plot one global ROC curve per threshold and export the scores to Excel.

    Args:
        opt: parsed options; ``opt.model_name`` is used for titles and the sheet name.
        strategy: ``1`` or ``2``; selects the ``threshold{strategy}`` naming.
        global_labels: binary global labels, one per feature.
        probs: array of shape (n_features, n_thresholds) with the per-feature scores.
        rates: threshold values, one per column of ``probs``.
        features: feature names used as the Excel row index.

    Returns:
        List with the value returned by ``plot_roc_curve`` for every threshold
        (presumably the AUC - confirm against ``utils.utils_plot``).
    """
    results = []
    for i, threshold in enumerate(rates):
        print(f'==> Set threshold{strategy}={threshold}')
        results.append(
            plot_roc_curve(global_labels, probs[:, i], f'{opt.model_name} threshold{strategy}={threshold}',
                           generate_global_roc_curve_filename(opt, f'threshold{strategy}', threshold)))
    df = pd.DataFrame(columns=rates, index=features, data=probs)
    # Context manager guarantees the workbook is closed even if writing fails.
    with pd.ExcelWriter(generate_bayes_factors_thresholds_excel_filename(opt, f'thresholds{strategy}')) as writer:
        # sheet_name is passed by keyword: positional use is deprecated in pandas 2.x.
        df.to_excel(writer, sheet_name=opt.model_name)
        print(df)
    return results


def main():
    """Run the Bayes-factor analysis: local/global ROC curves, threshold curves and Excel reports."""
    start_time = time.time()
    parser = DefaultArgumentParser().get_parser()
    plt.rc('font', family='Times New Roman')

    # model settings
    parser.add_argument('--model_type', default='gaussian', type=str, help='Variation inference family')
    parser.add_argument('--model_name', default='gaussian_e', type=str, help='choose which model to get distri')
    parser.add_argument('--batch_size', default=128, type=int, help='batch size')
    parser.add_argument('--eps', default=0, type=float, help='eps for local binary label')
    parser.add_argument('--interpret_method', default='gradient', type=str, help='testing statistic')
    parser.add_argument('--y_index', default=0, type=int, help='gradient to which output (for multi-outputs)')
    parser.add_argument('--algorithm', type=str, default='p_s',
                        choices=['first', 'mean', 'mean_abs', 'abs_mean', 'p_s'])
    opt = parser.parse_args()
    opt.exp_name = 'analyse_bayes_factors'
    init_config(opt)

    bayes_factors = np.load(generate_bayes_factors_filename(opt, last=True))  # (n_data, **n_features)
    bayes_factors = np.abs(bayes_factors)
    # Return value is unused; the call is kept because it may populate fields on opt
    # (e.g. n_features / n_samples) - TODO confirm against datasets.regdata.
    build_reg_dataset(opt)
    global_labels = np.loadtxt(generate_binary_global_label_filename(opt, True))
    local_labels = np.loadtxt(generate_binary_local_label_filename(opt, True))
    features = [f'x{i}' for i in range(opt.n_features)]
    curve_title = f'{opt.model_name} {opt.interpret_method}_{opt.algorithm}'

    print('==> Plotting local roc curves...')
    local_auc = [
        plot_roc_curve(local_labels[:, j], bayes_factors[:, j], f'{curve_title} x{j}',
                       generate_local_roc_curve_filename(opt, f'x{j}'))
        for j in range(opt.n_features)
    ]
    with pd.ExcelWriter(generate_local_auc_excel_filename(opt)) as writer:
        pd_data2 = pd.DataFrame(np.array(local_auc), index=features).T
        pd_data2.to_excel(writer, sheet_name=opt.model_name, float_format='%.3f')
    # ROC over all (sample, feature) pairs pooled together.
    plot_roc_curve(np.reshape(local_labels, -1), np.reshape(bayes_factors, -1), curve_title,
                   generate_local_roc_curve_filename(opt, 'total'))
    plot_curve(range(0, opt.n_features), local_auc, curve_title,
               generate_auc_curve_filename(opt), xlabel='features', ylabel='auc',
               xlim=[0, opt.n_features], ylim=[0.0, 1.0], diagonal=False)

    print('==> Plotting selected insignificant data rate based threshold1...')
    thresholds = np.linspace(0, 1, 101, endpoint=True)
    print(f'==> Threshold1: {thresholds}')
    np_result_thresholds = compute_insignificant_rates(
        bayes_factors[:, :opt.n_features], thresholds)  # (n_features, |thresholds|)
    for j in range(opt.n_features):
        plot_curve(thresholds, np_result_thresholds[j].tolist(), f'{opt.model_name} x{j}',
                   generate_bayes_factors_thresholds_curve_filename(opt, f'x{j}'),
                   xlabel='Thresholds', ylabel='Insignificant Data Rate')

    print('==> Plotting area for significant and insignificant features...')
    plot_area(thresholds, np_result_thresholds, np.where(global_labels == 1)[0], np.where(global_labels == 0)[0],
              f'{opt.model_name}', generate_bayes_factors_thresholds_area_filename(opt),
              xlabel='Thresholds', ylabel='Insignificant Data Rate')

    print('==> Analysing global feature importance based on thresholds of two strategies...')
    sub_analyse_thresholds_rate, sub_analyse_thresholds_num = build_sub_analyse_thresholds(opt.n_samples)
    print(f'==> Rate: {sub_analyse_thresholds_rate}\n'
          f'==> Number: {sub_analyse_thresholds_num}')

    print('==> Analysing strategy1: fix threshold1, adjust threshold2...')
    # Score = share of samples whose Bayes factor exceeds threshold1.
    probs = 1 - compute_insignificant_rates(
        bayes_factors[:, :opt.n_features], sub_analyse_thresholds_rate)  # (n_features, |sub_thresholds|)
    global_auc_threshold1 = analyse_global_strategy(
        opt, 1, global_labels, probs, sub_analyse_thresholds_rate, features)

    print('==> Analysing strategy2: fix threshold2, adjust threshold1...')
    # Score = value found at the selected row indices of each feature's sorted Bayes factors.
    sorted_bayes_factors = np.sort(bayes_factors, axis=0)
    probs = sorted_bayes_factors[sub_analyse_thresholds_num, :].T  # (n_features, |sub_thresholds|)
    global_auc_threshold2 = analyse_global_strategy(
        opt, 2, global_labels, probs, sub_analyse_thresholds_rate, features)

    with pd.ExcelWriter(generate_global_auc_excel_filename(opt)) as writer:
        pd_data1 = pd.DataFrame(np.array(global_auc_threshold1), index=sub_analyse_thresholds_rate)
        pd_data1.to_excel(writer, sheet_name='threshold1', float_format='%.4f')
        pd_data2 = pd.DataFrame(np.array(global_auc_threshold2), index=sub_analyse_thresholds_rate)
        pd_data2.to_excel(writer, sheet_name='threshold2', float_format='%.4f')

    end_time = time.time()
    elapse_time = end_time - start_time
    print(f'All end in {elapse_time // 60:.0f}m {elapse_time % 60:.0f}s.')


if __name__ == '__main__':
    main()