# stacking.py
#
# Hand-rolled stacking ensemble on the scikit-learn breast-cancer dataset:
# several base classifiers produce k-fold predictions on the training split,
# and those predictions become the input features of a logistic-regression
# meta-model that is then scored on the held-out test split.
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
class Ensemble:
    """Manual two-level stacking ensemble.

    Level 0 is a set of base ("weak") classifiers. Each one contributes a
    single column of class predictions: for the training split the column
    is made of out-of-fold predictions from a k-fold loop, for the test
    split it is made of predictions from the classifier refitted on the
    whole training split. Level 1 is a meta-model fitted on those columns.
    """

    def __init__(self, k=5):
        """Create an ensemble with empty data slots.

        Args:
            k: Number of folds used to build the level-0 training
                predictions. Defaults to 5.

        Raises:
            ValueError: If ``k`` is smaller than 1.
        """
        if k < 1:
            raise ValueError(f"k must be at least 1, got {k}")
        # Data splits are filled in by load_data() (or assigned directly).
        self.x_train = None
        self.x_test = None
        self.y_train = None
        self.y_test = None
        self.k = k

    def load_data(self):
        """Load the breast-cancer dataset and split it 70/30 into train/test.

        The split uses a fixed ``random_state`` so that repeated runs work
        on the same partition.
        """
        x, y = load_breast_cancer(return_X_y=True)
        (self.x_train, self.x_test,
         self.y_train, self.y_test) = train_test_split(
            x, y, test_size=0.3, random_state=23)

    def StackingClassifier(self, weak_learners=None, final_learner=None):
        """Run the full stacking procedure and print the meta-model accuracy.

        Args:
            weak_learners: Optional list of ``(name, estimator)`` pairs used
                as level-0 models. Each estimator must expose ``fit`` and
                ``predict``. When omitted, a decision tree, k-NN, random
                forest, gradient boosting and Gaussian naive Bayes are used.
            final_learner: Optional level-1 estimator exposing ``fit`` and
                ``score``. When omitted, ``LogisticRegression()`` is used.

        Raises:
            ValueError: If ``weak_learners`` is an empty sequence.
        """
        if weak_learners is None:
            # Default level-0 models
            weak_learners = [('dt', DecisionTreeClassifier()),
                             ('knn', KNeighborsClassifier()),
                             ('rf', RandomForestClassifier()),
                             ('gb', GradientBoostingClassifier()),
                             ('gn', GaussianNB())]
        if final_learner is None:
            # Default level-1 (meta) model
            final_learner = LogisticRegression()
        if len(weak_learners) == 0:
            raise ValueError("at least one weak learner is required")

        train_columns = []
        test_columns = []
        for _clf_id, clf in weak_learners:
            # Out-of-fold predictions on the training split ...
            train_columns.append(self.k_fold_cross_validation(clf))
            # ... and predictions on the test split after a full refit.
            test_columns.append(self.train_level_0(clf))

        # One column per learner. column_stack keeps the result 2-D even
        # when there is a single learner, which vstack(...).T did not.
        train_meta_model = np.column_stack(train_columns)
        test_meta_model = np.column_stack(test_columns)

        # Training level 1
        self.train_level_1(final_learner, train_meta_model, test_meta_model)

    def k_fold_cross_validation(self, clf):
        """Return out-of-fold predictions of ``clf`` for every training row.

        The training split is cut into ``self.k`` contiguous folds (no
        shuffling); the last fold also takes the rows left over when the
        sample count is not divisible by ``k``. For each fold the
        classifier is fitted on the remaining rows and asked to predict the
        held-out rows.

        Args:
            clf: Estimator exposing ``fit(x, y)`` and ``predict(x)``. It is
                refitted in place once per fold.

        Returns:
            1-D array of predictions, in the same row order as
            ``self.x_train``.

        Raises:
            ValueError: If there are fewer training samples than folds, or
                ``self.k`` is smaller than 1.
        """
        n_samples = len(self.x_train)
        if self.k < 1 or n_samples < self.k:
            raise ValueError(
                f"cannot split {n_samples} samples into {self.k} folds")

        # Number of samples per fold
        batch_size = n_samples // self.k
        fold_predictions = []
        for fold in range(self.k):
            batch_start = batch_size * fold
            # The final fold absorbs the remainder rows.
            if fold == self.k - 1:
                batch_finish = n_samples
            else:
                batch_finish = batch_start + batch_size

            # Boolean mask of the rows held out in this iteration
            held_out = np.zeros(n_samples, dtype=bool)
            held_out[batch_start:batch_finish] = True

            clf.fit(self.x_train[~held_out], self.y_train[~held_out])
            fold_predictions.append(clf.predict(self.x_train[held_out]))

        # Folds are contiguous and visited in order, so plain concatenation
        # restores the original row order.
        return np.concatenate(fold_predictions)

    def train_level_0(self, clf):
        """Fit ``clf`` on the whole training split and predict the test split.

        Args:
            clf: Estimator exposing ``fit(x, y)`` and ``predict(x)``.

        Returns:
            Predictions of ``clf`` for ``self.x_test``.
        """
        clf.fit(self.x_train, self.y_train)
        return clf.predict(self.x_test)

    def train_level_1(self, final_learner, train_meta_model, test_meta_model):
        """Fit the meta-model and print its train and test accuracy.

        Args:
            final_learner: Estimator exposing ``fit(x, y)`` and
                ``score(x, y)``.
            train_meta_model: Level-0 predictions for the training split,
                one column per weak learner.
            test_meta_model: Level-0 predictions for the test split, one
                column per weak learner.
        """
        final_learner.fit(train_meta_model, self.y_train)
        # Getting train and test accuracies from meta_model
        print(f"Train accuracy: {final_learner.score(train_meta_model, self.y_train)}")
        print(f"Test accuracy: {final_learner.score(test_meta_model, self.y_test)}")
def main():
    """Script entry point: load the dataset, then run the stacking pipeline."""
    stacker = Ensemble()
    stacker.load_data()
    stacker.StackingClassifier()


if __name__ == "__main__":
    main()