"""CART decision tree classifier.

Reference: https://en.wikipedia.org/wiki/Decision_tree_learning
"""
import numpy as np

from utils import generate_clusterization_data, split_data
from metrics import accuracy
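
# Quick reference for the impurity measures implemented below (standard CART
# definitions, matching compute_gini_index / compute_entropy / find_best_split):
#   Gini index:       G(S) = 1 - sum_k p_k ** 2
#   Entropy:          H(S) = -sum_k p_k * log2(p_k)
#   Information gain: IG(S -> L, R) = I(S) - |L|/|S| * I(L) - |R|/|S| * I(R)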

class Node():
    """A single tree node: internal nodes carry a split, leaves carry a class."""
    def __init__(self, feature_index = None, threshold_value = None, left = None,
                 right = None, information_gain = None, class_value = None):
        self.feature_index = feature_index          # index of the feature used for the split
        self.threshold_value = threshold_value      # split threshold (go left if value <= threshold)
        self.left = left                            # left child (samples with feature <= threshold)
        self.right = right                          # right child (samples with feature > threshold)
        self.information_gain = information_gain    # gain achieved by this split
        self.class_value = class_value              # majority class; set only for leaves

class DecisionTreeClassifier():
    def __init__(self, min_samples_split = 2, min_samples_leaf = 2, max_depth = 2, criterion = 'gini'):
        self.tree = None                              # root node, built by fit()
        self.min_samples_split = min_samples_split    # minimum samples required to attempt a split
        self.min_samples_leaf = min_samples_leaf      # minimum samples required in each child
        self.max_depth = max_depth                    # maximum depth at which splits are still attempted
        self.criterion = criterion                    # 'gini' or 'entropy'

    def compute_gini_index(self, labels):
        """Gini impurity: 1 minus the sum of squared class probabilities."""
        squared_probabilities_sum = 0
        for class_value in np.unique(labels):
            class_probability = len(labels[labels == class_value]) / len(labels)
            squared_probabilities_sum += class_probability ** 2
        return 1 - squared_probabilities_sum

    def compute_entropy(self, labels):
        """Shannon entropy: -sum of p * log2(p) over the classes present."""
        entropy = 0
        for class_value in np.unique(labels):
            class_probability = len(labels[labels == class_value]) / len(labels)
            entropy += -class_probability * np.log2(class_probability)
        return entropy
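
    # Worked example (a sanity check, not called anywhere): for labels
    # [0, 0, 1, 1] both classes have probability 0.5, so the Gini index is
    # 1 - (0.25 + 0.25) = 0.5 and the entropy is -2 * 0.5 * log2(0.5) = 1.0.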

    def find_best_split(self, data):
        """Exhaustively search every (feature, threshold) pair for the split
        with the highest information gain. Returns [left_data, right_data,
        feature_index, threshold_value, information_gain]; the entries stay
        as empty lists when no valid split exists."""
        max_information_gain = -np.inf
        best_split_params = [[] for _ in range(5)]
        for feature_index in range(self.features_num):
            for feature_value in np.unique(data[:, feature_index]):
                # Partition on "feature <= threshold"; skip degenerate splits
                # that leave one side empty.
                left_data = data[data[:, feature_index] <= feature_value]
                right_data = data[data[:, feature_index] > feature_value]
                if len(left_data) != 0 and len(right_data) != 0:
                    left_labels, right_labels, labels = left_data[:, -1], right_data[:, -1], data[:, -1]
                    # Information gain = parent impurity minus the
                    # size-weighted impurity of the two children.
                    information_gain = self.criterion_func(labels) - (
                        len(left_labels) / len(labels) * self.criterion_func(left_labels)
                        + len(right_labels) / len(labels) * self.criterion_func(right_labels))
                    if information_gain > max_information_gain:
                        max_information_gain = information_gain
                        best_split_params = [left_data, right_data, feature_index,
                                             feature_value, information_gain]
        return best_split_params

    def insert_tree(self, data, tree_depth = 0):
        """Recursively grow the tree from data whose last column holds the labels."""
        labels = data[:, -1]
        samples_num = len(data)
        if samples_num >= self.min_samples_split and tree_depth <= self.max_depth:
            left_data, right_data, feature_index, threshold_value, information_gain = self.find_best_split(data)
            if len(left_data) >= self.min_samples_leaf and len(right_data) >= self.min_samples_leaf:
                if information_gain > 0:
                    left_subtree = self.insert_tree(left_data, tree_depth + 1)
                    right_subtree = self.insert_tree(right_data, tree_depth + 1)
                    return Node(feature_index, threshold_value,
                                left_subtree, right_subtree, information_gain)
        # Leaf: the most frequent class label (labels must be non-negative
        # integers for np.bincount).
        leaf_value = np.argmax(np.bincount(labels.astype(int)))
        return Node(class_value = leaf_value)
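
    # Note on stopping: growth halts and a majority-class leaf is emitted when
    # the node has fewer than min_samples_split samples, the depth budget is
    # spent, a candidate child would be smaller than min_samples_leaf, or no
    # split yields positive information gain.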

    def fit(self, samples, labels):
        """Build the tree from a samples matrix and a label vector."""
        self.features_num = samples.shape[1]
        if self.criterion == 'gini':
            self.criterion_func = self.compute_gini_index
        elif self.criterion == 'entropy':
            self.criterion_func = self.compute_entropy
        else:
            raise ValueError(f'Criterion with name "{self.criterion}" not found')
        # Append the labels as the last column so splits carry them along.
        self.tree = self.insert_tree(data = np.concatenate((samples, np.array(labels, ndmin = 2).T), axis = 1))

    def predict(self, data):
        """Predict a class for every sample by walking the tree."""
        return [self.predict_sample(sample, self.tree) for sample in data]

    def predict_sample(self, sample, tree):
        # A leaf stores a class value; internal nodes route by threshold.
        if tree.class_value is not None:
            return tree.class_value
        if sample[tree.feature_index] <= tree.threshold_value:
            return self.predict_sample(sample, tree.left)
        else:
            return self.predict_sample(sample, tree.right)

    def print_tree(self, tree = None, tree_depth = 0):
        """Print the tree sideways: left subtree above each split, right below."""
        if tree is None:
            tree = self.tree
        if tree.threshold_value is not None:
            self.print_tree(tree.left, tree_depth + 1)
            print(f"{' ' * 4 * tree_depth} -> X[{tree.feature_index}] <= {tree.threshold_value}")
            self.print_tree(tree.right, tree_depth + 1)
        else:
            print(f"{' ' * 4 * tree_depth} -> class {tree.class_value}")

if __name__ == "__main__":
    # Generate a toy 2-cluster dataset and hold out 25% of it for testing.
    X, y = generate_clusterization_data(n_clusters = 2, n_samples = 300)
    X_train, X_test, y_train, y_test = split_data(X, y, ratio = 0.25)
    dtc = DecisionTreeClassifier()
    dtc.fit(X_train, y_train)
    y_pred = dtc.predict(X_test)
    dtc.print_tree()
    print(f"accuracy: {accuracy(y_test, y_pred) * 100}%")