import numpy as np
import math
"""
Activation/loss functions and derivatives
"""
# softmax
def softmax(x):
    # subtract the max before exponentiating for numerical stability
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)
def derivative_softmax_input(a, lab):
    # the derivative of the cross-entropy error with respect to the softmax input simplifies to
    # the softmax output minus the one-hot encoded label vector: dE/dz = a - lab.
    return np.subtract(a, lab)
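# Worked example of the simplification above (numbers rounded, for illustration only):
# with z = [2, 1, 0] and one-hot label y = [1, 0, 0]:
#   a = softmax(z) ~= [0.665, 0.245, 0.090]
#   dE/dz = a - y ~= [-0.335, 0.245, 0.090]
# so a gradient descent step pushes the true class's logit up and the other logits down.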
# relu
def relu(n):
    return max(0, n)
def relu_derivative(n):
    return (n > 0) * 1
def multi_class_cross_entropy(a, lab):
    # with a one-hot label this reduces to -log of the predicted probability at the index of the
    # true class
    return -math.log(a.A1[np.argmax(lab.A1)])
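# Hedged aside: clamping the selected probability before taking the log would avoid a math domain
# error when the predicted probability underflows to zero (the NaN guard in the training loop below
# works around a related problem). safe_cross_entropy and the 1e-12 floor are illustrative only and
# are not used below.
def safe_cross_entropy(a, lab):
    return -math.log(max(a.A1[np.argmax(lab.A1)], 1e-12))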
# vectorize the element-wise functions so they can be applied to a whole np vector/array/matrix
v_relu = np.vectorize(relu)
v_relu_d = np.vectorize(relu_derivative)
# NOTE: softmax already operates on a whole vector, so it is called directly rather than vectorized
"""
Importing data
"""
# get training inputs and labels.
training_data = np.loadtxt("data/mnist_train.csv", delimiter=",")
test_data = np.loadtxt("data/mnist_test.csv", delimiter=",")
# separate data and labels
training_labels = np.copy(training_data[:, 0])
test_labels = np.copy(test_data[:, 0])
training_inputs = np.copy(training_data[:, 1:])
test_inputs = np.copy(test_data[:, 1:])
# normalize training and test inputs. Could be more dynamic and scale by the observed max instead
# of the hard-coded 255 (see the sketch below).
training_inputs = np.matrix(np.divide(training_inputs, 255))
test_inputs = np.matrix(np.divide(test_inputs, 255))
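# A more dynamic alternative to the two lines above (a hedged sketch, commented out so the
# hard-coded 255 stays in effect): scale both sets by the maximum observed in the training inputs.
# pixel_max is an illustrative name.
#   pixel_max = training_inputs.max() if training_inputs.max() > 0 else 1
#   training_inputs = np.matrix(training_inputs / pixel_max)
#   test_inputs = np.matrix(test_inputs / pixel_max)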
# encode classes for easy error calculation.
# NOTE: assumes there are 10 classes (digits 0-9). Could be found dynamically (see the sketch below).
encoded_training_labels = np.zeros((len(training_data), 10))
encoded_test_labels = np.zeros((len(test_data), 10))
print(np.shape(encoded_training_labels))
for i in range(len(training_data)):
    # set the element corresponding to each input and its classification to 1.
    encoded_training_labels[i][int(training_labels[i])] = 1
for i in range(len(test_data)):
    encoded_test_labels[i][int(test_labels[i])] = 1
encoded_training_labels = np.matrix(encoded_training_labels)
encoded_test_labels = np.matrix(encoded_test_labels)
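# Hedged sketch for finding the class count dynamically instead of hard-coding 10 above
# (num_classes is an illustrative name, not used below):
#   num_classes = int(max(training_labels.max(), test_labels.max())) + 1
#   encoded_training_labels = np.zeros((len(training_data), num_classes))
#   encoded_test_labels = np.zeros((len(test_data), num_classes))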
"""
Hyper parameters
"""
LEARNING_RATE = .001
input_count, layer_one_node_count = np.shape(training_inputs) # layer one node count == feature count
layer_two_node_count = 200 # second layer nodes
layer_three_node_count = 100 # third layer nodes
layer_four_node_count = 10 # layer four node count == class count
epochs = 10
"""
initialize weights & bias
"""
weights = [
    # weight matrices for Li->Lj should have dims (m, n) where m = count(Lj nodes) and n = count(Li nodes)
    # e.g. here the L1->L2 weight matrix has dims (200, 784), so an input vector (784, 1) can be multiplied to produce (200, 1).
    np.matrix(np.random.uniform(-.1, .1, size=(layer_two_node_count, layer_one_node_count))),
    np.matrix(np.random.uniform(-.1, .1, size=(layer_three_node_count, layer_two_node_count))),
    np.matrix(np.random.uniform(-.1, .1, size=(layer_four_node_count, layer_three_node_count)))
]
# one bias column vector per non-input layer, each with shape (nodes_in_layer, 1)
biases = [
    np.matrix(np.random.uniform(-.1, .1, size=(layer_two_node_count, 1))),
    np.matrix(np.random.uniform(-.1, .1, size=(layer_three_node_count, 1))),
    np.matrix(np.random.uniform(-.1, .1, size=(layer_four_node_count, 1)))
]
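# Hedged aside: for relu layers, a fan-in-scaled initialization (He initialization) is a common
# alternative to the fixed uniform range above, e.g. for the L1->L2 weights:
#   np.matrix(np.random.randn(layer_two_node_count, layer_one_node_count) * np.sqrt(2 / layer_one_node_count))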
epoch = 0
# training loop
while epoch < epochs:
    correct = 0
    # for i in range(1):
    for i in range(len(training_inputs)):
        # get input and label
        input = np.matrix(training_inputs[i]).T
        label = np.matrix(encoded_training_labels[i]).T
        # forward pass
        # L1->L2 relu(W*input + bias)
        L2_z = (weights[0] * input) + biases[0]
        # apply relu
        L2_a = v_relu(L2_z)
        # L2->L3 relu(W*L2_a + bias)
        L3_z = (weights[1] * L2_a) + biases[1]
        L3_a = v_relu(L3_z)
        # L3->L4 softmax(W*L3_a + bias)
        L4_z = (weights[2] * L3_a) + biases[2]
        L4_a = softmax(L4_z)  # network output.
        if np.argmax(L4_a) == np.argmax(label):
            correct += 1
        # get network error (skip this example if the output has gone to NaN)
        if np.isnan(L4_a).any():
            continue
        else:
            error = multi_class_cross_entropy(L4_a, label)
        # Back propagate
        # will store weight and bias deltas in here for now
        weight_deltas = []
        bias_deltas = []
        # derivative of error wrt the L3->L4 affine output:
        # dE/dZ[i] = a[i] - label[i] (the combined softmax + cross-entropy derivative; this
        # simplification is more efficient than chaining the two derivatives separately).
        layer_four_error = derivative_softmax_input(L4_a, label)
        # (10, 1) * (1, 100) = (10, 100) (same dimensions as the L3->L4 weights)
        weight_deltas.insert(0, (layer_four_error * L3_a.T) * LEARNING_RATE)
        bias_deltas.insert(0, layer_four_error * LEARNING_RATE)
        # propagate the error back to layer 3 (i.e. compute dE/dZ_3):
        # multiply layer four's error by the transposed L3->L4 weights, then take the element-wise
        # product with the relu derivative for layer 3.
        layer_three_error = np.multiply((weights[2].T * layer_four_error), relu_derivative(L3_a))
        # L2->L3 weight deltas
        weight_deltas.insert(0, (layer_three_error * L2_a.T) * LEARNING_RATE)
        bias_deltas.insert(0, layer_three_error * LEARNING_RATE)
        # propagate layer 3 error back to layer 2
        layer_two_error = np.multiply((weights[1].T * layer_three_error), relu_derivative(L2_a))
        # L1->L2 weight deltas
        weight_deltas.insert(0, (layer_two_error * input.T) * LEARNING_RATE)
        bias_deltas.insert(0, layer_two_error * LEARNING_RATE)
        # gradient descent step: subtract the scaled deltas from the weights and biases
        for w in range(len(weights)):
            weights[w] = weights[w] - weight_deltas[w]
            biases[w] = biases[w] - bias_deltas[w]
        print("Epoch: ", epoch, "input: ", i + 1, " Accuracy: ", correct / (i + 1), " Error: ", error)
    # NOTE: the forward pass records each layer's activations so that the derivatives of the error
    # with respect to the weights feeding into each layer can be computed during backprop.
    epoch += 1
# validate on test set
test_count = 0
test_correct = 0
for t in range(len(test_inputs)):
    # forward pass
    input = np.matrix(test_inputs[t]).T
    label = np.matrix(encoded_test_labels[t]).T
    L2_z = (weights[0] * input) + biases[0]
    L2_a = v_relu(L2_z)
    L3_z = (weights[1] * L2_a) + biases[1]
    L3_a = v_relu(L3_z)
    L4_z = (weights[2] * L3_a) + biases[2]
    L4_a = softmax(L4_z)
    error = multi_class_cross_entropy(L4_a, label)
    if np.argmax(L4_a.A1) == np.argmax(label.A1):
        test_correct += 1
    test_count += 1
    print("Test input: ", test_count, " Accuracy: ", (test_correct / test_count), " Error: ", error)
"""
To do now:
- make all hard coded number/ parameters into vars for easy changing
- make network capable of batches.
- would network train faster if activation and loss was calc'd without calling function? e.g calc activation in-line
"""