import numpy as np
import random


def sigmoid(z):
    return 1.0 / (1.0 + np.exp(-z))


def sigmoid_prime(z):
    # derivative of the sigmoid
    return sigmoid(z) * (1 - sigmoid(z))


class MLP:

    def __init__(self, sizes):
        # sizes: [784, 30, 10]
        # w : [ch_out, ch_in]
        # b : [ch_out]
        self.sizes = sizes
        self.num_layers = len(sizes)
        self.weights = [np.random.randn(ch_out, ch_in)
                        for ch_in, ch_out in zip(sizes[:-1], sizes[1:])]
        self.biases = [np.random.randn(ch_out, 1) for ch_out in sizes[1:]]

    def forward(self, x):
        # param x : [784, 1]
        # return  : [10, 1]
        for b, w in zip(self.biases, self.weights):
            # [30, 784] @ [784, 1] + [30, 1] => [30, 1]
            # [10, 30] @ [30, 1] + [10, 1] => [10, 1]
            z = np.dot(w, x) + b
            x = sigmoid(z)
        return x
    def backward(self, x, y):
        # param x : [784, 1]
        # param y : [10, 1], one-hot encoding
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        nabla_b = [np.zeros(b.shape) for b in self.biases]

        # 1. forward
        # save the activation of every layer
        activations = [x]
        # save the pre-activation z of every layer
        zs = []
        activation = x
        for b, w in zip(self.biases, self.weights):
            z = np.dot(w, activation) + b
            activation = sigmoid(z)
            zs.append(z)
            activations.append(activation)
        # squared-error loss on the output activation
        loss = np.power(activations[-1] - y, 2).sum()

        # 2. backward
        # 2.1 gradient on the output layer:
        # delta = (a - y) * sigmoid'(z), with sigmoid'(z) = a * (1 - a)
        delta = activations[-1] * (1 - activations[-1]) * (activations[-1] - y)
        nabla_b[-1] = delta
        # [10, 1] @ [1, 30] => [10, 30]
        nabla_w[-1] = np.dot(delta, activations[-2].T)

        # 2.2 gradient on the hidden layers
        for l in range(2, self.num_layers):
            # index layers from the back: l = -2 is the last hidden layer,
            # so self.weights[l + 1] is the weight matrix of the layer above it
            l = -l
            z = zs[l]
            a = activations[l]
            delta = np.dot(self.weights[l + 1].T, delta) * a * (1 - a)
            nabla_b[l] = delta
            nabla_w[l] = np.dot(delta, activations[l - 1].T)

        return nabla_w, nabla_b, loss
    def train(self, training_data, epochs, batchsz, lr, test_data=None):
        # training_data : list of (x, y)
        # epochs   : 1000
        # batchsz  : 10
        # lr       : 0.01
        # test_data : list of (x, y)
        if test_data:
            n_test = len(test_data)

        n = len(training_data)
        for j in range(epochs):
            random.shuffle(training_data)
            batches = [training_data[k:k + batchsz] for k in range(0, n, batchsz)]
            # run one gradient-descent step per batch
            for batch in batches:
                loss = self.update_batch(batch, lr)
            if test_data:
                print("Epoch {0}: {1} / {2}, Loss: {3}".format(
                    j, self.evaluate(test_data), n_test, loss))
            else:
                print("Epoch {0} complete".format(j))

    def update_batch(self, batch, lr):
        nabla_b = [np.zeros(b.shape) for b in self.biases]
        nabla_w = [np.zeros(w.shape) for w in self.weights]
        loss = 0

        # accumulate gradients over every sample in the batch
        for x, y in batch:
            nabla_w_, nabla_b_, loss_ = self.backward(x, y)
            nabla_b = [accu + cur for accu, cur in zip(nabla_b, nabla_b_)]
            nabla_w = [accu + cur for accu, cur in zip(nabla_w, nabla_w_)]
            loss += loss_

        # average over the batch
        nabla_w = [w / len(batch) for w in nabla_w]
        nabla_b = [b / len(batch) for b in nabla_b]
        loss = loss / len(batch)

        # gradient-descent step: w = w - lr * nabla_w
        self.weights = [w - lr * nabla for w, nabla in zip(self.weights, nabla_w)]
        self.biases = [b - lr * nabla for b, nabla in zip(self.biases, nabla_b)]

        return loss
    def evaluate(self, test_data):
        # test_data : list of (x, y), where y is the integer class label
        result = [(np.argmax(self.forward(x)), y) for x, y in test_data]
        correct = sum(int(pred == y) for pred, y in result)
        return correct
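
# --- Usage sketch (assumption: not part of the original code) ---
# A minimal demo with synthetic random data, only to show the data layout the
# class expects: training samples are ([784, 1] column vector, [10, 1] one-hot
# label) pairs, test samples are ([784, 1] column vector, int label) pairs.
# Replace the random arrays with real MNIST data for actual training.
if __name__ == "__main__":
    np.random.seed(0)

    def one_hot(label, num_classes=10):
        v = np.zeros((num_classes, 1))
        v[label] = 1.0
        return v

    # 200 fake training samples, 50 fake test samples
    training_data = [(np.random.rand(784, 1), one_hot(np.random.randint(10)))
                     for _ in range(200)]
    test_data = [(np.random.rand(784, 1), np.random.randint(10))
                 for _ in range(50)]

    net = MLP([784, 30, 10])
    net.train(training_data, epochs=5, batchsz=10, lr=0.1, test_data=test_data)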