We have already covered gradient descent, backpropagation, and loss functions for neural networks. Now, following Michael Nielsen, we implement our own network construction, gradient descent, and backpropagation from scratch.
1) Loading the MNIST dataset:
import cPickle
import gzip
# Third-party libraries
import numpy as np
def load_data():
    """Return the MNIST data as a tuple ``(training_data, test_data)``.

    The ``training_data`` is returned as a tuple with two entries.
    The first entry contains the actual training images.  This is a
    numpy ndarray with 50,000 entries.  Each entry is, in turn, a
    numpy ndarray with 784 values, representing the 28 * 28 = 784
    pixels in a single MNIST image.

    The second entry in the ``training_data`` tuple is a numpy ndarray
    containing 50,000 entries.  Those entries are just the digit
    values (0...9) for the corresponding images contained in the first
    entry of the tuple.

    ``test_data`` is similar, except it contains only 10,000 images.
    The pickle also holds a validation set of 10,000 images, which
    this simplified loader discards.

    This is a nice data format, but for use in neural networks it's
    helpful to modify the format of the ``training_data`` a little.
    That's done in the wrapper function ``load_data_wrapper()``, see
    below.
    """
    f = gzip.open('../data/mnist.pkl.gz', 'rb')
    # The pickle contains three sets; the validation set is not used here.
    training_data, validation_data, test_data = cPickle.load(f)
    f.close()
    return (training_data, test_data)
def load_data_wrapper():
    """Return a tuple containing ``(training_data, test_data)``.
    Based on ``load_data``, but the format is more convenient for use
    in our implementation of neural networks.

    In particular, ``training_data`` is a list containing 50,000
    2-tuples ``(x, y)``.  ``x`` is a 784-dimensional numpy.ndarray
    containing the input image.  ``y`` is a 10-dimensional
    numpy.ndarray representing the unit vector corresponding to the
    correct digit for ``x``.

    ``test_data`` is a list containing 10,000 2-tuples ``(x, y)``.
    In each case, ``x`` is a 784-dimensional numpy.ndarray containing
    the input image, and ``y`` is the corresponding classification,
    i.e., the digit value (an integer) corresponding to ``x``.

    Obviously, this means we're using slightly different formats for
    the training data and the test data.  These formats turn out to
    be the most convenient for use in our neural network code."""
    tr_d, te_d = load_data()
    training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]]
    training_results = [vectorized_result(y) for y in tr_d[1]]
    training_data = zip(training_inputs, training_results)
    test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]]
    test_data = zip(test_inputs, te_d[1])
    return (training_data, test_data)
def vectorized_result(j):
    """Return a 10-dimensional unit vector with a 1.0 in the jth
    position and zeroes elsewhere.  This is used to convert a digit
    (0...9) into a corresponding desired output from the neural
    network."""
    e = np.zeros((10, 1))
    e[j] = 1.0
    return e
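As a quick sanity check of the formats described in the docstrings above (a minimal sketch; it assumes this file is saved as mnist_loader.py, that mnist.pkl.gz sits at ../data/, and that it runs under Python 2, where zip returns a list):

# Sanity check for the loader (illustrative only).
import mnist_loader

training_data, test_data = mnist_loader.load_data_wrapper()
x, y = training_data[0]
print(x.shape, y.shape)        # (784, 1) (10, 1): image column vector, one-hot label
tx, ty = test_data[0]
print(tx.shape, ty)            # (784, 1) and an integer label 0..9
print(len(training_data), len(test_data))  # 50000 10000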
2) Implementing the neural network and gradient descent:
# -*- coding: utf-8 -*-
import numpy as np
import random
import mnist_loader

# Toy example of the shapes involved, for sizes = [3, 4, 3]:
#   weights: [(4, 3), (3, 4)]
#   biases:  [(4, 1), (3, 1)]
#   activations a per layer: [(4, 1), (3, 1)]
#   weighted inputs z per layer: [(4, 1), (3, 1)]

def sigmoid(z):
    return 1.0/(1.0+np.exp(-z))

def sigmoid_prime(z):
    """Derivative of the sigmoid function."""
    return sigmoid(z)*(1-sigmoid(z))
class Network(object):
    def __init__(self, sizes):
        self.num_layers = len(sizes)
        self.sizes = sizes
        # Gaussian initialization, as in Nielsen's book; weights[l] has
        # shape (sizes[l+1], sizes[l]) and biases[l] has shape (sizes[l+1], 1).
        self.weights = [np.random.randn(j, i) for i, j in zip(sizes[:-1], sizes[1:])]
        self.biases = [np.random.randn(j, 1) for j in sizes[1:]]

    def feedforward(self, a):
        for w, b in zip(self.weights, self.biases):
            a = sigmoid(np.dot(w, a)+b)
        return a
    def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
        if test_data:
            n_test = len(test_data)
        n = len(training_data)
        for j in xrange(epochs):
            random.shuffle(training_data)
            mini_batches = [training_data[k:k+mini_batch_size]
                            for k in xrange(0, n, mini_batch_size)]
            for mini_batch in mini_batches:
                self.update_mini_batch(mini_batch, eta)
            if test_data:
                print("Epoch:{0},acc:{1}/{2}".format(j, self.evaluate(test_data), n_test))
            else:
                print("finish")
    def update_mini_batch(self, mini_batch, eta):
        # Accumulate the gradients over the mini-batch, then take one step.
        deriv_w_mb = [np.zeros(w.shape) for w in self.weights]
        deriv_b_mb = [np.zeros(b.shape) for b in self.biases]
        for x, y in mini_batch:
            deriv_w, deriv_b = self.backprop(x, y)
            deriv_w_mb = [w+dw for w, dw in zip(deriv_w_mb, deriv_w)]
            deriv_b_mb = [b+db for b, db in zip(deriv_b_mb, deriv_b)]
        self.weights = [w-(eta/len(mini_batch))*nw
                        for w, nw in zip(self.weights, deriv_w_mb)]
        self.biases = [b-(eta/len(mini_batch))*nb
                       for b, nb in zip(self.biases, deriv_b_mb)]
    def backprop(self, x, y):
        """Return ``(deriv_w, deriv_b)``, the gradients of the quadratic
        cost with respect to the weights and biases, for one example."""
        deriv_b = [np.zeros(b.shape) for b in self.biases]
        deriv_w = [np.zeros(w.shape) for w in self.weights]
        # Forward pass: store every activation a and weighted input z.
        a = x
        a_arr = [x]
        z_arr = []
        for w, b in zip(self.weights, self.biases):
            z = np.dot(w, a)+b
            z_arr.append(z)
            a = sigmoid(z)
            a_arr.append(a)
        # Backward pass: output-layer error, then propagate it backwards.
        delta = (a_arr[-1]-y)*sigmoid_prime(z_arr[-1])
        deriv_b[-1] = delta
        deriv_w[-1] = np.dot(delta, np.transpose(a_arr[-2]))
        for l in xrange(2, self.num_layers):
            z = z_arr[-l]
            delta = np.dot(np.transpose(self.weights[-l+1]), delta)*sigmoid_prime(z)
            deriv_b[-l] = delta
            deriv_w[-l] = np.dot(delta, np.transpose(a_arr[-l-1]))
        return (deriv_w, deriv_b)
    def evaluate(self, test_data):
        test_results = [(np.argmax(self.feedforward(x)), y)
                        for (x, y) in test_data]
        return sum(int(x == y) for (x, y) in test_results)
if __name__ == "__main__":
    sizes = [784, 30, 10]
    training_data, test_data = mnist_loader.load_data_wrapper()
    print(training_data[0][0].shape, training_data[0][1].shape)
    net = Network(sizes)
    # Nielsen's typical setting is 30 epochs, mini-batch size 10, eta = 3.0;
    # a huge mini-batch with a small eta, as below, trains much more slowly.
    net.SGD(training_data, 10, 20000, 0.2, test_data)
'''
#the model (len(n1_node)*1) format to start with
sizes=[3,4,5,6,7,3]
training_data=[([[1],[2],[3]],[[1],[0],[0]]),
([[4],[5],[6]],[[0],[1],[0]]),
([[7],[8],[9]],[[0],[0],[1]])]
test_data=[([[10],[11],[12]],0),
([[13],[14],[15]],1),
([[16],[17],[18]],2)]
#def SGD(self,training_data,epochs,mini_batch_size,eta,test_data=None):
net=Network(sizes)
w,b=net.SGD(training_data,5,1,0.002,test_data)
'''
'''x=[1,2,3]
x=np.expand_dims(x,1)
y=np.random.rand(3)
y=np.expand_dims(y,1)
z=[(x,y)]
net1=Network(sizes)
(w,b)=net1.backprop(z[0][0],z[0][1])
print(w[0].shape,w[1].shape)
print(w[0],w[1])
print(b[0].shape,b[1].shape)
print(b[0],b[1])'''
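As an optional sanity check (a sketch, not part of the code above; it assumes the Network class and sigmoid defined above are already in scope), backprop can be compared against a numerical finite-difference gradient of the quadratic cost on a tiny network:

# Numerical gradient check for backprop on a tiny network (illustrative only).
import numpy as np

def quadratic_cost(net, x, y):
    # The cost implied by delta = (a - y) * sigmoid_prime(z) in backprop.
    return 0.5 * np.sum((net.feedforward(x) - y) ** 2)

def check_backprop(net, x, y, eps=1e-5):
    deriv_w, deriv_b = net.backprop(x, y)
    # Compare the analytic derivative of one weight with a finite difference.
    w_orig = net.weights[0][0, 0]
    net.weights[0][0, 0] = w_orig + eps
    c_plus = quadratic_cost(net, x, y)
    net.weights[0][0, 0] = w_orig - eps
    c_minus = quadratic_cost(net, x, y)
    net.weights[0][0, 0] = w_orig
    numeric = (c_plus - c_minus) / (2 * eps)
    print("analytic: %g  numeric: %g" % (deriv_w[0][0, 0], numeric))

net = Network([3, 4, 3])
x = np.random.rand(3, 1)
y = np.zeros((3, 1)); y[0] = 1.0
check_backprop(net, x, y)   # the two numbers should agree to many decimals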
As laid out above, we set up a 3-layer toy network (the commented sizes=[3,4,3] example): the input to the first layer has shape (3,1), the weights feeding the hidden layer have shape (4,3), the hidden-layer input has shape (4,1), the weights feeding the output layer have shape (3,4), and the output layer has shape (3,1). That is how our implementation arranges things.
PS: note that we do not feed the first layer a flat (3,) array. Using (n,1) column vectors simply makes the partial derivatives easy to compute as plain matrix products; a (3,) input is not impossible, it just makes the calculation more cumbersome. A shape check is sketched below.
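A minimal shape check for that toy layout (assuming the Network class and sigmoid defined above are in scope):

# Shape check for the toy sizes = [3, 4, 3] network described above.
import numpy as np

net = Network([3, 4, 3])
print([w.shape for w in net.weights])   # [(4, 3), (3, 4)]
print([b.shape for b in net.biases])    # [(4, 1), (3, 1)]

a = np.random.rand(3, 1)                # input as a (3, 1) column vector
for w, b in zip(net.weights, net.biases):
    a = sigmoid(np.dot(w, a) + b)
    print(a.shape)                      # (4, 1), then (3, 1)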
PS: what happens when np.dot is applied to a (3,) ndarray and a (4,) ndarray:
As the snippet below shows, that layout cannot be multiplied as matrices, so we could not build the weight-gradient matrix or the bias-gradient matrix from it. So pay close attention to the dimensions of the data fed into the network and of the weights!
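A small illustration of that point (illustrative only): np.dot on two 1-D arrays of different lengths fails, while (n,1) column vectors give exactly the outer product needed for the weight gradient.

# Why (n, 1) column vectors matter for the weight-gradient outer product.
import numpy as np

delta = np.random.rand(3)        # shape (3,)
a = np.random.rand(4)            # shape (4,)
try:
    np.dot(delta, a)             # 1-D x 1-D of different lengths: ValueError
except ValueError as e:
    print("np.dot failed: %s" % e)

delta = delta.reshape(3, 1)      # shape (3, 1)
a = a.reshape(4, 1)              # shape (4, 1)
print(np.dot(delta, a.T).shape)  # (3, 4): exactly the shape of the weight gradient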