The network introduced earlier in the post on error backpropagation had only two layers. Here we build a multi-layer version: one characteristic, or rather advantage, of neural networks is that layers (hidden layers) can be stacked as deep as you like.
Multi-layer structure: a digit image (1*28*28=784) -> Affine1 -> ReLU1 (activation) -> Affine2 -> ReLU2 -> ... -> AffineN -> Softmax (probabilities)
The main work is generating the Affine layers and the activation layers; the code below is the explanation. A small warm-up sketch of one such step follows, and if anything is unclear, feel free to leave a comment.
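To make the data flow above concrete, here is a minimal hand-rolled sketch of a single Affine + ReLU step in plain NumPy. The batch size and layer sizes (2, 784, 80) are made-up example values, not anything fixed by the code below:

import numpy as np

x = np.random.rand(2, 784)                            # a mini-batch of 2 flattened 28*28 images
W1 = 0.01 * np.random.randn(784, 80)                  # example weights for the first Affine layer
b1 = np.zeros(80)

a1 = np.dot(x, W1) + b1                               # Affine1: (2, 784) x (784, 80) -> (2, 80)
z1 = np.maximum(0, a1)                                # ReLU1: keep positive values, zero out the rest
print(a1.shape, z1.shape)                             # (2, 80) (2, 80)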
multi_layer_net.py
import numpy as np
from collections import OrderedDict
from common.layers import *
from common.gradient import numerical_gradient
class MultiLayerNet:
    '''
    Fully connected multi-layer neural network

    hiddenSizeList: list of hidden layer sizes; any number of layers can be given
    activation: activation function, 'relu' or 'sigmoid'
    weight_init_std: standard deviation of the weights (e.g. 0.01)
        'relu' or 'he' selects the "He initial values"
        'sigmoid' or 'xavier' selects the "Xavier initial values"
    weight_decay_lambda: strength of weight decay (L2 norm)
    '''
    def __init__(self, inputSize, hiddenSizeList, outputSize, activation='relu', weight_init_std='relu', weight_decay_lambda=0):
        self.inputSize = inputSize
        self.outputSize = outputSize
        self.hiddenSizeList = hiddenSizeList
        self.hiddenLayerNum = len(hiddenSizeList)
        self.weight_decay_lambda = weight_decay_lambda
        self.params = {}
        self.__init_weight(weight_init_std)
        # build the Affine and activation layers (hidden layers), then the final Affine and Softmax layers
        activationLayer = {'sigmoid': Sigmoid, 'relu': Relu}
        self.layers = OrderedDict()
        for idx in range(1, self.hiddenLayerNum+1):
            self.layers['Affine'+str(idx)] = Affine(self.params['W'+str(idx)], self.params['b'+str(idx)])
            self.layers['Activation'+str(idx)] = activationLayer[activation]()
        idx = self.hiddenLayerNum + 1
        self.layers['Affine'+str(idx)] = Affine(self.params['W'+str(idx)], self.params['b'+str(idx)])
        self.lastLayer = SoftmaxWithLoss()

    def __init_weight(self, weight_init_std):
        '''Initialize the weights'''
        allSizeList = [self.inputSize] + self.hiddenSizeList + [self.outputSize]
        for idx in range(1, len(allSizeList)):
            scale = weight_init_std
            if str(weight_init_std).lower() in ('relu', 'he'):
                scale = np.sqrt(2.0 / allSizeList[idx-1])  # recommended initial value when using ReLU
            elif str(weight_init_std).lower() in ('sigmoid', 'xavier'):
                scale = np.sqrt(1.0 / allSizeList[idx-1])  # recommended initial value when using sigmoid
            # np.random.randn draws from the standard normal distribution; scale sets the standard deviation
            self.params['W'+str(idx)] = scale * np.random.randn(allSizeList[idx-1], allSizeList[idx])
            self.params['b'+str(idx)] = np.zeros(allSizeList[idx])

    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)
        return x

    def loss(self, x, t):
        '''Compute the loss function'''
        y = self.predict(x)
        weight_decay = 0
        for idx in range(1, self.hiddenLayerNum+2):
            W = self.params['W'+str(idx)]
            weight_decay += 0.5 * self.weight_decay_lambda * np.sum(W ** 2)
        return self.lastLayer.forward(y, t) + weight_decay

    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim != 1:
            t = np.argmax(t, axis=1)
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    def numerical_gradient(self, x, t):
        '''Compute the gradients by numerical differentiation'''
        loss_W = lambda W: self.loss(x, t)
        grads = {}
        for idx in range(1, self.hiddenLayerNum+2):
            grads['W'+str(idx)] = numerical_gradient(loss_W, self.params['W'+str(idx)])
            grads['b'+str(idx)] = numerical_gradient(loss_W, self.params['b'+str(idx)])
        return grads

    def gradient(self, x, t):
        '''Compute the gradients by error backpropagation'''
        # forward
        self.loss(x, t)
        # backward
        dout = 1
        dout = self.lastLayer.backward(dout)
        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)
        # collect the gradients
        grads = {}
        for idx in range(1, self.hiddenLayerNum+2):
            grads['W'+str(idx)] = self.layers['Affine'+str(idx)].dW + self.weight_decay_lambda * self.layers['Affine'+str(idx)].W
            grads['b'+str(idx)] = self.layers['Affine'+str(idx)].db
        return grads
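Since the class provides both gradient() (backpropagation) and numerical_gradient() (numerical differentiation), a quick sanity check is to compare the two on a tiny batch. The following is only a sketch with made-up random data; the differences printed should all be close to 0 (numerical differentiation is slow, so keep the network and batch small):

import numpy as np
from common.multi_layer_net import MultiLayerNet

net = MultiLayerNet(inputSize=784, hiddenSizeList=[10], outputSize=10)
x = np.random.rand(3, 784)                 # 3 random "images", purely for illustration
t = np.zeros((3, 10))
t[np.arange(3), [1, 4, 7]] = 1             # made-up one-hot labels

grad_bp = net.gradient(x, t)               # error backpropagation
grad_num = net.numerical_gradient(x, t)    # numerical differentiation (slow)
for key in grad_bp:
    print(key, np.mean(np.abs(grad_bp[key] - grad_num[key])))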
layers.py (Affine and the activation/output layers Relu, Sigmoid, and Softmax)
import numpy as np
from common.functions import *
class Relu:
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout
        return dx


class Sigmoid:
    def __init__(self):
        self.out = None

    def forward(self, x):
        out = sigmoid(x)
        self.out = out
        return out

    def backward(self, dout):
        dx = dout * (1.0 - self.out) * self.out
        return dx


class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.original_x_shape = None
        # derivatives of the weight and bias parameters
        self.dW = None
        self.db = None

    def forward(self, x):
        # handle tensor input
        self.original_x_shape = x.shape
        x = x.reshape(x.shape[0], -1)
        self.x = x
        out = np.dot(self.x, self.W) + self.b
        return out

    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        dx = dx.reshape(*self.original_x_shape)  # restore the shape of the input data (for tensor input)
        return dx


class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.y = None  # output of softmax
        self.t = None  # teacher (supervision) data

    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size:  # teacher data given as one-hot vectors
            dx = (self.y - self.t) / batch_size
        else:
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size
        return dx


class Dropout:
    '''
    Randomly drops neurons.
    self.mask: array of True/False values; positions that are False are set to 0, i.e. dropped
    '''
    def __init__(self, dropout_ratio=0.5):
        self.dropout_ratio = dropout_ratio
        self.mask = None

    def forward(self, x, train_flg=True):
        if train_flg:
            self.mask = np.random.rand(*x.shape) > self.dropout_ratio
            return x * self.mask
        else:
            return x * (1.0 - self.dropout_ratio)

    def backward(self, dout):
        return dout * self.mask


class BatchNormalization:
    def __init__(self, gamma, beta, momentum=0.9, running_mean=None, running_var=None):
        self.gamma = gamma
        self.beta = beta
        self.momentum = momentum
        self.input_shape = None  # 4-dimensional for Conv layers, 2-dimensional for fully connected layers
        # mean and variance used at test time
        self.running_mean = running_mean
        self.running_var = running_var
        # intermediate data used in backward
        self.batch_size = None
        self.xc = None
        self.std = None
        self.dgamma = None
        self.dbeta = None

    def forward(self, x, train_flg=True):
        self.input_shape = x.shape
        if x.ndim != 2:
            N, C, H, W = x.shape
            x = x.reshape(N, -1)
        out = self.__forward(x, train_flg)
        return out.reshape(*self.input_shape)

    def __forward(self, x, train_flg):
        if self.running_mean is None:
            N, D = x.shape
            self.running_mean = np.zeros(D)
            self.running_var = np.zeros(D)
        if train_flg:
            mu = x.mean(axis=0)
            xc = x - mu
            var = np.mean(xc**2, axis=0)
            std = np.sqrt(var + 10e-7)
            xn = xc / std
            self.batch_size = x.shape[0]
            self.xc = xc
            self.xn = xn
            self.std = std
            self.running_mean = self.momentum * self.running_mean + (1 - self.momentum) * mu
            self.running_var = self.momentum * self.running_var + (1 - self.momentum) * var
        else:
            xc = x - self.running_mean
            xn = xc / np.sqrt(self.running_var + 10e-7)
        out = self.gamma * xn + self.beta
        return out

    def backward(self, dout):
        if dout.ndim != 2:
            N, C, H, W = dout.shape
            dout = dout.reshape(N, -1)
        dx = self.__backward(dout)
        dx = dx.reshape(*self.input_shape)
        return dx

    def __backward(self, dout):
        dbeta = dout.sum(axis=0)
        dgamma = np.sum(self.xn * dout, axis=0)
        dxn = self.gamma * dout
        dxc = dxn / self.std
        dstd = -np.sum((dxn * self.xc) / (self.std * self.std), axis=0)
        dvar = 0.5 * dstd / self.std
        dxc += (2.0 / self.batch_size) * self.xc * dvar
        dmu = np.sum(dxc, axis=0)
        dx = dxc - dmu / self.batch_size
        self.dgamma = dgamma
        self.dbeta = dbeta
        return dx
Some of these classes (Dropout and BatchNormalization) are not used in this example yet; they are included here and will be covered in detail later, since they are very effective for optimizing the parameters. The common helper functions in functions.py under the common directory were introduced earlier and are not repeated here; see the post Python随机梯度下降法(四)【完结篇】 (Python Stochastic Gradient Descent, Part 4, Final).
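For readers who want a self-contained listing, here is a minimal sketch of the three helpers from common/functions.py that layers.py actually uses (sigmoid, softmax, cross_entropy_error). Treat it as an assumption about the earlier post, not a verbatim copy of it:

# common/functions.py (minimal sketch of the helpers used by layers.py)
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def softmax(x):
    x = x - np.max(x, axis=-1, keepdims=True)  # subtract the max to avoid overflow
    return np.exp(x) / np.sum(np.exp(x), axis=-1, keepdims=True)

def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    if t.size == y.size:        # if the teacher data is one-hot, convert it to label indices
        t = t.argmax(axis=1)
    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size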
Now let's load the MNIST dataset. This example uses a three-layer structure (two hidden layers); you can add as many layers as you like, e.g. [80,120,100,100].
import numpy as np
from dataset.mnist import load_mnist
from common.multi_layer_net import MultiLayerNet
# load the MNIST data
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True)
train_num = x_train.shape[0]  # 60000 training images
batch_num = 200               # number of samples drawn at random each iteration
max_iter = 500                # number of iterations
# three layers (two hidden layers)
networks = MultiLayerNet(inputSize=784, hiddenSizeList=[80, 120], outputSize=10)
lr = 0.1
for i in range(max_iter):
    batch_mask = np.random.choice(train_num, batch_num)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    grads = networks.gradient(x_batch, t_batch)
    # update the weights and biases of all three layers with their gradients
    for k in ('W1', 'b1', 'W2', 'b2', 'W3', 'b3'):
        networks.params[k] -= lr * grads[k]
    if i % 100 == 0:
        print(grads['W1'].shape, grads['W2'].shape, grads['W3'].shape)
        # (784, 80) (80, 120) (120, 10)
        print(grads['b1'].shape, grads['b2'].shape, grads['b3'].shape)
        # (80,) (120,) (10,)
        print(networks.accuracy(x_train, t_train))
Accuracy (correct rate), printed every 100 iterations:
0.11478333333333333
0.8880166666666667
0.91055
0.9217666666666666
0.9310666666666667