The network introduced earlier in the post on error backpropagation had only two layers. Here we build a multi-layer version: one characteristic, or rather advantage, of neural networks is that layers (hidden layers) can be stacked as deep as you like.
Multi-layer structure: a digit image (1*28*28=784) -> Affine1 -> ReLU1 (activation) -> Affine2 -> ReLU2 -> ... -> AffineN -> Softmax (probabilities)
The main work is generating the Affine layers and the activation layers; the code below is the explanation. A small warm-up sketch of one such step follows, and if anything is unclear, feel free to leave a comment.
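To make the data flow above concrete, here is a minimal hand-rolled sketch of a single Affine + ReLU step in plain NumPy. The batch size and layer sizes (2, 784, 80) are made-up example values, not anything fixed by the code below:

import numpy as np

x = np.random.rand(2, 784)                            # a mini-batch of 2 flattened 28*28 images
W1 = 0.01 * np.random.randn(784, 80)                  # example weights for the first Affine layer
b1 = np.zeros(80)

a1 = np.dot(x, W1) + b1                               # Affine1: (2, 784) x (784, 80) -> (2, 80)
z1 = np.maximum(0, a1)                                # ReLU1: keep positive values, zero out the rest
print(a1.shape, z1.shape)                             # (2, 80) (2, 80)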
multi_layer_net.py
import numpy as np
from collections import OrderedDict
from common.layers import *
from common.gradient import numerical_gradient
class MultiLayerNet:
    '''
    Fully connected multi-layer neural network

    hiddenSizeList: list of hidden layer sizes; any number of layers can be given
    activation: activation function, 'relu' or 'sigmoid'
    weight_init_std: standard deviation of the weights (e.g. 0.01)
        'relu' or 'he' selects the "He initial values"
        'sigmoid' or 'xavier' selects the "Xavier initial values"
    weight_decay_lambda: strength of weight decay (L2 norm)
    '''
    def __init__(self, inputSize, hiddenSizeList, outputSize, activation='relu', weight_init_std='relu', weight_decay_lambda=0):
        self.inputSize = inputSize
        self.outputSize = outputSize
        self.hiddenSizeList = hiddenSizeList
        self.hiddenLayerNum = len(hiddenSizeList)
        self.weight_decay_lambda = weight_decay_lambda
        self.params = {}
        self.__init_weight(weight_init_std)
        # build the Affine and activation layers (hidden layers), then the final Affine and Softmax layers
        activationLayer = {'sigmoid': Sigmoid, 'relu': Relu}
        self.layers = OrderedDict()
        for idx in range(1, self.hiddenLayerNum+1):
            self.layers['Affine'+str(idx)] = Affine(self.params['W'+str(idx)], self.params['b'+str(idx)])
            self.layers['Activation'+str(idx)] = activationLayer[activation]()
        idx = self.hiddenLayerNum + 1
        self.layers['Affine'+str(idx)] = Affine(self.params['W'+str(idx)], self.params['b'+str(idx)])
        self.lastLayer = SoftmaxWithLoss()

    def __init_weight(self, weight_init_std):
        '''Initialize the weights'''
        allSizeList = [self.inputSize] + self.hiddenSizeList + [self.outputSize]
        for idx in range(1, len(allSizeList)):
            scale = weight_init_std
            if str(weight_init_std).lower() in ('relu', 'he'):
                scale = np.sqrt(2.0 / allSizeList[idx-1])  # recommended initial value when using ReLU
            elif str(weight_init_std).lower() in ('sigmoid', 'xavier'):
                scale = np.sqrt(1.0 / allSizeList[idx-1])  # recommended initial value when using sigmoid
            # np.random.randn draws from the standard normal distribution; scale sets the standard deviation
            self.params['W'+str(idx)] = scale * np.random.randn(allSizeList[idx-1], allSizeList[idx])
            self.params['b'+str(idx)] = np.zeros(allSizeList[idx])

    def predict(self, x):
        for layer in self.layers.values():
            x = layer.forward(x)
        return x

    def loss(self, x, t):
        '''Compute the loss function'''
        y = self.predict(x)
        weight_decay = 0
        for idx in range(1, self.hiddenLayerNum+2):
            W = self.params['W'+str(idx)]
            weight_decay += 0.5 * self.weight_decay_lambda * np.sum(W ** 2)
        return self.lastLayer.forward(y, t) + weight_decay

    def accuracy(self, x, t):
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim != 1:
            t = np.argmax(t, axis=1)
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    def numerical_gradient(self, x, t):
        '''Compute the gradients by numerical differentiation'''
        loss_W = lambda W: self.loss(x, t)
        grads = {}
        for idx in range(1, self.hiddenLayerNum+2):
            grads['W'+str(idx)] = numerical_gradient(loss_W, self.params['W'+str(idx)])
            grads['b'+str(idx)] = numerical_gradient(loss_W, self.params['b'+str(idx)])
        return grads

    def gradient(self, x, t):
        '''Compute the gradients by error backpropagation'''
        # forward
        self.loss(x, t)
        # backward
        dout = 1
        dout = self.lastLayer.backward(dout)
        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)
        # collect the gradients
        grads = {}
        for idx in range(1, self.hiddenLayerNum+2):
            grads['W'+str(idx)] = self.layers['Affine'+str(idx)].dW + self.weight_decay_lambda * self.layers['Affine'+str(idx)].W
            grads['b'+str(idx)] = self.layers['Affine'+str(idx)].db
        return grads
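Since the class provides both gradient() (backpropagation) and numerical_gradient() (numerical differentiation), a quick sanity check is to compare the two on a tiny batch. The following is only a sketch with made-up random data; the differences printed should all be close to 0 (numerical differentiation is slow, so keep the network and batch small):

import numpy as np
from common.multi_layer_net import MultiLayerNet

net = MultiLayerNet(inputSize=784, hiddenSizeList=[10], outputSize=10)
x = np.random.rand(3, 784)                 # 3 random "images", purely for illustration
t = np.zeros((3, 10))
t[np.arange(3), [1, 4, 7]] = 1             # made-up one-hot labels

grad_bp = net.gradient(x, t)               # error backpropagation
grad_num = net.numerical_gradient(x, t)    # numerical differentiation (slow)
for key in grad_bp:
    print(key, np.mean(np.abs(grad_bp[key] - grad_num[key])))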
layers.py (Affine and the activation/output layers Relu, Sigmoid, and Softmax)
import numpy as np
from common.functions import *
class Relu:
    def __init__(self):
        self.mask = None

    def forward(self, x):
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        dout[self.mask] = 0
        dx = dout
        return dx


class Sigmoid:
    def __init__(self):
        self.out = None

    def forward(self, x):
        out = sigmoid(x)
        self.out = out
        return out

    def backward(self, dout):
        dx = dout * (1.0 - self.out) * self.out
        return dx


class Affine:
    def __init__(self, W, b):
        self.W = W
        self.b = b
        self.x = None
        self.original_x_shape = None
        # derivatives of the weight and bias parameters
        self.dW = None
        self.db = None

    def forward(self, x):
        # handle tensor input
        self.original_x_shape = x.shape
        x = x.reshape(x.shape[0], -1)
        self.x = x
        out = np.dot(self.x, self.W) + self.b
        return out

    def backward(self, dout):
        dx = np.dot(dout, self.W.T)
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        dx = dx.reshape(*self.original_x_shape)  # restore the shape of the input data (for tensor input)
        return dx


class SoftmaxWithLoss:
    def __init__(self):
        self.loss = None
        self.y = None  # output of softmax
        self.t = None  # teacher (supervision) data

    def forward(self, x, t):
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        batch_size = self.t.shape[0]
        if self.t.size == self.y.size:  # teacher data given as one-hot vectors
            dx = (self.y - self.t) / batch_size
        else:
            dx = self.y.copy()
            dx[np.arange(batch_size), self.t] -= 1
            dx = dx / batch_size
        return dx


class Dropout:
    '''
    Randomly drops neurons.
    self.mask: array of True/False values; positions that are False are set to 0, i.e. dropped
    '''
    def __init__(self, dropout_ratio=0.5):
        self.dropout_ratio = dropout_ratio
        self.mask = None

    def forward(self, x, train_flg=True):
        if train_flg:
            self.mask = np.random.rand(*x.shape) > self.dropout_ratio
            return x * self.mask
        else:
            return x * (1.0 - self.dropout_ratio)

    def backward(self, dout):
        return dout * self.mask


class BatchNormalization:
    def __init__(self, gamma, beta, momentum=0.9, running_mean=None, running_var=None):
        self.gamma = gamma
        self.beta = beta
        self.momentum = momentum
        self.input_shape = None  # 4-dimensional for Conv layers, 2-dimensional for fully connected layers
        # mean and variance used at test time
        self.running_mean = running_mean
        self.running_var = running_var
        # intermediate data used in backward
        self.batch_size = None
        self.xc = None
        self.std = None
        self.dgamma = None
        self.dbeta = None

    def forward(self, x, train_flg=True):
        self.input_shape = x.shape
        if x.ndim != 2:
            N, C, H, W = x.shape
            x = x.reshape(N, -1)
        out = self.__forward(x, train_flg)
        return out.reshape(*self.input_shape)

    def __forward(self, x, train_flg):
        if self.running_mean is None:
            N, D = x.shape
            self.running_mean = np.zeros(D)
            self.running_var = np.zeros(D)
        if train_flg:
            mu = x.mean(axis=0)
            xc = x - mu
            var = np.mean(xc**2, axis=0)
            std = np.sqrt(var + 10e-7)
            xn = xc / std
            self.batch_size = x.shape[0]
            self.xc = xc
            self.xn = xn
            self.std = std
            self.running_mean = self.momentum * self.running_mean + (1 - self.momentum) * mu
            self.running_var = self.momentum * self.running_var + (1 - self.momentum) * var
        else:
            xc = x - self.running_mean
            xn = xc / np.sqrt(self.running_var + 10e-7)
        out = self.gamma * xn + self.beta
        return out

    def backward(self, dout):
        if dout.ndim != 2:
            N, C, H, W = dout.shape
            dout = dout.reshape(N, -1)
        dx = self.__backward(dout)
        dx = dx.reshape(*self.input_shape)
        return dx

    def __backward(self, dout):
        dbeta = dout.sum(axis=0)
        dgamma = np.sum(self.xn * dout, axis=0)
        dxn = self.gamma * dout
        dxc = dxn / self.std
        dstd = -np.sum((dxn * self.xc) / (self.std * self.std), axis=0)
        dvar = 0.5 * dstd / self.std
        dxc += (2.0 / self.batch_size) * self.xc * dvar
        dmu = np.sum(dxc, axis=0)
        dx = dxc - dmu / self.batch_size
        self.dgamma = dgamma
        self.dbeta = dbeta
        return dx
Some of these classes (Dropout and BatchNormalization) are not used in this example yet; they are included here and will be covered in detail later, since they are very effective for optimizing the parameters. The common helper functions in functions.py under the common directory were introduced earlier and are not repeated here; see the post Python随机梯度下降法(四)【完结篇】 (Python Stochastic Gradient Descent, Part 4, Final).
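For readers who want a self-contained listing, here is a minimal sketch of the three helpers from common/functions.py that layers.py actually uses (sigmoid, softmax, cross_entropy_error). Treat it as an assumption about the earlier post, not a verbatim copy of it:

# common/functions.py (minimal sketch of the helpers used by layers.py)
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def softmax(x):
    x = x - np.max(x, axis=-1, keepdims=True)  # subtract the max to avoid overflow
    return np.exp(x) / np.sum(np.exp(x), axis=-1, keepdims=True)

def cross_entropy_error(y, t):
    if y.ndim == 1:
        t = t.reshape(1, t.size)
        y = y.reshape(1, y.size)
    if t.size == y.size:        # if the teacher data is one-hot, convert it to label indices
        t = t.argmax(axis=1)
    batch_size = y.shape[0]
    return -np.sum(np.log(y[np.arange(batch_size), t] + 1e-7)) / batch_size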
Now let's load the MNIST dataset. This example uses a three-layer structure (two hidden layers); you can add as many layers as you like, e.g. [80,120,100,100].
import numpy as np
from dataset.mnist import load_mnist
from common.multi_layer_net import MultiLayerNet
# load the MNIST data
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True)
train_num = x_train.shape[0]  # 60000 training images
batch_num = 200               # number of samples drawn at random each iteration
max_iter = 500                # number of iterations
# three layers (two hidden layers)
networks = MultiLayerNet(inputSize=784, hiddenSizeList=[80, 120], outputSize=10)
lr = 0.1
for i in range(max_iter):
    batch_mask = np.random.choice(train_num, batch_num)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]
    grads = networks.gradient(x_batch, t_batch)
    # update the weights and biases of all three layers with their gradients
    for k in ('W1', 'b1', 'W2', 'b2', 'W3', 'b3'):
        networks.params[k] -= lr * grads[k]
    if i % 100 == 0:
        print(grads['W1'].shape, grads['W2'].shape, grads['W3'].shape)
        # (784, 80) (80, 120) (120, 10)
        print(grads['b1'].shape, grads['b2'].shape, grads['b3'].shape)
        # (80,) (120,) (10,)
        print(networks.accuracy(x_train, t_train))
Accuracy (correct rate), printed every 100 iterations:
0.11478333333333333
0.8880166666666667
0.91055
0.9217666666666666
0.9310666666666667