Neural Network Initialization

Initializing the parameters of a neural network properly matters: a suitable initialization makes training easier and improves accuracy. There is no single initialization that fits every case; the choice should take the network architecture, the activation functions, and the optimization algorithm into account.

This post records five ways to initialize the parameters:

1. Initialize all parameters to zero.

2. Random initialization: np.random.randn(layers_dims[l], layers_dims[l-1]) * 0.01. The values drawn by np.random.randn follow the standard normal distribution (mean 0, variance 1); in statistics, the variance is the average of the squared deviations of each sample from the population mean. Scaling by 0.01 keeps the initial weights small so that the activations do not saturate, which helps mitigate vanishing gradients.

3. Xavier initialization: np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(1 / layers_dims[l - 1]). This works well when the activation function is tanh.

4. He initialization: np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(2 / layers_dims[l - 1]). This works well when the activation function is ReLU.

5. A fan-in + fan-out variant: np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(2 / (layers_dims[l] + layers_dims[l - 1])), i.e. the scaling uses both the input and the output size of the layer (a variance-check sketch of these scalings follows this list).
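
As a quick sanity check of these scaling factors (a minimal sketch I added, not part of the original code; the layer sizes are made up), you can draw weights with each scheme and compare the variance of z = W·x for standard-normal inputs: the Xavier- and He-style scalings keep the variance of z around 1 to 2, while the 0.01 scaling collapses it.

# Sketch: compare var(z) for z = W x under the different weight scalings
import numpy as np

np.random.seed(0)
fan_in, fan_out, n_samples = 500, 300, 1000     # hypothetical layer sizes
x = np.random.randn(fan_in, n_samples)          # inputs with variance ~1

scales = {
	"random * 0.01":     0.01,
	"xavier, 1/n_in":    np.sqrt(1 / fan_in),
	"he, 2/n_in":        np.sqrt(2 / fan_in),
	"2/(n_in + n_out)":  np.sqrt(2 / (fan_in + fan_out)),
}
for name, scale in scales.items():
	W = np.random.randn(fan_out, fan_in) * scale
	z = np.dot(W, x)
	print("%-20s var(z) = %.4f" % (name, z.var()))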

The complete code is shown below:

# Compare several initialization methods
import os
import numpy as np
import matplotlib.pyplot as plt

# Initialize to zeros
def initialize_parameters_zeros(layers_dims):
	"""
	Arguments:
	layer_dims -- python array (list) containing the size of each layer.
	Returns:
	parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
					W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
					b1 -- bias vector of shape (layers_dims[1], 1)
					...
					WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
					bL -- bias vector of shape (layers_dims[L], 1)
	"""
	parameters = {}
	L = len(layers_dims)  # number of layers in the network

	for l in range(1, L):
		parameters['W' + str(l)] = np.zeros((layers_dims[l], layers_dims[l - 1]))
		parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
	return parameters

# Random initialization
def initialize_parameters_random(layers_dims):
	"""
	Arguments:
	layer_dims -- python array (list) containing the size of each layer.
	Returns:
	parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
					W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
					b1 -- bias vector of shape (layers_dims[1], 1)
					...
					WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
					bL -- bias vector of shape (layers_dims[L], 1)
	"""
	np.random.seed(3)  # This seed makes sure your "random" numbers will be the same as ours
	parameters = {}
	L = len(layers_dims)  # integer representing the number of layers
	for l in range(1, L):
		parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l - 1])*0.01
		parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
	return parameters

#xavier initialization
def initialize_parameters_xavier(layers_dims):
	"""
	Arguments:
	layer_dims -- python array (list) containing the size of each layer.
	Returns:
	parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
					W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
					b1 -- bias vector of shape (layers_dims[1], 1)
					...
					WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
					bL -- bias vector of shape (layers_dims[L], 1)
	"""
	np.random.seed(3)
	parameters = {}
	L = len(layers_dims)  # integer representing the number of layers
	for l in range(1, L):
		parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(1 / layers_dims[l - 1])
		parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
	return parameters

#He initialization
def initialize_parameters_he(layers_dims):
	"""
	Arguments:
	layer_dims -- python array (list) containing the size of each layer.
	Returns:
	parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
					W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
					b1 -- bias vector of shape (layers_dims[1], 1)
					...
					WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
					bL -- bias vector of shape (layers_dims[L], 1)
	"""
	np.random.seed(3)
	parameters = {}
	L = len(layers_dims)  # integer representing the number of layers

	for l in range(1, L):
		parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(2 / layers_dims[l - 1])
		parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
	return parameters

def initialize_parameters_yo(layers_dims):
	"""
		Arguments:
		layer_dims -- python array (list) containing the size of each layer.
		Returns:
		parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
						W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
						b1 -- bias vector of shape (layers_dims[1], 1)
						...
						WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
						bL -- bias vector of shape (layers_dims[L], 1)
		"""
	np.random.seed(3)
	parameters = {}
	L = len(layers_dims)  # integer representing the number of layers

	for l in range(1, L):
		parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(2 / (layers_dims[l]+layers_dims[l - 1]))
		parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
	return parameters

def relu(Z):
	"""
	:param Z: Output of the linear layer
	:return:
	A: output of activation
	"""
	A = np.maximum(0,Z)
	return A


def initialize_parameters(layer_dims):
	"""
	:param layer_dims: list, the number of units (dimension) of each layer
	:return: dictionary storing the parameters W1, W2, ..., WL, b1, ..., bL
	"""
	np.random.seed(3)
	L = len(layer_dims)#the number of layers in the network
	parameters = {}
	for l in range(1, L):
		parameters["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1])*np.sqrt(2 / (layer_dims[l - 1]+layer_dims[l]))
		parameters["b" + str(l)] = np.zeros((layer_dims[l], 1))
	return parameters

def forward_propagation(initialization="yo"):
	# Forward-propagate random data through a deep ReLU network and plot the
	# activation histogram of every layer, to see how each initialization behaves.
	np.random.seed(3)
	data = np.random.randn(1000, 100000)  # 1000 input features, 100000 samples
	print("data shape : ", data.shape)
	layers_dims = [1000, 800, 500, 300, 200, 100, 10]
	num_layers = len(layers_dims)
	# Initialize parameters dictionary.
	if initialization == "zeros":
		parameters = initialize_parameters_zeros(layers_dims)
	elif initialization == "random":
		parameters = initialize_parameters_random(layers_dims)
	elif initialization == "xavier":
		parameters = initialize_parameters_xavier(layers_dims)
	elif initialization == "he":
		parameters = initialize_parameters_he(layers_dims)
	elif initialization == "yo":
		parameters = initialize_parameters_yo(layers_dims)
	A = data
	for l in range(1, num_layers):
		A_pre = A
		W = parameters["W" + str(l)]
		print("W shape : ", W.shape)
		b = parameters["b" + str(l)]
		z = np.dot(W, A_pre) + b  # np.dot: for two vectors this is the inner product (multiply element-wise, then sum, giving a scalar);
		# for matrices it is matrix multiplication: the first operand's columns must match the second's rows, (a x b) * (b x c) = (a x c)
		# A = np.tanh(z)  # alternative: tanh activation
		A = relu(z)
		print("A shape : ", A.shape)
		print(" A flatten shape: ", A.flatten().shape)
		plt.subplot(2, 3, l)
		plt.hist(A.flatten(), facecolor='g')
		plt.xlim([-1, 1])
		plt.yticks([])
	os.makedirs("save_picture", exist_ok=True)  # make sure the output directory exists
	plt.savefig("save_picture/%s.jpg" % initialization, dpi=500)
	plt.show()
	plt.close()

if __name__ == '__main__':
	forward_propagation()
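
The __main__ block above only runs the default "yo" scheme; to generate one histogram figure per scheme, you can replace the call with a small loop (my addition, not in the original script):

# Optional driver: one activation-histogram figure per initialization scheme
for init in ["zeros", "random", "xavier", "he", "yo"]:
	forward_propagation(initialization=init)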
 

Below is an example that applies method 5 (the fan-in + fan-out scaling) to initialize the parameters of LeNet-5:

import torch.nn as nn
import numpy as np
import torch

np.random.seed(1307)

class LeNet(nn.Module):
    def __init__(self, cfg):
        super(LeNet, self).__init__()
        self.features = self.features_layers(cfg)

        self.classifier = self.classifier_layers(cfg)

    def forward(self, x):
        out = self.features(x)
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return out

    def features_layers(self, cfg):
        layers = []
        in_channels = 1

        conv1 = nn.Conv2d(in_channels, cfg[0], kernel_size=3, stride=1, padding=1)
        layers += [self.init_weight_bias(conv1, [cfg[0], in_channels, 3, 3], cfg[0])]
        # layers += [conv1]
        layers = self.make_activation(cfg[1], layers)
        layers = self.make_pool(cfg[2], layers)

        conv2 = nn.Conv2d(cfg[0], cfg[3], kernel_size=5)
        layers += [self.init_weight_bias(conv2, [cfg[3], cfg[0], 5, 5], cfg[3])]
        # layers += [conv2]
        layers = self.make_activation(cfg[4], layers)
        layers = self.make_pool(cfg[5], layers)

        return nn.Sequential(*layers)

    def classifier_layers(self, cfg):
        layers = []
        num_classes = 10

        linear1 = nn.Linear(cfg[3]*5*5, cfg[6])
        layers += [self.init_weight_bias(linear1, [cfg[6], cfg[3]*5*5], cfg[6])]
        # layers += [linear1]
        layers = self.make_activation(cfg[7], layers)

        linear2 = nn.Linear(cfg[6], cfg[8])
        layers += [self.init_weight_bias(linear2, [cfg[8], cfg[6]], cfg[8])]
        # layers += [linear2]
        layers = self.make_activation(cfg[9], layers)

        linear3 = nn.Linear(cfg[8], num_classes)
        layers += [self.init_weight_bias(linear3, [num_classes, cfg[8]], num_classes)]
        # layers += [linear3]

        return nn.Sequential(*layers)

    def make_activation(self, activation, layers):
        if activation == "relu":
            layers += [nn.ReLU(inplace=True)]
        elif activation == "sigmoid":
            layers += [nn.Sigmoid()]
        elif activation == "tanh":
            layers += [nn.Tanh()]
        else:
            print("the activation is wrong!")
        return layers

    def make_pool(self, pool, layers):
        if pool == "maxpool":
            layers += [nn.MaxPool2d(2)]
        elif pool == "avgpool":
            layers += [nn.AvgPool2d(2)]
        else:
            print("the convolutional pool is wrong!")
        return layers

    def init_weight_bias(self, layer, weight_size, bias_size):
        # method 5 scaling: standard-normal weights * sqrt(2 / sum of the weight tensor's dimensions)
        length = len(weight_size)
        if length == 2:
            init_weights = torch.Tensor(np.random.randn(weight_size[0], weight_size[1])*np.sqrt(2/(weight_size[1]+weight_size[0])))
        else:
            init_weights = torch.Tensor(np.random.randn(weight_size[0], weight_size[1],
                                                        weight_size[2], weight_size[3]) *
                                        np.sqrt(2/(weight_size[3]+weight_size[2]+weight_size[1]+weight_size[0])))
        init_bias = torch.Tensor(np.zeros(bias_size))  # biases are initialized to zero

        layer.weight = nn.Parameter(init_weights)
        layer.bias = nn.Parameter(init_bias)


        return layer

if __name__ == '__main__':
    cfg = [5, "sigmoid", "maxpool", 5, "sigmoid", "maxpool", 50, "relu", 150, "relu"]
    net = LeNet(cfg)
    print(net)
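
For comparison, PyTorch also ships built-in initializers in torch.nn.init covering the Xavier/Glorot and He/Kaiming schemes discussed above; the sketch below (my addition, not part of the original model) re-initializes an existing LeNet with them:

# Sketch: re-initialize conv/linear layers with torch.nn.init built-ins
import torch.nn as nn

def init_builtin(model, scheme="xavier"):
    for m in model.modules():
        if isinstance(m, (nn.Conv2d, nn.Linear)):
            if scheme == "xavier":
                nn.init.xavier_normal_(m.weight)    # std = sqrt(2 / (fan_in + fan_out))
            else:
                nn.init.kaiming_normal_(m.weight)   # std = sqrt(2 / fan_in), suited to ReLU
            if m.bias is not None:
                nn.init.zeros_(m.bias)              # biases start at zero

# usage: net = LeNet(cfg); init_builtin(net, scheme="xavier")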

 

 
