神经网络参数初始化非常重要,适合的初始化可以简化训练过程,提高精确率;参数初始化不是一成不变的,需要根据网络结构、激活函数以及优化算法等进行分析;
本篇记录5种初始化参数方法
1、将参数初始化为0
2、随机初始化参数,np.random.randn(layer_dims[l],layer_dims[l-1])*0.01,乘以0.01是为了让初始权重足够小,尽量避免激活函数饱和导致的梯度消失;随机生成的参数服从标准正态分布(即均值为0、方差为1的分布),统计学中方差是每个样本减去总体样本均值的平方的平均数
3、xavier初始化方法,np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(1 / layers_dims[l - 1]) 激活函数为tanh时,采用该方法效果较好
4、np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(2 / layers_dims[l - 1]) 激活函数为relu时,采用该方法效果较好
5、np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(2 / (layers_dims[l] + layers_dims[l - 1]))
完整代码如下:
#对比几种初始化方法
import numpy as np
import matplotlib.pyplot as plt
#初始化为0
def initialize_parameters_zeros(layers_dims):
"""
Arguments:
layer_dims -- python array (list) containing the size of each layer.
Returns:
parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
b1 -- bias vector of shape (layers_dims[1], 1)
...
WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
bL -- bias vector of shape (layers_dims[L], 1)
"""
parameters = {}
L = len(layers_dims) # number of layers in the network
for l in range(1, L):
parameters['W' + str(l)] = np.zeros((layers_dims[l], layers_dims[l - 1]))
parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
return parameters
#随机初始化
def initialize_parameters_random(layers_dims):
"""
Arguments:
layer_dims -- python array (list) containing the size of each layer.
Returns:
parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
b1 -- bias vector of shape (layers_dims[1], 1)
...
WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
bL -- bias vector of shape (layers_dims[L], 1)
"""
np.random.seed(3) # This seed makes sure your "random" numbers will be the as ours
parameters = {}
L = len(layers_dims) # integer representing the number of layers
for l in range(1, L):
parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l - 1])*0.01
parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
return parameters
#xavier initialization
def initialize_parameters_xavier(layers_dims):
"""
Arguments:
layer_dims -- python array (list) containing the size of each layer.
Returns:
parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
b1 -- bias vector of shape (layers_dims[1], 1)
...
WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
bL -- bias vector of shape (layers_dims[L], 1)
"""
np.random.seed(3)
parameters = {}
L = len(layers_dims) # integer representing the number of layers
for l in range(1, L):
parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(1 / layers_dims[l - 1])
parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
return parameters
#He initialization
def initialize_parameters_he(layers_dims):
"""
Arguments:
layer_dims -- python array (list) containing the size of each layer.
Returns:
parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
b1 -- bias vector of shape (layers_dims[1], 1)
...
WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
bL -- bias vector of shape (layers_dims[L], 1)
"""
np.random.seed(3)
parameters = {}
L = len(layers_dims) # integer representing the number of layers
for l in range(1, L):
parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(2 / layers_dims[l - 1])
parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
return parameters
def initialize_parameters_yo(layers_dims):
"""
Arguments:
layer_dims -- python array (list) containing the size of each layer.
Returns:
parameters -- python dictionary containing your parameters "W1", "b1", ..., "WL", "bL":
W1 -- weight matrix of shape (layers_dims[1], layers_dims[0])
b1 -- bias vector of shape (layers_dims[1], 1)
...
WL -- weight matrix of shape (layers_dims[L], layers_dims[L-1])
bL -- bias vector of shape (layers_dims[L], 1)
"""
np.random.seed(3)
parameters = {}
L = len(layers_dims) # integer representing the number of layers
for l in range(1, L):
parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l - 1]) * np.sqrt(2 / (layers_dims[l]+layers_dims[l - 1]))
parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
return parameters
def relu(Z):
"""
:param Z: Output of the linear layer
:return:
A: output of activation
"""
A = np.maximum(0,Z)
return A
def initialize_parameters(layer_dims):
"""
:param layer_dims: list,每一层单元的个数(维度)
:return:dictionary,存储参数w1,w2,...,wL,b1,...,bL
"""
np.random.seed(3)
L = len(layer_dims)#the number of layers in the network
parameters = {}
for l in range(1, L):
parameters["W" + str(l)] = np.random.randn(layer_dims[l], layer_dims[l-1])*np.sqrt(2 / (layer_dims[l - 1]+layer_dims[l]))
parameters["b" + str(l)] = np.zeros((layer_dims[l], 1))
return parameters
def forward_propagation(initialization="yo"):
np.random.seed(3)
data = np.random.randn(1000, 100000)
print("data shape : ", data.shape)
layers_dims = [1000, 800, 500, 300, 200, 100, 10]
num_layers = len(layers_dims)
# Initialize parameters dictionary.
if initialization == "zeros":
parameters = initialize_parameters_zeros(layers_dims)
elif initialization == "random":
parameters = initialize_parameters_random(layers_dims)
elif initialization == "xavier":
parameters = initialize_parameters_xavier(layers_dims)
elif initialization == "he":
parameters = initialize_parameters_he(layers_dims)
elif initialization == "yo":
parameters = initialize_parameters_yo(layers_dims)
A = data
for l in range(1, num_layers):
A_pre = A
W = parameters["W" + str(l)]
print("W shape : ", W.shape)
b = parameters["b" + str(l)]
z = np.dot(W, A_pre) + b # dot: 向量相乘是内积运算即对应相乘再相加得到一个常量;
# dot: 矩阵乘法前者的列数等于后者的行数,axb * bxc = axc
# A = np.tanh(z) #relu activation function
A = relu(z)
print("A shape : ", A.shape)
print(" A flatten shape: ", A.flatten().shape)
plt.subplot(2, 3, l)
plt.hist(A.flatten(), facecolor='g')
plt.xlim([-1, 1])
plt.yticks([])
plt.savefig("save_picture/%s.jpg" % initialization, dpi=500)
plt.show()
plt.close()
if __name__ == '__main__':
forward_propagation()
下面贴上采用方式5对lenet5参数的初始化
import torch.nn as nn
import numpy as np
import torch
np.random.seed(1307)
class LeNet(nn.Module):
def __init__(self, cfg):
super(LeNet, self).__init__()
self.features = self.features_layers(cfg)
self.classifier = self.classifier_layers(cfg)
def forward(self, x):
out = self.features(x)
out = out.view(out.size(0), -1)
out = self.classifier(out)
return out
def features_layers(self, cfg):
layers = []
in_channels = 1
conv1 = nn.Conv2d(in_channels, cfg[0], kernel_size=3, stride=1, padding=1)
layers += [self.init_weight_bias(conv1, [cfg[0], in_channels, 3, 3], cfg[0])]
# layers += [conv1]
layers = self.make_activation(cfg[1], layers)
layers = self.make_pool(cfg[2], layers)
conv2 = nn.Conv2d(cfg[0], cfg[3], kernel_size=5)
layers += [self.init_weight_bias(conv2, [cfg[3], cfg[0], 5, 5], cfg[3])]
# layers += [conv2]
layers = self.make_activation(cfg[4], layers)
layers = self.make_pool(cfg[5], layers)
return nn.Sequential(*layers)
def classifier_layers(self, cfg):
layers = []
num_classes = 10
linear1 = nn.Linear(cfg[3]*5*5, cfg[6])
layers += [self.init_weight_bias(linear1, [cfg[6], cfg[3]*5*5], cfg[6])]
# layers += [linear1]
layers = self.make_activation(cfg[7], layers)
linear2 = nn.Linear(cfg[6], cfg[8])
layers += [self.init_weight_bias(linear2, [cfg[8], cfg[6]], cfg[8])]
# layers += [linear2]
layers = self.make_activation(cfg[9], layers)
linear3 = nn.Linear(cfg[8], num_classes)
layers += [self.init_weight_bias(linear3, [num_classes, cfg[8]], num_classes)]
# layers += [linear3]
return nn.Sequential(*layers)
def make_activation(self, activation, layers):
if activation == "relu":
layers += [nn.ReLU(inplace=True)]
elif activation == "sigmoid":
layers += [nn.Sigmoid()]
elif activation == "tanh":
layers += [nn.Tanh()]
else:
print("the activation is wrong!")
return layers
def make_pool(self, pool, layers):
if pool == "maxpool":
layers += [nn.MaxPool2d(2)]
elif pool == "avgpool":
layers += [nn.AvgPool2d(2)]
else:
print("the convolutional pool is wrong!")
return layers
def init_weight_bias(self, layer, weight_size, bias_size):
length = len(weight_size)
if length == 2:
init_weights = torch.Tensor(np.random.randn(weight_size[0], weight_size[1])*np.sqrt(2/(weight_size[1]+weight_size[0])))
else:
init_weights = torch.Tensor(np.random.randn(weight_size[0], weight_size[1],
weight_size[2], weight_size[3]) *
np.sqrt(2/(weight_size[3]+weight_size[2]+weight_size[1]+weight_size[0])))
init_bias = torch.Tensor(np.random.uniform(0, 0, bias_size))
layer.weight = nn.Parameter(init_weights)
layer.bias = nn.Parameter(init_bias)
return layer
if __name__ == '__main__':
cfg = [5, "sigmoid", "maxpool", 5, "sigmoid", "maxpool", 50, "relu", 150, "relu"]
net = LeNet(cfg)
print(net)