Network parameter initialization has a major impact on training convergence. For convolutional neural networks I usually used Xavier initialization in TensorFlow, but with PyTorch I found that kaiming_normal (He initialization) works better.
If the initialization method is not chosen carefully, the training loss may stay stuck at a fixed value. For example, in a classification task with $C$ classes, the loss may hover around $-\log(\frac{1}{C}) = \log C$, which is exactly the cross-entropy of a network that predicts a uniform distribution over the classes.
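As a quick sanity check, that plateau value is easy to reproduce: a network whose logits are all equal produces a uniform softmax, so its cross-entropy is exactly $\log C$. A minimal sketch (the class count C = 102 simply mirrors the example below):

import math
import torch
import torch.nn as nn

C = 102  # number of classes; matches the VGG16 example below
logits = torch.zeros(8, C)           # identical logits -> softmax assigns 1/C to every class
targets = torch.randint(0, C, (8,))  # arbitrary labels
loss = nn.CrossEntropyLoss()(logits, targets)
print(loss.item(), math.log(C))      # both are about 4.625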
Code:
import torch
import torch.nn as nn
class VGG16(nn.Module):
    def __init__(self, categories_size):
        super(VGG16, self).__init__()  # 3 * 224 * 224
        self.conv1_1 = nn.Conv2d(3, 64, (3, 3))  # 64 * 222 * 222
        self.conv1_2 = nn.Conv2d(64, 64, (3, 3), padding=(1, 1))  # 64 * 222 * 222
        self.maxpool1 = nn.MaxPool2d((2, 2), padding=(1, 1))  # pooling 64 * 112 * 112
        self.conv2_1 = nn.Conv2d(64, 128, 3)  # 128 * 110 * 110
        self.conv2_2 = nn.Conv2d(128, 128, 3, padding=(1, 1))  # 128 * 110 * 110
        self.maxpool2 = nn.MaxPool2d((2, 2), padding=(1, 1))  # pooling 128 * 56 * 56
        self.conv3_1 = nn.Conv2d(128, 256, 3)  # 256 * 54 * 54
        self.conv3_2 = nn.Conv2d(256, 256, 3, padding=(1, 1))  # 256 * 54 * 54
        self.conv3_3 = nn.Conv2d(256, 256, 3, padding=(1, 1))  # 256 * 54 * 54
        self.maxpool3 = nn.MaxPool2d((2, 2), padding=(1, 1))  # pooling 256 * 28 * 28
        self.conv4_1 = nn.Conv2d(256, 512, 3)  # 512 * 26 * 26
        self.conv4_2 = nn.Conv2d(512, 512, 3, padding=(1, 1))  # 512 * 26 * 26
        self.conv4_3 = nn.Conv2d(512, 512, 3, padding=(1, 1))  # 512 * 26 * 26
        self.maxpool4 = nn.MaxPool2d((2, 2), padding=(1, 1))  # pooling 512 * 14 * 14
        self.conv5_1 = nn.Conv2d(512, 512, 3)  # 512 * 12 * 12
        self.conv5_2 = nn.Conv2d(512, 512, 3, padding=(1, 1))  # 512 * 12 * 12
        self.conv5_3 = nn.Conv2d(512, 512, 3, padding=(1, 1))  # 512 * 12 * 12
        self.maxpool5 = nn.MaxPool2d((2, 2), padding=(1, 1))  # pooling 512 * 7 * 7
        self.fc1 = nn.Linear(7 * 7 * 512, 4096)
        self.fc2 = nn.Linear(4096, 4096)
        self.fc3 = nn.Linear(4096, categories_size)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self._init_parameters()
    def forward(self, x):
        batch_size = x.size(0)  # x.size(0) is batch_size
        x = self.maxpool1(self.relu(self.conv1_2(self.relu(self.conv1_1(x)))))
        x = self.maxpool2(self.relu(self.conv2_2(self.relu(self.conv2_1(x)))))
        x = self.maxpool3(self.relu(self.conv3_3(self.relu(self.conv3_2(self.relu(self.conv3_1(x)))))))
        x = self.maxpool4(self.relu(self.conv4_3(self.relu(self.conv4_2(self.relu(self.conv4_1(x)))))))
        x = self.maxpool5(self.relu(self.conv5_3(self.relu(self.conv5_2(self.relu(self.conv5_1(x)))))))
        # flatten
        x = x.view(batch_size, -1)
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.relu(self.fc2(x))
        x = self.dropout(x)
        x = self.fc3(x)
        return x
    def _init_parameters(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
                nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Linear):
                nn.init.normal_(m.weight, 0, 0.01)
                nn.init.constant_(m.bias, 0)
if __name__ == '__main__':
    vgg16 = VGG16(102)
    x = torch.zeros(1, 3, 224, 224)  # a dummy batch with one 3 * 224 * 224 image
    y = vgg16(x)
    print(y.shape)  # torch.Size([1, 102])
    # During training, enable Dropout, Batch Normalization updates, etc.
    vgg16.train()
    # ...
    # During evaluation, disable Dropout and switch Batch Normalization to its running statistics
    vgg16.eval()
    # ...
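For comparison with the Xavier scheme mentioned at the top, the initialization loop could be swapped out as follows. This is only a sketch of the alternative, not the version I trained with; nn.init.xavier_normal_ is PyTorch's counterpart of the TensorFlow Xavier initializer:

    # Alternative _init_parameters using Xavier (Glorot) initialization.
    # Sketch only: in my experiments kaiming_normal_ converged better.
    def _init_parameters(self):
        for m in self.modules():
            if isinstance(m, (nn.Conv2d, nn.Linear)):
                nn.init.xavier_normal_(m.weight)
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)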
Note: if you want to print the loss value during training, consider computing it inside a with torch.no_grad() block to save memory (see https://blog.csdn.net/weixin_44134757/article/details/105775027). Before I added this statement, I kept getting out-of-memory errors.
def _calc_loss_cls(self, xs, ys, loss_cls_function):
    self._net.eval()
    with torch.no_grad():
        outs = self._net(xs)
        loss_cls = loss_cls_function(outs, ys)
    self._net.train()
    return loss_cls
P.S. I print the loss value to check whether the network is converging during training.
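For context, here is a minimal sketch of how such a helper might be called from a training loop; the trainer attributes (self._net, self._optimizer) and the loader variable are hypothetical names, not part of the code above:

def _train_one_epoch(self, loader, loss_cls_function):
    # Hypothetical training loop around _calc_loss_cls; self._net and
    # self._optimizer are assumed attributes of the same trainer class.
    for step, (xs, ys) in enumerate(loader):
        self._optimizer.zero_grad()
        loss = loss_cls_function(self._net(xs), ys)
        loss.backward()
        self._optimizer.step()
        if step % 100 == 0:
            # computed under torch.no_grad(), so no graph is kept around
            print(self._calc_loss_cls(xs, ys, loss_cls_function).item())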
References
Custom initialization of network parameters in PyTorch
Learning deep learning from the PyTorch docs: nn.init
zhousteven/VGG16-PyTorch/vgg.py
A summary of PyTorch initialization methods
with torch.no_grad() in PyTorch
Implementing VGG16 with PyTorch