2015年提出的batchnorm, 也许是网络训练中仅次于relu中的技术点。有了batchnorm+resnet,让训练超过100层的网络成为可能。
上图中的μ是这个批次的均值,σ是这个批次的方差,首先把数据变成0均值,和1方差, 然后增加两个可以学习的参数γ和β。
增加batchnorm后,网络的收敛速度会加快。
batchnorm针对fc的情况
batchnorm针对conv的情况
需要对每个通道单独计算
推理
推理时不需要在进行计算批次的均值和方差,而是采用一个固定的均值和方差.
batch_norm函数
from d2l import mxnet as d2l
from mxnet import autograd, np, npx, init
from mxnet.gluon import nn
npx.set_np()
def batch_norm(X, gamma, beta, moving_mean, moving_var, eps, momentum):
# Use `autograd` to determine whether the current mode is training mode or
# prediction mode
if not autograd.is_training():
# If it is prediction mode, directly use the mean and variance
# obtained by moving average
X_hat = (X - moving_mean) / np.sqrt(moving_var + eps)
else:
assert len(X.shape) in (2, 4)
if len(X.shape) == 2:
# When using a fully-connected layer, calculate the mean and
# variance on the feature dimension
mean = X.mean(axis=0)
var = ((X - mean) ** 2).mean(axis=0)
else:
# When using a two-dimensional convolutional layer, calculate the
# mean and variance on the channel dimension (axis=1). Here we
# need to maintain the shape of `X`, so that the broadcasting
# operation can be carried out later
mean = X.mean(axis=(0, 2, 3), keepdims=True)
var = ((X - mean) ** 2).mean(axis=(0, 2, 3), keepdims=True)
# In training mode, the current mean and variance are used for the
# standardization
X_hat = (X - mean) / np.sqrt(var + eps)
# Update the mean and variance using moving average
moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
moving_var = momentum * moving_var + (1.0 - momentum) * var
Y = gamma * X_hat + beta # Scale and shift
return Y, moving_mean, moving_var
batch_norm 类
class BatchNorm(nn.Block):
# `num_features`: the number of outputs for a fully-connected layer
# or the number of output channels for a convolutional layer. `num_dims`:
# 2 for a fully-connected layer and 4 for a convolutional layer
def __init__(self, num_features, num_dims, **kwargs):
super().__init__(**kwargs)
if num_dims == 2:
shape = (1, num_features)
else:
shape = (1, num_features, 1, 1)
# The scale parameter and the shift parameter (model parameters) are
# initialized to 1 and 0, respectively
self.gamma = self.params.get('gamma', shape=shape, init=init.One())
self.beta = self.params.get('beta', shape=shape, init=init.Zero())
# The variables that are not model parameters are initialized to 0
self.moving_mean = np.zeros(shape)
self.moving_var = np.zeros(shape)
def forward(self, X):
# If `X` is not on the main memory, copy `moving_mean` and
# `moving_var` to the device where `X` is located
if self.moving_mean.ctx != X.ctx:
self.moving_mean = self.moving_mean.copyto(X.ctx)
self.moving_var = self.moving_var.copyto(X.ctx)
# Save the updated `moving_mean` and `moving_var`
Y, self.moving_mean, self.moving_var = batch_norm(
X, self.gamma.data(), self.beta.data(), self.moving_mean,
self.moving_var, eps=1e-12, momentum=0.9)
return Y
把batch_norm增加到lenet中
net = nn.Sequential()
net.add(nn.Conv2D(6, kernel_size=5),
BatchNorm(6, num_dims=4),
nn.Activation('sigmoid'),
nn.MaxPool2D(pool_size=2, strides=2),
nn.Conv2D(16, kernel_size=5),
BatchNorm(16, num_dims=4),
nn.Activation('sigmoid'),
nn.MaxPool2D(pool_size=2, strides=2),
nn.Dense(120),
BatchNorm(120, num_dims=2),
nn.Activation('sigmoid'),
nn.Dense(84),
BatchNorm(84, num_dims=2),
nn.Activation('sigmoid'),
nn.Dense(10))
训练一下
lr, num_epochs, batch_size = 1.0, 10, 256
train_iter, test_iter = d2l.load_data_fashion_mnist(batch_size)
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr)
loss 0.241, train acc 0.912, test acc 0.821
21051.7 examples/sec on gpu(0)
显示一下学习的均值和方差
net[1].gamma.data().reshape(-1,), net[1].beta.data().reshape(-1,)
(array([1.956041 , 0.8407571, 3.1203268, 1.492053 , 1.2769177, 2.3011174], ctx=gpu(0)),
array([ 1.1319363 , 0.36316666, -3.2172477 , -0.7223122 , -0.57278776,
-0.21445163], ctx=gpu(0)))
使用框架的接口训练一下
net = nn.Sequential()
net.add(nn.Conv2D(6, kernel_size=5),
nn.BatchNorm(),
nn.Activation('sigmoid'),
nn.MaxPool2D(pool_size=2, strides=2),
nn.Conv2D(16, kernel_size=5),
nn.BatchNorm(),
nn.Activation('sigmoid'),
nn.MaxPool2D(pool_size=2, strides=2),
nn.Dense(120),
nn.BatchNorm(),
nn.Activation('sigmoid'),
nn.Dense(84),
nn.BatchNorm(),
nn.Activation('sigmoid'),
nn.Dense(10))
d2l.train_ch6(net, train_iter, test_iter, num_epochs, lr)
loss 0.252, train acc 0.907, test acc 0.825
40203.7 examples/sec on gpu(0)
最后的话:
这篇文章发布在CSDN/蓝色的杯子, 没事多留言,让我们一起爱智求真吧.我的邮箱wisdomfriend@126.com.