# 李理：卷积神经网络之Batch Normalization的原理及实现

## 2. Batch Normalization

### 2.1 简介

Batch Normalization是Google的Sergey Ioffe 和 Christian Szegedy提出的，相同的网络结构，使用这种方法比原始的网络训练速度要提高14倍。作者通过训练多个模型的ensemble，在ImageNet上的top-5分类错误率降到了4.8%。

### 2.3 解决方法——Batch Normalization

Batch Normalization的算法很简单，如下图所示：

### 2.4 Batch Normalization的预测

running_mean = momentum * running_mean + (1 - momentum) * sample_mean

running_var = momentum * running_var + (1 - momentum) * sample_var

100, 100, 100, …, 1, 1, 1

## 3. Batch Normalization的实现

### 3.2 cell1和cell2

# NOTE(review): excerpt only — the article truncates this loader after the
# first statement; the remaining load/split/preprocess/return code is not
# shown here, so only this visible part is documented.
def get_CIFAR10_data(num_training=49000, num_validation=1000, num_test=1000):
"""
Load the CIFAR-10 dataset from disk and perform preprocessing to prepare
it for classifiers. These are the same steps as we used for the SVM, but
condensed to a single function.
"""
# Load the raw CIFAR-10 data
cifar10_dir = '/home/mc/cs231n/assignment2/cs231n/datasets/cifar-10-batches-py'

### 3.3 cell3

def batchnorm_forward(x, gamma, beta, bn_param):
    """
    Forward pass for batch normalization.

    Inputs:
    - x: input data of shape (N, D)
    - gamma: scale parameter of shape (D,)
    - beta: shift parameter of shape (D,)
    - bn_param: dict with the following keys:
      - mode: 'train' or 'test'; selects the forward behavior
      - eps: small constant added to the variance for numeric stability
      - momentum: decay factor for the running averages
      - running_mean: array of shape (D,) holding the running mean
      - running_var: array of shape (D,) holding the running variance

    Returns a tuple of:
    - out: output of shape (N, D)
    - cache: intermediates needed by the backward pass (None in test mode)
    """
    mode = bn_param['mode']
    eps = bn_param.get('eps', 1e-5)
    momentum = bn_param.get('momentum', 0.9)

    N, D = x.shape
    running_mean = bn_param.get('running_mean', np.zeros(D, dtype=x.dtype))
    running_var = bn_param.get('running_var', np.zeros(D, dtype=x.dtype))

    out, cache = None, None
    if mode == 'train':
        # Normalize each feature with the statistics of the current
        # minibatch, then apply the learnable scale (gamma) and shift (beta).
        mu = np.mean(x, axis=0)
        var = np.var(x, axis=0)
        x_hat = (x - mu) / np.sqrt(var + eps)
        out = gamma * x_hat + beta

        # Exponential moving averages of the batch statistics; these are
        # what test mode uses in place of per-batch statistics.
        running_mean = momentum * running_mean + (1 - momentum) * mu
        running_var = momentum * running_var + (1 - momentum) * var
        cache = (x, mu, var, x_hat, beta, gamma, eps)
    elif mode == 'test':
        # At test time, normalize with the accumulated running statistics
        # so the output does not depend on the composition of the batch.
        x_hat = (x - running_mean) / np.sqrt(running_var + eps)
        out = gamma * x_hat + beta
    else:
        raise ValueError('Invalid forward batchnorm mode "%s"' % mode)

    # Store the updated running means back into bn_param
    bn_param['running_mean'] = running_mean
    bn_param['running_var'] = running_var

    return out, cache

### 3.5 cell5

# Excerpt from batchnorm_backward: gradients for out = gamma * x_hat + beta
# with x_hat = (x - x_mean) / sqrt(x_var + eps). `dout` (upstream gradient)
# and `cache` come from the enclosing function, which is not shown here.
  (x, x_mean, x_var, x_normalized, beta, gamma, eps) = cache
N = x.shape[0]
# beta enters additively, so its gradient is dout summed over the batch axis.
dbeta = np.sum(dout, axis=0)
# gamma multiplies x_normalized elementwise; sum the product over the batch.
dgamma = np.sum(x_normalized*dout, axis = 0)
dx_normalized = gamma* dout
# d(x_hat)/d(var) = -(x - mean) / (2 * (var + eps)^(3/2)), summed over the batch.
dx_var = np.sum(-1.0/2*dx_normalized*(x-x_mean)/(x_var+eps)**(3.0/2), axis =0)
# Mean gradient: direct path through x_hat plus the path through the variance.
dx_mean = np.sum(-1/np.sqrt(x_var+eps)* dx_normalized, axis = 0) + 1.0/N*dx_var *np.sum(-2*(x-x_mean), axis = 0)
# Combine the three paths by which x influences the output:
# directly through x_hat, through the variance, and through the mean.
dx = 1/np.sqrt(x_var+eps)*dx_normalized + dx_var*2.0/N*(x-x_mean) + 1.0/N*dx_mean

#### 3.5.3. 第3-5行

out = gamma * x_normalized + beta

  dbeta = np.sum(dout, axis=0)
dgamma = np.sum(x_normalized*dout, axis = 0)
dx_normalized = gamma* dout

#### 3.5.4. 第6行

x_normalized=(x-x_mean)/np.sqrt(x_var+eps)

dx_var = np.sum(-1.0/2*dx_normalized*(x-x_mean)/(x_var+eps)**(3.0/2), axis =0)

np.sum的原因和上面是一样的。

#### 3.5.5. 第7行

dx_mean = np.sum(-1/np.sqrt(x_var+eps)* dx_normalized, axis = 0)

x_normalized=(x-x_mean)/np.sqrt(x_var+eps)

x_var=x.var(axis=0)

1.0/N*dx_var *np.sum(-2*(x-x_mean), axis = 0)

#### 3.5.6 第8行

x影响的变量是：

    x_mean=x.mean(axis=0)
x_var=x.var(axis=0)
x_normalized=(x-x_mean)/np.sqrt(x_var+eps)

x_normalized=(x-x_mean)/np.sqrt(x_var+eps)

### 3.6 cell6

x=[1,2,3]，x_mean=(1+2+3)/3=2

### 3.7 cell7

#### 3.7.1增加affine_bn_relu层

def affine_bn_relu_forward(x, w, b, gamma, beta, bn_param):
    """Convenience layer: affine transform -> batch norm -> ReLU.

    Returns the ReLU output and a cache tuple (fc, bn, relu) holding the
    intermediates each stage needs for its backward pass.
    """
    a, fc_cache = affine_forward(x, w, b)
    a_bn, bn_cache = batchnorm_forward(a, gamma, beta, bn_param)
    out, relu_cache = relu_forward(a_bn)
    return out, (fc_cache, bn_cache, relu_cache)

def affine_bn_relu_backward(dout, cache):
    """Backward pass for the affine -> batch norm -> ReLU convenience layer.

    Unpacks the cache produced by affine_bn_relu_forward and backpropagates
    through the three stages in reverse order.
    """
    fc_cache, bn_cache, relu_cache = cache
    da_bn = relu_backward(dout, relu_cache)
    da, dgamma, dbeta = batchnorm_backward(da_bn, bn_cache)
    dx, dw, db = affine_backward(da, fc_cache)
    return dx, dw, db, dgamma, dbeta

#### 3.7.2 修改 `__init__`

# Excerpt from the network's __init__: when batch norm is enabled, every
# layer except the last gets per-feature scale/shift parameters initialized
# to the identity transform (gamma = 1, beta = 0). `use_batchnorm` and
# `layer_output_dim` are defined in the enclosing (unshown) code.
 for i in range(1, self.num_layers + 1):

if use_batchnorm and i != self.num_layers:
self.params['beta' + str(i)] = np.zeros(layer_output_dim)
self.params['gamma' + str(i)] = np.ones(layer_output_dim)

#### 3.7.3 修改loss函数

# Excerpt from the loss() forward pass: each hidden layer runs either a plain
# affine-ReLU or, when batch norm is enabled, affine-bn-ReLU. Note layers are
# 1-indexed here while bn_params is 0-indexed, hence bn_params[i - 1].
    for i in range(1, self.num_layers):
keyW = 'W' + str(i)
keyb = 'b' + str(i)

if not self.use_batchnorm:
current_input, affine_relu_cache[i] = affine_relu_forward(current_input, self.params[keyW], self.params[keyb])

else:
key_gamma = 'gamma' + str(i)
key_beta = 'beta' + str(i)
current_input, affine_bn_relu_cache[i] = affine_bn_relu_forward(current_input, self.params[keyW],
self.params[keyb],
self.params[key_gamma], self.params[key_beta],
self.bn_params[i - 1])

# Excerpt from the loss() backward pass: walk the hidden layers in reverse;
# with batch norm enabled, also collect dgamma/dbeta into grads per layer.
# `affine_dx` carries the upstream gradient between iterations.
   for i in range(self.num_layers - 1, 0, -1):
if not self.use_batchnorm:
affine_dx, affine_dw, affine_db = affine_relu_backward(affine_dx, affine_relu_cache[i])

else:
affine_dx, affine_dw, affine_db, dgamma, dbeta = affine_bn_relu_backward(affine_dx, affine_bn_relu_cache[i])
grads['beta' + str(i)] = dbeta
grads['gamma' + str(i)] = dgamma