BatchNormalization.ipynb
准备工作
from __future__ import print_function
import time
import numpy as np
import matplotlib.pyplot as plt
from cs231n.classifiers.fc_net import *
from cs231n.data_utils import get_CIFAR10_data
from cs231n.gradient_check import eval_numerical_gradient, eval_numerical_gradient_array
from cs231n.solver import Solver
%matplotlib inline
plt.rcParams['figure.figsize'] = (10.0, 8.0) # default figure size
plt.rcParams['image.interpolation'] = 'nearest'
plt.rcParams['image.cmap'] = 'gray'
# automatically reload imported modules when their source changes
%load_ext autoreload
%autoreload 2
def rel_error(x, y):
    """Maximum elementwise relative error between arrays x and y.

    The denominator is clamped at 1e-8 so the ratio stays finite when
    both entries are (near) zero.
    """
    denom = np.maximum(1e-8, np.abs(x) + np.abs(y))
    return np.max(np.abs(x - y) / denom)
# Load the preprocessed CIFAR-10 data and show the shape of each split.
data = get_CIFAR10_data()
for k, v in data.items():
    print('%s: ' % k, v.shape)
# Expected output:
# X_train: (49000, 3, 32, 32)
# y_train: (49000,)
# X_val: (1000, 3, 32, 32)
# y_val: (1000,)
# X_test: (1000, 3, 32, 32)
# y_test: (1000,)
实现cs231n/layers.py里的batchnorm_forward再运行以下代码
检查训练时的批量归一化(BN)
# Check the train-time behavior of batch normalization.
np.random.seed(231)
N, D1, D2, D3 = 200, 50, 60, 3
X = np.random.randn(N, D1)
W1 = np.random.randn(D1, D2)
W2 = np.random.randn(D2, D3)
# Forward a tiny 2-layer net to obtain pre-BN activations.
a = np.maximum(0, X.dot(W1)).dot(W2)

print('Before batch normalization:')
print(' means: ', a.mean(axis=0))
print(' stds: ', a.std(axis=0))
# Before batch normalization:
# means: [ -2.3814598 -13.18038246 1.91780462]
# stds: [27.18502186 34.21455511 37.68611762]

# After normalization each feature's mean should be near 0 and its std near 1.
print('After batch normalization (gamma=1, beta=0)')
a_norm, _ = batchnorm_forward(a, np.ones(D3), np.zeros(D3), dict(mode='train'))
print(' mean: ', a_norm.mean(axis=0))
print(' std: ', a_norm.std(axis=0))
# After batch normalization (gamma=1, beta=0)
# mean: [5.32907052e-17 7.04991621e-17 1.85962357e-17]
# std: [0.99999999 1. 1. ]

# With nontrivial scale/shift, means should match beta and stds match gamma.
gamma = np.asarray([1.0, 2.0, 3.0])
beta = np.asarray([11.0, 12.0, 13.0])
a_norm, _ = batchnorm_forward(a, gamma, beta, dict(mode='train'))
print('After batch normalization (nontrivial gamma, beta)')
print(' means: ', a_norm.mean(axis=0))
print(' stds: ', a_norm.std(axis=0))
# After batch normalization (nontrivial gamma, beta)
# means: [11. 12. 13.]
# stds: [0.99999999 1.99999999 2.99999999]
检查测试时的BN,测试时不会计算输入数据的均值和方差,而是使用训练时保存的均值和方差
# Check the test-time behavior of batch normalization: at test time the
# running statistics accumulated during training are used instead of the
# statistics of the current batch.
np.random.seed(231)
N, D1, D2, D3 = 200, 50, 60, 3
W1 = np.random.randn(D1, D2)
W2 = np.random.randn(D2, D3)

# Run 50 training batches so bn_param accumulates running mean/variance.
bn_param = dict(mode='train')
gamma = np.ones(D3)
beta = np.zeros(D3)
for _ in range(50):
    X = np.random.randn(N, D1)
    a = np.maximum(0, X.dot(W1)).dot(W2)
    batchnorm_forward(a, gamma, beta, bn_param)

# Now switch to test mode and normalize a fresh batch.
bn_param['mode'] = 'test'
X = np.random.randn(N, D1)
a = np.maximum(0, X.dot(W1)).dot(W2)
a_norm, _ = batchnorm_forward(a, gamma, beta, bn_param)

# With gamma=1 and beta=0 the output should still have mean ~0 and std ~1,
# but noisier than at train time since running stats don't match this batch.
print('After batch normalization (test-time):')
print(' means: ', a_norm.mean(axis=0))
print(' stds: ', a_norm.std(axis=0))
# After batch normalization (test-time):
# means: [-0.03927354 -0.04349152 -0.10452688]
# stds: [1.01531428 1.01238373 0.97819988]
实现cs231n/layers.py里的batchnorm_backward再运行以下代码
# Gradient-check batchnorm_backward against numerical gradients.
np.random.seed(231)
N, D = 4, 5
x = 5 * np.random.randn(N, D) + 12
gamma = np.random.randn(D)
beta = np.random.randn(D)
dout = np.random.randn(N, D)
bn_param = dict(mode='train')

# Numerical gradients w.r.t. x, gamma, and beta via finite differences.
def fx(v): return batchnorm_forward(v, gamma, beta, bn_param)[0]
def fg(g): return batchnorm_forward(x, g, beta, bn_param)[0]
def fb(b): return batchnorm_forward(x, gamma, b, bn_param)[0]

dx_num = eval_numerical_gradient_array(fx, x, dout)
da_num = eval_numerical_gradient_array(fg, gamma.copy(), dout)
db_num = eval_numerical_gradient_array(fb, beta.copy(), dout)

# Analytic gradients from the backward pass.
_, cache = batchnorm_forward(x, gamma, beta, bn_param)
dx, dgamma, dbeta = batchnorm_backward(dout, cache)
print('dx error: ', rel_error(dx_num, dx))
print('dgamma error: ', rel_error(da_num, dgamma))
print('dbeta error: ', rel_error(db_num, dbeta))
# dx error: 1.7029258328157158e-09
# dgamma error: 7.420414216247087e-13
# dbeta error: 2.8795057655839487e-12
实现cs231n/layers.py里的batchnorm_backward_alt再运行以下代码
batchnorm_backward_alt()简化了batchnorm_backward()的步骤,所以有一定的速度提升,但是提升的并不多,原来的代码只计算一次,比较不稳定。我加了个100次的循环进去,得到的提升速度在1.2倍到1.7倍之间。
# Compare batchnorm_backward with the simplified batchnorm_backward_alt:
# both must agree numerically; the alternative should be somewhat faster.
# Timing is averaged over 100 iterations since a single run is too noisy
# (observed speedup is roughly 1.2x-1.7x).
np.random.seed(231)
N, D = 100, 500
time_naive = 0.0
time_alt = 0.0
for _ in range(100):
    x = 5 * np.random.randn(N, D) + 12
    gamma = np.random.randn(D)
    beta = np.random.randn(D)
    dout = np.random.randn(N, D)
    bn_param = dict(mode='train')
    out, cache = batchnorm_forward(x, gamma, beta, bn_param)

    t0 = time.time()
    dx1, dgamma1, dbeta1 = batchnorm_backward(dout, cache)
    t1 = time.time()
    dx2, dgamma2, dbeta2 = batchnorm_backward_alt(dout, cache)
    t2 = time.time()
    time_naive += t1 - t0
    time_alt += t2 - t1

print('dx difference: ', rel_error(dx1, dx2))
print('dgamma difference: ', rel_error(dgamma1, dgamma2))
print('dbeta difference: ', rel_error(dbeta1, dbeta2))
print('speedup: %.2fx' % (time_naive / time_alt))
# dx difference: 8.165524334891262e-12
# dgamma difference: 0.0
# dbeta difference: 0.0
# speedup: 1.49x