# tensorflow实践-batch normalization（一）

tensorflow 中关于batch normalization的函数主要有三个：
- tf.nn.batch_normalization
- tf.layers.batch_normalization
- tf.contrib.layers.batch_norm

## 一、tf.nn.batch_normalization

### 1、tf.nn.moments函数

def moments(x, axes, name=None, keep_dims=False) 

x：输入数据。形如[batchsize，height，width，kernels]
axes：表示在哪个维度上求解。是一个list。
keep_dims：是否保持维度。

#### 例1：计算一个2×3矩阵的mean和variance

import tensorflow as tf
# Build a 2x3 variable initialised with random-normal samples.
img = tf.Variable(tf.random_normal([2, 3]))
# Reduce over every axis except the last one; for this rank-2 tensor that is [0].
axis = list(range(len(img.get_shape()) - 1))  # axis=[0]
# Mean and variance computed along the axes listed above.
mean, variance = tf.nn.moments(img, axis)

img = array([[ 0.7691303 , -0.35025588, -0.09380586],
[-1.4653573 ,  0.29895827,  0.41032326]], dtype=float32),
mean = array([[-0.3481135, -0.0256488,  0.1582587]], dtype=float32),
variance = array([[1.2482337 , 0.10536975, 0.06353654]], dtype=float32)

moments函数就是在第0维上求了均值和方差。

axis=0，那么输出矩阵是1行，求每一列的平均；axis=1，输出矩阵是1列，求每一行的平均。

#### 例2：计算卷积神经网络某层的mean和variance

    import tensorflow as tf
# Build a variable of shape [128, 32, 32, 64] initialised with random-normal samples.
img = tf.Variable(tf.random_normal([128,32,32,64]))
# Reduce over every axis except the last one; for this rank-4 tensor that is [0, 1, 2].
axis = list(range(len(img.get_shape()) - 1))
# Mean and variance computed along the axes listed above, leaving the last axis.
mean, variance = tf.nn.moments(img, axis)

输出结果如下：
mean = array([[[[ 4.0830136e-04, -4.2963726e-03, -5.0003931e-04,
-4.4543482e-04,  6.0103042e-04, -4.0140026e-04,
2.5256963e-03, -1.0819699e-03, -1.4404759e-03,
-1.4326994e-03, -8.2220486e-04, -1.4163775e-03,
9.7719464e-04,  1.0412441e-03, -1.3563948e-03,
-2.5035394e-03,  7.8339566e-05, -1.4030328e-03,
-7.6795375e-04,  7.8183822e-03, -2.0574513e-03,
6.8343728e-04,  4.1835662e-04, -6.1633415e-03,
-1.1482568e-03,  6.4310152e-03,  1.9873765e-03,
-9.3293225e-04,  3.0720932e-03, -1.4636834e-03,
4.3379748e-04,  2.7630590e-03, -2.0989170e-03,
-7.2691259e-03, -6.3739987e-03, -1.8997930e-03,
-4.7454494e-04,  2.1465109e-03, -3.9908607e-03,
1.4424872e-03, -2.4142924e-03, -6.0538348e-04,
2.5435248e-03, -4.0083809e-04, -3.1555234e-03,
2.5182338e-03, -2.6306501e-03,  8.3392172e-04,
2.5035494e-03,  4.7882642e-03, -2.0719678e-03,
-2.3144923e-04, -2.8795146e-03,  3.9198864e-03,
-5.2687079e-03, -1.1409470e-04,  1.3856608e-03,
3.4842882e-03, -1.0945165e-03,  6.8958546e-03,
2.9155985e-05,  1.3611093e-03,  2.2281366e-03,
2.7138158e-03]]]], dtype=float32),
variance = array([[[[1.0005758 , 1.0044069 , 0.9994525 , 0.9997337 , 1.0071163 ,
1.0019692 , 0.9954032 , 1.0064473 , 1.001137  , 0.9969884 ,
0.9947835 , 0.9986869 , 0.99899316, 1.0043697 , 1.0033542 ,
1.0046039 , 0.99368966, 0.9923917 , 0.99647164, 1.0045955 ,
1.0020585 , 0.9950892 , 1.0052316 , 1.002666  , 1.0090908 ,
1.008016  , 0.9980576 , 0.9993979 , 0.99848366, 0.99986047,
0.99899065, 1.006967  , 1.003453  , 1.0008634 , 1.0005352 ,
0.996747  , 0.99737716, 0.9945858 , 1.0005856 , 0.99736226,
1.0054593 , 1.0028933 , 0.9965185 , 1.000582  , 1.0035369 ,
0.99765056, 0.9987483 , 0.99487376, 0.99546564, 0.99410796,
1.0013032 , 0.99769133, 0.99917245, 1.0011214 , 1.0029986 ,
0.99958813, 0.9991367 , 0.9963123 , 0.9997672 , 1.0027118 ,
0.99747765, 1.0014238 , 1.0001569 , 0.99354565]]]],
dtype=float32)

### 2、tf.nn.batch_normalization函数

    def batch_normalization(x,mean,variance,
offset,scale,variance_epsilon,
name=None)

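x：输入任意维度的tensor
mean：tensor均值
variance：tensor方差
offset：平移量，公式中的β。需要训练的参数，一般初始化为0。
scale：放缩量，公式中的γ。需要训练的参数，一般初始化为1。
variance_epsilon：一个很小的浮点数，加到方差上以避免除以0。

对应的计算公式为：$y = \gamma \cdot \dfrac{x - \mu}{\sqrt{\sigma^2 + \epsilon}} + \beta$，其中 $\mu$ 为 mean，$\sigma^2$ 为 variance，$\epsilon$ 为 variance_epsilon。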

#### 例3：一个BN的完整例子

reference：莫烦tensorflow教程之BN

import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

ACTIVATION = tf.nn.tanh  # activation function used by every hidden layer
N_LAYERS = 7             # number of hidden layers
N_HIDDEN_UNITS = 30      # number of units in each hidden layer

def fix_seed(seed=1):
    """Seed NumPy and TensorFlow so that repeated runs are reproducible.

    Args:
        seed: Value passed to both ``np.random.seed`` and
            ``tf.set_random_seed``.
    """
    np.random.seed(seed)
    tf.set_random_seed(seed)

def plot_his(inputs, inputs_norm):
    """Plot a histogram of every layer's input, without and with normalization.

    The figure has two rows: row 0 shows ``inputs`` (network without batch
    normalization) and row 1 shows ``inputs_norm`` (network with it). Each
    column corresponds to one entry of the given sequence.

    Args:
        inputs: Sequence of arrays, one per layer, from the plain network.
        inputs_norm: Sequence of arrays, one per layer, from the normalized
            network.
    """
    for j, all_inputs in enumerate([inputs, inputs_norm]):
        for i, layer_input in enumerate(all_inputs):
            plt.subplot(2, len(all_inputs), j*len(all_inputs)+(i+1))
            plt.cla()
            # The first column is the raw data and uses a wider range than
            # the hidden layers.
            if i == 0:
                the_range = (-7, 10)
            else:
                the_range = (-1, 1)
            plt.hist(layer_input.ravel(), bins=15, range=the_range, color='#FF5733')
            plt.yticks(())
            # Only the bottom row shows x tick labels.
            if j == 1:
                plt.xticks(the_range)
            else:
                plt.xticks(())
            ax = plt.gca()
            ax.spines['right'].set_color('none')
            ax.spines['top'].set_color('none')
        # Title is set once per row, on the last subplot drawn in that row.
        plt.title("%s normalizing" % ("Without" if j == 0 else "With"))
    plt.draw()
    plt.pause(0.01)

# 搭建神经网络
def built_net(xs, ys, norm):
    """Build a fully connected regression network, optionally with batch norm.

    Args:
        xs: Input tensor; its second dimension is read as the feature count
            of the first layer.
        ys: Target tensor compared against the network prediction.
        norm: If true, batch-normalize the network input and the
            pre-activation of every hidden layer.

    Returns:
        A list ``[train_op, cost, layers_inputs]``. ``layers_inputs`` holds
        the (possibly normalized) network input followed by the output of
        each hidden layer.
    """
    def add_layer(inputs, in_size, out_size, activation_function=None, norm=False):
        """Add one dense layer, optionally batch-normalized before activation."""
        Weights = tf.Variable(tf.random_normal([in_size, out_size], mean=0., stddev=1.))
        biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)

        # fully connected product
        Wx_plus_b = tf.matmul(inputs, Weights) + biases

        # Batch normalization is applied to Wx_plus_b, i.e. after the affine
        # transform and before the activation function.
        if norm:
            fc_mean, fc_var = tf.nn.moments(
                Wx_plus_b,
                axes=[0],   # axes to normalize over; [0] is the batch axis
                            # for image data [0, 1, 2] (batch, height, width)
                            # can be used; do not include the channel axis
            )
            scale = tf.Variable(tf.ones([out_size]))
            shift = tf.Variable(tf.zeros([out_size]))
            epsilon = 0.001

            # Every batch has a different mean/var, so an exponential moving
            # average of them is maintained alongside training.
            # NOTE: the values returned below are the current batch
            # statistics; the averaged values are updated but not read here.
            ema = tf.train.ExponentialMovingAverage(decay=0.5)

            def mean_var_with_update():
                ema_apply_op = ema.apply([fc_mean, fc_var])
                # Force the moving-average update to run whenever the
                # returned tensors are evaluated.
                with tf.control_dependencies([ema_apply_op]):
                    return tf.identity(fc_mean), tf.identity(fc_var)

            mean, var = mean_var_with_update()

            Wx_plus_b = tf.nn.batch_normalization(Wx_plus_b, mean, var, shift, scale, epsilon)
            # The call above is equivalent to:
            # Wx_plus_b = (Wx_plus_b - fc_mean) / tf.sqrt(fc_var + 0.001)
            # Wx_plus_b = Wx_plus_b * scale + shift

        # activation
        if activation_function is None:
            outputs = Wx_plus_b
        else:
            outputs = activation_function(Wx_plus_b)

        return outputs

    fix_seed(1)

    # Batch-normalize the raw input data as well.
    if norm:
        fc_mean, fc_var = tf.nn.moments(
            xs,
            axes=[0],
        )
        scale = tf.Variable(tf.ones([1]))
        shift = tf.Variable(tf.zeros([1]))
        epsilon = 0.001
        # apply moving average for mean and var when train on batch
        ema = tf.train.ExponentialMovingAverage(decay=0.5)

        def mean_var_with_update():
            ema_apply_op = ema.apply([fc_mean, fc_var])
            with tf.control_dependencies([ema_apply_op]):
                return tf.identity(fc_mean), tf.identity(fc_var)

        mean, var = mean_var_with_update()
        xs = tf.nn.batch_normalization(xs, mean, var, shift, scale, epsilon)

    # Record the input of every layer, starting with the network input.
    layers_inputs = [xs]

    # Build all hidden layers.
    for l_n in range(N_LAYERS):
        layer_input = layers_inputs[l_n]
        in_size = layers_inputs[l_n].get_shape()[1].value

        output = add_layer(
            layer_input,    # input
            in_size,        # input size
            N_HIDDEN_UNITS, # output size
            ACTIVATION,     # activation function
            norm,           # normalize before activation
        )
        layers_inputs.append(output)    # record this layer's output

    # Build the output layer (no activation, no batch normalization).
    prediction = add_layer(layers_inputs[-1], N_HIDDEN_UNITS, 1, activation_function=None)

    cost = tf.reduce_mean(tf.reduce_sum(tf.square(ys - prediction), reduction_indices=[1]))
    # Plain gradient descent on the squared-error cost.
    train_op = tf.train.GradientDescentOptimizer(0.001).minimize(cost)
    return [train_op, cost, layers_inputs]

# 创造数据并可视化
fix_seed(1)
x_data = np.linspace(-7, 10, 2500)[:, np.newaxis]
np.random.shuffle(x_data)
noise = np.random.normal(0, 8, x_data.shape)
y_data = np.square(x_data) - 5 + noise

# Visualize the input data.
plt.scatter(x_data, y_data)
plt.show()

xs = tf.placeholder(tf.float32, [None, 1])  # [num_samples, num_features]
ys = tf.placeholder(tf.float32, [None, 1])

train_op, cost, layers_inputs = built_net(xs, ys, norm=False)   # without BN
train_op_norm, cost_norm, layers_inputs_norm = built_net(xs, ys, norm=True) # with BN

sess = tf.Session()
# Versions 0.x with x < 12 use the older initializer name.
tf_version_parts = tf.__version__.split('.')
if int(tf_version_parts[1]) < 12 and int(tf_version_parts[0]) < 1:
    init = tf.initialize_all_variables()
else:
    init = tf.global_variables_initializer()
sess.run(init)

# Record how the cost of both networks evolves.
cost_his = []
cost_his_norm = []
record_step = 5
batch_size = 10

plt.ion()
plt.figure(figsize=(7, 3))
for i in range(250):
    if i % 50 == 0:
        # Distribution of the values recorded in layers_inputs for both networks.
        all_inputs, all_inputs_norm = sess.run([layers_inputs, layers_inputs_norm], feed_dict={xs: x_data, ys: y_data})
        plot_his(all_inputs, all_inputs_norm)

    # train on batch
    batch_start = i * batch_size
    batch_end = batch_start + batch_size
    sess.run([train_op, train_op_norm], feed_dict={xs: x_data[batch_start:batch_end], ys: y_data[batch_start:batch_end]})

    if i % record_step == 0:
        # Record the cost on the full data set.
        cost_his.append(sess.run(cost, feed_dict={xs: x_data, ys: y_data}))
        cost_his_norm.append(sess.run(cost_norm, feed_dict={xs: x_data, ys: y_data}))

plt.ioff()
plt.figure()
plt.plot(np.arange(len(cost_his))*record_step, np.array(cost_his), label='no BN')     # no norm
plt.plot(np.arange(len(cost_his))*record_step, np.array(cost_his_norm), label='BN')   # norm
plt.legend()
plt.show()

tanh误差对比：

relu误差对比：