TensorFlow BN

Link to Morvan Zhou's tutorial: https://morvanzhou.github.io/tutorials/
I removed the plotting code and only compute the cost; after adding BN the cost drops by a large margin.
I mostly copied the batch normalization code and don't fully understand it yet, e.g. is the EMA of every intermediate layer really stored?
It seems that each layer knows which EMA shadow values it should use for its own update. Probably each EMA shadow variable gets a different name, and the right one is identified by its name.
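To convince myself of this, here is a minimal TF 1.x sketch (the names layer1_mean / layer2_mean are made up for illustration): ema.apply() creates one shadow variable per tracked tensor, and each shadow is keyed by the original tensor's name, so the right moving average can always be looked up.

import tensorflow as tf

# two dummy statistics standing in for different layers' fc_mean
layer1_mean = tf.Variable(0., name='layer1_mean')
layer2_mean = tf.Variable(0., name='layer2_mean')

ema = tf.train.ExponentialMovingAverage(decay=0.5)
update_op = ema.apply([layer1_mean, layer2_mean])   # creates one shadow variable per tracked tensor

# each shadow variable is named after the tensor it tracks
print(ema.average_name(layer1_mean))   # -> layer1_mean/ExponentialMovingAverage
print(ema.average_name(layer2_mean))   # -> layer2_mean/ExponentialMovingAverage

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    sess.run(update_op)                         # updates both shadow variables
    print(sess.run(ema.average(layer1_mean)))   # read layer1's stored moving average

In the full code below, every call to add_layer additionally creates its own ExponentialMovingAverage object, so the layers never share shadow variables in the first place.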

Training and testing also behave a bit differently: during training, the mean/variance computed on the current batch are used to update the stored moving statistics, while at test time only the stored moving statistics are read (a sketch of this switch follows the snippet below).

if norm:
    # Batch Normalize
    fc_mean, fc_var = tf.nn.moments(
        Wx_plus_b,
        axes=[0],   # the dimension you wanna normalize, here [0] for batch
                    # for image, you wanna do [0, 1, 2] for [batch, height, width] but not channel
    )
    scale = tf.Variable(tf.ones([out_size]))
    shift = tf.Variable(tf.zeros([out_size]))
    epsilon = 0.001

    # apply moving average for mean and var when train on batch
    ema = tf.train.ExponentialMovingAverage(decay=0.5)
    def mean_var_with_update():
        ema_apply_op = ema.apply([fc_mean, fc_var])
        with tf.control_dependencies([ema_apply_op]):
            return tf.identity(fc_mean), tf.identity(fc_var)
    mean, var = mean_var_with_update()

    Wx_plus_b = tf.nn.batch_normalization(Wx_plus_b, mean, var, shift, scale, epsilon)
    # equivalent to these two steps:
    # Wx_plus_b = (Wx_plus_b - fc_mean) / tf.sqrt(fc_var + 0.001)
    # Wx_plus_b = Wx_plus_b * scale + shift
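The snippet above always takes the training branch, because mean_var_with_update() is called unconditionally. One way to implement the train/test difference mentioned earlier is to switch on a boolean placeholder with tf.cond; this is only a sketch, and on_train is a hypothetical placeholder that would be fed True during training and False at test time:

# on_train: tf.placeholder(tf.bool), fed True during training and False at test time
on_train = tf.placeholder(tf.bool)

# replaces the line `mean, var = mean_var_with_update()` above
mean, var = tf.cond(
    on_train,
    mean_var_with_update,                                   # train: batch stats + EMA update
    lambda: (ema.average(fc_mean), ema.average(fc_var)),    # test: read the stored moving averages
)

With this, the test-time forward pass never touches the batch statistics and only reads what the EMA accumulated during training.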

For details on how to use it, please refer to the official TensorFlow documentation.
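As a side note, TF 1.x also ships a built-in layer that maintains the moving statistics internally. The following is only a minimal sketch (not the approach used in this post); is_training plays the same role as the hypothetical on_train placeholder above:

import tensorflow as tf

x = tf.placeholder(tf.float32, [None, 1])
y = tf.placeholder(tf.float32, [None, 1])
is_training = tf.placeholder(tf.bool)   # True while training, False at test time

h = tf.layers.dense(x, 30)
h = tf.layers.batch_normalization(h, training=is_training)  # keeps its own moving mean/variance
h = tf.nn.tanh(h)
prediction = tf.layers.dense(h, 1)

loss = tf.reduce_mean(tf.square(y - prediction))

# the layer registers its moving-average updates in UPDATE_OPS;
# they must run together with the training step
update_ops = tf.get_collection(tf.GraphKeys.UPDATE_OPS)
with tf.control_dependencies(update_ops):
    train_op = tf.train.GradientDescentOptimizer(0.001).minimize(loss)

Here the moving-average updates are not wired in by hand; they are collected in tf.GraphKeys.UPDATE_OPS and attached to the train step via a control dependency.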

Full code

"""
visit https://morvanzhou.github.io/tutorials/ for more!
Build two networks.
1. Without batch normalization
2. With batch normalization
Run tests on these two networks.
"""

# 23 Batch Normalization

import numpy as np
import tensorflow as tf


ACTIVATION = tf.nn.tanh
N_LAYERS = 7
N_HIDDEN_UNITS = 30


def fix_seed(seed=1):
    # reproducible
    np.random.seed(seed)
    tf.set_random_seed(seed)


def built_net(xs, ys, norm):
    def add_layer(inputs, in_size, out_size, activation_function=None, norm=False):
        # weights and biases (bad initialization for this case)
        Weights = tf.Variable(tf.random_normal([in_size, out_size], mean=0., stddev=1.))
        biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)

        # fully connected product
        Wx_plus_b = tf.matmul(inputs, Weights) + biases

        # normalize fully connected product
        if norm:
            # Batch Normalize
            fc_mean, fc_var = tf.nn.moments(
                Wx_plus_b,
                axes=[0],   # the dimension you wanna normalize, here [0] for batch
                            # for image, you wanna do [0, 1, 2] for [batch, height, width] but not channel
            )
            scale = tf.Variable(tf.ones([out_size]))
            shift = tf.Variable(tf.zeros([out_size]))
            epsilon = 0.001

            # apply moving average for mean and var when train on batch
            ema = tf.train.ExponentialMovingAverage(decay=0.5)
            def mean_var_with_update():
                ema_apply_op = ema.apply([fc_mean, fc_var])
                with tf.control_dependencies([ema_apply_op]):
                    return tf.identity(fc_mean), tf.identity(fc_var)
            mean, var = mean_var_with_update()

            Wx_plus_b = tf.nn.batch_normalization(Wx_plus_b, mean, var, shift, scale, epsilon)
            # equivalent to these two steps:
            # Wx_plus_b = (Wx_plus_b - fc_mean) / tf.sqrt(fc_var + 0.001)
            # Wx_plus_b = Wx_plus_b * scale + shift

        # activation
        if activation_function is None:
            outputs = Wx_plus_b
        else:
            outputs = activation_function(Wx_plus_b)

        return outputs

    fix_seed(1)

    if norm:
        # BN for the first input
        fc_mean, fc_var = tf.nn.moments(
            xs,
            axes=[0],
        )
        scale = tf.Variable(tf.ones([1]))
        shift = tf.Variable(tf.zeros([1]))
        epsilon = 0.001
        # apply moving average for mean and var when train on batch
        ema = tf.train.ExponentialMovingAverage(decay=0.5)
        def mean_var_with_update():
            ema_apply_op = ema.apply([fc_mean, fc_var])
            with tf.control_dependencies([ema_apply_op]):
                return tf.identity(fc_mean), tf.identity(fc_var)
        mean, var = mean_var_with_update()
        xs = tf.nn.batch_normalization(xs, mean, var, shift, scale, epsilon)

    # record inputs for every layer
    layers_inputs = [xs]

    # build hidden layers
    for l_n in range(N_LAYERS):
        layer_input = layers_inputs[l_n]
        in_size = layers_inputs[l_n].get_shape()[1].value

        output = add_layer(
            layer_input,    # input
            in_size,        # input size
            N_HIDDEN_UNITS, # output size
            ACTIVATION,     # activation function
            norm,           # normalize before activation
        )
        layers_inputs.append(output)    # add output for next run

    # build output layer
    prediction = add_layer(layers_inputs[-1], N_HIDDEN_UNITS, 1, activation_function=None)

    cost = tf.reduce_mean(tf.reduce_sum(tf.square(ys - prediction), reduction_indices=[1]))
    train_op = tf.train.GradientDescentOptimizer(0.001).minimize(cost)
    return [train_op, cost, layers_inputs]



# make up data
fix_seed(1)
x_data = np.linspace(-7, 10, 2500)[:, np.newaxis] #[2500,1]
np.random.shuffle(x_data) # shuffle the sample order
noise = np.random.normal(0, 8, x_data.shape) #add noise to y_data
y_data = np.square(x_data) - 5 + noise

xs = tf.placeholder(tf.float32, [None, 1])  # [num_samples, num_features]
ys = tf.placeholder(tf.float32, [None, 1])

train_op, cost, layers_inputs = built_net(xs, ys, norm=False)   # without BN
train_op_norm, cost_norm, layers_inputs_norm = built_net(xs, ys, norm=True) # with BN

sess = tf.Session()
# TensorFlow versions before 0.12 used a different initializer name
if int((tf.__version__).split('.')[1]) < 12 and int((tf.__version__).split('.')[0]) < 1:
    init = tf.initialize_all_variables()
else:
    init = tf.global_variables_initializer()
sess.run(init)

# record cost
record_step = 30

for i in range(250):
    # train both networks on the same 10-sample mini-batch (250 steps * 10 = 2500 samples)
    sess.run([train_op, train_op_norm], feed_dict={xs: x_data[i*10:i*10+10], ys: y_data[i*10:i*10+10]})

    if i % record_step == 0:
        # record cost
        print(sess.run(cost, feed_dict={xs: x_data, ys: y_data}))
        print(sess.run(cost_norm, feed_dict={xs: x_data, ys: y_data}))