Study hard; blog at an easy pace.
This chapter walks through the batch_normalization layer in detail. batch_normalization works not only after convolutional layers but also after fully connected layers. Note that for convolutional layers, batch_normalization normalizes over every dimension except the channel dimension; the explanation I find most convincing is this one: https://www.zhihu.com/question/269658514/answer/827941407
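To make "every dimension except the channel dimension" concrete, here is a minimal numpy sketch; the NHWC layout and the toy shapes are just illustrative assumptions, not part of the enet code:

import numpy as np

# a toy conv feature map in NHWC layout: (batch, height, width, channels)
feature_map = np.random.randn(8, 28, 28, 16)

# normalize over every axis except the last (channel) axis,
# so each of the 16 channels gets its own mean and variance
axes = tuple(range(feature_map.ndim - 1))       # (0, 1, 2)
mean = np.mean(feature_map, axis=axes)          # shape (16,)
var = np.var(feature_map, axis=axes)            # shape (16,)
normalized = (feature_map - mean) / np.sqrt(var + 1e-7)

# for a fully connected input of shape (batch, features) the same code
# reduces over axis 0 only, i.e. it computes per-feature statistics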
Let's start with the batch_normalization code:
from enet.layers.base_layer import Layer
from enet.utils.util import get_ndim_except_last
from enet.optimizer import optimizer_dict
import numpy as np
class BatchNormalization(Layer):
    """
    Batch normalization layer; works for both fully connected and convolutional inputs.
    """

    def __init__(self, decay=0.9, optimizer="sgd", **k_args):
        """
        :param decay: decay rate of the running mean and variance
        """
        super(BatchNormalization, self).__init__(layer_type="batch_normalization")
        self.decay = decay
        self.running_mean = None
        self.running_var = None
        self.gamma = None
        self.beta = None
        self.param_shape = None
        self.cache_std = None
        self.cache_xc = None
        self.cache_xn = None
        self.optimizer = optimizer_dict[optimizer](**k_args)

    def build(self, input_shape):
        """
        Build the layer parameters from input_shape.
        :param input_shape: shape of the input
        :return: None
        """
        self.input_shape = input_shape
        self.output_shape = input_shape
        # the input may be 1-D (dense) or multi-dimensional (convolutional)
        self.param_shape = input_shape if isinstance(input_shape, int) else input_shape[-1]
        self.gamma = np.random.uniform(low=0.9, high=1.1, size=self.param_shape)
        self.beta = np.random.uniform(low=-0.1, high=0.1, size=self.param_shape)
        self.running_mean = np.zeros(shape=self.param_shape)
        self.running_var = np.zeros(shape=self.param_shape)

    def forward(self, input_signal, train, *args, **k_args):
        """
        Forward pass.
        :param train: whether the layer runs in training mode
        :param input_signal: input signal
        :return: output signal
        """
        if train:
            sample_mean = np.mean(input_signal, axis=get_ndim_except_last(input_signal))
            sample_var = np.var(input_signal, axis=get_ndim_except_last(input_signal))
            # cache intermediate values for the backward pass
            self.cache = input_signal
            self.cache_std = np.sqrt(sample_var + 1e-7)
            self.cache_xc = input_signal - sample_mean
            self.cache_xn = self.cache_xc / self.cache_std
            # exponential moving average of the batch statistics
            # (not to be confused with the momentum optimizer)
            self.running_mean = self.decay * self.running_mean + (1 - self.decay) * sample_mean
            self.running_var = self.decay * self.running_var + (1 - self.decay) * sample_var
            input_signal = self.cache_xn
        else:
            input_signal = (input_signal - self.running_mean) / np.sqrt(self.running_var + 1e-7)
        return input_signal * self.gamma + self.beta

    def backward(self, delta):
        """
        Backward pass.
        :param delta: incoming gradient
        :return: gradient propagated back to the previous layer
        """
        # gamma scales the normalized signal, so its gradient is summed against cache_xn
        delta_gamma = np.sum(self.cache_xn * delta, axis=get_ndim_except_last(delta))
        delta_beta = np.sum(delta, axis=get_ndim_except_last(delta))
        self.optimizer.grand(delta_gamma=delta_gamma, delta_beta=delta_beta)
        # gradient that flows back to the previous layer
        # x feeds three paths: x -> xc, x -> mean, x -> std
        xn_delta = delta * self.gamma
        xc_delta = xn_delta / self.cache_std
        std_delta = - np.sum((xn_delta * self.cache_xc) / (self.cache_std * self.cache_std),
                             axis=get_ndim_except_last(delta))
        var_delta = 0.5 * std_delta / self.cache_std
        xc_delta += 2.0 * self.cache_xc * var_delta / np.prod(delta.shape[: -1])
        return xc_delta - np.mean(xc_delta, axis=get_ndim_except_last(delta))

    def update(self, lr):
        """
        Update the parameters.
        :param lr: learning rate
        :return: None
        """
        delta_gamma, delta_beta = self.optimizer.get_delta_and_reset(lr, "delta_gamma", "delta_beta")
        self.gamma += delta_gamma
        self.beta += delta_beta
Thanks to numpy's broadcasting, it is easy to write a batch_normalization that works for both convolutional and fully connected layers. The implementation uses gamma and beta to give the data along the last dimension a learnable scale and shift: the normalized data should not be forced to follow exactly the same distribution, so an appropriate shift and rescaling is applied. Doesn't this feel a bit like an attention model?
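The helper get_ndim_except_last used throughout the layer is imported from enet.utils.util but its body is not shown in this post. Based on how it is called above, a plausible implementation (an assumption on my part, not the repository's actual code) would simply return every axis index except the last:

def get_ndim_except_last(array):
    """Return a tuple of all axis indices except the last one,
    e.g. (0,) for an (N, D) tensor and (0, 1, 2) for an (N, H, W, C) tensor."""
    return tuple(range(array.ndim - 1))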
The forward pass of batch_normalization is fairly easy to follow; the hard part is the backward pass, whose derivation is somewhat involved. Here is a schematic diagram that you can read side by side with the code above:
The principle of the chain rule here is that the gradient a function passes back is the product of the upstream gradient and its own derivative (with matrices reshaped appropriately).
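If the chain-rule bookkeeping still feels opaque, a numerical gradient check is a good way to convince yourself that backward is correct. The sketch below is my own test script, assuming the interfaces shown above (build / forward / backward and a default "sgd" entry in optimizer_dict); it compares the analytic input gradient against a finite-difference estimate on a small dense input:

import numpy as np
from enet.layers import BatchNormalization

np.random.seed(0)
layer = BatchNormalization()
layer.build(input_shape=4)                     # 4 features, dense-style input

x = np.random.randn(8, 4)                      # small batch
delta = np.random.randn(8, 4)                  # pretend upstream gradient

# analytic gradient w.r.t. the input
layer.forward(x, train=True)
analytic = layer.backward(delta)

# finite-difference estimate of the same gradient
eps = 1e-5
numeric = np.zeros_like(x)
for idx in np.ndindex(*x.shape):
    x_plus, x_minus = x.copy(), x.copy()
    x_plus[idx] += eps
    x_minus[idx] -= eps
    loss_plus = np.sum(layer.forward(x_plus, train=True) * delta)
    loss_minus = np.sum(layer.forward(x_minus, train=True) * delta)
    numeric[idx] = (loss_plus - loss_minus) / (2 * eps)

# should be very small (roughly 1e-6 or less) if backward is correct
print(np.max(np.abs(analytic - numeric)))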
That is about it for batch_normalization. Let's now build and train a model that contains both convolutional layers and batch_normalization:
from enet.model import Sequential
from enet.data import ImageHandler
from enet.layers import Dense, Sigmoid, Dropout, Softmax, Relu, BatchNormalization, Conv2D, Flatten, MaxPool2D
if __name__ == '__main__':
    data_handler = ImageHandler("dataset", gray=True, flatten=False, use_scale=True)
    train_data, train_label, test_data, test_label = data_handler.get_data(ratio=0.2, read_cache=False)

    model = Sequential()
    model.add(Conv2D(filters=32, optimizer="adam", strides=1, kernel_size=(2, 2), name="conv1",
                     padding="valid", input_shape=(28, 28, 1)))
    model.add(BatchNormalization())
    model.add(Relu())
    model.add(MaxPool2D(size=2))
    model.add(Conv2D(filters=64, optimizer="adam", padding="valid", name="conv2", strides=1))
    model.add(BatchNormalization())
    model.add(Relu())
    model.add(MaxPool2D(size=2))
    model.add(Flatten())
    model.add(Dense(kernel_size=64, optimizer="adam"))
    model.add(BatchNormalization())
    model.add(Sigmoid())
    model.add(Dense(kernel_size=10, activation="sigmoid", optimizer="adam"))
    model.compile(loss="mse", lr=0.01)
    model.summary()
    model.fit(train_data, train_label, epoch=2)
Training is fairly slow because img2col and col2img take a lot of time, so we only train for 2 epochs. If you are interested, you can try speeding up the convolution with as_strided; see https://zhuanlan.zhihu.com/p/64933417 for details (a minimal sketch follows below). We won't dwell on it here, since the main goal is to better understand convolutional neural networks.
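For readers who want to try the as_strided route, here is a minimal im2col sketch built on numpy's stride tricks. The function name and interface are mine, not part of enet, and it assumes NHWC input with stride 1 and no padding for brevity:

import numpy as np
from numpy.lib.stride_tricks import as_strided

def im2col_as_strided(images, kernel_h, kernel_w):
    """Extract all kernel_h x kernel_w patches from NHWC images without copying
    (stride 1, no padding), then flatten them for a single matrix multiplication."""
    n, h, w, c = images.shape
    out_h, out_w = h - kernel_h + 1, w - kernel_w + 1
    sn, sh, sw, sc = images.strides
    patches = as_strided(
        images,
        shape=(n, out_h, out_w, kernel_h, kernel_w, c),
        strides=(sn, sh, sw, sh, sw, sc),
    )
    # the copy happens only here, when reshaping into the im2col matrix
    return patches.reshape(n * out_h * out_w, kernel_h * kernel_w * c)

# usage sketch: one big matrix multiplication replaces the sliding-window loops;
# for the 28x28x1 inputs above with a 2x2 kernel this yields an (n*27*27, 4) matrix
# cols = im2col_as_strided(batch, 2, 2)
# output = cols @ kernels.reshape(2 * 2 * 1, 32)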
The results show that convolutional layers greatly improve accuracy. Of course, batch_normalization is also added here; readers can try other configurations and compare the accuracy under different settings.
I may add an RNN implementation later, time permitting.
The full code is on GitHub at https://github.com/darkwhale/neural_network and is continuously updated.