- Introduction
Paper link: https://arxiv.org/abs/1502.03167
In YOLOv2, according to the paper, adding a batch normalization layer after every convolutional layer improves mAP by 2%. Compared with YOLOv1, dropout is removed; BN improves the model's generalization and effectively prevents overfitting.
- How BN works
- Simplified formula:
$$\widehat x_i = \frac{x_i - \bar{x}}{\sqrt{\frac{1}{N}\sum_{j=1}^{N}(x_j - \bar{x})^2+\epsilon}} \qquad (1)$$
Concrete algorithm flow: $\gamma$ and $\beta$ are learned; in YOLO only the scale is configured, so only $\gamma$ is used.
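Formula (1) only gives the normalization step; following the paper, the learned parameters then enter as a scale and shift that produce the layer output:

$$y_i = \gamma\,\widehat x_i + \beta$$

Since YOLO only configures the scale, only the $\gamma$ term appears in the code below.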
- Algorithm 1 (the Batch Normalizing Transform, from the paper)
- Algorithm 2 (training a batch-normalized network, from the paper)
Training-time algorithm flow. Because the mean and variance used at test time come from the training data, the paper uses moving averages to compute the $\mu_B$ and $\sigma_B^2$ used for inference; at test time these statistics stay fixed.
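Concretely, at inference time each activation is transformed with the fixed statistics (the essence of Algorithm 2 in the paper):

$$y = \gamma \cdot \frac{x - \mathrm{E}[x]}{\sqrt{\mathrm{Var}[x] + \epsilon}} + \beta$$

where $\mathrm{E}[x]$ and $\mathrm{Var}[x]$ are the moving averages accumulated during training.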
- Backpropagation
Quoting the paper: "During training we need to backpropagate the gradient of loss through this transformation"; below the loss is denoted $\delta$.
We need the gradients with respect to $\mu_B$, $\sigma_B^2$, $x_i$, $\widehat x_i$, $\gamma$, and $\beta$. By the chain rule:
$$\frac{\partial \delta}{\partial \widehat x_i} = \frac{\partial \delta}{\partial y_i}\frac{\partial y_i}{\partial \widehat x_i} = \frac{\partial \delta}{\partial y_i}\,\gamma \qquad (2)$$
$$\frac{\partial \delta}{\partial \mu_B} = \sum_{i=1}^{m}\frac{\partial \delta}{\partial \widehat x_i}\frac{\partial \widehat x_i}{\partial \mu_B} + \frac{\partial \delta}{\partial \sigma_B^2}\frac{\partial \sigma_B^2}{\partial \mu_B} = \sum_{i=1}^{m}\frac{\partial \delta}{\partial \widehat x_i}\cdot\frac{-1}{\sqrt{\sigma_B^2+\epsilon}} + \frac{\partial \delta}{\partial \sigma_B^2}\cdot\frac{-2}{m}\sum_{i=1}^{m}(x_i - \mu_B) \qquad (3)$$
$$\frac{\partial \delta}{\partial \sigma_B^2} = \sum_{i=1}^{m}\frac{\partial \delta}{\partial \widehat x_i}\frac{\partial \widehat x_i}{\partial \sigma_B^2} = \sum_{i=1}^{m}\frac{\partial \delta}{\partial \widehat x_i}\,(x_i - \mu_B)\cdot\frac{-1}{2}\,(\sigma_B^2+\epsilon)^{-3/2} \qquad (4)$$
$$\frac{\partial \delta}{\partial x_i} = \frac{\partial \delta}{\partial \widehat x_i}\frac{\partial \widehat x_i}{\partial x_i} + \frac{\partial \delta}{\partial \sigma_B^2}\frac{\partial \sigma_B^2}{\partial x_i} + \frac{\partial \delta}{\partial \mu_B}\frac{\partial \mu_B}{\partial x_i} = \frac{\partial \delta}{\partial \widehat x_i}\frac{1}{\sqrt{\sigma_B^2+\epsilon}} + \frac{\partial \delta}{\partial \sigma_B^2}\frac{2(x_i - \mu_B)}{m} + \frac{\partial \delta}{\partial \mu_B}\frac{1}{m} \qquad (5)$$
$$\frac{\partial \delta}{\partial \gamma} = \sum_{i=1}^{m}\frac{\partial \delta}{\partial y_i}\frac{\partial y_i}{\partial \gamma} = \sum_{i=1}^{m}\frac{\partial \delta}{\partial y_i}\,\widehat x_i \qquad (6)$$
$$\frac{\partial \delta}{\partial \beta} = \sum_{i=1}^{m}\frac{\partial \delta}{\partial y_i}\frac{\partial y_i}{\partial \beta} = \sum_{i=1}^{m}\frac{\partial \delta}{\partial y_i} \qquad (7)$$
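Before looking at the darknet code, here is a small self-contained C sketch of formulas (2)-(7) for a single channel with m values. It is my own illustration, not darknet code; the names (bn_backward_channel, delta_y, x_hat, etc.) are chosen for readability and do not correspond to darknet identifiers.

```c
#include <math.h>
#include <stdio.h>

/* Backward pass of BN for one channel with m values.
 * Inputs:  x[m] (pre-BN activations), delta_y[m] (gradient w.r.t. y_i),
 *          gamma, mean, var (batch statistics), eps.
 * Outputs: delta_x[m], *delta_gamma, *delta_beta.
 * Implements formulas (2)-(7) above. */
void bn_backward_channel(const float *x, const float *delta_y, int m,
                         float gamma, float mean, float var, float eps,
                         float *delta_x, float *delta_gamma, float *delta_beta)
{
    float inv_std = 1.0f / sqrtf(var + eps);

    /* (6), (7): gradients of the learned scale and shift */
    float dgamma = 0, dbeta = 0;
    for (int i = 0; i < m; ++i) {
        float x_hat = (x[i] - mean) * inv_std;   /* normalized value, formula (1) */
        dgamma += delta_y[i] * x_hat;
        dbeta  += delta_y[i];
    }
    *delta_gamma = dgamma;
    *delta_beta  = dbeta;

    /* (4): gradient w.r.t. the batch variance */
    float dvar = 0;
    for (int i = 0; i < m; ++i) {
        float d_xhat = delta_y[i] * gamma;       /* formula (2) */
        dvar += d_xhat * (x[i] - mean) * -0.5f * powf(var + eps, -1.5f);
    }

    /* (3): gradient w.r.t. the batch mean */
    float dmean = 0, sum_diff = 0;
    for (int i = 0; i < m; ++i) {
        float d_xhat = delta_y[i] * gamma;
        dmean += d_xhat * -inv_std;
        sum_diff += x[i] - mean;
    }
    dmean += dvar * (-2.0f / m) * sum_diff;

    /* (5): gradient w.r.t. each input x_i */
    for (int i = 0; i < m; ++i) {
        float d_xhat = delta_y[i] * gamma;
        delta_x[i] = d_xhat * inv_std + dvar * 2.0f * (x[i] - mean) / m + dmean / m;
    }
}

int main(void)
{
    float x[4]  = {1.0f, 2.0f, 3.0f, 4.0f};
    float dy[4] = {0.1f, -0.2f, 0.3f, -0.4f};
    float dx[4], dgamma, dbeta;
    /* batch statistics of x: mean = 2.5, variance = 1.25 */
    bn_backward_channel(x, dy, 4, 1.0f, 2.5f, 1.25f, 1e-5f, dx, &dgamma, &dbeta);
    printf("dgamma=%f dbeta=%f dx[0]=%f\n", dgamma, dbeta, dx[0]);
    return 0;
}
```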
- Code analysis
- Forward pass
```c
void forward_batchnorm_layer(layer l, network_state state)
{
    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, state.input, 1, l.output, 1);
    if(l.type == CONNECTED){ // a connected layer is treated as 1x1 spatially
        l.out_c = l.outputs;
        l.out_h = l.out_w = 1;
    }
    // training phase
    if(state.train){
        // per-channel mean over the batch
        mean_cpu(l.output, l.batch, l.out_c, l.out_h*l.out_w, l.mean);
        // per-channel variance over the batch
        variance_cpu(l.output, l.mean, l.batch, l.out_c, l.out_h*l.out_w, l.variance);
        // moving averages of the batch statistics, used later for inference
        scal_cpu(l.out_c, .9, l.rolling_mean, 1);
        axpy_cpu(l.out_c, .1, l.mean, 1, l.rolling_mean, 1);
        scal_cpu(l.out_c, .9, l.rolling_variance, 1);
        axpy_cpu(l.out_c, .1, l.variance, 1, l.rolling_variance, 1);
        // keep the un-normalized activations for the backward pass
        copy_cpu(l.outputs*l.batch, l.output, 1, l.x, 1);
        // formula (1)
        normalize_cpu(l.output, l.mean, l.variance, l.batch, l.out_c, l.out_h*l.out_w);
        // keep the normalized activations for the backward pass
        copy_cpu(l.outputs*l.batch, l.output, 1, l.x_norm, 1);
    } else { // inference phase: use the rolling statistics
        normalize_cpu(l.output, l.rolling_mean, l.rolling_variance, l.batch, l.out_c, l.out_h*l.out_w);
    }
    // compute y_i: the final scale step of Algorithm 1 (gamma only in YOLO)
    scale_bias(l.output, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
}
```
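normalize_cpu is where formula (1) is applied. A minimal sketch of what it computes, assuming darknet's flat layout (index = b*filters*spatial + f*spatial + i) and the per-channel statistics from mean_cpu/variance_cpu, could look like this; it is a simplified illustration, not the exact darknet source:

```c
#include <math.h>

// Normalize x in place: for each channel f, subtract mean[f] and divide by
// sqrt(variance[f] + eps), matching formula (1). "spatial" is out_h*out_w.
void normalize_sketch(float *x, const float *mean, const float *variance,
                      int batch, int filters, int spatial)
{
    const float eps = 1e-6f; // small constant for numerical stability (assumed value)
    for(int b = 0; b < batch; ++b){
        for(int f = 0; f < filters; ++f){
            for(int i = 0; i < spatial; ++i){
                int index = b*filters*spatial + f*spatial + i;
                x[index] = (x[index] - mean[f]) / sqrtf(variance[f] + eps);
            }
        }
    }
}
```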
- Backward pass
```c
void backward_batchnorm_layer(const layer l, network_state state)
{
    // formula (6): gradient of the scale gamma, accumulated into l.scale_updates
    backward_scale_cpu(l.x_norm, l.delta, l.batch, l.out_c, l.out_w*l.out_h, l.scale_updates);
    // formula (2): multiply the incoming gradient by gamma
    scale_bias(l.delta, l.scales, l.batch, l.out_c, l.out_h*l.out_w);
    // formula (3): gradient with respect to the batch mean
    mean_delta_cpu(l.delta, l.variance, l.batch, l.out_c, l.out_w*l.out_h, l.mean_delta);
    // formula (4): gradient with respect to the batch variance
    variance_delta_cpu(l.x, l.delta, l.mean, l.variance, l.batch, l.out_c, l.out_w*l.out_h, l.variance_delta);
    // formula (5): gradient with respect to the inputs x_i
    normalize_delta_cpu(l.x, l.mean, l.variance, l.mean_delta, l.variance_delta, l.batch, l.out_c, l.out_w*l.out_h, l.delta);
    if(l.type == BATCHNORM) copy_cpu(l.outputs*l.batch, l.delta, 1, state.delta, 1);
}
```
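As an example of how one of these helpers maps onto the formulas, the accumulation that backward_scale_cpu performs for formula (6) can be sketched as below, summed over the whole batch and all spatial positions. This is my reconstruction under the same layout assumption as above, not the verbatim darknet source:

```c
// Accumulate d(delta)/d(gamma) per channel: the sum of delta * x_norm over
// batch and spatial positions, i.e. formula (6) extended over the whole batch.
void backward_scale_sketch(const float *x_norm, const float *delta,
                           int batch, int filters, int spatial, float *scale_updates)
{
    for(int f = 0; f < filters; ++f){
        float sum = 0;
        for(int b = 0; b < batch; ++b){
            for(int i = 0; i < spatial; ++i){
                int index = b*filters*spatial + f*spatial + i;
                sum += delta[index] * x_norm[index];
            }
        }
        scale_updates[f] += sum; // accumulated here, consumed by the weight update step
    }
}
```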