Conv和BN算子融合
为什么做融合
在神经网络训练的过程中,BN层能够加速网络收敛,并且能够控制过拟合。不过这样也增加了一些运算和参数,在推理过程中,我们可以通过将BN层与卷积层的参数融合,来减少运算,并且为模型稍稍的瘦一下身。
依赖
- protobuf
- numpy
为了省事,使用protobuf来解析caffe模型,因此不需要配置caffe的python接口
融合小工具github
公式
卷积层计算公式
$$X_{conv} = X * W + b_{conv}$$
其中 $W$:权重,$b_{conv}$:偏置。
BN层计算公式
$$X_{bn} = \frac{s(X - m)}{\sqrt{\sigma + \epsilon}} + b_{bn}$$
其中 $m$:均值,$\sigma$:方差,$s$:缩放因子(scale),$b_{bn}$:偏置,$\epsilon$:防止除零的数值稳定小常数。
融合公式
融合实质就是将卷积层的输出 $X_{conv}$ 作为BN层的输入 $X$ 代入,得到:
$$X_{bn} = X * \frac{sW}{\sqrt{\sigma + \epsilon}} + \frac{s(b_{conv} - m)}{\sqrt{\sigma + \epsilon}} + b_{bn}$$
最终得到:
$$W_{conv}' = W\frac{s}{\sqrt{\sigma + \epsilon}}$$

$$b_{conv}' = (b_{conv} - m)\frac{s}{\sqrt{\sigma + \epsilon}} + b_{bn}$$
代码
conv_bn_scale_fuse.py
from src.rw_model import *
import numpy as np
## Collect Conv+BatchNorm+Scale layer names and the indices of the
## BatchNorm/Scale layers within the layer list.
def getConvBNLayer(net):
    """Find every Convolution -> BatchNorm -> Scale chain in the network.

    Returns a tuple (Conv_BN_list, BN_index):
      Conv_BN_list -- list of [conv_name, bn_name, scale_name] triples
      BN_index     -- flat list of the BatchNorm/Scale positions in the
                      layer list (so the caller can drop those layers)
    As a side effect, each matched convolution gets bias_term = True,
    because the fused convolution must carry the absorbed BN/Scale bias.
    """
    Layer = getNetLayer(net)
    Conv_BN_list = []
    BN_index = []
    for i in range(len(Layer)):
        # Need two following layers to form a chain; the original code
        # only checked i+1 yet read Layer[i+2], and its `A and B or C`
        # condition skipped the bounds check entirely when the numeric
        # type tag matched (4 is Convolution in old-style prototxt).
        if i + 2 >= len(Layer):
            break
        if Layer[i].type == "Convolution" or Layer[i].type == 4:
            if Layer[i + 1].type == "BatchNorm" and Layer[i + 2].type == "Scale":
                Conv_BN_list.append(
                    [Layer[i].name, Layer[i + 1].name, Layer[i + 2].name]
                )
                Layer[i].convolution_param.bias_term = True
                BN_index.append(i + 1)
                BN_index.append(i + 2)
    return Conv_BN_list, BN_index
## Fetch a layer's parameter blobs by layer name.
def getParam(name, model):
    """Return (blobs, index) for the layer called `name` in the caffemodel.

    Raises ValueError when no layer with that name exists -- the original
    code left `params` and `i` unbound in that case, producing a confusing
    UnboundLocalError at the `return` statement instead.
    """
    ModelLayer = getModelLayer(model)
    for index, layer in enumerate(ModelLayer):
        if layer.name == name:
            return layer.blobs, index
    raise ValueError("layer '%s' not found in model" % name)
## Materialize a protobuf blob's flat data into an ndarray of its declared shape.
def disposeParams(Params):
    """Return the blob's data as a numpy array shaped by Params.shape.dim."""
    dims = Params.shape.dim
    flat = np.array(Params.data)
    return flat.reshape(dims)
## Fuse the Conv, BatchNorm and Scale parameters and store the result
## back into the convolution layer's blobs.
def modifyParam(ConvBNName, model):
    """Fold BatchNorm/Scale statistics into each preceding convolution.

    Implements:
        W' = W * s / sqrt(var + eps)
        b' = (b_conv - mean) * s / sqrt(var + eps) + b_scale

    ConvBNName -- list of [conv_name, bn_name, scale_name] triples as
                  produced by getConvBNLayer.
    Returns the modified model (also mutated in place).
    """
    EPS = 0.00001  # numerical-stability constant under the square root
    for CBN in ConvBNName:
        W_bc, conv_index = getParam(CBN[0], model)
        W_data = disposeParams(W_bc[0])
        # A convolution without a bias blob contributes 0 to the fused bias.
        bc_data = disposeParams(W_bc[1]) if len(W_bc) != 1 else 0

        m_v, bn_index = getParam(CBN[1], model)
        m_data = disposeParams(m_v[0])
        v_data = disposeParams(m_v[1])
        # Caffe's BatchNorm layer stores a moving-average scale factor in
        # blobs[2]; the stored mean/variance must be divided by it. The
        # original code ignored this blob, which is only correct when the
        # factor happens to be 1.
        if len(m_v) > 2:
            raw = np.array(m_v[2].data)[0]
            factor = 0 if raw == 0 else 1.0 / raw
            m_data = m_data * factor
            v_data = v_data * factor

        s_bs, scale_index = getParam(CBN[2], model)
        s_data = disposeParams(s_bs[0])
        bs_data = disposeParams(s_bs[1])

        # Per-output-channel multiplier; broadcast over each filter instead
        # of the original per-channel Python loop.
        alpha = s_data / np.sqrt(v_data + EPS)
        W_new = W_data * alpha.reshape((-1,) + (1,) * (W_data.ndim - 1))
        b_new = (bc_data - m_data) * alpha + bs_data

        # NOTE(review): assumes getParam's index addresses model.layer
        # directly -- confirm getModelLayer(model) returns model.layer.
        model.layer[conv_index].blobs[0].data[:] = W_new.reshape(-1).tolist()
        if len(W_bc) == 1:
            # Conv had no bias blob: fill the Scale layer's bias blob with
            # the fused bias and append it to the conv so the blob's shape
            # metadata stays consistent.
            model.layer[scale_index].blobs[1].data[:] = np.array(b_new).reshape(-1).tolist()
            model.layer[conv_index].blobs.extend([model.layer[scale_index].blobs[1]])
        else:
            model.layer[conv_index].blobs[1].data[:] = np.array(b_new).reshape(-1).tolist()
    return model