Diverse Branch Block: Building a Convolution as an Inception-like Unit
旷视——2021CVPR
paper:https://arxiv.org/pdf/2103.13425.pdf
code:https://github.com/DingXiaoH/DiverseBranchBlock
摘要
提出了一个卷积神经网络(ConvNet)的通用构建块,在不增加任何推理时间成本的情况下提高性能。该块被命名为“多样分支块”(Diverse Branch Block, DBB),它通过结合不同尺度和不同复杂度的多条分支来丰富特征空间,包括卷积序列、多尺度卷积和平均池化,从而增强单个卷积的表示能力。训练结束后,DBB可以等价地转换为一个单一的卷积层以进行部署。与设计新的ConvNet架构不同,DBB在保持宏观架构不变的同时使训练时的微观结构复杂化,因此它可以作为任何架构中常规卷积层的即插即用替代。这样,模型可以被训练到更高的性能水平,然后转换回原始的推理时结构进行推理。DBB在图像分类(ImageNet top-1 准确率最高提升1.9%)、目标检测和语义分割任务上均带来提升。
论文主要思想
由于卷积操作是一种线性变换,具有结合律和分配律性质,因此可以将多个卷积操作合并成一个操作:多条支路的并联合并利用的是分配律,串行(序列)合并利用的是结合律。同理,Batch Normalization(BN)和Average Pooling也是线性变换,因此可以将卷积、BN和Average Pooling进行融合。于是有了作者提出的六种合并方式:
(1) Conv-BN的合并:(2)分支合并;(3) 卷积序列合并;(4) 深度拼接合并;(5) 均值池化转换;(6) 多尺度卷积转换等。
(PS:在这里,笔者认为不同尺寸的卷积核也可以合并,如:3*3、5*5、7*7只需在相应位置进行补零对齐即可,同时对于dilation convolution也可以进行融合。只是这样的操作计算代价较高,有使用大卷积核需求的同志们可以尝试)
Keras代码实现
以下是根据论文和Pytorch源码实现的keras版本(支持Tensorflow1.x)。特征通道必须channel last。代码链接
训练部分:
这里只给出上面所说的六个中的一个。其实dbb_dbb函数已经包含了所有六种方式。
def dbb_dbb(self, input, filters, kernel_size, name, dilation_rate=1, use_bias=False, use_bn=True, padding='same'):
    """Build a Diverse Branch Block (training) or its fused single conv (inference).

    Training-time topology (four branches summed into one output)::

                              input
          -------------------------------------------
          |            |            |               |
         1x1          1x1          1x1             kxk
          |            |            |               |
         [BN]         [BN]         [BN]            [BN]
          |            |            |               |
          |           kxk          avg              |
          |            |            |               |
          |           [BN]         [BN]             |
          ------------------Add---------------------
                               |
                     Diverse Branch Block

    When ``self.stage != 'train'`` a single kxk conv named ``name`` stands in
    for the whole block; ``fusion_dbb`` later copies the fused weights into it.

    Args:
        input: 4-D channels-last feature map tensor.
        filters: number of output channels of every branch.
        kernel_size: spatial size k of the kxk branches.
        name: unique prefix for all layer names inside this block.
        dilation_rate: dilation applied to the conv layers.
        use_bias: whether the conv layers carry bias terms.
        use_bn: whether to insert BatchNormalization after each conv/pool.
        padding: conv padding mode; 'same' keeps the spatial size.

    Returns:
        The block's output tensor.
    """
    dr = (dilation_rate, dilation_rate)
    if self.stage == 'train':
        x1 = Conv2D(filters, 1, padding=padding, use_bias=use_bias,
                    dilation_rate=dr, name=name + '_conv_1x1_1')(input)
        x2 = Conv2D(filters, 1, padding=padding, use_bias=use_bias,
                    dilation_rate=dr, name=name + '_conv_1x1_2')(input)
        x3 = Conv2D(filters, 1, padding=padding, use_bias=use_bias,
                    dilation_rate=dr, name=name + '_conv_1x1_3')(input)
        x4 = Conv2D(filters, kernel_size, padding=padding, use_bias=use_bias,
                    dilation_rate=dr, name=name + '_conv_kxk_1')(input)
        if use_bn:
            # Inner BNs of the two sequential branches (1x1 -> BN -> kxk/avg).
            x2 = BatchNormalization(name=name + '_bn_2_1')(x2)
            x3 = BatchNormalization(name=name + '_bn_3_1')(x3)
        x2 = Conv2D(filters, kernel_size, padding=padding, use_bias=use_bias,
                    dilation_rate=dr, name=name + '_conv_kxk_2')(x2)
        # NOTE(review): 'same' average pooling renormalises by the number of
        # valid taps at the border, while the fused conv kernel always divides
        # by k*k — the fusion is exact only away from feature-map edges.
        x3 = AveragePooling2D(kernel_size, strides=1, padding='same',
                              name=name + '_avg')(x3)
        if use_bn:
            x1 = BatchNormalization(name=name + '_bn_1')(x1)
            x2 = BatchNormalization(name=name + '_bn_2_2')(x2)
            x3 = BatchNormalization(name=name + '_bn_3_2')(x3)
            x4 = BatchNormalization(name=name + '_bn_4')(x4)
        x = Add(name=name + '_add')([x1, x2, x3, x4])
    else:
        # Deployment: one conv with the same receptive field.  Honour the
        # caller's padding so train/infer geometries match (the original
        # hard-coded 'same' here, diverging from training for padding='valid').
        x = Conv2D(filters, kernel_size, dilation_rate=dr,
                   padding=padding, name=name)(input)
    # Record the block so fusion_dbb can find its layers after training.
    self.dbb_block_names['dbb_dbb'].append([name, use_bias, use_bn, None, None])
    return x
融合(重参数化)部分:
def fusion_dbb(AC_names, trained_model, infer_model):
    """Fold every trained Diverse Branch Block into one kxk conv of ``infer_model``.

    For each entry of ``AC_names`` (recorded by ``dbb_dbb`` as
    ``[name, use_bias, use_bn, _, _]``) the four training-time branches::

                              input
          -------------------------------------------
          |            |            |               |
         1x1          1x1          1x1             kxk
          |            |            |               |
         [BN]         [BN]         [BN]            [BN]
          |            |            |               |
          |           kxk          avg              |
          |            |            |               |
          |           [BN]         [BN]             |
          ------------------Add---------------------

    are converted to equivalent kxk kernels/biases (BN folding, 1x1∘kxk
    sequential merge, avg-pool-as-conv, 1x1 zero-padded to kxk) and their sum
    is written into the single conv layer named ``name`` of ``infer_model``.

    Weight layout is Keras channels-last: (kh, kw, in_channels, out_channels).
    The sequential merge is done with pure numpy ``einsum`` (the original used
    a TF1 ``Session`` + ``K.conv2d`` for the same tensor contraction).
    """
    for layer_name, use_bias, use_bn, _model, _epoch in AC_names:

        def weights_of(suffix):
            # All trainable arrays of one sub-layer of this block.
            return trained_model.get_layer(layer_name + suffix).get_weights()

        w11_1 = weights_of('_conv_1x1_1')[0]
        w11_2 = weights_of('_conv_1x1_2')[0]
        w11_3 = weights_of('_conv_1x1_3')[0]
        wkk_1 = weights_of('_conv_kxk_1')[0]
        wkk_2 = weights_of('_conv_kxk_2')[0]
        k = wkk_2.shape[0]
        mid = wkk_2.shape[2]

        # Equivalent kernel of the k x k stride-1 average pooling: output
        # channel i averages input channel i with weight 1/k².
        # NOTE(review): 'same' pooling renormalises at the border while this
        # kernel always divides by k*k — exact only away from the edges.
        wkk_3 = np.zeros_like(wkk_2)
        for i in range(mid):
            wkk_3[:, :, i % mid, i] = 1.0 / (k * k)

        if use_bias:
            b11_1 = weights_of('_conv_1x1_1')[1]
            b11_2 = weights_of('_conv_1x1_2')[1]
            b11_3 = weights_of('_conv_1x1_3')[1]
            bkk_1 = weights_of('_conv_kxk_1')[1]
            bkk_2 = weights_of('_conv_kxk_2')[1]
        else:
            b11_1 = np.zeros((w11_1.shape[-1],))
            b11_2 = np.zeros((w11_2.shape[-1],))
            b11_3 = np.zeros((w11_3.shape[-1],))
            bkk_1 = np.zeros((wkk_1.shape[-1],))
            bkk_2 = np.zeros((wkk_2.shape[-1],))
        bkk_3 = np.zeros_like(bkk_2)  # average pooling has no bias

        # Epsilon must match the BN layers built by dbb_dbb — Keras
        # BatchNormalization defaults to 1e-3 (the original used 1e-10, which
        # folds slightly wrong scales).  Without BN the fabricated identity
        # stats (gamma=1, mean=0, var=1) need eps=0 to stay a true identity.
        eps = 1e-3 if use_bn else 0.0

        def bn_of(suffix, channels):
            # (gamma, beta, moving_mean, moving_var); identity stats if no BN.
            if use_bn:
                return trained_model.get_layer(layer_name + suffix).get_weights()
            return [np.ones((channels,)), np.zeros((channels,)),
                    np.zeros((channels,)), np.ones((channels,))]

        def fold(w, b, bn_params):
            # Fold BN(y) = gamma*(y-mean)/sqrt(var+eps)+beta into conv (w, b).
            gamma, beta, mean, var = bn_params
            scale = gamma / np.sqrt(var + eps)
            return scale * w, (b - mean) * scale + beta

        # Branch 1: 1x1 -> BN (zero-padded into the kxk kernel centre below).
        f11_1, bias1 = fold(w11_1, b11_1, bn_of('_bn_1', w11_1.shape[-1]))
        # Branch 2: 1x1 -> BN -> kxk -> BN.
        f11_2, fb11_2 = fold(w11_2, b11_2, bn_of('_bn_2_1', w11_2.shape[-1]))
        fkk_2, fbkk_2 = fold(wkk_2, bkk_2, bn_of('_bn_2_2', wkk_2.shape[-1]))
        # Branch 3: 1x1 -> BN -> avg(=conv) -> BN.
        f11_3, fb11_3 = fold(w11_3, b11_3, bn_of('_bn_3_1', w11_3.shape[-1]))
        fkk_3, fbkk_3 = fold(wkk_3, bkk_3, bn_of('_bn_3_2', wkk_2.shape[-1]))
        # Branch 4: plain kxk -> BN.
        weight4, bias4 = fold(wkk_1, bkk_1, bn_of('_bn_4', wkk_1.shape[-1]))

        # Sequential merge W = Wkxk ∘ W1x1:
        #   W[x,y,i,o]  = sum_m W1x1[0,0,i,m] * Wkxk[x,y,m,o]
        #   b[o]       += sum_{x,y,m} Wkxk[x,y,m,o] * b1x1[m]
        weight2 = np.einsum('im,xymo->xyio', f11_2[0, 0], fkk_2)
        bias2 = np.einsum('xymo,m->o', fkk_2, fb11_2) + fbkk_2
        weight3 = np.einsum('im,xymo->xyio', f11_3[0, 0], fkk_3)
        bias3 = np.einsum('xymo,m->o', fkk_3, fb11_3) + fbkk_3

        # Branch 1's folded 1x1 becomes the centre tap of a kxk kernel.
        weight1 = np.zeros_like(weight4)
        weight1[k // 2, k // 2, :, :] = f11_1[0, 0]

        infer_model.get_layer(layer_name).set_weights(
            [weight1 + weight2 + weight3 + weight4,
             bias1 + bias2 + bias3 + bias4])
声明:本内容来源网络,版权属于原作者,图片来源原论文。如有侵权,联系删除。