前言
这部分接着前一篇文章 【MMDetection 源码解读 yolov3】Backbone - Darknet53 继续往后讲。搭建完了主干特征提取模块,接着就是搭建yolov3的特征融合模块,这部分yolov3使用的是FPN(特征金字塔)的这样一个Up-bottom的结构,能够在增加较少计算量的前提下融合低分辨率语义信息较强的特征图和高分辨率语义信息较弱但空间信息丰富的特征图。下面图画线部分为FPN具体的一个结构
【yolov3 红线标记部分为Neck-FPN部分】
一、FPN
看着上面的图(红线括起来的部分),加下面我的注释,这部分非常简单,因该都没太大问题吧。总体的搭建顺序就是:Conv2D Block 5L(head1) -> Conv2D -> Upsample -> Concat -> Conv2D Block 5L(head2) -> Conv2D -> Upsample -> Concat -> Conv2D Block 5L(head3)
@NECKS.register_module()
class YOLOV3Neck(BaseModule):
"""The neck of YOLOV3.
It can be treated as a simplified version of FPN. It
will take the result from Darknet backbone and do some upsampling and
concatenation. It will finally output the detection result.
Note:
The input feats should be from top to bottom.
i.e., from high-lvl to low-lvl
But YOLOV3Neck will process them in reversed order.
i.e., from bottom (high-lvl) to top (low-lvl)
out(input size = 416x416):
(1, 512, 13, 13)
(1, 256, 26, 26)
(1, 128, 52, 52)
Args:
num_scales (int): The number of scales / stages.
in_channels (List[int]): The number of input channels per scale.
out_channels (List[int]): The number of output channels per scale.
conv_cfg (dict, optional): Config dict for convolution layer.
Default: None.
norm_cfg (dict, optional): Dictionary to construct and config norm
layer. Default: dict(type='BN', requires_grad=True)
act_cfg (dict, optional): Config dict for activation layer.
Default: dict(type='LeakyReLU', negative_slope=0.1).
init_cfg (dict or list[dict], optional): Initialization config dict.
Default: None
"""
def __init__(self,
num_scales, # backbone输出stage的数量
in_channels, # neck的输入channels=backbone的输出channels [1024, 512, 256]
out_channels, # neck的输出channels=head的输入channels [512, 256, 128]
conv_cfg=None, # 卷积配置 一般为None
norm_cfg=dict(type='BN', requires_grad=True), # norm layer配置 一般为BN
act_cfg=dict(type='LeakyReLU', negative_slope=0.1), # 激活函数配置 一般为LeakyReLU
init_cfg=None): # 初始化配置 一般为None
super(YOLOV3Neck, self).__init__(init_cfg)
assert (num_scales == len(in_channels) == len(out_channels)) # check
self.num_scales = num_scales # backbone输出stage的数量
self.in_channels = in_channels # neck的输入channels=backbone的输出channels [1024, 512, 256]
self.out_channels = out_channels # neck的输出channels=head的输入channels [512, 256, 128]
# shortcut
cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
# 初始化Neck所需的所有组件
# To support arbitrary scales, the code looks awful, but it works.
# Better solution is welcomed.
# self.detect1/2/3 = Conv2D Block 5L(连着的5个卷积层) 也是3个head之前的5个卷积
self.detect1 = DetectionBlock(in_channels[0], out_channels[0], **cfg)
for i in range(1, self.num_scales): # 搭建另外两个self.detect 和上采样前的卷积
in_c, out_c = self.in_channels[i], self.out_channels[i]
inter_c = out_channels[i - 1]
# conv1/conv2:图中Conv2D Block 5L 和 Upsample 之间的那个Conv
self.add_module(f'conv{i}', ConvModule(inter_c, out_c, 1, **cfg))
# in_c + out_c : High-lvl feats will be cat with low-lvl feats
# detect2/detect3 后两个Conv2D Block 5L
self.add_module(f'detect{i+1}',
DetectionBlock(in_c + out_c, out_c, **cfg))
def forward(self, feats):
# 完整的Neck(FPN)顺序:Conv2D Block 5L(head1) + Conv2D + Upsample + Concat + Conv2D Block 5L(head2)
# + Conv2D + Upsample + Concat + Conv2D Block 5L(head3)
assert len(feats) == self.num_scales
# processed from bottom (high-level) to top (low-level)
outs = [] # outs存放neck部分的输出(head的输入)
out = self.detect1(feats[-1])
outs.append(out) # 存放第一个Neck的输出(第一个head的输入)
# 注意下这里的遍历顺序
# feats是Backbone的输出feature map(low level to high level)所以遍历要用-1 (high level to low level)
for i, x in enumerate(reversed(feats[:-1])):
conv = getattr(self, f'conv{i+1}') # Conv2D
tmp = conv(out)
# Cat with low-lvl feats
tmp = F.interpolate(tmp, scale_factor=2) # Upsample
tmp = torch.cat((tmp, x), 1) # Concat
detect = getattr(self, f'detect{i+2}') # Conv2D Block 5L
out = detect(tmp)
outs.append(out) # 存放第二/第三个Neck的输出(第二/第三个head的输入)
return tuple(outs) # 返回所有Neck的输出 作为Head的输入
class DetectionBlock(BaseModule):
"""Detection block in YOLO neck.
Let out_channels = n, the DetectionBlock contains:
Six ConvLayers, 1 Conv2D Layer and 1 YoloLayer.
The first 6 ConvLayers are formed the following way:
1x1xn, 3x3x2n, 1x1xn, 3x3x2n, 1x1xn, 3x3x2n.
The Conv2D layer is 1x1x255.
Some block will have branch after the fifth ConvLayer.
The input channel is arbitrary (in_channels)
Args:
in_channels (int): The number of input channels.
out_channels (int): The number of output channels.
conv_cfg (dict): Config dict for convolution layer. Default: None.
norm_cfg (dict): Dictionary to construct and config norm layer.
Default: dict(type='BN', requires_grad=True)
act_cfg (dict): Config dict for activation layer.
Default: dict(type='LeakyReLU', negative_slope=0.1).
init_cfg (dict or list[dict], optional): Initialization config dict.
Default: None
"""
# 这个类就是yolov3图中的Conv2D Block 5L:连着5个Conv 后面接的就是Head
def __init__(self,
in_channels, # Backbone输出的其中一个feature map的channel 1024/512/256
out_channels, # Neck的输出 512/256/128
conv_cfg=None, # 卷积配置 一般为None
norm_cfg=dict(type='BN', requires_grad=True), # norm layer配置 一般为BN
act_cfg=dict(type='LeakyReLU', negative_slope=0.1), # 激活函数配置 一般为LeakyReLU
init_cfg=None): # 初始化配置 一般为None
super(DetectionBlock, self).__init__(init_cfg)
double_out_channels = out_channels * 2 # 中间层的channel=out_channels*2=in_channels
# conv1x1 conv3x3 conv1x1 conv3x3 conv1x1
# 512->1024 1024->1024 1024->1024 1024->1024 1024->1024
cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
self.conv1 = ConvModule(in_channels, out_channels, 1, **cfg)
self.conv2 = ConvModule(
out_channels, double_out_channels, 3, padding=1, **cfg)
self.conv3 = ConvModule(double_out_channels, out_channels, 1, **cfg)
self.conv4 = ConvModule(
out_channels, double_out_channels, 3, padding=1, **cfg)
self.conv5 = ConvModule(double_out_channels, out_channels, 1, **cfg)
def forward(self, x):
tmp = self.conv1(x)
tmp = self.conv2(tmp)
tmp = self.conv3(tmp)
tmp = self.conv4(tmp)
out = self.conv5(tmp)
return out
Neck部分的输入:
返回tuple{3个tensor}格式的feature map,shape分别为:[bs,256,64,64]、[bs,512,32,32]、[bs,1024,16,16],会传到Neck(FPN)层进行特征融合。
Neck部分的输出:
同样是tuple{3个tensor}格式的feature map,shape分别为:[bs,512,16,16]、[bs,256,64,64]、[bs,128,64,64],会传到Head层进行分类-回归预测。
总结
没啥好总结的,很简单的一个Head结构-FPN。