程序阅读笔记2——EfficientDet_Backbone/BiFPN

EfficientDet_Backbone.py

# Author: Zylo117

import math

import torch
from torch import nn

from efficientdet.model import BiFPN, Regressor, Classifier, EfficientNet
from efficientdet.utils import Anchors


class EfficientDetBackbone(nn.Module):
    """EfficientDet detector: EfficientNet backbone, stacked BiFPN cells, box and class heads.

    Every per-variant hyper-parameter is stored as a list (or dict) indexed by
    ``compound_coef`` (0-7, i.e. EfficientDet-D0 ... D7).
    """

    def __init__(self, num_classes=80, compound_coef=0, load_weights=False, **kwargs):
        """Build the detector for one EfficientDet variant.

        Args:
            num_classes: number of classes predicted by the classification head.
            compound_coef: index of the EfficientDet variant; used to index all
                the hyper-parameter tables below.
            load_weights: forwarded to the ``EfficientNet`` backbone wrapper.
            **kwargs: optional ``ratios`` (anchor aspect ratios) and ``scales``
                (anchor scales); the whole dict is also forwarded to ``Anchors``.
        """
        super(EfficientDetBackbone, self).__init__()
        self.compound_coef = compound_coef

        # EfficientNet variant used as backbone. Entries 6 and 7 are both 6: D6 and D7
        # share the same backbone index and differ in the other tables below.
        self.backbone_compound_coef = [0, 1, 2, 3, 4, 5, 6, 6]
        # Channel width used inside the BiFPN cells and both prediction heads.
        self.fpn_num_filters = [64, 88, 112, 160, 224, 288, 384, 384]
        # Number of stacked BiFPN cells.
        self.fpn_cell_repeats = [3, 4, 5, 6, 7, 7, 8, 8]
        # Nominal input resolution per variant (stored only; not read in this class).
        self.input_sizes = [512, 640, 768, 896, 1024, 1280, 1280, 1536]
        # Number of separable-conv layers in the box head and in the class head.
        self.box_class_repeats = [3, 3, 3, 4, 4, 4, 5, 5]
        # Scale factor handed to the anchor generator.
        self.anchor_scale = [4., 4., 4., 4., 4., 4., 4., 5.]
        # Anchor aspect ratios; three ratios by default.
        self.aspect_ratios = kwargs.get('ratios', [(1.0, 1.0), (1.4, 0.7), (0.7, 1.4)])
        # Number of anchor scales; three scales by default.
        self.num_scales = len(kwargs.get('scales', [2 ** 0, 2 ** (1.0 / 3.0), 2 ** (2.0 / 3.0)]))
        conv_channel_coef = {
            # the channels of P3/P4/P5.
            0: [40, 112, 320],
            1: [40, 112, 320],
            2: [48, 120, 352],
            3: [48, 136, 384],
            4: [56, 160, 448],
            5: [64, 176, 512],
            6: [72, 200, 576],
            7: [72, 200, 576],
        }
        # Anchors generated per feature-map cell.
        num_anchors = len(self.aspect_ratios) * self.num_scales

        # Only the first cell receives raw backbone features (first_time=True);
        # weighted fusion (attention) is enabled for compound_coef < 6 only.
        self.bifpn = nn.Sequential(
            *[BiFPN(self.fpn_num_filters[self.compound_coef],
                    conv_channel_coef[compound_coef],
                    cell_idx == 0,
                    attention=compound_coef < 6)
              for cell_idx in range(self.fpn_cell_repeats[compound_coef])])

        self.num_classes = num_classes
        self.regressor = Regressor(in_channels=self.fpn_num_filters[self.compound_coef], num_anchors=num_anchors,
                                   num_layers=self.box_class_repeats[self.compound_coef])
        self.classifier = Classifier(in_channels=self.fpn_num_filters[self.compound_coef], num_anchors=num_anchors,
                                     num_classes=num_classes,
                                     num_layers=self.box_class_repeats[self.compound_coef])

        self.anchors = Anchors(anchor_scale=self.anchor_scale[compound_coef], **kwargs)

        self.backbone_net = EfficientNet(self.backbone_compound_coef[compound_coef], load_weights)

    def freeze_bn(self):
        """Switch every ``nn.BatchNorm2d`` sub-module to eval mode."""
        for m in self.modules():
            if isinstance(m, nn.BatchNorm2d):
                m.eval()

    def forward(self, inputs):
        """Run backbone, BiFPN and both heads on a batch of images.

        Returns:
            Tuple ``(features, regression, classification, anchors)``: the BiFPN
            outputs, the box head output, the class head output and the result of
            ``self.anchors(inputs, inputs.dtype)``.
        """
        # The backbone wrapper yields four maps; the first is discarded and the
        # remaining three are used as P3, P4 and P5.
        _, p3, p4, p5 = self.backbone_net(inputs)

        features = (p3, p4, p5)
        features = self.bifpn(features)

        regression = self.regressor(features)
        classification = self.classifier(features)
        anchors = self.anchors(inputs, inputs.dtype)

        return features, regression, classification, anchors

    def init_backbone(self, path):
        """Load a checkpoint non-strictly, reporting instead of raising on load errors.

        Args:
            path: checkpoint location passed to ``torch.load``.
        """
        state_dict = torch.load(path)
        try:
            ret = self.load_state_dict(state_dict, strict=False)
            print(ret)
        except RuntimeError as e:
            # Best effort by design: a failed load is reported and ignored.
            print('Ignoring "' + str(e) + '"')

在这里插入图片描述

Efficientdet_model.py

BiFPN模块解析

import torch.nn as nn
import torch
from torchvision.ops.boxes import nms as nms_torch

from efficientnet import EfficientNet as EffNet
from efficientnet.utils import MemoryEfficientSwish, Swish
from efficientnet.utils_extra import Conv2dStaticSamePadding, MaxPool2dStaticSamePadding


def nms(dets, thresh):
    """Apply ``nms_torch`` to a detection matrix.

    Args:
        dets: 2-D tensor; columns 0-3 are used as the boxes and column 4 as the
            per-box scores.
        thresh: threshold forwarded unchanged to ``nms_torch``.

    Returns:
        The result of ``nms_torch`` for those boxes, scores and threshold.
    """
    boxes = dets[:, :4]
    scores = dets[:, 4]
    return nms_torch(boxes, scores, thresh)

# 在特征融合的时候使用可分离卷积,使用DW卷积和PW卷积替换之前模块中的普通1*1卷积和3*3卷积操作
class SeparableConvBlock(nn.Module):
    """Depthwise 3x3 conv followed by pointwise 1x1 conv, with optional BN and Swish.

    created by Zylo117
    """

    def __init__(self, in_channels, out_channels=None, norm=True, activation=False, onnx_export=False):
        """Create the layers of the block.

        Args:
            in_channels: channels of the input (also the depthwise output channels).
            out_channels: channels produced by the pointwise conv; defaults to
                ``in_channels`` when None.
            norm: if True, a BatchNorm2d is applied after the pointwise conv.
            activation: if True, a Swish activation is applied last.
            onnx_export: if True, use Swish instead of MemoryEfficientSwish.
        """
        super().__init__()
        out_channels = in_channels if out_channels is None else out_channels

        # Q: whether separate conv
        #  share bias between depthwise_conv and pointwise_conv
        #  or just pointwise_conv apply bias.
        # A: Confirmed, just pointwise_conv applies bias, depthwise_conv has no bias.

        # groups=in_channels makes this a depthwise convolution.
        self.depthwise_conv = Conv2dStaticSamePadding(
            in_channels, in_channels, kernel_size=3, stride=1, groups=in_channels, bias=False)
        # Plain 1x1 convolution that mixes channels.
        self.pointwise_conv = Conv2dStaticSamePadding(
            in_channels, out_channels, kernel_size=1, stride=1)

        self.norm = norm
        if norm:
            # Warning: pytorch momentum is different from tensorflow's, momentum_pytorch = 1 - momentum_tensorflow
            self.bn = nn.BatchNorm2d(num_features=out_channels, momentum=0.01, eps=1e-3)

        self.activation = activation
        if activation:
            self.swish = Swish() if onnx_export else MemoryEfficientSwish()

    def forward(self, x):
        """Apply depthwise conv, pointwise conv, then BN and Swish when enabled."""
        out = self.pointwise_conv(self.depthwise_conv(x))
        if self.norm:
            out = self.bn(out)
        return self.swish(out) if self.activation else out

# 定义BIFPN模块
class BiFPN(nn.Module):
    """One BiFPN cell: top-down then bottom-up feature fusion over levels P3-P7.

    modified by Zylo117
    """

    def __init__(self, num_channels, conv_channels, first_time=False, epsilon=1e-4, onnx_export=False, attention=True):
        """

        Args:
            num_channels: channel width used by every layer inside the cell
            conv_channels: channels of the three incoming backbone maps (P3, P4, P5);
                        only read when first_time is True
            first_time: whether the input comes directly from the efficientnet,
                        if True, downchannel it first, and downsample P5 to generate P6 then P7
            epsilon: epsilon of fast weighted attention sum of BiFPN, not the BN's epsilon
            onnx_export: if True, use Swish instead of MemoryEfficientSwish
            attention: if True, forward() uses the weighted (fast normalized) fusion path
        """
        super(BiFPN, self).__init__()
        self.epsilon = epsilon
        # Conv layers: one separable conv after each of the 8 fusion nodes.
        # "*_up" belong to the top-down pathway, "*_down" to the bottom-up pathway;
        # input and output channels are both num_channels.
        self.conv6_up = SeparableConvBlock(num_channels, onnx_export=onnx_export)
        self.conv5_up = SeparableConvBlock(num_channels, onnx_export=onnx_export)
        self.conv4_up = SeparableConvBlock(num_channels, onnx_export=onnx_export)
        self.conv3_up = SeparableConvBlock(num_channels, onnx_export=onnx_export)
        self.conv4_down = SeparableConvBlock(num_channels, onnx_export=onnx_export)
        self.conv5_down = SeparableConvBlock(num_channels, onnx_export=onnx_export)
        self.conv6_down = SeparableConvBlock(num_channels, onnx_export=onnx_export)
        self.conv7_down = SeparableConvBlock(num_channels, onnx_export=onnx_export)

        # Feature scaling layers: 2x nearest-neighbour upsampling for the top-down pathway.
        self.p6_upsample = nn.Upsample(scale_factor=2, mode='nearest')
        self.p5_upsample = nn.Upsample(scale_factor=2, mode='nearest')
        self.p4_upsample = nn.Upsample(scale_factor=2, mode='nearest')
        self.p3_upsample = nn.Upsample(scale_factor=2, mode='nearest')

        # Max pooling (kernel 3, stride 2) for the bottom-up pathway.
        self.p4_downsample = MaxPool2dStaticSamePadding(3, 2)
        self.p5_downsample = MaxPool2dStaticSamePadding(3, 2)
        self.p6_downsample = MaxPool2dStaticSamePadding(3, 2)
        self.p7_downsample = MaxPool2dStaticSamePadding(3, 2)

        self.swish = MemoryEfficientSwish() if not onnx_export else Swish()

        self.first_time = first_time
        if self.first_time:
            # The first cell gets raw backbone maps: project P3/P4/P5 to num_channels
            # with a 1x1 conv + BN each.
            self.p5_down_channel = nn.Sequential(
                Conv2dStaticSamePadding(conv_channels[2], num_channels, 1),
                nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3),
            )
            self.p4_down_channel = nn.Sequential(
                Conv2dStaticSamePadding(conv_channels[1], num_channels, 1),
                nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3),
            )
            self.p3_down_channel = nn.Sequential(
                Conv2dStaticSamePadding(conv_channels[0], num_channels, 1),
                nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3),
            )

            # The backbone stops at P5: P6 = maxpool(BN(conv1x1(P5))), P7 = maxpool(P6).
            self.p5_to_p6 = nn.Sequential(
                Conv2dStaticSamePadding(conv_channels[2], num_channels, 1),
                nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3),
                MaxPool2dStaticSamePadding(3, 2)
            )
            self.p6_to_p7 = nn.Sequential(
                MaxPool2dStaticSamePadding(3, 2)
            )

            # Second, separately-parameterised projections of P4 and P5; they feed
            # the skip inputs of the bottom-up pathway.
            self.p4_down_channel_2 = nn.Sequential(
                Conv2dStaticSamePadding(conv_channels[1], num_channels, 1),
                nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3),
            )
            self.p5_down_channel_2 = nn.Sequential(
                Conv2dStaticSamePadding(conv_channels[2], num_channels, 1),
                nn.BatchNorm2d(num_channels, momentum=0.01, eps=1e-3),
            )

        # Weight: learnable fusion weights for the four top-down nodes (two inputs each),
        # initialised to ones; a ReLU keeps them non-negative before normalisation.
        self.p6_w1 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True)
        self.p6_w1_relu = nn.ReLU()
        self.p5_w1 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True)
        self.p5_w1_relu = nn.ReLU()
        self.p4_w1 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True)
        self.p4_w1_relu = nn.ReLU()
        self.p3_w1 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True)
        self.p3_w1_relu = nn.ReLU()

        # Bottom-up nodes: P4/P5/P6 fuse three inputs, P7 fuses two.
        self.p4_w2 = nn.Parameter(torch.ones(3, dtype=torch.float32), requires_grad=True)
        self.p4_w2_relu = nn.ReLU()
        self.p5_w2 = nn.Parameter(torch.ones(3, dtype=torch.float32), requires_grad=True)
        self.p5_w2_relu = nn.ReLU()
        self.p6_w2 = nn.Parameter(torch.ones(3, dtype=torch.float32), requires_grad=True)
        self.p6_w2_relu = nn.ReLU()
        self.p7_w2 = nn.Parameter(torch.ones(2, dtype=torch.float32), requires_grad=True)
        self.p7_w2_relu = nn.ReLU()

        self.attention = attention

    def forward(self, inputs):
        """
        illustration of a minimal bifpn unit
            P7_0 -------------------------> P7_2 -------->
               |-------------|                ^
                             v                |
            P6_0 ---------> P6_1 ---------> P6_2 -------->
               |-------------|--------------^ ^
                             v                |
            P5_0 ---------> P5_1 ---------> P5_2 -------->
               |-------------|--------------^ ^
                             v                |
            P4_0 ---------> P4_1 ---------> P4_2 -------->
               |-------------|--------------^ ^
                             |--------------v |
            P3_0 -------------------------> P3_2 -------->
        """

        # downsample channels using same-padding conv2d to target phase's if not the same
        # judge: same phase as target,
        # if same, pass;
        # elif earlier phase, downsample to target phase's by pooling
        # elif later phase, upsample to target phase's by nearest interpolation

        if self.attention:
            p3_out, p4_out, p5_out, p6_out, p7_out = self._forward_fast_attention(inputs)
        else:
            p3_out, p4_out, p5_out, p6_out, p7_out = self._forward(inputs)

        return p3_out, p4_out, p5_out, p6_out, p7_out

    def _input_features(self, inputs):
        """Return (p3_in, p4_in, p5_in, p6_in, p7_in), projecting raw backbone maps in the first cell."""
        if not self.first_time:
            # P3_0, P4_0, P5_0, P6_0 and P7_0 come straight from the previous cell.
            p3_in, p4_in, p5_in, p6_in, p7_in = inputs
            return p3_in, p4_in, p5_in, p6_in, p7_in

        p3, p4, p5 = inputs

        p6_in = self.p5_to_p6(p5)
        p7_in = self.p6_to_p7(p6_in)

        p3_in = self.p3_down_channel(p3)
        p4_in = self.p4_down_channel(p4)
        p5_in = self.p5_down_channel(p5)
        return p3_in, p4_in, p5_in, p6_in, p7_in

    def _bottom_up_skip_inputs(self, inputs, p4_in, p5_in):
        """Return the P4/P5 skip inputs of the bottom-up pathway (re-projected in the first cell)."""
        if self.first_time:
            _, p4, p5 = inputs
            return self.p4_down_channel_2(p4), self.p5_down_channel_2(p5)
        return p4_in, p5_in

    def _fusion_weights(self, raw_weights, relu):
        """Return relu(w) / (sum(relu(w)) + epsilon) for one fusion node."""
        clipped = relu(raw_weights)
        return clipped / (torch.sum(clipped, dim=0) + self.epsilon)

    def _forward_fast_attention(self, inputs):
        """Forward pass with weighted fusion.

        Each fusion node multiplies its inputs by ReLU-clipped weights that are
        normalised by their sum plus ``epsilon``, adds them, applies Swish and
        then the node's separable conv.

        In the first cell (``first_time``) ``inputs`` is the 3-tuple (P3, P4, P5)
        of backbone maps: P3-P5 are projected by 1x1 conv + BN, P6 is derived from
        P5 (1x1 conv + BN + max pool) and P7 from P6 (max pool). In later cells
        ``inputs`` is the 5-tuple produced by the previous cell.
        """
        p3_in, p4_in, p5_in, p6_in, p7_in = self._input_features(inputs)

        # P7_0 to P7_2

        # Weights and connections for P6_0 and P7_0 to P6_1
        weight = self._fusion_weights(self.p6_w1, self.p6_w1_relu)
        p6_up = self.conv6_up(self.swish(weight[0] * p6_in + weight[1] * self.p6_upsample(p7_in)))

        # Weights and connections for P5_0 and P6_1 to P5_1
        weight = self._fusion_weights(self.p5_w1, self.p5_w1_relu)
        p5_up = self.conv5_up(self.swish(weight[0] * p5_in + weight[1] * self.p5_upsample(p6_up)))

        # Weights and connections for P4_0 and P5_1 to P4_1
        weight = self._fusion_weights(self.p4_w1, self.p4_w1_relu)
        p4_up = self.conv4_up(self.swish(weight[0] * p4_in + weight[1] * self.p4_upsample(p5_up)))

        # Weights and connections for P3_0 and P4_1 to P3_2
        weight = self._fusion_weights(self.p3_w1, self.p3_w1_relu)
        p3_out = self.conv3_up(self.swish(weight[0] * p3_in + weight[1] * self.p3_upsample(p4_up)))

        # In the first cell the bottom-up pathway uses the second P4/P5 projections.
        p4_in, p5_in = self._bottom_up_skip_inputs(inputs, p4_in, p5_in)

        # Weights and connections for P4_0, P4_1 and P3_2 to P4_2
        weight = self._fusion_weights(self.p4_w2, self.p4_w2_relu)
        p4_out = self.conv4_down(
            self.swish(weight[0] * p4_in + weight[1] * p4_up + weight[2] * self.p4_downsample(p3_out)))

        # Weights and connections for P5_0, P5_1 and P4_2 to P5_2
        weight = self._fusion_weights(self.p5_w2, self.p5_w2_relu)
        p5_out = self.conv5_down(
            self.swish(weight[0] * p5_in + weight[1] * p5_up + weight[2] * self.p5_downsample(p4_out)))

        # Weights and connections for P6_0, P6_1 and P5_2 to P6_2
        weight = self._fusion_weights(self.p6_w2, self.p6_w2_relu)
        p6_out = self.conv6_down(
            self.swish(weight[0] * p6_in + weight[1] * p6_up + weight[2] * self.p6_downsample(p5_out)))

        # Weights and connections for P7_0 and P6_2 to P7_2
        weight = self._fusion_weights(self.p7_w2, self.p7_w2_relu)
        p7_out = self.conv7_down(self.swish(weight[0] * p7_in + weight[1] * self.p7_downsample(p6_out)))

        return p3_out, p4_out, p5_out, p6_out, p7_out

    def _forward(self, inputs):
        """Forward pass with plain (unweighted) summation at every fusion node."""
        p3_in, p4_in, p5_in, p6_in, p7_in = self._input_features(inputs)

        # P7_0 to P7_2

        # Connections for P6_0 and P7_0 to P6_1 respectively
        p6_up = self.conv6_up(self.swish(p6_in + self.p6_upsample(p7_in)))

        # Connections for P5_0 and P6_1 to P5_1 respectively
        p5_up = self.conv5_up(self.swish(p5_in + self.p5_upsample(p6_up)))

        # Connections for P4_0 and P5_1 to P4_1 respectively
        p4_up = self.conv4_up(self.swish(p4_in + self.p4_upsample(p5_up)))

        # Connections for P3_0 and P4_1 to P3_2 respectively
        p3_out = self.conv3_up(self.swish(p3_in + self.p3_upsample(p4_up)))

        # In the first cell the bottom-up pathway uses the second P4/P5 projections.
        p4_in, p5_in = self._bottom_up_skip_inputs(inputs, p4_in, p5_in)

        # Connections for P4_0, P4_1 and P3_2 to P4_2 respectively
        p4_out = self.conv4_down(
            self.swish(p4_in + p4_up + self.p4_downsample(p3_out)))

        # Connections for P5_0, P5_1 and P4_2 to P5_2 respectively
        p5_out = self.conv5_down(
            self.swish(p5_in + p5_up + self.p5_downsample(p4_out)))

        # Connections for P6_0, P6_1 and P5_2 to P6_2 respectively
        p6_out = self.conv6_down(
            self.swish(p6_in + p6_up + self.p6_downsample(p5_out)))

        # Connections for P7_0 and P6_2 to P7_2
        p7_out = self.conv7_down(self.swish(p7_in + self.p7_downsample(p6_out)))

        return p3_out, p4_out, p5_out, p6_out, p7_out
'''
第一次进行BiFPN时,先得到P3in、P4in_1、P4in_2、P5in_1、P5in_2、P6in、P7in。top-down分支:将P6in乘上权值w1,与P7in上采样后乘上权值w2的结果进行feature fusion得到P6up;再将P5in乘上权值w1,与P6up上采样后乘上权值w2的结果做feature fusion得到P5up;再将P4in_1乘上权值w1,与P5up上采样后乘上权值w2的结果做feature fusion得到P4up;最后将P3in乘上权值w1,与P4up上采样后乘上权值w2的结果融合得到P3out。注意每次feature fusion之后都先经过swish激活,再接一个可分离卷积。
top-down分支总共有4组初始化全为1的权值(每组size为2),每个权值都经过relu激活函数,保证权值wi都≥0。在做特征融合之前需要归一化,即 wi / (w1 + w2 + epsilon),再乘上各自的input feature,也就是每融合一次进行一次加权求和。
当top-down分支操作结束后,继续判断,此时是否是第一次进行BiFPN,为True则获取P4in2和P5in2,直接将这俩个进行bottom-up分支的feature fusion,相当于残差连接,论文里面说这样可以在不增加计算成本的前提下融合更多的特征信息。
类似于top-down分支,第一组权值有3个,进行激活函数relu和归一化。Feature fusion:P4inx权值w1 + P4up x权值w2 + P3out经过下采样之后再x权值w3,融合之后使用swish激活函数并且接上一个可分离卷积得到bottom-up分支上的第一个输出P4out。
同理得到P5out和P6out,这三个feature fusion都有三个输入。
P7out的输入只有两个,downsample(P6out)和P7in,所以该节点的feature fusion只有两个权值,与P4out-P6out不同,但融合之后的操作是相同的。
最后返回的是bottom-up分支上的5个预测特征图的输出,作为下一个BiFPN的输入。
'''
#定义回归器
class Regressor(nn.Module):
    """Box-regression head applied to every pyramid level.

    The separable-conv tower and the output header are shared across levels,
    while each of the 5 levels owns its own BatchNorm layers.

    modified by Zylo117
    """

    def __init__(self, in_channels, num_anchors, num_layers, onnx_export=False):
        """Create the head.

        Args:
            in_channels: channels of every incoming feature map (kept through the tower).
            num_anchors: anchors per feature-map cell; the header emits ``num_anchors * 4`` channels.
            num_layers: number of separable-conv layers in the tower.
            onnx_export: if True, use Swish instead of MemoryEfficientSwish.
        """
        super(Regressor, self).__init__()
        self.num_layers = num_layers

        self.conv_list = nn.ModuleList(
            [SeparableConvBlock(in_channels, in_channels, norm=False, activation=False) for _ in range(num_layers)])
        # bn_list[level][layer]: one BatchNorm per tower layer for each of the 5 levels.
        self.bn_list = nn.ModuleList(
            [nn.ModuleList([nn.BatchNorm2d(in_channels, momentum=0.01, eps=1e-3) for _ in range(num_layers)])
             for _ in range(5)])
        self.header = SeparableConvBlock(in_channels, num_anchors * 4, norm=False, activation=False)
        self.swish = MemoryEfficientSwish() if not onnx_export else Swish()

    def forward(self, inputs):
        """Return the box regression output for all levels, concatenated along dim 1.

        Args:
            inputs: iterable of feature maps, one per pyramid level.

        Returns:
            Tensor whose last dimension is 4; the per-level results are
            concatenated along dimension 1.
        """
        feats = []
        for feat, level_bns in zip(inputs, self.bn_list):
            for bn, conv in zip(level_bns, self.conv_list):
                feat = self.swish(bn(conv(feat)))
            feat = self.header(feat)

            # Move channels last, then regroup them into rows of 4 box parameters.
            feat = feat.permute(0, 2, 3, 1)
            feat = feat.contiguous().view(feat.shape[0], -1, 4)

            feats.append(feat)

        return torch.cat(feats, dim=1)

#定义类别器
class Classifier(nn.Module):
    """Classification head applied to every pyramid level.

    The separable-conv tower and the output header are shared across levels,
    while each of the 5 levels owns its own BatchNorm layers.

    modified by Zylo117
    """

    def __init__(self, in_channels, num_anchors, num_classes, num_layers, onnx_export=False):
        """Create the head.

        Args:
            in_channels: channels of every incoming feature map (kept through the tower).
            num_anchors: anchors per feature-map cell.
            num_classes: number of classes scored per anchor.
            num_layers: number of separable-conv layers in the tower.
            onnx_export: if True, use Swish instead of MemoryEfficientSwish.
        """
        super(Classifier, self).__init__()
        self.num_anchors = num_anchors
        self.num_classes = num_classes
        self.num_layers = num_layers
        # conv_list: the shared tower, num_layers separable convs.
        self.conv_list = nn.ModuleList(
            [SeparableConvBlock(in_channels, in_channels, norm=False, activation=False) for _ in range(num_layers)])
        # bn_list[level][layer]: num_layers BatchNorm layers for each of the 5 levels.
        self.bn_list = nn.ModuleList(
            [nn.ModuleList([nn.BatchNorm2d(in_channels, momentum=0.01, eps=1e-3) for _ in range(num_layers)])
             for _ in range(5)])
        # header: output layer with num_anchors * num_classes channels.
        self.header = SeparableConvBlock(in_channels, num_anchors * num_classes, norm=False, activation=False)
        self.swish = MemoryEfficientSwish() if not onnx_export else Swish()

    def forward(self, inputs):
        """Return per-anchor class scores for all levels, concatenated along dim 1.

        Args:
            inputs: iterable of feature maps, one per pyramid level.

        Returns:
            Tensor whose last dimension is ``num_classes``, passed through a
            sigmoid; the per-level results are concatenated along dimension 1.
        """
        feats = []
        for feat, level_bns in zip(inputs, self.bn_list):
            for bn, conv in zip(level_bns, self.conv_list):
                feat = self.swish(bn(conv(feat)))
            feat = self.header(feat)

            # Channels last, then split the channel axis into (anchor, class);
            # this intermediate view also checks the channel count.
            feat = feat.permute(0, 2, 3, 1)
            feat = feat.contiguous().view(feat.shape[0], feat.shape[1], feat.shape[2], self.num_anchors,
                                          self.num_classes)
            # Flatten (h, w, anchor) into one anchor axis.
            feat = feat.view(feat.shape[0], -1, self.num_classes)

            feats.append(feat)

        feats = torch.cat(feats, dim=1)
        return feats.sigmoid()
'''
前向传播过程得到5个预测特征图上的所有anchor的预测类别分数,permute之后,tensor的size:[batchsize, 90x9,h, w] -> [batchsize,h, w, 9x90] 这里的9表示每个cell预测的anchor数,
90表示预测类别数,由于是使用COCO中的stuff categories所以这里类别数为90。第一个view之后的size:[batchsize, h ,w ,9, 90];
第二个view之后的size:[batchsize, hxwx9, 90];
再将一个batch下的所有图片在5个预测层上的anchors预测结果在维度1的位置上进行concat,输出的tensor.size=(batch_size, 49104, num_classes)。
最后将预测结果进行一个sigmoid函数得到每个anchors的预测类别分数。

'''

class EfficientNet(nn.Module):
    """EfficientNet wrapper that returns intermediate feature maps instead of logits.

    modified by Zylo117
    """

    def __init__(self, compound_coef, load_weights=False):
        """Build the backbone and remove its classification head.

        Args:
            compound_coef: EfficientNet variant index, used in the model name.
            load_weights: forwarded as second argument to ``EffNet.from_pretrained``.
        """
        super().__init__()
        backbone = EffNet.from_pretrained('efficientnet-b{}'.format(compound_coef), load_weights)
        # Head layers are not needed for feature extraction: drop them.
        for head_attr in ('_conv_head', '_bn1', '_avg_pooling', '_dropout', '_fc'):
            delattr(backbone, head_attr)
        self.model = backbone

    def forward(self, x):
        """Run stem and blocks, collecting the map preceding each stride-2 block plus the last one.

        Returns:
            The collected feature maps without the first collected entry.
        """
        net = self.model
        x = net._swish(net._bn0(net._conv_stem(x)))

        collected = []
        previous = None
        for idx, block in enumerate(net._blocks):
            # Drop-connect rate is scaled by the relative position of the block.
            rate = net._global_params.drop_connect_rate
            if rate:
                rate *= float(idx) / len(net._blocks)
            x = block(x, drop_connect_rate=rate)

            if block._depthwise_conv.stride == [2, 2]:
                # A stride-2 block starts a new resolution: keep the output of the block before it.
                collected.append(previous)
            elif idx == len(net._blocks) - 1:
                # The final block's own output is kept as well.
                collected.append(x)
            previous = x
        return collected[1:]


if __name__ == '__main__':
    from tensorboardX import SummaryWriter


    def count_parameters(model):
        """Return the total number of elements in the parameters of ``model`` that require gradients."""
        total = 0
        for param in model.parameters():
            if param.requires_grad:
                total += param.numel()
        return total


参考链接1
参考链接2

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值