YOLO Improvement: CondConv, Conditionally Parameterized Convolutions for Efficient Inference (Dynamic Convolution)

Background

CondConv Theory

CondConv paper
CondConv code

Problem Background

The capacity of a CNN model comes from its size and from the scale of the dataset. A basic assumption in convolutional layer design is that the same convolution kernel is applied to every example in the dataset. To increase model capacity, developers typically add more convolutional layers or enlarge existing ones (kernel height/width, number of input/output channels). This makes the model larger and slows it down.

Introducing CondConv

Conditionally parameterized convolution (CondConv) challenges the paradigm of static kernels by computing the convolution kernel as a function of the input. "Conditional" refers to conditional computation, whose goal is to increase model capacity without increasing computational cost; conditional-computation models achieve this by activating only part of the network for each example. A related concept is the multi-branch convolutional layer, where a layer consists of multiple convolutional branches whose outputs are aggregated into the final result, as in ResNet and Inception. A CondConv layer is mathematically equivalent to a multi-branch convolutional layer in which each branch is a single convolution and the outputs are aggregated by a weighted sum, yet only one convolution ever has to be computed: Output(x) = σ((a1·W1 + … + an·Wn) ∗ x), where each routing weight ai = ri(x) depends on the input x.

The idea behind CondConv is not hard to follow. Concretely:

[Figure: CondConv routing, panel (a) of the paper's architecture diagram]

The kernel is parameterized as in panel (a) above: W1, W2, W3 are the expert kernels, and ROUTE FN produces the weights a1, a2, a3 placed in front of them. The combination a1·W1 + a2·W2 + a3·W3 yields a single kernel, and the convolution is carried out with this newly mixed kernel.
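The claimed equivalence to a multi-branch layer follows directly from the linearity of convolution: convolving with the mixed kernel a1·W1 + a2·W2 + a3·W3 gives the same result as running the three branch convolutions separately and summing their weighted outputs. A quick standalone numerical check (my own sketch, not from the paper's code):

import torch
import torch.nn.functional as F

x = torch.randn(1, 8, 16, 16)
experts = [torch.randn(4, 8, 3, 3) for _ in range(3)]  # W1, W2, W3
alpha = torch.rand(3)                                  # routing weights a1, a2, a3

mixed = sum(a * w for a, w in zip(alpha, experts))     # combine the kernels first
one_conv = F.conv2d(x, mixed)                          # the single CondConv convolution
branches = sum(a * F.conv2d(x, w) for a, w in zip(alpha, experts))  # three-branch version
print(torch.allclose(one_conv, branches, atol=1e-4))   # True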

CondConv Code

The code gives a more direct picture of what CondConv does (the implementation below follows CondConv-Pytorch):

import functools

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter
from torch.nn.modules.conv import _ConvNd
from torch.nn.modules.utils import _pair

# Routing-weight computation; num_experts is the configured number of experts
class _routing(nn.Module):

    def __init__(self, in_channels, num_experts, dropout_rate):
        super(_routing, self).__init__()

        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(in_channels, num_experts)

    def forward(self, x):
        x = torch.flatten(x)     # (1, c, 1, 1) -> (c,); valid because CondConv2D feeds one sample at a time
        x = self.dropout(x)
        x = self.fc(x)           # (c,) -> (num_experts,)
        return torch.sigmoid(x)  # F.sigmoid is deprecated; one (0, 1) weight per expert

# The CondConv convolution itself
class CondConv2D(_ConvNd):
    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1,
                 bias=True, padding_mode='zeros', num_experts=3, dropout_rate=0.2):
        kernel_size = _pair(kernel_size) # 3 -> (3,3)
        stride = _pair(stride)
        padding = _pair(padding)
        dilation = _pair(dilation)
        super(CondConv2D, self).__init__(
            in_channels, out_channels, kernel_size, stride, padding, dilation,
            False, _pair(0), groups, bias, padding_mode)

        # Global average pooling turns the input feature map (bs, c, w, h) into (bs, c, 1, 1),
        # preparing it for the routing-weight computation
        self._avg_pooling = functools.partial(F.adaptive_avg_pool2d, output_size=(1, 1))
        # The routing function maps each sample's pooled (1, c, 1, 1) descriptor to
        # num_experts routing weights
        self._routing_fn = _routing(in_channels, num_experts, dropout_rate)

        # One kernel per expert: a tensor of shape
        # (num_experts, out_channels, in_channels // groups, kernel_size, kernel_size)
        self.weight = Parameter(torch.Tensor(
            num_experts, out_channels, in_channels // groups, *kernel_size))

        self.reset_parameters()

    def _conv_forward(self, input, weight):
        if self.padding_mode != 'zeros':
            # note: older PyTorch versions named this attribute _padding_repeated_twice
            return F.conv2d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
                            weight, self.bias, self.stride,
                            _pair(0), self.dilation, self.groups)
        return F.conv2d(input, weight, self.bias, self.stride,
                        self.padding, self.dilation, self.groups)

    def forward(self, inputs):
        b, _, _, _ = inputs.size()
        res = []
        # inputs: (bs, c, w, h); each input in the loop: (c, w, h)
        for input in inputs:
            input = input.unsqueeze(0)                         # (1, c, w, h)
            pooled_inputs = self._avg_pooling(input)           # (1, c, 1, 1)
            routing_weights = self._routing_fn(pooled_inputs)  # (num_experts,)
            # Weighted sum over dim 0: mix the expert kernels into a single kernel
            kernels = torch.sum(routing_weights[:, None, None, None, None] * self.weight, 0)
            out = self._conv_forward(input, kernels)
            res.append(out)
        return torch.cat(res, dim=0)

# Test
x = torch.rand(1, 20, 40, 40)
condconv = CondConv2D(20, 40, 3, num_experts=3)
print(condconv(x).size()) # [1, 40, 38, 38]
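One thing worth noting: the forward above loops over the batch one sample at a time, which is simple but slow for large batches. A common alternative folds the batch into the group dimension so that a single F.conv2d call applies each sample's own mixed kernel. The sketch below shows this standard grouped-convolution trick; it is my own illustration, not part of the linked repository, and batched_condconv is a hypothetical helper name:

import torch
import torch.nn.functional as F

def batched_condconv(x, expert_weights, routing_weights, padding=0):
    # x: (b, c_in, h, w); expert_weights: (e, c_out, c_in, k, k); routing_weights: (b, e)
    b, c_in, h, w = x.shape
    e, c_out, _, kh, kw = expert_weights.shape
    # Mix the experts into one kernel per sample: (b, c_out, c_in, kh, kw)
    kernels = torch.einsum('be,eoikl->boikl', routing_weights, expert_weights)
    # Fold the batch into the group dimension so one conv call serves all samples
    x = x.reshape(1, b * c_in, h, w)
    kernels = kernels.reshape(b * c_out, c_in, kh, kw)
    out = F.conv2d(x, kernels, padding=padding, groups=b)
    return out.reshape(b, c_out, out.shape[-2], out.shape[-1])

x = torch.randn(2, 8, 16, 16)
w = torch.randn(4, 10, 8, 3, 3)  # 4 experts, 10 output channels
r = torch.rand(2, 4)             # per-sample routing weights
print(batched_condconv(x, w, r, padding=1).shape)  # torch.Size([2, 10, 16, 16])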

The key steps are annotated in the comments above; stepping through the code in a debugger makes it easier to see how this computation differs from the channel attention used in attention mechanisms. CondConv builds num_experts kernels of shape (out_channels, in_channels // groups, kernel_size, kernel_size) and fuses these num_experts kernels into one before convolving, whereas channel attention re-weights the channels produced by a single convolution.
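To make that contrast concrete, here is a minimal SE-style channel-attention convolution (an illustrative sketch of my own; ChannelAttentionConv is a hypothetical name, not a YOLO or CondConv module). It applies one static kernel and then re-weights the output channels, whereas CondConv re-weights entire kernels before the convolution happens:

import torch
import torch.nn as nn

class ChannelAttentionConv(nn.Module):
    # One static kernel shared by all samples; attention rescales the OUTPUT channels
    def __init__(self, c_in, c_out, k=3):
        super().__init__()
        self.conv = nn.Conv2d(c_in, c_out, k, padding=k // 2)
        self.fc = nn.Linear(c_out, c_out)

    def forward(self, x):
        y = self.conv(x)                                 # same kernel for every sample
        s = y.mean(dim=(2, 3))                           # (b, c_out) squeeze
        w = torch.sigmoid(self.fc(s))[:, :, None, None]  # per-channel weights
        return y * w                                     # rescale channels; the kernel never changes

# CondConv, by contrast, mixes whole kernels BEFORE the convolution:
#   kernel(x) = a1(x)*W1 + a2(x)*W2 + a3(x)*W3, then a single conv with kernel(x)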

Using CondConv in YOLO

Consider introducing the CondConv module into the C3 block. Paste the following code into common.py:
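Note that this snippet relies on a few names that the stock common.py may not import; if they are missing, add the following near the top of the file (assuming a reasonably recent PyTorch):

import functools
import torch.nn.functional as F
from torch.nn import Parameter
from torch.nn.modules.conv import _ConvNd
from torch.nn.modules.utils import _pair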

class _routing(nn.Module):

    def __init__(self, in_channels, num_experts, dropout_rate):
        super(_routing, self).__init__()

        self.dropout = nn.Dropout(dropout_rate)
        self.fc = nn.Linear(in_channels, num_experts)

    def forward(self, x):
        x = torch.flatten(x)
        x = self.dropout(x)
        x = self.fc(x)
        return torch.sigmoid(x)  # F.sigmoid is deprecated


class CondConv2D(_ConvNd):

    def __init__(self, in_channels, out_channels, kernel_size, stride=1,
                 padding=0, dilation=1, groups=1,
                 bias=True, padding_mode='zeros', num_experts=3, dropout_rate=0.2):
        kernel_size = _pair(kernel_size) # 3 -> (3,3)
        stride = _pair(stride)
        padding = _pair(padding)
        dilation = _pair(dilation)
        super(CondConv2D, self).__init__(
            in_channels, out_channels, kernel_size, stride, padding, dilation,
            False, _pair(0), groups, bias, padding_mode)

        self._avg_pooling = functools.partial(F.adaptive_avg_pool2d, output_size=(1, 1))
        self._routing_fn = _routing(in_channels, num_experts, dropout_rate)

        self.weight = Parameter(torch.Tensor(
            num_experts, out_channels, in_channels // groups, *kernel_size))

        self.reset_parameters()

    def _conv_forward(self, input, weight):
        if self.padding_mode != 'zeros':
            # note: older PyTorch versions named this attribute _padding_repeated_twice
            return F.conv2d(F.pad(input, self._reversed_padding_repeated_twice, mode=self.padding_mode),
                            weight, self.bias, self.stride,
                            _pair(0), self.dilation, self.groups)
        return F.conv2d(input, weight, self.bias, self.stride,
                        self.padding, self.dilation, self.groups)

    def forward(self, inputs):
        b, _, _, _ = inputs.size()
        res = []
        for input in inputs:
            input = input.unsqueeze(0)
            pooled_inputs = self._avg_pooling(input)
            routing_weights = self._routing_fn(pooled_inputs)
            kernels = torch.sum(routing_weights[:, None, None, None, None] * self.weight, 0)
            out = self._conv_forward(input, kernels)
            res.append(out)
        return torch.cat(res, dim=0)

class C3_CondConv(nn.Module):
    # CSP Bottleneck with 3 convolutions, with CondConv replacing the plain convs
    # NOTE: n (number of repeats) must come before num_experts, because parse_model
    # inserts the repeat count at position 2 of the args list
    def __init__(self, c1, c2, n=1, num_experts=3, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, experts, shortcut, groups, expansion
        super().__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = CondConv2D(c1, c_, kernel_size=1, stride=1, num_experts=num_experts)
        self.cv2 = CondConv2D(c1, c_, kernel_size=1, stride=1, num_experts=num_experts)
        self.cv3 = CondConv2D(2 * c_, c2, kernel_size=1, num_experts=num_experts)  # optional act=FReLU(c2)
        self.m = nn.Sequential(*(Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)))

    def forward(self, x):
        return self.cv3(torch.cat((self.m(self.cv1(x)), self.cv2(x)), 1))

# ------------------------------- Test code; do not paste this part into common.py -------------------------------
x = torch.rand(1, 20, 40, 40)
condconv = CondConv2D(20, 40, 3, num_experts=3)
print(condconv(x).size())    # torch.Size([1, 40, 38, 38])
c3_condconv = C3_CondConv(20, 40, 3)
print(c3_condconv(x).size()) # torch.Size([1, 40, 40, 40])

In yolo.py, extend the def parse_model(d, ch): function by adding C3_CondConv to the two module sets, as follows:

if m in {
        Conv, GhostConv, Bottleneck, GhostBottleneck, SPP, SPPF, DWConv, MixConv2d, Focus, CrossConv,
        BottleneckCSP, C3, C3TR, C3SPP, C3Ghost, nn.ConvTranspose2d, DWConvTranspose2d, C3x, C3RFEM, RFEM_KCPNet, C3_CondConv}:
    c1, c2 = ch[f], args[0]
    if c2 != no:  # if not output
        c2 = make_divisible(c2 * gw, 8)

    args = [c1, c2, *args[1:]]
    if m in {BottleneckCSP, C3, C3TR, C3Ghost, C3x, C3RFEM, C3_CondConv}:
        args.insert(2, n)  # number of repeats
        n = 1
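To see why C3_CondConv's constructor takes n before num_experts, trace how this standard parse_model logic transforms the args for a backbone entry such as [-1, 3, C3_CondConv, [128, 3]] (a worked trace under the yolov5s multiples shown in the yaml below, depth 0.33 and width 0.5):

# yaml entry: [-1, 3, C3_CondConv, [128, 3]]  ->  3 repeats, args = [128, 3]
# n = max(round(3 * 0.33), 1) = 1             # depth scaling
# c1, c2 = ch[f], args[0]                     # c2 = 128
# c2 = make_divisible(128 * 0.5, 8) = 64      # width scaling
# args = [c1, c2, *args[1:]]                  # -> [c1, 64, 3]
# args.insert(2, n)                           # -> [c1, 64, 1, 3]
# C3_CondConv(c1, 64, 1, 3)                   # -> n=1 repeat, num_experts=3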

Set up the corresponding yaml file:

# YOLOv5 🚀 by Ultralytics, AGPL-3.0 license

# Parameters
nc: 80  # number of classes
depth_multiple: 0.33  # model depth multiple
width_multiple: 0.50  # layer channel multiple
anchors:
  - [10,13, 16,30, 33,23]  # P3/8
  - [30,61, 62,45, 59,119]  # P4/16
  - [116,90, 156,198, 373,326]  # P5/32

# YOLOv5 v6.0 backbone
backbone:
  # [from, number, module, args]
  [[-1, 1, Conv, [64, 6, 2, 2]],  # 0-P1/2
   [-1, 1, Conv, [128, 3, 2]],  # 1-P2/4
   [-1, 3, C3_CondConv, [128, 3]],
   [-1, 1, Conv, [256, 3, 2]],  # 3-P3/8
   [-1, 6, C3_CondConv, [256, 3]],
   [-1, 1, Conv, [512, 3, 2]],  # 5-P4/16
   [-1, 9, C3_CondConv, [512, 3]],
   [-1, 1, Conv, [1024, 3, 2]],  # 7-P5/32
   [-1, 3, C3_CondConv, [1024, 3]],
   [-1, 1, SPPF, [1024, 5]],  # 9
  ]

# YOLOv5 v6.0 head
head:
  [[-1, 1, Conv, [512, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 6], 1, Concat, [1]],  # cat backbone P4
   [-1, 3, C3_CondConv, [512, 3, False]],  # 13

   [-1, 1, Conv, [256, 1, 1]],
   [-1, 1, nn.Upsample, [None, 2, 'nearest']],
   [[-1, 4], 1, Concat, [1]],  # cat backbone P3
   [-1, 3, C3_CondConv, [256, 3, False]],  # 17 (P3/8-small)

   [-1, 1, Conv, [256, 3, 2]],
   [[-1, 14], 1, Concat, [1]],  # cat head P4
   [-1, 3, C3_CondConv, [512, 3, False]],  # 20 (P4/16-medium)

   [-1, 1, Conv, [512, 3, 2]],
   [[-1, 10], 1, Concat, [1]],  # cat head P5
   [-1, 3, C3_CondConv, [1024, 3, False]],  # 23 (P5/32-large)

   [[17, 20, 23], 1, Detect, [nc, anchors]],  # Detect(P3, P4, P5)
  ]

A brief note on the yaml parameters: in C3_CondConv, [128, 3], 128 keeps its original meaning (the number of output channels), while 3 is the num_experts value for CondConv and can be changed.
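To confirm the model builds end to end, you can instantiate it from the new yaml in the usual YOLOv5 way (assuming you saved it as, say, models/yolov5s-condconv.yaml; the file name is your choice):

python models/yolo.py --cfg models/yolov5s-condconv.yaml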

Experiments on the VisDrone Dataset

To be continued.
