爆改YOLOv8 | ASF-YOLO改进特征融合层（小目标检测）

不想敲代码！！！

已于 2024-08-14 11:22:56 修改

阅读量165

点赞数 5

分类专栏：爆改yolov8，即插即用文章标签： YOLO yolov8 目标检测人工智能计算机视觉

于 2024-08-14 11:19:24 首次发布

本文链接：https://blog.csdn.net/weixin_43986124/article/details/141185166

版权

爆改yolov8，即插即用专栏收录该内容

1 篇文章 0 订阅

订阅专栏

1，本文介绍

ASF-YOLO框架结合空间和尺度特征，用于准确快速的细胞实例分割。用三重特征编码器模块TPE融合不同OLO分割框架的基础上，采用尺度序列特征融合模块SSFF增强网络的多尺度信息提取能力，并采尺度的特征图以增加详细信息。进一步引入通道和位置注意力机制（CPAM），将SSFF和TPE模块集成在一起，专注于信息通道和与空间位置相关的小对象，以提高检测和分割性能。

本文将讲解如何将ASF-YOLO融合进yolov8，以提高小目标检测的性能。

话不多说，上代码！

2，将ASF-YOLO融合进YOLOv8

2.1 步骤一

首先找到如下的目录'ultralytics/nn/modules'，然后在这个目录下创建一个asfyolo.py文件，文件名字可以根据你自己的习惯起，然后将ASF-YOLO的核心代码复制进去。

# ASF-YOLO 的核心代码
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
 
 
def autopad(k, p=None, d=1):  # kernel, padding, dilation
    # Pad to 'same' shape outputs
    if d > 1:
        k = d * (k - 1) + 1 if isinstance(k, int) else [d * (x - 1) + 1 for x in k]  # actual kernel-size
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]  # auto-pad
    return p
 
 
class Conv(nn.Module):
    # Standard convolution with args(ch_in, ch_out, kernel, stride, padding, groups, dilation, activation)
    default_act = nn.SiLU()  # default activation
 
    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, d=1, act=True):
        super().__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p, d), groups=g, dilation=d, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        self.act = self.default_act if act is True else act if isinstance(act, nn.Module) else nn.Identity()
 
    def forward(self, x):
        return self.act(self.bn(self.conv(x)))
 
    def forward_fuse(self, x):
        return self.act(self.conv(x))
 
 
class Zoom_cat(nn.Module):
    def __init__(self):
        super().__init__()
        # self.conv_l_post_down = Conv(in_dim, 2*in_dim, 3, 1, 1)
 
    def forward(self, x):
        """l,m,s表示大中小三个尺度，最终会被整合到m这个尺度上"""
        l, m, s = x[0], x[1], x[2]
        tgt_size = m.shape[2:]
        l = F.adaptive_max_pool2d(l, tgt_size) + F.adaptive_avg_pool2d(l, tgt_size)
        # l = self.conv_l_post_down(l)
        # m = self.conv_m(m)
        # s = self.conv_s_pre_up(s)
        s = F.interpolate(s, m.shape[2:], mode='nearest')
        # s = self.conv_s_post_up(s)
        lms = torch.cat([l, m, s], dim=1)
        return lms
 
 
class ScalSeq(nn.Module):
    def __init__(self, inc, channel):
        super(ScalSeq, self).__init__()
        self.conv0 = Conv(inc[0], channel, 1)
        self.conv1 = Conv(inc[1], channel, 1)
        self.conv2 = Conv(inc[2], channel, 1)
        self.conv3d = nn.Conv3d(channel, channel, kernel_size=(1, 1, 1))
        self.bn = nn.BatchNorm3d(channel)
        self.act = nn.LeakyReLU(0.1)
        self.pool_3d = nn.MaxPool3d(kernel_size=(3,1,1))
 
    def forward(self, x):
        p3, p4, p5 = x[0], x[1], x[2]
        p3 = self.conv0(p3)
        p4_2 = self.conv1(p4)
        p4_2 = F.interpolate(p4_2, p3.size()[2:], mode='nearest')
        p5_2 = self.conv2(p5)
        p5_2 = F.interpolate(p5_2, p3.size()[2:], mode='nearest')
        p3_3d = torch.unsqueeze(p3, -3)
        p4_3d = torch.unsqueeze(p4_2, -3)
        p5_3d = torch.unsqueeze(p5_2, -3)
        combine = torch.cat([p3_3d, p4_3d, p5_3d],dim = 2)
        conv_3d = self.conv3d(combine)
        bn = self.bn(conv_3d)
        act = self.act(bn)
        x = self.pool_3d(act)
        x = torch.squeeze(x, 2)
        return x
 
 
class Add(nn.Module):
    # Concatenate a list of tensors along dimension
    def __init__(self, ch=256):
        super().__init__()
 
    def forward(self, x):
        input1, input2 = x[0], x[1]
        x = input1 + input2
        return x
 
 
class channel_att(nn.Module):
    def __init__(self, channel, b=1, gamma=2):
        super(channel_att, self).__init__()
        kernel_size = int(abs((math.log(channel, 2) + b) / gamma))
        kernel_size = kernel_size if kernel_size % 2 else kernel_size + 1
 
        self.avg_pool = nn.AdaptiveAvgPool2d(1)
        self.conv = nn.Conv1d(1, 1, kernel_size=kernel_size, padding=(kernel_size - 1) // 2, bias=False)
        self.sigmoid = nn.Sigmoid()
 
    def forward(self, x):
        y = self.avg_pool(x)
        y = y.squeeze(-1)
        y = y.transpose(-1, -2)
        y = self.conv(y).transpose(-1, -2).unsqueeze(-1)
        y = self.sigmoid(y)
        return x * y.expand_as(x)
 
 
class local_att(nn.Module):
    def __init__(self, channel, reduction=16):
        super(local_att, self).__init__()
 
        self.conv_1x1 = nn.Conv2d(in_channels=channel, out_channels=channel // reduction, kernel_size=1, stride=1,
                                  bias=False)
 
        self.relu = nn.ReLU()
        self.bn = nn.BatchNorm2d(channel // reduction)
 
        self.F_h = nn.Conv2d(in_channels=channel // reduction, out_channels=channel, kernel_size=1, stride=1,
                             bias=False)
        self.F_w = nn.Conv2d(in_channels=channel // reduction, out_channels=channel, kernel_size=1, stride=1,
                             bias=False)
 
        self.sigmoid_h = nn.Sigmoid()
        self.sigmoid_w = nn.Sigmoid()
 
    def forward(self, x):
        _, _, h, w = x.size()
 
        x_h = torch.mean(x, dim=3, keepdim=True).permute(0, 1, 3, 2)
        x_w = torch.mean(x, dim=2, keepdim=True)
 
        x_cat_conv_relu = self.relu(self.bn(self.conv_1x1(torch.cat((x_h, x_w), 3))))
 
        x_cat_conv_split_h, x_cat_conv_split_w = x_cat_conv_relu.split([h, w], 3)
 
        s_h = self.sigmoid_h(self.F_h(x_cat_conv_split_h.permute(0, 1, 3, 2)))
        s_w = self.sigmoid_w(self.F_w(x_cat_conv_split_w))
 
        out = x * s_h.expand_as(x) * s_w.expand_as(x)
        return out
 
 
class attention_model(nn.Module):
    # Concatenate a list of tensors along dimension
    def __init__(self, ch=256):
        super().__init__()
        self.channel_att = channel_att(ch)
        self.local_att = local_att(ch)
 
    def forward(self, x):
        input1, input2 = x[0], x[1]
        input1 = self.channel_att(input1)
        x = input1 + input2
        x = self.local_att(x)
        return x

2.2 步骤二

之后我们找到'ultralytics/nn/tasks.py'文件，在其中注册我们的ASF-YOLO模块。

首先我们需要在文件的开头导入我们的ASF-YOLO模块， 如下图所示

from .modules.asfyolo import attention_model, Add, ScalSeq, Zoom_cat

2.3 步骤三

找到parse_model方法，大概在六百多行。添加如下代码，如下图所示

代码如下

elif m is Zoom_cat:
    c2 = sum(ch[x] for x in f
elif m is Add:
    c2 = ch[f[-1]]   
elif m is ScalSeq:
    c1 = [ch[x] for x in f]
    c2 = make_divisible(args[0] * width, 8)
    args = [c1, c2]
elif m is attention_model:
    args = [ch[f[-1]]]

到此注册成功，复制后面给的yaml文件，然后直接运行即可

yaml文件

# Ultralytics YOLO 🚀, AGPL-3.0 license
# YOLOv8 object detection model with P3-P5 outputs. For Usage examples see https://docs.ultralytics.com/tasks/detect
 
# Parameters
nc: 80  # number of classes
scales: # model compound scaling constants, i.e. 'model=yolov8n.yaml' will call yolov8.yaml with scale 'n'
  # [depth, width, max_channels]
  n: [0.33, 0.25, 1024]  # YOLOv8n summary: 225 layers,  3157200 parameters,  3157184 gradients,   8.9 GFLOPs
  s: [0.33, 0.50, 1024]  # YOLOv8s summary: 225 layers, 11166560 parameters, 11166544 gradients,  28.8 GFLOPs
  m: [0.67, 0.75, 768]   # YOLOv8m summary: 295 layers, 25902640 parameters, 25902624 gradients,  79.3 GFLOPs
  l: [1.00, 1.00, 512]   # YOLOv8l summary: 365 layers, 43691520 parameters, 43691504 gradients, 165.7 GFLOPs
  x: [1.00, 1.25, 512]   # YOLOv8x summary: 365 layers, 68229648 parameters, 68229632 gradients, 258.5 GFLOPs
 
# YOLOv8.0n backbone
backbone:
  # [from, repeats, module, args]
  - [-1, 1, Conv, [64, 3, 2]]  # 0-P1/2
  - [-1, 1, Conv, [128, 3, 2]]  # 1-P2/4
  - [-1, 3, C2f, [128, True]]
  - [-1, 1, Conv, [256, 3, 2]]  # 3-P3/8
  - [-1, 6, C2f, [256, True]]
  - [-1, 1, Conv, [512, 3, 2]]  # 5-P4/16
  - [-1, 6, C2f, [512, True]]
  - [-1, 1, Conv, [1024, 3, 2]]  # 7-P5/32
  - [-1, 3, C2f, [1024, True]]
  - [-1, 1, SPPF, [1024, 5]]  # 9
 
# YOLOv8.0n head
head:
  - [-1, 1, Conv, [512, 1, 1]] # 10
  - [4, 1, Conv, [512, 1, 1]] # 11
  - [[-1, 6, -2], 1, Zoom_cat, []]  # 12 cat backbone P4
  - [-1, 3, C2f, [512]]  # 13
 
  - [-1, 1, Conv, [256, 1, 1]] # 14
  - [2, 1, Conv, [256, 1, 1]] # 15
  - [[-1, 4, -2], 1, Zoom_cat, []]  # 16  cat backbone P3
  - [-1, 3, C2f, [256]]  # 17 (P3/8-small)
 
  - [-1, 1, Conv, [256, 3, 2]] # 18
  - [[-1, 14], 1, Concat, [1]]  # 19 cat head P4
  - [-1, 3, C2f, [512]]  # 20 (P4/16-medium)
 
  - [-1, 1, Conv, [512, 3, 2]] # 21
  - [[-1, 10], 1, Concat, [1]]  # 22 cat head P5
  - [-1, 3, C2f, [1024]]  # 23 (P5/32-large)
 
  - [[4, 6, 8], 1, ScalSeq, [256]] # 24 args[inchane]
  - [[17, -1], 1, attention_model, [256]] # 25
 
  - [[25, 20, 23], 1, Detect, [nc]]

不知不觉已经看完了哦，动动小手留个点赞吧--_--