今天复现的这篇文章是来自北大核心,CSCD:
[1]齐向明,柴蕊,高一萌.重构SPPCSPC与优化下采样的小目标检测算法[J/OL].计算机工程与应用:1-11[2023-08-14].http://kns.cnki.net/kcms/detail/11.2127.TP.20230628.1548.011.html.
文章在YOLOV7的基础上主要改进的部分分为以下几点:
1.对neck部分的下采样部分添加了四倍下采样的过程,使得对小目标的检测更加准确;
2.对SPPCSPC的前几个卷积层进行了裁剪,并加入了simAM注意力机制,并缩小了池化核;
3.把颈部网络中的下采样过程部分卷积使用了SPD Conv替换;
4.使用wise-IOU损失函数对原先的损失函数进行替换;
话不多说,直接上对比图:
这是原先的YOLOV7结构图:
接着是论文中改进的结构图:
图中画虚线的地方就是修改的部分,接着我们一步一步来剖析这个结构。
四倍下采样过程
首先来看这个结构最麻烦,也是复现起来最繁琐的一部分:添加下采样过程,原来的YOLOV7结构是有2个上采样和下采样的过程,总共输出3个不同的特征图;
而在新的结构中,最终输出了四个特征图,新添加了(160, 160, 45)的特征图,最终可以在这个特征图上添加尺寸较小的先验框,去检测小目标;
首先我们先下载好YOLOV7的源码(转载自他人):
GitHub - bubbliiiing/yolov7-pytorch: 这是一个yolov7的库,可以用于训练自己的数据集。
有了源码之后,参考源码中nets/yolo.py和nets/backbone.py文件去进行修改,这是我修改后的yolo.py文件代码(完整版):
import numpy as np
import torch
import torch.nn as nn
from nets.simAm import simam_module
from nets.backbone import Backbone, Multi_Concat_Block, Conv, SiLU, Transition_Block, Transition_Block_SPD, autopad
class space_to_depth(nn.Module):
    """Move every 2x2 spatial neighbourhood into the channel axis.

    The four interleaved sub-grids taken from the last two axes are
    concatenated along axis 1, so a 4-D input ``(B, C, H, W)`` with even
    ``H`` and ``W`` becomes ``(B, 4 * C, H // 2, W // 2)``.
    """

    def __init__(self, dimension=1):
        super().__init__()
        # Kept for interface compatibility; forward() always concatenates on axis 1.
        self.d = dimension

    def forward(self, x):
        even_rows_even_cols = x[..., ::2, ::2]
        odd_rows_even_cols = x[..., 1::2, ::2]
        even_rows_odd_cols = x[..., ::2, 1::2]
        odd_rows_odd_cols = x[..., 1::2, 1::2]
        return torch.cat(
            (
                even_rows_even_cols,
                odd_rows_even_cols,
                even_rows_odd_cols,
                odd_rows_odd_cols,
            ),
            1,
        )
class SPPCSPC(nn.Module):
    """Modified SPPCSPC block with SimAM attention in front of the pooling pyramid.

    ``cv1`` followed by ``simam`` feeds three stride-1 max-pools (kernel sizes
    ``k``); the pooled maps are concatenated with their input, passed through
    ``cv5``/``cv6`` and merged with the ``cv2`` shortcut by ``cv7``.

    CSP reference: https://github.com/WongKinYiu/CrossStagePartialNetworks
    """

    def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5, k=(3, 5, 9)):
        super().__init__()
        hidden = int(2 * c2 * e)  # hidden channels
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv(c1, hidden, 1, 1)
        # NOTE: cv3 and cv4 are still constructed, but forward() no longer calls
        # them (the former cv4(cv3(cv1(x))) path was replaced by simam(cv1(x))).
        self.cv3 = Conv(hidden, hidden, 3, 1)
        self.cv4 = Conv(hidden, hidden, 1, 1)
        self.m = nn.ModuleList(
            [nn.MaxPool2d(kernel_size=size, stride=1, padding=size // 2) for size in k]
        )
        self.cv5 = Conv(4 * hidden, hidden, 1, 1)
        self.cv6 = Conv(hidden, hidden, 3, 1)
        # The block emits c2 channels.
        self.cv7 = Conv(2 * hidden, c2, 1, 1)
        self.simam = simam_module(hidden)

    def forward(self, x):
        # Pooling branch: cv1, then the newly added SimAM attention.
        attended = self.simam(self.cv1(x))
        pyramid = [attended]
        for pool in self.m:
            pyramid.append(pool(attended))
        y1 = self.cv6(self.cv5(torch.cat(pyramid, 1)))
        # Shortcut branch.
        y2 = self.cv2(x)
        return self.cv7(torch.cat((y1, y2), dim=1))
class RepConv(nn.Module):
    """Re-parameterizable 3x3 convolution block (https://arxiv.org/abs/2101.03697).

    In training form the output is
    ``act(rbr_dense(x) + rbr_1x1(x) + rbr_identity(x))``, where the identity
    branch only exists when ``c1 == c2`` and ``s == 1``.  The branches can be
    merged into one biased 3x3 convolution stored as ``rbr_reparam``; once
    that attribute exists, ``forward`` uses it exclusively.
    """

    def __init__(self, c1, c2, k=3, s=1, p=None, g=1, act=SiLU(), deploy=False):
        super().__init__()
        self.deploy = deploy
        self.groups = g
        self.in_channels = c1
        self.out_channels = c2

        # Only a 3x3 kernel with padding 1 is supported.
        assert k == 3
        pad_3x3 = autopad(k, p)
        assert pad_3x3 == 1
        pad_1x1 = pad_3x3 - k // 2

        if act is True:
            self.act = nn.LeakyReLU(0.1, inplace=True)
        elif isinstance(act, nn.Module):
            self.act = act
        else:
            self.act = nn.Identity()

        if deploy:
            # Inference form: a single biased convolution.
            self.rbr_reparam = nn.Conv2d(c1, c2, k, s, pad_3x3, groups=g, bias=True)
            return

        keeps_shape = c2 == c1 and s == 1
        self.rbr_identity = (
            nn.BatchNorm2d(num_features=c1, eps=0.001, momentum=0.03)
            if keeps_shape
            else None
        )
        self.rbr_dense = nn.Sequential(
            nn.Conv2d(c1, c2, k, s, pad_3x3, groups=g, bias=False),
            nn.BatchNorm2d(num_features=c2, eps=0.001, momentum=0.03),
        )
        self.rbr_1x1 = nn.Sequential(
            nn.Conv2d(c1, c2, 1, s, pad_1x1, groups=g, bias=False),
            nn.BatchNorm2d(num_features=c2, eps=0.001, momentum=0.03),
        )

    def forward(self, inputs):
        if hasattr(self, "rbr_reparam"):
            return self.act(self.rbr_reparam(inputs))

        id_out = 0 if self.rbr_identity is None else self.rbr_identity(inputs)
        return self.act(self.rbr_dense(inputs) + self.rbr_1x1(inputs) + id_out)

    def get_equivalent_kernel_bias(self):
        """Return the (kernel, bias) of the single 3x3 conv equivalent to all branches."""
        dense_kernel, dense_bias = self._fuse_bn_tensor(self.rbr_dense)
        point_kernel, point_bias = self._fuse_bn_tensor(self.rbr_1x1)
        id_kernel, id_bias = self._fuse_bn_tensor(self.rbr_identity)
        kernel = dense_kernel + self._pad_1x1_to_3x3_tensor(point_kernel) + id_kernel
        bias = dense_bias + point_bias + id_bias
        return kernel, bias

    def _pad_1x1_to_3x3_tensor(self, kernel1x1):
        """Zero-pad a 1x1 kernel to 3x3; ``None`` maps to 0."""
        if kernel1x1 is None:
            return 0
        return nn.functional.pad(kernel1x1, [1, 1, 1, 1])

    def _fuse_bn_tensor(self, branch):
        """Fold a branch's BatchNorm statistics into a (kernel, bias) pair.

        ``branch`` is ``None`` (returns ``(0, 0)``), a conv+BN ``nn.Sequential``,
        or a bare ``nn.BatchNorm2d`` acting as the identity branch.
        """
        if branch is None:
            return 0, 0

        if isinstance(branch, nn.Sequential):
            kernel = branch[0].weight
            bn = branch[1]
        else:
            assert isinstance(branch, nn.BatchNorm2d)
            if not hasattr(self, "id_tensor"):
                # Build (and cache) a 3x3 kernel that passes each channel through.
                input_dim = self.in_channels // self.groups
                kernel_value = np.zeros(
                    (self.in_channels, input_dim, 3, 3), dtype=np.float32
                )
                channel_idx = np.arange(self.in_channels)
                kernel_value[channel_idx, channel_idx % input_dim, 1, 1] = 1
                self.id_tensor = torch.from_numpy(kernel_value).to(branch.weight.device)
            kernel = self.id_tensor
            bn = branch

        std = (bn.running_var + bn.eps).sqrt()
        scale = (bn.weight / std).reshape(-1, 1, 1, 1)
        return kernel * scale, bn.bias - bn.running_mean * bn.weight / std

    def repvgg_convert(self):
        """Return the equivalent kernel and bias as detached CPU NumPy arrays."""
        kernel, bias = self.get_equivalent_kernel_bias()
        return kernel.detach().cpu().numpy(), bias.detach().cpu().numpy()

    def fuse_conv_bn(self, conv, bn):
        """Return a new biased ``nn.Conv2d`` equal to ``bn(conv(x))`` under bn's running stats."""
        std = (bn.running_var + bn.eps).sqrt()
        fused_bias = bn.bias - bn.running_mean * bn.weight / std
        scale = (bn.weight / std).reshape(-1, 1, 1, 1)
        fused_weight = conv.weight * scale

        fused = nn.Conv2d(
            in_channels=conv.in_channels,
            out_channels=conv.out_channels,
            kernel_size=conv.kernel_size,
            stride=conv.stride,
            padding=conv.padding,
            dilation=conv.dilation,
            groups=conv.groups,
            bias=True,
            padding_mode=conv.padding_mode,
        )
        fused.weight = torch.nn.Parameter(fused_weight)
        fused.bias = torch.nn.Parameter(fused_bias)
        return fused

    def fuse_repvgg_block(self):
        """Merge all branches into ``rbr_reparam`` in place; no-op when already deployed."""
        if self.deploy:
            return
        print("RepConv.fuse_repvgg_block")

        self.rbr_dense = self.fuse_conv_bn(self.rbr_dense[0], self.rbr_dense[1])
        self.rbr_1x1 = self.fuse_conv_bn(self.rbr_1x1[0], self.rbr_1x1[1])
        rbr_1x1_bias = self.rbr_1x1.bias
        weight_1x1_expanded = torch.nn.functional.pad(self.rbr_1x1.weight, [1, 1, 1, 1])

        # Fuse self.rbr_identity
        if isinstance(
            self.rbr_identity, (nn.BatchNorm2d, nn.modules.batchnorm.SyncBatchNorm)
        ):
            identity_conv_1x1 = nn.Conv2d(
                in_channels=self.in_channels,
                out_channels=self.out_channels,
                kernel_size=1,
                stride=1,
                padding=0,
                groups=self.groups,
                bias=False,
            )
            # Turn the 1x1 conv into an identity mapping before folding the BN in.
            eye = identity_conv_1x1.weight.data.to(self.rbr_1x1.weight.data.device)
            eye = eye.squeeze().squeeze()
            eye.fill_(0.0)
            eye.fill_diagonal_(1.0)
            identity_conv_1x1.weight.data = eye.unsqueeze(2).unsqueeze(3)

            identity_conv_1x1 = self.fuse_conv_bn(identity_conv_1x1, self.rbr_identity)
            bias_identity_expanded = identity_conv_1x1.bias
            weight_identity_expanded = torch.nn.functional.pad(
                identity_conv_1x1.weight, [1, 1, 1, 1]
            )
        else:
            bias_identity_expanded = torch.nn.Parameter(torch.zeros_like(rbr_1x1_bias))
            weight_identity_expanded = torch.nn.Parameter(
                torch.zeros_like(weight_1x1_expanded)
            )

        self.rbr_dense.weight = torch.nn.Parameter(
            self.rbr_dense.weight + weight_1x1_expanded + weight_identity_expanded
        )
        self.rbr_dense.bias = torch.nn.Parameter(
            self.rbr_dense.bias + rbr_1x1_bias + bias_identity_expanded
        )

        self.rbr_reparam = self.rbr_dense
        self.deploy = True

        # Drop the training-time branches; rbr_reparam keeps the fused conv alive.
        for branch_name in ("rbr_identity", "rbr_1x1", "rbr_dense"):
            if getattr(self, branch_name) is not None:
                delattr(self, branch_name)
                setattr(self, branch_name, None)
def fuse_conv_and_bn(conv, bn):
    """Fold a BatchNorm2d into the Conv2d that precedes it.

    The BatchNorm is applied with its running statistics, so the result
    matches ``bn(conv(x))`` when ``bn`` is in eval mode.

    Args:
        conv: ``nn.Conv2d`` whose output feeds ``bn`` (bias optional).
        bn: ``nn.BatchNorm2d`` with affine parameters and running statistics.

    Returns:
        A new biased ``nn.Conv2d`` with ``requires_grad`` disabled, placed on
        the device of ``conv.weight``.  Kernel size, stride, padding,
        dilation, groups and padding mode are copied from ``conv``.
    """
    # dilation and padding_mode must be forwarded as well; otherwise a dilated
    # (or non-zero padded) conv would be replaced by one computing something else.
    fusedconv = nn.Conv2d(conv.in_channels,
                          conv.out_channels,
                          kernel_size=conv.kernel_size,
                          stride=conv.stride,
                          padding=conv.padding,
                          dilation=conv.dilation,
                          groups=conv.groups,
                          bias=True,
                          padding_mode=conv.padding_mode).requires_grad_(False).to(conv.weight.device)

    # Weight: scale every output channel by gamma / sqrt(running_var + eps).
    w_conv = conv.weight.clone().view(conv.out_channels, -1)
    w_bn = torch.diag(bn.weight.div(torch.sqrt(bn.eps + bn.running_var)))
    fusedconv.weight.copy_(torch.mm(w_bn, w_conv).view(fusedconv.weight.shape).detach())

    # Bias: scaled conv bias (zero when absent) plus the BatchNorm shift.
    b_conv = torch.zeros(conv.weight.size(0), device=conv.weight.device) if conv.bias is None else conv.bias
    b_bn = bn.bias - bn.weight.mul(bn.running_mean).div(torch.sqrt(bn.running_var + bn.eps))
    fusedconv.bias.copy_((torch.mm(w_bn, b_conv.reshape(-1, 1)).reshape(-1) + b_bn).detach())

    return fusedconv
#---------------------------------------------------#
# yolo_body
#---------------------------------------------------#
class YoloBody(nn.Module):
    """YOLOv7 detection body extended with an extra high-resolution (P2) branch.

    ``forward`` returns four raw head outputs ``[out0, out1, out2, out3]``
    computed from the P5, P4, P3 and P2 feature maps respectively, so
    ``anchors_mask`` must provide four entries (index 3 sizes the new P2 head).

    The ``H, W, C`` shape notes in the comments are carried over from the
    original annotations; they correspond to a 640x640 input with ``phi='l'``.

    Args:
        anchors_mask: per-head anchor groups; only the length of each group is
            used here, to size the head convolutions.
        num_classes: number of object classes.
        phi: model variant, ``'l'`` or ``'x'`` (any other key raises ``KeyError``).
        pretrained: forwarded to ``Backbone``.
    """

    def __init__(self, anchors_mask, num_classes, phi, pretrained=False):
        super(YoloBody, self).__init__()
        #-----------------------------------------------#
        #   Hyper-parameters of the supported yolov7 variants
        #-----------------------------------------------#
        transition_channels = {'l': 32, 'x': 40}[phi]
        block_channels = 32
        panet_channels = {'l': 32, 'x': 64}[phi]
        e = {'l': 2, 'x': 1}[phi]
        n = {'l': 4, 'x': 6}[phi]
        ids = {'l': [-1, -2, -3, -4, -5, -6], 'x': [-1, -3, -5, -7, -8]}[phi]
        conv = {'l': RepConv, 'x': Conv}[phi]
        #-----------------------------------------------#
        #   Input image: 640, 640, 3
        #-----------------------------------------------#

        #---------------------------------------------------#
        #   Backbone.
        #   forward() unpacks four feature maps from it:
        #   feat1 80, 80, 512 / feat2 40, 40, 1024 / feat3 20, 20, 1024
        #   plus new_feat, which this neck consumes at 160, 160 resolution.
        #---------------------------------------------------#
        self.backbone = Backbone(transition_channels, block_channels, n, phi, pretrained=pretrained)

        #------------------------ feature-enhancement network (neck) ------------------------#
        self.upsample = nn.Upsample(scale_factor=2, mode="nearest")

        # 20, 20, 1024 => 20, 20, 512
        self.sppcspc = SPPCSPC(transition_channels * 32, transition_channels * 16)
        # 20, 20, 512 => 20, 20, 256 => 40, 40, 256
        self.conv_for_P5 = Conv(transition_channels * 16, transition_channels * 8)
        # 40, 40, 1024 => 40, 40, 256
        self.conv_for_feat2 = Conv(transition_channels * 32, transition_channels * 8)
        # 40, 40, 512 => 40, 40, 256
        self.conv3_for_upsample1 = Multi_Concat_Block(transition_channels * 16, panet_channels * 4, transition_channels * 8, e=e, n=n, ids=ids)

        # 40, 40, 256 => 40, 40, 128 => 80, 80, 128
        self.conv_for_P4 = Conv(transition_channels * 8, transition_channels * 4)
        # 80, 80, 512 => 80, 80, 128
        self.conv_for_feat1 = Conv(transition_channels * 16, transition_channels * 4)
        # 80, 80, 256 => 80, 80, 128
        self.conv3_for_upsample2 = Multi_Concat_Block(transition_channels * 8, panet_channels * 2, transition_channels * 4, e=e, n=n, ids=ids)

        # ---------------------------------------------------------------------------
        # Newly added layers for the extra upsampling step (80x80 => 160x160).
        self.conv_for_P3 = Conv(transition_channels * 4, transition_channels * 2)
        self.conv_for_newfeat = Conv(transition_channels * 8, transition_channels * 2)
        self.conv3_for_upsample3 = Multi_Concat_Block(transition_channels * 4, panet_channels * 1, transition_channels * 2, e=e, n=n, ids=ids)
        # ---------------------------------------------------------------------------

        # 80, 80, 128 => 40, 40, 256
        self.down_sample1 = Transition_Block_SPD(transition_channels * 4, transition_channels * 4)
        # 40, 40, 512 => 40, 40, 256
        self.conv3_for_downsample1 = Multi_Concat_Block(transition_channels * 16, panet_channels * 4, transition_channels * 8, e=e, n=n, ids=ids)

        # 40, 40, 256 => 20, 20, 512
        self.down_sample2 = Transition_Block_SPD(transition_channels * 8, transition_channels * 8)
        # 20, 20, 1024 => 20, 20, 512
        self.conv3_for_downsample2 = Multi_Concat_Block(transition_channels * 32, panet_channels * 8, transition_channels * 16, e=e, n=n, ids=ids)

        # ---------------------------------------------------------------------------
        # Newly added downsampling step.
        # 160, 160, 64 => 80, 80, 128
        self.down_sample3 = Transition_Block_SPD(transition_channels * 2, transition_channels * 2)
        # 80, 80, 256 => 80, 80, 128
        self.conv3_for_downsample3 = Multi_Concat_Block(transition_channels * 8, panet_channels * 2, transition_channels * 4, e=e, n=n, ids=ids)
        # ---------------------------------------------------------------------------
        #------------------------ feature-enhancement network (neck) ------------------------#

        # 80, 80, 128 => 80, 80, 256
        self.rep_conv_1 = conv(transition_channels * 4, transition_channels * 8, 3, 1)
        # 40, 40, 256 => 40, 40, 512
        self.rep_conv_2 = conv(transition_channels * 8, transition_channels * 16, 3, 1)
        # 20, 20, 512 => 20, 20, 1024
        self.rep_conv_3 = conv(transition_channels * 16, transition_channels * 32, 3, 1)
        # 160, 160, 64 => 160, 160, 128 (newly added)
        self.rep_conv_4 = conv(transition_channels * 2, transition_channels * 4, 3, 1)

        # Detection heads: each emits len(mask) * (4 + 1 + num_classes) channels,
        # e.g. 3 * 25 for 20 classes or 3 * 85 for 80 classes.
        # New P2 head: anchors_mask[3] has to be supplied by the caller; it is
        # meant to hold the smallest anchors, for small-object detection.
        # 160, 160, 128 => 160, 160, len(anchors_mask[3]) * (5 + num_classes)
        self.yolo_head_P2 = nn.Conv2d(transition_channels * 4, len(anchors_mask[3]) * (5 + num_classes), 1)
        # 80, 80, 256 => 80, 80, len(anchors_mask[2]) * (5 + num_classes)
        self.yolo_head_P3 = nn.Conv2d(transition_channels * 8, len(anchors_mask[2]) * (5 + num_classes), 1)
        # 40, 40, 512 => 40, 40, len(anchors_mask[1]) * (5 + num_classes)
        self.yolo_head_P4 = nn.Conv2d(transition_channels * 16, len(anchors_mask[1]) * (5 + num_classes), 1)
        # 20, 20, 1024 => 20, 20, len(anchors_mask[0]) * (5 + num_classes)
        self.yolo_head_P5 = nn.Conv2d(transition_channels * 32, len(anchors_mask[0]) * (5 + num_classes), 1)

    def fuse(self):
        """Fuse RepConv blocks and Conv+BN pairs in place, then return ``self``."""
        print('Fusing layers... ')
        for m in self.modules():
            if isinstance(m, RepConv):
                m.fuse_repvgg_block()
            elif type(m) is Conv and hasattr(m, 'bn'):
                # Exact-type check: subclasses of Conv are deliberately left untouched.
                m.conv = fuse_conv_and_bn(m.conv, m.bn)
                delattr(m, 'bn')
                m.forward = m.fuseforward
        return self

    def forward(self, x):
        # Backbone. Call the module itself rather than .forward() so that any
        # hooks registered on the backbone are executed.
        feat1, feat2, feat3, new_feat = self.backbone(x)

        #------------------------ feature-enhancement network (neck) ------------------------#
        # 20, 20, 1024 => 20, 20, 512
        P5 = self.sppcspc(feat3)
        # 20, 20, 512 => 20, 20, 256
        P5_conv = self.conv_for_P5(P5)
        # 20, 20, 256 => 40, 40, 256
        P5_upsample = self.upsample(P5_conv)
        # 40, 40, 256 cat 40, 40, 256 => 40, 40, 512
        P4 = torch.cat([self.conv_for_feat2(feat2), P5_upsample], 1)
        # 40, 40, 512 => 40, 40, 256
        P4 = self.conv3_for_upsample1(P4)

        # 40, 40, 256 => 40, 40, 128
        P4_conv = self.conv_for_P4(P4)
        # 40, 40, 128 => 80, 80, 128
        P4_upsample = self.upsample(P4_conv)
        # 80, 80, 128 cat 80, 80, 128 => 80, 80, 256
        P3 = torch.cat([self.conv_for_feat1(feat1), P4_upsample], 1)
        # 80, 80, 256 => 80, 80, 128
        P3 = self.conv3_for_upsample2(P3)

        # --------------------------------------------------------------------------#
        # Newly added upsampling step:
        # 80, 80, 128 => 80, 80, 64
        P3_conv = self.conv_for_P3(P3)
        # 80, 80, 64 => 160, 160, 64
        P3_upsample = self.upsample(P3_conv)
        # 160, 160, 64 cat 160, 160, 64 => 160, 160, 128
        P2 = torch.cat([self.conv_for_newfeat(new_feat), P3_upsample], 1)
        # 160, 160, 128 => 160, 160, 64
        P2 = self.conv3_for_upsample3(P2)
        # --------------------------------------------------------------------------#
        # Newly added downsampling step:
        # 160, 160, 64 => 80, 80, 128
        P2_downsample = self.down_sample3(P2)
        # 80, 80, 128 cat 80, 80, 128 => 80, 80, 256
        P3 = torch.cat([P2_downsample, P3], 1)
        # 80, 80, 256 => 80, 80, 128
        P3 = self.conv3_for_downsample3(P3)
        # --------------------------------------------------------------------------#

        # 80, 80, 128 => 40, 40, 256
        P3_downsample = self.down_sample1(P3)
        # 40, 40, 256 cat 40, 40, 256 => 40, 40, 512
        P4 = torch.cat([P3_downsample, P4], 1)
        # 40, 40, 512 => 40, 40, 256
        P4 = self.conv3_for_downsample1(P4)

        # 40, 40, 256 => 20, 20, 512
        P4_downsample = self.down_sample2(P4)
        # 20, 20, 512 cat 20, 20, 512 => 20, 20, 1024
        P5 = torch.cat([P4_downsample, P5], 1)
        # 20, 20, 1024 => 20, 20, 512
        P5 = self.conv3_for_downsample2(P5)
        #------------------------ feature-enhancement network (neck) ------------------------#
        # P2 160, 160, 64   (newly added)
        # P3 80, 80, 128
        # P4 40, 40, 256
        # P5 20, 20, 512
        P3 = self.rep_conv_1(P3)
        P4 = self.rep_conv_2(P4)
        P5 = self.rep_conv_3(P5)
        P2 = self.rep_conv_4(P2)  # newly added

        # Head outputs have len(mask) * (5 + num_classes) channels, e.g. 75 for
        # three anchors and 20 classes.
        # Fourth feature level: (batch_size, 75, 160, 160)
        out3 = self.yolo_head_P2(P2)
        # Third feature level: (batch_size, 75, 80, 80)
        out2 = self.yolo_head_P3(P3)
        # Second feature level: (batch_size, 75, 40, 40)
        out1 = self.yolo_head_P4(P4)
        # First feature level: (batch_size, 75, 20, 20)
        out0 = self.yolo_head_P5(P5)

        return [out0, out1, out2, out3]
接着,我们一点一点来看修改之处在代码中是如何体现的:
在YoloBody的__init__中首先定义多个网络层(模块),便于后面在forward前向计算中使用(注意!这里只是部分代码):
# Newly added convolution layer
self.conv_for_P3 = Conv(transition_channels * 4 , transition_channels * 2 )
# Newly added convolution layer
self.conv_for_newfeat = Conv(transition_channels * 8 , transition_channels * 2)
# Newly added multi-concat block
self.conv3_for_upsample3 = Multi_Concat_Block(transition_channels * 4, panet_channels * 1, transition_channels * 2, e=e, n=n, ids=ids)
# Newly added downsampling layers
# 160, 160, 64 => 80, 80, 128
self.down_sample3 = Transition_Block_SPD(transition_channels * 2, transition_channels * 2)
# 80, 80, 256 => 80, 80, 128
self.conv3_for_downsample3 = Multi_Concat_Block(transition_channels * 8, panet_channels * 2, transition_channels * 4, e=e, n=n, ids=ids)
# 160, 160, 64 => 160, 160, 128 (newly added)
self.rep_conv_4 = conv(transition_channels * 2, transition_channels * 4, 3, 1)
self.yolo_head_P2 = nn.Conv2d(transition_channels * 4, len(anchors_mask[3]) * (5 + num_classes), 1)
# Newly added P2 YOLO head. anchors_mask[3] has to be added by yourself; the goal is one more branch for detecting small objects.
# anchors_mask[3] can correspond to the smaller anchor boxes.
定义好这些网络层之后,来看前向计算forward部分:
# Newly built upsampling step:
# 80, 80, 128 => 80, 80, 64
P3_conv = self.conv_for_P3(P3)
# 80, 80, 64 => 160, 160, 64
P3_upsample = self.upsample(P3_conv)
# 160, 160, 64 cat 160, 160, 64 => 160, 160, 128
P2 = torch.cat([self.conv_for_newfeat(new_feat), P3_upsample],1)
# 160, 160, 128 => 160, 160, 64
P2 = self.conv3_for_upsample3(P2)
# --------------------------------------------------------------------------#
# Newly added downsampling step
# 160, 160, 64 => 80, 80, 128
P2_downsample = self.down_sample3(P2)
# 80, 80, 128 cat 80, 80, 128 => 80, 80, 256
P3 = torch.cat([P2_downsample, P3], 1)
# 80, 80, 256 => 80, 80, 128
P3 = self.conv3_for_downsample3(P3)
# -------------------------------------------------------------------------#
这里要注意的一点是网络在进行上采样之后,就要从最高处进行下采样了,所以是紧挨着的。
最后将四个特征图送入四个yolohead中:
#------------------------ feature-enhancement network (neck) ------------------------#
# P2 160, 160, 64 (newly added)
# P3 80, 80, 128
# P4 40, 40, 256
# P5 20, 20, 512
P3 = self.rep_conv_1(P3)
P4 = self.rep_conv_2(P4)
P5 = self.rep_conv_3(P5)
P2 = self.rep_conv_4(P2)# newly added
# ---------------------------------------------------#
# Fourth feature level
# y4=(batch_size, 75, 160, 160)
# ---------------------------------------------------#
out3 = self.yolo_head_P2(P2)
#---------------------------------------------------#
# Third feature level
# y3=(batch_size, 75, 80, 80)
#---------------------------------------------------#
out2 = self.yolo_head_P3(P3)
#---------------------------------------------------#
# Second feature level
# y2=(batch_size, 75, 40, 40)
#---------------------------------------------------#
out1 = self.yolo_head_P4(P4)
#---------------------------------------------------#
# First feature level
# y1=(batch_size, 75, 20, 20)
#---------------------------------------------------#
out0 = self.yolo_head_P5(P5)
接着进入backbone.py文件中找到class Backbone(nn.Module)部分,将forward部分dark2的输出进行保留,这里使用new_feat保存dark2的输出:
x = self.stem(x)
x = self.dark2(x)
# Keep the dark2 output so the neck can use it as an additional feature level.
new_feat = x
# new_feat has shape 160, 160, 256; it is an effective feature level
#-----------------------------------------------#
# The dark3 output is 80, 80, 512; it is an effective feature level
#-----------------------------------------------#
x = self.dark3(x)
feat1 = x
#-----------------------------------------------#
# The dark4 output is 40, 40, 1024; it is an effective feature level
#-----------------------------------------------#
x = self.dark4(x)
feat2 = x
#-----------------------------------------------#
# The dark5 output is 20, 20, 1024; it is an effective feature level
#-----------------------------------------------#
x = self.dark5(x)
feat3 = x
return feat1, feat2, feat3, new_feat
四倍下采样结构改变的部分就完成了,接下来要注意一个细节,因为我们添加了一个新的特征层用于小目标检测,所以源码中的yolo_anchors(先验框)也要发生改变,看论文中是这样写的:
这里160 x 160 的特征层分辨率较高,容易保留小目标的特征,所以使用先验框的尺寸较小,在代码的体现如下:
在nets/yolo_training.py中找到yolov7的先验框并进行修改,因为这里的作者已经给出明确的先验框大小了,所以不必再去使用k-means算法进行聚类;如果你想在自己的数据集上重新选择先验框,也可以重新聚类计算一下,准确性会更高;
# presumably the path of the anchor definition file — confirm against the loader
anchors_path = 'model_data/yolo_anchors.txt'
# One anchor group per detection head; four groups are required now, because
# YoloBody sizes its P5/P4/P3/P2 heads from entries 0/1/2/3 respectively.
anchors_mask = [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]]
这里的YOLOLoss是在yolo_training.py中的
class YOLOLoss(nn.Module):
def __init__(self, anchors, num_classes, input_shape, anchors_mask = [[0, 1, 2], [3, 4, 5], [6, 7, 8], [9, 10, 11]], label_smoothing = 0):
super(YOLOLoss, self).__init__()
先验框的大小:
33, 43, 76, 45, 70, 93, 24, 13, 20, 24, 36, 22, 7, 12, 14, 9, 11, 17, 3, 4, 4, 8, 7, 6
之后同样在yolo_training.py中的YOLOLoss修改参数:
self.balance = [0.4, 1.0, 4, 1.0]# balance weights: each feature level's loss is multiplied by its entry, amplifying that loss
self.stride = [32, 16, 8, 4]# strides used as the anchor scaling ratios; there are four feature levels now, so four ratios are needed
到此四倍下采样过程就结束了,接着我们来看SPPCSPC部分。
SPPCSPC
首先来看原先的SPPCSPC结构和论文中的SPPCSPC结构有什么不同:
论文中把SPPCSPC结构中的前两个卷积删掉了,只保留了一个1x1的卷积,并且加入了SimAM结构,并把池化核的大小从5, 9, 13变成了3, 5, 9。代码如下:
这是SimAM的代码:
class simam_module(torch.nn.Module):
    """SimAM attention: re-weights every activation with a sigmoid gate.

    The module defines no learnable parameters.  For each channel the squared
    deviation from the spatial mean is normalised by the (n-scaled) spatial
    sum of squared deviations plus ``e_lambda``, and the result drives the
    sigmoid gate.

    Args:
        channels: accepted for interface compatibility; not used.
        e_lambda: constant added to the denominator.
    """

    def __init__(self, channels=None, e_lambda=1e-4):
        super(simam_module, self).__init__()
        # Attribute name (including its spelling) is kept so existing code
        # that accesses it keeps working.
        self.activaton = nn.Sigmoid()
        self.e_lambda = e_lambda

    def __repr__(self):
        s = self.__class__.__name__ + '('
        s += ('lambda=%f)' % self.e_lambda)
        return s

    @staticmethod
    def get_module_name():
        return "simam"

    def forward(self, x):
        """Apply the attention gate to a 4-D tensor ``(b, c, h, w)``; shape is preserved."""
        _, _, h, w = x.size()
        # n = h*w - 1 is zero for a 1x1 map, which would turn the term below
        # into 0/0 and make the whole output NaN; clamp it to at least 1.
        n = max(w * h - 1, 1)
        x_minus_mu_square = (x - x.mean(dim=[2, 3], keepdim=True)).pow(2)
        y = x_minus_mu_square / (4 * (x_minus_mu_square.sum(dim=[2, 3], keepdim=True) / n + self.e_lambda)) + 0.5
        return x * self.activaton(y)
这是修改后的SPPCSPC结构:
class SPPCSPC(nn.Module):
    """Modified SPPCSPC block with SimAM attention in front of the pooling pyramid.

    ``cv1`` followed by ``simam`` feeds three stride-1 max-pools (kernel sizes
    ``k``); the pooled maps are concatenated with their input, passed through
    ``cv5``/``cv6`` and merged with the ``cv2`` shortcut by ``cv7``.

    CSP reference: https://github.com/WongKinYiu/CrossStagePartialNetworks
    """

    def __init__(self, c1, c2, n=1, shortcut=False, g=1, e=0.5, k=(3, 5, 9)):
        super().__init__()
        hidden = int(2 * c2 * e)  # hidden channels
        self.cv1 = Conv(c1, hidden, 1, 1)
        self.cv2 = Conv(c1, hidden, 1, 1)
        # NOTE: cv3 and cv4 are still constructed, but forward() no longer calls
        # them (the former cv4(cv3(cv1(x))) path was replaced by simam(cv1(x))).
        self.cv3 = Conv(hidden, hidden, 3, 1)
        self.cv4 = Conv(hidden, hidden, 1, 1)
        self.m = nn.ModuleList(
            [nn.MaxPool2d(kernel_size=size, stride=1, padding=size // 2) for size in k]
        )
        self.cv5 = Conv(4 * hidden, hidden, 1, 1)
        self.cv6 = Conv(hidden, hidden, 3, 1)
        # The block emits c2 channels.
        self.cv7 = Conv(2 * hidden, c2, 1, 1)
        self.simam = simam_module(hidden)

    def forward(self, x):
        # Pooling branch: cv1, then the newly added SimAM attention.
        attended = self.simam(self.cv1(x))
        pyramid = [attended]
        for pool in self.m:
            pyramid.append(pool(attended))
        y1 = self.cv6(self.cv5(torch.cat(pyramid, 1)))
        # Shortcut branch.
        y2 = self.cv2(x)
        return self.cv7(torch.cat((y1, y2), dim=1))
这里一定要对应好输入通道数和输出通道数,对应不好会报错;
到这里SPPCSPC就完成了,接下来看下采样过程的卷积是如何修改的。
改进下采样减少特征丢失
论文中提到的改进下采样部分,是仅针对颈部网络的Transition_Block进行修改,将这个模块中的一个卷积部分改成了SPD Conv + 1x1 Conv,如图所示:
在代码中体现如下:
这是SPD Conv的定义模块:
class space_to_depth(nn.Module):
    """Move every 2x2 spatial neighbourhood into the channel axis.

    The four interleaved sub-grids taken from the last two axes are
    concatenated along axis 1, so a 4-D input ``(B, C, H, W)`` with even
    ``H`` and ``W`` becomes ``(B, 4 * C, H // 2, W // 2)``.
    """

    def __init__(self, dimension=1):
        super().__init__()
        # Kept for interface compatibility; forward() always concatenates on axis 1.
        self.d = dimension

    def forward(self, x):
        even_rows_even_cols = x[..., ::2, ::2]
        odd_rows_even_cols = x[..., 1::2, ::2]
        even_rows_odd_cols = x[..., ::2, 1::2]
        odd_rows_odd_cols = x[..., 1::2, 1::2]
        return torch.cat(
            (
                even_rows_even_cols,
                odd_rows_even_cols,
                even_rows_odd_cols,
                odd_rows_odd_cols,
            ),
            1,
        )
这是原来的Transition_Block:
class Transition_Block(nn.Module):
    """Two-branch downsampling block: ``mp -> cv1`` in parallel with ``cv2 -> cv3``.

    The two branch outputs are concatenated along axis 1, ``cv3`` branch first.
    Shape notes in the comments are the original ``H, W, C`` annotations.
    """

    def __init__(self, c1, c2):
        super().__init__()
        self.cv1 = Conv(c1, c2, 1, 1)
        self.cv2 = Conv(c1, c2, 1, 1)
        self.cv3 = Conv(c2, c2, 3, 2)

        self.mp = MP()

    def forward(self, x):
        # Pooling branch: 160, 160, 256 => 80, 80, 256 => 80, 80, 128
        pooled = self.cv1(self.mp(x))

        # Convolution branch: 160, 160, 256 => 160, 160, 128 => 80, 80, 128
        strided = self.cv3(self.cv2(x))

        # 80, 80, 128 cat 80, 80, 128 => 80, 80, 256
        return torch.cat([strided, pooled], 1)
这是改进过的Transition_Block:
class Transition_Block_SPD(nn.Module):
    """Transition block whose convolution branch downsamples via space-to-depth.

    Branch 1 is ``mp -> cv1``; branch 2 is ``cv2 -> SPD -> one_x_one`` (it
    replaces the former ``cv2 -> cv3`` path).  The outputs are concatenated
    along axis 1, SPD branch first.  Shape notes in the comments are the
    original ``H, W, C`` annotations.
    """

    def __init__(self, c1, c2):
        super().__init__()
        self.cv1 = Conv(c1, c2, 1, 1)
        self.cv2 = Conv(c1, c2, 1, 1)
        # NOTE: cv3 is still constructed, but forward() no longer calls it.
        self.cv3 = Conv(c2, c2, 3, 2)
        self.SPD = space_to_depth()
        # space_to_depth multiplies the channel count by 4; bring it back to c2.
        self.one_x_one = Conv(c2 * 4, c2, 1, 1)

        self.mp = MP()

    def forward(self, x):
        # Pooling branch: 160, 160, 256 => 80, 80, 256 => 80, 80, 128
        pooled = self.cv1(self.mp(x))

        # SPD branch: 160, 160, 256 => 160, 160, 128 => 80, 80, 512 (SPD) => 80, 80, 128
        rearranged = self.SPD(self.cv2(x))
        reduced = self.one_x_one(rearranged)

        # 80, 80, 128 cat 80, 80, 128 => 80, 80, 256
        return torch.cat([reduced, pooled], 1)
这里详细的过程在代码中注释的已经很完整了,小编就不再解释啦。
然后在nets/yolo.py中对使用的Transition_Block进行修改:
# Neck downsampling layers, now built from Transition_Block_SPD (shape notes are H, W, C).
# 80, 80, 128 => 40, 40, 256
self.down_sample1 = Transition_Block_SPD(transition_channels * 4, transition_channels * 4)
# 40, 40, 256 => 20, 20, 512
self.down_sample2 = Transition_Block_SPD(transition_channels * 8, transition_channels * 8)
# 160, 160, 64 => 80, 80, 128
self.down_sample3 = Transition_Block_SPD(transition_channels * 2, transition_channels * 2)
到此这个部分就结束了。
接着就是最后一项,替换损失函数为Wise-IOU函数。
Wise-IOU
这里替换函数非常简单,详情可以参考这篇文章:
YOLOV7改进-添加EIoU,SIoU,AlphaIoU,FocalEIoU,Wise-IoU_魔鬼面具的博客-CSDN博客
替换完毕之后,这篇论文就算是全部复现出来了,总共创新点为四个部分:
1.对neck部分的下采样部分添加了四倍下采样的过程,使得对小目标的检测更加准确;
2.对SPPCSPC的前几个卷积层进行了裁剪,并加入了simAM注意力机制,并缩小了池化核;
3.把颈部网络中的下采样过程部分卷积使用了SPD Conv替换;
4.使用wise-IOU损失函数对原先的损失函数进行替换;
这篇论文还是有水平的,复现过程也比一般的水文更复杂一些(笑死.jpg),希望我的文章可以帮助到大家,谢谢你的观看!觉得好的点个赞吧!