I. Features of YOLOv5
1. Residual network structure:
(1) Trunk branch: performs the feature extraction
(2) Residual (shortcut) branch: passes through without any processing
2. CSPNet structure:
(1) The trunk applies residual blocks and stacking
(2) The shortcut branch receives only a small amount of processing
3. Focus network structure:
Take one value every other pixel, then stack the four resulting sub-images along the channel dimension to obtain a single feature layer
4. SiLU activation function: SiLU(x) = x * sigmoid(x) (see the sketch after this list)
5. SPP structure:
Max pooling with several different kernel sizes extracts multi-scale features and enlarges the network's receptive field
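A minimal module definition for SiLU, matching how the Conv block later in this post calls SiLU():

```python
import torch
import torch.nn as nn

class SiLU(nn.Module):
    # SiLU(x) = x * sigmoid(x): smooth and non-monotonic,
    # unlike ReLU, which is piecewise linear
    @staticmethod
    def forward(x):
        return x * torch.sigmoid(x)
```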
II. YOLOv5 network composition
As shown in the figure below, the YOLOv5 network consists mainly of a backbone network and an enhanced feature extraction network.
1. First, annotate the dataset and split it into a training set and a validation set. After data augmentation and related preprocessing, apply the Focus preprocessing to the prepared images.
2. The processed data then passes in turn through the backbone, the enhanced feature extraction network (FPN, PAN), and the Head layer. The data enters the CSPDarknet and SPP modules for feature extraction, producing three different feature maps that are handed to the enhanced feature network for deeper extraction and fusion, finally yielding three prediction maps of different sizes.
3. The prediction maps enter the Head layer, where prediction boxes are regressed and then filtered by confidence thresholding, non-maximum suppression (NMS), and related steps to obtain the final prediction boxes. These are compared against the ground-truth boxes to compute the various losses, such as the confidence loss and the box-regression loss. Backpropagation then computes the gradient of the loss with respect to all parameters, and an SGD optimizer performs iterative gradient descent to reduce the loss, as sketched below.
After the configured number of epochs, the validation set is fed to the trained model for inference.
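A minimal sketch of one training step as just described; `model`, `train_loader`, `compute_loss`, and `num_epochs` are placeholder names for illustration, not the referenced repository's actual API:

```python
import torch

# Placeholders: `model`, `train_loader`, `compute_loss`, `num_epochs`
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2, momentum=0.937)

for epoch in range(num_epochs):
    for images, targets in train_loader:
        preds = model(images)                # three prediction maps (20x20, 40x40, 80x80)
        loss = compute_loss(preds, targets)  # confidence loss + box loss + class loss
        optimizer.zero_grad()
        loss.backward()                      # gradients of the loss w.r.t. all parameters
        optimizer.step()                     # one SGD update
```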
1. Backbone network code implementation
1.1 Focus module
Input(640,640,3) => Focus(320,320,12)
The height and width of the image are halved, while the channel count expands to four times the original.
Focus(320,320,12) => Conv2D_BN_SiLU(320,320,64)
The resulting feature layer then goes through a convolution, batch normalization, and the activation function.
```python
import torch
import torch.nn as nn

class CSPDarknet(nn.Module):
    def __init__(self, base_channels, base_depth, phi, pretrained):
        super().__init__()
        # The input image is 640, 640, 3; the initial base_channels is 64.
        # The Focus structure performs the first feature extraction step:
        # 640, 640, 3 -> 320, 320, 12 -> 320, 320, 64
        self.stem = Focus(3, base_channels, k=3)
```
Focus definition
```python
class Focus(nn.Module):
    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        super(Focus, self).__init__()
        self.conv = Conv(c1 * 4, c2, k, s, p, g, act)

    def forward(self, x):
        # 320, 320, 12 => 320, 320, 64
        return self.conv(
            # 640, 640, 3 => 320, 320, 12
            torch.cat(  # stack along the channel dimension
                [
                    x[..., ::2, ::2],    # take one value every other pixel
                    x[..., 1::2, ::2],
                    x[..., ::2, 1::2],
                    x[..., 1::2, 1::2],
                ], 1
            )
        )
```
Conv definition
```python
def autopad(k, p=None):
    # Standard YOLOv5 helper (not listed in the original post):
    # default to 'same'-style padding when none is given
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]
    return p

class Conv(nn.Module):
    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):
        super(Conv, self).__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)  # convolution
        self.bn = nn.BatchNorm2d(c2, eps=0.001, momentum=0.03)  # batch normalization
        self.act = SiLU() if act is True else (act if isinstance(act, nn.Module) else nn.Identity())  # activation

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

    def fuseforward(self, x):
        # used after the BN parameters have been fused into the convolution weights
        return self.act(self.conv(x))
```
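With SiLU, autopad, Conv, and Focus all defined, a quick shape check confirms the transform described above:

```python
import torch

focus = Focus(3, 64, k=3)        # 3 input channels -> 64 output channels
x = torch.randn(1, 3, 640, 640)  # dummy 640x640 RGB image
print(focus(x).shape)            # torch.Size([1, 64, 320, 320])
```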
1.2 Stacked residual structures
(1)
Conv2D_BN_SiLU(320,320,64) => Conv2D_BN_SiLU(160,160,128)
Conv2D_BN_SiLU(160,160,128) => CSPLayer(160,160,128)

```python
self.dark2 = nn.Sequential(
    Conv(base_channels, base_channels * 2, 3, 2),
    # (in channels, out channels, kernel size, stride)
    # after the convolution: 320, 320, 64 -> 160, 160, 128
    C3(base_channels * 2, base_channels * 2, base_depth),
    # after the CSPLayer: 160, 160, 128 -> 160, 160, 128
)
```
(2)
CSPLayer(160,160,128) => Conv2D_BN_SiLU(80,80,256)
Conv2D_BN_SiLU(80,80,256) => CSPLayer(80,80,256)

```python
self.dark3 = nn.Sequential(
    Conv(base_channels * 2, base_channels * 4, 3, 2),
    # after the convolution: 160, 160, 128 -> 80, 80, 256
    C3(base_channels * 4, base_channels * 4, base_depth * 3),
    # after the CSPLayer: 80, 80, 256 -> 80, 80, 256
    # the effective feature layer 80, 80, 256 is taken from here
    # to build the enhanced feature extraction network (FPN)
)
```
(3)
CSPLayer(80,80,256) => Conv2D_BN_SiLU(40,40,512)
Conv2D_BN_SiLU(40,40,512) => CSPLayer(40,40,512)

```python
self.dark4 = nn.Sequential(
    Conv(base_channels * 4, base_channels * 8, 3, 2),
    # after the convolution: 80, 80, 256 -> 40, 40, 512
    C3(base_channels * 8, base_channels * 8, base_depth * 3),
    # after the CSPLayer: 40, 40, 512 -> 40, 40, 512
    # the effective feature layer 40, 40, 512 is taken from here
    # to build the enhanced feature extraction network (FPN)
)
```
(4)
CSPLayer(40,40,512) => Conv2D_BN_SiLU(20,20,1024)
Conv2D_BN_SiLU(20,20,1024) => SPPBottleneck(20,20,1024)
SPPBottleneck(20,20,1024) => CSPLayer(20,20,1024)

```python
self.dark5 = nn.Sequential(
    Conv(base_channels * 8, base_channels * 16, 3, 2),
    # after the convolution: 40, 40, 512 -> 20, 20, 1024
    SPP(base_channels * 16, base_channels * 16),
    # after SPP: 20, 20, 1024 -> 20, 20, 1024
    C3(base_channels * 16, base_channels * 16, base_depth, shortcut=False),
    # after the CSPLayer: 20, 20, 1024 -> 20, 20, 1024
)
```
SPP definition
```python
class SPP(nn.Module):
    def __init__(self, c1, c2, k=(5, 9, 13)):  # pooling kernels of size 5, 9, 13
        super(SPP, self).__init__()
        c_ = c1 // 2
        self.cv1 = Conv(c1, c_, 1, 1)
        # after stacking, a 1x1 convolution adjusts the channel count
        self.cv2 = Conv(c_ * (len(k) + 1), c2, 1, 1)
        self.m = nn.ModuleList([nn.MaxPool2d(kernel_size=x, stride=1, padding=x // 2) for x in k])

    def forward(self, x):
        x = self.cv1(x)
        return self.cv2(torch.cat([x] + [m(x) for m in self.m], 1))
```
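Working through the channels for the dark5 call SPP(1024, 1024): cv1 halves the channels to c_ = 512; the three max pools use stride 1 and padding k // 2, so they preserve the 20 x 20 spatial size; concatenating the input with the three pooled maps gives 512 * 4 = 2048 channels, which cv2 projects back to 1024. A quick check:

```python
import torch

spp = SPP(1024, 1024)             # as used in dark5
x = torch.randn(1, 1024, 20, 20)
print(spp(x).shape)               # torch.Size([1, 1024, 20, 20])
```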
CSPLayer definition
```python
class C3(nn.Module):
    def __init__(self, c1, c2, n=1, shortcut=True, g=1, e=0.5):  # ch_in, ch_out, number, shortcut, groups, expansion
        super(C3, self).__init__()
        c_ = int(c2 * e)  # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)  # simple channel adjustment on the main branch
        self.cv2 = Conv(c1, c_, 1, 1)  # light processing on the shortcut branch
        self.cv3 = Conv(2 * c_, c2, 1)  # merges the channels; act=FReLU(c2)
        # repeated residual feature extraction on the main branch
        self.m = nn.Sequential(*[Bottleneck(c_, c_, shortcut, g, e=1.0) for _ in range(n)])

    def forward(self, x):
        return self.cv3(torch.cat(
            (
                self.m(self.cv1(x)),
                self.cv2(x)  # the lightly processed shortcut branch
            ), dim=1))
```
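C3 relies on a Bottleneck block that is not listed in this post. A minimal version, consistent with the standard YOLOv5 implementation, is the residual structure from section I: a 1x1 convolution followed by a 3x3 convolution, with a shortcut connection added when the shapes match:

```python
class Bottleneck(nn.Module):
    def __init__(self, c1, c2, shortcut=True, g=1, e=0.5):
        super(Bottleneck, self).__init__()
        c_ = int(c2 * e)                    # hidden channels
        self.cv1 = Conv(c1, c_, 1, 1)       # 1x1 convolution
        self.cv2 = Conv(c_, c2, 3, 1, g=g)  # 3x3 convolution
        self.add = shortcut and c1 == c2    # residual edge only when in/out shapes match

    def forward(self, x):
        return x + self.cv2(self.cv1(x)) if self.add else self.cv2(self.cv1(x))
```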
The backbone finally outputs three feature layers: (80, 80, 256), (40, 40, 512), and (20, 20, 1024).
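The forward pass of CSPDarknet is not listed above; a sketch of how the three effective feature layers are taken out of the dark3, dark4, and dark5 stages, consistent with the implementation referenced at the end of this post:

```python
def forward(self, x):
    x = self.stem(x)   # 640, 640, 3  -> 320, 320, 64
    x = self.dark2(x)  # 320, 320, 64 -> 160, 160, 128
    x = self.dark3(x)
    feat1 = x          # 80, 80, 256: first effective feature layer
    x = self.dark4(x)
    feat2 = x          # 40, 40, 512: second effective feature layer
    x = self.dark5(x)
    feat3 = x          # 20, 20, 1024: third effective feature layer
    return feat1, feat2, feat3
```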
2. Enhanced feature extraction network (FPN feature pyramid)
```python
class YoloBody(nn.Module):
    def __init__(self, anchors_mask, num_classes, phi, backbone='cspdarknet', pretrained=False, input_shape=[640, 640]):
        super(YoloBody, self).__init__()
        depth_dict = {'s': 0.33, 'm': 0.67, 'l': 1.00, 'x': 1.33}
        width_dict = {'s': 0.50, 'm': 0.75, 'l': 1.00, 'x': 1.25}
        dep_mul, wid_mul = depth_dict[phi], width_dict[phi]
        base_channels = int(wid_mul * 64)        # 64
        base_depth = max(round(dep_mul * 3), 1)  # 3
        # the input image is 640, 640, 3; the initial base channel count is 64

        self.backbone_name = backbone
        if backbone == "cspdarknet":
            # build the CSPDarknet53 backbone and obtain three effective feature
            # layers with shapes 80, 80, 256 / 40, 40, 512 / 20, 20, 1024
            self.backbone = CSPDarknet(base_channels, base_depth, phi, pretrained)
        else:
            # for any backbone other than cspdarknet, adjust the channel counts
            # so its outputs match the YoloV5 format
            self.backbone = {
                'convnext_tiny': ConvNeXt_Tiny,
                'convnext_small': ConvNeXt_Small,
                'swin_transfomer_tiny': Swin_transformer_Tiny,
            }[backbone](pretrained=pretrained, input_shape=input_shape)
            in_channels = {
                'convnext_tiny': [192, 384, 768],
                'convnext_small': [192, 384, 768],
                'swin_transfomer_tiny': [192, 384, 768],
            }[backbone]
            feat1_c, feat2_c, feat3_c = in_channels
            self.conv_1x1_feat1 = Conv(feat1_c, base_channels * 4, 1, 1)
            self.conv_1x1_feat2 = Conv(feat2_c, base_channels * 8, 1, 1)
            self.conv_1x1_feat3 = Conv(feat3_c, base_channels * 16, 1, 1)

        self.upsample = nn.Upsample(scale_factor=2, mode="nearest")

        self.conv_for_feat3 = Conv(base_channels * 16, base_channels * 8, 1, 1)  # feature extraction by convolution
        self.conv3_for_upsample1 = C3(base_channels * 16, base_channels * 8, base_depth, shortcut=False)
        self.conv_for_feat2 = Conv(base_channels * 8, base_channels * 4, 1, 1)
        self.conv3_for_upsample2 = C3(base_channels * 8, base_channels * 4, base_depth, shortcut=False)

        self.down_sample1 = Conv(base_channels * 4, base_channels * 4, 3, 2)
        self.conv3_for_downsample1 = C3(base_channels * 8, base_channels * 8, base_depth, shortcut=False)
        self.down_sample2 = Conv(base_channels * 8, base_channels * 8, 3, 2)
        self.conv3_for_downsample2 = C3(base_channels * 16, base_channels * 16, base_depth, shortcut=False)

        # 80, 80, 256 => 80, 80, 3 * (4 + 1 + num_classes): 3 anchors per cell;
        # of the 4 box parameters, the first two locate the box center and the
        # last two give the box width and height; the extra 1 is the confidence
        self.yolo_head_P3 = nn.Conv2d(base_channels * 4, len(anchors_mask[2]) * (5 + num_classes), 1)
        # 40, 40, 512 => 40, 40, 3 * (4 + 1 + num_classes)
        self.yolo_head_P4 = nn.Conv2d(base_channels * 8, len(anchors_mask[1]) * (5 + num_classes), 1)
        # 20, 20, 1024 => 20, 20, 3 * (4 + 1 + num_classes)
        self.yolo_head_P5 = nn.Conv2d(base_channels * 16, len(anchors_mask[0]) * (5 + num_classes), 1)

    def forward(self, x):
        # backbone: three effective feature layers
        feat1, feat2, feat3 = self.backbone(x)
        if self.backbone_name != "cspdarknet":
            feat1 = self.conv_1x1_feat1(feat1)
            feat2 = self.conv_1x1_feat2(feat2)
            feat3 = self.conv_1x1_feat3(feat3)

        P5 = self.conv_for_feat3(feat3)          # 20, 20, 1024 -> 20, 20, 512
        P5_upsample = self.upsample(P5)          # 20, 20, 512 -> 40, 40, 512, upsample
        P4 = torch.cat([P5_upsample, feat2], 1)  # 40, 40, 512 cat 40, 40, 512 -> 40, 40, 1024, stack
        P4 = self.conv3_for_upsample1(P4)        # 40, 40, 1024 -> 40, 40, 512, feature extraction
        P4 = self.conv_for_feat2(P4)             # 40, 40, 512 -> 40, 40, 256
        P4_upsample = self.upsample(P4)          # 40, 40, 256 -> 80, 80, 256, upsample
        P3 = torch.cat([P4_upsample, feat1], 1)  # 80, 80, 256 cat 80, 80, 256 -> 80, 80, 512
        P3 = self.conv3_for_upsample2(P3)        # 80, 80, 512 -> 80, 80, 256, feature extraction

        P3_downsample = self.down_sample1(P3)    # 80, 80, 256 -> 40, 40, 256, downsample
        P4 = torch.cat([P3_downsample, P4], 1)   # 40, 40, 256 cat 40, 40, 256 -> 40, 40, 512, stack
        P4 = self.conv3_for_downsample1(P4)      # 40, 40, 512 -> 40, 40, 512, feature extraction
        P4_downsample = self.down_sample2(P4)    # 40, 40, 512 -> 20, 20, 512, downsample
        P5 = torch.cat([P4_downsample, P5], 1)   # 20, 20, 512 cat 20, 20, 512 -> 20, 20, 1024, stack
        P5 = self.conv3_for_downsample2(P5)      # 20, 20, 1024 -> 20, 20, 1024, feature extraction

        out2 = self.yolo_head_P3(P3)  # third feature layer:  y3 = (batch_size, 75, 80, 80)
        out1 = self.yolo_head_P4(P4)  # second feature layer: y2 = (batch_size, 75, 40, 40)
        out0 = self.yolo_head_P5(P5)  # first feature layer:  y1 = (batch_size, 75, 20, 20)
        # 75 = 3 * (4 + 1 + 20) for a 20-class dataset such as VOC
        return out0, out1, out2
```
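A quick sanity check of the output shapes, assuming the classes above (plus the remaining pieces of the referenced implementation) are in scope. The anchors_mask below follows the standard YOLOv5 grouping of nine anchors into three scales; with num_classes=20 (e.g. VOC), each head outputs 3 * (5 + 20) = 75 channels:

```python
import torch

anchors_mask = [[6, 7, 8], [3, 4, 5], [0, 1, 2]]  # standard YOLOv5 anchor grouping
model = YoloBody(anchors_mask, num_classes=20, phi='l')
x = torch.randn(1, 3, 640, 640)
out0, out1, out2 = model(x)
print(out0.shape)  # torch.Size([1, 75, 20, 20])
print(out1.shape)  # torch.Size([1, 75, 40, 40])
print(out2.shape)  # torch.Size([1, 75, 80, 80])
```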
[Reference](https://blog.csdn.net/weixin_44791964/article/details/121626848)