yolov4项目记录3-模型构建

最新推荐文章于 2024-04-28 19:55:35 发布

Swayzzu

最新推荐文章于 2024-04-28 19:55:35 发布

阅读量1.3k

点赞数 1

分类专栏： CV 文章标签：深度学习 pytorch 神经网络 计算机视觉

本文链接：https://blog.csdn.net/Swayzzu/article/details/122204554

版权

CV 专栏收录该内容

18 篇文章 0 订阅

订阅专栏

注意图中输入应该是608*608*3

一、模型的backbone

1.Mish激活函数

在该模型中，使用了Mish激活函数，表达形式如下：

$f(x) = x \cdot \tanh\left(\ln\left(1 + e^{x}\right)\right)$

在pytorch中没有该接口，需要自己实现。

2.CBM模块

实现了mish激活函数之后，就可以定义一个基础的卷积模块了。这个卷积包含了卷积层，Batch Normalization层，以及激活函数层。在Backbone部分使用。

这里需要注意的地方是，卷积层的padding，当使用的卷积核是1*1的时候，padding为0，当卷积核大小是3*3的时候，padding为1。

3.残差模块

这个模块用于和卷积结合，构建CSPX模块。残差模块可以使用多个，当只需要使用1个残差模块的时候，残差中需要减少通道数，也就是隐藏层的通道数是输入层的一半，在输出的时候再把通道数提升上来。

4.CSPX模块

在Backbone部分，包含2种模块，即CBM以及CSPX，CSPX中的X就是残差模块的数量。需要注意的地方是：

当CSPX的X不为1的时候，残差模块中间隐藏层是不需要减少通道数的，这个模型中，只有第一个CSP1，也就是只用了1个残差模块，这样的话残差模块的隐藏层需要减少通道数。

后面的X分别是2,8,8,4，均不需要在残差模块中间改变通道数。

注意，这里的代码中，直接把CSP和前后相连的CBM结合了起来。

5.BackBone实现

当输入了图片信息之后，通过Backbone，应该分别在第三个、第四个、第五个CSPX模块之后各输出一个结果，这就是不同层级的图片信息，用于和其他层级的信息相融合。

代码如下：

import torch.nn as nn
import torch.nn.functional as F
import torch
import math
import numpy as np

class Mish(nn.Module):
    """Mish activation: ``f(x) = x * tanh(softplus(x))``.

    The layer has no parameters; it only applies the element-wise formula.
    """

    def __init__(self):
        super().__init__()

    def forward(self, x):
        """Apply Mish element-wise to ``x`` and return the result."""
        # softplus(x) = ln(1 + exp(x))
        smoothed = F.softplus(x)
        gate = torch.tanh(smoothed)
        return x * gate

class BasicConv(nn.Module):
    """CBM unit: Conv2d (no bias) -> BatchNorm2d -> Mish.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels produced by the convolution.
        kernel_size: Integer kernel size of the convolution.
        stride: Convolution stride, 1 by default.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride=1):
        super().__init__()
        # Only 1x1 and 3x3 kernels are used in YOLOv4: kernel 1 -> padding 0,
        # kernel 3 -> padding 1, so a stride-1 conv keeps the spatial size.
        same_pad = kernel_size // 2
        self.conv = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size,
            stride=stride,
            padding=same_pad,
            bias=False,
        )
        self.BN = nn.BatchNorm2d(out_channels)
        self.mish = Mish()

    def forward(self, x):
        """Run convolution, batch normalisation and Mish in sequence."""
        return self.mish(self.BN(self.conv(x)))

class Resblock(nn.Module):
    """Residual unit: a 1x1 CBM then a 3x3 CBM, added back onto the input.

    Args:
        channels: Channels of both the input and the output.
        hidden_channels: Channels between the two convolutions. When omitted
            (``None``) it equals ``channels``.
    """

    def __init__(self, channels, hidden_channels=None):
        super().__init__()
        # Identity test: ``== None`` would invoke a custom ``__eq__`` if the
        # argument ever defined one.
        if hidden_channels is None:
            hidden_channels = channels

        self.block = nn.Sequential(
            BasicConv(channels, hidden_channels, kernel_size=1),
            BasicConv(hidden_channels, channels, kernel_size=3)
        )

    def forward(self, x):
        """Return ``block(x) + x`` (skip connection)."""
        return self.block(x) + x

class Resblock_body(nn.Module):
    """CSPX stage: downsample, split into two branches, concatenate, fuse.

    A stride-2 3x3 CBM first downsamples the input. The result feeds a plain
    1x1 branch (``conv0``) and a residual branch (``block``); both outputs are
    concatenated on the channel axis and fused by a final 1x1 CBM.

    Args:
        in_channels: Channels of the incoming feature map.
        out_channels: Channels of the stage output.
        num_blocks: Number of residual units in the residual branch. Ignored
            when ``first`` is true, where exactly one unit is built.
        first: When true, both branches keep ``out_channels`` channels and the
            single residual unit halves its hidden channels. Otherwise both
            branches use ``out_channels // 2`` channels.
    """

    def __init__(self, in_channels, out_channels,num_blocks, first):
        super().__init__()
        # 3x3 conv, stride 2, padding 1 halves the spatial size,
        # e.g. 304 -> floor((304 + 2 - 3) / 2) + 1 = 152.
        self.downsample = BasicConv(in_channels, out_channels, kernel_size=3, stride=2)

        if first:
            branch_channels = out_channels
            res_hidden = out_channels // 2
            res_count = 1
            concat_in = out_channels * 2
        else:
            branch_channels = out_channels // 2
            res_hidden = None  # Resblock then uses its own channel count
            res_count = num_blocks
            concat_in = out_channels

        self.conv0 = BasicConv(out_channels, branch_channels, kernel_size=1)
        self.block = nn.Sequential(
            BasicConv(out_channels, branch_channels, kernel_size=1),
            *[Resblock(branch_channels, res_hidden) for _ in range(res_count)],
            BasicConv(branch_channels, branch_channels, kernel_size=1)
        )
        self.concat_conv = BasicConv(concat_in, out_channels, kernel_size=1)

    def forward(self, x):
        """Return the fused output of the two branches."""
        reduced = self.downsample(x)
        shortcut = self.conv0(reduced)
        residual = self.block(reduced)
        merged = torch.cat([shortcut, residual], dim=1)
        return self.concat_conv(merged)

class CSPDarkNet(nn.Module):
    """Backbone: a stem 3x3 CBM followed by five CSP stages.

    Args:
        layers: Sequence with (at least) five integers, the number of
            residual units requested for each stage.
    """

    def __init__(self, layers):
        super().__init__()
        self.first_channel = 32
        self.channel_list = [64, 128, 256, 512, 1024]
        self.conv1 = BasicConv(3, self.first_channel, 3, 1)

        # Each stage consumes the previous stage's output channels; only the
        # very first stage is built with ``first=True``.
        stage_inputs = [self.first_channel] + self.channel_list[:-1]
        self.stages = nn.ModuleList([
            Resblock_body(c_in, c_out, layers[idx], first=(idx == 0))
            for idx, (c_in, c_out) in enumerate(zip(stage_inputs, self.channel_list))
        ])

        # Weight initialisation: normal(0, sqrt(2 / (kh * kw * out_channels)))
        # for convolutions, weight 1 and bias 0 for batch norm.
        for module in self.modules():
            if isinstance(module, nn.BatchNorm2d):
                module.weight.data.fill_(1)
                module.bias.data.zero_()
            elif isinstance(module, nn.Conv2d):
                kernel_h, kernel_w = module.kernel_size
                fan = kernel_h * kernel_w * module.out_channels
                module.weight.data.normal_(0, math.sqrt(2. / fan))

    def forward(self, x):
        """Return the outputs of the third, fourth and fifth stages."""
        x = self.conv1(x)
        features = []
        for stage in self.stages:
            x = stage(x)
            features.append(x)
        out3, out4, out5 = features[2:]
        return out3, out4, out5

def load_model_pth(model, pth):
    """Load the checkpoint file ``pth`` into ``model``, keeping compatible entries.

    An entry of the checkpoint is copied only when the model has a parameter
    or buffer with the same key and the same shape. Everything else is
    reported as unmatched and the model keeps its current value.

    Args:
        model: Module whose ``state_dict`` is updated in place.
        pth: Path of a checkpoint holding a state-dict style mapping.

    Returns:
        The same ``model`` object.
    """
    print('Loading weights into state dict, name: %s'%(pth))
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model_dict = model.state_dict()
    # NOTE(security): torch.load unpickles the file; only load checkpoints
    # that come from a trusted source.
    pretrained_dict = torch.load(pth, map_location=device)
    matched_dict = {}
    for k, v in pretrained_dict.items():
        # The membership test matters: indexing model_dict with a key that
        # only exists in the checkpoint would raise KeyError.
        if k in model_dict and np.shape(model_dict[k]) == np.shape(v):
            matched_dict[k] = v
        else:
            print('unmatched layers: %s'%k)
    print(len(model_dict.keys()), len(pretrained_dict.keys()))
    print('%d layers matched,  %d layers miss' % (len(matched_dict.keys()), len(model_dict) - len(matched_dict.keys())))
    model_dict.update(matched_dict)
    model.load_state_dict(model_dict)
    print('Finished!')
    return model

def darknet53(pretrained):
    """Build the CSPDarkNet backbone with stage depths ``[1, 2, 8, 8, 4]``.

    Args:
        pretrained: Checkpoint path; any falsy value skips weight loading.

    Returns:
        The constructed backbone.
    """
    backbone = CSPDarkNet([1, 2, 8, 8, 4])
    if not pretrained:
        return backbone
    load_model_pth(backbone, pretrained)
    return backbone

二、模型的颈部+头部构建

1.CBL模块

包含卷积层，Batch Normalization层，以及LeakyReLU激活函数。同样的，这里卷积层的padding，也是当卷积核是1的时候，padding是0，卷积核是3的时候，padding是1。

2.SPP模块

在SPP模块中，使用了三个不同大小的kernel，分别是5,9,13，这里的作用是增加感受野，同时还能保证输出的形状是我们所需要的形状。因此，对图片的padding，就需要是核大小除以2，分别是2,4,6，这样的话，如果一个输入是（512,19,19），通过这三个pooling之后，每一个的输出仍然是（512,19,19），但是这样增加了对图片不同大小区域的感知。

3.CBL+上采样

模型中有2个地方用到了上采样，是因为模型需要对从下到上的信息整合。最终的backbone的输出的尺寸是19*19，而所需要整合的前面的尺寸分别是38*38，以及76*76，因此需要进行上采样，将尺寸转换成相同的尺寸才可以融合。

同理，模型还会通过下采样从上到下对信息进行融合，图中未标注，就是右边往下连接的两条线，因此对于76*76以及38*38，都需要进行下采样。

4.多次卷积模块

在模型颈部，有CBL*3以及CBL*5两部分模块，我们需要单独定义。

5.头部模块

这里指的是右边蓝色的输出部分。一层CBL加上一层卷积即可。

6.代码实现

这部分的实现方式，我是把模型分成了五个区域单独实现，并连接起来的。需要注意的是图中没有下采样，而实现的时候要加上。这里最终的yolohead的输出分别是

(B, 255, 19, 19),(B, 255, 38, 38),(B, 255, 76, 76)

中间是包含我们所需信息的维度，3*（4+1+80），这里的80是因为有80个分类。我们将对yolo头部的输出进行decode。

import torch.nn as nn
import torch
from collections import OrderedDict
from CSPDarknet import *


# CBL
def conv2d(filter_in, filter_out, kernel_size, stride=1):
    """Build a CBL unit: Conv2d -> BatchNorm2d -> LeakyReLU(0.1).

    The sub-modules are registered as ``conv``, ``bn`` and ``relu``.

    Args:
        filter_in: Input channels.
        filter_out: Output channels.
        kernel_size: Integer kernel size; padding is ``(kernel_size - 1) // 2``
            (0 for a falsy kernel size).
        stride: Convolution stride, 1 by default.
    """
    pad = 0 if not kernel_size else (kernel_size - 1) // 2
    cbl = nn.Sequential()
    cbl.add_module(
        'conv',
        nn.Conv2d(filter_in, filter_out, kernel_size=kernel_size, stride=stride, padding=pad),
    )
    cbl.add_module('bn', nn.BatchNorm2d(filter_out))
    cbl.add_module('relu', nn.LeakyReLU(0.1))
    return cbl

#SPP[5,9,13]
class SpatialPyramidPooling(nn.Module):
    """SPP block: stride-1 max pools of several sizes concatenated with the input.

    Each pool uses padding ``pool_size // 2``, so for odd sizes the spatial
    size is unchanged and only the channel count grows.

    Args:
        pool_sizes: Iterable of pooling kernel sizes. The default is an
            immutable tuple so it cannot be shared and mutated across calls.
    """

    def __init__(self, pool_sizes=(5, 9, 13)):
        super().__init__()
        self.maxpools = nn.ModuleList(
            [nn.MaxPool2d(pool_size, 1, padding=pool_size // 2) for pool_size in pool_sizes]
        )

    def forward(self, x):
        """Return ``cat([pool_last(x), ..., pool_first(x), x], dim=1)``."""
        # Pools are applied in reverse order of ``pool_sizes``; the raw input
        # goes last, giving (len(pool_sizes) + 1) times the input channels.
        pooled = [layer(x) for layer in self.maxpools[::-1]]
        return torch.cat(pooled + [x], dim=1)

# 卷积+上采样
class Upsample(nn.Module):
    """1x1 CBL convolution followed by 2x nearest-neighbour upsampling.

    Args:
        in_channels: Channels of the input feature map.
        out_channels: Channels after the 1x1 convolution.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        reduce = conv2d(in_channels, out_channels, kernel_size=1)
        enlarge = nn.Upsample(scale_factor=2, mode='nearest')
        self.upsample = nn.Sequential(reduce, enlarge)

    def forward(self, x):
        """Return the channel-reduced, spatially doubled feature map."""
        return self.upsample(x)

# 三次卷积块[512,1024]
def make_three_conv(filters_list, in_filters):
    """Stack three CBL units with kernels 1, 3, 1.

    Args:
        filters_list: ``[narrow, wide]`` channel counts, e.g. ``[512, 1024]``.
        in_filters: Channels of the input feature map.

    Returns:
        ``nn.Sequential`` whose output has ``filters_list[0]`` channels.
    """
    narrow, wide = filters_list[0], filters_list[1]
    stages = [
        conv2d(in_filters, narrow, 1),
        conv2d(narrow, wide, 3),
        conv2d(wide, narrow, 1),
    ]
    return nn.Sequential(*stages)

# 五次卷积块[128,256]
def make_five_conv(filters_list, in_filters):
    """Stack five CBL units with kernels 1, 3, 1, 3, 1.

    Args:
        filters_list: ``[narrow, wide]`` channel counts, e.g. ``[128, 256]``.
        in_filters: Channels of the input feature map.

    Returns:
        ``nn.Sequential`` whose output has ``filters_list[0]`` channels.
    """
    narrow, wide = filters_list[0], filters_list[1]
    # (input channels, output channels, kernel size) of each unit, in order
    plan = [
        (in_filters, narrow, 1),
        (narrow, wide, 3),
        (wide, narrow, 1),
        (narrow, wide, 3),
        (wide, narrow, 1),
    ]
    units = [conv2d(c_in, c_out, kernel) for c_in, c_out, kernel in plan]
    return nn.Sequential(*units)

# [256,18]
def yolo_head(filters_list, in_filters):
    """Build a detection head: one 3x3 CBL unit, then a plain 1x1 convolution.

    Args:
        filters_list: ``[hidden_channels, output_channels]``.
        in_filters: Channels of the input feature map.

    Returns:
        ``nn.Sequential`` producing ``filters_list[1]`` channels.
    """
    hidden, predictions = filters_list[0], filters_list[1]
    feature_conv = conv2d(in_filters, hidden, 3)
    # The last layer is a bare Conv2d: no batch norm, no activation.
    predictor = nn.Conv2d(hidden, predictions, 1)
    return nn.Sequential(feature_conv, predictor)

class YoloBody(nn.Module):
    """YOLOv4 network: CSPDarkNet backbone, SPP, two-way feature fusion, three heads.

    Args:
        num_anchors: Anchors predicted per grid cell by each head.
        num_classes: Number of object classes.

    Each head outputs ``num_anchors * (5 + num_classes)`` channels.
    """

    def __init__(self, num_anchors, num_classes):
        super().__init__()
        # Per anchor: 4 box values + 1 objectness + class scores.
        head_channels = num_anchors * (5 + num_classes)

        self.CSPDarknet = CSPDarkNet([1, 2, 8, 8, 4])

        # Part 1: deepest feature map -> three convs -> SPP -> three convs
        self.conv1_1 = make_three_conv([512, 1024], 1024)
        self.spp = SpatialPyramidPooling()
        self.conv1_2 = make_three_conv([512, 1024], 2048)

        # Part 2: upsample part 1 and fuse with the middle backbone output
        self.conv2_1 = conv2d(512, 256, 1)
        self.conv2_2 = Upsample(512, 256)
        self.conv2_3 = make_five_conv([256, 512], 512)

        # Part 3: upsample part 2 and fuse with the shallowest backbone output
        self.conv3_1 = conv2d(256, 128, 1)
        self.conv3_2 = Upsample(256, 128)
        self.conv3_3 = make_five_conv([128, 256], 256)
        self.yolo_head2 = yolo_head([256, head_channels], 128)

        # Part 4: downsample part 3 and fuse with part 2
        self.downsample1 = conv2d(128, 256, kernel_size=3, stride=2)
        self.conv4_2 = make_five_conv([256, 512], 512)
        self.yolo_head1 = yolo_head([512, head_channels], 256)

        # Part 5: downsample part 4 and fuse with part 1
        self.downsample2 = conv2d(256, 512, 3, 2)
        self.conv5_2 = make_five_conv([512, 1024], 1024)
        self.yolo_head0 = yolo_head([1024, head_channels], 512)

    def forward(self, x):
        """Return ``(yolo_out0, yolo_out1, yolo_out2)``, coarsest grid first."""
        # Backbone outputs, shallow to deep. For a 608x608 input these are
        # 256 channels at 76x76, 512 at 38x38 and 1024 at 19x19.
        shallow, middle, deep = self.CSPDarknet(x)

        # Part 1
        top = self.conv1_2(self.spp(self.conv1_1(deep)))

        # Part 2
        mid_up = torch.cat([self.conv2_1(middle), self.conv2_2(top)], dim=1)
        mid_up = self.conv2_3(mid_up)

        # Part 3
        fine = torch.cat([self.conv3_1(shallow), self.conv3_2(mid_up)], dim=1)
        fine = self.conv3_3(fine)
        yolo_out2 = self.yolo_head2(fine)

        # Part 4
        mid_down = torch.cat([self.downsample1(fine), mid_up], dim=1)
        mid_down = self.conv4_2(mid_down)
        yolo_out1 = self.yolo_head1(mid_down)

        # Part 5
        coarse = torch.cat([self.downsample2(mid_down), top], dim=1)
        coarse = self.conv5_2(coarse)
        yolo_out0 = self.yolo_head0(coarse)

        return yolo_out0, yolo_out1, yolo_out2

三、模型输出的decode

1.解码流程

模型的输出有三个，分别是(B, 255, 19, 19),(B, 255, 38, 38),(B, 255, 76, 76)，因此我们需要对这三个输出分别解码。

①维度变换

首先需要将输出view成(B, A, n_ch, H, W)的形式，其中H和W就是输出的尺寸，A是锚框数量，n_ch是包含了bx, by, bw, bh, obj, cls的信息，维度为4+1+80=85。之后再进行一个维度变换，最终得到(B, 3, 19, 19, 85)的维度（以第一个为例）。此时，最后一个维度85，包含了我们解码所需的所有信息，也就是说，我们需要对前面B*3*19*19这么多的数据，都进行同样方式的解码。

②取出数据

接下来我们取出来bx, by, bw, bh, obj, cls。注意，此时除了cls之外，其他所有的维度，都减少了一维变成了(B, 3, 19, 19)，因为cls是以切片形式取的，所以维度数量不变，是(B, 3, 19, 19, 80)。

# Take the raw box values: centre (bx, by) and size (bw, bh).
# Integer indexing drops the last axis of each result.
bx, by = output[..., 0], output[..., 1]
bw, bh = output[..., 2], output[..., 3]
# Take objectness (index 4) and the class scores (index 5 onward).
# The class scores are a slice, so they keep the last axis.
obj = output[..., 4]
cls = output[..., 5:]

③初步处理

根据下面的解码图来计算：

由图，我们需要先把bx, by取一个sigmoid，把bw和bh取一个exp。这里加上了缩放因子，据说当图片中的目标既有大又有小的时候，会起作用，暂时没见到实际起作用的情况，不过先加上了，当缩放因子为1的时候，相当于不起作用。这里我们同时也对物体以及分类的置信度取sigmoid。

    # Initial transforms. scale_x_y widens the sigmoid centre offsets around
    # 0.5 (a factor of 1 leaves them unchanged); it must not be applied to
    # the exponential width/height terms.
    bx = torch.sigmoid(bx) * scale_x_y - 0.5 * (scale_x_y - 1)
    by = torch.sigmoid(by) * scale_x_y - 0.5 * (scale_x_y - 1)
    bw = torch.exp(bw)
    bh = torch.exp(bh)
    # Objectness and class scores also go through a sigmoid
    det_confs = torch.sigmoid(obj)
    cls_confs = torch.sigmoid(cls)

④构建网格

目标是找到点相对于整张图的偏移的比例，但由于我们分了网格，因此先找相对于网格的偏移比例。

这里，图像被分成了19*19个网格，假如中心点在第2行第3列的网格里面。

相对网格的偏移量肯定是小数，比如在x轴方向上偏移是0.5，在y轴上偏移是0.2，也就是在网格中间偏上的位置。

那么，以网格为单位，相对于所有网格来说，中心点的实际偏移量：

在x轴方向的偏移0.5，加上左侧已经越过的网格数2（第3列，从0开始计数），得到2.5，这就是以网格为单位、相对于整张网格图的x轴偏移量。同理，y轴方向上是偏移0.2加上已经越过的网格数1（第2行），得到1.2。

因此我们需要把x和y方向上的网格数量构建一下，并找到以网格为单位的偏移量。

    # Build the cell-index grids, both shaped (1, A, H, W): grid_x holds the
    # column index and grid_y the row index of every cell. Using A, H and W
    # (not a fixed 3 and W twice) keeps this valid for any anchor count and
    # for non-square feature maps.
    grid_x = torch.arange(W, dtype=torch.float).repeat(1, A, H, 1).to(device)
    grid_y = torch.arange(H, dtype=torch.float).repeat(1, A, W, 1).permute(0, 1, 3, 2).to(device)

    # Add the cell index to the in-cell offset to get bx and by in grid units
    bx = bx + grid_x
    by = by + grid_y

⑤计算实际偏移量

既然找到了以网格为单位的偏移量，那么偏移的比例就是偏移量除以该方向上的网格数量，这个比例也就是中心点相对于整张图的偏移比例。

另外根据解码图，我们对于w和h，还需要乘上先验框的宽、高，得到最终的bw和bh。

    # Multiply each anchor's predictions by that anchor's width and height.
    # anchors is read as a flat [w0, h0, w1, h1, ...] sequence.
    for i in range(num_anchors):
        bw[:, i, ...] *= anchors[i * 2]
        bh[:, i, ...] *= anchors[i * 2 + 1]
    # Divide by the number of grid cells to get fractions of the whole image,
    # and append a trailing axis so the four values can be concatenated.
    bx = (bx / W).unsqueeze(-1)
    by = (by / H).unsqueeze(-1)
    bw = (bw / W).unsqueeze(-1)
    bh = (bh / H).unsqueeze(-1)

⑥得到输出

现在对于中心点相对于原图的偏移量，以及宽高都得到了，我们把这四个数据结合起来，再把obj和cls的置信度都合在一起，就得到了我们最终解码后的输出，用于后续的画图等计算。

    # Concatenate the four box values and reshape to [B, A*W*H, 4];
    # objectness and class scores are flattened the same way.
    boxes = torch.cat([bx, by, bw, bh], dim=-1).reshape(B,A*W*H, 4)
    det_confs = det_confs.unsqueeze(-1).reshape(B,A*W*H, 1)
    cls_confs = cls_confs.reshape(B,A*W*H, num_classes)

    # Final layout of the last axis: [bx, by, bw, bh, objectness, class scores]
    outputs = torch.cat([boxes, det_confs, cls_confs], dim=-1)

2.代码实现

除了上面的解码函数，我们还需要提前定义锚框的数量、尺寸大小等其他参数。我们将准备工作写在一个类中，并在类的forward里面调用解码函数。下面是把解码以及类合并到一起的代码。

import numpy as np
import torch.nn as nn
import torch


def yolo_decode(output, num_classes, anchors, num_anchors, scale_x_y):
    """Decode one raw YOLO head output into normalised boxes and confidences.

    Args:
        output: Tensor of shape ``(B, num_anchors * (5 + num_classes), H, W)``.
        num_classes: Number of object classes.
        anchors: Flat sequence ``[w0, h0, w1, h1, ...]`` holding at least
            ``2 * num_anchors`` values. The results are divided by the grid
            size below, so the values are expected in grid-cell units.
        num_anchors: Number of anchors predicted per grid cell.
        scale_x_y: Factor that widens the sigmoid centre offsets around 0.5;
            1 leaves them unchanged.

    Returns:
        Tensor of shape ``(B, num_anchors * H * W, 5 + num_classes)``. The
        last axis is ``[bx, by, bw, bh, objectness, class scores...]`` with
        the box values expressed as fractions of the image size.
    """
    # Build helper tensors on whatever device the input lives on.
    device = output.device

    n_ch = 4 + 1 + num_classes
    A = num_anchors
    B = output.size(0)
    H = output.size(2)
    W = output.size(3)
    # (B, A * n_ch, H, W) -> (B, A, H, W, n_ch)
    output = output.view(B, A, n_ch, H, W).permute(0, 1, 3, 4, 2).contiguous()

    # Integer indexing drops the last axis; the class slice keeps it.
    bx, by = output[..., 0], output[..., 1]
    bw, bh = output[..., 2], output[..., 3]
    obj = output[..., 4]
    cls = output[..., 5:]

    # scale_x_y belongs to the sigmoid centre offsets, not to the exponential
    # width/height terms.
    bx = torch.sigmoid(bx) * scale_x_y - 0.5 * (scale_x_y - 1)
    by = torch.sigmoid(by) * scale_x_y - 0.5 * (scale_x_y - 1)
    bw = torch.exp(bw)
    bh = torch.exp(bh)
    det_confs = torch.sigmoid(obj)
    cls_confs = torch.sigmoid(cls)

    # Column / row index of every cell, broadcast over batch and anchors.
    # Built from W and H separately so non-square maps and any anchor count
    # work.
    grid_x = torch.arange(W, dtype=torch.float, device=device).view(1, 1, 1, W)
    grid_y = torch.arange(H, dtype=torch.float, device=device).view(1, 1, H, 1)
    bx = bx + grid_x
    by = by + grid_y

    # Scale each anchor's predictions by that anchor's width and height.
    for i in range(num_anchors):
        bw[:, i, ...] *= anchors[i * 2]
        bh[:, i, ...] *= anchors[i * 2 + 1]

    # Grid units -> fraction of the whole image, plus a trailing axis so the
    # four values can be concatenated.
    bx = (bx / W).unsqueeze(-1)
    by = (by / H).unsqueeze(-1)
    bw = (bw / W).unsqueeze(-1)
    bh = (bh / H).unsqueeze(-1)

    boxes = torch.cat([bx, by, bw, bh], dim=-1).reshape(B, A * W * H, 4)
    det_confs = det_confs.unsqueeze(-1).reshape(B, A * W * H, 1)
    cls_confs = cls_confs.reshape(B, A * W * H, num_classes)

    outputs = torch.cat([boxes, det_confs, cls_confs], dim=-1)
    return outputs

class YoloLayer(nn.Module):
    """Decoding layer for one YOLO head output.

    Args:
        img_size: Sequence whose first element is the network input size.
        anchor_masks: One list of anchor indices per feature-map scale,
            ordered like ``feature_length`` (stride 8, 16, 32). Defaults to a
            fresh empty list.
        num_classes: Number of object classes.
        anchors: Flat anchor values as a list or ``numpy.ndarray`` (arrays are
            converted to a list). Defaults to a fresh empty list.
        num_anchors: Total number of anchors described by ``anchors``.
        scale_x_y: Passed through to ``yolo_decode``.
    """

    def __init__(self, img_size, anchor_masks=None, num_classes=80, anchors=None, num_anchors=9, scale_x_y=1):
        super().__init__()
        # None stands in for "empty" so instances never share a default list.
        self.anchor_masks = [] if anchor_masks is None else anchor_masks
        self.num_classes = num_classes
        if anchors is None:
            anchors = []
        # Keep anchors as a plain list so slices of it can be concatenated.
        if isinstance(anchors, np.ndarray):
            self.anchors = anchors.tolist()
        else:
            self.anchors = anchors
        self.num_anchors = num_anchors
        # Number of values stored per anchor
        self.anchor_step = len(self.anchors) // num_anchors
        self.scale_x_y = scale_x_y
        # Feature-map sizes for strides 8, 16 and 32
        self.feature_length = [img_size[0]//8, img_size[0]//16, img_size[0]//32]
        self.img_size = img_size

    def forward(self, output):
        """Return ``output`` unchanged in training mode, else its decoded form."""
        if self.training:
            return output
        in_w = output.size(3)
        # Pick the anchor indices that belong to this feature-map size;
        # raises ValueError when the width matches none of the known sizes.
        anchor_index = self.anchor_masks[self.feature_length.index(in_w)]
        # Pixels covered by one grid cell (e.g. 32 on the coarsest map)
        stride_w = self.img_size[0] / in_w
        # Collect the values of the selected anchors
        masked_anchors = []
        for m in anchor_index:
            masked_anchors += self.anchors[m*self.anchor_step:(m+1)*self.anchor_step]
        # Express the anchors in grid cells, the unit used while decoding
        self.masked_anchors = [anchor / stride_w for anchor in masked_anchors]

        data = yolo_decode(output, self.num_classes, self.masked_anchors, len(anchor_index),scale_x_y=self.scale_x_y)
        return data