YOLO-V3-SPP models.py详细解读-CSDN博客

本文链接：https://blog.csdn.net/qq_38109282/article/details/117374610
文章前言

该文链接至
YOLO-V3-SPP
有兴趣请查看上文对YOLO-V3-SPP的详细解读
model流程

（霹雳吧啦Wz 分享的源码版本）
在这里插入图片描述
models.py

目前暂时更新到源码注释部分，后面会为该 models.py 的流程补充一张流程图
在霹雳吧啦Wz对源码的注释的基础上，我也对源码进行了详细的解析
（阅读时请先忽略与 ONNX 导出相关的代码；我还没有接触过 ONNX 模型，待用到时再补充详细注释）
from build_utils.layers import *
from build_utils.parse_config import *
from build_utils import torch_utils

# Module-wide switch: the functions and classes in this file that test it take their
# ONNX-export branches when it is True.
ONNX_EXPORT = False

# Note: the ": list" after the first parameter below is a type hint; Python does not enforce it at runtime.
def create_modules(modules_defs: list, img_size):
    """
    Build the network's layer blocks from the parsed ``.cfg`` layer definitions.

    :param modules_defs: list of per-layer dicts parsed from the .cfg file. Its first
        entry (the ``[net]`` block) is removed IN PLACE, so after the call the
        caller's list holds exactly one entry per returned module.
    :param img_size: an int (expanded to a two-element list) or a two-element
        sequence; it is handed to every YOLOLayer and sizes the upsample layers
        when ONNX_EXPORT is set.
    :return: ``(module_list, routs_binary)`` - an ``nn.ModuleList`` with one module per
        remaining definition, and a list of bools of the same length in which
        ``True`` marks a layer whose output is recorded for later use (route and
        shortcut sources, and convolutions that have no batch-norm).
    :raises TypeError: if a convolutional layer's ``size`` is not an int.
    """
    img_size = [img_size] * 2 if isinstance(img_size, int) else img_size
    # Drop the [net] entry: it carries training hyper-parameters, not a layer.
    modules_defs.pop(0)
    # Output channels of everything built so far. Index 0 is the 3-channel input
    # image, so the entry for layer n sits at index n + 1.
    output_filters = [3]
    module_list = nn.ModuleList()
    # Indices of layers whose output is consumed by a deeper layer.
    routs = []
    yolo_index = -1

    # i is the layer index, starting at 0 for the first definition after [net].
    for i, mdef in enumerate(modules_defs):
        modules = nn.Sequential()
        # Layer types that do not set a channel count (maxpool, upsample, yolo,
        # unrecognised) keep the previous layer's. Seeding the value here also keeps
        # the bookkeeping below from failing when such a layer comes first.
        filters = output_filters[-1]

        if mdef["type"] == "convolutional":
            bn = mdef["batch_normalize"]  # 1 or 0 / use or not
            filters = mdef["filters"]
            k = mdef["size"]  # kernel size
            stride = mdef["stride"] if "stride" in mdef else (mdef['stride_y'], mdef["stride_x"])
            if not isinstance(k, int):
                raise TypeError("conv2d filter size must be int type.")
            modules.add_module("Conv2d", nn.Conv2d(in_channels=output_filters[-1],
                                                   out_channels=filters,
                                                   kernel_size=k,
                                                   stride=stride,
                                                   padding=k // 2 if mdef["pad"] else 0,
                                                   bias=not bn))

            if bn:
                modules.add_module("BatchNorm2d", nn.BatchNorm2d(filters))
            else:
                # A convolution without batch-norm is a YOLO predictor; its output
                # goes into the yolo layer, so it must be recorded.
                routs.append(i)

            if mdef["activation"] == "leaky":
                modules.add_module("activation", nn.LeakyReLU(0.1, inplace=True))

        elif mdef["type"] == "BatchNorm2d":
            pass

        elif mdef["type"] == "maxpool":
            k = mdef["size"]  # kernel size
            stride = mdef["stride"]
            modules = nn.MaxPool2d(kernel_size=k, stride=stride, padding=(k - 1) // 2)

        elif mdef["type"] == "upsample":
            if ONNX_EXPORT:  # explicitly state size, avoid scale_factor
                g = (yolo_index + 1) * 2 / 32  # gain
                modules = nn.Upsample(size=tuple(int(x * g) for x in img_size))
            else:
                modules = nn.Upsample(scale_factor=mdef["stride"])

        elif mdef["type"] == "route":  # [-2],  [-1,-3,-5,-6], [-1, 61]
            layers = mdef["layers"]
            # Negative entries are offsets from the current layer and index
            # output_filters directly from its end. Non-negative entries are absolute
            # layer indices and need +1 to skip the image entry. Zero is treated as
            # absolute here, matching the routs computation on the next statement.
            filters = sum([output_filters[l + 1 if l >= 0 else l] for l in layers])
            routs.extend([i + l if l < 0 else l for l in layers])
            modules = FeatureConcat(layers=layers)

        elif mdef["type"] == "shortcut":
            layers = mdef["from"]
            # The fused output keeps the channel count of the layer just before.
            filters = output_filters[-1]
            # Only the first "from" offset is recorded as a layer to keep.
            routs.append(i + layers[0])
            modules = WeightedFeatureFusion(layers=layers, weight="weights_type" in mdef)

        elif mdef["type"] == "yolo":
            yolo_index += 1  # which yolo layer this is: 0, 1, 2
            stride = [32, 16, 8]  # stride given to the 1st, 2nd and 3rd yolo layer
            # The cfg lists all anchors; "mask" selects the ones used by this layer.
            modules = YOLOLayer(anchors=mdef["anchors"][mdef["mask"]],  # anchor list
                                nc=mdef["classes"],  # number of classes
                                img_size=img_size,
                                stride=stride[yolo_index])

            # Initialize preceding Conv2d() bias (https://arxiv.org/pdf/1708.02002.pdf section 3.3)
            # Best effort: any failure is reported and construction continues.
            try:
                # The layer registered just before the yolo layer is its predictor;
                # element 0 of that Sequential is the Conv2d.
                predictor = module_list[-1][0]
                # The bias is a leaf parameter that requires grad, so editing a view
                # of it in place is only allowed with autograd recording disabled.
                with torch.no_grad():
                    bias = predictor.bias.view(modules.na, -1)  # one row per anchor
                    bias[:, 4] += -4.5  # objectness column
                    bias[:, 5:] += math.log(0.6 / (modules.nc - 0.99))  # class columns
            except Exception as e:
                print('WARNING: smart bias initialization failure.', e)
        else:
            print("Warning: Unrecognized Layer Type: " + mdef["type"])

        # Register module list and number of output filters
        module_list.append(modules)
        output_filters.append(filters)

    # One flag per layer; True where the layer's output has to be kept.
    routs_binary = [False] * len(modules_defs)
    for i in routs:
        routs_binary[i] = True
    return module_list, routs_binary

# A YOLOLayer follows each of the network's three predictor convolutions.
# It is instantiated in create_modules.
class YOLOLayer(nn.Module):
    """
    Post-process the raw output of one YOLO predictor.

    In training mode the prediction is only reshaped; otherwise it is also decoded
    into boxes using the layer's anchors, grid offsets and stride.
    """

    def __init__(self, anchors, nc, img_size, stride):
        """
        :param anchors: anchor sizes for this layer; converted with ``torch.Tensor``
            and viewed as (1, na, 1, 1, 2), so one (w, h) pair per anchor.
        :param nc: number of classes.
        :param img_size: two-element sequence, read only when ONNX_EXPORT is set.
        :param stride: stride of this layer; anchors are divided by it and decoded
            boxes are multiplied by it.
        """
        super().__init__()
        self.anchors = torch.Tensor(anchors)
        self.stride = stride
        self.na = len(anchors)  # number of anchors
        self.nc = nc  # number of classes
        # Values predicted per anchor: x, y, w, h, obj, then one score per class.
        self.no = nc + 5
        # Grid size is unknown until the first forward pass (or ONNX setup below).
        self.nx = 0
        self.ny = 0
        self.ng = (0, 0)
        # Anchors expressed in grid cells instead of input pixels.
        self.anchor_vec = self.anchors / self.stride
        # Singleton dimensions let this broadcast against (bs, na, ny, nx, 2).
        self.anchor_wh = self.anchor_vec.view(1, self.na, 1, 1, 2)
        self.grid = None

        if ONNX_EXPORT:
            self.training = False
            grid_x = img_size[1] // stride
            grid_y = img_size[0] // stride
            self.create_grids((grid_x, grid_y))  # number x, y grid points

    def create_grids(self, ng=(13, 13), device="cpu"):
        """
        Store a new grid size and, outside training mode, rebuild the cell offsets.

        :param ng: (nx, ny) number of grid points along x and y.
        :param device: device on which the offsets are created and to which the
            anchor tensors are moved.
        :return: None
        """
        self.nx, self.ny = ng
        self.ng = torch.tensor(ng, dtype=torch.float)

        # Training returns the raw prediction, so the offsets are only built when
        # the layer is not in training mode.
        if not self.training:
            row_ids = torch.arange(self.ny, device=device)
            col_ids = torch.arange(self.nx, device=device)
            # Both results have shape (ny, nx): yv holds row indices, xv column indices.
            yv, xv = torch.meshgrid([row_ids, col_ids])
            # Last dimension is (x, y); two leading singleton dimensions are added
            # for batch and anchor.
            cell_xy = torch.stack((xv, yv), 2)
            self.grid = cell_xy.view((1, 1, self.ny, self.nx, 2)).float()

        if self.anchor_vec.device != device:
            self.anchor_vec = self.anchor_vec.to(device)
            self.anchor_wh = self.anchor_wh.to(device)

    def forward(self, p):
        """
        :param p: predictor output of shape (bs, na * no, ny, nx).
        :return: in training mode, ``p`` reshaped to (bs, na, ny, nx, no); under
            ONNX_EXPORT, a (na * nx * ny, no) tensor of decoded values; otherwise
            a tuple of the decoded boxes viewed as (bs, -1, no) and the reshaped ``p``.
        """
        if ONNX_EXPORT:
            bs = 1  # batch size
        else:
            bs, _, ny, nx = p.shape
            # Rebuild when no grid exists yet or the feature-map size changed.
            if self.grid is None or (self.nx, self.ny) != (nx, ny):
                self.create_grids((nx, ny), p.device)

        # (bs, na * no, ny, nx) -> (bs, na, no, ny, nx) -> (bs, na, ny, nx, no).
        # permute leaves the tensor non-contiguous, hence the contiguous() call.
        p = p.view(bs, self.na, self.no, self.ny, self.nx)
        p = p.permute(0, 1, 3, 4, 2).contiguous()  # prediction

        if self.training:
            return p

        if ONNX_EXPORT:
            # Avoid broadcasting for ANE operations
            m = self.na * self.nx * self.ny
            ng = 1. / self.ng.repeat(m, 1)
            grid = self.grid.repeat(1, self.na, 1, 1, 1).view(m, 2)
            anchor_wh = self.anchor_wh.repeat(1, 1, self.nx, self.ny, 1).view(m, 2) * ng

            p = p.view(m, self.no)
            p[:, :2] = (torch.sigmoid(p[:, 0:2]) + grid) * ng  # x, y
            p[:, 2:4] = torch.exp(p[:, 2:4]) * anchor_wh  # width, height
            p[:, 4:] = torch.sigmoid(p[:, 4:])
            p[:, 5:] = p[:, 5:self.no] * p[:, 4:5]
            return p

        # Inference / validation: decode on a copy so the raw p can be returned too.
        decoded = p.clone()
        # xy: sigmoid of the prediction plus the cell offset, in grid units.
        decoded[..., :2] = torch.sigmoid(decoded[..., :2]) + self.grid
        # wh: exp of the prediction scaled by the anchor, in grid units.
        decoded[..., 2:4] = torch.exp(decoded[..., 2:4]) * self.anchor_wh
        # Scale xywh by the layer stride.
        decoded[..., :4] *= self.stride
        # Objectness and class scores go through a sigmoid, in place.
        torch.sigmoid_(decoded[..., 4:])
        return decoded.view(bs, -1, self.no), p


class Darknet(nn.Module):
    """
    YOLOv3 spp object detection model
    """

    def __init__(self, cfg, img_size=(416, 416), verbose=False):
        """
        :param cfg: passed to ``parse_model_cfg`` to obtain the layer definitions.
        :param img_size: int or two-element sequence, forwarded to ``create_modules``;
            a normalised copy is kept in ``self.input_size``.
        :param verbose: forwarded to ``info`` (skipped when ONNX_EXPORT is set).
        """
        super(Darknet, self).__init__()
        # An int is expanded to a two-element list.
        self.input_size = [img_size] * 2 if isinstance(img_size, int) else img_size
        # Parse the .cfg file into a list of per-layer definitions.
        self.module_defs = parse_model_cfg(cfg)
        # Build the layers; routs flags the layers whose output must be kept.
        self.module_list, self.routs = create_modules(self.module_defs, img_size)
        # Indices of the YOLOLayer modules inside module_list.
        self.yolo_layers = get_yolo_layers(self)

        # Print the model description unless exporting to ONNX.
        if not ONNX_EXPORT:
            self.info(verbose)

    def forward(self, x, verbose=False):
        """
        :param x: input tensor.
        :param verbose: when True, print the output shape after every layer.
        :return: whatever ``forward_once`` returns.
        """
        return self.forward_once(x, verbose=verbose)

    def forward_once(self, x, verbose=False):
        """
        Run ``x`` through every module in order.

        :param x: input tensor.
        :param verbose: when True, print the output shape after every layer.
        :return: in training mode, the list of YOLOLayer outputs; under ONNX_EXPORT,
            those outputs concatenated along dim 0; otherwise ``(x, p)`` where ``x``
            is the first element of every YOLOLayer output concatenated along dim 1
            and ``p`` is the tuple of their second elements.
        """
        # yolo_out collects the output of every YOLOLayer; out holds one entry per
        # layer - the layer's output if routs marks it, else an empty list - so
        # that out can be indexed by layer number.
        yolo_out, out = [], []
        # Extra text appended to the verbose line of fusion / concat layers.
        detail = ''
        if verbose:
            print('0', x.shape)

        for i, module in enumerate(self.module_list):
            name = module.__class__.__name__
            if name in ["WeightedFeatureFusion", "FeatureConcat"]:  # sum, concat
                if verbose:
                    # Previous layer followed by the layer references of this module.
                    sources = [i - 1] + module.layers
                    shapes = [list(x.shape)] + [list(out[j].shape) for j in module.layers]
                    detail = ' >> ' + ' + '.join(['layer %g %s' % pair for pair in zip(sources, shapes)])
                # These modules also receive the saved outputs of earlier layers.
                x = module(x, out)  # WeightedFeatureFusion(), FeatureConcat()
            elif name == "YOLOLayer":
                # x is left unchanged; the detection output is collected separately.
                yolo_out.append(module(x))
            else:  # run module directly, i.e. mtype = 'convolutional', 'upsample', 'maxpool', 'batchnorm2d' etc.
                x = module(x)

            out.append(x if self.routs[i] else [])
            if verbose:
                print('%g/%g %s -' % (i, len(self.module_list), name), list(x.shape), detail)
                detail = ''

        if self.training:  # train
            return yolo_out
        elif ONNX_EXPORT:  # export
            p = torch.cat(yolo_out, dim=0)
            return p
        else:  # inference or test
            # Each yolo_out entry is a pair; split them into the decoded outputs and
            # the raw (training-format) outputs.
            x, p = zip(*yolo_out)  # inference output, training output
            x = torch.cat(x, 1)  # cat yolo outputs

            return x, p

    def info(self, verbose=False):
        """
        Print a description of the model.

        :param verbose: forwarded to ``torch_utils.model_info``.
        :return: None
        """
        torch_utils.model_info(self, verbose)


def get_yolo_layers(self):
    """
    Collect the indices of the "YOLOLayer" modules of a network.

    :param self: object exposing an iterable ``module_list`` (this is a module-level
        function; the argument is merely named ``self``).
    :return: list of positions, in order, of the modules whose class name is
        'YOLOLayer' (the original author notes [89, 101, 113] for this model).
    """
    indices = []
    for position, layer in enumerate(self.module_list):
        # Matching is by class name, not by isinstance.
        if layer.__class__.__name__ == 'YOLOLayer':
            indices.append(position)
    return indices