YOLOv3网络搭建

最新推荐文章于 2024-08-05 18:41:17 发布

咕噜咕噜开心加油

最新推荐文章于 2024-08-05 18:41:17 发布

阅读量2.3k

点赞数 56

文章标签： YOLO

本文链接：https://blog.csdn.net/Linxiaojiejieya/article/details/139948051

版权

3.2 YOLOv3 SPP源码解析(Pytorch版)

在train.py的63行左右，初始化模型，按住Ctrl和点击鼠标，跳转Darknet

model = Darknet(cfg).to(device)

一：Darknet

Darknet来自models.py文件

class Darknet(nn.Module): #定义这个类

def __init__(self, cfg, img_size=(416, 416), verbose=False): #需要3个参数,cfg,图像尺寸和是否需要打印信息，图像尺寸在训练过程中不起任何作用，因为在训练过程中，输入的图像尺寸是变化的，cfg 是模型配置文件的地址

super(Darknet, self).__init__() #调用Darknet类的父类的初始化方法

self.input_size = [img_size] * 2 if isinstance(img_size, int) else img_size # 这里传入的img_size只在导出ONNX模型时起作用

self.module_defs = parse_model_cfg(cfg) #解析网络对应的.cfg文件，module_defs得到的是每一模块的信息；parse_model_cfg：解析模型配置 这个函数在parse_config.py（解析配置）里，而parse_model_cfg函数传入的参数cfg是模型配置文件的地址，相当于是my_yolov3.cfg的地址

self.module_list, self.routs = create_modules(self.module_defs, img_size) #通过create_modules函数传入module_defs，img_size,得到module_list和routs。

class Darknet(nn.Module):
    """
    YOLOv3 SPP object detection model.

    The layer stack is built from a model configuration: ``cfg`` is parsed by
    ``parse_model_cfg`` and the parsed entries are turned into modules by
    ``create_modules``.
    """
    def __init__(self, cfg, img_size=(416, 416), verbose=False):
        # cfg:      handed to parse_model_cfg (the network's .cfg file).
        # img_size: an int is expanded to a two-element list, anything else is
        #           kept as given. Per the original author's note it plays no
        #           role during training.
        # verbose:  whether to print module information (not read in the
        #           lines shown here).
        super(Darknet, self).__init__()
        # Per the original note, the img_size stored here only matters when
        # exporting an ONNX model.
        self.input_size = [img_size] * 2 if isinstance(img_size, int) else img_size
        # Parse the network's .cfg file into per-module definitions.
        self.module_defs = parse_model_cfg(cfg)
        # Build the network layer by layer from the parsed definitions.
        self.module_list, self.routs = create_modules(self.module_defs, img_size)

二：create_modules函数

def create_modules(modules_defs: list, img_size): # 定义这个函数，需要传入modules_defs和img_size

img_size = [img_size] * 2 if isinstance(img_size, int) else img_size #这行代码的作用是确保变量 img_size最终是一个包含两个元素的列表。如果 img_size是一个整数，它会被转换成包含两个相同整数的列表；如果img_size本身已经是一个列表，则保持不变。

modules_defs.pop(0) # 通过pop方法移除net

output_filters = [3] # 记录每个模块输出的channel，初始化为3，因为输入的是RGB图片

module_list = nn.ModuleList() #创建一个 PyTorch 的 ModuleList对象，ModuleList是一个特殊的容器，用于存放 nn.Module子类的实例，并将它们注册为网络的一部分。

routs = [] #统计哪些特征层的输出会被后续的层使用到

yolo_index = -1 # 后面用到再说

def create_modules(modules_defs: list, img_size):
    # Set up the bookkeeping used while the network is assembled from the
    # parsed cfg entries in `modules_defs`.
    # NOTE: `modules_defs` is modified in place by the pop() below, so the
    # caller's list loses its first entry as well.

    # Normalise img_size: an int becomes a two-element list, anything else is
    # kept as given.
    img_size = [img_size] * 2 if isinstance(img_size, int) else img_size
    modules_defs.pop(0)  # cfg training hyperparams (unused); pops the first entry, which is the [net] section
    output_filters = [3]  # input channels 
    module_list = nn.ModuleList() 
    # Collects the layers whose outputs are consumed again by later layers
    # (either for feature fusion or for concatenation).
    routs = []  # list of layers which rout to deeper layers
    yolo_index = -1

for i, mdef in enumerate(modules_defs): #enumerate会生成一个包含索引和值的元组；当i=0时，mdef对应第一个模块（net已被pop移除），即下面的convolutional

1.convolutional

        if mdef["type"] == "convolutional":
            # Build Conv2d -> (BatchNorm2d) -> (LeakyReLU) for a [convolutional]
            # cfg entry, registering each piece on `modules`.
            # NOTE(review): `modules` is not created in this excerpt - presumably
            # a container made earlier in the loop body; confirm against the
            # full source.
            bn = mdef["batch_normalize"]  # 1 or 0 / use or not
            filters = mdef["filters"]
            k = mdef["size"]  # kernel size
            # A single "stride" key is used when present; otherwise the stride
            # is the pair (stride_y, stride_x).
            stride = mdef["stride"] if "stride" in mdef else (mdef['stride_y'], mdef["stride_x"])
            if isinstance(k, int):  # only integer kernel sizes are accepted
                # in_channels is the channel count recorded for the previous
                # entry; the conv bias is dropped whenever BatchNorm follows.
                modules.add_module("Conv2d", nn.Conv2d(in_channels=output_filters[-1],
                                                       out_channels=filters,
                                                       kernel_size=k,
                                                       stride=stride,
                                                       padding=k // 2 if mdef["pad"] else 0,
                                                       bias=not bn))
            else:
                raise TypeError("conv2d filter size must be int type.")

            if bn:
                modules.add_module("BatchNorm2d", nn.BatchNorm2d(filters))
            else:
                # A convolution without a BN layer is taken to be a YOLO predictor.
                routs.append(i)  # detection output (goes into yolo layer)

            # Only "leaky" adds an activation module; any other value adds nothing.
            if mdef["activation"] == "leaky":
                modules.add_module("activation", nn.LeakyReLU(0.1, inplace=True))
            else:
                pass

然后跳转到106行左右，module_list.append(modules) # 将modules添加到module_list

output_filters.append(filters) # 将该模块输出的通道数（filters）添加到output_filters

2.shortcut

经过4个convolutional循环后，跳转到shortcut，先回忆下shortcut

[shortcut]    
from=-3      
activation=linear

layers=[-3]，filters=64，因为上一层输出特征矩阵的通道数是64；routs中记录的索引为i+layers[0]=4-3=1，相当于用到了i=1（索引从0开始数，实际上是第二层）的卷积的输出

        elif mdef["type"] == "shortcut":
            layers = mdef["from"]  # a list; the value is read through index 0 below
            # The channel count is carried over unchanged from the previous entry.
            filters = output_filters[-1]
            # routs.extend([i + l if l < 0 else l for l in layers])
            routs.append(i + layers[0])  # remember which layer's output is reused
            # Combine the referenced feature map(s) with the current one.
            # NOTE(review): the original comment called this "concatenation", but
            # `filters` is left unchanged above, which suggests element-wise
            # fusion - confirm against WeightedFeatureFusion.
            modules = WeightedFeatureFusion(layers=layers, weight="weights_type" in mdef)

相当于对应网络的这个部分

3.maxpool

先来回忆下maxpool

[maxpool]
stride=1
size=5

        elif mdef["type"] == "maxpool":
            # Max-pooling entry. `filters` is not reassigned in this branch, so
            # whatever value it already holds is what gets recorded afterwards.
            k = mdef["size"]  # kernel size
            stride = mdef["stride"]
            # Padding of (k - 1) // 2 on each side; with stride 1 and an odd
            # kernel this keeps the spatial size unchanged.
            modules = nn.MaxPool2d(kernel_size=k, stride=stride, padding=(k - 1) // 2)

module_list.append(modules)
        output_filters.append(filters)

4.route

layers=-2

        elif mdef["type"] == "route":  # [-2],  [-1,-3,-5,-6], [-1, 61]
            layers = mdef["layers"]
            # Output channels of this entry = sum of the channels of every
            # referenced layer. output_filters starts with the 3 input channels,
            # so a positive (absolute) index l maps to l + 1; other indices
            # are used as they are and count from the end.
            filters = sum([output_filters[l + 1 if l > 0 else l] for l in layers])
            # Record the referenced module indices; negative values are
            # relative to the current index i.
            routs.extend([i + l if l < 0 else l for l in layers])
            modules = FeatureConcat(layers=layers)  # built via FeatureConcat

filters相当于返回了-2层的输出512；-2层被多次利用，所以加到routs；因为layers=-2，就一个，所以返回的是-2层的输出

当layers=[-1,-3,-5,-6],filters=512*4=2048

5.yolo

我的数据集类别为4，这里因人而异

[yolo]
mask = 6,7,8  
anchors = 10,13,  16,30,  33,23,  30,61,  62,45,  59,119,  116,90,  156,198,  373,326
classes=4
num=9
jitter=.3
ignore_thresh = .7
truth_thresh = 1
random=1

stride=[32,16,8]：以512×512的输入为例，三个预测特征层的尺寸依次为16、32、64，对应512/16=32，512/32=16，512/64=8，即预测特征层相对于原图的缩放比例


        elif mdef["type"] == "yolo":
            yolo_index += 1  # which YOLO layer this is, counting from 0: [0, 1, 2]
            stride = [32, 16, 8]  # scale of each prediction feature map relative to the original image

            # Only the anchors selected by this entry's "mask" are handed over.
            # NOTE(review): indexing anchors with the mask implies array-style
            # indexing - confirm what type the cfg parser returns.
            modules = YOLOLayer(anchors=mdef["anchors"][mdef["mask"]],  # anchor list
                                nc=mdef["classes"],  # number of classes
                                img_size=img_size,
                                stride=stride[yolo_index])

anchors是所有锚框的大小，mask是锚框的索引，第一层yolo的输出大小是16，对应的是大目标，所以对应的锚框也是大尺度锚框，nc对应的是类别，stride=stride[yolo_index])代表这个yolo层的缩放比例

6.upsample

Upsample 是在深度学习中常用的一种操作，用于增加输入张量的分辨率，即增大其空间尺寸；scale_factor: 缩放因子，用于指定输入张量的空间维度应扩大多少倍。

        elif mdef["type"] == "upsample":
            # Two ways of building the upsampling module, chosen by ONNX_EXPORT.
            if ONNX_EXPORT:  # explicitly state size, avoid scale_factor
                # The target size is img_size scaled by a gain derived from how
                # many YOLO layers have been created so far.
                g = (yolo_index + 1) * 2 / 32  # gain
                modules = nn.Upsample(size=tuple(int(x * g) for x in img_size))
            else:
                # Otherwise the entry's "stride" value is used as the scale factor.
                modules = nn.Upsample(scale_factor=mdef["stride"])

全部遍历完之后，routs_binary生成114个False；将多次利用的层数改为True，返回module_list和routs_binary

    # One flag per entry of modules_defs, all False to begin with.
    routs_binary = [False] * len(modules_defs)
    # Flip the flag of every layer index collected in `routs`, i.e. the layers
    # whose outputs are used again later.
    for i in routs:
        routs_binary[i] = True
    return module_list, routs_binary

三：YOLOLayer

YOLOlayer整体理解的不太好，还是不太理解

对YOLO的输出进行处理

class YOLOLayer(nn.Module):
    """Process the output of a YOLO predictor.

    Only the constructor is part of this excerpt. It stores the anchors of
    one prediction layer together with the layer's stride and prepares the
    anchor sizes in grid units.
    """

    def __init__(self, anchors, nc, img_size, stride):
        """Store the per-layer anchor and stride information.

        Args:
            anchors: anchor (w, h) pairs for this layer, given relative to the
                original image size.
            nc: number of classes.
            img_size: accepted for interface compatibility; not read here.
            stride: scale of this prediction layer relative to the original image.
        """
        super(YOLOLayer, self).__init__()
        self.anchors = torch.Tensor(anchors)  # anchor sizes at original-image scale
        self.stride = stride  # scale factor of this layer
        self.na = len(anchors)  # number of anchors
        self.nc = nc  # number of classes
        self.no = nc + 5  # values per anchor: (x, y, w, h, obj, cls1, ...)
        # Grid width/height and grid size; zero-initialised here and
        # reassigned during the forward pass.
        self.nx, self.ny, self.ng = 0, 0, (0, 0)
        # Anchor sizes rescaled from image scale to grid scale.
        self.anchor_vec = self.anchors / self.stride
        # Layout: batch_size, na, grid_h, grid_w, wh.
        # The size-1 dimensions are placeholders that broadcasting expands
        # automatically in later operations.
        self.anchor_wh = self.anchor_vec.view(1, self.na, 1, 1, 2)
        self.grid = None

def create_grids(self, ng=(13, 13), device="cpu"): # 创建网格，传入特征图大小

self.nx, self.ny = ng #宽和高
self.ng = torch.tensor(ng, dtype=torch.float) #转化成浮点类型

if not self.training:  # 非训练模式（验证/推理）；训练模式下不需要回归到最终预测boxes，因此不必生成网格坐标

yv, xv = torch.meshgrid([torch.arange(self.ny, device=device),
                         torch.arange(self.nx, device=device)]) #xv相当于x轴坐标，yv相当于y轴坐标

xv：

tensor([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
        [ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15]], device='cuda:0')

yv：

tensor([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0],
        [ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1],
        [ 2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2],
        [ 3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3],
        [ 4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4],
        [ 5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5,  5],
        [ 6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6],
        [ 7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7,  7],
        [ 8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8,  8],
        [ 9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9],
        [10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10],
        [11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11],
        [12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12],
        [13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13, 13],
        [14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14, 14],
        [15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15]], device='cuda:0')

self.grid = torch.stack((xv, yv), 2).view((1, 1, self.ny, self.nx, 2)).float() #生成了网格的坐标

tensor([[[[[ 0.,  0.],
           [ 1.,  0.],
           [ 2.,  0.],
           [ 3.,  0.],
           [ 4.,  0.],
           [ 5.,  0.],
           [ 6.,  0.],
           [ 7.,  0.],
           [ 8.,  0.],
           [ 9.,  0.],
           [10.,  0.],
           [11.,  0.],
           [12.,  0.],
           [13.,  0.],
           [14.,  0.],
           [15.,  0.]],

          [[ 0.,  1.],
           [ 1.,  1.],
           [ 2.,  1.],
           [ 3.,  1.],
           [ 4.,  1.],
           [ 5.,  1.],
           [ 6.,  1.],
           [ 7.,  1.],
           [ 8.,  1.],
           [ 9.,  1.],
           [10.,  1.],
           [11.,  1.],
           [12.,  1.],
           [13.,  1.],
           [14.,  1.],
           [15.,  1.]],

 def forward(self, p):  # p holds the parameters predicted by the predictor
        # Turn the raw predictor output into box predictions at original-image scale.
        if ONNX_EXPORT:
            bs = 1  # batch size
        else:
            bs, _, ny, nx = p.shape  # batch_size, predict_param (e.g. 255), grid (e.g. 13), grid (e.g. 13); ny: grid height, nx: grid width
            # fix no grid bug: rebuild when the stored grid size differs from
            # the current input feature map, or when no grid exists yet.
            if (self.nx, self.ny) != (nx, ny) or self.grid is None:
                self.create_grids((nx, ny), p.device)  # grid size changed, so the grid parameters are regenerated

# View as (bs, na, no, ny, nx), then reorder to (bs, na, ny, nx, no).
p = p.view(bs, self.na, self.no, self.ny, self.nx).permute(0, 1, 3, 4, 2).contiguous()  # e.g. [1, 3, 16, 16, 9]

        # NOTE(review): no matching `if` is visible for the `else` below; the
        # excerpt appears to omit the preceding branch(es) - confirm against
        # the full source.
        else:  # inference
            # [bs, anchor, grid, grid, xywh + obj + classes]
            io = p.clone()  # inference output
            io[..., :2] = torch.sigmoid(io[..., :2]) + self.grid  # xy: centre coordinates on the feature map
            io[..., 2:4] = torch.exp(io[..., 2:4]) * self.anchor_wh  # wh yolo method: width/height on the feature map
            io[..., :4] *= self.stride  # map back to the original image scale
            # In-place sigmoid over everything from index 4 on (objectness and class scores).
            torch.sigmoid_(io[..., 4:])
            return io.view(bs, -1, self.no), p  # view [1, 3, 13, 13, 85] as [1, 507, 85]