yolov3代码详解（四）

最新推荐文章于 2023-03-03 11:47:09 发布
medusa_zj
最新推荐文章于 2023-03-03 11:47:09 发布
阅读量635
点赞数 2
分类专栏：深度学习
本文链接：https://blog.csdn.net/medusa_zj/article/details/107705729
版权
深度学习专栏收录该内容
19 篇文章 3 订阅
订阅专栏
PyTorch | YOLOv3 代码详解（四）

models.py
models.py

from __future__ import division

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np

from utils.parse_config import *
from utils.utils import build_targets, to_cpu, non_max_suppression

import matplotlib.pyplot as plt
import matplotlib.patches as patches
##########################################################################
#生成模型
##########################################################################


# Called by Darknet (in models.py): takes the layer definitions parsed from config/yolov3.cfg and builds the corresponding network modules.
def create_modules(module_defs):
    """
    Build the network layer by layer from the parsed configuration.

    Args:
        module_defs: list of dicts, one per config block. The first entry is
            the hyper-parameter block (it must provide "channels", and
            "height" when a yolo layer is present); the remaining entries
            describe one layer each through their "type" key.
            NOTE: the first entry is popped, i.e. the caller's list is
            modified in place and afterwards holds only the layer entries.

    Returns:
        (hyperparams, module_list): the popped hyper-parameter dict and an
        nn.ModuleList holding one nn.Sequential per layer definition.
    """
    hyperparams = module_defs.pop(0)
    # Output channel count of every layer so far; index 0 is the channel
    # count of the network input. The last entry is the input channel count
    # of the layer being built.
    output_filters = [int(hyperparams["channels"])]
    # Must be an nn.ModuleList (not a plain list) so that torch registers the
    # sub-modules and their parameters.
    module_list = nn.ModuleList()
    # module_i is the position of the current layer in the network.
    for module_i, module_def in enumerate(module_defs):
        # Small named container for the operations that make up this layer.
        modules = nn.Sequential()
        # Layers that do not set `filters` below (maxpool, upsample, yolo,
        # unknown types) keep the channel count of the previous layer. Setting
        # the default here also covers a config whose first layer is one of
        # those types, which previously failed with an unbound local.
        filters = output_filters[-1]
        layer_type = module_def["type"]

        # Convolution (+ optional batch norm and leaky ReLU)
        if layer_type == "convolutional":
            bn = int(module_def["batch_normalize"])
            filters = int(module_def["filters"])
            kernel_size = int(module_def["size"])
            pad = (kernel_size - 1) // 2
            modules.add_module(
                f"conv_{module_i}",
                nn.Conv2d(
                    in_channels=output_filters[-1],
                    out_channels=filters,
                    kernel_size=kernel_size,
                    stride=int(module_def["stride"]),
                    padding=pad,  # 0 means no border padding
                    bias=not bn,  # the conv bias is dropped when batch norm follows
                ),
            )
            if bn:
                modules.add_module(f"batch_norm_{module_i}", nn.BatchNorm2d(filters, momentum=0.9, eps=1e-5))
            if module_def["activation"] == "leaky":
                modules.add_module(f"leaky_{module_i}", nn.LeakyReLU(0.1))

        # Max pooling
        elif layer_type == "maxpool":
            kernel_size = int(module_def["size"])
            stride = int(module_def["stride"])
            if kernel_size == 2 and stride == 1:
                # Pad right/bottom by one so a 2x2, stride-1 pool keeps the spatial size.
                modules.add_module(f"_debug_padding_{module_i}", nn.ZeroPad2d((0, 1, 0, 1)))
            maxpool = nn.MaxPool2d(kernel_size=kernel_size, stride=stride, padding=int((kernel_size - 1) // 2))
            modules.add_module(f"maxpool_{module_i}", maxpool)

        # Upsampling
        elif layer_type == "upsample":
            upsample = Upsample(scale_factor=int(module_def["stride"]), mode="nearest")
            modules.add_module(f"upsample_{module_i}", upsample)

        # Route: channel-wise concatenation of earlier layer outputs. Only a
        # placeholder module is registered; the concatenation itself is done
        # in Darknet.forward.
        elif layer_type == "route":
            layers = [int(x) for x in module_def["layers"].split(",")]
            # output_filters[1:] skips the input-channel entry so that index i
            # addresses layer i; negative indices count back from the
            # previous layer. The output has the sum of the routed channels
            # (e.g. 100 + 200 channels -> 300 channels).
            filters = sum([output_filters[1:][i] for i in layers])
            modules.add_module(f"route_{module_i}", EmptyLayer())

        # Shortcut: element-wise sum of the previous layer and the layer
        # referenced by "from" (x = layer_outputs[-1] + layer_outputs[from]).
        # Placeholder only; the sum is done in Darknet.forward.
        elif layer_type == "shortcut":
            filters = output_filters[1:][int(module_def["from"])]
            modules.add_module(f"shortcut_{module_i}", EmptyLayer())

        # YOLO detection layer
        elif layer_type == "yolo":
            # Indices of the anchors used by this layer, e.g. [6, 7, 8].
            anchor_idxs = [int(x) for x in module_def["mask"].split(",")]
            # Flat list w0,h0,w1,h1,... -> list of (w, h) pairs -> selected pairs,
            # e.g. [(116, 90), (156, 198), (373, 326)] for mask 6,7,8.
            anchors = [int(x) for x in module_def["anchors"].split(",")]
            anchors = [(anchors[i], anchors[i + 1]) for i in range(0, len(anchors), 2)]
            anchors = [anchors[i] for i in anchor_idxs]

            num_classes = int(module_def["classes"])
            img_size = int(hyperparams["height"])
            yolo_layer = YOLOLayer(anchors, num_classes, img_size)
            modules.add_module(f"yolo_{module_i}", yolo_layer)

        # Register the layer and remember its output channel count for later layers.
        module_list.append(modules)
        output_filters.append(filters)

    return hyperparams, module_list

#上采样
class Upsample(nn.Module):
    """Module wrapper around F.interpolate (used instead of the deprecated nn.Upsample)."""

    def __init__(self, scale_factor, mode="nearest"):
        super().__init__()
        # Both values are forwarded unchanged to F.interpolate on every call.
        self.mode = mode
        self.scale_factor = scale_factor

    def forward(self, x):
        """Return ``x`` resized by ``scale_factor`` using interpolation ``mode``."""
        return F.interpolate(x, scale_factor=self.scale_factor, mode=self.mode)

# 只是为了占位，以便处理route层和shortcut层
class EmptyLayer(nn.Module):
    """Parameter-free placeholder registered for 'route' and 'shortcut' layers.

    It defines no forward pass of its own; Darknet.forward performs the
    concatenation / addition for those layer types directly.
    """

    def __init__(self):
        super().__init__()

#yolo层
class YOLOLayer(nn.Module):
    """Detection layer.

    Decodes a raw feature map into box predictions and, when targets are
    supplied, also computes the training loss and a dictionary of metrics.

    Args:
        anchors: list of (width, height) anchor pairs used by this layer,
            e.g. [(116, 90), (156, 198), (373, 326)]. They are expressed in
            the same units as ``img_dim`` and divided by the stride before use.
        num_classes: number of object classes (e.g. 80).
        img_dim: input image size used to derive the stride until ``forward``
            is given an explicit ``img_dim``.
    """

    def __init__(self, anchors, num_classes, img_dim=416):
        super(YOLOLayer, self).__init__()
        self.anchors = anchors
        self.num_anchors = len(anchors)
        self.num_classes = num_classes
        self.ignore_thres = 0.5
        self.mse_loss = nn.MSELoss()  # mean squared error, used for x/y/w/h
        self.bce_loss = nn.BCELoss()  # binary cross entropy, used for confidence and classes
        # Weights of the object / no-object parts of the confidence loss.
        self.obj_scale = 1
        self.noobj_scale = 100
        self.metrics = {}
        self.img_dim = img_dim
        self.grid_size = 0  # 0 = grid offsets not computed yet

    def compute_grid_offsets(self, grid_size, cuda=True):
        """Cache the per-cell offsets and stride-scaled anchors for ``grid_size``.

        Sets ``grid_size``, ``stride``, ``grid_x``, ``grid_y``,
        ``scaled_anchors``, ``anchor_w`` and ``anchor_h`` on the layer.

        Args:
            grid_size: number of cells along one side of the feature map (e.g. 13).
            cuda: create the cached tensors as CUDA float tensors when True,
                CPU float tensors otherwise.
        """
        self.grid_size = grid_size
        g = self.grid_size
        FloatTensor = torch.cuda.FloatTensor if cuda else torch.FloatTensor
        # Size of one grid cell in image units, e.g. 416 / 13 = 32.
        self.stride = self.img_dim / self.grid_size
        # grid_x[0, 0, i, j] == j (column index): every row is [0, 1, ..., g-1].
        # grid_y[0, 0, i, j] == i (row index): the transpose of grid_x.
        self.grid_x = torch.arange(g).repeat(g, 1).view([1, 1, g, g]).type(FloatTensor)
        self.grid_y = torch.arange(g).repeat(g, 1).t().view([1, 1, g, g]).type(FloatTensor)
        # Anchors expressed in grid cells, e.g. (116, 90) / 32 -> (3.625, 2.8125).
        self.scaled_anchors = FloatTensor([(a_w / self.stride, a_h / self.stride) for a_w, a_h in self.anchors])
        # Shaped (1, num_anchors, 1, 1) so they broadcast over batch and grid.
        self.anchor_w = self.scaled_anchors[:, 0:1].view((1, self.num_anchors, 1, 1))
        self.anchor_h = self.scaled_anchors[:, 1:2].view((1, self.num_anchors, 1, 1))

    def forward(self, x, targets=None, img_dim=None):
        """Decode predictions and optionally compute the loss.

        Args:
            x: feature map of shape
                (batch, num_anchors * (num_classes + 5), grid, grid),
                e.g. (1, 255, 13, 13).
            targets: ground-truth tensor forwarded to ``build_targets``;
                ``None`` for inference.
            img_dim: input image size. When ``None`` the size stored on the
                layer (from the constructor or a previous call) is kept.

        Returns:
            (output, loss). ``output`` has shape
            (batch, num_anchors * grid * grid, 5 + num_classes) with boxes as
            (center x, center y, width, height) scaled back to image units,
            followed by objectness and class scores. ``loss`` is the literal
            0 when ``targets`` is None, otherwise the total loss tensor.
        """
        # Tensor type matching the device of the input.
        FloatTensor = torch.cuda.FloatTensor if x.is_cuda else torch.FloatTensor

        # Only overwrite the stored size when one was actually passed;
        # assigning None here would break the stride computation.
        if img_dim is not None:
            self.img_dim = img_dim
        num_samples = x.size(0)  # batch size
        grid_size = x.size(2)  # feature map side length, e.g. 13, 26 or 52

        # (batch, anchors, grid, grid, 5 + classes): raw tx, ty, tw, th,
        # objectness and class logits for every anchor in every cell.
        prediction = (
            x.view(num_samples, self.num_anchors, self.num_classes + 5, grid_size, grid_size)
            .permute(0, 1, 3, 4, 2)
            .contiguous()
        )

        # Centre offsets, objectness and class scores go through a sigmoid;
        # width and height stay raw (they are exponentiated below).
        x = torch.sigmoid(prediction[..., 0])  # center x, (batch, anchors, grid, grid)
        y = torch.sigmoid(prediction[..., 1])  # center y
        w = prediction[..., 2]  # width
        h = prediction[..., 3]  # height
        pred_conf = torch.sigmoid(prediction[..., 4])  # objectness
        pred_cls = torch.sigmoid(prediction[..., 5:])  # (batch, anchors, grid, grid, classes)

        # Refresh the cached offsets when anything they depend on changed:
        # the grid size (always true on the first call, where the cached size
        # is 0), the device of the input, or the image size (stride).
        if (
            grid_size != self.grid_size
            or self.grid_x.is_cuda != x.is_cuda
            or self.stride != self.img_dim / grid_size
        ):
            self.compute_grid_offsets(grid_size, cuda=x.is_cuda)

        # Boxes in grid-cell units: add the cell offset to the centre and
        # scale the anchors by exp(w), exp(h). `.data` keeps this out of autograd.
        pred_boxes = FloatTensor(prediction[..., :4].shape)
        pred_boxes[..., 0] = x.data + self.grid_x
        pred_boxes[..., 1] = y.data + self.grid_y
        pred_boxes[..., 2] = torch.exp(w.data) * self.anchor_w
        pred_boxes[..., 3] = torch.exp(h.data) * self.anchor_h

        # Flatten anchors and cells and scale the boxes back to image units,
        # e.g. (1, 3 * 13 * 13, 85) = (1, 507, 85).
        output = torch.cat(
            (
                pred_boxes.view(num_samples, -1, 4) * self.stride,
                pred_conf.view(num_samples, -1, 1),
                pred_cls.view(num_samples, -1, self.num_classes),
            ),
            -1,
        )

        # Inference: no targets, so there is no loss to compute.
        if targets is None:
            return output, 0
        else:
            # build_targets lives in utils.utils. According to the original
            # author's notes (confirm against that module):
            #   iou_scores  IoU between the best prediction and the ground truth
            #   class_mask  1 where the best prediction has the correct class
            #   obj_mask    1 for the best anchor of the cell containing a ground-truth box
            #   noobj_mask  1 everywhere except obj_mask cells and anchors whose IoU exceeds ignore_thres
            #   tx..th      regression targets at this feature-map scale
            #   tcls        class targets
            #   tconf       objectness target (obj_mask as float)
            iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf = build_targets(
                pred_boxes=pred_boxes,
                pred_cls=pred_cls,
                target=targets,
                anchors=self.scaled_anchors,
                ignore_thres=self.ignore_thres,
            )

            # Box losses are only taken where obj_mask is set. x / y are the
            # sigmoid outputs, w / h the raw network outputs.
            loss_x = self.mse_loss(x[obj_mask], tx[obj_mask])
            loss_y = self.mse_loss(y[obj_mask], ty[obj_mask])
            loss_w = self.mse_loss(w[obj_mask], tw[obj_mask])
            loss_h = self.mse_loss(h[obj_mask], th[obj_mask])

            # Confidence loss: object and no-object parts, weighted separately.
            loss_conf_obj = self.bce_loss(pred_conf[obj_mask], tconf[obj_mask])
            loss_conf_noobj = self.bce_loss(pred_conf[noobj_mask], tconf[noobj_mask])
            loss_conf = self.obj_scale * loss_conf_obj + self.noobj_scale * loss_conf_noobj

            # Classification loss, again only where obj_mask is set.
            loss_cls = self.bce_loss(pred_cls[obj_mask], tcls[obj_mask])
            total_loss = loss_x + loss_y + loss_w + loss_h + loss_conf + loss_cls

            # Metrics (for logging only, not part of the loss).
            cls_acc = 100 * class_mask[obj_mask].mean()
            conf_obj = pred_conf[obj_mask].mean()
            conf_noobj = pred_conf[noobj_mask].mean()
            conf50 = (pred_conf > 0.5).float()
            iou50 = (iou_scores > 0.5).float()
            iou75 = (iou_scores > 0.75).float()
            detected_mask = conf50 * class_mask * tconf
            # The 1e-16 terms guard against division by zero.
            precision = torch.sum(iou50 * detected_mask) / (conf50.sum() + 1e-16)
            recall50 = torch.sum(iou50 * detected_mask) / (obj_mask.sum() + 1e-16)
            recall75 = torch.sum(iou75 * detected_mask) / (obj_mask.sum() + 1e-16)

            self.metrics = {
                "loss": to_cpu(total_loss).item(),
                "x": to_cpu(loss_x).item(),
                "y": to_cpu(loss_y).item(),
                "w": to_cpu(loss_w).item(),
                "h": to_cpu(loss_h).item(),
                "conf": to_cpu(loss_conf).item(),
                "cls": to_cpu(loss_cls).item(),
                "cls_acc": to_cpu(cls_acc).item(),
                "recall50": to_cpu(recall50).item(),
                "recall75": to_cpu(recall75).item(),
                "precision": to_cpu(precision).item(),
                "conf_obj": to_cpu(conf_obj).item(),
                "conf_noobj": to_cpu(conf_noobj).item(),
                "grid_size": grid_size,
            }

            return output, total_loss


#被train.py,detect.py,test.py调用，用于生成神经网络结构
class Darknet(nn.Module):
    """YOLOv3 object detection model.

    Args:
        config_path: path of the network definition file
            (e.g. config/yolov3.cfg), handed to ``parse_model_config``.
        img_size: nominal input image size, stored as ``self.img_size``.
    """

    def __init__(self, config_path, img_size=416):
        super(Darknet, self).__init__()
        # parse_model_config (utils.parse_config) is expected to return a list
        # of dicts, one per config block, e.g.
        #   [{'type': 'net', 'batch': '1', 'width': '320', 'height': '320', ...},
        #    {'type': 'convolutional', 'batch_normalize': '1', 'filters': '32',
        #     'size': '3', 'stride': '1', 'pad': '1', 'activation': 'leaky'},
        #    ...]
        self.module_defs = parse_model_config(config_path)
        # create_modules pops the leading hyper-parameter block off
        # self.module_defs, so afterwards module_defs and module_list line up
        # one-to-one.
        self.hyperparams, self.module_list = create_modules(self.module_defs)
        # YOLO layers are recognised by their "metrics" attribute.
        self.yolo_layers = [layer[0] for layer in self.module_list if hasattr(layer[0], "metrics")]
        self.img_size = img_size
        self.seen = 0
        # Five int32 header values written at the start of a weights file;
        # index 3 holds `seen`.
        self.header_info = np.array([0, 0, 0, self.seen, 0], dtype=np.int32)

    def forward(self, x, targets=None):
        """Run the network.

        Args:
            x: input batch; ``x.shape[2]`` is passed to the YOLO layers as the image size.
            targets: ground truth forwarded to every YOLO layer, or None for inference.

        Returns:
            The concatenated YOLO outputs (moved to CPU via ``to_cpu``) when
            ``targets`` is None, otherwise ``(loss, yolo_outputs)`` where
            ``loss`` is the sum of the per-layer losses.
        """
        img_dim = x.shape[2]
        loss = 0
        # layer_outputs keeps every layer's output so that route / shortcut
        # layers can refer back to earlier ones.
        layer_outputs, yolo_outputs = [], []
        for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
            # Plain layers: just run the module.
            if module_def["type"] in ["convolutional", "upsample", "maxpool"]:
                x = module(x)
            # Route: concatenate the referenced layer outputs along the
            # channel dimension (a single index simply forwards that layer).
            # Unlike the shortcut addition this grows the channel count.
            elif module_def["type"] == "route":
                x = torch.cat([layer_outputs[int(layer_i)] for layer_i in module_def["layers"].split(",")], 1)
            # Shortcut: element-wise sum of the previous output and the
            # referenced layer; the tensor shape is unchanged.
            elif module_def["type"] == "shortcut":
                layer_i = int(module_def["from"])
                x = layer_outputs[-1] + layer_outputs[layer_i]
            # YOLO: decode detections and accumulate the layer loss.
            elif module_def["type"] == "yolo":
                x, layer_loss = module[0](x, targets, img_dim)
                loss += layer_loss
                yolo_outputs.append(x)
            layer_outputs.append(x)
        yolo_outputs = to_cpu(torch.cat(yolo_outputs, 1))
        return yolo_outputs if targets is None else (loss, yolo_outputs)

    def load_darknet_weights(self, weights_path):
        """Parse and load the weights stored in ``weights_path``.

        The file starts with five int32 header values followed by float32
        weights. Only convolutional layers carry weights; for each one the
        values are read in the order: batch-norm bias, weight, running mean,
        running variance (or the conv bias when there is no batch norm),
        then the conv weights.
        """
        with open(weights_path, "rb") as f:
            header = np.fromfile(f, dtype=np.int32, count=5)  # first five values are the header
            self.header_info = header  # kept so the header can be written back when saving
            self.seen = header[3]  # number of images seen during training
            weights = np.fromfile(f, dtype=np.float32)  # the rest are weights

        # Backbone-only weight files stop after the first 75 layers.
        cutoff = None
        if "darknet53.conv.74" in weights_path:
            cutoff = 75

        ptr = 0  # read position inside `weights`
        for i, (module_def, module) in enumerate(zip(self.module_defs, self.module_list)):
            if i == cutoff:
                break
            if module_def["type"] == "convolutional":
                conv_layer = module[0]
                # int() so that a string "0" from the config counts as "no
                # batch norm", matching how create_modules reads this field.
                if int(module_def["batch_normalize"]):
                    # Batch norm present: the conv has no bias of its own.
                    bn_layer = module[1]
                    num_b = bn_layer.bias.numel()
                    # Bias
                    bn_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.bias)
                    bn_layer.bias.data.copy_(bn_b)
                    ptr += num_b
                    # Weight
                    bn_w = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.weight)
                    bn_layer.weight.data.copy_(bn_w)
                    ptr += num_b
                    # Running mean
                    bn_rm = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_mean)
                    bn_layer.running_mean.data.copy_(bn_rm)
                    ptr += num_b
                    # Running variance
                    bn_rv = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(bn_layer.running_var)
                    bn_layer.running_var.data.copy_(bn_rv)
                    ptr += num_b
                else:
                    # Conv bias
                    num_b = conv_layer.bias.numel()
                    conv_b = torch.from_numpy(weights[ptr : ptr + num_b]).view_as(conv_layer.bias)
                    conv_layer.bias.data.copy_(conv_b)
                    ptr += num_b
                # Conv weights
                num_w = conv_layer.weight.numel()
                conv_w = torch.from_numpy(weights[ptr : ptr + num_w]).view_as(conv_layer.weight)
                conv_layer.weight.data.copy_(conv_w)
                ptr += num_w

    def save_darknet_weights(self, path, cutoff=-1):
        """Write the network weights to ``path``.

        The layout mirrors what ``load_darknet_weights`` reads: the header,
        then for each convolutional layer the batch-norm values (or the conv
        bias) followed by the conv weights.

        Args:
            path: path of the new weights file.
            cutoff: save layers between 0 and cutoff (exclusive);
                cutoff = -1 saves all layers.
        """
        # -1 means "all layers"; a literal [:-1] slice would silently drop the
        # last layer, so translate it to an open-ended slice.
        end = None if cutoff == -1 else cutoff
        self.header_info[3] = self.seen
        # The context manager closes the file even if a write fails.
        with open(path, "wb") as fp:
            self.header_info.tofile(fp)

            for module_def, module in zip(self.module_defs[:end], self.module_list[:end]):
                if module_def["type"] == "convolutional":
                    conv_layer = module[0]
                    # Batch-norm values come first when present (int() so a
                    # string "0" means "no batch norm"), else the conv bias.
                    if int(module_def["batch_normalize"]):
                        bn_layer = module[1]
                        bn_layer.bias.data.cpu().numpy().tofile(fp)
                        bn_layer.weight.data.cpu().numpy().tofile(fp)
                        bn_layer.running_mean.data.cpu().numpy().tofile(fp)
                        bn_layer.running_var.data.cpu().numpy().tofile(fp)
                    else:
                        conv_layer.bias.data.cpu().numpy().tofile(fp)
                    # Conv weights
                    conv_layer.weight.data.cpu().numpy().tofile(fp)