yolov3代码详解（二）

最新推荐文章于 2024-06-28 15:22:31 发布
medusa_zj
最新推荐文章于 2024-06-28 15:22:31 发布
阅读量670
点赞数 1
分类专栏：深度学习
本文链接：https://blog.csdn.net/medusa_zj/article/details/107705661
版权
深度学习专栏收录该内容
19 篇文章 3 订阅
订阅专栏
Pytorch | yolov3代码详解二

utils.py
utils.py

from __future__ import division
import math
import time
import tqdm
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.patches as patches
##########################################################################
#主要函数
##########################################################################



# Detach a tensor from the autograd graph and move it from the GPU to the CPU
def to_cpu(tensor):
    """Detach ``tensor`` from the autograd graph and return it on the CPU."""
    detached = tensor.detach()
    return detached.cpu()


##加载data/coco.names里的物体的80种类别，被test.py,detect.py和train.py引用
def load_classes(path):
    """Load class labels, one per line, from the text file at ``path``.

    Args:
        path: Path of the names file (e.g. ``data/coco.names``).

    Returns:
        A list with one string per line of the file, in file order.
    """
    # The context manager guarantees the handle is closed even if reading fails.
    with open(path, "r") as fp:
        names = fp.read().split("\n")
    # Splitting on "\n" leaves one empty trailing entry when the file ends with
    # a newline.  Drop only that entry, so that a file *without* a trailing
    # newline does not lose its last class name.
    if names and names[-1] == "":
        names.pop()
    return names


#自定义初始化权重的函数，被train.py引用
#model.apply（weights_init_normal）用来初始化模型中每一个子模块的参数。
def weights_init_normal(m):
    """Initialise one module in place; intended for ``model.apply(...)``.

    Modules whose class name contains "Conv" get weights drawn from
    N(0, 0.02).  Modules whose class name contains "BatchNorm2d" get weights
    drawn from N(1, 0.02) and a zero bias.  Any other module is left untouched.
    """
    layer_type = type(m).__name__
    if "Conv" in layer_type:
        torch.nn.init.normal_(m.weight.data, mean=0.0, std=0.02)
    elif "BatchNorm2d" in layer_type:
        torch.nn.init.normal_(m.weight.data, mean=1.0, std=0.02)
        torch.nn.init.constant_(m.bias.data, 0.0)  # bias starts at zero



#神经网络最后预测出关于416*416图像尺寸的boxes，转化到原始图像大小上去，被detect.py引用
#对图像进行detect的时候，大小设置成416*416，即是current_dim=416，得到的boxes要还原到原图像大小上去
#original_shape为原始图片
def rescale_boxes(boxes, current_dim, original_shape):
    """Map corner-format boxes from the padded square input back to the original image.

    Args:
        boxes: 2-D tensor whose first four columns are x1, y1, x2, y2 in the
            ``current_dim`` x ``current_dim`` network input.  Modified in place.
        current_dim: Side length of the square network input (e.g. 416).
        original_shape: ``(height, width)`` of the original image.

    Returns:
        The same ``boxes`` object with its first four columns rescaled.
    """
    orig_h, orig_w = original_shape
    # Ratio between the network input and the longer side of the original image.
    scale = current_dim / max(original_shape)
    # Padding that was added along each axis to make the image square.
    pad_x = max(orig_h - orig_w, 0) * scale
    pad_y = max(orig_w - orig_h, 0) * scale
    # Extent of the real image content once the padding is removed.
    unpad_w = current_dim - pad_x
    unpad_h = current_dim - pad_y
    # Columns 0/2 are x coordinates, columns 1/3 are y coordinates.
    column_params = (
        (0, pad_x, unpad_w, orig_w),
        (1, pad_y, unpad_h, orig_h),
        (2, pad_x, unpad_w, orig_w),
        (3, pad_y, unpad_h, orig_h),
    )
    for col, pad, unpadded, orig_size in column_params:
        boxes[:, col] = ((boxes[:, col] - pad // 2) / unpadded) * orig_size
    return boxes


#将中心坐标和高宽，转成左上角右下角的坐标，被下面的non_max_suppression和test.py引用
def xywh2xyxy(x):
    """Convert boxes from (center x, center y, width, height) to (x1, y1, x2, y2).

    The conversion is applied along the last dimension and the result is
    written to a newly allocated tensor with the same shape and type as ``x``.
    """
    half_w = x[..., 2] / 2
    half_h = x[..., 3] / 2
    corners = x.new(x.shape)
    corners[..., 0] = x[..., 0] - half_w  # left
    corners[..., 1] = x[..., 1] - half_h  # top
    corners[..., 2] = x[..., 0] + half_w  # right
    corners[..., 3] = x[..., 1] + half_h  # bottom
    return corners



#------------------------------------------以下三个函数为性能指标计算---------------------
#计算每个类的预测的精度，被test.py引用
#输入为：真阳性（预测框和实际框匹配设置为1）、置信度、预测的类别、真实值类别的列表
#输出为：统计precision, recall, AP, f1, ap_class指标
def ap_per_class(tp, conf, pred_cls, target_cls):
    """Compute per-class precision, recall, AP and F1.

    Source: https://github.com/rafaelpadilla/Object-Detection-Metrics.

    # Arguments
        tp:         Array of true-positive flags, one per prediction.
        conf:       Array of objectness values (0-1), one per prediction.
        pred_cls:   Array of predicted classes, one per prediction.
        target_cls: Array of ground-truth classes.
    # Returns
        (precision, recall, AP, F1, classes), where ``classes`` is the sorted
        array of unique ground-truth classes cast to int32.
    """
    # Visit predictions from the most to the least confident one
    # (argsort is ascending, hence the negation).
    order = np.argsort(-conf)
    tp, conf, pred_cls = tp[order], conf[order], pred_cls[order]

    # Sorted, de-duplicated ground-truth classes.
    unique_classes = np.unique(target_cls)

    # One precision/recall/AP entry is collected per evaluated class.
    ap, p, r = [], [], []
    for c in tqdm.tqdm(unique_classes, desc="Computing AP"):
        is_c = pred_cls == c
        n_gt = (target_cls == c).sum()  # number of ground-truth objects of class c
        n_p = is_c.sum()  # number of predictions of class c

        if n_p == 0 and n_gt == 0:
            continue
        if n_p == 0 or n_gt == 0:
            # Nothing predicted (or nothing to find): all metrics are zero.
            ap.append(0)
            r.append(0)
            p.append(0)
            continue

        # Running counts of true and false positives down the ranked list.
        hits = tp[is_c]
        tpc = hits.cumsum()
        fpc = (1 - hits).cumsum()

        recall_curve = tpc / (n_gt + 1e-16)
        precision_curve = tpc / (tpc + fpc)

        # The final point of each curve is the overall recall / precision.
        r.append(recall_curve[-1])
        p.append(precision_curve[-1])

        # Area under the recall-precision curve.
        ap.append(compute_ap(recall_curve, precision_curve))

    # F1 is the harmonic mean of precision and recall.
    p, r, ap = np.array(p), np.array(r), np.array(ap)
    f1 = 2 * p * r / (p + r + 1e-16)

    return p, r, ap, f1, unique_classes.astype("int32")

# Compute the AP from the recall and precision curves; called by ap_per_class above
def compute_ap(recall, precision):
    """Return the area under the recall-precision curve.

    Code originally from https://github.com/rbgirshick/py-faster-rcnn.

    # Arguments
        recall:    The recall curve (list).
        precision: The precision curve (list).
    # Returns
        The average precision as computed in py-faster-rcnn.
    """
    # Pad both curves with sentinel values at each end.
    mrec = np.concatenate(([0.0], recall, [1.0]))
    mpre = np.concatenate(([0.0], precision, [0.0]))

    # Precision envelope: every entry becomes the maximum of itself and all
    # entries to its right (a running maximum taken from the right-hand end).
    mpre = np.maximum.accumulate(mpre[::-1])[::-1]

    # Positions where the recall value changes between neighbouring points.
    steps = np.flatnonzero(mrec[1:] != mrec[:-1])

    # Sum of (recall step) * (precision at the end of the step).
    recall_delta = mrec[steps + 1] - mrec[steps]
    return np.sum(recall_delta * mpre[steps + 1])

#统计batch检测性能指标，被test.py引用
#输入为经过nms得到的预测框，#输出为【预测框和实际框匹配设置为1（真阳性），置信度，标签】  如果预测框有3个，则输出维度也是3个
def get_batch_statistics(outputs, targets, iou_threshold):
    """ Compute true positives, predicted scores and predicted labels per sample

    # Arguments
        outputs: Per-image detections (indexable by sample); an entry is either
            None or a 2-D tensor whose columns 0-3 are the box, column 4 the
            score and the last column the predicted label.  Per the original
            notes this is the NMS result with 7 columns per box.
        targets: 2-D tensor whose column 0 is the sample index, column 1 the
            class label and the remaining columns the box.
        iou_threshold: Minimum IoU for a prediction to count as a true positive.
    # Returns
        A list with one ``[true_positives, pred_scores, pred_labels]`` entry
        per image whose output is not None.
    """
    # NOTE(review): the boxes are handed to bbox_iou with its default
    # x1y1x2y2=True, so both pred_boxes and the target boxes are presumably in
    # corner format by the time they get here - confirm against the caller.
    batch_metrics = []
    for sample_i in range(len(outputs)):   # one image per iteration

        # Images without any detection contribute no entry at all.
        if outputs[sample_i] is None:
            continue

        output = outputs[sample_i]  # detections of the current image
        pred_boxes = output[:, :4]  # box coordinates of each detection
        pred_scores = output[:, 4]  # score (column 4) of each detection
        pred_labels = output[:, -1] # predicted class label of each detection

        # One flag per detection, initialised to 0; set to 1 when the detection
        # is matched to a ground-truth box.
        true_positives = np.zeros(pred_boxes.shape[0])

        # Ground-truth rows of this image with the sample-index column dropped.
        annotations = targets[targets[:, 0] == sample_i][:, 1:]
        target_labels = annotations[:, 0] if len(annotations) else []  # ground-truth class labels
        if len(annotations):  # the image has at least one ground-truth box
            detected_boxes = []  # indices of ground-truth boxes already matched
            target_boxes = annotations[:, 1:]

            for pred_i, (pred_box, pred_label) in enumerate(zip(pred_boxes, pred_labels)):

                # Stop once every ground-truth box has been matched
                if len(detected_boxes) == len(annotations):
                    break

                # Ignore if label is not one of the target labels
                if pred_label not in target_labels:
                    continue

                # Best-overlapping ground-truth box and its index into target_boxes.
                # NOTE(review): the IoU is taken against ALL target boxes, not
                # only those sharing pred_label, so a match with a box of a
                # different class is possible - confirm this is intended.
                iou, box_index = bbox_iou(pred_box.unsqueeze(0), target_boxes).max(0)
                if iou >= iou_threshold and box_index not in detected_boxes:
                    true_positives[pred_i] = 1
                    # Remember the matched ground-truth index so that each
                    # ground-truth box is matched by at most one prediction.
                    detected_boxes += [box_index]
        batch_metrics.append([true_positives, pred_scores, pred_labels])
                 # (match flags, scores, labels) for this image
    return batch_metrics

#------------------------------------------以上为性能指标计算---------------------

#求anchor与真实框的交并比，被下面的build_targets引用
def bbox_wh_iou(wh1, wh2):
    """IoU computed from widths and heights only.

    ``wh1`` holds a single (width, height) pair and ``wh2`` holds one
    (width, height) pair per row.  The intersection is min-width times
    min-height, i.e. the two boxes are compared as if they shared a corner.
    Returns one IoU value per row of ``wh2``.
    """
    wh2_t = wh2.t()  # transpose so that row 0 holds widths and row 1 heights
    single_w, single_h = wh1[0], wh1[1]
    many_w, many_h = wh2_t[0], wh2_t[1]
    overlap = torch.min(single_w, many_w) * torch.min(single_h, many_h)
    # The epsilon keeps the denominator away from zero.
    combined = (single_w * single_h + 1e-16) + many_w * many_h - overlap
    return overlap / combined


#计算iou
def bbox_iou(box1, box2, x1y1x2y2=True):
    """
    Returns the IoU of two sets of bounding boxes.

    With ``x1y1x2y2=True`` the four columns are read as corner coordinates
    (x1, y1, x2, y2); otherwise they are read as (center x, center y, width,
    height) and converted first.  Widths and heights are measured with a
    ``+ 1`` offset, and the intersection is clamped at zero.
    """

    def _corners(box):
        # Return (x1, y1, x2, y2) column views for ``box`` in the active format.
        if x1y1x2y2:
            return box[:, 0], box[:, 1], box[:, 2], box[:, 3]
        left = box[:, 0] - box[:, 2] / 2
        right = box[:, 0] + box[:, 2] / 2
        top = box[:, 1] - box[:, 3] / 2
        bottom = box[:, 1] + box[:, 3] / 2
        return left, top, right, bottom

    b1_x1, b1_y1, b1_x2, b1_y2 = _corners(box1)
    b2_x1, b2_y1, b2_x2, b2_y2 = _corners(box2)

    # Width and height of the overlapping rectangle (zero when disjoint).
    inter_w = torch.clamp(torch.min(b1_x2, b2_x2) - torch.max(b1_x1, b2_x1) + 1, min=0)
    inter_h = torch.clamp(torch.min(b1_y2, b2_y2) - torch.max(b1_y1, b2_y1) + 1, min=0)
    inter_area = inter_w * inter_h

    # Individual box areas.
    b1_area = (b1_x2 - b1_x1 + 1) * (b1_y2 - b1_y1 + 1)
    b2_area = (b2_x2 - b2_x1 + 1) * (b2_y2 - b2_y1 + 1)

    # The epsilon guards against a zero union.
    return inter_area / (b1_area + b2_area - inter_area + 1e-16)


#非最大值抑制
def non_max_suppression(prediction, conf_thres=0.5, nms_thres=0.4):
    """
    Removes detections with an object confidence below ``conf_thres`` and
    performs non-maximum suppression to further filter detections.

    Steps, per image:
      1. Keep only the rows whose object confidence (column 4) is at least
         ``conf_thres``.
      2. Score each row as object confidence times its highest class score
         and sort the rows by that score, highest first.
      3. Repeatedly take the top remaining row, collect every remaining row
         that has the same predicted class and an IoU above ``nms_thres`` with
         it, replace the top row's box by the confidence-weighted average of
         those boxes, keep it, and drop the collected rows.

    Args:
        prediction: Tensor of shape (batch, num_boxes, 5 + num_classes) laid
            out as (center x, center y, width, height, object_conf,
            class scores...).  Per the original notes this is the
            concatenation of the three YOLO layers, e.g. (batch, 10647, 85).
            NOTE: the first four columns are converted to corner format
            IN PLACE, so the caller's tensor is modified.
        conf_thres: Minimum object confidence for a box to be considered.
        nms_thres: IoU above which same-class boxes are merged.

    Returns:
        A list with one entry per image: either None (no detections) or a
        tensor of shape (num_kept, 7) laid out as
        (x1, y1, x2, y2, object_conf, class_score, class_pred).
    """
    # From (center x, center y, width, height) to (x1, y1, x2, y2)
    prediction[..., :4] = xywh2xyxy(prediction[..., :4])
    output = [None for _ in range(len(prediction))]
    for image_i, image_pred in enumerate(prediction):
        # Filter out confidence scores below threshold
        image_pred = image_pred[image_pred[:, 4] >= conf_thres]
        # If none are remaining => process next image
        if not image_pred.size(0):
            continue
        # Object confidence times class confidence; .max(1)[0] is the highest
        # class score of each row.
        score = image_pred[:, 4] * image_pred[:, 5:].max(1)[0]
        # Sort rows by score, highest first
        image_pred = image_pred[(-score).argsort()]

        # keepdim=True keeps both results as (N, 1) columns so they can be
        # concatenated with the (N, 5) box/confidence columns below.
        class_confs, class_preds = image_pred[:, 5:].max(1, keepdim=True)
        detections = torch.cat((image_pred[:, :5], class_confs.float(), class_preds.float()), 1)

        # Perform non-maximum suppression
        keep_boxes = []
        while detections.size(0):
            # IoU of the top-scoring box against every remaining box
            # (unsqueeze(0) turns the single box into a (1, 4) batch).
            large_overlap = bbox_iou(detections[0, :4].unsqueeze(0), detections[:, :4]) > nms_thres
            label_match = detections[0, -1] == detections[:, -1]
            # Indices of boxes with lower confidence scores, large IOUs and matching labels
            invalid = large_overlap & label_match
            # The top box must always be consumed; otherwise a degenerate box
            # whose IoU with itself is not above the threshold (e.g. NaN
            # coordinates) would keep the loop running forever.
            invalid[0] = True
            weights = detections[invalid, 4:5]  # object confidences, shape (k, 1)
            # Merge overlapping bboxes by order of confidence
            detections[0, :4] = (weights * detections[invalid, :4]).sum(0) / weights.sum()
            keep_boxes += [detections[0]]
            # Drop everything that was merged and continue with the rest
            detections = detections[~invalid]
        if keep_boxes:
            # Images without kept boxes stay None in the output list.
            output[image_i] = torch.stack(keep_boxes)

    return output


#在model.py之YOLOLayer之forward中存在targets时（训练时和test时）被引用，
#为计算损失值等做准备
#随着YOLOLayer层被调用3次，每次有不同的anchors，想明白其中一次调用即可
def build_targets(pred_boxes, pred_cls, target, anchors, ignore_thres):
    """Build the per-cell training targets and masks for one YOLO layer.

    # Arguments
        pred_boxes: Predicted boxes; dims 0, 1, 2 are read as batch size,
            number of anchors and grid size.  The original notes give the
            shape as [1, 3, 13, 13, 4] with (x, y, w, h) last.
        pred_cls: Predicted class scores; the last dim is the class count.
            The original notes give the shape as [1, 3, 13, 13, 80].
        target: 2-D tensor with columns (sample index, class, x, y, w, h).
            Per the original notes x, y, w, h lie in 0-1; they are multiplied
            by the grid size here.
        anchors: Anchor (w, h) pairs.  Per the original notes these are the
            scaled anchors, i.e. already in feature-map units, shape (3, 2).
        ignore_thres: Anchor/target IoU above which a cell is removed from
            the no-object mask.
    # Returns
        (iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf)
    """
    # NOTE(review): the masks are ByteTensors (uint8); recent PyTorch releases
    # expect boolean masks for indexing - confirm against the callers before
    # changing the dtype.
    ByteTensor = torch.cuda.ByteTensor if pred_boxes.is_cuda else torch.ByteTensor
    FloatTensor = torch.cuda.FloatTensor if pred_boxes.is_cuda else torch.FloatTensor

    nB = pred_boxes.size(0)  # batch size
    nA = pred_boxes.size(1)  # number of anchors
    nC = pred_cls.size(-1)   # number of classes
    nG = pred_boxes.size(2)  # grid (feature map) size

    # Output tensors
    obj_mask = ByteTensor(nB, nA, nG, nG).fill_(0)      # 1 where a target is assigned; starts all 0
    noobj_mask = ByteTensor(nB, nA, nG, nG).fill_(1)    # 1 where no target is assigned; starts all 1
    class_mask = FloatTensor(nB, nA, nG, nG).fill_(0)
    iou_scores = FloatTensor(nB, nA, nG, nG).fill_(0)
    tx = FloatTensor(nB, nA, nG, nG).fill_(0)
    ty = FloatTensor(nB, nA, nG, nG).fill_(0)
    tw = FloatTensor(nB, nA, nG, nG).fill_(0)
    th = FloatTensor(nB, nA, nG, nG).fill_(0)
    tcls = FloatTensor(nB, nA, nG, nG, nC).fill_(0)     # one-hot class target per cell

    # Convert to position relative to box
    target_boxes = target[:, 2:6] * nG   # (x, y, w, h) scaled to grid units, shape (num_targets, 4)
    gxy = target_boxes[:, :2]   # centers, shape (num_targets, 2)
    gwh = target_boxes[:, 2:]   # sizes, shape (num_targets, 2)
    # Get anchors with best iou
    # IoU here only compares widths and heights; result has one row per anchor
    # and one column per target.
    ious = torch.stack([bbox_wh_iou(anchor, gwh) for anchor in anchors])
    best_ious, best_n = ious.max(0)  # best IoU and best anchor index for each target
    # Separate target values
    b, target_labels = target[:, :2].long().t()   # sample index and class label of each target
    gx, gy = gxy.t()   # transpose from (n, 2) to (2, n) so the rows unpack
    gw, gh = gwh.t()
    gi, gj = gxy.long().t()  # .long() truncates, giving the grid cell that contains each center,
                             # e.g. a center at (10.4, 2.2) falls in cell (10, 2)
    # Set masks
    # [b, best_n, gj, gi] addresses, for every target, its sample, its best
    # anchor and the grid cell containing its center.
    obj_mask[b, best_n, gj, gi] = 1
    noobj_mask[b, best_n, gj, gi] = 0  # the opposite of obj_mask at the same positions

    # Set noobj mask to zero where iou exceeds ignore threshold

    # Unlike the step above, this is not limited to the single best anchor:
    # every anchor whose IoU with the target exceeds the threshold is removed
    # from the no-object mask at the target's cell.
    for i, anchor_ious in enumerate(ious.t()):  # one iteration per target
        noobj_mask[b[i], anchor_ious > ignore_thres, gj[i], gi[i]] = 0



    # Coordinates
    # The x, y targets are offsets of the center inside its grid cell, and the
    # w, h targets are log-ratios relative to the best anchor (computed below).
    tx[b, best_n, gj, gi] = gx - gx.floor()  # gx.floor() / gy.floor() is the cell's top-left corner
    ty[b, best_n, gj, gi] = gy - gy.floor()
    # Width and height
    tw[b, best_n, gj, gi] = torch.log(gw / anchors[best_n][:, 0] + 1e-16)
    th[b, best_n, gj, gi] = torch.log(gh / anchors[best_n][:, 1] + 1e-16)
    # i.e. log of the target size divided by the size of its best anchor


    # One-hot encoding of label
    tcls[b, best_n, gj, gi, target_labels] = 1



    # Compute label correctness and iou at best anchor
    # class_mask is 1 where the arg-max of the predicted class scores at the
    # assigned position equals the target label.
    class_mask[b, best_n, gj, gi] = (pred_cls[b, best_n, gj, gi].argmax(-1) == target_labels).float()
    # iou_scores holds the IoU between the prediction at the assigned position
    # and its target box (both in center/size format).
    iou_scores[b, best_n, gj, gi] = bbox_iou(pred_boxes[b, best_n, gj, gi], target_boxes, x1y1x2y2=False)

    tconf = obj_mask.float()
    # tconf is obj_mask as float: the confidence label, 1 at assigned positions.
    return iou_scores, class_mask, obj_mask, noobj_mask, tx, ty, tw, th, tcls, tconf
    # iou_scores  IoU between the assigned prediction and its target
    # class_mask  1 where the assigned prediction has the correct class
    # obj_mask    1 at the best anchor of the cell containing a target center
    # noobj_mask  0 where obj_mask is 1 and where the anchor IoU exceeds the threshold, 1 elsewhere
    # tx, ty, tw, th  regression targets for this feature-map size
    # tconf       target confidence, i.e. obj_mask converted to float