YOLO 的从零实现(pytorch)

本文深入介绍了YOLOv1(You Only Look Once)目标检测模型,包括基本概念如交并比(IoU)的计算,以及非极大值抑制(NMS)的实现。接着详细阐述了YOLOv1的网络结构,给出使用PyTorch实现的模型和损失函数,并展示了如何计算平均精度均值(mAP)来评估模型性能。通过对YOLOv1的全面解析,有助于理解目标检测算法的工作原理。
摘要由CSDN通过智能技术生成

You Only Look Once(你只需看一次)

基础知识

交并比(IoU)

IoU 计算的是 “预测的边框” 和 “真实的边框” 的交集面积与并集面积的比值

在这里插入图片描述
在这里插入图片描述

代码实现

def intersection_over_union(boxes_preds, boxes_labels, box_format='midpoint'):
    """Compute the intersection-over-union (IoU) of paired bounding boxes.

    Args:
        boxes_preds (tensor): predicted boxes; the last dimension holds the
            four box coordinates, e.g. shape (BATCH_SIZE, 4).
        boxes_labels (tensor): ground-truth boxes with the same layout.
        box_format (str): "midpoint" if boxes are (x, y, w, h) with (x, y)
            the box centre, or "corners" if boxes are (x1, y1, x2, y2).

    Returns:
        tensor: IoU values; the last dimension has size 1 because the
        coordinates are sliced with width-1 ranges.

    Raises:
        ValueError: if ``box_format`` is neither "midpoint" nor "corners".
    """
    if box_format == "midpoint":
        # Convert centre/size boxes to corner coordinates.
        box1_x1 = boxes_preds[..., 0:1] - boxes_preds[..., 2:3] / 2
        box1_y1 = boxes_preds[..., 1:2] - boxes_preds[..., 3:4] / 2
        box1_x2 = boxes_preds[..., 0:1] + boxes_preds[..., 2:3] / 2
        box1_y2 = boxes_preds[..., 1:2] + boxes_preds[..., 3:4] / 2
        box2_x1 = boxes_labels[..., 0:1] - boxes_labels[..., 2:3] / 2
        box2_y1 = boxes_labels[..., 1:2] - boxes_labels[..., 3:4] / 2
        box2_x2 = boxes_labels[..., 0:1] + boxes_labels[..., 2:3] / 2
        box2_y2 = boxes_labels[..., 1:2] + boxes_labels[..., 3:4] / 2
    elif box_format == "corners":
        box1_x1 = boxes_preds[..., 0:1]
        box1_y1 = boxes_preds[..., 1:2]
        box1_x2 = boxes_preds[..., 2:3]
        box1_y2 = boxes_preds[..., 3:4]
        box2_x1 = boxes_labels[..., 0:1]
        box2_y1 = boxes_labels[..., 1:2]
        box2_x2 = boxes_labels[..., 2:3]
        box2_y2 = boxes_labels[..., 3:4]
    else:
        # Fail loudly instead of hitting a NameError further down.
        raise ValueError(
            f"box_format must be 'midpoint' or 'corners', got {box_format!r}"
        )

    # Corners of the overlap rectangle.
    x1 = torch.max(box1_x1, box2_x1)
    y1 = torch.max(box1_y1, box2_y1)
    x2 = torch.min(box1_x2, box2_x2)
    y2 = torch.min(box1_y2, box2_y2)

    # clamp(0) makes the overlap zero when the boxes do not intersect.
    intersection = (x2 - x1).clamp(0) * (y2 - y1).clamp(0)

    box1_area = abs((box1_x2 - box1_x1) * (box1_y2 - box1_y1))
    box2_area = abs((box2_x2 - box2_x1) * (box2_y2 - box2_y1))

    # The small constant avoids a 0/0 when both boxes have zero area.
    return intersection / (box1_area + box2_area - intersection + 1e-6)


非极大值抑制(Non-Maximum Suppression)

在这里插入图片描述

代码实现

def non_max_suppression(bboxes, iou_threshold, threshold, box_format='corners'):
    """Apply per-class non-maximum suppression to a list of boxes.

    Args:
        bboxes (list): boxes given as
            [class_pred, prob_score, c1, c2, c3, c4]; the four coordinates
            are interpreted according to ``box_format``.
        iou_threshold (float): a remaining box of the same class as the
            chosen box is discarded unless its IoU with the chosen box is
            below this value.
        threshold (float): boxes whose prob_score is not above this value
            are discarded before suppression.
        box_format (str): "midpoint" or "corners", forwarded to
            ``intersection_over_union``.

    Returns:
        list: the surviving boxes, ordered by descending prob_score.
    """
    assert type(bboxes) is list

    # Keep confident boxes only, best score first.
    candidates = sorted(
        (candidate for candidate in bboxes if candidate[1] > threshold),
        key=lambda candidate: candidate[1],
        reverse=True,
    )

    kept = []
    while candidates:
        best = candidates.pop(0)
        survivors = []
        for other in candidates:
            # Boxes of another class are never suppressed by `best`.
            if other[0] != best[0]:
                survivors.append(other)
                continue
            overlap = intersection_over_union(
                torch.tensor(best[2:]),
                torch.tensor(other[2:]),
                box_format=box_format,
            )
            if overlap < iou_threshold:
                survivors.append(other)
        candidates = survivors
        kept.append(best)
    return kept

mAP指标

在这里插入图片描述

代码实现

def mean_average_precision(pred_boxes, true_boxes, iou_threshold=0.5, box_format='midpoint', num_classes=20):
    """Calculate mean average precision (mAP) at a single IoU threshold.

    Args:
        pred_boxes (list): predicted boxes, each given as
            [train_idx, class_prediction, prob_score, c1, c2, c3, c4]; the
            four coordinates are interpreted according to ``box_format``.
        true_boxes (list): ground-truth boxes in the same layout.
        iou_threshold (float): a detection counts as correct only if its
            best IoU with a ground truth of the same image and class is
            above this value.
        box_format (str): "midpoint" or "corners", forwarded to
            ``intersection_over_union``.
        num_classes (int): number of classes; class ids 0..num_classes-1
            are evaluated.

    Returns:
        tensor: scalar mAP averaged over the classes that have at least one
        ground-truth box; ``tensor(0.)`` if no class has any ground truth.
    """
    average_precisions = []
    epsilon = 1e-6  # keeps the precision/recall divisions away from 0/0

    # One average precision per class.
    for c in range(num_classes):
        detections = [det for det in pred_boxes if det[1] == c]
        ground_truths = [gt for gt in true_boxes if gt[1] == c]

        # Classes without any ground truth do not contribute to the mean.
        total_true_bboxes = len(ground_truths)
        if total_true_bboxes == 0:
            continue

        # Group the ground truths by image once, and keep one "already
        # matched" flag per ground truth so each can be claimed only once.
        gts_by_image = {}
        for gt in ground_truths:
            gts_by_image.setdefault(gt[0], []).append(gt)
        matched = {
            img: [False] * len(gts) for img, gts in gts_by_image.items()
        }

        # Most confident detections get first pick of the ground truths.
        detections.sort(key=lambda det: det[2], reverse=True)

        TP = torch.zeros(len(detections))
        FP = torch.zeros(len(detections))

        for detection_idx, detection in enumerate(detections):
            image_gts = gts_by_image.get(detection[0], [])
            best_iou = 0
            best_gt_idx = None

            for idx, gt in enumerate(image_gts):
                iou = intersection_over_union(
                    torch.tensor(detection[3:]),
                    torch.tensor(gt[3:]),
                    box_format=box_format,
                )
                if iou > best_iou:
                    best_iou = iou
                    best_gt_idx = idx

            is_match = (
                best_gt_idx is not None
                and best_iou > iou_threshold
                and not matched[detection[0]][best_gt_idx]
            )
            if is_match:
                TP[detection_idx] = 1
                # Mark the ground truth as used so that later duplicate
                # detections of it are counted as false positives.
                matched[detection[0]][best_gt_idx] = True
            else:
                FP[detection_idx] = 1

        TP_cumsum = torch.cumsum(TP, dim=0)
        FP_cumsum = torch.cumsum(FP, dim=0)
        recalls = TP_cumsum / (total_true_bboxes + epsilon)
        precisions = torch.divide(TP_cumsum, (TP_cumsum + FP_cumsum + epsilon))
        # Prepend the (recall=0, precision=1) point before integrating.
        precisions = torch.cat((torch.tensor([1.0]), precisions))
        recalls = torch.cat((torch.tensor([0.0]), recalls))

        # Area under the precision-recall curve (trapezoidal rule).
        average_precisions.append(torch.trapz(precisions, recalls))

    if not average_precisions:
        # No class had ground truths; avoid dividing by zero.
        return torch.tensor(0.0)
    return sum(average_precisions) / len(average_precisions)

PyTorch 的代码实现

Model

在这里插入图片描述

网络模型结构

在这里插入图片描述

model.py代码实现

import torch
from torch import nn

# Layer-by-layer description of the convolutional backbone.
#
# Entry kinds:
#   tuple -> one conv layer: (kernel_size, filters, stride, padding), where
#            `filters` is the number of output channels
#   "M"   -> max-pooling layer with a 2x2 kernel and a 2x2 stride
#   list  -> [conv_tuple, conv_tuple, repeats]: the pair of conv layers is
#            repeated `repeats` times
architecture_config = [
    (7, 64, 2, 3),
    "M",
    (3, 192, 1, 1),
    "M",
    (1, 128, 1, 0),
    (3, 256, 1, 1),
    (1, 256, 1, 0),
    (3, 512, 1, 1),
    "M",
    [(1, 256, 1, 0), (3, 512, 1, 1), 4],  # pair repeated 4 times
    (1, 512, 1, 0),
    (3, 1024, 1, 1),
    "M",
    [(1, 512, 1, 0), (3, 1024, 1, 1), 2],  # pair repeated 2 times
    (3, 1024, 1, 1),
    (3, 1024, 2, 1),
    (3, 1024, 1, 1),
    (3, 1024, 1, 1),
]


# 创建一个CNN子块
class CNNBlock(nn.Module):
    """Convolution -> batch normalization -> LeakyReLU(0.1) building block.

    Args:
        in_channels (int): number of input channels of the convolution.
        out_channels (int): number of output channels of the convolution,
            also the number of features of the batch norm.
        **kwargs: forwarded to ``nn.Conv2d`` (e.g. kernel_size, stride,
            padding).
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super().__init__()
        # The convolution is created without a bias term.
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.batchnorm = nn.BatchNorm2d(out_channels)
        self.leakyrelu = nn.LeakyReLU(0.1)

    def forward(self, x):
        """Run ``x`` through conv, batch norm and the activation in order."""
        convolved = self.conv(x)
        normalized = self.batchnorm(convolved)
        return self.leakyrelu(normalized)


class Yolov1(nn.Module):
    """YOLOv1 network: a convolutional backbone followed by two linear layers.

    Args:
        in_channels (int): number of channels of the input images.
        **kwargs: forwarded to ``_create_fcs``; must provide ``split_size``,
            ``num_boxes`` and ``num_classes``.
    """

    def __init__(self, in_channels=3, **kwargs):
        super().__init__()
        self.architecture = architecture_config
        self.in_channels = in_channels
        self.darknet = self._create_conv_layers(self.architecture)
        self.fcs = self._create_fcs(**kwargs)

    def forward(self, x):
        """Return the head output for a batch of images ``x``."""
        features = self.darknet(x)
        flattened = torch.flatten(features, start_dim=1)
        return self.fcs(flattened)

    def _create_conv_layers(self, architecture):
        """Build the backbone as an ``nn.Sequential`` from ``architecture``.

        Tuples become one ``CNNBlock``, strings become a 2x2 max-pool, and
        lists ``[conv1, conv2, repeats]`` become ``repeats`` copies of the
        two conv blocks. Entries of any other type are ignored.
        """

        def make_conv(input_channels, spec):
            # spec is (kernel_size, filters, stride, padding)
            return CNNBlock(
                input_channels,
                spec[1],
                kernel_size=spec[0],
                stride=spec[2],
                padding=spec[3],
            )

        modules = []
        channels = self.in_channels  # channel count fed to the next layer

        for entry in architecture:
            entry_type = type(entry)
            if entry_type is tuple:
                modules.append(make_conv(channels, entry))
                channels = entry[1]
            elif entry_type is str:
                modules.append(nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2)))
            elif entry_type is list:
                first_spec, second_spec, repeats = entry[0], entry[1], entry[2]
                for _ in range(repeats):
                    modules.append(make_conv(channels, first_spec))
                    modules.append(make_conv(first_spec[1], second_spec))
                    channels = second_spec[1]

        return nn.Sequential(*modules)

    def _create_fcs(self, split_size, num_boxes, num_classes):
        """Build the fully connected head.

        The first linear layer expects ``1024 * S * S`` input features and
        the last one emits ``S * S * (C + B * 5)`` values, with
        S = split_size, B = num_boxes and C = num_classes.
        """
        S, B, C = split_size, num_boxes, num_classes
        flat_features = 1024 * S * S
        output_size = S * S * (C + B * 5)
        return nn.Sequential(
            nn.Flatten(),
            nn.Linear(flat_features, 4096),
            nn.Dropout(0.0),
            nn.LeakyReLU(0.1),
            nn.Linear(4096, output_size),
        )

损失函数Loss

在这里插入图片描述
loss.py代码实现

import torch
from torch import nn

from utils import intersection_over_union


class YoloLoss(nn.Module):
    """Sum-of-squares YOLOv1 loss.

    Channel layout read by ``forward`` along the last dimension, with
    C = number of classes:
        [0, C)        class scores
        C             confidence of box 1 (objectness flag in the target)
        [C+1, C+5)    box 1 as (x, y, w, h)
        C+5           confidence of box 2
        [C+6, C+10)   box 2 as (x, y, w, h)
    Only these two predicted boxes are used, whatever the value of ``B``.

    Args:
        S (int): grid size; predictions are reshaped to (-1, S, S, C + B*5).
        B (int): number of boxes per cell, used only for the reshape.
        C (int): number of classes.
    """

    def __init__(self, S=7, B=2, C=20):
        super().__init__()
        self.mse = nn.MSELoss(reduction="sum")
        self.S = S
        self.B = B
        self.C = C

        # Weights of the no-object confidence term and the coordinate term.
        self.lambda_noobj = 0.5
        self.lambda_coord = 5

    def forward(self, predictions, target):
        """Return the scalar loss for a batch.

        Args:
            predictions (tensor): network output, reshaped here to
                (-1, S, S, C + B*5).
            target (tensor): labels indexed with the channel layout
                described on the class; assumed to be (N, S, S, >= C+5).

        Returns:
            tensor: weighted sum of the box, object, no-object and class
            losses.
        """
        C = self.C
        predictions = predictions.reshape(-1, self.S, self.S, C + self.B * 5)

        conf1 = slice(C, C + 1)
        box1 = slice(C + 1, C + 5)
        conf2 = slice(C + 5, C + 6)
        box2 = slice(C + 6, C + 10)

        # IoU of each predicted box with the target box of the cell.
        iou_b1 = intersection_over_union(predictions[..., box1], target[..., box1])
        iou_b2 = intersection_over_union(predictions[..., box2], target[..., box1])
        ious = torch.cat([iou_b1.unsqueeze(0), iou_b2.unsqueeze(0)], dim=0)

        # bestbox is 0 where box 1 has the larger IoU and 1 where box 2 has.
        _, bestbox = torch.max(ious, dim=0)
        # Keep the trailing dimension so the mask broadcasts over channels.
        exists_box = target[..., conf1]

        # --- coordinate loss of the responsible box ---
        box_predictions = exists_box * (
            bestbox * predictions[..., box2]
            + (1 - bestbox) * predictions[..., box1]
        )
        box_targets = exists_box * target[..., box1]

        # Width/height are compared in square-root space. The sign/abs pair
        # keeps the square root defined for negative predicted sizes.
        pred_wh = box_predictions[..., 2:4]
        pred_wh = torch.sign(pred_wh) * torch.sqrt(torch.abs(pred_wh + 1e-6))
        box_predictions = torch.cat([box_predictions[..., 0:2], pred_wh], dim=-1)
        box_targets = torch.cat(
            [box_targets[..., 0:2], torch.sqrt(box_targets[..., 2:4])], dim=-1
        )

        box_loss = self.mse(
            torch.flatten(box_predictions, end_dim=-2),
            torch.flatten(box_targets, end_dim=-2),
        )

        # --- confidence loss of the responsible box ---
        pred_box = (
            bestbox * predictions[..., conf2]
            + (1 - bestbox) * predictions[..., conf1]
        )
        object_loss = self.mse(
            torch.flatten(exists_box * pred_box),
            torch.flatten(exists_box * target[..., conf1]),
        )

        # --- confidence loss where the target has no object ---
        no_object_loss = self.mse(
            torch.flatten((1 - exists_box) * predictions[..., conf1], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., conf1], start_dim=1),
        )
        no_object_loss += self.mse(
            torch.flatten((1 - exists_box) * predictions[..., conf2], start_dim=1),
            torch.flatten((1 - exists_box) * target[..., conf1], start_dim=1),
        )

        # --- class loss ---
        class_loss = self.mse(
            torch.flatten(exists_box * predictions[..., :C], end_dim=-2),
            torch.flatten(exists_box * target[..., :C], end_dim=-2),
        )

        return (
            self.lambda_coord * box_loss
            + object_loss
            + self.lambda_noobj * no_object_loss
            + class_loss
        )

未完待续

  • 3
    点赞
  • 12
    收藏
    觉得还不错? 一键收藏
  • 2
    评论
评论 2
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值