Reproducing the YOLOv1 Object Detection Algorithm in PyTorch

The complete project is available on GitHub:
https://github.com/yaoyi30/Pytorch_YOLOv1

The mAP on the VOC2007 test set is 0.65.
[Figures: network prediction results]

I. Main Workflow for Training an Object Detection Network

  1. Build the dataset
  2. Preprocess the data, including data augmentation, standardization, and normalization
  3. Build the network model
  4. Set hyperparameters such as the learning rate, optimizer, and loss function
  5. Train and validate

II. Brief Description of Each Step

1. Building the Dataset

This post uses the VOC2007 and VOC2012 datasets.

In the project root, create a datasets folder. Inside it, create a JPEGImages folder to hold the images, then create train.txt and val.txt files to hold the training and validation data lists. The structure is as follows:

datasets/
  JPEGImages/   # all images from VOC2007 + VOC2012
     img1.jpg
     img2.jpg
       .
       .
       .
       
  train.txt
  val.txt
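
Each line in train.txt / val.txt describes one image: the filename followed by one group of five numbers per object, in the order x1 y1 x2 y2 class_index (this is exactly the format that eval.py parses later in this post). A hypothetical line with two objects:

    img1.jpg 48 240 195 371 11 8 12 352 498 14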

2. Data Preprocessing

Resize the images to a uniform size, convert them to tensors, and then standardize them; the preprocessed images can then be fed into the network. For the training set, data augmentation can be applied to improve the network's generalization; the validation set is not augmented.

    # Training data preprocessing and augmentation settings
    train_transform = Compose([
                                    RandomHorizontalFlip(0.5),# horizontal flip
                                    RandomVerticalFlip(0.5),# vertical flip
                                    RandomScale(0.5),# random scale change
                                    RandomGaussianBlur(0.5),# Gaussian blur
                                    RandomBrightness(0.5),# brightness adjustment
                                    RandomHue(0.5),# hue adjustment
                                    RandomSaturation(0.5),# saturation adjustment
                                    RandomShift(0.5),# random shift
                                    RandomCrop(0.5),# random crop
                                    Resize(args.input_size),# resize to a uniform size; box coordinates are scaled accordingly
                                    ToTensor(),# convert to tensor; pixel values scaled to [0, 1]
                                    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])# normalize the image
                             ])
    # Validation data preprocessing
    val_transform = Compose([
                                    Resize(args.input_size),# resize to a uniform size; box coordinates are scaled accordingly
                                    ToTensor(),# convert to tensor; pixel values scaled to [0, 1]
                                    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])# normalize the image
                            ])

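Note that this custom Compose (defined in transform.py, covered below) passes the boxes and labels through every transform together with the image, unlike torchvision's Compose. A minimal usage sketch with hypothetical sample values:

    from PIL import Image
    import torch

    image = Image.open('datasets/JPEGImages/img1.jpg').convert('RGB')
    boxes = torch.FloatTensor([[48, 240, 195, 371]])  # one box, (x1, y1, x2, y2) in pixels
    labels = torch.LongTensor([11])                   # class index ('dog' in VOC)
    image, boxes, labels = train_transform(image, boxes, labels)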

3. Building the Network Model

This post builds the YOLOv1 object detection network with PyTorch:

    model = YOLO_v1(num_classes = args.nb_classes,num_bboxes=args.num_bboxes)

4. Setting the Learning Rate, Optimizer, Loss Function, and Other Hyperparameters

    # Define the loss function
    loss_function = Detect_Loss(feature_size=args.grid_size, num_bboxes=args.num_bboxes, num_classes=args.nb_classes)
    # Define the optimizer (initial learning rate and weight decay)
    optimizer = torch.optim.SGD(model.parameters(), lr=args.init_lr, momentum=args.momentum, weight_decay=args.weight_decay)
    # Define the LR scheduler; StepLR here, with a decay factor and a step interval in epochs
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,step_size=int(args.epochs * 0.3), gamma=0.1)
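
With the defaults (80 epochs), step_size = int(80 * 0.3) = 24, so the learning rate starts at 0.001 and is multiplied by 0.1 at epochs 24, 48, and 72.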

5. Training and Validation

    # Train and validate the model; the function itself is defined in utils/engine.py
    history = train_and_val(args.epochs, model, train_loader,val_loader,loss_function, optimizer,scheduler,args.output_dir,device)

III. Detailed Walkthrough of the Project Code Files

transform.py

Defines the data augmentation transforms.

from torchvision.transforms import functional as F
import random
import torch
import numpy as np
import cv2
from PIL import Image
# Resize the image to a uniform size; box coordinates change accordingly
class Resize(object):
    def __init__(self, size):
        self.size = size

    def __call__(self, image,boxes,labels):
        width, height = image.size
        image = F.resize(image, self.size)

        scale_x = self.size[1] / width
        scale_y = self.size[0] / height

        scale_tensor = torch.FloatTensor([[scale_x, scale_y, scale_x, scale_y]]).expand_as(boxes)
        boxes = boxes * scale_tensor
        
        return image,boxes,labels

# Gaussian blur
class RandomGaussianBlur(object):
    def __init__(self, prob):
        self.prob = prob

    def __call__(self, image, boxes,labels):
        if random.random() < self.prob:
            ksize = random.choice([3, 5])
            image = F.gaussian_blur(image,[ksize, ksize])

        return image, boxes,labels

# Random brightness adjustment
class RandomBrightness(object):
    def __init__(self, prob):
        self.prob = prob

    def __call__(self, image,boxes,labels):
        if random.random() < self.prob:
            adjust = random.uniform(0.5, 1.5)
            image = F.adjust_brightness(image,adjust)

        return image,boxes,labels

# Random hue adjustment
class RandomHue(object):
    def __init__(self, prob):
        self.prob = prob

    def __call__(self, image,boxes,labels):
        if random.random() < self.prob:
            adjust = random.uniform(-0.5, 0.5)
            image = F.adjust_hue(image,adjust)

        return image,boxes,labels

# Random saturation adjustment
class RandomSaturation(object):
    def __init__(self, prob):
        self.prob = prob

    def __call__(self, image,boxes,labels):
        if random.random() < self.prob:
            adjust = random.uniform(0.5, 1.5)
            image = F.adjust_saturation(image,adjust)

        return image,boxes,labels

# Random horizontal flip
class RandomHorizontalFlip(object):
    def __init__(self, prob):
        self.prob = prob

    def __call__(self, image,boxes,labels):
        if random.random() < self.prob:
            width, height = image.size
            image = F.hflip(image)
            x1, x2 = boxes[:, 0], boxes[:, 2]
            x1_new = width - x2
            x2_new = width - x1
            boxes[:, 0], boxes[:, 2] = x1_new, x2_new

        return image,boxes,labels
# Randomly change the image width
class RandomScale(object):
    def __init__(self, prob):
        self.prob = prob

    def __call__(self, image,boxes,labels):
        if random.random() < self.prob:
            width, height = image.size
            scale = random.uniform(0.8,1.2)
            image = F.resize(image,[height,int(width*scale)])

            scale_tensor = torch.FloatTensor([[scale,1,scale,1]]).expand_as(boxes)
            boxes = boxes * scale_tensor

        return image,boxes,labels
# Random shift (translation)
class RandomShift(object):
    def __init__(self, prob):
        self.prob = prob
        self.mean = [122.67891434, 116.66876762, 104.00698793]

    def __call__(self, image,boxes,labels):
        if random.random() < self.prob:
            center = (boxes[:, 2:] + boxes[:, :2]) / 2.0
            img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)

            h, w, c = img.shape
            img_out = np.zeros((h, w, c), dtype=img.dtype)
            mean_bgr = self.mean[::-1]
            img_out[:, :] = mean_bgr

            dx = random.uniform(-w * 0.2, w * 0.2)
            dy = random.uniform(-h * 0.2, h * 0.2)
            dx, dy = int(dx), int(dy)

            if dx >= 0 and dy >= 0:
                img_out[dy:, dx:] = img[:h - dy, :w - dx]
            elif dx >= 0 and dy < 0:
                img_out[:h + dy, dx:] = img[-dy:, :w - dx]
            elif dx < 0 and dy >= 0:
                img_out[dy:, :w + dx] = img[:h - dy, -dx:]
            elif dx < 0 and dy < 0:
                img_out[:h + dy, :w + dx] = img[-dy:, -dx:]

            center = center + torch.FloatTensor([[dx, dy]]).expand_as(center)  # [n, 2]
            mask_x = (center[:, 0] >= 0) & (center[:, 0] < w)  # [n,]
            mask_y = (center[:, 1] >= 0) & (center[:, 1] < h)  # [n,]
            mask = (mask_x & mask_y).view(-1, 1)  # [n, 1], mask for the boxes within the image after shift.

            boxes_out = boxes[mask.expand_as(boxes)].view(-1, 4)  # [m, 4]
            if len(boxes_out) == 0:
                return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)), boxes, labels
            shift = torch.FloatTensor([[dx, dy, dx, dy]]).expand_as(boxes_out)  # [m, 4]

            boxes_out = boxes_out + shift
            boxes_out[:, 0] = boxes_out[:, 0].clamp_(min=0, max=w)
            boxes_out[:, 2] = boxes_out[:, 2].clamp_(min=0, max=w)
            boxes_out[:, 1] = boxes_out[:, 1].clamp_(min=0, max=h)
            boxes_out[:, 3] = boxes_out[:, 3].clamp_(min=0, max=h)

            labels_out = labels[mask.view(-1)]

            image, boxes, labels = Image.fromarray(cv2.cvtColor(img_out, cv2.COLOR_BGR2RGB)), boxes_out, labels_out

        return image,boxes,labels
# Random crop
class RandomCrop(object):
    def __init__(self, prob):
        self.prob = prob

    def __call__(self, image,boxes,labels):
        if random.random() < self.prob:
            center = (boxes[:, 2:] + boxes[:, :2]) / 2.0

            w_orig, h_orig = image.size
            h = random.uniform(0.6 * h_orig, h_orig)
            w = random.uniform(0.6 * w_orig, w_orig)
            y = random.uniform(0, h_orig - h)
            x = random.uniform(0, w_orig - w)
            h, w, x, y = int(h), int(w), int(x), int(y)

            center = center - torch.FloatTensor([[x, y]]).expand_as(center)  # [n, 2]
            mask_x = (center[:, 0] >= 0) & (center[:, 0] < w)  # [n,]
            mask_y = (center[:, 1] >= 0) & (center[:, 1] < h)  # [n,]
            mask = (mask_x & mask_y).view(-1, 1)  # [n, 1], mask for the boxes within the image after crop.

            boxes_out = boxes[mask.expand_as(boxes)].view(-1, 4)  # [m, 4]
            if len(boxes_out) == 0:
                return image, boxes, labels
            shift = torch.FloatTensor([[x, y, x, y]]).expand_as(boxes_out)  # [m, 4]

            boxes_out = boxes_out - shift
            boxes_out[:, 0] = boxes_out[:, 0].clamp_(min=0, max=w)
            boxes_out[:, 2] = boxes_out[:, 2].clamp_(min=0, max=w)
            boxes_out[:, 1] = boxes_out[:, 1].clamp_(min=0, max=h)
            boxes_out[:, 3] = boxes_out[:, 3].clamp_(min=0, max=h)

            labels_out = labels[mask.view(-1)]
            box = (x, y, x + w, y + h)
            img_out = image.crop(box)

            image, boxes, labels = img_out,boxes_out,labels_out

        return image,boxes,labels
# Random vertical flip
class RandomVerticalFlip(object):
    def __init__(self, prob):
        self.prob = prob

    def __call__(self, image,boxes,labels):
        if random.random() < self.prob:
            width, height = image.size
            image = F.vflip(image)
            y1, y2 = boxes[:, 1], boxes[:, 3]
            y1_new = height - y2
            y2_new = height - y1
            boxes[:, 1], boxes[:, 3] = y1_new, y2_new

        return image,boxes,labels
# Normalization
class Normalize(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std

    def __call__(self, image,boxes,labels):
        image = F.normalize(image, mean=self.mean, std=self.std)

        return image,boxes,labels
# Convert to tensor; pixel values scaled to [0, 1]
class ToTensor(object):
    def __call__(self, image,boxes,labels):
        image = F.to_tensor(image)

        return image,boxes,labels

# Compose multiple transforms
class Compose(object):
    def __init__(self, transforms):
        self.transforms = transforms

    def __call__(self, image,boxes,labels):
        for t in self.transforms:
            image,boxes,labels = t(image,boxes,labels)

        return image,boxes,labels

train.py

Defines the main function. While running experiments, I found that SGD with a batch size of 32 and an initial learning rate of 0.001 gives fairly good results.

import os
import torch
import torch.nn as nn
from models.yolov1 import YOLO_v1
import argparse
import numpy as np
from utils.transform import Resize,Compose,ToTensor,Normalize,RandomHorizontalFlip,RandomVerticalFlip,RandomScale,\
                            RandomHue,RandomSaturation,RandomBrightness,RandomGaussianBlur,RandomCrop,RandomShift
from utils.datasets import DetData
from utils.loss import Detect_Loss
from utils.engine import train_and_val,plot_loss,plot_lr
# Training argument definitions
def get_args_parser():
    parser = argparse.ArgumentParser('Image Detection Train', add_help=False)
    # Batch size
    parser.add_argument('--batch_size', default=32, type=int,help='Batch size for training')
    # Number of training epochs
    parser.add_argument('--epochs', default=80, type=int)
    # Input image size, 448x448 by default
    parser.add_argument('--input_size', default=[448,448],nargs='+',type=int,help='images input size')
    # Dataset path
    parser.add_argument('--data_path', default='./datasets/', type=str,help='dataset path')
    # Initial learning rate, 0.001 by default
    parser.add_argument('--init_lr', default=0.001, type=float,help='SGD initial lr')
    parser.add_argument('--momentum', default=0.9, type=float,help='SGD momentum')
    parser.add_argument('--weight_decay', default=5e-4, type=float,help='SGD weight decay')
    # Whether to load pretrained weights for the backbone
    parser.add_argument('--finetune', default='./weights/resnet50_ram-a26f946b.pth',
                        help='finetune from checkpoint')
    # Number of classes
    parser.add_argument('--nb_classes', default=20, type=int,help='number of the classification types')
    # Number of grid cells per image side, 7 by default
    parser.add_argument('--grid_size', default=7, type=int,help='grid size of each image')
    # Number of bounding boxes per grid cell, 2 by default
    parser.add_argument('--num_bboxes', default=2, type=int,help='boxes number of each grid')
    # Folder for saving models, training logs, etc.
    parser.add_argument('--output_dir', default='./output_dir',help='path where to save, empty for no saving')
    # Device to use for training, CPU or GPU
    parser.add_argument('--device', default='cuda',help='device to use for training / testing')
    # Number of data-loading workers
    parser.add_argument('--num_workers', default=4, type=int)

    return parser


# Main function
def main(args):
    # Set the training device, CPU or GPU
    device = torch.device(args.device)
    # Create the output folder
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    # Training set preprocessing
    train_transform = Compose([
                                    RandomHorizontalFlip(0.5),
                                    RandomVerticalFlip(0.5),
                                    RandomScale(0.5),
                                    RandomGaussianBlur(0.5),
                                    RandomBrightness(0.5),
                                    RandomHue(0.5),
                                    RandomSaturation(0.5),
                                    RandomShift(0.5),
                                    RandomCrop(0.5),
                                    Resize(args.input_size),
                                    ToTensor(),
                                    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                             ])
    # Validation set preprocessing
    val_transform = Compose([
                                    Resize(args.input_size),
                                    ToTensor(),
                                    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
                            ])
    # Load the training data
    train_dataset = DetData(    image_path = os.path.join(args.data_path, 'JPEGImages'),
                                label_file = os.path.join(args.data_path, 'train.txt'),
                                nb_classes = args.nb_classes,
                                grid_size = args.grid_size,
                                num_bboxes = args.num_bboxes,
                                transform = train_transform)

    train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=True,
                                               num_workers=args.num_workers)
    # Load the validation data
    val_dataset = DetData(    image_path = os.path.join(args.data_path, 'JPEGImages'),
                              label_file = os.path.join(args.data_path, 'val.txt'),
                              nb_classes = args.nb_classes,
                              grid_size=args.grid_size,
                              num_bboxes=args.num_bboxes,
                              transform = val_transform)

    val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=args.batch_size, shuffle=False,
                                             num_workers=args.num_workers)
    # Define the YOLOv1 network
    model = YOLO_v1(num_classes = args.nb_classes,num_bboxes=args.num_bboxes)
    print(model)
    # Optionally load pretrained weights
    if args.finetune:
        checkpoint = torch.load(args.finetune, map_location='cpu')
        msg = model.load_state_dict(checkpoint, strict=False)
        print(msg)

    # For multi-GPU training keep this line; list the GPU ids in order inside the brackets
    model = nn.DataParallel(model,[0,1,2,3])
    # Define the loss function
    loss_function = Detect_Loss(feature_size=args.grid_size, num_bboxes=args.num_bboxes, num_classes=args.nb_classes)
    # Define the optimizer; SGD is used here
    optimizer = torch.optim.SGD(model.parameters(), lr=args.init_lr, momentum=args.momentum, weight_decay=args.weight_decay)
    # Define the LR scheduler; StepLR is used here
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer,step_size=int(args.epochs * 0.3), gamma=0.1)
    # Start training
    history = train_and_val(args.epochs, model, train_loader,val_loader,loss_function, optimizer,scheduler,args.output_dir,device)
    # Plot the loss and learning-rate curves
    plot_loss(np.arange(0,args.epochs),args.output_dir, history)
    plot_lr(np.arange(0,args.epochs),args.output_dir, history)


if __name__ == '__main__':
    args = get_args_parser()
    args = args.parse_args()
    main(args)

yolov1.py

Defines the network architecture. This post replaces the YOLOv1 backbone with ResNet50 and appends a residual block with a single convolution after the backbone, called the Neck. The final head is also a single convolution, so that the network output has shape 7×7×(num_classes + 5 × num_bboxes), where the 5 stands for (x, y, w, h, conf); the activation function is Sigmoid.
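
With the default 448×448 input, ResNet50 downsamples by a factor of 32, giving a 14×14×2048 feature map; the Neck keeps that size, and the stride-2 convolution in the head halves it to the final 7×7 grid.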

import math
import torch.nn as nn
import torchvision.models.resnet
from torchvision.models.resnet import Bottleneck
# Neck module definition
class NeckNet(nn.Module):
    def __init__(self, in_channels,out_channels):
        super(NeckNet, self).__init__()

        self.conv1 = nn.Conv2d(in_channels,out_channels, 3,stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):

        identity = x
        out = self.conv1(x)
        out = self.bn1(out)
        out += identity
        out = self.relu(out)

        return out

# YOLOv1 network definition
class YOLOv1(torchvision.models.resnet.ResNet):

    def __init__(self, block, layers, num_classes=20, num_bboxes=2):
        super(YOLOv1, self).__init__(block, layers)
        # Number of bounding boxes per grid cell, 2 by default
        self.B = num_bboxes
        # Number of predicted classes
        self.C = num_classes
        # Attach the Neck network, defined as layer5
        self.layer5 = NeckNet(2048,2048)
        # Define the head
        self.end = nn.Sequential(
            nn.Conv2d(2048, self.C + self.B * 5, 3,stride=2, padding=1),
            nn.BatchNorm2d(self.C + self.B * 5),
            nn.Sigmoid()
        )
        self._init_weights()
    # Initialize network parameters
    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()

    def forward_features(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)

        return x

    def forward(self, x):
        # Backbone + Neck
        x = self.forward_features(x)
        # Head
        x = self.end(x)
        # Permute the output to shape [batch, 7, 7, num_classes + 5 * num_bboxes]
        x = x.permute(0, 2, 3, 1)

        return x

# ResNet50 is used here, so block is Bottleneck and layers is [3, 4, 6, 3]
def YOLO_v1(num_classes=20, num_bboxes=2):
    model = YOLOv1(block = Bottleneck, layers = [3, 4, 6, 3], num_classes = num_classes, num_bboxes = num_bboxes)

    return model
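
As a quick sanity check (a minimal sketch, not part of the repo), the output shape can be verified with a dummy input; with 20 classes and 2 boxes per cell the last dimension is 20 + 5 * 2 = 30:

    import torch
    model = YOLO_v1(num_classes=20, num_bboxes=2)
    x = torch.randn(1, 3, 448, 448)  # dummy input at the training resolution
    print(model(x).shape)            # expected: torch.Size([1, 7, 7, 30])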

export_onnx.py

Converts the trained model to ONNX format.

import torch
from models.yolov1 import YOLO_v1
import argparse

def get_args_parser():
    parser = argparse.ArgumentParser('Export Onnx', add_help=False)
    # Input image size, same as training
    parser.add_argument('--input_size', default=[448,448],nargs='+',type=int,help='images input size')
    # Path to the trained model
    parser.add_argument('--weights', default='./output_dir/last.pth', type=str,help='model weights path')
    # Number of classes
    parser.add_argument('--nb_classes', default=20, type=int,help='number of the classification types')
    # Number of boxes per grid cell, 2 by default
    parser.add_argument('--num_bboxes', default=2, type=int,help='boxes number of each grid')

    return parser

def main(args):
    # Define the input tensor
    x = torch.randn(1, 3, args.input_size[0],args.input_size[1])
    input_names = ["input"]
    out_names = ["output"]
    # Define the network
    model = YOLO_v1(num_classes = args.nb_classes,num_bboxes=args.num_bboxes)
    # Load the weights
    checkpoint = torch.load(args.weights, map_location='cpu')
    msg = model.load_state_dict(checkpoint, strict=True)
    print(msg)
    # Switch the model to eval mode
    model.eval()
    # Export to ONNX (the deprecated training flag is dropped; the default TrainingMode.EVAL matches model.eval() above)
    torch.onnx.export(model, x, args.weights.replace('pth','onnx'), export_params=True, input_names=input_names, output_names=out_names)
    print('please run: python -m onnxsim test.onnx test_sim.onnx\n')

if __name__ == '__main__':
    args = get_args_parser()
    args = args.parse_args()
    main(args)
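
To sanity-check the exported file, here is a minimal sketch with onnxruntime (assuming it is installed; the output path follows from the default --weights value):

    import numpy as np
    import onnxruntime as ort

    sess = ort.InferenceSession('./output_dir/last.onnx')
    dummy = np.random.randn(1, 3, 448, 448).astype(np.float32)
    out = sess.run(['output'], {'input': dummy})[0]
    print(out.shape)  # expected (1, 7, 7, 30) for 20 classes and 2 boxes per cell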

predict.py

Runs prediction on a single image.

import argparse
import torch
import numpy as np
import torchvision.transforms as T
from models.yolov1 import YOLO_v1
from PIL import Image
import cv2
from utils.engine import postprocess

def get_args_parser():
    parser = argparse.ArgumentParser('Predict Image', add_help=False)
    # Image path
    parser.add_argument('--image_path', default='./dog.jpg', type=str,help='path of the image to predict')
    # Input image size, same as training
    parser.add_argument('--input_size', default=[448,448],nargs='+',type=int,help='images input size')
    # Path to the trained model weights
    parser.add_argument('--weights', default='./output_dir/best.pth', type=str,help='model weights path')
    # Number of classes
    parser.add_argument('--nb_classes', default=20, type=int,help='number of the classification types')
    # Class score threshold
    parser.add_argument('--conf_thresh', default=0.2, type=float,help='thresh of cls conf')
    # Threshold on the final predicted probability
    parser.add_argument('--prob_thresh', default=0.2, type=float,help='thresh of predict prob')
    # NMS IoU threshold
    parser.add_argument('--nms_thresh', default=0.5, type=float,help='nms thresh of predict prob')
    # Number of grid cells per image side, 7 by default
    parser.add_argument('--grid_size', default=7, type=int,help='grid size of each image')
    # Number of boxes per grid cell, 2 by default
    parser.add_argument('--num_bboxes', default=2, type=int,help='boxes number of each grid')
    # Inference device, CPU or GPU
    parser.add_argument('--device', default='cuda',help='device to use for training / testing')

    return parser

def main(args):
    # Set the inference device, CPU or GPU
    device = torch.device(args.device)
    # Load the image
    image = Image.open(args.image_path).convert('RGB')
    # Get the image size
    width, height = image.size
    # Class names
    VOC_CLASSES = ['aeroplane', 'bicycle', 'bird', 'boat','bottle', 'bus', 'car', 'cat', 'chair','cow', 'diningtable', 'dog', 'horse','motorbike', 'person', 'pottedplant','sheep', 'sofa', 'train', 'tvmonitor']
    # Image preprocessing
    transforms = T.Compose([
        T.Resize(args.input_size),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225]),
    ])
    # Define the network
    model = YOLO_v1(num_classes = args.nb_classes,num_bboxes=args.num_bboxes)
    # Load the weights
    checkpoint = torch.load(args.weights, map_location='cpu')
    msg = model.load_state_dict(checkpoint, strict=True)
    print(msg)
    # Move the model to the inference device
    model.to(device)
    # Switch to eval mode
    model.eval()
    # Preprocess the image
    input_tensor = transforms(image).unsqueeze(0).to(device)
    with torch.no_grad():
        # Run inference
        output = model(input_tensor)
    # Decode the predictions
    boxes, labels, probs = postprocess(output,width, height, VOC_CLASSES,args.grid_size, args.num_bboxes,
                                       args.conf_thresh,args.prob_thresh,args.nms_thresh,args.nb_classes)
    # Convert the image to OpenCV format for drawing boxes
    cv_image = np.array(image)
    cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR)
    # Draw the boxes
    for box, label, prob in zip(boxes, labels, probs):
        (left,top),(right,bottom) = box
        cv2.rectangle(cv_image, (int(left), int(top)), (int(right), int(bottom)), (128, 128, 0), thickness=2)
        cv2.putText(cv_image, label+' '+'{:.2f}'.format(prob), (int(left),  int(top)-10),
                    cv2.FONT_HERSHEY_SIMPLEX, fontScale=0.5, color=(255, 255, 255), thickness=1, lineType=8)
    # Save the result image
    cv2.imwrite('result.png',cv_image)

if __name__ == '__main__':
    args = get_args_parser()
    args = args.parse_args()
    main(args)

eval.py

Evaluates the model.

import argparse
import torchvision.transforms as T
import torch
import os
import numpy as np
from PIL import Image
from tqdm import tqdm
from collections import defaultdict
from models.yolov1 import YOLO_v1
from utils.metrics import evaluate
from utils.engine import postprocess

def get_args_parser():
    # Same arguments as in predict.py
    parser = argparse.ArgumentParser('Eval Model', add_help=False)
    parser.add_argument('--data_path', default='./datasets/', type=str,help='dataset path')
    parser.add_argument('--input_size', default=[448,448],nargs='+',type=int,help='images input size')
    parser.add_argument('--weights', default='./output_dir/last.pth', type=str,help='model weights path')
    parser.add_argument('--nb_classes', default=20, type=int,help='number of the classification types')
    parser.add_argument('--conf_thresh', default=0.01, type=float,help='thresh of cls conf')
    parser.add_argument('--prob_thresh', default=0.01, type=float,help='thresh of predict prob')
    parser.add_argument('--nms_thresh', default=0.5, type=float,help='nms thresh of predict prob')

    parser.add_argument('--grid_size', default=7, type=int,help='grid size of each image')
    parser.add_argument('--num_bboxes', default=2, type=int,help='boxes number of each grid')
    parser.add_argument('--device', default='cuda',help='device to use for training / testing')

    return parser


def main(args):

    device = torch.device(args.device)

    VOC_CLASSES = ['aeroplane', 'bicycle', 'bird', 'boat','bottle', 'bus', 'car', 'cat', 'chair','cow', 'diningtable', 'dog', 'horse','motorbike', 'person', 'pottedplant','sheep', 'sofa', 'train', 'tvmonitor']

    transforms = T.Compose([
        T.Resize(args.input_size),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225]),
    ])

    targets = defaultdict(list)
    preds = defaultdict(list)

    model = YOLO_v1(num_classes = args.nb_classes,num_bboxes=args.num_bboxes)

    checkpoint = torch.load(args.weights, map_location='cpu')
    msg = model.load_state_dict(checkpoint, strict=True)
    print(msg)

    model.to(device)
    model.eval()

    print('Preparing ground-truth data...')

    # Read the classes and coordinates of the ground-truth boxes in the validation set
    annotations = []
    with open(os.path.join(args.data_path,'val.txt'), 'r') as f:
        lines = f.readlines()
    for line in lines:
        anno = line.strip().split()
        annotations.append(anno)

    # Prepare the ground-truth data
    image_fnames = []
    for anno in annotations:
        # Get the image filename
        filename = anno[0]
        image_fnames.append(filename)
        # Get the box coordinates and class info for each image
        num_boxes = (len(anno) - 1) // 5
        for b in range(num_boxes):
            x1 = int(anno[5*b + 1])
            y1 = int(anno[5*b + 2])
            x2 = int(anno[5*b + 3])
            y2 = int(anno[5*b + 4])

            class_label = int(anno[5*b + 5])
            class_name = VOC_CLASSES[class_label]

            targets[(filename, class_name)].append([x1, y1, x2, y2])


    print('Predicting...')

    # Run the model over the validation images and collect its predictions
    for filename in tqdm(image_fnames):
        image_path = os.path.join(args.data_path,'JPEGImages',filename)
        image = Image.open(image_path).convert('RGB')
        width, height = image.size
        # Preprocess the image
        input_tensor = transforms(image).unsqueeze(0).to(device)
        with torch.no_grad():
            # Run inference to get predictions
            output = model(input_tensor)
        # Decode the predictions
        boxes, labels, probs = postprocess(output, width, height, VOC_CLASSES, args.grid_size, args.num_bboxes,
                                           args.conf_thresh, args.prob_thresh, args.nms_thresh, args.nb_classes)

        for box, class_name, prob in zip(boxes, labels, probs):
            x1y1, x2y2 = box
            x1, y1 = int(x1y1[0]), int(x1y1[1])
            x2, y2 = int(x2y2[0]), int(x2y2[1])
            preds[class_name].append([filename, prob, x1, y1, x2, y2])

    print('Evaluate the detection result...')
    # Run the evaluation
    evaluate(preds, targets, class_names=VOC_CLASSES)


if __name__ == '__main__':
    args = get_args_parser()
    args = args.parse_args()
    main(args)

engine.py

Defines the training/validation function, the curve-plotting functions, the decoding functions, and more. Note the difference between single-GPU and multi-GPU training when saving the model. For a detailed explanation of non-maximum suppression, see https://blog.csdn.net/qq_38412266/article/details/139525192?spm=1001.2014.3001.5501

import os
import torch
import time
from tqdm import tqdm
import matplotlib.pyplot as plt
# Train/validation function
def train_and_val(epochs, model, train_loader, val_loader,criterion, optimizer,scheduler,output_dir,device):
    # Initial definitions
    train_loss = []
    val_loss = []
    learning_rate = []
    best_min_loss = 100
    # Move the model to the target device
    model.to(device)
    # Start timing the whole training run
    fit_time = time.time()
    for e in range(epochs):
        # Free cached GPU memory
        torch.cuda.empty_cache()
        # Print this epoch's learning rate
        print("This Epoch Learning Rate: {:.6f}  ".format(scheduler.get_last_lr()[0]))
        since = time.time()
        training_loss = 0
        # Switch the model to train mode
        model.train()
        with tqdm(total=len(train_loader)) as pbar:
            for image, label in train_loader:
                # Move the images and labels to the device
                image = image.to(device)
                label = label.to(device)
                # Forward pass
                output = model(image)
                # Compute the loss
                loss = criterion(output, label)
                # Backpropagate and update the parameters
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                training_loss += loss.item()
                pbar.update(1)
        # Switch the model to eval mode
        model.eval()
        validation_loss = 0

        with torch.no_grad():
            with tqdm(total=len(val_loader)) as pb:
                for image, label in val_loader:
                    # Move the images and labels to the device
                    image = image.to(device)
                    label = label.to(device)
                    # Forward pass
                    output = model(image)
                    # Compute the loss
                    loss = criterion(output, label)

                    validation_loss += loss.item()
                    pb.update(1)
        # Record this epoch's average losses
        train_loss.append(training_loss / len(train_loader))
        val_loss.append(validation_loss / len(val_loader))
        # Record this epoch's learning rate
        learning_rate.append(scheduler.get_last_lr())
        # Append this epoch's info to log.txt
        save_file = open(os.path.join(output_dir,'log.txt'), mode='a+')
        save_file.writelines(["Epoch:{}/{}  ".format(e + 1, epochs)+
              "Learning Rate: {:.6f}  ".format(scheduler.get_last_lr()[0]) +
              "Train Loss: {:.3f}  ".format(training_loss / len(train_loader))+
              "Val Loss: {:.3f}  ".format(validation_loss / len(val_loader))+'\n'])
        save_file.close()

        # Multi-GPU training: save the latest model each epoch
        torch.save(model.module.state_dict(), os.path.join(output_dir,'last.pth'))
        # Single-GPU training: save the latest model each epoch
        #torch.save(model.state_dict(), os.path.join(output_dir,'last.pth'))
        # If this epoch's val loss is below the recorded minimum, save the best model
        if best_min_loss > (validation_loss / len(val_loader)):
            print("--save best model,loss is {:.6f}--".format(validation_loss / len(val_loader)))
            best_min_loss = validation_loss / len(val_loader)
            # Multi-GPU training: save the best model
            torch.save(model.module.state_dict(), os.path.join(output_dir,'best.pth'))
            # Single-GPU training: save the best model
            #torch.save(model.state_dict(), os.path.join(output_dir,'best.pth'))
        # Print this epoch's info
        print("Epoch:{}/{} ".format(e + 1, epochs),
              "Train Loss: {:.3f} ".format(training_loss / len(train_loader)),
              "Val Loss: {:.3f} ".format(validation_loss / len(val_loader)),
              "Time: {:.2f}s".format((time.time() - since)))
        # Update the learning rate
        scheduler.step()
    # Collect the per-epoch stats
    history = {'train_loss': train_loss, 'val_loss': val_loss ,'lr':learning_rate}
    print('Total time: {:.2f} m'.format((time.time() - fit_time) / 60))

    return history

# Plot the loss curves
def plot_loss(x,output_dir, history):
    plt.plot(x, history['val_loss'], label='val', marker='o')
    plt.plot(x, history['train_loss'], label='train', marker='o')
    plt.title('Loss per epoch')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(), plt.grid()
    plt.savefig(os.path.join(output_dir,'loss.png'))
    plt.clf()
# Plot the learning-rate curve
def plot_lr(x,output_dir,  history):
    plt.plot(x, history['lr'], label='learning_rate', marker='x')
    plt.title('learning rate per epoch')
    plt.ylabel('Learning_rate')
    plt.xlabel('epoch')
    plt.legend(), plt.grid()
    plt.savefig(os.path.join(output_dir,'learning_rate.png'))
    plt.clf()

# Non-maximum suppression (NMS)
def nms(boxes, scores,threshold):

    x1 = boxes[:, 0]  # [n,]
    y1 = boxes[:, 1]  # [n,]
    x2 = boxes[:, 2]  # [n,]
    y2 = boxes[:, 3]  # [n,]
    areas = (x2 - x1) * (y2 - y1)  # [n,]

    _, ids_sorted = scores.sort(0, descending=True)  # [n,]
    ids = []
    while ids_sorted.numel() > 0:
        # Assume `ids_sorted` size is [m,] in the beginning of this iter.

        i = ids_sorted.item() if (ids_sorted.numel() == 1) else ids_sorted[0]
        ids.append(i)

        if ids_sorted.numel() == 1:
            break  # If only one box is left (i.e., no box to supress), break.

        inter_x1 = x1[ids_sorted[1:]].clamp(min=x1[i])  # [m-1, ]
        inter_y1 = y1[ids_sorted[1:]].clamp(min=y1[i])  # [m-1, ]
        inter_x2 = x2[ids_sorted[1:]].clamp(max=x2[i])  # [m-1, ]
        inter_y2 = y2[ids_sorted[1:]].clamp(max=y2[i])  # [m-1, ]
        inter_w = (inter_x2 - inter_x1).clamp(min=0)  # [m-1, ]
        inter_h = (inter_y2 - inter_y1).clamp(min=0)  # [m-1, ]

        inters = inter_w * inter_h  # intersections b/w/ box `i` and other boxes, sized [m-1, ].
        unions = areas[i] + areas[ids_sorted[1:]] - inters  # unions b/w/ box `i` and other boxes, sized [m-1, ].
        ious = inters / unions  # [m-1, ]

        # Remove boxes whose IoU is higher than the threshold.
        ids_keep = (ious <= threshold).nonzero().squeeze()  # [m-1, ]. Because `nonzero()` adds extra dimension, squeeze it.
        if ids_keep.numel() == 0:
            break  # If no box left, break.
        ids_sorted = ids_sorted[ids_keep + 1]  # `+1` is needed because `ids_sorted[0] = i`.

    return torch.LongTensor(ids)
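
# Hypothetical mini-example for nms(): two heavily overlapping boxes,
#   boxes  = torch.FloatTensor([[0, 0, 100, 100], [10, 10, 110, 110]])
#   scores = torch.FloatTensor([0.9, 0.8])
#   nms(boxes, scores, threshold=0.5)  # -> tensor([0])
# Their IoU is 8100 / 11900 ≈ 0.68 > 0.5, so the lower-scoring box is suppressed.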

# Decode the raw network output into boxes, labels, and scores
def decode(pred_tensor,grid_size,num_bboxes,conf_thresh,prob_thresh,nb_classes):

    S, B, C = grid_size,num_bboxes,nb_classes
    boxes, labels, confidences, class_scores = [], [], [], []

    cell_size = 1.0 / float(S)

    pred_tensor = pred_tensor.cpu().data.squeeze(0)

    pred_tensor_conf_list = []
    for b in range(B):
        pred_tensor_conf_list.append(pred_tensor[:, :, 5 * b + 4].unsqueeze(2))
    grid_ceil_conf = torch.cat(pred_tensor_conf_list, 2)

    grid_ceil_conf, grid_ceil_index = grid_ceil_conf.max(2)
    class_conf, class_index = pred_tensor[:, :, 5 * B:].max(2)
    class_conf[class_conf <= conf_thresh] = 0
    class_prob = class_conf * grid_ceil_conf

    for i in range(S):
        for j in range(S):
            if float(class_prob[j, i]) < prob_thresh:
                continue
            box = pred_tensor[j, i, 5 * grid_ceil_index[j, i]: 5 * grid_ceil_index[j, i] + 4]
            xy_start_pos = torch.FloatTensor([i, j]) * cell_size
            xy_normalized = box[:2] * cell_size + xy_start_pos
            wh_normalized = box[2:]
            box_xyxy = torch.FloatTensor(4)
            box_xyxy[:2] = xy_normalized - 0.5 * wh_normalized
            box_xyxy[2:] = xy_normalized + 0.5 * wh_normalized

            boxes.append(box_xyxy)
            labels.append(class_index[j, i])
            confidences.append(grid_ceil_conf[j, i])
            class_scores.append(class_conf[j, i])

    if len(boxes) > 0:
        boxes = torch.stack(boxes, 0)
        labels = torch.stack(labels, 0)
        confidences = torch.stack(confidences, 0)
        class_scores = torch.stack(class_scores, 0)
    else:
        boxes = torch.FloatTensor(0, 4)
        labels = torch.LongTensor(0)
        confidences = torch.FloatTensor(0)
        class_scores = torch.FloatTensor(0)

    return boxes, labels, confidences, class_scores
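
# Worked example of the decoding above (hypothetical numbers): with S = 7,
# cell (i=3, j=2) and a predicted box (x, y, w, h) = (0.5, 0.5, 0.4, 0.6) give
# xy_normalized = (0.5/7 + 3/7, 0.5/7 + 2/7) = (0.50, 0.357) and
# box_xyxy = (0.30, 0.057, 0.70, 0.657), i.e. corners normalized to [0, 1],
# which postprocess() below rescales by the image width and height.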

def postprocess(output,width, height,VOC_CLASSES,grid_size,num_bboxes,conf_thresh,prob_thresh,nms_thresh,nb_classes):

    boxes,labels,probs = [],[],[]

    boxes_list, labels_list, confidences_list, class_scores_list = decode(output, grid_size, num_bboxes,
                                                                          conf_thresh, prob_thresh,
                                                                          nb_classes)
    if boxes_list.shape[0] != 0:
        boxes_nms, labels_nms, probs_nms = [], [], []
        for class_label in range(len(VOC_CLASSES)):
            ids = (labels_list == class_label)
            if torch.sum(ids) == 0:
                continue

            boxes_list_current_cls = boxes_list[ids]
            labels_list_current_cls = labels_list[ids]
            confidences_list_current_cls = confidences_list[ids]
            class_scores_list_current_cls = class_scores_list[ids]

            ids_postprocess = nms(boxes_list_current_cls, confidences_list_current_cls, nms_thresh)

            boxes_nms.append(boxes_list_current_cls[ids_postprocess])
            labels_nms.append(labels_list_current_cls[ids_postprocess])
            probs_nms.append(
                confidences_list_current_cls[ids_postprocess] * class_scores_list_current_cls[ids_postprocess])

        boxes_nms = torch.cat(boxes_nms, 0)
        labels_nms = torch.cat(labels_nms, 0)
        probs_nms = torch.cat(probs_nms, 0)

        for box, label, prob in zip(boxes_nms, labels_nms, probs_nms):
            x1, x2 = width * box[0], width * box[2]  # unnormalize x with image width.
            y1, y2 = height * box[1], height * box[3]  # unnormalize y with image height.
            boxes.append(((x1, y1), (x2, y2)))

            label_idx = int(label)  # convert from LongTensor to int.
            class_name = VOC_CLASSES[label_idx]
            labels.append(class_name)

            prob = float(prob)
            probs.append(prob)

    return boxes,labels,probs

loss.py

Defines the detection loss function.
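
For reference, the loss implemented below is the sum-squared error from the original YOLOv1 paper (here the confidence target of the responsible predictor is set to its IoU with the ground truth):

$$
\begin{aligned}
\mathcal{L} ={}& \lambda_{coord} \sum_{i=0}^{S^2}\sum_{j=0}^{B} \mathbb{1}_{ij}^{obj} \left[ (x_i-\hat{x}_i)^2 + (y_i-\hat{y}_i)^2 + (\sqrt{w_i}-\sqrt{\hat{w}_i})^2 + (\sqrt{h_i}-\sqrt{\hat{h}_i})^2 \right] \\
&+ \sum_{i=0}^{S^2}\sum_{j=0}^{B} \mathbb{1}_{ij}^{obj} (C_i-\hat{C}_i)^2 + \lambda_{noobj} \sum_{i=0}^{S^2}\sum_{j=0}^{B} \mathbb{1}_{ij}^{noobj} (C_i-\hat{C}_i)^2 \\
&+ \sum_{i=0}^{S^2} \mathbb{1}_{i}^{obj} \sum_{c \in \text{classes}} (p_i(c)-\hat{p}_i(c))^2
\end{aligned}
$$

with $\lambda_{coord}=5$ and $\lambda_{noobj}=0.5$ (the class defaults below), and the total divided by the batch size.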

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class Detect_Loss(nn.Module):

    def __init__(self, feature_size=7, num_bboxes=2, num_classes=20, lambda_coord=5.0, lambda_noobj=0.5):

        super(Detect_Loss, self).__init__()

        self.S = feature_size
        self.B = num_bboxes
        self.C = num_classes
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj


    def compute_iou(self, bbox1, bbox2):

        N = bbox1.size(0)
        M = bbox2.size(0)

        lt = torch.max(
        bbox1[:, :2].unsqueeze(1).expand(N, M, 2), # [N, 2] -> [N, 1, 2] -> [N, M, 2]
        bbox2[:, :2].unsqueeze(0).expand(N, M, 2)  # [M, 2] -> [1, M, 2] -> [N, M, 2]
            )

        rb = torch.min(
        bbox1[:, 2:].unsqueeze(1).expand(N, M, 2), # [N, 2] -> [N, 1, 2] -> [N, M, 2]
        bbox2[:, 2:].unsqueeze(0).expand(N, M, 2)  # [M, 2] -> [1, M, 2] -> [N, M, 2]
            )

        wh = rb - lt
        wh[wh < 0] = 0
        inter = wh[:, :, 0] * wh[:, :, 1] # [N, M]

        area1 = (bbox1[:, 2] - bbox1[:, 0]) * (bbox1[:, 3] - bbox1[:, 1]) # [N, ]
        area2 = (bbox2[:, 2] - bbox2[:, 0]) * (bbox2[:, 3] - bbox2[:, 1]) # [M, ]
        area1 = area1.unsqueeze(1).expand_as(inter) # [N, ] -> [N, 1] -> [N, M]
        area2 = area2.unsqueeze(0).expand_as(inter) # [M, ] -> [1, M] -> [N, M]

        union = area1 + area2 - inter # [N, M]
        iou = inter / union # [N, M]

        return iou

    def forward(self, pred_tensor, target_tensor):

        S, B, C = self.S, self.B, self.C
        N = 5 * B + C

        batch_size = pred_tensor.size(0)
        coord_mask = target_tensor[:, :, :, 4] > 0
        noobj_mask = target_tensor[:, :, :, 4] == 0

        coord_mask = coord_mask.unsqueeze(-1).expand_as(target_tensor)
        noobj_mask = noobj_mask.unsqueeze(-1).expand_as(target_tensor)

        coord_pred = pred_tensor[coord_mask].view(-1, N)

        bbox_pred = coord_pred[:, :5 * B].contiguous().view(-1,5)
        class_pred = coord_pred[:, 5 * B:]

        coord_target = target_tensor[coord_mask].view(-1,N)

        bbox_target = coord_target[:, :5 * B].contiguous().view(-1, 5)
        class_target = coord_target[:, 5 * B:]

        noobj_pred = pred_tensor[noobj_mask].view(-1,N)

        noobj_target = target_tensor[noobj_mask].view(-1,N)

        noobj_conf_mask = torch.cuda.BoolTensor(noobj_pred.size()).fill_(0)
        for b in range(B):
            noobj_conf_mask[:, 4 + b * 5] = 1
        noobj_pred_conf = noobj_pred[noobj_conf_mask]
        noobj_target_conf = noobj_target[noobj_conf_mask]
        loss_noobj = F.mse_loss(noobj_pred_conf, noobj_target_conf, reduction='sum')

        coord_response_mask = torch.cuda.BoolTensor(bbox_target.size()).fill_(0)
        coord_not_response_mask = torch.cuda.BoolTensor(bbox_target.size()).fill_(1)
        bbox_target_iou = torch.zeros(bbox_target.size()).cuda()

        for i in range(0, bbox_target.size(0), B):
            pred = bbox_pred[i:i + B]
            pred_xyxy = Variable(torch.FloatTensor(pred.size()))

            pred_xyxy[:, :2] = pred[:, :2] / float(S) - 0.5 * pred[:, 2:4]
            pred_xyxy[:, 2:4] = pred[:, :2] / float(S) + 0.5 * pred[:, 2:4]

            target = bbox_target[i].view(-1, 5)
            target_xyxy = Variable(torch.FloatTensor(target.size()))

            target_xyxy[:, :2] = target[:, :2] / float(S) - 0.5 * target[:, 2:4]
            target_xyxy[:, 2:4] = target[:, :2] / float(S) + 0.5 * target[:, 2:4]

            iou = self.compute_iou(pred_xyxy[:, :4], target_xyxy[:, :4])
            max_iou, max_index = iou.max(0)
            max_index = max_index.data.cuda()

            coord_response_mask[i + max_index] = 1
            coord_not_response_mask[i+max_index] = 0

            bbox_target_iou[i + max_index, torch.LongTensor([4]).cuda()] = (max_iou).data.cuda()
        bbox_target_iou = Variable(bbox_target_iou).cuda()

        bbox_pred_response = bbox_pred[coord_response_mask].view(-1, 5)
        bbox_target_response = bbox_target[coord_response_mask].view(-1,5)
        target_iou = bbox_target_iou[coord_response_mask].view(-1,5)
        loss_xy = F.mse_loss(bbox_pred_response[:, :2], bbox_target_response[:, :2], reduction='sum')
        loss_wh = F.mse_loss(torch.sqrt(bbox_pred_response[:, 2:4]), torch.sqrt(bbox_target_response[:, 2:4]),reduction='sum')
        loss_obj = F.mse_loss(bbox_pred_response[:, 4], target_iou[:, 4], reduction='sum')

        loss_class = F.mse_loss(class_pred, class_target, reduction='sum')

        loss = self.lambda_coord * (loss_xy + loss_wh) + loss_obj + self.lambda_noobj * loss_noobj + loss_class
        loss = loss / float(batch_size)

        return loss

metrics.py

Defines the functions that compute mAP.

import numpy as np

def compute_average_precision(recall, precision):

    recall = np.concatenate(([0.0], recall, [1.0]))
    precision = np.concatenate(([0.0], precision, [0.0]))

    for i in range(precision.size - 1, 0, -1):
        precision[i - 1] = max(precision[i -1], precision[i])

    ap = 0.0
    for i in range(precision.size - 1):
        ap += (recall[i + 1] - recall[i]) * precision[i + 1]

    return ap
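
# Note: this computes the "all-point" interpolated AP (the precision envelope
# integrated over recall), not the older 11-point interpolation from the
# original VOC2007 development kit, so the numbers can differ slightly.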


def evaluate(preds,targets,class_names,threshold=0.5):

    aps = []

    for class_name in class_names:
        class_preds = preds[class_name]

        if len(class_preds) == 0:
            ap = 0.0
            print('---class {} AP {}---'.format(class_name, ap))
            aps.append(ap)
            continue  # move on to the next class (a break here would skip all remaining classes)

        image_fnames = [pred[0] for pred in class_preds]
        probs = [pred[1]  for pred in class_preds]
        boxes = [pred[2:] for pred in class_preds]

        sorted_idxs = np.argsort(probs)[::-1]
        image_fnames = [image_fnames[i] for i in sorted_idxs]
        boxes = [boxes[i] for i in sorted_idxs]

        num_gt_boxes = 0
        for (filename_gt, class_name_gt) in targets:
            if class_name_gt == class_name:
                num_gt_boxes += len(targets[filename_gt, class_name_gt])

        num_detections = len(boxes)
        tp = np.zeros(num_detections)
        fp = np.ones(num_detections)

        for det_idx, (filename, box) in enumerate(zip(image_fnames, boxes)):

            if (filename, class_name) in targets:
                boxes_gt = targets[(filename, class_name)]
                for box_gt in boxes_gt:

                    inter_x1 = max(box_gt[0], box[0])
                    inter_y1 = max(box_gt[1], box[1])
                    inter_x2 = min(box_gt[2], box[2])
                    inter_y2 = min(box_gt[3], box[3])
                    inter_w = max(0.0, inter_x2 - inter_x1 + 1.0)
                    inter_h = max(0.0, inter_y2 - inter_y1 + 1.0)
                    inter = inter_w * inter_h

                    area_det = (box[2] - box[0] + 1.0) * (box[3] - box[1] + 1.0)
                    area_gt = (box_gt[2] - box_gt[0] + 1.0) * (box_gt[3] - box_gt[1] + 1.0)
                    union = area_det + area_gt - inter

                    iou = inter / union
                    if iou >= threshold:
                        tp[det_idx] = 1.0
                        fp[det_idx] = 0.0

                        boxes_gt.remove(box_gt)
                        if len(boxes_gt) == 0:
                            del targets[(filename, class_name)]
                        break

            else:
                pass

        tp_cumsum = np.cumsum(tp)
        fp_cumsum = np.cumsum(fp)

        eps = np.finfo(np.float64).eps
        precision = tp_cumsum / np.maximum(tp_cumsum + fp_cumsum, eps)
        recall = tp_cumsum / float(num_gt_boxes)

        ap = compute_average_precision(recall, precision)
        print('---class {} AP {}---'.format(class_name, ap))
        aps.append(ap)

    print('---mAP {}---'.format(np.mean(aps)))

    return aps

This post will be continuously updated.
