The complete project is available on GitHub:
https://github.com/yaoyi30/Pytorch_YOLOv1
It achieves an mAP of 0.65 on the VOC2007 test set.
Example network prediction results:
I. Main Workflow for Training an Object Detection Network
- Build the dataset
- Preprocess the data, including data augmentation, standardization, and normalization
- Build the network model
- Set hyperparameters such as the learning rate, optimizer, and loss function
- Train and validate
II. Brief Description of Each Step
1. Building the dataset
This post uses the VOC2007 and VOC2012 datasets.
In the project root, create a datasets folder. Inside it, create a JPEGImages folder to hold the images, and create train.txt and val.txt files to hold the training and validation data lists. The structure is as follows:
datasets/
    JPEGImages/          # all VOC2007 + VOC2012 images
        img1.jpg
        img2.jpg
        ...
    train.txt
    val.txt
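Each line of train.txt and val.txt lists one image filename followed by one or more ground-truth boxes, each encoded as five integers x1 y1 x2 y2 class_index; this is the format parsed in eval.py later in this post. A hypothetical line with two objects might look like:

img1.jpg 48 240 195 371 11 8 12 352 498 14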
2. Data preprocessing
Resize the images to a uniform size, convert them to tensors, and normalize them; after this preprocessing the images can be fed to the network. For the training set, data augmentation can be applied to improve the network's generalization; the validation set is left unaugmented.
# Training-set preprocessing and data augmentation
train_transform = Compose([
    RandomHorizontalFlip(0.5),  # horizontal flip
    RandomVerticalFlip(0.5),    # vertical flip
    RandomScale(0.5),           # random scale change
    RandomGaussianBlur(0.5),    # Gaussian blur
    RandomBrightness(0.5),      # brightness adjustment
    RandomHue(0.5),             # hue adjustment
    RandomSaturation(0.5),      # saturation adjustment
    RandomShift(0.5),           # random translation
    RandomCrop(0.5),            # random crop
    Resize(args.input_size),    # resize to a uniform size; box coordinates are scaled accordingly
    ToTensor(),                 # convert to tensor; pixel values scaled to [0, 1]
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # normalization
])
# Validation-set preprocessing
val_transform = Compose([
    Resize(args.input_size),    # resize to a uniform size; box coordinates are scaled accordingly
    ToTensor(),                 # convert to tensor; pixel values scaled to [0, 1]
    Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # normalization
])
3. Building the network model
This post implements the YOLOv1 object detection network in PyTorch:
model = YOLO_v1(num_classes = args.nb_classes,num_bboxes=args.num_bboxes)
4. Setting the learning rate, optimizer, loss function, and other hyperparameters
# Define the loss function
loss_function = Detect_Loss(feature_size=args.grid_size, num_bboxes=args.num_bboxes, num_classes=args.nb_classes)
# Define the optimizer (with initial learning rate and weight decay)
optimizer = torch.optim.SGD(model.parameters(), lr=args.init_lr, momentum=args.momentum, weight_decay=args.weight_decay)
# Define the LR schedule; StepLR multiplies the LR by gamma every step_size epochs (with 80 epochs, a 10x decay at epochs 24, 48, and 72)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=int(args.epochs * 0.3), gamma=0.1)
5. Training and validation
# Train and validate the model; the function itself is defined in utils/engine.py
history = train_and_val(args.epochs, model, train_loader, val_loader, loss_function, optimizer, scheduler, args.output_dir, device)
III. Detailed Walkthrough of the Project Code Files
transform.py
Defines the data augmentation transforms.
from torchvision.transforms import functional as F
import random
import torch
import numpy as np
import cv2
from PIL import Image
# Resize the image to a uniform size; box coordinates are scaled accordingly
class Resize(object):
    def __init__(self, size):
        self.size = size
    def __call__(self, image, boxes, labels):
        width, height = image.size
        image = F.resize(image, self.size)
        scale_x = self.size[1] / width
        scale_y = self.size[0] / height
        scale_tensor = torch.FloatTensor([[scale_x, scale_y, scale_x, scale_y]]).expand_as(boxes)
        boxes = boxes * scale_tensor
        return image, boxes, labels
# Random Gaussian blur
class RandomGaussianBlur(object):
    def __init__(self, prob):
        self.prob = prob
    def __call__(self, image, boxes, labels):
        if random.random() < self.prob:
            ksize = random.choice([3, 5])
            image = F.gaussian_blur(image, [ksize, ksize])
        return image, boxes, labels
# Random brightness adjustment
class RandomBrightness(object):
    def __init__(self, prob):
        self.prob = prob
    def __call__(self, image, boxes, labels):
        if random.random() < self.prob:
            adjust = random.uniform(0.5, 1.5)
            image = F.adjust_brightness(image, adjust)
        return image, boxes, labels
# Random hue adjustment
class RandomHue(object):
    def __init__(self, prob):
        self.prob = prob
    def __call__(self, image, boxes, labels):
        if random.random() < self.prob:
            adjust = random.uniform(-0.5, 0.5)
            image = F.adjust_hue(image, adjust)
        return image, boxes, labels
# Random saturation adjustment
class RandomSaturation(object):
    def __init__(self, prob):
        self.prob = prob
    def __call__(self, image, boxes, labels):
        if random.random() < self.prob:
            adjust = random.uniform(0.5, 1.5)
            image = F.adjust_saturation(image, adjust)
        return image, boxes, labels
# Random horizontal flip; box x coordinates are mirrored
class RandomHorizontalFlip(object):
    def __init__(self, prob):
        self.prob = prob
    def __call__(self, image, boxes, labels):
        if random.random() < self.prob:
            width, height = image.size
            image = F.hflip(image)
            x1, x2 = boxes[:, 0], boxes[:, 2]
            x1_new = width - x2
            x2_new = width - x1
            boxes[:, 0], boxes[:, 2] = x1_new, x2_new
        return image, boxes, labels
# Randomly rescale the image width
class RandomScale(object):
    def __init__(self, prob):
        self.prob = prob
    def __call__(self, image, boxes, labels):
        if random.random() < self.prob:
            width, height = image.size
            scale = random.uniform(0.8, 1.2)
            image = F.resize(image, [height, int(width * scale)])
            scale_tensor = torch.FloatTensor([[scale, 1, scale, 1]]).expand_as(boxes)
            boxes = boxes * scale_tensor
        return image, boxes, labels
# Random translation (shift); the exposed area is filled with the dataset mean color
class RandomShift(object):
    def __init__(self, prob):
        self.prob = prob
        self.mean = [122.67891434, 116.66876762, 104.00698793]
    def __call__(self, image, boxes, labels):
        if random.random() < self.prob:
            center = (boxes[:, 2:] + boxes[:, :2]) / 2.0
            img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
            h, w, c = img.shape
            img_out = np.zeros((h, w, c), dtype=img.dtype)
            mean_bgr = self.mean[::-1]
            img_out[:, :] = mean_bgr
            dx = random.uniform(-w * 0.2, w * 0.2)
            dy = random.uniform(-h * 0.2, h * 0.2)
            dx, dy = int(dx), int(dy)
            if dx >= 0 and dy >= 0:
                img_out[dy:, dx:] = img[:h - dy, :w - dx]
            elif dx >= 0 and dy < 0:
                img_out[:h + dy, dx:] = img[-dy:, :w - dx]
            elif dx < 0 and dy >= 0:
                img_out[dy:, :w + dx] = img[:h - dy, -dx:]
            elif dx < 0 and dy < 0:
                img_out[:h + dy, :w + dx] = img[-dy:, -dx:]
            center = center + torch.FloatTensor([[dx, dy]]).expand_as(center)  # [n, 2]
            mask_x = (center[:, 0] >= 0) & (center[:, 0] < w)  # [n,]
            mask_y = (center[:, 1] >= 0) & (center[:, 1] < h)  # [n,]
            mask = (mask_x & mask_y).view(-1, 1)  # [n, 1], mask for boxes whose centers stay inside the image after the shift
            boxes_out = boxes[mask.expand_as(boxes)].view(-1, 4)  # [m, 4]
            if len(boxes_out) == 0:
                return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)), boxes, labels
            shift = torch.FloatTensor([[dx, dy, dx, dy]]).expand_as(boxes_out)  # [m, 4]
            boxes_out = boxes_out + shift
            boxes_out[:, 0] = boxes_out[:, 0].clamp_(min=0, max=w)
            boxes_out[:, 2] = boxes_out[:, 2].clamp_(min=0, max=w)
            boxes_out[:, 1] = boxes_out[:, 1].clamp_(min=0, max=h)
            boxes_out[:, 3] = boxes_out[:, 3].clamp_(min=0, max=h)
            labels_out = labels[mask.view(-1)]
            image, boxes, labels = Image.fromarray(cv2.cvtColor(img_out, cv2.COLOR_BGR2RGB)), boxes_out, labels_out
        return image, boxes, labels
# Random crop
class RandomCrop(object):
    def __init__(self, prob):
        self.prob = prob
    def __call__(self, image, boxes, labels):
        if random.random() < self.prob:
            center = (boxes[:, 2:] + boxes[:, :2]) / 2.0
            w_orig, h_orig = image.size
            h = random.uniform(0.6 * h_orig, h_orig)
            w = random.uniform(0.6 * w_orig, w_orig)
            y = random.uniform(0, h_orig - h)
            x = random.uniform(0, w_orig - w)
            h, w, x, y = int(h), int(w), int(x), int(y)
            center = center - torch.FloatTensor([[x, y]]).expand_as(center)  # [n, 2]
            mask_x = (center[:, 0] >= 0) & (center[:, 0] < w)  # [n,]
            mask_y = (center[:, 1] >= 0) & (center[:, 1] < h)  # [n,]
            mask = (mask_x & mask_y).view(-1, 1)  # [n, 1], mask for boxes whose centers stay inside the crop
            boxes_out = boxes[mask.expand_as(boxes)].view(-1, 4)  # [m, 4]
            if len(boxes_out) == 0:
                return image, boxes, labels
            shift = torch.FloatTensor([[x, y, x, y]]).expand_as(boxes_out)  # [m, 4]
            boxes_out = boxes_out - shift
            boxes_out[:, 0] = boxes_out[:, 0].clamp_(min=0, max=w)
            boxes_out[:, 2] = boxes_out[:, 2].clamp_(min=0, max=w)
            boxes_out[:, 1] = boxes_out[:, 1].clamp_(min=0, max=h)
            boxes_out[:, 3] = boxes_out[:, 3].clamp_(min=0, max=h)
            labels_out = labels[mask.view(-1)]
            box = (x, y, x + w, y + h)
            img_out = image.crop(box)
            image, boxes, labels = img_out, boxes_out, labels_out
        return image, boxes, labels
# Random vertical flip; box y coordinates are mirrored
class RandomVerticalFlip(object):
    def __init__(self, prob):
        self.prob = prob
    def __call__(self, image, boxes, labels):
        if random.random() < self.prob:
            width, height = image.size
            image = F.vflip(image)
            y1, y2 = boxes[:, 1], boxes[:, 3]
            y1_new = height - y2
            y2_new = height - y1
            boxes[:, 1], boxes[:, 3] = y1_new, y2_new
        return image, boxes, labels
# Normalization
class Normalize(object):
    def __init__(self, mean, std):
        self.mean = mean
        self.std = std
    def __call__(self, image, boxes, labels):
        image = F.normalize(image, mean=self.mean, std=self.std)
        return image, boxes, labels
# Convert to tensor; pixel values are scaled to [0, 1]
class ToTensor(object):
    def __call__(self, image, boxes, labels):
        image = F.to_tensor(image)
        return image, boxes, labels
# Chain multiple transforms together
class Compose(object):
    def __init__(self, transforms):
        self.transforms = transforms
    def __call__(self, image, boxes, labels):
        for t in self.transforms:
            image, boxes, labels = t(image, boxes, labels)
        return image, boxes, labels
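To make the calling convention concrete, here is a minimal sketch of pushing one sample through a pipeline such as the train_transform defined in train.py below; the image path, box, and label are hypothetical:

from PIL import Image
import torch

img = Image.open('datasets/JPEGImages/img1.jpg').convert('RGB')  # hypothetical image
boxes = torch.FloatTensor([[48, 240, 195, 371]])  # one box in pixel (x1, y1, x2, y2) format
labels = torch.LongTensor([11])                   # its class index ('dog' in VOC)
img_t, boxes_t, labels_t = train_transform(img, boxes, labels)
# img_t is a normalized 3x448x448 tensor; boxes_t has been rescaled to the resized image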
train.py
Defines the main entry point. In my experiments, SGD with a batch size of 32 and an initial learning rate of 0.001 worked best.
import os
import torch
import torch.nn as nn
from models.yolov1 import YOLO_v1
import argparse
import numpy as np
from utils.transform import Resize,Compose,ToTensor,Normalize,RandomHorizontalFlip,RandomVerticalFlip,RandomScale,\
RandomHue,RandomSaturation,RandomBrightness,RandomGaussianBlur,RandomCrop,RandomShift
from utils.datasets import DetData
from utils.loss import Detect_Loss
from utils.engine import train_and_val,plot_loss,plot_lr
# Training argument definitions
def get_args_parser():
    parser = argparse.ArgumentParser('Image Detection Train', add_help=False)
    # Batch size
    parser.add_argument('--batch_size', default=32, type=int, help='Batch size for training')
    # Number of training epochs
    parser.add_argument('--epochs', default=80, type=int)
    # Input image size, default 448x448
    parser.add_argument('--input_size', default=[448,448], nargs='+', type=int, help='images input size')
    # Dataset path
    parser.add_argument('--data_path', default='./datasets/', type=str, help='dataset path')
    # Initial learning rate, default 0.001
    parser.add_argument('--init_lr', default=0.001, type=float, help='SGD initial lr')
    parser.add_argument('--momentum', default=0.9, type=float, help='SGD momentum')
    parser.add_argument('--weight_decay', default=5e-4, type=float, help='SGD weight decay')
    # Whether to load pretrained weights for the backbone
    parser.add_argument('--finetune', default='./weights/resnet50_ram-a26f946b.pth',
                        help='finetune from checkpoint')
    # Number of classes
    parser.add_argument('--nb_classes', default=20, type=int, help='number of the classification types')
    # Number of grid cells along each image side, default 7
    parser.add_argument('--grid_size', default=7, type=int, help='grid size of each image')
    # Number of bounding boxes per grid cell, default 2
    parser.add_argument('--num_bboxes', default=2, type=int, help='boxes number of each grid')
    # Folder for saving models, training logs, etc.
    parser.add_argument('--output_dir', default='./output_dir', help='path where to save, empty for no saving')
    # Device used for training, CPU or GPU
    parser.add_argument('--device', default='cuda', help='device to use for training / testing')
    # Number of data-loading workers
    parser.add_argument('--num_workers', default=4, type=int)
    return parser

# Main function
def main(args):
    # Set the training device, CPU or GPU
    device = torch.device(args.device)
    # Create the output folder
    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    # Training-set preprocessing
    train_transform = Compose([
        RandomHorizontalFlip(0.5),
        RandomVerticalFlip(0.5),
        RandomScale(0.5),
        RandomGaussianBlur(0.5),
        RandomBrightness(0.5),
        RandomHue(0.5),
        RandomSaturation(0.5),
        RandomShift(0.5),
        RandomCrop(0.5),
        Resize(args.input_size),
        ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    # Validation-set preprocessing
    val_transform = Compose([
        Resize(args.input_size),
        ToTensor(),
        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    # Load the training data
    train_dataset = DetData(image_path=os.path.join(args.data_path, 'JPEGImages'),
                            label_file=os.path.join(args.data_path, 'train.txt'),
                            nb_classes=args.nb_classes,
                            grid_size=args.grid_size,
                            num_bboxes=args.num_bboxes,
                            transform=train_transform)
    train_loader = torch.utils.data.DataLoader(dataset=train_dataset, batch_size=args.batch_size, shuffle=True,
                                               num_workers=args.num_workers)
    # Load the validation data
    val_dataset = DetData(image_path=os.path.join(args.data_path, 'JPEGImages'),
                          label_file=os.path.join(args.data_path, 'val.txt'),
                          nb_classes=args.nb_classes,
                          grid_size=args.grid_size,
                          num_bboxes=args.num_bboxes,
                          transform=val_transform)
    val_loader = torch.utils.data.DataLoader(dataset=val_dataset, batch_size=args.batch_size, shuffle=False,
                                             num_workers=args.num_workers)
    # Build the YOLOv1 network
    model = YOLO_v1(num_classes=args.nb_classes, num_bboxes=args.num_bboxes)
    print(model)
    # Optionally load pretrained backbone weights
    if args.finetune:
        checkpoint = torch.load(args.finetune, map_location='cpu')
        msg = model.load_state_dict(checkpoint, strict=False)
        print(msg)
    # For multi-GPU training keep this line, listing the GPU ids in order; remove it for single-GPU training
    model = nn.DataParallel(model, [0, 1, 2, 3])
    # Define the loss function
    loss_function = Detect_Loss(feature_size=args.grid_size, num_bboxes=args.num_bboxes, num_classes=args.nb_classes)
    # Define the optimizer; SGD is used here
    optimizer = torch.optim.SGD(model.parameters(), lr=args.init_lr, momentum=args.momentum, weight_decay=args.weight_decay)
    # Define the LR schedule; StepLR is used here
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=int(args.epochs * 0.3), gamma=0.1)
    # Start training
    history = train_and_val(args.epochs, model, train_loader, val_loader, loss_function, optimizer, scheduler, args.output_dir, device)
    # Plot the loss and learning-rate curves
    plot_loss(np.arange(0, args.epochs), args.output_dir, history)
    plot_lr(np.arange(0, args.epochs), args.output_dir, history)

if __name__ == '__main__':
    args = get_args_parser()
    args = args.parse_args()
    main(args)
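Assuming the directory layout from section II, training can then be launched with the defaults or with explicit overrides, for example:

python train.py --batch_size 32 --epochs 80 --init_lr 0.001 --data_path ./datasets/ --output_dir ./output_dir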
yolov1.py
Defines the network architecture. This implementation replaces the YOLOv1 backbone with ResNet50, appends a residual block with a single convolution (called the Neck here) after the backbone, and uses a single convolution as the head so that the output has shape 7×7×(num_classes + 5 × num_bboxes), where the 5 values are (x, y, w, h, conf). The final activation is Sigmoid.
import math
import torch.nn as nn
import torchvision.models.resnet
from torchvision.models.resnet import Bottleneck
# Neck network definition
class NeckNet(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(NeckNet, self).__init__()
        self.conv1 = nn.Conv2d(in_channels, out_channels, 3, stride=1, padding=1)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = nn.ReLU(inplace=True)
    def forward(self, x):
        identity = x
        out = self.conv1(x)
        out = self.bn1(out)
        out += identity
        out = self.relu(out)
        return out
# YOLOv1 network definition
class YOLOv1(torchvision.models.resnet.ResNet):
    def __init__(self, block, layers, num_classes=20, num_bboxes=2):
        super(YOLOv1, self).__init__(block, layers)
        # Number of bounding boxes per grid cell, default 2
        self.B = num_bboxes
        # Number of predicted classes
        self.C = num_classes
        # Attach the Neck network as layer5
        self.layer5 = NeckNet(2048, 2048)
        # Define the head network
        self.end = nn.Sequential(
            nn.Conv2d(2048, self.C + self.B * 5, 3, stride=2, padding=1),
            nn.BatchNorm2d(self.C + self.B * 5),
            nn.Sigmoid()
        )
        self._init_weights()
    # Initialize the network parameters
    def _init_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, math.sqrt(2. / n))
                if m.bias is not None:
                    m.bias.data.zero_()
            elif isinstance(m, nn.BatchNorm2d):
                m.weight.data.fill_(1)
                m.bias.data.zero_()
            elif isinstance(m, nn.Linear):
                m.weight.data.normal_(0, 0.01)
                m.bias.data.zero_()
    def forward_features(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        return x
    def forward(self, x):
        # Backbone + Neck
        x = self.forward_features(x)
        # Head
        x = self.end(x)
        # Reshape the output feature map to [batch, 7, 7, num_classes + 5 * num_bboxes]
        x = x.permute(0, 2, 3, 1)
        return x
# ResNet50 is used here, so block is Bottleneck and layers is [3, 4, 6, 3]
def YOLO_v1(num_classes=20, num_bboxes=2):
    model = YOLOv1(block=Bottleneck, layers=[3, 4, 6, 3], num_classes=num_classes, num_bboxes=num_bboxes)
    return model
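As a quick sanity check of the architecture (a sketch, assuming the 448×448 training resolution): ResNet50 downsamples 448 to 14, and the stride-2 head convolution halves that to 7, so the output should be [batch, 7, 7, 30] for 20 classes and 2 boxes per cell:

import torch
model = YOLO_v1(num_classes=20, num_bboxes=2)
model.eval()
out = model(torch.randn(1, 3, 448, 448))
print(out.shape)  # torch.Size([1, 7, 7, 30]) = 7 x 7 x (20 + 5 * 2)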
export_onnx.py
Converts a trained model to ONNX format.
import torch
from models.yolov1 import YOLO_v1
import argparse
def get_args_parser():
    parser = argparse.ArgumentParser('Export Onnx', add_help=False)
    # Input image size, same as training
    parser.add_argument('--input_size', default=[448,448], nargs='+', type=int, help='images input size')
    # Path to the trained model weights
    parser.add_argument('--weights', default='./output_dir/last.pth', type=str, help='weights path')
    # Number of classes
    parser.add_argument('--nb_classes', default=20, type=int, help='number of the classification types')
    # Number of boxes per grid cell, default 2
    parser.add_argument('--num_bboxes', default=2, type=int, help='boxes number of each grid')
    return parser

def main(args):
    # Define the input tensor
    x = torch.randn(1, 3, args.input_size[0], args.input_size[1])
    input_names = ["input"]
    out_names = ["output"]
    # Build the network
    model = YOLO_v1(num_classes=args.nb_classes, num_bboxes=args.num_bboxes)
    # Load the weights
    checkpoint = torch.load(args.weights, map_location='cpu')
    msg = model.load_state_dict(checkpoint, strict=True)
    print(msg)
    # Switch the model to eval mode
    model.eval()
    # Export to ONNX
    torch.onnx.export(model, x, args.weights.replace('pth', 'onnx'), export_params=True, training=False, input_names=input_names, output_names=out_names)
    print('please run: python -m onnxsim test.onnx test_sim.onnx\n')

if __name__ == '__main__':
    args = get_args_parser()
    args = args.parse_args()
    main(args)
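To verify the exported file, one option is to run it with onnxruntime (assuming onnxruntime is installed and the export above produced ./output_dir/last.onnx); the input and output names match those passed to torch.onnx.export:

import numpy as np
import onnxruntime as ort

sess = ort.InferenceSession('./output_dir/last.onnx')
x = np.random.randn(1, 3, 448, 448).astype(np.float32)
out = sess.run(['output'], {'input': x})[0]
print(out.shape)  # should match the PyTorch output, e.g. (1, 7, 7, 30)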
predict.py
Runs prediction on a single image.
import argparse
import torch
import numpy as np
import torchvision.transforms as T
from models.yolov1 import YOLO_v1
from PIL import Image
import cv2
from utils.engine import postprocess
def get_args_parser():
    parser = argparse.ArgumentParser('Predict Image', add_help=False)
    # Image path
    parser.add_argument('--image_path', default='./dog.jpg', type=str, help='path of the image to predict')
    # Input image size, same as training
    parser.add_argument('--input_size', default=[448,448], nargs='+', type=int, help='images input size')
    # Path to the trained model weights
    parser.add_argument('--weights', default='./output_dir/best.pth', type=str, help='weights path')
    # Number of classes
    parser.add_argument('--nb_classes', default=20, type=int, help='number of the classification types')
    # Class confidence threshold
    parser.add_argument('--conf_thresh', default=0.2, type=float, help='thresh of cls conf')
    # Prediction probability threshold
    parser.add_argument('--prob_thresh', default=0.2, type=float, help='thresh of predict prob')
    # Non-maximum suppression threshold
    parser.add_argument('--nms_thresh', default=0.5, type=float, help='nms thresh of predict prob')
    # Number of grid cells along each image side, default 7
    parser.add_argument('--grid_size', default=7, type=int, help='grid size of each image')
    # Number of boxes per grid cell, default 2
    parser.add_argument('--num_bboxes', default=2, type=int, help='boxes number of each grid')
    # Inference device, CPU or GPU
    parser.add_argument('--device', default='cuda', help='device to use for training / testing')
    return parser

def main(args):
    # Set the inference device, CPU or GPU
    device = torch.device(args.device)
    # Load the image
    image = Image.open(args.image_path).convert('RGB')
    # Get the image size
    width, height = image.size
    # Prediction classes
    VOC_CLASSES = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor']
    # Image preprocessing
    transforms = T.Compose([
        T.Resize(args.input_size),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225]),
    ])
    # Build the network
    model = YOLO_v1(num_classes=args.nb_classes, num_bboxes=args.num_bboxes)
    # Load the weights
    checkpoint = torch.load(args.weights, map_location='cpu')
    msg = model.load_state_dict(checkpoint, strict=True)
    print(msg)
    # Move the model to the inference device
    model.to(device)
    # Switch the model to eval mode
    model.eval()
    # Preprocess the image
    input_tensor = transforms(image).unsqueeze(0).to(device)
    with torch.no_grad():
        # Run inference
        output = model(input_tensor)
    # Decode the predictions
    boxes, labels, probs = postprocess(output, width, height, VOC_CLASSES, args.grid_size, args.num_bboxes,
                                       args.conf_thresh, args.prob_thresh, args.nms_thresh, args.nb_classes)
    # Convert the image to OpenCV format for drawing boxes
    cv_image = np.array(image)
    cv_image = cv2.cvtColor(cv_image, cv2.COLOR_RGB2BGR)
    # Draw the boxes
    for box, label, prob in zip(boxes, labels, probs):
        (left, top), (right, bottom) = box
        cv2.rectangle(cv_image, (int(left), int(top)), (int(right), int(bottom)), (128, 128, 0), thickness=2)
        cv2.putText(cv_image, label + ' ' + '{:.2f}'.format(prob), (int(left), int(top) - 10),
                    cv2.FONT_HERSHEY_SIMPLEX, fontScale=0.5, color=(255, 255, 255), thickness=1, lineType=8)
    # Save the result image
    cv2.imwrite('result.png', cv_image)

if __name__ == '__main__':
    args = get_args_parser()
    args = args.parse_args()
    main(args)
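A typical single-image run (assuming best.pth exists at the default path) writes the annotated image to result.png:

python predict.py --image_path ./dog.jpg --weights ./output_dir/best.pth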
eval.py
Evaluates the model.
import argparse
import torchvision.transforms as T
import torch
import os
import numpy as np
from PIL import Image
from tqdm import tqdm
from collections import defaultdict
from models.yolov1 import YOLO_v1
from utils.metrics import evaluate
from utils.engine import postprocess
def get_args_parser():
    # Same arguments as in the scripts above
    parser = argparse.ArgumentParser('Eval Model', add_help=False)
    parser.add_argument('--data_path', default='./datasets/', type=str, help='dataset path')
    parser.add_argument('--input_size', default=[448,448], nargs='+', type=int, help='images input size')
    parser.add_argument('--weights', default='./output_dir/last.pth', type=str, help='weights path')
    parser.add_argument('--nb_classes', default=20, type=int, help='number of the classification types')
    parser.add_argument('--conf_thresh', default=0.01, type=float, help='thresh of cls conf')
    parser.add_argument('--prob_thresh', default=0.01, type=float, help='thresh of predict prob')
    parser.add_argument('--nms_thresh', default=0.5, type=float, help='nms thresh of predict prob')
    parser.add_argument('--grid_size', default=7, type=int, help='grid size of each image')
    parser.add_argument('--num_bboxes', default=2, type=int, help='boxes number of each grid')
    parser.add_argument('--device', default='cuda', help='device to use for training / testing')
    return parser

def main(args):
    device = torch.device(args.device)
    VOC_CLASSES = ['aeroplane', 'bicycle', 'bird', 'boat', 'bottle', 'bus', 'car', 'cat', 'chair', 'cow', 'diningtable', 'dog', 'horse', 'motorbike', 'person', 'pottedplant', 'sheep', 'sofa', 'train', 'tvmonitor']
    transforms = T.Compose([
        T.Resize(args.input_size),
        T.ToTensor(),
        T.Normalize(mean=[0.485, 0.456, 0.406],
                    std=[0.229, 0.224, 0.225]),
    ])
    targets = defaultdict(list)
    preds = defaultdict(list)
    model = YOLO_v1(num_classes=args.nb_classes, num_bboxes=args.num_bboxes)
    checkpoint = torch.load(args.weights, map_location='cpu')
    msg = model.load_state_dict(checkpoint, strict=True)
    print(msg)
    model.to(device)
    model.eval()
    print('Preparing ground-truth data...')
    # Read the classes and coordinates of the validation-set ground-truth boxes
    annotations = []
    with open(os.path.join(args.data_path, 'val.txt'), 'r') as f:
        lines = f.readlines()
        for line in lines:
            anno = line.strip().split()
            annotations.append(anno)
    # Prepare the ground-truth data
    image_fnames = []
    for anno in annotations:
        # Get the image filename
        filename = anno[0]
        image_fnames.append(filename)
        # Get the box coordinates and class labels for this image
        num_boxes = (len(anno) - 1) // 5
        for b in range(num_boxes):
            x1 = int(anno[5*b + 1])
            y1 = int(anno[5*b + 2])
            x2 = int(anno[5*b + 3])
            y2 = int(anno[5*b + 4])
            class_label = int(anno[5*b + 5])
            class_name = VOC_CLASSES[class_label]
            targets[(filename, class_name)].append([x1, y1, x2, y2])
    print('Predicting...')
    # Store the model predictions
    for filename in tqdm(image_fnames):
        image_path = os.path.join(args.data_path, 'JPEGImages', filename)
        image = Image.open(image_path).convert('RGB')
        width, height = image.size
        # Preprocess the image
        input_tensor = transforms(image).unsqueeze(0).to(device)
        with torch.no_grad():
            # Run inference
            output = model(input_tensor)
        # Decode the predictions
        boxes, labels, probs = postprocess(output, width, height, VOC_CLASSES, args.grid_size, args.num_bboxes,
                                           args.conf_thresh, args.prob_thresh, args.nms_thresh, args.nb_classes)
        for box, class_name, prob in zip(boxes, labels, probs):
            x1y1, x2y2 = box
            x1, y1 = int(x1y1[0]), int(x1y1[1])
            x2, y2 = int(x2y2[0]), int(x2y2[1])
            preds[class_name].append([filename, prob, x1, y1, x2, y2])
    print('Evaluate the detection result...')
    # Run the evaluation
    evaluate(preds, targets, class_names=VOC_CLASSES)

if __name__ == '__main__':
    args = get_args_parser()
    args = args.parse_args()
    main(args)
engine.py
Defines the train/validation loop, the curve-plotting functions, and the decoding and post-processing functions. Note the difference between saving a model trained on a single GPU and one wrapped in DataParallel for multi-GPU training. For a detailed explanation of non-maximum suppression, see https://blog.csdn.net/qq_38412266/article/details/139525192?spm=1001.2014.3001.5501
import os
import torch
import time
from tqdm import tqdm
import matplotlib.pyplot as plt
# Train-and-validate function
def train_and_val(epochs, model, train_loader, val_loader, criterion, optimizer, scheduler, output_dir, device):
    # Initialization
    train_loss = []
    val_loss = []
    learning_rate = []
    best_min_loss = 100
    # Move the model to the target device
    model.to(device)
    # Start timing the whole training run
    fit_time = time.time()
    for e in range(epochs):
        # Free cached GPU memory
        torch.cuda.empty_cache()
        # Print this epoch's learning rate
        print("This Epoch Learning Rate: {:.6f} ".format(scheduler.get_last_lr()[0]))
        since = time.time()
        training_loss = 0
        # Switch the model to train mode
        model.train()
        with tqdm(total=len(train_loader)) as pbar:
            for image, label in train_loader:
                # Move the images and labels to the device
                image = image.to(device)
                label = label.to(device)
                # Forward pass
                output = model(image)
                # Compute the loss
                loss = criterion(output, label)
                # Backpropagate and update the parameters
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                training_loss += loss.item()
                pbar.update(1)
        # Switch the model to eval mode
        model.eval()
        validation_loss = 0
        with torch.no_grad():
            with tqdm(total=len(val_loader)) as pb:
                for image, label in val_loader:
                    # Move the images and labels to the device
                    image = image.to(device)
                    label = label.to(device)
                    # Forward pass
                    output = model(image)
                    # Compute the loss
                    loss = criterion(output, label)
                    validation_loss += loss.item()
                    pb.update(1)
        # Record this epoch's average losses
        train_loss.append(training_loss / len(train_loader))
        val_loss.append(validation_loss / len(val_loader))
        # Record this epoch's learning rate
        learning_rate.append(scheduler.get_last_lr())
        # Append this epoch's info to log.txt
        save_file = open(os.path.join(output_dir, 'log.txt'), mode='a+')
        save_file.writelines(["Epoch:{}/{} ".format(e + 1, epochs) +
                              "Learning Rate: {:.6f} ".format(scheduler.get_last_lr()[0]) +
                              "Train Loss: {:.3f} ".format(training_loss / len(train_loader)) +
                              "Val Loss: {:.3f} ".format(validation_loss / len(val_loader)) + '\n'])
        save_file.close()
        # Multi-GPU training: save this epoch's model
        torch.save(model.module.state_dict(), os.path.join(output_dir, 'last.pth'))
        # Single-GPU training: use this line instead
        #torch.save(model.state_dict(), os.path.join(output_dir, 'last.pth'))
        # If this epoch's val loss is the lowest so far, save the best model
        if best_min_loss > (validation_loss / len(val_loader)):
            print("--save best model,loss is {:.6f}--".format(validation_loss / len(val_loader)))
            best_min_loss = validation_loss / len(val_loader)
            # Multi-GPU training: save the best model
            torch.save(model.module.state_dict(), os.path.join(output_dir, 'best.pth'))
            # Single-GPU training: use this line instead
            #torch.save(model.state_dict(), os.path.join(output_dir, 'best.pth'))
        # Print this epoch's info
        print("Epoch:{}/{} ".format(e + 1, epochs),
              "Train Loss: {:.3f} ".format(training_loss / len(train_loader)),
              "Val Loss: {:.3f} ".format(validation_loss / len(val_loader)),
              "Time: {:.2f}s".format((time.time() - since)))
        # Update the learning rate
        scheduler.step()
    # Collect the per-epoch history
    history = {'train_loss': train_loss, 'val_loss': val_loss, 'lr': learning_rate}
    print('Total time: {:.2f} m'.format((time.time() - fit_time) / 60))
    return history
# Plot the loss curves
def plot_loss(x, output_dir, history):
    plt.plot(x, history['val_loss'], label='val', marker='o')
    plt.plot(x, history['train_loss'], label='train', marker='o')
    plt.title('Loss per epoch')
    plt.ylabel('loss')
    plt.xlabel('epoch')
    plt.legend(), plt.grid()
    plt.savefig(os.path.join(output_dir, 'loss.png'))
    plt.clf()
# Plot the learning-rate curve
def plot_lr(x, output_dir, history):
    plt.plot(x, history['lr'], label='learning_rate', marker='x')
    plt.title('learning rate per epoch')
    plt.ylabel('Learning_rate')
    plt.xlabel('epoch')
    plt.legend(), plt.grid()
    plt.savefig(os.path.join(output_dir, 'learning_rate.png'))
    plt.clf()
# Non-maximum suppression
def nms(boxes, scores, threshold):
    x1 = boxes[:, 0]  # [n,]
    y1 = boxes[:, 1]  # [n,]
    x2 = boxes[:, 2]  # [n,]
    y2 = boxes[:, 3]  # [n,]
    areas = (x2 - x1) * (y2 - y1)  # [n,]
    _, ids_sorted = scores.sort(0, descending=True)  # [n,]
    ids = []
    while ids_sorted.numel() > 0:
        # Assume `ids_sorted` size is [m,] at the beginning of this iteration.
        i = ids_sorted.item() if (ids_sorted.numel() == 1) else ids_sorted[0]
        ids.append(i)
        if ids_sorted.numel() == 1:
            break  # If only one box is left (i.e., no box to suppress), break.
        inter_x1 = x1[ids_sorted[1:]].clamp(min=x1[i])  # [m-1,]
        inter_y1 = y1[ids_sorted[1:]].clamp(min=y1[i])  # [m-1,]
        inter_x2 = x2[ids_sorted[1:]].clamp(max=x2[i])  # [m-1,]
        inter_y2 = y2[ids_sorted[1:]].clamp(max=y2[i])  # [m-1,]
        inter_w = (inter_x2 - inter_x1).clamp(min=0)  # [m-1,]
        inter_h = (inter_y2 - inter_y1).clamp(min=0)  # [m-1,]
        inters = inter_w * inter_h  # intersections between box `i` and the other boxes, sized [m-1,]
        unions = areas[i] + areas[ids_sorted[1:]] - inters  # unions between box `i` and the other boxes, sized [m-1,]
        ious = inters / unions  # [m-1,]
        # Remove boxes whose IoU with box `i` is higher than the threshold.
        ids_keep = (ious <= threshold).nonzero().squeeze()  # [m-1,]; `nonzero()` adds an extra dimension, so squeeze it.
        if ids_keep.numel() == 0:
            break  # If no box is left, break.
        ids_sorted = ids_sorted[ids_keep + 1]  # `+1` is needed because `ids_sorted[0] = i`.
    return torch.LongTensor(ids)
# Decode the raw network output into boxes, labels, and confidences
def decode(pred_tensor, grid_size, num_bboxes, conf_thresh, prob_thresh, nb_classes):
    S, B, C = grid_size, num_bboxes, nb_classes
    boxes, labels, confidences, class_scores = [], [], [], []
    cell_size = 1.0 / float(S)
    pred_tensor = pred_tensor.cpu().data.squeeze(0)
    pred_tensor_conf_list = []
    for b in range(B):
        pred_tensor_conf_list.append(pred_tensor[:, :, 5 * b + 4].unsqueeze(2))
    grid_ceil_conf = torch.cat(pred_tensor_conf_list, 2)
    # For each cell, keep the box with the highest objectness confidence
    grid_ceil_conf, grid_ceil_index = grid_ceil_conf.max(2)
    class_conf, class_index = pred_tensor[:, :, 5 * B:].max(2)
    class_conf[class_conf <= conf_thresh] = 0
    class_prob = class_conf * grid_ceil_conf
    for i in range(S):
        for j in range(S):
            if float(class_prob[j, i]) < prob_thresh:
                continue
            box = pred_tensor[j, i, 5 * grid_ceil_index[j, i]: 5 * grid_ceil_index[j, i] + 4]
            # (x, y) are offsets within the cell; convert them to image-normalized coordinates
            xy_start_pos = torch.FloatTensor([i, j]) * cell_size
            xy_normalized = box[:2] * cell_size + xy_start_pos
            wh_normalized = box[2:]
            box_xyxy = torch.FloatTensor(4)
            box_xyxy[:2] = xy_normalized - 0.5 * wh_normalized
            box_xyxy[2:] = xy_normalized + 0.5 * wh_normalized
            boxes.append(box_xyxy)
            labels.append(class_index[j, i])
            confidences.append(grid_ceil_conf[j, i])
            class_scores.append(class_conf[j, i])
    if len(boxes) > 0:
        boxes = torch.stack(boxes, 0)
        labels = torch.stack(labels, 0)
        confidences = torch.stack(confidences, 0)
        class_scores = torch.stack(class_scores, 0)
    else:
        boxes = torch.FloatTensor(0, 4)
        labels = torch.LongTensor(0)
        confidences = torch.FloatTensor(0)
        class_scores = torch.FloatTensor(0)
    return boxes, labels, confidences, class_scores
def postprocess(output, width, height, VOC_CLASSES, grid_size, num_bboxes, conf_thresh, prob_thresh, nms_thresh, nb_classes):
    boxes, labels, probs = [], [], []
    boxes_list, labels_list, confidences_list, class_scores_list = decode(output, grid_size, num_bboxes,
                                                                          conf_thresh, prob_thresh,
                                                                          nb_classes)
    if boxes_list.shape[0] != 0:
        boxes_nms, labels_nms, probs_nms = [], [], []
        # Apply NMS per class
        for class_label in range(len(VOC_CLASSES)):
            ids = (labels_list == class_label)
            if torch.sum(ids) == 0:
                continue
            boxes_list_current_cls = boxes_list[ids]
            labels_list_current_cls = labels_list[ids]
            confidences_list_current_cls = confidences_list[ids]
            class_scores_list_current_cls = class_scores_list[ids]
            ids_postprocess = nms(boxes_list_current_cls, confidences_list_current_cls, nms_thresh)
            boxes_nms.append(boxes_list_current_cls[ids_postprocess])
            labels_nms.append(labels_list_current_cls[ids_postprocess])
            probs_nms.append(
                confidences_list_current_cls[ids_postprocess] * class_scores_list_current_cls[ids_postprocess])
        boxes_nms = torch.cat(boxes_nms, 0)
        labels_nms = torch.cat(labels_nms, 0)
        probs_nms = torch.cat(probs_nms, 0)
        for box, label, prob in zip(boxes_nms, labels_nms, probs_nms):
            x1, x2 = width * box[0], width * box[2]  # unnormalize x with the image width
            y1, y2 = height * box[1], height * box[3]  # unnormalize y with the image height
            boxes.append(((x1, y1), (x2, y2)))
            label_idx = int(label)  # convert from LongTensor to int
            class_name = VOC_CLASSES[label_idx]
            labels.append(class_name)
            prob = float(prob)
            probs.append(prob)
    return boxes, labels, probs
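Two notes on the functions above. In decode, the predicted (x, y) are offsets within a grid cell: for cell (i=3, j=2) with box = [0.5, 0.5, 0.2, 0.3], the normalized center is ((3 + 0.5)/7, (2 + 0.5)/7) ≈ (0.50, 0.36), with width 0.2 and height 0.3. And nms can be exercised on its own; a small sketch with hypothetical boxes and scores:

import torch
boxes = torch.tensor([[10., 10., 50., 50.],       # box A
                      [12., 12., 52., 52.],       # box B, heavily overlaps A
                      [100., 100., 150., 150.]])  # box C, disjoint from both
scores = torch.tensor([0.9, 0.8, 0.7])
keep = nms(boxes, scores, 0.5)
print(keep)  # tensor([0, 2]): B is suppressed by the higher-scoring A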
loss.py
Defines the detection loss function.
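The class below implements the multi-part sum-squared-error loss from the YOLOv1 paper, which in the paper's notation is

$$\begin{aligned} \mathcal{L} ={}& \lambda_{coord} \sum_{i=0}^{S^2}\sum_{j=0}^{B} \mathbb{1}_{ij}^{obj} \left[ (x_i - \hat{x}_i)^2 + (y_i - \hat{y}_i)^2 \right] + \lambda_{coord} \sum_{i=0}^{S^2}\sum_{j=0}^{B} \mathbb{1}_{ij}^{obj} \left[ (\sqrt{w_i} - \sqrt{\hat{w}_i})^2 + (\sqrt{h_i} - \sqrt{\hat{h}_i})^2 \right] \\ &+ \sum_{i=0}^{S^2}\sum_{j=0}^{B} \mathbb{1}_{ij}^{obj} (C_i - \hat{C}_i)^2 + \lambda_{noobj} \sum_{i=0}^{S^2}\sum_{j=0}^{B} \mathbb{1}_{ij}^{noobj} (C_i - \hat{C}_i)^2 + \sum_{i=0}^{S^2} \mathbb{1}_{i}^{obj} \sum_{c \in classes} (p_i(c) - \hat{p}_i(c))^2 \end{aligned}$$

with λ_coord = 5 and λ_noobj = 0.5, and the total averaged over the batch in forward below.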
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
class Detect_Loss(nn.Module):
    def __init__(self, feature_size=7, num_bboxes=2, num_classes=20, lambda_coord=5.0, lambda_noobj=0.5):
        super(Detect_Loss, self).__init__()
        self.S = feature_size
        self.B = num_bboxes
        self.C = num_classes
        self.lambda_coord = lambda_coord
        self.lambda_noobj = lambda_noobj
    # Pairwise IoU between two sets of boxes in (x1, y1, x2, y2) format
    def compute_iou(self, bbox1, bbox2):
        N = bbox1.size(0)
        M = bbox2.size(0)
        lt = torch.max(
            bbox1[:, :2].unsqueeze(1).expand(N, M, 2),  # [N, 2] -> [N, 1, 2] -> [N, M, 2]
            bbox2[:, :2].unsqueeze(0).expand(N, M, 2)   # [M, 2] -> [1, M, 2] -> [N, M, 2]
        )
        rb = torch.min(
            bbox1[:, 2:].unsqueeze(1).expand(N, M, 2),  # [N, 2] -> [N, 1, 2] -> [N, M, 2]
            bbox2[:, 2:].unsqueeze(0).expand(N, M, 2)   # [M, 2] -> [1, M, 2] -> [N, M, 2]
        )
        wh = rb - lt
        wh[wh < 0] = 0
        inter = wh[:, :, 0] * wh[:, :, 1]  # [N, M]
        area1 = (bbox1[:, 2] - bbox1[:, 0]) * (bbox1[:, 3] - bbox1[:, 1])  # [N,]
        area2 = (bbox2[:, 2] - bbox2[:, 0]) * (bbox2[:, 3] - bbox2[:, 1])  # [M,]
        area1 = area1.unsqueeze(1).expand_as(inter)  # [N,] -> [N, 1] -> [N, M]
        area2 = area2.unsqueeze(0).expand_as(inter)  # [M,] -> [1, M] -> [N, M]
        union = area1 + area2 - inter  # [N, M]
        iou = inter / union  # [N, M]
        return iou
    def forward(self, pred_tensor, target_tensor):
        S, B, C = self.S, self.B, self.C
        N = 5 * B + C
        batch_size = pred_tensor.size(0)
        # Split cells into those that contain an object and those that do not
        coord_mask = target_tensor[:, :, :, 4] > 0
        noobj_mask = target_tensor[:, :, :, 4] == 0
        coord_mask = coord_mask.unsqueeze(-1).expand_as(target_tensor)
        noobj_mask = noobj_mask.unsqueeze(-1).expand_as(target_tensor)
        coord_pred = pred_tensor[coord_mask].view(-1, N)
        bbox_pred = coord_pred[:, :5 * B].contiguous().view(-1, 5)
        class_pred = coord_pred[:, 5 * B:]
        coord_target = target_tensor[coord_mask].view(-1, N)
        bbox_target = coord_target[:, :5 * B].contiguous().view(-1, 5)
        class_target = coord_target[:, 5 * B:]
        noobj_pred = pred_tensor[noobj_mask].view(-1, N)
        noobj_target = target_tensor[noobj_mask].view(-1, N)
        # No-object confidence loss
        noobj_conf_mask = torch.cuda.BoolTensor(noobj_pred.size()).fill_(0)
        for b in range(B):
            noobj_conf_mask[:, 4 + b * 5] = 1
        noobj_pred_conf = noobj_pred[noobj_conf_mask]
        noobj_target_conf = noobj_target[noobj_conf_mask]
        loss_noobj = F.mse_loss(noobj_pred_conf, noobj_target_conf, reduction='sum')
        # For each object cell, pick the predicted box with the highest IoU as the "responsible" box
        coord_response_mask = torch.cuda.BoolTensor(bbox_target.size()).fill_(0)
        coord_not_response_mask = torch.cuda.BoolTensor(bbox_target.size()).fill_(1)
        bbox_target_iou = torch.zeros(bbox_target.size()).cuda()
        for i in range(0, bbox_target.size(0), B):
            pred = bbox_pred[i:i + B]
            pred_xyxy = Variable(torch.FloatTensor(pred.size()))
            pred_xyxy[:, :2] = pred[:, :2] / float(S) - 0.5 * pred[:, 2:4]
            pred_xyxy[:, 2:4] = pred[:, :2] / float(S) + 0.5 * pred[:, 2:4]
            target = bbox_target[i].view(-1, 5)
            target_xyxy = Variable(torch.FloatTensor(target.size()))
            target_xyxy[:, :2] = target[:, :2] / float(S) - 0.5 * target[:, 2:4]
            target_xyxy[:, 2:4] = target[:, :2] / float(S) + 0.5 * target[:, 2:4]
            iou = self.compute_iou(pred_xyxy[:, :4], target_xyxy[:, :4])
            max_iou, max_index = iou.max(0)
            max_index = max_index.data.cuda()
            coord_response_mask[i + max_index] = 1
            coord_not_response_mask[i + max_index] = 0
            # The confidence target of the responsible box is its IoU with the ground truth
            bbox_target_iou[i + max_index, torch.LongTensor([4]).cuda()] = (max_iou).data.cuda()
        bbox_target_iou = Variable(bbox_target_iou).cuda()
        bbox_pred_response = bbox_pred[coord_response_mask].view(-1, 5)
        bbox_target_response = bbox_target[coord_response_mask].view(-1, 5)
        target_iou = bbox_target_iou[coord_response_mask].view(-1, 5)
        # Coordinate, objectness, and classification losses
        loss_xy = F.mse_loss(bbox_pred_response[:, :2], bbox_target_response[:, :2], reduction='sum')
        loss_wh = F.mse_loss(torch.sqrt(bbox_pred_response[:, 2:4]), torch.sqrt(bbox_target_response[:, 2:4]), reduction='sum')
        loss_obj = F.mse_loss(bbox_pred_response[:, 4], target_iou[:, 4], reduction='sum')
        loss_class = F.mse_loss(class_pred, class_target, reduction='sum')
        # Weighted sum, averaged over the batch
        loss = self.lambda_coord * (loss_xy + loss_wh) + loss_obj + self.lambda_noobj * loss_noobj + loss_class
        loss = loss / float(batch_size)
        return loss
metrics.py
Defines the mAP computation.
import numpy as np
def compute_average_precision(recall, precision):
    recall = np.concatenate(([0.0], recall, [1.0]))
    precision = np.concatenate(([0.0], precision, [0.0]))
    for i in range(precision.size - 1, 0, -1):
        precision[i - 1] = max(precision[i - 1], precision[i])
    ap = 0.0
    for i in range(precision.size - 1):
        ap += (recall[i + 1] - recall[i]) * precision[i + 1]
    return ap
def evaluate(preds, targets, class_names, threshold=0.5):
    aps = []
    for class_name in class_names:
        class_preds = preds[class_name]
        if len(class_preds) == 0:
            ap = 0.0
            print('---class {} AP {}---'.format(class_name, ap))
            aps.append(ap)
            continue  # move on to the next class instead of aborting the evaluation
        image_fnames = [pred[0] for pred in class_preds]
        probs = [pred[1] for pred in class_preds]
        boxes = [pred[2:] for pred in class_preds]
        # Sort detections by descending confidence
        sorted_idxs = np.argsort(probs)[::-1]
        image_fnames = [image_fnames[i] for i in sorted_idxs]
        boxes = [boxes[i] for i in sorted_idxs]
        # Count the ground-truth boxes of this class
        num_gt_boxes = 0
        for (filename_gt, class_name_gt) in targets:
            if class_name_gt == class_name:
                num_gt_boxes += len(targets[filename_gt, class_name_gt])
        num_detections = len(boxes)
        tp = np.zeros(num_detections)
        fp = np.ones(num_detections)
        for det_idx, (filename, box) in enumerate(zip(image_fnames, boxes)):
            if (filename, class_name) in targets:
                boxes_gt = targets[(filename, class_name)]
                for box_gt in boxes_gt:
                    inter_x1 = max(box_gt[0], box[0])
                    inter_y1 = max(box_gt[1], box[1])
                    inter_x2 = min(box_gt[2], box[2])
                    inter_y2 = min(box_gt[3], box[3])
                    inter_w = max(0.0, inter_x2 - inter_x1 + 1.0)
                    inter_h = max(0.0, inter_y2 - inter_y1 + 1.0)
                    inter = inter_w * inter_h
                    area_det = (box[2] - box[0] + 1.0) * (box[3] - box[1] + 1.0)
                    area_gt = (box_gt[2] - box_gt[0] + 1.0) * (box_gt[3] - box_gt[1] + 1.0)
                    union = area_det + area_gt - inter
                    iou = inter / union
                    # A detection is a true positive if it matches an unused ground-truth box with IoU >= threshold
                    if iou >= threshold:
                        tp[det_idx] = 1.0
                        fp[det_idx] = 0.0
                        boxes_gt.remove(box_gt)
                        if len(boxes_gt) == 0:
                            del targets[(filename, class_name)]
                        break
        tp_cumsum = np.cumsum(tp)
        fp_cumsum = np.cumsum(fp)
        eps = np.finfo(np.float64).eps
        precision = tp_cumsum / np.maximum(tp_cumsum + fp_cumsum, eps)
        recall = tp_cumsum / float(num_gt_boxes)
        ap = compute_average_precision(recall, precision)
        print('---class {} AP {}---'.format(class_name, ap))
        aps.append(ap)
    print('---mAP {}---'.format(np.mean(aps)))
    return aps
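compute_average_precision uses all-point interpolation: the precision array is first made monotonically non-increasing from right to left, then the area under the precision-recall curve is accumulated. A small sketch with hypothetical values:

import numpy as np
recall = np.array([0.5, 1.0])
precision = np.array([1.0, 0.5])
print(compute_average_precision(recall, precision))  # 0.75 = 0.5 * 1.0 + 0.5 * 0.5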
More updates to come.