



本篇介绍如何让检测器在视频或者网络摄像头上实时工作。我们将引入一些命令行参数,以便使用该网络的各种超参数进行实验。代码文件为 video.py,整体上与 detect.py 很相似,只有几处变化:我们不再在 batch 上迭代,而是在视频的帧上迭代。

注意:源代码中有一处错误,我已进行修改。源代码在计算 scaling_factor 时使用了 `scaling_factor = torch.min(416/im_dim,1)[0].view(-1,1)`,把输入分辨率写死为 416;当用户通过 --reso 指定其他分辨率时,缩放系数就会算错。应该使用用户输入的 args.reso,即改为 `scaling_factor = torch.min(int(args.reso)/im_dim,1)[0].view(-1,1)`。


from __future__ import division
import time
import torch 
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
import cv2 
from util import *
import argparse
import os 
import os.path as osp
from darknet import Darknet
import pickle as pkl
import pandas as pd
import random

def arg_parse():
    """Parse the command-line options of the YOLO v3 video detector.

    Options are read from ``sys.argv``.

    Returns:
        argparse.Namespace: with attributes ``bs``, ``confidence``,
        ``nms_thresh``, ``cfgfile``, ``weightsfile``, ``reso`` and
        ``videofile``.  ``bs``, ``confidence`` and ``nms_thresh`` declare no
        ``type``, so they are numbers when defaulted and strings when given
        on the command line; callers convert them with ``int``/``float``.
    """
    # add_argument layout used below: flag, dest (attribute name on the
    # returned namespace), help text, default value, type.
    parser = argparse.ArgumentParser(description='YOLO v3 检测模型')
    parser.add_argument("--bs", dest="bs", help="Batch size,默认为 1", default=1)
    parser.add_argument("--confidence", dest="confidence",
                        help="目标检测结果置信度阈值", default=0.5)
    parser.add_argument("--nms_thresh", dest="nms_thresh",
                        help="NMS非极大值抑制阈值", default=0.4)
    # The help text of the next two options was missing in the original,
    # which left a dangling ``help =`` and made the file unparsable.
    parser.add_argument("--cfg", dest='cfgfile', help="配置文件路径",
                        default="cfg/yolov3.cfg", type=str)
    parser.add_argument("--weights", dest='weightsfile', help="权重文件路径",
                        default="yolov3.weights", type=str)
    parser.add_argument("--reso", dest='reso', help=
                        "网络输入分辨率. 分辨率越高,则准确率越高; 反之亦然",
                        default="416", type=str)
    parser.add_argument("--video", dest="videofile", help="待检测视频目录",
                        default="video.avi", type=str)
    return parser.parse_args()
# Parsed command-line options.  ``args`` is an argparse.Namespace, so every
# option is read as an attribute, e.g. with the defaults:
# Namespace(bs=1, cfgfile='cfg/yolov3.cfg', confidence=0.5, nms_thresh=0.4,
#           reso='416', videofile='video.avi', weightsfile='yolov3.weights')
args = arg_parse()
batch_size = int(args.bs)
confidence = float(args.confidence)
nms_thesh = float(args.nms_thresh)
start = 0  # placeholder; reassigned to the real start time before the frame loop
CUDA = torch.cuda.is_available()  # True when a CUDA device can be used

num_classes = 80  # the COCO dataset has 80 classes
# Class names, indexed by class id.  Per the original author's note,
# load_classes() returns a list with one name per entry of coco.names.
classes = load_classes("data/coco.names")

# Build the network from the cfg file and load the pretrained weights into it.
model = Darknet(args.cfgfile)
model.load_weights(args.weightsfile)

# Network input size: the user-selected resolution is stored in net_info and
# read back as an int (the input is square, so one number is enough).
model.net_info["height"] = args.reso
inp_dim = int(model.net_info["height"])
# Explicit raises instead of asserts: this validates user input and must
# still run under ``python -O``.
if inp_dim % 32 != 0:
    raise ValueError("--reso must be a multiple of 32, got {}".format(inp_dim))
if inp_dim <= 32:
    raise ValueError("--reso must be greater than 32, got {}".format(inp_dim))

# Run the model on the GPU when one is available.
if CUDA:
    model.cuda()

# Switch to evaluation mode: dropout and batch normalisation behave
# differently during training and inference.
model.eval()

#要在视频或网络摄像头上运行这个检测器,代码基本可以保持不变,只是我们不会在 batch 上迭代,而是在视频的帧上迭代。
# 将方框和文字写在图片上
def write(x, results):
    """Draw one detection (bounding box plus class label) onto an image.

    Reads the module-level ``classes`` (class names) and ``colors``
    (palette to pick a random box colour from).

    Args:
        x: One detection row.  Elements 1-2 are used as the top-left corner
            (x1, y1), elements 3-4 as the bottom-right corner (x2, y2) and
            the last element as the class index.  Presumably a 1-D tensor
            row of the ``write_results`` output -- TODO confirm.
        results: Image to draw on; it is modified in place.

    Returns:
        The same image object that was passed in as ``results``.
    """
    img = results
    # Convert the corners to plain Python ints (truncating, like ``.int()``):
    # recent OpenCV bindings reject points made of 0-dim tensors.
    top_left = tuple(int(v) for v in x[1:3])
    bottom_right = tuple(int(v) for v in x[3:5])
    cls = int(x[-1])
    color = random.choice(colors)  # random colour for this box
    label = "{0}".format(classes[cls])  # class name shown next to the box

    # Outline of the detected object.
    cv2.rectangle(img, top_left, bottom_right, color, 1)

    # Filled rectangle sized to the label text (3 px / 4 px of padding),
    # anchored at the box's top-left corner, then the text on top of it.
    t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_PLAIN, 1, 1)[0]
    label_corner = top_left[0] + t_size[0] + 3, top_left[1] + t_size[1] + 4
    cv2.rectangle(img, top_left, label_corner, color, -1)
    # The point passed to putText is the bottom-left corner of the text.
    cv2.putText(img, label, (top_left[0], top_left[1] + t_size[1] + 4),
                cv2.FONT_HERSHEY_PLAIN, 1, [225, 255, 255], 1)
    return img

#Detection phase

videofile = args.videofile  # path to the input video file

cap = cv2.VideoCapture(videofile)  # open the video with OpenCV

# cap = cv2.VideoCapture(0)  # use this instead to read from a webcam

# Fail early when the source cannot be opened.
if not cap.isOpened():
    raise RuntimeError('Cannot capture source')

# ``frames`` counts processed frames; together with ``start`` it yields the FPS.
frames = 0
start = time.time()

fourcc = cv2.VideoWriter_fourcc('M', 'J', 'P', 'G')
fps = 24
savedPath = './det/savevideo.avi'  # where the annotated video is saved

# VideoWriter does not create missing directories, so make sure the target
# directory exists before opening the writer.
os.makedirs(osp.dirname(savedPath), exist_ok=True)

# Take the frame size from the capture properties instead of reading (and
# thereby dropping) the first frame just to inspect its shape.
frame_width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
frame_height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
videoWriter = cv2.VideoWriter(savedPath, fourcc, fps, (frame_width, frame_height))

# Class names and the colour palette never change between frames, so load
# them once here instead of once per frame; ``write`` reads both as globals.
classes = load_classes('data/coco.names')
with open("pallete", "rb") as palette_file:
    # NOTE(review): pickle must only be used on trusted files.
    colors = pkl.load(palette_file)

try:
    while cap.isOpened():
        # ``ret`` is False when no frame could be read (e.g. end of video).
        ret, frame = cap.read()
        if not ret:
            break

        # Preprocess the frame for the network.  Per the original author's
        # note, prep_image letterboxes the frame to inp_dim x inp_dim
        # (padding with grey) and converts HxWxC to CxHxW.
        img = prep_image(frame, inp_dim)

        # Only one frame is processed at a time, so a single (W, H) pair
        # replaces the im_dim_list tensor of the batch version.
        # repeat(1, 2) turns the 2-element tensor into a 1x4 (W, H, W, H) row.
        im_dim = frame.shape[1], frame.shape[0]
        im_dim = torch.FloatTensor(im_dim).repeat(1, 2)
        if CUDA:
            im_dim = im_dim.cuda()
            img = img.cuda()

        # Forward pass only; no gradients are needed for inference, which
        # also makes the deprecated ``Variable(..., volatile=True)`` wrapper
        # unnecessary.
        with torch.no_grad():
            output = model(img, CUDA)
        output = write_results(output, confidence, num_classes, nms_conf=nms_thesh)

        # write_results hands back an int instead of a tensor when nothing
        # was detected: show and save the untouched frame, then move on.
        if isinstance(output, int):
            frames += 1
            print("FPS of the video is {:5.4f}".format(frames / (time.time() - start)))
            cv2.imshow("frame", frame)
            # Keep the saved video continuous even without detections.
            videoWriter.write(frame)
            key = cv2.waitKey(1)
            if key & 0xFF == ord('q'):
                break
            continue

        # Every box belongs to the same frame, so the frame size is simply
        # repeated once per detection.
        im_dim = im_dim.repeat(output.size(0), 1)
        # Scale factor between the original frame and the network input.
        # inp_dim is the user-selected resolution (int(args.reso)), not a
        # hard-coded 416.
        scaling_factor = torch.min(inp_dim / im_dim, 1)[0].view(-1, 1)
        # Shift the box corners so they are relative to the region of the
        # padded input that actually contains the frame.
        output[:, [1, 3]] -= (inp_dim - scaling_factor * im_dim[:, 0].view(-1, 1)) / 2
        output[:, [2, 4]] -= (inp_dim - scaling_factor * im_dim[:, 1].view(-1, 1)) / 2
        # Map the coordinates back to the original frame.
        output[:, 1:5] /= scaling_factor
        # Clip the boxes to the frame.  All rows share the same frame size,
        # so one vectorised clamp per axis replaces the per-row loop.
        output[:, [1, 3]] = torch.clamp(output[:, [1, 3]], 0.0, float(frame.shape[1]))
        output[:, [2, 4]] = torch.clamp(output[:, [2, 4]], 0.0, float(frame.shape[0]))

        # Draw every detection onto the frame (in place).
        for detection in output:
            write(detection, frame)

        cv2.imshow("frame", frame)
        videoWriter.write(frame)  # append the annotated frame to the output video
        # waitKey returns the code of the pressed key; 'q' (113) quits.
        key = cv2.waitKey(1)
        if key & 0xFF == ord('q'):
            break
        frames += 1
        print(time.time() - start)
        print("FPS of the video is {:5.2f}".format(frames / (time.time() - start)))
finally:
    # Release once, after the loop: releasing the writer inside the loop
    # closes the output file after the first frame.
    videoWriter.release()
    cap.release()
    cv2.destroyAllWindows()


YOLOv5是一种目标检测算法,是对YOLO系列的最新改进。它采用了一种新的架构设计,具有更高的检测精度和更快的检测速度。下面是YOLOv5的代码详细注释:

1.导入必要的库

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
```

2.定义Conv和Bottleneck块

```python
# Conv块
def conv(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=False, groups=1):
    return nn.Conv2d(in_channels, out_channels, kernel_size, stride, padding, bias=bias, groups=groups)

# Bottleneck块
class Bottleneck(nn.Module):
    # 构造函数
    def __init__(self, in_channels, out_channels, shortcut=True, groups=1, expansion=0.5):
        super(Bottleneck, self).__init__()
        mid_channels = int(out_channels * expansion)
        self.conv1 = conv(in_channels, mid_channels, kernel_size=1, stride=1, padding=0, bias=False, groups=groups)
        self.bn1 = nn.BatchNorm2d(mid_channels)
        self.act1 = nn.SiLU(inplace=True)
        self.conv2 = conv(mid_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False, groups=groups)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.act2 = nn.SiLU(inplace=True)
        self.shortcut = shortcut and in_channels == out_channels
        if self.shortcut:
            self.conv_shortcut = conv(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=False, groups=groups)
            self.bn_shortcut = nn.BatchNorm2d(out_channels)

    # 前向传播函数
    def forward(self, x):
        shortcut = x
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.act1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.act2(x)
        if self.shortcut:
            shortcut = self.conv_shortcut(shortcut)
            shortcut = self.bn_shortcut(shortcut)
            x += shortcut
            x = self.act2(x)
        return x
```

3.定义CSPDarknet53主干网络

```python
class CSPDarknet53(nn.Module):
    # 构造函数
    def __init__(self, layers):
        super(CSPDarknet53, self).__init__()
        self.stem = nn.Sequential(
            conv(3, 32, kernel_size=3, stride=1, padding=1, bias=False),
            nn.BatchNorm2d(32),
            nn.SiLU(inplace=True)
        )
        self.layer1 = nn.Sequential(
            conv(32, 64, kernel_size=3, stride=2, padding=1, bias=False), nn.BatchNorm2d(64), nn.SiLU(inplace=True),
            Bottleneck(64, 64 * 2), nn.BatchNorm2d(64 * 2), nn.SiLU(inplace=True),
            Bottleneck(64 * 2, 64)
        )
        self.layer2 = nn.Sequential(
            conv(64, 128, kernel_size=3, stride=2, padding=1, bias=False), nn.BatchNorm2d(128), nn.SiLU(inplace=True),
            Bottleneck(128, 128 * 2), nn.BatchNorm2d(128 * 2), nn.SiLU(inplace=True),
            Bottleneck(128 * 2, 128), nn.BatchNorm2d(128), nn.SiLU(inplace=True),
            Bottleneck(128, 128 * 2), nn.BatchNorm2d(128 * 2), nn.SiLU(inplace=True),
            Bottleneck(128 * 2, 128)
        )
        self.layer3 = nn.Sequential(
            conv(128, 256, kernel_size=3, stride=2, padding=1, bias=False), nn.BatchNorm2d(256), nn.SiLU(inplace=True),
            Bottleneck(256, 256 * 2), nn.BatchNorm2d(256 * 2), nn.SiLU(inplace=True),
            Bottleneck(256 * 2, 256), nn.BatchNorm2d(256), nn.SiLU(inplace=True),
            Bottleneck(256, 256 * 2), nn.BatchNorm2d(256 * 2), nn.SiLU(inplace=True),
            Bottleneck(256 * 2, 256), nn.BatchNorm2d(256), nn.SiLU(inplace=True),
            Bottleneck(256, 256 * 2), nn.BatchNorm2d(256 * 2), nn.SiLU(inplace=True),
            Bottleneck(256 * 2, 256)
        )
        self.layer4 = nn.Sequential(
            conv(256, 512, kernel_size=3, stride=2, padding=1, bias=False), nn.BatchNorm2d(512), nn.SiLU(inplace=True),
            Bottleneck(512, 512 * 2), nn.BatchNorm2d(512 * 2), nn.SiLU(inplace=True),
            Bottleneck(512 * 2, 512), nn.BatchNorm2d(512), nn.SiLU(inplace=True),
            Bottleneck(512, 512 * 2), nn.BatchNorm2d(512 * 2), nn.SiLU(inplace=True),
            Bottleneck(512 * 2, 512), nn.BatchNorm2d(512), nn.SiLU(inplace=True),
            Bottleneck(512, 512 * 2), nn.BatchNorm2d(512 * 2), nn.SiLU(inplace=True),
            Bottleneck(512 * 2, 512)
        )
        self.layers = nn.ModuleList([self.layer1, self.layer2, self.layer3, self.layer4])
        self.out_channels = [64, 128, 256, 512]

    # 前向传播函数
    def forward(self, x):
        x = self.stem(x)
        outs = []
        for layer in self.layers:
            x = layer(x)
            outs.append(x)
        return tuple(outs)
```

4.定义SPP和PANet模块

```python
# SPP模块
class SPP(nn.Module):
    # 构造函数
    def __init__(self, in_channels, out_channels):
        super(SPP, self).__init__()
        self.conv1 = conv(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.act1 = nn.SiLU(inplace=True)
        self.pool2 = nn.MaxPool2d(kernel_size=5, stride=1, padding=2)
        self.conv2 = conv(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.act2 = nn.SiLU(inplace=True)
        self.pool3 = nn.MaxPool2d(kernel_size=9, stride=1, padding=4)
        self.conv3 = conv(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn3 = nn.BatchNorm2d(out_channels)
        self.act3 = nn.SiLU(inplace=True)
        self.conv4 = conv(out_channels * 4, out_channels, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn4 = nn.BatchNorm2d(out_channels)
        self.act4 = nn.SiLU(inplace=True)

    # 前向传播函数
    def forward(self, x):
        x1 = self.conv1(x)
        x1 = self.bn1(x1)
        x1 = self.act1(x1)
        x2 = self.pool2(x)
        x2 = self.conv2(x2)
        x2 = self.bn2(x2)
        x2 = self.act2(x2)
        x3 = self.pool3(x)
        x3 = self.conv3(x3)
        x3 = self.bn3(x3)
        x3 = self.act3(x3)
        x = torch.cat([x1, x2, x3, x], dim=1)
        x = self.conv4(x)
        x = self.bn4(x)
        x = self.act4(x)
        return x

# PANet模块
class PANet(nn.Module):
    # 构造函数
    def __init__(self, in_channels, out_channels):
        super(PANet, self).__init__()
        self.upsample = nn.Upsample(scale_factor=2, mode='nearest')
        self.conv1 = conv(in_channels, out_channels, kernel_size=1, stride=1, padding=0, bias=False)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.act1 = nn.SiLU(inplace=True)
        self.conv2 = conv(out_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.act2 = nn.SiLU(inplace=True)

    # 前向传播函数
    def forward(self, x, y):
        x = self.upsample(x)
        if y.shape[2] != x.shape[2] or y.shape[3] != x.shape[3]:
            y = F.interpolate(y, size=x.shape[2:], mode='nearest')
        x = torch.cat([x, y], dim=1)
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.act1(x)
        x = self.conv2(x)
        x = self.bn2(x)
        x = self.act2(x)
        return x
```

5.定义YOLOv5头部网络

```python
class YOLOv5Head(nn.Module):
    # 构造函数
    def __init__(self, in_channels, num_classes, anchors):
        super(YOLOv5Head, self).__init__()
        self.num_anchors = len(anchors)
        self.num_classes = num_classes
        self.conv1 = conv(in_channels, in_channels * 2, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(in_channels * 2)
        self.act1 = nn.SiLU(inplace=True)
        self.conv2 = conv(in_channels * 2, self.num_anchors * (self.num_classes + 5), kernel_size=1, stride=1, padding=0)

    # 前向传播函数
    def forward(self, x):
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.act1(x)
        x = self.conv2(x)
        return x
```

6.定义YOLOv5整体网络

```python
class YOLOv5(nn.Module):
    # 构造函数
    def __init__(self, num_classes, anchors):
        super(YOLOv5, self).__init__()
        self.backbone = CSPDarknet53([1, 2, 8, 8])
        self.spp = SPP(512, 512)
        self.panet1 = PANet(512, 256)
        self.panet2 = PANet(256, 128)
        self.heads = nn.ModuleList([
            YOLOv5Head(512, num_classes, anchors[0]),
            YOLOv5Head(256, num_classes, anchors[1]),
            YOLOv5Head(128, num_classes, anchors[2])
        ])
        self.out_channels = self.backbone.out_channels

    # 前向传播函数
    def forward(self, x):
        x = self.backbone(x)
        x = self.spp(x[-1])
        x = self.panet1(x, x[-2])
        x = self.panet2(x, x[-3])
        outputs = []
        for i, x in enumerate(x[::-1]):
            outputs.append(self.heads[i](x))
        return tuple(outputs)
```

以上是YOLOv5的代码详细注释。该代码实现了CSPDarknet53主干网络,SPP和PANet模块以及YOLOv5头部网络,最终实现了一个完整的YOLOv5目标检测网络。


