opencv在4.4版本添加了对yolov4的支持,网上也有相应的教程,但是目前中文网上的教程大都基于C++实现,或者是用`cv2.dnn.readNetFromDarknet`这个比较low-level的接口。此篇博客将记录一种更简单的实现方式:采用`cv2.dnn_DetectionModel`接口(opencv4.1.2版本开始提供),实现对AlexeyAB版本和Ultralytics版本的YOLOv3和YOLOv4以及其他变体进行检测,同时支持自定义数据集情况下训练得到的模型。
1. Prerequisites
- yolov3.weights, yolov3.cfg
- yolov4.weights, yolov4.cfg
- https://github.com/AlexeyAB/darknet
- https://github.com/ultralytics/yolov3/
- labelImg(制作YOLO格式自定义数据集)
- coco.names
- 测试图片,笔者采用的图片来自百度百科(广州市交通治堵方案):
2. Codes
废话不多说,直接上代码:
# -*- coding: utf-8 -*-
"""
Created on Fri Aug 14 22:21:53 2020
@author: 周文青
opencv dnn模块加载yolo模型
issue: https://github.com/AlexeyAB/darknet/issues/6489
CPU E5-2650 V2 @2.6GHz测试环境
"""
import cv2
import matplotlib.pyplot as plt
import time
import numpy as np
# Class-label file plus the Darknet weight/config paths of every model that
# is benchmarked below. All Windows paths are raw strings so that a backslash
# is never parsed as the start of an escape sequence ("\M", "\d", "\c" and
# "\y" are invalid escapes and trigger a warning in non-raw literals).
coco_names = r"F:\opencv\sources\samples\dnn\coco.names"
model_yolov2 = r"F:\opencv\sources\samples\dnn\yolov2.weights"
cfg_yolov2 = r"F:\opencv\sources\samples\dnn\yolov2.cfg"
model_yolov2_tiny = r"E:\MachineLearning\darknet\yolov2-tiny.weights"
cfg_yolov2_tiny = r"E:\MachineLearning\darknet\cfg\yolov2-tiny.cfg"
model_yolov3 = r"F:\opencv\sources\samples\dnn\yolov3.weights"
cfg_yolov3 = r"F:\opencv\sources\samples\dnn\yolov3.cfg"
model_yolov3_tiny = r"E:\MachineLearning\darknet\yolov3-tiny.weights"
cfg_yolov3_tiny = r"E:\MachineLearning\darknet\cfg\yolov3-tiny.cfg"
model_yolov3_tiny_prn = r"E:\MachineLearning\darknet\yolov3-tiny-prn.weights"
cfg_yolov3_tiny_prn = r"E:\MachineLearning\darknet\cfg\yolov3-tiny-prn.cfg"
model_yolov3_spp = r"F:\opencv\sources\samples\dnn\yolov3-spp.weights"
cfg_yolov3_spp = r"F:\opencv\sources\samples\dnn\yolov3-spp.cfg"
csresnext_model = r"E:\MachineLearning\darknet\csresnext50-panet-spp-original-optimal_final.weights"
csresnext_cfg = r"E:\MachineLearning\darknet\cfg\csresnext50-panet-spp-original-optimal.cfg"
model_yolov4 = r"F:\opencv\sources\samples\dnn\yolov4.weights"
cfg_yolov4 = r"F:\opencv\sources\samples\dnn\yolov4.cfg"
model_yolov4_tiny = r"E:\MachineLearning\darknet\yolov4-tiny.weights"
cfg_yolov4_tiny = r"E:\MachineLearning\darknet\cfg\yolov4-tiny.cfg"
enet_model = r"E:\MachineLearning\darknet\enetb0-coco_final.weights"
enet_cfg = r"E:\MachineLearning\darknet\cfg\enet-coco.cfg"

# Inputs used by the demo functions.
img_file = r"C:\Users\admin\Pictures\car.jpg"
video_file = r'F:/opencv-4.4.0/samples/data/vtest.avi'

# Model/config pair selected for the run at the bottom of the script.
model = model_yolov2_tiny
cfg = cfg_yolov2_tiny
# Read the class-label file in one go; trailing newlines are stripped before
# splitting so the list holds one label per line of the file.
with open(coco_names, 'rt') as label_file:
    label_text = label_file.read()
names = label_text.rstrip('\n').split('\n')
def det_image_v1(model, cfg, img_file, c_threshold=0.5, nms=0.5):
    """Detect objects in a single image with the low-level ``cv2.dnn`` Net API.

    Loads a Darknet network, runs one forward pass on a 416x416 blob of the
    image, decodes the raw output rows by hand, applies non-maximum
    suppression, draws the surviving boxes and shows the image in a window
    until a key is pressed.

    Args:
        model: Path to the Darknet ``.weights`` file.
        cfg: Path to the matching Darknet ``.cfg`` file.
        img_file: Path of the image to run detection on.
        c_threshold: Minimum class score a detection needs to be kept; also
            passed to ``cv2.dnn.NMSBoxes`` as the score threshold.
        nms: Overlap threshold passed to ``cv2.dnn.NMSBoxes``.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot read ``img_file``.
    """
    classes = names
    # one random colour per class label
    COLORS = np.random.randint(0, 255, size=(len(classes), 3), dtype="uint8")
    print("[INFO] loading model...")
    net = cv2.dnn.readNetFromDarknet(cfg, model)

    # cv2.imread signals failure by returning None instead of raising
    image = cv2.imread(img_file)
    if image is None:
        raise FileNotFoundError("cannot read image: {}".format(img_file))
    (H, W) = image.shape[:2]

    # Ask the network for its output layer names directly; indexing the
    # result of getUnconnectedOutLayers() by hand depends on whether the
    # installed OpenCV returns a 1-D or a 2-D array.
    ln = net.getUnconnectedOutLayersNames()

    # scale pixels to [0, 1], resize to 416x416 and swap BGR -> RGB
    blob = cv2.dnn.blobFromImage(image, 1/255, (416, 416), swapRB=True, crop=False)
    net.setInput(blob)
    start = time.time()
    layersOutputs = net.forward(ln)

    boxes = []
    confidences = []
    classIDs = []
    for output in layersOutputs:
        for detection in output:
            # entries from index 5 onwards are the per-class scores
            scores = detection[5:]
            classID = np.argmax(scores)
            confidence = scores[classID]
            # honour the caller's threshold instead of a hard-coded 0.5
            if confidence > c_threshold:
                # first four entries are centre x, centre y, width, height,
                # scaled here by the image size
                box = detection[0:4] * np.array([W, H, W, H])
                (centerX, centerY, width, height) = box.astype("int")
                # convert centre coordinates to the top-left corner
                x = int(centerX - (width / 2))
                y = int(centerY - (height / 2))
                boxes.append([x, y, int(width), int(height)])
                confidences.append(float(confidence))
                classIDs.append(int(classID))

    # drop overlapping boxes using non-maximum suppression
    idxs = cv2.dnn.NMSBoxes(boxes, confidences, c_threshold, nms)
    if len(idxs) > 0:
        # np.array(...) makes flatten() work for every container type
        # NMSBoxes may return
        for i in np.array(idxs).flatten():
            (x, y) = (boxes[i][0], boxes[i][1])
            (w, h) = (boxes[i][2], boxes[i][3])
            color = [int(c) for c in COLORS[classIDs[i]]]
            cv2.rectangle(image, (x, y), (x + w, y + h), color, 2)
            text = "{}: {:.4f}".format(classes[classIDs[i]], confidences[i])
            cv2.putText(image, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX,
                        0.4, color, 1)
    end = time.time()
    # time covers the forward pass, decoding, NMS and drawing
    print('FPS:', 1/(end - start))

    cv2.imshow("Image", image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
# det_image_v1(model_yolov4,cfg_yolov4, img_file)
def det_image_v2(model, cfg, img_file, c_threshold=0.5, nms=0.5):
    """Detect objects in a single image with ``cv2.dnn_DetectionModel``.

    The detection model performs blob creation, output decoding and
    non-maximum suppression internally; this function only configures the
    input, draws the returned boxes with a label and shows the image in a
    window until a key is pressed.

    Args:
        model: Path to the Darknet ``.weights`` file.
        cfg: Path to the matching Darknet ``.cfg`` file.
        img_file: Path of the image to run detection on.
        c_threshold: Confidence threshold passed to ``detect``.
        nms: NMS threshold passed to ``detect``.

    Raises:
        FileNotFoundError: If ``cv2.imread`` cannot read ``img_file``.
    """
    # load the YOLO model
    net = cv2.dnn_DetectionModel(model, cfg)
    net.setInputSize(512, 512)  # network input size
    net.setInputScale(1.0/255)
    net.setInputSwapRB(True)

    # cv2.imread signals failure by returning None instead of raising
    frame = cv2.imread(img_file)
    if frame is None:
        raise FileNotFoundError("cannot read image: {}".format(img_file))

    classes, confs, boxes = net.detect(frame, c_threshold, nms)
    # With no detections the results can come back as empty tuples, which
    # have no flatten(); np.asarray makes both cases behave the same.
    class_ids = np.asarray(classes).flatten()
    scores = np.asarray(confs).flatten()
    for class_id, conf, box in zip(class_ids, scores, boxes):
        label = '{}, {:.2f}'.format(names[int(class_id)], conf)
        labelsize, baseLine = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
        left, top = int(box[0]), int(box[1])
        # keep the label inside the image when the box touches the top edge
        top = max(top, labelsize[1])
        cv2.rectangle(frame, box, color=(0, 255, 0), thickness=3)
        # white background behind the label text
        cv2.rectangle(frame, (left, top - labelsize[1]),
                      (left + labelsize[0], top + baseLine), (255, 255, 255), cv2.FILLED)
        cv2.putText(frame, label, (left, top), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))

    # NOTE: matplotlib can be used instead of a cv2 window:
    # plt.imshow(frame[:, :, ::-1]); plt.show()
    cv2.imshow('frame', frame)
    cv2.waitKey(0)
    cv2.destroyAllWindows()
# det_image_v2(model, cfg, img_file)
def det_video_v1(model, cfg, video_file, c_threshold=0.5, nms=0.5):
    """Run per-frame detection on a video with the low-level ``cv2.dnn`` Net API.

    Every frame is turned into a 416x416 blob, passed through the network,
    decoded by hand and filtered with non-maximum suppression; the annotated
    frame is shown in a window. The mean FPS of each batch of 10 frames is
    printed. Playback stops at the end of the video or when ``q`` or ESC is
    pressed.

    Args:
        model: Path to the Darknet ``.weights`` file.
        cfg: Path to the matching Darknet ``.cfg`` file.
        video_file: Video source handed to ``cv2.VideoCapture``.
        c_threshold: Minimum class score a detection needs to be kept; also
            passed to ``cv2.dnn.NMSBoxes`` as the score threshold.
        nms: Overlap threshold passed to ``cv2.dnn.NMSBoxes``.
    """
    classes = names
    # one random colour per class label
    COLORS = np.random.randint(0, 255, size=(len(classes), 3), dtype="uint8")
    print("[INFO] loading model...")
    net = cv2.dnn.readNetFromDarknet(cfg, model)
    # The output layer names do not change between frames, so look them up
    # once. Using the names API avoids indexing getUnconnectedOutLayers(),
    # whose array shape differs between OpenCV versions.
    ln = net.getUnconnectedOutLayersNames()

    window_name = 'frame'
    cv2.namedWindow(window_name)
    cap = cv2.VideoCapture(video_file)  # video source
    fps_list = []
    try:
        while cap.isOpened():
            ok, image = cap.read()  # grab one frame
            if not ok:
                break
            (H, W) = image.shape[:2]
            # scale pixels to [0, 1], resize to 416x416 and swap BGR -> RGB
            blob = cv2.dnn.blobFromImage(image, 1/255, (416, 416), swapRB=True, crop=False)
            net.setInput(blob)
            start = time.time()
            layersOutputs = net.forward(ln)

            boxes = []
            confidences = []
            classIDs = []
            for output in layersOutputs:
                for detection in output:
                    # entries from index 5 onwards are the per-class scores
                    scores = detection[5:]
                    classID = np.argmax(scores)
                    confidence = scores[classID]
                    # honour the caller's threshold instead of a hard-coded 0.5
                    if confidence > c_threshold:
                        # first four entries are centre x, centre y, width,
                        # height, scaled here by the frame size
                        box = detection[0:4] * np.array([W, H, W, H])
                        (centerX, centerY, width, height) = box.astype("int")
                        # convert centre coordinates to the top-left corner
                        x = int(centerX - (width / 2))
                        y = int(centerY - (height / 2))
                        boxes.append([x, y, int(width), int(height)])
                        confidences.append(float(confidence))
                        classIDs.append(int(classID))

            # drop overlapping boxes using non-maximum suppression
            idxs = cv2.dnn.NMSBoxes(boxes, confidences, c_threshold, nms)
            if len(idxs) > 0:
                for i in np.array(idxs).flatten():
                    (x, y) = (boxes[i][0], boxes[i][1])
                    (w, h) = (boxes[i][2], boxes[i][3])
                    color = [int(c) for c in COLORS[classIDs[i]]]
                    cv2.rectangle(image, (x, y), (x + w, y + h), color, 2)
                    text = "{}: {:.4f}".format(classes[classIDs[i]], confidences[i])
                    cv2.putText(image, text, (x, y - 5), cv2.FONT_HERSHEY_SIMPLEX,
                                0.4, color, 1)
            end = time.time()

            # report the mean FPS over each batch of 10 frames
            fps_list.append(1/(end - start))
            if len(fps_list) == 10:
                print('FPS:', np.mean(fps_list))
                fps_list = []

            cv2.imshow(window_name, image)
            c = cv2.waitKey(1)
            if c & 0xFF == ord('q') or c == 27:  # press ESC or q to quit
                break
    finally:
        # release the capture and close all windows even if a frame fails
        cap.release()
        cv2.destroyAllWindows()
# Script entry: run the low-level video demo with the model/config selected above.
det_video_v1(model=model, cfg=cfg, video_file=video_file)
def det_video_v2(model, cfg, video_file,c_threshold=0.5, nms=0.5):
    """Run per-frame detection on a video with ``cv2.dnn_DetectionModel``.

    Each frame is passed to ``detect``; the returned boxes are drawn with a
    label and the frame is shown in a window. The mean FPS of each batch of
    10 frames is printed. Playback stops at the end of the video or when
    ``q`` or ESC is pressed.

    Args:
        model: Path to the Darknet ``.weights`` file.
        cfg: Path to the matching Darknet ``.cfg`` file.
        video_file: Video source handed to ``cv2.VideoCapture``.
        c_threshold: Confidence threshold passed to ``detect``.
        nms: NMS threshold passed to ``detect``.
    """
    net = cv2.dnn_DetectionModel(model, cfg)
    net.setInputSize(512, 512)  # network input size
    net.setInputScale(1.0/255)
    net.setInputSwapRB(True)

    window_name = 'frame'
    cv2.namedWindow(window_name)
    cap = cv2.VideoCapture(video_file)  # video source
    fps_list = []
    try:
        while cap.isOpened():
            ok, frame = cap.read()  # grab one frame
            if not ok:
                break
            start = time.time()
            classes, confs, boxes = net.detect(frame, c_threshold, nms)
            # A frame without detections can yield empty tuples, which have
            # no flatten(); np.asarray makes both cases behave the same.
            class_ids = np.asarray(classes).flatten()
            scores = np.asarray(confs).flatten()
            for class_id, conf, box in zip(class_ids, scores, boxes):
                label = '{}, {:.2f}'.format(names[int(class_id)], conf)
                labelsize, baseLine = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
                left, top = int(box[0]), int(box[1])
                # keep the label inside the frame when the box touches the top
                top = max(top, labelsize[1])
                cv2.rectangle(frame, box, color=(0, 255, 0), thickness=3)
                # white background behind the label text
                cv2.rectangle(frame, (left, top - labelsize[1]),
                              (left + labelsize[0], top + baseLine), (255, 255, 255), cv2.FILLED)
                cv2.putText(frame, label, (left, top), cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 0))

            # report the mean FPS over each batch of 10 frames
            fps_list.append(1/(time.time() - start))
            if len(fps_list) == 10:
                print('FPS:', np.mean(fps_list))
                fps_list = []

            # show the annotated frame
            cv2.imshow(window_name, frame)
            c = cv2.waitKey(1)
            if c & 0xFF == ord('q') or c == 27:  # press ESC or q to quit
                break
    finally:
        # release the capture and close all windows even if a frame fails
        cap.release()
        cv2.destroyAllWindows()
# det_video_v2(model, cfg, video_file)
笔者的CPU硬件为:Intel® Xeon® CPU E5-2650 v2 @ 2.60GHz 2.60 GHz(使用GPU的话,可以在加载模型后调用`net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)`和`net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)`),在opencv的框架下检测速度大概在0.82s/张图片,下面看一下yolov3和yolov4的结果:
从结果来看,对于笔者使用的测试图片来说,似乎yolov3的检测效果要比yolov4好。yolov4漏检了好多明显的物体。
20200828 Update
测试了opencv的dnn模块加载 https://github.com/AlexeyAB/darknet 中常用模型对视频进行检测的效果,测试视频为行人检测视频 F:\opencv-4.4.0\samples\data\vtest.avi。
| 模型 | 模型大小 | FPS |
|---|---|---|
| YOLOv2 | 194 MB | - |
| YOLOv2-tiny | 43 MB | 15.3 |
| YOLOv3 | 236 MB | 1.3 |
| YOLOv3-tiny | 33.7 MB | 13.3 |
| YOLOv3-tiny-prn | 18.8 MB | 17.1 |
| YOLOv3-SPP | 240 MB | - |
| csresnext50-panet-spp-original-optimal_final | 217 MB | 1.8 |
| YOLOv4 | 245 MB | 1.2 |
| YOLOv4-tiny | 23.1 MB | 9.8 |
| enet-coco | 18.3 MB | 4.9 |
YOLOv2和YOLOv3-SPP无法使用cv2.dnn.readNetFromDarknet,已经在opencv中提交了issue,等待解决。
3. Custom Model
该部分主要介绍加载自定义数据集情况下训练得到的yolov3、yolov3-spp、yolov4等模型。熟悉yolov3的小伙伴应该知道目前最常使用的库是AlexeyAB版本的darknet(注意,原版darknet已经不再维护,作者也已经宣布不再进行计算机视觉方面的研究)和ultralytics版本的yolov3(v4)。个人感觉pytorch版本的实现更加友好,在可视化和训练输入方面比darknet方便,而且darknet中的cfg文件基本可以直接在pytorch版本使用,不过这些差异并不影响最终opencv加载darknet模型:如果采用pytorch版本,作者提供了`.pt`文件和`.weights`文件之间的转换接口;如果采用darknet版本,训练得到的就是weights文件,可以被opencv直接加载。下面以pytorch版本的yolov3(v4)训练自定义数据集并且通过opencv dnn模块加载进行简单介绍。
数据集制作、模型配置文件修改、数据配置文件、训练等详细内容可以查看官方教程https://github.com/ultralytics/yolov3/wiki/Train-Custom-Data,数据集制作这一块内容非常重要,因为很多时候我们需要使用不同的算法库来对我们的数据集进行测试,不同的算法库所支持的数据集格式也都不一样,对于目标检测问题来说,主要有以下几种数据集格式:
- PASCAL VOC 格式
- YOLO格式
适用项目:
- https://github.com/eriklindernoren/PyTorch-YOLOv3
- https://github.com/ultralytics/yolov3/
- https://github.com/AlexeyAB
- CSV格式
适用项目:
- https://github.com/fizyr/keras-retinanet
- https://github.com/yhenon/pytorch-retinanet
- COCO格式
适用项目:
- https://github.com/zylo117/Yet-Another-EfficientDet-Pytorch
- mmdetection2
- TXT格式
适用项目:
- https://github.com/Tianxiaomo/pytorch-YOLOv4
- https://github.com/YunYang1994/tensorflow-yolov3
- https://github.com/YunYang1994/TensorFlow2.0-Examples/tree/master/4-Object_Detection/YOLOV3
数据集格式:
train.txt
xxx/xxx.jpg 18.19,6.32,424.13,421.83,20 323.86,2.65,640.0,421.94,20
xxx/xxx.jpg 48,240,195,371,11 8,12,352,498,14
image_path x_min, y_min, x_max, y_max, class_id x_min, y_min ,…, class_id
make sure that x_max < width and y_max < height
笔者建议以PASCAL VOC或者YOLO格式为基础,再使用转换脚本转换至其他数据集格式,相应的转换脚本在个人仓库:https://github.com/ouening/OD_dataset_conversion_scripts
训练得到pt文件后,在yolov3路径下打开终端,执行下列命令将pt文件转换至darknet的weights文件:
$ python3 -c "from models import *; convert('cfg/yolov3-spp-terahertz-8cls.cfg', 'weights/best_terahertz-yolov3-spp.pt')"
注意`convert`的第一个参数是模型配置文件,第二个参数是相应的训练得到的pt模型文件,命令执行后会得到.weights文件。有了cfg文件和weights文件之后,再根据前面的程序就可以用opencv dnn模块加载在自定义数据集上训练得到的模型了。最后要注意的是类别标签文件,该文件确定了不同物体类别所对应的数值标签,要和训练时一致,这也是在数据集制作过程中需要注意的。如果使用labelImg制作YOLO格式数据集,最后会生成一个classes.txt文件,示例内容如下:
screw_drive
blade
knife
scissors
board_marker
mobile_phone
wireless_mouse
water_bottle
每一行代表一个物体的字符串类别名称。
最后给出C++版本下的实现,由于笔者还不熟悉C++,只列出opencv加载模型部分,可视化显示部分还望各位补充。
// Fragment only: String, Mat and Rect are used unqualified, so presumably
// `using namespace cv;` is in effect and the statements live inside a
// function — confirm against the full program.

// Test image and the YOLOv3 Darknet weights/config files.
String img_file = "C:/Users/admin/Pictures/car.jpg";
const String model_yolov3 = "F:/opencv/sources/samples/dnn/yolov3.weights";
const String cfg_yolov3 = "F:/opencv/sources/samples/dnn/yolov3.cfg";
// Build the high-level detection model from the weights and config files.
cv::dnn::DetectionModel model = cv::dnn::DetectionModel(model_yolov3, cfg_yolov3);
// Same preprocessing as the Python version: swap R/B, 512x512 input, scale by 1/255.
model.setInputSwapRB(true);
model.setInputSize(512,512);
model.setInputScale(1.0/255.0);
Mat frame = cv::imread(img_file);
// Output containers filled by detect(): class ids, confidences and boxes.
std::vector<int> classIds;
std::vector<float> confidences;
std::vector<Rect> boxes;
// Confidence threshold 0.5, NMS threshold 0.5 (same values as the Python defaults).
model.detect(frame, classIds, confidences, boxes, 0.5, 0.5);
参考链接:https://github.com/AlexeyAB/darknet/issues/6489