最好先手动配置一下pytorch,可参考:ubuntu18.04安装和卸载pytorch
下载yolov5
https://github.com/ultralytics/yolov5 选择yolov5-5.0,并输入以下命令:
pip install -r requirements.txt
或直接这样
git clone https://github.com/ultralytics/yolov5
cd yolov5
pip install -r requirements.txt
wget https://github.com/ultralytics/yolov5/releases/download/v5.0/yolov5m.pt
图片测试:
python detect.py --source 1.jpeg
准备数据集
在根目录下创建以下文件
VOCdevkit
--VOC2007
----Annotations #(放XML标签文件)
----ImageSets
------Main
----JPEGImages # (放原始图片)
在主目录下新建 voc_label.py
import xml.etree.ElementTree as ET
import pickle
import os
from os import listdir, getcwd
from os.path import join
import random
from shutil import copyfile
# Class names for this dataset — edit to match your own label set.
# Order matters: the index of each name becomes the YOLO class id.
classes=["person","road roller","paver","dumper","helmet"]
def clear_hidden_files(path):
    """Recursively delete macOS AppleDouble junk files ("._*") under *path*.

    Regular files whose names start with "._" are removed; anything that is
    not a regular file is descended into.
    """
    base = os.path.abspath(path)
    for entry in os.listdir(path):
        full_path = os.path.join(base, entry)
        if not os.path.isfile(full_path):
            clear_hidden_files(full_path)
        elif entry.startswith("._"):
            os.remove(full_path)
def convert(size, box):
    """Convert a VOC box to normalized YOLO format.

    size: (width, height) of the image in pixels.
    box:  (xmin, xmax, ymin, ymax) in pixels.
    Returns (x_center, y_center, width, height), each scaled to [0, 1].
    """
    inv_w = 1. / size[0]
    inv_h = 1. / size[1]
    xmin, xmax, ymin, ymax = box
    x_center = (xmin + xmax) / 2.0 * inv_w
    y_center = (ymin + ymax) / 2.0 * inv_h
    box_w = (xmax - xmin) * inv_w
    box_h = (ymax - ymin) * inv_h
    return (x_center, y_center, box_w, box_h)
def convert_annotation(image_id):
    """Convert one VOC XML annotation to a YOLO-format .txt label file.

    Reads  VOCdevkit/VOC2007/Annotations/<image_id>.xml and writes
    VOCdevkit/VOC2007/YOLOLabels/<image_id>.txt (one "cls x y w h" line per
    object). Objects whose class is not in the module-level `classes` list,
    or that are flagged difficult, are skipped.
    """
    in_path = 'VOCdevkit/VOC2007/Annotations/%s.xml' % image_id
    out_path = 'VOCdevkit/VOC2007/YOLOLabels/%s.txt' % image_id
    # `with` guarantees both handles are closed even if parsing raises.
    with open(in_path, encoding='utf-8') as in_file, \
            open(out_path, 'w', encoding='utf-8') as out_file:
        tree = ET.parse(in_file)
        root = tree.getroot()
        size = root.find('size')
        w = int(size.find('width').text)
        h = int(size.find('height').text)
        for obj in root.iter('object'):
            # Some annotation tools omit <difficult>; treat missing as 0.
            difficult_node = obj.find('difficult')
            difficult = int(difficult_node.text) if difficult_node is not None else 0
            cls = obj.find('name').text
            if cls not in classes or difficult == 1:
                continue
            cls_id = classes.index(cls)
            xmlbox = obj.find('bndbox')
            b = (float(xmlbox.find('xmin').text), float(xmlbox.find('xmax').text),
                 float(xmlbox.find('ymin').text), float(xmlbox.find('ymax').text))
            bb = convert((w, h), b)
            out_file.write(str(cls_id) + " " + " ".join(str(a) for a in bb) + '\n')
# ---------------------------------------------------------------------------
# Build the YOLOv5 directory layout under VOCdevkit/ and randomly split the
# VOC2007 images (with their converted labels) into train/ and val/ (~80/20).
# ---------------------------------------------------------------------------
wd = os.getcwd()
data_base_dir = os.path.join(wd, "VOCdevkit/")
work_space_dir = os.path.join(data_base_dir, "VOC2007/")
# makedirs creates VOCdevkit/ and VOC2007/ in one call if missing.
os.makedirs(work_space_dir, exist_ok=True)


def _prepare_dir(path):
    # Create *path* if needed, purge macOS "._*" junk files, and return it.
    if not os.path.isdir(path):
        os.mkdir(path)
    clear_hidden_files(path)
    return path


annotation_dir = _prepare_dir(os.path.join(work_space_dir, "Annotations/"))
image_dir = _prepare_dir(os.path.join(work_space_dir, "JPEGImages/"))
yolo_labels_dir = _prepare_dir(os.path.join(work_space_dir, "YOLOLabels/"))
yolov5_images_dir = _prepare_dir(os.path.join(data_base_dir, "images/"))
yolov5_labels_dir = _prepare_dir(os.path.join(data_base_dir, "labels/"))
yolov5_images_train_dir = _prepare_dir(os.path.join(yolov5_images_dir, "train/"))
yolov5_images_test_dir = _prepare_dir(os.path.join(yolov5_images_dir, "val/"))
yolov5_labels_train_dir = _prepare_dir(os.path.join(yolov5_labels_dir, "train/"))
yolov5_labels_test_dir = _prepare_dir(os.path.join(yolov5_labels_dir, "val/"))

# Manifests listing the absolute image paths of each split.
with open(os.path.join(wd, "yolov5_train.txt"), 'w') as train_file, \
        open(os.path.join(wd, "yolov5_val.txt"), 'w') as test_file:
    for image_name in sorted(os.listdir(image_dir)):  # image files
        image_path = os.path.join(image_dir, image_name)
        if not os.path.isfile(image_path):
            continue
        name_without_ext = os.path.splitext(image_name)[0]
        annotation_path = os.path.join(annotation_dir, name_without_ext + '.xml')
        if not os.path.exists(annotation_path):
            continue  # unlabeled image: keep it out of both splits
        label_name = name_without_ext + '.txt'
        label_path = os.path.join(yolo_labels_dir, label_name)
        # Per-image random draw: < 80 -> train (~80%), else val (~20%).
        probability = random.randint(1, 100)
        print("Probability: %d" % probability)
        convert_annotation(name_without_ext)  # write the YOLO label file
        if probability < 80:  # train dataset
            train_file.write(image_path + '\n')
            copyfile(image_path, os.path.join(yolov5_images_train_dir, image_name))
            copyfile(label_path, os.path.join(yolov5_labels_train_dir, label_name))
        else:  # validation dataset
            test_file.write(image_path + '\n')
            copyfile(image_path, os.path.join(yolov5_images_test_dir, image_name))
            copyfile(label_path, os.path.join(yolov5_labels_test_dir, label_name))
用python3命令: python3 voc_label.py
运行后会在 VOCdevkit 下生成 images 和 labels 以及在 VOC2007 下生成 YOLOLabels
修改配置文件
1.修改data/voc.yaml文件
把 train val nc(类别数) names 都修改成自己的:
注意:train和val要分别指向VOCdevkit/images下的train/ 和 val/
最后的"/"不能少
2.修改模型的配置文件
yolov5提供了四个模型,分别是s、m、L、X,根据显卡和数据的情况自行选择,我选择 s ,之后则修改models文件夹下的yolov5s.yaml,只需要修改第一行nc后面改为自己的类别就可以
训练模型
下载预权重
需要下载与5.0版对应的权重文件
在 https://github.com/ultralytics/yolov5/releases/tag/v5.0 中找到yolov5s.pt并下载
修改train.py
weights:权重文件路径,将default后面的路径换成下载好的权重路径
cfg:存储模型结构的配置文件,将default后面的文件换成自己使用的模型配置文件
data:存储训练、测试数据的文件,换成data/voc.yaml
epochs:指的就是训练过程中整个数据集将被迭代多少次,显卡不行你就调小点。
batch-size:根据显卡算力调节,如果显卡不行,那就调小
img-size:输入图片的尺寸,默认的是640,需要是32的倍数才可以。
修改后运行
python3 train.py
训练好的模型会被保存在runs/exp0/weights/last.pt和best.pt
详细训练数据保存在runs/exp0/results.txt文件中。
训练过程可视化
利用tensorboard可视化训练过程,训练开始会在yolov5目录生成一个runs文件夹,利用tensorboard打开即可查看训练日志,命令如下:
tensorboard --logdir=runs
进行测试
图片测试
修改detect.py文件
要改2个地方,weights和source,
weights:修改成自己训练好的模型的权重, runs\train\exp\weights\best.pt
source :修改成测试集图片所在的文件夹,VOCdevkit/images/val
运行detect.py文件,在runs\detect\exp
这个路径下,可看到整个测试集的检测结果
电脑摄像头测试
只需要将上述source的值修改成0即可
parser.add_argument('--source', type=str, default='0', help='source')
但运行后出现了报错
TypeError: argument of type 'int' is not iterable
解决方法
参考:目标检测---教你利用yolov5训练自己的目标检测模型
找到utils下的datasets.py文件,给其中的两个 url 参数分别加上 str() 转换即可
运行detect.py文件即可
视频测试
只需要将上述source的值修改成视频存放的路径即可
parser.add_argument('--source', type=str, default='video', help='source') # file/folder, 0 for webcam
运行detect.py文件即可
注意:视频只支持这些格式
videos: ['mov', 'avi', 'mp4', 'mpg', 'mpeg', 'm4v', 'wmv', 'mkv']
D435i深度相机测试
目的:使用yolov5并结合D435i深度相机,实现目标检测并获得距离信息
参考:
首先感谢下面三个开源大佬
realsense D455深度相机+YOLO V5结合实现目标检测(二)
Realsense D435i Yolov5目标检测实时获得目标三维位置信息
GitHub - killnice/yolov5-D435i: using yolov5 and realsense D435i
首先终端输入以下命令
pip install pyrealsense2
在主目录下新建 realsensedetect.py,并输入如下内容:
import argparse
import os
import shutil
import time
from pathlib import Path
import cv2
import torch
import torch.backends.cudnn as cudnn
from numpy import random
import numpy as np
import pyrealsense2 as rs
from utils.plots import plot_one_box
from models.experimental import attempt_load
from utils.general import (
check_img_size, non_max_suppression, apply_classifier, scale_coords,
xyxy2xywh, strip_optimizer, set_logging)
from utils.torch_utils import select_device, load_classifier, time_synchronized
from utils.datasets import letterbox
def detect(save_img=False):
    """Run YOLOv5 inference on a RealSense D435i colour stream.

    Each detected box is annotated with its class name and an estimated
    distance in metres, sampled from the aligned depth frame. All runtime
    options are read from the module-level ``opt`` namespace (argparse
    result). Loops forever; press 'q' in the preview window to stop
    (raises StopIteration).
    """
    out, source, weights, view_img, save_txt, imgsz = \
        opt.save_dir, opt.source, opt.weights, opt.view_img, opt.save_txt, opt.img_size
    # NOTE(review): `webcam` is computed but never used — frames come from
    # the RealSense pipeline below, not from `source`.
    webcam = source == '0' or source.startswith(('rtsp://', 'rtmp://', 'http://')) or source.endswith('.txt')
    # Initialize
    set_logging()
    device = select_device(opt.device)
    if os.path.exists(out):  # output dir
        shutil.rmtree(out)  # delete dir
    os.makedirs(out)  # make new dir
    half = device.type != 'cpu'  # half precision only supported on CUDA
    # Load model
    model = attempt_load(weights, map_location=device)  # load FP32 model
    imgsz = check_img_size(imgsz, s=model.stride.max())  # check img_size
    if half:
        model.half()  # to FP16
    # Set Dataloader
    vid_path, vid_writer = None, None
    view_img = True
    cudnn.benchmark = True  # set True to speed up constant image size inference
    #dataset = LoadStreams(source, img_size=imgsz)
    # Get names and colors
    names = model.module.names if hasattr(model, 'module') else model.names
    colors = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(names))]
    # Run inference
    t0 = time.time()
    img = torch.zeros((1, 3, imgsz, imgsz), device=device)  # init img
    _ = model(img.half() if half else img) if device.type != 'cpu' else None  # run once to warm up
    pipeline = rs.pipeline()
    # Create the stream-configuration object
    config = rs.config()
    # config.enable_stream(rs.stream.depth, 640, 480, rs.format.z16, 30)
    config.enable_stream(rs.stream.depth, 640, 480, rs.format.z16, 30)
    config.enable_stream(rs.stream.color, 640, 480, rs.format.bgr8, 30)
    # Start streaming
    pipeline.start(config)
    align_to_color = rs.align(rs.stream.color)
    while True:
        start = time.time()
        # Wait for a coherent (time-aligned) pair of frames: depth and color
        frames = pipeline.wait_for_frames()
        frames = align_to_color.process(frames)
        # depth_frame = frames.get_depth_frame()
        depth_frame = frames.get_depth_frame()
        color_frame = frames.get_color_frame()
        color_image = np.asanyarray(color_frame.get_data())
        depth_image = np.asanyarray(depth_frame.get_data())
        # NOTE(review): `mask` and `depth_image` are built but never used below.
        mask = np.zeros([color_image.shape[0], color_image.shape[1]], dtype=np.uint8)
        mask[0:480, 320:640] = 255
        # Emulate the LoadStreams single-source batch layout for one frame.
        sources = [source]
        imgs = [None]
        path = sources
        imgs[0] = color_image
        im0s = imgs.copy()
        img = [letterbox(x, new_shape=imgsz)[0] for x in im0s]
        img = np.stack(img, 0)
        img = img[:, :, :, ::-1].transpose(0, 3, 1, 2)  # BGR to RGB, to 3x416x416, uint8 to float32
        img = np.ascontiguousarray(img, dtype=np.float16 if half else np.float32)
        img /= 255.0  # 0 - 255 to 0.0 - 1.0
        # Get detections
        img = torch.from_numpy(img).to(device)
        if img.ndimension() == 3:
            img = img.unsqueeze(0)
        t1 = time_synchronized()
        pred = model(img, augment=opt.augment)[0]
        # Apply NMS
        pred = non_max_suppression(pred, opt.conf_thres, opt.iou_thres, classes=opt.classes, agnostic=opt.agnostic_nms)
        t2 = time_synchronized()
        for i, det in enumerate(pred):  # detections per image
            p, s, im0 = path[i], '%g: ' % i, im0s[i].copy()
            s += '%gx%g ' % img.shape[2:]  # print string
            gn = torch.tensor(im0.shape)[[1, 0, 1, 0]]  # normalization gain whwh
            if det is not None and len(det):
                # Rescale boxes from img_size to im0 size
                det[:, :4] = scale_coords(img.shape[2:], det[:, :4], im0.shape).round()
                # Print results
                for c in det[:, -1].unique():
                    n = (det[:, -1] == c).sum()  # detections per class
                    s += '%g %ss, ' % (n, names[int(c)])  # add to string
                # Write results
                for *xyxy, conf, cls in reversed(det):
                    xywh = (xyxy2xywh(torch.tensor(xyxy).view(1, 4)) / gn).view(-1).tolist()  # normalized xywh
                    line = (cls, conf, *xywh) if opt.save_conf else (cls, *xywh)  # label format
                    distance_list = []
                    # Center pixel of the box: (top-left + bottom-right) / 2,
                    # used to index into the depth frame.
                    mid_pos = [int((int(xyxy[0]) + int(xyxy[2])) / 2), int((int(xyxy[1]) + int(xyxy[3])) / 2)]
                    # Shorter box side bounds the random depth-sampling offsets.
                    min_val = min(abs(int(xyxy[2]) - int(xyxy[0])), abs(int(xyxy[3]) - int(xyxy[1])))
                    # print(box,)
                    randnum = 40
                    # NOTE(review): this loop variable shadows the outer `i`;
                    # harmless here because `i` is not read again afterwards.
                    for i in range(randnum):
                        bias = random.randint(-min_val // 4, min_val // 4)
                        dist = depth_frame.get_distance(int(mid_pos[0] + bias), int(mid_pos[1] + bias))
                        # print(int(mid_pos[1] + bias), int(mid_pos[0] + bias))
                        if dist:  # 0.0 means no depth reading at that pixel
                            distance_list.append(dist)
                    distance_list = np.array(distance_list)
                    # Keep the middle half of the sorted samples
                    # (median-style filtering to reject depth outliers).
                    distance_list = np.sort(distance_list)[
                        randnum // 2 - randnum // 4:randnum // 2 + randnum // 4]
                    label = '%s %.2f%s' % (names[int(cls)], np.mean(distance_list), 'm')
                    plot_one_box(xyxy, im0, label=label, color=colors[int(cls)], line_thickness=3)
            # Print time (inference + NMS)
            print('%sDone. (%.3fs)' % (s, t2 - t1))
            # Stream results
            if view_img:
                cv2.imshow(p, im0)
                if cv2.waitKey(1) == ord('q'):  # q to quit
                    raise StopIteration
    # NOTE(review): unreachable — the `while True` loop above only exits by
    # raising StopIteration; the RealSense pipeline is never stopped cleanly.
    print('Done. (%.3fs)' % (time.time() - t0))
if __name__ == '__main__':
    # Command-line options; defaults target a custom-trained checkpoint
    # under runs/train/exp/weights/.
    parser = argparse.ArgumentParser()
    parser.add_argument('--weights', nargs='+', type=str, default='runs/train/exp/weights/best.pt', help='model.pt path(s)')
    parser.add_argument('--source', type=str, default='inference/images', help='source')  # file/folder, 0 for webcam
    parser.add_argument('--img-size', type=int, default=640, help='inference size (pixels)')
    parser.add_argument('--conf-thres', type=float, default=0.25, help='object confidence threshold')
    parser.add_argument('--iou-thres', type=float, default=0.45, help='IOU threshold for NMS')
    parser.add_argument('--device', default='', help='cuda device, i.e. 0 or 0,1,2,3 or cpu')
    parser.add_argument('--view-img', action='store_true', help='display results')
    parser.add_argument('--save-txt', action='store_true', help='save results to *.txt')
    parser.add_argument('--save-conf', action='store_true', help='save confidences in --save-txt labels')
    parser.add_argument('--save-dir', type=str, default='inference/output', help='directory to save results')
    parser.add_argument('--classes', nargs='+', type=int, help='filter by class: --class 0, or --class 0 2 3')
    parser.add_argument('--agnostic-nms', action='store_true', help='class-agnostic NMS')
    parser.add_argument('--augment', action='store_true', help='augmented inference')
    parser.add_argument('--update', action='store_true', help='update all models')
    opt = parser.parse_args()
    print(opt)
    # Inference only: disable gradient tracking for everything inside.
    with torch.no_grad():
        detect()
使用自己的权重只需把上述的weights替换成上面自己训练好的 best.pt 即可
运行 realsensedetect.py 即可
也可以参考 上述第二篇文章 获取三维位置信息,需要先在 大佬共享的github 下载
下载好后直接运行即可
如果想用自己的权重文件,只需把 config 目录下的 yolov5s.yaml 文件修改成自己的即可
附
一、如果出现The size of tensor a (80) must match the size of tensor b (56) at non-singleton dimension 3 错误
原因:可能是你是5.0版本的yolo,却用了6.0(或其他)版本的同名权重,如yolov5s.pt
解决方法:
1.下载对应版本的权重
2.在model/common.py文件里加入如下内容:
import warnings
class SPPF(nn.Module):
    """Spatial Pyramid Pooling - Fast (SPPF) layer for YOLOv5 by Glenn Jocher."""

    def __init__(self, c1, c2, k=5):
        # Applying one k x k max-pool three times in series is equivalent
        # to SPP with kernels (5, 9, 13).
        super().__init__()
        hidden_ch = c1 // 2  # hidden channels
        self.cv1 = Conv(c1, hidden_ch, 1, 1)
        self.cv2 = Conv(hidden_ch * 4, c2, 1, 1)
        self.m = nn.MaxPool2d(kernel_size=k, stride=1, padding=k // 2)

    def forward(self, x):
        reduced = self.cv1(x)
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')  # suppress torch 1.9.0 max_pool2d() warning
            pooled_once = self.m(reduced)
            pooled_twice = self.m(pooled_once)
        # Concatenate the input with three successive pooling stages.
        return self.cv2(torch.cat([reduced, pooled_once, pooled_twice, self.m(pooled_twice)], 1))
二、对训练结果的解析
# epoch 训练多少轮,显存占了多少,四类损失函数的数值,bbox的数量,图像长边尺寸
Epoch gpu_mem box obj cls total labels img_size
23/299 8.73G 0.04529 0.05286 0.002293 0.1004 307 640: 0%|
# 验证集的结果也比较明确
# P 是 precision(精确率),R 是 Recall(召回率)
Class Images Labels P R mAP@.5 mAP@.5:.95
all 1031 8245 0.426 0.382 0.338 0.183
box :GIoU损失函数均值 越小方框越准;
objectness :目标检测loss均值 越小目标检测越准;
classification :分类loss均值 越小分类越准;
precision :精确率 ,表示你认为对的中,确实是对的比例 越大越好;
Recall :召回率,表示本来是对的,你找回了多少对的所占的比率 越大越好;
AP值 => Average Precision,即 平均精确度 。
如何衡量一个模型的性能,单纯用 precision 和 recall 都不科学。于是人们想到,把 PR曲线下的面积当做衡量尺度,于是就有了 AP值这一概念。average是对 precision 进行取平均 。
mAP值 =>Mean Average Precision,即 平均AP值 。 越大越好
mAP是用Precision和Recall作为两轴作图后围成的面积,m表示平均,@后面的数表示判定iou为正负样本的阈值,@0.5:0.95表示阈值取0.5:0.05:0.95后取均值。