本系列文章记录本人硕士阶段YOLO系列目标检测算法自学及其代码实现的过程。其中算法具体实现借鉴于ultralytics YOLO源码Github,删减了源码中部分内容,满足个人科研需求。
本系列文章主要以YOLOv5为例完成算法的实现,后续修改、增加相关模块即可实现其他版本的YOLO算法。
文章地址:
YOLOv5算法实现(一):算法框架概述
YOLOv5算法实现(二):模型搭建
YOLOv5算法实现(三):数据集加载
YOLOv5算法实现(四):正样本匹配与损失计算
YOLOv5算法实现(五):预测结果后处理
YOLOv5算法实现(六):评价指标及实现
YOLOv5算法实现(七):模型训练
YOLOv5算法实现(八):模型验证
YOLOv5算法实现(九):模型预测
0 引言
本篇文章实现模型训练/验证中数据集迭代器的构造,实现数据增强、按批读取等功能,相关内容主要涉及两个文件:
*.data:存储训练集/验证集图片路径;
dataset.py:实现数据读取、数据增强,其运行逻辑如图1所示。
其中矩形训练表示将图像长边缩放为指定img_size,并在原图像长宽比例上,完成短边的缩放,同属于一个batch的图像应有相同的训练shape。
- 输入:从*.data文件中读取训练集/验证集图片和标签的路径,其中YOLO标签形式为 $[class, x, y, w, h]$,$x, y$ 为目标中心点的相对坐标,$w, h$ 为包围目标所需最小矩形框的相对宽度和高度。
- 图像加载:包含三个函数。
- load_img:将图像长边缩放为指定img_size,并在原图像长宽比例上,完成短边的缩放;
- load_padding_img:在矩形训练中,将图像缩放至指定shape;在非矩形训练中,与load_img相同。在缩放过程中,像素为整数值,因此与给定大小还存在差异,差异部分利用指定像素进行填充。
- load_argument_img:在load_img基础上,在训练中对图像进行数据增强操作,若开启了矩形训练,则只进行仿射变换和HSV增强;若未开启矩形训练,则进行Mosaic数据增强、仿射变换和HSV增强。
- 数据增强:Mosaic增强,仿射变换,HSV增强。
- 输出:读取单张图片时,输出 $[class, x, y, w, h]$;读取batch图片时,输出 $[image\_index, class, x, y, w, h]$,其中 $image\_index$ 用于标识标签属于该batch中哪一张图片。
1 数据集加载(datasets.py)
数据增强:Mosaic增强
def argument_mosai(imgs, labels, s):
    """Stitch up to four images into one mosaic and remap their labels.

    The tiles are pasted around a random centre on a ``2s x 2s`` canvas
    filled with the value 114; parts of a tile that fall outside the canvas
    are dropped. The canvas is then resized to ``s x s`` and the box
    coordinates are halved to match.

    :param imgs: sequence of 1 to 4 HWC images (tile order: top-left,
        top-right, bottom-left, bottom-right)
    :param labels: sequence of per-image label arrays, each row
        ``[class, x_center, y_center, w, h]`` with coordinates relative to
        the corresponding image
    :param s: side length of the output image in pixels
    :return: ``(img4, labels4)`` where ``img4`` is the ``s x s`` mosaic and
        ``labels4`` holds rows ``[class, xmin, ymin, xmax, ymax]`` in pixels
        of the output image
    :raises ValueError: if ``imgs`` is empty or holds more than four images
    """
    if not 1 <= len(imgs) <= 4:
        raise ValueError("argument_mosai expects 1 to 4 images, got %d" % len(imgs))

    canvas = s * 2
    # Mosaic centre (the corner shared by the four tiles).
    xc, yc = [int(random.uniform(s * 0.5, s * 1.5)) for _ in range(2)]
    # Base image that receives the tiles.
    img4 = np.full((canvas, canvas, imgs[0].shape[2]), 114, dtype=np.uint8)

    labels4 = []
    for i, img in enumerate(imgs):
        h, w = img.shape[:2]

        # "a" coordinates address the canvas, "b" coordinates address the tile.
        if i == 0:  # top left: tile's bottom-right corner sits on (xc, yc)
            x1a, y1a, x2a, y2a = max(xc - w, 0), max(yc - h, 0), xc, yc
            x1b, y1b, x2b, y2b = w - (x2a - x1a), h - (y2a - y1a), w, h
        elif i == 1:  # top right: tile's bottom-left corner sits on (xc, yc)
            x1a, y1a, x2a, y2a = xc, max(yc - h, 0), min(xc + w, canvas), yc
            x1b, y1b, x2b, y2b = 0, h - (y2a - y1a), min(w, x2a - x1a), h
        elif i == 2:  # bottom left: tile's top-right corner sits on (xc, yc)
            x1a, y1a, x2a, y2a = max(xc - w, 0), yc, xc, min(canvas, yc + h)
            x1b, y1b, x2b, y2b = w - (x2a - x1a), 0, w, min(y2a - y1a, h)
        else:  # bottom right: tile's top-left corner sits on (xc, yc)
            x1a, y1a, x2a, y2a = xc, yc, min(xc + w, canvas), min(canvas, yc + h)
            x1b, y1b, x2b, y2b = 0, 0, min(w, x2a - x1a), min(y2a - y1a, h)

        # Copy the visible part of the tile onto the canvas.
        img4[y1a:y2a, x1a:x2a] = img[y1b:y2b, x1b:x2b]

        # Offset of the tile origin on the canvas (negative when cropped).
        padw = x1a - x1b
        padh = y1a - y1b

        # Work on a copy under a separate name: the input list must stay
        # indexable for the next tile and the caller's arrays untouched.
        src = labels[i]
        boxes = src.copy()
        if src.size > 0:
            # Relative xywh -> absolute xyxy on the canvas.
            boxes[:, 1] = w * (src[:, 1] - src[:, 3] / 2) + padw  # xmin
            boxes[:, 2] = h * (src[:, 2] - src[:, 4] / 2) + padh  # ymin
            boxes[:, 3] = w * (src[:, 1] + src[:, 3] / 2) + padw  # xmax
            boxes[:, 4] = h * (src[:, 2] + src[:, 4] / 2) + padh  # ymax
        labels4.append(boxes)

    labels4 = np.concatenate(labels4, 0)
    # Keep boxes inside the canvas.
    np.clip(labels4[:, 1:], 0, canvas, out=labels4[:, 1:])

    # Shrink canvas and boxes back to the requested size. The interpolation
    # flag must be passed by keyword; the third positional slot is ``dst``.
    img4 = cv2.resize(img4, (s, s), interpolation=cv2.INTER_AREA)
    labels4[:, 1:] /= 2
    return img4, labels4
数据增强:仿射变换
def random_affine(img, targets=(), degrees=10, translate=.1, scale=.1, shear=10, border=0):
    """Apply a random rotation, scale, translation and shear to an image.

    :param img: HWC image to warp
    :param targets: array with rows ``[cls, xmin, ymin, xmax, ymax]`` in
        pixels; may be empty
    :param degrees: rotation is drawn from ``[-degrees, degrees]``
    :param translate: translation is drawn from ``[-translate, translate]``
        as a fraction of the image width (x) and height (y)
    :param scale: scale factor is drawn from ``[1 - scale, 1 + scale]``
    :param shear: shear angles are drawn from ``[-shear, shear]`` degrees
    :param border: pixels added to (or, if negative, removed from) each side
        of the output
    :return: ``(img, targets)``; boxes are re-fitted as axis-aligned
        rectangles, clipped to the output, and filtered (see below)
    """
    src_h, src_w = img.shape[:2]
    height = src_h + border * 2
    width = src_w + border * 2

    # Rotation and scale about the image centre.
    R = np.eye(3)
    a = random.uniform(-degrees, degrees)
    s = random.uniform(1 - scale, 1 + scale)
    R[:2] = cv2.getRotationMatrix2D(angle=a, center=(src_w / 2, src_h / 2), scale=s)

    # Translation: x is a fraction of the width, y a fraction of the height.
    T = np.eye(3)
    T[0, 2] = random.uniform(-translate, translate) * src_w + border  # x translation (pixels)
    T[1, 2] = random.uniform(-translate, translate) * src_h + border  # y translation (pixels)

    # Shear.
    S = np.eye(3)
    S[0, 1] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # x shear (deg)
    S[1, 0] = math.tan(random.uniform(-shear, shear) * math.pi / 180)  # y shear (deg)

    # Combined matrix; the multiplication order matters.
    M = S @ T @ R
    if (border != 0) or (M != np.eye(3)).any():  # image changed
        img = cv2.warpAffine(img, M[:2], dsize=(width, height), flags=cv2.INTER_LINEAR,
                             borderValue=(114, 114, 114))

    n = len(targets)
    if n:
        # Warp the four corners of every box: x1y1, x2y2, x1y2, x2y1.
        xy = np.ones((n * 4, 3))
        xy[:, :2] = targets[:, [1, 2, 3, 4, 1, 4, 3, 2]].reshape(n * 4, 2)
        xy = (xy @ M.T)[:, :2].reshape(n, 8)  # [4*n, 3] -> [n, 8]

        # A warped box is no longer axis-aligned; take its bounding rectangle.
        x = xy[:, [0, 2, 4, 6]]  # [n, 4]
        y = xy[:, [1, 3, 5, 7]]  # [n, 4]
        xy = np.concatenate((x.min(1), y.min(1), x.max(1), y.max(1))).reshape(4, n).T  # [n, 4]

        # Clip to the output image.
        xy[:, [0, 2]] = xy[:, [0, 2]].clip(0, width)
        xy[:, [1, 3]] = xy[:, [1, 3]].clip(0, height)

        w = xy[:, 2] - xy[:, 0]
        h = xy[:, 3] - xy[:, 1]
        area = w * h  # area after the warp
        area0 = (targets[:, 3] - targets[:, 1]) * (targets[:, 4] - targets[:, 2])  # area before
        ar = np.maximum(w / (h + 1e-16), h / (w + 1e-16))  # aspect ratio

        # Keep boxes wider and taller than 4 px, retaining more than 20% of
        # their (scaled) original area, with aspect ratio below 10.
        i = (w > 4) & (h > 4) & (area / (area0 * s + 1e-16) > 0.2) & (ar < 10)

        targets = targets[i]
        targets[:, 1:5] = xy[i]

    return img, targets
数据增强:HSV增强
def augment_hsv(img, h_gain=0.5, s_gain=0.5, v_gain=0.5):
    """Randomly jitter hue, saturation and value of a BGR image in place.

    Each channel is multiplied by a gain drawn uniformly from
    ``[1 - gain, 1 + gain]`` through a 256-entry lookup table. Hue wraps
    modulo 180; saturation and value are clipped to ``[0, 255]``.

    :param img: BGR image; overwritten with the result
    :param h_gain: maximum relative change of hue
    :param s_gain: maximum relative change of saturation
    :param v_gain: maximum relative change of value
    :return: None; the result is written into ``img``
    """
    gains = np.random.uniform(-1, 1, 3) * [h_gain, s_gain, v_gain] + 1  # random gains
    out_dtype = img.dtype
    hue, sat, val = cv2.split(cv2.cvtColor(img, cv2.COLOR_BGR2HSV))

    # One lookup table per channel, indexed by the original channel value.
    levels = np.arange(0, 256, dtype=np.int16)
    hue_table = ((levels * gains[0]) % 180).astype(out_dtype)
    sat_table = np.clip(levels * gains[1], 0, 255).astype(out_dtype)
    val_table = np.clip(levels * gains[2], 0, 255).astype(out_dtype)

    remapped = (cv2.LUT(hue, hue_table), cv2.LUT(sat, sat_table), cv2.LUT(val, val_table))
    img_hsv = cv2.merge(remapped).astype(out_dtype)
    # Writing through dst updates the caller's array, so nothing is returned.
    cv2.cvtColor(img_hsv, cv2.COLOR_HSV2BGR, dst=img)
数据加载类
class LoadImagesAndLabels(Dataset):  # for training/testing
    """Dataset that yields images and YOLO-format labels for training/validation.

    Image paths are read from a text list file; the label file of each image
    is found by replacing ``images`` with ``labels`` in its path and using a
    ``.txt`` extension. Labels are cached in memory as rows
    ``[class, x, y, w, h]`` with relative coordinates. Items returned by
    ``__getitem__`` carry labels as ``[0, class, x, y, w, h]``; ``collate_fn``
    fills the first column with the image's position in the batch.
    """

    def __init__(self,
                 path,
                 img_size=416,
                 batch_size=16,
                 augment=False,
                 hyp=None,
                 rect=False,
                 cache_images=False,
                 pad=0.0):
        """Read the image list, image shapes and labels.

        :param path: text file with one image path per line
        :param img_size: target size of the long image side
        :param batch_size: used to group images into batches (rect mode
            gives every batch one common shape)
        :param augment: enable data augmentation
        :param hyp: dict of augmentation hyper-parameters; the keys read by
            this class are ``degrees``, ``translate``, ``scale``, ``shear``,
            ``hsv_h``, ``hsv_s`` and ``hsv_v``
        :param rect: rectangular training (long side scaled to ``img_size``,
            short side keeps the aspect ratio); disables mosaic
        :param cache_images: keep all resized images in memory
        :param pad: extra padding, in units of 32 pixels, added to rect
            batch shapes
        :raises FileNotFoundError: if the list file cannot be read
        """
        try:
            path = str(Path(path))
            if os.path.isfile(path):
                with open(path, "r") as f:
                    lines = f.read().splitlines()
            else:
                raise Exception("%s does not exist" % path)

            # Keep only paths with a supported image extension.
            img_formats = ['.bmp', '.jpg', '.jpeg', '.png', '.tif', '.dng']
            self.img_files = [x for x in lines if os.path.splitext(x)[-1].lower() in img_formats]
            # Sort so the order (and the shape cache) is the same on every system.
            self.img_files.sort()
        except Exception as e:
            raise FileNotFoundError("Error loading data from {}. {}".format(path, e)) from e

        n = len(self.img_files)
        assert n > 0, "No images found in {}".format(path)

        # Batch index of every image. ``np.int`` no longer exists in NumPy,
        # so the builtin ``int`` is used.
        bi = np.floor(np.arange(n) / batch_size).astype(int)
        nb = bi[-1] + 1  # number of batches

        self.n = n
        self.batch = bi
        self.img_size = img_size
        self.augment = augment
        self.hyp = hyp
        self.rect = rect
        # Mosaic is only used when augmenting and not in rect mode.
        self.mosaic = self.augment and not self.rect

        # (./my_yolo_dataset/train/images/0.jpg) -> (./my_yolo_dataset/train/labels/0.txt)
        # Only the trailing extension is swapped.
        self.label_files = [os.path.splitext(x.replace("images", "labels"))[0] + ".txt"
                            for x in self.img_files]

        # Image sizes (width, height) are cached next to the list file.
        # Swapping only the extension guarantees the cache path differs from
        # the list file itself, whatever extension the list file has.
        sp = os.path.splitext(path)[0] + ".shapes"
        try:
            with open(sp, "r") as f:
                s = np.array([x.split() for x in f.read().splitlines()], dtype=np.float64)
            # A cache with a different number of rows belongs to another dataset.
            if s.shape != (n, 2):
                raise ValueError("shapefile out of sync")
        except (OSError, ValueError):
            s = []
            for img_path in tqdm(self.img_files, desc="Reading image shapes"):
                with Image.open(img_path) as im:  # close the file handle promptly
                    s.append(im.size)
            np.savetxt(sp, s, fmt="%g")  # overwrite existing (if any)

        self.shapes = np.array(s, dtype=np.float64)  # original (width, height) per image

        # Rectangular Training https://github.com/ultralytics/yolov3/issues/232
        if self.rect:
            s = self.shapes  # wh
            ar = s[:, 1] / s[:, 0]  # height / width
            # Sort by aspect ratio so images in one batch have similar shapes.
            irect = ar.argsort()
            self.img_files = [self.img_files[i] for i in irect]
            self.label_files = [self.label_files[i] for i in irect]
            self.shapes = s[irect]  # wh
            ar = ar[irect]

            # One common [h, w] shape (as a fraction of img_size) per batch.
            shapes = [[1, 1]] * nb
            for i in range(nb):
                ari = ar[bi == i]
                mini, maxi = ari.min(), ari.max()
                if maxi < 1:  # every image is wider than tall: width = img_size
                    shapes[i] = [maxi, 1]
                elif mini > 1:  # every image is taller than wide: height = img_size
                    shapes[i] = [1, 1 / mini]
            # Round up to a multiple of 32.
            self.batch_shapes = np.ceil(np.array(shapes) * img_size / 32. + pad).astype(int) * 32

        self.imgs = [None] * n  # image cache
        # label: [class, x, y, w, h], xywh relative; one array per image
        self.labels = [np.zeros((0, 5), dtype=np.float32) for _ in range(n)]
        nm, nf, ne, nd = 0, 0, 0, 0  # number missing, found, empty, duplicate

        pbar = tqdm(self.label_files)
        for i, file in enumerate(pbar):
            try:
                with open(file, "r") as f:
                    lab = np.array([x.split() for x in f.read().splitlines()], dtype=np.float32)
            except Exception as e:
                print("An error occurred while loading the file {}: {}".format(file, e))
                nm += 1  # file missing
                continue

            if lab.shape[0]:
                assert lab.shape[1] == 5, "expected 5 label columns: %s" % file
                assert (lab >= 0).all(), "negative labels: %s" % file
                assert (lab[:, 1:] <= 1).all(), "non-normalized or out of bounds coordinate labels: %s" % file
                if np.unique(lab, axis=0).shape[0] < lab.shape[0]:
                    nd += 1  # file contains duplicate rows
                self.labels[i] = lab
                nf += 1  # file found
            else:
                ne += 1  # file empty

            pbar.desc = "Caching labels (%g found, %g missing, %g empty, %g duplicate, for %g images)" % (
                nf, nm, ne, nd, n)
        assert nf > 0, "No labels found in %s." % (os.path.dirname(self.label_files[0]) + os.sep)

        # Optionally cache images in memory (may exceed RAM on large datasets).
        if cache_images:
            gb = 0  # bytes held by cached images
            pbar = tqdm(range(len(self.img_files)), desc="Caching images")
            self.img_hw0, self.img_hw = [None] * n, [None] * n
            for i in pbar:
                self.imgs[i], self.img_hw0[i], self.img_hw[i] = self.load_image(i)
                gb += self.imgs[i].nbytes
                pbar.desc = "Caching images (%.1fGB)" % (gb / 1E9)

    def load_image(self, index):
        """Return the image at ``index`` with its long side scaled to ``img_size``.

        :param index: image index
        :return: ``(img, (h0, w0), (h, w))`` - the image, its original size
            and its size after scaling
        """
        img = self.imgs[index]
        if img is None:  # not cached: read from disk
            path = self.img_files[index]
            img = cv2.imread(path)  # BGR, (h, w, c)
            assert img is not None, "Image Not Found " + path
            h0, w0 = img.shape[:2]
            r = self.img_size / max(h0, w0)
            if r != 1:
                # INTER_AREA when shrinking, INTER_LINEAR when enlarging.
                interp = cv2.INTER_AREA if r < 1 else cv2.INTER_LINEAR
                img = cv2.resize(img, (int(w0 * r), int(h0 * r)), interpolation=interp)
            return img, (h0, w0), img.shape[:2]
        return self.imgs[index], self.img_hw0[index], self.img_hw[index]

    def load_padding_img(self, index, color=(114, 114, 114)):
        """Load an image, scale it to fit the target shape and pad the rest.

        The target shape is the batch shape in rect mode and
        ``img_size x img_size`` otherwise. The cached labels are not
        modified; a converted copy is returned.

        :param index: image index
        :param color: padding colour
        :return: ``(img, labels)`` with label rows
            ``[class, xmin, ymin, xmax, ymax]`` in pixels of the padded image
        """
        img, (h0, w0), (h, w) = self.load_image(index)
        shape = img.shape[:2]  # current (h, w)

        new_shape = self.batch_shapes[self.batch[index]] if self.rect else self.img_size
        if isinstance(new_shape, int):
            new_shape = (new_shape, new_shape)

        # Use the smaller ratio so the whole image fits.
        r = min(new_shape[0] / shape[0], new_shape[1] / shape[1])
        if not self.augment:  # only scale down, do not scale up (for better test mAP)
            r = min(1.0, r)

        new_unpad = int(round(shape[1] * r)), int(round(shape[0] * r))  # (w, h) before padding
        # Padding is split evenly between the two sides.
        dw = (new_shape[1] - new_unpad[0]) / 2
        dh = (new_shape[0] - new_unpad[1]) / 2

        if shape[::-1] != new_unpad:  # shape: (h, w), new_unpad: (w, h)
            img = cv2.resize(img, new_unpad, interpolation=cv2.INTER_LINEAR)

        top, bottom = int(round(dh - 0.1)), int(round(dh + 0.1))
        left, right = int(round(dw - 0.1)), int(round(dw + 0.1))
        if top or bottom or left or right:
            img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_CONSTANT, value=color)
        else:
            # No border needed; still hand out a copy so later in-place
            # augmentation cannot touch a cached image.
            img = img.copy()

        # Relative xywh -> absolute xyxy. Read from the cache, write to a copy.
        src = self.labels[index]
        labels = src.copy()
        if src.size > 0:
            labels[:, 1] = r * w * (src[:, 1] - src[:, 3] / 2) + dw  # pad width
            labels[:, 2] = r * h * (src[:, 2] - src[:, 4] / 2) + dh  # pad height
            labels[:, 3] = r * w * (src[:, 1] + src[:, 3] / 2) + dw
            labels[:, 4] = r * h * (src[:, 2] + src[:, 4] / 2) + dh

        return img, labels

    def load_argument_img(self, index):
        """Load an augmented training sample.

        With probability 0.5 the image is combined with three randomly
        chosen images into a mosaic; otherwise the padded image is used. The
        result is then passed through ``random_affine``.

        :param index: image index
        :return: ``(img, labels)`` with label rows
            ``[class, xmin, ymin, xmax, ymax]`` in pixels
        """
        if random.random() < 0.5:
            # Three additional random images complete the mosaic.
            indices = [index] + [random.randint(0, len(self.labels) - 1) for _ in range(3)]
            tiles, tile_labels = [], []
            for i in indices:
                tile, _, _ = self.load_image(i)
                tiles.append(tile)
                tile_labels.append(self.labels[i])
            img, labels = argument_mosai(tiles, tile_labels, self.img_size)
        else:
            # random_affine expects pixel xyxy boxes and every item of a
            # batch must share one shape, so start from the padded image.
            img, labels = self.load_padding_img(index)

        img, labels = random_affine(img, labels,
                                    degrees=self.hyp['degrees'],
                                    translate=self.hyp['translate'],
                                    scale=self.hyp['scale'],
                                    shear=self.hyp['shear'],
                                    )
        return img, labels

    def __len__(self):
        """Return the number of images in the dataset."""
        return len(self.img_files)

    def __getitem__(self, index):
        """Return one sample.

        :param index: image index
        :return: ``(img, labels_out, path, index)`` where ``img`` is a CHW
            RGB tensor and ``labels_out`` has rows
            ``[0, class, x, y, w, h]`` with relative coordinates
        """
        hyp = self.hyp
        # In mosaic mode half of the samples go through mosaic/affine
        # augmentation; everything else is loaded and padded only.
        if self.mosaic and random.random() <= 0.5:
            img, labels = self.load_argument_img(index)
        else:
            img, labels = self.load_padding_img(index)

        if self.augment:
            # Outside mosaic mode the affine transform is applied here.
            if not self.mosaic and random.random() <= 0.5:
                img, labels = random_affine(img, labels,
                                            degrees=hyp["degrees"],
                                            translate=hyp["translate"],
                                            scale=hyp["scale"],
                                            shear=hyp["shear"])
            # HSV colour-space augmentation (in place).
            augment_hsv(img, h_gain=hyp["hsv_h"], s_gain=hyp["hsv_s"], v_gain=hyp["hsv_v"])

        nL = len(labels)
        if nL:
            # Absolute xyxy -> relative xywh.
            labels[:, 1:5] = xyxy2xywh(labels[:, 1:5])
            labels[:, [2, 4]] /= img.shape[0]  # height
            labels[:, [1, 3]] /= img.shape[1]  # width

        # Column 0 is reserved for the image index within the batch.
        labels_out = torch.zeros((nL, 6))
        if nL:
            labels_out[:, 1:] = torch.from_numpy(labels)

        # BGR -> RGB and HWC -> CHW.
        img = img[:, :, ::-1].transpose(2, 0, 1)
        img = np.ascontiguousarray(img)

        return torch.from_numpy(img), labels_out, self.img_files[index], index

    @staticmethod
    def collate_fn(batch):
        """Merge samples into a batch and tag every label with its image index.

        :param batch: list of ``(img, label, path, index)`` tuples
        :return: ``(imgs, labels, paths, indices)`` with images stacked along
            a new first axis and labels concatenated
        """
        img, label, path, index = zip(*batch)  # transposed
        for i, l in enumerate(label):
            l[:, 0] = i  # position of the image inside the batch
        return torch.stack(img, 0), torch.cat(label, 0), path, index