yolov8
__version__ = "8.2.31"
注意:YOLOv8版本一直在更新换代,下面代码的实现方式可能已经发生变化。
1 detect
1.1 数据增强
1.1.1 rect
rect数据增强技术会把数据集中长宽比最接近的图片打包成一个batch,在detect的val中默认使用。具体可以见:
# model.py: class Model(nn.Module): def val
custom = {"rect": True} # method defaults
rect对应的数据增强代码如下:
def set_rectangle(self):
    """
    Sort the dataset by aspect ratio and compute one letterbox target shape per batch.

    Reads ``self.ni``, ``self.batch_size``, ``self.labels``, ``self.im_files``, ``self.imgsz``,
    ``self.stride`` and ``self.pad``. Reorders ``self.im_files`` / ``self.labels`` in place of the
    old lists and sets ``self.batch_shapes`` and ``self.batch``.
    """
    # Batch index of every image: image k belongs to batch floor(k / batch_size).
    batch_index = np.floor(np.arange(self.ni) / self.batch_size).astype(int)
    num_batches = batch_index[-1] + 1

    # Each label dict gives up its "shape" entry here (hw, per the upstream note).
    hw = np.array([label.pop("shape") for label in self.labels])
    aspect = hw[:, 0] / hw[:, 1]

    # Sort images so that neighbours (and therefore batch members) have similar aspect ratios.
    order = aspect.argsort()
    self.im_files = [self.im_files[k] for k in order]
    self.labels = [self.labels[k] for k in order]
    aspect = aspect[order]

    # One relative shape per batch; the larger of the two entries is always 1.
    shapes = []
    for b in range(num_batches):
        in_batch = aspect[batch_index == b]
        lowest, highest = in_batch.min(), in_batch.max()
        if highest < 1:
            shapes.append([highest, 1])
        elif lowest > 1:
            shapes.append([1, 1 / lowest])
        else:
            shapes.append([1, 1])

    # Scale to pixels and round up to a multiple of the stride. For example, with imgsz=640,
    # stride=32 and pad=0.5 the side whose ratio is 1 becomes ceil(20.5) * 32 = 672.
    # These shapes are the per-batch target sizes (used as the letterbox size, per the notes).
    in_strides = np.array(shapes) * self.imgsz / self.stride + self.pad
    self.batch_shapes = np.ceil(in_strides).astype(int) * self.stride
    self.batch = batch_index  # batch index of image
1.2 postprocess后处理
1.2.1 update_metrics
在后处理preds之后update_metrics函数解析
update_metrics的输入:preds是一个len(preds)=bs的列表
batch是包含gt和img的信息
验证方式没有使用mosaic技术
'''
这个函数的目的就是得到self.stats
'''
def update_metrics(self, preds, batch):
    """
    Accumulate per-image evaluation statistics into ``self.stats``.

    Args:
        preds: sequence with one prediction tensor per image of the batch; columns 4 and 5 are
            read as confidence and class.
        batch: dict holding the ground truth and images; the keys read here are 'batch_idx',
            'cls', 'bboxes', 'ori_shape', 'img', 'ratio_pad' and 'im_file'.

    For every image one tuple ``(correct_bboxes, conf, pred_cls, target_cls)`` is appended to
    ``self.stats``, except when the image has neither predictions nor labels.
    """
    for si, pred in enumerate(preds):
        # Select the ground-truth rows that belong to image `si`.
        in_image = batch['batch_idx'] == si
        cls = batch['cls'][in_image]
        bbox = batch['bboxes'][in_image]
        num_labels = cls.shape[0]
        num_preds = pred.shape[0]
        shape = batch['ori_shape'][si]
        correct_bboxes = torch.zeros(num_preds, self.niou, dtype=torch.bool, device=self.device)  # init
        self.seen += 1

        # No predictions: every label (if any) is a miss; record empty conf/class tensors.
        if num_preds == 0:
            if num_labels:
                empty_conf, empty_cls = torch.zeros((2, 0), device=self.device)
                self.stats.append((correct_bboxes, empty_conf, empty_cls, cls.squeeze(-1)))
                if self.args.plots:
                    self.confusion_matrix.process_batch(detections=None, labels=cls.squeeze(-1))
            continue

        # Predictions
        if self.args.single_cls:
            pred[:, 5] = 0
        predn = pred.clone()
        model_hw = batch['img'][si].shape[1:]
        ratio_pad = batch['ratio_pad'][si]
        # Rescale the predicted boxes (absolute pixels in model-input space, not ratios)
        # to the original image size.
        ops.scale_boxes(model_hw, predn[:, :4], shape, ratio_pad=ratio_pad)  # native-space pred

        # Evaluate
        if num_labels:
            height, width = batch['img'].shape[2:]
            pixel_gain = torch.tensor((width, height, width, height), device=self.device)
            # batch['bboxes'] holds ratios; multiply by the input size to get pixel boxes,
            # then map them back to the original image like the predictions.
            tbox = ops.xywh2xyxy(bbox) * pixel_gain  # target boxes
            ops.scale_boxes(model_hw, tbox, shape, ratio_pad=ratio_pad)  # native-space labels
            labelsn = torch.cat((cls, tbox), 1)  # native-space labels
            # True-positive matrix that the mAP computation is based on.
            correct_bboxes = self._process_batch(predn, labelsn)
            # TODO: maybe remove these `self.` arguments as they already are member variable
            if self.args.plots:
                self.confusion_matrix.process_batch(predn, labelsn)
        self.stats.append((correct_bboxes, pred[:, 4], pred[:, 5], cls.squeeze(-1)))  # (conf, pcls, tcls)

        # Save
        if self.args.save_json:
            self.pred_to_json(predn, batch['im_file'][si])
        if self.args.save_txt:
            file = self.save_dir / 'labels' / f'{Path(batch["im_file"][si]).stem}.txt'
            self.save_one_txt(predn, self.args.save_conf, shape, file)
其中 correct_bboxes = self._process_batch(predn, labelsn) 调用的 _process_batch 实现如下:
def _process_batch(self, detections, labels):
    """
    Build the boolean "correct prediction" matrix for one image.

    Arguments:
        detections (array[N, 6]), x1, y1, x2, y2, conf, class
        labels (array[M, 5]), class, x1, y1, x2, y2
    Returns:
        correct (array[N, len(self.iouv)]), one column per IoU level (10 levels upstream)
    """
    # Pairwise IoU between every label and every detection: shape (M, N).
    pairwise_iou = box_iou(labels[:, 1:], detections[:, :4])
    correct = np.zeros((detections.shape[0], self.iouv.shape[0])).astype(bool)

    # labels[:, 0:1] is (M, 1) and detections[:, 5] is (N,), so broadcasting yields an (M, N)
    # mask that is True where a label and a detection carry the same class.
    same_class = labels[:, 0:1] == detections[:, 5]

    # self.iouv holds the IoU thresholds (0.5 to 0.95 per the notes).
    for level in range(len(self.iouv)):
        # torch.where returns index tensors: hits[0] are label indices, hits[1] detection indices
        # (see https://blog.csdn.net/shilichangtin/article/details/135335322).
        hits = torch.where((pairwise_iou >= self.iouv[level]) & same_class)  # IoU > threshold and classes match
        num_hits = hits[0].shape[0]
        if not num_hits:
            continue

        pair_iou = pairwise_iou[hits[0], hits[1]][:, None]
        matches = torch.cat((torch.stack(hits, 1), pair_iou), 1).cpu().numpy()  # [label, detect, iou]
        if num_hits > 1:
            # Sort candidate pairs by IoU, highest first.
            by_iou_desc = matches[:, 2].argsort()[::-1]
            matches = matches[by_iou_desc]
            # Reduce to a one-to-one assignment. np.unique(return_index=True) gives the first
            # occurrence of each value, so each detection keeps its highest-IoU label.
            _, first_per_detection = np.unique(matches[:, 1], return_index=True)
            matches = matches[first_per_detection]
            # NOTE(review): rows are now ordered by detection index, not IoU (the upstream
            # re-sort by IoU is commented out), so each label keeps the first detection in that
            # order, which is not necessarily its highest-IoU one.
            _, first_per_label = np.unique(matches[:, 0], return_index=True)
            matches = matches[first_per_label]
        correct[matches[:, 1].astype(int), level] = True
    return torch.tensor(correct, dtype=torch.bool, device=detections.device)
2 classify
yolov8 classify的train或者val可以按照下面的方式进行
from ultralytics import YOLO
if __name__ == "__main__":
    # Train the pretrained YOLOv8 nano classifier on the `datasets_class` folder.
    train_kwargs = {"data": "datasets_class", "epochs": 3, "imgsz": 64}
    model = YOLO("yolov8n-cls.pt")
    results = model.train(**train_kwargs)

    # To validate a trained checkpoint instead, use:
    # model = YOLO("runs/classify/train/weights/best.pt")
    # metrics = model.val(batch=1)
2.1 imgsz
imgsz是模型图像的输入尺寸
train
在train时,可以通过两种方式改变imgsz:
- 直接修改 model.train(data="datasets_class", epochs=3, imgsz=64) 中imgsz的设定值;
- 调用 model.train(data="datasets_class", epochs=3) 不传入imgsz时,代码的默认imgsz是224,可以通过修改该默认值来改变imgsz:
# train.py: class ClassificationTrainer(BaseTrainer): def __init__()
if overrides.get("imgsz") is None:
    overrides["imgsz"] = 224
val
val中的imgsz是由模型决定的,可以直接通过在下面代码处打断点进行调试
# model.py: class Model(nn.Module): def _load()
self.task = self.model.args["task"]
2.2 torch_transforms
2.2.1 YOLOv8
介绍过imgsz后,再来看torch_transforms,它是训练或者验证分类模型时使用的数据增强(图像变换)手段。在train的时候使用的是classify_augmentations,val时使用的是classify_transforms。
# dataset.py: class ClassificationDataset: def __init__()
# Pick the image pipeline: classify_augmentations when `augment` is truthy,
# otherwise classify_transforms.
if augment:
    self.torch_transforms = classify_augmentations(
        size=args.imgsz,
        scale=scale,
        hflip=args.fliplr,
        vflip=args.flipud,
        erasing=args.erasing,
        auto_augment=args.auto_augment,
        hsv_h=args.hsv_h,
        hsv_s=args.hsv_s,
        hsv_v=args.hsv_v,
    )
else:
    self.torch_transforms = classify_transforms(size=args.imgsz, crop_fraction=args.crop_fraction)
主要介绍是这个classify_transforms
,这个图片变换很有趣,可以直接看其代码。
在YOLOv8中,classify_transforms
首先是将img
经过T.Resize
,这个T.Resize
操作是T.Resize(scale_size[0], interpolation=interpolation)
,这个scale_size[0]
很重要,传入的模型输入size一般是224,那么这个T.Resize
选择H,W中最小的值缩放到224,另外较大的H(或W)按照图片原始的长宽比进行缩放。
然后T.CenterCrop(size)会对T.Resize缩放之后的图片进行中心裁剪,裁剪的大小是[224, 224],边缘部分的图片会直接丢失。所以最重要的一点是:这种分类预处理不适合长宽比极端的图片,例如输入图片为[224, 2240]时,中心裁剪只保留中间的224列,宽度方向上其余2240-224=2016列的像素信息都会丢失。
最后经过一个T.ToTensor(), T.Normalize(mean=torch.tensor(mean), std=torch.tensor(std),)
之后的图片就直接作为模型的输入。
from PIL import Image
# Default arguments of classify_transforms().
# Per-channel mean/std handed to T.Normalize: zero mean and unit std.
DEFAULT_MEAN = (0.0, 0.0, 0.0)
DEFAULT_STD = (1.0, 1.0, 1.0)
# With a crop fraction of 1.0 the resize target equals the requested size (floor(size / 1.0)).
DEFAULT_CROP_FRACTION = 1.0
import math
import torch
# Classification augmentations -----------------------------------------------------------------------------------------
def classify_transforms(
    size=224,
    mean=DEFAULT_MEAN,
    std=DEFAULT_STD,
    interpolation=Image.BILINEAR,
    crop_fraction: float = DEFAULT_CROP_FRACTION,
):
    """
    Classification transforms for evaluation/inference. Inspired by timm/data/transforms_factory.py.

    The pipeline is Resize -> CenterCrop(size) -> ToTensor -> Normalize(mean, std).

    Args:
        size (int | tuple | list): target image size; a 2-element sequence or a single int
            (used for both dimensions).
        mean (tuple): mean values of RGB channels
        std (tuple): std values of RGB channels
        interpolation: interpolation mode forwarded to T.Resize. The default is PIL's
            ``Image.BILINEAR``. NOTE(review): recent torchvision releases prefer
            ``T.InterpolationMode`` values; confirm against the installed version.
        crop_fraction (float): the image is first resized to ``floor(size / crop_fraction)`` and
            then center-cropped to ``size``. default is 1.0.

    Returns:
        (T.Compose): torchvision transforms
    """
    import torchvision.transforms as T  # scope for faster 'import ultralytics'

    if isinstance(size, (tuple, list)):
        assert len(size) == 2, f"size must have exactly 2 elements, got {len(size)}"
        scale_size = tuple(math.floor(x / crop_fraction) for x in size)
    else:
        scale_size = math.floor(size / crop_fraction)
        scale_size = (scale_size, scale_size)

    if scale_size[0] == scale_size[1]:
        # Square target: a scalar size makes T.Resize scale the shortest edge and keep the aspect
        # ratio; the following center crop discards whatever falls outside (no borders added).
        tfl = [T.Resize(scale_size[0], interpolation=interpolation)]
    else:
        # Non-square target: a sequence size is passed to T.Resize as-is.
        # The interpolation mode is forwarded here too; it used to be silently ignored.
        tfl = [T.Resize(scale_size, interpolation=interpolation)]
    tfl += [T.CenterCrop(size)]

    tfl += [
        T.ToTensor(),
        T.Normalize(
            mean=torch.tensor(mean),
            std=torch.tensor(std),
        ),
    ]
    return T.Compose(tfl)
# Demo: run the evaluation transform on one image and display the result.
# `T` is only imported inside classify_transforms(), so it must be imported here as well;
# without this line `T.ToPILImage()` raises NameError.
import torchvision.transforms as T

img = Image.open('06162.jpg')
torch_transforms = classify_transforms(size=224, crop_fraction=1.0)
resized_tensor = torch_transforms(img)
# Convert the tensor back to a PIL image so it can be shown.
to_pil_transform = T.ToPILImage()
resized_img = to_pil_transform(resized_tensor)
resized_img.show()
2.2.2 YOLOv9
yolov9中使用的classify_transforms和yolov8(__version__ = '8.0.110')中的实现是一样的
# yolov9 : augmentations.py
def classify_transforms(size=224):
    """
    Build the classification transform used when albumentations is not installed.

    Args:
        size (int): side length handed to CenterCrop; anything but an int fails the assertion.

    Returns:
        T.Compose of CenterCrop(size), ToTensor() and T.Normalize(IMAGENET_MEAN, IMAGENET_STD).
    """
    assert isinstance(size, int), f'ERROR: classify_transforms size {size} must be integer, not (list, tuple)'
    # Earlier variant kept for reference:
    # T.Compose([T.ToTensor(), T.Resize(size), T.CenterCrop(size), T.Normalize(IMAGENET_MEAN, IMAGENET_STD)])
    pipeline = [
        CenterCrop(size),
        ToTensor(),
        T.Normalize(IMAGENET_MEAN, IMAGENET_STD),
    ]
    return T.Compose(pipeline)
# augment.py
def classify_transforms(size=224, mean=(0.0, 0.0, 0.0), std=(1.0, 1.0, 1.0)):  # IMAGENET_MEAN, IMAGENET_STD
    """
    Build the classification transform used when albumentations is not installed.

    Args:
        size (int): side length handed to CenterCrop.
        mean (tuple): per-channel mean for T.Normalize.
        std (tuple): per-channel std for T.Normalize.

    Returns:
        T.Compose of CenterCrop(size) and ToTensor(), followed by an in-place T.Normalize when
        any entry of `mean` or `std` is non-zero.

    Raises:
        TypeError: if `size` is not an int.
    """
    if not isinstance(size, int):
        raise TypeError(f'classify_transforms() size {size} must be integer, not (list, tuple)')

    normalize = any(mean) or any(std)
    steps = [CenterCrop(size), ToTensor()]
    if normalize:
        steps.append(T.Normalize(mean, std, inplace=True))
    return T.Compose(steps)
这两个YOLO版本默认使用的中心裁剪都是下面代码中的CenterCrop:先从原始img(假设是[320, 664])的中心裁剪出[320, 320]的img,然后将其缩放到[224, 224]。最后再经过一个ToTensor();ToTensor()和T.ToTensor()都会将像素的值转化到0-1之间。
class CenterCrop:
    """YOLOv5 CenterCrop class for image preprocessing, i.e. T.Compose([CenterCrop(size), ToTensor()])."""

    def __init__(self, size=640):
        """Store the output height and width; an int `size` is used for both."""
        super().__init__()
        if isinstance(size, int):
            self.h = self.w = size
        else:
            self.h, self.w = size

    def __call__(self, im):  # im = np.array HWC
        """Cut the largest centered square out of `im` and resize it to (self.h, self.w)."""
        height, width = im.shape[:2]
        side = min(height, width)  # min dimension
        y0 = (height - side) // 2
        x0 = (width - side) // 2
        square = im[y0:y0 + side, x0:x0 + side]
        return cv2.resize(square, (self.w, self.h), interpolation=cv2.INTER_LINEAR)
2.2.3 长宽比极端的图片分类
在2.2.1中提到过,YOLOv8和YOLOv9中默认使用的classify_transforms对长宽比极端的图片不友好,只会从原始的图片中截取一小部分进行分类,很容易出错。YOLOv8和YOLOv9中也给出了类似于目标检测letterbox的做法:保持长宽比缩放图片后,在图片周围填充灰色边框。
class ClassifyLetterBox:
    """
    YOLOv8 LetterBox class for image preprocessing, i.e. T.Compose([LetterBox(size), ToTensor()]).

    The image is resized with its aspect ratio preserved and pasted onto the center of a canvas
    filled with the value 114, so nothing is cropped away.
    """

    def __init__(self, size=(640, 640), auto=False, stride=32):
        """
        Store the target size and padding options.

        Args:
            size (int | tuple): target (height, width); an int is used for both.
            auto (bool): if True, the canvas is the resized image rounded up to a multiple of
                `stride` instead of the fixed (height, width).
            stride (int): stride used when `auto` is True.
        """
        super().__init__()
        self.h, self.w = (size, size) if isinstance(size, int) else size
        self.auto = auto  # pass max size integer, automatically solve for short side using stride
        self.stride = stride  # used with auto

    def _canvas_shape(self, h, w):
        """Return the (height, width) of the output canvas for a resized image of size (h, w)."""
        if self.auto:
            return tuple(math.ceil(x / self.stride) * self.stride for x in (h, w))
        return self.h, self.w

    def __call__(self, im):  # im = np.array HWC
        """Resize `im` to fit inside the target size and pad it to the canvas size."""
        imh, imw = im.shape[:2]
        r = min(self.h / imh, self.w / imw)  # ratio of new/old
        h, w = round(imh * r), round(imw * r)  # resized image

        # The original one-liner `a if cond else b, c` parsed as `(a if cond else b), c`, which
        # made `hs` a generator when auto=True and crashed on `hs - h`.
        hs, ws = self._canvas_shape(h, w)
        # The -0.1 makes an exact .5 offset round down.
        top, left = round((hs - h) / 2 - 0.1), round((ws - w) / 2 - 0.1)

        # Allocate the canvas with the size the offsets were computed for; with auto=False this
        # is (self.h, self.w), exactly as before.
        im_out = np.full((hs, ws, 3), 114, dtype=im.dtype)
        im_out[top:top + h, left:left + w] = cv2.resize(im, (w, h), interpolation=cv2.INTER_LINEAR)
        return im_out