While studying YOLOv8 I noticed that its detection head differs from YOLOv5's (this post discusses the official YOLO, i.e. the ultralytics version). Most posts I found online only explain it half-way, so I'm opening this thread to work through it properly.
First, note that YOLOv8 regresses its bounding boxes with Distribution Focal Loss (DFL). The name sounds fancy, but the underlying idea is not complicated. Stepping through the yolov8 code in a debugger, we can see:
class Detect(nn.Module):
    """YOLOv8 Detect head for detection models."""

    dynamic = False  # force grid reconstruction
    export = False  # export mode
    shape = None
    anchors = torch.empty(0)  # init
    strides = torch.empty(0)  # init

    def __init__(self, nc=80, ch=()):
        """Initializes the YOLOv8 detection layer with specified number of classes and channels."""
        super().__init__()
        self.nc = nc  # number of classes
        self.nl = len(ch)  # number of detection layers
        self.reg_max = 16  # DFL channels (ch[0] // 16 to scale 4/8/12/16/20 for n/s/m/l/x)
        self.no = nc + self.reg_max * 4  # number of outputs per anchor
        self.stride = torch.zeros(self.nl)  # strides computed during build
        c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100))  # channels
        self.cv2 = nn.ModuleList(
            nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch
        )
        self.cv3 = nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
        self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()

    def forward(self, x):
        """Concatenates and returns predicted bounding boxes and class probabilities."""
        for i in range(self.nl):
            x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
        if self.training:  # Training path
            return x
        # remaining code omitted
The cv2 branch handles box regression and the cv3 branch handles classification (I'll assume you already have deep learning and YOLO basics), so the regression output has 16 * 4 = 64 channels. When a feature map passes through the head, the concatenated output at each scale has shape (batch_size, 16 * 4 + nc, h_i, w_i). YOLOv8 predicts at 3 scales by default, so the training-mode forward pass returns a list of length 3 whose i-th element has that shape, for i = 0, 1, 2.
In other words, for a feature map of size h_i * w_i, every pixel produces 64 values of box-regression information (plus nc class scores).
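Before moving on, here is a minimal shape sketch. It is not the real Detect module: nc = 11 is taken from my own dataset, and imgsz = 640 is assumed, so the three scales are 80/40/20. It only mimics what the training-mode forward pass returns:

import torch

nc, reg_max = 11, 16   # assumed: 11 classes, 16 DFL bins
no = nc + reg_max * 4  # 75 output channels per location
batch_size = 16

# dummy stand-ins for the three per-scale head outputs at imgsz=640 (strides 8/16/32)
feats = [torch.randn(batch_size, no, s, s) for s in (80, 40, 20)]
for f in feats:
    print(f.shape)
# torch.Size([16, 75, 80, 80])
# torch.Size([16, 75, 40, 40])
# torch.Size([16, 75, 20, 20])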
Some data-processing steps come next, which I may write up later if people are interested; let's jump straight to the key part:
class v8DetectionLoss:
    """Criterion class for computing training losses."""

    .......
    .......

    def __call__(self, preds, batch):
        """Calculate the sum of the loss for box, cls and dfl multiplied by batch size."""
        loss = torch.zeros(3, device=self.device)  # box, cls, dfl
        feats = preds[1] if isinstance(preds, tuple) else preds
        pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split(
            (self.reg_max * 4, self.nc), 1
        )

        pred_scores = pred_scores.permute(0, 2, 1).contiguous()
        pred_distri = pred_distri.permute(0, 2, 1).contiguous()

        dtype = pred_scores.dtype
        batch_size = pred_scores.shape[0]
        imgsz = torch.tensor(feats[0].shape[2:], device=self.device, dtype=dtype) * self.stride[0]  # image size (h,w)
        anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)

        # Targets
        targets = torch.cat((batch["batch_idx"].view(-1, 1), batch["cls"].view(-1, 1), batch["bboxes"]), 1)
        targets = self.preprocess(targets.to(self.device), batch_size, scale_tensor=imgsz[[1, 0, 1, 0]])
        gt_labels, gt_bboxes = targets.split((1, 4), 2)  # cls, xyxy
        mask_gt = gt_bboxes.sum(2, keepdim=True).gt_(0)

        # Pboxes
        pred_bboxes = self.bbox_decode(anchor_points, pred_distri)  # xyxy, (b, h*w, 4)

        _, target_bboxes, target_scores, fg_mask, _ = self.assigner(
            pred_scores.detach().sigmoid(),
            (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
            anchor_points * stride_tensor,
            gt_labels,
            gt_bboxes,
            mask_gt,
        )

        target_scores_sum = max(target_scores.sum(), 1)

        # Cls loss
        # loss[1] = self.varifocal_loss(pred_scores, target_scores, target_labels) / target_scores_sum  # VFL way
        loss[1] = self.bce(pred_scores, target_scores.to(dtype)).sum() / target_scores_sum  # BCE

        # Bbox loss
        if fg_mask.sum():
            target_bboxes /= stride_tensor
            loss[0], loss[2] = self.bbox_loss(
                pred_distri, pred_bboxes, anchor_points, target_bboxes, target_scores, target_scores_sum, fg_mask
            )

        loss[0] *= self.hyp.box  # box gain
        loss[1] *= self.hyp.cls  # cls gain
        loss[2] *= self.hyp.dfl  # dfl gain

        return loss.sum() * batch_size, loss.detach()  # loss(box, cls, dfl)
Here feats holds the three computed per-scale feature maps. From my debugging run, each element has shape (batch_size, reg_max * 4 (= 64) + num_classes (= 11), h_i, w_i). They then pass through this line:
pred_distri, pred_scores = torch.cat([xi.view(feats[0].shape[0], self.no, -1) for xi in feats], 2).split(
    (self.reg_max * 4, self.nc), 1
)
which reshapes each element of feats to (batch_size, self.no (= 64 + 11), -1), where -1 is inferred as h_i * w_i. Since feats has length 3, concatenating along the last dimension gives shape (batch_size, self.no (= 64 + 11), 80*80 + 40*40 + 20*20 = 8400). Splitting along dimension 1 then yields pred_distri of shape (batch_size, 64, 8400) and pred_scores of shape (batch_size, num_classes, 8400). The shapes are then rearranged:
pred_scores = pred_scores.permute(0, 2, 1).contiguous()
pred_distri = pred_distri.permute(0, 2, 1).contiguous()
This yields pred_distri of shape (batch_size, 8400, 64) and pred_scores of shape (batch_size, 8400, 11). The picture is now fairly clear: each image in the batch produces 8400 predictions, each carrying a 64-dim box representation and an 11-dim class score.
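As a sanity check, this reshaping pipeline can be reproduced end to end with dummy tensors (shapes taken from my debugging run: nc = 11, imgsz = 640):

import torch

batch_size, nc, reg_max = 16, 11, 16
no = nc + reg_max * 4  # 75

# dummy head outputs for the three scales (strides 8/16/32 at imgsz=640)
feats = [torch.randn(batch_size, no, s, s) for s in (80, 40, 20)]

pred_distri, pred_scores = torch.cat(
    [xi.view(batch_size, no, -1) for xi in feats], 2
).split((reg_max * 4, nc), 1)
pred_scores = pred_scores.permute(0, 2, 1).contiguous()
pred_distri = pred_distri.permute(0, 2, 1).contiguous()

print(pred_distri.shape, pred_scores.shape)
# torch.Size([16, 8400, 64]) torch.Size([16, 8400, 11])

The next job is to turn the 64-dim box representation into the 4-value form we are used to in object detection (xywh or xyxy), which starts with generating anchor points: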
anchor_points, stride_tensor = make_anchors(feats, self.stride, 0.5)
def make_anchors(feats, strides, grid_cell_offset=0.5):
    """Generate anchors from features."""
    anchor_points, stride_tensor = [], []
    assert feats is not None
    dtype, device = feats[0].dtype, feats[0].device
    for i, stride in enumerate(strides):
        _, _, h, w = feats[i].shape
        sx = torch.arange(end=w, device=device, dtype=dtype) + grid_cell_offset  # shift x
        sy = torch.arange(end=h, device=device, dtype=dtype) + grid_cell_offset  # shift y
        sy, sx = torch.meshgrid(sy, sx, indexing="ij") if TORCH_1_10 else torch.meshgrid(sy, sx)
        anchor_points.append(torch.stack((sx, sy), -1).view(-1, 2))
        stride_tensor.append(torch.full((h * w, 1), stride, dtype=dtype, device=device))
    return torch.cat(anchor_points), torch.cat(stride_tensor)
This function generates the grid-cell center points of each feature map. The 0.5 offset is what makes them centers: plain integer coordinates (0,0), (1,1), ... would correspond to the top-left corners of the cells, and adding 0.5 moves each point to the middle of its cell. The second return value is the downsampling factor of each scale, i.e. 8, 16, 32 (assuming imgsz=640, so the three feature maps are 80, 40 and 20 cells wide).
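To make the offset concrete, here is a small standalone sketch of the same logic (fake 4x4 and 2x2 feature maps instead of the real 80/40/20, so the output is easy to read):

import torch

feats = [torch.zeros(1, 75, 4, 4), torch.zeros(1, 75, 2, 2)]
strides = [8.0, 16.0]

anchor_points, stride_tensor = [], []
for i, stride in enumerate(strides):
    _, _, h, w = feats[i].shape
    sx = torch.arange(w, dtype=torch.float32) + 0.5  # cell-center x
    sy = torch.arange(h, dtype=torch.float32) + 0.5  # cell-center y
    sy, sx = torch.meshgrid(sy, sx, indexing="ij")
    anchor_points.append(torch.stack((sx, sy), -1).view(-1, 2))
    stride_tensor.append(torch.full((h * w, 1), stride))
anchor_points, stride_tensor = torch.cat(anchor_points), torch.cat(stride_tensor)

print(anchor_points[:4].tolist())  # first row of the 4x4 grid
# [[0.5, 0.5], [1.5, 0.5], [2.5, 0.5], [3.5, 0.5]]
print(anchor_points.shape, stride_tensor.shape)  # torch.Size([20, 2]) torch.Size([20, 1])

Note the anchor points are in grid units, not pixels; multiplying by stride_tensor converts them back to input-image coordinates, which is exactly what the loss code does when it calls the assigner.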
Next comes the decoding step, which converts the high-dimensional box representation we just obtained:
pred_bboxes = self.bbox_decode(anchor_points, pred_distri)
def bbox_decode(self, anchor_points, pred_dist):
    """Decode predicted object bounding box coordinates from anchor points and distribution."""
    if self.use_dfl:
        b, a, c = pred_dist.shape  # batch, anchors, channels
        pred_dist = pred_dist.view(b, a, 4, c // 4).softmax(3).matmul(self.proj.type(pred_dist.dtype))
        # pred_dist = pred_dist.view(b, a, c // 4, 4).transpose(2, 3).softmax(3).matmul(self.proj.type(pred_dist.dtype))
        # pred_dist = (pred_dist.view(b, a, c // 4, 4).softmax(2) * self.proj.type(pred_dist.dtype).view(1, 1, -1, 1)).sum(2)
    return dist2bbox(pred_dist, anchor_points, xywh=False)
pred_dist = pred_dist.view(b, a, 4, c // 4).softmax(3).matmul(self.proj.type(pred_dist.dtype))
This line first reshapes the predictions so that the third dimension becomes 4 (one entry per box side), then applies softmax over the fourth dimension to turn each group of 16 logits into a probability distribution, and finally takes the dot product with the vector self.proj (self.proj = torch.arange(m.reg_max, dtype=torch.float, device=device), i.e. (0, 1, 2, ..., 15)). That dot product is the expected value of the distribution, collapsing the 16 numbers into a single distance. The result is a box prediction tensor of shape (batch_size, anchors, 4).
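To see why the dot product with (0, 1, ..., 15) collapses the distribution into one distance, here is a minimal numeric sketch (the logits are made up and peak around bin 3):

import torch

reg_max = 16
proj = torch.arange(reg_max, dtype=torch.float)  # (0, 1, ..., 15)

logits = torch.full((reg_max,), -4.0)            # fake logits for one box side
logits[2], logits[3], logits[4] = 2.0, 3.0, 2.0  # mass concentrated near bin 3

prob = logits.softmax(-1)  # a discrete probability distribution over 16 bins
dist = prob @ proj         # its expected value, a single scalar distance
print(dist)                # ≈ 3.0 (in grid units)

So instead of regressing one number directly, the network predicts a distribution over 16 discrete distances and we take its expectation; this is the core of the DFL representation. Next, dist2bbox turns the four per-side distances into box corners: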
return dist2bbox(pred_dist, anchor_points, xywh=False)
def dist2bbox(distance, anchor_points, xywh=True, dim=-1):
    """Transform distance(ltrb) to box(xywh or xyxy)."""
    lt, rb = distance.chunk(2, dim)
    x1y1 = anchor_points - lt
    x2y2 = anchor_points + rb
    if xywh:
        c_xy = (x1y1 + x2y2) / 2
        wh = x2y2 - x1y1
        return torch.cat((c_xy, wh), dim)  # xywh bbox
    return torch.cat((x1y1, x2y2), dim)  # xyxy bbox
This function subtracts the left/top predicted distances from the anchor center and adds the right/bottom ones, giving the final predicted box (x1, y1, x2, y2).
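A tiny worked example (made-up numbers, in grid units):

import torch

anchor = torch.tensor([4.5, 4.5])              # one anchor center
distance = torch.tensor([1.0, 2.0, 3.0, 4.0])  # predicted l, t, r, b

lt, rb = distance.chunk(2, -1)
print(torch.cat((anchor - lt, anchor + rb)))   # tensor([3.5000, 2.5000, 7.5000, 8.5000])

These coordinates are still in grid units; multiplying by the stride (as in pred_bboxes.detach() * stride_tensor in the loss above) maps them back to input-image pixels.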
With decoding complete we now have predicted boxes in xyxy form; the next step is to match these predictions to the ground-truth (gt) boxes:
_, target_bboxes, target_scores, fg_mask, _ = self.assigner(
    pred_scores.detach().sigmoid(),
    (pred_bboxes.detach() * stride_tensor).type(gt_bboxes.dtype),
    anchor_points * stride_tensor,
    gt_labels,
    gt_bboxes,
    mask_gt,
)
# self.assigner = TaskAlignedAssigner(topk=10, num_classes=self.nc, alpha=0.5, beta=6.0)
# the assigner is constructed as shown above
class TaskAlignedAssigner(nn.Module):
    """
    A task-aligned assigner for object detection.

    This class assigns ground-truth (gt) objects to anchors based on the task-aligned metric, which combines both
    classification and localization information.

    Attributes:
        topk (int): The number of top candidates to consider.
        num_classes (int): The number of object classes.
        alpha (float): The alpha parameter for the classification component of the task-aligned metric.
        beta (float): The beta parameter for the localization component of the task-aligned metric.
        eps (float): A small value to prevent division by zero.
    """

    def __init__(self, topk=13, num_classes=80, alpha=1.0, beta=6.0, eps=1e-9):
        """Initialize a TaskAlignedAssigner object with customizable hyperparameters."""
        super().__init__()
        self.topk = topk
        self.num_classes = num_classes
        self.bg_idx = num_classes
        self.alpha = alpha
        self.beta = beta
        self.eps = eps

    @torch.no_grad()
    def forward(self, pd_scores, pd_bboxes, anc_points, gt_labels, gt_bboxes, mask_gt):
        """
        Compute the task-aligned assignment. Reference code is available at
        https://github.com/Nioolek/PPYOLOE_pytorch/blob/master/ppyoloe/assigner/tal_assigner.py.

        Args:
            pd_scores (Tensor): shape(bs, num_total_anchors, num_classes)
            pd_bboxes (Tensor): shape(bs, num_total_anchors, 4)
            anc_points (Tensor): shape(num_total_anchors, 2)
            gt_labels (Tensor): shape(bs, n_max_boxes, 1)
            gt_bboxes (Tensor): shape(bs, n_max_boxes, 4)
            mask_gt (Tensor): shape(bs, n_max_boxes, 1)

        Returns:
            target_labels (Tensor): shape(bs, num_total_anchors)
            target_bboxes (Tensor): shape(bs, num_total_anchors, 4)
            target_scores (Tensor): shape(bs, num_total_anchors, num_classes)
            fg_mask (Tensor): shape(bs, num_total_anchors)
            target_gt_idx (Tensor): shape(bs, num_total_anchors)
        """
        self.bs = pd_scores.shape[0]
        self.n_max_boxes = gt_bboxes.shape[1]

        if self.n_max_boxes == 0:
            device = gt_bboxes.device
            return (
                torch.full_like(pd_scores[..., 0], self.bg_idx).to(device),
                torch.zeros_like(pd_bboxes).to(device),
                torch.zeros_like(pd_scores).to(device),
                torch.zeros_like(pd_scores[..., 0]).to(device),
                torch.zeros_like(pd_scores[..., 0]).to(device),
            )

        mask_pos, align_metric, overlaps = self.get_pos_mask(
            pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points, mask_gt
        )

        target_gt_idx, fg_mask, mask_pos = self.select_highest_overlaps(mask_pos, overlaps, self.n_max_boxes)

        # Assigned target
        target_labels, target_bboxes, target_scores = self.get_targets(gt_labels, gt_bboxes, target_gt_idx, fg_mask)

        # Normalize
        align_metric *= mask_pos
        pos_align_metrics = align_metric.amax(dim=-1, keepdim=True)  # b, max_num_obj
        pos_overlaps = (overlaps * mask_pos).amax(dim=-1, keepdim=True)  # b, max_num_obj
        norm_align_metric = (align_metric * pos_overlaps / (pos_align_metrics + self.eps)).amax(-2).unsqueeze(-1)
        target_scores = target_scores * norm_align_metric

        return target_labels, target_bboxes, target_scores, fg_mask.bool(), target_gt_idx
As you can see, forward first calls the get_pos_mask method to build the mask of candidate positive matches:
def get_pos_mask(self, pd_scores, pd_bboxes, gt_labels, gt_bboxes, anc_points, mask_gt):
    """Get in_gts mask, (b, max_num_obj, h*w)."""
    mask_in_gts = self.select_candidates_in_gts(anc_points, gt_bboxes)
    # Get anchor_align metric, (b, max_num_obj, h*w)
    align_metric, overlaps = self.get_box_metrics(pd_scores, pd_bboxes, gt_labels, gt_bboxes, mask_in_gts * mask_gt)
    # Get topk_metric mask, (b, max_num_obj, h*w)
    mask_topk = self.select_topk_candidates(align_metric, topk_mask=mask_gt.expand(-1, -1, self.topk).bool())
    # Merge all mask to a final mask, (b, max_num_obj, h*w)
    mask_pos = mask_topk * mask_in_gts * mask_gt
    return mask_pos, align_metric, overlaps
The call mask_in_gts = self.select_candidates_in_gts(anc_points, gt_bboxes) produces a mask telling us, for every anchor point, which gt boxes it falls inside:
def select_candidates_in_gts(xy_centers, gt_bboxes, eps=1e-9):
    """
    Select the positive anchor center in gt.

    Args:
        xy_centers (Tensor): shape(h*w, 2)
        gt_bboxes (Tensor): shape(b, n_boxes, 4)

    Returns:
        (Tensor): shape(b, n_boxes, h*w)
    """
    n_anchors = xy_centers.shape[0]
    bs, n_boxes, _ = gt_bboxes.shape
    lt, rb = gt_bboxes.view(-1, 1, 4).chunk(2, 2)  # left-top, right-bottom
    bbox_deltas = torch.cat((xy_centers[None] - lt, rb - xy_centers[None]), dim=2).view(bs, n_boxes, n_anchors, -1)
    # return (bbox_deltas.min(3)[0] > eps).to(gt_bboxes.dtype)
    return bbox_deltas.amin(3).gt_(eps)
Here xy_centers are the anchor points generated from the feature maps, shaped (8400, 2) as discussed above. The line lt, rb = gt_bboxes.view(-1, 1, 4).chunk(2, 2) splits each gt box into its left-top (lt) and right-bottom (rb) corners, and then broadcasting takes over:
bbox_deltas = torch.cat((xy_centers[None] - lt, rb - xy_centers[None]), dim=2).view(bs, n_boxes, n_anchors, -1)
This computes the distance from every center point to every gt box. The intuition behind the broadcast: we have n1 center points and n2 gt boxes and want to know which center falls inside which box. Broadcasting exploits the GPU's parallelism: instead of writing a loop that checks pairs one by one, we effectively replicate the centers n2 times and the boxes n1 times and evaluate all pairs at once. Concretely, in my debugging run xy_centers had shape (8400, 2) and gt_bboxes had shape (batch_size, num_max_gtboxes, 4) = (16, 21, 4), so lt and rb each had shape (16*21, 1, 2); after broadcasting, cat and view, bbox_deltas.shape = (batch_size, n_boxes, n_anchors, -1) = (16, 21, 8400, 4).
The last dimension holds the distances from the center point to the four sides of the box. In return bbox_deltas.amin(3).gt_(eps), amin takes the minimum over those four distances and .gt_(eps) compares it against eps, a value close to 0: if even the smallest of the four distances is positive, the center point lies inside the gt box and the match succeeds. The returned tensor is exactly this mask.
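Here is a tiny self-contained version of the same broadcasting trick (3 anchor centers and 2 made-up gt boxes in one image, all in grid units):

import torch

xy_centers = torch.tensor([[0.5, 0.5], [4.5, 4.5], [9.5, 9.5]])  # (3, 2)
gt_bboxes = torch.tensor([[[2.0, 2.0, 6.0, 6.0],
                           [8.0, 8.0, 12.0, 12.0]]])             # (1, 2, 4) xyxy

bs, n_boxes, _ = gt_bboxes.shape
lt, rb = gt_bboxes.view(-1, 1, 4).chunk(2, 2)                    # (2, 1, 2) each
deltas = torch.cat((xy_centers[None] - lt, rb - xy_centers[None]), dim=2)
deltas = deltas.view(bs, n_boxes, xy_centers.shape[0], -1)       # (1, 2, 3, 4)
print(deltas.amin(3) > 1e-9)
# tensor([[[False,  True, False],    # gt 0 contains only the center (4.5, 4.5)
#          [False, False,  True]]])  # gt 1 contains only the center (9.5, 9.5)

Next, every surviving candidate is scored: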
align_metric, overlaps = self.get_box_metrics(pd_scores, pd_bboxes, gt_labels, gt_bboxes, mask_in_gts * mask_gt)
def get_box_metrics(self, pd_scores, pd_bboxes, gt_labels, gt_bboxes, mask_gt):
    """Compute alignment metric given predicted and ground truth bounding boxes."""
    na = pd_bboxes.shape[-2]
    mask_gt = mask_gt.bool()  # b, max_num_obj, h*w
    overlaps = torch.zeros([self.bs, self.n_max_boxes, na], dtype=pd_bboxes.dtype, device=pd_bboxes.device)
    bbox_scores = torch.zeros([self.bs, self.n_max_boxes, na], dtype=pd_scores.dtype, device=pd_scores.device)

    ind = torch.zeros([2, self.bs, self.n_max_boxes], dtype=torch.long)  # 2, b, max_num_obj
    ind[0] = torch.arange(end=self.bs).view(-1, 1).expand(-1, self.n_max_boxes)  # b, max_num_obj
    ind[1] = gt_labels.squeeze(-1)  # b, max_num_obj
    # Get the scores of each grid for each gt cls
    bbox_scores[mask_gt] = pd_scores[ind[0], :, ind[1]][mask_gt]  # b, max_num_obj, h*w

    # (b, max_num_obj, 1, 4), (b, 1, h*w, 4)
    pd_boxes = pd_bboxes.unsqueeze(1).expand(-1, self.n_max_boxes, -1, -1)[mask_gt]
    gt_boxes = gt_bboxes.unsqueeze(2).expand(-1, -1, na, -1)[mask_gt]
    # overlaps[mask_gt] = self.iou_calculation(gt_boxes, pd_boxes)
    overlaps[mask_gt] = self.iou_calculation(pd_boxes, gt_boxes)

    align_metric = bbox_scores.pow(self.alpha) * overlaps.pow(self.beta)
    return align_metric, overlaps
mask_in_gts * mask_gt intersects the anchors-inside-gt mask with the valid-gt mask, filtering out padded (invalid) gt boxes.
Here ind[0] holds the batch index of every gt and ind[1] holds the class index of every gt.
The line bbox_scores[mask_gt] = pd_scores[ind[0], :, ind[1]][mask_gt] uses these indices to pull out, for each gt, the predicted probability of that gt's class at every anchor.
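That advanced-indexing line is dense, so here is a scaled-down sketch of what it does (2 images, 4 anchors, 3 classes, 2 gts per image, all made up):

import torch

bs, na, nc, n_max = 2, 4, 3, 2
pd_scores = torch.rand(bs, na, nc)
gt_labels = torch.tensor([[[0], [2]], [[1], [1]]])  # (bs, n_max, 1)

ind = torch.zeros([2, bs, n_max], dtype=torch.long)
ind[0] = torch.arange(bs).view(-1, 1).expand(-1, n_max)  # batch index per gt
ind[1] = gt_labels.squeeze(-1)                           # class index per gt

scores = pd_scores[ind[0], :, ind[1]]  # advanced indexing -> (bs, n_max, na)
print(scores.shape)                    # torch.Size([2, 2, 4])
# scores[b, j] holds, at every anchor, the predicted score of gt j's class

Because the two advanced indices are separated by a slice, their broadcast shape (bs, n_max) comes first in the result and the anchor dimension follows, giving (bs, n_max, na).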
"""贴上我调试时的形状,方便理解:
mask_gt(16, 21, 8400)
pb_bboxes (16, 8400, 4) -> (16, 1, 8400, 4) -> (16, 21, 8400, 4) -> pd_boxes
gt_bboxes (16, 21, 4) -> (16, 21, 1, 4) -> (16, 21, 8400, 4) -> gt_boxes
pd_boxes[mask_gt].shape = gt_boxes[mask_gt].shape = (5709, 4)
到这步也就是说, 有5709个有效的掩码, 从而得到了5709个一一对应的预测框和真实框
还记得预测框怎么得到的吗,就是将预测得到的reg_max * 4解码成4维, 再加上/减去中心点
到这里还存在一个问题,那就是可能我们一个预测框对应了多个真实框,这个情况我们后续还会处理
"""
pd_boxes = pd_bboxes.unsqueeze(1).expand(-1, self.n_max_boxes, -1, -1)[mask_gt]
gt_boxes = gt_bboxes.unsqueeze(2).expand(-1, -1, na, -1)[mask_gt]
# Broadcasting pairs every predicted box with every gt box, and the mask_gt mask filters out
# the invalid pairs, leaving one-to-one matched predicted / gt boxes for the computations below.
overlaps[mask_gt] = self.iou_calculation(pd_boxes, gt_boxes) computes the IoU between each matched predicted box and gt box.
align_metric = bbox_scores.pow(self.alpha) * overlaps.pow(self.beta) combines the classification score and the box overlap into the final assignment metric.
mask_topk = self.select_topk_candidates(align_metric, topk_mask=mask_gt.expand(-1, -1, self.topk).bool()) keeps the topk anchors per gt according to that metric.
mask_pos = mask_topk * mask_in_gts * mask_gt
The resulting mask is true only where all three conditions hold: the anchor is in the topk by metric, its center lies inside the gt box, and the gt box is valid.
At this point mask_pos has shape (batch_size, num_max_boxes, h*w) (with h*w the total pixel count of the three feature maps): for each of the batch_size images, with at most num_max_boxes gt boxes each, it indicates (True/False) which anchors are assigned to which gt box. What remains is to resolve conflicts by keeping, for each anchor, only the gt with the highest overlap:
target_gt_idx, fg_mask, mask_pos = self.select_highest_overlaps(mask_pos, overlaps, self.n_max_boxes)
def select_highest_overlaps(mask_pos, overlaps, n_max_boxes):
    """
    If an anchor box is assigned to multiple gts, the one with the highest IoU will be selected.

    Args:
        mask_pos (Tensor): shape(b, n_max_boxes, h*w)
        overlaps (Tensor): shape(b, n_max_boxes, h*w)

    Returns:
        target_gt_idx (Tensor): shape(b, h*w)
        fg_mask (Tensor): shape(b, h*w)
        mask_pos (Tensor): shape(b, n_max_boxes, h*w)
    """
    # (b, n_max_boxes, h*w) -> (b, h*w)
    fg_mask = mask_pos.sum(-2)
    if fg_mask.max() > 1:  # one anchor is assigned to multiple gt_bboxes
        mask_multi_gts = (fg_mask.unsqueeze(1) > 1).expand(-1, n_max_boxes, -1)  # (b, n_max_boxes, h*w)
        max_overlaps_idx = overlaps.argmax(1)  # (b, h*w)

        is_max_overlaps = torch.zeros(mask_pos.shape, dtype=mask_pos.dtype, device=mask_pos.device)
        is_max_overlaps.scatter_(1, max_overlaps_idx.unsqueeze(1), 1)

        mask_pos = torch.where(mask_multi_gts, is_max_overlaps, mask_pos).float()  # (b, n_max_boxes, h*w)
        fg_mask = mask_pos.sum(-2)
    # Find each grid serve which gt(index)
    target_gt_idx = mask_pos.argmax(-2)  # (b, h*w)
    return target_gt_idx, fg_mask, mask_pos
fg_mask = mask_pos.sum(-2) sums over the second-to-last dimension, giving, for each anchor, the number of gt boxes it was matched to. Why dim -2 rather than the last dimension? Consider an example:

mask_pos = torch.tensor([[[True, False, True, True],    # first sample
                          [False, True, False, True],
                          [True, True, False, False]],
                         [[True, False, False, True],   # second sample
                          [False, False, True, True],
                          [True, True, True, False]]])
fg_mask = mask_pos.sum(-2)
# fg_mask = [[2, 2, 1, 2],   # first sample: how many gts marked each anchor as positive
#            [2, 1, 2, 2]]   # second sample

As you can see, summing over dim -2 is a column-wise sum, which naturally yields the number of positive assignments per anchor.
if fg_mask.max() > 1:  # one anchor is assigned to multiple gt_bboxes
    mask_multi_gts = (fg_mask.unsqueeze(1) > 1).expand(-1, n_max_boxes, -1)  # (b, n_max_boxes, h*w)
    max_overlaps_idx = overlaps.argmax(1)  # (b, h*w)

If some anchors are matched to more than one gt box, we take, for each such anchor, the index of the gt with the highest IoU; indexing with that (via scatter_ below) eliminates the one-predicted-box-to-many-gts situation. Note, however, that one gt box may still be matched to multiple predicted boxes.
To be continued...