This article provides a detailed analysis of the detection-head code of the YOLOv11 object detection algorithm.
Project: https://github.com/ultralytics/ultralytics
Location of the detection-head code: ultralytics/nn/modules/head.py
Detection-head code:
class Detect(nn.Module):
"""YOLO Detect head for detection models."""
dynamic = False # force grid reconstruction
export = False # export mode
format = None # export format
end2end = False # end2end
max_det = 300 # max_det
shape = None
anchors = torch.empty(0) # init
strides = torch.empty(0) # init
legacy = False # backward compatibility for v3/v5/v8/v9 models
def __init__(self, nc=80, ch=()):
"""Initializes the YOLO detection layer with specified number of classes and channels."""
super().__init__()
self.nc = nc # number of classes
self.nl = len(ch) # number of detection layers
self.reg_max = 16 # DFL channels (ch[0] // 16 to scale 4/8/12/16/20 for n/s/m/l/x)
self.no = nc + self.reg_max * 4 # number of outputs per anchor
self.stride = torch.zeros(self.nl) # strides computed during build
c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100)) # channels
self.cv2 = nn.ModuleList(
nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch
)
self.cv3 = (
nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
if self.legacy
else nn.ModuleList(
nn.Sequential(
nn.Sequential(DWConv(x, x, 3), Conv(x, c3, 1)),
nn.Sequential(DWConv(c3, c3, 3), Conv(c3, c3, 1)),
nn.Conv2d(c3, self.nc, 1),
)
for x in ch
)
)
self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()
if self.end2end:
self.one2one_cv2 = copy.deepcopy(self.cv2)
self.one2one_cv3 = copy.deepcopy(self.cv3)
def forward(self, x):
"""Concatenates and returns predicted bounding boxes and class probabilities."""
if self.end2end:
return self.forward_end2end(x)
for i in range(self.nl):
x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
if self.training: # Training path
return x
y = self._inference(x)
return y if self.export else (y, x)
def forward_end2end(self, x):
"""
Performs forward pass of the v10Detect module.
Args:
x (tensor): Input tensor.
Returns:
(dict, tensor): If not in training mode, returns a dictionary containing the outputs of both one2many and one2one detections.
If in training mode, returns a dictionary containing the outputs of one2many and one2one detections separately.
"""
x_detach = [xi.detach() for xi in x]
one2one = [
torch.cat((self.one2one_cv2[i](x_detach[i]), self.one2one_cv3[i](x_detach[i])), 1) for i in range(self.nl)
]
for i in range(self.nl):
x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
if self.training: # Training path
return {"one2many": x, "one2one": one2one}
y = self._inference(one2one)
y = self.postprocess(y.permute(0, 2, 1), self.max_det, self.nc)
return y if self.export else (y, {"one2many": x, "one2one": one2one})
def _inference(self, x):
"""Decode predicted bounding boxes and class probabilities based on multiple-level feature maps."""
# Inference path
shape = x[0].shape # BCHW
x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
if self.format != "imx" and (self.dynamic or self.shape != shape):
self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
self.shape = shape
if self.export and self.format in {"saved_model", "pb", "tflite", "edgetpu", "tfjs"}: # avoid TF FlexSplitV ops
box = x_cat[:, : self.reg_max * 4]
cls = x_cat[:, self.reg_max * 4 :]
else:
box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)
if self.export and self.format in {"tflite", "edgetpu"}:
# Precompute normalization factor to increase numerical stability
# See https://github.com/ultralytics/ultralytics/issues/7371
grid_h = shape[2]
grid_w = shape[3]
grid_size = torch.tensor([grid_w, grid_h, grid_w, grid_h], device=box.device).reshape(1, 4, 1)
norm = self.strides / (self.stride[0] * grid_size)
dbox = self.decode_bboxes(self.dfl(box) * norm, self.anchors.unsqueeze(0) * norm[:, :2])
elif self.export and self.format == "imx":
dbox = self.decode_bboxes(
self.dfl(box) * self.strides, self.anchors.unsqueeze(0) * self.strides, xywh=False
)
return dbox.transpose(1, 2), cls.sigmoid().permute(0, 2, 1)
else:
dbox = self.decode_bboxes(self.dfl(box), self.anchors.unsqueeze(0)) * self.strides
return torch.cat((dbox, cls.sigmoid()), 1)
def bias_init(self):
"""Initialize Detect() biases, WARNING: requires stride availability."""
m = self # self.model[-1] # Detect() module
# cf = torch.bincount(torch.tensor(np.concatenate(dataset.labels, 0)[:, 0]).long(), minlength=nc) + 1
# ncf = math.log(0.6 / (m.nc - 0.999999)) if cf is None else torch.log(cf / cf.sum()) # nominal class frequency
for a, b, s in zip(m.cv2, m.cv3, m.stride): # from
a[-1].bias.data[:] = 1.0 # box
b[-1].bias.data[: m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (.01 objects, 80 classes, 640 img)
if self.end2end:
for a, b, s in zip(m.one2one_cv2, m.one2one_cv3, m.stride): # from
a[-1].bias.data[:] = 1.0 # box
b[-1].bias.data[: m.nc] = math.log(5 / m.nc / (640 / s) ** 2) # cls (.01 objects, 80 classes, 640 img)
def decode_bboxes(self, bboxes, anchors, xywh=True):
"""Decode bounding boxes."""
return dist2bbox(bboxes, anchors, xywh=xywh and (not self.end2end), dim=1)
@staticmethod
def postprocess(preds: torch.Tensor, max_det: int, nc: int = 80):
"""
Post-processes YOLO model predictions.
Args:
preds (torch.Tensor): Raw predictions with shape (batch_size, num_anchors, 4 + nc) with last dimension
format [x, y, w, h, class_probs].
max_det (int): Maximum detections per image.
nc (int, optional): Number of classes. Default: 80.
Returns:
(torch.Tensor): Processed predictions with shape (batch_size, min(max_det, num_anchors), 6) and last
dimension format [x, y, w, h, max_class_prob, class_index].
"""
batch_size, anchors, _ = preds.shape # i.e. shape(16,8400,84)
boxes, scores = preds.split([4, nc], dim=-1)
index = scores.amax(dim=-1).topk(min(max_det, anchors))[1].unsqueeze(-1)
boxes = boxes.gather(dim=1, index=index.repeat(1, 1, 4))
scores = scores.gather(dim=1, index=index.repeat(1, 1, nc))
scores, index = scores.flatten(1).topk(min(max_det, anchors))
i = torch.arange(batch_size)[..., None] # batch indices
return torch.cat([boxes[i, index // nc], scores[..., None], (index % nc)[..., None].float()], dim=-1)
Detailed analysis of the code:
1. Class initialization (__init__ method)
def __init__(self, nc=80, ch=()):
    """Initializes the YOLO detection layer with specified number of classes and channels."""
    super().__init__()
    self.nc = nc  # number of classes
    self.nl = len(ch)  # number of detection layers
    self.reg_max = 16  # DFL channels (ch[0] // 16 to scale 4/8/12/16/20 for n/s/m/l/x)
    self.no = nc + self.reg_max * 4  # number of outputs per anchor
    self.stride = torch.zeros(self.nl)  # strides computed during build
    c2, c3 = max((16, ch[0] // 4, self.reg_max * 4)), max(ch[0], min(self.nc, 100))  # channels
    self.cv2 = nn.ModuleList(
        nn.Sequential(Conv(x, c2, 3), Conv(c2, c2, 3), nn.Conv2d(c2, 4 * self.reg_max, 1)) for x in ch
    )
    self.cv3 = (
        nn.ModuleList(nn.Sequential(Conv(x, c3, 3), Conv(c3, c3, 3), nn.Conv2d(c3, self.nc, 1)) for x in ch)
        if self.legacy
        else nn.ModuleList(
            nn.Sequential(
                nn.Sequential(DWConv(x, x, 3), Conv(x, c3, 1)),
                nn.Sequential(DWConv(c3, c3, 3), Conv(c3, c3, 1)),
                nn.Conv2d(c3, self.nc, 1),
            )
            for x in ch
        )
    )
    self.dfl = DFL(self.reg_max) if self.reg_max > 1 else nn.Identity()
- nc: number of classes, default 80.
- ch: tuple of input feature-map channel counts; its length gives the number of detection layers (nl).
- reg_max: number of DFL (Distribution Focal Loss) bins, default 16. Each side of a box is predicted as a distribution over reg_max discrete distances rather than a single value.
- no: number of outputs per anchor, equal to the class count plus the box-regression channels (nc + reg_max * 4).
- stride: stride of each detection layer, computed when the model is built.
- cv2 and cv3: two ModuleLists that produce the features for box regression and class prediction, respectively (cv2 feeds box regression, cv3 feeds classification).
- If legacy is True, cv3 uses plain convolutions; otherwise it uses depthwise convolutions (DWConv) followed by 1×1 convolutions, which cuts parameters and computation.
- dfl: module that decodes the box distributions; DFL if reg_max > 1, otherwise nn.Identity. A quick sanity check of the derived dimensions follows below.
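To make the derived dimensions concrete, the following sketch (my own illustration; the P3/P4/P5 channel counts are hypothetical values for a small model) evaluates no, c2, and c3:
nc = 80  # number of classes
ch = (64, 128, 256)  # hypothetical P3/P4/P5 channels
reg_max = 16
no = nc + reg_max * 4  # 80 + 64 = 144 outputs per anchor
c2 = max(16, ch[0] // 4, reg_max * 4)  # max(16, 16, 64) = 64 box-branch channels
c3 = max(ch[0], min(nc, 100))  # max(64, 80) = 80 class-branch channels
print(no, c2, c3)  # 144 64 80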
2. Forward pass (forward method)
def forward(self, x):
    """Concatenates and returns predicted bounding boxes and class probabilities."""
    if self.end2end:
        return self.forward_end2end(x)
    for i in range(self.nl):
        x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
    if self.training:  # Training path
        return x
    y = self._inference(x)
    return y if self.export else (y, x)
- If end2end is True, forward_end2end is called instead.
- Otherwise, each detection layer's feature map x[i] is passed through cv2[i] (box regression) and cv3[i] (classification), and the two outputs are concatenated along the channel dimension.
- In training mode the concatenated per-level features are returned as-is; in inference mode _inference decodes them, as the usage sketch below illustrates.
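A minimal usage sketch (assuming the ultralytics package is installed; the feature-map sizes below are hypothetical and correspond to a 640×640 input with strides 8/16/32):
import torch
from ultralytics.nn.modules.head import Detect

m = Detect(nc=80, ch=(64, 128, 256))
m.stride = torch.tensor([8.0, 16.0, 32.0])  # normally set during model build

feats = [torch.randn(1, 64, 80, 80), torch.randn(1, 128, 40, 40), torch.randn(1, 256, 20, 20)]
m.train()
out = m([f.clone() for f in feats])
print([tuple(o.shape) for o in out])  # [(1, 144, 80, 80), (1, 144, 40, 40), (1, 144, 20, 20)]

m.eval()
y, raw = m([f.clone() for f in feats])
print(y.shape)  # torch.Size([1, 84, 8400]); 84 = 4 box coordinates + 80 class scores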
3. Inference (_inference method)
def _inference(self, x):
    """Decode predicted bounding boxes and class probabilities based on multiple-level feature maps."""
    shape = x[0].shape  # BCHW
    x_cat = torch.cat([xi.view(shape[0], self.no, -1) for xi in x], 2)
    if self.format != "imx" and (self.dynamic or self.shape != shape):
        self.anchors, self.strides = (x.transpose(0, 1) for x in make_anchors(x, self.stride, 0.5))
        self.shape = shape
    if self.export and self.format in {"saved_model", "pb", "tflite", "edgetpu", "tfjs"}:  # avoid TF FlexSplitV ops
        box = x_cat[:, : self.reg_max * 4]
        cls = x_cat[:, self.reg_max * 4 :]
    else:
        box, cls = x_cat.split((self.reg_max * 4, self.nc), 1)
    if self.export and self.format in {"tflite", "edgetpu"}:
        # Precompute normalization factor to increase numerical stability
        grid_h = shape[2]
        grid_w = shape[3]
        grid_size = torch.tensor([grid_w, grid_h, grid_w, grid_h], device=box.device).reshape(1, 4, 1)
        norm = self.strides / (self.stride[0] * grid_size)
        dbox = self.decode_bboxes(self.dfl(box) * norm, self.anchors.unsqueeze(0) * norm[:, :2])
    elif self.export and self.format == "imx":
        dbox = self.decode_bboxes(
            self.dfl(box) * self.strides, self.anchors.unsqueeze(0) * self.strides, xywh=False
        )
        return dbox.transpose(1, 2), cls.sigmoid().permute(0, 2, 1)
    else:
        dbox = self.decode_bboxes(self.dfl(box), self.anchors.unsqueeze(0)) * self.strides
        return torch.cat((dbox, cls.sigmoid()), 1)
- The feature maps from all detection layers are flattened and concatenated into a single tensor x_cat of shape (B, no, total_anchors); anchor points and strides are rebuilt whenever the input shape changes (or dynamic is set).
- In export mode, the box/class split and the normalization are adjusted per target format (tflite, edgetpu, imx, ...) for operator compatibility and numerical stability.
- decode_bboxes is called to decode the bounding boxes; before that, the DFL module converts each side's bin distribution into an expected distance (see the sketch below).
- Finally, the decoded boxes and the sigmoid class probabilities are concatenated into the detection output.
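The DFL step turns each box side's reg_max-bin distribution into an expected distance. A functionally equivalent sketch of that computation (the library's DFL class implements it as a fixed-weight 1×1 convolution instead):
import torch

def dfl_decode(box, reg_max=16):
    # box: (B, 4*reg_max, A) raw logits -> (B, 4, A) expected distances in grid units
    b, _, a = box.shape
    probs = box.view(b, 4, reg_max, a).softmax(dim=2)  # per-side distribution over bins
    bins = torch.arange(reg_max, dtype=box.dtype)  # bin values 0..reg_max-1
    return (probs * bins.view(1, 1, reg_max, 1)).sum(dim=2)  # expectation over bins

dist = dfl_decode(torch.randn(1, 64, 8400))
print(dist.shape)  # torch.Size([1, 4, 8400])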
4. Bounding-box decoding (decode_bboxes method)
def decode_bboxes(self, bboxes, anchors, xywh=True):
"""Decode bounding boxes."""
return dist2bbox(bboxes, anchors, xywh=xywh and (not self.end2end), dim=1)
- dist2bbox converts the predicted per-side distances into actual box coordinates around the anchor points.
- The xywh parameter selects between (x, y, w, h) and (x1, y1, x2, y2) output; in end2end mode, corner format is forced. A simplified sketch of this conversion follows.
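A simplified sketch of what dist2bbox computes (mirroring the library's logic; dist holds left/top/right/bottom distances from each anchor point):
import torch

def dist2bbox_sketch(dist, anchors, xywh=True, dim=1):
    # dist: (B, 4, A) ltrb distances; anchors: (B, 2, A) anchor-point centers
    lt, rb = dist.chunk(2, dim)  # left/top and right/bottom distances
    x1y1 = anchors - lt  # top-left corner
    x2y2 = anchors + rb  # bottom-right corner
    if xywh:
        c_xy = (x1y1 + x2y2) / 2  # box center
        wh = x2y2 - x1y1  # width and height
        return torch.cat((c_xy, wh), dim)
    return torch.cat((x1y1, x2y2), dim)  # corner format for end2end/imx paths

box = dist2bbox_sketch(torch.rand(1, 4, 8400), torch.rand(1, 2, 8400))
print(box.shape)  # torch.Size([1, 4, 8400])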
5. Post-processing (postprocess method)
@staticmethod
def postprocess(preds: torch.Tensor, max_det: int, nc: int = 80):
    """
    Post-processes YOLO model predictions.
    Args:
        preds (torch.Tensor): Raw predictions with shape (batch_size, num_anchors, 4 + nc) with last dimension
            format [x, y, w, h, class_probs].
        max_det (int): Maximum detections per image.
        nc (int, optional): Number of classes. Default: 80.
    Returns:
        (torch.Tensor): Processed predictions with shape (batch_size, min(max_det, num_anchors), 6) and last
            dimension format [x, y, w, h, max_class_prob, class_index].
    """
    batch_size, anchors, _ = preds.shape  # i.e. shape(16,8400,84)
    boxes, scores = preds.split([4, nc], dim=-1)
    index = scores.amax(dim=-1).topk(min(max_det, anchors))[1].unsqueeze(-1)
    boxes = boxes.gather(dim=1, index=index.repeat(1, 1, 4))
    scores = scores.gather(dim=1, index=index.repeat(1, 1, nc))
    scores, index = scores.flatten(1).topk(min(max_det, anchors))
    i = torch.arange(batch_size)[..., None]  # batch indices
    return torch.cat([boxes[i, index // nc], scores[..., None], (index % nc)[..., None].float()], dim=-1)
- The predictions are split into boxes and per-class scores.
- Selection happens in two topk stages: first the max_det anchors with the highest per-anchor maximum score are kept, then the top max_det (anchor, class) pairs are picked from the flattened scores.
- The result packs the box coordinates, the best class probability, and the class index for each kept detection; a usage sketch follows.
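A quick usage sketch of the static method (assuming the ultralytics package is installed; the inputs are random placeholders):
import torch
from ultralytics.nn.modules.head import Detect

preds = torch.rand(2, 8400, 84)  # (batch, anchors, 4 box coords + 80 class scores)
out = Detect.postprocess(preds, max_det=300, nc=80)
print(out.shape)  # torch.Size([2, 300, 6]): x, y, w, h, score, class index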
6. Bias initialization (bias_init method)
def bias_init(self):
    """Initialize Detect() biases, WARNING: requires stride availability."""
    m = self  # self.model[-1]  # Detect() module
    for a, b, s in zip(m.cv2, m.cv3, m.stride):  # from
        a[-1].bias.data[:] = 1.0  # box
        b[-1].bias.data[: m.nc] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)
    if self.end2end:
        for a, b, s in zip(m.one2one_cv2, m.one2one_cv3, m.stride):  # from
            a[-1].bias.data[:] = 1.0  # box
            b[-1].bias.data[: m.nc] = math.log(5 / m.nc / (640 / s) ** 2)  # cls (.01 objects, 80 classes, 640 img)
- Initializes the biases of the final box-regression and classification convolutions.
- The box-regression biases are set to 1.0.
- The classification biases are derived from the class count and input size so that every class starts with a very low confidence; the constant 5 encodes an assumed prior of roughly 5 objects per 640×640 image spread across all cells and classes. The sketch below makes the implied prior concrete.
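To see the prior this encodes, the following sketch (my own illustration, not from the repository) evaluates the bias and the resulting initial sigmoid probability per stride:
import math

nc = 80
for s in (8, 16, 32):  # detection-layer strides
    bias = math.log(5 / nc / (640 / s) ** 2)
    prob = 1 / (1 + math.exp(-bias))  # sigmoid of the initial bias
    print(f"stride {s}: bias {bias:.2f}, initial class prob {prob:.1e}")
# stride 8: bias -11.54, initial class prob 9.8e-06
# stride 16: bias -10.15, initial class prob 3.9e-05
# stride 32: bias -8.76, initial class prob 1.6e-04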
7. End-to-End inference (forward_end2end method)
def forward_end2end(self, x):
    """
    Performs forward pass of the v10Detect module.
    Args:
        x (tensor): Input tensor.
    Returns:
        (dict, tensor): If not in training mode, returns a dictionary containing the outputs of both one2many and one2one detections.
            If in training mode, returns a dictionary containing the outputs of one2many and one2one detections separately.
    """
    x_detach = [xi.detach() for xi in x]
    one2one = [
        torch.cat((self.one2one_cv2[i](x_detach[i]), self.one2one_cv3[i](x_detach[i])), 1) for i in range(self.nl)
    ]
    for i in range(self.nl):
        x[i] = torch.cat((self.cv2[i](x[i]), self.cv3[i](x[i])), 1)
    if self.training:  # Training path
        return {"one2many": x, "one2one": one2one}
    y = self._inference(one2one)
    y = self.postprocess(y.permute(0, 2, 1), self.max_det, self.nc)
    return y if self.export else (y, {"one2many": x, "one2one": one2one})
- In end-to-end mode the model runs a one-to-many and a one-to-one detection branch in parallel; the one-to-one branch operates on detached features, so its loss updates only the one2one heads and does not back-propagate into the backbone.
- In training mode, the outputs of both branches are returned in a dict.
- In inference mode, the one-to-one output is decoded and post-processed into the top max_det detections, so no NMS is required (the NMS-free design introduced with YOLOv10); a usage sketch follows.
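A minimal sketch of enabling this mode (assuming the ultralytics package is installed; note that end2end is a class attribute read in __init__, so it must be set before the head is constructed):
import torch
from ultralytics.nn.modules.head import Detect

Detect.end2end = True  # class attribute; must be True before __init__ builds the one2one heads
m = Detect(nc=80, ch=(64, 128, 256))
m.stride = torch.tensor([8.0, 16.0, 32.0])  # normally set during model build

feats = [torch.randn(1, 64, 80, 80), torch.randn(1, 128, 40, 40), torch.randn(1, 256, 20, 20)]
m.eval()
y, branches = m(feats)
print(y.shape)  # torch.Size([1, 300, 6]): top max_det detections, no NMS needed
print(branches.keys())  # dict_keys(['one2many', 'one2one'])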