[Code Walkthrough] Tracing the data flow through the LOCA forward function

    def forward(self, x, bboxes):
        num_objects = bboxes.size(1) if not self.zero_shot else self.num_objects
        # backbone
        backbone_features = self.backbone(x)
        # backbone_features.shape: [4, 3584, 64, 64]
        backbone_features = self.ccff(backbone_features)
        # backbone_features.shape after ccff: [4, 3584, 64, 64]
        # prepare the encoder input
        src = self.input_proj(backbone_features)
        # src.shape: [4, 256, 64, 64]
        bs, c, h, w = src.size()
        pos_emb = self.pos_emb(bs, h, w, src.device).flatten(2).permute(2, 0, 1)
        src = src.flatten(2).permute(2, 0, 1)

        # push through the encoder
        if self.num_encoder_layers > 0:
            image_features = self.encoder(src, pos_emb, src_key_padding_mask=None, src_mask=None)
        else:
            image_features = src

        # prepare OPE input
        f_e = image_features.permute(1, 2, 0).reshape(-1, self.emb_dim, h, w)

        all_prototypes = self.ope(f_e, pos_emb, bboxes)

        outputs = list()
        # Question: what exactly is all_prototypes here?
        for i in range(all_prototypes.size(0)):
            prototypes = all_prototypes[i, ...].permute(1, 0, 2).reshape(
                bs, num_objects, self.kernel_dim, self.kernel_dim, -1
            ).permute(0, 1, 4, 2, 3).flatten(0, 2)[:, None, ...]

            response_maps = F.conv2d(
                torch.cat([f_e for _ in range(num_objects)], dim=1).flatten(0, 1).unsqueeze(0),
                prototypes,
                bias=None,
                padding=self.kernel_dim // 2,
                groups=prototypes.size(0)
            ).view(
                bs, num_objects, self.emb_dim, h, w
            ).max(dim=1)[0]

            # send through regression heads
            if i == all_prototypes.size(0) - 1:
                predicted_dmaps = self.regression_head(response_maps)
            else:
                predicted_dmaps = self.aux_heads[i](response_maps)
            outputs.append(predicted_dmaps)

        return outputs[-1], outputs[:-1]
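
The flatten(2).permute(2, 0, 1) calls above convert the projected feature map from [bs, c, h, w] into the [h*w, bs, c] token sequence that the transformer encoder expects, and permute(1, 2, 0).reshape(...) undoes the conversion before the OPE module. Below is a minimal standalone sketch of this round trip, with assumed dummy sizes matching the traced shapes:

    import torch

    bs, c, h, w = 4, 256, 64, 64        # assumed dummy sizes, matching the traced shapes
    src = torch.randn(bs, c, h, w)      # stand-in for the projected backbone features

    # [bs, c, h, w] -> [bs, c, h*w] -> [h*w, bs, c]: one token per spatial position
    tokens = src.flatten(2).permute(2, 0, 1)
    print(tokens.shape)                 # torch.Size([4096, 4, 256])

    # after the encoder, the inverse permute/reshape restores the spatial feature map
    f_e = tokens.permute(1, 2, 0).reshape(bs, c, h, w)
    print(f_e.shape)                    # torch.Size([4, 256, 64, 64])

The same reshaping appears again in the annotated version of the forward pass below.
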
    def forward(self, x, bboxes):
        # determine the number of objects: outside the zero-shot setting it is taken from the number of bboxes
        num_objects = bboxes.size(1) if not self.zero_shot else self.num_objects
        # backbone
        # extract features with the backbone network
        backbone_features = self.backbone(x)
        # prepare the encoder input
        src = self.input_proj(backbone_features)
        # get the size of the feature map
        bs, c, h, w = src.size()
        # generate the positional embedding and reshape it to match the encoder input
        pos_emb = self.pos_emb(bs, h, w, src.device).flatten(2).permute(2, 0, 1)
        # reshape src in the same way
        src = src.flatten(2).permute(2, 0, 1)

        # push through the encoder
        if self.num_encoder_layers > 0:
            image_features = self.encoder(src, pos_emb, src_key_padding_mask=None, src_mask=None)
        else:
            image_features = src

        # prepare the input to the OPE (object prototype extraction) module
        f_e = image_features.permute(1, 2, 0).reshape(-1, self.emb_dim, h, w)

        # call the OPE module to generate all prototypes
        # Question: what exactly is a prototype?
        all_prototypes = self.ope(f_e, pos_emb, bboxes)

        # initialize the output list
        outputs = list()

        # iterate over all prototype sets
        # Question: what exactly is all_prototypes here?
        for i in range(all_prototypes.size(0)):
            # reshape this iteration's prototypes into depthwise conv kernels
            prototypes = all_prototypes[i, ...].permute(1, 0, 2).reshape(
                bs, num_objects, self.kernel_dim, self.kernel_dim, -1
            ).permute(0, 1, 4, 2, 3).flatten(0, 2)[:, None, ...]

            # correlate the prototypes with the query features to produce response maps
            # prototypes: the conv kernels built above
            # query features: [f_e for _ in range(num_objects)]
            response_maps = F.conv2d(
                torch.cat([f_e for _ in range(num_objects)], dim=1).flatten(0, 1).unsqueeze(0),
                prototypes,
                bias=None,
                padding=self.kernel_dim // 2,
                groups=prototypes.size(0)
            ).view(
                bs, num_objects, self.emb_dim, h, w
            ).max(dim=1)[0]

            # send through regression heads
            if i == all_prototypes.size(0) - 1:
                predicted_dmaps = self.regression_head(response_maps)
            else:
                predicted_dmaps = self.aux_heads[i](response_maps)
            # append the predicted density map to the output list
            outputs.append(predicted_dmaps)

        # return the final predicted density map and the intermediate auxiliary outputs
        return outputs[-1], outputs[:-1]
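
The questions left in the comments (what exactly is a prototype / all_prototypes?) can be answered from the shapes alone: judging from the reshapes and the auxiliary heads, each slice all_prototypes[i] corresponds to one OPE iteration and has shape [num_objects * kernel_dim * kernel_dim, bs, emb_dim], i.e. per image it carries num_objects prototypes of size kernel_dim x kernel_dim x emb_dim. The loop turns them into depthwise convolution kernels and slides them over the image features. Below is a minimal standalone sketch of that correlation step with assumed dummy sizes (bs=2, num_objects=3, emb_dim=256, kernel_dim=3, h=w=64); the names prototypes_i and queries are illustrative, not from the original code:

    import torch
    import torch.nn.functional as F

    # assumed dummy sizes, not the real config values
    bs, num_objects, emb_dim, kernel_dim, h, w = 2, 3, 256, 3, 64, 64

    # stand-in for the encoder output reshaped back into a feature map
    f_e = torch.randn(bs, emb_dim, h, w)

    # stand-in for one OPE iteration: [num_objects * k * k, bs, emb_dim]
    prototypes_i = torch.randn(num_objects * kernel_dim * kernel_dim, bs, emb_dim)

    # reshape the prototypes into depthwise conv kernels: [bs * num_objects * emb_dim, 1, k, k]
    prototypes = prototypes_i.permute(1, 0, 2).reshape(
        bs, num_objects, kernel_dim, kernel_dim, -1
    ).permute(0, 1, 4, 2, 3).flatten(0, 2)[:, None, ...]
    print(prototypes.shape)  # torch.Size([1536, 1, 3, 3])

    # tile the feature map once per object, then fold (batch, object, channel) into the group axis
    queries = torch.cat([f_e for _ in range(num_objects)], dim=1).flatten(0, 1).unsqueeze(0)
    print(queries.shape)     # torch.Size([1, 1536, 64, 64])

    # grouped (depthwise) correlation: each k x k kernel only sees its own
    # (batch, object, channel) slice of the tiled feature map
    response_maps = F.conv2d(
        queries, prototypes, bias=None,
        padding=kernel_dim // 2,
        groups=prototypes.size(0),
    ).view(bs, num_objects, emb_dim, h, w).max(dim=1)[0]
    print(response_maps.shape)  # torch.Size([2, 256, 64, 64])

The max over the object dimension keeps, at every channel and spatial location, the strongest response across the exemplar prototypes; the regression heads then turn this [bs, emb_dim, h, w] tensor into a density map (the last iteration goes through regression_head, earlier ones through the aux_heads used for auxiliary supervision).
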
