PETR: Position Embedding Transformation for Multi-View 3D Object Detection
代码链接
https://github.com/exiawsh/StreamPETR
这是 petr3d.py 中 task head 的入口:
def forward_pts_train(self,
                      gt_bboxes_3d,
                      gt_labels_3d,
                      gt_bboxes,
                      gt_labels,
                      img_metas,
                      centers2d,
                      depths,
                      requires_grad=True,
                      return_losses=False,
                      **data):
    """Forward function for the point-cloud (3D detection) branch.

    Args:
        gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
            boxes for each sample.
        gt_labels_3d (list[torch.Tensor]): Ground truth labels for
            boxes of each sample.
        gt_bboxes (list[torch.Tensor]): 2D ground-truth boxes for the
            auxiliary 2D ROI head.
        gt_labels (list[torch.Tensor]): Labels for ``gt_bboxes``.
        img_metas (list[dict]): Meta information of samples.
        centers2d: 2D box centers used by the auxiliary 2D loss.
        depths: per-box depth targets used by the auxiliary 2D loss.
        requires_grad (bool): when False, run the head under ``no_grad``
            in eval mode (e.g. for the memory/streaming branch).
        return_losses (bool): when True, compute and return losses.

    Returns:
        dict | None: Losses of each branch, or None when
        ``return_losses`` is False.
    """
    # Build an (x, y) meshgrid over the feature map, shape (bs*n, H, W, 2);
    # preparation for constructing the camera frustum.
    location = self.prepare_location(img_metas, **data)
    if not requires_grad:
        self.eval()
        with torch.no_grad():
            outs = self.pts_bbox_head(location, img_metas, None, **data)
        self.train()
    else:
        # During training, use the top-k 2D predictions as auxiliary guidance.
        outs_roi = self.forward_roi_head(location, **data)
        topk_indexes = outs_roi['topk_indexes']
        # Feed the StreamPETR head.
        outs = self.pts_bbox_head(location, img_metas, topk_indexes, **data)
    if return_losses:
        loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs]
        losses = self.pts_bbox_head.loss(*loss_inputs)
        if self.with_img_roi_head:
            # NOTE(review): `outs_roi` only exists on the requires_grad=True
            # path; return_losses=True with requires_grad=False would raise
            # NameError here — confirm callers never combine those flags.
            loss2d_inputs = [gt_bboxes, gt_labels, centers2d, depths, outs_roi, img_metas]
            losses2d = self.img_roi_head.loss(*loss2d_inputs)
            losses.update(losses2d)
        return losses
    else:
        return None
# Build the normalized image-plane meshgrid shared by every camera view.
def prepare_location(self, img_metas, **data):
    """Return per-view normalized 2D grid locations, shape (bs*n, H, W, 2)."""
    padded_h, padded_w, _ = img_metas[0]['pad_shape'][0]
    feats = data['img_feats']
    num_views = feats.shape[0] * feats.shape[1]
    flat = feats.flatten(0, 1)
    # One shared (H, W, 2) grid, replicated for every (batch, camera) pair.
    grid = locations(flat, self.stride, padded_h, padded_w)
    return grid.unsqueeze(0).repeat(num_views, 1, 1, 1)
#misc.py
def locations(features, stride, pad_h, pad_w):
    """Compute normalized (x, y) pixel-center locations for a feature map.

    Args:
        features (Tensor): (N, C, H, W) feature map; only H, W and the
            device are used.
        stride (int): down-sampling stride of the feature map relative to
            the padded input image.
        pad_h (int): padded image height, normalizes y into [0, 1).
        pad_w (int): padded image width, normalizes x into [0, 1).

    Returns:
        Tensor: (H, W, 2) locations, last dim ordered (x, y).
    """
    h, w = features.size()[-2:]
    device = features.device
    # Centers of the stride-sized cells: stride*i + stride//2, normalized
    # by the padded image size so values lie in [0, 1).
    shifts_x = (torch.arange(
        0, w * stride, step=stride,
        dtype=torch.float32, device=device
    ) + stride // 2) / pad_w
    shifts_y = (torch.arange(
        0, h * stride, step=stride,
        dtype=torch.float32, device=device
    ) + stride // 2) / pad_h
    # Explicit indexing='ij' keeps the historical behavior and silences the
    # torch.meshgrid deprecation warning.
    shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x, indexing='ij')
    locations = torch.stack((shift_x.reshape(-1), shift_y.reshape(-1)), dim=1)
    return locations.reshape(h, w, 2)
PETR head的主要流程
def forward(self, memory_center, img_metas, topk_indexes, **data):
    """Forward function of the StreamPETR head.

    Args:
        memory_center: normalized 2D grid locations built from the meshgrid.
        img_metas (list[dict]): Meta information of samples.
        topk_indexes: indexes of the top-k image tokens selected by the
            2D ROI head (None disables the selection).
        data: must contain 'img_feats', a (B, N, C, H, W) tensor.

    Returns:
        dict with:
            all_cls_scores (Tensor): classification outputs,
                shape [nb_dec, bs, num_query, cls_out_channels]. Note
                cls_out_channels should include background.
            all_bbox_preds (Tensor): regression outputs with normalized
                coordinate format (cx, cy, w, l, cz, h, theta, vx, vy),
                shape [nb_dec, bs, num_query, 9].
            dn_mask_dict: denoising bookkeeping dict, or None.
    """
    # Flatten image features to (B, N*H*W, C) tokens.
    x = data['img_feats']
    B, N, C, H, W = x.shape
    num_tokens = N * H * W
    memory = x.permute(0, 1, 3, 4, 2).reshape(B, num_tokens, C)
    # Keep only the top-k tokens chosen by the 2D head (focal-style sampling).
    memory = topk_gather(memory, topk_indexes)
    # position_embeding: extend the 2D grid with a depth axis (self.coords_d)
    # — similar in spirit to LSS — project through the camera intrinsics and
    # extrinsics into the world frame, normalize / inverse-normalize, then
    # refine with the position encoder. Only the channel dim changes:
    # pos_embed is (B*N, embed_dims, H, W).
    # cone: camera frustum info — presumably carries the intrinsics; TODO confirm.
    pos_embed, cone = self.position_embeding(data, memory_center, topk_indexes, img_metas)
    # Embed the 2D image features.
    memory = self.memory_embed(memory)
    # spatial_alignment as in Focal-PETR.
    memory = self.spatial_alignment(memory, cone)
    # SE-style layer fusing the 3D position embedding with the 2D features.
    pos_embed = self.featurized_pe(pos_embed, memory)
    reference_points = self.reference_points.weight
    # prepare_for_dn pads denoising queries in front; reference_points is (.., 3).
    reference_points, attn_mask, mask_dict = self.prepare_for_dn(B, reference_points, img_metas)
    # query: sin/cos-encode the 3D reference points (pos2posemb3d) so the
    # queries carry positional information.
    query_pos = self.query_embedding(pos2posemb3d(reference_points))
    # transformer is PETRTemporalTransformer:
    #   q: query_pos — sin/cos-encoded 3D reference points;
    #   k: pos_embed — 3D position-encoder output fused with the 2D features;
    #   v: memory — the 2D feature tokens.
    outs_dec, _ = self.transformer(memory, None, query_pos, pos_embed, attn_mask)
    outs_dec = torch.nan_to_num(outs_dec)
    outputs_classes = []
    outputs_coords = []
    for lvl in range(outs_dec.shape[0]):
        reference = inverse_sigmoid(reference_points.clone())
        assert reference.shape[-1] == 3
        outputs_class = self.cls_branches[lvl](outs_dec[lvl])
        tmp = self.reg_branches[lvl](outs_dec[lvl])
        # The predicted 3D center is a residual on top of the reference points.
        tmp[..., 0:3] += reference[..., 0:3]
        tmp[..., 0:3] = tmp[..., 0:3].sigmoid()
        outputs_coord = tmp
        outputs_classes.append(outputs_class)
        outputs_coords.append(outputs_coord)
    all_cls_scores = torch.stack(outputs_classes)
    all_bbox_preds = torch.stack(outputs_coords)
    # De-normalize centers from [0, 1] back into the point-cloud range.
    all_bbox_preds[..., 0:3] = (all_bbox_preds[..., 0:3] * (self.pc_range[3:6] - self.pc_range[0:3]) + self.pc_range[0:3])
    if mask_dict and mask_dict['pad_size'] > 0:
        # Split off the denoising (DN) queries that were padded in front.
        output_known_class = all_cls_scores[:, :, :mask_dict['pad_size'], :]
        output_known_coord = all_bbox_preds[:, :, :mask_dict['pad_size'], :]
        outputs_class = all_cls_scores[:, :, mask_dict['pad_size']:, :]
        outputs_coord = all_bbox_preds[:, :, mask_dict['pad_size']:, :]
        mask_dict['output_known_lbs_bboxes']=(output_known_class, output_known_coord)
        outs = {
            'all_cls_scores': outputs_class,
            'all_bbox_preds': outputs_coord,
            'dn_mask_dict':mask_dict,
        }
    else:
        outs = {
            'all_cls_scores': all_cls_scores,
            'all_bbox_preds': all_bbox_preds,
            'dn_mask_dict':None,
        }
    return outs
LSS Lift, Splat, Shoot: Encoding Images from Arbitrary Camera Rigs by Implicitly Unprojecting
BEV 方法的鼻祖,可能需要你了解。
论文链接 https://arxiv.org/pdf/2008.05711v1.pdf
代码链接 https://github.com/nv-tlabs/lift-splat-shoot
PETR系列的转换方式与LSS是有相似度的。可以一起理解;
图二-视锥
def get_voxels(self, x, rots, trans, intrins, post_rots, post_trans):
    """Lift image features into the BEV grid (LSS 'lift' + 'splat').

    Returns the BEV feature map produced by voxel_pooling.
    """
    # Frustum points mapped into the ego/lidar frame via intrinsics/extrinsics.
    geom = self.get_geometry(rots, trans, intrins, post_rots, post_trans)
    # Depth-augmented image features: a softmax depth distribution combined
    # with the 2D features via a broadcasted outer product.
    cam_feats = self.get_cam_feats(x)
    # Splat the features onto the BEV grid.
    return self.voxel_pooling(geom, cam_feats)
# Main entry of the LSS task head.
def forward(self, x, rots, trans, intrins, post_rots, post_trans):
    """Lift image features to BEV and run the BEV encoder.

    Args:
        x: image feature tensor.
        rots, trans: camera extrinsics (rotation, translation).
        intrins: camera intrinsics.
        post_rots, post_trans: transforms produced by image-space
            augmentation (flip, crop, ...).

    Returns:
        Task output from the BEV encoder (in LSS a segmentation map,
        e.g. (B, 1, 200, 200)); the framework is generic and can serve
        lanes, 3D detection, etc.
    """
    # Build the frustum and lift features into BEV (see get_voxels).
    # Fixed defect: the original read `post_r ots` (a stray space inside
    # the identifier), which is a syntax error.
    x = self.get_voxels(x, rots, trans, intrins, post_rots, post_trans)
    # BEV encoder: e.g. (B, 64, 200, 200) -> task output.
    x = self.bevencode(x)
    # The returned x is compared against GT with WeightedCrossEntropyLoss.
    return x
构建视锥准备获得self.frustum
视锥的概念就是图像到3D缺的就是深度,本文将深度维度延伸出来,如图二-视锥,构建出深度的概率分布,代码D的维度是N_d=41,并且41格子概率和为1,里面构建深度时候用了softmax。
def create_frustum(self):
    """Build the fixed (D, fH, fW, 3) image-plane frustum of (x, y, depth) points."""
    full_h, full_w = self.data_aug_conf['final_dim']
    feat_h, feat_w = full_h // self.downsample, full_w // self.downsample
    # Depth bins from grid_conf['dbound'] = (lower, upper, step).
    depths = torch.arange(*self.grid_conf['dbound'], dtype=torch.float)
    depths = depths.view(-1, 1, 1).expand(-1, feat_h, feat_w)
    num_depths = depths.shape[0]
    # Pixel coordinates in the (augmented) full-resolution image plane.
    grid_x = torch.linspace(0, full_w - 1, feat_w, dtype=torch.float)
    grid_x = grid_x.view(1, 1, feat_w).expand(num_depths, feat_h, feat_w)
    grid_y = torch.linspace(0, full_h - 1, feat_h, dtype=torch.float)
    grid_y = grid_y.view(1, feat_h, 1).expand(num_depths, feat_h, feat_w)
    # (D, fH, fW, 3); frozen — the frustum depends only on the rig geometry.
    frustum = torch.stack((grid_x, grid_y, depths), -1)
    return nn.Parameter(frustum, requires_grad=False)
self.frustum = create_frustum,这里get_geometry将视锥根据img2lidar进行转换,将视锥点映射到lidar或者车身坐标系;视锥转换后的点为geom: B x N x D x fH x fW x 3 (4 x 6 x 41 x 8 x 22 x 3)
def get_geometry(self, rots, trans, intrins, post_rots, post_trans):
    """Determine the (x, y, z) locations (in the ego frame) of the
    frustum points.

    Returns:
        Tensor: B x N x D x H/downsample x W/downsample x 3
        (e.g. 4 x 6 x 41 x 8 x 22 x 3).
    """
    B, N, _ = trans.shape
    # Undo the image-space augmentation: subtract its translation, then
    # rotate back through its inverse rotation.
    pts = self.frustum - post_trans.view(B, N, 1, 1, 1, 3)
    inv_post = torch.inverse(post_rots).view(B, N, 1, 1, 1, 3, 3)
    pts = inv_post.matmul(pts.unsqueeze(-1))
    # (u*d, v*d, d): pixel coords scaled by depth, ready for unprojection.
    scaled_xy = pts[:, :, :, :, :, :2] * pts[:, :, :, :, :, 2:3]
    depth = pts[:, :, :, :, :, 2:3]
    pts = torch.cat((scaled_xy, depth), 5)
    # cam -> ego: unproject through the inverse intrinsics, rotate by the
    # extrinsic rotation, then translate.
    cam2ego = rots.matmul(torch.inverse(intrins)).view(B, N, 1, 1, 1, 3, 3)
    pts = cam2ego.matmul(pts).squeeze(-1)
    pts = pts + trans.view(B, N, 1, 1, 1, 3)
    return pts
继续 x = self.get_cam_feats(x);原始是【self.C + self.D】的特征,分别是 2D 的特征提取 self.C 和深度的特征 self.D,对 self.D 的维度做 softmax 编码,再通过广播的**外积**(outer product,并非叉乘)获得【self.D, self.C】组合的特征 x: B x N x D x fH x fW x C(4 x 6 x 41(D) x 8 x 22 x 64(C))
def get_cam_feats(self, x):
    """Return depth-weighted camera features, B x N x D x H/downsample x W/downsample x C."""
    B, N, C, im_h, im_w = x.shape
    # Fold batch and camera dims so the encoder sees a plain image batch.
    feats = self.camencode(x.view(B * N, C, im_h, im_w))
    out_h, out_w = im_h // self.downsample, im_w // self.downsample
    feats = feats.view(B, N, self.camC, self.D, out_h, out_w)
    # Move channels last: (B, N, D, h, w, C).
    return feats.permute(0, 1, 3, 4, 5, 2)
class CamEncode(nn.Module):
    """Per-camera image encoder for LSS.

    Predicts, from one image, a softmax depth distribution over D bins
    together with C feature channels, and combines them into depth-aware
    features via a broadcasted outer product (the "lift" step).
    """
    def __init__(self, D, C, downsample):
        super(CamEncode, self).__init__()
        self.D = D  # number of depth bins (e.g. 41)
        self.C = C  # number of output feature channels (e.g. 64)
        self.trunk = EfficientNet.from_pretrained("efficientnet-b0")
        self.up1 = Up(320+112, 512)
        # A single 1x1 conv head jointly predicts depth logits (D) and features (C).
        self.depthnet = nn.Conv2d(512, self.D + self.C, kernel_size=1, padding=0)
    def get_depth_dist(self, x, eps=1e-20):
        # Softmax over the depth axis so the D bins sum to 1 per pixel.
        # NOTE(review): `eps` is currently unused.
        return x.softmax(dim=1)
    def get_depth_feat(self, x):
        # Backbone feature extraction.
        x = self.get_eff_depth(x)
        # Joint head output: the first self.D channels are depth logits, the
        # remaining self.C channels are image features (self.C=64, self.D=41).
        x = self.depthnet(x)
        # Softmax over the depth channels.
        depth = self.get_depth_dist(x[:, :self.D])
        # Broadcasted outer product of depth distribution and features:
        # (N, 1, D, H, W) * (N, C, 1, H, W) -> (N, C, D, H, W).
        new_x = depth.unsqueeze(1) * x[:, self.D:(self.D + self.C)].unsqueeze(2)
        return depth, new_x
    def get_eff_depth(self, x):
        # adapted from https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/model.py#L231
        endpoints = dict()
        # Stem
        x = self.trunk._swish(self.trunk._bn0(self.trunk._conv_stem(x)))
        prev_x = x
        # Blocks
        for idx, block in enumerate(self.trunk._blocks):
            drop_connect_rate = self.trunk._global_params.drop_connect_rate
            if drop_connect_rate:
                drop_connect_rate *= float(idx) / len(self.trunk._blocks)  # scale drop connect_rate
            x = block(x, drop_connect_rate=drop_connect_rate)
            # Record the feature map just before each spatial downsampling.
            if prev_x.size(2) > x.size(2):
                endpoints['reduction_{}'.format(len(endpoints)+1)] = prev_x
            prev_x = x
        # Head
        endpoints['reduction_{}'.format(len(endpoints)+1)] = x
        # Fuse the two deepest endpoints into a 512-channel map.
        x = self.up1(endpoints['reduction_5'], endpoints['reduction_4'])
        return x
    def forward(self, x):
        # Return only the depth-weighted features; the depth map is discarded here.
        depth, x = self.get_depth_feat(x)
        return x
获得了geom和深度编码特征的x,进行x = self.voxel_pooling(geom, x), 将特征采样到BEV视角下,这里先将3D点geom_feats获取有效范围内的点,并且根据x,y,z,batch_id进行排序,将排序的点和index保证给每一个点一个rank值,rank相等的点在同一个batch,并且在在同一个格子里面;这也就是有overlap的区域只保留最后一个出现的cam的信息;
def voxel_pooling(self, geom_feats, x):
    """Sum-pool the lifted per-frustum-point features into the BEV grid.

    Args:
        geom_feats: B x N x D x fH x fW x 3 ego-frame points
            (e.g. 4 x 6 x 41 x 8 x 22 x 3).
        x: B x N x D x fH x fW x C lifted features
            (e.g. 4 x 6 x 41 x 8 x 22 x 64).
    Returns:
        BEV features of shape B x (C * nx[2]) x nx[0] x nx[1]
        (e.g. 4 x 64 x 200 x 200 when nx[2] == 1).
    """
    B, N, D, H, W, C = x.shape  # B: 4 N: 6 D: 41 H: 8 W: 22 C: 64
    Nprime = B*N*D*H*W  # total number of frustum points (e.g. 173184)
    # flatten x: one row of C features per frustum point
    x = x.reshape(Nprime, C)
    # flatten indices: shift the metric range (e.g. [-50, 50]) into grid
    # coordinates (e.g. [0, 100]) and floor to integer voxel indices
    geom_feats = ((geom_feats - (self.bx - self.dx/2.)) / self.dx).long()
    geom_feats = geom_feats.view(Nprime, 3)  # B*N*D*H*W x 3
    # batch index of every point, appended as a 4th column
    batch_ix = torch.cat([torch.full([Nprime//B, 1], ix,
                                     device=x.device, dtype=torch.long) for ix in range(B)])
    geom_feats = torch.cat((geom_feats, batch_ix), 1)  # Nprime x 4; [:, 3] is batch_id
    # filter out points that fall outside the grid
    # (x: 0..nx[0]-1, y: 0..nx[1]-1, z: 0..nx[2]-1)
    kept = (geom_feats[:, 0] >= 0) & (geom_feats[:, 0] < self.nx[0])\
        & (geom_feats[:, 1] >= 0) & (geom_feats[:, 1] < self.nx[1])\
        & (geom_feats[:, 2] >= 0) & (geom_feats[:, 2] < self.nx[2])
    x = x[kept]
    geom_feats = geom_feats[kept]
    # get tensors from the same voxel next to each other:
    # rank is unique per (x, y, z, batch) voxel — equal ranks mean the points
    # belong to the same batch element AND the same voxel
    ranks = geom_feats[:, 0] * (self.nx[1] * self.nx[2] * B)\
        + geom_feats[:, 1] * (self.nx[2] * B)\
        + geom_feats[:, 2] * B\
        + geom_feats[:, 3]
    sorts = ranks.argsort()  # sort so points of the same voxel are adjacent
    x, geom_feats, ranks = x[sorts], geom_feats[sorts], ranks[sorts]
    # cumsum trick: sum features per voxel without materializing pillars
    if not self.use_quickcumsum:
        x, geom_feats = cumsum_trick(x, geom_feats, ranks)
    else:
        # one entry left per occupied voxel (e.g. x: 29072 x 64)
        x, geom_feats = QuickCumsum.apply(x, geom_feats, ranks)
    # griddify (B x C x Z x X x Y), e.g. 4 x 64 x 1 x 200 x 200
    final = torch.zeros((B, C, self.nx[2], self.nx[0], self.nx[1]), device=x.device)
    # scatter each pooled feature into its (batch, z, x, y) cell
    final[geom_feats[:, 3], :, geom_feats[:, 2], geom_feats[:, 0], geom_feats[:, 1]] = x
    # collapse Z by concatenating the Z slices along the channel dim
    final = torch.cat(final.unbind(dim=2), 1)
    # BEV-view features, e.g. 4 x 64 x 200 x 200
    return final
QuickCumsum棱台池化累积和技巧
QuickCumsum这里是和一般池化的区别:
就像OFT使用积分图像来加快他们的池化步骤一样,我们应用类似的技术来加快总和池化。考虑到生成的点云的大小,效率对于训练我们的模型至关重要。我们不是填充每个支柱然后执行求和池化,而是通过使用打包和利用“累积技巧”来避免填充求和池化。这个操作有一个解析梯度,可以有效地计算,以加快自动梯度; 引用https://blog.csdn.net/zyw2002/article/details/127906938;
该技巧是基于本文方法用图像产生的点云形状是固定的,因此每个点可以预先分配一个区间(即BEV网格)索引,用于指示其属于哪一个区间。按照索引排序后,按下列方法操作:
class QuickCumsum(torch.autograd.Function):
    """Sum-pool features of points sharing a voxel via the cumsum trick.

    Inputs must arrive pre-sorted by rank (see voxel_pooling); equal ranks
    mean "same voxel of the same batch element". The analytic gradient
    (backward, defined below this chunk) makes this faster than generic
    autograd through a padded sum-pool.
    """
    @staticmethod
    def forward(ctx, x, geom_feats, ranks):
        x = x.cumsum(0)  # prefix sums over all points
        kept = torch.ones(x.shape[0], device=x.device, dtype=torch.bool)
        # Mark positions where the rank changes, i.e. the LAST point of each voxel.
        kept[:-1] = (ranks[1:] != ranks[:-1])
        # Keep one entry per voxel (the last point of every equal-rank run).
        x, geom_feats = x[kept], geom_feats[kept]
        # Difference of consecutive kept prefix sums == per-voxel feature sum.
        x = torch.cat((x[:1], x[1:] - x[:-1]))
        # save kept for backward
        ctx.save_for_backward(kept)
        # no gradient for geom_feats
        ctx.mark_non_differentiable(geom_feats)
        return x, geom_feats