PETR: Position Embedding Transformation for Multi-View 3D Object Detection
代码链接
https://github.com/exiawsh/StreamPETR
这是 petr3d.py 中 task head 的入口:
def forward_pts_train(self,
                      gt_bboxes_3d,
                      gt_labels_3d,
                      gt_bboxes,
                      gt_labels,
                      img_metas,
                      centers2d,
                      depths,
                      requires_grad=True,
                      return_losses=False,
                      **data):
    """Forward function for the point-cloud (3D detection) branch.

    Args:
        gt_bboxes_3d (list[:obj:`BaseInstance3DBoxes`]): Ground truth
            boxes for each sample.
        gt_labels_3d (list[torch.Tensor]): Ground truth labels for
            boxes of each sample.
        gt_bboxes (list[torch.Tensor]): 2D ground-truth boxes for the
            auxiliary 2D ROI head.
        gt_labels (list[torch.Tensor]): Labels for ``gt_bboxes``.
        img_metas (list[dict]): Meta information of samples.
        centers2d: 2D box centers used by the auxiliary 2D loss.
        depths: per-box depth targets used by the auxiliary 2D loss.
        requires_grad (bool): when False, run the head under ``no_grad``
            in eval mode (e.g. for the memory/streaming branch).
        return_losses (bool): when True, compute and return losses.

    Returns:
        dict | None: Losses of each branch, or None when
        ``return_losses`` is False.
    """
    # Build an (x, y) meshgrid over the feature map, shape (bs*n, H, W, 2);
    # preparation for constructing the camera frustum.
    location = self.prepare_location(img_metas, **data)
    if not requires_grad:
        self.eval()
        with torch.no_grad():
            outs = self.pts_bbox_head(location, img_metas, None, **data)
        self.train()
    else:
        # During training, use the top-k 2D predictions as auxiliary guidance.
        outs_roi = self.forward_roi_head(location, **data)
        topk_indexes = outs_roi['topk_indexes']
        # Feed the StreamPETR head.
        outs = self.pts_bbox_head(location, img_metas, topk_indexes, **data)
    if return_losses:
        loss_inputs = [gt_bboxes_3d, gt_labels_3d, outs]
        losses = self.pts_bbox_head.loss(*loss_inputs)
        if self.with_img_roi_head:
            # NOTE(review): `outs_roi` only exists on the requires_grad=True
            # path; return_losses=True with requires_grad=False would raise
            # NameError here — confirm callers never combine those flags.
            loss2d_inputs = [gt_bboxes, gt_labels, centers2d, depths, outs_roi, img_metas]
            losses2d = self.img_roi_head.loss(*loss2d_inputs)
            losses.update(losses2d)
        return losses
    else:
        return None
# Build the normalized image-plane meshgrid shared by every camera view.
def prepare_location(self, img_metas, **data):
    """Return per-view normalized 2D grid locations, shape (bs*n, H, W, 2)."""
    padded_h, padded_w, _ = img_metas[0]['pad_shape'][0]
    feats = data['img_feats']
    num_views = feats.shape[0] * feats.shape[1]
    flat = feats.flatten(0, 1)
    # One shared (H, W, 2) grid, replicated for every (batch, camera) pair.
    grid = locations(flat, self.stride, padded_h, padded_w)
    return grid.unsqueeze(0).repeat(num_views, 1, 1, 1)
#misc.py
def locations(features, stride, pad_h, pad_w):
    """Compute normalized (x, y) pixel-center locations for a feature map.

    Args:
        features (Tensor): (N, C, H, W) feature map; only H, W and the
            device are used.
        stride (int): down-sampling stride of the feature map relative to
            the padded input image.
        pad_h (int): padded image height, normalizes y into [0, 1).
        pad_w (int): padded image width, normalizes x into [0, 1).

    Returns:
        Tensor: (H, W, 2) locations, last dim ordered (x, y).
    """
    h, w = features.size()[-2:]
    device = features.device
    # Centers of the stride-sized cells: stride*i + stride//2, normalized
    # by the padded image size so values lie in [0, 1).
    shifts_x = (torch.arange(
        0, w * stride, step=stride,
        dtype=torch.float32, device=device
    ) + stride // 2) / pad_w
    shifts_y = (torch.arange(
        0, h * stride, step=stride,
        dtype=torch.float32, device=device
    ) + stride // 2) / pad_h
    # Explicit indexing='ij' keeps the historical behavior and silences the
    # torch.meshgrid deprecation warning.
    shift_y, shift_x = torch.meshgrid(shifts_y, shifts_x, indexing='ij')
    locations = torch.stack((shift_x.reshape(-1), shift_y.reshape(-1)), dim=1)
    return locations.reshape(h, w, 2)
PETR head的主要流程
def forward(self, memory_center, img_metas, topk_indexes, **data):
    """Forward function of the StreamPETR head.

    Args:
        memory_center: normalized 2D grid locations built from the meshgrid.
        img_metas (list[dict]): Meta information of samples.
        topk_indexes: indexes of the top-k image tokens selected by the
            2D ROI head (None disables the selection).
        data: must contain 'img_feats', a (B, N, C, H, W) tensor.

    Returns:
        dict with:
            all_cls_scores (Tensor): classification outputs,
                shape [nb_dec, bs, num_query, cls_out_channels]. Note
                cls_out_channels should include background.
            all_bbox_preds (Tensor): regression outputs with normalized
                coordinate format (cx, cy, w, l, cz, h, theta, vx, vy),
                shape [nb_dec, bs, num_query, 9].
            dn_mask_dict: denoising bookkeeping dict, or None.
    """
    # Flatten image features to (B, N*H*W, C) tokens.
    x = data['img_feats']
    B, N, C, H, W = x.shape
    num_tokens = N * H * W
    memory = x.permute(0, 1, 3, 4, 2).reshape(B, num_tokens, C)
    # Keep only the top-k tokens chosen by the 2D head (focal-style sampling).
    memory = topk_gather(memory, topk_indexes)
    # position_embeding: extend the 2D grid with a depth axis (self.coords_d)
    # — similar in spirit to LSS — project through the camera intrinsics and
    # extrinsics into the world frame, normalize / inverse-normalize, then
    # refine with the position encoder. Only the channel dim changes:
    # pos_embed is (B*N, embed_dims, H, W).
    # cone: camera frustum info — presumably carries the intrinsics; TODO confirm.
    pos_embed, cone = self.position_embeding(data, memory_center, topk_indexes, img_metas)
    # Embed the 2D image features.
    memory = self.memory_embed(memory)
    # spatial_alignment as in Focal-PETR.
    memory = self.spatial_alignment(memory, cone)
    # SE-style layer fusing the 3D position embedding with the 2D features.
    pos_embed = self.featurized_pe(pos_embed, memory)
    reference_points = self.reference_points.weight
    # prepare_for_dn pads denoising queries in front; reference_points is (.., 3).
    reference_points, attn_mask, mask_dict = self.prepare_for_dn(B, reference_points, img_metas)
    # query: sin/cos-encode the 3D reference points (pos2posemb3d) so the
    # queries carry positional information.
    query_pos = self.query_embedding(pos2posemb3d(reference_points))
    # transformer is PETRTemporalTransformer:
    #   q: query_pos — sin/cos-encoded 3D reference points;
    #   k: pos_embed — 3D position-encoder output fused with the 2D features;
    #   v: memory — the 2D feature tokens.
    outs_dec, _ = self.transformer(memory, None, query_pos, pos_embed, attn_mask)
    outs_dec = torch.nan_to_num(outs_dec)
    outputs_classes = []
    outputs_coords = []
    for lvl in range(outs_dec.shape[0]):
        reference = inverse_sigmoid(reference_points.clone())
        assert reference.shape[-1] == 3
        outputs_class = self.cls_branches[lvl](outs_dec[lvl])
        tmp = self.reg_branches[lvl](outs_dec[lvl])
        # The predicted 3D center is a residual on top of the reference points.
        tmp[..., 0:3] += reference[..., 0:3]
        tmp[..., 0:3] = tmp[..., 0:3].sigmoid()
        outputs_coord = tmp
        outputs_classes.append(outputs_class)
        outputs_coords.append(outputs_coord)
    all_cls_scores = torch.stack(outputs_classes)
    all_bbox_preds = torch.stack(outputs_coords)
    # De-normalize centers from [0, 1] back into the point-cloud range.
    all_bbox_preds[..., 0:3] = (all_bbox_preds[..., 0:3] * (self.pc_range[3:6] - self.pc_range[0:3]) + self.pc_range[0:3])
    if mask_dict and mask_dict['pad_size'] > 0:
        # Split off the denoising (DN) queries that were padded in front.
        output_known_class = all_cls_scores[:, :, :mask_dict['pad_size'], :]
        output_known_coord = all_bbox_preds[:, :, :mask_dict['pad_size'], :]
        outputs_class = all_cls_scores[:, :, mask_dict['pad_size']:, :]
        outputs_coord = all_bbox_preds[:, :, mask_dict['pad_size']:, :]
        mask_dict['output_known_lbs_bboxes']=(output_known_class, output_known_coord)
        outs = {
            'all_cls_scores': outputs_class,
            'all_bbox_preds': outputs_coord,
            'dn_mask_dict':mask_dict,
        }
    else:
        outs = {
            'all_cls_scores': all_cls_scores,
            'all_bbox_preds': all_bbox_preds,
            'dn_mask_dict':None,
        }
    return outs
LSS Lift, Splat, Shoot: Encoding Images from Arbitrary Camera Rigs by Implicitly Unprojecting
BEV 方法的鼻祖,可能需要你了解。
论文链接 https://arxiv.org/pdf/2008.05711v1.pdf
代码链接 https://github.com/nv-tlabs/lift-splat-shoot
PETR系列的转换方式与LSS是有相似度的。可以一起理解;
图二-视锥
def get_voxels(self, x, rots, trans, intrins, post_rots, post_trans):
    """Lift image features into the BEV grid (LSS 'lift' + 'splat').

    Returns the BEV feature map produced by voxel_pooling.
    """
    # Frustum points mapped into the ego/lidar frame via intrinsics/extrinsics.
    geom = self.get_geometry(rots, trans, intrins, post_rots, post_trans)
    # Depth-augmented image features: a softmax depth distribution combined
    # with the 2D features via a broadcasted outer product.
    cam_feats = self.get_cam_feats(x)
    # Splat the features onto the BEV grid.
    return self.voxel_pooling(geom, cam_feats)
# Main entry of the LSS task head.
def forward(self, x, rots, trans, intrins, post_rots, post_trans):
    """Lift image features to BEV and run the BEV encoder.

    Args:
        x: image feature tensor.
        rots, trans: camera extrinsics (rotation, translation).
        intrins: camera intrinsics.
        post_rots, post_trans: transforms produced by image-space
            augmentation (flip, crop, ...).

    Returns:
        Task output from the BEV encoder (in LSS a segmentation map,
        e.g. (B, 1, 200, 200)); the framework is generic and can serve
        lanes, 3D detection, etc.
    """
    # Build the frustum and lift features into BEV (see get_voxels).
    # Fixed defect: the original read `post_r ots` (a stray space inside
    # the identifier), which is a syntax error.
    x = self.get_voxels(x, rots, trans, intrins, post_rots, post_trans)
    # BEV encoder: e.g. (B, 64, 200, 200) -> task output.
    x = self.bevencode(x)
    # The returned x is compared against GT with WeightedCrossEntropyLoss.
    return x
构建视锥准备获得self.frustum
视锥的概念就是图像到3D缺的就是深度,本文将深度维度延伸出来,如图二-视锥,构建出深度的概率分布,代码D的维度是N_d=41,并且41格子概率和为1,里面构建深度时候用了softmax。
def create_frustum(self):
    """Build the fixed (D, fH, fW, 3) image-plane frustum of (x, y, depth) points."""
    full_h, full_w = self.data_aug_conf['final_dim']
    feat_h, feat_w = full_h // self.downsample, full_w // self.downsample
    # Depth bins from grid_conf['dbound'] = (lower, upper, step).
    depths = torch.arange(*self.grid_conf['dbound'], dtype=torch.float)
    depths = depths.view(-1, 1, 1).expand(-1, feat_h, feat_w)
    num_depths = depths.shape[0]
    # Pixel coordinates in the (augmented) full-resolution image plane.
    grid_x = torch.linspace(0, full_w - 1, feat_w, dtype=torch.float)
    grid_x = grid_x.view(1, 1, feat_w).expand(num_depths, feat_h, feat_w)
    grid_y = torch.linspace(0, full_h - 1, feat_h, dtype=torch.float)
    grid_y = grid_y.view(1, feat_h, 1).expand(num_depths, feat_h, feat_w)
    # (D, fH, fW, 3); frozen — the frustum depends only on the rig geometry.
    frustum = torch.stack((grid_x, grid_y, depths), -1)
    return nn.Parameter(frustum, requires_grad=False)
self.frustum = create_frustum,这里get_geometry将视锥根据img2lidar进行转换,将视锥点映射到lidar或者车身坐标系;视锥转换后的点为geom: B x N x D x fH x fW x 3 (4 x 6 x 41 x 8 x 22 x 3)
def get_geometry(self, rots, trans, intrins, post_rots, post_trans):
    """Determine the (x, y, z) locations (in the ego frame) of the
    frustum points.

    Returns:
        Tensor: B x N x D x H/downsample x W/downsample x 3
        (e.g. 4 x 6 x 41 x 8 x 22 x 3).
    """
    B, N, _ = trans.shape
    # Undo the image-space augmentation: subtract its translation, then
    # rotate back through its inverse rotation.
    pts = self.frustum - post_trans.view(B, N, 1, 1, 1, 3)
    inv_post = torch.inverse(post_rots).view(B, N, 1, 1, 1, 3, 3)
    pts = inv_post.matmul(pts.unsqueeze(-1))
    # (u*d, v*d, d): pixel coords scaled by depth, ready for unprojection.
    scaled_xy = pts[:, :, :, :, :, :2] * pts[:, :, :, :, :, 2:3]
    depth = pts[:, :, :, :, :, 2:3]
    pts = torch.cat((scaled_xy, depth), 5)
    # cam -> ego: unproject through the inverse intrinsics, rotate by the
    # extrinsic rotation, then translate.
    cam2ego = rots.matmul(torch.inverse(intrins)).view(B, N, 1, 1, 1, 3, 3)
    pts = cam2ego.matmul(pts).squeeze(-1)
    pts = pts + trans.view(B, N, 1, 1, 1, 3)
    return pts
继续 x = self.get_cam_feats(x);原始是【self.C + self.D】的特征,分别是 2D 的特征提取 self.C 和深度的特征 self.D,对 self.D 的维度做 softmax 编码,再通过广播的**外积**(outer product,并非叉乘)获得【self.D, self.C】组合的特征 x: B x N x D x fH x fW x C(4 x 6 x 41(D) x 8 x 22 x 64(C))
def get_cam_feats(self, x):
    """Return depth-weighted camera features, B x N x D x H/downsample x W/downsample x C."""
    B, N, C, im_h, im_w = x.shape
    # Fold batch and camera dims so the encoder sees a plain image batch.
    feats = self.camencode(x.view(B * N, C, im_h, im_w))
    out_h, out_w = im_h // self.downsample, im_w // self.downsample
    feats = feats.view(B, N, self.camC, self.D, out_h, out_w)
    # Move channels last: (B, N, D, h, w, C).
    return feats.permute(0, 1, 3, 4, 5, 2)
class CamEncode(nn.Module):
    """Per-camera image encoder for LSS.

    Predicts, from one image, a softmax depth distribution over D bins
    together with C feature channels, and combines them into depth-aware
    features via a broadcasted outer product (the "lift" step).
    """
    def __init__(self, D, C, downsample):
        super(CamEncode, self).__init__()
        self.D = D  # number of depth bins (e.g. 41)
        self.C = C  # number of output feature channels (e.g. 64)
        self.trunk = EfficientNet.from_pretrained("efficientnet-b0")
        self.up1 = Up(320+112, 512)
        # A single 1x1 conv head jointly predicts depth logits (D) and features (C).
        self.depthnet = nn.Conv2d(512, self.D + self.C, kernel_size=1, padding=0)
    def get_depth_dist(self, x, eps=1e-20):
        # Softmax over the depth axis so the D bins sum to 1 per pixel.
        # NOTE(review): `eps` is currently unused.
        return x.softmax(dim=1)
    def get_depth_feat(self, x):
        # Backbone feature extraction.
        x = self.get_eff_depth(x)
        # Joint head output: the first self.D channels are depth logits, the
        # remaining self.C channels are image features (self.C=64, self.D=41).
        x = self.depthnet(x)
        # Softmax over the depth channels.
        depth = self.get_depth_dist(x[:, :self.D])
        # Broadcasted outer product of depth distribution and features:
        # (N, 1, D, H, W) * (N, C, 1, H, W) -> (N, C, D, H, W).
        new_x = depth.unsqueeze(1) * x[:, self.D:(self.D + self.C)].unsqueeze(2)
        return depth, new_x
    def get_eff_depth(self, x):
        # adapted from https://github.com/lukemelas/EfficientNet-PyTorch/blob/master/efficientnet_pytorch/model.py#L231
        endpoints = dict()
        # Stem
        x = self.trunk._swish(self.trunk._bn0(self.trunk._conv_stem(x)))
        prev_x = x
        # Blocks
        for idx, block in enumerate(self.trunk._blocks):
            drop_connect_rate = self.trunk._global_params.drop_connect_rate
            if drop_connect_rate:
                drop_connect_rate *= float(idx) / len(self.trunk._blocks)  # scale drop connect_rate
            x = block(x, drop_connect_rate=drop_connect_rate)
            # Record the feature map just before each spatial downsampling.
            if prev_x.size(2) > x.size(2):
                endpoints['reduction_{}'.format(len(endpoints)+1)] = prev_x
            prev_x = x
        # Head
        endpoints['reduction_{}'.format(len(endpoints)+1)] = x
        # Fuse the two deepest endpoints into a 512-channel map.
        x = self.up1(endpoints['reduction_5'], endpoints['reduction_4'])
        return x
    def forward(self, x):
        # Return only the depth-weighted features; the depth map is discarded here.
        depth, x = self.get_depth_feat(x)
        return x
获得了geom和深度编码特征的x,进行x = self.voxel_pooling(geom, x), 将特征采样到BEV视角下,这里先将3D点geom_feats获取有效范围内的点,并且根据x,y,z,batch_id进行排序,将排序的点和index保证给每一个点一个rank值,rank相等的点在同一个batch,并且在在同一个格子里面;这也就是有overlap的区域只保留最后一个出现的cam的信息;
def voxel_pooling(self, geom_feats, x):
    """Sum-pool the lifted per-frustum-point features into the BEV grid.

    Args:
        geom_feats: B x N x D x fH x fW x 3 ego-frame points
            (e.g. 4 x 6 x 41 x 8 x 22 x 3).
        x: B x N x D x fH x fW x C lifted features
            (e.g. 4 x 6 x 41 x 8 x 22 x 64).
    Returns:
        BEV features of shape B x (C * nx[2]) x nx[0] x nx[1]
        (e.g. 4 x 64 x 200 x 200 when nx[2] == 1).
    """
    B, N, D, H, W, C = x.shape  # B: 4 N: 6 D: 41 H: 8 W: 22 C: 64
    Nprime = B*N*D*H*W  # total number of frustum points (e.g. 173184)
    # flatten x: one row of C features per frustum point
    x = x.reshape(Nprime, C)
    # flatten indices: shift the metric range (e.g. [-50, 50]) into grid
    # coordinates (e.g. [0, 100]) and floor to integer voxel indices
    geom_feats = ((geom_feats - (self.bx - self.dx/2.)) / self.dx).long()
    geom_feats = geom_feats.view(Nprime, 3)  # B*N*D*H*W x 3
    # batch index of every point, appended as a 4th column
    batch_ix = torch.cat([torch.full([Nprime//B, 1], ix,
                                     device=x.device, dtype=torch.long) for ix in range(B)])
    geom_feats = torch.cat((geom_feats, batch_ix), 1)  # Nprime x 4; [:, 3] is batch_id
    # filter out points that fall outside the grid
    # (x: 0..nx[0]-1, y: 0..nx[1]-1, z: 0..nx[2]-1)
    kept = (geom_feats[:, 0] >= 0) & (geom_feats[:, 0] < self.nx[0])\
        & (geom_feats[:, 1] >= 0) & (geom_feats[:, 1] < self.nx[1])\
        & (geom_feats[:, 2] >= 0) & (geom_feats[:, 2] < self.nx[2])
    x = x[kept]
    geom_feats = geom_feats[kept]
    # get tensors from the same voxel next to each other:
    # rank is unique per (x, y, z, batch) voxel — equal ranks mean the points
    # belong to the same batch element AND the same voxel
    ranks = geom_feats[:, 0] * (self.nx[1] * self.nx[2] * B)\
        + geom_feats[:, 1] * (self.nx[2] * B)\
        + geom_feats[:, 2] * B\
        + geom_feats[:, 3]
    sorts = ranks.argsort()  # sort so points of the same voxel are adjacent
    x, geom_feats, ranks = x[sorts], geom_feats[sorts], ranks[sorts]
    # cumsum trick: sum features per voxel without materializing pillars
    if not self.use_quickcumsum:
        x, geom_feats = cumsum_trick(x, geom_feats, ranks)
    else:
        # one entry left per occupied voxel (e.g. x: 29072 x 64)
        x, geom_feats = QuickCumsum.apply(x, geom_feats, ranks)
    # griddify (B x C x Z x X x Y), e.g. 4 x 64 x 1 x 200 x 200
    final = torch.zeros((B, C, self.nx[2], self.nx[0], self.nx[1]), device=x.device)
    # scatter each pooled feature into its (batch, z, x, y) cell
    final[geom_feats[:, 3], :, geom_feats[:, 2], geom_feats[:, 0], geom_feats[:, 1]] = x
    # collapse Z by concatenating the Z slices along the channel dim
    final = torch.cat(final.unbind(dim=2), 1)
    # BEV-view features, e.g. 4 x 64 x 200 x 200
    return final
QuickCumsum棱台池化累积和技巧
QuickCumsum这里是和一般池化的区别:
就像OFT使用积分图像来加快他们的池化步骤一样,我们应用类似的技术来加快总和池化。考虑到生成的点云的大小,效率对于训练我们的模型至关重要。我们不是填充每个支柱然后执行求和池化,而是通过使用打包和利用“累积技巧”来避免填充求和池化。这个操作有一个解析梯度,可以有效地计算,以加快自动梯度; 引用https://blog.csdn.net/zyw2002/article/details/127906938;
该技巧是基于本文方法用图像产生的点云形状是固定的,因此每个点可以预先分配一个区间(即BEV网格)索引,用于指示其属于哪一个区间。按照索引排序后,按下列方法操作:
class QuickCumsum(torch.autograd.Function):
    """Sum-pool features of points sharing a voxel via the cumsum trick.

    Inputs must arrive pre-sorted by rank (see voxel_pooling); equal ranks
    mean "same voxel of the same batch element". The analytic gradient
    (backward, defined below this chunk) makes this faster than generic
    autograd through a padded sum-pool.
    """
    @staticmethod
    def forward(ctx, x, geom_feats, ranks):
        x = x.cumsum(0)  # prefix sums over all points
        kept = torch.ones(x.shape[0], device=x.device, dtype=torch.bool)
        # Mark positions where the rank changes, i.e. the LAST point of each voxel.
        kept[:-1] = (ranks[1:] != ranks[:-1])
        # Keep one entry per voxel (the last point of every equal-rank run).
        x, geom_feats = x[kept], geom_feats[kept]
        # Difference of consecutive kept prefix sums == per-voxel feature sum.
        x = torch.cat((x[:1], x[1:] - x[:-1]))
        # save kept for backward
        ctx.save_for_backward(kept)
        # no gradient for geom_feats
        ctx.mark_non_differentiable(geom_feats)
        return x, geom_feats