为什么要BEVPool
bevpool 是将图像2D 特征经过内外参转换后映射到3D 空间。
在LSS 中:
输入: 图像特征(NxHxWxC), 深度得分(NxDxHxW)
根据内外参构建三维视锥体素索引(NxDxHxW), 根据体素索引计算得到的三维坐标进行映射。把对应深度的图像特征映射到BEV 空间中。对多个图像特征映射到同一个BEV单元的累计求和。
在BEVPool 中,先将深度得分和图像特征进行外积(outer product),再根据体素索引进行映射。这就存在一个大的弊端,就是需要存储和处理大尺寸(NxDxHxWxC)的视锥体特征。这对推理速度和显存占用很不友好。
BEVPool V2
图像特征和深度特征卷积分离
1、离线计算视锥索引和体素索引的对应关系表,推理过程固定
2、通过视锥索引找到对应的图像特征和深度特征
3、对相同体素的特征先外积后求和
bevpool v2 避免了显式计算、存储和预处理视锥体特征, 降低显存,加快处理
详细流程:
1、构建单个相机的视锥空间(DxHxWx3)
2、计算每个相机图像对应视锥在ego坐标系中的位置
3、构建每个视锥格子与bev feat 中每个Voxel 的映射关系,从而推出 图像feat index 、depth index 和 voxel index 关系。大概如下:
视锥体坐标经过内外参变换转到ego 坐标(coor),离散到voxel 坐标。在ego下,筛选有效范围的coor,并计算每个元素对应的全局voxel index。根据Voxel index 对coor 排序,使得同一个Voxel的coor相邻,使得不同视野映射在同一个Voxel的feat相邻。最后错位匹配,找到局部连续元素的起点以及连续元素个数。
构建索引关系返回值:全部是一维tensor
rank_bev : 数量与有效视锥数量一致,存放有效的bev空间的Voxel 索引值。(并不是所有视锥投影到bev中都有效,只保留有效的)
rank_depth:数量与有效视锥数量一致,存放depth 的索引值
rank_feat:数量与有效视锥数量一致,存放图像 feat 的索引值
interval_starts: 数量与非空voxel(即至少被一个有效视锥点覆盖的voxel)的数量一致,每个元素标识着ranks_bev中每段"连续片段"的起点
interval_lengths: 数量与非空voxel的数量一致,每个元素标识着ranks_bev中每段"连续片段"的长度
构建代码
def voxel_pooling_prepare_v2(self, coor):
    """Pre-compute the frustum-to-BEV index mapping for BEVPoolv2.

    Args:
        coor: (B, N, D, H, W, 3) frustum point coordinates already
            transformed into the ego frame.

    Returns:
        Tuple of five int32 1-D tensors, or five ``None`` when no frustum
        point falls inside the BEV grid:
            ranks_bev:        global voxel index of each valid point.
            ranks_depth:      flat index into the depth-score tensor.
            ranks_feat:       flat index into the image-feature tensor.
            interval_starts:  start offset of each voxel's run in ranks_bev.
            interval_lengths: number of points in each voxel's run.
    """
    B, N, D, H, W, _ = coor.shape
    num_points = B * N * D * H * W
    # One depth-score index per frustum cell.
    ranks_depth = torch.arange(
        0, num_points, dtype=torch.int, device=coor.device)
    # One image-feature index per (B, N, H, W) pixel; the D frustum cells
    # along a ray all reuse the same context feature, hence expand over D.
    ranks_feat = torch.arange(
        0, num_points // D, dtype=torch.int, device=coor.device)
    ranks_feat = ranks_feat.reshape(B, N, 1, H, W)
    ranks_feat = ranks_feat.expand(B, N, D, H, W).flatten()
    # Discretize ego-frame frustum coordinates into voxel-grid coordinates.
    coor = ((coor - self.grid_lower_bound.to(coor)) /
            self.grid_interval.to(coor))
    coor = coor.long().view(num_points, 3)
    # Append the batch index to each point -> (num_points, 4).
    batch_idx = torch.arange(0, B).reshape(B, 1). \
        expand(B, num_points // B).reshape(num_points, 1).to(coor)
    coor = torch.cat((coor, batch_idx), 1)
    # Keep only the points that fall inside the BEV grid.
    kept = (coor[:, 0] >= 0) & (coor[:, 0] < self.grid_size[0]) & \
           (coor[:, 1] >= 0) & (coor[:, 1] < self.grid_size[1]) & \
           (coor[:, 2] >= 0) & (coor[:, 2] < self.grid_size[2])
    coor, ranks_depth, ranks_feat = \
        coor[kept], ranks_depth[kept], ranks_feat[kept]
    # BUGFIX: the original code checked ``len(kept) == 0`` *before*
    # filtering; ``kept`` is a boolean mask of length num_points, so that
    # guard could never fire. Check the surviving point count instead.
    if len(coor) == 0:
        return None, None, None, None, None
    # Flatten (batch, z, y, x) voxel coordinates into one global index.
    ranks_bev = coor[:, 3] * (
        self.grid_size[2] * self.grid_size[1] * self.grid_size[0])
    ranks_bev += coor[:, 2] * (self.grid_size[1] * self.grid_size[0])
    ranks_bev += coor[:, 1] * self.grid_size[0] + coor[:, 0]
    # Sort by voxel index so points landing in the same voxel are adjacent.
    order = ranks_bev.argsort()
    ranks_bev, ranks_depth, ranks_feat = \
        ranks_bev[order], ranks_depth[order], ranks_feat[order]
    # Shifted comparison marks the first element of every run of equal
    # voxel indices ("错位匹配").
    kept = torch.ones(
        ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool)
    kept[1:] = ranks_bev[1:] != ranks_bev[:-1]
    interval_starts = torch.where(kept)[0].int()
    if len(interval_starts) == 0:
        return None, None, None, None, None
    # Run lengths: difference of consecutive starts; last run extends to
    # the end of the sorted point list.
    interval_lengths = torch.zeros_like(interval_starts)
    interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1]
    interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1]
    return ranks_bev.int().contiguous(), ranks_depth.int().contiguous(
    ), ranks_feat.int().contiguous(), interval_starts.int().contiguous(
    ), interval_lengths.int().contiguous()
BEVPoolv2 CUDA代码实现
BEVPoolv2助力LSS方案在边缘设备上部署,推动BEV方案部署落地。
下面是一个python函数bev_pool_v2,用来调用底层CUDA实现。这里只给出了前向推理的代码,反向传播的代码没有给出。
def bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev,
                bev_feat_shape, interval_starts, interval_lengths):
    """Pool frustum features into a BEV grid (BEVPoolv2 forward).

    Args:
        depth: (B, N, D, fH, fW)
        feat: (B, N, fH, fW, C)
        ranks_depth: (N_points, ),
        ranks_feat: (N_points, ),
        ranks_bev: (N_points, ),
        bev_feat_shape: (B, D_Z, D_Y, D_X, C)
        interval_starts: (N_pillar, )
        interval_lengths: (N_pillar, )
    Returns:
        x: bev feature in shape (B, C, Dz, Dy, Dx)
    """
    pooled = QuickCumsumCuda.apply(depth, feat, ranks_depth, ranks_feat,
                                   ranks_bev, bev_feat_shape,
                                   interval_starts,
                                   interval_lengths)  # (B, Dz, Dy, Dx, C)
    # Move the channel axis to the front for downstream conv layers.
    return pooled.permute(0, 4, 1, 2, 3).contiguous()  # (B, C, Dz, Dy, Dx)
class QuickCumsumCuda(torch.autograd.Function):
    r"""BEVPoolv2 implementation for Lift-Splat-Shoot view transformation.

    Please refer to the `paper <https://arxiv.org/abs/2211.17111>`_
    """

    @staticmethod
    def forward(ctx, depth, feat, ranks_depth, ranks_feat, ranks_bev,
                bev_feat_shape, interval_starts, interval_lengths):
        # The CUDA extension expects contiguous float32 data and int32
        # index tensors, so normalize everything up front.
        depth = depth.contiguous().float()            # (B, N, D, fH, fW)
        feat = feat.contiguous().float()              # (B, N, fH, fW, C)
        ranks_bev = ranks_bev.int()                   # (N_points, )
        ranks_depth = ranks_depth.contiguous().int()  # (N_points, )
        ranks_feat = ranks_feat.contiguous().int()    # (N_points, )
        interval_starts = interval_starts.contiguous().int()    # (N_pillar, )
        interval_lengths = interval_lengths.contiguous().int()  # (N_pillar, )
        # Zero-filled accumulator; the extension writes per-voxel sums
        # into it in place.
        out = feat.new_zeros(bev_feat_shape)          # (B, D_Z, D_Y, D_X, C)
        bev_pool_v2_ext.bev_pool_v2_forward(
            depth,
            feat,
            out,
            ranks_depth,
            ranks_feat,
            ranks_bev,
            interval_lengths,
            interval_starts,
        )
        # Saved in this exact order for the backward pass (not shown here).
        ctx.save_for_backward(ranks_bev, depth, feat, ranks_feat, ranks_depth)
        return out
测试函数:
def test_bev_pool_v2():
    # Smoke test for bev_pool_v2 (requires a CUDA device and the compiled
    # bev_pool_v2_ext extension): pools 4 frustum points into 2 BEV voxels
    # and checks the pooled sum and gradients against hand-computed values.
    depth = np.array([0.3, 0.4, 0.2, 0.1, 0.7, 0.6, 0.8, 0.9])
    depth = torch.from_numpy(depth).float().cuda()
    depth = depth.view(1, 1, 2, 2, 2).requires_grad_()  # (B, N, D, fH, fW)
    feat = torch.ones(
        size=[1, 1, 2, 2, 2], dtype=torch.float,
        device='cuda').requires_grad_()  # (B, N, fH, fW, C)
    # Flat indices of the 4 points into depth / feat / the BEV output.
    ranks_depth = torch.from_numpy(np.array([0, 4, 1, 6])).int().cuda()
    ranks_feat = torch.from_numpy(np.array([0, 0, 1, 2])).int().cuda()
    ranks_bev = torch.from_numpy(np.array([0, 0, 1, 1])).int().cuda()
    # Shifted comparison marks the first point of each voxel's run.
    kept = torch.ones(
        ranks_bev.shape[0], device=ranks_bev.device, dtype=torch.bool)
    kept[1:] = ranks_bev[1:] != ranks_bev[:-1]
    interval_starts = torch.where(kept)[0].int()
    # NOTE(review): guard copy-pasted from voxel_pooling_prepare_v2 — with
    # the fixed ranks above it can never fire, and silently returning from
    # a test would skip every assertion below.
    if len(interval_starts) == 0:
        return None, None, None, None, None
    interval_lengths = torch.zeros_like(interval_starts)
    interval_lengths[:-1] = interval_starts[1:] - interval_starts[:-1]
    interval_lengths[-1] = ranks_bev.shape[0] - interval_starts[-1]
    bev_feat = bev_pool_v2(depth, feat, ranks_depth, ranks_feat, ranks_bev,
                           (1, 1, 2, 2, 2), interval_starts, interval_lengths)
    # Expected: voxel0 = 0.3 + 0.7, voxel1 = 0.4 + 0.8, summed over
    # C = 2 channels -> (1.0 + 1.2) * 2 = 4.4.
    loss = torch.sum(bev_feat)
    loss.backward()
    assert loss == 4.4
    # d(loss)/d(depth): each used depth cell feeds C = 2 unit features.
    grad_depth = np.array([2., 2., 0., 0., 2., 0., 2., 0.])
    grad_depth = torch.from_numpy(grad_depth).float()
    grad_depth = grad_depth.cuda().view(1, 1, 2, 2, 2)
    assert depth.grad.allclose(grad_depth)
    # d(loss)/d(feat): sum of the depth scores routed to each feature cell.
    grad_feat = np.array([1.0, 1.0, 0.4, 0.4, 0.8, 0.8, 0., 0.])
    grad_feat = torch.from_numpy(grad_feat).float().cuda().view(1, 1, 2, 2, 2)
    assert feat.grad.allclose(grad_feat)
bev_pool_v2()函数主要是调用了QuickCumsumCuda()类,使用其前向推理的方法。
QuickCumsumCuda()类的前向推理中,主要调用了bev_pool_v2_ext.bev_pool_v2_forward()。
其中,bev_pool_v2_ext是用cpp实现的,文件名称为bev_pool.cpp,其调用封装成库的bev_pool_cuda.cu。
bev_pool.cpp代码如下所示:
// Copyright (c) Phigent Robotics. All rights reserved.
// Reference https://arxiv.org/abs/2211.17111
#include <torch/torch.h>
#include <c10/cuda/CUDAGuard.h>
// CUDA function declarations
// Implemented in bev_pool_cuda.cu: launches the pooling kernel over
// n_intervals pillars with c feature channels each.
void bev_pool_v2(int c, int n_intervals, const float* depth, const float* feat,
const int* ranks_depth, const int* ranks_feat, const int* ranks_bev,
const int* interval_starts, const int* interval_lengths, float* out);
/*
  Function: pillar pooling (forward, cuda)
  Args:
    depth            : input depth scores, FloatTensor[B, N, D, fH, fW]
    feat             : input features, FloatTensor[B, N, fH, fW, C]
    out              : output BEV features, FloatTensor[B, D_Z, D_Y, D_X, C]
    ranks_depth      : depth index of points, IntTensor[n_points]
    ranks_feat       : feat index of points, IntTensor[n_points]
    ranks_bev        : output index of points, IntTensor[n_points]
    interval_lengths : how many points in each pooled interval, IntTensor[n_intervals]
    interval_starts  : starting position of each pooled interval, IntTensor[n_intervals]
  Return:
    (void) results are written in place into `out`
*/
void bev_pool_v2_forward(
  const at::Tensor _depth, // (B, N, D, fH, fW)
  const at::Tensor _feat, // (B, N, fH, fW, C)
  at::Tensor _out, // (B, D_Z, D_Y, D_X, C)
  const at::Tensor _ranks_depth, // (N_points, ),
  const at::Tensor _ranks_feat, // (N_points, ),
  const at::Tensor _ranks_bev, // (N_points, ),
  const at::Tensor _interval_lengths, // (N_pillar, )
  const at::Tensor _interval_starts // (N_pillar, )
) {
  // Number of feature channels and pooled intervals (pillars).
  int c = _feat.size(4);
  int n_intervals = _interval_lengths.size(0);
  // Ensure the kernel is launched on the device that holds the tensors.
  const at::cuda::OptionalCUDAGuard device_guard(device_of(_depth));
  const float* depth = _depth.data_ptr<float>();
  const float* feat = _feat.data_ptr<float>();
  const int* ranks_depth = _ranks_depth.data_ptr<int>();
  const int* ranks_feat = _ranks_feat.data_ptr<int>();
  const int* ranks_bev = _ranks_bev.data_ptr<int>();
  const int* interval_lengths = _interval_lengths.data_ptr<int>();
  const int* interval_starts = _interval_starts.data_ptr<int>();
  float* out = _out.data_ptr<float>();
  bev_pool_v2(
    c, n_intervals, depth, feat, ranks_depth, ranks_feat,
    ranks_bev, interval_starts, interval_lengths, out
  );
}
bev_pool_cuda.cu代码如下所示:
// Copyright (c) Phigent Robotics. All rights reserved.
// Reference https://arxiv.org/abs/2211.17111
#include <stdio.h>
#include <stdlib.h>
// CUDA kernel: for every (pillar, channel) pair, accumulate
// sum(depth_score * feature) over all frustum points that fall into that
// BEV pillar, and store the result into the output grid.
__global__ void bev_pool_v2_kernel(int c, int n_intervals,
  const float *__restrict__ depth,
  const float *__restrict__ feat,
  const int *__restrict__ ranks_depth,
  const int *__restrict__ ranks_feat,
  const int *__restrict__ ranks_bev,
  const int *__restrict__ interval_starts,
  const int *__restrict__ interval_lengths,
  float* __restrict__ out) {
  // Global thread index; each thread handles one (pillar, channel) pair.
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // Pillar (interval) handled by this thread.
  int index = idx / c;
  // Feature channel handled by this thread.
  int cur_c = idx % c;
  // Return early for the padding threads of the last block.
  if (index >= n_intervals) return;
  // First point of this pillar in the sorted point list.
  int interval_start = interval_starts[index];
  // Number of points belonging to this pillar.
  int interval_length = interval_lengths[index];
  // Accumulator for the weighted feature sum.
  float psum = 0;
  // Pointers to the current depth score and feature value.
  const float* cur_depth;
  const float* cur_feat;
  // Iterate over every point in this pillar.
  for(int i = 0; i < interval_length; i++) {
    // Depth score of the current point.
    cur_depth = depth + ranks_depth[interval_start + i];
    // Feature value of the current point at this channel.
    cur_feat = feat + ranks_feat[interval_start + i] * c + cur_c;
    // Accumulate feature * depth (the per-point outer-product term).
    psum += *cur_feat * *cur_depth;
  }
  // BEV voxel index of this pillar (all points in the interval share it).
  const int* cur_rank = ranks_bev + interval_start;
  // Locate the output slot for (voxel, channel).
  float* cur_out = out + *cur_rank * c + cur_c;
  // Plain store (not +=): voxels without points keep the initial value of
  // `out` — the caller passes a zero-filled tensor (see feat.new_zeros).
  *cur_out = psum;
}
// Host-side launcher for the pooling kernel.
void bev_pool_v2(int c, int n_intervals, const float* depth, const float* feat,
  const int* ranks_depth, const int* ranks_feat, const int* ranks_bev,
  const int* interval_starts, const int* interval_lengths, float* out) {
  // One thread per (pillar, channel) pair: n_intervals * c threads total,
  // 256 threads per block, enough blocks to cover them all (ceil division).
  bev_pool_v2_kernel<<<(int)ceil(((double)n_intervals * c / 256)), 256>>>(
    c, n_intervals, depth, feat, ranks_depth, ranks_feat,
    ranks_bev, interval_starts, interval_lengths, out
  );
}