mmdet3d预处理(上)
文章目录
我们将在下图中展示一个最经典的数据集预处理流程,其中蓝色框表示预处理流程中的各项操作。随着预处理的进行,每一个操作都会添加新的键值(图中标记为绿色)到输出字典中,或者更新当前存在的键值(图中标记为橙色)。
预处理流程中的各项操作主要分为数据加载、预处理、格式化、测试时的数据增强。
接下来将展示一个用于 PointPillars 模型的数据集预处理流程的例子。
# Training-time data pipeline of the PointPillars example. Each dict names one
# transform via `type`; the remaining keys are that transform's arguments.
# `backend_args`, `point_cloud_range` and `class_names` are not defined in this
# snippet and must come from the surrounding config.
train_pipeline = [
# --- Data loading ---
# Adds `points`.
dict(
type='LoadPointsFromFile',
load_dim=5,
use_dim=5,
backend_args=backend_args),
# Updates `points`.
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
backend_args=backend_args),
# Adds the ground-truth entries such as `gt_bboxes_3d` and `gt_labels_3d`.
dict(type='LoadAnnotations3D', with_bbox_3d=True, with_label_3d=True),
# --- Pre-processing ---
# Adds `pcd_trans`, `pcd_rotation`, `pcd_scale_factor`; updates `points` and
# the 3D box fields.
dict(
type='GlobalRotScaleTrans',
rot_range=[-0.3925, 0.3925],
scale_ratio_range=[0.95, 1.05],
translation_std=[0, 0, 0]),
# Adds `flip`, `pcd_horizontal_flip`, `pcd_vertical_flip`; updates `points`
# and the 3D box fields.
dict(type='RandomFlip3D', flip_ratio_bev_horizontal=0.5),
# Updates `points`.
dict(type='PointsRangeFilter', point_cloud_range=point_cloud_range),
# The two object filters update `gt_bboxes_3d` and `gt_labels_3d`.
dict(type='ObjectRangeFilter', point_cloud_range=point_cloud_range),
dict(type='ObjectNameFilter', classes=class_names),
# Updates `points`.
dict(type='PointShuffle'),
# --- Formatting ---
# Updates `points`, `gt_bboxes_3d`, `gt_labels_3d`, `gt_bboxes`, `gt_labels`.
dict(type='DefaultFormatBundle3D', class_names=class_names),
# Adds `img_meta` and removes every key that is not listed in `keys`.
dict(type='Collect3D', keys=['points', 'gt_bboxes_3d', 'gt_labels_3d'])
]
# Test-time data pipeline of the PointPillars example. Each dict names one
# transform via `type`; the remaining keys are that transform's arguments.
# `backend_args`, `point_cloud_range` and `class_names` are not defined in this
# snippet and must come from the surrounding config.
test_pipeline = [
# --- Data loading ---
# Adds `points`.
dict(
type='LoadPointsFromFile',
load_dim=5,
use_dim=5,
backend_args=backend_args),
# Updates `points`.
dict(
type='LoadPointsFromMultiSweeps',
sweeps_num=10,
backend_args=backend_args),
# --- Test-time augmentation wrapper ---
# Updates `scale`, `pcd_scale_factor`, `flip`, `flip_direction`,
# `pcd_horizontal_flip`, `pcd_vertical_flip`. In this config all flip options
# are False and the point-cloud scale ratio is 1.0.
dict(
type='MultiScaleFlipAug',
img_scale=(1333, 800),
pts_scale_ratio=1.0,
flip=False,
pcd_horizontal_flip=False,
pcd_vertical_flip=False,
# Nested transforms listed under the wrapper.
transforms=[
# Configured with rotation range [0, 0], scale range [1, 1] and zero
# translation std.
dict(
type='GlobalRotScaleTrans',
rot_range=[0, 0],
scale_ratio_range=[1., 1.],
translation_std=[0, 0, 0]),
# No flip ratio is given here.
dict(type='RandomFlip3D'),
# Updates `points`.
dict(
type='PointsRangeFilter', point_cloud_range=point_cloud_range),
# Formatting with `with_label=False`.
dict(
type='DefaultFormatBundle3D',
class_names=class_names,
with_label=False),
# Only `points` is collected.
dict(type='Collect3D', keys=['points'])
])
]
对于每项操作,我们将列出相关的被添加/更新/移除的字典项。
数据加载
LoadPointsFromFile
- 添加:points
LoadPointsFromMultiSweeps
- 更新:points
LoadAnnotations3D
- 添加:gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels, pts_instance_mask, pts_semantic_mask, bbox3d_fields, pts_mask_fields, pts_seg_fields
预处理
GlobalRotScaleTrans
- 添加:pcd_trans, pcd_rotation, pcd_scale_factor
- 更新:points, *bbox3d_fields
RandomFlip3D
- 添加:flip, pcd_horizontal_flip, pcd_vertical_flip
- 更新:points, *bbox3d_fields
PointsRangeFilter
- 更新:points
ObjectRangeFilter
- 更新:gt_bboxes_3d, gt_labels_3d
ObjectNameFilter
- 更新:gt_bboxes_3d, gt_labels_3d
PointShuffle
- 更新:points
PointsRangeFilter
- 更新:points
格式化
DefaultFormatBundle3D
- 更新:points, gt_bboxes_3d, gt_labels_3d, gt_bboxes, gt_labels
Collect3D
- 添加:img_meta (由 meta_keys 指定的键值构成的 img_meta)
- 移除:所有除 keys 指定的键值以外的其他键值
测试时的数据增强
MultiScaleFlipAug
- 更新: scale, pcd_scale_factor, flip, flip_direction, pcd_horizontal_flip, pcd_vertical_flip (与这些指定的参数对应的增强后的数据列表)
下面会逐一拆解上述预处理涉及到的各个函数。
本篇首先介绍3d数据在预处理时的 Voxel(体素化)的过程。
基类 DetDataPreprocessor
文件路径:/home/randy/anaconda3/envs/mmdet3d_env_py38/lib/python3.8/site-packages/mmdet/models/data_preprocessors/data_preprocessor.py
class DetDataPreprocessor(ImgDataPreprocessor)
- 支持批量扩充。
- 考虑到目标检测任务,它将另外附加batch_input_shape和pad_shape到data_samples。
它提供如下数据预处理
- 整理数据并将其移动到目标设备。
- 使用定义的 pad_value 将输入填充到当前批次的最大尺寸,且填充后的尺寸可被定义的 pad_size_divisor 整除。
- 将输入堆叠到batch_inputs。
- 如果输入的形状为 (3, H, W),则将输入从 bgr 转换为 rgb。
- 使用定义的标准差(std)和均值(mean)对图像进行归一化。
- 在训练期间进行批量扩充。
派生类 Det3DDataPreprocessor
文件路径:mmdetection3d/mmdet3d/models/data_preprocessors/data_preprocessor.py
class Det3DDataPreprocessor(DetDataPreprocessor)
功能
-
Collate and move image and point cloud data to the target device.
-
For point cloud data:
-
If no voxelization, directly return list of point cloud data.
-
If voxelization is applied, voxelize point cloud according to `voxel_type` and obtain `voxels`.
-
体素化包括:
- dynamic_voxelize_forward
- hard_voxelize_forward:非确定性版本实现速度更快,但是牺牲了确定性
非确定性版本相当快,但其结果不是确定性的(同样的输入不保证得到完全相同的输出)。
配置文件设置
# Data preprocessor config that enables voxelization.
# `voxel_size` and `point_cloud_range` are not defined in this snippet and must
# come from the surrounding config. The trailing comma suggests this is an
# excerpt from an enclosing dict(...) call -- TODO confirm against the full config.
data_preprocessor=dict(
    type='Det3DDataPreprocessor',
    voxel=True,
    voxel_layer=dict(
        max_num_points=30,  # maximum number of points kept in each voxel
        voxel_size=voxel_size,
        max_voxels=(30000, 40000),  # maximum number of voxels for (train, test)
        point_cloud_range=point_cloud_range)),
dynamic_voxelize_kernel
源码
- gpu版:projects/BEVFusion/bevfusion/ops/voxel/src/voxelization_cuda.cu
- cpu版:projects/BEVFusion/bevfusion/ops/voxel/src/voxelization_cpu.cpp
// Dynamic voxelization: writes the integer voxel coordinate of every point.
//
// For each point index in [0, num_points) the first three features of the
// point are read as (x, y, z) and mapped per axis to
//     c = floor((p - coors_min) / voxel)
// The axes are checked in the order x, y, z. As soon as one coordinate falls
// outside [0, grid) the point is marked by writing -1 and the remaining axes
// are not evaluated; otherwise (c_x, c_y, c_z) is stored.
//
// Parameters
//   points        input, num_points rows of num_features values (row-major).
//   coors         output, num_points rows of NDim values (row-major).
//   voxel_x/y/z   divisor applied per axis (the voxel size).
//   coors_*_min   offset subtracted from the point coordinate per axis.
//   coors_*_max   not used by this kernel.
//   grid_x/y/z    exclusive upper bound of the voxel coordinate per axis.
//   num_points    number of points to process.
//   num_features  stride (in elements) between consecutive points.
//   NDim          stride (in elements) between consecutive coors rows.
//
// NOTE(review): CUDA_1D_KERNEL_LOOP is defined outside this snippet. If it
// expands to a strided loop in which one thread handles several indices, an
// early `return` would silently skip that thread's remaining points, so
// rejected points use `continue` instead.
template <typename T, typename T_int>
__global__ void dynamic_voxelize_kernel(
    const T* points, T_int* coors, const float voxel_x, const float voxel_y,
    const float voxel_z, const float coors_x_min, const float coors_y_min,
    const float coors_z_min, const float coors_x_max, const float coors_y_max,
    const float coors_z_max, const int grid_x, const int grid_y,
    const int grid_z, const int num_points, const int num_features,
    const int NDim) {
  CUDA_1D_KERNEL_LOOP(index, num_points) {
    // Row pointers for this point, computed once to save index arithmetic.
    auto points_offset = points + index * num_features;
    auto coors_offset = coors + index * NDim;

    const int c_x = floor((points_offset[0] - coors_x_min) / voxel_x);
    if (c_x < 0 || c_x >= grid_x) {
      // Only the first component is written here; consumers test coors[0].
      coors_offset[0] = -1;
      continue;
    }

    const int c_y = floor((points_offset[1] - coors_y_min) / voxel_y);
    if (c_y < 0 || c_y >= grid_y) {
      coors_offset[0] = -1;
      coors_offset[1] = -1;
      continue;
    }

    const int c_z = floor((points_offset[2] - coors_z_min) / voxel_z);
    if (c_z < 0 || c_z >= grid_z) {
      coors_offset[0] = -1;
      coors_offset[1] = -1;
      coors_offset[2] = -1;
    } else {
      coors_offset[0] = c_x;
      coors_offset[1] = c_y;
      coors_offset[2] = c_z;
    }
  }
}
思路:
- 根据 x、y、z 三个方向的体素尺寸 voxel_x、voxel_y、voxel_z,计算每个点所在的体素坐标(c_x、c_y、c_z),并判断其与边界的关系:坐标必须落在 [0, grid_x)、[0, grid_y)、[0, grid_z) 内(grid_* 为各方向的体素个数),越界的点坐标记为 -1。
int c_x = floor((points_offset[0] - coors_x_min) / voxel_x);
int c_y = floor((points_offset[1] - coors_y_min) / voxel_y);
int c_z = floor((points_offset[2] - coors_z_min) / voxel_z);
- 赋值给offset
coors_offset[0] = c_x;
coors_offset[1] = c_y;
coors_offset[2] = c_z;
hard_voxelize_kernel
不确定性在于不是每个点都要,也不是每个voxel都要,每个voxel保留一定的点,每帧数据保留一定量的voxel,其余的都需要舍去。
所以很多计算出来的点和voxelidx不符合要求直接省略了,不必继续存储了。
// CPU hard voxelization.
//
// Runs dynamic voxelization to obtain one voxel coordinate per point, then
// walks the points in order and
//   * assigns a new voxel index to every coordinate seen for the first time,
//     unless max_voxels (when not -1) has already been reached;
//   * copies the point's features into the next free slot of its voxel,
//     unless max_points (when not -1) has already been reached.
//
// Parameters
//   points                input points, indexed [point][feature].
//   voxels                output, indexed [voxel][slot][feature].
//   coors                 output, voxel coordinate of each created voxel.
//   num_points_per_voxel  output/in-place counter of points stored per voxel.
//   coor_to_voxelidx      lookup from a 3-component coordinate to its voxel
//                         index; -1 means "no voxel yet".
//   voxel_num             in/out running count of created voxels.
//   voxel_size, coors_range, grid_size
//                         forwarded unchanged to dynamic_voxelize_kernel.
//   max_points            per-voxel point limit, -1 disables the limit.
//   max_voxels            voxel limit, -1 disables the limit.
//   num_points, num_features, NDim
//                         number of points, features per point, and
//                         components per coordinate row.
template <typename T, typename T_int>
void hard_voxelize_kernel(const torch::TensorAccessor<T, 2> points,
                          torch::TensorAccessor<T, 3> voxels,
                          torch::TensorAccessor<T_int, 2> coors,
                          torch::TensorAccessor<T_int, 1> num_points_per_voxel,
                          torch::TensorAccessor<T_int, 3> coor_to_voxelidx,
                          int& voxel_num, const std::vector<float> voxel_size,
                          const std::vector<float> coors_range,
                          const std::vector<int> grid_size,
                          const int max_points, const int max_voxels,
                          const int num_points, const int num_features,
                          const int NDim) {
  // Scratch int tensor on the CPU: one NDim-wide coordinate row per point.
  at::Tensor point_coors = at::zeros(
      {num_points, NDim}, at::TensorOptions().dtype(at::kInt).device(at::kCPU));

  // Step 1: dynamic voxelization fills in the per-point coordinates.
  dynamic_voxelize_kernel<T, int>(points, point_coors.accessor<int, 2>(),
                                  voxel_size, coors_range, grid_size,
                                  num_points, num_features, NDim);

  // Step 2: enforce the max_voxels / max_points limits while scattering.
  auto grid_coor = point_coors.accessor<int, 2>();
  for (int pt = 0; pt < num_points; ++pt) {
    const int c0 = grid_coor[pt][0];
    if (c0 == -1) continue;  // points whose first component is -1 are skipped
    const int c1 = grid_coor[pt][1];
    const int c2 = grid_coor[pt][2];

    int voxel = coor_to_voxelidx[c0][c1][c2];
    if (voxel == -1) {
      // First point seen at this coordinate: try to open a new voxel.
      const bool voxel_limit_hit = (max_voxels != -1) && (voxel_num >= max_voxels);
      if (voxel_limit_hit) continue;

      voxel = voxel_num;
      voxel_num += 1;
      coor_to_voxelidx[c0][c1][c2] = voxel;
      for (int d = 0; d < NDim; ++d) {
        coors[voxel][d] = grid_coor[pt][d];
      }
    }

    // Append the point to its voxel if there is still room.
    const int slot = num_points_per_voxel[voxel];
    if (max_points != -1 && slot >= max_points) continue;

    for (int f = 0; f < num_features; ++f) {
      voxels[voxel][slot][f] = points[pt][f];
    }
    num_points_per_voxel[voxel] += 1;
  }
}
思路:
- 使用动态体素化获取每个点的体素坐标,然后检查每个体素的最大点数(max_points)和最大体素个数(max_voxels)的限制;
- 记录每个点对应的 voxel idx
- 将点放进 voxel中,并记录每个voxel包含的点数
CUDA版本的步骤略有不同:
- 计算 x、y、z 各方向的网格尺寸(即各方向的体素个数)
const int grid_x = round((coors_x_max - coors_x_min) / voxel_x);
const int grid_y = round((coors_y_max - coors_y_min) / voxel_y);
const int grid_z = round((coors_z_max - coors_z_min) / voxel_z);
-
将点和voxel 序号进行对应
-
计算点对应的体素坐标
-
每个体素内只保留 max_points 个点,超过部分不要
-
// For every valid point, finds its position inside its voxel and the first
// point that shares the same voxel coordinate.
//
// For point `index` the kernel scans all earlier points (0 .. index-1) and
// counts those with an identical 3-component coordinate:
//   * point_to_pointidx[index] is set to the index of the first earlier point
//     with the same coordinate, or to `index` itself if there is none;
//   * point_to_voxelidx[index] is set to that count (the point's slot inside
//     the voxel) when the count is below max_points, and is left untouched
//     otherwise.
// Points whose first coordinate component is -1 are skipped and none of their
// outputs are written. The output buffers are presumably pre-filled with -1 by
// the caller -- TODO confirm at the launch site.
//
// Parameters
//   coor               input, num_points rows of NDim values (row-major).
//   point_to_voxelidx  output, slot of each point inside its voxel.
//   point_to_pointidx  output, index of the first point with the same coor.
//   max_points         per-voxel point limit.
//   max_voxels         not used by this kernel.
//   num_points         number of points to process.
//   NDim               stride (in elements) between consecutive coor rows.
//
// NOTE(review): CUDA_1D_KERNEL_LOOP is defined outside this snippet. If it
// expands to a strided loop in which one thread handles several indices, an
// early `return` would silently skip that thread's remaining points, so this
// kernel uses `continue` / `break` to finish only the current point.
template <typename T_int>
__global__ void point_to_voxelidx_kernel(const T_int* coor,
                                         T_int* point_to_voxelidx,
                                         T_int* point_to_pointidx,
                                         const int max_points,
                                         const int max_voxels,
                                         const int num_points, const int NDim) {
  CUDA_1D_KERNEL_LOOP(index, num_points) {
    auto coor_offset = coor + index * NDim;
    // Skip invalid points.
    if ((index >= num_points) || (coor_offset[0] == -1)) continue;

    int num = 0;
    const int coor_x = coor_offset[0];
    const int coor_y = coor_offset[1];
    const int coor_z = coor_offset[2];
    // Only points that come before `index` are considered.
    for (int i = 0; i < index; ++i) {
      auto prev_coor = coor + i * NDim;
      if (prev_coor[0] == -1) continue;

      // Count every earlier point that has the same coordinate.
      if ((prev_coor[0] == coor_x) && (prev_coor[1] == coor_y) &&
          (prev_coor[2] == coor_z)) {
        num++;
        if (num == 1) {
          // Remember the first point that showed up at this coordinate.
          point_to_pointidx[index] = i;
        } else if (num >= max_points) {
          // Voxel is already full: stop scanning. Both checks below are false
          // in this case, so nothing more is written for this point.
          break;
        }
      }
    }
    if (num == 0) {
      // No earlier point shares this coordinate: the point refers to itself.
      point_to_pointidx[index] = index;
    }
    if (num < max_points) {
      // Still room in the voxel: record the point's slot inside it.
      point_to_voxelidx[index] = num;
    }
  }
}
```
3. 确定体素个数以及体素坐标序号(该步骤可加速10倍)
```cpp
// Assigns voxel indices to points and counts the points stored in each voxel.
//
// The body is a plain sequential loop over all num_points points (there is no
// thread indexing in it). For each point, depending on point_to_voxelidx[i]:
//   * -1  : the point is skipped (invalid point or voxel already full);
//   *  0  : the point opens a new voxel, provided voxel_num[0] is still below
//           max_voxels; the new index is stored in coor_to_voxelidx[i] and
//           the voxel's point count is set to 1;
//   * > 0 : the point joins the voxel of the point referenced by
//           point_to_pointidx[i], if that point has a voxel (index != -1).
//
// Parameters
//   num_points_per_voxel  output, number of points stored per voxel.
//   point_to_voxelidx     input, slot of each point inside its voxel.
//   point_to_pointidx     input, index of the first point with the same coor.
//   coor_to_voxelidx      in/out, voxel index of each point (indexed by point).
//   voxel_num             in/out, voxel_num[0] is the running voxel count.
//   max_points            not used by this kernel.
//   max_voxels            upper limit for the number of voxels.
//   num_points            number of points to process.
template <typename T_int>
__global__ void determin_voxel_num(
    T_int* num_points_per_voxel, T_int* point_to_voxelidx,
    T_int* point_to_pointidx, T_int* coor_to_voxelidx, T_int* voxel_num,
    const int max_points, const int max_voxels, const int num_points) {
  for (int pt = 0; pt < num_points; ++pt) {
    const int slot = point_to_voxelidx[pt];

    // Invalid point, or a point that did not fit within max_points.
    if (slot == -1) continue;

    if (slot != 0) {
      // Follow the reference to the first point with the same coordinate and
      // reuse its voxel, if it was given one.
      const int first_pt = point_to_pointidx[pt];
      const int owner_voxel = coor_to_voxelidx[first_pt];
      if (owner_voxel == -1) continue;
      coor_to_voxelidx[pt] = owner_voxel;
      num_points_per_voxel[owner_voxel] += 1;
      continue;
    }

    // slot == 0: this point is the first one of a new voxel.
    const int new_voxel = voxel_num[0];
    if (voxel_num[0] >= max_voxels) continue;  // voxel limit reached
    voxel_num[0] += 1;
    coor_to_voxelidx[pt] = new_voxel;
    num_points_per_voxel[new_voxel] = 1;
  }
}
- 拷贝点特征到体素,num_features 即每个点的特征维数(点的 x、y、z、intensity 及其他特征)。
// Copies point features into the voxel buffer, one feature value per loop
// index.
//
// Loop index `thread_idx` in [0, nthreads) addresses a single feature value:
// point = thread_idx / num_features, feature = thread_idx % num_features.
// The value is copied only when the point has both a slot inside its voxel
// (point_to_voxelidx > -1) and a voxel index (coor_to_voxelidx > -1).
//
// Parameters
//   nthreads           number of loop indices to process.
//   points             input, flat feature array indexed by thread_idx.
//   point_to_voxelidx  input, slot of each point inside its voxel.
//   coor_to_voxelidx   input, voxel index of each point.
//   voxels             output, laid out as [voxel][max_points][num_features].
//   max_points         number of point slots per voxel.
//   num_features       number of features per point.
//   num_points, NDim   not used by this kernel.
template <typename T, typename T_int>
__global__ void assign_point_to_voxel(const int nthreads, const T* points,
                                      T_int* point_to_voxelidx,
                                      T_int* coor_to_voxelidx, T* voxels,
                                      const int max_points,
                                      const int num_features,
                                      const int num_points, const int NDim) {
  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
    const int pt = thread_idx / num_features;
    const int slot = point_to_voxelidx[pt];
    const int voxel = coor_to_voxelidx[pt];

    const bool has_destination = (slot >= 0) && (voxel >= 0);
    if (has_destination) {
      // Start of the voxel's storage, then the start of the point's slot.
      auto slot_base =
          voxels + voxel * max_points * num_features + slot * num_features;
      const int feat = thread_idx % num_features;
      slot_base[feat] = points[thread_idx];
    }
  }
}
-
// Copies the voxel coordinate of each voxel's first point into voxel_coors,
// one coordinate component per loop index.
//
// Loop index `thread_idx` in [0, nthreads) addresses a single coordinate
// component: point = thread_idx / NDim, component = thread_idx % NDim.
// The component is copied only for points that occupy slot 0 of their voxel
// (point_to_voxelidx == 0) and that have a voxel index (coor_to_voxelidx > -1).
//
// Parameters
//   nthreads           number of loop indices to process.
//   coor               input, flat coordinate array indexed by thread_idx.
//   point_to_voxelidx  input, slot of each point inside its voxel.
//   coor_to_voxelidx   input, voxel index of each point.
//   voxel_coors        output, NDim coordinate components per voxel.
//   num_points         not used by this kernel.
//   NDim               number of components per coordinate row.
//
// Template parameter T does not appear in the signature and is kept only so
// existing explicit instantiations keep working.
template <typename T, typename T_int>
__global__ void assign_voxel_coors(const int nthreads, T_int* coor,
                                   T_int* point_to_voxelidx,
                                   T_int* coor_to_voxelidx, T_int* voxel_coors,
                                   const int num_points, const int NDim) {
  CUDA_1D_KERNEL_LOOP(thread_idx, nthreads) {
    int index = thread_idx / NDim;
    int num = point_to_voxelidx[index];
    int voxelidx = coor_to_voxelidx[index];
    if (num == 0 && voxelidx > -1) {
      // Only the first point of a voxel writes that voxel's coordinate.
      auto coors_offset = voxel_coors + voxelidx * NDim;
      int k = thread_idx % NDim;
      coors_offset[k] = coor[thread_idx];
    }
  }
}