集成了CUDA-PointPillars的程序有时突然崩溃,而且崩溃的位置怪异,发生在CUDA的cudaDeviceSynchronize()或cudaStreamsynchronize()或cudaEventSynchronize、cudaEventRecord()之类的API里,反正是平时不大可能发生的地方,凭以往经验直觉,原因应该不是在这些API里,而是因该其他地方有非法访问把内存搞乱了,调查后发现果然是CUDA-PointPillars里的预处理部分代码有漏洞:
__global__ void generateVoxels_random_kernel(float *points, size_t points_size,
float min_x_range, float max_x_range,
float min_y_range, float max_y_range,
float min_z_range, float max_z_range,
float pillar_x_size, float pillar_y_size, float pillar_z_size,
int grid_y_size, int grid_x_size,
unsigned int *mask, float *voxels)
{
int point_idx = blockIdx.x * blockDim.x + threadIdx.x;
if(point_idx >= points_size) return;
float4 point = ((float4*)points)[point_idx];
if(point.x<min_x_range||point.x>=max_x_range
|| point.y<min_y_range||point.y>=max_y_range
|| point.z<min_z_range||point.z>=max_z_range) return;
int voxel_idx = floorf((point.x - min_x_range)/pillar_x_size);
int voxel_idy = floorf((point.y - min_y_range)/pillar_y_size);
//在GPU上(CPU上表现不同),上面这里floorf()舍入精度非常高(小数点后位数多于5)的数据时可能会导致voxel_idx或voxel_idy的值越界,把float类型数值赋值给int类型变量也会发生舍入+1,从而造成voxel_idx可能等于grid_x_size或者voxel_idy等于grid_y_size,从而造成下面对mask和voxels数组的访问发生越界访问,把内存弄脏而发生崩溃!
unsigned int voxel_index = voxel_idy * grid_x_size
+ voxel_idx;
unsigned int point_id = atomicAdd(&(mask[voxel_index]), 1);
if(point_id >= POINTS_PER_VOXEL) return;
float *address = voxels + (voxel_index*POINTS_PER_VOXEL + point_id)*4;
atomicExch(address+0, point.x);
atomicExch(address+1, point.y);
atomicExch(address+2, point.z);
atomicExch(address+3, point.w);
}
__global__ void generateBaseFeatures_kernel(unsigned int *mask, float *voxels,
int grid_y_size, int grid_x_size,
unsigned int *pillar_num,
float *voxel_features,
unsigned int *voxel_num,
unsigned int *voxel_idxs)
{
unsigned int voxel_idx = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int voxel_idy = blockIdx.y * blockDim.y + threadIdx.y;
if(voxel_idx >= grid_x_size ||voxel_idy >= grid_y_size) return;
unsigned int voxel_index = voxel_idy * grid_x_size
+ voxel_idx;
unsigned int count = mask[voxel_index];
if( !(count>0) ) return;
count = count<POINTS_PER_VOXEL?count:POINTS_PER_VOXEL;
unsigned int current_pillarId = 0;
current_pillarId = atomicAdd(pillar_num, 1);
//这里没有对current_pillarId做是否达于等于MAX_VOXELS的检查!当点云足够多分布的范围足够大,以至于voxelization得到的voxels大于MAX_VOXELS时,下面对voxel_features数组的访问会发生越界访问!从而把内存弄脏造成崩溃!
voxel_num[current_pillarId] = count;
uint4 idx = {0, 0, voxel_idy, voxel_idx};
((uint4*)voxel_idxs)[current_pillarId] = idx;
for (int i=0; i<count; i++){
int inIndex = voxel_index*POINTS_PER_VOXEL + i;
int outIndex = current_pillarId*POINTS_PER_VOXEL + i;
((float4*)voxel_features)[outIndex] = ((float4*)voxels)[inIndex];
}
// clear buffer for next infer
atomicExch(mask + voxel_index, 0);
}
尝试做了个fix后crash消失了: Fixed two crashes caused by two defects in the preprocessing kernel f… · arnoldfychen/CUDA-PointPillars@2c9c4a1 · GitHub