本文以PRBonn的https://github.com/PRBonn/rangenet_lib,进行前后端加速处理,其中前处理包括投影变换、建立输入图像、标准化。后处理包括逆投影。C++代码过程,上面的讲解十分详细。我们对这部分进行cuda进行处理。
1、函数初始化
void Init(float fov_up, float fov_down, int img_w, int img_h,
int img_d, int m_classes, const int num_threads,
const float* imgmean, const float* imgstd) {
m_fov_up_ = fov_up; //上视野范围
m_fov_down_ = fov_down; //下视野范围
m_fov_ = std::abs(fov_down) + std::abs(fov_up);
m_img_h_ = img_h;
m_img_w_ = img_w;
m_img_d_ = img_d;
m_classes_ = m_classes; //类别种类
m_num_threads_ = num_threads;
// imgmean
cudaMalloc((void**)&m_imgmean_ptr_, 5 * sizeof(float));
cudaMemcpy(m_imgmean_ptr_, imgmean, 5 * sizeof(float),
cudaMemcpyHostToDevice);
// imgstd
cudaMalloc((void**)&m_imgstd_ptr_, 5 * sizeof(float));
cudaMemcpy(m_imgstd_ptr_, imgstd, 5 * sizeof(float), cudaMemcpyHostToDevice);
}
2、预处理
__global__ void project_kernel(const float* dev_points_ptr,
const int num_points, float* projected_data_ptr,
float* projx_ptr, float* projy_ptr,
int* m_invalid_points_ptr_, float fov_down,
float fov, const int img_w, const int img_h,
const int img_d) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id >= num_points) {
return;
}
int index = id;
float x = dev_points_ptr[4 * index]; // input depth
float y = dev_points_ptr[4 * index + 1];
float z = dev_points_ptr[4 * index + 2];
float intensity = dev_points_ptr[4 * index + 3];
float range = std::sqrt(x * x + y * y + z * z);
float yaw = -std::atan2(y, x);
float pitch = std::asin(z / range); //[-pi/2,+pi/2]
float proj_x = 0.5 * (yaw / M_PI + 1.0); // in [0.0, 1.0]
float proj_y = 1.0 - (pitch + std::abs(fov_down)) / fov; // in [0.0, 1.0]
proj_x *= img_w; // in [0.0, W]
proj_y *= img_h; // in [0.0, H]
proj_x = std::floor(proj_x); //向下取整数
proj_x = (img_w - 1.0f) < proj_x ? (img_w - 1.0f) : proj_x;
proj_x = proj_x > 0.0f ? proj_x : 0.0f; // in [0,W-1]
proj_y = std::floor(proj_y); //向下取整数
proj_y = (img_h - 1.0f) < proj_y ? (img_h - 1.0f) : proj_y;
proj_y = proj_y > 0.0f ? proj_y : 0.0f; // in [0,H-1]
int projected_data_index = int(proj_y * img_w + proj_x); //
projected_data_ptr[img_d * projected_data_index] = range;
projected_data_ptr[img_d * projected_data_index + 1] = x;
projected_data_ptr[img_d * projected_data_index + 2] = y;
projected_data_ptr[img_d * projected_data_index + 3] = z;
projected_data_ptr[img_d * projected_data_index + 4] = intensity;
projx_ptr[index] = proj_x;
projy_ptr[index] = proj_y;
}
__global__ void normal_kernel(float* projected_data_ptr, int* invalid_idxs_ptr,
float* normal_data_ptr, const float* imgmean_ptr,
const float* imgstd_ptr, const int num_points,
const int img_w, const int img_h,
const int img_d) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id >= num_points) {
return;
}
float range = projected_data_ptr[img_d * id];
float x = projected_data_ptr[img_d * id + 1];
float y = projected_data_ptr[img_d * id + 2];
float z = projected_data_ptr[img_d * id + 3];
float intensity = projected_data_ptr[img_d * id + 4];
bool all_zeros = false;
if (range == 0.0f && x == 0.0f && y == 0.0f && z == 0.0f &&
intensity == 0.0f) {
invalid_idxs_ptr[id] = -1; // invalid
all_zeros = true;
}
int channel_offset = img_h * img_w;
for (size_t i = 0; i < img_d; i++) {
if (!all_zeros)
projected_data_ptr[img_d * id + i] =(projected_data_ptr[img_d * id + i] - imgmean_ptr[i]) / imgstd_ptr[i];
int buffer_idx = channel_offset * i + id;
normal_data_ptr[buffer_idx] = projected_data_ptr[img_d * id + i];
}
}
void DoPreprocessPointsCuda(const float* dev_points,const int num_points,float* range_image) {
// point cloud
if (m_points_ptr_ != nullptr) {
cudaFree((void*)m_points_ptr_);
m_points_ptr_ = nullptr;
}
// project x
if (m_projx_ptr_ != nullptr) {
cudaFree((void*)m_projx_ptr_);
m_projx_ptr_ = nullptr;
}
// project y
if (m_projy_ptr_ != nullptr) {
cudaFree((void*)&m_projy_ptr_);
m_projy_ptr_ = nullptr;
}
// invalid index
if (m_invalid_idxs_ptr_ != nullptr) {
cudaFree((void*)&m_invalid_idxs_ptr_);
m_invalid_idxs_ptr_ = nullptr;
}
// determine detection roi
if (m_invalid_points_ptr_ != nullptr) {
cudaFree((void*)&m_invalid_points_ptr_);
m_invalid_points_ptr_ = nullptr;
}
cudaMalloc((void**)&m_points_ptr_, num_points * 4 * sizeof(float)); // point
cudaMemcpyAsync(m_points_ptr_, dev_points, num_points * 4 * sizeof(float),cudaMemcpyHostToDevice, *m_cudaStream_);
cudaMalloc((void**)&m_projx_ptr_, num_points * sizeof(float)); // project x
cudaMalloc((void**)&m_projy_ptr_, num_points * sizeof(float)); // project y
cudaMalloc((void**)&m_invalid_points_ptr_,num_points * sizeof(int)); // this point need detection or not
cudaMemset((void*)m_invalid_points_ptr_, PD_VALID, num_points * sizeof(int));
cudaMalloc((void**)&m_invalid_idxs_ptr_, m_img_w_ * m_img_h_ * sizeof(int));
cudaMemset((void*)m_invalid_idxs_ptr_, PD_VALID,m_img_w_ * m_img_h_ * sizeof(int));
// project
float* projected_data_ptr = nullptr;
cudaMalloc((void**)&projected_data_ptr,m_img_w_ * m_img_h_ * m_img_d_ * sizeof(float));
cudaMemset((void*)projected_data_ptr, 0,m_img_w_ * m_img_h_ * m_img_d_ * sizeof(float));
cudaStreamSynchronize(*m_cudaStream_);
int num_block = DIVUP(num_points, m_num_threads_);
project_kernel<<<num_block, m_num_threads_>>>(
m_points_ptr_, num_points, projected_data_ptr, m_projx_ptr_, m_projy_ptr_,
m_invalid_points_ptr_, m_fov_down_, m_fov_, m_img_w_, m_img_h_, m_img_d_);
// normal
num_block = DIVUP(m_img_w_ * m_img_h_, m_num_threads_);
normal_kernel<<<num_block, m_num_threads_>>>(
projected_data_ptr, m_invalid_idxs_ptr_, range_image, m_imgmean_ptr_,
m_imgstd_ptr_, m_img_w_ * m_img_h_, m_img_w_, m_img_h_, m_img_d_);
cudaFree(projected_data_ptr);
}
3、后处理
__global__ void outdata_kernel(const float* predicted_data_ptr,
const int* invalid_idxs_ptr,
float* range_data_ptr, const int num_points,
const int img_w, const int img_h,
const int n_classes) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id >= num_points) {
return;
}
int channel_offset = img_w * img_h;
int valid_value = invalid_idxs_ptr[id];
for (size_t i = 0; i < n_classes; i++) {
int buffer_idx = channel_offset * i + id;
int in = n_classes * id + i;
range_data_ptr[in] = predicted_data_ptr[buffer_idx];
if (valid_value == -1) // invalid point
range_data_ptr[in] = -1;
}
}
__global__ void unproject_kernel(float* projected_data_ptr,
float* semantic_data_ptr, const int num_points,
const float* proj_xs, const float* proj_ys,
int* m_invalid_points_ptr_, const int img_w,
const int img_h, const int n_class) {
int id = blockIdx.x * blockDim.x + threadIdx.x;
if (id >= num_points) {
return;
}
int index = (int)(proj_ys[id] * img_w + proj_xs[id]);
for (int i = 0; i < n_class; i++) {
semantic_data_ptr[n_class * id + i] =(m_invalid_points_ptr_[id] != PD_INVALID)
? projected_data_ptr[n_class * index + i]
: -1;
}
}
void DoPostprocessPointsCuda(const float* predicted_data_ptr,const int num_points_,float* semantic_data_ptr) {
float* range_data_ptr = nullptr;
cudaMalloc((void**)&range_data_ptr,m_img_h_ * m_img_w_ * m_classes_ * sizeof(float));
int pixel_points = m_img_h_ * m_img_w_;
int num_block = DIVUP(pixel_points, m_num_threads_);
outdata_kernel<<<num_block, m_num_threads_>>>(
predicted_data_ptr, m_invalid_idxs_ptr_, range_data_ptr, pixel_points,
m_img_w_, m_img_h_, m_classes_);
// unproject
float* semantic_gpu_data_ptr = nullptr;
pixel_points = num_points_;
num_block = DIVUP(pixel_points, m_num_threads_);
cudaMalloc((void**)&semantic_gpu_data_ptr,num_points_ * m_classes_ * sizeof(float));
cudaMemset((void*)semantic_gpu_data_ptr, 0,num_points_ * m_classes_ * sizeof(float));
unproject_kernel<<<num_block, m_num_threads_>>>(
range_data_ptr, semantic_gpu_data_ptr, pixel_points, m_projx_ptr_,
m_projy_ptr_, m_invalid_points_ptr_, m_img_w_, m_img_h_, m_classes_);
// copy
cudaMemcpyAsync(semantic_data_ptr, semantic_gpu_data_ptr,
num_points_ * m_classes_ * sizeof(float),
cudaMemcpyDeviceToHost, *m_cudaStream_);
cudaStreamSynchronize(*m_cudaStream_);
cudaFree(range_data_ptr);
cudaFree(semantic_gpu_data_ptr);
}
4、连接过程