3D点云检测网络训练踩坑记录(看这一篇就够了)

1. 数据

数据集链接:The KITTI Vision Benchmark Suite(官网)
简介:KITTI是自动驾驶领域最重要的数据集之一,提供了大量的真实场景数据,用来更好的度量和测试算法的表现。除3D目标检测外,KITTI数据集还可用于评测立体图像,光流,视觉测距,3D跟踪等计算机视觉技术在车载环境下的性能。每张图像中最多达15辆车和30个行人目标,同时包含了各种程度的遮挡与截断。

KITTI数据迅雷下载地址
图片下载:https://s3.eu-central-1.amazonaws.com/avg-kitti/data_object_image_2.zip
点云下载:https://s3.eu-central-1.amazonaws.com/avg-kitti/data_object_velodyne.zip
标签下载:https://s3.eu-central-1.amazonaws.com/avg-kitti/data_object_label_2.zip
校正文件下载:https://s3.eu-central-1.amazonaws.com/avg-kitti/data_object_calib.zip

2. 依赖库

2.1 spconv安装

windows系统下安装稀疏卷积库spconv
确定环境中的cuda版本

import torch
print(torch.version.cuda) #11.6
pip install spconv-cu116

参考https://zhuanlan.zhihu.com/p/650809788?utm_id=0

2.2 SharedArray安装

windows系统下SharedArray

1.官网下载https://github.com/imaginary-friend94/Shared-Array-for-Windows
在这里插入图片描述
2.修改setup.py

from setuptools import setup, find_packages, Extension
import numpy as np
import sys
 
if sys.platform == "linux" or sys.platform == "linux2":
    libraries_ext = ["rt"]
elif sys.platform == "win32":
    libraries_ext = []
 
 
ext_modules = [
    Extension('SharedArray',
    #extra_compile_args=["-std=c++11"],  #这里更改了c++ 11,我这里使用这种方法会报错,故而取消这个
    sources = ['shared_memory_python.cpp'],
    libraries = libraries_ext,
    language="c++")    #增加了编译语言
]
 
setup(
        name = 'SharedArray',
        version = '1.2',
        include_dirs = [np.get_include()], #Add Include path of numpy
        ext_modules = ext_modules
      )

3修改 shared_memory_python.cpp

#if defined(_WIN64) || defined(_WIN32) || defined(__CYGWIN__)
#define WIN
#include <Windows.h>
#include <winbase.h>
#include <sddl.h>
#pragma comment(lib, "advapi32.lib")
#elif defined(__linux__)
#define LINUX
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <semaphore.h>
 
struct sem_wrapper
{
    sem_t * sem;
    bool is_locked;
};
 
#endif
 
#include "Python.h"
#include "numpy/arrayobject.h"
#include <iostream>
#include <cstdio>
#include <cstring>
 
#define WIN_SMEM "WINDOWS SHARED MEMORY"
#define ARRAY_STRUCT_SIZE sizeof(PyArrayObject)
#define ARRAY_FULL_SIZE(arr) (size_data_array(arr) + sizeof(int) + arr->nd * sizeof(npy_intp) * 3 + sizeof(int))
 
 
#if defined(WIN)
 
BOOL CreateMyDACL(SECURITY_ATTRIBUTES * pSA)
{
     TCHAR * szSD = TEXT("D:")       // Discretionary ACL
        TEXT("(D;OICI;GA;;;BG)")     // Deny access to 
                                     // built-in guests
        TEXT("(D;OICI;GA;;;AN)")     // Deny access to 
                                     // anonymous logon
        TEXT("(A;OICI;GRGWGX;;;AU)") // Allow 
                                     // read/write/execute 
                                     // to authenticated 
                                     // users
        TEXT("(A;OICI;GA;;;BA)");    // Allow full control 
                                     // to administrators
 
    if (NULL == pSA)
        return FALSE;
 
     return ConvertStringSecurityDescriptorToSecurityDescriptor(
                szSD,
                SDDL_REVISION_1,
                &(pSA->lpSecurityDescriptor),
                NULL);
}
#endif
 
/*
 * copy_from_pointer_array
 * the method returns the number of bytes copied
 */
template <typename ObjectType>
std::size_t copy_from_pointer_array(ObjectType * buffer_dist, ObjectType * buffer_src, size_t len) {
    for (int i = 0; i < len; i++) {
        *buffer_dist = *buffer_src;
        buffer_dist++;
        buffer_src++;
    }
    return len * sizeof(ObjectType);
}
 
std::size_t size_data_array(PyArrayObject *arr) {
    if (arr->nd == 0)
        return 0;
 
    std::size_t size = 1;
    for (int i = 0; i < arr->nd; ++i) {
        size *= (int) PyArray_DIM(arr, i);
    }
    size *= PyArray_ITEMSIZE(arr);
    return size;
}
 
void copy_from_numpy_array_to_buffer(PyArrayObject * array, char * buffer) {
    char * current_pointer = buffer;
    *((int *) current_pointer) = array->nd;
    current_pointer += sizeof(int);
    // dimensions copy
    current_pointer += copy_from_pointer_array(
        (npy_intp * ) current_pointer, 
        (npy_intp * ) array->dimensions, 
        array->nd
    );
    // strides copy
    current_pointer += copy_from_pointer_array(
        (npy_intp * ) current_pointer, 
        (npy_intp * ) array->strides, 
        array->nd
    );
    *((int *) current_pointer) = array->descr->type_num; 
    current_pointer += sizeof(int);
 
    size_t size_data = size_data_array(array);
    /* Copy data from heap to mmap memory */
    std::memcpy((char *) (current_pointer), (char *) array->data, size_data);
}
 
PyArrayObject * copy_from_buffer_to_numpy_array(char * buffer) {
    char * current_pointer = buffer;
    int nd = *((int *) current_pointer);
    current_pointer += sizeof(int);
    npy_intp * dims = new npy_intp[nd];
 
    current_pointer += copy_from_pointer_array(
        (npy_intp * ) dims, 
        (npy_intp * ) current_pointer, 
        nd
    );
    npy_intp * strides = new npy_intp[nd];
    current_pointer += copy_from_pointer_array(
        (npy_intp * ) strides, 
        (npy_intp * ) current_pointer, 
        nd
    );
    int type_num = *((int *) current_pointer);
    current_pointer += sizeof(int);
 
    PyArrayObject * array = (PyArrayObject *) PyArray_SimpleNewFromData(nd, 
        dims, 
        type_num, 
        (void *) current_pointer
    );
    return array;
}
 
 
/*
 * Create a buffer in shared memory
 */
char * create_shared_memory(char * string_shm, int max_buffer_size) {
    bool error_open_file_flag = false;
#if defined(WIN)
    SECURITY_ATTRIBUTES sa;
    if (!CreateMyDACL(&sa))
    {
         // Error encountered; generate message and exit.
        PyErr_SetString(PyExc_RuntimeError, "create_mutex: failed CreateMyDACL");
        return nullptr;
    }
    HANDLE hMapFile;
    hMapFile = CreateFileMapping(
        INVALID_HANDLE_VALUE,
        &sa,
        PAGE_READWRITE,
        0,
        max_buffer_size,
        string_shm);
    if (hMapFile == NULL) error_open_file_flag = true;
#elif defined(LINUX)
    int hMapFile = shm_open(string_shm, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
    if (hMapFile < 0){ 
        error_open_file_flag = true; 
    } else {
        if (ftruncate(hMapFile, max_buffer_size) == -1) error_open_file_flag = true;
    }
#endif
 
    if (error_open_file_flag) {
        PyErr_SetString(PyExc_RuntimeError, "create file is failed");
        return nullptr;
    }
 
#if defined(WIN)
    char * pBuf = (char *) MapViewOfFile(hMapFile,
                        FILE_MAP_ALL_ACCESS,
                        0,
                        0,
                        max_buffer_size);
#elif defined(LINUX)
    char * pBuf = (char *) mmap(NULL, max_buffer_size, PROT_WRITE | PROT_READ, MAP_SHARED, hMapFile, 0);
#endif
 
    if (pBuf == nullptr) {
        PyErr_SetString(PyExc_RuntimeError, "memory not allocated");
        return nullptr;
    }
 
    return pBuf;
}
/*
 * Del a buffer in shared memory
 */
bool delete_shared_memory(char * string_shm) {
#if defined(WIN)
    return true;
#elif defined(LINUX)
    if (shm_unlink(string_shm) == 0) return true;
#endif
}
 
/*
 * Attach a buffer in shared memory
 */
char * attach_shared_memory(char * string_shm) {
    bool error_open_file_flag = false;
#if defined(WIN)
    HANDLE hMapFile = OpenFileMapping(
        FILE_MAP_ALL_ACCESS,
        FALSE,
        string_shm); 
    if (hMapFile == NULL) error_open_file_flag = true;
#elif defined(LINUX)
    int hMapFile = shm_open(string_shm, O_RDWR, 0);
    if (hMapFile == -1) error_open_file_flag = true;
#endif
 
    if (error_open_file_flag) {
        PyErr_SetString(PyExc_RuntimeError, "memory not attached");
        return nullptr;
    }
 
#if defined(WIN)
    char * pBuf = (char *) MapViewOfFile(hMapFile,
                        FILE_MAP_ALL_ACCESS,
                        0,
                        0,
                        sizeof(size_t));
#elif defined(LINUX)
    char * pBuf = (char *) mmap(0, sizeof(size_t), PROT_WRITE | PROT_READ, MAP_SHARED, hMapFile, 0);
#endif
 
    size_t full_array_size = *((size_t *) pBuf);
 
#if defined(WIN)
    UnmapViewOfFile((LPCVOID) pBuf);
    pBuf = (char *) MapViewOfFile(hMapFile,
                        FILE_MAP_ALL_ACCESS,
                        0,
                        0,
                        full_array_size);
#elif defined(LINUX)
    munmap(pBuf, sizeof(size_t));
    pBuf = (char *) mmap(0, full_array_size, PROT_WRITE | PROT_READ, MAP_SHARED, hMapFile, 0);
#endif
 
    pBuf += sizeof(size_t);
 
    if (pBuf == nullptr) {
        PyErr_SetString(PyExc_RuntimeError, "memory not attached");
        return nullptr;
    }
 
    return pBuf;
}
 
static PyObject *
check_mem_sh(PyObject *self, PyObject *args) 
{
    char * string_shm;
    if (!PyArg_ParseTuple(args, "s", &string_shm)) {
        PyErr_SetString(PyExc_RuntimeError, "set_mem_sh: parse except");
    }
    bool error_open_file_flag = false;
 
#if defined(WIN)
    HANDLE hMapFile = OpenFileMapping(
        FILE_MAP_ALL_ACCESS,
        FALSE,
        string_shm);
    if (hMapFile == NULL) error_open_file_flag = true;
#elif defined(LINUX)
    int hMapFile = shm_open(string_shm, O_RDWR, S_IRWXU | S_IRWXG | S_IRWXO);
    if (hMapFile == -1) error_open_file_flag = true;
#endif    
    if (error_open_file_flag) {
        Py_INCREF(Py_False);
        return Py_False;
    }
    Py_INCREF(Py_True);
    return Py_True;
 
}
 
static PyObject *
create_mem_sh(PyObject *self, PyObject *args)
{
    PyObject * pyobj_for_shrdmem = nullptr;
    char * string_shm;
    if (!PyArg_ParseTuple(args, "sO", &string_shm, &pyobj_for_shrdmem)) {
        PyErr_SetString(PyExc_RuntimeError, "set_mem_sh: parse except");
    }
    PyArrayObject * array_for_shrdmem = (PyArrayObject *) pyobj_for_shrdmem;
    array_for_shrdmem = PyArray_GETCONTIGUOUS(array_for_shrdmem);
    if (array_for_shrdmem->base != nullptr) {
        PyErr_SetString(PyExc_RuntimeError, "set_mem_sh: array is not homogeneous");
    }
    /* Аrray size calculation */ 
    char * shBuf = create_shared_memory(string_shm, ARRAY_FULL_SIZE(array_for_shrdmem));
    if (shBuf == nullptr) {
        Py_INCREF(Py_None);
        return Py_None;
    }
    /* Copy array struct from heap to shared memory */
    *((size_t *) shBuf) = ARRAY_FULL_SIZE(array_for_shrdmem);
    shBuf += sizeof(size_t);
    copy_from_numpy_array_to_buffer(array_for_shrdmem, shBuf);
    Py_INCREF(Py_True);
    return Py_True;
}
 
static PyObject *
attach_mem_sh(PyObject *self, PyObject *args)
{
    char * string_shm;
    if (!PyArg_ParseTuple(args, "s", &string_shm)) {
        PyErr_SetString(PyExc_RuntimeError, "get_mem_sh: parse except");
    }
    char * shBuf = attach_shared_memory(string_shm);
    if (shBuf == nullptr) {
        Py_INCREF(Py_None);
        return Py_None;
    }
 
    PyArrayObject * array_for_shrdmem = (PyArrayObject *) shBuf;
    array_for_shrdmem = copy_from_buffer_to_numpy_array(shBuf);
    Py_INCREF((PyObject *) array_for_shrdmem);
    return (PyObject *) array_for_shrdmem;
}
 
static PyObject *
delete_mem_sh(PyObject *self, PyObject *args) {
    char * string_shm;
    if (!PyArg_ParseTuple(args, "s", &string_shm)) {
        PyErr_SetString(PyExc_RuntimeError, "get_mem_sh: parse except");
    }
    if (delete_shared_memory(string_shm)) {
        Py_INCREF(Py_True);
        return Py_True;
    }
    Py_INCREF(Py_False);
    return Py_False;
}
 
void mutex_destructor(PyObject * m_obj) {
#if defined(WIN)
    const char * name = PyCapsule_GetName(m_obj);
    HANDLE mut = (HANDLE) PyCapsule_GetPointer(m_obj, PyCapsule_GetName(m_obj));
    delete name;
#elif defined(LINUX)
    const char * name = PyCapsule_GetName(m_obj);
    sem_wrapper * mut = (sem_wrapper *) PyCapsule_GetPointer(m_obj, name);
    if (name != NULL) {
        if (mut->is_locked) {
            sem_post(mut->sem);
            mut->is_locked = false;
        }
        if (name != NULL) {
            delete name;
        }
        delete mut;
    }
#endif
}
 
static PyObject *
create_mutex(PyObject *self, PyObject *args) {
    bool error_open_file_flag = false;
    char * string_smp;
    if (!PyArg_ParseTuple(args, "s", &string_smp)) {
        PyErr_SetString(PyExc_RuntimeError, "create_mutex: parse except");
        return nullptr;
    }
    char * string_shm_new = new char[strlen(string_smp) + 1];
    strcpy(string_shm_new, string_smp);
#if defined(WIN)
    SECURITY_ATTRIBUTES sa;
    if (!CreateMyDACL(&sa))
    {
         // Error encountered; generate message and exit.
        PyErr_SetString(PyExc_RuntimeError, "create_mutex: failed CreateMyDACL");
        return nullptr;
    }
    HANDLE mut = CreateMutex(
        &sa, 
        FALSE, 
        string_shm_new
    );
    if (mut == nullptr) {
        error_open_file_flag = true;
    }
#elif defined(LINUX)
    sem_wrapper * mut = new sem_wrapper{
        sem_open(string_shm_new, O_CREAT, S_IRWXU | S_IRWXG | S_IRWXO, 1), 
        false
    };
    if (mut->sem == SEM_FAILED) {
        error_open_file_flag = true;
    }
#endif
    if (error_open_file_flag) {
        Py_INCREF(Py_None);
        return Py_None;
    }
    return PyCapsule_New((void *) mut, string_shm_new, (PyCapsule_Destructor) mutex_destructor);
}
 
static PyObject *
open_mutex(PyObject *self, PyObject *args) {
    bool error_open_file_flag = false;
    char * string_smp;
    if (!PyArg_ParseTuple(args, "s", &string_smp)) {
        PyErr_SetString(PyExc_RuntimeError, "open_mutex: parse except");
        return nullptr;
    }
    char * string_shm_new = new char[strlen(string_smp) + 1];
    strcpy(string_shm_new, string_smp);
#if defined(WIN)
    HANDLE mut = OpenMutex(
        MUTEX_ALL_ACCESS, 
        TRUE, 
        string_shm_new
    );
    if (mut == nullptr) error_open_file_flag = true;
#elif defined(LINUX)
    sem_wrapper * mut = new sem_wrapper{
        sem_open(string_shm_new, 0), 
        false
    };
    if (mut->sem == SEM_FAILED) error_open_file_flag = true;
#endif
 
    if (error_open_file_flag) {
        Py_INCREF(Py_None);
        return Py_None;
    }
    return PyCapsule_New((void *) mut, string_shm_new, (PyCapsule_Destructor) mutex_destructor);
}
 
static PyObject *
release_mutex(PyObject *self, PyObject *args) {
    PyObject * caps_mutex;
    if (!PyArg_ParseTuple(args, "O", &caps_mutex)) {
        PyErr_SetString(PyExc_RuntimeError, "release_mutex: parse except");
        return nullptr;
    }
#if defined(WIN)
    HANDLE mut = (HANDLE) PyCapsule_GetPointer(caps_mutex, PyCapsule_GetName(caps_mutex));
    ReleaseMutex(mut);
#elif defined(LINUX)
    sem_wrapper * mut = (sem_wrapper *) PyCapsule_GetPointer(caps_mutex, PyCapsule_GetName(caps_mutex));
    if (mut->is_locked) {
        sem_post(mut->sem);
        mut->is_locked = false;
    }
#endif
    Py_INCREF(Py_True);
    return Py_True;
}
 
static PyObject *
close_mutex(PyObject *self, PyObject *args) {
    PyObject * caps_mutex;
    if (!PyArg_ParseTuple(args, "O", &caps_mutex)) {
        PyErr_SetString(PyExc_RuntimeError, "close_mutex: parse except");
        return nullptr;
    }
    if (caps_mutex != Py_None) {
        mutex_destructor(caps_mutex);
        Py_INCREF(Py_True);
        return Py_True;
    } else {
        Py_INCREF(Py_False);
        return Py_False;    
    }
}
 
static PyObject *
remove_mutex(PyObject *self, PyObject *args) {
    PyObject * caps_mutex;
    if (!PyArg_ParseTuple(args, "O", &caps_mutex)) {
        PyErr_SetString(PyExc_RuntimeError, "close_mutex: parse except");
        return nullptr;
    }
#if defined(WIN)
    Py_INCREF(Py_True);
    return Py_True;
#elif defined(LINUX)
    sem_wrapper * mut = (sem_wrapper *) PyCapsule_GetPointer(caps_mutex, PyCapsule_GetName(caps_mutex));
    const char * name = PyCapsule_GetName(caps_mutex);
    if (sem_unlink(name) == -1) {
        Py_INCREF(Py_False);
        return Py_False;
    }
    if (name != NULL) {
        delete name;
    }
    delete mut;
    PyCapsule_SetName(caps_mutex, NULL);
    Py_INCREF(Py_True);
    return Py_True;
#endif    
}
 
static PyObject * _try_capture_mutex(PyObject * caps_mutex, int msec) {
#if defined(WIN)
    HANDLE mut = (HANDLE) PyCapsule_GetPointer(caps_mutex, PyCapsule_GetName(caps_mutex));
    DWORD out;
    if (msec == -1) {
        out = WaitForSingleObject(mut, INFINITE);
    } else {
        out = WaitForSingleObject(mut, (DWORD) msec);
    }
#elif defined(LINUX)
    sem_wrapper * mut = (sem_wrapper *) PyCapsule_GetPointer(caps_mutex, PyCapsule_GetName(caps_mutex));
    int out;
    if (msec == 0) {
        out = sem_trywait(mut->sem);
    } else if (msec != -1) {
        timespec ts;
        ts.tv_nsec = msec * 1000;
        out = sem_timedwait(mut->sem, &ts);
    } else {
        out = sem_wait(mut->sem);
    }
    if (out == 0) mut->is_locked = true;
#endif
 
    if (out == 0) {
        Py_INCREF(Py_True);
        return Py_True;
    }
    Py_INCREF(Py_False);
    return Py_False;
}
 
static PyObject *
try_capture_mutex(PyObject *self, PyObject *args) {
    PyObject * caps_mutex;
    int timeout;
    if (!PyArg_ParseTuple(args, "Oi", &caps_mutex, &timeout)) {
        PyErr_SetString(PyExc_RuntimeError, "try_capture_mutex: parse except");
        return nullptr;
    }
    return _try_capture_mutex(caps_mutex, timeout);
}
 
static PyObject *
capture_mutex(PyObject *self, PyObject *args) {
    PyObject * caps_mutex;
    if (!PyArg_ParseTuple(args, "O", &caps_mutex)) {
        PyErr_SetString(PyExc_RuntimeError, "capture_mutex: parse except");
        return nullptr;
    }
    return _try_capture_mutex(caps_mutex, -1);
}
 
static PyObject *
get_last_error(PyObject *self, PyObject *args) {
#if defined(WIN)
    PyObject * py_err = Py_BuildValue("i", (unsigned int) GetLastError());
#elif defined(LINUX)
    PyObject * py_err = Py_BuildValue("i", (unsigned int) errno);
#endif
    Py_INCREF(py_err);
    return py_err;
}
 
// static PyObject *
// test_function(PyObject *self, PyObject *args) {
//     int fd = shm_open("/test_sh_m", O_RDWR | O_CREAT, S_IRUSR | S_IWUSR);
//     return Py_None;
// }
 
static PyMethodDef SharedArrayMethods[] = {
 
    {"create_mem_sh",  create_mem_sh, METH_VARARGS,
     "method for create shared memory named."},
    {"attach_mem_sh",  attach_mem_sh, METH_VARARGS,
     "method for get shared memory named."},
    {"delete_mem_sh",  delete_mem_sh, METH_VARARGS,
     "method for del shared memory named."},
    {"check_mem_sh",  check_mem_sh, METH_VARARGS,
     "method for check shared memory named."},
    {"create_mutex",  create_mutex, METH_VARARGS,
     ""},
    {"open_mutex",  open_mutex, METH_VARARGS,
     ""},
    {"release_mutex",  release_mutex, METH_VARARGS,
     ""},
    {"close_mutex",  close_mutex, METH_VARARGS,
     ""},
    {"try_capture_mutex",  try_capture_mutex, METH_VARARGS,
      ""},
    {"close_mutex",  close_mutex, METH_VARARGS,
     ""},
    {"remove_mutex",  remove_mutex, METH_VARARGS,
     ""},
 
    {"capture_mutex",  capture_mutex, METH_VARARGS,
     "capture mutex"},
    {"get_last_error",  get_last_error, METH_VARARGS,
     "returns the result of the call GetLastError() function"},
    {NULL, NULL, 0, NULL}
};
 
static struct PyModuleDef wsamodule = {
    PyModuleDef_HEAD_INIT,
    "SharedArray", 
    NULL, 
    -1,    
    SharedArrayMethods
};
 
PyMODINIT_FUNC
PyInit_SharedArray(void)
{
    import_array();
    return PyModule_Create(&wsamodule);
}

4.最后安装目录下执行 python setup.py develop

参考https://blog.csdn.net/qq_61981718/article/details/130719188

2.3 OpenPCDet安装

windows系统下安装OpenPCDet

  1. OpenPCDet项目官网下载 https://github.com/open-mmlab/OpenPCDet.git
    在这里插入图片描述
  2. conda环境下安装依赖 pip install -r requirements.txt -i https://pypi.tuna.tsinghua.edu.cn/simple
    在这里插入图片描述
  3. 代码修改
    .\pcdet\ops\iou3d_nms\src目录下
    iou3d_nms.cpp
/*
3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/
 
#include <torch/serialize/tensor.h>
#include <torch/extension.h>
#include <vector>
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <stdint.h>
#include "iou3d_nms.h"
 
#define CHECK_CUDA(x) do { \
  if (!x.type().is_cuda()) { \
    fprintf(stderr, "%s must be CUDA tensor at %s:%d\n", #x, __FILE__, __LINE__); \
    exit(-1); \
  } \
} while (0)
#define CHECK_CONTIGUOUS(x) do { \
  if (!x.is_contiguous()) { \
    fprintf(stderr, "%s must be contiguous tensor at %s:%d\n", #x, __FILE__, __LINE__); \
    exit(-1); \
  } \
} while (0)
#define CHECK_INPUT(x) CHECK_CUDA(x);CHECK_CONTIGUOUS(x)
 
#define DIVUP(m,n) ((m) / (n) + ((m) % (n) > 0))
 
#define CHECK_ERROR(ans) { gpuAssert((ans), __FILE__, __LINE__); }
inline void gpuAssert(cudaError_t code, const char *file, int line, bool abort=true)
{
   if (code != cudaSuccess)
   {
      fprintf(stderr,"GPUassert: %s %s %d\n", cudaGetErrorString(code), file, line);
      if (abort) exit(code);
   }
}
 
const int THREADS_PER_BLOCK_NMS = sizeof(uint64_t) * 8;
 
void boxesoverlapLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap);
void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou);
void nmsLauncher(const float *boxes, uint64_t * mask, int boxes_num, float nms_overlap_thresh);
void nmsNormalLauncher(const float *boxes, uint64_t * mask, int boxes_num, float nms_overlap_thresh);
 
 
int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_overlap){
    // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
    // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
    // params ans_overlap: (N, M)
 
    CHECK_INPUT(boxes_a);
    CHECK_INPUT(boxes_b);
    CHECK_INPUT(ans_overlap);
 
    int num_a = boxes_a.size(0);
    int num_b = boxes_b.size(0);
 
    const float * boxes_a_data = boxes_a.data<float>();
    const float * boxes_b_data = boxes_b.data<float>();
    float * ans_overlap_data = ans_overlap.data<float>();
 
    boxesoverlapLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_overlap_data);
 
    return 1;
}
 
int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_iou){
    // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
    // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
    // params ans_overlap: (N, M)
    CHECK_INPUT(boxes_a);
    CHECK_INPUT(boxes_b);
    CHECK_INPUT(ans_iou);
 
    int num_a = boxes_a.size(0);
    int num_b = boxes_b.size(0);
 
    const float * boxes_a_data = boxes_a.data<float>();
    const float * boxes_b_data = boxes_b.data<float>();
    float * ans_iou_data = ans_iou.data<float>();
 
    boxesioubevLauncher(num_a, boxes_a_data, num_b, boxes_b_data, ans_iou_data);
 
    return 1;
}
 
int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){
    // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
    // params keep: (N)
    CHECK_INPUT(boxes);
    CHECK_CONTIGUOUS(keep);
 
    int boxes_num = boxes.size(0);
    const float * boxes_data = boxes.data<float>();
    long long * keep_data = keep.data<long long>();
 
    const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
 
    uint64_t *mask_data = NULL;
    CHECK_ERROR(cudaMalloc((void**)&mask_data, boxes_num * col_blocks * sizeof(uint64_t)));
    nmsLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);
 
    // uint64_t mask_cpu[boxes_num * col_blocks];
    // uint64_t *mask_cpu = new uint64_t [boxes_num * col_blocks];
    std::vector<uint64_t> mask_cpu(boxes_num * col_blocks);
 
//    printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
    CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data, boxes_num * col_blocks * sizeof(uint64_t),
                           cudaMemcpyDeviceToHost));
 
    cudaFree(mask_data);
 
    //uint64_t remv_cpu(col_blocks);
    std::vector<uint64_t> remv_cpu(col_blocks);
    memset(&remv_cpu[0], 0, col_blocks * sizeof(uint64_t));
 
    int num_to_keep = 0;
 
    for (int i = 0; i < boxes_num; i++){
        int nblock = i / THREADS_PER_BLOCK_NMS;
        int inblock = i % THREADS_PER_BLOCK_NMS;
 
        if (!(remv_cpu[nblock] & (1ULL << inblock))){
            keep_data[num_to_keep++] = i;
            uint64_t *p = &mask_cpu[0] + i * col_blocks;
            for (int j = nblock; j < col_blocks; j++){
                remv_cpu[j] |= p[j];
            }
        }
    }
    if ( cudaSuccess != cudaGetLastError() ) printf( "Error!\n" );
 
    return num_to_keep;
}
 
 
int nms_normal_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh){
    // params boxes: (N, 7) [x, y, z, dx, dy, dz, heading]
    // params keep: (N)
 
    CHECK_INPUT(boxes);
    CHECK_CONTIGUOUS(keep);
 
    int boxes_num = boxes.size(0);
    const float * boxes_data = boxes.data<float>();
    long long * keep_data = keep.data<long long>();
 
    const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
 
    uint64_t *mask_data = NULL;
    CHECK_ERROR(cudaMalloc((void**)&mask_data, boxes_num * col_blocks * sizeof(uint64_t)));
    nmsNormalLauncher(boxes_data, mask_data, boxes_num, nms_overlap_thresh);
 
    // uint64_t mask_cpu[boxes_num * col_blocks];
    // uint64_t *mask_cpu = new uint64_t [boxes_num * col_blocks];
    std::vector<uint64_t> mask_cpu(boxes_num * col_blocks);
 
//    printf("boxes_num=%d, col_blocks=%d\n", boxes_num, col_blocks);
    CHECK_ERROR(cudaMemcpy(&mask_cpu[0], mask_data, boxes_num * col_blocks * sizeof(uint64_t),
                           cudaMemcpyDeviceToHost));
 
    cudaFree(mask_data);
 
    std::vector<uint64_t> remv_cpu(col_blocks);
    memset(&remv_cpu[0], 0, col_blocks * sizeof(uint64_t));
 
    int num_to_keep = 0;
 
    for (int i = 0; i < boxes_num; i++){
        int nblock = i / THREADS_PER_BLOCK_NMS;
        int inblock = i % THREADS_PER_BLOCK_NMS;
 
        if (!(remv_cpu[nblock] & (1ULL << inblock))){
            keep_data[num_to_keep++] = i;
            uint64_t *p = &mask_cpu[0] + i * col_blocks;
            for (int j = nblock; j < col_blocks; j++){
                remv_cpu[j] |= p[j];
            }
        }
    }
    if ( cudaSuccess != cudaGetLastError() ) printf( "Error!\n" );
 
    return num_to_keep;
}

iou3d_nms_kernel.cu

/*
3D IoU Calculation and Rotated NMS(modified from 2D NMS written by others)
Written by Shaoshuai Shi
All Rights Reserved 2019-2020.
*/
 
 
#include <stdio.h>
#include <stdint.h>
#define THREADS_PER_BLOCK 16
#define DIVUP(m, n) ((m) / (n) + ((m) % (n) > 0))
 
// #define DEBUG
const int THREADS_PER_BLOCK_NMS = sizeof(unsigned long long) * 8;
//const float EPS = 1e-8;
struct Point {
    float x, y;
    __device__ Point() {}
    __device__ Point(double _x, double _y){
        x = _x, y = _y;
    }
 
    __device__ void set(float _x, float _y){
        x = _x; y = _y;
    }
 
    __device__ Point operator +(const Point &b)const{
        return Point(x + b.x, y + b.y);
    }
 
    __device__ Point operator -(const Point &b)const{
        return Point(x - b.x, y - b.y);
    }
};
 
__device__ inline float cross(const Point &a, const Point &b){
    return a.x * b.y - a.y * b.x;
}
 
__device__ inline float cross(const Point &p1, const Point &p2, const Point &p0){
    return (p1.x - p0.x) * (p2.y - p0.y) - (p2.x - p0.x) * (p1.y - p0.y);
}
 
__device__ int check_rect_cross_cuda(const Point &p1, const Point &p2, const Point &q1, const Point &q2){
    int ret = min(p1.x,p2.x) <= max(q1.x,q2.x)  &&
              min(q1.x,q2.x) <= max(p1.x,p2.x) &&
              min(p1.y,p2.y) <= max(q1.y,q2.y) &&
              min(q1.y,q2.y) <= max(p1.y,p2.y);
    return ret;
}
 
__device__ inline int check_in_box2d(const float *box, const Point &p){
    //params: (7) [x, y, z, dx, dy, dz, heading]
    const float MARGIN = 1e-2;
 
    float center_x = box[0], center_y = box[1];
    float angle_cos = cos(-box[6]), angle_sin = sin(-box[6]);  // rotate the point in the opposite direction of box
    float rot_x = (p.x - center_x) * angle_cos + (p.y - center_y) * (-angle_sin);
    float rot_y = (p.x - center_x) * angle_sin + (p.y - center_y) * angle_cos;
 
    return (fabs(rot_x) < box[3] / 2 + MARGIN && fabs(rot_y) < box[4] / 2 + MARGIN);
}
 
__device__ inline int intersection(const Point &p1, const Point &p0, const Point &q1, const Point &q0, Point &ans){
    // fast exclusion
    if (check_rect_cross_cuda(p0, p1, q0, q1) == 0) return 0;
 
    // check cross standing
    float s1 = cross(q0, p1, p0);
    float s2 = cross(p1, q1, p0);
    float s3 = cross(p0, q1, q0);
    float s4 = cross(q1, p1, q0);
 
    if (!(s1 * s2 > 0 && s3 * s4 > 0)) return 0;
 
    // calculate intersection of two lines
    float s5 = cross(q1, p1, p0);
    if(fabs(s5 - s1) > 1e-8){
        ans.x = (s5 * q0.x - s1 * q1.x) / (s5 - s1);
        ans.y = (s5 * q0.y - s1 * q1.y) / (s5 - s1);
 
    }
    else{
        float a0 = p0.y - p1.y, b0 = p1.x - p0.x, c0 = p0.x * p1.y - p1.x * p0.y;
        float a1 = q0.y - q1.y, b1 = q1.x - q0.x, c1 = q0.x * q1.y - q1.x * q0.y;
        float D = a0 * b1 - a1 * b0;
 
        ans.x = (b0 * c1 - b1 * c0) / D;
        ans.y = (a1 * c0 - a0 * c1) / D;
    }
 
    return 1;
}
 
__device__ inline void rotate_around_center(const Point &center, const float angle_cos, const float angle_sin, Point &p){
    float new_x = (p.x - center.x) * angle_cos + (p.y - center.y) * (-angle_sin) + center.x;
    float new_y = (p.x - center.x) * angle_sin + (p.y - center.y) * angle_cos + center.y;
    p.set(new_x, new_y);
}
 
__device__ inline int point_cmp(const Point &a, const Point &b, const Point &center){
    return atan2(a.y - center.y, a.x - center.x) > atan2(b.y - center.y, b.x - center.x);
}
 
__device__ inline float box_overlap(const float *box_a, const float *box_b){
    // params box_a: [x, y, z, dx, dy, dz, heading]
    // params box_b: [x, y, z, dx, dy, dz, heading]
 
    float a_angle = box_a[6], b_angle = box_b[6];
    float a_dx_half = box_a[3] / 2, b_dx_half = box_b[3] / 2, a_dy_half = box_a[4] / 2, b_dy_half = box_b[4] / 2;
    float a_x1 = box_a[0] - a_dx_half, a_y1 = box_a[1] - a_dy_half;
    float a_x2 = box_a[0] + a_dx_half, a_y2 = box_a[1] + a_dy_half;
    float b_x1 = box_b[0] - b_dx_half, b_y1 = box_b[1] - b_dy_half;
    float b_x2 = box_b[0] + b_dx_half, b_y2 = box_b[1] + b_dy_half;
 
    Point center_a(box_a[0], box_a[1]);
    Point center_b(box_b[0], box_b[1]);
 
#ifdef DEBUG
    printf("a: (%.3f, %.3f, %.3f, %.3f, %.3f), b: (%.3f, %.3f, %.3f, %.3f, %.3f)\n", a_x1, a_y1, a_x2, a_y2, a_angle,
           b_x1, b_y1, b_x2, b_y2, b_angle);
    printf("center a: (%.3f, %.3f), b: (%.3f, %.3f)\n", center_a.x, center_a.y, center_b.x, center_b.y);
#endif
 
    Point box_a_corners[5];
    box_a_corners[0].set(a_x1, a_y1);
    box_a_corners[1].set(a_x2, a_y1);
    box_a_corners[2].set(a_x2, a_y2);
    box_a_corners[3].set(a_x1, a_y2);
 
    Point box_b_corners[5];
    box_b_corners[0].set(b_x1, b_y1);
    box_b_corners[1].set(b_x2, b_y1);
    box_b_corners[2].set(b_x2, b_y2);
    box_b_corners[3].set(b_x1, b_y2);
 
    // get oriented corners
    float a_angle_cos = cos(a_angle), a_angle_sin = sin(a_angle);
    float b_angle_cos = cos(b_angle), b_angle_sin = sin(b_angle);
 
    for (int k = 0; k < 4; k++){
#ifdef DEBUG
        printf("before corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
#endif
        rotate_around_center(center_a, a_angle_cos, a_angle_sin, box_a_corners[k]);
        rotate_around_center(center_b, b_angle_cos, b_angle_sin, box_b_corners[k]);
#ifdef DEBUG
        printf("corner %d: a(%.3f, %.3f), b(%.3f, %.3f) \n", k, box_a_corners[k].x, box_a_corners[k].y, box_b_corners[k].x, box_b_corners[k].y);
#endif
    }
 
    box_a_corners[4] = box_a_corners[0];
    box_b_corners[4] = box_b_corners[0];
 
    // get intersection of lines
    Point cross_points[16];
    Point poly_center;
    int cnt = 0, flag = 0;
 
    poly_center.set(0, 0);
    for (int i = 0; i < 4; i++){
        for (int j = 0; j < 4; j++){
            flag = intersection(box_a_corners[i + 1], box_a_corners[i], box_b_corners[j + 1], box_b_corners[j], cross_points[cnt]);
            if (flag){
                poly_center = poly_center + cross_points[cnt];
                cnt++;
#ifdef DEBUG
                printf("Cross points (%.3f, %.3f): a(%.3f, %.3f)->(%.3f, %.3f), b(%.3f, %.3f)->(%.3f, %.3f) \n",
                    cross_points[cnt - 1].x, cross_points[cnt - 1].y,
                    box_a_corners[i].x, box_a_corners[i].y, box_a_corners[i + 1].x, box_a_corners[i + 1].y,
                    box_b_corners[i].x, box_b_corners[i].y, box_b_corners[i + 1].x, box_b_corners[i + 1].y);
#endif
            }
        }
    }
 
    // check corners
    for (int k = 0; k < 4; k++){
        if (check_in_box2d(box_a, box_b_corners[k])){
            poly_center = poly_center + box_b_corners[k];
            cross_points[cnt] = box_b_corners[k];
            cnt++;
#ifdef DEBUG
                printf("b corners in a: corner_b(%.3f, %.3f)", cross_points[cnt - 1].x, cross_points[cnt - 1].y);
#endif
        }
        if (check_in_box2d(box_b, box_a_corners[k])){
            poly_center = poly_center + box_a_corners[k];
            cross_points[cnt] = box_a_corners[k];
            cnt++;
#ifdef DEBUG
                printf("a corners in b: corner_a(%.3f, %.3f)", cross_points[cnt - 1].x, cross_points[cnt - 1].y);
#endif
        }
    }
 
    poly_center.x /= cnt;
    poly_center.y /= cnt;
 
    // sort the points of polygon
    Point temp;
    for (int j = 0; j < cnt - 1; j++){
        for (int i = 0; i < cnt - j - 1; i++){
            if (point_cmp(cross_points[i], cross_points[i + 1], poly_center)){
                temp = cross_points[i];
                cross_points[i] = cross_points[i + 1];
                cross_points[i + 1] = temp;
            }
        }
    }
 
#ifdef DEBUG
    printf("cnt=%d\n", cnt);
    for (int i = 0; i < cnt; i++){
        printf("All cross point %d: (%.3f, %.3f)\n", i, cross_points[i].x, cross_points[i].y);
    }
#endif
 
    // get the overlap areas
    float area = 0;
    for (int k = 0; k < cnt - 1; k++){
        area += cross(cross_points[k] - cross_points[0], cross_points[k + 1] - cross_points[0]);
    }
 
    return fabs(area) / 2.0;
}
 
__device__ inline float iou_bev(const float *box_a, const float *box_b){
    // params box_a: [x, y, z, dx, dy, dz, heading]
    // params box_b: [x, y, z, dx, dy, dz, heading]
    float sa = box_a[3] * box_a[4];
    float sb = box_b[3] * box_b[4];
    float s_overlap = box_overlap(box_a, box_b);
    return s_overlap / fmaxf(sa + sb - s_overlap, 1e-8);
}
 
__global__ void boxes_overlap_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap){
    // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
    // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
    const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
    const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
 
    if (a_idx >= num_a || b_idx >= num_b){
        return;
    }
    const float * cur_box_a = boxes_a + a_idx * 7;
    const float * cur_box_b = boxes_b + b_idx * 7;
    float s_overlap = box_overlap(cur_box_a, cur_box_b);
    ans_overlap[a_idx * num_b + b_idx] = s_overlap;
}
 
__global__ void boxes_iou_bev_kernel(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou){
    // params boxes_a: (N, 7) [x, y, z, dx, dy, dz, heading]
    // params boxes_b: (M, 7) [x, y, z, dx, dy, dz, heading]
    const int a_idx = blockIdx.y * THREADS_PER_BLOCK + threadIdx.y;
    const int b_idx = blockIdx.x * THREADS_PER_BLOCK + threadIdx.x;
 
    if (a_idx >= num_a || b_idx >= num_b){
        return;
    }
 
    const float * cur_box_a = boxes_a + a_idx * 7;
    const float * cur_box_b = boxes_b + b_idx * 7;
    float cur_iou_bev = iou_bev(cur_box_a, cur_box_b);
    ans_iou[a_idx * num_b + b_idx] = cur_iou_bev;
}
 
__global__ void nms_kernel(const int boxes_num, const float nms_overlap_thresh,
                           const float *boxes, uint64_t*mask){
    //params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
    //params: mask (N, N/THREADS_PER_BLOCK_NMS)
 
    const int row_start = blockIdx.y;
    const int col_start = blockIdx.x;
 
    // if (row_start > col_start) return;
 
    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
 
    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
 
    if (threadIdx.x < col_size) {
        block_boxes[threadIdx.x * 7 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
        block_boxes[threadIdx.x * 7 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
        block_boxes[threadIdx.x * 7 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
        block_boxes[threadIdx.x * 7 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
        block_boxes[threadIdx.x * 7 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
        block_boxes[threadIdx.x * 7 + 5] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
        block_boxes[threadIdx.x * 7 + 6] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
    }
    __syncthreads();
 
    if (threadIdx.x < row_size) {
        const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
        const float *cur_box = boxes + cur_box_idx * 7;
 
        int i = 0;
        uint64_t t = 0;
        int start = 0;
        if (row_start == col_start) {
          start = threadIdx.x + 1;
        }
        for (i = start; i < col_size; i++) {
            if (iou_bev(cur_box, block_boxes + i * 7) > nms_overlap_thresh){
                t |= 1ULL << i;
            }
        }
        const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
        mask[cur_box_idx * col_blocks + col_start] = t;
    }
}
 
 
__device__ inline float iou_normal(float const * const a, float const * const b) {
    //params: a: [x, y, z, dx, dy, dz, heading]
    //params: b: [x, y, z, dx, dy, dz, heading]
 
    float left = fmaxf(a[0] - a[3] / 2, b[0] - b[3] / 2), right = fminf(a[0] + a[3] / 2, b[0] + b[3] / 2);
    float top = fmaxf(a[1] - a[4] / 2, b[1] - b[4] / 2), bottom = fminf(a[1] + a[4] / 2, b[1] + b[4] / 2);
    float width = fmaxf(right - left, 0.f), height = fmaxf(bottom - top, 0.f);
    float interS = width * height;
    float Sa = a[3] * a[4];
    float Sb = b[3] * b[4];
    return interS / fmaxf(Sa + Sb - interS, 1e-8);
}
 
 
__global__ void nms_normal_kernel(const int boxes_num, const float nms_overlap_thresh,
                           const float *boxes, uint64_t*mask){
    //params: boxes (N, 7) [x, y, z, dx, dy, dz, heading]
    //params: mask (N, N/THREADS_PER_BLOCK_NMS)
 
    const int row_start = blockIdx.y;
    const int col_start = blockIdx.x;
 
    // if (row_start > col_start) return;
 
    const int row_size = fminf(boxes_num - row_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
    const int col_size = fminf(boxes_num - col_start * THREADS_PER_BLOCK_NMS, THREADS_PER_BLOCK_NMS);
 
    __shared__ float block_boxes[THREADS_PER_BLOCK_NMS * 7];
 
    if (threadIdx.x < col_size) {
        block_boxes[threadIdx.x * 7 + 0] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 0];
        block_boxes[threadIdx.x * 7 + 1] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 1];
        block_boxes[threadIdx.x * 7 + 2] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 2];
        block_boxes[threadIdx.x * 7 + 3] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 3];
        block_boxes[threadIdx.x * 7 + 4] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 4];
        block_boxes[threadIdx.x * 7 + 5] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 5];
        block_boxes[threadIdx.x * 7 + 6] = boxes[(THREADS_PER_BLOCK_NMS * col_start + threadIdx.x) * 7 + 6];
    }
    __syncthreads();
 
    if (threadIdx.x < row_size) {
        const int cur_box_idx = THREADS_PER_BLOCK_NMS * row_start + threadIdx.x;
        const float *cur_box = boxes + cur_box_idx * 7;
 
        int i = 0;
        uint64_t t = 0;
        int start = 0;
        if (row_start == col_start) {
          start = threadIdx.x + 1;
        }
        for (i = start; i < col_size; i++) {
            if (iou_normal(cur_box, block_boxes + i * 7) > nms_overlap_thresh){
                t |= 1ULL << i;
            }
        }
        const int col_blocks = DIVUP(boxes_num, THREADS_PER_BLOCK_NMS);
        mask[cur_box_idx * col_blocks + col_start] = t;
    }
}
 
void boxesoverlapLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_overlap){
 
    dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK), DIVUP(num_a, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
 
    boxes_overlap_kernel<<<blocks, threads>>>(num_a, boxes_a, num_b, boxes_b, ans_overlap);
#ifdef DEBUG
    cudaDeviceSynchronize();  // for using printf in kernel function
#endif
}
 
void boxesioubevLauncher(const int num_a, const float *boxes_a, const int num_b, const float *boxes_b, float *ans_iou){
 
    dim3 blocks(DIVUP(num_b, THREADS_PER_BLOCK), DIVUP(num_a, THREADS_PER_BLOCK));  // blockIdx.x(col), blockIdx.y(row)
    dim3 threads(THREADS_PER_BLOCK, THREADS_PER_BLOCK);
 
    boxes_iou_bev_kernel<<<blocks, threads>>>(num_a, boxes_a, num_b, boxes_b, ans_iou);
#ifdef DEBUG
    cudaDeviceSynchronize();  // for using printf in kernel function
#endif
}
 
 
void nmsLauncher(const float *boxes, uint64_t* mask, int boxes_num, float nms_overlap_thresh){
    dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
                DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
    dim3 threads(THREADS_PER_BLOCK_NMS);
    nms_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes, mask);
}
 
 
void nmsNormalLauncher(const float *boxes, uint64_t* mask, int boxes_num, float nms_overlap_thresh){
    dim3 blocks(DIVUP(boxes_num, THREADS_PER_BLOCK_NMS),
                DIVUP(boxes_num, THREADS_PER_BLOCK_NMS));
    dim3 threads(THREADS_PER_BLOCK_NMS);
    nms_normal_kernel<<<blocks, threads>>>(boxes_num, nms_overlap_thresh, boxes, mask);
}

iou3d_nms.h

#ifndef IOU3D_NMS_H
#define IOU3D_NMS_H
 
#include <torch/serialize/tensor.h>
#include <vector>
#include <assert.h>
#include <cuda.h>
#include <cuda_runtime_api.h>
 
//int boxes_aligned_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_overlap);
int boxes_overlap_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_overlap);
int boxes_iou_bev_gpu(at::Tensor boxes_a, at::Tensor boxes_b, at::Tensor ans_iou);
int nms_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh);
int nms_normal_gpu(at::Tensor boxes, at::Tensor keep, float nms_overlap_thresh);
 
#endif

iou3d_nms_api.cpp

#include <torch/serialize/tensor.h>
#include <torch/extension.h>
#include <vector>
#include <cuda.h>
#include <cuda_runtime_api.h>
 
#include "iou3d_cpu.h"
#include "iou3d_nms.h"
 
 
PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
    //m.def("boxes_aligned_overlap_bev_gpu", &boxes_aligned_overlap_bev_gpu, "aligned oriented boxes overlap");
    m.def("boxes_overlap_bev_gpu", &boxes_overlap_bev_gpu, "oriented boxes overlap");
    m.def("boxes_iou_bev_gpu", &boxes_iou_bev_gpu, "oriented boxes iou");
    m.def("nms_gpu", &nms_gpu, "oriented nms gpu");
    m.def("nms_normal_gpu", &nms_normal_gpu, "nms gpu");
    m.def("boxes_aligned_iou_bev_cpu", &boxes_aligned_iou_bev_cpu, "aligned oriented boxes iou");
    m.def("boxes_iou_bev_cpu", &boxes_iou_bev_cpu, "oriented boxes iou");
}
  1. 编译 python setup.py develop
    ![在这里插入图片描述](https://img-blog.csdnimg.cn/
    可能出现的坑
    问题1(已解决)
无法打开输入文件“F:\OpenPCDet\OpenPCDet\build\temp.win-amd64-3.8\Release\pcdet\ops\iou3d_nms\src\iou3d_cpu.obj”

error: command 'C:\\Program Files (x86)\\Microsoft Visual Studio\\2017\\BuildTools\\VC\\Tools\\MSVC\\14.16.27023\\bin\\HostX86\\x64\\link.exe' failed with exit status 1181

将setup.py中的cmdclass={'build_ext': BuildExtension}”修改为cmdclass={'build_ext': BuildExtension.with_options(use_ninja=False)},然后重新编译;
参考https://blog.csdn.net/qq_61981718/article/details/130741276

问题2(已解决)

安装过程中报错:fatal error: THC/THC.h: No such file or directory

主要是pytorch版本太高导致,可将版本降低至1.11以下,或者更改./OpenPCDet/pcdet/ops/pointnet2/pointnet2_batch/src和./OpenPCDet/pcdet/ops/pointnet2/pointnet2_stack/src文件夹下的几个cpp文件

//#include <THC/THC.h>
...
//extern THCState *state

将这两行注释掉之后重新编译

评论 1
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值