尝试利用CMake重编译OpenCV多次失败,苦于用不了OpenCV的GPU模块!
受一个师兄的启发,开始尝试从OpenCV库函数中抠出来自己的CUDA代码,忙活了一周终于有点起色。成功抠出来FAST角点检测的代码,特在此分享经验~
首先需要研读OpenCV库函数的代码,找到其位置:
.cu文件
D:\software\opencv-2.4.13\opencv\sources\modules\gpu\src\cuda
就是你安装OpenCV的地方。。。。那一堆文件;
进去CUDA这个文件夹,看到了熟悉的.cu文件,然后就是根据自己需要打开相应的文件进行修改;
.cpp文件
D:\software\opencv-2.4.13\opencv\sources\modules\gpu\src
头文件(主要是一些类的定义)
D:\software\opencv-2.4.13\opencv\sources\modules\gpu\include\opencv2\gpu
以FAST角点检测为例,这里需要把fast.cu、fast.cpp、gpu.hpp三个文件拷贝出来。
下面就是工作的重点,对其修改形成自己的CUDA文件。
修改.cu、.cpp、.hpp文件
这一部分需要注意的地方:
- 数据结构的改变,如GpuMat->Mat;
- 注意GPU模块独有的一些结构体类型:
如:template <typename T> struct PtrStepSz,要注意将其变换为相应类型的指针,如T为unsigned char类型时,就要将其改成uchar*,另外需要注意添加申请设备端变量空间以及主机端到设备端的数据传递等环节;
- 头文件的包含,需要去掉OpenCV GPU模块头文件的包含,添加用到的OpenCV的库函数以及CUDA库函数;
- 还有一些接口的数据传输,需要在具体调试过程中(结合Nsight工具观察核函数的执行情况)进行修改;
- 另外,库函数里会有一堆namespace,我选择直接把它删掉,也不影响什么。
接下来就是漫长的调试过程
下面给出FAST非极大值抑制部分的代码:先是修改前的OpenCV原始版本,紧接着是我修改后的版本(参数中多了一个max、score类型由PtrStepSzi改为Mat的那个函数)
修改前:
// Host-side launcher for the nonmaxSuppression kernel (OpenCV GPU-module version).
//
// Resets the device symbol g_counter to zero, launches the kernel with
// 256-thread blocks (divUp(count, 256) of them), waits for the device to
// finish, and returns the value the kernel left in g_counter.
// All pointer arguments are forwarded to the kernel unchanged.
int nonmaxSuppression_gpu(const short2* kpLoc, int count, PtrStepSzi score, short2* loc, float* response)
{
    // Look up the device address of g_counter and clear it before the launch.
    void* d_counter;
    cudaSafeCall( cudaGetSymbolAddress(&d_counter, g_counter) );
    cudaSafeCall( cudaMemset(d_counter, 0, sizeof(unsigned int)) );

    // 1-D launch: enough 256-thread blocks to cover `count` entries.
    const dim3 threads(256);
    const dim3 blocks(divUp(count, threads.x));

    nonmaxSuppression<<<blocks, threads>>>(kpLoc, count, score, loc, response);
    cudaSafeCall( cudaGetLastError() );        // reports launch failures
    cudaSafeCall( cudaDeviceSynchronize() );   // waits for the kernel to finish

    // Read the counter back to the host.
    unsigned int h_counter;
    cudaSafeCall( cudaMemcpy(&h_counter, d_counter, sizeof(unsigned int), cudaMemcpyDeviceToHost) );

    return h_counter;
}
// Host-side wrapper around the nonmaxSuppression kernel that works on plain
// host buffers and a cv::Mat instead of OpenCV GPU-module types.
//
// Uploads the host buffers to freshly allocated device memory, resets the
// device symbol g_counter, launches the kernel with 256-thread blocks
// (iDivUp(count, 256) of them), times the launch with CUDA events, copies
// the buffers back to the host and releases all device resources.
//
// Parameters:
//   kpLoc    host array; `max` elements are uploaded and copied back.
//   count    number of entries handed to the kernel; also the number of
//            elements of `loc` and `response` that are transferred.
//   score    host image; its pixel buffer is uploaded and score.cols is
//            passed to the kernel as the row width.
//            NOTE(review): assumes score is stored continuously - confirm.
//   loc      host array of `count` elements, overwritten with device results.
//   response host array of `count` elements, overwritten with device results.
//   max      number of kpLoc elements transferred.
//            NOTE(review): presumably must be >= count - verify against caller.
//
// Returns the value found in g_counter after the kernel has run, or 0
// without touching the device when count <= 0.
int nonmaxSuppression_gpu(short2* kpLoc, int count, Mat score, short2* loc, float* response,int max)
{
    // A grid with zero blocks is an invalid launch configuration, so there
    // is nothing to do for an empty input.
    if (count <= 0)
        return 0;

    void* counter_ptr = 0;
    checkCudaErrors(cudaGetSymbolAddress(&counter_ptr, g_counter));

    cudaEvent_t start, stop;
    checkCudaErrors(cudaEventCreate(&start));
    checkCudaErrors(cudaEventCreate(&stop));

    dim3 block(256);
    dim3 grid;
    grid.x = iDivUp(count, block.x);

    const size_t kpLocBytes    = sizeof(short2) * max;
    const size_t locBytes      = sizeof(short2) * count;
    const size_t responseBytes = sizeof(float) * count;
    // Size the score buffer by the Mat's real element size: identical to
    // cols*rows for 8-bit images, and no longer truncated for wider types.
    const size_t scoreBytes    = score.total() * score.elemSize();

    short2* d_kpLoc = 0;
    checkCudaErrors(cudaMalloc((void**)&d_kpLoc, kpLocBytes));
    checkCudaErrors(cudaMemcpy(d_kpLoc, kpLoc, kpLocBytes, cudaMemcpyHostToDevice));

    // loc and response are uploaded as well so that entries the kernel does
    // not write keep their host values after the copy back.
    short2* d_loc = 0;
    checkCudaErrors(cudaMalloc((void**)&d_loc, locBytes));
    checkCudaErrors(cudaMemcpy(d_loc, loc, locBytes, cudaMemcpyHostToDevice));

    uchar* d_scoredata0 = 0;
    checkCudaErrors(cudaMalloc((void**)&d_scoredata0, scoreBytes));
    checkCudaErrors(cudaMemcpy(d_scoredata0, score.data, scoreBytes, cudaMemcpyHostToDevice));

    float* d_response = 0;
    checkCudaErrors(cudaMalloc((void**)&d_response, responseBytes));
    checkCudaErrors(cudaMemcpy(d_response, response, responseBytes, cudaMemcpyHostToDevice));

    int w_score = score.cols;

    // The counter must be zero before the kernel runs.
    checkCudaErrors(cudaMemset(counter_ptr, 0, sizeof(unsigned int)));

    checkCudaErrors(cudaEventRecord(start));
    nonmaxSuppression<<<grid, block>>>(d_kpLoc, count, d_scoredata0, d_loc, d_response, w_score);
    checkCudaErrors(cudaGetLastError());        // launch-configuration errors
    checkCudaErrors(cudaEventRecord(stop));
    checkCudaErrors(cudaEventSynchronize(stop));
    checkCudaErrors(cudaDeviceSynchronize());   // errors raised while the kernel ran

    float elptime;
    checkCudaErrors(cudaEventElapsedTime(&elptime, start, stop));
    cout << "nonmaxSuppression_gpu Elapsed time:" << elptime << "ms" << endl;

    unsigned int new_count;
    checkCudaErrors(cudaMemcpy(&new_count, counter_ptr, sizeof(unsigned int), cudaMemcpyDeviceToHost));

    checkCudaErrors(cudaMemcpy(kpLoc, d_kpLoc, kpLocBytes, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(loc, d_loc, locBytes, cudaMemcpyDeviceToHost));
    checkCudaErrors(cudaMemcpy(response, d_response, responseBytes, cudaMemcpyDeviceToHost));

    // %u matches the unsigned counter.
    printf("The keypoints count after nonmaxSuppression is:%u\n ", new_count);

    checkCudaErrors(cudaFree(d_scoredata0));
    checkCudaErrors(cudaFree(d_response));
    checkCudaErrors(cudaFree(d_kpLoc));
    checkCudaErrors(cudaFree(d_loc));

    // Events are device resources too; release them so repeated calls do not leak.
    checkCudaErrors(cudaEventDestroy(start));
    checkCudaErrors(cudaEventDestroy(stop));

    return static_cast<int>(new_count);
}
期间结合CPU端调试和Nsight,经过各种修改参数、添加语句,终于搞定。下面是测试的结果:
嗯还算不错, 后来resize了图片大小观察结果好像也都差不太多。
下一步准备搞一下ORB算法!
欢迎大神批评指正~