undistort.h:在头文件中声明接口
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
// C-linkage entry point that launches the undistortion CUDA kernel.
// All four pointers are handed unchanged to the kernel, which dereferences
// them on the GPU, so they must refer to device-accessible memory.
//   src, dst       - input / output images, indexed as row * width + column.
//   camera_matrix  - elements [0], [2], [4], [5] are read as fx, cx, fy, cy.
//   dist           - elements [0..4] are read as k1, k2, p1, p2, k3.
//   width, height  - image dimensions in pixels.
extern "C" void undistortcuda(float * src, float *dst,float *camera_matrix,float *dist,int width,int height);
undistort.cu:核函数及主机端封装的实现
#include "cuda_runtime.h"
#include "cuda.h"
#include <iostream>
#include <string>
using namespace std;
// Forward declaration with C linkage, so the host wrapper defined later in
// this file is exported under an unmangled name.
extern "C" void undistortcuda(float * src, float *dst,float *camera_matrix,float *dist,int width,int height);
/**
 * Per-pixel lens-distortion remap with nearest-neighbour sampling.
 *
 * Launch layout: 2D grid of 2D blocks; thread (x, y) handles output pixel
 * (x, y). The grid may overshoot the image, excess threads exit immediately.
 * No shared memory is used.
 *
 * For each output pixel the kernel normalises the pixel coordinate with the
 * intrinsics, runs 5 fixed-point iterations that remove radial (k1, k2, k3)
 * and tangential (p1, p2) distortion, re-projects the result to pixel
 * coordinates and copies the source pixel found there. Pixels that map
 * outside the image are written as 0.
 *
 * NOTE(review): the mapping direction (output pixel -> iteratively undistorted
 * source location) is kept exactly as in the original code; confirm it is the
 * direction the caller intends.
 *
 * @param src            input image, width * height floats, row-major
 * @param dst            output image, width * height floats, row-major
 * @param camera_matrix  intrinsics; [0]=fx, [2]=cx, [4]=fy, [5]=cy are read
 * @param distCoeffs     distortion coefficients; [0..4] = k1, k2, p1, p2, k3
 * @param width          image width in pixels
 * @param height         image height in pixels
 */
__global__ void undistort_(float * src, float *dst, float *camera_matrix, float *distCoeffs, int width, int height)
{
    const int xIndex = threadIdx.x + blockDim.x * blockIdx.x;
    const int yIndex = threadIdx.y + blockDim.y * blockIdx.y;

    // The grid is rounded up to whole blocks, so threads past the image edge
    // must not touch memory (they would otherwise clobber neighbouring rows
    // or write beyond the end of dst).
    if (xIndex >= width || yIndex >= height)
        return;

    const int idx = yIndex * width + xIndex;

    const float k1 = distCoeffs[0];
    const float k2 = distCoeffs[1];
    const float p1 = distCoeffs[2];
    const float p2 = distCoeffs[3];
    const float k3 = distCoeffs[4];

    const float fx = camera_matrix[0];
    const float fy = camera_matrix[4];
    const float cx = camera_matrix[2];
    const float cy = camera_matrix[5];

    // Normalised image coordinates of this pixel; (x0, y0) stay fixed while
    // (x, y) is refined by the iteration below.
    const float x0 = (xIndex - cx) / fx;
    const float y0 = (yIndex - cy) / fy;
    float x = x0;
    float y = y0;

    for (int iter = 0; iter < 5; ++iter)
    {
        const float r2 = x * x + y * y;
        // Inverse of the radial factor 1 + k1*r^2 + k2*r^4 + k3*r^6 (Horner form).
        const float icdist = 1.0f / (1.0f + ((k3 * r2 + k2) * r2 + k1) * r2);
        // Tangential distortion terms.
        const float deltaX = 2.0f * p1 * x * y + p2 * (r2 + 2.0f * x * x);
        const float deltaY = p1 * (r2 + 2.0f * y * y) + 2.0f * p2 * x * y;
        x = (x0 - deltaX) * icdist;
        y = (y0 - deltaY) * icdist;
    }

    // Back to pixel coordinates.
    const float x_undist = x * fx + cx;
    const float y_undist = y * fy + cy;

    // Nearest-neighbour lookup (truncation); NaN/Inf fail the range test and
    // fall through to the zero fill.
    if (x_undist >= 0 && y_undist >= 0 && x_undist < width && y_undist < height)
    {
        dst[idx] = src[(int)y_undist * width + (int)x_undist];
    }
    else
    {
        dst[idx] = 0;
    }
}
/**
 * Host-side wrapper that launches the undistort_ kernel over the whole image.
 *
 * The pointers are passed straight through to the kernel, so they must be
 * device-accessible. The launch is issued on the default stream and this
 * function does not synchronise; it only reports launch-configuration errors.
 *
 * @param src            input image, width * height floats
 * @param dst            output image, width * height floats
 * @param camera_matrix  camera intrinsics read by the kernel
 * @param dist           distortion coefficients read by the kernel
 * @param width          image width in pixels
 * @param height         image height in pixels
 */
void undistortcuda(float * src, float *dst,float *camera_matrix,float *dist,int width,int height)
{
    // An empty image would yield a zero-sized grid, which is an invalid launch.
    if (width <= 0 || height <= 0)
        return;

    const dim3 tpb(32, 32);
    // Ceil-division so partially filled edge blocks are still launched.
    const dim3 blocksPerGrid((width + tpb.x - 1) / tpb.x, (height + tpb.y - 1) / tpb.y);

    undistort_<<<blocksPerGrid, tpb>>>(src, dst, camera_matrix, dist, width, height);

    // Kernel launches do not return a status; query it explicitly.
    const cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess)
        std::cerr << "undistortcuda: kernel launch failed: " << cudaGetErrorString(err) << std::endl;
}
结果:速度很快,在 1080 Ti 上约 1 ms,在 TX2 上约 3 ms。