前几天写了一个CPU版本的透视变换,经过了几天的debugging,终于移植到GPU里了,CPU版本的链接:
https://blog.csdn.net/YaoJiawei329/article/details/114833550
这算是个笔记吧。 CUDA代码如下
#include <iostream>
#include <vector>
#include <opencv2/opencv.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/core.hpp>
#include <cuda.h>
#include <cuda_runtime.h>
using namespace std;
using namespace cv;
//CPU版本的代码
void warpPerspectiveTransform(const cv::Mat& src, cv::Mat& dst, const cv::Mat& TransformMatrix, const cv::Size& dstSize) {
const double M00 = TransformMatrix.at<double>(0, 0);
const double M01 = TransformMatrix.at<double>(0, 1); //(0,0) X
const double M02 = TransformMatrix.at<double>(0, 2); // ------------->
const double M10 = TransformMatrix.at<double>(1, 0); // |
const double M11 = TransformMatrix.at<double>(1, 1); // |
const double M12 = TransformMatrix.at<double>(1, 2); // |
const double M20 = TransformMatrix.at<double>(2, 0); // |
const double M21 = TransformMatrix.at<double>(2, 1); // |
const double M22 = TransformMatrix.at<double>(2, 2); // Y\/
for (int v = 0; v < dstSize.height; v = v + 1) { //x, y are src, u, v are dst. Search in dst.
for (int u = 0; u < dstSize.width; u = u + 1) {
int y = (int)(((M02-u*M22)*(v*M20-M10)-(M12-v*M22)*(u*M20-M00)) / ((u*M21-M01)*(v*M20-M10)-(v*M21-M11)*(u*M20-M00)));
int x = (int)(((M02-u*M22)*(v*M21-M11)-(M12-v*M22)*(u*M21-M01)) / ((u*M20-M00)*(v*M21-M11)-(v*M20-M10)*(u*M21-M01)));
if (y >=0 && x >= 0 && y < src.rows && x < src.cols) {
dst.at<cv::Vec3b>(v, u) = src.at<cv::Vec3b>(y, x);
}
}
}
}
__global__ void global_warpPerspectiveTransform(const uchar3* src, uchar3* dst, const double* d_tfMatrix, const int src_row, const int src_col, const int dst_row, const int dst_col) {
//这是精确计算二维grid和二维block的thread_id的计算公式。
//int tid = (gridDim.x*blockIdx.y+blockIdx.x)*blockDim.x*blockDim.y+threadIdx.y*blockDim.x+threadIdx.x;
for (int v = blockDim.y * blockIdx.y + threadIdx.y; v < dst_row; v = v + gridDim.y * blockDim.y) { //x, y are src, u, v are dst. Search in dst.
for (int u = blockDim.x * blockIdx.x + threadIdx.x; u < dst_col; u = u + gridDim.x * blockDim.x) {
int x = (int)(((d_tfMatrix[2]-u*d_tfMatrix[8])*(v*d_tfMatrix[7]-d_tfMatrix[4])-(d_tfMatrix[5]-v*d_tfMatrix[8])*(u*d_tfMatrix[7]-d_tfMatrix[1])) / ((u*d_tfMatrix[6]-d_tfMatrix[0])*(v*d_tfMatrix[7]-d_tfMatrix[4])-(v*d_tfMatrix[6]-d_tfMatrix[3])*(u*d_tfMatrix[7]-d_tfMatrix[1])));
int y = (int)(((d_tfMatrix[2]-u*d_tfMatrix[8])*(v*d_tfMatrix[6]-d_tfMatrix[3])-(d_tfMatrix[5]-v*d_tfMatrix[8])*(u*d_tfMatrix[6]-d_tfMatrix[0])) / ((u*d_tfMatrix[7]-d_tfMatrix[1])*(v*d_tfMatrix[6]-d_tfMatrix[3])-(v*d_tfMatrix[7]-d_tfMatrix[4])*(u*d_tfMatrix[6]-d_tfMatrix[0])));
if (x >= 0 && y >=0 && x < src_col && y < src_row) {
dst[v * dst_col + u].x = src[y * src_col + x].x;
dst[v * dst_col + u].y = src[y * src_col + x].y;
dst[v * dst_col + u].z = src[y * src_col + x].z;
}
__syncthreads();//加这个是保险
//直接无视这几行。这几行是debugging过程中的调试代码
// if ((gridDim.x*blockIdx.y+blockIdx.x)*blockDim.x*blockDim.y+threadIdx.y*blockDim.x+threadIdx.x == 10000) {
// printf("d_tfMatrix[5] = %f ", d_tfMatrix[5]);
// printf("x = %d, y = %d ", x, y);
// printf("src_row = %d ", src_row);
// printf("src_col = %d ", src_col);
// printf("y * src_col + x = %d ", y * src_col + x);
// printf("src[y * src_col + x].x = %d", src[y * src_col + x].x);
// }
}
}
}
int main()
{
cv::Mat image = cv::imread("../../warpPerspectiveTransform2/front.jpg");
int image_row = image.rows;
int image_col = image.cols;
cv::Mat image_Perspective(720, 1280, CV_8UC3);
cv::Size dst_size = cv::Size(1280, 720);
cv::Point2f src_points[] = {
cv::Point2f(400, 360),
cv::Point2f(879, 360),
cv::Point2f(1279, 719),
cv::Point2f(0, 719)
};
cv::Point2f dst_points[] = {
cv::Point2f(0, 0),
cv::Point2f(dst_size.width - 1, 0),
cv::Point2f(dst_size.width - 1, dst_size.height - 1),
cv::Point2f(0, dst_size.height - 1)
};
cv::Mat tfMatrix = cv::getPerspectiveTransform(src_points, dst_points);
int row = dst_size.height;
int col = dst_size.width;
uchar3* d_image;
uchar3* d_image_Perspective;
double* d_tfMatrix;
size_t size_image = sizeof(uchar3) * image.rows * image.cols;
size_t size_image_Perspecvive = sizeof(uchar3) * image_Perspective.rows * image_Perspective.cols;
cudaMalloc((void**)&d_image, size_image);
cudaMalloc((void**)&d_image_Perspective, size_image_Perspecvive);
cudaMalloc((void**)&d_tfMatrix, sizeof(double) * 9);
cudaMemcpy(d_image, (uchar3*)image.data, size_image, cudaMemcpyHostToDevice);
cudaMemcpy(d_tfMatrix, (double*)tfMatrix.data, sizeof(double) * 9, cudaMemcpyHostToDevice);
dim3 gridDim(10, 10, 1);
dim3 blockDim(32, 32, 1);
global_warpPerspectiveTransform << <gridDim, blockDim>> >(d_image, d_image_Perspective, d_tfMatrix, image_row, image_col, row, col);
cudaMemcpy((uchar3*)image_Perspective.data, d_image_Perspective, size_image_Perspecvive, cudaMemcpyDeviceToHost);
cv::imshow("image_Perspective", image_Perspective);
cv::waitKey(0);
cudaFree(d_image);
cudaFree(d_image_Perspective);
cudaFree(d_tfMatrix);
return 0;
}
原图:
效果:
和CPU版本的一样。
当日下午更新:将算法拆分为两步进行。(在已知透视变换矩阵tfMatrix之后)第一步计算dst中的点(u, v)在src中对应的坐标(x, y),第二步利用这个坐标对应关系进行赋值。代码如下:
#include <iostream>
#include <vector>
#include <opencv2/opencv.hpp>
#include <opencv2/imgproc.hpp>
#include "opencv2/core.hpp"
#include "cuda.h"
#include "cuda_runtime.h"
using namespace std;
using namespace cv;
__device__ void device_calculateSrcCoordinate(const double* TransformMatrix, uint2* srcCoordinate, const int src_row, const int src_col, const int dst_row, const int dst_col) {
const double M00 = TransformMatrix[0];
const double M01 = TransformMatrix[1];
const double M02 = TransformMatrix[2];
const double M10 = TransformMatrix[3];
const double M11 = TransformMatrix[4];
const double M12 = TransformMatrix[5];
const double M20 = TransformMatrix[6];
const double M21 = TransformMatrix[7];
const double M22 = TransformMatrix[8];
for (int v = blockDim.y * blockIdx.y + threadIdx.y; v < dst_row; v = v + gridDim.y * blockDim.y) {//x, y are src, u, v are dst. Search in dst. (x, y) = T^(-1)(u, v)
for (int u = blockDim.x * blockIdx.x + threadIdx.x; u < dst_col; u = u + gridDim.x * blockDim.x) {
//src_x = srcCoordinate[v * dst_col + u].x, src_y = srcCoordinate[v * dst_col + u].y
srcCoordinate[v * src_col + u].x = (uint)(((M02-u*M22)*(v*M21-M11)-(M12-v*M22)*(u*M21-M01)) / ((u*M20-M00)*(v*M21-M11)-(v*M20-M10)*(u*M21-M01)));
srcCoordinate[v * src_col + u].y = (uint)(((M02-u*M22)*(v*M20-M10)-(M12-v*M22)*(u*M20-M00)) / ((u*M21-M01)*(v*M20-M10)-(v*M21-M11)*(u*M20-M00)));
__syncthreads();
}
}
}
__device__ void device_mapping(const uchar3* src, uchar3* dst, const uint2* srcCoordinate, const int src_row, const int src_col, const int dst_row, const int dst_col) {
for (int v = blockDim.y * blockIdx.y + threadIdx.y; v < dst_row; v = v + gridDim.y * blockDim.y) {//x, y are src, u, v are dst. Search in dst.
for (int u = blockDim.x * blockIdx.x + threadIdx.x; u < dst_col; u = u + gridDim.x * blockDim.x) {
dst[v * dst_col + u].x = src[srcCoordinate[v * dst_col + u].y * src_col + srcCoordinate[v * dst_col + u].x].x;
dst[v * dst_col + u].y = src[srcCoordinate[v * dst_col + u].y * src_col + srcCoordinate[v * dst_col + u].x].y;
dst[v * dst_col + u].z = src[srcCoordinate[v * dst_col + u].y * src_col + srcCoordinate[v * dst_col + u].x].z;
__syncthreads();
}
}
}
__global__ void gpu_do(const uchar3* src, uchar3* dst, const double* TransformMatrix, uint2* d_srcCoordinate, const int src_row, const int src_col, const int dst_row, const int dst_col) {
device_calculateSrcCoordinate(TransformMatrix, d_srcCoordinate, src_row, src_col, dst_row, dst_col);
device_mapping(src, dst, d_srcCoordinate, src_row, src_col, dst_row, dst_col);
}
int main()
{
cv::Mat image = cv::imread("../../warpPerspectiveTransform3/front.jpg");
const int image_row = image.rows;
const int image_col = image.cols;
cv::Mat image_Perspective(720, 1280, CV_8UC3);
cv::Size dst_size = cv::Size(1280, 720);
cv::Point2f src_points[] = {
cv::Point2f(400, 360),
cv::Point2f(879, 360),
cv::Point2f(1279, 719),
cv::Point2f(0, 719)
};
cv::Point2f dst_points[] = {
cv::Point2f(0, 0),
cv::Point2f(dst_size.width - 1, 0),
cv::Point2f(dst_size.width - 1, dst_size.height - 1),
cv::Point2f(0, dst_size.height - 1)
};
cv::Mat tfMatrix = cv::getPerspectiveTransform(src_points, dst_points);
uchar3* d_image;
uchar3* d_image_Perspective;
double* d_tfMatrix;
uint2* d_srcCoordinate;
size_t size_image = sizeof(uchar3) * image.rows * image.cols;
size_t size_image_Perspecvive = sizeof(uchar3) * image_Perspective.rows * image_Perspective.cols;
cudaMalloc((void**)&d_image, size_image);
cudaMalloc((void**)&d_image_Perspective, size_image_Perspecvive);
cudaMalloc((void**)&d_tfMatrix, sizeof(double) * 9);
cudaMalloc((void**)&d_srcCoordinate, sizeof(uint2) * image_Perspective.rows * image_Perspective.cols);
cudaMemcpy(d_image, (uchar3*)image.data, size_image, cudaMemcpyHostToDevice);
cudaMemcpy(d_tfMatrix, (double*)tfMatrix.data, sizeof(double) * 9, cudaMemcpyHostToDevice);
dim3 gridDim(10, 10, 1);
dim3 blockDim(32, 32, 1);
gpu_do << <gridDim, blockDim>> >(d_image, d_image_Perspective, d_tfMatrix, d_srcCoordinate, image_row, image_col, dst_size.height, dst_size.width);
cudaMemcpy((uchar3*)image_Perspective.data, d_image_Perspective, size_image_Perspecvive, cudaMemcpyDeviceToHost);
cv::imshow("image_Perspective", image_Perspective);
cv::imwrite("../../warpPerspectiveTransform3/front_Perspective.jpg", image_Perspective);
cv::waitKey(0);
cudaFree(d_image);
cudaFree(d_image_Perspective);
cudaFree(d_tfMatrix);
cudaFree(d_srcCoordinate);
return 0;
}
效果是一样的。
第二天的更新:可以将内核函数gpu_do用普通函数封装:
void gpu_code(const uchar3* src, uchar3* dst, const double* TransformMatrix, uint2* d_srcCoordinate, const int src_row, const int src_col, const int dst_row, const int dst_col) {
dim3 gridDim(10, 10, 1);
dim3 blockDim(32, 32, 1);
gpu_do << <gridDim, blockDim>> >(src, dst, TransformMatrix, d_srcCoordinate, src_row, src_col, dst_row, dst_col);
}
然后正常调用gpu_code就行了,这样做是因为内核函数不能是类中的成员函数,但是成员函数可以调用内核函数。
PS:内核函数中使用float会比double快不少。