CUDA+OpenCV实现透视变换

前几天写了一个CPU版本的透视变换,经过了几天的debugging,终于移植到GPU里了,CPU版本的链接:
https://blog.csdn.net/YaoJiawei329/article/details/114833550
这算是个笔记吧。 CUDA代码如下

#include <iostream>
#include <vector>
#include <opencv2/opencv.hpp>
#include <opencv2/imgproc.hpp>
#include <opencv2/core.hpp>
#include <cuda.h>
#include <cuda_runtime.h>

using namespace std;
using namespace cv;

//CPU版本的代码
void warpPerspectiveTransform(const cv::Mat& src, cv::Mat& dst, const cv::Mat& TransformMatrix, const cv::Size& dstSize) {
    const double M00 = TransformMatrix.at<double>(0, 0);
    const double M01 = TransformMatrix.at<double>(0, 1);            //(0,0)          X
    const double M02 = TransformMatrix.at<double>(0, 2);            //   ------------->
    const double M10 = TransformMatrix.at<double>(1, 0);            //  |
    const double M11 = TransformMatrix.at<double>(1, 1);            //  |
    const double M12 = TransformMatrix.at<double>(1, 2);            //  |
    const double M20 = TransformMatrix.at<double>(2, 0);            //  |
    const double M21 = TransformMatrix.at<double>(2, 1);            //  |
    const double M22 = TransformMatrix.at<double>(2, 2);            // Y\/
    for (int v = 0; v < dstSize.height; v = v + 1) {   //x, y are src, u, v are dst. Search in dst.
        for (int u = 0; u < dstSize.width; u = u + 1) {
            int y = (int)(((M02-u*M22)*(v*M20-M10)-(M12-v*M22)*(u*M20-M00)) / ((u*M21-M01)*(v*M20-M10)-(v*M21-M11)*(u*M20-M00)));
            int x = (int)(((M02-u*M22)*(v*M21-M11)-(M12-v*M22)*(u*M21-M01)) / ((u*M20-M00)*(v*M21-M11)-(v*M20-M10)*(u*M21-M01)));
            if (y >=0 && x >= 0 && y < src.rows && x < src.cols) {
                dst.at<cv::Vec3b>(v, u) = src.at<cv::Vec3b>(y, x);
            }
        }
    }
}

__global__ void global_warpPerspectiveTransform(const uchar3* src, uchar3* dst, const double* d_tfMatrix, const int src_row, const int src_col, const int dst_row, const int dst_col) {
    //这是精确计算二维grid和二维block的thread_id的计算公式。
    //int tid = (gridDim.x*blockIdx.y+blockIdx.x)*blockDim.x*blockDim.y+threadIdx.y*blockDim.x+threadIdx.x;
    for (int v = blockDim.y * blockIdx.y + threadIdx.y; v < dst_row; v = v + gridDim.y * blockDim.y) {   //x, y are src, u, v are dst. Search in dst.
        for (int u = blockDim.x * blockIdx.x + threadIdx.x; u < dst_col; u = u + gridDim.x * blockDim.x) {
            int x = (int)(((d_tfMatrix[2]-u*d_tfMatrix[8])*(v*d_tfMatrix[7]-d_tfMatrix[4])-(d_tfMatrix[5]-v*d_tfMatrix[8])*(u*d_tfMatrix[7]-d_tfMatrix[1])) / ((u*d_tfMatrix[6]-d_tfMatrix[0])*(v*d_tfMatrix[7]-d_tfMatrix[4])-(v*d_tfMatrix[6]-d_tfMatrix[3])*(u*d_tfMatrix[7]-d_tfMatrix[1])));
            int y = (int)(((d_tfMatrix[2]-u*d_tfMatrix[8])*(v*d_tfMatrix[6]-d_tfMatrix[3])-(d_tfMatrix[5]-v*d_tfMatrix[8])*(u*d_tfMatrix[6]-d_tfMatrix[0])) / ((u*d_tfMatrix[7]-d_tfMatrix[1])*(v*d_tfMatrix[6]-d_tfMatrix[3])-(v*d_tfMatrix[7]-d_tfMatrix[4])*(u*d_tfMatrix[6]-d_tfMatrix[0])));
            if (x >= 0 && y >=0 && x < src_col && y < src_row) {
                dst[v * dst_col + u].x = src[y * src_col + x].x;
                dst[v * dst_col + u].y = src[y * src_col + x].y;
                dst[v * dst_col + u].z = src[y * src_col + x].z;
            }
            __syncthreads();//加这个是保险
//直接无视这几行。这几行是debugging过程中的调试代码           
//            if ((gridDim.x*blockIdx.y+blockIdx.x)*blockDim.x*blockDim.y+threadIdx.y*blockDim.x+threadIdx.x == 10000) {
//                printf("d_tfMatrix[5] = %f ", d_tfMatrix[5]);
//                printf("x = %d, y = %d ", x, y);
//                printf("src_row = %d ", src_row);
//                printf("src_col = %d ", src_col);
//                printf("y * src_col + x = %d ", y * src_col + x);
//                printf("src[y * src_col + x].x = %d", src[y * src_col + x].x);
//            }
        }
    }
}

int main()
{
    cv::Mat image = cv::imread("../../warpPerspectiveTransform2/front.jpg");
    int image_row = image.rows;
    int image_col = image.cols;
    cv::Mat image_Perspective(720, 1280, CV_8UC3);
    cv::Size dst_size = cv::Size(1280, 720);
    cv::Point2f src_points[] = {
             cv::Point2f(400, 360),
             cv::Point2f(879, 360),
             cv::Point2f(1279, 719),
             cv::Point2f(0, 719)
    };
    cv::Point2f dst_points[] = {
            cv::Point2f(0, 0),
            cv::Point2f(dst_size.width - 1, 0),
            cv::Point2f(dst_size.width - 1, dst_size.height - 1),
            cv::Point2f(0, dst_size.height - 1)
    };
    cv::Mat tfMatrix = cv::getPerspectiveTransform(src_points, dst_points);

    int row = dst_size.height;
    int col = dst_size.width;

    uchar3* d_image;
    uchar3* d_image_Perspective;
    double* d_tfMatrix;

    size_t size_image = sizeof(uchar3) * image.rows * image.cols;
    size_t size_image_Perspecvive = sizeof(uchar3) * image_Perspective.rows * image_Perspective.cols;
    cudaMalloc((void**)&d_image, size_image);
    cudaMalloc((void**)&d_image_Perspective, size_image_Perspecvive);
    cudaMalloc((void**)&d_tfMatrix, sizeof(double) * 9);
    cudaMemcpy(d_image, (uchar3*)image.data, size_image, cudaMemcpyHostToDevice);
    cudaMemcpy(d_tfMatrix, (double*)tfMatrix.data, sizeof(double) * 9, cudaMemcpyHostToDevice);

    dim3 gridDim(10, 10, 1);
    dim3 blockDim(32, 32, 1);
    global_warpPerspectiveTransform << <gridDim, blockDim>> >(d_image, d_image_Perspective, d_tfMatrix, image_row, image_col, row, col);

    cudaMemcpy((uchar3*)image_Perspective.data, d_image_Perspective, size_image_Perspecvive, cudaMemcpyDeviceToHost);

    cv::imshow("image_Perspective", image_Perspective);
    cv::waitKey(0);

    cudaFree(d_image);
    cudaFree(d_image_Perspective);
    cudaFree(d_tfMatrix);

    return 0;
}

原图:
在这里插入图片描述效果:
在这里插入图片描述和CPU版本的一样。

当日下午更新:将算法拆分为两步进行。(在已知透视变换矩阵tfMatrix之后)第一步计算dst中的点(u, v)在src中对应的坐标(x, y),第二步利用这个坐标对应关系进行赋值。代码如下:

#include <iostream>
#include <vector>
#include <opencv2/opencv.hpp>
#include <opencv2/imgproc.hpp>
#include "opencv2/core.hpp"
#include "cuda.h"
#include "cuda_runtime.h"

using namespace std;
using namespace cv;

__device__ void device_calculateSrcCoordinate(const double* TransformMatrix, uint2* srcCoordinate, const int src_row, const int src_col, const int dst_row, const int dst_col) {
    const double M00 = TransformMatrix[0];
    const double M01 = TransformMatrix[1];
    const double M02 = TransformMatrix[2];
    const double M10 = TransformMatrix[3];
    const double M11 = TransformMatrix[4];
    const double M12 = TransformMatrix[5];
    const double M20 = TransformMatrix[6];
    const double M21 = TransformMatrix[7];
    const double M22 = TransformMatrix[8];
    for (int v = blockDim.y * blockIdx.y + threadIdx.y; v < dst_row; v = v + gridDim.y * blockDim.y) {//x, y are src, u, v are dst. Search in dst. (x, y) = T^(-1)(u, v)
        for (int u = blockDim.x * blockIdx.x + threadIdx.x; u < dst_col; u = u + gridDim.x * blockDim.x) {
            //src_x = srcCoordinate[v * dst_col + u].x, src_y = srcCoordinate[v * dst_col + u].y
            srcCoordinate[v * src_col + u].x = (uint)(((M02-u*M22)*(v*M21-M11)-(M12-v*M22)*(u*M21-M01)) / ((u*M20-M00)*(v*M21-M11)-(v*M20-M10)*(u*M21-M01)));
            srcCoordinate[v * src_col + u].y = (uint)(((M02-u*M22)*(v*M20-M10)-(M12-v*M22)*(u*M20-M00)) / ((u*M21-M01)*(v*M20-M10)-(v*M21-M11)*(u*M20-M00)));
            __syncthreads();
        }
    }
}

__device__ void device_mapping(const uchar3* src, uchar3* dst, const uint2* srcCoordinate, const int src_row, const int src_col, const int dst_row, const int dst_col) {
    for (int v = blockDim.y * blockIdx.y + threadIdx.y; v < dst_row; v = v + gridDim.y * blockDim.y) {//x, y are src, u, v are dst. Search in dst.
        for (int u = blockDim.x * blockIdx.x + threadIdx.x; u < dst_col; u = u + gridDim.x * blockDim.x) {
            dst[v * dst_col + u].x = src[srcCoordinate[v * dst_col + u].y * src_col + srcCoordinate[v * dst_col + u].x].x;
            dst[v * dst_col + u].y = src[srcCoordinate[v * dst_col + u].y * src_col + srcCoordinate[v * dst_col + u].x].y;
            dst[v * dst_col + u].z = src[srcCoordinate[v * dst_col + u].y * src_col + srcCoordinate[v * dst_col + u].x].z;
            __syncthreads();
        }
    }
}

__global__ void gpu_do(const uchar3* src, uchar3* dst, const double* TransformMatrix, uint2* d_srcCoordinate, const int src_row, const int src_col, const int dst_row, const int dst_col) {
    device_calculateSrcCoordinate(TransformMatrix, d_srcCoordinate, src_row, src_col, dst_row, dst_col);
    device_mapping(src, dst, d_srcCoordinate, src_row, src_col, dst_row, dst_col);
}

int main()
{
    cv::Mat image = cv::imread("../../warpPerspectiveTransform3/front.jpg");
    const int image_row = image.rows;
    const int image_col = image.cols;
    cv::Mat image_Perspective(720, 1280, CV_8UC3);
    cv::Size dst_size = cv::Size(1280, 720);
    cv::Point2f src_points[] = {
             cv::Point2f(400, 360),
             cv::Point2f(879, 360),
             cv::Point2f(1279, 719),
             cv::Point2f(0, 719)
    };
    cv::Point2f dst_points[] = {
            cv::Point2f(0, 0),
            cv::Point2f(dst_size.width - 1, 0),
            cv::Point2f(dst_size.width - 1, dst_size.height - 1),
            cv::Point2f(0, dst_size.height - 1)
    };
    cv::Mat tfMatrix = cv::getPerspectiveTransform(src_points, dst_points);

    uchar3* d_image;
    uchar3* d_image_Perspective;
    double* d_tfMatrix;
    uint2* d_srcCoordinate;

    size_t size_image = sizeof(uchar3) * image.rows * image.cols;
    size_t size_image_Perspecvive = sizeof(uchar3) * image_Perspective.rows * image_Perspective.cols;
    cudaMalloc((void**)&d_image, size_image);
    cudaMalloc((void**)&d_image_Perspective, size_image_Perspecvive);
    cudaMalloc((void**)&d_tfMatrix, sizeof(double) * 9);
    cudaMalloc((void**)&d_srcCoordinate, sizeof(uint2) * image_Perspective.rows * image_Perspective.cols);
    cudaMemcpy(d_image, (uchar3*)image.data, size_image, cudaMemcpyHostToDevice);
    cudaMemcpy(d_tfMatrix, (double*)tfMatrix.data, sizeof(double) * 9, cudaMemcpyHostToDevice);

    dim3 gridDim(10, 10, 1);
    dim3 blockDim(32, 32, 1);
    gpu_do << <gridDim, blockDim>> >(d_image, d_image_Perspective, d_tfMatrix, d_srcCoordinate, image_row, image_col, dst_size.height, dst_size.width);

    cudaMemcpy((uchar3*)image_Perspective.data, d_image_Perspective, size_image_Perspecvive, cudaMemcpyDeviceToHost);

    cv::imshow("image_Perspective", image_Perspective);
    cv::imwrite("../../warpPerspectiveTransform3/front_Perspective.jpg", image_Perspective);
    cv::waitKey(0);

    cudaFree(d_image);
    cudaFree(d_image_Perspective);
    cudaFree(d_tfMatrix);
    cudaFree(d_srcCoordinate);

    return 0;
}

效果是一样的。

第二天的更新:可以将内核函数gpu_do用普通函数封装:

void gpu_code(const uchar3* src, uchar3* dst, const double* TransformMatrix, uint2* d_srcCoordinate, const int src_row, const int src_col, const int dst_row, const int dst_col) {
    dim3 gridDim(10, 10, 1);
    dim3 blockDim(32, 32, 1);
    gpu_do << <gridDim, blockDim>> >(src, dst, TransformMatrix, d_srcCoordinate, src_row, src_col, dst_row, dst_col);
}

然后正常调用gpu_code就行了,这样做是因为内核函数不能是类中的成员函数,但是成员函数可以调用内核函数。
PS:内核函数中使用float会比double快不少。

  • 2
    点赞
  • 11
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值