// Using CUDA for acceleration requires weighing compute gains against data-movement cost.
// In general, complex and highly parallel image operations benefit clearly from CUDA, but the
// host<->device I/O overhead must also be taken into account; only a combined assessment of
// both yields a real efficiency gain.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <opencv2/opencv.hpp>
#include <iostream>
#include <omp.h>
#include <vector>
using namespace std;
using namespace cv;
// Verify that a CUDA runtime call succeeded; on failure, log the operation,
// call site, error name and message to stderr. Returns true iff `code` is
// cudaSuccess. Intended to wrap every cudaMalloc/cudaMemcpy/etc. call, with
// `op` being the call text and file/line coming from __FILE__/__LINE__.
bool checkRuntime(cudaError_t code, const char* op, const char* file, int line) {
    if (code == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA runtime error %s:%d %s failed. \n code = %s, message = %s\n",
            file, line, op, cudaGetErrorName(code), cudaGetErrorString(code));
    return false;
}
// Warps a single-channel 8-bit image with a 2x3 affine matrix on the GPU.
//
// src          : input image; must be CV_8UC1 and continuous (dense) in memory.
// affineMatrix : 2x3 transform; converted to CV_32F before upload, so both
//                CV_32F and CV_64F inputs (e.g. cv::getRotationMatrix2D) work.
// out          : receives the warped image, same size and type as src.
//
// Returns true on success, false if any CUDA runtime call fails; all device
// memory allocated so far is released before returning either way.
//
// NOTE(review): `width` is passed twice per image to the kernel — presumably
// (row pitch, width in pixels); confirm against the declaration of
// warp_affine_bilinear_kernel_8.
bool WarpAffine_cuda_8bits(Mat src, Mat affineMatrix, Mat & out)
{
    // The device copy below treats the image as one dense byte buffer.
    CV_Assert(src.type() == CV_8UC1 && src.isContinuous());
    CV_Assert(affineMatrix.rows == 2 && affineMatrix.cols == 3);

    const int width = src.cols;
    const int height = src.rows;
    const size_t imageBytes = (size_t)width * (size_t)height * sizeof(uint8_t);
    const uint8_t kBorderValue = 100;  // fill for pixels that map outside src

    uint8_t* d_input = nullptr;
    uint8_t* d_output = nullptr;
    float* d_affine_matrix = nullptr;

    // Release everything allocated so far and signal failure.
    // (cudaFree(nullptr) is a documented no-op, so this is safe at any stage.)
    auto fail = [&]() {
        cudaFree(d_affine_matrix);
        cudaFree(d_output);
        cudaFree(d_input);
        return false;
    };

    // Device buffers for the input and output images.
    if (!checkRuntime(cudaMalloc(&d_input, imageBytes),
                      "cudaMalloc d_input", __FILE__, __LINE__)) return fail();
    if (!checkRuntime(cudaMalloc(&d_output, imageBytes),
                      "cudaMalloc d_output", __FILE__, __LINE__)) return fail();
    if (!checkRuntime(cudaMemcpy(d_input, src.data, imageBytes, cudaMemcpyHostToDevice),
                      "cudaMemcpy input H2D", __FILE__, __LINE__)) return fail();

    // Flatten the 2x3 matrix row-major into 6 floats for the kernel.
    // convertTo handles CV_64F matrices (the old at<float> read doubles as
    // floats and produced garbage for those).
    Mat m32;
    affineMatrix.convertTo(m32, CV_32F);
    float h_affine_matrix[6] = {
        m32.at<float>(0, 0), m32.at<float>(0, 1), m32.at<float>(0, 2),
        m32.at<float>(1, 0), m32.at<float>(1, 1), m32.at<float>(1, 2),
    };
    if (!checkRuntime(cudaMalloc(&d_affine_matrix, sizeof(h_affine_matrix)),
                      "cudaMalloc d_affine_matrix", __FILE__, __LINE__)) return fail();
    if (!checkRuntime(cudaMemcpy(d_affine_matrix, h_affine_matrix, sizeof(h_affine_matrix),
                                 cudaMemcpyHostToDevice),
                      "cudaMemcpy affine matrix H2D", __FILE__, __LINE__)) return fail();

    // One thread per destination pixel: 16x16 blocks, grid rounded up so
    // partial tiles at the right/bottom edges are still covered.
    dim3 block_size(16, 16);
    dim3 grid_size((width + block_size.x - 1) / block_size.x,
                   (height + block_size.y - 1) / block_size.y);
    warp_affine_bilinear_kernel_8 << <grid_size, block_size >> > (
        d_input, width, width, height,
        d_output, width, width, height,
        kBorderValue, d_affine_matrix
    );

    // A launch reports configuration errors via cudaGetLastError and
    // in-kernel faults at the next synchronizing call — check both.
    if (!checkRuntime(cudaGetLastError(),
                      "warp_affine_bilinear_kernel_8 launch", __FILE__, __LINE__)) return fail();
    if (!checkRuntime(cudaDeviceSynchronize(),
                      "cudaDeviceSynchronize", __FILE__, __LINE__)) return fail();

    // Copy the result straight into the caller's Mat (no intermediate clone).
    out.create(src.size(), CV_8UC1);
    if (!checkRuntime(cudaMemcpy(out.data, d_output, imageBytes, cudaMemcpyDeviceToHost),
                      "cudaMemcpy output D2H", __FILE__, __LINE__)) return fail();

    cudaFree(d_affine_matrix);
    cudaFree(d_output);
    cudaFree(d_input);
    return true;  // was `return 0` (false), wrongly reporting failure on success
}