// Using CUDA for acceleration requires weighing compute gains against data-movement cost.
// In general, complex and highly parallel image operations benefit clearly from CUDA, but the
// host<->device I/O overhead must also be taken into account; only a combined assessment of
// both yields a real efficiency gain.
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include <opencv2/opencv.hpp>
#include <iostream>
#include <omp.h>
#include <vector>
using namespace std;
using namespace cv;
// Verify that a CUDA runtime call succeeded; on failure, log the operation,
// call site, error name and message to stderr. Returns true iff `code` is
// cudaSuccess. Intended to wrap every cudaMalloc/cudaMemcpy/etc. call, with
// `op` being the call text and file/line coming from __FILE__/__LINE__.
bool checkRuntime(cudaError_t code, const char* op, const char* file, int line) {
    if (code == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA runtime error %s:%d %s failed. \n code = %s, message = %s\n",
            file, line, op, cudaGetErrorName(code), cudaGetErrorString(code));
    return false;
}
// Warps a single-channel 8-bit image with a 2x3 affine matrix on the GPU.
//
// src          : input image; must be CV_8UC1 and continuous (dense) in memory.
// affineMatrix : 2x3 transform; converted to CV_32F before upload, so both
//                CV_32F and CV_64F inputs (e.g. cv::getRotationMatrix2D) work.
// out          : receives the warped image, same size and type as src.
//
// Returns true on success, false if any CUDA runtime call fails; all device
// memory allocated so far is released before returning either way.
//
// NOTE(review): `width` is passed twice per image to the kernel — presumably
// (row pitch, width in pixels); confirm against the declaration of
// warp_affine_bilinear_kernel_8.
bool WarpAffine_cuda_8bits(Mat src, Mat affineMatrix, Mat & out)
{
    // The device copy below treats the image as one dense byte buffer.
    CV_Assert(src.type() == CV_8UC1 && src.isContinuous());
    CV_Assert(affineMatrix.rows == 2 && affineMatrix.cols == 3);

    const int width = src.cols;
    const int height = src.rows;
    const size_t imageBytes = (size_t)width * (size_t)height * sizeof(uint8_t);
    const uint8_t kBorderValue = 100;  // fill for pixels that map outside src

    uint8_t* d_input = nullptr;
    uint8_t* d_output = nullptr;
    float* d_affine_matrix = nullptr;

    // Release everything allocated so far and signal failure.
    // (cudaFree(nullptr) is a documented no-op, so this is safe at any stage.)
    auto fail = [&]() {
        cudaFree(d_affine_matrix);
        cudaFree(d_output);
        cudaFree(d_input);
        return false;
    };

    // Device buffers for the input and output images.
    if (!checkRuntime(cudaMalloc(&d_input, imageBytes),
                      "cudaMalloc d_input", __FILE__, __LINE__)) return fail();
    if (!checkRuntime(cudaMalloc(&d_output, imageBytes),
                      "cudaMalloc d_output", __FILE__, __LINE__)) return fail();
    if (!checkRuntime(cudaMemcpy(d_input, src.data, imageBytes, cudaMemcpyHostToDevice),
                      "cudaMemcpy input H2D", __FILE__, __LINE__)) return fail();

    // Flatten the 2x3 matrix row-major into 6 floats for the kernel.
    // convertTo handles CV_64F matrices (the old at<float> read doubles as
    // floats and produced garbage for those).
    Mat m32;
    affineMatrix.convertTo(m32, CV_32F);
    float h_affine_matrix[6] = {
        m32.at<float>(0, 0), m32.at<float>(0, 1), m32.at<float>(0, 2),
        m32.at<float>(1, 0), m32.at<float>(1, 1), m32.at<float>(1, 2),
    };
    if (!checkRuntime(cudaMalloc(&d_affine_matrix, sizeof(h_affine_matrix)),
                      "cudaMalloc d_affine_matrix", __FILE__, __LINE__)) return fail();
    if (!checkRuntime(cudaMemcpy(d_affine_matrix, h_affine_matrix, sizeof(h_affine_matrix),
                                 cudaMemcpyHostToDevice),
                      "cudaMemcpy affine matrix H2D", __FILE__, __LINE__)) return fail();

    // One thread per destination pixel: 16x16 blocks, grid rounded up so
    // partial tiles at the right/bottom edges are still covered.
    dim3 block_size(16, 16);
    dim3 grid_size((width + block_size.x - 1) / block_size.x,
                   (height + block_size.y - 1) / block_size.y);
    warp_affine_bilinear_kernel_8 << <grid_size, block_size >> > (
        d_input, width, width, height,
        d_output, width, width, height,
        kBorderValue, d_affine_matrix
    );

    // A launch reports configuration errors via cudaGetLastError and
    // in-kernel faults at the next synchronizing call — check both.
    if (!checkRuntime(cudaGetLastError(),
                      "warp_affine_bilinear_kernel_8 launch", __FILE__, __LINE__)) return fail();
    if (!checkRuntime(cudaDeviceSynchronize(),
                      "cudaDeviceSynchronize", __FILE__, __LINE__)) return fail();

    // Copy the result straight into the caller's Mat (no intermediate clone).
    out.create(src.size(), CV_8UC1);
    if (!checkRuntime(cudaMemcpy(out.data, d_output, imageBytes, cudaMemcpyDeviceToHost),
                      "cudaMemcpy output D2H", __FILE__, __LINE__)) return fail();

    cudaFree(d_affine_matrix);
    cudaFree(d_output);
    cudaFree(d_input);
    return true;  // was `return 0` (false), wrongly reporting failure on success
}