配置好CUDA环境之后,开始编写CUDA代码,在此记录一下。
查看显卡的详细信息:
cd C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v12.0\extras\demo_suite
deviceQuery
调用CUDA脚本
首先推荐阅读一篇博客:《CUDA编程:与OpenCV的结合》
.cu 文件(CUDA 核函数及其启动封装函数):
#include <iostream>
#include <opencv.hpp>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
/**
 * Range-threshold kernel for a single-channel 8-bit image.
 *
 * Each thread handles one pixel, located from its 2D block/thread
 * coordinates. A pixel whose value lies in [threadDown, threadUP]
 * (both ends inclusive) is written as 255; every other pixel is written as 0.
 *
 * Launch layout: a 2D grid of 2D blocks that covers at least
 * imageWidth x imageHeight threads. Threads that fall outside the image
 * return without touching memory, so the grid may overshoot.
 *
 * @param d_in        input pixels, dereferenced on the device; addressed as
 *                    row * imageWidth + column (rows tightly packed)
 * @param d_out       output pixels, same addressing as d_in
 * @param imageHeight number of rows
 * @param imageWidth  number of columns
 * @param threadDown  lower bound of the accepted range (inclusive)
 * @param threadUP    upper bound of the accepted range (inclusive)
 */
__global__ void THRESH_BINARY_CUDA(unsigned char* d_in, unsigned char* d_out, int imageHeight, int imageWidth, int threadDown,int threadUP)
{
    const int xIndex = threadIdx.x + blockIdx.x * blockDim.x;
    const int yIndex = threadIdx.y + blockIdx.y * blockDim.y;

    // The grid is rounded up to whole blocks, so edge threads may lie outside the image.
    if (xIndex >= imageWidth || yIndex >= imageHeight)
    {
        return;
    }

    // Widen before multiplying so that row * width cannot overflow a 32-bit int
    // on very large images.
    const size_t index = static_cast<size_t>(yIndex) * imageWidth + xIndex;

    // Read the pixel once and test it against both bounds.
    const unsigned char pixel = d_in[index];
    d_out[index] = (pixel >= threadDown && pixel <= threadUP) ? 255 : 0;
}
/**
 * Host-side launcher for THRESH_BINARY_CUDA.
 *
 * Covers the image with 16x16 thread blocks (grid size rounded up), launches
 * the kernel on the default stream and blocks until the device has finished.
 * Launch and execution errors are reported on std::cerr; the function itself
 * returns void, so callers are not notified through the return value.
 *
 * @param d_in        input pixels; passed straight to the kernel, so it must
 *                    be a pointer the device can dereference
 * @param d_out       output pixels; same requirement as d_in
 * @param imageHeight number of rows
 * @param imageWidth  number of columns
 * @param threadDown  lower bound of the accepted range (inclusive)
 * @param threadUP    upper bound of the accepted range (inclusive)
 */
extern "C" void Binary_Image_CUDA(unsigned char* d_in, unsigned char* d_out, int imageHeight, int imageWidth, int threadDown, int threadUP)
{
    // An empty image would produce a grid with a zero dimension, which is an
    // invalid launch configuration; there is nothing to process anyway.
    if (imageHeight <= 0 || imageWidth <= 0)
    {
        return;
    }
    if (d_in == nullptr || d_out == nullptr)
    {
        std::cerr << "Binary_Image_CUDA: null device pointer" << std::endl;
        return;
    }

    const dim3 block(16, 16);
    // Ceil-division so that partially filled blocks at the right/bottom edges are included.
    const dim3 grid((imageWidth + block.x - 1) / block.x, (imageHeight + block.y - 1) / block.y);

    THRESH_BINARY_CUDA<<<grid, block>>>(d_in, d_out, imageHeight, imageWidth, threadDown, threadUP);

    // A kernel launch returns nothing: configuration errors show up through
    // cudaGetLastError(), faults during execution through the synchronizing call.
    cudaError_t status = cudaGetLastError();
    if (status == cudaSuccess)
    {
        // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is its replacement.
        status = cudaDeviceSynchronize();
    }
    if (status != cudaSuccess)
    {
        std::cerr << "Binary_Image_CUDA failed: " << cudaGetErrorString(status) << std::endl;
    }
}
.cpp 文件(主程序,负责读取图像并调用上面的CUDA函数):
#include <iostream>
#include <chrono>
#include <opencv2/opencv.hpp>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
using namespace cv;
using namespace std;
// Forward declaration of the launcher defined in the .cu listing.
// It is declared extern "C" on both sides so the names match at link time.
// d_in and d_out are handed directly to the kernel launch, so they must be
// device pointers; pixels with a value in [threadDown, threadUP] become 255,
// all others become 0.
extern "C" void Binary_Image_CUDA(unsigned char* d_in, unsigned char* d_out, int imageHeight, int imageWidth, int threadDown, int threadUP);
int main()
{
cv::Mat img = cv::imread("05.png", 0);
if (img.empty())
{
return 0;
}
int imgHeight = img.rows;
int imgWidth = img.cols;
int length = imgHeight * imgWidth;
unsigned char* d_in;
unsigned char* d_out;
cv::Mat dstImg(imgHeight, imgWidth, CV_8UC1, cv::Scalar(60));
//======================创建gpu内存=================
cudaMalloc((void**)&d_in, imgHeight * imgWidth * sizeof(unsigned char));
cudaMalloc((void**)&d_out, imgHeight * imgWidth * sizeof(unsigned char));
//==================host内存到device设备内存进行拷贝===========
cudaMemcpy(d_in, img.data, imgHeight * imgWidth * sizeof(unsigned char), cudaMemcpyHostToDevice);
//=================创建线程格和线程块==============
auto starttime = std::chrono::system_clock::now();
Binary_Image_CUDA(d_in, d_out, imgHeight, imgWidth, 0,60);
std::chrono::duration<double> diff = std::chrono::system_clock::now() - starttime;
cout << "CUDA耗时:" << diff.count()*1000 << "ms" << endl;
cudaMemcpy(dstImg.data, d_out, imgHeight * imgWidth * sizeof(unsigned char), cudaMemcpyDeviceToHost);
//====================释放GPU内存================
cudaFree(d_in);
cudaFree(d_out);
cv::namedWindow("cuda", cv::WINDOW_NORMAL);
cv::imshow("cuda", dstImg);
cv::waitKey(0);
return 0;
}
运行结果: