使用 CUDA 实现图像卷积操作,可用于图像平滑、边缘检测等处理。
实现代码
输入图像使用 OpenCV 加载。
// 包含头文件
#include <iostream>
#include<cuda.h>
#include <cuda_runtime_api.h>
#include <opencv2/opencv.hpp>
#include<device_launch_parameters.h>
using namespace std;
// CUDA error-checking macro: forwards the status of a CUDA runtime call to
// gpuAssert together with the file and line of the call site.
// The do { } while (0) wrapper makes `CUDA_CHECK(x);` a single statement, so it
// can be used safely in an unbraced if/else.
#define CUDA_CHECK(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Reports a failed CUDA runtime call.
//   code  - status returned by the CUDA call
//   file  - source file of the call site
//   line  - source line of the call site
//   abort - when true (the default), the process exits with `code` as its
//           exit status after the message has been printed
// Does nothing when `code` is cudaSuccess.
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
    if (code == cudaSuccess)
    {
        return;
    }
    std::cerr << "CUDA Error: " << cudaGetErrorString(code) << " " << file << " " << line << std::endl;
    if (abort)
    {
        exit(code);
    }
}
// 2D image convolution kernel: one thread computes one output pixel (all of its
// channels).
//
// Launch layout: 2D grid of 2D blocks, x covering columns and y covering rows.
// Threads that fall outside the image return immediately, so the grid only has
// to cover width x height. No shared memory is used.
//
// Parameters:
//   input   - device pointer to the source image, flattened row by row with
//             interleaved channels (OpenCV colour images are BGRBGR...);
//             width * height * channel bytes. `unsigned char` is the type
//             OpenCV names `uchar`.
//   output  - device pointer to the destination image, same layout and size.
//             Must be a different buffer from `input`: every thread reads
//             neighbouring input pixels that other threads write in `output`.
//   width   - image width in pixels
//   height  - image height in pixels
//   channel - number of interleaved channels per pixel (any positive count)
//   kernel  - device pointer to kernelW * kernelH filter weights, row by row
//   kernelW - filter width
//   kernelH - filter height
//
// The filter is anchored at (kernelW / 2, kernelH / 2). Pixels outside the
// image are taken from the nearest edge pixel (replicate border). Each result
// is saturated to [0, 255] and then truncated toward zero.
__global__ void convolutionGPU(const unsigned char* input, unsigned char* output, const int width, const int height, const int channel, const float* kernel, const int kernelW, const int kernelH)
{
    // Coordinates of the pixel handled by this thread.
    const int col = blockIdx.x * blockDim.x + threadIdx.x;
    const int row = blockIdx.y * blockDim.y + threadIdx.y;
    // The grid is rounded up to whole blocks, so some threads have no pixel.
    if (col >= width || row >= height)
    {
        return;
    }

    // Anchor offset of the filter.
    const int halfW = kernelW / 2;
    const int halfH = kernelH / 2;
    // size_t indexing avoids int overflow on very large images.
    const size_t outBase = (static_cast<size_t>(row) * width + col) * channel;

    // One scalar accumulator per channel pass: no fixed-size per-channel array,
    // hence no limit on the channel count.
    for (int c = 0; c < channel; c++)
    {
        float sum = 0.0f;
        // Iterate over the actual filter taps so that even-sized filters never
        // read past the end of the weight array.
        for (int ky = 0; ky < kernelH; ky++)
        {
            // Replicate border: clamp the source row into the image.
            const int curRow = min(max(row + ky - halfH, 0), height - 1);
            for (int kx = 0; kx < kernelW; kx++)
            {
                // Replicate border: clamp the source column into the image.
                const int curCol = min(max(col + kx - halfW, 0), width - 1);
                const size_t inIdx = (static_cast<size_t>(curRow) * width + curCol) * channel + c;
                sum += input[inIdx] * kernel[ky * kernelW + kx];
            }
        }
        // Converting an out-of-range float to unsigned char is undefined
        // behaviour, so saturate first (fmaxf also maps NaN to 0).
        sum = fminf(fmaxf(sum, 0.0f), 255.0f);
        output[outBase + c] = static_cast<unsigned char>(sum);
    }
}
// Demo driver: loads a colour image with OpenCV, convolves it on the GPU with a
// 3x3 filter via convolutionGPU, and shows the input and the result in two
// windows until a key is pressed.
// Returns early (after printing a message) when the image cannot be loaded.
// Any CUDA failure is reported through CUDA_CHECK.
void test1(void)
{
    // Load the input image (flag 1 requests a 3-channel colour image).
    cv::Mat inputImage = cv::imread("E:\\pic_data\\CBSD68\\3096.png", 1);
    if (inputImage.empty())
    {
        std::cerr << "Failed to load input image!" << std::endl;
        return;
    }
    // The pixel buffer is copied to the device as one flat array, so the rows
    // must be stored back to back without padding.
    if (!inputImage.isContinuous())
    {
        inputImage = inputImage.clone();
    }

    // Image geometry.
    const int width = inputImage.cols;
    const int height = inputImage.rows;
    const int channel = inputImage.channels();

    // Filter weights. The array is [rows][columns] because the device code
    // reads weight (r, c) at offset r * kernelWidth + c.
    const int kernelWidth = 3;
    const int kernelHeight = 3;
    // Alternative 3x3 box (smoothing) filter:
    //float kernel[kernelHeight][kernelWidth] = { {1.f / 9, 1.f / 9, 1.f / 9},
    //                                            {1.f / 9, 1.f / 9, 1.f / 9},
    //                                            {1.f / 9, 1.f / 9, 1.f / 9} };
    // Laplacian-style edge filter.
    const float kernel[kernelHeight][kernelWidth] = { {1.0f, 1.0f, 1.0f},
                                                      {1.0f, -8.0f, 1.0f},
                                                      {1.0f, 1.0f, 1.0f} };
    // Print the filter, one row per line.
    for (int r = 0; r < kernelHeight; r++)
    {
        for (int c = 0; c < kernelWidth; c++)
        {
            std::cout << kernel[r][c] << " ";
        }
        std::cout << std::endl;
    }

    // Buffer sizes in bytes; widen to size_t before multiplying so that large
    // images cannot overflow int.
    const size_t imageSize = static_cast<size_t>(width) * height * channel * sizeof(uchar);
    const size_t kernelSize = static_cast<size_t>(kernelWidth) * kernelHeight * sizeof(float);

    // Allocate device memory.
    uchar* d_inputImage = nullptr;
    uchar* d_outputImage = nullptr;
    float* d_kernel = nullptr;
    CUDA_CHECK(cudaMalloc((void**)&d_inputImage, imageSize));
    CUDA_CHECK(cudaMalloc((void**)&d_outputImage, imageSize));
    CUDA_CHECK(cudaMalloc((void**)&d_kernel, kernelSize));

    // Upload the image and the filter weights.
    CUDA_CHECK(cudaMemcpy(d_inputImage, inputImage.data, imageSize, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMemcpy(d_kernel, kernel, kernelSize, cudaMemcpyHostToDevice));

    // One thread per pixel; round the grid up so it covers the whole image.
    const int BLOCK_SIZE = 16;
    const dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE);
    const dim3 gridSize((width + BLOCK_SIZE - 1) / BLOCK_SIZE, (height + BLOCK_SIZE - 1) / BLOCK_SIZE);

    // Run the convolution.
    convolutionGPU<<<gridSize, blockSize>>>(d_inputImage, d_outputImage, width, height, channel, d_kernel, kernelWidth, kernelHeight);
    // A launch does not return a status itself: fetch launch-configuration
    // errors explicitly, then wait so execution errors surface here too.
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaDeviceSynchronize());

    // Let OpenCV own the host output buffer (no manual new[]/delete[]) and
    // download the result straight into it.
    cv::Mat outputImageMat(height, width, inputImage.type());
    CUDA_CHECK(cudaMemcpy(outputImageMat.data, d_outputImage, imageSize, cudaMemcpyDeviceToHost));

    // The device buffers are no longer needed once the result is on the host.
    CUDA_CHECK(cudaFree(d_kernel));
    CUDA_CHECK(cudaFree(d_inputImage));
    CUDA_CHECK(cudaFree(d_outputImage));

    // Show input and output until a key is pressed.
    cv::imshow("in", inputImage);
    cv::imshow("out", outputImageMat);
    cv::waitKey(0);
    cv::destroyAllWindows();
}
// Program entry point: runs the single GPU convolution demo and reports
// success to the operating system.
int main()
{
    test1();
    // Reaching the end of main is equivalent to returning 0.
}
运行结果
原图
卷积核为:
float kernel[kernelWidth][kernelHeight] = { {1., 1, 1. },
{1., -8., 1.},
{1. , 1., 1.} };
也是三通道图像
卷积核为:
float kernel[kernelWidth][kernelHeight] = { {1. / 9, 1. / 9, 1. / 9},
{1. / 9, 1. / 9, 1. / 9},
{1. / 9, 1. / 9, 1. / 9} };
滤波结果: