cuda实现图像卷积操作，可用于图像平滑、边缘检测等操作

原创已于 2023-08-05 09:35:44 修改

· 547 阅读

2 ·

版权

文章标签：

#c++

于 2023-08-05 09:30:57 首次发布

cuda 专栏收录该内容

2 篇文章

订阅专栏

cuda实现图像卷积操作，可用于图像平滑、边缘检测等操作

实现代码

输入图像使用 OpenCV（cv::imread）加载，卷积在 GPU 上完成，结果再通过 OpenCV 显示。

// 包含头文件
#include <iostream>
#include<cuda.h>
#include <cuda_runtime_api.h>
#include <opencv2/opencv.hpp>
#include<device_launch_parameters.h>
using namespace std;

// CUDA error-checking macro.
// Wraps a CUDA runtime call and forwards its cudaError_t, together with the
// call site's file and line, to gpuAssert. The do { } while (0) wrapper makes
// the macro behave as a single statement, so it is safe inside un-braced
// if/else bodies (a bare { } block followed by ';' would break the else).
#define CUDA_CHECK(ans) do { gpuAssert((ans), __FILE__, __LINE__); } while (0)

// Reports a failed CUDA call on stderr and optionally terminates the process.
//   code  - status returned by the CUDA runtime call
//   file  - source file of the call site
//   line  - source line of the call site
//   abort - when true (default), the process exits with the error code as
//           its exit status; when false, the error is only logged
inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = true)
{
	if (code == cudaSuccess)
	{
		return;
	}

	std::cerr << "CUDA Error: " << cudaGetErrorString(code) << " " << file << " " << line << std::endl;
	if (abort)
	{
		exit(code);
	}
}

// Convolution kernel for an interleaved 8-bit image.
//
// Both the image and the filter coefficients are passed as row-major
// flattened 1-D arrays. Pixel (row, col), channel c lives at
// (row * width + col) * channel + c (e.g. BGRBGR... for an OpenCV colour image).
// The filter is applied without flipping: coefficient (ky, kx) multiplies the
// pixel at (row + ky - kernelH / 2, col + kx - kernelW / 2).
//
// Launch layout: 2-D grid of 2-D blocks, one thread per output pixel
// (x -> column, y -> row). Threads outside the image do nothing.
//
// Parameters:
//   input    - source image, width * height * channel bytes
//   output   - destination image, same size and layout as input
//   width    - image width in pixels
//   height   - image height in pixels
//   channel  - number of interleaved channels per pixel (any positive count)
//   kernel   - kernelW * kernelH filter coefficients, row-major
//   kernelW  - filter width
//   kernelH  - filter height
//
// Borders use replicate padding (coordinates are clamped to the image).
// Results are rounded to nearest and saturated to [0, 255] before being
// stored, because converting an out-of-range float to unsigned char is
// undefined behaviour (e.g. edge-detection filters produce negative sums).
__global__ void convolutionGPU(const uchar* input, uchar* output, const int width, const int height, const int channel, const float* kernel, const int kernelW, const int kernelH)
{
	// Pixel handled by this thread.
	const int col = blockIdx.x * blockDim.x + threadIdx.x;
	const int row = blockIdx.y * blockDim.y + threadIdx.y;

	// The grid is rounded up to whole blocks, so some threads fall outside the image.
	if (col >= width || row >= height)
	{
		return;
	}

	// Offset of the filter anchor from its top-left tap.
	const int halfW = kernelW / 2;
	const int halfH = kernelH / 2;

	// size_t indexing avoids int overflow on large images.
	const size_t outBase = (static_cast<size_t>(row) * width + col) * channel;

	// One scalar accumulator per channel: no fixed-size array, so any channel count is safe.
	for (int c = 0; c < channel; c++)
	{
		float acc = 0.0f;

		// Iterate over exactly kernelH x kernelW taps; this also stays inside the
		// coefficient array when a filter dimension is even.
		for (int ky = 0; ky < kernelH; ky++)
		{
			// Replicate padding: clamp the source row into the image.
			const int srcRow = min(max(row + ky - halfH, 0), height - 1);
			const size_t rowBase = static_cast<size_t>(srcRow) * width;

			for (int kx = 0; kx < kernelW; kx++)
			{
				// Replicate padding: clamp the source column into the image.
				const int srcCol = min(max(col + kx - halfW, 0), width - 1);
				acc += input[(rowBase + srcCol) * channel + c] * kernel[ky * kernelW + kx];
			}
		}

		// Saturate to the 8-bit range, then round to nearest (+0.5 and truncate).
		const float clamped = fminf(fmaxf(acc, 0.0f), 255.0f);
		output[outBase + c] = static_cast<unsigned char>(clamped + 0.5f);
	}
}

// Demo driver: loads a colour image with OpenCV, filters it on the GPU with
// convolutionGPU using a 3x3 kernel, and shows the input and the result in
// two windows until a key is pressed.
// Returns early (after logging to stderr) when the image cannot be loaded.
// Any CUDA failure is reported by CUDA_CHECK.
void test1(void)
{
	// Load the input image (flag 1 = colour).
	cv::Mat inputImage = cv::imread("E:\\pic_data\\CBSD68\\3096.png", 1);
	if (inputImage.empty())
	{
		std::cerr << "Failed to load input image!" << std::endl;
		return;
	}

	// The image is copied to the device as one flat array, so its rows must
	// be stored back-to-back without padding.
	if (!inputImage.isContinuous())
	{
		inputImage = inputImage.clone();
	}

	// Image geometry.
	const int width = inputImage.cols;
	const int height = inputImage.rows;
	const int channel = inputImage.channels();

	// Build the filter. Rows come first, so the array is [height][width].
	const int kernelWidth = 3;
	const int kernelHeight = 3;
	// Alternative: 3x3 box (mean) filter for smoothing.
	//const float kernel[kernelHeight][kernelWidth] = { {1.0f / 9, 1.0f / 9, 1.0f / 9},
	//													{1.0f / 9, 1.0f / 9, 1.0f / 9},
	//													{1.0f / 9, 1.0f / 9, 1.0f / 9} };
	// 3x3 Laplacian-style edge-detection filter.
	const float kernel[kernelHeight][kernelWidth] = { {1.0f, 1.0f, 1.0f},
													  {1.0f, -8.0f, 1.0f},
													  {1.0f, 1.0f, 1.0f} };
	for (int r = 0; r < kernelHeight; r++)
	{
		for (int c = 0; c < kernelWidth; c++)
		{
			std::cout << kernel[r][c] << " ";
		}
		std::cout << std::endl;
	}

	// Buffer sizes in bytes; widen to size_t before multiplying so large
	// images cannot overflow int arithmetic.
	const size_t imageSize = static_cast<size_t>(channel) * width * height * sizeof(uchar);
	const size_t kernelSize = static_cast<size_t>(kernelWidth) * kernelHeight * sizeof(float);

	// Allocate device memory.
	uchar* d_inputImage = nullptr;
	uchar* d_outputImage = nullptr;
	float* d_kernel = nullptr;
	CUDA_CHECK(cudaMalloc((void**)&d_inputImage, imageSize));
	CUDA_CHECK(cudaMalloc((void**)&d_outputImage, imageSize));
	CUDA_CHECK(cudaMalloc((void**)&d_kernel, kernelSize));

	// Upload the image and the filter coefficients.
	CUDA_CHECK(cudaMemcpy(d_inputImage, inputImage.data, imageSize, cudaMemcpyHostToDevice));
	CUDA_CHECK(cudaMemcpy(d_kernel, kernel, kernelSize, cudaMemcpyHostToDevice));

	// One thread per pixel; the grid is rounded up to cover the whole image.
	const int BLOCK_SIZE = 16;
	dim3 blockSize(BLOCK_SIZE, BLOCK_SIZE);
	dim3 gridSize((width + BLOCK_SIZE - 1) / BLOCK_SIZE, (height + BLOCK_SIZE - 1) / BLOCK_SIZE);

	// Run the convolution.
	convolutionGPU<<<gridSize, blockSize>>>(d_inputImage, d_outputImage, width, height, channel, d_kernel, kernelWidth, kernelHeight);
	// A kernel launch returns no status: launch-configuration errors show up
	// in cudaGetLastError, execution errors at the following synchronisation.
	CUDA_CHECK(cudaGetLastError());
	CUDA_CHECK(cudaDeviceSynchronize());

	// Let cv::Mat own the host output buffer (a freshly created Mat is
	// continuous), so no manual new[]/delete[] is needed.
	cv::Mat outputImageMat(height, width, inputImage.type());
	CUDA_CHECK(cudaMemcpy(outputImageMat.data, d_outputImage, imageSize, cudaMemcpyDeviceToHost));

	// Release device memory; it is no longer needed once the result is on the host.
	CUDA_CHECK(cudaFree(d_kernel));
	CUDA_CHECK(cudaFree(d_inputImage));
	CUDA_CHECK(cudaFree(d_outputImage));

	// Show input and output until a key is pressed.
	cv::imshow("in", inputImage);
	cv::imshow("out", outputImageMat);
	cv::waitKey(0);
	cv::destroyAllWindows();
}

// Program entry point: runs the GPU convolution demo once.
// Always returns 0; failures inside the demo are reported by the demo itself.
int main()
{
	test1();

	return 0;
}

运行结果

原图

三通道图像

卷积核为：

float kernel[kernelWidth][kernelHeight] = { {1., 1, 1. },
												{1., -8., 1.},
												{1. , 1., 1.} };

滤波结果（输出同样是三通道图像）：
在这里插入图片描述

卷积核为：

float kernel[kernelWidth][kernelHeight] = { {1. / 9, 1. / 9, 1. / 9},
											{1. / 9, 1. / 9, 1. / 9},
											{1. / 9, 1. / 9, 1. / 9} };

滤波结果：
在这里插入图片描述