CUDA-基于共享内存的中值滤波算法

翟天保Steven

已于 2024-09-21 16:18:22 修改

阅读量582

点赞数 9

分类专栏： CUDA 文章标签：算法 计算机视觉 图像处理

于 2024-09-12 14:58:00 首次发布

本文链接：https://blog.csdn.net/zhaitianbao/article/details/142170444

版权

CUDA 专栏收录该内容

9 篇文章 0 订阅

订阅专栏

作者：翟天保Steven
版权声明：著作权归作者所有，商业转载请联系作者获得授权，非商业转载请注明出处

实现原理

中值滤波的实现原理参见文章：

CUDA-中值滤波算法-CSDN博客

本文主要目的在于讲解如何通过共享内存实现进一步的CUDA提速。

功能函数代码

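假设每个线程块同时执行 16*16 个线程，则申请 16*16*100 字节的共享内存（每个线程分得 100 个 uchar 元素；也可以改成别的数值，只要不超出共享内存上限即可）。并行期间，每个线程按自己在线程块内的序号访问各自对应的那一段空间，彼此互不影响。共享内存的读写速度比全局内存快得多，因此算法运行得更快。但该方法也有一个弊端：共享内存的申请量不能太大。每个线程 100 个元素最多只能容纳 9*9（81 个元素）的窗口；窗口半径超过 4（即 windowSize≥10，此时窗口为 11*11=121 个元素）就会越界，因此该方法不适用于大窗口尺寸的滤波场景。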

// Median-filter kernel that uses shared memory as per-thread scratch space.
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel. A block may
// hold at most TILE_WIDTH * TILE_WIDTH threads.
// Shared memory: TILE_WIDTH * TILE_WIDTH * 100 bytes, statically allocated;
// each thread owns a private 100-sample slot, so no block barrier is needed.
//
// inputImage  - source pixels, row-major, width * height elements
// outputImage - destination pixels, same layout as inputImage
// width       - image width in pixels
// height      - image height in pixels
// windowSize  - filter window edge; the radius is windowSize / 2 and the
//               window is clipped at the image borders
//
// If the full (2r+1) x (2r+1) window does not fit into a thread's slot, the
// radius is negative, or the block is larger than the shared buffer allows,
// the pixel is copied through unfiltered instead of writing out of bounds.
// Callers should reject such configurations before launching.
__global__ void medianFilter_SharedMemory_CUDA(uchar* inputImage, uchar* outputImage, int width, int height, int windowSize)
{
	// Number of samples reserved in shared memory for every thread.
	const int SLOT_CAPACITY = 100;
	__shared__ uchar sharedWindow[TILE_WIDTH * TILE_WIDTH * SLOT_CAPACITY];

	int row = blockIdx.y * blockDim.y + threadIdx.y;
	int col = blockIdx.x * blockDim.x + threadIdx.x;

	// Threads that fall outside the image have nothing to do.
	if (row >= height || col >= width)
	{
		return;
	}

	// 64-bit flat index so very large images do not overflow int arithmetic.
	const size_t index = (size_t)row * width + col;
	const int r = windowSize / 2;
	const int slot = threadIdx.y * blockDim.x + threadIdx.x;

	// Refuse configurations that would overrun this thread's slot or the
	// shared array itself; the decision is uniform for the whole image.
	const long long side = 2LL * r + 1;
	if (r < 0 || side * side > SLOT_CAPACITY || slot >= TILE_WIDTH * TILE_WIDTH)
	{
		outputImage[index] = inputImage[index];
		return;
	}

	uchar* window = &sharedWindow[slot * SLOT_CAPACITY];

	// Window bounds, clipped to the image.
	int ms = max(row - r, 0);
	int me = min(row + r, height - 1);
	int ns = max(col - r, 0);
	int ne = min(col + r, width - 1);

	// Gather the neighbourhood into this thread's slot.
	int count = 0;
	for (int m = ms; m <= me; ++m)
	{
		for (int n = ns; n <= ne; ++n)
		{
			window[count] = inputImage[(size_t)m * width + n];
			count++;
		}
	}

	// Partial selection sort: once positions 0..count/2 hold the smallest
	// values in order, the median is known and the rest need not be sorted.
	const int mid = count / 2;
	for (int p = 0; p <= mid; p++)
	{
		int minIndex = p;
		for (int q = p + 1; q < count; q++)
		{
			if (window[q] < window[minIndex])
			{
				minIndex = q;
			}
		}
		uchar temp = window[p];
		window[p] = window[minIndex];
		window[minIndex] = temp;
	}
	outputImage[index] = window[mid];
}

C++测试代码

Filter.h

#pragma once
#include <cuda_runtime.h>
#include <iostream>
#include <vector>
#include <opencv2/opencv.hpp>
#include <device_launch_parameters.h>

using namespace cv;
using namespace std;

// Edge length, in threads, of the square CUDA thread block used for the launch;
// the kernel also uses it to size its shared-memory buffer.
#define TILE_WIDTH 16

// Warm-up step: performs a small device allocation and frees it again.
void warmupCUDA();

// Median filter computed on the CPU.
// input: source image, read as single-channel uchar pixels.
// FilterWindowSize: window edge length; the radius used is FilterWindowSize / 2.
// Returns the filtered image.
cv::Mat filterMedian_CPU(cv::Mat input, int FilterWindowSize);

// Median filter computed on the GPU with the shared-memory kernel.
// Takes the same arguments as filterMedian_CPU and returns the filtered image.
cv::Mat filterMedian_SharedMemory_GPU(cv::Mat input, int FilterWindowSize);

Filter.cu

#include "Filter.h"

// Warm-up step: performs one tiny device allocation and releases it again, so
// that this first CUDA call happens before any timed section.
// Failure is reported on stderr but is not fatal; the function stays best-effort.
void warmupCUDA()
{
    // Start from nullptr so a failed allocation never hands garbage to cudaFree.
    float* dummy_data = nullptr;
    const cudaError_t status = cudaMalloc((void**)&dummy_data, sizeof(float));
    if (status != cudaSuccess)
    {
        std::cerr << "warmupCUDA failed: " << cudaGetErrorString(status) << std::endl;
        return;
    }
    cudaFree(dummy_data);
}

// Median filter on the CPU (reference implementation).
//
// input            - single-channel 8-bit image (CV_8UC1)
// FilterWindowSize - window edge length; the radius is FilterWindowSize / 2
//                    and must not be negative
// Returns a new image of the same size in which every pixel is the median of
// its window. Windows are clipped at the image borders, so border pixels use
// fewer samples. The median is the element at index size / 2 of the sorted
// samples.
// Throws cv::Exception (via CV_Assert) on a wrong element type or a negative radius.
cv::Mat filterMedian_CPU(cv::Mat input, int FilterWindowSize)
{
	// at<uchar>() below is only meaningful for single-channel 8-bit data.
	CV_Assert(input.type() == CV_8UC1);

	// A negative radius would produce an empty window and an out-of-range read.
	const int r = FilterWindowSize / 2;
	CV_Assert(r >= 0);

	int row = input.rows;
	int col = input.cols;

	// Output starts as a copy of the input; every pixel is overwritten below.
	cv::Mat output = input.clone();

	// Rows are independent, so they can be processed in parallel.
#pragma omp parallel for
	for (int i = 0; i < row; ++i)
	{
		// Per-row scratch buffer, reused for every pixel of the row.
		vector<uchar> datas;
		for (int j = 0; j < col; ++j)
		{
			// Clip the window to the image to avoid out-of-bounds access.
			int ms = ((i - r) > 0) ? (i - r) : 0;
			int me = ((i + r) < (row - 1)) ? (i + r) : (row - 1);
			int ns = ((j - r) > 0) ? (j - r) : 0;
			int ne = ((j + r) < (col - 1)) ? (j + r) : (col - 1);

			// Collect the valid samples inside the window.
			datas.clear();
			for (int m = ms; m <= me; ++m)
			{
				for (int n = ns; n <= ne; ++n)
				{
					datas.push_back(input.at<uchar>(m, n));
				}
			}

			// Only the middle element is needed, so a full sort is unnecessary;
			// nth_element places the same value at index size / 2.
			const size_t mid = datas.size() / 2;
			std::nth_element(datas.begin(), datas.begin() + mid, datas.end());
			output.at<uchar>(i, j) = datas[mid];
		}
	}

	return output;
}
// Median-filter kernel that uses shared memory as per-thread scratch space.
//
// Launch layout: 2D grid of 2D blocks, one thread per output pixel. A block may
// hold at most TILE_WIDTH * TILE_WIDTH threads.
// Shared memory: TILE_WIDTH * TILE_WIDTH * 100 bytes, statically allocated;
// each thread owns a private 100-sample slot, so no block barrier is needed.
//
// inputImage  - source pixels, row-major, width * height elements
// outputImage - destination pixels, same layout as inputImage
// width       - image width in pixels
// height      - image height in pixels
// windowSize  - filter window edge; the radius is windowSize / 2 and the
//               window is clipped at the image borders
//
// If the full (2r+1) x (2r+1) window does not fit into a thread's slot, the
// radius is negative, or the block is larger than the shared buffer allows,
// the pixel is copied through unfiltered instead of writing out of bounds.
// Callers should reject such configurations before launching.
__global__ void medianFilter_SharedMemory_CUDA(uchar* inputImage, uchar* outputImage, int width, int height, int windowSize)
{
	// Number of samples reserved in shared memory for every thread.
	const int SLOT_CAPACITY = 100;
	__shared__ uchar sharedWindow[TILE_WIDTH * TILE_WIDTH * SLOT_CAPACITY];

	int row = blockIdx.y * blockDim.y + threadIdx.y;
	int col = blockIdx.x * blockDim.x + threadIdx.x;

	// Threads that fall outside the image have nothing to do.
	if (row >= height || col >= width)
	{
		return;
	}

	// 64-bit flat index so very large images do not overflow int arithmetic.
	const size_t index = (size_t)row * width + col;
	const int r = windowSize / 2;
	const int slot = threadIdx.y * blockDim.x + threadIdx.x;

	// Refuse configurations that would overrun this thread's slot or the
	// shared array itself; the decision is uniform for the whole image.
	const long long side = 2LL * r + 1;
	if (r < 0 || side * side > SLOT_CAPACITY || slot >= TILE_WIDTH * TILE_WIDTH)
	{
		outputImage[index] = inputImage[index];
		return;
	}

	uchar* window = &sharedWindow[slot * SLOT_CAPACITY];

	// Window bounds, clipped to the image.
	int ms = max(row - r, 0);
	int me = min(row + r, height - 1);
	int ns = max(col - r, 0);
	int ne = min(col + r, width - 1);

	// Gather the neighbourhood into this thread's slot.
	int count = 0;
	for (int m = ms; m <= me; ++m)
	{
		for (int n = ns; n <= ne; ++n)
		{
			window[count] = inputImage[(size_t)m * width + n];
			count++;
		}
	}

	// Partial selection sort: once positions 0..count/2 hold the smallest
	// values in order, the median is known and the rest need not be sorted.
	const int mid = count / 2;
	for (int p = 0; p <= mid; p++)
	{
		int minIndex = p;
		for (int q = p + 1; q < count; q++)
		{
			if (window[q] < window[minIndex])
			{
				minIndex = q;
			}
		}
		uchar temp = window[p];
		window[p] = window[minIndex];
		window[minIndex] = temp;
	}
	outputImage[index] = window[mid];
}
// Median filter on the GPU using the shared-memory kernel.
//
// input            - single-channel 8-bit image (CV_8UC1)
// FilterWindowSize - window edge length; the radius is FilterWindowSize / 2.
//                    The full (2r+1) x (2r+1) window must fit into the 100
//                    samples the kernel reserves per thread, i.e. r <= 4.
// Returns the filtered image, or an empty Mat when the input is empty.
// Throws cv::Exception on a wrong element type, an unsupported window size,
// or any CUDA failure (allocation, copy, launch or execution).
cv::Mat filterMedian_SharedMemory_GPU(cv::Mat input, int FilterWindowSize)
{
	// Per-thread sample capacity of the kernel's shared buffer; must match
	// the slot size used inside medianFilter_SharedMemory_CUDA.
	const int slotCapacity = 100;

	// Nothing to filter; also avoids launching a zero-sized grid.
	if (input.empty())
	{
		return cv::Mat();
	}

	// The kernel reads raw bytes, one per pixel.
	CV_Assert(input.type() == CV_8UC1);

	// Reject windows that would overflow a thread's shared-memory slot.
	const int radius = FilterWindowSize / 2;
	const long long side = 2LL * radius + 1;
	CV_Assert(radius >= 0 && side * side <= slotCapacity);

	// The flat copy below needs tightly packed rows (an ROI view is not).
	const cv::Mat src = input.isContinuous() ? input : input.clone();

	int row = src.rows;
	int col = src.cols;
	const size_t bytes = static_cast<size_t>(row) * col * sizeof(uchar);

	// Allocate device buffers and upload the input image.
	uchar* d_inputImage = nullptr;
	uchar* d_outputImage = nullptr;
	cudaError_t status = cudaMalloc(&d_inputImage, bytes);
	if (status == cudaSuccess)
	{
		status = cudaMalloc(&d_outputImage, bytes);
	}
	if (status == cudaSuccess)
	{
		status = cudaMemcpy(d_inputImage, src.data, bytes, cudaMemcpyHostToDevice);
	}

	if (status == cudaSuccess)
	{
		// One thread per pixel; the grid is rounded up to cover the image.
		dim3 blockSize(TILE_WIDTH, TILE_WIDTH);
		dim3 gridSize((col + blockSize.x - 1) / blockSize.x, (row + blockSize.y - 1) / blockSize.y);

		medianFilter_SharedMemory_CUDA<<<gridSize, blockSize>>>(d_inputImage, d_outputImage, col, row, FilterWindowSize);

		// Launch-configuration errors are only visible through this call.
		status = cudaGetLastError();
	}

	// Download the result; this blocking copy also surfaces errors raised
	// while the kernel was running.
	cv::Mat output;
	if (status == cudaSuccess)
	{
		output.create(row, col, CV_8UC1);
		status = cudaMemcpy(output.data, d_outputImage, bytes, cudaMemcpyDeviceToHost);
	}

	// Release device memory on every path; cudaFree(nullptr) is a no-op.
	cudaFree(d_inputImage);
	cudaFree(d_outputImage);

	if (status != cudaSuccess)
	{
		CV_Error(cv::Error::StsError, cudaGetErrorString(status));
	}

	return output;
}

main.cpp

#include "Filter.h"

void main()
{
    // 预准备
	warmupCUDA();

	cout << "medianFilter_SharedMemory test begin." << endl;
	// 加载
	cv::Mat src = imread("test pic/test1.jpg", 0);
	int winSize = 9;
	cout << "filterWindowSize:" << winSize << endl;
	cout << "size: " << src.cols << " * " << src.rows << endl;

	// CPU版本
	clock_t s1, e1;
	s1 = clock();
	cv::Mat output1 = filterMedian_CPU(src, winSize);
	e1 = clock();
	cout << "CPU time:" << double(e1 - s1) / 1000 << endl;

	// GPU版本
	clock_t s2, e2;
	s2 = clock();
	cv::Mat output2 = filterMedian_SharedMemory_GPU(src, winSize);
	e2 = clock();
	cout << "GPU time:" << double(e2 - s2) / 1000 << endl;

	// 检查
	int row = src.rows;
	int col = src.cols;
	bool flag = true;
	for (int i = 0; i < row; ++i)
	{
		for (int j = 0; j < col; ++j)
		{
			if (output1.at<uchar>(i, j) != output2.at<uchar>(i, j))
			{
				cout << "i:" << i << " j:" << j << endl;
				flag = false;
				break;
			}
		}
		if (!flag)
		{
			break;
		}
	}
	if (flag)
	{
		cout << "ok!" << endl;
	}
	else
	{
		cout << "error!" << endl;
	}

	// 查看输出
	cv::Mat test1 = output1.clone();
	cv::Mat test2 = output2.clone();

	cout << "medianFilter_SharedMemory test end." << endl;
	
}