CUDA学习--并行化实现图像的高斯模糊处理

#include <iostream>
#include <string>
#include <cassert>

#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/opencv.hpp>

#include <cuda.h>
#include <cuda_runtime.h>
#include <cuda_runtime_api.h>

// Wraps a CUDA runtime call: passes its result to check() together with the
// stringified expression and the file/line of the call site.
#define checkCudaErrors(val) check((val), #val,__FILE__,__LINE__)

// Host-side input and output images (set up by preProcess).
cv::Mat imageInputRGBA;
cv::Mat imageOutputRGBA;

// Device (GPU) RGBA buffers; assigned in preProcess and released in cleanup().
uchar4 *d_inputImageRGBA__;
uchar4 *d_outputImageRGBA__;

// Host-side filter weights; assigned in preProcess and deleted in cleanup().
float *h_filter__;

// Row count (height) of the global input image.
size_t numRows() {
	const int rowCount = imageInputRGBA.rows;
	return rowCount;
}

// Column count (width) of the global input image.
size_t numCols() {
	const int colCount = imageInputRGBA.cols;
	return colCount;
}

// Terminates the program with a diagnostic when a CUDA call did not succeed.
//   err  - status returned by the call
//   func - source text of the call (supplied by checkCudaErrors)
//   file - source file of the call site
//   line - source line of the call site
// Returns normally only when err equals cudaSuccess.
template<typename T>
void check(T err, const char* const func, const char* const file, const int line) {
	// Success is the common case: nothing to report.
	if (err == cudaSuccess) {
		return;
	}
	std::cerr << "CUDA error at:" << file << ":" << line << std::endl;
	std::cerr << cudaGetErrorString(err) << " " << func << std::endl;
	exit(1);
}

// Image pre-processing: loads the image, converts it to RGBA, allocates and
// fills the device buffers, and builds the normalized blur filter on the host.
//
// Everything is returned through the pointer arguments:
//   h_inputImageRGBA / h_outputImageRGBA - host pointers to the pixel data of the
//                                          global imageInputRGBA / imageOutputRGBA
//   d_inputImageRGBA / d_outputImageRGBA - device buffers, one uchar4 per pixel;
//                                          input is uploaded, output is zeroed
//   d_redBlurred / d_greenBlurred / d_blueBlurred
//                                        - zeroed device buffers, one byte per pixel
//   h_filter / filterWidth               - new[]-allocated filterWidth*filterWidth weights
//   filename                             - path of the image to read
// Exits the process if the image cannot be read or the images are not continuous.
void preProcess(uchar4 **h_inputImageRGBA, uchar4 **h_outputImageRGBA,
				uchar4 **d_inputImageRGBA, uchar4 **d_outputImageRGBA,
				unsigned char **d_redBlurred,
				unsigned char **d_greenBlurred,
				unsigned char **d_blueBlurred,
				float **h_filter, int *filterWidth,
				const std::string &filename) {

	// Make sure the CUDA context initializes correctly before doing any work.
	checkCudaErrors(cudaFree(0));

	// Read the image from disk.
	cv::Mat image = cv::imread(filename.c_str(), CV_LOAD_IMAGE_COLOR);
	if (image.empty()) {
		std::cerr << "Couldn't open file:" << filename << std::endl;
		exit(1);
	}

	cv::cvtColor(image, imageInputRGBA, CV_BGR2RGBA);

	// Create an output image with the same dimensions as the input.
	imageOutputRGBA.create(image.rows, image.cols, CV_8UC4);

	// Both images must be stored contiguously so they can be copied as flat arrays.
	const bool bothContinuous = imageInputRGBA.isContinuous() && imageOutputRGBA.isContinuous();
	if (!bothContinuous) {
		std::cerr << "Images aren't continuous!!Exiting." << std::endl;
		exit(1);
	}

	// Hand out host pointers to the first pixel of each image.
	*h_inputImageRGBA = reinterpret_cast<uchar4 *>(imageInputRGBA.ptr<unsigned char>(0));
	*h_outputImageRGBA = reinterpret_cast<uchar4 *>(imageOutputRGBA.ptr<unsigned char>(0));

	// Allocate device memory for the RGBA input and output, and zero the output.
	const size_t numPixels = numRows()*numCols();
	checkCudaErrors(cudaMalloc(d_inputImageRGBA, sizeof(uchar4)*numPixels));
	checkCudaErrors(cudaMalloc(d_outputImageRGBA, sizeof(uchar4)*numPixels));
	checkCudaErrors(cudaMemset(*d_outputImageRGBA, 0, numPixels * sizeof(uchar4)));

	// Upload the input image from host to device.
	checkCudaErrors(cudaMemcpy(*d_inputImageRGBA, *h_inputImageRGBA, sizeof(uchar4)*numPixels, cudaMemcpyHostToDevice));

	// Remember the device buffers at file scope.
	d_inputImageRGBA__ = *d_inputImageRGBA;
	d_outputImageRGBA__ = *d_outputImageRGBA;

	// Neighbourhood size and sigma of the blur filter.
	const int blurKernelWidth = 9;
	const int blurKernelSigma = 2;
	const int halfWidth = blurKernelWidth / 2;
	const int tapCount = blurKernelWidth * blurKernelWidth;

	*filterWidth = blurKernelWidth;

	float *weights = new float[tapCount];
	*h_filter = weights;
	h_filter__ = weights;

	// Fill the table row by row with un-normalized weights and accumulate their sum.
	float filterSum = 0.f;
	for (int row = 0; row < blurKernelWidth; ++row) {
		const int dr = row - halfWidth;	// vertical offset from the filter centre
		for (int col = 0; col < blurKernelWidth; ++col) {
			const int dc = col - halfWidth;	// horizontal offset from the filter centre
			const float weight = expf(-(float)(dc*dc + dr * dr) / (2.f*blurKernelSigma*blurKernelSigma));
			weights[row*blurKernelWidth + col] = weight;
			filterSum += weight;
		}
	}

	// Normalize: scale every weight by the reciprocal of the accumulated sum.
	const float normalizationFactor = 1.f / filterSum;
	for (int tap = 0; tap < tapCount; ++tap) {
		weights[tap] *= normalizationFactor;
	}

	// Allocate device memory for the three blurred channels.
	checkCudaErrors(cudaMalloc(d_redBlurred, sizeof(unsigned char)*numPixels));
	checkCudaErrors(cudaMalloc(d_greenBlurred, sizeof(unsigned char)*numPixels));
	checkCudaErrors(cudaMalloc(d_blueBlurred, sizeof(unsigned char)*numPixels));
	// Clear them.
	checkCudaErrors(cudaMemset(*d_redBlurred,0, sizeof(unsigned char)*numPixels));
	checkCudaErrors(cudaMemset(*d_greenBlurred, 0, sizeof(unsigned char)*numPixels));
	checkCudaErrors(cudaMemset(*d_blueBlurred, 0, sizeof(unsigned char)*numPixels));
	checkCudaErrors(cudaFree(0));
}

// Blur kernel: filters one 8-bit channel with a filterWidth x filterWidth weight table.
//   inputChannel  - source channel, numRows x numCols bytes, row-major
//   outputChannel - destination channel, same layout
//   filter        - filterWidth*filterWidth weights, row-major
// Each thread produces one output pixel, located from the x/y block and thread
// indices; threads that fall outside the image return without writing.
// Neighbours outside the image are clamped to the nearest edge pixel.
__global__
void gaussian_blur(const unsigned char* const inputChannel,
				   unsigned char* const outputChannel,
				   int numRows, int numCols,
				   const float* const filter, const int filterWidth) {

	const int col = blockIdx.x*blockDim.x + threadIdx.x;
	const int row = blockIdx.y*blockDim.y + threadIdx.y;

	// The grid may overhang the image; only in-range threads do work.
	const bool insideImage = (col < numCols) && (row < numRows);
	if (!insideImage) {
		return;
	}

	const int halfWidth = filterWidth / 2;
	float acc = 0.0f;
	for (int fy = 0; fy < filterWidth; fy++) {
		// Clamp the source row to [0, numRows - 1].
		const int srcRow = min(max(row + fy - halfWidth, 0), numRows - 1);
		for (int fx = 0; fx < filterWidth; fx++) {
			// Clamp the source column to [0, numCols - 1].
			const int srcCol = min(max(col + fx - halfWidth, 0), numCols - 1);
			const float weight = filter[fy*filterWidth + fx];
			acc += weight * static_cast<float>(inputChannel[srcRow*numCols + srcCol]);
		}
	}

	// The float sum is implicitly converted to unsigned char on store.
	outputChannel[row*numCols + col] = acc;
}

// Splits an RGBA image into three separate single-byte channels.
//   inputImageRGBA - numRows x numCols uchar4 pixels, row-major
//   redChannel / greenChannel / blueChannel - outputs receiving .x / .y / .z
// Each thread handles one pixel located from the x/y block and thread indices;
// threads outside the image return without writing. The .w component is not copied.
__global__
void separateChannels(const uchar4* const inputImageRGBA,
					  int numRows, int numCols,
					  unsigned char* const redChannel,
					  unsigned char* const greenChannel,
					  unsigned char* const blueChannel) {
	const int col = blockIdx.x*blockDim.x + threadIdx.x;
	const int row = blockIdx.y*blockDim.y + threadIdx.y;

	const bool insideImage = (col < numCols) && (row < numRows);
	if (!insideImage) {
		return;
	}

	const int pixelIndex = row*numCols + col;
	const uchar4 pixel = inputImageRGBA[pixelIndex];
	redChannel[pixelIndex] = pixel.x;
	greenChannel[pixelIndex] = pixel.y;
	blueChannel[pixelIndex] = pixel.z;
}

// Merges three single-byte channels back into an RGBA image.
//   redChannel / greenChannel / blueChannel - numRows x numCols bytes each, row-major
//   outputImageRGBA - destination pixels; the fourth component is always set to 255
// Each thread handles one pixel located from the x/y block and thread indices;
// threads outside the image return without writing.
__global__
void recombinChannels(const unsigned char* const redChannel,
					  const unsigned char* const greenChannel,
					  const unsigned char* const blueChannel,
					  uchar4* const outputImageRGBA,
					  int numRows,
					  int numCols) {
	const int col = blockIdx.x*blockDim.x + threadIdx.x;
	const int row = blockIdx.y*blockDim.y + threadIdx.y;

	const bool insideImage = (col < numCols) && (row < numRows);
	if (!insideImage) {
		return;
	}

	const int pixelIndex = row*numCols + col;
	outputImageRGBA[pixelIndex] = make_uchar4(redChannel[pixelIndex],
											  greenChannel[pixelIndex],
											  blueChannel[pixelIndex],
											  255);
}

// Device buffers for the split red, green and blue channels
// (allocated in allocateMemoryAndCopyToGPU).
unsigned char *d_red, *d_green, *d_blue;
// Device copy of the filter weights (allocated and filled in allocateMemoryAndCopyToGPU).
float *d_filter;

// Allocates the device buffers for the three split channels and for the filter,
// then uploads the filter weights.
//   numRowsImage, numColsImage - image dimensions; each channel buffer holds
//                                numRowsImage*numColsImage bytes
//   h_filter                   - host array of filterWidth*filterWidth floats
//   filterWidth                - side length of the square filter
// Results are stored in the file-scope pointers d_red, d_green, d_blue and d_filter.
// Any failing CUDA call terminates the program through checkCudaErrors.
void allocateMemoryAndCopyToGPU(const size_t numRowsImage, const size_t numColsImage,
								const float* const h_filter, const size_t filterWidth) {
	// Allocate device memory for the three split channels.
	checkCudaErrors(cudaMalloc(&d_red, sizeof(unsigned char)*numRowsImage*numColsImage));
	checkCudaErrors(cudaMalloc(&d_green, sizeof(unsigned char)*numRowsImage*numColsImage));
	checkCudaErrors(cudaMalloc(&d_blue, sizeof(unsigned char)*numRowsImage*numColsImage));

	// Allocate device memory for the filter weights.
	checkCudaErrors(cudaMalloc(&d_filter, sizeof(float)*filterWidth*filterWidth));

	// Copy the filter weights from host to device.
	checkCudaErrors(cudaMemcpy(d_filter, h_filter, sizeof(float)*filterWidth*filterWidth, cudaMemcpyHostToDevice));
}

// Converts the RGBA result to BGR and writes it to disk.
//   output_file - destination path; the extension selects the image format
//   data_ptr    - host pointer to numRows() x numCols() uchar4 pixels; the data
//                 is wrapped in a cv::Mat header, not copied
// Exits the process with status 1 if the image cannot be written.
void postProcess(const std::string&output_file, uchar4* data_ptr) {
	cv::Mat output(static_cast<int>(numRows()), static_cast<int>(numCols()), CV_8UC4, (void*)data_ptr);
	cv::Mat imageOutputBGR;
	cv::cvtColor(output, imageOutputBGR, CV_RGBA2BGR);

	// cv::imwrite reports failure through its return value; ignoring it would
	// let the program finish "successfully" without producing any output file.
	if (!cv::imwrite(output_file.c_str(), imageOutputBGR)) {
		std::cerr << "Couldn't write file:" << output_file << std::endl;
		exit(1);
	}
}

// Releases every buffer tracked through a file-scope pointer:
// the device RGBA input/output images, the device split-channel buffers and
// filter, and the host filter array.
// Pointers that were never allocated are still null (static storage), and both
// cudaFree and delete[] treat a null pointer as a no-op, so this is safe to
// call at any point. Each pointer is reset afterwards so a repeated call
// cannot free the same memory twice.
// A failing cudaFree terminates the program through checkCudaErrors.
void cleanup() {
	checkCudaErrors(cudaFree(d_inputImageRGBA__));
	checkCudaErrors(cudaFree(d_outputImageRGBA__));
	d_inputImageRGBA__ = NULL;
	d_outputImageRGBA__ = NULL;

	// Buffers created by allocateMemoryAndCopyToGPU; nothing else in the file
	// frees them.
	checkCudaErrors(cudaFree(d_red));
	checkCudaErrors(cudaFree(d_green));
	checkCudaErrors(cudaFree(d_blue));
	checkCudaErrors(cudaFree(d_filter));
	d_red = NULL;
	d_green = NULL;
	d_blue = NULL;
	d_filter = NULL;

	delete[] h_filter__;
	h_filter__ = NULL;
}

// Entry point: loads an image, blurs each colour channel on the GPU and writes
// the result back to disk.
//
// Usage: <program> [input_image [output_image]]
// When an argument is omitted the built-in default path below is used.
// Returns 0 on success; any CUDA failure terminates the program through
// checkCudaErrors.
int main(int argc, char* argv[]) {
	std::string input_file= "E:/code/study_cuda/study_reduce/study_reduce/cinque_terre_small.jpg";
	std::string output_file= "E:/code/study_cuda/study_reduce/study_reduce/cinque_terre_small_togray.jpg";
	// Optional command-line overrides for the default paths.
	if (argc > 1) {
		input_file = argv[1];
	}
	if (argc > 2) {
		output_file = argv[2];
	}

	uchar4 *h_inputImageRGBA, *d_inputImageRGBA;
	uchar4 *h_outputImageRGBA, *d_outputImageRGBA;
	unsigned char *d_redBlurred, *d_greenBlurred, *d_blueBlurred;

	float *h_filter;
	int filterWidth;

	preProcess(&h_inputImageRGBA, &h_outputImageRGBA, &d_inputImageRGBA, &d_outputImageRGBA,
		&d_redBlurred, &d_greenBlurred, &d_blueBlurred,
		&h_filter, &filterWidth, input_file);

	allocateMemoryAndCopyToGPU(numRows(), numCols(), h_filter, filterWidth);

	// The kernels take the image dimensions as int.
	const int rows = static_cast<int>(numRows());
	const int cols = static_cast<int>(numCols());

	// One thread per pixel; ceil-division so the grid covers the whole image
	// without launching an extra row/column of blocks when the size divides evenly.
	const dim3 blockSize(16, 16);
	const dim3 gridSize((cols + blockSize.x - 1) / blockSize.x,
						(rows + blockSize.y - 1) / blockSize.y);

	// A kernel launch returns no status itself: cudaGetLastError reports launch
	// failures, and the checked synchronize reports faults raised while the
	// kernel was running.

	// Split the image into three channels.
	separateChannels<<<gridSize, blockSize>>>(d_inputImageRGBA, rows, cols, d_red, d_green, d_blue);
	checkCudaErrors(cudaGetLastError());
	checkCudaErrors(cudaDeviceSynchronize());

	// Blur each channel separately.
	gaussian_blur<<<gridSize, blockSize>>>(d_red, d_redBlurred, rows, cols, d_filter, filterWidth);
	checkCudaErrors(cudaGetLastError());
	checkCudaErrors(cudaDeviceSynchronize());

	gaussian_blur<<<gridSize, blockSize>>>(d_green, d_greenBlurred, rows, cols, d_filter, filterWidth);
	checkCudaErrors(cudaGetLastError());
	checkCudaErrors(cudaDeviceSynchronize());

	gaussian_blur<<<gridSize, blockSize>>>(d_blue, d_blueBlurred, rows, cols, d_filter, filterWidth);
	checkCudaErrors(cudaGetLastError());
	checkCudaErrors(cudaDeviceSynchronize());

	// Merge the blurred channels back into one RGBA image.
	recombinChannels<<<gridSize, blockSize>>>(d_redBlurred, d_greenBlurred, d_blueBlurred, d_outputImageRGBA, rows, cols);
	checkCudaErrors(cudaGetLastError());
	checkCudaErrors(cudaDeviceSynchronize());

	// Copy the result back to the host image buffer.
	size_t numPixels = numRows()*numCols();
	checkCudaErrors(cudaMemcpy(h_outputImageRGBA, d_outputImageRGBA, sizeof(uchar4)*numPixels, cudaMemcpyDeviceToHost));

	// Write the image to disk.
	postProcess(output_file, h_outputImageRGBA);

	// Release the blurred-channel buffers owned by this function.
	checkCudaErrors(cudaFree(d_redBlurred));
	checkCudaErrors(cudaFree(d_greenBlurred));
	checkCudaErrors(cudaFree(d_blueBlurred));

	cleanup();
	return 0;
}

仅用于记录学习过程

  • 1
    点赞
  • 13
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值