
42 篇文章 27 订阅


#include "MedianBlur.h"
#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <cstdio>

typedef unsigned char uchar;

const unsigned int BLOCK_W = 8;
const unsigned int BLOCK_H = 8;

/**
 * 3x3 median filter for an 8-bit single-channel image; one thread per pixel.
 *
 * Launch layout: 2D grid of 2D blocks covering at least DATA_W x DATA_H
 * threads. Each thread sorts its samples in its own row of a static
 * __shared__ scratch array sized for BLOCK_W * BLOCK_H threads, so
 * blockDim.x * blockDim.y must not exceed BLOCK_W * BLOCK_H.
 *
 * Neighbours that fall outside the image are treated as 0. The four corner
 * pixels are copied through unchanged: 5 of their 9 samples are padding, so
 * the median there would always be 0.
 *
 * @param input   device buffer of DATA_W * DATA_H bytes, row-major (read)
 * @param output  device buffer of DATA_W * DATA_H bytes, row-major (written)
 * @param DATA_W  image width in pixels
 * @param DATA_H  image height in pixels
 */
__global__ void CudaMedianFilter3(uchar * input, uchar * output, unsigned int DATA_W, unsigned int DATA_H)
{
	__shared__ float window[BLOCK_W*BLOCK_H][9];

	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

	// Threads in the grid tail have no pixel. Either coordinate being out of
	// range is enough to bail out. No barrier is used below, so returning
	// early is safe.
	if (x >= DATA_W || y >= DATA_H)
	{
		return;
	}

	// Row stride inside the block is blockDim.x.
	const unsigned int tid = threadIdx.y*blockDim.x + threadIdx.x;
	float * const w = window[tid];

	// 64-bit index so large images do not overflow 32-bit arithmetic.
	const size_t centre = static_cast<size_t>(y)*DATA_W + x;

	// Gather the 3x3 neighbourhood in row-major order, zero-padding outside.
	unsigned int k = 0;
	for (int dy = -1; dy <= 1; ++dy)
	{
		for (int dx = -1; dx <= 1; ++dx)
		{
			const long long nx = static_cast<long long>(x) + dx;
			const long long ny = static_cast<long long>(y) + dy;
			const bool inside = (nx >= 0) && (nx < DATA_W) && (ny >= 0) && (ny < DATA_H);
			w[k++] = inside
				? static_cast<float>(input[static_cast<size_t>(ny)*DATA_W + static_cast<size_t>(nx)])
				: 0.0f;
		}
	}

	// Partial selection sort: only the first 5 positions need to be in order
	// for w[4] to hold the median of 9.
	for (unsigned int j = 0; j < 5; ++j)
	{
		// Find position of minimum element in the unsorted tail
		unsigned int lowest = j;
		for (unsigned int l = j + 1; l < 9; ++l)
		{
			if (w[l] < w[lowest])
			{
				lowest = l;
			}
		}

		// Put found minimum element in its place
		const float temp = w[j];
		w[j] = w[lowest];
		w[lowest] = temp;
	}

	const bool onVerticalEdge = (x == 0) || (x + 1 == DATA_W);
	const bool onHorizontalEdge = (y == 0) || (y + 1 == DATA_H);

	if (onVerticalEdge && onHorizontalEdge)
	{
		// Corner pixel: keep the source value.
		output[centre] = input[centre];
	}
	else
	{
		output[centre] = static_cast<uchar>(w[4]);
	}
}

/**
 * 5x5 median filter for an 8-bit single-channel image; one thread per pixel.
 *
 * Launch layout: 2D grid of 2D blocks covering at least DATA_W x DATA_H
 * threads. The 25-sample window is a per-thread array, so there is no
 * constraint on the block shape.
 *
 * Neighbours that fall outside the image are treated as 0. Pixels inside the
 * 2x2 region at each of the four image corners are copied through unchanged,
 * since most of their window can be padding and the median would collapse
 * to 0.
 *
 * @param input   device buffer of DATA_W * DATA_H bytes, row-major (read)
 * @param output  device buffer of DATA_W * DATA_H bytes, row-major (written)
 * @param DATA_W  image width in pixels
 * @param DATA_H  image height in pixels
 */
__global__ void CudaMedianFilter5(uchar * input, uchar * output, unsigned int DATA_W, unsigned int DATA_H)
{
	float window[25];

	const unsigned int x = blockIdx.x*blockDim.x + threadIdx.x;
	const unsigned int y = blockIdx.y*blockDim.y + threadIdx.y;

	// Threads in the grid tail have no pixel. Either coordinate being out of
	// range is enough to bail out.
	if (x >= DATA_W || y >= DATA_H)
	{
		return;
	}

	// 64-bit index so large images do not overflow 32-bit arithmetic.
	const size_t centre = static_cast<size_t>(y)*DATA_W + x;

	// Gather the 5x5 neighbourhood in row-major order, zero-padding outside.
	unsigned int k = 0;
	for (int dy = -2; dy <= 2; ++dy)
	{
		for (int dx = -2; dx <= 2; ++dx)
		{
			const long long nx = static_cast<long long>(x) + dx;
			const long long ny = static_cast<long long>(y) + dy;
			const bool inside = (nx >= 0) && (nx < DATA_W) && (ny >= 0) && (ny < DATA_H);
			window[k++] = inside
				? static_cast<float>(input[static_cast<size_t>(ny)*DATA_W + static_cast<size_t>(nx)])
				: 0.0f;
		}
	}

	// Partial selection sort: only the first 13 positions need to be in order
	// for window[12] to hold the median of 25.
	for (unsigned int j = 0; j < 13; ++j)
	{
		// Find position of minimum element in the unsorted tail
		unsigned int lowest = j;
		for (unsigned int l = j + 1; l < 25; ++l)
		{
			if (window[l] < window[lowest])
			{
				lowest = l;
			}
		}

		// Put found minimum element in its place
		const float temp = window[j];
		window[j] = window[lowest];
		window[lowest] = temp;
	}

	// Written as x + 2 >= DATA_W rather than x >= DATA_W - 2 so that images
	// narrower than two pixels do not underflow the unsigned subtraction.
	const bool nearVerticalEdge = (x < 2) || (x + 2 >= DATA_W);
	const bool nearHorizontalEdge = (y < 2) || (y + 2 >= DATA_H);

	if (nearVerticalEdge && nearHorizontalEdge)
	{
		// Corner region: keep the source value.
		output[centre] = input[centre];
	}
	else
	{
		output[centre] = static_cast<uchar>(window[12]);
	}
}

/**
 * Median-filters an 8-bit single-channel host image on the GPU, in place.
 *
 * The image is uploaded, filtered by CudaMedianFilter3 or CudaMedianFilter5,
 * and the result is copied back over the caller's buffer. On any failure
 * (bad arguments, unsupported kernel size, CUDA error) a message is written
 * to stderr; for bad arguments or an unsupported kernel size the host image
 * is not touched.
 *
 * @param pImage       address of the host pointer to imageWidth * imageHeight
 *                     bytes, row-major; the pointer itself is not modified
 * @param imageWidth   image width in pixels, must be > 0
 * @param imageHeight  image height in pixels, must be > 0
 * @param kernelSize   filter window size: 3 or 5
 */
void CudaMedianFilter(uchar ** pImage, int imageWidth, int imageHeight, int kernelSize)
{
	if (pImage == NULL || *pImage == NULL || imageWidth <= 0 || imageHeight <= 0)
	{
		fprintf(stderr, "CudaMedianFilter: invalid image arguments\n");
		return;
	}
	if (kernelSize != 3 && kernelSize != 5)
	{
		// Nothing would run on the device, so do not overwrite the host
		// image with an uninitialised output buffer.
		fprintf(stderr, "CudaMedianFilter: unsupported kernel size %d\n", kernelSize);
		return;
	}

	// Computed in size_t so large images do not overflow int.
	const size_t imageBytes = static_cast<size_t>(imageWidth) * static_cast<size_t>(imageHeight) * sizeof(uchar);

	uchar * d_input = NULL;
	uchar * pTmpImage = NULL;

	// Each step runs only if everything before it succeeded; cleanup at the
	// bottom is unconditional.
	cudaError_t status = cudaMalloc((void**)&d_input, imageBytes);
	if (status == cudaSuccess)
	{
		status = cudaMalloc((void**)&pTmpImage, imageBytes);
	}
	if (status == cudaSuccess)
	{
		status = cudaMemcpy(d_input, *pImage, imageBytes, cudaMemcpyHostToDevice);
	}
	if (status == cudaSuccess)
	{
		const dim3 dimBlock(BLOCK_W, BLOCK_H);
		const dim3 dimGrid((imageWidth + dimBlock.x - 1) / dimBlock.x, (imageHeight + dimBlock.y - 1) / dimBlock.y);

		if (kernelSize == 3)
		{
			CudaMedianFilter3<<<dimGrid, dimBlock>>>(d_input, pTmpImage, imageWidth, imageHeight);
		}
		else
		{
			CudaMedianFilter5<<<dimGrid, dimBlock>>>(d_input, pTmpImage, imageWidth, imageHeight);
		}

		// Launch-configuration errors are only visible through this call.
		status = cudaGetLastError();
	}
	if (status == cudaSuccess)
	{
		// Blocking copy on the default stream: also waits for the kernel and
		// reports any fault raised while it ran.
		status = cudaMemcpy(*pImage, pTmpImage, imageBytes, cudaMemcpyDeviceToHost);
	}

	if (status != cudaSuccess)
	{
		fprintf(stderr, "CudaMedianFilter: CUDA error: %s\n", cudaGetErrorString(status));
	}

	// cudaFree accepts a null pointer, so this is safe on every path.
	cudaFree(pTmpImage);
	cudaFree(d_input);
}


// NOTE(review): presumably the contents of MedianBlur.h, which the CUDA
// source includes — confirm against the project layout.
#pragma once
// Byte type used for the image buffers passed to the filter.
typedef unsigned char uchar;
// Runs the 3x3 (kernelSize == 3) or 5x5 (kernelSize == 5) CUDA median filter
// over the imageWidth * imageHeight bytes at *pImage and copies the result
// back into that same host buffer.
void CudaMedianFilter(uchar** pImage, int imageWidth, int imageHeight, int kernelSize);


void test_median()
	Mat panel = imread("./data/bridge.bmp", CV_LOAD_IMAGE_GRAYSCALE);
	Mat result = Mat::zeros(panel.size(), CV_8U);

	int image_Height = panel.rows;
	int image_Width = panel.cols;

	uchar * input =;
	Mat Z = Mat::zeros(panel.rows, panel.cols, CV_8UC1);
	uchar * output =;

	CudaMedianFilter(&input, image_Width, image_Height, 3); = input;
	imwrite("./data/bridge_GPU.bmp", result);

void test_pragma()
	int n = 12;

#pragma omp parallel num_threads(4)//定义以下的代码块用4个线程同时处理
		int i = omp_get_thread_num();//获取每个线程的序号
		printf_s("Hello from thread %d\n", i);//结果打印四条序号不同的hello...

  • 2
  • 15
    觉得还不错? 一键收藏
  • 0


  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助




当前余额3.43前往充值 >
领取后你会自动成为博主和红包主的粉丝 规则
钱包余额 0


