#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "opencv2/video/tracking.hpp"
#include "opencv2/highgui/highgui.hpp"
#include "opencv2/imgproc/imgproc.hpp"
#include <opencv2/core/core.hpp>
#include <opencv2/highgui/highgui.hpp>
#include <opencv2/imgproc/imgproc.hpp>
#include <opencv2/opencv.hpp>
#include <time.h>
#include <iostream>
using namespace std;
using namespace cv;
#include <stdio.h>
// Edge length, in threads, of the square thread block used for the kernel launch
#define N 32
// Median-filter kernel; main launches it with N x N threads per block
/*
 * 3x3 median filter for a single-channel 8-bit image.
 *
 * One thread produces one output pixel; the thread's global (x, y) index is
 * the pixel coordinate, so the launch grid must cover at least
 * Width x Height threads. Pixels are addressed as row * Width + column,
 * i.e. rows are expected to be tightly packed.
 *
 * In     - input image (device pointer), Width * Height bytes
 * Out    - output image (device pointer), Width * Height bytes
 * Width  - image width in pixels
 * Height - image height in pixels
 * time   - if non-null, receives the number of device clock() ticks that
 *          the thread handling pixel (0,0) spent inside the kernel
 *
 * Neighbours that fall outside the image are treated as 0.
 */
__global__ void MediaFilter(uchar* In,uchar* Out, int Width, int Height, clock_t *time)
{
    const int x = (int)(blockIdx.x * blockDim.x + threadIdx.x);
    const int y = (int)(blockIdx.y * blockDim.y + threadIdx.y);

    // Only the thread that owns pixel (0,0) measures elapsed ticks.
    const bool isTimingThread = (x == 0 && y == 0);
    clock_t start = 0;
    if (isTimingThread) start = clock();

    // The grid is rounded up to whole blocks, so a thread may lie past the
    // right edge, the bottom edge, or both; any of these must bail out.
    if (x >= Width || y >= Height) return;

    // Gather the 3x3 neighbourhood in row-major order, zero-padding at the
    // image border.
    uchar window[9];
    int count = 0;
    for (int dy = -1; dy <= 1; ++dy)
    {
        const int yy = y + dy;
        for (int dx = -1; dx <= 1; ++dx)
        {
            const int xx = x + dx;
            const bool inside = (xx >= 0 && xx < Width && yy >= 0 && yy < Height);
            // size_t arithmetic keeps the flat index from overflowing int.
            window[count++] = inside ? In[(size_t)yy * (size_t)Width + (size_t)xx] : (uchar)0;
        }
    }

    // Partial selection sort: after placing the five smallest values,
    // window[4] is the median of the nine samples.
    for (int j = 0; j < 5; ++j)
    {
        int minIdx = j;
        for (int l = j + 1; l < 9; ++l)
        {
            if (window[l] < window[minIdx])
            {
                minIdx = l;
            }
        }
        const uchar temp = window[j];
        window[j] = window[minIdx];
        window[minIdx] = temp;
    }

    Out[(size_t)y * (size_t)Width + (size_t)x] = window[4];

    if (isTimingThread && time != NULL) *time = clock() - start;
}
// Prints a diagnostic to stderr when a CUDA runtime call failed.
// Returns true when err is cudaSuccess, false otherwise.
static bool cudaOk(cudaError_t err, const char* what)
{
    if (err == cudaSuccess) return true;
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
    return false;
}

// Frees the three device buffers used by main (null pointers are accepted by
// cudaFree) and hands back exitCode so callers can write `return release...`.
static int releaseDeviceBuffers(uchar* devIn, uchar* devOut, clock_t* devTime, int exitCode)
{
    cudaFree(devIn);
    cudaFree(devOut);
    cudaFree(devTime);
    return exitCode;
}

/*
 * Loads a grayscale image, runs the MediaFilter kernel on it, runs OpenCV's
 * medianBlur on the same image, prints the host-side clock() duration of
 * each, and displays the original and both filtered images.
 *
 * Returns 0 on success, 1 if the image cannot be loaded or any CUDA call fails.
 */
int main()
{
    Mat Img = imread("C:\\Users\\scczyy\\Desktop\\study\\ImageMedia\\lena.png",IMREAD_GRAYSCALE);
    if (Img.empty())
    {
        // imshow and the device copy below would fail on an empty matrix.
        fprintf(stderr, "failed to load the input image\n");
        return 1;
    }
    // The kernel indexes rows as y * Width, so the host buffer must not
    // contain row padding.
    if (!Img.isContinuous()) Img = Img.clone();

    imshow("原图",Img);
    const int Height = Img.rows;
    const int Width = Img.cols;
    const size_t MemSize = (size_t)Height * (size_t)Width * sizeof(uchar);
    printf("data type is %i\n",Img.type());
    printf(" the image width is %i\n the image height is %i\n",Width,Height);

    uchar *dev_Img = NULL;
    uchar *dev_OutImg = NULL;
    clock_t *dev_Time = NULL;   // device-side tick count written by the kernel

    if (!cudaOk(cudaMalloc((void**)&dev_Img,MemSize), "cudaMalloc(dev_Img)") ||
        !cudaOk(cudaMalloc((void**)&dev_OutImg,MemSize), "cudaMalloc(dev_OutImg)") ||
        !cudaOk(cudaMalloc((void**)&dev_Time,sizeof(clock_t)), "cudaMalloc(dev_Time)") ||
        !cudaOk(cudaMemset(dev_Time,0,sizeof(clock_t)), "cudaMemset(dev_Time)") ||
        !cudaOk(cudaMemcpy(dev_Img,Img.data,MemSize,cudaMemcpyHostToDevice), "cudaMemcpy(host -> dev_Img)"))
    {
        return releaseDeviceBuffers(dev_Img, dev_OutImg, dev_Time, 1);
    }

    // Round the grid up so every pixel is covered; the kernel discards the
    // surplus threads.
    dim3 threadsPerBlock(N,N);
    dim3 blocks( (Width+threadsPerBlock.x-1)/threadsPerBlock.x, (Height+threadsPerBlock.y-1)/threadsPerBlock.y );

    const clock_t startTime = clock();
    MediaFilter<<<blocks,threadsPerBlock>>>(dev_Img,dev_OutImg,Width,Height,dev_Time);
    // A launch returns before the kernel has run; wait for completion so the
    // interval below covers execution and so execution errors surface here.
    const bool kernelOk = cudaOk(cudaGetLastError(), "MediaFilter launch") &&
                          cudaOk(cudaDeviceSynchronize(), "MediaFilter execution");
    const clock_t endTime = clock();
    if (!kernelOk)
    {
        return releaseDeviceBuffers(dev_Img, dev_OutImg, dev_Time, 1);
    }
    cout << "CUDA Time Is " << (double)(endTime - startTime)/CLOCKS_PER_SEC <<"s"<<endl;

    Mat OutImg = Mat::zeros(Img.rows,Img.cols, CV_8UC1);
    clock_t time_use = 0;
    if (!cudaOk(cudaMemcpy(OutImg.data,dev_OutImg,MemSize,cudaMemcpyDeviceToHost), "cudaMemcpy(dev_OutImg -> host)") ||
        !cudaOk(cudaMemcpy(&time_use,dev_Time,sizeof(clock_t),cudaMemcpyDeviceToHost), "cudaMemcpy(dev_Time -> host)"))
    {
        return releaseDeviceBuffers(dev_Img, dev_OutImg, dev_Time, 1);
    }
    // clock_t is not int; cast explicitly so the format specifier matches.
    printf("GPUtime: %ld\n",(long)time_use);

    // CPU reference: OpenCV's 3x3 median filter, applied in place.
    const clock_t startTime1 = clock();
    medianBlur(Img,Img,3);
    const clock_t endTime1 = clock();
    cout << "OPENCV Time Is " << (double)(endTime1 - startTime1)/CLOCKS_PER_SEC <<"s"<<endl;

    imshow("CUDA",OutImg);
    imshow("OPENCV",Img);
    waitKey(0);

    return releaseDeviceBuffers(dev_Img, dev_OutImg, dev_Time, 0);
}
// Author's note: compared with OpenCV, this runs more than 10x faster.