CUDA学习笔记一：CUDA+OpenCV的图像转置，采用Shared Memory进行CUDA程序优化

最新推荐文章于 2024-08-08 10:25:00 发布

啊塔

最新推荐文章于 2024-08-08 10:25:00 发布

阅读量9.1k

点赞数 2

本文链接：https://blog.csdn.net/u014206910/article/details/52064992

版权

原创文章，转载请注明出处......

一、问题背景

最近要做一个关于CUDA的学习分享报告，想在报告中举一个利用CUDA进行图像处理的例子，并使用Shared Memory避免Global Memory不合并访存情况，提高图像处理性能。但是对于CUDA程序如何读取图像有点困惑，网上找到了一篇“第二个cuda程序——图像拉伸”的博文点击打开链接，所示代码涉及了图像交互部分，但是需要包含“cutil_inline.h”头文件（据说是开发人员编写例程时用的头文件），悲催的是自从CUDA5.0之后“cutil.h”跟“cutil_inline.h”等头文件就被移除了，而我安装的是CUDA6.5，所以用不了博文中读取图像的方法。

后来在CUDA的Sample中看到了图像处理的示例程序，但是说实话，对我这种刚入门的人来说，程序有点复杂，所以放弃了研究它的念头。另外，有人跟我说CUDA有一个NPP库可以支持图像的交互，但是不知道这个库该怎么调用。

于是，我想了个方法，用OpenCV函数来读取、显示图像，图像的处理则交由CUDA核函数完成，因为Windows平台上OpenCV与CUDA编程都是在Visual Studio上完成的，因此，此法是可行的。

二、实验过程

1. 实验平台：Visual Studio 2010，CUDA 6.5，OpenCV 2.4.9

2. OpenCV开发环境配置

要在VC上调用OpenCV函数库函数，需要先进行OpenCV开发环境的配置，OpenCV的安装与环境配置参考博文“【OpenCV入门教程之一】安装OpenCV：OpenCV 3.0、OpenCV 2.4.8、OpenCV 2.4.9 ”点击打开链接

3. 代码

本文采用CUDA+OpenCV的环境进行图像转置处理，分别采用CPU与GPU对读入的图像进行转置，其中GPU的实现分为Global Memory与Shared Memory两个版本，实验结果表明采用Shared Memory进行图像转置，可以避免不合并访存的情况，从而提高程序运行速度。

（1）CPU、GPU Global Memory与GPU Shared Memory进行图像转置的函数定义均在头文件“imageTranspose_cu.h”中（与主文件中的 #include "imageTranspose_cu.h" 保持一致）：

#ifndef _IMAGETRANSPOSE_CU
#define _IMAGETRANSPOSE_CU


#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <Windows.h>			//用于计时
#include <time.h>


#define W 16  //Tile edge length: side of the shared-memory tile and of the square W x W thread block
#define N 1024  //Image side length, in pixels, covered by an (N/W) x (N/W) grid of W x W blocks


//Transpose a packed 8-bit single-channel image using global memory only.
//
//  imageDataSrc  device pointer to the Width x Height source, rows stored back to back
//  imageDataDst  device pointer to the Height x Width result (Width * Height bytes);
//                must not overlap the source
//  Width, Height source dimensions in pixels
//
//Launch layout: 1D grid of 1D blocks of any size. Every thread starts at its flat
//global index and then advances by the total number of launched threads, so a launch
//with fewer threads than pixels still writes the whole image, and surplus threads
//simply do nothing.
//Neighbouring threads read neighbouring source bytes, but their writes land Height
//bytes apart in the destination.
__global__ static void GPUImageTranspose_Global(unsigned char *imageDataSrc, unsigned char *imageDataDst, int Width, int Height)
{
	//Nothing to do for an empty image; also keeps the size_t conversions below well defined
	if (Width <= 0 || Height <= 0)
		return;

	//64-bit arithmetic so Width * Height cannot overflow an int
	const size_t pixelCount = (size_t)Width * (size_t)Height;
	const size_t threadCount = (size_t)gridDim.x * blockDim.x;

	for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < pixelCount; idx += threadCount) {
		const size_t row = idx / Width;  //source row
		const size_t col = idx % Width;  //source column

		//Source (row, col) becomes destination (col, row); destination rows are Height long
		imageDataDst[col * Height + row] = imageDataSrc[idx];
	}
}


//Transpose a packed 8-bit single-channel image, staging each W x W tile in shared memory.
//
//  imageDataSrc  device pointer to the Width x Height source, rows stored back to back
//  imageDataDst  device pointer to the Height x Width result (Width * Height bytes);
//                must not overlap the source
//  Width, Height source dimensions in pixels
//
//Launch layout: 2D grid of W x W thread blocks (blockDim.x == blockDim.y == W).
//Block (bx, by) handles the source tile whose top-left pixel is (bx * W, by * W), so the
//grid needs at least ceil(Width / W) x ceil(Height / W) blocks to cover the image.
//Blocks or threads that fall outside the image are harmless: both the load and the
//store are bounds-checked per coordinate.
//Static shared memory: W * W bytes per block.
__global__ static void GPUImageTranspose_Shared(unsigned char *imageDataSrc, unsigned char *imageDataDst, int Width, int Height)
{
	__shared__ unsigned char tile[W][W];  //one source tile, indexed [row in tile][column in tile]

	//Source pixel owned by this thread: column x, row y
	int x = threadIdx.x + blockIdx.x * W;
	int y = threadIdx.y + blockIdx.y * W;

	//Check each coordinate separately: a flat-index test would let x >= Width wrap into the next row
	if (x < Width && y < Height)
		tile[threadIdx.y][threadIdx.x] = imageDataSrc[(size_t)y * Width + x];

	//Every thread of the block must reach the barrier, so out-of-range threads must not return before it
	__syncthreads();

	//Destination pixel written by this thread. The block indices swap roles so that
	//threadIdx.x still walks along a destination row.
	x = threadIdx.x + blockIdx.y * W;  //destination column == source row
	y = threadIdx.y + blockIdx.x * W;  //destination row == source column

	//The destination is Height wide and Width tall. Inside these bounds the tile entry read
	//below was filled by the thread with swapped threadIdx, whose load passed its own check.
	if (x < Height && y < Width)
		imageDataDst[(size_t)y * Height + x] = tile[threadIdx.x][threadIdx.y];
}


//CPU reference: transpose a packed 8-bit single-channel image.
//
//  imageDataSrc  Width x Height source, rows stored back to back
//  imageDataDst  receives the Height x Width result (Width * Height bytes)
//  Width, Height source dimensions in pixels
//
//Returns without touching the output when either pointer is NULL or either
//dimension is not positive.
void CPUImageTranspose(unsigned char *imageDataSrc, unsigned char *imageDataDst, int Width, int Height){

	const bool buffersPresent = (imageDataSrc != NULL) && (imageDataDst != NULL);
	const bool sizePositive = (Width > 0) && (Height > 0);
	if(!(buffersPresent && sizePositive))
		return;

	//Walk the source row by row; source (row, col) lands at destination (col, row)
	int row = 0;
	while(row < Height) {
		const unsigned char *srcRow = imageDataSrc + row * Width;
		int col = 0;
		while(col < Width) {
			imageDataDst[col * Height + row] = srcRow[col];
			++col;
		}
		++row;
	}
}


#endif

（2）主函数在“imageTranspose.cu”文件中定义，主函数调用图像转置函数进行图像处理，以下展示为使用CPU及GPU Global Memory进行图像转置：

#include <stdio.h>
#include <stdlib.h>  //malloc, free, exit
#include <time.h>  //clock_t clock()

#include <cv.h>  //use OpenCV
#include <highgui.h>

#include "imageTranspose_cu.h"

//Stop the program with a readable message when a CUDA runtime call fails
static void checkCuda(cudaError_t status, const char *what)
{
	if (status != cudaSuccess) {
		fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

//Copy `rows` rows of `rowBytes` bytes between two host buffers whose consecutive rows
//start dstStep / srcStep bytes apart
static void copyRows(unsigned char *dst, size_t dstStep, const unsigned char *src, size_t srcStep, size_t rowBytes, size_t rows)
{
	for (size_t r = 0; r < rows; ++r)
		for (size_t c = 0; c < rowBytes; ++c)
			dst[r * dstStep + c] = src[r * srcStep + c];
}

//Demo driver: transpose Lena.jpg on the GPU (global-memory kernel) and on the CPU,
//print both timings and show the source and the two results.
//Returns 0 on success, 1 when the image cannot be loaded or host memory runs out;
//a failing CUDA call terminates the process through checkCuda.
int main()
{
	//Load the input as a single-channel 8-bit image through OpenCV
	IplImage *ImgSrc = cvLoadImage("Lena.jpg", CV_LOAD_IMAGE_GRAYSCALE);
	if (ImgSrc == NULL) {
		fprintf(stderr, "Failed to load Lena.jpg\n");
		return 1;
	}
	int Width = ImgSrc->width;
	int Height = ImgSrc->height;
	size_t numPixels = (size_t)Width * (size_t)Height;

	//The output images have width and height swapped
	IplImage *ImgDst_GPU_Global = cvCreateImage(cvSize(Height, Width), IPL_DEPTH_8U, 1);
	IplImage *ImgDst_CPU = cvCreateImage(cvSize(Height, Width), IPL_DEPTH_8U, 1);

	//Raw pixel pointers. Consecutive IplImage rows start widthStep bytes apart, which can
	//exceed the row width because of alignment padding.
	unsigned char *pSrcData = (unsigned char*)(ImgSrc->imageData);
	unsigned char *pDstData_Global = (unsigned char*)(ImgDst_GPU_Global->imageData);
	unsigned char *cDstData = (unsigned char*)(ImgDst_CPU->imageData);

	//Device buffers hold the images packed, without row padding
	unsigned char *device_ImgDataSrc = NULL;
	unsigned char *device_ImgDataDst_Global = NULL;
	checkCuda(cudaMalloc((void**)&device_ImgDataSrc, numPixels), "cudaMalloc(source)");
	checkCuda(cudaMalloc((void**)&device_ImgDataDst_Global, numPixels), "cudaMalloc(destination)");

	//Upload the source; the 2D copy drops the host-side row padding
	checkCuda(cudaMemcpy2D(device_ImgDataSrc, Width, pSrcData, ImgSrc->widthStep,
	                       Width, Height, cudaMemcpyHostToDevice), "cudaMemcpy2D(host to device)");

	//Launch configuration: enough 512-thread blocks to give every pixel a thread
	int dimBlock_Global = 512;
	int dimGrid_Global = (int)((numPixels + dimBlock_Global - 1) / dimBlock_Global);

	//Time the kernel with CUDA events
	cudaEvent_t start, stop;
	checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
	checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
	checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
	GPUImageTranspose_Global<<<dimGrid_Global, dimBlock_Global>>>(device_ImgDataSrc, device_ImgDataDst_Global, Width, Height);
	checkCuda(cudaGetLastError(), "GPUImageTranspose_Global launch");  //a launch reports configuration errors only here
	checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
	checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");  //waiting for stop implies start has completed
	float GPUTime_Global = 0;
	checkCuda(cudaEventElapsedTime(&GPUTime_Global, start, stop), "cudaEventElapsedTime");
	printf("GPU_Time_Global = %f\n", GPUTime_Global);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);

	//Download the result; the destination image is Height bytes wide and Width rows tall
	checkCuda(cudaMemcpy2D(pDstData_Global, ImgDst_GPU_Global->widthStep, device_ImgDataDst_Global, Height,
	                       Height, Width, cudaMemcpyDeviceToHost), "cudaMemcpy2D(device to host)");

	//CPUImageTranspose works on packed buffers, so strip the row padding first
	unsigned char *packedSrc = (unsigned char*)malloc(numPixels);
	unsigned char *packedDst = (unsigned char*)malloc(numPixels);
	if (packedSrc == NULL || packedDst == NULL) {
		fprintf(stderr, "Out of host memory\n");
		free(packedSrc);
		free(packedDst);
		return 1;
	}
	copyRows(packedSrc, Width, pSrcData, ImgSrc->widthStep, Width, Height);

	//Time the CPU transpose alone
	clock_t t1 = clock();
	CPUImageTranspose(packedSrc, packedDst, Width, Height);
	clock_t t2 = clock();
	float time_cpu = 0;
	time_cpu = t2 - t1;
	printf("CPU_Time = %f\n", time_cpu*1000/CLOCKS_PER_SEC);  //milliseconds

	copyRows(cDstData, ImgDst_CPU->widthStep, packedDst, Height, Height, Width);
	free(packedSrc);
	free(packedDst);

	//Show the source and both results
	cvNamedWindow("Src");
	cvShowImage("Src", ImgSrc);

	cvNamedWindow("Dst_CPU");
	cvShowImage("Dst_CPU", ImgDst_CPU);

	cvNamedWindow("Dst_GPU_Global");
	cvShowImage("Dst_GPU_Global", ImgDst_GPU_Global);

	cvWaitKey();

	//Release resources
	cudaFree(device_ImgDataSrc);
	cudaFree(device_ImgDataDst_Global);

	cvDestroyAllWindows();
	cvReleaseImage(&ImgSrc);
	cvReleaseImage(&ImgDst_CPU);
	cvReleaseImage(&ImgDst_GPU_Global);

	return 0;
}

（3）当对CUDA程序进行优化，使用GPU Shared Memory进行图像转置时，“imageTranspose.cu”文件要进行相应的修改：

//Stop the program with a readable message when a CUDA runtime call fails
static void checkCuda(cudaError_t status, const char *what)
{
	if (status != cudaSuccess) {
		fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
		exit(EXIT_FAILURE);
	}
}

//Copy `rows` rows of `rowBytes` bytes between two host buffers whose consecutive rows
//start dstStep / srcStep bytes apart
static void copyRows(unsigned char *dst, size_t dstStep, const unsigned char *src, size_t srcStep, size_t rowBytes, size_t rows)
{
	for (size_t r = 0; r < rows; ++r)
		for (size_t c = 0; c < rowBytes; ++c)
			dst[r * dstStep + c] = src[r * srcStep + c];
}

//Demo driver: transpose Lena.jpg on the GPU (shared-memory kernel) and on the CPU,
//print both timings and show the source and the two results.
//Returns 0 on success, 1 when the image cannot be loaded or host memory runs out;
//a failing CUDA call terminates the process through checkCuda.
int main()
{
	//Load the input as a single-channel 8-bit image through OpenCV
	IplImage *ImgSrc = cvLoadImage("Lena.jpg", CV_LOAD_IMAGE_GRAYSCALE);
	if (ImgSrc == NULL) {
		fprintf(stderr, "Failed to load Lena.jpg\n");
		return 1;
	}
	int Width = ImgSrc->width;
	int Height = ImgSrc->height;
	size_t numPixels = (size_t)Width * (size_t)Height;

	//The output images have width and height swapped
	IplImage *ImgDst_GPU_Shared = cvCreateImage(cvSize(Height, Width), IPL_DEPTH_8U, 1);
	IplImage *ImgDst_CPU = cvCreateImage(cvSize(Height, Width), IPL_DEPTH_8U, 1);

	//Raw pixel pointers. Consecutive IplImage rows start widthStep bytes apart, which can
	//exceed the row width because of alignment padding.
	unsigned char *pSrcData = (unsigned char*)(ImgSrc->imageData);
	unsigned char *pDstData_Shared = (unsigned char*)(ImgDst_GPU_Shared->imageData);
	unsigned char *cDstData = (unsigned char*)(ImgDst_CPU->imageData);

	//Device buffers hold the images packed, without row padding
	unsigned char *device_ImgDataSrc = NULL;
	unsigned char *device_ImgDataDst_Shared = NULL;
	checkCuda(cudaMalloc((void**)&device_ImgDataSrc, numPixels), "cudaMalloc(source)");
	checkCuda(cudaMalloc((void**)&device_ImgDataDst_Shared, numPixels), "cudaMalloc(destination)");

	//Upload the source; the 2D copy drops the host-side row padding
	checkCuda(cudaMemcpy2D(device_ImgDataSrc, Width, pSrcData, ImgSrc->widthStep,
	                       Width, Height, cudaMemcpyHostToDevice), "cudaMemcpy2D(host to device)");

	//Launch configuration: one W x W block per W x W tile, rounded up so that
	//partial tiles at the right and bottom edges are covered as well
	dim3 dimGrid_Shared((Width + W - 1) / W, (Height + W - 1) / W);
	dim3 dimBlock_Shared(W, W);

	//Time the kernel with CUDA events
	cudaEvent_t start, stop;
	checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
	checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
	checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
	GPUImageTranspose_Shared<<<dimGrid_Shared, dimBlock_Shared>>>(device_ImgDataSrc, device_ImgDataDst_Shared, Width, Height);
	checkCuda(cudaGetLastError(), "GPUImageTranspose_Shared launch");  //a launch reports configuration errors only here
	checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
	checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");  //waiting for stop implies start has completed
	float GPUTime_Shared = 0;
	checkCuda(cudaEventElapsedTime(&GPUTime_Shared, start, stop), "cudaEventElapsedTime");
	printf("GPU_Time_Shared = %f\n",  GPUTime_Shared);
	cudaEventDestroy(start);
	cudaEventDestroy(stop);

	//Download the result; the destination image is Height bytes wide and Width rows tall
	checkCuda(cudaMemcpy2D(pDstData_Shared, ImgDst_GPU_Shared->widthStep, device_ImgDataDst_Shared, Height,
	                       Height, Width, cudaMemcpyDeviceToHost), "cudaMemcpy2D(device to host)");

	//CPUImageTranspose works on packed buffers, so strip the row padding first
	unsigned char *packedSrc = (unsigned char*)malloc(numPixels);
	unsigned char *packedDst = (unsigned char*)malloc(numPixels);
	if (packedSrc == NULL || packedDst == NULL) {
		fprintf(stderr, "Out of host memory\n");
		free(packedSrc);
		free(packedDst);
		return 1;
	}
	copyRows(packedSrc, Width, pSrcData, ImgSrc->widthStep, Width, Height);

	//Time the CPU transpose alone
	clock_t t1 = clock();
	CPUImageTranspose(packedSrc, packedDst, Width, Height);
	clock_t t2 = clock();
	float time_cpu = 0;
	time_cpu = t2 - t1;
	printf("CPU_Time = %f\n", time_cpu*1000/CLOCKS_PER_SEC);  //milliseconds

	copyRows(cDstData, ImgDst_CPU->widthStep, packedDst, Height, Height, Width);
	free(packedSrc);
	free(packedDst);

	//Show the source and both results
	cvNamedWindow("Src");
	cvShowImage("Src", ImgSrc);

	cvNamedWindow("Dst_CPU");
	cvShowImage("Dst_CPU", ImgDst_CPU);

	cvNamedWindow("Dst_GPU_Shared");
	cvShowImage("Dst_GPU_Shared", ImgDst_GPU_Shared);

	cvWaitKey();

	//Release resources
	cudaFree(device_ImgDataSrc);
	cudaFree(device_ImgDataDst_Shared);

	cvDestroyAllWindows();
	cvReleaseImage(&ImgSrc);
	cvReleaseImage(&ImgDst_CPU);
	cvReleaseImage(&ImgDst_GPU_Shared);

	return 0;
}

4. 实验结果

（1）采用经典测试图像”Lena.jpg“作为输入图像：