CUDA C学习之纹理内存的使用

最新推荐文章于 2022-11-17 19:57:01 发布
探索鸭
最新推荐文章于 2022-11-17 19:57:01 发布
阅读量550
点赞数 1
分类专栏： CUDA C 文章标签：纹理内存
本文链接：https://blog.csdn.net/qq_40801709/article/details/85880832
版权
CUDA C 专栏收录该内容
1 篇文章 0 订阅
订阅专栏
图片的加权叠加：比较三种实现的运行性能——GPU（使用纹理内存）、GPU（不使用纹理内存）以及 CPU。
参考了：
https://blog.csdn.net/langzai310/article/details/83573278
https://blog.csdn.net/shuzfan/article/details/77095270
https://learnopengl-cn.readthedocs.io/zh/latest/01%20Getting%20started/06%20Textures/
#include <opencv2\opencv.hpp> 
#include <iostream>
#include <string>
#include <cuda.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>

using namespace std;
using namespace cv;

// 打印设备信息
// Prints a selection of fields from a cudaDeviceProp structure to stdout.
//
// prop: device properties, as filled in by cudaGetDeviceProperties.
//
// The size_t fields (totalGlobalMem, sharedMemPerBlock, memPitch,
// totalConstMem, textureAlignment) are printed with %zu; passing them to %d
// is undefined behaviour and truncates values above 2 GiB on 64-bit builds.
void printDeviceProp(const cudaDeviceProp &prop)
{
	printf("Device Name : %s.\n", prop.name);
	printf("totalGlobalMem : %zu.\n", prop.totalGlobalMem);
	printf("sharedMemPerBlock : %zu.\n", prop.sharedMemPerBlock);
	printf("regsPerBlock : %d.\n", prop.regsPerBlock);
	printf("warpSize : %d.\n", prop.warpSize);
	printf("memPitch : %zu.\n", prop.memPitch);
	printf("maxThreadsPerBlock : %d.\n", prop.maxThreadsPerBlock);
	printf("maxThreadsDim[0 - 2] : %d %d %d.\n", prop.maxThreadsDim[0], prop.maxThreadsDim[1], prop.maxThreadsDim[2]);
	printf("maxGridSize[0 - 2] : %d %d %d.\n", prop.maxGridSize[0], prop.maxGridSize[1], prop.maxGridSize[2]);
	printf("totalConstMem : %zu.\n", prop.totalConstMem);
	printf("major.minor : %d.%d.\n", prop.major, prop.minor);
	printf("clockRate : %d.\n", prop.clockRate);
	printf("textureAlignment : %zu.\n", prop.textureAlignment);
	printf("deviceOverlap : %d.\n", prop.deviceOverlap);
	printf("multiProcessorCount : %d.\n", prop.multiProcessorCount);
}

// 初始化cuda
// Selects the first CUDA device whose properties can be queried and whose
// compute capability major version is at least 1.
//
// The properties of every successfully queried device, up to and including
// the selected one, are printed with printDeviceProp.
//
// Returns true when a device was selected with cudaSetDevice, false when no
// suitable device exists or a CUDA runtime call failed (a diagnostic is
// written to stderr in that case).
bool InitCUDA()
{
	// Initialised so that a failed query can never leave count indeterminate.
	int count = 0;
	cudaError_t status = cudaGetDeviceCount(&count);
	if (status != cudaSuccess && status != cudaErrorNoDevice) {
		fprintf(stderr, "cudaGetDeviceCount failed: %s.\n", cudaGetErrorString(status));
		return false;
	}
	if (status == cudaErrorNoDevice || count == 0) {
		fprintf(stderr, "There is no device.\n");
		return false;
	}

	int i;
	for (i = 0; i < count; i++)
	{
		cudaDeviceProp prop;
		// Query once, and only print / inspect the structure when it is valid.
		if (cudaGetDeviceProperties(&prop, i) != cudaSuccess) {
			continue;
		}
		printDeviceProp(prop);
		if (prop.major >= 1) { break; }
	}
	if (i == count) {
		fprintf(stderr, "There is no device supporting CUDA 1.x.\n");
		return false;
	}

	status = cudaSetDevice(i);
	if (status != cudaSuccess) {
		fprintf(stderr, "cudaSetDevice failed: %s.\n", cudaGetErrorString(status));
		return false;
	}
	return true;
}

// CUDA texture references, one per input image. Texels are uchar4 and the
// textures are two-dimensional.
// Alternative declarations that return normalized floats instead:
//texture <uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> refTex1;
//texture <uchar4, cudaTextureType2D, cudaReadModeNormalizedFloat> refTex2;
texture <uchar4, cudaTextureType2D, cudaReadModeElementType> refTex1;
texture <uchar4, cudaTextureType2D, cudaReadModeElementType> refTex2;// element-type read mode: texel data is returned without type conversion
// CUDA arrays that back the two textures (allocated and bound in main).
cudaArray* cuArray1;
cudaArray* cuArray2;
// Channel format descriptor for uchar4 texels, used when allocating the CUDA arrays.
cudaChannelFormatDesc cuDesc = cudaCreateChannelDesc<uchar4>();

// Prints a prompt and blocks until a character is read from stdin.
// NOTE(review): expands to two statements, so it is not safe as the body of an unbraced if/else.
#define PAUSE printf("Press Enter key to continue..."); fgetc(stdin);

//1
// Blends the images bound to refTex1 and refTex2 with equal weights (0.5 each)
// and writes the result to an interleaved 8-bit image in device memory.
//
// Launch layout: 2D grid of 2D blocks; the thread with global coordinates
// (x, y) produces pixel (x, y). Threads outside imgWidth x imgHeight do nothing.
//
// pDstImgData: device buffer of imgHeight * imgWidth * channels bytes.
// imgHeight, imgWidth: image size in pixels.
// channels: bytes per output pixel; must be at least 3. When a fourth channel
//           is present it is set to 0.
//
// Preconditions: refTex1 and refTex2 are bound to uchar4 arrays covering the
// image and use unnormalized coordinates.
__global__ void weightAddKerkel(uchar *pDstImgData, int imgHeight, int imgWidth, int channels)
{
	const int tidx = blockDim.x*blockIdx.x + threadIdx.x;
	const int tidy = blockDim.y*blockIdx.y + threadIdx.y;

	// The grid is rounded up to whole blocks, so tail threads must bail out.
	if (tidx >= imgWidth || tidy >= imgHeight)
	{
		return;
	}

	// Texture fetch: one texel per thread from each input image.
	const uchar4 lenaBGR = tex2D(refTex1, tidx, tidy);
	const uchar4 moonBGR = tex2D(refTex2, tidx, tidy);

	// Byte offset computed in size_t so very large images cannot overflow int.
	const size_t idx = (static_cast<size_t>(tidy)*imgWidth + tidx)*channels;
	const float alpha = 0.5f;
	pDstImgData[idx + 0] = (alpha*lenaBGR.x + (1 - alpha)*moonBGR.x);
	pDstImgData[idx + 1] = (alpha*lenaBGR.y + (1 - alpha)*moonBGR.y);
	pDstImgData[idx + 2] = (alpha*lenaBGR.z + (1 - alpha)*moonBGR.z);

	// Only touch the fourth byte when the pixel actually has one; with a
	// 3-channel output it would belong to the next pixel (or lie past the end).
	if (channels > 3)
	{
		pDstImgData[idx + 3] = 0;
	}
}
//2
// Blends two interleaved 8-bit images held in global memory with equal
// weights (0.5 each), without using texture memory.
//
// Pixels in the first row or first column are blended directly. Every other
// pixel first averages itself with its left, upper and upper-left neighbours
// in each input and then blends the two averages, so the output of this kernel
// is smoothed compared with a plain per-pixel blend.
//
// Launch layout: 2D grid of 2D blocks; the thread with global coordinates
// (x, y) produces pixel (x, y). Threads outside imgWidth x imgHeight do nothing.
//
// simply_d: output device buffer, imgHeight * imgWidth * channels bytes. It
//           must not alias the inputs because neighbouring pixels are read.
// lena_d, moon_d: input device buffers with the same layout as simply_d.
// channels: bytes per pixel; must be at least 3. When a fourth channel is
//           present it is set to 0 in the output.
__global__ void weightAddsimply(uchar *simply_d, uchar *lena_d, uchar *moon_d, int imgHeight, int imgWidth, int channels)
{
	const int tidx = blockDim.x*blockIdx.x + threadIdx.x;
	const int tidy = blockDim.y*blockIdx.y + threadIdx.y;

	// The grid is rounded up to whole blocks, so tail threads must bail out.
	if (tidx >= imgWidth || tidy >= imgHeight) {
		return;
	}

	const float alpha = 0.5f;
	// Byte offsets are computed in size_t so very large images cannot overflow int.
	const size_t width = static_cast<size_t>(imgWidth);
	const size_t idx = (tidy*width + tidx)*channels;

	if (tidx == 0 || tidy == 0) {
		// Border pixel: no upper/left neighbours, blend the pixel itself.
		for (int c = 0; c < 3; ++c) {
			simply_d[idx + c] = (lena_d[idx + c])* alpha + (moon_d[idx + c]) * (1 - alpha);
		}
	}
	else {
		const size_t idx1 = ((tidy - 1)*width + tidx)*channels;      // upper neighbour
		const size_t idx2 = ((tidy - 1)*width + tidx - 1)*channels;  // upper-left neighbour
		const size_t idx3 = (tidy*width + tidx - 1)*channels;        // left neighbour
		for (int c = 0; c < 3; ++c) {
			simply_d[idx + c] = (lena_d[idx1 + c] + lena_d[idx2 + c] + lena_d[idx3 + c] + lena_d[idx + c])* alpha / 4
				+ (moon_d[idx1 + c] + moon_d[idx2 + c] + moon_d[idx3 + c] + moon_d[idx + c])*(1 - alpha) / 4;
		}
	}

	// Only touch the fourth byte when the pixel actually has one; with a
	// 3-channel image it would belong to the next pixel (or lie past the end).
	if (channels > 3) {
		simply_d[idx + 3] = 0;
	}
}
/*
__global__ void weightAddsimply(uchar4 *simply_d, uchar4 *lena_d, uchar4 *moon_d, int imgHeight, int imgWidth, int channels)
{
	const int tidx = blockDim.x*blockIdx.x + threadIdx.x;
	const int tidy = blockDim.y*blockIdx.y + threadIdx.y;
	float alpha = 0.5;

	if (tidx<imgWidth && tidy<imgHeight)
	{
		if (tidx == 0 || tidy == 0) {
			//int idx1 = (tidy + 1)*imgWidth + tidx;
			//int idx2 = (tidy + 1)*imgWidth + tidx + 1;
			//int idx3 = (tidy)*imgWidth + tidx + 1;
			int idx = tidy*imgWidth + tidx;
			simply_d[idx].x = (lena_d[idx].x)* alpha + (moon_d[idx].x) * (1 - alpha);
			simply_d[idx].y = (lena_d[idx].y)* alpha + (moon_d[idx].y) * (1 - alpha);
			simply_d[idx].z = (lena_d[idx].z)* alpha + (moon_d[idx].z) * (1 - alpha);
			simply_d[idx].w = 0;
		}
		else {
			int idx1 = (tidy - 1)*imgWidth + tidx;
			int idx2 = (tidy - 1)*imgWidth + tidx - 1;
			int idx3 = (tidy)*imgWidth + tidx - 1;
			int idx = tidy*imgWidth + tidx;
			simply_d[idx].x = (lena_d[idx1].x+ lena_d[idx2].x+lena_d[idx3].x+ lena_d[idx].x)* alpha/4 
				+ (moon_d[idx1].x+ moon_d[idx2].x+ moon_d[idx3].x+ moon_d[idx].x)*(1 - alpha)/4;
			simply_d[idx].y = (lena_d[idx1].y + lena_d[idx2].y + lena_d[idx3].y + lena_d[idx].y)* alpha / 4
				+ (moon_d[idx1].y + moon_d[idx2].y + moon_d[idx3].y + moon_d[idx].y)*(1 - alpha) / 4;
			simply_d[idx].z = (lena_d[idx1].z + lena_d[idx2].z + lena_d[idx3].z + lena_d[idx].z)* alpha / 4
				+ (moon_d[idx1].z + moon_d[idx2].z + moon_d[idx3].z + moon_d[idx].z)*(1 - alpha) / 4;
			simply_d[idx].w = 0;
		}
	}
}
*/

// CPU reference: blends two interleaved 8-bit images with equal weights.
//
// d_in1, d_in2: input images, imgHeight * imgWidth * channels bytes each.
// d_out: output image with the same layout; every byte becomes
//        0.5 * d_in1 + 0.5 * d_in2, truncated to unsigned char.
// Nothing is written when any of the three dimensions is zero or negative.
void cpu_dejia(unsigned char * d_in1, unsigned char * d_in2, unsigned char * d_out, int imgHeight, int imgWidth, int channels) {
	if (imgHeight <= 0 || imgWidth <= 0 || channels <= 0) {
		return;
	}

	// Every byte is processed independently, so the row / column / channel
	// nest collapses into one linear pass over the interleaved buffer.
	const float weight = 0.5f;
	const int byteCount = imgHeight * imgWidth * channels;
	for (int pos = 0; pos < byteCount; ++pos) {
		d_out[pos] = weight*d_in1[pos] + d_in2[pos] * (1 - weight);
	}
}

// Reports a failed CUDA runtime call on stderr.
// Returns true when status is cudaSuccess, false otherwise.
static bool cudaCallOk(cudaError_t status, const char *what)
{
	if (status == cudaSuccess) {
		return true;
	}
	fprintf(stderr, "%s failed: %s.\n", what, cudaGetErrorString(status));
	return false;
}

// Unbinds both textures, frees both CUDA arrays and frees the given device
// buffers. Null pointers are accepted, so this can run after a partial setup.
static void releaseDeviceResources(uchar *pDstImgData, uchar *simply_d, uchar *lena_d, uchar *moon_d)
{
	cudaUnbindTexture(refTex1);
	cudaUnbindTexture(refTex2);
	if (cuArray1 != NULL) { cudaFreeArray(cuArray1); cuArray1 = NULL; }
	if (cuArray2 != NULL) { cudaFreeArray(cuArray2); cuArray2 = NULL; }
	cudaFree(pDstImgData);
	cudaFree(simply_d);
	cudaFree(lena_d);
	cudaFree(moon_d);
}

// Blends two images three ways -- GPU with texture memory, GPU with plain
// global memory, and CPU -- prints the time of each pass and shows the results.
//
// Returns 0 on success and 1 when CUDA initialisation, image loading or any
// CUDA runtime call fails.
int main()
{
	// Query device information and select a device.
	if (!InitCUDA()) return 1;
	//PAUSE;

	Mat Lena = imread("D:\\laopo\\beauty\\test.jpg");
	Mat moon = imread("D:\\laopo\\beauty\\test2.jpg");
	if (Lena.empty() || moon.empty()) {
		fprintf(stderr, "Failed to load the input images.\n");
		return 1;
	}
	// All three passes index both images with the dimensions of the first one,
	// so a size mismatch would read past the end of the smaller image.
	if (Lena.cols != moon.cols || Lena.rows != moon.rows) {
		fprintf(stderr, "The input images must have the same size.\n");
		return 1;
	}

	// Convert to 4-channel BGRA so that each pixel matches the uchar4 texel type.
	cvtColor(Lena, Lena, CV_BGR2BGRA);
	cvtColor(moon, moon, CV_BGR2BGRA);
	const int imgWidth = Lena.cols;
	const int imgHeight = Lena.rows;
	const int channels = Lena.channels();
	const size_t rowBytes = static_cast<size_t>(imgWidth) * channels * sizeof(uchar);
	const size_t imgBytes = rowBytes * imgHeight;
	clock_t start, end;

	// Texture attributes: clamp out-of-range coordinates, unnormalized
	// coordinates, point (nearest texel) sampling.
	refTex1.addressMode[0] = cudaAddressModeClamp;
	refTex1.addressMode[1] = cudaAddressModeClamp;
	refTex1.normalized = false;
	refTex1.filterMode = cudaFilterModePoint;

	refTex2.addressMode[0] = cudaAddressModeClamp;
	refTex2.addressMode[1] = cudaAddressModeClamp;
	refTex2.normalized = false;
	// The original assigned refTex1.filterMode here a second time.
	refTex2.filterMode = cudaFilterModePoint;

	// Device buffers start out null so cleanup is valid after any failure.
	uchar *pDstImgData = NULL;
	uchar *simply_d = NULL;
	uchar *lena_d = NULL;
	uchar *moon_d = NULL;

	// Allocate the CUDA arrays, upload both images into them, bind the
	// textures and allocate the output buffer of the texture pass.
	bool ok = cudaCallOk(cudaMallocArray(&cuArray1, &cuDesc, imgWidth, imgHeight), "cudaMallocArray(cuArray1)")
		&& cudaCallOk(cudaMallocArray(&cuArray2, &cuDesc, imgWidth, imgHeight), "cudaMallocArray(cuArray2)")
		&& cudaCallOk(cudaMemcpy2DToArray(cuArray1, 0, 0, Lena.data, static_cast<size_t>(Lena.step), rowBytes, imgHeight, cudaMemcpyHostToDevice), "cudaMemcpy2DToArray(cuArray1)")
		&& cudaCallOk(cudaMemcpy2DToArray(cuArray2, 0, 0, moon.data, static_cast<size_t>(moon.step), rowBytes, imgHeight, cudaMemcpyHostToDevice), "cudaMemcpy2DToArray(cuArray2)")
		&& cudaCallOk(cudaBindTextureToArray(refTex1, cuArray1), "cudaBindTextureToArray(refTex1)")
		&& cudaCallOk(cudaBindTextureToArray(refTex2, cuArray2), "cudaBindTextureToArray(refTex2)")
		&& cudaCallOk(cudaMalloc(&pDstImgData, imgBytes), "cudaMalloc(pDstImgData)");
	if (!ok) {
		releaseDeviceResources(pDstImgData, simply_d, lena_d, moon_d);
		return 1;
	}

	// One thread per pixel; the grid is rounded up to whole blocks.
	dim3 block(32, 32);
	dim3 grid((imgWidth + block.x - 1) / block.x, (imgHeight + block.y - 1) / block.y);

	// 1: GPU, texture memory.
	Mat dstImg = Mat::zeros(imgHeight, imgWidth, CV_8UC4);
	start = clock();
	weightAddKerkel<<<grid, block>>>(pDstImgData, imgHeight, imgWidth, channels);
	// A launch reports configuration errors through cudaGetLastError and
	// execution errors at the next synchronisation point.
	ok = cudaCallOk(cudaGetLastError(), "weightAddKerkel launch")
		&& cudaCallOk(cudaDeviceSynchronize(), "weightAddKerkel execution");
	end = clock();
	double time1 = (double)(end - start) / CLOCKS_PER_SEC;
	// Copy the result back to the host.
	ok = ok && cudaCallOk(cudaMemcpy(dstImg.data, pDstImgData, imgBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(dstImg)");
	if (!ok) {
		releaseDeviceResources(pDstImgData, simply_d, lena_d, moon_d);
		return 1;
	}
	printf("GPU use texture exec time is %.8f s\n", time1);
	namedWindow("show1");
	imshow("show1", dstImg);

	// 2: GPU, plain global memory.
	Mat dstImg2 = Mat::zeros(imgHeight, imgWidth, CV_8UC4);
	ok = cudaCallOk(cudaMalloc(&simply_d, imgBytes), "cudaMalloc(simply_d)")
		&& cudaCallOk(cudaMalloc(&lena_d, imgBytes), "cudaMalloc(lena_d)")
		&& cudaCallOk(cudaMalloc(&moon_d, imgBytes), "cudaMalloc(moon_d)")
		&& cudaCallOk(cudaMemcpy(lena_d, Lena.data, imgBytes, cudaMemcpyHostToDevice), "cudaMemcpy(lena_d)")
		&& cudaCallOk(cudaMemcpy(moon_d, moon.data, imgBytes, cudaMemcpyHostToDevice), "cudaMemcpy(moon_d)");
	if (!ok) {
		releaseDeviceResources(pDstImgData, simply_d, lena_d, moon_d);
		return 1;
	}
	start = clock();
	weightAddsimply<<<grid, block>>>(simply_d, lena_d, moon_d, imgHeight, imgWidth, channels);
	ok = cudaCallOk(cudaGetLastError(), "weightAddsimply launch")
		&& cudaCallOk(cudaDeviceSynchronize(), "weightAddsimply execution");
	end = clock();
	double time2 = (double)(end - start) / CLOCKS_PER_SEC;
	ok = ok && cudaCallOk(cudaMemcpy(dstImg2.data, simply_d, imgBytes, cudaMemcpyDeviceToHost), "cudaMemcpy(dstImg2)");
	if (!ok) {
		releaseDeviceResources(pDstImgData, simply_d, lena_d, moon_d);
		return 1;
	}
	printf("GPU didn't use texture exec time is %.8f s\n", time2);
	namedWindow("show2");
	imshow("show2", dstImg2);

	// 3: CPU reference.
	Mat hechengImage = Mat::zeros(imgHeight, imgWidth, CV_8UC4);
	start = clock();
	cpu_dejia(Lena.data, moon.data, hechengImage.data, imgHeight, imgWidth, channels);
	end = clock();
	double time3 = (double)(end - start)/CLOCKS_PER_SEC;
	printf("CPU exec time is %.8f s\n", time3);
	namedWindow("hecheng");
	imshow("hecheng", hechengImage);
	double timefast = time3 - time1;
	printf("GPU纹理内存下的处理速度比CPU下快 %.8f s\n",timefast);

	// Unbind the textures and free every device allocation, including the
	// texture-pass output buffer that the original never released.
	releaseDeviceResources(pDstImgData, simply_d, lena_d, moon_d);

	// waitKey(0) blocks until a key is pressed, keeping the windows open.
	waitKey(0);
	return 0;
}