CUDA by example Chapter7 纹理内存_cuda热传导-CSDN博客

本文链接：https://blog.csdn.net/wolfcsharp/article/details/100301221

纹理内存

纹理内存简介
热传导模拟

纹理内存简介

和常量内存一样，纹理内存是另一种类型的只读内存。纹理内存是专门为那些在内存访问模式中存在大量空间局部性的图形应用程序而设计的。假如在某个计算应用程序中，一个线程读取的位置与邻近线程读取的位置"非常接近"，使用纹理内存将会减少内存通信量，从而提高性能。
在这里插入图片描述

热传导模拟

简单的传热模型

假设一个矩形网格，热源分布如下。假设热源单元本身的温度保持不变，热量可以在相邻单元之间流动，从高温单元传导到低温单元。
在这里插入图片描述
温度更新的计算方法：
将单元与其每个邻接单元的温差乘以传热速率 k 后求和，再加上该单元原有的温度，就得到新时刻的温度：
$T_{NEW} = T_{OLD} + \sum_{NEIGHBORS} k\,(T_{NEIGHBOR} - T_{OLD})$
由于邻接单元只有上、下、左、右四个，将上式展开化简，得到：
$T_{NEW} = T_{OLD} + k\,(T_{TOP} + T_{BOTTOM} + T_{LEFT} + T_{RIGHT} - 4\,T_{OLD})$

代码中温度更新的计算

更新流程：

给定包含初始温度的网格，它的大部分单元都是0，少部分是初始温度值。将其中作为热源的初始温度值复制到当前时间的网格对应单元中，以确保"加热单元将保持恒温"。这个复制操作是在copy_const_kernal()中执行的。
给定一个输入温度网格，根据温度更新的公式计算输出温度网格。这个更新操作是在blend_kernal()中执行的。
将输入温度网格和输出温度网格交换，为下一个步骤的计算做好准备。当模拟下一个时间步时，步骤2中计算得到的输出温度网格将成为步骤1中的输入温度网格。

GPU不使用纹理内存的热传导模拟计算

完整代码

// 代码7.3不使用纹理内存的热传导模型计算
//时间：2019.07.28
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include "cpu_anim.h"
#include "book.h"

// Side length, in cells, of the square simulation grid (also the bitmap size).
#define DIM 1024
// Rate constant k applied by blend_kernal in the temperature update.
#define SPEED 0.25f
// Not referenced by the code shown in this program.
#define PI 3.1415926535897932f
// Temperature that main writes into the hot heater cells.
#define MAX_TEMP 1.0f
// Temperature that main writes into the cold heater cells.
#define MIN_TEMP 0.0001f



// State shared by the per-frame callback (anim_gpu) and the exit callback
// (anim_exit).
struct DataBlock
{
	// Device buffer that float_to_color fills with the pixels of one frame.
	unsigned char *output_bitmap;

	// Device temperature grids: current input, next output, and the fixed
	// heater pattern.
	float *dev_inSrc;
	float *dev_outSrc;
	float *dev_constSrc;

	// Host-side animation bitmap that receives the finished frame.
	CPUAnimBitmap *bitmap;

	// Events recorded at the beginning and end of a frame's GPU work.
	cudaEvent_t start;
	cudaEvent_t stop;

	// Accumulated frame time in milliseconds and number of frames so far.
	float totalTime;
	float frames;
};

// Re-stamps the heater cells onto a grid: wherever cptr holds a non-zero
// value, that value overwrites the matching cell of iptr; all other cells of
// iptr are left alone.
// Launch: 2D grid of 2D blocks, one thread per cell. There is no bounds
// check, so the launch must cover the buffers exactly.
__global__ void copy_const_kernal(float *iptr, const float *cptr)
{
	// Pixel coordinates handled by this thread.
	const int col = blockIdx.x * blockDim.x + threadIdx.x;
	const int row = blockIdx.y * blockDim.y + threadIdx.y;
	// Row-major index; the row pitch is the launch width in threads.
	const int idx = row * (blockDim.x * gridDim.x) + col;

	const float heater = cptr[idx];
	if (heater != 0)
	{
		iptr[idx] = heater;
	}
}

// One diffusion step over the DIM x DIM grid:
//   outSrc = inSrc + SPEED * (top + bottom + left + right - 4 * inSrc)
// A cell on an edge uses its own value in place of the missing neighbour.
//   outSrc - grid that receives the updated temperatures
//   inSrc  - grid holding the current temperatures (must not alias outSrc)
// Launch: 2D grid of 2D blocks with at least one thread per cell; threads
// that fall outside the grid do nothing.
__global__ void blend_kernal(float *outSrc, const float *inSrc)
{
	// Map threadIdx/blockIdx to a pixel position.
	int x = threadIdx.x + blockIdx.x*blockDim.x;
	int y = threadIdx.y + blockIdx.y*blockDim.y;

	// Guard against launches that overshoot the grid.
	if (x >= DIM || y >= DIM) return;

	// Use DIM as the row pitch so the centre index agrees with the
	// neighbour indices below, whatever the launch width is.
	int offset = x + y*DIM;

	// Horizontal neighbours, clamped to the row.
	int left = (x == 0) ? offset : offset - 1;
	int right = (x == DIM - 1) ? offset : offset + 1;

	// Vertical neighbours, clamped to the column.
	int top = (y == 0) ? offset : offset - DIM;
	int bottom = (y == DIM - 1) ? offset : offset + DIM;

	float center = inSrc[offset];
	outSrc[offset] = center + SPEED*(inSrc[top] + inSrc[bottom] + inSrc[left] + inSrc[right] - center * 4);
}


// Exchanges the two float pointers that a and b point at.
// (Author's note: getting this swap right cost two days.)
void swap(float **a, float **b)
{
	float *const held = *a;
	*a = *b;
	*b = held;
}



// Per-frame callback: advances the simulation by 90 time steps, converts the
// newest grid to pixels, copies them into the host bitmap and prints the
// running average time per frame.
//   d     - simulation state (device buffers, timing events, counters)
//   ticks - passed in by the animation framework; not used here
// If a kernel launch or the copy fails, the error is printed and the frame
// is abandoned without updating the timing counters.
void anim_gpu(DataBlock *d, int ticks)
{
	cudaEventRecord(d->start, 0);
	dim3 grids(DIM / 16, DIM / 16);
	dim3 blocks(16, 16);
	CPUAnimBitmap *bitmap = d->bitmap;

	for (int i = 0; i < 90; i++)
	{
		// Keep the heaters at their fixed temperature, then diffuse.
		copy_const_kernal<<<grids, blocks>>>(d->dev_inSrc, d->dev_constSrc);
		blend_kernal<<<grids, blocks>>>(d->dev_outSrc, d->dev_inSrc);

		// The grid just written becomes the input of the next step.
		swap(&d->dev_inSrc, &d->dev_outSrc);
	}
	// Map the float temperatures to colours so they can be displayed.
	float_to_color<<<grids, blocks>>>(d->output_bitmap, d->dev_inSrc);

	// Launches do not return a status; fetch it explicitly, then check the
	// blocking copy as well.
	cudaError_t status = cudaGetLastError();
	if (status == cudaSuccess)
	{
		status = cudaMemcpy(bitmap->get_ptr(), d->output_bitmap, bitmap->image_size(), cudaMemcpyDeviceToHost);
	}
	if (status != cudaSuccess)
	{
		printf("CUDA error in anim_gpu: %s\n", cudaGetErrorString(status));
		return;
	}

	cudaEventRecord(d->stop, 0);
	cudaEventSynchronize(d->stop);
	float elapsedTime;
	cudaEventElapsedTime(&elapsedTime, d->start, d->stop);
	d->totalTime += elapsedTime;
	++d->frames;
	printf("Average Time per frame:%3.1fms\n", d->totalTime / d->frames);
}

// Exit callback: releases the device buffers and timing events held in d.
void anim_exit(DataBlock *d)
{
	cudaFree(d->dev_constSrc);
	cudaFree(d->dev_inSrc);
	cudaFree(d->dev_outSrc);
	// The pixel buffer is a cudaMalloc allocation too and used to be leaked.
	cudaFree(d->output_bitmap);

	cudaEventDestroy(d->start);
	cudaEventDestroy(d->stop);
}

// Program entry: allocates the device buffers, builds the heater pattern and
// the initial temperature grid on the host, uploads both, and hands control
// to the animation loop (anim_gpu per frame, anim_exit on shutdown).
// Returns 1 if a CUDA call or the host allocation fails.
int main()
{
	DataBlock data;
	CPUAnimBitmap bitmap(DIM, DIM, &data);
	data.bitmap = &bitmap;
	data.totalTime = 0;
	data.frames = 0;

	// The temperature grids hold DIM*DIM floats (the fill loops below index
	// exactly that many), so size them explicitly instead of borrowing the
	// bitmap's byte count.
	const size_t gridBytes = sizeof(float) * DIM * DIM;

	// Each step runs only if every earlier one succeeded.
	cudaError_t status = cudaEventCreate(&data.start);
	if (status == cudaSuccess) status = cudaEventCreate(&data.stop);
	if (status == cudaSuccess) status = cudaMalloc((void **)&data.output_bitmap, bitmap.image_size());
	if (status == cudaSuccess) status = cudaMalloc((void **)&data.dev_constSrc, gridBytes);
	if (status == cudaSuccess) status = cudaMalloc((void **)&data.dev_inSrc, gridBytes);
	if (status == cudaSuccess) status = cudaMalloc((void **)&data.dev_outSrc, gridBytes);
	if (status != cudaSuccess)
	{
		printf("CUDA setup failed: %s\n", cudaGetErrorString(status));
		return 1;
	}

	// Host scratch grid used to initialise dev_constSrc and dev_inSrc.
	float *temp = (float *)malloc(gridBytes);
	if (!temp)
	{
		printf("Host allocation for the temperature grid failed\n");
		return 1;
	}

	// Heater pattern: one large hot rectangle on an otherwise zero grid ...
	for (int i = 0; i < DIM*DIM; i++)
	{
		temp[i] = 0;
		int x = i % DIM;
		int y = i / DIM;
		if ((x>300) && (x<600) && (y>310) && (y < 601))
		{
			temp[i] = MAX_TEMP;
		}
	}
	// ... a few single-cell heaters ...
	temp[DIM * 100 + 100] = (MAX_TEMP + MIN_TEMP) / 2;
	temp[DIM * 700 + 100] = MIN_TEMP;
	temp[DIM * 300 + 300] = MIN_TEMP;
	temp[DIM * 200 + 700] = MIN_TEMP;
	// ... and one cold square.
	for (int y = 800; y < 900; y++)
	{
		for (int x = 400; x < 500; x++)
		{
			temp[x + y*DIM] = MIN_TEMP;
		}
	}
	// Upload the heater pattern.
	status = cudaMemcpy(data.dev_constSrc, temp, gridBytes, cudaMemcpyHostToDevice);

	// Add a hot block that is part of the initial state only, not a heater.
	for (int y = 800; y < DIM; y++)
	{
		for (int x = 0; x < 200; x++)
		{
			temp[x + y*DIM] = MAX_TEMP;
		}
	}
	// Upload the initial temperature grid.
	if (status == cudaSuccess)
	{
		status = cudaMemcpy(data.dev_inSrc, temp, gridBytes, cudaMemcpyHostToDevice);
	}

	// The host scratch grid is no longer needed.
	free(temp);
	if (status != cudaSuccess)
	{
		printf("CUDA upload failed: %s\n", cudaGetErrorString(status));
		return 1;
	}

	// anim_gpu renders every frame; anim_exit releases resources on exit.
	bitmap.anim_and_exit((void(*) (void *, int))anim_gpu, (void(*)(void *))anim_exit);
}

运行结果
在这里插入图片描述

需要极其注意，更新步骤中的第三步，交换输入网格和输出网格：
因为dev_inSrc和dev_outSrc分别是两幅图像的入口指针，将二者交换只需要：

// Exchange the two grid pointers through a temporary.
float *tmp = dev_inSrc;
dev_inSrc = dev_outSrc;
dev_outSrc = tmp;

但是，如果使用函数swap来完成这个工作，由于C/C++按值传递参数，函数内部对形参的修改不会影响调用者手中的原始指针。因此需要把两个指针的地址&dev_inSrc和&dev_outSrc传入swap函数（参数类型为float **），才能完成交换任务。

// Exchanges the pointers stored at *a and *b; the caller passes the addresses
// of the two pointers. (Author's note: this swap cost two days to get right.)
void swap(float **a, float **b)
{
	float *const saved = *b;
	*b = *a;
	*a = saved;
}

// Usage: pass the addresses of the two pointers that should be exchanged.
swap(&d->dev_inSrc,&d->dev_outSrc);

GPU使用一维纹理内存的热传导模拟计算

1. 纹理内存的使用场景
在温度更新计算的内存访问模式中存在着巨大的内存空间局部性，这种访问模式可以通过GPU纹理内存来加速。
2.一维纹理内存变量引用的声明

// File-scope 1D texture references, read in kernels with tex1Dfetch().
// The template arguments spell out the defaults of texture<float>.
// NOTE(review): texture references are a legacy API — confirm the toolkit in
// use still supports them.
texture<float, cudaTextureType1D, cudaReadModeElementType> texConstSrc;
texture<float, cudaTextureType1D, cudaReadModeElementType> texIn;
texture<float, cudaTextureType1D, cudaReadModeElementType> texOut;

3.一维纹理内存引用变量与实际内存缓冲区的绑定
通过cudaBindTexture()可以将纹理内存引用变量与内存缓冲区进行绑定。

	// Device buffers: the pixel buffer followed by the three temperature grids.
	cudaMalloc((void **)&data.output_bitmap, bitmap.image_size());
	cudaMalloc((void**)&data.dev_constSrc, bitmap.image_size());
	cudaMalloc((void **)&data.dev_inSrc, bitmap.image_size());
	cudaMalloc((void **)&data.dev_outSrc, bitmap.image_size());

	// After allocation, bind each grid to the texture reference declared earlier.
	// NOTE(review): the grids reuse bitmap.image_size() as their byte size —
	// presumably equal to DIM*DIM*sizeof(float); confirm.
	cudaBindTexture(NULL, texConstSrc, data.dev_constSrc, bitmap.image_size());
	cudaBindTexture(NULL, texIn, data.dev_inSrc,  bitmap.image_size());
	cudaBindTexture(NULL, texOut, data.dev_outSrc,  bitmap.image_size());

4.读取一维纹理内存方法
使用tex1Dfetch()读取一维纹理内存。传给blend_kernal的dstOut是一个bool变量，用于指明读取哪一个缓冲区作为输入：为true时从texIn读取，为false时从texOut读取。

// One diffusion step whose input is read through the 1D textures.
//   dst    - grid that receives the updated temperatures
//   dstOut - true: read the current grid from texIn; false: from texOut
// Launch: 2D grid of 2D blocks, one thread per cell; there is no bounds check.
__global__ void blend_kernal(float *dst, bool dstOut)
{
	// Pixel coordinates and row-major index of this thread's cell.
	const int col = blockIdx.x * blockDim.x + threadIdx.x;
	const int row = blockIdx.y * blockDim.y + threadIdx.y;
	const int idx = row * (blockDim.x * gridDim.x) + col;

	// Neighbour indices; an edge cell stands in for its missing neighbour.
	const int west = (col == 0) ? idx : idx - 1;
	const int east = (col == DIM - 1) ? idx : idx + 1;
	const int north = (row == 0) ? idx : idx - DIM;
	const int south = (row == DIM - 1) ? idx : idx + DIM;

	float up, down, lhs, rhs, mid;
	if (!dstOut)
	{
		up = tex1Dfetch(texOut, north);
		down = tex1Dfetch(texOut, south);
		lhs = tex1Dfetch(texOut, west);
		rhs = tex1Dfetch(texOut, east);
		mid = tex1Dfetch(texOut, idx);
	}
	else
	{
		up = tex1Dfetch(texIn, north);
		down = tex1Dfetch(texIn, south);
		lhs = tex1Dfetch(texIn, west);
		rhs = tex1Dfetch(texIn, east);
		mid = tex1Dfetch(texIn, idx);
	}

	dst[idx] = mid + SPEED*(up + down + lhs + rhs - 4 * mid);
}

5.完整代码

// 代码7.3.4使用一维纹理内存的热传导模型计算
//时间：2019.07.29
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include "cpu_anim.h"
#include "cuda.h"
#include "book.h"

// Side length, in cells, of the square simulation grid (also the bitmap size).
#define DIM 1024
// Rate constant k applied by blend_kernal in the temperature update.
#define SPEED 0.25f
// Not referenced by the code shown in this program.
#define PI 3.1415926535897932f
// Temperature that main writes into the hot heater cells.
#define MAX_TEMP 1.0f
// Temperature that main writes into the cold heater cells.
#define MIN_TEMP 0.0001f

// File-scope 1D texture references; main binds them to the device grids and
// the kernels read them with tex1Dfetch(). The template arguments spell out
// the defaults of texture<float>.
// NOTE(review): texture references are a legacy API — confirm the toolkit in
// use still supports them.
texture<float, cudaTextureType1D, cudaReadModeElementType> texConstSrc;
texture<float, cudaTextureType1D, cudaReadModeElementType> texIn;
texture<float, cudaTextureType1D, cudaReadModeElementType> texOut;




// Everything the frame callback (anim_gpu) and the exit callback (anim_exit)
// need to share.
struct DataBlock
{
	// Device pixel buffer produced by float_to_color.
	unsigned char *output_bitmap;

	// Device temperature grids; main binds them to texIn, texOut and
	// texConstSrc respectively.
	float *dev_inSrc;
	float *dev_outSrc;
	float *dev_constSrc;

	// Host-side bitmap that each finished frame is copied into.
	CPUAnimBitmap *bitmap;

	// Events marking the beginning and end of one frame's GPU work.
	cudaEvent_t start;
	cudaEvent_t stop;

	// Sum of frame times in milliseconds, and the number of frames rendered.
	float totalTime;
	float frames;
};

// Re-stamps the heater cells onto a grid: wherever texConstSrc holds a
// non-zero value, that value overwrites the matching cell of iptr.
// Launch: 2D grid of 2D blocks, one thread per cell; there is no bounds check.
__global__ void copy_const_kernal(float *iptr)
{
	// Pixel coordinates and row-major index of this thread's cell.
	const int col = blockIdx.x * blockDim.x + threadIdx.x;
	const int row = blockIdx.y * blockDim.y + threadIdx.y;
	const int idx = row * (blockDim.x * gridDim.x) + col;

	const float heater = tex1Dfetch(texConstSrc, idx);
	if (heater != 0)
	{
		iptr[idx] = heater;
	}
}

// One diffusion step over the DIM x DIM grid, reading through 1D textures:
//   dst = c + SPEED * (top + bottom + left + right - 4 * c)
// A cell on an edge uses its own value in place of the missing neighbour.
//   dst    - grid that receives the updated temperatures
//   dstOut - true: read the current grid from texIn; false: from texOut
// Launch: 2D grid of 2D blocks with at least one thread per cell; threads
// that fall outside the grid do nothing.
__global__ void blend_kernal(float *dst, bool dstOut)
{
	// Map threadIdx/blockIdx to a pixel position.
	int x = threadIdx.x + blockIdx.x*blockDim.x;
	int y = threadIdx.y + blockIdx.y*blockDim.y;

	// Guard against launches that overshoot the grid.
	if (x >= DIM || y >= DIM) return;

	// Use DIM as the row pitch so the centre index agrees with the
	// neighbour indices below, whatever the launch width is.
	int offset = x + y*DIM;

	// Horizontal neighbours, clamped to the row.
	int left = (x == 0) ? offset : offset - 1;
	int right = (x == DIM - 1) ? offset : offset + 1;

	// Vertical neighbours, clamped to the column.
	int top = (y == 0) ? offset : offset - DIM;
	int bottom = (y == DIM - 1) ? offset : offset + DIM;

	// A texture reference has to be named statically, hence the two branches.
	float t, b, l, r, c;
	if (dstOut)
	{
		t = tex1Dfetch(texIn, top);
		b = tex1Dfetch(texIn, bottom);
		l = tex1Dfetch(texIn, left);
		r = tex1Dfetch(texIn, right);
		c = tex1Dfetch(texIn, offset);
	}
	else
	{
		t = tex1Dfetch(texOut, top);
		b = tex1Dfetch(texOut, bottom);
		l = tex1Dfetch(texOut, left);
		r = tex1Dfetch(texOut, right);
		c = tex1Dfetch(texOut, offset);
	}

	dst[offset] = c + SPEED*(t + b + l + r - 4 * c);
}





// Per-frame callback: advances the simulation by 90 time steps, converts the
// newest grid to pixels, copies them into the host bitmap and prints the
// running average time per frame.
//   d     - simulation state (device buffers, timing events, counters)
//   ticks - passed in by the animation framework; not used here
// If a kernel launch or the copy fails, the error is printed and the frame
// is abandoned without updating the timing counters.
void anim_gpu(DataBlock *d, int ticks)
{
	cudaEventRecord(d->start, 0);
	dim3 grids(DIM / 16, DIM / 16);
	dim3 blocks(16, 16);
	CPUAnimBitmap *bitmap = d->bitmap;

	// The texture references are global and stay bound to fixed buffers, so
	// the buffers cannot simply be swapped; a flag selects which one is the
	// input on each iteration instead.
	// NOTE(review): volatile is kept from the original; it looks unnecessary
	// for a host-side local — confirm before removing.
	volatile bool dstOut = true;
	for (int i = 0; i < 90; i++)
	{
		const bool readIn = dstOut;
		float *in = readIn ? d->dev_inSrc : d->dev_outSrc;
		float *out = readIn ? d->dev_outSrc : d->dev_inSrc;

		// Keep the heaters at their fixed temperature, then diffuse.
		copy_const_kernal<<<grids, blocks>>>(in);
		blend_kernal<<<grids, blocks>>>(out, readIn);
		dstOut = !readIn;
	}
	// After an even number of steps the newest grid is dev_inSrc again.
	// Map its float temperatures to colours so they can be displayed.
	float_to_color<<<grids, blocks>>>(d->output_bitmap, d->dev_inSrc);

	// Launches do not return a status; fetch it explicitly, then check the
	// blocking copy as well.
	cudaError_t status = cudaGetLastError();
	if (status == cudaSuccess)
	{
		status = cudaMemcpy(bitmap->get_ptr(), d->output_bitmap, bitmap->image_size(), cudaMemcpyDeviceToHost);
	}
	if (status != cudaSuccess)
	{
		printf("CUDA error in anim_gpu: %s\n", cudaGetErrorString(status));
		return;
	}

	cudaEventRecord(d->stop, 0);
	cudaEventSynchronize(d->stop);
	float elapsedTime;
	cudaEventElapsedTime(&elapsedTime, d->start, d->stop);
	d->totalTime += elapsedTime;
	++d->frames;
	printf("Average Time per frame:%3.1fms\n", d->totalTime / d->frames);
}

// Exit callback: unbinds the textures, then releases the device buffers and
// timing events held in d.
void anim_exit(DataBlock *d)
{
	// Unbind before the underlying buffers are freed.
	cudaUnbindTexture(texIn);
	cudaUnbindTexture(texOut);
	cudaUnbindTexture(texConstSrc);

	cudaFree(d->dev_constSrc);
	cudaFree(d->dev_inSrc);
	cudaFree(d->dev_outSrc);
	// The pixel buffer is a cudaMalloc allocation too and used to be leaked.
	cudaFree(d->output_bitmap);

	cudaEventDestroy(d->start);
	cudaEventDestroy(d->stop);
}

// Program entry: allocates the device buffers, binds them to the 1D texture
// references, builds the heater pattern and the initial temperature grid on
// the host, uploads both, and hands control to the animation loop
// (anim_gpu per frame, anim_exit on shutdown).
// Returns 1 if a CUDA call or the host allocation fails.
int main()
{
	DataBlock data;
	CPUAnimBitmap bitmap(DIM, DIM, &data);
	data.bitmap = &bitmap;
	data.totalTime = 0;
	data.frames = 0;

	// The temperature grids hold DIM*DIM floats (the fill loops below index
	// exactly that many), so size them explicitly instead of borrowing the
	// bitmap's byte count.
	const size_t gridBytes = sizeof(float) * DIM * DIM;

	// Each step runs only if every earlier one succeeded.
	cudaError_t status = cudaEventCreate(&data.start);
	if (status == cudaSuccess) status = cudaEventCreate(&data.stop);
	if (status == cudaSuccess) status = cudaMalloc((void **)&data.output_bitmap, bitmap.image_size());
	if (status == cudaSuccess) status = cudaMalloc((void **)&data.dev_constSrc, gridBytes);
	if (status == cudaSuccess) status = cudaMalloc((void **)&data.dev_inSrc, gridBytes);
	if (status == cudaSuccess) status = cudaMalloc((void **)&data.dev_outSrc, gridBytes);

	// Once allocated, bind each grid to its texture reference.
	if (status == cudaSuccess) status = cudaBindTexture(NULL, texConstSrc, data.dev_constSrc, gridBytes);
	if (status == cudaSuccess) status = cudaBindTexture(NULL, texIn, data.dev_inSrc, gridBytes);
	if (status == cudaSuccess) status = cudaBindTexture(NULL, texOut, data.dev_outSrc, gridBytes);
	if (status != cudaSuccess)
	{
		printf("CUDA setup failed: %s\n", cudaGetErrorString(status));
		return 1;
	}

	// Host scratch grid used to initialise dev_constSrc and dev_inSrc.
	float *temp = (float *)malloc(gridBytes);
	if (!temp)
	{
		printf("Host allocation for the temperature grid failed\n");
		return 1;
	}

	// Heater pattern: one large hot rectangle on an otherwise zero grid ...
	for (int i = 0; i < DIM*DIM; i++)
	{
		temp[i] = 0;
		int x = i % DIM;
		int y = i / DIM;
		if ((x>300) && (x<600) && (y>310) && (y < 601))
		{
			temp[i] = MAX_TEMP;
		}
	}
	// ... a few single-cell heaters ...
	temp[DIM * 100 + 100] = (MAX_TEMP + MIN_TEMP) / 2;
	temp[DIM * 700 + 100] = MIN_TEMP;
	temp[DIM * 300 + 300] = MIN_TEMP;
	temp[DIM * 200 + 700] = MIN_TEMP;
	// ... and one cold square.
	for (int y = 800; y < 900; y++)
	{
		for (int x = 400; x < 500; x++)
		{
			temp[x + y*DIM] = MIN_TEMP;
		}
	}
	// Upload the heater pattern.
	status = cudaMemcpy(data.dev_constSrc, temp, gridBytes, cudaMemcpyHostToDevice);

	// Add a hot block that is part of the initial state only, not a heater.
	for (int y = 800; y < DIM; y++)
	{
		for (int x = 0; x < 200; x++)
		{
			temp[x + y*DIM] = MAX_TEMP;
		}
	}
	// Upload the initial temperature grid.
	if (status == cudaSuccess)
	{
		status = cudaMemcpy(data.dev_inSrc, temp, gridBytes, cudaMemcpyHostToDevice);
	}

	// The host scratch grid is no longer needed.
	free(temp);
	if (status != cudaSuccess)
	{
		printf("CUDA upload failed: %s\n", cudaGetErrorString(status));
		return 1;
	}

	// anim_gpu renders every frame; anim_exit releases resources on exit.
	bitmap.anim_and_exit((void(*) (void *, int))anim_gpu, (void(*)(void *))anim_exit);
}

6.运行结果
在这里插入图片描述

bang!!!性能并没有得到提升，反而急剧恶化，不知所措。一个可能的原因（仅为推测，尚未验证）是：较新的GPU本身会缓存普通的全局内存读取，纹理路径未必还有优势。如果有哪位同学遇到跟我同样的问题，欢迎联系我讨论。

GPU使用二维纹理内存的热传导模拟计算

1. 二维纹理内存变量引用的声明

// File-scope 2D texture references, read in kernels with tex2D().
// The template arguments spell out the defaults of texture<float,2>.
// NOTE(review): texture references are a legacy API — confirm the toolkit in
// use still supports them.
texture<float, cudaTextureType2D, cudaReadModeElementType> texConstSrc;
texture<float, cudaTextureType2D, cudaReadModeElementType> texIn;
texture<float, cudaTextureType2D, cudaReadModeElementType> texOut;

2.二维纹理内存引用变量与实际内存缓冲区的绑定
通过cudaBindTexture2D()可以将纹理内存引用变量与内存缓冲区进行绑定。
绑定二维纹理时，CUDA运行时要求提供一个cudaChannelFormatDesc，保持默认设置就可以了。

	// Device buffers: the pixel buffer followed by the three temperature grids.
	cudaMalloc((void **)&data.output_bitmap, bitmap.image_size());
	cudaMalloc((void**)&data.dev_constSrc, bitmap.image_size());
	cudaMalloc((void **)&data.dev_inSrc, bitmap.image_size());
	cudaMalloc((void **)&data.dev_outSrc, bitmap.image_size());

	// After allocation, bind each grid to the texture reference declared earlier.
	// A 2D binding needs a channel descriptor plus width, height and row pitch in bytes.
	cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
	cudaBindTexture2D(NULL, texConstSrc, data.dev_constSrc, desc, DIM, DIM, sizeof(float)*DIM);
	cudaBindTexture2D(NULL, texIn, data.dev_inSrc, desc, DIM, DIM, sizeof(float)*DIM);
	cudaBindTexture2D(NULL, texOut, data.dev_outSrc, desc, DIM, DIM, sizeof(float)*DIM);

3.读取二维纹理内存方法
通过tex2D()读取二维纹理内存

// One diffusion step over the DIM x DIM grid, reading through 2D textures:
//   dst = c + SPEED * (top + bottom + left + right - 4 * c)
//   dst    - grid that receives the updated temperatures
//   dstOut - true: read the current grid from texIn; false: from texOut
// Launch: 2D grid of 2D blocks with at least one thread per cell; threads
// that fall outside the grid do nothing.
__global__ void blend_kernal(float *dst, bool dstOut)
{
	// Map threadIdx/blockIdx to a pixel position.
	int x = threadIdx.x + blockIdx.x*blockDim.x;
	int y = threadIdx.y + blockIdx.y*blockDim.y;

	// Guard against launches that overshoot the grid.
	if (x >= DIM || y >= DIM) return;

	// Row-major index of the cell written by this thread.
	int offset = x + y*DIM;

	// Neighbours are addressed by coordinate, so no linear neighbour indices
	// are needed. Coordinates one step past an edge are passed to tex2D as-is.
	// NOTE(review): this relies on the texture's addressing mode handling
	// out-of-range coordinates (presumably the default clamp) — confirm.
	float t, b, l, r, c;
	if (dstOut)
	{
		t = tex2D(texIn, x, y - 1);
		l = tex2D(texIn, x - 1, y);
		c = tex2D(texIn, x, y);
		r = tex2D(texIn, x + 1, y);
		b = tex2D(texIn, x, y + 1);
	}
	else
	{
		t = tex2D(texOut, x, y - 1);
		l = tex2D(texOut, x - 1, y);
		c = tex2D(texOut, x, y);
		r = tex2D(texOut, x + 1, y);
		b = tex2D(texOut, x, y + 1);
	}

	dst[offset] = c + SPEED*(t + b + l + r - 4 * c);
}

4.完整代码

// 代码7.3.4使用二维纹理内存的热传导模型计算
//时间：2019.07.29
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include "cpu_anim.h"
#include "cuda.h"
#include "book.h"

// Side length, in cells, of the square simulation grid (also the bitmap size).
#define DIM 1024
// Rate constant k applied by blend_kernal in the temperature update.
#define SPEED 0.25f
// Not referenced by the code shown in this program.
#define PI 3.1415926535897932f
// Temperature that main writes into the hot heater cells.
#define MAX_TEMP 1.0f
// Temperature that main writes into the cold heater cells.
#define MIN_TEMP 0.0001f

// File-scope 2D texture references; main binds them to the device grids and
// the kernels read them with tex2D(). The template arguments spell out the
// defaults of texture<float,2>.
// NOTE(review): texture references are a legacy API — confirm the toolkit in
// use still supports them.
texture<float, cudaTextureType2D, cudaReadModeElementType> texConstSrc;
texture<float, cudaTextureType2D, cudaReadModeElementType> texIn;
texture<float, cudaTextureType2D, cudaReadModeElementType> texOut;




// Bundle of state handed to the frame callback (anim_gpu) and the exit
// callback (anim_exit).
struct DataBlock
{
	// Device pixel buffer written by float_to_color.
	unsigned char *output_bitmap;

	// Device temperature grids; main binds them to the 2D textures texIn,
	// texOut and texConstSrc respectively.
	float *dev_inSrc;
	float *dev_outSrc;
	float *dev_constSrc;

	// Host-side bitmap that receives each finished frame.
	CPUAnimBitmap *bitmap;

	// Events recorded around one frame's GPU work.
	cudaEvent_t start;
	cudaEvent_t stop;

	// Total frame time in milliseconds, and the count of frames rendered.
	float totalTime;
	float frames;
};

// Re-stamps the heater cells onto a grid: wherever texConstSrc holds a
// non-zero value at (x, y), that value overwrites the matching cell of iptr.
// Launch: 2D grid of 2D blocks, one thread per cell; there is no bounds check.
__global__ void copy_const_kernal(float *iptr)
{
	// Pixel coordinates of this thread's cell.
	const int col = blockIdx.x * blockDim.x + threadIdx.x;
	const int row = blockIdx.y * blockDim.y + threadIdx.y;
	// Row-major index into iptr; the row pitch is the launch width in threads.
	const int idx = row * (blockDim.x * gridDim.x) + col;

	const float heater = tex2D(texConstSrc, col, row);
	if (heater != 0)
	{
		iptr[idx] = heater;
	}
}

// One diffusion step over the DIM x DIM grid, reading through 2D textures:
//   dst = c + SPEED * (top + bottom + left + right - 4 * c)
//   dst    - grid that receives the updated temperatures
//   dstOut - true: read the current grid from texIn; false: from texOut
// Launch: 2D grid of 2D blocks with at least one thread per cell; threads
// that fall outside the grid do nothing.
__global__ void blend_kernal(float *dst, bool dstOut)
{
	// Map threadIdx/blockIdx to a pixel position.
	int x = threadIdx.x + blockIdx.x*blockDim.x;
	int y = threadIdx.y + blockIdx.y*blockDim.y;

	// Guard against launches that overshoot the grid.
	if (x >= DIM || y >= DIM) return;

	// Row-major index of the cell written by this thread.
	int offset = x + y*DIM;

	// Neighbours are addressed by coordinate, so the linear neighbour indices
	// the 1D version computed are not needed here. Coordinates one step past
	// an edge are passed to tex2D as-is.
	// NOTE(review): this relies on the texture's addressing mode handling
	// out-of-range coordinates (presumably the default clamp) — confirm.
	float t, b, l, r, c;
	if (dstOut)
	{
		t = tex2D(texIn, x, y - 1);
		l = tex2D(texIn, x - 1, y);
		c = tex2D(texIn, x, y);
		r = tex2D(texIn, x + 1, y);
		b = tex2D(texIn, x, y + 1);
	}
	else
	{
		t = tex2D(texOut, x, y - 1);
		l = tex2D(texOut, x - 1, y);
		c = tex2D(texOut, x, y);
		r = tex2D(texOut, x + 1, y);
		b = tex2D(texOut, x, y + 1);
	}

	dst[offset] = c + SPEED*(t + b + l + r - 4 * c);
}





// Per-frame callback: advances the simulation by 90 time steps, converts the
// newest grid to pixels, copies them into the host bitmap and prints the
// running average time per frame.
//   d     - simulation state (device buffers, timing events, counters)
//   ticks - passed in by the animation framework; not used here
// If a kernel launch or the copy fails, the error is printed and the frame
// is abandoned without updating the timing counters.
void anim_gpu(DataBlock *d, int ticks)
{
	cudaEventRecord(d->start, 0);
	dim3 grids(DIM / 16, DIM / 16);
	dim3 blocks(16, 16);
	CPUAnimBitmap *bitmap = d->bitmap;

	// The texture references are global and stay bound to fixed buffers, so
	// the buffers cannot simply be swapped; a flag selects which one is the
	// input on each iteration instead.
	// NOTE(review): volatile is kept from the original; it looks unnecessary
	// for a host-side local — confirm before removing.
	volatile bool dstOut = true;
	for (int i = 0; i < 90; i++)
	{
		const bool readIn = dstOut;
		float *in = readIn ? d->dev_inSrc : d->dev_outSrc;
		float *out = readIn ? d->dev_outSrc : d->dev_inSrc;

		// Keep the heaters at their fixed temperature, then diffuse.
		copy_const_kernal<<<grids, blocks>>>(in);
		blend_kernal<<<grids, blocks>>>(out, readIn);
		dstOut = !readIn;
	}
	// After an even number of steps the newest grid is dev_inSrc again.
	// Map its float temperatures to colours so they can be displayed.
	float_to_color<<<grids, blocks>>>(d->output_bitmap, d->dev_inSrc);

	// Launches do not return a status; fetch it explicitly, then check the
	// blocking copy as well.
	cudaError_t status = cudaGetLastError();
	if (status == cudaSuccess)
	{
		status = cudaMemcpy(bitmap->get_ptr(), d->output_bitmap, bitmap->image_size(), cudaMemcpyDeviceToHost);
	}
	if (status != cudaSuccess)
	{
		printf("CUDA error in anim_gpu: %s\n", cudaGetErrorString(status));
		return;
	}

	cudaEventRecord(d->stop, 0);
	cudaEventSynchronize(d->stop);
	float elapsedTime;
	cudaEventElapsedTime(&elapsedTime, d->start, d->stop);
	d->totalTime += elapsedTime;
	++d->frames;
	printf("Average Time per frame:%3.1fms\n", d->totalTime / d->frames);
}

// Exit callback: unbinds the textures, then releases the device buffers and
// timing events held in d.
void anim_exit(DataBlock *d)
{
	// Unbind before the underlying buffers are freed.
	cudaUnbindTexture(texIn);
	cudaUnbindTexture(texOut);
	cudaUnbindTexture(texConstSrc);

	cudaFree(d->dev_constSrc);
	cudaFree(d->dev_inSrc);
	cudaFree(d->dev_outSrc);
	// The pixel buffer is a cudaMalloc allocation too and used to be leaked.
	cudaFree(d->output_bitmap);

	cudaEventDestroy(d->start);
	cudaEventDestroy(d->stop);
}

// Program entry: allocates the device buffers, binds them to the 2D texture
// references, builds the heater pattern and the initial temperature grid on
// the host, uploads both, and hands control to the animation loop
// (anim_gpu per frame, anim_exit on shutdown).
// Returns 1 if a CUDA call or the host allocation fails.
int main()
{
	DataBlock data;
	CPUAnimBitmap bitmap(DIM, DIM, &data);
	data.bitmap = &bitmap;
	data.totalTime = 0;
	data.frames = 0;

	// The temperature grids hold DIM*DIM floats (the fill loops below index
	// exactly that many), so size them explicitly instead of borrowing the
	// bitmap's byte count.
	const size_t rowBytes = sizeof(float) * DIM;
	const size_t gridBytes = rowBytes * DIM;

	// Each step runs only if every earlier one succeeded.
	cudaError_t status = cudaEventCreate(&data.start);
	if (status == cudaSuccess) status = cudaEventCreate(&data.stop);
	if (status == cudaSuccess) status = cudaMalloc((void **)&data.output_bitmap, bitmap.image_size());
	if (status == cudaSuccess) status = cudaMalloc((void **)&data.dev_constSrc, gridBytes);
	if (status == cudaSuccess) status = cudaMalloc((void **)&data.dev_inSrc, gridBytes);
	if (status == cudaSuccess) status = cudaMalloc((void **)&data.dev_outSrc, gridBytes);

	// Once allocated, bind each grid to its texture reference. A 2D binding
	// takes a channel descriptor plus width, height and row pitch in bytes.
	cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
	if (status == cudaSuccess) status = cudaBindTexture2D(NULL, texConstSrc, data.dev_constSrc, desc, DIM, DIM, rowBytes);
	if (status == cudaSuccess) status = cudaBindTexture2D(NULL, texIn, data.dev_inSrc, desc, DIM, DIM, rowBytes);
	if (status == cudaSuccess) status = cudaBindTexture2D(NULL, texOut, data.dev_outSrc, desc, DIM, DIM, rowBytes);
	if (status != cudaSuccess)
	{
		printf("CUDA setup failed: %s\n", cudaGetErrorString(status));
		return 1;
	}

	// Host scratch grid used to initialise dev_constSrc and dev_inSrc.
	float *temp = (float *)malloc(gridBytes);
	if (!temp)
	{
		printf("Host allocation for the temperature grid failed\n");
		return 1;
	}

	// Heater pattern: one large hot rectangle on an otherwise zero grid ...
	for (int i = 0; i < DIM*DIM; i++)
	{
		temp[i] = 0;
		int x = i % DIM;
		int y = i / DIM;
		if ((x>300) && (x<600) && (y>310) && (y < 601))
		{
			temp[i] = MAX_TEMP;
		}
	}
	// ... a few single-cell heaters ...
	temp[DIM * 100 + 100] = (MAX_TEMP + MIN_TEMP) / 2;
	temp[DIM * 700 + 100] = MIN_TEMP;
	temp[DIM * 300 + 300] = MIN_TEMP;
	temp[DIM * 200 + 700] = MIN_TEMP;
	// ... and one cold square.
	for (int y = 800; y < 900; y++)
	{
		for (int x = 400; x < 500; x++)
		{
			temp[x + y*DIM] = MIN_TEMP;
		}
	}
	// Upload the heater pattern.
	status = cudaMemcpy(data.dev_constSrc, temp, gridBytes, cudaMemcpyHostToDevice);

	// Add a hot block that is part of the initial state only, not a heater.
	for (int y = 800; y < DIM; y++)
	{
		for (int x = 0; x < 200; x++)
		{
			temp[x + y*DIM] = MAX_TEMP;
		}
	}
	// Upload the initial temperature grid.
	if (status == cudaSuccess)
	{
		status = cudaMemcpy(data.dev_inSrc, temp, gridBytes, cudaMemcpyHostToDevice);
	}

	// The host scratch grid is no longer needed.
	free(temp);
	if (status != cudaSuccess)
	{
		printf("CUDA upload failed: %s\n", cudaGetErrorString(status));
		return 1;
	}

	// anim_gpu renders every frame; anim_exit releases resources on exit.
	bitmap.anim_and_exit((void(*) (void *, int))anim_gpu, (void(*)(void *))anim_exit);
}

5.运行结果
在这里插入图片描述

bang!!!使用二维纹理内存后，性能同样没有得到提升，反而急剧恶化，不知所措。可能的原因（仅为推测，尚未验证）是较新的GPU已经会缓存普通的全局内存读取。如果有哪位同学遇到跟我同样的问题，欢迎联系我讨论。