#include <cuda.h>
#include "book.h"
#include "cpu_anim.h"
#define DIM 1024
#define PI 3.141592653f
#define MAX_TEMP 1.0f
#define MIN_TEMP 0.0001f
#define SPEED 0.25f
/*在温度更新计算的内存访问模式中存在着巨大的内存空间局部性,这种访问模式可以通过GPU
纹理内存来加速。使用一维纹理内存*/
//首先,输入数声明为texture类型的引用
texture<float> texConstSrc;
texture<float> texIn;
texture<float> texOut;
//DataBlock类型的定义
struct DataBlock
{
unsigned char *output_bitmap;
float *dev_inSrc;
float *dev_outSrc;
float *dev_constSrc;
CPUAnimBitmap *bitmap;
cudaEvent_t start,stop;
float totalTime;
float frames;
};
/*给定一个输入温度格网,根据公式更新计算输出设出温度网格
当读取核函数中的纹理时,需要通过特殊的函数来告诉GPU将读取请
求转发到纹理内存而不是标准的全局呐存。因此,当读取内存时不再
使用方括号从缓冲区中讲习班取,而是将blend_kernel()改为使用
tex1Dfetch()函数。 tex1Dfetch()是编译器内置函数(Intrinsic)。
由于纹理引用必须声明为文件作用域内的全局变量,因此我们不再将
输入及缓冲区和输出缓冲区作为参数传递给blend_kernel(),
因为编译器需要在编译时知道tex1Dfetch()应该对哪些纹理采样。
将一个布尔标志dstOut传递给blend_kernel(),这个标志会告诉我们
使用的是哪个缓冲区作为输入,
以及哪个缓冲区作为输出*/
__global__ void blend_kernel(float *dst,bool dstOut)
{
//将threadIdx/BlockIdx映射到像素位置
int x=threadIdx.x+blockIdx.x*blockDim.x;
int y=threadIdx.y+blockIdx.y*blockDim.y;
int offset=x+y*blockDim.x*gridDim.x;
//上下左右像素的计算及边界趆界处理
int left=offset-1;
int right=offset+1;
if(x==0)left++;
if(x==DIM-1)right--;
int top=offset-DIM;
int bottom=offset+DIM;
if(y==0)top+=DIM;
if(y==DIM-1)bottom-=DIM;
float t,l,c,r,b;
if(dstOut)
{
//dstOut为真时,访问的是输入缓冲区
t=tex1Dfetch(texIn,top);
l=tex1Dfetch(texIn,left);
c=tex1Dfetch(texIn,offset);
r=tex1Dfetch(texIn,right);
b=tex1Dfetch(texIn,bottom);
}
else
{
//dstOut为假时,访问的是输出缓冲区
t=tex1Dfetch(texOut,top);
l=tex1Dfetch(texOut,left);
c=tex1Dfetch(texOut,offset);
r=tex1Dfetch(texOut,right);
b=tex1Dfetch(texOut,bottom);
}
dst[offset]=c+SPEED*(t+b+r+l-4*c);//计算当前点的像素
}
//从纹理内存中读取包含热源位置和温度的缓冲区
__global__ void copy_const_kernel(float *iptr)
{
//将threadIdx/BlockIdx映射到像素位置
int x=threadIdx.x+blockIdx.x*blockDim.x;
int y=threadIdx.y+blockIdx.y*blockDim.y;
int offset=x+y*blockDim.x*gridDim.x;
float c=tex1Dfetch(texConstSrc,offset);
if(c!=0)
iptr[offset]=c;
}
//
void anim_gpu(DataBlock *d,int ticks)
{
HANDLE_ERROR(cudaEventRecord(d->start,0));
dim3 blocks(DIM/16,DIM/16);
dim3 threads(16,16);
CPUAnimBitmap *bitmap=d->bitmap;
//由于tex是全局并且有界的,因此我们必须通过一个标志来选择每次迭代中哪个是输入/输出
volatile bool dstOut=true;// 标志输入缓冲区还是输出缓冲区,初值为真
for (int i=0;i<90;i++)
{
float *in,*out;
if(dstOut)
{
in=d->dev_inSrc;
out=d->dev_outSrc;
}
else
{
out=d->dev_inSrc;
in=d->dev_outSrc;
}
copy_const_kernel<<<blocks,threads>>>(in);
blend_kernel<<<blocks,threads>>>(out,dstOut);
dstOut=!dstOut;//在每组调用之后通过设置dstOut=!dstOut来进行切换
}
float_to_color<<<blocks,threads>>>(d->output_bitmap,d->dev_inSrc);
HANDLE_ERROR(cudaMemcpy(bitmap->get_ptr(),d->output_bitmap,bitmap->image_size(),cudaMemcpyDeviceToHost));
HANDLE_ERROR(cudaEventRecord(d->stop,0));
HANDLE_ERROR(cudaEventSynchronize(d->stop));
float elapsedTime;
HANDLE_ERROR(cudaEventElapsedTime(&elapsedTime,d->start,d->stop));
d->totalTime+=elapsedTime;
++d->frames;
printf("Average Time per frame:%3.1f ms\n",d->totalTime/d->frames);
}
//释放GPU上分配全局缓冲区,清除与纹理的绑定
void anim_exit(DataBlock *d)
{
cudaUnbindTexture(texIn);
cudaUnbindTexture(texOut);
cudaUnbindTexture(texConstSrc);
cudaFree(d->dev_inSrc);
cudaFree(d->dev_outSrc);
cudaFree(d->dev_constSrc);
HANDLE_ERROR(cudaEventDestroy(d->start));
HANDLE_ERROR(cudaEventDestroy(d->stop));
}
int main(void)
{
DataBlock data;
CPUAnimBitmap bitmap(DIM,DIM,&data);
data.bitmap=&bitmap;
data.totalTime=0;
data.frames=0;
int imageSize = bitmap.image_size();
//分配内存
HANDLE_ERROR(cudaMalloc((void**)&data.dev_inSrc,imageSize));
HANDLE_ERROR(cudaMalloc((void**)&data.dev_outSrc,imageSize));
HANDLE_ERROR(cudaMalloc((void**)&data.dev_constSrc,imageSize));
//将dev_inSrc,dev_outSrc,dev_constSrc绑定到纹理引用(texConstSrc,texIn,texOut)
HANDLE_ERROR(cudaBindTexture(NULL,texConstSrc,data.dev_constSrc,imageSize));
HANDLE_ERROR(cudaBindTexture(NULL,texIn,data.dev_inSrc,imageSize));
HANDLE_ERROR(cudaBindTexture(NULL,texOut,data.dev_outSrc,imageSize));
HANDLE_ERROR(cudaEventCreate(&data.start));
HANDLE_ERROR(cudaEventCreate(&data.stop));
HANDLE_ERROR(cudaMalloc((void **)&data.output_bitmap,bitmap.image_size()));
float *temp=(float*)malloc(bitmap.image_size());
for(int i=0;i<DIM*DIM;i++)
{
temp[i]=0;
int x=i%DIM;
int y=i/DIM;
if((x>300)&&(x<600)&&(y>310)&&(y<601))
temp[i]=MAX_TEMP;
}
temp[DIM*100+100]=(MAX_TEMP+MIN_TEMP)/2;
temp[DIM*700+100]=MIN_TEMP;
temp[DIM*300+300]=MIN_TEMP;
temp[DIM*200+700]=MIN_TEMP;
for(int y=800;y<900;y++)
{
for(int x=400;x<500;x++)
{
temp[x+y*DIM]=MIN_TEMP;
}
}
HANDLE_ERROR(cudaMemcpy(data.dev_constSrc,temp,bitmap.image_size(),cudaMemcpyHostToDevice));
for(int y=800;y<DIM;y++)
{
for(int x=0;x<200;x++)
{
temp[x+y*DIM]=MAX_TEMP;
}
}
HANDLE_ERROR(cudaMemcpy(data.dev_inSrc,temp,bitmap.image_size(),cudaMemcpyHostToDevice));
free(temp);
bitmap.anim_and_exit((void(*)(void*,int))anim_gpu,(void(*)(void*))anim_exit);
}