1. 纹理内存使用
纹理内存的使用依赖于API函数。下面直接给出常见的使用流程:
1.1 声明纹理变量
texture<Type, Dim, ReadMode> VarName;
//Type: 前面提到的基本的整型和浮点类型,以及其它的对齐类型
//Dim: 纹理数组的维度,值为1或2或3,默认缺省为1
//ReadMode:cudaReadModelNormalizedFloat 或 cudaReadModelElementType(默认)
NOTE:
cudaReadModelNormalizedFloat:如果Type为整型(8bit或者16bit),则读取数据时会自动将整型数据转化为浮点数。具体地,如果是无符号整型,则转化为[0 1]之间的浮点数;如果是有符号整型,则转化为[-1 1]之间的浮点数。
cudaReadModelElementType:默认值,不进行任何转换
1.2 创建内存
分配内存,内存形式有两种 线性内存 和 CUDA数组。
线性内存通过cudaMalloc()、cudaMallocPitch()或者cudaMalloc3D()分配;
CUDA数组可以通过cudaMalloc3DArray()或者cudaMallocArray()分配。前者可以分配1D、2D、3D的数组,后者一般用于分配2D的CUDA数组。
比如开辟一个二维CUDA数组ArrayName(64 x 64):
//ex:
cudaMalloc( (void**)&data.output_bitmap,imageSize );
cudaMalloc( (void**)&data.dev_inSrc,imageSize );
cudaMalloc( (void**)&data.dev_outSrc,imageSize );
cudaMalloc( (void**)&data.dev_constSrc,imageSize );
1.3 绑定纹理内存
纹理绑定(texture binding)的作用有两个:
- 将指定的缓冲区作为纹理来使用;
- 纹理引用作为纹理的“名字”
通常使用cudaBindTexture() 或 cudaBindTexture2D() 分别将线性内存绑定到1D和2D纹理内存,使用cudaBindTextureToArray()将CUDA数组与纹理绑定。
//ex:将之前的texConstSrc,texIn,texOut与内存绑定
cudaBindTexture( NULL, texConstSrc,
data.dev_constSrc,
imageSize )
cudaBindTexture( NULL, texConstSrc,
data.dev_inSrc,
mageSize )
cudaBindTexture( NULL, texConstSrc,
data.dev_outSrc,
imageSize )
此时,纹理变量设置完毕。可以启用核函数。当读取核函数中的纹理时,需要通过特殊函数告诉GPU将读取请求转发给纹理内存而非标准的全局内存。因此,读取内存时不再使用方括号从缓冲区读取,而是将blend_kernel()改为tex1Dfetch()
1.4 读取
普通变量读取方式:
__global__ void blend_kernel(float *outSrc, const float *inSrc){
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x
int left = offset -1;
if (x == 0)
left++;
int right = offset + 1;
if (x == DIM -1)
right--;
int top = offset -DIM;
if (y == 0)
top += DIM;
int bottom = offset + DIM;
if (y == DIM -1)
bottom -= DIM;
outSrc[offset] = inSrc[offset] +
SPEED * (inSrc[top] + inSrc[bottom] + inSrc[left] + inSrc[right]
- 4 * inSrc[offset] );
}
纹理变量的读取方式:
__global__ void blend_kernel(float *dst, bool dstOut){
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x
int left = offset -1;
if (x == 0)
left++;
int right = offset + 1;
if (x == DIM -1)
right--;
int top = offset -DIM;
if (y == 0)
top += DIM;
int bottom = offset + DIM;
if (y == DIM -1)
bottom -= DIM;
float t, l, c, r, b;
if (destOut){
t = tex1Dfetch(texIn, top);
l = tex1Dfetch(texIn, left);
c = tex1Dfetch(texIn, offset);
r = tex1Dfetch(texIn, right);
b = tex1Dfetch(texIn, bottom);
}else{
t = tex1Dfetch(texOut, top);
l = tex1Dfetch(texOut, left);
c = tex1Dfetch(texOut, offset);
r = tex1Dfetch(texOut, right);
b = tex1Dfetch(texOut, bottom);
}
dst[offset] = c + SPEED * (t + b + r + l - 4 * c);
}
1.5 解绑
cudaUnbindTexture (tex);
cudaFree(devPtr);
具体关于纹波的操作可以参看此链接。
2. 书本例子:
2.1二维纹理:
#include <stdio.h>
#include <GL/glut.h>
#define DIM 1024
#define PI 3.1415926535897932f
#define MAX_TEMP 1.0f
#define MIN_TEMP 0.0001f
#define SPEED 0.25f
// a place for common kernels - starts here
__device__ unsigned char value( float n1, float n2, int hue ) {
if (hue > 360) hue -= 360;
else if (hue < 0) hue += 360;
if (hue < 60)
return (unsigned char)(255 * (n1 + (n2-n1)*hue/60));
if (hue < 180)
return (unsigned char)(255 * n2);
if (hue < 240)
return (unsigned char)(255 * (n1 + (n2-n1)*(240-hue)/60));
return (unsigned char)(255 * n1);
}
__global__ void float_to_color( unsigned char *optr,
const float *outSrc ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
float l = outSrc[offset];
float s = 1;
int h = (180 + (int)(360.0f * outSrc[offset])) % 360;
float m1, m2;
if (l <= 0.5f)
m2 = l * (1 + s);
else
m2 = l + s - l * s;
m1 = 2 * l - m2;
optr[offset*4 + 0] = value( m1, m2, h+120 );
optr[offset*4 + 1] = value( m1, m2, h );
optr[offset*4 + 2] = value( m1, m2, h -120 );
optr[offset*4 + 3] = 255;
}
struct CPUAnimBitmap {
unsigned char *pixels;
int width, height;
void *dataBlock;
void (*fAnim)(void*,int);
void (*animExit)(void*);
void (*clickDrag)(void*,int,int,int,int);
int dragStartX, dragStartY;
CPUAnimBitmap( int w, int h, void *d = NULL ) {
width = w;
height = h;
pixels = new unsigned char[width * height * 4];
dataBlock = d;
clickDrag = NULL;
}
~CPUAnimBitmap() {
delete [] pixels;
}
unsigned char* get_ptr( void ) const { return pixels; }
long image_size( void ) const { return width * height * 4; }
void click_drag( void (*f)(void*,int,int,int,int)) {
clickDrag = f;
}
void anim_and_exit( void (*f)(void*,int), void(*e)(void*) ) {
CPUAnimBitmap** bitmap = get_bitmap_ptr();
*bitmap = this;
fAnim = f;
animExit = e;
// a bug in the Windows GLUT implementation prevents us from
// passing zero arguments to glutInit()
int c=1;
char* dummy = "";
glutInit( &c, &dummy );
glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
glutInitWindowSize( width, height );
glutCreateWindow( "bitmap" );
glutKeyboardFunc(Key);
glutDisplayFunc(Draw);
if (clickDrag != NULL)
glutMouseFunc( mouse_func );
glutIdleFunc( idle_func );
glutMainLoop();
}
// static method used for glut callbacks
static CPUAnimBitmap** get_bitmap_ptr( void ) {
static CPUAnimBitmap* gBitmap;
return &gBitmap;
}
// static method used for glut callbacks
static void mouse_func( int button, int state,
int mx, int my ) {
if (button == GLUT_LEFT_BUTTON) {
CPUAnimBitmap* bitmap = *(get_bitmap_ptr());
if (state == GLUT_DOWN) {
bitmap->dragStartX = mx;
bitmap->dragStartY = my;
} else if (state == GLUT_UP) {
bitmap->clickDrag( bitmap->dataBlock,
bitmap->dragStartX,
bitmap->dragStartY,
mx, my );
}
}
}
// static method used for glut callbacks
static void idle_func( void ) {
static int ticks = 1;
CPUAnimBitmap* bitmap = *(get_bitmap_ptr());
bitmap->fAnim( bitmap->dataBlock, ticks++ );
glutPostRedisplay();
}
// static method used for glut callbacks
static void Key(unsigned char key, int x, int y) {
switch (key) {
case 27:
CPUAnimBitmap* bitmap = *(get_bitmap_ptr());
bitmap->animExit( bitmap->dataBlock );
//delete bitmap;
exit(0);
}
}
// static method used for glut callbacks
static void Draw( void ) {
CPUAnimBitmap* bitmap = *(get_bitmap_ptr());
glClearColor( 0.0, 0.0, 0.0, 1.0 );
glClear( GL_COLOR_BUFFER_BIT );
glDrawPixels( bitmap->width, bitmap->height, GL_RGBA, GL_UNSIGNED_BYTE, bitmap->pixels );
glutSwapBuffers();
}
};
// these exist on the GPU side
texture<float,2> texConstSrc;
texture<float,2> texIn;
texture<float,2> texOut;
__global__ void blend_kernel( float *dst,
bool dstOut ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
float t, l, c, r, b;
if (dstOut) {
t = tex2D(texIn,x,y-1);
l = tex2D(texIn,x-1,y);
c = tex2D(texIn,x,y);
r = tex2D(texIn,x+1,y);
b = tex2D(texIn,x,y+1);
} else {
t = tex2D(texOut,x,y-1);
l = tex2D(texOut,x-1,y);
c = tex2D(texOut,x,y);
r = tex2D(texOut,x+1,y);
b = tex2D(texOut,x,y+1);
}
dst[offset] = c + SPEED * (t + b + r + l - 4 * c);
}
__global__ void copy_const_kernel( float *iptr ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
float c = tex2D(texConstSrc,x,y);
if (c != 0)
iptr[offset] = c;
}
// globals needed by the update routine
struct DataBlock {
unsigned char *output_bitmap;
float *dev_inSrc;
float *dev_outSrc;
float *dev_constSrc;
CPUAnimBitmap *bitmap;
cudaEvent_t start, stop;
float totalTime;
float frames;
};
void anim_gpu( DataBlock *d, int ticks ) {
cudaEventRecord( d->start, 0 );
dim3 blocks(DIM/16,DIM/16);
dim3 threads(16,16);
CPUAnimBitmap *bitmap = d->bitmap;
// since tex is global and bound, we have to use a flag to
// select which is in/out per iteration
volatile bool dstOut = true;
for (int i=0; i<90; i++) {
float *in, *out;
if (dstOut) {
in = d->dev_inSrc;
out = d->dev_outSrc;
} else {
out = d->dev_inSrc;
in = d->dev_outSrc;
}
copy_const_kernel<<<blocks,threads>>>( in );
blend_kernel<<<blocks,threads>>>( out, dstOut );
dstOut = !dstOut;
}
float_to_color<<<blocks,threads>>>( d->output_bitmap,
d->dev_inSrc );
cudaMemcpy( bitmap->get_ptr(),
d->output_bitmap,
bitmap->image_size(),
cudaMemcpyDeviceToHost ) ;
cudaEventRecord( d->stop, 0 ) ;
cudaEventSynchronize( d->stop ) ;
float elapsedTime;
cudaEventElapsedTime( &elapsedTime,
d->start, d->stop );
d->totalTime += elapsedTime;
++d->frames;
printf( "Average Time per frame: %3.1f ms\n",
d->totalTime/d->frames );
}
// clean up memory allocated on the GPU
void anim_exit( DataBlock *d ) {
cudaUnbindTexture( texIn );
cudaUnbindTexture( texOut );
cudaUnbindTexture( texConstSrc );
cudaFree( d->dev_inSrc ) ;
cudaFree( d->dev_outSrc );
cudaFree( d->dev_constSrc );
cudaEventDestroy( d->start );
cudaEventDestroy( d->stop );
}
int main( void ) {
DataBlock data;
CPUAnimBitmap bitmap( DIM, DIM, &data );
data.bitmap = &bitmap;
data.totalTime = 0;
data.frames = 0;
cudaEventCreate( &data.start );
cudaEventCreate( &data.stop );
int imageSize = bitmap.image_size();
cudaMalloc( (void**)&data.output_bitmap,
imageSize );
// assume float == 4 chars in size (ie rgba)
cudaMalloc( (void**)&data.dev_inSrc,
imageSize );
cudaMalloc( (void**)&data.dev_outSrc,
imageSize );
cudaMalloc( (void**)&data.dev_constSrc,
imageSize );
cudaChannelFormatDesc desc = cudaCreateChannelDesc<float>();
cudaBindTexture2D( NULL, texConstSrc,
data.dev_constSrc,
desc, DIM, DIM,
sizeof(float) * DIM ) ;
cudaBindTexture2D( NULL, texIn,
data.dev_inSrc,
desc, DIM, DIM,
sizeof(float) * DIM ) ;
cudaBindTexture2D( NULL, texOut,
data.dev_outSrc,
desc, DIM, DIM,
sizeof(float) * DIM ) ;
// initialize the constant data
float *temp = (float*)malloc( imageSize );
for (int i=0; i<DIM*DIM; i++) {
temp[i] = 0;
int x = i % DIM;
int y = i / DIM;
if ((x>300) && (x<600) && (y>310) && (y<601))
temp[i] = MAX_TEMP;
}
temp[DIM*100+100] = (MAX_TEMP + MIN_TEMP)/2;
temp[DIM*700+100] = MIN_TEMP;
temp[DIM*300+300] = MIN_TEMP;
temp[DIM*200+700] = MIN_TEMP;
for (int y=800; y<900; y++) {
for (int x=400; x<500; x++) {
temp[x+y*DIM] = MIN_TEMP;
}
}
cudaMemcpy( data.dev_constSrc, temp,
imageSize,
cudaMemcpyHostToDevice ) ;
// initialize the input data
for (int y=800; y<DIM; y++) {
for (int x=0; x<200; x++) {
temp[x+y*DIM] = MAX_TEMP;
}
}
cudaMemcpy( data.dev_inSrc, temp,
imageSize,
cudaMemcpyHostToDevice );
free( temp );
bitmap.anim_and_exit( (void (*)(void*,int))anim_gpu,
(void (*)(void*))anim_exit );
}
编译,输出:
2.2 一维纹理:
#include <stdio.h>
#include <GL/glut.h>
#define DIM 1024
#define PI 3.1415926535897932f
#define MAX_TEMP 1.0f
#define MIN_TEMP 0.0001f
#define SPEED 0.25f
// these exist on the GPU side
texture<float> texConstSrc;
texture<float> texIn;
texture<float> texOut;
__device__ unsigned char value( float n1, float n2, int hue ) {
if (hue > 360) hue -= 360;
else if (hue < 0) hue += 360;
if (hue < 60)
return (unsigned char)(255 * (n1 + (n2-n1)*hue/60));
if (hue < 180)
return (unsigned char)(255 * n2);
if (hue < 240)
return (unsigned char)(255 * (n1 + (n2-n1)*(240-hue)/60));
return (unsigned char)(255 * n1);
}
__global__ void float_to_color( unsigned char *optr,
const float *outSrc ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
float l = outSrc[offset];
float s = 1;
int h = (180 + (int)(360.0f * outSrc[offset])) % 360;
float m1, m2;
if (l <= 0.5f)
m2 = l * (1 + s);
else
m2 = l + s - l * s;
m1 = 2 * l - m2;
optr[offset*4 + 0] = value( m1, m2, h+120 );
optr[offset*4 + 1] = value( m1, m2, h );
optr[offset*4 + 2] = value( m1, m2, h -120 );
optr[offset*4 + 3] = 255;
}
__global__ void float_to_color( uchar4 *optr,
const float *outSrc ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
float l = outSrc[offset];
float s = 1;
int h = (180 + (int)(360.0f * outSrc[offset])) % 360;
float m1, m2;
if (l <= 0.5f)
m2 = l * (1 + s);
else
m2 = l + s - l * s;
m1 = 2 * l - m2;
optr[offset].x = value( m1, m2, h+120 );
optr[offset].y = value( m1, m2, h );
optr[offset].z = value( m1, m2, h -120 );
optr[offset].w = 255;
}
struct CPUAnimBitmap {
unsigned char *pixels;
int width, height;
void *dataBlock;
void (*fAnim)(void*,int);
void (*animExit)(void*);
void (*clickDrag)(void*,int,int,int,int);
int dragStartX, dragStartY;
CPUAnimBitmap( int w, int h, void *d = NULL ) {
width = w;
height = h;
pixels = new unsigned char[width * height * 4];
dataBlock = d;
clickDrag = NULL;
}
~CPUAnimBitmap() {
delete [] pixels;
}
unsigned char* get_ptr( void ) const { return pixels; }
long image_size( void ) const { return width * height * 4; }
void click_drag( void (*f)(void*,int,int,int,int)) {
clickDrag = f;
}
void anim_and_exit( void (*f)(void*,int), void(*e)(void*) ) {
CPUAnimBitmap** bitmap = get_bitmap_ptr();
*bitmap = this;
fAnim = f;
animExit = e;
// a bug in the Windows GLUT implementation prevents us from
// passing zero arguments to glutInit()
int c=1;
char* dummy = "";
glutInit( &c, &dummy );
glutInitDisplayMode( GLUT_DOUBLE | GLUT_RGBA );
glutInitWindowSize( width, height );
glutCreateWindow( "bitmap" );
glutKeyboardFunc(Key);
glutDisplayFunc(Draw);
if (clickDrag != NULL)
glutMouseFunc( mouse_func );
glutIdleFunc( idle_func );
glutMainLoop();
}
// static method used for glut callbacks
static CPUAnimBitmap** get_bitmap_ptr( void ) {
static CPUAnimBitmap* gBitmap;
return &gBitmap;
}
// static method used for glut callbacks
static void mouse_func( int button, int state,
int mx, int my ) {
if (button == GLUT_LEFT_BUTTON) {
CPUAnimBitmap* bitmap = *(get_bitmap_ptr());
if (state == GLUT_DOWN) {
bitmap->dragStartX = mx;
bitmap->dragStartY = my;
} else if (state == GLUT_UP) {
bitmap->clickDrag( bitmap->dataBlock,
bitmap->dragStartX,
bitmap->dragStartY,
mx, my );
}
}
}
// static method used for glut callbacks
static void idle_func( void ) {
static int ticks = 1;
CPUAnimBitmap* bitmap = *(get_bitmap_ptr());
bitmap->fAnim( bitmap->dataBlock, ticks++ );
glutPostRedisplay();
}
// static method used for glut callbacks
static void Key(unsigned char key, int x, int y) {
switch (key) {
case 27:
CPUAnimBitmap* bitmap = *(get_bitmap_ptr());
bitmap->animExit( bitmap->dataBlock );
//delete bitmap;
exit(0);
}
}
// static method used for glut callbacks
static void Draw( void ) {
CPUAnimBitmap* bitmap = *(get_bitmap_ptr());
glClearColor( 0.0, 0.0, 0.0, 1.0 );
glClear( GL_COLOR_BUFFER_BIT );
glDrawPixels( bitmap->width, bitmap->height, GL_RGBA, GL_UNSIGNED_BYTE, bitmap->pixels );
glutSwapBuffers();
}
};
// this kernel takes in a 2-d array of floats
// it updates the value-of-interest by a scaled value based
// on itself and its nearest neighbors
__global__ void blend_kernel( float *dst,
bool dstOut ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
int left = offset - 1;
int right = offset + 1;
if (x == 0) left++;
if (x == DIM-1) right--;
int top = offset - DIM;
int bottom = offset + DIM;
if (y == 0) top += DIM;
if (y == DIM-1) bottom -= DIM;
float t, l, c, r, b;
if (dstOut) {
t = tex1Dfetch(texIn,top);
l = tex1Dfetch(texIn,left);
c = tex1Dfetch(texIn,offset);
r = tex1Dfetch(texIn,right);
b = tex1Dfetch(texIn,bottom);
} else {
t = tex1Dfetch(texOut,top);
l = tex1Dfetch(texOut,left);
c = tex1Dfetch(texOut,offset);
r = tex1Dfetch(texOut,right);
b = tex1Dfetch(texOut,bottom);
}
dst[offset] = c + SPEED * (t + b + r + l - 4 * c);
}
// NOTE - texOffsetConstSrc could either be passed as a
// parameter to this function, or passed in __constant__ memory
// if we declared it as a global above, it would be
// a parameter here:
// __global__ void copy_const_kernel( float *iptr,
// size_t texOffset )
__global__ void copy_const_kernel( float *iptr ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
float c = tex1Dfetch(texConstSrc,offset);
if (c != 0)
iptr[offset] = c;
}
// globals needed by the update routine
struct DataBlock {
unsigned char *output_bitmap;
float *dev_inSrc;
float *dev_outSrc;
float *dev_constSrc;
CPUAnimBitmap *bitmap;
cudaEvent_t start, stop;
float totalTime;
float frames;
};
void anim_gpu( DataBlock *d, int ticks ) {
cudaEventRecord( d->start, 0 );
dim3 blocks(DIM/16,DIM/16);
dim3 threads(16,16);
CPUAnimBitmap *bitmap = d->bitmap;
// since tex is global and bound, we have to use a flag to
// select which is in/out per iteration
volatile bool dstOut = true;
for (int i=0; i<90; i++) {
float *in, *out;
if (dstOut) {
in = d->dev_inSrc;
out = d->dev_outSrc;
} else {
out = d->dev_inSrc;
in = d->dev_outSrc;
}
copy_const_kernel<<<blocks,threads>>>( in );
blend_kernel<<<blocks,threads>>>( out, dstOut );
dstOut = !dstOut;
}
float_to_color<<<blocks,threads>>>( d->output_bitmap,
d->dev_inSrc );
cudaMemcpy( bitmap->get_ptr(),
d->output_bitmap,
bitmap->image_size(),
cudaMemcpyDeviceToHost );
cudaEventRecord( d->stop, 0 );
cudaEventSynchronize( d->stop );
float elapsedTime;
cudaEventElapsedTime( &elapsedTime,
d->start, d->stop );
d->totalTime += elapsedTime;
++d->frames;
printf( "Average Time per frame: %3.1f ms\n",
d->totalTime/d->frames );
}
// clean up memory allocated on the GPU
void anim_exit( DataBlock *d ) {
cudaUnbindTexture( texIn );
cudaUnbindTexture( texOut );
cudaUnbindTexture( texConstSrc );
cudaFree( d->dev_inSrc ) ;
cudaFree( d->dev_outSrc ) ;
cudaFree( d->dev_constSrc );
cudaEventDestroy( d->start );
cudaEventDestroy( d->stop );
}
int main( void ) {
DataBlock data;
CPUAnimBitmap bitmap( DIM, DIM, &data );
data.bitmap = &bitmap;
data.totalTime = 0;
data.frames = 0;
cudaEventCreate( &data.start ) ;
cudaEventCreate( &data.stop );
int imageSize = bitmap.image_size();
cudaMalloc( (void**)&data.output_bitmap,
imageSize ) ;
// assume float == 4 chars in size (ie rgba)
cudaMalloc( (void**)&data.dev_inSrc,
imageSize ) ;
cudaMalloc( (void**)&data.dev_outSrc,
imageSize ) ;
cudaMalloc( (void**)&data.dev_constSrc,
imageSize ) ;
cudaBindTexture( NULL, texConstSrc,
data.dev_constSrc,
imageSize ) ;
cudaBindTexture( NULL, texIn,
data.dev_inSrc,
imageSize );
cudaBindTexture( NULL, texOut,
data.dev_outSrc,
imageSize );
// intialize the constant data
float *temp = (float*)malloc( imageSize );
for (int i=0; i<DIM*DIM; i++) {
temp[i] = 0;
int x = i % DIM;
int y = i / DIM;
if ((x>300) && (x<600) && (y>310) && (y<601))
temp[i] = MAX_TEMP;
}
temp[DIM*100+100] = (MAX_TEMP + MIN_TEMP)/2;
temp[DIM*700+100] = MIN_TEMP;
temp[DIM*300+300] = MIN_TEMP;
temp[DIM*200+700] = MIN_TEMP;
for (int y=800; y<900; y++) {
for (int x=400; x<500; x++) {
temp[x+y*DIM] = MIN_TEMP;
}
}
cudaMemcpy( data.dev_constSrc, temp,
imageSize,
cudaMemcpyHostToDevice );
// initialize the input data
for (int y=800; y<DIM; y++) {
for (int x=0; x<200; x++) {
temp[x+y*DIM] = MAX_TEMP;
}
}
cudaMemcpy( data.dev_inSrc, temp,
imageSize,
cudaMemcpyHostToDevice );
free( temp );
bitmap.anim_and_exit( (void (*)(void*,int))anim_gpu,
(void (*)(void*))anim_exit );
}
编译,输出: