1.光线跟踪
#include <stdio.h>
#include <GL/glut.h>
struct CPUBitmap {
unsigned char *pixels;
int x, y;
void *dataBlock;
void (*bitmapExit)(void*);
CPUBitmap( int width, int height, void *d = NULL ) {
pixels = new unsigned char[width * height * 4];
x = width;
y = height;
dataBlock = d;
}
~CPUBitmap() {
delete [] pixels;
}
unsigned char* get_ptr( void ) const { return pixels; }
long image_size( void ) const { return x * y * 4; }
void display_and_exit( void(*e)(void*) = NULL ) {
CPUBitmap** bitmap = get_bitmap_ptr();
*bitmap = this;
bitmapExit = e;
// a bug in the Windows GLUT implementation prevents us from
// passing zero arguments to glutInit()
int c=1;
char* dummy = "";
glutInit( &c, &dummy );
glutInitDisplayMode( GLUT_SINGLE | GLUT_RGBA );
glutInitWindowSize( x, y );
glutCreateWindow( "bitmap" );
glutKeyboardFunc(Key);
glutDisplayFunc(Draw);
glutMainLoop();
}
// static method used for glut callbacks
static CPUBitmap** get_bitmap_ptr( void ) {
static CPUBitmap *gBitmap;
return &gBitmap;
}
// static method used for glut callbacks
static void Key(unsigned char key, int x, int y) {
switch (key) {
case 27:
CPUBitmap* bitmap = *(get_bitmap_ptr());
if (bitmap->dataBlock != NULL && bitmap->bitmapExit != NULL)
bitmap->bitmapExit( bitmap->dataBlock );
exit(0);
}
}
// static method used for glut callbacks
static void Draw( void ) {
CPUBitmap* bitmap = *(get_bitmap_ptr());
glClearColor( 0.0, 0.0, 0.0, 1.0 );
glClear( GL_COLOR_BUFFER_BIT );
glDrawPixels( bitmap->x, bitmap->y, GL_RGBA, GL_UNSIGNED_BYTE, bitmap->pixels );
glFlush();
}
};
#define DIM 1024
#define rnd( x ) (x * rand() / RAND_MAX)
#define INF 2e10f
struct Sphere {
float r,b,g;
float radius;
float x,y,z;
__device__ float hit( float ox, float oy, float *n ) {
float dx = ox - x;
float dy = oy - y;
if (dx*dx + dy*dy < radius*radius) {
float dz = sqrtf( radius*radius - dx*dx - dy*dy );
*n = dz / sqrtf( radius * radius );
return dz + z;
}
return -INF;
}
};
#define SPHERES 20
__global__ void kernel( Sphere *s, unsigned char *ptr ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
float ox = (x - DIM/2);
float oy = (y - DIM/2);
float r=0, g=0, b=0;
float maxz = -INF;
for(int i=0; i<SPHERES; i++) {
float n;
float t = s[i].hit( ox, oy, &n );
if (t > maxz) {
float fscale = n;
r = s[i].r * fscale;
g = s[i].g * fscale;
b = s[i].b * fscale;
maxz = t;
}
}
ptr[offset*4 + 0] = (int)(r * 255);
ptr[offset*4 + 1] = (int)(g * 255);
ptr[offset*4 + 2] = (int)(b * 255);
ptr[offset*4 + 3] = 255;
}
// globals needed by the update routine
struct DataBlock {
unsigned char *dev_bitmap;
Sphere *s;
};
int main( void ) {
DataBlock data;
cudaEvent_t start, stop;
cudaEventCreate( &start );
cudaEventCreate( &stop );
cudaEventRecord( start, 0 );
CPUBitmap bitmap( DIM, DIM, &data );
unsigned char *dev_bitmap;
// allocate memory on the GPU for the output bitmap
cudaMalloc( (void**)&dev_bitmap,
bitmap.image_size() );
Sphere *s;
cudaMalloc( (void**)&s,
sizeof(Sphere) * SPHERES );
// allocate temp memory, initialize it, copy to constant
// memory on the GPU, then free our temp memory
Sphere *temp_s = (Sphere*)malloc( sizeof(Sphere) * SPHERES );
for (int i=0; i<SPHERES; i++) {
temp_s[i].r = rnd( 1.0f );
temp_s[i].g = rnd( 1.0f );
temp_s[i].b = rnd( 1.0f );
temp_s[i].x = rnd( 1000.0f ) - 500;
temp_s[i].y = rnd( 1000.0f ) - 500;
temp_s[i].z = rnd( 1000.0f ) - 500;
temp_s[i].radius = rnd( 100.0f ) + 20;
}
cudaMemcpy( s, temp_s,
sizeof(Sphere) * SPHERES,
cudaMemcpyHostToDevice );
free( temp_s );
// generate a bitmap from our sphere data
dim3 grids(DIM/16,DIM/16);
dim3 threads(16,16);
kernel<<<grids,threads>>>( s, dev_bitmap );
// copy our bitmap back from the GPU for display
cudaMemcpy( bitmap.get_ptr(), dev_bitmap,
bitmap.image_size(),
cudaMemcpyDeviceToHost );
// get stop time, and display the timing results
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
float elapsedTime;
cudaEventElapsedTime( &elapsedTime,
start, stop ) ;
printf( "Time to generate: %3.1f ms\n", elapsedTime );
cudaEventDestroy( start ) ;
cudaEventDestroy( stop ) ;
cudaFree( dev_bitmap ) ;
cudaFree( s );
// display
bitmap.display_and_exit();
}
编译,输出:
用时4+ms
2.常量内存实现光线跟踪
#include <stdio.h>
#include <GL/glut.h>
struct CPUBitmap {
unsigned char *pixels;
int x, y;
void *dataBlock;
void (*bitmapExit)(void*);
CPUBitmap( int width, int height, void *d = NULL ) {
pixels = new unsigned char[width * height * 4];
x = width;
y = height;
dataBlock = d;
}
~CPUBitmap() {
delete [] pixels;
}
unsigned char* get_ptr( void ) const { return pixels; }
long image_size( void ) const { return x * y * 4; }
void display_and_exit( void(*e)(void*) = NULL ) {
CPUBitmap** bitmap = get_bitmap_ptr();
*bitmap = this;
bitmapExit = e;
// a bug in the Windows GLUT implementation prevents us from
// passing zero arguments to glutInit()
int c=1;
char* dummy = "";
glutInit( &c, &dummy );
glutInitDisplayMode( GLUT_SINGLE | GLUT_RGBA );
glutInitWindowSize( x, y );
glutCreateWindow( "bitmap" );
glutKeyboardFunc(Key);
glutDisplayFunc(Draw);
glutMainLoop();
}
// static method used for glut callbacks
static CPUBitmap** get_bitmap_ptr( void ) {
static CPUBitmap *gBitmap;
return &gBitmap;
}
// static method used for glut callbacks
static void Key(unsigned char key, int x, int y) {
switch (key) {
case 27:
CPUBitmap* bitmap = *(get_bitmap_ptr());
if (bitmap->dataBlock != NULL && bitmap->bitmapExit != NULL)
bitmap->bitmapExit( bitmap->dataBlock );
exit(0);
}
}
// static method used for glut callbacks
static void Draw( void ) {
CPUBitmap* bitmap = *(get_bitmap_ptr());
glClearColor( 0.0, 0.0, 0.0, 1.0 );
glClear( GL_COLOR_BUFFER_BIT );
glDrawPixels( bitmap->x, bitmap->y, GL_RGBA, GL_UNSIGNED_BYTE, bitmap->pixels );
glFlush();
}
};
#define DIM 1024
#define rnd( x ) (x * rand() / RAND_MAX)
#define INF 2e10f
struct Sphere {
float r,b,g;
float radius;
float x,y,z;
__device__ float hit( float ox, float oy, float *n ) {
float dx = ox - x;
float dy = oy - y;
if (dx*dx + dy*dy < radius*radius) {
float dz = sqrtf( radius*radius - dx*dx - dy*dy );
*n = dz / sqrtf( radius * radius );
return dz + z;
}
return -INF;
}
};
#define SPHERES 20
//下面有修改
__constant__ Sphere s[SPHERES];
//下面有修改
__global__ void kernel( unsigned char *ptr ) {
// map from threadIdx/BlockIdx to pixel position
int x = threadIdx.x + blockIdx.x * blockDim.x;
int y = threadIdx.y + blockIdx.y * blockDim.y;
int offset = x + y * blockDim.x * gridDim.x;
float ox = (x - DIM/2);
float oy = (y - DIM/2);
float r=0, g=0, b=0;
float maxz = -INF;
for(int i=0; i<SPHERES; i++) {
float n;
float t = s[i].hit( ox, oy, &n );//计算来自(ox,oy)的光线是否与这个球面相交
if (t > maxz) {
float fscale = n;
r = s[i].r * fscale;
g = s[i].g * fscale;
b = s[i].b * fscale;
maxz = t;
}
}
ptr[offset*4 + 0] = (int)(r * 255);
ptr[offset*4 + 1] = (int)(g * 255);
ptr[offset*4 + 2] = (int)(b * 255);
ptr[offset*4 + 3] = 255;
}
// globals needed by the update routine
struct DataBlock {
unsigned char *dev_bitmap;
};
int main( void ) {
DataBlock data;
// capture the start time
cudaEvent_t start, stop;
cudaEventCreate( &start );
cudaEventCreate( &stop );
cudaEventRecord( start, 0 );
CPUBitmap bitmap( DIM, DIM, &data );
unsigned char *dev_bitmap;
// allocate memory on the GPU for the output bitmap
cudaMalloc( (void**)&dev_bitmap,
bitmap.image_size() );
// allocate temp memory, initialize it, copy to constant
// memory on the GPU, then free our temp memory
Sphere *temp_s = (Sphere*)malloc( sizeof(Sphere) * SPHERES );
for (int i=0; i<SPHERES; i++) {
temp_s[i].r = rnd( 1.0f );
temp_s[i].g = rnd( 1.0f );
temp_s[i].b = rnd( 1.0f );
temp_s[i].x = rnd( 1000.0f ) - 500;
temp_s[i].y = rnd( 1000.0f ) - 500;
temp_s[i].z = rnd( 1000.0f ) - 500;
temp_s[i].radius = rnd( 100.0f ) + 20;
}
//下面有修改
cudaMemcpyToSymbol( s, temp_s, sizeof(Sphere) * SPHERES);
free( temp_s );
// generate a bitmap from our sphere data
dim3 grids(DIM/16,DIM/16);
dim3 threads(16,16);
kernel<<<grids,threads>>>( dev_bitmap );
// copy our bitmap back from the GPU for display
cudaMemcpy( bitmap.get_ptr(), dev_bitmap,
bitmap.image_size(),
cudaMemcpyDeviceToHost );
// get stop time, and display the timing results
cudaEventRecord( stop, 0 );
cudaEventSynchronize( stop );
float elapsedTime;
cudaEventElapsedTime( &elapsedTime,
start, stop ) ;
printf( "Time to generate: %3.1f ms\n", elapsedTime );
cudaEventDestroy( start ) ;
cudaEventDestroy( stop ) ;
cudaFree( dev_bitmap ) ;
// display
bitmap.display_and_exit();
}
编译,输出:花费时间3-4ms左右
3.总结与分析
总的看,加速的效果一般,增加球的数量try try
修改为10000个球,非常量内存结果为:837.8ms
常量内存 发现10000个球放不下!
改为1000个,结果:76.9ms
再把非常量内存也改为1000,结果:99.7ms