NVIDIA CUDA 学习 (4) Constant Memory and Events

Ray Tracing

在这里插入图片描述

#include "cuda.h"
#include "../common/book.h"
#include "../common/cpu_bitmap.h"
// Scales a pseudo-random value from rand() by x, giving a result between
// 0 and x. Intended for floating-point x (every use in this file passes a
// float literal). The argument is parenthesized so that a compound
// expression such as rnd( a - b ) scales the whole expression instead of
// only its last term.
#define rnd( x ) ((x) * rand() / RAND_MAX)
// Number of spheres in the scene.
#define SPHERES 20
// Pointer to the sphere array in GPU memory; main() allocates it with
// cudaMalloc and fills it from a host buffer with cudaMemcpy.
Sphere *s;
// Generates SPHERES random spheres, copies them to GPU memory, renders a
// DIM x DIM bitmap with the kernel, prints the GPU time elapsed between the
// start and stop events, and displays the image.
//
// NOTE(review): `s` is a cudaMalloc'd pointer here, but the kernel is
// launched with only the bitmap pointer. Confirm that the kernel being
// compiled can actually reach the sphere array (e.g. takes it as a
// parameter) before relying on this listing.
int main( void ) {
	// capture the start time
	cudaEvent_t start, stop;
	HANDLE_ERROR( cudaEventCreate( &start ) );
	HANDLE_ERROR( cudaEventCreate( &stop ) );
	HANDLE_ERROR( cudaEventRecord( start, 0 ) );
	CPUBitmap bitmap( DIM, DIM );
	unsigned char *dev_bitmap;
	// allocate memory on the GPU for the output bitmap
	HANDLE_ERROR( cudaMalloc( (void**)&dev_bitmap,
		bitmap.image_size() ) );
	// allocate memory for the Sphere dataset
	HANDLE_ERROR( cudaMalloc( (void**)&s,
		sizeof(Sphere) * SPHERES ) );
	// allocate temp memory, initialize it, copy to
	// memory on the GPU, and then free our temp memory
	Sphere *temp_s = (Sphere*)malloc( sizeof(Sphere) * SPHERES );
	for (int i=0; i<SPHERES; i++) {
		// color components are drawn from 0..1
		temp_s[i].r = rnd( 1.0f );
		temp_s[i].g = rnd( 1.0f );
		temp_s[i].b = rnd( 1.0f );
		// center coordinates are drawn from -500..500
		temp_s[i].x = rnd( 1000.0f ) - 500;
		temp_s[i].y = rnd( 1000.0f ) - 500;
		temp_s[i].z = rnd( 1000.0f ) - 500;
		// radius is drawn from 20..120
		temp_s[i].radius = rnd( 100.0f ) + 20;
	}
	HANDLE_ERROR( cudaMemcpy( s, temp_s,
		sizeof(Sphere) * SPHERES,
		cudaMemcpyHostToDevice ) );
	free( temp_s );
	// generate a bitmap from our sphere data:
	// 16x16 threads per block, one thread per pixel
	dim3 grids(DIM/16,DIM/16);
	dim3 threads(16,16);
	kernel<<<grids,threads>>>( dev_bitmap );
	// a kernel launch returns no status itself; fetch launch errors explicitly
	HANDLE_ERROR( cudaGetLastError() );
	// copy our bitmap back from the GPU for display
	HANDLE_ERROR( cudaMemcpy( bitmap.get_ptr(), dev_bitmap,
		bitmap.image_size(),
		cudaMemcpyDeviceToHost ) );
	// get stop time, and display the timing results; without this the
	// start event recorded above would never be paired or released
	HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
	HANDLE_ERROR( cudaEventSynchronize( stop ) );
	float elapsedTime;
	HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
		start, stop ) );
	printf( "Time to generate: %3.1f ms\n", elapsedTime );
	HANDLE_ERROR( cudaEventDestroy( start ) );
	HANDLE_ERROR( cudaEventDestroy( stop ) );
	// free our GPU memory before displaying: the image is already in host
	// memory, and display_and_exit() (going by its name) may never return
	// -- TODO confirm
	HANDLE_ERROR( cudaFree( dev_bitmap ) );
	HANDLE_ERROR( cudaFree( s ) );
	bitmap.display_and_exit();
}

// Renders one pixel per thread.
//
// Launch layout: a 2D grid of 2D blocks; the pixel coordinate is the global
// thread index in x and y, and the row stride of the output is the total
// number of threads in x (blockDim.x * gridDim.x).
//
// ptr: output image, 4 bytes per pixel. The first three bytes receive the
//      scaled r, g, b values; the fourth is fixed to 255 (presumably alpha).
//
// Reads the sphere array `s`, which is declared outside this function.
__global__ void kernel( unsigned char *ptr ) {
	// map from threadIdx/BlockIdx to pixel position
	int x = threadIdx.x + blockIdx.x * blockDim.x;
	int y = threadIdx.y + blockIdx.y * blockDim.y;
	int offset = x + y * blockDim.x * gridDim.x;
	// shift the pixel coordinates so the origin lies at the image center
	float ox = (x - DIM/2);
	float oy = (y - DIM/2);
	// pixel stays black unless some sphere is hit
	float r=0, g=0, b=0;
	float maxz = -INF;
	for(int i=0; i<SPHERES; i++) {
		float n;
		// hit() is defined elsewhere; presumably it returns the depth of the
		// intersection and stores a shading factor in n -- TODO confirm
		float t = s[i].hit( ox, oy, &n );
		if (t > maxz) {
			float fscale = n;
			r = s[i].r * fscale;
			g = s[i].g * fscale;
			b = s[i].b * fscale;
			// remember the best depth so far; without this every later hit
			// would overwrite the color regardless of its depth
			maxz = t;
		}
	}
	ptr[offset*4 + 0] = (int)(r * 255);
	ptr[offset*4 + 1] = (int)(g * 255);
	ptr[offset*4 + 2] = (int)(b * 255);
	ptr[offset*4 + 3] = 255;
}

在这里插入图片描述

Constant Memory

// Sphere array placed in GPU constant memory (the __constant__ qualifier).
// Its size is fixed at compile time to SPHERES elements, so no cudaMalloc is
// needed; the host fills it with cudaMemcpyToSymbol.
__constant__ Sphere s[SPHERES];

这一行声明了一个存放在常量内存(constant memory)中的 Sphere 数组。

// Generates SPHERES random spheres, copies them into the constant-memory
// symbol `s`, renders a DIM x DIM bitmap with the kernel, and displays it.
int main( void ) {
	CPUBitmap bitmap( DIM, DIM );
	unsigned char *dev_bitmap;
	// allocate memory on the GPU for the output bitmap
	HANDLE_ERROR( cudaMalloc( (void**)&dev_bitmap,
		bitmap.image_size() ) );
	// allocate temp memory, initialize it, copy to constant
	// memory on the GPU, and then free our temp memory
	Sphere *temp_s = (Sphere*)malloc( sizeof(Sphere) * SPHERES );
	for (int i=0; i<SPHERES; i++) {
		// color components are drawn from 0..1
		temp_s[i].r = rnd( 1.0f );
		temp_s[i].g = rnd( 1.0f );
		temp_s[i].b = rnd( 1.0f );
		// center coordinates are drawn from -500..500
		temp_s[i].x = rnd( 1000.0f ) - 500;
		temp_s[i].y = rnd( 1000.0f ) - 500;
		temp_s[i].z = rnd( 1000.0f ) - 500;
		// radius is drawn from 20..120
		temp_s[i].radius = rnd( 100.0f ) + 20;
	}
	HANDLE_ERROR( cudaMemcpyToSymbol( s, temp_s,
		sizeof(Sphere) * SPHERES) );
	free( temp_s );
	// generate a bitmap from our sphere data:
	// 16x16 threads per block, one thread per pixel
	dim3 grids(DIM/16,DIM/16);
	dim3 threads(16,16);
	kernel<<<grids,threads>>>( dev_bitmap );
	// a kernel launch returns no status itself; fetch launch errors explicitly
	HANDLE_ERROR( cudaGetLastError() );
	// copy our bitmap back from the GPU for display
	HANDLE_ERROR( cudaMemcpy( bitmap.get_ptr(), dev_bitmap,
		bitmap.image_size(),
		cudaMemcpyDeviceToHost ) );
	// free our GPU memory before displaying: the image is already in host
	// memory, and display_and_exit() (going by its name) may never return
	// -- TODO confirm
	HANDLE_ERROR( cudaFree( dev_bitmap ) );
	bitmap.display_and_exit();
}
// copy the SPHERES Sphere structs held in the host buffer temp_s into the symbol s
HANDLE_ERROR( cudaMemcpyToSymbol( s, temp_s,sizeof(Sphere) * SPHERES ) );

这一行就是赋值:把主机端 temp_s 中的数据拷贝到常量内存 s 中。使用常量内存的好处有两点:
在这里插入图片描述
当一个 half-warp 中的线程读取同一个数据时,速度明显加快,并且节省了带宽,因为硬件可以在内部进行广播(broadcast),一次读取就能把结果分发给 half-warp 中的所有线程。但这是建立在整个 half-warp 读取同一个常量内存地址的前提上的;如果它们读取的是不同的常量内存地址,就需要依次(串行)读取,而从 global memory 中读取时可以利用大量带宽同时读取,这样的话读 constant memory 的速度反而会比读 global memory 更慢。

Measuring Performance with Events

// Generates SPHERES random spheres, copies them to GPU memory, renders a
// DIM x DIM bitmap with the kernel, prints the GPU time elapsed between the
// start and stop events, and displays the image.
int main( void ) {
	// capture the start time
	cudaEvent_t start, stop;
	HANDLE_ERROR( cudaEventCreate( &start ) );
	HANDLE_ERROR( cudaEventCreate( &stop ) );
	HANDLE_ERROR( cudaEventRecord( start, 0 ) );
	CPUBitmap bitmap( DIM, DIM );
	unsigned char *dev_bitmap;
	// allocate memory on the GPU for the output bitmap
	HANDLE_ERROR( cudaMalloc( (void**)&dev_bitmap,
		bitmap.image_size() ) );
	// allocate memory for the Sphere dataset
	HANDLE_ERROR( cudaMalloc( (void**)&s,
		sizeof(Sphere) * SPHERES ) );
	// allocate temp memory, initialize it, copy to
	// memory on the GPU, and then free our temp memory
	Sphere *temp_s = (Sphere*)malloc( sizeof(Sphere) * SPHERES );
	for (int i=0; i<SPHERES; i++) {
		// color components are drawn from 0..1
		temp_s[i].r = rnd( 1.0f );
		temp_s[i].g = rnd( 1.0f );
		temp_s[i].b = rnd( 1.0f );
		// center coordinates are drawn from -500..500
		temp_s[i].x = rnd( 1000.0f ) - 500;
		temp_s[i].y = rnd( 1000.0f ) - 500;
		temp_s[i].z = rnd( 1000.0f ) - 500;
		// radius is drawn from 20..120
		temp_s[i].radius = rnd( 100.0f ) + 20;
	}
	HANDLE_ERROR( cudaMemcpy( s, temp_s,
		sizeof(Sphere) * SPHERES,
		cudaMemcpyHostToDevice ) );
	free( temp_s );
	// generate a bitmap from our sphere data:
	// 16x16 threads per block, one thread per pixel
	dim3 grids(DIM/16,DIM/16);
	dim3 threads(16,16);
	kernel<<<grids,threads>>>( s, dev_bitmap );
	// a kernel launch returns no status itself; fetch launch errors explicitly
	HANDLE_ERROR( cudaGetLastError() );
	// copy our bitmap back from the GPU for display
	HANDLE_ERROR( cudaMemcpy( bitmap.get_ptr(), dev_bitmap,
		bitmap.image_size(),
		cudaMemcpyDeviceToHost ) );
	// get stop time, and display the timing results
	HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
	// block the host until the stop event has actually been reached
	HANDLE_ERROR( cudaEventSynchronize( stop ) );
	float elapsedTime;
	HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
		start, stop ) );
	printf( "Time to generate: %3.1f ms\n", elapsedTime );
	HANDLE_ERROR( cudaEventDestroy( start ) );
	HANDLE_ERROR( cudaEventDestroy( stop ) );
	// free our GPU memory before displaying: the image is already in host
	// memory, and display_and_exit() (going by its name) may never return
	// -- TODO confirm
	HANDLE_ERROR( cudaFree( dev_bitmap ) );
	HANDLE_ERROR( cudaFree( s ) );
	// display
	bitmap.display_and_exit();
}

cudaEventSynchronize(stop) 会让 CPU 阻塞等待,直到 GPU 执行到 stop 事件为止。当这个函数返回的时候,意味着在 stop 事件之前的所有 GPU 工作都已经完成了,因此我们可以放心计算时间了。

cudaEventElapsedTime() 能够计算两个事件之间经过的时间(单位为毫秒)。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值