NVIDIA CUDA 学习 (4) Constant Memory and Events

最新推荐文章于 2023-12-14 14:43:25 发布

闭门即深山

最新推荐文章于 2023-12-14 14:43:25 发布

阅读量130

点赞数

分类专栏： CUDA 文章标签： c语言指针机器学习深度学习

本文链接：https://blog.csdn.net/weixin_47658743/article/details/108465187

版权

CUDA 专栏收录该内容

5 篇文章 1 订阅

订阅专栏

Constant Memory and Events

Ray Tracing
Constant Memory
Measuring Performance with Events

Ray Tracing

在这里插入图片描述

#include <stdio.h>
#include <stdlib.h>

#include "cuda.h"
#include "../common/book.h"
#include "../common/cpu_bitmap.h"
// rnd(x): pseudo-random value in [0, x], built from rand() scaled by RAND_MAX.
// The argument is parenthesized so that compound expressions such as
// rnd( a - b ) scale the whole expression instead of only its last term.
#define rnd( x ) ((x) * rand() / RAND_MAX)
// Number of spheres in the scene.
#define SPHERES 20
// Host-side pointer variable for the sphere array; main() passes its address
// to cudaMalloc, so after that call it holds a device address.
Sphere *s;
/*
 * Ray-tracing driver, global-memory variant.
 *
 * Allocates the output bitmap and the sphere array on the GPU, fills the
 * spheres with random colours/positions/radii on the host, uploads them,
 * launches the kernel, copies the image back, reports the elapsed GPU time
 * measured with CUDA events, and finally displays the bitmap.
 *
 * Returns EXIT_FAILURE if the temporary host buffer cannot be allocated.
 */
int main( void ) {
	// capture the start time
	cudaEvent_t start, stop;
	HANDLE_ERROR( cudaEventCreate( &start ) );
	HANDLE_ERROR( cudaEventCreate( &stop ) );
	HANDLE_ERROR( cudaEventRecord( start, 0 ) );

	CPUBitmap bitmap( DIM, DIM );
	unsigned char *dev_bitmap;
	// allocate memory on the GPU for the output bitmap
	HANDLE_ERROR( cudaMalloc( (void**)&dev_bitmap,
		bitmap.image_size() ) );
	// allocate memory for the Sphere dataset
	HANDLE_ERROR( cudaMalloc( (void**)&s,
		sizeof(Sphere) * SPHERES ) );

	// allocate temp memory, initialize it, copy to
	// memory on the GPU, and then free our temp memory
	Sphere *temp_s = (Sphere*)malloc( sizeof(Sphere) * SPHERES );
	if (temp_s == NULL) {
		fprintf( stderr, "failed to allocate host memory for %d spheres\n",
			SPHERES );
		return EXIT_FAILURE;
	}
	for (int i=0; i<SPHERES; i++) {
		temp_s[i].r = rnd( 1.0f );
		temp_s[i].g = rnd( 1.0f );
		temp_s[i].b = rnd( 1.0f );
		temp_s[i].x = rnd( 1000.0f ) - 500;
		temp_s[i].y = rnd( 1000.0f ) - 500;
		temp_s[i].z = rnd( 1000.0f ) - 500;
		temp_s[i].radius = rnd( 100.0f ) + 20;
	}
	HANDLE_ERROR( cudaMemcpy( s, temp_s,
		sizeof(Sphere) * SPHERES,
		cudaMemcpyHostToDevice ) );
	free( temp_s );

	// generate a bitmap from our sphere data
	// NOTE(review): DIM/16 blocks of 16x16 threads cover the whole image only
	// if DIM is a multiple of 16 — confirm where DIM is defined.
	dim3 grids(DIM/16,DIM/16);
	dim3 threads(16,16);
	// s is a host-side variable holding a device address (filled in by
	// cudaMalloc above), so the array is handed to the kernel as an argument.
	kernel<<<grids,threads>>>( s, dev_bitmap );
	// a launch does not return a status itself; pick up launch errors here
	HANDLE_ERROR( cudaGetLastError() );

	// copy our bitmap back from the GPU for display
	HANDLE_ERROR( cudaMemcpy( bitmap.get_ptr(), dev_bitmap,
		bitmap.image_size(),
		cudaMemcpyDeviceToHost ) );

	// get stop time, and display the timing results
	HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
	HANDLE_ERROR( cudaEventSynchronize( stop ) );
	float elapsedTime;
	HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
		start, stop ) );
	printf( "Time to generate: %3.1f ms\n", elapsedTime );
	HANDLE_ERROR( cudaEventDestroy( start ) );
	HANDLE_ERROR( cudaEventDestroy( stop ) );

	// free our device memory before displaying: the image already lives in
	// the host bitmap, and display_and_exit() presumably does not return
	// (assumption based on its name — confirm in cpu_bitmap.h)
	HANDLE_ERROR( cudaFree( dev_bitmap ) );
	HANDLE_ERROR( cudaFree( s ) );

	// display
	bitmap.display_and_exit();
}

/*
 * Shades the single pixel owned by the calling thread.
 *
 * Expects a 2D grid of 2D blocks; the pixel position is derived from
 * threadIdx/blockIdx and the row width is blockDim.x * gridDim.x, so the
 * launch must cover exactly the image width for the offsets to line up.
 * Writes 4 bytes per pixel into ptr (three colour channels, then 255).
 *
 * spheres: array of SPHERES Sphere objects readable from device code.
 * ptr:     output bitmap in device memory.
 *
 * NOTE(review): Sphere::hit is not visible here; it presumably returns the
 * depth of the hit (or -INF on a miss) and stores a shading factor in *n —
 * confirm against the Sphere definition.
 */
static __device__ __forceinline__ void shade_pixel( Sphere *spheres,
                                                    unsigned char *ptr ) {
	// map from threadIdx/BlockIdx to pixel position
	int x = threadIdx.x + blockIdx.x * blockDim.x;
	int y = threadIdx.y + blockIdx.y * blockDim.y;
	int offset = x + y * blockDim.x * gridDim.x;
	// shift the origin to the centre of the image
	float ox = (x - DIM/2);
	float oy = (y - DIM/2);
	float r=0, g=0, b=0;
	float maxz = -INF;
	for(int i=0; i<SPHERES; i++) {
		float n;
		float t = spheres[i].hit( ox, oy, &n );
		if (t > maxz) {
			float fscale = n;
			r = spheres[i].r * fscale;
			g = spheres[i].g * fscale;
			b = spheres[i].b * fscale;
			// remember the best depth so far; without this update every
			// sphere that is hit overwrites the colour and the last one
			// in array order wins regardless of depth
			maxz = t;
		}
	}
	ptr[offset*4 + 0] = (int)(r * 255);
	ptr[offset*4 + 1] = (int)(g * 255);
	ptr[offset*4 + 2] = (int)(b * 255);
	ptr[offset*4 + 3] = 255;
}

// Renders the scene using the file-scope sphere array `s`.
__global__ void kernel( unsigned char *ptr ) {
	shade_pixel( s, ptr );
}

// Renders the scene using a sphere array passed in by the caller, for
// launches of the form kernel<<<grids,threads>>>( s, dev_bitmap ).
__global__ void kernel( Sphere *spheres, unsigned char *ptr ) {
	shade_pixel( spheres, ptr );
}

在这里插入图片描述

Constant Memory

// Statically sized array of SPHERES spheres declared with the __constant__
// qualifier; the host fills it with cudaMemcpyToSymbol.
__constant__ Sphere s[SPHERES];

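这一行用 __constant__ 限定符静态声明了一个位于常量内存（constant memory）中的数组 s，共包含 SPHERES 个 Sphere；因此下面的 main 中不再需要为它调用 cudaMalloc / cudaFree。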

/*
 * Ray-tracing driver, constant-memory variant.
 *
 * Allocates the output bitmap on the GPU, fills a temporary host array of
 * spheres with random colours/positions/radii, copies it into the symbol
 * `s` with cudaMemcpyToSymbol, launches the kernel, copies the image back
 * and displays it.
 *
 * Returns EXIT_FAILURE if the temporary host buffer cannot be allocated.
 */
int main( void ) {
	CPUBitmap bitmap( DIM, DIM );
	unsigned char *dev_bitmap;
	// allocate memory on the GPU for the output bitmap
	HANDLE_ERROR( cudaMalloc( (void**)&dev_bitmap,
		bitmap.image_size() ) );

	// allocate temp memory, initialize it, copy to constant
	// memory on the GPU, and then free our temp memory
	Sphere *temp_s = (Sphere*)malloc( sizeof(Sphere) * SPHERES );
	if (temp_s == NULL) {
		fprintf( stderr, "failed to allocate host memory for %d spheres\n",
			SPHERES );
		return EXIT_FAILURE;
	}
	for (int i=0; i<SPHERES; i++) {
		temp_s[i].r = rnd( 1.0f );
		temp_s[i].g = rnd( 1.0f );
		temp_s[i].b = rnd( 1.0f );
		temp_s[i].x = rnd( 1000.0f ) - 500;
		temp_s[i].y = rnd( 1000.0f ) - 500;
		temp_s[i].z = rnd( 1000.0f ) - 500;
		temp_s[i].radius = rnd( 100.0f ) + 20;
	}
	HANDLE_ERROR( cudaMemcpyToSymbol( s, temp_s,
		sizeof(Sphere) * SPHERES) );
	free( temp_s );

	// generate a bitmap from our sphere data
	// NOTE(review): DIM/16 blocks of 16x16 threads cover the whole image only
	// if DIM is a multiple of 16 — confirm where DIM is defined.
	dim3 grids(DIM/16,DIM/16);
	dim3 threads(16,16);
	kernel<<<grids,threads>>>( dev_bitmap );
	// a launch does not return a status itself; pick up launch errors here
	HANDLE_ERROR( cudaGetLastError() );

	// copy our bitmap back from the GPU for display
	HANDLE_ERROR( cudaMemcpy( bitmap.get_ptr(), dev_bitmap,
		bitmap.image_size(),
		cudaMemcpyDeviceToHost ) );

	// free our device memory before displaying: the image already lives in
	// the host bitmap, and display_and_exit() presumably does not return
	// (assumption based on its name — confirm in cpu_bitmap.h)
	HANDLE_ERROR( cudaFree( dev_bitmap ) );

	bitmap.display_and_exit();
}

// Copies sizeof(Sphere) * SPHERES bytes from the host buffer temp_s into the
// symbol s; no destination pointer from cudaMalloc is involved.
HANDLE_ERROR( cudaMemcpyToSymbol( s, temp_s,sizeof(Sphere) * SPHERES ) );

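这一行把主机端 temp_s 中的数据拷贝到常量内存中的 s。使用常量内存的好处有两点：
在这里插入图片描述 当 half-warp 中的线程读取同一个地址的数据时，速度明显加快，且节省了带宽，因为硬件可以在内部广播（broadcast），一次读取就把结果分发给 half-warp 内的所有线程。但这是建立在整个 half-warp 读取同一个 constant memory 地址的前提上的；如果它们读取的是不同的 constant memory 地址，则需要依次（串行）读取，而从 global memory 中读取时可以占用大量带宽同时读取，这样的话 constant memory 的读取速度反而会比读 global memory 更慢。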

Measuring Performance with Events

/*
 * Ray-tracing driver with event-based timing.
 *
 * Records a start event before any allocation, renders the scene into a
 * device bitmap, copies the image back to the host, then records a stop
 * event and prints the elapsed time between the two events in
 * milliseconds. The bitmap is displayed last.
 *
 * Returns EXIT_FAILURE if the temporary host buffer cannot be allocated.
 */
int main( void ) {
	// capture the start time
	cudaEvent_t start, stop;
	HANDLE_ERROR( cudaEventCreate( &start ) );
	HANDLE_ERROR( cudaEventCreate( &stop ) );
	HANDLE_ERROR( cudaEventRecord( start, 0 ) );

	CPUBitmap bitmap( DIM, DIM );
	unsigned char *dev_bitmap;
	// allocate memory on the GPU for the output bitmap
	HANDLE_ERROR( cudaMalloc( (void**)&dev_bitmap,
		bitmap.image_size() ) );
	// allocate memory for the Sphere dataset
	HANDLE_ERROR( cudaMalloc( (void**)&s,
		sizeof(Sphere) * SPHERES ) );

	// allocate temp memory, initialize it, copy to
	// memory on the GPU, and then free our temp memory
	Sphere *temp_s = (Sphere*)malloc( sizeof(Sphere) * SPHERES );
	if (temp_s == NULL) {
		fprintf( stderr, "failed to allocate host memory for %d spheres\n",
			SPHERES );
		return EXIT_FAILURE;
	}
	for (int i=0; i<SPHERES; i++) {
		temp_s[i].r = rnd( 1.0f );
		temp_s[i].g = rnd( 1.0f );
		temp_s[i].b = rnd( 1.0f );
		temp_s[i].x = rnd( 1000.0f ) - 500;
		temp_s[i].y = rnd( 1000.0f ) - 500;
		temp_s[i].z = rnd( 1000.0f ) - 500;
		temp_s[i].radius = rnd( 100.0f ) + 20;
	}
	HANDLE_ERROR( cudaMemcpy( s, temp_s,
		sizeof(Sphere) * SPHERES,
		cudaMemcpyHostToDevice ) );
	free( temp_s );

	// generate a bitmap from our sphere data
	// NOTE(review): DIM/16 blocks of 16x16 threads cover the whole image only
	// if DIM is a multiple of 16 — confirm where DIM is defined.
	dim3 grids(DIM/16,DIM/16);
	dim3 threads(16,16);
	kernel<<<grids,threads>>>( s, dev_bitmap );
	// a launch does not return a status itself; pick up launch errors here
	HANDLE_ERROR( cudaGetLastError() );

	// copy our bitmap back from the GPU for display
	HANDLE_ERROR( cudaMemcpy( bitmap.get_ptr(), dev_bitmap,
		bitmap.image_size(),
		cudaMemcpyDeviceToHost ) );

	// get stop time, and display the timing results
	HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
	// wait until the stop event has actually been reached before reading it
	HANDLE_ERROR( cudaEventSynchronize( stop ) );
	float elapsedTime;
	HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
		start, stop ) );
	printf( "Time to generate: %3.1f ms\n", elapsedTime );
	HANDLE_ERROR( cudaEventDestroy( start ) );
	HANDLE_ERROR( cudaEventDestroy( stop ) );

	// free our device memory before displaying: the image already lives in
	// the host bitmap, and display_and_exit() presumably does not return
	// (assumption based on its name — confirm in cpu_bitmap.h)
	HANDLE_ERROR( cudaFree( dev_bitmap ) );
	HANDLE_ERROR( cudaFree( s ) );

	// display
	bitmap.display_and_exit();
}