一. Page-locked Host Memory
- 使用
cudaHostAlloc()
为Host分配内存属于页锁定内存, 这种内存不会被OS换页到disk上, 始终驻留在物理内存中, 因此GPU可以放心地通过DMA直接访问它. 但是锁定大量物理内存会减少系统可用的可换页内存, 可能影响整体性能. - 使用Page-locked Host Memory可以提高Host和GPU间的数据传输速度.
- 多用在
cudaMemcpy
的src或者dst上. - Page-locked Host Memory要及时使用
cudaFreeHost
释放.
用不同内存分配方式测试数据传输速度:
(辅助代码见: http://blog.csdn.net/full_speed_turbo/article/details/71107132)
#include "../common/book.h"
#define SIZE (100*1024*1024)
float cuda_malloc_test( int size, bool up)
{
    // Measure the time for 100 cudaMemcpy transfers between a pageable
    // (malloc'd) host buffer and a device buffer of `size` ints.
    // `up` selects direction: true = host-to-device, false = device-to-host.
    // Returns the elapsed time in milliseconds.
    cudaEvent_t begin, end;
    float ms;
    int *host_buf, *dev_buf;
    const size_t bytes = size * sizeof(int);

    HANDLE_ERROR( cudaEventCreate( &begin ) );
    HANDLE_ERROR( cudaEventCreate( &end ) );

    HANDLE_ERROR( cudaMalloc( (void**)&dev_buf, bytes ) );
    host_buf = (int*)malloc( bytes );
    HANDLE_NULL( host_buf );

    // Time 100 round trips; contents are irrelevant for a bandwidth test.
    HANDLE_ERROR( cudaEventRecord( begin, 0 ) );
    for (int iter = 0; iter < 100; iter++)
    {
        if (up)
            HANDLE_ERROR( cudaMemcpy( dev_buf, host_buf, bytes, cudaMemcpyHostToDevice ) );
        else
            HANDLE_ERROR( cudaMemcpy( host_buf, dev_buf, bytes, cudaMemcpyDeviceToHost ) );
    }
    HANDLE_ERROR( cudaEventRecord( end, 0 ) );
    // Events are recorded asynchronously; wait for `end` before reading the time.
    HANDLE_ERROR( cudaEventSynchronize( end ) );
    HANDLE_ERROR( cudaEventElapsedTime( &ms, begin, end ) );

    HANDLE_ERROR( cudaEventDestroy( begin ) );
    HANDLE_ERROR( cudaEventDestroy( end ) );
    HANDLE_ERROR( cudaFree( dev_buf ) );
    free( host_buf );
    return ms;
}
float cuda_alloc_host_test(int size, bool up)
{
    // Same bandwidth measurement as cuda_malloc_test, except the host buffer
    // is page-locked (cudaHostAlloc), which allows faster DMA transfers.
    // `up` selects direction: true = host-to-device, false = device-to-host.
    // Returns the elapsed time in milliseconds.
    cudaEvent_t begin, end;
    float ms;
    int *host_buf, *dev_buf;
    const size_t bytes = size * sizeof(int);

    HANDLE_ERROR( cudaEventCreate( &begin ) );
    HANDLE_ERROR( cudaEventCreate( &end ) );

    HANDLE_ERROR( cudaMalloc( (void**)&dev_buf, bytes ) );
    // Pinned allocation: must be released with cudaFreeHost, never free().
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_buf, bytes, cudaHostAllocDefault ) );

    HANDLE_ERROR( cudaEventRecord( begin, 0 ) );
    for (int iter = 0; iter < 100; iter++)
    {
        if (up)
            HANDLE_ERROR( cudaMemcpy( dev_buf, host_buf, bytes, cudaMemcpyHostToDevice ) );
        else
            HANDLE_ERROR( cudaMemcpy( host_buf, dev_buf, bytes, cudaMemcpyDeviceToHost ) );
    }
    HANDLE_ERROR( cudaEventRecord( end, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( end ) );
    HANDLE_ERROR( cudaEventElapsedTime( &ms, begin, end ) );

    HANDLE_ERROR( cudaFreeHost( host_buf ) );
    HANDLE_ERROR( cudaFree( dev_buf ) );
    HANDLE_ERROR( cudaEventDestroy( begin ) );
    HANDLE_ERROR( cudaEventDestroy( end ) );
    return ms;
}
int main(void)
{
    // Compare host<->device transfer bandwidth for pageable (malloc) vs.
    // page-locked (cudaHostAlloc) host memory, in both directions.
    // Each test moves SIZE ints 100 times; MB is the total data volume.
    const float MB = (float)100*SIZE*sizeof(int)/1024/1024;
    float t;

    // Pageable memory, host -> device.
    t = cuda_malloc_test( SIZE, true );
    printf( "Time using malloc: %3.1f ms\n", t );
    printf( "\t MB/s during copy up: %3.1f\n", MB/(t/1000) );

    // Pageable memory, device -> host.
    t = cuda_malloc_test( SIZE, false );
    printf( "Time using malloc: %3.1f ms\n", t );
    printf( "\t MB/s during copy down: %3.1f\n", MB/(t/1000) );

    // Pinned memory, host -> device.
    t = cuda_alloc_host_test( SIZE, true );
    printf( "Time using HostAlloc: %3.1f ms\n", t );
    printf( "\t MB/s during copy up: %3.1f\n", MB/(t/1000) );

    // Pinned memory, device -> host.
    t = cuda_alloc_host_test( SIZE, false );
    printf( "Time using HostAlloc: %3.1f ms\n", t );
    printf( "\t MB/s during copy down: %3.1f\n", MB/(t/1000) );

    return 0;
}
输出: (此处省略实际运行结果; 可观察到HostAlloc版本的传输带宽明显高于malloc版本)
2. streams
- stream可以看做GPU的一个task队列.
- 使用的Host内存需要用
cudaHostAlloc
分配. - stream操作需要
cudaStreamSynchronize
同步. - 多个stream配合时, 安排任务时要使用广度优先准则. 不要一次性把一个stream安排满了再去安排另外的stream. 否则, 另外的stream会被阻塞.
使用深度优先准则安排stream工作队列:
#include "../common/book.h"
#define N (1024*1024)
#define FULL_DATA_SIZE (N*20)
// For each element: average a 3-point neighborhood of `a` and of `b`
// (neighbor indices wrap modulo 256), then store the mean of the two
// averages in `c`. Expects a 1-D launch covering at least N threads;
// extra threads exit via the bounds guard.
__global__ void kernel( int *a, int *b, int *c)
{
    int idx = threadIdx.x + blockIdx.x * blockDim.x;
    if (idx >= N)
        return;
    int n1 = (idx + 1) % 256;
    int n2 = (idx + 2) % 256;
    float avg_a = (a[idx] + a[n1] + a[n2]) / 3.0f;
    float avg_b = (b[idx] + b[n1] + b[n2]) / 3.0f;
    c[idx] = (avg_a + avg_b) / 2;
}
int main(void)
{
    // Process FULL_DATA_SIZE ints in N-sized chunks on two CUDA streams,
    // overlapping async host<->device copies with kernel execution.
    // Work is issued depth-first: stream0's whole chunk is enqueued, then
    // stream1's. cudaMemcpyAsync requires the pinned host buffers below.
    cudaDeviceProp prop;
    int whichDevice;
    HANDLE_ERROR( cudaGetDevice( &whichDevice ) );
    HANDLE_ERROR( cudaGetDeviceProperties( &prop, whichDevice ) );
    if (!prop.deviceOverlap)
    {
        printf("Device will not handle overlaps, so no speed up from streams\n");
        return 0;
    }

    cudaEvent_t start, stop;
    float elapsedTime;
    // Create the timing events.
    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );
    // Create the streams.
    cudaStream_t stream0, stream1;
    HANDLE_ERROR( cudaStreamCreate( &stream0 ) );
    HANDLE_ERROR( cudaStreamCreate( &stream1 ) );

    // Allocate one set of device chunk buffers per stream, and pinned
    // (page-locked) host buffers for the full data set.
    int *host_a, *host_b, *host_c;
    int *dev_a0, *dev_b0, *dev_c0;   // stream0's buffers
    int *dev_a1, *dev_b1, *dev_c1;   // stream1's buffers
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a0, N*sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b0, N*sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_c0, N*sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a1, N*sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b1, N*sizeof(int) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_c1, N*sizeof(int) ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_a, FULL_DATA_SIZE*sizeof(int), cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_b, FULL_DATA_SIZE*sizeof(int), cudaHostAllocDefault ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&host_c, FULL_DATA_SIZE*sizeof(int), cudaHostAllocDefault ) );
    for (int i=0; i<FULL_DATA_SIZE; i++)
    {
        host_a[i] = rand();
        host_b[i] = rand();
    }

    // Process the data in chunks, two chunks (one per stream) per iteration.
    HANDLE_ERROR( cudaEventRecord( start, 0 ) );
    for (int i=0; i<FULL_DATA_SIZE; i+=N*2)
    {
        // Enqueue all of stream0's work first (depth-first order).
        HANDLE_ERROR( cudaMemcpyAsync( dev_a0, host_a+i,
                                       N * sizeof(int),
                                       cudaMemcpyHostToDevice,
                                       stream0 ) );
        HANDLE_ERROR( cudaMemcpyAsync( dev_b0, host_b+i,
                                       N * sizeof(int),
                                       cudaMemcpyHostToDevice,
                                       stream0 ) );
        kernel<<<N/256, 256, 0, stream0>>>( dev_a0, dev_b0, dev_c0 );
        // Fix: check for launch-configuration errors (launches don't
        // return a cudaError_t themselves).
        HANDLE_ERROR( cudaGetLastError() );
        HANDLE_ERROR( cudaMemcpyAsync( host_c+i, dev_c0,
                                       N * sizeof(int),
                                       cudaMemcpyDeviceToHost,
                                       stream0 ) );
        // Then enqueue all of stream1's work.
        HANDLE_ERROR( cudaMemcpyAsync( dev_a1, host_a+i+N,
                                       N * sizeof(int),
                                       cudaMemcpyHostToDevice,
                                       stream1 ) );
        HANDLE_ERROR( cudaMemcpyAsync( dev_b1, host_b+i+N,
                                       N * sizeof(int),
                                       cudaMemcpyHostToDevice,
                                       stream1 ) );
        kernel<<<N/256, 256, 0, stream1>>>( dev_a1, dev_b1, dev_c1 );
        HANDLE_ERROR( cudaGetLastError() );
        HANDLE_ERROR( cudaMemcpyAsync( host_c+i+N, dev_c1,
                                       N * sizeof(int),
                                       cudaMemcpyDeviceToHost,
                                       stream1 ) );
    }
    // Wait for both streams to drain before reading the timer.
    HANDLE_ERROR( cudaStreamSynchronize( stream0 ) );
    HANDLE_ERROR( cudaStreamSynchronize( stream1 ) );

    // Read out the elapsed time.
    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                        start,
                                        stop ) );
    printf( "Time taken: %3.1f ms\n", elapsedTime );

    // Destroy the timing events (fix: the original leaked start/stop).
    HANDLE_ERROR( cudaEventDestroy( start ) );
    HANDLE_ERROR( cudaEventDestroy( stop ) );
    // Destroy the streams.
    HANDLE_ERROR( cudaStreamDestroy( stream0 ) );
    HANDLE_ERROR( cudaStreamDestroy( stream1 ) );
    // Free pinned host memory.
    HANDLE_ERROR( cudaFreeHost( host_a ) );
    HANDLE_ERROR( cudaFreeHost( host_b ) );
    HANDLE_ERROR( cudaFreeHost( host_c ) );
    // Free device memory.
    HANDLE_ERROR( cudaFree( dev_a0 ) );
    HANDLE_ERROR( cudaFree( dev_b0 ) );
    HANDLE_ERROR( cudaFree( dev_c0 ) );
    HANDLE_ERROR( cudaFree( dev_a1 ) );
    HANDLE_ERROR( cudaFree( dev_b1 ) );
    HANDLE_ERROR( cudaFree( dev_c1 ) );
    return 0;
}
使用广度优先原则安排stream任务队列:
// Process the data in chunks — breadth-first issue order: alternate each
// enqueue between stream0 and stream1 instead of filling one stream's
// queue completely before the other, so neither stream is left blocked
// waiting while the other monopolizes the copy/compute engines.
HANDLE_ERROR( cudaEventRecord( start, 0 ) );
for (int i=0; i<FULL_DATA_SIZE; i+=N*2)
{
// Enqueue stream0's copy of input chunk a.
HANDLE_ERROR( cudaMemcpyAsync( dev_a0, host_a+i,
N * sizeof(int),
cudaMemcpyHostToDevice,
stream0 ) );
// Enqueue stream1's copy of input chunk a.
HANDLE_ERROR( cudaMemcpyAsync( dev_a1, host_a+i+N,
N * sizeof(int),
cudaMemcpyHostToDevice,
stream1 ) );
// Enqueue stream0's copy of input chunk b.
HANDLE_ERROR( cudaMemcpyAsync( dev_b0, host_b+i,
N *sizeof(int),
cudaMemcpyHostToDevice,
stream0 ) );
// Enqueue stream1's copy of input chunk b.
HANDLE_ERROR( cudaMemcpyAsync( dev_b1, host_b+i+N,
N * sizeof(int),
cudaMemcpyHostToDevice,
stream1 ) );
// Launch the kernel on stream0.
kernel<<<N/256, 256, 0, stream0>>>( dev_a0, dev_b0, dev_c0 );
// Launch the kernel on stream1.
kernel<<<N/256, 256, 0, stream1>>>( dev_a1, dev_b1, dev_c1 );
// Enqueue stream0's result copy back to the host.
HANDLE_ERROR( cudaMemcpyAsync( host_c+i, dev_c0,
N * sizeof(int),
cudaMemcpyDeviceToHost,
stream0 ) );
// Enqueue stream1's result copy back to the host.
HANDLE_ERROR( cudaMemcpyAsync( host_c+i+N, dev_c1,
N*sizeof(int),
cudaMemcpyDeviceToHost,
stream1 ) );
}
其他代码不变.
其实, 前期分配内存的时间比for循环里用的时间长得多.
所以, 对这个程序来说, 瓶颈在于Host和device之间的数据传输太慢了.