第11章 多GPU系统上的CUDA C
“比在一个GPU上计算要更好的,只有在两个GPU上计算。”
11.1 本章目标
- 了解如何分配和使用零拷贝内存(Zero-Copy Memory);
- 了解如何在同一个应用程序中使用多个GPU;
- 了解如何分配和使用可移动的固定内存(Portable Pinned Memory)。
11.2 零拷贝主机内存
cudaHostAlloc() 传递参数 cudaHostAllocDefault 时分配的是普通的固定内存;传递 cudaHostAllocMapped 时分配的固定内存会被映射到CUDA地址空间,可以被核函数直接访问(即零拷贝内存)。
这种内存除了可以用于主机与GPU之间的内存复制外,还可以在CUDA C核函数中直接访问这种类型的主机内存。由于这种内存不需要复制到GPU,因此也称为零拷贝内存。
11.2.1 通过零拷贝内存实现点积运算
GPU访问主机内存:实现点积运算。GPU上执行归约运算,并使用零拷贝作为输入缓冲区和输出缓冲区。
点积运算的主机内存版本:malloc_test
零拷贝版本: cuda_host_alloc_test,cudaHostAllocMapped零拷贝内存标志;cudaHostAllocWriteCombined 运行时将内存分配为“合并式写入(Write-Combined)内存”
cudaHostGetDevicePointer 获取GPU指针
#include "../common/book.h"
#define imin(a,b) (a<b?a:b)
// number of elements per input vector (~33M floats)
const int N = 33 * 1024 * 1024;
// threads per block; must remain a power of 2 for the tree reduction
// inside dot()
const int threadsPerBlock = 256;
// cap the grid at 32 blocks; dot() strides over the data, so a small
// grid still covers all N elements
const int blocksPerGrid =
imin( 32, (N+threadsPerBlock-1) / threadsPerBlock );
// Per-block partial dot product of a and b.
// Grid/block layout: 1-D grid of 1-D blocks; blockDim.x must equal
// threadsPerBlock and be a power of 2 (required by the tree reduction).
// Each block stores its partial sum in c[blockIdx.x].
__global__ void dot( int size, float *a, float *b, float *c ) {
    __shared__ float partial[threadsPerBlock];
    const int lane = threadIdx.x;

    // grid-stride accumulation: each thread sums a strided slice
    float acc = 0.0f;
    for (int idx = threadIdx.x + blockIdx.x * blockDim.x;
         idx < size;
         idx += blockDim.x * gridDim.x) {
        acc += a[idx] * b[idx];
    }

    partial[lane] = acc;
    __syncthreads();   // all partial sums written before reducing

    // pairwise tree reduction in shared memory (power-of-2 width)
    for (int half = blockDim.x / 2; half > 0; half /= 2) {
        if (lane < half)
            partial[lane] += partial[lane + half];
        __syncthreads();
    }

    if (lane == 0)
        c[blockIdx.x] = partial[0];
}
// Timing baseline for the dot product: standard pageable host memory
// (malloc) with explicit cudaMemcpy transfers in both directions.
// 'size' is the number of elements in each input vector.
// Returns the GPU-measured elapsed time in milliseconds covering the
// host->device copies, the kernel, and the device->host copy.
float malloc_test( int size ) {
    cudaEvent_t start, stop;
    float *a, *b, c, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;
    float elapsedTime;

    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );

    // allocate memory on the CPU side; malloc can fail, so check it
    a = (float*)malloc( size*sizeof(float) );
    HANDLE_NULL( a );
    b = (float*)malloc( size*sizeof(float) );
    HANDLE_NULL( b );
    partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );
    HANDLE_NULL( partial_c );

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
                              size*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
                              size*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
                              blocksPerGrid*sizeof(float) ) );

    // fill in the host memory with data
    for (int i=0; i<size; i++) {
        a[i] = i;
        b[i] = i*2;
    }

    HANDLE_ERROR( cudaEventRecord( start, 0 ) );
    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR( cudaMemcpy( dev_a, a, size*sizeof(float),
                              cudaMemcpyHostToDevice ) );
    HANDLE_ERROR( cudaMemcpy( dev_b, b, size*sizeof(float),
                              cudaMemcpyHostToDevice ) );

    dot<<<blocksPerGrid,threadsPerBlock>>>( size, dev_a, dev_b,
                                            dev_partial_c );
    // a kernel launch returns no status; query it explicitly
    HANDLE_ERROR( cudaGetLastError() );

    // copy the per-block partial sums back from the GPU to the CPU
    HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
                              blocksPerGrid*sizeof(float),
                              cudaMemcpyDeviceToHost ) );

    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                        start, stop ) );

    // finish up on the CPU side: sum the per-block partial results
    c = 0;
    for (int i=0; i<blocksPerGrid; i++) {
        c += partial_c[i];
    }

    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaFree( dev_b ) );
    HANDLE_ERROR( cudaFree( dev_partial_c ) );

    // free memory on the CPU side
    free( a );
    free( b );
    free( partial_c );

    // free events
    HANDLE_ERROR( cudaEventDestroy( start ) );
    HANDLE_ERROR( cudaEventDestroy( stop ) );

    printf( "Value calculated: %f\n", c );
    return elapsedTime;
}
// Zero-copy version of the dot-product timing test: the input and
// output buffers are mapped, pinned host memory that the kernel
// accesses directly, so no cudaMemcpy is needed.
// 'size' is the number of elements per input vector.
// Returns the GPU-measured elapsed time in milliseconds.
// Requires cudaSetDeviceFlags(cudaDeviceMapHost) to have been called
// before any allocation (done in main()).
float cuda_host_alloc_test( int size ) {
    cudaEvent_t start, stop;
    float *a, *b, c, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;
    float elapsedTime;

    HANDLE_ERROR( cudaEventCreate( &start ) );
    HANDLE_ERROR( cudaEventCreate( &stop ) );

    // allocate pinned, mapped host memory; the inputs are additionally
    // write-combined because the CPU only writes them
    HANDLE_ERROR( cudaHostAlloc( (void**)&a,
                                 size*sizeof(float),
                                 cudaHostAllocWriteCombined |
                                 cudaHostAllocMapped ) );
    HANDLE_ERROR( cudaHostAlloc( (void**)&b,
                                 size*sizeof(float),
                                 cudaHostAllocWriteCombined |
                                 cudaHostAllocMapped ) );
    // partial_c is read back by the CPU, so it is NOT write-combined
    HANDLE_ERROR( cudaHostAlloc( (void**)&partial_c,
                                 blocksPerGrid*sizeof(float),
                                 cudaHostAllocMapped ) );

    // obtain the device-side aliases of the mapped host buffers
    HANDLE_ERROR( cudaHostGetDevicePointer( &dev_a, a, 0 ) );
    HANDLE_ERROR( cudaHostGetDevicePointer( &dev_b, b, 0 ) );
    HANDLE_ERROR( cudaHostGetDevicePointer( &dev_partial_c,
                                            partial_c, 0 ) );

    // fill in the host memory with data
    for (int i=0; i<size; i++) {
        a[i] = i;
        b[i] = i*2;
    }

    HANDLE_ERROR( cudaEventRecord( start, 0 ) );
    dot<<<blocksPerGrid,threadsPerBlock>>>( size, dev_a, dev_b,
                                            dev_partial_c );
    // a kernel launch returns no status; query it explicitly
    HANDLE_ERROR( cudaGetLastError() );
    // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize()
    // is the modern equivalent.  The sync guarantees the kernel's
    // writes to the mapped partial_c buffer are visible to the CPU.
    HANDLE_ERROR( cudaDeviceSynchronize() );
    HANDLE_ERROR( cudaEventRecord( stop, 0 ) );
    HANDLE_ERROR( cudaEventSynchronize( stop ) );
    HANDLE_ERROR( cudaEventElapsedTime( &elapsedTime,
                                        start, stop ) );

    // finish up on the CPU side: sum the per-block partial results
    c = 0;
    for (int i=0; i<blocksPerGrid; i++) {
        c += partial_c[i];
    }

    HANDLE_ERROR( cudaFreeHost( a ) );
    HANDLE_ERROR( cudaFreeHost( b ) );
    HANDLE_ERROR( cudaFreeHost( partial_c ) );

    // free events
    HANDLE_ERROR( cudaEventDestroy( start ) );
    HANDLE_ERROR( cudaEventDestroy( stop ) );

    printf( "Value calculated: %f\n", c );
    return elapsedTime;
}
// Entry point: confirm the current device supports mapped host memory,
// enable zero-copy, then time the pageable-memory and zero-copy
// versions of the dot product.
int main( void ) {
    int devId;
    HANDLE_ERROR( cudaGetDevice( &devId ) );

    cudaDeviceProp prop;
    HANDLE_ERROR( cudaGetDeviceProperties( &prop, devId ) );
    if (prop.canMapHostMemory != 1) {
        printf( "Device can not map memory.\n" );
        return 0;
    }

    // zero-copy requires the map-host flag before any allocations
    HANDLE_ERROR( cudaSetDeviceFlags( cudaDeviceMapHost ) );

    // baseline: pageable host memory plus explicit copies
    float elapsedTime = malloc_test( N );
    printf( "Time using cudaMalloc: %3.1f ms\n",
            elapsedTime );

    // zero-copy: mapped pinned host memory, no copies
    elapsedTime = cuda_host_alloc_test( N );
    printf( "Time using cudaHostAlloc: %3.1f ms\n",
            elapsedTime );
}
11.2.2 零拷贝内存的性能
独立GPU自己拥有专门的DRAM,通常位于CPU之外的电路板上。位于PCIE槽中的扩展卡。
集成GPU是系统芯片组中内置的图形处理器,通常与CPU共享系统内存。
对于集成GPU,使用零拷贝通常有性能提升,因为内存在物理上与主机是共享的,可以避免数据复制,
但它占用系统内存,所以系统性能会降低。
当输入内存和输出内存都只能使用一次时,那么独立GPU上使用零拷贝内存性能会提升,因为GPU在设计时考虑了隐藏内存访问带来的延迟,因此它将减轻PCIE总线上的读写等操作的延迟,从而提升性能,
但不会缓存零拷贝内存的内容,如果多次读取内存,将得不偿失,还不如一开始就将数据复制到GPU。
查看GPU是集成还是独立:方法1,打开机箱来观察;方法2,通过代码cudaGetDeviceProperties() 返回的结构来判断属性integrated,如果是集成GPU,则这个值是true,否则为false。
11.3 使用多个GPU
NVIDIA的SLI技术将它们桥接。
我们将改点积运算为多个GPU,
#include "../common/book.h"
#define imin(a,b) (a<b?a:b)
// total element count; each of the two GPUs handles N/2 elements
#define N (33*1024*1024)
// threads per block; must remain a power of 2 for the tree reduction
// inside dot()
const int threadsPerBlock = 256;
// grid is sized for half the data (one GPU's share), capped at 32
// blocks; dot() strides over the data, so this still covers it all
const int blocksPerGrid =
imin( 32, (N/2+threadsPerBlock-1) / threadsPerBlock );
// Per-block partial dot product of a and b.
// Grid/block layout: 1-D grid of 1-D blocks; blockDim.x must equal
// threadsPerBlock and be a power of 2 (required by the tree reduction).
// Each block stores its partial sum in c[blockIdx.x].
__global__ void dot( int size, float *a, float *b, float *c ) {
    __shared__ float sums[threadsPerBlock];
    const int lane = threadIdx.x;

    // grid-stride accumulation over this thread's strided slice
    float total = 0.0f;
    for (int pos = threadIdx.x + blockIdx.x * blockDim.x;
         pos < size;
         pos += blockDim.x * gridDim.x) {
        total += a[pos] * b[pos];
    }

    sums[lane] = total;
    __syncthreads();   // publish all per-thread sums before reducing

    // halve the active range each pass (power-of-2 block size)
    for (int span = blockDim.x / 2; span > 0; span /= 2) {
        if (lane < span)
            sums[lane] += sums[lane + span];
        __syncthreads();
    }

    if (lane == 0)
        c[blockIdx.x] = sums[0];
}
// Work description handed to each CPU thread: which GPU to use and
// which slice of the host input it owns.
struct DataStruct {
int deviceID;     // CUDA device this thread drives
int size;         // number of elements in this thread's slice
float *a;         // first input vector (start of this thread's slice)
float *b;         // second input vector (start of this thread's slice)
float returnValue;   // partial dot product computed by this thread
};
// Thread entry point: computes the dot product of one slice of the
// input on the GPU named by data->deviceID.  data->a / data->b point
// at this thread's slice of the host arrays; the scalar result is
// stored in data->returnValue.
void* routine( void *pvoidData ) {
    DataStruct *data = (DataStruct*)pvoidData;

    // bind this CPU thread to its GPU before any other CUDA call
    HANDLE_ERROR( cudaSetDevice( data->deviceID ) );

    int size = data->size;
    float *a, *b, c, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;

    // host inputs were allocated by main(); only the per-block partial
    // result buffer is allocated here (checked: malloc may fail)
    a = data->a;
    b = data->b;
    partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );
    HANDLE_NULL( partial_c );

    // allocate the memory on the GPU
    HANDLE_ERROR( cudaMalloc( (void**)&dev_a,
                              size*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_b,
                              size*sizeof(float) ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
                              blocksPerGrid*sizeof(float) ) );

    // copy the arrays 'a' and 'b' to the GPU
    HANDLE_ERROR( cudaMemcpy( dev_a, a, size*sizeof(float),
                              cudaMemcpyHostToDevice ) );
    HANDLE_ERROR( cudaMemcpy( dev_b, b, size*sizeof(float),
                              cudaMemcpyHostToDevice ) );

    dot<<<blocksPerGrid,threadsPerBlock>>>( size, dev_a, dev_b,
                                            dev_partial_c );
    // a kernel launch returns no status; query it explicitly
    HANDLE_ERROR( cudaGetLastError() );

    // copy the per-block partial sums back (this blocking cudaMemcpy
    // also synchronizes with the kernel)
    HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
                              blocksPerGrid*sizeof(float),
                              cudaMemcpyDeviceToHost ) );

    // finish up on the CPU side
    c = 0;
    for (int i=0; i<blocksPerGrid; i++) {
        c += partial_c[i];
    }

    HANDLE_ERROR( cudaFree( dev_a ) );
    HANDLE_ERROR( cudaFree( dev_b ) );
    HANDLE_ERROR( cudaFree( dev_partial_c ) );

    // free memory on the CPU side
    free( partial_c );

    data->returnValue = c;
    return 0;
}
// Entry point: requires two CUDA devices.  Splits the input vectors in
// half and computes each half's dot product on its own GPU — one GPU
// driven by a helper CPU thread, the other by the main thread.
int main( void ) {
    int gpuCount;
    HANDLE_ERROR( cudaGetDeviceCount( &gpuCount ) );
    if (gpuCount < 2) {
        printf( "We need at least two compute 1.0 or greater "
                "devices, but only found %d\n", gpuCount );
        return 0;
    }

    // shared input vectors, filled once and split between the GPUs
    float *a = (float*)malloc( sizeof(float) * N );
    HANDLE_NULL( a );
    float *b = (float*)malloc( sizeof(float) * N );
    HANDLE_NULL( b );
    for (int i=0; i<N; i++) {
        a[i] = i;
        b[i] = i*2;
    }

    // describe each GPU's slice of the work
    DataStruct work[2];
    work[0].deviceID = 0;
    work[0].size = N/2;
    work[0].a = a;
    work[0].b = b;

    work[1].deviceID = 1;
    work[1].size = N/2;
    work[1].a = a + N/2;
    work[1].b = b + N/2;

    // device 0 runs on a helper thread while this thread drives device 1
    CUTThread helper = start_thread( routine, &(work[0]) );
    routine( &(work[1]) );
    end_thread( helper );

    // free memory on the CPU side
    free( a );
    free( b );

    printf( "Value calculated: %f\n",
            work[0].returnValue + work[1].returnValue );
    return 0;
}
每个GPU都需要由一个不同的CPU线程来控制,start_thread创建了新的线程【这个函数是自己写的哟】。
11.4 可移动的固定内存
固定内存只是相对于分配它的线程而言才是固定内存;其他线程仍然把它当作普通的可分页内存来看待,对它进行复制等各种操作。
一种补救措施:我们将固定内存分配为可移动的,就是可以在主机线程之间移动这块内存,并且每个线程都视其为固定内存。
需要使用cudaHostAlloc() 来分配内存,并且调用新的标志 cudaHostAllocPortable,这个标志可以与其他标志一起使用,如cudaHostAllocWriteCombined和cudaHostAllocMapped。这意味着分配主机内存时,可将其视为可移动、零拷贝以及合并式写入等的任意组合。
例子,点积运算
#include "../common/book.h"
#define imin(a,b) (a<b?a:b)
// total element count; each of the two GPUs handles N/2 elements
#define N (33*1024*1024)
// threads per block; must remain a power of 2 for the tree reduction
// inside dot()
const int threadsPerBlock = 256;
// grid is sized for half the data (one GPU's share), capped at 32
// blocks; dot() strides over the data, so this still covers it all
const int blocksPerGrid =
imin( 32, (N/2+threadsPerBlock-1) / threadsPerBlock );
// Per-block partial dot product of a and b.
// Grid/block layout: 1-D grid of 1-D blocks; blockDim.x must equal
// threadsPerBlock and be a power of 2 (required by the tree reduction).
// Each block stores its partial sum in c[blockIdx.x].
__global__ void dot( int size, float *a, float *b, float *c ) {
    __shared__ float scratch[threadsPerBlock];
    const int lane = threadIdx.x;

    // grid-stride accumulation: each thread walks a strided slice
    float acc = 0.0f;
    for (int k = threadIdx.x + blockIdx.x * blockDim.x;
         k < size;
         k += blockDim.x * gridDim.x) {
        acc += a[k] * b[k];
    }

    scratch[lane] = acc;
    __syncthreads();   // make every thread's sum visible

    // tree reduction in shared memory (power-of-2 block size)
    for (int step = blockDim.x / 2; step > 0; step /= 2) {
        if (lane < step)
            scratch[lane] += scratch[lane + step];
        __syncthreads();
    }

    if (lane == 0)
        c[blockIdx.x] = scratch[0];
}
// Work description handed to each CPU thread.  Unlike the previous
// program, both threads receive the SAME portable pinned buffers and
// use 'offset' to locate their half of the data.
struct DataStruct {
int deviceID;     // CUDA device this thread drives
int size;         // number of elements in this thread's share
int offset;       // element offset of this thread's share within a/b
float *a;         // first input vector (portable pinned, full array)
float *b;         // second input vector (portable pinned, full array)
float returnValue;   // partial dot product computed by this thread
};
// Thread entry point: computes this GPU's share of the dot product
// using the portable, mapped (zero-copy) pinned host buffers that were
// allocated in main().  The result is stored in data->returnValue.
void* routine( void *pvoidData ) {
    DataStruct *data = (DataStruct*)pvoidData;

    // The main thread already called cudaSetDevice(0) and
    // cudaSetDeviceFlags(cudaDeviceMapHost) before allocating the
    // portable buffers, so only the secondary thread's device still
    // needs to be selected and flagged here.
    if (data->deviceID != 0) {
        HANDLE_ERROR( cudaSetDevice( data->deviceID ) );
        HANDLE_ERROR( cudaSetDeviceFlags( cudaDeviceMapHost ) );
    }

    int size = data->size;
    float *a, *b, c, *partial_c;
    float *dev_a, *dev_b, *dev_partial_c;

    // inputs are the portable pinned buffers allocated by main();
    // only the partial-result buffer is allocated here (checked)
    a = data->a;
    b = data->b;
    partial_c = (float*)malloc( blocksPerGrid*sizeof(float) );
    HANDLE_NULL( partial_c );

    // map the portable host buffers into this device's address space;
    // only the per-block partial sums live in device memory
    HANDLE_ERROR( cudaHostGetDevicePointer( &dev_a, a, 0 ) );
    HANDLE_ERROR( cudaHostGetDevicePointer( &dev_b, b, 0 ) );
    HANDLE_ERROR( cudaMalloc( (void**)&dev_partial_c,
                              blocksPerGrid*sizeof(float) ) );

    // offset 'a' and 'b' to where this GPU gets its data
    dev_a += data->offset;
    dev_b += data->offset;

    dot<<<blocksPerGrid,threadsPerBlock>>>( size, dev_a, dev_b,
                                            dev_partial_c );
    // a kernel launch returns no status; query it explicitly
    HANDLE_ERROR( cudaGetLastError() );

    // copy the per-block partial sums back (this blocking cudaMemcpy
    // also synchronizes with the kernel)
    HANDLE_ERROR( cudaMemcpy( partial_c, dev_partial_c,
                              blocksPerGrid*sizeof(float),
                              cudaMemcpyDeviceToHost ) );

    // finish up on the CPU side
    c = 0;
    for (int i=0; i<blocksPerGrid; i++) {
        c += partial_c[i];
    }

    HANDLE_ERROR( cudaFree( dev_partial_c ) );

    // free memory on the CPU side
    free( partial_c );

    data->returnValue = c;
    return 0;
}
// Entry point: splits the dot product across two GPUs that share the
// same portable, mapped, write-combined pinned host buffers.
// NOTE: the order below is significant — device 0 is selected and its
// map-host flag set BEFORE the cudaHostAlloc calls.
int main( void ) {
int deviceCount;
HANDLE_ERROR( cudaGetDeviceCount( &deviceCount ) );
if (deviceCount < 2) {
printf( "We need at least two compute 1.0 or greater "
"devices, but only found %d\n", deviceCount );
return 0;
}
// both devices must be able to map host memory for zero-copy to work
cudaDeviceProp prop;
for (int i=0; i<2; i++) {
HANDLE_ERROR( cudaGetDeviceProperties( &prop, i ) );
if (prop.canMapHostMemory != 1) {
printf( "Device %d can not map memory.\n", i );
return 0;
}
}
float *a, *b;
// select device 0 and enable host-memory mapping before allocating
HANDLE_ERROR( cudaSetDevice( 0 ) );
HANDLE_ERROR( cudaSetDeviceFlags( cudaDeviceMapHost ) );
// portable: usable as pinned memory from any host thread;
// mapped: kernels can access it directly (zero-copy);
// write-combined: the CPU only writes these input buffers
HANDLE_ERROR( cudaHostAlloc( (void**)&a, N*sizeof(float),
cudaHostAllocWriteCombined |
cudaHostAllocPortable |
cudaHostAllocMapped ) );
HANDLE_ERROR( cudaHostAlloc( (void**)&b, N*sizeof(float),
cudaHostAllocWriteCombined |
cudaHostAllocPortable |
cudaHostAllocMapped ) );
// fill in the host memory with data
for (int i=0; i<N; i++) {
a[i] = i;
b[i] = i*2;
}
// prepare for multithread: both entries share the full a/b buffers
// and differ only in device and offset
DataStruct data[2];
data[0].deviceID = 0;
data[0].offset = 0;
data[0].size = N/2;
data[0].a = a;
data[0].b = b;
data[1].deviceID = 1;
data[1].offset = N/2;
data[1].size = N/2;
data[1].a = a;
data[1].b = b;
// device 1 runs on a helper thread; this thread handles device 0
CUTThread thread = start_thread( routine, &(data[1]) );
routine( &(data[0]) );
end_thread( thread );
// free memory on the CPU side (pinned memory uses cudaFreeHost)
HANDLE_ERROR( cudaFreeHost( a ) );
HANDLE_ERROR( cudaFreeHost( b ) );
printf( "Value calculated: %f\n",
data[0].returnValue + data[1].returnValue );
return 0;
}
注意 routine() 中的 if 判断很微妙:主线程在 main() 中已经对设备0调用过 cudaSetDevice() 和 cudaSetDeviceFlags(),因此这里只需为由新线程控制的非0设备重复这两步设置。
if (data->deviceID != 0) {
HANDLE_ERROR( cudaSetDevice( data->deviceID ) );
HANDLE_ERROR( cudaSetDeviceFlags( cudaDeviceMapHost ) );
}