并行计算大当其道,cuda c, 给了开发者与NVIDIA 通用型GPU设备交互的能力,举2个栗子先了解一下,
1,枚举设备属性代码
int main( void ) {
cudaDeviceProp prop;
int count;
HANDLE_ERROR( cudaGetDeviceCount( &count ) );
for (int i=0; i< count; i++) {
HANDLE_ERROR( cudaGetDeviceProperties( &prop, i ) );
printf( " --- General Information for device %d ---\n", i );
printf( "Name: %s\n", prop.name );
printf( "Compute capability: %d.%d\n", prop.major, prop.minor );
printf( "Clock rate: %d\n", prop.clockRate );
printf( "Device copy overlap: " );
if (prop.deviceOverlap)
printf( "Enabled\n" );
else
printf( "Disabled\n");
printf( "Kernel execution timeout : " );
if (prop.kernelExecTimeoutEnabled)
printf( "Enabled\n" );
else
printf( "Disabled\n" );
printf( " --- Memory Information for device %d ---\n", i );
printf( "Total global mem: %ld\n", prop.totalGlobalMem );
printf( "Total constant Mem: %ld\n", prop.totalConstMem );
printf( "Max mem pitch: %ld\n", prop.memPitch );
printf( "Texture Alignment: %ld\n", prop.textureAlignment );
printf( " --- MP Information for device %d ---\n", i );
printf( "Multiprocessor count: %d\n",
prop.multiProcessorCount );
printf( "Shared mem per mp: %ld\n", prop.sharedMemPerBlock );
printf( "Registers per mp: %d\n", prop.regsPerBlock );
printf( "Threads in warp: %d\n", prop.warpSize );
printf( "Max threads per block: %d\n",
prop.maxThreadsPerBlock );
printf( "Max thread dimensions: (%d, %d, %d)\n",
prop.maxThreadsDim[0], prop.maxThreadsDim[1],
prop.maxThreadsDim[2] );
printf( "Max grid dimensions: (%d, %d, %d)\n",
prop.maxGridSize[0], prop.maxGridSize[1],
prop.maxGridSize[2] );
printf( "\n" );
}
}
结果
marco@marco-System-Product-Name:~/Tests/chapter03$ nvcc -arch sm_60 enum_gpu.cu -o enum_gpu
marco@marco-System-Product-Name:~/Tests/chapter03$ ./enum_gpu
--- General Information for device 0 ---
Name: GeForce GTX 1050 Ti
Compute capability: 6.1
Clock rate: 1392000
Device copy overlap: Enabled
Kernel execution timeout : Enabled
--- Memory Information for device 0 ---
Total global mem: 4233035776
Total constant Mem: 65536
Max mem pitch: 2147483647
Texture Alignment: 512
--- MP Information for device 0 ---
Multiprocessor count: 6
Shared mem per mp: 49152
Registers per mp: 65536
Threads in warp: 32
Max threads per block: 1024
Max thread dimensions: (1024, 1024, 64)
Max grid dimensions: (2147483647, 65535, 65535)
多GPU情况下如何选择合适的GPU设备运行保证性能呢?
int main( void ) {
cudaDeviceProp prop;
int dev;
HANDLE_ERROR( cudaGetDevice( &dev ) );
printf( "ID of current CUDA device: %d\n", dev );
memset( &prop, 0, sizeof( cudaDeviceProp ) );
prop.major = 1;
prop.minor = 3;
HANDLE_ERROR( cudaChooseDevice( &dev, &prop ) );
printf( "ID of CUDA device closest to revision 1.3: %d\n", dev );
HANDLE_ERROR( cudaSetDevice( dev ) );
}
结果:当然当前只有一个设备满足哈
marco@marco-System-Product-Name:~/Tests/chapter03$ ./set_gpu
ID of current CUDA device: 0
ID of CUDA device closest to revision 1.3: 0