1. CUDA运行时API提供了许多用于管理GPU设备的函数。下面的示例使用这些运行时API查询设备信息,代码如下:
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <math.h>
// Exit status reported when the program cannot continue. The (misspelled)
// macro name is kept so that existing references keep compiling; it now maps
// to the portable failure status from <stdlib.h> instead of -1.
#define EXIT_FAULLURE EXIT_FAILURE
// EXIT_SUCCESS is deliberately NOT redefined here: <stdlib.h> already defines
// it as 0. Redefining it to 1 made exit(EXIT_SUCCESS) report a failure status
// to the calling shell and triggered a macro-redefinition warning.
/* Abort with a diagnostic if a CUDA runtime call did not succeed. */
static void checkCudaCall(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "%s returned %d\n -> %s\n", what, (int)status, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

/*
 * Query device 0 through the CUDA runtime API and print its properties.
 *
 * Prints the number of CUDA devices, then (if at least one exists) the
 * driver/runtime versions, compute capability, memory sizes, clock rates,
 * texture limits and launch limits of device 0.
 *
 * Returns 0 on success (including the "no device present" case) and
 * EXIT_FAILURE if any CUDA runtime call fails.
 */
int main(int argc, char **argv)
{
    (void)argc; /* only argv[0] is used */
    printf("%s Starting... \n\n", argv[0]);

    int deviceCount = 0;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
    if (error_id != cudaSuccess)
    {
        printf("cudaGetDeviceCount returned %d\n -> %s\n", (int)error_id, cudaGetErrorString(error_id));
        printf("Result = FAIL\n");
        return EXIT_FAILURE;
    }
    if (deviceCount == 0)
    {
        /* Nothing to query: stop here instead of asking for device 0. */
        printf("There are no available device(s) that support CUDA\n");
        return 0;
    }
    printf("Detected %d CUDA Capable device(s)\n", deviceCount);

    const int dev = 0;
    int driverVersion = 0, runtimeVersion = 0;
    cudaDeviceProp deviceProp;

    checkCudaCall(cudaSetDevice(dev), "cudaSetDevice");
    checkCudaCall(cudaGetDeviceProperties(&deviceProp, dev), "cudaGetDeviceProperties");
    printf("Device %d: \"%s\"\n", dev, deviceProp.name);

    checkCudaCall(cudaDriverGetVersion(&driverVersion), "cudaDriverGetVersion");
    checkCudaCall(cudaRuntimeGetVersion(&runtimeVersion), "cudaRuntimeGetVersion");
    /* Versions are encoded as 1000 * major + 10 * minor; taking % 1000 (not
       % 100) keeps the minor number correct even if it reaches 10 or more. */
    printf("CUDA Driver Version / Runtime Version %d.%d / %d.%d\n",
           driverVersion / 1000, (driverVersion % 1000) / 10,
           runtimeVersion / 1000, (runtimeVersion % 1000) / 10);
    printf("CUDA Capability Major/Minor version number: %d.%d\n",
           deviceProp.major, deviceProp.minor);

    /* The label says MBytes, so divide by 1024^2 (the original divided by
       1024^3, which yields GBytes). */
    printf("Total amount of global memory: %.0f MBytes (%llu bytes)\n",
           (double)deviceProp.totalGlobalMem / 1048576.0,
           (unsigned long long)deviceProp.totalGlobalMem);

    /* The clock fields are scaled by 1e-3 to print MHz and 1e-6 to print GHz. */
    printf("GPU Clock rate: %.0f MHz (%0.2f GHz)\n",
           deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
    printf("Memory Clock rate: %.0f MHz\n",
           deviceProp.memoryClockRate * 1e-3f);
    printf(" Memory Bus Width: %d-bit\n",
           deviceProp.memoryBusWidth);
    if (deviceProp.l2CacheSize)
    {
        printf(" L2 Cache Size: %d bytes\n", deviceProp.l2CacheSize);
    }

    printf(" Max Texture Dimension Size (x,y,z) 1D = (%d), 2D = (%d, %d), 3D = (%d, %d, %d)\n",
           deviceProp.maxTexture1D, deviceProp.maxTexture2D[0], deviceProp.maxTexture2D[1],
           deviceProp.maxTexture3D[0], deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
    printf(" Max Layered Texture Size (dim) x layers 1D=(%d) x %d, 2D=(%d,%d) x %d\n",
           deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1], deviceProp.maxTexture2DLayered[0],
           deviceProp.maxTexture2DLayered[1], deviceProp.maxTexture2DLayered[2]);

    /* size_t fields are printed with %zu; %lu is wrong where long is 32-bit. */
    printf("Total amount of constant memory: %zu bytes\n", deviceProp.totalConstMem);
    printf("Total amount of shared memory per block: %zu bytes\n", deviceProp.sharedMemPerBlock);
    printf("Total number of registers available per block: %d\n", deviceProp.regsPerBlock);
    printf("Warp size: %d\n", deviceProp.warpSize);
    printf("Maximum number of threads per multiprocessor: %d\n", deviceProp.maxThreadsPerMultiProcessor);
    printf("Maximum number of threads per block: %d\n", deviceProp.maxThreadsPerBlock);
    printf("Maximum sizes of each dimension of a block: %d x %d x %d\n",
           deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1], deviceProp.maxThreadsDim[2]);
    printf("Maximum sizes of each dimension of a grid: %d x %d x %d\n",
           deviceProp.maxGridSize[0], deviceProp.maxGridSize[1], deviceProp.maxGridSize[2]);
    printf("Maximum memory pitch %zu bytes\n", deviceProp.memPitch);

    /* Return a literal 0 so success is reported to the shell regardless of
       how EXIT_SUCCESS happens to be defined. */
    return 0;
}
2. 其中, cudaDeviceProp 结构定义如下:
// Abridged listing of the cudaDeviceProp structure. Only a subset of the
// fields is shown; the query program above also reads fields that are not
// listed here (e.g. memoryClockRate, memoryBusWidth, l2CacheSize,
// maxTexture1D/2D/3D, maxThreadsPerMultiProcessor).
struct cudaDeviceProp {
    char name[256];             // ASCII string identifying the device
    size_t totalGlobalMem;      // Total amount of global memory available on the device
    size_t sharedMemPerBlock;   // Maximum shared memory available to a thread block; shared simultaneously by all thread blocks on a multiprocessor
    int regsPerBlock;           // Maximum number of 32-bit registers available to a thread block; shared simultaneously by all thread blocks on a multiprocessor
    int warpSize;               // Warp size, in threads
    size_t memPitch;            // Maximum pitch allowed by the memory-copy functions for memory regions allocated through cudaMallocPitch()
    int maxThreadsPerBlock;     // Maximum number of threads per block
    int maxThreadsDim[3];       // Maximum size of each dimension of a block
    int maxGridSize[3];         // Maximum size of each dimension of a grid
    size_t totalConstMem;       // Total amount of constant memory available on the device
    int major;                  // Major revision number of the device's compute capability
    int minor;                  // Minor revision number of the device's compute capability
    int clockRate;              // Clock frequency
    size_t textureAlignment;    // Alignment requirement: texture base addresses aligned to textureAlignment bytes need no offset applied to texture fetches
    int deviceOverlap;          // 1 if the device can copy memory between host and device while executing a kernel, 0 otherwise
    int multiProcessorCount;    // Number of multiprocessors on the device
};  // the terminating semicolon is required after a struct definition
3. 运行结果如下: