The 1_Utilities folder in the CUDA Samples contains utility tools and small example programs that are typically used to support and demonstrate the functionality of the other CUDA samples.
deviceQuery
This sample queries the properties of the CUDA devices installed in the system. It helps developers learn the details of the CUDA devices available on a system so they can write more efficient CUDA applications. A closely related sample is deviceQueryDrv, which does the same thing; the difference is that deviceQuery uses the CUDA runtime API while deviceQueryDrv uses the CUDA driver API.
The sample's main features are:
- Device enumeration: the program lists all CUDA devices available on the system and shows each device's index, name, compute capability, and other basic information.
- Device properties: for each available CUDA device, the program prints detailed attributes, including:
  - Total global memory size
  - Maximum thread block dimensions
  - Maximum grid dimensions
  - Number of CUDA cores
  - Driver version
  - CUDA runtime version
  - Supported CUDA compute capability
  - Supported CUDA features
- GPU information: the program also prints the GPU's manufacturer, model, architecture, and similar details.
- System information: in addition to the CUDA device information, the program prints basic system information such as the operating system and CPU.
By running the deviceQuery sample, developers can see what CUDA hardware resources are available on a system and optimize their CUDA applications for different hardware configurations, which helps improve both performance and portability.
#include <cuda_runtime.h>
#include <helper_cuda.h>
#include <iostream>
#include <memory>
#include <string>
int *pArgc = NULL;
char **pArgv = NULL;
#if CUDART_VERSION < 5000
// CUDA-C includes
#include <cuda.h>
// This function wraps the CUDA Driver API into a template function
template <class T>
inline void getCudaAttribute(T *attribute, CUdevice_attribute device_attribute,
int device) {
CUresult error = cuDeviceGetAttribute(attribute, device_attribute, device);
if (CUDA_SUCCESS != error) {
fprintf(
stderr,
"cuSafeCallNoSync() Driver API error = %04d from file <%s>, line %i.\n",
error, __FILE__, __LINE__);
exit(EXIT_FAILURE);
}
}
#endif /* CUDART_VERSION < 5000 */
// Program main
int main(int argc, char **argv) {
pArgc = &argc;
pArgv = argv;
printf("%s Starting...\n\n", argv[0]);
printf(
" CUDA Device Query (Runtime API) version (CUDART static linking)\n\n");
int deviceCount = 0;
cudaError_t error_id = cudaGetDeviceCount(&deviceCount);
if (error_id != cudaSuccess) {
printf("cudaGetDeviceCount returned %d\n-> %s\n",
static_cast<int>(error_id), cudaGetErrorString(error_id));
printf("Result = FAIL\n");
exit(EXIT_FAILURE);
}
// This function call returns 0 if there are no CUDA capable devices.
if (deviceCount == 0) {
printf("There are no available device(s) that support CUDA\n");
} else {
printf("Detected %d CUDA Capable device(s)\n", deviceCount);
}
int dev, driverVersion = 0, runtimeVersion = 0;
for (dev = 0; dev < deviceCount; ++dev) {
cudaSetDevice(dev);
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, dev);
printf("\nDevice %d: \"%s\"\n", dev, deviceProp.name);
// Console log
cudaDriverGetVersion(&driverVersion);
cudaRuntimeGetVersion(&runtimeVersion);
printf(" CUDA Driver Version / Runtime Version %d.%d / %d.%d\n",
driverVersion / 1000, (driverVersion % 100) / 10,
runtimeVersion / 1000, (runtimeVersion % 100) / 10);
printf(" CUDA Capability Major/Minor version number: %d.%d\n",
deviceProp.major, deviceProp.minor);
char msg[256];
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(msg, sizeof(msg),
" Total amount of global memory: %.0f MBytes "
"(%llu bytes)\n",
static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
(unsigned long long)deviceProp.totalGlobalMem);
#else
snprintf(msg, sizeof(msg),
" Total amount of global memory: %.0f MBytes "
"(%llu bytes)\n",
static_cast<float>(deviceProp.totalGlobalMem / 1048576.0f),
(unsigned long long)deviceProp.totalGlobalMem);
#endif
printf("%s", msg);
printf(" (%03d) Multiprocessors, (%03d) CUDA Cores/MP: %d CUDA Cores\n",
deviceProp.multiProcessorCount,
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor),
_ConvertSMVer2Cores(deviceProp.major, deviceProp.minor) *
deviceProp.multiProcessorCount);
printf(
" GPU Max Clock rate: %.0f MHz (%0.2f "
"GHz)\n",
deviceProp.clockRate * 1e-3f, deviceProp.clockRate * 1e-6f);
#if CUDART_VERSION >= 5000
// This is supported in CUDA 5.0 (runtime API device properties)
printf(" Memory Clock rate: %.0f Mhz\n",
deviceProp.memoryClockRate * 1e-3f);
printf(" Memory Bus Width: %d-bit\n",
deviceProp.memoryBusWidth);
if (deviceProp.l2CacheSize) {
printf(" L2 Cache Size: %d bytes\n",
deviceProp.l2CacheSize);
}
#else
// This only available in CUDA 4.0-4.2 (but these were only exposed in the
// CUDA Driver API)
int memoryClock;
getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
dev);
printf(" Memory Clock rate: %.0f Mhz\n",
memoryClock * 1e-3f);
int memBusWidth;
getCudaAttribute<int>(&memBusWidth,
CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
printf(" Memory Bus Width: %d-bit\n",
memBusWidth);
int L2CacheSize;
getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);
if (L2CacheSize) {
printf(" L2 Cache Size: %d bytes\n",
L2CacheSize);
}
#endif
printf(
" Maximum Texture Dimension Size (x,y,z) 1D=(%d), 2D=(%d, "
"%d), 3D=(%d, %d, %d)\n",
deviceProp.maxTexture1D, deviceProp.maxTexture2D[0],
deviceProp.maxTexture2D[1], deviceProp.maxTexture3D[0],
deviceProp.maxTexture3D[1], deviceProp.maxTexture3D[2]);
printf(
" Maximum Layered 1D Texture Size, (num) layers 1D=(%d), %d layers\n",
deviceProp.maxTexture1DLayered[0], deviceProp.maxTexture1DLayered[1]);
printf(
" Maximum Layered 2D Texture Size, (num) layers 2D=(%d, %d), %d "
"layers\n",
deviceProp.maxTexture2DLayered[0], deviceProp.maxTexture2DLayered[1],
deviceProp.maxTexture2DLayered[2]);
printf(" Total amount of constant memory: %zu bytes\n",
deviceProp.totalConstMem);
printf(" Total amount of shared memory per block: %zu bytes\n",
deviceProp.sharedMemPerBlock);
printf(" Total shared memory per multiprocessor: %zu bytes\n",
deviceProp.sharedMemPerMultiprocessor);
printf(" Total number of registers available per block: %d\n",
deviceProp.regsPerBlock);
printf(" Warp size: %d\n",
deviceProp.warpSize);
printf(" Maximum number of threads per multiprocessor: %d\n",
deviceProp.maxThreadsPerMultiProcessor);
printf(" Maximum number of threads per block: %d\n",
deviceProp.maxThreadsPerBlock);
printf(" Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
deviceProp.maxThreadsDim[0], deviceProp.maxThreadsDim[1],
deviceProp.maxThreadsDim[2]);
printf(" Max dimension size of a grid size (x,y,z): (%d, %d, %d)\n",
deviceProp.maxGridSize[0], deviceProp.maxGridSize[1],
deviceProp.maxGridSize[2]);
printf(" Maximum memory pitch: %zu bytes\n",
deviceProp.memPitch);
printf(" Texture alignment: %zu bytes\n",
deviceProp.textureAlignment);
printf(
" Concurrent copy and kernel execution: %s with %d copy "
"engine(s)\n",
(deviceProp.deviceOverlap ? "Yes" : "No"), deviceProp.asyncEngineCount);
printf(" Run time limit on kernels: %s\n",
deviceProp.kernelExecTimeoutEnabled ? "Yes" : "No");
printf(" Integrated GPU sharing Host Memory: %s\n",
deviceProp.integrated ? "Yes" : "No");
printf(" Support host page-locked memory mapping: %s\n",
deviceProp.canMapHostMemory ? "Yes" : "No");
printf(" Alignment requirement for Surfaces: %s\n",
deviceProp.surfaceAlignment ? "Yes" : "No");
printf(" Device has ECC support: %s\n",
deviceProp.ECCEnabled ? "Enabled" : "Disabled");
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
printf(" CUDA Device Driver Mode (TCC or WDDM): %s\n",
deviceProp.tccDriver ? "TCC (Tesla Compute Cluster Driver)"
: "WDDM (Windows Display Driver Model)");
#endif
printf(" Device supports Unified Addressing (UVA): %s\n",
deviceProp.unifiedAddressing ? "Yes" : "No");
printf(" Device supports Managed Memory: %s\n",
deviceProp.managedMemory ? "Yes" : "No");
printf(" Device supports Compute Preemption: %s\n",
deviceProp.computePreemptionSupported ? "Yes" : "No");
printf(" Supports Cooperative Kernel Launch: %s\n",
deviceProp.cooperativeLaunch ? "Yes" : "No");
printf(" Supports MultiDevice Co-op Kernel Launch: %s\n",
deviceProp.cooperativeMultiDeviceLaunch ? "Yes" : "No");
printf(" Device PCI Domain ID / Bus ID / location ID: %d / %d / %d\n",
deviceProp.pciDomainID, deviceProp.pciBusID, deviceProp.pciDeviceID);
const char *sComputeMode[] = {
"Default (multiple host threads can use ::cudaSetDevice() with device "
"simultaneously)",
"Exclusive (only one host thread in one process is able to use "
"::cudaSetDevice() with this device)",
"Prohibited (no host thread can use ::cudaSetDevice() with this "
"device)",
"Exclusive Process (many threads in one process is able to use "
"::cudaSetDevice() with this device)",
"Unknown", NULL};
printf(" Compute Mode:\n");
printf(" < %s >\n", sComputeMode[deviceProp.computeMode]);
}
// If there are 2 or more GPUs, query to determine whether RDMA is supported
if (deviceCount >= 2) {
cudaDeviceProp prop[64];
int gpuid[64]; // we want to find the first two GPUs that can support P2P
int gpu_p2p_count = 0;
for (int i = 0; i < deviceCount; i++) {
checkCudaErrors(cudaGetDeviceProperties(&prop[i], i));
// Only boards based on Fermi or later can support P2P
if ((prop[i].major >= 2)
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
// on Windows (64-bit), the Tesla Compute Cluster driver for windows
// must be enabled to support this
&& prop[i].tccDriver
#endif
) {
// This is an array of P2P capable GPUs
gpuid[gpu_p2p_count++] = i;
}
}
// Show all the combinations of support P2P GPUs
int can_access_peer;
if (gpu_p2p_count >= 2) {
for (int i = 0; i < gpu_p2p_count; i++) {
for (int j = 0; j < gpu_p2p_count; j++) {
if (gpuid[i] == gpuid[j]) {
continue;
}
checkCudaErrors(
cudaDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
printf("> Peer access from %s (GPU%d) -> %s (GPU%d) : %s\n",
prop[gpuid[i]].name, gpuid[i], prop[gpuid[j]].name, gpuid[j],
can_access_peer ? "Yes" : "No");
}
}
}
}
// csv masterlog info
// *****************************
// exe and CUDA driver name
printf("\n");
std::string sProfileString = "deviceQuery, CUDA Driver = CUDART";
char cTemp[16];
// driver version
sProfileString += ", CUDA Driver Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(cTemp, 10, "%d.%d", driverVersion / 1000,
(driverVersion % 100) / 10);
#else
snprintf(cTemp, sizeof(cTemp), "%d.%d", driverVersion / 1000,
(driverVersion % 100) / 10);
#endif
sProfileString += cTemp;
// Runtime version
sProfileString += ", CUDA Runtime Version = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(cTemp, 10, "%d.%d", runtimeVersion / 1000,
(runtimeVersion % 100) / 10);
#else
snprintf(cTemp, sizeof(cTemp), "%d.%d", runtimeVersion / 1000,
(runtimeVersion % 100) / 10);
#endif
sProfileString += cTemp;
// Device count
sProfileString += ", NumDevs = ";
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
sprintf_s(cTemp, 10, "%d", deviceCount);
#else
snprintf(cTemp, sizeof(cTemp), "%d", deviceCount);
#endif
sProfileString += cTemp;
sProfileString += "\n";
printf("%s", sProfileString.c_str());
printf("Result = PASS\n");
// finish
exit(EXIT_SUCCESS);
}
Code walkthrough:
1. At the very top of the sample there is a block that handles CUDA versions older than 5.0.
This code accesses CUDA device attributes through the CUDA driver API. That was necessary before CUDA 5.0, because the runtime API did not yet provide comprehensive device-query functions.
The code does the following:
- The preprocessor directive #if CUDART_VERSION < 5000 checks whether the CUDA runtime version is below 5.0, ensuring that this code is only compiled for older CUDA versions.
- Inside the preprocessor block, the necessary CUDA-C header is included, in particular <cuda.h>, which provides access to the CUDA driver API.
- getCudaAttribute is a template function that wraps the driver API function cuDeviceGetAttribute. It takes the following parameters:
  - attribute: a pointer to the variable that receives the attribute value.
  - device_attribute: the specific attribute to query, defined in the CUdevice_attribute enum.
  - device: the index of the CUDA device to query.
- The function checks the CUresult return value of cuDeviceGetAttribute. If it is not CUDA_SUCCESS, the function prints an error message and exits the program.
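On CUDA 5.0 and later the runtime API covers these attributes, but the driver API path still works. A minimal standalone sketch of querying an attribute directly, assuming at least one CUDA device and linking against the driver library (e.g. with -lcuda):

```cpp
#include <cuda.h>
#include <cstdio>

int main() {
  // Unlike the runtime API, the driver API must be initialized explicitly
  cuInit(0);

  CUdevice dev;
  cuDeviceGet(&dev, 0);  // handle for device 0

  // Query a single attribute, e.g. the multiprocessor count
  int smCount = 0;
  cuDeviceGetAttribute(&smCount, CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT,
                       dev);
  printf("Device 0 has %d multiprocessors\n", smCount);
  return 0;
}
```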
2. The main function
It first prints some startup information and then queries the number of CUDA devices. If that query fails, the program prints an error message and exits. If it succeeds, the program prints detailed information for each device in turn, including:
- Device name
- CUDA driver version and runtime version
- CUDA compute capability major and minor version numbers
- Total global memory size
- Number of multiprocessors and total number of CUDA cores
- Maximum GPU clock rate
- On CUDA 5.0 and later, the memory clock rate and memory bus width; if the device has an L2 cache, its size is printed as well.
Next, if two or more GPUs are detected, main goes on to check whether they support peer-to-peer (P2P) access. The program first iterates over all available GPUs and checks their compute capability: only GPUs based on the Fermi architecture or newer can support P2P, and on Windows the Tesla Compute Cluster (TCC) driver must additionally be enabled. The IDs of all P2P-capable GPUs are stored in the gpuid array. The program then checks each pair of GPUs in gpuid: cudaDeviceCanAccessPeer() reports whether direct P2P access is possible between two devices, and the program prints "Yes" or "No" for every pair.
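Querying capability is only the first step; to actually use P2P, access must also be enabled, and it is enabled per direction. A minimal sketch, assuming devices 0 and 1 are both P2P-capable:

```cpp
#include <cuda_runtime.h>
#include <cstdio>

int main() {
  int canAccess = 0;
  cudaDeviceCanAccessPeer(&canAccess, 0, 1);  // can device 0 access device 1?
  printf("GPU0 -> GPU1 peer access: %s\n", canAccess ? "Yes" : "No");

  if (canAccess) {
    cudaSetDevice(0);
    // Enabling is one-directional: kernels running on device 0 may now
    // dereference pointers allocated on device 1. The flags argument
    // must currently be 0.
    cudaDeviceEnablePeerAccess(1, 0);
  }
  return 0;
}
```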
Finally, the program collects the GPU information and records it in CSV form in the master log.
CUDA API notes
The CUDA APIs used by this sample are cudaSetDevice, cudaGetDeviceCount, cudaGetDeviceProperties, cudaDriverGetVersion, and cudaRuntimeGetVersion.
They all belong to the CUDA runtime API.
1. cudaSetDevice
A host thread can set the device it operates on at any time by calling cudaSetDevice(). Device memory allocations and kernel launches are made on the currently set device, and streams and events are created in association with the currently set device. If cudaSetDevice() is never called, the current device is device 0.
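A short sketch of how cudaSetDevice() redirects subsequent work, assuming a machine with at least two CUDA devices:

```cpp
#include <cuda_runtime.h>

__global__ void dummy() {}

int main() {
  float *p0 = nullptr, *p1 = nullptr;

  cudaSetDevice(0);       // device 0 becomes the current device
  cudaMalloc(&p0, 1024);  // allocated on device 0
  dummy<<<1, 1>>>();      // launched on device 0

  cudaSetDevice(1);       // switch the current device
  cudaMalloc(&p1, 1024);  // allocated on device 1
  dummy<<<1, 1>>>();      // launched on device 1

  cudaFree(p1);
  cudaSetDevice(0);       // switch back before freeing device 0's memory
  cudaFree(p0);
  return 0;
}
```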
2. cudaGetDeviceCount
A host system may contain multiple devices. The following code example shows how to enumerate the devices, query their properties, and determine the number of CUDA-enabled devices:
int deviceCount;
cudaGetDeviceCount(&deviceCount);
int device;
for (device = 0; device < deviceCount; ++device) {
cudaDeviceProp deviceProp;
cudaGetDeviceProperties(&deviceProp, device);
printf("Device %d has compute capability %d.%d.\n",
device, deviceProp.major, deviceProp.minor);
}
3. cudaGetDeviceProperties
The cudaGetDeviceProperties function retrieves the properties of a specified CUDA device, including:
- Device name (name)
- Total global memory (totalGlobalMem)
- Shared memory per block (sharedMemPerBlock)
- Registers per block (regsPerBlock)
- Warp size (warpSize)
- Maximum threads per block (maxThreadsPerBlock)
- Maximum grid dimensions (maxGridSize)
- Clock rate (clockRate)
- Total constant memory (totalConstMem)
- Maximum texture dimensions (maxTexture1D, maxTexture2D, maxTexture3D)
- Compute capability (major, minor)
- Maximum thread block dimensions (maxThreadsDim)
- L2 cache size (l2CacheSize)
- Memory bus width (memoryBusWidth)
- Peak memory clock rate (memoryClockRate)
- Fields used to estimate peak arithmetic throughput (multiProcessorCount, clockRate, maxThreadsPerMultiProcessor)
- ECC support (ECCEnabled)
- Unified addressing (UVA) support (unifiedAddressing)
- Concurrent kernel execution support (concurrentKernels)
- Number of asynchronous copy engines (asyncEngineCount)
- Managed (unified) memory support (managedMemory)
(Peer-to-peer access between two devices is queried separately with cudaDeviceCanAccessPeer() rather than read from cudaDeviceProp.)
4. cudaDriverGetVersion
The cudaDriverGetVersion function returns the version number of the CUDA driver installed on the system.
5. cudaRuntimeGetVersion
The cudaRuntimeGetVersion function returns the version number of the CUDA runtime library that the program is using.
Sample output:
CUDA Device Query (Runtime API) version (CUDART static linking)
Detected 1 CUDA Capable device(s)
Device 0: "NVIDIA GeForce RTX 4080"
CUDA Driver Version / Runtime Version 12.5 / 12.5
CUDA Capability Major/Minor version number: 8.9
Total amount of global memory: 16376 MBytes (17170956288 bytes)
(076) Multiprocessors, (128) CUDA Cores/MP: 9728 CUDA Cores
GPU Max Clock rate: 2505 MHz (2.50 GHz)
Memory Clock rate: 11201 Mhz
Memory Bus Width: 256-bit
L2 Cache Size: 67108864 bytes
Maximum Texture Dimension Size (x,y,z) 1D=(131072), 2D=(131072, 65536), 3D=(16384, 16384, 16384)
Maximum Layered 1D Texture Size, (num) layers 1D=(32768), 2048 layers
Maximum Layered 2D Texture Size, (num) layers 2D=(32768, 32768), 2048 layers
Total amount of constant memory: 65536 bytes
Total amount of shared memory per block: 49152 bytes
Total shared memory per multiprocessor: 102400 bytes
Total number of registers available per block: 65536
Warp size: 32
Maximum number of threads per multiprocessor: 1536
Maximum number of threads per block: 1024
Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
Max dimension size of a grid size (x,y,z): (2147483647, 65535, 65535)
Maximum memory pitch: 2147483647 bytes
Texture alignment: 512 bytes
Concurrent copy and kernel execution: Yes with 1 copy engine(s)
Run time limit on kernels: Yes
Integrated GPU sharing Host Memory: No
Support host page-locked memory mapping: Yes
Alignment requirement for Surfaces: Yes
Device has ECC support: Disabled
CUDA Device Driver Mode (TCC or WDDM): WDDM (Windows Display Driver Model)
Device supports Unified Addressing (UVA): Yes
Device supports Managed Memory: Yes
Device supports Compute Preemption: Yes
Supports Cooperative Kernel Launch: Yes
Supports MultiDevice Co-op Kernel Launch: No
Device PCI Domain ID / Bus ID / location ID: 0 / 1 / 0
Compute Mode:
< Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
deviceQuery, CUDA Driver = CUDART, CUDA Driver Version = 12.5, CUDA Runtime Version = 12.5, NumDevs = 1
Result = PASS