CUDA编程之CUDA Sample-1_Utilities-deviceQueryDrv

CUDA Samples 中 1_Utilities 文件夹里包含了一些实用工具和小型示例程序,它们通常用于支持和演示其他 CUDA 示例程序的功能。

deviceQueryDrv 和 deviceQuery 类似,主要是查询系统上安装的 CUDA 设备的属性。区别是 deviceQuery 使用了 CUDA runtime API,deviceQueryDrv 使用了 CUDA driver API。这个程序可以帮助开发人员了解系统上可用的 CUDA 设备的具体信息,以便编写更高效的 CUDA 应用程序。

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include <cuda.h>
#include <helper_cuda_drvapi.h>


// Program main

// Entry point for deviceQueryDrv: enumerates every CUDA-capable device using
// the CUDA *driver* API (cu* functions from <cuda.h>) and prints its
// properties. When two or more devices are present it additionally reports
// pairwise peer-to-peer (P2P) access capability. checkCudaErrors() and
// getCudaAttribute<T>() come from helper_cuda_drvapi.h; the former aborts on
// any driver-API failure, the latter wraps cuDeviceGetAttribute.
int main(int argc, char **argv) {
  CUdevice dev;                  // device ordinal; CUdevice is an int handle, reused as loop index
  int major = 0, minor = 0;      // compute capability, refreshed per device
  int deviceCount = 0;
  char deviceName[256];

  printf("%s Starting...\n\n", argv[0]);

  // note your project will need to link with cuda.lib files on windows
  printf("CUDA Device Query (Driver API) statically linked version \n");

  // cuInit() must be called before any other driver-API function.
  checkCudaErrors(cuInit(0));

  checkCudaErrors(cuDeviceGetCount(&deviceCount));

  // This function call returns 0 if there are no CUDA capable devices.
  if (deviceCount == 0) {
    printf("There are no available device(s) that support CUDA\n");
  } else {
    printf("Detected %d CUDA Capable device(s)\n", deviceCount);
  }

  // Per-device property dump. No CUDA context is needed: all of these are
  // pure device-attribute queries.
  for (dev = 0; dev < deviceCount; ++dev) {
    checkCudaErrors(cuDeviceGetAttribute(
        &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
    checkCudaErrors(cuDeviceGetAttribute(
        &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));

    checkCudaErrors(cuDeviceGetName(deviceName, 256, dev));

    printf("\nDevice %d: \"%s\"\n", dev, deviceName);

    // Driver version is encoded as 1000*major + 10*minor (e.g. 12050 -> 12.5).
    int driverVersion = 0;
    checkCudaErrors(cuDriverGetVersion(&driverVersion));
    printf("  CUDA Driver Version:                           %d.%d\n",
           driverVersion / 1000, (driverVersion % 100) / 10);
    printf("  CUDA Capability Major/Minor version number:    %d.%d\n", major,
           minor);

    size_t totalGlobalMem;
    checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, dev));

    // SPRINTF is a helper_cuda_drvapi.h macro papering over the
    // sprintf/sprintf_s difference between platforms.
    char msg[256];
    SPRINTF(msg,
            "  Total amount of global memory:                 %.0f MBytes "
            "(%llu bytes)\n",
            (float)totalGlobalMem / 1048576.0f,
            (unsigned long long)totalGlobalMem);
    printf("%s", msg);

    int multiProcessorCount;
    getCudaAttribute<int>(&multiProcessorCount,
                          CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);

    // _ConvertSMVer2CoresDRV maps a compute capability to CUDA cores per SM.
    printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
           multiProcessorCount, _ConvertSMVer2CoresDRV(major, minor),
           _ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount);

    // Clocks are reported by the driver in kHz; converted to MHz/GHz below.
    int clockRate;
    getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
    printf(
        "  GPU Max Clock rate:                            %.0f MHz (%0.2f "
        "GHz)\n",
        clockRate * 1e-3f, clockRate * 1e-6f);
    int memoryClock;
    getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
                          dev);
    printf("  Memory Clock rate:                             %.0f Mhz\n",
           memoryClock * 1e-3f);
    int memBusWidth;
    getCudaAttribute<int>(&memBusWidth,
                          CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
    printf("  Memory Bus Width:                              %d-bit\n",
           memBusWidth);
    int L2CacheSize;
    getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

    // Only printed when non-zero (some devices report no L2 cache).
    if (L2CacheSize) {
      printf("  L2 Cache Size:                                 %d bytes\n",
             L2CacheSize);
    }

    // Texture limits: 1D width, 2D width/height, 3D width/height/depth.
    int maxTex1D, maxTex2D[2], maxTex3D[3];
    getCudaAttribute<int>(&maxTex1D,
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev);
    getCudaAttribute<int>(&maxTex2D[0],
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev);
    getCudaAttribute<int>(&maxTex2D[1],
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev);
    getCudaAttribute<int>(&maxTex3D[0],
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev);
    getCudaAttribute<int>(&maxTex3D[1],
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev);
    getCudaAttribute<int>(&maxTex3D[2],
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev);
    printf(
        "  Max Texture Dimension Sizes                    1D=(%d) 2D=(%d, %d) "
        "3D=(%d, %d, %d)\n",
        maxTex1D, maxTex2D[0], maxTex2D[1], maxTex3D[0], maxTex3D[1],
        maxTex3D[2]);

    // Layered (array) texture limits: extent(s) plus layer count.
    int maxTex1DLayered[2];
    getCudaAttribute<int>(&maxTex1DLayered[0],
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH,
                          dev);
    getCudaAttribute<int>(&maxTex1DLayered[1],
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS,
                          dev);
    printf(
        "  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
        maxTex1DLayered[0], maxTex1DLayered[1]);

    int maxTex2DLayered[3];
    getCudaAttribute<int>(&maxTex2DLayered[0],
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH,
                          dev);
    getCudaAttribute<int>(&maxTex2DLayered[1],
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT,
                          dev);
    getCudaAttribute<int>(&maxTex2DLayered[2],
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS,
                          dev);
    printf(
        "  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d "
        "layers\n",
        maxTex2DLayered[0], maxTex2DLayered[1], maxTex2DLayered[2]);

    // Memory-hierarchy and launch-configuration limits.
    int totalConstantMemory;
    getCudaAttribute<int>(&totalConstantMemory,
                          CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev);
    printf("  Total amount of constant memory:               %u bytes\n",
           totalConstantMemory);
    int sharedMemPerBlock;
    getCudaAttribute<int>(&sharedMemPerBlock,
                          CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev);
    printf("  Total amount of shared memory per block:       %u bytes\n",
           sharedMemPerBlock);
    int regsPerBlock;
    getCudaAttribute<int>(&regsPerBlock,
                          CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
    printf("  Total number of registers available per block: %d\n",
           regsPerBlock);
    int warpSize;
    getCudaAttribute<int>(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
    printf("  Warp size:                                     %d\n", warpSize);
    int maxThreadsPerMultiProcessor;
    getCudaAttribute<int>(&maxThreadsPerMultiProcessor,
                          CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                          dev);
    printf("  Maximum number of threads per multiprocessor:  %d\n",
           maxThreadsPerMultiProcessor);
    int maxThreadsPerBlock;
    getCudaAttribute<int>(&maxThreadsPerBlock,
                          CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
    printf("  Maximum number of threads per block:           %d\n",
           maxThreadsPerBlock);

    int blockDim[3];
    getCudaAttribute<int>(&blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                          dev);
    getCudaAttribute<int>(&blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
                          dev);
    getCudaAttribute<int>(&blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
                          dev);
    printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
           blockDim[0], blockDim[1], blockDim[2]);
    int gridDim[3];
    getCudaAttribute<int>(&gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev);
    getCudaAttribute<int>(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev);
    getCudaAttribute<int>(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev);
    printf("  Max dimension size of a grid size (x,y,z):    (%d, %d, %d)\n",
           gridDim[0], gridDim[1], gridDim[2]);

    int textureAlign;
    getCudaAttribute<int>(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT,
                          dev);
    printf("  Texture alignment:                             %u bytes\n",
           textureAlign);

    int memPitch;
    getCudaAttribute<int>(&memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev);
    printf("  Maximum memory pitch:                          %u bytes\n",
           memPitch);

    // Boolean feature flags (each attribute reads back 0 or 1).
    int gpuOverlap;
    getCudaAttribute<int>(&gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);

    int asyncEngineCount;
    getCudaAttribute<int>(&asyncEngineCount,
                          CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
    printf(
        "  Concurrent copy and kernel execution:          %s with %d copy "
        "engine(s)\n",
        (gpuOverlap ? "Yes" : "No"), asyncEngineCount);

    int kernelExecTimeoutEnabled;
    getCudaAttribute<int>(&kernelExecTimeoutEnabled,
                          CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
    printf("  Run time limit on kernels:                     %s\n",
           kernelExecTimeoutEnabled ? "Yes" : "No");
    int integrated;
    getCudaAttribute<int>(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
    printf("  Integrated GPU sharing Host Memory:            %s\n",
           integrated ? "Yes" : "No");
    int canMapHostMemory;
    getCudaAttribute<int>(&canMapHostMemory,
                          CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
    printf("  Support host page-locked memory mapping:       %s\n",
           canMapHostMemory ? "Yes" : "No");

    int concurrentKernels;
    getCudaAttribute<int>(&concurrentKernels,
                          CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
    printf("  Concurrent kernel execution:                   %s\n",
           concurrentKernels ? "Yes" : "No");

    int surfaceAlignment;
    getCudaAttribute<int>(&surfaceAlignment,
                          CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev);
    printf("  Alignment requirement for Surfaces:            %s\n",
           surfaceAlignment ? "Yes" : "No");

    int eccEnabled;
    getCudaAttribute<int>(&eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev);
    printf("  Device has ECC support:                        %s\n",
           eccEnabled ? "Enabled" : "Disabled");

// TCC vs. WDDM driver mode only exists on Windows.
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    int tccDriver;
    getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev);
    printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n",
           tccDriver ? "TCC (Tesla Compute Cluster Driver)"
                     : "WDDM (Windows Display Driver Model)");
#endif

    int unifiedAddressing;
    getCudaAttribute<int>(&unifiedAddressing,
                          CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
    printf("  Device supports Unified Addressing (UVA):      %s\n",
           unifiedAddressing ? "Yes" : "No");

    int managedMemory;
    getCudaAttribute<int>(&managedMemory, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY,
                          dev);
    printf("  Device supports Managed Memory:                %s\n",
           managedMemory ? "Yes" : "No");

    int computePreemption;
    getCudaAttribute<int>(&computePreemption,
                          CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED,
                          dev);
    printf("  Device supports Compute Preemption:            %s\n",
           computePreemption ? "Yes" : "No");

    int cooperativeLaunch;
    getCudaAttribute<int>(&cooperativeLaunch,
                          CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev);
    printf("  Supports Cooperative Kernel Launch:            %s\n",
           cooperativeLaunch ? "Yes" : "No");

    int cooperativeMultiDevLaunch;
    getCudaAttribute<int>(&cooperativeMultiDevLaunch,
                          CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH,
                          dev);
    printf("  Supports MultiDevice Co-op Kernel Launch:      %s\n",
           cooperativeMultiDevLaunch ? "Yes" : "No");

    int pciDomainID, pciBusID, pciDeviceID;
    getCudaAttribute<int>(&pciDomainID, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev);
    getCudaAttribute<int>(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
    getCudaAttribute<int>(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
    printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n",
           pciDomainID, pciBusID, pciDeviceID);

    // Human-readable strings for CUcomputemode values 0..3, with a trailing
    // "Unknown" guard entry.
    const char *sComputeMode[] = {
        "Default (multiple host threads can use ::cudaSetDevice() with device "
        "simultaneously)",
        "Exclusive (only one host thread in one process is able to use "
        "::cudaSetDevice() with this device)",
        "Prohibited (no host thread can use ::cudaSetDevice() with this "
        "device)",
        "Exclusive Process (many threads in one process is able to use "
        "::cudaSetDevice() with this device)",
        "Unknown", NULL};

    int computeMode;
    getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
    printf("  Compute Mode:\n");
    // NOTE(review): computeMode indexes sComputeMode directly; assumes the
    // driver never returns a value > 4 — confirm if new modes are added.
    printf("     < %s >\n", sComputeMode[computeMode]);
  }

  // If there are 2 or more GPUs, query to determine whether RDMA is supported
  if (deviceCount >= 2) {
    int gpuid[64];  // we want to find the first two GPUs that can support P2P
    int gpu_p2p_count = 0;
    int tccDriver = 0;

    // Collect the ordinals of all P2P-capable devices.
    for (int i = 0; i < deviceCount; i++) {
      checkCudaErrors(cuDeviceGetAttribute(
          &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, i));
      checkCudaErrors(cuDeviceGetAttribute(
          &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, i));
      getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, i);

      // Only boards based on Fermi or later can support P2P
      if ((major >= 2)
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
          // on Windows (64-bit), the Tesla Compute Cluster driver for windows
          // must be enabled to support this
          && tccDriver
#endif
          ) {
        // This is an array of P2P capable GPUs
        gpuid[gpu_p2p_count++] = i;
      }
    }

    // Show all the combinations of support P2P GPUs
    int can_access_peer;
    char deviceName0[256], deviceName1[256];

    if (gpu_p2p_count >= 2) {
      // Query every ordered pair (access is directional, so both i->j and
      // j->i are reported).
      for (int i = 0; i < gpu_p2p_count; i++) {
        for (int j = 0; j < gpu_p2p_count; j++) {
          if (gpuid[i] == gpuid[j]) {
            continue;
          }
          checkCudaErrors(
              cuDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
          checkCudaErrors(cuDeviceGetName(deviceName0, 256, gpuid[i]));
          checkCudaErrors(cuDeviceGetName(deviceName1, 256, gpuid[j]));
          printf(
              "> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : "
              "%s\n",
              deviceName0, gpuid[i], deviceName1, gpuid[j],
              can_access_peer ? "Yes" : "No");
        }
      }
    }
  }

  // Note: "PASS" is printed even when deviceCount == 0 — this sample only
  // fails via checkCudaErrors() aborting on a driver-API error.
  printf("Result = PASS\n");

  exit(EXIT_SUCCESS);
}

 代码详解:

程序会依次输出每个设备的详细信息,包括:

  • 设备数量和设备属性:

    • 设备数量
    • 每个设备的名称
    • 每个设备的计算能力主版本号和次版本号
    • 每个设备的总内存大小
  • 设备功能属性:

    • 每个设备最大的线程数、线程块数和网格大小
    • 每个设备支持的最大纹理维度和最大纹理数组大小
    • 每个设备支持的最大常量内存大小和寄存器数量
    • 每个设备支持的 ECC 错误检查和纠正
  • 设备内存相关属性:

    • 每个设备的global内存带宽
    • 每个设备的shared内存每个块的大小
    • 每个设备的constant内存大小
  • 设备计算能力相关属性:

    • 每个设备的计算能力主/次版本号
    • 每个设备的 warp 大小
    • 每个设备的最大线程维度
    • 每个设备的最大网格维度

CUDA  相关API解读

该Sample使用的CUDA API: cuInit, cuDeviceGetCount, cuDeviceGetName, cuDriverGetVersion, cuDeviceTotalMem, cuDeviceGetAttribute, cuDeviceCanAccessPeer(代码通过 cuDeviceGetAttribute 查询计算能力,而非已弃用的 cuDeviceComputeCapability)

这些API属于CUDA driver API。

1.cuInit

需要先使用 cuInit() 函数初始化driverAPI,然后才能调用driver API 中的任何函数。

接下来必须创建一个与特定设备关联的 CUDA 上下文(context),并将其设置为当前调用线程的活跃上下文。

  1. 首先调用 cuInit() 函数来初始化驱动 API。
  2. 然后创建一个与特定设备关联的 CUDA 上下文。
  3. 将创建的 CUDA 上下文设置为当前调用线程的活跃上下文。
  4. 之后就可以开始使用驱动 API 中的函数了。

这个初始化和上下文设置步骤是在通过驱动 API 与 CUDA 设备进行交互之前必须完成的前提条件。

在 CUDA 驱动 API 中,内核(kernels)需要由主机(host)代码显式地以 PTX 或二进制对象的形式加载,因此,用 C++ 编写的内核必须单独编译成 PTX 或二进制对象。内核的启动使用 API 入口点(如 cuLaunchKernel)。

任何想要在未来设备架构上运行的应用程序都必须加载 PTX 代码,而不是二进制代码。这是因为二进制代码是特定于架构的,因此与未来的架构不兼容,而 PTX 代码则是在加载时由设备驱动程序编译成二进制代码的。

以下是使用driver API 编写的内核示例的主机代码:

// Illustrative host-side driver-API workflow (vector addition), showing the
// canonical sequence: cuInit -> device handle -> context -> module load ->
// device alloc/copy -> cuLaunchKernel. The "..." lines are placeholders
// elided by the article; this listing is not meant to compile as-is and
// omits error checking and cleanup (cuMemFree / cuCtxDestroy / free).
int main()
{
    int N = ...;
    size_t size = N * sizeof(float);

    // Allocate input vectors h_A and h_B in host memory
    float* h_A = (float*)malloc(size);
    float* h_B = (float*)malloc(size);

    // Initialize input vectors
    ...

    // Initialize
    cuInit(0);

    // Get number of devices supporting CUDA
    int deviceCount = 0;
    cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        printf("There is no device supporting CUDA.\n");
        exit (0);
    }

    // Get handle for device 0
    CUdevice cuDevice;
    cuDeviceGet(&cuDevice, 0);

    // Create context
    CUcontext cuContext;
    cuCtxCreate(&cuContext, 0, cuDevice);

    // Create module from binary file
    CUmodule cuModule;
    cuModuleLoad(&cuModule, "VecAdd.ptx");

    // Allocate vectors in device memory
    CUdeviceptr d_A;
    cuMemAlloc(&d_A, size);
    CUdeviceptr d_B;
    cuMemAlloc(&d_B, size);
    CUdeviceptr d_C;
    cuMemAlloc(&d_C, size);

    // Copy vectors from host memory to device memory
    cuMemcpyHtoD(d_A, h_A, size);
    cuMemcpyHtoD(d_B, h_B, size);

    // Get function handle from module
    CUfunction vecAdd;
    cuModuleGetFunction(&vecAdd, cuModule, "VecAdd");

    // Invoke kernel
    int threadsPerBlock = 256;
    int blocksPerGrid =
            (N + threadsPerBlock - 1) / threadsPerBlock;
    // Kernel arguments are passed as an array of pointers to each argument.
    void* args[] = { &d_A, &d_B, &d_C, &N };
    // Parameters: grid (x,y,z), block (x,y,z), sharedMemBytes, stream,
    // kernel params, extra options.
    cuLaunchKernel(vecAdd,
                   blocksPerGrid, 1, 1, threadsPerBlock, 1, 1,
                   0, 0, args, 0);

    ...
}

2.  cuDeviceGetCount 

该API用于获取系统中可用的 CUDA 设备的数量。

3. cuDeviceComputeCapability

该函数用于获取指定 CUDA 设备的计算能力。(注:该 API 已被弃用;本示例实际是通过 cuDeviceGetAttribute 查询 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR/MINOR 属性来获取计算能力的。)

4.cuDriverGetVersion

函数用于获取当前系统上安装的 CUDA driver的版本号。

5.cuDeviceTotalMem 

该函数用于获取指定 CUDA 设备的总内存大小

6. cuDeviceGetAttribute

 该函数用于获取指定 CUDA 设备的属性值。

除此之外,代码中大量调用 getCudaAttribute 函数,该函数是被包装在 helper_cuda_drvapi.h 里的模板函数,实际调用的是 cuDeviceGetAttribute API。

运行结果:

CUDA Device Query (Driver API) statically linked version
Detected 1 CUDA Capable device(s)

Device 0: "NVIDIA GeForce RTX 4080"
  CUDA Driver Version:                           12.5
  CUDA Capability Major/Minor version number:    8.9
  Total amount of global memory:                 16376 MBytes (17170956288 bytes)
  (76) Multiprocessors, (128) CUDA Cores/MP:     9728 CUDA Cores
  GPU Max Clock rate:                            2505 MHz (2.50 GHz)
  Memory Clock rate:                             11201 Mhz
  Memory Bus Width:                              256-bit
  L2 Cache Size:                                 67108864 bytes
  Max Texture Dimension Sizes                    1D=(131072) 2D=(131072, 65536) 3D=(16384, 16384, 16384)
  Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  1536
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size (x,y,z):    (2147483647, 65535, 65535)
  Texture alignment:                             512 bytes
  Maximum memory pitch:                          2147483647 bytes
  Concurrent copy and kernel execution:          Yes with 1 copy engine(s)
  Run time limit on kernels:                     Yes
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Concurrent kernel execution:                   Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  CUDA Device Driver Mode (TCC or WDDM):         WDDM (Windows Display Driver Model)
  Device supports Unified Addressing (UVA):      Yes
  Device supports Managed Memory:                Yes
  Device supports Compute Preemption:            Yes
  Supports Cooperative Kernel Launch:            Yes
  Supports MultiDevice Co-op Kernel Launch:      No
  Device PCI Domain ID / Bus ID / location ID:   0 / 1 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
Result = PASS

  • 24
    点赞
  • 15
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值