CUDA编程之CUDA Sample-1_Utilities-deviceQueryDrv

CUDA Samples 中 1_Utilities 文件夹里包含了一些实用工具和小型示例程序,它们通常用于支持和演示其他 CUDA 示例程序的功能。

deviceQueryDrv 和 deviceQuery 类似,主要是查询系统上安装的 CUDA 设备的属性。区别是 deviceQuery 使用了 CUDA runtime API,deviceQueryDrv 使用了 CUDA driver API。这个程序可以帮助开发人员了解系统上可用的 CUDA 设备的具体信息,以便编写更高效的 CUDA 应用程序。

// includes, system
#include <stdlib.h>
#include <stdio.h>
#include <string.h>

#include <cuda.h>
#include <helper_cuda_drvapi.h>


// Program main

// Entry point for deviceQueryDrv: enumerates every CUDA-capable device using
// the CUDA *driver* API (cu* functions from <cuda.h>) and prints its
// properties. When two or more devices are present it additionally reports
// pairwise peer-to-peer (P2P) access capability. checkCudaErrors() and
// getCudaAttribute<T>() come from helper_cuda_drvapi.h; the former aborts on
// any driver-API failure, the latter wraps cuDeviceGetAttribute.
int main(int argc, char **argv) {
  CUdevice dev;                  // device ordinal; CUdevice is an int handle, reused as loop index
  int major = 0, minor = 0;      // compute capability, refreshed per device
  int deviceCount = 0;
  char deviceName[256];

  printf("%s Starting...\n\n", argv[0]);

  // note your project will need to link with cuda.lib files on windows
  printf("CUDA Device Query (Driver API) statically linked version \n");

  // cuInit() must be called before any other driver-API function.
  checkCudaErrors(cuInit(0));

  checkCudaErrors(cuDeviceGetCount(&deviceCount));

  // This function call returns 0 if there are no CUDA capable devices.
  if (deviceCount == 0) {
    printf("There are no available device(s) that support CUDA\n");
  } else {
    printf("Detected %d CUDA Capable device(s)\n", deviceCount);
  }

  // Per-device property dump. No CUDA context is needed: all of these are
  // pure device-attribute queries.
  for (dev = 0; dev < deviceCount; ++dev) {
    checkCudaErrors(cuDeviceGetAttribute(
        &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, dev));
    checkCudaErrors(cuDeviceGetAttribute(
        &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, dev));

    checkCudaErrors(cuDeviceGetName(deviceName, 256, dev));

    printf("\nDevice %d: \"%s\"\n", dev, deviceName);

    // Driver version is encoded as 1000*major + 10*minor (e.g. 12050 -> 12.5).
    int driverVersion = 0;
    checkCudaErrors(cuDriverGetVersion(&driverVersion));
    printf("  CUDA Driver Version:                           %d.%d\n",
           driverVersion / 1000, (driverVersion % 100) / 10);
    printf("  CUDA Capability Major/Minor version number:    %d.%d\n", major,
           minor);

    size_t totalGlobalMem;
    checkCudaErrors(cuDeviceTotalMem(&totalGlobalMem, dev));

    // SPRINTF is a helper_cuda_drvapi.h macro papering over the
    // sprintf/sprintf_s difference between platforms.
    char msg[256];
    SPRINTF(msg,
            "  Total amount of global memory:                 %.0f MBytes "
            "(%llu bytes)\n",
            (float)totalGlobalMem / 1048576.0f,
            (unsigned long long)totalGlobalMem);
    printf("%s", msg);

    int multiProcessorCount;
    getCudaAttribute<int>(&multiProcessorCount,
                          CU_DEVICE_ATTRIBUTE_MULTIPROCESSOR_COUNT, dev);

    // _ConvertSMVer2CoresDRV maps a compute capability to CUDA cores per SM.
    printf("  (%2d) Multiprocessors, (%3d) CUDA Cores/MP:     %d CUDA Cores\n",
           multiProcessorCount, _ConvertSMVer2CoresDRV(major, minor),
           _ConvertSMVer2CoresDRV(major, minor) * multiProcessorCount);

    // Clocks are reported by the driver in kHz; converted to MHz/GHz below.
    int clockRate;
    getCudaAttribute<int>(&clockRate, CU_DEVICE_ATTRIBUTE_CLOCK_RATE, dev);
    printf(
        "  GPU Max Clock rate:                            %.0f MHz (%0.2f "
        "GHz)\n",
        clockRate * 1e-3f, clockRate * 1e-6f);
    int memoryClock;
    getCudaAttribute<int>(&memoryClock, CU_DEVICE_ATTRIBUTE_MEMORY_CLOCK_RATE,
                          dev);
    printf("  Memory Clock rate:                             %.0f Mhz\n",
           memoryClock * 1e-3f);
    int memBusWidth;
    getCudaAttribute<int>(&memBusWidth,
                          CU_DEVICE_ATTRIBUTE_GLOBAL_MEMORY_BUS_WIDTH, dev);
    printf("  Memory Bus Width:                              %d-bit\n",
           memBusWidth);
    int L2CacheSize;
    getCudaAttribute<int>(&L2CacheSize, CU_DEVICE_ATTRIBUTE_L2_CACHE_SIZE, dev);

    // Only printed when non-zero (some devices report no L2 cache).
    if (L2CacheSize) {
      printf("  L2 Cache Size:                                 %d bytes\n",
             L2CacheSize);
    }

    // Texture limits: 1D width, 2D width/height, 3D width/height/depth.
    int maxTex1D, maxTex2D[2], maxTex3D[3];
    getCudaAttribute<int>(&maxTex1D,
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_WIDTH, dev);
    getCudaAttribute<int>(&maxTex2D[0],
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_WIDTH, dev);
    getCudaAttribute<int>(&maxTex2D[1],
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_HEIGHT, dev);
    getCudaAttribute<int>(&maxTex3D[0],
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_WIDTH, dev);
    getCudaAttribute<int>(&maxTex3D[1],
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_HEIGHT, dev);
    getCudaAttribute<int>(&maxTex3D[2],
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE3D_DEPTH, dev);
    printf(
        "  Max Texture Dimension Sizes                    1D=(%d) 2D=(%d, %d) "
        "3D=(%d, %d, %d)\n",
        maxTex1D, maxTex2D[0], maxTex2D[1], maxTex3D[0], maxTex3D[1],
        maxTex3D[2]);

    // Layered (array) texture limits: extent(s) plus layer count.
    int maxTex1DLayered[2];
    getCudaAttribute<int>(&maxTex1DLayered[0],
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_WIDTH,
                          dev);
    getCudaAttribute<int>(&maxTex1DLayered[1],
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE1D_LAYERED_LAYERS,
                          dev);
    printf(
        "  Maximum Layered 1D Texture Size, (num) layers  1D=(%d), %d layers\n",
        maxTex1DLayered[0], maxTex1DLayered[1]);

    int maxTex2DLayered[3];
    getCudaAttribute<int>(&maxTex2DLayered[0],
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_WIDTH,
                          dev);
    getCudaAttribute<int>(&maxTex2DLayered[1],
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_HEIGHT,
                          dev);
    getCudaAttribute<int>(&maxTex2DLayered[2],
                          CU_DEVICE_ATTRIBUTE_MAXIMUM_TEXTURE2D_LAYERED_LAYERS,
                          dev);
    printf(
        "  Maximum Layered 2D Texture Size, (num) layers  2D=(%d, %d), %d "
        "layers\n",
        maxTex2DLayered[0], maxTex2DLayered[1], maxTex2DLayered[2]);

    // Memory-hierarchy and launch-configuration limits.
    int totalConstantMemory;
    getCudaAttribute<int>(&totalConstantMemory,
                          CU_DEVICE_ATTRIBUTE_TOTAL_CONSTANT_MEMORY, dev);
    printf("  Total amount of constant memory:               %u bytes\n",
           totalConstantMemory);
    int sharedMemPerBlock;
    getCudaAttribute<int>(&sharedMemPerBlock,
                          CU_DEVICE_ATTRIBUTE_MAX_SHARED_MEMORY_PER_BLOCK, dev);
    printf("  Total amount of shared memory per block:       %u bytes\n",
           sharedMemPerBlock);
    int regsPerBlock;
    getCudaAttribute<int>(&regsPerBlock,
                          CU_DEVICE_ATTRIBUTE_MAX_REGISTERS_PER_BLOCK, dev);
    printf("  Total number of registers available per block: %d\n",
           regsPerBlock);
    int warpSize;
    getCudaAttribute<int>(&warpSize, CU_DEVICE_ATTRIBUTE_WARP_SIZE, dev);
    printf("  Warp size:                                     %d\n", warpSize);
    int maxThreadsPerMultiProcessor;
    getCudaAttribute<int>(&maxThreadsPerMultiProcessor,
                          CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_MULTIPROCESSOR,
                          dev);
    printf("  Maximum number of threads per multiprocessor:  %d\n",
           maxThreadsPerMultiProcessor);
    int maxThreadsPerBlock;
    getCudaAttribute<int>(&maxThreadsPerBlock,
                          CU_DEVICE_ATTRIBUTE_MAX_THREADS_PER_BLOCK, dev);
    printf("  Maximum number of threads per block:           %d\n",
           maxThreadsPerBlock);

    int blockDim[3];
    getCudaAttribute<int>(&blockDim[0], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_X,
                          dev);
    getCudaAttribute<int>(&blockDim[1], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Y,
                          dev);
    getCudaAttribute<int>(&blockDim[2], CU_DEVICE_ATTRIBUTE_MAX_BLOCK_DIM_Z,
                          dev);
    printf("  Max dimension size of a thread block (x,y,z): (%d, %d, %d)\n",
           blockDim[0], blockDim[1], blockDim[2]);
    int gridDim[3];
    getCudaAttribute<int>(&gridDim[0], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_X, dev);
    getCudaAttribute<int>(&gridDim[1], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Y, dev);
    getCudaAttribute<int>(&gridDim[2], CU_DEVICE_ATTRIBUTE_MAX_GRID_DIM_Z, dev);
    printf("  Max dimension size of a grid size (x,y,z):    (%d, %d, %d)\n",
           gridDim[0], gridDim[1], gridDim[2]);

    int textureAlign;
    getCudaAttribute<int>(&textureAlign, CU_DEVICE_ATTRIBUTE_TEXTURE_ALIGNMENT,
                          dev);
    printf("  Texture alignment:                             %u bytes\n",
           textureAlign);

    int memPitch;
    getCudaAttribute<int>(&memPitch, CU_DEVICE_ATTRIBUTE_MAX_PITCH, dev);
    printf("  Maximum memory pitch:                          %u bytes\n",
           memPitch);

    // Boolean feature flags (each attribute reads back 0 or 1).
    int gpuOverlap;
    getCudaAttribute<int>(&gpuOverlap, CU_DEVICE_ATTRIBUTE_GPU_OVERLAP, dev);

    int asyncEngineCount;
    getCudaAttribute<int>(&asyncEngineCount,
                          CU_DEVICE_ATTRIBUTE_ASYNC_ENGINE_COUNT, dev);
    printf(
        "  Concurrent copy and kernel execution:          %s with %d copy "
        "engine(s)\n",
        (gpuOverlap ? "Yes" : "No"), asyncEngineCount);

    int kernelExecTimeoutEnabled;
    getCudaAttribute<int>(&kernelExecTimeoutEnabled,
                          CU_DEVICE_ATTRIBUTE_KERNEL_EXEC_TIMEOUT, dev);
    printf("  Run time limit on kernels:                     %s\n",
           kernelExecTimeoutEnabled ? "Yes" : "No");
    int integrated;
    getCudaAttribute<int>(&integrated, CU_DEVICE_ATTRIBUTE_INTEGRATED, dev);
    printf("  Integrated GPU sharing Host Memory:            %s\n",
           integrated ? "Yes" : "No");
    int canMapHostMemory;
    getCudaAttribute<int>(&canMapHostMemory,
                          CU_DEVICE_ATTRIBUTE_CAN_MAP_HOST_MEMORY, dev);
    printf("  Support host page-locked memory mapping:       %s\n",
           canMapHostMemory ? "Yes" : "No");

    int concurrentKernels;
    getCudaAttribute<int>(&concurrentKernels,
                          CU_DEVICE_ATTRIBUTE_CONCURRENT_KERNELS, dev);
    printf("  Concurrent kernel execution:                   %s\n",
           concurrentKernels ? "Yes" : "No");

    int surfaceAlignment;
    getCudaAttribute<int>(&surfaceAlignment,
                          CU_DEVICE_ATTRIBUTE_SURFACE_ALIGNMENT, dev);
    printf("  Alignment requirement for Surfaces:            %s\n",
           surfaceAlignment ? "Yes" : "No");

    int eccEnabled;
    getCudaAttribute<int>(&eccEnabled, CU_DEVICE_ATTRIBUTE_ECC_ENABLED, dev);
    printf("  Device has ECC support:                        %s\n",
           eccEnabled ? "Enabled" : "Disabled");

// TCC vs. WDDM driver mode only exists on Windows.
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
    int tccDriver;
    getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, dev);
    printf("  CUDA Device Driver Mode (TCC or WDDM):         %s\n",
           tccDriver ? "TCC (Tesla Compute Cluster Driver)"
                     : "WDDM (Windows Display Driver Model)");
#endif

    int unifiedAddressing;
    getCudaAttribute<int>(&unifiedAddressing,
                          CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, dev);
    printf("  Device supports Unified Addressing (UVA):      %s\n",
           unifiedAddressing ? "Yes" : "No");

    int managedMemory;
    getCudaAttribute<int>(&managedMemory, CU_DEVICE_ATTRIBUTE_MANAGED_MEMORY,
                          dev);
    printf("  Device supports Managed Memory:                %s\n",
           managedMemory ? "Yes" : "No");

    int computePreemption;
    getCudaAttribute<int>(&computePreemption,
                          CU_DEVICE_ATTRIBUTE_COMPUTE_PREEMPTION_SUPPORTED,
                          dev);
    printf("  Device supports Compute Preemption:            %s\n",
           computePreemption ? "Yes" : "No");

    int cooperativeLaunch;
    getCudaAttribute<int>(&cooperativeLaunch,
                          CU_DEVICE_ATTRIBUTE_COOPERATIVE_LAUNCH, dev);
    printf("  Supports Cooperative Kernel Launch:            %s\n",
           cooperativeLaunch ? "Yes" : "No");

    int cooperativeMultiDevLaunch;
    getCudaAttribute<int>(&cooperativeMultiDevLaunch,
                          CU_DEVICE_ATTRIBUTE_COOPERATIVE_MULTI_DEVICE_LAUNCH,
                          dev);
    printf("  Supports MultiDevice Co-op Kernel Launch:      %s\n",
           cooperativeMultiDevLaunch ? "Yes" : "No");

    int pciDomainID, pciBusID, pciDeviceID;
    getCudaAttribute<int>(&pciDomainID, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, dev);
    getCudaAttribute<int>(&pciBusID, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, dev);
    getCudaAttribute<int>(&pciDeviceID, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, dev);
    printf("  Device PCI Domain ID / Bus ID / location ID:   %d / %d / %d\n",
           pciDomainID, pciBusID, pciDeviceID);

    // Human-readable strings for CUcomputemode values 0..3, with a trailing
    // "Unknown" guard entry.
    const char *sComputeMode[] = {
        "Default (multiple host threads can use ::cudaSetDevice() with device "
        "simultaneously)",
        "Exclusive (only one host thread in one process is able to use "
        "::cudaSetDevice() with this device)",
        "Prohibited (no host thread can use ::cudaSetDevice() with this "
        "device)",
        "Exclusive Process (many threads in one process is able to use "
        "::cudaSetDevice() with this device)",
        "Unknown", NULL};

    int computeMode;
    getCudaAttribute<int>(&computeMode, CU_DEVICE_ATTRIBUTE_COMPUTE_MODE, dev);
    printf("  Compute Mode:\n");
    // NOTE(review): computeMode indexes sComputeMode directly; assumes the
    // driver never returns a value > 4 — confirm if new modes are added.
    printf("     < %s >\n", sComputeMode[computeMode]);
  }

  // If there are 2 or more GPUs, query to determine whether RDMA is supported
  if (deviceCount >= 2) {
    int gpuid[64];  // we want to find the first two GPUs that can support P2P
    int gpu_p2p_count = 0;
    int tccDriver = 0;

    // Collect the ordinals of all P2P-capable devices.
    for (int i = 0; i < deviceCount; i++) {
      checkCudaErrors(cuDeviceGetAttribute(
          &major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, i));
      checkCudaErrors(cuDeviceGetAttribute(
          &minor, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MINOR, i));
      getCudaAttribute<int>(&tccDriver, CU_DEVICE_ATTRIBUTE_TCC_DRIVER, i);

      // Only boards based on Fermi or later can support P2P
      if ((major >= 2)
#if defined(WIN32) || defined(_WIN32) || defined(WIN64) || defined(_WIN64)
          // on Windows (64-bit), the Tesla Compute Cluster driver for windows
          // must be enabled to support this
          && tccDriver
#endif
          ) {
        // This is an array of P2P capable GPUs
        gpuid[gpu_p2p_count++] = i;
      }
    }

    // Show all the combinations of support P2P GPUs
    int can_access_peer;
    char deviceName0[256], deviceName1[256];

    if (gpu_p2p_count >= 2) {
      // Query every ordered pair (access is directional, so both i->j and
      // j->i are reported).
      for (int i = 0; i < gpu_p2p_count; i++) {
        for (int j = 0; j < gpu_p2p_count; j++) {
          if (gpuid[i] == gpuid[j]) {
            continue;
          }
          checkCudaErrors(
              cuDeviceCanAccessPeer(&can_access_peer, gpuid[i], gpuid[j]));
          checkCudaErrors(cuDeviceGetName(deviceName0, 256, gpuid[i]));
          checkCudaErrors(cuDeviceGetName(deviceName1, 256, gpuid[j]));
          printf(
              "> Peer-to-Peer (P2P) access from %s (GPU%d) -> %s (GPU%d) : "
              "%s\n",
              deviceName0, gpuid[i], deviceName1, gpuid[j],
              can_access_peer ? "Yes" : "No");
        }
      }
    }
  }

  // Note: "PASS" is printed even when deviceCount == 0 — this sample only
  // fails via checkCudaErrors() aborting on a driver-API error.
  printf("Result = PASS\n");

  exit(EXIT_SUCCESS);
}

 代码详解:

程序会依次输出每个设备的详细信息,包括:

  • 设备数量和设备属性:

    • 设备数量
    • 每个设备的名称
    • 每个设备的计算能力主版本号和次版本号
    • 每个设备的总内存大小
  • 设备功能属性:

    • 每个设备最大的线程数、线程块数和网格大小
    • 每个设备支持的最大纹理维度和最大纹理数组大小
    • 每个设备支持的最大常量内存大小和寄存器数量
    • 每个设备支持的 ECC 错误检查和纠正
  • 设备内存相关属性:

    • 每个设备的global内存带宽
    • 每个设备的shared内存每个块的大小
    • 每个设备的constant内存大小
  • 设备计算能力相关属性:

    • 每个设备的计算能力主/次版本号
    • 每个设备的 warp 大小
    • 每个设备的最大线程维度
    • 每个设备的最大网格维度

CUDA  相关API解读

该Sample使用的CUDA API: cuInit, cuDeviceGetCount, cuDeviceGetName, cuDriverGetVersion, cuDeviceTotalMem, cuDeviceGetAttribute, cuDeviceCanAccessPeer(代码通过 cuDeviceGetAttribute 查询计算能力,而非已弃用的 cuDeviceComputeCapability)

这些API属于CUDA driver API。

1.cuInit

需要先使用 cuInit() 函数初始化driverAPI,然后才能调用driver API 中的任何函数。

接下来必须创建一个与特定设备关联的 CUDA 上下文(context),并将其设置为当前调用线程的活跃上下文。

  1. 首先调用 cuInit() 函数来初始化驱动 API。
  2. 然后创建一个与特定设备关联的 CUDA 上下文。
  3. 将创建的 CUDA 上下文设置为当前调用线程的活跃上下文。
  4. 之后就可以开始使用驱动 API 中的函数了。

这个初始化和上下文设置步骤是在通过驱动 API 与 CUDA 设备进行交互之前必须完成的前提条件。

在 CUDA 驱动 API 中,内核(kernels)需要由主机(host)代码显式地以 PTX 或二进制对象的形式加载,因此,用 C++ 编写的内核必须单独编译成 PTX 或二进制对象。内核的启动使用 API 入口点(如 cuLaunchKernel)。

任何想要在未来设备架构上运行的应用程序都必须加载 PTX 代码,而不是二进制代码。这是因为二进制代码是特定于架构的,因此与未来的架构不兼容,而 PTX 代码则是在加载时由设备驱动程序编译成二进制代码的。

以下是使用driver API 编写的内核示例的主机代码:

// Illustrative host-side driver-API workflow (vector addition), showing the
// canonical sequence: cuInit -> device handle -> context -> module load ->
// device alloc/copy -> cuLaunchKernel. The "..." lines are placeholders
// elided by the article; this listing is not meant to compile as-is and
// omits error checking and cleanup (cuMemFree / cuCtxDestroy / free).
int main()
{
    int N = ...;
    size_t size = N * sizeof(float);

    // Allocate input vectors h_A and h_B in host memory
    float* h_A = (float*)malloc(size);
    float* h_B = (float*)malloc(size);

    // Initialize input vectors
    ...

    // Initialize
    cuInit(0);

    // Get number of devices supporting CUDA
    int deviceCount = 0;
    cuDeviceGetCount(&deviceCount);
    if (deviceCount == 0) {
        printf("There is no device supporting CUDA.\n");
        exit (0);
    }

    // Get handle for device 0
    CUdevice cuDevice;
    cuDeviceGet(&cuDevice, 0);

    // Create context
    CUcontext cuContext;
    cuCtxCreate(&cuContext, 0, cuDevice);

    // Create module from binary file
    CUmodule cuModule;
    cuModuleLoad(&cuModule, "VecAdd.ptx");

    // Allocate vectors in device memory
    CUdeviceptr d_A;
    cuMemAlloc(&d_A, size);
    CUdeviceptr d_B;
    cuMemAlloc(&d_B, size);
    CUdeviceptr d_C;
    cuMemAlloc(&d_C, size);

    // Copy vectors from host memory to device memory
    cuMemcpyHtoD(d_A, h_A, size);
    cuMemcpyHtoD(d_B, h_B, size);

    // Get function handle from module
    CUfunction vecAdd;
    cuModuleGetFunction(&vecAdd, cuModule, "VecAdd");

    // Invoke kernel
    int threadsPerBlock = 256;
    int blocksPerGrid =
            (N + threadsPerBlock - 1) / threadsPerBlock;
    // Kernel arguments are passed as an array of pointers to each argument.
    void* args[] = { &d_A, &d_B, &d_C, &N };
    // Parameters: grid (x,y,z), block (x,y,z), sharedMemBytes, stream,
    // kernel params, extra options.
    cuLaunchKernel(vecAdd,
                   blocksPerGrid, 1, 1, threadsPerBlock, 1, 1,
                   0, 0, args, 0);

    ...
}

2.  cuDeviceGetCount 

该API用于获取系统中可用的 CUDA 设备的数量。

3. cuDeviceComputeCapability

该函数用于获取指定 CUDA 设备的计算能力。(注:该 API 已被弃用;本示例实际是通过 cuDeviceGetAttribute 查询 CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR/MINOR 属性来获取计算能力的。)

4.cuDriverGetVersion

函数用于获取当前系统上安装的 CUDA driver的版本号。

5.cuDeviceTotalMem 

该函数用于获取指定 CUDA 设备的总内存大小

6. cuDeviceGetAttribute

 该函数用于获取指定 CUDA 设备的属性值。

除此之外,代码中大量调用 getCudaAttribute 函数,该函数是被包装在 helper_cuda_drvapi.h 里的模板函数,实际调用的是 cuDeviceGetAttribute API。

运行结果:

CUDA Device Query (Driver API) statically linked version
Detected 1 CUDA Capable device(s)

Device 0: "NVIDIA GeForce RTX 4080"
  CUDA Driver Version:                           12.5
  CUDA Capability Major/Minor version number:    8.9
  Total amount of global memory:                 16376 MBytes (17170956288 bytes)
  (76) Multiprocessors, (128) CUDA Cores/MP:     9728 CUDA Cores
  GPU Max Clock rate:                            2505 MHz (2.50 GHz)
  Memory Clock rate:                             11201 Mhz
  Memory Bus Width:                              256-bit
  L2 Cache Size:                                 67108864 bytes
  Max Texture Dimension Sizes                    1D=(131072) 2D=(131072, 65536) 3D=(16384, 16384, 16384)
  Maximum Layered 1D Texture Size, (num) layers  1D=(32768), 2048 layers
  Maximum Layered 2D Texture Size, (num) layers  2D=(32768, 32768), 2048 layers
  Total amount of constant memory:               65536 bytes
  Total amount of shared memory per block:       49152 bytes
  Total number of registers available per block: 65536
  Warp size:                                     32
  Maximum number of threads per multiprocessor:  1536
  Maximum number of threads per block:           1024
  Max dimension size of a thread block (x,y,z): (1024, 1024, 64)
  Max dimension size of a grid size (x,y,z):    (2147483647, 65535, 65535)
  Texture alignment:                             512 bytes
  Maximum memory pitch:                          2147483647 bytes
  Concurrent copy and kernel execution:          Yes with 1 copy engine(s)
  Run time limit on kernels:                     Yes
  Integrated GPU sharing Host Memory:            No
  Support host page-locked memory mapping:       Yes
  Concurrent kernel execution:                   Yes
  Alignment requirement for Surfaces:            Yes
  Device has ECC support:                        Disabled
  CUDA Device Driver Mode (TCC or WDDM):         WDDM (Windows Display Driver Model)
  Device supports Unified Addressing (UVA):      Yes
  Device supports Managed Memory:                Yes
  Device supports Compute Preemption:            Yes
  Supports Cooperative Kernel Launch:            Yes
  Supports MultiDevice Co-op Kernel Launch:      No
  Device PCI Domain ID / Bus ID / location ID:   0 / 1 / 0
  Compute Mode:
     < Default (multiple host threads can use ::cudaSetDevice() with device simultaneously) >
Result = PASS

  • 24
    点赞
  • 15
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值