CUDA Samples 中 1_Utilities 文件夹里包含了一些实用工具和小型示例程序,它们通常用于支持和演示其他 CUDA 示例程序的功能。
bandwidthTest这个sample是一个简单的测试程序,用于测量 GPU 的内存拷贝带宽以及 PCI-E 总线上的内存拷贝带宽。这个测试应用程序能够测量以下几种情况的带宽:
- 设备间(device to device)拷贝带宽
- 主机到设备(host to device)拷贝带宽,包括可分页内存(pageable memory)和锁页内存(page-locked memory)
- 设备到主机(device to host)拷贝带宽,包括可分页内存和锁页内存
这个程序可以帮助开发人员了解 CUDA 应用程序在不同内存访问模式下的性能表现,从而优化应用程序的内存使用策略,提高整体性能。
通过测量这些不同情况下的内存拷贝带宽,开发人员可以更好地评估 GPU 和 PCI-E 总线的性能瓶颈,进而针对性地优化应用程序的内存访问模式,从而获得更好的运行效率。
总之,这个简单的测试程序为 CUDA 应用程序的性能调优提供了一个有价值的工具和参考。
// CUDA runtime
#include <cuda_runtime.h>
// includes
#include <helper_cuda.h> // helper functions for CUDA error checking and initialization
#include <helper_functions.h> // helper for shared functions common to CUDA Samples
#include <cuda.h>
#include <cassert>
#include <iostream>
#include <memory>
// Sample name printed in the startup banner.
static const char *sSDKsample = "CUDA Bandwidth Test";

// defines, project
#define MEMCOPY_ITERATIONS 100        // copies averaged per bandwidth measurement
#define DEFAULT_SIZE (32 * (1e6))     // 32 M
#define DEFAULT_INCREMENT (4 * (1e6)) // 4 M
#define CACHE_CLEAR_SIZE (16 * (1e6)) // 16 M

// shmoo mode defines: size-band limits and the increment used inside each band
#define SHMOO_MEMSIZE_MAX (64 * (1e6))      // 64 M
#define SHMOO_MEMSIZE_START (1e3)           // 1 KB
#define SHMOO_INCREMENT_1KB (1e3)           // 1 KB
#define SHMOO_INCREMENT_2KB (2 * 1e3)       // 2 KB
#define SHMOO_INCREMENT_10KB (10 * (1e3))   // 10KB
#define SHMOO_INCREMENT_100KB (100 * (1e3)) // 100 KB
#define SHMOO_INCREMENT_1MB (1e6)           // 1 MB
#define SHMOO_INCREMENT_2MB (2 * 1e6)       // 2 MB
#define SHMOO_INCREMENT_4MB (4 * 1e6)       // 4 MB
#define SHMOO_LIMIT_20KB (20 * (1e3))       // 20 KB
#define SHMOO_LIMIT_50KB (50 * (1e3))       // 50 KB
#define SHMOO_LIMIT_100KB (100 * (1e3))     // 100 KB
#define SHMOO_LIMIT_1MB (1e6)               // 1 MB
#define SHMOO_LIMIT_16MB (16 * 1e6)         // 16 MB
#define SHMOO_LIMIT_32MB (32 * 1e6)         // 32 MB

// CPU cache flush: this buffer is rewritten between pageable-copy timing
// iterations so each measured copy starts with a cold CPU cache.
#define FLUSH_SIZE (256 * 1024 * 1024)
char *flush_buf;

// enums, project
enum testMode { QUICK_MODE, RANGE_MODE, SHMOO_MODE };
enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE };
enum printMode { USER_READABLE, CSV };
enum memoryMode { PINNED, PAGEABLE };

// Human-readable names indexed by memcpyKind / memoryMode (NULL-terminated).
const char *sMemoryCopyKind[] = {"Device to Host", "Host to Device",
                                 "Device to Device", NULL};
const char *sMemoryMode[] = {"PINNED", "PAGEABLE", NULL};

// if true, use CPU based timing for everything (set by the --cputiming flag)
static bool bDontUseGPUTiming;

int *pArgc = NULL;
char **pArgv = NULL;

// declaration, forward
int runTest(const int argc, const char **argv);
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
                   testMode mode, memcpyKind kind, printMode printmode,
                   memoryMode memMode, int startDevice, int endDevice, bool wc);
void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode,
                        memoryMode memMode, int startDevice, int endDevice,
                        bool wc);
void testBandwidthRange(unsigned int start, unsigned int end,
                        unsigned int increment, memcpyKind kind,
                        printMode printmode, memoryMode memMode,
                        int startDevice, int endDevice, bool wc);
void testBandwidthShmoo(memcpyKind kind, printMode printmode,
                        memoryMode memMode, int startDevice, int endDevice,
                        bool wc);
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
                               bool wc);
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
                               bool wc);
float testDeviceToDeviceTransfer(unsigned int memSize);
void printResultsReadable(unsigned int *memSizes, double *bandwidths,
                          unsigned int count, memcpyKind kind,
                          memoryMode memMode, int iNumDevs, bool wc);
void printResultsCSV(unsigned int *memSizes, double *bandwidths,
                     unsigned int count, memcpyKind kind, memoryMode memMode,
                     int iNumDevs, bool wc);
void printHelp(void);
// Program main: allocate the cache-flush buffer, run the tests, report
// PASS/FAIL, and clean up.
int main(int argc, char **argv) {
  pArgc = &argc;
  pArgv = argv;

  // Scratch buffer used to flush the CPU cache between pageable-memory
  // timing iterations. Fix: the original code never checked this 256 MB
  // allocation before memset()-ing it later.
  flush_buf = (char *)malloc(FLUSH_SIZE);
  if (flush_buf == NULL) {
    fprintf(stderr, "Not enough memory available on host to run test!\n");
    exit(EXIT_FAILURE);
  }

  // set logfile name and start logs
  printf("[%s] - Starting...\n", sSDKsample);

  int iRetVal = runTest(argc, (const char **)argv);

  if (iRetVal < 0) {
    checkCudaErrors(cudaSetDevice(0));
  }

  // finish
  printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL");

  printf(
      "\nNOTE: The CUDA Samples are not meant for performance measurements. "
      "Results may vary when GPU Boost is enabled.\n");

  free(flush_buf);

  exit((iRetVal == 0) ? EXIT_SUCCESS : EXIT_FAILURE);
}
///
// Parse command-line arguments and run the requested bandwidth tests.
// Returns 0 on success, a negative error code for invalid arguments;
// exits the process on CUDA enumeration failures.
///
int runTest(const int argc, const char **argv) {
  int start = DEFAULT_SIZE;
  int end = DEFAULT_SIZE;
  int startDevice = 0;
  int endDevice = 0;
  int increment = DEFAULT_INCREMENT;
  testMode mode = QUICK_MODE;
  bool htod = false;
  bool dtoh = false;
  bool dtod = false;
  bool wc = false;
  char *modeStr;  // only assigned (and read) when --mode is present
  char *device = NULL;
  printMode printmode = USER_READABLE;
  char *memModeStr = NULL;
  memoryMode memMode = PINNED;

  // process command line args
  if (checkCmdLineFlag(argc, argv, "help")) {
    printHelp();
    return 0;
  }

  if (checkCmdLineFlag(argc, argv, "csv")) {
    printmode = CSV;
  }

  // --memory=pageable|pinned selects how host buffers are allocated.
  if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) {
    if (strcmp(memModeStr, "pageable") == 0) {
      memMode = PAGEABLE;
    } else if (strcmp(memModeStr, "pinned") == 0) {
      memMode = PINNED;
    } else {
      printf("Invalid memory mode - valid modes are pageable or pinned\n");
      printf("See --help for more information\n");
      return -1000;
    }
  } else {
    // default - pinned memory
    memMode = PINNED;
  }

  // --device=all|N picks a single device or a cumulative run on all devices.
  if (getCmdLineArgumentString(argc, argv, "device", &device)) {
    int deviceCount;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

    if (error_id != cudaSuccess) {
      printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id,
             cudaGetErrorString(error_id));
      exit(EXIT_FAILURE);
    }

    if (deviceCount == 0) {
      printf("!!!!!No devices found!!!!!\n");
      return -2000;
    }

    if (strcmp(device, "all") == 0) {
      printf(
          "\n!!!!!Cumulative Bandwidth to be computed from all the devices "
          "!!!!!!\n\n");
      startDevice = 0;
      endDevice = deviceCount - 1;
    } else {
      startDevice = endDevice = atoi(device);

      // Fall back to device 0 on an out-of-range index.
      if (startDevice >= deviceCount || startDevice < 0) {
        printf(
            "\n!!!!!Invalid GPU number %d given hence default gpu %d will be "
            "used !!!!!\n",
            startDevice, 0);
        startDevice = endDevice = 0;
      }
    }
  }

  printf("Running on...\n\n");

  // Print each selected device and bail out on prohibited-mode devices.
  for (int currentDevice = startDevice; currentDevice <= endDevice;
       currentDevice++) {
    cudaDeviceProp deviceProp;
    cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice);

    if (error_id == cudaSuccess) {
      printf(" Device %d: %s\n", currentDevice, deviceProp.name);

      if (deviceProp.computeMode == cudaComputeModeProhibited) {
        fprintf(stderr,
                "Error: device is running in <Compute Mode Prohibited>, no "
                "threads can use ::cudaSetDevice().\n");
        checkCudaErrors(cudaSetDevice(currentDevice));
        exit(EXIT_FAILURE);
      }
    } else {
      printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id,
             cudaGetErrorString(error_id));
      checkCudaErrors(cudaSetDevice(currentDevice));
      exit(EXIT_FAILURE);
    }
  }

  // --mode=quick|shmoo|range selects the measurement strategy.
  if (getCmdLineArgumentString(argc, argv, "mode", &modeStr)) {
    // figure out the mode
    if (strcmp(modeStr, "quick") == 0) {
      printf(" Quick Mode\n\n");
      mode = QUICK_MODE;
    } else if (strcmp(modeStr, "shmoo") == 0) {
      printf(" Shmoo Mode\n\n");
      mode = SHMOO_MODE;
    } else if (strcmp(modeStr, "range") == 0) {
      printf(" Range Mode\n\n");
      mode = RANGE_MODE;
    } else {
      printf("Invalid mode - valid modes are quick, range, or shmoo\n");
      printf("See --help for more information\n");
      return -3000;
    }
  } else {
    // default mode - quick
    printf(" Quick Mode\n\n");
    mode = QUICK_MODE;
  }

  // Transfer directions to measure; if none is given, all three run (below).
  if (checkCmdLineFlag(argc, argv, "htod")) {
    htod = true;
  }

  if (checkCmdLineFlag(argc, argv, "dtoh")) {
    dtoh = true;
  }

  if (checkCmdLineFlag(argc, argv, "dtod")) {
    dtod = true;
  }

#if CUDART_VERSION >= 2020
  // Write-combined pinned memory requires CUDA runtime >= 2.2.
  if (checkCmdLineFlag(argc, argv, "wc")) {
    wc = true;
  }
#endif

  if (checkCmdLineFlag(argc, argv, "cputiming")) {
    bDontUseGPUTiming = true;
  }

  if (!htod && !dtoh && !dtod) {
    // default: All
    htod = true;
    dtoh = true;
    dtod = true;
  }

  // Range mode requires explicit --start, --end and --increment values.
  if (RANGE_MODE == mode) {
    if (checkCmdLineFlag(argc, (const char **)argv, "start")) {
      start = getCmdLineArgumentInt(argc, argv, "start");

      if (start <= 0) {
        printf("Illegal argument - start must be greater than zero\n");
        return -4000;
      }
    } else {
      printf("Must specify a starting size in range mode\n");
      printf("See --help for more information\n");
      return -5000;
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "end")) {
      end = getCmdLineArgumentInt(argc, argv, "end");

      if (end <= 0) {
        printf("Illegal argument - end must be greater than zero\n");
        return -6000;
      }

      if (start > end) {
        printf("Illegal argument - start is greater than end\n");
        return -7000;
      }
    } else {
      printf("Must specify an end size in range mode.\n");
      printf("See --help for more information\n");
      return -8000;
    }

    if (checkCmdLineFlag(argc, argv, "increment")) {
      increment = getCmdLineArgumentInt(argc, argv, "increment");

      if (increment <= 0) {
        printf("Illegal argument - increment must be greater than zero\n");
        return -9000;
      }
    } else {
      printf("Must specify an increment in user mode\n");
      printf("See --help for more information\n");
      return -10000;
    }
  }

  // Run one full test sweep per requested direction.
  if (htod) {
    testBandwidth((unsigned int)start, (unsigned int)end,
                  (unsigned int)increment, mode, HOST_TO_DEVICE, printmode,
                  memMode, startDevice, endDevice, wc);
  }

  if (dtoh) {
    testBandwidth((unsigned int)start, (unsigned int)end,
                  (unsigned int)increment, mode, DEVICE_TO_HOST, printmode,
                  memMode, startDevice, endDevice, wc);
  }

  if (dtod) {
    testBandwidth((unsigned int)start, (unsigned int)end,
                  (unsigned int)increment, mode, DEVICE_TO_DEVICE, printmode,
                  memMode, startDevice, endDevice, wc);
  }

  // Ensure that we reset all CUDA Devices in question
  for (int nDevice = startDevice; nDevice <= endDevice; nDevice++) {
    cudaSetDevice(nDevice);
  }

  return 0;
}
///
// Dispatch one bandwidth test according to the selected mode.
// start/end/increment are only meaningful for RANGE_MODE; QUICK_MODE always
// measures DEFAULT_SIZE and SHMOO_MODE sweeps its own built-in size ladder.
///
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
                   testMode mode, memcpyKind kind, printMode printmode,
                   memoryMode memMode, int startDevice, int endDevice,
                   bool wc) {
  if (QUICK_MODE == mode) {
    testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice,
                       endDevice, wc);
  } else if (RANGE_MODE == mode) {
    testBandwidthRange(start, end, increment, kind, printmode, memMode,
                       startDevice, endDevice, wc);
  } else if (SHMOO_MODE == mode) {
    testBandwidthShmoo(kind, printmode, memMode, startDevice, endDevice, wc);
  }
  // Any other value is silently ignored, matching the original switch default.
}
//
// Quick mode: a single measurement at one fixed transfer size.
// Implemented as a degenerate range test whose start and end coincide.
//
void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode,
                        memoryMode memMode, int startDevice, int endDevice,
                        bool wc) {
  const unsigned int singleSize = size;  // one point, so increment is unused
  testBandwidthRange(singleSize, singleSize, DEFAULT_INCREMENT, kind, printmode,
                     memMode, startDevice, endDevice, wc);
}
///
// Run a range mode bandwidth test: measure every size in
// [start, end] stepped by increment, accumulating results over the
// devices [startDevice, endDevice], then print them.
///
void testBandwidthRange(unsigned int start, unsigned int end,
                        unsigned int increment, memcpyKind kind,
                        printMode printmode, memoryMode memMode,
                        int startDevice, int endDevice, bool wc) {
  // count the number of copies we're going to run
  unsigned int count = 1 + ((end - start) / increment);

  unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
  double *bandwidths = (double *)malloc(count * sizeof(double));

  // Fix: the original code dereferenced these without checking the allocation.
  if (memSizes == NULL || bandwidths == NULL) {
    fprintf(stderr, "Not enough memory available on host to run test!\n");
    exit(EXIT_FAILURE);
  }

  // Zero the accumulators so per-device bandwidths can be summed.
  for (unsigned int i = 0; i < count; i++) {
    bandwidths[i] = 0.0;
  }

  // Use the device asked by the user
  for (int currentDevice = startDevice; currentDevice <= endDevice;
       currentDevice++) {
    // Fix: check the cudaSetDevice result like the rest of the file does.
    checkCudaErrors(cudaSetDevice(currentDevice));

    // run each of the copies
    for (unsigned int i = 0; i < count; i++) {
      memSizes[i] = start + i * increment;

      switch (kind) {
        case DEVICE_TO_HOST:
          bandwidths[i] += testDeviceToHostTransfer(memSizes[i], memMode, wc);
          break;

        case HOST_TO_DEVICE:
          bandwidths[i] += testHostToDeviceTransfer(memSizes[i], memMode, wc);
          break;

        case DEVICE_TO_DEVICE:
          bandwidths[i] += testDeviceToDeviceTransfer(memSizes[i]);
          break;
      }
    }
  }  // Complete the bandwidth computation on all the devices

  // print results
  if (printmode == CSV) {
    printResultsCSV(memSizes, bandwidths, count, kind, memMode,
                    (1 + endDevice - startDevice), wc);
  } else {
    printResultsReadable(memSizes, bandwidths, count, kind, memMode,
                         (1 + endDevice - startDevice), wc);
  }

  // clean up
  free(memSizes);
  free(bandwidths);
}
//
// Intense shmoo mode - covers a large range of values with varying increments.
// The step size grows with the transfer size (1 KB steps up to 20 KB, then
// 2 KB, 10 KB, 100 KB, 1 MB, 2 MB and finally 4 MB steps up to the maximum).
//
void testBandwidthShmoo(memcpyKind kind, printMode printmode,
                        memoryMode memMode, int startDevice, int endDevice,
                        bool wc) {
  // count the number of copies to make: each size band contributes a fixed
  // number of steps, mirroring the increment ladder in the loop below.
  unsigned int count =
      1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB) +
      ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB) +
      ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB) +
      ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB) +
      ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB) +
      ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB) +
      ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);

  unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
  double *bandwidths = (double *)malloc(count * sizeof(double));

  // Fix: the original code dereferenced these without checking the allocation.
  if (memSizes == NULL || bandwidths == NULL) {
    fprintf(stderr, "Not enough memory available on host to run test!\n");
    exit(EXIT_FAILURE);
  }

  // Zero the accumulators so per-device bandwidths can be summed.
  for (unsigned int i = 0; i < count; i++) {
    bandwidths[i] = 0.0;
  }

  // Use the device asked by the user
  for (int currentDevice = startDevice; currentDevice <= endDevice;
       currentDevice++) {
    // Fix: check the cudaSetDevice result like the rest of the file does.
    checkCudaErrors(cudaSetDevice(currentDevice));

    // Run the shmoo
    int iteration = 0;
    unsigned int memSize = 0;

    while (memSize <= SHMOO_MEMSIZE_MAX) {
      // Advance by the increment of the current size band.
      if (memSize < SHMOO_LIMIT_20KB) {
        memSize += SHMOO_INCREMENT_1KB;
      } else if (memSize < SHMOO_LIMIT_50KB) {
        memSize += SHMOO_INCREMENT_2KB;
      } else if (memSize < SHMOO_LIMIT_100KB) {
        memSize += SHMOO_INCREMENT_10KB;
      } else if (memSize < SHMOO_LIMIT_1MB) {
        memSize += SHMOO_INCREMENT_100KB;
      } else if (memSize < SHMOO_LIMIT_16MB) {
        memSize += SHMOO_INCREMENT_1MB;
      } else if (memSize < SHMOO_LIMIT_32MB) {
        memSize += SHMOO_INCREMENT_2MB;
      } else {
        memSize += SHMOO_INCREMENT_4MB;
      }

      memSizes[iteration] = memSize;

      switch (kind) {
        case DEVICE_TO_HOST:
          bandwidths[iteration] +=
              testDeviceToHostTransfer(memSizes[iteration], memMode, wc);
          break;

        case HOST_TO_DEVICE:
          bandwidths[iteration] +=
              testHostToDeviceTransfer(memSizes[iteration], memMode, wc);
          break;

        case DEVICE_TO_DEVICE:
          bandwidths[iteration] +=
              testDeviceToDeviceTransfer(memSizes[iteration]);
          break;
      }

      iteration++;
      printf(".");   // progress indicator, one dot per measurement
      fflush(0);
    }
  }  // Complete the bandwidth computation on all the devices

  // print results
  printf("\n");

  if (CSV == printmode) {
    printResultsCSV(memSizes, bandwidths, count, kind, memMode,
                    (1 + endDevice - startDevice), wc);
  } else {
    printResultsReadable(memSizes, bandwidths, count, kind, memMode,
                         (1 + endDevice - startDevice), wc);
  }

  // clean up
  free(memSizes);
  free(bandwidths);
}
///
// Measure device-to-host copy bandwidth (GB/s) for one transfer size.
//
// memSize: number of bytes per copy
// memMode: PINNED (cudaHostAlloc/cudaMallocHost) or PAGEABLE (malloc) buffers
// wc:      if true and PINNED, host memory is allocated write-combined
//
// PINNED times MEMCOPY_ITERATIONS async copies with CUDA events (or the CPU
// timer when --cputiming is set); PAGEABLE times blocking copies on the CPU,
// flushing the cache between iterations.
///
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
                               bool wc) {
  StopWatchInterface *timer = NULL;
  float elapsedTimeInMs = 0.0f;
  float bandwidthInGBs = 0.0f;
  unsigned char *h_idata = NULL;
  unsigned char *h_odata = NULL;
  cudaEvent_t start, stop;

  sdkCreateTimer(&timer);
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));

  // allocate host memory
  if (PINNED == memMode) {
    // pinned memory mode - use special function to get OS-pinned memory
#if CUDART_VERSION >= 2020
    checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize,
                                  (wc) ? cudaHostAllocWriteCombined : 0));
    checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize,
                                  (wc) ? cudaHostAllocWriteCombined : 0));
#else
    checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize));
    checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
  } else {
    // pageable memory mode - use malloc
    h_idata = (unsigned char *)malloc(memSize);
    h_odata = (unsigned char *)malloc(memSize);

    if (h_idata == 0 || h_odata == 0) {
      fprintf(stderr, "Not enough memory avaialable on host to run test!\n");
      exit(EXIT_FAILURE);
    }
  }

  // initialize the memory with a repeating byte pattern
  for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
    h_idata[i] = (unsigned char)(i & 0xff);
  }

  // allocate device memory
  unsigned char *d_idata;
  checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));

  // initialize the device memory
  checkCudaErrors(
      cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));

  // copy data from GPU to Host
  if (PINNED == memMode) {
    if (bDontUseGPUTiming) sdkStartTimer(&timer);

    // Bracket the async copies with events on stream 0, then synchronize
    // before reading the elapsed event time.
    checkCudaErrors(cudaEventRecord(start, 0));

    for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
      checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize,
                                      cudaMemcpyDeviceToHost, 0));
    }

    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));

    if (bDontUseGPUTiming) {
      // Override the event timing with the CPU wall-clock measurement.
      sdkStopTimer(&timer);
      elapsedTimeInMs = sdkGetTimerValue(&timer);
      sdkResetTimer(&timer);
    }
  } else {
    elapsedTimeInMs = 0;

    for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
      sdkStartTimer(&timer);
      checkCudaErrors(
          cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost));
      sdkStopTimer(&timer);
      elapsedTimeInMs += sdkGetTimerValue(&timer);
      sdkResetTimer(&timer);
      // Rewrite a large buffer between iterations to flush the CPU cache,
      // so each pageable copy is measured cold.
      memset(flush_buf, i, FLUSH_SIZE);
    }
  }

  // calculate bandwidth in GB/s:
  // total bytes moved / 1e9, divided by the elapsed time in seconds
  double time_s = elapsedTimeInMs / 1e3;
  bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9;
  bandwidthInGBs = bandwidthInGBs / time_s;

  // clean up memory
  checkCudaErrors(cudaEventDestroy(stop));
  checkCudaErrors(cudaEventDestroy(start));
  sdkDeleteTimer(&timer);

  if (PINNED == memMode) {
    checkCudaErrors(cudaFreeHost(h_idata));
    checkCudaErrors(cudaFreeHost(h_odata));
  } else {
    free(h_idata);
    free(h_odata);
  }

  checkCudaErrors(cudaFree(d_idata));

  return bandwidthInGBs;
}
///
//! Measure host-to-device copy bandwidth (GB/s) for one transfer size.
//!
//! memSize: number of bytes per copy
//! memMode: PINNED (cudaHostAlloc/cudaMallocHost) or PAGEABLE (malloc) buffer
//! wc:      if true and PINNED, host memory is allocated write-combined
///
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
                               bool wc) {
  StopWatchInterface *timer = NULL;
  float elapsedTimeInMs = 0.0f;
  float bandwidthInGBs = 0.0f;
  cudaEvent_t start, stop;

  sdkCreateTimer(&timer);
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));

  // allocate host memory
  unsigned char *h_odata = NULL;

  if (PINNED == memMode) {
#if CUDART_VERSION >= 2020
    // pinned memory mode - use special function to get OS-pinned memory
    checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize,
                                  (wc) ? cudaHostAllocWriteCombined : 0));
#else
    // pinned memory mode - use special function to get OS-pinned memory
    checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
  } else {
    // pageable memory mode - use malloc
    h_odata = (unsigned char *)malloc(memSize);

    if (h_odata == 0) {
      fprintf(stderr, "Not enough memory available on host to run test!\n");
      exit(EXIT_FAILURE);
    }
  }

  // NOTE(review): these two buffers are filled below but never referenced
  // again in this function - the pageable path flushes the cache via
  // flush_buf instead; they look like leftovers. Confirm before removing.
  unsigned char *h_cacheClear1 = (unsigned char *)malloc(CACHE_CLEAR_SIZE);
  unsigned char *h_cacheClear2 = (unsigned char *)malloc(CACHE_CLEAR_SIZE);

  if (h_cacheClear1 == 0 || h_cacheClear2 == 0) {
    fprintf(stderr, "Not enough memory available on host to run test!\n");
    exit(EXIT_FAILURE);
  }

  // initialize the memory with a repeating byte pattern
  for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
    h_odata[i] = (unsigned char)(i & 0xff);
  }

  for (unsigned int i = 0; i < CACHE_CLEAR_SIZE / sizeof(unsigned char); i++) {
    h_cacheClear1[i] = (unsigned char)(i & 0xff);
    h_cacheClear2[i] = (unsigned char)(0xff - (i & 0xff));
  }

  // allocate device memory
  unsigned char *d_idata;
  checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));

  // copy host memory to device memory
  if (PINNED == memMode) {
    if (bDontUseGPUTiming) sdkStartTimer(&timer);

    // Bracket the async copies with events on stream 0, then synchronize
    // before reading the elapsed event time.
    checkCudaErrors(cudaEventRecord(start, 0));

    for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
      checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize,
                                      cudaMemcpyHostToDevice, 0));
    }

    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));

    if (bDontUseGPUTiming) {
      // Override the event timing with the CPU wall-clock measurement.
      sdkStopTimer(&timer);
      elapsedTimeInMs = sdkGetTimerValue(&timer);
      sdkResetTimer(&timer);
    }
  } else {
    elapsedTimeInMs = 0;

    for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
      sdkStartTimer(&timer);
      checkCudaErrors(
          cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice));
      sdkStopTimer(&timer);
      elapsedTimeInMs += sdkGetTimerValue(&timer);
      sdkResetTimer(&timer);
      // Rewrite a large buffer between iterations to flush the CPU cache,
      // so each pageable copy is measured cold.
      memset(flush_buf, i, FLUSH_SIZE);
    }
  }

  // calculate bandwidth in GB/s:
  // total bytes moved / 1e9, divided by the elapsed time in seconds
  double time_s = elapsedTimeInMs / 1e3;
  bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9;
  bandwidthInGBs = bandwidthInGBs / time_s;

  // clean up memory
  checkCudaErrors(cudaEventDestroy(stop));
  checkCudaErrors(cudaEventDestroy(start));
  sdkDeleteTimer(&timer);

  if (PINNED == memMode) {
    checkCudaErrors(cudaFreeHost(h_odata));
  } else {
    free(h_odata);
  }

  free(h_cacheClear1);
  free(h_cacheClear2);
  checkCudaErrors(cudaFree(d_idata));

  return bandwidthInGBs;
}
///
//! Measure device-to-device copy bandwidth (GB/s) for one transfer size.
//! The result counts both the read and the write on the device, hence the
//! factor of 2 in the bandwidth calculation below.
///
float testDeviceToDeviceTransfer(unsigned int memSize) {
  StopWatchInterface *timer = NULL;
  float elapsedTimeInMs = 0.0f;
  float bandwidthInGBs = 0.0f;
  cudaEvent_t start, stop;

  sdkCreateTimer(&timer);
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));

  // allocate host memory (used only to seed the device source buffer)
  unsigned char *h_idata = (unsigned char *)malloc(memSize);

  if (h_idata == 0) {
    fprintf(stderr, "Not enough memory avaialable on host to run test!\n");
    exit(EXIT_FAILURE);
  }

  // initialize the host memory with a repeating byte pattern
  for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
    h_idata[i] = (unsigned char)(i & 0xff);
  }

  // allocate device memory (source and destination)
  unsigned char *d_idata;
  checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));
  unsigned char *d_odata;
  checkCudaErrors(cudaMalloc((void **)&d_odata, memSize));

  // initialize memory
  checkCudaErrors(
      cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));

  // run the memcopy: CPU timer and CUDA events both bracket the loop;
  // which one is reported depends on bDontUseGPUTiming below.
  sdkStartTimer(&timer);
  checkCudaErrors(cudaEventRecord(start, 0));

  for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
    checkCudaErrors(
        cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice));
  }

  checkCudaErrors(cudaEventRecord(stop, 0));

  // Since device to device memory copies are non-blocking,
  // cudaDeviceSynchronize() is required in order to get
  // proper timing.
  checkCudaErrors(cudaDeviceSynchronize());

  // get the total elapsed time in ms
  sdkStopTimer(&timer);
  checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));

  if (bDontUseGPUTiming) {
    // --cputiming: report the CPU wall-clock time instead of the event time.
    elapsedTimeInMs = sdkGetTimerValue(&timer);
  }

  // calculate bandwidth in GB/s (x2: each copy reads and writes memSize bytes)
  double time_s = elapsedTimeInMs / 1e3;
  bandwidthInGBs = (2.0f * memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9;
  bandwidthInGBs = bandwidthInGBs / time_s;

  // clean up memory
  sdkDeleteTimer(&timer);
  free(h_idata);
  checkCudaErrors(cudaEventDestroy(stop));
  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaFree(d_idata));
  checkCudaErrors(cudaFree(d_odata));

  return bandwidthInGBs;
}
///
// Print results in an easily read format: one header per configuration,
// then one "size <tab> bandwidth" row per measurement.
///
void printResultsReadable(unsigned int *memSizes, double *bandwidths,
                          unsigned int count, memcpyKind kind,
                          memoryMode memMode, int iNumDevs, bool wc) {
  printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs);
  printf(" %s Memory Transfers\n", sMemoryMode[memMode]);

  if (wc) {
    // Fix: terminate the line so the table header below starts on its own
    // line (the original printf had no trailing newline).
    printf(" Write-Combined Memory Writes are Enabled\n");
  }

  printf(" Transfer Size (Bytes)\tBandwidth(GB/s)\n");

  // Fix: with count == 0 the unsigned "count - 1" loop bound wraps around
  // and the trailing printf reads out of bounds; bail out early instead.
  if (count == 0) {
    printf("\n");
    return;
  }

  unsigned int i;

  for (i = 0; i < (count - 1); i++) {
    // Sizes below five digits get an extra tab so the columns line up.
    printf(" %u\t\t\t%s%.1f\n", memSizes[i],
           (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
  }

  // Last row carries an extra blank line to separate configurations.
  printf(" %u\t\t\t%s%.1f\n\n", memSizes[i],
         (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
}
///
// Print results in a database-friendly CSV-ish format: one line per
// measurement, tagged with a short configuration string such as
// "H2D-Pinned-WriteCombined" or "D2D".
///
void printResultsCSV(unsigned int *memSizes, double *bandwidths,
                     unsigned int count, memcpyKind kind, memoryMode memMode,
                     int iNumDevs, bool wc) {
  // Assemble the configuration tag: direction first, ...
  std::string sConfig;

  switch (kind) {
    case DEVICE_TO_DEVICE:
      sConfig += "D2D";
      break;
    case DEVICE_TO_HOST:
      sConfig += "D2H";
      break;
    case HOST_TO_DEVICE:
      sConfig += "H2D";
      break;
  }

  // ... then the host memory mode, which only applies to host<->device runs.
  if (kind != DEVICE_TO_DEVICE) {
    if (memMode == PAGEABLE) {
      sConfig += "-Paged";
    } else if (memMode == PINNED) {
      sConfig += "-Pinned";

      if (wc) {
        sConfig += "-WriteCombined";
      }
    }
  }

  // Emit one record per measurement; the time is derived back from the
  // bandwidth, so it reflects a single pass of the averaged copies.
  for (unsigned int idx = 0; idx < count; idx++) {
    double dSeconds = (double)memSizes[idx] / (bandwidths[idx] * (double)(1e9));
    printf(
        "bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u "
        "bytes, NumDevsUsed = %d\n",
        sConfig.c_str(), bandwidths[idx], dSeconds, memSizes[idx], iNumDevs);
  }
}
///
// Print the command-line usage / help screen.
///
void printHelp(void) {
  // Adjacent string literals are concatenated by the compiler, so these
  // grouped printf calls emit exactly the same text as one call per line.
  printf(
      "Usage: bandwidthTest [OPTION]...\n"
      "Test the bandwidth for device to host, host to device, and device to "
      "device transfers\n"
      "\n"
      "Example: measure the bandwidth of device to host pinned memory copies "
      "in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n"
      "./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 "
      "--increment=1024 --dtoh\n"
      "\n");
  printf(
      "Options:\n"
      "--help\tDisplay this help menu\n"
      "--csv\tPrint results as a CSV\n"
      "--device=[deviceno]\tSpecify the device device to be used\n"
      " all - compute cumulative bandwidth on all the devices\n"
      " 0,1,2,...,n - Specify any particular device to be used\n"
      "--memory=[MEMMODE]\tSpecify which memory mode to use\n"
      " pageable - pageable memory\n"
      " pinned - non-pageable system memory\n"
      "--mode=[MODE]\tSpecify the mode to use\n"
      " quick - performs a quick measurement\n"
      " range - measures a user-specified range of values\n"
      " shmoo - performs an intense shmoo of a large range of values\n"
      "--htod\tMeasure host to device transfers\n"
      "--dtoh\tMeasure device to host transfers\n"
      "--dtod\tMeasure device to device transfers\n");
#if CUDART_VERSION >= 2020
  printf("--wc\tAllocate pinned memory as write-combined\n");
#endif
  printf(
      "--cputiming\tForce CPU-based timing always\n"
      "Range mode options\n"
      "--start=[SIZE]\tStarting transfer size in bytes\n"
      "--end=[SIZE]\tEnding transfer size in bytes\n"
      "--increment=[SIZE]\tIncrement size in bytes\n");
}
代码详解:
1. 程序首先会检测可用的 CUDA 设备,并选择一个合适的设备进行测试。
其中测试配置需要用户输入args:
memory: 内存分配模式,可选 pageable(可分页内存)或 pinned(锁页内存)
direction: 内存传输方向,如从主机到设备、从设备到主机等
ETC: 等其他一些测试参数
2. 接着在主机和设备上分配内存。然后在主机内存和设备内存上分配用于测试的内存空间
3. 接下来,程序会根据测试配置执行一系列内存拷贝操作,共有三种传输方向:
主机到设备的内存拷贝
设备到主机的内存拷贝
设备到设备的内存拷贝(在设备内存内部进行)
在每种操作中,程序会记录时间并计算带宽,记录内存拷贝操作的执行时间,根据传输的内存大小和执行时间,计算出内存带宽(单位为 GB/s)。
4. 最后,程序会输出各种测试操作的带宽结果,供用户分析和比较。
该Sample中main函数主要调用了runTest函数,该函数会解析用户输入的args测试参数,然后根据这些参数运行合适的内存带宽测试,最终输出各种测试操作的带宽数据。以下是用户可以输入的测试参数:
>bandwidthTest.exe --help
[CUDA Bandwidth Test] - Starting...
Usage: bandwidthTest [OPTION]...
Test the bandwidth for device to host, host to device, and device to device transfers
Example: measure the bandwidth of device to host pinned memory copies in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments
./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 --increment=1024 --dtoh
Options:
--help Display this help menu
--csv Print results as a CSV
--device=[deviceno] Specify the device device to be used
all - compute cumulative bandwidth on all the devices
0,1,2,...,n - Specify any particular device to be used
--memory=[MEMMODE] Specify which memory mode to use
pageable - pageable memory
pinned - non-pageable system memory
--mode=[MODE] Specify the mode to use
quick - performs a quick measurement
range - measures a user-specified range of values
shmoo - performs an intense shmoo of a large range of values
--htod Measure host to device transfers
--dtoh Measure device to host transfers
--dtod Measure device to device transfers
--wc Allocate pinned memory as write-combined
--cputiming Force CPU-based timing always
Range mode options
--start=[SIZE] Starting transfer size in bytes
--end=[SIZE] Ending transfer size in bytes
--increment=[SIZE] Increment size in bytes
Result = PASS
NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
解析完测试参数后会调用testBandwidth函数,该函数会根据传入的 mode
参数,调用相应的测试函数
SHMOO_MODE
: 调用testBandwidthShmoo
函数,执行全面的"Shmoo"测试RANGE_MODE
: 调用testBandwidthRange
函数,执行指定范围和增量的测试QUICK_MODE
: 调用testBandwidthQuick
函数,执行默认大小的快速测试
这些测试函数内部会执行实际的内存带宽测试,并输出测试结果。
本文以 RANGE_MODE 为例,该模式通过 testBandwidthRange 函数测试,它实现了"RANGE_MODE"下的内存带宽测试。该函数细节如下
-
函数参数:
start
,end
,increment
: 定义了测试范围和增量步长kind
: 指定内存传输方向,如 DEVICE_TO_HOST、HOST_TO_DEVICE、DEVICE_TO_DEVICEprintmode
: 指定输出格式,如 CSV 或可读格式memMode
: 指定内存模式,可以是 PINNED(锁页内存)或 PAGEABLE(可分页内存)startDevice
,endDevice
: 指定要测试的设备范围wc
: 是否使用写合并(write combining)内存
-
函数实现:
- 根据测试范围和增量,计算需要进行的测试次数
- 动态分配内存用于存储测试结果
- 初始化
bandwidths
数组为 0 - 遍历指定的设备范围,对每个设备执行测试
- 在每个设备上,遍历测试范围,调用相应的带宽测试函数:
testDeviceToHostTransfer
testHostToDeviceTransfer
testDeviceToDeviceTransfer
- 将每次测试的带宽结果累加到
bandwidths
数组中
- 在每个设备上,遍历测试范围,调用相应的带宽测试函数:
- 根据指定的输出格式(CSV 或可读格式),调用相应的打印函数输出结果
- 释放动态分配的内存
-
测试流程:
- 首先确定测试范围和增量,计算需要进行的测试次数
- 对于指定的设备范围,在每个设备上执行测试
- 对于每个设备,逐步增大内存块大小,调用相应的带宽测试函数
- 将每次测试的带宽结果累加,最后输出汇总结果
testBandwidthRange
函数中以testHostToDeviceTransfer为例,
该函数用于测试主机到设备的内存拷贝操作的带宽的 CUDA 函数。下面是对代码的解释:
-
该函数接受三个参数:
memSize
: 要拷贝的内存大小(以字节为单位)memMode
: 内存模式,可以是"页面锁定模式"(PINNED)或"可分页模式"(PAGEABLE)wc
: 一个布尔值,用于启用/禁用写组合(Write Combining)内存
-
该函数首先根据内存模式分配主机内存,并初始化内存内容。
-
然后它分配设备内存,并进行主机到设备的内存拷贝操作。如果是页面锁定模式,它使用
cudaMemcpyAsync()
函数进行异步拷贝,并使用 CUDA 事件测量拷贝耗时。如果是可分页模式,它使用cudaMemcpy()
函数进行同步拷贝,并使用 SDK 计时器测量耗时。 -
最后,该函数计算带宽并返回结果。带宽计算公式为:
(memSize * MEMCOPY_ITERATIONS) / (time_s * 1e9) GB/s
。 -
在函数结束时,它会释放分配的主机和设备内存。
总的来说,testBandwidth
函数是 bandwidthtest 程序的核心,它根据用户的输入参数,调用相应的测试函数来执行内存带宽测试。其他模式的代码逻辑类似,在此不做赘述。
CUDA 相关API解读
该Sample使用的CUDA API: cudaEventCreate, cudaEventRecord, cudaEventDestroy, cudaEventElapsedTime, cudaMemcpyAsync
这些API属于CUDA runtime API, 我们先介绍CUDA runtime API中event的概念:
Events runtime提供了一种密切监控device端进度以及执行精确计时的方式,让应用程序能够在程序的任何时候异步记录事件,并查询这些事件何时完成。当前一个事件的所有任务(或可选地,给定Stream中的所有命令)都已完成时,该事件就算完成了。Stream 0中的事件在所有先前的任务和所有Stream中的命令都完成后才算完成。
Events创建和销毁
创建2个events:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
销毁2个events:
cudaEventDestroy(start);
cudaEventDestroy(stop);
计算时间
cudaEventRecord(start, 0);
for (int i = 0; i < 2; ++i) {
cudaMemcpyAsync(inputDev + i * size, inputHost + i * size,
size, cudaMemcpyHostToDevice, stream[i]);
MyKernel<<<100, 512, 0, stream[i]>>>
(outputDev + i * size, inputDev + i * size, size);
cudaMemcpyAsync(outputHost + i * size, outputDev + i * size,
size, cudaMemcpyDeviceToHost, stream[i]);
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
运行结果:
默认Quick Mode下的测试结果:
[CUDA Bandwidth Test] - Starting...
Running on...
Device 0: NVIDIA GeForce RTX 4080
Quick Mode
Host to Device Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(GB/s)
32000000 18.4
Device to Host Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(GB/s)
32000000 15.7
Device to Device Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(GB/s)
32000000 2839.9
Result = PASS
NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.