CUDA Samples 中 1_Utilities 文件夹里包含了一些实用工具和小型示例程序,它们通常用于支持和演示其他 CUDA 示例程序的功能。
bandwidthTest这个sample是一个简单的测试程序,用于测量 GPU 的内存拷贝带宽以及 PCI-E 总线上的内存拷贝带宽。这个测试应用程序能够测量以下几种情况的带宽:
- 设备间(device to device)拷贝带宽
- 主机到设备(host to device)拷贝带宽,包括可分页内存(pageable memory)和锁页内存(page-locked memory)
- 设备到主机(device to host)拷贝带宽,包括可分页内存和锁页内存
这个程序可以帮助开发人员了解 CUDA 应用程序在不同内存访问模式下的性能表现,从而优化应用程序的内存使用策略,提高整体性能。
通过测量这些不同情况下的内存拷贝带宽,开发人员可以更好地评估 GPU 和 PCI-E 总线的性能瓶颈,进而针对性地优化应用程序的内存访问模式,从而获得更好的运行效率。
总之,这个简单的测试程序为 CUDA 应用程序的性能调优提供了一个有价值的工具和参考。
// CUDA runtime
#include <cuda_runtime.h>
// includes
#include <helper_cuda.h> // helper functions for CUDA error checking and initialization
#include <helper_functions.h> // helper for shared functions common to CUDA Samples
#include <cuda.h>
#include <cassert>
#include <iostream>
#include <memory>
// Sample name printed in the startup banner.
static const char *sSDKsample = "CUDA Bandwidth Test";

// defines, project
#define MEMCOPY_ITERATIONS 100        // copies averaged per bandwidth measurement
#define DEFAULT_SIZE (32 * (1e6))     // 32 M
#define DEFAULT_INCREMENT (4 * (1e6)) // 4 M
#define CACHE_CLEAR_SIZE (16 * (1e6)) // 16 M

// shmoo mode defines: size-band limits and the increment used inside each band
#define SHMOO_MEMSIZE_MAX (64 * (1e6))      // 64 M
#define SHMOO_MEMSIZE_START (1e3)           // 1 KB
#define SHMOO_INCREMENT_1KB (1e3)           // 1 KB
#define SHMOO_INCREMENT_2KB (2 * 1e3)       // 2 KB
#define SHMOO_INCREMENT_10KB (10 * (1e3))   // 10KB
#define SHMOO_INCREMENT_100KB (100 * (1e3)) // 100 KB
#define SHMOO_INCREMENT_1MB (1e6)           // 1 MB
#define SHMOO_INCREMENT_2MB (2 * 1e6)       // 2 MB
#define SHMOO_INCREMENT_4MB (4 * 1e6)       // 4 MB
#define SHMOO_LIMIT_20KB (20 * (1e3))       // 20 KB
#define SHMOO_LIMIT_50KB (50 * (1e3))       // 50 KB
#define SHMOO_LIMIT_100KB (100 * (1e3))     // 100 KB
#define SHMOO_LIMIT_1MB (1e6)               // 1 MB
#define SHMOO_LIMIT_16MB (16 * 1e6)         // 16 MB
#define SHMOO_LIMIT_32MB (32 * 1e6)         // 32 MB

// CPU cache flush: this buffer is rewritten between pageable-copy timing
// iterations so each measured copy starts with a cold CPU cache.
#define FLUSH_SIZE (256 * 1024 * 1024)
char *flush_buf;

// enums, project
enum testMode { QUICK_MODE, RANGE_MODE, SHMOO_MODE };
enum memcpyKind { DEVICE_TO_HOST, HOST_TO_DEVICE, DEVICE_TO_DEVICE };
enum printMode { USER_READABLE, CSV };
enum memoryMode { PINNED, PAGEABLE };

// Human-readable names indexed by memcpyKind / memoryMode (NULL-terminated).
const char *sMemoryCopyKind[] = {"Device to Host", "Host to Device",
                                 "Device to Device", NULL};
const char *sMemoryMode[] = {"PINNED", "PAGEABLE", NULL};

// if true, use CPU based timing for everything (set by the --cputiming flag)
static bool bDontUseGPUTiming;

int *pArgc = NULL;
char **pArgv = NULL;

// declaration, forward
int runTest(const int argc, const char **argv);
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
                   testMode mode, memcpyKind kind, printMode printmode,
                   memoryMode memMode, int startDevice, int endDevice, bool wc);
void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode,
                        memoryMode memMode, int startDevice, int endDevice,
                        bool wc);
void testBandwidthRange(unsigned int start, unsigned int end,
                        unsigned int increment, memcpyKind kind,
                        printMode printmode, memoryMode memMode,
                        int startDevice, int endDevice, bool wc);
void testBandwidthShmoo(memcpyKind kind, printMode printmode,
                        memoryMode memMode, int startDevice, int endDevice,
                        bool wc);
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
                               bool wc);
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
                               bool wc);
float testDeviceToDeviceTransfer(unsigned int memSize);
void printResultsReadable(unsigned int *memSizes, double *bandwidths,
                          unsigned int count, memcpyKind kind,
                          memoryMode memMode, int iNumDevs, bool wc);
void printResultsCSV(unsigned int *memSizes, double *bandwidths,
                     unsigned int count, memcpyKind kind, memoryMode memMode,
                     int iNumDevs, bool wc);
void printHelp(void);
// Program main: allocate the cache-flush buffer, run the tests, report
// PASS/FAIL, and clean up.
int main(int argc, char **argv) {
  pArgc = &argc;
  pArgv = argv;

  // Scratch buffer used to flush the CPU cache between pageable-memory
  // timing iterations. Fix: the original code never checked this 256 MB
  // allocation before memset()-ing it later.
  flush_buf = (char *)malloc(FLUSH_SIZE);
  if (flush_buf == NULL) {
    fprintf(stderr, "Not enough memory available on host to run test!\n");
    exit(EXIT_FAILURE);
  }

  // set logfile name and start logs
  printf("[%s] - Starting...\n", sSDKsample);

  int iRetVal = runTest(argc, (const char **)argv);

  if (iRetVal < 0) {
    checkCudaErrors(cudaSetDevice(0));
  }

  // finish
  printf("%s\n", (iRetVal == 0) ? "Result = PASS" : "Result = FAIL");

  printf(
      "\nNOTE: The CUDA Samples are not meant for performance measurements. "
      "Results may vary when GPU Boost is enabled.\n");

  free(flush_buf);

  exit((iRetVal == 0) ? EXIT_SUCCESS : EXIT_FAILURE);
}
///
// Parse command-line arguments and run the requested bandwidth tests.
// Returns 0 on success, a negative error code for invalid arguments;
// exits the process on CUDA enumeration failures.
///
int runTest(const int argc, const char **argv) {
  int start = DEFAULT_SIZE;
  int end = DEFAULT_SIZE;
  int startDevice = 0;
  int endDevice = 0;
  int increment = DEFAULT_INCREMENT;
  testMode mode = QUICK_MODE;
  bool htod = false;
  bool dtoh = false;
  bool dtod = false;
  bool wc = false;
  char *modeStr;  // only assigned (and read) when --mode is present
  char *device = NULL;
  printMode printmode = USER_READABLE;
  char *memModeStr = NULL;
  memoryMode memMode = PINNED;

  // process command line args
  if (checkCmdLineFlag(argc, argv, "help")) {
    printHelp();
    return 0;
  }

  if (checkCmdLineFlag(argc, argv, "csv")) {
    printmode = CSV;
  }

  // --memory=pageable|pinned selects how host buffers are allocated.
  if (getCmdLineArgumentString(argc, argv, "memory", &memModeStr)) {
    if (strcmp(memModeStr, "pageable") == 0) {
      memMode = PAGEABLE;
    } else if (strcmp(memModeStr, "pinned") == 0) {
      memMode = PINNED;
    } else {
      printf("Invalid memory mode - valid modes are pageable or pinned\n");
      printf("See --help for more information\n");
      return -1000;
    }
  } else {
    // default - pinned memory
    memMode = PINNED;
  }

  // --device=all|N picks a single device or a cumulative run on all devices.
  if (getCmdLineArgumentString(argc, argv, "device", &device)) {
    int deviceCount;
    cudaError_t error_id = cudaGetDeviceCount(&deviceCount);

    if (error_id != cudaSuccess) {
      printf("cudaGetDeviceCount returned %d\n-> %s\n", (int)error_id,
             cudaGetErrorString(error_id));
      exit(EXIT_FAILURE);
    }

    if (deviceCount == 0) {
      printf("!!!!!No devices found!!!!!\n");
      return -2000;
    }

    if (strcmp(device, "all") == 0) {
      printf(
          "\n!!!!!Cumulative Bandwidth to be computed from all the devices "
          "!!!!!!\n\n");
      startDevice = 0;
      endDevice = deviceCount - 1;
    } else {
      startDevice = endDevice = atoi(device);

      // Fall back to device 0 on an out-of-range index.
      if (startDevice >= deviceCount || startDevice < 0) {
        printf(
            "\n!!!!!Invalid GPU number %d given hence default gpu %d will be "
            "used !!!!!\n",
            startDevice, 0);
        startDevice = endDevice = 0;
      }
    }
  }

  printf("Running on...\n\n");

  // Print each selected device and bail out on prohibited-mode devices.
  for (int currentDevice = startDevice; currentDevice <= endDevice;
       currentDevice++) {
    cudaDeviceProp deviceProp;
    cudaError_t error_id = cudaGetDeviceProperties(&deviceProp, currentDevice);

    if (error_id == cudaSuccess) {
      printf(" Device %d: %s\n", currentDevice, deviceProp.name);

      if (deviceProp.computeMode == cudaComputeModeProhibited) {
        fprintf(stderr,
                "Error: device is running in <Compute Mode Prohibited>, no "
                "threads can use ::cudaSetDevice().\n");
        checkCudaErrors(cudaSetDevice(currentDevice));
        exit(EXIT_FAILURE);
      }
    } else {
      printf("cudaGetDeviceProperties returned %d\n-> %s\n", (int)error_id,
             cudaGetErrorString(error_id));
      checkCudaErrors(cudaSetDevice(currentDevice));
      exit(EXIT_FAILURE);
    }
  }

  // --mode=quick|shmoo|range selects the measurement strategy.
  if (getCmdLineArgumentString(argc, argv, "mode", &modeStr)) {
    // figure out the mode
    if (strcmp(modeStr, "quick") == 0) {
      printf(" Quick Mode\n\n");
      mode = QUICK_MODE;
    } else if (strcmp(modeStr, "shmoo") == 0) {
      printf(" Shmoo Mode\n\n");
      mode = SHMOO_MODE;
    } else if (strcmp(modeStr, "range") == 0) {
      printf(" Range Mode\n\n");
      mode = RANGE_MODE;
    } else {
      printf("Invalid mode - valid modes are quick, range, or shmoo\n");
      printf("See --help for more information\n");
      return -3000;
    }
  } else {
    // default mode - quick
    printf(" Quick Mode\n\n");
    mode = QUICK_MODE;
  }

  // Transfer directions to measure; if none is given, all three run (below).
  if (checkCmdLineFlag(argc, argv, "htod")) {
    htod = true;
  }

  if (checkCmdLineFlag(argc, argv, "dtoh")) {
    dtoh = true;
  }

  if (checkCmdLineFlag(argc, argv, "dtod")) {
    dtod = true;
  }

#if CUDART_VERSION >= 2020
  // Write-combined pinned memory requires CUDA runtime >= 2.2.
  if (checkCmdLineFlag(argc, argv, "wc")) {
    wc = true;
  }
#endif

  if (checkCmdLineFlag(argc, argv, "cputiming")) {
    bDontUseGPUTiming = true;
  }

  if (!htod && !dtoh && !dtod) {
    // default: All
    htod = true;
    dtoh = true;
    dtod = true;
  }

  // Range mode requires explicit --start, --end and --increment values.
  if (RANGE_MODE == mode) {
    if (checkCmdLineFlag(argc, (const char **)argv, "start")) {
      start = getCmdLineArgumentInt(argc, argv, "start");

      if (start <= 0) {
        printf("Illegal argument - start must be greater than zero\n");
        return -4000;
      }
    } else {
      printf("Must specify a starting size in range mode\n");
      printf("See --help for more information\n");
      return -5000;
    }

    if (checkCmdLineFlag(argc, (const char **)argv, "end")) {
      end = getCmdLineArgumentInt(argc, argv, "end");

      if (end <= 0) {
        printf("Illegal argument - end must be greater than zero\n");
        return -6000;
      }

      if (start > end) {
        printf("Illegal argument - start is greater than end\n");
        return -7000;
      }
    } else {
      printf("Must specify an end size in range mode.\n");
      printf("See --help for more information\n");
      return -8000;
    }

    if (checkCmdLineFlag(argc, argv, "increment")) {
      increment = getCmdLineArgumentInt(argc, argv, "increment");

      if (increment <= 0) {
        printf("Illegal argument - increment must be greater than zero\n");
        return -9000;
      }
    } else {
      printf("Must specify an increment in user mode\n");
      printf("See --help for more information\n");
      return -10000;
    }
  }

  // Run one full test sweep per requested direction.
  if (htod) {
    testBandwidth((unsigned int)start, (unsigned int)end,
                  (unsigned int)increment, mode, HOST_TO_DEVICE, printmode,
                  memMode, startDevice, endDevice, wc);
  }

  if (dtoh) {
    testBandwidth((unsigned int)start, (unsigned int)end,
                  (unsigned int)increment, mode, DEVICE_TO_HOST, printmode,
                  memMode, startDevice, endDevice, wc);
  }

  if (dtod) {
    testBandwidth((unsigned int)start, (unsigned int)end,
                  (unsigned int)increment, mode, DEVICE_TO_DEVICE, printmode,
                  memMode, startDevice, endDevice, wc);
  }

  // Ensure that we reset all CUDA Devices in question
  for (int nDevice = startDevice; nDevice <= endDevice; nDevice++) {
    cudaSetDevice(nDevice);
  }

  return 0;
}
///
// Dispatch one bandwidth test according to the selected mode.
// start/end/increment are only meaningful for RANGE_MODE; QUICK_MODE always
// measures DEFAULT_SIZE and SHMOO_MODE sweeps its own built-in size ladder.
///
void testBandwidth(unsigned int start, unsigned int end, unsigned int increment,
                   testMode mode, memcpyKind kind, printMode printmode,
                   memoryMode memMode, int startDevice, int endDevice,
                   bool wc) {
  if (QUICK_MODE == mode) {
    testBandwidthQuick(DEFAULT_SIZE, kind, printmode, memMode, startDevice,
                       endDevice, wc);
  } else if (RANGE_MODE == mode) {
    testBandwidthRange(start, end, increment, kind, printmode, memMode,
                       startDevice, endDevice, wc);
  } else if (SHMOO_MODE == mode) {
    testBandwidthShmoo(kind, printmode, memMode, startDevice, endDevice, wc);
  }
  // Any other value is silently ignored, matching the original switch default.
}
//
// Quick mode: a single measurement at one fixed transfer size.
// Implemented as a degenerate range test whose start and end coincide.
//
void testBandwidthQuick(unsigned int size, memcpyKind kind, printMode printmode,
                        memoryMode memMode, int startDevice, int endDevice,
                        bool wc) {
  const unsigned int singleSize = size;  // one point, so increment is unused
  testBandwidthRange(singleSize, singleSize, DEFAULT_INCREMENT, kind, printmode,
                     memMode, startDevice, endDevice, wc);
}
///
// Run a range mode bandwidth test: measure every size in
// [start, end] stepped by increment, accumulating results over the
// devices [startDevice, endDevice], then print them.
///
void testBandwidthRange(unsigned int start, unsigned int end,
                        unsigned int increment, memcpyKind kind,
                        printMode printmode, memoryMode memMode,
                        int startDevice, int endDevice, bool wc) {
  // count the number of copies we're going to run
  unsigned int count = 1 + ((end - start) / increment);

  unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
  double *bandwidths = (double *)malloc(count * sizeof(double));

  // Fix: the original code dereferenced these without checking the allocation.
  if (memSizes == NULL || bandwidths == NULL) {
    fprintf(stderr, "Not enough memory available on host to run test!\n");
    exit(EXIT_FAILURE);
  }

  // Zero the accumulators so per-device bandwidths can be summed.
  for (unsigned int i = 0; i < count; i++) {
    bandwidths[i] = 0.0;
  }

  // Use the device asked by the user
  for (int currentDevice = startDevice; currentDevice <= endDevice;
       currentDevice++) {
    // Fix: check the cudaSetDevice result like the rest of the file does.
    checkCudaErrors(cudaSetDevice(currentDevice));

    // run each of the copies
    for (unsigned int i = 0; i < count; i++) {
      memSizes[i] = start + i * increment;

      switch (kind) {
        case DEVICE_TO_HOST:
          bandwidths[i] += testDeviceToHostTransfer(memSizes[i], memMode, wc);
          break;

        case HOST_TO_DEVICE:
          bandwidths[i] += testHostToDeviceTransfer(memSizes[i], memMode, wc);
          break;

        case DEVICE_TO_DEVICE:
          bandwidths[i] += testDeviceToDeviceTransfer(memSizes[i]);
          break;
      }
    }
  }  // Complete the bandwidth computation on all the devices

  // print results
  if (printmode == CSV) {
    printResultsCSV(memSizes, bandwidths, count, kind, memMode,
                    (1 + endDevice - startDevice), wc);
  } else {
    printResultsReadable(memSizes, bandwidths, count, kind, memMode,
                         (1 + endDevice - startDevice), wc);
  }

  // clean up
  free(memSizes);
  free(bandwidths);
}
//
// Intense shmoo mode - covers a large range of values with varying increments.
// The step size grows with the transfer size (1 KB steps up to 20 KB, then
// 2 KB, 10 KB, 100 KB, 1 MB, 2 MB and finally 4 MB steps up to the maximum).
//
void testBandwidthShmoo(memcpyKind kind, printMode printmode,
                        memoryMode memMode, int startDevice, int endDevice,
                        bool wc) {
  // count the number of copies to make: each size band contributes a fixed
  // number of steps, mirroring the increment ladder in the loop below.
  unsigned int count =
      1 + (SHMOO_LIMIT_20KB / SHMOO_INCREMENT_1KB) +
      ((SHMOO_LIMIT_50KB - SHMOO_LIMIT_20KB) / SHMOO_INCREMENT_2KB) +
      ((SHMOO_LIMIT_100KB - SHMOO_LIMIT_50KB) / SHMOO_INCREMENT_10KB) +
      ((SHMOO_LIMIT_1MB - SHMOO_LIMIT_100KB) / SHMOO_INCREMENT_100KB) +
      ((SHMOO_LIMIT_16MB - SHMOO_LIMIT_1MB) / SHMOO_INCREMENT_1MB) +
      ((SHMOO_LIMIT_32MB - SHMOO_LIMIT_16MB) / SHMOO_INCREMENT_2MB) +
      ((SHMOO_MEMSIZE_MAX - SHMOO_LIMIT_32MB) / SHMOO_INCREMENT_4MB);

  unsigned int *memSizes = (unsigned int *)malloc(count * sizeof(unsigned int));
  double *bandwidths = (double *)malloc(count * sizeof(double));

  // Fix: the original code dereferenced these without checking the allocation.
  if (memSizes == NULL || bandwidths == NULL) {
    fprintf(stderr, "Not enough memory available on host to run test!\n");
    exit(EXIT_FAILURE);
  }

  // Zero the accumulators so per-device bandwidths can be summed.
  for (unsigned int i = 0; i < count; i++) {
    bandwidths[i] = 0.0;
  }

  // Use the device asked by the user
  for (int currentDevice = startDevice; currentDevice <= endDevice;
       currentDevice++) {
    // Fix: check the cudaSetDevice result like the rest of the file does.
    checkCudaErrors(cudaSetDevice(currentDevice));

    // Run the shmoo
    int iteration = 0;
    unsigned int memSize = 0;

    while (memSize <= SHMOO_MEMSIZE_MAX) {
      // Advance by the increment of the current size band.
      if (memSize < SHMOO_LIMIT_20KB) {
        memSize += SHMOO_INCREMENT_1KB;
      } else if (memSize < SHMOO_LIMIT_50KB) {
        memSize += SHMOO_INCREMENT_2KB;
      } else if (memSize < SHMOO_LIMIT_100KB) {
        memSize += SHMOO_INCREMENT_10KB;
      } else if (memSize < SHMOO_LIMIT_1MB) {
        memSize += SHMOO_INCREMENT_100KB;
      } else if (memSize < SHMOO_LIMIT_16MB) {
        memSize += SHMOO_INCREMENT_1MB;
      } else if (memSize < SHMOO_LIMIT_32MB) {
        memSize += SHMOO_INCREMENT_2MB;
      } else {
        memSize += SHMOO_INCREMENT_4MB;
      }

      memSizes[iteration] = memSize;

      switch (kind) {
        case DEVICE_TO_HOST:
          bandwidths[iteration] +=
              testDeviceToHostTransfer(memSizes[iteration], memMode, wc);
          break;

        case HOST_TO_DEVICE:
          bandwidths[iteration] +=
              testHostToDeviceTransfer(memSizes[iteration], memMode, wc);
          break;

        case DEVICE_TO_DEVICE:
          bandwidths[iteration] +=
              testDeviceToDeviceTransfer(memSizes[iteration]);
          break;
      }

      iteration++;
      printf(".");   // progress indicator, one dot per measurement
      fflush(0);
    }
  }  // Complete the bandwidth computation on all the devices

  // print results
  printf("\n");

  if (CSV == printmode) {
    printResultsCSV(memSizes, bandwidths, count, kind, memMode,
                    (1 + endDevice - startDevice), wc);
  } else {
    printResultsReadable(memSizes, bandwidths, count, kind, memMode,
                         (1 + endDevice - startDevice), wc);
  }

  // clean up
  free(memSizes);
  free(bandwidths);
}
///
// Measure device-to-host copy bandwidth (GB/s) for one transfer size.
//
// memSize: number of bytes per copy
// memMode: PINNED (cudaHostAlloc/cudaMallocHost) or PAGEABLE (malloc) buffers
// wc:      if true and PINNED, host memory is allocated write-combined
//
// PINNED times MEMCOPY_ITERATIONS async copies with CUDA events (or the CPU
// timer when --cputiming is set); PAGEABLE times blocking copies on the CPU,
// flushing the cache between iterations.
///
float testDeviceToHostTransfer(unsigned int memSize, memoryMode memMode,
                               bool wc) {
  StopWatchInterface *timer = NULL;
  float elapsedTimeInMs = 0.0f;
  float bandwidthInGBs = 0.0f;
  unsigned char *h_idata = NULL;
  unsigned char *h_odata = NULL;
  cudaEvent_t start, stop;

  sdkCreateTimer(&timer);
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));

  // allocate host memory
  if (PINNED == memMode) {
    // pinned memory mode - use special function to get OS-pinned memory
#if CUDART_VERSION >= 2020
    checkCudaErrors(cudaHostAlloc((void **)&h_idata, memSize,
                                  (wc) ? cudaHostAllocWriteCombined : 0));
    checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize,
                                  (wc) ? cudaHostAllocWriteCombined : 0));
#else
    checkCudaErrors(cudaMallocHost((void **)&h_idata, memSize));
    checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
  } else {
    // pageable memory mode - use malloc
    h_idata = (unsigned char *)malloc(memSize);
    h_odata = (unsigned char *)malloc(memSize);

    if (h_idata == 0 || h_odata == 0) {
      fprintf(stderr, "Not enough memory avaialable on host to run test!\n");
      exit(EXIT_FAILURE);
    }
  }

  // initialize the memory with a repeating byte pattern
  for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
    h_idata[i] = (unsigned char)(i & 0xff);
  }

  // allocate device memory
  unsigned char *d_idata;
  checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));

  // initialize the device memory
  checkCudaErrors(
      cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));

  // copy data from GPU to Host
  if (PINNED == memMode) {
    if (bDontUseGPUTiming) sdkStartTimer(&timer);

    // Bracket the async copies with events on stream 0, then synchronize
    // before reading the elapsed event time.
    checkCudaErrors(cudaEventRecord(start, 0));

    for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
      checkCudaErrors(cudaMemcpyAsync(h_odata, d_idata, memSize,
                                      cudaMemcpyDeviceToHost, 0));
    }

    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));

    if (bDontUseGPUTiming) {
      // Override the event timing with the CPU wall-clock measurement.
      sdkStopTimer(&timer);
      elapsedTimeInMs = sdkGetTimerValue(&timer);
      sdkResetTimer(&timer);
    }
  } else {
    elapsedTimeInMs = 0;

    for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
      sdkStartTimer(&timer);
      checkCudaErrors(
          cudaMemcpy(h_odata, d_idata, memSize, cudaMemcpyDeviceToHost));
      sdkStopTimer(&timer);
      elapsedTimeInMs += sdkGetTimerValue(&timer);
      sdkResetTimer(&timer);
      // Rewrite a large buffer between iterations to flush the CPU cache,
      // so each pageable copy is measured cold.
      memset(flush_buf, i, FLUSH_SIZE);
    }
  }

  // calculate bandwidth in GB/s:
  // total bytes moved / 1e9, divided by the elapsed time in seconds
  double time_s = elapsedTimeInMs / 1e3;
  bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9;
  bandwidthInGBs = bandwidthInGBs / time_s;

  // clean up memory
  checkCudaErrors(cudaEventDestroy(stop));
  checkCudaErrors(cudaEventDestroy(start));
  sdkDeleteTimer(&timer);

  if (PINNED == memMode) {
    checkCudaErrors(cudaFreeHost(h_idata));
    checkCudaErrors(cudaFreeHost(h_odata));
  } else {
    free(h_idata);
    free(h_odata);
  }

  checkCudaErrors(cudaFree(d_idata));

  return bandwidthInGBs;
}
///
//! Measure host-to-device copy bandwidth (GB/s) for one transfer size.
//!
//! memSize: number of bytes per copy
//! memMode: PINNED (cudaHostAlloc/cudaMallocHost) or PAGEABLE (malloc) buffer
//! wc:      if true and PINNED, host memory is allocated write-combined
///
float testHostToDeviceTransfer(unsigned int memSize, memoryMode memMode,
                               bool wc) {
  StopWatchInterface *timer = NULL;
  float elapsedTimeInMs = 0.0f;
  float bandwidthInGBs = 0.0f;
  cudaEvent_t start, stop;

  sdkCreateTimer(&timer);
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));

  // allocate host memory
  unsigned char *h_odata = NULL;

  if (PINNED == memMode) {
#if CUDART_VERSION >= 2020
    // pinned memory mode - use special function to get OS-pinned memory
    checkCudaErrors(cudaHostAlloc((void **)&h_odata, memSize,
                                  (wc) ? cudaHostAllocWriteCombined : 0));
#else
    // pinned memory mode - use special function to get OS-pinned memory
    checkCudaErrors(cudaMallocHost((void **)&h_odata, memSize));
#endif
  } else {
    // pageable memory mode - use malloc
    h_odata = (unsigned char *)malloc(memSize);

    if (h_odata == 0) {
      fprintf(stderr, "Not enough memory available on host to run test!\n");
      exit(EXIT_FAILURE);
    }
  }

  // NOTE(review): these two buffers are filled below but never referenced
  // again in this function - the pageable path flushes the cache via
  // flush_buf instead; they look like leftovers. Confirm before removing.
  unsigned char *h_cacheClear1 = (unsigned char *)malloc(CACHE_CLEAR_SIZE);
  unsigned char *h_cacheClear2 = (unsigned char *)malloc(CACHE_CLEAR_SIZE);

  if (h_cacheClear1 == 0 || h_cacheClear2 == 0) {
    fprintf(stderr, "Not enough memory available on host to run test!\n");
    exit(EXIT_FAILURE);
  }

  // initialize the memory with a repeating byte pattern
  for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
    h_odata[i] = (unsigned char)(i & 0xff);
  }

  for (unsigned int i = 0; i < CACHE_CLEAR_SIZE / sizeof(unsigned char); i++) {
    h_cacheClear1[i] = (unsigned char)(i & 0xff);
    h_cacheClear2[i] = (unsigned char)(0xff - (i & 0xff));
  }

  // allocate device memory
  unsigned char *d_idata;
  checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));

  // copy host memory to device memory
  if (PINNED == memMode) {
    if (bDontUseGPUTiming) sdkStartTimer(&timer);

    // Bracket the async copies with events on stream 0, then synchronize
    // before reading the elapsed event time.
    checkCudaErrors(cudaEventRecord(start, 0));

    for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
      checkCudaErrors(cudaMemcpyAsync(d_idata, h_odata, memSize,
                                      cudaMemcpyHostToDevice, 0));
    }

    checkCudaErrors(cudaEventRecord(stop, 0));
    checkCudaErrors(cudaDeviceSynchronize());
    checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));

    if (bDontUseGPUTiming) {
      // Override the event timing with the CPU wall-clock measurement.
      sdkStopTimer(&timer);
      elapsedTimeInMs = sdkGetTimerValue(&timer);
      sdkResetTimer(&timer);
    }
  } else {
    elapsedTimeInMs = 0;

    for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
      sdkStartTimer(&timer);
      checkCudaErrors(
          cudaMemcpy(d_idata, h_odata, memSize, cudaMemcpyHostToDevice));
      sdkStopTimer(&timer);
      elapsedTimeInMs += sdkGetTimerValue(&timer);
      sdkResetTimer(&timer);
      // Rewrite a large buffer between iterations to flush the CPU cache,
      // so each pageable copy is measured cold.
      memset(flush_buf, i, FLUSH_SIZE);
    }
  }

  // calculate bandwidth in GB/s:
  // total bytes moved / 1e9, divided by the elapsed time in seconds
  double time_s = elapsedTimeInMs / 1e3;
  bandwidthInGBs = (memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9;
  bandwidthInGBs = bandwidthInGBs / time_s;

  // clean up memory
  checkCudaErrors(cudaEventDestroy(stop));
  checkCudaErrors(cudaEventDestroy(start));
  sdkDeleteTimer(&timer);

  if (PINNED == memMode) {
    checkCudaErrors(cudaFreeHost(h_odata));
  } else {
    free(h_odata);
  }

  free(h_cacheClear1);
  free(h_cacheClear2);
  checkCudaErrors(cudaFree(d_idata));

  return bandwidthInGBs;
}
///
//! Measure device-to-device copy bandwidth (GB/s) for one transfer size.
//! The result counts both the read and the write on the device, hence the
//! factor of 2 in the bandwidth calculation below.
///
float testDeviceToDeviceTransfer(unsigned int memSize) {
  StopWatchInterface *timer = NULL;
  float elapsedTimeInMs = 0.0f;
  float bandwidthInGBs = 0.0f;
  cudaEvent_t start, stop;

  sdkCreateTimer(&timer);
  checkCudaErrors(cudaEventCreate(&start));
  checkCudaErrors(cudaEventCreate(&stop));

  // allocate host memory (used only to seed the device source buffer)
  unsigned char *h_idata = (unsigned char *)malloc(memSize);

  if (h_idata == 0) {
    fprintf(stderr, "Not enough memory avaialable on host to run test!\n");
    exit(EXIT_FAILURE);
  }

  // initialize the host memory with a repeating byte pattern
  for (unsigned int i = 0; i < memSize / sizeof(unsigned char); i++) {
    h_idata[i] = (unsigned char)(i & 0xff);
  }

  // allocate device memory (source and destination)
  unsigned char *d_idata;
  checkCudaErrors(cudaMalloc((void **)&d_idata, memSize));
  unsigned char *d_odata;
  checkCudaErrors(cudaMalloc((void **)&d_odata, memSize));

  // initialize memory
  checkCudaErrors(
      cudaMemcpy(d_idata, h_idata, memSize, cudaMemcpyHostToDevice));

  // run the memcopy: CPU timer and CUDA events both bracket the loop;
  // which one is reported depends on bDontUseGPUTiming below.
  sdkStartTimer(&timer);
  checkCudaErrors(cudaEventRecord(start, 0));

  for (unsigned int i = 0; i < MEMCOPY_ITERATIONS; i++) {
    checkCudaErrors(
        cudaMemcpy(d_odata, d_idata, memSize, cudaMemcpyDeviceToDevice));
  }

  checkCudaErrors(cudaEventRecord(stop, 0));

  // Since device to device memory copies are non-blocking,
  // cudaDeviceSynchronize() is required in order to get
  // proper timing.
  checkCudaErrors(cudaDeviceSynchronize());

  // get the total elapsed time in ms
  sdkStopTimer(&timer);
  checkCudaErrors(cudaEventElapsedTime(&elapsedTimeInMs, start, stop));

  if (bDontUseGPUTiming) {
    // --cputiming: report the CPU wall-clock time instead of the event time.
    elapsedTimeInMs = sdkGetTimerValue(&timer);
  }

  // calculate bandwidth in GB/s (x2: each copy reads and writes memSize bytes)
  double time_s = elapsedTimeInMs / 1e3;
  bandwidthInGBs = (2.0f * memSize * (float)MEMCOPY_ITERATIONS) / (double)1e9;
  bandwidthInGBs = bandwidthInGBs / time_s;

  // clean up memory
  sdkDeleteTimer(&timer);
  free(h_idata);
  checkCudaErrors(cudaEventDestroy(stop));
  checkCudaErrors(cudaEventDestroy(start));
  checkCudaErrors(cudaFree(d_idata));
  checkCudaErrors(cudaFree(d_odata));

  return bandwidthInGBs;
}
///
// Print results in an easily read format: one header per configuration,
// then one "size <tab> bandwidth" row per measurement.
///
void printResultsReadable(unsigned int *memSizes, double *bandwidths,
                          unsigned int count, memcpyKind kind,
                          memoryMode memMode, int iNumDevs, bool wc) {
  printf(" %s Bandwidth, %i Device(s)\n", sMemoryCopyKind[kind], iNumDevs);
  printf(" %s Memory Transfers\n", sMemoryMode[memMode]);

  if (wc) {
    // Fix: terminate the line so the table header below starts on its own
    // line (the original printf had no trailing newline).
    printf(" Write-Combined Memory Writes are Enabled\n");
  }

  printf(" Transfer Size (Bytes)\tBandwidth(GB/s)\n");

  // Fix: with count == 0 the unsigned "count - 1" loop bound wraps around
  // and the trailing printf reads out of bounds; bail out early instead.
  if (count == 0) {
    printf("\n");
    return;
  }

  unsigned int i;

  for (i = 0; i < (count - 1); i++) {
    // Sizes below five digits get an extra tab so the columns line up.
    printf(" %u\t\t\t%s%.1f\n", memSizes[i],
           (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
  }

  // Last row carries an extra blank line to separate configurations.
  printf(" %u\t\t\t%s%.1f\n\n", memSizes[i],
         (memSizes[i] < 10000) ? "\t" : "", bandwidths[i]);
}
///
// Print results in a database-friendly CSV-ish format: one line per
// measurement, tagged with a short configuration string such as
// "H2D-Pinned-WriteCombined" or "D2D".
///
void printResultsCSV(unsigned int *memSizes, double *bandwidths,
                     unsigned int count, memcpyKind kind, memoryMode memMode,
                     int iNumDevs, bool wc) {
  // Assemble the configuration tag: direction first, ...
  std::string sConfig;

  switch (kind) {
    case DEVICE_TO_DEVICE:
      sConfig += "D2D";
      break;
    case DEVICE_TO_HOST:
      sConfig += "D2H";
      break;
    case HOST_TO_DEVICE:
      sConfig += "H2D";
      break;
  }

  // ... then the host memory mode, which only applies to host<->device runs.
  if (kind != DEVICE_TO_DEVICE) {
    if (memMode == PAGEABLE) {
      sConfig += "-Paged";
    } else if (memMode == PINNED) {
      sConfig += "-Pinned";

      if (wc) {
        sConfig += "-WriteCombined";
      }
    }
  }

  // Emit one record per measurement; the time is derived back from the
  // bandwidth, so it reflects a single pass of the averaged copies.
  for (unsigned int idx = 0; idx < count; idx++) {
    double dSeconds = (double)memSizes[idx] / (bandwidths[idx] * (double)(1e9));
    printf(
        "bandwidthTest-%s, Bandwidth = %.1f GB/s, Time = %.5f s, Size = %u "
        "bytes, NumDevsUsed = %d\n",
        sConfig.c_str(), bandwidths[idx], dSeconds, memSizes[idx], iNumDevs);
  }
}
///
// Print the command-line usage / help screen.
///
void printHelp(void) {
  // Adjacent string literals are concatenated by the compiler, so these
  // grouped printf calls emit exactly the same text as one call per line.
  printf(
      "Usage: bandwidthTest [OPTION]...\n"
      "Test the bandwidth for device to host, host to device, and device to "
      "device transfers\n"
      "\n"
      "Example: measure the bandwidth of device to host pinned memory copies "
      "in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments\n"
      "./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 "
      "--increment=1024 --dtoh\n"
      "\n");
  printf(
      "Options:\n"
      "--help\tDisplay this help menu\n"
      "--csv\tPrint results as a CSV\n"
      "--device=[deviceno]\tSpecify the device device to be used\n"
      " all - compute cumulative bandwidth on all the devices\n"
      " 0,1,2,...,n - Specify any particular device to be used\n"
      "--memory=[MEMMODE]\tSpecify which memory mode to use\n"
      " pageable - pageable memory\n"
      " pinned - non-pageable system memory\n"
      "--mode=[MODE]\tSpecify the mode to use\n"
      " quick - performs a quick measurement\n"
      " range - measures a user-specified range of values\n"
      " shmoo - performs an intense shmoo of a large range of values\n"
      "--htod\tMeasure host to device transfers\n"
      "--dtoh\tMeasure device to host transfers\n"
      "--dtod\tMeasure device to device transfers\n");
#if CUDART_VERSION >= 2020
  printf("--wc\tAllocate pinned memory as write-combined\n");
#endif
  printf(
      "--cputiming\tForce CPU-based timing always\n"
      "Range mode options\n"
      "--start=[SIZE]\tStarting transfer size in bytes\n"
      "--end=[SIZE]\tEnding transfer size in bytes\n"
      "--increment=[SIZE]\tIncrement size in bytes\n");
}
代码详解:
1. 程序首先会检测可用的 CUDA 设备,并选择一个合适的设备进行测试。
其中测试配置需要用户输入args:
memory: 内存分配模式,可选 pageable(可分页内存)或 pinned(锁页内存)
direction: 内存传输方向,如从主机到设备、从设备到主机等
ETC: 等其他一些测试参数
2. 接着在主机和设备上分配内存。然后在主机内存和设备内存上分配用于测试的内存空间
3. 接下来,程序会根据测试配置执行一系列内存拷贝操作,共有三种传输方向:
主机到设备的内存拷贝
设备到主机的内存拷贝
设备到设备的内存拷贝(在设备内存内部进行)
在每种操作中,程序会记录时间并计算带宽,记录内存拷贝操作的执行时间,根据传输的内存大小和执行时间,计算出内存带宽(单位为 GB/s)。
4. 最后,程序会输出各种测试操作的带宽结果,供用户分析和比较。
该Sample中main函数主要调用了runTest函数,该函数会解析用户输入的args测试参数,然后根据这些参数运行合适的内存带宽测试,最终输出各种测试操作的带宽数据。以下是用户可以输入的测试参数:
>bandwidthTest.exe --help
[CUDA Bandwidth Test] - Starting...
Usage: bandwidthTest [OPTION]...
Test the bandwidth for device to host, host to device, and device to device transfers
Example: measure the bandwidth of device to host pinned memory copies in the range 1024 Bytes to 102400 Bytes in 1024 Byte increments
./bandwidthTest --memory=pinned --mode=range --start=1024 --end=102400 --increment=1024 --dtoh
Options:
--help Display this help menu
--csv Print results as a CSV
--device=[deviceno] Specify the device device to be used
all - compute cumulative bandwidth on all the devices
0,1,2,...,n - Specify any particular device to be used
--memory=[MEMMODE] Specify which memory mode to use
pageable - pageable memory
pinned - non-pageable system memory
--mode=[MODE] Specify the mode to use
quick - performs a quick measurement
range - measures a user-specified range of values
shmoo - performs an intense shmoo of a large range of values
--htod Measure host to device transfers
--dtoh Measure device to host transfers
--dtod Measure device to device transfers
--wc Allocate pinned memory as write-combined
--cputiming Force CPU-based timing always
Range mode options
--start=[SIZE] Starting transfer size in bytes
--end=[SIZE] Ending transfer size in bytes
--increment=[SIZE] Increment size in bytes
Result = PASS
NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.
解析完测试参数后会调用testBandwidth函数,该函数会根据传入的 mode
参数,调用相应的测试函数
SHMOO_MODE
: 调用testBandwidthShmoo
函数,执行全面的"Shmoo"测试RANGE_MODE
: 调用testBandwidthRange
函数,执行指定范围和增量的测试QUICK_MODE
: 调用testBandwidthQuick
函数,执行默认大小的快速测试
这些测试函数内部会执行实际的内存带宽测试,并输出测试结果。
本文以 RANGE_MODE 为例,该模式通过 testBandwidthRange 函数测试,它实现了"RANGE_MODE"下的内存带宽测试。该函数细节如下
-
函数参数:
start
,end
,increment
: 定义了测试范围和增量步长kind
: 指定内存传输方向,如 DEVICE_TO_HOST、HOST_TO_DEVICE、DEVICE_TO_DEVICEprintmode
: 指定输出格式,如 CSV 或可读格式memMode
: 指定内存模式,可以是 PINNED(锁页内存)或 PAGEABLE(可分页内存)startDevice
,endDevice
: 指定要测试的设备范围wc
: 是否使用写合并(write combining)内存
-
函数实现:
- 根据测试范围和增量,计算需要进行的测试次数
- 动态分配内存用于存储测试结果
- 初始化
bandwidths
数组为 0 - 遍历指定的设备范围,对每个设备执行测试
- 在每个设备上,遍历测试范围,调用相应的带宽测试函数:
testDeviceToHostTransfer
testHostToDeviceTransfer
testDeviceToDeviceTransfer
- 将每次测试的带宽结果累加到
bandwidths
数组中
- 在每个设备上,遍历测试范围,调用相应的带宽测试函数:
- 根据指定的输出格式(CSV 或可读格式),调用相应的打印函数输出结果
- 释放动态分配的内存
-
测试流程:
- 首先确定测试范围和增量,计算需要进行的测试次数
- 对于指定的设备范围,在每个设备上执行测试
- 对于每个设备,逐步增大内存块大小,调用相应的带宽测试函数
- 将每次测试的带宽结果累加,最后输出汇总结果
testBandwidthRange
函数中以testHostToDeviceTransfer为例,
该函数用于测试主机到设备的内存拷贝操作的带宽的 CUDA 函数。下面是对代码的解释:
-
该函数接受三个参数:
memSize
: 要拷贝的内存大小(以字节为单位)memMode
: 内存模式,可以是"页面锁定模式"(PINNED)或"可分页模式"(PAGEABLE)wc
: 一个布尔值,用于启用/禁用写组合(Write Combining)内存
-
该函数首先根据内存模式分配主机内存,并初始化内存内容。
-
然后它分配设备内存,并进行主机到设备的内存拷贝操作。如果是页面锁定模式,它使用
cudaMemcpyAsync()
函数进行异步拷贝,并使用 CUDA 事件测量拷贝耗时。如果是可分页模式,它使用cudaMemcpy()
函数进行同步拷贝,并使用 SDK 计时器测量耗时。 -
最后,该函数计算带宽并返回结果。带宽计算公式为:
(memSize * MEMCOPY_ITERATIONS) / (time_s * 1e9) GB/s
。 -
在函数结束时,它会释放分配的主机和设备内存。
总的来说,testBandwidth
函数是 bandwidthtest 程序的核心,它根据用户的输入参数,调用相应的测试函数来执行内存带宽测试。其他模式的代码逻辑类似,在此不做赘述。
CUDA 相关API解读
该Sample使用的CUDA API: cudaEventCreate, cudaEventRecord, cudaEventDestroy, cudaEventElapsedTime, cudaMemcpyAsync
这些API属于CUDA runtime API, 我们先介绍CUDA runtime API中event的概念:
Events runtime提供了一种密切监控device端进度以及执行精确计时的方式,让应用程序能够在程序的任何时候异步记录事件,并查询这些事件何时完成。当前一个事件的所有任务(或可选地,给定Stream中的所有命令)都已完成时,该事件就算完成了。Stream 0中的事件在所有先前的任务和所有Stream中的命令都完成后才算完成。
Events创建和销毁
创建2个events:
cudaEvent_t start, stop;
cudaEventCreate(&start);
cudaEventCreate(&stop);
销毁2个events:
cudaEventDestroy(start);
cudaEventDestroy(stop);
计算时间
cudaEventRecord(start, 0);
for (int i = 0; i < 2; ++i) {
cudaMemcpyAsync(inputDev + i * size, inputHost + i * size,
size, cudaMemcpyHostToDevice, stream[i]);
MyKernel<<<100, 512, 0, stream[i]>>>
(outputDev + i * size, inputDev + i * size, size);
cudaMemcpyAsync(outputHost + i * size, outputDev + i * size,
size, cudaMemcpyDeviceToHost, stream[i]);
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
运行结果:
默认Quick Mode下的测试结果:
[CUDA Bandwidth Test] - Starting...
Running on...
Device 0: NVIDIA GeForce RTX 4080
Quick Mode
Host to Device Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(GB/s)
32000000 18.4
Device to Host Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(GB/s)
32000000 15.7
Device to Device Bandwidth, 1 Device(s)
PINNED Memory Transfers
Transfer Size (Bytes) Bandwidth(GB/s)
32000000 2839.9
Result = PASS
NOTE: The CUDA Samples are not meant for performance measurements. Results may vary when GPU Boost is enabled.