2.1.6 验证核函数
三种方法:
①编写一个相同功能的主机函数验证核函数的结果。
②在Fermi及更高版本的设备端的核函数中使用 printf 函数
③可以将执行参数设置为 <<<1, 1>>>,从而强制使用一个块和一个线程执行核函数,这模拟了程序的串行执行,便于调试和验证结果。
2.1.7 处理错误
由于许多 CUDA 函数是异步的,所以有时可能很难确定某个错误是由哪一步程序引起的。定义一个错误处理宏封装所有的CUDA API 调用,这样可以简化错误检查过程:
// Wraps a CUDA runtime call: on failure, print the file/line and a readable
// reason, then exit — CUDA errors are sticky, so later calls would fail too.
// The do { ... } while (0) wrapper makes the macro expand to a single
// statement, so it is safe inside an if/else without braces (the original
// bare-brace form breaks there).
#define checkCudaErrors(call)                                         \
do {                                                                  \
    const cudaError_t error = call;                                   \
    if (error != cudaSuccess)                                         \
    {                                                                 \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);        \
        fprintf(stderr, "code:%d, reason: %s\n", error,               \
                cudaGetErrorString(error));                           \
        exit(1);                                                      \
    }                                                                 \
} while (0)
2.2 给核函数计时
衡量核函数性能的方法有很多,最简单的方法是在主机端使用一个 CPU 或 GPU 计时器来计算内核的执行时间。
2.2.1 用CPU计时器计时
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
// Absolute value. The original expansion `a > 0 ? a : a` returned `a` on BOTH
// branches (never negated), so any negative difference passed every tolerance
// check. Arguments are parenthesized to survive expressions like abs(x - y).
#define abs(a) ((a) > 0 ? (a) : -(a))
// Wraps a CUDA runtime call: on failure, print the file/line and a readable
// reason, then exit — CUDA errors are sticky, so later calls would fail too.
// The do { ... } while (0) wrapper makes the macro expand to a single
// statement, so it is safe inside an if/else without braces (the original
// bare-brace form breaks there).
#define checkCudaErrors(call)                                         \
do {                                                                  \
    const cudaError_t error = call;                                   \
    if (error != cudaSuccess)                                         \
    {                                                                 \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__);        \
        fprintf(stderr, "code:%d, reason: %s\n", error,               \
                cudaGetErrorString(error));                           \
        exit(1);                                                      \
    }                                                                 \
} while (0)
/*
 * Compare the host reference result against the GPU result element-wise.
 * Prints "Arrays match." when every pair differs by at most epsilon;
 * otherwise reports the first mismatching index and stops comparing.
 *
 * hostRef: CPU-computed reference values (length N)
 * gpuRef:  values copied back from the device (length N)
 * N:       number of elements to compare
 */
void checkResult(float* hostRef, float* gpuRef, const int N) {
    double epsilon = 1.0E-8;  // tolerance; exact float equality across host/device is not expected
    bool match = 1;
    for (int i = 0; i < N; i++) {
        // fabs() from <math.h> is used instead of the file-local abs macro,
        // which returned its argument unchanged for negative differences.
        if (fabs(hostRef[i] - gpuRef[i]) > epsilon) {
            match = 0;
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i], gpuRef[i], i);
            break;
        }
    }
    if (match) printf("Arrays match.\n\n");
}
/*
 * Fill ip[0..size-1] with pseudo-random floats in [0.0, 25.5].
 * A time-based seed is installed on every call, so two calls within the
 * same second produce identical sequences.
 */
void initialData(float* ip, int size) {
    time_t seed;
    srand((unsigned)time(&seed));

    int idx = 0;
    while (idx < size) {
        // keep the low byte of rand() (0..255), scaled down by 10
        ip[idx] = (float)(rand() & 0xFF) / 10.0f;
        ++idx;
    }
}
/* Serial CPU reference: element-wise C[k] = A[k] + B[k] for k in [0, N). */
void sumArraysOnHost(float* A, float* B, float* C, const int N) {
    for (int k = 0; k != N; ++k) {
        C[k] = A[k] + B[k];
    }
}
/*
 * Element-wise C = A + B; one output element per thread.
 * Expects a 1-D grid of 1-D blocks whose total thread count is >= N;
 * threads in the grid tail past N do no work.
 */
__global__ void sumArraysOnGPU(float* A, float* B, float* C, const int N) {
    const int tid = threadIdx.x + blockIdx.x * blockDim.x;
    if (tid >= N) return;  // bounds guard: grid rarely divides N evenly
    C[tid] = A[tid] + B[tid];
}
/*
 * Return elapsed CPU time in SECONDS.
 * The original returned raw clock() ticks, so every "Time elapsed %f sec"
 * printed by callers was off by a factor of CLOCKS_PER_SEC; dividing by
 * CLOCKS_PER_SEC makes the unit match the name and the printf text.
 */
double cpuSecond()
{
    return (double)clock() / (double)CLOCKS_PER_SEC;
}
/*
 * Driver: times vector addition (1<<24 floats) on host and device, then
 * verifies the GPU result against the CPU reference with checkResult().
 * Every CUDA API call is checked; the kernel launch is checked via
 * cudaGetLastError() and its completion via cudaDeviceSynchronize().
 */
int main(int argc, char** argv) {
    printf("%s Starting...\n", argv[0]);

    // set up device
    int dev = 0;
    cudaDeviceProp deviceProp;
    checkCudaErrors(cudaGetDeviceProperties(&deviceProp, dev));
    printf("Using Device %d: %s\n", dev, deviceProp.name);
    checkCudaErrors(cudaSetDevice(dev));

    // set up data size of vectors
    int nElem = 1 << 24;
    printf("Vector size %d\n", nElem);

    // malloc host memory (check: 4 x 64 MB can fail on small machines)
    size_t nBytes = nElem * sizeof(float);
    float* h_A = (float*)malloc(nBytes);
    float* h_B = (float*)malloc(nBytes);
    float* hostRef = (float*)malloc(nBytes);
    float* gpuRef = (float*)malloc(nBytes);
    if (!h_A || !h_B || !hostRef || !gpuRef) {
        printf("Error: host malloc failed\n");
        exit(1);
    }

    double iStart, iElaps;

    // initialize data at host side
    iStart = cpuSecond();
    initialData(h_A, nElem);
    initialData(h_B, nElem);
    iElaps = cpuSecond() - iStart;
    memset(hostRef, 0, nBytes);
    memset(gpuRef, 0, nBytes);

    // add vectors at host side for result checks
    iStart = cpuSecond();
    sumArraysOnHost(h_A, h_B, hostRef, nElem);
    iElaps = cpuSecond() - iStart;

    // malloc device global memory — check every call: a failed cudaMalloc
    // would otherwise surface later as a confusing kernel/copy error
    float* d_A, * d_B, * d_C;
    checkCudaErrors(cudaMalloc((void**)&d_A, nBytes));
    checkCudaErrors(cudaMalloc((void**)&d_B, nBytes));
    checkCudaErrors(cudaMalloc((void**)&d_C, nBytes));

    // transfer data from host to device
    checkCudaErrors(cudaMemcpy(d_A, h_A, nBytes, cudaMemcpyHostToDevice));
    checkCudaErrors(cudaMemcpy(d_B, h_B, nBytes, cudaMemcpyHostToDevice));

    // invoke kernel at host side
    int iLen = 1024;
    dim3 block(iLen);
    dim3 grid((nElem + block.x - 1) / block.x);  // ceil-div so grid covers nElem

    iStart = cpuSecond();
    sumArraysOnGPU <<<grid, block>>> (d_A, d_B, d_C, nElem);
    checkCudaErrors(cudaGetLastError());      // launch-config errors only show here
    // cudaThreadSynchronize() is deprecated; cudaDeviceSynchronize() is the
    // supported call, and waiting is required for the timing to be meaningful
    checkCudaErrors(cudaDeviceSynchronize());
    iElaps = cpuSecond() - iStart;
    printf("sumArraysOnGPU <<<%d,%d>>> Time elapsed %f sec\n", grid.x, block.x, iElaps);

    // copy kernel result back to host side
    checkCudaErrors(cudaMemcpy(gpuRef, d_C, nBytes, cudaMemcpyDeviceToHost));

    // check device results
    checkResult(hostRef, gpuRef, nElem);

    // free device global memory
    checkCudaErrors(cudaFree(d_A));
    checkCudaErrors(cudaFree(d_B));
    checkCudaErrors(cudaFree(d_C));

    // free host memory
    free(h_A);
    free(h_B);
    free(hostRef);
    free(gpuRef);

    system("pause");  // Windows-only: keep the console window open
    return(0);
}
2.2.2 用 nvprof 工具计时
nvprof 是CUDA 5.0以来,NVIDIA 提供的一个命令行分析工具,可以帮助从应用程序的 CPU 和 GPU活动情况中获取时间线信息,其包括内核执行,内存传输,以及 CUDA API 的调用。
使用语法:
nvprof [nvprof_args] <application> [application_args]
应用程序没有参数,直接调用即可:
nvprof application_name
例子:管理员身份运行 cmd 转移到.exe的对应文件夹下,调用 nvprof 工具运行相应的程序。