错误检测头文件 error.cuh:
#pragma once
#include <stdio.h>
/*
 * CHECK(call)
 *
 * Evaluates a CUDA runtime API expression exactly once. If the result is not
 * cudaSuccess, prints the source file, line number, numeric error code and
 * the text from cudaGetErrorString() to stderr, then terminates the process
 * with exit status 1.
 *
 * The argument is parenthesised and the internal variable has a name that is
 * unlikely to clash with an identifier used inside `call`, so an expression
 * such as CHECK(error_code) is evaluated correctly. The do { } while (0)
 * wrapper makes the macro behave like a single statement.
 */
#define CHECK(call)                                                     \
    do                                                                  \
    {                                                                   \
        const cudaError_t check_error_code_ = (call);                   \
        if (check_error_code_ != cudaSuccess)                           \
        {                                                               \
            fprintf(stderr, "CUDA Error:\n");                           \
            fprintf(stderr, " File: %s\n", __FILE__);                   \
            fprintf(stderr, " Line: %d\n", __LINE__);                   \
            fprintf(stderr, " Error code: %d\n",                        \
                    (int)check_error_code_);                            \
            fprintf(stderr, " Error text: %s\n",                        \
                    cudaGetErrorString(check_error_code_));             \
            exit(1);                                                    \
        }                                                               \
    } while (0)
CUDA事件计时框架示例 arithmetic2gpu.cu:
#include "error.cuh"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
// Floating-point type used by the host code and the kernel: double when
// USE_DP is defined at compile time (e.g. with -DUSE_DP), float otherwise.
#ifdef USE_DP // macro supplied through a compile option
typedef double real; // use double-precision floating point
#else
typedef float real; // use single-precision floating point
#endif
// Number of timed runs that enter the averaged statistics in main; main
// performs one additional first run whose timing is not accumulated.
const int NUM_REPEATS = 10;
// Threshold handed to the kernel: each element is incremented until its
// square root is no longer below this value.
const real x0 = 100.0;
// Forward declaration of the kernel defined after main.
void __global__ arithmetic(real *x, const real x0, const int N);
/*
 * Benchmarks the `arithmetic` kernel using CUDA events.
 *
 * Usage: <program> N     (N = number of array elements, must be positive)
 *
 * The kernel is launched NUM_REPEATS + 1 times, each time on a device array
 * that has just been reset to zero. The timing of the first launch is printed
 * but excluded from the statistics; the mean and standard deviation of the
 * remaining NUM_REPEATS timings are printed in milliseconds.
 *
 * Returns 0 on success. Exits with status 1 on a wrong argument count, a
 * non-positive N, a host allocation failure, or any CUDA error (via CHECK).
 */
int main(int argc, char **argv)
{
    if (argc != 2)
    {
        printf("usage: %s N\n", argv[0]);
        exit(1);
    }

    const int N = atoi(argv[1]);
    if (N <= 0)
    {
        // atoi returns 0 for non-numeric input; a zero or negative N would
        // produce an invalid grid size and a meaningless allocation size.
        printf("N must be a positive integer.\n");
        exit(1);
    }

    const int block_size = 128;
    // Ceiling division written so that it cannot overflow int for large N.
    const int grid_size = (N - 1) / block_size + 1;
    // Byte count kept in size_t: sizeof(real) * N does not fit in int for
    // large N.
    const size_t M = sizeof(real) * N;

    real *h_x = (real *)malloc(M);
    if (h_x == NULL)
    {
        printf("Failed to allocate %zu bytes of host memory.\n", M);
        exit(1);
    }
    // The host buffer is only ever a source of zeros, so fill it once.
    for (int n = 0; n < N; ++n)
    {
        h_x[n] = 0.0;
    }

    real *d_x;
    CHECK(cudaMalloc((void **)&d_x, M));

    // The same pair of events is re-recorded on every repetition.
    cudaEvent_t start, stop;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));

    float t_sum = 0;
    float t2_sum = 0;
    for (int repeat = 0; repeat <= NUM_REPEATS; ++repeat)
    {
        // Reset the device array so every launch does the same work.
        CHECK(cudaMemcpy(d_x, h_x, M, cudaMemcpyHostToDevice));

        CHECK(cudaEventRecord(start)); // start of the timed region

        // Deliberately not wrapped in CHECK directly: cudaEventQuery reports
        // cudaErrorNotReady while the event is still pending, which is not a
        // failure. The original note says this call is here for the CUDA
        // stream queue (presumably to flush it).
        const cudaError_t query_status = cudaEventQuery(start);
        if (query_status != cudaErrorNotReady)
        {
            CHECK(query_status);
        }
        else
        {
            // Discard the benign status so that the launch check below only
            // reports problems caused by the launch itself.
            cudaGetLastError();
        }

        arithmetic<<<grid_size, block_size>>>(d_x, x0, N); // timed code

        CHECK(cudaEventRecord(stop)); // end of the timed region
        // A kernel launch returns no status; a failed launch (bad
        // configuration, no kernel image for this GPU, ...) is only visible
        // through cudaGetLastError.
        CHECK(cudaGetLastError());
        CHECK(cudaEventSynchronize(stop)); // wait until `stop` is reached

        float elapsed_time;
        CHECK(cudaEventElapsedTime(&elapsed_time, start, stop));
        printf("Time = %g ms.\n", elapsed_time);

        // The first run (repeat == 0) is printed but not accumulated.
        if (repeat > 0)
        {
            t_sum += elapsed_time;
            t2_sum += elapsed_time * elapsed_time;
        }
    }

    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));

    const float t_ave = t_sum / NUM_REPEATS;
    // Rounding can push the variance slightly below zero when all timings
    // are nearly equal; clamp so that sqrt does not return NaN.
    const float variance = t2_sum / NUM_REPEATS - t_ave * t_ave;
    const float t_err = sqrt(variance > 0 ? variance : 0.0f);
    printf("Time = %g +- %g ms.\n", t_ave, t_err);

    free(h_x);
    CHECK(cudaFree(d_x));
    return 0;
}
/*
 * Kernel: one array element per thread.
 *
 * The thread's global index is blockDim.x * blockIdx.x + threadIdx.x; threads
 * whose index is not below N do nothing. Every other thread loads its element
 * of d_x, adds 1 repeatedly for as long as the square root of the value is
 * below x0, and writes the final value back to the same element.
 *
 *   d_x  array that is read and updated in place
 *   x0   threshold compared against sqrt of the running value
 *   N    number of elements to process
 */
void __global__ arithmetic(real *d_x, const real x0, const int N)
{
    const int idx = threadIdx.x + blockIdx.x * blockDim.x;

    // Guard for the tail of the grid: surplus threads leave immediately.
    if (idx >= N)
    {
        return;
    }

    real value = d_x[idx];
    while (sqrt(value) < x0)
    {
        value += 1;
    }
    d_x[idx] = value;
}
编译命令:
nvcc -O3 -arch=sm_75 -DUSE_DP arithmetic2gpu.cu -o arithmetic2gpu
运行:
./arithmetic2gpu N    (N 为数组元素个数, 例如: ./arithmetic2gpu 1000000)