1、使用 CUDA 提供的 Event 进行计时
// create CUDA event handles used to time work submitted to stream 0
cudaEvent_t start, stop;
checkCudaErrors(cudaEventCreate(&start));
checkCudaErrors(cudaEventCreate(&stop));
// record timestamps around the H2D copy + kernel + D2H copy; all work is
// enqueued on stream 0, so the two events bracket the whole sequence
checkCudaErrors(cudaEventRecord(start, 0));
checkCudaErrors(cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0));
increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
checkCudaErrors(cudaGetLastError());  // launches don't return errors directly
checkCudaErrors(cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0));
checkCudaErrors(cudaEventRecord(stop, 0));
// BUG FIX: wait for the stop event to complete before querying the elapsed
// time — otherwise cudaEventElapsedTime fails with cudaErrorNotReady
checkCudaErrors(cudaEventSynchronize(stop));
// cudaEventElapsedTime reports the interval in milliseconds
checkCudaErrors(cudaEventElapsedTime(&gpu_time, start, stop));
// BUG FIX: print the value in its native unit (ms) and label it; the old
// code divided by 1000 (seconds) while the comment claimed ms, with no unit
printf("time spent executing by the GPU: %.2f ms\n", gpu_time);
// release the event handles
checkCudaErrors(cudaEventDestroy(start));
checkCudaErrors(cudaEventDestroy(stop));
2、使用CPU记录内核程序运行时间
// initialization of a host-side (CPU wall-clock) timer from helper_timer.h
StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
sdkResetTimer(&timer);
// take host timestamps around the CUDA calls; note these calls are
// asynchronous and return as soon as the work is *enqueued*, so without a
// synchronization the timer mainly captures CPU-side call/launch overhead
sdkStartTimer(&timer);
cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
sdkStopTimer(&timer);
// print the elapsed host time (sdkGetTimerValue returns milliseconds)
printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
// BUG FIX: release the timer allocated by sdkCreateTimer (was leaked)
sdkDeleteTimer(&timer);
那么两种计时的区别在哪?
先来看一下以上两者所得结果的差别
差距究竟在哪呢?我们来更换一下cpu计时的位置:
// initialization of a host-side timer, now placed around the kernel only
StopWatchInterface *timer = NULL;
sdkCreateTimer(&timer);
sdkResetTimer(&timer);
// copy input first so it is excluded from the measurement
cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0);
sdkStartTimer(&timer);
increment_kernel<<<blocks, threads, 0, 0>>>(d_a, value);
// BUG FIX: the launch is asynchronous and returns immediately; without a
// sync the timer records only the µs-scale launch overhead, not the
// kernel's actual run time, which is what this snippet sets out to measure
cudaDeviceSynchronize();
sdkStopTimer(&timer);
cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0);
// print the elapsed host time (milliseconds)
printf("time spent by CPU in CUDA calls: %.2f\n", sdkGetTimerValue(&timer));
// BUG FIX: release the timer allocated by sdkCreateTimer (was leaked)
sdkDeleteTimer(&timer);
那么就很明显了:CUDA event 计时记录的是两个事件之间 GPU 上实际执行所花的时间(事件在 GPU 的流中被标记和完成),而 CPU 计时记录的是主机侧经过的墙钟时间,会把 API 调用、内核启动开销乃至操作系统调度等都算进去;而且异步调用会立即返回,若不加同步,CPU 计时甚至可能漏掉尚未完成的 GPU 工作。所以要精确测量内核程序的执行时间,还是使用 CUDA event 更可靠。