说明
这段代码改自官方样例 asyncAPI.cu,为了方便初学者理解,内容有所删改。样例涉及的知识点相对较为综合,学完之后收获也挺大。代码里的注释都是我根据自己的理解添加的。
代码
// includes, system
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
// includes CUDA Runtime
#include <cuda_runtime.h>
// Adds inc_value to one element of g_data per thread.
//
// Each thread derives a flat 1D index from blockIdx.x, blockDim.x and
// threadIdx.x and updates the element at that index. There is no bounds
// check, so the launch must not create more threads than g_data has elements.
__global__ void increment_kernel(int* g_data, int inc_value) {
    const int element = threadIdx.x + blockDim.x * blockIdx.x;
    g_data[element] += inc_value;
}
// Evaluates a CUDA runtime call; if it does not return cudaSuccess, prints the
// source location and the runtime's error string to stderr and exits.
#define CUDA_CHECK(call)                                                   \
    do {                                                                   \
        cudaError_t cuda_check_err_ = (call);                              \
        if (cuda_check_err_ != cudaSuccess) {                              \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__,         \
                    __LINE__, cudaGetErrorString(cuda_check_err_));        \
            exit(EXIT_FAILURE);                                            \
        }                                                                  \
    } while (0)

// Demonstrates asynchronous copy / kernel / copy on stream 0, timed with
// CUDA events.
//
// Steps: print the name of device 0, allocate a zeroed host buffer and a
// device buffer of 16M ints, queue host-to-device copy, increment_kernel and
// device-to-host copy between two events, wait for the stop event, print the
// elapsed time, verify the result and release all resources.
//
// Returns EXIT_SUCCESS when every element read back equals the increment,
// EXIT_FAILURE otherwise. Any failing CUDA call terminates the process.
int main(int argc, char* argv[]) {
    // Show the name of the first GPU (device 0).
    cudaDeviceProp deviceProps;
    CUDA_CHECK(cudaGetDeviceProperties(&deviceProps, 0));
    printf("CUDA device [%s]\n", deviceProps.name);

    const int n = 16 * 1024 * 1024;
    const size_t nbytes = (size_t)n * sizeof(int);
    const int inc_value = 100;  // amount the kernel adds to every element

    // Allocate host memory with cudaMallocHost and zero it.
    int* a = 0;
    CUDA_CHECK(cudaMallocHost((void**)&a, nbytes));
    memset(a, 0, nbytes);

    // Allocate device memory and fill every byte with 255; the fill is
    // overwritten by the host-to-device copy below.
    int* d_a = 0;
    CUDA_CHECK(cudaMalloc((void**)&d_a, nbytes));
    CUDA_CHECK(cudaMemset(d_a, 255, nbytes));

    // Kernel launch configuration: 512 threads per block, one thread per
    // element. increment_kernel has no bounds check, so the grid must cover
    // the array exactly.
    dim3 threads = dim3(512, 1);
    if (n % threads.x != 0) {
        fprintf(stderr, "n (%d) must be a multiple of the block size (%u)\n",
                n, threads.x);
        cudaFree(d_a);
        cudaFreeHost(a);
        return EXIT_FAILURE;
    }
    dim3 blocks = dim3(n / threads.x, 1);

    // Create the events used for timing.
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));

    // Make sure all previously issued device work is done before timing.
    CUDA_CHECK(cudaDeviceSynchronize());

    // Start timing.
    CUDA_CHECK(cudaEventRecord(start, 0));
    // Queue the asynchronous host-to-device copy on stream 0.
    CUDA_CHECK(cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0));
    // Launch the kernel on stream 0; launch errors are only reported through
    // cudaGetLastError().
    increment_kernel<<<blocks, threads, 0, 0>>>(d_a, inc_value);
    CUDA_CHECK(cudaGetLastError());
    // Queue the asynchronous device-to-host copy on stream 0.
    CUDA_CHECK(cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0));
    // Stop timing.
    CUDA_CHECK(cudaEventRecord(stop, 0));

    // Wait for the stop event, then read the elapsed time between the events.
    float gpu_time = 0.0f;
    CUDA_CHECK(cudaEventSynchronize(stop));
    CUDA_CHECK(cudaEventElapsedTime(&gpu_time, start, stop));

    // Print the elapsed time.
    printf("time spent executing by the GPU: %.4f\n", gpu_time);

    // The stop event was recorded after the device-to-host copy and has
    // completed, so `a` can be read. The host buffer started as zeros, so
    // every element must now equal inc_value.
    bool result_ok = true;
    for (int i = 0; i < n; ++i) {
        if (a[i] != inc_value) {
            fprintf(stderr, "verification failed: a[%d] = %d, expected %d\n",
                    i, a[i], inc_value);
            result_ok = false;
            break;
        }
    }

    // Release resources.
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFreeHost(a));
    CUDA_CHECK(cudaFree(d_a));

    return result_ok ? EXIT_SUCCESS : EXIT_FAILURE;
}