Cuda实战-06 官方样例asyncAPI

最新推荐文章于 2024-06-04 09:36:10 发布

梦星辰.

最新推荐文章于 2024-06-04 09:36:10 发布

阅读量148

点赞数

分类专栏：Cuda编程技术　文章标签：算法

本文链接：https://blog.csdn.net/weixin_45792450/article/details/129594322

版权

Cuda编程技术专栏收录该内容

23 篇文章 4 订阅

订阅专栏

说明

这是官方样例 asyncAPI.cu 的精简版：为了方便初学者理解，内容有所删改。样例相对较为综合，学习后收获也挺大。代码里面的注释都是我根据自己的理解添加的。

代码


 // includes, system
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>

 // includes CUDA Runtime
 #include <cuda_runtime.h>
 
 // Adds inc_value to one element of g_data per thread.
 //
 // Each thread derives a flat 1D index from its block and thread x-coordinates
 // and updates exactly that element. There is no bounds guard, so the caller
 // must launch exactly as many threads (gridDim.x * blockDim.x) as g_data has
 // elements; any extra thread would access memory past the end of the buffer.
 __global__ void increment_kernel(int* g_data, int inc_value) {
     const int element = threadIdx.x + blockDim.x * blockIdx.x;
     g_data[element] += inc_value;
 }
 
 // Prints a diagnostic to stderr and terminates the process when a CUDA
 // runtime call did not return cudaSuccess.
 static void check_cuda(cudaError_t status, const char* what, int line) {
     if (status != cudaSuccess) {
         fprintf(stderr, "CUDA error at line %d (%s): %s\n", line, what,
                 cudaGetErrorString(status));
         exit(EXIT_FAILURE);
     }
 }

 // Wraps a CUDA runtime call so that its return code is never silently dropped.
 #define CHECK_CUDA(call) check_cuda((call), #call, __LINE__)

 // Times a host-to-device copy, a kernel launch and a device-to-host copy,
 // all issued asynchronously on the default stream (stream 0), using a pair
 // of CUDA events, and prints the elapsed GPU time.
 //
 // Returns 0 on success. Any failing CUDA call prints a diagnostic and exits
 // with EXIT_FAILURE.
 int main(int argc, char* argv[]) {
     (void)argc;  // command-line arguments are not used
     (void)argv;

     // Print the name of the first device (device 0).
     cudaDeviceProp deviceProps;
     CHECK_CUDA(cudaGetDeviceProperties(&deviceProps, 0));
     printf("CUDA device [%s]\n", deviceProps.name);

     const int n = 16 * 1024 * 1024;                 // number of int elements
     const size_t nbytes = (size_t)n * sizeof(int);  // buffer size in bytes
     const int inc_value = 100;                      // amount added to every element

     // Kernel launch configuration: one thread per element.
     dim3 threads = dim3(512, 1);
     dim3 blocks = dim3(n / threads.x, 1);
     // increment_kernel has no bounds guard, so the grid must cover the buffer
     // exactly; refuse to run if n is not a multiple of the block size.
     if (n % threads.x != 0) {
         fprintf(stderr, "n (%d) must be a multiple of the block size (%u)\n",
                 n, threads.x);
         return EXIT_FAILURE;
     }

     // Allocate page-locked host memory (needed for truly asynchronous copies)
     // and zero it. The allocation is checked first so memset never sees NULL.
     int* a = 0;
     CHECK_CUDA(cudaMallocHost((void**)&a, nbytes));
     memset(a, 0, nbytes);

     // Allocate device memory and fill every byte with 0xFF.
     int* d_a = 0;
     CHECK_CUDA(cudaMalloc((void**)&d_a, nbytes));
     CHECK_CUDA(cudaMemset(d_a, 255, nbytes));

     // Create the events used for timing.
     cudaEvent_t start, stop;
     CHECK_CUDA(cudaEventCreate(&start));
     CHECK_CUDA(cudaEventCreate(&stop));

     // Wait for all previously issued device work to finish before timing.
     CHECK_CUDA(cudaDeviceSynchronize());

     // Start timing.
     CHECK_CUDA(cudaEventRecord(start, 0));
     // Asynchronous host -> device copy.
     CHECK_CUDA(cudaMemcpyAsync(d_a, a, nbytes, cudaMemcpyHostToDevice, 0));
     // Launch the kernel; launch-configuration errors only surface through
     // cudaGetLastError(), so query it right after the launch.
     increment_kernel<<<blocks, threads, 0, 0>>>(d_a, inc_value);
     CHECK_CUDA(cudaGetLastError());
     // Asynchronous device -> host copy.
     CHECK_CUDA(cudaMemcpyAsync(a, d_a, nbytes, cudaMemcpyDeviceToHost, 0));
     // Stop timing.
     CHECK_CUDA(cudaEventRecord(stop, 0));

     // Block until the stop event has been reached, then read the elapsed
     // time between the two events (cudaEventElapsedTime reports milliseconds).
     float gpu_time = 0.0f;
     CHECK_CUDA(cudaEventSynchronize(stop));
     CHECK_CUDA(cudaEventElapsedTime(&gpu_time, start, stop));

     // Print the measured time.
     printf("time spent executing by the GPU: %.4f\n", gpu_time);

     // Release resources.
     CHECK_CUDA(cudaEventDestroy(start));
     CHECK_CUDA(cudaEventDestroy(stop));
     CHECK_CUDA(cudaFreeHost(a));
     CHECK_CUDA(cudaFree(d_a));

     return 0;
 }