cuda_c学习笔记-向量加法

最新推荐文章于 2023-12-25 12:05:17 发布

weixin_34088598

最新推荐文章于 2023-12-25 12:05:17 发布

阅读量264

点赞数

文章标签： python 内存管理

原文链接：http://www.cnblogs.com/nwpuxuezha/p/4468860.html

版权

用cuda计算向量加法A+B=C

流程：

1.申请主机内存。向量A，向量B，计算结果C

2.初始化数据。用0-1之间的随机数初始化向量A，B（向量C用于存放计算结果，无需初始化）

3.GPU内存申请。申请A，B，C需要的GPU内存空间

4.数据拷贝。把数据从主机内存拷贝至GPU内存

5.计算需要的线程数和线程块数。

6.调用GPU加法函数

7.数据拷贝。把结果从GPU内存拷贝至主机内存。

8.在CPU上重新运行一遍，与GPU结果进行对照。

9.释放GPU内存。

10.释放主机内存。

11.重置GPU状态。

要点：内存管理，数据拷贝。

代码：

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>
  3 __global__ void
  4 vectorAdd(const float *A, const float *B, float *C, int numElements)
  5 {
  6     int i = blockDim.x * blockIdx.x + threadIdx.x;
  7 
  8     if (i < numElements)
  9     {
 10         C[i] = A[i] + B[i];
 11     }
 12 }
 13 
 14 int main(void)
 15 {
 16     //检测cuda返回值
 17     cudaError_t err = cudaSuccess;
 18 
 19     //初始化向量维度
 20     int numElements = 50000;
 21     //计算内存需求
 22     size_t size = numElements * sizeof(float);
 23 
 24     printf("[Vector addition of %d elements ]\n", numElements);
 25 
 26     //
 27     // 对主机的A,B,C申请内存空间
 28     float *host_A = (float *)malloc(size);
 29     float *host_B = (float *)malloc(size);
 30     float *host_C = (float *)malloc(size);
 31     //判断是否申请成功
 32     if (host_A == NULL || host_B == NULL || host_C == NULL)
 33     {
 34         fprintf(stderr, "Failed to allocate host vectors!\n");
 35         exit(EXIT_FAILURE);
 36     }
 37     // 初始化主机A,B
 38     for (int i = 0; i < numElements; ++i)
 39     {
 40         host_A[i] = rand()/(float)RAND_MAX;
 41         host_B[i] = rand()/(float)RAND_MAX;
 42     }
 43 
 44     //
 45     //申请cuda内存空间并判断
 46     float *device_A = NULL;
 47     err = cudaMalloc((void **)&device_A, size);
 48     if (err != cudaSuccess)
 49     {
 50         fprintf(stderr, "对向量A申请cuda内存空间失败 (错误代码 %s)!\n", cudaGetErrorString(err));
 51         exit(EXIT_FAILURE);
 52     }
 53     float *device_B = NULL;
 54     err = cudaMalloc((void **)&device_B, size);
 55     if (err != cudaSuccess)
 56     {
 57         fprintf(stderr, "对向量B申请cuda内存空间失败 (错误代码 %s)!\n", cudaGetErrorString(err));
 58         exit(EXIT_FAILURE);
 59     }
 60     float *device_C = NULL;
 61     err = cudaMalloc((void **)&device_C, size);
 62     if (err != cudaSuccess)
 63     {
 64         fprintf(stderr, "对向量C申请cuda内存空间失败 (错误代码 %s)!\n", cudaGetErrorString(err));
 65         exit(EXIT_FAILURE);
 66     }
 67     //拷贝数据到cuda内存并检测
 68     printf("Copy input data from the host memory to the CUDA device\n");
 69     err = cudaMemcpy(device_A, host_A, size, cudaMemcpyHostToDevice);
 70     if (err != cudaSuccess)
 71     {
 72         fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
 73         exit(EXIT_FAILURE);
 74     }
 75     err = cudaMemcpy(device_B, host_B, size, cudaMemcpyHostToDevice);
 76     if (err != cudaSuccess)
 77     {
 78         fprintf(stderr, "Failed to copy vector A from host to device (error code %s)!\n", cudaGetErrorString(err));
 79         exit(EXIT_FAILURE);
 80     }
 81 
 82     //计算线程块与线程
 83     //每线程块线程数
 84     int threadsPerBlock = 256;
 85     //每网格线程块数
 86     int blocksPerGrid = (numElements + threadsPerBlock - 1)/ threadsPerBlock;
 87     printf("CUDA kernel launch with %d blocks of %d threads\n", blocksPerGrid, threadsPerBlock);
 88     vectorAdd<<<blocksPerGrid, threadsPerBlock>>>(device_A, device_B, device_C, numElements);
 89     //判断cuda程序运行情况
 90     err = cudaGetLastError();
 91     if (err != cudaSuccess)
 92     {
 93         fprintf(stderr, "Failed to launch vectorAdd kernel (error code %s)!\n", cudaGetErrorString(err));
 94         exit(EXIT_FAILURE);
 95     }
 96 
 97     //计算结果拷贝回主机
 98     printf("Copy output data from the CUDA device to the host memory\n");
 99     err = cudaMemcpy(host_C, device_C, size, cudaMemcpyDeviceToHost);
100     if (err != cudaSuccess)
101     {
102         fprintf(stderr,"计算结果拷贝回主机失败（错误代码：%s）\n",cudaGetErrorString(err));
103         exit(EXIT_FAILURE);
104     }
105 
106     //结果验证
107     for(int i = 0; i < numElements; ++i)
108     {
109         if(fabs(host_A[i] + host_B[i] - host_C[i]) > 1e-5)
110         {
111             fprintf(stderr,"验证失败%d\n",i);
112             exit(EXIT_FAILURE);
113         }
114     }
115     printf("验证成功\n");
116 
117     //释放cuda内存和主机内存
118     err = cudaFree(device_A);
119     err = cudaFree(device_B);
120     err = cudaFree(device_C);
121     free(host_A);
122     free(host_B);
123     free(host_C);
124 
125     //重置cuda状态
126     err = cudaDeviceReset();
127     printf("结束");
128     return 0;
129 }

好烦……GPU搞起来真麻烦

转载于:https://www.cnblogs.com/nwpuxuezha/p/4468860.html

weixin_34088598

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
cuda_c学习笔记-向量加法

用cuda计算向量加法A+B=C流程：1.申请主机内存。向量A，向量B，计算结果C2.初始化数据。用0-1之间的随机数初始化向量A，B，C3.GPU内存申请。申请A，B，C需要的GPU内存空间4.数据拷贝。把数据从主机内存拷贝至GPU内存5.计算需要的线程数和线程块数。6.调用GPU加法函数7.数据拷贝。把结果从GPU内存拷贝至主机内存。8.在CPU上重新运行一...
复制链接

扫一扫