/*
 * CUDA并行求和算法
 * 前提准备：先了解CPU自动分配内存的求和过程，
 * 再了解CPU手动分配内存（malloc）的求和过程，
 * 然后利用GPU编写CUDA程序：
 */
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<stdlib.h>
/*
 * Element-wise addition of two int arrays: c[i] = a[i] + b[i] for 0 <= i < num.
 *
 * Launch layout: 1D grid of 1D blocks of any size. Each thread starts at its
 * flat global index and advances by the total number of launched threads, so
 * the result is correct whether the launch has fewer, exactly as many, or
 * more threads than elements (including multi-block launches).
 * Shared memory: none.
 *
 * a, b : device pointers to at least `num` readable ints.
 * c    : device pointer to at least `num` writable ints.
 * num  : number of elements to add; a value <= 0 makes the kernel a no-op.
 */
__global__ void arr_add(int *a, int *b, int *c, int num)
{
    /* 64-bit arithmetic so blockIdx.x * blockDim.x cannot wrap for large grids. */
    const long long stride = (long long)gridDim.x * blockDim.x;

    for (long long i = (long long)blockIdx.x * blockDim.x + threadIdx.x;
         i < num;
         i += stride) {
        c[i] = a[i] + b[i];
    }
}
/* Reports a failed CUDA runtime call on stderr; returns true when `status` is cudaSuccess. */
static bool cuda_ok(cudaError_t status, const char *what)
{
    if (status == cudaSuccess) {
        return true;
    }
    fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

/*
 * Adds two small int arrays on the GPU and prints "a+b=c" for every element.
 *
 * Returns 0 on success, or EXIT_FAILURE if any CUDA runtime call or the kernel
 * launch fails (a diagnostic is written to stderr and nothing is printed to
 * stdout in that case).
 */
int main()
{
    // Step 1.1: host buffers (automatic storage, so nothing to free later).
    const int num = 10;
    const size_t bytes = num * sizeof(int);
    int a[num], b[num], c[num];

    // Null-initialised so the cleanup below is safe even when an allocation fails.
    int *a_gpu = NULL, *b_gpu = NULL, *c_gpu = NULL;

    // Step 1.2: initialise the inputs.
    for (int i = 0; i < num; i++) {
        a[i] = i;
        b[i] = 2 * i;
    }

    // Step 2.1: allocate device memory.
    // Step 2.2: copy the inputs from host to device.
    // The && chain stops at the first failure so later calls never see bad pointers.
    bool ok = cuda_ok(cudaMalloc((void **)&a_gpu, bytes), "cudaMalloc(a_gpu)")
           && cuda_ok(cudaMalloc((void **)&b_gpu, bytes), "cudaMalloc(b_gpu)")
           && cuda_ok(cudaMalloc((void **)&c_gpu, bytes), "cudaMalloc(c_gpu)")
           && cuda_ok(cudaMemcpy(a_gpu, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> a_gpu)")
           && cuda_ok(cudaMemcpy(b_gpu, b, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> b_gpu)");

    if (ok) {
        // Step 3: run the kernel with one block and one thread per element.
        arr_add<<<1, num>>>(a_gpu, b_gpu, c_gpu, num);

        // A launch does not return a status itself; bad configurations show up here.
        ok = cuda_ok(cudaGetLastError(), "arr_add launch")
          // Step 4: copy the result back. This blocking copy also waits for the
          // kernel, so errors raised while it executed are reported by this call.
          && cuda_ok(cudaMemcpy(c, c_gpu, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(c_gpu -> c)");
    }

    if (ok) {
        for (int i = 0; i < num; i++) {
            printf("%d+%d=%d\n", a[i], b[i], c[i]);
        }
    }

    // Step 5: release device memory on every path (cudaFree accepts a null pointer).
    int *device_buffers[] = { a_gpu, b_gpu, c_gpu };
    for (int i = 0; i < 3; i++) {
        if (!cuda_ok(cudaFree(device_buffers[i]), "cudaFree")) {
            ok = false;
        }
    }

    return ok ? 0 : EXIT_FAILURE;
}