#include <cstdio>
#include <iostream>
#define N 10
// Element-wise vector add: c[i] = a[i] + b[i].
// Launch layout: one thread block per element, one thread per block
// (i.e. ArrayAdd<<<N, 1>>>), so blockIdx.x alone selects the element.
// blockIdx is a CUDA built-in identifying the block's position in the grid.
__global__ void ArrayAdd(int* a, int* b, int* c) {
    const int idx = blockIdx.x;  // bounds guard: grid may exceed N
    if (idx < N) {
        c[idx] = a[idx] + b[idx];
    }
}
// Host driver: builds two input vectors, adds them on the GPU with
// ArrayAdd<<<N, 1>>>, copies the result back, and prints each sum.
int main() {
    // Host (CPU) arrays. BUG FIX: the original wrote `b[N] = {}; c[N] = {};`
    // — the stray semicolon left `c` undeclared.
    int a[N] = {}, b[N] = {}, c[N] = {};
    // Host-resident pointers that will hold device (GPU) addresses.
    int *dev_a = nullptr, *dev_b = nullptr, *dev_c = nullptr;
    // Generate the input data.
    for (int idx = 0; idx < N; ++idx) {
        a[idx] = 3 * idx;
        b[idx] = idx * idx;
    }
    // Allocate device memory.
    cudaMalloc((void**)&dev_a, N * sizeof(int));
    cudaMalloc((void**)&dev_b, N * sizeof(int));
    cudaMalloc((void**)&dev_c, N * sizeof(int));
    // Copy inputs host -> device. BUG FIX: the enum was misspelled
    // `cudaMemcpyHostToDevide` in the original.
    cudaMemcpy(dev_a, a, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, N * sizeof(int), cudaMemcpyHostToDevice);
    // Launch the kernel: one block per element, one thread per block.
    ArrayAdd<<<N, 1>>>(dev_a, dev_b, dev_c);
    // Kernel launches don't return errors directly; query the runtime.
    cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "kernel launch failed: %s\n", cudaGetErrorString(err));
        return 1;
    }
    // Copy the result device -> host. cudaMemcpy is blocking, so it also
    // synchronizes with the kernel before reading c on the host.
    cudaMemcpy(c, dev_c, N * sizeof(int), cudaMemcpyDeviceToHost);
    for (int idx = 0; idx < N; ++idx) {
        // BUG FIX: original printf was missing its closing ')' and a '\n'.
        printf("%d + %d = %d\n", a[idx], b[idx], c[idx]);
    }
    // Release device memory and null the dangling pointers.
    cudaFree(dev_a); dev_a = nullptr;
    cudaFree(dev_b); dev_b = nullptr;
    cudaFree(dev_c); dev_c = nullptr;
    return 0;
}
// Revised kernel launch: each thread block fully uses its 128 threads, and
// ceil-division ensures no surplus thread blocks are launched.
ArrayAdd<<<(N + 127) / 128, 128>>>(dev_a, dev_b, dev_c);
// Element-wise vector add: c[i] = a[i] + b[i], valid for ANY grid/block
// configuration. Each thread starts at its flat global index and strides
// by the total number of launched threads until all N elements are covered.
__global__ void ArrayAdd(int* a, int * b, int* c) {
    const int stride = blockDim.x * gridDim.x;  // total thread count
    for (int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < N; idx += stride) {
        c[idx] = a[idx] + b[idx];
    }
}
// CPU code — host-side launch: 128 blocks of 128 threads; the grid-stride
// kernel above handles any N regardless of this configuration.
// BUG FIX: the original read `ArrayAdd<<<(128, 128>>>` — the stray '(' is a
// syntax error (and would comma-evaluate to a single argument if it parsed).
ArrayAdd<<<128, 128>>>(dev_a, dev_b, dev_c);