#include <cuda_runtime.h>

#include <cstdlib>
#include <iostream>
// Use float as the element ("real") type.
typedef float real;
// Per-block reduction kernel (as provided earlier).
// Reduces blockDim.x consecutive elements of d_x (in place) to one partial
// sum per block, written to d_y[blockIdx.x].
// Preconditions: blockDim.x is a power of two (the halving loop assumes it);
// d_x holds at least gridDim.x * blockDim.x elements.
// Side effect: the contents of d_x are destroyed by the in-place reduction.
__global__ void reduce_global(real* d_x, real* d_y)
{
const int tid = threadIdx.x;
// Each block operates on its own contiguous slice of d_x.
real* x = d_x + blockDim.x * blockIdx.x;
// Tree reduction: halve the number of active threads each iteration.
for (int offset = blockDim.x >> 1; offset > 0; offset >>= 1)
{
if (tid < offset)
{
// Within one iteration writers touch x[0..offset) and readers touch
// x[offset..2*offset), so there is no intra-iteration race.
x[tid] += x[tid + offset];
}
// Barrier is deliberately OUTSIDE the divergent branch so every thread
// reaches it; it also publishes this iteration's writes to the next one.
__syncthreads();
}
if (tid == 0)
{
// Thread 0 publishes the block's partial sum.
d_y[blockIdx.x] = x[0];
}
}
// Abort with a diagnostic message if a CUDA runtime call failed.
static void checkCuda(cudaError_t err, const char* what)
{
    if (err != cudaSuccess)
    {
        std::cerr << "CUDA error (" << what << "): "
                  << cudaGetErrorString(err) << std::endl;
        std::exit(EXIT_FAILURE);
    }
}

// Sums N elements (all 1.0f) with a per-block GPU reduction followed by a
// final CPU pass over the per-block partial sums. Expected output: N.
int main()
{
    const int N = 1 << 20;                  // e.g. 1M elements
    const int threadsPerBlock = 256;        // power of two: required by reduce_global
    const int blocks = N / threadsPerBlock; // exact: N is a multiple of threadsPerBlock,
                                            // so no tail block / bounds check is needed
    real* h_x = new real[N];
    real* h_y = new real[blocks];
    real* d_x = nullptr;
    real* d_y = nullptr;

    // Fill the input with ones so the expected sum is exactly N.
    for (int i = 0; i < N; i++)
    {
        h_x[i] = 1.0f;
    }

    // Device allocations and host-to-device copy, all checked: an unchecked
    // failure here would make every later call fail mysteriously.
    checkCuda(cudaMalloc((void**)&d_x, N * sizeof(real)), "cudaMalloc d_x");
    checkCuda(cudaMalloc((void**)&d_y, blocks * sizeof(real)), "cudaMalloc d_y");
    checkCuda(cudaMemcpy(d_x, h_x, N * sizeof(real), cudaMemcpyHostToDevice),
              "cudaMemcpy h_x -> d_x");

    // Launch the per-block reduction (note: it destroys the contents of d_x).
    reduce_global<<<blocks, threadsPerBlock>>>(d_x, d_y);
    // Kernel launches do not return errors directly: catch launch-configuration
    // errors immediately, then sync to surface any in-kernel execution fault.
    checkCuda(cudaGetLastError(), "reduce_global launch");
    checkCuda(cudaDeviceSynchronize(), "reduce_global execution");

    // Copy the per-block partial sums back to the host.
    checkCuda(cudaMemcpy(h_y, d_y, blocks * sizeof(real), cudaMemcpyDeviceToHost),
              "cudaMemcpy d_y -> h_y");

    // Final reduction of the per-block results on the CPU.
    real sum = 0;
    for (int i = 0; i < blocks; i++)
    {
        sum += h_y[i];
    }
    std::cout << "Total Sum: " << sum << std::endl;

    // Release host and device memory.
    delete[] h_x;
    delete[] h_y;
    cudaFree(d_x);
    cudaFree(d_y);
    return 0;
}
// CUDA: array reduction using global memory.
// (Blog footer from the original source; latest recommended article published 2024-05-08 11:40:30.)