// CUDA global_reduce implementation
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <cuda_runtime.h>
// Per-block sum reduction performed in place on global memory.
//
// Launch layout: 1D grid of 1D blocks. Block b sums the blockDim.x
// consecutive floats starting at d_in[b * blockDim.x] and stores the total
// in d_out[b].
//
// Preconditions (not checked here; the kernel has no length parameter):
//   - d_in must hold at least gridDim.x * blockDim.x floats.
//   - d_out must hold at least gridDim.x floats.
//
// Side effect: d_in is overwritten with intermediate partial sums.
//
// blockDim.x may be any size; it does not have to be a power of two.
__global__ void global_reduce(float* d_out, float* d_in)
{
    // Widen before multiplying so the flat index cannot overflow 32 bits.
    const size_t myID = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    const unsigned int idx = threadIdx.x;

    // `active` is the number of leading elements of this block's segment
    // that still carry partial sums. Each pass folds the upper part onto the
    // lower part. Using a rounded-up half keeps the middle element alive when
    // `active` is odd, instead of silently dropping it.
    for (unsigned int active = blockDim.x; active > 1;)
    {
        const unsigned int half = (active + 1) / 2;
        if (idx + half < active)
        {
            d_in[myID] += d_in[myID + half];
        }
        // The loop bounds depend only on blockDim.x, so every thread in the
        // block reaches this barrier the same number of times.
        __syncthreads();
        active = half;
    }

    // Thread 0 holds the block total at the start of the segment.
    if (idx == 0)
    {
        d_out[blockIdx.x] = d_in[myID];
    }
}
// Reports a failed CUDA runtime call on stderr.
// Returns true when `status` is cudaSuccess, false otherwise.
static bool checkCuda(cudaError_t status, const char* what)
{
    if (status == cudaSuccess)
    {
        return true;
    }
    fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
    return false;
}

// Sums the values 0..1023 on the GPU with a single 1024-thread block, then
// prints the device result followed by a host-computed reference sum.
// Returns 0 on success and EXIT_FAILURE if any CUDA call fails.
int main()
{
    const int N = 1024;  // element count; also the thread count of the one block launched

    float h_in[N];
    for (int i = 0; i < N; i++)
    {
        h_in[i] = (float)i;
    }
    float h_out[1] = {0.0f};

    float* d_in = NULL;
    float* d_out = NULL;

    // Short-circuit chain: the first failing step is reported and the rest
    // are skipped.
    bool ok = checkCuda(cudaMalloc(&d_in, sizeof(float) * N), "cudaMalloc(d_in)")
        && checkCuda(cudaMalloc(&d_out, sizeof(float)), "cudaMalloc(d_out)")
        && checkCuda(cudaMemcpy(d_in, h_in, sizeof(float) * N, cudaMemcpyHostToDevice),
                     "cudaMemcpy(host to device)");

    if (ok)
    {
        global_reduce<<<1, N>>>(d_out, d_in);
        // A kernel launch returns nothing; configuration errors must be
        // fetched explicitly. The blocking copy below waits for the kernel
        // and reports any error raised while it ran.
        ok = checkCuda(cudaGetLastError(), "global_reduce launch")
            && checkCuda(cudaMemcpy(h_out, d_out, sizeof(float), cudaMemcpyDeviceToHost),
                         "cudaMemcpy(device to host)");
    }

    // Release device memory on every path; cudaFree on a null pointer is a no-op.
    cudaFree(d_in);
    cudaFree(d_out);

    if (!ok)
    {
        return EXIT_FAILURE;
    }

    // Host reference sum for comparison with the device result.
    float sum = 0;
    for (int i = 0; i < N; i++)
    {
        sum += h_in[i];
    }
    printf("%f\n", h_out[0]);
    printf("%f", sum);
    return 0;
}