CUDA Samples上的例子,可是那个封装的优点太复杂,不适合初学者看,按照上面的方法实现了一下。如下
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <math.h>
#include "book.h"
#define SIZE 81920000
#define THREAD_NUM 512
__global__ void reduce1(float *a, float *c, int size)
{
int i = threadIdx.x + blockDim.x * blockIdx.x;
__shared__ float sdata[THREAD_NUM];
sdata[threadIdx.x] = i < size ? a[i] : 0;
__syncthreads();
int j = THREAD_NUM / 2;
while (j != 0)
{
if (threadIdx.x < j)
{
sdata[threadIdx.x] += sdata[threadIdx.x + j];
}
__syncthreads();
j /= 2;
}
if (threadIdx.x == 0)
{
c[blockIdx.x] = sdata[0];
}
}
__global__ void reduce2(float *a, float *c, int size)
{
int i = threadIdx.x + blockIdx.x * 2 * blockDim.x;
__shared__ float sdata[THREAD_NUM];
float sum = i < size ? a[i] : 0;
if (i + blockDim.x < size)
sum += a[i+blockDim.x];
sdata[threadIdx.x] = sum;
__syncthreads();
int j = THREAD_NUM/2;
while (j > 0)
{
if (threadIdx.x < j)
sdata[threadIdx.x] += sdata[threadIdx.x + j];
__syncthreads();
j /= 2;
}
if (threadIdx.x == 0)
{
c[blockIdx.x] = sdata[0];
// printf("c[%d] = %f\n",blockIdx.x, sdata[0]);
}
}
__global__ void reduce3(float *a, float *c, int size)
{
int i = threadIdx.x + blockIdx.x * 2 * blockDim.x;
__shared__ float sdata[THREAD_NUM];
float sum = i < size ? a[i] : 0;
if (i + blockDim.x < size)
sum += a[i+blockDim.x];
sdata[threadIdx.x] = sum;
__syncthreads();
int j = THREAD_NUM/2;
while (j > 32)
{
if (threadIdx.x < j)
sdata[threadIdx.x] += sdata[threadIdx.x + j];
__syncthreads();
j /= 2;
}
if (threadIdx.x < 32)
sdata[threadIdx.x] += sdata[threadIdx.x + 32];
if (threadIdx.x < 16)
sdata[threadIdx.x] += sdata[threadIdx.x + 16];
if (threadIdx.x < 8)
sdata[threadIdx.x] += sdata[threadIdx.x + 8];
if (threadIdx.x < 4)
sdata[threadIdx.x] += sdata[threadIdx.x + 4];
if (threadIdx.x < 2)
sdata[threadIdx.x] += sdata[threadIdx.x + 2];
if (threadIdx.x < 1)
sdata[threadIdx.x] += sdata[threadIdx.x + 1];
if (threadIdx.x == 0)
{
c[blockIdx.x] = sdata[0];
// printf("c[%d] = %f\n",blockIdx.x, sdata[0]);
}
}
__global__ void reduce5(float *a, float *c, int size)
{
int i = threadIdx.x + blockIdx.x * 2 * blockDim.x;
__shared__ float sdata[THREAD_NUM];
float sum = i < size ? a[i] : 0;
if (i + blockDim.x < size)
sum += a[i+blockDim.x];
sdata[threadIdx.x] = sum;
__syncthreads();
int j = THREAD_NUM/2;
while (j > 32)
{
if (threadIdx.x < j)
sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + j];
__syncthreads();
j /= 2;
}
if (threadIdx.x < 32)
sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 32];
for (int offset=16; offset>0; offset/=2)
{
sum += __shfl_down(sum, offset);
}
/*
for (int offset=32; offset>0; offset/=2)
{
sum += __shfl_down(sum, offset);
}
*/
if (threadIdx.x == 0)
{
c[blockIdx.x] = sum;
// printf("c[%d] = %f\n",blockIdx.x, sdata[0]);
}
}
__global__ void reduce4(float *a, float *c, int size)
{
int i = threadIdx.x + blockIdx.x * 2 * blockDim.x;
__shared__ float sdata[THREAD_NUM];
float sum = i < size ? a[i] : 0;
if (i + blockDim.x < size)
sum += a[i+blockDim.x];
sdata[threadIdx.x] = sum;
__syncthreads();
if (threadIdx.x < 256)
sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 256];
__syncthreads();
if (threadIdx.x < 128)
sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 128];
__syncthreads();
if (threadIdx.x < 64)
sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 64];
__syncthreads();
if (threadIdx.x < 32)
sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 32];
if (threadIdx.x < 16)
// sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 16];
sum = __shfl_down(sum,16);
if (threadIdx.x < 8)
// sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 8];
sum = __shfl_down(sum,8);
if (threadIdx.x < 4)
// sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 4];
sum = __shfl_down(sum,4);
if (threadIdx.x < 2)
// sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 2];
sum = __shfl_down(sum,2);
if (threadIdx.x < 1)
// sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 1];
sum = __shfl_down(sum,1);
if (threadIdx.x == 0)
{
c[blockIdx.x] = sum;
}
}
int main(void)
{
int block = (SIZE+THREAD_NUM-1) / THREAD_NUM;
float *h_a, *h_c;
int size_a = SIZE * sizeof(float);
int size_c = block * sizeof(float);
//malloc
h_a = (float*)malloc(size_a);
h_c = (float*)malloc(size_c);
assert(h_a!=NULL && h_c!=NULL);
//init
for (int i=0; i<SIZE; ++i)
{
// h_a[i] = rand() / (float)RAND_MAX;
h_a[i] = 1;
}
float *d_a, *d_c;
HANDLE_ERROR(cudaMalloc((void **)&d_a, size_a));
HANDLE_ERROR(cudaMalloc((void **)&d_c, size_c));
//cpoy
HANDLE_ERROR(cudaMemcpy(d_a, h_a, size_a, cudaMemcpyHostToDevice));
reduce5<<<block,THREAD_NUM>>>(d_a,d_c,SIZE);
HANDLE_ERROR(cudaMemcpy(h_c, d_c, size_c, cudaMemcpyDeviceToHost));
//verfity
float sum = 0.0f;
for (int i=0; i<SIZE; ++i)
{
sum += h_a[i];
}
printf("HOST : sum=%f\n",sum);
sum = 0.0f;
for (int i=0; i<block; ++i)
{
sum += h_c[i];
}
printf("DEVICE : sum=%f\n",sum);
return 0;
}