CUDA Samples上的例子,可是那个封装得有点太复杂,不适合初学者看,所以按照上面的方法自己实现了一下,代码如下:
#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <math.h>
#include "book.h"
// Not referenced by the kernels visible in this part of the file; presumably
// the number of input elements used by the host code -- TODO confirm.
#define SIZE 81920000
// Length of the __shared__ scratch array declared in the reduction kernels and
// the basis of their halving loop (which starts at THREAD_NUM / 2). The kernels
// index that array with threadIdx.x, so they appear to expect a launch with
// blockDim.x == THREAD_NUM -- verify against the launch site.
#define THREAD_NUM 512
/*
 * reduce1 - computes one partial sum per thread block.
 *
 * Every thread copies one element of `a` into a shared scratch array (0 when
 * its index is >= size). The block then sums the scratch array with a halving
 * tree: in each round the lower half of the active slots accumulates the upper
 * half. Thread 0 finally stores the block total in c[blockIdx.x].
 *
 * Parameters:
 *   a    - input values; elements [0, size) may be read.
 *   c    - output; element blockIdx.x receives this block's sum, so it needs
 *          at least one slot per launched block.
 *   size - number of valid elements in `a`.
 *
 * Launch requirements:
 *   - Only the x dimension of the grid/block is used for indexing.
 *   - blockDim.x must equal THREAD_NUM (checked by the assert below).
 *   - The halving loop visits every slot only when THREAD_NUM is a power of
 *     two (it is 512 in this file).
 *   - Uses THREAD_NUM * sizeof(float) bytes of static shared memory.
 */
__global__ void reduce1(float *a, float *c, int size)
{
    // Both the scratch array and the reduction tree are sized by THREAD_NUM.
    // A smaller block would leave slots unwritten that the tree still reads;
    // a larger block would write past the end of the array. Trap instead of
    // silently producing a wrong sum (compiled out when NDEBUG is defined).
    assert(blockDim.x == THREAD_NUM);

    __shared__ float sdata[THREAD_NUM];

    int i = threadIdx.x + blockDim.x * blockIdx.x;

    // Threads past the end of the input contribute the additive identity.
    sdata[threadIdx.x] = (i < size) ? a[i] : 0.0f;
    __syncthreads();

    for (int j = THREAD_NUM / 2; j > 0; j /= 2)
    {
        if (threadIdx.x < j)
        {
            sdata[threadIdx.x] += sdata[threadIdx.x + j];
        }
        // Kept outside the branch so every thread of the block reaches the
        // barrier; it separates this round's writes from the next round's reads.
        __syncthreads();
    }

    if (threadIdx.x == 0)
    {
        c[blockIdx.x] = sdata[0];
    }
}
__global__ void reduce2(float *a, float *c, int size)
{
int i = threadIdx.x + blockIdx.x * 2 * blockDim.x;
__shared__ float sdata[THREAD_NUM];
float sum = i < size ? a[i] : 0;
if (i + blockDim.x < size)
sum += a[i+blockDim.x];
sdata[threadIdx.x] = sum;
__syncthreads();
int j = THREAD_NUM/2;
while (j > 0)
{
if (threadIdx.x < j)
sdata[threadIdx.x] += sdata[threadIdx.x + j];
__syncthreads();
j /= 2;
}
if (threadIdx.x == 0)
{