规约算法-reduction

CUDA Samples 上有规约的例子,可是那个封装有点太复杂,不适合初学者看;按照上面的方法实现了一下,如下:

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <math.h>
#include "book.h"
#define SIZE 81920000
#define THREAD_NUM 512

// Baseline block-wide tree reduction.
// Grid layout: 1D grid, blockDim.x == THREAD_NUM (power of two).
// Each block sums THREAD_NUM consecutive elements of `a` and writes its
// partial sum to c[blockIdx.x]; out-of-range lanes contribute 0.
__global__ void reduce1(float *a, float *c, int size)
{
        int gid = blockIdx.x * blockDim.x + threadIdx.x;
        int tid = threadIdx.x;
        __shared__ float sdata[THREAD_NUM];

        // Stage one element per thread into shared memory (0 past the end).
        sdata[tid] = (gid < size) ? a[gid] : 0;
        __syncthreads();

        // Halve the active range each pass. The barrier sits outside the
        // divergent branch, so every thread in the block reaches it.
        for (int stride = THREAD_NUM / 2; stride > 0; stride /= 2)
        {
                if (tid < stride)
                        sdata[tid] += sdata[tid + stride];
                __syncthreads();
        }

        // Thread 0 holds the block total.
        if (tid == 0)
                c[blockIdx.x] = sdata[0];
}


// Tree reduction with "first add during load": each block covers
// 2 * blockDim.x consecutive input elements, performing the first
// addition while reading global memory so no thread idles on pass one.
// Grid layout: 1D grid, blockDim.x == THREAD_NUM (power of two).
// Writes the block's partial sum to c[blockIdx.x].
__global__ void reduce2(float *a, float *c, int size)
{
        int tid = threadIdx.x;
        int gid = tid + blockIdx.x * 2 * blockDim.x;
        __shared__ float sdata[THREAD_NUM];

        // Combine two global loads into one register before staging.
        float partial = (gid < size) ? a[gid] : 0;
        if (gid + blockDim.x < size)
                partial += a[gid + blockDim.x];

        sdata[tid] = partial;
        __syncthreads();

        // Standard shared-memory tree: halve the stride until it hits 0.
        for (int stride = THREAD_NUM / 2; stride > 0; stride /= 2)
        {
                if (tid < stride)
                        sdata[tid] += sdata[tid + stride];
                __syncthreads();
        }

        if (tid == 0)
                c[blockIdx.x] = sdata[0];
}


// Tree reduction with first add during load and an unrolled final warp.
// Grid layout: 1D grid, blockDim.x == THREAD_NUM (power of two >= 64).
// Each block covers 2 * blockDim.x inputs and writes its partial sum to
// c[blockIdx.x].
//
// Fix: the original unrolled tail read and wrote sdata with no
// synchronization, relying on implicit warp lockstep. That is undefined
// on Volta+ (independent thread scheduling) and also lets the compiler
// cache sdata in registers. __syncwarp() between steps (requires CUDA 9+)
// makes each store visible before the next load; every warp in the block
// executes the barriers, so no lane named in the mask can miss one.
__global__ void reduce3(float *a, float *c, int size)
{
        int i = threadIdx.x + blockIdx.x * 2 * blockDim.x;
        __shared__ float sdata[THREAD_NUM];

        // First addition happens while loading from global memory.
        float sum = i < size ? a[i] : 0;
        if (i + blockDim.x < size)
                sum += a[i + blockDim.x];

        sdata[threadIdx.x] = sum;
        __syncthreads();

        // Shared-memory tree down to 64 surviving partials.
        int j = THREAD_NUM / 2;
        while (j > 32)
        {
                if (threadIdx.x < j)
                        sdata[threadIdx.x] += sdata[threadIdx.x + j];
                __syncthreads();
                j /= 2;
        }

        // Final warp, fully unrolled; a warp-level barrier after every step
        // replaces the (unsafe) lockstep assumption.
        if (threadIdx.x < 32)
                sdata[threadIdx.x] += sdata[threadIdx.x + 32];
        __syncwarp();
        if (threadIdx.x < 16)
                sdata[threadIdx.x] += sdata[threadIdx.x + 16];
        __syncwarp();
        if (threadIdx.x < 8)
                sdata[threadIdx.x] += sdata[threadIdx.x + 8];
        __syncwarp();
        if (threadIdx.x < 4)
                sdata[threadIdx.x] += sdata[threadIdx.x + 4];
        __syncwarp();
        if (threadIdx.x < 2)
                sdata[threadIdx.x] += sdata[threadIdx.x + 2];
        __syncwarp();
        if (threadIdx.x < 1)
                sdata[threadIdx.x] += sdata[threadIdx.x + 1];

        if (threadIdx.x == 0)
        {
            c[blockIdx.x] = sdata[0];
        }
}

// Tree reduction finishing with register shuffles: first add during load,
// shared-memory tree down to 64 partials, then a warp-shuffle reduction
// over the final 32 lanes (no shared memory, no barriers needed there).
// Grid layout: 1D grid, blockDim.x == THREAD_NUM (power of two >= 64).
// Writes the block's partial sum to c[blockIdx.x].
//
// Fix: the original used legacy mask-less __shfl_down, which was removed
// for Volta+ where implicit warp synchrony no longer holds. Use
// __shfl_down_sync with an explicit full-warp mask (requires CUDA 9+).
__global__ void reduce5(float *a, float *c, int size)
{
        int i = threadIdx.x + blockIdx.x * 2 * blockDim.x;
        __shared__ float sdata[THREAD_NUM];

        // First addition happens while loading from global memory.
        float sum = i < size ? a[i] : 0;
        if (i + blockDim.x < size)
                sum += a[i + blockDim.x];

        sdata[threadIdx.x] = sum;
        __syncthreads();

        // Shared-memory tree down to 64 surviving partials; each active
        // thread keeps its running total in the register `sum` as well.
        for (int j = THREAD_NUM / 2; j > 32; j /= 2)
        {
                if (threadIdx.x < j)
                        sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + j];
                __syncthreads();
        }

        // Fold the upper 32 partials into the first warp's registers. The
        // last loop iteration (j == 64) wrote sdata[0..63] and was followed
        // by __syncthreads(), so this read is safe.
        if (threadIdx.x < 32)
                sum += sdata[threadIdx.x + 32];

        // Warp shuffle reduction. Every warp executes this loop with all 32
        // lanes active, so the full mask is valid; only warp 0's lane 0
        // result is consumed below.
        for (int offset = 16; offset > 0; offset /= 2)
        {
                sum += __shfl_down_sync(0xffffffff, sum, offset);
        }

        if (threadIdx.x == 0)
        {
            c[blockIdx.x] = sum;
        }
}

// Fully unrolled tree reduction finishing with register shuffles.
// Grid layout: 1D grid, blockDim.x == THREAD_NUM == 512 exactly (the
// unrolled 256/128/64 steps hard-code that size).
// Writes the block's partial sum to c[blockIdx.x].
//
// Fixes:
// 1) The original shuffle tail did `sum = __shfl_down(sum, N)`, which
//    REPLACES the accumulator instead of adding the shifted lane's value
//    (`sum += ...` was intended, as the commented-out shared-memory lines
//    show) — the kernel produced wrong partial sums.
// 2) The shuffles sat inside divergent `if (threadIdx.x < N)` branches, so
//    source lanes [N, 2N) were inactive and their contribution undefined.
//    The whole shuffle ladder now runs under `if (threadIdx.x < 32)`,
//    keeping all 32 lanes of warp 0 active.
// 3) Legacy mask-less __shfl_down is removed for Volta+; use
//    __shfl_down_sync with an explicit full-warp mask (requires CUDA 9+).
__global__ void reduce4(float *a, float *c, int size)
{
        int i = threadIdx.x + blockIdx.x * 2 * blockDim.x;
        __shared__ float sdata[THREAD_NUM];

        // First addition happens while loading from global memory.
        float sum = i < size ? a[i] : 0;
        if (i + blockDim.x < size)
                sum += a[i + blockDim.x];

        sdata[threadIdx.x] = sum;
        __syncthreads();

        // Unrolled shared-memory tree; barriers are outside the branches so
        // every thread reaches them.
        if (threadIdx.x < 256)
                sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 256];
        __syncthreads();

        if (threadIdx.x < 128)
                sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 128];
        __syncthreads();

        if (threadIdx.x < 64)
                sdata[threadIdx.x] = sum = sum + sdata[threadIdx.x + 64];
        __syncthreads();

        // Final warp: fold the upper 32 partials into registers, then
        // shuffle down. Only warp 0 enters; all 32 of its lanes are active,
        // so the full mask is correct.
        if (threadIdx.x < 32)
        {
                sum += sdata[threadIdx.x + 32];
                for (int offset = 16; offset > 0; offset /= 2)
                        sum += __shfl_down_sync(0xffffffff, sum, offset);
        }

        if (threadIdx.x == 0)
        {
            c[blockIdx.x] = sum;
        }
}

// Driver: fills SIZE floats with 1.0f, reduces them on the GPU with
// reduce5, and verifies the sum of the per-block partials on the host.
//
// Fixes:
// 1) Both verification loops accumulated into a float. A float accumulator
//    saturates at 2^24 (16777216) — adding 1.0f past that point is a
//    no-op — so with SIZE = 81 920 000 the printed "HOST" value was wrong.
//    Accumulate in double instead.
// 2) Kernel launches return no error directly; check cudaGetLastError()
//    after the launch so bad configurations are not silent.
// 3) Free device and host allocations before exit (the original leaked
//    ~330 MB of device memory).
// Note: reduce5 consumes 2*THREAD_NUM elements per block, so with this
// launch the upper half of the `block` partials is simply 0; the final
// host-side sum over all partials is still correct.
int main(void)
{
        int block = (SIZE + THREAD_NUM - 1) / THREAD_NUM;

        float *h_a, *h_c;

        size_t size_a = (size_t)SIZE * sizeof(float);
        size_t size_c = (size_t)block * sizeof(float);

        // Host allocations.
        h_a = (float*)malloc(size_a);
        h_c = (float*)malloc(size_c);
        assert(h_a != NULL && h_c != NULL);

        // Init with a known value so the expected sum is exactly SIZE.
        for (int i = 0; i < SIZE; ++i)
        {
                h_a[i] = 1;
        }

        float *d_a, *d_c;
        HANDLE_ERROR(cudaMalloc((void **)&d_a, size_a));
        HANDLE_ERROR(cudaMalloc((void **)&d_c, size_c));

        // Copy input to the device.
        HANDLE_ERROR(cudaMemcpy(d_a, h_a, size_a, cudaMemcpyHostToDevice));

        reduce5<<<block, THREAD_NUM>>>(d_a, d_c, SIZE);
        // Launch-configuration errors only surface via cudaGetLastError().
        HANDLE_ERROR(cudaGetLastError());

        // Blocking copy-back also synchronizes with the kernel.
        HANDLE_ERROR(cudaMemcpy(h_c, d_c, size_c, cudaMemcpyDeviceToHost));

        // Verify: reference sum on the host (double to avoid float
        // accumulator saturation at 2^24).
        double sum = 0.0;
        for (int i = 0; i < SIZE; ++i)
        {
                sum += h_a[i];
        }
        printf("HOST : sum=%f\n", sum);

        // Sum of the per-block partials produced on the device.
        sum = 0.0;
        for (int i = 0; i < block; ++i)
        {
                sum += h_c[i];
        }
        printf("DEVICE : sum=%f\n", sum);

        // Release resources.
        HANDLE_ERROR(cudaFree(d_a));
        HANDLE_ERROR(cudaFree(d_c));
        free(h_a);
        free(h_c);

        return 0;
}
  • 0
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值