cuda reduce学习

这个程序实现的是加法的并行运算:

#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <cuda_runtime.h>
/*
 * Per-block sum reduction performed in place on the input buffer.
 *
 * Launch layout: 1D grid of 1D blocks. Each block reduces its own contiguous
 * slice d_in[blockIdx.x*blockDim.x .. blockIdx.x*blockDim.x + blockDim.x - 1]
 * and writes the slice total to d_out[blockIdx.x].
 *
 * Preconditions (not checked here, the kernel has no length parameter):
 *   - d_in holds at least gridDim.x * blockDim.x floats;
 *   - d_out holds at least gridDim.x floats.
 *
 * Side effect: d_in is overwritten with partial sums.
 *
 * blockDim.x may be any positive size. For power-of-two block sizes the
 * additions happen in exactly the same order as a plain halving loop; for
 * other sizes the odd "middle" element is carried to the next round instead
 * of being dropped.
 */
__global__ void parallel_reduce_kernel(float* d_out, float* d_in){
    int myID = threadIdx.x + blockIdx.x*blockDim.x;
    int tid = threadIdx.x;

    // `active` is the number of leading slice elements that still hold
    // un-merged partial sums. Each round folds the upper part onto the lower.
    for (unsigned int active = blockDim.x; active > 1; ){
        // Ceil-half: when `active` is odd, element (half - 1) has no partner
        // this round and simply survives into the next one.
        unsigned int half = (active + 1) / 2;
        if(tid + half < active){
            // Writers touch [0, active/2), readers touch [half, active):
            // the ranges never overlap, so there is no race inside a round.
            d_in[myID] += d_in[myID + half];
        }
        // Block-wide barrier: every partial sum of this round must be
        // visible before the next round reads it. The loop bounds depend only
        // on blockDim.x, so all threads reach this barrier together.
        __syncthreads();
        active = half;
    }
    if(tid == 0){
        // Thread 0 publishes the block total, which ended up in the first
        // element of the block's slice.
        d_out[blockIdx.x] = d_in[blockIdx.x*blockDim.x];
    }
}

// Abort with a readable message when a CUDA runtime call fails. Guarded so
// that an identical definition elsewhere in the file does not clash.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
#endif

/*
 * Demo driver for parallel_reduce_kernel.
 *
 * Fills 1024 floats with random integers in [0, 1023], sums them on the GPU
 * with a single 1024-thread block, sums them again on the CPU, and prints the
 * CPU sum followed by the GPU sum.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main(){
    const int N = 1024;  // element count == threads in the single block

    srand(time(0));
    float data[N];
    for(int i = 0; i < N; ++i){
        data[i] = (float)(rand()%1024);
    }

    float* data_gpu = NULL;
    float* out_gpu = NULL;
    CUDA_CHECK(cudaMalloc(&data_gpu, sizeof(float)*N));
    CUDA_CHECK(cudaMemcpy(data_gpu, data, sizeof(float)*N, cudaMemcpyHostToDevice));
    CUDA_CHECK(cudaMalloc(&out_gpu, sizeof(float)*1));

    parallel_reduce_kernel <<< 1, N >>>(out_gpu, data_gpu);
    // A kernel launch returns nothing; configuration errors only show up here.
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy: it also waits for the kernel to finish, and reports any
    // error raised while the kernel was executing.
    float gpu_sum = 0.0f;
    CUDA_CHECK(cudaMemcpy(&gpu_sum, out_gpu, sizeof(float), cudaMemcpyDeviceToHost));

    // CPU reference computed from the untouched host copy (the kernel only
    // overwrites the device buffer).
    float sum = 0;
    for(int i = 0; i < N; ++i){
        sum += data[i];
    }
    printf("%f\n",sum);
    printf("%f,", gpu_sum);

    CUDA_CHECK(cudaFree(out_gpu));
    CUDA_CHECK(cudaFree(data_gpu));
    return 0;
}

附加原子运算:实现的是统计任务

/*
 * Histogram kernel: for every i in [0, n), atomically increments
 * bin[data[i]] by one.
 *
 * Launch layout: any 1D grid of 1D blocks. The loop below strides by the
 * total thread count, so every element is visited exactly once regardless of
 * how many blocks/threads are launched (including fewer threads than n).
 *
 * Preconditions (not checked, the bin count is not a parameter):
 *   - data holds at least n ints;
 *   - every data[i] is a valid index into bin;
 *   - bin has been zeroed (or holds the counts to accumulate onto).
 */
__global__ void add(int* data, int* bin, int n){
    int stride = blockDim.x * gridDim.x;
    // Global index, not just threadIdx.x: otherwise every block would
    // recount the same leading elements.
    for(int idx = blockIdx.x * blockDim.x + threadIdx.x; idx < n; idx += stride){
        // Several threads may hit the same bin at once; the atomic makes the
        // read-modify-write safe.
        atomicAdd(bin+data[idx],1);
    }
}

// Abort with a readable message when a CUDA runtime call fails. Guarded so
// that an identical definition elsewhere in the file does not clash.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                      \
    do {                                                                      \
        cudaError_t err_ = (call);                                            \
        if (err_ != cudaSuccess) {                                            \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,     \
                    cudaGetErrorString(err_));                                \
            exit(EXIT_FAILURE);                                               \
        }                                                                     \
    } while (0)
#endif

/*
 * Demo driver for the `add` histogram kernel.
 *
 * Generates 300 random values in [0, 63], builds a 64-bin histogram on the
 * CPU and prints its first 30 bins, then builds the same histogram on the GPU
 * and prints its first 30 bins for visual comparison.
 *
 * Returns 0 on success; exits with EXIT_FAILURE if any CUDA call fails.
 */
int main(){
    const int NUM_VALUES = 300;
    const int NUM_BINS = 64;
    const int NUM_PRINTED = 30;  // only the first 30 bins are shown

    srand(time(0));
    // Random data in [0, 63].
    int data[NUM_VALUES];
    for(int i = 0; i < NUM_VALUES; ++i){
        data[i] = rand()%NUM_BINS;
    }

    // CPU reference histogram. Aggregate initialisation zeroes every bin.
    int hist[NUM_BINS] = {0};
    for(int i = 0; i < NUM_VALUES; ++i){
        ++hist[data[i]];
    }
    for(int i = 0; i < NUM_PRINTED; ++i){
        printf("%d,", hist[i]);
    }
    printf("\n");

    int* hist_gpu = NULL;
    CUDA_CHECK(cudaMalloc(&hist_gpu, sizeof(int)*NUM_BINS));
    // Byte-wise fill; valid here because the fill value is 0.
    CUDA_CHECK(cudaMemset(hist_gpu, 0, sizeof(int)*NUM_BINS));
    int* data_gpu = NULL;
    CUDA_CHECK(cudaMalloc(&data_gpu, sizeof(int)*NUM_VALUES));
    CUDA_CHECK(cudaMemcpy(data_gpu, data, sizeof(int)*NUM_VALUES, cudaMemcpyHostToDevice));

    add<<<1,NUM_VALUES>>>(data_gpu, hist_gpu, NUM_VALUES);
    // A kernel launch returns nothing; configuration errors only show up here.
    CUDA_CHECK(cudaGetLastError());

    // Blocking copy: also waits for the kernel and reports execution errors.
    // The GPU result overwrites the CPU histogram, which was already printed.
    CUDA_CHECK(cudaMemcpy(hist, hist_gpu, sizeof(int)*NUM_BINS, cudaMemcpyDeviceToHost));
    for(int i = 0; i < NUM_PRINTED; ++i){
        printf("%d,", hist[i]);
    }

    CUDA_CHECK(cudaFree(data_gpu));
    CUDA_CHECK(cudaFree(hist_gpu));
    return 0;
}
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值