CUDA 并行规约代码实现

cpp文件

#include "stdafx.h"
#include <stdio.h>
#include <stdlib.h>       //为rand()及srand()提供函数声明
#include <time.h>


extern "C" int addWithCuda(float* sum, float* array_0, int size);


int _tmain(int argc, _TCHAR* argv[])
{
    int i = 0, j = 0, k = 0;
    float sum = 0;
    float * sum1 = (float*)malloc( sizeof(float));
    int size = 4;
    float* array_0 = (float*)malloc(size * sizeof(float));     //创建一维数组
    srand(time(NULL));

    for (i = 0; i < size; i++)
    {
        //生成随机数
        *(array_0 + i ) = (float)rand() / (RAND_MAX / 10);
    }
    
    /*for (i = 0; i < size; i++)
    {
        printf("%f ", *(array_0 + i ));
    }
    printf("\n");
    */


    clock_t start = clock();
    for (i = 0; i < size; i++)
    {
        sum = sum + *(array_0 + i);
    }
    clock_t end = clock();
    double interval = double(end - start) / CLK_TCK;
    printf("CPU运行时间为:%lf\n", interval);
    


    // Add vectors in parallel.
    clock_t start1 = clock();
    int cudaStatus = addWithCuda(sum1, array_0, size);
    clock_t end1 = clock();
    double interval1 = double(end1 - start1) / CLK_TCK;
    printf("GPU运行时间为:%lf\n", interval1);
    //printf("加速比为:%lf\n", interval / interval1);
    
    printf("CPU运算结果如下:%f\n",sum);
    
    // cudaDeviceReset must be called before exiting in order for profiling and
    // tracing tools such as Nsight and Visual Profiler to show complete traces.

    return 0;
}

kernel.cu文件


#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>
#include<math.h>


// Block-level parallel sum reduction.
// Each block reduces its blockDim.x-element slice of array_0 in shared memory
// and writes one partial sum: dev_partial_sum[blockIdx.x].
// Preconditions:
//   - blockDim.x is a power of two and <= 512 (static shared-array size);
//   - array_0 holds at least gridDim.x * blockDim.x elements;
//   - dev_partial_sum holds at least gridDim.x elements.
// (Debug printf calls removed: device printf serializes execution and was
// debugging output only.)
__global__ void addKernel(float *array_0, float *dev_partial_sum)
{
    // Per-block scratch space for the tree reduction.
    __shared__ float share[512];

    // Stage this thread's element from global into shared memory.
    int gid = threadIdx.x + blockIdx.x * blockDim.x;
    share[threadIdx.x] = array_0[gid];
    __syncthreads();    // all writes visible before any thread reads a neighbor

    // Tree reduction: halve the number of active threads each step.
    for (int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
        if (threadIdx.x < stride)
            share[threadIdx.x] += share[threadIdx.x + stride];
        // Barrier sits outside the divergent branch so every thread reaches it.
        __syncthreads();
    }

    // Thread 0 publishes the block's total.
    if (threadIdx.x == 0)
        dev_partial_sum[blockIdx.x] = share[0];
}


// Helper function for using CUDA to add vectors in parallel.
// Host wrapper: sums `size` floats from host array `array_0` on the GPU,
// writing the total to *sum. Returns the last cudaError_t as an int
// (0 == cudaSuccess).
// Precondition: size is a power of two, 1 <= size <= 512 * 512, so that a
// two-pass block reduction (at most 512 threads per block) covers it.
// NOTE(review): the forward declaration in the host file takes `float*`
// where this definition takes `const float*` — harmless under extern "C"
// linkage, but the declarations should be made to match.
extern "C" int addWithCuda(float* sum, const float* array_0, int size)
{
    // Initialize to NULL so an early `goto Error` passes valid (no-op)
    // pointers to cudaFree instead of uninitialized garbage.
    float* dev_array = NULL;    // device copy of the input vector
    float* dev_partial = NULL;  // one partial sum per block (pass 1 output)
    float* dev_sum = NULL;      // final single-float result (pass 2 output)
    cudaError_t cudaStatus;

    // Derive the launch geometry from `size` instead of hard-coding <<<2,2>>>.
    // `threads` must be a power of two for the tree reduction in addKernel.
    int threads = size < 512 ? size : 512;
    int blocks = size / threads;

    // Choose which GPU to run on; change this on a multi-GPU system.
    cudaStatus = cudaSetDevice(0);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice failed!  Do you have a CUDA-capable GPU installed?");
        goto Error;
    }

    // Device buffers: input, per-block partials, and the final scalar.
    cudaStatus = cudaMalloc((void**)&dev_array, size * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_partial, blocks * sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    cudaStatus = cudaMalloc((void**)&dev_sum, sizeof(float));
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMalloc failed!");
        goto Error;
    }

    // Copy the input vector from host memory to the GPU.
    cudaStatus = cudaMemcpy(dev_array, array_0, size * sizeof(float), cudaMemcpyHostToDevice);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

    // Pass 1: each block reduces its slice into dev_partial[blockIdx.x].
    addKernel << <blocks, threads >> > (dev_array, dev_partial);
    // Pass 2 (only needed when there is more than one partial): a single
    // block folds the per-block partials into dev_sum[0].
    if (blocks > 1) {
        addKernel << <1, blocks >> > (dev_partial, dev_sum);
    }

    // Catch launch-configuration errors (kernel launches don't return them).
    cudaStatus = cudaGetLastError();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
        goto Error;
    }

    // Wait for the kernels to finish and surface any execution-time errors.
    cudaStatus = cudaDeviceSynchronize();
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
        goto Error;
    }

    // Copy the result back; with a single block the total already sits in
    // dev_partial[0] and no second pass wrote dev_sum.
    cudaStatus = cudaMemcpy(sum, blocks > 1 ? dev_sum : dev_partial,
                            sizeof(float), cudaMemcpyDeviceToHost);
    if (cudaStatus != cudaSuccess) {
        fprintf(stderr, "cudaMemcpy failed!");
        goto Error;
    }

Error:
    // dev_partial (formerly dev_array1) was leaked in the original cleanup.
    cudaFree(dev_sum);
    cudaFree(dev_partial);
    cudaFree(dev_array);

    return cudaStatus;
}

 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值