cpp file (host code)
#include "stdafx.h"
#include <stdio.h>
#include <stdlib.h> // rand(), srand()
#include <time.h>   // time(), clock(), CLOCKS_PER_SEC
// Forward declaration; must match the definition in kernel.cu
// (extern "C", const input array).
extern "C" int addWithCuda(float* sum, const float* array_0, int size);

// Host entry point: fills a small array with random floats, sums it on
// the CPU and on the GPU (via addWithCuda), and reports the timing and
// the result of each. Returns 0 on success, non-zero on failure.
int _tmain(int argc, _TCHAR* argv[])
{
	int i = 0;
	float sum = 0;
	float* sum1 = (float*)malloc(sizeof(float));            // receives the GPU sum
	int size = 4;                                           // element count (kept small for the demo)
	float* array_0 = (float*)malloc(size * sizeof(float));  // 1-D input array
	if (sum1 == NULL || array_0 == NULL) {
		// malloc results were previously used unchecked.
		fprintf(stderr, "host malloc failed\n");
		free(sum1);
		free(array_0);
		return 1;
	}

	srand((unsigned)time(NULL));
	// Fill with random values in roughly [0, 10].
	for (i = 0; i < size; i++)
	{
		array_0[i] = (float)rand() / (RAND_MAX / 10);
	}

	// CPU reference sum (timed).
	clock_t start = clock();
	for (i = 0; i < size; i++)
	{
		sum = sum + array_0[i];
	}
	clock_t end = clock();
	// CLOCKS_PER_SEC is the standard name; CLK_TCK is obsolete.
	double interval = double(end - start) / CLOCKS_PER_SEC;
	printf("CPU运行时间为:%lf\n", interval);

	// GPU sum (timed). addWithCuda returns a cudaError_t cast to int
	// (0 == cudaSuccess).
	clock_t start1 = clock();
	int cudaStatus = addWithCuda(sum1, array_0, size);
	clock_t end1 = clock();
	double interval1 = double(end1 - start1) / CLOCKS_PER_SEC;
	printf("GPU运行时间为:%lf\n", interval1);

	printf("CPU运算结果如下:%f\n", sum);
	if (cudaStatus == 0) {
		// The original never displayed the GPU result; report it so the
		// two paths can be compared.
		printf("GPU result: %f\n", *sum1);
	} else {
		fprintf(stderr, "addWithCuda failed with status %d\n", cudaStatus);
	}

	// Release host buffers (previously leaked).
	free(array_0);
	free(sum1);
	// cudaDeviceReset must be called before exiting in order for profiling and
	// tracing tools such as Nsight and Visual Profiler to show complete traces.
	return cudaStatus == 0 ? 0 : 1;
}
kernel.cu file (device code)
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <stdio.h>
#include<math.h>
// Block-level sum reduction.
// Each block loads blockDim.x consecutive elements of array_0 into shared
// memory, reduces them to one value, and writes it to
// dev_partial_sum[blockIdx.x].
// Preconditions: blockDim.x <= 512 (static shared array size) and the
// launch covers exactly gridDim.x * blockDim.x input elements -- there is
// no bounds check on array_0.
// The in-kernel printf calls are debug output only.
__global__ void addKernel(float *array_0, float *dev_partial_sum)
{
	// Per-block scratch space for the reduction.
	__shared__ float share[512];
	int tid = threadIdx.x;

	// Stage this block's slice of the input into shared memory.
	share[tid] = array_0[tid + blockIdx.x * blockDim.x];
	__syncthreads();
	printf("%f ", array_0[tid + blockIdx.x * blockDim.x]);

	// Tree reduction using a ceil(active/2) stride. Unlike the original
	// halving loop (i = blockDim.x/2; i /= 2), this is correct for ANY
	// block size: an odd active count no longer drops its last element.
	// For power-of-two block sizes the arithmetic is identical.
	for (int active = blockDim.x; active > 1; )
	{
		int half = (active + 1) / 2;
		if (tid + half < active)
			share[tid] += share[tid + half];
		// Barrier is outside the divergent branch so every thread in the
		// block reaches it.
		__syncthreads();
		active = half;
	}

	// Thread 0 publishes this block's partial sum.
	if (tid == 0)
	{
		dev_partial_sum[blockIdx.x] = share[0];
		printf("\n汇总%f\n", share[0]);
	}
}
// Helper function for using CUDA to sum an array in parallel.
// Two-stage reduction: stage 1 reduces each block of the input to one
// partial sum; stage 2 reduces the partial sums in a single block.
// On success, *sum receives the total of array_0[0..size-1].
// Returns a cudaError_t cast to int (0 == cudaSuccess).
// Preconditions: size > 0 and even; size/2 must not exceed the device's
// max threads per block (stage 2 runs as one block of size/2 threads).
extern "C" int addWithCuda(float* sum, const float* array_0, int size)
{
	// NULL-initialize so the cleanup path is safe even when an early
	// error jumps over the allocations (the original freed uninitialized
	// pointers in that case).
	float* dev_array = NULL;    // device copy of the input
	float* dev_partial = NULL;  // per-block partial sums (stage 1 output)
	float* dev_sum = NULL;      // final result, 1 float
	cudaError_t cudaStatus;
	const int threadsPerBlock = 2;
	int blocks = size / threadsPerBlock;

	// Validate arguments up front instead of launching a bad config.
	if (sum == NULL || array_0 == NULL || size <= 0 || size % threadsPerBlock != 0) {
		fprintf(stderr, "addWithCuda: invalid arguments\n");
		return cudaErrorInvalidValue;
	}

	// Choose which GPU to run on, change this on a multi-GPU system.
	cudaStatus = cudaSetDevice(0);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaSetDevice failed! Do you have a CUDA-capable GPU installed?");
		goto Error;
	}

	// Allocate GPU buffers: input, per-block partials, final sum.
	cudaStatus = cudaMalloc((void**)&dev_array, size * sizeof(float));
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMalloc failed!");
		goto Error;
	}
	cudaStatus = cudaMalloc((void**)&dev_partial, blocks * sizeof(float));
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMalloc failed!");
		goto Error;
	}
	cudaStatus = cudaMalloc((void**)&dev_sum, sizeof(float));
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMalloc failed!");
		goto Error;
	}

	// Copy the input vector from host memory to the GPU buffer.
	cudaStatus = cudaMemcpy(dev_array, array_0, size * sizeof(float), cudaMemcpyHostToDevice);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy failed!");
		goto Error;
	}

	// Stage 1: each block reduces threadsPerBlock elements to one partial.
	// (Launch config was previously hard-coded to <<<2,2>>>, i.e. size==4;
	// this computes the same config for size==4 and generalizes.)
	addKernel << <blocks, threadsPerBlock >> > (dev_array, dev_partial);
	// Stage 2: a single block reduces the partial sums to the final value.
	addKernel << <1, blocks >> > (dev_partial, dev_sum);

	// Check for any errors launching the kernels.
	cudaStatus = cudaGetLastError();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "addKernel launch failed: %s\n", cudaGetErrorString(cudaStatus));
		goto Error;
	}

	// cudaDeviceSynchronize waits for the kernels to finish, and returns
	// any errors encountered during execution.
	cudaStatus = cudaDeviceSynchronize();
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaDeviceSynchronize returned error code %d after launching addKernel!\n", cudaStatus);
		goto Error;
	}

	// Copy the result from GPU buffer to host memory.
	cudaStatus = cudaMemcpy(sum, dev_sum, sizeof(float), cudaMemcpyDeviceToHost);
	if (cudaStatus != cudaSuccess) {
		fprintf(stderr, "cudaMemcpy failed!");
		goto Error;
	}

Error:
	// Free all three device buffers (dev_partial was leaked in the
	// original). cudaFree(NULL) is a no-op, so this is safe on any path.
	cudaFree(dev_sum);
	cudaFree(dev_partial);
	cudaFree(dev_array);
	return cudaStatus;
}