并行规约求和
#include <cuda_runtime.h>
#include "device_launch_parameters.h"
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#define size 1024
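/*
Worked example of the in-place tree reduction on 10 elements (0..9), with the
array padded by zeros past its end. Each row shows the first 10 elements after
one pass; the last column is the pass number (row 0 is the initial data).
The stride doubles on every pass (1, 2, 4, 8) and the total, 45, ends up in
element 0.
0 1 2 3 4 5 6 7 8 9 0
1 1 5 3 9 5 13 7 17 9 1
6 1 5 3 22 5 13 7 17 9 2
28 1 5 3 22 5 13 7 17 9 3
45 1 5 3 22 5 13 7 17 9 4
*/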
/*
 * CHECK(call): evaluate a CUDA runtime call once and terminate the program
 * with a diagnostic on stderr if it did not return cudaSuccess.
 *
 * The body is wrapped in do { ... } while (0) so that `CHECK(x);` behaves as
 * a single statement (safe inside an unbraced if/else), and `call` is
 * parenthesized so that any expression can be passed.
 * Requires <stdio.h> (fprintf) and <stdlib.h> (exit).
 */
#define CHECK(call) \
do { \
    const cudaError_t check_err_ = (call); \
    if (check_err_ != cudaSuccess) \
    { \
        fprintf(stderr, "Error: %s:%d, ", __FILE__, __LINE__); \
        fprintf(stderr, "code:%d, reason: %s\n", (int)check_err_, \
                cudaGetErrorString(check_err_)); \
        exit(1); \
    } \
} while (0)
/*
 * Compare two float arrays element by element and print the verdict.
 *
 * hostRef: reference values.
 * gpuRef:  values to verify against the reference.
 * N:       number of elements to compare.
 *
 * Prints "Arrays match." when every pair differs by at most 1e-8; otherwise
 * prints the first mismatching pair with its index and stops comparing.
 */
void checkResult(float *hostRef,float *gpuRef,const int N){
    const double epsilon = 1.0E-8;
    for (int i = 0; i < N; i++)
    {
        /* fabsf, not abs: depending on which headers are in scope, a bare
           abs() can bind to the integer version and truncate any difference
           smaller than 1.0 to zero, hiding real mismatches. */
        if (fabsf(hostRef[i] - gpuRef[i]) > epsilon) {
            printf("Arrays do not match!\n");
            printf("host %5.2f gpu %5.2f at current %d\n", hostRef[i], gpuRef[i], i);
            return;
        }
    }
    printf("Arrays match.\n\n");
}
/*
 * In-place tree reduction: sums the first `size` elements of `a` and leaves
 * the total in a[0]. The other elements are overwritten with partial sums.
 *
 * Launch layout: exactly ONE block. Indexing uses threadIdx.x only and
 * __syncthreads() only orders threads of the same block, so several blocks
 * would race on the same data. Any block size is accepted: each thread walks
 * the pairs it owns with a block-sized step, so the block may have fewer or
 * more threads than `size`.
 * Shared memory: none.
 * a: device-accessible buffer holding at least `size` elements.
 */
__global__ void addKernel(long int*a)
{
    const long long lane  = threadIdx.x;
    const long long lanes = blockDim.x;

    for (long long stride = 1; stride < size; stride *= 2) {
        /* In this pass every element at a multiple of 2*stride absorbs its
           neighbour `stride` positions to the right. Thread `lane` starts at
           pair number `lane`; the `i + stride < size` bound keeps the
           neighbour inside the data, so no zero padding is required. */
        const long long span = 2 * stride;
        for (long long i = lane * span; i + stride < size; i += lanes * span)
            a[i] += a[i + stride];

        /* Reached unconditionally by every thread of the block: the next
           pass reads the sums written in this one. */
        __syncthreads();
    }
}
/*
 * Host driver: fills an array with 0..size-1, computes the reference sum on
 * the CPU, runs addKernel on a single block of `size` threads, and prints
 * both sums plus the time measured by CUDA events (milliseconds).
 * Every CUDA runtime call is checked with CHECK, which exits on failure.
 */
int main()
{
    /* Twice `size` elements, zero-filled: the upper half is padding. */
    long int a[size*2]={0};
    long int answer = 0;
    for (int i = 0; i < size; i++)
    {
        a[i] = i;
        answer += a[i];
    }
    /* %ld: `answer` is a long int; %d would be undefined behaviour. */
    printf("\n answer=%ld \n", answer);

    cudaEvent_t stop, start;
    CHECK(cudaEventCreate(&start));
    CHECK(cudaEventCreate(&stop));
    float elapsedTime = 0;

    long int *d_a;
    CHECK(cudaMalloc((void**)&d_a, sizeof(long int)*size*2));
    CHECK(cudaMemcpy(d_a, a, sizeof(long int)*size*2, cudaMemcpyHostToDevice));

    CHECK(cudaEventRecord(start, 0));
    addKernel<<<1,size,0>>>(d_a);
    /* A launch reports configuration errors only through cudaGetLastError. */
    CHECK(cudaGetLastError());
    /* cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize and
       surfaces errors raised while the kernel was running. */
    CHECK(cudaDeviceSynchronize());
    CHECK(cudaEventRecord(stop, 0));
    CHECK(cudaEventSynchronize(stop));
    CHECK(cudaEventElapsedTime(&elapsedTime, start, stop));
    printf("使用的时间为%f\n", elapsedTime);

    CHECK(cudaMemcpy(a, d_a, sizeof(long int)*size*2, cudaMemcpyDeviceToHost));
    printf("answer2 = %ld\n", a[0]);

    CHECK(cudaEventDestroy(start));
    CHECK(cudaEventDestroy(stop));
    CHECK(cudaFree(d_a));
    return 0;
}
这段代码只启动了一个线程块,因此只能对不超过单个线程块最大线程数(我这台电脑上是 1024)的元素求和,还无法处理元素个数超过 1024 的情况;之后再实现把数据分到多个线程块(Block)中分别求和的版本。
3月4日补充:
#include <stdio.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdlib.h>
#include <time.h>
#define size 10000
#define THREAD_NUM 256
/*
 * Sums the first `size` elements of `a` and leaves the total in a[0].
 * Other elements are overwritten with per-block partial sums.
 *
 * Launch layout: 1-D grid of 1-D blocks with gridDim.x * blockDim.x >= size;
 * extra threads and extra blocks are harmless. Any block size is accepted.
 * Shared memory: none.
 * a: device-accessible buffer holding at least `size` elements.
 *
 * __syncthreads() only synchronizes the threads of one block, so the tree
 * reduction is confined to each block's own slice. The per-block results are
 * then combined into a[0] with atomicAdd, which is safe across blocks.
 * Because other blocks may add into a[0] at any time, block 0 also updates
 * a[0] exclusively through atomicAdd.
 */
__global__ void addKernel(int*a)
{
    const int lane      = threadIdx.x;
    const int blockSize = blockDim.x;
    /* 64-bit global index: blockDim.x * blockIdx.x can exceed INT_MAX on
       very large grids. */
    const long long tid = (long long)blockDim.x * blockIdx.x + lane;

    for (int stride = 1; stride < blockSize; stride *= 2) {
        /* A thread absorbs its right-hand neighbour only if that neighbour
           lies in the same block and inside the data. */
        const bool active = (lane % (2 * stride) == 0)
                         && (lane + stride < blockSize)
                         && (tid + stride < size);
        if (active) {
            if (tid == 0)
                atomicAdd(&a[0], a[stride]);
            else
                a[tid] += a[tid + stride];
        }
        /* Outside the conditional so every thread of the block reaches it. */
        __syncthreads();
    }

    /* The first thread of every block except block 0 now holds that block's
       partial sum in a[tid]; fold it into the global total. */
    if (lane == 0 && blockIdx.x != 0 && tid < size)
        atomicAdd(&a[0], a[tid]);
}
/* Exit with a diagnostic on stderr when a CUDA runtime call has failed. */
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
        exit(1);
    }
}

/*
 * Host driver: fills an array with 0..size-1, computes the reference sum on
 * the CPU, runs addKernel over enough THREAD_NUM-sized blocks to cover the
 * data, and prints both sums plus the time measured by CUDA events
 * (milliseconds).
 */
int main()
{
    int block_num = THREAD_NUM;
    /* Ceiling division: the smallest number of blocks that covers `size`
       elements. The previous formula multiplied this by `size`, launching
       hundreds of thousands of idle blocks and overflowing int for larger
       sizes. */
    int grid_num = (size + block_num - 1) / block_num;

    /* Twice `size` elements, zero-filled: the upper half is padding. */
    int a[size*2]={0};
    int answer = 0;
    for (int i = 0; i < size; i++)
    {
        a[i] = i;
        answer += a[i];
    }
    printf("\n answer=%d \n", answer);

    cudaEvent_t stop, start;
    checkCuda(cudaEventCreate(&start), "cudaEventCreate(start)");
    checkCuda(cudaEventCreate(&stop), "cudaEventCreate(stop)");
    float elapsedTime = 0;

    int *d_a;
    checkCuda(cudaMalloc((void**)&d_a, sizeof(int)*size*2), "cudaMalloc");
    checkCuda(cudaMemcpy(d_a, a, sizeof(int)*size*2, cudaMemcpyHostToDevice),
              "cudaMemcpy(host to device)");

    checkCuda(cudaEventRecord(start, 0), "cudaEventRecord(start)");
    addKernel<<<grid_num, block_num,0>>>(d_a);
    /* A launch reports configuration errors only through cudaGetLastError. */
    checkCuda(cudaGetLastError(), "addKernel launch");
    /* cudaDeviceSynchronize replaces the deprecated cudaThreadSynchronize and
       surfaces errors raised while the kernel was running. */
    checkCuda(cudaDeviceSynchronize(), "addKernel execution");
    checkCuda(cudaEventRecord(stop, 0), "cudaEventRecord(stop)");
    checkCuda(cudaEventSynchronize(stop), "cudaEventSynchronize");
    checkCuda(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime");
    printf("使用的时间为%f\n", elapsedTime);

    checkCuda(cudaMemcpy(a, d_a, sizeof(int)*size*2, cudaMemcpyDeviceToHost),
              "cudaMemcpy(device to host)");
    printf("answer2 = %d\n", a[0]);

    checkCuda(cudaEventDestroy(start), "cudaEventDestroy(start)");
    checkCuda(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    checkCuda(cudaFree(d_a), "cudaFree");
    return 0;
}
这段代码在 size 不超过 10000 时可以成功运行并输出正确的答案,但无法计算 size 大于 10000 的情况。