并行规约求和-CSDN博客

本文链接：https://blog.csdn.net/Alfafar/article/details/114335689

并行规约求和

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <cuda_runtime.h>
#include "device_launch_parameters.h"

/* Number of elements to sum; the launch in main() uses one block of `size` threads. */
#define size 1024
/*
 * Worked example of the in-place tree reduction on the ten values 0..9.
 * Each row shows the array after one pass; the last column is the pass
 * number (row 0 is the input, pass k uses stride 2^(k-1)). The total ends
 * up in element 0.
 *
 *    0 1 2 3  4 5  6 7  8 9    0
 *    1 1 5 3  9 5 13 7 17 9    1
 *    6 1 5 3 22 5 13 7 17 9    2
 *   28 1 5 3 22 5 13 7 17 9    3
 *   45 1 5 3 22 5 13 7 17 9    4
 */
/*
 * CHECK(call): evaluate a CUDA runtime call exactly once and abort on failure.
 *
 * If `call` does not return cudaSuccess, the source location, the numeric
 * error code and the runtime's error string are printed and the process
 * exits with status 1.
 *
 * The body is wrapped in do { ... } while (0) so that `CHECK(x);` behaves as
 * a single statement (safe inside an unbraced if/else), and the argument is
 * parenthesized so any expression can be passed.
 */
#define CHECK(call)                                                        \
    do {                                                                   \
        const cudaError_t check_err_ = (call);                             \
        if (check_err_ != cudaSuccess)                                     \
        {                                                                  \
            printf("Error: %s:%d, ", __FILE__, __LINE__);                  \
            printf("code:%d, reason: %s\n", (int)check_err_,               \
                   cudaGetErrorString(check_err_));                        \
            exit(1);                                                       \
        }                                                                  \
    } while (0)
/*
 * Compare two float arrays element by element and report the outcome on stdout.
 *
 * hostRef  reference values (N elements)
 * gpuRef   values to verify (N elements)
 * N        number of elements to compare
 *
 * Prints "Arrays match." when every pair differs by at most 1e-8; otherwise
 * prints the first mismatching pair and its index and stops comparing.
 */
void checkResult(float *hostRef,float *gpuRef,const int N){
	const double epsilon=1.0E-8;
	for(int i=0;i<N;i++)
	{
		// Identical values (including equal infinities) always match.
		if(hostRef[i]==gpuRef[i])
			continue;
		// fabs() is used explicitly: a bare abs() may resolve to the integer
		// overload and truncate small differences to 0.
		const double diff=fabs(hostRef[i]-gpuRef[i]);
		// Written as !(diff<=epsilon) so that a NaN difference is reported
		// as a mismatch instead of slipping through the comparison.
		if(!(diff<=epsilon)){
			printf("Arrays do not match!\n");
			printf("host %5.2f gpu %5.2f at current %d\n",hostRef[i],gpuRef[i],i);
			return;
		}
	}
	printf("Arrays match.\n\n");
}

/*
 * In-place tree reduction: sums a[0 .. size-1] and leaves the total in a[0].
 *
 * Launch requirements: exactly one 1-D block with blockDim.x >= size.
 * __syncthreads() only orders threads of the same block, so the partial sums
 * produced in one pass are visible to the next pass only within that block.
 *
 * Other elements of `a` are overwritten with intermediate partial sums.
 */
__global__ void addKernel(long int*a)
{
	const int tid=threadIdx.x;

	for(int stride=1;stride<size;stride*=2){
		// Threads outside the data range are filtered here instead of
		// returning early, so every thread of the block reaches the barrier
		// below. The tid+stride bound keeps the read inside the first `size`
		// elements, so the result does not depend on zero padding.
		if(tid<size && tid%(2*stride)==0 && tid+stride<size)
			a[tid]+=a[tid+stride];
		// Make this pass's partial sums visible before the next pass reads them.
		__syncthreads();
	}
}


/* Report a failed CUDA call on stderr. Returns 1 on failure, 0 on success. */
static int cudaFailed(cudaError_t err, const char *what)
{
	if (err == cudaSuccess)
		return 0;
	fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
	return 1;
}

/*
 * Sums 0 .. size-1 on the host and on the GPU (single-block reduction),
 * prints both results and the kernel time in milliseconds.
 * Returns 0 on success, 1 if any CUDA call fails (device resources are then
 * left for the runtime to reclaim at process exit).
 */
int main()
{
	// The buffer is twice as long as the data and zero-filled beyond `size`,
	// so any read past `size` by the reduction stays in bounds and adds 0.
	long int a[size*2]={0};
	long int answer = 0;
	for (int i = 0; i < size; i++)
	{
		a[i] = i;
		answer += a[i];
	}
	// %ld: the values are long int; %d here was undefined behaviour.
	printf("\n answer=%ld \n", answer);

	const size_t bytes = sizeof(long int) * size * 2;
	cudaEvent_t start, stop;
	float elapsedTime = 0;
	long int *d_a = NULL;

	if (cudaFailed(cudaEventCreate(&start), "cudaEventCreate(start)"))
		return 1;
	if (cudaFailed(cudaEventCreate(&stop), "cudaEventCreate(stop)"))
		return 1;
	if (cudaFailed(cudaMalloc((void**)&d_a, bytes), "cudaMalloc"))
		return 1;
	if (cudaFailed(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(H2D)"))
		return 1;

	// Time only the kernel: the events bracket the launch on the default stream.
	if (cudaFailed(cudaEventRecord(start, 0), "cudaEventRecord(start)"))
		return 1;
	addKernel<<<1, size, 0>>>(d_a);
	// A launch does not return a status; configuration errors show up here.
	if (cudaFailed(cudaGetLastError(), "addKernel launch"))
		return 1;
	if (cudaFailed(cudaEventRecord(stop, 0), "cudaEventRecord(stop)"))
		return 1;
	// Waiting on the stop event also waits for the kernel, replacing the
	// deprecated cudaThreadSynchronize().
	if (cudaFailed(cudaEventSynchronize(stop), "cudaEventSynchronize"))
		return 1;
	if (cudaFailed(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime"))
		return 1;
	printf("使用的时间为%f\n", elapsedTime);

	if (cudaFailed(cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(D2H)"))
		return 1;
	printf("answer2 = %ld\n", a[0]);

	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaFree(d_a);
	return 0;
}

这段代码只使用了一个线程块（Block），因此最多只能对 1024 个元素求和——1024 是我这台电脑上每个线程块的最大线程数。元素个数超过 1024 时，需要把数据分到多个 Block 中分别求和，再把各个 Block 的结果合并，这部分之后再补充。

3月4日补充：

#include <stdio.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <stdlib.h>
#include <time.h>

/* Number of elements to sum. */
#define size 10000
/* Threads per block used for the kernel launch in main(). */
#define THREAD_NUM 256
/*
 * Sums a[0 .. size-1] and leaves the total in a[0]; works across many blocks.
 *
 * Launch requirements: 1-D grid of 1-D blocks; any grid size and any block
 * size are accepted. Uses 4 KB of static shared memory per block.
 *
 * How it works:
 *   1. every thread accumulates a private sum over its elements,
 *   2. each block combines its threads' sums in shared memory,
 *   3. thread 0 of each block adds the block's sum to a[0] with atomicAdd.
 * a[0] doubles as the accumulator, so its own value is never loaded as an
 * input: it is already part of the total. During the kernel a[0] is touched
 * only through atomicAdd and a[1..] is only read, so blocks never race with
 * each other (__syncthreads() alone cannot order different blocks).
 *
 * Elements other than a[0] are left unmodified. The total is complete once
 * the kernel has finished.
 */
__global__ void addKernel(int*a)
{
	// One slot per thread; 1024 is the largest block size CUDA allows.
	__shared__ int partial[1024];

	const unsigned int lane=threadIdx.x;
	const size_t firstInBlock=(size_t)blockIdx.x*blockDim.x;

	// Blocks that start beyond the data have nothing to contribute. The test
	// is the same for every thread of a block, so no thread is left waiting
	// at a barrier.
	if(firstInBlock>=size)
		return;

	// Step through the data by the total thread count so every element is
	// covered even when the grid is smaller than the data.
	const size_t step=(size_t)gridDim.x*blockDim.x;
	int sum=0;
	for(size_t i=firstInBlock+lane;i<size;i+=step){
		if(i!=0)	// a[0] is the accumulator, see header comment
			sum+=a[i];
	}
	partial[lane]=sum;
	__syncthreads();

	// Pairwise reduction in shared memory. Halving with round-up keeps it
	// correct for block sizes that are not powers of two.
	for(unsigned int width=blockDim.x;width>1;){
		const unsigned int half=(width+1)/2;
		if(lane+half<width)
			partial[lane]+=partial[lane+half];
		__syncthreads();
		width=half;
	}

	// One atomic per block; skipped when the block's sum is zero.
	if(lane==0 && partial[0]!=0)
		atomicAdd(&a[0],partial[0]);
}


/* Report a failed CUDA call on stderr. Returns 1 on failure, 0 on success. */
static int cudaFailed(cudaError_t err, const char *what)
{
	if (err == cudaSuccess)
		return 0;
	fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
	return 1;
}

/*
 * Sums 0 .. size-1 on the host and on the GPU (multi-block launch), prints
 * both results and the kernel time in milliseconds.
 * Returns 0 on success, 1 if any CUDA call fails (device resources are then
 * left for the runtime to reclaim at process exit).
 */
int main()
{
	const int block_num = THREAD_NUM;
	// Ceil-division: just enough blocks to give every element one thread.
	// (The previous expression multiplied by `size` again and launched
	// hundreds of thousands of idle blocks.)
	const int grid_num = (size + block_num - 1) / block_num;

	// The buffer is twice as long as the data and zero-filled beyond `size`,
	// so any read past `size` by the reduction stays in bounds and adds 0.
	int a[size*2]={0};
	int answer = 0;
	for (int i = 0; i < size; i++)
	{
		a[i] = i;
		answer += a[i];
	}
	printf("\n answer=%d \n", answer);

	const size_t bytes = sizeof(int) * size * 2;
	cudaEvent_t start, stop;
	float elapsedTime = 0;
	int *d_a = NULL;

	if (cudaFailed(cudaEventCreate(&start), "cudaEventCreate(start)"))
		return 1;
	if (cudaFailed(cudaEventCreate(&stop), "cudaEventCreate(stop)"))
		return 1;
	if (cudaFailed(cudaMalloc((void**)&d_a, bytes), "cudaMalloc"))
		return 1;
	if (cudaFailed(cudaMemcpy(d_a, a, bytes, cudaMemcpyHostToDevice), "cudaMemcpy(H2D)"))
		return 1;

	// Time only the kernel: the events bracket the launch on the default stream.
	if (cudaFailed(cudaEventRecord(start, 0), "cudaEventRecord(start)"))
		return 1;
	addKernel<<<grid_num, block_num, 0>>>(d_a);
	// A launch does not return a status; configuration errors show up here.
	if (cudaFailed(cudaGetLastError(), "addKernel launch"))
		return 1;
	if (cudaFailed(cudaEventRecord(stop, 0), "cudaEventRecord(stop)"))
		return 1;
	// Waiting on the stop event also waits for the kernel, replacing the
	// deprecated cudaThreadSynchronize().
	if (cudaFailed(cudaEventSynchronize(stop), "cudaEventSynchronize"))
		return 1;
	if (cudaFailed(cudaEventElapsedTime(&elapsedTime, start, stop), "cudaEventElapsedTime"))
		return 1;
	printf("使用的时间为%f\n", elapsedTime);

	if (cudaFailed(cudaMemcpy(a, d_a, bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(D2H)"))
		return 1;
	printf("answer2 = %d\n", a[0]);

	cudaEventDestroy(start);
	cudaEventDestroy(stop);
	cudaFree(d_a);
	return 0;
}