[CUDA] Computing the Inner Product of Two Arrays with GPU Parallel Reduction

1. Reduce within each block on the GPU, then sum the per-block results on the CPU

#include "cuda_runtime.h"  
#include "cublas_v2.h"  

#include <time.h>  
#include <iostream>  
#include <stdio.h>		// for printf / getchar

#define N (128 * 10)
#define threadsPerBlock 128
#define blocksPerGrid 8


using namespace std;

__global__ void init(int *a, int *b)
{
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	while(tid < N)
	{
		a[tid] = tid;
		b[tid] = 2;
		tid += blockDim.x * gridDim.x;
	}
		
}

__global__ void dot(int *a, int *b, int *c)
{
	int tid;
	__shared__ int cache[threadsPerBlock];
	int temp = 0;

	tid = threadIdx.x + blockIdx.x * blockDim.x;
	while(tid < N)
	{
		temp += a[tid] * b[tid];
		tid += blockDim.x * gridDim.x;
	}
	cache[threadIdx.x] = temp;	// each thread stores its partial sum in shared memory
	__syncthreads();			// all partial sums must be visible before the shared-memory reduction
	int i = threadsPerBlock / 2;
	while (i != 0)
	{
		if(threadIdx.x < i)
			cache[threadIdx.x] += cache[i + threadIdx.x];
		__syncthreads();
		i /= 2;
	}
	if(threadIdx.x == 0)
	{
		c[blockIdx.x] = cache[0];
	}
}
int main()
{
	int a[N], b[N], c[blocksPerGrid];
	int *dev_a, *dev_b, *dev_c;
	int i, temp1 = 0, temp2 = 0;

	cudaMalloc((void **)&dev_a, N * sizeof(int));
	cudaMalloc((void **)&dev_b, N * sizeof(int));
	cudaMalloc((void **)&dev_c, blocksPerGrid * sizeof(int));

	init<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b);
	dot<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c);
	
	cudaMemcpy(a, dev_a, N * sizeof(int), cudaMemcpyDeviceToHost);
	cudaMemcpy(b, dev_b, N * sizeof(int), cudaMemcpyDeviceToHost);
	cudaMemcpy(c, dev_c, blocksPerGrid * sizeof(int), cudaMemcpyDeviceToHost);

	for (i = 0; i < blocksPerGrid; i++)
	{
		temp1 += c[i];
	}

	for (i = 0; i < N; i++)
	{
		temp2 += i * 2;
	}

	cudaFree(dev_a);
	cudaFree(dev_b);
	cudaFree(dev_c);

	printf("calculation result: %d\n", temp1);
	printf("real result: %d\n", temp2);

	getchar();
	return 0;
}
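As a quick check on the printed values: the "real result" loop sums i * 2 for i from 0 to N - 1, which is 2 * (0 + 1 + ... + (N - 1)) = N * (N - 1) = 1280 * 1279 = 1,637,120, so both lines should print 1637120.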




2. Perform the entire reduction on the GPU and obtain the result directly

This version uses an atomic lock so that competing threads cannot corrupt the result when they modify the memory pointed to by dev_c. If dev_c pointed to a floating-point value, atomicAdd() could be applied directly to that memory only on devices with compute capability 2.0 or higher; on older devices the result could be wrong.
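Since the result here is an int, the same final accumulation can also be written with a plain atomicAdd() in place of the mutex-style Lock used below; a minimal sketch (atomicAdd() on a 32-bit int in global memory is available from compute capability 1.1 on):

	if(threadIdx.x == 0)
	{
		atomicAdd(c, cache[0]);	// hardware atomic add into the single global result, no explicit lock needed
	}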

#include "cuda_runtime.h"  
#include "cublas_v2.h"  

#include <time.h>  
#include <iostream>  
#include <stdio.h>		// for printf / getchar

#define N (128 * 10)
#define threadsPerBlock 128
#define blocksPerGrid 8


using namespace std;

struct Lock
{
	int *mutex;
	Lock(void)
	{
		int state = 0;
		cudaMalloc((void **)&mutex, sizeof(int));
		cudaMemcpy(mutex, &state, sizeof(int), cudaMemcpyHostToDevice);
	}
	~Lock(void)
	{
		cudaFree(mutex);
	}
	__device__ void lock(void)
	{
		while(atomicCAS(mutex, 0, 1) != 0);
	}
	__device__ void unlock(void)
	{
		atomicExch(mutex, 0);
	}
};

__global__ void init(int *a, int *b)
{
	int tid = threadIdx.x + blockIdx.x * blockDim.x;
	while(tid < N)
	{
		a[tid] = tid;
		b[tid] = 2;
		tid += blockDim.x * gridDim.x;
	}
		
}

__global__ void dot(Lock lock, int *a, int *b, int *c)
{
	int tid;
	__shared__ int cache[threadsPerBlock];
	int temp = 0;

	tid = threadIdx.x + blockIdx.x * blockDim.x;
	while(tid < N)
	{
		temp += a[tid] * b[tid];
		tid += blockDim.x * gridDim.x;
	}
	cache[threadIdx.x] = temp;	// each thread stores its partial sum in shared memory
	__syncthreads();			// all partial sums must be visible before the shared-memory reduction
	int i = threadsPerBlock / 2;
	while (i != 0)
	{
		if(threadIdx.x < i)
			cache[threadIdx.x] += cache[i + threadIdx.x];
		__syncthreads();
		i /= 2;
	}
	if(threadIdx.x == 0)
	{
		lock.lock();
		*c += cache[0];
		lock.unlock();
		// If *c were of floating-point type, atomicAdd(c, cache[0]); could replace the three lines above only on compute capability 2.0 or higher
	}
}
int main()
{
	int a[N], b[N], c = 0;
	int *dev_a, *dev_b, *dev_c;
	int i, temp1 = 0, temp2 = 0;
	Lock lock;

	cudaMalloc((void **)&dev_a, N * sizeof(int));
	cudaMalloc((void **)&dev_b, N * sizeof(int));
	cudaMalloc((void **)&dev_c, sizeof(int));
	cudaMemcpy(dev_c, &c, sizeof(int), cudaMemcpyHostToDevice);	// dev_c must start at 0, since the kernel accumulates into it

	init<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b);
	dot<<<blocksPerGrid, threadsPerBlock>>>(lock, dev_a, dev_b, dev_c);
	
	cudaMemcpy(a, dev_a, N * sizeof(int), cudaMemcpyDeviceToHost);
	cudaMemcpy(b, dev_b, N * sizeof(int), cudaMemcpyDeviceToHost);
	cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
	

	temp1 = c;

	for (i = 0; i < N; i++)
	{
		temp2 += i * 2;
	}

	cudaFree(dev_a);
	cudaFree(dev_b);
	cudaFree(dev_c);

	printf("calculation result: %d\n", temp1);
	printf("real result: %d\n", temp2);
	

	getchar();
	return 0;
}
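Both listings include cublas_v2.h but never call into cuBLAS. For comparison, here is a minimal sketch of the same inner product computed with cuBLAS's cublasSdot; it assumes float data (the cuBLAS dot routines operate on floating-point types), with the arrays filled the same way as in the init kernel above:

#include "cuda_runtime.h"
#include "cublas_v2.h"

#include <stdio.h>

#define N (128 * 10)

int main()
{
	float a[N], b[N], result = 0.0f;
	float *dev_a, *dev_b;

	// same data as the init kernel: a[i] = i, b[i] = 2
	for (int i = 0; i < N; i++)
	{
		a[i] = (float)i;
		b[i] = 2.0f;
	}

	cudaMalloc((void **)&dev_a, N * sizeof(float));
	cudaMalloc((void **)&dev_b, N * sizeof(float));
	cudaMemcpy(dev_a, a, N * sizeof(float), cudaMemcpyHostToDevice);
	cudaMemcpy(dev_b, b, N * sizeof(float), cudaMemcpyHostToDevice);

	cublasHandle_t handle;
	cublasCreate(&handle);
	// with the default (host) pointer mode, the scalar result is written back to the host variable
	cublasSdot(handle, N, dev_a, 1, dev_b, 1, &result);
	cublasDestroy(handle);

	printf("cublas result: %f\n", result);

	cudaFree(dev_a);
	cudaFree(dev_b);
	return 0;
}

Build with nvcc and link against cuBLAS (e.g. add -lcublas to the nvcc command line).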




 
