1. Reduce on the GPU, then sum the per-block results on the CPU
#include "cuda_runtime.h"
#include "cublas_v2.h"
#include <time.h>
#include <iostream>
#define N (128 * 10)
#define threadsPerBlock 128
#define blocksPerGrid 8
using namespace std;
// Fill a with its index and b with the constant 2, using a grid-stride loop
// so any grid size covers all N elements.
__global__ void init(int *a, int *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < N)
    {
        a[tid] = tid;
        b[tid] = 2;
        tid += blockDim.x * gridDim.x;
    }
}
// Per-block dot product: each thread accumulates a private partial sum over a
// grid-stride loop, the block reduces the partial sums in shared memory, and
// thread 0 writes the block's result to c[blockIdx.x].
__global__ void dot(int *a, int *b, int *c)
{
    __shared__ int cache[threadsPerBlock];
    int temp = 0;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < N)
    {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }
    // Publish this thread's partial sum, then synchronize so every element of
    // cache is written before the reduction below reads it.
    cache[threadIdx.x] = temp;
    __syncthreads();
    // Tree reduction in shared memory: halve the active range each pass.
    int i = threadsPerBlock / 2;
    while (i != 0)
    {
        if (threadIdx.x < i)
            cache[threadIdx.x] += cache[i + threadIdx.x];
        __syncthreads();
        i /= 2;
    }
    if (threadIdx.x == 0)
    {
        c[blockIdx.x] = cache[0];
    }
}
int main()
{
    int a[N], b[N], c[blocksPerGrid];
    int *dev_a, *dev_b, *dev_c;
    int i, temp1 = 0, temp2 = 0;
    cudaMalloc((void **)&dev_a, N * sizeof(int));
    cudaMalloc((void **)&dev_b, N * sizeof(int));
    cudaMalloc((void **)&dev_c, blocksPerGrid * sizeof(int));
    // Initialize the inputs on the device, then compute one partial sum per block.
    init<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b);
    dot<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_c);
    // Copy the inputs back only for inspection; c holds the per-block partial sums.
    cudaMemcpy(a, dev_a, N * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(b, dev_b, N * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(c, dev_c, blocksPerGrid * sizeof(int), cudaMemcpyDeviceToHost);
    // Finish the reduction on the CPU: add up the per-block partial sums.
    for (i = 0; i < blocksPerGrid; i++)
    {
        temp1 += c[i];
    }
    // Reference result computed on the CPU for verification.
    for (i = 0; i < N; i++)
    {
        temp2 += i * 2;
    }
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    printf("calculation result: %d\n", temp1);
    printf("real result: %d\n", temp2);
    getchar();
    return 0;
}
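Both printed lines should show the same value: with N = 128 * 10 = 1280, the dot product is 2 * (0 + 1 + ... + 1279) = N * (N - 1) = 1280 * 1279 = 1637120. Assuming the source is saved as, for example, dot_partial.cu (the file name is only an illustration), it can be built with nvcc dot_partial.cu -o dot_partial and run directly.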
2. Reduce entirely on the GPU and obtain the result directly
This version uses an atomic lock (a spinlock built on atomicCAS) so that competing blocks cannot corrupt the result pointed to by dev_c when they add in their partial sums. If dev_c pointed to a floating-point value instead, atomicAdd() could be applied directly to the memory dev_c points to only on devices of compute capability 2.0 or higher; otherwise the result may be wrong.
#include "cuda_runtime.h"
#include "cublas_v2.h"
#include <time.h>
#include <iostream>
#define N (128 * 10)
#define threadsPerBlock 128
#define blocksPerGrid 8
using namespace std;
// Simple spinlock living in device global memory. The host-side constructor
// allocates the mutex and clears it; lock() spins on atomicCAS until it swaps
// the value from 0 (free) to 1 (held), and unlock() releases it with atomicExch.
struct Lock
{
    int *mutex;
    Lock(void)
    {
        int state = 0;
        cudaMalloc((void **)&mutex, sizeof(int));
        cudaMemcpy(mutex, &state, sizeof(int), cudaMemcpyHostToDevice);
    }
    ~Lock(void)
    {
        cudaFree(mutex);
    }
    __device__ void lock(void)
    {
        while (atomicCAS(mutex, 0, 1) != 0);
    }
    __device__ void unlock(void)
    {
        atomicExch(mutex, 0);
    }
};
// Fill a with its index and b with the constant 2, using a grid-stride loop
// so any grid size covers all N elements.
__global__ void init(int *a, int *b)
{
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < N)
    {
        a[tid] = tid;
        b[tid] = 2;
        tid += blockDim.x * gridDim.x;
    }
}
// Per-block dot product as before, but instead of writing one partial sum per
// block, thread 0 of each block acquires the lock and adds its block's result
// directly into the single accumulator *c.
__global__ void dot(Lock lock, int *a, int *b, int *c)
{
    __shared__ int cache[threadsPerBlock];
    int temp = 0;
    int tid = threadIdx.x + blockIdx.x * blockDim.x;
    while (tid < N)
    {
        temp += a[tid] * b[tid];
        tid += blockDim.x * gridDim.x;
    }
    // Publish this thread's partial sum, then synchronize so every element of
    // cache is written before the reduction below reads it.
    cache[threadIdx.x] = temp;
    __syncthreads();
    // Tree reduction in shared memory.
    int i = threadsPerBlock / 2;
    while (i != 0)
    {
        if (threadIdx.x < i)
            cache[threadIdx.x] += cache[i + threadIdx.x];
        __syncthreads();
        i /= 2;
    }
    if (threadIdx.x == 0)
    {
        lock.lock();
        *c += cache[0];
        lock.unlock();
        // If *c were a floating-point value, atomicAdd(c, cache[0]); could replace
        // the three lines above, but only on compute capability 2.0 or higher.
    }
}
int main()
{
    int a[N], b[N], c = 0;
    int *dev_a, *dev_b, *dev_c;
    int i, temp1 = 0, temp2 = 0;
    Lock lock;
    cudaMalloc((void **)&dev_a, N * sizeof(int));
    cudaMalloc((void **)&dev_b, N * sizeof(int));
    cudaMalloc((void **)&dev_c, sizeof(int));
    // The accumulator must start at zero, otherwise *c += cache[0] adds onto
    // whatever the freshly allocated memory happens to contain.
    cudaMemcpy(dev_c, &c, sizeof(int), cudaMemcpyHostToDevice);
    init<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b);
    dot<<<blocksPerGrid, threadsPerBlock>>>(lock, dev_a, dev_b, dev_c);
    // Copy the inputs back only for inspection; c receives the final dot product.
    cudaMemcpy(a, dev_a, N * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(b, dev_b, N * sizeof(int), cudaMemcpyDeviceToHost);
    cudaMemcpy(&c, dev_c, sizeof(int), cudaMemcpyDeviceToHost);
    temp1 = c;
    // Reference result computed on the CPU for verification.
    for (i = 0; i < N; i++)
    {
        temp2 += i * 2;
    }
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);
    printf("calculation result: %d\n", temp1);
    printf("real result: %d\n", temp2);
    getchar();
    return 0;
}
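Because the accumulator in this second version is an int, the spinlock is not strictly required. Below is a minimal sketch of an alternative ending for the dot kernel (everything else unchanged), where each block's thread 0 folds its partial sum into *c with a single atomic operation; atomicAdd on a 32-bit int in global memory is available from compute capability 1.1, while the float overload mentioned above needs 2.0 or higher.
if (threadIdx.x == 0)
{
    // One atomic add per block replaces lock.lock(); *c += cache[0]; lock.unlock();
    atomicAdd(c, cache[0]);
}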