本文利用CUDA实现向量的点积运算。主要思想是:每个线程计算两个相应元素的乘积,然后利用共享内存(__shared__)累计每个线程的结果,得到一个线程块内向量的部分内积,最后在CPU上对个线程块的结果求和,从而得到点积运算结果。
主要代码如下:
/********************************************************************
* dotProduct.cu
*********************************************************************/
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <cutil_inline.h>
#define imin(a,b) (a<b?a:b)
const int N = 33*1024;
const int threadsPerBlock = 256;
const int blocksPerGrid = imin(32, (N+threadsPerBlock-1)/threadsPerBlock);
bool InitCUDA(void)
{
......
}
/************************************************************************/
__global__ void dotProduct(float* a, float* b, float* c)
{
__shared__ float cache[threadsPerBlock];
int tid = threadIdx.x + blockIdx.x * blockDim.x;
int cacheIndex = threadIdx.x;
float temp = 0;
while(tid < N)
{
temp += a[tid] * b[tid];
tid += blockDim.x * gridDim.x;
}
cache[cacheIndex] = temp;
__syncthreads();
int i = blockDim.x/2;
while (i != 0)
{
if (cacheIndex < i)
{
cache[cacheIndex] += cache[cacheIndex + i];
}
__syncthreads();
i /= 2;
}
if (cacheIndex == 0)
{
c[blockIdx.x] = cache[0];
}
}
/************************************************************************/
int main(int argc, char* argv[])
{
if(!InitCUDA()) {
return 0;
}
float *a, *b, c, *partialc;
a = (float *)malloc(N * sizeof(float));
b = (float *)malloc(N * sizeof(float));
partialc = (float *)malloc(blocksPerGrid * sizeof(float));
float *dev_a, *dev_b, *dev_Partialc;
cutilSafeCall(cudaMalloc((void**)&dev_a, N*sizeof(float)));
cutilSafeCall(cudaMalloc((void**)&dev_b, N*sizeof(float)));
cutilSafeCall(cudaMalloc((void**)&dev_Partialc, blocksPerGrid*sizeof(float)));
for(int i=0; i<N; i++)
{
a[i] = i;
b[i] = i * 2;
}
cutilSafeCall(cudaMemcpy(dev_a, a, N*sizeof(float), cudaMemcpyHostToDevice));
cutilSafeCall(cudaMemcpy(dev_b, b, N*sizeof(float), cudaMemcpyHostToDevice));
dotProduct<<<blocksPerGrid, threadsPerBlock>>>(dev_a, dev_b, dev_Partialc);
cutilSafeCall(cudaMemcpy(partialc, dev_Partialc, blocksPerGrid*sizeof(float), cudaMemcpyDeviceToHost));
c = 0;
for (int i=0; i<blocksPerGrid; i++)
{
c += partialc[i];
printf("Partial %d: %.6g\n", i, partialc[i]);
}
#define sum_squares(x) (x*(x+1)*(2*x+1)/6)
printf("Does GPU value %.6g equals to %.6g ?\n", c, 2*sum_squares((float)(N-1)));
cutilSafeCall(cudaFree(dev_a));
cutilSafeCall(cudaFree(dev_b));
cutilSafeCall(cudaFree(dev_Partialc));
free(a);
free(b);
free(partialc);
return 0;
}
说明:
1. 函数__syncthreads()确保线程块中的每个线程都执行完该函数前的语句才开始执行下一条语句。这保证了对一个线程块内的各分量求和的时候,分量乘积已经计算结束。
2. 关于blocksPerGrid, threadsPerBlock值的确定:首先确定块内线程的数量threadsPerBlock, blocksPerGrid = (N+ threadsPerBlock - 1) / threadsPerBlock. 如果给定线程块的数量B, 当blocksPerBlock > B时,为避免线程的浪费降低性能,需要取min(B, blocksPerBlock).