欢迎访问小站,阅读原文 http://www.yandong.org/archives/755
基础函数
基本思路:并行计算乘法,串行将每个BLOCK中的所有线程的结果在thread=0线程中汇总,
__global__ void dot( TYPE *a, TYPE *b, TYPE *c)
{
__shared__ TYPE temp[threadsPerBlock];
int index = threadIdx.x + blockIdx.x * blockDim.x;
temp[threadIdx.x] = a[index] * b[index];
__syncthreads();
if( 0 == threadIdx.x ) {
for( int i= 0; i< threadsPerBlock; i++ )
{
c[blockIdx.x] += temp[i];
}
}
}
改进1 树形加法
基本思路:并行乘法,并行加法。每个BLOCK中的数据汇总也可以使用并行计算,采用树形加法。
/*并行乘法,并行加法*/
__global__ void dot3( TYPE *a, TYPE *b, TYPE *c)
{
__shared__ TYPE temp[threadsPerBlock];
int index = threadIdx.x + blockIdx.x * blockDim.x;
int offset=1, mask=1;
temp[threadIdx.x] = a[index] * b[index];
__syncthreads();/*树形加法*/
while(offset < threadsPerBlock) {
if((threadIdx.x & mask) == 0) {
temp[threadIdx.x] += temp[threadIdx.x + offset];
}
offset += offset;
mask = offset + mask;
__syncthreads();
}
if(threadIdx.x == 0) {
c[blockIdx.x] = temp[0];
}
}
改进2 浮点数精度
基本思路:浮点运算的精度问题,采用Kahan's Summation Formula 来提高精确度。 并将main函数中的c由float类型改为double类型。
/*并行乘法,并行加法, 浮点数精度优化Kahan's Summation Formula*/
__global__ void dot( TYPE *a, TYPE *b, TYPE *c)
{
__shared__ TYPE temp[threadsPerBlock];
int index = threadIdx.x + blockIdx.x * blockDim.x;
int offset=1, mask=1;
TYPE y=0,r;
/*Kahan's Summation Formula*/
temp[threadIdx.x] =0;
y -= a[index] * b[index];
r=temp[threadIdx.x]-y;
y=(r-temp[threadIdx.x])+y;
temp[threadIdx.x] =r;
__syncthreads();
while(offset < threadsPerBlock) {
if((threadIdx.x & mask) == 0) {
temp[threadIdx.x] += temp[threadIdx.x + offset];
}
offset += offset;
mask = offset + mask;
__syncthreads();
}
if(threadIdx.x == 0) {
c[blockIdx.x] = temp[0];
}
}
改进3 Bank conflict
基本思路:改进2的树形加法存在Bank conflict的问题。
/*并行乘法,并行加法, 浮点数精度优化Kahan's Summation Formula 解决bank conflict*/
__global__ void dot3( TYPE *a, TYPE *b, TYPE *c)
{
__shared__ TYPE temp[threadsPerBlock];
int index = threadIdx.x + blockIdx.x * blockDim.x;
int offset=1;
TYPE y=0,r;
/*Kahan's Summation Formula*/
temp[threadIdx.x] =0;
y -= a[index] * b[index];
r=temp[threadIdx.x]-y;
y=(r-temp[threadIdx.x])+y;
temp[threadIdx.x] =r;
__syncthreads();
offset = threadsPerBlock / 2;
while(offset > 0) {
if(threadIdx.x < offset) {
temp[threadIdx.x] += temp[threadIdx.x + offset];
}
offset >>= 1;
__syncthreads();
}
if(threadIdx.x == 0) {
c[blockIdx.x] = temp[0];
}
}
改进4 循环展开
基本思路:将树形加法进行循环展开
/*并行乘法,并行加法, 浮点数精度优化Kahan's Summation Formula 解决bank conflict 循环展开*/
__global__ void dot4( TYPE *a, TYPE *b, TYPE *c)
{
__shared__ TYPE temp[threadsPerBlock];
int index = threadIdx.x + blockIdx.x * blockDim.x;
int offset=1;
TYPE y=0,r;
/*Kahan's Summation Formula*/
temp[threadIdx.x] =0;
y -= a[index] * b[index];
r=temp[threadIdx.x]-y;
y=(r-temp[threadIdx.x])+y;
temp[threadIdx.x] =r;
__syncthreads();
if(threadIdx.x < 256) { temp[threadIdx.x] += temp[threadIdx.x + 256]; }
__syncthreads();
if(threadIdx.x < 128) { temp[threadIdx.x] += temp[threadIdx.x + 128]; }
__syncthreads();
if(threadIdx.x < 64) { temp[threadIdx.x] += temp[threadIdx.x + 64]; }
__syncthreads();
if(threadIdx.x < 32) { temp[threadIdx.x] += temp[threadIdx.x + 32]; }
__syncthreads();
if(threadIdx.x < 16) { temp[threadIdx.x] += temp[threadIdx.x + 16]; }
__syncthreads();
if(threadIdx.x < 8) { temp[threadIdx.x] += temp[threadIdx.x + 8]; }
__syncthreads();
if(threadIdx.x < 4) { temp[threadIdx.x] += temp[threadIdx.x + 4]; }
__syncthreads();
if(threadIdx.x < 2) { temp[threadIdx.x] += temp[threadIdx.x + 2]; }
__syncthreads();
if(threadIdx.x < 1) { temp[threadIdx.x] += temp[threadIdx.x + 1]; }
__syncthreads();
if(threadIdx.x == 0) {
c[blockIdx.x] = temp[0];
}
}
改进5 Ideal Thread
基本思路:解决空闲线程问题,即<32不再展开
/*并行乘法,并行加法, 浮点数精度优化Kahan's Summation Formula 解决bank conflict 循环展开, 空闲线程*/
__global__ void dot5(TYPE *a, TYPE *b, TYPE *c)
{
__shared__ TYPE temp[threadsPerBlock];
int index = threadIdx.x + blockIdx.x * blockDim.x;
int offset=1;
TYPE y=0,r;
/*Kahan's Summation Formula*/
temp[threadIdx.x] =0;
y -= a[index] * b[index];
r=temp[threadIdx.x]-y;
y=(r-temp[threadIdx.x])+y;
temp[threadIdx.x] =r;
__syncthreads();
if(threadIdx.x < 256) { temp[threadIdx.x] += temp[threadIdx.x + 256]; }
__syncthreads();
if(threadIdx.x < 128) { temp[threadIdx.x] += temp[threadIdx.x + 128]; }
__syncthreads();
if(threadIdx.x < 64) { temp[threadIdx.x] += temp[threadIdx.x + 64]; }
__syncthreads();
if (threadIdx.x < 32)
{
temp[threadIdx.x] += temp[threadIdx.x + 32];
temp[threadIdx.x] += temp[threadIdx.x + 16];
temp[threadIdx.x] += temp[threadIdx.x +8];
temp[threadIdx.x] += temp[threadIdx.x +4];
temp[threadIdx.x] += temp[threadIdx.x + 2];
temp[threadIdx.x] += temp[threadIdx.x + 1];
}
__syncthreads();
if(threadIdx.x == 0) {
c[blockIdx.x] = temp[0];
}
}
性能总结
改进 | 性能us |
dot0 | 1699738.016 |
Dot1 | 114682.144 |
Dot2 | 120819.072 |
Dot3 | 71126.24 |
Dot4 | 57975.072 |
Dot5 | 49489.856 |