1. seperated threads and interleaved addressing
template <class T>
__global__ void reduce0(T *g_idata, T *g_odata, unsigned int n) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
T *sdata = SharedMemory<T>();
// load shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = (i < n) ? g_idata[i] : 0;
cg::sync(cta);
// do reduction in shared mem
for (unsigned int s = 1; s < blockDim.x; s *= 2) {
// modulo arithmetic is slow!
if ((tid % (2 * s)) == 0) {
sdata[tid] += sdata[tid + s];
}
cg::sync(cta);
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
分析:根据注释,这个算法用取模来决定哪些thread进行计算,取模运算符在GPU中效率很低,其次选择的thread都分布在不同的warp中,意味着每个循环每个warp都要执行计算,而一个block中的warps在硬件层面并不是完全并行的,需求同样多的thread情况下尽可能使用来自同一个warp的thread才能实现最大性能。
2. contiguous threads but with shared memory bank conflict
template <class T>
__global__ void reduce1(T *g_idata, T *g_odata, unsigned int n) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
T *sdata = SharedMemory<T>();
// load shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = (i < n) ? g_idata[i] : 0;
cg::sync(cta);
// do reduction in shared mem
for (unsigned int s = 1; s < blockDim.x; s *= 2) {
int index = 2 * s * tid;
if (index < blockDim.x) {
sdata[index] += sdata[index + s];
}
cg::sync(cta);
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
分析:这种方法改变了thread号与shared memory数据下标的对应关系,例如s=1时执行计算的是threadIdx.x=0-127(假设一个block中256个thread);s=2时,进行计算的thread为0-63;s=4时为0-31...这样随着循环的进行GPU使用了尽可能少的warp。但是这种方法会带来bank conflict。
bank conflict:发生于同一个warp的thread访问同一个shared memory中同一个bank的不同数据时。Volta架构中shared memory为32个bank,申请时连续4字节(1word)的数据会被分配在一个bank上,例如将连续的32个int存入shared memory中则每个数据都会被分配到不同的bank上。
上面的代码中,假设T为int型,则sdata[0]和sdata[32]会被分配到同一bank上,那么当s=2时,thread 0和thread 16虽然同时访问的是不同数据,但由于其位于同一内存bank所以不能一次取出,产生延迟。
3. squential addressing solves bank conflict
template <class T>
__global__ void reduce2(T *g_idata, T *g_odata, unsigned int n) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
T *sdata = SharedMemory<T>();
// load shared mem
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
sdata[tid] = (i < n) ? g_idata[i] : 0;
cg::sync(cta);
// do reduction in shared mem
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
sdata[tid] += sdata[tid + s];
}
cg::sync(cta);
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
分析:可以看出一个warp中32个thread如果各访问一个bank的数据则刚好不会产生bank conflict, 而上个算法之所以产生是因为其各thread访问的数据地址不连续,所以我们希望一个warp内的某个thread总是负责特定一个bank的reduction,例如thread 0-31无论在第几个循环,相邻thread访问的数据总是相邻的。
4. perform first level reduction at first
template <class T>
__global__ void reduce3(T *g_idata, T *g_odata, unsigned int n) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
T *sdata = SharedMemory<T>();
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
T mySum = (i < n) ? g_idata[i] : 0;
if (i + blockDim.x < n) mySum += g_idata[i + blockDim.x];
sdata[tid] = mySum;
cg::sync(cta);
// do reduction in shared mem
for (unsigned int s = blockDim.x / 2; s > 0; s >>= 1) {
if (tid < s) {
sdata[tid] = mySum = mySum + sdata[tid + s];
}
cg::sync(cta);
}
// write result for this block to global mem
if (tid == 0) g_odata[blockIdx.x] = mySum;
}
分析:与上个方法唯一不同是先在local memory(mySum)中计算2*blockDim.x个数(n>=2*blockDim.x)的一次reduction,再进行block-level的reduction。
至于为什么不多加几次,我也没弄清楚,但是想象一下如果进行太多片外memory (local 和global)的读写势必会造成一些block读数据较慢而使计算资源空闲。
5. threads communication with warp shuffle
template <class T, unsigned int blockSize>
__global__ void reduce4(T *g_idata, T *g_odata, unsigned int n) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
T *sdata = SharedMemory<T>();
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * (blockDim.x * 2) + threadIdx.x;
T mySum = (i < n) ? g_idata[i] : 0;
if (i + blockSize < n) mySum += g_idata[i + blockSize];
sdata[tid] = mySum;
cg::sync(cta);
// do reduction in shared mem
for (unsigned int s = blockDim.x / 2; s > 32; s >>= 1) {
if (tid < s) {
sdata[tid] = mySum = mySum + sdata[tid + s];
}
cg::sync(cta);
}
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);//得到当前thread所在的size为32的线程组
if (cta.thread_rank() < 32) {//lane ID<32
// Fetch final intermediate sum from 2nd warp
if (blockSize >= 64) mySum += sdata[tid + 32]; //完成mySum = sdata[0-31] + sdata[32-63],这里虽然不能用shuffle但也把它拿出循环也是为了提高性能
// Reduce final warp using shuffle
for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
mySum += tile32.shfl_down(mySum, offset);
}
}
分析: 关于使用shuffle操作的reduction在NVIDIA的developer blog中有详细介绍:https://devblogs.nvidia.com/faster-parallel-reductions-kepler/。 shuffle用来直接访问同一warp内特定thread中的数据,这里的reduction算法和3,4中一样,除了最后剩下32个数据时,在一个warp内用shufl_down来访问当前thread号+offset的thread中的mySum。
6. completely unrolled
template <class T, unsigned int blockSize>
__global__ void reduce5(T *g_idata, T *g_odata, unsigned int n) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
T *sdata = SharedMemory<T>();
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * (blockSize * 2) + threadIdx.x;
T mySum = (i < n) ? g_idata[i] : 0;
if (i + blockSize < n) mySum += g_idata[i + blockSize];
sdata[tid] = mySum;
cg::sync(cta);
// do reduction in shared mem
if ((blockSize >= 512) && (tid < 256)) {
sdata[tid] = mySum = mySum + sdata[tid + 256];
}
cg::sync(cta);
if ((blockSize >= 256) && (tid < 128)) {
sdata[tid] = mySum = mySum + sdata[tid + 128];
}
cg::sync(cta);
if ((blockSize >= 128) && (tid < 64)) {
sdata[tid] = mySum = mySum + sdata[tid + 64];
}
cg::sync(cta);
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
if (cta.thread_rank() < 32) {
// Fetch final intermediate sum from 2nd warp
if (blockSize >= 64) mySum += sdata[tid + 32];
// Reduce final warp using shuffle
for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
mySum += tile32.shfl_down(mySum, offset);
}
}
// write result for this block to global mem
if (cta.thread_rank() == 0) g_odata[blockIdx.x] = mySum;
}
分析: 算法和5相同,只是将for循环完全展开,因为维护需要额外的开销,所以简单地把循环展开也可以使性能提升。
7. sequentially addition
template <class T, unsigned int blockSize, bool nIsPow2>
__global__ void reduce6(T *g_idata, T *g_odata, unsigned int n) {
// Handle to thread block group
cg::thread_block cta = cg::this_thread_block();
T *sdata = SharedMemory<T>();
// perform first level of reduction,
// reading from global memory, writing to shared memory
unsigned int tid = threadIdx.x;
unsigned int i = blockIdx.x * blockSize * 2 + threadIdx.x;
unsigned int gridSize = blockSize * 2 * gridDim.x;
T mySum = 0;
// we reduce multiple elements per thread. The number is determined by the
// number of active thread blocks (via gridDim). More blocks will result
// in a larger gridSize and therefore fewer elements per thread
while (i < n) {
mySum += g_idata[i];
// ensure we don't read out of bounds -- this is optimized away for powerOf2
// sized arrays
if (nIsPow2 || i + blockSize < n) mySum += g_idata[i + blockSize];
i += gridSize;
}
// each thread puts its local sum into shared memory
sdata[tid] = mySum;
cg::sync(cta);
// do reduction in shared mem
if ((blockSize >= 512) && (tid < 256)) {
sdata[tid] = mySum = mySum + sdata[tid + 256];
}
cg::sync(cta);
if ((blockSize >= 256) && (tid < 128)) {
sdata[tid] = mySum = mySum + sdata[tid + 128];
}
cg::sync(cta);
if ((blockSize >= 128) && (tid < 64)) {
sdata[tid] = mySum = mySum + sdata[tid + 64];
}
cg::sync(cta);
cg::thread_block_tile<32> tile32 = cg::tiled_partition<32>(cta);
if (cta.thread_rank() < 32) {
// Fetch final intermediate sum from 2nd warp
if (blockSize >= 64) mySum += sdata[tid + 32];
// Reduce final warp using shuffle
for (int offset = tile32.size() / 2; offset > 0; offset /= 2) {
mySum += tile32.shfl_down(mySum, offset);
}
}
// write result for this block to global mem
if (cta.thread_rank() == 0) g_odata[blockIdx.x] = mySum;
}
分析: 每个while循环计算一次2*blocksize个数的一次reduction,整个grid在单次循环中把2*blocksize*gridDim.x个数据reduce到blocksize*gridDim.x个数据,经过整个while后只需要把block内每个thread中的mySum全部加起来,同样这里也是采用完全展开的方式。同样只剩最后一个warp时利用shuffle。