reduce
- trivial version
__global__ void reduce(int *input, unsigned int N, int *total){
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ int x[BSIZE];
    x[tid] = (i < N) ? input[i] : 0;  // pad the final block with zeros
    __syncthreads();
    // Interleaved addressing: each step halves the active threads, but the
    // modulo test scatters them across warps, so most warps stay divergent.
    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) {
        __syncthreads();
        if (tid % (2*stride) == 0)
            x[tid] += x[tid + stride];
    }
    if (tid == 0) atomicAdd(total, x[0]);  // one atomic per block
}
- optimized
__global__ void reduce(int *input, unsigned int N, int *total){
    unsigned int tid = threadIdx.x;
    unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
    __shared__ int x[BSIZE];
    x[tid] = (i < N) ? input[i] : 0;
    __syncthreads();
    // Sequential addressing: the active threads (tid < stride) are contiguous,
    // so while stride >= 32 each warp is either fully active or fully idle;
    // intra-warp divergence appears only once stride < 32.
    for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
        if (tid < stride)
            x[tid] += x[tid + stride];
        __syncthreads();
    }
    if (tid == 0) atomicAdd(total, x[0]);
}
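Either kernel is launched the same way; a minimal host-side sketch (a hypothetical driver, not from the original; assumes BSIZE is the compile-time block size used above, error checking omitted):
int reduce_on_device(const int *host_input, unsigned int N){
    int *d_input, *d_total;
    cudaMalloc(&d_input, N * sizeof(int));
    cudaMalloc(&d_total, sizeof(int));
    cudaMemcpy(d_input, host_input, N * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemset(d_total, 0, sizeof(int));            // atomicAdd accumulates into 0
    unsigned int blocks = (N + BSIZE - 1) / BSIZE;  // one partial sum per block
    reduce<<<blocks, BSIZE>>>(d_input, N, d_total);
    int total;
    cudaMemcpy(&total, d_total, sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_input); cudaFree(d_total);
    return total;
}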
scan
- recursive version
void exclusive_scan_recursive(int* start, int* end, int* output, int* scratch)
{
    int N = end - start;  // assumes N is a power of two, so every level halves evenly
    if (N == 0) return;
    else if (N == 1)
    {
        output[0] = 0;
        return;
    }
    // sum pairs in parallel.
    for (int i = 0; i < N/2; i++)
        output[i] = start[2*i] + start[2*i+1];
    // prefix sum on the compacted array.
    exclusive_scan_recursive(output, output + N/2, scratch, scratch + (N/2));
    // finally, update the odd values in parallel.
    for (int i = 0; i < N; i++)
    {
        output[i] = scratch[i/2];
        if (i % 2)
            output[i] += start[i-1];
    }
}
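A small driver to check it (my sketch, not from the original; the scratch buffer needs roughly N ints in total across all recursion levels):
#include <stdio.h>

int main(void){
    int a[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    int out[8], scratch[8];    // N/2 + N/4 + ... + 1 = N-1 ints of scratch
    exclusive_scan_recursive(a, a + 8, out, scratch);
    for (int i = 0; i < 8; i++)
        printf("%d ", out[i]); // prints: 0 1 3 6 10 15 21 28
    printf("\n");
    return 0;
}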
- iterative version
// work-efficient (Blelloch) scan: the upsweep builds a tree of partial sums
// in place, the downsweep distributes them back down as exclusive prefixes.
void exclusive_scan_iterative(int* data, int* end)
{
    int N = end - data;
    // upsweep phase.
    for (int twod = 1; twod < N; twod *= 2)
    {
        int twod1 = twod*2;
        parallel_for (int i = 0; i < N; i += twod1)
            data[i+twod1-1] += data[i+twod-1];
    }
    data[N-1] = 0;  // the root's exclusive prefix is the identity
    // downsweep phase.
    for (int twod = N/2; twod >= 1; twod /= 2)
    {
        int twod1 = twod*2;
        parallel_for (int i = 0; i < N; i += twod1)
        {
            int t = data[i+twod-1];
            data[i+twod-1] = data[i+twod1-1];
            // change twod1 below to twod to reverse prefix sum.
            data[i+twod1-1] += t;
        }
    }
}
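To make the two phases concrete, here is data = [3, 1, 7, 0] (N = 4) after each step:
upsweep   twod=1:  [3, 4, 7, 7]    (right slot of each pair holds the pair sum)
upsweep   twod=2:  [3, 4, 7, 11]   (data[3] now holds the total)
data[N-1] = 0:     [3, 4, 7, 0]
downsweep twod=2:  [3, 0, 7, 4]
downsweep twod=1:  [0, 3, 4, 11]   (= exclusive prefix sum of [3, 1, 7, 0])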
- CUDA version, sometimes faster than Thrust
another version from NVIDIA: https://developer.nvidia.com/gpugems/GPUGems3/gpugems3_ch39.html
// Strides that cross block boundaries need a global sync; the only one
// available is a kernel-launch boundary, so each such step is its own launch.
__global__ void upsweep_out_block(int* array, int length, int twod){
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int twod1 = twod*2;
    if (index % twod1 == 0 && index < length){
        array[index+twod1-1] += array[index+twod-1];
    }
}
// Strides that fit inside one block need no global sync, only __syncthreads().
__global__ void upsweep_within_block(int* array, int length){
    __shared__ int local_copy[kThreadsPerBlock];
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    local_copy[threadIdx.x] = array[index];
    __syncthreads();
    for (int twod = 1; twod < kThreadsPerBlock; twod *= 2){
        int twod1 = twod*2;
        // Keep the bounds test inside the loop so every thread reaches the
        // __syncthreads(); a barrier inside a divergent branch is undefined.
        if (index < length && index % twod1 == 0){
            local_copy[threadIdx.x+twod1-1] += local_copy[threadIdx.x+twod-1];
        }
        __syncthreads();
    }
    array[index] = local_copy[threadIdx.x];
}
__global__ void downsweep_out_block(int* array, int length, int twod){
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    int twod1 = twod*2;
    if (index % twod1 == 0 && index < length){
        int t = array[index+twod-1];
        array[index+twod-1] = array[index+twod1-1];
        array[index+twod1-1] += t;
    }
}
// Mirror of upsweep_within_block; the barrier already sits outside the branch.
__global__ void downsweep_within_block(int* array, int length){
    __shared__ int local_copy[kThreadsPerBlock];
    int index = blockIdx.x * blockDim.x + threadIdx.x;
    local_copy[threadIdx.x] = array[index];
    __syncthreads();
    for (int twod = kThreadsPerBlock/2; twod >= 1; twod /= 2){
        int twod1 = twod*2;
        if (index % twod1 == 0 && index < length){
            int t = local_copy[threadIdx.x+twod-1];
            local_copy[threadIdx.x+twod-1] = local_copy[threadIdx.x+twod1-1];
            local_copy[threadIdx.x+twod1-1] += t;
        }
        __syncthreads();
    }
    array[index] = local_copy[threadIdx.x];
}
void exclusive_scan(int* device_start, int length, int* device_result)
{
    // assumes device_result already holds a copy of the input, padded with
    // zeros out to nextPow2(length)
    int rounded_length = nextPow2(length);
    int blocks = (rounded_length + kThreadsPerBlock - 1) / kThreadsPerBlock;
    // upsweep: strides below the block size in a single launch, then one
    // launch per cross-block stride (the launch is the global sync)
    upsweep_within_block<<<blocks, kThreadsPerBlock>>>(device_result, rounded_length);
    for (int twod = kThreadsPerBlock; twod < rounded_length; twod *= 2){
        upsweep_out_block<<<blocks, kThreadsPerBlock>>>(device_result, rounded_length, twod);
    }
    // clear the root before the downsweep
    cudaMemset(device_result + rounded_length - 1, 0, sizeof(int));
    for (int twod = rounded_length/2; twod >= kThreadsPerBlock; twod /= 2){
        downsweep_out_block<<<blocks, kThreadsPerBlock>>>(device_result, rounded_length, twod);
    }
    downsweep_within_block<<<blocks, kThreadsPerBlock>>>(device_result, rounded_length);
}
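The driver calls a nextPow2 helper that isn't shown; a common bit-twiddling version plus a setup sketch (both my assumptions, not from the original; the within-block kernels read a full block per launch, so the buffer should hold at least max(rounded_length, kThreadsPerBlock) ints with the padding zeroed):
// assumed helper: smallest power of two >= n
static inline int nextPow2(int n){
    n--;
    n |= n >> 1; n |= n >> 2; n |= n >> 4;
    n |= n >> 8; n |= n >> 16;
    return n + 1;
}

// hypothetical setup around exclusive_scan
int rounded = nextPow2(length);
int *device_result;
cudaMalloc(&device_result, rounded * sizeof(int));
cudaMemset(device_result, 0, rounded * sizeof(int));  // zero the padding
cudaMemcpy(device_result, device_start, length * sizeof(int), cudaMemcpyDeviceToDevice);
exclusive_scan(device_start, length, device_result);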
quick sort
- naive: the partition is serial, only the recursion runs in parallel
function QuickSort(A){
    p <- random pivot
    s <- 1, t <- |A|
    // serial partition: walk both ends inward, swapping misplaced pairs
    while (s < t){
        while (A[s] < p) s++
        while (A[t] > p) t--
        if (s < t) swap(A[s], A[t])
    }
    // L, M, R: the elements <p, =p, >p left by the partition
    in parallel:
        QuickSort(L)
        QuickSort(R)
    return L + M + R
}
- optimized: partition with three parallel Selects
function QuickSort(A){
    p <- random pivot
    // parallelized partition: each Select is a map + prefix sum + scatter
    L <- Select(A, <p)
    M <- Select(A, =p)
    R <- Select(A, >p)
    in parallel:
        QuickSort(L)
        QuickSort(R)
    return L + M + R
}
select
A   = 1 2 3 4 5 6 7 8
In  = 1 0 0 1 0 0 1 1   (In[i] = 1 iff A[i] satisfies the predicate)
P   = 0 1 1 1 2 2 2 3   (exclusive prefix sum of In)
Out = 1 4 7 8
function Select(A, In){
    P <- PrefixSum(In)   // P[i] is A[i]'s slot in the compacted output
    parallel_for (i <- 0 to |A|-1)
        if (In[i]) Out[P[i]] <- A[i]
}
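Mapped onto the CUDA scan above, Select is just two small kernels around exclusive_scan (a sketch with hypothetical names; assumes the In/P buffers are padded the way exclusive_scan requires):
// flag kernel: mark the elements that pass the predicate (here: A[i] < pivot)
__global__ void make_flags(const int* A, int* In, int N, int pivot){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N) In[i] = (A[i] < pivot) ? 1 : 0;
}

// scatter kernel: P is the exclusive prefix sum of In, so P[i] is the
// output slot of the i-th passing element
__global__ void scatter(const int* A, const int* In, const int* P, int* Out, int N){
    int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < N && In[i]) Out[P[i]] = A[i];
}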