最近在研究用 CUDA 实现排序算法，这是其中一次尝试。下面是用 CUDA 实现 QuickSort 排序算法的核心代码。全部代码仍在完善中，欢迎纠错。
#include "gpuqsort.h"
// Drop any earlier THREADS definition (presumably one coming from gpuqsort.h
// -- TODO confirm) and make THREADS expand to the x-width of the running block.
#undef THREADS
#define THREADS blockDim.x
// Shared-memory scratch array declared extern and unsized, so its size is not
// fixed in this file; it has to be provided when the kernel is launched.
extern __shared__ unsigned int sarray[];
#ifdef HASATOMICS
// Device-global counter, initialised to zero. Only compiled when HASATOMICS is
// defined; it is not referenced in the code visible here.
__device__ unsigned int ohtotal = 0;
#endif
// Exchanges the two unsigned values referred to by a and b.
__device__ inline void swap(unsigned int& a, unsigned int& b)
{
    const unsigned int saved = b;
    b = a;
    a = saved;
}
// Block-wide bitonic sort of `size` values.
//
// Reads fromvalues[from .. from+size), sorts the values in ascending order in
// the shared-memory array `sarray`, and writes them to the same index range of
// tovalues.
//
// Parameters:
//   fromvalues - array the unsorted values are read from
//   tovalues   - array the sorted values are written to
//   from       - index of the first element of the range
//   size       - number of elements in the range
//
// Requirements:
//   * Contains __syncthreads(), so every thread of the block must call it with
//     the same arguments.
//   * Work is distributed over threadIdx.x in steps of THREADS (blockDim.x).
//   * `sarray` must hold at least the padded length: (from & 0xf) + size,
//     rounded up to a power of two (at least one element).
//   * Elements fromvalues[from - (from & 0xf) .. from) are read as well, so
//     they must be readable.
__device__ inline
void bitonicSort(unsigned int* fromvalues, unsigned int* tovalues, unsigned int from, unsigned int size)
{
    unsigned int* shared = sarray;

    // Move the start of the range down to a multiple of 16 elements
    // (presumably so the global accesses are aligned for coalescing -- confirm
    // against the target hardware) and grow the range accordingly.
    const unsigned int coal = from & 0xf;
    size += coal;
    from -= coal;

    // Padded length of the sorting network: the smallest power of two that is
    // >= size. Computed with integers only, so it does not depend on the
    // rounding of an approximate floating-point log2, is never larger than
    // necessary, and is well defined for size == 0.
    unsigned int sb = 1;
    while (sb < size)
        sb <<= 1;

    // Stage the (aligned) range in shared memory.
    for (unsigned int i = threadIdx.x; i < size; i += THREADS)
        shared[i] = fromvalues[i + from];
    // Overwrite the alignment prefix with the minimum key so it stays in front
    // after sorting. Index i is handled by the same thread as in the loop
    // above, so no barrier is needed between the two.
    for (unsigned int i = threadIdx.x; i < coal; i += THREADS)
        shared[i] = 0;
    // Fill the tail up to the power-of-two length with the maximum key so the
    // padding ends up behind the real data.
    for (unsigned int i = threadIdx.x + size; i < sb; i += THREADS)
        shared[i] = 0xffffffff;
    __syncthreads();

    // Bitonic network: k is the length of the sequences being merged, j the
    // distance between the two elements of a compare-exchange pair.
    for (unsigned int k = 2; k <= sb; k *= 2)
    {
        for (unsigned int j = k / 2; j > 0; j /= 2)
        {
            for (unsigned int tid = threadIdx.x; tid < sb; tid += THREADS)
            {
                const unsigned int ixj = tid ^ j;
                // Only the lower index of each pair does the work, so every
                // pair is handled by exactly one thread.
                if (ixj > tid)
                {
                    // Bit k of the index selects the direction of this run.
                    const bool ascending = (tid & k) == 0;
                    const bool outOfOrder = ascending ? (shared[tid] > shared[ixj])
                                                      : (shared[tid] < shared[ixj]);
                    if (outOfOrder)
                        swap(shared[tid], shared[ixj]);
                }
            }
            // All exchanges of this step must be visible before the next one.
            __syncthreads();
        }
    }

    // Write back the sorted values, skipping the alignment prefix. The barrier
    // that ends the last step (or the one after staging, when no step ran)
    // already ordered the shared-memory writes before these reads.
    for (unsigned int i = threadIdx.x; i < size; i += THREADS)
        if (i >= coal)
            tovalues[i + from] = shared[i];
    // Keep shared memory intact until every thread has finished reading it.
    __syncthreads();
}
__device__ inline void cumcount(unsigned int *lblock, unsigned int *rblock)
{
int tx = threadIdx.x;
int offset = 1;
__syncthreads();
for (int d = THREADS>>1; d > 0; d >>= 1) // build sum in place up