核心代码:
__global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth)
{
// If we're too deep or there are few elements left, we use an insertion sort...
if (depth >= MAX_DEPTH || right-left <= INSERTION_SORT)
{
selection_sort(data, left, right);
return;
}
unsigned int *lptr = data+left;
unsigned int *rptr = data+right;
unsigned int pivot = data[(left+right)/2];
// Do the partitioning.
while (lptr <= rptr)
{
// Find the next left- and right-hand values to swap
unsigned int lval = *lptr;
unsigned int rval = *rptr;
// Move the left pointer as long as the pointed element is smaller than the pivot.
while (lval < pivot)
{
lptr++;
lval = *lptr;
}
// Move the right pointer as long as the pointed element is larger than the pivot.
while (rval > pivot)
{
rptr--;
rval = *rptr;
}
// If the swap points are valid, do the swap!
if (lptr <= rptr)
{
*lptr++ = rval;
*rptr-- = lval;
}
}
// Now the recursive part
int nright = rptr - data;
int nleft = lptr - data;
// Launch a new block to sort the left part.
if (left < (rptr-data))
{
cudaStream_t s;
cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
cdp_simple_quicksort<<< 1, 1, 0, s >>>(data, left, nright, depth+1);
cudaStreamDestroy(s);
}
// Launch a new block to sort the right part.
if ((lptr-data) < right)
{
cudaStream_t s1;
cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
cdp_simple_quicksort<<< 1, 1, 0, s1 >>>(data, nleft, right, depth+1);
cudaStreamDestroy(s1);
}
}
完整代码:
/*
* Copyright 1993-2015 NVIDIA Corporation. All rights reserved.
*
* Please refer to the NVIDIA end user license agreement (EULA) associated
* with this source code for terms and conditions that govern your use of
* this software. Any use, reproduction, disclosure, or distribution of
* this software and related documentation outside the terms of the EULA
* is strictly prohibited.
*
*/
#include <iostream>
#include <cstdio>
#include <helper_cuda.h>
#include <helper_string.h>
#define MAX_DEPTH 16
#define INSERTION_SORT 32
// Selection sort used when depth gets too big or the number of elements drops
// below a threshold.
__device__ void selection_sort(unsigned int *data, int left, int right)
{
for (int i = left ; i <= right ; ++i)
{
unsigned min_val = data[i];
int min_idx = i;
// Find the smallest value in the range [left, right].
for (int j = i+1 ; j <= right ; ++j)
{
unsigned val_j = data[j];
if (val_j < min_val)
{
min_idx = j;
min_val = val_j;
}
}
// Swap the values.
if (i != min_idx)
{
data[min_idx] = data[i];
data[i] = min_val;
}
}
}
// Very basic quicksort algorithm, recursively launching the next level.
__global__ void cdp_simple_quicksort(unsigned int *data, int left, int right, int depth)
{
// If we're too deep or there are few elements left, we use an insertion sort...
if (depth >= MAX_DEPTH || right-left <= INSERTION_SORT)
{
selection_sort(data, left, right);
return;
}
unsigned int *lptr = data+left;
unsigned int *rptr = data+right;
unsigned int pivot = data[(left+right)/2];
// Do the partitioning.
while (lptr <= rptr)
{
// Find the next left- and right-hand values to swap
unsigned int lval = *lptr;
unsigned int rval = *rptr;
// Move the left pointer as long as the pointed element is smaller than the pivot.
while (lval < pivot)
{
lptr++;
lval = *lptr;
}
// Move the right pointer as long as the pointed element is larger than the pivot.
while (rval > pivot)
{
rptr--;
rval = *rptr;
}
// If the swap points are valid, do the swap!
if (lptr <= rptr)
{
*lptr++ = rval;
*rptr-- = lval;
}
}
// Now the recursive part
int nright = rptr - data;
int nleft = lptr - data;
// Launch a new block to sort the left part.
if (left < (rptr-data))
{
cudaStream_t s;
cudaStreamCreateWithFlags(&s, cudaStreamNonBlocking);
cdp_simple_quicksort<<< 1, 1, 0, s >>>(data, left, nright, depth+1);
cudaStreamDestroy(s);
}
// Launch a new block to sort the right part.
if ((lptr-data) < right)
{
cudaStream_t s1;
cudaStreamCreateWithFlags(&s1, cudaStreamNonBlocking);
cdp_simple_quicksort<<< 1, 1, 0, s1 >>>(data, nleft, right, depth+1);
cudaStreamDestroy(s1);
}
}
// Call the quicksort kernel from the host.
void run_qsort(unsigned int *data, unsigned int nitems)
{
// Prepare CDP for the max depth 'MAX_DEPTH'.
checkCudaErrors(cudaDeviceSetLimit(cudaLimitDevRuntimeSyncDepth, MAX_DEPTH));
// Launch on device
int left = 0;
int right = nitems-1;
std::cout << "Launching kernel on the GPU" << std::endl;
cdp_simple_quicksort<<< 1, 1 >>>(data, left, right, 0);
checkCudaErrors(cudaDeviceSynchronize());
}
// Initialize data on the host.
void initialize_data(unsigned int *dst, unsigned int nitems)
{
// Fixed seed for illustration
srand(2047);
// Fill dst with random values
for (unsigned i = 0 ; i < nitems ; i++)
dst[i] = rand() % nitems ;
}
// Verify the results.
void check_results(int n, unsigned int *results_d)
{
unsigned int *results_h = new unsigned[n];
checkCudaErrors(cudaMemcpy(results_h, results_d, n*sizeof(unsigned), cudaMemcpyDeviceToHost));
for (int i = 1 ; i < n ; ++i)
if (results_h[i-1] > results_h[i])
{
std::cout << "Invalid item[" << i-1 << "]: " << results_h[i-1] << " greater than " << results_h[i] << std::endl;
exit(EXIT_FAILURE);
}
std::cout << "OK" << std::endl;
delete[] results_h;
}
// Main entry point.
int main(int argc, char **argv)
{
int num_items = 128;
bool verbose = false;
if (checkCmdLineFlag(argc, (const char **)argv, "help") ||
checkCmdLineFlag(argc, (const char **)argv, "h"))
{
std::cerr << "Usage: " << argv[0] << " num_items=<num_items>\twhere num_items is the number of items to sort" << std::endl;
exit(EXIT_SUCCESS);
}
if (checkCmdLineFlag(argc, (const char **)argv, "v"))
{
verbose = true;
}
if (checkCmdLineFlag(argc, (const char **)argv, "num_items"))
{
num_items = getCmdLineArgumentInt(argc, (const char **)argv, "num_items");
if (num_items < 1)
{
std::cerr << "ERROR: num_items has to be greater than 1" << std::endl;
exit(EXIT_FAILURE);
}
}
// Find/set device and get device properties
int device = -1;
cudaDeviceProp deviceProp;
device = findCudaDevice(argc, (const char **)argv);
checkCudaErrors(cudaGetDeviceProperties(&deviceProp, device));
if (!(deviceProp.major > 3 || (deviceProp.major == 3 && deviceProp.minor >= 5)))
{
printf("GPU %d - %s does not support CUDA Dynamic Parallelism\n Exiting.", device, deviceProp.name);
exit(EXIT_WAIVED);
}
// Create input data
unsigned int *h_data = 0;
unsigned int *d_data = 0;
// Allocate CPU memory and initialize data.
std::cout << "Initializing data:" << std::endl;
h_data =(unsigned int *)malloc(num_items*sizeof(unsigned int));
initialize_data(h_data, num_items);
if (verbose)
{
for (int i=0 ; i<num_items ; i++)
std::cout << "Data [" << i << "]: " << h_data[i] << std::endl;
}
// Allocate GPU memory.
checkCudaErrors(cudaMalloc((void **)&d_data, num_items * sizeof(unsigned int)));
checkCudaErrors(cudaMemcpy(d_data, h_data, num_items * sizeof(unsigned int), cudaMemcpyHostToDevice));
// Execute
std::cout << "Running quicksort on " << num_items << " elements" << std::endl;
run_qsort(d_data, num_items);
// Check result
std::cout << "Validating results: ";
check_results(num_items, d_data);
free(h_data);
checkCudaErrors(cudaFree(d_data));
exit(EXIT_SUCCESS);
}
选自cuda 自带的例子。