奇偶排序
可以证明奇偶排序是一种使序列逆序数降为0(全局最优)的迭代算法,当连续一轮奇排列和偶排列完成后,交换的元素个数都为0个,则排序已收敛到全局最优。
考虑到每次奇或偶排列需要LENGTH/2组比较,且在每次排序中,组之间没有依赖关系,于是在序列长度较大时,使用CUDA对其加速成为可能,将每一组比较对作为一个CUDA线程执行,于是奇排序有如下代码:
int curLocation = *tid % *length;
if(!(curLocation % 2) && curLocation != *length-1)
if(array[curLocation] > array[curLocation+1])
{
int temp = array[curLocation];
atomicExch(&array[curLocation], array[curLocation+1]);
atomicExch(&array[curLocation+1], temp);
atomicAdd(vOdd, 1);
}
偶排序有如下代码:
int curLocation = *tid % *length;
if(curLocation % 2 == 1 && curLocation != *length-1)
if(array[curLocation] > array[curLocation+1])
{
int temp = array[curLocation];
atomicExch(&array[curLocation], array[curLocation+1]);
atomicExch(&array[curLocation+1], temp);
atomicAdd(vEven, 1);
}
每个线程执行一组奇、偶排序,考虑到偶排序需要在奇排序完成之后才能执行,因此需要使用线程同步函数,由于GRID中只设置了一个BLOCK,于是线程同步函数就可以保证所有分组比较全部完成,代码如下:
//printf("4\n");
int tId = threadIdx.x + blockIdx.x * blockDim.x;
//printf("tID: %d\n", tId);
oddSort(array, length, &tId, vOdd);
// 等待Block内所有线程完成奇排序
__syncthreads();
evenSort(array, length, &tId, vEven);
// 等待Block内所有线程完成偶排序
__syncthreads();
于是整个CUDA并行加速奇偶排序算法如下所示:
#include <cuda.h>
#include <cuda_runtime_api.h>
#include <cuda_runtime.h>
#include <device_launch_parameters.h>
#include <iostream>
using namespace std;
#define LENGTH 1024
__device__ void oddSort(int *array, int *length, int* tid, int *vOdd)
{
int curLocation = *tid % *length;
if(!(curLocation % 2) && curLocation != *length-1)
if(array[curLocation] > array[curLocation+1])
{
int temp = array[curLocation];
atomicExch(&array[curLocation], array[curLocation+1]);
atomicExch(&array[curLocation+1], temp);
atomicAdd(vOdd, 1);
}
}
__device__ void evenSort(int *array, int *length, int* tid, int *vEven)
{
int curLocation = *tid % *length;
if(curLocation % 2 == 1 && curLocation != *length-1)
if(array[curLocation] > array[curLocation+1])
{
int temp = array[curLocation];
atomicExch(&array[curLocation], array[curLocation+1]);
atomicExch(&array[curLocation+1], temp);
atomicAdd(vEven, 1);
}
}
__global__ void oddEvenSort(int *array, int *length, int *vOdd, int* vEven)
{
//printf("4\n");
int tId = threadIdx.x + blockIdx.x * blockDim.x;
//printf("tID: %d\n", tId);
oddSort(array, length, &tId, vOdd);
// 等待Block内所有线程完成奇排序
__syncthreads();
evenSort(array, length, &tId, vEven);
// 等待Block内所有线程完成偶排序
__syncthreads();
}
extern "C" void hostOddEvenSort(int *array, int length, int odd, int even)
{
int *arrayDev, *lengthDev, *vOdd, *vEven;
cudaMalloc(&arrayDev, length*sizeof(int));
cudaMemcpy(arrayDev, array, length*sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc(&lengthDev, sizeof(int));
cudaMemcpy(lengthDev, &length, sizeof(int), cudaMemcpyHostToDevice);
cudaMalloc(&vOdd, sizeof(int));
cudaMalloc(&vEven, sizeof(int));
int *arrayHost, *vOddHost, *vEvenHost;
arrayHost = (int*)malloc(length*sizeof(int));
vOddHost = (int*)malloc(sizeof(int));
vEvenHost = (int*)malloc(sizeof(int));
do
{
cudaMemcpy(vOdd, &odd, sizeof(int), cudaMemcpyHostToDevice);
cudaMemcpy(vEven, &even, sizeof(int), cudaMemcpyHostToDevice);
//cout<< 2<< endl;
oddEvenSort<<<1, LENGTH>>>(arrayDev, lengthDev, vOdd, vEven);
//cout<< 3<< endl;
cudaMemcpy(vOddHost, vOdd, sizeof(int), cudaMemcpyDeviceToHost);
cudaMemcpy(vEvenHost, vEven, sizeof(int), cudaMemcpyDeviceToHost);
cout<< "odd: "<< *vOddHost<< "even: "<< *vEvenHost<< endl;
}while(*vOddHost!=0 || *vEvenHost!=0);
cudaMemcpy(arrayHost, arrayDev, length*sizeof(int), cudaMemcpyDeviceToHost);
for(int i=0; i<length; i++)
cout<< arrayHost[i]<< ",";
cout<< "."<< endl;
}
int main()
{
cout<< "begin: "<< endl;
int *array = new int[LENGTH];
// cout<< -1<< endl;
// 逆序数输入
for(int i=0; i<LENGTH; i++)
array[i] = LENGTH - i;
// cout<< 0<< endl;
// 奇偶排序后顺序输出
hostOddEvenSort(array, LENGTH, 0, 0);
return 0;
}