串行版本源码如下:
#include "stdio.h"
#include "iostream"
using namespace std;
bool ASCENDING=true;
bool DESCENDING=false;
bool dir=true;
void bitonicSort(int lo,int n,bool dir);
void bitonicMerge(int lo,int n,bool dir);
void exchange(int i,int j);
void compare(int i,int j,bool dir);
//input array
int a[8]={5,4,2,1,6,3,8,7};
int b[8]={5,4,2,1,6,3,8,7};
int N=8;
// Recursively sorts n elements of the global array `a`, starting at index lo.
//
//   lo  - index of the first element of the run
//   n   - number of elements in the run
//   dir - direction handed to the final merge; true makes compare() swap
//         pairs where the left value is larger (ascending result)
//
// The first half is sorted ascending and the second half descending, after
// which bitonicMerge() combines the two halves in direction `dir`.
// Note: both halves have length n/2 (integer division), so for an odd n the
// last element of the run is not covered by either recursive call.
void bitonicSort(int lo,int n,bool dir)
{
    // Runs of zero or one element need no work.
    if (n <= 1)
        return;

    const int half = n / 2;

    bitonicSort(lo, half, ASCENDING);          // lower half
    bitonicSort(lo + half, half, DESCENDING);  // upper half, opposite order
    bitonicMerge(lo, n, dir);
}
// Merges n elements of the global array `a`, starting at index lo, in the
// direction given by `dir`.
//
//   lo  - index of the first element of the run
//   n   - number of elements in the run
//   dir - direction forwarded to compare() and to the recursive merges
//
// Each element of the lower half is compared (and possibly exchanged) with
// the element n/2 positions above it; both halves are then merged
// recursively with the same direction.
void bitonicMerge(int lo,int n,bool dir)
{
    // Nothing to merge for runs shorter than two elements.
    if (n <= 1)
        return;

    const int half = n / 2;
    const int mid = lo + half;

    // Walk the lower half and its partner in the upper half in lock-step.
    for (int lower = lo, upper = mid; lower < mid; ++lower, ++upper)
        compare(lower, upper, dir);

    bitonicMerge(lo, half, dir);
    bitonicMerge(mid, half, dir);
}
// Conditionally exchanges a[i] and a[j] in the global array `a`.
//
//   i, j - indices of the two elements to examine
//   dir  - when true, the pair is exchanged if a[i] > a[j];
//          when false, the pair is exchanged if a[i] <= a[j]
void compare(int i,int j,bool dir)
{
    const bool leftIsGreater = a[i] > a[j];

    // Only exchange when the comparison result matches the direction flag.
    if (leftIsGreater != dir)
        return;

    exchange(i, j);
}
// Swaps the values stored at indices i and j of the global array `a`.
// Calling it with i == j leaves the array unchanged.
void exchange(int i,int j)
{
    // Read both values first, then write them back crosswise.
    const int valueAtI = a[i];
    const int valueAtJ = a[j];

    a[i] = valueAtJ;
    a[j] = valueAtI;
}
void main()
{
bitonicSort(0,8,ASCENDING);
for (int i=0;i < 8; i++)
{
cout<
<
<= N ;k=2*k)
{
for (j=k>>1;j > 0;j = j>>1)
{
for (i = 0;i < N;i++)
{
int ixj=i^j;
if((ixj > i))
{
if((i&k)==0 && b[i] > b[ixj])
{
tem=b[i];
b[i]=b[ixj];
b[ixj]=tem;
}
if((i&k)!=0 && b[i] < b[ixj])
{
tem=b[i];
b[i]=b[ixj];
b[ixj]=tem;
}
}
}
}
}
for (int i=0;i < 8; i++)
{
cout<
<
并行(cuda)版本源码如下:
#include "bitonic_sort.cuh"
#include "common.cuh"
extern float sum;
__global__ void bitonic_sort_step(float *dev_values, int j, int k)
{
unsigned int i, ixj; /* Sorting partners: i and ixj */
i = threadIdx.x + blockDim.x * blockIdx.x;
ixj = i^j;
/* The threads with the lowest ids sort the array. */
if ((ixj)>i) {
if ((i&k)==0) {
/* Sort ascending */
if (dev_values[i]>dev_values[ixj]) {
/* exchange(i,ixj); */
float temp = dev_values[i];
dev_values[i] = dev_values[ixj];
dev_values[ixj] = temp;
}
}
if ((i&k)!=0) {
/* Sort descending */
if (dev_values[i]
<<="1)" minor (j="k">
>1; j>0; j=j>>1) {
bitonic_sort_step<<
>>(dev_values, j, k);
}
}
cudaEventRecord(stop, 0);
cudaEventSynchronize(stop);
float elapsedTime;
cudaEventElapsedTime(&elapsedTime, start, stop);
printf("kernel耗时:%f毫秒\n",elapsedTime);
sum = sum + elapsedTime;
cudaMemcpy(values, dev_values, size, cudaMemcpyDeviceToHost);
cudaFree(dev_values);
}