由于奇偶冒泡排序分为奇数下标和偶数下标两个阶段交替进行,这就确保了在一轮排序(奇阶段或偶阶段)过程中,每隔一个元素处理一对相邻元素,同一轮排序中被处理的各对元素互不影响,所以可以并行处理。事实上,奇偶冒泡排序在串行 CPU 上本来没有任何优势,它就是为并行而生的。
- #include <cuda.h>
- #include <time.h>
- #include <stdio.h>
- #include <stdlib.h>
- #include <math.h>
- #include <pz_cpy>
- #include "cuPrintf.cu"
/**
 * Enumerates the CUDA devices visible to this process and prints each name.
 *
 * Each device is made current (cudaSetDevice) before its properties are
 * queried, so on success the last enumerated device is left as the current
 * device for the CUDA calls that follow.
 *
 * @return true when at least one device exists and every device could be
 *         selected and queried; false otherwise.
 */
bool InitGPUSet()
{
    int num = 0;
    // A successful query that reports zero devices is still unusable.
    if (cudaSuccess != cudaGetDeviceCount(&num) || num <= 0)
    {
        return false;
    }

    for (int i = 0; i < num; ++i)
    {
        cudaDeviceProp tCard;
        // Do not print tCard.name unless the query actually filled it in.
        if (cudaSuccess != cudaSetDevice(i) ||
            cudaSuccess != cudaGetDeviceProperties(&tCard, i))
        {
            return false;
        }
        printf("GPU: %s\n", tCard.name);
    }
    return true;
}
/**
 * Initialises the cuPrintf device-side print buffer.
 *
 * @return true when cudaPrintfInit() reports cudaSuccess, false otherwise.
 */
bool InitCuPrint()
{
    // Compare the status code itself rather than its human-readable text.
    return cudaSuccess == cudaPrintfInit();
}
/**
 * One compare-exchange phase over the pairs (0,1), (2,3), (4,5), ...
 *
 * Every thread with an even threadIdx.x compares ary[tid] with ary[tid + 1]
 * and swaps them when they are out of order. The pairs handled by different
 * threads are disjoint, so the swaps do not interfere with each other.
 *
 * Launch layout: only threadIdx.x is used for indexing, so the kernel must be
 * launched with a single one-dimensional block that has at least size - 1
 * threads.
 *
 * @param ary  device pointer to the array being sorted (modified in place)
 * @param size number of elements in ary
 */
__global__ void even_sort(int* ary, int size)
{
    int tid = threadIdx.x;  // thread indices start at 0
    if (0 == tid % 2)       // this thread owns the pair (tid, tid + 1)
    {
        // Debug trace. Other threads may be swapping their own pairs while
        // this dump runs, so it is only an approximate snapshot.
        cuPrintf("before swap %d\n", tid);
        for (int i = 0; i < size; ++i) cuPrintf("%d ", ary[i]);
        cuPrintf("\n");

        // Bounds check first: ary[tid + 1] must not be read when the pair
        // would extend past the end of the array.
        if (tid + 1 < size && ary[tid] > ary[tid + 1])
        {
            int tp = ary[tid];
            ary[tid] = ary[tid + 1];
            ary[tid + 1] = tp;

            cuPrintf("after swap %d\n", tid);
            for (int i = 0; i < size; ++i) cuPrintf("%d ", ary[i]);
            cuPrintf("\n");
        }
    }
}
/**
 * One compare-exchange phase over the pairs (1,2), (3,4), (5,6), ...
 *
 * Every thread with an odd threadIdx.x compares ary[tid] with ary[tid + 1]
 * and swaps them when they are out of order. The pairs handled by different
 * threads are disjoint, so the swaps do not interfere with each other.
 *
 * Launch layout: only threadIdx.x is used for indexing, so the kernel must be
 * launched with a single one-dimensional block that has at least size - 1
 * threads.
 *
 * @param ary  device pointer to the array being sorted (modified in place)
 * @param size number of elements in ary
 */
__global__ void odd_sort(int* ary, int size)
{
    int tid = threadIdx.x;  // thread indices start at 0

    // Only odd-indexed threads own a pair in this phase. The bounds check
    // comes before the comparison so ary[tid + 1] is never read past the end.
    if (1 == tid % 2 && tid + 1 < size && ary[tid] > ary[tid + 1])
    {
        int tp = ary[tid];
        ary[tid] = ary[tid + 1];
        ary[tid + 1] = tp;
    }
}
/**
 * Reports a failed CUDA runtime call on stderr.
 *
 * @param err  status returned by the call
 * @param what short label for the call, used in the message
 * @return true when err is cudaSuccess, false otherwise
 */
static bool cudaOk(cudaError_t err, const char* what)
{
    if (cudaSuccess == err) return true;
    fprintf(stderr, "%s failed: %s\n", what, cudaGetErrorString(err));
    return false;
}

/**
 * Fills an array with random values in [0, 100), prints it, sorts it on the
 * GPU by alternating even_sort / odd_sort phases, and prints the result.
 *
 * @return EXIT_SUCCESS when the array was sorted and printed, EXIT_FAILURE
 *         when device setup or any CUDA call failed.
 */
int main(void)
{
    const int SIZE = 27;
    const int ROUNDS = SIZE;       // each round runs one even and one odd phase
    const int THREAD_SIZE = SIZE;  // one thread per element, single block

    if (!InitGPUSet())
    {
        puts("GPU failed");
        return EXIT_FAILURE;
    }
    if (!InitCuPrint())
    {
        puts("cuPrintf failed");
        return EXIT_FAILURE;
    }

    int ary[SIZE];
    srand((unsigned int) time(0));
    for (int i = 0; i < SIZE; ++i) ary[i] = rand() % 100;
    for (int i = 0; i < SIZE; ++i) printf("%d ", ary[i]);
    puts("");

    int* gary = NULL;
    bool ok = cudaOk(cudaMalloc((void**) &gary, SIZE * sizeof(int)), "cudaMalloc")
           && cudaOk(cudaMemcpy(gary, ary, SIZE * sizeof(int), cudaMemcpyHostToDevice),
                     "cudaMemcpy (host to device)");

    for (int i = 0; ok && i < ROUNDS; ++i)
    {
        // Kernel launches return no status; launch errors are picked up
        // through cudaGetLastError().
        even_sort<<<1, THREAD_SIZE, 0>>>(gary, SIZE);
        ok = cudaOk(cudaGetLastError(), "even_sort launch");
        if (!ok) break;

        odd_sort<<<1, THREAD_SIZE, 0>>>(gary, SIZE);
        ok = cudaOk(cudaGetLastError(), "odd_sort launch");
    }

    // The blocking copy waits for the queued kernels to finish before it
    // reads the result back.
    ok = ok && cudaOk(cudaMemcpy(ary, gary, SIZE * sizeof(int), cudaMemcpyDeviceToHost),
                      "cudaMemcpy (device to host)");
    if (ok)
    {
        for (int i = 0; i < SIZE; ++i) printf("%d ", ary[i]);
        puts("");
    }

    cudaFree(gary);   // gary is still NULL if the allocation failed
    cudaPrintfEnd();  // release the buffer set up by InitCuPrint()
    return ok ? EXIT_SUCCESS : EXIT_FAILURE;
}