CUDA Parallel Sorting (4): Bitonic Sort on 2D double Data

This installment extends the bitonic sort to a two-dimensional double matrix: a single thread block sorts the 8 rows in shared memory, using each row's first element as the sort key.

#include <stdio.h>
#include <iostream>
#include <math.h>
#include <iomanip>
using namespace std;
#define CHECK(res) if(res!=cudaSuccess){exit(-1);}
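One caveat about the CHECK macro: it exits silently on failure, which makes problems hard to diagnose. A more talkative variant is sketched below (my addition, not part of the original post); cudaGetErrorString is the standard CUDA runtime call for turning an error code into readable text:

// Hypothetical verbose variant of CHECK for debugging (not in the original):
#define CHECK_VERBOSE(res)                                              \
	if ((res) != cudaSuccess) {                                         \
		fprintf(stderr, "CUDA error: %s\n", cudaGetErrorString(res));   \
		exit(-1);                                                       \
	}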
__global__ void helloCUDA(double **dp_out_params, unsigned int len)
{

	// Variables defined inside a kernel without __shared__ live in registers:
	// each thread has its own private copy, independent of the others.
	//unsigned int offset = 0;
	// A __shared__ variable has one instance per thread block: blocks do not
	// see each other's copy, but all threads within a block share it.
    //__shared__ double sortbuf[8][9];     // Max of 1024 elements - TODO: make this dynamic
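	// Hypothetical mini-example of the distinction, using this kernel's own
	// variables: x and y below live in registers (one private copy per
	// thread), while sortbuf is declared __shared__ and is the single copy
	// that all 8x9 threads of the block read and write together.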


    int x = threadIdx.x;  // row index
    int y = threadIdx.y;  // column index
    // Fill the test matrix: each thread computes its own element.
    dp_out_params[x][y] = ((x-5)*(y+1)+(x+100)%7)*0.2634;

    // Stage the data in shared memory so the block can sort it in place.
    __shared__ double sortbuf[8][9];
    sortbuf[x][y] = dp_out_params[x][y];
    // Make sure every thread has staged its element before the sort begins.
    __syncthreads();


    // Now the sort loops.
    // Here, "k" is the sort level (bitonic sort is a multi-level,
    // butterfly-style sorting network) and "j" is the stride to the partner
    // element within the butterfly. Each butterfly is handled by two threads,
    // because both reads and both writes have to happen in the same pass.
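    // Hypothetical walk-through for len = 8: with k = 4, j = 2 the XOR
    // partner swap_idx = threadIdx.x ^ 2 pairs rows (0,2), (1,3), (4,6),
    // (5,7); bit k of the row index then decides whether each pair sorts
    // ascending or descending, which builds the bitonic runs that the
    // later (smaller-j) passes merge into fully sorted order.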
    if(len<=8)
    {
    	 for (unsigned int k=2; k<=8; k*=2) // Butterfly stride increments in powers of 2
    	    {
	            for (unsigned int j=k>>1; j>0; j>>=1) // Strides also in powers of two, up to <k
    	        {
    	            unsigned int swap_idx = threadIdx.x ^ j; // Row we compare-and-swap with

    	            // Read into registers BEFORE the barrier below; the original
    	            // post kept pointers into shared memory here, which lets the
    	            // swap-phase writes race against these reads.
    	            double my_val   = sortbuf[threadIdx.x][threadIdx.y]; // my column element
    	            double my_key   = sortbuf[threadIdx.x][0];           // sort key: row's first element
    	            double swap_key = sortbuf[swap_idx][0];              // partner row's key

    	            __syncthreads(); // all reads complete before any thread writes

    	            // The k'th bit of my thread ID (and hence my sort item ID)
    	            // determines whether this pair sorts ascending or descending.
    	            // However, since threads read from both the top AND the bottom
    	            // of the butterfly, if my ID is > swap_idx then "ascending"
    	            // means mine < swap. (The original sample also treats any
    	            // out-of-range element as the largest value; with a fixed
    	            // 8-row block that case never arises here.)
    	            // Confusing? It saves us two writes though.
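    	            // Worked example (hypothetical values): with k = 2 the rows
    	            // pair up as (0,1), (2,3), (4,5), (6,7); pairs whose indices
    	            // have bit 1 clear sort ascending and the rest descending,
    	            // yielding alternating 2-element bitonic runs.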
    	            unsigned int ascend = k * (swap_idx < threadIdx.x);
    	            unsigned int descend = k * (swap_idx > threadIdx.x);
    	            bool swap = false;

    	            // Rows are compared by their first-column key only, so all
    	            // COLS threads of a row reach the same swap decision.
    	            if ((threadIdx.x & k) == ascend)
    	            {
    	                if (my_key > swap_key)
    	                    swap = true;
    	            }

    	            if ((threadIdx.x & k) == descend)
    	            {
    	                if (my_key < swap_key)
    	                    swap = true;
    	            }

    	            // If we had to swap, write my row's element into the partner
    	            // row's slot; thread (x, y) moves column y, so the block as a
    	            // whole exchanges entire rows.
    	            if (swap)
    	            {
    	                sortbuf[swap_idx][threadIdx.y] = my_val;
    	            }

    	            // Synchronize OUTSIDE the divergent if(swap) branch; calling
    	            // __syncthreads() inside divergent code is undefined behavior.
    	            __syncthreads();
    	        }//for()

    	    }//for()
    }//if()
    else
    {
    	printf("array too long\n");
    }


    // Write the sorted rows back to global memory.
    dp_out_params[x][y] = sortbuf[x][y];
}
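For sanity-checking the kernel, a small host-side reference is handy: sort the rows on the CPU by their first-column key and compare against what the GPU produced. The sketch below is my own addition (the helper name cpu_sort_rows_by_key and the use of std::sort are assumptions, not part of the original program):

#include <algorithm>
#include <vector>

// Hypothetical CPU reference: reorder the rows of a flat rows x cols matrix
// by each row's first element, mirroring the kernel's row-swap network.
static bool row_key_less(const std::vector<double> &a, const std::vector<double> &b)
{
	return a[0] < b[0];  // ascending by the first-column key
}

void cpu_sort_rows_by_key(double *m, unsigned int rows, unsigned int cols)
{
	std::vector<std::vector<double> > tmp(rows, std::vector<double>(cols));
	for (unsigned int r = 0; r < rows; r++)
		for (unsigned int c = 0; c < cols; c++)
			tmp[r][c] = m[r*cols + c];
	std::sort(tmp.begin(), tmp.end(), row_key_less);
	for (unsigned int r = 0; r < rows; r++)
		for (unsigned int c = 0; c < cols; c++)
			m[r*cols + c] = tmp[r][c];
}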

///
int main()
{
	printf("Hello main()\n");

	cudaError_t res;
	///
	unsigned int ROWS = 8;
	unsigned int COLS = 9;




	// Flat device buffer that actually holds the ROWS x COLS matrix.
	double *d_out_params = NULL;
	res = cudaMalloc((void**)(&d_out_params), ROWS*COLS*sizeof(double));CHECK(res)

	// Device-side table of row pointers, so the kernel can write dp_out_params[x][y].
	double **dp_out_params = NULL;
	res = cudaMalloc((void**)(&dp_out_params), ROWS*sizeof(double*));CHECK(res)

	// Host-side staging copy of the row-pointer table.
	double **hp_out_params = NULL;
	hp_out_params = (double**)malloc(ROWS*sizeof(double*));

	// Host buffer for copying the result back.
	double *h_out_params = NULL;
	h_out_params = (double*)malloc(ROWS*COLS*sizeof(double));

	// Point each row pointer at the corresponding row of the flat device buffer.
	for (unsigned int r = 0; r < ROWS; r++)
	{
		hp_out_params[r] = d_out_params + r*COLS;
	}

	// Upload the row-pointer table to the device.
	res = cudaMemcpy((void*)(dp_out_params), (void*)(hp_out_params), ROWS*sizeof(double*), cudaMemcpyHostToDevice);CHECK(res)


	// One block of ROWS x COLS threads: thread (x, y) owns element [x][y].
	dim3 dimBlock( ROWS,   COLS,  1);
	dim3 dimGrid(  1,    1,  1);

	helloCUDA<<<dimGrid, dimBlock>>>(dp_out_params, ROWS);
	cudaDeviceSynchronize();

	// Copy the sorted matrix back through the flat buffer.
	res = cudaMemcpy((void*)(h_out_params), (void*)(d_out_params), ROWS*COLS*sizeof(double), cudaMemcpyDeviceToHost);CHECK(res)

    for (unsigned int i=0;i<ROWS;i++) // print the sorted matrix
    {
       for (unsigned int j=0;j<COLS;j++)
       {
    	   // h_out_params is a flat buffer, so index it as i*COLS + j;
    	   // h_out_params[i][j] would not compile on a plain double*
    	   // ("expression must have pointer-to-object type").
    	   printf("%f  ", h_out_params[i*COLS+j]);
       }
       cout<<endl;
    }

    printf("Goodbye main()\n");
    return 0;
}
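To build and run the program, a typical invocation looks like the following (the source file name bitonic2d.cu is my assumption):

nvcc -o bitonic2d bitonic2d.cu
./bitonic2d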



Original data:

-0.790200  -2.107200  -3.424200  -4.741200  -6.058200  -7.375200  -8.692200  -10.009200  -11.326200  
-0.263400  -1.317000  -2.370600  -3.424200  -4.477800  -5.531400  -6.585000  -7.638600  -8.692200  
0.263400  -0.526800  -1.317000  -2.107200  -2.897400  -3.687600  -4.477800  -5.268000  -6.058200  
0.790200  0.263400  -0.263400  -0.790200  -1.317000  -1.843800  -2.370600  -2.897400  -3.424200  
1.317000  1.053600  0.790200  0.526800  0.263400  0.000000  -0.263400  -0.526800  -0.790200  
0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  
0.526800  0.790200  1.053600  1.317000  1.580400  1.843800  2.107200  2.370600  2.634000  
1.053600  1.580400  2.107200  2.634000  3.160800  3.687600  4.214400  4.741200  5.268000 



After sorting (rows reordered so the first-column keys ascend):

Hello main()
-0.790200  -2.107200  -3.424200  -4.741200  -6.058200  -7.375200  -8.692200  -10.009200  -11.326200  
-0.263400  -1.317000  -2.370600  -3.424200  -4.477800  -5.531400  -6.585000  -7.638600  -8.692200  
0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  
0.263400  -0.526800  -1.317000  -2.107200  -2.897400  -3.687600  -4.477800  -5.268000  -6.058200  
0.526800  0.790200  1.053600  1.317000  1.580400  1.843800  2.107200  2.370600  2.634000  
0.790200  0.263400  -0.263400  -0.790200  -1.317000  -1.843800  -2.370600  -2.897400  -3.424200  
1.053600  1.580400  2.107200  2.634000  3.160800  3.687600  4.214400  4.741200  5.268000  
1.317000  1.053600  0.790200  0.526800  0.263400  0.000000  -0.263400  -0.526800  -0.790200  
Goodbye main()











