CUDA并行排序(4)——双调排序(二维double型数据)

最新推荐文章于 2022-03-30 00:52:41 发布

吃我老猪一钉耙

最新推荐文章于 2022-03-30 00:52:41 发布

阅读量884

点赞数 1

分类专栏： CUDA零基础 C++ 基础知识

本文链接：https://blog.csdn.net/zlf19910726/article/details/78756988

版权

C++ 基础知识同时被 2 个专栏收录

62 篇文章 1 订阅

订阅专栏

CUDA零基础

15 篇文章 1 订阅

订阅专栏

#include <math.h>
#include <stdio.h>
#include <stdlib.h>

#include <iomanip>
#include <iostream>
using namespace std;
// Terminates the program with exit code -1 when `res` (a cudaError_t returned
// by a CUDA runtime call) is not cudaSuccess, after printing where the failure
// was detected and the runtime's description of the error.
// The expansion is a plain braced block, so call sites written without a
// trailing semicolon keep compiling. `res` is evaluated exactly once.
#define CHECK(res) { cudaError_t check_err_ = (res); \
	if (check_err_ != cudaSuccess) { \
		fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, cudaGetErrorString(check_err_)); \
		exit(-1); } }
/**
 * Demo kernel: fills an 8x9 matrix with test values and then sorts its rows
 * in ascending order of each row's first element (column 0), using a bitonic
 * compare-and-swap network on a copy of the matrix held in shared memory.
 *
 * Launch requirements:
 *   - blockDim must be (8, 9, 1): threadIdx.x selects the row, threadIdx.y the
 *     column, and the shared buffer has exactly that shape (no bounds checks).
 *   - blockIdx is never used, so launch a single block.
 *
 * @param dp_out_params Device array of row pointers; every row must have room
 *                      for 9 doubles. Its previous content is overwritten by
 *                      the generated test data and, on return, by the sorted
 *                      rows.
 * @param len           Row count requested by the caller. It is only used as
 *                      a size check: when it exceeds 8 a message is printed
 *                      and the generated data is written back unsorted. The
 *                      network itself always spans 8 rows.
 */
__global__ void helloCUDA(double **dp_out_params, unsigned int len)
{
    // Fixed problem size: the sort network and the shared buffer are 8 x 9.
    const unsigned int SORT_ROWS = 8;
    const unsigned int SORT_COLS = 9;

    // One copy per thread block, visible to every thread of that block.
    __shared__ double sortbuf[SORT_ROWS][SORT_COLS];

    // Signed on purpose: (x-5) below has to be able to go negative.
    int x=threadIdx.x;
    int y=threadIdx.y;

    // Generate this thread's test value and stage it in shared memory.
    dp_out_params[x][y] = ((x-5)*(y+1)+(x+100)%7)*0.2634;
    sortbuf[x][y]=dp_out_params[x][y];

    // The whole matrix must be staged before any thread compares rows.
    __syncthreads();

    // `len` is a kernel argument, hence identical for every thread: the
    // barriers inside this branch are reached by all threads or by none.
    if (len <= SORT_ROWS)
    {
        const unsigned int row = threadIdx.x;
        const unsigned int col = threadIdx.y;

        // "k" is the sort level (size of the bitonic sub-sequences being
        // merged) and "j" the distance to the partner row in the butterfly.
        for (unsigned int k = 2; k <= SORT_ROWS; k *= 2)
        {
            for (unsigned int j = k >> 1; j > 0; j >>= 1)
            {
                // Row this thread's row is compared (and possibly exchanged) with.
                const unsigned int swap_idx = row ^ j;

                // Snapshot both keys and this thread's own cell into
                // thread-local variables BEFORE anybody writes: partner
                // threads overwrite exactly these shared locations below.
                const double my_key = sortbuf[row][0];
                const double swap_key = sortbuf[swap_idx][0];
                const double my_value = sortbuf[row][col];

                // Bit k of the row index selects the direction of the
                // sub-sequence this row belongs to. Both rows of a pair run
                // this code, so the test is mirrored for the upper row: the
                // two sides always reach the same swap decision.
                const unsigned int ascend = k * (swap_idx < row);
                const unsigned int descend = k * (swap_idx > row);
                bool swap = false;

                if ((row & k) == ascend && my_key > swap_key)
                    swap = true;

                if ((row & k) == descend && my_key < swap_key)
                    swap = true;

                // Every thread has finished reading; writes may start.
                // This barrier is deliberately outside of `if (swap)`:
                // a barrier in a divergent branch is undefined behaviour.
                __syncthreads();

                // Each side stores its own cell into the partner's row, which
                // together exchanges the two rows column by column.
                if (swap)
                    sortbuf[swap_idx][col] = my_value;

                // Make the exchanged rows visible before the next stage reads.
                __syncthreads();
            }
        }
    }
    else
    {
        // Report the oversized request once instead of once per thread.
        if (threadIdx.x == 0 && threadIdx.y == 0)
            printf("数组过长");
    }

    // Publish the (sorted) matrix back to global memory.
    dp_out_params[x][y] = sortbuf[x][y];
}

///
/**
 * Host driver for helloCUDA.
 *
 * Allocates a contiguous 8x9 matrix of doubles on the device together with a
 * device-side table of row pointers into it, launches helloCUDA on a single
 * 8x9 thread block, copies the matrix back and prints it row by row.
 *
 * @return 0 on success, -1 if a host allocation fails. Any failing CUDA call
 *         terminates the process through CHECK.
 */
int main()
{
	printf("Hello main()\n");

	cudaError_t res;
	const unsigned int ROWS = 8;
	const unsigned int COLS = 9;

	// Flat device storage for the whole matrix.
	double *d_out_params = NULL;
	res = cudaMalloc((void**)(&d_out_params), ROWS*COLS*sizeof(double));CHECK(res);

	// Device-side table of row pointers handed to the kernel.
	double **dp_out_params = NULL;
	res = cudaMalloc((void**)(&dp_out_params), ROWS*sizeof(double*));CHECK(res);

	// Host staging copy of the row-pointer table and host copy of the result.
	double **hp_out_params = (double**)malloc(ROWS*sizeof(double*));
	double *h_out_params = (double*)malloc(ROWS*COLS*sizeof(double));
	if (hp_out_params == NULL || h_out_params == NULL)
	{
		fprintf(stderr, "host allocation failed\n");
		free(hp_out_params);
		free(h_out_params);
		cudaFree(dp_out_params);
		cudaFree(d_out_params);
		return -1;
	}

	// Row r starts r*COLS doubles into the flat device buffer. These are
	// device addresses: they are only computed here, never dereferenced.
	for (unsigned int r = 0; r < ROWS; r++)
	{
		hp_out_params[r] = d_out_params + r*COLS;
	}

	res = cudaMemcpy((void*)(dp_out_params), (void*)(hp_out_params), ROWS*sizeof(double*), cudaMemcpyHostToDevice);CHECK(res);

	// One block; thread (x, y) owns matrix element [x][y].
	dim3 dimBlock( ROWS,   COLS,  1);
	dim3 dimGrid(  1,    1,  1);

	helloCUDA<<<dimGrid, dimBlock>>>(dp_out_params, ROWS);
	// A launch returns no status itself: fetch launch-configuration errors
	// explicitly, then wait so that execution errors surface here as well.
	res = cudaGetLastError();CHECK(res);
	res = cudaDeviceSynchronize();CHECK(res);

	res = cudaMemcpy((void*)(h_out_params), (void*)(d_out_params), ROWS*COLS*sizeof(double), cudaMemcpyDeviceToHost);CHECK(res);

	// Print the matrix one row per line. The buffer is indexed instead of
	// advancing h_out_params so the pointer stays valid for free() below.
	for (unsigned int i = 0; i < ROWS; i++)
	{
		for (unsigned int j = 0; j < COLS; j++)
		{
			printf("%f  ", h_out_params[i*COLS + j]);
		}
		std::cout << std::endl;
	}

	// Release everything that was allocated above.
	free(h_out_params);
	free(hp_out_params);
	res = cudaFree(dp_out_params);CHECK(res);
	res = cudaFree(d_out_params);CHECK(res);

	printf("Goodbye main()\n");
	return 0;
}

原始数据：

-0.790200  -2.107200  -3.424200  -4.741200  -6.058200  -7.375200  -8.692200  -10.009200  -11.326200  
-0.263400  -1.317000  -2.370600  -3.424200  -4.477800  -5.531400  -6.585000  -7.638600  -8.692200  
0.263400  -0.526800  -1.317000  -2.107200  -2.897400  -3.687600  -4.477800  -5.268000  -6.058200  
0.790200  0.263400  -0.263400  -0.790200  -1.317000  -1.843800  -2.370600  -2.897400  -3.424200  
1.317000  1.053600  0.790200  0.526800  0.263400  0.000000  -0.263400  -0.526800  -0.790200  
0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  
0.526800  0.790200  1.053600  1.317000  1.580400  1.843800  2.107200  2.370600  2.634000  
1.053600  1.580400  2.107200  2.634000  3.160800  3.687600  4.214400  4.741200  5.268000

排序后（各行作为整体，按每行第一个元素升序排列）：

Hello main()
-0.790200  -2.107200  -3.424200  -4.741200  -6.058200  -7.375200  -8.692200  -10.009200  -11.326200  
-0.263400  -1.317000  -2.370600  -3.424200  -4.477800  -5.531400  -6.585000  -7.638600  -8.692200  
0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  
0.263400  -0.526800  -1.317000  -2.107200  -2.897400  -3.687600  -4.477800  -5.268000  -6.058200  
0.526800  0.790200  1.053600  1.317000  1.580400  1.843800  2.107200  2.370600  2.634000  
0.790200  0.263400  -0.263400  -0.790200  -1.317000  -1.843800  -2.370600  -2.897400  -3.424200  
1.053600  1.580400  2.107200  2.634000  3.160800  3.687600  4.214400  4.741200  5.268000  
1.317000  1.053600  0.790200  0.526800  0.263400  0.000000  -0.263400  -0.526800  -0.790200  
Goodbye main()

吃我老猪一钉耙

关注

1
点赞
踩
2

收藏

觉得还不错? 一键收藏
1
评论
CUDA并行排序(4)——双调排序(二维double型数据)

#include <stdio.h> #include <iostream> #include <math.h> #include <iomanip> using namespace std; #define CHECK(res) if(res!=cudaSuccess){exit(-1);} __global__ void helloCUDA(double **dp_out_params, unsigned int len) { //在核函数内部定义的变量，没有 __shared__ 都是寄存器变量……
复制链接

扫一扫