#include <stdio.h>
#include<iostream>
#include<math.h>
#include <iomanip>
using namespace std;
// CHECK(res): terminates the program with exit(-1) when `res` (a cudaError_t
// expression) is not cudaSuccess, after reporting the CUDA error text together
// with the source location on stderr.
// The argument is evaluated exactly once. The expansion is a self-contained
// braced block, so it works both as `CHECK(res)` and as `CHECK(res);`.
// Do not use it as the unbraced body of an if/else with a trailing semicolon.
#define CHECK(res)                                                          \
    {                                                                       \
        cudaError_t check_status_ = (res);                                  \
        if (check_status_ != cudaSuccess)                                   \
        {                                                                   \
            fprintf(stderr, "CUDA error %s:%d: %s\n", __FILE__, __LINE__,   \
                    cudaGetErrorString(check_status_));                     \
            exit(-1);                                                       \
        }                                                                   \
    }
/*
 * helloCUDA: fills a table with generated values and sorts its rows in
 * ascending order of their first column, using a bitonic network that runs
 * entirely inside one thread block.
 *
 * Launch layout: threadIdx.x selects the row and threadIdx.y the column.
 *   blockDim.x must be exactly 8 (every row needs its partner thread) and
 *   blockDim.y must not exceed 9; other shapes are rejected with a message.
 * Shared memory: one static 8 x 9 tile of doubles.
 *
 * dp_out_params  device array of row pointers; row x must provide at least
 *                blockDim.y doubles. Each entry is first overwritten with the
 *                generated value and finally with the sorted result.
 * len            row count announced by the caller. It is only checked
 *                against the 8-row capacity of the tile; when it fits, the
 *                network always spans all 8 rows. When it does not fit, a
 *                message is printed and the generated data is left unsorted.
 */
__global__ void helloCUDA(double **dp_out_params, unsigned int len)
{
    const unsigned int kRows = 8;
    const unsigned int kCols = 9;

    // The condition is uniform across the block, so returning here cannot
    // leave other threads waiting at a barrier.
    if (blockDim.x != kRows || blockDim.y > kCols)
    {
        if (threadIdx.x == 0 && threadIdx.y == 0)
        {
            printf("helloCUDA: unsupported block shape %u x %u\n", blockDim.x, blockDim.y);
        }
        return;
    }

    // Plain locals are private to each thread.
    const int x = threadIdx.x;
    const int y = threadIdx.y;

    // Shared by all threads of the block; one slot per (row, column).
    __shared__ double sortbuf[kRows][kCols];

    const double generated = ((x - 5) * (y + 1) + (x + 100) % 7) * 0.2634;
    dp_out_params[x][y] = generated;
    sortbuf[x][y] = generated;
    // The whole tile must be populated before any thread reads a partner row.
    __syncthreads();

    if (len <= kRows)
    {
        // k is the size of the bitonic sequences being merged,
        // j is the distance to the partner row inside the current merge step.
        for (unsigned int k = 2; k <= kRows; k *= 2)
        {
            for (unsigned int j = k >> 1; j > 0; j >>= 1)
            {
                const unsigned int partner = threadIdx.x ^ j;

                // Snapshot everything this step needs into registers. Rows are
                // ordered by column 0, so every thread of a row takes the same
                // decision, and each thread moves only its own column.
                const double my_key = sortbuf[x][0];
                const double partner_key = sortbuf[partner][0];
                const double my_val = sortbuf[x][y];

                // Bit k of the row index selects the direction of this merge;
                // the lower row of a pair keeps the smaller key when ascending
                // and the larger key when descending (and vice versa for the
                // upper row).
                const bool is_lower = threadIdx.x < partner;
                const bool descending = (threadIdx.x & k) != 0;
                const bool keep_smaller = (is_lower != descending);
                const bool swap = keep_smaller ? (my_key > partner_key)
                                               : (my_key < partner_key);

                // Every read of this step must complete before any slot is
                // overwritten. The barrier stays outside the divergent branch.
                __syncthreads();
                if (swap)
                {
                    // Both rows of the pair reach the same verdict, so each
                    // thread stores its own value into the partner's slot.
                    sortbuf[partner][y] = my_val;
                }
                // Make the exchanged values visible to the next step.
                __syncthreads();
            }
        }
    }
    else
    {
        // Report once per block instead of once per thread.
        if (x == 0 && y == 0)
        {
            printf("数组过长");
        }
    }

    dp_out_params[x][y] = sortbuf[x][y];
}
///
/*
 * Host driver: allocates an 8 x 9 table on the device together with a device
 * array of row pointers into it, runs helloCUDA on a single 8 x 9 block,
 * copies the table back and prints it row by row.
 *
 * Returns 0 on success and -1 when a host allocation fails. CUDA failures
 * terminate the program through CHECK.
 */
int main()
{
    printf("Hello main()\n");

    const unsigned int ROWS = 8;
    const unsigned int COLS = 9;
    cudaError_t res;

    // Contiguous device storage for the whole table.
    double *d_out_params = NULL;
    res = cudaMalloc((void**)(&d_out_params), ROWS*COLS*sizeof(double)); CHECK(res);

    // Device-resident array of row pointers handed to the kernel.
    double **dp_out_params = NULL;
    res = cudaMalloc((void**)(&dp_out_params), ROWS*sizeof(double*)); CHECK(res);

    // Host staging: the row-pointer table and the result buffer.
    double **hp_out_params = (double**)malloc(ROWS*sizeof(double*));
    double *h_out_params = (double*)malloc(ROWS*COLS*sizeof(double));
    if (hp_out_params == NULL || h_out_params == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        free(hp_out_params);
        free(h_out_params);
        cudaFree(dp_out_params);
        cudaFree(d_out_params);
        return -1;
    }

    // The row pointers are device addresses; they are only computed on the
    // host and never dereferenced here.
    for (unsigned int r = 0; r < ROWS; r++)
    {
        hp_out_params[r] = d_out_params + r*COLS;
    }
    res = cudaMemcpy((void*)(dp_out_params), (void*)(hp_out_params), ROWS*sizeof(double*), cudaMemcpyHostToDevice); CHECK(res);

    dim3 dimBlock(ROWS, COLS, 1);
    dim3 dimGrid(1, 1, 1);
    helloCUDA<<<dimGrid, dimBlock>>>(dp_out_params, ROWS);
    // A launch does not return a status: fetch configuration errors explicitly,
    // then wait so that faults raised while the kernel runs surface here.
    res = cudaGetLastError(); CHECK(res);
    res = cudaDeviceSynchronize(); CHECK(res);

    res = cudaMemcpy((void*)(h_out_params), (void*)(d_out_params), ROWS*COLS*sizeof(double), cudaMemcpyDeviceToHost); CHECK(res);

    // Print the table; indexing (rather than advancing the pointer) keeps
    // h_out_params valid for the free() below.
    for (unsigned int i = 0; i < ROWS; i++)
    {
        for (unsigned int j = 0; j < COLS; j++)
        {
            printf("%f ", h_out_params[i*COLS + j]);
        }
        printf("\n");
    }

    free(h_out_params);
    free(hp_out_params);
    res = cudaFree(dp_out_params); CHECK(res);
    res = cudaFree(d_out_params); CHECK(res);

    printf("Goodbye main()\n");
    return 0;
}
/*
Sample run, kept for reference.

Original data (before sorting):
-0.790200 -2.107200 -3.424200 -4.741200 -6.058200 -7.375200 -8.692200 -10.009200 -11.326200
-0.263400 -1.317000 -2.370600 -3.424200 -4.477800 -5.531400 -6.585000 -7.638600 -8.692200
0.263400 -0.526800 -1.317000 -2.107200 -2.897400 -3.687600 -4.477800 -5.268000 -6.058200
0.790200 0.263400 -0.263400 -0.790200 -1.317000 -1.843800 -2.370600 -2.897400 -3.424200
1.317000 1.053600 0.790200 0.526800 0.263400 0.000000 -0.263400 -0.526800 -0.790200
0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
0.526800 0.790200 1.053600 1.317000 1.580400 1.843800 2.107200 2.370600 2.634000
1.053600 1.580400 2.107200 2.634000 3.160800 3.687600 4.214400 4.741200 5.268000

After sorting (program output):
Hello main()
-0.790200 -2.107200 -3.424200 -4.741200 -6.058200 -7.375200 -8.692200 -10.009200 -11.326200
-0.263400 -1.317000 -2.370600 -3.424200 -4.477800 -5.531400 -6.585000 -7.638600 -8.692200
0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
0.263400 -0.526800 -1.317000 -2.107200 -2.897400 -3.687600 -4.477800 -5.268000 -6.058200
0.526800 0.790200 1.053600 1.317000 1.580400 1.843800 2.107200 2.370600 2.634000
0.790200 0.263400 -0.263400 -0.790200 -1.317000 -1.843800 -2.370600 -2.897400 -3.424200
1.053600 1.580400 2.107200 2.634000 3.160800 3.687600 4.214400 4.741200 5.268000
1.317000 1.053600 0.790200 0.526800 0.263400 0.000000 -0.263400 -0.526800 -0.790200
Goodbye main()
*/