Generating Parameter Permutations with CUDA
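The goal: sweep every combination of six model parameters (rough, n, alpha, musp, mua, q) on the GPU. Each combination gets a single linear index, and the kernel decomposes that index into six "digits" of a mixed-radix number, where each digit's base is that parameter's step count; q is the fastest-varying digit. As a minimal standalone host-side sketch of the same decomposition (the radices match the kernel's step counts, but the index 12345 is an arbitrary illustration value):

// decompose.cpp -- standalone illustration of mixed-radix index decomposition
#include <cstdio>

int main()
{
    const int radix[6] = {10, 100, 100, 100, 100, 100}; // step counts, fastest digit first
    long long idx = 12345;   // arbitrary example combination index
    int digit[6];
    for (int k = 0; k < 6; k++)
    {
        digit[k] = idx % radix[k];   // extract the current digit
        idx /= radix[k];             // shift to the next, slower digit
    }
    printf("digits (fastest first): %d %d %d %d %d %d\n",
           digit[0], digit[1], digit[2], digit[3], digit[4], digit[5]);
    return 0;
}

Running the digits back through the radices recovers the index, which is why this scheme enumerates every combination exactly once.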



#include <stdio.h>
#include <stdlib.h>
#include <fstream>
#include <iostream>
#include <cuda_runtime.h>
using namespace std;

#define ROWS 10
#define COLS 6

#define CHECK(res) if(res!=cudaSuccess){printf("CUDA error: %s\n", cudaGetErrorString(res)); exit(-1);}


// Each thread fills one cell of the ROWS x COLS parameter table. The
// combination index (row + ROWS*loop) is decomposed in mixed radix, with
// q as the fastest-varying digit and rough as the slowest.
__global__ void Kerneltest(double **dp_out_params, double *d_out_Iqsd1, double *d_MTFac, double *d_errIqsd_MTFac, unsigned int loop)
{
	unsigned int row = blockDim.x*blockIdx.x + threadIdx.x;
	unsigned int col = threadIdx.y;

	// Search range [min, max] and number of steps for each parameter.
	double min_q = 0;
	double max_q = 0.027;
	int stepNums_q = 10;
	double stepLen_q = (max_q-min_q)/stepNums_q;

	double min_mua = 0;
	double max_mua = 0.007;
	int stepNums_mua = 100;
	double stepLen_mua = (max_mua-min_mua)/stepNums_mua;

	double min_musp = 0;
	double max_musp = 1;
	int stepNums_musp = 100;
	double stepLen_musp = (max_musp-min_musp)/stepNums_musp;

	double min_alpha = 0;
	double max_alpha = 1;
	int stepNums_alpha = 100;
	double stepLen_alpha = (max_alpha-min_alpha)/stepNums_alpha;

	double min_n = 0;
	double max_n = 1;
	int stepNums_n = 100;
	double stepLen_n = (max_n-min_n)/stepNums_n;

	double min_rough = 0;
	double max_rough = 1;
	int stepNums_rough = 100;
	double stepLen_rough = (max_rough-min_rough)/stepNums_rough;

	if (row < ROWS && col < COLS)
	{
		// Decompose the global combination index into six mixed-radix
		// digits, one per column. Every thread writes exactly one
		// distinct cell, so no __syncthreads() is needed here (and
		// calling it inside a divergent branch is undefined behavior).
		unsigned int idx = row + ROWS*loop;

		if (col == 5)		// q: the fastest-varying digit
		{
			int weight_q = idx % stepNums_q;
			dp_out_params[row][col] = min_q + weight_q * stepLen_q;
		}
		else if (col == 4)	// mua
		{
			int weight_mua = (idx / stepNums_q) % stepNums_mua;
			dp_out_params[row][col] = min_mua + weight_mua * stepLen_mua;
		}
		else if (col == 3)	// musp
		{
			int weight_musp = (idx / stepNums_q / stepNums_mua) % stepNums_musp;
			dp_out_params[row][col] = min_musp + weight_musp * stepLen_musp;
		}
		else if (col == 2)	// alpha
		{
			int weight_alpha = (idx / stepNums_q / stepNums_mua / stepNums_musp) % stepNums_alpha;
			dp_out_params[row][col] = min_alpha + weight_alpha * stepLen_alpha;
		}
		else if (col == 1)	// n
		{
			int weight_n = (idx / stepNums_q / stepNums_mua / stepNums_musp / stepNums_alpha) % stepNums_n;
			dp_out_params[row][col] = min_n + weight_n * stepLen_n;
		}
		else			// col == 0, rough: the slowest-varying digit
		{
			int weight_rough = (idx / stepNums_q / stepNums_mua / stepNums_musp / stepNums_alpha / stepNums_n) % stepNums_rough;
			dp_out_params[row][col] = min_rough + weight_rough * stepLen_rough;
		}
	}
	// Placeholder output and residual. Only the first ROWS entries are
	// touched per launch, and the block may carry more threads in x than
	// ROWS, so bounds-check before writing; one thread per row suffices.
	if (row < ROWS && col == 0)
	{
		d_out_Iqsd1[row] = threadIdx.x;
		d_errIqsd_MTFac[row] = d_out_Iqsd1[row] - d_MTFac[row];
	}
}
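A quick host-side cross-check of the kernel's mapping helps catch indexing mistakes. The sketch below (cpu_param is a hypothetical helper, not part of the original program) recomputes one table cell on the CPU using the same ranges and step counts; after the device-to-host copy in main, h_out_params[r*COLS+c] should equal cpu_param(r + ROWS*loop, c) for the last value of loop.

// cpu_param: hypothetical CPU reference for one cell of the table,
// mirroring the kernel's mapping (col 5 = q, ..., col 0 = rough).
static double cpu_param(unsigned int idx, int col)
{
	// Ranges ordered by column: rough, n, alpha, musp, mua, q.
	const int    steps[6] = {100, 100, 100, 100, 100, 10};
	const double mins[6]  = {0, 0, 0, 0, 0, 0};
	const double maxs[6]  = {1, 1, 1, 1, 0.007, 0.027};
	for (int c = 5; c > col; c--)	// strip the digits faster than this column
		idx /= steps[c];
	int w = idx % steps[col];
	return mins[col] + w * (maxs[col] - mins[col]) / steps[col];
}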


// Host driver: allocate buffers, launch the kernel, and dump results.

int main(int argc, char **argv)
{
	cudaError_t res;

	int MTFlen = 4000;
	double *h_MTFac = NULL;
	h_MTFac = (double*)malloc(MTFlen*sizeof(double));

	for(int i=0; i<MTFlen; i++)	// fill the whole buffer, not just the first 100 entries
	{
		h_MTFac[i] = 3.2;
	}

	double *d_MTFac = NULL;
    res = cudaMalloc((void**)(&d_MTFac), MTFlen*sizeof(double));CHECK(res)
    res = cudaMemcpy((void*)(d_MTFac), (void*)(h_MTFac), MTFlen*sizeof(double), cudaMemcpyHostToDevice);CHECK(res)  // sizeof(double), not sizeof(double*)

	double *h_errIqsd_MTFac = NULL;
    h_errIqsd_MTFac = (double*)calloc(MTFlen, sizeof(double));  // zero-filled so the device buffer starts defined
	double *d_errIqsd_MTFac = NULL;
    res = cudaMalloc((void**)(&d_errIqsd_MTFac), MTFlen*sizeof(double));CHECK(res)
    res = cudaMemcpy((void*)(d_errIqsd_MTFac), (void*)(h_errIqsd_MTFac), MTFlen*sizeof(double), cudaMemcpyHostToDevice);CHECK(res)




	// Flat ROWS*COLS device buffer that actually stores the table.
	double *d_out_params = NULL;
    res = cudaMalloc((void**)(&d_out_params), ROWS*COLS*sizeof(double));CHECK(res)

	// Device-side array of row pointers, so the kernel can index dp_out_params[row][col].
	double **dp_out_params = NULL;
	res = cudaMalloc((void**)(&dp_out_params), ROWS*sizeof(double*));CHECK(res)

	// Host staging copy of the row-pointer table.
	double **hp_out_params = NULL;
	hp_out_params = (double**)malloc(ROWS*sizeof(double*));

	// Host copy of the flat table, for printing.
	double *h_out_params = NULL;
	h_out_params = (double*)malloc(ROWS*COLS*sizeof(double));

	// Point each row at its slice of the flat device buffer.
    for (int r = 0; r < ROWS; r++)
    {
    	hp_out_params[r] = d_out_params + r*COLS;
    }


    double *h_out_Iqsd1 = NULL;
    h_out_Iqsd1 = (double*)malloc(ROWS*sizeof(double));

	double *d_out_Iqsd1 = NULL;
	res = cudaMalloc((void **) &d_out_Iqsd1, ROWS*sizeof(double));CHECK(res)



    // Upload the row-pointer table; sizeof(double*) is correct here, since these are pointers.
    res = cudaMemcpy((void*)(dp_out_params), (void*)(hp_out_params), ROWS*sizeof(double*), cudaMemcpyHostToDevice);CHECK(res)

    dim3 dimBlock(ROWS, COLS, 1);  // one thread per table cell; was (15, 6, 1), which launched 5 out-of-range rows
    dim3 dimGrid( 1,    1,  1);

    for(unsigned int loop=0; loop<1; loop++)
     {
    	Kerneltest<<<dimGrid, dimBlock>>>(dp_out_params, d_out_Iqsd1, d_MTFac, d_errIqsd_MTFac, loop);
    	res = cudaGetLastError();CHECK(res)  // catch launch-configuration errors

    	cout<<"loop:  "<<loop<<endl;
     }

    // Copy results back; note sizeof(double), not sizeof(double*).
    res = cudaMemcpy((void*)(h_out_params), (void*)(d_out_params), ROWS*COLS*sizeof(double), cudaMemcpyDeviceToHost);CHECK(res)
    res = cudaMemcpy((void*)(h_out_Iqsd1), (void*)(d_out_Iqsd1), ROWS*sizeof(double), cudaMemcpyDeviceToHost);CHECK(res)

    res = cudaMemcpy((void*)(h_MTFac), (void*)(d_MTFac), MTFlen*sizeof(double), cudaMemcpyDeviceToHost);CHECK(res)
    res = cudaMemcpy((void*)(h_errIqsd_MTFac), (void*)(d_errIqsd_MTFac), MTFlen*sizeof(double), cudaMemcpyDeviceToHost);CHECK(res)





    ofstream f1("/home/zlf/Documents/cuda.txt");

    int zz = 0;
	cout<<endl<<"h_out_params: "<<endl;
    for (int r = 0; r < ROWS; r++)
    {
        for (int c = 0; c < COLS; c++)
        {
            printf("%f ", h_out_params[r*COLS+c]);
            f1 << h_out_params[r*COLS+c]<<"    ";

        }
        zz = zz + 1;
        cout<<"    行数: "<<zz;
        printf("\n");
        f1<< "    行数: "<<zz<< "\n";

    }

	f1.close();


	cout<<endl<<"h_out_Iqsd1: "<<endl;
    for (int r = 0; r < ROWS; r++)
    {
    	cout<<h_out_Iqsd1[r]<<"    ";
    	if ((r%10)==9)
    	{
    		cout<<endl;
    	}

    }


    /*  // optional: dump all MTFlen residuals
    cout<<"h_errIqsd_MTFac[i]: "<<endl;
	for(int i=0; i<MTFlen; i++)
	{
		cout<<h_errIqsd_MTFac[i]<<"  ";
	}
    */


	cout<<zz<<endl;  // total number of rows written
    cudaFree((void*)d_out_params);
    cudaFree((void*)dp_out_params);
    cudaFree((void*)d_out_Iqsd1);
    cudaFree((void*)d_MTFac);
    cudaFree((void*)d_errIqsd_MTFac);

    free(h_out_params);
    free(hp_out_params);
    free(h_out_Iqsd1);
    free(h_MTFac);
    free(h_errIqsd_MTFac);

    getchar();



    return 0;
}
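If the listing is saved as, say, combos.cu (the file name is arbitrary), it should build with a plain nvcc combos.cu -o combos. With ROWS = 10 and loop fixed at 0, the printed table walks q through its ten steps down the rows while the other five columns stay at their minimums; raising the loop count advances the slower digits.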






















