#include <fstream>
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>
#include <iostream>
using namespace std;
// Dimensions of the parameter table produced by the kernel: ROWS rows of COLS values.
#define ROWS 10
#define COLS 6
// Terminates the program with exit(-1) when a CUDA runtime status is not
// cudaSuccess, after reporting the source location and the runtime's error text
// on stderr. The argument is evaluated exactly once. The macro expands to a
// brace-enclosed block, so it can be written as `CHECK(res)` with or without a
// trailing semicolon.
#define CHECK(res)                                                            \
{                                                                             \
    const cudaError_t check_status_ = (res);                                  \
    if (check_status_ != cudaSuccess)                                         \
    {                                                                         \
        fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__,      \
                cudaGetErrorString(check_status_));                           \
        exit(-1);                                                             \
    }                                                                         \
}
/**
 * Fills one ROWS x COLS tile of a six-parameter sweep table and a per-row
 * residual vector.
 *
 * Thread mapping: the global x index (blockDim.x * blockIdx.x + threadIdx.x)
 * selects the row and threadIdx.y selects the column. blockIdx.y is not used,
 * so a block needs blockDim.y >= COLS for every column to be written. Threads
 * that fall outside the ROWS x COLS tile return without touching memory.
 *
 * Row r of launch `loop` encodes combination number r + ROWS * loop as a
 * mixed-radix counter. Column 5 (q, 10 steps) is the fastest-varying digit,
 * followed by column 4 (mua), 3 (musp), 2 (alpha), 1 (n) and column 0 (rough),
 * each with 100 steps. Every entry is min + digit * (max - min) / steps for the
 * parameter of its column.
 *
 * No barrier is used: each thread writes only its own table element, and the
 * per-row outputs are produced by a single thread (col == 0), so no thread ever
 * reads a value written by another thread.
 *
 * @param dp_out_params    Device array of at least ROWS row pointers, each
 *                         addressing at least COLS doubles (written).
 * @param d_out_Iqsd1      At least ROWS doubles; entry r receives threadIdx.x of
 *                         the thread that handles row r.
 * @param d_MTFac          At least ROWS doubles (read only).
 * @param d_errIqsd_MTFac  At least ROWS doubles; entry r receives
 *                         d_out_Iqsd1[r] - d_MTFac[r].
 * @param loop             Launch counter used to offset the combination index.
 */
__global__ void Kerneltest(double **dp_out_params, double * d_out_Iqsd1, double *d_MTFac, double *d_errIqsd_MTFac, unsigned int loop)
{
    const unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
    const unsigned int col = threadIdx.y;

    // Bounds guard for every access below; the launch may use more threads than
    // there are table cells.
    if (row >= ROWS || col >= COLS)
    {
        return;
    }

    // Sweep description indexed by column: {rough, n, alpha, musp, mua, q}.
    const unsigned int kParamCount = 6;
    const double lowerBound[kParamCount] = {0.0, 0.0, 0.0, 0.0, 0.0, 0.0};
    const double upperBound[kParamCount] = {1.0, 1.0, 1.0, 1.0, 0.007, 0.027};
    const unsigned int stepCount[kParamCount] = {100, 100, 100, 100, 100, 10};

    // Columns beyond the six defined parameters (only possible if COLS is
    // enlarged) are set to zero.
    double value = 0.0;
    if (col < kParamCount)
    {
        // Strip the faster-varying digits (higher column numbers), then keep
        // this column's digit.
        unsigned int digit = row + ROWS * loop;
        for (unsigned int c = kParamCount - 1; c > col; --c)
        {
            digit /= stepCount[c];
        }
        digit %= stepCount[col];

        const double stepLen = (upperBound[col] - lowerBound[col]) / stepCount[col];
        value = lowerBound[col] + digit * stepLen;
    }
    dp_out_params[row][col] = value;

    // Per-row outputs: one writer per row is enough, since every thread of a
    // row would store the same values.
    if (col == 0)
    {
        const double iqsd = threadIdx.x;
        d_out_Iqsd1[row] = iqsd;
        d_errIqsd_MTFac[row] = iqsd - d_MTFac[row];
    }
}
///
///
// Allocates `count` zero-initialised elements of `size` bytes each on the host.
// Prints a message and exits with -1 if the allocation fails.
static void *hostCallocOrDie(size_t count, size_t size)
{
    void *ptr = calloc(count, size);
    if (ptr == NULL)
    {
        fprintf(stderr, "host allocation of %lu bytes failed\n", (unsigned long)(count * size));
        exit(-1);
    }
    return ptr;
}

/**
 * Test driver for Kerneltest.
 *
 * Allocates the host and device buffers, builds the device-side row-pointer
 * table for the ROWS x COLS parameter matrix, launches the kernel once with a
 * ROWS x COLS thread block, copies the results back, prints them to stdout and
 * writes the parameter table to /home/zlf/Documents/cuda.txt (hard-coded path).
 * Waits for a key press before returning.
 *
 * Any failing CUDA call terminates the program through CHECK.
 */
int main(int argc, char **argv)
{
    (void)argc;
    (void)argv;

    cudaError_t res;
    const int MTFlen = 4000;
    const size_t mtfBytes = MTFlen * sizeof(double);
    const size_t tableBytes = ROWS * COLS * sizeof(double);
    const size_t rowVecBytes = ROWS * sizeof(double);
    const size_t rowPtrBytes = ROWS * sizeof(double*);

    // Reference vector: the first 100 entries are 3.2, the remainder is zero so
    // that the full-length copy below never reads indeterminate memory.
    double *h_MTFac = (double*)hostCallocOrDie(MTFlen, sizeof(double));
    for (int i = 0; i < 100 && i < MTFlen; i++)
    {
        h_MTFac[i] = 3.2;
    }
    double *d_MTFac = NULL;
    res = cudaMalloc((void**)(&d_MTFac), mtfBytes); CHECK(res);
    res = cudaMemcpy((void*)(d_MTFac), (void*)(h_MTFac), mtfBytes, cudaMemcpyHostToDevice); CHECK(res);

    // Residual vector, zero-initialised on both sides; the kernel overwrites
    // only the entries that belong to its rows.
    double *h_errIqsd_MTFac = (double*)hostCallocOrDie(MTFlen, sizeof(double));
    double *d_errIqsd_MTFac = NULL;
    res = cudaMalloc((void**)(&d_errIqsd_MTFac), mtfBytes); CHECK(res);
    res = cudaMemcpy((void*)(d_errIqsd_MTFac), (void*)(h_errIqsd_MTFac), mtfBytes, cudaMemcpyHostToDevice); CHECK(res);

    // Parameter table: one contiguous ROWS*COLS device buffer plus a device
    // array of row pointers into it. The pointer values are computed on the
    // host (never dereferenced there) and then uploaded.
    double *d_out_params = NULL;
    res = cudaMalloc((void**)(&d_out_params), tableBytes); CHECK(res);
    double **dp_out_params = NULL;
    res = cudaMalloc((void**)(&dp_out_params), rowPtrBytes); CHECK(res);
    double **hp_out_params = (double**)hostCallocOrDie(ROWS, sizeof(double*));
    double *h_out_params = (double*)hostCallocOrDie(ROWS * COLS, sizeof(double));
    for (int r = 0; r < ROWS; r++)
    {
        hp_out_params[r] = d_out_params + r*COLS;
    }
    res = cudaMemcpy((void*)(dp_out_params), (void*)(hp_out_params), rowPtrBytes, cudaMemcpyHostToDevice); CHECK(res);

    double *h_out_Iqsd1 = (double*)hostCallocOrDie(ROWS, sizeof(double));
    double *d_out_Iqsd1 = NULL;
    res = cudaMalloc((void **) &d_out_Iqsd1, rowVecBytes); CHECK(res);

    // One thread per table cell. The block must not be wider than ROWS because
    // d_out_Iqsd1 holds exactly ROWS elements.
    dim3 dimBlock( ROWS, COLS, 1);
    dim3 dimGrid( 1, 1, 1);
    for(unsigned int loop=0; loop<1; loop++)
    {
        Kerneltest<<<dimGrid, dimBlock>>>(dp_out_params, d_out_Iqsd1, d_MTFac, d_errIqsd_MTFac, loop);
        // A kernel launch returns no status; launch-configuration errors must be queried.
        res = cudaGetLastError(); CHECK(res);
        cout<<"loop: "<<loop<<endl;
    }
    // Surface asynchronous execution errors here rather than at a later copy.
    res = cudaDeviceSynchronize(); CHECK(res);

    res = cudaMemcpy((void*)(h_out_params), (void*)(d_out_params), tableBytes, cudaMemcpyDeviceToHost); CHECK(res);
    res = cudaMemcpy((void*)(h_out_Iqsd1), (void*)(d_out_Iqsd1), rowVecBytes, cudaMemcpyDeviceToHost); CHECK(res);
    res = cudaMemcpy((void*)(h_MTFac), (void*)(d_MTFac), mtfBytes, cudaMemcpyDeviceToHost); CHECK(res);
    res = cudaMemcpy((void*)(h_errIqsd_MTFac), (void*)(d_errIqsd_MTFac), mtfBytes, cudaMemcpyDeviceToHost); CHECK(res);

    // Dump the parameter table to stdout and to the text file. The label
    // " 行数: " printed after each row means "row count".
    ofstream f1("/home/zlf/Documents/cuda.txt");
    if (!f1)
    {
        cerr<<"warning: could not open /home/zlf/Documents/cuda.txt for writing"<<endl;
    }
    int zz = 0;
    cout<<endl<<"h_out_params: "<<endl;
    for (int r = 0; r < ROWS; r++)
    {
        for (int c = 0; c < COLS; c++)
        {
            printf("%f ", h_out_params[r*COLS+c]);
            f1 << h_out_params[r*COLS+c]<<" ";
        }
        zz = zz + 1;
        cout<<" 行数: "<<zz;
        printf("\n");
        f1<< " 行数: "<<zz<< "\n";
    }
    f1.close();

    cout<<endl<<"h_out_Iqsd1: "<<endl;
    for (int r = 0; r < ROWS; r++)
    {
        cout<<h_out_Iqsd1[r]<<" ";
        if ((r%10)==9)
        {
            cout<<endl;
        }
    }

    // Only the entries written by the kernel are non-zero.
    cout<<"h_errIqsd_MTFac[i]: "<<endl;
    for(int i=0; i<MTFlen; i++)
    {
        cout<<h_errIqsd_MTFac[i]<<" ";
    }
    cout<<zz<<endl;

    cudaFree((void*)d_out_params);
    cudaFree((void*)dp_out_params);
    cudaFree((void*)d_out_Iqsd1);
    cudaFree((void*)d_MTFac);
    cudaFree((void*)d_errIqsd_MTFac);
    free(h_out_params);
    free(hp_out_params);
    free(h_out_Iqsd1);
    free(h_MTFac);
    free(h_errIqsd_MTFac);

    // Keep the console open until a key is pressed.
    getchar();
    return 0;
}