CUDA 编程一般结构为:
1 主程序.c << 2 主机(H)端&设备(D)端数据传递及GPU子程序调用.cu << 3 GPU子程序.cu & CPU子程序.c
上述第2部分(主机端&设备端数据传递及GPU子程序调用.cu)中一般结构为:
#1设定显卡号i
cudaSetDevice(i); check_gpu_error("Failed to initialize device");
#2设定grid和block块
dim3 grid((nz+Block_Sizez-1)/Block_Sizez,(ny+Block_Sizey-1)/Block_Sizey,(nx+Block_Sizex-1)/Block_Sizex);
dim3 block(Block_Sizez,Block_Sizey,Block_Sizex);
#3申请设备端内存并初始化
cudaMalloc(&d_data, n_data*sizeof(float)); cudaMemset(d_data, 0, n_data*sizeof(float));
#主机端到设备端数据传递
cudaMemcpy(d_data, h_data, n_data*sizeof(float), cudaMemcpyHostToDevice);
#对于二维数组
#申请设备端内存并初始化
cudaMallocPitch((void**)&d_data, &pitch, nz*sizeof(float), nx); cudaMemset2D(d_data, pitch, 0, nz*sizeof(float), nx);
#主机端到设备端数据传递
cudaMemcpy2D(d_data, pitch, h_data, sizeof(float)*nz, sizeof(float)*nz, nx, cudaMemcpyHostToDevice);
#4调用GPU子程序
gpusubroutine<<<grid,block>>>(pitch/sizeof(float),otherparameters);
#运算同步
cudaDeviceSynchronize(); check_gpu_error("there is something wrong");
#5设备端到主机端数据回传(一维数组用cudaMemcpy,二维数组用cudaMemcpy2D)
cudaMemcpy(h_data, d_data, n_data*sizeof(float), cudaMemcpyDeviceToHost);
cudaMemcpy2D(h_data, sizeof(float)*nz, d_data, pitch, sizeof(float)*nz, nx, cudaMemcpyDeviceToHost);
#6显存释放
cudaFree(d_data);
上述第3部分(GPU子程序.cu)中函数形式为:
__global__ void subroutine1(float parameters)