对比三种在GPU上建立三维数组的方式:
直接在GPU建立全局变量三维数组
在CPU端为GPU上的三维数组开辟空间
在CPU端为GPU上的一维数组开辟空间,然后通过下标换算(寻址)的方式把它当作三维数组访问
代码:
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include <iostream>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
using namespace std;
// Array extents as referenced by the kernels: the data set is n x m x 10 doubles.
__device__ const int n = 6000;
__device__ const int m = 100;
// Statically sized 3-D array declared with __device__
// (n * m * 10 doubles = 48,000,000 bytes); updated in place by static_gpu.
__device__ double d_static[n][m][10];
// Kernel prototypes; the definitions follow main().
// dynamic_cpu:     accesses the data through a three-level pointer table.
// static_gpu:      accesses the global d_static array directly.
// dynamic_one_gpu: accesses a flat 1-D buffer using a computed index.
__global__ void dynamic_cpu(double ***f_3);
__global__ void static_gpu();
__global__ void dynamic_one_gpu(double *a);
// Host-side copies of the extents (same values as n and m); used in main()
// for allocation sizes and for the launch grid.
const int N = 6000;
const int M = 100;
// clock() timestamps taken before (a) and after (b) each benchmark loop.
clock_t a, b;
// Elapsed time of the most recently measured benchmark loop, in seconds.
double t;
// Reports a failed CUDA runtime call on stderr and terminates the process.
// `what` is a short label identifying the call that was being checked.
static void checkCuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess)
    {
        fprintf(stderr, "CUDA error (%s): %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Benchmark driver.
//
// Sets up three representations of the same N x M x 10 array of doubles:
//   1. the global __device__ array d_static (no setup needed here),
//   2. a three-level pointer table (d_3 -> d_2 -> d_1) assembled on the host
//      out of device addresses and then copied to the device,
//   3. a flat device buffer (arr) indexed arithmetically by the kernel.
// Each kernel is then launched 1000 times and the wall-clock time of the
// whole batch (including the final synchronisation) is printed in seconds.
//
// Returns 0 on success; exits with EXIT_FAILURE if any allocation, copy,
// launch or kernel execution fails.
int main()
{
    const size_t planeCount = (size_t)N;          // entries in the top-level table
    const size_t rowCount = (size_t)N * M;        // entries in the second-level table
    const size_t elemCount = (size_t)N * M * 10;  // total number of doubles

    // Flat buffer for dynamic_one_gpu.
    double *arr = NULL;
    checkCuda(cudaMalloc((void**)(&arr), elemCount * sizeof(double)), "cudaMalloc arr");

    // Host staging buffers. Each is sized by its *element* type:
    // f_3 holds double**, f_2 holds double*, f_1 holds double.
    double ***f_3 = (double***)malloc(planeCount * sizeof(double**));
    double **f_2 = (double**)malloc(rowCount * sizeof(double*));
    double *f_1 = (double*)malloc(elemCount * sizeof(double));
    if (f_3 == NULL || f_2 == NULL || f_1 == NULL)
    {
        fprintf(stderr, "host allocation failed\n");
        return EXIT_FAILURE;
    }

    // Device-side storage for the three levels of the pointer table.
    double ***d_3 = NULL;
    checkCuda(cudaMalloc((void**)(&d_3), planeCount * sizeof(double**)), "cudaMalloc d_3");
    double **d_2 = NULL;
    checkCuda(cudaMalloc((void**)(&d_2), rowCount * sizeof(double*)), "cudaMalloc d_2");
    double *d_1 = NULL;
    checkCuda(cudaMalloc((void**)(&d_1), elemCount * sizeof(double)), "cudaMalloc d_1");

    // Zero-initialise the payload on the host and upload it to both buffers.
    for (size_t i = 0; i < elemCount; i++)
    {
        f_1[i] = 0;
    }
    checkCuda(cudaMemcpy(d_1, f_1, elemCount * sizeof(double), cudaMemcpyHostToDevice), "cudaMemcpy d_1");
    checkCuda(cudaMemcpy(arr, f_1, elemCount * sizeof(double), cudaMemcpyHostToDevice), "cudaMemcpy arr");

    // Second level: entry i points at the i-th run of 10 doubles inside d_1.
    // The values are device addresses computed on the host; they are never
    // dereferenced here.
    for (size_t i = 0; i < rowCount; i++)
    {
        f_2[i] = d_1 + i * 10;
    }
    checkCuda(cudaMemcpy(d_2, f_2, rowCount * sizeof(double*), cudaMemcpyHostToDevice), "cudaMemcpy d_2");

    // Top level: entry i points at the i-th run of M row pointers inside d_2.
    for (size_t i = 0; i < planeCount; i++)
    {
        f_3[i] = d_2 + M * i;
    }
    checkCuda(cudaMemcpy(d_3, f_3, planeCount * sizeof(double**), cudaMemcpyHostToDevice), "cudaMemcpy d_3");

    // Launch geometry: one thread per (X, Y) pair, 6 x 10 threads per block.
    int dimx = 6;
    int dimy = 10;
    dim3 block(dimx, dimy);
    dim3 grid((N + block.x - 1) / block.x, (M + block.y - 1) / block.y);

    // --- global __device__ array ---
    a = clock();
    for (int i = 0; i < 1000; i++)
    {
        static_gpu<<<grid, block>>>();
    }
    checkCuda(cudaGetLastError(), "static_gpu launch");
    // Launches are asynchronous: wait for completion before stopping the clock.
    checkCuda(cudaDeviceSynchronize(), "static_gpu execution");
    b = clock();
    t = (double)(b - a) / CLOCKS_PER_SEC;
    cout << "static_gpu=" << t << endl;

    // --- three-level pointer table ---
    a = clock();
    for (int i = 0; i < 1000; i++)
    {
        dynamic_cpu<<<grid, block>>>(d_3);
    }
    checkCuda(cudaGetLastError(), "dynamic_cpu launch");
    checkCuda(cudaDeviceSynchronize(), "dynamic_cpu execution");
    b = clock();
    t = (double)(b - a) / CLOCKS_PER_SEC;
    cout << "dynamic_cpu=" << t << endl;

    // --- flat buffer with computed index ---
    a = clock();
    for (int i = 0; i < 1000; i++)
    {
        dynamic_one_gpu<<<grid, block>>>(arr);
    }
    checkCuda(cudaGetLastError(), "dynamic_one_gpu launch");
    checkCuda(cudaDeviceSynchronize(), "dynamic_one_gpu execution");
    b = clock();
    t = (double)(b - a) / CLOCKS_PER_SEC;
    cout << "dynamic_one_gpu=" << t << endl;

    // Release device and host resources.
    checkCuda(cudaFree(d_3), "cudaFree d_3");
    checkCuda(cudaFree(d_2), "cudaFree d_2");
    checkCuda(cudaFree(d_1), "cudaFree d_1");
    checkCuda(cudaFree(arr), "cudaFree arr");
    free(f_3);
    free(f_2);
    free(f_1);
    return 0;
}
// Updates every element of the global d_static array in place.
//
// Expects a 2-D launch: the x dimension of the grid covers the first index
// (0..n-1) and the y dimension covers the second index (0..m-1). Each thread
// handles the 10 innermost elements for its (x, y) pair. Threads that fall
// outside the n x m extent do nothing.
__global__ void static_gpu()
{
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;

    // Surplus threads at the edge of the grid have no work.
    if (row >= n || col >= m)
    {
        return;
    }

    // Alias for the 10 innermost elements owned by this thread.
    double *cell = d_static[row][col];
    for (int k = 0; k < 10; ++k)
    {
        cell[k] = (cell[k] + row * 0.2 + col * 0.3 + k * 0.4) * 0.01;
    }
}
// Updates every element reachable through the pointer table f_3 in place.
//
// f_3 is indexed as f_3[x][y][i] with x in 0..n-1, y in 0..m-1 and
// i in 0..9, so every level of the table must hold pointers that are
// dereferenceable from device code.
//
// Expects a 2-D launch: grid x covers the first index, grid y the second.
// Each thread handles the 10 innermost elements for its (x, y) pair; threads
// outside the n x m extent do nothing.
__global__ void dynamic_cpu(double ***f_3)
{
    const int row = blockIdx.x * blockDim.x + threadIdx.x;
    const int col = blockIdx.y * blockDim.y + threadIdx.y;

    // Surplus threads at the edge of the grid have no work.
    if (row >= n || col >= m)
    {
        return;
    }

    // The full three-level lookup is kept inside the loop on purpose: it is
    // the access pattern this kernel exists to exercise.
    for (int k = 0; k < 10; ++k)
    {
        f_3[row][col][k] = (f_3[row][col][k] + row * 0.2 + col * 0.3 + k * 0.4) * 0.01;
    }
}
// Updates every element of the flat buffer `a` in place.
//
// `a` must hold at least n * m * 10 doubles. The element for indices
// (X, Y, i) lives at a[10 * (X + Y * n) + i], with X in 0..n-1, Y in 0..m-1
// and i in 0..9.
//
// Expects a 2-D launch: grid x covers X, grid y covers Y. Each thread handles
// the 10 innermost elements for its (X, Y) pair; threads outside the n x m
// extent do nothing.
//
// NOTE(review): in this layout X is the fast-varying index (neighbouring X
// values are 10 doubles apart), whereas [X][Y][i] indexing as used for
// d_static places neighbouring X values m * 10 doubles apart. Timing
// differences against the other kernels may therefore reflect the memory
// layout and not only the way the array was allocated.
__global__ void dynamic_one_gpu(double *a)
{
    int X = threadIdx.x + blockIdx.x * blockDim.x;
    int Y = threadIdx.y + blockIdx.y * blockDim.y;
    if (X < n && Y < m)
    {
        // Use the device-side extent n for the stride so the index agrees
        // with the bounds check above. The offset of this thread's first
        // element does not depend on i, so it is computed once.
        const int base = 10 * (X + Y * n);
        for (int i = 0; i < 10; i++)
        {
            a[base + i] = (a[base + i] + X * 0.2 + Y * 0.3 + i * 0.4) * 0.01;
        }
    }
}
执行时间(单位:秒,每个核函数连续启动1000次的总耗时):
static_gpu=21.447
dynamic_cpu=11.669
dynamic_one_gpu=5.52