一、阅读本文之前,需要对多级指针有初步的了解。如图所示是一个 1920*1080*9 大小的内存块,使用三级指针 ***p 表示三维数组 array[i][j][k](其中 i∈[0,9)、j∈[0,1080)、k∈[0,1920))。在使用 cuda 做一、二、三维数组的计算时,必须清楚三维数组的每个元素对应一维数组中的哪一个位置。本示例中 array[i][j][k] 对应一维数组 a[i * 1920 * 1080 + j * 1920 + k]。
二、需要清楚地知道:cuda 计算是在 GPU 上进行的,使用的是显存;CPU 上的指针无法直接操作 GPU 内存,必须先将 CPU 上的数据完整地传入 GPU 才能计算,计算结果也需要使用 cudaMemcpy 拷贝回主机。对于多级指针,主机端和设备端都必须各自建立"指针表 → 数据"的映射关系。
三、源码
#include "func.h"
__global__ void VecAddDim1(uint8_t *A, uint8_t *B ,uint8_t *C)
{
    // Element-wise addition of two byte vectors: C[t] = A[t] + B[t].
    //
    // Each thread handles the element selected by its threadIdx.x only.
    // blockIdx is not consulted and there is no bounds check, so the kernel
    // expects a single-block launch whose x extent equals the vector length.
    // The sum is stored back into a uint8_t, i.e. it wraps modulo 256.
    const int idx = threadIdx.x;
    const uint8_t lhs = A[idx];
    const uint8_t rhs = B[idx];
    C[idx] = lhs + rhs;
}
__global__ void VecAddDim2(uint8_t **A, uint8_t **B ,uint8_t **C)
{
    // Element-wise addition of two 2-D byte arrays addressed through row
    // pointer tables: C[r][c] = A[r][c] + B[r][c].
    //
    // threadIdx.x selects the row and threadIdx.y selects the column.
    // blockIdx is not consulted and there is no bounds check, so the kernel
    // expects a single-block launch whose (x, y) extents equal (rows, cols).
    // Both the pointer tables and the rows they point to are dereferenced on
    // the device, so they must hold device-accessible addresses.
    const int row = threadIdx.x;
    const int col = threadIdx.y;
    uint8_t *lhsRow = A[row];
    uint8_t *rhsRow = B[row];
    uint8_t *outRow = C[row];
    outRow[col] = lhsRow[col] + rhsRow[col];
}
__global__ void VecAddDim3(uint8_t ***A, uint8_t ***B ,uint8_t ***C)
{
    // Element-wise addition of two 3-D byte arrays addressed through two
    // levels of pointer tables: C[p][r][c] = A[p][r][c] + B[p][r][c].
    //
    // threadIdx.x selects the first index, threadIdx.y the second and
    // threadIdx.z the third. blockIdx is not consulted and there is no bounds
    // check, so the kernel expects a single-block launch whose (x, y, z)
    // extents equal the three array extents. Every pointer level is
    // dereferenced on the device and must hold device-accessible addresses.
    const int plane = threadIdx.x;
    const int row = threadIdx.y;
    const int col = threadIdx.z;
    uint8_t *lhsRow = A[plane][row];
    uint8_t *rhsRow = B[plane][row];
    uint8_t *outRow = C[plane][row];
    outRow[col] = lhsRow[col] + rhsRow[col];
}
// demo:
// 1-dim array addition
// tips: host memory must be copied to device memory before the kernel runs,
// and the result in device memory must be copied back to host memory afterwards
void test_func_dim1(void)
{
    /*
     * Demo: add two 3-element uint8_t vectors on the GPU and print the sums.
     *
     * Flow: allocate and fill the host buffers, allocate the device buffers,
     * copy BOTH inputs host -> device, launch VecAddDim1 with one block of
     * 3 threads, copy the result device -> host, print it, release everything.
     *
     * Fixes relative to the earlier version:
     *   - B is now actually copied to d_b (A used to be copied to d_a twice,
     *     so the kernel read uninitialised device memory for B);
     *   - the host buffers A, B and C are freed;
     *   - every CUDA call and the kernel launch are checked.
     *
     * On any failure a message is written to stderr, the remaining steps are
     * skipped and the cleanup at the end still runs.
     */
#define DIM1_CUDA_STEP(call)                                                   \
    do {                                                                       \
        if (ok) {                                                              \
            cudaError_t step_err_ = (call);                                    \
            if (step_err_ != cudaSuccess) {                                    \
                fprintf(stderr, "test_func_dim1: %s failed: %s\n", #call,      \
                        cudaGetErrorString(step_err_));                        \
                ok = false;                                                    \
            }                                                                  \
        }                                                                      \
    } while (0)

    const size_t count = 3;
    const size_t bytes = count * sizeof(uint8_t);
    bool ok = true;

    uint8_t *A = (uint8_t*)malloc(bytes);
    uint8_t *B = (uint8_t*)malloc(bytes);
    uint8_t *C = (uint8_t*)malloc(bytes);
    uint8_t *d_a = NULL;
    uint8_t *d_b = NULL;
    uint8_t *d_c = NULL;

    if (A == NULL || B == NULL || C == NULL) {
        fprintf(stderr, "test_func_dim1: host allocation failed\n");
        ok = false;
    }

    if (ok) {
        for (size_t i = 0; i < count; i++) {
            A[i] = (uint8_t)i;
            B[i] = (uint8_t)i;
        }
    }

    DIM1_CUDA_STEP(cudaMalloc((void**)&d_a, bytes));
    DIM1_CUDA_STEP(cudaMalloc((void**)&d_b, bytes));
    DIM1_CUDA_STEP(cudaMalloc((void**)&d_c, bytes));

    DIM1_CUDA_STEP(cudaMemcpy(d_a, A, bytes, cudaMemcpyHostToDevice));
    DIM1_CUDA_STEP(cudaMemcpy(d_b, B, bytes, cudaMemcpyHostToDevice));

    if (ok) {
        // One block, one thread per element: the kernel indexes by threadIdx.x only.
        VecAddDim1<<<1, (unsigned int)count>>>(d_a, d_b, d_c);
    }
    // Launch-configuration errors are only visible through cudaGetLastError().
    DIM1_CUDA_STEP(cudaGetLastError());
    // The blocking device -> host copy also reports errors raised while the kernel ran.
    DIM1_CUDA_STEP(cudaMemcpy(C, d_c, bytes, cudaMemcpyDeviceToHost));

    if (ok) {
        printf("\ndim1:\n");
        for (size_t i = 0; i < count; i++) {
            printf("%d ", C[i]);
        }
    }

    // cudaFree and free both accept a null pointer, so this is safe after a partial failure.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(A);
    free(B);
    free(C);
#undef DIM1_CUDA_STEP
}
void test_func_dim2(void)
{
    /*
     * Demo: add two 3x3 uint8_t matrices on the GPU using double pointers
     * (A[i][j] style indexing inside the kernel) and print the result.
     *
     * Each matrix is stored as one contiguous data buffer plus a table of
     * row pointers. The device needs its own row table whose entries are
     * DEVICE addresses, so the table is built on the host (the host cannot
     * write into device memory directly) and then copied over with cudaMemcpy.
     *
     * Fixes relative to the earlier version:
     *   - d_data_c is released with cudaFree (it used to leak);
     *   - all six host allocations are freed;
     *   - the device row table d_c is no longer copied back into C, because
     *     C was overwritten with host row pointers immediately afterwards;
     *   - every CUDA call and the kernel launch are checked.
     *
     * On any failure a message is written to stderr, the remaining steps are
     * skipped and the cleanup at the end still runs.
     */
#define DIM2_CUDA_STEP(call)                                                   \
    do {                                                                       \
        if (ok) {                                                              \
            cudaError_t step_err_ = (call);                                    \
            if (step_err_ != cudaSuccess) {                                    \
                fprintf(stderr, "test_func_dim2: %s failed: %s\n", #call,      \
                        cudaGetErrorString(step_err_));                        \
                ok = false;                                                    \
            }                                                                  \
        }                                                                      \
    } while (0)

    const int rows = 3;
    const int cols = 3;
    const size_t tableBytes = rows * sizeof(uint8_t*);
    const size_t dataBytes = (size_t)rows * cols * sizeof(uint8_t);
    bool ok = true;

    // Host side: row tables (they will hold device addresses) and matrix data.
    uint8_t **A = (uint8_t**)malloc(tableBytes);
    uint8_t **B = (uint8_t**)malloc(tableBytes);
    uint8_t **C = (uint8_t**)malloc(tableBytes);
    uint8_t *data_a = (uint8_t*)malloc(dataBytes);
    uint8_t *data_b = (uint8_t*)malloc(dataBytes);
    uint8_t *data_c = (uint8_t*)malloc(dataBytes);

    // Device side: row tables and matrix data.
    uint8_t **d_a = NULL;
    uint8_t **d_b = NULL;
    uint8_t **d_c = NULL;
    uint8_t *d_data_a = NULL;
    uint8_t *d_data_b = NULL;
    uint8_t *d_data_c = NULL;

    if (A == NULL || B == NULL || C == NULL ||
        data_a == NULL || data_b == NULL || data_c == NULL) {
        fprintf(stderr, "test_func_dim2: host allocation failed\n");
        ok = false;
    }

    if (ok) {
        for (int i = 0; i < rows * cols; i++) {
            data_a[i] = (uint8_t)i;
            data_b[i] = (uint8_t)i;
            data_c[i] = 0;
        }
    }

    DIM2_CUDA_STEP(cudaMalloc((void**)&d_a, tableBytes));
    DIM2_CUDA_STEP(cudaMalloc((void**)&d_b, tableBytes));
    DIM2_CUDA_STEP(cudaMalloc((void**)&d_c, tableBytes));
    DIM2_CUDA_STEP(cudaMalloc((void**)&d_data_a, dataBytes));
    DIM2_CUDA_STEP(cudaMalloc((void**)&d_data_b, dataBytes));
    DIM2_CUDA_STEP(cudaMalloc((void**)&d_data_c, dataBytes));

    if (ok) {
        // Why d_data_a and not data_a? The kernel dereferences these row
        // pointers on the GPU, so they must be device addresses. Pointing
        // A[i] at data_a would instead map A onto HOST memory, which is only
        // usable as A[i][j] on the CPU.
        //
        // A is a 1-D array of pointers; A[i] holds the address of the first
        // element of row i. Copying A into d_a therefore gives the device a
        // row table that maps d_a[i][j] onto d_data_a. Without such a table
        // the kernel could only take the flat buffer and compute offsets by
        // hand.
        //
        // Writing d_a[i] = d_data_a + i*cols directly on the host is not
        // possible: d_a lives in device memory and the CPU cannot access it.
        for (int i = 0; i < rows; i++) {
            A[i] = d_data_a + i * cols;
            B[i] = d_data_b + i * cols;
            C[i] = d_data_c + i * cols;
        }
    }

    DIM2_CUDA_STEP(cudaMemcpy(d_a, A, tableBytes, cudaMemcpyHostToDevice));
    DIM2_CUDA_STEP(cudaMemcpy(d_b, B, tableBytes, cudaMemcpyHostToDevice));
    DIM2_CUDA_STEP(cudaMemcpy(d_c, C, tableBytes, cudaMemcpyHostToDevice));
    DIM2_CUDA_STEP(cudaMemcpy(d_data_a, data_a, dataBytes, cudaMemcpyHostToDevice));
    DIM2_CUDA_STEP(cudaMemcpy(d_data_b, data_b, dataBytes, cudaMemcpyHostToDevice));
    DIM2_CUDA_STEP(cudaMemcpy(d_data_c, data_c, dataBytes, cudaMemcpyHostToDevice));

    if (ok) {
        // One block; threadIdx.x selects the row and threadIdx.y the column.
        dim3 block(rows, cols);
        dim3 grid(1, 1);
        VecAddDim2<<<grid, block>>>(d_a, d_b, d_c);
    }
    // Launch-configuration errors are only visible through cudaGetLastError().
    DIM2_CUDA_STEP(cudaGetLastError());
    // The blocking device -> host copy also reports errors raised while the kernel ran.
    DIM2_CUDA_STEP(cudaMemcpy(data_c, d_data_c, dataBytes, cudaMemcpyDeviceToHost));

    if (ok) {
        // Only the data came back; the host row table still holds device
        // addresses, so rebuild it to point into the host copy before indexing.
        for (int i = 0; i < rows; i++) {
            C[i] = data_c + i * cols;
        }
        printf("\ndim2:\n");
        for (int i = 0; i < rows; i++) {
            for (int j = 0; j < cols; j++) {
                printf("%d ", C[i][j]);
            }
            printf("\n");
        }
    }

    // cudaFree and free both accept a null pointer, so this is safe after a partial failure.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    cudaFree(d_data_a);
    cudaFree(d_data_b);
    cudaFree(d_data_c);
    free(A);
    free(B);
    free(C);
    free(data_a);
    free(data_b);
    free(data_c);
#undef DIM2_CUDA_STEP
}
void test_func_dim3(void)
{
    /*
     * Demo: add two 3x3x3 uint8_t arrays on the GPU using triple pointers
     * (A[i][j][k] style indexing inside the kernel) and print the result.
     *
     * Each array is stored as three pieces:
     *   data   - 27 contiguous bytes,
     *   index  - 9 row pointers, index[r] -> data + 3*r,
     *   top    - 3 plane pointers, top[p] -> index + 3*p.
     * The device needs its own copies of both pointer tables, filled with
     * DEVICE addresses. The tables are built on the host and copied over,
     * because the host cannot write into device memory directly.
     *
     * Fixes relative to the earlier version:
     *   - all nine host allocations are freed;
     *   - the device pointer tables are no longer copied back into C and
     *     indexC, because both were overwritten with host pointers
     *     immediately afterwards;
     *   - every CUDA call and the kernel launch are checked.
     *
     * On any failure a message is written to stderr, the remaining steps are
     * skipped and the cleanup at the end still runs.
     */
#define DIM3_CUDA_STEP(call)                                                   \
    do {                                                                       \
        if (ok) {                                                              \
            cudaError_t step_err_ = (call);                                    \
            if (step_err_ != cudaSuccess) {                                    \
                fprintf(stderr, "test_func_dim3: %s failed: %s\n", #call,      \
                        cudaGetErrorString(step_err_));                        \
                ok = false;                                                    \
            }                                                                  \
        }                                                                      \
    } while (0)

    const int dim = 3;                      // extent of every axis
    const int rowCount = dim * dim;         // number of rows across all planes
    const int cellCount = dim * dim * dim;  // number of data elements
    const size_t topBytes = dim * sizeof(uint8_t**);
    const size_t indexBytes = rowCount * sizeof(uint8_t*);
    const size_t dataBytes = cellCount * sizeof(uint8_t);
    bool ok = true;

    // Host side: plane tables, row tables and data.
    uint8_t ***A = (uint8_t***)malloc(topBytes);
    uint8_t ***B = (uint8_t***)malloc(topBytes);
    uint8_t ***C = (uint8_t***)malloc(topBytes);
    uint8_t **indexA = (uint8_t**)malloc(indexBytes);
    uint8_t **indexB = (uint8_t**)malloc(indexBytes);
    uint8_t **indexC = (uint8_t**)malloc(indexBytes);
    uint8_t *dataA = (uint8_t*)malloc(dataBytes);
    uint8_t *dataB = (uint8_t*)malloc(dataBytes);
    uint8_t *dataC = (uint8_t*)malloc(dataBytes);

    // Device side: plane tables, row tables and data.
    uint8_t ***d_A = NULL;
    uint8_t ***d_B = NULL;
    uint8_t ***d_C = NULL;
    uint8_t **d_indexA = NULL;
    uint8_t **d_indexB = NULL;
    uint8_t **d_indexC = NULL;
    uint8_t *d_dataA = NULL;
    uint8_t *d_dataB = NULL;
    uint8_t *d_dataC = NULL;

    if (A == NULL || B == NULL || C == NULL ||
        indexA == NULL || indexB == NULL || indexC == NULL ||
        dataA == NULL || dataB == NULL || dataC == NULL) {
        fprintf(stderr, "test_func_dim3: host allocation failed\n");
        ok = false;
    }

    if (ok) {
        for (int i = 0; i < cellCount; i++) {
            dataA[i] = (uint8_t)i;
            dataB[i] = (uint8_t)i;
            dataC[i] = 0;
        }
    }

    DIM3_CUDA_STEP(cudaMalloc((void**)&d_A, topBytes));
    DIM3_CUDA_STEP(cudaMalloc((void**)&d_B, topBytes));
    DIM3_CUDA_STEP(cudaMalloc((void**)&d_C, topBytes));
    DIM3_CUDA_STEP(cudaMalloc((void**)&d_indexA, indexBytes));
    DIM3_CUDA_STEP(cudaMalloc((void**)&d_indexB, indexBytes));
    DIM3_CUDA_STEP(cudaMalloc((void**)&d_indexC, indexBytes));
    DIM3_CUDA_STEP(cudaMalloc((void**)&d_dataA, dataBytes));
    DIM3_CUDA_STEP(cudaMalloc((void**)&d_dataB, dataBytes));
    DIM3_CUDA_STEP(cudaMalloc((void**)&d_dataC, dataBytes));

    if (ok) {
        // Fill the host-side tables with DEVICE addresses; after the copies
        // below the device sees top[p] -> index + 3*p and index[r] -> data + 3*r.
        for (int i = 0; i < dim; i++) {
            A[i] = d_indexA + dim * i;
            B[i] = d_indexB + dim * i;
            C[i] = d_indexC + dim * i;
        }
        for (int i = 0; i < rowCount; i++) {
            indexA[i] = d_dataA + dim * i;
            indexB[i] = d_dataB + dim * i;
            indexC[i] = d_dataC + dim * i;
        }
    }

    DIM3_CUDA_STEP(cudaMemcpy(d_A, A, topBytes, cudaMemcpyHostToDevice));
    DIM3_CUDA_STEP(cudaMemcpy(d_B, B, topBytes, cudaMemcpyHostToDevice));
    DIM3_CUDA_STEP(cudaMemcpy(d_C, C, topBytes, cudaMemcpyHostToDevice));
    DIM3_CUDA_STEP(cudaMemcpy(d_indexA, indexA, indexBytes, cudaMemcpyHostToDevice));
    DIM3_CUDA_STEP(cudaMemcpy(d_indexB, indexB, indexBytes, cudaMemcpyHostToDevice));
    DIM3_CUDA_STEP(cudaMemcpy(d_indexC, indexC, indexBytes, cudaMemcpyHostToDevice));
    DIM3_CUDA_STEP(cudaMemcpy(d_dataA, dataA, dataBytes, cudaMemcpyHostToDevice));
    DIM3_CUDA_STEP(cudaMemcpy(d_dataB, dataB, dataBytes, cudaMemcpyHostToDevice));
    DIM3_CUDA_STEP(cudaMemcpy(d_dataC, dataC, dataBytes, cudaMemcpyHostToDevice));

    if (ok) {
        // One block; threadIdx.x / .y / .z select the three indices.
        dim3 block(dim, dim, dim);
        dim3 grid(1, 1, 1);
        VecAddDim3<<<grid, block>>>(d_A, d_B, d_C);
    }
    // Launch-configuration errors are only visible through cudaGetLastError().
    DIM3_CUDA_STEP(cudaGetLastError());
    // The blocking device -> host copy also reports errors raised while the kernel ran.
    DIM3_CUDA_STEP(cudaMemcpy(dataC, d_dataC, dataBytes, cudaMemcpyDeviceToHost));

    if (ok) {
        // Only the data came back; rebuild both host tables so that C[i][j][k]
        // resolves to host memory before it is indexed on the CPU.
        for (int i = 0; i < dim; i++) {
            C[i] = indexC + dim * i;
        }
        for (int i = 0; i < rowCount; i++) {
            indexC[i] = dataC + dim * i;
        }
        printf("\ndim3:\n");
        for (int i = 0; i < dim; i++) {
            for (int j = 0; j < dim; j++) {
                // Prints the same (i, j) cell from each of the three planes.
                printf("(%d %d %d)", C[0][i][j], C[1][i][j], C[2][i][j]);
            }
            printf("\n");
        }
    }

    // cudaFree and free both accept a null pointer, so this is safe after a partial failure.
    cudaFree(d_A);
    cudaFree(d_B);
    cudaFree(d_C);
    cudaFree(d_indexA);
    cudaFree(d_indexB);
    cudaFree(d_indexC);
    cudaFree(d_dataA);
    cudaFree(d_dataB);
    cudaFree(d_dataC);
    free(A);
    free(B);
    free(C);
    free(indexA);
    free(indexB);
    free(indexC);
    free(dataA);
    free(dataB);
    free(dataC);
#undef DIM3_CUDA_STEP
}