Matrix Addition with MPI+CUDA
2010-06-10 12:52
Reposted from w94w; last edited by w94w

The program itself is quite simple: it adds two three-dimensional matrices, A + B = C.

Because a single matrix exceeds the memory of one GPU, the matrix has to be split into slabs and distributed across GPUs. The program is as follows:

//
// matadd_kernel.cu
//
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>   // replaces the old SDK header cutil.h, which was not actually used
#include <mpi.h>

#define BLOCK_DIMX 16       // threads per block in x
#define BLOCK_DIMZ 16       // threads per block in y (kept with its original name)

// Pick a CUDA device for this MPI rank.
int InitCUDA(int myid)
{
   int devCount = 0;
   int dev = 0;

   cudaGetDeviceCount(&devCount);
   if (devCount == 0)
   {
      fprintf(stderr, "There is no device supporting CUDA.\n");
      return false;
   }

   // Look for at least one device with compute capability >= 1.0.
   for (dev = 0; dev < devCount; ++ dev)
   {
      cudaDeviceProp prop;
      if (cudaGetDeviceProperties(&prop, dev) == cudaSuccess)
      {
         if (prop.major >= 1) break;
      }
   }

   if (dev == devCount)
   {
      fprintf(stderr, "There is no device supporting CUDA.\n");
      return false;
   }

   // Bind this rank to device 'myid'; this assumes the rank number is a valid
   // local device index (e.g. several ranks sharing one multi-GPU node).
   cudaSetDevice(myid);

   return true;
}

// Element-wise addition of two nx*ny*nz slabs: matC = matA + matB.
// Each thread handles one (ix, iy) column and loops over the z dimension.
__global__ void matadd_kernel(float *matA, float *matB, float *matC, int nx, int ny, int nz)
{
   int ix = blockDim.x * blockIdx.x + threadIdx.x;
   int iy = blockDim.y * blockIdx.y + threadIdx.y;

   if (ix >= nx || iy >= ny) return;

   for (int iz = 0; iz < nz; iz ++)
   {
      int idx = iz * ny * nx + iy * nx + ix;
      matC[idx] = matA[idx] + matB[idx];
   }
}

// Add two nx*ny*nz matrices by splitting them into z-slabs across the MPI ranks.
void matadd(float *matA, float *matB, float *matC, int nx, int ny, int nz, int myid, int size)
{
   // Ranks 1..size-1 take equal slabs of nz/size planes from the front of the matrix;
   // rank 0 takes the remaining planes at the end (including any remainder).
   int ista, iend;
   if (myid != 0)
   {
      ista = (myid - 1) * (nz / size);
      iend = ista + nz / size - 1;
   } else {
      ista = (size - 1) * (nz / size);
      iend = nz - 1;
   }

   int loc_nz = iend - ista + 1;
   float *loc_matA = (float *) malloc(loc_nz * ny * nx * sizeof(float));
   float *loc_matB = (float *) malloc(loc_nz * ny * nx * sizeof(float));
   float *loc_matC = (float *) malloc(loc_nz * ny * nx * sizeof(float));

   MPI_Status status;
   int count[size];   // slab size (in z-planes) of every rank, gathered on rank 0

   if (myid != 0)
   {
      MPI_Send(&loc_nz, 1, MPI_INT, 0, myid, MPI_COMM_WORLD);
   } else {
      count[0] = loc_nz;
      for (int i = 1; i < size; i ++)
         MPI_Recv(&count[i], 1, MPI_INT, i, i, MPI_COMM_WORLD, &status);
   }

   if (myid == 0)
   {
      // Rank 0 keeps the last count[0] planes of A and B as its own slab.
      for (int ix = 0; ix < count[0] * ny * nx; ix ++)
      {
         loc_matA[count[0] * ny * nx - 1 - ix] = matA[nz * ny * nx - 1 - ix];
         loc_matB[count[0] * ny * nx - 1 - ix] = matB[nz * ny * nx - 1 - ix];
      }
      // Send rank isz its slab, which starts at z-plane idx = count[1] + ... + count[isz-1].
      int idx = 0;
      for (int isz = 1; isz < size; isz ++)
      {
         MPI_Send(matA + idx * ny * nx, count[isz] * ny * nx, MPI_FLOAT, isz, isz, MPI_COMM_WORLD);
         MPI_Send(matB + idx * ny * nx, count[isz] * ny * nx, MPI_FLOAT, isz, isz, MPI_COMM_WORLD);
         idx += count[isz];
      }
   } else {
      MPI_Recv(loc_matA, loc_nz * ny * nx, MPI_FLOAT, 0, myid, MPI_COMM_WORLD, &status);
      MPI_Recv(loc_matB, loc_nz * ny * nx, MPI_FLOAT, 0, myid, MPI_COMM_WORLD, &status);
   }

   // Copy the local slabs to the GPU, run the kernel, and copy the result back.
   float *d_loc_matA;
   cudaMalloc((void **) &d_loc_matA, loc_nz * ny * nx * sizeof(float));
   cudaMemcpy(d_loc_matA, loc_matA, loc_nz * ny * nx * sizeof(float), cudaMemcpyHostToDevice);

   float *d_loc_matB;
   cudaMalloc((void **) &d_loc_matB, loc_nz * ny * nx * sizeof(float));
   cudaMemcpy(d_loc_matB, loc_matB, loc_nz * ny * nx * sizeof(float), cudaMemcpyHostToDevice);

   float *d_loc_matC;
   cudaMalloc((void **) &d_loc_matC, loc_nz * ny * nx * sizeof(float));

   dim3 dimBlock(BLOCK_DIMX, BLOCK_DIMZ);
   dim3 dimGrid((nx + BLOCK_DIMX - 1) / BLOCK_DIMX,    // round up so nx, ny need not be
                (ny + BLOCK_DIMZ - 1) / BLOCK_DIMZ);   // exact multiples of the block size

   matadd_kernel<<<dimGrid, dimBlock>>>(d_loc_matA, d_loc_matB, d_loc_matC, nx, ny, loc_nz);

   cudaMemcpy(loc_matC, d_loc_matC, loc_nz * ny * nx * sizeof(float), cudaMemcpyDeviceToHost);

   // Collect the result slabs on rank 0.
   if (myid != 0)
   {
      MPI_Send(loc_matC, loc_nz * ny * nx, MPI_FLOAT, 0, myid, MPI_COMM_WORLD);
   } else {
      // Rank 0's own slab is the last count[0] planes of matC.
      for (int ix = 0; ix < count[0] * ny * nx; ix ++)
         matC[nz * ny * nx - 1 - ix] = loc_matC[loc_nz * ny * nx - 1 - ix];
      // Receive rank isz's slab at z-plane offset idx = count[1] + ... + count[isz-1].
      int idx = 0;
      for (int isz = 1; isz < size; isz ++)
      {
         MPI_Recv(matC + idx * ny * nx, count[isz] * ny * nx, MPI_FLOAT, isz, isz, MPI_COMM_WORLD, &status);
         idx += count[isz];
      }
   }

   cudaFree(d_loc_matA);
   cudaFree(d_loc_matB);
   cudaFree(d_loc_matC);

   free(loc_matA);
   free(loc_matB);
   free(loc_matC);
   return;
}
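None of the CUDA runtime calls above check their return codes. A minimal error-checking helper, shown here only as a sketch and not part of the original program, could be wrapped around the cudaMalloc/cudaMemcpy calls and the kernel launch:

// cuda_check.h -- hypothetical helper, not part of the original post
#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

#define CUDA_CHECK(call)                                               \
   do {                                                                \
      cudaError_t err = (call);                                        \
      if (err != cudaSuccess) {                                        \
         fprintf(stderr, "CUDA error %s at %s:%d\n",                   \
                 cudaGetErrorString(err), __FILE__, __LINE__);         \
         exit(EXIT_FAILURE);                                           \
      }                                                                \
   } while (0)

// Example usage inside matadd():
//   CUDA_CHECK(cudaMalloc((void **) &d_loc_matA, loc_nz * ny * nx * sizeof(float)));
//   matadd_kernel<<<dimGrid, dimBlock>>>(d_loc_matA, d_loc_matB, d_loc_matC, nx, ny, loc_nz);
//   CUDA_CHECK(cudaGetLastError());       // catches launch-time errors
//   CUDA_CHECK(cudaDeviceSynchronize());  // catches errors from the kernel itself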


// Main program

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <time.h>
#include <mpi.h>

int InitCUDA(int myid);
void matadd(float *matA, float *matB, float *matC, int nx, int ny, int nz, int myid, int size);

int main(int argc, char *argv[])
{
   int myid, numprocs;

   MPI_Init(&argc, &argv);
   MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
   MPI_Comm_rank(MPI_COMM_WORLD, &myid);

   if (!InitCUDA(myid))
      MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);

   // Note: every rank allocates and fills the full matrices, as in the original post;
   // only rank 0 actually uses them for distribution and for the final result.
   int nx = 1024, ny = 1024, nz = 500;
   float *a = (float *)malloc((size_t)nx * ny * nz * sizeof(float));
   float *b = (float *)malloc((size_t)nx * ny * nz * sizeof(float));
   float *c = (float *)malloc((size_t)nx * ny * nz * sizeof(float));

   // Initialise A to 1 and B to 2 everywhere, so every element of C should come out as 3.
   for (int iz = 0; iz < nz; iz ++)
      for (int iy = 0; iy < ny; iy ++)
         for (int ix = 0; ix < nx; ix ++)
         {
            a[iz * ny * nx + iy * nx + ix] = 1.0f;
            b[iz * ny * nx + iy * nx + ix] = 2.0f;
            c[iz * ny * nx + iy * nx + ix] = 0.0f;
         }

   clock_t tstart = clock();
   matadd(a, b, c, nx, ny, nz, myid, numprocs);
   clock_t tend = clock();

   if (myid == 0)
      printf("Time for matrix addition is %.5fs\n", (double)(tend - tstart) / CLOCKS_PER_SEC);

   if (myid == 0)
   {
      printf("c = %f\n", c[nx * ny * nz - 1]);

      // Every element should be 1 + 2 = 3; flag anything that deviates noticeably.
      for (int iz = 0; iz < nz; iz ++)
         for (int iy = 0; iy < ny; iy ++)
            for (int ix = 0; ix < nx; ix ++)
               if (fabsf(c[iz * ny * nx + iy * nx + ix] - 3.0f) > 1.0e-2f)
               {
                  fprintf(stderr, "Error occurs\n");
                  MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
               }
   }

   free(a);
   free(b);
   free(c);

   MPI_Finalize();
   return EXIT_SUCCESS;
}
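One caveat about the timing: clock() measures the CPU time of rank 0's process, not elapsed wall-clock time. A common alternative in MPI programs, shown here only as a sketch of how the calls above could be replaced, is MPI_Wtime:

// Sketch: wall-clock timing with MPI_Wtime instead of clock()
MPI_Barrier(MPI_COMM_WORLD);              // start the measurement on all ranks together
double t0 = MPI_Wtime();
matadd(a, b, c, nx, ny, nz, myid, numprocs);
MPI_Barrier(MPI_COMM_WORLD);
double t1 = MPI_Wtime();
if (myid == 0)
   printf("Time for matrix addition is %.5fs\n", t1 - t0);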

Given the matrix dimensions above, each matrix occupies 4 bytes * 500M elements = 2 GB. So, to add the two matrices on GPUs with 1 GB of memory each, at least 5 GPUs are needed to do it in a single pass (since matrix C on each worker node also takes space). The results are as follows:
mpirun -np 5 ./test
Time for matrix addition is 45.56000s
mpirun -np 10 ./test
Time for matrix addition is 32.44000s
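To make the memory estimate above concrete, the per-rank device footprint of the slab decomposition can be computed directly. The small stand-alone program below is only an illustration of the arithmetic, not part of the original post; the exact minimum number of cards depends on how much memory each card really has.

// Sketch: device memory needed per rank for the z-slab decomposition
#include <stdio.h>

int main(void)
{
   const long long nx = 1024, ny = 1024, nz = 500;
   const long long whole = nx * ny * nz * (long long) sizeof(float);      // one full matrix
   printf("one full matrix: %lld MB\n", whole >> 20);                     // 2048 MB
   for (int np = 5; np <= 10; np += 5)
   {
      long long loc_nz = nz / np;                                         // planes per non-root rank
      long long slab   = loc_nz * ny * nx * (long long) sizeof(float);    // one local slab
      printf("np = %2d: A+B+C slabs per rank = %lld MB\n", np, (3 * slab) >> 20);
   }
   return 0;
}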

Comparing the two runs, the time does not drop dramatically, mainly because passing the data between the nodes as messages takes a large amount of time.
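One way to check where the time actually goes (again only a sketch, not something from the original post) is to bracket the MPI transfers and the GPU work inside matadd() with MPI_Wtime and report the two phases separately:

// Sketch: per-phase timing inside matadd()
double t_comm = 0.0, t_gpu = 0.0, t0;

t0 = MPI_Wtime();
// ... MPI_Send / MPI_Recv distribution of matA and matB ...
t_comm += MPI_Wtime() - t0;

t0 = MPI_Wtime();
// ... cudaMemcpy to the device, kernel launch, cudaMemcpy back ...
cudaDeviceSynchronize();                  // make sure the kernel and copies have finished
t_gpu += MPI_Wtime() - t0;

t0 = MPI_Wtime();
// ... MPI_Send / MPI_Recv collection of matC ...
t_comm += MPI_Wtime() - t0;

printf("rank %d: communication %.3fs, GPU work %.3fs\n", myid, t_comm, t_gpu);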

Source: http://blog.sina.com.cn/s/blog_81fcea1601014pmx.html