6条函数的MPI世界，处处有陷阱

最新推荐文章于 2020-04-27 21:31:28 发布

Augusdi

最新推荐文章于 2020-04-27 21:31:28 发布

阅读量2.1k

点赞数

分类专栏： MPI

本文链接：https://blog.csdn.net/Augusdi/article/details/8863091

版权

MPI 专栏收录该内容

3 篇文章 0 订阅

订阅专栏

MPI其实是十分简单而又强大的并行库。只是这次让我花了半天的工夫才找出了一个微小的BUG，让我几乎崩溃。

原程序用于计算两个矩阵相乘。分配任务时，只把第一个矩阵按行分解后传给若干个slave，第二个矩阵则完整地传给每个slave。虽然效率不高，但我的作业要求只是把固定的矩阵大小改为可变的。

源代码：

#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>

#define NRA 62                 /* number of rows in matrix A */
#define NCA 15                 /* number of columns in matrix A */
#define NCB 7                  /* number of columns in matrix B */
#define MASTER 0               /* taskid of first task */
#define FROM_MASTER 1          /* setting a message type */
#define FROM_WORKER 2          /* setting a message type */

int main(argc,argv)
int argc;
char *argv[];
...{
int    numtasks,              /**//* number of tasks in partition */
    taskid,                /**//* a task identifier */
    numworkers,            /**//* number of worker tasks */
    source,                /**//* task id of message source */
    dest,                  /**//* task id of message destination */
    mtype,                 /**//* message type */
    rows,                  /**//* rows of matrix A sent to each worker */
    averow, extra, offset, /**//* used to determine rows sent to each worker */
    i, j, k, rc;           /**//* misc */
double    a[NRA][NCA],           /**//* matrix A to be multiplied */
    b[NCA][NCB],           /**//* matrix B to be multiplied */
    c[NRA][NCB];           /**//* result matrix C */
MPI_Status status;

MPI_Init(&argc,&argv);
MPI_Comm_rank(MPI_COMM_WORLD,&taskid);
MPI_Comm_size(MPI_COMM_WORLD,&numtasks);
if (numtasks < 2 ) ...{
  printf("Need at least two MPI tasks. Quitting... ");
  MPI_Abort(MPI_COMM_WORLD, rc);
  exit(1);
  }
numworkers = numtasks-1;


/**//**************************** master task ************************************/
   if (taskid == MASTER)
   ...{
      printf("mpi_mm has started with %d tasks. ",numtasks);
      printf("Initializing arrays... ");
      for (i=0; i<NRA; i++)
         for (j=0; j<NCA; j++)
            a[i][j]= i+j;
      for (i=0; i<NCA; i++)
         for (j=0; j<NCB; j++)
            b[i][j]= i*j;

      /**//* Send matrix data to the worker tasks */
      averow = NRA/numworkers;
      extra = NRA%numworkers;
      offset = 0;
      mtype = FROM_MASTER;
      for (dest=1; dest<=numworkers; dest++)
      ...{
         rows = (dest <= extra) ? averow+1 : averow;       
         printf("Sending %d rows to task %d offset=%d ",rows,dest,offset);
         MPI_Send(&offset, 1, MPI_INT, dest, mtype, MPI_COMM_WORLD);
         MPI_Send(&rows, 1, MPI_INT, dest, mtype, MPI_COMM_WORLD);
         MPI_Send(&a[offset][0], rows*NCA, MPI_DOUBLE, dest, mtype,
                   MPI_COMM_WORLD);
         MPI_Send(&b, NCA*NCB, MPI_DOUBLE, dest, mtype, MPI_COMM_WORLD);
         offset = offset + rows;
      }

      /**//* Receive results from worker tasks */
      mtype = FROM_WORKER;
      for (i=1; i<=numworkers; i++)
      ...{
         source = i;
         MPI_Recv(&offset, 1, MPI_INT, source, mtype, MPI_COMM_WORLD, &status);
         MPI_Recv(&rows, 1, MPI_INT, source, mtype, MPI_COMM_WORLD, &status);
         MPI_Recv(&c[offset][0], rows*NCB, MPI_DOUBLE, source, mtype, 
                  MPI_COMM_WORLD, &status);
         printf("Received results from task %d ",source);
      }

      /**//* Print results */
      printf("****************************************************** ");
      printf("Result Matrix: ");
      for (i=0; i<NRA; i++)
      ...{
         printf(" "); 
         for (j=0; j<NCB; j++) 
            printf("%6.2f   ", c[i][j]);
      }
      printf(" ****************************************************** ");
      printf ("Done. ");
   }


/**//**************************** worker task ************************************/
   if (taskid > MASTER)
   ...{
      mtype = FROM_MASTER;
      MPI_Recv(&offset, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD, &status);
      MPI_Recv(&rows, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD, &status);
      MPI_Recv(&a, rows*NCA, MPI_DOUBLE, MASTER, mtype, MPI_COMM_WORLD, &status);
      MPI_Recv(&b, NCA*NCB, MPI_DOUBLE, MASTER, mtype, MPI_COMM_WORLD, &status);

      for (k=0; k<NCB; k++)
         for (i=0; i<rows; i++)
         ...{
            c[i][k] = 0.0;
            for (j=0; j<NCA; j++)
               c[i][k] = c[i][k] + a[i][j] * b[j][k];
         }
      mtype = FROM_WORKER;
      MPI_Send(&offset, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD);
      MPI_Send(&rows, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD);
      MPI_Send(&c, rows*NCB, MPI_DOUBLE, MASTER, mtype, MPI_COMM_WORLD);
   }
   MPI_Finalize();
}

改过的代码（已经加了很多调试输出）：

#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>

#define MASTER 0               /* taskid of first task */
#define FROM_MASTER 1          /* setting a message type */
#define FROM_WORKER 2          /* setting a message type */

int main(int argc, char *argv[])
...{
    int NRA;
    int NCA;
    int NCB;
    int numtasks,               /**//* number of tasks in partition */
    taskid,                   /**//* a task identifier */
    numworkers,               /**//* number of worker tasks */
    source,                   /**//* task id of message source */
    dest,                   /**//* task id of message destination */
    mtype,                   /**//* message type */
    rows,                   /**//* rows of matrix A sent to each worker */
    averow, extra, offset, /**//* used to determine rows sent to each worker */
    i, j, k, rc;           /**//* misc */
    double  * a,           /**//* matrix A to be multiplied */
    *b,            /**//* matrix B to be multiplied */
    *c;           /**//* result matrix C */
    MPI_Status status;

    MPI_Init(&argc,&argv);
    MPI_Comm_rank(MPI_COMM_WORLD,&taskid);
    MPI_Comm_size(MPI_COMM_WORLD,&numtasks);
    if (numtasks < 2 )
    ...{
        printf("Need at least two MPI tasks. Quitting... ");
        MPI_Abort(MPI_COMM_WORLD, rc);
        exit(1);
    }
    numworkers = numtasks-1;


/**//**************************** master task ************************************/
    if (taskid == MASTER)
    ...{
        scanf("%d %d %d", &NRA, &NCA, &NCB);
        a= new double[NRA*NCA];
        b= new double[NCA*NCB];
        c= new double[NRA*NCB];
        for (int dest=1; dest<= numworkers; ++ dest)
        ...{
            MPI_Send(&NRA, 1, MPI_INT, dest, FROM_MASTER, MPI_COMM_WORLD);
            MPI_Send(&NCA, 1, MPI_INT, dest, FROM_MASTER, MPI_COMM_WORLD);
            MPI_Send(&NCB, 1, MPI_INT, dest, FROM_MASTER, MPI_COMM_WORLD);
        }
        printf("mpi_mm has started with %d tasks. ",numtasks);
        printf("Initializing arrays... ");
        for (i=0; i<NRA; i++)
            for (j=0; j<NCA; j++)
                a[i*NCA+j]= i+j;
        for (i=0; i<NCA; i++)
            for (j=0; j<NCB; j++)
                b[i*NCB+j]= i*j;

        printf("****************************************************** ");
        printf("A Matrix: ");
        for (i=0; i<NRA; i++)
        ...{
            printf(" "); 
            for (j=0; j<NCA; j++)
                printf("%6.2f   ", a[i*NCA+j]);
        }
        printf(" B Matrix: ");
        for (i=0; i<NCA; i++)
        ...{
            printf(" "); 
            for (j=0; j<NCB; j++)
                printf("%6.2f   ", b[i*NCB+j]);
        }
        printf(" ****************************************************** ");

        /**//* Send matrix data to the worker tasks */
        averow = NRA/numworkers;
        extra = NRA%numworkers;
        offset = 0;
        mtype = FROM_MASTER;
        for (dest=1; dest<=numworkers; dest++)
        ...{
            rows = (dest <= extra) ? averow+1 : averow;    
            printf("Sending %d rows to task %d offset=%d apos %d. ",rows,dest,offset, &(a[offset*NCA]) );
            MPI_Send(&offset, 1, MPI_INT, dest, mtype, MPI_COMM_WORLD);
            MPI_Send(&rows, 1, MPI_INT, dest, mtype, MPI_COMM_WORLD);
            MPI_Send(&(a[offset*NCA]), rows*NCA, MPI_DOUBLE, dest, mtype,
                     MPI_COMM_WORLD);
            MPI_Send(&b, NCA*NCB, MPI_DOUBLE, dest, mtype, MPI_COMM_WORLD);
            offset = offset + rows;
        }

        /**//* Receive results from worker tasks */
        mtype = FROM_WORKER;
        for (i=1; i<=numworkers; i++)
        ...{
            source = i;
            MPI_Recv(&offset, 1, MPI_INT, source, mtype, MPI_COMM_WORLD, &status);
            MPI_Recv(&rows, 1, MPI_INT, source, mtype, MPI_COMM_WORLD, &status);
            MPI_Recv(&(c[offset*NCB]), rows*NCB, MPI_DOUBLE, source, mtype, 
                     MPI_COMM_WORLD, &status);
            printf("Received results from task %d ",source);
        }

        /**//* Print results */
        printf("****************************************************** ");
        printf("Result Matrix: ");
        for (i=0; i<NRA; i++)
        ...{
            printf(" "); 
            for (j=0; j<NCB; j++)
                printf("%6.2f   ", c[i*NCB+j]);
        }
        printf(" ****************************************************** ");
        printf ("Done. ");

        delete []a;
        delete []b;
        delete []c;
    }


/**//**************************** worker task ************************************/
    if (taskid > MASTER)
    ...{
        mtype = FROM_MASTER;

        MPI_Recv(&NRA, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD, &status);
        MPI_Recv(&NCA, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD, &status);
        MPI_Recv(&NCB, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD, &status);

        printf("processor %d : NRA %d, NCA %d, NCB %d. ", taskid, NRA, NCA, NCB);
               
        a= new double[NRA*NCA];
        b= new double[NCA*NCB];
        c= new double[NRA*NCB];
        printf("a addr : %d on procs %d. ", &a, taskid);
        if (a==NULL || b==NULL || c==NULL)
        ...{
            printf("Allocated error on procs %d. ", taskid);
        }

        MPI_Recv(&offset, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD, &status);
        MPI_Recv(&rows, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD, &status);

        printf("processor %d : offset %d, rows %d. ", taskid, offset, rows);

        MPI_Recv(&a, rows*NCA, MPI_DOUBLE, MASTER, mtype, MPI_COMM_WORLD, &status);
        ...{
            int count;
            MPI_Get_count(&status, MPI_DOUBLE, &count);
            printf("recived %d data of a on procs %d, %d. ", count, taskid, *(a+2));
            printf("a addr : %d on procs %d. ", &a, taskid);
        }
        MPI_Recv(&b, NCA*NCB, MPI_DOUBLE, MASTER, mtype, MPI_COMM_WORLD, &status);
        ...{
            int count;
            MPI_Get_count(&status, MPI_DOUBLE, &count);
            printf("recived %d data of b on procs %d. ", count, taskid);
        }

        printf("******on processor %d ******************************** ", taskid);
        printf("A Matrix: ");
        for (i=0; i<NRA; i++)
        ...{
            printf(" "); 
            for (j=0; j<NCA; j++)
                printf("%6.2f   ", a[i*NCA+j]);
        }
        printf(" B Matrix: ");
        for (i=0; i<NCA; i++)
        ...{
            printf(" "); 
            for (j=0; j<NCB; j++)
                printf("%6.2f   ", b[i*NCB+j]);
        }
        printf(" ****************************************************** ");

        for (k=0; k<NCB; k++)
            for (i=0; i<rows; i++)
            ...{
                c[i*NCB+k] = 0.0;
                for (j=0; j<NCA; j++)
                    c[i*NCB+k] = c[i*NCB+k] + a[i*NCA+j] * b[j*NCB+k];
            }
        mtype = FROM_WORKER;
        MPI_Send(&offset, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD);
        MPI_Send(&rows, 1, MPI_INT, MASTER, mtype, MPI_COMM_WORLD);
        MPI_Send(&c, rows*NCB, MPI_DOUBLE, MASTER, mtype, MPI_COMM_WORLD);

        delete []a;
        delete []b;
        delete []c;
    }
    MPI_Finalize();
}

以上程序运行的唯一结果，就是segmentation fault（段错误）。

后来通过dbx工具在core文件中定位到printf("%6.2f ", a[i*NCA+j]);一句。经过分析，终于找到问题出在由

double a[NRA][NCA] -> double *a = new double[NRA*NCA] 这样的转变中。

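由于这样的转变，a由数组变成了指针。a是数组时，&a与a指向同一块内存（数组的首地址），所以原程序把 &a 传给MPI_Send或MPI_Recv不会出错；而a是指针时，&a是指针变量本身的地址，MPI会在指针变量所在的位置读写数据，而不是在new出来的缓冲区中读写。因此调用MPI_Send或MPI_Recv时，就不能再使用 &a 作为第一个参数了，而应直接使用a（b、c同理）。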

程序这样修改后，终于能正常执行了。而我也可以继续下一个作业了。

Augusdi

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
6条函数的MPI世界，处处有陷阱

MPI其实是十分简单而又强大的并行库。只是这次让我花了半天的工夫才找出了一个微小的BUG，让我几乎崩溃。原程序用于计算两个矩阵相乘。分配任务时，只把第一个矩阵分解传给若干个slave，第二个矩阵全传。虽然效率不高，但我的作业需要是把固定的矩阵大小改为可变的。源代码：#include "mpi.h" #include <stdio.h> #include <stdlib.h> #define NRA 62
复制链接

扫一扫

专栏目录