Source code
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "InitMatrix.h"
#include "MatrixMultiply.h"
#include "PrintMatrix.h"
#include "mpi.h"

#define M 512
#define P 512
#define N 512

int main(int argc, char *argv[])
{
    MPI_Init(&argc, &argv);
    float *MatrixA = (float *)malloc(M * P * sizeof(float));
    float *MatrixB = (float *)malloc(P * N * sizeof(float));
    float *MatrixC = (float *)malloc(M * N * sizeof(float));
    memset(MatrixC, 0, M * N * sizeof(float));
    // Fill matrices A and B
    InitMatrixA(MatrixA, M, P);
    InitMatrixB(MatrixB, P, N);
    double start = MPI_Wtime();
    // Compute the matrix product C = A * B
    MatrixMultiply(MatrixA, MatrixB, MatrixC, M, P, N);
    double finish = MPI_Wtime();
    printf("MatrixMultiply kernel function cost time: %lf s\n", finish - start);
    // Write matrix C to a file to verify the result
    PrintMatrix(MatrixC, M, N);
    free(MatrixA);
    free(MatrixB);
    free(MatrixC);
    MPI_Finalize();
    return 0;
}
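The multiplication kernel itself lives in MatrixMultiply.h / MatrixMultiply.c and is not reproduced in this report. A minimal sketch of what such a kernel might look like, assuming row-major storage and that the caller zero-initializes C; only the call signature is taken from the code above, the body is an assumption:

// Hypothetical sketch of the MatrixMultiply kernel used above.
// Assumes row-major storage: A is m x p, B is p x n, C is m x n,
// and C has been zero-initialized by the caller.
void MatrixMultiply(float *A, float *B, float *C, int m, int p, int n)
{
    for (int i = 0; i < m; i++)
        for (int k = 0; k < p; k++) {
            float a = A[i * p + k];
            for (int j = 0; j < n; j++)
                C[i * n + j] += a * B[k * n + j];
        }
}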
With order 512, the run produces the following result:
With order 5120, the running time exceeds 10 minutes.
1. Implementation of the row/column block-partition algorithm
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "mpi.h"
#include "InitMatrix.h"
#include "MatrixMultiply.h"
#include "PrintMatrix.h"

#define M 1024
#define P 1024
#define N 1024

void printMatrix(float *data, int len)
{
    for (int i = 0; i < len; i++)
        printf("%.3f\t", data[i]);
    printf("\n");
}

int main(int argc, char *argv[])
{
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Status status;
    //MPI_Request request[2];

    // Derived datatype describing one column block of B
    // (blocks of P/size consecutive floats with stride N; here M = P = N);
    // the extent is resized to one float so consecutive blocks can be addressed.
    MPI_Datatype newtype, tmp_block;
    MPI_Type_vector(M, P/size, N, MPI_FLOAT, &tmp_block);
    MPI_Type_create_resized(tmp_block, 0, sizeof(float), &newtype);
    MPI_Type_commit(&newtype);

    float *MatrixA;
    float *MatrixB;
    float *MatrixC;
    float *SubMatrixA;
    float *SubMatrixB;
    float *SubMatrixC;
    MatrixA = (float *)malloc(M * P * sizeof(float));
    MatrixB = (float *)malloc(P * N * sizeof(float));
    MatrixC = (float *)malloc(M * N * sizeof(float));
    SubMatrixA = (float *)malloc(M * P / size * sizeof(float));
    SubMatrixB = (float *)malloc(N * P / size * sizeof(float));
    SubMatrixC = (float *)malloc(M * N / size * sizeof(float));
    memset(MatrixC, 0, M * N * sizeof(float));
    memset(SubMatrixC, 0, M * N / size * sizeof(float));

    // Ring neighbours used for shifting the blocks of A
    int up = (rank == 0) ? (size - 1) : (rank - 1);
    int down = (rank == (size - 1)) ? 0 : (rank + 1);
    //printf("Rank %d: \n", rank);
    //printf("down, up:%d,%d: \n", down, up);

    // Fill matrices A and B on the root process
    if (rank == 0) {
        InitMatrixA(MatrixA, M, P);
        InitMatrixB(MatrixB, P, N);
        //printf("MatrixA Rank %d: \n", rank);
        //printMatrix(MatrixA, M * P);
        //printf("MatrixB Rank %d: \n", rank);
        //printMatrix(MatrixB, N * P);
    }
    // Distribute row blocks of A and column blocks of B
    MPI_Scatter(MatrixA, M * P / size, MPI_FLOAT, SubMatrixA, M * P / size, MPI_FLOAT, 0, MPI_COMM_WORLD);
    MPI_Scatter(MatrixB, 1, newtype, SubMatrixB, M * P / size, MPI_FLOAT, 0, MPI_COMM_WORLD);

    // Persistent (repeated non-blocking) mode
    //MPI_Send_init(SubMatrixA, M * P / size, MPI_FLOAT, up, 1, MPI_COMM_WORLD, &request[0]);
    //MPI_Recv_init(SubMatrixA, M * P / size, MPI_FLOAT, down, 1, MPI_COMM_WORLD, &request[1]);

    double start = MPI_Wtime();
    for (int i = 0; i < size; i++) {
        MatrixMultiply(SubMatrixA, SubMatrixB, SubMatrixC, M / size, P, N / size);
        // Blocking: shift the local block of A around the ring
        //MPI_Sendrecv(SubMatrixA, M * P / size, MPI_FLOAT, up, 1, SubMatrixA, M * P / size, MPI_FLOAT, down, 1, MPI_COMM_WORLD, &status); /* pass data upwards */
        MPI_Sendrecv_replace(SubMatrixA, M * P / size, MPI_FLOAT, up, 1, down, 1, MPI_COMM_WORLD, &status);
        // Non-blocking
        //MPI_Isend(SubMatrixA, M * P / size, MPI_FLOAT, up, 1, MPI_COMM_WORLD, &request[0]);
        //MPI_Irecv(SubMatrixA, M * P / size, MPI_FLOAT, down, 1, MPI_COMM_WORLD, &request[1]);
        // Persistent non-blocking
        //MPI_Startall(2, request);
        //MPI_Wait(&request[0], MPI_STATUS_IGNORE);
        //MPI_Wait(&request[1], MPI_STATUS_IGNORE);
        //sleep(i);
        //printf("SubMatrixA Rank %d i=%d: \n", rank, i);
        //printMatrix(SubMatrixA, M * P / size);
        //printf("SubMatrixC Rank %d i=%d: \n", rank, i);
        //printMatrix(SubMatrixC, N / size);
        // Collect the partial results of this round on the root process
        MPI_Gather(SubMatrixC, N / size, MPI_FLOAT, &MatrixC[i * N], N / size, MPI_FLOAT, 0, MPI_COMM_WORLD);
    }
    //printf("SubMatrixB Rank %d: \n", rank);
    //printMatrix(SubMatrixB, N * P / size);

    if (rank == 0) {
        double finish = MPI_Wtime();
        printf("MatrixMultiply kernel function cost time: %lf s\n", finish - start);
        // Write matrix C to a file to verify the result
        PrintMatrix(MatrixC, M, N);
        //printf("MatrixC %d: \n", rank);
        //printMatrix(MatrixC, M * N);
        free(MatrixA);
        free(MatrixB);
        free(MatrixC);
    }
    free(SubMatrixA);
    free(SubMatrixB);
    free(SubMatrixC);
    MPI_Type_free(&tmp_block);
    MPI_Type_free(&newtype);
    //MPI_Request_free(&request[0]);
    //MPI_Request_free(&request[1]);
    MPI_Finalize();
    return 0;
}
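The key trick in this program is the derived datatype: MPI_Type_vector describes one column block of the row-major matrix B, and MPI_Type_create_resized shrinks its extent to a single float so that block displacements can be expressed in elements. A small standalone sketch of this pattern (new code, not part of the lab source; the 4x4 data and the assumption of exactly two processes are purely illustrative) using MPI_Scatterv, which is also how the Cannon program below distributes its sub-blocks:

// Illustration of scattering column blocks with a resized vector type.
// Run with exactly 2 processes; rank 0 should print 0 1 4 5 8 9 12 13
// and rank 1 should print 2 3 6 7 10 11 14 15.
#include <stdio.h>
#include <mpi.h>

#define ROWS 4
#define COLS 4

int main(int argc, char *argv[])
{
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);          // intended: size == 2

    int bw = COLS / size;                          // block width in columns
    float full[ROWS * COLS], block[ROWS * 2];      // 2 == bw when size == 2
    if (rank == 0)
        for (int i = 0; i < ROWS * COLS; i++)
            full[i] = (float)i;                    // 0,1,...,15 in row-major order

    // ROWS blocks of bw consecutive floats, separated by one full row (COLS floats);
    // resizing the extent to one float lets displacements address any column.
    MPI_Datatype col_block, col_block_resized;
    MPI_Type_vector(ROWS, bw, COLS, MPI_FLOAT, &col_block);
    MPI_Type_create_resized(col_block, 0, sizeof(float), &col_block_resized);
    MPI_Type_commit(&col_block_resized);

    // Block i starts at element i*bw of the first row
    int counts[2] = {1, 1};
    int displs[2] = {0 * bw, 1 * bw};
    MPI_Scatterv(full, counts, displs, col_block_resized,
                 block, ROWS * bw, MPI_FLOAT, 0, MPI_COMM_WORLD);

    printf("rank %d:", rank);
    for (int i = 0; i < ROWS * bw; i++)
        printf(" %g", block[i]);
    printf("\n");

    MPI_Type_free(&col_block_resized);
    MPI_Type_free(&col_block);
    MPI_Finalize();
    return 0;
}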
The run results are as follows:
With order 512:
Blocking:
Non-blocking:
Persistent non-blocking:
With order 5120:
Blocking:
Non-blocking:
Persistent non-blocking:
With order 10240:
Blocking:
Non-blocking:
Persistent non-blocking:
2. Implementation of the Cannon algorithm
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <math.h>
#include <mpi.h>
#include "InitMatrix.h"
#include "MatrixMultiply.h"
#include "PrintMatrix.h"

#define M 4
#define P 4
#define N 4

void printMatrix(float *data, int len)
{
    for (int i = 0; i < len; i++)
        printf("%.3f\t", data[i]);
    printf("\n");
}

int main(int argc, char *argv[])
{
    int rank, size;
    MPI_Comm comm2d;
    MPI_Status status;
    //MPI_Request request[4];
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int q = (int)sqrt(size);            // the processes form a q x q grid
    int periods[2] = {1, 1};
    int dims[2] = {q, q};
    int right, left, down, top;
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &comm2d); /* create a virtual q x q process grid; comm2d is the new communicator carrying the topology information */
    MPI_Cart_shift(comm2d, 0, 1, &left, &right); /* ranks of the processes to the left and right of this process */
    MPI_Cart_shift(comm2d, 1, 1, &down, &top);   /* ranks of the processes above and below this process */
    //printf("Rank %d: \n", rank);
    //printf("left, right, down, top:%d,%d,%d,%d: \n", left, right, down, top);

    // Derived datatype describing one (N/q) x (N/q) sub-block of a row-major matrix
    MPI_Datatype newtype, tmp_block;
    MPI_Type_vector(N/q, N/q, N, MPI_FLOAT, &tmp_block);
    MPI_Type_create_resized(tmp_block, 0, sizeof(float), &newtype);
    MPI_Type_commit(&newtype);

    float *MatrixA;
    float *MatrixB;
    float *MatrixC;
    float *SubMatrixA;
    float *SubMatrixB;
    float *SubMatrixC;
    MatrixA = (float *)malloc(M * P * sizeof(float));
    MatrixB = (float *)malloc(P * N * sizeof(float));
    MatrixC = (float *)malloc(M * N * sizeof(float));
    SubMatrixA = (float *)malloc(M * P / size * sizeof(float));
    SubMatrixB = (float *)malloc(N * P / size * sizeof(float));
    SubMatrixC = (float *)malloc(M * N / size * sizeof(float));
    memset(MatrixC, 0, M * N * sizeof(float));
    memset(SubMatrixC, 0, M * N / size * sizeof(float));

    // Displacement (in floats) of the sub-block owned by process (i, j)
    int recvcounts[size];
    int displs[size];
    for (int i = 0; i < q; i++) {
        for (int j = 0; j < q; j++) {
            displs[i*q + j] = i * N * M / q + j * M / q;
            recvcounts[i*q + j] = 1;
            //printf("offset %d,", displs[i*q + j]);
        }
    }

    // Fill matrices A and B on the root process
    if (rank == 0) {
        InitMatrixA(MatrixA, M, P);
        InitMatrixB(MatrixB, P, N);
        //MPI_Sendrecv(&MatrixA[0], 1, newtype, 0, 2, MatrixB, 4, MPI_FLOAT, 0, 2, comm2d, &status);
        //printf("MatrixA Rank %d: \n", rank);
        //printMatrix(MatrixA, M * P);
        //printf("MatrixB Rank %d: \n", rank);
        //printMatrix(MatrixB, P * N);
    }
    // Distribute one sub-block of A and one sub-block of B to each process
    MPI_Scatterv(MatrixA, recvcounts, displs, newtype, SubMatrixA, M * P / size, MPI_FLOAT, 0, MPI_COMM_WORLD);
    MPI_Scatterv(MatrixB, recvcounts, displs, newtype, SubMatrixB, N * P / size, MPI_FLOAT, 0, MPI_COMM_WORLD);
    //sleep(rank);
    //printf("SubMatrixA Rank %d: \n", rank);
    //printMatrix(SubMatrixA, M * P / size);
    //printf("SubMatrixB Rank %d: \n", rank);
    //printMatrix(SubMatrixB, N * P / size);

    //MPI_Send_init(SubMatrixA, M * P / size, MPI_FLOAT, down, 1, comm2d, &request[0]);
    //MPI_Send_init(SubMatrixB, M * P / size, MPI_FLOAT, left, 2, comm2d, &request[1]);
    //MPI_Recv_init(SubMatrixA, M * P / size, MPI_FLOAT, top, 1, comm2d, &request[2]);
    //MPI_Recv_init(SubMatrixB, M * P / size, MPI_FLOAT, right, 2, comm2d, &request[3]);

    double start = MPI_Wtime();
    for (int i = 0; i < q; i++)
    {
        //MPI_Isend(SubMatrixA, M * P / size, MPI_FLOAT, down, 4, comm2d, &request[0]);
        //MPI_Isend(SubMatrixB, M * P / size, MPI_FLOAT, left, 5, comm2d, &request[1]);
        //MPI_Wait(&request[0], MPI_STATUS_IGNORE);
        //MPI_Wait(&request[1], MPI_STATUS_IGNORE);
        //MPI_Startall(2, &request[0]);
        //sleep(rank);
        //printf("SubMatrixB Rank %d round %d: \n", rank, i);
        //printMatrix(SubMatrixB, M * P / size);

        // Multiply the local sub-blocks: C_block += A_block * B_block
        MatrixMultiply(SubMatrixA, SubMatrixB, SubMatrixC, M/q, P/q, N/q);
        // Blocking shift: move the A blocks along one grid dimension and the B blocks along the other
        MPI_Sendrecv_replace(SubMatrixA, M * P / size, MPI_FLOAT, down, 2, top, 2, comm2d, &status);
        MPI_Sendrecv_replace(SubMatrixB, N * P / size, MPI_FLOAT, left, 1, right, 1, comm2d, &status);
        //MPI_Irecv(SubMatrixA, M * P / size, MPI_FLOAT, top, 4, comm2d, &request[2]);
        //MPI_Irecv(SubMatrixB, N * P / size, MPI_FLOAT, right, 5, comm2d, &request[3]);
        //MPI_Startall(2, &request[2]);
        //MPI_Wait(&request[2], MPI_STATUS_IGNORE);
        //MPI_Wait(&request[3], MPI_STATUS_IGNORE);
        //MPI_Sendrecv(SubMatrixA, M * P / size, MPI_FLOAT, down, 2, SubMatrixA, M * P / size, MPI_FLOAT, top, 2, comm2d, &status); /* shift data upwards */
        //MPI_Sendrecv(SubMatrixB, M * P / size, MPI_FLOAT, left, 1, SubMatrixB, N * P / size, MPI_FLOAT, right, 1, comm2d, &status); /* shift data to the left */
        //sleep(rank);
        //printf("SubMatrixB Rank %d round %d: \n", rank, i + 1);
        //printMatrix(SubMatrixB, M * P / size);

        // Debug output of the partial result after each round
        sleep(rank);
        printf("SubMatrixC %d round %d: \n", rank, i);
        printMatrix(SubMatrixC, N * M / size);
    }
    // Collect the sub-blocks of C on the root process
    MPI_Gatherv(SubMatrixC, N * M / size, MPI_FLOAT, MatrixC, recvcounts, displs, newtype, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        double finish = MPI_Wtime();
        printf("MatrixMultiply kernel function cost time: %lf s\n", finish - start);
        // Write matrix C to a file to verify the result
        PrintMatrix(MatrixC, M, N);
        //printf("MatrixC %d: \n", rank);
        //printMatrix(MatrixC, M * N);
        free(MatrixA);
        free(MatrixB);
        free(MatrixC);
    }
    free(SubMatrixA);
    free(SubMatrixB);
    free(SubMatrixC);
    MPI_Type_free(&tmp_block);
    MPI_Type_free(&newtype);
    MPI_Comm_free(&comm2d);
    //MPI_Request_free(&request[0]);
    //MPI_Request_free(&request[1]);
    //MPI_Request_free(&request[2]);
    //MPI_Request_free(&request[3]);
    MPI_Finalize();
    return 0;
}
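For reference, textbook Cannon precedes the multiply-and-shift loop with an initial alignment step: the blocks of A in block row i are rotated i positions in the direction A travels, and the blocks of B in block column j are rotated j positions in the direction B travels, so every process multiplies matching blocks in every round. A minimal sketch of that alignment, written to slot in right after the two MPI_Scatterv calls above; it reuses comm2d, status and the sub-block buffers from the program, and assumes (based on how displs is computed) that coords[0] is the block row and coords[1] the block column:

// Initial alignment of Cannon's algorithm (a sketch, not taken from the lab code):
// rotate block row i of A by i positions and block column j of B by j positions,
// in the same directions as the shifts inside the loop above.
int coords[2];
MPI_Cart_coords(comm2d, rank, 2, coords);   // coords[0] = block row, coords[1] = block column (assumed)

int src, dst;
// Pre-shift A by coords[0] steps along the dimension A travels in the loop
MPI_Cart_shift(comm2d, 1, -coords[0], &src, &dst);
MPI_Sendrecv_replace(SubMatrixA, M * P / size, MPI_FLOAT, dst, 10, src, 10, comm2d, &status);
// Pre-shift B by coords[1] steps along the dimension B travels in the loop
MPI_Cart_shift(comm2d, 0, -coords[1], &src, &dst);
MPI_Sendrecv_replace(SubMatrixB, N * P / size, MPI_FLOAT, dst, 11, src, 11, comm2d, &status);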
With order 512, the results are as follows:
With order 5120, the results are as follows:
With order 10240, the results are as follows:
In summary:
Non-blocking communication lets communication overlap with computation, which can further shorten the running time (a sketch of such an overlapped version follows below).
For orders below 512, the running time only drops to a certain point and then starts to rise again, which indicates that communication time exceeds computation time.
For orders above 512, the running time rises sharply. This is because the cache is 2*1024*1024 bytes (2 MB): a single 512x512 float matrix already occupies 512*512*4 = 1 MB, so beyond order 512 the working set no longer fits in cache.
With more than 256 processes, the Cannon algorithm needs less computation time than the row/column block algorithm, i.e. it is the more efficient of the two.
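To illustrate the first point, here is a hedged sketch (an assumption about how the row/column version could be restructured, not the exact code that was measured) of overlapping the ring shift of A with the local multiplication by double buffering: the next block of A is received into a separate buffer while the current block is still being used.

// Double-buffered, non-blocking ring shift of the A blocks (sketch only).
// SubMatrixA, SubMatrixB, SubMatrixC, MatrixC, up, down, size and
// MatrixMultiply come from the row/column program above; NextA is new.
float *NextA = (float *)malloc(M * P / size * sizeof(float));
MPI_Request reqs[2];

for (int i = 0; i < size; i++) {
    // Start shifting the current A block to 'up' and receiving the next one
    // from 'down' before computing, so the transfer overlaps the multiply.
    MPI_Isend(SubMatrixA, M * P / size, MPI_FLOAT, up, 1, MPI_COMM_WORLD, &reqs[0]);
    MPI_Irecv(NextA, M * P / size, MPI_FLOAT, down, 1, MPI_COMM_WORLD, &reqs[1]);

    MatrixMultiply(SubMatrixA, SubMatrixB, SubMatrixC, M / size, P, N / size);

    // Wait for the shift to finish, then swap buffers for the next round.
    MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
    float *tmp = SubMatrixA;
    SubMatrixA = NextA;
    NextA = tmp;

    MPI_Gather(SubMatrixC, N / size, MPI_FLOAT, &MatrixC[i * N], N / size, MPI_FLOAT, 0, MPI_COMM_WORLD);
}
free(NextA);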