GEMM algorithm optimization

Source code (baseline version)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "InitMatrix.h"
#include "MatrixMultiply.h"
#include "PrintMatrix.h"
#include "mpi.h"

#define M 512
#define P 512
#define N 512



int main(int argc, char * argv [])
{

    MPI_Init(&argc, &argv);
	float * MatrixA;
	float * MatrixB;
	float * MatrixC;
	
	MatrixA = (float *)malloc(M * P * sizeof(float));
	MatrixB = (float *)malloc(P * N * sizeof(float));
	MatrixC = (float *)malloc(M * N * sizeof(float));
	
	memset(MatrixC, 0, M * N * sizeof(float)); 
	
	// Initialize matrices A and B
	InitMatrixA(MatrixA, M, P);
	InitMatrixB(MatrixB, P, N);


	double start = MPI_Wtime();
	// Compute the matrix product C = AB
	MatrixMultiply(MatrixA, MatrixB, MatrixC, M, P, N);
	
	double finish=MPI_Wtime();
	printf("MatrixMultiply kernel function cost time(clock):%lf s\n",finish-start);

	// Write matrix C to a file to verify the result
	PrintMatrix(MatrixC, M, N);
	
	free(MatrixA);
	free(MatrixB);
	free(MatrixC);

	
	MPI_Finalize();
	return 0;
}
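
The MatrixMultiply kernel itself (declared in MatrixMultiply.h) is not shown in this post. Judging from the calls above, it is assumed to compute C += A * B for row-major matrices, with the last three parameters giving the row, inner, and column dimensions. A minimal reference sketch under that assumption might look like this; the real implementation behind MatrixMultiply.h may differ:

/* Sketch only: assumed semantics of the kernel used above,
   C (rows x cols) += A (rows x inner) * B (inner x cols), all row-major. */
void MatrixMultiply(float *A, float *B, float *C, int rows, int inner, int cols)
{
	for (int i = 0; i < rows; i++)
		for (int k = 0; k < inner; k++) {
			float a = A[i * inner + k];          /* reuse A[i][k] across the j loop */
			for (int j = 0; j < cols; j++)
				C[i * cols + j] += a * B[k * cols + j];
		}
}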

When the order is 512, the results are as follows:

When the order is 5120, the run time exceeds 10 minutes.

1. Implementation of the row/column block algorithm

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "mpi.h"
#include "InitMatrix.h"
#include "MatrixMultiply.h"
#include "PrintMatrix.h"
#include <unistd.h>

#define M 1024
#define P 1024
#define N 1024

void printMatrix(float *data,int len){
	for (int i = 0; i <  len  ; i++)
		printf("%.3f\t",   data[i]);
	printf("\n");
}


int main(int argc, char *argv[]) {
    int rank, size;	
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
	
	MPI_Status status;	
    //MPI_Request request[2];

	MPI_Datatype newtype, tmp_block;
	MPI_Type_vector(M, P/size, N, MPI_FLOAT, &tmp_block);
	MPI_Type_create_resized(tmp_block, 0, sizeof(float), &newtype);
	MPI_Type_commit(&newtype);

	float * MatrixA;
	float * MatrixB;
	float * MatrixC; 
	
	float *SubMatrixA;
	float *SubMatrixB;
    float *SubMatrixC;
	MatrixA = (float *)malloc(M * P * sizeof(float));
	MatrixB = (float *)malloc(P * N * sizeof(float));
	MatrixC = (float *)malloc(M * N * sizeof(float));
	SubMatrixA = (float *)malloc(M * P / size * sizeof(float));
	SubMatrixB = (float *)malloc(N * P / size * sizeof(float));
    SubMatrixC = (float *)malloc(M * N / size * sizeof(float));

	memset(MatrixC, 0, M * N * sizeof(float));
    memset(SubMatrixC, 0, M * N / size * sizeof(float));
	
	int up = (rank == 0) ? (size-1) : (rank - 1);
    int down = (rank == (size-1)) ? 0: (rank + 1);
	//printf("Rank %d:  \n",  rank);
	//printf("down, up:%d,%d:  \n", down, up);
	
    if (rank == 0) {
        InitMatrixA(MatrixA, M, P);
        InitMatrixB(MatrixB, P, N);
		//printf("MatrixA Rank %d:  \n",  rank);		
		//printMatrix(MatrixA,M * P);
		//printf("MatrixB Rank %d:  \n",  rank);		
		//printMatrix(MatrixB,N * P);
    }
	
	MPI_Scatter(MatrixA, M * P / size, MPI_FLOAT, SubMatrixA, M * P / size, MPI_FLOAT, 0, MPI_COMM_WORLD);
	MPI_Scatter(MatrixB, 1, newtype, SubMatrixB, M * P / size, MPI_FLOAT, 0, MPI_COMM_WORLD);


	// Persistent ("repeated") nonblocking mode
	//MPI_Send_init(SubMatrixA, M * P / size, MPI_FLOAT,  up, 1, MPI_COMM_WORLD,&request[0]);
	//MPI_Recv_init(SubMatrixA, M * P / size, MPI_FLOAT,  down, 1, MPI_COMM_WORLD,&request[1]);	
	double start=MPI_Wtime();
	for(int i=0; i<size; i++){
		

		MatrixMultiply(SubMatrixA, SubMatrixB, SubMatrixC, M / size, P, N/size);
		// Blocking version
		//MPI_Sendrecv( SubMatrixA, M * P/size, MPI_FLOAT, up, 1,SubMatrixA, M * P / size, MPI_FLOAT, down , 1, MPI_COMM_WORLD, &status );/* shift data upward */
		MPI_Sendrecv_replace(SubMatrixA, M * P / size, MPI_FLOAT, up, 1, down, 1, MPI_COMM_WORLD, &status);
		// Nonblocking version
		//MPI_Isend(SubMatrixA, M * P / size, MPI_FLOAT, up, 1, MPI_COMM_WORLD,&request[0]);
		//MPI_Irecv(SubMatrixA, M * P / size, MPI_FLOAT, down, 1, MPI_COMM_WORLD,&request[1]);
		// Persistent (repeated) nonblocking version
		//MPI_Startall(2, request);
		//MPI_Wait(&request[0], MPI_STATUS_IGNORE);
		//MPI_Wait(&request[1], MPI_STATUS_IGNORE);
		//sleep(i);
		//printf("SubMatrixA Rank %d i=%d:  \n",  rank,i);		
		//printMatrix(SubMatrixA,M * P / size);
		//printf("SubMatrixC Rank %d i=%d:  \n",  rank,i);		
		//printMatrix(SubMatrixC,N/size);
		MPI_Gather(SubMatrixC, N/size, MPI_FLOAT, &MatrixC[i*N],  N/size, MPI_FLOAT, 0, MPI_COMM_WORLD);
	}
	//printf("SubMatrixB Rank %d:  \n",  rank);		
	//printMatrix(SubMatrixB,N * P / size);
    if (rank == 0) {
		double finish=MPI_Wtime();
		printf("MatrixMultiply kernel function cost time(clock):%lf s\n",finish-start);
        // 输出矩阵C到文件,用于验证计算结果
        PrintMatrix(MatrixC, M, N);
		//printf("MatrixC %d:  \n",  rank);
		//printMatrix(MatrixC,M * N);
		free(MatrixA);
		free(MatrixB);
		free(MatrixC);

    }

	free(SubMatrixA);
	free(SubMatrixB);
	free(SubMatrixC);
	MPI_Type_free( &tmp_block);
	MPI_Type_free( &newtype);
	//MPI_Request_free(&request[0]);
	//MPI_Request_free(&request[1]);

    MPI_Finalize();
    return 0;
}
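
The commented-out MPI_Isend/MPI_Irecv and MPI_Send_init/MPI_Recv_init lines above correspond to the nonblocking and persistent ("repeated nonblocking") variants whose timings are reported below. One detail to watch: MPI does not allow the same buffer to be used simultaneously as the source of a send and the destination of a receive, so a safe nonblocking version needs a second buffer for the incoming block. A sketch of one possible nonblocking main loop, reusing SubMatrixA, SubMatrixB, SubMatrixC, MatrixC, up, down and size from the program above (reading a send buffer while the send is in flight is allowed since MPI-3), might look like this; the variant actually timed here is the commented-out one and may differ:

	/* Sketch only: nonblocking ring shift of the A block,
	   overlapped with the local block product. */
	float *RecvA = (float *)malloc(M * P / size * sizeof(float));
	MPI_Request request[2];
	for (int i = 0; i < size; i++) {
		/* Start shifting the A block around the ring ... */
		MPI_Isend(SubMatrixA, M * P / size, MPI_FLOAT, up,   1, MPI_COMM_WORLD, &request[0]);
		MPI_Irecv(RecvA,      M * P / size, MPI_FLOAT, down, 1, MPI_COMM_WORLD, &request[1]);

		/* ... and overlap the communication with the local computation. */
		MatrixMultiply(SubMatrixA, SubMatrixB, SubMatrixC, M / size, P, N / size);

		MPI_Waitall(2, request, MPI_STATUSES_IGNORE);

		/* The received block becomes the working A block for the next round. */
		float *tmp = SubMatrixA; SubMatrixA = RecvA; RecvA = tmp;

		MPI_Gather(SubMatrixC, N / size, MPI_FLOAT, &MatrixC[i * N],
		           N / size, MPI_FLOAT, 0, MPI_COMM_WORLD);
	}
	free(RecvA);

Overlapping the shift of the A block with the local block product is what lets the nonblocking version hide part of the communication time.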

When the order is 512, the results are as follows:

Blocking:

Nonblocking:

Persistent nonblocking:

When the order is 5120, the results are as follows:

Blocking:

Nonblocking:

Persistent nonblocking:

When the order is 10240, the results are as follows:

Blocking:

Nonblocking:

Persistent nonblocking:

2. Implementation of the Cannon algorithm


#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include "InitMatrix.h"
#include "MatrixMultiply.h"
#include "PrintMatrix.h"
#include <mpi.h>
#include <unistd.h>
#include <math.h>

#define M 4
#define P 4
#define N 4


void printMatrix(float *data,int len){
	for (int i = 0; i <  len  ; i++)
		printf("%.3f\t",   data[i]);
	printf("\n");
}


int main(int argc, char * argv [])
{	
	int rank, size;
	MPI_Comm comm2d;
	MPI_Status status;
	//MPI_Request request[4];
	
	MPI_Init(&argc, &argv);
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);
	MPI_Comm_size(MPI_COMM_WORLD, &size);
	int q = sqrt(size);


	int periods[2]={1,1};
	int dims[2]={q,q};
	int right,left,down,top;
	MPI_Cart_create( MPI_COMM_WORLD, 2, dims, periods, 0,&comm2d);/* Create the virtual process topology, a q x q grid; comm2d is the new communicator carrying the topology information */
	MPI_Cart_shift( comm2d, 0, 1, &left, &right);/* Get the ranks of the neighbors to the left and right of this process */
	MPI_Cart_shift( comm2d, 1, 1, &down, &top);/* Get the ranks of the neighbors below and above this process */

	//printf("Rank %d:  \n",  rank);
	//printf("left, right,down, top:%d,%d,%d,%d:  \n",  left, right,down, top);
	

	MPI_Datatype newtype, tmp_block;
	MPI_Type_vector(N/q, N/q, N, MPI_FLOAT, &tmp_block);
	MPI_Type_create_resized(tmp_block, 0, sizeof(float), &newtype);
	MPI_Type_commit(&newtype);
	

	float * MatrixA;
	float * MatrixB;
	float * MatrixC;
	float *SubMatrixA;
	float *SubMatrixB;
    float *SubMatrixC;
	MatrixA = (float *)malloc(M * P * sizeof(float));
	MatrixB = (float *)malloc(P * N * sizeof(float));
	MatrixC = (float *)malloc(M * N * sizeof(float));
	SubMatrixA = (float *)malloc(M * P / size * sizeof(float));
	SubMatrixB = (float *)malloc(N * P / size * sizeof(float));
    SubMatrixC = (float *)malloc(M * N / size * sizeof(float));

	memset(MatrixC, 0, M * N * sizeof(float));
    memset(SubMatrixC, 0, M * N / size * sizeof(float));
	
	int recvcounts[size];
	int displs[size];
	for(int i=0;i<q;i++){
		for(int j=0;j<q;j++){
			displs[i*q+j] = i*N*M/q + j*M/q;
			recvcounts[i*q+j] = 1;
			//printf("偏移量 %d,",  displs[i*q+j]);
		}
	}
	// Initialize matrices A and B
	if(rank==0){

		InitMatrixA(MatrixA, M, P);
		InitMatrixB(MatrixB, P, N);
		//MPI_Sendrecv(&MatrixA[0], 1, newtype, 0, 2,MatrixB,4, MPI_FLOAT, 0, 2, comm2d, &status );
		//printf("MatrixA Rank %d:  \n",  rank);		
		//printMatrix(MatrixA,M * P);
		//printf("MatrixB Rank %d:  \n",  rank);		
		//printMatrix(MatrixB,P * N);
    }
	
	MPI_Scatterv(MatrixA, recvcounts,displs, newtype, SubMatrixA, M * P / size, MPI_FLOAT, 0, MPI_COMM_WORLD);
	MPI_Scatterv(MatrixB, recvcounts,displs, newtype, SubMatrixB, N * P / size, MPI_FLOAT, 0, MPI_COMM_WORLD);
	//sleep(rank); 
	//printf("SubMatrixA Rank %d:  \n",  rank);
	//printMatrix(SubMatrixA,M * P / size);
	//printf("SubMatrixB Rank %d:  \n",  rank);		
	//printMatrix(SubMatrixB,N * P / size);
	//MPI_Send_init(SubMatrixA, M * P / size, MPI_FLOAT,  down, 1, comm2d,&request[0]);
	//MPI_Send_init(SubMatrixB, M * P / size, MPI_FLOAT,  left, 2, comm2d,&request[1]);
	//MPI_Recv_init(SubMatrixA, M * P / size, MPI_FLOAT,  top, 1, comm2d,&request[2]);	
	//MPI_Recv_init(SubMatrixB, M * P / size, MPI_FLOAT,  right, 2, comm2d,&request[3]);
	double start = MPI_Wtime();
	for(int i=0; i < q; i++)
	{
		//MPI_Isend(SubMatrixA, M * P / size, MPI_FLOAT,  down, 4, comm2d,&request[0]);
		//MPI_Isend(SubMatrixB, M * P / size, MPI_FLOAT,  left, 5, comm2d,&request[1]);
		//MPI_Wait(&request[0], MPI_STATUS_IGNORE);
		//MPI_Wait(&request[1], MPI_STATUS_IGNORE);
		//MPI_Startall(2, &request[0]);
		//sleep(rank); 
		//printf("SubMatrixB Rank %d 第%d轮:  \n",  rank,i);
		//printMatrix(SubMatrixB,M * P / size);
		//计算矩阵乘C=AB
		MatrixMultiply(SubMatrixA, SubMatrixB, SubMatrixC, M/q, P/q, N/q );
		
		MPI_Sendrecv_replace(SubMatrixA, M * P / size, MPI_FLOAT, down, 2, top, 2, comm2d, &status);
		MPI_Sendrecv_replace(SubMatrixB, N * P / size, MPI_FLOAT, left,   1, right,  1, comm2d, &status);
		//MPI_Irecv(SubMatrixA, M * P / size, MPI_FLOAT,  top, 4, comm2d,&request[2]);	
		//MPI_Irecv(SubMatrixB, N * P / size, MPI_FLOAT,  right, 5, comm2d,&request[3]);
		//MPI_Startall(2, &request[2]);
		//MPI_Wait(&request[2], MPI_STATUS_IGNORE);
		//MPI_Wait(&request[3], MPI_STATUS_IGNORE);
		//MPI_Sendrecv( SubMatrixA, M * P/size, MPI_FLOAT, down, 2,SubMatrixA, M * P / size, MPI_FLOAT, top, 2, comm2d, &status );/* shift data upward */
		//MPI_Sendrecv( SubMatrixB, M * P/size,MPI_FLOAT, left, 1,SubMatrixB, N * P / size, MPI_FLOAT,right, 1, comm2d, &status );/* shift data leftward */
		//sleep(rank); 
		//printf("SubMatrixB Rank %d 第%d轮:  \n",  rank,i+1);
		//printMatrix(SubMatrixB,M * P / size);
		sleep(rank); 
		printf("SubMatrixC %d 第%d轮:  \n",  rank,i);
		printMatrix(SubMatrixC,N * M / size);
	}

	MPI_Gatherv(SubMatrixC, N * M / size, MPI_FLOAT, MatrixC, recvcounts,displs, newtype, 0, MPI_COMM_WORLD);

	if (rank == 0) {
		double finish=MPI_Wtime();
		printf("MatrixMultiply kernel function cost time(clock):%lf s\n",finish-start);
        // 输出矩阵C到文件,用于验证计算结果
        PrintMatrix(MatrixC, M, N);
		//printf("MatrixC %d:  \n",  rank);
		//printMatrix(MatrixC,M * N);
		free(MatrixA);
		free(MatrixB);
		free(MatrixC);
    }

	free(SubMatrixA);
	free(SubMatrixB);
	free(SubMatrixC);
	MPI_Type_free( &tmp_block);
	MPI_Type_free( &newtype);
	MPI_Comm_free( &comm2d );
	//MPI_Request_free(&request[0]);
	//MPI_Request_free(&request[1]);
	//MPI_Request_free(&request[2]);
	//MPI_Request_free(&request[3]);
	MPI_Finalize();
	return 0;	
}
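
In the textbook formulation of Cannon's algorithm, the shift-multiply loop above is preceded by an initial alignment (skew) step: the A block in block row i is shifted left by i positions and the B block in block column j is shifted up by j positions, so that each process starts with a pair of blocks whose product contributes to its own C block. A minimal sketch of that alignment, reusing comm2d, SubMatrixA, SubMatrixB, size and status from the program above, and assuming dimension 0 of comm2d indexes block rows and dimension 1 indexes block columns (swap the dimensions if the grid is laid out the other way, as with the left/right and down/top shifts above), could look like this:

	/* Sketch only: initial alignment (skew) step of Cannon's algorithm. */
	int cart_rank, coords[2], src, dst;
	MPI_Comm_rank(comm2d, &cart_rank);
	MPI_Cart_coords(comm2d, cart_rank, 2, coords);

	/* Shift the A block in block row i left by i positions. */
	MPI_Cart_shift(comm2d, 1, -coords[0], &src, &dst);
	MPI_Sendrecv_replace(SubMatrixA, M * P / size, MPI_FLOAT, dst, 0, src, 0, comm2d, &status);

	/* Shift the B block in block column j up by j positions. */
	MPI_Cart_shift(comm2d, 0, -coords[1], &src, &dst);
	MPI_Sendrecv_replace(SubMatrixB, N * P / size, MPI_FLOAT, dst, 0, src, 0, comm2d, &status);

In the textbook algorithm, this alignment is what makes each of the q shift-multiply rounds combine a matching pair of A and B blocks.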

When the order is 512, the results are as follows:

When the order is 5120, the results are as follows:

When the order is 10240, the results are as follows:

In summary:

Non-blocking communication allows communication and computation to overlap, which can further reduce the run time (a sketch of this overlap is given after the row/column block program above).

For orders below 512, the run time decreases only to a certain point and then starts to rise again, which indicates that the communication time exceeds the computation time.

For orders above 512, the run time increases sharply. This matches the cache size of 2 * 1024 * 1024 bytes: a single 512 x 512 float matrix already occupies 512 * 512 * 4 B = 1 MiB, so two operand matrices fill the 2 MiB cache, and beyond this order the working set no longer fits.

With more than 256 processes, the Cannon algorithm is more efficient than the row/column block algorithm.
