To complete an assignment for my high-performance computing course, I learned MPI, Pthreads, and OpenMP, none of which I had touched before. Below is a record of implementing matrix multiplication with MPI.
Environment: Windows 10; CPU: AMD E2-3800 APU with Radeon(TM) HD Graphics, 1.30 GHz, 4 cores / 4 threads (a desktop bought ten years ago, so please forgive the weak hardware).
Language: C++
Tool: Visual Studio 2019
1. Configuring Visual Studio 2019
For a detailed walkthrough please search online; I will only sketch the process here: download and install Visual Studio and MPI, then configure Visual Studio: Project Properties → Configuration Properties → VC++ Directories, and add the include and lib folders from the MPI installation directory to Include Directories and Library Directories, respectively.
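For reference (the exact paths depend on where MS-MPI is installed; the ones below are the usual SDK defaults), the directories to add are typically:
Include Directories: C:\Program Files (x86)\Microsoft SDKs\MPI\Include
Library Directories: C:\Program Files (x86)\Microsoft SDKs\MPI\Lib\x64 (or \x86 for a 32-bit build)
In addition, msmpi.lib usually has to be added under Linker → Input → Additional Dependencies, otherwise the MPI calls will fail to link.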
2. Building and running
Run results:
Matrix: A (500×100) × B (100×1)
Notice that when the number of processes is increased to 10, the run time actually goes up. The reason is that when the matrix is small, the send/receive loops that hand the partitioned rows of A out to the workers and collect the results dominate the run time: each extra process adds startup and messaging overhead while getting only a tiny slice of computation, so the time spent distributing the work exceeds the time the extra processes save on the computation itself. So the matrix size is increased.
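A rough back-of-the-envelope estimate (the latency figure below is an assumption, not a measurement) makes this concrete: in this program the master sends one message per row of A and receives one message per result, so a 500-row matrix means roughly a thousand point-to-point messages, while the useful work per row is only cols = 100 integer multiply-adds. If a small MPI message costs on the order of microseconds and 100 multiply-adds cost well under a microsecond on this CPU, communication dominates by a wide margin, and the extra processes cannot pay for themselves on such a small problem.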
Matrix: A (1000×100) × B (100×1)
Matrix: A (2000×100) × B (100×1)
Sure enough, once the number of rows grows to 2000, the run with 10 processes takes less time than the run with 5 processes. The comparison shows that, for a large enough matrix, using more processes is indeed faster than using only a few.
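The timings above can be reproduced by launching the same executable with different process counts through mpiexec, which ships with the MS-MPI runtime. Assuming the built executable is called MatrixMul.exe (a placeholder name), the two runs being compared look like:
mpiexec -n 5 MatrixMul.exe
mpiexec -n 10 MatrixMul.exe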
3. The program
#include "mpi.h"
#include <stdio.h>
#include <stdlib.h>
#include "random"
#include <fstream>
#include <iostream>
using namespace std;
const int rows = 500; //the rows of matrix
const int cols = 100; //the cols of matrix
int main(int argc, char* argv[])
{
    int i, j, k, myid, numprocs, anstag;
    // note: A, B and C live on the stack; for much larger values of rows
    // (e.g. 2000) the stack may need to be enlarged or the arrays moved to the heap
    int A[rows][cols], B[cols], C[rows];
    int masterpro, buf[cols], ans, cnt;
    double starttime, endtime;
    double tmp, totaltime;
    MPI_Status status;
    masterpro = 0;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    // the master does no computation itself, so at least two processes are required
    if (numprocs < 2) {
        printf("Error: too few processes!\n");
        MPI_Abort(MPI_COMM_WORLD, 99);
    }
    starttime = MPI_Wtime();
    if (myid == masterpro) {
        // fill A and B with random integers; note that the original
        // uniform_real_distribution<double>(0.0, 1.0) would truncate to 0
        // when stored in an int matrix, so an integer distribution is used here
        default_random_engine engine;
        uniform_int_distribution<int> u(0, 9);
        for (int i = 0; i < rows; ++i) {
            for (int j = 0; j < cols; ++j) {
                A[i][j] = u(engine);
            }
        }
        for (int i = 0; i < cols; ++i) {
            B[i] = u(engine);
        }
        // broadcast the B vector to all worker processes
        MPI_Bcast(B, cols, MPI_INT, masterpro, MPI_COMM_WORLD);
        // distribute the rows of A to the worker processes in round-robin order
        for (i = 1; i < numprocs; i++)
        {
            for (k = i - 1; k < rows; k += numprocs - 1)
            {
                for (j = 0; j < cols; j++)
                {
                    buf[j] = A[k][j];
                }
                // the tag k records which row this buffer holds
                MPI_Send(buf, cols, MPI_INT, i, k, MPI_COMM_WORLD);
            }
        }
    }
    else {
        //starttime = MPI_Wtime();
        // the workers take part in the broadcast of B
        MPI_Bcast(B, cols, MPI_INT, masterpro, MPI_COMM_WORLD);
        // each worker receives its share of the rows of A and multiplies them with B
        for (i = myid - 1; i < rows; i += numprocs - 1) {
            MPI_Recv(buf, cols, MPI_INT, masterpro, i, MPI_COMM_WORLD, &status);
            ans = 0;
            for (j = 0; j < cols; j++)
            {
                ans += buf[j] * B[j];
            }
            // send the dot product back, tagged with the row index
            MPI_Send(&ans, 1, MPI_INT, masterpro, i, MPI_COMM_WORLD);
        }
        //endtime = MPI_Wtime();
        //tmp = endtime - starttime;
    }
    if (myid == masterpro) {
        // collect the results from all workers; the tag tells us which row each answer belongs to
        for (i = 0; i < rows; i++)
        {
            MPI_Recv(&ans, 1, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, MPI_COMM_WORLD, &status);
            //sender = status.MPI_SOURCE;
            anstag = status.MPI_TAG;
            C[anstag] = ans;
        }
    }
    endtime = MPI_Wtime();
    totaltime = endtime - starttime;
    if (myid == masterpro)
        printf("total time:%f s.\n", totaltime);
    MPI_Finalize();
    return 0;
}
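As a quick sanity check on the parallel result, a serial verification can be dropped into the master branch right after the result-collection loop, before MPI_Finalize (a minimal sketch; it reuses the A, B and C arrays already declared in the program, which are only valid on the master rank):

    // optional correctness check (sketch): recompute each row serially
    // on the master and compare against the gathered parallel result
    if (myid == masterpro) {
        int errors = 0;
        for (i = 0; i < rows; i++) {
            int ref = 0;
            for (j = 0; j < cols; j++)
                ref += A[i][j] * B[j];
            if (ref != C[i])
                errors++;
        }
        printf("verification: %d mismatching rows\n", errors);
    }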