1. Experiment Environment
This experiment builds on the configuration from the preparatory experiment.
After logging in as root, switch to the personal account created earlier:
su - username
mkdir /home/username/matrix
cd /home/username/matrix
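As a quick sanity check (not part of the original manual), you can confirm that the MPI compiler wrapper and launcher set up in the preparatory experiment are still on the PATH:
which mpic++ mpirun
mpirun --version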
2. Source Programs
gemm.cpp
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <iostream>
#include "mpi.h"
#include <cblas.h>
using namespace std;
// void CheckStatus(MPI_Status &status) {
// if (status.MPI_ERROR != MPI_SUCCESS) {
// cout << MPI::Get_error_class(status.MPI_ERROR);
// }
// }
void randMat(int rows, int cols, float*& Mat) {
Mat = new float[rows * cols];
for (int i = 0; i < rows; i++)
for (int j = 0; j < cols; j++)
Mat[i * cols + j] = 1.0;
}
void openmp_sgemm(int m, int n, int k, float*& leftMat, float*& rightMat, float*& resultMat) {
// rightMat is transposed
#pragma omp parallel for
for (int row = 0; row < m; row++) {
for (int col = 0; col < k; col++) {
resultMat[row * k + col] = 0.0;
for (int i = 0; i < n; i++) {
resultMat[row * k + col] += leftMat[row * n + i] * rightMat[col * n + i];
}
}
}
return;
}
void blas_sgemm(int m, int n, int k, float*& leftMat, float*& rightMat, float*& resultMat) {
// rightMat has already been transposed to k x n, so pass CblasTrans for it
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, k, n, 1.0, leftMat, n, rightMat, n, 0.0, resultMat, k);
}
void mpi_sgemm(int m, int n, int k, float*& leftMat, float*& rightMat,
float*& resultMat, int rank, int worldsize, bool blas) {
int rowBlock = sqrt(worldsize);
if (rowBlock * rowBlock > worldsize)
rowBlock -= 1;
int colBlock = rowBlock;
int rowStride = m / rowBlock;
int colStride = k / colBlock;
worldsize = rowBlock * colBlock; // we abandon some processes.
// so best set process to a square number.
float* res;
if (rank == 0) {
float* buf = new float[k * n];
// transpose right Mat
for (int r = 0; r < n; r++) {
for (int c = 0; c < k; c++) {
buf[c * n + r] = rightMat[r * k + c];
}
}
for (int r = 0; r < k; r++) {
for (int c = 0; c < n; c++) {
rightMat[r * n + c] = buf[r * n + c];
}
}
delete[] buf;
MPI_Request sendRequest[2 * worldsize];
MPI_Status status[2 * worldsize];
for (int rowB = 0; rowB < rowBlock; rowB++) {
for (int colB = 0; colB < colBlock; colB++) {
rowStride = (rowB == rowBlock - 1) ? m - (rowBlock - 1) * (m / rowBlock)
: m / rowBlock;
colStride = (colB == colBlock - 1) ? k - (colBlock - 1) * (k / colBlock)
: k / colBlock;
int sendto = rowB * colBlock + colB;
if (sendto == 0)
continue;
MPI_Isend(&leftMat[rowB * (m / rowBlock) * n], rowStride * n, MPI_FLOAT,
sendto, 0, MPI_COMM_WORLD, &sendRequest[sendto]);
MPI_Isend(&rightMat[colB * (k / colBlock) * n], colStride * n,
MPI_FLOAT, sendto, 1, MPI_COMM_WORLD,
&sendRequest[sendto + worldsize]);
}
}
for (int rowB = 0; rowB < rowBlock; rowB++) {
for (int colB = 0; colB < colBlock; colB++) {
int recvfrom = rowB * colBlock + colB;
if (recvfrom == 0)
continue;
MPI_Wait(&sendRequest[recvfrom], &status[recvfrom]);
MPI_Wait(&sendRequest[recvfrom + worldsize],
&status[recvfrom + worldsize]);
}
}
res = new float[(m / rowBlock) * (k / colBlock)];
}
else {
if (rank < worldsize) {
MPI_Status status[2];
rowStride = ((rank / colBlock) == rowBlock - 1)
? m - (rowBlock - 1) * (m / rowBlock)
: m / rowBlock;
colStride = ((rank % colBlock) == colBlock - 1)
? k - (colBlock - 1) * (k / colBlock)
: k / colBlock;
if (rank != 0) {
leftMat = new float[rowStride * n];
rightMat = new float[colStride * n];
}
if (rank != 0) {
MPI_Recv(leftMat, rowStride * n, MPI_FLOAT, 0, 0, MPI_COMM_WORLD,
&status[0]);
MPI_Recv(rightMat, colStride * n, MPI_FLOAT, 0, 1, MPI_COMM_WORLD,
&status[1]);
}
res = new float[rowStride * colStride];
}
}
MPI_Barrier(MPI_COMM_WORLD);
if (rank < worldsize) {
rowStride = ((rank / colBlock) == rowBlock - 1)
? m - (rowBlock - 1) * (m / rowBlock)
: m / rowBlock;
colStride = ((rank % colBlock) == colBlock - 1)
? k - (colBlock - 1) * (k / colBlock)
: k / colBlock;
if (!blas)
openmp_sgemm(rowStride, n, colStride, leftMat, rightMat, res);
else
blas_sgemm(rowStride, n, colStride, leftMat, rightMat, res);
}
MPI_Barrier(MPI_COMM_WORLD);
if (rank == 0) {
MPI_Status status;
float* buf = new float[(m - (rowBlock - 1) * (m / rowBlock)) *
(k - (colBlock - 1) * (k / colBlock))];
float* temp_res;
for (int rowB = 0; rowB < rowBlock; rowB++) {
for (int colB = 0; colB < colBlock; colB++) {
rowStride = (rowB == rowBlock - 1) ? m - (rowBlock - 1) * (m / rowBlock)
: m / rowBlock;
colStride = (colB == colBlock - 1) ? k - (colBlock - 1) * (k / colBlock)
: k / colBlock;
int recvfrom = rowB * colBlock + colB;
if (recvfrom != 0) {
temp_res = buf;
MPI_Recv(temp_res, rowStride * colStride, MPI_FLOAT, recvfrom, 0,
MPI_COMM_WORLD, &status);
}
else {
temp_res = res;
}
for (int r = 0; r < rowStride; r++)
for (int c = 0; c < colStride; c++)
resultMat[rowB * (m / rowBlock) * k + colB * (k / colBlock) +
r * k + c] = temp_res[r * colStride + c];
}
}
}
else {
rowStride = ((rank / colBlock) == rowBlock - 1)
? m - (rowBlock - 1) * (m / rowBlock)
: m / rowBlock;
colStride = ((rank % colBlock) == colBlock - 1)
? k - (colBlock - 1) * (k / colBlock)
: k / colBlock;
if (rank < worldsize)
MPI_Send(res, rowStride * colStride, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);
}
MPI_Barrier(MPI_COMM_WORLD);
return;
}
int main(int argc, char* argv[]) {
if (argc != 5) {
cout << "Usage: " << argv[0] << " M N K use-blas\n";
exit(-1);
}
int rank;
int worldSize;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &worldSize);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int m = atoi(argv[1]);
int n = atoi(argv[2]);
int k = atoi(argv[3]);
int blas = atoi(argv[4]);
float* leftMat, * rightMat, * resMat;
struct timeval start, stop;
if (rank == 0) {
randMat(m, n, leftMat);
randMat(n, k, rightMat);
randMat(m, k, resMat);
}
gettimeofday(&start, NULL);
mpi_sgemm(m, n, k, leftMat, rightMat, resMat, rank, worldSize, blas);
gettimeofday(&stop, NULL);
if (rank == 0) {
cout << "mpi matmul: "
<< (stop.tv_sec - start.tv_sec) * 1000.0 +
(stop.tv_usec - start.tv_usec) / 1000.0
<< " ms" << endl;
for (int i = 0; i < m; i++) {
for (int j = 0; j < k; j++)
if (int(resMat[i * k + j]) != n) {
cout << resMat[i * k + j] << " error\n";
exit(-1);
}
// cout << resMat[i * k + j] << ' ';
// cout << endl;
}
}
MPI_Finalize();
}
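To make the data layout in mpi_sgemm easier to follow, here is a small standalone sketch (an illustrative addition, not part of the lab code; the file name block_layout.cpp and the sizes in it are made up) that reproduces the same block-partitioning arithmetic: the process grid is floor(sqrt(worldsize)) by floor(sqrt(worldsize)), any extra ranks stay idle, and the last block in each dimension absorbs the remainder rows/columns. It prints which rows and columns of the result each rank would compute.
block_layout.cpp
// block_layout.cpp -- standalone, no MPI needed; compile with: g++ block_layout.cpp -o block_layout
#include <cmath>
#include <cstdio>
int main() {
  int m = 10, k = 10, worldsize = 6;           // illustrative sizes only
  int rowBlock = (int)std::sqrt((double)worldsize);
  if (rowBlock * rowBlock > worldsize)         // same adjustment as in mpi_sgemm
    rowBlock -= 1;
  int colBlock = rowBlock;
  int used = rowBlock * colBlock;              // ranks >= used are abandoned
  std::printf("grid %dx%d, %d of %d ranks used\n", rowBlock, colBlock, used, worldsize);
  for (int rank = 0; rank < used; rank++) {
    int rowB = rank / colBlock, colB = rank % colBlock;
    // the last block in each dimension takes the remainder rows/columns
    int rowStride = (rowB == rowBlock - 1) ? m - (rowBlock - 1) * (m / rowBlock) : m / rowBlock;
    int colStride = (colB == colBlock - 1) ? k - (colBlock - 1) * (k / colBlock) : k / colBlock;
    std::printf("rank %d: result rows [%d,%d), cols [%d,%d)\n", rank,
                rowB * (m / rowBlock), rowB * (m / rowBlock) + rowStride,
                colB * (k / colBlock), colB * (k / colBlock) + colStride);
  }
  return 0;
}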
conv.cpp
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <string.h>
#include <iostream>
#include "mpi.h"
#include <cblas.h>
#include <assert.h>
using namespace std;
void randMat(int rows, int cols, float*& Mat) {
Mat = new float[rows * cols];
for (int i = 0; i < rows; i++)
for (int j = 0; j < cols; j++)
Mat[i * cols + j] = 1.0;
}
int get_steps(int kernel, int step, int len) {
if (kernel > len)
return 0;
return (len - kernel) / step + 1;
}
inline void img2col_conv_kernel(int leftAnchorX, int leftAnchorY, int rightAnchorX,
int rightAnchorY, const int xKernel, const int yKernel, const int xStep,
const int yStep, float*& img, float*& kernel, float*& conv) {
int imgRows = rightAnchorX - leftAnchorX,
imgCols = rightAnchorY - leftAnchorY;
int convRows = get_steps(xKernel, xStep, imgRows);
int convCols = get_steps(yKernel, yStep, imgCols);
float* flattenImg = new float[convRows * convCols * xKernel * yKernel];
// img2col: copy each kernel-sized window of img into one row of flattenImg,
// so the whole convolution becomes a single (convRows*convCols) x (xKernel*yKernel) matrix product
#pragma omp parallel for
for (int i = leftAnchorX; i <= rightAnchorX - xKernel; i += xStep) {
for (int r = 0; r < xKernel; r++) {
for (int j = leftAnchorY; j <= rightAnchorY - yKernel; j += yStep) {
int pos = (i - leftAnchorX) / xStep * convCols + (j - leftAnchorY) / yStep;
memcpy(&flattenImg[pos * xKernel * yKernel + r * yKernel], &img[(i + r) * imgCols + j],
sizeof(float) * yKernel);
}
}
}
cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, convRows * convCols, 1, xKernel * yKernel, 1.0,
flattenImg, xKernel * yKernel, kernel, 1, 0.0, conv, 1);
delete[] flattenImg;
}
inline void naive_conv_kernel(int leftAnchorX, int leftAnchorY, int rightAnchorX,
int rightAnchorY, const int xKernel, const int yKernel, const int xStep,
const int yStep, float*& img, float*& kernel, float*& conv) {
int imgRows = rightAnchorX - leftAnchorX,
imgCols = rightAnchorY - leftAnchorY;
int convRows = get_steps(xKernel, xStep, imgRows);
int convCols = get_steps(yKernel, yStep, imgCols);
#pragma omp parallel for
for (int i = leftAnchorX; i <= rightAnchorX - xKernel; i += xStep) {
for (int j = leftAnchorY; j <= rightAnchorY - yKernel; j += yStep) {
// to[i / xStride * to_n + j] = 0.0;
int pos = (i - leftAnchorX) / xStep * convCols + (j - leftAnchorY) / yStep;
conv[pos] = 0.0;
for (int r = i; r < i + xKernel; r++)
for (int c = j; c < j + yKernel; c++) {
conv[pos] += img[r * imgCols + c] * kernel[(r - i) * yKernel + (c - j)];
}
}
}
}
void mpi_convolution(int m, int n, int xKernel, int yKernel, int xStep,
int yStep, float*& img, float*& kernel, float*& conv,
int rank, int worldsize, bool img2col) {
const int total_xsteps = get_steps(xKernel, xStep, m);
const int total_ysteps = get_steps(yKernel, yStep, n);
const int xsteps_per_proc = total_xsteps / worldsize;
const int last_xsteps = total_xsteps - xsteps_per_proc * (worldsize - 1);
int steps;
if (rank == 0) {
MPI_Request* sendRequest = new MPI_Request[worldsize];
MPI_Status* status = new MPI_Status[worldsize];
for (int i = 1; i < worldsize; i++) {
steps = (i == worldsize - 1) ? last_xsteps : xsteps_per_proc;
MPI_Isend(&img[i * xsteps_per_proc * xStep * n],
(steps * xStep + xKernel - xStep) * n, MPI_FLOAT, i, 0,
MPI_COMM_WORLD, &sendRequest[i]);
}
for (int i = 1; i < worldsize; i++) {
MPI_Wait(&sendRequest[i], &status[i]);
}
delete[] sendRequest;
delete[] status;
}
else {
MPI_Status status;
steps = (rank == worldsize - 1) ? last_xsteps : xsteps_per_proc;
img = new float[(steps * xStep + xKernel - xStep) * n];
MPI_Recv(img, (steps * xStep + xKernel - xStep) * n, MPI_FLOAT, 0, 0,
MPI_COMM_WORLD, &status);
conv = new float[steps * total_ysteps];
}
MPI_Barrier(MPI_COMM_WORLD);
steps = (rank == worldsize - 1) ? last_xsteps : xsteps_per_proc;
if (img2col)
img2col_conv_kernel(0, 0, steps * xStep + xKernel - xStep, n, xKernel, yKernel,
xStep, yStep, img, kernel, conv);
else
naive_conv_kernel(0, 0, steps * xStep + xKernel - xStep, n, xKernel, yKernel,
xStep, yStep, img, kernel, conv);
MPI_Barrier(MPI_COMM_WORLD);
if (rank == 0) {
MPI_Status status;
for (int i = 1; i < worldsize; i++) {
steps = (i == worldsize - 1) ? last_xsteps : xsteps_per_proc;
MPI_Recv(&conv[i * xsteps_per_proc * total_ysteps],
steps * total_ysteps, MPI_FLOAT, i, 0, MPI_COMM_WORLD,
&status);
}
}
else {
MPI_Send(conv, steps * total_ysteps, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);
}
MPI_Barrier(MPI_COMM_WORLD);
return;
}
int main(int argc, char* argv[]) {
if (argc != 4) {
cout << "Usage: " << argv[0] << " M N enabled-img2col";
exit(-1);
}
int rank;
int worldSize;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &worldSize);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int m = atoi(argv[1]);
int n = atoi(argv[2]);
int img2col = atoi(argv[3]);
int xKernel = 3, yKernel = 3;
int xStep = 1, yStep = 1;
float* Img, * Conv;
struct timeval start, stop;
if (rank == 0) {
randMat(m, n, Img);
randMat(get_steps(xKernel, xStep, m), get_steps(yKernel, yStep, n), Conv);
}
float* Kernel = new float[xKernel * yKernel];
for (int i = 0; i < xKernel * yKernel; i++)
Kernel[i] = 1.0;
gettimeofday(&start, NULL);
mpi_convolution(m, n, xKernel, yKernel, xStep, yStep, Img, Kernel, Conv, rank, worldSize, img2col);
gettimeofday(&stop, NULL);
if (rank == 0) {
cout << "mpi convolution: " << (stop.tv_sec - start.tv_sec) * 1000.0 + (stop.tv_usec - start.tv_usec) / 1000.0 << " ms" << endl;
int outRows = get_steps(xKernel, xStep, m);
int outCols = get_steps(yKernel, yStep, n);
for (int i = 0; i < min(10, outRows); i++) {
for (int j = 0; j < min(10, outCols); j++)
cout << Conv[i * outCols + j] << ' ';
cout << endl;
}
}
delete[] Img;
delete[] Conv;
delete[] Kernel;
MPI_Finalize();
}
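As a reference for what conv.cpp computes, the following sequential sketch (illustrative only; the file name conv_check.cpp and the sizes are made up) performs the same valid convolution on a single process: each output dimension has get_steps(kernel, step, len) = (len - kernel) / step + 1 entries, and with the all-ones image and all-ones 3x3 kernel used in main every output element should be 9.
conv_check.cpp
// conv_check.cpp -- standalone sequential reference; compile with: g++ conv_check.cpp -o conv_check
#include <cstdio>
#include <vector>
static int get_steps(int kernel, int step, int len) {
  return (kernel > len) ? 0 : (len - kernel) / step + 1;
}
int main() {
  int m = 6, n = 6, xK = 3, yK = 3, xS = 1, yS = 1;        // small illustrative sizes
  std::vector<float> img(m * n, 1.0f), ker(xK * yK, 1.0f); // all ones, like randMat
  int outR = get_steps(xK, xS, m), outC = get_steps(yK, yS, n);
  std::vector<float> out(outR * outC, 0.0f);
  for (int i = 0; i + xK <= m; i += xS)                    // slide the window over every valid position
    for (int j = 0; j + yK <= n; j += yS) {
      float acc = 0.0f;
      for (int r = 0; r < xK; r++)
        for (int c = 0; c < yK; c++)
          acc += img[(i + r) * n + (j + c)] * ker[r * yK + c];
      out[(i / xS) * outC + (j / yS)] = acc;
    }
  std::printf("output is %dx%d, out[0] = %g (expected %d)\n", outR, outC, out[0], xK * yK);
  return 0;
}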
pooling.cpp
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <iostream>
#include "mpi.h"
using namespace std;
void randMat(int rows, int cols, float*& Mat) {
Mat = new float[rows * cols];
for (int i = 0; i < rows; i++)
for (int j = 0; j < cols; j++)
Mat[i * cols + j] = 1.0;
}
inline void pooling_kernel(int leftAnchorX, int leftAnchorY, int rightAnchorX,
int rightAnchorY, int xStride, int yStride,
int from_m, int from_n, int to_m, int to_n,
float*& from, float*& to) {
if ((rightAnchorX - leftAnchorX) % xStride != 0 ||
(rightAnchorY - leftAnchorY) % yStride != 0)
exit(-1);
if (leftAnchorX % xStride != 0 || leftAnchorY % yStride != 0)
exit(-2);
for (int i = leftAnchorX; i < rightAnchorX; i += xStride) {
for (int j = leftAnchorY; j < rightAnchorY; j += yStride) {
// seed the running max with the first element of the window;
// to[] may be uninitialized on worker ranks, so do not read from it here
float temp = from[i * from_n + j];
for (int r = i; r < i + xStride; r++)
for (int c = j; c < j + yStride; c++) {
temp = max(temp, from[r * from_n + c]);
}
to[i / xStride * to_n + j / yStride] = temp;
}
}
}
void mpi_pooling(int m, int n, int xstride, int ystride, float*& mat,
float*& res, int rank, int worldsize) {
if (m % xstride || n % ystride) {
cout << "matrix size and stride do not match \n";
return;
}
const int xstrides_per_proc = (m / xstride) / worldsize;
int strides;
if (rank == 0) {
MPI_Request* sendRequest = new MPI_Request[worldsize];
MPI_Status* status = new MPI_Status[worldsize];
for (int i = 1; i < worldsize; i++) {
strides = (i < worldsize - 1)
? xstrides_per_proc
: (m / xstride) - xstrides_per_proc * (worldsize - 1);
MPI_Isend(&mat[i * xstrides_per_proc * xstride * n],
strides * xstride * n, MPI_FLOAT, i, 0, MPI_COMM_WORLD,
&sendRequest[i]);
}
for (int i = 1; i < worldsize; i++) {
MPI_Wait(&sendRequest[i], &status[i]);
}
delete[] sendRequest;
delete[] status;
}
else {
MPI_Status status;
strides = (rank < worldsize - 1)
? xstrides_per_proc
: (m / xstride) - xstrides_per_proc * (worldsize - 1);
mat = new float[strides * xstride * n];
MPI_Recv(mat, strides * xstride * n, MPI_FLOAT, 0, 0, MPI_COMM_WORLD,
&status);
res = new float[strides * (n / ystride)];
}
MPI_Barrier(MPI_COMM_WORLD);
strides = (rank < worldsize - 1)
? xstrides_per_proc
: (m / xstride) - xstrides_per_proc * (worldsize - 1);
// pooling_kernel(rank * xstrides_per_proc * xstride, 0,
// (rank * xstrides_per_proc + strides) * xstride, n,
// xstride, ystride, m, n, strides, n / ystride, mat, res);
pooling_kernel(0, 0, strides * xstride, n, xstride, ystride,
strides * xstride, n, strides, n / ystride, mat, res);
MPI_Barrier(MPI_COMM_WORLD);
if (rank == 0) {
MPI_Status status;
for (int i = 1; i < worldsize; i++) {
strides = (i < worldsize - 1)
? xstrides_per_proc
: (m / xstride) - xstrides_per_proc * (worldsize - 1);
MPI_Recv(&res[i * xstrides_per_proc * (n / ystride)],
strides * (n / ystride), MPI_FLOAT, i, 0, MPI_COMM_WORLD,
&status);
}
}
else {
MPI_Send(res, strides * (n / ystride), MPI_FLOAT, 0, 0, MPI_COMM_WORLD);
}
MPI_Barrier(MPI_COMM_WORLD);
return;
}
int main(int argc, char* argv[]) {
if (argc != 3) {
cout << "Usage: " << argv[0] << " M N";
exit(-1);
}
int rank;
int worldSize;
MPI_Init(&argc, &argv);
MPI_Comm_size(MPI_COMM_WORLD, &worldSize);
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
int m = atoi(argv[1]);
int n = atoi(argv[2]);
int xstride = 4, ystride = 4;
float* Mat, * resMat;
struct timeval start, stop;
if (rank == 0) {
randMat(m, n, Mat);
randMat(m / xstride, n / ystride, resMat);
}
gettimeofday(&start, NULL);
mpi_pooling(m, n, xstride, ystride, Mat, resMat, rank, worldSize);
gettimeofday(&stop, NULL);
if (rank == 0) {
cout << "mpi pooling: "
<< (stop.tv_sec - start.tv_sec) * 1000 * 1000L +
(stop.tv_usec - start.tv_usec)
<< " us" << endl;
// for (int i = 0; i < min(10, m); i++) {
// for (int j = 0; j < min(10, n); j++)
// cout << Mat[i * k + j] << ' ';
// cout << endl;
// }
}
delete[] Mat;
delete[] resMat;
MPI_Finalize();
}
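Similarly, here is a sequential sketch of the non-overlapping max pooling that pooling.cpp distributes (illustrative only; the file name pool_check.cpp and the test data are made up): the m x n input is cut into xstride x ystride tiles and each tile is reduced to its maximum.
pool_check.cpp
// pool_check.cpp -- standalone sequential reference; compile with: g++ pool_check.cpp -o pool_check
#include <algorithm>
#include <cstdio>
#include <vector>
int main() {
  int m = 8, n = 8, xs = 4, ys = 4;                        // small illustrative sizes
  std::vector<float> in(m * n);
  for (int i = 0; i < m * n; i++) in[i] = (float)(i % 7);  // arbitrary test pattern
  int outR = m / xs, outC = n / ys;
  std::vector<float> out(outR * outC);
  for (int i = 0; i < m; i += xs)
    for (int j = 0; j < n; j += ys) {
      float best = in[i * n + j];                          // seed with the first element of the tile
      for (int r = i; r < i + xs; r++)
        for (int c = j; c < j + ys; c++)
          best = std::max(best, in[r * n + c]);
      out[(i / xs) * outC + (j / ys)] = best;
    }
  std::printf("pooled %dx%d down to %dx%d, out[0] = %g\n", m, n, outR, outC, out[0]);
  return 0;
}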
Makefile
CC = mpic++
CCFLAGS = -O2 -fopenmp
LDFLAGS = -lopenblas
all: gemm conv pooling
gemm: gemm.cpp
${CC} ${CCFLAGS} gemm.cpp -o gemm ${LDFLAGS}
conv: conv.cpp
${CC} ${CCFLAGS} conv.cpp -o conv ${LDFLAGS}
pooling: pooling.cpp
${CC} ${CCFLAGS} pooling.cpp -o pooling ${LDFLAGS}
clean:
rm gemm conv pooling
If you get a permission error when saving the Makefile, exit the editor and retry the command with sudo prefixed.
Alternatively, grant the personal account administrator rights when you set it up (replace zhangsan with your own username):
usermod -aG wheel zhangsan
3. Compiling According to the Lab Manual
Run the following commands to fetch the OpenBLAS package (execute on all four hosts).
Perform this step in the user's home directory, not in the matrix directory.
wget https://github.com/xianyi/OpenBLAS/archive/v0.3.8.tar.gz
tar -zxvf v0.3.8.tar.gz && cd OpenBLAS-0.3.8
make -j2
sudo make PREFIX=/usr/local/openblas install
sudo chmod -R 777 /usr/local/openblas/
sudo ln -s /usr/local/openblas/lib/libopenblas.so /usr/lib/libopenblas.so
vim ~/.bashrc
Append the following two lines at the end of the file:
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/openblas/include
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/openblas/lib
source ~/.bashrc
Then return to the matrix directory and build the lab programs:
cd /home/username/matrix
make
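Once the build succeeds, an optional check (not in the manual) that the binaries really link against the OpenBLAS library installed above:
ldd ./gemm | grep openblas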
4. Host Configuration File
vim /home/zhangsan/matrix/hostfile
ecs-hw-0001:2
ecs-hw-0002:2
ecs-hw-0003:2
ecs-hw-0004:2
If you followed the preparatory experiment exactly, this works as-is; if your nodes were given names other than ecs-hw-0001 and so on, adjust the entries accordingly.
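Before the timed runs, it can help to verify that MPI can actually start processes on all four nodes. Assuming the hostfile above (two slots per node), the following should print each node's hostname twice:
mpirun --hostfile hostfile -np 8 hostname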
5. Running and Monitoring
run.sh
app=${1}
if [ ${app} = "gemm" ]; then
mpirun --hostfile hostfile -np ${2} ./gemm 4024 4024 4024 0
fi
if [ ${app} = "conv" ]; then
mpirun --hostfile hostfile -np ${2} ./conv 4096 4096 ${3}
fi
if [ ${app} = "pooling" ]; then
mpirun --hostfile hostfile -np ${2} ./pooling 1024 1024
fi
To test more cases, modify the matrix sizes and other arguments here.
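For example, assuming the script above is saved as run.sh in the matrix directory next to the compiled binaries, typical invocations look like:
bash run.sh gemm 4
bash run.sh conv 4 1
bash run.sh pooling 8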
6. Possible Problems
bash run.sh gemm 1 and bash run.sh gemm 2 run fine, but bash run.sh gemm 3 fails with an error similar to the one below, and any invocation requesting more than two processes hangs for a long time without producing a result.
Fatal error in PMPI_Barrier: Unknown error class, error stack:
PMPI_Barrier(289).....................: MPI_Barrier(comm=MPI_COMM_WORLD) failed
PMPI_Barrier(275).....................:
MPIR_Barrier_impl(175)................:
MPIR_Barrier_intra_auto(110)..........:
MPIR_Barrier_intra_smp(43)............:
MPIR_Barrier_impl(175)................:
MPIR_Barrier_intra_auto(110)..........:
Solution: exit the current account back to root:
exit
Check the hosts file (check all four nodes):
vim /etc/hosts
You may find that, besides the entries you kept from the preparatory experiment, an extra line has appeared out of nowhere. Comment out that line so that only the information added in the preparatory experiment remains:
192.168.0.224 ecs-hw-0001
192.168.0.4 ecs-hw-0002
192.168.0.73 ecs-hw-0003
192.168.0.191 ecs-hw-0004
Run again and the error no longer occurs.
(Everyone runs into different problems during this experiment; hopefully this helps.)