Huawei Lab 1: MPI Matrix Operations

1. Experiment Environment

This lab continues from the configuration completed in the preparatory lab.

After logging in as root, switch to the personal account created earlier:
su - username

Run the following commands to create a matrix directory for all files of this lab and enter it (on all four hosts):
mkdir /home/username/matrix
cd /home/username/matrix

2. Source Code

  gemm.cpp

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <iostream>
#include "mpi.h"
// #include <cblas.h>

using namespace std;

// void CheckStatus(MPI_Status &status) {
//     if (status.MPI_ERROR != MPI_SUCCESS) {
//         cout << MPI::Get_error_class(status.MPI_ERROR);
//     }
// }

void randMat(int rows, int cols, float*& Mat) {
    Mat = new float[rows * cols];
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            Mat[i * cols + j] = 1.0;
}

void openmp_sgemm(int m, int n, int k, float*& leftMat, float*& rightMat, float*& resultMat) {
    // rightMat is transposed
#pragma omp parallel for
    for (int row = 0; row < m; row++) {
        for (int col = 0; col < k; col++) {
            resultMat[row * k + col] = 0.0;
            for (int i = 0; i < n; i++) {
                resultMat[row * k + col] += leftMat[row * n + i] * rightMat[col * n + i];
            }
        }
    }
    return;
}

void blas_sgemm(int m, int n, int k, float*& leftMat, float*& rightMat, float*& resultMat) {
    // cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, m, k, n, 1.0, leftMat, n, rightMat, n, 0.0, resultMat, k);
}

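// mpi_sgemm: 2D block decomposition of resultMat = leftMat * rightMat.
// Rank 0 transposes rightMat, scatters row blocks of leftMat and of the
// transposed rightMat over a rowBlock x colBlock grid of ranks, each active
// rank multiplies its pair of blocks (OpenMP or BLAS), and rank 0 gathers
// the partial results back into resultMat.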
void mpi_sgemm(int m, int n, int k, float*& leftMat, float*& rightMat,
    float*& resultMat, int rank, int worldsize, bool blas) {
    int rowBlock = sqrt(worldsize);
    if (rowBlock * rowBlock > worldsize)
        rowBlock -= 1;
    int colBlock = rowBlock;
    int rowStride = m / rowBlock;
    int colStride = k / colBlock;
    worldsize = rowBlock * colBlock; // we abandon some processes.
    // so best set process to a square number.
    float* res;
    if (rank == 0) {
        float* buf = new float[k * n];
        // transpose right Mat
        for (int r = 0; r < n; r++) {
            for (int c = 0; c < k; c++) {
                buf[c * n + r] = rightMat[r * k + c];
            }
        }
        for (int r = 0; r < k; r++) {
            for (int c = 0; c < n; c++) {
                rightMat[r * n + c] = buf[r * n + c];
            }
        }
        delete[] buf;
        MPI_Request sendRequest[2 * worldsize];
        MPI_Status status[2 * worldsize];
        for (int rowB = 0; rowB < rowBlock; rowB++) {
            for (int colB = 0; colB < colBlock; colB++) {
                rowStride = (rowB == rowBlock - 1) ? m - (rowBlock - 1) * (m / rowBlock)
                    : m / rowBlock;
                colStride = (colB == colBlock - 1) ? k - (colBlock - 1) * (k / colBlock)
                    : k / colBlock;
                int sendto = rowB * colBlock + colB;
                if (sendto == 0)
                    continue;
                MPI_Isend(&leftMat[rowB * (m / rowBlock) * n], rowStride * n, MPI_FLOAT,
                    sendto, 0, MPI_COMM_WORLD, &sendRequest[sendto]);
                MPI_Isend(&rightMat[colB * (k / colBlock) * n], colStride * n,
                    MPI_FLOAT, sendto, 1, MPI_COMM_WORLD,
                    &sendRequest[sendto + worldsize]);
            }
        }
        for (int rowB = 0; rowB < rowBlock; rowB++) {
            for (int colB = 0; colB < colBlock; colB++) {
                int recvfrom = rowB * colBlock + colB;
                if (recvfrom == 0)
                    continue;
                MPI_Wait(&sendRequest[recvfrom], &status[recvfrom]);
                MPI_Wait(&sendRequest[recvfrom + worldsize],
                    &status[recvfrom + worldsize]);
            }
        }
        res = new float[(m / rowBlock) * (k / colBlock)];
    }
    else {
        if (rank < worldsize) {
            MPI_Status status[2];
            rowStride = ((rank / colBlock) == rowBlock - 1)
                ? m - (rowBlock - 1) * (m / rowBlock)
                : m / rowBlock;
            colStride = ((rank % colBlock) == colBlock - 1)
                ? k - (colBlock - 1) * (k / colBlock)
                : k / colBlock;
            if (rank != 0) {
                leftMat = new float[rowStride * n];
                rightMat = new float[colStride * n];
            }
            if (rank != 0) {
                MPI_Recv(leftMat, rowStride * n, MPI_FLOAT, 0, 0, MPI_COMM_WORLD,
                    &status[0]);
                MPI_Recv(rightMat, colStride * n, MPI_FLOAT, 0, 1, MPI_COMM_WORLD,
                    &status[1]);
            }
            res = new float[rowStride * colStride];
        }
    }
    MPI_Barrier(MPI_COMM_WORLD);
    if (rank < worldsize) {
        rowStride = ((rank / colBlock) == rowBlock - 1)
            ? m - (rowBlock - 1) * (m / rowBlock)
            : m / rowBlock;
        colStride = ((rank % colBlock) == colBlock - 1)
            ? k - (colBlock - 1) * (k / colBlock)
            : k / colBlock;
        if (!blas)
            openmp_sgemm(rowStride, n, colStride, leftMat, rightMat, res);
        else
            blas_sgemm(rowStride, n, colStride, leftMat, rightMat, res);
    }
    MPI_Barrier(MPI_COMM_WORLD);
    if (rank == 0) {
        MPI_Status status;
        float* buf = new float[(m - (rowBlock - 1) * (m / rowBlock)) *
            (k - (colBlock - 1) * (k / colBlock))];
        float* temp_res;
        for (int rowB = 0; rowB < rowBlock; rowB++) {
            for (int colB = 0; colB < colBlock; colB++) {
                rowStride = (rowB == rowBlock - 1) ? m - (rowBlock - 1) * (m / rowBlock)
                    : m / rowBlock;
                colStride = (colB == colBlock - 1) ? k - (colBlock - 1) * (k / colBlock)
                    : k / colBlock;
                int recvfrom = rowB * colBlock + colB;
                if (recvfrom != 0) {
                    temp_res = buf;
                    MPI_Recv(temp_res, rowStride * colStride, MPI_FLOAT, recvfrom, 0,
                        MPI_COMM_WORLD, &status);
                }
                else {
                    temp_res = res;
                }
                for (int r = 0; r < rowStride; r++)
                    for (int c = 0; c < colStride; c++)
                        resultMat[rowB * (m / rowBlock) * k + colB * (k / colBlock) +
                        r * k + c] = temp_res[r * colStride + c];
            }
        }
    }
    else {
        rowStride = ((rank / colBlock) == rowBlock - 1)
            ? m - (rowBlock - 1) * (m / rowBlock)
            : m / rowBlock;
        colStride = ((rank % colBlock) == colBlock - 1)
            ? k - (colBlock - 1) * (k / colBlock)
            : k / colBlock;
        if (rank < worldsize)
            MPI_Send(res, rowStride * colStride, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);
    }
    MPI_Barrier(MPI_COMM_WORLD);
    return;
}

int main(int argc, char* argv[]) {
    if (argc != 5) {
        cout << "Usage: " << argv[0] << " M N K use-blas\n";
        exit(-1);
    }
    int rank;
    int worldSize;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &worldSize);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int m = atoi(argv[1]);
    int n = atoi(argv[2]);
    int k = atoi(argv[3]);
    int blas = atoi(argv[4]);
    float* leftMat, * rightMat, * resMat;
    struct timeval start, stop;
    if (rank == 0) {
        randMat(m, n, leftMat);
        randMat(n, k, rightMat);
        randMat(m, k, resMat);
    }
    gettimeofday(&start, NULL);
    mpi_sgemm(m, n, k, leftMat, rightMat, resMat, rank, worldSize, blas);
    gettimeofday(&stop, NULL);
    if (rank == 0) {
        cout << "mpi matmul: "
            << (stop.tv_sec - start.tv_sec) * 1000.0 +
            (stop.tv_usec - start.tv_usec) / 1000.0
            << " ms" << endl;
        for (int i = 0; i < m; i++) {
            for (int j = 0; j < k; j++)
                if (int(resMat[i * k + j]) != n) {
                    cout << resMat[i * k + j] << "error\n";
                    exit(-1);
                }
            // cout << resMat[i * k + j] << ' ';
            // cout << endl;
        }
    }
    MPI_Finalize();
}
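
The block layout used by mpi_sgemm can be previewed without MPI. The following standalone sketch (a hypothetical helper, not part of the lab code; the values of worldsize, m, and k are arbitrary examples) repeats the same floor(sqrt(worldsize)) grid arithmetic and remainder handling as the function above and prints which block of the result each active rank would compute:

// partition_demo.cpp -- hypothetical sketch of the block layout used by mpi_sgemm
#include <math.h>
#include <iostream>
using namespace std;

int main() {
    int worldsize = 4, m = 10, k = 10;      // example values only
    int rowBlock = (int)sqrt((double)worldsize);
    if (rowBlock * rowBlock > worldsize)
        rowBlock -= 1;
    int colBlock = rowBlock;                // ranks >= rowBlock * colBlock stay idle
    for (int rank = 0; rank < rowBlock * colBlock; rank++) {
        int rowB = rank / colBlock, colB = rank % colBlock;
        // the last block in each dimension absorbs the remainder
        int rowStride = (rowB == rowBlock - 1) ? m - (rowBlock - 1) * (m / rowBlock) : m / rowBlock;
        int colStride = (colB == colBlock - 1) ? k - (colBlock - 1) * (k / colBlock) : k / colBlock;
        cout << "rank " << rank << ": " << rowStride << " rows of C from row " << rowB * (m / rowBlock)
             << ", " << colStride << " cols from col " << colB * (k / colBlock) << endl;
    }
    return 0;
}

Compiled with an ordinary g++ (no MPI needed), it shows, for example, that with 4 processes and m = k = 10 each active rank owns a 5 x 5 block of the result.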

        conv.cpp

#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <string.h>
#include <algorithm>
#include <iostream>
#include "mpi.h"
#include <cblas.h>
#include <assert.h>
using namespace std;

void randMat(int rows, int cols, float*& Mat) {
    Mat = new float[rows * cols];
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            Mat[i * cols + j] = 1.0;
}

int get_steps(int kernel, int step, int len) {
    if (kernel > len)
        return 0;
    return (len - kernel) / step + 1;
}

inline void img2col_conv_kernel(int leftAnchorX, int leftAnchorY, int rightAnchorX,
    int rightAnchorY, const int xKernel, const int yKernel, const int xStep,
    const int yStep, float*& img, float*& kernel, float*& conv) {
    int imgRows = rightAnchorX - leftAnchorX,
        imgCols = rightAnchorY - leftAnchorY;
    int convRows = get_steps(xKernel, xStep, imgRows);
    int convCols = get_steps(yKernel, yStep, imgCols);
    float* flattenImg = new float[convRows * convCols * xKernel * yKernel];
    // img2col: copy every kernel-sized patch into one row of flattenImg,
    // so the convolution below becomes a single matrix-vector product
#pragma omp parallel for
    for (int i = leftAnchorX; i <= rightAnchorX - xKernel; i += xStep) {
        for (int r = 0; r < xKernel; r++) {
            for (int j = leftAnchorY; j <= rightAnchorY - yKernel; j += yStep) {
                int pos = (i - leftAnchorX) / xStep * convCols + (j - leftAnchorY) / yStep;
                memcpy(&flattenImg[pos * xKernel * yKernel + r * yKernel],
                    &img[(i + r) * imgCols + j], sizeof(float) * yKernel);
            }
        }
    }
    // #pragma omp parallel for
    // for (int i = 0; i < convRows * convCols; i++) {
    //     conv[i] = 0.0;
    //     for (int j = 0; j < xKernel * yKernel; j++) {
    //         conv[i] += flattenImg[i*xKernel*yKernel + j] * kernel[j];
    //     }
    // }
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans, convRows * convCols, 1, xKernel * yKernel, 1.0,
        flattenImg, xKernel * yKernel, kernel, 1, 0.0, conv, 1);
    // cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans, convRows * convCols, 1, xKernel * yKernel, 1.0, 
    //             flattenImg, xKernel * yKernel, kernel, convRows * convCols, 0.0, conv, 1);
    delete[] flattenImg;
}

inline void naive_conv_kernel(int leftAnchorX, int leftAnchorY, int rightAnchorX,
    int rightAnchorY, const int xKernel, const int yKernel, const int xStep,
    const int yStep, float*& img, float*& kernel, float*& conv) {
    int imgRows = rightAnchorX - leftAnchorX,
        imgCols = rightAnchorY - leftAnchorY;
    int convRows = get_steps(xKernel, xStep, imgRows);
    int convCols = get_steps(yKernel, yStep, imgCols);
#pragma omp parallel for
    for (int i = leftAnchorX; i <= rightAnchorX - xKernel; i += xStep) {
        for (int j = leftAnchorY; j <= rightAnchorY - yKernel; j += yStep) {
            // to[i / xStride * to_n + j] = 0.0;
            int pos = (i - leftAnchorX) / xStep * convCols + (j - leftAnchorY) / yStep;
            conv[pos] = 0.0;
            for (int r = i; r < i + xKernel; r++)
                for (int c = j; c < j + yKernel; c++) {
                    conv[pos] += img[r * imgCols + c] * kernel[(r - i) * yKernel + (c - j)];
                }
        }
    }
}

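// mpi_convolution: splits the image into horizontal strips of output rows.
// Rank 0 sends each rank its strip plus the xKernel - xStep overlapping rows
// it needs, every rank convolves its strip (naive or img2col), and rank 0
// gathers the strips of the result back in order.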
void mpi_convolution(int m, int n, int xKernel, int yKernel, int xStep,
    int yStep, float*& img, float*& kernel, float*& conv,
    int rank, int worldsize, bool img2col) {
    const int total_xsteps = get_steps(xKernel, xStep, m);
    const int total_ysteps = get_steps(yKernel, yStep, n);
    const int xsteps_per_proc = total_xsteps / worldsize;
    const int last_xsteps = total_xsteps - xsteps_per_proc * (worldsize - 1);
    int steps;
    if (rank == 0) {
        MPI_Request* sendRequest = new MPI_Request[worldsize];
        MPI_Status* status = new MPI_Status[worldsize];
        for (int i = 1; i < worldsize; i++) {
            steps = (i == worldsize - 1) ? last_xsteps : xsteps_per_proc;
            MPI_Isend(&img[i * xsteps_per_proc * xStep * n],
                (steps * xStep + xKernel - xStep) * n, MPI_FLOAT, i, 0,
                MPI_COMM_WORLD, &sendRequest[i]);
        }
        for (int i = 1; i < worldsize; i++) {
            MPI_Wait(&sendRequest[i], &status[i]);
        }
        delete[] sendRequest;
        delete[] status;
    }
    else {
        MPI_Status status;
        steps = (rank == worldsize - 1) ? last_xsteps : xsteps_per_proc;
        img = new float[(steps * xStep + xKernel - xStep) * n];
        MPI_Recv(img, (steps * xStep + xKernel - xStep) * n, MPI_FLOAT, 0, 0,
            MPI_COMM_WORLD, &status);
        conv = new float[steps * total_ysteps];
    }
    MPI_Barrier(MPI_COMM_WORLD);
    steps = (rank == worldsize - 1) ? last_xsteps : xsteps_per_proc;
    if (img2col)
        img2col_conv_kernel(0, 0, steps * xStep + xKernel - xStep, n, xKernel, yKernel,
            xStep, yStep, img, kernel, conv);
    else
        naive_conv_kernel(0, 0, steps * xStep + xKernel - xStep, n, xKernel, yKernel,
            xStep, yStep, img, kernel, conv);
    MPI_Barrier(MPI_COMM_WORLD);
    if (rank == 0) {
        MPI_Status status;
        for (int i = 1; i < worldsize; i++) {
            steps = (i == worldsize - 1) ? last_xsteps : xsteps_per_proc;
            MPI_Recv(&conv[i * xsteps_per_proc * total_ysteps],
                steps * total_ysteps, MPI_FLOAT, i, 0, MPI_COMM_WORLD,
                &status);
        }
    }
    else {
        MPI_Send(conv, steps * total_ysteps, MPI_FLOAT, 0, 0, MPI_COMM_WORLD);
    }
    MPI_Barrier(MPI_COMM_WORLD);
    return;
}

int main(int argc, char* argv[]) {
    if (argc != 4) {
        cout << "Usage: " << argv[0] << " M N enabled-img2col\n";
        exit(-1);
    }
    int rank;
    int worldSize;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &worldSize);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int m = atoi(argv[1]);
    int n = atoi(argv[2]);
    int img2col = atoi(argv[3]);
    int xKernel = 3, yKernel = 3;
    int xStep = 1, yStep = 1;
    float* Img, * Conv;
    struct timeval start, stop;
    if (rank == 0) {
        randMat(m, n, Img);
        randMat(get_steps(xKernel, xStep, m), get_steps(yKernel, yStep, n), Conv);
    }
    float* Kernel = new float[xKernel * yKernel];
    for (int i = 0; i < xKernel * yKernel; i++)
        Kernel[i] = 1.0;
    gettimeofday(&start, NULL);
    mpi_convolution(m, n, xKernel, yKernel, xStep, yStep, Img, Kernel, Conv, rank, worldSize, img2col);
    gettimeofday(&stop, NULL);
    if (rank == 0) {
        cout << "mpi convolution: " << (stop.tv_sec - start.tv_sec) * 1000.0 + (stop.tv_usec - start.tv_usec) / 1000.0 << " ms" << endl;
        int convCols = get_steps(yKernel, yStep, n);
        for (int i = 0; i < min(10, get_steps(xKernel, xStep, m)); i++) {
            for (int j = 0; j < min(10, convCols); j++)
                cout << Conv[i * convCols + j] << ' ';
            cout << endl;
        }
    }
    delete[] Img;
    delete[] Conv;
    MPI_Finalize();
}
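
conv.cpp offers two kernels: naive_conv_kernel multiplies each window by the kernel directly, while img2col_conv_kernel first flattens every window into one row of a matrix and then lets cblas_sgemm do a single matrix-vector product. The following single-process sketch (hypothetical, without MPI or BLAS; the sizes are arbitrary examples) shows the img2col idea on a 4x4 image of ones with a 3x3 kernel of ones and stride 1:

// img2col_demo.cpp -- hypothetical single-process illustration of the img2col idea
#include <iostream>
using namespace std;

int main() {
    const int rows = 4, cols = 4, kh = 3, kw = 3;      // image 4x4, kernel 3x3, stride 1
    const int outRows = rows - kh + 1, outCols = cols - kw + 1;
    float img[rows * cols], kernel[kh * kw], flat[outRows * outCols * kh * kw], conv[outRows * outCols];
    for (int i = 0; i < rows * cols; i++) img[i] = 1.0f;
    for (int i = 0; i < kh * kw; i++) kernel[i] = 1.0f;
    // img2col: one row of `flat` per output position, holding the patch it covers
    for (int i = 0; i < outRows; i++)
        for (int j = 0; j < outCols; j++)
            for (int r = 0; r < kh; r++)
                for (int c = 0; c < kw; c++)
                    flat[(i * outCols + j) * kh * kw + r * kw + c] = img[(i + r) * cols + (j + c)];
    // the convolution is now a matrix-vector product (what conv.cpp hands to cblas_sgemm)
    for (int p = 0; p < outRows * outCols; p++) {
        conv[p] = 0.0f;
        for (int q = 0; q < kh * kw; q++) conv[p] += flat[p * kh * kw + q] * kernel[q];
    }
    for (int p = 0; p < outRows * outCols; p++) cout << conv[p] << ' ';  // prints 9 for each of the 4 output positions
    cout << endl;
    return 0;
}

Each of the 2x2 output positions covers nine ones, so the program prints 9 four times, matching what the full img2col path computes for the same data.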

        pooling.cpp

#include <stdio.h>
#include <stdlib.h>
#include <sys/time.h>
#include <algorithm>
#include <iostream>
#include "mpi.h"

using namespace std;

void randMat(int rows, int cols, float*& Mat) {
    Mat = new float[rows * cols];
    for (int i = 0; i < rows; i++)
        for (int j = 0; j < cols; j++)
            Mat[i * cols + j] = 1.0;
}

inline void pooling_kernel(int leftAnchorX, int leftAnchorY, int rightAnchorX,
    int rightAnchorY, int xStride, int yStride,
    int from_m, int from_n, int to_m, int to_n,
    float*& from, float*& to) {
    if ((rightAnchorX - leftAnchorX) % xStride != 0 ||
        (rightAnchorY - leftAnchorY) % yStride != 0)
        exit(-1);
    if (leftAnchorX % xStride != 0 || leftAnchorY % yStride != 0)
        exit(-2);
    for (int i = leftAnchorX; i < rightAnchorX; i += xStride) {
        for (int j = leftAnchorY; j < rightAnchorY; j += yStride) {
            // to[i / xStride * to_n + j] = 0.0;
            float temp = from[i * from_n + j]; // seed with the first window element; res may be uninitialized on non-root ranks
            for (int r = i; r < i + xStride; r++)
                for (int c = j; c < j + yStride; c++) {
                    temp = max(temp, from[r * from_n + c]);
                }
            to[i / xStride * to_n + j / yStride] = temp;
        }
    }
}

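// mpi_pooling: splits the matrix into horizontal strips of complete pooling
// windows. Rank 0 sends each rank its strip, every rank max-pools it locally
// with pooling_kernel, and rank 0 collects the pooled strips back in order.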
void mpi_pooling(int m, int n, int xstride, int ystride, float*& mat,
    float*& res, int rank, int worldsize) {
    if (m % xstride || n % ystride) {
        cout << "matrix size and stride do not match \n";
        return;
    }
    const int xstrides_per_proc = (m / xstride) / worldsize;
    int strides;
    if (rank == 0) {
        MPI_Request* sendRequest = new MPI_Request[worldsize];
        MPI_Status* status = new MPI_Status[worldsize];
        for (int i = 1; i < worldsize; i++) {
            strides = (i < worldsize - 1)
                ? xstrides_per_proc
                : (m / xstride) - xstrides_per_proc * (worldsize - 1);
            MPI_Isend(&mat[i * xstrides_per_proc * xstride * n],
                strides * xstride * n, MPI_FLOAT, i, 0, MPI_COMM_WORLD,
                &sendRequest[i]);
        }
        for (int i = 1; i < worldsize; i++) {
            MPI_Wait(&sendRequest[i], &status[i]);
        }
        delete[] sendRequest;
        delete[] status;
    }
    else {
        MPI_Status status;
        strides = (rank < worldsize - 1)
            ? xstrides_per_proc
            : (m / xstride) - xstrides_per_proc * (worldsize - 1);
        mat = new float[strides * xstride * n];
        MPI_Recv(mat, strides * xstride * n, MPI_FLOAT, 0, 0, MPI_COMM_WORLD,
            &status);
        res = new float[strides * (n / ystride)];
    }
    MPI_Barrier(MPI_COMM_WORLD);
    strides = (rank < worldsize - 1)
        ? xstrides_per_proc
        : (m / xstride) - xstrides_per_proc * (worldsize - 1);
    // pooling_kernel(rank * xstrides_per_proc * xstride, 0,
    //                (rank * xstrides_per_proc + strides) * xstride, n,
    //                xstride, ystride, m, n, strides, n / ystride, mat, res);
    pooling_kernel(0, 0, strides * xstride, n, xstride, ystride,
        strides * xstride, n, strides, n / ystride, mat, res);
    MPI_Barrier(MPI_COMM_WORLD);
    if (rank == 0) {
        MPI_Status status;
        for (int i = 1; i < worldsize; i++) {
            strides = (i < worldsize - 1)
                ? xstrides_per_proc
                : (m / xstride) - xstrides_per_proc * (worldsize - 1);
            MPI_Recv(&res[i * xstrides_per_proc * (n / ystride)],
                strides * (n / ystride), MPI_FLOAT, i, 0, MPI_COMM_WORLD,
                &status);
        }
    }
    else {
        MPI_Send(res, strides * (n / ystride), MPI_FLOAT, 0, 0, MPI_COMM_WORLD);
    }
    MPI_Barrier(MPI_COMM_WORLD);
    return;
}
int main(int argc, char* argv[]) {
    if (argc != 3) {
        cout << "Usage: " << argv[0] << " M N\n";
        exit(-1);
    }
    int rank;
    int worldSize;
    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &worldSize);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    int m = atoi(argv[1]);
    int n = atoi(argv[2]);
    int xstride = 4, ystride = 4;
    float* Mat, * resMat;
    struct timeval start, stop;
    if (rank == 0) {
        randMat(m, n, Mat);
        randMat(m / xstride, n / ystride, resMat);
    }
    gettimeofday(&start, NULL);
    mpi_pooling(m, n, xstride, ystride, Mat, resMat, rank, worldSize);
    gettimeofday(&stop, NULL);
    if (rank == 0) {
        cout << "mpi pooling: "
            << (stop.tv_sec - start.tv_sec) * 1000 * 1000L +
            (stop.tv_usec - start.tv_usec)
            << " us" << endl;
        // for (int i = 0; i < min(10, m); i++) {
        //     for (int j = 0; j < min(10, n); j++)
        //         cout << Mat[i * k + j] << ' ';
        //     cout << endl;
        // }
    }
    delete[] Mat;
    delete[] resMat;
    MPI_Finalize();
}
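
pooling_kernel takes the maximum of every xStride x yStride window. The following single-process sketch (hypothetical, not part of the lab code) applies the same window maximum to a 4x4 matrix filled with 0..15 using 2x2 windows, so the expected maxima are easy to check by hand:

// pooling_demo.cpp -- hypothetical single-process illustration of the max-pooling window
#include <algorithm>
#include <iostream>
using namespace std;

int main() {
    const int m = 4, n = 4, xs = 2, ys = 2;             // 4x4 input, 2x2 windows
    float in[m * n], out[(m / xs) * (n / ys)];
    for (int i = 0; i < m * n; i++) in[i] = (float)i;   // 0..15 so the maxima are easy to see
    for (int i = 0; i < m; i += xs)
        for (int j = 0; j < n; j += ys) {
            float best = in[i * n + j];                 // initialize from the window itself
            for (int r = i; r < i + xs; r++)
                for (int c = j; c < j + ys; c++) best = max(best, in[r * n + c]);
            out[(i / xs) * (n / ys) + (j / ys)] = best;
        }
    for (int i = 0; i < m / xs; i++) {
        for (int j = 0; j < n / ys; j++) cout << out[i * (n / ys) + j] << ' ';
        cout << endl;
    }
    return 0;
}

It prints 5 7 on the first line and 13 15 on the second, i.e. the largest element of each 2x2 window.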

        Makefile

CC = mpic++
CCFLAGS = -O2 -fopenmp
LDFLAGS = -lopenblas

all: gemm conv pooling 

gemm: gemm.cpp
	    ${CC} ${CCFLAGS} gemm.cpp -o gemm ${LDFLAGS}

conv: conv.cpp
	    ${CC} ${CCFLAGS} conv.cpp -o conv ${LDFLAGS}

pooling: pooling.cpp
	    ${CC} ${CCFLAGS} pooling.cpp -o pooling ${LDFLAGS}

clean:
	    rm gemm conv pooling

If you run into a problem where the Makefile cannot be written, exit first and then try prefixing the command with sudo.

Alternatively, make the personal account an administrator when setting it up (replace zhangsan with your own username):

usermod -aG wheel zhangsan

3. Compiling According to the Lab Manual

Run the following command to fetch the OpenBLAS package (on all four hosts):

This step is carried out in the user's home directory, not in the matrix directory.

wget https://github.com/xianyi/OpenBLAS/archive/v0.3.8.tar.gz
Run the following commands to build OpenBLAS (on all four hosts):
tar -zxvf v0.3.8.tar.gz && cd OpenBLAS-0.3.8
make -j2
sudo make PREFIX=/usr/local/openblas install
sudo chmod -R 777 /usr/local/openblas/
Run the following command to finish the installation (on all four hosts):
sudo ln -s /usr/local/openblas/lib/libopenblas.so /usr/lib/libopenblas.so
Run the following command to configure the OpenBLAS environment (on all four hosts):
vim ~/.bashrc
Add the following lines:
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/local/openblas/include
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/openblas/lib
Run the following command to make the environment variables take effect (on all four hosts):
source ~/.bashrc
Run the following command to build the lab programs (on all four hosts):
make
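
If you want to confirm that OpenBLAS is installed and linkable before building the lab programs, a minimal check like the following can help (test_blas.cpp is a hypothetical file name; the compile command assumes the environment variables and the /usr/lib/libopenblas.so symlink set up above):

// test_blas.cpp -- hypothetical sanity check that cblas_sgemm links and runs
#include <cblas.h>
#include <iostream>
using namespace std;

int main() {
    // C = A * B for two 2x2 row-major matrices of ones; every entry of C should be 2
    float A[4] = {1, 1, 1, 1};
    float B[4] = {1, 1, 1, 1};
    float C[4] = {0, 0, 0, 0};
    cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasNoTrans,
                2, 2, 2, 1.0f, A, 2, B, 2, 0.0f, C, 2);
    cout << C[0] << ' ' << C[1] << ' ' << C[2] << ' ' << C[3] << endl;
    return 0;
}

Compile and run it with
g++ test_blas.cpp -o test_blas -lopenblas && ./test_blas
and it should print 2 2 2 2; if the header or library is not found, recheck the .bashrc entries and the symlink.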

4. Host Configuration File

Run the following command to create the host configuration file (on all four hosts; adjust the zhangsan path to your own username):
vim /home/zhangsan/matrix/hostfile
Add the following content:
ecs-hw-0001:2
ecs-hw-0002:2
ecs-hw-0003:2
ecs-hw-0004:2

If you followed the preparatory lab exactly, this works as-is; if your nodes are not named ecs-hw-0001 and so on, adjust the names accordingly.
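
As an optional sanity check (not part of the original manual), the same mpirun invocation style used by run.sh below can launch a trivial command to confirm that all four nodes are reachable:

mpirun --hostfile hostfile -np 8 hostname

If the hostfile and the passwordless SSH set up in the preparatory lab are correct, the names of all four hosts should appear in the output.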

5. Running and Monitoring

Write a run.sh script with the following content:
app=${1}
if [ ${app} = "gemm" ]; then
mpirun --hostfile hostfile -np ${2} ./gemm 4024 4024 4024 0
fi
if [ ${app} = "conv" ]; then
mpirun --hostfile hostfile -np ${2} ./conv 4096 4096 ${3}
fi
if [ ${app} = "pooling" ]; then
mpirun --hostfile hostfile -np ${2} ./pooling 1024 1024
fi

If you want to test more configurations, you can modify these parameters, as shown in the example below.
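
As a hypothetical variation (not required by the manual), changing the gemm line to

mpirun --hostfile hostfile -np ${2} ./gemm 2048 2048 2048 0

multiplies smaller 2048 x 2048 matrices. The last argument selects the BLAS kernel; since the cblas_sgemm call in gemm.cpp above is commented out, leave it at 0 unless you re-enable that call.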

6. Possible Problems

bash run.sh gemm 1 and bash run.sh gemm 2 run fine, but bash run.sh gemm 3 fails with an error similar to the one below, and invocations using more than 2 processes hang for a long time without producing a result.

Fatal error in PMPI_Barrier: Unknown error class, error stack:
PMPI_Barrier(289).....................: MPI_Barrier(comm=MPI_COMM_WORLD) failed
PMPI_Barrier(275).....................: 
MPIR_Barrier_impl(175)................: 
MPIR_Barrier_intra_auto(110)..........: 
MPIR_Barrier_intra_smp(43)............: 
MPIR_Barrier_impl(175)................: 
MPIR_Barrier_intra_auto(110)..........: 

Solution: exit the current account and return to root:

exit

Check the hosts file (check all four hosts):

vim /etc/hosts

You may find that, besides the entries you kept from the earlier setup, an extra line has appeared out of nowhere. Comment out that line and keep only the entries added in the preparatory lab:

192.168.0.224  ecs-hw-0001
192.168.0.4  ecs-hw-0002
192.168.0.73 ecs-hw-0003
192.168.0.191 ecs-hw-0004

After that, it runs without errors.

(Everyone runs into different problems during this lab; hopefully this is of some help.)
 
