并行程序高阶矩阵乘法优化 c语言

Marbelious

于 2024-06-12 11:45:00 发布

阅读量319

点赞数 4

文章标签：矩阵 c语言算法

本文链接：https://blog.csdn.net/m0_62960995/article/details/139234689

版权

课程作业，mpi优化高阶矩阵乘法，分块和卡农参考了博客园上的一个大佬

只实现了A*B两个矩阵相乘

vs2022编译，装了最新版msmpi，线程数一般取10

写了详细的代码注释，希望过段时间还能看懂：(

1.串行代码

#include <stdio.h>
#include <stdlib.h>
#include <time.h>

#define row_a 1024
#define col_a 1024
#define row_b 1024
#define col_b 1024

// 矩阵乘法计算并返回运行时间
double matrix_multiplication(int* arrayA, int* arrayB, int* arrayC) {
    clock_t start_time = clock();

    for (int x = 0; x < row_a; x++) {
        for (int y = 0; y < col_b; y++) {
            int sum = 0;
            for (int c = 0; c < col_a; c++) {
                sum += arrayA[x * col_a + c] * arrayB[c * col_b + y];
            }
            arrayC[x * col_b + y] = sum;
        }
    }

    clock_t end_time = clock();
    return ((double)(end_time - start_time)) / CLOCKS_PER_SEC;
}

int main(void) {
    int* arrayA = NULL;
    int* arrayB = NULL;
    int* arrayC = NULL;

    arrayA = (int*)malloc(row_a * col_a * sizeof(int));
    arrayB = (int*)malloc(row_b * col_b * sizeof(int));
    arrayC = (int*)malloc(row_a * col_b * sizeof(int));

    if (arrayA == NULL || arrayB == NULL || arrayC == NULL) {
        printf("内存分配失败\n");
        return -1;
    }

    srand((unsigned int)time(NULL));

    // 生成并打印矩阵A
    printf("矩阵A:\n");
    for (int i = 0; i < row_a; i++) {
        for (int j = 0; j < col_a; j++) {
            arrayA[i * col_a + j] = -10 + rand() % 21;
            printf("%d ", arrayA[i * col_a + j]);
        }
        printf("\n");
    }

    // 生成并打印矩阵B
    printf("矩阵B:\n");
    for (int i = 0; i < row_b; i++) {
        for (int j = 0; j < col_b; j++) {
            arrayB[i * col_b + j] = -10 + rand() % 21;
            printf("%d ", arrayB[i * col_b + j]);
        }
        printf("\n");
    }

    // 调用矩阵乘法函数并获取运行时间
    double total_time = matrix_multiplication(arrayA, arrayB, arrayC, row_a, col_a, col_b);

    // 输出运行时间
    printf("乘法运行时间: %.4f 秒\n", total_time);

    // 释放动态分配的内存
    free(arrayA);
    free(arrayB);
    free(arrayC);

    return 0;
}

2.第一版并行代码

优化功能：使用结构体存储数据及其行列大小

平均分发、广播整个矩阵B和平均收集进行并行

缺点：结构体里用了数组，导致矩阵大的时候内存崩溃

平均分发平均收集限制太大，导致线程数必须和矩阵阶数匹配

没有统计效率需要的代码

#include<stdio.h>
#include<mpi.h>
#include<stdlib.h>
#include<time.h>


#define row_a 10
#define column_a 10
#define row_b 10
#define column_b 10


//结构体存储矩阵行列数
typedef struct {
    int array[row_a][column_a];
    int row;
    int column;
}Array;


//随机数生成矩阵
static void create_array(int row, int column, int array[][10]) {

    for (int i = 0; i < row; i++) {
        for (int j = 0; j < column; j++) {
            array[i][j] = -10 + rand() % 21;
        }
    }
}

//二维数组指针限制了列数
void print_matrix(int array[][10], int row, int column) {
    for (int i = 0; i < row; i++) {
        for (int j = 0; j < column; j++) {
            printf("%d ", array[i][j]);
        }
        printf("\n");
    }
    printf("\n");
}


int main(int argc, char* argv[])
{
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    int arrayA[row_a][column_a] = { 0 }, arrayB[row_b][column_b] = { 0 }, arrayC[row_a][column_b] = { 0 };
    int A_section[column_a] = { 0 };
    int C_section[column_a] = { 0 };

    if (rank == 0) {
        srand((unsigned int)time(NULL));
        create_array(row_a, column_a, &arrayA);
        create_array(row_b, column_b, &arrayB);

        printf("Matrix A:\n");
        print_matrix(&arrayA, row_a, column_a);
        printf("Matrix B:\n");
        print_matrix(&arrayB, row_b, column_b);


    }

    /*** 分发函数使用方法
        参数一：被分发数据的起始位置
        参数二：分发大小
        参数三：分发数据类型
        参数四：接收数据线程的接收地址
        参数五：接收的数据大小
        参数六：接收数据的类型
        参数七：负责分发的线程号
        参数八：通信域名称
    ***/

    MPI_Scatter(&arrayA, 10, MPI_INT, &A_section[0], 10, MPI_INT, 0, MPI_COMM_WORLD);

    /*** 广播函数使用方法
        参数一：被广播数据的起始位置
        参数二：广播大小
        参数三：广播数据类型
        参数四：负责广播的线程号
        参数五：通信域名称
    ***/
    MPI_Bcast(&arrayB, row_b * column_b, MPI_INT, 0, MPI_COMM_WORLD);

    //printf("我是进程%d, 我的第一个数据为：%d\n", rank, A_section[0]);



    // 各个线程运算自己的结果
    for (int i = 0; i < column_b; i++) {
        C_section[i] = 0;
        for (int j = 0; j < column_a; j++) {
            C_section[i] += A_section[j] * arrayB[j][i];
        }
    }

    /*** 收集函数使用方法
        参数一：被收集数据的起始位置
        参数二：收集大小
        参数三：被收集的数据类型
        参数四：被收集数据存储位置的首地址
        参数五：存储的长度
        参数六：存储的数据类型
        参数七：负责收集存储的线程号
        参数八：通信域名称
    ***/
    MPI_Gather(C_section, column_b, MPI_INT, &(arrayC[rank][0]), column_b, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        printf("Result matrix C:\n");
        print_matrix(&arrayC, 10, 10);
    }

    MPI_Finalize();

    return 0;
}

3.第二版并行代码

优化功能：n*m矩阵的乘法

线程不必再等于A矩阵的行数

缺点：还是二维数组，还是内存崩溃

函数太多太笨重

没有统计效率需要的代码

#include <stdio.h>
#include <mpi.h>
#include <stdlib.h>
#include <time.h>

#define row_a 11
#define column_a 5
#define row_b 5
#define column_b 7


//为了实现m*n的乘法，把两个矩阵的生成分开写了
static void create_arrayA(int row, int column, int array[][column_a]) {
    for (int i = 0; i < row; i++) {
        for (int j = 0; j < column; j++) {
            array[i][j] = -10 + rand() % 21;
        }
    }
}

static void create_arrayB(int row, int column, int array[][column_b]) {
    for (int i = 0; i < row; i++) {
        for (int j = 0; j < column; j++) {
            array[i][j] = -10 + rand() % 21;
        }
    }
}

//为了实现m*n的乘法，把两个矩阵的打印也分开写了
void print_matrixA(int arrayA[][column_a], int row, int column) {
    for (int i = 0; i < row; i++) {
        for (int j = 0; j < column; j++) {
            printf("%d ", arrayA[i][j]);
        }
        printf("\n");
    }
    printf("\n");
}

void print_matrixB(int arrayB[][column_b], int row, int column) {
    for (int i = 0; i < row; i++) {
        for (int j = 0; j < column; j++) {
            printf("%d ", arrayB[i][j]);
        }
        printf("\n");
    }
    printf("\n");
}

//笨重的二维指针
void matrix_multiply(int A[][column_a], int B[][column_b], int C[][column_b], int start_row, int end_row) {
    for (int i = start_row; i < end_row; i++) {
        for (int j = 0; j < column_b; j++) {
            C[i][j] = 0;
            for (int k = 0; k < column_a; k++) {
                C[i][j] += A[i][k] * B[k][j];
            }
        }
    }
}

int main(int argc, char* argv[]) {
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    //笨重的初始化，还是二维数组，因为不能又发又存，为了0线程多创了俩矩阵
    int arrayA1[row_a][column_a] = { 0 }, arrayA2[row_a][column_a] = { 0 }, arrayB[row_b][column_b] = { 0 }, arrayC1[row_a][column_b] = { 0 }, arrayC2[row_a][column_b] = { 0 };

    if (rank == 0) {
        srand((unsigned int)time(NULL));
        create_arrayA(row_a, column_a, arrayA1);
        create_arrayB(row_b, column_b, arrayB);

        printf("Matrix A:\n");
        print_matrixA(arrayA1, row_a, column_a);
        printf("Matrix B:\n");
        print_matrixB(arrayB, row_b, column_b);
    }

    int rows_per_proc = row_a / size; //均分的话每个进程保底运算几行
    int extra_rows = row_a % size; //余下的部分计算任务

    /***
    *给scatterv做准备：
        start_row：分配开始的行数，即前面一共分走了几行，加一即为开始地点
        先把均分的总数算出来，余下的先判断一下线程能不能分到，能分到就+1，分不到+0
        end_row：分配结束的行数，就是开始的行数加上自己本身分配的行数
    
    ***/
    int start_row = rank < row_a ? rank * rows_per_proc + (rank < extra_rows ? rank : extra_rows) : 0;
    int end_row = rank < row_a ? start_row + rows_per_proc + (rank < extra_rows ? 1 : 0) : 0;

    /***
    *给scatterv做准备：
        send_counts：发送的数据量，就是发送行数*列大小
        displs：发送数据的首地址，即偏移量
    ***/

    /***
    *给gatherv做准备：
        recv_counts：收集的数据量，算出来之后应该是分配的行数*B矩阵的列数
        recv_displ：被收集数据存储位置的首地址，即偏移量，算算前面的线程占了多少，往后加就行
    ***/

    //我这里为什么要用动态分配来着，好像是因为size是变量，不能当一维数组用，被迫的
    int* send_counts = (int*)malloc(size * sizeof(int));
    int* displs = (int*)malloc(size * sizeof(int));
    int* recv_counts = (int*)malloc(size * sizeof(int));
    int* recv_displs = (int*)malloc(size * sizeof(int));

    //这玩意很奇怪，不加报错，加了一次也没这样退出过。。。
    if (send_counts == NULL || displs == NULL || recv_counts == NULL || recv_displs == NULL) {
        printf("内存分配失败\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    for (int i = 0; i < size; i++) {
        send_counts[i] = (i < row_a) ? (rows_per_proc + (i < extra_rows ? 1 : 0)) * column_a : 0;
        displs[i] = (i < row_a) ? i * rows_per_proc * column_a + (i < extra_rows ? i * column_a : extra_rows * column_a) : 0;
        recv_counts[i] = (i < row_a) ? (rows_per_proc + (i < extra_rows ? 1 : 0)) * column_b : 0;
        recv_displs[i] = (i < row_a) ? i * rows_per_proc * column_b + (i < extra_rows ? i * column_b : extra_rows * column_b) : 0;
    }

    /*** 不均匀分发函数使用方法
        参数一：被分发数据的起始位置
        参数二：分发大小，线程间可能不同，按顺序取用数组中的数据
        参数三：分发数据在发送区的偏移量
        参数四：分发数据类型
        参数五：接收缓存区首地址
        参数六：接收数据的大小
        参数七：接收数据的类型
        参数八：负责分发的线程号
        参数九：通信域名称
    ***/
    MPI_Scatterv(arrayA1, send_counts, displs, MPI_INT, &arrayA2[start_row][0], (end_row - start_row) * column_a, MPI_INT, 0, MPI_COMM_WORLD);

    // 广播矩阵 B 到所有进程
    MPI_Bcast(arrayB, row_b * column_b, MPI_INT, 0, MPI_COMM_WORLD);

    // 计算部分矩阵乘积
    if (rank < row_a) {
        matrix_multiply(arrayA2, arrayB, arrayC2, start_row, end_row);
    }

    /*** 不均匀收集函数使用方法
        参数一：被收集数据的起始位置
        参数二：收集数据的大小
        参数三：收集数据的类型
        参数四：被收集数据的缓存区
        参数五：收集数据的大小
        参数六：收集数据在缓存区储存位置的偏移量
        参数七：收集数据的类型
        参数八：负责收集的线程号
        参数九：通信域名称
    ***/
    MPI_Gatherv(&arrayC2[start_row][0], (end_row - start_row) * column_b, MPI_INT, arrayC1, recv_counts, recv_displs, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        printf("Result matrix C:\n");
        print_matrixB(arrayC1, row_a, column_b);
    }

    // 释放动态分配的内存
    free(send_counts);
    free(displs);
    free(recv_counts);
    free(recv_displs);

    MPI_Finalize();
    return 0;
}

4.第三版并行代码

优化功能：矩阵用了动态分配，一维数组指针，妈妈再也不用担心我的内存不够了

分块矩阵乘法，大大减少了通信时间

增加了计算乘法运行时间和计算并行效率的代码

缺点：小型矩阵的乘法不能做了，但这本来就是高阶矩阵乘法对吧

#include <stdio.h>
#include "mpi.h"
#include <stdlib.h>
#include <math.h>

#define row_a 10
#define column_a 10
#define row_b 10
#define column_b 10
#define SERIAL_TIME 5.0 // 假设串行时间为5秒，可以先运行一下串行代码看着改

//学聪明了，一维数组传行列数不就行了，傻子才用二维(恼
static void create_arrayA(int row, int* array) {
    for (int i = 0; i < column_a; i++) {
        for (int j = 0; j < row; j++) {
            if (j < row_a)
                array[j * column_a + i] = -10 + rand() % 21;
            else
                array[j * column_a + i] = 0;
        }
    }
}

static void create_arrayB(int col, int* array) {
    for (int i = 0; i < column_a; i++) {
        for (int j = 0; j < col; j++) {
            if (j < column_b)
                array[i * col + j] = -10 + rand() % 21;
            else
                array[i * col + j] = 0;
        }
    }
}

static void print_matrix(int* array, int row, int column) {
    for (int i = 0; i < row; i++) {
        for (int j = 0; j < column; j++) {
            printf("%d ", array[i * column + j]);
        }
        printf("\n");
    }
    printf("\n");
}

/***
* 分块矩阵法介绍：
* 两个矩阵，A分成行向量，B分成列向量，A*B的每一项即可看作行列向量的积
* 这样的话就不必广播整个B，而是部分的B，大大缩短通信时间
* 
***/
int main(int argc, char** argv)
{
    //原始矩阵大小
    int M = row_a, N = column_a, K = column_b; 
    int rank, size;
    double start, stop; 
    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    // 算高阶，线程数不能比矩阵阶数小，不然就没法分块了
    if (size > M || size > K) {
        if (rank == 0)
            printf("Error: 线程数大于矩阵阶数！\n");
        MPI_Finalize();
        return 0;
    }

    // 扩展以致线程均分A的行，B的列
    if (M % size != 0) {
        M += size - (M % size);
    }
    //int saveK = K;
    if (K % size != 0) {
        K += size - (K % size);
    }

    int each_row = M / size; // 每个线程计算几行A(扩展后，后同
    int each_col = K / size; // 每个线程计算几列B(扩展后，后同

    // 一维数组指针赛高
    int* Matrix_A = NULL;

    //A行切片的存储地址，B列切片的存储地址，结果矩阵
    int* buffer_A, * buffer_B, * ans, * result_Matrix;

    buffer_A = (int*)malloc(each_row * N * sizeof(int));    //分片A，每个线程分几行，N列，想象一下，A横着切，B竖着切
    buffer_B = (int*)malloc(N * each_col * sizeof(int));    //分片B，每个线程分几列，N行，
    ans = (int*)malloc(each_row * K * sizeof(int));         //保存部分计算结果
    result_Matrix = (int*)malloc(M * K * sizeof(int));      //最终结果矩阵

    if (buffer_A == NULL || buffer_B == NULL || ans == NULL || result_Matrix == NULL) {
        printf("Memory allocation failed\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }

    if (rank == 0) {
        int* Matrix_B;
        Matrix_A = (int*)malloc(M * N * sizeof(int)); 
        Matrix_B = (int*)malloc(N * K * sizeof(int));

        if (Matrix_A == NULL || Matrix_B == NULL) {
            printf("Matrix memory allocation failed\n");
            MPI_Abort(MPI_COMM_WORLD, 1);
        }

        // 初始化，扩展的部分全置零，不影响计算结果
        srand((unsigned int)time(NULL));
        create_arrayA(M, Matrix_A);
        printf("A为:\n");
        print_matrix(Matrix_A, M, N);

        create_arrayB(K, Matrix_B);
        printf("B为:\n");
        print_matrix(Matrix_B, N, K);

        
        //printf("行 blocks: %d, 列 blocks: %d\n", each_row, each_col);

        // 加了统计乘法时间的变量
        start = MPI_Wtime();

        //0线程发之前先把自己的B列块存好
        for (int i = 0; i < N; i++) {
            for (int j = 0; j < each_col; j++) {
                buffer_B[i * each_col + j] = Matrix_B[i * K + j]; 
            }
        }

        /***
        * send函数的用法：
        * 参数一：发送区首地址
        * 参数二：发送的数据大小
        * 参数三：发送的数据类型
        * 参数四：接收方的线程号
        * 参数五：消息类型，说实话这个译名，不太懂，应该是发的比较多，用来区分接收顺序的
        * 参数六：通信域
        ***/
        for (int p = 1; p < size; p++) {

            //线程号*每个线程要算的列数=偏移量，因为前面的分给别人了，下面的i*k，分块B是列向量，要分成好几行发，一行发each_col个
            for (int i = 0; i < N; i++) {
            int beginCol = p * each_col;  
                MPI_Send(Matrix_B + i * K + beginCol, each_col, MPI_INT, p, i, MPI_COMM_WORLD); 
            }
        }

        free(Matrix_B);
    }

    // 分发函数复习一下？
    MPI_Scatter(Matrix_A, each_row * N, MPI_INT, buffer_A, each_row * N, MPI_INT, 0, MPI_COMM_WORLD);

    if (rank == 0) {
        free(Matrix_A);
    }

    /***
        * recv函数的用法：
        * 参数一：接收区首地址
        * 参数二：接收的数据大小
        * 参数三：发送的数据类型
        * 参数四：发送方的线程号
        * 参数五：消息类型
        * 参数六：通信域
        * 参数七：接收状态，没接到报错误状态码
        ***/
    if (rank != 0) {
        for (int i = 0; i < N; i++) {

            //各个进程接收自己的B列块
            MPI_Recv(buffer_B + i * each_col, each_col, MPI_INT, 0, i, MPI_COMM_WORLD, &status);
        }
    }

    /***
    * sendrecv_replace函数：使用MPI的发送接收替换函数，将当前线程的B块发送到目标线程，并接收从源线程发来的B块
    * 参数一：发送和接收的缓冲区
    * 参数二：发送和接收的数据量，即矩阵B的一个分块的大小
    * 参数三：数据类型
    * 参数四：目标线程的rank号
    * 参数五：发送的消息标签
    * 参数六：源线程的rank号
    * 参数七：接收的消息标签
    * 参数八：通信域
    * 参数九：MPI通信状态
    ***/

    //每个线程的B列块都不同，算完自己的，就把自己的B发给前一个线程，从后一个线程拿新的B，再算一遍，算十次就算完了
    for (int k = 0; k < size; k++) {  

        //比如线程1，执行到第4次，那就该拿线程4的B，(3+1)%10*each_col，代表应该存到线程4那个列块的位置。
        int beginCol = ((k + rank) % size) * each_col; 

        //行列对应元素相乘
        for (int i = 0; i < each_row; i++) {
            for (int j = 0; j < each_col; j++) {
                int temp = 0;
                for (int p = 0; p < N; p++) {
                    temp += buffer_A[i * N + p] * buffer_B[p * each_col + j];
                }

                //分片A乘整个B，结果应该是each.row个1*B.col 的向量，前面的i*k是算行，后面在算列，从4B列块的首地址开始，第j个
                ans[i * K + beginCol + j] = temp;
            }
        }

        // 计算发送B目标线程的 rank 号。
        int dest = (rank - 1 + size) % size; 

        // 计算获取B源线程的 rank 号。
        int src = (rank + 1) % size;


        MPI_Sendrecv_replace(buffer_B, N * each_col, MPI_INT, dest, 0, src, 0, MPI_COMM_WORLD, &status); 
    }

    //收集函数复习一下？
    MPI_Gather(ans, each_row * K, MPI_INT, result_Matrix, each_row * K, MPI_INT, 0, MPI_COMM_WORLD);

    // 算乘法计算总时间和并行效率
    if (rank == 0) {      
        stop = MPI_Wtime();
        double E = (SERIAL_TIME / (stop - start)) / size;
        printf("运算结果为:\n");
        print_matrix(result_Matrix, row_a, column_b);
        printf("计算总时间: %lf seconds, 并行效率: %.2lf%%\n", stop - start, 100 * E);
    }

    // 释放内存
    free(buffer_A);
    free(buffer_B);
    free(ans);
    free(result_Matrix);

    MPI_Finalize();
    return 0;
}

5.第四版并行代码

优化功能：卡农算法，进一步缩短通信时间

缺点：卡农算法适用于n*n矩阵，n*m可以靠扩展解决，但是我懒

线程数必须为完全平方数，限制较大

#include <stdio.h>
#include "mpi.h"
#include <stdlib.h>
#include <math.h>
#include <string.h>


#define row_a 10
#define SERIAL_TIME 5.0 // 假设串行时间为5秒

// 创建矩阵并初始化
static void create_array(int N, int* array) {
    for (int i = 0; i < N; i++) {
        for (int j = 0; j < N; j++) {
            if (i < row_a && j < row_a) {
                array[i * N + j] = -10 + rand() % 21;
            }
            else {
                array[i * N + j] = 0;
            }
        }
    }
}

// 打印矩阵
static void print_matrix(int* array, int row, int column) {
    for (int i = 0; i < row; i++) {
        for (int j = 0; j < column; j++) {
            printf("%d ", array[i * column + j]);
        }
        printf("\n");
    }
    printf("\n");
}

// 获取对应分块的线程号
int get_index(int row, int col, int block) {
    return (row + block) % block * block + (col + block) % block;
}

/***卡农算法计算矩阵A*B ***/
int main(int argc, char** argv)
{
    int N = row_a; // 方阵矩阵阶数
    int rank, size;
    double start, stop; 

    MPI_Status status;

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    int block = (int)sqrt(size); // A B行列分多少块,线程数必须可以开方

    if (size != block * block || block > N) {
        if (rank == 0)
            printf("error process:%d\n", size);
        MPI_Finalize();
        return 0;
    }

    int saveN = N; // 为了A B能均分，扩展一下 
    if (N % block != 0) {
        N -= N % block;
        N += block;
    }

    int line = N / block; // 矩阵A B每块分多少行or列数据
    int my_Row = rank / block; // 每个块当前位置(i,j)
    int my_Col = rank % block; // 每个块当前位置(i,j)

    int* buffer_A, * buffer_B, * ans;
    buffer_A = (int*)malloc(line * line * sizeof(int)); // A的块数据
    buffer_B = (int*)malloc(line * line * sizeof(int)); // B的块数据
    ans = (int*)malloc(line * line * sizeof(int)); // 临时存储每块的结果数据

    if (rank == 0) {
        int* Matrix_A, * Matrix_B;
        Matrix_A = (int*)malloc(N * N * sizeof(int));
        Matrix_B = (int*)malloc(N * N * sizeof(int));

        srand((unsigned int)time(NULL));

        create_array(N, Matrix_A);
        printf("矩阵A为:\n");
        print_matrix(Matrix_A, N, N);

        create_array(N, Matrix_B);
        printf("矩阵B为:\n");
        print_matrix(Matrix_B, N, N);

        printf("block=%d line=%d\n", block, line);

        start = MPI_Wtime();

        // 将AB的每块数据分配给每个进程
        for (int p = 1; p < size; p++) {
            int beginRow = (p / block) * line; // 每个块的行列起始位置(坐标/偏移量)
            int beginCol = (p % block) * line;
            for (int i = 0; i < line; i++) {
                MPI_Send(Matrix_A + (beginRow + i) * N + beginCol, line, MPI_INT, p, 0, MPI_COMM_WORLD);
                MPI_Send(Matrix_B + (beginRow + i) * N + beginCol, line, MPI_INT, p, 1, MPI_COMM_WORLD);
            }
        }
        // 线程0自己的部分直接拷贝过去即可
        for (int i = 0; i < line; i++) {
            memcpy(buffer_A + i * line, Matrix_A + i * N, line * sizeof(int));
            memcpy(buffer_B + i * line, Matrix_B + i * N, line * sizeof(int));
        }

        free(Matrix_A);
        free(Matrix_B);
    }
    else {
        // 其他线程接收A B块数据
        for (int i = 0; i < line; i++) {
            MPI_Recv(buffer_A + i * line, line, MPI_INT, 0, 0, MPI_COMM_WORLD, &status);
            MPI_Recv(buffer_B + i * line, line, MPI_INT, 0, 1, MPI_COMM_WORLD, &status);
        }
    }

    // 执行卡农算法，总共block个块，换完就算完了
    for (int k = 0; k < block; k++) {
        // 计算部分矩阵乘积
        for (int i = 0; i < line; i++) {
            for (int j = 0; j < line; j++) {
                int temp = 0;
                for (int p = 0; p < line; p++) {
                    temp += buffer_A[i * line + p] * buffer_B[p * line + j];
                }
                if (k == 0)
                    ans[i * line + j] = temp;
                else
                    ans[i * line + j] += temp;
            }
        }

        // 收发函数复习一下？

        // 把矩阵A均分为几个块，把自己的块发给同行但上一列的线程，从同行的下一列那块的线程拿数据
        MPI_Sendrecv_replace(buffer_A, line * line, MPI_INT, get_index(my_Row, my_Col - 1, block), 5,
            get_index(my_Row, my_Col + 1, block), 5, MPI_COMM_WORLD, &status);

        // 把矩阵B均分为几个块，把自己的块发给同列但上一行的线程，从同列的下一行那块的线程拿数据
        MPI_Sendrecv_replace(buffer_B, line * line, MPI_INT, get_index(my_Row - 1, my_Col, block), 6,
            get_index(my_Row + 1, my_Col, block), 6, MPI_COMM_WORLD, &status);
    }

    // 发送函数复习一下？
    if (rank != 0) {
        MPI_Send(ans, line * line, MPI_INT, 0, 7, MPI_COMM_WORLD);
    }
    else {
        // 接收其它进程的计算结果
        int* result_Matrix = (int*)malloc(N * N * sizeof(int)); // 保存数据计算结果
        for (int i = 0; i < line; i++) {
            for (int j = 0; j < line; j++) {
                result_Matrix[i * N + j] = ans[i * line + j];
            }
        }
        for (int p = 1; p < size; p++) {
            int beginRow = (p / block) * line;
            int beginCol = (p % block) * line;
            MPI_Recv(ans, line * line, MPI_INT, p, 7, MPI_COMM_WORLD, &status);
            for (int i = 0; i < line; i++) {
                memcpy(result_Matrix + (beginRow + i) * N + beginCol, ans + i * line, line * sizeof(int));
            }
        }

        printf("运算结果为:\n");
        print_matrix(result_Matrix, row_a, row_a);
        stop = MPI_Wtime();
        printf("rank:%d time:%lfs\n", rank, stop - start);
        free(result_Matrix);
    }

    free(buffer_A);
    free(buffer_B);
    free(ans);

    MPI_Finalize();
    return 0;
}