👍一. 非阻塞通信
非阻塞通信的完成与检测
- any 任意一个
- some 多个
- all 全部完成
Jacobi迭代4:非阻塞
- 阻塞版本分析
需要在不同进程之间传输边界数据。为了让通信和计算能够重叠,可以先计算边界数据,在开启边界数据传输的同时计算其余内部数据。这种方式并非对所有算法都适用;Jacobi 迭代的内部计算与边界通信之间没有依赖关系,因此可以这样做。
#include<stdio.h>
#include<mpi.h>
#include<stdlib.h>
#include<unistd.h>
#define N 8
#define SIZE N / 4
#define T 2
void print_myRows(int, float [][N]);
/*
 * Jacobi iteration, non-blocking halo-exchange version (N = 8, 4 processes).
 *
 * Each rank owns SIZE interior rows; rows 0 and SIZE+1 are ghost rows that
 * mirror the neighbouring ranks' boundary rows.  Per step:
 *   1) compute the rows adjacent to neighbours first,
 *   2) start the halo exchange with MPI_Isend/MPI_Irecv,
 *   3) overlap the remaining copy work with the communication,
 *   4) MPI_Waitall, then refresh the ghost rows for the next step.
 */
int main(int argc, char* argv[]){
    int myid, size;
    float myRows[SIZE + 2][N], myRows2[SIZE + 2][N], tmp[SIZE][N], c[N][N];
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    /* The rank literals 0 and 3 below assume exactly 4 processes. */
    if (size != 4){
        if (0 == myid) fprintf(stderr, "This program requires exactly 4 processes.\n");
        MPI_Finalize();
        return 1;
    }
    MPI_Status status[4], status1;
    MPI_Request request[4];
    /* Initialize: interior cells 0, global boundary cells 8.0. */
    int i, j;
    for (i = 0; i < SIZE + 2; i ++){
        for (j = 0; j < N; j ++){
            myRows[i][j] = myRows2[i][j] = 0;
        }
    }
    if (myid == 0){                      /* global top boundary row */
        for (j = 0; j < N; j ++)
            myRows[1][j] = myRows2[1][j] = 8.0;
    }
    if (myid == 3){                      /* global bottom boundary row (== row SIZE) */
        for (j = 0; j < N; j ++)
            myRows[SIZE][j] = myRows2[SIZE][j] = 8.0;
    }
    for (i = 1; i < SIZE + 1; i ++){     /* left/right boundary columns */
        myRows[i][0] = myRows2[i][0] = 8.0;
        myRows[i][N - 1] = myRows2[i][N - 1] = 8.0;
    }
    /* Halo exchange pattern: 0->1->2->3 (down) and 3->2->1->0 (up).
       MPI_PROC_NULL turns the edge ranks' sends/recvs into no-ops. */
    int step;
    int tag_down = 0, tag_up = 1;
    int up_proc_id = (0 == myid) ? MPI_PROC_NULL : (myid - 1);
    int down_proc_id = (3 == myid) ? MPI_PROC_NULL : (myid + 1);
    /* Fill the ghost rows once before the first step (count N, not a magic 8). */
    MPI_Sendrecv(&myRows[SIZE][0], N, MPI_FLOAT, down_proc_id, tag_down,
                 &myRows[0][0], N, MPI_FLOAT, up_proc_id, tag_down,
                 MPI_COMM_WORLD, &status1);
    MPI_Sendrecv(&myRows[1][0], N, MPI_FLOAT, up_proc_id, tag_up,
                 &myRows[SIZE + 1][0], N, MPI_FLOAT, down_proc_id, tag_up,
                 MPI_COMM_WORLD, &status1);
    for (step = 0; step < T; step ++){
        /* 1) Compute the rows the neighbours need first.  With SIZE == 2
           every owned row borders a neighbour, so the middle ranks compute
           both rows here and there is no separate "interior" loop. */
        if (myid == 0){
            for (int j = 1; j < N - 1; j ++) myRows2[SIZE][j] = 0.25 * (myRows[SIZE - 1][j] + myRows[SIZE + 1][j] + myRows[SIZE][j + 1] + myRows[SIZE][j - 1]);
        }else if (myid == 3){
            for (int j = 1; j < N - 1; j ++) myRows2[1][j] = 0.25 * (myRows[0][j] + myRows[2][j] + myRows[1][j + 1] + myRows[1][j - 1]);
        }else{
            for (int i = 1; i <= SIZE; i ++){
                for (int j = 1; j < N - 1; j ++) myRows2[i][j] = 0.25 * (myRows[i - 1][j] + myRows[i + 1][j] + myRows[i][j + 1] + myRows[i][j - 1]);
            }
        }
        /* 2) Start the halo exchange of the freshly computed rows. */
        MPI_Isend(&myRows2[SIZE][0], N, MPI_FLOAT, down_proc_id, tag_down, MPI_COMM_WORLD, &request[0]);
        MPI_Irecv(&myRows2[0][0], N, MPI_FLOAT, up_proc_id, tag_down, MPI_COMM_WORLD, &request[1]);
        MPI_Isend(&myRows2[1][0], N, MPI_FLOAT, up_proc_id, tag_up, MPI_COMM_WORLD, &request[2]);
        MPI_Irecv(&myRows2[SIZE + 1][0], N, MPI_FLOAT, down_proc_id, tag_up, MPI_COMM_WORLD, &request[3]);
        /* 3) Overlap: copy the new values into myRows while messages fly.
           Rank 0's row 1 and rank 3's row SIZE are fixed boundaries. */
        int begin_row = (0 == myid) ? 2 : 1;
        int end_row = (3 == myid) ? SIZE - 1 : SIZE;
        for (i = begin_row; i <= end_row; i ++){
            for (int j = 1; j < N - 1; j ++){
                myRows[i][j] = myRows2[i][j];
            }
        }
        /* 4) Wait for all four requests to complete. */
        MPI_Waitall(4, &request[0], &status[0]);
        /* BUG FIX: propagate the received halo into myRows.  Without this
           copy, the next step's stencil reads ghost rows left over from the
           very first Sendrecv (stale data).  For edge ranks the MPI_PROC_NULL
           Irecv leaves myRows2's ghost row untouched, so the copy is a no-op. */
        for (int j = 0; j < N; j ++){
            myRows[0][j] = myRows2[0][j];
            myRows[SIZE + 1][j] = myRows2[SIZE + 1][j];
        }
    }
    sleep(myid);                         /* stagger per-rank output */
    print_myRows(myid, myRows);
    /* Pack the SIZE owned rows (ghosts excluded) and gather on rank 0.
       Count is SIZE*N per rank, not a magic 16. */
    for (int i = 1; i < SIZE + 1; i ++){
        for (int j = 0; j < N; j ++){
            tmp[i - 1][j] = myRows[i][j];
        }
    }
    MPI_Gather(tmp, SIZE * N, MPI_FLOAT, c, SIZE * N, MPI_FLOAT, 0, MPI_COMM_WORLD);
    /* Print the assembled N x N grid on rank 0. */
    if (0 == myid){
        fprintf(stderr, "\n收集后结果\n");
        for (int i = 0; i < N; i ++){
            for (int j = 0; j < N; j ++){
                fprintf(stderr, "%.3f\t", c[i][j]);
            }
            fprintf(stderr, "\n");
        }
        fprintf(stderr, "\n");
    }
    MPI_Finalize();
    return 0;
}
/* Print this rank's local block (ghost rows included), then synchronize
   so ranks do not start the next phase before everyone has printed. */
void print_myRows(int myid, float myRows[][N]){
    printf("Result in process %d:\n", myid);
    for (int row = 0; row < SIZE + 2; row++){
        for (int col = 0; col < N; col++)
            printf("%1.3f\t", myRows[row][col]);
        printf("\n");
    }
    MPI_Barrier(MPI_COMM_WORLD);
}
- N=16 的版本,增加了在通信进行的同时迭代计算非边界(内部)数据的部分
- 如果计算剩余时间和通信时间相等,能够很好地隐藏通信时间
- 网格或者矩阵比较大,一般边界计算和通信时间很短,剩余计算花费时间很长
- 所以通过非阻塞通信能够隐藏
- 如果循环次数很多,就需要多次调用 MPI_Isend / MPI_Irecv,而每次调用都要重新经历分配缓存、初始化、分配端口等一系列过程,循环中每一轮都要从头开始
#include<stdio.h>
#include<mpi.h>
#include<stdlib.h>
#include<unistd.h>
#define N 16
#define SIZE N / 4
#define T 2
void print_myRows(int, float [][N]);
/*
 * Jacobi iteration, non-blocking halo-exchange version for N = 16
 * (SIZE = 4 owned rows per rank, 4 processes).  Unlike the N = 8 version,
 * each rank now has interior rows strictly between its two boundary rows,
 * so the interior computation genuinely overlaps the halo exchange.
 */
int main(int argc, char* argv[]){
    int myid, size;
    /* BUG FIX: tmp must hold SIZE rows.  It was tmp[2][N], but the pack
       loop below writes rows 0..SIZE-1 = 0..3 — a stack buffer overflow. */
    float myRows[SIZE + 2][N], myRows2[SIZE + 2][N], tmp[SIZE][N], c[N][N];
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    /* The rank literals 0 and 3 below assume exactly 4 processes. */
    if (size != 4){
        if (0 == myid) fprintf(stderr, "This program requires exactly 4 processes.\n");
        MPI_Finalize();
        return 1;
    }
    MPI_Status status[4], status1;
    MPI_Request request[4];
    /* Initialize: interior cells 0, global boundary cells 8.0. */
    int i, j;
    for (i = 0; i < SIZE + 2; i ++){
        for (j = 0; j < N; j ++){
            myRows[i][j] = myRows2[i][j] = 0;
        }
    }
    if (myid == 0){                      /* global top boundary row */
        for (j = 0; j < N; j ++)
            myRows[1][j] = myRows2[1][j] = 8.0;
    }
    if (myid == 3){
        /* BUG FIX: the global bottom boundary is this rank's last owned row
           (row SIZE), not row SIZE-1.  Row SIZE-1 is recomputed and copied
           back every step, so an 8.0 placed there would be overwritten and
           the bottom boundary condition lost. */
        for (j = 0; j < N; j ++)
            myRows[SIZE][j] = myRows2[SIZE][j] = 8.0;
    }
    for (i = 1; i < SIZE + 1; i ++){     /* left/right boundary columns */
        myRows[i][0] = myRows2[i][0] = 8.0;
        myRows[i][N - 1] = myRows2[i][N - 1] = 8.0;
    }
    /* Halo exchange pattern: 0->1->2->3 (down) and 3->2->1->0 (up). */
    int step;
    int tag_down = 0, tag_up = 1;
    int up_proc_id = (0 == myid) ? MPI_PROC_NULL : (myid - 1);
    int down_proc_id = (3 == myid) ? MPI_PROC_NULL : (myid + 1);
    /* Fill the ghost rows once before the first step. */
    MPI_Sendrecv(&myRows[SIZE][0], N, MPI_FLOAT, down_proc_id, tag_down, &myRows[0][0], N, MPI_FLOAT, up_proc_id, tag_down, MPI_COMM_WORLD, &status1);
    MPI_Sendrecv(&myRows[1][0], N, MPI_FLOAT, up_proc_id, tag_up, &myRows[SIZE + 1][0], N, MPI_FLOAT, down_proc_id, tag_up, MPI_COMM_WORLD, &status1);
    for (step = 0; step < T; step ++){
        /* 1) Compute the rows the neighbours need first, so their exchange
           can start as early as possible. */
        if (myid == 0){
            for (int j = 1; j < N - 1; j ++) myRows2[SIZE][j] = 0.25 * (myRows[SIZE - 1][j] + myRows[SIZE + 1][j] + myRows[SIZE][j + 1] + myRows[SIZE][j - 1]);
        }else if (myid == 3){
            for (int j = 1; j < N - 1; j ++) myRows2[1][j] = 0.25 * (myRows[0][j] + myRows[2][j] + myRows[1][j + 1] + myRows[1][j - 1]);
        }else{
            for (int j = 1; j < N - 1; j ++){
                /* BUG FIX: the left/right stencil neighbours of row 1 live in
                   row 1 itself (was myRows[0][j+1] / myRows[0][j-1], which
                   read the ghost row instead). */
                myRows2[1][j] = 0.25 * (myRows[0][j] + myRows[2][j] + myRows[1][j + 1] + myRows[1][j - 1]);
                myRows2[SIZE][j] = 0.25 * (myRows[SIZE - 1][j] + myRows[SIZE + 1][j] + myRows[SIZE][j + 1] + myRows[SIZE][j - 1]);
            }
        }
        /* 2) Start the halo exchange of the freshly computed rows. */
        MPI_Isend(&myRows2[SIZE][0], N, MPI_FLOAT, down_proc_id, tag_down, MPI_COMM_WORLD, &request[0]);
        MPI_Irecv(&myRows2[0][0], N, MPI_FLOAT, up_proc_id, tag_down, MPI_COMM_WORLD, &request[1]);
        MPI_Isend(&myRows2[1][0], N, MPI_FLOAT, up_proc_id, tag_up, MPI_COMM_WORLD, &request[2]);
        MPI_Irecv(&myRows2[SIZE + 1][0], N, MPI_FLOAT, down_proc_id, tag_up, MPI_COMM_WORLD, &request[3]);
        /* 3) Overlap: compute the interior rows while the messages fly. */
        for (int i = 2; i <= SIZE - 1; i ++){
            for (int j = 1; j < N - 1; j ++){
                myRows2[i][j] = 0.25 * (myRows[i - 1][j] + myRows[i + 1][j] + myRows[i][j + 1] + myRows[i][j - 1]);
            }
        }
        /* Copy the new values into myRows.  Rank 0's row 1 and rank 3's
           row SIZE are fixed boundary rows and are skipped. */
        int begin_row = (0 == myid) ? 2 : 1;
        int end_row = (3 == myid) ? SIZE - 1 : SIZE;
        for (i = begin_row; i <= end_row; i ++){
            for (int j = 1; j < N - 1; j ++){
                myRows[i][j] = myRows2[i][j];
            }
        }
        /* 4) Wait for all four requests to complete. */
        MPI_Waitall(4, &request[0], &status[0]);
        /* BUG FIX: refresh the ghost rows of myRows from the received halo;
           otherwise the next step's boundary stencil reads stale data left
           over from the very first Sendrecv.  Edge ranks' MPI_PROC_NULL
           receives leave myRows2's ghost rows untouched, so this is a no-op
           for them. */
        for (int j = 0; j < N; j ++){
            myRows[0][j] = myRows2[0][j];
            myRows[SIZE + 1][j] = myRows2[SIZE + 1][j];
        }
    }
    sleep(myid);                         /* stagger per-rank output */
    print_myRows(myid, myRows);
    /* Pack the SIZE owned rows (ghosts excluded) and gather on rank 0.
       Count is SIZE*N per rank, not a magic 64. */
    for (int i = 1; i < SIZE + 1; i ++){
        for (int j = 0; j < N; j ++){
            tmp[i - 1][j] = myRows[i][j];
        }
    }
    MPI_Gather(tmp, SIZE * N, MPI_FLOAT, c, SIZE * N, MPI_FLOAT, 0, MPI_COMM_WORLD);
    /* Print the assembled N x N grid on rank 0. */
    if (0 == myid){
        fprintf(stderr, "\n收集后结果\n");
        for (int i = 0; i < N; i ++){
            for (int j = 0; j < N; j ++){
                fprintf(stderr, "%.3f\t", c[i][j]);
            }
            fprintf(stderr, "\n");
        }
        fprintf(stderr, "\n");
    }
    MPI_Finalize();
    return 0;
}
/* Dump the rank's local block, ghost rows included, one row per line,
   then hit a barrier so output phases stay aligned across ranks. */
void print_myRows(int myid, float myRows[][N]){
    printf("Result in process %d:\n", myid);
    int r = 0;
    while (r < SIZE + 2){
        for (int c = 0; c < N; c++){
            printf("%1.3f\t", myRows[r][c]);
        }
        printf("\n");
        r++;
    }
    MPI_Barrier(MPI_COMM_WORLD);
}
- 重复非阻塞通信
- 上述都是一样的,只是每次的发送数据时不同的
Jacobi迭代总结
- 阻塞消息通信版本
- 代码繁琐,存在死锁可能性
- 基于捆绑消息通信版本
- 引入虚拟进程,MPI维护消息发送次序
- 非阻塞消息通信版本
- 优化性能,流程复杂度高
- 重复非阻塞消息通信版本
- 优化通信,提前初始化通信
MPI_Wait/MPI_Test
根据使用时机有不同的效果:MPI_Wait 阻塞等待指定的一个通信请求完成,MPI_Waitall 等待全部请求完成,而 MPI_Test 只做非阻塞的完成性检测,立即返回.