ACSE6 L6 Parallel Decomposition

Parallel Decomposition

1. Introduction

  1. A problem can be split up in many different ways so that it can be solved in parallel. The choice of approach may depend on the computer resources available (the number of cores available, the amount of memory per node, the speed of the processors relative to the speed of communication). We will therefore only consider a limited set of general approaches to decomposing a problem.

2. Data Decomposition

Split the data between the processes:
(1) Split the output data
(2) Split the input data
(3) Split both the input and output data

2.1 Split the output data

  1. In many problems, different portions of the output data can be calculated independently as long as the input data is fully known. For example:
    (1) Matrix multiplication - each element of the answer matrix can be calculated independently
    (2) Communication is required to distribute the input data, but not between the nodes
  2. Exercise 1: Data Decomposition: Output
    Write a program that performs a matrix multiplication in parallel. On processor zero, create the 2 matrices to be multiplied (make sure that they have compatible dimensions for multiplication). Make each of the processes responsible for different rows in the answer.
    a) Send the input matrices to all the processes, together with the rows that they are responsible for (decide the decomposition on processor zero). Once the calculation is complete, gather the results back onto processor 0.
    b) You don't actually need to send both matrices in their entirety to the other processes. Can you speed up the communication by sending each process only the data that it needs?
#include <mpi.h>
#include <iomanip>
#include <iostream>
#include <cstdlib>
#include <time.h>
#include <chrono>
using namespace std;


int id, p;
// A * B = C    A-> i_max by k_max   B->k_max by j_max   C->i_max by j_max
int i_max = 100;
int j_max = 200;
int k_max = 150;

double** A_2d;
double* A_1d;
double** B_2d;
double* B_1d;
double** C_2d;
double* C_1d;

int* row_start, * process_chunk;


void setup_continuous_array(double**& array_2d, double*& array_1d, int m, int n) {
	array_1d = new double[m * n];
	array_2d = new double* [m];

	for (int i = 0; i < m; i++) {
		array_2d[i] = &array_1d[i * n];         // index=i*n+j
	}
}

void free_continuous_array(double**& array_2d, double*& array_1d) {
	delete[] array_1d;
	delete[] array_2d;
}

void setup_partition() {
	row_start = new int[p];
	process_chunk = new int[p];
	
	int rows_left = i_max;
	row_start[0] = 0;
	for (int n = 0; n < p - 1; n++) { // assign rows to the first p-1 processes; p is the total number of processes
		int row_assigned = rows_left / (p - n);
		rows_left -= row_assigned;
		row_start[n + 1] = row_start[n] + row_assigned;
		process_chunk[n] = row_assigned;  // number of rows assigned to process n
	}
	process_chunk[p - 1] = i_max - row_start[p - 1];

	if (id == 0) {
		for (int i = 0; i < p; i++) {
			cout << "Process " << i << " :start " << row_start[i] << " " << process_chunk[i] << endl;
		}
	}
}


int main(int argc, char* argv[]) {
	MPI_Init(&argc, &argv);

	MPI_Comm_rank(MPI_COMM_WORLD, &id);
	MPI_Comm_size(MPI_COMM_WORLD, &p);
	srand(time(NULL) + id * 10);

	setup_partition();

	setup_continuous_array(A_2d, A_1d, i_max, k_max);
	setup_continuous_array(B_2d, B_1d, k_max, j_max);
	setup_continuous_array(C_2d, C_1d, i_max, j_max);

	for (int i = 0; i < i_max * k_max; i++) {
		A_1d[i] = ((double)rand()) / ((double)RAND_MAX) * 10.0;
	}

	for (int i = 0; i < k_max * j_max; i++) {
		B_1d[i] = ((double)rand()) / ((double)RAND_MAX) * 10.0;
	}
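
	// Note: in this first version no matrix data is communicated - every process fills its own
	// copies of A and B and computes the full product, and only the row partition is printed.
	// The distribution of rows and the gathering of the results is added in the versions below.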

	for (int i = 0; i < i_max; i++) {
		for (int j = 0; j < j_max; j++) {
			C_2d[i][j] = 0;
			for (int k = 0; k < k_max; k++) {
				C_2d[i][j] += A_2d[i][k] * B_2d[k][j];
			}
		}
	}



	free_continuous_array(A_2d, A_1d);
	free_continuous_array(B_2d, B_1d);
	free_continuous_array(C_2d, C_1d);

	MPI_Finalize();
}
/*
PS D:\桌面\C++ Assi\AMPI\x64\Debug> mpiexec -n 10 AMPI.exe
Process 0 :start 0 10
Process 1 :start 10 10
Process 2 :start 20 10
Process 3 :start 30 10
Process 4 :start 40 10
Process 5 :start 50 10
Process 6 :start 60 10
Process 7 :start 70 10
Process 8 :start 80 10
Process 9 :start 90 10
PS D:\桌面\C++ Assi\AMPI\x64\Debug> mpiexec -n 9 AMPI.exe
Process 0 :start 0 11
Process 1 :start 11 11
Process 2 :start 22 11
Process 3 :start 33 11
Process 4 :start 44 11
Process 5 :start 55 11
Process 6 :start 66 11
Process 7 :start 77 11
Process 8 :start 88 12
*/

Failed attempt from class (the workers send their uninitialised chunk of A to process 0 instead of receiving it from process 0, and the compute loop runs over all i_max rows rather than the local chunk):

#include <mpi.h>
#include <iomanip>
#include <iostream>
#include <cstdlib>
#include <time.h>
#include <chrono>
using namespace std;


int id, p;
// A * B = C    A-> i_max by k_max   B->k_max by j_max   C->i_max by j_max
int i_max = 100;
int j_max = 200;
int k_max = 150;

double** A_2d;
double* A_1d;
double** B_2d;
double* B_1d;
double** C_2d;
double* C_1d;

int* row_start, * process_chunk;
int tag_num = 1;


void setup_continuous_array(double**& array_2d, double*& array_1d, int m, int n) {
	array_1d = new double[m * n];
	array_2d = new double* [m];

	for (int i = 0; i < m; i++) {
		array_2d[i] = &array_1d[i * n];         // index=i*n+j
	}
}

void free_continuous_array(double**& array_2d, double*& array_1d) {
	delete[] array_1d;
	delete[] array_2d;
}

void setup_partition() {
	row_start = new int[p];
	process_chunk = new int[p];

	int rows_left = i_max;
	row_start[0] = 0;
	for (int n = 0; n < p - 1; n++) { // assign rows to the first p-1 processes; p is the total number of processes
		int row_assigned = rows_left / (p - n);
		rows_left -= row_assigned;
		row_start[n + 1] = row_start[n] + row_assigned;
		process_chunk[n] = row_assigned;  // number of rows assigned to process n
	}
	process_chunk[p - 1] = i_max - row_start[p - 1];

	if (id == 0) {
		for (int i = 0; i < p; i++) {
			cout << "Process " << i << " :start " << row_start[i] << " " << process_chunk[i] << endl;
		}
	}
}

void send_matrix(double* data, int m, int n, int row_start, int num_rows,int dest, MPI_Request* request, int& cnt) {
/*
num_rows: number of rows to send
cnt: running count of requests
*/
	MPI_Isend(&data[row_start * n], num_rows * n, MPI_DOUBLE, dest, tag_num, MPI_COMM_WORLD, &request[cnt]);
	cnt++;
}

void recv_matrix(double* data, int m, int n, int row_start, int num_rows, int source, MPI_Request* request, int& cnt) {
	/*
	num_rows: number of rows to receive
	cnt: running count of requests
	*/
	MPI_Irecv(&data[row_start * n], num_rows * n, MPI_DOUBLE, source, tag_num, MPI_COMM_WORLD, &request[cnt]);
	cnt++;
}


int main(int argc, char* argv[]) {
	MPI_Init(&argc, &argv);

	MPI_Comm_rank(MPI_COMM_WORLD, &id);
	MPI_Comm_size(MPI_COMM_WORLD, &p);
	srand(time(NULL) + id * 10);

	setup_partition();

	MPI_Request* requests=nullptr;
	int cnt = 0;


	if (id == 0) {
		setup_continuous_array(A_2d, A_1d, i_max, k_max);
		setup_continuous_array(B_2d, B_1d, k_max, j_max);
		setup_continuous_array(C_2d, C_1d, i_max, j_max);

		for (int i = 0; i < i_max * k_max; i++) {
			A_1d[i] = ((double)rand()) / ((double)RAND_MAX) * 10.0;
		}
		for (int i = 0; i < k_max * j_max; i++) {
			B_1d[i] = ((double)rand()) / ((double)RAND_MAX) * 10.0;
		}

		requests = new MPI_Request[2 * p - 1];
	
		for (int i = 1; i < p; i++) {
			send_matrix(A_1d, i_max, k_max, row_start[i], process_chunk[i], i, requests, cnt);
			recv_matrix(C_1d, i_max, j_max, row_start[i], process_chunk[i], i, requests, cnt);
		}

		MPI_Ibcast(B_1d, k_max * j_max, MPI_DOUBLE, 0, MPI_COMM_WORLD, &requests[cnt]);
		cnt++;

	} else{
		setup_continuous_array(A_2d, A_1d, process_chunk[id], k_max);
		setup_continuous_array(B_2d, B_1d, k_max, j_max);
		setup_continuous_array(C_2d, C_1d, process_chunk[id], j_max);

		requests = new MPI_Request[2];
		int cnt = 0;
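		// this is where it goes wrong: the worker sends its (uninitialised) chunk of A to process 0
		// here, instead of receiving its chunk of A from process 0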
		send_matrix(A_1d, process_chunk[id], k_max, 0, process_chunk[id], 0 ,requests, cnt);
		MPI_Ibcast(B_1d, k_max * j_max, MPI_DOUBLE, 0, MPI_COMM_WORLD, &requests[cnt]);
		cnt++;
		MPI_Waitall(cnt, requests, MPI_STATUSES_IGNORE);
	}
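
	// also wrong: the workers' local arrays only have process_chunk[id] rows, so the compute loop
	// that follows runs over all i_max rows and reads and writes beyond the allocated memory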

	

	for (int i = 0; i < i_max; i++) {
		for (int j = 0; j < j_max; j++) {
			C_2d[i][j] = 0;
			for (int k = 0; k < k_max; k++) {
				C_2d[i][j] += A_2d[i][k] * B_2d[k][j];
			}
		}
	}


	if (id == 0) {
		MPI_Waitall(cnt, requests, MPI_STATUSES_IGNORE);
	}
	else {
		cnt = 0;
		MPI_Send(C_1d, process_chunk[id]*j_max, MPI_DOUBLE,0, tag_num, MPI_COMM_WORLD);

	}

	free_continuous_array(A_2d, A_1d);
	free_continuous_array(B_2d, B_1d);
	free_continuous_array(C_2d, C_1d);

	MPI_Finalize();

}

Working version written after class:

#include <mpi.h>
#include <iomanip>
#include <iostream>
#include <cstdlib>
#include <time.h>
#include <chrono>

//#define DO_TIMING

using namespace std;

int id, p;

//A * B = C			//A -> i_max by k_max   B -> k_max by j_max    C -> i_max by j_max

int i_max = 1000, j_max = 2000, k_max = 1500;
// a 2D view and the underlying contiguous 1D array for each matrix
double** A_2d, * A_1d;
double** B_2d, * B_1d;
double** C_2d, * C_1d;

int* row_start, * process_chunk;

int tag_num = 1;

// note that the array pointers are passed in by reference so they can be allocated here
void setup_continuous_array(double**& array_2d, double*& array_1d, int m, int n)
{
	array_1d = new double[m * n];
	array_2d = new double* [m];

	// each row pointer of the 2D view points to the start of that row within the 1D array
	for (int i = 0; i < m; i++)
		array_2d[i] = &array_1d[i * n];						//index = i*n + j
}

// free the memory
void free_continuous_array(double**& array_2d, double*& array_1d)
{
	delete[] array_1d;
	delete[] array_2d;
}

// work out, for each process, the row it starts at and the number of rows it handles
void setup_partition()
{
	// first row of the matrix handled by each process
	row_start = new int[p];
	// number of rows handled by each process
	process_chunk = new int[p];

	// rows still to be assigned
	int rows_left = i_max;
	row_start[0] = 0;
	for (int n = 0; n < p - 1; n++) // assign rows to the first p-1 processes
	{
		int rows_assigned = rows_left / (p - n);
		rows_left -= rows_assigned;
		row_start[n + 1] = row_start[n] + rows_assigned;
		process_chunk[n] = rows_assigned;
	}
	process_chunk[p - 1] = i_max - row_start[p - 1];

	// print the starting row and the number of rows assigned to each process
#ifndef DO_TIMING
	if (id == 0)
	{
		for (int i = 0; i < p; i++)
			cout << "process " << i << ": start " << row_start[i] << " " << process_chunk[i] << endl;
	}
#endif
}

// m and n are the number of rows and columns of the matrix; requests is an array of MPI_Request handles
void send_matrix(double* data, int m, int n, int row_start, int num_rows, int dest, MPI_Request* requests, int& cnt)
{
	// data is the contiguous 1D array, so &data[row_start * n] is the address of the first element
	// of row row_start; the num_rows * n doubles that follow it are exactly rows row_start to
	// row_start + num_rows - 1, which is why a block of whole rows can be sent in a single message
	MPI_Isend(&data[row_start * n], num_rows * n, MPI_DOUBLE, dest, tag_num, MPI_COMM_WORLD, &requests[cnt]);
	cnt++;
}

void recv_matrix(double* data, int m, int n, int row_start, int num_rows, int source, MPI_Request* requests, int& cnt)
{
	MPI_Irecv(&data[row_start * n], num_rows * n, MPI_DOUBLE, source, tag_num, MPI_COMM_WORLD, &requests[cnt]);
	cnt++;
}

int main(int argc, char* argv[])
{
	MPI_Init(&argc, &argv);

#ifdef DO_TIMING
	MPI_Barrier(MPI_COMM_WORLD);
	auto start = chrono::high_resolution_clock::now();
#endif

	MPI_Comm_rank(MPI_COMM_WORLD, &id);  // rank of this process
	MPI_Comm_size(MPI_COMM_WORLD, &p);   // total number of processes
	srand(time(NULL) + id * 10);

	setup_partition();  // decide which rows each process handles

	MPI_Request* requests = nullptr;	// array of MPI_Request handles for the non-blocking communications
	int cnt = 0;

	if (id == 0)
	{
		// link the 2D views to the 1D storage - writing through the 1D array also fills the 2D view
		setup_continuous_array(A_2d, A_1d, i_max, k_max);
		setup_continuous_array(B_2d, B_1d, k_max, j_max);
		setup_continuous_array(C_2d, C_1d, i_max, j_max);

		// fill A and B with random values
		for (int i = 0; i < i_max * k_max; i++)
			A_1d[i] = ((double)rand()) / ((double)RAND_MAX) * 10.0;
		for (int i = 0; i < k_max * j_max; i++)
			B_1d[i] = ((double)rand()) / ((double)RAND_MAX) * 10.0;

		// three non-blocking operations per worker below, so 3*(p-1) requests are needed
		requests = new MPI_Request[3 * (p - 1)];

		// process 0 keeps its own chunk; only the other processes need to be sent data
		for (int i = 1; i < p; i++)
		{
			send_matrix(A_1d, i_max, k_max, row_start[i], process_chunk[i], i, requests, cnt);
			// B is sent in full (starting from row 0): each process needs all of B but only its own rows of A
			send_matrix(B_1d, k_max, j_max, 0, k_max, i, requests, cnt);
			// receive process i's chunk of C straight into the correct rows of the full C
			recv_matrix(C_1d, i_max, j_max, row_start[i], process_chunk[i], i, requests, cnt);
		}
	}
	else
	{
		// link the 2D views to the 1D storage - the workers only allocate their own rows of A and C, plus all of B
		setup_continuous_array(A_2d, A_1d, process_chunk[id], k_max);
		setup_continuous_array(B_2d, B_1d, k_max, j_max);
		setup_continuous_array(C_2d, C_1d, process_chunk[id], j_max);

		requests = new MPI_Request[2];
		int cnt = 0;
		// the worker receives its chunk into row 0 of its local A, which has exactly process_chunk[id] rows
		recv_matrix(A_1d, process_chunk[id], k_max, 0, process_chunk[id], 0, requests, cnt);
		recv_matrix(B_1d, k_max,             j_max, 0, k_max,             0, requests, cnt);

		// wait for this process's non-blocking receives to complete before computing
		MPI_Waitall(cnt, requests, MPI_STATUSES_IGNORE);
	}

	// every process (including 0) computes its own block of rows of C
	for (int i = 0; i < process_chunk[id]; i++)
		for (int j = 0; j < j_max; j++)
		{
			C_2d[i][j] = 0.0;
			for (int k = 0; k < k_max; k++)
				C_2d[i][j] += A_2d[i][k] * B_2d[k][j];
		}

	if (id == 0)
	{
		// process 0 waits for its sends and for the receives of the C chunks from the workers
		MPI_Waitall(cnt, requests, MPI_STATUSES_IGNORE);
	}
	else
	{
		// workers send their chunk of C back to process 0
		cnt = 0;
		MPI_Send(C_1d, process_chunk[id] * j_max, MPI_DOUBLE, 0, tag_num, MPI_COMM_WORLD);
	}

#ifdef DO_TIMING
	MPI_Barrier(MPI_COMM_WORLD);
	auto finish = chrono::high_resolution_clock::now();
	if (id == 0)
	{
		std::chrono::duration<double> elapsed = finish - start;
		cout << setprecision(5);
		cout << "The code took " << elapsed.count() << "s to run" << endl;
	}
#endif

	free_continuous_array(A_2d, A_1d);
	free_continuous_array(B_2d, B_1d);
	free_continuous_array(C_2d, C_1d);

	MPI_Finalize();
}
/*
PS D:\桌面\C++ Assi\AMPI\x64\Debug> mpiexec -n 3 AMPI.exe
process 0: start 0 33
process 1: start 33 33
process 2: start 66 34
*/

2.2 Split the input data

  1. Different portions of the input data are assigned to different processes. Each contributes to a commonly held solution
  • Either globally known (e.g. a shared memory system) or known to a single master node
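As a small illustration (a sketch of my own, not code from the notes): each process is handed a slice of a large input array and contributes a partial result to a solution held only on the master node - here a simple sum formed with MPI_Scatter and MPI_Reduce.
#include <mpi.h>
#include <cstdlib>
#include <iostream>
using namespace std;

int main(int argc, char* argv[])
{
	MPI_Init(&argc, &argv);
	int id, p;
	MPI_Comm_rank(MPI_COMM_WORLD, &id);
	MPI_Comm_size(MPI_COMM_WORLD, &p);

	const int total = 1000000;				// hypothetical size of the full input
	int chunk = total / p;					// assume any remainder can be ignored for this sketch
	double* full_data = nullptr;
	double* my_data = new double[chunk];

	if (id == 0)							// only the master node holds the full input
	{
		full_data = new double[total];
		for (int i = 0; i < total; i++) full_data[i] = (double)rand() / RAND_MAX;
	}

	// split the input data between the processes
	MPI_Scatter(full_data, chunk, MPI_DOUBLE, my_data, chunk, MPI_DOUBLE, 0, MPI_COMM_WORLD);

	// each process works on its own portion of the input ...
	double partial = 0.0;
	for (int i = 0; i < chunk; i++) partial += my_data[i];

	// ... and contributes to the commonly held solution on the master node
	double sum = 0.0;
	MPI_Reduce(&partial, &sum, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

	if (id == 0) cout << "Sum of the " << total << " inputs: " << sum << endl;

	delete[] my_data;
	if (id == 0) delete[] full_data;
	MPI_Finalize();
}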

2.3 Split both the input and output data

  1. The different portions of the input and output data in a problem may be closely related. For example, when simulating a physical system you may have input data about regions in space or moments in time, and calculate some other properties of the system at those locations or times
  2. Very common approach in both distributed and shared memory systems
    (1) In distributed memory systems this typically requires communication of information at the boundaries of the data
    (2) In shared memory systems it limits the requirement to the blocks of memory associated with the edges of the data regions

2.4 Domain Decomposition

  1. Domain decomposition is a very commonly used example where both the input and output data are split. It is used for the simulation of physical systems
    (1) The system is divided into a set of regions (domains)
    (2) Each process responsible for a different domain
    (3) Communication of data at the edge of domains
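To make this concrete, here is a minimal sketch of my own (not code from the notes) of a 1D domain decomposition: each process owns a slice of a 1D field plus one ghost cell at each end, exchanges the edge values with its neighbours using MPI_Sendrecv, and then updates only its own cells.
#include <mpi.h>

int main(int argc, char* argv[])
{
	MPI_Init(&argc, &argv);
	int id, p;
	MPI_Comm_rank(MPI_COMM_WORLD, &id);
	MPI_Comm_size(MPI_COMM_WORLD, &p);

	const int local_n = 100;							// cells owned by this process (hypothetical size)
	double* u = new double[local_n + 2];				// +2 ghost cells at indices 0 and local_n + 1
	double* u_new = new double[local_n + 2];
	for (int i = 0; i <= local_n + 1; i++) u[i] = (double)id;	// simple initial condition

	int left = (id == 0) ? MPI_PROC_NULL : id - 1;		// no neighbour at the ends of the global domain
	int right = (id == p - 1) ? MPI_PROC_NULL : id + 1;

	// communication of data at the edge of the domains (halo exchange)
	MPI_Sendrecv(&u[1], 1, MPI_DOUBLE, left, 0, &u[local_n + 1], 1, MPI_DOUBLE, right, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
	MPI_Sendrecv(&u[local_n], 1, MPI_DOUBLE, right, 1, &u[0], 1, MPI_DOUBLE, left, 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);

	// each process then updates only its own domain (e.g. a simple smoothing step)
	for (int i = 1; i <= local_n; i++)
		u_new[i] = 0.5 * (u[i - 1] + u[i + 1]);

	delete[] u;
	delete[] u_new;
	MPI_Finalize();
}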

2.5 Load Balancing

  1. To ensure maximum parallel efficiency you don't want some processes sitting idle while others are working. We need to spread the computational load - try to get every process to do the same amount of work
    (1) When splitting you should err on giving one process slightly less to do than the others rather than slightly more
    (2) Typically want all processes to be responsible for the same number of degrees of freedom (e.g. the same number of nodes or elements)
    (3) If the resolution of the simulation is constant in space, this may be equivalent to splitting the regions evenly by size
  2. You also need to try to keep the communication to a minimum at the same time. Reduce the surface area of the regions
  3. It is sometimes difficult to work out analytically the computational cost of different decompositions. A strategy I sometimes use is to determine how much time a process sits idle waiting for communication from the other processes to complete (a timing sketch follows this list)
  4. Load Balancing in Smoothed Particle Hydrodynamics (SPH)
    (1) I have done a large amount of simulation work using SPH - load balancing is key to efficient simulation. This is a simple simulation run on 4 cores - the colours in the bottom video correspond to the processor number
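A rough sketch of my own of that idle-time measurement: time how long each process spends blocked at a synchronisation point and compare the spread across processes - a large spread indicates a poor load balance.
#include <mpi.h>
#include <iostream>
using namespace std;

int main(int argc, char* argv[])
{
	MPI_Init(&argc, &argv);
	int id, p;
	MPI_Comm_rank(MPI_COMM_WORLD, &id);
	MPI_Comm_size(MPI_COMM_WORLD, &p);

	// stand-in for the real work - deliberately uneven so the imbalance shows up
	double work = 0.0;
	for (long i = 0; i < 10000000L * (id + 1); i++) work += 1e-9;

	// measure how long this process waits for the others at the synchronisation point
	double start = MPI_Wtime();
	MPI_Barrier(MPI_COMM_WORLD);
	double idle_time = MPI_Wtime() - start;

	// collect the spread of idle times on process 0
	double max_idle, min_idle;
	MPI_Reduce(&idle_time, &max_idle, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);
	MPI_Reduce(&idle_time, &min_idle, 1, MPI_DOUBLE, MPI_MIN, 0, MPI_COMM_WORLD);
	if (id == 0)
		cout << "Idle time at the barrier: min " << min_idle << " s, max " << max_idle
			<< " s (work check " << work << ")" << endl;

	MPI_Finalize();
}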

2.6 Recursive Decomposition

  1. Progressively split the problem into smaller portions and then combine the solutions.
    (1) Best suited to problems where the computational cost of the solution grows faster than linearly with the problem size, and where the cost of splitting and combining the solutions is relatively low. So called "divide and conquer" algorithms
    (2) Typically used in serial algorithms, but can also be used as the basis of a parallel decomposition
  2. The classic example of recursive decomposition is the quicksort algorithm
    A brute-force sort of n items is O(n^2), but you can split the list in O(n) time and combine the sublists in O(1) time, for a total time of roughly O(n log(n))
    The algorithm consists of the following steps:
    (1) Choose a pivot value from the array (often the first element).
    (2) Move the values smaller than the pivot to its left and the values larger than the pivot to its right
    (3) Repeat the two steps above on the sub-arrays to the left and right of the pivot until every sub-array contains a single element, at which point the whole array is sorted.
  3. Exercise 2: Quicksort Algorithm
    I would like you to implement two parallel versions of the quicksort algorithm. There are two basic tactics that you can use in a parallel decomposition of the quicksort algorithm:
    a) Have one version where instead of a single pivot on the initial division you have p-1 pivots, where p is the number of processes. This produces p lists, with each sub-list the responsibility of one processor. The potential inefficiencies of this approach are that processor zero is responsible for all of the pivoting and, unless the input data is very uniform, the distribution of data between the processes is unlikely to be very even (a rough sketch of this tactic is included after the code below).
    b) The second tactic is to have 2 pivots on each processor. The first processor sends data to two other processors and keeps some of the data for itself (the amount it keeps should be roughly the total amount of data divided by the number of processors). Each of these two processors should do the same until there are no processors left to distribute to. The lists then need to be reassembled in the reverse order.
    Each list on an individual processor is best sorted recursively. For distributing the processes it is best to keep track of the processes that a given process has available to distribute to. This can be sent together with the list of numbers that the process is responsible for. You may need to use MPI_ANY_SOURCE to receive the list that a process is to sort. You can then obtain the source's rank from the status so that the sorted list can be sent back to the appropriate place.
#include <mpi.h>
#include <cmath>
#include <iostream>
#include <cstdlib>
#include <time.h>
#include <vector>

using namespace std;

int id, p, tag_num = 1;
vector<double> sort_list;
vector<double> pivot_list[3];
int list_length;

double Random(void)
{
	return (double)rand() / RAND_MAX;
}

//probably cheaper to actually pass on the range, but in this code I re-determine it for each list
void find_range(double *list, int n, double &min_val, double &max_val)
{
	min_val = list[0];
	max_val = list[0];
	for (int i = 1; i < n; i++)
	{
		if (min_val > list[i]) min_val = list[i];
		if (max_val < list[i]) max_val = list[i];
	}
}

int serial_partition(double *sort_list, int low, int high)
{
	double pivot = sort_list[high]; 
	int i = low;

	for (int j = low; j < high; j++)
	{ 
		if (sort_list[j] <= pivot)
		{
			double temp = sort_list[i];
			sort_list[i] = sort_list[j];
			sort_list[j] = temp;
			i++;  
		}
	}
	double temp = sort_list[i];
	sort_list[i] = sort_list[high];
	sort_list[high] = temp;
	return i;
}

void serial_quicksort(double *sort_list, int low, int high)
{
	if (low < high)
	{
		int pi = serial_partition(sort_list, low, high);

		serial_quicksort(sort_list, low, pi - 1);
		serial_quicksort(sort_list, pi + 1, high);
	}
}

//We could make pivot lists in place, but it is a bit more complex more than with one pivot 
//(where you can just start with the top and bottom of the list as I do in the serial quicksort code)  
//and so I am taking the slightly less efficient option of creating new lists.
//This does not change the computational order, but does require additional memory allocation.
void make_pivot_lists(double *pivots, int n, double *sort_list)
{
	for (int i = 0; i < n; i++)
	{
		if (sort_list[i] < pivots[0])
			pivot_list[0].push_back(sort_list[i]);
		else if (sort_list[i] < pivots[1])
			pivot_list[1].push_back(sort_list[i]);
		else
			pivot_list[2].push_back(sort_list[i]);
	}
}

//note that this can very easily be done recursively
void distribute_and_solve(void)
{
	int first = -1, second = -1, num_pivots=0;
	MPI_Request requests[6];
	int send_minmax_first[2], send_minmax_second[2];
	int p_minmax[2];
	int request_cnt = 0;
	int in_id;

	if (id == 0)
	{
		p_minmax[0] = 1;
		p_minmax[1] = p - 1;
		in_id = -1;
	}
	else
	{
		//No point in non-blocking as only one list to receive and can't calculate without receiving it 
		MPI_Status status;
		MPI_Probe(MPI_ANY_SOURCE, tag_num, MPI_COMM_WORLD, &status);
		MPI_Get_count(&status, MPI_DOUBLE, &list_length);
		in_id = status.MPI_SOURCE;
		sort_list.resize(list_length);
		//Could combine into a single communication using my own MPI_Type
		MPI_Recv(&sort_list[0], list_length, MPI_DOUBLE, in_id, tag_num, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
		MPI_Recv(&p_minmax, 2, MPI_INT, in_id, tag_num + 1, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
	}

	//a min greater than the max indicates nothing to distribute
	int num_to_distribute = p_minmax[1] - p_minmax[0] + 1;
	if (num_to_distribute < 0) num_to_distribute = 0;

	if (p_minmax[0] <= p_minmax[1])
	{
		first = p_minmax[0];
		num_pivots++;
	}
	if (p_minmax[0] + 1 <= p_minmax[1])
	{
		second = p_minmax[0] + 1;
		num_pivots++;
	}

	//receive, send on and calculate
	
	double pivot[2], min_val, max_val;

	if (num_pivots != 0)
	{
		double proportion_kept = 1.0 / (num_to_distribute + 1);
		find_range(&sort_list[0], list_length, min_val, max_val);
			
		pivot[0] = min_val + proportion_kept * (max_val - min_val);
		if (num_pivots == 2)
			pivot[1] = pivot[0] + (max_val - pivot[0]) / 2.0;
		else pivot[1] = max_val * 2.0;	//ensures all definitely in first two pivot lists
		make_pivot_lists(pivot, list_length, &sort_list[0]);
	}

	//proportion of processors to assign weighted by size of list
	int proc_first_num = num_to_distribute;
	if (second != -1)
	{
		proc_first_num = max((int)round((double)pivot_list[1].size() / (pivot_list[1].size() + pivot_list[2].size()) * num_to_distribute), 1);
	}

	//setup both the sends and the receives - use the sending process's rank as the tag on the receive to ensure the correct one is received
	if (first != -1)
	{
		send_minmax_first[0] = p_minmax[0] + num_pivots;
		send_minmax_first[1] = p_minmax[0] + num_pivots + proc_first_num - 2;
		MPI_Isend(&pivot_list[1][0], pivot_list[1].size(), MPI_DOUBLE, first, tag_num, MPI_COMM_WORLD, &requests[request_cnt]);
		request_cnt++;
		MPI_Isend(send_minmax_first, 2, MPI_INT, first, tag_num+1, MPI_COMM_WORLD, &requests[request_cnt]);
		request_cnt++;
		int recv_pos = pivot_list[0].size();
		MPI_Irecv(&sort_list[recv_pos], pivot_list[1].size(), MPI_DOUBLE, first, tag_num + 2 + first, MPI_COMM_WORLD, &requests[request_cnt]);
		request_cnt++;
	}
	if (second != -1)
	{
		send_minmax_second[0] = p_minmax[0] + num_pivots + proc_first_num - 1;
		send_minmax_second[1] = p_minmax[1];
		MPI_Isend(&pivot_list[2][0], pivot_list[2].size(), MPI_DOUBLE, second, tag_num, MPI_COMM_WORLD, &requests[request_cnt]);
		request_cnt++;
		MPI_Isend(send_minmax_second, 2, MPI_INT, second, tag_num + 1, MPI_COMM_WORLD, &requests[request_cnt]);
		request_cnt++;
		int recv_pos = pivot_list[0].size() + pivot_list[1].size();
		MPI_Irecv(&sort_list[recv_pos], pivot_list[2].size(), MPI_DOUBLE, second, tag_num + 2 + second, MPI_COMM_WORLD, &requests[request_cnt]);
		request_cnt++;
	}

	if (num_pivots != 0)
	{
		serial_quicksort(&pivot_list[0][0], 0, pivot_list[0].size()-1);
		for (int i = 0; i < pivot_list[0].size(); i++)
			sort_list[i] = pivot_list[0][i];
	}
	else serial_quicksort(&sort_list[0], 0, sort_list.size()-1);

	MPI_Waitall(request_cnt, requests, MPI_STATUSES_IGNORE);

	//send back
	if (id != 0)
	{
		MPI_Send(&sort_list[0], sort_list.size(), MPI_DOUBLE, in_id, tag_num + 2 + id, MPI_COMM_WORLD);
	}
}


int main(int argc, char *argv[])
{
	MPI_Init(&argc, &argv);

	MPI_Comm_rank(MPI_COMM_WORLD, &id);
	MPI_Comm_size(MPI_COMM_WORLD, &p);
	srand(time(NULL) + id * 1000);

	list_length = 10000000;

	for (int i = 0; i < 3; i++)
		pivot_list[i].reserve(list_length);

	//Assume that list is generated on process zero and so only it knows the size
	if (id == 0)
	{
		
		sort_list.resize(list_length);
		for (int i = 0; i < list_length; i++)
			sort_list[i] = Random()*1000.0;
	}
	else
	{
		sort_list.reserve(list_length);

	}

	distribute_and_solve();

	//Check for out of order elements
	if (id == 0)
	{
		bool check = false;
		for (int i = 1; i < sort_list.size(); i++)
			if (sort_list[i - 1] > sort_list[i])
			{
				check = true;
				break;
			}

		if (check)
			cout << "There are out of order elements" << endl;
		else cout << "All elements are in order" << endl;
	}
	MPI_Finalize();
}
/*
PS D:\桌面\C++ Assi\AMPI\x64\Debug> mpiexec -n 10 AMPI.exe
All elements are in order
*/
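
For tactic (a) in Exercise 2, here is a rough sketch of my own (not the code from class): process 0 chooses p-1 pivots, builds one bucket per process, scatters the buckets with MPI_Scatterv, each process sorts its bucket locally and the sorted buckets are gathered back in order. The evenly spaced pivots assume roughly uniform data, which is exactly the weakness noted in the exercise description.
#include <mpi.h>
#include <algorithm>
#include <vector>
#include <cstdlib>
#include <iostream>
using namespace std;

int main(int argc, char* argv[])
{
	MPI_Init(&argc, &argv);
	int id, p;
	MPI_Comm_rank(MPI_COMM_WORLD, &id);
	MPI_Comm_size(MPI_COMM_WORLD, &p);

	const int n = 1000000;					// hypothetical list size
	vector<double> data, buckets;
	vector<int> counts(p, 0), displs(p, 0);

	if (id == 0)
	{
		data.resize(n);
		for (int i = 0; i < n; i++) data[i] = (double)rand() / RAND_MAX * 1000.0;

		// p-1 evenly spaced pivots over the value range (assumes roughly uniform data)
		vector<double> pivots(p - 1);
		for (int i = 0; i < p - 1; i++) pivots[i] = 1000.0 * (i + 1) / p;

		// partition into p buckets; bucket i holds the values destined for process i
		vector<vector<double>> bucket_lists(p);
		for (int i = 0; i < n; i++)
		{
			int b = (int)(upper_bound(pivots.begin(), pivots.end(), data[i]) - pivots.begin());
			bucket_lists[b].push_back(data[i]);
		}
		for (int i = 0; i < p; i++)
		{
			counts[i] = (int)bucket_lists[i].size();
			displs[i] = (i == 0) ? 0 : displs[i - 1] + counts[i - 1];
			buckets.insert(buckets.end(), bucket_lists[i].begin(), bucket_lists[i].end());
		}
	}

	// tell every process how big its bucket is, then scatter the buckets
	int my_count;
	MPI_Scatter(counts.data(), 1, MPI_INT, &my_count, 1, MPI_INT, 0, MPI_COMM_WORLD);
	vector<double> my_bucket(my_count);
	MPI_Scatterv(buckets.data(), counts.data(), displs.data(), MPI_DOUBLE, my_bucket.data(), my_count, MPI_DOUBLE, 0, MPI_COMM_WORLD);

	// sort the local bucket (serial_quicksort from the code above could be used instead of std::sort)
	sort(my_bucket.begin(), my_bucket.end());

	// gather the sorted buckets back in bucket order - the reassembled list is fully sorted
	MPI_Gatherv(my_bucket.data(), my_count, MPI_DOUBLE, data.data(), counts.data(), displs.data(), MPI_DOUBLE, 0, MPI_COMM_WORLD);

	if (id == 0) cout << "Sorted " << n << " values using " << p << " buckets" << endl;
	MPI_Finalize();
}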

2.7 Exploratory Decomposition

  1. Used to search for the solution in multi-step problems. Has some similarity to data decomposition in that the initial search space is split between processes (note that this may not be a true split of the data, but could be a split of the parameter space)
  2. The next stage generates a new set of data/parameters from the previous stage, which is then split in turn.
    (1) Can be split onto new processes if they are available, otherwise the current processes become responsible for more states
    (2) Processes can be allowed to become available again if their search reaches a dead end
  3. Workshop exercise
    Implement the solution of the 15 puzzle problem in parallel. With only 4 processes the implementation is relatively straightforward, as each of the processes can be responsible for one of the moves from the initial state, with all subsequent moves carried out on those processes. With more nodes available the decomposition is more complicated, as each of the processes needs to send new moves on to available processes while keeping some of the solution for itself. At each step the nodes need to communicate with one another to determine whether a solution has been found. If one is found, the chain of moves required needs to be sent back to the root (a sketch of the simple 4-process version is included after the serial code below).
#include <iostream>

using namespace std;

int initial_grid[4][4] = { {1,2,3,4},{5,-1,6,8},{9,10,7,11},{13,14,15,12} };			//numbers 1- 15 or -1 for blank spot

const int max_moves = 10;

enum class direction
{
	up = 0,
	down,
	left,
	right,
	none
};


bool is_correct(int grid[4][4])
{
	//index i * 4 + j + 1
	for (int i = 0; i < 4; i++)
		for (int j = 0; j < 4; j++)
			if (grid[i][j] != i * 4 + j + 1 && (i != 3 || j != 3))
				return false;

	return true;
}


int find_solution_from_grid(int grid[4][4], int current_move_num, int i_index, int j_index, direction prev_move, direction* direction_list)			//returns steps to solution or -1 for no solution
{
	if (current_move_num >= max_moves)
		return -1;

	direction* new_direction_list[4];
	int ans[4] = { -1,-1,-1,-1 };

	for (int i = 0; i < 4; i++)
		new_direction_list[i] = new direction[max_moves];

	//up
	if (prev_move != direction::down && i_index > 0)
	{
		int new_grid[4][4];
		for (int i = 0; i < 4; i++)
			for (int j = 0; j < 4; j++)
				new_grid[i][j] = grid[i][j];

		new_grid[i_index][j_index] = new_grid[i_index - 1][j_index];
		new_grid[i_index - 1][j_index] = -1;

		if (is_correct(new_grid))
		{
			direction_list[current_move_num] = direction::up;
			for (int i = 0; i < 4; i++)
				delete[] new_direction_list[i];
			return current_move_num + 1;
		}


		for (int i = 0; i < current_move_num; i++)
			new_direction_list[0][i] = direction_list[i];
		new_direction_list[0][current_move_num] = direction::up;

		ans[0] = find_solution_from_grid(new_grid, current_move_num + 1, i_index - 1, j_index, direction::up, new_direction_list[0]);
	}

	//down
	if (prev_move != direction::up && i_index < 3)
	{
		int new_grid[4][4];
		for (int i = 0; i < 4; i++)
			for (int j = 0; j < 4; j++)
				new_grid[i][j] = grid[i][j];

		new_grid[i_index][j_index] = new_grid[i_index + 1][j_index];
		new_grid[i_index + 1][j_index] = -1;

		if (is_correct(new_grid))
		{
			direction_list[current_move_num] = direction::down;
			for (int i = 0; i < 4; i++)
				delete[] new_direction_list[i];
			return current_move_num + 1;
		}

		for (int i = 0; i < current_move_num; i++)
			new_direction_list[1][i] = direction_list[i];
		new_direction_list[1][current_move_num] = direction::down;

		ans[1] = find_solution_from_grid(new_grid, current_move_num + 1, i_index + 1, j_index, direction::down, new_direction_list[1]);
	}

	//left
	if (prev_move != direction::right && j_index > 0)
	{
		int new_grid[4][4];
		for (int i = 0; i < 4; i++)
			for (int j = 0; j < 4; j++)
				new_grid[i][j] = grid[i][j];

		new_grid[i_index][j_index] = new_grid[i_index][j_index - 1];
		new_grid[i_index][j_index - 1] = -1;

		if (is_correct(new_grid))
		{
			direction_list[current_move_num] = direction::left;
			for (int i = 0; i < 4; i++)
				delete[] new_direction_list[i];
			return current_move_num + 1;
		}

		for (int i = 0; i < current_move_num; i++)
			new_direction_list[2][i] = direction_list[i];
		new_direction_list[2][current_move_num] = direction::left;

		ans[2] = find_solution_from_grid(new_grid, current_move_num + 1, i_index, j_index - 1, direction::left, new_direction_list[2]);
	}

	//right
	if (prev_move != direction::left && j_index < 3)
	{
		int new_grid[4][4];
		for (int i = 0; i < 4; i++)
			for (int j = 0; j < 4; j++)
				new_grid[i][j] = grid[i][j];

		new_grid[i_index][j_index] = new_grid[i_index][j_index + 1];
		new_grid[i_index][j_index + 1] = -1;

		if (is_correct(new_grid))
		{
			direction_list[current_move_num] = direction::right;
			for (int i = 0; i < 4; i++)
				delete[] new_direction_list[i];
			return current_move_num + 1;
		}
		for (int i = 0; i < current_move_num; i++)
			new_direction_list[3][i] = direction_list[i];
		new_direction_list[3][current_move_num] = direction::right;

		ans[3] = find_solution_from_grid(new_grid, current_move_num + 1, i_index, j_index + 1, direction::right, new_direction_list[3]);
	}

	int best_index = -1;
	int best_ans = max_moves + 1;

	for (int i = 0; i < 4; i++)
	{
		if (ans[i] != -1 && ans[i] < best_ans)
		{
			best_ans = ans[i];
			best_index = i;
		}
	}

	if (best_index != -1)
	{
		for (int i = 0; i < best_ans; i++)
			direction_list[i] = new_direction_list[best_index][i];
		for (int i = 0; i < 4; i++)
			delete[] new_direction_list[i];
		return best_ans;			//some recursive call has found the solution
	}

	for (int i = 0; i < 4; i++)
		delete[] new_direction_list[i];

	return -1;
}


int main(int argc, char* argv[])
{
	direction* direction_list = new direction[max_moves];
	int ans = find_solution_from_grid(initial_grid, 0, 1, 1, direction::none, direction_list);

	if (ans == -1)
	{
		cout << "No solution found within " << max_moves << " moves" << endl;
	}
	else
	{
		cout << "The answer sequence: ";
		for (int i = 0; i < ans; i++)
		{
			if (direction_list[i] == direction::up)
				cout << "up\t";
			else if (direction_list[i] == direction::down)
				cout << "down\t";
			else if (direction_list[i] == direction::left)
				cout << "left\t";
			else if (direction_list[i] == direction::right)
				cout << "right\t";
		}
	}

	return(0);
}
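
Below is a minimal sketch of my own of the simple 4-process decomposition described in the exercise (assuming p == 4 and that all four first moves are legal, as they are for the grid above). It is intended to be compiled together with the serial code above, reusing initial_grid, is_correct, find_solution_from_grid, max_moves and the direction enum, with this main replacing the serial one. Only the move counts are gathered back; in a full solution the winning move list would also be sent back to the root.
#include <mpi.h>

int main(int argc, char* argv[])
{
	MPI_Init(&argc, &argv);
	int id, p;
	MPI_Comm_rank(MPI_COMM_WORLD, &id);
	MPI_Comm_size(MPI_COMM_WORLD, &p);

	int i_blank = 1, j_blank = 1;						// blank position in initial_grid
	// the first move this rank is responsible for: 0 up, 1 down, 2 left, 3 right (matches the direction enum)
	int di[4] = { -1, 1, 0, 0 }, dj[4] = { 0, 0, -1, 1 };

	int grid[4][4];
	for (int i = 0; i < 4; i++)
		for (int j = 0; j < 4; j++)
			grid[i][j] = initial_grid[i][j];

	// apply this rank's first move
	int ni = i_blank + di[id], nj = j_blank + dj[id];
	grid[i_blank][j_blank] = grid[ni][nj];
	grid[ni][nj] = -1;

	direction* direction_list = new direction[max_moves];
	direction_list[0] = (direction)id;
	int ans;
	if (is_correct(grid)) ans = 1;
	else ans = find_solution_from_grid(grid, 1, ni, nj, (direction)id, direction_list);

	// gather the number of moves found by each rank onto process 0
	int all_ans[4];
	MPI_Gather(&ans, 1, MPI_INT, all_ans, 1, MPI_INT, 0, MPI_COMM_WORLD);

	if (id == 0)
	{
		int best = -1, best_rank = -1;
		for (int i = 0; i < 4; i++)
			if (all_ans[i] != -1 && (best == -1 || all_ans[i] < best))
			{
				best = all_ans[i];
				best_rank = i;
			}
		if (best == -1)
			cout << "No solution found within " << max_moves << " moves" << endl;
		else
			cout << "Best solution uses " << best << " moves, found by process " << best_rank << endl;
	}

	delete[] direction_list;
	MPI_Finalize();
	return 0;
}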

2.8 Speculative Decomposition

  1. Can be used in problems where subsequent tasks depend on the outcome of earlier tasks, e.g. there are two different tasks to be carried out depending on whether the solution of the previous task is true or false
  2. In speculative decomposition you execute all of the possible subsequent tasks without waiting for the result of the earlier task (a small sketch is given below)
  3. Especially advantageous if the earlier task takes more time than, or a similar amount of time to, all of the subsequent tasks
  4. Example
    Simulation of a system with multiple interacting components
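A minimal sketch of my own of the idea (assuming at least 3 processes): process 0 works on the earlier task whose outcome is not yet known, while processes 1 and 2 speculatively compute the two possible follow-up tasks; once the outcome is broadcast, only the matching speculative result is kept.
#include <mpi.h>
#include <iostream>
using namespace std;

int main(int argc, char* argv[])
{
	MPI_Init(&argc, &argv);
	int id, p;
	MPI_Comm_rank(MPI_COMM_WORLD, &id);
	MPI_Comm_size(MPI_COMM_WORLD, &p);

	int outcome = 0;
	double speculative_result = 0.0;

	if (id == 0)
	{
		// the earlier task: an expensive calculation whose result decides which task comes next
		double x = 0.0;
		for (long i = 0; i < 50000000L; i++) x += 1e-9;
		outcome = (x > 0.025) ? 1 : 0;
	}
	else if (id == 1)
	{
		// speculatively perform the task that is needed if the outcome turns out to be false
		for (long i = 0; i < 40000000L; i++) speculative_result += 1e-9;
	}
	else if (id == 2)
	{
		// speculatively perform the task that is needed if the outcome turns out to be true
		for (long i = 0; i < 40000000L; i++) speculative_result += 2e-9;
	}

	// once the earlier task has finished, every process learns the outcome
	MPI_Bcast(&outcome, 1, MPI_INT, 0, MPI_COMM_WORLD);

	// only the result from the process whose speculation matched the outcome is kept
	int owner = (outcome == 1) ? 2 : 1;
	if (id == owner)
		MPI_Send(&speculative_result, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
	if (id == 0)
	{
		double kept_result;
		MPI_Recv(&kept_result, 1, MPI_DOUBLE, owner, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
		cout << "Outcome " << outcome << ": kept the result " << kept_result << " from process " << owner << endl;
	}

	MPI_Finalize();
}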