Ubuntu19.10下TAU的配置及梯形积分法的实现

最新推荐文章于 2022-11-30 16:12:33 发布

JackFishxxx

最新推荐文章于 2022-11-30 16:12:33 发布

阅读量710

点赞数 2

分类专栏：并行计算 文章标签：c++ 并行计算 profiling

本文链接：https://blog.csdn.net/Jacamox/article/details/112571999

版权

并行计算专栏收录该内容

3 篇文章 2 订阅

订阅专栏

前言

TAU是一种可以在Ubuntu下对并行运算的进程进行性能评估的软件，但是国内目前少有中文的经验，特此分享。

一、下载并安装TAU的前置事项

安装Ubuntu19.10，可以选择使用虚拟机（如VMware等）安装。注意：如果是虚拟机请分配多个核心，并提高内存。
(可选，推荐) 安装Java，在命令行中输入指令 sudo apt install openjdk-8-jdk openjdk-8-jre
安装PDT，下载完成后放置到想要安装的位置，使用指令tar -xzvf pdt.tar.gz进行解压，然后在解压的文件夹下进行编译。网址如下：

http://tau.uoregon.edu/pdt.tar.gz

安装openmpi，使用指令 sudo apt install openmpi-bin openmpi-common openmpi-doc
下载TAU

http://tau.uoregon.edu/tau.tgz

二、编译并安装TAU

将压缩包放到对应位置并解压。建议使用指令tar -xzvf tau.tgz，因为图形化界面中右键压缩包解压不会有进度条。
进入TAU 文件夹 cd tau-2.29
接下来你有两种选择：

命令行：在终端输入./configure -mpi …等参数进行配置编译等

图形界面：需要安装java，在终端输入bash ./tau_setup即可，下图即为对应的图形界面

配置完毕后，需要编译：make install -j8。注意，该步骤需要在root环境下进行，否则会提醒没有权限，permission denied
编译好了之后，安装TAU：
./installtau -mpi -mpiinc=… -mpilib=… -tag=openmpi -pdt=… -j8
为了更方便的使用，我们可以配置一下环境变量，原理类似于Windows下的path。

首先需要安装vim：sudo apt-get install vim。
安装完成后，输入：vim ~/.bashrc，按i进行编辑，在最下方加入以下代码。如图红框所示，注意，对应的路径请按照自己的安装位置修改。

配置成功后，即可直接调用tau_cc.sh，pprof，paraprof等指令。

三、编写代码并运行

（1）串行部分

接下来，编写一个串行的梯形积分法程序，并编译运行。串行代码如下：

//serial.c
#include <stdio.h>
#define MAXN 100000

// Integrand evaluated by integ(); its definition follows main.
double f(double x);
// Approximates the integral of f between a and b; its definition follows main.
double integ(double a, double b);
 
/*
 * Entry point of the serial program: integrates f from 0 to 19.9 with
 * integ() and prints the result with ten decimal places.
 *
 * Declared as `int main(void)` because the C standard does not define
 * `void main()` for hosted programs; returns 0 to report success.
 */
int main(void)
{
	double a = 0, b = 19.9;

	printf("%.10lf\n", integ(a, b));
	return 0;
}

/*
 * Approximates the integral of f over [a, b] with the composite trapezoid
 * rule using MAXN equal-width sub-intervals.
 *
 * a, b: lower and upper integration limits.
 * Returns h * ((f(a) + f(b)) / 2 + sum of f over the interior nodes),
 * where h = (b - a) / MAXN.
 */
double integ(double a, double b)
{
	int i, n = MAXN;
	double h = (b - a) / n;              /* width of one trapezoid */
	double approx = (f(a) + f(b)) / 2.0; /* the two endpoints carry weight 1/2 */

	/*
	 * Only the interior nodes x_1 .. x_{n-1} get full weight. Starting the
	 * loop at i = 0 would add f(a) a second time on top of its half weight.
	 */
	for (i = 1; i <= n - 1; i++)
	{
		approx += f(a + i * h);
	}

	return h * approx;
}

/* Integrand of the example: returns the square of x. */
double f(double x)
{
	return x * x;
}

注意，串行程序如果使用tau_cc.sh编译会报错，直接用gcc编译即可；只有并行程序才需要用tau_cc.sh编译。
在这里插入图片描述

（2）并行部分

然后，编写并行的程序。其中，solution1采用的方法是常规的手动分配给每个进程不同的计算内容，并用MPI_Send和MPI_Recv进行汇总。如果n % comm_sz != 0，即n不能被comm_sz整除，就由0号进程把剩下的那部分梯形单独计算出来，再补加到总和中。

//并行solution1.c
#include <stdio.h> 
#include <mpi.h>

/* Integrand used by the trapezoid rule: f(x) = x * x. */
double f(double x)
{
	const double squared = x * x;

	return squared;
}

/*
 * Trapezoid-rule estimate of the integral of f over
 * [left_endpt, right_endpt].
 *
 * left_endpt, right_endpt: limits of the sub-interval.
 * trap_count: number of trapezoids the sub-interval is split into.
 * base_len:   width of each trapezoid.
 *
 * Returns 0.0 when trap_count is not positive. Without that guard an empty
 * range would still contribute (f(left) + f(right)) / 2 * base_len, which
 * is a spurious area (for example when a caller has fewer trapezoids than
 * workers and hands out a share of zero).
 */
double Trap(double left_endpt, double right_endpt, int trap_count, double base_len)
{
	double estimate, x;
	int i;

	if (trap_count <= 0)
	{
		return 0.0;
	}

	/* The two end points carry weight 1/2, interior nodes weight 1. */
	estimate = (f(left_endpt) + f(right_endpt)) / 2.0;
	for (i = 1; i < trap_count; i++)
	{
		x = left_endpt + i * base_len;
		estimate += f(x);
	}

	return estimate * base_len;
}

/*
 * Solution 1: every process integrates its own block of trapezoids with
 * Trap() and the non-zero ranks send their partial result to rank 0 with
 * MPI_Send. Rank 0 collects them with MPI_Recv, adds the trapezoids left
 * over by the integer division n / comm_sz, and prints the total.
 */
int main()
{
	int my_rank, comm_sz, n = 99999, local_n;
	double a = 0.0, b = 19.9, h, local_a, local_b, remain = 0.0;
	double local_int, total_int;
	int source;

	MPI_Init(NULL, NULL);
	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
	MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);

	h = (b - a) / n; /* h is the same for all processes */

	/* Integer division can leave n % comm_sz trapezoids unassigned;
	 * rank 0 adds them back below. */
	local_n = n / comm_sz;

	local_a = a + my_rank * local_n * h;
	local_b = local_a + local_n * h;
	local_int = Trap(local_a, local_b, local_n, h);

	if (my_rank != 0)
	{
		MPI_Send(&local_int, 1, MPI_DOUBLE, 0, 0, MPI_COMM_WORLD);
	}
	else
	{
		total_int = local_int;
		for (source = 1; source < comm_sz; source++)
		{
			MPI_Recv(&local_int, 1, MPI_DOUBLE, source, 0, MPI_COMM_WORLD, MPI_STATUS_IGNORE);
			total_int += local_int;
		}

		/* n is not a multiple of comm_sz: integrate the leftover tail. */
		if (n % comm_sz)
		{
			/* The tail starts right after the last full block, measured
			 * from the lower limit a (not from 0). */
			remain = Trap(a + h * local_n * comm_sz, b, n % comm_sz, h);
		}
		total_int += remain;

		printf("With n = %d trapezoids, our estimate\n", n);
		printf("of the integral from %f to %f = %.10e\n", a, b, total_int);
	}

	MPI_Finalize();
	return 0;
}

solution2使用MPI_Reduce自动完成全局求和（归约）操作，比较方便。分别编译运行后比较两者的性能。

//并行solution2.c
#include <stdio.h>
#include <mpi.h>

/* Function being integrated: the parabola x * x. */
double f(double x)
{
	double y = x;

	y *= x;
	return y;
}

/*
 * Estimates the integral of f over [left_endpt, right_endpt] with the
 * trapezoid rule.
 *
 * left_endpt, right_endpt: limits of the sub-interval.
 * trap_count: number of trapezoids inside the sub-interval.
 * base_len:   width of a single trapezoid.
 *
 * A non-positive trap_count describes an empty range and yields 0.0;
 * otherwise the end-point term alone would be returned as a bogus area.
 */
double Trap(double left_endpt, double right_endpt, int trap_count, double base_len)
{
	double estimate;
	int i;

	if (trap_count <= 0)
	{
		return 0.0;
	}

	/* End points are weighted 1/2; each interior node is weighted 1. */
	estimate = (f(left_endpt) + f(right_endpt)) / 2.0;
	for (i = 1; i < trap_count; i++)
	{
		estimate += f(left_endpt + i * base_len);
	}

	return estimate * base_len;
}

/*
 * Solution 2: every process integrates its own block of trapezoids with
 * Trap() and the partial results are summed onto rank 0 with MPI_Reduce.
 * Rank 0 then adds the trapezoids left over by the integer division
 * n / comm_sz and prints the total.
 */
int main()
{
	int my_rank, comm_sz, n = 99999, local_n;
	double a = 0.0, b = 19.9, h, local_a, local_b, remain = 0.0;
	double local_int, total_int = 0.0;

	MPI_Init(NULL, NULL);
	MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
	MPI_Comm_size(MPI_COMM_WORLD, &comm_sz);

	h = (b - a) / n; /* same trapezoid width on every process */

	/* Integer division can leave n % comm_sz trapezoids unassigned. */
	local_n = n / comm_sz;

	local_a = a + my_rank * local_n * h;
	local_b = local_a + local_n * h;
	local_int = Trap(local_a, local_b, local_n, h);

	MPI_Reduce(&local_int, &total_int, 1, MPI_DOUBLE, MPI_SUM, 0, MPI_COMM_WORLD);

	/*
	 * MPI_Reduce delivers the sum only to the root (rank 0), so the
	 * leftover correction and the output are confined to that rank; the
	 * other ranks must not read total_int.
	 */
	if (my_rank == 0)
	{
		if (n % comm_sz)
		{
			/* The tail starts right after the last full block, measured
			 * from the lower limit a (not from 0). */
			remain = Trap(a + h * local_n * comm_sz, b, n % comm_sz, h);
			total_int += remain;
		}

		printf("With n = %d trapezoids, our estimate\n", n);
		printf("of the integral from %f to %f = %.10e\n", a, b, total_int);
	}

	MPI_Finalize();
	return 0;
}