实验目的
搭建MPI并行计算环境,在此基础上运行LAMMPS(Large-scale Atomic/Molecular Massively Parallel Simulator)的一个示例,并比较单机运行和分布式运行的效率。
实验环境
宿主机上创建三台虚拟机,统一配置如下:
安装系统:Ubuntu 20.04;CPU 8核,内存16G;网卡1走管理流量,网卡2走计算流量,网卡3走NFS流量(三块网卡分别通过不同的linux bridge互通)。
| hostname | 角色 | 网卡1 | 网卡2 | 网卡3 |
| mypc | 控制节点/NFS服务器 | 192.168.122.217 | 5.5.5.254/24 | 6.6.6.254/24 |
| hpc-1 | 计算节点1 | 192.168.122.169 | 5.5.5.1/24 | 6.6.6.1/24 |
| hpc-2 | 计算节点2 | 192.168.122.117 | 5.5.5.2/24 | 6.6.6.2/24 |
搭建单机LAMMPS运行环境
root@mypc:~/mpi_share/lammps#
#下载压缩包
root@mypc:~/mpi_share/lammps# wget http://www.fftw.org/fftw-3.3.10.tar.gz
root@mypc:~/mpi_share/lammps# wget https://www.mpich.org/static/downloads/4.0.2/mpich-4.0.2.tar.gz
root@mypc:~/mpi_share/lammps# wget https://lammps.sandia.gov/tars/lammps-stable.tar.gz
# 解压
root@mypc:~/mpi_share/lammps# tar -zxvf fftw-3.3.10.tar.gz
root@mypc:~/mpi_share/lammps# tar -zxvf mpich-4.0.2.tar.gz
root@mypc:~/mpi_share/lammps# tar -zxvf lammps-stable.tar.gz
# 创建fftw和mpich安装目录
root@mypc:~/mpi_share/lammps# mkdir fftw3
root@mypc:~/mpi_share/lammps# mkdir mpich3
# 查看
root@mypc:~/mpi_share/lammps# ls -lt
total 207964
drwxr-xr-x 6 root root 4096 11月 1 09:04 mpich3
drwxr-xr-x 14 3328 20001 4096 11月 1 07:54 mpich-4.0.2
drwxr-xr-x 6 root root 4096 11月 1 07:48 fftw3
drwxr-xr-x 18 centec centec 4096 11月 1 07:45 fftw-3.3.10
-rw-r--r-- 1 root root 170633728 10月 31 16:42 lammps-stable.tar.gz
-rw-r--r-- 1 root root 90 10月 31 16:34 mpich-4.0.2.tar.gz.st
drwxrwxr-x 13 root root 4096 8月 6 04:57 lammps-23Jun2022
-rw-r--r-- 1 root root 38137945 4月 9 2022 mpich-4.0.2.tar.gz
-rw-r--r-- 1 root root 4144100 9月 15 2021 fftw-3.3.10.tar.gz
root@mypc:~/mpi_share/lammps#
root@mypc:~/mpi_share/lammps/fftw-3.3.10# ./configure --prefix=/root/mpi_share/lammps/fftw3/ --enable-shared=yes
root@mypc:~/mpi_share/lammps/fftw-3.3.10# make && make install
root@mypc:~/mpi_share/lammps# mkdir mpich3
root@mypc:~/mpi_share/lammps# cd mpich
mpich3/ mpich-4.0.2/
root@mypc:~/mpi_share/lammps# cd mpich-4.0.2/
root@mypc:~/mpi_share/lammps/mpich-4.0.2# ./configure --prefix=/root/mpi_share/lammps/mpich3
root@mypc:~/mpi_share/lammps/mpich-4.0.2# make && make install
# 这个花了一个小时才执行完
安装lammps的Makefile修改如下(养成备份好习惯,方便提取diff和恢复文件)
root@mypc:~/mpi_share/lammps# diff -ruN lammps-23Jun2022/src/MAKE/Makefile.
Makefile.mpi Makefile.mpi_bak Makefile.serial
root@mypc:~/mpi_share/lammps# diff -ruN lammps-23Jun2022/src/MAKE/Makefile.mpi_bak lammps-23Jun2022/src/MAKE/Makefile.mpi
--- lammps-23Jun2022/src/MAKE/Makefile.mpi_bak 2022-11-01 09:08:24.631018791 +0800
+++ lammps-23Jun2022/src/MAKE/Makefile.mpi 2022-11-01 09:12:44.226354660 +0800
@@ -6,12 +6,12 @@
# compiler/linker settings
# specify flags and libraries needed for your compiler
-CC = mpicxx
+CC = g++
CCFLAGS = -g -O3 -std=c++11
SHFLAGS = -fPIC
DEPFLAGS = -M
-LINK = mpicxx
+LINK = g++
LINKFLAGS = -g -O3 -std=c++11
LIB =
SIZE = size
@@ -39,9 +39,9 @@
# PATH = path for MPI library
# LIB = name of MPI library
-MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1
-MPI_PATH =
-MPI_LIB =
+MPI_INC = -DMPICH_SKIP_MPICXX -DOMPI_SKIP_MPICXX=1 -I/root/mpi_share/lammps/mpich3/include
+MPI_PATH = -L/root/mpi_share/lammps/mpich3/lib
+MPI_LIB = -lmpich -lmpl -lpthread
# FFT library
# see discussion in Section 3.5.2 of manual
@@ -50,9 +50,9 @@
# PATH = path for FFT library
# LIB = name of FFT library
-FFT_INC =
-FFT_PATH =
-FFT_LIB =
+FFT_INC = -DFFT_FFTW3 -I/root/mpi_share/lammps/fftw3/include
+FFT_PATH = -L/root/mpi_share/lammps/fftw3/lib
+FFT_LIB = -lfftw3
# JPEG and/or PNG library
# see discussion in Section 3.5.4 of manual
root@mypc:~/mpi_share/lammps#
root@mypc:~/mpi_share/lammps/lammps-23Jun2022/src# ldd lmp_mpi
linux-vdso.so.1 (0x00007ffe603a6000)
libmpi.so.12 => not found
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f9a79cb4000)
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f9a79cae000)
libstdc++.so.6 => /lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007f9a79acc000)
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f9a7997d000)
libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007f9a79962000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f9a7976e000)
/lib64/ld-linux-x86-64.so.2 (0x00007f9a7a260000)
root@mypc:~/mpi_share/lammps/lammps-23Jun2022/src#
root@mypc:~/mpi_share/lammps/lammps-23Jun2022/src# cp /root/mpi_share/lammps/mpich3/lib/libmpi.so.12.2.2 /lib/x86_64-linux-gnu/libmpi.so.12
root@mypc:~/mpi_share/lammps/lammps-23Jun2022/src#
root@mypc:~/mpi_share/lammps/lammps-23Jun2022/src# ldd lmp_mpi
linux-vdso.so.1 (0x00007ffe66edb000)
libmpi.so.12 => /lib/x86_64-linux-gnu/libmpi.so.12 (0x00007f692ef03000)
libpthread.so.0 => /lib/x86_64-linux-gnu/libpthread.so.0 (0x00007f692eee0000)
libdl.so.2 => /lib/x86_64-linux-gnu/libdl.so.2 (0x00007f692eeda000)
libstdc++.so.6 => /lib/x86_64-linux-gnu/libstdc++.so.6 (0x00007f692ecf8000)
libm.so.6 => /lib/x86_64-linux-gnu/libm.so.6 (0x00007f692eba9000)
libgcc_s.so.1 => /lib/x86_64-linux-gnu/libgcc_s.so.1 (0x00007f692eb8e000)
libc.so.6 => /lib/x86_64-linux-gnu/libc.so.6 (0x00007f692e99a000)
libhwloc.so.15 => /lib/x86_64-linux-gnu/libhwloc.so.15 (0x00007f692e949000)
libefa.so.1 => /lib/x86_64-linux-gnu/libefa.so.1 (0x00007f692e93f000)
libibverbs.so.1 => /lib/x86_64-linux-gnu/libibverbs.so.1 (0x00007f692e920000)
libnl-3.so.200 => /lib/x86_64-linux-gnu/libnl-3.so.200 (0x00007f692e8fd000)
librt.so.1 => /lib/x86_64-linux-gnu/librt.so.1 (0x00007f692e8f3000)
/lib64/ld-linux-x86-64.so.2 (0x00007f6931eb4000)
libudev.so.1 => /lib/x86_64-linux-gnu/libudev.so.1 (0x00007f692e8c4000)
libltdl.so.7 => /lib/x86_64-linux-gnu/libltdl.so.7 (0x00007f692e8b9000)
libnl-route-3.so.200 => /lib/x86_64-linux-gnu/libnl-route-3.so.200 (0x00007f692e841000)
root@mypc:~/mpi_share/lammps/lammps-23Jun2022/src#
进入examples/shear/目录(这里的shear指材料受力时的“剪切”形变,而不是气象中的“风切变”;该示例使用Ni的EAM势文件Ni_u3.eam)
root@mypc:~/mpi_share/lammps/lammps-23Jun2022/src# cd ../examples/shear/
root@mypc:~/mpi_share/lammps/lammps-23Jun2022/examples/shear# ls -lt
total 76
-rw-rw-r-- 1 root root 1541 8月 6 04:57 in.shear
-rw-rw-r-- 1 root root 1551 8月 6 04:57 in.shear.void
-rw-rw-r-- 1 root root 8113 8月 6 04:57 log.27Nov18.shear.g++.1
-rw-rw-r-- 1 root root 8123 8月 6 04:57 log.27Nov18.shear.g++.4
-rw-rw-r-- 1 root root 8166 8月 6 04:57 log.27Nov18.shear.void.g++.1
-rw-rw-r-- 1 root root 8164 8月 6 04:57 log.27Nov18.shear.void.g++.4
-rw-rw-r-- 1 root root 36573 8月 6 04:57 Ni_u3.eam
root@mypc:~/mpi_share/lammps/lammps-23Jun2022/examples/shear#
root@mypc:~/mpi_share/lammps/lammps-23Jun2022/examples/shear# mpirun --allow-run-as-root -np 3 /root/mpi_share/lammps/lammps-23Jun2022/src/lmp_mpi -in in.shear
......
ERROR: Unrecognized pair style 'eam' is part of the MANYBODY package which is not enabled in this LAMMPS binary. (../force.cpp:279)
Last command: pair_style eam
查到是编译的时候,MANYBODY这个package没有装上,安装一下
# make yes-<package_name>表示安装该package,make no-<package_name>表示卸载
root@mypc:~/mpi_share/lammps/lammps-23Jun2022/src# make yes-MANYBODY
root@mypc:~/mpi_share/lammps/lammps-23Jun2022/src# make package-status | grep MANYBODY
Installed YES: package MANYBODY
root@mypc:~/mpi_share/lammps/lammps-23Jun2022/src#
root@mypc:~/mpi_share/lammps/lammps-23Jun2022/src# make mpi -j4
可以正常执行了(-np 15超过本机8核,不要紧)
root@mypc:~/mpi_share/lammps/lammps-23Jun2022/examples/shear# mpirun -oversubscribe --allow-run-as-root -np 15 /root/mpi_share/lammps/lammps-23Jun2022/src/lmp_mpi -in in.shear
...
Total # of neighbors = 45027
Ave neighs/atom = 23.549686
Neighbor list builds = 222
Dangerous builds = 0
Total wall time: 0:00:14
分布式计算
免密
需要设置从控制节点ssh免密登录两个计算节点(5.5.5.1/5.5.5.2),具体步骤这里不再展开。另外,在控制节点上新增静态DNS解析项:
root@mypc:~# echo "5.5.5.1 hpc-1" >> /etc/hosts
root@mypc:~# echo "5.5.5.2 hpc-2" >> /etc/hosts
NFS搭建
通过NFS,使计算节点自动拥有控制节点上下载和编译出的文件。先在控制节点上操作:
#安装软件
root@mypc:~# apt-get install nfs-kernel-server -y
root@mypc:~#
root@mypc:~/mpi_share# cat /etc/exports | egrep -v "(^#|^$)"
/root/mpi_share *(rw,sync,no_root_squash,no_subtree_check)
root@mypc:~/mpi_share#
# 重启NFS服务
root@mypc:~/mpi_share# /etc/init.d/nfs-kernel-server restart
Restarting nfs-kernel-server (via systemctl): nfs-kernel-server.service.
root@mypc:~/mpi_share#
# 以下在计算节点上操作:探测NFS服务器
root@hpc-1:~# showmount -e 6.6.6.254
Export list for 6.6.6.254:
/root/mpi_share *
root@hpc-1:~#
#安装软件
root@hpc-1:~# apt install nfs-common -y
root@hpc-1:~# mount -t nfs 6.6.6.254:/root/mpi_share /root/mpi_share
root@hpc-1:~#
#验证,跟控制节点上看到的目录内容一样
root@hpc-1:~/mpi_share/lammps# ls -lt
total 207964
drwxr-xr-x 6 root root 4096 11-р сар 1 09:04 mpich3
drwxr-xr-x 14 3328 20001 4096 11-р сар 1 07:54 mpich-4.0.2
drwxr-xr-x 6 root root 4096 11-р сар 1 07:48 fftw3
drwxr-xr-x 18 centec centec 4096 11-р сар 1 07:45 fftw-3.3.10
-rw-r--r-- 1 root root 170633728 10-р сар 31 16:42 lammps-stable.tar.gz
-rw-r--r-- 1 root root 90 10-р сар 31 16:34 mpich-4.0.2.tar.gz.st
drwxrwxr-x 13 root root 4096 8-р сар 6 04:57 lammps-23Jun2022
-rw-r--r-- 1 root root 38137945 4-р сар 9 2022 mpich-4.0.2.tar.gz
-rw-r--r-- 1 root root 4144100 9-р сар 15 2021 fftw-3.3.10.tar.gz
root@hpc-1:~/mpi_share/lammps#
计算节点上还要拷贝so文件:/lib目录是各节点的本地目录,不在NFS共享范围内,所以需要在每个计算节点上单独拷贝一份
root@hpc-1:~/mpi_share/lammps# cp /root/mpi_share/lammps/mpich3/lib/libmpi.so.12.2.2 /lib/x86_64-linux-gnu/libmpi.so.12
root@hpc-1:~/mpi_share/lammps#
分布式运行
先在控制节点上添加hostfile(位于NFS共享目录下),指定每个计算节点分配的CPU核数
root@mypc:~/mpi_share/lammps/lammps-23Jun2022/examples/shear# cat hostfile
hpc-1 slots=5
hpc-2 slots=5
root@mypc:~/mpi_share/lammps/lammps-23Jun2022/examples/shear#
在hpc-1和hpc-2的网卡2和网卡3上启动tcpdump抓包,然后在控制节点上执行:
root@mypc:~/mpi_share/lammps/lammps-23Jun2022/examples/shear# mpirun -host hpc-1,hpc-2 --hostfile hostfile -oversubscribe --allow-run-as-root -np 15 /root/mpi_share/lammps/lammps-23Jun2022/src/lmp_mpi -in in.shear
...
Total # of neighbors = 45027
Ave neighs/atom = 23.549686
Neighbor list builds = 222
Dangerous builds = 0
Total wall time: 0:00:08
# 从单机本地14秒缩短为分布式计算8秒