Background
1. mvapich2-1.9a
rpm -qa | grep mvapich2
wget http://mvapich.cse.ohio-state.edu/download/mvapich2/mvapich2-1.9a.tgz
tar -xzf mvapich2-1.9a.tgz
./configure 无参数,之后make install出错
./configure --prefix=/home/shir/mv/install (对于cuda版本,加上 --enable-shared,不用--enable-cuda)
make
make install
check mvapich version: mpiname -a
salloc -N 2 -t 1-00 //申请2个节点
salloc -N 2 -p GPU //申请2个GPU节点
squeue -u shir //查看我的节点名字
释放节点 scancel jobid
JOBID PARTITION NAME USER ST TIME NODES NODELIST(REASON)
79867 Compute bash shir R 0:08 2 node[136-137]
touch hosts (或者使用bash ./gen_host 1)
#hosts file, including nodes from "squeue -u shir"
node136
node137
../../bin/mpirun_rsh -np 2 -hostfile hosts ./osu_bw
遇到 2 个问题
1. 要求输入ssh远程节点密码
solution: ssh-keygen
cat .ssh/id_rsa.pub >> .ssh/authorized_keys
2. bin文件路径问题
vi ~/.bashrc
ulimit -c unlimited
PATH=$PATH:/path/to/mvapich2/bin
source ~/.bashrc
1. Pure MPI HPL (netlib version)
a. CBLAS, BLAS
make arch=NAME
make clean arch=NAME
make arch=NAME clean_arch_all
mpirun_rsh -np 4 -hostfile hosts ./xhpl
export LANG=en_US
module avail --> module load blacs/gnu --> module unload
添加 libmpichf90.a
b. GotoBLAS2
安装问题
./kernel/x86_64/gemm_ncopy_4.S:192: Error: undefined symbol `RPREFETCHSIZE' in operation
gmake clean
make BINARY=64 TARGET=NEHALEM
结束时候,输出ln -fs libgoto2_nehalemp-r1.13.so libgoto2.so
GotoBLAS2 --> libgoto2.a
2. Intel HPL
a. MKL
module load intel/latest
cat hosts | uniq > host_uniq
cp host_uniq mpd.hosts
vi mpd.hosts --> add 'head'
~/mv/mv2/bin/mpdboot -n 5 (#nodes+1)
~/mv/mv2/bin/mpdtrace -l
~/mv/mv2/bin/mpiexec -gdb -machinefile hosts -np 4 ./xhpl
cp /home/kandalla/Benchmarks/IMB_3.1/src/find_stray.sh .
cp /home/kandalla/Benchmarks/IMB_3.1/src/kill_all .
./find_stray.sh
cp host_uniq hosts_uniq
~/mv/mv2/bin/mpirun_rsh -hostfile hosts -np 4 valgrind --error-limit=no ./xhpl 2> valgrind.out
cd ~/download/mvapich2-1.9a/src/pm/mpd/
make
make install
./gen_host 1
uniq hosts > hosts_uniq
cp hosts_uniq mpd.hosts
vi mpd.hosts (add 'head')
~/mv/mv2/bin/mpdboot -n 5
vi HPL.dat --> change parameters
~/mv/mv2/bin/mpiexec -machinefile hosts -np 1 ./xhpl_intel64
b. Openmp + MPI + optimized binary
~/mv/mv2/bin/mpiexec -machinefile hosts -np 2 ./xhpl_hybrid_intel64
3. GPU HPL
遇到很多编译问题
Q1: Error while compiling Cuda Accelerated Linpack hpl_2.0_FERMI
Try replacing -openmp with -fopenmp in CCFLAGS (and also delete the -axS flag)
Q2
make[2]: Entering directory `/home/shir/mv/hpl-2.0_FERMI_v13/src/cuda'
mpicc -O0 -c -fPIC -DMPI cuda_dgemm.c -o cuda_dgemm.o -I/usr/local/cuda/include
mpicc -O0 -c -fPIC -DMPI fermi_dgemm.c -o fermi_dgemm.o -I/usr/local/cuda/include
mpicc -O3 -shared -Wl,-soname,libdgemm.so.1 -o libdgemm.so.1.0.1 cuda_dgemm.o fermi_dgemm.o -L/usr/local/cuda/lib64 -lcudart -lcuda
/usr/bin/ld: /home/shir/mv/mv2/lib/libmpich.a(mvapich_malloc.o): relocation R_X86_64_32 against `.bss' can not be used when making a shared object; recompile with -fPIC
/home/shir/mv/mv2/lib/libmpich.a: could not read symbols: Bad value
collect2: ld returned 1 exit status
locate libiomp5.so
/opt/intel/Compiler/11.1/069/lib/intel64/libiomp5.so
Possible solution: a quick hack is to symlink the versioned .so to the expected name (same trick as the common "symlink libmagic.so.1 to libmagic.so" fix), here applied to libiomp5:
/usr/bin/ld: cannot find -liomp5
ln -s /opt/intel/Compiler/11.1/069/lib/intel64/libiomp5.so ./libiomp5.so.1
ln: creating symbolic link `./libiomp5.so.1': Permission denied
Q: undefined reference to `__kmpc_end_critical'
libomp5 --> use libiomp5 and -lpthread
/opt/intel/Compiler/11.1/069/lib/intel64/libiomp5.so
/opt/intel/composer_xe_2013.0.079/compiler/lib/intel64/libiomp5.so