(1) Pull a fresh CentOS image:
docker pull centos:7.2
(2) Run a container in which to install the required environment:
docker run --privileged -p 8888:6817 -p 8889:6818 --dns 8.8.8.8 --dns 8.8.4.4 -h controler --name slurm_control -i -t -v /container_data/:/data centos:7.2 /bin/bash   # 6817/6818 are the slurmctld/slurmd ports configured later
(3) Install the SSH environment inside the container:
yum install -y wget vim
wget -O /etc/yum.repos.d/CentOS-Base.repo http://mirrors.aliyun.com/repo/Centos-7.repo
yum clean all
yum install -y passwd
yum install -y openssh-server
Set the container's root password (run yum -y reinstall cracklib-dicts first if passwd complains about missing dictionaries):
echo "123456" |passwd --stdin root
Generate a public/private key pair:
ssh-keygen -t rsa
cd ~/.ssh/
cp id_rsa.pub authorized_keys
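One detail worth checking here: with OpenSSH's default StrictModes setting, sshd silently ignores authorized_keys when the file or ~/.ssh is group- or world-writable, so tighten the permissions:
chmod 700 ~/.ssh
chmod 600 ~/.ssh/authorized_keys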
Preparation before starting sshd (just press Enter through every prompt below):
ssh-keygen -t rsa -f /etc/ssh/ssh_host_rsa_key
ssh-keygen -t dsa -f /etc/ssh/ssh_host_dsa_key
ssh-keygen -t ecdsa -f /etc/ssh/ssh_host_ecdsa_key
ssh-keygen -t ed25519 -f /etc/ssh/ssh_host_ed25519_key
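Equivalently, a single command generates any host key types that are still missing, each with its correct type and default path:
ssh-keygen -A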
/usr/sbin/sshd
(4) Exit the container and commit it as a new image:
Ctrl+d
docker stop 2ba4a64c7f80
docker commit 2ba4a64c7f80 docker_ssh
docker run --privileged -i -t docker_ssh /sbin/init   # this hangs at the current screen; open a new terminal and enter the container to start the sshd service
docker exec -it af40bd07fa0f /bin/bash
systemctl restart sshd.service
echo "123456" |passwd --stdin root
netstat -nplt | grep 22   # check that sshd is listening
Install the SSH client: yum install -y openssh-clients
(5) Exit the container; copying the host's ~/.ssh/id_rsa.pub into the container's ~/.ssh/authorized_keys enables passwordless login.
Run on the host:
docker cp ~/.ssh/id_rsa.pub aea267757cc9:/root/.ssh/
Then enter the container and append it to the container's authorized_keys:
docker attach aea267757cc9
cat /root/.ssh/id_rsa.pub >> /root/.ssh/authorized_keys
Setup done; commit it as a new image:
docker commit aea267757cc9 ssh_docker1
docker run -d -p 221:22 --name ssh_container2 -h slaver1 -v /container_data/:/data f57ef7db72aa /usr/sbin/sshd -D
Test the passwordless login from the host:
ssh -p 221 root@localhost
(6) To keep container IPs from changing on every start, create a custom network, and add each peer's host entry whenever a new container is started:
docker network create --subnet=172.18.0.0/16 shadownet
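A quick check that the subnet took effect, nothing more:
docker network inspect shadownet | grep Subnet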
For the Slurm cluster, start two nodes for now: one control node (controler) and one compute node (slaver1).
docker run -d -p 220:22 --name ssh_container1 -h controler --net shadownet --ip 172.18.0.10 --add-host="slaver1:172.18.0.11" -v /container_data/:/data 0bedf27156e7 /usr/sbin/sshd -D
docker run -d -p 221:22 --name ssh_container2 -h slaver1 --net shadownet --ip 172.18.0.11 --add-host="controler:172.18.0.10" -v /container_data/:/data 0bedf27156e7 /usr/sbin/sshd -D
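With the static IPs and --add-host entries in place, each node should resolve the other by name; a quick check from the host (container names as above):
docker exec ssh_container1 getent hosts slaver1     # should print 172.18.0.11
docker exec ssh_container2 getent hosts controler   # should print 172.18.0.10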
Installation
(7) Install munge
First create a slurm user; it will be needed later: useradd slurm
yum -y install epel-release
yum -y install gtk2
yum -y install gtk-devel
yum -y install munge
yum -y install munge-devel
These directories are usually created automatically (mine existed after the install); create any that are missing:
mkdir /etc/munge
mkdir /var/run/munge
mkdir /var/lib/munge
mkdir /var/log/munge
chown slurm:slurm /etc/munge
chown slurm:slurm /var/run/munge
chown slurm:slurm /var/lib/munge
chown slurm:slurm /var/log/munge
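munged is strict about directory permissions and refuses to start when they are too open; if it complains, tightening them to the modes munge's documentation expects usually fixes it (a sketch; exact modes may vary by version):
chmod 700 /etc/munge /var/log/munge
chmod 711 /var/lib/munge
chmod 755 /var/run/munge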
Then, on one node (I did this on the master), generate the munge key, which is stored under /etc/munge:
/usr/sbin/create-munge-key
chown slurm:slurm /etc/munge/munge.key
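Every node must hold an identical /etc/munge/munge.key. In this walkthrough the key gets baked into the image committed later, but if you ever generate it on one running node, the shared /data volume from the docker run commands above is a convenient way to copy it across (a sketch, not the only way):
cp /etc/munge/munge.key /data/              # on the node that generated the key
cp /data/munge.key /etc/munge/munge.key     # on every other node
chown slurm:slurm /etc/munge/munge.key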
Then start munge on every node as the slurm user (from here on, unless noted otherwise, commands run as root). Command:
munged
ps aux | grep munged
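A stronger functional check than grepping the process list is to munge a credential and decode it, locally and (once the other nodes are up) across nodes over SSH:
munge -n | unmunge
munge -n | ssh slaver1 unmunge   # cross-node check, assuming slaver1 is reachable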
That completes the munge installation in this container.
(8) Install Slurm
Unpack Slurm:
tar -jxvf slurm-16.05.11.tar.bz2   # if this fails, the bzip2 package is missing: yum install -y bzip2, then unpack again
Build: change into the unpacked directory:
cd slurm-16.05.11
./configure   # takes a while; if it fails, gcc is probably missing or too old
make          # if the toolchain is missing: yum -y install gcc automake autoconf libtool make
make install
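With the default prefix, Slurm's libraries land in /usr/local/lib; if slurmctld or slurmd later fails to start because it cannot load libslurm, refreshing the linker cache is the usual remedy (this assumes you did not pass --prefix to configure; adjust the path if you did):
echo /usr/local/lib > /etc/ld.so.conf.d/slurm.conf
ldconfig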
Then edit the configuration file, slurm.conf, under /usr/local/etc:
cd /usr/local/etc
ClusterName=ssc
ControlMachine=controler
ControlAddr=172.18.0.10
#BackupController=
#BackupAddr=
#
SlurmUser=slurm
#SlurmdUser=root
SlurmctldPort=6817
SlurmdPort=6818
AuthType=auth/munge
#JobCredentialPrivateKey=
#JobCredentialPublicCertificate=
StateSaveLocation=/tmp
SlurmdSpoolDir=/tmp/slurmd
SwitchType=switch/none
MpiDefault=none
SlurmctldPidFile=/var/run/slurmctld.pid
SlurmdPidFile=/var/run/slurmd.pid
ProctrackType=proctrack/pgid
#PluginDir=
#FirstJobId=
ReturnToService=0
#MaxJobCount=
#PlugStackConfig=
#PropagatePrioProcess=
#PropagateResourceLimits=
#PropagateResourceLimitsExcept=
#Prolog=
#Epilog=
#SrunProlog=
#SrunEpilog=
#TaskProlog=
#TaskEpilog=
#TaskPlugin=
#TrackWCKey=no
#TreeWidth=50
#TmpFS=
#UsePAM=
#
# TIMERS
SlurmctldTimeout=300
SlurmdTimeout=300
InactiveLimit=0
MinJobAge=300
KillWait=30
Waittime=0
#
# SCHEDULING
SchedulerType=sched/backfill
#SchedulerAuth=
#SchedulerPort=
#SchedulerRootFilter=
SelectType=select/linear
FastSchedule=1
#PriorityType=priority/multifactor
#PriorityDecayHalfLife=14-0
#PriorityUsageResetPeriod=14-0
#PriorityWeightFairshare=100000
#PriorityWeightAge=1000
#PriorityWeightPartition=10000
#PriorityWeightJobSize=1000
#PriorityMaxAge=1-0
#
# LOGGING
SlurmctldDebug=3
SlurmctldLogFile=/var/log/slurmctld.log
SlurmdDebug=3
SlurmdLogFile=/var/log/slurmd.log
JobCompType=jobcomp/none
#JobCompLoc=
#
# ACCOUNTING
#JobAcctGatherType=jobacct_gather/linux
#JobAcctGatherFrequency=30
#
#AccountingStorageType=accounting_storage/slurmdbd
#AccountingStorageHost=
#AccountingStorageLoc=
#AccountingStoragePass=
#AccountingStorageUser=
#
# COMPUTE NODES
NodeName=controler CPUs=2 RealMemory=150 Sockets=2 CoresPerSocket=1 ThreadsPerCore=1 State=IDLE
NodeName=slaver1 CPUs=2 RealMemory=150 Sockets=2 CoresPerSocket=1 ThreadsPerCore=1 State=IDLE
NodeName=slaver2 CPUs=2 RealMemory=150 Sockets=2 CoresPerSocket=1 ThreadsPerCore=1 State=IDLE
PartitionName=control Nodes=controler Default=NO MaxTime=INFINITE State=UP
PartitionName=compute Nodes=slaver1,slaver2 Default=YES MaxTime=INFINITE State=UP
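Before committing, you can cross-check the NodeName hardware values (CPUs, Sockets, RealMemory, and so on) against what Slurm actually detects: slurmd prints its view of the local node's configuration and exits:
slurmd -C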
(9) Commit to a new image, then start that image as three nodes: master, slaver1, and slaver2.
docker run -d -p 220:22 --name master -h controler --net shadownet --ip 172.18.0.10 --add-host="slaver1:172.18.0.11" --add-host="slaver2:172.18.0.12" -v /container_data/:/data e8ea50ec6b31 /usr/sbin/sshd -D
docker run -d -p 221:22 --name slaver1 -h slaver1 --net shadownet --ip 172.18.0.11 --add-host="controler:172.18.0.10" --add-host="slaver2:172.18.0.12" -v /container_data/:/data e8ea50ec6b31 /usr/sbin/sshd -D
docker run -d -p 222:22 --name slaver2 -h slaver2 --net shadownet --ip 172.18.0.12 --add-host="controler:172.18.0.10" --add-host="slaver1:172.18.0.11" -v /container_data/:/data e8ea50ec6b31 /usr/sbin/sshd -D
Since all three containers accept SSH connections, you can enter whichever one you need.
(10) Start the cluster
Start munge on each of the three machines as the slurm user:
munged
On the master node, run slurmctld -c and slurmd -c, both as root.
On every slaver node, run slurmd -c.
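Once the daemons are up, a quick smoke test from the master confirms the cluster works (partition names as configured above):
sinfo                             # all nodes should show state idle
srun -N 2 -p compute hostname     # should print slaver1 and slaver2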
That's it; the installation is complete!