文|MESeraph
01 | 预先操作
- 关闭CentOS图形界面登录。(仅适用于CentOS 7及以上版本)
systemctl get-default
systemctl set-default multi-user.target
- 联网
(1) 首先查看网卡
ls /etc/sysconfig/network-scripts
(2) 编辑
vi /etc/sysconfig/network-scripts/ifcfg-ensXXXX #我的是ens33
修改该文件中配置:ONBOOT=yes
(3) 重启网络服务
service network restart
- 修改主机名
hostnamectl set-hostname newname
vim /etc/hosts #替换原主机名,并添加计算节点
- 关闭防火墙
systemctl stop firewalld.service
02 | 设置ssh免密
- 生成公私密钥对
ssh-keygen -t rsa
- 拷贝公钥到计算节点
ssh-copy-id -i ~/.ssh/id_rsa.pub root@centos2
03 | 安装munge
- 新建用户(munge、slurm)
export MUNGEUSER=1001
groupadd -g $MUNGEUSER munge
useradd -m -c "MUNGE Uid 'N' Gid Emporium" -d /var/lib/munge -u $MUNGEUSER -g munge -s /sbin/nologin munge
export SLURMUSER=1002
groupadd -g $SLURMUSER slurm
useradd -m -c "SLURM workload manager" -d /var/lib/slurm -u $SLURMUSER -g slurm -s /bin/bash slurm
- 安装软件
yum install epel-release
yum install munge munge-libs munge-devel -y
yum install rng-tools -y
- 生成配置
rngd -r /dev/urandom
/usr/sbin/create-munge-key -r
dd if=/dev/urandom bs=1 count=1024 > /etc/munge/munge.key
chown munge: /etc/munge/munge.key
chmod 400 /etc/munge/munge.key
将munge.key放置到其他计算节点
scp /etc/munge/munge.key root@centos2:/etc/munge/
- 启动并配置所有节点的Munge
chown -R munge: /etc/munge/ /var/log/munge/
chmod 0700 /etc/munge/ /var/log/munge/
systemctl enable munge
systemctl start munge
- 测试Munge
munge -n
munge -n | unmunge
munge -n | ssh centos2 unmunge
remunge
04 | 安装Slurm
- 安装依赖包
yum install openssl openssl-devel pam-devel numactl numactl-devel hwloc hwloc-devel lua lua-devel readline-devel rrdtool-devel ncurses-devel man2html libibmad libibumad perl-Switch mariadb mariadb-server mariadb-devel -y
yum install perl-ExtUtils-MakeMaker
yum install gcc
systemctl start mariadb
- 构建rpm包(只在控制节点构建,然后传到各个计算节点安装)
yum install rpm-build
wget https://download.schedmd.com/slurm/slurm-20.02.3.tar.bz2
rpmbuild -ta slurm-20.02.3.tar.bz2
- 安装rpm包
cd rpmbuild/RPMS/x86_64
yum localinstall slurm-*.rpm
- 配置slurm.conf
slurm.conf
# Slurm.conf file generated by configurator easy.html.
# Put this file on all nodes of your cluster.
# See the slurm.conf man page for more information.
#
SlurmctldHost=centos1
#
#MailProg=/bin/mail
MpiDefault=none
#MpiParams=ports=#-#
ProctrackType=proctrack/cgroup
ReturnToService=1
SlurmctldPidFile=/var/run/slurmctld.pid
#SlurmctldPort=6817
SlurmdPidFile=/var/run/slurmd.pid
#SlurmdPort=6818
SlurmdSpoolDir=/var/spool/slurmd
SlurmUser=slurm
#SlurmdUser=root
StateSaveLocation=/var/spool/slurmctld
SwitchType=switch/none
TaskPlugin=task/affinity
#
#
# TIMERS
#KillWait=30
#MinJobAge=300
#SlurmctldTimeout=120
#SlurmdTimeout=300
#
#
# SCHEDULING
SchedulerType=sched/backfill
SelectType=select/cons_tres
SelectTypeParameters=CR_Core
#
#
# LOGGING AND ACCOUNTING
AccountingStorageType=accounting_storage/none
ClusterName=cluster
#JobAcctGatherFrequency=30
JobAcctGatherType=jobacct_gather/none
#SlurmctldDebug=info
SlurmctldLogFile=/var/log/slurmctld.log
#SlurmdDebug=info
SlurmdLogFile=/var/log/slurmd.log
#
#
# COMPUTE NODES
NodeName=centos[2-3] CPUs=1 RealMemory=500 Sockets=1 CoresPerSocket=1 ThreadsPerCore=1 State=UNKNOWN
PartitionName=debug Nodes=centos[2-3] Default=YES MaxTime=INFINITE State=UP
将slurm.conf拷贝至各个计算节点:
scp /etc/slurm/slurm.conf root@centos2:/etc/slurm
- 计算节点配置cgroup.conf
cgroup.conf
###
#
# Slurm cgroup support configuration file
#
# See man slurm.conf and man cgroup.conf for further
# information on cgroup configuration parameters
#--
CgroupAutomount=yes
ConstrainCores=no
ConstrainRAMSpace=no
05 | 问题
error: Unable to register: Zero Bytes were transmitted or received slurm
原因:各节点时间未同步。解决:在所有节点上安装NTP并启动ntpd服务即可。