【HPC-Slurm调度服务部署】

# Install build/runtime prerequisites (compiler, Python headers, MariaDB
# client headers needed to build Slurm's accounting plugin).
yum -y install wget epel-release gcc bzip2 python3 python3-devel mariadb-devel
# Create the dedicated slurm service account (fixed UID so it is identical
# on every node in the cluster).
useradd -u 450 slurm
# NOTE: the following files must be identical on all nodes; distributing
# them via shared storage is recommended. Listed for reference only —
# commented out so they are not executed as commands if this is run as a script:
#   /etc/hosts
#   /usr/local/slurm/slurm.conf

一、Munge安装

#提供组件间的认证通信机制,需要在所有节点安装并且启动。

# Munge provides authentication between Slurm daemons; it must be installed
# and running on every node.
yum -y install epel-release
yum -y install munge munge-devel munge-libs
# Fallback RPM if munge-devel is missing from the configured repos:
# ftp://ftp.pbone.net/mirror/vault.centos.org/8.3.2011/PowerTools/x86_64/kickstart/Packages/munge-devel-0.5.13-2.el8.x86_64.rpm
#yum -y install munge-devel-0.5.13-2.el8.x86_64.rpm
# Generate the munge key (run once, on the master node only).
/usr/sbin/create-munge-key
# Copy the key to every slave node so all daemons share the same credential:
#scp /etc/munge/munge.key root@<node_ip>:/etc/munge
systemctl enable munge && systemctl start munge

二、MySQL安装

# Grant the slurm DB user access to the accounting database (run inside MySQL):
#grant all privileges on slurm_acct_db.* to slurm@'192.168.158.%' identified by 'xxxxxx' with grant option;
# Register the MySQL client library path with the dynamic linker so the
# Slurm build can find libmysqlclient at link/run time.
echo "/usr/local/mysql/lib" >> /etc/ld.so.conf.d/mysql.conf
ldconfig

三、Slurm安装

备用下载地址:
#source: https://www.schedmd.com/downloads.php
wget https://download.schedmd.com/slurm/slurm-22.05.7.tar.bz2
tar -jxvf slurm-22.05.7.tar.bz2
cd slurm-22.05.7
./configure --prefix=/usr/local/slurm
make && make install
# Create the config, log and controller state directories.
mkdir -p /usr/local/slurm/etc/ /var/log/slurm/ /var/spool/slurmctld/
# The daemons run as user "slurm" and must own the install tree, the state
# directory AND the log directory — the original omitted /var/log/slurm,
# which causes "chdir(/var/log): Permission denied" in slurmctld.log.
chown -R slurm: /usr/local/slurm/ /var/spool/slurmctld /var/log/slurm

Master节点启动 slurmctld 和 slurmdbd

#slurmctld
cp etc/slurmctld.service /usr/lib/systemd/system/
# Build slurm.conf from the shipped example, substituting this host's name
# for the example node name "linux0".
sed "s/linux0/$(hostname)/g" etc/slurm.conf.example > /usr/local/slurm/etc/slurm.conf
systemctl enable slurmctld && systemctl start slurmctld
#slurmdbd
cp etc/slurmdbd.conf.example /usr/local/slurm/etc/slurmdbd.conf
# slurmdbd refuses to start unless its conf is readable only by SlurmUser.
chmod 600 /usr/local/slurm/etc/slurmdbd.conf
chown slurm: /usr/local/slurm/etc/slurmdbd.conf
# Append accounting-storage settings. StorageHost is this host's primary IP
# via `hostname -I` (the original used the deprecated net-tools
# ifconfig/route, which are not installed by default on EL8).
cat << EOF >> /usr/local/slurm/etc/slurmdbd.conf
AuthType=auth/munge
SlurmUser=slurm
StorageType=accounting_storage/mysql
StorageHost=$(hostname -I | awk '{print $1}')
StoragePort=3306
StoragePass=slurm
StorageUser=slurm
EOF
cp etc/slurmdbd.service /usr/lib/systemd/system/
systemctl enable slurmdbd && systemctl start slurmdbd

Slave节点启动slurmd

#slurmd
cp etc/cgroup.conf.example /usr/local/slurm/etc/cgroup.conf
cp etc/slurmd.service /usr/lib/systemd/system/
# Drop any existing AccountingStorageType line so the append below does not
# leave duplicate, conflicting keys in slurm.conf.
sed -i "/AccountingStorageType/d" /usr/local/slurm/etc/slurm.conf
# AccountingStorageHost is this host's primary IP via `hostname -I`
# (replaces the deprecated net-tools ifconfig/route used originally).
cat <<EOF>> /usr/local/slurm/etc/slurm.conf
AuthType=auth/munge
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=$(hostname -I | awk '{print $1}')
AccountingStorageUser=slurm
AccountingStoreFlags=job_comment
AccountingStorageEnforce=associations,limits,qos,safe
AccountingStorageTRES=gres/gpu
GresTypes=gpu
EOF
# Register this node and a default partition. Adjust CPUs/RealMemory to the
# real hardware ("Procs" is a legacy alias of CPUs).
echo "NodeName=$(hostname) CPUs=1 RealMemory=500 Procs=1 State=UNKNOWN" >> /usr/local/slurm/etc/slurm.conf
echo "PartitionName=cpu Nodes=$(hostname) Default=YES MaxTime=INFINITE State=UP" >> /usr/local/slurm/etc/slurm.conf
# Remember: /usr/local/slurm/etc/slurm.conf must list ALL nodes and be
# identical on every node.
systemctl enable slurmd && systemctl start slurmd

PATH

# Make the Slurm client tools available in every login shell. /etc/rc.local
# is executed once at boot by systemd and is NOT sourced by user shells, so
# a PATH export there never reaches interactive sessions; /etc/profile.d is
# the correct hook for login-shell environment setup.
cat << "EOF" > /etc/profile.d/slurm.sh
export PATH=/usr/local/slurm/bin/:$PATH
EOF
source /etc/profile.d/slurm.sh

四、测试

编辑任务

# Write a minimal test batch job. The quoted "EOF" delimiter keeps all
# $-sequences literal in the generated file.
cat <<"EOF"> run.slurm
#!/bin/bash
#SBATCH -J test
#SBATCH --cpus-per-task=1
#SBATCH -N 1
#SBATCH -t 15:00
#SBATCH -p gpu --gres=gpu:1

# The job sleeps for 600s, so the time limit above must exceed 10 minutes;
# the original requested -t 3:00 and was always killed with state TIMEOUT.
hostname
sleep 600
EOF

执行任务

执行
sbatch run.slurm
查询
squeue

五、常见问题

5.1.【/var/log/slurmd.log】- gres文件
日志:
[2023-03-28T16:42:58.621] Can not stat gres.conf file (/usr/local/slurm/etc/gres.conf), using slurm.conf data
解決:
# Regenerate gres.conf with one "Name=gpu" line per NVIDIA device node.
rm -f /usr/local/slurm/etc/gres.conf
# Glob the device files directly instead of parsing `ls` output; the
# original `ls /dev/nvidia[0-9]` also missed devices with index >= 10.
# (/dev/nvidia[0-9]* does not match nvidiactl / nvidia-uvm / nvidia-caps,
# which have a non-digit right after "nvidia".)
for gpu_dev in /dev/nvidia[0-9]*; do
  [ -e "$gpu_dev" ] || continue
  echo "NodeName=$(hostname) Name=gpu File=$gpu_dev"
done >> /usr/local/slurm/etc/gres.conf

# Restart slurmd so it picks up the new gres.conf.
systemctl restart slurmd
5.2.【/var/log/slurmctld.log】- munge
日志:
[2023-03-28T18:11:39.001] error: If munged is up, restart with --num-threads=10
[2023-03-28T18:11:39.001] error: Munge encode failed: Failed to access "slurm": No such file or directory
[2023-03-28T18:11:39.001] error: slurm_send_node_msg: auth_g_create: REQUEST_PERSIST_INIT has authentication error
[2023-03-28T18:11:39.001] error: slurm_persist_conn_open: failed to send persistent connection init message to 172.16.100.180:3306
[2023-03-28T18:11:39.001] error: Sending PersistInit msg: Protocol authentication error
[2023-03-28T18:11:39.148] error: chdir(/var/log): Permission denied

解決:
# Fix: set the munge authentication plugin in slurm.conf.
AuthType=auth/munge
# Remove any other unrelated/conflicting settings, then restart slurmctld.
systemctl restart slurmctld
5.3.【/var/log/slurmctld.log】- TRES
日志1:
[2023-03-28T16:59:08.098] fatal: slurmdbd is required to run with TRES gres/gpu. Either setup slurmdbd or remove this TRES from your configuration.

解決:
# Fix: point slurm.conf accounting at slurmdbd. AccountingStorageHost is this
# host's primary IP via `hostname -I` (replaces the deprecated net-tools
# ifconfig/route pipeline used originally).
AccountingStorageType=accounting_storage/slurmdbd
AccountingStorageHost=$(hostname -I | awk '{print $1}')
AccountingStorageUser=slurm
AccountingStoreFlags=job_comment
AccountingStorageEnforce=associations,limits,qos,safe
# Restart slurmctld to apply.
systemctl restart slurmctld

日志2:
[2023-03-28T17:02:40.933] error: Setting node ecs-e155-0315279.novalocal state to INVAL with reason:gres/gpu count reported lower than configured (0 < 4)

解决:
# Inspect node details:
scontrol show node
# Reset node state: mark all nodes DOWN with a reason, then return to idle
# and re-read the configuration.
scontrol update NodeName=ALL State=DOWN reason="maintain"
scontrol update NodeName=ALL State=idle
scontrol reconfigure
5.4.【sinfo】
异常:
# sinfo
PARTITION AVAIL  TIMELIMIT  NODES  STATE NODELIST
gpu*         up   infinite      4  drain ecs-891d-0320166.novalocal,ecs-e155-0315279.novalocal,ecs-e155-0315280.novalocal,ecs-e155-0315288.novalocal

解決:
# Clear the drain flag by returning all nodes to idle.
scontrol update NodeName=ALL State=idle
5.5.【sinfo】
异常:
# sinfo
sinfo: error: NodeNames=JSYZ-404-D-2-A2P3-SEV-NF5488-04U03 CPUs=128 match no Sockets, Sockets*CoresPerSocket or Sockets*CoresPerSocket*ThreadsPerCore. Resetting CPUs.

#删除 SocketsPerBoard=1 CoresPerSocket=10 ThreadsPerCore=1 并仅指定 NodeName=MYNODE CPUs=16。如果同时指定 CPUS 和 Sockets、CoresPerSocket 等。Slurm 将尝试理解 CPU value。如果不指定它们,Slurm 将接受给它的CPU value。
解決:
# Return all nodes to idle after correcting the node definition.
scontrol update NodeName=ALL State=idle

六、常用命令

添加account
sacctmgr add account test
添加用户并设置qos
sacctmgr add user test account=test qos=normal MaxSubmit=500 GrpTRES="mem=1540G,gres/gpu=32"
sacctmgr modify user test account=test qos=normal set MaxSubmit=500 GrpTRES="mem=1540G,gres/gpu=32"
查询qos
sacctmgr show ass
sacctmgr show ass format=account%30,qos,grpjobs,grpTres%40,MaxSubmit
账号删除
sacctmgr -i
# 交互操作
sacctmgr: show account                                                                                
sacctmgr: delete account test                                                                    
sacctmgr: add account test
查询节点信息
scontrol show node node-001
查询历史任务信息
sacct -j 39195002  --format=jobid,account,node,stat,start,end,exitcode,jobidraw

查询任务信息

scontrol show job
  • 3
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值