一. 关闭防火墙及selinux
systemctl stop firewalld #关闭防火墙
setenforce 0 #临时关闭selinux
systemctl disable firewalld #永久关闭防火墙
sed -i s#SELINUX=enforcing#SELINUX=disabled# /etc/selinux/config #永久关闭selinux
二.部署munge(所有节点)
1. 创建Munge用户
要确保Munge用户在Master Node和所有Compute Nodes上的UID和GID一致;所有节点都需要安装Munge。
# groupadd -g 1108 munge
# useradd -m -c "Munge Uid 'N' Gid Emporium" -d /var/lib/munge -u 1108 -g munge -s /sbin/nologin munge
2. 生成熵池:
rngd -r /dev/urandom
vim /usr/lib/systemd/system/rngd.service
修改如下参数
[Service]
ExecStart=/sbin/rngd -f -r /dev/urandom
查看状态
systemctl status rngd
保存并退出
systemctl daemon-reload
systemctl start rngd
systemctl enable rngd
3. 启动及配置munge服务
yum install munge munge-libs munge-devel -y
生成密钥信息
#等待随机数据 (推荐给偏执狂):
dd if=/dev/random bs=1 count=1024 >/etc/munge/munge.key
新建用户并修改文件所属用户(全部节点)。注意:客户端在安装好必要的软件之后,也需要执行同样的操作;若已在第1步创建过munge用户,可跳过下面的useradd。
#新建用户及其主目录和登录shell
useradd munge -m -s /bin/bash
#给用户赋密码
passwd munge
#修改目录属主
chown -R munge.munge /var/{lib,log,run}/munge
chown -R munge.munge /etc/munge
#修改目录模式
chmod 711 /var/lib/munge
chmod 700 /var/log/munge
chmod 755 /var/run/munge
chmod 700 /etc/munge
chmod 400 /etc/munge/munge.key
拷贝密钥(主节点)
#拷贝主节点密钥到其余节点
scp /etc/munge/munge.key munge@ip:/etc/munge
所有节点都执行启动命令:
# systemctl start munge
# systemctl enable munge
4. 测试munge服务
本地查看凭据:
munge -n
本地解码:
munge -n | unmunge
验证compute node,远程解码:
munge -n | ssh ip unmunge
Munge凭证基准测试
remunge
三. 编译安装slurm(所有节点)
1.创建Slurm用户
# groupadd -g 1109 slurm
# useradd -m -c "Slurm manager" -d /var/lib/slurm -u 1109 -g slurm -s /bin/bash slurm
2. 安装Slurm依赖
yum install gcc gcc-c++ readline-devel perl-ExtUtils-MakeMaker pam-devel rpm-build mysql-devel -y
3. 编译安装slurm
# 拉取slurm源码包
wget https://download.schedmd.com/slurm/slurm-23.02.2.tar.bz2
# 安装rpm-build,并用rpmbuild将Slurm源码包制作成rpm包
yum install rpm-build
rpmbuild -ta slurm-23.02.2.tar.bz2
# cd到制作好的rpm包下:
cd /root/rpmbuild/RPMS/x86_64/
#所有节点安装Slurm
yum localinstall slurm-*
四. 配置slurmdbd.conf
1.安装启动mysql
yum install mysql-server
service mysqld start
2. 创建数据库的Slurm用户
grant all on slurm_acct_db.* to 'slurm'@'%' identified by 'pera2024' with grant option;
3. 配置slurmdbd.conf文件
cp /etc/slurm/slurmdbd.conf.example /etc/slurm/slurmdbd.conf
vim /etc/slurm/slurmdbd.conf
AuthType=auth/munge
AuthInfo=/var/run/munge/munge.socket.2
DbdAddr=localhost
DbdHost=localhost
SlurmUser=root
DebugLevel=verbose
LogFile=/var/log/slurm/slurmdbd.log
PidFile=/var/run/slurmdbd.pid
StorageType=accounting_storage/mysql
StorageHost=localhost
StorageUser=slurm
StoragePass=pera2024
StorageLoc=slurm_acct_db
#启动slurmdbd服务并设置开机自启
systemctl start slurmdbd
systemctl enable slurmdbd
五. 配置slurm.conf
ClusterName=cluster
SlurmctldHost=fluid-server
AuthType=auth/munge
SlurmctldLogFile=/var/log/slurm/slurmctld.log
SlurmdLogFile=/var/log/slurm/slurmd.log
SelectType=select/cons_res
SelectTypeParameters=CR_CPU_Memory
AccountingStorageType=accounting_storage/slurmdbd
JobCompHost=localhost
JobCompPass=pera2024
JobCompPort=3306
JobCompType=jobcomp/mysql
JobCompUser=slurm
NodeName=c[1-2] RealMemory=3400 Sockets=1 CoresPerSocket=4 State=IDLE
PartitionName=all Nodes=c[1-2] Default=YES State=UP
#复制slurm.conf到所有计算节点
scp /etc/slurm/slurm.conf ip:/etc/slurm/
#管理节点启动slurmctld
systemctl start slurmctld
systemctl enable slurmctld
#计算节点启动slurmd
systemctl start slurmd
systemctl enable slurmd
六. 检查Slurm集群
sinfo
scontrol show partition
scontrol show node
查看用户所在的账户:sacctmgr list user
查看账户列表:sacctmgr list account
查看账户关联用户及QOS限制等:sacctmgr list assoc
查看用户关联的账户及QOS限制等:sacctmgr list assoc user=用户
查看账户关联:sacctmgr list assoc account=账户
查看QOS限制:sacctmgr list qos
添加新的系统用户到slurm集群
sacctmgr -i add user xxx account=root