PBS安装文档
centOS 装机
系统选择 compute node
development tools 选上
语言支持选上汉语
密码 wjC井01
机器名字 (本次装机忘做设置,后来改的)
1. 修改机器名字与/etc/hosts #
hostnamectl set-hostname igc-head
cat /etc/hosts
127.0.0.1 localhost localhost.localdomain
10.1.10.1 igc-head
10.1.10.100 is1
10.1.10.11 i1
10.1.10.12 i2
10.1.10.13 i3
cat /etc/hostname
igc-head
2. 节点之间无密码登录
ssh-keygen
ssh-copy-id root@is1
ssh-copy-id root@i1
ssh-copy-id root@i2
ssh-copy-id root@i3
3. 安装库,安装torque
yum update
yum install libtool openssl-devel libxml2-devel boost-devel gcc gcc-c++
yum install git
mkdir -p software/torque
git clone https://github.com/adaptivecomputing/torque.git -b 5.1.1 5.1.1
cd 5.1.1
./autogen.sh
./configure
make
make install
4.Configure the trqauthd daemon to start automatically at system boot.
cp contrib/init.d/trqauthd /etc/init.d/
chkconfig --add trqauthd
echo /usr/local/lib > /etc/ld.so.conf.d/torque.conf
ldconfig
service trqauthd start
export PATH=/usr/local/bin/:/usr/local/sbin/:$PATH
./torque.setup root
5. 编辑 /var/spool/torque/server_priv/nodes , 增加节点信息 #
node-name[:ts] [np=] [gpus=] [properties]
[np=] 节点核数
[gpus=] 节点GPU个数
[properties] 节点属性(标签), 例如 normal、bigmem、all, 供队列的 resources_default.neednodes 匹配使用
cat /var/spool/torque/server_priv/nodes
i1 np=16 normal all
i2 np=16 normal all
i3 np=16 bigmem all
6. make packages
会生成一些.sh文件 , 将torque-package-clients-linux-x86_64.sh torque-package-mom-linux-x86_64.sh 拷贝到各个节点
scp torque-package-clients-linux-x86_64.sh torque-package-mom-linux-x86_64.sh 192.168.0.151:/software/torque/5.1.1
同时, 将pbs_mom拷贝到各节点
scp contrib/init.d/pbs_mom 192.168.72.25:/software/torque
7.每一个节点做如下配置
more /var/spool/torque/mom_priv/config
$pbsserver igc-head # hostname running pbs server
$logevent 225 # bitmap of which events to log
8. 自动启动
管理节点配置
cp contrib/init.d/pbs_server /etc/init.d
chkconfig --add pbs_server
service pbs_server restart
其他自动启动
./torque-package-mom-linux-x86_64.sh --install
./torque-package-clients-linux-x86_64.sh --install
cp /software/torque/pbs_mom /etc/init.d
/usr/local/sbin/pbs_mom -c /var/spool/torque/mom_priv/config
chkconfig --add pbs_mom
service pbs_mom start
设置队列
qmgr -c "create queue all.q"
qmgr -c "set queue all.q queue_type = Execution"
qmgr -c "set queue all.q Priority = 100"
qmgr -c "set queue all.q resources_default.neednodes = all"
qmgr -c "set queue all.q resources_default.nodes = 1"
qmgr -c "set queue all.q enabled = True"
qmgr -c "set queue all.q started = True"
qmgr -c "create queue normal.q"
qmgr -c "set queue normal.q queue_type = Execution"
qmgr -c "set queue normal.q Priority = 100"
qmgr -c "set queue normal.q resources_default.neednodes = normal"
qmgr -c "set queue normal.q resources_default.nodes = 1"
qmgr -c "set queue normal.q enabled = True"
qmgr -c "set queue normal.q started = True"
qmgr -c "create queue bigmem.q"
qmgr -c "set queue bigmem.q queue_type = Execution"
qmgr -c "set queue bigmem.q Priority = 100"
qmgr -c "set queue bigmem.q resources_default.neednodes = bigmem"
qmgr -c "set queue bigmem.q resources_default.nodes = 1"
qmgr -c "set queue bigmem.q enabled = True"
qmgr -c "set queue bigmem.q started = True"
qmgr -c "set server default_queue = all.q"
maui 安装与配置
1. 官网下载, 需先注册 http://www.adaptivecomputing.com/products/open-source/maui/
http://www.adaptivecomputing.com/support/download-center/maui-cluster-scheduler/
2 安装
./configure --with-pbs=/usr/local
make
make install
3. 开机启动
cp /software/torque/maui-3.3.1/etc/maui.d /etc/init.d/
vi /etc/init.d/maui.d # "MAUI_PREFIX=/opt/maui" 改为 "MAUI_PREFIX=/usr/local/maui"
chkconfig maui.d on
启动maui:
/etc/init.d/maui.d start
防火墙端口
所有计算节点和头节点都需要做
firewall-cmd --zone=public --add-port=15001-15004/tcp --permanent
firewall-cmd --zone=public --add-port=15001-15004/udp --permanent
firewall-cmd --reload
9. 运行与测试
[root@igc-head ~]# qnodes
i1
state = free
power_state = Running
np = 16
properties = normal,all
ntype = cluster
status = rectime=1437594868,macaddr=44:a8:42:27:6a:3d,cpuclock=Fixed,varattr=,jobs=,state=free,netload=42405454151,gres=,loadave=0.00,ncpus=32,physmem=131752888kb,availmem=133165940kb,totmem=135947188kb,idletime=113308,nusers=1,nsessions=1,sessions=1275,uname=Linux i1 3.10.0-229.el7.x86_64 #1 SMP Fri Mar 6 11:36:42 UTC 2015 x86_64,opsys=linux
mom_service_port = 15002
mom_manager_port = 15003
i2
state = free
power_state = Running
np = 16
properties = normal,all
ntype = cluster
status = rectime=1437594868,macaddr=44:a8:42:27:6a:0d,cpuclock=Fixed,varattr=,jobs=,state=free,netload=36553902893,gres=,loadave=0.00,ncpus=32,physmem=131752888kb,availmem=133604660kb,totmem=135947188kb,idletime=113156,nusers=1,nsessions=1,sessions=1256,uname=Linux i2 3.10.0-229.el7.x86_64 #1 SMP Fri Mar 6 11:36:42 UTC 2015 x86_64,opsys=linux
mom_service_port = 15002
mom_manager_port = 15003
i3
state = free
power_state = Running
np = 16
properties = bigmem,all
ntype = cluster
status = rectime=1437594867,macaddr=44:a8:42:27:84:5b,cpuclock=Fixed,varattr=,jobs=,state=free,netload=37740503968,gres=,loadave=0.00,ncpus=32,physmem=263865204kb,availmem=264675684kb,totmem=268059504kb,idletime=113035,nusers=1,nsessions=1,sessions=1271,uname=Linux i3 3.10.0-229.el7.x86_64 #1 SMP Fri Mar 6 11:36:42 UTC 2015 x86_64,opsys=linux
mom_service_port = 15002
mom_manager_port = 15003
测试
su - igc
echo "sleep 30; hostname" |qsub
单节点特殊配置
完成安装后,需要注意如下几点:
hostname不能是localhost,需要修改为其他,例如
sudo hostname new_name
sudo vi /etc/hostname (写入 new_name), 并编辑 /etc/hosts, 内容如下:
127.0.0.1 localhost
x.x.x.x new_name
对应修改pbs_server目录下server_name, server_priv/nodes, 还有pbs_mom下面config里的server name
qnodes测试应通过,然后进行