torque-6.1.2 安装问题:计算节点一直处于 down 状态,如何让它恢复为 free?
qterm -t quick
pbs_server
pbsnodes -a
发现计算节点的状态仍然是 state = down。
已关闭防火墙,配置检查无误,节点之间可以通过 ssh 互相登录,各节点上的服务也都已启动,但问题依旧。
主节点:
[root@calserver calserver]# for i in pbs_server pbs_sched pbs_mom trqauthd; do service $i start; done
Starting pbs_server (via systemctl): [ OK ]
Starting pbs_sched (via systemctl): [ OK ]
Starting pbs_mom (via systemctl): [ OK ]
Starting trqauthd (via systemctl): [ OK ]
[root@calserver calserver]# ps -ef | grep pbs
root 1160 1 0 01:18 ? 00:00:00 /usr/local/torque/sbin/pbs_server -F -d /var/spool/torque
root 3566 1 0 01:20 ? 00:00:00 /usr/local/torque/sbin/pbs_sched -d /var/spool/torque
root 3593 1 0 01:20 ? 00:00:00 /usr/local/torque/sbin/pbs_mom -F -d /var/spool/torque
root 3659 3428 0 01:21 pts/0 00:00:00 grep --color=auto pbs
[root@calserver calserver]# qnodes
calserver
state = free
power_state = Running
np = 16
ntype = cluster
status = opsys=linux,uname=Linux calserver 3.10.0-862.14.4.el7.x86_64 #1 SMP Wed Sep 26 15:12:11 UTC 2018 x86_64,sessions=1593 2113 2237 2247 2501 3135 3185 3240,nsessions=8,nusers=2,idletime=256,totmem=5960692kb,availmem=4875732kb,physmem=3863544kb,ncpus=16,loadave=0.18,gres=,netload=89393,state=free,varattr= ,cpuclock=Fixed,macaddr=00:0c:29:a0:9b:d2,version=6.1.2,rectime=1540660913,jobs=
mom_service_port = 15002
mom_manager_port = 15003
calnode02
state = down
power_state = Running
np = 4
ntype = cluster
mom_service_port = 15002
mom_manager_port = 15003
calnode03
state = down
power_state = Running
np = 12
ntype = cluster
mom_service_port = 15002
mom_manager_port = 15003
计算节点:
[root@calnode02 ~]# systemctl status pbs_mom.service -l
● pbs_mom.service - TORQUE pbs_mom daemon
Loaded: loaded (/usr/lib/systemd/system/pbs_mom.service; enabled; vendor preset: disabled)
Active: active (running) since Sun 2018-10-28 01:18:50 CST; 10min ago
Main PID: 1041 (pbs_mom)
Tasks: 11
Memory: 101.8M
CGroup: /system.slice/pbs_mom.service
└─1041 /usr/local/torque/sbin/pbs_mom -F -d /var/spool/torque
Oct 28 01:29:05 calnode02 pbs_mom[1041]: LOG_ERROR::send_update_to_a_server, Could not contact any of the servers to send an update
Oct 28 01:29:05 calnode02 pbs_mom[1041]: LOG_ERROR::send_update_to_a_server, Status not successfully updated for 154 MOM status update intervals
Oct 28 01:29:09 calnode02 pbs_mom[1041]: LOG_ERROR::send_update_to_a_server, Could not contact any of the servers to send an update
Oct 28 01:29:09 calnode02 pbs_mom[1041]: LOG_ERROR::send_update_to_a_server, Status not successfully updated for 155 MOM status update intervals
Oct 28 01:29:14 calnode02 pbs_mom[1041]: LOG_ERROR::send_update_to_a_server, Could not contact any of the servers to send an update
Oct 28 01:29:14 calnode02 pbs_mom[1041]: LOG_ERROR::send_update_to_a_server, Status not successfully updated for 156 MOM status update intervals
Oct 28 01:29:18 calnode02 pbs_mom[1041]: LOG_ERROR::send_update_to_a_server, Could not contact any of the servers to send an update
Oct 28 01:29:18 calnode02 pbs_mom[1041]: LOG_ERROR::send_update_to_a_server, Status not successfully updated for 157 MOM status update intervals
Oct 28 01:29:22 calnode02 pbs_mom[1041]: LOG_ERROR::send_update_to_a_server, Could not contact any of the servers to send an update
Oct 28 01:29:22 calnode02 pbs_mom[1041]: LOG_ERROR::send_update_to_a_server, Status not successfully updated for 158 MOM status update intervals