[toc]
sge高性能集群的搭建与使用
集群环境的准备
Node1(master)
CentOS7.4
iptables/selinux(off)
IP:10.180.66.11
hostname:node1
ali yum源
Node2(slave)
CentOS7.4
iptables/selinux(off)
IP:10.180.66.12
hostname:node2
ali yum源
Node3(slave)
CentOS7.4
iptables/selinux(off)
IP:10.180.66.13
hostname:node3
ali yum源
Node4(slave)
CentOS7.4
iptables/selinux(off)
IP:10.180.66.14
hostname:node4
ali yum源
Node5(slave)
CentOS7.4
iptables/selinux(off)
IP:10.180.66.15
hostname:node5
ali yum源
master 节点安装
安装相关依赖包
# yum -y install jemalloc-devel openssl-devel ncurses-devel pam-devel libXmu-devel hwloc-devel hwloc hwloc-libs java-devel javacc ant-junit libdb-devel motif-devel csh ksh xterm db4-utils perl-XML-Simple perl-Env xorg-x11-fonts-ISO8859-1-100dpi xorg-x11-fonts-ISO8859-1-75dpi
新建sge管理员用户
# groupadd -g 490 sgeadmin
# useradd -u 495 -g 490 -r -m -d /home/sgeadmin -s /bin/bash -c "SGE Admin" sgeadmin
# sed -i '/^%wheel/a\%sgeadmin ALL=(ALL) NOPASSWD: ALL' /etc/sudoers
安装sge
sge 链接 密码:c7hy
# cd /usr/local/src/
# tar -xvf ge2011.11.tar.gz
# mkdir -pv /data
# cp -a ge2011.11 /data/sge
# chown sgeadmin.sgeadmin /data/sge
qmaster 安装自动回答脚本,依赖软件包expect, 所有节点都需要安装
# cd /data/sge/
# vim master.sh
#!/bin/bash
# master.sh - answer the interactive ./install_qmaster prompts automatically.
#
# Usage: run from the directory that contains install_qmaster, because the
# installer is spawned as ./install_qmaster relative to the current
# directory. Requires /usr/bin/expect.
#
# The installer is driven through 50 prompts, each detected by the ">>"
# marker. Every prompt is answered with a bare newline, except prompt 2,
# which is answered "y", and prompt 19, which is answered "n".
# NOTE(review): these prompt positions are tied to the question sequence of
# the installer being used - re-check them if the installer asks a
# different sequence of questions.

# The here-document delimiter is quoted so that the shell hands the Tcl code
# to expect verbatim ($i and "\n" must reach expect unexpanded).
/usr/bin/expect <<'EOF'
spawn ./install_qmaster
for {set i 1} {$i <= 50} {incr i} {
    expect "*>>"
    switch -- $i {
        2       { send "y\n" }
        19      { send "n\n" }
        default { send "\n" }
    }
}
expect eof
EOF
# sh master.sh
修改主节点环境变量
# export SGE_ROOT=/data/sge
# echo 'export SGE_ROOT=/data/sge' >> ~/.bashrc
# echo 'PATH=$PATH:/data/sge/bin/linux-x64/:/data/sge/bin/' >> ~/.bashrc
# cp /data/sge/default/common/settings.sh /etc/profile.d/
# sh /etc/profile.d/settings.sh
# source /etc/profile
添加节点
# qconf -ah node1
# qconf -ah node2
# qconf -ah node3
# qconf -ah node4
# qconf -ah node5
master 服务器搭建 nfs 服务
所有节点都需要安装
# yum -y install nfs-utils
master节点操作
# vim /etc/exports
/data/sge 10.180.66.0/24(rw,sync)
# systemctl restart nfs
slave 节点挂载
(node2,node3,node4,node5)执行
# mkdir /data/sge -pv
# mount -t nfs 10.180.66.11:/data/sge /data/sge/
# chown sgeadmin.sgeadmin /data/
slave 服务器安装sgeexecd
(node2,node3,node4,node5) 执行
# yum -y install hwloc-devel
# useradd -u 495 -g 490 -r -m -d /home/sgeadmin -s /bin/bash -c "SGE Admin" sgeadmin
# sed -i '/^%wheel/a\%sgeadmin ALL=(ALL) NOPASSWD: ALL' /etc/sudoers
生效环境变量
# echo 'export SGE_ROOT=/data/sge' >> ~/.bashrc
# echo 'PATH=$PATH:/data/sge/bin/linux-x64/:/data/sge/bin/' >> ~/.bashrc
# echo 'export SGE_CELL=default' >> ~/.bashrc
# cp /data/sge/default/common/settings.sh /etc/profile.d/ -a
# source ~/.bashrc
# source /etc/profile
进行安装,所有节点都执行此脚本
# vim slave.sh
# cat slave.sh
#!/bin/bash
# slave.sh - answer the interactive ./install_execd prompts automatically.
#
# Usage: run from the directory that contains install_execd, because the
# installer is spawned as ./install_execd relative to the current
# directory. Requires /usr/bin/expect.
#
# The installer is driven through 17 prompts, each detected by the ">>"
# marker, and every prompt is answered with a bare newline.
# NOTE(review): the prompt count is tied to the question sequence of the
# installer being used - re-check it if the installer asks a different
# number of questions.

# The here-document delimiter is quoted so that the shell hands the Tcl code
# to expect verbatim ($i and "\n" must reach expect unexpanded).
/usr/bin/expect <<'EOF'
spawn ./install_execd
for {set i 1} {$i <= 17} {incr i} {
    expect "*>>"
    send "\n"
}
expect eof
EOF
# sh slave.sh
完成集群搭建
# qhost
HOSTNAME ARCH NCPU LOAD MEMTOT MEMUSE SWAPTO SWAPUS
-------------------------------------------------------------------------------
global - - - - - - -
node1 linux-x64 1 0.01 968.3M 193.0M 2.0G 64.0K
node2 linux-x64 1 0.01 976.3M 151.0M 2.0G 0.0
node3 linux-x64 1 0.02 978.3M 152.2M 2.0G 84.0K
node4 linux-x64 1 0.02 976.3M 155.4M 2.0G 0.0
node5 linux-x64 1 0.01 978.3M 148.5M 2.0G 84.0K
sge集群的使用