1,双机所含资源(可从main.cf获取)
vi /etc/VRTSvcs/conf/config/main.cf
include "types.cf"
cluster dbcluster (
UserNames = { admin = ajkCjeJgkFkkIskEjh }
Administrators = { admin }
)
system GDGZ-VPMN-SV03A-DBSVR (
)
system GDGZ-VPMN-SV03B-DBSVR (
)
//这个是snmp开放AMOS监控所需的group
group ClusterService (
SystemList = { GDGZ-VPMN-SV03A-DBSVR = 0, GDGZ-VPMN-SV03B-DBSVR = 0 }
AutoStartList = { GDGZ-VPMN-SV03A-DBSVR, GDGZ-VPMN-SV03B-DBSVR }
)
NIC hanic (
Enabled = 0
Device @GDGZ-VPMN-SV03A-DBSVR = bond0
Device @GDGZ-VPMN-SV03B-DBSVR = bond0
)
NotifierMngr ntfr (
SnmpConsoles = { "172.28.1.35" = Information }
)
ntfr requires hanic
//这个是连接IOD的agent/DB/Datasync/Fileanalyse资源组
group dbcluster (
SystemList = { GDGZ-VPMN-SV03A-DBSVR = 1, GDGZ-VPMN-SV03B-DBSVR = 2 }
AutoStartList = { GDGZ-VPMN-SV03A-DBSVR, GDGZ-VPMN-SV03B-DBSVR }
)
Application RawRes (
User = root
StartProgram = "/home/cluster/raw/rawstart.sh"
StopProgram = "/home/cluster/raw/rawstop.sh"
CleanProgram = "/home/cluster/raw/rawstop.sh"
MonitorProgram = "/home/cluster/raw/rawmonitor.sh"
)
Application WebsmsAPP (
User = root
StartProgram = "/home/cluster/app/start.sh"
StopProgram = "/home/cluster/app/stop.sh"
CleanProgram = "/home/cluster/app/stop.sh"
MonitorProgram = "/home/cluster/app/monitor.sh"
)
Application WebsmsDB (
User = root
StartProgram = "/home/cluster/ora/orastart.sh"
StopProgram = "/home/cluster/ora/orastop.sh"
MonitorProgram = "/home/cluster/ora/oramonitor.sh"
)
Application WebsmsVG (
User = root
StartProgram = "/home/cluster/vg/vgstart.sh"
StopProgram = "/home/cluster/vg/vgstop.sh"
CleanProgram = "/home/cluster/vg/vgclean.sh"
MonitorProgram = "/home/cluster/vg/vgmonitor.sh"
)
IP serviceip (
Device = bond0
Address = "X.X.X.X"
NetMask = "X.X.X.X"
)
Mount data0 (
MountPoint = "/websmsshare"
BlockDevice = "/dev/mapper/filevg-lv_websms"
FSType = ext3
FsckOpt = "-y"
)
Mount data1 (
MountPoint = "/dataSync"
BlockDevice = "/dev/mapper/filevg-lv_datasync"
FSType = ext3
FsckOpt = "-y"
)
Mount data2 (
MountPoint = "/back"
BlockDevice = "/dev/mapper/filevg-lv_back"
FSType = ext3
FsckOpt = "-y"
)
NIC SmcNic (
Device = bond0
)
RawRes requires WebsmsVG
WebsmsAPP requires WebsmsDB
WebsmsDB requires RawRes
WebsmsVG requires serviceip
data0 requires WebsmsVG
data1 requires WebsmsVG
data2 requires WebsmsVG
serviceip requires SmcNic
2, 双机脚本路径/home/cluster
GDGZ-VPMN-SV03A-DBSVR:/home/cluster # ll
total 24
drwxr-x--- 2 root root 4096 Apr 12 16:28 app
drwxr-xr-x 2 root root 4096 Apr 9 17:41 lv
drwxr-xr-x 2 root root 4096 Apr 12 16:30 ora
drwxr-xr-x 2 root root 4096 Jan 7 14:43 raw
drwxr-xr-x 2 root root 4096 Apr 12 15:05 vg
lv下有一个binddbraw.sh脚本,该脚本定义了3个运行参数
start:
对每一个系统lv执行如下操作
raw /dev/raw/raw23 /dev/datavg/lv_spfile
chown oracle:oinstall /dev/raw/raw23
stop:
对每一个系统lv执行如下操作
raw /dev/raw/raw23 0 0
restart:
先执行stop,再执行start
Vg下有如下脚本
GDGZ-VPMN-SV03A-DBSVR:/home/cluster/vg # ll
total 20
-rwxr-xr-x 1 root root 81 Mar 23 15:51 vg_config.inc
-rwxr-xr-x 1 root root 1400 Jun 9 2009 vgclean.sh
-rwxr-xr-x 1 root root 2297 Jun 9 2009 vgmonitor.sh
-rwxr-xr-x 1 root root 1532 Jun 9 2009 vgstart.sh
-rwxr-xr-x 1 root root 1400 Jun 9 2009 vgstop.sh
其中vg_config.inc定义了VCS系统识别的2个VG
TOTAL_VGS=2
NAME_OF_VG1=datavg
NAME_OF_VG2=filevg
vgstart.sh执行激活VG操作
vgchange -ay ${VG_NAME} 2>&1
vgstop.sh执行去激活VG操作
vgchange -an ${VG_NAME} 2>&1
vgclean.sh和vgstop.sh内容一样
GDGZ-VPMN-SV03A-DBSVR:/home/cluster/vg # diff ./vgclean.sh ./vgstop.sh
GDGZ-VPMN-SV03A-DBSVR:/home/cluster/vg #
这也是为什么很多资源的clean动作直接引用stop动作脚本的原因!
vgmonitor.sh通过分析vgscan/lvscan的结果来判断vg的状态!
raw下有如下脚本:
GDGZ-VPMN-SV03A-DBSVR:/home/cluster/raw # ll
total 16
-rwxr-xr-x 1 root root 1498 Dec 25 11:24 raw_config.inc
-rwxr-xr-x 1 root root 839 Jun 9 2009 rawmonitor.sh
-rwxr-xr-x 1 root root 54 Jun 9 2009 rawstart.sh
-rwxr-xr-x 1 root root 54 Jun 9 2009 rawstop.sh
其中raw_config.inc定义了VCS监控的raw,具体如下:
#DB2 dataBase Cluster Scripter
DB2_VGNAME=datavg
#raw device count
DB2_LV_COUNT=30
#raw name (corresponding with the raw devices list)
DB2_MOUNT_RAW_NAME_1=/dev/raw/raw23
DB2_MOUNT_RAW_NAME_2=/dev/raw/raw24
DB2_MOUNT_RAW_NAME_3=/dev/raw/raw25
…
rawstart.sh就是执行如下命令:
/home/cluster/lv/binddbraw.sh start
rawstop.sh就是执行如下命令:
/home/cluster/lv/binddbraw.sh stop
rawmonitor.sh就是通过分析raw -qa的结果来判断raw的状态
ora下有如下脚本:
GDGZ-VPMN-SV03A-DBSVR:/home/cluster/ora # ll
total 12
-rwxr-xr-x 1 root root 500 Apr 12 16:06 oramonitor.sh
-rwxr-xr-x 1 root root 207 Jun 9 2009 orastart.sh
-rwxr-xr-x 1 root root 204 Jun 9 2009 orastop.sh
其中orastart.sh即执行如下两条命令
su - oracle -c "/home/oracle/product/11.1/bin/dbstart"
su - oracle -c "/home/oracle/product/11.1/bin/lsnrctl start"
orastop.sh即执行如下两条命令
su - oracle -c "/home/oracle/product/11.1/bin/dbshut"
su - oracle -c "/home/oracle/product/11.1/bin/lsnrctl stop"
oramonitor.sh通过分析ps -fu oracle的结果来判断oracle核心进程ora_pmon ora_ckpt ora_smon ora_dbw ora_lgw tnslsnr是否退出!
app下有如下脚本:
GDGZ-VPMN-SV03A-DBSVR:/home/cluster/app # ll
total 36
-rwx--x--- 1 root root 3693 Apr 1 10:33 monitor.sh
-rwxr-x--- 1 root root 1856 Apr 12 16:18 start.sh
-rwx--x--- 1 root root 2704 Apr 1 10:33 stop.sh
其中start.sh中定制了到IOD1/IOD2/IOD3和各区域userproxy/ccm的主机路由!这个很重要,否则不会从bond0:0(即以浮动IP 172.28.1.35出去),而对端防火墙却只放通了172.28.1.35(10.244.70.11映射)!
route add -host 10.244.74.67 gw 172.28.1.1 dev bond0:0
route add -host 172.10.15.141 gw 172.28.1.1 dev bond0:0
route add -host 172.10.19.135 gw 172.28.1.1 dev bond0:0
route add -host 172.10.12.233 gw 172.28.1.1 dev bond0:0
route add -host 172.20.252.150 gw 172.28.1.1 dev bond0:0
route add -host 172.10.11.150 gw 172.28.1.1 dev bond0:0
route add -host 172.23.252.25 gw 172.28.1.1 dev bond0:0
route add -host 10.243.190.135 gw 172.28.1.1 dev bond0:0
route add -host 10.243.225.135 gw 172.28.1.1 dev bond0:0
后续还要添加4个AO点的主机路由!
另外还有如下3条启动应用资源的命令
su - websms -c "(cd "$WEBSMSPATH"/agent/bin; nohup sh smscenter.sh &)"
su - websms -c "(cd "$WEBSMSPATH"/agent/bin; nohup sh datasync.sh &)"
su - $UOMUSER -c "(cd "$UOMPATH"/bin; fileanalyse &)"
stop.sh其实就是ps -fu 出来上面的对应进程后,再kill -9掉这个进程,具体如下:
ps -fu $WEBSMSUSER | grep $PROC_DATASYNC |grep -v grep | awk '{print $2}'
kill -9 $pid
monitor.sh进程会对上面的3个进程自动拉起!!!即上面的3个进程如果退出,会被Monitor拉起来,而不是直接切换!!!这是由于DB双机上部署这些进程,DB优先级高,相对稳定,应用相对不稳定,如果应用异常就触发切换,对系统的影响更大,故采用了这种方式!
顺便加一句:对连接CCM的2个节点的进程,采用了root定义定时任务的方式监控:
5,10,15,20,25,30,35,40,45,50,55,59 * * * * sh /home/websms/mymonitor.sh
附录:
Datavg下的lv:
GDGZ-VPMN-SV03A-DBSVR:/dev/datavg # ll |awk '{print($9,$10,$11)}'
lv_ctl1 -> /dev/mapper/datavg-lv_ctl1
lv_ctl2 -> /dev/mapper/datavg-lv_ctl2
lv_ctl3 -> /dev/mapper/datavg-lv_ctl3
lv_data41 -> /dev/mapper/datavg-lv_data41
lv_data42 -> /dev/mapper/datavg-lv_data42
lv_data43 -> /dev/mapper/datavg-lv_data43
lv_data44 -> /dev/mapper/datavg-lv_data44
lv_data45 -> /dev/mapper/datavg-lv_data45
lv_data46 -> /dev/mapper/datavg-lv_data46
lv_data47 -> /dev/mapper/datavg-lv_data47
lv_data48 -> /dev/mapper/datavg-lv_data48
lv_data49 -> /dev/mapper/datavg-lv_data49
lv_data50 -> /dev/mapper/datavg-lv_data50
lv_data51 -> /dev/mapper/datavg-lv_data51
lv_data52 -> /dev/mapper/datavg-lv_data52
lv_data53 -> /dev/mapper/datavg-lv_data53
lv_data54 -> /dev/mapper/datavg-lv_data54
lv_data55 -> /dev/mapper/datavg-lv_data55
lv_data56 -> /dev/mapper/datavg-lv_data56
lv_data57 -> /dev/mapper/datavg-lv_data57
lv_data58 -> /dev/mapper/datavg-lv_data58
lv_data59 -> /dev/mapper/datavg-lv_data59
lv_data60 -> /dev/mapper/datavg-lv_data60
lv_data61 -> /dev/mapper/datavg-lv_data61
lv_data62 -> /dev/mapper/datavg-lv_data62
lv_data63 -> /dev/mapper/datavg-lv_data63
lv_data64 -> /dev/mapper/datavg-lv_data64
lv_data65 -> /dev/mapper/datavg-lv_data65
lv_redo1 -> /dev/mapper/datavg-lv_redo1
lv_redo2 -> /dev/mapper/datavg-lv_redo2
lv_redo3 -> /dev/mapper/datavg-lv_redo3
lv_spfile -> /dev/mapper/datavg-lv_spfile
lv_sysaux -> /dev/mapper/datavg-lv_sysaux
lv_system -> /dev/mapper/datavg-lv_system
lv_temp -> /dev/mapper/datavg-lv_temp
lv_undotbs -> /dev/mapper/datavg-lv_undotbs
lv_users -> /dev/mapper/datavg-lv_users
filevg下的lv:
GDGZ-VPMN-SV03A-DBSVR:/dev/filevg # ll |awk '{print($9,$10,$11)}'
lv_back -> /dev/mapper/filevg-lv_back
lv_datasync -> /dev/mapper/filevg-lv_datasync
lv_websms -> /dev/mapper/filevg-lv_websms
回到上面的binddbraw.sh
raw /dev/raw/raw23 /dev/datavg/lv_spfile
raw /dev/raw/raw24 /dev/datavg/lv_ctl1
raw /dev/raw/raw25 /dev/datavg/lv_ctl2
raw /dev/raw/raw26 /dev/datavg/lv_ctl3
raw /dev/raw/raw27 /dev/datavg/lv_sysaux
raw /dev/raw/raw28 /dev/datavg/lv_system
raw /dev/raw/raw29 /dev/datavg/lv_temp
raw /dev/raw/raw30 /dev/datavg/lv_undotbs
raw /dev/raw/raw31 /dev/datavg/lv_users
raw /dev/raw/raw32 /dev/datavg/lv_redo1
raw /dev/raw/raw33 /dev/datavg/lv_redo2
raw /dev/raw/raw34 /dev/datavg/lv_redo3
raw /dev/raw/raw41 /dev/datavg/lv_data41
raw /dev/raw/raw42 /dev/datavg/lv_data42
raw /dev/raw/raw43 /dev/datavg/lv_data43
raw /dev/raw/raw44 /dev/datavg/lv_data44
raw /dev/raw/raw45 /dev/datavg/lv_data45
raw /dev/raw/raw46 /dev/datavg/lv_data46
raw /dev/raw/raw47 /dev/datavg/lv_data47
raw /dev/raw/raw48 /dev/datavg/lv_data48
raw /dev/raw/raw49 /dev/datavg/lv_data49
raw /dev/raw/raw50 /dev/datavg/lv_data50
raw /dev/raw/raw51 /dev/datavg/lv_data51
raw /dev/raw/raw52 /dev/datavg/lv_data52
raw /dev/raw/raw53 /dev/datavg/lv_data53
raw /dev/raw/raw54 /dev/datavg/lv_data54
raw /dev/raw/raw55 /dev/datavg/lv_data55
raw /dev/raw/raw56 /dev/datavg/lv_data56
raw /dev/raw/raw57 /dev/datavg/lv_data57
raw /dev/raw/raw58 /dev/datavg/lv_data58
raw /dev/raw/raw59 /dev/datavg/lv_data59
raw /dev/raw/raw60 /dev/datavg/lv_data60
raw /dev/raw/raw61 /dev/datavg/lv_data61
raw /dev/raw/raw62 /dev/datavg/lv_data62
raw /dev/raw/raw63 /dev/datavg/lv_data63
raw /dev/raw/raw64 /dev/datavg/lv_data64
raw /dev/raw/raw65 /dev/datavg/lv_data65
可知,实际就是绑定datavg下的lv/raw
了解了上面的原理:
假如双机异常,应急方案就是尽快在一台单机上启动DB,然后启动连接IOD的smscenter!最后启动同步开户文件的datasync和fileanalyse!
启动DB方法如下:
1, 在一台机器上尽快恢复浮动IP,可以使用资源最小方法。仅保留浮动IP资源!
2, 在获得浮动IP的机器上激活VG
3, 在获得浮动IP的机器上绑定raw
4, 启动DB(执行/home/cluster/ora/orastart.sh)