某数据库的节点4因网络问题,主机关闭。
主机重启后,客户启动crs后,发现crs启动hang住。
# sh -x /etc/init.cssd startcheck
+ ORA_CRS_HOME=/oracle/product/10.2.0/crs
+ ORACLE_USER=oracle
+ ORACLE_HOME=/oracle/product/10.2.0/crs
+ export ORACLE_HOME
+ export ORA_CRS_HOME
+ export ORACLE_USER
+ DISABLE_OPROCD=false
+ OPROCD_DEFAULT_TIMEOUT=1000
+ OPROCD_DEFAULT_MARGIN=500
+ OPROCD_CHECK_TIMEOUT=2000
+ OPROCD_STOP_TIMEOUT=2000
+ OPROCD_DEFAULT_HISTORGRAM=
+ HOSTN=/bin/hostname
+ EXPRN=/usr/bin/expr
+ CUT=/usr/bin/cut
+ AWK=/bin/awk
+ ECHO=echo
+ TR=/bin/tr
+ /bin/uname
+ [ SunOS = AIX ]
+ /bin/uname
+ [ Linux = AIX ]
+ + /bin/hostname
HOST=sgdb2
+ + /usr/bin/expr sgdb2 : .*
len1=5
+ + /usr/bin/expr match sgdb2 [0-9]*\.[0-9]*\.[0-9]*\.[0-9]*
len2=0
+ [ 5 != 0 ]
+ + echo sgdb2
+ /usr/bin/cut -d. -f1
HOST=sgdb2
+ + /bin/tr [:upper:] [:lower:]
+ echo sgdb2
HOST=sgdb2
+ PS=/bin/ps
+ PSE=/bin/ps -e
+ PSEF=/bin/ps -ef
+ HEAD=/bin/head
+ GREP=/bin/grep
+ KILL=/bin/kill
+ KILLTERM=/bin/kill -TERM
+ KILLDIE=/bin/kill -9
+ KILLCHECK=/bin/kill -0 6357102
+ SLEEP=/bin/sleep
+ NULL=/dev/null
+ UNAME=/bin/uname
+ CAT=/bin/cat
+ RMF=/bin/rm -f
+ TEST=/bin/test
+ WCL=/bin/wc -l
+ TOUCH=/bin/touch
+ SU=/bin/su
+ SED=/bin/sed
+ RENICE=/bin/renice
+ RTGPID=/bin/priocntl -s -c RT -i pgid
+ XARGS=/bin/xargs
+ PWD_COMMAND=/usr/bin/pwd
+ MVF=/bin/mv -f
+ EVAL=eval
+ LS=/bin/ls
+ BASENAME=/bin/basename
+ DATE=/bin/date
+ MKDIRP=/bin/mkdir -p
+ CHOWN=/bin/chown
+ CHMOD=/bin/chmod
+ SYNC=/bin/sync
+ USING_VC=0
+ OMONSLEEP=
+ SYNCSLEEP=0
+ + /bin/date +%Y-%m-%d-%H:%M:%S
UNIQUEDATE=2013-12-16-11:45:09
+ UNIQUECORE=core.2013-12-16-11:45:09
+ FINDCLSVMON=/bin/ps -e -o comm,pid | /bin/grep '^oclsvmon'
+ FINDCLSOMON=/bin/ps -e -o comm,pid | /bin/grep '^oclsomon'
+ FINDCSSD=/bin/ps -e -o comm,pid | /bin/grep '^ocssd'
+ FINDPROCD=/bin/ps -e -o comm,pid | /bin/grep '^oprocd'
+ LOGERR=/bin/logger -puser.alert
+ LOGMSG=/bin/logger -puser.err
+ CLEANREBOOTLOCK=/bin/true
+ OPROCD=/oracle/product/10.2.0/crs/bin/oprocd
+ CRSCTL=/oracle/product/10.2.0/crs/bin/crsctl
......
+ /etc/init.cssd runcheck
+ STATUS=0
+ [ 0 != 0 ]
+ [ 0 -eq 1 ]
+ [ ! -r /oracle/product/10.2.0/crs/bin/crsctl ]
+ [ = CSS ]
+ /bin/su oracle -c /oracle/product/10.2.0/crs/bin/crsctl check boot > /tmp/crsctl.6357102
+ RC=8
+ [ 8 != 0 ]
+ /bin/logger -puser.err Cluster Ready Services waiting on dependencies. Diagnostics in /tmp/crsctl.6357102.
+ /bin/sleep 60
+ /bin/su oracle -c /oracle/product/10.2.0/crs/bin/crsctl check boot > /tmp/crsctl.6357102
+ RC=8
+ [ 8 != 0 ]
+ /bin/logger -puser.err Cluster Ready Services waiting on dependencies. Diagnostics in /tmp/crsctl.6357102.
+ /bin/sleep 60
+ /bin/su oracle -c /oracle/product/10.2.0/crs/bin/crsctl check boot > /tmp/crsctl.6357102
+ RC=8
+ [ 8 != 0 ]
+ /bin/logger -puser.err Cluster Ready Services waiting on dependencies. Diagnostics in /tmp/crsctl.6357102.
+ /bin/sleep 60
Broadcast message from root@sgdb2 (tty) at 11:47:24 ...
Starting Concurrent Logical Volume Manager (gsclvmd) subsystem on sgdb2
Broadcast message from root@sgdb2 (tty) at 11:47:25 ...
Starting Cluster Information Services (clinfoES) subsystem on sgdb2
+ /bin/su oracle -c /oracle/product/10.2.0/crs/bin/crsctl check boot > /tmp/crsctl.6357102
+ RC=0
+ [ 0 != 0 ]
+ /bin/logger -puser.err Cluster Ready Services completed waiting on dependencies.
+ /bin/rm -f /tmp/crsctl.6357102
+ /bin/sleep 0
+ exit 0
查看错误信息
#cat /tmp/crsctl.6357102
OCR initialization failed accessing OCR device: PROC-26: Error while accessing the physical storage Operating System error [No such device or address] [6]
HA查看
# lssrc -g cluster
Subsystem Group PID Status
clstrmgrES cluster 5374026 active
应该是因为该节点的HA没起导致的ocr读取失败,从而是crs状态异常。
拉起HA后,然后拉起CRS。最后拉库的时候一直停留在
alter database open; 有1小时
后台日志没有报错。
以前也出现过这种情况,是出现enq:TT锁阻塞了。
column event format a30
column sess format a20
set linesize 150
break on id1 skip 1
select decode(request,0,'Holder:',' Waiter:') || s.inst_id || ':' || s.sid||','|| s.serial# sess,
id1, id2, lmode, request, l.type, ctime, s.sql_id, s.event,s.last_call_et
-- ,s.service_name
from gv$lock l, gv$session s
where (id1, id2, l.type) in
(select id1, id2, type from gv$lock where request>0
)
and l.sid=s.sid
and l.inst_id=s.inst_id
order by id1, ctime desc, request
/
SESS ID1 ID2 LMODE REQUEST TY CTIME SQL_ID EVENT LAST_CALL_ET
-------------------- ---------- ---------- ---------- ---------- -- ---------- ------------- ------------------------------ ------------
Holder:1:1770,3730 0 0 4 0 TT 2324808 SQL*Net message from client 1458937
Waiter:4:2265,1 0 0 6 TT 2581 a01hp0psv0rrh enq: TT - contention 2587
Waiter:1:2209,1 0 0 4 TT 2426 4gd6b1r53yt88 enq: TT - contention 3438988
Waiter:3:2280,1 0 0 4 TT 2349 4gd6b1r53yt88 enq: TT - contention 5709961
Waiter:2:2209,1 0 0 4 TT 2234 4gd6b1r53yt88 enq: TT - contention 5193911
可以看到sid为1770 的进程将羡慕几个都给阻塞了。
接着找1770对应的spid
col username format a10
col program format a10
col event format a23
col spid format a10
col machine format a20
col blocking_session format 99999
set linesize 500
set pagesize 1000
select s.sid,s.serial#,s.username,s.machine,p.spid,s.program,s.sql_id,s.event,s.last_call_et,s.blocking_session
from v$session s,v$process p
where
p.addr=s.paddr and s.sid=1770 order by last_call_et desc;
SID SERIAL# USERNAME MACHINE SPID PROGRAM SQL_ID EVENT LAST_CALL_ET BLOCKING_SESSION
---------- ---------- ---------- -------------------- ---------- ---------- ------------- ----------------------- ------------ ----------------
1770 3730 DWOUWENMIN SWG\NOAS-MET1-3650 757908 PlSqlDev.e SQL*Net message from cl 1458865
xe ient
kill -9 757908 后 库open了
中间还试过debug,无结果。oracle debug的方法:
SQL> oradebug setmypid;
Statement processed.
SQL> oradebug unlimit;
Statement processed.
SQL> oradebug -g all hanganalyze 3;
来自 “ ITPUB博客 ” ,链接:http://blog.itpub.net/24996904/viewspace-1063536/,如需转载,请注明出处,否则将追究法律责任。
转载于:http://blog.itpub.net/24996904/viewspace-1063536/