Oracle\Troubleshooting\ORA errors: checking the alert_log showed archived logs had filled the 2 GB FLASH_RECOVERY_AREA

1、ORA-00304

[oracle@testosc:/home/oracle]$oerr ora 00304
00304, 00000, "requested INSTANCE_NUMBER is busy"
// *Cause:  An instance tried to start by using a value of the
//          initialization parameter INSTANCE_NUMBER that is already in use.
// *Action: Either
//       a) specify another INSTANCE_NUMBER,
//       b) shut down the running instance with this number
//       c) wait for instance recovery to complete on the instance with
//          this number.
[oracle@testosc:/home/oracle]$

Error background:

In an Oracle 11g RAC three-node environment, the public IPs of all three nodes were changed without first shutting down the HAS service. After the restart, the instance on one of the nodes would not start.

# Cluster status
[root@testosa ~]# crsctl status resource -t
--------------------------------------------------------------------------------
NAME           TARGET  STATE        SERVER                   STATE_DETAILS       
--------------------------------------------------------------------------------
Local Resources
--------------------------------------------------------------------------------
ora.DATA.dg
               ONLINE  ONLINE       testosa                                      
               ONLINE  ONLINE       testosb                                      
               ONLINE  ONLINE       testosc                                      
ora.DATA02.dg
               ONLINE  ONLINE       testosa                                      
               ONLINE  ONLINE       testosb                                      
               ONLINE  ONLINE       testosc                                      
ora.GRID.dg
               ONLINE  ONLINE       testosa                                      
               ONLINE  ONLINE       testosb                                      
               ONLINE  ONLINE       testosc                                      
ora.LISTENER.lsnr
               ONLINE  ONLINE       testosa                                      
               ONLINE  ONLINE       testosb                                      
               ONLINE  ONLINE       testosc                                      
ora.RECOVERY.dg
               ONLINE  ONLINE       testosa                                      
               ONLINE  ONLINE       testosb                                      
               ONLINE  ONLINE       testosc                                      
ora.asm
               ONLINE  ONLINE       testosa                  Started             
               ONLINE  ONLINE       testosb                  Started             
               ONLINE  ONLINE       testosc                  Started             
ora.gsd
               OFFLINE OFFLINE      testosa                                      
               OFFLINE OFFLINE      testosb                                      
               OFFLINE OFFLINE      testosc                                      
ora.net1.network
               ONLINE  ONLINE       testosa                                      
               ONLINE  ONLINE       testosb                                      
               ONLINE  ONLINE       testosc                                      
ora.ons
               ONLINE  ONLINE       testosa                                      
               ONLINE  ONLINE       testosb                                      
               ONLINE  ONLINE       testosc                                      
--------------------------------------------------------------------------------
Cluster Resources
--------------------------------------------------------------------------------
ora.LISTENER_SCAN1.lsnr
      1        ONLINE  ONLINE       testosc                                      
ora.cvu
      1        ONLINE  ONLINE       testosa                                      
ora.oc4j
      1        ONLINE  ONLINE       testosb                                      
ora.rac_db.db
      1        OFFLINE OFFLINE                               Instance Shutdown   
      2        ONLINE  ONLINE       testosa                  Open                # node a's own instance number was 1; it is now 2
      3        ONLINE  ONLINE       testosb                  Open                
ora.scan1.vip
      1        ONLINE  ONLINE       testosc                                      
ora.testosa.vip
      1        ONLINE  ONLINE       testosa                                      
ora.testosb.vip
      1        ONLINE  ONLINE       testosb                                      
ora.testosc.vip
      1        ONLINE  ONLINE       testosc                                      
[root@testosa ~]# 

# Check the instance status
[root@testosa ~]# srvctl status database -d rac_db -v
Instance racdb_2 is running on node testosa. Instance status: Open.      # note: racdb_2, not racdb_1, is running on testosa
Instance racdb_3 is running on node testosb. Instance status: Open.
Database rac_db is not running on node testosc
[root@testosa ~]# 

# Query each instance's INSTANCE_NUMBER
[oracle@testosa:/home/oracle]$env|grep ORACLE
ORACLE_UNQNAME=rac_db
ORACLE_SID=racdb_1
ORACLE_BASE=/oracle/app/oracle
ORACLE_TERM=xterm
ORACLE_HOME=/oracle/app/oracle/product/11.2.0/db_1
[oracle@testosa:/home/oracle]$
SQL> select HOST_NAME,INSTANCE_NUMBER,INSTANCE_NAME,STATUS from gv$instance;

HOST_NAME    INSTANCE_NUMBER INSTANC STATUS
---------- ----------------- ------- ------------------------
testosa                    2 racdb_2 OPEN
testosb                    3 racdb_3 OPEN


SQL> 
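
As a cross-check, the INSTANCE_NUMBER each running instance is actually using (and therefore which numbers are "busy") can also be read from gv$parameter; a minimal query against that standard view:

SQL> select inst_id, value as instance_number from gv$parameter where name = 'instance_number';

Any value returned here is one that a newly starting instance must not reuse, which is exactly what ORA-00304 complains about.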


# Try to start the instance locally on node c
[oracle@testosc:/home/oracle]$sqlplus / as sysdba

SQL*Plus: Release 11.2.0.4.0 Production on Sat Dec 9 13:18:09 2023

Copyright (c) 1982, 2013, Oracle.  All rights reserved.

Connected to an idle instance.

SQL> 
SQL> startup
ORA-00304: requested INSTANCE_NUMBER is busy

# At this point the ORACLE_SID has to be reset

[oracle@testosc:/home/oracle]$export ORACLE_SID=racdb_1
[oracle@testosc:/home/oracle]$sqlplus / as sysdba

SQL*Plus: Release 11.2.0.4.0 Production on Sat Dec 9 13:49:03 2023

Copyright (c) 1982, 2013, Oracle.  All rights reserved.

Connected to an idle instance.

SQL> startup
ORA-01078: failure in processing system parameters
LRM-00109: could not open parameter file '/oracle/app/oracle/product/11.2.0/db_1/dbs/initracdb_1.ora'
SQL> 

For this missing path, simply copying a parameter file into place is enough; see the sketch below.
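
A minimal sketch of that copy, run on node c as the oracle user. The source file name and the ASM spfile path are assumptions; check the real spfile location on a surviving node (show parameter spfile) first.

cd $ORACLE_HOME/dbs
# option 1: copy the stub pfile from a node whose instance starts, renaming it for this SID
scp oracle@testosa:$ORACLE_HOME/dbs/initracdb_2.ora ./initracdb_1.ora
# option 2 (alternative): create a one-line pfile pointing at the shared spfile in ASM -- path is hypothetical
echo "SPFILE='+DATA/rac_db/spfilerac_db.ora'" > initracdb_1.ora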

2、ORA-00257

[oracle@testos:/home/oracle]$oerr ora 00257
00257, 00000, "Archiver error. Connect AS SYSDBA only until resolved."
// *Cause:  The archiver process received an error while trying to archive
//       a redo log.  If the problem is not resolved soon, the database
//       will stop executing transactions. The most likely cause of this
//       message is that the destination device is out of space to store the
//       redo log file. Another possible cause is that a destination marked
//       as MANDATORY has failed.
// *Action:  Check the alert log and trace files for detailed error
//        information.
[oracle@testos:/home/oracle]$
Problem:
When importing roughly 3 GB of data, keep in mind that in Oracle 10.2.0.1 the archived-log space (the flash recovery area) defaults to 2 GB. Continuous archiving filled that space and the data import stalled. This typically happens during heavy data inserts or large IMP imports.
-----------------------------------------------------------------------------------------------------------
Errors in the alert log:
Wed May 20 17:16:06 2009
Errors in file d:\oracle\product\10.1.0\admin\ZLTEST\bdump\auc_arc1_372.trc:
ORA-19809: limit exceeded for recovery files
ORA-19804: cannot reclaim 9755648 bytes disk space from 2147483648 limit
ARC1: Error 19809 Creating archive log file to
'D:\ORACLE\PRODUCT\10.1.0\FLASH_RECOVERY_AREA\ZLTEST\ARCHIVELOG\2009_05_20\O1_MF_1_478_U_.ARC'
ARC1: All standby destinations failed; successful archival assumed
ARC1: Failed to archive log 6 thread 1 sequence 478 (19809)
Wed May 20 17:16:06 2009
Errors in file d:\oracle\product\10.1.0\admin\ZLTEST\bdump\auc_arc1_372.trc:
ORA-16038: log 6 sequence# 478 cannot be archived
ORA-19809: limit exceeded for recovery files
ORA-00312: online log 6 thread 1: 'D:\ORACLE\PRODUCT\10.1.0\ORADATA\ZLTEST\REDO06'
ORA-00312: online log 6 thread 1: 'D:\ORACLE\PRODUCT\10.1.0\ORADATA\ZLTEST\REDO061'
Errors in file d:\oracle\product\10.1.0\admin\ZLTEST\bdump\auc_mmon_2264.trc:
ORA-19815: WARNING: db_recovery_file_dest_size of 2147483648 bytes is 100.00% used, and has 0 remaining bytes available.
-------------------------------------------------------------------------------------------------------------------
Cause:
Checking the alert log showed that archived logs had filled the 2 GB FLASH_RECOVERY_AREA.
Solutions
# Solution 1: First manually delete the log files under D:\oracle\product\10.2.0\flash_recovery_area,
then use RMAN to clean up the archived-log records:
1) Command: rman target /
2) Command: crosscheck archivelog all;
3) Command: delete expired archivelog all;
4) Command: exit
# Solution 2: Increase the size of the FLASH_RECOVERY_AREA
Change it: alter system set db_recovery_file_dest_size=8G scope=both;
Check it:  show parameter db_recovery_file_dest_size
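
To see how full the recovery area actually is, before and after the cleanup, the standard views can be queried (v$flash_recovery_area_usage is available from 10.2 onward):

SQL> select name, space_limit, space_used, space_reclaimable from v$recovery_file_dest;
SQL> select file_type, percent_space_used, percent_space_reclaimable from v$flash_recovery_area_usage;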

3、 ASM Instance start failed with ORA-00445 on AIX

A customer runs a two-node Oracle RAC on AIX: an aging heavyweight with 25 GB of memory that users had always reported as very slow. The cluster was restarted because of a storage problem; after the repair, instance 2 would not start while instance 1 came up normally, and there was no apparent network problem. The startup log showed the error occurring right after the ASM instance began starting, and a manual ASM startup timed out with ORA-00445: background process "LMD0" did not start after 120 seconds. This is a fairly old and common problem.

ASM alert log

PING started with pid=7, OS id=270358 
Tue Jul 12 17:09:23 2022
DIA0 started with pid=8, OS id=241918 
Tue Jul 12 17:09:23 2022
LMON started with pid=9, OS id=282632 
Tue Jul 12 17:09:23 2022
SKGXP:[110564060.1]{-}: WARNING: Failed to set buffer limit on IPC interconnect socket
SKGXP:[110564060.1]{-}: Oracle requires that the socket receive buffer size be tunable up to 2048 KB.
Please make sure the kernel parameter which limits the receive socket space set by 
applications (i.e. SO_RCVBUF) is at least that value.
Tue Jul 12 17:11:23 2022
Errors in file /grid/app/diag/asm/+asm/+ASM4/trace/+ASM4_ora_266562.trc  (incident=38253):
ORA-00445: background process "LMD0" did not start after 120 seconds
Incident details in: /grid/app/diag/asm/+asm/+ASM4/incident/incdir_38253/+ASM4_ora_266562_i38253.trc
Tue Jul 12 17:11:26 2022
Dumping diagnostic data in directory=[cdmp_20220712171126], requested by (instance=4, osid=266562), summary=[incident=38253].
USER (ospid: 266562): terminating the instance due to error 445
Instance terminated by USER, pid = 266562

Note:
Two errors stand out: one about the socket receive buffer and one ORA-00445.

WARNING: Failed to set buffer limit on IPC interconnect socket
The cause is that the OS kernel parameter limiting the network socket receive buffer is set too low. It is actually quite commendable that the database software reports this OS-level problem so explicitly.

The commands required to do this depend on the operating system.
For example, as root:
      On Linux execute   # sysctl -w net.core.rmem_max=2097152
      On Solaris execute # ndd -set /dev/udp udp_max_buf 2097152
      On AIX execute     # no -o sb_max=4194304   (note: AIX only permits sizes of 1048576, 4194304 or 8388608)
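
To confirm the value currently in effect before changing anything, the same OS tools can be used in read mode, for example:

# Linux: show the current maximum socket receive buffer
sysctl net.core.rmem_max
# AIX: show the current sb_max setting
no -o sb_max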

However, this was not the key issue: after adjusting the parameter and restarting ASM, ORA-00445 still appeared. Since LMD had timed out, the inter-node network was checked. Pinging by IP address returned instantly, but pinging by hostname took several minutes to come back. The earlier sqlplus / as sysasm showed the same behaviour, waiting several minutes before producing any output, so this was clearly not just a slow machine. truss tracing was enabled.

Linux strace sample
/usr/bin/strace -ftT -o /tmp/strace.out "command"

AIX truss sample
truss -failed -o /tmp/truss.out -p pid
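
truss can also start and trace a command directly instead of attaching to an existing PID; for example (output file and hostname are placeholders):

truss -f -o /tmp/truss_ping.out ping somehost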

The truss log for sqlplus was not kept; below is the truss log for ping, which shows the same time-consuming calls.

258372: _getpid()					= 258372
258372: kopen("/etc/resolv.conf", O_RDONLY)		= 3
258372: kioctl(3, 22528, 0x00000000, 0x00000000)	Err#25 ENOTTY
258372: kioctl(3, 22528, 0x00000000, 0x00000000)	Err#25 ENOTTY
258372: kread(3, " n a m e s e r v e r   1".., 4096)	= 48
258372: kread(3, " n a m e s e r v e r   1".., 4096)	= 0
258372: statx("/etc/resolv.conf", 0x2FF216D8, 76, 0)	= 0
258372: close(3)					= 0
258372: gethostname(0x2FF2174C, 1024)			= 0
258372: access("/usr/lib/nls/msg/en_US/libcnet.cat", 0)	= 0
258372: _getpid()					= 258372
258372: socket(1, 1, 0)					= 3
258372: kfcntl(3, F_SETFD, 0x00000001)			= 0
258372: connext(3, 0x2FF211E0, 1025)			Err#2  ENOENT
258372: close(3)					= 0
258372: kopen("/etc/netsvc.conf", O_RDONLY)		= 3
258372: kioctl(3, 22528, 0x00000000, 0x00000000)	Err#25 ENOTTY
258372: kioctl(3, 22528, 0x00000000, 0x00000000)	Err#25 ENOTTY
258372: kread(3, " #   @ ( # ) 4 3        ".., 4096)	= 4096
258372: kread(3, " o n   a n d   r e s o l".., 4096)	= 620
258372: kread(3, " o n   a n d   r e s o l".., 4096)	= 0
258372: close(3)					= 0
258372: kopen("/etc/irs.conf", O_RDONLY)		Err#2  ENOENT
258372: _thread_self()					= 1048949
258372: getdomainname(0xF06C99D8, 1024)			= 0
258372: _thread_self()					= 1048949
258372: _thread_self()					= 1048949
258372: __libc_sbrk(0x00000000)				= 0x20031EE0
258372: _thread_self()					= 1048949
258372: getdomainname(0xF06C99D8, 1024)			= 0
258372: _thread_self()					= 1048949
258372: _thread_self()					= 1048949
258372: _thread_self()					= 1048949
258372: kopen("/etc/hesiod.conf", O_RDONLY)		Err#2  ENOENT
258372: _thread_self()					= 1048949
258372: getdomainname(0xF06C99D8, 1024)			= 0
258372: _thread_self()					= 1048949
258372: _thread_self()					= 1048949
258372: _thread_self()					= 1048949
258372: getdomainname(0xF06C99D8, 1024)			= 0
258372: _thread_self()					= 1048949
258372: _thread_self()					= 1048949
258372: _thread_self()					= 1048949
258372: getdomainname(0xF06C99D8, 1024)			= 0
258372: _thread_self()					= 1048949
258372: _thread_self()					= 1048949
258372: __libc_sbrk(0x00000000)				= 0x20041EF0
258372: socket(2, 2, 0)					= 3
258372: getsockopt(3, 65535, 4104, 0x2FF203C4, 0x2FF203C0) = 0
258372: connext(3, 0xF0637910, 16)			= 0
258372: _esend(3, 0x2FF211C0, 22, 0, 0x00000000)	= 22
258372: _poll(0x2FF20450, 1, 5000)	(sleeping...)
258372: _poll(0x2FF20450, 1, 5000)			= 0
258372: close(3)					= 0
258372: socket(2, 2, 0)					= 3
258372: _esendto(3, 0x2FF211C0, 22, 0, 0xF0637920, 16, 0x00000000) = 22
258372: _poll(0x2FF20450, 1, 5000)	(sleeping...)
258372: _poll(0x2FF20450, 1, 5000)			= 0
258372: close(3)					= 0
258372: socket(2, 2, 0)					= 3
258372: _esendto(3, 0x2FF211C0, 22, 0, 0xF0637910, 16, 0x00000000) = 22
258372: _poll(0x2FF20450, 1, 5000)	(sleeping...)
258372: _poll(0x2FF20450, 1, 5000)			= 0
258372: close(3)					= 0
258372: socket(2, 2, 0)					= 3
258372: _esendto(3, 0x2FF211C0, 22, 0, 0xF0637920, 16, 0x00000000) = 22
258372: _poll(0x2FF20450, 1, 5000)	(sleeping...)
258372: _poll(0x2FF20450, 1, 5000)			= 0
258372: close(3)					= 0
258372: socket(2, 2, 0)					= 3
258372: _esendto(3, 0x2FF211C0, 22, 0, 0xF0637910, 16, 0x00000000) = 22
258372: _poll(0x2FF20450, 1, 10000)	(sleeping...)
258372: _poll(0x2FF20450, 1, 10000)			= 0
258372: close(3)					= 0
258372: socket(2, 2, 0)					= 3
258372: _esendto(3, 0x2FF211C0, 22, 0, 0xF0637920, 16, 0x00000000) = 22
258372: _poll(0x2FF20450, 1, 10000)	(sleeping...)
258372: _poll(0x2FF20450, 1, 10000)			= 0
258372: close(3)					= 0
258372: socket(2, 2, 0)					= 3
258372: _esendto(3, 0x2FF211C0, 22, 0, 0xF0637910, 16, 0x00000000) = 22
258372: _poll(0x2FF20450, 1, 20000)	(sleeping...)
258372: _poll(0x2FF20450, 1, 20000)			= 0
258372: close(3)					= 0
258372: socket(2, 2, 0)					= 3
258372: _esendto(3, 0x2FF211C0, 22, 0, 0xF0637920, 16, 0x00000000) = 22
258372: _poll(0x2FF20450, 1, 20000)	(sleeping...)

By now you have probably guessed that the next step was to check DNS. Sure enough, only the problem node had a DNS server address configured (/etc/resolv.conf), and the resolution-order file /etc/netsvc.conf was empty.

On AIX, hostname resolution typically uses:

  1. BIND/DNS (domain name server, named)
  2. Network Information Service (NIS)
  3. The local /etc/hosts file

By default, the resolver routines try BIND/DNS first. If the /etc/resolv.conf file does not exist, or BIND/DNS cannot find an entry, NIS is queried if it is running. If NIS is not running, the local /etc/hosts file is searched. The default order can be overridden by creating the configuration file /etc/netsvc.conf and specifying the desired order, e.g. hosts=local,bind4; the OS environment variable NSORDER=local,bind4 can also be used and takes higher priority.
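
For example, a minimal /etc/netsvc.conf using the order mentioned above contains a single directive:

# /etc/netsvc.conf -- consult the local /etc/hosts file first, then IPv4 DNS
hosts = local, bind4

Alternatively, for a single session, export NSORDER=local,bind4; as noted above, the environment variable takes precedence over /etc/netsvc.conf.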

Cause

The delays most likely came from name resolution: DNS lookups were timing out, which slowed database logins and hostname-to-IP resolution.

Solution

Change the resolution order in the /etc/netsvc.conf file:
hosts = local, bind4
or remove the DNS server address from /etc/resolv.conf.

To keep the configuration consistent between instances, the DNS server address in /etc/resolv.conf on the problem node was commented out. After that, ping and sqlplus responded quickly again and CRS started normally.
