背景:oracle12c rac偶尔会有奇奇怪怪的问题,自己对这套集群架构也不是特别懂,主要在devops优维平台的自定义采集那里做了三个脚本做检测(ASM磁盘容量检测、集群心跳检测、集群状态检测):
1、ASM占用检测:
#!/usr/bin/python
# _*_coding: utf-8_*_
import cx_Oracle
import socket
import subprocess
import json
import platform
def oraclesql(cursor):
#这里我们使用python的open方法打开文件并读取文件内容作为SQL语句执行
#可使用绝对路径或相对路径
fp=open('/home/oracle/scripts/asm_check.sql','r')
fp1=fp.read()
cursor.execute(fp1)
data=cursor.fetchall()
return data
if __name__=="__main__":
oracle_data = {
"ipaddress": ipaddress,
"username": username,
"password": password,
"port": port,
"tnsname": tnsname
}
info_list4=[]
#这里我们利用Python的异常处理来捕获异常,具体用法请参考文章开始提到的教程
try:
db = cx_Oracle.connect(oracle_data['username']+'/'+oracle_data['password']+'@'+oracle_data['ipaddress']+':'+oracle_data['port']+'/'+oracle_data['tnsname'])
except Exception as e:
content= (oracle_data['tnsname']+' is Unreachable,The reason is '+ str(e)).strip()
print (content)
else:
cursor = db.cursor()
data=oraclesql(cursor)
cursor.close()
db.close()
for asm_info in data:
vals = {
"storage_name": asm_info[1],
"TOTAL_GB":asm_info[2],
"FREE_GB":asm_info[3],
"used_percent":asm_info[4]
}
dims = {
"data_name": "oracle12c_asm_check",
}
info_list4.append(
{
"vals": vals,
"dims": dims
}
)
print json.dumps(info_list4)
注意:
1、devops平台自定义采集当然要提前设置好代码里面的参数(username,host,port等,我上面脚本是要传参数的,因为是集群,我这里只连接一个节点执行sql就行了)
2、这里调用了/home/oracle/scripts/asm_check.sql的sql查询,查询asm容量,下面给出这个sql
sql最好进去oracle用户创建
[root@dbaas3 scripts]# cat asm_check.sql
select group_number,name,total_mb/1024,free_mb/1024 , (1-free_mb/total_mb)*100 as "used%" from v$asm_diskgroup
2、心跳检测
#!/usr/local/easyops/python/bin/python
# encoding=utf-8
import socket
import subprocess
import json
import platform
import os,os.path,sys,commands,signal,time
dbaas = {
"dbaas1": dbaas1,
"dbaas2": dbaas2,
"dbaas3": dbaas3
}
def easyops_custom_output(dim_keys, values):
res = {}
res["dims"] = {k: values.pop(k) for k in dim_keys if k in values}
res["vals"] = values
return res
def run_cmd(cmd):
(status,output) = commands.getstatusoutput("%s" %cmd)
return status, output
#网络ping检测
def get_ping_status():
ping_11_status = 0
ping_12_status = 0
ping_13_status = 0
ping_status1,ping_output1 = run_cmd("ping -c 5 -w 5 " + dbaas['dbaas1'] + " >> /dev/null")
if ping_status1 != 0:
ping_11_status = 1
ping_status2,ping_output2 = run_cmd("ping -c 5 -w 5 " + dbaas['dbaas2'] + " >> /dev/null")
if ping_status2 != 0:
ping_12_status = 2
ping_status3,ping_output3 = run_cmd("ping -c 5 -w 5 " + dbaas['dbaas3'] + " >> /dev/null")
if ping_status3 != 0:
ping_13_status = 3
return ping_11_status,ping_12_status,ping_13_status
def get_ping_info(data_name, dim_keys, ping_11_status, ping_12_status, ping_13_status):
port_info = {
"data_name": data_name,
"ping_11_status": ping_11_status,
"ping_12_status": ping_12_status,
"ping_13_status": ping_13_status
}
return easyops_custom_output(dim_keys, port_info)
if __name__ == "__main__":
data_name = "heartbeat_monitor"
dim_keys = ["data_name"]
data = []
#获取网络状态
ping_11_status,ping_12_status,ping_13_status = get_ping_status()
info = get_ping_info(data_name, dim_keys, ping_11_status, ping_12_status, ping_13_status)
data.append(info)
print json.dumps(data)
注:同样,这里的dbaas1-dbaas3也是要传参数,因为我是三节点的rac集群,所以要三个节点之间ping
3、集群状态检测
#!/usr/local/easyops/python/bin/python
# encoding=utf-8
import socket
import subprocess
import json
import platform
import os,os.path,sys,commands,signal,time
def easyops_custom_output(dim_keys, values):
res = {}
res["dims"] = {k: values.pop(k) for k in dim_keys if k in values}
res["vals"] = values
return res
def run_cmd(cmd):
(status,output) = commands.getstatusoutput("%s" %cmd)
return status, output
#集群状态检测
def get_cluster_status():
cluster_status = 0
#run_status,output = run_cmd("su - grid && crs_stat -t -v | grep ora.scan* | grep OFFLINE")
run_status,output = run_cmd("sh /scripts/check_cluster.sh")
#如果脚本查到了offline的信息,那么run_status是为0的,具体结合我写的/scripts/check_cluster.sh脚本内容来看
if run_status == 0:
cluster_status = 1
return cluster_status
def get_cluster_info(data_name, dim_keys, cluster_status):
port_info = {
"data_name": data_name,
"cluster_status": cluster_status
}
return easyops_custom_output(dim_keys, port_info)
if __name__ == "__main__":
data_name = "cluster_monitor"
dim_keys = ["data_name"]
data = []
#获取集群状态
cluster_status = get_cluster_status()
info = get_cluster_info(data_name, dim_keys, cluster_status)
data.append(info)
print json.dumps(data)
我提前在这个节点底下写好了/scripts/check_cluster.sh这个脚本,主要是检测集群服务
[root@dbaas3 scripts]# cat /scripts/check_cluster.sh
#!/bin/bash
su - grid <<EOF
#crs_stat -t -v | grep ora.proxy_advm | grep OFFLINE;
crs_stat -t -v | grep ora.scan* | grep OFFLINE
exit;
EOF
如果这个sh脚本获取到了offline的信息,证明集群有节点异常了,对应上面采集的python会执行这个shell脚本
以上脚本仅供参考哈,我在devops优维平台自定义采集实践过