需求: 远程unix主机的cpu load有时过高,现在需要对它进行监控,当超过阈值时通过email报警。访问该主机只能通过telnet,且不能在其上安装任何其他library。
现有资源: 已经有nagios环境,python已自带telnetlib模块,可以通过执行uptime命令获取1(5,15)分钟的平均cpu load。
通常,用shell script 编写nagios命令规范如下:
echo "OK status: ..."; exit 0
echo "WARNING status: ..."; exit 1
echo "CRITICAL status: ..."; exit 2
echo "UNKNOWN status: ..."; exit 3
接口规范依赖exit code.
完成后的python(2.7) 代码如下:
#!/usr/bin/python
# -*- coding: utf-8 -*-
import telnetlib
import sys,os
import datetime
import traceback
import logging
import logging.handlers
import getopt
def script_path():
    """Return the absolute directory the running script lives in.

    ``sys.argv[0]`` is resolved through symlinks first.  When the resolved
    path is a regular file its parent directory is used; otherwise the
    resolved path itself is returned (made absolute).
    """
    resolved = os.path.realpath(sys.argv[0])
    base_dir = os.path.dirname(resolved) if os.path.isfile(resolved) else resolved
    return os.path.abspath(base_dir)
# Record layout and timestamp format shared by the console and file handlers.
LOGGING_MSG_FORMAT = '[%(asctime)s] [%(levelname)s] [%(module)s] [%(funcName)s] [%(lineno)d] %(message)s'
LOGGING_DATE_FORMAT = '%Y-%m-%d %H:%M:%S'
logging.basicConfig(level=logging.DEBUG, format=LOGGING_MSG_FORMAT, datefmt=LOGGING_DATE_FORMAT)
log = logging.getLogger('check_cpu_load_average_via_telnet')

# Log under the Nagios installation when it exists, otherwise next to the
# script itself.
if os.path.exists("/usr/local/nagios"):
    log_path = os.path.join("/usr/local/nagios", 'logs')
else:
    log_path = os.path.join(script_path(), 'logs')
if not os.path.exists(log_path):
    try:
        os.makedirs(log_path)
    except OSError:
        # A concurrently started check may have created the directory between
        # the exists() test and makedirs(); only a real failure is re-raised.
        if not os.path.isdir(log_path):
            raise
log_file = os.path.join(log_path, 'check_cpu_load_average_via_telnet.log')

# File handler that rolls the log over at midnight (interval 1).
logger = logging.handlers.TimedRotatingFileHandler(log_file, 'midnight', 1)
# Pass the date format too, so file records carry the same timestamps as the
# records emitted through basicConfig's handler.
logger.setFormatter(logging.Formatter(LOGGING_MSG_FORMAT, LOGGING_DATE_FORMAT))
log.addHandler(logger)
# Nagios plugin exit codes; the monitoring interface depends on them.
_EXIT_OK = 0
_EXIT_WARNING = 1
_EXIT_CRITICAL = 2
_EXIT_UNKNOWN = 3

_CRLF = '\r\n'
# Shell prompt printed by the monitored host after a successful login.
# Hosts with a different prompt (e.g. '$ ') need this adjusted.
_SHELL_PROMPT = 'telecom> '


def _parse_options(argv):
    """Parse the plugin's command-line options into a settings dict.

    Recognised options: --host, --port, --user, --password, --cpuLoadLimit
    and -t/--timeout (the short form is accepted because check_nrpe passes
    the timeout as -t).

    Returns a dict with the keys 'host', 'port', 'username', 'password',
    'cpuLoadLimit' and 'timeout'; options that were not supplied keep their
    defaults (port 23, timeout 30, everything else None).

    Raises getopt.GetoptError for unknown options and ValueError when the
    port, limit or timeout value is not numeric.
    """
    opts, _positional = getopt.getopt(
        argv, "t:",
        ["host=", "port=", "user=", "password=", "cpuLoadLimit=", "timeout="])
    settings = {
        'host': None,
        'port': 23,
        'username': None,
        'password': None,
        'cpuLoadLimit': None,
        'timeout': 30,
    }
    for name, value in opts:
        # Exact comparison on purpose: ``name in ('--host')`` would be a
        # substring test, because ('--host') is a plain str, not a tuple.
        if name == '--host':
            settings['host'] = value
        elif name == '--port':
            settings['port'] = int(value)
        elif name == '--user':
            settings['username'] = value
        elif name == '--password':
            settings['password'] = value
        elif name == '--cpuLoadLimit':
            settings['cpuLoadLimit'] = float(value)
        elif name in ('-t', '--timeout'):
            settings['timeout'] = round(float(value))
    return settings


def _read_uptime_output(host, port, username, password, timeout):
    """Log in over telnet, run ``uptime`` and return the session output.

    The connection is always closed, including when a read or write fails.
    """
    tn = telnetlib.Telnet(host=host, port=port, timeout=timeout)
    try:
        tn.read_until('login: ', timeout=5)
        tn.write(username + _CRLF)
        tn.read_until('Password: ', timeout=5)
        tn.write(password + _CRLF)
        tn.read_until(_SHELL_PROMPT, timeout=5)
        tn.write('uptime && sleep 1 ' + _CRLF)
        log.info("run command: uptime")
        tn.write("exit && sleep 1" + _CRLF)
        log.info("run command: exit")
        # read_all() returns once the remote side closes the connection,
        # which the queued ``exit`` command brings about.
        output = tn.read_all()
        log.info("Result of Commands:%s", output)
        return output
    finally:
        tn.close()


def _run_check(argv):
    """Run the CPU-load check and return the Nagios exit code.

    Prints exactly one status line for Nagios.  Exceptions raised by option
    parsing or the telnet session propagate to the caller.
    """
    settings = _parse_options(argv)
    host = settings['host']
    cpu_load_limit = settings['cpuLoadLimit']
    log.info("telnet %s:%s with user:%s, set cpu load limit=%s,timeout=%s",
             host, str(settings['port']), settings['username'],
             str(cpu_load_limit), str(settings['timeout']))

    # Without these the check cannot give a meaningful answer; in particular
    # a missing limit would make every comparison below fall through to the
    # CRITICAL branch on Python 2.
    required = (('--host', 'host'), ('--user', 'username'),
                ('--password', 'password'), ('--cpuLoadLimit', 'cpuLoadLimit'))
    missing = [flag for flag, key in required if settings[key] is None]
    if missing:
        print("[unknown] missing required option(s): %s" % ", ".join(missing))
        return _EXIT_UNKNOWN

    output = _read_uptime_output(host, settings['port'], settings['username'],
                                 settings['password'], settings['timeout'])

    # The 1-minute average is the first comma-separated field after the
    # last 'load average:' marker in the captured output.
    first_field = output.split('load average:')[-1].strip().split(",")[0]
    try:
        cpu_load_1_minute = float(first_field)
    except ValueError:
        # Log before returning so the failure is actually recorded.
        log.error(traceback.format_exc())
        print("exception occured while parsing the cpu_load_1_minute")
        return _EXIT_UNKNOWN

    if cpu_load_1_minute < cpu_load_limit:
        print("[ok] cpu load = %s for %s" % (str(cpu_load_1_minute), host))
        return _EXIT_OK
    if cpu_load_1_minute == cpu_load_limit:
        print("[warn] cpu load approach to %s for %s" % (str(cpu_load_1_minute), host))
        return _EXIT_WARNING
    print("[Not ok] cpu load = %s[limit:%s] for %s"
          % (str(cpu_load_1_minute), str(cpu_load_limit), host))
    return _EXIT_CRITICAL


if __name__ == '__main__':
    try:
        # Python 2 idiom: reload(sys) makes sys.setdefaultencoding available
        # again so the default str/unicode codec can be switched to UTF-8.
        reload(sys)
        sys.setdefaultencoding("utf-8")
        exit_code = _run_check(sys.argv[1:])
    except Exception:
        # Any unexpected failure is reported to Nagios as UNKNOWN.
        # SystemExit is not an Exception subclass, so it is never caught
        # (and re-wrapped) here.
        errMsg = traceback.format_exc()
        log.error(errMsg)
        print("exception occured, err msg:%s" % errMsg)
        exit_code = _EXIT_UNKNOWN
    # Exit with a plain int so the status reaches Nagios unchanged.
    sys.exit(exit_code)
备注: 为防止check_nrpe命令超时(默认10秒),调用时加了参数 -t 30。