Python 监控 Oozie 任务失败及延时告警
思路:
1、连接集群中保存任务运行信息的数据库 oozie
2、查询近三分钟内失败或运行时间超过1小时的任务
3、调用钉钉接口发送告警信息
# -*- coding: UTF-8 -*-
import mysql.connector
import sys
import datetime
import requests
# Python 2 only: sys.setdefaultencoding is removed from the sys module at
# interpreter startup, so sys is reloaded to get it back before the default
# str/unicode conversion encoding is switched to UTF-8.
# NOTE(review): presumably needed so the Chinese alert text can be combined
# with values read from the database; neither call exists under Python 3.
reload(sys)
sys.setdefaultencoding('utf8')
# Keyword arguments passed verbatim to mysql.connector.connect().
MYSQL_CONFIG = dict(
    host='******',
    user='root',
    password='******',
    database='oozie',
    port=3306,
)

# DingTalk robot webhook that receives the alert messages.
url = 'https://oapi.dingtalk.com/robot/send?access_token=*************'
def connMysql():
    """Open a connection to the database described by MYSQL_CONFIG.

    Returns:
        The connection object returned by mysql.connector.connect(). The
        caller is responsible for closing it.
    """
    db = mysql.connector.connect(**MYSQL_CONFIG)
    # Report the host/database actually used rather than repeating literals,
    # so the log line stays correct when MYSQL_CONFIG is edited.
    print("连接上服务器:%s数据库:%s" % (MYSQL_CONFIG['host'], MYSQL_CONFIG['database']))
    return db
def failtaskMonitor(db):
    """Send one alert per workflow job that was killed in the last 3 minutes.

    Selects rows from WF_JOBS whose status is 'KILLED' and whose end_time lies
    between three minutes ago and now (database clock), then passes one
    markdown message per row to sendMsg().

    Args:
        db: open database connection exposing cursor().
    """
    # NOTE(review): the 3-minute window presumably matches the interval at
    # which this script is scheduled - confirm against the cron entry.
    sql = (
        "select app_name,user_name,end_time,start_time from WF_JOBS "
        "where status='KILLED' "
        "and end_time between date_add(now(),interval-3 minute) and now()"
    )
    cur = db.cursor()
    try:
        cur.execute(sql)
        datalist = cur.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cur.close()

    for (app_name, user_name, end_time, start_time) in datalist:
        sendmsg = {
            "msgtype": "markdown",
            "markdown": {
                "title": "任务失败告警",
                # The opening bracket was missing in the original text; it now
                # matches the format used by the delay alert.
                "text": "任务【%s】执行失败,请及时处理\n\n" % app_name +
                        "**任务开始时间**: %s \n\n" % start_time +
                        "**任务结束时间**: %s \n\n" % end_time +
                        "**任务提交人**: %s \n\n" % user_name
            }
        }
        sendMsg(sendmsg)
def delaytaskMonitor(db):
    """Send one alert per workflow job that has been running over 60 minutes.

    Selects rows from WF_JOBS whose status is 'RUNNING' and whose start_time
    is more than 60 minutes before now (database clock), skipping the job
    named 'error_log_track_workflow', then passes one markdown message per
    row to sendMsg().

    Args:
        db: open database connection exposing cursor().
    """
    sql = (
        "select app_name,user_name,start_time,"
        "TIMESTAMPDIFF(MINUTE, start_time, now()) as diff from WF_JOBS "
        "where status='RUNNING' "
        "and TIMESTAMPDIFF(MINUTE, start_time, now()) >60 "
        "and app_name <> 'error_log_track_workflow'"
    )
    cur = db.cursor()
    try:
        cur.execute(sql)
        datalist = cur.fetchall()
    finally:
        # Release the cursor even when the query fails.
        cur.close()

    for (app_name, user_name, start_time, diff) in datalist:
        sendmsg = {
            "msgtype": "markdown",
            "markdown": {
                "title": "任务延迟告警",
                "text": "任务【%s】执行超时,请优化\n\n" % app_name +
                        "**任务已执行**: %s 分钟\n\n" % diff +
                        "**任务开始时间**: %s \n\n" % start_time +
                        "**任务提交人**: %s \n\n" % user_name
            }
        }
        sendMsg(sendmsg)
def sendMsg(sendmsg):
    """POST one alert payload to the DingTalk robot webhook.

    The payload is sent as JSON to the module-level ``url``. A non-zero (or
    missing) ``errcode`` in the response is reported on stdout; it is not
    raised.

    Args:
        sendmsg: JSON-serialisable dict describing the message.

    Raises:
        requests.exceptions.RequestException: on connection failure or
            timeout.
        ValueError: if the response body is not valid JSON.
    """
    # Without a timeout requests waits indefinitely, which would stall the
    # whole monitoring run on a hung connection.
    req = requests.post(url, json=sendmsg, timeout=10)
    result = req.json()
    # .get() avoids a KeyError when the response lacks the expected fields.
    errcode = result.get('errcode')
    if errcode != 0:
        print('notify dingtalk error: %s, errmsg: %s' % (errcode, result.get('errmsg')))
if __name__ == '__main__':
    # Minute of the current hour, used to throttle the delay check below.
    now_time = datetime.datetime.now().minute
    db = connMysql()
    try:
        # Failed-task monitoring runs on every invocation.
        failtaskMonitor(db)
        # Delayed-task monitoring runs only when the script starts at minute
        # 0, i.e. at most once per hour.
        # NOTE(review): this assumes the scheduler fires exactly on the hour.
        if now_time == 0:
            delaytaskMonitor(db)
    finally:
        # Always release the database connection, even if a monitor fails.
        db.close()