使用 Python 脚本监控 Flink 任务,任务失败后发送飞书消息并拨打电话通知
一、先决条件:
1、拥有一台机器的登录权限,且该机器可以访问 Hadoop 的 active 节点
2、飞书机器人地址
3、阿里云 ARMS 自定义集成地址
4、python3.6版本
二、代码
import requests
import json
import time
import subprocess
import urllib.parse
from datetime import datetime
# REST API URL of the cluster applications endpoint on the active node.
yarn_url = 'http://active节点ip:8088/ws/v1/cluster/apps'
# Send the HTTP GET request. The timeout keeps an unresponsive endpoint from
# hanging the monitor indefinitely.
response = requests.get(yarn_url, timeout=10)
# Fail here with a clear HTTP error instead of a confusing JSON/KeyError later.
response.raise_for_status()
# Parse the JSON document carried by the response.
data = response.json()
# Current local time as epoch milliseconds, captured once at start-up.
TIMESTAMP = int(datetime.now().timestamp() * 1000)
# Alert channel: webhook.
def _card_column(element):
    """Wrap one card element in an equal-weight, top-aligned column."""
    return {
        "tag": "column",
        "width": "weighted",
        "weight": 1,
        "vertical_align": "top",
        "elements": [element],
    }


def _card_row(left, right):
    """Build one two-column row of the card from two elements."""
    return {
        "tag": "column_set",
        "flex_mode": "none",
        "background_style": "default",
        "columns": [_card_column(left), _card_column(right)],
    }


def _markdown(content):
    """Return a markdown card element carrying ``content``."""
    return {"tag": "markdown", "content": content}


def webhook_alert(id, name, user, state, finishedTime):
    """Post a red "Flink任务失败" interactive card to the alert-bot webhook.

    The card has three two-column rows: cluster name / task name,
    user name / task ID, failure time / state. The cluster name is a
    hard-coded placeholder in the card text.

    Args:
        id: Application ID shown in the card. The name shadows the builtin
            but is kept so existing callers keep working.
        name: Application name shown in the card.
        user: Submitting user shown in the card.
        state: Application state shown in the card.
        finishedTime: Failure time shown in the card.

    Returns:
        The ``requests`` response of the webhook POST.
    """
    # Test EMR alert bot.
    url = "飞书告警机器人地址粘贴到此"
    headers = {
        'Content-Type': 'application/json',
    }
    cluster_element = {
        "tag": "div",
        "text": {
            "content": "**集群名称:**\n>> 集群名称粘贴到此",
            "tag": "lark_md"
        }
    }
    # f-strings stringify every field, so a non-string value cannot raise
    # TypeError the way plain ``+`` concatenation would.
    card = {
        "header": {
            "template": "red",
            "title": {
                "content": "Flink任务失败",
                "tag": "plain_text"
            }
        },
        "elements": [
            _card_row(cluster_element, _markdown(f"**任务名称:**\n>> {name}")),
            _card_row(
                _markdown(f"**用户名称:**\n>> {user}"),
                _markdown(f"**任务ID:**\n>> {id}"),
            ),
            _card_row(
                _markdown(f"**任务失败时间:**\n>> {finishedTime}"),
                _markdown(f"**状  态:**\n>> {state}"),
            ),
        ]
    }
    body = json.dumps({"msg_type": "interactive", "card": card})
    # The timeout keeps a stalled webhook from blocking the monitor forever.
    resp = requests.post(url=url, data=body, headers=headers, timeout=10)
    return resp
# Alert channel: Feishu.
def feishu_alert(receiver_list, channel, content):
    """Send a message through the Feishu push endpoint.

    The three arguments are forwarded unchanged as the ``receiver_list``,
    ``channel`` and ``content`` fields of the JSON request body.

    Args:
        receiver_list: Value sent as ``receiver_list``.
        channel: Value sent as ``channel``.
        content: Value sent as ``content``.

    Returns:
        The ``requests`` response of the POST.
    """
    url = "http://push.trasre.com/v1/feishu/push"
    headers = {
        'Content-Type': 'application/json',
    }
    request_body = {"receiver_list": receiver_list, "channel": channel, "content": content}
    # The timeout keeps a stalled push endpoint from blocking the caller forever.
    resp = requests.request(
        "POST", url, headers=headers, data=json.dumps(request_body), timeout=10
    )
    return resp
# Walk the application list and collect the FAILED applications.
def get_yarn_apps(payload=None):
    """Return a summary of every FAILED application in a YARN apps payload.

    Applications named ``'Thrift JDBC/ODBC Server'`` are excluded. Each
    matching application appears exactly once in the result.

    Args:
        payload: Parsed JSON of the cluster-apps response, shaped like
            ``{"apps": {"app": [...]}}``. Defaults to the module-level
            ``data`` fetched at start-up.

    Returns:
        A list of dicts with the keys ``id``, ``name``, ``user``, ``state``
        and ``finishedTime`` copied from each matching application.
    """
    if payload is None:
        payload = data
    # NOTE(review): the ResourceManager is understood to answer with
    # ``"apps": null`` when the cluster has no applications; confirm. A
    # missing or null level is treated as "no applications" rather than a crash.
    apps = (payload.get('apps') or {}).get('app') or []
    wanted_keys = ('id', 'name', 'user', 'state', 'finishedTime')
    return [
        {key: app[key] for key in wanted_keys}
        for app in apps
        if app['state'] == 'FAILED' and app['name'] != 'Thrift JDBC/ODBC Server'
    ]
# Upload the alert data to ARMS, which provides the phone-call and SMS notification.
def upload_to_arms(datas):
    """POST ``datas`` as JSON to the ARMS custom-integration endpoint.

    Args:
        datas: JSON-serializable alert payload.

    Returns:
        The ``requests`` response of the POST, so callers can inspect the
        outcome. Callers that ignore the return value are unaffected.
    """
    url = "arms自定义集成地址粘贴到此"
    headers = {'Content-Type': 'application/json'}
    # The timeout keeps a stalled endpoint from blocking the monitor forever.
    response = requests.post(url, headers=headers, json=datas, timeout=10)
    return response
if __name__ == '__main__':
    # Alert only for applications that failed less than two minutes before
    # this run started; older failures are skipped.
    for failed_app in get_yarn_apps():
        elapsed_ms = TIMESTAMP - failed_app['finishedTime']
        if elapsed_ms >= 120000:
            continue
        # finishedTime is treated as epoch milliseconds; format it once and
        # reuse the text for both notification channels.
        finished_at = datetime.fromtimestamp(failed_app['finishedTime'] / 1000)
        finished_text = finished_at.strftime('%Y-%m-%d %H:%M:%S')
        # Message card first, then the ARMS upload.
        webhook_alert(
            failed_app['id'],
            failed_app['name'],
            failed_app['user'],
            failed_app['state'],
            finished_text,
        )
        upload_to_arms({
            "id": failed_app['id'],
            "name": failed_app['name'],
            "user": failed_app['user'],
            "state": failed_app['state'],
            "finishedTime": finished_text,
            "alertName": "告警名称"
        })
三、途中遇到的困难
太多,不想总结