1、概述
Skywalking
发送告警的基本原理是每隔一段时间轮询skywalking-oap
收集到的链路追踪的数据,再根据所配置的告警规则(如服务响应时间、服务响应时间百分比等)进行判断,如果达到阈值则发送相应的告警信息。 发送告警信息是以线程池异步的方式调用webhook
接口完成的,具体的webhook
接口可以由使用者自行定义,从而可以在指定的webhook
接口中自行编写各种告警方式,比如钉钉告警、邮件告警等等。告警的信息也可以在RocketBot
即ui
中查看到。
目前对应我前面文章中部署的8.5.0
版本支持的告警接口如下:
- 普通webhook
- gRPCHook
- Slack Chat Hook
- WeChat Hook(微信告警)
- Dingtalk Hook(钉钉告警)
- Feishu Hook(飞书告警)
2、告警规则
2.1 默认告警规则
通过/apps/apache-skywalking-apm-bin/config/oal/core.oal 定义指标数据如何查询数据
通过/apps/apache-skywalking-apm-bin/config/alarm-settings.yml定义告警规则.
对一定时间范围内的数据进行计算后告警.
在Skywalking
中,告警规则称为rule
,默认安装的Skywalking
oap server
组件中包含了告警规则的配置文件,位于安装目录下config
文件夹下alarm-settings.yml
文件中,在容器中运行的也是一样的
|
以下是默认的告警规则配置文件内容:
|
现用alarm-settings.yml
|
2.2 告警规则详解
首先需要说明的是,告警规则名称必须具有唯一性,且必须以 _rule
结尾,这里是service_resp_time_rule
(服务响应时间)
- metrics-name:告警指标,指标度量值为
long
、double
或int
类型 - op:度量值和阈值的比较方式,这里是大于
- threshold:阈值,这里是
1000
,毫秒为单位 - period:评估度量标准的时间长度,也就是告警检查周期,分钟为单位
- count:累计达到多少次告警值后触发告警
- silence-period:忽略相同告警信息的周期,默认与告警检查周期一致。简单来说,就是在触发告警时开始计时
N
,在N
+period
时间内保持沉默silence
不会再次触发告警,这和alertmanager
的告警抑制类似 - message:告警消息主体,通过变量在发送消息时进行自动替换
除此之外,还有以下可选(高级)规则配置:
到这里,就能分析出上面列出的所有默认告警规则的含义,依次为:
|
配置好告警规则后重启Skywalking
oap server,结果验证:
如果要添加自定义告警,首先需要在 oal
文件中添加一个指标,
这里添加一个自定义告警:当接口返回状态码为 404、500、502、503、504 其中之一时,就发送告警:
|
3、分发小程序
启动命令为:
|
skywalking-alter.py代码:
# !/usr/bin/env python
# _*_ coding: utf-8 _*_
import sys
reload(sys)
sys.setdefaultencoding('utf8')
import re
import requests
from flask import Flask, request, json
import smtplib
from email.mime.text import MIMEText
import time
import json
import hmac
import hashlib
import base64
import urlparse
import urllib
# Flask application object; the /send_alter route below registers itself on it.
app = Flask(__name__)
# Unsigned DingTalk robot endpoint that receives every matched alarm.
_ALERT_WEBHOOK = 'http://10.10.4.62/dingding-webhook/robot/send?access_token=xxxxxxxxxxxxxxxxxxxxx'
_ALERT_HEADERS = {'Content-Type': 'application/json'}

# Signed DingTalk robot endpoint that additionally receives group_zhulong alarms.
# NOTE(review): token and secret are hard-coded; consider loading them from
# the environment or a config file instead of committing them.
_ZHULONG_WEBHOOK = "http://10.10.4.62/dingding-webhook/robot/send?access_token=xxxxxxxxxxxxxxxxxxxxxxxxxxx"
_ZHULONG_SECRET = 'xxxxxxxxxxxxxxxxx'
_ZHULONG_HEADERS = {'Content-Type': 'application/json;charset=utf-8'}

# Directory with one file per group; each line is a service-name fragment.
_ALERT_GROUP_DIR = '/data/skywalking-alter/project'

# Message body; the placeholders are filled from the per-alarm ``msg`` dict.
_ALERT_TEMPLATE = "服务名称:{name} \n触发规则:{rule_name} \n告警时间:{start_time} \n告警内容:{alarm_message} "

# Seconds to wait for a webhook before giving up on that single delivery.
_ALERT_HTTP_TIMEOUT = 10

_AT_NOBODY = ()
_AT_DEFAULT_MOBILE = ("15885740000",)

# (group file, mobiles to @-mention, (signed webhook, secret) or None).
# Order matters: groups are scanned, and alarms delivered, in this order.
# group_null is a placeholder group and is listed once so that a match in it
# cannot produce duplicate alarms.
_ALERT_GROUPS = (
    ('all_group', _AT_NOBODY, None),
    ('group_cuihan', _AT_DEFAULT_MOBILE, None),
    ('group_guoqilin', _AT_DEFAULT_MOBILE, None),
    ('group_lilianxin', _AT_DEFAULT_MOBILE, None),
    ('group_quhuijiang', _AT_NOBODY, None),
    ('group_ranchengcheng', _AT_DEFAULT_MOBILE, None),
    ('group_zhulong', _AT_DEFAULT_MOBILE, (_ZHULONG_WEBHOOK, _ZHULONG_SECRET)),
    ('group_zhangbin', _AT_DEFAULT_MOBILE, None),
    ('group_wangyu', _AT_DEFAULT_MOBILE, None),
    ('group_yangdaxin', _AT_DEFAULT_MOBILE, None),
    ('group_duanshuwen', _AT_DEFAULT_MOBILE, None),
    ('group_zhaolongji', _AT_DEFAULT_MOBILE, None),
    ('group_tangpingping', _AT_DEFAULT_MOBILE, None),
    ('group_null', _AT_NOBODY, None),
)


def _read_group_entries(path):
    """Return the stripped, non-blank lines of the group file at *path*.

    Blank lines are dropped because an empty string is a substring of every
    service name and would therefore match every alarm.
    """
    with open(path) as handle:
        stripped = (line.strip() for line in handle)
        return [entry for entry in stripped if entry]


def _build_dingtalk_text(msg, at_mobiles):
    """Build the DingTalk ``text`` message payload for one alarm."""
    return {
        "msgtype": "text",
        "text": {
            "content": _ALERT_TEMPLATE.format(**msg)
        },
        "at": {
            "atMobiles": list(at_mobiles),
            "atUserIds": [
                "manager5345"
            ],
            "isAtAll": False
        }
    }


def _signed_dingtalk_url(base_url, secret):
    """Return *base_url* with DingTalk ``timestamp`` and ``sign`` parameters.

    The signature is the URL-quoted base64 of HMAC-SHA256 over
    "<timestamp>\\n<secret>", keyed with the secret.
    """
    timestamp = int(round(time.time() * 1000))
    string_to_sign = '{}\n{}'.format(timestamp, secret)
    digest = hmac.new(
        secret.encode('utf-8'),
        string_to_sign.encode('utf-8'),
        digestmod=hashlib.sha256,
    ).digest()
    sign = urllib.quote_plus(base64.b64encode(digest))
    return '{}&timestamp={}&sign={}'.format(base_url, timestamp, sign)


def _post_alarm(url, payload, headers):
    """POST *payload* as JSON to *url*; log a failure instead of raising.

    A failed delivery must not prevent the remaining alarms in the same
    batch from being forwarded.
    """
    try:
        requests.post(url, data=json.dumps(payload), headers=headers,
                      timeout=_ALERT_HTTP_TIMEOUT)
    except requests.RequestException:
        # The URL is deliberately not logged: it carries the access token.
        app.logger.exception("DingTalk webhook delivery failed")


@app.route("/send_alter", methods=["POST"])
def send_alter():
    """Forward alarms from the POSTed JSON list to DingTalk.

    Each alarm is a dict with the keys ``scope``, ``name``, ``ruleName``,
    ``alarmMessage`` and ``startTime`` (divided by 1000 before formatting,
    i.e. treated as epoch milliseconds). For every group-file entry that is
    a substring of the alarm's ``name``, one message is posted to the shared
    webhook; matches in group_zhulong are also posted to the signed webhook.

    Returns:
        The literal response body "成功".

    Raises:
        IOError: if a group file cannot be opened.
        KeyError: if an alarm lacks one of the expected keys.
    """
    datas = request.json or []

    # Group files are read on every request, so edits to them apply without
    # restarting the service.
    groups = [
        (at_mobiles, signed_hook,
         _read_group_entries('{}/{}'.format(_ALERT_GROUP_DIR, file_name)))
        for file_name, at_mobiles, signed_hook in _ALERT_GROUPS
    ]

    for data in datas:
        msg = {
            "scope": data['scope'],
            "name": data['name'],
            "rule_name": data['ruleName'],
            "alarm_message": data['alarmMessage'],
            "start_time": time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.localtime(int(data['startTime']) / 1000)),
        }
        for at_mobiles, signed_hook, entries in groups:
            for entry in entries:
                if entry not in data['name']:
                    continue
                _post_alarm(_ALERT_WEBHOOK,
                            _build_dingtalk_text(msg, at_mobiles),
                            _ALERT_HEADERS)
                if signed_hook is not None:
                    hook_url, hook_secret = signed_hook
                    # The signed copy @-mentions no mobile numbers.
                    _post_alarm(_signed_dingtalk_url(hook_url, hook_secret),
                                _build_dingtalk_text(msg, _AT_NOBODY),
                                _ZHULONG_HEADERS)
    return "成功"
if __name__ == "__main__":
    # Listen on all interfaces, port 5000. Debug mode stays off: with
    # debug=True the Werkzeug interactive debugger would let anyone who can
    # reach this port execute arbitrary Python code.
    app.run("0.0.0.0", 5000, debug=False)
# !/usr/bin/env python
# _*_ coding: utf-8 _*_
#coding=utf-8
import requests
import time

# Manual smoke test: POST one sample alarm to a running /send_alter endpoint
# and print whatever the service answers.
url = 'http://10.60.4.57:5000/send_alter'
d = [{
    "scopeId": 1,
    "scope": "SERVICE",
    "name": "gzlles-dcb-app.affairs-service",
    "id0": 12,
    "id1": 0,
    "ruleName": "service_resp_time_rule",
    "alarmMessage": "alarmMessage xxxxxxxxxxx 测试一下",
    # The receiver divides startTime by 1000 before formatting it, so the
    # value must be epoch milliseconds; use "now" so the alarm shows a
    # current time.
    "startTime": int(time.time() * 1000)
}]
# Bounded wait so the script cannot hang if the service is unreachable.
r = requests.post(url, json=d, timeout=10)
print(r.text)
项目分组目录:10.10.10.10:/data/skywalking-alter/project
命名规则为group_项目负责人姓名拼音,文件内容为服务名称 . 命名空间,all_group文件为项目总集合(注意:all_group与各项目分组内容不能重复,否则钉钉告警会重复)