官方下载地址:https://prometheus.io/download
帮助文档:https://github.com/prometheus/alertmanager
1、部署Alertmanager告警组件
[root@localhost prometheus]# tar zxvf alertmanager-0.16.0.linux-amd64.tar.gz
[root@localhost prometheus]# mv alertmanager-0.16.0.linux-amd64 /usr/local/alertmanager
[root@localhost prometheus]# cd /usr/local/alertmanager
[root@localhost alertmanager]# ls
alertmanager alertmanager.yml amtool LICENSE NOTICE
[root@localhost alertmanager]# vim alertmanager.yml # 查看配置文件
global: # 全局配置
resolve_timeout: 5m # 告警恢复超时时间(超过该时间未再收到该告警,则视为已恢复)
smtp_smarthost: 'smtp.163.com:25' # 邮箱服务器地址(邮件告警时需要使用,比如这个是163的邮箱服务器)
smtp_from: 'prod@163.com' # 发送告警的邮箱
smtp_auth_username: 'prod@163.com' # 邮箱名称
smtp_auth_password: 'abc123' # 邮箱密码
smtp_require_tls: false # 是否使用tls
route: # 发送告警的规则
group_by: ['alertname'] # 采用哪个标签作为分组依据
group_wait: 10s # 分组等待时间(等待10s发送一次,将10s内的告警一次发送出去)
group_interval: 10s # 告警分组之间发送告警的间隔时间
repeat_interval: 1h # 重复告警的时间(发送的频率)
receiver: 'web.hook' # 指定接收者
receivers: # 定义接收者
- name: 'web.hook' # 接收者名称
webhook_configs: # 定义webhook接收告警的方式(如钉钉)
- url: 'http://127.0.0.1:5001/'
- name: 'mail'
email_configs: # 定义email的方式接收告警通知
- to: 'zhangshan369@163.com'
inhibit_rules: # 抑制,用于告警收敛
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
[root@localhost alertmanager]# ./amtool check-config alertmanager.yml # 检查配置文件
Checking 'alertmanager.yml' SUCCESS # 配置无误
Found:
- global config
- route
- 1 inhibit rules
- 1 receivers
- 0 templates
[root@localhost alertmanager]# ./alertmanager --config.file=alertmanager.yml # 指定配置文件启动alertmanager
2、配置Prometheus与Alertmanager通信
[root@localhost alertmanager]# cd ../prometheus/
[root@localhost prometheus]# pwd
/usr/local/prometheus
[root@localhost prometheus]# vim prometheus.yml
alerting:
alertmanagers:
- static_configs:
- targets: # Alertmanager实例的地址
- 192.168.10.20:9093
rule_files: # 告警规则文件
- "rules/*.yml"
3、在prometheus中创建告警规则
规则的创建如下示例
官方地址:https://prometheus.io/docs/prometheus/latest/configuration/alerting_rules/
示例参数解释:
groups: # 进行告警分组
- name: example # 分组名称
rules: # 告警规则
# Alert for any instance that is unreachable for >5 minutes.
# 任何实例超过5分钟不可达,就发出告警
- alert: InstanceDown # 告警规则名称
expr: up == 0 # 表达式(up=0表示down)
for: 5m # 当前告警持续时间(5分钟之内连续这个状态就告警)
labels: # 定义当前告警规则的级别
severity: page # 如error
annotations: # 告警信息描述
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
# Alert for any instance that has a median request latency >1s.
- alert: APIHighRequestLatency
expr: api_http_request_latencies_second{quantile="0.5"} > 1
for: 10m
annotations:
summary: "High request latency on {{ $labels.instance }}"
description: "{{ $labels.instance }} has a median request latency above 1s (current value: {{ $value }}s)"
编写告警
[root@localhost prometheus]# pwd
/usr/local/prometheus
[root@localhost prometheus]# vim rules/alert.yml
groups:
- name: example
rules:
# alert for any instance that is unreachable for >5 minutes
- alert: InstanceDown
expr: up == 0
for: 1m
labels:
severity: error
annotations:
summary: "Instance {{ $labels.instance }} down"
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 5 minutes."
配置Alertmanager给systemd管理
[root@localhost prometheus]# vim /usr/lib/systemd/system/alertmanager.service
[Unit]
Description=Alertmanager.service
[Service]
ExecStart=/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml
Restart=on-failure
[Install]
WantedBy=multi-user.target
[root@localhost prometheus]# systemctl daemon-reload
[root@localhost prometheus]# systemctl start alertmanager
[root@localhost prometheus]# ps -ef |grep alertmanager
root 101264 1 0 11:27 ? 00:00:00 /usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml
root 102445 91138 0 11:29 pts/3 00:00:00 grep --color=auto alertmanager
[root@localhost prometheus]#
访问prometheus,可以看到告警
4、告警通知配置
邮箱告警直接在alertmanager.yml中配置即可,钉钉告警需要下载插件进行格式化
官方文档:https://github.com/timonwong/prometheus-webhook-dingtalk
下载地址:https://github.com/timonwong/prometheus-webhook-dingtalk/releases
如:wget 'https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v1.4.0/prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz'
获取钉钉机器人的webhook
- 将这里的webhook地址复制,后面在告警配置中会用到
- 钉钉机器人的安全设置会过滤掉一些垃圾信息,有多种安全机制:设置"自定义关键词"的方式,机器人只会转发包含关键词的信息;设置"加签"的方式,需要在下面的配置文件中添加认证信息(加签密钥),消息才会被接收。
- 推荐使用"加签"的方式,将加签的秘钥复制,后面需要写入到配置文件中
下载webhook-dingtalk二进制包并安装
[root@localhost prometheus]# wget 'https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v1.4.0/prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz'
[root@localhost prometheus]# tar zxvf prometheus-webhook-dingtalk-1.4.0.linux-amd64.tar.gz
[root@localhost prometheus]# mv prometheus-webhook-dingtalk-1.4.0.linux-amd64 /usr/local/prometheus-webhook-dingtalk
[root@localhost prometheus]# cd /usr/local/prometheus-webhook-dingtalk
[root@localhost prometheus-webhook-dingtalk]# ls
config.example.yml contrib LICENSE prometheus-webhook-dingtalk
修改config.yml配置
[root@localhost prometheus-webhook-dingtalk]# cp -p config.example.yml config.yml
[root@localhost prometheus-webhook-dingtalk]# vim config.yml
targets:
webhook1:
# url: 这里将从钉钉机器人那复制过来的webhook地址粘贴
url: https://oapi.dingtalk.com/robot/send?access_token=XXXX
# secret for signature 在secret: 这里粘贴钉钉机器人的加签密钥
secret: SEC7071XXXXX
配置dingtalk给systemd管理
[root@localhost prometheus-webhook-dingtalk]# vim /usr/lib/systemd/system/dingtalk.service
[Unit]
Description=dingtalk.service
[Service]
ExecStart=/usr/local/prometheus-webhook-dingtalk/prometheus-webhook-dingtalk \
# 设置监听端口,默认是8060
--web.listen-address=:9160 \
--config.file=/usr/local/prometheus-webhook-dingtalk/config.yml
Restart=on-failure
[Install]
WantedBy=multi-user.target
[root@localhost prometheus-webhook-dingtalk]# systemctl start dingtalk
配置alertmanager.yml
[root@localhost alertmanager]# vim alertmanager.yml # 查看配置文件
global: # 全局配置
resolve_timeout: 5m # 告警恢复超时时间(超过该时间未再收到该告警,则视为已恢复)
route: # 发送告警的规则
group_by: ['alertname'] # 采用哪个标签作为分组依据
group_wait: 10s # 分组等待时间(等待10s发送一次,将10s内的告警一次发送出去)
group_interval: 10s # 告警分组之间发送告警的间隔时间
repeat_interval: 1h # 重复告警的时间(发送的频率)
receiver: 'web.hook' # 指定接收者
receivers: # 定义接收者
- name: 'web.hook' # 接收者名称
webhook_configs: # 定义webhook接收告警的方式(如钉钉)
- url: 'http://localhost:9160/dingtalk/webhook1/send'
# 这里的 url: 'http://prometheus-webhook-dingtalk服务器的ip地址:端口/dingtalk/config.yml中targets下的名称(如webhook1)/send'
inhibit_rules: # 抑制,用于告警收敛
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
[root@localhost alertmanager]# systemctl restart alertmanager # 重启服务