Alertmanager安装
# Download page: https://prometheus.io/download/
# Change into /opt first so the tarball is downloaded where it is unpacked.
cd /opt/
wget https://github.com/prometheus/alertmanager/releases/download/v0.23.0/alertmanager-0.23.0.linux-amd64.tar.gz
tar -xf alertmanager-0.23.0.linux-amd64.tar.gz
# Version-independent path; the systemd unit and config paths use /opt/alertmanager.
ln -s alertmanager-0.23.0.linux-amd64 alertmanager
# systemctl配置
vi /usr/lib/systemd/system/alertmanager.service
# systemd unit that runs Alertmanager from /opt/alertmanager.
[Unit]
Description=alertmanager
# Order the service after basic network setup.
After=network.target

[Service]
# --storage.path is set explicitly: the default "data/" is relative to the
# working directory, which for a system service is "/".
ExecStart=/opt/alertmanager/alertmanager --config.file=/opt/alertmanager/alertmanager.yml --storage.path=/opt/alertmanager/data
# SIGHUP asks the running process to reload its configuration file.
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
Restart=on-failure

[Install]
WantedBy=multi-user.target
# Make systemd re-read unit files so the new alertmanager.service is known.
systemctl daemon-reload
# Enable the service at boot and start it immediately.
systemctl enable --now alertmanager
在prometheus中配置alertmanager地址,使其接收告警数据
vi /opt/prometheus-2.32.0-rc.0.linux-amd64/prometheus.yml
global:
  scrape_interval: 15s  # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration: the address Prometheus sends alerts to.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 192.168.112.26:9093

# Alert rule files; the path is relative to the Prometheus install directory
# (where this prometheus.yml lives).
rule_files:
  - "rules/*.yml"
  # - "second_rules.yml"
vi /opt/prometheus-2.32.0-rc.0.linux-amd64/rules/test.yml
groups:
  - name: example  # rule group name
    rules:
      # Fires when a scrape target cannot be reached (up == 0).
      # NOTE(review): `for` is 10s, while the description text below says
      # "5 minutes"; presumably shortened for the demo. Use `for: 5m` if the
      # message is meant literally.
      - alert: InstanceDown  # alert rule name
        expr: up == 0  # trigger condition, written in PromQL
        for: 10s  # how long the condition must hold before the alert fires
        labels:  # custom labels, e.g. to mark the alert severity
          severity: page
        annotations:  # additional information attached to the alert
          summary: " {{ $labels.instance }} 停止工作"
          description: "{{ $labels.instance }}:job {{ $labels.job }} 已经停止5分钟以上."
当up == 0时,表示目标机器已宕机或与Prometheus服务器断联(无法被抓取)
配置完后
下图是配置完alertmanager后收到的报警邮件,提前贴出来是为了更好地理解上述参数
配置alertmanager
vi /opt/alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m
  # SMTP server used for e-mail notifications
  smtp_smarthost: 'smtp.163.com:25'
  smtp_from: 'xyhlinux@163.com'
  smtp_auth_username: 'xyhlinux@163.com'
  # NOTE(review): credential stored in plain text. Rotate it and keep the
  # real value out of version control / published documents.
  smtp_auth_password: 'YTPQOGPPIWMLFIWQ'
  # NOTE(review): TLS is not required here, so the credential may travel
  # unencrypted. Confirm this is acceptable.
  smtp_require_tls: false

# Root route: the default for every alert that no child route matches.
route:
  receiver: 'receivers_mail'
  group_by: ['alertname']  # group alerts by the alertname label (the alert rule name)
  group_wait: 10s  # wait before the first notification of a new group; alerts arriving within 10s are merged
  group_interval: 10s  # wait before notifying about new alerts added to an existing group
  repeat_interval: 1h  # wait before re-sending a notification for alerts that are still firing
  # Child routes
  routes:
    - match:
        # Alerts whose labels do not match the key/value below fall back to
        # the root receiver (xnux@163.com); alerts that match are mailed to
        # x403@126.com. match_re can be used instead for regex matching.
        severity: page
      receiver: test

# Receivers
receivers:
  - name: 'receivers_mail'
    email_configs:
      - to: 'xnux@163.com'
  - name: 'test'
    email_configs:
      - to: 'x403@126.com'
详细配置解释请参考:https://blog.csdn.net/weixin_47677347/article/details/121879065
注意:
- 告警路由发送策略:优先匹配子路由,如果子路由匹配不到,就会匹配根路由(也可以说是默认路由)
- 根路由不可以配置match选项
- 子路由继承根路由一切属性,可以重写
# Restart Prometheus and Alertmanager so both load the new configuration
systemctl restart prometheus
systemctl restart alertmanager
抑制规则—防止告警消息泛滥
alertmanager.yml文件
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.163.com:25'
  smtp_from: 'xyhlinux@163.com'
  smtp_auth_username: 'xyhlinux@163.com'
  # NOTE(review): credential stored in plain text. Rotate it and keep the
  # real value out of version control / published documents.
  smtp_auth_password: 'YTPQOGPPIWMLFIWQ'
  smtp_require_tls: false

route:
  receiver: 'receivers_mail'
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 10h
  routes:
    - match:
        severity: ERROR
      receiver: test

# Inhibition rules
inhibit_rules:
  # While an alert matching source_matchers is firing, any other alert that
  #   1. matches target_matchers (here severity="WARNING"), and
  #   2. carries every label listed in `equal` with the same value as the
  #      source alert
  # is not sent.
  # Typically used for severity levels: if a host already raised an ERROR
  # alert, lower-severity alerts from the same instance need not be sent.
  - source_matchers:
      - severity="ERROR"
    target_matchers:
      - severity="WARNING"
    equal: ['instance']

receivers:
  - name: 'receivers_mail'
    email_configs:
      - to: 'xnux@163.com'
  - name: 'test'
    email_configs:
      - to: 'x403@126.com'
配置prometheus告警策略
vi /opt/prometheus-2.32.0-rc.0.linux-amd64/rules/test.yml
groups:
  - name: example  # rule group name
    rules:
      # Source (ERROR) alert for the inhibition demo: free-space percentage
      # of /dev/sda1 (xfs).
      # NOTE(review): both rules in this file are named InstanceDown although
      # neither checks `up`; consider more descriptive names.
      - alert: InstanceDown  # alert rule name
        # The threshold is deliberately unrealistic; it only exists to make
        # the alert fire for the demonstration.
        expr: (node_filesystem_free_bytes{device="/dev/sda1", fstype="xfs"}) / (node_filesystem_size_bytes{device="/dev/sda1", fstype="xfs"}) * 100 < 100
        for: 10s  # the condition must hold continuously for this long before the alert is sent
        labels:  # custom labels
          severity: "ERROR"
        annotations:  # additional information attached to the alert
          summary: " {{ $labels.instance }} 磁盘满了"
          description: "{{ $labels.instance }}:job {{ $labels.job }} 磁盘空间不足"
  - name: inhibit_rules
    rules:
      # Target (WARNING) alert for the inhibition demo: free memory below 10 %.
      - alert: InstanceDown
        expr: (node_memory_MemFree_bytes/1024/1024/1024) / (node_memory_MemTotal_bytes/1024/1024/1024) * 100 < 10
        for: 10s
        labels:
          severity: "WARNING"
        annotations:
          summary: " {{ $labels.instance }} 内存不足"
          description: "{{ $labels.instance }}:job {{ $labels.job }} 已经资源不足了"
重启prometheus、alertmanager
# Reload Prometheus via SIGHUP; look the PID up instead of hard-coding it.
kill -HUP "$(pidof prometheus)"
# The service is named alertmanager (not "altermanager").
systemctl restart alertmanager
配置完后
未配置抑制配置的邮件接收情况
配置抑制告警
global:
  resolve_timeout: 5m
  # SMTP server used for e-mail notifications
  smtp_smarthost: 'smtp.163.com:25'
  smtp_from: 'xyhlinux@163.com'
  smtp_auth_username: 'xyhlinux@163.com'
  # NOTE(review): credential stored in plain text. Rotate it and keep the
  # real value out of version control / published documents.
  smtp_auth_password: 'YTPQOGPPIWMLFIWQ'
  smtp_require_tls: false

# Routing tree
route:
  receiver: 'receivers_mail'
  group_by: ['alertname']  # group alerts by the alertname label (the alert rule name)
  group_wait: 10s  # wait before the first notification of a new group; alerts arriving within 10s are merged
  group_interval: 10s  # wait before notifying about new alerts added to an existing group
  # NOTE(review): 10s re-sends firing alerts very frequently; presumably
  # chosen only for the demonstration.
  repeat_interval: 10s  # wait before re-sending a notification for alerts that are still firing
  routes:
    # Regex match
    - match_re:
        severity: ERROR|WARNING
      receiver: test

# While an ERROR alert is firing, WARNING alerts with the same `instance`
# label value are not sent.
inhibit_rules:
  - source_matchers:
      - severity="ERROR"
    target_matchers:
      - severity="WARNING"
    equal: ['instance']

# Receivers
receivers:
  - name: 'receivers_mail'
    email_configs:
      - to: 'xyhlinux@163.com'
  - name: 'test'
    email_configs:
      - to: 'xyh403@126.com'