在 alertmanager 配置文件中编写相关路由:
- job=node_exporter 的告警由 sre_system 处理(5001 端口)
- job=mysqld_exporter 的告警由 sre_dba 处理(5002 端口)
- 所有的告警都由 sre_all 处理(5003 端口)
- 重新加载 alertmanager 配置文件
# Write the Alertmanager configuration file.
# The quoted delimiter ("EOF") stops the shell from expanding anything inside
# the heredoc body; the terminating EOF must stand alone on its own line.
cat <<-"EOF" > /opt/app/alertmanager/alertmanager.yml
global:
  resolve_timeout: 30m

route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 1h
  # Default receiver for anything that matches no child route.
  receiver: 'sre_all'
  # Child routes; they inherit every attribute of the parent route.
  routes:
    # match_re performs regular-expression matching on alert labels.
    - match_re:
        job: node_exporter
      receiver: sre_system  # must exist in the receivers list below
      # continue: true keeps evaluating the following sibling routes;
      # without it, matching stops at the first route that matches.
      continue: true
    - match_re:
        job: mysqld_exporter
      receiver: sre_dba
      continue: true
    # Catch-all route: every alert is also delivered to sre_all.
    - match_re:
        job: '.*'
      receiver: sre_all
      continue: true

# Prometheus sends fired alerts to Alertmanager, which forwards them to
# the receivers defined here.
receivers:
  - name: 'sre_system'
    webhook_configs:
      - url: 'http://127.0.0.1:5001/alert'
  - name: 'sre_dba'
    webhook_configs:
      - url: 'http://127.0.0.1:5002/alert'
  - name: 'sre_all'
    webhook_configs:
      - url: 'http://127.0.0.1:5003/alert'
EOF

# Ask the running Alertmanager to reload its configuration.
curl -X POST -vvv localhost:9093/-/reload
准备 prometheus 规则文件
准备 rule 文件:
- 其中 alert_g_1 由 job=node_exporter 触发
- 其中 alert_g_2 由 job=mysqld_exporter 触发
# Write the Prometheus alerting rule file.
# The quoted delimiter ('EOF') prevents shell expansion of the body, so
# template syntax such as {{ $labels.instance }} can be added safely later.
# NOTE(review): the instance="..." selectors below must equal the instance
# label of the actually scraped targets, otherwise the rules never fire;
# verify them against the scrape targets in prometheus.yml.
cat <<'EOF' > /opt/app/prometheus/rule.yml
groups:
  - name: alert_g_1
    rules:
      - alert: node_load too high
        # Alert condition (PromQL).
        expr: 'node_memory_Active_bytes{instance="192.168.3.200:9100", job="node_exporter"} > 0'
        labels:
          severity: critical
          node_name: abc
        annotations:
          summary: 机器太累了
  - name: alert_g_2
    rules:
      - alert: mysql_qps too high
        # Alert condition (PromQL). The job label is mysqld_exporter so that
        # it matches the scrape job name and the Alertmanager route.
        expr: 'mysql_global_status_queries{instance="192.168.3.200:3306", job="mysqld_exporter"} > 0'
        labels:
          severity: warning
          node_name: abc
        annotations:
          summary: mysql太累了
EOF
修改 prometheus 主配置文件,使 rule 文件和 alertmanager 配置生效
# Write the main Prometheus configuration file, wiring in the rule file and
# the Alertmanager endpoint.
# The quoted delimiter ('EOF') prevents shell expansion of the body; the
# terminating EOF must stand alone on its own line.
cat <<'EOF' > /opt/app/prometheus/prometheus.yml
global:
  scrape_interval: 15s      # Scrape targets every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds. Default is every 1 minute.

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - '172.20.70.215:9093'

rule_files:
  - /opt/app/prometheus/rule.yml

# Every exporter to be monitored is listed here.
scrape_configs:
  - job_name: node_exporter
    honor_timestamps: true
    scrape_interval: 15s
    scrape_timeout: 10s
    # metrics_path: /metrics
    # scheme: http
    static_configs:
      - targets:
          - '172.20.70.205:9100'
  - job_name: mysqld_exporter
    honor_timestamps: true
    scrape_interval: 15s
    scrape_timeout: 10s
    # metrics_path: /metrics
    # scheme: http
    static_configs:
      - targets:
          - '172.20.70.205:9104'
EOF

# Ask the running Prometheus to reload its configuration.
# This endpoint only works when Prometheus was started with
# --web.enable-lifecycle.
curl -X POST -vvv localhost:9090/-/reload