一、alertmanager
部署alertmanager
从 Prometheus 官网下载页面获取二进制文件:https://prometheus.io/download/
下载并安装 alertmanager
# Fetch the Alertmanager v0.26.0 linux-amd64 release tarball from GitHub.
wget 'https://github.com/prometheus/alertmanager/releases/download/v0.26.0/alertmanager-0.26.0.linux-amd64.tar.gz'
# Unpack the gzip-compressed archive verbosely into /usr/local.
tar --extract --gzip --verbose --file=alertmanager-0.26.0.linux-amd64.tar.gz --directory=/usr/local
更改文件名称,并设置systemctl管理alertmanager
# Rename the unpacked directory to /usr/local/alertmanager.
# `&&` runs mv only after cd succeeded; a single `&` would put cd in the
# background and run mv in whatever directory the shell is currently in.
cd /usr/local/ && mv alertmanager-0.26.0.linux-amd64 alertmanager
# Create the systemd unit file that manages Alertmanager.
vim /usr/lib/systemd/system/alertmanager.service
# systemd unit that runs Alertmanager as a foreground (Type=simple) service.
# Comments are kept on their own lines: systemd has no trailing-comment
# syntax, so anything written after a value is read as part of that value.
[Unit]
Description=Alertmanager
Documentation=https://github.com/prometheus/alertmanager/releases/
After=network.target

[Service]
Type=simple
# Alertmanager working directory.
WorkingDirectory=/usr/local/alertmanager/
# Alertmanager binary to start.
ExecStart=/usr/local/alertmanager/alertmanager
# Send SIGHUP to the main process on `systemctl reload`.
ExecReload=/bin/kill -HUP $MAINPID
# Stop with SIGTERM so the process can shut down cleanly; the original
# SIGKILL cannot be handled by the process.
ExecStop=/bin/kill -TERM $MAINPID
KillMode=control-group
# Restart automatically after a failure, waiting 15 seconds first.
Restart=on-failure
RestartSec=15s

[Install]
WantedBy=multi-user.target
# Edit the Alertmanager configuration file. The absolute path is used because
# the previous step leaves the shell in /usr/local, not in the Alertmanager
# directory, and the file name is alertmanager.yml (not "alertmanger.yml").
vim /usr/local/alertmanager/alertmanager.yml
# Alertmanager configuration: SMTP settings, custom templates, routing tree,
# e-mail receivers and inhibition rules.
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.163.com:25'
  # Mailbox that sends the alert e-mails.
  smtp_from: 'xxxx'
  # SMTP login name (the sending mailbox).
  smtp_auth_username: 'xxxx'
  # SMTP authorization code of the sending mailbox.
  smtp_auth_password: 'xxxx'
  # NOTE(review): TLS is not required here; confirm this is acceptable for
  # sending credentials to the mail server.
  smtp_require_tls: false

# Custom notification template files.
templates:
  - '/usr/local/alertmanager/template/email.tmpl'

route:
  # Default receiver for alerts that match no child route.
  receiver: mail1
  group_by: ['alertname']
  # How long to wait before sending the first notification for a new group.
  group_wait: 1m
  # Interval between notifications for successive batches of a group.
  group_interval: 2m
  # Interval after which an unchanged notification is sent again.
  repeat_interval: 1h
  routes:
    # Alerts whose labels match the regexes below (labels are set in the
    # Prometheus rule files) go to mail2 instead of the default receiver.
    - receiver: mail2
      match_re:
        db: sql
      # Repeat interval for this child route.
      repeat_interval: 1h

receivers:
  - name: mail1
    email_configs:
      - send_resolved: true
        to: xxxx
        # Render the body with the custom template from email.tmpl. The
        # template produces plain text, so the default HTML body is disabled.
        html: ''
        text: '{{ template "wechat.default.message" . }}'
  - name: mail2
    email_configs:
      - send_resolved: true
        to: xxxx
        html: ''
        text: '{{ template "wechat.default.message" . }}'

# Inhibition: while a Disaster alert is firing, suppress warning alerts that
# share the same `instance` label value.
inhibit_rules:
  - source_match:
      severity: Disaster
    target_match:
      severity: warning
    # The original listed `alertmanager`, a label no alert carries. Labels
    # missing on both sides compare equal, so a single Disaster alert would
    # have suppressed every warning on every host.
    equal:
      - instance
# Alert notification template
cat /usr/local/alertmanager/template/email.tmpl
{{/*
  Plain-text notification body.
  Renders one section per firing alert, then one section per resolved alert.
  Each loop iterates only over alerts in the matching state, so a firing
  section never shows a resolved alert (or the reverse) and no alert in the
  group is dropped.
  Timestamps have 28800e9 ns (8 hours) added before formatting.
*/}}
{{ define "wechat.default.message" }}
{{- /* Firing alerts */ -}}
{{- range $alert := .Alerts.Firing }}
=========xxx环境监控报警 =========
告警状态:{{ $alert.Status }}
告警级别:{{ $alert.Labels.severity }}
告警类型:{{ $alert.Labels.alertname }}
故障主机: {{ $alert.Labels.instance }} {{ $alert.Labels.pod }}
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
触发阀值:{{ $alert.Annotations.value }}
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
========= = end = =========
{{- end }}
{{- /* Resolved alerts */ -}}
{{- range $alert := .Alerts.Resolved }}
=========xxx环境异常恢复 =========
告警类型:{{ $alert.Labels.alertname }}
告警状态:{{ $alert.Status }}
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description}};
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
恢复时间: {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{- /* Print the instance line only when the label is non-empty. */ -}}
{{- if gt (len $alert.Labels.instance) 0 }}
实例信息: {{ $alert.Labels.instance }}
{{- end }}
========= = end = =========
{{- end }}
{{- end }}
# Make systemd re-read its unit files, then start the Alertmanager service.
systemctl daemon-reload
systemctl start alertmanager.service
二、prometheus
# 在prometheus中配置
prometheus.yml
# Alertmanager instance(s) that Prometheus sends alerts to.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - '192.168.178.129:9093'

# Load rules once and periodically evaluate them according to the global
# 'evaluation_interval'. The entries are glob patterns for the alerting rule
# files.
rule_files:
  - 'rules/*_rules.yml'
  - 'rules/*_alerts.yml'
rules/node_alerts.yml # 其中for是持续时间 expr是判断语句
node_alerts.yml
# Prometheus alerting rules for host availability, memory, CPU and disk.
# `expr` is the alert condition; `for` is how long it must hold before the
# alert fires. Annotations refer to the host via `$labels.instance` because
# `alertname` is not part of `$labels` when annotations are rendered.
groups:
  - name: 主机状态-监控告警
    rules:
      - alert: 主机状态
        expr: up == 0
        for: 1m
        labels:
          # Uses `severity`, the label read by routing, inhibition and the
          # notification template (the original used `status`).
          severity: Disaster
        annotations:
          summary: "{{$labels.instance}}:服务器宕机"
          # Text now matches the rule: target down for 1 minute.
          description: "{{$labels.instance}}:服务器宕机超过1分钟"

  - name: 实例存活告警规则
    rules:
      - alert: 实例存活告警
        expr: up{job="prometheus"} == 0 or up{job="Linux-host"} == 0
        for: 1m
        labels:
          user: prometheus
          severity: Disaster
        annotations:
          summary: "Instance {{ $labels.instance }} is down"
          description: "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."
          value: "{{ $value }}"

  - name: 内存告警规则
    rules:
      - alert: 内存使用率告警
        # Used memory (total minus free, buffers and cache) as a percentage
        # of total memory.
        expr: >-
          (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes
          + node_memory_Buffers_bytes + node_memory_Cached_bytes))
          / node_memory_MemTotal_bytes * 100 > 80
        for: 1m
        labels:
          user: prometheus
          severity: warning
          # Matched by the `db: sql` child route in Alertmanager.
          db: sql
        annotations:
          summary: "服务器: {{ $labels.instance }} 内存报警"
          # Threshold text aligned with the expression (80%).
          description: "{{ $labels.instance }} 内存资源利用率大于80%!(当前值: {{ $value }}%)"
          value: "{{ $value }}"

  - name: CPU报警规则
    rules:
      - alert: CPU使用率告警
        # 100 minus the per-instance average idle percentage over 1 minute.
        expr: >-
          100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m])))
          * 100 > 80
        for: 1m
        labels:
          user: prometheus
          severity: warning
        annotations:
          summary: "服务器: {{ $labels.instance }} CPU报警"
          # Threshold text aligned with the expression (80%).
          description: "服务器: {{ $labels.instance }} CPU使用超过80%!(当前值: {{ $value }}%)"
          value: "{{ $value }}"

  - name: 磁盘报警规则
    rules:
      - alert: 磁盘使用率告警
        # Used space as a percentage of filesystem size.
        # NOTE(review): the threshold is 40 while the original message said
        # 80%; the message was aligned with the expression. Confirm which
        # value is intended.
        expr: >-
          (node_filesystem_size_bytes - node_filesystem_avail_bytes)
          / node_filesystem_size_bytes * 100 > 40
        for: 1m
        labels:
          user: prometheus
          severity: warning
        annotations:
          summary: "服务器: {{ $labels.instance }} 磁盘报警"
          description: "服务器:{{ $labels.instance }},磁盘设备: {{ $labels.device }} 使用超过40%!(挂载点: {{ $labels.mountpoint }} 当前值: {{ $value }}%)"
          value: "{{ $value }}"
修改完 prometheus 配置后,重新加载 prometheus 配置(前提:prometheus 启动时已添加 --web.enable-lifecycle 参数以开启热加载)
# Send a POST to the local Prometheus reload endpoint.
curl --request POST http://localhost:9090/-/reload