一 安装AlertManager
1 下载
这里是直接从官网下载的
wget https://github.com/prometheus/alertmanager/releases/download/v0.24.0/alertmanager-0.24.0.linux-amd64.tar.gz
2 安装并创建alertmanager用户
tar -xf alertmanager-0.24.0.linux-amd64.tar.gz -C /usr/local/
useradd -s /sbin/nologin -M alertmanager
3 配置开机自启
cat /usr/lib/systemd/system/alertmanager.service
# systemd unit file: /usr/lib/systemd/system/alertmanager.service
[Unit]
Description=alertmanager
Documentation=https://github.com/prometheus/alertmanager
# start only after the network stack is up
After=network.target
[Service]
# simple: the ExecStart process itself is the main service process
Type=simple
# run as the dedicated nologin user created in step 2
User=alertmanager
ExecStart=/usr/local/alertmanager/alertmanager --storage.path=/usr/local/alertmanager/data --config.file=/usr/local/alertmanager/alertmanager.yml
# restart automatically if the process exits with an error
Restart=on-failure
[Install]
# start at the default multi-user (non-graphical) boot target
WantedBy=multi-user.target
4 配置alertmanager
注:repeat_interval 这个参数生产环境可以按实际情况设置时间久点,这样可以避免同样的未处理告警一直重复发出
cat /usr/local/alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m  # re-check every 5 minutes whether an alert has recovered
  wechat_api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'  # enterprise WeChat API base URL; do not change
templates:  # notification template files
  - './template/*.tmpl'
route:  # alert dispatch policy
  group_by: ['alertname']  # alerts sharing these labels are grouped into one notification
  group_wait: 10s  # after the first alert, wait 10s so same-group alerts go out together
  group_interval: 10s  # minimum interval between two batches of the same group
  repeat_interval: 1m  # resend interval for still-firing alerts; 1m is for testing only — raise it in production
  receiver: 'wechat'  # default receiver
receivers:
  - name: 'wechat'
    wechat_configs:
      - send_resolved: true  # also send a notification when the alert resolves
        agent_id: ''  # agentId of the self-built WeChat app
        to_party: ''  # ID of the department that receives alerts (to notify individual members, use to_user instead)
        api_secret: ''  # secret of the self-built app
        corp_id: ''  # enterprise (corp) ID
agent_id、api_secret 可在企业微信管理后台的自建应用中获取,corp_id 可在“我的企业 - 企业信息”中获取。
5 prometheus.yml 中添加alertmanager地址
# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - "localhost:9093"  # Alertmanager listen address

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  - "rules/*_alert.yml"
  # - "second_rules.yml"

# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: "alertmanager"
    static_configs:
      - targets: ['localhost:9093']
6 配置告警模板
cat ./template/wechat.tmpl
{{ define "wechat.default.message" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- /* iterate firing alerts only; the original ranged over .Alerts, so index 0 could be a resolved alert rendered in the firing section */ -}}
{{- /* StartsAt/EndsAt are UTC; .Add 28800e9 adds 8h in nanoseconds to display CST (UTC+8) */ -}}
{{- range $index, $alert := .Alerts.Firing -}}
{{- if eq $index 0 }}
=========监控报警 =========
告警状态:{{ .Status }}
告警级别:{{ .Labels.severity }}
告警类型:{{ $alert.Labels.alertname }}
故障主机: {{ $alert.Labels.instance }} {{ $alert.Labels.pod }}
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description }};
触发阀值:{{ .Annotations.value }}
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
========= = end = =========
{{- end }}
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- /* same fix for the resolved section: range .Alerts.Resolved, not all of .Alerts */ -}}
{{- range $index, $alert := .Alerts.Resolved -}}
{{- if eq $index 0 }}
=========异常恢复 =========
告警类型:{{ .Labels.alertname }}
告警状态:{{ .Status }}
告警主题: {{ $alert.Annotations.summary }}
告警详情: {{ $alert.Annotations.message }}{{ $alert.Annotations.description }};
故障时间: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
恢复时间: {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{- if gt (len $alert.Labels.instance) 0 }}
实例信息: {{ $alert.Labels.instance }}
{{- end }}
========= = end = =========
{{- end }}
{{- end }}
{{- end }}
{{- end }}
7 配置告警规则
cat rules/host_alert.yml
groups:
  - name: node-alert
    rules:
      # Target scraped by job "nodes" has been down for 30s.
      - alert: NodeDown
        expr: up{job="nodes"} == 0
        for: 30s
        labels:
          # was `status: critical`; renamed so the wechat template's {{ .Labels.severity }} resolves
          severity: critical
        annotations:
          # was `{{.instance}}`, which does not resolve in annotation templates — must be $labels.instance
          summary: "{{ $labels.job }} {{ $labels.instance }}:服务器宕机"
          description: "{{ $labels.job }} {{ $labels.instance }}:服务器延时超过30s"
          value: "{{ $value }}"
      # Average CPU busy percentage above 80% for 5 minutes.
      - alert: NodeCpuHigh
        expr: 100-(avg(irate(node_cpu_seconds_total{mode="idle"}[5m])) by(instance)* 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{$labels.instance}}: High CPU Usage Detected"
          description: "{{ $labels.job }} {{$labels.instance}}: CPU usage is {{$value}}, above 80%"
          value: "{{ $value }}"
      # ext4/xfs filesystem usage above 80% for 10 minutes.
      - alert: NodeFilesystemUsage
        expr: 100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100) > 80
        for: 10m
        labels:
          severity: warning
        annotations:
          summary: "Instance {{ $labels.job }} {{ $labels.instance }} : {{ $labels.mountpoint }} 分区使用率过高"
          description: "{{ $labels.job }} {{ $labels.instance }}: {{ $labels.mountpoint }} 分区使用大于80% (当前值: {{ $value }})"
      # Memory usage (total minus available) above 90% for 5 minutes.
      - alert: NodeMemoryHigh
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.job }} {{ $labels.instance}} 内存使用率过高!"
          description: "{{ $labels.job }} {{ $labels.instance }} 内存使用大于90%(目前使用:{{ $value}}%)"
      # Disk IO busy time above 60% for 1 minute.
      - alert: NodeIO
        expr: (avg(irate(node_disk_io_time_seconds_total[1m])) by(instance)* 100) > 60
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.job }} {{$labels.instance}} 流入磁盘IO使用率过高!"
          description: "{{ $labels.job }} {{ $labels.instance }} 流入磁盘IO大于60%(目前使用:{{ $value }})"
      # Inbound network bandwidth sustained above the threshold for 2 minutes.
      # Regex fix: PromQL !~ is fully anchored, so the original 'virbr*|lo*|ens*'
      # never matched numbered interfaces such as virbr0 or ens33.
      # NOTE(review): threshold math (rate / 100 > 102400 → ~10 MB/s) does not
      # obviously match the "100M" stated in the description — confirm intent.
      - alert: Network
        expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*|ens.*'}[5m])) by (instance)) / 100) > 102400
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "{{ $labels.job }} {{ $labels.instance}} 流入网络带宽过高!"
          description: "{{ $labels.job }} {{ $labels.instance }}流入网络带宽持续2分钟高于100M. RX带宽使用率{{ $value }}"
8 配置完成后使用命令检查配置文件是否正确
./promtool check config prometheus.yml
Checking prometheus.yml
SUCCESS: 3 rule files found
Checking rules/blackbox_exporter_alert.yml
SUCCESS: 1 rules found
Checking rules/check_ssl_alert.yml
SUCCESS: 1 rules found
Checking rules/host_alert.yml
SUCCESS: 6 rules found
./amtool check-config alertmanager.yml
Checking 'alertmanager.yml' SUCCESS
Found:
- global config
- route
- 0 inhibit rules
- 1 receivers
- 1 templates
SUCCESS
9 重启alertmanager和prometheus生效
systemctl restart alertmanager
systemctl restart prometheus
10 最后需要在自建应用中设置企业可信IP,IP为alertmanager的IP。不然不会收到告警信息