Prometheus Alertmanager 邮件报警:
下载网址:
https://prometheus.io/download
# Unpack the Alertmanager release tarball under /space/.
tar zxfv alertmanager-0.15.2.linux-amd64.tar.gz -C /space/
# Rename the unpacked directory to /space/altermanager, the install path that
# the config-editing and start-up steps in these notes refer to.
mv /space/alertmanager-0.15.2.linux-amd64 /space/altermanager
vi /space/altermanager/altermanager.yml
# Alertmanager configuration: route every alert to a single e-mail receiver,
# sending through the 163.com SMTP server on port 25.
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.163.com:25'
  smtp_from: 'zhi.yang@163.com'
  smtp_auth_username: 'zhi.yang@163.com'
  # Placeholder value; replace with the real SMTP credential.
  smtp_auth_password: 'xxxxx'
route:
  # NOTE(review): group_by expects label names. None of the rules in these
  # notes set a label called "down", so all alerts would fall into one group.
  # Confirm this is intended ('alertname' is the usual choice).
  group_by: ['down']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'shprom'
receivers:
  - name: 'shprom'
    # email_configs is a list; each entry needs at least a "to" address.
    email_configs:
      - to: 'zhi.yang@ming.com'
:wq
注:
465端口方式:
# Variant of the "global" section for sending through SMTP port 465.
global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.163.com:465'
  smtp_from: 'zhi.yang@163.com'
  smtp_auth_username: 'zhi.yang@163.com'
  # Use the mailbox's authorization code here, not the account login password.
  smtp_auth_password: 'xxxx'
  smtp_require_tls: false
# Start Alertmanager with the config file edited earlier in these notes.
# The executable shipped in the release tarball is named "alertmanager";
# only the install directory and the config file use the "altermanager" spelling.
/space/altermanager/alertmanager --config.file=/space/altermanager/altermanager.yml
# Create the directory that will hold the Prometheus rule files
# (-p: do not fail if it already exists).
mkdir -p /space/prometheus/rules
vi /space/prometheus/rules/down.yml
# Rule group "down": fires InstanceDown when a scrape target's `up` metric
# has been 0 for 30 seconds.
groups:
  - name: down
    rules:
      - alert: InstanceDown
        expr: up == 0
        for: 30s
        labels:
          user: shprom
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          # Wording matches the 30s `for` duration above.
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 30 seconds."
:wq
vi /space/prometheus/rules/mem.yml
# Rule group "mem": fires NodeMemoryUsage when used memory
# (total - free - buffers - cached) stays above 80% of total for 1 minute.
groups:
  - name: mem
    rules:
      - alert: NodeMemoryUsage
        # Metric names carry the `_bytes` suffix used by node_exporter >= 0.16,
        # matching the node_filesystem_*_bytes metrics in home.yml.
        # On older exporters drop the suffix (e.g. node_memory_MemTotal).
        expr: >-
          (node_memory_MemTotal_bytes
          - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes))
          / node_memory_MemTotal_bytes * 100 > 80
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "{{ $labels.instance }} High Memory usage detected"
          description: "{{ $labels.instance }}: Memory usage is above 80% (current value is:{{ $value }})"
:wq
vi /space/prometheus/rules/cpu.yml
# Rule group "cpu": fires NodeCPUUsage when the per-instance non-idle CPU
# percentage stays above 80% for 1 minute.
groups:
  - name: cpu
    rules:
      - alert: NodeCPUUsage
        # node_cpu_seconds_total is the node_exporter >= 0.16 name, matching
        # the node_filesystem_*_bytes metrics in home.yml.
        # On older exporters the metric is called node_cpu.
        expr: (100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)) > 80
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "{{ $labels.instance }} High CPU usage detected"
          description: "{{ $labels.instance }}: CPU usage is above 80% (current value is:{{ $value }})"
:wq
vi /space/prometheus/rules/home.yml
# Rule group "home": fires NodeHomeUsage when the filesystem mounted at /home
# stays more than 80% full for 1 minute.
groups:
  - name: home
    rules:
      - alert: NodeHomeUsage
        expr: (100 - (node_filesystem_avail_bytes{mountpoint="/home"} / node_filesystem_size_bytes{mountpoint="/home"}) * 100) > 80
        for: 1m
        labels:
          severity: page
        annotations:
          # The expression measures /home disk usage, so the text says so.
          summary: "{{ $labels.instance }} High /home disk usage detected"
          description: "{{ $labels.instance }}: /home disk usage is above 80% (current value is:{{ $value }})"
:wq
vi /space/prometheus/prometheus.yml
# prometheus.yml fragment: where to send alerts and which rule files to load.
alerting:
  alertmanagers:
    - static_configs:
        # Alertmanager instance started on this host (port 9093).
        # NOTE(review): the original notes also listed "alertmanager:9093"
        # here, presumably the commented-out placeholder from the stock
        # prometheus.yml. Confirm it is not a second real target.
        - targets: ['localhost:9093']
# Relative paths; every rule file created under rules/ must be listed here
# to be loaded.
rule_files:
  - "rules/down.yml"
  - "rules/mem.yml"
  - "rules/cpu.yml"
  - "rules/home.yml"
:wq
# Start Prometheus with the given config file and TSDB storage directory.
/space/prometheus/prometheus --config.file=/space/prometheus/prometheus.yml --storage.tsdb.path=/space/prometheus/data
可以在浏览器中打开 http://<ip>:9090,通过 Status → Rules 页面和 Alerts 页面确认规则是否生效
转载于:https://blog.51cto.com/yangzhiming/2309012