注意
此处的监控测试内容为:主机存活情况,以及CPU使用率超过60%、内存使用率超过75%、磁盘使用率超过85%时的告警。
# Prometheus alerting rules: target liveness plus CPU, memory and disk usage
# thresholds. All rules live in the single group "general.rules".
groups:
  - name: general.rules
    rules:
      # Fires when a target's `up` series has been 0 for 3 minutes.
      - alert: 主机宕机
        expr: up == 0
        for: 3m
        labels:
          # NOTE(review): this key was previously misspelled "serverity";
          # any Alertmanager route/template matching the old key must be
          # updated to "severity" as well.
          severity: A+
        annotations:
          summary: "主机 {{ $labels.instance }} 停止工作"
          # Duration in the message matches the `for: 3m` hold time above.
          description: "{{ $labels.instance }} job {{ $labels.job }} 已经宕机3分钟以上!"

      # Fires when the per-instance average non-idle CPU share stays above
      # 60% for 5 minutes.
      - alert: CPU使用率过高
        # rate() instead of irate(): irate() only uses the last two samples
        # in the window, so a brief dip can reset the `for` timer.
        expr: 100 * (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[2m])) by(instance)) > 60
        for: 5m
        labels:
          severity: high
        annotations:
          summary: "主机{{$labels.instance}}: High CPU Usage Detected"
          description: "{{$labels.instance}}: CPU usage is {{$value}}, above 60%"

      # Fires when (MemTotal - MemAvailable) / MemTotal stays above 75% for
      # 5 minutes.
      - alert: HostMemory
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 75
        for: 5m
        labels:
          severity: middle
        annotations:
          summary: "{{$labels.instance}}: High Memory Usage Detected"
          description: "{{$labels.instance}}: Memory Usage is {{ $value }}, above 75%"

      # Fires when an xfs/ext4 filesystem stays above 85% used for 5 minutes.
      - alert: HostDisk
        # Only the first selector filters on fstype; the other operands are
        # joined by label matching, so non-xfs/ext4 series drop out of the
        # result.
        expr: 100 * (node_filesystem_size_bytes{fstype=~"xfs|ext4"} - node_filesystem_avail_bytes) / node_filesystem_size_bytes > 85
        for: 5m
        labels:
          severity: low
        annotations:
          summary: "{{$labels.instance}}: High Disk Usage Detected"
          description: "{{$labels.instance}}, mountpoint {{$labels.mountpoint}}: Disk Usage is {{ $value }}, above 85%"
修改后重启 Prometheus 服务