# Prometheus alerting rules for host-level health based on `up` and node_*
# metrics: availability, CPU, memory, filesystem capacity, TCP sessions,
# network throughput and disk I/O.
# Every rule must hold for 5 minutes (`for: 5m`) before it fires; the `status`
# label carries the severity tier.
groups:
- name: 主机状态
  rules:
  # Scrape target is down.
  - alert: 主机状态
    expr: up == 0
    for: 5m
    labels:
      status: 灾难告警
    annotations:
      summary: "{{$labels.instance}}:服务器宕机"
      description: "{{$labels.instance}}:服务器不可达超过5分钟"

  # Any of the 1/5/15-minute load averages is above 5.
  - alert: CPU繁忙程度
    expr: node_load1 > 5 or node_load5 > 5 or node_load15 > 5
    for: 5m
    labels:
      status: 一般告警
    annotations:
      summary: "{{$labels.instance}} CPU繁忙!"
      description: "{{$labels.instance}} CPU load_avg:{{$value}}"

  # Overall CPU utilisation (100% minus the idle share) per instance above 60%.
  - alert: CPU使用率
    expr: >-
      (1 - sum(increase(node_cpu_seconds_total{mode="idle"}[1m])) by(instance)
      / sum(increase(node_cpu_seconds_total[1m])) by(instance)) * 100 > 60
    for: 5m
    labels:
      status: 一般告警
    annotations:
      summary: "{{$labels.instance}} CPU使用率过高!"
      description: "{{$labels.instance}} CPU使用大于60%(目前使用:{{$value}}%)"

  # Share of CPU time spent in user mode per instance above 60%.
  - alert: CPU用户使用率
    expr: >-
      (sum(increase(node_cpu_seconds_total{mode="user"}[1m])) by(instance)
      / sum(increase(node_cpu_seconds_total[1m])) by(instance)) * 100 > 60
    for: 5m
    labels:
      status: 一般告警
    annotations:
      summary: "{{$labels.instance}} CPU用户使用率过高!"
      description: "{{$labels.instance}} CPU用户使用率大于60%(目前使用:{{$value}}%)"

  # Share of CPU time spent in system mode per instance above 60%.
  - alert: CPU系统使用率
    expr: >-
      (sum(increase(node_cpu_seconds_total{mode="system"}[1m])) by(instance)
      / sum(increase(node_cpu_seconds_total[1m])) by(instance)) * 100 > 60
    for: 5m
    labels:
      status: 一般告警
    annotations:
      summary: "{{$labels.instance}} CPU系统使用率过高!"
      description: "{{$labels.instance}} CPU系统使用率大于60%(目前使用:{{$value}}%)"

  # Memory utilisation above 60% (warning tier).
  - alert: 内存使用率
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 60
    for: 5m
    labels:
      status: 一般告警
    annotations:
      summary: "{{$labels.instance}} 内存使用率过高!"
      description: "{{$labels.instance}} 内存使用大于60%(目前使用:{{$value}}%)"

  # Memory utilisation above 75% (severe tier; same alert name, higher threshold).
  - alert: 内存使用率
    expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 75
    for: 5m
    labels:
      status: 严重告警
    annotations:
      summary: "{{$labels.instance}} 内存使用率过高!"
      description: "{{$labels.instance}} 内存使用大于75%(目前使用:{{$value}}%)"

  # Used space on xfs/ext* filesystems above 80% (warning tier).
  # `=~` is required: a plain `=` would compare against the literal regex text.
  - alert: 目录容量
    expr: >-
      (node_filesystem_size_bytes{mountpoint=~".*",fstype=~"xfs|ext.*"}
      - node_filesystem_free_bytes{mountpoint=~".*",fstype=~"xfs|ext.*"})
      / node_filesystem_size_bytes{mountpoint=~".*",fstype=~"xfs|ext.*"}
      * 100 > 80
    for: 5m
    labels:
      status: 一般告警
    annotations:
      summary: "{{$labels.mountpoint}} 目录容量使用率过高!"
      description: "{{$labels.mountpoint}} 目录容量使用大于80%(目前使用:{{$value}}%)"

  # Used space on xfs/ext* filesystems above 90% (severe tier).
  - alert: 目录容量
    expr: >-
      (node_filesystem_size_bytes{mountpoint=~".*",fstype=~"xfs|ext.*"}
      - node_filesystem_free_bytes{mountpoint=~".*",fstype=~"xfs|ext.*"})
      / node_filesystem_size_bytes{mountpoint=~".*",fstype=~"xfs|ext.*"}
      * 100 > 90
    for: 5m
    labels:
      status: 严重告警
    annotations:
      summary: "{{$labels.mountpoint}} 目录容量使用率过高!"
      description: "{{$labels.mountpoint}} 目录容量使用大于90%(目前使用:{{$value}}%)"

  # Linear extrapolation of the last hour predicts free space reaching zero
  # within 7 days (7*24*3600 seconds).
  - alert: 目录爆满预测
    expr: >-
      predict_linear(
      node_filesystem_free_bytes{mountpoint=~".*",fstype=~"xfs|ext.*"}[1h],
      7*24*3600) < 0
    for: 5m
    labels:
      status: 灾难告警
    annotations:
      summary: "{{$labels.mountpoint}} 目录容量即将爆满!"
      description: "{{$labels.mountpoint}} 目录容量将于7天内爆满"

  # More than 1000 established TCP connections.
  # NOTE(review): the summary text after "TCP_ESTABLISH" was truncated in the
  # recovered source; "TCP_ESTABLISHED过高!" is a reconstruction — confirm.
  - alert: TCP会话
    expr: node_netstat_Tcp_CurrEstab > 1000
    for: 5m
    labels:
      status: 严重告警
    annotations:
      summary: "{{$labels.instance}} TCP_ESTABLISHED过高!"
      description: "{{$labels.instance}} TCP_ESTABLISHED大于1000(目前使用:{{$value}})"

  # Inbound traffic summed over physical interfaces (virtual/loopback excluded).
  # NOTE(review): rate() is in bytes/s, so `/ 100 > 102400` fires at roughly
  # 10.24 MB/s (~82 Mbit/s) — confirm this is the intended "100M" threshold.
  - alert: 网络流入
    expr: >-
      ((sum(rate(node_network_receive_bytes_total{device!~"tap.*|veth.*|br.*|docker.*|virbr.*|lo"}[5m]))
      by (instance)) / 100) > 102400
    for: 5m
    labels:
      status: 严重告警
    annotations:
      summary: "{{$labels.instance}} 流入网络带宽过高!"
      description: "{{$labels.instance}} 流入网络带宽持续5分钟高于100M. RX带宽使用率{{$value}}"

  # Outbound traffic summed over physical interfaces (virtual/loopback excluded).
  # NOTE(review): same threshold caveat as the inbound rule — confirm.
  - alert: 网络流出
    expr: >-
      ((sum(rate(node_network_transmit_bytes_total{device!~"tap.*|veth.*|br.*|docker.*|virbr.*|lo"}[5m]))
      by (instance)) / 100) > 102400
    for: 5m
    labels:
      status: 严重告警
    annotations:
      summary: "{{$labels.instance}} 流出网络带宽过高!"
      description: "{{$labels.instance}} 流出网络带宽持续5分钟高于100M. TX带宽使用率{{$value}}"

  # Average disk busy time across devices per instance above 60%.
  - alert: IO性能
    expr: (avg(irate(node_disk_io_time_seconds_total[1m])) by(instance) * 100) > 60
    for: 5m
    labels:
      status: 严重告警
    annotations:
      summary: "{{$labels.instance}} 流入磁盘IO使用率过高!"
      description: "{{$labels.instance}} 流入磁盘IO大于60%(目前使用:{{$value}})"
-