# host 主机告警规则 (host alert rules)
# Host memory usage: fires when (MemTotal - MemAvailable) / MemTotal stays
# above 0.90 for 1 minute. The expression yields a 0-1 ratio, so the
# description renders $value through humanizePercentage instead of printing
# the raw ratio next to the "90%" threshold text.
- alert: hostMemUsageAlert
  expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)/node_memory_MemTotal_bytes > 0.90
  for: 1m
  labels:
    severity: page
  annotations:
    summary: "实例 {{ $labels.instance }} 内存使用率过高"
    description: "实例 {{ $labels.instance }} 内存使用率超过 90% (当前值为: {{ $value | humanizePercentage }})"
# Low disk space: warns when the filesystem with mountpoint "/rootfs" has had
# less than 10% of its size available for 5 minutes.
- alert: '主机磁盘空间不足'
  # Folded scalar: the line breaks below collapse to single spaces.
  expr: >-
    (node_filesystem_avail_bytes{mountpoint="/rootfs"} * 100)
    / node_filesystem_size_bytes{mountpoint="/rootfs"}
    < 10
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: '主机磁盘空间不足 (instance {{ $labels.instance }})'
    # Literal scalar: continuation lines keep their one leading space.
    description: |-
      磁盘几乎满了 (< 10% left)
       VALUE = {{ $value }}
       LABELS: {{ $labels }}
# Disk read latency: average time spent per completed read over the last
# minute (read-time rate divided by completed-reads rate).
# node_disk_read_time_seconds_total is measured in seconds, so the 100 ms
# threshold stated in the description is 0.1, not 100; $value is in seconds.
# The `and` clause drops series with no completed reads in the window, so a
# zero denominator cannot produce a spurious match.
- alert: 主机异常磁盘读取延迟
  expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: "主机异常磁盘读取延迟 (instance {{ $labels.instance }})"
    description: "磁盘延迟正在增长 (read operations > 100ms)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
# High CPU load: 100 minus the per-instance average of the idle-mode
# irate (5m window) scaled to percent; warns when the result stays above 99
# for 5 minutes.
- alert: '主机Cpu高负载'
  # Folded scalar: the line break below collapses to a single space.
  expr: >-
    100 - (avg by(instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100)
    > 99
  for: 5m
  labels:
    severity: warning
  annotations:
    summary: '主机Cpu高负载 (instance {{ $labels.instance }})'
    # Literal scalar: continuation lines keep their one leading space.
    description: |-
      Cpu 负载 > 99%
       VALUE = {{ $value }}
       LABELS: {{ $labels }}
# High outbound network throughput: sums the transmit-bytes irate (2m window)
# per instance, divides by 1024 twice, and warns when the result stays above
# 100 for 5 minutes.
- alert: '主机网络接口可能正在发送过多数据'
  # Folded scalar: the line breaks below collapse to single spaces.
  expr: >-
    sum by (instance) (irate(node_network_transmit_bytes_total[2m]))
    / 1024 / 1024
    > 100
  for: 5m
  labels:
    severity: warning
  annotations:
    # The leading space inside the quotes is part of the original summary.
    summary: ' 主机 (instance {{ $labels.instance }}) 网络吞吐量较大'
    # Literal scalar: continuation lines keep their one leading space.
    description: |-
      主机网络接口可能发送了太多的数据 (> 100 MB/s)
       VALUE = {{ $value }}
       LABELS: {{ $labels }}
# mysql 告警规则 (MySQL alert rules)
# MySQL alert group: availability, connection saturation, running threads,
# and slow queries.
- name: MySQL_Alert
  rules:
    # MySQL down: mysql_up has been 0 for 5 seconds.
    - alert: MySQL status
      expr: mysql_up == 0
      for: 5s
      labels:
        severity: critical
      annotations:
        summary: "Instance {{ $labels.instance }} MySQL宕机"
        description: "MySQL 数据库宕机,需要立即采取行动!"

    # Too many connections: the 5-minute peak of connected threads exceeds
    # 80% of max_connections (both averaged per instance) for 5 minutes.
    - alert: MySQL连接太多
      expr: avg by (instance) (max_over_time(mysql_global_status_threads_connected[5m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 80
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "{{ $labels.instance }} 实例的Mysql连接太多"
        description: "超过80%的MySQL连接在 {{ $labels.instance }}上\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

    # Many running threads: the 5-minute peak of running threads exceeds
    # 60% of max_connections (both averaged per instance) for 5 minutes.
    - alert: MySQL高线程运行
      expr: avg by (instance) (max_over_time(mysql_global_status_threads_running[5m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 60
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "{{ $labels.instance }} 实例的Mysql高线程运行"
        description: "超过60%的MySQL连接连接处于运行状态 {{ $labels.instance }}\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

    # Slow queries: presumably mysql_global_status_slow_queries is a
    # cumulative counter - confirm against the exporter. If so, comparing
    # its raw value to 3 keeps the alert firing once more than three slow
    # queries have ever occurred. increase() over 5 minutes alerts on new
    # slow queries only; $value is the number added in that window.
    - alert: MySQL查询速度慢
      expr: increase(mysql_global_status_slow_queries[5m]) > 3
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "{{ $labels.instance }} 实例的Mysql查询速度慢"
        description: "MySQL服务有一些慢查询.\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
# pgsql 告警规则 (PostgreSQL alert rules)
# PostgreSQL alert group: availability, restarts, connection saturation,
# deadlocks, and slow queries.
- name: PostgreSQL_Alert
  rules:
    # PostgreSQL down: pg_up has been 0 for 5 minutes.
    - alert: PostgreSQL 数据库挂掉了
      expr: pg_up == 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "{{ $labels.instance }} 实例的 PostgreSQL 数据库挂掉了"
        description: "PostgreSQL宕机,需立即处理!"

    # Restart detection: the postmaster start time is less than 60 seconds
    # old. The condition can only hold for about one minute, so `for` must be
    # zero; with a 5m hold the alert could never fire.
    - alert: PostgreSQL重新启动
      expr: time() - pg_postmaster_start_time_seconds < 60
      for: 0m
      labels:
        severity: critical
      annotations:
        summary: "{{ $labels.instance }} 实例的PostgreSQL重启"
        description: "PostgreSQL 在不到一分钟前刚刚重新启动 (instance {{ $labels.instance }})\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

    # Connection saturation: active connections (excluding template* and
    # postgres databases) exceed 90% of max_connections. Both sides are
    # aggregated by instance so their label sets match; aggregating the left
    # side by datname leaves no labels in common with
    # pg_settings_max_connections, and drops the instance label that the
    # summary uses.
    - alert: PostgreSQL的连接数不足
      expr: sum by (instance) (pg_stat_activity_count{datname!~"template.*|postgres"}) > min by (instance) (pg_settings_max_connections) * 0.9
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Postgresql 的连接数不足10% {{ $labels.instance }}"
        description: "PostgreSQL instance has too many connections\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

    # Deadlocks: the 1-minute deadlock rate of any non-template, non-postgres
    # database has been above zero for 5 minutes.
    - alert: PostgreSQL死锁
      expr: rate(pg_stat_database_deadlocks{datname!~"template.*|postgres"}[1m]) > 0
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "{{ $labels.instance }}实例的Postgresql死锁"
        description: "PostgreSQL已死锁\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

    # Slow queries: pg_slow_queries has been above 3 for 5 minutes.
    # NOTE(review): pg_slow_queries is not defined in this file; presumably
    # it comes from a custom exporter query - confirm it is a gauge.
    - alert: PostgreSQL慢查询
      expr: pg_slow_queries > 3
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "{{ $labels.instance }} 实例的Postgresql慢速查询"
        description: "PostgreSQL执行查询缓慢\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
# http 告警规则 (HTTP alert rules)
# HTTP probe failure: fires when probe_http_status_code has been at or below
# 199, or at or above 400, for 20 seconds.
- alert: 'Web访问异常'
  # Folded scalar: the line break below collapses to a single space.
  expr: >-
    probe_http_status_code <= 199
    OR probe_http_status_code >= 400
  for: 20s
  labels:
    severity: critical
  annotations:
    summary: '{{ $labels.instance }} HTTP请求失败'
    # Literal scalar: the continuation line keeps its one leading space.
    description: |-
      HTTP 状态码 {{ $value }}
       LABELS: {{ $labels }}