注意事项:如何判断报警规则是否适用于自己的环境:请到 Prometheus 的主页面查看,那里列出了可用的监控项(指标)和监控参数。
mysql
groups:
  # MySQL availability and replication-thread alerts.
  # Nesting restored: each rule must sit under its group's `rules:` list.
  - name: Mysql-rules
    rules:
      # Fires when mysql_up has been 0 for 5 seconds.
      - alert: "Mysql status"
        expr: mysql_up == 0
        for: 5s
        labels:
          severity: error
        annotations:
          summary: "您的 {{ $labels.instance }} 的 Mysql 已停止运行!"
          description: "Mysql数据库宕机,请检查"

      # Fires when the replication IO-thread gauge has been 0 for 5 seconds.
      - alert: "Mysql slave io thread status"
        expr: mysql_slave_status_slave_io_running == 0
        for: 5s
        labels:
          severity: error
        annotations:
          summary: "您的 {{ $labels.instance }} Mysql slave io thread 已停止"
          description: "Mysql主从IO线程故障,请检测"

      # Fires when the replication SQL-thread gauge has been 0 for 5 seconds.
      - alert: "Mysql slave sql thread status"
        expr: mysql_slave_status_slave_sql_running == 0
        for: 5s
        labels:
          severity: error
        annotations:
          summary: "您的 {{ $labels.instance }} Mysql slave sql thread 已停止"
          description: "Mysql主从sql线程故障,请检测"
nginx
groups:
  # Nginx availability, HTTP error-rate and latency alerts.
  # Nesting restored: each rule must sit under its group's `rules:` list.
  - name: nginx
    rules:
      # Fires per scrape target when `up` for job "nginx" has been 0 for
      # 1 minute. Evaluated per instance (rather than sum(...) < 2) so the
      # alert keeps the `instance` label that the summary prints and does not
      # depend on a hard-coded instance count.
      - alert: "nginx status"
        expr: up{job="nginx"} == 0
        for: 1m
        labels:
          severity: error
        annotations:
          summary: "您的 {{ $labels.instance }} 的 Nginx 已停止运行!"
          description: "Nginx宕机,请检查"

      # Fires when 4xx responses exceed 5% of all requests for 5 minutes.
      # NOTE(review): both sides are aggregated with a bare sum(), so the
      # result carries no labels and {{ $labels.instance }} renders empty.
      - alert: NginxHighHttp4xxErrorRate
        expr: |
          sum(rate(nginx_http_requests_total{status=~"^4.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})"
          description: "Too many HTTP requests with status 4xx (> 5%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Fires when 5xx responses exceed 5% of all requests for 5 minutes.
      # NOTE(review): both sides are aggregated with a bare sum(), so the
      # result carries no labels and {{ $labels.instance }} renders empty.
      - alert: NginxHighHttp5xxErrorRate
        expr: |
          sum(rate(nginx_http_requests_total{status=~"^5.."}[1m])) / sum(rate(nginx_http_requests_total[1m])) * 100 > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})"
          description: "Too many HTTP requests with status 5xx (> 5%)\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"

      # Fires when the p99 request duration stays above 10 seconds for
      # 5 minutes. `le` must survive the aggregation: histogram_quantile()
      # needs the bucket boundaries to compute a quantile.
      - alert: NginxLatencyHigh
        expr: |
          histogram_quantile(0.99, sum(rate(nginx_http_request_duration_seconds_bucket[30m])) by (le, host, node)) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Nginx latency high (instance {{ $labels.instance }})"
          description: "Nginx p99 latency is higher than 10 seconds\n VALUE = {{ $value }}\n LABELS: {{ $labels }}"
node-exporter
# Server resource alerting policy.
# Nesting restored: each rule must sit under its group's `rules:` list.
groups:
  - name: 服务器资源监控
    rules:
      # Memory usage: 100 minus the (free + cached + buffers) share of total.
      - alert: 内存使用率过高
        expr: 100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 90
        for: 5m  # how long the condition must hold before it is sent to Alertmanager
        labels:
          severity: 严重告警
        annotations:
          summary: "{{ $labels.instance }} 内存使用率过高,请尽快处理!"
          description: "{{ $labels.instance }}内存使用率超过90%,当前使用率{{ $value }}%."

      # Any scrape target (no job filter) whose `up` has been 0 for 3 minutes.
      - alert: 服务器宕机
        expr: up == 0
        for: 3m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.instance}} 服务器宕机,请尽快处理!"
          description: "{{$labels.instance}} 服务器延时超过3分钟,当前状态{{ $value }}. "

      # CPU usage: 100 minus the average idle percentage per instance/job.
      - alert: CPU高负荷
        expr: 100 - (avg by (instance,job)(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
        for: 5m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.instance}} CPU使用率过高,请尽快处理!"
          description: "{{$labels.instance}} CPU使用大于90%,当前使用率{{ $value }}%. "

      # Disk I/O busy time, averaged over all devices of an instance/job.
      - alert: 磁盘IO性能
        expr: avg(irate(node_disk_io_time_seconds_total[1m])) by(instance,job)* 100 > 90
        for: 5m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流入磁盘IO使用率过高,请尽快处理!"
          description: "{{$labels.instance}} 流入磁盘IO大于90%,当前使用率{{ $value }}%."

      # Inbound traffic summed per instance/job, excluding devices whose names
      # match the listed prefixes and the loopback device. Label regexes are
      # fully anchored, so the prefixes need `.*` (the former `virbr*` / `lo*`
      # did not match names such as virbr0).
      # NOTE(review): the condition is true above 10,240,000 bytes/s, while the
      # description says "100M" - confirm the intended unit.
      - alert: 网络流入
        expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance,job)) / 100) > 102400
        for: 5m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流入网络带宽过高,请尽快处理!"
          description: "{{$labels.instance}} 流入网络带宽持续5分钟高于100M. RX带宽使用量{{$value}}."

      # Outbound counterpart of the rule above; same device filter and
      # threshold. The description now uses a well-formed {{$value}} template
      # and says TX instead of RX.
      - alert: 网络流出
        expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance,job)) / 100) > 102400
        for: 5m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流出网络带宽过高,请尽快处理!"
          description: "{{$labels.instance}} 流出网络带宽持续5分钟高于100M. TX带宽使用量{{$value}}."

      # Established TCP connections above 10000 for 2 minutes. The value is a
      # connection count, so the description reports a count, not a percentage.
      - alert: TCP连接数
        expr: node_netstat_Tcp_CurrEstab > 10000
        for: 2m
        labels:
          severity: 严重告警
        annotations:
          summary: " TCP_ESTABLISHED过高!"
          description: "{{$labels.instance}} TCP_ESTABLISHED大于10000,当前连接数{{ $value }}."

      # Used-space percentage on ext4/xfs filesystems.
      - alert: 磁盘容量
        expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 90
        for: 1m
        labels:
          severity: 严重告警
        annotations:
          summary: "{{$labels.mountpoint}} 磁盘分区使用率过高,请尽快处理!"
          description: "{{$labels.instance}} 磁盘分区使用大于90%,当前使用率{{ $value }}%."
php-fpm
groups:
  # PHP-FPM availability alert.
  # Nesting restored: each rule must sit under its group's `rules:` list.
  - name: php_fpm_alerts
    rules:
      # Fires when phpfpm_up has been 0 for 5 seconds.
      - alert: "PHP 7.4-FPM status"
        expr: phpfpm_up == 0
        for: 5s
        labels:
          severity: error
        annotations:
          summary: "您的 {{ $labels.instance }} 的 PHP 7.4-FPM 已停止运行!"
          description: "PHP 7.4-FPM 宕机,请检查"
redis
groups:
  # Redis memory, throughput, client-count and availability alerts.
  # Nesting restored: each rule must sit under its group's `rules:` list.
  # NOTE(review): confirm that redis_memory_usage_bytes and
  # redis_memory_available_bytes are actually exported by the Redis exporter
  # in use; a rule on a missing metric never fires.
  - name: redis_alerts
    rules:
      # Memory usage above 1,048,576,000 bytes for 5 minutes.
      - alert: RedisHighMemoryUsage
        expr: redis_memory_usage_bytes{job="redis"} > 1048576000  # 1000 MiB (~1 GB)
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Redis instance {{ $labels.instance }} high memory usage"
          description: "Redis memory usage is {{ $value }} bytes which is above the threshold of 1GB."

      # More than 1000 commands per second (1-minute rate) for 5 minutes.
      - alert: RedisHighCommandRate
        expr: rate(redis_commands_processed_total{job="redis"}[1m]) > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis instance {{ $labels.instance }} high command rate"
          description: "Redis is processing {{ $value }} commands per second which is above the threshold."

      # More than 1000 connected clients for 5 minutes.
      - alert: RedisHighConnectedClients
        expr: redis_connected_clients{job="redis"} > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis instance {{ $labels.instance }} high number of connected clients"
          description: "Redis has {{ $value }} connected clients which is above the threshold."

      # Available memory below 104,857,600 bytes for 5 minutes.
      - alert: RedisLowMemoryAvailable
        expr: redis_memory_available_bytes{job="redis"} < 104857600  # 100 MiB
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Redis instance {{ $labels.instance }} low available memory"
          description: "Redis available memory is {{ $value }} bytes which is below the threshold of 100MB."

      # Scrape target of job "redis" has reported up == 0 for 1 minute.
      - alert: RedisInstanceDown
        expr: up{job="redis"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis instance {{ $labels.instance }} is down"
          description: "Redis instance {{ $labels.instance }} is not reachable."
SSL 证书过期
groups:
  # TLS certificate expiry alert.
  # Nesting restored: each rule must sit under its group's `rules:` list.
  - name: ssl_cert_alerts
    rules:
      # Fires when ssl_cert_expiry has been below 7 for 1 hour.
      # NOTE(review): the description implies the metric is "days until
      # expiry" - confirm the unit against the exporter that provides it.
      - alert: SSL_Certificate_Expiry_Soon
        expr: ssl_cert_expiry < 7
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: "SSL certificate for {{ $labels.hostname }} is expiring soon"
          description: "The SSL certificate for {{ $labels.hostname }} will expire in less than 7 days."
nginx
groups:
  # Nginx connection, latency, upstream and exporter-health alerts.
  # Nesting restored: each rule must sit under its group's `rules:` list.
  - name: nginx_alerts
    rules:
      # Active connections above the threshold.
      - alert: HighNginxActiveConnections
        expr: nginx_connections_active > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Nginx active connections are high"
          description: "Nginx active connections are over 1000 for more than 5 minutes."

      # Average request processing time too long.
      - alert: HighNginxRequestMsec
        expr: avg(nginx_requestMsec) > 200
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High Nginx request processing time"
          description: "The average request processing time is greater than 200ms for more than 5 minutes."

      # Too many connections accepted within a 5-minute window.
      - alert: HighNginxConnectionsAccepted
        expr: increase(nginx_connections_accepted[5m]) > 5000
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High number of accepted connections on Nginx"
          description: "Nginx accepted connections exceeded 5000 in the last 5 minutes."

      # Too many connections in the reading state.
      - alert: HighNginxConnectionsReading
        expr: nginx_connections_reading > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High number of connections in reading state"
          description: "Nginx connections in reading state are over 100 for more than 5 minutes."

      # Handled-connection rate lower than accepted-connection rate.
      - alert: HighNginxConnectionsHandled
        expr: rate(nginx_connections_handled[5m]) < rate(nginx_connections_accepted[5m])
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Nginx connections handled fewer than accepted"
          description: "Nginx is handling fewer connections than it has accepted over the last 5 minutes."

      # Average upstream response time too long.
      - alert: HighNginxUpstreamResponseMsec
        expr: avg(nginx_upstream_responseMsec) > 300
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High Nginx upstream response time"
          description: "The average upstream response time is greater than 300ms for more than 5 minutes."

      # Intended to watch upstream server health.
      # NOTE(review): the expression checks nginx_up, which by its name looks
      # like the state of Nginx itself rather than of an upstream - confirm.
      - alert: NginxUpstreamDown
        expr: nginx_up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Nginx upstream server down"
          description: "One or more Nginx upstream servers are down for more than 2 minutes."

      # Scrape target of job "nginx-vts-exporter" has reported up == 0.
      - alert: NginxVTSExporterDown
        expr: up{job="nginx-vts-exporter"} == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Nginx VTS Exporter is down"
          description: "Nginx VTS Exporter has not been running for more than 5 minutes."
规则解释:
HighNginxActiveConnections: 当 nginx_connections_active 超过 1000 时触发报警。
HighNginxRequestMsec: 当平均请求处理时间超过 200 毫秒时触发报警。
HighNginxConnectionsAccepted: 当在 5 分钟内接受的连接数超过 5000 时触发报警。
HighNginxConnectionsReading: 当 nginx_connections_reading 超过 100 时触发报警。
HighNginxConnectionsHandled: 当处理的连接数少于接受的连接数时触发报警。
HighNginxUpstreamResponseMsec: 当平均上游服务器响应时间超过 300 毫秒时触发报警。
NginxUpstreamDown: 当上游服务器不可用时触发报警。
NginxVTSExporterDown: 当 VTS Exporter 服务不可用时触发报警。
php
groups:
  # PHP-FPM connection, process, queue and availability alerts.
  # Nesting restored: each rule must sit under its group's `rules:` list.
  - name: phpfpm_alerts
    rules:
      # Accepted connections within a 5-minute window.
      - alert: HighPHPFPMAcceptedConnections
        expr: increase(phpfpm_accepted_connections_total[5m]) > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High number of accepted connections in PHP-FPM"
          description: "PHP-FPM accepted connections exceeded 1000 in the last 5 minutes."

      # phpfpm_active_max_processes above 50.
      - alert: HighPHPFPMActiveProcesses
        expr: phpfpm_active_max_processes > 50
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "PHP-FPM active processes reached max"
          description: "PHP-FPM active processes reached the maximum limit of 50 for more than 5 minutes."

      # Connections currently waiting in the listen queue.
      - alert: HighPHPFPMListenQueueConnections
        expr: phpfpm_listen_queue_connections > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High number of connections in PHP-FPM listen queue"
          description: "PHP-FPM listen queue connections are over 10 for more than 5 minutes."

      # Listen queue length.
      - alert: HighPHPFPMListenQueueLength
        expr: phpfpm_listen_queue_length_connections > 20
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "PHP-FPM listen queue length is high"
          description: "PHP-FPM listen queue length has exceeded 20 for more than 5 minutes."

      # Maximum number of connections seen in the listen queue.
      - alert: HighPHPFPMListenQueueMaxConnections
        expr: phpfpm_listen_queue_max_connections > 30
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "PHP-FPM listen queue max connections reached"
          description: "PHP-FPM listen queue max connections have exceeded 30 for more than 5 minutes."

      # max_children counter increased within the last 5 minutes.
      - alert: PHPFPMMaxChildrenReached
        expr: increase(phpfpm_max_children_reached_total[5m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "PHP-FPM max children reached"
          description: "PHP-FPM has reached the maximum number of child processes in the last 5 minutes."

      # Total number of PHP-FPM processes.
      - alert: HighPHPFPMProcessesTotal
        expr: phpfpm_processes_total > 70
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High total number of PHP-FPM processes"
          description: "Total PHP-FPM processes count is over 70 for more than 5 minutes."

      # Scrape-failure counter increased within the last 5 minutes.
      - alert: PHPFPMScrapeFailures
        expr: increase(phpfpm_scrape_failures_total[5m]) > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "PHP-FPM scrape failures detected"
          description: "PHP-FPM scrape failures have occurred in the last 5 minutes."

      # Slow requests within a 5-minute window.
      - alert: HighPHPFPMSlowRequests
        expr: increase(phpfpm_slow_requests_total[5m]) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High number of slow requests in PHP-FPM"
          description: "PHP-FPM slow requests exceeded 10 in the last 5 minutes."

      # phpfpm_up has been 0 for 2 minutes.
      - alert: PHPFPMDown
        expr: phpfpm_up == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "PHP-FPM is down"
          description: "PHP-FPM service is not running for more than 2 minutes."
规则解释:
HighPHPFPMAcceptedConnections: 当 PHP-FPM 在 5 分钟内接受的连接数超过 1000 时触发报警。
HighPHPFPMActiveProcesses: 当 PHP-FPM 活动进程数达到最大限制(如 50)时触发报警。
HighPHPFPMListenQueueConnections: 当监听队列中的连接数超过 10 时触发报警。
HighPHPFPMListenQueueLength: 当监听队列的长度超过 20 时触发报警。
HighPHPFPMListenQueueMaxConnections: 当监听队列的最大连接数超过 30 时触发报警。
PHPFPMMaxChildrenReached: 当 PHP-FPM 的子进程数达到最大限制时触发报警。
HighPHPFPMProcessesTotal: 当 PHP-FPM 的总进程数超过 70 时触发报警。
PHPFPMScrapeFailures: 当 PHP-FPM 抓取失败次数增加时触发报警。
HighPHPFPMSlowRequests: 当 PHP-FPM 的慢请求数在 5 分钟内超过 10 时触发报警。
PHPFPMDown: 当 PHP-FPM 服务不可用时触发报警。
以下是一个适用于监控 Node (服务器或节点) 状态的 Prometheus 报警规则示例。此规则文件涵盖了 CPU、内存、磁盘空间、网络以及其他与节点健康相关的常见监控指标。这些报警规则可以帮助你在节点出现性能问题或资源不足时及时得到通知。
Prometheus 报警规则示例配置文件
groups:
  # Node (host) health alerts: CPU, memory, disk, reboot, reachability,
  # network throughput, file descriptors and load average.
  # Nesting restored: each rule must sit under its group's `rules:` list.
  - name: node_alerts
    rules:
      # CPU usage: 100 minus the average idle percentage per instance.
      - alert: HighCpuUsage
        expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected on {{ $labels.instance }}"
          description: "CPU usage is above 90% for more than 5 minutes on {{ $labels.instance }}."

      # Memory usage derived from MemTotal and MemAvailable.
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected on {{ $labels.instance }}"
          description: "Memory usage is above 90% for more than 5 minutes on {{ $labels.instance }}."

      # Used-space percentage of the filesystem mounted at "/".
      - alert: LowDiskSpace
        expr: (node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) / node_filesystem_size_bytes{mountpoint="/"} * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "The disk space usage is above 90% on {{ $labels.instance }} for more than 5 minutes."

      # Recent reboot: uptime (now minus boot time) below 300 seconds.
      # The comparison was `> 300`, which is true for every node that has been
      # up longer than 5 minutes. `for` is 0m because an uptime below 300 s
      # cannot remain true for a further 5 minutes.
      - alert: NodeReboot
        expr: time() - node_boot_time_seconds < 300
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Node {{ $labels.instance }} was rebooted recently"
          description: "Node {{ $labels.instance }} has been rebooted within the last 5 minutes."

      # Scrape target of job "node" has reported up == 0 for 2 minutes.
      - alert: NodeDown
        expr: up{job="node"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Node {{ $labels.instance }} is down"
          description: "Prometheus has not been able to scrape metrics from {{ $labels.instance }} for more than 2 minutes."

      # Receive rate above 10,000,000 bytes/s; evaluated per series, with no
      # aggregation across devices.
      - alert: HighNetworkReceive
        expr: rate(node_network_receive_bytes_total[5m]) > 10000000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High network receive traffic on {{ $labels.instance }}"
          description: "Network receive traffic is greater than 10 MB/s on {{ $labels.instance }} for more than 5 minutes."

      # Transmit rate above 10,000,000 bytes/s; evaluated per series, with no
      # aggregation across devices.
      - alert: HighNetworkTransmit
        expr: rate(node_network_transmit_bytes_total[5m]) > 10000000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High network transmit traffic on {{ $labels.instance }}"
          description: "Network transmit traffic is greater than 10 MB/s on {{ $labels.instance }} for more than 5 minutes."

      # Allocated file descriptors above 80% of the maximum.
      - alert: HighFileDescriptorUsage
        expr: node_filefd_allocated{job="node"} / node_filefd_maximum{job="node"} > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High file descriptor usage on {{ $labels.instance }}"
          description: "File descriptor usage is above 80% on {{ $labels.instance }} for more than 5 minutes."

      # 1-minute load above 1.5x the number of mode="system" CPU series.
      # `on (instance)` is required: the count is grouped by instance only, so
      # without it the two sides have different label sets and never match.
      - alert: HighLoadAverage
        expr: node_load1 > on (instance) (count by (instance) (node_cpu_seconds_total{mode="system"}) * 1.5)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High load average detected on {{ $labels.instance }}"
          description: "The 1-minute load average is greater than 1.5 times the number of CPUs on {{ $labels.instance }} for more than 5 minutes."
规则解释:
HighCpuUsage: 当节点的 CPU 使用率超过 90% 并持续 5 分钟时触发报警。
HighMemoryUsage: 当节点的内存使用率超过 90% 并持续 5 分钟时触发报警。
LowDiskSpace: 当节点的根分区磁盘空间使用率超过 90% 并持续 5 分钟时触发报警。
NodeReboot: 当节点在过去 5 分钟内重新启动时触发报警。
NodeDown: 当 Prometheus 无法从节点获取指标超过 2 分钟时触发报警。
HighNetworkReceive: 当网络接收流量超过 10 MB/s 并持续 5 分钟时触发报警。
HighNetworkTransmit: 当网络发送流量超过 10 MB/s 并持续 5 分钟时触发报警。
HighFileDescriptorUsage: 当文件描述符使用率超过 80% 并持续 5 分钟时触发报警。
HighLoadAverage: 当 1 分钟平均负载超过 CPU 核心数的 1.5 倍并持续 5 分钟时触发报警。