prometheus报警规则

北京秃子运维

已于 2024-08-30 17:53:42 修改

阅读量243

点赞数 3

文章标签： prometheus

于 2024-08-30 10:50:19 首次发布

本文链接：https://blog.csdn.net/q123q123q_/article/details/141712104

版权

注意事项：要判断这些报警规则是否适用于自己的环境，请先到 Prometheus 的主页面查询，确认其中确实存在规则里用到的监控项（指标名）和监控参数（标签）。
在这里插入图片描述

mysql

# MySQL availability and replication alert rules.
# NOTE(review): the metric names look like mysqld_exporter's -- confirm they
# exist on your Prometheus before relying on these alerts.
groups:
  - name: Mysql-rules
    rules:
      # mysql_up has been 0 for 5 seconds.
      - alert: Mysql status
        expr: 'mysql_up == 0'
        for: 5s
        labels:
          severity: error
        annotations:
          summary: '您的 {{ $labels.instance }} 的 Mysql 已停止运行！'
          description: 'Mysql数据库宕机，请检查'

      # The replication IO-running flag has been 0 for 5 seconds.
      - alert: Mysql slave io thread status
        expr: 'mysql_slave_status_slave_io_running == 0'
        for: 5s
        labels:
          severity: error
        annotations:
          summary: '您的 {{ $labels.instance }} Mysql slave io thread 已停止'
          description: 'Mysql主从IO线程故障，请检测'

      # The replication SQL-running flag has been 0 for 5 seconds.
      - alert: Mysql slave sql thread status
        expr: 'mysql_slave_status_slave_sql_running == 0'
        for: 5s
        labels:
          severity: error
        annotations:
          summary: '您的 {{ $labels.instance }} Mysql slave sql thread 已停止'
          description: 'Mysql主从sql线程故障，请检测'

nginx

# Nginx availability, HTTP error-rate and latency alert rules.
groups:
  - name: nginx
    rules:
      # One alert per scrape target of job "nginx" whose `up` has been 0 for a
      # minute. Evaluating per target keeps the `instance` label that the
      # summary template reads; the earlier `sum(up{job="nginx"}) < 2` form
      # aggregated that label away and hard-coded a fleet size of two.
      - alert: "nginx status"
        expr: up{job="nginx"} == 0
        for: 1m
        labels:
          severity: error
        annotations:
          summary: "您的 {{ $labels.instance }} 的 Nginx 已停止运行！"
          description: "Nginx宕机，请检查"

      # Share of 4xx responses over the last minute, per instance, above 5%.
      # Both sides are aggregated `by (instance)` so the ratio is computed per
      # target and `$labels.instance` is populated in the annotations.
      # Label regexes are fully anchored, so no leading `^` is needed.
      - alert: NginxHighHttp4xxErrorRate
        expr: |
          sum by (instance) (rate(nginx_http_requests_total{status=~"4.."}[1m]))
            / sum by (instance) (rate(nginx_http_requests_total[1m])) * 100 > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Nginx high HTTP 4xx error rate (instance {{ $labels.instance }})"
          description: "Too many HTTP requests with status 4xx (> 5%)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

      # Share of 5xx responses over the last minute, per instance, above 5%.
      - alert: NginxHighHttp5xxErrorRate
        expr: |
          sum by (instance) (rate(nginx_http_requests_total{status=~"5.."}[1m]))
            / sum by (instance) (rate(nginx_http_requests_total[1m])) * 100 > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Nginx high HTTP 5xx error rate (instance {{ $labels.instance }})"
          description: "Too many HTTP requests with status 5xx (> 5%)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

      # p99 request duration over 30 minutes above 10 seconds.
      # histogram_quantile needs the `le` bucket label to survive aggregation,
      # so it is part of the `by` clause. The result carries only `host` and
      # `node`, which is why the summary reads `host` rather than `instance`.
      - alert: NginxLatencyHigh
        expr: |
          histogram_quantile(0.99, sum by (le, host, node) (rate(nginx_http_request_duration_seconds_bucket[30m]))) > 10
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Nginx latency high (host {{ $labels.host }})"
          description: "Nginx p99 latency is higher than 10 seconds\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"

node-exporter

# Server resource alerting policy (node_exporter metrics).
groups:
- name: 服务器资源监控
  rules:
  # Memory in use, computed as 100 - (free + cached + buffers) / total * 100.
  - alert: 内存使用率过高
    expr: 100 - (node_memory_MemFree_bytes+node_memory_Cached_bytes+node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 90
    for: 5m  # the condition must hold this long before the alert is sent to Alertmanager
    labels:
      severity: 严重告警
    annotations:
      summary: "{{ $labels.instance }} 内存使用率过高，请尽快处理！"
      description: "{{ $labels.instance }}内存使用率超过90%,当前使用率{{ $value }}%."

  # Any scrape target (the selector is not limited to one job) down for 3 minutes.
  - alert: 服务器宕机
    expr: up == 0
    for: 3m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} 服务器宕机，请尽快处理！"
      description: "{{$labels.instance}} 服务器延时超过3分钟，当前状态{{ $value }}. "

  # Non-idle CPU share, averaged over all cores of an instance.
  - alert: CPU高负荷
    expr: 100 - (avg by (instance,job)(irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
    for: 5m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} CPU使用率过高，请尽快处理！"
      description: "{{$labels.instance}} CPU使用大于90%，当前使用率{{ $value }}%. "

  # Fraction of time the disks were busy, averaged over an instance's devices.
  - alert: 磁盘IO性能
    expr: avg(irate(node_disk_io_time_seconds_total[1m])) by(instance,job)* 100 > 90
    for: 5m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} 流入磁盘IO使用率过高，请尽快处理！"
      description: "{{$labels.instance}} 流入磁盘IO大于90%,当前使用率{{ $value }}%."

  # Inbound traffic summed over physical interfaces.
  # The device filter is a regular expression, not a glob: `virbr.*` is needed
  # to match virbr0/virbr1, and loopback is matched by the literal `lo`.
  # NOTE(review): `/ 100 > 102400` is equivalent to 10,240,000 bytes/s
  # (about 9.8 MiB/s), which does not obviously match the "100M" in the
  # description -- confirm the intended unit before changing the threshold.
  - alert: 网络流入
    expr: ((sum(rate (node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance,job)) / 100) > 102400
    for: 5m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} 流入网络带宽过高，请尽快处理！"
      description: "{{$labels.instance}} 流入网络带宽持续5分钟高于100M. RX带宽使用量{{$value}}."

  # Outbound traffic; same device filter and threshold caveat as the inbound rule.
  # The description now uses a well-formed `{{$value}}` template and says TX.
  - alert: 网络流出
    expr: ((sum(rate (node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) by (instance,job)) / 100) > 102400
    for: 5m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} 流出网络带宽过高,请尽快处理！"
      description: "{{$labels.instance}} 流出网络带宽持续5分钟高于100M. TX带宽使用量{{$value}}."

  # Established TCP connections; the value is a connection count, not a
  # percentage, so the annotations report it as a count.
  - alert: TCP连接数
    expr: node_netstat_Tcp_CurrEstab > 10000
    for: 2m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.instance}} TCP_ESTABLISHED过高！"
      description: "{{$labels.instance}} TCP_ESTABLISHED连接数大于10000,当前连接数{{ $value }}."

  # Used space on ext4/xfs filesystems, per mountpoint.
  - alert: 磁盘容量
    expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes{fstype=~"ext4|xfs"}*100) > 90
    for: 1m
    labels:
      severity: 严重告警
    annotations:
      summary: "{{$labels.mountpoint}} 磁盘分区使用率过高，请尽快处理！"
      description: "{{$labels.instance}} 磁盘分区使用大于90%，当前使用率{{ $value }}%."

php-fpm

# PHP-FPM liveness alert.
groups:
  - name: php_fpm_alerts
    rules:
      # phpfpm_up has been 0 for 5 seconds.
      - alert: PHP 7.4-FPM status
        expr: 'phpfpm_up == 0'
        for: 5s
        labels:
          severity: error
        annotations:
          summary: '您的 {{ $labels.instance }} 的 PHP 7.4-FPM 已停止运行！'
          description: 'PHP 7.4-FPM 宕机，请检查'

redis

# Redis memory, throughput, client-count and availability alert rules.
# NOTE(review): metric names assume the widely used redis_exporter
# (redis_memory_used_bytes, redis_memory_max_bytes, ...). Verify them against
# your exporter's /metrics output; a rule on a non-existent metric never fires.
groups:
  - name: redis_alerts
    rules:
      # Memory used by Redis above the threshold for 5 minutes.
      - alert: RedisHighMemoryUsage
        expr: redis_memory_used_bytes{job="redis"} > 1048576000  # 1000 MiB (~1 GB)
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Redis instance {{ $labels.instance }} high memory usage"
          description: "Redis memory usage is {{ $value }} bytes which is above the threshold of 1GB."

      # More than 1000 commands per second, averaged over one minute.
      - alert: RedisHighCommandRate
        expr: rate(redis_commands_processed_total{job="redis"}[1m]) > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis instance {{ $labels.instance }} high command rate"
          description: "Redis is processing {{ $value }} commands per second which is above the threshold."

      # More than 1000 connected clients.
      - alert: RedisHighConnectedClients
        expr: redis_connected_clients{job="redis"} > 1000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "Redis instance {{ $labels.instance }} high number of connected clients"
          description: "Redis has {{ $value }} connected clients which is above the threshold."

      # Headroom below the configured maxmemory under 100 MiB.
      # The difference is the left-hand side so `$value` is the available
      # byte count used in the description. The `and ... > 0` guard skips
      # instances with no maxmemory limit, where headroom is undefined.
      - alert: RedisLowMemoryAvailable
        expr: >-
          (redis_memory_max_bytes{job="redis"} - redis_memory_used_bytes{job="redis"}) < 104857600
          and redis_memory_max_bytes{job="redis"} > 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Redis instance {{ $labels.instance }} low available memory"
          description: "Redis available memory is {{ $value }} bytes which is below the threshold of 100MB."

      # The scrape target of job "redis" has been down for a minute.
      - alert: RedisInstanceDown
        expr: up{job="redis"} == 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Redis instance {{ $labels.instance }} is down"
          description: "Redis instance {{ $labels.instance }} is not reachable."

ssl证书过期

# TLS certificate expiry alert.
groups:
  - name: ssl_cert_alerts
    rules:
      # ssl_cert_expiry has been below 7 for one hour.
      # NOTE(review): presumably the metric is expressed in days remaining, as
      # the description implies -- confirm against the exporter producing it.
      - alert: SSL_Certificate_Expiry_Soon
        expr: 'ssl_cert_expiry < 7'
        for: 1h
        labels:
          severity: warning
        annotations:
          summary: 'SSL certificate for {{ $labels.hostname }} is expiring soon'
          description: 'The SSL certificate for {{ $labels.hostname }} will expire in less than 7 days.'

nginx

# Nginx connection, latency and exporter-health alert rules.
groups:
  - name: nginx_alerts
    rules:
      # Active connections above 1000.
      - alert: HighNginxActiveConnections
        expr: 'nginx_connections_active > 1000'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'Nginx active connections are high'
          description: 'Nginx active connections are over 1000 for more than 5 minutes.'

      # Average request processing time above 200 (milliseconds, per the description).
      - alert: HighNginxRequestMsec
        expr: 'avg(nginx_requestMsec) > 200'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High Nginx request processing time'
          description: 'The average request processing time is greater than 200ms for more than 5 minutes.'

      # More than 5000 connections accepted within a 5-minute window.
      - alert: HighNginxConnectionsAccepted
        expr: 'increase(nginx_connections_accepted[5m]) > 5000'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'High number of accepted connections on Nginx'
          description: 'Nginx accepted connections exceeded 5000 in the last 5 minutes.'

      # Connections in the reading state above 100.
      - alert: HighNginxConnectionsReading
        expr: 'nginx_connections_reading > 100'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High number of connections in reading state'
          description: 'Nginx connections in reading state are over 100 for more than 5 minutes.'

      # Handled-connection rate lower than accepted-connection rate.
      - alert: HighNginxConnectionsHandled
        expr: 'rate(nginx_connections_handled[5m]) < rate(nginx_connections_accepted[5m])'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Nginx connections handled fewer than accepted'
          description: 'Nginx is handling fewer connections than it has accepted over the last 5 minutes.'

      # Average upstream response time above 300 (milliseconds, per the description).
      - alert: HighNginxUpstreamResponseMsec
        expr: 'avg(nginx_upstream_responseMsec) > 300'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High Nginx upstream response time'
          description: 'The average upstream response time is greater than 300ms for more than 5 minutes.'

      # nginx_up has been 0 for 2 minutes.
      # NOTE(review): the alert name and description talk about upstream
      # servers, but the expression checks nginx_up, not an upstream-specific
      # metric -- confirm which condition is intended.
      - alert: NginxUpstreamDown
        expr: 'nginx_up == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'Nginx upstream server down'
          description: 'One or more Nginx upstream servers are down for more than 2 minutes.'

      # The scrape target of job "nginx-vts-exporter" has been down for 5 minutes.
      - alert: NginxVTSExporterDown
        expr: 'up{job="nginx-vts-exporter"} == 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'Nginx VTS Exporter is down'
          description: 'Nginx VTS Exporter has not been running for more than 5 minutes.'

规则解释：
HighNginxActiveConnections: 当 nginx_connections_active 超过 1000 时触发报警。
HighNginxRequestMsec: 当平均请求处理时间超过 200 毫秒时触发报警。
HighNginxConnectionsAccepted: 当在 5 分钟内接受的连接数超过 5000 时触发报警。
HighNginxConnectionsReading: 当 nginx_connections_reading 超过 100 时触发报警。
HighNginxConnectionsHandled: 当处理的连接数少于接受的连接数时触发报警。
HighNginxUpstreamResponseMsec: 当平均上游服务器响应时间超过 300 毫秒时触发报警。
NginxUpstreamDown: 当 nginx_up 指标为 0 并持续 2 分钟时触发报警（注意：规则实际检查的是 nginx_up，而不是上游服务器专用的指标）。
NginxVTSExporterDown: 当 VTS Exporter 服务不可用时触发报警。

php

# PHP-FPM pool alert rules: connections, processes, queueing, slow requests
# and exporter health.
groups:
  - name: phpfpm_alerts
    rules:
      # More than 1000 connections accepted within a 5-minute window.
      - alert: HighPHPFPMAcceptedConnections
        expr: 'increase(phpfpm_accepted_connections_total[5m]) > 1000'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High number of accepted connections in PHP-FPM'
          description: 'PHP-FPM accepted connections exceeded 1000 in the last 5 minutes.'

      # phpfpm_active_max_processes above 50.
      - alert: HighPHPFPMActiveProcesses
        expr: 'phpfpm_active_max_processes > 50'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'PHP-FPM active processes reached max'
          description: 'PHP-FPM active processes reached the maximum limit of 50 for more than 5 minutes.'

      # Connections waiting in the listen queue above 10.
      - alert: HighPHPFPMListenQueueConnections
        expr: 'phpfpm_listen_queue_connections > 10'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High number of connections in PHP-FPM listen queue'
          description: 'PHP-FPM listen queue connections are over 10 for more than 5 minutes.'

      # Listen queue length above 20.
      - alert: HighPHPFPMListenQueueLength
        expr: 'phpfpm_listen_queue_length_connections > 20'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'PHP-FPM listen queue length is high'
          description: 'PHP-FPM listen queue length has exceeded 20 for more than 5 minutes.'

      # Listen queue maximum-connections value above 30.
      - alert: HighPHPFPMListenQueueMaxConnections
        expr: 'phpfpm_listen_queue_max_connections > 30'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'PHP-FPM listen queue max connections reached'
          description: 'PHP-FPM listen queue max connections have exceeded 30 for more than 5 minutes.'

      # The max-children-reached counter increased within the last 5 minutes.
      - alert: PHPFPMMaxChildrenReached
        expr: 'increase(phpfpm_max_children_reached_total[5m]) > 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'PHP-FPM max children reached'
          description: 'PHP-FPM has reached the maximum number of child processes in the last 5 minutes.'

      # Total process count above 70.
      - alert: HighPHPFPMProcessesTotal
        expr: 'phpfpm_processes_total > 70'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High total number of PHP-FPM processes'
          description: 'Total PHP-FPM processes count is over 70 for more than 5 minutes.'

      # The scrape-failure counter increased within the last 5 minutes.
      - alert: PHPFPMScrapeFailures
        expr: 'increase(phpfpm_scrape_failures_total[5m]) > 0'
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: 'PHP-FPM scrape failures detected'
          description: 'PHP-FPM scrape failures have occurred in the last 5 minutes.'

      # More than 10 slow requests within a 5-minute window.
      - alert: HighPHPFPMSlowRequests
        expr: 'increase(phpfpm_slow_requests_total[5m]) > 10'
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 'High number of slow requests in PHP-FPM'
          description: 'PHP-FPM slow requests exceeded 10 in the last 5 minutes.'

      # phpfpm_up has been 0 for 2 minutes.
      - alert: PHPFPMDown
        expr: 'phpfpm_up == 0'
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: 'PHP-FPM is down'
          description: 'PHP-FPM service is not running for more than 2 minutes.'

规则解释：
HighPHPFPMAcceptedConnections: 当 PHP-FPM 在 5 分钟内接受的连接数超过 1000 时触发报警。
HighPHPFPMActiveProcesses: 当 PHP-FPM 活动进程数达到最大限制（如 50）时触发报警。
HighPHPFPMListenQueueConnections: 当监听队列中的连接数超过 10 时触发报警。
HighPHPFPMListenQueueLength: 当监听队列的长度超过 20 时触发报警。
HighPHPFPMListenQueueMaxConnections: 当监听队列的最大连接数超过 30 时触发报警。
PHPFPMMaxChildrenReached: 当 PHP-FPM 的子进程数达到最大限制时触发报警。
HighPHPFPMProcessesTotal: 当 PHP-FPM 的总进程数超过 70 时触发报警。
PHPFPMScrapeFailures: 当 PHP-FPM 抓取失败次数增加时触发报警。
HighPHPFPMSlowRequests: 当 PHP-FPM 的慢请求数在 5 分钟内超过 10 时触发报警。
PHPFPMDown: 当 PHP-FPM 服务不可用时触发报警。

以下是一个适用于监控 Node (服务器或节点) 状态的 Prometheus 报警规则示例。此规则文件涵盖了 CPU、内存、磁盘空间、网络以及其他与节点健康相关的常见监控指标。这些报警规则可以帮助你在节点出现性能问题或资源不足时及时得到通知。

Prometheus 报警规则示例配置文件

# Node (host) health alert rules based on node_exporter metrics.
groups:
  - name: node_alerts
    rules:
      # CPU usage (100 - idle share, averaged over cores) above 90%.
      - alert: HighCpuUsage
        expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage detected on {{ $labels.instance }}"
          description: "CPU usage is above 90% for more than 5 minutes on {{ $labels.instance }}."

      # Memory usage, (total - available) / total, above 90%.
      - alert: HighMemoryUsage
        expr: (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) / node_memory_MemTotal_bytes * 100 > 90
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage detected on {{ $labels.instance }}"
          description: "Memory usage is above 90% for more than 5 minutes on {{ $labels.instance }}."

      # Root filesystem usage above 90%.
      - alert: LowDiskSpace
        expr: (node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) / node_filesystem_size_bytes{mountpoint="/"} * 100 > 90
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "The disk space usage is above 90% on {{ $labels.instance }} for more than 5 minutes."

      # Node rebooted recently: uptime (now - boot time) is under 5 minutes.
      # The comparison must be `<`: with `>` the rule fires for every node that
      # has been up longer than 5 minutes. No hold time is used, because an
      # uptime below 300 s can never persist for a further 5 minutes.
      - alert: NodeReboot
        expr: time() - node_boot_time_seconds < 300
        for: 0m
        labels:
          severity: warning
        annotations:
          summary: "Node {{ $labels.instance }} was rebooted recently"
          description: "Node {{ $labels.instance }} has been rebooted within the last 5 minutes."

      # Scrape target of job "node" down for 2 minutes.
      - alert: NodeDown
        expr: up{job="node"} == 0
        for: 2m
        labels:
          severity: critical
        annotations:
          summary: "Node {{ $labels.instance }} is down"
          description: "Prometheus has not been able to scrape metrics from {{ $labels.instance }} for more than 2 minutes."

      # Receive rate above 10,000,000 bytes/s on any single interface.
      - alert: HighNetworkReceive
        expr: rate(node_network_receive_bytes_total[5m]) > 10000000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High network receive traffic on {{ $labels.instance }}"
          description: "Network receive traffic is greater than 10 MB/s on {{ $labels.instance }} for more than 5 minutes."

      # Transmit rate above 10,000,000 bytes/s on any single interface.
      - alert: HighNetworkTransmit
        expr: rate(node_network_transmit_bytes_total[5m]) > 10000000
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High network transmit traffic on {{ $labels.instance }}"
          description: "Network transmit traffic is greater than 10 MB/s on {{ $labels.instance }} for more than 5 minutes."

      # Allocated file descriptors above 80% of the system maximum.
      - alert: HighFileDescriptorUsage
        expr: node_filefd_allocated{job="node"} / node_filefd_maximum{job="node"} > 0.8
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High file descriptor usage on {{ $labels.instance }}"
          description: "File descriptor usage is above 80% on {{ $labels.instance }} for more than 5 minutes."

      # 1-minute load average above 1.5x the CPU count of the instance.
      # The right-hand side only carries `instance` after aggregation, so the
      # comparison is matched explicitly `on (instance)`; without it the two
      # sides have different label sets and the rule never matches anything.
      # `group_left` keeps the left-hand series' own labels on the result.
      - alert: HighLoadAverage
        expr: node_load1 > on (instance) group_left (count by (instance) (node_cpu_seconds_total{mode="system"}) * 1.5)
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High load average detected on {{ $labels.instance }}"
          description: "The 1-minute load average is greater than 1.5 times the number of CPUs on {{ $labels.instance }} for more than 5 minutes."

规则解释：
HighCpuUsage: 当节点的 CPU 使用率超过 90% 并持续 5 分钟时触发报警。
HighMemoryUsage: 当节点的内存使用率超过 90% 并持续 5 分钟时触发报警。
LowDiskSpace: 当节点的根分区磁盘空间使用率超过 90% 并持续 5 分钟时触发报警。
NodeReboot: 当节点在过去 5 分钟内重新启动时触发报警。
NodeDown: 当 Prometheus 无法从节点获取指标超过 2 分钟时触发报警。
HighNetworkReceive: 当网络接收流量超过 10 MB/s 并持续 5 分钟时触发报警。
HighNetworkTransmit: 当网络发送流量超过 10 MB/s 并持续 5 分钟时触发报警。
HighFileDescriptorUsage: 当文件描述符使用率超过 80% 并持续 5 分钟时触发报警。
HighLoadAverage: 当 1 分钟平均负载超过 CPU 核心数的 1.5 倍并持续 5 分钟时触发报警。