Prometheus 告警规则记录(告警由 Alertmanager 接收并处理)

mysql


groups:
# MySQL alerting rules. Every annotation reads the group/name/instance/iid
# labels of the series that triggered the alert.
- name: MySQL-Alert
  rules:
  # CPU utilisation above 70 for 2m. Multiplying by mysql_up joins the two
  # metrics on `iid`; group_right makes the result carry mysql_up's labels,
  # which is where the annotation's group/name/instance come from.
  - alert: MySQL_CPU使用率过高
    expr: mysql_cpu_util * on (iid) group_right mysql_up > 70
    for: 2m
    labels:
      severity: critical
    annotations:
      description: "{{ $labels.group }}_{{ $labels.name }}:MySQL当前CPU使用率:{{ $value }}% \n> {{ $labels.instance }}\n> {{ $labels.iid }}"

  # Memory utilisation above 85 for 2m (same join on `iid` as the CPU rule).
  - alert: MySQL_内存使用率过高
    expr: mysql_mem_util * on (iid) group_right mysql_up > 85
    for: 2m
    labels:
      severity: critical
    annotations:
      description: "{{ $labels.group }}_{{ $labels.name }}:MySQL当前内存使用率:{{ $value }}% \n> {{ $labels.instance }}\n> {{ $labels.iid }}"

  # Disk utilisation above 90 for 2m (same join on `iid` as the CPU rule).
  - alert: MySQL_磁盘使用率过高
    expr: mysql_disk_util * on (iid) group_right mysql_up > 90
    for: 2m
    labels:
      severity: critical
    annotations:
      description: "{{ $labels.group }}_{{ $labels.name }}:MySQL当前磁盘使用率:{{ $value }}% \n> {{ $labels.instance }}\n> {{ $labels.iid }}"

  # IO utilisation above 90 for 2m (same join on `iid` as the CPU rule).
  - alert: MySQL_IO使用率过高
    expr: mysql_io_util * on (iid) group_right mysql_up > 90
    for: 2m
    labels:
      severity: critical
    annotations:
      description: "{{ $labels.group }}_{{ $labels.name }}:MySQL当前IO使用率:{{ $value }}% \n> {{ $labels.instance }}\n> {{ $labels.iid }}"

  # mysql_up has been 0 for 3m.
  - alert: MySQL_is_down
    expr: mysql_up == 0
    for: 3m
    labels:
      severity: critical
    annotations:
      description: "{{ $labels.group }}_{{ $labels.name }}:MySQL database is down. \n> {{ $labels.instance }}\n> {{ $labels.iid }}"

  # More than 60 new slow queries within the last minute.
  # slow_queries is a cumulative counter, so increase() is used: it handles
  # counter resets (e.g. a server restart), whereas delta() is meant for
  # gauges and goes negative on a reset.
  - alert: MySQL_慢查询过多
    expr: increase(mysql_global_status_slow_queries[1m]) > 60
    for: 1m
    labels:
      severity: critical
    annotations:
      description: "{{ $labels.group }}_{{ $labels.name }}:每分钟慢查询:{{ $value }} \n> {{ $labels.instance }}\n> {{ $labels.iid }}"

  # More than 100 running threads for 1m.
  - alert: MySQL_当前活跃的连接数过多
    expr: mysql_global_status_threads_running > 100
    for: 1m
    labels:
      severity: critical
    annotations:
      description: "{{ $labels.group }}_{{ $labels.name }}:当前活跃的连接数:{{ $value }} \n> {{ $labels.instance }}\n> {{ $labels.iid }}"

  # More than 100 processlist threads in the "updating" state for 1m.
  # An exact matcher is used because the value contains no regex.
  - alert: MySQL_当前updating状态的线程过多
    expr: mysql_info_schema_processlist_threads{state="updating"} > 100
    for: 1m
    labels:
      severity: critical
    annotations:
      description: "{{ $labels.group }}_{{ $labels.name }}:当前updating状态的线程:{{ $value }} \n> {{ $labels.instance }}\n> {{ $labels.iid }}"

  # Query rate above 30000/s for 2m.
  # rate() instead of irate(): irate only looks at the last two samples, so a
  # brief dip can reset the `for` timer; Prometheus recommends rate for alerts.
  - alert: MySQL_High_QPS
    expr: rate(mysql_global_status_questions[3m]) > 30000
    for: 2m
    labels:
      severity: warning
    annotations:
      description: "{{ $labels.group }}_{{ $labels.name }}:Mysql QPS:{{ $value | humanize }} \n> {{ $labels.instance }}\n> {{ $labels.iid }}"

  # More than 1000 open connections for 2m.
  # threads_connected is a point-in-time gauge, so it is compared directly.
  # Wrapping it in irate() measured how fast the connection count grew per
  # second, not how many connections exist.
  - alert: MySQL_Too_Many_Connections
    expr: mysql_global_status_threads_connected > 1000
    for: 2m
    labels:
      severity: warning
    annotations:
      description: "{{ $labels.group }}_{{ $labels.name }}:Mysql Connections:{{ $value | humanize }} \n> {{ $labels.instance }}\n> {{ $labels.iid }}"

  # Replication IO thread stopped. Only instances that report a master server
  # id > 0 are considered; the two metrics are matched on `instance`.
  - alert: MySQL_主从IO线程运行状态异常
    expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_io_running == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      description: "{{ $labels.group }}_{{ $labels.name }}:MySQL Slave IO thread not running \n> {{ $labels.instance }}\n> {{ $labels.iid }}"

  # Replication SQL thread stopped (same matching as the IO-thread rule).
  - alert: MySQL_主从SQL线程运行状态异常
    expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_sql_running == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      description: "{{ $labels.group }}_{{ $labels.name }}:MySQL Slave SQL thread not running \n> {{ $labels.instance }}\n> {{ $labels.iid }}"

  # Replica is more than 3 seconds behind its master for 1m.
  - alert: MySQL_主从复制延迟过高
    expr: mysql_slave_status_seconds_behind_master > 3
    for: 1m
    labels:
      severity: critical
    annotations:
      description: "{{ $labels.group }}_{{ $labels.name }}:主从复制延迟当前:{{ $value | humanize }}s \n> {{ $labels.instance }}\n> {{ $labels.iid }}"

  # Server uptime below 600 seconds for 2m, i.e. MySQL restarted recently.
  - alert: MySQL_is_Restart
    expr: mysql_global_status_uptime < 600
    for: 2m
    labels:
      severity: critical
    annotations:
      description: "{{ $labels.group }}_{{ $labels.name }}:MySQL database is Restart. \n> {{ $labels.instance }}\n> {{ $labels.iid }}"

主机


groups:
# Recording rules that precompute host CPU and memory usage percentages.
# The group is evaluated once per minute.
- name: node_usage_record_rules
  interval: 1m
  rules:
  # CPU usage %: 100 * (1 - average idle rate over the last minute),
  # aggregated per instance/vendor/account/group/name.
  - expr: >-
      (1 - avg(rate(node_cpu_seconds_total{mode="idle"}[1m]))
      by (instance,vendor,account,group,name)) * 100
    record: 'cpu:usage:rate1m'
  # Memory usage %: 100 * (1 - MemAvailable / MemTotal).
  - expr: '(1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100'
    record: 'mem:usage:rate1m'

# Host-level alerts built on node_exporter metrics. Annotations read the
# name/group/instance (and, where present, mountpoint/device) labels.
#
# Counter-based rules use rate() rather than irate(): irate only looks at the
# last two samples in the window, so a single quiet scrape can reset the `for`
# timer. Prometheus recommends rate() for alerting.
- name: node-exporter
  rules:
  # Memory usage (100 - MemAvailable/MemTotal * 100) above 90% for 5m.
  - alert: ECS内存使用率
    expr: 100 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100 > 90
    for: 5m
    labels:
      alertype: system
      severity: warning
    annotations:
      description: "{{ $labels.name }}:内存使用率{{ $value | humanize }}%\n> {{ $labels.group }}-{{ $labels.instance }}"

  # CPU usage (100 - average idle rate * 100) above 90% for 5m.
  - alert: ECS_CPU使用率
    expr: 100 - (avg by(instance,name,group,account) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 90
    for: 5m
    labels:
      alertype: system
      severity: warning
    annotations:
      description: "{{ $labels.name }}:CPU使用率{{ $value | humanize }}%\n> {{ $labels.group }}-{{ $labels.instance }}"

  # 5-minute load average divided by the number of CPUs above 1.7 for 10m.
  # The CPU count is derived by counting one mode='system' series per cpu and
  # summing those counts per instance.
  - alert: ECS系统负载
    expr: node_load5 / on (instance,name,group,account) sum(count(node_cpu_seconds_total{mode='system'}) by (cpu,instance,name,group,account)) by(instance,name,group,account) > 1.7
    for: 10m
    labels:
      alertype: system
      severity: warning
    annotations:
      description: "{{ $labels.name }}:系统负载{{ $value | humanize }}倍\n> {{ $labels.group }}-{{ $labels.instance }}"

  # Disk usage above 85% for 5m on ext*/xfs filesystems, excluding pod mounts
  # and docker devicemapper mounts (those have their own rule below).
  - alert: ECS磁盘使用率
    expr: 100 - (node_filesystem_avail_bytes/node_filesystem_size_bytes{fstype=~"ext.?|xfs",mountpoint!~".*pods.*|/var/lib/docker/devicemapper/mnt/.*"} * 100) > 85
    for: 5m
    labels:
      alertype: system
      severity: warning
    annotations:
      description: "{{ $labels.name }}_{{ $labels.mountpoint }}:磁盘使用率{{ $value | humanize }}%\n> {{ $labels.group }}-{{ $labels.instance }}"

  # Host booted less than 600 seconds ago (current time - boot time).
  - alert: ECS主机重启
    expr: node_time_seconds - node_boot_time_seconds < 600
    for: 1m
    labels:
      alertype: system
      severity: warning
    annotations:
      description: "{{ $labels.name }}:主机重启\n> {{ $labels.group }}-{{ $labels.instance }}"

  # A filesystem has reported read-only for 1m.
  - alert: ECS文件系统只读
    expr: node_filesystem_readonly == 1
    for: 1m
    labels:
      alertype: system
      severity: warning
    annotations:
      description: "{{ $labels.name }}-{{ $labels.mountpoint }}:文件系统只读\n> {{ $labels.group }}-{{ $labels.instance }}"

  # Disk usage above 85% for 5m on docker devicemapper mounts.
  - alert: K8S节点POD磁盘使用率
    expr: 100 - (node_filesystem_avail_bytes/node_filesystem_size_bytes{mountpoint=~"/var/lib/docker/devicemapper/mnt/.*"} * 100) > 85
    for: 5m
    labels:
      alertype: system
      severity: warning
    annotations:
      description: "{{ $labels.name }}_{{ $labels.mountpoint }}:磁盘使用率{{ $value | humanize }}%\n> {{ $labels.group }}-{{ $labels.instance }}"

  # Disk usage above 90% for 5m on NFS mounts. The matcher also accepts
  # fstype "nfs4", so NFSv4 mounts are covered as well as "nfs".
  - alert: NFS磁盘使用率
    expr: 100 - (node_filesystem_avail_bytes/node_filesystem_size_bytes{fstype=~"nfs4?"} * 100) > 90
    for: 5m
    labels:
      alertype: system
      severity: warning
    annotations:
      description: "{{ $labels.name }}_{{ $labels.mountpoint }}:磁盘使用率{{ $value | humanize }}%\n> {{ $labels.group }}-{{ $labels.instance }}"

  # Disk read or write throughput above 80 MB/s (bytes / 1024 / 1024) for 8m.
  - alert: ECS磁盘读写容量
    expr: (rate(node_disk_read_bytes_total[5m]) ) /1024 /1024  > 80 or (rate(node_disk_written_bytes_total[5m]) ) /1024 /1024 > 80
    for: 8m
    labels:
      alertype: disk
      severity: warning
    annotations:
      description: "{{ $labels.name }}_{{ $labels.device }}:当前IO为{{ $value | humanize }}MB/s\n> {{ $labels.group }}-{{ $labels.instance }}"

  # Inbound traffic above 70 MB/s per device for 5m; devices whose names match
  # the tap/veth/br/docker/virbr/lo/cni patterns are excluded.
  - alert: ECS网络流入(下载)数据过多
    expr: sum by(device,instance, name, group, account) (rate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*|cni.*'}[5m])) / 1024 / 1024 > 70
    for: 5m
    labels:
      alertype: network
      severity: warning
    annotations:
      description: "{{ $labels.name }}:流入数据为{{ $value | humanize }}MB/s\n> {{ $labels.group }}-{{ $labels.instance }}"

  # Outbound traffic above 70 MB/s per device for 5m (same device exclusions).
  - alert: ECS网络流出(上传)数据过多
    expr: sum by(device,instance, name, group, account) (rate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo.*|cni.*'}[5m])) / 1024 / 1024 > 70
    for: 5m
    labels:
      alertype: network
      severity: warning
    annotations:
      description: "{{ $labels.name }}:流出数据为{{ $value | humanize }}MB/s\n> {{ $labels.group }}-{{ $labels.instance }}"

# Self-monitoring: watches the `up` series of every scrape target.
- name: Itself
  rules:
  # A target's `up` value has been 0 for 3 minutes.
  - alert: Exporter状态
    for: 3m
    expr: 'up == 0'
    labels:
      severity: critical
      alertype: itself
    annotations:
      description: "{{ $labels.job }}:异常\n> {{ $labels.group }}-{{ $labels.name }}-{{ $labels.instance }}"

站点


# Site / endpoint alerts built on blackbox_exporter probe metrics.
# Annotations read the env/name/project/instance labels.
# NOTE(review): this snippet starts at the list item and has no `groups:` key
# of its own; it must sit under one when loaded as a rule file.
- name: Domain
  rules:
  # Probe has been failing for 1m.
  - alert: 站点可用性
    expr: probe_success{job="blackbox_exporter"} == 0
    for: 1m
    labels:
      alertype: domain
      severity: critical
    annotations:
      description: "{{ $labels.env }}_{{ $labels.name }}({{ $labels.project }}):站点无法访问\n> {{ $labels.instance }}"

  # Share of successful probes over the last hour (sum / count * 100) is
  # below 80%.
  - alert: 站点1h可用性低于80%
    expr: sum_over_time(probe_success{job="blackbox_exporter"}[1h])/count_over_time(probe_success{job="blackbox_exporter"}[1h]) * 100 < 80
    for: 3m
    labels:
      alertype: domain
      severity: warning
    annotations:
      description: "{{ $labels.env }}_{{ $labels.name }}({{ $labels.project }}):站点1h可用性:{{ $value }}%\n> {{ $labels.instance }}"

  # Failed probe with an HTTP status above 499, or a status code of 0.
  # probe_http_status_code is the LEFT operand of `and` because the result of
  # `and` keeps the left-hand sample value: this makes {{ $value }} in the
  # annotation the status code. With probe_success on the left it was always 0.
  - alert: 站点状态异常
    expr: (probe_http_status_code > 499 and probe_success{job="blackbox_exporter"} == 0) or probe_http_status_code == 0
    for: 1m
    labels:
      alertype: domain
      severity: warning
    annotations:
      description: "{{ $labels.env }}_{{ $labels.name }}({{ $labels.project }}):站点状态异常:{{ $value }}\n> {{ $labels.instance }}"

  # Probe duration above 0.5 seconds for 2m.
  - alert: 站点耗时过高
    expr: probe_duration_seconds > 0.5
    for: 2m
    labels:
      alertype: domain
      severity: warning
    annotations:
      description: "{{ $labels.env }}_{{ $labels.name }}({{ $labels.project }}):当前站点耗时:{{ $value | humanize }}s\n> {{ $labels.instance }}"

  # Earliest certificate expiry is less than 15 days away
  # (seconds remaining / 3600 / 24).
  - alert: SSL证书有效期
    expr: (probe_ssl_earliest_cert_expiry-time()) / 3600 / 24 < 15
    for: 2m
    labels:
      alertype: domain
      severity: warning
    annotations:
      description: "{{ $labels.env }}_{{ $labels.name }}({{ $labels.project }}):证书有效期剩余{{ $value | humanize }}天\n> {{ $labels.instance }}"

kafka

# Kafka alerting rules.
# Both rules live in a single kafka_alerts group under one `groups:` key:
# a rule file may contain `groups:` only once, and group names must be unique
# within a file.
groups:
- name: kafka_alerts
  rules:
  # Kafka broker unavailable: the `up` series of a job="kafka" target has
  # been 0 for 5 minutes.
  - alert: KafkaBrokerDown
    expr: up{job="kafka"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Kafka Broker {{ $labels.instance }} is down"
      description: "Kafka Broker instance {{ $labels.instance }} has been down for more than 5 minutes."

  # Kafka disk usage too high: available/size below 0.1 (less than 10% free)
  # on a job="kafka" filesystem for 15 minutes.
  - alert: KafkaHighDiskUsage
    expr: node_filesystem_avail_bytes{job="kafka"} / node_filesystem_size_bytes{job="kafka"} < 0.1
    for: 15m
    labels:
      severity: critical
    annotations:
      summary: "Kafka disk usage is high on {{ $labels.instance }}"
      description: "Kafka instance {{ $labels.instance }} has more than 90% disk usage."

elasticsearch

# Elasticsearch alerting rules.
# Both rules live in a single elasticsearch_alerts group under one `groups:`
# key: a rule file may contain `groups:` only once, and group names must be
# unique within a file.
groups:
- name: elasticsearch_alerts
  rules:
  # Node unavailable: the `up` series of a job="elasticsearch" target has
  # been 0 for 5 minutes.
  - alert: ElasticsearchNodeDown
    expr: up{job="elasticsearch"} == 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Elasticsearch Node {{ $labels.instance }} is down"
      description: "Elasticsearch node {{ $labels.instance }} has been down for more than 5 minutes."

  # Cluster health has turned red for 5 minutes.
  # NOTE(review): assumes the prometheus-community elasticsearch_exporter,
  # which labels this metric with `color` (green/yellow/red), not `status`;
  # a `status="red"` selector matches no series there. Confirm against the
  # exporter actually deployed.
  - alert: ElasticsearchClusterRed
    expr: elasticsearch_cluster_health_status{color="red"} > 0
    for: 5m
    labels:
      severity: critical
    annotations:
      summary: "Elasticsearch cluster is in red status"
      description: "Elasticsearch cluster {{ $labels.cluster }} is in red status. Immediate attention required."

  • 3
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
Alertmanager 是 Prometheus 的一个组件,用于处理来自 Prometheus 服务器的告警信息。Alertmanager 磁盘告警规则是用来监控 Alertmanager 服务器磁盘空间的使用情况,并在磁盘空间不足时生成告警。Alertmanager 的磁盘告警规则通常包括以下几个方面: 1. 磁盘空间阈值设置:规定了磁盘空间使用的阈值,当磁盘空间使用超过该阈值时将触发告警。 2. 告警通知方式:可以通过设置告警通知方式,如邮件、短信、PagerDuty 等,将告警信息发送给相应的接收者。 3. 重复告警规则:可以设置告警的重复频率和间隔时间,以避免频繁发送重复的告警信息。 4. 告警级别和标签:可以根据不同的告警级别和标签,对不同的告警信息进行分类和处理。 5. 告警处理逻辑:可以设置告警的处理逻辑,如静音某类告警、聚合相同类型的告警等。 Alertmanager 磁盘告警规则可以通过配置文件进行设置。在 Prometheus 的配置文件中,可以指定 Alertmanager 的地址和端口,并设置磁盘告警规则。当 Alertmanager 启动后,即可开始监控磁盘空间的使用情况,并根据规则生成告警信息。 总之,Alertmanager 磁盘告警规则是用来监控 Alertmanager 服务器磁盘空间使用情况的规则,通过设置阈值、告警通知方式、重复告警规则、告警级别和标签等,可以及时地发现并处理磁盘空间不足的情况,确保 Alertmanager 的正常运行。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值