# 监控告警规则 (monitoring alert rules)

# Kubernetes ConfigMap named `prometheus-rules` in namespace `kube-mon`.
# Its single data key, `alert-rules.yaml`, holds a Prometheus rules file.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  namespace: kube-mon
data:
  # Literal block scalar (`|-`): every more-indented line below is the verbatim
  # text of the rules file; the final trailing newline is stripped.
  alert-rules.yaml: |-
    groups:
      - name: 存活告警测试
        rules:
        - alert: 实例down
          expr: up{instance="192.1.3.108:9100"}==0
          for: 1m
          labels:
            project: test
            severity: High
          annotations:
            description: "服务器挂了 {{ $labels.instance }}"
      - name: CPU报警规则
        rules:
        - alert: CPU使用率告警
          expr: round(100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle",project="studio"}[1m]) )) * 100) > 90
          for: 2m
          labels:
            project: studio
            severity: warning
          annotations:
            description: "服务器: CPU使用超过90%!(值: {{ $value }}%)"
      - name: vos-CPU报警
        rules:
        - alert: CPU使用率告警
          expr: round(100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle",project="vos"}[1m]) )) * 100) > 90
          for: 2m
          labels:
            project: vos
            severity: warning
          annotations:
            description: "服务器: CPU使用超过90%!(值: {{ $value }}%)"
      - name: 内存报警规则
        rules:
        - alert: 内存使用率告警
          expr: round((1 - (node_memory_MemAvailable_bytes{project="studio"} / (node_memory_MemTotal_bytes{project="studio"})))* 100) > 80
          for: 5m
          labels:
            project: studio
            severity: warning
          annotations:
            description: "服务器: 内存使用超过80%!(值: {{ $value }}%)"
      - name: vos内存报警
        rules:
        - alert: 内存使用率告警
          expr: round((1 - (node_memory_MemAvailable_bytes{project="vos"} / (node_memory_MemTotal_bytes{project="vos"})))* 100) > 80
          for: 5m
          labels:
            project: vos
            severity: warning
          annotations:
            description: "服务器: 内存使用超过80%!(值: {{ $value }}%)"
      - name: 磁盘报警规则
        rules:
        - alert: 磁盘使用率告警
          expr: round((node_filesystem_size_bytes{project="studio",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc).*"}-node_filesystem_free_bytes{project="studio",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc).*"}) *100/(node_filesystem_avail_bytes {project="studio",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc).*"}+(node_filesystem_size_bytes{project="studio",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc).*"}-node_filesystem_free_bytes{project="studio",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc).*"}))) > 85
          for: 5m
          labels:
            project: studio
            severity: warning
          annotations:
            description: "服务器: 挂载目录{{ $labels.mountpoint }}使用超过85%!(值: {{ $value }}%)"
      - name: vos磁盘报警
        rules:
        - alert: 磁盘使用率告警
          expr: round((node_filesystem_size_bytes{project="vos",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc).*"}-node_filesystem_free_bytes{project="vos",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc).*"}) *100/(node_filesystem_avail_bytes {project="vos",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc).*"}+(node_filesystem_size_bytes{project="vos",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc).*"}-node_filesystem_free_bytes{project="vos",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc).*"}))) > 80
          for: 5m
          labels:
            project: vos
            severity: warning
          annotations:
            description: "服务器: 挂载目录{{ $labels.mountpoint }}使用超过80%!(值: {{ $value }}%)"
      - name: 存活告警规则
        rules:
        - alert: 实例down
          expr: up{project="studio"}==0
          for: 5m
          labels:
            project: studio
            severity: High
          annotations:
            description: "服务器挂了 {{ $labels.instance }}"
      - name: vos存活告警规则
        rules:
        - alert: 实例down
          expr: up{project="vos",instance!="10.8.15.28:9400"}==0
          for: 5m
          labels:
            project: vos
            severity: High
          annotations:
            description: "服务器挂了 {{ $labels.instance }}"
      - name: redis存活告警规则
        rules:
        - alert: redis down
          expr: redis_up==0
          for: 5m
          labels:
            project: studio
            severity: warning
          annotations:
            description: "redis 服务挂了 IP:{{ $labels.IP }}"
      - name: redis连接数告警规则
        rules:
        - alert: redis 连接数告警
          expr: redis_connected_clients > 1000
          for: 1m
          labels:
            project: vos
            severity: warning
          annotations:
            description: "redis 服务连接数超过最大值的80%告警,值为:{{ $value }}"
      - name: mysql存活告警规则
        rules:
        - alert: mysql down
          expr: mysql_up{project="stduio"}==0
          for: 5m
          labels:
            project: studio
            severity: warning
          annotations:
            description: "mysql 服务挂了 IP:{{ $labels.IP }}"
      - name: vos-mysql存活告警
        rules:
        - alert: mysql down
          expr: mysql_up{project="vos"}==0
          for: 5m
          labels:
            project: vos
            severity: warning
          annotations:
            description: "mysql 服务挂了 IP:{{ $labels.IP }}"
      - name: vos-mysql慢查询
        rules:
        - alert: mysql slow queries
          expr: sum(rate(mysql_global_status_slow_queries{project="vos"}[30s])) > 0.01
          for: 1m
          labels:
            project: vos
            severity: warning
          annotations:
            description: "mysql 服务出现慢查询 IP:{{ $labels.IP }}"
      - name: vos-mysql连接数告警
        rules:
        - alert: mysql 连接数告警
          expr: mysql_global_status_max_used_connections > mysql_global_variables_max_connections*0.6
          for: 1m
          labels:
            project: vos
            severity: warning
          annotations:
            description: "mysql 服务连接数过高告警 IP:{{ $labels.IP }}"
      - name: mongodb存活告警规则
        rules:
        - alert: mongodb down
          expr: mongodb_up{project="stduio"}==0
          for: 5m
          labels:
            project: studio
            severity: warning
          annotations:
            description: "mongodb 服务挂了 IP:{{ $labels.instance }}"
      - name: rocketmq存活告警规则
        rules:
        - alert: rocketmq down
          expr: count(rocketmq_broker_tps) by (cluster) < 1
          for: 5m
          labels:
            project: vos
            severity: warning
            instance: "10.8.15.27:30134"
          annotations:
            description: "rocketmq 服务挂了 IP:{{ $labels.instance }}"
      - name: rocketmq消息积压
        rules:
        - alert: rocketmq 消息积压
          expr: sum(rocketmq_producer_offset) by (topic) - on(topic)  group_right  sum(rocketmq_consumer_offset) by (group,topic) and (sum(rocketmq_producer_offset) by (topic) - on(topic)  group_right  sum(rocketmq_consumer_offset) by (group,topic) > 0) > 500
          for: 1m
          labels:
            project: vos
            severity: warning
            instance: "10.8.15.27:30134"
          annotations:
            description: "rocketmq消息积压topic:{{ $labels.topic }},积压值:{{ $value }}"
      - name: rocketmq消费延时
        rules:
        - alert: rocketmq 消费延时
          expr: (sort_desc(sum(rocketmq_group_get_latency_by_storetime) by (broker,group,topic)) and (sum(rocketmq_group_get_latency_by_storetime) by (broker,group,topic) > 0))/1000 > 30
          for: 1m
          labels:
            project: vos
            severity: warning
            instance: "10.8.15.27:30134"
          annotations:
            description: "rocketmq消费延时topic:{{ $labels.topic }},延迟值:值: {{ $value }}S"
       
      - name: POD重启规则
        rules:
        - alert: PodRestart # 15s 采集 + 15s 扫描规则,规则是1分钟前存在 pod 为 not ready 的 pod,15s 扫描一次的间隔,至少能扫描 3次,所以一定会发送
          expr: increase(kube_pod_container_status_restarts_total{project=~"vos"}[10m])>0
          for: 1m # 持续多久确认报警信息
          labels:
            project: vos-pod
            severity: warning
          annotations:
            summary: 'Container: {{ $labels.container }} 重启'
            message: 'Namespace: {{ $labels.namespace }}, Pod: {{ $labels.pod }} 过去十分钟重启{{ $value }}次'


      - name: 接口状态告警
        rules:
        - alert: Api_down # 15s 采集 + 15s 扫描规则,规则是1分钟前存在 pod 为 not ready 的 pod,15s 扫描一次的间隔,至少能扫描 3次,所以一定会发送
          expr: probe_http_status_code{project="vos"}!=200
          for: 5m
          labels:
            project: "vos"
            severity: warning
          annotations:
            description: "VOS环境API接口连接不上了: {{ $labels.instance }}"

      - name: studio接口状态告警
        rules:
        - alert: Api_down # 15s 采集 + 15s 扫描规则,规则是1分钟前存在 pod 为 not ready 的 pod,15s 扫描一次的间隔,至少能扫描 3次,所以一定会发送
          expr: probe_http_status_code{project="studio"}!=200
          for: 5m
          labels:
            project: "studio"
            severity: warning
          annotations:
            description: "studio环境API接口连接不上了: {{ $labels.instance }}"

      - name: vos接口超时
        rules:
        - alert: Api_slow # 15s 采集 + 15s 扫描规则,规则是1分钟前存在 pod 为 not ready 的 pod,15s 扫描一次的间隔,至少能扫描 3次,所以一定会发送
          expr: round(probe_duration_seconds{project="vos"}) > 3
          for: 1m
          labels:
            project: "vos"
            severity: warning
          annotations:
            description: "AI能力中台API接口 {{ $labels.instance }}响应时间超过三秒,响应时间: {{ $value }}s"

      - name: studio接口超时
        rules:
        - alert: Api_slow # 15s 采集 + 15s 扫描规则,规则是1分钟前存在 pod 为 not ready 的 pod,15s 扫描一次的间隔,至少能扫描 3次,所以一定会发送
          expr: round(probe_duration_seconds{project="studio"}) > 3
          for: 1m
          labels:
            project: "studio"
            severity: warning
          annotations:
            description: "AI能力中台API接口 {{ $labels.instance }}响应时间超过三秒,响应时间: {{ $value }}s"

      - name: AI接口超时
        rules:
        - alert: Api_slow # 15s 采集 + 15s 扫描规则,规则是1分钟前存在 pod 为 not ready 的 pod,15s 扫描一次的间隔,至少能扫描 3次,所以一定会发送
          expr: round(probe_duration_seconds{project="ai"}) > 3
          for: 1m
          labels:
            project: "ai"
            severity: warning
          annotations:
            description: "AI能力中台API接口 {{ $labels.instance }}响应时间超过三秒,响应时间: {{ $value }}s"
 

# Top-level `groups:` list of a Prometheus rules file.
# NOTE(review): this key sits at column 0, so it is not part of the ConfigMap
# block scalar earlier in this file; it reads as a separate rules file.
groups:
# Liveness: any target whose instance label does not end in "gpu" and has had
# up == 0 for eight minutes.
- name: 存活告警测试
  rules:
  - alert: 实例down
    expr: up{instance!~".*gpu"} == 0
    for: 8m
    labels:
      project: funuobase
      severity: High
    annotations:
      description: "服务挂了 {{ $labels.instance }}"
# CPU usage per instance: sum of the mean system, user and iowait rates (each
# as a percentage), rounded to the nearest integer and compared against 90.
- name: CPU报警规则
  rules:
  - alert: CPU使用率告警
    expr: |-
      round(
          avg by (instance) (rate(node_cpu_seconds_total{mode="system"}[1m])) * 100
        + avg by (instance) (rate(node_cpu_seconds_total{mode="user"}[1m])) * 100
        + avg by (instance) (rate(node_cpu_seconds_total{mode="iowait"}[1m])) * 100
      ) > 90
    for: 2m
    labels:
      project: funuobase
      severity: warning
    annotations:
      description: "服务器: CPU使用超过90%!(值: {{ $value }}%)"

# Memory usage: (1 - MemAvailable / MemTotal) * 100, rounded to the nearest
# integer and compared against 85.
- name: 内存报警规则
  rules:
  - alert: 内存使用率告警
    expr: round((1 - node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes) * 100) > 85
    for: 5m
    labels:
      project: funuobase
      severity: warning
    annotations:
      description: "服务器: 内存使用超过85%!(值: {{ $value }}%)"
# Disk usage per filesystem:
#   used = size - free;  usage% = used * 100 / (avail + used)
# Only ext*/xfs/nfs filesystems are considered, and mountpoints matching
# pod|docker|boot|olddisk_sdc|host|resolv are excluded.
- name: 磁盘报警规则
  rules:
  - alert: 磁盘使用率告警
    expr: |-
      round(
        (
            node_filesystem_size_bytes{fstype=~"ext.*|xfs|nfs",mountpoint!~".*(pod|docker|boot|olddisk_sdc|host|resolv).*"}
          - node_filesystem_free_bytes{fstype=~"ext.*|xfs|nfs",mountpoint!~".*(pod|docker|boot|olddisk_sdc|host|resolv).*"}
        ) * 100
        / (
            node_filesystem_avail_bytes{fstype=~"ext.*|xfs|nfs",mountpoint!~".*(pod|docker|boot|olddisk_sdc|host|resolv).*"}
          + (
                node_filesystem_size_bytes{fstype=~"ext.*|xfs|nfs",mountpoint!~".*(pod|docker|boot|olddisk_sdc|host|resolv).*"}
              - node_filesystem_free_bytes{fstype=~"ext.*|xfs|nfs",mountpoint!~".*(pod|docker|boot|olddisk_sdc|host|resolv).*"}
            )
        )
      ) > 85
    for: 5m
    labels:
      project: funuobase
      severity: warning
    annotations:
      description: "服务器: 挂载目录{{ $labels.mountpoint }}使用超过85%!(值: {{ $value }}%)"
# RocketMQ liveness: fires when no rocketmq_broker_tps series exists at all.
- name: rocketmq存活告警规则
  rules:
  - alert: rocketmq down
    # absent() returns 1 when the selector matches no series. The former
    # `count(...) by (cluster) < 1` could never be true: count() over an empty
    # input yields no sample at all rather than 0.
    expr: absent(rocketmq_broker_tps)
    for: 8m
    labels:
      project: funuobase
      severity: warning
      instance: "rocketmq-export.default"
    annotations:
      description: "rocketmq 服务挂了 IP:{{ $labels.instance }}"
# RocketMQ backlog for five named topics. For each topic the clause is
#   X and (X > 0)
# where X = (sum by topic of the top-1 producer offset) minus the consumer
# offset summed per (group, topic). The clauses are OR-ed together and the
# union is compared against 1000.
- name: rocketmq消息积压
  rules:
  - alert: rocketmq 消息积压
    # A disabled earlier variant of this expression carried one additional,
    # identically shaped clause for topic "om-sample-topic-sample".
    expr: |-
      (
        sum by(topic) (topk(1,rocketmq_producer_offset{topic=~"om-huizhi-topic"})) - on(topic) group_right() sum by(group, topic) (rocketmq_consumer_offset)
          and (sum by(topic) (topk(1,rocketmq_producer_offset{topic=~"om-huizhi-topic"})) - on(topic) group_right() sum by(group, topic) (rocketmq_consumer_offset) > 0)
        or
        sum by(topic) (topk(1,rocketmq_producer_offset{topic=~"om-huizhi-sub-topic"})) - on(topic) group_right() sum by(group, topic) (rocketmq_consumer_offset)
          and (sum by(topic) (topk(1,rocketmq_producer_offset{topic=~"om-huizhi-sub-topic"})) - on(topic) group_right() sum by(group, topic) (rocketmq_consumer_offset) > 0)
        or
        sum by(topic) (topk(1,rocketmq_producer_offset{topic=~"om-huizhi-topic-cut"})) - on(topic) group_right() sum by(group, topic) (rocketmq_consumer_offset)
          and (sum by(topic) (topk(1,rocketmq_producer_offset{topic=~"om-huizhi-topic-cut"})) - on(topic) group_right() sum by(group, topic) (rocketmq_consumer_offset) > 0)
        or
        sum by(topic) (topk(1,rocketmq_producer_offset{topic=~"om-huizhi-topic-video-stream"})) - on(topic) group_right() sum by(group, topic) (rocketmq_consumer_offset)
          and (sum by(topic) (topk(1,rocketmq_producer_offset{topic=~"om-huizhi-topic-video-stream"})) - on(topic) group_right() sum by(group, topic) (rocketmq_consumer_offset) > 0)
        or
        sum by(topic) (topk(1,rocketmq_producer_offset{topic=~"om-huizhi-topic-work"})) - on(topic) group_right() sum by(group, topic) (rocketmq_consumer_offset)
          and (sum by(topic) (topk(1,rocketmq_producer_offset{topic=~"om-huizhi-topic-work"})) - on(topic) group_right() sum by(group, topic) (rocketmq_consumer_offset) > 0)
      ) > 1000
    for: 5m
    labels:
      project: test
      severity: warning
      instance: "rocketmq-export.default"
    annotations:
      description: "rocketmq消息积压topic:{{ $labels.topic }},积压值:{{ $value }}"
# RocketMQ consume latency per (broker, group, topic): the summed
# rocketmq_group_get_latency_by_storetime divided by 1000, alerting above 1000.
# Sorting has no effect on which series match, and `> 1000` already implies
# `> 0`, so a single comparison is sufficient.
- name: rocketmq消费延时
  rules:
  - alert: rocketmq 消费延时
    expr: sum by (broker, group, topic) (rocketmq_group_get_latency_by_storetime) / 1000 > 1000
    for: 3m
    labels:
      project: test
      severity: warning
      instance: "rocketmq-export.default"
    annotations:
      description: "rocketmq消费延时topic:{{ $labels.topic }},延迟值:值: {{ $value }}S"
# Redis client connections above a fixed count of 7000 for three minutes.
# The group name starts with "-", so it is quoted to keep it clearly a string.
# NOTE(review): the message speaks of "80% of the maximum", but the expression
# compares against the constant 7000 - confirm the intent.
- name: "-redis连接数告警规则"
  rules:
  - alert: redis 连接数告警
    expr: redis_connected_clients > 7000
    for: 3m
    labels:
      project: test
      severity: warning
    annotations:
      description: "redis 服务连接数超过最大值的80%告警,值为:{{ $value }}"
# MySQL slow-query rate per instance over a 50s window, alerting above 0.05/s.
# Names starting with "-" are quoted to keep them clearly strings.
- name: "-mysql慢查询"
  rules:
  - alert: "-mysql slow queries"
    expr: sum by (instance) (rate(mysql_global_status_slow_queries[50s])) > 0.05
    for: 2m
    labels:
      project: test
      severity: warning
    annotations:
      description: "mysql 服务出现慢查询 IP:{{ $labels.instance }}"
# MySQL current connections above 70% of max_connections for three minutes.
# The group name starts with "-", so it is quoted to keep it clearly a string.
- name: "-mysql连接数告警"
  rules:
  - alert: mysql 连接数告警
    expr: mysql_global_status_threads_connected > mysql_global_variables_max_connections * 0.7
    for: 3m
    labels:
      project: test
      severity: warning
    annotations:
      description: "mysql 服务连接数大于最大连接数70% IP:{{ $labels.instance }},当前连接数:{{ $value }}"


# Thread count above 40000 for two minutes.
# The group name starts with "-", so it is quoted to keep it clearly a string.
- name: "-线程数告警"
  rules:
  - alert: 线程数告警
    expr: thread_num > 40000
    for: 2m
    labels:
      project: vos
      severity: warning
    annotations:
      # The rule measures thread_num, so the message names threads (线程数); it
      # previously said connections (连接数), matching neither metric nor alert.
      description: "服务器IP:{{ $labels.hostname }},线程数大于四万,当前线程数:{{ $value }}"


# Pod readiness: the minimum of kube_pod_container_status_ready over the last
# minute differs from 1, sustained for eight minutes. Pods whose names match
# the exclusion regex are ignored.
# Names starting with "-" are quoted to keep them clearly strings.
- name: "-POD状态规则"
  rules:
  # Author's note (translated): 15s scrape + 15s rule evaluation; the rule is
  # "a pod was not ready within the last minute"; with a 15s evaluation interval
  # it is evaluated at least 3 times, so it is always sent.
  - alert: "-Pod状态告警"
    expr: min_over_time(kube_pod_container_status_ready{pod!~"rke-.*|skywalking-es-init.*|nightingale-categraf.*|nightingale-.*|nginx-ingress-controller-gpj25"}[1m]) != 1
    # How long the condition must hold before the alert is confirmed.
    for: 8m
    labels:
      project: funuobase
      severity: warning
    annotations:
      description: "K8S集群 namespace:{{ $labels.namespace }},     pod name,  {{ $labels.pod }}状态异常"

# Queue backlog: the `linecount` metric above 100 for three minutes.
# Names starting with "-" are quoted to keep them clearly strings.
- name: "-接口规则"
  rules:
  # NOTE(review): the original inline note on this alert described a pod
  # not-ready check and appears to have been copied from another rule.
  - alert: "-接口告警"
    expr: linecount > 100
    # How long the condition must hold before the alert is confirmed.
    for: 3m
    labels:
      project: linecount
      severity: warning
    annotations:
      description: "接口http://10.48.81.68:32068/check/getQueueInfo?isAllFlag=1阻塞值大于100:当前lineCount值为:{{ $value }}"
# Endpoint availability: the `httpjiekou` metric differs from 200 for three
# minutes.
- name: 接口挂了
  rules:
  # NOTE(review): the original inline note on this alert described a pod
  # not-ready check and appears to have been copied from another rule.
  - alert: 接口挂了
    expr: httpjiekou != 200
    # How long the condition must hold before the alert is confirmed.
    for: 3m
    labels:
      # NOTE(review): same project label as the image-download rule in this
      # file - confirm this is intended for routing.
      project: wgetimagetime
      severity: warning
    annotations:
      description: "接口http://10.48.81.68:32068/api/getQueueInfo?isAllFlag=1请求不通"
# Image download time: the `wgetimagetime` metric above 2 for one minute.
# Names starting with "-" are quoted to keep them clearly strings.
- name: "-下载图片耗时规则"
  rules:
  # NOTE(review): the original inline note on this alert described a pod
  # not-ready check and appears to have been copied from another rule.
  - alert: "-下载图片耗时告警"
    expr: wgetimagetime > 2
    # How long the condition must hold before the alert is confirmed.
    for: 1m
    labels:
      project: wgetimagetime
      severity: warning
    annotations:
      description: "下载图片大于2s:下载耗时为:{{ $value }}s"
 

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值