# ConfigMap carrying a Prometheus alerting-rule file under the key
# `alert-rules.yaml`.
#
# NOTE(review): the previous revision declared `groups:` twice inside
# alert-rules.yaml and reused seven group names. The two lists are merged
# below and the colliding names in the second list are prefixed with
# `funuobase-`. The second list's selectors have no `project` matcher, so they
# overlap with the studio/vos rules of the first list -- confirm that both
# sets are really meant to be active in the same Prometheus.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-rules
  namespace: kube-mon
data:
  alert-rules.yaml: |-
    groups:
      # ---------------------------------------------------------------
      # Rule set 1: studio / vos / ai (selected via the `project` label)
      # ---------------------------------------------------------------
      - name: 存活告警测试
        rules:
          - alert: 实例down
            expr: up{instance="192.1.3.108:9100"}==0
            for: 1m
            labels:
              project: test
              severity: High
            annotations:
              description: "服务器挂了 {{ $labels.instance }}"
      - name: CPU报警规则
        rules:
          - alert: CPU使用率告警
            expr: round(100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle",project="studio"}[1m]) )) * 100) > 90
            for: 2m
            labels:
              project: studio
              severity: warning
            annotations:
              description: "服务器: CPU使用超过90%!(值: {{ $value }}%)"
      - name: vos-CPU报警
        rules:
          - alert: CPU使用率告警
            expr: round(100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle",project="vos"}[1m]) )) * 100) > 90
            for: 2m
            labels:
              project: vos
              severity: warning
            annotations:
              description: "服务器: CPU使用超过90%!(值: {{ $value }}%)"
      - name: 内存报警规则
        rules:
          - alert: 内存使用率告警
            expr: round((1 - (node_memory_MemAvailable_bytes{project="studio"} / (node_memory_MemTotal_bytes{project="studio"})))* 100) > 80
            for: 5m
            labels:
              project: studio
              severity: warning
            annotations:
              description: "服务器: 内存使用超过80%!(值: {{ $value }}%)"
      - name: vos内存报警
        rules:
          - alert: 内存使用率告警
            expr: round((1 - (node_memory_MemAvailable_bytes{project="vos"} / (node_memory_MemTotal_bytes{project="vos"})))* 100) > 80
            for: 5m
            labels:
              project: vos
              severity: warning
            annotations:
              description: "服务器: 内存使用超过80%!(值: {{ $value }}%)"
      - name: 磁盘报警规则
        rules:
          - alert: 磁盘使用率告警
            expr: round((node_filesystem_size_bytes{project="studio",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc).*"}-node_filesystem_free_bytes{project="studio",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc).*"}) *100/(node_filesystem_avail_bytes {project="studio",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc).*"}+(node_filesystem_size_bytes{project="studio",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc).*"}-node_filesystem_free_bytes{project="studio",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc).*"}))) > 85
            for: 5m
            labels:
              project: studio
              severity: warning
            annotations:
              description: "服务器: 挂载目录{{ $labels.mountpoint }}使用超过85%!(值: {{ $value }}%)"
      - name: vos磁盘报警
        rules:
          - alert: 磁盘使用率告警
            expr: round((node_filesystem_size_bytes{project="vos",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc).*"}-node_filesystem_free_bytes{project="vos",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc).*"}) *100/(node_filesystem_avail_bytes {project="vos",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc).*"}+(node_filesystem_size_bytes{project="vos",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc).*"}-node_filesystem_free_bytes{project="vos",fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc).*"}))) > 80
            for: 5m
            labels:
              project: vos
              severity: warning
            annotations:
              description: "服务器: 挂载目录{{ $labels.mountpoint }}使用超过80%!(值: {{ $value }}%)"
      - name: 存活告警规则
        rules:
          - alert: 实例down
            expr: up{project="studio"}==0
            for: 5m
            labels:
              project: studio
              severity: High
            annotations:
              description: "服务器挂了 {{ $labels.instance }}"
      - name: vos存活告警规则
        rules:
          - alert: 实例down
            expr: up{project="vos",instance!="10.8.15.28:9400"}==0
            for: 5m
            labels:
              project: vos
              severity: High
            annotations:
              description: "服务器挂了 {{ $labels.instance }}"
      - name: redis存活告警规则
        rules:
          - alert: redis down
            expr: redis_up==0
            for: 5m
            labels:
              project: studio
              severity: warning
            annotations:
              description: "redis 服务挂了 IP:{{ $labels.IP }}"
      - name: redis连接数告警规则
        rules:
          - alert: redis 连接数告警
            expr: redis_connected_clients > 1000
            for: 1m
            labels:
              project: vos
              severity: warning
            annotations:
              description: "redis 服务连接数超过最大值的80%告警,值为:{{ $value }}"
      - name: mysql存活告警规则
        rules:
          - alert: mysql down
            # Selector previously read project="stduio" (typo), which matched
            # nothing; aligned with the `project: studio` label below.
            expr: mysql_up{project="studio"}==0
            for: 5m
            labels:
              project: studio
              severity: warning
            annotations:
              description: "mysql 服务挂了 IP:{{ $labels.IP }}"
      - name: vos-mysql存活告警
        rules:
          - alert: mysql down
            expr: mysql_up{project="vos"}==0
            for: 5m
            labels:
              project: vos
              severity: warning
            annotations:
              description: "mysql 服务挂了 IP:{{ $labels.IP }}"
      - name: vos-mysql慢查询
        rules:
          - alert: mysql slow queries
            # Aggregated `by (IP)` because a bare sum() drops every label and
            # left $labels.IP in the description empty. Assumes the targets
            # carry an `IP` label, as the sibling descriptions do -- TODO confirm.
            expr: sum by (IP) (rate(mysql_global_status_slow_queries{project="vos"}[30s])) > 0.01
            for: 1m
            labels:
              project: vos
              severity: warning
            annotations:
              description: "mysql 服务出现慢查询 IP:{{ $labels.IP }}"
      - name: vos-mysql连接数告警
        rules:
          - alert: mysql 连接数告警
            expr: mysql_global_status_max_used_connections > mysql_global_variables_max_connections*0.6
            for: 1m
            labels:
              project: vos
              severity: warning
            annotations:
              description: "mysql 服务连接数过高告警 IP:{{ $labels.IP }}"
      - name: mongodb存活告警规则
        rules:
          - alert: mongodb down
            # Selector previously read project="stduio" (typo), which matched
            # nothing; aligned with the `project: studio` label below.
            expr: mongodb_up{project="studio"}==0
            for: 5m
            labels:
              project: studio
              severity: warning
            annotations:
              description: "mongodb 服务挂了 IP:{{ $labels.instance }}"
      - name: rocketmq存活告警规则
        rules:
          - alert: rocketmq down
            # Was `count(rocketmq_broker_tps) by (cluster) < 1`, which can never
            # be true: count() yields no sample when the metric is missing and
            # at least 1 otherwise. absent() fires when no series exists.
            expr: absent(rocketmq_broker_tps)
            for: 5m
            labels:
              project: vos
              severity: warning
              instance: "10.8.15.27:30134"
            annotations:
              description: "rocketmq 服务挂了 IP:{{ $labels.instance }}"
      - name: rocketmq消息积压
        rules:
          - alert: rocketmq 消息积压
            expr: sum(rocketmq_producer_offset) by (topic) - on(topic) group_right sum(rocketmq_consumer_offset) by (group,topic) and (sum(rocketmq_producer_offset) by (topic) - on(topic) group_right sum(rocketmq_consumer_offset) by (group,topic) > 0) > 500
            for: 1m
            labels:
              project: vos
              severity: warning
              instance: "10.8.15.27:30134"
            annotations:
              description: "rocketmq消息积压topic:{{ $labels.topic }},积压值:{{ $value }}"
      - name: rocketmq消费延时
        rules:
          - alert: rocketmq 消费延时
            expr: (sort_desc(sum(rocketmq_group_get_latency_by_storetime) by (broker,group,topic)) and (sum(rocketmq_group_get_latency_by_storetime) by (broker,group,topic) > 0))/1000 > 30
            for: 1m
            labels:
              project: vos
              severity: warning
              instance: "10.8.15.27:30134"
            annotations:
              description: "rocketmq消费延时topic:{{ $labels.topic }},延迟值:值: {{ $value }}S"
      - name: POD重启规则
        rules:
          - alert: PodRestart
            # Fires for containers whose restart counter increased in the last 10m.
            expr: increase(kube_pod_container_status_restarts_total{project=~"vos"}[10m])>0
            for: 1m  # how long the condition must hold before the alert fires
            labels:
              project: vos-pod
              severity: warning
            annotations:
              summary: 'Container: {{ $labels.container }} 重启'
              message: 'Namespace: {{ $labels.namespace }}, Pod: {{ $labels.pod }} 过去十分钟重启{{ $value }}次'
      - name: 接口状态告警
        rules:
          - alert: Api_down
            expr: probe_http_status_code{project="vos"}!=200
            for: 5m
            labels:
              project: "vos"
              severity: warning
            annotations:
              description: "VOS环境API接口连接不上了: {{ $labels.instance }}"
      - name: studio接口状态告警
        rules:
          - alert: Api_down
            expr: probe_http_status_code{project="studio"}!=200
            for: 5m
            labels:
              project: "studio"
              severity: warning
            annotations:
              description: "studio环境API接口连接不上了: {{ $labels.instance }}"
      - name: vos接口超时
        rules:
          - alert: Api_slow
            expr: round(probe_duration_seconds{project="vos"}) > 3
            for: 1m
            labels:
              project: "vos"
              severity: warning
            annotations:
              description: "AI能力中台API接口 {{ $labels.instance }}响应时间超过三秒,响应时间: {{ $value }}s"
      - name: studio接口超时
        rules:
          - alert: Api_slow
            expr: round(probe_duration_seconds{project="studio"}) > 3
            for: 1m
            labels:
              project: "studio"
              severity: warning
            annotations:
              description: "AI能力中台API接口 {{ $labels.instance }}响应时间超过三秒,响应时间: {{ $value }}s"
      - name: AI接口超时
        rules:
          - alert: Api_slow
            expr: round(probe_duration_seconds{project="ai"}) > 3
            for: 1m
            labels:
              project: "ai"
              severity: warning
            annotations:
              description: "AI能力中台API接口 {{ $labels.instance }}响应时间超过三秒,响应时间: {{ $value }}s"
      # ---------------------------------------------------------------
      # Rule set 2: formerly under a second, duplicate `groups:` key.
      # Group names that collided with rule set 1 carry a `funuobase-`
      # prefix; names that began with "-" are quoted, value unchanged.
      # ---------------------------------------------------------------
      - name: funuobase-存活告警测试
        rules:
          - alert: 实例down
            expr: up{instance!~".*gpu"} == 0
            for: 8m
            labels:
              project: funuobase
              severity: High
            annotations:
              description: "服务挂了 {{ $labels.instance }}"
      - name: funuobase-CPU报警规则
        rules:
          - alert: CPU使用率告警
            expr: round(avg(rate(node_cpu_seconds_total{mode="system"}[1m])) by (instance) *100 + avg(rate(node_cpu_seconds_total{mode="user"}[1m])) by (instance) *100 + avg(rate(node_cpu_seconds_total{mode="iowait"}[1m])) by (instance) *100) > 90
            for: 2m
            labels:
              project: funuobase
              severity: warning
            annotations:
              description: "服务器: CPU使用超过90%!(值: {{ $value }}%)"
      - name: funuobase-内存报警规则
        rules:
          - alert: 内存使用率告警
            expr: round((1 - (node_memory_MemAvailable_bytes / (node_memory_MemTotal_bytes)))* 100) > 85
            for: 5m
            labels:
              project: funuobase
              severity: warning
            annotations:
              description: "服务器: 内存使用超过85%!(值: {{ $value }}%)"
      - name: funuobase-磁盘报警规则
        rules:
          - alert: 磁盘使用率告警
            expr: round((node_filesystem_size_bytes{fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc|host|resolv).*"}-node_filesystem_free_bytes{fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc|host|resolv).*"}) *100/(node_filesystem_avail_bytes {fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc|host|resolv).*"}+(node_filesystem_size_bytes{fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc|host|resolv).*"}-node_filesystem_free_bytes{fstype=~"ext.*|xfs|nfs",mountpoint !~".*(pod|docker|boot|olddisk_sdc|host|resolv).*"}))) > 85
            for: 5m
            labels:
              project: funuobase
              severity: warning
            annotations:
              description: "服务器: 挂载目录{{ $labels.mountpoint }}使用超过85%!(值: {{ $value }}%)"
      - name: funuobase-rocketmq存活告警规则
        rules:
          - alert: rocketmq down
            # Was `count(rocketmq_broker_tps) by (cluster) < 1`, which can never
            # be true: count() yields no sample when the metric is missing and
            # at least 1 otherwise. absent() fires when no series exists.
            expr: absent(rocketmq_broker_tps)
            for: 8m
            labels:
              project: funuobase
              severity: warning
              instance: "rocketmq-export.default"
            annotations:
              description: "rocketmq 服务挂了 IP:{{ $labels.instance }}"
      - name: funuobase-rocketmq消息积压
        rules:
          - alert: rocketmq 消息积压
            # Positive producer-minus-consumer offset per (group, topic) for five
            # explicitly listed topics, alerting above 1000. A disabled variant of
            # this expression additionally had an identical clause for the topic
            # "om-sample-topic-sample".
            expr: (sum by(topic) (topk(1,rocketmq_producer_offset{topic=~"om-huizhi-topic"})) - on(topic) group_right() sum by(group, topic) (rocketmq_consumer_offset) and (sum by(topic) (topk(1,rocketmq_producer_offset{topic=~"om-huizhi-topic"})) - on(topic) group_right() sum by(group, topic) (rocketmq_consumer_offset) > 0) or sum by(topic) (topk(1,rocketmq_producer_offset{topic=~"om-huizhi-sub-topic"})) - on(topic) group_right() sum by(group, topic) (rocketmq_consumer_offset) and (sum by(topic) (topk(1,rocketmq_producer_offset{topic=~"om-huizhi-sub-topic"})) - on(topic) group_right() sum by(group, topic) (rocketmq_consumer_offset) > 0) or sum by(topic) (topk(1,rocketmq_producer_offset{topic=~"om-huizhi-topic-cut"})) - on(topic) group_right() sum by(group, topic) (rocketmq_consumer_offset) and (sum by(topic) (topk(1,rocketmq_producer_offset{topic=~"om-huizhi-topic-cut"})) - on(topic) group_right() sum by(group, topic) (rocketmq_consumer_offset) > 0) or sum by(topic) (topk(1,rocketmq_producer_offset{topic=~"om-huizhi-topic-video-stream"})) - on(topic) group_right() sum by(group, topic) (rocketmq_consumer_offset) and (sum by(topic) (topk(1,rocketmq_producer_offset{topic=~"om-huizhi-topic-video-stream"})) - on(topic) group_right() sum by(group, topic) (rocketmq_consumer_offset) > 0) or sum by(topic) (topk(1,rocketmq_producer_offset{topic=~"om-huizhi-topic-work"})) - on(topic) group_right() sum by(group, topic) (rocketmq_consumer_offset) and (sum by(topic) (topk(1,rocketmq_producer_offset{topic=~"om-huizhi-topic-work"})) - on(topic) group_right() sum by(group, topic) (rocketmq_consumer_offset) > 0)) > 1000
            for: 5m
            labels:
              project: test
              severity: warning
              instance: "rocketmq-export.default"
            annotations:
              description: "rocketmq消息积压topic:{{ $labels.topic }},积压值:{{ $value }}"
      - name: funuobase-rocketmq消费延时
        rules:
          - alert: rocketmq 消费延时
            expr: (sort_desc(sum(rocketmq_group_get_latency_by_storetime) by (broker,group,topic)) and (sum(rocketmq_group_get_latency_by_storetime) by (broker,group,topic) > 0))/1000 > 1000
            for: 3m
            labels:
              project: test
              severity: warning
              instance: "rocketmq-export.default"
            annotations:
              description: "rocketmq消费延时topic:{{ $labels.topic }},延迟值:值: {{ $value }}S"
      - name: "-redis连接数告警规则"
        rules:
          - alert: redis 连接数告警
            expr: redis_connected_clients > 7000
            for: 3m
            labels:
              project: test
              severity: warning
            annotations:
              description: "redis 服务连接数超过最大值的80%告警,值为:{{ $value }}"
      - name: "-mysql慢查询"
        rules:
          - alert: "-mysql slow queries"
            expr: sum by (instance) (rate(mysql_global_status_slow_queries[50s])) > 0.05
            for: 2m
            labels:
              project: test
              severity: warning
            annotations:
              description: "mysql 服务出现慢查询 IP:{{ $labels.instance }}"
      - name: "-mysql连接数告警"
        rules:
          - alert: mysql 连接数告警
            expr: mysql_global_status_threads_connected > mysql_global_variables_max_connections*0.7
            for: 3m
            labels:
              project: test
              severity: warning
            annotations:
              description: "mysql 服务连接数大于最大连接数70% IP:{{ $labels.instance }},当前连接数:{{ $value }}"
      - name: "-线程数告警"
        rules:
          - alert: 线程数告警
            expr: thread_num > 40000
            for: 2m
            labels:
              project: vos
              severity: warning
            annotations:
              description: "服务器IP:{{ $labels.hostname }},连接数大于四万,当前连接数:{{ $value }}"
      - name: "-POD状态规则"
        rules:
          - alert: "-Pod状态告警"
            # Matches containers that were not ready at some point in the last 1m.
            # Author's note (translated): with a 15s scrape and a 15s rule
            # evaluation interval the rule is evaluated at least 3 times within
            # that window, so the alert is always sent -- the intervals are not
            # visible in this file, TODO confirm.
            expr: min_over_time(kube_pod_container_status_ready{pod!~"rke-.*|skywalking-es-init.*|nightingale-categraf.*|nightingale-.*|nginx-ingress-controller-gpj25"} [1m])!=1
            for: 8m  # how long the condition must hold before the alert fires
            labels:
              project: funuobase
              severity: warning
            annotations:
              description: "K8S集群 namespace:{{ $labels.namespace }}, pod name, {{ $labels.pod }}状态异常"
      - name: "-接口规则"
        rules:
          - alert: "-接口告警"
            expr: linecount > 100
            for: 3m  # how long the condition must hold before the alert fires
            labels:
              project: linecount
              severity: warning
            annotations:
              description: "接口http://10.48.81.68:32068/check/getQueueInfo?isAllFlag=1阻塞值大于100:当前lineCount值为:{{ $value }}"
      - name: 接口挂了
        rules:
          - alert: 接口挂了
            expr: httpjiekou!=200
            for: 3m  # how long the condition must hold before the alert fires
            labels:
              project: wgetimagetime
              severity: warning
            annotations:
              description: "接口http://10.48.81.68:32068/api/getQueueInfo?isAllFlag=1请求不通"
      - name: "-下载图片耗时规则"
        rules:
          - alert: "-下载图片耗时告警"
            expr: wgetimagetime > 2
            for: 1m  # how long the condition must hold before the alert fires
            labels:
              project: wgetimagetime
              severity: warning
            annotations:
              description: "下载图片大于2s:下载耗时为:{{ $value }}s"