Create the directories
## Prometheus
mkdir -pv /data/prometheus/{conf,data}
mkdir -pv /data/prometheus/conf/{rules,targets}
mkdir -pv /data/prometheus/conf/targets/{servers,nodes,blackbox}
## Alertmanager
mkdir -pv /data/alertmanager/{conf,data,tmpl}
## Grafana
mkdir -pv /data/grafana/{conf,data,logs,plugins}
## docker-compose
mkdir -pv /data/docker-compose/{prometheus,alertmanager,grafana}
Configuration files
Prometheus configuration
Main configuration file
- File-based discovery of scrape targets
prometheus.yml
cat > /data/prometheus/conf/prometheus.yml << 'EOF'
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
  - file_sd_configs:
    - files:
      - targets/servers/alertmanagers.yaml

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- "rules/*.yaml"

# File-based service discovery
scrape_configs:
- job_name: 'prometheus'
  file_sd_configs:
  - files:
    - targets/servers/prometheus.yaml
    refresh_interval: 2m
  relabel_configs:
  - source_labels: [ '__address__' ]
    regex: "(.*):(.*)"
    target_label: 'instance'
    replacement: $1
    action: replace
- job_name: 'alertmanagers'
  file_sd_configs:
  - files:
    - targets/servers/alertmanagers.yaml
    refresh_interval: 2m
  relabel_configs:
  - source_labels: [ '__address__' ]
    regex: "(.*):(.*)"
    target_label: 'instance'
    replacement: $1
    action: replace
- job_name: 'grafana'
  file_sd_configs:
  - files:
    - targets/servers/grafana.yaml
    refresh_interval: 2m
  relabel_configs:
  - source_labels: [ '__address__' ]
    regex: "(.*):(.*)"
    target_label: 'instance'
    replacement: $1
    action: replace
- job_name: 'nodes'
  file_sd_configs:
  - files:
    - targets/nodes/node-exporter.yaml
    refresh_interval: 2m
  # relabel configs
  relabel_configs:
  - source_labels: [ '__address__' ]
    regex: "(.*):(.*)"
    target_label: 'instance'
    replacement: $1
    action: replace
- job_name: 'mysql'
  file_sd_configs:
  - files:
    - targets/servers/mysql-exporter.yaml
    refresh_interval: 2m
- job_name: 'redis-cluster'
  file_sd_configs:
  - files:
    - targets/servers/redis-exporter.yaml
    refresh_interval: 2m
  relabel_configs:
  - source_labels: [ '__address__' ]
    regex: "(.*):(.*)"
    target_label: 'instance'
    replacement: $1
    action: replace
- job_name: 'es-cluster'
  file_sd_configs:
  - files:
    - targets/servers/es-exporter.yaml
    refresh_interval: 2m
  relabel_configs:
  - source_labels: [ '__address__' ]
    regex: "(.*):(.*)"
    target_label: 'instance'
    replacement: $1
    action: replace
- job_name: 'kafka-cluster'
  file_sd_configs:
  - files:
    - targets/servers/kafka-exporter.yaml
    refresh_interval: 2m
  relabel_configs:
  - source_labels: [ '__address__' ]
    regex: "(.*):(.*)"
    target_label: 'instance'
    replacement: $1
    action: replace
- job_name: 'nacos-cluster'
  metrics_path: '/nacos/actuator/prometheus'
  file_sd_configs:
  - files:
    - targets/servers/nacos-exporter.yaml
    refresh_interval: 2m
  relabel_configs:
  - source_labels: [ '__address__' ]
    regex: "(.*):(.*)"
    target_label: 'instance'
    replacement: $1
    action: replace
- job_name: 'docker-engines'
  file_sd_configs:
  - files:
    - targets/nodes/docker-nodes.yaml
    refresh_interval: 2m
  relabel_configs:
  - source_labels: [ '__address__' ]
    regex: "(.*):(.*)"
    target_label: 'instance'
    replacement: $1
    action: replace
# Website checks
- job_name: 'http_status'
  metrics_path: /probe
  params:
    module: [http_2xx]
  file_sd_configs:
  - files:
    - targets/blackbox/http-status.yaml
  relabel_configs:
  - source_labels: [__address__]
    target_label: __param_target
  - source_labels: [__param_target]
    target_label: instance
  - target_label: __address__
    replacement: 3.1.101.40:9115
# PING checks
- job_name: 'ping_status'
  metrics_path: /probe
  params:
    module: [icmp]
  file_sd_configs:
  - files:
    - targets/blackbox/ping-status.yaml
  relabel_configs:
  - source_labels: [__address__]
    target_label: __param_target
  - source_labels: [__param_target]
    target_label: instance
  - target_label: __address__
    replacement: 3.1.101.40:9115
# Port checks
- job_name: 'port_status'
  metrics_path: /probe
  params:
    module: [tcp_connect]
  file_sd_configs:
  - files:
    - targets/blackbox/port-status.yaml
  relabel_configs:
  - source_labels: [__address__]
    target_label: __param_target
  - source_labels: [__param_target]
    target_label: instance
  - target_label: __address__
    replacement: 3.1.101.40:9115
EOF
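The main configuration can be syntax-checked with promtool before the server is ever started. A minimal sketch, assuming the prom/prometheus image from the deployment section below is available locally:
# Validate prometheus.yml (and any rule files it references) without starting the server
docker run --rm -v /data/prometheus/conf:/etc/prometheus \
  --entrypoint /bin/promtool prom/prometheus:v2.25.1 \
  check config /etc/prometheus/prometheus.yml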
Relabeling example:
# relabel configs
relabel_configs:
- source_labels:
  - __scheme__
  - __address__
  - __metrics_path__
  regex: "(http|https)(.*)"
  separator: ""
  target_label: "endpoint"
  replacement: "${1}://${2}"
  action: replace
- source_labels: [ '__address__' ]
  regex: "(.*):(.*)"
  target_label: 'ipaddr'
  replacement: $1
  action: replace
- regex: "(job|app)"
  replacement: ${1}_name
  action: labelmap
# metric relabel configs
metric_relabel_configs:
- source_labels:
  - __name__
  regex: "go_.*"
  action: drop
Target files
targets/servers/prometheus.yaml
cat > /data/prometheus/conf/targets/servers/prometheus.yaml << 'EOF'
- targets:
  - 3.1.101.39:9090
  labels:
    job: prometheus
    server: prometheus
    env: dev
EOF
targets/servers/alertmanagers.yaml
cat > /data/prometheus/conf/targets/servers/alertmanagers.yaml << 'EOF'
- targets:
  - 3.1.101.39:9093
  labels:
    job: alertmanager
    server: alertmanager
    env: dev
EOF
targets/servers/grafana.yaml
cat > /data/prometheus/conf/targets/servers/grafana.yaml << 'EOF'
- targets:
  - 3.1.101.40:3000
  labels:
    job: grafana
    server: grafana
    env: dev
EOF
targets/nodes/node-exporter.yaml
cat > /data/prometheus/conf/targets/nodes/node-exporter.yaml << 'EOF'
- targets:
  - 3.1.101.33:9100
  - 3.1.101.34:9100
  - 3.1.101.35:9100
  labels:
    job: node-exporter
    server: middleware
    env: dev
- targets:
  - 3.1.101.36:9100
  - 3.1.101.38:9100
  - 3.1.101.39:9100
  - 3.1.101.40:9100
  labels:
    job: node-exporter
    server: devops
    env: dev
- targets:
  - 3.1.101.41:9100
  - 3.1.101.42:9100
  labels:
    job: node-exporter
    server: weblogic
    env: dev
- targets:
  - 6.1.14.86:9100
  - 6.1.14.87:9100
  labels:
    job: node-exporter
    server: weblogic
    env: old-env
EOF
targets/servers/mysql-exporter.yaml
cat > /data/prometheus/conf/targets/servers/mysql-exporter.yaml << 'EOF'
- targets:
  - 3.1.101.36:9104
  labels:
    instance: 6.1.14.87
    job: mysql-exporter
    server: mysql
    env: dev
EOF
targets/servers/redis-exporter.yaml
cat > /data/prometheus/conf/targets/servers/redis-exporter.yaml << 'EOF'
- targets:
  - 3.1.101.33:9121
  - 3.1.101.34:9121
  - 3.1.101.35:9121
  labels:
    job: redis-exporter
    server: redis
    env: dev
    cluster: redis
EOF
targets/servers/es-exporter.yaml
cat > /data/prometheus/conf/targets/servers/es-exporter.yaml << 'EOF'
- targets:
  - 3.1.101.33:9115
  - 3.1.101.34:9115
  - 3.1.101.35:9115
  labels:
    job: es-exporter
    server: elasticsearch
    env: dev
    cluster: elk
EOF
targets/servers/kafka-exporter.yaml
cat > /data/prometheus/conf/targets/servers/kafka-exporter.yaml << 'EOF'
- targets:
  - 3.1.101.33:9308
  - 3.1.101.34:9308
  - 3.1.101.35:9308
  labels:
    job: kafka-exporter
    server: kafka
    env: dev
    cluster: kafka
EOF
targets/servers/nacos-exporter.yaml
cat > /data/prometheus/conf/targets/servers/nacos-exporter.yaml << 'EOF'
- targets:
  - 3.1.101.33:8848
  - 3.1.101.34:8848
  - 3.1.101.35:8848
  labels:
    job: nacos-exporter
    server: nacos
    env: dev
    cluster: nacos
EOF
targets/nodes/docker-nodes.yaml
cat > /data/prometheus/conf/targets/nodes/docker-nodes.yaml << 'EOF'
- targets:
  - 3.1.101.33:9180
  - 3.1.101.34:9180
  - 3.1.101.35:9180
  - 3.1.101.36:9180
  - 3.1.101.38:9180
  - 3.1.101.39:9180
  - 3.1.101.40:9180
  - 3.1.101.41:9180
  - 3.1.101.42:9180
  labels:
    job: cAdvisor
    server: docker
    env: dev
EOF
targets/blackbox/http-status.yaml
cat > /data/prometheus/conf/targets/blackbox/http-status.yaml << 'EOF'
- targets:
  - http://3.1.101.39:9090/
  labels:
    job: http-status
    server: prometheus
    env: dev
- targets:
  - http://3.1.101.40:3000/
  labels:
    job: http-status
    server: grafana
    env: dev
- targets:
  - http://3.1.101.40:5601/
  labels:
    job: http-status
    server: kibana
    env: dev
- targets:
  - http://3.1.101.36:8088/
  labels:
    job: http-status
    server: jumpserver
    env: dev
- targets:
  - http://3.1.101.35:8095/
  labels:
    job: http-status
    server: jira
    env: dev
- targets:
  - http://3.1.101.45:8848/nacos/
  labels:
    job: http-status
    server: nacos
    env: dev
- targets:
  - http://3.1.101.33:8858/
  - http://3.1.101.34:8858/
  labels:
    job: http-status
    server: sentinel
    env: dev
- targets:
  - http://3.1.101.41:7001/console
  - http://3.1.101.42:7001/console
  labels:
    job: http-status
    server: weblogic
    env: dev
- targets:
  - http://6.1.14.86:7001/console
  - http://6.1.14.87:7001/console
  labels:
    job: http-status
    server: weblogic
    env: old-env
EOF
targets/blackbox/ping-status.yaml
cat > /data/prometheus/conf/targets/blackbox/ping-status.yaml << 'EOF'
- targets:
  - 3.1.101.33
  - 3.1.101.34
  - 3.1.101.35
  labels:
    job: ping-status
    server: middleware
    env: dev
- targets:
  - 3.1.101.36
  - 3.1.101.38
  - 3.1.101.39
  - 3.1.101.40
  labels:
    job: ping-status
    server: devops
    env: dev
- targets:
  - 3.1.101.41
  - 3.1.101.42
  labels:
    job: ping-status
    server: weblogic
    env: dev
- targets:
  - 6.1.14.86
  - 6.1.14.87
  labels:
    job: ping-status
    server: weblogic
    env: old-env
EOF
targets/blackbox/port-status.yaml
cat > /data/prometheus/conf/targets/blackbox/port-status.yaml << 'EOF'
# Monitoring stack
- targets:
  - 3.1.101.39:9090
  labels:
    job: port-status
    server: prometheus
    env: dev
- targets:
  - 3.1.101.40:3000
  labels:
    job: port-status
    server: grafana
    env: dev
- targets:
  - 3.1.101.39:9093
  labels:
    job: port-status
    server: alertmanager
    env: dev
# Redis cluster
- targets:
  - 3.1.101.33:6379
  - 3.1.101.34:6379
  - 3.1.101.35:6379
  labels:
    job: port-status
    server: redis-master
    env: dev
- targets:
  - 3.1.101.33:26379
  - 3.1.101.34:26379
  - 3.1.101.35:26379
  labels:
    job: port-status
    server: redis-slave
    env: dev
# Kafka cluster
- targets:
  - 3.1.101.33:9092
  - 3.1.101.34:9092
  - 3.1.101.35:9092
  labels:
    job: port-status
    server: kafka
    env: dev
- targets:
  - 3.1.101.33:2181
  - 3.1.101.34:2181
  - 3.1.101.35:2181
  labels:
    job: port-status
    server: zookeeper
    env: dev
# Alibaba microservice middleware
- targets:
  - 3.1.101.33:8848
  - 3.1.101.34:8848
  - 3.1.101.35:8848
  labels:
    job: port-status
    server: nacos
    env: dev
- targets:
  - 3.1.101.33:8091
  - 3.1.101.34:8091
  - 3.1.101.35:8091
  labels:
    job: port-status
    server: seata
    env: dev
- targets:
  - 3.1.101.33:8858
  - 3.1.101.34:8858
  labels:
    job: port-status
    server: sentinel
    env: dev
# ELK logging stack
- targets:
  - 3.1.101.33:9200
  - 3.1.101.34:9200
  - 3.1.101.35:9200
  labels:
    job: port-status
    server: elasticsearch
    env: dev
- targets:
  - 3.1.101.33:5044
  - 3.1.101.34:5044
  labels:
    job: port-status
    server: logstash
    env: dev
- targets:
  - 3.1.101.40:5601
  labels:
    job: port-status
    server: kibana
    env: dev
# MySQL database
- targets:
  - 6.1.14.87:3306
  labels:
    job: port-status
    server: mysql
    env: dev
# WebLogic services
- targets:
  - 3.1.101.41:7001
  - 3.1.101.42:7001
  labels:
    job: port-status
    server: weblogic
    env: dev
- targets:
  - 6.1.14.86:7001
  - 6.1.14.87:7001
  labels:
    job: port-status
    server: weblogic
    env: old-env
# DevOps platform
- targets:
  - 3.1.101.38:389
  labels:
    job: port-status
    server: openldap
    env: dev
- targets:
  - 3.1.101.36:8088
  labels:
    job: port-status
    server: jumpserver
    env: dev
- targets:
  - 3.1.101.38:8098
  labels:
    job: port-status
    server: gitlab
    env: dev
- targets:
  - 3.1.101.36:8080
  labels:
    job: port-status
    server: jenkins
    env: dev
- targets:
  - 3.1.101.36:9000
  labels:
    job: port-status
    server: sonarqube
    env: dev
- targets:
  - 3.1.101.35:8095
  labels:
    job: port-status
    server: jira
    env: dev
EOF
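Once Prometheus is running, the file-based discovery can be verified through the targets API; a quick check, assuming the server address used throughout this setup and jq installed on the host:
# List the discovered targets with their scrape health
curl -s http://3.1.101.39:9090/api/v1/targets | \
  jq '.data.activeTargets[] | {job: .labels.job, instance: .labels.instance, health: .health}'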
Alertmanager configuration
Main configuration file
alertmanager.yml example
global:
  resolve_timeout: 5m
  # SMTP settings
  smtp_from: 'wangshui898@sina.com'
  smtp_smarthost: 'smtp.sina.com:465'
  smtp_auth_username: 'wangshui898@sina.com'
  smtp_auth_password: 'Authorization code'
  smtp_require_tls: false

# Notification templates
templates:
- 'tmpl/*.tmpl'

route:
  receiver: default-receiver        # alerts that match none of the sub-routes stay on the root node and go to "default-receiver"
  group_wait: 30s                   # initial wait before sending a notification for a new group; default 30s
  group_interval: 2m                # wait before sending a notification about new alerts added to a group; usually 5m or more
  repeat_interval: 1h               # how long to wait before re-sending a notification that has already been sent
  group_by: ['alertname','server']  # alerts that share the values of these labels are merged into one notification to the receiver
  routes:                           # sub-routes inherit all attributes of the parent route and may be nested further
  - match:                          # matches on alert labels and sends to the given receiver
      severity: critical
    receiver: 'default-receiver'
  - match:
      server: mysql
    receiver: 'dba'
  - match_re:                       # regular-expression match on alert labels
      service: ccms-.*
    receiver: 'dev'

# Inhibition rules
#inhibit_rules:
#- source_match:
#    severity: 'critical'
#  target_match:
#    severity: 'warning'
#  equal: ['alertname']

receivers:                          # define who receives the alerts
- name: 'default-receiver'
  email_configs:
  - to: 'wangshui898@126.com'
    html: '{{ template "email.to.html" .}}'
    send_resolved: true
- name: 'dev'
  email_configs:
  - to: 'wangshui898@163.com'
    send_resolved: true
- name: 'dba'
  email_configs:
  - to: 'wangshui898@sina.com'
    send_resolved: true
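The file can be validated with amtool before (re)starting the service; a sketch, reusing the prom/alertmanager image from the deployment section below:
docker run --rm -v /data/alertmanager/conf:/etc/alertmanager \
  --entrypoint /bin/amtool prom/alertmanager:v0.23.0 \
  check-config /etc/alertmanager/alertmanager.yml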
alertname corresponds to groups[].rules[].alert in the Prometheus alerting rules.
Alert template files
Official template reference: https://raw.githubusercontent.com/prometheus/alertmanager/master/template/default.tmpl
Email alert template
mkdir -pv /data/alertmanager/conf/tmpl
cat > /data/alertmanager/conf/tmpl/e-mail.tmpl << 'EOF'
{{ define "email.to.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts.Firing -}}
{{- if eq $index 0 -}}
========= Alert Notification ==========<br>
Alert type: {{ .Labels.alertname }} <br>
Severity: {{ .Labels.severity }} <br>
{{- end }}
----------------------------<br>
Summary: {{ .Annotations.title }} <br>
Details: {{ .Annotations.description }} <br>
Started at: {{ .StartsAt.Local }} <br>
{{ if gt (len .Labels.instance) 0 -}}Host: {{ .Labels.instance }} <br>{{- end -}}
========= Do Not Reply ==========<br>
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts.Resolved -}}
{{- if eq $index 0 -}}
========== Alert Resolved ==========<br>
Alert type: {{ .Labels.alertname }} <br>
Severity: {{ .Labels.severity }} <br>
{{- end }}
----------------------------<br>
Summary: {{ .Annotations.title }} <br>
Details: {{ .Annotations.description }}, resolved <br>
Started at: {{ .StartsAt.Local }} <br>
Resolved at: {{ .EndsAt.Local }} <br>
{{ if gt (len .Labels.instance) 0 -}}Host: {{ .Labels.instance }} <br>{{- end -}}
=========== Do Not Reply ===========<br>
{{- end }}
{{- end }}
{{- end }}
EOF
Variables used in the template must match the fields defined in the Prometheus alerting rules, e.g. title and description.
{{- if gt (len .Alerts.Firing) 0 -}}               # checks whether the list of firing alerts is non-empty
{{- range $index, $alert := .Alerts.Firing -}}     # iterates over the alert list; $index is the index, $alert the element
{{ if gt (len .Labels.instance) 0 -}}Host: {{ .Labels.instance }} <br>{{- end -}}   # emits the host line only when the instance label exists
Available template variables:
.Receiver: name of the receiver
.Status: firing while alerting, resolved once recovered
.Alerts: list of all alert objects
.Alerts.Firing: list of firing alerts
.Alerts.Resolved: list of resolved alerts
.GroupLabels: labels the alerts are grouped by
.CommonLabels: labels common to all alerts
.CommonAnnotations: annotations common to all alerts
.ExternalURL: link back to the Alertmanager that sent the notification
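Routing decisions can also be dry-run against the route tree above. A sketch, assuming the amtool bundled in the v0.23.0 image supports the config routes test subcommand:
# A label set carrying server=mysql should be routed to the 'dba' receiver
docker run --rm -v /data/alertmanager/conf:/etc/alertmanager \
  --entrypoint /bin/amtool prom/alertmanager:v0.23.0 \
  config routes test --config.file=/etc/alertmanager/alertmanager.yml server=mysql severity=warning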
Service deployment
- Docker-compose
Prometheus
version: "3"
services:
prometheus:
container_name: prometheus
image: prom/prometheus:v2.25.1
user: root
ports:
- 9090:9090
restart: always
volumes:
- /etc/localtime:/etc/localtime
- /data/prometheus/conf:/etc/prometheus
- /data/prometheus/data:/prometheus
environment:
TZ: Asia/Shanghai
command: --config.file=/etc/prometheus/prometheus.yml --web.enable-lifecycle --storage.tsdb.retention=30d
deploy:
resources:
limits:
memory: 2G
reservations:
memory: 1G
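Assuming this file is saved as /data/docker-compose/prometheus/docker-compose.yml, the service can be started and probed for readiness:
cd /data/docker-compose/prometheus && docker-compose up -d
curl -s http://3.1.101.39:9090/-/ready   # should report that Prometheus is ready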
Alertmanager
version: "3"
services:
alertmanager:
container_name: alertmanager
image: prom/alertmanager:v0.23.0
user: root
ports:
- 9093:9093
restart: always
volumes:
- /etc/localtime:/etc/localtime
- /data/alertmanager/conf:/etc/alertmanager
- /data/alertmanager/data:/alertmanager
environment:
TZ: Asia/Shanghai
deploy:
resources:
limits:
memory: 2G
reservations:
memory: 1G
Grafana
version: "3"
services:
  grafana:
    container_name: grafana
    image: grafana/grafana:7.4.1
    user: root
    ports:
    - 3000:3000
    restart: always
    environment:
      TZ: Asia/Shanghai
      GF_PATHS_DATA: /data/grafana/data
      GF_PATHS_LOGS: /data/grafana/logs
    volumes:
    - /etc/localtime:/etc/localtime
    - /data/grafana:/data/grafana
    - /data/grafana/plugins:/var/lib/grafana/plugins
    deploy:
      resources:
        limits:
          memory: 2G
        reservations:
          memory: 1G
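After starting, Grafana's health endpoint confirms the container is serving (address as used in the target files):
curl -s http://3.1.101.40:3000/api/health   # should report "database": "ok"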
Exporter deployment
- docker-compose
node_exporter
version: "3"
services:
node_exporter:
container_name: node_exporter
image: prom/node-exporter:v1.1.2
restart: always
network_mode: host
command:
- '--web.listen-address=:9100'
- '--path.rootfs=/rootfs'
- '--path.procfs=/host/proc'
- '--path.sysfs=/host/sys'
- '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc)($$|/)'
# - '--collector.textfile.directory=/node_exporter/prom'
volumes:
- /proc:/host/proc
- /sys:/host/sys
- /:/rootfs
Common Grafana dashboards for Linux hosts: 8919, 1860
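A quick way to confirm an exporter is up is to pull a metric straight from its endpoint, e.g. on one of the middleware hosts:
curl -s http://3.1.101.33:9100/metrics | grep '^node_load5'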
mysql-exporter
version: "3"
services:
mysql_exporter:
container_name: mysql_exporter
image: prom/mysqld-exporter:v0.13.0
restart: always
ports:
- "9104:9104"
environment:
# mysql服务端, 需要配置具有查询权限的用户:
# CREATE USER 'exporter'@'%' IDENTIFIED WITH mysql_native_password BY 'Aa@123456';
# GRANT PROCESS, REPLICATION CLIENT, SELECT ON *.* TO 'exporter'@'%';
# 格式为 dbuser:dbpasswd@(mysqlip:port)/dbname, 实际应用当中,应该配置为具体需要监控的数据库实例
DATA_SOURCE_NAME: "exporter:Aa@123456@(172.16.20.111:3306)/"
Common Grafana dashboard for MySQL: 7362
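To verify the exporter can actually reach the database, check the mysql_up metric it exposes on the published port:
curl -s http://3.1.101.36:9104/metrics | grep '^mysql_up'   # mysql_up 1 means the connection works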
oracle-exporter
version: "3"
services:
oracle-exporter:
container_name: oracle_exporter
image: iamseth/oracledb_exporter:latest
hostname: ccms-odb-sit1
restart: always
ports:
- 9161:9161
volumes:
- /etc/localtime:/etc/localtime
environment:
- TZ=Asia/Shanghai
- DATA_SOURCE_NAME=system/System123@3.1.101.43:1521/loandb
Common Grafana dashboard: 11121
redis-exporter
version: "3"
services:
redis_exporter:
container_name: redis_exporter
image: bitnami/redis-exporter:1.20.0
restart: always
ports:
- "9121:9121"
command: "-redis.addr 3.1.101.33:6379 -redis-only-metrics -web.listen-address 0.0.0.0:9121 -redis.password g1tredis2o2l"
Common Grafana dashboard for Redis: 11835
elasticsearch_exporter
version: "3"
services:
es_exporter:
container_name: es_exporter
image: justwatch/elasticsearch_exporter:1.1.0
restart: always
ports:
- "9115:9115"
command: "--es.all --es.indices --es.cluster_settings --es.indices_settings --es.shards --es.snapshots --es.timeout=10s --web.listen-address=0.0.0.0:9115 --web.telemetry-path=/metrics --es.uri http://3.1.101.33:9200"
Common Grafana dashboard for Elasticsearch: 2322
kafka_exporter
version: "3"
services:
kafka_exporter:
container_name: kafka_exporter
image: danielqsj/kafka-exporter:v1.3.0
restart: always
ports:
- "9308:9308"
command: "--kafka.server=3.1.101.33:9092"
Common Grafana dashboards for Kafka: 13572, 7589
cAdvisor
version: "3"
services:
cadvisor:
container_name: cadvisor
image: google/cadvisor:v0.33.0
user: root
privileged: true
ports:
- 9180:8080
restart: always
volumes:
- /:/rootfs
- /var/run:/var/run
- /sys:/sys
- /var/lib/docker/:/var/lib/docker
- /dev/disk/:/dev/disk
Common Grafana dashboards for Docker: 13584, 13946
blackbox-exporter
Create the directory
mkdir -pv /data/exporter/blackbox_exporter
Create the configuration file
cat > /data/exporter/blackbox_exporter/config.yml << 'EOF'
modules:
  http_2xx:
    prober: http
  http_post_2xx:
    prober: http
    http:
      method: POST
  tcp_connect:
    prober: tcp
  pop3s_banner:
    prober: tcp
    tcp:
      query_response:
      - expect: "^+OK"
      tls: true
      tls_config:
        insecure_skip_verify: false
  ssh_banner:
    prober: tcp
    tcp:
      query_response:
      - expect: "^SSH-2.0-"
      - send: "SSH-2.0-blackbox-ssh-check"
  irc_banner:
    prober: tcp
    tcp:
      query_response:
      - send: "NICK prober"
      - send: "USER prober prober prober :prober"
      - expect: "PING :([^ ]+)"
        send: "PONG ${1}"
      - expect: "^:[^ ]+ 001"
  icmp:
    prober: icmp
EOF
Docker Compose file
version: "3"
services:
blackbox-exporter:
container_name: blackbox-exporter
image: prom/blackbox-exporter:v0.19.0
restart: always
ports:
- 9115:9115
volumes:
- /etc/localtime:/etc/localtime
- /data/exporter/blackbox_exporter:/etc/blackbox_exporter
deploy:
resources:
limits:
memory: 2G
reservations:
memory: 1G
Common Grafana dashboard: 9965
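Individual modules can be exercised by hand before wiring them into Prometheus; for example, probing the Grafana URL through the http_2xx module on the blackbox host:
curl -s 'http://3.1.101.40:9115/probe?module=http_2xx&target=http://3.1.101.40:3000/' | grep '^probe_success'   # probe_success 1 means the probe passed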
Common Prometheus alerting rules
Reference: https://awesome-prometheus-alerts.grep.to/rules
Hot-reloading alerting rules
Add the --web.enable-lifecycle flag to the startup parameters, then issue the following POST request from a terminal: curl -X POST http://IP:port/-/reload
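For this deployment that translates to, assuming the addresses used above:
curl -X POST http://3.1.101.39:9090/-/reload   # Prometheus
curl -X POST http://3.1.101.39:9093/-/reload   # Alertmanager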
Prometheus.rules
cat > /data/prometheus/conf/rules/Prometheus.yaml << 'EOF'
groups:
- name: Prometheus.rules
  rules:
  - alert: PrometheusAllTargetsMissing
    expr: count by (job) (up) == 0
    for: 2m
    labels:
      severity: critical
    annotations:
      title: 'Prometheus all targets missing'
      description: "A Prometheus job does not have living target anymore."
  - alert: PrometheusConfigurationReloadFailure
    expr: prometheus_config_last_reload_successful != 1
    for: 0m
    labels:
      severity: warning
    annotations:
      title: 'Prometheus configuration reload failure'
      description: "Prometheus: 【{{ $labels.instance }}】 configuration reload error."
  - alert: PrometheusTooManyRestarts
    expr: changes(process_start_time_seconds{job=~"prometheus|pushgateway|alertmanager"}[15m]) > 2
    for: 0m
    labels:
      severity: warning
    annotations:
      title: 'Prometheus too many restarts'
      description: "Prometheus: 【{{ $labels.instance }}】 has restarted more than twice in the last 15 minutes. It might be crashlooping."
  - alert: PrometheusAlertmanagerConfigurationReloadFailure
    expr: alertmanager_config_last_reload_successful != 1
    for: 0m
    labels:
      severity: warning
    annotations:
      title: 'Prometheus AlertManager configuration reload failure'
      description: "AlertManager: 【{{ $labels.instance }}】 configuration reload error"
  - alert: PrometheusNotificationsBacklog
    expr: min_over_time(prometheus_notifications_queue_length[10m]) > 0
    for: 1m
    labels:
      severity: warning
    annotations:
      title: 'Prometheus notifications backlog'
      description: "Prometheus: 【{{ $labels.instance }}】 The notification queue has not been empty for 10 minutes"
  - alert: PrometheusAlertmanagerNotificationFailing
    expr: rate(alertmanager_notifications_failed_total[1m]) > 0
    for: 1m
    labels:
      severity: critical
    annotations:
      title: 'Prometheus AlertManager notification failing'
      description: "AlertManager: 【{{ $labels.instance }}】 is failing sending notifications"
  - alert: PrometheusTsdbCheckpointCreationFailures
    expr: increase(prometheus_tsdb_checkpoint_creations_failed_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Prometheus TSDB checkpoint creation failures'
      description: "Prometheus: 【{{ $labels.instance }}】 encountered {{ $value }} checkpoint creation failures"
  - alert: PrometheusTsdbCheckpointDeletionFailures
    expr: increase(prometheus_tsdb_checkpoint_deletions_failed_total[1m]) > 0
    for: 1m
    labels:
      severity: critical
    annotations:
      title: 'Prometheus TSDB checkpoint deletion failures'
      description: "Prometheus: 【{{ $labels.instance }}】 encountered {{ $value }} checkpoint deletion failures"
  - alert: PrometheusTsdbCompactionsFailed
    expr: increase(prometheus_tsdb_compactions_failed_total[1m]) > 0
    for: 1m
    labels:
      severity: critical
    annotations:
      title: 'Prometheus TSDB compactions failed'
      description: "Prometheus: 【{{ $labels.instance }}】 encountered {{ $value }} TSDB compactions failures"
  - alert: PrometheusTsdbHeadTruncationsFailed
    expr: increase(prometheus_tsdb_head_truncations_failed_total[1m]) > 0
    for: 1m
    labels:
      severity: critical
    annotations:
      title: 'Prometheus TSDB head truncations failed'
      description: "Prometheus: 【{{ $labels.instance }}】 encountered {{ $value }} TSDB head truncation failures"
  - alert: PrometheusTsdbReloadFailures
    expr: increase(prometheus_tsdb_reloads_failures_total[1m]) > 0
    for: 1m
    labels:
      severity: critical
    annotations:
      title: 'Prometheus TSDB reload failures'
      description: "Prometheus: 【{{ $labels.instance }}】 encountered {{ $value }} TSDB reload failures"
EOF
Hosts.rules
cat > /data/prometheus/conf/rules/Hosts.yaml << 'EOF'
groups:
- name: Hosts.rules
  rules:
  ## Custom by wangshui
  - alert: HostDown
    expr: up{job=~"node-exporter|prometheus|grafana|alertmanager"} == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Instance down'
      description: "Host 【{{ $labels.instance }}】 has been down for more than 1 minute"
  - alert: HostCpuLoadAverage
    expr: sum(node_load5) by (instance) > 10
    for: 1m
    annotations:
      title: "High 5-minute CPU load"
      description: "Host 【{{ $labels.instance }}】 5-minute load average above 10 (current value: {{ $value }})"
    labels:
      severity: 'warning'
  - alert: HostCpuUsage
    expr: (1-((sum(increase(node_cpu_seconds_total{mode="idle"}[5m])) by (instance))/ (sum(increase(node_cpu_seconds_total[5m])) by (instance))))*100 > 80
    for: 1m
    annotations:
      title: "High CPU usage"
      description: "Host 【{{ $labels.instance }}】 CPU usage above 80% over the last 5 minutes (current value: {{ $value }})"
    labels:
      severity: 'warning'
  - alert: HostMemoryUsage
    expr: (1-((node_memory_Buffers_bytes + node_memory_Cached_bytes + node_memory_MemFree_bytes)/node_memory_MemTotal_bytes))*100 > 80
    for: 1m
    annotations:
      title: "Host memory usage above 80%"
      description: "Host 【{{ $labels.instance }}】 memory usage above 80% (current usage: {{ $value }}%)"
    labels:
      severity: 'warning'
  - alert: HostIOWait
    expr: ((sum(increase(node_cpu_seconds_total{mode="iowait"}[5m])) by (instance))/(sum(increase(node_cpu_seconds_total[5m])) by (instance)))*100 > 10
    for: 1m
    annotations:
      title: "High disk I/O wait"
      description: "Host 【{{ $labels.instance }}】 I/O wait above 10% over the last 5 minutes (current value: {{ $value }})"
    labels:
      severity: 'warning'
  - alert: HostFileSystemUsage
    expr: (1-(node_filesystem_free_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" }/node_filesystem_size_bytes{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" }))*100 > 70
    for: 1m
    annotations:
      title: "Low free disk space"
      description: "Host 【{{ $labels.instance }}】 partition {{ $labels.mountpoint }} usage above 70% (current usage: {{ $value }}%)"
    labels:
      severity: 'warning'
  - alert: HostSwapIsFillingUp
    expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "Host swap filling up"
      description: "Host 【{{ $labels.instance }}】 swap usage above 80% (current usage: {{ $value }}%)"
  - alert: HostNetworkConnectionEstablished
    expr: sum(node_netstat_Tcp_CurrEstab) by (instance) > 1000
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      title: "Too many ESTABLISHED connections"
      description: "Host 【{{ $labels.instance }}】 has more than 1000 ESTABLISHED connections (current count: {{ $value }})"
  - alert: HostNetworkConnectionTimeWait
    expr: sum(node_sockstat_TCP_tw) by (instance) > 1000
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      title: "Too many TIME_WAIT connections"
      description: "Host 【{{ $labels.instance }}】 has more than 1000 TIME_WAIT connections (current count: {{ $value }})"
  - alert: HostUnusualNetworkThroughputIn
    expr: sum by (instance, device) (rate(node_network_receive_bytes_total{device=~"ens.*"}[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      title: "High inbound network throughput"
      description: "Host 【{{ $labels.instance }}】, NIC {{ $labels.device }}: inbound traffic above 100 MB/s (current value: {{ $value }})"
  - alert: HostUnusualNetworkThroughputOut
    expr: sum by (instance, device) (rate(node_network_transmit_bytes_total{device=~"ens.*"}[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      title: "High outbound network throughput"
      description: "Host 【{{ $labels.instance }}】, NIC {{ $labels.device }}: outbound traffic above 100 MB/s (current value: {{ $value }})"
  - alert: HostUnusualDiskReadRate
    expr: sum by (instance, device) (rate(node_disk_read_bytes_total{device=~"sd.*"}[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: 'warning'
    annotations:
      title: "High disk read rate"
      description: "Host 【{{ $labels.instance }}】, disk {{ $labels.device }}: read rate above 50 MB/s (current value: {{ $value }})"
  - alert: HostUnusualDiskWriteRate
    expr: sum by (instance, device) (rate(node_disk_written_bytes_total{device=~"sd.*"}[2m])) / 1024 / 1024 > 50
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "High disk write rate"
      description: "Host 【{{ $labels.instance }}】, disk {{ $labels.device }}: write rate above 50 MB/s (current value: {{ $value }})"
  - alert: HostOutOfInodes
    expr: node_filesystem_files_free{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" } / node_filesystem_files{fstype=~"ext4|xfs",mountpoint!~".*tmp|.*boot" } * 100 < 10
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "Host partition running out of inodes"
      description: "Host 【{{ $labels.instance }}】 partition {{ $labels.mountpoint }} is running low on inodes (less than {{ $value }}% free)"
  - alert: HostUnusualDiskReadLatency
    expr: rate(node_disk_read_time_seconds_total{device=~"sd.*"}[1m]) / rate(node_disk_reads_completed_total{device=~"sd.*"}[1m]) > 0.1 and rate(node_disk_reads_completed_total{device=~"sd.*"}[1m]) > 0
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "High disk read latency"
      description: "Host 【{{ $labels.instance }}】, disk {{ $labels.device }}: read latency above 100ms (current value: {{ $value }}s)"
  - alert: HostUnusualDiskWriteLatency
    expr: rate(node_disk_write_time_seconds_total{device=~"sd.*"}[1m]) / rate(node_disk_writes_completed_total{device=~"sd.*"}[1m]) > 0.1 and rate(node_disk_writes_completed_total{device=~"sd.*"}[1m]) > 0
    for: 2m
    labels:
      severity: 'warning'
    annotations:
      title: "High disk write latency"
      description: "Host 【{{ $labels.instance }}】, disk {{ $labels.device }}: write latency above 100ms (current value: {{ $value }}s)"
EOF
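Any rule expression can be tested against the query API before it is committed; for example, evaluating the HostCpuUsage expression by hand (jq only for readability):
curl -sG http://3.1.101.39:9090/api/v1/query \
  --data-urlencode 'query=(1-((sum(increase(node_cpu_seconds_total{mode="idle"}[5m])) by (instance))/ (sum(increase(node_cpu_seconds_total[5m])) by (instance))))*100' | \
  jq '.data.result[] | {instance: .metric.instance, value: .value[1]}'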
Blackbox.rules
cat > /data/prometheus/conf/rules/Blackbox.yaml << 'EOF'
groups:
- name: Blackbox.rules
  rules:
  - alert: HostConnectionFailure
    expr: probe_success{job="ping-status"} == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: Host Connection Failure
      description: "Host 【{{ $labels.instance }}】 cannot be connected"
  - alert: ServiceConnectionFailure
    expr: probe_success{job="port-status"} == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: Service Connection Failure
      description: "Service 【{{ $labels.server }}】 on host 【{{ $labels.instance }}】 cannot be connected"
  - alert: BlackboxSlowProbeOnServer
    expr: avg_over_time(probe_duration_seconds{job="port-status"}[1m]) > 1
    for: 1m
    labels:
      severity: warning
    annotations:
      title: Service probe timeout
      description: "Service 【{{ $labels.server }}】 on host 【{{ $labels.instance }}】: Blackbox probe took more than 1s to complete, current value: {{ $value }}s"
  - alert: BlackboxSlowProbeOnWebsite
    expr: avg_over_time(probe_duration_seconds{job="http-status"}[1m]) > 1
    for: 1m
    labels:
      severity: warning
    annotations:
      title: Website probe timeout
      description: "Website 【{{ $labels.instance }}】: Blackbox probe took more than 1s to complete, current value: {{ $value }}s"
  - alert: BlackboxProbeHttpFailure
    expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
    for: 0m
    labels:
      severity: critical
      service: web
    annotations:
      title: Blackbox probe HTTP failure
      description: "Website 【{{ $labels.instance }}】: HTTP status code is abnormal, current status code: {{ $value }}"
  - alert: BlackboxSslCertificateWillExpireSoonIn30days
    expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 30
    for: 0m
    labels:
      severity: warning
    annotations:
      title: Blackbox SSL certificate will expire soon
      description: "Website 【{{ $labels.instance }}】: SSL certificate expires in 30 days"
  - alert: BlackboxSslCertificateWillExpireSoonIn3days
    expr: probe_ssl_earliest_cert_expiry - time() < 86400 * 3
    for: 0m
    labels:
      severity: critical
    annotations:
      title: Blackbox SSL certificate will expire soon
      description: "Website 【{{ $labels.instance }}】: SSL certificate expires in 3 days"
  - alert: BlackboxSslCertificateExpired
    expr: probe_ssl_earliest_cert_expiry - time() <= 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: Blackbox SSL certificate expired
      description: "Website 【{{ $labels.instance }}】: SSL certificate has expired already"
  - alert: BlackboxProbeSlowHttp
    expr: avg_over_time(probe_http_duration_seconds[1m]) > 1
    for: 1m
    labels:
      severity: warning
    annotations:
      title: Blackbox probe slow HTTP
      description: "Website 【{{ $labels.instance }}】: HTTP request took more than 1s, current value: {{ $value }}s"
  - alert: BlackboxProbeSlowPing
    expr: avg_over_time(probe_icmp_duration_seconds[1m]) > 1
    for: 1m
    labels:
      severity: warning
    annotations:
      title: Blackbox probe slow ping
      description: "Host 【{{ $labels.instance }}】: Blackbox ping took more than 1s, current value: {{ $value }}s"
EOF
Mysql.rules
cat > /data/prometheus/conf/rules/Mysql.yaml << 'EOF'
groups:
- name: Mysql.rules
  rules:
  ## MySQL alarm rules
  - alert: MysqlDown
    expr: mysql_up == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'MySQL down'
      description: "MySQL instance 【{{ $labels.instance }}】: MySQL instance is down"
  - alert: MysqlRestarted
    expr: mysql_global_status_uptime < 60
    for: 0m
    labels:
      severity: info
    annotations:
      title: 'MySQL Restarted'
      description: "MySQL instance 【{{ $labels.instance }}】: MySQL has just been restarted, less than one minute ago"
  - alert: MysqlTooManyConnections
    expr: avg by (instance) (rate(mysql_global_status_threads_connected[1m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'MySQL too many connections (> 80%)'
      description: "MySQL instance 【{{ $labels.instance }}】: more than 80% of MySQL connections are in use, current value: {{ $value }}%"
  - alert: MysqlThreadsRunningHigh
    expr: mysql_global_status_threads_running > 40
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'MySQL Threads_Running high'
      description: "MySQL instance 【{{ $labels.instance }}】: Threads_Running above the threshold (40), current value: {{ $value }}"
  - alert: MysqlQpsHigh
    expr: sum by (instance) (rate(mysql_global_status_queries[2m])) > 500
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'MySQL QPS high'
      description: "MySQL instance 【{{ $labels.instance }}】: MySQL QPS above 500"
  - alert: MysqlSlowQueries
    expr: increase(mysql_global_status_slow_queries[1m]) > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'MySQL slow queries'
      description: "MySQL instance 【{{ $labels.instance }}】 has new slow queries."
  - alert: MysqlTooManyAbortedConnections
    expr: round(increase(mysql_global_status_aborted_connects[5m])) > 20
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'MySQL too many aborted connections in 5 minutes'
      description: "MySQL instance 【{{ $labels.instance }}】: {{ $value }} aborted connections within 5 minutes"
  - alert: MysqlTooManyAbortedClients
    expr: round(increase(mysql_global_status_aborted_clients[120m])) > 10
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'MySQL too many aborted clients in 2 hours'
      description: "MySQL instance 【{{ $labels.instance }}】: {{ $value }} aborted clients within 2 hours"
  - alert: MysqlSlaveIoThreadNotRunning
    expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_io_running == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'MySQL Slave IO thread not running'
      description: "MySQL instance 【{{ $labels.instance }}】: MySQL Slave IO thread not running"
  - alert: MysqlSlaveSqlThreadNotRunning
    expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_sql_running == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'MySQL Slave SQL thread not running'
      description: "MySQL instance 【{{ $labels.instance }}】: MySQL Slave SQL thread not running"
  - alert: MysqlSlaveReplicationLag
    expr: mysql_slave_status_master_server_id > 0 and ON (instance) (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) > 30
    for: 1m
    labels:
      severity: critical
    annotations:
      title: 'MySQL Slave replication lag'
      description: "MySQL instance 【{{ $labels.instance }}】: MySQL replication lag"
  - alert: MysqlInnodbLogWaits
    expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
    for: 0m
    labels:
      severity: warning
    annotations:
      title: 'MySQL InnoDB log waits'
      description: "MySQL instance 【{{ $labels.instance }}】: InnoDB log writes stalling"
EOF
Redis.rules
cat > /data/prometheus/conf/rules/redis.yaml << 'EOF'
groups:
- name: Redis.rules
  rules:
  ## Redis alarm rules
  - alert: RedisDown
    expr: redis_up == 0
    for: 1m
    labels:
      severity: critical
    annotations:
      title: 'Redis down'
      description: "Redis instance 【{{ $labels.instance }}】: Redis instance is down"
  - alert: RedisMissingMaster
    expr: count(redis_instance_info{role="master"}) < 1
    for: 2m
    labels:
      severity: critical
    annotations:
      title: 'Redis missing master'
      description: "Redis cluster has no node marked as master."
  - alert: RedisTooManyMasters
    expr: count(redis_instance_info{role="master"}) > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      title: 'Redis too many masters'
      description: "Redis cluster has too many nodes marked as master."
  - alert: RedisDisconnectedSlaves
    expr: count without (instance, job) (redis_connected_slaves) - sum without (instance, job) (redis_connected_slaves) - 1 > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      title: 'Redis disconnected slaves'
      description: "Redis not replicating for all slaves. Consider reviewing the redis replication status."
  - alert: RedisReplicationBroken
    expr: delta(redis_connected_slaves[1m]) < 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Redis replication broken'
      description: "Redis instance 【{{ $labels.instance }}】: Redis instance lost a slave"
  - alert: RedisClusterFlapping
    expr: changes(redis_connected_slaves[1m]) > 1
    for: 2m
    labels:
      severity: critical
    annotations:
      title: 'Redis cluster flapping'
      description: "Redis instance 【{{ $labels.instance }}】: changes have been detected in Redis replica connections. This can occur when replica nodes lose connection to the master and reconnect (a.k.a flapping)."
  - alert: RedisMissingBackup
    expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Redis missing backup'
      description: "Redis instance 【{{ $labels.instance }}】: Redis has not been backed up for 24 hours"
  - alert: RedisOutOfConfiguredMaxmemory
    expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'Redis out of configured maxmemory'
      description: "Redis instance 【{{ $labels.instance }}】: Redis is running out of configured maxmemory (> 90%), current value: {{ $value }}"
  - alert: RedisTooManyConnections
    expr: redis_connected_clients > 100
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'Redis too many connections'
      description: "Redis instance 【{{ $labels.instance }}】: Redis instance has too many connections, current value: {{ $value }}"
  - alert: RedisNotEnoughConnections
    expr: redis_connected_clients < 5
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'Redis not enough connections'
      description: "Redis instance 【{{ $labels.instance }}】: Redis instance should have more connections (> 5), current value: {{ $value }}"
  - alert: RedisRejectedConnections
    expr: increase(redis_rejected_connections_total[1m]) > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Redis rejected connections'
      description: "Redis instance 【{{ $labels.instance }}】: some connections to Redis have been rejected, current value: {{ $value }}"
EOF
Elasticsearch.rules
cat > /data/prometheus/conf/rules/elasticsearch.yaml << 'EOF'
groups:
- name: Elasticsearch.rules
  rules:
  ## Elasticsearch alarm rules
  - alert: ElasticsearchHeapUsageTooHigh
    expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 90
    for: 2m
    labels:
      severity: critical
    annotations:
      title: "Elasticsearch Heap Usage Too High"
      description: "Host 【{{ $labels.instance }}】: heap usage is over 90%, current value: {{ $value }}"
  - alert: ElasticsearchHeapUsageWarning
    expr: (elasticsearch_jvm_memory_used_bytes{area="heap"} / elasticsearch_jvm_memory_max_bytes{area="heap"}) * 100 > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch Heap Usage warning'
      description: "Host 【{{ $labels.instance }}】: heap usage is over 80%, current value: {{ $value }}"
  - alert: ElasticsearchDiskOutOfSpace
    expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 10
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Elasticsearch disk out of space'
      description: "Host 【{{ $labels.instance }}】: disk usage is over 90%, current value: {{ $value }}"
  - alert: ElasticsearchDiskSpaceLow
    expr: elasticsearch_filesystem_data_available_bytes / elasticsearch_filesystem_data_size_bytes * 100 < 20
    for: 2m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch disk space low'
      description: "Host 【{{ $labels.instance }}】: disk usage is over 80%, current value: {{ $value }}"
  - alert: ElasticsearchClusterRed
    expr: elasticsearch_cluster_health_status{color="red"} == 1
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Elasticsearch Cluster Red'
      description: "Host 【{{ $labels.instance }}】: Elastic Cluster Red status"
  - alert: ElasticsearchClusterYellow
    expr: elasticsearch_cluster_health_status{color="yellow"} == 1
    for: 0m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch Cluster Yellow'
      description: "Host 【{{ $labels.instance }}】: Elastic Cluster Yellow status"
  - alert: ElasticsearchHealthyNodes
    expr: elasticsearch_cluster_health_number_of_nodes < 3
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Elasticsearch Healthy Nodes'
      description: "Missing node in Elasticsearch cluster"
  - alert: ElasticsearchHealthyDataNodes
    expr: elasticsearch_cluster_health_number_of_data_nodes < 3
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Elasticsearch Healthy Data Nodes'
      description: "Missing data node in Elasticsearch cluster"
  - alert: ElasticsearchRelocatingShards
    expr: elasticsearch_cluster_health_relocating_shards > 0
    for: 0m
    labels:
      severity: info
    annotations:
      title: 'Elasticsearch relocating shards'
      description: "Host 【{{ $labels.instance }}】: Elasticsearch is relocating shards"
  - alert: ElasticsearchRelocatingShardsTooLong
    expr: elasticsearch_cluster_health_relocating_shards > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch relocating shards too long'
      description: "Host 【{{ $labels.instance }}】: Elasticsearch has been relocating shards for 15min"
  - alert: ElasticsearchInitializingShards
    expr: elasticsearch_cluster_health_initializing_shards > 0
    for: 0m
    labels:
      severity: info
    annotations:
      title: 'Elasticsearch initializing shards'
      description: "Host 【{{ $labels.instance }}】: Elasticsearch is initializing shards"
  - alert: ElasticsearchInitializingShardsTooLong
    expr: elasticsearch_cluster_health_initializing_shards > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch initializing shards too long'
      description: "Host 【{{ $labels.instance }}】: Elasticsearch has been initializing shards for 15 min"
  - alert: ElasticsearchUnassignedShards
    expr: elasticsearch_cluster_health_unassigned_shards > 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Elasticsearch unassigned shards'
      description: "Host 【{{ $labels.instance }}】: Elasticsearch has unassigned shards"
  - alert: ElasticsearchPendingTasks
    expr: elasticsearch_cluster_health_number_of_pending_tasks > 0
    for: 15m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch pending tasks'
      description: "Host 【{{ $labels.instance }}】: Elasticsearch has pending tasks. Cluster works slowly, current value: {{ $value }}"
  - alert: ElasticsearchNoNewDocuments
    expr: increase(elasticsearch_indices_docs{es_data_node="true"}[10m]) < 1
    for: 0m
    labels:
      severity: warning
    annotations:
      title: 'Elasticsearch no new documents'
      description: "Host 【{{ $labels.instance }}】: no new documents for 10 min!"
EOF
kafka.rules
cat > /data/prometheus/conf/rules/kafka.yaml << 'EOF'
groups:
- name: kafka.rules
  rules:
  ## Kafka alarm rules
  - alert: KafkaTopicsReplicas
    expr: sum(kafka_topic_partition_in_sync_replica) by (topic) < 3
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Kafka topic in-sync replicas less than 3'
      description: "Topic {{ $labels.topic }} has fewer than 3 in-sync replicas, current value: {{ $value }}"
  - alert: KafkaConsumersGroupLag
    expr: sum(kafka_consumergroup_lag) by (consumergroup) > 50
    for: 1m
    labels:
      severity: critical
    annotations:
      title: 'Kafka consumer group lagging'
      description: "Kafka consumer group {{ $labels.consumergroup }} is lagging (lag > 50), current lag: {{ $value }}"
  - alert: KafkaConsumersTopicLag
    expr: sum(kafka_consumergroup_lag) by (topic) > 50
    for: 1m
    labels:
      severity: critical
    annotations:
      title: 'Kafka topic consumption lagging'
      description: "Kafka topic {{ $labels.topic }} consumption is lagging (lag > 50), current lag: {{ $value }}"
EOF
Docker.rules
cat > /data/prometheus/conf/rules/Docker.yaml << 'EOF'
groups:
- name: Docker.rules
  rules:
  - alert: DockerInstanceDown
    expr: up{job="cAdvisor"} == 0
    for: 0m
    labels:
      severity: critical
    annotations:
      title: 'Docker Instance down'
      description: "cAdvisor instance 【{{ $labels.instance }}】 has been down for more than 1 minute"
  - alert: ContainerKilled
    expr: time() - container_last_seen{name!=""} > 60
    for: 1m
    labels:
      severity: critical
    annotations:
      title: "A container has disappeared"
      description: "Container 【{{ $labels.name }}】 on host 【{{ $labels.instance }}】 has disappeared"
  - alert: ContainerCpuUsage
    expr: (sum by(instance, name) (rate(container_cpu_usage_seconds_total{name!=""}[3m])) * 100) > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      title: "Container CPU usage above 80%"
      description: "Container 【{{ $labels.name }}】 on host 【{{ $labels.instance }}】: CPU usage is above 80%, current value: {{ $value }}"
  - alert: ContainerMemoryUsage
    expr: (sum by(instance, name) (container_memory_working_set_bytes{name!=""}) / sum by(instance, name) (container_spec_memory_limit_bytes{name!=""} > 0) * 100) > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      title: "Container memory usage above 80%"
      description: "Container 【{{ $labels.name }}】 on host 【{{ $labels.instance }}】: memory usage is above 80%, current value: {{ $value }}"
  - alert: ContainerVolumeUsage
    expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance))) * 100 > 80
    for: 5m
    labels:
      severity: warning
    annotations:
      title: "Container volume usage above 80%"
      description: "Host 【{{ $labels.instance }}】: container volume usage is above 80%, current value: {{ $value }}"
EOF
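With all rule files in place they can be validated in one pass and then loaded via the lifecycle endpoint; a sketch, using the busybox shell inside the prom/prometheus image to expand the glob:
docker run --rm -v /data/prometheus/conf:/etc/prometheus \
  --entrypoint /bin/sh prom/prometheus:v2.25.1 \
  -c '/bin/promtool check rules /etc/prometheus/rules/*.yaml'
curl -X POST http://3.1.101.39:9090/-/reload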