Prometheus的配置
Prometheus主配置文件查看
- cat /data/prometheus/prometheus.yml | egrep -v "^$|^#"
# Prometheus server configuration.
# No global settings are overridden in this listing.
global:

# Alertmanager instance that receives alerts fired by the rule files below.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['192.168.5.10:9093']

# Alerting rule files; node-linux.yml sits next to this file in /data/prometheus.
rule_files:
  - "node-linux.yml"

scrape_configs:
  #- job_name: 'prometheus'
  # Jobs with statically listed targets.
  - job_name: "Prometheus-server"
    static_configs:
      - targets: ["192.168.5.10:19100"]
  - job_name: "loc-dev"
    static_configs:
      - targets: ["192.168.3.3:19100"]

  # Name of a file-based service-discovery scrape job; several such jobs
  # can be defined to match your own business needs.
  - job_name: 'Data'
    file_sd_configs:
      - files:
          # Path of the file that lists the targets to scrape.
          - /data/prometheus/targets/linux.json
        # How often the target file is re-read.
        refresh_interval: 60s

  # Name of a file-based service-discovery scrape job; several such jobs
  # can be defined to match your own business needs.
  - job_name: 'Docker'
    file_sd_configs:
      - files:
          # Path of the file that lists the targets to scrape.
          - /data/prometheus/targets/docker.json
        # How often the target file is re-read.
        refresh_interval: 60s
- cat /data/prometheus/targets/linux.json
[
  {
    "targets": [
      "192.168.3.22:19100",
      "192.168.3.23:19100",
      "192.168.3.24:19100",
      "192.168.3.25:19100",
      "192.168.3.26:19100",
      "192.168.5.11:19100",
      "192.168.5.12:19100",
      "192.168.5.13:19100"
    ],
    "labels": {
      "host": "data"
    }
  }
]
- cat /data/prometheus/targets/docker.json
[
  {
    "targets": [
      "192.168.5.10:8080",
      "192.168.5.11:8080"
    ],
    "labels": {
      "host": "docker"
    }
  }
]
node 节点监控规则查看
- cat /data/prometheus/node-linux.yml
# Alerting rules for Linux hosts, evaluated against node_exporter-style metrics.
groups:
  - name: Linux
    rules:
      # Alert for any instance that is unreachable for more than 2 minutes.
      - alert: InstanceDown
        expr: up == 0
        for: 120s
        labels:
          severity: critical
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 2 minutes."

      # Memory usage (100 - available/total, in percent) above 80% for 2 minutes.
      - alert: "内存使用率过高"
        expr: round(100- node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes*100) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "内存使用率过高"
          description: "当前使用率{{ $value }}%"

      # CPU usage (100 - idle share, averaged per instance/job) above 80% for 2 minutes.
      # NOTE(review): the Prometheus docs recommend rate() over irate() for
      # alerting expressions; the expression is left unchanged here.
      - alert: "CPU使用率过高"
        expr: round(100 - ((avg by (instance,job)(irate(node_cpu_seconds_total{mode="idle",instance!~'bac-.*'}[5m]))) *100)) > 80
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "CPU使用率过高"
          description: "当前使用率{{ $value }}%"

      # Disk usage above 90% on ext4/xfs filesystems, for every instance except
      # 192.168.3.22:19100, which has its own rule with a higher threshold below.
      - alert: "磁盘使用率过高"
        expr: round(100-100*(node_filesystem_avail_bytes{fstype=~"ext4|xfs", instance!="192.168.3.22:19100"} / node_filesystem_size_bytes{fstype=~"ext4|xfs", instance!="192.168.3.22:19100"})) > 90
        for: 15s
        labels:
          severity: warning
        annotations:
          summary: "磁盘使用率过高"
          description: "当前磁盘{{$labels.mountpoint}} 使用率{{ $value }}%"

      # Same disk-usage check for 192.168.3.22:19100 only, with a 98% threshold.
      - alert: "3.22磁盘使用率过高"
        expr: round(100-100*(node_filesystem_avail_bytes{fstype=~"ext4|xfs", instance="192.168.3.22:19100"} / node_filesystem_size_bytes{fstype=~"ext4|xfs", instance="192.168.3.22:19100"})) > 98
        for: 15s
        labels:
          severity: warning
        annotations:
          summary: "磁盘使用率过高"
          description: "当前磁盘{{$labels.mountpoint}} 使用率{{ $value }}%"

      # Disabled rule, kept for reference: free space on a partition below 10 GB.
      #- alert: "分区容量过低"
      #  expr: round(node_filesystem_avail_bytes{fstype=~"ext4|xfs",instance!~"testnode",mountpoint!~"/boot.*"}/1024/1024/1024) < 10
      #  for: 15s
      #  labels:
      #    severity: warning
      #  annotations:
      #    summary: "分区容量过低"
      #    description: "当前分区{{$labels.mountpoint}} 容量{{ $value }}GB"

      # Outbound traffic above 20480 KB/s for 1 minute; virtual and loopback
      # devices are excluded. The alert name and summary describe outbound
      # traffic, so the transmit counter is used.
      # NOTE(review): instance!~"data.*" probably excludes nothing if instance
      # values are ip:port like the listed targets; confirm whether the
      # "host" label was intended.
      - alert: "网络流出速率过高"
        expr: round(irate(node_network_transmit_bytes_total{instance!~"data.*",device!~'tap.*|veth.*|br.*|docker.*|vir.*|lo.*|vnet.*'}[1m])/1024) > 20480
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "网络流出速率过高"
          description: "当前速率{{ $value }}KB/s"
Alertmanager的配置
主配置文件内容查看
- cat /data/alertmanager/alertmanager.yml | egrep -v "^$"
# Alertmanager configuration: SMTP settings, routing tree, receivers and inhibition.
global:
  resolve_timeout: 5m
  # Outgoing mail server and the account used to send notifications.
  smtp_smarthost: 'smtp.mxhichina.com:25'
  smtp_from: 'sg.feng@neoclub.cn'
  smtp_auth_username: 'sg.feng@neoclub.cn'
  smtp_auth_password: '*********'
  smtp_require_tls: true
  smtp_hello: "aliyun.com"

# Notification template files; they define "email.html" and "wechat.html".
templates:
  - '/data/alertmanager/tmpl/*.tmpl'

# Routing tree: alerts are grouped by alertname and go to 'default' unless a
# child route matches.
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 24h
  receiver: 'default'
  routes:
    # Alerts whose "host" label matches the regex "data".
    - receiver: 'data'
      group_wait: 10s
      repeat_interval: 24h
      match_re:
        host: data
    # Alerts whose "job" label matches the regex "rmq".
    - receiver: wechat
      group_wait: 10s
      match_re:
        job: rmq

receivers:
  - name: 'default'
    email_configs:
      - to: 'sg.feng@qq.com'
        send_resolved: true
        html: '{{ template "email.html" . }}'
  - name: 'data'
    email_configs:
      - to: '1096171081@qq.com'
        send_resolved: true
        html: '{{ template "email.html" . }}'
  - name: 'wechat'
    wechat_configs:
      - corp_id: 'ww35d35de455******'
        to_party: '1'
        agent_id: '1000002'
        api_secret: '**********'
        message: '{{ template "wechat.html" . }}'
        send_resolved: true

# A firing critical alert mutes warning alerts that carry the same values for
# the alertname, dev and instance labels.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
发送邮件模板信息查看
- cat /data/alertmanager/tmpl/email.tmpl
{{/*
  Defines the "email.html" template referenced by the html field of the
  email receivers. It emits one <pre> block per entry in .Alerts containing
  the instance label, the summary and description annotations, and StartsAt
  shifted by 28800e9 ns (8 hours) — presumably to display UTC+8 local time;
  confirm that the incoming timestamps are UTC.
*/}}
{{ define "email.html" }}
{{ range .Alerts }}
<pre>
实例: {{ .Labels.instance }}
信息: {{ .Annotations.summary }}
详情: {{ .Annotations.description }}
时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
</pre>
{{ end }}
{{ end }}
微信报警发送模板信息查看
- cat /data/alertmanager/tmpl/wechat.tmpl
{{/*
  Defines the "wechat.html" template referenced by the message field of the
  wechat receiver. Firing alerts are listed under "@警报" and resolved alerts
  under "@恢复"; each section iterates only over its own subset
  (.Alerts.Firing / .Alerts.Resolved) so an alert is never printed under the
  wrong heading when a group contains both kinds. Timestamps are shifted by
  28800e9 ns (8 hours) — presumably to display UTC+8 local time; confirm that
  the incoming timestamps are UTC.
*/}}
{{ define "wechat.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}{{ range .Alerts.Firing }}
@警报
实例: {{ .Labels.instance }}
信息: {{ .Annotations.summary }}
详情: {{ .Annotations.description }}
时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end -}}
{{- if gt (len .Alerts.Resolved) 0 -}}{{ range .Alerts.Resolved }}
@恢复
实例: {{ .Labels.instance }}
信息: {{ .Annotations.summary }}
时间: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
恢复: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end -}}
{{- end }}