本文适用于主机监控,以及其他非容器化服务的监控
一、安装prometheus(docker)
IP地址:192.168.0.39
目录文件:
- /data/prometheus_dir/conf/prometheus.yml(主配置文件)后续添加组件均需要添加至此
# Prometheus main configuration (/data/prometheus_dir/conf/prometheus.yml).
# Every new exporter/component must be added under scrape_configs below.
global:
  scrape_interval: 60s      # how often targets are scraped
  evaluation_interval: 60s  # how often alerting rules are evaluated

# Alertmanager instance that receives the alerts fired by the rules.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - '192.168.0.39:9093'

# Every file found under /rules/ is loaded as a rule file.
rule_files:
  - '/rules/*'

scrape_configs:
  # Hosts running node_exporter on port 9102.
  - job_name: 'node_exporter'
    static_configs:
      - targets:
          - '172.30.0.10:9102'
          - '172.30.0.11:9102'
  # Prometheus scraping its own metrics endpoint.
  - job_name: 'prometheus'
    static_configs:
      - targets:
          - '192.168.0.39:9090'
- /data/prometheus_dir/rules(规则目录)
- cat node-export-alert-rules.yaml
# cat node-export-alert-rules.yaml
# Host-level alerting rules evaluated against node_exporter metrics.
# Label conventions used by every rule:
#   severity - machine-readable severity
#   status   - human-readable level; the WeChat template prints it as the
#              alert level
# Summaries reference $labels.instance because every expression below either
# aggregates by instance or uses metrics that carry no mountpoint label.
groups:
  - name: 主机状态-监控告警
    rules:
      # Any scrape target that has been unreachable for one minute.
      - alert: 主机状态
        expr: up == 0
        for: 1m
        labels:
          severity: warning
          status: 非常严重
        annotations:
          summary: "{{$labels.instance}}:服务器宕机"
          description: "服务器down"

      # CPU utilisation = 100 - idle%, averaged over all cores of a host.
      - alert: CPU使用情况
        expr: 100 - (avg by (instance) (irate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 1m
        labels:
          severity: warning
          status: 一般告警
        annotations:
          summary: "{{$labels.instance}} CPU使用率过高!"
          description: "CPU使用大于80%(当前:{{$value}}%)"

      # Memory utilisation, counting free + page cache + buffers as available.
      - alert: 内存使用
        expr: 100 - (node_memory_MemFree_bytes + node_memory_Cached_bytes + node_memory_Buffers_bytes) / node_memory_MemTotal_bytes * 100 > 90
        for: 1m
        labels:
          severity: warning
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} 内存使用率过高!"
          description: "内存使用大于90%(当前:{{$value}}%)"

      # Disk busy time in percent, averaged over the devices of a host.
      # The expression yields utilisation directly so that the threshold and
      # {{$value}} agree with the description (> 60%). The 5m window is used
      # because irate needs at least two samples and targets are scraped
      # every 60s, so a 1m window is usually empty.
      - alert: IO性能
        expr: avg by (instance) (irate(node_disk_io_time_seconds_total[5m])) * 100 > 60
        for: 1m
        labels:
          severity: warning
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流入磁盘IO使用率过高!"
          description: "流入磁盘IO大于60%(当前:{{$value}})"

      # Inbound traffic per host in KiB/s; 153600 KiB/s = 150 MiB/s.
      # Label regexes are fully anchored, hence "virbr.*" (matches virbr0)
      # and "lo" (loopback only) to exclude virtual interfaces.
      # NOTE(review): the divisor was 100 before; 1024 is assumed from the
      # 150 * 1024 threshold - confirm the intended unit.
      - alert: 网络
        expr: sum by (instance) (rate(node_network_receive_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) / 1024 > 153600
        for: 1m
        labels:
          severity: warning
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流入网络带宽过高!"
          description: "入网带宽持续1分钟高于150M. RX带宽使用率{{$value}}"

      # Outbound traffic per host, same unit and interface filter as above.
      - alert: 网络
        expr: sum by (instance) (rate(node_network_transmit_bytes_total{device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])) / 1024 > 153600
        for: 1m
        labels:
          severity: warning
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} 流出网络带宽过高!"
          description: "出网带宽持续1分钟高于150M. TX带宽使用率{{$value}}"

      # Number of established TCP connections on a host.
      - alert: TCP会话
        expr: node_netstat_Tcp_CurrEstab > 20000
        for: 1m
        labels:
          severity: warning
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} TCP_ESTABLISHED过高!"
          description: "TCP_ESTABLISHED大于20000 (当前:{{$value}}%)"

      # Fullest ext*/xfs filesystem per host:
      #   used * 100 / (avail + used), where used = size - free.
      # Earlier, simpler variant kept for reference:
      # expr: 100-(node_filesystem_free_bytes{fstype=~"ext4|xfs"}/node_filesystem_size_bytes {fstype=~"ext4|xfs"}*100) > 80
      - alert: 磁盘容量
        expr: >-
          max by (instance) (
          (node_filesystem_size_bytes{fstype=~"ext.?|xfs"} - node_filesystem_free_bytes{fstype=~"ext.?|xfs"}) * 100
          /
          (node_filesystem_avail_bytes{fstype=~"ext.?|xfs"}
          + (node_filesystem_size_bytes{fstype=~"ext.?|xfs"} - node_filesystem_free_bytes{fstype=~"ext.?|xfs"}))
          ) > 80
        for: 1m
        labels:
          severity: warning
          status: 严重告警
        annotations:
          summary: "{{$labels.instance}} 磁盘分区使用率过高!"
          description: "磁盘分区使用大于80%(当前:{{$value}}%)"
- /data/prometheus_dir/prometheus_data(数据持久化目录)
安装启动:
# Start Prometheus in Docker with the config file, the rules directory and
# the data directory bind-mounted from the host.
# Only one restart policy may be effective; "always" is used here (the
# earlier command also passed a conflicting --restart=on-failure:5).
# NOTE(review): the official image presumably runs as a non-root user; if the
# container exits with a permission error on /prometheus, fix the ownership
# of /data/prometheus_dir/prometheus_data on the host - confirm for v2.24.0.
docker run -d \
  --name prometheus \
  -p 9090:9090 \
  --restart=always \
  -v /data/prometheus_dir/conf/prometheus.yml:/etc/prometheus/prometheus.yml \
  -v /data/prometheus_dir/rules/:/rules/ \
  -v /data/prometheus_dir/prometheus_data/:/prometheus/ \
  prom/prometheus:v2.24.0
访问:192.168.0.39:9090
二、安装grafana
# Download the Grafana 7.5.4 RPM package.
GRAFANA_RPM=grafana-7.5.4-1.x86_64.rpm
wget "https://wget.52liangzy.top/prom/${GRAFANA_RPM}"

# Install the package, add the clock panel plugin, then list the plugins
# available from the remote repository.
yum localinstall "${GRAFANA_RPM}"
grafana-cli plugins install grafana-clock-panel
grafana-cli plugins list-remote

# Start the service now and enable it at boot.
systemctl start grafana-server
systemctl enable grafana-server.service
访问:192.168.0.39:3000
用户名密码:admin admin
添加图表
三、安装告警alertmanager
这里用的是企业微信
创建应用等步骤请自行搜索
需要下面这些东西
corp_id: 'xxxxxxxxxx' #企业 ID
api_url: 'https://qyapi.weixin.qq.com/cgi-bin/' # 企业微信 api 接口 统一定义
to_party: '49' # 通知组 ID
agent_id: '1000002' # 新建应用的 agent_id
api_secret: 'xxxxxxxxxxxxxxxxxxxxxxx' # 生成的 secret
send_resolved: true
目录:
/data/prometheus_dir/alert_conf/alertmanager.yml(alertmanager主配置文件)
# Alertmanager main configuration
# (/data/prometheus_dir/alert_conf/alertmanager.yml).
global:
  resolve_timeout: 5m

# Notification templates loaded at start-up.
templates:
  - '/etc/alertmanager/template/template.tmpl'

# Root route: every alert goes to the "wechat" receiver.
route:
  group_interval: 1m
  group_wait: 10s
  receiver: 'wechat'
  repeat_interval: 3h
  # Example of a child route to a second receiver:
  # routes:
  #   - receiver: 'wechat2'
  #     matchers:
  #       - team = node  # matches on alert rule labels
  #     group_interval: 1m
  #     group_wait: 10s
  #     repeat_interval: 3h

receivers:
  - name: 'wechat'
    wechat_configs:
      - corp_id: 'xxxxxxxxxxxx'  # WeChat Work corp ID
        api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'  # WeChat Work API base URL
        to_party: '49'  # ID of the department to notify
        agent_id: '1000002'  # agent_id of the newly created application
        api_secret: 'xxxxxxxxxxxxxxxxxxxxxxx'  # secret generated for the application
        send_resolved: true
/data/prometheus_dir/alert_template/template.tmpl(告警模板)
{{/*
  Defines the "wechat.default.message" template (presumably replacing
  Alertmanager's built-in template of the same name - confirm).
  It prints one section for firing alerts and one for resolved alerts; each
  entry shows the alertname and instance labels and the description
  annotation; firing entries also show the status label.
  28800e9 nanoseconds = 8 hours are added to StartsAt/EndsAt before
  formatting - presumably to display UTC+8 local time; confirm.
*/}}
{{ define "wechat.default.message" }}
{{- if gt (len .Alerts.Firing) 0 -}}
主机警报========== [{{ len .Alerts.Firing }}]
{{ range $i, $alert := .Alerts.Firing }}
告警名称:{{ index $alert.Labels "alertname" }}
告警主机:{{ index $alert.Labels "instance" }}
告警级别:{{ index $alert.Labels "status" }}
告警详情:
{{ index $alert.Annotations "description" }}
触发时间:
{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
警报恢复=========== [{{ len .Alerts.Resolved }}]
{{ range $i, $alert := .Alerts.Resolved }}
告警名称:{{ index $alert.Labels "alertname" }}
告警主机:{{ index $alert.Labels "instance" }}
告警详情:
{{ index $alert.Annotations "description" }}
状态:恢复正常
开始时间:
{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
恢复时间:
{{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}
{{ end -}}
{{ end }}
安装启动alertmanager:
# Run Alertmanager detached, restarting it automatically, with the main
# config file and the notification template bind-mounted from the host.
docker run \
  --detach \
  --name alertmanager \
  --restart always \
  --publish 9093:9093 \
  --volume /data/prometheus_dir/alert_conf/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
  --volume /data/prometheus_dir/alert_template/template.tmpl:/etc/alertmanager/template/template.tmpl \
  prom/alertmanager:v0.23.0
访问:192.168.0.39:9093
维护:
添加组件:修改 /data/prometheus_dir/conf/prometheus.yml 后重启 prometheus。
添加告警规则:在 /data/prometheus_dir/rules 目录下修改或创建 *.yaml 文件后重启 prometheus。
下一篇:监控nginx