前言
最新编写,配置文件均正常使用,prometheus监控根据需求添加相关exporter即可,配置grafana数据源为prometheus,添加对应面板
- node-exporter收集系统信息,用于监控CPU、内存、磁盘使用率、磁盘读写等系统信息
- blackbox收集服务状态信息,如判断服务http请求是否返回200继而报警
- cadvisor收集docker信息,用于展示docker的cpu、内存、上传下载等信息
- process-exporter 收集进程信息,内存、cpu使用等
Directory Structure
xxxxxx:~/prometheus+grafana$ tree -L 2
.
├── alertmanager
│ ├── alertmanager
│ ├── alertmanager.log
│ ├── alertmanager.yml
│ ├── amtool
│ ├── data
│ ├── LICENSE
│ ├── NOTICE
│ └── template
├── alertmanager-0.21.0.linux-amd64.tar.gz
├── blackbox_exporter
│ ├── blackbox_exporter
│ ├── blackbox.yml
│ ├── blackbox.yml.bak
│ ├── LICENSE
│ └── NOTICE
├── blackbox_exporter-0.18.0.linux-amd64.tar.gz
├── grafana
│ └── grafana_7.2.2_amd64.deb
├── node_exporter
│ ├── LICENSE
│ ├── node_exporter
│ └── NOTICE
├── node_exporter-1.0.1.linux-amd64.tar.gz
├── prometheus
│ ├── alert.rules.yml
│ ├── console_libraries
│ ├── consoles
│ ├── data
│ ├── LICENSE
│ ├── NOTICE
│ ├── prometheus
│ ├── prometheus.yml
│ ├── prometheus.yml.bak
│ ├── promtool
│ ├── rule.yml
│ └── services.yml
└── prometheus-2.22.0.linux-amd64.tar.gz
blackbox_exporter — service status collection (HTTP/TCP probes)
# Start the exporter with the blackbox.yml shown below, listening on port 7995.
./blackbox_exporter --config.file=blackbox.yml --web.listen-address=:7995
xxxxx:~/prometheus+grafana/blackbox_exporter$ cat blackbox.yml
# Blackbox Exporter probe definitions. Every probe is configured as a named
# module under `modules`.
modules:
  # HTTP GET probe.
  http_2xx:
    prober: http
    timeout: 10s
    http:
      # List the expected status code explicitly so that it is shown clearly
      # when graphing in Grafana.
      valid_status_codes: [200]
      method: GET
      preferred_ip_protocol: "ip4"
  # HTTP POST probe.
  # NOTE(review): no valid_status_codes is set here, so the exporter's
  # default applies — confirm that is intended.
  http_post_2xx:
    prober: http
    timeout: 10s
    http:
      method: POST
      preferred_ip_protocol: "ip4"
  # TCP connect probe.
  tcp_connect:
    prober: tcp
    timeout: 10s
prometheus
# Reload the Prometheus configuration without restarting the process.
# Precondition: Prometheus must have been started with --web.enable-lifecycle, e.g.:
./prometheus --config.file=prometheus.yml --web.enable-lifecycle &
# Then ask the running instance to reload its configuration:
curl -X POST http://localhost:9090/-/reload
# configs
xxxxxx:~/prometheus+grafana/prometheus$ ls -l
total 161712
-rw-rw-r-- 1 ctdna ctdna 1272 10月 28 17:08 alert.rules.yml
drwxr-xr-x 2 ctdna ctdna 4096 10月 15 22:21 console_libraries
drwxr-xr-x 2 ctdna ctdna 4096 10月 15 22:21 consoles
drwxrwxr-x 12 ctdna ctdna 4096 10月 28 17:00 data
-rw-r--r-- 1 ctdna ctdna 11357 10月 15 22:21 LICENSE
-rw-r--r-- 1 ctdna ctdna 3420 10月 15 22:21 NOTICE
-rwxr-xr-x 1 ctdna ctdna 87729971 10月 15 20:32 prometheus
-rw-rw-r-- 1 ctdna ctdna 1429 10月 28 11:46 prometheus.yml
-rw-r--r-- 1 ctdna ctdna 926 10月 15 22:21 prometheus.yml.bak
-rwxr-xr-x 1 ctdna ctdna 77801407 10月 15 20:34 promtool
-rw-rw-r-- 1 ctdna ctdna 1037 10月 27 20:41 rule.yml
-rw-rw-r-- 1 ctdna ctdna 391 10月 28 15:54 services.yml
xxxxxx:~/prometheus+grafana/prometheus$ cat alert.rules.yml
groups:
- name: alert.rules
rules:
# Fires when CPU usage (100 minus the averaged idle percentage) exceeds 90%.
# NOTE(review): alarmhost / alarmproject / alarmtype are presumably custom
# target labels attached in the scrape configuration — confirm they exist on
# the node_cpu_seconds_total series, otherwise the grouping collapses.
- alert: cpu_usage_over_threshold
  # Folded scalar: the two lines below are joined with a single space, so the
  # selector {mode="idle"} stays intact within one PromQL expression.
  expr: >-
    100 - avg(irate(node_cpu_seconds_total{mode="idle"}[1m]))
    by (alarmhost,alarmproject,alarmtype) * 100 > 90
  # The condition must hold continuously for this long before the alert fires.
  for: 5m
  labels:
    severity: "critical"
  annotations:
    # The "{{" / "}}" template delimiters must not be split across lines.
    summary: >-
      Host {{ $labels.alarmhost }} CPU usage continues to exceed the
      threshold for five minutes and is currently {{humanize $value}}%
- alert: mem_usage_over_threshold
expr: 100 - (node_memory_MemAv