注:本篇文章基于本人的《ubuntu系统docker方式安装Prometheus+grafana保姆级教程》编写,介绍如何用 docker-compose 方式添加 Prometheus 被监控节点。
本服务器已经安装 docker 和 docker-compose,如果你还没有安装,请参考本人写的安装教程。
我们添加被监控端前建议先做一下时间同步
1、添加Linux服务器系统监控
1、创建存储目录
mkdir -p /data/node_exporter
2、进入存储目录
cd /data/node_exporter
3、写docker-compose.yaml文件
vim docker-compose.yaml
# Docker Compose definition for the Prometheus node_exporter (host-level metrics).
version: '3.3'
services:
  node_exporter:
    image: prom/node-exporter
    container_name: node-exporter
    restart: always
    volumes:
      # Host pseudo-filesystems and root, mounted read-only so the exporter
      # reports the host's statistics rather than the container's.
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - '--path.procfs=/host/proc'
      - '--path.sysfs=/host/sys'
      # "$$" is Compose escaping for a literal "$" in the regular expression.
      # NOTE(review): "mount-points-exclude" is the current flag name
      # (node_exporter >= 1.3.0); the older "ignored-mount-points" spelling is
      # deprecated. Pin an image tag if an older exporter must be used.
      - '--collector.filesystem.mount-points-exclude=^/(sys|proc|dev|host|etc|rootfs/var/lib/docker)($$|/)'
    ports:
      - '9100:9100'
文件详解:
- version: '3.3':指定 Docker Compose 文件的版本。
- services:服务列表的开始标记。
- node_exporter:要定义的服务的名称。
- image: prom/node-exporter:指定要使用的 prom/node-exporter 镜像。
- container_name: node-exporter:指定容器的名称为 node-exporter。
- restart: always:设置容器在退出时总是重新启动。
- volumes:挂载卷的配置。
  - /proc:/host/proc:ro:将主机的 /proc 目录挂载到容器内的 /host/proc 目录,并以只读 (ro) 模式挂载。
  - /sys:/host/sys:ro:将主机的 /sys 目录挂载到容器内的 /host/sys 目录,并以只读 (ro) 模式挂载。
  - /:/rootfs:ro:将主机的根目录 / 挂载到容器内的 /rootfs 目录,并以只读 (ro) 模式挂载。
- command:指定要传递给容器的命令行参数。
  - '--path.procfs=/host/proc':指定 node_exporter 使用 /host/proc 路径来读取主机的 proc 文件系统。
  - '--path.sysfs=/host/sys':指定 node_exporter 使用 /host/sys 路径来读取主机的 sys 文件系统。
  - '--collector.filesystem.ignored-mount-points=^/(sys|proc|dev|host|etc|rootfs/var/lib/docker)($$|/)':指定要忽略的文件系统挂载点的正则表达式,这些挂载点将不会被 node_exporter 收集统计信息。其中 $$ 是 docker-compose 对 $ 的转义写法;在 node_exporter 1.3.0 及以后的版本中,该参数已更名为 --collector.filesystem.mount-points-exclude。
- ports:定义端口映射。
  - '9100:9100':将主机的 9100 端口映射到容器的 9100 端口,以便可以通过主机的 9100 端口访问 node_exporter 的指标数据。
4、检查与启动
docker-compose config -q #检查配置有问题才有输出
docker-compose up -d #启动所有docker-compose服务并后台运行
5、在浏览器访问查看
访问地址http://IP:9100/metrics
6、修改Prometheus监控端的配置文件
打开Prometheus的配置添加以下内容
vim /data/docker-prometheus/prometheus/prometheus.yml
# Target entry to append under an existing job's `static_configs:` list.
- targets: ['192.168.1.139:9100']
  labels:
    instance: harbor服务器
7、热加载Prometheus
curl -X POST http://localhost:9090/-/reload
8、在浏览器上查看
Prometheus监控端的IP加9090端口
9、在Prometheus机器上配置触发器
打开触发器配置文件添加以下配置
vim /data/docker-prometheus/prometheus/alert.yml
# Alert group for node_exporter metrics; append under the `groups:` list of alert.yml.
- name: node-exporter
  rules:
    # --- Memory ---
    - alert: HostOutOfMemory
      expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 10
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "主机内存不足,实例: {{ $labels.instance }}"
        description: "内存可用率 < 10%,当前值:{{ $value }}"
    - alert: HostMemoryUnderMemoryPressure
      expr: rate(node_vmstat_pgmajfault[1m]) > 1000
      for: 2m
      labels:
        severity: warning
      annotations:
        # Summary corrected: the rule fires on HIGH memory pressure.
        summary: "内存压力过大,实例: {{ $labels.instance }}"
        description: "节点内存压力大,重大页面错误率高,当前值为:{{ $value }}"
    # --- Network / disk throughput ---
    - alert: HostUnusualNetworkThroughputIn
      expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "异常流入网络吞吐量,实例: {{ $labels.instance }}"
        description: "网络流入流量 > 100 MB/s,当前值:{{ $value }}"
    - alert: HostUnusualDiskReadRate
      expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "异常磁盘读取,实例: {{ $labels.instance }}"
        description: "磁盘读取 > 50 MB/s,当前值:{{ $value }}"
    - alert: HostUnusualDiskWriteRate
      expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "异常磁盘写入,实例: {{ $labels.instance }}"
        description: "磁盘写入 > 50 MB/s,当前值:{{ $value }}"
    # --- Filesystem capacity ---
    - alert: HostOutOfDiskSpace
      expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "磁盘空间不足警告,实例: {{ $labels.instance }}"
        description: "剩余磁盘空间 < 10%,当前值:{{ $value }}"
    - alert: HostDiskWillFillIn24Hours
      expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 10 and ON (instance, device, mountpoint) predict_linear(node_filesystem_avail_bytes{fstype!~"tmpfs"}[1h], 24 * 3600) < 0 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "磁盘空间将在24小时内耗尽,实例: {{ $labels.instance }}"
        description: "以当前写入速率预计磁盘空间将在 24 小时内耗尽,当前值:{{ $value }}"
    - alert: HostOutOfInodes
      expr: node_filesystem_files_free{mountpoint="/"}/node_filesystem_files{mountpoint="/"} * 100 < 10 and ON (instance, device, mountpoint) node_filesystem_readonly{mountpoint="/"} == 0
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "磁盘 Inodes 不足,实例: {{ $labels.instance }}"
        description: "剩余磁盘 inodes < 10%,当前值:{{ $value }}"
    # --- Disk latency ---
    - alert: HostUnusualDiskReadLatency
      expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.1 and rate(node_disk_reads_completed_total[1m]) > 0
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "异常磁盘读取延迟,实例: {{ $labels.instance }}"
        description: "磁盘读取延迟 > 100ms,当前值:{{ $value }}"
    - alert: HostUnusualDiskWriteLatency
      expr: rate(node_disk_write_time_seconds_total[1m]) / rate(node_disk_writes_completed_total[1m]) > 0.1 and rate(node_disk_writes_completed_total[1m]) > 0
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "磁盘写入延迟,实例: {{ $labels.instance }}"
        description: "磁盘写入延迟 > 100ms,当前值:{{ $value }}"
    # --- CPU ---
    - alert: high_load
      expr: node_load1 > 4
      for: 2m
      labels:
        severity: page
      annotations:
        summary: "CPU1分钟负载过高,实例:{{ $labels.instance }}"
        description: "CPU1分钟负载>4,已经持续2分钟.当前值为:{{ $value }}"
    # Renamed from HostCpuIsUnderUtilized: the expression fires when CPU usage
    # is ABOVE 80%, i.e. a high-load condition, not under-utilisation.
    - alert: HostHighCpuLoad
      expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[2m])) * 100) > 80
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: "cpu负载高,实例:{{ $labels.instance }}"
        description: "cpu负载> 80%,当前值:{{ $value }}"
    - alert: HostCpuStealNoisyNeighbor
      expr: avg by(instance) (rate(node_cpu_seconds_total{mode="steal"}[5m])) * 100 > 10
      for: 0m
      labels:
        severity: warning
      annotations:
        summary: "CPU窃取率异常,实例:{{ $labels.instance }}"
        description: "CPU窃取率 > 10% 。嘈杂的邻居正在扼杀 VM性能,或者 spot 实例可能失去信用,当前值:{{ $value }}"
    # --- Swap ---
    - alert: HostSwapIsFillingUp
      expr: (1 - (node_memory_SwapFree_bytes / node_memory_SwapTotal_bytes)) * 100 > 80
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "磁盘swap空间使用率异常,实例:{{ $labels.instance }}"
        description: "磁盘swap空间使用率>80%"
    # --- Network health ---
    - alert: HostNetworkReceiveErrors
      expr: rate(node_network_receive_errs_total[2m]) / rate(node_network_receive_packets_total[2m]) > 0.01
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "异常网络接收错误,实例:{{ $labels.instance }}"
        description: "网卡{{ $labels.device }}在过去2分钟接收{{ $value }} 个错误"
    - alert: HostNetworkInterfaceSaturated
      expr: (rate(node_network_receive_bytes_total{device!~"^tap.*"}[1m]) + rate(node_network_transmit_bytes_total{device!~"^tap.*"}[1m])) / node_network_speed_bytes{device!~"^tap.*"} > 0.8
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: "异常网络接口饱和,实例:{{ $labels.instance }}"
        description: "网卡{{ $labels.device }}正在超载,当前值{{ $value }}"
    - alert: HostConntrackLimit
      expr: node_nf_conntrack_entries / node_nf_conntrack_entries_limit > 0.8
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "异常连接数,实例:{{ $labels.instance }}"
        description: "连接数过大,当前链接数:{{ $value }}"
    # --- Clock ---
    - alert: HostClockSkew
      expr: (node_timex_offset_seconds > 0.05 and deriv(node_timex_offset_seconds[5m]) >= 0) or (node_timex_offset_seconds < -0.05 and deriv(node_timex_offset_seconds[5m]) <= 0)
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "异常时钟偏差,实例 :{{ $labels.instance }}"
        description: "检测到时钟偏差,时钟不同步。值为:{{ $value }}"
    - alert: HostClockNotSynchronising
      expr: min_over_time(node_timex_sync_status[1m]) == 0 and node_timex_maxerror_seconds >= 16
      for: 2m
      labels:
        severity: warning
      annotations:
        summary: "时钟不同步,实例:{{ $labels.instance }}"
        description: "时钟不同步"
10、在Prometheus机器上检查配置
docker exec -it prometheus promtool check config /etc/prometheus/prometheus.yml
11、热加载Prometheus配置
curl -X POST http://localhost:9090/-/reload
加载前
加载后
12 、在grafana添加
注:因为我们之前已经在grafana中为Prometheus机器添加过监控图表,所以配置好后host选项中会出现我们新添加的机器名字,直接选择即可
完成
2、Prometheus监控添加nginx监控节点
注:本篇文章基于本人的《ubuntu系统docker方式安装Prometheus+grafana保姆级教程》编写,介绍如何用 docker-compose 方式添加 Prometheus 被监控节点。
本服务器已经安装 docker 和 docker-compose,如果你还没有安装,请参考本人写的安装教程。
我们添加被监控端前建议先做一下时间同步
1、在被监控端创建nginx目录
mkdir /data/nginx/conf.d -p
# server.conf must live in conf.d: that is the directory the nginx container
# mounts at /etc/nginx/conf.d.
cd /data/nginx/conf.d/
2、在目录下新增配置文件
vim server.conf
# Minimal static-site virtual host listening on port 80.
server {
    listen 80;

    # Serve static files from the default document root.
    location / {
        root  /usr/share/nginx/html;
        index index.html index.htm;
    }

    # Map upstream/server errors to the bundled 50x page.
    error_page 500 502 503 504 /50x.html;
    location = /50x.html {
        root /usr/share/nginx/html;
    }
}
3、创建docker-compose的目录
mkdir /data/docker-compose -p
cd /data/docker-compose/
4、创建docker-compose.yaml文件
vim docker-compose.yaml
# Demo stack: redis, nginx, rabbitmq and mongo, each persisting data under /data.
version: '3'
services:
  redis:
    image: redis
    container_name: redis
    command: redis-server --requirepass 123456 --maxmemory 512mb
    restart: always
    volumes:
      - /data/redis/data:/data
    # The original listed `ports:` twice, the first time with a volume path as
    # its value; only the real port mapping is kept.
    ports:
      - "6379:6379"
  nginx:
    image: nginx
    container_name: nginx
    restart: always
    volumes:
      - /data/nginx/conf.d:/etc/nginx/conf.d
      - /data/nginx/html:/usr/share/nginx/html
      - /data/nginx/log:/var/log/nginx
    ports:
      - "80:80"
  rabbitmq:
    image: rabbitmq
    container_name: rabbitmq
    restart: always
    volumes:
      - /data/rabbitmq/data:/var/lib/rabbitmq
      - /data/rabbitmq/log:/var/log/rabbitmq
    ports:
      - "5672:5672"
      - "15672:15672"
  mongo:
    image: mongo
    container_name: mongo
    restart: always
    volumes:
      - /data/mongo/db:/data/db
    ports:
      - "27017:27017"
    environment:
      - MONGO_INITDB_ROOT_USERNAME=root
      - MONGO_INITDB_ROOT_PASSWORD=123456
5、检查配置
docker-compose config -q #有问题才会输出
6、启动docker-compose
docker-compose up -d #启动所有docker-compose服务并后台运行
查看 docker ps
7、检查nginx的环境
docker exec -it nginx nginx -V 2>&1 |grep -o with-http_stub_status_module
8、nginx开启stub_status配置
在nginx的server模块下添加以下配置
# Expose nginx's basic connection counters for nginx_exporter to scrape.
location /stub_status {
    stub_status on;
    access_log  off;
    # allow <nginx_exporter IP>;
    # NOTE(review): 0.0.0.0/0 matches every IPv4 client, so the `deny all`
    # below never applies to IPv4; replace it with the exporter's IP to
    # restrict access.
    allow 0.0.0.0/0;
    deny  all;
}
9、检查配置重新加载配置
docker exec -it nginx nginx -t #检查配置
docker exec -it nginx nginx -s reload #重新加载配置
10、检查配置
curl http://本机IP/stub_status
11、docker-compose方式安装nginx_exporter
cd /data/nginx/
vim docker-compose.yaml
# Docker Compose definition for the NGINX Prometheus exporter.
version: '3.3'
services:
  nginx_exporter:
    image: nginx/nginx-prometheus-exporter
    container_name: nginx_exporter
    hostname: nginx_exporter
    command:
      # Double-dash form: accepted by both old and current exporter releases,
      # whereas 1.x images reject the single-dash spelling.
      - '--nginx.scrape-uri=http://192.168.1.139/stub_status'
    restart: always
    ports:
      - "9113:9113"
12、检查启动
docker-compose config -q #有问题才会输出
docker-compose up -d #启动所有docker-compose服务并后台运行
docker ps #查看是否启动
13、浏览器访问一下metrics地址
http://192.168.1.139:9113/metrics
14、修改Prometheus配置
在Prometheus端进入Prometheus的配置文件内
cd /data/docker-prometheus
vim prometheus/prometheus.yml
# Scrape job for nginx_exporter; append under `scrape_configs:` in prometheus.yml.
- job_name: 'nginx_exporter'
  static_configs:
    - targets: ['192.168.1.139:9113']
      labels:
        instance: harbor服务器
重新加载Prometheus配置文件
curl -X POST http://localhost:9090/-/reload
15、在浏览器进行查看
访问Prometheus的IP加9090端口进行查看
http://192.168.1.144:9090
16、添加nginx的触发器
在Prometheus端进行配置,先进入到Prometheus的目录下
cd /data/docker-prometheus
添加nginx的触发器打开触发器文件 添加以下内容
vim prometheus/alert.yml
# Alert group for nginx_exporter; append under the `groups:` list of alert.yml.
- name: nginx
  rules:
    # nginx_up is 0 when the exporter's last scrape of nginx failed.
    - alert: NginxDown
      expr: nginx_up == 0
      for: 30s
      labels:
        severity: critical
      annotations:
        summary: "nginx异常,实例:{{ $labels.instance }}"
        description: "{{ $labels.job }} nginx已关闭"
17、检查配置和重新加载配置
docker exec -it prometheus promtool check config /etc/prometheus/prometheus.yml #检查配置
curl -X POST http://localhost:9090/-/reload #重新加载配置
18、检查配置是否成功
访问Prometheus的IP加9090端口
19、添加nginx在grafana的监控模板
20、进入grafana中
注:下图建议使用 12708 模板
完成
3、Prometheus监控添加redis监控节点
注:本篇文章基于本人的《ubuntu系统docker方式安装Prometheus+grafana保姆级教程》编写,介绍如何用 docker-compose 方式添加 Prometheus 被监控节点。
本服务器已经安装 docker 和 docker-compose,如果你还没有安装,请参考本人写的安装教程。
我们添加被监控端前建议先做一下时间同步
注:我们在上一个添加nginx节点的时候已经将redis容器启动起来了
注:如果已经起来的第一步不用做 如果没有启动redis容器就从第一步开始做
1、如果没有安装redis 可以按照以下步骤来做
mkdir /data/redis/ #创建redis的目录
cd /data/redis/ #进入目录下
vim docker-compose.yaml
# Standalone redis service with password auth and a 512 MB memory cap.
version: '3'
services:
  redis:
    image: redis
    container_name: redis
    command: redis-server --requirepass 123456 --maxmemory 512mb
    restart: always
    volumes:
      - /data/redis/data:/data
    ports:
      - "6379:6379"
启动:
docker-compose up -d
2、docker安装redis_exporter的两种方式二选一即可
1、docker直接启动:
docker run -d --name redis_exporter -p 9121:9121 oliver006/redis_exporter --redis.addr redis://192.168.1.139:6379 --redis.password '123456'
2、docker-compose方式:
cd /data/redis/
vim docker-compose.yaml
# Docker Compose definition for redis_exporter, pointed at the password-protected redis.
version: '3.3'
services:
  redis_exporter:
    image: oliver006/redis_exporter
    container_name: redis_exporter
    restart: always
    environment:
      # Same address form as the `docker run` variant (redis:// scheme).
      REDIS_ADDR: "redis://192.168.1.139:6379"
      # Quoted so YAML keeps the password a string rather than an integer.
      REDIS_PASSWORD: "123456"
    ports:
      - "9121:9121"
启动docker-compose
docker-compose up -d
docker ps #查看
3、在浏览器访问redis_exporter的服务器IP加9121端口查看
4、修改Prometheus的配置
在Prometheus端对Prometheus.yaml进行配置
cd /data/docker-prometheus
打开Prometheus配置添加以下内容
vim prometheus/prometheus.yml
# Scrape job for redis_exporter; append under `scrape_configs:` in prometheus.yml.
- job_name: 'redis_exporter'
  static_configs:
    - targets: ['192.168.1.139:9121']
      labels:
        instance: harbor服务器
5、重新在Prometheus端加载配置
curl -X POST http://localhost:9090/-/reload
6、在浏览器查看是否检测到redis_exporter
访问Prometheus服务的IP加9090端口
7、触发器配置
在Prometheus端配置触发器
因为所有的触发器都放在一个文件内,信息量太大了
所以我们再创建一个文件夹专门存放触发器文件
cd /data/docker-prometheus
vim prometheus/prometheus.yml
打开配置文件在rule_files: 下添加以下内容
# ASCII quotes are required: typographic quotes would become part of the path.
- "rules/*.yml"
创建目录
mkdir /data/docker-prometheus/prometheus/rules
8、创建redis触发器
cd /data/docker-prometheus/prometheus/rules
vim redis.yml
# Alert rules for redis_exporter metrics (rules/redis.yml).
groups:
  - name: redis
    rules:
      - alert: RedisDown
        expr: redis_up == 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Redis Down,实例:{{ $labels.instance }}"
          description: "Redis实例 is down"
      # Fires when the last successful RDB save is more than 24 hours old.
      - alert: RedisMissingBackup
        expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Redis 备份丢失,实例:{{ $labels.instance }}"
          description: "Redis 24小时未备份"
      - alert: RedisOutofConfiguredMaxmemory
        expr: redis_memory_used_bytes / redis_memory_max_bytes * 100 > 90
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: "Redis超出配置的最大内存,实例:{{ $labels.instance }}"
          description: "Redis内存使用超过配置最大内存的90%"
      - alert: RedisTooManyConnections
        expr: redis_connected_clients > 100
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Redis 连接数过多,实例:{{ $labels.instance }}'
          description: "Redis当前连接数为:{{ $value }}"
      - alert: RedisNotEnoughConnections
        expr: redis_connected_clients < 1
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 'Redis 没有足够的连接,实例:{{ $labels.instance }}'
          description: "Redis当前连接数为:{{ $value }}"
      - alert: RedisRejectedConnections
        expr: increase(redis_rejected_connections_total[1m]) > 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: 'Redis 有拒绝链接,实例:{{ $labels.instance }}'
          description: "与Redis 的某些连接被拒绝:{{ $value }}"
9、检查配置
docker exec -it prometheus promtool check config /etc/prometheus/prometheus.yml
10、重新加载配置文件
curl -X POST http://localhost:9090/-/reload
11、在浏览器查看是否存在
访问Prometheus的IP加9090端口选择Alerts查看
12、选择dashboard
13、添加到grafana
完成
4、监控rabbitmq的步骤和上面两个一样 下面写了告警规则可自行添加监控
# Alert rules for rabbitmq_exporter metrics.
groups:
  - name: Rabbitmq
    rules:
      - alert: RabbitMQDown
        expr: rabbitmq_up != 1
        labels:
          severity: High
        annotations:
          summary: "Rabbitmq Down,实例:{{ $labels.instance }}"
          description: "Rabbitmq_exporter连不上RabbitMQ! ! !"
      - alert: RabbitMQ有未确认消息
        expr: rabbitmq_queue_messages_unacknowledged_global > 0
        for: 1m
        labels:
          severity: critical
        annotations:
          summary: "Rabbitmq有未确认消息,实例:{{ $labels.instance }}"
          description: "Rabbitmq未确认消息>0,当前值为:{{ $value }}"
      - alert: RabbitMQ可用磁盘空间不足警告
        expr: rabbitmq_node_disk_free_alarm != 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Rabbitmq可用磁盘空间不足,实例:{{ $labels.instance }}"
          description: "Rabbitmq可用磁盘空间不足,请检查"
      - alert: RabbitMQ可用内存不足警告
        expr: rabbitmq_node_mem_alarm != 0
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Rabbitmq可用内存不足,实例:{{ $labels.instance }}"
          description: "Rabbitmq可用内存不足,请检查"
      - alert: RabbitMQ_socket连接数使用过高警告
        expr: rabbitmq_sockets_used / rabbitmq_sockets_available * 100 > 60
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Rabbitmq_socket连接数使用过高,实例:{{ $labels.instance }}"
          description: "Rabbitmq_socket使用>60%,当前值为: {{ $value }}"
      # NOTE(review): metric name changed from rabbitmq_fds_available to
      # rabbitmq_fd_available to match rabbitmq_fd_used and the
      # sockets_used/sockets_available pair; confirm against the exporter's
      # /metrics output.
      - alert: RabbitMQ文件描述符使用过高警告
        expr: rabbitmq_fd_used / rabbitmq_fd_available * 100 > 60
        for: 0m
        labels:
          severity: critical
        annotations:
          summary: "Rabbitmq文件描述符使用过高,实例:{{ $labels.instance }}"
          description: "Rabbitmq文件描述符使用>60%,当前值为:{{ $value }}"