我们可通过 http://monitor_host:3000 访问 Grafana 网页界面(默认登录账号/密码为 admin/admin)
然后到Data Sources页面添加数据源:
同步一个监控主机状态的图形
设置grafana告警
配置邮件服务
yum install -y sendmail
vi /etc/grafana/grafana.ini (配置文件添加如下)
重启grafana
systemctl restart grafana-server
在grafana的web界面添加接收告警的邮箱地址
如果发送成功,右上角会有提示
prometheus告警设置
要实现 Prometheus 的告警,需要通过 Alertmanager 这个组件:在 Prometheus 服务端编写告警规则,在 Alertmanager 组件中配置邮箱
1、prometheus告警规则写法,以下是例子
# Example Prometheus alerting rules for basic host monitoring.
# Each rule fires after its expression has held for the `for` duration and
# carries the labels service_name=test and level=warning.
#
# NOTE(review): the metric names below follow the naming used by node_exporter
# releases before 0.16 (e.g. node_cpu, node_memory_MemFree). Newer exporters
# renamed these (e.g. node_cpu_seconds_total, node_memory_MemFree_bytes) --
# confirm against the exporter version actually deployed.
groups:
  - name: base-monitor-rule
    rules:
      # Non-idle CPU percentage, averaged per instance over a 2m rate window.
      - alert: NodeCpuUsage
        expr: (100 - (avg by (instance) (rate(node_cpu{job=~".*",mode="idle"}[2m])) * 100)) > 99
        for: 15m
        labels:
          service_name: test
          level: warning
        annotations:
          description: "{{$labels.instance}}: CPU usage is above 99% (current value is: {{ $value }})"

      # Memory in use as a percentage: 1 - (free + buffers + cached) / total.
      - alert: NodeMemUsage
        expr: avg by (instance) ((1- (node_memory_MemFree{} + node_memory_Buffers{} + node_memory_Cached{})/node_memory_MemTotal{}) * 100) > 90
        for: 15m
        labels:
          service_name: test
          level: warning
        annotations:
          description: "{{$labels.instance}}: MEM usage is above 90% (current value is: {{ $value }})"

      # Filesystem usage percentage; rootfs and mountpoints under
      # /run, /var, /sys and /dev are excluded by the label matchers.
      - alert: NodeDiskUsage
        expr: (1 - node_filesystem_free{fstype!="rootfs",mountpoint!="",mountpoint!~"/(run|var|sys|dev).*"} / node_filesystem_size) * 100 > 80
        for: 2m
        labels:
          service_name: test
          level: warning
        annotations:
          description: "{{$labels.instance}}: Disk usage is above 80% (current value is: {{ $value }})"

      # Allocated file descriptors as a percentage of the maximum.
      - alert: NodeFDUsage
        expr: avg by (instance) (node_filefd_allocated{} / node_filefd_maximum{}) * 100 > 80
        for: 2m
        labels:
          service_name: test
          level: warning
        annotations:
          description: "{{$labels.instance}}: File Descriptor usage is above 80% (current value is: {{ $value }})"

      # 15-minute load average per instance.
      - alert: NodeLoad15
        expr: avg by (instance) (node_load15{}) > 100
        for: 2m
        labels:
          service_name: test
          level: warning
        annotations:
          description: "{{$labels.instance}}: Load15 is above 100 (current value is: {{ $value }})"

      # Fires when the average of `up` for an instance is 0.
      - alert: NodeAgentStatus
        expr: avg by (instance) (up{}) == 0
        for: 2m
        labels:
          service_name: test
          level: warning
        annotations:
          description: "{{$labels.instance}}: Node Agent is down (current value is: {{ $value }})"

      # Number of blocked processes per instance.
      - alert: NodeProcsBlocked
        expr: avg by (instance) (node_procs_blocked{}) > 100
        for: 2m
        labels:
          service_name: test
          level: warning
        annotations:
          description: "{{$labels.instance}}: Node Blocked Procs detected!(current value is: {{ $value }})"

      # Transmit rate on eth0, converted from bytes/s to MB/s (/1024/1024).
      - alert: NodeTransmitRate
        expr: avg by (instance) (floor(irate(node_network_transmit_bytes{device="eth0"}[2m]) / 1024 / 1024)) > 100
        for: 2m
        labels:
          service_name: test
          level: warning
        annotations:
          description: "{{$labels.instance}}: Node Transmit Rate is above 100MB/s (current value is: {{ $value }})"

      # Receive rate on eth0, converted from bytes/s to MB/s (/1024/1024).
      - alert: NodeReceiveRate
        expr: avg by (instance) (floor(irate(node_network_receive_bytes{device="eth0"}[2m]) / 1024 / 1024)) > 100
        for: 2m
        labels:
          service_name: test
          level: warning
        annotations:
          description: "{{$labels.instance}}: Node Receive Rate is above 100MB/s (current value is: {{ $value }})"

      # Disk read rate, converted from bytes/s to MB/s (/1024/1024).
      - alert: NodeDiskReadRate
        expr: avg by (instance) (floor(irate(node_disk_bytes_read{}[2m]) / 1024 / 1024)) > 50
        for: 2m
        labels:
          service_name: test
          level: warning
        annotations:
          description: "{{$labels.instance}}: Node Disk Read Rate is above 50MB/s (current value is: {{ $value }})"

      # Disk write rate, converted from bytes/s to MB/s (/1024/1024).
      - alert: NodeDiskWriteRate
        expr: avg by (instance) (floor(irate(node_disk_bytes_written{}[2m]) / 1024 / 1024)) > 50
        for: 2m
        labels:
          service_name: test
          level: warning
        annotations:
          description: "{{$labels.instance}}: Node Disk Write Rate is above 50MB/s (current value is: {{ $value }})"
操作如下:
[root@mp-sre-fanhaitao prometheus-2.3.1.linux-amd64]# pwd
/usr/local/prometheus-2.3.1.linux-amd64
[root@mp-sre-fanhaitao prometheus-2.3.1.linux-amd64]# vi prometheus.yml
# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # Alertmanager is not installed on the same machine as Prometheus
            # in this setup, so it is addressed by IP and port.
            - 192.168.210.100:9093

# Alerting rule files to load; the file name is user-defined.
rule_files:
  - rules/haitao.rules
告警规则文件
[root@mp-sre-fanhaitao rules]# pwd
/usr/local/prometheus-2.3.1.linux-amd64/rules
[root@mp-sre-fanhaitao rules]# vi haitao.rules
# Alerting rule file referenced from prometheus.yml (rules/haitao.rules).
groups:
  - name: base-monitor-rule
    rules:
      # The PromQL expression is the key part of a rule and worth studying.
      # This one converts the available bytes on "/" (device /dev/vda1) to GiB,
      # rounds up, and fires when the result is greater than 92.
      # NOTE(review): the expression tests *available* space, not used space,
      # even though the alert is named NodeDiskUsage -- confirm the intent.
      - alert: NodeDiskUsage
        expr: ceil(node_filesystem_avail_bytes{mountpoint="/", device="/dev/vda1"} /1024 / 1024 / 1024) > 92
        for: 1m
        annotations:
          description: "{{$labels.instance}}: Node Disk available space is above 92GB (current value is: {{ $value }})"
2、Alertmanager 设置邮箱
[root@mp-sre-fanhaitao alertmanager-0.15.0.linux-amd64]# pwd
/usr/local/alertmanager-0.15.0.linux-amd64
[root@mp-sre-fanhaitao alertmanager-0.15.0.linux-amd64]# vi alertmanager.yml
定义好告警规则和设置邮箱后,可以在prometheus的web端看到定义好的规则,如下
参考
https://blog.csdn.net/qq_36357820/article/details/80777167