Prometheus-监控主机基础指标配置及告警

1、监控主机指标

本文介绍如何使用 Prometheus 监控主机的 CPU、磁盘、内存、负载等基础指标。文中配置目前已在生产环境使用,基于 node_exporter 0.18.1 版本,操作系统为 CentOS 7.x。使用之前,请将规则中 job="gt-dwz-node-exporter" 的值修改为自己在 Prometheus 中配置的 job 名称。

2、Prometheus配置项

在prometheus.yml配置文件中添加如下配置:

# ---------- gt-dwz ----------
  # Scrape job for the node_exporter endpoints (port 9100) of the gt-dwz hosts.
  # The job name is what the alert rules select on via job="gt-dwz-node-exporter".
  - job_name: 'gt-dwz-node-exporter'
    static_configs:
      - labels:
          service: 'gt-dwz-monitor'
        targets:
          - '10.1.5.123:9100'
          - '10.1.5.124:9100'
          - '10.1.5.125:9100'
          - '10.1.5.126:9100'
3、PromQL判断rules文件
[root@gtcq-gt-monitor-prometheus-01 rules]# more    gt-dwz-monitor.rules
# Alerting rules evaluated against the job "gt-dwz-node-exporter".
groups:
# Rule group holding the host-level alerts.
- name: dwz-gt-monitor
  rules:
  # Fires when a target of this job has reported up == 0 for the whole
  # `for` window, i.e. Prometheus could not scrape it.
  - alert: "node-Agent告警"
    expr: up{job="gt-dwz-node-exporter"} == 0
    for: 120s
    labels:
      severity: "重要"
      team: dwz-gt-monitor
      alert_type: "Agent告警"
      # Strip the ":port" suffix from the instance label.
      alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
    annotations:
      # The message previously said 30s, but the rule only fires after the
      # 120s `for` window; the text now matches the configured duration.
      summary: "{{ $labels.instance }} 已停止采集监控数据 120s!"
      description: "{{ $labels.instance }} job {{ $labels.job }} 暴露监控数据已停止."

  # CPU usage per instance over the last 5 minutes, in percent:
  # 100 - idle_seconds / total_seconds * 100, rounded up. Alerts above 80.
  - alert: "CPU使用率监控"
    # Folded scalar: the lines below are joined with single spaces into one
    # PromQL expression. The original was hard-wrapped onto a line indented
    # less than the `expr` key, which is not valid YAML.
    expr: >-
      ceil(
      100 -
      sum(increase(node_cpu_seconds_total{job="gt-dwz-node-exporter",mode="idle"}[5m])) by (instance) /
      sum(increase(node_cpu_seconds_total{job="gt-dwz-node-exporter"}[5m])) by (instance) * 100
      ) > 80
    for: 2m
    labels:
      severity: "重要"
      # Was `bdfb`; aligned with the team label used by every other rule here.
      team: dwz-gt-monitor
      alert_type: "CPU告警"
      # Strip the ":port" suffix from the instance label.
      alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
    annotations:
      summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} CPU使用率过高"
      description: "IP:{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}的CPU使用大于80% (当前值: {{ $value }})"

  # Used-space percentage per filesystem (ext3/ext4/xfs/nfs only):
  # (1 - avail / size) * 100, rounded. Alerts above 80.
  - alert: "磁盘使用率监控"
    # Folded scalar: the lines below are joined with single spaces into one
    # PromQL expression. The original was hard-wrapped so that the closing
    # "}" landed at column 0, which is not valid YAML.
    expr: >-
      round(
      (1 - (
      node_filesystem_avail_bytes{fstype=~"ext3|ext4|xfs|nfs",job="gt-dwz-node-exporter"} /
      node_filesystem_size_bytes{fstype=~"ext3|ext4|xfs|nfs",job="gt-dwz-node-exporter"}
      )) * 100
      ) > 80
    for: 2m
    labels:
      severity: "重要"
      team: dwz-gt-monitor
      alert_type: "Disk告警"
      # Strip the ":port" suffix from the instance label.
      alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
    annotations:
      summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} : {{ $labels.mountpoint }} 分区使用率过高"
      description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}的{{ $labels.mountpoint }} 分区使用大于80% (当前值: {{ $value }}%)"

  # Memory usage per instance in percent:
  # (1 - MemAvailable / MemTotal) * 100, rounded up. Alerts above 80.
  - alert: '内存使用率监控'
    expr: 'ceil( (1 - (node_memory_MemAvailable_bytes{job="gt-dwz-node-exporter"} / (node_memory_MemTotal_bytes{job="gt-dwz-node-exporter"})))* 100 ) > 80'
    for: 2m
    labels:
      # Instance label with the ":port" suffix removed.
      alert_host: '{{ reReplaceAll ":(.*)" "" $labels.instance }}'
      alert_type: 'MEM告警'
      severity: '重要'
      team: 'dwz-gt-monitor'
    annotations:
      summary: '{{ reReplaceAll ":(.*)" "" $labels.instance }}内存使用率过高'
      description: '{{ reReplaceAll ":(.*)" "" $labels.instance }}内存使用大于80% (当前值: {{ $value }})'

  # Alerts when node_load5 stays above 100 for 2 minutes.
  - alert: '服务器大法宝CPULoad5'
    expr: 'node_load5{job="gt-dwz-node-exporter"} > 100'
    for: 2m
    labels:
      # Instance label with the ":port" suffix removed.
      alert_host: '{{ reReplaceAll ":(.*)" "" $labels.instance }}'
      alert_type: '负载告警'
      severity: '重要'
      team: 'dwz-gt-monitor'
    annotations:
      summary: '{{ reReplaceAll ":(.*)" "" $labels.instance }}CPU负载过高'
      description: '{{ reReplaceAll ":(.*)" "" $labels.instance }} CPU负载load大于100 (当前值: {{ $value }})'

  # Alerts when node_filefd_allocated stays above 50000 for 2 minutes.
  - alert: '服务器文件句柄监控'
    expr: 'node_filefd_allocated{job="gt-dwz-node-exporter"} > 50000'
    for: 2m
    labels:
      # Instance label with the ":port" suffix removed.
      alert_host: '{{ reReplaceAll ":(.*)" "" $labels.instance }}'
      alert_type: '文件句柄告警'
      severity: '重要'
      team: 'dwz-gt-monitor'
    annotations:
      summary: '{{ reReplaceAll ":(.*)" "" $labels.instance }} 文件句柄使用过高'
      description: '{{ reReplaceAll ":(.*)" "" $labels.instance }} 文件句柄使用过高大于50000 (当前值: {{ $value }})'
      
  # Alerts when node_sockstat_TCP_tw (TIME_WAIT sockets, per the description
  # text) stays above 15000 for 2 minutes.
  - alert: '服务器TCP连接数监控'
    expr: 'node_sockstat_TCP_tw{job="gt-dwz-node-exporter"} > 15000'
    for: 2m
    labels:
      # Instance label with the ":port" suffix removed.
      alert_host: '{{ reReplaceAll ":(.*)" "" $labels.instance }}'
      alert_type: 'TCP连接数告警'
      severity: '重要'
      team: 'dwz-gt-monitor'
    annotations:
      summary: '{{ reReplaceAll ":(.*)" "" $labels.instance }} 等待关闭的TCP连接数过高'
      description: '{{ reReplaceAll ":(.*)" "" $labels.instance }} 等待关闭的TCP连接数TIME_WAIT过高大于15000 (当前值: {{ $value }})'
      
  # Inbound traffic per instance in MiB/s, summed over physical interfaces.
  # Alerts when it stays above 50 for 2 minutes.
  - alert: "服务器入口流量监控"
    # Changes from the original expression:
    #  * Label regexes are fully anchored, so the glob-style `virbr*` never
    #    matched "virbr0" and `lo*` meant "l" plus any number of "o".
    #    They are now `virbr.*` and `lo`.
    #  * rate() replaces irate(): irate only looks at the last two samples,
    #    so a brief dip can reset the `for` timer of an alert.
    expr: >-
      round(
      sum by (instance) (
      rate(node_network_receive_bytes_total{job="gt-dwz-node-exporter",device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])
      ) / 1024 / 1024
      ) > 50
    for: 2m
    labels:
      severity: "重要"
      team: dwz-gt-monitor
      alert_type: "流量告警"
      # Strip the ":port" suffix from the instance label.
      alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
    annotations:
      summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}监控入口流量过高"
      # Removed the duplicated "过高"; the value is a per-second rate, so MB/s.
      description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 监控入口流量过高大于50MB/s (告警值: {{ $value }}MB/s)"
      
  # Outbound traffic per instance in MiB/s, summed over physical interfaces.
  # Alerts when it stays above 50 for 2 minutes.
  - alert: "服务器出口流量监控"
    # Changes from the original expression:
    #  * Label regexes are fully anchored, so the glob-style `virbr*` never
    #    matched "virbr0" and `lo*` meant "l" plus any number of "o".
    #    They are now `virbr.*` and `lo`.
    #  * rate() replaces irate(): irate only looks at the last two samples,
    #    so a brief dip can reset the `for` timer of an alert.
    expr: >-
      round(
      sum by (instance) (
      rate(node_network_transmit_bytes_total{job="gt-dwz-node-exporter",device!~'tap.*|veth.*|br.*|docker.*|virbr.*|lo'}[5m])
      ) / 1024 / 1024
      ) > 50
    for: 2m
    labels:
      severity: "重要"
      team: dwz-gt-monitor
      alert_type: "流量告警"
      # Strip the ":port" suffix from the instance label.
      alert_host: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }}"
    annotations:
      summary: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 监控出口流量过高"
      # Removed the duplicated "过高"; the value is a per-second rate, so MB/s.
      description: "{{ reReplaceAll \":(.*)\" \"\" $labels.instance }} 监控出口流量过高大于50MB/s (告警值: {{ $value }}MB/s)"
[root@gtcq-gt-monitor-prometheus-01 rules]# 
4、测试告警

修改磁盘阈值如下:

  # Test variant of the disk-usage rule: the threshold is lowered to 10 so the
  # alert can be triggered on purpose.
  - alert: '磁盘使用率监控'
    expr: 'round((1 - (node_filesystem_avail_bytes{fstype=~"ext3|ext4|xfs|nfs",job="gt-dwz-node-exporter"} / node_filesystem_size_bytes{fstype=~"ext3|ext4|xfs|nfs",job="gt-dwz-node-exporter"})) * 100)  > 10'
    for: 2m
    labels:
      # Instance label with the ":port" suffix removed.
      alert_host: '{{ reReplaceAll ":(.*)" "" $labels.instance }}'
      alert_type: 'Disk告警'
      severity: '重要'
      team: 'dwz-gt-monitor'
    annotations:
      summary: '{{ reReplaceAll ":(.*)" "" $labels.instance }} : {{ $labels.mountpoint }} 分区使用率过高'
      description: '{{ reReplaceAll ":(.*)" "" $labels.instance }}的{{ $labels.mountpoint }} 分区使用大于10% (当前值: {{ $value }}%)'


  • 1
    点赞
  • 7
    收藏
    觉得还不错? 一键收藏
  • 4
    评论
评论 4
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值