Prometheus（7）Prometheus + Alertmanager配置飞书告警

最新推荐文章于 2024-09-15 14:50:06 发布

?abc!

最新推荐文章于 2024-09-15 14:50:06 发布

阅读量5k

点赞数 4

分类专栏： # Prometheus 文章标签：运维 docker 容器

本文链接：https://blog.csdn.net/yyuggjggg/article/details/122862496

版权

Prometheus 专栏收录该内容

16 篇文章 2 订阅

订阅专栏

在之前的博客中，已经说明了报警的一般步骤和前置条件。

1 编写Prometheus配置

配置信息：

# my global config
global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
       - 192.168.156.135:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - "/opt/prometheus/prometheus-2.6.1.linux-amd64/rules/*.rules"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
    - targets: ['localhost:9090']
  - job_name: 'agent1'
    static_configs: 
    - targets: ['192.168.156.135:9100']
  - job_name: pushgateway
    honor_labels: true
    static_configs:
      - targets: ['192.168.156.135:9091']
        labels:
          instance: pushgateway

实际操作：

[root@localhost prometheus-2.6.1.linux-amd64]# vim prometheus.yml 
# my global config
global:
  scrape_interval:     15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
  - static_configs:
    - targets:
       - 192.168.156.135:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - "/opt/prometheus/prometheus-2.6.1.linux-amd64/rules/*.rules"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'

    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.

    static_configs:
    - targets: ['localhost:9090']
  - job_name: 'agent1'
    static_configs: 
    - targets: ['192.168.156.135:9100']
  - job_name: pushgateway
    honor_labels: true
    static_configs:
      - targets: ['192.168.156.135:9091']
        labels:
          instance: pushgateway
[root@localhost prometheus-2.6.1.linux-amd64]#

2 编写报警规则

配置信息：

groups:
- name: node-up
  rules:
  - alert: node-up
    expr: up{job="agent1"} == 0
    for: 15s
    labels:
      severity: 1
      team: node
    annotations:
      summary: "{{ $labels.instance }} 已停止运行超过 15s！"
      description: "{{ $labels.instance }} 检测到异常停止！请重点关注！！！"

实际操作：

[root@localhost rules]# vim node-up.rules 
groups:
- name: node-up
  rules:
  - alert: node-up
    expr: up{job="agent1"} == 0
    for: 15s
    labels:
      severity: 1
      team: node
    annotations:
      summary: "{{ $labels.instance }} 已停止运行超过 15s！"
      description: "{{ $labels.instance }} 检测到异常停止！请重点关注！！！"
[root@localhost rules]# 
[root@localhost rules]# pwd
/opt/prometheus/prometheus-2.6.1.linux-amd64/rules
[root@localhost rules]# ls
node-up.rules
[root@localhost rules]#

3 编写alertmanager配置

配置信息：

global:
  resolve_timeout: 5m
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'prometheusalert-feishu'
#  receiver: 'web.hook.prometheusalert'
#  routes:
#  - receiver: 'prometheusalert-feishu'
#    group_wait: 10s
#    match:
#      level: '2'
receivers:
#- name: 'web.hook.prometheusalert'
#  webhook_configs:
#  - url: "http://[prometheusalert_url]:8080/prometheusalert/alert"
- name: 'prometheusalert-feishu'
  webhook_configs:
  - url: "http://192.168.156.135:8080/prometheusalert?type=fs&tpl=prometheus-fsv2&fsurl=https://open.feishu.cn/open-apis/bot/v2/hook/xxx-3fba-4903-87d3-xxxx"

实际操作：

[root@localhost alertmanager]# vim  alertmanager2.yml 
global:
  resolve_timeout: 5m
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'prometheusalert-feishu'
#  receiver: 'web.hook.prometheusalert'
#  routes:
#  - receiver: 'prometheusalert-feishu'
#    group_wait: 10s
#    match:
#      level: '2'
receivers:
#- name: 'web.hook.prometheusalert'
#  webhook_configs:
#  - url: "http://[prometheusalert_url]:8080/prometheusalert/alert"
- name: 'prometheusalert-feishu'
  webhook_configs:
  - url: "http://192.168.156.135:8080/prometheusalert?type=fs&tpl=prometheus-fsv2&fsurl=https://open.feishu.cn/open-apis/bot/v2/hook/xxx-3fba-4903-87d3-xxx"
"alertmanager2.yml" 22L, 660C 已写入                                                                             
[root@localhost alertmanager]# ./amtool check-config  alertmanager2.yml
Checking 'alertmanager2.yml'  SUCCESS
Found:
 - global config
 - route
 - 0 inhibit rules
 - 1 receivers
 - 0 templates

[root@localhost alertmanager]# ls
alertmanager  alertmanager1.yml  alertmanager2.yml  alertmanager.yml  amtool  data  LICENSE  NOTICE  template
[root@localhost alertmanager]# pwd
/opt/prometheus/alertmanager
[root@localhost alertmanager]#

4 重启服务

4.1 启动PrometheusAlert

在PrometheusAlert的安装目录下面启动

./PrometheusAlert

4.2 重启Prometheus

在安装目录下执行如下操作

[root@localhost prometheus-2.6.1.linux-amd64]# pwd
/opt/prometheus/prometheus-2.6.1.linux-amd64
[root@localhost prometheus-2.6.1.linux-amd64]# ls
console_libraries  consoles  data  LICENSE  NOTICE  prometheus  prometheus.yml  promtool  rules

启动服务

[root@localhost prometheus-2.6.1.linux-amd64]# 
[root@localhost prometheus-2.6.1.linux-amd64]# pkill prometheus
[root@localhost prometheus-2.6.1.linux-amd64]# lsof -i:9090
[root@localhost prometheus-2.6.1.linux-amd64]# ./prometheus --config.file=prometheus.yml &

4.3 重启alertmanager服务

[root@localhost alertmanager]# pwd
/opt/prometheus/alertmanager
[root@localhost alertmanager]# ls
alertmanager  alertmanager1.yml  alertmanager2.yml  alertmanager.yml  amtool  data  LICENSE  NOTICE  template
[root@localhost alertmanager]#

启动服务

[root@localhost alertmanager]# ./alertmanager --config.file=alertmanager2.yml

5 关闭node节点制造错误

[root@localhost node_export]# pwd
/opt/node_export
[root@localhost node_export]# ls
LICENSE  node_exporter  nohup.out  NOTICE
[root@localhost node_export]#

5.1 关闭node

[root@localhost node_export]# lsof -i:9100
COMMAND     PID USER   FD   TYPE  DEVICE SIZE/OFF NODE NAME
prometheu 95405 root   19u  IPv4 1035389      0t0  TCP localhost.localdomain:59670->localhost.localdomain:jetdirect (ESTABLISHED)
node_expo 96011 root    3u  IPv6 1034103      0t0  TCP *:jetdirect (LISTEN)
node_expo 96011 root    5u  IPv6 1035390      0t0  TCP localhost.localdomain:jetdirect->localhost.localdomain:59670 (ESTABLISHED)
[root@localhost node_export]# kill 96011
[root@localhost node_export]#

查看消息
在这里插入图片描述

5.2 重启node

[root@localhost node_export]# nohup ./node_exporter &
[7] 96267
[6]   已终止               nohup ./node_exporter
[root@localhost node_export]# nohup: 忽略输入并把输出追加到"nohup.out"

[root@localhost node_export]#