在之前的博客中,已经说明了报警的一般步骤和前置条件,本文记录具体的配置与操作过程。
1 编写Prometheus配置
配置信息:
# my global config
global:
  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 192.168.156.135:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
  # - "first_rules.yml"
  # - "second_rules.yml"
  - "/opt/prometheus/prometheus-2.6.1.linux-amd64/rules/*.rules"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    # metrics_path defaults to '/metrics'
    # scheme defaults to 'http'.
    static_configs:
      - targets: ['localhost:9090']
  - job_name: 'agent1'
    static_configs:
      - targets: ['192.168.156.135:9100']
  - job_name: pushgateway
    honor_labels: true
    static_configs:
      - targets: ['192.168.156.135:9091']
        labels:
          instance: pushgateway
实际操作:
[root@localhost prometheus-2.6.1.linux-amd64]# vim prometheus.yml
# my global config
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
- 192.168.156.135:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
- "/opt/prometheus/prometheus-2.6.1.linux-amd64/rules/*.rules"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: 'prometheus'
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
static_configs:
- targets: ['localhost:9090']
- job_name: 'agent1'
static_configs:
- targets: ['192.168.156.135:9100']
- job_name: pushgateway
honor_labels: true
static_configs:
- targets: ['192.168.156.135:9091']
labels:
instance: pushgateway
[root@localhost prometheus-2.6.1.linux-amd64]#
2 编写报警规则
配置信息:
groups:
  - name: node-up
    rules:
      - alert: node-up
        expr: up{job="agent1"} == 0
        for: 15s
        labels:
          severity: 1
          team: node
        annotations:
          summary: "{{ $labels.instance }} 已停止运行超过 15s!"
          description: "{{ $labels.instance }} 检测到异常停止!请重点关注!!!"
实际操作:
[root@localhost rules]# vim node-up.rules
groups:
- name: node-up
rules:
- alert: node-up
expr: up{job="agent1"} == 0
for: 15s
labels:
severity: 1
team: node
annotations:
summary: "{{ $labels.instance }} 已停止运行超过 15s!"
description: "{{ $labels.instance }} 检测到异常停止!请重点关注!!!"
[root@localhost rules]#
[root@localhost rules]# pwd
/opt/prometheus/prometheus-2.6.1.linux-amd64/rules
[root@localhost rules]# ls
node-up.rules
[root@localhost rules]#
3 编写alertmanager配置
配置信息:
global:
  resolve_timeout: 5m
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'prometheusalert-feishu'
  # receiver: 'web.hook.prometheusalert'
  # routes:
  # - receiver: 'prometheusalert-feishu'
  # group_wait: 10s
  # match:
  # level: '2'
receivers:
  #- name: 'web.hook.prometheusalert'
  # webhook_configs:
  # - url: "http://[prometheusalert_url]:8080/prometheusalert/alert"
  - name: 'prometheusalert-feishu'
    webhook_configs:
      - url: "http://192.168.156.135:8080/prometheusalert?type=fs&tpl=prometheus-fsv2&fsurl=https://open.feishu.cn/open-apis/bot/v2/hook/xxx-3fba-4903-87d3-xxxx"
实际操作:
[root@localhost alertmanager]# vim alertmanager2.yml
global:
resolve_timeout: 5m
route:
group_by: ['alertname']
group_wait: 5s
group_interval: 5s
repeat_interval: 5m
receiver: 'prometheusalert-feishu'
# receiver: 'web.hook.prometheusalert'
# routes:
# - receiver: 'prometheusalert-feishu'
# group_wait: 10s
# match:
# level: '2'
receivers:
#- name: 'web.hook.prometheusalert'
# webhook_configs:
# - url: "http://[prometheusalert_url]:8080/prometheusalert/alert"
- name: 'prometheusalert-feishu'
webhook_configs:
- url: "http://192.168.156.135:8080/prometheusalert?type=fs&tpl=prometheus-fsv2&fsurl=https://open.feishu.cn/open-apis/bot/v2/hook/xxx-3fba-4903-87d3-xxx"
"alertmanager2.yml" 22L, 660C 已写入
[root@localhost alertmanager]# ./amtool check-config alertmanager2.yml
Checking 'alertmanager2.yml' SUCCESS
Found:
- global config
- route
- 0 inhibit rules
- 1 receivers
- 0 templates
[root@localhost alertmanager]# ls
alertmanager alertmanager1.yml alertmanager2.yml alertmanager.yml amtool data LICENSE NOTICE template
[root@localhost alertmanager]# pwd
/opt/prometheus/alertmanager
[root@localhost alertmanager]#
4 重启服务
4.1 启动PrometheusAlert
在PrometheusAlert的安装目录下面启动
./PrometheusAlert
4.2 重启Prometheus
在安装目录中执行如下操作
[root@localhost prometheus-2.6.1.linux-amd64]# pwd
/opt/prometheus/prometheus-2.6.1.linux-amd64
[root@localhost prometheus-2.6.1.linux-amd64]# ls
console_libraries consoles data LICENSE NOTICE prometheus prometheus.yml promtool rules
启动服务
[root@localhost prometheus-2.6.1.linux-amd64]#
[root@localhost prometheus-2.6.1.linux-amd64]# pkill prometheus
[root@localhost prometheus-2.6.1.linux-amd64]# lsof -i:9090
[root@localhost prometheus-2.6.1.linux-amd64]# ./prometheus --config.file=prometheus.yml &
4.3 重启alertmanager服务
[root@localhost alertmanager]# pwd
/opt/prometheus/alertmanager
[root@localhost alertmanager]# ls
alertmanager alertmanager1.yml alertmanager2.yml alertmanager.yml amtool data LICENSE NOTICE template
[root@localhost alertmanager]#
启动服务
[root@localhost alertmanager]# ./alertmanager --config.file=alertmanager2.yml
5 关闭node节点制造错误
[root@localhost node_export]# pwd
/opt/node_export
[root@localhost node_export]# ls
LICENSE node_exporter nohup.out NOTICE
[root@localhost node_export]#
5.1 关闭node
[root@localhost node_export]# lsof -i:9100
COMMAND PID USER FD TYPE DEVICE SIZE/OFF NODE NAME
prometheu 95405 root 19u IPv4 1035389 0t0 TCP localhost.localdomain:59670->localhost.localdomain:jetdirect (ESTABLISHED)
node_expo 96011 root 3u IPv6 1034103 0t0 TCP *:jetdirect (LISTEN)
node_expo 96011 root 5u IPv6 1035390 0t0 TCP localhost.localdomain:jetdirect->localhost.localdomain:59670 (ESTABLISHED)
[root@localhost node_export]# kill 96011
[root@localhost node_export]#
查看消息
5.2 重启node
[root@localhost node_export]# nohup ./node_exporter &
[7] 96267
[6] 已终止 nohup ./node_exporter
[root@localhost node_export]# nohup: 忽略输入并把输出追加到"nohup.out"
[root@localhost node_export]#
查看消息