好记性 不如烂笔头
一、服务端 环境部署(prometheus + node_exporter + Grafana )
yum install bash-completion vim wget lrzsz unzip net-tools -y
部署
1、下载
wget -c https://github.com/prometheus/prometheus/releases/download/v2.29.2/prometheus-2.29.2.linux-amd64.tar.gz
wget -c https://github.com/prometheus/node_exporter/releases/download/v1.2.2/node_exporter-1.2.2.linux-amd64.tar.gz
2、解压
tar -xf prometheus-2.29.2.linux-amd64.tar.gz -C /usr/local/
tar -xf node_exporter-1.2.2.linux-amd64.tar.gz -C /usr/local/
3、配置
cd /usr/local/
ln -s prometheus-2.29.2.linux-amd64 prometheus
ln -s node_exporter-1.2.2.linux-amd64/ node_exporter
cd prometheus
mkdir targets
cd targets/
cat /usr/local/prometheus/targets/nodes-all.yaml
- targets:
    - 192.168.0.58:9100
    - 192.168.0.59:9100
  labels:
    app: node-exporter
    job: node
[root@localhost ~]# cat /usr/local/prometheus/prometheus.yml |grep -vE '#|^$'
global:
alerting:
  alertmanagers:
    - static_configs:
        - targets:
rule_files:
scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]
  - job_name: "nodes"
    file_sd_configs:
      - files:
          - targets/nodes-*.yaml
        refresh_interval: 2m
#这里通过文件发现(file_sd_configs)来发现各个 node_exporter
4、启动:
cd /usr/local/prometheus ; nohup ./prometheus &
#node端服务器
cd /usr/local/node_exporter ; nohup ./node_exporter &
5、Grafana
wget -c https://dl.grafana.com/oss/release/grafana-8.1.2.linux-amd64.tar.gz
tar -xf grafana-8.1.2.linux-amd64.tar.gz -C /usr/local/
cd /usr/local/
ln -s grafana-8.1.2 grafana
cd /usr/local/grafana/bin && nohup ./grafana-server &
#[root@test grafana-6.7.4]# bin/grafana-server web &
浏览器访问 http://ip:3000 ,默认用户名/密码:admin / admin
这里有很多模板,可以直接使用:
https://grafana.com/grafana/dashboards
我node使用的是: 13105
ID:8919也给力
二、邮件告警
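wget -c https://github.com/prometheus/alertmanager/releases/download/v0.23.0/alertmanager-0.23.0.linux-amd64.tar.gz
tar -xf alertmanager-0.23.0.linux-amd64.tar.gz -C /usr/local/
cd /usr/local/ && ln -s alertmanager-0.23.0.linux-amd64 alertmanager
#完成下面第3步的 alertmanager.yml 配置后再启动:
#cd /usr/local/alertmanager && nohup ./alertmanager &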
1、prometheus主服务配置
cat prometheus.yml |grep -vE '#|^$'
global:
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 192.168.0.58:9093
rule_files:
  - "rules/*.yaml"
  - "alert_rules/*.yaml"
scrape_configs:
  - job_name: "prometheus"
    static_configs:
      - targets: ["localhost:9090"]
  - job_name: "nodes"
    file_sd_configs:
      - files:
          - targets/nodes-*.yaml
        refresh_interval: 2m
  - job_name: "alertmanager"
    static_configs:
      - targets: ["192.168.0.58:9093"]
2、告警规则
cat alert_rules/instance_down.yaml
groups:
  - name: AllInstances
    rules:
      - alert: InstanceDown
        expr: up == 0
        for: 20s
        annotations:
          title: "Instance down"
          description: 'Instance has been down for more than 20 seconds.'
        labels:
          severity: 'critical'
3、alertmanager.yml 路由设置
cat /usr/local/alertmanager/alertmanager.yml
route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 10s
  receiver: '收信箱'
receivers:
  - name: '收信箱'
    email_configs:
      - to: '收信箱'
        from: '发信箱@163.com'
        smarthost: 'smtp.163.com:465'
        auth_username: '发信箱@163.com'
        auth_identity: '发信箱@163.com'
        # auth_password 应填写邮箱的 SMTP 授权码(通常不是登录密码),请替换为自己的
        auth_password: 'QHILVALBZLYPIPCI'
        require_tls: false
        send_resolved: true
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
告警
解除告警
注:
如果没有收到解除告警,检查alertmanager.yml,是否有
send_resolved: true
-----------------------end