1、安装prometheus
#下载、解压、创建软链接
wget https://github.com/prometheus/prometheus/releases/download/v2.13.0/prometheus-2.13.0.linux-amd64.tar.gz
(这里面的版本可以自己去官网找自己需要的版本)
tar -xf prometheus-2.13.0.linux-amd64.tar.gz
mv prometheus-2.13.0.linux-amd64 /usr/local/
ln -s /usr/local/prometheus-2.13.0.linux-amd64/ /usr/local/prometheus
prometheus.yml 文件配置详解 (目录所在位置/usr/local/prometheus)
# 全局配置
global:
scrape_interval: 15s # 设置抓取间隔,默认为1分钟
evaluation_interval: 15s # 规则评估周期,每15秒评估一次规则,默认为1分钟
# scrape_timeout #默认抓取超时,默认为10s
# Alertmanager相关配置
alerting:
alertmanagers:
- static_configs:
- targets:
# - alertmanager:9093
# 规则文件列表,加载后按全局 'evaluation_interval' 周期性地评估规则
rule_files:
# - "first_rules.yml"
# - "second_rules.yml"
# 抓取配置列表
scrape_configs:
- job_name: 'prometheus'
static_configs:
- targets: ['localhost:9090']
#创建prometheus的用户
useradd -s /sbin/nologin -M prometheus
#创建一个数据存储目录
mkdir /data/prometheus -p
#修改目录属主
chown -R prometheus:prometheus /usr/local/prometheus/
chown -R prometheus:prometheus /data/prometheus/
#创建Systemd服务启动prometheus
vim /etc/systemd/system/prometheus.service
[Unit]
Description=Prometheus
Documentation=https://prometheus.io/
After=network.target
[Service]
Type=simple
User=prometheus
ExecStart=/usr/local/prometheus/prometheus --config.file=/usr/local/prometheus/prometheus.yml --storage.tsdb.path=/data/prometheus
Restart=on-failure
[Install]
WantedBy=multi-user.target
#启动
systemctl start prometheus
systemctl status prometheus
systemctl enable prometheus
浏览器访问 http://IP:9090 ,能打开页面即表示prometheus搭建完成
2、grafana搭建
#下载、解压、创建软链接
wget https://dl.grafana.com/oss/release/grafana-6.4.2.linux-amd64.tar.gz
tar -zxvf grafana-6.4.2.linux-amd64.tar.gz
mv grafana-6.4.2 /usr/local/
ln -s /usr/local/grafana-6.4.2/ /usr/local/grafana
#创建用户
useradd -s /sbin/nologin -M grafana
#创建一个数据存储目录
mkdir /data/grafana
#修改目录属主
chown -R grafana:grafana /usr/local/grafana/
chown -R grafana:grafana /data/grafana/
#修改grafana的配置文件
vim /usr/local/grafana/conf/defaults.ini
data = /data/grafana/data
logs = /data/grafana/log
plugins = /data/grafana/plugins
provisioning = /data/grafana/conf/provisioning
#创建Systemd服务启动grafana
vim /etc/systemd/system/grafana-server.service
[Unit]
Description=Grafana
After=network.target
[Service]
User=grafana
Group=grafana
Type=notify
ExecStart=/usr/local/grafana/bin/grafana-server -homepath /usr/local/grafana
Restart=on-failure
[Install]
WantedBy=multi-user.target
#启动grafana
systemctl start grafana-server
systemctl status grafana-server
systemctl enable grafana-server
#访问 http://IP:3000 ,能打开页面即表示grafana安装完成
3、安装 node_exporter 节点
wget https://github.com/prometheus/node_exporter/releases/download/v0.18.1/node_exporter-0.18.1.linux-amd64.tar.gz
tar -xf node_exporter-0.18.1.linux-amd64.tar.gz
#新建一个目录专门安装各种exporter
mkdir -p /usr/local/prometheus_exporter
mv node_exporter-0.18.1.linux-amd64 /usr/local/prometheus_exporter/
cd /usr/local/prometheus_exporter/
ln -s node_exporter-0.18.1.linux-amd64/ node_exporter
#也可以使用nohup在后台启动(当前目录下的node_exporter是指向目录的软链接,可执行文件在该目录内)
nohup ./node_exporter/node_exporter &
启动
/usr/local/prometheus_exporter/node_exporter/node_exporter
在 prometheus.yml里面添加
- job_name: 'vm-node'
static_configs:
- targets:
- '192.168.241.142:9100'
这里注意一定要用这种缩进格式 (因为Prometheus的yaml配置文件对缩进和格式有严格的规范,缩进错误会导致配置加载失败)
写完之后进行重启Prometheus
去Grafana上导入模板就可以 直接引用
然后在grafana上导模板
4、邮件告警
这里的告警是基于grafana上搭建的
vim /usr/local/grafana/conf/defaults.ini
重启一下grafana
systemctl restart grafana-server
访问 grafana页面
创建邮件
然后去邮件中查看
以上是基于grafana做的邮件告警,但是感觉邮件很少有人看。下面改用webhook的方式,可以接入飞书、钉钉、企业微信。
5、接入alertmanager + PrometheusAlert 接入飞书 webhook
需要自己下载alertmanager
cat alertmanager.yml
global:
resolve_timeout: 5m
route:
group_by: ['alertname']
group_wait: 5s
group_interval: 5s
repeat_interval: 5m
receiver: 'prometheusalert-feishu'
receivers:
- name: 'prometheusalert-feishu'
webhook_configs:
- url: (这里面写prometheusalert的地址+飞书的webhook地址)
下载PrometheusAlert 配置文件
cat app.conf
prometheus 的告警规则
cd prometheus
➜ prometheus ls
a.sh console_libraries consoles LICENSE NOTICE prometheus prometheus.yml prometheus.yml.bak promtool rules tsdb
➜ prometheus cat rules/
cat: rules/: 是一个目录
➜ prometheus cd rules
➜ rules ls
cpu_over.yml disk_over.yml memory_over.yml node-rules.yml node-up.rules rules.yml
➜ rules cat cpu_over.yml
groups:
- name: CPU报警规则
rules:
- alert: CPU使用率告警
expr: 100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 80
for: 1m
labels:
user: prometheus
severity: warning
annotations:
description: "服务器: CPU使用超过80%!(当前值: {{ $value }}%)"
➜ rules cat disk_over.yml
groups:
- name: 磁盘报警规则
rules:
- alert: 磁盘使用率告警
expr: 100 - (node_filesystem_free_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"} * 100 ) > 85
for: 1m
labels:
user: prometheus
severity: warning
annotations:
description: "服务器: 磁盘设备: 使用超过85%!(挂载点: {{ $labels.mountpoint }} 当前值: {{ $value }}%)"
➜ rules cat memory_over.yml
groups:
- name: 内存报警规则
rules:
- alert: 内存使用率告警
expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 85
for: 1m
labels:
user: prometheus
severity: warning
annotations:
description: "服务器: 内存使用超过85%!(当前值: {{ $value }}%)"
➜ rules cat node-rules.yml
groups:
- name: 实例存活告警规则
rules:
- alert: 实例存活告警
expr: up == 0
for: 1m
labels:
user: prometheus
severity: warning
annotations:
description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."
➜ rules cat node-up.rules
groups:
- name: node-up
rules:
- alert: node-up
expr: up{job="agent1"} == 0
for: 15s
labels:
severity: 1
team: node
annotations:
summary: "{{ $labels.instance }} 已停止运行超过 15s!"
description: "{{ $labels.instance }} 检测到异常停止!请重点关注!!!"
prometheus.yml
cat prometheus.yml
# my global config
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
- '本地机器:9093'
# - alertmanager:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
# 注意:下面的通配符只匹配 .yml 结尾的文件,rules 目录下的 node-up.rules 不会被加载,如需生效请改名为 .yml 或单独添加一行
- "/usr/local/prometheus/rules/*.yml"
# - "first_rules.yml"
# - "second_rules.yml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: 'prometheus'
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
static_configs:
- targets: ['localhost:9090']
- job_name: 'prod-node'
static_configs:
- targets:
- 'ip地址:9100'
- job_name: java
scrape_interval: 10s
metrics_path: '/actuator/prometheus'
static_configs:
- targets: ['IP地址:应用端口']
labels:
service_name: 'celldb-web'
(最后一个是基于java 应用做的监控)
启动alertmanager (设置开机自启动)
cat /usr/lib/systemd/system/alertmanager.service
[Unit]
Description=alertmanager
Documentation=https://prometheus.io/
After=network.target
[Service]
User=prometheus
Group=prometheus
ExecStart=/usr/local/alertmanager/alertmanager --config.file=/usr/local/alertmanager/alertmanager.yml --storage.path=/mnt/disk2/alertmanager
Restart=on-failure
[Install]
WantedBy=multi-user.target
systemctl restart alertmanager.service
systemctl enable alertmanager.service
我们启动PrometheusAlert
./PrometheusAlert &
测试 基于我们生产环境做的文档