通过 docker-compose 部署 Prometheus、node-exporter、Alertmanager 和 Grafana。撰写本文时 Prometheus 的最新版本为 2.19.2。
# Create the host directory layout used by the compose stack.
# The {a,b,c} list is brace expansion and must stay on ONE line with no
# whitespace inside the braces; otherwise the shell passes a literal
# "/home/prom/{" to mkdir and runs the next line as a separate command.
mkdir -p /home/prom/{prometheus,prometheus/data,alertmanager,grafana}
# World-writable data directories — presumably so the containers' non-root
# users can write to them. NOTE(review): 777 is very permissive; consider
# chown-ing to the container UIDs instead — confirm against the images used.
chmod 777 /home/prom/{prometheus/data,grafana}
cd /home/prom
# Show the resulting layout.
tree .
.
├── alertmanager
│   ├── alertmanager.yml
│   └── config.yml
├── docker-compose.yml
├── grafana
└── prometheus
    ├── alert-rules.yml
    ├── data
    └── prometheus.yml

4 directories, 5 files
Prometheus
vim /home/prom/prometheus/alert-rules.yml
groups:
- name: node-alert
rules:
# NodeDown: fires (severity critical) when a target of the "node" job has
# reported up == 0 continuously for 5 minutes.
- alert: NodeDown
  expr: up{job="node"} == 0
  for: 5m
  labels:
    severity: critical
    # Template actions must be written as an unbroken "{{ ... }}"; a line
    # break between the two opening braces folds to "{ {" inside a quoted
    # scalar and the text is then emitted literally instead of expanded.
    instance: "{{ $labels.instance }}"
  annotations:
    summary: "instance: {{ $labels.instance }} down"
    description: "Instance: {{ $labels.instance }} 已经宕机 5分钟"
    value: "{{ $value }}"
# NodeCpuHigh: warns when the per-instance average non-idle CPU share
# (100% minus the idle share, averaged over all CPUs) stays above 80%
# for 5 minutes.
- alert: NodeCpuHigh
  # rate() instead of irate(): irate() only uses the last two samples in
  # the window, so a brief dip can reset the `for` timer; the Prometheus
  # docs recommend rate() for alerting expressions.
  expr: (1 - avg by (instance) (rate(node_cpu_seconds_total{job="node",mode="idle"}[5m]))) * 100 > 80
  for: 5m
  labels:
    severity: warning
    # Keep "{{" unbroken; "{ {" is not a template action.
    instance: "{{ $labels.instance }}"
  annotations:
    summary: "instance: {{ $labels.instance }} cpu使用率过高"
    description: "CPU 使用率超过 80%"
    value: "{{ $value }}"
# NodeCpuIowaitHigh: warns when the per-instance average share of CPU time
# spent in iowait stays above 50% for 5 minutes.
- alert: NodeCpuIowaitHigh
  # rate() instead of irate(): irate() only uses the last two samples in
  # the window, so a brief dip can reset the `for` timer; the Prometheus
  # docs recommend rate() for alerting expressions.
  expr: avg by (instance) (rate(node_cpu_seconds_total{job="node",mode="iowait"}[5m])) * 100 > 50
  for: 5m
  labels:
    severity: warning
    # Keep "{{" unbroken; "{ {" is not a template action.
    instance: "{{ $labels.instance }}"
  annotations:
    summary: "instance: {{ $labels.instance }} cpu iowait 使用率过高"
    description: "CPU iowait 使用率超过 50%"
    value: "{{ $value }}"
# NodeLoad5High: warns when the 5-minute load average stays above 1.2x the
# instance's CPU count for 5 minutes. The CPU count is derived by counting
# the per-CPU mode="system" series of node_cpu_seconds_total.
- alert: NodeLoad5High
  # `on (instance)` is required: node_load5 carries both `instance` and
  # `job`, while `count by (instance)` yields series labelled only with
  # `instance`. Without an explicit matching clause the two sides never
  # share an identical label set, so the comparison returns nothing and
  # the alert can never fire.
  expr: node_load5{job="node"} > on (instance) (count by (instance) (node_cpu_seconds_total{job="node",mode="system"}) * 1.2)
  for: 5m
  labels:
    severity: warning
    # Keep "{{" unbroken; "{ {" is not a template action.
    instance: "{{ $labels.instance }}"
  annotations:
    summary: "instance: {{ $labels.instance }} load(5m) 过高"
    description: "Load(5m) 过高,超出cpu核数 1.2倍"
    value: "{{ $value }}"
# NodeMemoryHigh: warns when memory usage, computed as
# 1 - MemAvailable / MemTotal, stays above 90% for 5 minutes.
- alert: NodeMemoryHigh
  expr: (1 - node_memory_MemAvailable_bytes{job="node"} / node_memory_MemTotal_bytes{job="node"}) * 100 > 90
  for: 5m
  labels:
    severity: warning
    # Template actions must be written as an unbroken "{{ ... }}"; a line
    # break between the two opening braces folds to "{ {" inside a quoted
    # scalar and the text is then emitted literally instead of expanded.
    instance: "{{ $labels.instance }}"
  annotations:
    summary: "instance: {{ $labels.instance }} memory 使用率过高"
    description: "Memory 使用率超过 90%"
    value: "{{ $value }}"
- alert: NodeDiskRootHigh
expr: (1 - node_filesystem_avail_bytes{
job="node",fstype=~"ext.*|xfs",mountpoint ="/"} / node_filesystem_size_bytes{
job="node",fstype=~"ext.*|xfs",mountpoint ="/"}) * 100 > 90
for: 10m
labels:
severity: warning
instance: "{
{ $labels.instance }}"
annotations:
summary: "instance: {
{
$labels.instance }} disk(/ 分区) 使用率过高"
description: