Prometheus 启动脚本
/usr/lib/systemd/system/prometheus.service
# systemd unit that runs the Prometheus server as a supervised service.
[Unit]
Description=Prometheus
# Scraping and the web listener need the network; order the service after it
# is reported online so the first scrapes at boot do not fail.
Wants=network-online.target
After=network-online.target

[Service]
# Flags passed to the server:
#   --config.file                  main configuration file
#   --storage.tsdb.path            directory holding the TSDB data
#   --web.enable-lifecycle         enables the HTTP reload/quit endpoints
#   --storage.tsdb.retention.time  keep samples for 180 days
ExecStart=/data/prometheus/prometheus \
    --config.file=/data/prometheus/prometheus.yml \
    --storage.tsdb.path=/opt/prometheus \
    --web.enable-lifecycle \
    --storage.tsdb.retention.time=180d
# SIGHUP makes Prometheus re-read its configuration and rule files, so
# `systemctl reload prometheus` works without a restart.
ExecReload=/bin/kill -HUP $MAINPID
# Restart only on abnormal exit; a clean stop is left alone.
Restart=on-failure

[Install]
# Start automatically in the normal multi-user boot target.
WantedBy=multi-user.target
# Prometheus alerting rule groups.
# Each group holds one alert; `for` is how long the expression must stay true
# before the alert fires, and `severity` is the label attached to the alert.
# NOTE(review): the original contained this whole `groups:` document twice,
# byte for byte; the duplicate was dropped because duplicate keys and repeated
# group names are rejected when the rule file is loaded.
groups:
  # Target liveness: `up` is 0 when the last scrape of a target failed.
  - name: 实例存活告警规则
    rules:
      - alert: "实例存活告警"
        expr: up == 0
        for: 30s
        labels:
          severity: Disaster
        annotations:
          summary: "节点失联"
          description: "节点断联已超过30秒: {{ $labels.instance }}"

  # Memory usage computed as total minus (free + buffers + cached).
  - name: 内存告警规则
    rules:
      - alert: "内存使用率告警"
        # Fires when memory usage is above 75%.
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 75
        for: 30s
        labels:
          severity: warning
        annotations:
          summary: "服务器内存报警"
          description: "内存资源利用率大于75%!(当前值: {{ $value }}%)"

  # Filesystem usage per mount point.
  - name: 磁盘报警规则
    rules:
      - alert: 磁盘使用率告警
        # Fires when any mount point is more than 80% used.
        expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "服务器磁盘使用率报警"
          description: "服务器磁盘设备使用超过80%!(挂载点: {{ $labels.mountpoint }} 当前值: {{ $value }}%)"

  # CPU usage: 100 minus the idle percentage, averaged per instance.
  - name: cpu利用率
    rules:
      - alert: cpu使用率告警
        # Fires when the per-instance CPU usage is above 80%.
        # The aggregation keeps only the `instance` label, so the description
        # must reference `instance` rather than `mountpoint`.
        expr: (100 - avg(irate(node_cpu_seconds_total{mode="idle"}[1m])) by (instance) * 100 ) >80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "服务器CPU报警"
          description: "服务器CPU使用超过80%!(实例: {{ $labels.instance }} 当前值: {{ $value }}%)"

  # Memory usage computed from MemAvailable.
  - name: free使用告警
    rules:
      - alert: free空间使用率
        # Fires when 100 - MemAvailable/MemTotal*100 is above 80%.
        # Memory series carry no `mountpoint` label, so `instance` is used.
        expr: (100 - (node_memory_MemAvailable_bytes)/(node_memory_MemTotal_bytes) *100 ) >80
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "服务器free使用率报警"
          description: "服务器free使用超过80%!(实例: {{ $labels.instance }} 当前值: {{ $value }}%)"

  # Probe result: fires while `probe_success` stays 0.
  - name: 服务端口告警
    rules:
      - alert: 端口告警
        expr: probe_success == 0
        for: 1m
        labels:
          severity: warning
        annotations:
          summary: "服务器端口报警"
          # The value is 0 or 1, not a percentage, so no "%" suffix.
          description: "服务器端口异常服务不可以用 (实例: {{ $labels.instance }} 当前值: {{ $value }})"