prometheus监控
- 部署安装docker
- 下载prometheus镜像并编写配置文件
prometheus.yml
# Prometheus server configuration: global timing, Alertmanager address,
# rule files, and the static scrape targets.
global:
  scrape_interval: 15s  # Scrape targets every 15 seconds (default is 1 minute).
  evaluation_interval: 15s  # Evaluate rules every 15 seconds (default is 1 minute).

# Address of the Alertmanager component that receives alerts.
alerting:
  alertmanagers:
    - static_configs:
        - targets: ['192.168.16.251:9093']

# Alerting rule files to load.
rule_files:
  - "rules.yml"

scrape_configs:
  - job_name: 'nodehost'
    # A single static_configs list holds one entry per host so that each host
    # keeps its own appname label; repeating the static_configs key would be a
    # duplicate mapping key.
    static_configs:
      - targets: ['192.168.16.251:9100']
        labels:
          appname: 'Node1'
      - targets: ['192.168.16.252:9100']
        labels:
          appname: 'Node2'

  - job_name: 'tomcat'
    static_configs:
      - targets: ['192.168.16.173:12345']
        labels:
          appname: 'mytest'

  - job_name: 'cadvisor'
    static_configs:
      - targets:
          - '192.168.16.251:8080'
          - '192.168.16.252:8080'
          - '192.168.16.173:8080'
        labels:
          appname: 'cadvisor'

  - job_name: 'prometheus'
    static_configs:
      - targets: ['192.168.16.251:9090']
        labels:
          appname: 'prometheus'
rules.yml
# Alerting rules, grouped by scope: target availability, host metrics,
# and container metrics.
groups:
  # Rule group: target availability.
  - name: example
    rules:
      - alert: InstanceDown  # Alert name.
        expr: up == 0  # PromQL expression that triggers the rule.
        for: 1m  # Condition must hold for one minute.
        labels:  # Labels describing the alert's category and severity.
          name: instance
          severity: Critical
        annotations:
          # Alert summary, taken from the target's appname label.
          summary: " {{ $labels.appname }}"
          # Alert message.
          description: " 服务停止运行 "
          # Current value of the alerting expression.
          value: "{{ $value }}%"

  # Rule group: host-level metrics (node_* series).
  - name: Host
    rules:
      - alert: HostMemory Usage
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 80
        for: 1m
        labels:
          name: Memory
          severity: Warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: "宿主机内存使用率超过80%."
          value: "{{ $value }}"

      - alert: HostCPU Usage
        expr: sum(avg without (cpu)(irate(node_cpu_seconds_total{mode!='idle'}[5m]))) by (instance,appname) > 0.65
        for: 1m
        labels:
          name: CPU
          severity: Warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: "宿主机CPU使用率超过65%."
          value: "{{ $value }}"

      - alert: HostLoad
        expr: node_load5 > 4
        for: 1m
        labels:
          name: Load
          severity: Warning
        annotations:
          summary: "{{ $labels.appname }} "
          description: " 主机负载5分钟超过4."
          value: "{{ $value }}"

      - alert: HostFilesystem Usage
        # Scaled to a percentage so the "%" suffix in the value annotation
        # matches the number that is printed.
        expr: (1 - node_filesystem_free_bytes / node_filesystem_size_bytes) * 100 > 80
        for: 1m
        labels:
          name: Disk
          severity: Warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: " 宿主机 [ {{ $labels.mountpoint }} ]分区使用超过80%."
          value: "{{ $value }}%"

      - alert: HostDiskio
        # The selector must name the scrape job that exposes the node_* series
        # ('nodehost' in prometheus.yml); a job called "Host" does not exist.
        expr: irate(node_disk_writes_completed_total{job="nodehost"}[1m]) > 10
        for: 1m
        labels:
          name: Diskio
          severity: Warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: " 宿主机 [{{ $labels.device }}]磁盘1分钟平均写入IO负载较高."
          value: "{{ $value }}iops"

      - alert: Network_receive
        # bytes/s divided by 1048576, i.e. the threshold is 3 MB/s.
        expr: irate(node_network_receive_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1048576 > 3
        for: 1m
        labels:
          name: Network_receive
          severity: Warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: " 宿主机 [{{ $labels.device }}] 网卡5分钟平均接收流量超过3MB/s."
          value: "{{ $value }}MB/s"

      - alert: Network_transmit
        # bytes/s divided by 1048576, i.e. the threshold is 3 MB/s.
        expr: irate(node_network_transmit_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1048576 > 3
        for: 1m
        labels:
          name: Network_transmit
          severity: Warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: " 宿主机 [{{ $labels.device }}] 网卡5分钟内平均发送流量超过3MB/s."
          value: "{{ $value }}MB/s"

  # Rule group: per-container metrics (container_* series).
  - name: Container
    rules:
      - alert: ContainerCPU Usage
        expr: (sum by(name,instance) (rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 60
        for: 1m
        labels:
          name: CPU
          severity: Warning
        annotations:
          summary: "{{ $labels.name }} "
          description: " 容器CPU使用超过60%."
          value: "{{ $value }}%"

      - alert: ContainerMem Usage
        # Alternative, relative to the container's memory limit:
        # expr: (container_memory_usage_bytes - container_memory_cache) / container_spec_memory_limit_bytes * 100 > 10
        # bytes divided by 1048576, so the printed value is in MB (1024 MB = 1 GB).
        expr: container_memory_usage_bytes{name=~".+"} / 1048576 > 1024
        for: 1m
        labels:
          name: Memory
          severity: Warning
        annotations:
          summary: "{{ $labels.name }} "
          description: " 容器内存使用超过1GB."
          value: "{{ $value }}MB"
- 启动prometheus
# Start Prometheus detached, publishing port 9090 and mounting the host
# configuration directory over /etc/prometheus/.
docker run -d \
  --name=prometheus \
  -p 9090:9090 \
  -v /root/prometheus/conf/:/etc/prometheus/ \
  prom/prometheus
- 下载部署grafana
# Start Grafana detached on the host network.
docker run -d \
  --name grafana \
  --network host \
  grafana/grafana
默认用户名密码都是admin
- 下载alertmanager并编写配置文件
alertmanager.yml
# Alertmanager configuration: SMTP settings, message templates, routing tree,
# and the email / WeChat receivers.
global:
  resolve_timeout: 2m
  smtp_smarthost: 'smtp.163.com:25'
  smtp_from: '12345678@163.com'
  smtp_auth_username: '12345678@163.com'
  # Authorization code. Kept as a quoted string; the explanatory note belongs
  # in this comment, not in the value itself.
  smtp_auth_password: '123456'

# Notification message templates.
templates:
  - '/etc/alertmanager/template/wechat.tmpl'

route:
  # NOTE(review): the visible rules do not set an "alertname_wechat" label, so
  # all alerts land in one group; confirm whether 'alertname' was intended.
  group_by: ['alertname_wechat']
  group_wait: 30s
  group_interval: 60s
  receiver: 'wechat'  # Default receiver: WeChat is used first.
  repeat_interval: 1h
  routes:  # Child routes: matching alerts are sent by email.
    - receiver: email
      match_re:
        # NOTE(review): the visible rules only emit severity Critical/Warning;
        # adjust this pattern to the severities that should go to email.
        severity: email

receivers:
  - name: 'email'
    email_configs:
      - to: '11111122@qq.com'
        send_resolved: true  # Also notify when the alert is resolved.

  # Receiver names must be unique, so the WeChat API integration and the
  # webhook integration are both configured on the single 'wechat' receiver.
  - name: 'wechat'
    wechat_configs:
      - corp_id: 'wwd402ce40b1120f24'  # Enterprise (corp) ID.
        to_party: '2'  # ID of the group to notify.
        agent_id: '1000002'
        api_secret: '9nmYa4pWq63sQ123kToCbh_oNc'  # Generated secret.
        send_resolved: true
    webhook_configs:
      - url: 'http://localhost:80/alert'
        send_resolved: true
- 部署alertmanager
# Start Alertmanager detached, publishing port 9093 and mounting the
# configuration file and the template directory from the host.
docker run -d \
  --name alertmanager \
  -p 9093:9093 \
  -v /root/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml \
  -v /root/alertmanager/template:/etc/alertmanager/template \
  docker.io/prom/alertmanager:latest
- 下载及部署webhook
# Start the WeChat webhook bridge detached on the host network; the robot
# tokens are passed in as environment variables.
docker run -d \
  --name webhook \
  --network host \
  --env DEFAULT_ROBOT_TOKENS='02acea05-b005-48c8-b743-a076f7eb0c51' \
  --env CRITICAL_ROBOT_TOKENS='02acea05-b005-48c8-b743-a076f7eb0c51' \
  docker.io/jinxinking/wechat-webhook
- docker-compose方式
# Compose file for the Prometheus server.
version: "3"
services:
  prometheus:
    image: prom/prometheus:latest
    container_name: "prometheus"
    user: root
    restart: always
    environment:
      TZ: Asia/Shanghai
    ports:
      - "9090:9090"
    volumes:
      - "./prometheus.yml:/etc/prometheus/prometheus.yml"
      # prometheus.yml lists "rules.yml" under rule_files, so the rule file
      # has to be mounted next to it; ./rules.yml must exist on the host.
      - "./rules.yml:/etc/prometheus/rules.yml"
      - "./prometheus_data:/prometheus"
    # Attach the service to the network declared at the bottom of this file.
    networks:
      - prometheus

# A network referenced by a service must also be declared at top level.
networks:
  prometheus: {}
- node-exporter部署
# Compose file that runs node-exporter and publishes its port 9100.
# NOTE(review): no host filesystem or PID namespace is shared with the
# container here — confirm the exported metrics describe the host as intended.
version: "3"
services:
  node-exporter:
    container_name: "node-exporter"
    image: 13.25.21.14:80/library/node-exporter:latest
    restart: always
    ports:
      - "9100:9100"
    environment:
      - TZ=Asia/Shanghai