文章目录
部署前的准备工作
创建数据目录并设置权限
# Create the compose project directory and the two host data directories.
# -p creates missing parent directories and does not fail when the path already exists.
$ mkdir -p /data/monitor/prometheus/
$ mkdir -p /data/data/{prometheus,grafana}
# NOTE(review): mode 777 makes the Grafana data directory world-writable;
# consider chown-ing it to the UID the Grafana container runs as instead.
$ sudo chmod 777 -R /data/data/grafana
Prometheus 的数据目录需要归属于一个 UID/GID 为 65534 的用户(下面在宿主机上创建名为 prometheus 的用户和组),否则容器启动时会因无法写入数据目录而报权限错误。
# Register a "prometheus" group and user (UID/GID 65534) on the host and hand the
# Prometheus data directory over to it.
# `sudo echo ... >> file` does not work: the redirection is performed by the calling
# (unprivileged) shell, so the append is done through `sudo tee -a` instead.
# The getent guards keep the commands from adding duplicate entries when re-run.
$ getent group prometheus > /dev/null || echo "prometheus:x:65534:" | sudo tee -a /etc/group > /dev/null
$ getent passwd prometheus > /dev/null || echo "prometheus:x:65534:65534:nobody:/home:/bin/false" | sudo tee -a /etc/passwd > /dev/null
$ sudo chown -R prometheus:prometheus /data/data/prometheus
编写yml文件
$ cd /data/monitor/prometheus/
docker-compose.yml
内容:
# Monitoring stack: Alertmanager, Prometheus, Grafana and the Grafana image renderer.
# All services share the "monitor" bridge network; Prometheus and Grafana persist
# their data in named volumes that are bind-mounted from /data/data/<service>.
version: '3.7'

services:
  # Disabled: node-exporter is deployed per host outside of this compose file.
  # node-exporter:
  #   image: prom/node-exporter:latest
  #   container_name: monitoring_node_exporter
  #   restart: unless-stopped
  #   volumes:
  #     - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime
  #   ports:
  #     - "9100:9100"
  #   networks:
  #     - monitor

  # Disabled: cAdvisor is deployed per host outside of this compose file.
  # cadvisor:
  #   image: google/cadvisor:latest
  #   container_name: cadvisor
  #   restart: unless-stopped
  #   volumes:
  #     - /:/rootfs:ro
  #     - /var/run:/var/run:rw
  #     - /sys:/sys:ro
  #     - /data/data/docker/:/var/lib/docker:ro
  #     - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime
  #   ports:
  #     - "9101:8080"

  alertmanager:
    image: prom/alertmanager:latest
    hostname: alertmanager
    container_name: alertmanager
    restart: always
    volumes:
      - type: bind
        source: ./alertmanager/alertmanager.yml
        target: /etc/alertmanager/alertmanager.yml
        read_only: true
      - ./alertmanager/template:/etc/alertmanager/template
      # Host zoneinfo file, mounted read-only so the container cannot modify it.
      - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
    ports:
      - "9093:9093"
      - "9094:9094"
    networks:
      - monitor

  prometheus:
    depends_on:
      - alertmanager
    image: prom/prometheus:latest
    hostname: prometheus
    container_name: prometheus
    restart: unless-stopped
    volumes:
      - type: bind
        source: ./prometheus/prometheus.yml
        target: /etc/prometheus/prometheus.yml
        # read_only: true
      - type: bind
        source: ./prometheus/alert-rules.yml
        target: /etc/prometheus/alert-rules.yml
        # read_only: true
      - type: volume
        source: prometheus
        target: /prometheus
      - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
      - ./prometheus/rules/:/etc/prometheus/rules/
      - ./prometheus/targets/:/etc/prometheus/targets/
    command:
      # Path of the configuration file inside the container.
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      # Enable the HTTP lifecycle endpoints (configuration hot reload).
      - '--web.enable-lifecycle'
      - '--web.external-url=prometheus'
      - '--web.enable-admin-api'
    ports:
      # Host port 9095 maps to the Prometheus port 9090 inside the container.
      - "9095:9090"
    networks:
      - monitor

  # Dashboards / visualisation.
  grafana:
    depends_on:
      - prometheus
    image: grafana/grafana:latest
    hostname: grafana
    container_name: grafana
    restart: unless-stopped
    volumes:
      - type: volume
        source: grafana
        target: /var/lib/grafana
      - ./grafana/grafana-public:/usr/share/grafana/public
      # Grafana configuration directory (the mail server settings live here).
      - ./grafana/grafana-conf:/usr/share/grafana/conf
      - ./grafana/plugins:/var/lib/grafana/plugins
      - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
    ports:
      # Grafana is configured with http_port = 9096 in grafana-conf/defaults.ini.
      - "9096:9096"
    environment:
      GF_RENDERING_SERVER_URL: http://192.168.2.201:9097/render
      GF_LOG_FILTERS: rendering:debug
      # Quoted so the value stays a string instead of being parsed as an integer.
      GF_SECURITY_ADMIN_PASSWORD: "123456"
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_SERVER_DOMAIN: 192.168.2.201
      GF_SMTP_ENABLED: "true"
      GF_SMTP_HOST: xxx.xxx.xxx
      GF_SMTP_USER: xxxx@163.com
      GF_SMTP_PASSWORD: xxxxxxx
      GF_SMTP_FROM_ADDRESS: xxx@163.com
      GF_RENDERING_CALLBACK_URL: http://192.168.2.201:9096/grafana
    networks:
      - monitor

  renderer:
    depends_on:
      - grafana
    image: grafana/grafana-image-renderer:latest
    hostname: grafana-image-renderer
    container_name: grafana_renderer
    restart: always
    ports:
      # Published on host port 9097, which GF_RENDERING_SERVER_URL above points at.
      - "9097:8081"
    networks:
      - monitor

volumes:
  # Named volumes backed by bind mounts of pre-created host directories.
  prometheus:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /data/data/prometheus
  grafana:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /data/data/grafana

networks:
  monitor:
    driver: bridge
alertmanager配置文件内容
$ cat alertmanager/alertmanager.yml
# Alertmanager configuration: every alert is delivered by e-mail through the
# 'email' receiver, rendered with the "nodelist.html" template.
global:
  resolve_timeout: 5m
  # SMTP relay for outgoing mail. Plain SMTP on port 25 is used here
  # (relays that require SSL usually listen on 465).
  smtp_smarthost: 'xx.xx.xx.xx:25'
  # smtp_smarthost: 'xx.xx.xx.xx:994'
  # Sender address.
  smtp_from: 'xxx@163.com'
  # Mailbox account used for SMTP authentication.
  smtp_auth_username: 'xxx@163.com'
  # smtp_auth_identity: '123456789'
  # Mailbox password or authorization code.
  smtp_auth_password: 'xxxxxxxxxxxxxxxxxxxxx'
  smtp_require_tls: false

templates:
  - '/etc/alertmanager/template/*'

# Alternative WeChat routing, kept for reference.
# route:
#   group_by: ['alertname_wechat']
#   group_wait: 1s
#   group_interval: 1s
#   receiver: 'wechat'
#   repeat_interval: 1h
#   routes:
#     - receiver: wechat
#       match_re:
#         severity: wechat
# receivers:
#   - name: 'email'
#     email_configs:
#       - to: '8xxxxx@qq.com'
#         send_resolved: true
#   - name: 'wechat'
#     wechat_configs:
#       - corp_id: 'wwd402ce40b4720f24'
#         to_party: '2'
#         agent_id: '1000002'
#         api_secret: '9nmYa4p12OkToCbh_oNc'
#         send_resolved: true  # also notify when the alert is resolved

route:
  # NOTE(review): none of the rule files shown with this config sets an
  # 'alertname_email' label; confirm whether 'alertname' was intended.
  group_by: ['alertname_email']
  group_wait: 10s
  group_interval: 1m
  # Default receiver for anything the child routes do not match.
  receiver: 'email'
  repeat_interval: 1h
  routes:
    - receiver: email
      match_re:
        # Was misspelled "serverity", a label no alert carries.
        severity: email

# inhibit_rules:
#   - source_match:
#       severity: 'critical'
#     target_match:
#       severity: 'warning'
#     equal: ['alertname', 'instance']

receivers:
  - name: 'email'
    email_configs:
      - to: 'liyuanjie8521@163.com'
        # Also notify when the alert is resolved.
        send_resolved: true
        # Mail subject.
        headers: { Subject: " 【监控告警】 {{ .CommonLabels.alertname }} " }
        # Mail body template.
        html: '{{ template "nodelist.html" . }}'
    # webhook_configs:
    #   - url: 'http://dingtalk:8060/dingtalk/webhook/send'
    #     send_resolved: true
alertmanager模板内容
- email模版
$ cat alertmanager/template/email.tmpl
{{/* Plain-text alert message: one block of fields per alert in .Alerts. */}}
{{ define "email.default.message" }}
{{ range .Alerts }}
【系统报警】
告警状态:{{ .Status }}
告警级别:{{ .Labels.severity }}
告警应用:{{ .Annotations.summary }}
告警详情:{{ .Annotations.description }}
触发阀值:{{ .Annotations.value }}
告警主机:{{ .Labels.instance }}
告警时间:{{ .StartsAt.Format "2006-01-02 15:04:05" }}
{{ end }}
{{ end }}
- nodelist.html内容
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!--
Style and HTML derived from https://github.com/mailgun/transactional-email-templates
The MIT License (MIT)
Copyright (c) 2014 Mailgun
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
-->
<html xmlns="http://www.w3.org/1999/xhtml" xmlns="http://www.w3.org/1999/xhtml" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<head style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<meta name="viewport" content="width=device-width" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;" />
<title style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">{{ template "__subject" . }}</title>
</head>
<body itemscope="" itemtype="http://schema.org/EmailMessage" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; -webkit-font-smoothing: antialiased; -webkit-text-size-adjust: none; height: 100%; line-height: 1.6em; width: 100% !important; background-color: #f6f6f6; margin: 0; padding: 0;" bgcolor="#f6f6f6">
<table style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; width: 100%; background-color: #f6f6f6; margin: 0;" bgcolor="#f6f6f6">
<div style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; max-width: 600px; display: block; margin: 0 auto; padding: 0;">
<table width="100%" cellpadding="0" cellspacing="0" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; border-radius: 3px; background-color: #fff; margin: 0; border: 1px solid #e9e9e9;" bgcolor="#fff">
<tr style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 16px; vertical-align: top; color: #fff; font-weight: 500; text-align: center; border-radius: 3px 3px 0 0; background-color: #E6522C; margin: 0; padding: 20px;" align="center" bgcolor="#E6522C" valign="top">
发生 {{ .Alerts | len }} 个 {{ range .GroupLabels.SortedPairs }}
{{ .Value }}
{{ end }} 告警 !!请尽快处理
</td>
</tr>
<tr style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 10px;" valign="top">
<table border="1" cellpadding="2" cellspacing="0" width="100%" cellpadding="0" cellspacing="0" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<tr border="1" cellpadding="2" cellspacing="0" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
</tr>
<table border="1" cellpadding="2" cellspacing="0" width="100%" cellpadding="0" cellspacing="0" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<!-- Column headers of the alert table. Every header cell is now explicitly closed;
     four of them were missing their closing td tag. -->
<tr style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<td width="50px" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 16px; vertical-align: middle; margin: 0; padding: 3 3 3px;" valign="top" >
<strong>告警名称</strong>
</td>
<td width="50px" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 16px; vertical-align: middle; margin: 0; padding: 3 3 3px;" valign="top" >
<strong>告警级别</strong>
</td>
<td width="50px" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 16px; vertical-align: middle; margin: 0; padding: 3 3 3px;" valign="top" >
<strong>实例</strong>
</td>
<td width="50px" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 16px; vertical-align: middle; margin: 0; padding: 3 3 3px;" valign="top" >
<strong>所属系统</strong>
</td>
<td width="50px" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 16px; vertical-align: middle; margin: 0; padding: 3 3 3px;" valign="top" >
<strong>厂商</strong>
</td>
<td width="50px" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 16px; vertical-align: middle; margin: 0; padding: 3 3 3px;" valign="top" >
<strong>触发时间</strong>
</td>
<td width="50px" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 16px; vertical-align: middle; margin: 0; padding: 3 3 3px;" valign="top" >
<strong>说明</strong>
</td>
</tr>
{{/* One table row per firing alert. Cell order follows the header row:
     alert name, severity, instance, owning system, vendor, start time, description.
     The first two cells used to print severity and status, which did not match
     their "告警名称" / "告警级别" headers. */}}
{{ range .Alerts.Firing }}
<tr style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 3 3 3px;" valign="top">
{{ .Labels.alertname }}
</td>
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 3 3 3px;" valign="top">
{{ .Labels.severity }}
</td>
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 3 3 3px;" valign="top">
{{ .Labels.instance }}
</td>
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 3 3 3px;" valign="top">
{{ .Labels.ownningsystem }}
</td>
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 3 3 3px;" valign="top">
{{ .Labels.company }}
</td>
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 3 3 3px;" valign="top">
{{ .StartsAt.Format "2006-01-02 15:04:05" }}
</td>
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 3 3 3px;" valign="top">
{{ .Annotations.description }}
</td>
</tr>
{{ end }}
</table>
</td>
</tr>
</table>
</table>
<div style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; width: 100%; clear: both; color: #999; margin: 0; padding: 20px;">
<table width="100%" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<tr style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 12px; vertical-align: top; text-align: center; color: #999; margin: 0; padding: 0 0 20px;" align="center" valign="top"><a href="{{ .ExternalURL }}" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 12px; color: #999; text-decoration: underline; margin: 0;">Sent by {{ template "__alertmanager" . }}</a></td>
</tr>
</table>
</div></div>
</table>
</body>
</html>
- wechat模版
$ cat alertmanager/template/wechat.tmpl
{{/* Plain-text WeChat message: one block of fields per alert in .Alerts. */}}
{{ define "wechat.default.message" }}
{{ range .Alerts }}
【系统报警】
告警状态:{{ .Status }}
告警级别:{{ .Labels.severity }}
告警应用:{{ .Annotations.summary }}
告警详情:{{ .Annotations.description }}
触发阀值:{{ .Annotations.value }}
告警主机:{{ .Labels.instance }}
告警时间:{{ .StartsAt.Format "2006-01-02 15:04:05" }}
{{ end }}
{{ end }}
Prometheus配置文件
- prometheus.yml
$ cat prometheus/prometheus.yml
# Prometheus server configuration: global intervals, the Alertmanager endpoint,
# rule files and the scrape jobs.
global:
  # Default interval between scrapes.
  scrape_interval: 15s
  evaluation_interval: 15s

alerting:
  alertmanagers:
    - static_configs:
        - targets: ['192.168.2.201:9093']

rule_files:
  - "rules/*.yml"
# rule_files:
#   - "*rules.yml"

# Scrape jobs: what to collect metrics from.
scrape_configs:
  # Prometheus scraping itself.
  - job_name: prometheus
    # Scrape every 5s instead of the global default.
    scrape_interval: 5s
    static_configs:
      - targets: ['192.168.2.201:9095']
    # Prometheus is started with --web.external-url=prometheus in the compose file.
    metrics_path: /prometheus/metrics

  - job_name: 'alertmanager'
    scrape_interval: 5s
    static_configs:
      - targets: ['192.168.2.201:9093']
        labels:
          instance: alertmanager

  # - job_name: 'consul-prometheus'
  #   consul_sd_configs:
  #     - server: '192.168.2.201:9201'
  #       services: []
  #   relabel_configs:
  #     - source_labels: [__meta_consul_tags]
  #       regex: .*test.*
  #       action: keep

  - job_name: 'Linux_host'
    file_sd_configs:
      # Target files picked up by file-based service discovery.
      - files: ['/etc/prometheus/targets/node/*.yml']
        # How often the files are re-read.
        refresh_interval: 5s

  - job_name: 'docker_cadvisor'
    file_sd_configs:
      - files: ['/etc/prometheus/targets/cAdvisor/*.yml']
        refresh_interval: 5s

  # - job_name: 'docker_cadvisor'
  #   scrape_interval: 5s
  #   metrics_path: /metrics
  #   static_configs:
  #     - targets: ['192.168.2.200:9101']

  - job_name: 'blackbox_icmp'
    metrics_path: /probe
    params:
      # Probe with the blackbox exporter's icmp module.
      module: [icmp]
    static_configs:
      - targets:
          - '192.168.2.189'
    # NOTE(review): with the relabeling below disabled, Prometheus scrapes the
    # target host itself at /probe instead of going through a blackbox exporter;
    # enable it and point the replacement at the exporter's real address.
    # relabel_configs:
    #   - source_labels: [__address__]
    #     target_label: __param_target
    #   - source_labels: [__param_target]
    #     target_label: instance
    #   - target_label: __address__
    #     replacement: 127.0.0.1:9115  # The blackbox exporter's real hostname:port.
规则rules
- 容器规则
$ cat prometheus/rules/container_sys.yml
# Container-level alert rules evaluated over container_* metrics.
groups:
  - name: Container
    rules:
      # CPU usage summed per container name and instance, in percent of one core.
      - alert: ContainerCPU
        expr: (sum by(name,instance) (rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 200
        for: 1m
        labels:
          name: CPU_Usage
          severity: Warning
        annotations:
          summary: "{{ $labels.name }} "
          description: " 容器CPU使用超200%."
          value: "{{ $value }}%"

      # Memory usage (minus cache) relative to the container's memory limit.
      # NOTE(review): a limit of 0 makes the division yield +Inf; confirm how
      # containers without a memory limit should be treated.
      - alert: Memory Usage
        expr: (container_memory_usage_bytes{name=~".+"} - container_memory_cache{name=~".+"}) / container_spec_memory_limit_bytes{name=~".+"} * 100 > 200
        for: 1m
        labels:
          name: Memory
          severity: Warning
        annotations:
          summary: "{{ $labels.name }} "
          description: " 容器内存使用超过200%."
          value: "{{ $value }}%"

      # Receive rate per container interface; bytes/s divided by 1048576.
      # The description uses the `interface` label that the selector filters on
      # (it previously referenced `device`, which the selector does not use).
      - alert: Network_receive
        expr: irate(container_network_receive_bytes_total{name=~".+",interface=~"eth.+"}[5m]) / 1048576 > 10
        for: 1m
        labels:
          name: Network_receive
          severity: Warning
        annotations:
          summary: "{{ $labels.name }} "
          description: "容器 [{{ $labels.interface }}] 网卡5分钟平均接收流量超过10Mbps."
          value: "{{ $value }}Mbps"

      # Transmit rate per container interface; same unit conversion as above.
      - alert: Network_transmit
        expr: irate(container_network_transmit_bytes_total{name=~".+",interface=~"eth.+"}[5m]) / 1048576 > 10
        for: 1m
        labels:
          name: Network_transmit
          severity: Warning
        annotations:
          summary: "{{ $labels.name }} "
          description: "容器 [{{ $labels.interface }}] 网卡5分钟平均发送流量超过10Mbps."
          value: "{{ $value }}Mbps"
- 主机规则
$ cat prometheus/rules/host_sys.yml
# Host-level alert rules evaluated over node_* metrics.
# The `appname` label used in the summaries comes from the file_sd target files.
groups:
  - name: Linux_host
    rules:
      # Used memory (total minus free, buffers and cache) above 90 %.
      - alert: HostMemory Usage
        expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 > 90
        for: 1m
        labels:
          name: Memory
          severity: Warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: "宿主机内存使用率超过90%."
          value: "{{ $value }}"

      # Non-idle CPU share, averaged over cores, above 0.8.
      - alert: HostCPU Usage
        expr: sum(avg without (cpu)(irate(node_cpu_seconds_total{mode!='idle'}[5m]))) by (instance,appname) > 0.8
        for: 1m
        labels:
          name: CPU
          severity: Warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: "宿主机CPU使用率超过80%."
          value: "{{ $value }}"

      - alert: HostLoad
        expr: node_load5 > 20
        for: 1m
        labels:
          name: Load
          severity: Warning
        annotations:
          summary: "{{ $labels.appname }} "
          description: " 主机负载5分钟超过20."
          value: "{{ $value }}"

      - alert: HostFilesystem Usage
        expr: (node_filesystem_size_bytes-node_filesystem_free_bytes)/node_filesystem_size_bytes*100>80
        for: 1m
        labels:
          name: Disk
          severity: Warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: " 宿主机 [ {{ $labels.mountpoint }} ]分区使用超过80%."
          value: "{{ $value }}%"

      # The job selector now names the node job defined in prometheus.yml
      # ('Linux_host'); the previous job=~"Host" matcher could not match it.
      - alert: HostDiskio writes
        expr: irate(node_disk_writes_completed_total{job="Linux_host"}[1m]) > 10
        for: 1m
        labels:
          name: Diskio
          severity: Warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: " 宿主机 [{{ $labels.device }}]磁盘1分钟平均写入IO负载较高."
          value: "{{ $value }}iops"

      - alert: HostDiskio reads
        expr: irate(node_disk_reads_completed_total{job="Linux_host"}[1m]) > 10
        for: 1m
        labels:
          name: Diskio
          severity: Warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: " 宿主机 [{{ $labels.device }}]磁盘1分钟平均读取IO负载较高."
          value: "{{ $value }}iops"

      # Receive rate per physical interface; bytes/s divided by 1048576.
      - alert: HostNetwork_receive
        expr: irate(node_network_receive_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1048576 > 10
        for: 1m
        labels:
          name: Network_receive
          severity: Warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: " 宿主机 [{{ $labels.device }}] 网卡5分钟平均接收流量超过10Mbps."
          # A stray "3" used to be appended directly after the value.
          value: "{{ $value }}Mbps"

      # Transmit rate per physical interface; same unit conversion as above.
      - alert: hostNetwork_transmit
        expr: irate(node_network_transmit_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1048576 > 10
        for: 1m
        labels:
          name: Network_transmit
          severity: Warning
        annotations:
          summary: " {{ $labels.appname }} "
          description: " 宿主机 [{{ $labels.device }}] 网卡5分钟内平均发送流量超过10Mbps."
          value: "{{ $value }}Mbps"
- 服务规则
$ cat prometheus/rules/service_down.yml
# Availability alert rules: stopped processes and a missing Grafana container.
groups:
  - name: Container
    rules:
      # Fires when a monitored process group reports zero running processes.
      - alert: ProcessDown
        expr: namedprocess_namegroup_num_procs == 0
        for: 1m
        labels:
          name: instance
          severity: Critical
        annotations:
          summary: " {{ $labels.appname }}"
          description: " 进程停止运行 "
          value: "{{ $value }}"

      # Fires when no container_last_seen series exists for the Grafana container.
      # The compose file names that container exactly "grafana"; the previous
      # matcher name=~"grafana.+" required extra characters, so it matched
      # "grafana_renderer" but never "grafana" itself.
      - alert: Grafana down
        expr: absent(container_last_seen{name="grafana"}) == 1
        for: 1m
        labels:
          name: grafana
          severity: Critical
        annotations:
          summary: "Grafana"
          description: "Grafana容器停止运行"
          value: "{{ $value }}"
Grafana配置文件
grafana默认配置文件
$ cat grafana/grafana-conf/defaults.ini | grep -vE "#|^$"
app_mode = production
instance_name = ${HOSTNAME}
[paths]
data = data
temp_data_lifetime = 24h
logs = data/log
plugins = data/plugins
provisioning = conf/provisioning
; HTTP server settings. Grafana listens on port 9096, the port the compose file
; publishes as "9096:9096".
[server]
protocol = http
http_addr =
http_port = 9096
domain = 192.168.2.201
enforce_domain = false
; Built from the protocol, domain and http_port values above, with the path
; /grafana appended; serve_from_sub_path is enabled to go with that sub-path.
root_url = %(protocol)s://%(domain)s:%(http_port)s/grafana
serve_from_sub_path = true
router_logging = false
static_root_path = public
enable_gzip = true
cert_file =
cert_key =
; NOTE(review): presumably only relevant when protocol is set to socket; confirm.
socket = /tmp/grafana.sock
[database]
type = sqlite3
host = 127.0.0.1:3306
name = grafana
user = root
password =
url =
max_idle_conn = 2
max_open_conn =
conn_max_lifetime = 14400
log_queries =
ssl_mode = disable
ca_cert_path =
client_key_path =
client_cert_path =
server_cert_name =
path = grafana.db
cache_mode = private
[remote_cache]
type = database
connstr =
[dataproxy]
logging = false
timeout = 30
keep_alive_seconds = 30
tls_handshake_timeout_seconds = 10
expect_continue_timeout_seconds = 1
max_idle_connections = 100
idle_conn_timeout_seconds = 90
send_user_header = false
[analytics]
reporting_enabled = false
check_for_updates = true
google_analytics_ua_id =
google_tag_manager_id =
[security]
disable_initial_admin_creation = false
admin_user = admin
admin_password = admin
secret_key = xxxxxxxxxxxxxxxxxxxxxx
disable_gravatar = false
data_source_proxy_whitelist =
disable_brute_force_login_protection = false
cookie_secure = false
cookie_samesite = lax
allow_embedding = false
strict_transport_security = false
strict_transport_security_max_age_seconds = 86400
strict_transport_security_preload = false
strict_transport_security_subdomains = false
x_content_type_options = true
x_xss_protection = true
[snapshots]
external_enabled = true
external_snapshot_url = https://snapshots-origin.raintank.io
external_snapshot_name = Publish to snapshot.raintank.io
public_mode = false
snapshot_remove_expired = true
[dashboards]
versions_to_keep = 20
min_refresh_interval = 5s
default_home_dashboard_path =
[users]
allow_sign_up = false
allow_org_create = false
auto_assign_org = true
auto_assign_org_id = 1
auto_assign_org_role = Viewer
verify_email_enabled = false
login_hint = email or username
password_hint = password
default_theme = dark
external_manage_link_url =
external_manage_link_name =
external_manage_info =
viewers_can_edit = false
editors_can_admin = false
user_invite_max_lifetime_duration = 24h
[auth]
login_cookie_name = grafana_session
login_maximum_inactive_lifetime_duration =
login_maximum_lifetime_duration =
token_rotation_interval_minutes = 10
disable_login_form = false
disable_signout_menu = false
signout_redirect_url =
oauth_auto_login = false
oauth_state_cookie_max_age = 600
api_key_max_seconds_to_live = -1
sigv4_auth_enabled = false
[auth.anonymous]
enabled = false
org_name = Main Org.
org_role = Viewer
hide_version = true
[auth.github]
enabled = false
allow_sign_up = true
client_id = some_id
client_secret =
scopes = user:email,read:org
auth_url = https://github.com/login/oauth/authorize
token_url = https://github.com/login/oauth/access_token
api_url = https://api.github.com/user
allowed_domains =
team_ids =
allowed_organizations =
[auth.gitlab]
enabled = false
allow_sign_up = true
client_id = some_id
client_secret =
scopes = api
auth_url = https://gitlab.com/oauth/authorize
token_url = https://gitlab.com/oauth/token
api_url = https://gitlab.com/api/v4
allowed_domains =
allowed_groups =
[auth.google]
enabled = false
allow_sign_up = true
client_id = some_client_id
client_secret =
scopes = https://www.googleapis.com/auth/userinfo.profile https://www.googleapis.com/auth/userinfo.email
auth_url = https://accounts.google.com/o/oauth2/auth
token_url = https://accounts.google.com/o/oauth2/token
api_url = https://www.googleapis.com/oauth2/v1/userinfo
allowed_domains =
hosted_domain =
[auth.grafananet]
enabled = false
allow_sign_up = true
client_id = some_id
client_secret =
scopes = user:email
allowed_organizations =
[auth.grafana_com]
enabled = false
allow_sign_up = true
client_id = some_id
client_secret =
scopes = user:email
allowed_organizations =
[auth.azuread]
name = Azure AD
enabled = false
allow_sign_up = true
client_id = some_client_id
client_secret =
scopes = openid email profile
auth_url = https://login.microsoftonline.com/<tenant-id>/oauth2/v2.0/authorize
token_url = https://login.microsoftonline.com/<tenant-id>/oauth2/v2.0/token
allowed_domains =
allowed_groups =
[auth.okta]
name = Okta
enabled = false
allow_sign_up = true
client_id = some_id
client_secret =
scopes = openid profile email groups
auth_url = https://<tenant-id>.okta.com/oauth2/v1/authorize
token_url = https://<tenant-id>.okta.com/oauth2/v1/token
api_url = https://<tenant-id>.okta.com/oauth2/v1/userinfo
allowed_domains =
allowed_groups =
role_attribute_path =
[auth.generic_oauth]
name = OAuth
enabled = false
allow_sign_up = true
client_id = some_id
client_secret =
scopes = user:email
email_attribute_name = email:primary
email_attribute_path =
login_attribute_path =
role_attribute_path =
id_token_attribute_name =
auth_url =
token_url =
api_url =
allowed_domains =
team_ids =
allowed_organizations =
tls_skip_verify_insecure = false
tls_client_cert =
tls_client_key =
tls_client_ca =
[auth.basic]
enabled = true
[auth.proxy]
enabled = false
header_name = X-WEBAUTH-USER
header_property = username
auto_sign_up = true
ldap_sync_ttl = 60
sync_ttl = 60
whitelist =
headers =
enable_login_token = false
; LDAP authentication is switched on here; the server details live in ldap.toml.
[auth.ldap]
enabled = true
; The compose file mounts ./grafana/grafana-conf at /usr/share/grafana/conf,
; so ldap.toml has to be placed in that host directory.
config_file = /usr/share/grafana/conf/ldap.toml
allow_sign_up = true
; Schedule and switch for the background LDAP synchronisation.
sync_cron = "0 0 1 * * *"
active_sync_enabled = true
; Outgoing mail settings.
; NOTE(review): the compose file also sets GF_SMTP_* environment variables for
; host, user, password and from address; confirm which source takes precedence.
[smtp]
enabled = true
host = "xxx.xx.163.com:25"
user = "xxxxx@163.com"
password = "123456"
cert_file =
key_file =
skip_verify = true
from_address = "xxxxx@163.com"
from_name = "Grafana Admin"
; NOTE(review): this is set to a full URL; an EHLO identity is normally a host
; name, so verify that the mail server accepts it.
ehlo_identity = "http://192.168.2.201:9096/grafana"
startTLS_policy =
[emails]
welcome_email_on_sign_up = true
templates_pattern = emails/*.html
[log]
mode = console file
level = info
filters =
[log.console]
level =
format = console
[log.file]
level =
format = text
log_rotate = true
max_lines = 1000000
max_size_shift = 28
daily_rotate = true
max_days = 7
[log.syslog]
level =
format = text
network =
address =
facility =
tag =
[quota]
enabled = false
org_user = 10
org_dashboard = 100
org_data_source = 10
org_api_key = 10
user_org = 10
global_user = -1
global_org = -1
global_dashboard = -1
global_api_key = -1
global_session = -1
[alerting]
enabled = true
execute_alerts = true
error_or_timeout = alerting
nodata_or_nullvalues = no_data
concurrent_render_limit = 5
evaluation_timeout_seconds = 30
notification_timeout_seconds = 30
max_attempts = 3
min_interval_seconds = 1
max_annotation_age =
max_annotations_to_keep =
[annotations.dashboard]
max_age =
max_annotations_to_keep =
[annotations.api]
max_age =
max_annotations_to_keep =
[explore]
enabled = true
[metrics]
enabled = false
interval_seconds = 10
disable_total_stats = false
basic_auth_username =
basic_auth_password =
[metrics.environment_info]
[metrics.graphite]
address =
prefix = prod.grafana.%(instance_name)s.
[grafana_net]
url = https://grafana.com
[grafana_com]
url = https://grafana.com
[tracing.jaeger]
address =
always_included_tag =
sampler_type = const
sampler_param = 1
sampling_server_url =
zipkin_propagation = false
disable_shared_zipkin_spans = false
[external_image_storage]
provider =
[external_image_storage.s3]
endpoint =
path_style_access =
bucket_url =
bucket =
region =
path =
access_key =
secret_key =
[external_image_storage.webdav]
url =
username =
password =
public_url =
[external_image_storage.gcs]
key_file =
bucket =
path =
enable_signed_urls = false
signed_url_expiration =
[external_image_storage.azure_blob]
account_name =
account_key =
container_name =
[external_image_storage.local]
[rendering]
server_url =
callback_url =
concurrent_render_request_limit = 30
[panels]
enable_alpha = false
disable_sanitize_html = true
[plugins]
enable_alpha = false
app_tls_skip_verify_insecure = false
allow_loading_unsigned_plugins =
marketplace_url = https://grafana.com/grafana/plugins/
[plugin.grafana-image-renderer]
rendering_timezone =
rendering_language =
rendering_viewport_device_scale_factor =
rendering_ignore_https_errors =
rendering_verbose_logging =
rendering_dumpio =
rendering_args =
rendering_chrome_bin =
rendering_mode =
rendering_clustering_mode =
rendering_clustering_max_concurrency =
rendering_viewport_max_width =
rendering_viewport_max_height =
rendering_viewport_max_device_scale_factor =
grpc_host =
grpc_port =
[enterprise]
license_path =
[feature_toggles]
enable =
[date_formats]
full_date = YYYY-MM-DD HH:mm:ss
interval_second = HH:mm:ss
interval_minute = HH:mm
interval_hour = MM/DD HH:mm
interval_day = MM/DD
interval_month = YYYY-MM
interval_year = YYYY
use_browser_locale = false
default_timezone = browser
LDAP集成
4.监控展示系统Grafana和LDAP用户认证平台集成方法
ldap.toml
配置文件的内容如下:
$ cat grafana/grafana-conf/ldap.toml | grep -vE "#|^$"
# Grafana LDAP configuration: one LDAP server plus the group-to-role mappings.
# The leading numbers in the earlier listing were line-number residue, not TOML.
[[servers]]
host = "192.168.2.210"
port = 389
use_ssl = false
start_tls = false
ssl_skip_verify = false
# Account used to bind to the directory for user searches.
bind_dn = "cn=yuanadmin,dc=yuan,dc=com"
bind_password = 'xxxxxxxxxxx'
# %s is replaced with the login name entered by the user.
search_filter = "(cn=%s)"
search_base_dns = ["dc=yuan,dc=com"]

# LDAP attribute names mapped onto Grafana user fields.
[servers.attributes]
name = "displayName"
surname = "sn"
username = "cn"
member_of = "memberOf"
email = "mail"

# LDAP group membership mapped onto Grafana organisation roles.
[[servers.group_mappings]]
group_dn = "cn=Grafana-admins,ou=GrafanaGroups,ou=jishubu,ou=BeiJing,dc=yuan,dc=com"
org_role = "Admin"

[[servers.group_mappings]]
group_dn = "cn=Grafana-editors,ou=GrafanaGroups,ou=jishubu,ou=BeiJing,dc=yuan,dc=com"
org_role = "Editor"

[[servers.group_mappings]]
group_dn = "cn=Grafana-viewers,ou=GrafanaGroups,ou=jishubu,ou=BeiJing,dc=yuan,dc=com"
org_role = "Viewer"
收集的节点目标
$ cat prometheus/targets/node/centos7_node.yml
# file_sd target group for the 'Linux_host' job: node_exporter endpoints.
- targets:
    # Ops server centos7
    - '192.168.2.189:9100'
    - '192.168.2.200:9100'
    - '192.168.2.201:9100'
  # Label attached to every target above; the host alert rules print it
  # in their summaries.
  labels:
    appname: 'centos7_node'
$ cat prometheus/targets/cAdvisor/centos7_cadvisor.yml
# file_sd target group for the 'docker_cadvisor' job: cAdvisor endpoints
# (published on host port 9101 by the cAdvisor start script).
- targets:
    # ops servers containers
    - '192.168.2.189:9101'
    - '192.168.2.200:9101'
    - '192.168.2.201:9101'
监控容器cadvisor
脚本内容
$ cat ./cadvisor_start.sh
#!/bin/sh
# (Re)create the cAdvisor container on this host and publish it on port 9101.

# Ask the Docker daemon for its data directory so the matching host path is
# mounted at /var/lib/docker inside the container.
DOCKER_ROOT=$(docker info --format '{{.DockerRootDir}}' 2>/dev/null)
# Fall back to Docker's default location when the lookup returns nothing;
# an empty value would otherwise mount "/" as /var/lib/docker.
DOCKER_ROOT=${DOCKER_ROOT:-/var/lib/docker}
DOCKER_DIR="$DOCKER_ROOT/:/var/lib/docker:ro"

# Remove any previous instance; errors are ignored because the container may
# not exist on the first run.
docker stop cadvisor >/dev/null 2>&1 || true
docker rm -f cadvisor >/dev/null 2>&1 || true

# --privileged was given twice before; once is enough.
docker run \
  --restart=always \
  --privileged \
  --volume=/:/rootfs:ro \
  --volume=/var/run:/var/run:ro \
  --volume=/sys:/sys:ro \
  --volume="$DOCKER_DIR" \
  --volume=/dev/disk/:/dev/disk:ro \
  --publish=9101:8080 \
  --detach=true \
  --name=cadvisor \
  --device=/dev/kmsg \
  google/cadvisor:latest
ansible-playbook部署
$ cat ./cadvisor_allnode-deploy.yml
---
# Copy the cAdvisor start script to every host and run it there.
- hosts: all
  remote_user: root
  gather_facts: false
  tasks:
    - name: 拷贝cadvisor脚本到所有被监控节点
      copy:
        src: ./cadvisor_start.sh
        dest: /opt/
        # Quoted so the mode is passed as the string "0755", not parsed as a number.
        mode: "0755"

    # This task had an empty name before.
    - name: 执行cadvisor启动脚本
      shell: /opt/cadvisor_start.sh
监控主机
容器化部署的脚本内容
$ cat ./run_node_exporter.sh
#!/bin/sh
# (Re)create the node_exporter container using the host's network and PID
# namespaces, with /proc, /sys and / mounted read-only for the collectors.

# Remove any previous instance; errors are ignored because the container may
# not exist on the first run.
docker stop node_exporter >/dev/null 2>&1 || true
docker rm -f node_exporter >/dev/null 2>&1 || true

# The mount-point regex previously ended in "$$", the docker-compose escape for
# a literal "$"; inside a single-quoted shell string a single "$" is correct.
# NOTE(review): the image is untagged; confirm the pulled node_exporter version
# still accepts --collector.filesystem.ignored-mount-points.
docker run -d --name node_exporter \
  --restart=always \
  --net="host" \
  --pid="host" \
  -v "/proc:/host/proc:ro" \
  -v "/sys:/host/sys:ro" \
  -v "/:/rootfs:ro" \
  prom/node-exporter \
  --path.procfs=/host/proc \
  --path.rootfs=/rootfs \
  --path.sysfs=/host/sys \
  --collector.filesystem.ignored-mount-points='^/(sys|proc|dev|host|etc)($|/)'
使用ansible进行传统部署的脚本
$ cat ./node_exporter.yml
---
# Install node_exporter from a release tarball and run it as a systemd service.
- hosts: all
  remote_user: root
  gather_facts: false
  tasks:
    - name: 安装node_exporter
      unarchive:
        src: node_exporter-1.1.2.linux-amd64.tar.gz
        dest: /opt/

    - name: 添加node_exporter服务
      copy:
        src: node_exporter.service
        dest: /etc/systemd/system/

    # daemon_reload makes systemd re-read unit files, so a changed
    # node_exporter.service is picked up before the unit is enabled.
    - name: 设置开机自动启动
      systemd:
        name: node_exporter.service
        enabled: true
        daemon_reload: true

    - name: 启动服务
      systemd:
        name: node_exporter.service
        state: started
node_exporter.service
内容:
$ cat ./node_exporter.service
# systemd unit that runs node_exporter from the directory the Ansible playbook
# unpacks under /opt/.
[Unit]
Description=Prometheus node_exporter
# Pulled in with, and ordered after, the network and remote filesystem targets.
Requires=network.target remote-fs.target
After=network.target remote-fs.target
[Service]
Type=simple
# NOTE(review): runs as root; confirm whether a dedicated unprivileged user would do.
User=root
Group=root
# Listens on port 9100 on all interfaces, the port listed in the node target file.
ExecStart=/opt/node_exporter-1.1.2.linux-amd64/node_exporter --web.listen-address=0.0.0.0:9100
# NOTE(review): reload sends SIGHUP to the main process; confirm that
# node_exporter handles SIGHUP rather than exiting on it.
ExecReload=/bin/kill -HUP $MAINPID
KillMode=process
# Restart after 5 seconds when the process fails.
Restart=on-failure
RestartSec=5s
[Install]
WantedBy=multi-user.target
至此, Docker容器化部署Prometheus和Grafana监控系统已经部署完成.