5.Docker容器化部署Prometheus和Grafana监控系统

部署前的准备工作

创建数据目录并设置权限

# -p creates missing parent directories and is a no-op when they already exist.
$ mkdir -p /data/monitor/prometheus/
$ mkdir -p /data/data/{prometheus,grafana}
# World-writable so the Grafana container can write to its data directory;
# NOTE(review): consider chown to the container's UID instead of mode 777.
$ sudo chmod 777 -R /data/data/grafana
Prometheus 容器以 UID/GID 65534 运行,需要在宿主机上创建对应的用户和组,并把数据目录的属主改为该用户,否则写入数据目录时会报权限错误。
# The redirection in `sudo echo ... >> file` is performed by the calling
# (unprivileged) shell, so the append must go through `sudo tee -a`.
$ echo "prometheus:x:65534:" | sudo tee -a /etc/group
$ echo "prometheus:x:65534:65534:nobody:/home:/bin/false" | sudo tee -a /etc/passwd
# owner:group is the supported separator; the dotted form is deprecated.
$ sudo chown -R prometheus:prometheus /data/data/prometheus

编写yml文件

$ cd /data/monitor/prometheus/

docker-compose.yml内容:

version: '3.7'

services:
  #node-exporter:
  #  image: prom/node-exporter:latest
  #  container_name: monitoring_node_exporter
  #  restart: unless-stopped
  #  volumes:
  #    - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime 
  #  ports:
  #    - "9100:9100"
  #  networks:
  #    - monitor

  #cadvisor:
  #  image: google/cadvisor:latest
  #  container_name: cadvisor
  #  restart: unless-stopped
  #  volumes:
  #    - /:/rootfs:ro
  #    - /var/run:/var/run:rw
  #    - /sys:/sys:ro
  #    - /data/data/docker/:/var/lib/docker:ro
  #    - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime 
  #  ports:
  #    - "9101:8080"

      
  # Alertmanager: receives the alerts fired by Prometheus and sends notifications.
  alertmanager:
    image: prom/alertmanager:latest
    hostname: alertmanager
    container_name: alertmanager
    restart: always
    volumes:
      - type: bind
        source: ./alertmanager/alertmanager.yml
        target: /etc/alertmanager/alertmanager.yml
        read_only: true
      # Notification templates are only read by the container: mount read-only.
      - ./alertmanager/template:/etc/alertmanager/template:ro
      # Host zone file used as the container's local time; read-only so the
      # container cannot modify the host's zoneinfo.
      - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
    ports:
      - "9093:9093"
      - "9094:9094"
    networks:
      - monitor

  # Prometheus server; started after alertmanager.
  prometheus:
    depends_on:
      - alertmanager
    image: prom/prometheus:latest
    hostname: prometheus
    container_name: prometheus
    restart: unless-stopped
    volumes:
      - type: bind
        source: ./prometheus/prometheus.yml
        target: /etc/prometheus/prometheus.yml
        #read_only: true
      # NOTE(review): alert-rules.yml is not matched by the active rule_files
      # pattern (rules/*.yml) in prometheus.yml - confirm it is still needed.
      - type: bind
        source: ./prometheus/alert-rules.yml
        target: /etc/prometheus/alert-rules.yml
        #read_only: true
      # TSDB data lives on the named volume "prometheus".
      - type: volume
        source: prometheus
        target: /prometheus
      # Zone file, rule files and file_sd target files are only read by
      # Prometheus, so they are mounted read-only.
      - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
      - ./prometheus/rules/:/etc/prometheus/rules/:ro
      - ./prometheus/targets/:/etc/prometheus/targets/:ro
    command:
      # Path of the configuration file inside the container.
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
      # Enable hot reload of the configuration.
      - '--web.enable-lifecycle'
      - '--web.external-url=prometheus'
      # NOTE(review): the admin API can delete data; keep port 9095 off untrusted networks.
      - '--web.enable-admin-api'
    ports:
      - "9095:9090"
    networks:
      - monitor
 
  # Grafana: dashboards / visualisation for the collected metrics.
  grafana:
    depends_on:
      - prometheus
    image: grafana/grafana:latest
    hostname: grafana
    container_name: grafana
    restart: unless-stopped
    volumes:
      - type: volume
        source: grafana
        target: /var/lib/grafana
      - ./grafana/grafana-public:/usr/share/grafana/public
      # Grafana configuration directory (contains the mail-server settings).
      - ./grafana/grafana-conf:/usr/share/grafana/conf
      - ./grafana/plugins:/var/lib/grafana/plugins
      # Host zone file used as the container's local time; read-only.
      - /usr/share/zoneinfo/Asia/Shanghai:/etc/localtime:ro
    ports:
      - "9096:9096"
    environment:
      GF_RENDERING_SERVER_URL: http://192.168.2.201:9097/render
      GF_LOG_FILTERS: "rendering:debug"
      # Quoted so YAML keeps the all-digit password as a string, not an integer.
      # NOTE(review): example password - replace before real use.
      GF_SECURITY_ADMIN_PASSWORD: "123456"
      GF_USERS_ALLOW_SIGN_UP: "false"
      GF_SERVER_DOMAIN: "192.168.2.201"
      GF_SMTP_ENABLED: "true"
      GF_SMTP_HOST: xxx.xxx.xxx
      GF_SMTP_USER: xxxx@163.com
      GF_SMTP_PASSWORD: xxxxxxx
      GF_SMTP_FROM_ADDRESS: xxx@163.com
      GF_RENDERING_CALLBACK_URL: http://192.168.2.201:9096/grafana
    networks:
      - monitor

  # Image renderer service; published on host port 9097, started after grafana.
  renderer:
    image: grafana/grafana-image-renderer:latest
    container_name: grafana_renderer
    hostname: grafana-image-renderer
    restart: always
    depends_on: [grafana]
    networks: [monitor]
    ports: ["9097:8081"]
    
# Named volumes backed by fixed host directories (local driver, bind option).
volumes:
  prometheus:
    driver: local
    driver_opts:
      device: /data/data/prometheus
      o: bind
      type: none
  grafana:
    driver: local
    driver_opts:
      device: /data/data/grafana
      o: bind
      type: none
      
# Bridge network that the services in this file attach to.
networks:
  monitor:
    driver: bridge

alertmanager配置文件内容

$ cat alertmanager/alertmanager.yml 
# Settings shared by all receivers (SMTP relay and credentials).
global:
  resolve_timeout: 5m
  smtp_smarthost: 'xx.xx.xx.xx:25'             # SMTP relay host:port; plain port 25 is used here (SSL submission would normally use 465)
  #smtp_smarthost: 'xx.xx.xx.xx:994'             # alternative relay endpoint on an SSL port
  smtp_from: 'xxx@163.com'              # sender address
  smtp_auth_username: 'xxx@163.com'              # mailbox account name
  #smtp_auth_identity: '123456789'
  # mailbox password or authorization code
  smtp_auth_password: 'xxxxxxxxxxxxxxxxxxxxx'
  smtp_require_tls: false

# Every file under the mounted template directory is loaded as a notification template.
templates:
  - '/etc/alertmanager/template/*'

#route:
#  group_by: ['alertname_wechat']
#  group_wait: 1s
#  group_interval: 1s
#  receiver: 'wechat'
#  repeat_interval: 1h
#  routes:
#  - receiver: wechat
#    match_re:
#      serverity: wechat
#receivers:
#- name: 'email'
#  email_configs:
#  - to: '8xxxxx@qq.com'
#    send_resolved: true
#- name: 'wechat'
#  wechat_configs:
#  - corp_id: 'wwd402ce40b4720f24'
#    to_party: '2'
#    agent_id: '1000002'
#    api_secret: '9nmYa4p12OkToCbh_oNc'
#    send_resolved: true ## 发送已解决通知

# Routing tree: every alert is delivered to the 'email' receiver.
route:
  # Group by alert name so each notification covers one kind of alert and
  # {{ .CommonLabels.alertname }} / .GroupLabels are populated in the mail.
  # ('alertname_email' is not a label set by any rule shown here, which put
  # every alert into a single group with empty group labels.)
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 1m
  receiver: 'email'
  repeat_interval: 1h
  routes:
  - receiver: email
    match_re:
      # The label is spelled 'severity'; the values are the ones the alerting
      # rules assign (Warning / Critical).
      severity: 'Warning|Critical'

#inhibit_rules:
#- source_match:
#    severity: 'critical'
#  target_match:
#    severity: 'warning'
#  equal: ['alertname', 'instance']
  
# Notification targets referenced from the routing tree.
receivers:
- name: email
  email_configs:
  - to: 'liyuanjie8521@163.com'
    # Also send a notification when the alert is resolved.
    send_resolved: true
    # Mail subject.
    headers:
      Subject: " 【监控告警】 {{ .CommonLabels.alertname }} "
    # Mail body rendered from the "nodelist.html" template.
    html: '{{ template "nodelist.html" . }}'
  # webhook_configs:
  # - url: 'http://dingtalk:8060/dingtalk/webhook/send'
  #   send_resolved: true

alertmanager模板内容

  1. email模版
$ cat alertmanager/template/email.tmpl 
{{ define "email.default.message" }}{{/* Plain-text summary: one section per alert in the notification. */}}
{{ range $alert := .Alerts }}
【系统报警】
告警状态:{{ $alert.Status }}
告警级别:{{ $alert.Labels.severity }}
告警应用:{{ $alert.Annotations.summary }}
告警详情:{{ $alert.Annotations.description }}
触发阀值:{{ $alert.Annotations.value }}
告警主机:{{ $alert.Labels.instance }}
告警时间:{{ $alert.StartsAt.Format "2006-01-02 15:04:05" }}
{{ end }}
{{ end }}
  2. nodelist.html内容
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<!--
Style and HTML derived from https://github.com/mailgun/transactional-email-templates

The MIT License (MIT)

Copyright (c) 2014 Mailgun

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
-->
<html xmlns="http://www.w3.org/1999/xhtml" xmlns="http://www.w3.org/1999/xhtml" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<head style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
<meta name="viewport" content="width=device-width" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;" />
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;" />
<title style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">{{ template "__subject" . }}</title>

</head>

<body itemscope="" itemtype="http://schema.org/EmailMessage" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; -webkit-font-smoothing: antialiased; -webkit-text-size-adjust: none; height: 100%; line-height: 1.6em; width: 100% !important; background-color: #f6f6f6; margin: 0; padding: 0;" bgcolor="#f6f6f6">

<table style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; width: 100%; background-color: #f6f6f6; margin: 0;" bgcolor="#f6f6f6">
  <div style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; max-width: 600px; display: block; margin: 0 auto; padding: 0;">
    <table width="100%" cellpadding="0" cellspacing="0" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; border-radius: 3px; background-color: #fff; margin: 0; border: 1px solid #e9e9e9;" bgcolor="#fff">
      <tr style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
        <td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 16px; vertical-align: top; color: #fff; font-weight: 500; text-align: center; border-radius: 3px 3px 0 0; background-color: #E6522C; margin: 0; padding: 20px;" align="center" bgcolor="#E6522C" valign="top">
          发生 {{ .Alerts | len }}  个 {{ range .GroupLabels.SortedPairs }}
                {{ .Value }}
          {{ end }} 告警 !!请尽快处理
        </td>
      </tr>
      <tr style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
        <td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 10px;" valign="top">
          <table border="1" cellpadding="2" cellspacing="0" width="100%" cellpadding="0" cellspacing="0" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
            <tr border="1" cellpadding="2" cellspacing="0" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
            </tr>
          <table border="1" cellpadding="2" cellspacing="0" width="100%" cellpadding="0" cellspacing="0" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
            <!-- Column headers; every header cell is explicitly closed. -->
            <tr style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
              <td width="50px" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 16px; vertical-align: middle; margin: 0; padding: 3 3 3px;" valign="top" >
                <strong>告警名称</strong>
              </td>
              <td width="50px" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 16px; vertical-align: middle; margin: 0; padding: 3 3 3px;" valign="top" >
                <strong>告警级别</strong>
              </td>
              <td width="50px" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 16px; vertical-align: middle; margin: 0; padding: 3 3 3px;" valign="top" >
                <strong>实例</strong>
              </td>
              <td width="50px" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 16px; vertical-align: middle; margin: 0; padding: 3 3 3px;" valign="top" >
                <strong>所属系统</strong>
              </td>
              <td width="50px" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 16px; vertical-align: middle; margin: 0; padding: 3 3 3px;" valign="top" >
                <strong>厂商</strong>
              </td>
              <td width="50px" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 16px; vertical-align: middle; margin: 0; padding: 3 3 3px;" valign="top" >
                <strong>触发时间</strong>
              </td>
              <td width="50px" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 16px; vertical-align: middle; margin: 0; padding: 3 3 3px;" valign="top" >
                <strong>说明</strong>
              </td>
            </tr>
            {{ range .Alerts.Firing }}{{/* One row per firing alert; cell order follows the header row: name, severity, instance, owning system, vendor, start time, description. */}}
            <tr style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
              <td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 3 3 3px;" valign="top">
                {{ .Labels.alertname }}
              </td>
              <td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 3 3 3px;" valign="top">
                {{ .Labels.severity }}
              </td>
              <td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 3 3 3px;" valign="top">
                {{ .Labels.instance }}
              </td>
              <td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 3 3 3px;" valign="top">
                {{ .Labels.ownningsystem }}
              </td>
              <td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 3 3 3px;" valign="top">
                {{ .Labels.company }}
              </td>
              <td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 3 3 3px;" valign="top">
                {{  .StartsAt.Format "2006-01-02 15:04:05"  }}
              </td>
              <td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; vertical-align: top; margin: 0; padding: 3 3 3px;" valign="top">
                {{ .Annotations.description  }}
              </td>
            </tr>
            {{ end }}
          </table>
        </td>
      </tr>
      </table>
    </table>

    <div style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; width: 100%; clear: both; color: #999; margin: 0; padding: 20px;">
      <table width="100%" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
        <tr style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 14px; margin: 0;">
          <td style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 12px; vertical-align: top; text-align: center; color: #999; margin: 0; padding: 0 0 20px;" align="center" valign="top"><a href="{{ .ExternalURL }}" style="font-family: 'Helvetica Neue', Helvetica, Arial, sans-serif; box-sizing: border-box; font-size: 12px; color: #999; text-decoration: underline; margin: 0;">Sent by {{ template "__alertmanager" . }}</a></td>
        </tr>
      </table>
    </div></div>
</table>

</body>
</html>
  3. wechat模版
$ cat alertmanager/template/wechat.tmpl 
{{ define "wechat.default.message" }}{{/* Plain-text summary: one section per alert in the notification. */}}
{{ range $alert := .Alerts }}
【系统报警】
告警状态:{{ $alert.Status }}
告警级别:{{ $alert.Labels.severity }}
告警应用:{{ $alert.Annotations.summary }}
告警详情:{{ $alert.Annotations.description }}
触发阀值:{{ $alert.Annotations.value }}
告警主机:{{ $alert.Labels.instance }}
告警时间:{{ $alert.StartsAt.Format "2006-01-02 15:04:05" }}
{{ end }}
{{ end }}

Prometheus配置文件

  1. prometheus.yml
$ cat  prometheus/prometheus.yml
global:
  # Default interval between scrapes.
  scrape_interval: 15s
  evaluation_interval: 15s

# Alertmanager endpoint that alerts are sent to.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - '192.168.2.201:9093'

# Rule files to load.
rule_files:
  - 'rules/*.yml'
# rule_files:
#   - "*rules.yml"


scrape_configs: # scrape targets: configures where metrics are collected from
  - job_name: prometheus # job name
    scrape_interval: 5s # scrape every 5s
    static_configs: # addresses of the scrape targets
    - targets: ['192.168.2.201:9095']  # Prometheus scrapes itself
    # NOTE(review): non-default path - presumably because the server is started
    # with --web.external-url=prometheus; confirm.
    metrics_path: /prometheus/metrics

  - job_name: 'alertmanager'
    scrape_interval: 5s # scrape every 5s
    static_configs:
    - targets: ['192.168.2.201:9093']
      labels:
        instance: alertmanager

  #- job_name: 'consul-prometheus'
  #  consul_sd_configs:
  #    - server: '192.168.2.201:9201'
  #      services: []
    #relabel_configs:
    #  - source_labels: [__meta_consul_tags]
    #    regex: .*test.*
    #    action: keep

  - job_name: 'Linux_host'
    file_sd_configs:
      - files: ['/etc/prometheus/targets/node/*.yml'] # path pattern of the service-discovery files
        refresh_interval: 5s  # re-read interval

  - job_name: 'docker_cadvisor'
    file_sd_configs:
      - files: ['/etc/prometheus/targets/cAdvisor/*.yml'] # path pattern of the service-discovery files
        refresh_interval: 5s  # re-read interval

  #- job_name: 'docker_cadvisor'
  #  scrape_interval: 5s #每隔5s获取一次监控数据
  #  metrics_path: /metrics
  #  static_configs:
  #  - targets: ['192.168.2.200:9101']

  # ICMP (ping) probe performed through blackbox_exporter.
  - job_name: 'blackbox_icmp'
    metrics_path: /probe
    params:
      module: [icmp]  # blackbox module to run
    static_configs:
      - targets:
        - 192.168.2.189  # host to probe
    # Without this relabeling Prometheus would scrape 192.168.2.189/probe
    # itself: the probed host must be passed as the ?target= parameter and the
    # scrape must be sent to the exporter.
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        # NOTE(review): address of blackbox_exporter as reachable from the
        # Prometheus container (127.0.0.1 would be the container itself);
        # adjust to where the exporter actually runs.
        replacement: 192.168.2.201:9115
规则rules
  1. 容器规则
$ cat  prometheus/rules/container_sys.yml 
# Alerting rules for containers (metrics exported by cAdvisor).
groups:
- name: Container
  rules:
  # CPU used per container over the last 5 minutes, in percent of one core.
  - alert: ContainerCPU
    expr: (sum by(name,instance) (rate(container_cpu_usage_seconds_total{image!=""}[5m]))*100) > 200
    for: 1m
    labels:
      name: CPU_Usage
      severity: Warning
    annotations:
      summary: "{{ $labels.name }} "
      description: " 容器CPU使用超200%."
      value: "{{ $value }}%"
  # Non-cache memory as a percentage of the container's memory limit.
  # `> 0` drops containers that report a limit of 0 (no limit set): dividing by
  # 0 gives +Inf, which would keep the alert firing for them. Usage cannot
  # exceed the limit, so the threshold is 90% rather than an unreachable 200%.
  - alert: Memory Usage
    expr: (container_memory_usage_bytes{name=~".+"} - container_memory_cache{name=~".+"}) / (container_spec_memory_limit_bytes{name=~".+"} > 0) * 100 > 90
    for: 1m
    labels:
      name: Memory
      severity: Warning
    annotations:
      summary: "{{ $labels.name }} "
      description: " 容器内存使用超过限制的90%."
      value: "{{ $value }}%"
  # Receive throughput per container interface: rate() gives the 5-minute
  # average; bytes/s divided by 1048576 is MB/s. The series label is
  # `interface` (the one the selector filters on).
  - alert: Network_receive
    expr: rate(container_network_receive_bytes_total{name=~".+",interface=~"eth.+"}[5m]) / 1048576  > 10
    for: 1m
    labels:
      name: Network_receive
      severity: Warning
    annotations:
      summary: "{{ $labels.name }} "
      description: "容器 [{{ $labels.interface }}] 网卡5分钟平均接收流量超过10MB/s."
      value: "{{ $value }}MB/s"
  # Transmit throughput per container interface, same units as above.
  - alert: Network_transmit
    expr: rate(container_network_transmit_bytes_total{name=~".+",interface=~"eth.+"}[5m]) / 1048576  > 10
    for: 1m
    labels:
      name: Network_transmit
      severity: Warning
    annotations:
      summary: "{{ $labels.name }} "
      description: "容器 [{{ $labels.interface }}] 网卡5分钟平均发送流量超过10MB/s."
      value: "{{ $value }}MB/s"
  2. 主机规则
$ cat  prometheus/rules/host_sys.yml 
# Alerting rules for Linux hosts (node_* metrics).
groups:
- name: Linux_host
  rules:
  # Memory in use (total minus free, buffers and cache) as a percentage.
  - alert: HostMemory Usage
    expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes + node_memory_Buffers_bytes + node_memory_Cached_bytes)) / node_memory_MemTotal_bytes * 100 >  90
    for: 1m
    labels:
      name: Memory
      severity: Warning
    annotations:
      summary: " {{ $labels.appname }} "
      description: "宿主机内存使用率超过90%."
      value: "{{ $value }}"
  # Non-idle CPU fraction (0-1) averaged across cores.
  - alert: HostCPU Usage
    expr: sum(avg without (cpu)(irate(node_cpu_seconds_total{mode!='idle'}[5m]))) by (instance,appname) > 0.8
    for: 1m
    labels:
      name: CPU
      severity: Warning
    annotations:
      summary: " {{ $labels.appname }} "
      description: "宿主机CPU使用率超过80%."
      value: "{{ $value }}"
  - alert: HostLoad
    expr: node_load5 > 20
    for: 1m
    labels:
      name: Load
      severity: Warning
    annotations:
      summary: "{{ $labels.appname }} "
      description: " 主机负载5分钟超过20."
      value: "{{ $value }}"
  - alert: HostFilesystem Usage
    expr: (node_filesystem_size_bytes-node_filesystem_free_bytes)/node_filesystem_size_bytes*100>80
    for: 1m
    labels:
      name: Disk
      severity: Warning
    annotations:
      summary: " {{ $labels.appname }} "
      description: " 宿主机 [ {{ $labels.mountpoint }} ]分区使用超过80%."
      value: "{{ $value }}%"
  # The host job in prometheus.yml is named 'Linux_host'. Label regexes are
  # fully anchored, so job=~"Host" matched nothing and the alert never fired.
  # rate() is used because the description promises a 1-minute average.
  - alert: HostDiskio writes
    expr: rate(node_disk_writes_completed_total{job="Linux_host"}[1m]) > 10
    for: 1m
    labels:
      name: Diskio
      severity: Warning
    annotations:
      summary: " {{ $labels.appname }} "
      description: " 宿主机 [{{ $labels.device }}]磁盘1分钟平均写入IO负载较高."
      value: "{{ $value }}iops"
  - alert: HostDiskio reads
    expr: rate(node_disk_reads_completed_total{job="Linux_host"}[1m]) > 10
    for: 1m
    labels:
      name: Diskio
      severity: Warning
    annotations:
      summary: " {{ $labels.appname }} "
      description: " 宿主机 [{{ $labels.device }}]磁盘1分钟平均读取IO负载较高."
      value: "{{ $value }}iops"
  # Throughput on physical interfaces: rate() gives the 5-minute average;
  # bytes/s divided by 1048576 is MB/s (not Mbps).
  - alert: HostNetwork_receive
    expr: rate(node_network_receive_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1048576  > 10
    for: 1m
    labels:
      name: Network_receive
      severity: Warning
    annotations:
      summary: " {{ $labels.appname }} "
      description: " 宿主机 [{{ $labels.device }}] 网卡5分钟平均接收流量超过10MB/s."
      value: "{{ $value }}MB/s"
  - alert: hostNetwork_transmit
    expr: rate(node_network_transmit_bytes_total{device!~"lo|bond[0-9]|cbr[0-9]|veth.*|virbr.*|ovs-system"}[5m]) / 1048576  > 10
    for: 1m
    labels:
      name: Network_transmit
      severity: Warning
    annotations:
      summary: " {{ $labels.appname }} "
      description: " 宿主机 [{{ $labels.device }}] 网卡5分钟内平均发送流量超过10MB/s."
      value: "{{ $value }}MB/s"
  3. 服务规则
$ cat  prometheus/rules/service_down.yml 
# Alerting rules for stopped processes / containers.
groups:
- name: Container
  rules:
  # A monitored process group reports zero running processes.
  - alert: ProcessDown
    expr: namedprocess_namegroup_num_procs  == 0
    for: 1m
    labels:
      name: instance
      severity: Critical
    annotations:
      summary: " {{ $labels.appname }}"
      description: " 进程停止运行 "
      value: "{{ $value }}"
  # The Grafana container is named exactly "grafana" in docker-compose.yml.
  # The regex "grafana.+" needs at least one more character, so it only
  # matched "grafana_renderer" and hid a stopped Grafana.
  - alert: Grafana down
    expr: absent(container_last_seen{name="grafana"}) == 1
    for: 1m
    labels:
      name: grafana
      severity: Critical
    annotations:
      summary: "Grafana"
      description: "Grafana容器停止运行"
      value: "{{ $value }}"

Grafana配置文件

grafana默认配置文件

$ cat  grafana/grafana-conf/defaults.ini | grep -vE "#|^$"
app_mode = production
instance_name = ${HOSTNAME}
[paths]
data = data
temp_data_lifetime = 24h
logs = data/log
plugins = data/plugins
provisioning = conf/provisioning
[server]
; HTTP listener and public URL of this Grafana instance.
protocol = http
http_addr =
; Same port that docker-compose.yml publishes for the grafana service (9096:9096).
http_port = 9096
domain = 192.168.2.201
enforce_domain = false
; Public URL built from the values above plus the /grafana sub path.
root_url = %(protocol)s://%(domain)s:%(http_port)s/grafana
serve_from_sub_path = true
router_logging = false
static_root_path = public
enable_gzip = true 
cert_file =
cert_key =
; NOTE(review): presumably only used when protocol = socket - confirm.
socket = /tmp/grafana.sock
[database]
type = sqlite3
host = 127.0.0.1:3306
name = grafana
user = root
password =
url =
max_idle_conn = 2
max_open_conn =
conn_max_lifetime = 14400
log_queries =
ssl_mode = disable
ca_cert_path =
client_key_path =
client_cert_path =
server_cert_name =
path = grafana.db
cache_mode = private
[remote_cache]
type = database
connstr =
[dataproxy]
logging = false
timeout = 30
keep_alive_seconds = 30
tls_handshake_timeout_seconds = 10
expect_continue_timeout_seconds = 1
max_idle_connections = 100
idle_conn_timeout_seconds = 90
send_user_header = false
[analytics]
reporting_enabled = false 
check_for_updates = true
google_analytics_ua_id =
google_tag_manager_id =
[security]
disable_initial_admin_creation = false
admin_user = admin
admin_password = admin
secret_key = xxxxxxxxxxxxxxxxxxxxxx
disable_gravatar = false
data_source_proxy_whitelist =
disable_brute_force_login_protection = false
cookie_secure = false
cookie_samesite = lax
allow_embedding = false
strict_transport_security = false
strict_transport_security_max_age_seconds = 86400
strict_transport_security_preload = false
strict_transport_security_subdomains = false
x_content_type_options = true
x_xss_protection = true
[snapshots]
external_enabled = true
external_snapshot_url = https://snapshots-origin.raintank.io
external_snapshot_name = Publish to snapshot.raintank.io
public_mode = false
snapshot_remove_expired = true
[dashboards]
versions_to_keep = 20
min_refresh_interval = 5s
default_home_dashboard_path =
[users]
allow_sign_up = false
allow_org_create = false
auto_assign_org = true
auto_assign_org_id = 1
auto_assign_org_role = Viewer
verify_email_enabled = false
login_hint = email or username
password_hint = password
default_theme = dark
external_manage_link_url =
external_manage_link_name =
external_manage_info =
viewers_can_edit = false
editors_can_admin = false
user_invite_max_lifetime_duration = 24h
[auth]
login_cookie_name = grafana_session
login_maximum_inactive_lifetime_duration =
login_maximum_lifetime_duration =
token_rotation_interval_minutes = 10
disable_login_form = false
disable_signout_menu = false
signout_redirect_url =
oauth_auto_login = false
oauth_state_cookie_max_age = 600
api_key_max_seconds_to_live = -1
sigv4_auth_enabled = false
[auth.anonymous]
enabled = false
org_name = Main Org.
org_role = Viewer
hide_version = true 
[auth.github]
enabled = false
allow_sign_up = true
client_id = some_id
client_secret =
scopes = user:email,read:org
auth_url = https://github.com/login/oauth/authorize
token_url = https://github.com/login/oauth/access_token
api_url = https://api.github.com/user
allowed_domains =
team_ids =
allowed_organizations =
[auth.gitlab]
enabled = false
allow_sign_up = true
client_id = some_id
client_secret =
scopes = api
auth_url = https://gitlab.com/oauth/authorize
token_url = https://gitlab.com/oauth/token
api_url = https://gitlab.com/api/v4
allowed_domains =
allowed_groups =
[auth.google]
enabled = false
allow_sign_up = true
client_id = some_client_id
client_secret =
scopes = https://www.googleapis.com/auth/userinfo.profile https://www.googleapis.com/auth/userinfo.email
auth_url = https://accounts.google.com/o/oauth2/auth
token_url = https://accounts.google.com/o/oauth2/token
api_url = https://www.googleapis.com/oauth2/v1/userinfo
allowed_domains =
hosted_domain =
[auth.grafananet]
enabled = false
allow_sign_up = true
client_id = some_id
client_secret =
scopes = user:email
allowed_organizations =
[auth.grafana_com]
enabled = false
allow_sign_up = true
client_id = some_id
client_secret =
scopes = user:email
allowed_organizations =
[auth.azuread]
name = Azure AD
enabled = false
allow_sign_up = true
client_id = some_client_id
client_secret =
scopes = openid email profile
auth_url = https://login.microsoftonline.com/<tenant-id>/oauth2/v2.0/authorize
token_url = https://login.microsoftonline.com/<tenant-id>/oauth2/v2.0/token
allowed_domains =
allowed_groups =
[auth.okta]
name = Okta
enabled = false
allow_sign_up = true
client_id = some_id
client_secret =
scopes = openid profile email groups
auth_url = https://<tenant-id>.okta.com/oauth2/v1/authorize
token_url = https://<tenant-id>.okta.com/oauth2/v1/token
api_url = https://<tenant-id>.okta.com/oauth2/v1/userinfo
allowed_domains =
allowed_groups =
role_attribute_path =
[auth.generic_oauth]
name = OAuth
enabled = false
allow_sign_up = true
client_id = some_id
client_secret =
scopes = user:email
email_attribute_name = email:primary
email_attribute_path =
login_attribute_path =
role_attribute_path =
id_token_attribute_name =
auth_url =
token_url =
api_url =
allowed_domains =
team_ids =
allowed_organizations =
tls_skip_verify_insecure = false
tls_client_cert =
tls_client_key =
tls_client_ca =
[auth.basic]
enabled = true
[auth.proxy]
enabled = false
header_name = X-WEBAUTH-USER
header_property = username
auto_sign_up = true
ldap_sync_ttl = 60
sync_ttl = 60
whitelist =
headers =
enable_login_token = false
[auth.ldap]
enabled = true
config_file = /usr/share/grafana/conf/ldap.toml
allow_sign_up = true
sync_cron = "0 0 1 * * *"
active_sync_enabled = true
[smtp]
; Outgoing mail settings. The GF_SMTP_* variables set in docker-compose.yml
; override the corresponding keys here.
enabled = true
host = "xxx.xx.163.com:25"
user = "xxxxx@163.com"
password = "123456"
cert_file =
key_file =
; NOTE(review): disables TLS certificate verification - confirm this is intended.
skip_verify = true
from_address = "xxxxx@163.com"
from_name = "Grafana Admin"
; EHLO takes a host name, not a URL; left empty so Grafana uses its default
; identity (the instance name).
ehlo_identity =
startTLS_policy =
[emails]
welcome_email_on_sign_up = true
templates_pattern = emails/*.html
[log]
mode = console file
level = info
filters =
[log.console]
level =
format = console
[log.file]
level =
format = text
log_rotate = true
max_lines = 1000000
max_size_shift = 28
daily_rotate = true
max_days = 7
[log.syslog]
level =
format = text
network =
address =
facility =
tag =
[quota]
enabled = false
org_user = 10
org_dashboard = 100
org_data_source = 10
org_api_key = 10
user_org = 10
global_user = -1
global_org = -1
global_dashboard = -1
global_api_key = -1
global_session = -1
[alerting]
enabled = true
execute_alerts = true
error_or_timeout = alerting
nodata_or_nullvalues = no_data
concurrent_render_limit = 5
evaluation_timeout_seconds = 30
notification_timeout_seconds = 30
max_attempts = 3
min_interval_seconds = 1
max_annotation_age =
max_annotations_to_keep =
[annotations.dashboard]
max_age =
max_annotations_to_keep =
[annotations.api]
max_age =
max_annotations_to_keep =
[explore]
enabled = true
[metrics]
; Metrics settings; the feature is switched off here.
enabled = false
interval_seconds = 10
disable_total_stats = false
basic_auth_username =
basic_auth_password =
[metrics.environment_info]
[metrics.graphite]
address =
prefix = prod.grafana.%(instance_name)s.
[grafana_net]
url = https://grafana.com
[grafana_com]
url = https://grafana.com
[tracing.jaeger]
address =
always_included_tag =
sampler_type = const
sampler_param = 1
sampling_server_url =
zipkin_propagation = false
disable_shared_zipkin_spans = false
[external_image_storage]
provider =
[external_image_storage.s3]
endpoint =
path_style_access =
bucket_url =
bucket =
region =
path =
access_key =
secret_key =
[external_image_storage.webdav]
url =
username =
password =
public_url =
[external_image_storage.gcs]
key_file =
bucket =
path =
enable_signed_urls = false
signed_url_expiration =
[external_image_storage.azure_blob]
account_name =
account_key =
container_name =
[external_image_storage.local]
[rendering]
server_url =
callback_url =
concurrent_render_request_limit = 30
[panels]
enable_alpha = false
disable_sanitize_html = true
[plugins]
enable_alpha = false
app_tls_skip_verify_insecure = false
allow_loading_unsigned_plugins =
marketplace_url = https://grafana.com/grafana/plugins/
[plugin.grafana-image-renderer]
rendering_timezone =
rendering_language =
rendering_viewport_device_scale_factor =
rendering_ignore_https_errors =
rendering_verbose_logging =
rendering_dumpio =
rendering_args =
rendering_chrome_bin =
rendering_mode =
rendering_clustering_mode =
rendering_clustering_max_concurrency =
rendering_viewport_max_width =
rendering_viewport_max_height =
rendering_viewport_max_device_scale_factor =
grpc_host =
grpc_port =
[enterprise]
license_path =
[feature_toggles]
enable =
[date_formats]
full_date = YYYY-MM-DD HH:mm:ss
interval_second = HH:mm:ss
interval_minute = HH:mm
interval_hour = MM/DD HH:mm
interval_day = MM/DD
interval_month = YYYY-MM
interval_year = YYYY
use_browser_locale = false
default_timezone = browser

LDAP集成

4.监控展示系统Grafana和LDAP用户认证平台集成方法

ldap.toml配置文件的内容如下:

$ cat  grafana/grafana-conf/ldap.toml | grep -vE "#|^$"
[[servers]]
host = "192.168.2.210"
port = 389
use_ssl = false
start_tls = false
ssl_skip_verify = false
bind_dn = "cn=yuanadmin,dc=yuan,dc=com"
bind_password = 'xxxxxxxxxxx'
search_filter = "(cn=%s)"
search_base_dns = ["dc=yuan,dc=com"]
[servers.attributes]
name = "displayName"
surname = "sn"
username = "cn"
member_of = "memberOf"
email = "mail"
[[servers.group_mappings]]
group_dn = "cn=Grafana-admins,ou=GrafanaGroups,ou=jishubu,ou=BeiJing,dc=yuan,dc=com"
org_role = "Admin"
[[servers.group_mappings]]
group_dn = "cn=Grafana-editors,ou=GrafanaGroups,ou=jishubu,ou=BeiJing,dc=yuan,dc=com"
org_role = "Editor"
[[servers.group_mappings]]
group_dn = "cn=Grafana-viewers,ou=GrafanaGroups,ou=jishubu,ou=BeiJing,dc=yuan,dc=com"
org_role = "Viewer"

收集的节点目标

$ cat  prometheus/targets/node/centos7_node.yml 
# Ops servers (CentOS 7): host metrics endpoints on port 9100.
- labels:
    appname: 'centos7_node'
  targets:
    - '192.168.2.189:9100'
    - '192.168.2.200:9100'
    - '192.168.2.201:9100'

$ cat prometheus/targets/cAdvisor/centos7_cadvisor.yml 
# Ops servers: container metrics endpoints on port 9101.
- targets:
    - '192.168.2.189:9101'
    - '192.168.2.200:9101'
    - '192.168.2.201:9101'

监控容器cadvisor

脚本内容

$ cat ./cadvisor_start.sh
#!/bin/sh
# (Re)create the cAdvisor container on this host and publish its web/metrics
# port 8080 on host port 9101.

# Ask the Docker daemon where it stores its data so the matching host
# directory is mounted into the container.
DOCKER_ROOT=$(docker info 2>/dev/null | awk '/Docker Root/{print $4}')
# If the lookup fails, fall back to Docker's default data root; an empty value
# would otherwise bind-mount "/" over /var/lib/docker inside the container.
DOCKER_ROOT=${DOCKER_ROOT:-/var/lib/docker}
DOCKER_DIR="${DOCKER_ROOT}/:/var/lib/docker:ro"

# Remove any previous instance; on a first run these two commands print a
# harmless "No such container" error and the script continues.
docker stop cadvisor
docker rm -f cadvisor

# NOTE(review): the google/cadvisor image on Docker Hub appears to be no longer
# updated; upstream publishes gcr.io/cadvisor/cadvisor - confirm before switching.
docker run \
  --restart=always \
  --privileged=true \
  --volume=/:/rootfs:ro \
  --volume=/var/run:/var/run:ro \
  --volume=/sys:/sys:ro \
  --volume="$DOCKER_DIR" \
  --volume=/dev/disk/:/dev/disk:ro \
  --publish=9101:8080 \
  --detach=true \
  --name=cadvisor \
  --device=/dev/kmsg \
  google/cadvisor:latest

ansible-playbook部署

$ cat ./cadvisor_allnode-deploy.yml
---
# Copy the cAdvisor start script to every host in the inventory and run it.
- hosts: all
  remote_user: root
  gather_facts: false
  tasks:
  - name: 拷贝cadvisor脚本到所有被监控节点
    copy:
      src: ./cadvisor_start.sh
      dest: /opt/
      # Quoted so the mode is passed as the string "0755", not a YAML number.
      mode: "0755"
  - name: 执行cadvisor启动脚本
    # Kept as `shell` so the script runs through a shell on the target host.
    shell: /opt/cadvisor_start.sh

监控主机

容器化部署的脚本内容

$ cat ./run_node_exporter.sh
#!/bin/sh
# (Re)create the node_exporter container. It shares the host's network and PID
# namespaces and reads the host's /proc, /sys and / through read-only mounts.

# Remove any previous instance; on a first run these two commands print a
# harmless "No such container" error and the script continues.
docker stop node_exporter
docker rm -f node_exporter

# The regex uses a single "$": inside shell single quotes nothing is expanded,
# so the "$$" escape needed in docker-compose files must not be copied here.
# NOTE(review): newer node_exporter releases rename this flag to
# --collector.filesystem.mount-points-exclude; the image tag is unpinned, so
# confirm which flag the pulled version accepts.
docker run -d --name node_exporter \
  --restart=always \
  --net="host" \
  --pid="host" \
  -v "/proc:/host/proc:ro" \
  -v "/sys:/host/sys:ro" \
  -v "/:/rootfs:ro" \
  prom/node-exporter \
  --path.procfs=/host/proc \
  --path.rootfs=/rootfs \
  --path.sysfs=/host/sys \
  --collector.filesystem.ignored-mount-points='^/(sys|proc|dev|host|etc)($|/)'

使用ansible进行传统部署的脚本

$ cat ./node_exporter.yml
---
# Install node_exporter 1.1.2 from a local tarball and run it as a systemd
# service on every host in the inventory.
- hosts: all
  remote_user: root
  gather_facts: false
  tasks:
  - name: 安装node_exporter
    unarchive:
      src: node_exporter-1.1.2.linux-amd64.tar.gz
      dest: /opt/
  - name: 添加node_exporter服务
    copy:
      src: node_exporter.service
      dest: /etc/systemd/system/
  - name: 设置开机自动启动
    # daemon_reload makes systemd pick up the unit file copied by the previous
    # task; the module reports "changed" only when state actually changes.
    systemd:
      name: node_exporter.service
      daemon_reload: true
      enabled: true
  - name: 启动服务
    systemd:
      name: node_exporter.service
      state: started

node_exporter.service内容:

$ cat ./node_exporter.service
# systemd unit that runs the node_exporter binary unpacked under /opt.
[Unit]
Description=Prometheus node_exporter
# Depend on, and start after, network.target and remote-fs.target.
Requires=network.target remote-fs.target
After=network.target remote-fs.target

[Service]
Type=simple
# NOTE(review): the exporter runs as root here; confirm whether a dedicated
# unprivileged account would be sufficient.
User=root
Group=root
# Listen on all interfaces, port 9100.
ExecStart=/opt/node_exporter-1.1.2.linux-amd64/node_exporter --web.listen-address=0.0.0.0:9100
# "systemctl reload" sends SIGHUP to the main process.
# NOTE(review): confirm node_exporter handles SIGHUP rather than exiting on it.
ExecReload=/bin/kill -HUP $MAINPID
# On stop, only the main process is signalled.
KillMode=process
# Restart 5 seconds after an unclean exit.
Restart=on-failure
RestartSec=5s

[Install]
# Enabling the unit hooks it into multi-user.target.
WantedBy=multi-user.target

至此,基于Docker容器化的Prometheus和Grafana监控系统已部署完成。

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论
以下是使用Docker部署Prometheus和Grafana的步骤:

1. 安装Docker和Docker Compose

如果你还没有安装Docker和Docker Compose,可以参考官方文档进行安装。

2. 创建Docker Compose文件

在本地创建一个文件夹(例如:`prometheus-grafana`),并在其中创建一个`docker-compose.yml`文件,用于定义Prometheus和Grafana服务。在文件中,添加以下内容:

```
version: '3'
services:
  prometheus:
    image: prom/prometheus
    container_name: prometheus
    ports:
      - "9090:9090"
    volumes:
      - ./prometheus:/etc/prometheus
    command:
      - --config.file=/etc/prometheus/prometheus.yml
    restart: always
  grafana:
    image: grafana/grafana
    container_name: grafana
    ports:
      - "3000:3000"
    volumes:
      - ./grafana:/var/lib/grafana
    restart: always
```

这个文件定义了两个服务:`prometheus`和`grafana`。`prometheus`服务将使用`prom/prometheus`镜像,并将本地`./prometheus`目录挂载到容器中`/etc/prometheus`目录,`grafana`服务将使用`grafana/grafana`镜像,并将本地`./grafana`目录挂载到容器中`/var/lib/grafana`目录。

3. 创建Prometheus配置文件

在本地创建`prometheus`文件夹,在其中创建一个`prometheus.yml`文件,用于定义Prometheus监控的目标和规则。在文件中,添加以下内容:

```
global:
  scrape_interval: 15s
  evaluation_interval: 15s
scrape_configs:
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']
```

这个文件定义了一个`prometheus`作业,将使用Prometheus默认的`localhost:9090`目标进行监控。

4. 运行Docker Compose

在终端中,进入`prometheus-grafana`目录,并运行以下命令:

```
docker-compose up -d
```

这将启动Prometheus和Grafana服务,并将它们置于后台运行。你可以使用以下命令检查服务是否正在运行:

```
docker-compose ps
```

5. 访问Grafana

在浏览器中,访问`http://localhost:3000`,使用默认的用户名和密码(admin/admin)登录Grafana。现在,你可以在Grafana中添加一个数据源,选择Prometheus,并将URL设置为`http://prometheus:9090`(因为Prometheus服务的名称是`prometheus`,而不是`localhost`)。完成后,你可以创建一个新的仪表板并添加一个面板,从而开始使用Grafana可视化Prometheus监控数据。

以上就是使用Docker部署Prometheus和Grafana的步骤。
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

DevSecOps云原生LYJ

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值