Deploying Prometheus + Grafana with docker-compose

Introduction to Prometheus:

   Prometheus is an open-source system monitoring and alerting toolkit. It has joined the CNCF, becoming the second project hosted there after Kubernetes. In Kubernetes environments it is the usual choice for monitoring; it supports a wide range of exporters for collecting metrics as well as a Pushgateway for pushed metrics, and its performance is sufficient for clusters on the scale of ten thousand machines.

Prometheus components:

 prometheus: the core component; it scrapes and stores metrics and evaluates alerting rules

 alertmanager (alerting): receives alerts fired by Prometheus and handles grouping, routing and notification

 node_exporter: exposes host metrics such as CPU, memory and network usage

 blackbox_exporter (black-box probing): checks website status codes, host liveness, open ports, etc.

 mysqld_exporter: exposes MySQL metrics

 redis_exporter: exposes Redis metrics

docker-compose setup:

  Preparation: create the mount directories

mkdir -p /docker/{prometheus,prometheus/data,alertmanager,alertmanager/template,grafana,blackbox_exporter}
cd /docker
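
The official images run as non-root users (Grafana as UID 472, Prometheus and Alertmanager as nobody/UID 65534 in current releases), so the bind-mounted data directories usually need their ownership or permissions adjusted before the first start. A quick sketch; the exact UIDs may differ between image versions:

chown -R 472:472 /docker/grafana                                     # Grafana data directory
chown -R 65534:65534 /docker/prometheus/data /docker/alertmanager    # Prometheus / Alertmanager data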

Create the Prometheus configuration file

vi /docker/prometheus/prometheus.yml

global:
  scrape_interval:     15s
  evaluation_interval: 15s
 
alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - docker_alertmanager_1:9093
 
rule_files:
  - "*rules.yml"
  
scrape_configs:
  - job_name: 'prometheus'
    static_configs:
    - targets: ['192.1.1.12:9090']
 
  - job_name: 'node'
    static_configs:
    - targets: ['192.1.1.12:9100','192.168.88.11:9100']
 
  - job_name: 'alertmanager'
    static_configs:
    - targets: ['192.1.1.12:9093']
 
  - job_name: 'cadvisor'
    static_configs:
    - targets: ['192.168.88.13:8080','192.168.88.11:8080']
 
 
  - job_name: 'mysql-exporter'
    #scrape_interval: 5s
    static_configs:
      - targets: ['192.168.88.13:9104']
 
  - job_name: 'redis-exporter'
    #scrape_interval: 5s
    static_configs:
      - targets: ['192.168.88.13:9121']
 
  # The redis targets below are for a Redis cluster setup; remove this job if you are not running a cluster
  - job_name: 'redis_exporter_targets'
    static_configs:
      - targets:
        - redis://172.16.8.58:6379
    metrics_path: /metrics
    relabel_configs:
      - source_labels: [__address__]
        target_label: __param_target
      - source_labels: [__param_target]
        target_label: instance
      - target_label: __address__
        replacement: 172.16.0.46:9121
 
 
  - job_name: 'http_status'  # job name
    metrics_path: /probe     # path the metrics are fetched from
    params:
      module: [http_2xx]     # the module name defined in the blackbox_exporter config
    file_sd_configs:         # there are many URLs to monitor, so the targets live in a separate file, described later
      - files: 
        - '/etc/prometheus/job-web.yml'
        refresh_interval: 30s # reload every 30 seconds, so new targets are picked up without a restart
    relabel_configs:
      - source_labels: [__address__]  # the target's address, e.g. https://baidu.com when monitoring Baidu
        target_label: __param_target  # __param_ is the default parameter prefix and target is the parameter name, i.e. copy the value of __address__ into __param_target (target=https://baidu.com when monitoring Baidu)
      - source_labels: [__param_target]
        target_label: instance        # copy the value of __param_target into the instance label
      - target_label: __address__
        replacement: docker_blackbox_exporter_1:9115 # the original target is the site URL, but Prometheus must scrape blackbox_exporter rather than the site itself, so the scrape address is replaced with the blackbox_exporter address
 
  - job_name: 'rocketmq-exporter'
    #scrape_interval: 5s
    static_configs:
      - targets: ['<rocketmq-exporter IP>:5557']
        labels:
          project: baidu
          instance: 172.30.0.150:9876 # extra label added to make searching easier
          app: rocketmq
          env: pro
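
Before wiring the file into the stack, it can be validated with promtool from the official image (a sketch; paths match the directories created above):

docker run --rm -v /docker/prometheus:/etc/prometheus \
  --entrypoint promtool prom/prometheus:latest \
  check config /etc/prometheus/prometheus.yml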
 

Create the alert rule file

  

vi /docker/prometheus/rules.yml  # mind the YAML indentation
 
 
groups:
  - name: node-alert
    rules:
    - alert: NodeDown
      expr: up{job="node"} == 0
      for: 1m
      labels:
        severity: critical
        instance: "{{ $labels.instance }}"
      annotations:
        summary: "instance: {{ $labels.instance }} down"
        description: "Instance: {{ $labels.instance }} 已经宕机 1分钟"
        value: "{{ $value }}"
        
    - alert: NodeCpuHigh
      expr: (1 - avg by (instance) (irate(node_cpu_seconds_total{job="node",mode="idle"}[5m]))) * 100 > 85
      for: 5m
      labels:
        severity: warning
        instance: "{{ $labels.instance }}"
      annotations:
        summary: "instance: {{ $labels.instance }} cpu使用率过高"
        description: "CPU 使用率超过 80%"
        value: "{{ $value }}"
 
    - alert: NodeCpuIowaitHigh
      expr: avg by (instance) (irate(node_cpu_seconds_total{job="node",mode="iowait"}[5m])) * 100 > 80
      for: 5m
      labels:
        severity: warning
        instance: "{{ $labels.instance }}"
      annotations:
        summary: "instance: {{ $labels.instance }} cpu iowait 使用率过高"
        description: "CPU iowait 使用率超过 50%"
        value: "{{ $value }}"
 
    - alert: NodeLoad5High
      expr: node_load5 > (count by (instance) (node_cpu_seconds_total{job="node",mode='system'})) * 1.2
      for: 5m
      labels:
        severity: warning
        instance: "{{ $labels.instance }}"
      annotations:
        summary: "instance: {{ $labels.instance }} load(5m) 过高"
        description: "Load(5m) 过高,超出cpu核数 1.2倍"
        value: "{{ $value }}"
 
    - alert: NodeMemoryHigh
      expr: (1 - node_memory_MemAvailable_bytes{job="node"} / node_memory_MemTotal_bytes{job="node"}) * 100 > 90
      for: 5m
      labels:
        severity: warning
        instance: "{{ $labels.instance }}"
      annotations:
        summary: "instance: {{ $labels.instance }} memory 使用率过高"
        description: "Memory 使用率超过 90%"
        value: "{{ $value }}"
 
    - alert: NodeDiskRootHigh
      expr: (1 - node_filesystem_avail_bytes{job="node",fstype=~"ext.*|xfs",mountpoint ="/"} / node_filesystem_size_bytes{job="node",fstype=~"ext.*|xfs",mountpoint ="/"}) * 100 > 90
      for: 10m
      labels:
        severity: warning
        instance: "{{ $labels.instance }}"
      annotations:
        summary: "instance: {{ $labels.instance }} disk(/ 分区) 使用率过高"
        description: "Disk(/ 分区) 使用率超过 90%"
        value: "{{ $value }}"
 
    - alert: NodeDiskBootHigh
      expr: (1 - node_filesystem_avail_bytes{job="node",fstype=~"ext.*|xfs",mountpoint ="/boot"} / node_filesystem_size_bytes{job="node",fstype=~"ext.*|xfs",mountpoint ="/boot"}) * 100 > 80
      for: 10m
      labels:
        severity: warning
        instance: "{{ $labels.instance }}"
      annotations:
        summary: "instance: {{ $labels.instance }} disk(/boot 分区) 使用率过高"
        description: "Disk(/boot 分区) 使用率超过 80%"
        value: "{{ $value }}"
 
    - alert: NodeDiskReadHigh
      expr: irate(node_disk_read_bytes_total{job="node"}[5m]) > 20 * (1024 ^ 2)
      for: 5m
      labels:
        severity: warning
        instance: "{{ $labels.instance }}"
      annotations:
        summary: "instance: {{ $labels.instance }} disk 读取字节数 速率过高"
        description: "Disk 读取字节数 速率超过 20 MB/s"
        value: "{{ $value }}"
 
    - alert: NodeDiskWriteHigh
      expr: irate(node_disk_written_bytes_total{job="node"}[5m]) > 20 * (1024 ^ 2)
      for: 5m
      labels:
        severity: warning
        instance: "{{ $labels.instance }}"
      annotations:
        summary: "instance: {{ $labels.instance }} disk 写入字节数 速率过高"
        description: "Disk 写入字节数 速率超过 20 MB/s"
        value: "{{ $value }}"
        
    - alert: NodeDiskReadRateCountHigh
      expr: irate(node_disk_reads_completed_total{job="node"}[5m]) > 3000
      for: 5m
      labels:
        severity: warning
        instance: "{{ $labels.instance }}"
      annotations:
        summary: "instance: {{ $labels.instance }} disk iops 每秒读取速率过高"
        description: "Disk iops 每秒读取速率超过 3000 iops"
        value: "{{ $value }}"
 
    - alert: NodeDiskWriteRateCountHigh
      expr: irate(node_disk_writes_completed_total{job="node"}[5m]) > 3000
      for: 5m
      labels:
        severity: warning
        instance: "{{ $labels.instance }}"
      annotations:
        summary: "instance: {{ $labels.instance }} disk iops 每秒写入速率过高"
        description: "Disk iops 每秒写入速率超过 3000 iops"
        value: "{{ $value }}"
 
    - alert: NodeInodeRootUsedPercentHigh
      expr: (1 - node_filesystem_files_free{job="node",fstype=~"ext4|xfs",mountpoint="/"} / node_filesystem_files{job="node",fstype=~"ext4|xfs",mountpoint="/"}) * 100 > 80
      for: 10m
      labels:
        severity: warning
        instance: "{{ $labels.instance }}"
      annotations:
        summary: "instance: {{ $labels.instance }} disk(/ 分区) inode 使用率过高"
        description: "Disk (/ 分区) inode 使用率超过 80%"
        value: "{{ $value }}"
 
    - alert: NodeInodeBootUsedPercentHigh
      expr: (1 - node_filesystem_files_free{job="node",fstype=~"ext4|xfs",mountpoint="/boot"} / node_filesystem_files{job="node",fstype=~"ext4|xfs",mountpoint="/boot"}) * 100 > 80
      for: 10m
      labels:
        severity: warning
        instance: "{{ $labels.instance }}"
      annotations:
        summary: "instance: {{ $labels.instance }} disk(/boot 分区) inode 使用率过高"
        description: "Disk (/boot 分区) inode 使用率超过 80%"
        value: "{{ $value }}"
        
    - alert: NodeFilefdAllocatedPercentHigh
      expr: node_filefd_allocated{job="node"} / node_filefd_maximum{job="node"} * 100 > 80
      for: 10m
      labels:
        severity: warning
        instance: "{{ $labels.instance }}"
      annotations:
        summary: "instance: {{ $labels.instance }} filefd 打开百分比过高"
        description: "Filefd 打开百分比 超过 80%"
        value: "{{ $value }}"
 
    - alert: NodeNetworkNetinBitRateHigh
      expr: avg by (instance) (irate(node_network_receive_bytes_total{device=~"eth0|eth1|ens33|ens37"}[1m]) * 8) > 20 * (1024 ^ 2) * 8
      for: 3m
      labels:
        severity: warning
        instance: "{{ $labels.instance }}"
      annotations:
        summary: "instance: {{ $labels.instance }} network 接收比特数 速率过高"
        description: "Network 接收比特数 速率超过 20MB/s"
        value: "{{ $value }}"
 
    - alert: NodeNetworkNetoutBitRateHigh
      expr: avg by (instance) (irate(node_network_transmit_bytes_total{device=~"eth0|eth1|ens33|ens37"}[1m]) * 8) > 20 * (1024 ^ 2) * 8
      for: 3m
      labels:
        severity: warning
        instance: "{{ $labels.instance }}"
      annotations:
        summary: "instance: {{ $labels.instance }} network 发送比特数 速率过高"
        description: "Network 发送比特数 速率超过 20MB/s"
        value: "{{ $value }}"
        
    - alert: NodeNetworkNetinPacketErrorRateHigh
      expr: avg by (instance) (irate(node_network_receive_errs_total{device=~"eth0|eth1|ens33|ens37"}[1m])) > 15
      for: 3m
      labels:
        severity: warning
        instance: "{{ $labels.instance }}"
      annotations:
        summary: "instance: {{ $labels.instance }} 接收错误包 速率过高"
        description: "Network 接收错误包 速率超过 15个/秒"
        value: "{{ $value }}"
 
    - alert: NodeNetworkNetoutPacketErrorRateHigh
      expr: avg by (instance) (irate(node_network_transmit_packets_total{device=~"eth0|eth1|ens33|ens37"}[1m])) > 15
      for: 3m
      labels:
        severity: warning
        instance: "{{ $labels.instance }}"
      annotations:
        summary: "instance: {{ $labels.instance }} 发送错误包 速率过高"
        description: "Network 发送错误包 速率超过 15个/秒"
        value: "{{ $value }}"
 
    - alert: NodeProcessBlockedHigh
      expr: node_procs_blocked{job="node"} > 10
      for: 10m
      labels:
        severity: warning
        instance: "{{ $labels.instance }}"
      annotations:
        summary: "instance: {{ $labels.instance }} 当前被阻塞的任务的数量过多"
        description: "Process 当前被阻塞的任务的数量超过 10个"
        value: "{{ $value }}"
 
    - alert: NodeTimeOffsetHigh
      expr: abs(node_timex_offset_seconds{job="node"}) > 3 * 60
      for: 2m
      labels:
        severity: info
        instance: "{{ $labels.instance }}"
      annotations:
        summary: "instance: {{ $labels.instance }} 时间偏差过大"
        description: "Time 节点的时间偏差超过 3m"
        value: "{{ $value }}"
 
 
    - alert: Web访问异常
      expr: probe_http_status_code{not_200 != "yes" } != 200
      for: 30s
      annotations:
        summary: Web 访问异常{{ $labels.instance }}
      labels:
        Severity: '严重'
    - alert: Web访问响应响应时间>3s
      expr: probe_duration_seconds >= 3
      for: 30s
      annotations:
        summary: Web 响应异常{{ $labels.instance }}
      labels:
        Severity: '警告'
    - alert: 证书过期时间<30天
      expr: probe_ssl_earliest_cert_expiry-time()< 3600*24*30
      annotations:
        summary: Web 证书将在30天后过期 {{ $labels.instance }}
      labels:
        Severity: '提醒'
    - alert: 证书过期时间<7天
      expr: probe_ssl_earliest_cert_expiry-time()< 3600*24*7
      annotations:
        summary: Web 证书将在7天后过期 {{ $labels.instance }}
      labels:
        Severity: '严重'
    - alert: 证书过期时间<1天
      expr: probe_ssl_earliest_cert_expiry-time()< 3600*24*1
      annotations:
        summary: Web 证书将在1天后过期 {{ $labels.instance }}
      labels:
        Severity: '灾难'
 
 
#groups:
#  - name: mysql.rules
#    rules:
    - alert: MysqlDown
      expr: up{job="mysql-exporter"} == 0
      for: 0m
      labels:
        severity: critical
      annotations:
        title: 'MySQL down'
        description: "Mysql实例: 【{{ $labels.instance }}】, MySQL instance is down"
 
    - alert: MysqlRestarted
      expr: mysql_global_status_uptime < 60
      for: 0m
      labels:
        severity: info
      annotations:
        title: 'MySQL Restarted'
        description: "Mysql实例: 【{{ $labels.instance }}】, MySQL has just been restarted, less than one minute ago"
 
    - alert: MysqlTooManyConnections(>80%)
      expr: avg by (instance) (rate(mysql_global_status_threads_connected[1m])) / avg by (instance) (mysql_global_variables_max_connections) * 100 > 80
      for: 2m
      labels:
        severity: warning
      annotations:
        title: 'MySQL too many connections (> 80%)'
        description: "Mysql实例: 【{{ $labels.instance }}】, More than 80% of MySQL connections are in use, Current Value: {{ $value }}%"
 
    - alert: MysqlThreadsRunningHigh
      expr: mysql_global_status_threads_running > 40
      for: 2m
      labels:
        severity: warning
      annotations:
        title: 'MySQL Threads_Running High'
        description: "Mysql实例: 【{{ $labels.instance }}】, Threads_Running above the threshold(40), Current Value: {{ $value }}"
 
    - alert: MysqlQpsHigh
      expr: sum by (instance) (rate(mysql_global_status_queries[2m])) > 500
      for: 2m
      labels:
        severity: warning
      annotations:
        title: 'MySQL QPS High'
        description: "Mysql实例: 【{{ $labels.instance }}】, MySQL QPS above 500"
 
    - alert: MysqlSlowQueries
      expr: increase(mysql_global_status_slow_queries[1m]) > 0
      for: 2m
      labels:
        severity: warning
      annotations:
        title: 'MySQL slow queries'
        description: "Mysql实例: 【{{ $labels.instance }}】, has some new slow query."
 
    - alert: MysqlTooManyAbortedConnections
      expr: round(increase(mysql_global_status_aborted_connects[5m])) > 20
      for: 2m
      labels:
        severity: warning
      annotations:
        title: 'MySQL too many Aborted connections in 2 minutes'
        description: "Mysql实例: 【{{ $labels.instance }}】, {{ $value }} Aborted connections within 2 minutes"
 
    - alert: MysqlTooManyAbortedClients
      expr: round(increase(mysql_global_status_aborted_clients[120m])) > 5000
      for: 2m
      labels:
        severity: warning
      annotations:
        title: 'MySQL too many Aborted connections in 2 hours'
        description: "Mysql实例: 【{{ $labels.instance }}】, {{ $value }} Aborted Clients within 2 hours"
 
    - alert: MysqlSlaveIoThreadNotRunning
      expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_io_running == 0
      for: 0m
      labels:
        severity: critical
      annotations:
        title: 'MySQL Slave IO thread not running'
        description: "Mysql实例: 【{{ $labels.instance }}】, MySQL Slave IO thread not running"
 
    - alert: MysqlSlaveSqlThreadNotRunning
      expr: mysql_slave_status_master_server_id > 0 and ON (instance) mysql_slave_status_slave_sql_running == 0
      for: 0m
      labels:
        severity: critical
      annotations:
        title: 'MySQL Slave SQL thread not running'
        description: "Mysql实例: 【{{ $labels.instance }}】, MySQL Slave SQL thread not running"
 
    - alert: MysqlSlaveReplicationLag
      expr: mysql_slave_status_master_server_id > 0 and ON (instance) (mysql_slave_status_seconds_behind_master - mysql_slave_status_sql_delay) > 30
      for: 1m
      labels:
        severity: critical
      annotations:
        title: 'MySQL Slave replication lag'
        description: "Mysql实例: 【{{ $labels.instance }}】, MySQL replication lag"
 
    - alert: MysqlInnodbLogWaits
      expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
      for: 0m
      labels:
        severity: warning
      annotations:
        title: 'MySQL InnoDB log waits'
        description: "Mysql实例: 【{{ $labels.instance }}】, innodb log writes stalling"
 
#groups:
#  - name:  Docker containers monitoring
#    rules: 
    - alert: ContainerKilled
      expr: time() - container_last_seen > 60
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Container killed (instance {{ $labels.instance }})"
        description: "A container has disappeared\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
    - alert: ContainerCpuUsage
      expr: (sum(rate(container_cpu_usage_seconds_total[3m])) BY (instance, name) * 100) > 80
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Container CPU usage (instance {{ $labels.instance }})"
        description: "Container CPU usage is above 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
    - alert: ContainerMemoryUsage
      expr: (sum(container_memory_usage_bytes) BY (instance, name) / sum(container_spec_memory_limit_bytes) BY (instance, name) * 100) > 80
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Container Memory usage (instance {{ $labels.instance }})"
        description: "Container Memory usage is above 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
    - alert: ContainerVolumeUsage
      expr: (1 - (sum(container_fs_inodes_free) BY (instance) / sum(container_fs_inodes_total) BY (instance)) * 100) > 80
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Container Volume usage (instance {{ $labels.instance }})"
        description: "Container Volume usage is above 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
    - alert: ContainerVolumeIoUsage
      expr: (sum(container_fs_io_current) BY (instance, name) * 100) > 80
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Container Volume IO usage (instance {{ $labels.instance }})"
        description: "Container Volume IO usage is above 80%\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
    - alert: ContainerHighThrottleRate
      expr: rate(container_cpu_cfs_throttled_seconds_total[3m]) > 1
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Container high throttle rate (instance {{ $labels.instance }})"
        description: "Container is being throttled\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
    - alert: PgbouncerActiveConnectinos
      expr: pgbouncer_pools_server_active_connections > 200
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "PGBouncer active connectinos (instance {{ $labels.instance }})"
        description: "PGBouncer pools are filling up\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
    - alert: PgbouncerErrors
      expr: increase(pgbouncer_errors_count{errmsg!="server conn crashed?"}[5m]) > 10
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "PGBouncer errors (instance {{ $labels.instance }})"
        description: "PGBouncer is logging errors. This may be due to a a server restart or an admin typing commands at the pgbouncer console.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
    - alert: PgbouncerMaxConnections
      expr: rate(pgbouncer_errors_count{errmsg="no more connections allowed (max_client_conn)"}[1m]) > 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "PGBouncer max connections (instance {{ $labels.instance }})"
        description: "The number of PGBouncer client connections has reached max_client_conn.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
    - alert: SidekiqQueueSize
      expr: sidekiq_queue_size{} > 100
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Sidekiq queue size (instance {{ $labels.instance }})"
        description: "Sidekiq queue {{ $labels.name }} is growing\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
    - alert: SidekiqSchedulingLatencyTooHigh
      expr: max(sidekiq_queue_latency) > 120
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "Sidekiq scheduling latency too high (instance {{ $labels.instance }})"
        description: "Sidekiq jobs are taking more than 2 minutes to be picked up. Users may be seeing delays in background processing.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
    - alert: ConsulServiceHealthcheckFailed
      expr: consul_catalog_service_node_healthy == 0
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "Consul service healthcheck failed (instance {{ $labels.instance }})"
        description: "Service: `{{ $labels.service_name }}` Healthcheck: `{{ $labels.service_id }}`\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
    - alert: ConsulMissingMasterNode
      expr: consul_raft_peers < 3
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "Consul missing master node (instance {{ $labels.instance }})"
        description: "Numbers of consul raft peers should be 3, in order to preserve quorum.\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
    - alert: ConsulAgentUnhealthy
      expr: consul_health_node_status{status="critical"} == 1
      for: 5m
      labels:
        severity: critical
      annotations:
        summary: "Consul agent unhealthy (instance {{ $labels.instance }})"
        description: "A Consul agent is down\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
 
 
#groups:
#- name:  Redis
#  rules: 
    - alert: RedisDown
      expr: redis_up  == 0
      for: 5m
      labels:
        severity: error
      annotations:
        summary: "Redis down (instance {{ $labels.instance }})"
        description: "Redis 挂了啊,mmp\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
    - alert: MissingBackup
      expr: time() - redis_rdb_last_save_timestamp_seconds > 60 * 60 * 24
      for: 5m
      labels:
        severity: error
      annotations:
        summary: "Missing backup (instance {{ $labels.instance }})"
        description: "Redis has not been backuped for 24 hours\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"       
    - alert: OutOfMemory
      expr: redis_memory_used_bytes / redis_total_system_memory_bytes * 100 > 90
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Out of memory (instance {{ $labels.instance }})"
        description: "Redis is running out of memory (> 90%)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
    - alert: ReplicationBroken
      expr: delta(redis_connected_slaves[1m]) < 0
      for: 5m
      labels:
        severity: error
      annotations:
        summary: "Replication broken (instance {{ $labels.instance }})"
        description: "Redis instance lost a slave\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
    - alert: TooManyConnections
      expr: redis_connected_clients > 10
      for: 1m
      labels:
        severity: warning
      annotations:
        summary: "Too many connections (instance {{ $labels.instance }})"
        description: "Redis instance has too many connections\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"       
    - alert: NotEnoughConnections
      expr: redis_connected_clients < 5
      for: 5m
      labels:
        severity: warning
      annotations:
        summary: "Not enough connections (instance {{ $labels.instance }})"
        description: "Redis instance should have more connections (> 5)\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
    - alert: RejectedConnections
      expr: increase(redis_rejected_connections_total[1m]) > 0
      for: 5m
      labels:
        severity: error
      annotations:
        summary: "Rejected connections (instance {{ $labels.instance }})"
        description: "Some connections to Redis has been rejected\n  VALUE = {{ $value }}\n  LABELS: {{ $labels }}"
 
 
# rules_rocketmq.yml 
#groups:
#- name: rocketmq
#  rules:
    - alert: RocketMQ Exporter is Down 
      expr: up{job="rocketmq-exporter"} == 0
      for: 1m
      labels: 
        severity: '灾难'
      annotations:
        summary: RocketMQ {{ $labels.instance }} is down
    - alert: RocketMQ 存在消息积压
      expr: (sum(irate(rocketmq_producer_offset[1m])) by (topic)  - on(topic) group_right sum(irate(rocketmq_consumer_offset[1m])) by (group,topic)) > 5
      for: 5m
      labels: 
        severity: '警告'
      annotations:
        summary: RocketMQ (group={{ $labels.group }} topic={{ $labels.topic }})积压数 = {{ .Value }}
    - alert: GroupGetLatencyByStoretime 消费组的消费延时时间过高
      expr: rocketmq_group_get_latency_by_storetime/1000  > 5 and rate(rocketmq_group_get_latency_by_storetime[5m]) >0
      for: 3m
      labels:
        severity: 警告
      annotations:
        description: 'consumer {{$labels.group}} on {{$labels.broker}}, {{$labels.topic}} consume time lag behind message store time and (behind value is {{$value}}).'
        summary: 消费组的消费延时时间过高
    - alert: RocketMQClusterProduceHigh 集群TPS > 20
      expr: sum(rocketmq_producer_tps) by (cluster) >= 20
      for: 3m
      labels:
        severity: 警告
      annotations:
        description: '{{$labels.cluster}} Sending tps too high. now TPS = {{ .Value }}'
        summary: cluster send tps too high
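
The rule file is long and YAML indentation mistakes are easy to make, so it is worth checking it with promtool before starting the stack (a sketch using the official image):

docker run --rm -v /docker/prometheus:/etc/prometheus \
  --entrypoint promtool prom/prometheus:latest \
  check rules /etc/prometheus/rules.yml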
 


Create the Alertmanager configuration file

vi /docker/alertmanager/alertmanager.yml
global:
  resolve_timeout: 5m # how long to wait before marking an alert as resolved; defaults to 5m
  # SMTP settings for e-mail notifications
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_from: '755530832@qq.com'
  smtp_auth_username: '755530832@qq.com'
  smtp_auth_password: '######' ## QQ mail SMTP authorization code
  smtp_require_tls: false

# Notification templates
templates:
  - '/etc/alertmanager/template/*.tmpl'   # path inside the container (/docker/alertmanager on the host is mounted at /etc/alertmanager)

# Routing
route:
  group_by: ['alertname'] # label(s) alerts are grouped by
  group_wait: 10s # how long to wait before sending the first notification for a new group
  group_interval: 10s # how long to wait before notifying about new alerts added to a group
  repeat_interval: 1m # how long to wait before re-sending a notification that is still firing
  receiver: 'mail' # default receiver

# Receivers
receivers:
- name: 'mail' # receiver name
  email_configs:
  - to: '{{ template "email.to" . }}'  # e-mail address(es) that receive alerts
    html: '{{ template "email.to.html" . }}' # body template
    send_resolved: true

# Inhibition rules
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
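
The Alertmanager configuration (including the templates it references) can be validated with amtool from the official image before the stack is started; a minimal sketch:

docker run --rm -v /docker/alertmanager:/etc/alertmanager \
  --entrypoint amtool prom/alertmanager:latest \
  check-config /etc/alertmanager/alertmanager.yml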

Create the alert e-mail template

vi /docker/alertmanager/template/em.tmpl
{{ define "email.from" }}755530832@qq.com{{ end }}
{{ define "email.to" }}411203036@qq.com,1683294271@qq.com{{ end }}
{{ define "email.to.html" }}
{{ range .Alerts }}
=========start==========<br>
告警程序: prometheus_alert <br>
告警级别: {{ .Labels.severity }} 级 <br>
告警类型: {{ .Labels.alertname }} <br>
故障主机: {{ .Labels.instance }} <br>
告警主题: {{ .Annotations.summary }} <br>
告警详情: {{ .Annotations.description }} <br>
触发时间: {{ (.StartsAt.Add 28800e9) }} <br>
=========end==========<br>
{{ end }}
{{ end }}
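
Once the containers are running, a throw-away alert can be pushed into Alertmanager to exercise the mail route and this template end to end. A sketch, assuming the default compose v1 container name docker_alertmanager_1 used elsewhere in this article:

docker exec docker_alertmanager_1 amtool alert add TestAlert \
  severity=warning instance=test-host \
  --annotation="summary=test summary" \
  --annotation="description=test description" \
  --alertmanager.url=http://localhost:9093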

Create the blackbox_exporter configuration file

vi /docker/blackbox_exporter/config.yml


modules:
  http_2xx:
    prober: http
  http_post_2xx:
    prober: http
    http:
      method: POST
  tcp_connect:
    prober: tcp
  pop3s_banner:
    prober: tcp
    tcp:
      query_response:
      - expect: "^+OK"
      tls: true
      tls_config:
        insecure_skip_verify: false
  grpc:
    prober: grpc
    grpc:
      tls: true
      preferred_ip_protocol: "ip4"
  grpc_plain:
    prober: grpc
    grpc:
      tls: false
      service: "service1"
  ssh_banner:
    prober: tcp
    tcp:
      query_response:
      - expect: "^SSH-2.0-"
      - send: "SSH-2.0-blackbox-ssh-check"
  irc_banner:
    prober: tcp
    tcp:
      query_response:
      - send: "NICK prober"
      - send: "USER prober prober prober :prober"
      - expect: "PING :([^ ]+)"
        send: "PONG ${1}"
      - expect: "^:[^ ]+ 001"
  icmp:
    prober: icmp
  icmp_ttl5:
    prober: icmp
    timeout: 5s
    icmp:
      ttl: 5
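
With the blackbox_exporter container up, a module can be exercised by hand against its /probe endpoint to confirm that the configuration loads and a target is reachable; a healthy site should return probe_success 1 and probe_http_status_code 200 in the output:

curl 'http://127.0.0.1:9115/probe?module=http_2xx&target=https://www.baidu.com/'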

Create the blackbox web targets file

vi /docker/prometheus/job-web.yml
 
 
---
- targets:
  - https://www.baidu.com/
  labels:
    env: pro
    app: web
    project: 百度
    desc: 百度生产
- targets:
  - https://blog.csdn.net/
  labels:
    env: test
    app: web
    project: CSDN
    desc: 测试一下啦
    not_200: "yes" # custom label marking addresses that do not normally return a 200 status code

Create the docker-compose file

vi /docker/docker-compose.yml

version: '3.7'
 
services:
  node-exporter:
    image: prom/node-exporter:latest
    restart: always
    ports:
      - "9100:9100"
    networks:
      - prom
 
#  dingtalk:
#    image: timonwong/prometheus-webhook-dingtalk:latest
#    restart: always
#    volumes:
#      - type: bind
#        source: ./alertmanager/config.yml
#        target: /etc/prometheus-webhook-dingtalk/config.yml
#        read_only: true
#    ports:
#      - "8060:8060"
#    networks:
#      - prom
 
  alertmanager:
    #depends_on:
    #  - dingtalk
    image: prom/alertmanager:latest
    restart: always
    volumes:
      - type: bind
        source: ./alertmanager/alertmanager.yml
        target: /etc/alertmanager/alertmanager.yml
        read_only: true
      - type: volume
        source: alertmanager
        target: /etc/alertmanager
    ports:
      - "9093:9093"
      - "9094:9094"
    networks:
      - prom
 
  prometheus:
    depends_on:
      - alertmanager
    image: prom/prometheus:latest
    restart: always
    command:
      - --config.file=/etc/prometheus/prometheus.yml
      - --web.enable-lifecycle
    volumes:
      - type: bind
        source: ./prometheus/prometheus.yml
        target: /etc/prometheus/prometheus.yml
        read_only: true
      - type: bind
        source: ./prometheus/rules.yml
        target: /etc/prometheus/rules.yml
        read_only: true
      - type: bind
        source: ./prometheus/job-web.yml
        target: /etc/prometheus/job-web.yml
        read_only: true
      - type: volume
        source: prometheus
        target: /prometheus
        
    ports:
      - "9090:9090"
    networks:
      - prom
 
  grafana:
    depends_on:
      - prometheus
    image: grafana/grafana:latest
    restart: always
    volumes:
      - type: volume
        source: grafana
        target: /var/lib/grafana
    ports:
      - "3000:3000"
    networks:
      - prom
 
  cadvisor:
    image: google/cadvisor:latest
    #container_name: cadvisor
    hostname: cadvisor
    restart: always
    volumes:
      - /:/rootfs:ro
      - /var/run:/var/run:rw
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    ports:
      - "8080:8080"
    networks:
      - prom
    privileged: true
 
  mysqld-exporter:
    image: prom/mysqld-exporter
    hostname: mysqld-exporter
    restart: always
    ports:
      - "9104:9104"
    environment:
      - DATA_SOURCE_NAME=root:12345@(10.211.122.9:3306)/  # username:password@(ip:port)
    networks:
      - prom
 
  redis-exporter:
    image: oliver006/redis_exporter
    #container_name: mysqld-exporter
    hostname: redis-exporter
    restart: always
    ports:
      - "9121:9121"
    command:
      - --redis.addr=redis://10.211.122.9:6379
      - --redis.password=123456
    networks:
      - prom
 
  blackbox_exporter:
    #container_name: blackbox_exporter
    image: prom/blackbox-exporter:master
    restart: always
    volumes:
      - /docker/blackbox_exporter/config.yml:/etc/blackbox_exporter/config.yml
    ports:
      - 9115:9115
    networks:
      - prom
 
  rocketmq-exporter:
    image: sawyerlan/rocketmq-exporter
    #container_name: mysqld-exporter
    hostname: rocketmq-exporter
    restart: always
    ports:
      - "5557:5557"
    command:
      #- --rocketmq.config.namesrvAddr=172.30.0.150:9876;172.30.0.151:9876
      - --rocketmq.config.namesrvAddr=172.30.0.150:9876
    networks:
      - prom
 
volumes:
  prometheus:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /docker/prometheus/data
  grafana:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /docker/grafana
    
  alertmanager:
    driver: local
    driver_opts:
      type: none
      o: bind
      device: /docker/alertmanager
networks:
  prom:
    driver: bridge
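
Before bringing anything up, the compose file itself can be syntax-checked; docker-compose config -q only validates the file and prints nothing on success:

cd /docker
docker-compose config -q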

Hot-reload Prometheus

Because Prometheus is started with --web.enable-lifecycle in the compose file, configuration and rule changes can be applied without restarting the container:

curl -X POST http://192.1.1.12:9090/-/reload

Run docker-compose
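
From the /docker directory where docker-compose.yml lives:

cd /docker
docker-compose up -d
docker-compose ps    # every service should show a State of "Up"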

One-click deployment script (docker_prometheus)

prometheusinstall.sh    https://www.aliyundrive.com/s/X5qiSZMeMDk

Grafana dashboard IDs for the exporters above:

  blackbox_exporter (black-box probing): 14928, 14603

  rocketmq_exporter: 14612, 14883

  node_exporter: 8919

  Docker containers (cAdvisor): 13631

  MySQL: 14934 (the "Buffer Pool Size of Total RAM" panel may show "no data")
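
After logging in to Grafana (http://<host>:3000, default credentials admin/admin) and adding Prometheus as a data source, the dashboards above can be imported by ID. The data source can also be created over Grafana's HTTP API; a sketch, assuming the default admin password and the host IP used earlier in this article:

curl -s -u admin:admin -H 'Content-Type: application/json' \
  -X POST http://192.1.1.12:3000/api/datasources \
  -d '{"name":"Prometheus","type":"prometheus","url":"http://192.1.1.12:9090","access":"proxy","isDefault":true}'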
