安装请看https://blog.51cto.com/liuqs/2027365 ,最好是对应的版本组件,否则可能会有差别。

(一)prometheus + grafana + alertmanager 配置主机监控

(二)prometheus + grafana + alertmanager 配置Mysql监控

(三)prometheus + grafana + alertmanager 配置Redis监控

(四)prometheus + grafana + alertmanager 配置Kafka监控

(五)prometheus + grafana + alertmanager 配置ES监控


(二)prometheus + grafana + alertmanager 配置Mysql监控

  1. mysqld_exporter安装与配置

    A. mysqld服务安装在每台Linux服务器上

    1. 下载mysqld_exporter到每台mysqld服务器上(下载地址: https://pan.baidu.com/s/1pW7RptzXa3LqFlO5zxJXPw ),并解压到/data/monitor/下

    2. 安装go环境, yum install go -y

    3. 用root用户连接当前mysql,授权监控用户

      mysql> GRANT REPLICATION CLIENT,PROCESS ON *.* TO 'mysql_monitor'@'localhost' identified by 'Jvsa09OodhvS0VKQ';

      mysql> FLUSH PRIVILEGES;

    4. cd /data/monitor/mysqld_exporter下,创建.my.cnf文件,vim .my.cnf

      [client]

      host=10.8.4.126

      port=3306

      user=mysql_monitor

      password=Jvsa09OodhvS0VKQ

    5. 启动mysqld_exporter(注意-config.my-cnf要指向第4步创建的.my.cnf文件):  /data/monitor/mysqld_exporter/bin/mysqld_exporter -config.my-cnf="/data/monitor/mysqld_exporter/.my.cnf" &


B. 使用的是云商的mysql db(我们使用的是ucloud的udb,下面的都按这个来实现,都差不多)

  1. 登录到prometheus服务器(prometheus、grafana、alertmanager在同一台服务器上),下载mysqld_exporter(下载地址: https://pan.baidu.com/s/1MNPbhoZEvVV4lf1bVXWJ1g ),并解压到/data/monitor/下

  2. 如果没有安装go环境, yum install go -y

  3. 用root用户连接当前mysql,授权监控用户

    mysql> GRANT REPLICATION CLIENT,PROCESS ON *.* TO 'mysql_monitor'@'%' identified by 'Jvsa09OodhvS0VKQ';

    mysql> FLUSH PRIVILEGES;

  4. cd /data/monitor/mysqld_exporter下,创建.my.cnf文件夹,然后在该文件夹下为每个db创建一个连接配置文件。以下是其中一个实例,其它的请参照这个来创建。

    cat /data/monitor/mysqld_exporter/.my.cnf/.ba_master_10.8.4.126_3306_15049.cnf

    [client]

    host=10.8.4.126

    port=3306

    user=mysql_monitor

    password=Jvsa09OodhvS0VKQ

  5. 然后cd /data/monitor/mysqld_exporter/scripts下,创建各个mysqld_exporter的启动脚本。下面是一个mysql db的mysqld_exporter启动脚本,其它请参照这个来创建。注意:各脚本监听的端口不能相同,且调用的.cnf配置文件要与该db对应。

    cat /data/monitor/mysqld_exporter/scripts/ba_master_10.8.4.126_3306_15049.sh

    nohup /data/monitor/mysqld_exporter/bin/mysqld_exporter -web.listen-address=':15049' -config.my-cnf=/data/monitor/mysqld_exporter/.my.cnf/.ba_master_10.8.4.126_3306_15049.cnf -collect.info_schema.tables=false >> /data/monitor/mysqld_exporter/log/15049_10.8.4.126_3306.log 2>&1 &

  6. 由于/data/monitor/mysqld_exporter/scripts/下有很多个mysql db 的mysqld_exporter启动脚本,所以我们cd /data/monitor/mysqld_exporter下,然后 sh start.sh进行启动,然后检查各个端口是否已监听。


2. 配置prometheus

    A. 将mysqld_exporter的配置增加到prometheus.yml文件中,vim /data/monitor/prometheus/conf/prometheus.yml

        

global:
  # Interval between two scrapes of the same target.
  scrape_interval: 1m
  # Maximum time a single scrape may take.
  scrape_timeout: 20s
  # Interval between two evaluations of the alerting rules.
  evaluation_interval: 1m
  # Global labels (currently disabled).
  # external_labels:
  #   monitor: "hk"


# Alertmanager instance that receives the alerts raised by this server.
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'localhost:9093'


# Alerting rule files; every *.yml file in the rule directory is loaded.
rule_files:
  - '/data/monitor/prometheus/conf/rule/*.yml'


# Scrape jobs: the Prometheus server itself, the monitored hosts, and the
# mysqld_exporter instances.
scrape_configs:
  # Prometheus server itself.
  - job_name: 'prometheus'
    scrape_interval: 15s
    static_configs:
      - targets: ['10.8.53.218:9090']

  # Monitored hosts. Targets are read from a file_sd JSON file, so no
  # static_configs entry is needed (the original had an empty, null-valued
  # `static_configs:` key here).
  - job_name: 'node_resources'
    scrape_interval: 1m
    file_sd_configs:
      - files:
          - /data/monitor/prometheus/conf/node_conf/node_host_info.json
    honor_labels: true

  # MySQL collectors. Each target listed in node_mysql_info.json is one
  # mysqld_exporter listen address; targets come only from that file.
  - job_name: 'mysql_global_status'
    scrape_interval: 60s
    file_sd_configs:
      - files:
          - /data/monitor/prometheus/conf/node_conf/node_mysql_info.json


    B. 编写node_mysql_info.json,cat /data/monitor/prometheus/conf/node_conf/node_mysql_info.json

[

    {   

        "labels": { 

            "desc": "slave_customer_10.8.31.101:3306",

            "group": "ba",

            "mysql_addr": "10.8.31.101:3306",

            "role": "slave_customer"

        },

        "targets": [

            "localhost:15050"

        ]

    },

    {   

        "labels": { 

            "desc": "slave_bi_10.8.150.188:3306",

            "group": "ba",

            "mysql_addr": "10.8.150.188:3306",

            "role": "slave_bi"

        },

        "targets": [

            "localhost:15221"

        ]

    },

    {

        "labels": {

            "desc": "slave_10.8.139.209:3306",

            "group": "ba",

            "mysql_addr": "10.8.139.209:3306",

            "role": "slave"

        },

        "targets": [

            "localhost:15052"

        ]

    },

    {

        "labels": {

            "desc": "slave_catalog_10.8.11.246:3306",

            "group": "ba",

            "mysql_addr": "10.8.11.246:3306",

            "role": "slave_catalog"

        },

        "targets": [

            "localhost:15053"

        ]

    },

    {

        "labels": {

            "desc": "master_10.8.4.126:3306",

            "group": "ba",

            "mysql_addr": "10.8.4.126:3306",

            "role": "master"

        },

        "targets": [

            "localhost:15049"

        ]

    },

    {

        "labels": {

            "desc": "slave_dc_10.8.17.124:3306",

            "group": "ba",

            "mysql_addr": "10.8.17.124:3306",

            "role": "slave_dc"

        },

        "targets": [

            "localhost:15051"

        ]

    },

    {

        "labels": {

            "desc": "master_10.8.115.3:3306",

            "group": "openapi",

            "mysql_addr": "10.8.115.3:3306",

            "role": "master"

        },

        "targets": [

            "localhost:15060"

        ]

    }

]


    C. 重启prometheus,cd /data/monitor/prometheus下,然后 sh reload.sh


注意:由于有很多指标无法抓取,我们用脚本再次获取,我只有ucloud的api对接抓取的python脚本,如果有需要可以加我qq: 761117826


3. 配置grafana

    A. 下载mysql监控模板,下载地址: https://pan.baidu.com/s/1xWWceAQ_A4kKEn06dUlRBA 

    B. 如何导入请参考配置主机监控的文章中的2.配置grafana中的h至l步骤( https://blog.51cto.com/liuqs/2391282 )

4. 配置alertmanager

    A. 在prometheus配置规则,cat /data/monitor/prometheus/conf/rule/mysql.yml ,下面是文件内容,然后重启prometheus,cd /data/monitor/prometheus && sh reload.sh


groups:
  - name: mysql_alert
    rules:
    ### Slow queries ###
    # Default slow-query policy: at least 100 new slow queries within 5 minutes,
    # sustained for 3 minutes. Three instances are excluded by address.
    # slow_queries is a cumulative counter, so increase() is used instead of
    # delta(): increase() compensates for counter resets (e.g. a mysqld restart),
    # whereas delta() is intended for gauges.
    - alert: mysql慢查询5分钟100条
      expr: floor(increase(mysql_global_status_slow_queries{mysql_addr!~"10.8.6.44:3306|10.8.9.20:3306|10.8.12.212:3306"}[5m])) >= 100
      for: 3m
      labels:
        severity: warning
      annotations:
        description: "[{{ $labels.group }}_{{ $labels.role }}],地址:[{{ $labels.mysql_addr }}],告警值为:[{{ $value }}条],告警初始时长为3分钟."


### QPS ###
    # Default QPS policy: every group except product / product_backend.
    - alert: 'mysql_qps大于8000'
      for: 6m
      expr: 'floor(sum(irate(mysql_global_status_commands_total{group!~"product|product_backend"}[5m])) by (group, role, mysql_addr)) > 8000'
      labels:
        severity: warning
      annotations:
        description: '[{{ $labels.group }}_{{ $labels.role }}],地址:[{{ $labels.mysql_addr }}],告警值为:[{{ $value }}],告警初始时长为6分钟.'

    # QPS policy for the product / product_backend groups (higher threshold).
    - alert: 'mysql_qps大于25000'
      for: 3m
      expr: 'floor(sum(irate(mysql_global_status_commands_total{group=~"product|product_backend"}[5m])) by (group, role, mysql_addr)) > 25000'
      labels:
        severity: warning
      annotations:
        description: '[{{ $labels.group }}_{{ $labels.role }}],地址:[{{ $labels.mysql_addr }}],告警值为:[{{ $value }}],告警初始时长为3分钟.'


### Memory ###
    # Default memory policy: usage rate at or above 99 for 6 minutes.
    # NOTE(review): mysql_mem_used_rate is presumably supplied by the separate
    # collection script mentioned in the article, not by mysqld_exporter - confirm.
    - alert: 'mysql内存99%'
      for: 6m
      expr: 'mysql_mem_used_rate >= 99'
      labels:
        severity: warning
      annotations:
        description: '[{{ $labels.group }}_{{ $labels.role }}],地址:[{{ $labels.mysql_addr }}],告警值为:[{{ $value }}%],告警初始时长为6分钟.'


### Disk ###
    # Default disk policy: usage rate at or above 85 for 3 minutes, for every
    # instance except the two handled by the 95% rule below.
    - alert: 'mysql磁盘85%'
      for: 3m
      expr: 'mysql_disk_used_rate{mysql_addr!~"10.8.161.53:3306|10.8.115.31:3306"} >= 85'
      labels:
        severity: warning
      annotations:
        description: '[{{ $labels.group }}_{{ $labels.role }}],地址:[{{ $labels.mysql_addr }}],告警值为:[{{ $value }}%],告警初始时长为3分钟.'

    # 95% disk policy: applies only to the two instances excluded above.
    - alert: 'mysql磁盘95%'
      for: 3m
      expr: 'mysql_disk_used_rate{mysql_addr=~"10.8.161.53:3306|10.8.115.31:3306"} >= 95'
      labels:
        severity: warning
      annotations:
        description: '[{{ $labels.group }}_{{ $labels.role }}],地址:[{{ $labels.mysql_addr }}],告警值为:[{{ $value }}%],告警初始时长为3分钟.'


#### IO limit alerts (disabled: both rules below are commented out) ###
## IO limit policy for SSD disks
#    - alert: mysqlSSD盘IO上限预警
#      expr: (floor(mysql_ioops) >= mysql_disk_total_size * 50 * 0.9) and (mysql_ssd == 1) and on() hour() >= 0 < 16
#      for: 6m
#      labels:
#        severity: warning
#      annotations:
#        description: "[{{ $labels.group }}_{{ $labels.role }}],地址:[{{ $labels.mysql_addr }}],告警值为:[{{ $value }}],告警初始时长为6分钟."
#
## IO limit policy for ordinary (non-SSD) disks
#    - alert: mysql普通盘IO上限预警
#      expr: (floor(mysql_ioops) >= mysql_disk_total_size * 10 * 0.9) and (mysql_ssd == 0) and on() hour() >= 0 < 16
#      for: 6m
#      labels:
#        severity: warning
#      annotations:
#        description: "[{{ $labels.group }}_{{ $labels.role }}],地址:[{{ $labels.mysql_addr }}],告警值为:[{{ $value }}],告警初始时长为6分钟."


### Connections ###
    # Default connection policy: connected threads at or above 80% of
    # max_connections for 3 minutes.
    - alert: 'mysql连接数80%'
      for: 3m
      expr: 'floor(mysql_global_status_threads_connected / mysql_global_variables_max_connections * 100) >= 80'
      labels:
        severity: warning
      annotations:
        description: '[{{ $labels.group }}_{{ $labels.role }}],地址:[{{ $labels.mysql_addr }}],告警值为:[{{ $value }}%],告警初始时长为3分钟.'


### Running threads ###
    # Default policy: running threads grew by at least 150 within 5 minutes,
    # sustained for 3 minutes. The three instances listed are handled by the
    # 6-minute rule below instead.
    - alert: mysql运行进程数5分钟增长>150
      expr: floor(delta(mysql_global_status_threads_running{mysql_addr!~"10.8.136.10:3306|10.10.129.116:3306|10.8.67.153:3306"}[5m])) >= 150
      for: 3m
      labels:
        severity: warning
      annotations:
        description: "[{{ $labels.group }}_{{ $labels.role }}],地址:[{{ $labels.mysql_addr }}],告警值为:[{{ $value }}],告警初始时长为3分钟."

    # 6-minute policy: same threshold, longer hold time, only for the three
    # instances excluded above.
    # Fixed: the metric name was misspelled "..._threads_runningi", which
    # selects no series, so this rule could never fire.
    - alert: mysql运行进程数5分钟增长>150
      expr: floor(delta(mysql_global_status_threads_running{mysql_addr=~"10.8.136.10:3306|10.10.129.116:3306|10.8.67.153:3306"}[5m])) >= 150
      for: 6m
      labels:
        severity: warning
      annotations:
        description: "[{{ $labels.group }}_{{ $labels.role }}],地址:[{{ $labels.mysql_addr }}],告警值为:[{{ $value }}],告警初始时长为6分钟."


### Replication broken ###
    # Default replication policy: on any non-master instance, either the IO
    # thread or the SQL thread reports 0 for 1 minute.
    - alert: 'mysql主从同步异常'
      for: 1m
      expr: '(mysql_slave_status_slave_io_running{role!="master"} == 0) or (mysql_slave_status_slave_sql_running{role!="master"} == 0)'
      labels:
        severity: warning
      annotations:
        description: '[{{ $labels.group }}_{{ $labels.role }}],地址:[{{ $labels.mysql_addr }}],主从同步异常,告警初始时长为1分钟.'


### Replication lag ###
    # Default lag policy: at least 30s behind the master for 3 minutes.
    # NOTE(review): 10.8.2.17:3306 is excluded here but is not listed in the
    # 300s rule below, so it gets no lag alert at all - confirm this is intended.
    - alert: 'mysql主从同步延时>30s'
      for: 3m
      expr: 'floor(mysql_slave_status_seconds_behind_master{mysql_addr!~"10.8.137.173:3306|10.8.11.17:3306|10.8.2.17:3306|10.10.29.6:3306|10.8.61.153:3306"}) >= 30'
      labels:
        severity: warning
      annotations:
        description: '[{{ $labels.group }}_{{ $labels.role }}],地址:[{{ $labels.mysql_addr }}],告警值为:[{{ $value }}s],告警初始时长为3分钟.'

    # Large-lag policy: at least 300s behind the master for 12 minutes, only
    # for the instances listed.
    - alert: 'mysql主从同步延时>300s'
      for: 12m
      expr: 'floor(mysql_slave_status_seconds_behind_master{mysql_addr=~"10.8.137.173:3306|10.8.11.17:3306|10.10.29.6:3306|10.8.61.153:3306"}) >= 300'
      labels:
        severity: warning
      annotations:
        description: '[{{ $labels.group }}_{{ $labels.role }}],地址:[{{ $labels.mysql_addr }}],告警值为:[{{ $value }}s],告警初始时长为12分钟.'

    

    B. 配置alertmanager, cat /data/monitor/alertmanager/conf/alertmanager.yml ,如果是相同的接收人,可以直接在原来的资源后面增加;如果是不同的接收人,就需要重新定义接收人模板,然后再定义资源规则并绑定到新的接收人模板


global:
  resolve_timeout: 2m
  # Outgoing mail server.
  smtp_smarthost: 'smtp.163.com:465'
  smtp_require_tls: false
  # Sender mailbox.
  smtp_from: 'itliuqs@163.com'
  # Login for the sender mailbox.
  # NOTE(review): the password is stored here in plain text - keep this file
  # out of version control and restrict its permissions.
  smtp_auth_username: 'itliuqs@163.com'
  smtp_auth_password: 'q5AYahvxi3WLDap3'
  # WeChat Work API endpoint.
  wechat_api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'


# Inhibition rules: while an alert matching source_match is firing, alerts
# matching target_match that share the labels listed under `equal` are muted.
inhibit_rules:
# Host CPU alert mutes the host load alert on the same instance.
- source_match:
    alertname: "主机CPU90%"
  target_match:
    alertname: "主机负载过高"
  equal:
  - instance
# Running-thread growth alert mutes the slow-query alert on the same instance.
# Fixed: the source alertname carried an extra "数" and therefore did not match
# the alert name defined in the rule file ("mysql运行进程数5分钟增长>150"),
# so this inhibition never applied.
- source_match:
    alertname: "mysql运行进程数5分钟增长>150"
  target_match:
    alertname: "mysql慢查询5分钟100条"
  equal:
  - instance
# error mutes warning on the same instance.
- source_match:
    severity: error
  target_match:
    severity: warning
  equal:
  - instance
# fatal mutes error on the same instance.
- source_match:
    severity: fatal
  target_match:
    severity: error
  equal:
  - instance
# error mutes warning for the same service_name.
- source_match:
    severity: error
  target_match:
    severity: warning
  equal:
  - service_name


receivers:
# Receiver "test": e-mail plus WeChat Work notification.
- name: test
  email_configs:
  # Mailbox that receives the alerts. The original note says multiple
  # recipients are separated with "|".
  # NOTE(review): e-mail recipient lists are normally comma-separated - verify.
  - to: liuqs@126.com
    send_resolved: true
    # Notification template to render.
    html: '{{  template "email.default.html" . }}'
  wechat_configs:
  # agent_id: application id; api_secret: application secret;
  # corp_id: WeChat Work corporation id.
  - corp_id: wwd397231fa801beaa
    agent_id: 1000002
    api_secret: hnyU1LTGnJUiBaCp47l3WVQLTEFF5RXyfNO751xlaHa
    send_resolved: true
    # WeChat Work user ids; separate several users with "|".
    to_user: LiuQingShan|liuqs

# Receiver "default_group": referenced as the top-level route receiver.
- name: default_group
  email_configs:
  - to: liuqs@126.com
    send_resolved: true
    html: '{{  template "email.default.html" . }}'
  wechat_configs:
  - corp_id: wwd397231fa801beaa
    agent_id: 1000002
    api_secret: hnyU1LTGnJUiBaCp47l3WVQLTEFF5RXyfNO751xlaHa
    send_resolved: true
    to_user: LiuQingShan


# Routing tree: which receiver gets which alert.
route:
  receiver: default_group
  group_by:
  - monitor
  group_wait: 30s
  group_interval: 2m
  repeat_interval: 6h
  routes:
  # Alerts whose instance label matches one of the listed hosts or exporter
  # addresses are delivered to the "test" receiver.
  - receiver: test
    continue: true
    match_re:
      instance: '10.8.46.117:9100|10.8.80.126:9100|10.8.32.67:9100|10.8.9.35:9100|10.8.69.81:9100|localhost:15050|localhost:15221|localhost:15052|localhost:15053|localhost:15049|localhost:15051|localhost:15060'


# Notification template files (glob).
templates:
- '/data/monitor/alertmanager/template/*.tmpl'