一.搭建Prometheus
# Download and unpack the Prometheus 2.12.0 release, then enter the extracted directory
# (the archive unpacks to prometheus-2.12.0.linux-amd64, not "prometheus").
$ wget https://github.com/prometheus/prometheus/releases/download/v2.12.0/prometheus-2.12.0.linux-amd64.tar.gz
$ tar xf prometheus-2.12.0.linux-amd64.tar.gz && cd prometheus-2.12.0.linux-amd64
# Run in the background; --web.enable-lifecycle enables the HTTP reload endpoint (POST /-/reload).
$ nohup ./prometheus --config.file=prometheus.yml --web.enable-lifecycle &
# Edit the configuration (contents below), then reload Prometheus so it takes effect.
$ vim prometheus.yml
# my global config
global:
  scrape_interval: 15s  # Set the scrape interval to every 15 seconds. Default is every 1 minute.
  evaluation_interval: 15s  # Evaluate rules every 15 seconds. The default is every 1 minute.
  # scrape_timeout is set to the global default (10s).

# Alertmanager configuration
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            # NOTE(review): the host name "alertmanager" must resolve from the Prometheus host;
            # otherwise use <alertmanager-host>:9093.
            - alertmanager:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
# The alert rules listed later in this guide must be saved under exactly this path.
rule_files:
  - "/data/prometheus/rule.yml"

scrape_configs:
  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
  - job_name: 'prometheus'
    static_configs:
      - targets: ['localhost:9090']

  # Host monitoring (node_exporter, port 9100)
  - job_name: 'OS'
    static_configs:
      - targets: ['172.31.xxx:9100']
        labels:
          # Replaces the default host:port instance label with a readable name.
          instance: 'web'
      - targets: ['172.xxx:9100']
        labels:
          instance: 'db'

  # MySQL monitoring (mysqld_exporter, port 9104)
  - job_name: 'MySQL'
    static_configs:
      - targets: ['172.31.xxx:9104']
        labels:
          instance: 'db-mysql'

  # Redis monitoring (redis_exporter, port 9121)
  - job_name: 'Redis'
    static_configs:
      - targets: ['172.31.xxx:9121']
        labels:
          instance: 'db-redis'
二.Node/mysql/redis_exporter
//node_exporter
# Download and unpack node_exporter 0.18.1, then enter the extracted directory
# (the archive unpacks to node_exporter-0.18.1.linux-amd64, not "node_exporter").
$ wget https://github.com/prometheus/node_exporter/releases/download/v0.18.1/node_exporter-0.18.1.linux-amd64.tar.gz
$ tar xf node_exporter-0.18.1.linux-amd64.tar.gz && cd node_exporter-0.18.1.linux-amd64
# Run in the background; the scrape config in this guide expects it on port 9100.
$ nohup ./node_exporter &
//mysql_exporter
# Download and unpack mysqld_exporter 0.12.1, then enter the extracted directory.
$ wget https://github.com/prometheus/mysqld_exporter/releases/download/v0.12.1/mysqld_exporter-0.12.1.linux-amd64.tar.gz
$ tar xf mysqld_exporter-0.12.1.linux-amd64.tar.gz &&cd mysqld_exporter-0.12.1.linux-amd64
# First create a dedicated monitoring user in MySQL.
# The account is restricted to 'localhost', so the exporter must connect from the MySQL host itself.
# NOTE(review): the password equals the user name here; choose a stronger one for real deployments.
mysql> CREATE USER 'mysql_monitor'@'localhost' identified by 'mysql_monitor';
mysql> GRANT REPLICATION CLIENT, PROCESS ON *.* TO 'mysql_monitor'@'localhost';
mysql> GRANT SELECT ON performance_schema.* TO 'mysql_monitor'@'localhost';
在mysqld_exporter目录下创建.my.cnf文件
# Create the client credentials file for the exporter (same account as created in MySQL).
# NOTE(review): the file holds a plaintext password; consider restricting its permissions.
$vim .my.cnf
[client]
port=3306
user=mysql_monitor
password=mysql_monitor
# Pass the credentials file explicitly when starting the exporter.
$ nohup ./mysqld_exporter --config.my-cnf=.my.cnf &
如果要监听多个实例
# Second exporter process with its own credentials file and its own listen address (port 9105).
# NOTE(review): .mycntr.cnf is not shown in this guide; presumably it holds the [client] settings
# of the additional MySQL instance — confirm.
$ nohup ./mysqld_exporter --config.my-cnf=.mycntr.cnf --web.listen-address=172.31.243.198:9105 &
//redis_exporter
# Download and unpack redis_exporter v0.30.0.
$ wget https://github.com/oliver006/redis_exporter/releases/download/v0.30.0/redis_exporter-v0.30.0.linux-amd64.tar.gz
# NOTE(review): confirm what the archive extracts to (a versioned directory or just the binary)
# and adjust the `cd` target accordingly.
$ tar xf redis_exporter-v0.30.0.linux-amd64.tar.gz && cd redis_exporter
# Set the Redis server address and the address the exporter serves metrics on.
$ nohup ./redis_exporter -redis.addr=172.31.243.198:6379 -web.listen-address 0.0.0.0:9121 &
三、alertmanager
# Download and unpack Alertmanager 0.15.2, then enter the extracted directory.
$ wget https://github.com/prometheus/alertmanager/releases/download/v0.15.2/alertmanager-0.15.2.linux-amd64.tar.gz
$ tar xf alertmanager-0.15.2.linux-amd64.tar.gz && cd alertmanager-0.15.2.linux-amd64
# Load the alertmanager.yaml shown later in this guide (place it in this directory);
# without --config.file Alertmanager reads alertmanager.yml instead.
$ nohup ./alertmanager --config.file=alertmanager.yaml &
-------------------------------------------
cat /data/prometheus/rule.yml
# Adjust to your environment; every `expr` is a PromQL expression.
# Metric names follow node_exporter >= 0.16 (base units, `_bytes` / `_total` suffixes),
# which is what the node_exporter 0.18.1 installed in this guide exports.
groups:
  - name: UnicornServerStatus
    rules:
      # Fires when any scrape target has been unreachable for 2 minutes.
      - alert: InstanceStatus
        expr: up == 0
        for: 2m
        labels:
          status: warning
        annotations:
          summary: "{{$labels.instance}}: has been down"
          description: "{{$labels.instance}}: job {{$labels.job}} has been down"

  - name: base-monitor-rule
    rules:
      # CPU busy percentage = 100 - idle percentage, averaged per instance.
      - alert: NodeCpuUsage
        expr: (100 - (avg by (instance) (rate(node_cpu_seconds_total{job=~".*",mode="idle"}[2m])) * 100)) > 99
        for: 15m
        labels:
          service_name: unicornServer
          level: warning
        annotations:
          description: "{{$labels.instance}}: CPU usage is above 99% (current value is: {{ $value }})"

      # Used memory percentage, counting free + buffers + cached as available.
      - alert: NodeMemUsage
        expr: avg by (instance) ((1- (node_memory_MemFree_bytes{} + node_memory_Buffers_bytes{} + node_memory_Cached_bytes{})/node_memory_MemTotal_bytes{}) * 100) > 90
        for: 15m
        labels:
          service_name: unicornServer
          level: warning
        annotations:
          description: "{{$labels.instance}}: MEM usage is above 90% (current value is: {{ $value }})"

      # Used disk percentage per filesystem, skipping rootfs and /run, /var, /sys, /dev mounts.
      - alert: NodeDiskUsage
        expr: (1 - node_filesystem_free_bytes{fstype!="rootfs",mountpoint!="",mountpoint!~"/(run|var|sys|dev).*"} / node_filesystem_size_bytes) * 100 > 80
        for: 2m
        labels:
          service_name: unicornServer
          level: warning
        annotations:
          description: "{{$labels.instance}}: Disk usage is above 80% (current value is: {{ $value }})"

      - alert: NodeLoad15
        expr: avg by (instance) (node_load15{}) > 100
        for: 2m
        labels:
          service_name: unicornServer
          level: warning
        annotations:
          description: "{{$labels.instance}}: Load15 is above 100 (current value is: {{ $value }})"

      - alert: NodeAgentStatus
        expr: avg by (instance) (up{}) == 0
        for: 2m
        labels:
          service_name: unicornServer
          level: warning
        annotations:
          description: "{{$labels.instance}}: Node Agent is down (current value is: {{ $value }})"

      - alert: NodeProcsBlocked
        expr: avg by (instance) (node_procs_blocked{}) > 100
        for: 2m
        labels:
          service_name: unicornServer
          level: warning
        annotations:
          description: "{{$labels.instance}}: Node Blocked Procs detected!(current value is: {{ $value }})"

      # Network throughput on eth0 in MB/s (bytes / 1024 / 1024).
      - alert: NodeTransmitRate
        expr: avg by (instance) (floor(irate(node_network_transmit_bytes_total{device="eth0"}[2m]) / 1024 / 1024)) > 100
        for: 2m
        labels:
          service_name: unicornServer
          level: warning
        annotations:
          description: "{{$labels.instance}}: Node Transmit Rate is above 100MB/s (current value is: {{ $value }})"

      - alert: NodeReceiveRate
        expr: avg by (instance) (floor(irate(node_network_receive_bytes_total{device="eth0"}[2m]) / 1024 / 1024)) > 100
        for: 2m
        labels:
          service_name: unicornServer
          level: warning
        annotations:
          description: "{{$labels.instance}}: Node Receive Rate is above 100MB/s (current value is: {{ $value }})"

      # Disk throughput in MB/s (bytes / 1024 / 1024).
      - alert: NodeDiskReadRate
        expr: avg by (instance) (floor(irate(node_disk_read_bytes_total{}[2m]) / 1024 / 1024)) > 50
        for: 2m
        labels:
          service_name: unicornServer
          level: warning
        annotations:
          description: "{{$labels.instance}}: Node Disk Read Rate is above 50MB/s (current value is: {{ $value }})"

      - alert: NodeDiskWriteRate
        # Alternative alert name kept from the original notes: "disk write rate normal".
        expr: avg by (instance) (floor(irate(node_disk_written_bytes_total{}[2m]) / 1024 / 1024)) > 50
        for: 2m
        labels:
          service_name: unicornServer
          level: warning
        annotations:
          description: "{{$labels.instance}}: Node Disk Write Rate is above 50MB/s (current value is: {{ $value }})"
          # Alternative description kept from the original notes: "disk write rate is below 50MB/s".
-------------------------------------------
cat alertmanager.yaml
# Alertmanager configuration: forward every alert to the local DingTalk webhook bridge.
global:
  resolve_timeout: 5m

receivers:
  - name: "dingding.webhook"
    webhook_configs:
      # The path segment "ops_dingding" is the profile name given to prometheus-webhook-dingtalk.
      - url: 'http://127.0.0.1:8060/dingtalk/ops_dingding/send'
        send_resolved: true

route:
  group_by: ['alertname']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1h
  receiver: 'dingding.webhook'

# Suppress warning alerts while a critical alert with the same alertname/dev/instance is firing.
# NOTE(review): the alert rules in this guide label alerts with `level` / `status`, not `severity`,
# so this rule does not match them unless the rules are relabelled.
inhibit_rules:
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'dev', 'instance']
四、dingtalk
1.钉钉群新增机器人:
- 群设置 - 群智能助手 - 添加机器人 - 自定义
- 为机器人命名并选择安全设置(共三种方式,见 https://ding-doc.dingtalk.com/doc#/serverapi2/qf2nxq);这里选择"关键字",关键字需与alertmanager的alert名字对应
- 复制webhook地址,下一步要用
2.dingtalk-webhook:
//https://github.com/timonwong/prometheus-webhook-dingtalk/releases/
# Unpack the release downloaded from the page above and enter the extracted directory,
# so that the binary can be started with a relative path.
$ tar xf prometheus-webhook-dingtalk-0.3.0.linux-amd64.tar.gz && cd prometheus-webhook-dingtalk-0.3.0.linux-amd64
# Start the bridge; the profile name "ops_dingding" is the one used in the Alertmanager webhook URL path.
# NOTE: the access_token is a secret; keep the real value out of shared documents.
$ ./prometheus-webhook-dingtalk --ding.profile="ops_dingding=https://oapi.dingtalk.com/robot/send?access_token=571db1bc165ffcec8d0a33246ce950xxxxxcbab4eef9b2e885a1d80b2"
//修改alertmanager.yaml alert相关的内容,webhook url那行
五、Grafana
添加数据源:http://<Prometheus服务器IP>:9090
Redis Template ID:11692
MySQL Template ID:6239
Node Template ID:8919
# Reload the Prometheus configuration without a restart (needs --web.enable-lifecycle).
curl -X POST http://localhost:9090/-/reload
修改配置文件后使之生效
遇到任何问题请一定看各个组件的日志