Prometheus
二、服务的告警与监控
0. 实现钉钉报警
0.1 webhook-dingtalk安装与配置
# Download prometheus-webhook-dingtalk v2.1.0 and unpack it under /opt.
wget https://github.com/timonwong/prometheus-webhook-dingtalk/releases/download/v2.1.0/prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz
tar xf prometheus-webhook-dingtalk-2.1.0.linux-amd64.tar.gz -C /opt
# Install the systemd unit. The heredoc delimiter is quoted so the shell
# writes the body verbatim; --config.file=config.yml is a relative path,
# with WorkingDirectory set to the install directory.
cat > /etc/systemd/system/webhook-dingtalk.service << 'EOF'
[Unit]
Description=The prometheus webhook dingtalk
After=network-online.target
Wants=network-online.target
[Service]
WorkingDirectory=/opt/prometheus-webhook-dingtalk-2.1.0.linux-amd64
ExecStart=/opt/prometheus-webhook-dingtalk-2.1.0.linux-amd64/prometheus-webhook-dingtalk --config.file=config.yml --web.enable-lifecycle
User=prometheus
Group=prometheus
KillSignal=SIGQUIT
Restart=always
RestartPreventExitStatus=1 6 SIGABRT
TimeoutStopSec=5
KillMode=process
PrivateTmp=true
LimitNOFILE=1048576
LimitNPROC=1048576
[Install]
WantedBy=multi-user.target
EOF
# The service runs as prometheus, so hand it the install directory.
# Use the ':' owner/group separator; the '.' form is deprecated.
chown -R prometheus:prometheus /opt/prometheus-webhook-dingtalk-2.1.0.linux-amd64
cat config.yml
# DingTalk message template files loaded by prometheus-webhook-dingtalk
templates:
  - /opt/alertmanager-0.26.0.linux-amd64/template/ding-talk.tmpl
targets:
  # webhook1 uses the signed ("加签") mode: `secret` is set next to the URL.
  # NOTE(review): access_token and secret are credentials; keep real values
  # out of version control.
  webhook1:
    url: https://oapi.dingtalk.com/robot/send?access_token=675d581185ee70afxc9d96c5e8136bda3f2
    secret: SECaeefa920dxa6fa3f0977a
    message:
      # Both names are defined in ding-talk.tmpl.
      title: '{{ template "ops.title" . }}'
      text: '{{ template "ops.content" . }}'
# DingTalk message template. It defines "ops.title" and "ops.content",
# the two names referenced by the webhook target's message settings.
# Create the template directory first so the redirect cannot fail, and
# quote the heredoc delimiter so the template text is written verbatim.
mkdir -p /opt/alertmanager-0.26.0.linux-amd64/template
cat > /opt/alertmanager-0.26.0.linux-amd64/template/ding-talk.tmpl <<'EOF'
{{ define "__subject" }}
[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}]
{{ end }}
{{ define "__alert_list" }}{{ range . }}
---
**告警名称**: {{ index .Annotations "title" }}
**告警环境**: {{ .Labels.env }}
**告警级别**: {{ .Labels.severity }}
**告警主机**: {{ .Labels.instance }}
**告警信息**: {{ index .Annotations "description" }}
**告警时间**: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}
{{ define "__resolved_list" }}{{ range . }}
---
**告警名称**: {{ index .Annotations "title" }}
**告警环境**: {{ .Labels.env }}
**告警级别**: {{ .Labels.severity }}
**告警主机**: {{ .Labels.instance }}
**告警信息**: {{ index .Annotations "description" }}
**告警时间**: {{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
**恢复时间**: {{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{ end }}{{ end }}
{{ define "ops.title" }}
{{ template "__subject" . }}
{{ end }}
{{ define "ops.content" }}
{{ if gt (len .Alerts.Firing) 0 }}
**====侦测到{{ .Alerts.Firing | len }}个故障====**
{{ template "__alert_list" .Alerts.Firing }}
---
{{ end }}
{{ if gt (len .Alerts.Resolved) 0 }}
**====恢复{{ .Alerts.Resolved | len }}个故障====**
{{ template "__resolved_list" .Alerts.Resolved }}
{{ end }}
{{ end }}
{{ define "ops.link.title" }}{{ template "ops.title" . }}{{ end }}
{{ define "ops.link.content" }}{{ template "ops.content" . }}{{ end }}
{{ template "ops.title" . }}
{{ template "ops.content" . }}
EOF
# Enable at boot, start now, then print the unit status.
for action in enable start status; do
  systemctl "$action" webhook-dingtalk
done
0.2 alertmanager
# alertmanager.yml: send every alert to the DingTalk webhook bridge
global:
  resolve_timeout: 5m
route:
  group_by: ['alertname']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 3h
  receiver: 'web.hook'
receivers:
  - name: 'web.hook'
    webhook_configs:
      # "webhook1" is the target name from the webhook-dingtalk config.yml.
      - url: 'http://10.0.0.100:8060/dingtalk/webhook1/send'
inhibit_rules:
  # A critical alert mutes warning alerts with the same alertname and
  # instance.
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']
1. node-exporter
1.1 prometheus配置
# node_exporter scrape job (an entry of scrape_configs in prometheus.yml).
# Each target group carries an `env` label, which the DingTalk template
# prints via .Labels.env.
- job_name: "node-exporter"
  scrape_interval: 15s
  static_configs:
    - targets: ["localhost:9100"]
      labels:
        env: dev
    - targets: ["10.0.0.101:9100"]
      labels:
        env: prod
1.2 Instance告警
# Instance-down check: fires when a node-exporter target has reported
# up == 0 for 30 seconds.
groups:
  - name: prometheus alert
    rules:
      - alert: 实例宕机
        expr: up{job="node-exporter"} == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          title: 'Instance Down'
          description: "{{ $labels.instance }} 已宕机,请速度处理."
# CPU: per-instance usage, computed as 100 * (1 - average idle rate over
# 5m), above 60% for 5 minutes.
groups:
  - name: prometheus alert
    rules:
      - alert: HighNodeCPU
        expr: (1 - avg(rate(node_cpu_seconds_total{job=~"node-exporter",mode="idle"}[5m])) by (instance)) * 100 > 60
        for: 5m
        labels:
          severity: warning
        annotations:
          title: CPU
          # Quoted: the text contains ": ", which is not allowed inside a
          # plain (unquoted) YAML scalar.
          description: 'cpu 使用率>60% ,当前值: {{ printf "%.2f" $value }}%.'
# Memory: usage, computed as 100 * (1 - MemAvailable / MemTotal), above
# 80% for 1 minute.
groups:
  - name: prometheus alert
    rules:
      - alert: HighNodeMem
        expr: (1 - (node_memory_MemAvailable_bytes{job=~"node-exporter"} / (node_memory_MemTotal_bytes{job=~"node-exporter"})))* 100 > 80
        for: 1m
        labels:
          severity: warning
        annotations:
          title: Mem
          # Quoted so the templated text is always read as one string.
          description: 'Memory 使用率>80% ,当前值:{{ printf "%.2f" $value }}%.'
# Disk: used share of the "/" mountpoint, computed as
# 100 * (size - free) / size, above 80% for 5 minutes.
groups:
  - name: prometheus alert
    rules:
      - alert: HighNodeDisk
        expr: (node_filesystem_size_bytes{mountpoint="/"} - node_filesystem_free_bytes{mountpoint="/"}) / node_filesystem_size_bytes{mountpoint='/'} * 100 > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          title: 'Disk'
          # Quoted so the templated text is always read as one string.
          description: 'Disk 使用率>80% , 当前值:{{ printf "%.2f" $value }}%.'
1.3 监控
实例dashboard json已放到下载页上
展示:
2.nginx_exporter
2.1 nginx-prometheus-exporter安装和配置
# Unpack nginx-prometheus-exporter 1.1.0. `tar -C` does not create the
# target directory, so make it first.
mkdir -p /opt/nginx-prometheus-exporter
tar xf nginx-prometheus-exporter_1.1.0_linux_amd64.tar.gz -C /opt/nginx-prometheus-exporter/
# Serve stub_status on port 8080, allowed from 127.0.0.1 only.
cat > /etc/nginx/conf.d/nginx_status.conf <<'EOF'
server {
    listen 8080;
    location /nginx_status {
        stub_status on;
        access_log off;
        allow 127.0.0.1;
        deny all;
    }
}
EOF
nginx -s reload
# systemd unit for the exporter. The scrape URI flag is written with two
# dashes, the form the 1.x exporter expects.
cat > /etc/systemd/system/nginx_exportor.service << 'EOF'
[Unit]
Description=The prometheus nginx exportor
After=network-online.target
Wants=network-online.target
[Service]
WorkingDirectory=/opt/nginx-prometheus-exporter
ExecStart=/opt/nginx-prometheus-exporter/nginx-prometheus-exporter --nginx.scrape-uri=http://127.0.0.1:8080/nginx_status
KillSignal=SIGQUIT
Restart=always
RestartPreventExitStatus=1 6 SIGABRT
TimeoutStopSec=5
KillMode=process
PrivateTmp=true
LimitNOFILE=1048576
LimitNPROC=1048576
[Install]
WantedBy=multi-user.target
EOF
# nginx exporter scrape job (an entry of scrape_configs in prometheus.yml)
- job_name: "nginx-exporter"
  scrape_interval: 15s
  static_configs:
    - targets: ["10.0.0.101:9113"]
      labels:
        env: prod
2.2告警
# Fires when nginx_up has been 0 for 5 seconds.
groups:
  - name: prometheus alert
    rules:
      - alert: nginx-status
        expr: nginx_up == 0
        for: 5s
        labels:
          severity: critical
        annotations:
          title: 'Nginx Down'
          description: "nginx已宕机,请速度处理."
2.3监控
实例dashboard json已放到下载页上
3.redis_exporter
3.1 redis_exporter安装和配置
# Unpack redis_exporter v1.58.0 and install its systemd unit.
tar xf redis_exporter-v1.58.0.linux-amd64.tar.gz -C /opt/
# The exporter listens on 10.0.0.101:9122 and is pointed at the Redis
# node on port 6371 with an empty password. The [Install] section is
# required for `systemctl enable` to have any effect.
cat > /etc/systemd/system/redis_exporter.service <<'EOF'
[Unit]
Description=redis_exporter
Documentation=https://prometheus.io/
After=network.target
[Service]
Type=simple
User=prometheus
ExecStart=/opt/redis_exporter-v1.58.0.linux-amd64/redis_exporter -redis.addr=10.0.0.101:6371 -redis.password '' -web.listen-address 10.0.0.101:9122
Restart=always
[Install]
WantedBy=multi-user.target
EOF
# Redis cluster scrape job (an entry of scrape_configs in prometheus.yml).
# Every redis:// address is handed to the single exporter at
# 10.0.0.101:9122 through its /scrape endpoint.
- job_name: 'redis_exporter_targets'
  static_configs:
    - targets:
        - redis://10.0.0.101:6371
        - redis://10.0.0.101:6372
        - redis://10.0.0.101:6373
        - redis://10.0.0.101:6374
        - redis://10.0.0.101:6375
        - redis://10.0.0.101:6376
      # `labels` belongs to the target group; it is not a job-level key.
      labels:
        env: prod
  params:
    check-keys: ["metrics:*"]
  metrics_path: /scrape
  relabel_configs:
    # Pass the Redis address as the ?target= query parameter ...
    - source_labels: [__address__]
      target_label: __param_target
    # ... keep it as the instance label ...
    - source_labels: [__param_target]
      target_label: instance
    # ... and send the actual HTTP request to the exporter.
    - target_label: __address__
      replacement: 10.0.0.101:9122
3.2 redis告警
# Write the Redis-down rule file. The heredoc delimiter is quoted:
# unquoted, the shell would expand $labels (normally unset) to an empty
# string and corrupt the description template.
cat > redis_target.yml <<'EOF'
groups:
  - name: prometheus alert
    rules:
      - alert: Redis Down
        expr: redis_up == 0
        for: 5s
        labels:
          severity: critical
        annotations:
          title: 'Redis Down'
          description: "{{ $labels.instance }} Redis 已宕机,请速度处理."
EOF
4. minio
4.1 minio监控和配置
在每个 minio 节点的启动脚本中添加环境变量:export MINIO_PROMETHEUS_AUTH_TYPE="public"
# MinIO scrape job (an entry of scrape_configs in prometheus.yml).
# 'xx.xx.xx.xx' and '...' are placeholders for the remaining MinIO nodes.
- job_name: minio-job
  metrics_path: /minio/prometheus/metrics
  scheme: http
  static_configs:
    - targets: ['10.0.0.101:9000','xx.xx.xx.xx','...']
4.2 minio告警
# Fires when a target of the minio-job scrape job has been down for 30s.
groups:
  - name: prometheus alert
    rules:
      - alert: minio服务下线
        # A label matcher needs name="value"; the bare `up{minio-job}` is
        # not valid PromQL.
        expr: up{job="minio-job"} == 0
        for: 30s
        labels:
          severity: critical
        annotations:
          title: 'minio svc Down'
          description: "{{ $labels.instance }} minio服务下线,请速度处理."
5. elasticsearch_exporter
5.1 elasticsearch_exporter安装和配置
# Unpack elasticsearch_exporter 1.7.0 and install its systemd unit.
tar xf elasticsearch_exporter-1.7.0.linux-amd64.tar.gz -C /opt/
# "--[no-]collector.cluster-info" is help-text notation for a boolean
# flag, not a real flag; the enabling form is --collector.cluster-info.
# The [Install] section is required by `systemctl enable` below.
cat > /etc/systemd/system/elasticsearch_exporter.service <<'EOF'
[Unit]
Description=elasticsearch_exporter
Documentation=https://prometheus.io/
After=network.target
[Service]
Type=simple
User=prometheus
ExecStart=/opt/elasticsearch_exporter-1.7.0.linux-amd64/elasticsearch_exporter --es.all --collector.cluster-info --es.timeout=10s --es.uri http://10.0.0.101:9200
Restart=always
[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reload
systemctl enable elasticsearch_exporter
systemctl start elasticsearch_exporter
# Elasticsearch exporter scrape job (an entry of scrape_configs in
# prometheus.yml)
- job_name: "elasticsearch Cluster"
  scrape_interval: 60s
  static_configs:
    - targets: ['10.0.0.101:9114']
5.2 告警
# Fires when the cluster has reported fewer than 3 nodes for 30 seconds.
groups:
  - name: prometheus alert
    rules:
      - alert: elasticsearch cluster
        expr: elasticsearch_cluster_health_number_of_nodes < 3
        for: 30s
        labels:
          severity: critical
        annotations:
          title: 'elasticsearch Down'
          description: "elasticsearch node下线,请速度处理."
6. mysqld_exporter
6.1 mysql单点
-- Monitoring account for mysqld_exporter on the local host, capped at
-- 3 concurrent connections.
CREATE USER 'mysqld_exporter'@'localhost'
  IDENTIFIED BY 'Aegxz/zy105*&Z'
  WITH MAX_USER_CONNECTIONS 3;
-- Global privileges granted to the monitoring account.
GRANT PROCESS, REPLICATION CLIENT, SELECT
  ON *.*
  TO 'mysqld_exporter'@'localhost';
FLUSH PRIVILEGES;
# Unpack mysqld_exporter 0.15.1 and work from its install directory.
tar xf mysqld_exporter-0.15.1.linux-amd64.tar.gz -C /opt
cd /opt/mysqld_exporter-0.15.1.linux-amd64
# Credentials file read by the exporter through --config.my-cnf.
cat > .my.cnf <<'EOF'
[client]
user=mysqld_exporter
password=Aegxz/zy105*&Z
EOF
# The file holds a password: restrict it to the service user.
chown prometheus:prometheus .my.cnf
chmod 600 .my.cnf
# systemd unit; the [Install] section is required by `systemctl enable`
# below.
cat > /etc/systemd/system/mysql_exporter.service <<'EOF'
[Unit]
Description=mysql_exporter
Documentation=https://prometheus.io/
After=network.target
[Service]
Type=simple
User=prometheus
ExecStart=/opt/mysqld_exporter-0.15.1.linux-amd64/mysqld_exporter --config.my-cnf=/opt/mysqld_exporter-0.15.1.linux-amd64/.my.cnf
Restart=always
[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reload
systemctl enable mysql_exporter
systemctl start mysql_exporter
# mysqld_exporter scrape job for the single-node setup (an entry of
# scrape_configs in prometheus.yml)
- job_name: "mysql-exporter"
  scrape_interval: 30s
  static_configs:
    - targets: ["10.0.0.101:9104"]
      labels:
        env: prod
6.2 mysql集群
-- Monitoring account for mysqld_exporter, usable from hosts matching
-- 10.0.% and capped at 3 concurrent connections.
CREATE USER 'mysqld_exporter'@'10.0.%'
  IDENTIFIED BY 'Aegxz/zy105*&Z'
  WITH MAX_USER_CONNECTIONS 3;
-- Global privileges granted to the monitoring account.
GRANT PROCESS, REPLICATION CLIENT, SELECT
  ON *.*
  TO 'mysqld_exporter'@'10.0.%';
FLUSH PRIVILEGES;
# Unpack mysqld_exporter 0.15.1 and work from its install directory.
tar xf mysqld_exporter-0.15.1.linux-amd64.tar.gz -C /opt
cd /opt/mysqld_exporter-0.15.1.linux-amd64
# Credentials file read by the exporter through --config.my-cnf. The
# [client] section name is what the multi-target scrape job selects with
# auth_module=client.
cat > .my.cnf <<'EOF'
[client]
user=mysqld_exporter
password=Aegxz/zy105*&Z
EOF
# The file holds a password: restrict it to the service user.
chown prometheus:prometheus .my.cnf
chmod 600 .my.cnf
# systemd unit; the [Install] section is required by `systemctl enable`
# below.
cat > /etc/systemd/system/mysql_exporter.service <<'EOF'
[Unit]
Description=mysql_exporter
Documentation=https://prometheus.io/
After=network.target
[Service]
Type=simple
User=prometheus
ExecStart=/opt/mysqld_exporter-0.15.1.linux-amd64/mysqld_exporter --config.my-cnf=/opt/mysqld_exporter-0.15.1.linux-amd64/.my.cnf
Restart=always
[Install]
WantedBy=multi-user.target
EOF
systemctl daemon-reload
systemctl enable mysql_exporter
systemctl start mysql_exporter
# Scrape several MySQL servers through one mysqld_exporter (an entry of
# scrape_configs in prometheus.yml). Each address is probed via the
# exporter at 10.0.0.101:9104 on /probe.
- job_name: 'mysql'
  static_configs:
    - targets:
        - 10.0.0.100:3306
        - 10.0.0.101:3306
      labels:
        auth_module: client
        env: prod
  metrics_path: /probe
  relabel_configs:
    # Pass the MySQL address as the ?target= query parameter ...
    - source_labels: [__address__]
      target_label: __param_target
    # ... keep it as the instance label ...
    - source_labels: [__param_target]
      target_label: instance
    # ... and send the actual HTTP request to the exporter.
    - target_label: __address__
      replacement: 10.0.0.101:9104
    # Forward auth_module as a query parameter, then drop the helper label.
    - source_labels: [auth_module]
      target_label: __param_auth_module
    - action: labeldrop
      regex: auth_module
6.3 告警
# mysql_target.yml: fires when mysql_up has been 0 for 5 seconds.
groups:
  - name: prometheus alert
    rules:
      - alert: Mysql Down
        expr: mysql_up == 0
        for: 5s
        labels:
          severity: critical
        annotations:
          title: 'Mysql Down'
          description: "{{ $labels.instance }} Mysql 已宕机,请速度处理."
# mysql_lock-num.yml: fires when more than 2 InnoDB row-lock waits have
# been pending for 5 minutes.
groups:
  - name: prometheus alert
    rules:
      - alert: Mysql lock_waits
        expr: mysql_global_status_innodb_row_lock_current_waits > 2
        for: 5m
        labels:
          severity: critical
        annotations:
          title: 'Mysql lock_waits'
          description: "{{ $labels.instance }} Mysql 正在等待锁 > 2个,请速度处理."
7.Doris数仓
# Doris warehouse cluster scrape job (an entry of scrape_configs in
# prometheus.yml). Targets are split into two groups, labelled
# group=fe and group=be; 'xx.xx.xx.xx' is a placeholder for further nodes.
- job_name: 'Doris_cluster'
  metrics_path: '/metrics'
  static_configs:
    - targets: ['10.0.0.101:8030','xx.xx.xx.xx']
      labels:
        env: prod
        group: fe
    - targets: ['10.0.0.101:8040','xx.xx.xx.xx']
      labels:
        env: prod
        group: be
# Alert: fires when a Doris_cluster target has been down for 5 seconds.
# The description prints the target's `group` label (fe or be).
groups:
  - name: prometheus alert
    rules:
      - alert: Doris Down
        expr: up{job="Doris_cluster"} == 0
        for: 5s
        labels:
          severity: critical
        annotations:
          title: 'Doris Down'
          description: "{{ $labels.instance }} Doris {{ $labels.group }} 已宕机,请速度处理."
8.prometheus监控docker swarm集群
Stack
# Swarm stack that runs one cAdvisor task per node (deploy.mode: global).
# Version 3.2 is the minimum for the long `ports` syntax used below.
version: '3.2'
services:
  cadvisor:
    image: google/cadvisor
    networks:
      - net
    command: -logtostderr -docker_only
    # Publish 8080 directly on each node (host mode, no routing mesh) so
    # Prometheus can scrape every node at <node-ip>:8080.
    ports:
      - target: 8080
        published: 8080
        mode: host
    volumes:
      - /var/run/docker.sock:/var/run/docker.sock:ro
      - /:/rootfs:ro
      - /var/run:/var/run
      - /sys:/sys:ro
      - /var/lib/docker/:/var/lib/docker:ro
    deploy:
      mode: global
      resources:
        limits:
          memory: 128M
        reservations:
          memory: 64M
# The service references network "net", so it must be declared here.
networks:
  net:
    driver: overlay
# cAdvisor scrape job for the Docker Swarm nodes (an entry of
# scrape_configs in prometheus.yml)
- job_name: 'docker_cadvisor'
  metrics_path: '/metrics'
  static_configs:
    - targets: ['10.0.0.101:8080','10.0.0.100:8080']
      labels:
        env: prod
邮件报警
[root@centos7 opt]# cat alertmanager-0.26.0.linux-amd64/template/email.tmpl
{{/* HTML body for e-mail notifications: firing alerts first, then resolved ones. */}}
{{/* Timestamps are shifted by 28800e9 ns (8 hours) before formatting. */}}
{{/* Details use the "description" annotation, the one set by the alert rules. */}}
{{ define "email.html" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range .Alerts.Firing -}}
========= ERROR ==========<br>
告警名称:{{ .Labels.alertname }}<br>
告警级别:{{ .Labels.severity }}<br>
告警机器:{{ .Labels.instance }} {{ .Labels.device }}<br>
告警详情:{{ .Annotations.description }}<br>
告警时间:{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
========= END ==========<br>
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range .Alerts.Resolved -}}
========= INFO ==========<br>
告警名称:{{ .Labels.alertname }}<br>
告警级别:{{ .Labels.severity }}<br>
告警机器:{{ .Labels.instance }}<br>
告警详情:{{ .Annotations.description }}<br>
告警时间:{{ (.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
恢复时间:{{ (.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}<br>
========= END ==========<br>
{{- end }}
{{- end }}
{{- end }}
# alertmanager.yml: deliver notifications by e-mail through smtp.qq.com
global:
  resolve_timeout: 5m
  smtp_from: '14x9@qq.com'
  smtp_smarthost: 'smtp.qq.com:465'
  smtp_auth_username: '145x@qq.com'  # sender account name
  smtp_auth_password: 'pxxc'  # mailbox authorization code
  smtp_require_tls: false
  smtp_hello: 'qq.com'
# Load the file that defines the "email.html" template used below.
templates:
  - '/opt/alertmanager-0.26.0.linux-amd64/template/email.tmpl'
route:
  group_by: ['alertname']
  group_wait: 5s
  group_interval: 5s
  repeat_interval: 5m
  receiver: 'email'
receivers:
  - name: 'email'
    email_configs:
      - to: '14x59@qq.com'
        # The trailing "." passes the notification data into the
        # template; without it the template has no .Alerts to render.
        html: '{{ template "email.html" . }}'
        send_resolved: true
inhibit_rules:
  # A critical alert mutes warning alerts with the same alertname and
  # instance.
  - source_match:
      severity: 'critical'
    target_match:
      severity: 'warning'
    equal: ['alertname', 'instance']