部署Prometheus+grafana+alertmanager+钉钉webhook
下载所需要的镜像
docker pull prom/node-exporter
docker pull prom/prometheus
docker pull grafana/grafana
docker pull prom/alertmanager
docker pull prom/pushgateway
docker pull timonwong/prometheus-webhook-dingtalk
启动Prometheus
mkdir -p /data/prometheus/
mkdir -p /data/prometheus/config
mkdir -p /data/prometheus/config/conf.d
mkdir -p /data/prometheus/config/rules
##挂载目录结构如下
/data/prometheus/
├── config
│   ├── conf.d
│   │   └── host.json
│   ├── prometheus.yml
│   └── rules
│       └── instance.yml
└── data
配置文件详解
## host.json 存放exporter配置
root@saas:/data/prometheus/config# cat conf.d/host.json
[
{
"targets": ["127.0.0.1:9100"],
"labels": {
"instance": "host-localhost"
}
}
]
## instance.yml 告警规则测试
root@saas:/data/prometheus/config# cat rules/instance.yml
groups:
  - name: example
    rules:
      - alert: InstanceDown
        expr: up == 0
        for: 1m
        labels:
          severity: page
        annotations:
          summary: "Instance {{ $labels.instance }} down"
          description: "{{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minute."
## prometheus.yml 主配置文件
root@saas:/data/prometheus/config# cat prometheus.yml
global: # 全局设置,可以被覆盖
  scrape_interval: 15s # 设定抓取数据的频率,默认为1min
  evaluation_interval: 15s # 评估规则的频率,默认为1min
  scrape_timeout: 15s # 设定抓取数据的超时时间,默认为10s
# 注意: Prometheus以bridge网络的容器方式运行时,127.0.0.1指向容器自身;
# 下面的alertmanager(9093)、pushgateway(9091)以及host.json中的node-exporter(9100)地址
# 需要改为宿主机IP(例如10.0.0.73),或者以 --net=host 方式启动Prometheus容器。
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 127.0.0.1:9093 # Alertmanager配置,设定alertmanager和prometheus交互的接口
rule_files:
  - '/etc/prometheus/rules/*.yml' # 报警规则存放路径
scrape_configs:
  - job_name: prometheus
    scrape_interval: 15s # 抓取周期,默认采用global配置
    static_configs:
      - targets: ['127.0.0.1:9090'] # prometheus所要抓取数据的地址
        labels: # 定义标签,便于区分
          instance: prometheus
  - job_name: pushgateway
    honor_labels: true
    static_configs:
      - targets: ['127.0.0.1:9091']
        labels:
          instance: pushgateway
  - job_name: host
    metrics_path: '/metrics'
    file_sd_configs: # 基于文件的服务发现
      - files:
          - /etc/prometheus/conf.d/host.json
## 服务自动发现使用,新加主机节点监控只需要修改host.json文件,不需要重启Prometheus服务。
启动Prometheus服务。
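mkdir -p /data/prometheus/data && chown -R 65534:65534 /data/prometheus/data ## 容器内Prometheus以nobody(UID 65534)运行,数据目录需对其可写
docker run -d -p 9090:9090 -v /data/prometheus/config/:/etc/prometheus -v /data/prometheus/data:/prometheus --name=prometheus prom/prometheus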
安装grafana服务
mkdir /data/grafana-storage ## 创建grafana映射目录
chown -R 472:472 /data/grafana-storage/ ## grafana容器默认以UID 472运行,将目录属主交给它即可,无需777权限
docker run -d -p 3000:3000 --name=grafana -v /data/grafana-storage/:/var/lib/grafana grafana/grafana:latest
安装pushgateway
docker run -d -p 9091:9091 --name=pushgateway prom/pushgateway
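安装node_exporter
## --net=host 模式下 -p 端口映射不生效,故省略;--path.* 参数让exporter读取挂载进来的宿主机目录
docker run -d -v "/proc:/host/proc:ro" -v "/sys:/host/sys:ro" -v "/:/rootfs:ro" --net="host" --name=node-exporter prom/node-exporter --path.procfs=/host/proc --path.sysfs=/host/sys --path.rootfs=/rootfs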
#安装alertmanager
mkdir /data/alertmanager
#目标文件树状图
root@saas:/data/alertmanager# tree /data/alertmanager/
/data/alertmanager/
└── alertmanager.yml
alertmanager配置文件alertmanager.yml详细信息
global:
  resolve_timeout: 5m # 在没有报警的情况下声明为已解决的时间
  # # 配置邮件发送信息
  # smtp_smarthost: 'smtp.test.com:465'
  # smtp_from: 'your_email'
  # smtp_auth_username: 'your_email'
  # smtp_auth_password: 'email_passwd'
  # smtp_hello: 'your_email'
  # smtp_require_tls: false
# 设置报警的分发策略
route:
  receiver: webhook # 发送警报的接收者的名称,默认的receiver
  group_wait: 10s # 当一个新的报警分组被创建后,需要至少等待多久时间发送一组警报的通知
  group_interval: 1m # 当第一个报警发送后,等待'group_interval'时间来发送新的一组报警信息
  repeat_interval: 24h # 报警发送成功后,重新发送等待的时间
  group_by: ['alertname'] # 报警分组依据
  # #子路由,使用email发送
  # routes:
  #   - receiver: email
  #     match_re:
  #       severity: email # label 匹配email
  #     group_wait: 10s
# 定义警报接收者信息
receivers:
  - name: webhook # 与route匹配
    webhook_configs:
      - url: http://10.0.0.73:8060/dingtalk/webhook/send
        send_resolved: true # 发送已解决通知
  #- name: 'email'
  #  email_configs:
  #    - to: 'email@qq.com'
  #      send_resolved: true
# 抑制规则配置
#inhibit_rules:
# [ - <inhibit_rule> ... ]
#target_match:
# [ <labelname>: <labelvalue>, ... ]
#target_match_re:
# [ <labelname>: <regex>, ... ]
#source_match:
# [ <labelname>: <labelvalue>, ... ]
#source_match_re:
# [ <labelname>: <regex>, ... ]
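#[ equal: '[' <labelname>, ... ']' ]
启动alertmanager服务
docker run -d -p 9093:9093 -v /data/alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml --name=alertmanager prom/alertmanager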
钉钉生成webhook地址
https://oapi.dingtalk.com/robot/send?access_token=<YOUR_ACCESS_TOKEN>
(access_token属于敏感凭据,请替换为自己机器人的token,不要将真实token写入文档或提交到代码仓库)
安装钉钉webhook引入到Prometheus报警
## 注意: --ding.profile 参数仅旧版镜像支持(1.x起改用配置文件),因此这里固定使用v0.3.0
docker run -d -p 8060:8060 --name=prometheus-webhook-dingtalk timonwong/prometheus-webhook-dingtalk:v0.3.0 --ding.profile="webhook=https://oapi.dingtalk.com/robot/send?access_token=<YOUR_ACCESS_TOKEN>"
测试是否接入钉钉机器人
curl 'https://oapi.dingtalk.com/robot/send?access_token=<YOUR_ACCESS_TOKEN>' -H 'Content-Type: application/json' -d '{"msgtype": "text","text": {"content": "测试是否连通!"}}'
安装node_exporter 收集服务器基本信息
容器方式安装
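docker pull prom/node-exporter
## --net=host 模式下 -p 端口映射不生效,故省略;--path.* 参数让exporter读取挂载进来的宿主机目录
docker run -d -v "/proc:/host/proc:ro" -v "/sys:/host/sys:ro" -v "/:/rootfs:ro" --net="host" --name=node-exporter prom/node-exporter --path.procfs=/host/proc --path.sysfs=/host/sys --path.rootfs=/rootfs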
物理机安装
#下载node_exporter包
wget http://downloads.max.datahunter.cn/download/software/node_exporter-1.0.1.linux-amd64.tar.gz
#解压包
tar -xf node_exporter-1.0.1.linux-amd64.tar.gz -C /usr/share/
#进入可执行文件目录
cd /usr/share/node_exporter-1.0.1.linux-amd64/
#后台运行
nohup ./node_exporter &
告警rules规则参考地址
https://awesome-prometheus-alerts.grep.to/rules#elasticsearch