1、Prometheus使用钉钉告警需要安装插件,本文以docker部署方式演示
2、说明:本文以钉钉机器人安全设置中的"IP地址(段)"放通方式为例进行演示
3、docker启动钉钉告警插件
docker run -d --restart always -p 8060:8060 --name webhook-dingding -v /root/dingding.tmpl:/root/dingding.tmpl -v /etc/localtime:/etc/localtime timonwong/prometheus-webhook-dingtalk:v0.3.0 --template.file="/root/dingding.tmpl" --ding.profile="webhook1=https://oapi.dingtalk.com/robot/send?access_token=自己的钉钉token"
--name webhook-dingding # 容器名称
-v /root/dingding.tmpl:/root/dingding.tmpl # 映射告警模板
-v /etc/localtime:/etc/localtime # 挂载宿主机时区文件,使容器时区与宿主机保持一致
--template.file # 指定模板路径
--ding.profile # 指定钉钉机器人的webhook地址,格式为"名称=地址";名称(如 webhook1)会出现在 alertmanager 配置的 url 路径中
4、启动完成后,可以使用如下命令测试钉钉机器人是否能正常接收消息(请将 access_token 替换为自己的钉钉token)
curl -H "Content-Type: application/json" -d '{"msgtype":"text","text":{"content":"prometheus alert test"}}' https://oapi.dingtalk.com/robot/send?access_token=b81384b8b0c751cbc8ff736e2c064fc1b68c63878128c4082663bf988a101b11
4.1、返回如下内容表示正常
{"errcode":0,"errmsg":"ok"}
5、自定义钉钉告警模板
[root@prometheus ~]# cat dingding.tmpl
{{ define "__subject" }}[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}] {{ .GroupLabels.SortedPairs.Values | join " " }} {{ if gt (len .CommonLabels) (len .GroupLabels) }}({{ with .CommonLabels.Remove .GroupLabels.Names }}{{ .Values | join " " }}{{ end }}){{ end }}{{ end }}
{{ define "__alertmanagerURL" }}{{ .ExternalURL }}/#/alerts?receiver={{ .Receiver }}{{ end }}
{{ define "__text_alert_list" }}{{ range . }}
告警程序:prometheus_alert
告警级别:{{ .Labels.severity }}
告警类型:{{ .Labels.alertname }}
主机: {{ .Labels.instance }}
命名空间: {{ .Labels.namespace }}
Pod: {{ .Labels.pod }}
告警主题: {{ .Annotations.summary }}
告警描叙: {{ .Annotations.description }}
触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
------------------------
{{ end }}{{ end }}
{{ define "__text_resolve_list" }}{{ range . }}
恢复程序:{{ .Labels.alertname }}
主机: {{ .Labels.instance }}
恢复描叙: {{ .Annotations.description }}
触发时间: {{ .StartsAt.Format "2006-01-02 15:04:05" }}
------------------------
{{ end }}{{ end }}
{{ define "ding.link.title" }}{{ template "__subject" . }}{{ end }}
{{ define "ding.link.content" }}#### \[{{ .Status | toUpper }}{{ if eq .Status "firing" }}:{{ .Alerts.Firing | len }}{{ end }}\] **[{{ index .GroupLabels "alertname" }}]({{ template "__alertmanagerURL" . }})**
{{ if gt (len .Alerts.Firing) 0 -}}
![警报 图标](https://ss0.bdstatic.com/70cFuHSh_Q1YnxGkpoWK1HF6hhy/it/u=3626076420,1196179712&fm=15&gp=0.jpg)
**====侦测到故障====**
{{ template "__text_alert_list" .Alerts.Firing }}
{{- end }}
{{ if gt (len .Alerts.Resolved) 0 -}}
恢复列表:
{{ template "__text_resolve_list" .Alerts.Resolved }}
{{- end }}
{{- end }}
6、修改 alertmanager 配置文件
[root@prometheus alertmanager]# cat alertmanager.yml
global:
resolve_timeout: 5m
route:
group_by: ['alertname']
group_wait: 30s
group_interval: 1m
repeat_interval: 2m
receiver: 'web.hook'
receivers:
- name: 'web.hook'
webhook_configs:
- url: 'http://localhost:8060/dingtalk/webhook1/send'
send_resolved: true # 表示服务恢复后会收到恢复告警
inhibit_rules:
- source_match:
alertname: 'ApplicationDown'
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname',"target","job","instance"]
6.1、重启 alertmanager 服务
[root@prometheus alertmanager]# systemctl restart alertmanager
7、修改 prometheus 配置
[root@k8s-node1 prometheus]# cat prometheus.yml
# my global config
global:
scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
# scrape_timeout is set to the global default (10s).
# Alertmanager configuration
alerting:
alertmanagers:
- static_configs:
- targets:
- 192.168.33.145:9093
# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files:
- "rules/*.yml" # 修改此处的配置即可。此处表示当前prometheus目录下的rules目录下的所有yml文件全部匹配
# - "second_rules.yml"
# A scrape configuration containing exactly one endpoint to scrape:
# Here it's Prometheus itself.
scrape_configs:
# The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
- job_name: 'prometheus'
# metrics_path defaults to '/metrics'
# scheme defaults to 'http'.
static_configs:
- targets: ['localhost:9090']
8、重新加载 prometheus 配置即可(向 prometheus 进程发送 HUP 信号,下面命令中的 64254 为示例 PID,请替换为实际的 prometheus 进程号)
[root@k8s-node1 prometheus]# kill -HUP 64254
9、钉钉告警示例如下:
10、服务恢复告警示例如下: