Alertmanager主要处理流程
- 接收到Alert,根据labels判断属于哪些Route(可存在多个Route,一个Route有多个Group,一个Group有多个Alert)
- 将Alert分配到Group中,没有则新建Group
- 新的Group等待group_wait指定的时间(等待时可能收到同一Group的Alert),根据resolve_timeout判断Alert是否解决,然后发送通知
- 已有的Group等待group_interval指定的时间,判断Alert是否解决,当上次发送通知到现在的间隔大于repeat_interval或者Group有更新时会发送通知
邮件报警方式: https://blog.csdn.net/weixin_39816723/article/details/99679592
微信报警方式:
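需先在企业微信管理后台获取以下信息,用于后续配置:
- 企业ID(对应配置项corp_id / wechat_api_corp_id)
- 应用的Secret(对应api_secret / wechat_api_secret)和应用的AgentId(对应agent_id)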
1.报警规则(微信)
# alert-cm.yaml
# ConfigMap carrying the Alertmanager main configuration file (config.yml).
# It defines a single route that sends every alert group to the 'wechat'
# receiver (WeChat Work application message).
kind: ConfigMap
apiVersion: v1
metadata:
  name: alertmanager-config
  namespace: kube-system
data:
  config.yml: |-
    global:
      # Time after which an alert is declared resolved (default 5m).
      resolve_timeout: 10m
      # WeChat Work API endpoint; this value is also the default.
      wechat_api_url: 'https://qyapi.weixin.qq.com/cgi-bin/'
      # Application secret, shown on the application's settings page.
      wechat_api_secret: '*******************'
      # Enterprise (corp) ID, shown on the enterprise settings page.
      wechat_api_corp_id: '****************'
    templates:
      # Notification template files.
      - '/etc/alertmanager/config/*.tmpl'
    route:
      group_by: ['alertname']
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 12h
      receiver: 'wechat'
    # No inhibition rules are defined; add entries with source/target
    # matchers here when needed.
    inhibit_rules: []
    receivers:
      - name: 'wechat'
        wechat_configs:
          # Resolved notifications are disabled; set to true to have the
          # "resolved" section of the message template delivered.
          - send_resolved: false
            # Same value as wechat_api_corp_id.
            corp_id: '****************'
            # Recipients: '@all' addresses every member.
            to_user: '@all'
            # Department (party) IDs, separated by '|' without spaces.
            to_party: 'PartyID1|PartyID2'
            message: '{{ template "wechat.default.message" . }}'
            # Application AgentId, shown on the application's settings page.
            agent_id: '***********'
            # Same value as wechat_api_secret.
            api_secret: '**************'
2.模板信息
# alert-tem.yaml
# ConfigMap providing the WeChat notification template. The template named
# "wechat.default.message" is the one a receiver selects with
#   message: '{{ template "wechat.default.message" . }}'
# NOTE: that reference must NOT appear inside the template body itself,
# otherwise the template would invoke itself recursively.
apiVersion: v1
kind: ConfigMap
metadata:
  name: alert-wechat
  namespace: kube-system
data:
  # The first section renders firing alerts, the second renders resolved
  # alerts; each section iterates only over its own subset so an alert is
  # never listed under the wrong state.
  wechat.tmpl: |
    {{ define "wechat.default.message" }}
    {{- if gt (len .Alerts.Firing) 0 -}}
    {{- range $index, $alert := .Alerts.Firing -}}
    {{- if eq $index 0 -}}
    告警类型: {{ $alert.Labels.alertname }}
    告警级别: {{ $alert.Labels.severity }}
    =====================
    {{- end }}
    ===告警详情===
    告警详情: {{ $alert.Annotations.message }}
    故障时间: {{ $alert.StartsAt.Format "2006-01-02 15:04:05" }}
    ===参考信息===
    {{ if gt (len $alert.Labels.instance) 0 -}}故障实例ip: {{ $alert.Labels.instance }};{{- end -}}
    {{- if gt (len $alert.Labels.namespace) 0 -}}故障实例所在namespace: {{ $alert.Labels.namespace }};{{- end -}}
    {{- if gt (len $alert.Labels.node) 0 -}}故障物理机ip: {{ $alert.Labels.node }};{{- end -}}
    {{- if gt (len $alert.Labels.pod_name) 0 -}}故障pod名称: {{ $alert.Labels.pod_name }}{{- end }}
    =====================
    {{- end }}
    {{- end }}
    {{- if gt (len .Alerts.Resolved) 0 -}}
    {{- range $index, $alert := .Alerts.Resolved -}}
    {{- if eq $index 0 -}}
    告警类型: {{ $alert.Labels.alertname }}
    告警级别: {{ $alert.Labels.severity }}
    =====================
    {{- end }}
    ===告警详情===
    告警详情: {{ $alert.Annotations.message }}
    故障时间: {{ $alert.StartsAt.Format "2006-01-02 15:04:05" }}
    恢复时间: {{ $alert.EndsAt.Format "2006-01-02 15:04:05" }}
    ===参考信息===
    {{ if gt (len $alert.Labels.instance) 0 -}}故障实例ip: {{ $alert.Labels.instance }};{{- end -}}
    {{- if gt (len $alert.Labels.namespace) 0 -}}故障实例所在namespace: {{ $alert.Labels.namespace }};{{- end -}}
    {{- if gt (len $alert.Labels.node) 0 -}}故障物理机ip: {{ $alert.Labels.node }};{{- end -}}
    {{- if gt (len $alert.Labels.pod_name) 0 -}}故障pod名称: {{ $alert.Labels.pod_name }};{{- end }}
    =====================
    {{- end }}
    {{- end }}
    {{- end }}
3.部署文件信息
# alert-de.yaml
# Deployment running a single Alertmanager replica in kube-system. It mounts
# the main configuration (ConfigMap alertmanager-config), the WeChat message
# template (ConfigMap alert-wechat) and an emptyDir for Alertmanager's data.
---
# apps/v1beta2 is no longer served as of Kubernetes 1.16; apps/v1 is the
# stable Deployment API.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    name: alertmanager-deployment
  name: alertmanager
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      containers:
        - name: alertmanager
          # NOTE(review): no image tag is pinned, so the registry's default
          # tag is pulled; consider pinning a specific version.
          image: prom/alertmanager
          imagePullPolicy: IfNotPresent
          env:
            # Pod IP, substituted into the cluster listen address below.
            - name: POD_IP
              valueFrom:
                fieldRef:
                  apiVersion: v1
                  fieldPath: status.podIP
          args:
            - "--config.file=/etc/alertmanager/config.yml"
            - "--storage.path=/alertmanager/data"
            - "--cluster.listen-address=$(POD_IP):6783"
          ports:
            - containerPort: 9093
              name: http
          volumeMounts:
            # Main configuration file (config.yml).
            - mountPath: "/etc/alertmanager"
              name: alertcfg
            # Template files matched by '/etc/alertmanager/config/*.tmpl'.
            - mountPath: "/etc/alertmanager/config"
              name: alert-wechat
            # Backing storage for --storage.path=/alertmanager/data; the
            # 'data' volume was declared below but previously never mounted.
            - mountPath: "/alertmanager"
              name: data
          resources:
            requests:
              cpu: 100m
              memory: 256Mi
            limits:
              cpu: 100m
              memory: 256Mi
      serviceAccountName: prometheus
      volumes:
        - name: alertcfg
          configMap:
            name: alertmanager-config
        - name: data
          emptyDir: {}
        - name: alert-wechat
          configMap:
            name: alert-wechat
4.部署
# Create both ConfigMaps before the Deployment, since the Deployment's volumes reference them by name.
kubectl create -f alert-cm.yaml
kubectl create -f alert-tem.yaml
kubectl create -f alert-de.yaml
5.告警如下: