Prometheus Notes -- Alertmanager Email Alerting

1. Alertmanager

Deploy Alertmanager

Download the binary from the Prometheus download page: https://prometheus.io/download/


Download and install Alertmanager

wget https://github.com/prometheus/alertmanager/releases/download/v0.26.0/alertmanager-0.26.0.linux-amd64.tar.gz

tar -xzvf alertmanager-0.26.0.linux-amd64.tar.gz -C /usr/local

Rename the directory and set up a systemd unit to manage Alertmanager

cd /usr/local && mv alertmanager-0.26.0.linux-amd64 alertmanager
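
A quick sanity check before wiring up systemd (path assumes the rename above):

/usr/local/alertmanager/alertmanager --version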

vim /usr/lib/systemd/system/alertmanager.service
[Unit]
Description=Alertmanager
Documentation=https://github.com/prometheus/alertmanager/releases/
After=network.target

[Service]
# Alertmanager working directory
WorkingDirectory=/usr/local/alertmanager/
# Path to the Alertmanager binary
ExecStart=/usr/local/alertmanager/alertmanager
ExecReload=/bin/kill -HUP $MAINPID
ExecStop=/bin/kill -KILL $MAINPID
Type=simple
KillMode=control-group
# Restart automatically on failure
Restart=on-failure
RestartSec=15s

[Install]
WantedBy=multi-user.target

# Edit the Alertmanager configuration file

vim alertmanager.yml

global:
  resolve_timeout: 5m
  smtp_smarthost: 'smtp.163.com:25'
  smtp_from: 'xxxx' # mailbox that sends the alert emails
  smtp_auth_username: 'xxxx' # SMTP auth username (the sending mailbox account)
  smtp_auth_password: 'xxxx' # mailbox authorization code
  smtp_require_tls: false

templates: # notification templates
  - '/usr/local/alertmanager/template/email.tmpl' # template path

route:
  receiver: mail1
  group_by: ['alertname']
  group_wait: 1m  # initial wait before sending a notification for a new group
  group_interval: 2m # wait before sending notifications for new alerts in an existing group
  repeat_interval: 1h # wait before re-sending a notification that was already sent
  routes:
  - receiver: mail2 # sub-route receiver
    match_re: # regex match against labels set in the Prometheus rules
      db: sql
    repeat_interval: 1h # repeat interval for this sub-route
receivers:
  - name: mail1
    email_configs:
    - send_resolved: true
      to: xxxx
  - name: "mail2"
    email_configs:
    - send_resolved: true
      to: xxxx
inhibit_rules:  # inhibition rules
  - source_match:  # while a Disaster alert is firing, suppress matching warning alerts
      severity: Disaster
    target_match:
      severity: warning
    equal:
    - alertmanager
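
Before (re)starting, the configuration can be validated with the amtool utility that ships in the Alertmanager tarball (paths assume the layout above):

/usr/local/alertmanager/amtool check-config /usr/local/alertmanager/alertmanager.yml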

# Alert email template

cat /usr/local/alertmanager/template/email.tmpl
{{ define "wechat.default.message" }}
{{- if gt (len .Alerts.Firing) 0 -}}
{{- range $index, $alert := .Alerts -}}
{{- if eq $index 0 }}
========= xxx environment monitoring alert =========
Alert status: {{ .Status }}
Alert severity: {{ .Labels.severity }}
Alert name: {{ $alert.Labels.alertname }}
Affected host: {{ $alert.Labels.instance }} {{ $alert.Labels.pod }}
Alert summary: {{ $alert.Annotations.summary }}
Alert details: {{ $alert.Annotations.message }}{{ $alert.Annotations.description }}
Trigger value: {{ .Annotations.value }}
Start time: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
========= end =========
{{- end }}
{{- end }}
{{- end }}
{{- if gt (len .Alerts.Resolved) 0 -}}
{{- range $index, $alert := .Alerts -}}
{{- if eq $index 0 }}
========= xxx environment alert resolved =========
Alert name: {{ .Labels.alertname }}
Alert status: {{ .Status }}
Alert summary: {{ $alert.Annotations.summary }}
Alert details: {{ $alert.Annotations.message }}{{ $alert.Annotations.description }}
Start time: {{ ($alert.StartsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
Resolved time: {{ ($alert.EndsAt.Add 28800e9).Format "2006-01-02 15:04:05" }}
{{- if gt (len $alert.Labels.instance) 0 }}
Instance: {{ $alert.Labels.instance }}
{{- end }}
========= end =========
{{- end }}
{{- end }}
{{- end }}
{{- end }}
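
Note: the templates entry above only loads the file; for the email body to actually use it, each receiver should reference the defined template name, e.g. via the html field of email_configs. A minimal sketch (receiver name and placeholder address taken from the config above):

receivers:
  - name: mail1
    email_configs:
    - send_resolved: true
      to: xxxx
      html: '{{ template "wechat.default.message" . }}'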

# Start Alertmanager

systemctl daemon-reload
systemctl start alertmanager
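
To confirm Alertmanager came up cleanly (9093 is its default web/API port):

systemctl enable alertmanager   # optional: start on boot
systemctl status alertmanager
curl http://localhost:9093/-/healthy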

2. Prometheus

# Configure Prometheus

prometheus.yml

alerting:
  alertmanagers:
  - static_configs:
    - targets:
      - 192.168.178.129:9093

# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
rule_files: # alerting rule files
  - "rules/*_rules.yml"
  - "rules/*_alerts.yml"

rules/node_alerts.yml # "for" is how long the condition must hold before firing; "expr" is the PromQL condition

groups:
- name: Host status alerts
  rules:
  - alert: HostStatus
    expr: up == 0
    for: 1m
    labels:
      status: Disaster
    annotations:
      summary: "{{$labels.instance}}: server is down"
      description: "{{$labels.instance}}: server has been unreachable for more than 1 minute"

- name: Instance availability alerts
  rules:
  - alert: InstanceDown
    expr: up{job="prometheus"} == 0 or up{job="Linux-host"} == 0
    for: 1m
    labels:
      user: prometheus
      severity: Disaster
    annotations:
      summary: "Instance {{ $labels.instance }} is down"
      description: "Instance {{ $labels.instance }} of job {{ $labels.job }} has been down for more than 1 minutes."
      value: "{{ $value }}"

- name: Memory alert rules
  rules:
  - alert: "MemoryUsageAlert"
    expr: (node_memory_MemTotal_bytes - (node_memory_MemFree_bytes+node_memory_Buffers_bytes+node_memory_Cached_bytes )) / node_memory_MemTotal_bytes * 100 > 80
    for: 1m
    labels:
      user: prometheus
      severity: warning
      db: sql
    annotations:
      summary: "服务器: {{$labels.alertname}} 内存报警"
      description: "{{ $labels.alertname }} 内存资源利用率大于75%!(当前值: {{ $value }}%)"
      value: "{{ $value }}"

- name: CPU alert rules
  rules:
  - alert: CPUUsageAlert
    expr: 100 - (avg by (instance)(irate(node_cpu_seconds_total{mode="idle"}[1m]) )) * 100 > 80
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      summary: "服务器: {{$labels.alertname}} CPU报警"
      description: "服务器: CPU使用超过70%!(当前值: {{ $value }}%)"
      value: "{{ $value }}"

- name: Disk alert rules
  rules:
  - alert: DiskUsageAlert
    expr: (node_filesystem_size_bytes - node_filesystem_avail_bytes) / node_filesystem_size_bytes * 100 > 40
    for: 1m
    labels:
      user: prometheus
      severity: warning
    annotations:
      summary: "服务器: {{$labels.alertname}} 磁盘报警"
      description: "服务器:{{$labels.alertname}},磁盘设备: 使用超过80%!(挂载点: {{ $labels.mountpoint }} 当前值: {{ $value }}%)"
      value: "{{ $value }}"

After modifying the Prometheus configuration, reload Prometheus (this assumes Prometheus was started with the --web.enable-lifecycle flag, which enables the reload endpoint):

curl -XPOST http://localhost:9090/-/reload
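
Once a rule fires, active alerts can be checked on the Prometheus /alerts page or queried from Alertmanager with amtool (URL matches the alerting target configured above):

/usr/local/alertmanager/amtool alert query --alertmanager.url=http://192.168.178.129:9093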
