alertmanager第三方告警插件使用之钉钉告警

野猪佩挤

已于 2022-04-13 22:50:16 修改

阅读量4.4k

点赞数

分类专栏： Prometheus 文章标签： devops

于 2020-11-27 14:01:39 首次发布

本文链接：https://blog.csdn.net/weixin_42562106/article/details/110229104

版权

Prometheus 专栏收录该内容

18 篇文章 2 订阅

订阅专栏

1.安装第三方告警插件

配置文件

[root@k8s-60 aler]# cat app.conf
#---------------------↓全局配置-----------------------
appname = PrometheusAlert
login_user=prometheusalert
#登录密码
login_password=prometheusalert
httpaddr = "0.0.0.0"
#监听端口
httpport = 8080
runmode = dev
#设置代理 proxy = http://123.123.123.123:8080
proxy =
#开启JSON请求
copyrequestbody = true
#告警消息标题
title=PrometheusAlert
#链接到告警平台地址
GraylogAlerturl=http://graylog.org
#钉钉告警 告警logo图标地址
logourl=https://raw.githubusercontent.com/feiyu563/PrometheusAlert/master/doc/alert-center.png
#钉钉告警 恢复logo图标地址
rlogourl=https://raw.githubusercontent.com/feiyu563/PrometheusAlert/master/doc/alert-center.png
#短信告警级别(等于3就进行短信告警) 告警级别定义 0 信息,1 警告,2 一般严重,3 严重,4 灾难
messagelevel=3
#电话告警级别(等于4就进行语音告警) 告警级别定义 0 信息,1 警告,2 一般严重,3 严重,4 灾难
phonecalllevel=4
#默认拨打号码(页面测试短信和电话功能需要配置此项)
defaultphone=xxxxxxxx
#故障恢复是否启用电话通知0为关闭,1为开启
phonecallresolved=0
#自动告警抑制(自动告警抑制是默认同一个告警源的告警信息只发送告警级别最高的第一条告警信息,其他消息默认屏蔽,这么做的目的是为了减少相同告警来源的消息数量,防止告警炸弹,0为关闭,1为开启)
silent=0
#是否前台输出file or console
logtype=file
#日志文件路径
logpath=logs/prometheusalertcenter.log
#转换Prometheus,graylog告警消息的时区为CST时区(如默认已经是CST时区，请勿开启)
prometheus_cst_time=1
#以上配置是必须要有
#---------------------↓webhook-----------------------
#是否开启钉钉告警通道,可同时开始多个通道0为关闭,1为开启
open-dingding=1
#默认钉钉机器人地址
ddurl=https://oapi.dingtalk.com/robot/send?access_token=1a049fe40f02b614da6dba1d85d908b34b9dc452090380c0345548fad0b54404
#是否开启 @所有人(0为关闭,1为开启)
dd_isatall=0

#是否开启微信告警通道,可同时开始多个通道0为关闭,1为开启
open-weixin=1
#默认企业微信机器人地址
wxurl=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxxxx

容器版

docker run -d -p 88:8080 -v /config/prometheusalert-center:/app/conf --name prometheusalert-center feiyu563/prometheus-alert:latest

k8s版

kubectl create configmap my-alert-conf --from-file=/opt/aler/app.conf
[root@k8s-60 aler]# kubectl get cm | grep my
my-alert-conf   1      45m

yaml模板

# Write the PrometheusAlert manifest to feiyu563.yaml: a single-replica
# Deployment that mounts the ConfigMap "my-alert-conf" read-only at /app/conf/
# and the host's /etc/localtime, plus a NodePort Service exposing container
# port 8080 on node port 18080.
cat <<END> feiyu563.yaml 
apiVersion: apps/v1
kind: Deployment
metadata:
  name: alert
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheusalert
  template:
    metadata:
      labels:
        app: prometheusalert
    spec:
      containers:
      - name: prometheusalert
        image: feiyu563/prometheus-alert:latest 
        ports:
        - containerPort: 8080
          name: http
        volumeMounts:
          - name: conf
            mountPath: /app/conf/
            readOnly: true
          - name: date-config
            mountPath: /etc/localtime
      volumes:
      - name: conf
        configMap:
          name: my-alert-conf
      - name: date-config
        hostPath:
          path: /etc/localtime
---
apiVersion: v1
kind: Service
metadata:
  name: alert
spec:
  selector:
      app: prometheusalert
  ports:
    - name: http
      port: 8080
      protocol: TCP
      nodePort: 18080
      targetPort: 8080
  type: NodePort
END

能够登录 Web 界面并测试通过，即表示部署成功
在这里插入图片描述

2.安装alertmanager

官网下载软件包

wget https://github.com/prometheus/alertmanager/releases/download/v0.21.0/alertmanager-0.21.0.linux-amd64.tar.gz

配置

cat alertmanager.yml 

# Alertmanager configuration: route every alert to the PrometheusAlert webhook.
global:
  resolve_timeout: 5m

route:
  group_by: ['instance']
  group_wait: 10s
  group_interval: 10s
  repeat_interval: 1m
  receiver: 'web.hook.prometheusalert'
receivers:
- name: 'web.hook.prometheusalert'
  webhook_configs:
  # Address of the third-party alert plugin (PrometheusAlert): the same
  # address as its web UI, and the path suffix must be /prometheus/alert.
  - url: 'http://172.16.0.61:18080/prometheus/alert'
  # When running inside Kubernetes, use the Service address instead:
  #- url: 'http://alert:8080/prometheus/alert'

语法检查

 ./amtool check-config alertmanager.yml

启动

nohup ./alertmanager --config.file=./alertmanager.yml &

docker容器版

docker run --name alertmanger -d  -p 9093:9093  -v /newmoni/alertmanager.yml:/etc/alertmanager/alertmanager.yml -v /etc/localtime:/etc/localtime:ro -v /etc/timezone:/etc/timezone:ro prom/alertmanager:latest

K8S版

# The ConfigMap must be named "alertmanager": that is the name the Deployment's
# "conf" volume references. Its key (manager.yml) is what --config.file reads
# from /etc/alertmanager/.
kubectl create configmap alertmanager --from-file=/opt/aler/manager.yml

YAML模板

# Write the Alertmanager manifest to alert-manget.yaml: a single-replica
# Deployment that runs /bin/alertmanager with
# --config.file=/etc/alertmanager/manager.yml, mounting the ConfigMap named
# "alertmanager" read-only at /etc/alertmanager/ (so that ConfigMap must
# contain a manager.yml key), plus a NodePort Service exposing port 9093 on
# node port 19093.
cat <<END>alert-manget.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: my-alert3
spec:
  replicas: 1
  selector:
    matchLabels:
      app: my-alert32
  template:
    metadata:
      labels:
        app: my-alert32
    spec:
      containers:
      - name: my-alert32
        image: prom/alertmanager:latest 
        command: 
        - "/bin/alertmanager"
        args: 
        - "--config.file=/etc/alertmanager/manager.yml"
        ports:
        - containerPort: 9093
          name: http
        volumeMounts:
          - name: conf
            mountPath: /etc/alertmanager/
            readOnly: true
          - name: dates
            mountPath: /etc/localtime
      volumes:
      - name: conf
        configMap:
          name: alertmanager
      - name: dates
        hostPath:
          path: /etc/localtime
---
apiVersion: v1
kind: Service
metadata:
  name: my-alert3
spec:
  selector:
      app: my-alert32
  ports:
    - name: http
      port: 9093
      protocol: TCP
      nodePort: 19093
      targetPort: 9093
  type: NodePort
END

能够访问下面的 Web UI，即表示部署成功
在这里插入图片描述

最后 prometheus服务端

配置文件

[root@docker63 ~]# cat /monit/prometheus1.yml 
# Prometheus server configuration: where to send alerts, which rule files to
# load, and which targets to scrape.
global:
alerting:
  alertmanagers:
  - static_configs:
    - targets: ['172.16.0.18:9093']
rule_files:
  # Alerting rule files.
  - "/opt/*.yml"
scrape_configs:
  - job_name: 'linux'
    file_sd_configs:
      - files: ['/prometheus/*.yml']
        refresh_interval: 5s

  - job_name: 'prometheus'
    static_configs:
    - targets: ['localhost:9090']
  - job_name: 'win7'
    static_configs:
    - targets: ['172.16.0.8:9182']

  - job_name: 'linus'
    static_configs:
    - targets: ['172.16.0.60:9100','172.16.0.61:9100']

告警规则

[root@docker63 ~]# cat /rule/ru.yml
# Alerting rules. Each entry under `groups` is one rule group; its `rules`
# list must be nested inside the group item, not placed at the top level.
groups:
- name: linux
  rules:
  # Target has been unreachable (up == 0) for one minute.
  - alert: Node-Down
    expr: up == 0
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "Node has been down"
      description: "has been down "

  # Memory usage above 80%.
  - alert: "内存使用率过高"
    expr: round(100- node_memory_MemAvailable_bytes/node_memory_MemTotal_bytes*100) > 80
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "内存使用率过高"
      description: "当前使用率{{ $value }}%"

  # CPU usage above 80%, averaged per instance/job; instances matching bac-.* are excluded.
  - alert: "CPU使用率过高"
    expr: round(100 - ((avg by (instance,job)(irate(node_cpu_seconds_total{mode="idle",instance!~'bac-.*'}[5m]))) *100)) > 80
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: "CPU使用率过高"
      description: "当前使用率{{ $value }}%"

  # Disk usage above 80% on ext4/xfs filesystems.
  - alert: "磁盘使用率过高"
    expr: round(100-100*(node_filesystem_avail_bytes{fstype=~"ext4|xfs"} / node_filesystem_size_bytes{fstype=~"ext4|xfs"})) > 80
    for: 15s
    labels:
      severity: warning
    annotations:
      summary: "磁盘使用率过高"
      description: "当前磁盘{{$labels.mountpoint}} 使用率{{ $value }}%"

  # Less than 10 GB available on an ext4/xfs partition (excluding /boot* and testnode).
  - alert: "分区容量过低"
    expr: round(node_filesystem_avail_bytes{fstype=~"ext4|xfs",instance!~"testnode",mountpoint!~"/boot.*"}/1024/1024/1024) < 10
    for: 15s
    labels:
      severity: warning
    annotations:
      summary: "分区容量过低"
      description: "当前分区{{$labels.mountpoint}} 容量{{ $value }}GB"

  # Outbound network rate above 2048 KB/s. The alert is about outbound traffic,
  # so it uses the transmit counter rather than the receive counter.
  - alert: "网络流出速率过高"
    expr: round(irate(node_network_transmit_bytes_total{instance!~"data.*",device!~'tap.*|veth.*|br.*|docker.*|vir.*|lo.*|vnet.*'}[1m])/1024) > 2048
    for: 1m
    labels:
      severity: warning
    annotations:
      summary: "网络流出速率过高"
      description: "当前速率{{ $value }}KB/s"

最后告警

在这里插入图片描述

prometheus容器启动方式

# Start Prometheus in a container.
#   -v /rule:/opt                        directory holding the alerting rule files
#   --storage.tsdb.retention.time=100d   keep 100 days of data
# Comments must not follow a trailing backslash: "\ #..." ends the line
# continuation and truncates the command at that point.
docker run -d \
  -p 9090:9090 --name prometheus \
  -v /monit/prometheus1.yml:/etc/prometheus/prometheus.yml \
  -v /opt/prometheus/data:/prometheus \
  -v /rule:/opt \
  prom/prometheus:latest \
  --config.file=/etc/prometheus/prometheus.yml \
  --storage.tsdb.retention.time=100d \
  --web.enable-lifecycle

node_exporter安装

# 创建用户
groupadd -r prometheus
useradd -r -g prometheus -s /sbin/nologin -M -c "prometheus Daemons" prometheus

编辑/usr/lib/systemd/system/node_exporter.service

# Write the systemd unit for node_exporter: runs /usr/local/bin/node_exporter
# as user/group "prometheus", ordered after network.target, and wanted by
# multi-user.target.
cat <<END> /usr/lib/systemd/system/node_exporter.service
[Service]
User=prometheus
Group=prometheus
ExecStart=/usr/local/bin/node_exporter
 
[Install]
WantedBy=multi-user.target
 
[Unit]
Description=node_exporter
After=network.target
END

# Reload systemd so it picks up the unit file on disk, then start the service,
# enable it at boot, and show its status.
systemctl daemon-reload
systemctl start node_exporter
systemctl enable node_exporter
systemctl status node_exporter.service