Prometheus安装
- K8S集群版本:v1.19
- 华为云CCE平台
- 飞书机器人通知
1 参考kube-prometheus
文档:https://github.com/prometheus-operator/kube-prometheus/tree/release-0.7
1.1 下载kube-prometheus
git clone https://github.com/prometheus-operator/kube-prometheus.git
# 在克隆下来的仓库目录中切换分支(-C 指定仓库路径,当前目录保持不变)
git -C kube-prometheus checkout release-0.7
1.2 根据不同的服务创建文件夹
方便区分
- alertmanager
- kube-state-metrics
- node-exporter
- prometheus
- prometheusalert
- loki
- loki-promtail
cp -rf kube-prometheus/manifests ~/mywork
cd ~/mywork
mkdir alertmanager kube-state-metrics node-exporter prometheus prometheusalert loki loki-promtail
cp prometheus-* prometheus
cp node-exporter-* node-exporter
cp kube-state-metrics-* kube-state-metrics
cp alertmanager-* alertmanager
2 修改配置文件
2.1 拷贝命名空间yaml文件
# 命名空间单独创建
# kubectl create namespace monitoring
mv setup/0namespace-namespace.yaml ./
2.2 alertmanager中的配置文件修改
# 修改这个文件 alertmanager/alertmanager-secret.yaml
# webhook_configs中的飞书机器人地址填写成自己需要的
cat alertmanager/alertmanager-secret.yaml
apiVersion: v1
kind: Secret
metadata:
name: alertmanager-main
namespace: monitoring
stringData:
alertmanager.yaml: |-
global:
resolve_timeout: 5m
route:
group_by: [...]
group_wait: 20s
group_interval: 10m
repeat_interval: 5h
receiver: 'default'
routes:
- receiver: 'null'
match:
alertname: Watchdog
- receiver: 'feishu'
group_wait: 10s
match_re:
severity: (info|warning|critical)
receivers:
- name: 'null'
- name: 'feishu'
webhook_configs:
- url: 'http://prometheusalert:8080/prometheusalert?type=fs&tpl=prometheus-fsv2-def&fsurl=https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxx'
send_resolved: true
- name: 'default'
webhook_configs:
- url: 'http://prometheusalert:8080/prometheusalert?type=fs&tpl=prometheus-fsv2-def&fsurl=https://open.feishu.cn/open-apis/bot/v2/hook/xxxxxxxxxxxxxx'
send_resolved: true
inhibit_rules:
- source_match:
severity: 'critical'
target_match:
severity: 'warning'
equal: ['alertname', 'dev', 'instance']
type: Opaque
2.3 prometheus中的配置文件修改
2.3.1 修改prometheus/prometheus-prometheus.yaml
# 修改数据保留时间
# 增加retention: "15d"字段如下
spec:
alerting:
alertmanagers:
- name: alertmanager-main
namespace: monitoring
port: web
retention: "15d" #增加的字段
# 修改数据挂载
# 使用的是华为云CCE的动态插件,storage字段开始的部分追加上去。如下
spec:
alerting:
alertmanagers:
- name: alertmanager-main
namespace: monitoring
port: web
retention: "15d"
storage:
volumeClaimTemplate:
kind: PersistentVolumeClaim
apiVersion: v1
metadata:
name: pvc-evs-prometheus-storage
creationTimestamp: null
annotations:
everest.io/disk-volume-type: SAS
spec:
accessModes:
- ReadWriteOnce
resources:
requests:
storage: 100Gi
storageClassName: csi-disk
volumeMode: Filesystem
2.3.2 修改prometheus/prometheus-rules.yaml
# vim prometheus/prometheus-rules.yaml
# 把message替换成description,这个是为了跟告警模板匹配,或者调整告警模板
:%s/message/description/gc
# 第二部分:修改文件中 alert 为 TargetDown 的规则
# expr 增加instance如下
expr: 100 * (count(up == 0) BY (job, namespace, service, instance) / count(up) BY (job, namespace, service, instance)) > 10
# description 修改如下,为了让告警内容更美观一些
description: 'the {{ $labels.job }}/{{ $labels.service }} targets in {{ $labels.namespace }} namespace are down.'
2.4 增加prometheusalert的配置文件
参考文档 https://github.com/feiyu563/PrometheusAlert
2.4.1 prometheusalert-configmap.yaml
cat prometheusalert/prometheusalert-configmap.yaml
---
apiVersion: v1
data:
app.conf: |-
appname = PrometheusAlert
login_user=admin
login_password=admin@2022
httpaddr = "0.0.0.0"
httpport = 8080
runmode = dev
proxy =
copyrequestbody = true
title=PrometheusAlert
GraylogAlerturl=http://graylog.org
logourl=https://raw.githubusercontent.com/feiyu563/PrometheusAlert/master/doc/alert-center.png
rlogourl=https://raw.githubusercontent.com/feiyu563/PrometheusAlert/master/doc/alert-center.png
messagelevel=3
phonecalllevel=4
defaultphone=xxxxxxxx
phonecallresolved=0
silent=0
logtype=file
logpath=logs/prometheusalertcenter.log
prometheus_cst_time=0
db_driver=sqlite3
AlertRecord=0
open-dingding=1
ddurl=https://oapi.dingtalk.com/robot/send?access_token=xxxxx
dd_isatall=1
open-weixin=1
wxurl=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxxxx
open-feishu=1
fsurl=https://open.feishu.cn/open-apis/bot/hook/xxxxxxxxx
open-txdx=0
TXY_DX_appkey=xxxxx
TXY_DX_tpl_id=xxxxx
TXY_DX_sdkappid=xxxxx
TXY_DX_sign=腾讯云
open-txdh=0
TXY_DH_phonecallappkey=xxxxx
TXY_DH_phonecalltpl_id=xxxxx
TXY_DH_phonecallsdkappid=xxxxx
open-hwdx=0
HWY_DX_APP_Key=xxxxxxxxxxxxxxxxxxxxxx
HWY_DX_APP_Secret=xxxxxxxxxxxxxxxxxxxxxx
HWY_DX_APP_Url=https://rtcsms.cn-north-1.myhuaweicloud.com:10743
HWY_DX_Templateid=xxxxxxxxxxxxxxxxxxxxxx
HWY_DX_Signature=华为云
HWY_DX_Sender=xxxxxxxxxx
open-alydx=0
ALY_DX_AccessKeyId=xxxxxxxxxxxxxxxxxxxxxx
ALY_DX_AccessSecret=xxxxxxxxxxxxxxxxxxxxxx
ALY_DX_SignName=阿里云
ALY_DX_Template=xxxxxxxxxxxxxxxxxxxxxx
open-alydh=0
ALY_DH_AccessKeyId=xxxxxxxxxxxxxxxxxxxxxx
ALY_DH_AccessSecret=xxxxxxxxxxxxxxxxxxxxxx
ALY_DX_CalledShowNumber=xxxxxxxxx
ALY_DH_TtsCode=xxxxxxxx
RLY_DH_open-rlydh=0
RLY_URL=https://app.cloopen.com:8883/2013-12-26/Accounts/
RLY_ACCOUNT_SID=xxxxxxxxxxx
RLY_ACCOUNT_TOKEN=xxxxxxxxxx
RLY_APP_ID=xxxxxxxxxxxxx
open-email=1
Email_host=smtp.exmail.qq.com
Email_port=465
Email_user=dxli@iecas.cn
Email_password=xxxxxxxx
Email_title=运维告警
Default_emails=dxli@iecas.cn
open-7moordx=0
7MOOR_ACCOUNT_ID=Nxxx
7MOOR_ACCOUNT_APISECRET=xxx
7MOOR_DX_TEMPLATENUM=n
open-7moordh=0
7MOOR_WEBCALL_SERVICENO=xxx
7MOOR_WEBCALL_VOICE_VAR=text
open-tg=0
TG_TOKEN=xxxxx
TG_MODE_CHAN=0
TG_USERID=xxxxx
TG_CHANNAME=xxxxx
open-workwechat=0