github地址:
https://github.com/prometheus-community/helm-charts/tree/main/charts/kube-prometheus-stack
artifacthub地址:
https://artifacthub.io/packages/helm/prometheus-community/kube-prometheus-stack
关联需求:
1、helm 3+
2、kubernetes 1.16+
一、添加仓库
# Add the prometheus-community Helm chart repository
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
# Refresh the local repository index
helm repo update
# List the configured repositories
helm repo list
二、部署
# Set variables: the chart name and the chart version to pin
pro=kube-prometheus-stack
chart_version=23.3.2
# Create a per-chart working directory and switch into it
mkdir -p /data/$pro
cd /data/$pro
# Download the chart archive at the pinned version
helm pull prometheus-community/$pro --version=$chart_version
# Extract only values.yaml from the archive into the current directory
tar zxvf $pro-$chart_version.tgz --strip-components 1 $pro/values.yaml
# Generate start.sh. The heredoc delimiter is unquoted, so $pro and
# $chart_version are expanded now and the script contains their literal values;
# the backslash-newline pairs are also consumed, leaving helm install on one line.
# The script creates the monitoring namespace if it is missing, then installs
# the release from the local archive with values.yaml.
# It refers to the archive and values.yaml by relative path, so it has to be
# run from /data/$pro.
cat > /data/$pro/start.sh << EOF
kubectl get ns monitoring||kubectl create ns monitoring
helm install $pro $pro-$chart_version.tgz \
-f values.yaml \
-n monitoring
EOF
三、更新脚本
# Helm 3 upgrade: generate upgrade.sh.
# The heredoc delimiter is quoted ('EOF'), so the body is written verbatim and
# $pro / $chart_version are expanded when the script runs, not now.
# The script upgrades the release from the local chart archive using
# values.yaml, then copies values.yaml to a backup named with `date +%F_%R`.
# Both files are referenced by relative path, so run it from the chart directory.
# NOTE(review): the copy runs after the upgrade and regardless of its exit
# status - confirm that is the intended backup behaviour.
cat > /data/kube-prometheus-stack/upgrade.sh << 'EOF'
pro=kube-prometheus-stack
chart_version=23.3.2
helm upgrade $pro $pro-$chart_version.tgz -f values.yaml -n monitoring
cp values.yaml values.yaml.bak_`date +%F_%R`
EOF
四、修改配置values.yaml
- ingress ---- grafana、prometheus、alertmanager
# Ingress overrides for values.yaml (kube-prometheus-stack).
# Each component's ingress block sits under that component's own top-level key;
# merge these settings into the existing sections of values.yaml rather than
# appending them, so that no top-level key is defined twice.

# alertmanager
alertmanager:
  ingress:
    enabled: true
    hosts:
      - alertmanager.fly.cn

# grafana (sub-chart; takes a single path next to the host list)
grafana:
  ingress:
    enabled: true
    hosts:
      - grafana.fly.cn
    path: /

# prometheus
prometheus:
  ingress:
    enabled: true
    hosts:
      - prometheus.fly.cn
- 持久化 ---- grafana、prometheus、alertmanager(生产环境需要配置持久化)
# Persistence overrides for values.yaml (kube-prometheus-stack).
# Merge these settings into the existing component sections of values.yaml
# rather than appending them, so that no top-level key is defined twice.

# alertmanager: the PVC template goes under alertmanagerSpec.storage
alertmanager:
  alertmanagerSpec:
    storage:
      volumeClaimTemplate:
        spec:
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 20Gi

# grafana (sub-chart): persistence is configured directly under grafana
grafana:
  persistence:
    type: pvc
    enabled: true
    accessModes:
      - ReadWriteOnce
    size: 10Gi

# prometheus: the PVC template goes under prometheusSpec.storageSpec
prometheus:
  prometheusSpec:
    storageSpec:
      volumeClaimTemplate:
        spec:
          accessModes: ["ReadWriteOnce"]
          resources:
            requests:
              storage: 50Gi
# Register the prometheus-community chart repository
helm repo add prometheus-community https://prometheus-community.github.io/helm-charts
# Search the configured repositories for charts matching "prometheus"
helm search repo prometheus
# Download the chart archive
# NOTE(review): no --version is given here, so the downloaded archive may not
# be the 15.2.1 file installed below - confirm the file name before installing.
helm pull prometheus-community/kube-prometheus-stack
# Create the target namespace
kubectl create ns monitoring
# Install the local archive as release "prometheus" in the monitoring namespace
helm install prometheus kube-prometheus-stack-15.2.1.tgz --namespace monitoring
# Get the Grafana password: dump the prometheus-grafana secret as YAML, keep the
# admin-password line, take its second field and base64-decode it
kubectl get secrets -n monitoring prometheus-grafana -o yaml|grep admin-password|grep -v '{}'|awk '{print $2}'|base64 -d
在 Grafana 中导入 dashboard(ID:13105)
helm更新
# Upgrade release "prometheus" in the monitoring namespace from the local
# 15.4.4 chart archive, applying the values file at kube-prometheus-stack/values.yaml
helm upgrade prometheus -n monitoring kube-prometheus-stack-15.4.4.tgz -f kube-prometheus-stack/values.yaml
alertmanager 通过企微发送警报
# Deployment running the webhook-adapter container in the monitoring namespace.
# The active --adapter argument registers a WeChat Work (wx) adapter; the
# DingTalk adapter argument is kept below as a commented-out example.
# NOTE(review): the webhook key is embedded in the container args and the image
# has no tag - consider a Secret and a pinned tag before using this in production.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    run: prometheus-webhook-dingtalk
  name: prometheus-webhook-dingtalk
  namespace: monitoring
spec:
  selector:
    matchLabels:
      run: prometheus-webhook-dingtalk
  template:
    metadata:
      labels:
        run: prometheus-webhook-dingtalk
    spec:
      containers:
        - args:
            # - --adapter=/app/prometheusalert/dingtalk.js=/adapter/dingtalk=https://oapi.dingtalk.com/robot/send?access_token={token}#{secret}
            - --adapter=/app/prometheusalert/wx.js=/adapter/wx=https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=3e718f0e-4fe9-4b44-aab6-zzzzzzzzz
          image: registry.cn-hangzhou.aliyuncs.com/guyongquan/webhook-adapter
          name: prometheus-webhook-dingtalk
          ports:
            - containerPort: 80
              protocol: TCP
---
# ClusterIP Service in front of the webhook adapter pods: exposes port 8060 and
# forwards to container port 80 on pods labelled run=prometheus-webhook-dingtalk.
apiVersion: v1
kind: Service
metadata:
  labels:
    run: prometheus-webhook-dingtalk
  name: prometheus-webhook-dingtalk
  namespace: monitoring
spec:
  ports:
    - port: 8060
      protocol: TCP
      targetPort: 80
  selector:
    run: prometheus-webhook-dingtalk
  type: ClusterIP
alertmanager config
# Alertmanager receiver that posts alerts to the webhook adapter Service
# (port 8060). Resolved notifications are disabled for both endpoints.
# NOTE(review): the /adapter/dingtalk endpoint is only served if the adapter is
# started with a matching dingtalk --adapter argument - confirm it is enabled,
# or drop that entry.
receivers:
  - name: allreceivers
    webhook_configs:
      - url: http://prometheus-webhook-dingtalk.monitoring.svc.cluster.local:8060/adapter/dingtalk
        send_resolved: false
      - url: http://prometheus-webhook-dingtalk.monitoring.svc.cluster.local:8060/adapter/wx
        send_resolved: false
webhook adapter代码库 https://github.com/guyongquan/webhook-adapter
ServiceMonitor 模板
# ServiceMonitor template: scrapes the "metrics" port of Services in the base
# namespace that carry the labels app=redis and release=redis.
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    app: redis
    # This label associates the object with the kube-prometheus-stack release
    # (presumably the Helm release name, "prometheus" - confirm for your install).
    release: prometheus
  name: redis
  namespace: base
spec:
  endpoints:
    - port: metrics  # corresponds to the name of the metrics port
  namespaceSelector:
    matchNames:
      - base  # namespace where the Service lives
  selector:
    matchLabels:
      app: redis  # matches the Service's label
      release: redis  # matches the Service's label
PrometheusRule 模板
# PrometheusRule template: a "redis" rule group with three critical alerts
# (instance down, no master, more than one master). All fire immediately (for: 0m).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    # This label associates the object with the kube-prometheus-stack release
    # (presumably the Helm release name, "prometheus" - confirm for your install).
    release: prometheus
    app: kube-prometheus-stack
    app.kubernetes.io/instance: prometheus
  name: prometheus-kube-prometheus-redis.rules
  namespace: monitoring
spec:
  groups:
    - name: redis
      rules:
        - alert: RedisDown
          expr: redis_up == 0
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: Redis down (instance {{ $labels.instance }})
            description: "Redis instance is down\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
        # NOTE(review): count() aggregates labels away, so $labels.instance
        # renders empty in the two summaries below.
        - alert: RedisMissingMaster
          expr: (count(redis_instance_info{role="master"}) or vector(0)) < 1
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: Redis missing master (instance {{ $labels.instance }})
            description: "Redis cluster has no node marked as master.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
        - alert: RedisTooManyMasters
          expr: count(redis_instance_info{role="master"}) > 1
          for: 0m
          labels:
            severity: critical
          annotations:
            summary: Redis too many masters (instance {{ $labels.instance }})
            description: "Redis cluster has too many nodes marked as master.\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
相关的警报可参考https://awesome-prometheus-alerts.grep.to/
https://github.com/3scale-ops/prometheus-exporter-operator/tree/main/prometheus-rules