Deploying Prometheus with Kube-prometheus
Deploy Prometheus
Download kube-prometheus
wget -c https://github.com/prometheus-operator/kube-prometheus/archive/refs/tags/v0.11.0.tar.gz
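After the download finishes, extract the archive and run the remaining commands from inside the extracted directory (the directory name below assumes the default layout of the GitHub v0.11.0 tarball):
tar -zxvf v0.11.0.tar.gz
cd kube-prometheus-0.11.0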
List the images used
grep -rn 'image: '
examples/example-app/example-app.yaml:36: image: quay.io/fabxc/prometheus_demo_service
examples/thanos-sidecar.jsonnet:11: image: 'quay.io/thanos/thanos:v0.19.0',
experimental/metrics-server/metrics-server-deployment.yaml:21: image: gcr.io/google_containers/metrics-server-amd64:v0.2.0
manifests/alertmanager-alertmanager.yaml:13: image: quay.io/prometheus/alertmanager:v0.24.0
manifests/blackboxExporter-deployment.yaml:33: image: quay.io/prometheus/blackbox-exporter:v0.21.0
manifests/blackboxExporter-deployment.yaml:60: image: jimmidyson/configmap-reload:v0.5.0
manifests/blackboxExporter-deployment.yaml:88: image: quay.io/brancz/kube-rbac-proxy:v0.12.0
manifests/grafana-deployment.yaml:33: image: grafana/grafana:8.5.5
manifests/kubeStateMetrics-deployment.yaml:35: image: k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.5.0
manifests/kubeStateMetrics-deployment.yaml:56: image: quay.io/brancz/kube-rbac-proxy:v0.12.0
manifests/kubeStateMetrics-deployment.yaml:82: image: quay.io/brancz/kube-rbac-proxy:v0.12.0
manifests/nodeExporter-daemonset.yaml:38: image: quay.io/prometheus/node-exporter:v1.3.1
manifests/nodeExporter-daemonset.yaml:74: image: quay.io/brancz/kube-rbac-proxy:v0.12.0
manifests/prometheus-prometheus.yaml:21: image: quay.io/prometheus/prometheus:v2.36.1
manifests/prometheusAdapter-deployment.yaml:40: image: k8s.gcr.io/prometheus-adapter/prometheus-adapter:v0.9.1
manifests/prometheusOperator-deployment.yaml:33: image: quay.io/prometheus-operator/prometheus-operator:v0.57.0
manifests/prometheusOperator-deployment.yaml:56: image: quay.io/brancz/kube-rbac-proxy:v0.12.0
Re-tag the images and push them to Harbor
# docker pull gcr.io/google_containers/metrics-server-amd64:v0.2.0
# Error response from daemon: Get "https://gcr.io/v2/": dial tcp 74.125.203.82:443: connect: connection timed out
# Use the Aliyun registry instead
docker pull registry.aliyuncs.com/google_containers/metrics-server-amd64:v0.2.0
docker pull quay.io/prometheus/alertmanager:v0.24.0
docker pull quay.io/prometheus/blackbox-exporter:v0.21.0
docker pull jimmidyson/configmap-reload:v0.5.0
docker pull quay.io/brancz/kube-rbac-proxy:v0.12.0
docker pull grafana/grafana:8.5.5
# docker pull k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.5.0
# Use the Aliyun registry instead
# docker pull registry.cn-hangzhou.aliyuncs.com/google_containers/kube-state-metrics/kube-state-metrics:v2.5.0
# docker pull registry.aliyuncs.com/google_containers/kube-state-metrics/kube-state-metrics:v2.5.0
# Pull from Docker Hub instead
docker pull landv1001/kube-state-metrics:v2.5.0
docker pull quay.io/brancz/kube-rbac-proxy:v0.12.0
docker pull quay.io/brancz/kube-rbac-proxy:v0.12.0
docker pull quay.io/prometheus/node-exporter:v1.3.1
docker pull quay.io/brancz/kube-rbac-proxy:v0.12.0
docker pull quay.io/prometheus/prometheus:v2.36.1
# docker pull k8s.gcr.io/prometheus-adapter/prometheus-adapter:v0.9.1
# Use the Aliyun registry instead
# docker pull registry.aliyuncs.com/google_containers/prometheus-adapter/prometheus-adapter:v0.9.1
# Pull from Docker Hub instead
docker pull v5cn/prometheus-adapter:v0.9.1
docker pull quay.io/prometheus-operator/prometheus-operator:v0.57.0
docker pull quay.io/brancz/kube-rbac-proxy:v0.12.0
docker tag registry.aliyuncs.com/google_containers/metrics-server-amd64:v0.2.0 10.83.195.8:1443/prometheus/metrics-server-amd64:v0.2.0
docker tag quay.io/prometheus/alertmanager:v0.24.0 10.83.195.8:1443/prometheus/alertmanager:v0.24.0
docker tag quay.io/prometheus/blackbox-exporter:v0.21.0 10.83.195.8:1443/prometheus/blackbox-exporter:v0.21.0
docker tag jimmidyson/configmap-reload:v0.5.0 10.83.195.8:1443/prometheus/configmap-reload:v0.5.0
docker tag quay.io/brancz/kube-rbac-proxy:v0.12.0 10.83.195.8:1443/prometheus/kube-rbac-proxy:v0.12.0
docker tag grafana/grafana:8.5.5 10.83.195.8:1443/prometheus/grafana:8.5.5
docker tag landv1001/kube-state-metrics:v2.5.0 10.83.195.8:1443/prometheus/kube-state-metrics:v2.5.0
docker tag quay.io/prometheus/node-exporter:v1.3.1 10.83.195.8:1443/prometheus/node-exporter:v1.3.1
docker tag quay.io/prometheus/prometheus:v2.36.1 10.83.195.8:1443/prometheus/prometheus:v2.36.1
docker tag v5cn/prometheus-adapter:v0.9.1 10.83.195.8:1443/prometheus/prometheus-adapter:v0.9.1
docker tag quay.io/prometheus-operator/prometheus-operator:v0.57.0 10.83.195.8:1443/prometheus/prometheus-operator:v0.57.0
docker push 10.83.195.8:1443/prometheus/metrics-server-amd64:v0.2.0
docker push 10.83.195.8:1443/prometheus/alertmanager:v0.24.0
docker push 10.83.195.8:1443/prometheus/blackbox-exporter:v0.21.0
docker push 10.83.195.8:1443/prometheus/configmap-reload:v0.5.0
docker push 10.83.195.8:1443/prometheus/kube-rbac-proxy:v0.12.0
docker push 10.83.195.8:1443/prometheus/grafana:8.5.5
docker push 10.83.195.8:1443/prometheus/kube-state-metrics:v2.5.0
docker push 10.83.195.8:1443/prometheus/node-exporter:v1.3.1
docker push 10.83.195.8:1443/prometheus/prometheus:v2.36.1
docker push 10.83.195.8:1443/prometheus/prometheus-adapter:v0.9.1
docker push 10.83.195.8:1443/prometheus/prometheus-operator:v0.57.0
docker pull bitnami/pushgateway:1.8.0
docker tag bitnami/pushgateway:1.8.0 10.83.195.8:1443/prometheus/pushgateway:1.8.0
docker push 10.83.195.8:1443/prometheus/pushgateway:1.8.0
docker pull quay.io/prometheus-operator/prometheus-config-reloader:v0.57.0
docker tag quay.io/prometheus-operator/prometheus-config-reloader:v0.57.0 10.83.195.8:1443/prometheus/prometheus-config-reloader:v0.57.0
docker push 10.83.195.8:1443/prometheus/prometheus-config-reloader:v0.57.0
Replace image references with the Harbor registry
# sed 's@quay.io/prometheus@10.83.195.8:1443/prometheus@g' manifests/alertmanager-alertmanager.yaml | grep '10.83.195.8:1443/prometheus'
sed -i 's@quay.io/prometheus/alertmanager:v0.24.0@10.83.195.8:1443/prometheus/alertmanager:v0.24.0@g' manifests/alertmanager-alertmanager.yaml
sed -i 's@quay.io/prometheus/blackbox-exporter:v0.21.0@10.83.195.8:1443/prometheus/blackbox-exporter:v0.21.0@g' manifests/blackboxExporter-deployment.yaml
sed -i 's@jimmidyson/configmap-reload:v0.5.0@10.83.195.8:1443/prometheus/configmap-reload:v0.5.0@g' manifests/blackboxExporter-deployment.yaml
sed -i 's@quay.io/brancz/kube-rbac-proxy:v0.12.0@10.83.195.8:1443/prometheus/kube-rbac-proxy:v0.12.0@g' manifests/blackboxExporter-deployment.yaml
sed -i 's@grafana/grafana:8.5.5@10.83.195.8:1443/prometheus/grafana:8.5.5@g' manifests/grafana-deployment.yaml
sed -i 's@k8s.gcr.io/kube-state-metrics/kube-state-metrics:v2.5.0@10.83.195.8:1443/prometheus/kube-state-metrics:v2.5.0@g' manifests/kubeStateMetrics-deployment.yaml
sed -i 's@quay.io/brancz/kube-rbac-proxy:v0.12.0@10.83.195.8:1443/prometheus/kube-rbac-proxy:v0.12.0@g' manifests/kubeStateMetrics-deployment.yaml
sed -i 's@quay.io/prometheus/node-exporter:v1.3.1@10.83.195.8:1443/prometheus/node-exporter:v1.3.1@g' manifests/nodeExporter-daemonset.yaml
sed -i 's@quay.io/brancz/kube-rbac-proxy:v0.12.0@10.83.195.8:1443/prometheus/kube-rbac-proxy:v0.12.0@g' manifests/nodeExporter-daemonset.yaml
sed -i 's@quay.io/prometheus/prometheus:v2.36.1@10.83.195.8:1443/prometheus/prometheus:v2.36.1@g' manifests/prometheus-prometheus.yaml
sed -i 's@k8s.gcr.io/prometheus-adapter/prometheus-adapter:v0.9.1@10.83.195.8:1443/prometheus/prometheus-adapter:v0.9.1@g' manifests/prometheusAdapter-deployment.yaml
sed -i 's@quay.io/prometheus-operator/prometheus-operator:v0.57.0@10.83.195.8:1443/prometheus/prometheus-operator:v0.57.0@g' manifests/prometheusOperator-deployment.yaml
sed -i 's@quay.io/brancz/kube-rbac-proxy:v0.12.0@10.83.195.8:1443/prometheus/kube-rbac-proxy:v0.12.0@g' manifests/prometheusOperator-deployment.yaml
sed -i 's@quay.io/prometheus-operator/prometheus-config-reloader:v0.57.0@10.83.195.8:1443/prometheus/prometheus-config-reloader:v0.57.0@g' manifests/prometheusOperator-deployment.yaml
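After the replacements, it is worth double-checking that no manifest still points at an external registry; a quick sketch of such a check against the Harbor prefix used above:
grep -rn 'image: ' manifests/ | grep -v '10.83.195.8:1443/prometheus'
# no output means every image now comes from Harbor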
vim manifests/grafana-config.yaml
# Enable anonymous access and store Grafana data in MySQL
# The password does not need to be base64 encoded (it sits under stringData)
stringData:
  grafana.ini: |
    [security]
    allow_embedding = true
    [date_formats]
    default_timezone = UTC
    [auth.anonymous]
    enabled = true
    [database]
    type = mysql
    host = xx:3306
    name = grafana
    user = grafana
    password = xx
type: Opaque
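The [database] section above assumes a reachable MySQL instance that already has a grafana database and user; if they do not exist yet, something along these lines creates them (host, user, and password are placeholders, MySQL 5.7+ syntax):
mysql -h xx -uroot -p -e "CREATE DATABASE IF NOT EXISTS grafana DEFAULT CHARACTER SET utf8mb4;"
mysql -h xx -uroot -p -e "CREATE USER IF NOT EXISTS 'grafana'@'%' IDENTIFIED BY 'xx'; GRANT ALL PRIVILEGES ON grafana.* TO 'grafana'@'%'; FLUSH PRIVILEGES;"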
Configure Ingress
# vim manifests/alertmanager-ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: alertmanager-ingress
  namespace: monitoring
  annotations:
    kubernetes.io/ingress.class: "nginx"
    prometheus.io/http_probe: "true"
spec:
  rules:
  - host: bigdata-dev-alertmanager.ky-tech.com.cn
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: alertmanager-main
            port:
              number: 9093
  tls:
  - hosts:
    - bigdata-dev-alertmanager.ky-tech.com.cn  # HTTPS domain
    secretName: prometheus-secret
# vim manifests/grafana-ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: grafana-ingress
  namespace: monitoring
  annotations:
    kubernetes.io/ingress.class: "nginx"
    prometheus.io/http_probe: "true"
spec:
  rules:
  - host: bigdata-dev-grafana.ky-tech.com.cn
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: grafana
            port:
              number: 3000
  tls:
  - hosts:
    - bigdata-dev-grafana.ky-tech.com.cn  # HTTPS domain
    secretName: prometheus-secret
# vim manifests/prometheus-ingress.yaml
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  name: prometheus-ingress
  namespace: monitoring
  annotations:
    kubernetes.io/ingress.class: "nginx"
    prometheus.io/http_probe: "true"
spec:
  rules:
  - host: bigdata-dev-prometheus.ky-tech.com.cn
    http:
      paths:
      - path: /
        pathType: Prefix
        backend:
          service:
            name: prometheus-k8s
            port:
              number: 9090
  tls:
  - hosts:
    - bigdata-dev-prometheus.ky-tech.com.cn  # HTTPS domain
    secretName: prometheus-secret
kubectl create ns monitoring
kubectl create secret tls prometheus-secret --key /data/harbor_helm/stl/ky-tech.com.cn_nginx/ky-tech.com.cn.key --cert /data/harbor_helm/stl/ky-tech.com.cn_nginx/ky-tech.com.cn_bundle.crt -n monitoring
Prometheus generally uses shared storage (such as Ceph).
# Create a StorageClass
# Create a PVC
# vim prometheus-pvc.yaml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: prometheus-pvc
  namespace: monitoring
spec:
  accessModes:
  - ReadWriteMany
  resources:
    requests:
      storage: 4Ti
  storageClassName: bigdata-cephfs
  volumeMode: Filesystem
status:
  accessModes:
  - ReadWriteMany
  capacity:
    storage: 4Ti
  phase: Bound
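Apply the claim and confirm that it binds before deploying Prometheus (assuming the bigdata-cephfs StorageClass supports dynamic provisioning):
kubectl apply -f prometheus-pvc.yaml
kubectl get pvc prometheus-pvc -n monitoring
# STATUS should show Bound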
# vim prometheus-prometheus.yaml
spec:
retention: 90d
storage:
volumeClaimTemplate:
spec:
storageClassName: bigdata-cephfs
resources:
requests:
storage: 4Ti
Install
# Deploy
kubectl apply --server-side -f manifests/setup
kubectl wait \
--for condition=Established \
--all CustomResourceDefinition \
--namespace=monitoring
kubectl apply -f manifests/
# Uninstall
# kubectl delete --ignore-not-found=true -f manifests/ -f manifests/setup
# Completely remove the namespace (if it is stuck in Terminating)
kubectl get namespace monitoring -o json \
| tr -d "\n" | sed "s/\"finalizers\": \[[^]]\+\]/\"finalizers\": []/" \
| kubectl replace --raw /api/v1/namespaces/monitoring/finalize -f -
# If a pod fails, check its events and logs
kubectl describe po trino-dockerhub-coordinator-c4779585d-tdldt -n monitoring
kubectl logs node-exporter-fb6ql -n monitoring --all-containers
# (combined from similar events): MountVolume.SetUp failed for volume "grafana-dashboard-persistentvolumesusage" : failed to sync configmap cache: timed out waiting for the condition
# kube-prometheus ships NetworkPolicy objects that block traffic from other namespaces (e.g. the ingress controller); remove them if the Ingresses above cannot reach the services
kubectl delete networkpolicy --all -n monitoring
Access
https://bigdata-dev-alertmanager.ky-tech.com.cn
https://bigdata-dev-grafana.ky-tech.com.cn admin/admin@13
https://bigdata-dev-prometheus.ky-tech.com.cn
Integrate Pushgateway with kube-prometheus
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app.kubernetes.io/name: pushgateway-sdp
  name: pushgateway-sdp
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: pushgateway-sdp
  template:
    metadata:
      labels:
        app.kubernetes.io/name: pushgateway-sdp
        app: pushgateway
    spec:
      nodeSelector:
        kubernetes.io/system: monitor
      containers:
      - args:
        - --web.enable-admin-api
        - --push.disable-consistency-check
        image: 10.83.195.8:1443/prometheus/pushgateway:1.8.0
        livenessProbe:
          failureThreshold: 10
          httpGet:
            path: /-/healthy
            port: 9091
            scheme: HTTP
          initialDelaySeconds: 10
          periodSeconds: 10
          successThreshold: 1
          timeoutSeconds: 30
        name: pushgateway-sdp
        ports:
        - containerPort: 9091
          name: http
        readinessProbe:
          failureThreshold: 15
          httpGet:
            path: /-/ready
            port: 9091
            scheme: HTTP
          initialDelaySeconds: 10
          periodSeconds: 10
          successThreshold: 1
          timeoutSeconds: 30
        resources:
          limits:
            cpu: 16
            memory: 32Gi
          requests:
            cpu: 4
            memory: 16Gi
      restartPolicy: Always
---
apiVersion: v1
kind: Service
metadata:
  labels:
    app.kubernetes.io/name: pushgateway-sdp
  name: pushgateway-sdp
  namespace: monitoring
spec:
  type: NodePort
  ports:
  - name: http
    nodePort: 30091
    port: 9091
    protocol: TCP
    targetPort: http
  selector:
    app.kubernetes.io/name: pushgateway-sdp
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    app.kubernetes.io/name: pushgateway-sdp
  name: pushgateway-sdp
  namespace: monitoring
spec:
  endpoints:
  - interval: 60s
    scrapeTimeout: 50s
    path: /metrics
    port: http
    scheme: http
    relabelings:
    - sourceLabels:
      - pod
      targetLabel: pod
      regex: "(.*)"
      replacement: pushgateway-sdp
    - sourceLabels:
      - instance
      targetLabel: instance
      regex: "(.*)"
      replacement: pushgateway-sdp
  selector:
    matchLabels:
      app.kubernetes.io/name: pushgateway-sdp
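Once the three objects above are applied, a simple way to verify the integration is to push a test sample through the NodePort and wait for the next scrape (the node IP is a placeholder; the scrape interval is 60s):
echo "sdp_test_metric 42" | curl --data-binary @- http://<node-ip>:30091/metrics/job/sdp_test
# query sdp_test_metric in the Prometheus UI after about a minute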
Monitor nodes outside the K8S cluster
---
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  labels:
    app.kubernetes.io/name: node-exporter
    cluster: hadoop-sdp
  name: hadoop-sdp
  namespace: monitoring
spec:
  endpoints:
  - interval: 60s
    port: metrics
    relabelings:
    - sourceLabels:
      - __meta_kubernetes_service_label_dn_flag
      action: replace
      targetLabel: dn_flag
      regex: (.*)
    - sourceLabels:
      - __meta_kubernetes_service_label_nm_flag
      action: replace
      targetLabel: nm_flag
      regex: (.*)
    - sourceLabels:
      - __meta_kubernetes_service_label_cluster
      targetLabel: cluster
    - sourceLabels:
      - __address__
      targetLabel: ip
      regex: "(.*):(.*)"
      replacement: $1
  selector:
    matchLabels:
      app.kubernetes.io/name: node-exporter
      cluster: hadoop-sdp
  namespaceSelector:
    matchNames:
    - monitoring
---
apiVersion: v1
kind: Service
metadata:
  name: hadoop-sdp-dn
  labels:
    app.kubernetes.io/name: node-exporter
    dn_flag: "1"
    cluster: hadoop-sdp
  namespace: monitoring
spec:
  type: ClusterIP
  clusterIP: None
  ports:
  - name: metrics
    port: 9100
    targetPort: 9100
    protocol: TCP
---
apiVersion: v1
kind: Endpoints
metadata:
  name: hadoop-sdp-dn
  labels:
    app.kubernetes.io/name: node-exporter
    dn_flag: "1"
    cluster: hadoop-sdp
  namespace: monitoring
subsets:
- addresses:
  - ip: 10.83.192.9
  ports:
  - name: metrics
    port: 9100
    protocol: TCP
---
apiVersion: v1
kind: Service
metadata:
  name: hadoop-sdp-nm
  labels:
    app.kubernetes.io/name: node-exporter
    nm_flag: "1"
    cluster: hadoop-sdp
  namespace: monitoring
spec:
  type: ClusterIP
  clusterIP: None
  ports:
  - name: metrics
    port: 9100
    targetPort: 9100
    protocol: TCP
---
apiVersion: v1
kind: Endpoints
metadata:
  name: hadoop-sdp-nm
  labels:
    app.kubernetes.io/name: node-exporter
    nm_flag: "1"
    cluster: hadoop-sdp
  namespace: monitoring
subsets:
- addresses:
  - ip: 10.83.192.9
  ports:
  - name: metrics
    port: 9100
    protocol: TCP
---
apiVersion: v1
kind: Service
metadata:
  name: hadoop-sdp-nn
  labels:
    app.kubernetes.io/name: node-exporter
    cluster: hadoop-sdp
  namespace: monitoring
spec:
  type: ClusterIP
  clusterIP: None
  ports:
  - name: metrics
    port: 9100
    targetPort: 9100
    protocol: TCP
---
apiVersion: v1
kind: Endpoints
metadata:
  name: hadoop-sdp-nn
  labels:
    app.kubernetes.io/name: node-exporter
    cluster: hadoop-sdp
  namespace: monitoring
subsets:
- addresses:
  - ip: 10.83.192.6
  - ip: 10.83.192.7
  ports:
  - name: metrics
    port: 9100
    protocol: TCP
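The Endpoints above assume node_exporter is already listening on port 9100 on every Hadoop host; a minimal sketch for starting it on such a host (version and paths are assumptions, not part of the original setup):
wget -c https://github.com/prometheus/node_exporter/releases/download/v1.3.1/node_exporter-1.3.1.linux-amd64.tar.gz
tar -zxvf node_exporter-1.3.1.linux-amd64.tar.gz
cd node_exporter-1.3.1.linux-amd64
nohup ./node_exporter --web.listen-address=":9100" > node_exporter.log 2>&1 &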
Integrate Alertmanager
A Python service that receives alerts from Alertmanager
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@ File       : 0513.py
@ Time       : 2024-05-13 15:43
@ Author     : Shylin Zhang
@ version    : python 3.6
@ Description:
"""
import json
import logging

from flask import Flask, request

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(filename)s[line:%(lineno)d] - %(levelname)s: %(message)s',
    filename="./alertmanager-webhook-run.log"
)

app = Flask(__name__)


@app.route("/send/", methods=["POST"])
def send():
    try:
        data = json.loads(request.data)
        logging.info(data)
        alerts = data['alerts']
        for i in alerts:
            logging.info('SEND SMS: ' + str(i))
    except Exception as e:
        logging.error(e)
    return 'ok'


if __name__ == '__main__':
    app.run(host='0.0.0.0', port=8088)
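To run the webhook (Flask is its only dependency) and confirm it accepts an Alertmanager-style payload, the following sketch works; the sample JSON only carries the alerts field the handler reads, and the port matches the script above:
pip install flask
nohup python 0513.py > /dev/null 2>&1 &
# simulate an Alertmanager notification
curl -X POST -H 'Content-Type: application/json' \
  -d '{"alerts": [{"status": "firing", "labels": {"alertname": "TestAlert", "severity": "critical"}}]}' \
  http://10.83.195.6:8088/send/
# entries should appear in ./alertmanager-webhook-run.log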
# vim manifests/alertmanager-secret.yaml
apiVersion: v1
kind: Secret
metadata:
  labels:
    app.kubernetes.io/component: alert-router
    app.kubernetes.io/instance: main
    app.kubernetes.io/name: alertmanager
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 0.24.0
  name: alertmanager-main
  namespace: monitoring
stringData:
  alertmanager.yaml: |-
    "global":
      "resolve_timeout": "5m"
    "inhibit_rules":
    - "equal":
      - "namespace"
      - "alertname"
      "source_matchers":
      - "severity = critical"
      "target_matchers":
      - "severity =~ warning|info"
    - "equal":
      - "namespace"
      - "alertname"
      "source_matchers":
      - "severity = warning"
      "target_matchers":
      - "severity = info"
    - "equal":
      - "namespace"
      "source_matchers":
      - "alertname = InfoInhibitor"
      "target_matchers":
      - "severity = info"
    "receivers":
    - "name": "alert-webhook"
      "webhook_configs":
      - "url": "http://10.83.195.6:8088/send/"  # the Python webhook service above
        "send_resolved": true
        "max_alerts": 0
    - "name": "Default"
    - "name": "Watchdog"
    - "name": "Critical"
    - "name": "null"
    "route":
      "group_by":
      - "namespace"
      - "alertname"
      "group_interval": "5m"
      "group_wait": "30s"
      "receiver": "alert-webhook"
      "repeat_interval": "24h"
      "routes":
      - "matchers":
        - "severity = critical"
        "receiver": "alert-webhook"
      - "matchers":
        - "severity = warning"
        "receiver": "alert-webhook"
      - "matchers":
        - "alertname = Watchdog"
        "receiver": "Watchdog"
      - "matchers":
        - "alertname = InfoInhibitor"
        "receiver": "null"
type: Opaque
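After editing the secret, re-apply it; the config-reloader sidecar in the Alertmanager pods should pick up the change within a minute or so (the container name is assumed from the prometheus-operator defaults):
kubectl apply -f manifests/alertmanager-secret.yaml
kubectl -n monitoring logs -l 'alertmanager=main' -c config-reloader --tail=20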
Configure alerting rules (PrometheusRule)
# vim manifests/rules/hadoop-server.rule
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus: k8s
    role: alert-rules
  name: prometheus-hadoop-rules
  namespace: monitoring
spec:
  groups:
  - name: hadoop.rules
    rules:
    - alert: Server memory usage > 80%
      annotations:
        description: Cluster [ {{ $labels.cluster }} ] node [ {{ $labels.ip }} ] memory usage {{ printf "%.2f" $value }}%
      expr: |
        (1 - (node_memory_MemAvailable_bytes{cluster=~"hadoop-sdp"} / (node_memory_MemTotal_bytes{cluster=~"hadoop-sdp"}))) * 100 > 50
      for: 30s
      labels:
        severity: critical
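Apply the rule and check that the operator has picked it up; the prometheus: k8s and role: alert-rules labels match the rule selector used by kube-prometheus (older releases require them, newer ones select all PrometheusRule objects in the namespace):
kubectl apply -f manifests/rules/hadoop-server.rule
kubectl -n monitoring get prometheusrule prometheus-hadoop-rules
# the group should then appear under Status -> Rules in the Prometheus UI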
kubectl -n monitoring logs -l 'alertmanager=main' -c alertmanager