K8s部署bitnami/Kube-prometheus
kube-prometheus-stack大部分镜像都是国外,linux没有vpn会下载失败,这里结合sc存储、helm、kube-prometheus、自定义告警,在K8s里部署Prometheus+alertmanager+邮件告警,操作系统Ubuntu20.04。
主要告警内容:
- K8s集群告警
- PostgreSQL告警
- es告警
- kube-state-metrics告警
1. 创建StorageClass存储
不需要配置SC存储的可以跳过
1.1 安装NFS
安装nfs服务器创建本地目录并加入nfs共享。
# Install the NFS server packages (Ubuntu)
apt install -y nfs-kernel-server nfs-common
# Create the shared directory (-p: no error if it already exists, creates parents)
mkdir -p /nfs/k8sdata
# Configure the share: append the export line to /etc/exports
# (the original only ran `cat /etc/exports`, which displays the file but does not modify it)
echo '/nfs *(rw,sync,insecure,no_subtree_check,no_root_squash)' >> /etc/exports
# Re-export all directories listed in /etc/exports
exportfs -ra
# Restart / enable the services
systemctl restart rpcbind
systemctl daemon-reload   # fixed: `daemon_reload` (underscore) is not a valid systemctl verb
systemctl enable nfs-kernel-server
systemctl restart nfs-server
1.2 创建yaml配置文件
rbac.yaml
vim rbac.yaml
# rbac.yaml — ServiceAccount and RBAC permissions for the NFS client provisioner.
# (Indentation restored: the pasted manifest had all nesting stripped, which is invalid YAML.)
apiVersion: v1
kind: ServiceAccount
metadata:
  name: nfs-client-provisioner
  namespace: default
---
# Cluster-wide permissions: the provisioner creates/deletes PVs and watches PVCs and StorageClasses.
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: nfs-client-provisioner-runner
rules:
  - apiGroups: [""]
    resources: ["persistentvolumes"]
    verbs: ["get", "list", "watch", "create", "delete"]
  - apiGroups: [""]
    resources: ["persistentvolumeclaims"]
    verbs: ["get", "list", "watch", "update"]
  - apiGroups: ["storage.k8s.io"]
    resources: ["storageclasses"]
    verbs: ["get", "list", "watch"]
  - apiGroups: [""]
    resources: ["events"]
    verbs: ["create", "update", "patch"]
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: run-nfs-client-provisioner
subjects:
  - kind: ServiceAccount
    name: nfs-client-provisioner
    namespace: default
roleRef:
  kind: ClusterRole
  name: nfs-client-provisioner-runner
  apiGroup: rbac.authorization.k8s.io
---
# Namespaced permissions for leader election (endpoints locking).
kind: Role
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: leader-locking-nfs-client-provisioner
  namespace: default
rules:
  - apiGroups: [""]
    resources: ["endpoints"]
    verbs: ["get", "list", "watch", "create", "update", "patch"]
---
kind: RoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: leader-locking-nfs-client-provisioner
  namespace: default  # added: RoleBinding is namespaced; make the target namespace explicit
subjects:
  - kind: ServiceAccount
    name: nfs-client-provisioner
    namespace: default
roleRef:
  kind: Role
  name: leader-locking-nfs-client-provisioner
  apiGroup: rbac.authorization.k8s.io
nfs-StorageClass.yaml
vim nfs-StorageClass.yaml
# nfs-StorageClass.yaml — StorageClass served by the NFS client provisioner.
apiVersion: storage.k8s.io/v1
kind: StorageClass
metadata:
  name: managed-nfs-storage
# Must match the PROVISIONER_NAME env value in nfs-provisioner.yaml — remember this value
provisioner: managed-nfs-storage
parameters:
  # "false": data of released PVCs is deleted rather than archived to an "archived-*" dir
  archiveOnDelete: "false"
nfs-provisioner.yaml
vim nfs-provisioner.yaml
# nfs-provisioner.yaml — Deployment of the dynamic NFS client provisioner.
# (Indentation restored; the original also declared `selector:` twice, which is a
#  duplicate-key YAML error — a single selector is kept here.)
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nfs-client-provisioner
  labels:
    app: nfs-client-provisioner
  namespace: default
spec:
  replicas: 1
  selector:
    matchLabels:
      app: nfs-client-provisioner
  strategy:
    type: Recreate
  template:
    metadata:
      labels:
        app: nfs-client-provisioner
    spec:
      serviceAccountName: nfs-client-provisioner
      containers:
        - name: nfs-client-provisioner
          image: quay.io/external_storage/nfs-client-provisioner:latest
          volumeMounts:
            - name: nfs-client-root
              mountPath: /persistentvolumes
          env:
            - name: PROVISIONER_NAME
              value: managed-nfs-storage  # must match the StorageClass `provisioner` value
            - name: NFS_SERVER
              value: 172.16.12.27         # IP of the NFS server
            - name: NFS_PATH
              value: /nfs/k8sdata         # directory exported by the NFS server
      volumes:
        - name: nfs-client-root
          nfs:
            server: 172.16.12.27
            path: /nfs/k8sdata
1.3 部署StorageClass存储
# Apply the RBAC objects first, then the StorageClass, then the provisioner Deployment
kubectl apply -f rbac.yaml
kubectl apply -f nfs-StorageClass.yaml
kubectl apply -f nfs-provisioner.yaml
2. 配置helm3
2.1 下载并安装helm3
# Install helm3 from the official apt repository
# -fsSL: fail on HTTP errors instead of piping an error page into apt-key
curl -fsSL https://baltocdn.com/helm/signing.asc | sudo apt-key add -
sudo apt-get install apt-transport-https --yes
echo "deb https://baltocdn.com/helm/stable/debian/ all main" | sudo tee /etc/apt/sources.list.d/helm-stable-debian.list
sudo apt-get update
sudo apt-get install -y helm
2.2 添加repo仓库
# Add the bitnami chart repository and refresh the local chart index
helm repo add bitnami https://charts.bitnami.com/bitnami
# fixed: `helm update` is not a helm 3 command; the index is refreshed with `helm repo update`
helm repo update
3. 部署Kube-prometheus
3.1 下载kube-prometheus
helm search repo kube-prometheus
NAME CHART VERSION APP VERSION DESCRIPTION
bitnami/kube-prometheus 6.5.3 0.53.1 kube-prometheus collects Kubernetes manifests t...
#创建kube-prometheus命名空间
kubectl create namespace kube-prometheus
namespace/kube-prometheus created
#Pull the kube-prometheus chart at a pinned version
# -p: create parent directories (/opt/k8s may not exist yet) and don't fail if present
mkdir -p /opt/k8s/k8s_application
cd /opt/k8s/k8s_application
helm pull bitnami/kube-prometheus --version 6.5.3
tar -zxvf kube-prometheus-6.5.3.tgz
cd kube-prometheus
3.2 编辑values.yaml配置
#编辑变量
vim values.yaml
18 storageClass: "managed-nfs-storage" #与上面部署的StorageClass名称managed-nfs-storage保持一致
prometheus
346 prometheus:
425 type: NodePort
438 nodePort: "30090" #prometheus暴露端口
805 storageClass: "managed-nfs-storage"
alertmanager
1125 alertmanager:
1199 type: NodePort
1213 nodePort: "30903" #alertmanager暴露端口
1499 storageClass: "managed-nfs-storage"
#企业邮箱配置
1400 config:
1401 global:
1402 resolve_timeout: 5m
1403 smtp_smarthost: 'smtp.exmail.qq.com:465'
1404 smtp_from: 'xxx@xxx.com'
1405 smtp_auth_username: 'xxx@xxx.com'
1406 smtp_auth_password: 'xxx授权码'
1407 smtp_require_tls: false
1408 route:
1409 group_by: ['job']
1410 group_wait: 30s
1411 group_interval: 5m
1412 repeat_interval: 12h
1413 receiver: 'ops'
1414 routes:
1415 - match:
1416 alertname: Watchdog
1417 receiver: 'ops'
1418 receivers:
1419 - name: 'ops'
1420 email_configs:
1421 - to: 'xxx@xxx.com'
1422 send_resolved: true
3.3 helm部署kube-prometheus
root@master:/opt/k8s/k8s_application/kube-prometheus# pwd
/opt/k8s/k8s_application/kube-prometheus
root@master:/opt/k8s/k8s_application/kube-prometheus# ll
total 232
drwxr-xr-x 6 root root 4096 Jan 7 16:30 ./
drwxr-xr-x 10 root root 4096 Jan 10 09:09 ../
-rw-r--r-- 1 root root 333 Dec 21 00:59 .helmignore
-rw-r--r-- 1 root root 403 Dec 21 00:59 Chart.lock
-rw-r--r-- 1 root root 1259 Dec 21 00:59 Chart.yaml
-rw-r--r-- 1 root root 108864 Dec 21 00:59 README.md
drwxr-xr-x 5 root root 4096 Jan 7 10:55 charts/
drwxr-xr-x 2 root root 4096 Jan 7 14:55 crds/
drwxr-xr-x 2 root root 4096 Jan 10 09:15 rules/
drwxr-xr-x 6 root root 4096 Jan 7 10:55 templates/
-rw-r--r-- 1 root root 87579 Jan 7 16:28 values.yaml
#部署命令
helm install -f values.yaml kube-prometheus bitnami/kube-prometheus --version 6.5.3 -n kube-prometheus
NAME: kube-prometheus
LAST DEPLOYED: Fri Jan 7 11:39:43 2022
NAMESPACE: kube-prometheus
STATUS: deployed
REVISION: 1
TEST SUITE: None
NOTES:
CHART NAME: kube-prometheus
CHART VERSION: 6.5.3
APP VERSION: 0.53.1
** Please be patient while the chart is being deployed **
Watch the Prometheus Operator Deployment status using the command:
kubectl get deploy -w --namespace kube-prometheus -l app.kubernetes.io/name=kube-prometheus-operator,app.kubernetes.io/instance=kube-prometheus
Watch the Prometheus StatefulSet status using the command:
kubectl get sts -w --namespace kube-prometheus -l app.kubernetes.io/name=kube-prometheus-prometheus,app.kubernetes.io/instance=kube-prometheus
Prometheus can be accessed via port "9090" on the following DNS name from within your cluster:
kube-prometheus-prometheus.kube-prometheus.svc.cluster.local
To access Prometheus from outside the cluster execute the following commands:
export NODE_PORT=$(kubectl get --namespace kube-prometheus -o jsonpath="{.spec.ports[0].nodePort}" services kube-prometheus-prometheus)
export NODE_IP=$(kubectl get nodes --namespace kube-prometheus -o jsonpath="{.items[0].status.addresses[0].address}")
echo "Prometheus URL: http://$NODE_IP:$NODE_PORT/"
Watch the Alertmanager StatefulSet status using the command:
kubectl get sts -w --namespace kube-prometheus -l app.kubernetes.io/name=kube-prometheus-alertmanager,app.kubernetes.io/instance=kube-prometheus
Alertmanager can be accessed via port "9093" on the following DNS name from within your cluster:
kube-prometheus-alertmanager.kube-prometheus.svc.cluster.local
To access Alertmanager from outside the cluster execute the following commands:
export NODE_PORT=$(kubectl get --namespace kube-prometheus -o jsonpath="{.spec.ports[0].nodePort}" services kube-prometheus-alertmanager)
export NODE_IP=$(kubectl get nodes --namespace kube-prometheus -o jsonpath="{.items[0].status.addresses[0].address}")
echo "Alertmanager URL: http://$NODE_IP:$NODE_PORT/"
#删除kube-prometheus
helm delete kube-prometheus -n kube-prometheus
#Prometheus端口为30090
#Alertmanager端口为30903
3.4 查看容器运行情况
root@master:~# kubectl get pods,svc -n kube-prometheus -o wide
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
pod/alertmanager-kube-prometheus-alertmanager-0 2/2 Running 0 11m 10.244.135.55 node3 <none> <none>
pod/kube-prometheus-kube-state-metrics-86b78f7c66-c5cdg 1/1 Running 0 11m 10.244.135.14 node3 <none> <none>
pod/kube-prometheus-node-exporter-mpjn6 1/1 Running 0 11m 172.16.12.29 node3 <none> <none>
pod/kube-prometheus-operator-6fc5dbc49-g2l25 1/1 Running 0 11m 10.244.135.18 node3 <none> <none>
pod/prometheus-kube-prometheus-prometheus-0 2/2 Running 0 11m 10.244.135.1 node3 <none> <none>
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE SELECTOR
service/alertmanager-operated ClusterIP None <none> 9093/TCP,9094/TCP,9094/UDP 11m app.kubernetes.io/name=alertmanager
service/kube-prometheus-alertmanager NodePort 10.108.168.165 <none> 9093:30903/TCP 11m alertmanager=kube-prometheus-alertmanager,app.kubernetes.io/name=alertmanager
service/kube-prometheus-kube-state-metrics ClusterIP 10.103.31.242 <none> 8080/TCP 11m app.kubernetes.io/instance=kube-prometheus,app.kubernetes.io/name=kube-state-metrics
service/kube-prometheus-node-exporter ClusterIP 10.107.0.118 <none> 9100/TCP 11m app.kubernetes.io/instance=kube-prometheus,app.kubernetes.io/name=node-exporter
service/kube-prometheus-operator ClusterIP 10.110.167.59 <none> 8080/TCP 11m app.kubernetes.io/component=operator,app.kubernetes.io/instance=kube-prometheus,app.kubernetes.io/name=kube-prometheus
service/kube-prometheus-prometheus NodePort 10.103.120.125 <none> 9090:30090/TCP 11m app.kubernetes.io/name=prometheus,prometheus=kube-prometheus-prometheus
service/prometheus-operated ClusterIP None <none> 9090/TCP 11m app.kubernetes.io/name=prometheus
3.5 查看Prometheus和Alertmanager
IP为k8s-master的IP
Prometheus:http://172.16.13.55:30090/targets
Alertmanager:http://172.16.13.55:30903/#/alerts
3.6 配置Prometheus
1)打开Prometheus查看监控会发现kube-proxy失败。
2)这是因为默认情况下,该服务监听地址只提供给127.0.0.1,需修改为0.0.0.0。
打开Dashboard搜索kube-proxy,找到Config Maps点击编辑,或者命令行修改也可以。
3)将原来的127.0.0.1改为0.0.0.0。
4)删除对应的所有节点上的kube-proxy,让他们重新启动应用修改后的配置文件。重启完成后刷新Prometheus即可看到。
http://172.16.13.55:30090/targets
4. 自定义告警内容
在kube-prometheus目录下创建rules文件夹,所有自定义告警内容都存放在此文件夹里。
cd /opt/k8s/k8s_application/kube-prometheus
# -p: idempotent, safe to re-run if the rules directory already exists
mkdir -p rules
cd rules
4.1 例子
这里举个例子,其他告警可以网上搜索创建应用即可。
#这是针对PostgreSQL数据库的告警内容
vim postgresql-prometheusrule.yaml
# postgresql-prometheusrule.yaml — custom alerting rules for PostgreSQL.
# (Indentation restored: the pasted manifest had all nesting stripped.)
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    # These labels must match the Prometheus CR's ruleSelector so the operator loads the rules
    prometheus-name: kube-prometheus-prometheus
    managed-by: prometheus-operator
  name: prometheus-pgsql-rules  # everything else is fixed; only this name is user-defined
  namespace: kube-prometheus
# Custom alert definitions below
spec:
  groups:
    - name: prometheus-exporter-postgresql.rules
      rules:
        - alert: PostgreSQLDown
          expr: pg_up == 0
          for: 1m
          labels:
            severity: critical
          annotations:
            message: "PostgreSQL instance {{ $labels.prometheus_exporter }} is DOWN"
        - alert: PostgreSQLConnectionsHigh
          expr: (sum(pg_stat_database_numbackends) by (namespace,service,prometheus_exporter)) / on (namespace,service,prometheus_exporter) (pg_settings_max_connections) > 0.8
          for: 1m
          labels:
            severity: warning
          annotations:
            message: "PostgreSQL instance {{ $labels.prometheus_exporter }} has High number of connections"
        - alert: PostgreSQLQueriesPerSecondHigh
          expr: avg(rate(pg_stat_database_xact_commit[5m]) + rate(pg_stat_database_xact_rollback[5m])) by (namespace,service,prometheus_exporter,datname) > 10000
          for: 1m
          labels:
            severity: warning
          annotations:
            message: "PostgreSQL instance {{ $labels.prometheus_exporter }} has High number of queries per second"
        - alert: PostgreSQLCacheHitRateLow
          # fixed: the original expression measured transaction throughput
          # (xact_commit + xact_rollback > 10000), duplicating the QPS alert above.
          # A cache-hit ratio compares buffer hits against total block accesses
          # (hits + disk reads) and fires when the ratio drops below 90%.
          expr: sum(irate(pg_stat_database_blks_hit{datname!~"template.*|"}[5m])) by (namespace,service,prometheus_exporter) / (sum(irate(pg_stat_database_blks_hit{datname!~"template.*|"}[5m])) by (namespace,service,prometheus_exporter) + sum(irate(pg_stat_database_blks_read{datname!~"template.*|"}[5m])) by (namespace,service,prometheus_exporter)) < 0.9
          for: 1m
          labels:
            severity: warning
          annotations:
            message: "PostgreSQL instance {{ $labels.prometheus_exporter }} has Low cache hit rate"
#应用告警
kubectl apply -f postgresql-prometheusrule.yaml
4.2 其他自定义告警
这里找了些网上的告警,包含了大部分需求,有特别需求可以去对应服务官方文档查找资料。
#下载
https://gitee.com/songfei123123/kube-prometheus-rules
cd rules
# fixed: the `blob/...` URLs return an HTML page, not the YAML file;
# gitee serves raw file content under `raw/...`
wget https://gitee.com/songfei123123/kube-prometheus-rules/raw/master/postgresql-prometheusrule.yaml
wget https://gitee.com/songfei123123/kube-prometheus-rules/raw/master/kube-state-metrics-rules.yaml
wget https://gitee.com/songfei123123/kube-prometheus-rules/raw/master/k8s-rules.yaml
wget https://gitee.com/songfei123123/kube-prometheus-rules/raw/master/es-prometheusrule.yaml
#部署
kubectl apply -f es-prometheusrule.yaml #es告警规则
kubectl apply -f k8s-rules.yaml #k8s集群告警规则
kubectl apply -f kube-state-metrics-rules.yaml #k8s官方文档的kube-state-metrics告警规则
kubectl apply -f postgresql-prometheusrule.yaml #pg数据库的告警规则
k8s官方:https://awesome-prometheus-alerts.grep.to/rules#kubernetes
github他人参考:https://github.com/3scale-ops/prometheus-exporter-operator/tree/main/prometheus-rules
4.3 进入Dashboard查看configmap是否生效
url:https://k8s-master-ip:30008/#/overview?namespace=kube-prometheus
1)所有新增自定义的告警规则都会附加在该Config Maps里面。
2)默认不创建自定义告警规则只有上面这一部分。
3)创建了告警规则后下面会新增内容。
4.4 查看Prometheus告警规则
http://172.16.13.55:30090/alerts 需要等待一会再刷新
4.5 查看Alertmanager
http://172.16.13.55:30903/#/alerts
可以看到对应的两个告警
4.6 查看告警邮件