Deploying Prometheus with Pods
node_exporter
Per-node metrics collector
daemonset ————> guarantees one collector on every node
prometheus ————> the main monitoring server
grafana ————> visualization
alertmanager ————> alerting module
[root@master01 ~]# kubectl create ns monitor-sa
namespace/monitor-sa created
[root@master01 opt]# mkdir prometheus
[root@master01 opt]# cd prometheus/
[root@master01 prometheus]# vim node_exporter.yaml
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: node-exporter
  namespace: monitor-sa
  labels:
    name: node-exporter
spec:
  selector:
    matchLabels:
      name: node-exporter
  template:
    metadata:
      labels:
        name: node-exporter
    spec:
      hostPID: true
      hostIPC: true
      hostNetwork: true
      containers:
      - name: node-exporter
        image: prom/node-exporter:v1
        ports:
        - containerPort: 9100
        resources:
          limits:
            cpu: "0.5"
        securityContext:
          privileged: true
        args:
        - --path.procfs
        - /host/proc
        - --path.sysfs
        - /host/sys
        - --collector.filesystem.ignored-mount-points
        - '^/(sys|proc|dev|host|etc)($|/)'
        volumeMounts:
        - name: dev
          mountPath: /host/dev
        - name: proc
          mountPath: /host/proc
        - name: sys
          mountPath: /host/sys
        - name: rootfs
          mountPath: /rootfs
      volumes:
      - name: proc
        hostPath:
          path: /proc
      - name: dev
        hostPath:
          path: /dev
      - name: sys
        hostPath:
          path: /sys
      - name: rootfs
        hostPath:
          path: /

[root@master01 prometheus]# kubectl apply -f node_exporter.yaml
daemonset.apps/node-exporter created
[root@master01 prometheus]# kubectl get pod -n monitor-sa -o wide
NAME                  READY   STATUS    RESTARTS   AGE   IP               NODE       NOMINATED NODE   READINESS GATES
node-exporter-99vhd   1/1     Running   0          15s   192.168.60.120   node01     <none>           <none>
node-exporter-c6md9   1/1     Running   0          15s   192.168.60.130   node02     <none>           <none>
node-exporter-f29fh   1/1     Running   0          15s   192.168.60.110   master01   <none>           <none>
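Because the DaemonSet runs with hostPID/hostIPC/hostNetwork enabled, each exporter binds port 9100 directly on its node. A quick sanity check against any node IP from the output above, using nothing beyond plain curl:

# node_cpu_seconds_total is a standard node_exporter metric
curl -s http://192.168.60.110:9100/metrics | grep node_cpu_seconds_total | head -5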
# Create a service account for monitoring and bind it to the cluster-admin role
[root@master01 prometheus]# kubectl create serviceaccount monitor -n monitor-sa
[root@master01 prometheus]# kubectl create clusterrolebinding monitor-clusterrolebinding -n monitor-sa --clusterrole=cluster-admin --serviceaccount=monitor-sa:monitor
[root@master01 prometheus]# rz -E
rz waiting to receive.
[root@master01 prometheus]# ls
node_exporter.yaml  prometheus-alertmanager-cfg.yaml
[root@master01 prometheus]# vim prometheus-alertmanager-cfg.yaml
......
      - targets: ['192.168.60.110:10251']
    - job_name: 'kubernetes-controller-manager'
      scrape_interval: 5s
      static_configs:
      - targets: ['192.168.60.110:10252']
    - job_name: 'kubernetes-kube-proxy'
      scrape_interval: 5s
      static_configs:
      - targets: ['192.168.60.110:10249','192.168.60.120:10249','192.168.60.130:10249']
    - job_name: 'kubernetes-etcd'
      scheme: https
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/ca.crt
        cert_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/server.crt
        key_file: /var/run/secrets/kubernetes.io/k8s-certs/etcd/server.key
      scrape_interval: 5s
      static_configs:
      - targets: ['192.168.60.110:2379']
......
    - alert: HighPodCpuUsage    # alert name, used as the mail subject
      expr: sum(rate(container_cpu_usage_seconds_total{namespace="default", pod=~".+"}[5m])) by (pod) > 0.9    # metric expression to evaluate
      for: 5m                   # fire only after CPU stays above 90% for 5 minutes
      labels:
        severity: warning
      annotations:              # alert message body
[root@master01 prometheus]# vim alter-mail.yaml
kind: ConfigMap
apiVersion: v1
metadata:
  name: alertmanager
  namespace: monitor-sa
data:
  alertmanager.yml: |-
    global:
      resolve_timeout: 1m
      smtp_smarthost: 'smtp.qq.com:25'
      smtp_from: '1647629457@qq.com'
      smtp_auth_username: '1647629457@qq.com'
      smtp_auth_password: 'mhmjbfjydbuhecea'
      smtp_require_tls: false
    route:
      group_by: [alertname]
      group_wait: 10s
      group_interval: 10s
      repeat_interval: 10m
      receiver: default-receiver
    receivers:
    - name: 'default-receiver'
      email_configs:
      - to: '1647629457@qq.com'
        send_resolved: true
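The mail config can be linted offline before it goes into the cluster. A minimal sketch, assuming the amtool binary from an Alertmanager release is available on the workstation and the alertmanager.yml body (just the ConfigMap data, not the Kubernetes wrapper) has been saved to a local file:

# amtool ships with the Alertmanager release tarball; check-config validates routes and receivers
amtool check-config alertmanager.yml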
[root@master01 prometheus]# vim prometheus-svc.yaml
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: monitor-sa
  labels:
    app: prometheus
spec:
  type: NodePort
  ports:
  - port: 9090
    targetPort: 9090
    protocol: TCP
  selector:
    app: prometheus
    component: server
[root@master01 prometheus]# vim prometheus-alter.yaml
apiVersion: v1
kind: Service
metadata:
  labels:
    name: prometheus
    kubernetes.io/cluster-service: 'true'
  name: alertmanager
  namespace: monitor-sa
spec:
  ports:
  - name: alertmanager
    nodePort: 30066
    port: 9093
    protocol: TCP
    targetPort: 9093
  selector:
    app: prometheus
  sessionAffinity: None
  type: NodePort
[root@master01 prometheus]# vim prometheus-deploy.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus-server
  namespace: monitor-sa
  labels:
    app: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
      component: server
  template:
    metadata:
      labels:
        app: prometheus
        component: server
      annotations:
        prometheus.io/scrape: 'false'
    spec:
      serviceAccountName: monitor
      initContainers:
      - name: init-chmod
        image: busybox:latest
        command: ['sh','-c','chmod -R 777 /prometheus; chmod -R 777 /etc']
        volumeMounts:
        - mountPath: /prometheus
          name: prometheus-storage-volume
        - mountPath: /etc/localtime
          name: timezone
      containers:
      - name: prometheus
        image: prom/prometheus:v2.45.0
        command:
        - prometheus
        - --config.file=/etc/prometheus/prometheus.yml
        - --storage.tsdb.path=/prometheus
        - --storage.tsdb.retention=720h
        - --web.enable-lifecycle
        ports:
        - containerPort: 9090
        volumeMounts:
        - name: prometheus-config
          mountPath: /etc/prometheus/
        - name: prometheus-storage-volume
          mountPath: /prometheus/
        - name: timezone
          mountPath: /etc/localtime
        - name: k8s-certs
          mountPath: /var/run/secrets/kubernetes.io/k8s-certs/etcd/
      - name: alertmanager
        image: prom/alertmanager:v0.20.0
        args:
        - "--config.file=/etc/alertmanager/alertmanager.yml"
        - "--log.level=debug"
        ports:
        - containerPort: 9093
          protocol: TCP
          name: alertmanager
        volumeMounts:
        - name: alertmanager-config
          mountPath: /etc/alertmanager
        - name: alertmanager-storage
          mountPath: /alertmanager
        - name: localtime
          mountPath: /etc/localtime
      volumes:
      - name: prometheus-config
        configMap:
          name: prometheus-config
          defaultMode: 0777
      - name: prometheus-storage-volume
        hostPath:
          path: /data
          type: DirectoryOrCreate
      - name: k8s-certs
        secret:
          secretName: etcd-certs
      - name: timezone
        hostPath:
          path: /usr/share/zoneinfo/Asia/Shanghai
      - name: alertmanager-config
        configMap:
          name: alertmanager
      - name: alertmanager-storage
        hostPath:
          path: /data/alertmanager
          type: DirectoryOrCreate
      - name: localtime
        hostPath:
          path: /usr/share/zoneinfo/Asia/Shanghai

# Apply the two ConfigMaps first; the deployment references prometheus-config and alertmanager
kubectl apply -f prometheus-alertmanager-cfg.yaml
kubectl apply -f alter-mail.yaml

# Create the etcd-certs secret from the existing etcd certificates (mounted by the deployment)
kubectl -n monitor-sa create secret generic etcd-certs \
  --from-file=/etc/kubernetes/pki/etcd/server.key \
  --from-file=/etc/kubernetes/pki/etcd/server.crt \
  --from-file=/etc/kubernetes/pki/etcd/ca.crt

kubectl apply -f prometheus-deploy.yaml
kubectl apply -f prometheus-svc.yaml
kubectl apply -f prometheus-alter.yaml

[root@master01 prometheus]# kubectl get pod -n monitor-sa -o wide
NAME                                 READY   STATUS    RESTARTS   AGE     IP               NODE       NOMINATED NODE   READINESS GATES
node-exporter-99vhd                  1/1     Running   0          140m    192.168.60.120   node01     <none>           <none>
node-exporter-c6md9                  1/1     Running   0          140m    192.168.60.130   node02     <none>           <none>
node-exporter-f29fh                  1/1     Running   0          140m    192.168.60.110   master01   <none>           <none>
prometheus-server-55d866cb44-wrrbx   2/2     Running   0          5m29s   10.244.2.34      node02     <none>           <none>
[root@master01 prometheus]# kubectl get svc -n monitor-sa -o wide
NAME           TYPE       CLUSTER-IP      EXTERNAL-IP   PORT(S)          AGE    SELECTOR
alertmanager   NodePort   10.96.153.49    <none>        9093:30066/TCP   6m     app=prometheus
prometheus     NodePort   10.96.215.253   <none>        9090:31758/TCP   6m1s   app=prometheus,component=server
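With the services up, the Prometheus UI is reachable on NodePort 31758 and Alertmanager on 30066 (any node IP works). Because the deployment passes --web.enable-lifecycle, the configuration can also be hot-reloaded over HTTP after editing the prometheus-config ConfigMap:

# liveness endpoint of the Prometheus server
curl -s http://192.168.60.110:31758/-/healthy
# trigger a config reload without restarting the pod
curl -X POST http://192.168.60.110:31758/-/reload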
[root@master01 prometheus]# vim pro-gra.yml
apiVersion: v1
kind: PersistentVolumeClaim
metadata:
  name: grafana
  namespace: kube-system
spec:
  accessModes:
  - ReadWriteMany
  storageClassName: nfs-client-storageclass
  resources:
    requests:
      storage: 2Gi
---
apiVersion: apps/v1
kind: Deployment
metadata:
  name: monitoring-grafana
  namespace: kube-system
spec:
  replicas: 1
  selector:
    matchLabels:
      task: monitoring
      k8s-app: grafana
  template:
    metadata:
      labels:
        task: monitoring
        k8s-app: grafana
    spec:
      containers:
      - name: grafana
        image: grafana/grafana:7.5.11
        securityContext:
          runAsUser: 104
          runAsGroup: 107
        ports:
        - containerPort: 3000
          protocol: TCP
        volumeMounts:
        - mountPath: /etc/ssl/certs
          name: ca-certificates
          readOnly: false
        - mountPath: /var
          name: grafana-storage
        - mountPath: /var/lib/grafana
          name: graf-test
        env:
        - name: INFLUXDB_HOST
          value: monitoring-influxdb
        - name: GF_SERVER_HTTP_PORT
          value: "3000"
        - name: GF_AUTH_BASIC_ENABLED
          value: "false"
        - name: GF_AUTH_ANONYMOUS_ENABLED
          value: "true"
        - name: GF_AUTH_ANONYMOUS_ORG_ROLE
          value: Admin
        - name: GF_SERVER_ROOT_URL
          value: /
      volumes:
      - name: ca-certificates
        hostPath:
          path: /etc/ssl/certs
      - name: grafana-storage
        emptyDir: {}
      - name: graf-test
        persistentVolumeClaim:
          claimName: grafana
---
apiVersion: v1
kind: Service
metadata:
  name: monitoring-grafana
  namespace: kube-system
spec:
  ports:
  - port: 80
    targetPort: 3000
  selector:
    k8s-app: grafana
  type: NodePort
[root@master01 prometheus]# kubectl apply -f pro-gra.yml
[root@master01 prometheus]# kubectl get svc -n kube-system
NAME                 TYPE        CLUSTER-IP      EXTERNAL-IP   PORT(S)                  AGE
kube-dns             ClusterIP   10.96.0.10      <none>        53/UDP,53/TCP,9153/TCP   16d
monitoring-grafana   NodePort    10.96.220.147   <none>        80:31771/TCP             12s
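Grafana is now reachable on NodePort 31771. When adding the Prometheus data source in the Grafana UI, the in-cluster service DNS name is the natural URL (standard <service>.<namespace>.svc naming, assuming the default cluster DNS setup):

# data source URL to enter in Grafana (points at the prometheus Service in monitor-sa)
http://prometheus.monitor-sa.svc.cluster.local:9090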
// Fix the kube-proxy scrape target
kubectl edit configmap kube-proxy -n kube-system
......
metricsBindAddress: "0.0.0.0:10249"
# kube-proxy binds its metrics port 10249 to 127.0.0.1 by default; change it to listen on the node address so Prometheus can scrape it

# Restart kube-proxy by deleting its pods
kubectl get pods -n kube-system | grep kube-proxy | awk '{print $1}' | xargs kubectl delete pods -n kube-system
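After the kube-proxy pods restart, confirm the metrics endpoint answers on the node address rather than only on 127.0.0.1:

curl -s http://192.168.60.120:10249/metrics | head -5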
Test:
[root@master01 prometheus]# vim ylce.yml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: hpa-test
  labels:
    hpa: test
spec:
  replicas: 1
  selector:
    matchLabels:
      hpa: test
  template:
    metadata:
      labels:
        hpa: test
    spec:
      containers:
      - name: centos
        image: centos:7
        command: ["/bin/bash", "-c", "yum install -y stress --nogpgcheck && sleep 3600"]
        volumeMounts:
        - name: yum
          mountPath: /etc/yum.repos.d/
      volumes:
      - name: yum
        hostPath:
          path: /etc/yum.repos.d/
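To actually trip the HighPodCpuUsage rule (threshold 0.9 sustained for 5 minutes, default namespace), apply the deployment and run stress inside the pod. A minimal sketch, assuming the yum install of stress in the container command succeeded; --cpu 4 spawns four busy workers and --timeout 600 stops them after ten minutes:

kubectl apply -f ylce.yml
kubectl exec -it deploy/hpa-test -- stress --cpu 4 --timeout 600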