1. Monitoring Outline
Monitoring cluster components
Monitoring applications
Managing component logs
Managing application logs
Deployment upgrade and rollback
Different ways to configure applications
Application elastic scaling
Application self-healing
2. Cluster Monitoring
Overall cluster status:
[root@cce-21day-cluster-62954-81jwz ~]# kubectl cluster-info
Kubernetes master is running at https://192.168.47.160:5443
KubeDNS is running at https://192.168.47.160:5443/api/v1/namespaces/kube-system/services/kube-dns:dns/proxy
kubernetes-dashboard is running at https://192.168.47.160:5443/api/v1/namespaces/kube-system/services/https:kubernetes-dashboard:/proxy
More cluster information:
[root@cce-21day-cluster-62954-81jwz ~]# kubectl cluster-info dump
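The dump is very long; writing it to files per namespace makes it easier to browse (standard kubectl flags; the output path below is just an example):
$ kubectl cluster-info dump --output-directory=/tmp/cluster-state   # write JSON and pod logs to files
$ kubectl cluster-info dump --namespaces kube-system                # limit the dump to one namespace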
Components deployed as add-ons:
[root@cce-21day-cluster-62954-81jwz ~]# kubectl get pods -n kube-system
NAME READY STATUS RESTARTS AGE
icagent-2nkmn 0/0 Running 0 4d
icagent-z4v6v 0/0 Running 0 4d
kube-dns-7dd858d95b-2p4hs 3/3 Running 18 4d
kube-dns-7dd858d95b-xncrv 3/3 Running 18 4d
kubernetes-dashboard-9bf5c858-z2lvm 1/1 Running 6 4d
storage-driver-25ft5 1/1 Running 6 4d
storage-driver-mkjsz 1/1 Running 6 4d
[root@cce-21day-cluster-62954-81jwz ~]# kubectl get pod kube-dns-7dd858d95b-2p4hs -n kube-system
NAME READY STATUS RESTARTS AGE
kube-dns-7dd858d95b-2p4hs 3/3 Running 18 4d
[root@cce-21day-cluster-62954-81jwz ~]#
[root@cce-21day-cluster-62954-81jwz ~]# kubectl describe pod kube-dns-7dd858d95b-2p4hs -n kube-system
Name: kube-dns-7dd858d95b-2p4hs
Namespace: kube-system
Node: 192.168.162.50/192.168.162.50
Start Time: Fri, 07 Dec 2018 11:01:03 +0800
Labels: k8s-app=kube-dns
kubernetes.io/lock=true
pod-template-hash=3884148516
podaffinity=kube-dns-pod
Annotations: kubernetes.io/availablezone=cn-north-1a
Status: Running
IP: 172.16.0.34
Controlled By: ReplicaSet/kube-dns-7dd858d95b
Init Containers:
bootstrap-kubedns-init:
Container ID: docker://6eb3bb4bcf1fde1b37fe07a50549e22c177a658d4a9ab417d3e95f2db3166a59
Image: euleros:2.2.5
Image ID: docker://sha256:b0f6bcd0a2a09896e27ce8697ae1f96f05137e3761da1e78acaa8a9885fe62fb
Port: <none>
Command:
/bin/sh
-c
touch /var/log/skydns.log; touch /var/log/dnsmasq.log; touch /var/log/exechealthz.log; chown paas:paas /var/log/skydns.log /var/log/dnsmasq.log /var/log/exechealthz.log; chmod 640 /var/log/skydns.log /var/log/dnsmasq.log /var/log/exechealthz.log
State: Terminated
Reason: Completed
Exit Code: 0
Started: Tue, 11 Dec 2018 21:45:39 +0800
Finished: Tue, 11 Dec 2018 21:45:39 +0800
Ready: True
Restart Count: 6
Environment: <none>
Mounts:
/var/log from logfile (rw)
/var/run/secrets/kubernetes.io/serviceaccount from kube-dns-token-tbh9l (ro)
Containers:
kubedns:
Container ID: docker://b11df1c26df85a2551243d026651fa7bcf8a6eedaf4968578042028a78f4991a
Image: cfe-kubedns-amd64:5.12.1
Image ID: docker://sha256:df73fd5fff82a747c877c832337deb2136907ca3d98b3b2669ab58cb8b730c2a
Ports: 10053/UDP, 10053/TCP, 10055/TCP
Command:
/bin/sh
-c
/kube-dns 1>>/var/log/skydns.log 2>&1 --domain=cluster.local. --dns-port=10053 --config-dir=/kube-dns-config --v=2
State: Running
Started: Tue, 11 Dec 2018 21:45:40 +0800
Last State: Terminated
Reason: Error
Exit Code: 255
Started: Tue, 11 Dec 2018 14:21:36 +0800
Finished: Tue, 11 Dec 2018 21:42:55 +0800
Ready: True
Restart Count: 6
Limits:
cpu: 100m
memory: 512Mi
Requests:
cpu: 100m
memory: 100Mi
Liveness: http-get http://:10054/healthcheck/kubedns delay=60s timeout=5s period=10s #success=1 #failure=5
Readiness: http-get http://:8081/readiness delay=3s timeout=5s period=10s #success=1 #failure=3
Environment:
PROMETHEUS_PORT: 10055
PAAS_CRYPTO_PATH: /var/paas/kubernetes/material
Mounts:
/kube-dns-config from kube-dns-config (rw)
/var/log from logfile (rw)
/var/paas from crypto (rw)
/var/paas/kubernetes/material from material (ro)
/var/run/secrets/kubernetes.io/serviceaccount from kube-dns-token-tbh9l (ro)
dnsmasq:
Container ID: docker://3238c48504f79f592a5506440028d347c4fa5b7d8fd3ca0f1be4f4122ca59642
Image: cfe-kube-dnsmasq-amd64:5.12.1
Image ID: docker://sha256:76253c3f68bdbbf0faf0f0408943276042c2dae2e288eaf265f3873cef801cfe
Ports: 5353/UDP, 5353/TCP
Command:
/bin/sh
-c
/dnsmasq-nanny 1>>/var/log/dnsmasq.log 2>&1 -v=2 -logtostderr -configDir=/etc/k8s/dns/dnsmasq-nanny -restartDnsmasq=true -- -k --port=5353 --cache-size=1000 --log-facility=- --server=/cluster.local/127.0.0.1#10053 --server=/in-addr.arpa/127.0.0.1#10053 --server=/ip6.arpa/127.0.0.1#10053
State: Running
Started: Tue, 11 Dec 2018 21:45:41 +0800
Last State: Terminated
Reason: Error
Exit Code: 255
Started: Tue, 11 Dec 2018 14:21:37 +0800
Finished: Tue, 11 Dec 2018 21:42:54 +0800
Ready: True
Restart Count: 6
Limits:
cpu: 150m
memory: 150Mi
Requests:
cpu: 150m
memory: 150Mi
Liveness: http-get http://:10054/healthcheck/dnsmasq delay=60s timeout=5s period=10s #success=1 #failure=5
Environment: <none>
Mounts:
/etc/k8s/dns/dnsmasq-nanny from kube-dns-config (rw)
/var/log from logfile (rw)
/var/run/secrets/kubernetes.io/serviceaccount from kube-dns-token-tbh9l (ro)
sidecar:
Container ID: docker://e98f61ac5dbe3e0c37175bb3fa72827a6a5ab2d99edff71bc4666e1123aa9f35
Image: cfe-exechealthz-amd64:5.12.1
Image ID: docker://sha256:5768a72e5c1ab85982bd832dd983611046be24e4eae95aa3bfd95b0051acc424
Port: 10054/TCP
Command:
/bin/sh
-c
/sidecar 1>>/var/log/exechealthz.log 2>&1 --v=2 --logtostderr --probe=kubedns,127.0.0.1:10053,kubernetes.default.svc.cluster.local,5,A --probe=dnsmasq,127.0.0.1:5353,kubernetes.default.svc.cluster.local,5,A
State: Running
Started: Tue, 11 Dec 2018 21:45:42 +0800
Last State: Terminated
Reason: Error
Exit Code: 255
Started: Tue, 11 Dec 2018 14:21:37 +0800
Finished: Tue, 11 Dec 2018 21:42:54 +0800
Ready: True
Restart Count: 6
Limits:
cpu: 50m
memory: 50Mi
Requests:
cpu: 50m
memory: 50Mi
Liveness: http-get http://:10054/metrics delay=60s timeout=5s period=10s #success=1 #failure=5
Environment: <none>
Mounts:
/var/log from logfile (rw)
/var/run/secrets/kubernetes.io/serviceaccount from kube-dns-token-tbh9l (ro)
Conditions:
Type Status
Initialized True
Ready True
PodScheduled True
Volumes:
crypto:
Type: EmptyDir (a temporary directory that shares a pod's lifetime)
Medium:
material:
Type: HostPath (bare host directory volume)
Path: /var/paas/srv/kubernetes
HostPathType:
logfile:
Type: HostPath (bare host directory volume)
Path: /var/paas/sys/log/kubernetes
HostPathType:
kube-dns-config:
Type: ConfigMap (a volume populated by a ConfigMap)
Name: kube-dns
Optional: true
kube-dns-token-tbh9l:
Type: Secret (a volume populated by a Secret)
SecretName: kube-dns-token-tbh9l
Optional: false
QoS Class: Burstable
Node-Selectors: <none>
Tolerations: node.kubernetes.io/not-ready:NoExecute for 300s
node.kubernetes.io/unreachable:NoExecute for 300s
Events:
Type Reason Age From Message
---- ------ ---- ---- -------
Normal SuccessfulMountVolume 7h kubelet, 192.168.162.50 MountVolume.SetUp succeeded for volume "crypto"
Normal SuccessfulMountVolume 7h kubelet, 192.168.162.50 MountVolume.SetUp succeeded for volume "material"
Normal SuccessfulMountVolume 7h kubelet, 192.168.162.50 MountVolume.SetUp succeeded for volume "logfile"
Normal SuccessfulMountVolume 7h kubelet, 192.168.162.50 MountVolume.SetUp succeeded for volume "kube-dns-token-tbh9l"
Normal SuccessfulMountVolume 7h kubelet, 192.168.162.50 MountVolume.SetUp succeeded for volume "kube-dns-config"
Normal SuccessfulMountVolume 7h kubelet, 192.168.162.50 Successfully mounted volumes for pod "kube-dns-7dd858d95b-2p4hs_kube-system(55a9bbc1-f9cc-11e8-a390-fa163e2c8c71)"
Normal SandboxChanged 7h (x5 over 7h) kubelet, 192.168.162.50 Pod sandbox changed, it will be killed and re-created.
Normal Pulled 7h kubelet, 192.168.162.50 Container image "euleros:2.2.5" already present on machine
Normal SuccessfulCreate 7h kubelet, 192.168.162.50 Created container
Normal Started 7h kubelet, 192.168.162.50 Started container
Normal Pulled 7h kubelet, 192.168.162.50 Container image "cfe-kubedns-amd64:5.12.1" already present on machine
Normal SuccessfulCreate 7h kubelet, 192.168.162.50 Created container
Normal Started 7h kubelet, 192.168.162.50 Started container
Normal Pulled 7h kubelet, 192.168.162.50 Container image "cfe-kube-dnsmasq-amd64:5.12.1" already present on machine
Normal SuccessfulCreate 7h kubelet, 192.168.162.50 Created container
Normal Started 7h kubelet, 192.168.162.50 Started container
Normal Pulled 7h kubelet, 192.168.162.50 Container image "cfe-exechealthz-amd64:5.12.1" already present on machine
Normal SuccessfulCreate 7h kubelet, 192.168.162.50 Created container
Normal Started 7h kubelet, 192.168.162.50 Started container
Normal Healthy 7h (x2 over 7h) kubelet, 192.168.162.50 container docker://ddf231f2ca34df7945528a92cc344756ef7198e0e4bf4828062411c61eac0d40 in health status
Normal SuccessfulMountVolume 6m kubelet, 192.168.162.50 Successfully mounted volumes for pod "kube-dns-7dd858d95b-2p4hs_kube-system(55a9bbc1-f9cc-11e8-a390-fa163e2c8c71)"
Normal SuccessfulMountVolume 6m kubelet, 192.168.162.50 MountVolume.SetUp succeeded for volume "kube-dns-config"
Normal SuccessfulMountVolume 6m kubelet, 192.168.162.50 MountVolume.SetUp succeeded for volume "material"
Normal SuccessfulMountVolume 6m kubelet, 192.168.162.50 MountVolume.SetUp succeeded for volume "logfile"
Normal SuccessfulMountVolume 6m kubelet, 192.168.162.50 MountVolume.SetUp succeeded for volume "kube-dns-token-tbh9l"
Normal SuccessfulMountVolume 6m kubelet, 192.168.162.50 MountVolume.SetUp succeeded for volume "crypto"
Normal SandboxChanged 6m (x5 over 6m) kubelet, 192.168.162.50 Pod sandbox changed, it will be killed and re-created.
Normal Pulled 5m kubelet, 192.168.162.50 Container image "euleros:2.2.5" already present on machine
Normal SuccessfulCreate 5m kubelet, 192.168.162.50 Created container
Normal Started 5m kubelet, 192.168.162.50 Started container
Normal Pulled 5m kubelet, 192.168.162.50 Container image "cfe-kubedns-amd64:5.12.1" already present on machine
Normal SuccessfulCreate 5m kubelet, 192.168.162.50 Created container
Normal Started 5m kubelet, 192.168.162.50 Started container
Normal Pulled 5m kubelet, 192.168.162.50 Container image "cfe-kube-dnsmasq-amd64:5.12.1" already present on machine
Normal SuccessfulCreate 5m kubelet, 192.168.162.50 Created container
Normal Started 5m kubelet, 192.168.162.50 Started container
Normal Pulled 5m kubelet, 192.168.162.50 Container image "cfe-exechealthz-amd64:5.12.1" already present on machine
Normal SuccessfulCreate 5m kubelet, 192.168.162.50 Created container
Normal Started 5m kubelet, 192.168.162.50 Started container
Normal Healthy 4m (x2 over 5m) kubelet, 192.168.162.50 container docker://b11df1c26df85a2551243d026651fa7bcf8a6eedaf4968578042028a78f4991a in health status
[root@cce-21day-cluster-62954-81jwz ~]#
Component metrics:
$ curl localhost:10250/stats/summary
Component health status:
$ curl localhost:10250/healthz
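On clusters where the kubelet's port 10250 is served over TLS and requires authentication, roughly the following works instead (the token secret in braces is a placeholder for any service-account token with sufficient RBAC rights):
$ TOKEN=$(kubectl -n kube-system get secret {service account token secret} -o jsonpath='{.data.token}' | base64 -d)
$ curl -sk -H "Authorization: Bearer $TOKEN" https://localhost:10250/stats/summary
$ curl -sk -H "Authorization: Bearer $TOKEN" https://localhost:10250/healthz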
Monitoring cluster components with Heapster + cAdvisor
After the cluster is integrated with Heapster or metrics-server,
show Node CPU/memory/storage resource consumption:
$ kubectl top node {node name}
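Illustrative kubectl top node output (the values are invented; the column layout is what the command prints; 192.168.162.50 is the node seen in the describe output above, the second address is hypothetical):
$ kubectl top node
NAME             CPU(cores)   CPU%   MEMORY(bytes)   MEMORY%
192.168.162.50   120m         6%     1226Mi          32%
192.168.47.227   98m          4%     1034Mi          27%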
cAdvisor collects per-container CPU, memory, filesystem, and network usage statistics, and also gathers node-level resource usage.
Neither cAdvisor nor Heapster provides data storage, trend analysis, or alerting, so the data still has to be pushed to a backend such as InfluxDB for storage and Grafana for visualization (a sketch follows below).
Heapster is being deprecated in favor of metrics-server.
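For reference, a minimal sketch of the Heapster container args that push metrics into InfluxDB, following the upstream heapster+influxdb example (the image tag and the monitoring-influxdb service name are assumptions):
containers:
- name: heapster
  image: k8s.gcr.io/heapster-amd64:v1.5.4   # illustrative tag
  command:
  - /heapster
  - --source=kubernetes:https://kubernetes.default
  - --sink=influxdb:http://monitoring-influxdb.kube-system.svc:8086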
Kubernetes Dashboard UI
Kubernetes Dashboard is used to monitor and display all Kubernetes resource objects (access via kubectl proxy is sketched after this list):
Cluster (Node, PV, etc.)
Workload (Pod, Deployment, etc.)
Config (ConfigMap, Secret, etc.)
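A common way to reach the Dashboard is through kubectl proxy, using the proxy path that kubectl cluster-info printed earlier (8001 is the default kubectl proxy port):
$ kubectl proxy
# then open in a browser:
# http://127.0.0.1:8001/api/v1/namespaces/kube-system/services/https:kubernetes-dashboard:/proxy/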
3. Monitoring Applications
$ kubectl describe pod {pod name}
After integrating with Heapster or metrics-server, show Pod CPU/memory/storage resource consumption:
$ kubectl top pod {pod name}
$ kubectl top pods --namespace backyard
$ kubectl get pod {pod name} --watch
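Illustrative kubectl top pod output (the numbers are invented; the columns are what the command prints; pod names are taken from the kube-system listing above):
$ kubectl top pods -n kube-system
NAME                                  CPU(cores)   MEMORY(bytes)
kube-dns-7dd858d95b-2p4hs             5m           48Mi
kubernetes-dashboard-9bf5c858-z2lvm   1m           25Mi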
4. Managing K8s Component Logs
Component logs:
/var/log/kube-apiserver.log
/var/log/kube-proxy.log
/var/log/kube-controller-manager.log
/var/log/kubelet.log
If the components are managed by systemd:
$ journalctl -u kubelet
If the components are deployed as K8s add-on pods:
$ kubectl logs -f kube-proxy
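Add-on pods live in the kube-system namespace and carry a generated name suffix, so a common pattern is to look the pod up first (placeholders in braces as elsewhere in these notes):
$ kubectl get pods -n kube-system -o name | grep kube-proxy
$ kubectl logs -f -n kube-system {kube-proxy pod name}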
5. Managing K8s Application Logs
# Capture from the container's standard output (common filtering options are listed after this section):
$ kubectl logs -f {pod name} -c {container name}
$ docker logs -f {docker name}
# Mount the log file to a host directory:
apiVersion: v1
kind: Pod
metadata:
  name: test-pd
spec:
  containers:
  - image: gcr.io/google_containers/test-webserver
    name: test-container
    volumeMounts:
    - mountPath: /log
      name: log-volume
  volumes:
  - name: log-volume
    hostPath:
      # directory location on host
      path: /var/k8s/log
# Enter the container and inspect logs directly:
$ kubectl exec -it {pod} -c {container} /bin/sh
$ docker exec -it {container} /bin/sh
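A few kubectl logs options that are often useful when filtering standard-output logs (standard kubectl flags):
$ kubectl logs {pod name} --tail=100              # last 100 lines only
$ kubectl logs {pod name} --since=1h              # only logs from the last hour
$ kubectl logs {pod name} -c {container name} -p  # previous (crashed) container instance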
6. Deployment Upgrade and Rollback - 1
# Create a Deployment:
$ kubectl run {deployment} --image={image} --replicas={rep.}
# Or use a YAML file, focusing on the replicas and image fields (a minimal sketch follows at the end of this section).
# Upgrade the Deployment:
$ kubectl set image deployment/nginx-deployment nginx=nginx:1.9.1
$ kubectl set resources deployment/nginx-deployment -c=nginx --limits=cpu=200m,memory=512Mi
# Upgrade strategy:
minReadySeconds: 5
strategy:
  type: RollingUpdate
  rollingUpdate:
    maxSurge: 1        # default 25%
    maxUnavailable: 1  # default 25%
6. Deployment Upgrade and Rollback - 2
# Pause a Deployment rollout:
$ kubectl rollout pause deployment/nginx-deployment
# Resume a Deployment rollout:
$ kubectl rollout resume deployment/nginx-deployment
# Check rollout status:
$ kubectl rollout status deployment/nginx-deployment
# Check rollout history:
$ kubectl rollout history deploy/nginx-deployment
$ kubectl rollout history deploy/nginx-deployment --revision=2
# Roll back:
$ kubectl rollout undo deployment/nginx-deployment --to-revision=2
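A minimal sketch of the YAML-file form mentioned above, with the replicas, image, and rolling-update fields filled in (the name nginx-deployment and the image tag are just examples):
apiVersion: apps/v1
kind: Deployment
metadata:
  name: nginx-deployment
spec:
  replicas: 3
  minReadySeconds: 5
  strategy:
    type: RollingUpdate
    rollingUpdate:
      maxSurge: 1
      maxUnavailable: 1
  selector:
    matchLabels:
      app: nginx
  template:
    metadata:
      labels:
        app: nginx
    spec:
      containers:
      - name: nginx
        image: nginx:1.9.1
        ports:
        - containerPort: 80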
7. Application Elastic Scaling
$ kubectl scale deployment nginx-deployment --replicas=10
# After integrating with Heapster and linking it with the HPA:
$ kubectl autoscale deployment nginx-deployment --min=10 --max=15 --cpu-percent=80
8. Application Self-Healing: restartPolicy + livenessProbe
Pod restart policy: Always, OnFailure, Never
livenessProbe: http/https GET, shell exec, tcpSocket (an httpGet variant is sketched after the example below)
# Example: tcpSocket liveness probe + Always restart policy
apiVersion: v1
kind: Pod
metadata:
  name: goproxy
spec:
  restartPolicy: Always
  containers:
  - name: goproxy
    image: k8s.gcr.io/goproxy:0.1
    ports:
    - containerPort: 8080
    livenessProbe:
      tcpSocket:
        port: 8080
      initialDelaySeconds: 15
      periodSeconds: 20
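The example above uses a tcpSocket probe; for the httpGet variant listed among the probe types, the livenessProbe block would be replaced with something like the following sketch (the /healthz path and port 8080 are illustrative):
livenessProbe:
  httpGet:
    path: /healthz
    port: 8080
  initialDelaySeconds: 15
  periodSeconds: 20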