2.3、Monitoring Kubernetes Clusters with Prometheus
1、Deploying Prometheus on Kubernetes
1.1、Managing Application Configuration with ConfigMaps
Prometheus reads its configuration from prometheus.yml; on Kubernetes the file is conveniently kept in a ConfigMap:
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s
    scrape_configs:
    - job_name: 'prometheus'
      static_configs:
      - targets: ['localhost:9090']
kubectl create -f prometheus-config.yml
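Assuming the manifest above was saved as prometheus-config.yml, the resulting ConfigMap can be inspected before wiring it into a Pod:

kubectl get configmap prometheus-config -o yaml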
1.2、Deploying Prometheus with a Deployment
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  labels:
    name: prometheus
spec:
  ports:
  - name: prometheus
    protocol: TCP
    port: 9090
    targetPort: 9090
  selector:
    app: prometheus
  type: NodePort
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    name: prometheus
  name: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      containers:
      - name: prometheus
        image: prom/prometheus:v2.2.1
        command:
        - "/bin/prometheus"
        args:
        - "--config.file=/etc/prometheus/prometheus.yml"
        ports:
        - containerPort: 9090
          protocol: TCP
        volumeMounts:
        - mountPath: "/etc/prometheus"
          name: prometheus-config
      volumes:
      - name: prometheus-config
        configMap:
          name: prometheus-config
kubectl create -f prometheus-deployment.yml
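Once the Deployment and Service are created, check that the Pod is running and find the NodePort that was assigned. The last command applies only if the cluster is minikube, as the node metadata later in this section suggests:

kubectl get pods -l app=prometheus
kubectl get svc prometheus
minikube service prometheus --url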
2、Service Discovery in Kubernetes
2.1、Kubernetes Access Authorization
For service discovery, Prometheus needs read access to the Kubernetes API, which is granted through the RBAC objects below:
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups: [""]
  resources:
  - nodes
  - nodes/proxy
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
- apiGroups:
  - extensions
  resources:
  - ingresses
  verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics"]
  verbs: ["get"]
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: default
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: default
kubectl apply -f prometheus-rbac-setup.yml
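The created objects can be verified before updating the Deployment:

kubectl get clusterrole prometheus
kubectl get serviceaccount prometheus
kubectl get clusterrolebinding prometheus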
Then reference the ServiceAccount in the Prometheus Deployment spec (only the changed part is shown):

spec:
  replicas: 1
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
kubectl apply -f prometheus-deployment.yml
kubectl exec -it prometheus-69f9ddb588-czn2c -- ls /var/run/secrets/kubernetes.io/serviceaccount/
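The projected ServiceAccount volume should contain the CA bundle and bearer token that the scrape configurations below refer to:

ca.crt  namespace  token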
2.2、Service Discovery
Each discovery job authenticates to the API server with the Pod's ServiceAccount credentials; node discovery, for example:

- job_name: 'kubernetes-nodes'
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  kubernetes_sd_configs:
  - role: node

The full ConfigMap, with one job per discovery role:
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
data:
  prometheus.yml: |-
    global:
      scrape_interval: 15s
      evaluation_interval: 15s
    scrape_configs:
    - job_name: 'kubernetes-nodes'
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      kubernetes_sd_configs:
      - role: node
    - job_name: 'kubernetes-service'
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      kubernetes_sd_configs:
      - role: service
    - job_name: 'kubernetes-endpoints'
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      kubernetes_sd_configs:
      - role: endpoints
    - job_name: 'kubernetes-ingress'
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      kubernetes_sd_configs:
      - role: ingress
    - job_name: 'kubernetes-pods'
      tls_config:
        ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
      bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
      kubernetes_sd_configs:
      - role: pod
Apply the updated ConfigMap, then delete the Prometheus Pod so the Deployment recreates it with the new configuration (Prometheus only loads its configuration at startup unless explicitly reloaded):
kubectl apply -f prometheus-config.yml
kubectl get pods
kubectl delete pods prometheus-69f9ddb588-rbrs2
kubectl get pods
Each discovered node target carries metadata labels describing the node, for example:

__address__="192.168.99.100:10250"
__meta_kubernetes_node_address_Hostname="minikube"
__meta_kubernetes_node_address_InternalIP="192.168.99.100"
__meta_kubernetes_node_annotation_alpha_kubernetes_io_provided_node_ip="192.168.99.100"
__meta_kubernetes_node_annotation_node_alpha_kubernetes_io_ttl="0"
__meta_kubernetes_node_annotation_volumes_kubernetes_io_controller_managed_attach_detach="true"
__meta_kubernetes_node_label_beta_kubernetes_io_arch="amd64"
__meta_kubernetes_node_label_beta_kubernetes_io_os="linux"
__meta_kubernetes_node_label_kubernetes_io_hostname="minikube"
__meta_kubernetes_node_name="minikube"
__metrics_path__="/metrics"
__scheme__="https"
instance="minikube"
job="kubernetes-nodes"
2.3、Monitoring the Kubernetes Cluster with Prometheus
Node and kubelet runtime state can be scraped directly from each kubelet's HTTPS endpoint:

- job_name: 'kubernetes-kubelet'
  scheme: https
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  kubernetes_sd_configs:
  - role: node
  relabel_configs:
  - action: labelmap
    regex: __meta_kubernetes_node_label_(.+)
On clusters whose kubelet certificates lack IP SANs (minikube, for example), the scrape fails with:
Get https://192.168.99.100:10250/metrics: x509: cannot validate certificate for 192.168.99.100 because it doesn't contain any IP SANs
One workaround is to skip certificate verification:

- job_name: 'kubernetes-kubelet'
  scheme: https
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    insecure_skip_verify: true
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  kubernetes_sd_configs:
  - role: node
  relabel_configs:
  - action: labelmap
    regex: __meta_kubernetes_node_label_(.+)
Alternatively, scrape the kubelets through the API server proxy, which keeps TLS verification intact:

- job_name: 'kubernetes-kubelet'
  scheme: https
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  kubernetes_sd_configs:
  - role: node
  relabel_configs:
  - action: labelmap
    regex: __meta_kubernetes_node_label_(.+)
  - target_label: __address__
    replacement: kubernetes.default.svc:443
  - source_labels: [__meta_kubernetes_node_name]
    regex: (.+)
    target_label: __metrics_path__
    replacement: /api/v1/nodes/${1}/proxy/metrics
With either variant in place, kubelet metrics become queryable in the Prometheus UI, e.g. the 99th-percentile Pod start latency:

kubelet_pod_start_latency_microseconds{quantile="0.99"}
cAdvisor is built into the kubelet and exposes container metrics under /metrics/cadvisor; these can likewise be collected through the API server proxy:

- job_name: 'kubernetes-cadvisor'
  scheme: https
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  kubernetes_sd_configs:
  - role: node
  relabel_configs:
  - target_label: __address__
    replacement: kubernetes.default.svc:443
  - source_labels: [__meta_kubernetes_node_name]
    regex: (.+)
    target_label: __metrics_path__
    replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
  - action: labelmap
    regex: __meta_kubernetes_node_label_(.+)
Or scrape the kubelets directly, again skipping certificate verification:

- job_name: 'kubernetes-cadvisor'
  scheme: https
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
    insecure_skip_verify: true
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  kubernetes_sd_configs:
  - role: node
  relabel_configs:
  - source_labels: [__meta_kubernetes_node_name]
    regex: (.+)
    target_label: __metrics_path__
    replacement: /metrics/cadvisor
  - action: labelmap
    regex: __meta_kubernetes_node_label_(.+)
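As a quick check that cAdvisor data is flowing, a PromQL sketch like the following sums per-Pod container CPU usage; the grouping label was pod_name on Kubernetes of this vintage (newer releases renamed it to pod), so adjust it to your cluster:

sum(rate(container_cpu_usage_seconds_total[5m])) by (pod_name)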
Monitoring Cluster Resource Usage with Node Exporter
To collect resource usage from every node in the cluster, a Node Exporter instance must run on each node, which is exactly what a DaemonSet provides.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  name: prometheus-node-exporter
  namespace: kube-system
  labels:
    app: prometheus-node-exporter
spec:
  selector:
    matchLabels:
      app: prometheus-node-exporter
  template:
    metadata:
      name: prometheus-node-exporter
      labels:
        app: prometheus-node-exporter
    spec:
      containers:
      - image: prom/node-exporter:v0.16.0
        imagePullPolicy: IfNotPresent
        name: prometheus-node-exporter
        ports:
        - name: prom-node-exp
          containerPort: 9100
          hostPort: 9100
      tolerations:
      - key: "node-role.kubernetes.io/master"
        effect: "NoSchedule"
      hostNetwork: true
      hostPID: true
      hostIPC: true
      restartPolicy: Always
---
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/scrape: 'true'
    prometheus.io/app-metrics: 'true'
    prometheus.io/app-metrics-path: '/metrics'
  name: prometheus-node-exporter
  namespace: kube-system
  labels:
    app: prometheus-node-exporter
spec:
  clusterIP: None
  ports:
  - name: prometheus-node-exporter
    port: 9100
    protocol: TCP
  selector:
    app: prometheus-node-exporter
  type: ClusterIP
Resources that already implement a metrics endpoint can be added to monitoring automatically via annotations:
- Pod resources
  - prometheus.io/scrape=true
  - prometheus.io/path=/metrics
  - prometheus.io/port=8080
- Service resources
  - prometheus.io/probe
- Endpoints resources
  - prometheus.io/scrape
  - prometheus.io/path
  - prometheus.io/port
Because Node Exporter needs access to the host, hostNetwork and hostPID are set so the Pod shares the host's network and process namespaces.
kubectl create -f node-exporter-daemonset.yml
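To verify the rollout, check that the DaemonSet has one Pod per node and that the exporter answers on the host port (the IP below is the minikube node address used throughout this section; substitute your own node IP):

kubectl get daemonset prometheus-node-exporter -n kube-system
curl http://192.168.99.100:9100/metrics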
Configure Prometheus to scrape the node-exporter metrics:
- job_name: 'prometheus-node-exporter'
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  kubernetes_sd_configs:
  - role: endpoints
  relabel_configs:
  # Keep only endpoints whose Service is annotated prometheus.io/scrape: 'true'
  # and whose port is named prometheus-node-exporter
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scrape, __meta_kubernetes_endpoint_port_name]
    regex: true;prometheus-node-exporter
    action: keep
  # Match regex against the concatenated source_labels. Then, set target_label to replacement,
  # with match group references (${1}, ${2}, ...) in replacement substituted by their value.
  # If regex does not match, no replacement takes place.
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_scheme]
    action: replace
    target_label: __scheme__
    regex: (https?)
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_path]
    action: replace
    target_label: __metrics_path__
    regex: (.+)
  - source_labels: [__address__, __meta_kubernetes_service_annotation_prometheus_io_port]
    action: replace
    target_label: __address__
    regex: (.+)(?::\d+);(\d+)
    replacement: $1:$2
  # Strip the __meta_kubernetes_service_label_ prefix from label names
  - action: labelmap
    regex: __meta_kubernetes_service_label_(.+)
  # Copy __meta_kubernetes_namespace into kubernetes_namespace
  - source_labels: [__meta_kubernetes_namespace]
    action: replace
    target_label: kubernetes_namespace
  # Copy __meta_kubernetes_service_name into kubernetes_name
  - source_labels: [__meta_kubernetes_service_name]
    action: replace
    target_label: kubernetes_name
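Since node_exporter v0.16.0 renamed node_cpu to node_cpu_seconds_total, per-node CPU utilization can be sketched in PromQL as:

1 - avg(rate(node_cpu_seconds_total{mode="idle"}[5m])) by (instance)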
Collecting Cluster Monitoring Metrics from kube-apiserver
kube-apiserver is the management entry point of the whole Kubernetes cluster and is responsible for exposing the Kubernetes API. The kube-apiserver component generally runs outside the cluster's Pod network; so that applications inside the cluster (Kubernetes add-ons or user workloads) can talk to it, Kubernetes creates a Service named kubernetes in the default namespace.
The actual backend addresses that this kubernetes Service proxies are maintained through its Endpoints object.
In this way, applications inside the cluster can reach the externally deployed kube-apiserver instance via the cluster-internal DNS name kubernetes.default.svc.
Therefore, to monitor kube-apiserver metrics, we only need to find all backend addresses of the kubernetes Service through the endpoints resource.
As shown below, create a job named kubernetes-apiservers with the service-discovery role set to endpoints. Prometheus enumerates all Endpoints in the cluster and uses relabeling to keep only the addresses that belong to the apiserver:
- job_name: 'kubernetes-apiservers'
  # Scrape the apiserver over HTTPS
  scheme: https
  tls_config:
    ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
  bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
  # Discover targets by Kubernetes role (node, service, pod, endpoints, ingress, ...)
  kubernetes_sd_configs:
  # Read apiserver addresses from endpoints
  - role: endpoints
  # relabel_configs allows any target and its labels to be modified before scraping
  relabel_configs:
  # Concatenate these labels...
  - source_labels: [__meta_kubernetes_namespace, __meta_kubernetes_service_name, __meta_kubernetes_endpoint_port_name]
    # ...and require their values to match this regex
    regex: default;kubernetes;https
    # Keep only endpoints whose source labels match the regex
    action: keep
  - target_label: __address__
    replacement: kubernetes.default.svc:443
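As a sanity check once the job is up, apiserver request throughput can be queried. The metric name below is the one exposed by apiservers of this Kubernetes generation (later releases renamed it apiserver_request_total), so treat it as version-dependent:

sum(rate(apiserver_request_count[5m])) by (verb)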
Network Probing of Ingress and Service
To probe Ingress and Service objects, a Blackbox Exporter instance must be deployed in the cluster. As shown below, create blackbox-exporter.yaml to describe the deployment:
apiVersion: apps/v1
kind: Deployment
metadata:
  name: prometheus-blackbox-exporter
  namespace: kube-system
spec:
  selector:
    matchLabels:
      app: prometheus-blackbox-exporter
  replicas: 1
  template:
    metadata:
      labels:
        app: prometheus-blackbox-exporter
    spec:
      restartPolicy: Always
      containers:
      - name: prometheus-blackbox-exporter
        image: prom/blackbox-exporter:v0.12.0
        imagePullPolicy: IfNotPresent
        ports:
        - name: blackbox-port
          containerPort: 9115
        readinessProbe:
          tcpSocket:
            port: 9115
          initialDelaySeconds: 5
          timeoutSeconds: 5
        resources:
          requests:
            memory: 50Mi
            cpu: 100m
          limits:
            memory: 60Mi
            cpu: 200m
        volumeMounts:
        - name: config
          mountPath: /etc/blackbox_exporter
        args:
        - --config.file=/etc/blackbox_exporter/blackbox.yml
        - --log.level=debug
        - --web.listen-address=:9115
      volumes:
      - name: config
        configMap:
          name: prometheus-blackbox-exporter
      nodeSelector:
        node-role.kubernetes.io/master: "true"
      tolerations:
      - key: "node-role.kubernetes.io/master"
        effect: "NoSchedule"
---
apiVersion: v1
kind: Service
metadata:
  labels:
    app: prometheus-blackbox-exporter
  name: prometheus-blackbox-exporter
  namespace: kube-system
  annotations:
    prometheus.io/scrape: 'true'
spec:
  type: NodePort
  selector:
    app: prometheus-blackbox-exporter
  ports:
  - name: blackbox
    port: 9115
    targetPort: 9115
    nodePort: 30009
    protocol: TCP
Deploy the Blackbox Exporter with kubectl. This creates one Blackbox Exporter Pod and exposes it inside the cluster through the Service at the internal DNS name prometheus-blackbox-exporter.kube-system.svc.cluster.local (derived from the Service name and namespace in the manifests above), so any service in the cluster can reach the exporter through that name.
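Note that the Deployment mounts a ConfigMap named prometheus-blackbox-exporter which this section does not show. A minimal sketch defining the http_2xx module used by the probe jobs below could look like this (the timeout value is an assumption):

apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-blackbox-exporter
  namespace: kube-system
data:
  blackbox.yml: |-
    modules:
      http_2xx:
        # Probe over HTTP and expect a 2xx response
        prober: http
        timeout: 10s
        http:
          method: GET

With both manifests in place, deploy them:

kubectl apply -f blackbox-exporter.yaml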
To let Prometheus probe Services automatically, we use service discovery to find all Service objects. As shown below, add a scrape job named kubernetes-services to the Prometheus configuration.
- job_name: 'kubernetes-services'
  metrics_path: /probe
  params:
    module: [http_2xx]
  kubernetes_sd_configs:
  - role: service
  relabel_configs:
  - source_labels: [__meta_kubernetes_service_annotation_prometheus_io_probe]
    action: keep
    regex: true
  - source_labels: [__address__]
    target_label: __param_target
  - target_label: __address__
    replacement: prometheus-blackbox-exporter.kube-system.svc.cluster.local:9115
  - source_labels: [__param_target]
    target_label: instance
  - action: labelmap
    regex: __meta_kubernetes_service_label_(.+)
  - source_labels: [__meta_kubernetes_namespace]
    target_label: kubernetes_namespace
  - source_labels: [__meta_kubernetes_service_name]
    target_label: kubernetes_name
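Only Services annotated with prometheus.io/probe: 'true' are kept; a hypothetical Service my-service can be opted in like this:

kubectl annotate service my-service prometheus.io/probe=true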
Probing Ingress objects follows a similar pattern; the Prometheus job configuration below can serve as a reference:
- job_name: 'kubernetes-ingresses'
  metrics_path: /probe
  params:
    module: [http_2xx]
  kubernetes_sd_configs:
  - role: ingress
  relabel_configs:
  - source_labels: [__meta_kubernetes_ingress_annotation_prometheus_io_probe]
    action: keep
    regex: true
  - source_labels: [__meta_kubernetes_ingress_scheme,__address__,__meta_kubernetes_ingress_path]
    regex: (.+);(.+);(.+)
    replacement: ${1}://${2}${3}
    target_label: __param_target
  - target_label: __address__
    replacement: prometheus-blackbox-exporter.kube-system.svc.cluster.local:9115
  - source_labels: [__param_target]
    target_label: instance
  - action: labelmap
    regex: __meta_kubernetes_ingress_label_(.+)
  - source_labels: [__meta_kubernetes_namespace]
    target_label: kubernetes_namespace
  - source_labels: [__meta_kubernetes_ingress_name]
    target_label: kubernetes_name
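Probe results surface as Blackbox Exporter metrics; failing targets can be listed in the Prometheus UI with:

probe_success == 0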