1、创建名称空间
kubectl create ns monitoring
2、创建RBAC角色认证
---
# ClusterRole: read-only access for Prometheus service discovery and
# scraping (nodes, services, endpoints, pods, ingresses, /metrics).
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/proxy
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  - apiGroups:
      # "extensions" is removed in Kubernetes >= 1.22; the
      # networking.k8s.io group is included for newer clusters.
      - extensions
      - networking.k8s.io
    resources:
      - ingresses
    verbs: ["get", "list", "watch"]
  - nonResourceURLs: ["/metrics"]
    verbs: ["get"]
---
# ServiceAccount used by the Prometheus Deployment below.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: monitoring
---
# Bind the ClusterRole to the ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: monitoring
3、部署prometheus无状态服务,生产环境建议创建为有状态服务
---
# Alerting rules mounted into the Prometheus pod at
# /opt/prometheus/alert-linux.yaml (picked up by the rule_files glob
# in prometheus-config).
kind: ConfigMap
apiVersion: v1
metadata:
  name: alert-rules
  namespace: monitoring
data:
  alert-linux.yaml: |-
    groups:
      - name: instances
        rules:
          - alert: InstanceDown
            expr: up == 0
            for: 1m
            annotations:
              title: 'Instance down'
              # Fixed: original had an unbalanced trailing quote.
              description: 'Instance has been down for more than 1 minute.'
            labels:
              severity: 'critical'
      - name: cpu_alert
        rules:
          - alert: cpu_alert
            # CPU usage = 100 - (idle fraction * 100), per instance.
            expr: 100 - avg(irate(node_cpu_seconds_total{mode="idle"}[1m])) by (instance) * 100 > 80
            for: 5m
            labels:
              level: warning
            annotations:
              description: "instance: {{ $labels.instance }} ,cpu usage is too high ! value: {{$value}}"
              summary: "cpu usage is too high"
      - name: mem_alert
        rules:
          - alert: mem_alert
            # NOTE(review): the original expression used Grafana dashboard
            # variables ($node/$job) as label matchers; those never match
            # inside Prometheus rule files, so the rule could never fire.
            # The matchers were dropped so the rule covers all targets.
            expr: (node_memory_MemTotal_bytes - node_memory_MemFree_bytes) / node_memory_MemTotal_bytes * 100 > 90
            for: 5m
            labels:
              level: warning
            annotations:
              description: "instance: {{ $labels.instance }} ,memory usage is too high ! value: {{$value}}"
              summary: "memory usage is too high"
      - name: disk_alert
        rules:
          - alert: disk_alert
            # Same fix as mem_alert: $node/$job matchers removed.
            expr: 100 - (node_filesystem_avail_bytes{mountpoint="/",fstype!="rootfs"} * 100) / node_filesystem_size_bytes{mountpoint="/",fstype!="rootfs"} > 85
            for: 5m
            labels:
              level: warning
            annotations:
              description: "instance: {{ $labels.instance }} ,disk usage is too high ! value: {{$value}}"
              summary: "disk usage is too high"
---
# Main Prometheus configuration: cadvisor via the API-server proxy,
# kube-state-metrics via endpoints discovery, node-exporter via a
# static Service target, plus Alertmanager wiring and rule files.
apiVersion: v1
kind: ConfigMap
metadata:
  name: prometheus-config
  namespace: monitoring
data:
  prometheus.yml: |
    global:
      scrape_interval: 15s
      evaluation_interval: 15s
    scrape_configs:
      - job_name: 'kubernetes-cadvisor'
        kubernetes_sd_configs:
          - role: node
        scheme: https
        tls_config:
          ca_file: /var/run/secrets/kubernetes.io/serviceaccount/ca.crt
        bearer_token_file: /var/run/secrets/kubernetes.io/serviceaccount/token
        relabel_configs:
          - action: labelmap
            regex: __meta_kubernetes_node_label_(.+)
          # Scrape through the API server proxy instead of the kubelet
          # directly.
          - target_label: __address__
            replacement: kubernetes.default.svc:443
          - source_labels: [__meta_kubernetes_node_name]
            regex: (.+)
            target_label: __metrics_path__
            replacement: /api/v1/nodes/${1}/proxy/metrics/cadvisor
      - job_name: kube-state-metrics
        kubernetes_sd_configs:
          - role: endpoints
            namespaces:
              names:
                - monitoring
        relabel_configs:
          # Keep only endpoints of the kube-state-metrics Service.
          - source_labels: [__meta_kubernetes_service_label_app_kubernetes_io_name]
            regex: kube-state-metrics
            action: keep
          - action: labelmap
            regex: __meta_kubernetes_service_label_(.+)
          - source_labels: [__meta_kubernetes_namespace]
            action: replace
            target_label: k8s_namespace
          - source_labels: [__meta_kubernetes_service_name]
            action: replace
            target_label: k8s_sname
      - job_name: node-exporter
        static_configs:
          - targets: ['node-exporter.monitoring.svc.cluster.local:9100']
    alerting:
      alertmanagers:
        - static_configs:
            - targets: ["alertmanager.monitoring.svc.cluster.local:9093"]
    rule_files:
      - /opt/prometheus/*.yaml
---
# Prometheus Deployment. Storage is an emptyDir, so all TSDB data is
# lost on pod restart — acceptable here given the 24h retention; use a
# StatefulSet + PVC in production (as the section heading notes).
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    name: prometheus
  name: prometheus
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      serviceAccountName: prometheus
      containers:
        - image: prom/prometheus:v2.7.1
          name: prometheus
          command:
            - "/bin/prometheus"
          args:
            - "--config.file=/etc/prometheus/prometheus.yml"
            - "--storage.tsdb.path=/prometheus"
            - "--storage.tsdb.retention=24h"
          ports:
            - containerPort: 9090
              protocol: TCP
          volumeMounts:
            - mountPath: "/prometheus"
              name: data
            - mountPath: "/etc/prometheus"
              name: config-volume
            # Mount only the single rule file so the /opt/prometheus
            # glob in rule_files picks it up.
            - mountPath: "/opt/prometheus/alert-linux.yaml"
              subPath: alert-linux.yaml
              name: alert
          resources:
            requests:
              cpu: 100m
              memory: 512Mi
            limits:
              cpu: 500m
              memory: 1560Mi
      volumes:
        - name: data
          emptyDir: {}
        - name: config-volume
          configMap:
            name: prometheus-config
        - name: alert
          configMap:
            name: alert-rules
            items:
              - key: alert-linux.yaml
                path: alert-linux.yaml
---
# ClusterIP Service fronting the Prometheus pod (port 9090).
kind: Service
apiVersion: v1
metadata:
  labels:
    app: prometheus
  name: prometheus
  namespace: monitoring
spec:
  type: ClusterIP
  ports:
    - port: 9090
      targetPort: 9090
      protocol: TCP
  selector:
    app: prometheus
---
# Ingress exposing the Prometheus web UI via the nginx ingress class.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  annotations:
    nginx.ingress.kubernetes.io/proxy-body-size: 50m
    nginx.org/client-max-body-size: 100m
    prometheus.io/http-probe: "true"
    prometheus.io/scrape: "true"
  name: prometheus-web
  namespace: monitoring
spec:
  ingressClassName: nginx
  rules:
    - host: prometheus-test.xxx.com
      http:
        paths:
          - backend:
              service:
                name: prometheus
                port:
                  number: 9090
            path: /
            pathType: Prefix
4、部署grafana
---
# grafana.ini fragments: SMTP for email notifications and legacy
# alerting toggles. Replace the xxxx placeholders before deploying;
# prefer injecting the SMTP password from a Secret rather than a
# ConfigMap in version control.
apiVersion: v1
kind: ConfigMap
metadata:
  name: grafana-cm
  namespace: monitoring
data:
  grafana.ini: |
    [smtp]
    enabled = true
    host = smtp.qq.com:25
    user = xxxx@qq.com
    password = xxxxxx
    skip_verify = true
    from_address = xxxxx@qq.com
    [alerting]
    enabled = true
    execute_alerts = true
---
# Grafana Deployment; data in an emptyDir (dashboards/users lost on
# restart — use a PVC in production).
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: monitoring
spec:
  replicas: 1
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      containers:
        # NOTE(review): untagged image floats to :latest — consider
        # pinning a version for reproducible deployments.
        - image: grafana/grafana
          imagePullPolicy: IfNotPresent
          securityContext:
            allowPrivilegeEscalation: false
            runAsUser: 0
          name: grafana
          ports:
            - containerPort: 3000
              protocol: TCP
          volumeMounts:
            - mountPath: "/var/lib/grafana"
              name: data
            # NOTE(review): mounting the ConfigMap over /etc/grafana
            # replaces the entire directory (only grafana.ini remains).
            # Confirm Grafana needs no other files there, or mount
            # grafana.ini alone via subPath.
            - mountPath: "/etc/grafana"
              name: grafana-cm
          resources:
            requests:
              cpu: 100m
              memory: 512Mi
            limits:
              cpu: 500m
              memory: 1560Mi
      volumes:
        - name: data
          emptyDir: {}
        - name: grafana-cm
          configMap:
            name: grafana-cm
---
# ClusterIP Service fronting Grafana (port 3000).
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: monitoring
spec:
  ports:
    - port: 3000
      targetPort: 3000
      protocol: TCP
  selector:
    app: grafana
  type: ClusterIP
---
# Ingress exposing the Grafana web UI via the nginx ingress class.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  annotations:
    nginx.ingress.kubernetes.io/proxy-body-size: 50m
    nginx.org/client-max-body-size: 100m
    prometheus.io/http-probe: "true"
    prometheus.io/scrape: "true"
  name: grafana-web
  namespace: monitoring
spec:
  ingressClassName: nginx
  rules:
    - host: grafana-test.xxx.com
      http:
        paths:
          - backend:
              service:
                name: grafana
                port:
                  number: 3000
            path: /
            pathType: Prefix
5、部署alertmanager
---
# Alertmanager configuration. Replace the 'xxx' placeholders before
# deploying; prefer a Secret for the SMTP password.
kind: ConfigMap
apiVersion: v1
metadata:
  name: alertmanager
  namespace: monitoring
data:
  alertmanager.yml: |-
    global:
      resolve_timeout: 5m
      smtp_smarthost: 'smtp.qq.com:25'
      smtp_from: 'xxx'
      smtp_auth_username: 'xxx'
      smtp_auth_password: 'xxx'
      smtp_require_tls: false
    route:
      group_by: [alertname]
      group_wait: 30s
      group_interval: 5m
      repeat_interval: 3m
      receiver: web.hook
    # Fixed: the original declared `receivers:` twice. Duplicate keys
    # are invalid YAML and most parsers keep only the last list, which
    # silently dropped the 'email' and 'dingtalk' receivers. All three
    # are now in a single list; the route still targets 'web.hook'.
    receivers:
      - name: 'email'
        email_configs:
          - to: 'xxx'
            send_resolved: true
      - name: 'dingtalk'
        webhook_configs:
          - url: 'https://oapi.dingtalk.com/robot/send?access_token=xxxxxxx'
            send_resolved: true
      - name: 'web.hook'
        webhook_configs:
          # NOTE(review): '&key=czj' without a preceding '?' is unusual
          # for a query string — confirm the czhwebhook service expects
          # this exact path.
          - url: 'http://czhwebhook.monitoring.svc.cluster.local:1994/webhook/sendmsgtowx&key=czj'
    inhibit_rules:
      - source_match:
          severity: 'critical'
        target_match:
          severity: 'warning'
        equal: ['alertname', 'node', 'instance']
---
# Service 'alertmanager' — this name is what prometheus.yml targets
# (alertmanager.monitoring.svc.cluster.local:9093).
apiVersion: v1
kind: Service
metadata:
  labels:
    name: altermanager
    kubernetes.io/cluster-service: 'true'
  name: alertmanager
  namespace: monitoring
spec:
  ports:
    - name: alertmanager
      port: 9093
      protocol: TCP
      targetPort: 9093
  # NOTE(review): 'altermanager' (sic) matches the existing Deployment's
  # pod labels (see the pod listing: altermanager-57d56db644-...). If the
  # Deployment label is ever corrected to 'alertmanager', update this
  # selector in lockstep or the Service will lose its endpoints.
  selector:
    app: altermanager
  sessionAffinity: None
  type: ClusterIP
---
# Ingress exposing the Alertmanager web UI.
apiVersion: networking.k8s.io/v1
kind: Ingress
metadata:
  annotations:
    nginx.ingress.kubernetes.io/proxy-body-size: 50m
    nginx.org/client-max-body-size: 100m
    prometheus.io/http-probe: "true"
    prometheus.io/scrape: "true"
  name: alertmanager-web
  namespace: monitoring
spec:
  ingressClassName: nginx
  rules:
    - host: alertmanager-test.xxx.com
      http:
        paths:
          - backend:
              service:
                # Fixed: the original referenced 'altermanager', but the
                # Service in this file is named 'alertmanager' — the
                # Ingress would have had no backend.
                name: alertmanager
                port:
                  number: 9093
            path: /
            pathType: Prefix
6、我这里是将告警数据发到企业微信机器人,所以写了一个简单的转发告警消息的服务,部署清单如下
# Deployment of the custom webhook relay that forwards Alertmanager
# notifications to a WeCom (企业微信) robot; the robot URL is injected
# via the 'wxurl' environment variable.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: czhwebhook
  name: czhwebhook
  namespace: monitoring
spec:
  minReadySeconds: 60
  progressDeadlineSeconds: 600
  replicas: 1
  revisionHistoryLimit: 5
  selector:
    matchLabels:
      app: czhwebhook
  strategy:
    rollingUpdate:
      maxSurge: 25%
      maxUnavailable: 25%
    type: RollingUpdate
  template:
    metadata:
      labels:
        app: czhwebhook
    spec:
      containers:
        - env:
            - name: wxurl
              value: https://qyapi.weixin.qq.com/cgi-bin/webhook/send?key=xxxxxxx
          # NOTE(review): locally built image — ensure it is present on
          # every node or pushed to a registry the cluster can pull from.
          image: czhwebhook:1.0
          imagePullPolicy: IfNotPresent
          name: czhwebhook
          ports:
            - containerPort: 1994
              name: tcp-1994
              protocol: TCP
          resources:
            limits:
              memory: 256Mi
              cpu: 100m
---
# ClusterIP Service for the webhook relay; Alertmanager's 'web.hook'
# receiver posts to this Service on port 1994.
apiVersion: v1
kind: Service
metadata:
  annotations:
    prometheus.io/http-probe: "true"
    prometheus.io/scrape: "true"
  labels:
    app: czhwebhook
  name: czhwebhook
  namespace: monitoring
spec:
  ports:
    - name: tcp-1994
      port: 1994
      protocol: TCP
      targetPort: 1994
  selector:
    app: czhwebhook
  sessionAffinity: None
  type: ClusterIP
查看部署的服务
kubectl get pod,svc,ingress -o wide -n monitoring
NAME READY STATUS RESTARTS AGE IP NODE NOMINATED NODE READINESS GATES
pod/altermanager-57d56db644-2cdcp 1/1 Running 0 33m 192.168.219.21 k8s3 <none> <none>
pod/czhwebhook-f9c6df698-2md46 1/1 Running 0 36m 192.168.166.239 k8s1 <none> <none>
pod/grafana-5f66f54c99-d78r6 1/1 Running 0 23h 192.168.166.235 k8s1 <none> <none>
pod/kube-state-metrics-cc9968b-5j8zb 1/1 Running 0 23h 192.168.219.19 k8s3 <none> <none>
pod/node-exporter-4m4q6 1/1 Running 0 23h 192.168.166.237 k8s1 <none> <none>
pod/node-exporter-ltkpg 1/1 Running 0 23h 192.168.219.20 k8s3 <none> <none>
pod/node-exporter-vkrp9 1/1 Running 0 23h 192.168.109.106 k8s2 <none> <none>
pod/prometheus-676f97cd9d-27k8l 1/1 Running 0 23h 192.168.166.238 k8s1 <none> <none>
NAME TYPE CLUSTER-IP EXTERNAL-IP PORT(S) AGE SELECTOR
service/alertmanager ClusterIP 10.108.164.2 <none> 9093/TCP 33m app=altermanager
service/czhwebhook ClusterIP 10.97.63.161 <none> 1994/TCP 36m app=czhwebhook
service/grafana ClusterIP 10.102.252.251 <none> 3000/TCP 23h app=grafana
service/kube-state-metrics ClusterIP None <none> 8080/TCP,8081/TCP 23h app.kubernetes.io/name=kube-state-metrics
service/node-exporter ClusterIP 10.105.96.244 <none> 9100/TCP 23h k8s-app=node-exporter
service/prometheus ClusterIP 10.100.196.114 <none> 9090/TCP 23h app=prometheus
NAME CLASS HOSTS ADDRESS PORTS AGE
ingress.networking.k8s.io/alertmanager-web nginx alertmanager-test.xxx.com 80 33m
ingress.networking.k8s.io/grafana-web nginx grafana-test.xxx.com 80 23h
ingress.networking.k8s.io/prometheus-web nginx prometheus-test.xxx.com 80 23h
部署完成,后续还将配置告警规则及添加节点监控,中间件监控等等