Prometheus + Grafana + Alertmanager on Kubernetes
Prometheus (Service, Deployment, RBAC)
# Service exposing the Prometheus server UI/API inside the cluster on 9090.
apiVersion: v1
kind: Service
metadata:
  name: prometheus
  namespace: prometheus
  labels:
    name: prometheus
spec:
  ports:
    - name: prometheus
      protocol: TCP
      port: 9090
      targetPort: 9090
  selector:
    app: prometheus
---
# Prometheus server. Config and rules come from ConfigMaps; TSDB lives in an
# emptyDir (see note below). Runs under the "prometheus" ServiceAccount so the
# matching ClusterRole grants service-discovery/scrape permissions.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    name: prometheus-deployment
  name: prometheus
  namespace: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      containers:
        - name: prometheus
          # NOTE(review): ":latest" is not reproducible — pin a release tag.
          image: prom/prometheus:latest
          command:
            - /bin/prometheus
          args:
            - --config.file=/etc/prometheus/prometheus.yml
            - --storage.tsdb.path=/prometheus
            # "--storage.tsdb.retention" is deprecated in Prometheus 2.x;
            # "--storage.tsdb.retention.time" is its replacement.
            - --storage.tsdb.retention.time=72h
          ports:
            - containerPort: 9090
              protocol: TCP
          volumeMounts:
            - mountPath: /prometheus
              name: data
            - mountPath: /etc/prometheus
              name: config-volume
            - mountPath: /etc/prometheus/rules
              name: rules-volume
          resources:
            requests:
              cpu: 100m
              memory: 100Mi
            limits:
              cpu: 500m
              memory: 2500Mi
      serviceAccountName: prometheus
      imagePullSecrets:
        - name: regsecret
      volumes:
        # NOTE(review): emptyDir is ephemeral — the 72h of TSDB history is
        # lost whenever the pod is rescheduled. Use a PVC if that matters.
        - name: data
          emptyDir: {}
        - name: config-volume
          configMap:
            name: prometheus-config
        - name: rules-volume
          configMap:
            name: rules
---
# Read-only RBAC permissions Prometheus needs for Kubernetes service
# discovery (nodes/services/endpoints/pods/ingresses) and for scraping the
# node and cadvisor metrics endpoints directly.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
  - apiGroups: [""]
    resources:
      - nodes
      - nodes/metrics
      - services
      - endpoints
      - pods
    verbs: ["get", "list", "watch"]
  - apiGroups:
      - extensions
      - networking.k8s.io
    resources:
      - ingresses
    verbs: ["get", "list", "watch"]
  - nonResourceURLs: ["/metrics", "/metrics/cadvisor"]
    verbs: ["get"]
---
# Identity the Prometheus pod runs as.
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: prometheus
---
# Grants the "prometheus" ClusterRole to the "prometheus" ServiceAccount.
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
  - kind: ServiceAccount
    name: prometheus
    namespace: prometheus
ConfigMaps (prometheus-config and rules). Note: a bare `--dry-run` is deprecated in kubectl; use `--dry-run=client`:
kubectl create --save-config configmap rules -n prometheus --from-file ./config/rules -o yaml --dry-run=client | kubectl apply -f -
kubectl create --save-config configmap prometheus-config -n prometheus --from-file ./config -o yaml --dry-run=client | kubectl apply -f -
example: rules(node_exporter)
# Alerting rules for hosts scraped via node_exporter. Thresholds follow the
# common "awesome prometheus alerts" conventions.
groups:
  - name: node_health
    rules:
      # Memory usage: less than 15% of RAM still available.
      - alert: HostOutOfMemory
        expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 15
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: Host out of memory (instance {{ $labels.instance }})
          description: "节点内存 (< 15% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Node memory pressure: sustained high major page-fault rate.
      - alert: HostMemoryUnderMemoryPressure
        expr: rate(node_vmstat_pgmajfault[1m]) > 1000
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 内存压力下的主机内存 (instance {{ $labels.instance }})
          description: "节点内存压力过大,主要页面故障率高\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Network throughput (receive) above 100 MB/s.
      - alert: HostUnusualNetworkThroughputIn
        expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 主机不正常的网络吞吐量(接收) (instance {{ $labels.instance }})
          description: "主机网络接口 接收 的数据可能过多 (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Network throughput (transmit) above 100 MB/s.
      - alert: HostUnusualNetworkThroughputOut
        expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 主机不正常的网络吞吐量(发送)(instance {{ $labels.instance }})
          description: "主机网络接口可能 发送 了太多数据 (> 100 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Disk read rate above 50 MB/s.
      - alert: HostUnusualDiskReadRate
        expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: 主机磁盘读速异常 (instance {{ $labels.instance }})
          description: "磁盘可能读取了太多的数据 (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Disk write rate above 50 MB/s.
      - alert: HostUnusualDiskWriteRate
        expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机磁盘写速率异常 (instance {{ $labels.instance }})
          description: "磁盘可能写了太多的数据 (> 50 MB/s)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Disk space: under 5% free on writable filesystems.
      - alert: HostOutOfDiskSpace
        expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 5 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机磁盘空间不足 (instance {{ $labels.instance }})
          description: "磁盘空间 (< 5% left)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
      # Disk read latency: average read takes longer than 500 ms.
      - alert: HostUnusualDiskReadLatency
        expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.5 and rate(node_disk_reads_completed_total[1m]) > 0
        for: 2m
        labels:
          severity: warning
        annotations:
          summary: 主机磁盘读取延迟高(instance {{ $labels.instance }})
          description: "磁盘延迟正在增长 (read operations > 500ms)\n VALUE = {{ $value }}\n LABELS = {{ $labels }}"
kube-state-metrics
# kube-state-metrics: exposes cluster object state as metrics on :8080
# (scrape target) and self-telemetry on :8081.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app.kubernetes.io/name: kube-state-metrics
    # Version label kept in sync with the image tag below (was "2.0.0",
    # contradicting the "v2.0.0-alpha" image). Quoted so it stays a string.
    app.kubernetes.io/version: "2.0.0-alpha"
  name: kube-state-metrics
  namespace: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: kube-state-metrics
  template:
    metadata:
      labels:
        app.kubernetes.io/name: kube-state-metrics
        app.kubernetes.io/version: "2.0.0-alpha"
    spec:
      containers:
        - name: kube-state-metrics
          image: quay.io/coreos/kube-state-metrics:v2.0.0-alpha
          livenessProbe:
            httpGet:
              path: /healthz
              port: 8080
            initialDelaySeconds: 5
            timeoutSeconds: 5
          ports:
            - containerPort: 8080
              name: http-metrics
            - containerPort: 8081
              name: telemetry
          readinessProbe:
            httpGet:
              path: /
              port: 8081
            initialDelaySeconds: 5
            timeoutSeconds: 5
          securityContext:
            runAsUser: 65534
      # NOTE(review): no serviceAccountName is set, so this runs as the
      # namespace's default service account. kube-state-metrics needs
      # list/watch RBAC on most resources — confirm that account (or add a
      # dedicated ServiceAccount + ClusterRoleBinding).
      nodeSelector:
        kubernetes.io/os: linux
node_exporter
# node_exporter on every node (hostNetwork/hostPID) exposing host metrics.
apiVersion: apps/v1
kind: DaemonSet
metadata:
  labels:
    app.kubernetes.io/component: exporter
    app.kubernetes.io/name: node-exporter
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: "1.1.2"
  name: node-exporter
  namespace: prometheus
spec:
  selector:
    matchLabels:
      app.kubernetes.io/component: exporter
      app.kubernetes.io/name: node-exporter
      app.kubernetes.io/part-of: kube-prometheus
  template:
    metadata:
      labels:
        app.kubernetes.io/component: exporter
        app.kubernetes.io/name: node-exporter
        app.kubernetes.io/part-of: kube-prometheus
        app.kubernetes.io/version: "1.1.2"
    spec:
      containers:
        - name: node-exporter
          image: quay.io/prometheus/node-exporter:v1.1.2
          args:
            # Listens on 9200 here rather than node_exporter's default 9100.
            - --web.listen-address=0.0.0.0:9200
            - --path.sysfs=/host/sys
            - --path.rootfs=/host/root
            - --no-collector.wifi
            - --no-collector.hwmon
            - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)
            - --collector.netclass.ignored-devices=^(veth.*)$
            - --collector.netdev.device-exclude=^(veth.*)$
          ports:
            # NOTE(review): no TLS flags are set, so this endpoint is plain
            # HTTP; the port name "https" is misleading but kept unchanged in
            # case a Service or scrape config elsewhere references it by name.
            - containerPort: 9200
              hostPort: 9200
              name: https
          resources:
            limits:
              cpu: 250m
              memory: 180Mi
            requests:
              cpu: 102m
              memory: 180Mi
          volumeMounts:
            - mountPath: /host/sys
              mountPropagation: HostToContainer
              name: sys
              readOnly: true
            - mountPath: /host/root
              mountPropagation: HostToContainer
              name: root
              readOnly: true
      hostNetwork: true
      hostPID: true
      nodeSelector:
        kubernetes.io/os: linux
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
      tolerations:
        # Schedule onto every node, including tainted masters.
        - operator: Exists
      volumes:
        - hostPath:
            path: /sys
          name: sys
        - hostPath:
            path: /
          name: root
  updateStrategy:
    rollingUpdate:
      maxUnavailable: 10%
    type: RollingUpdate
mysql_exporter
# mysqld_exporter scraping the MySQL server in the "dev" namespace.
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: mysqld-exporter
  name: mysqld-exporter
  namespace: prometheus
spec:
  selector:
    matchLabels:
      app: mysqld-exporter
  template:
    metadata:
      labels:
        app: mysqld-exporter
        name: mysqld-exporter
    spec:
      containers:
        - name: mysqld-exporter
          # No tag means ":latest" — pin a release for reproducible deploys.
          image: prom/mysqld-exporter
          imagePullPolicy: Always
          env:
            # SECURITY(review): database credentials are hard-coded in this
            # manifest (and therefore in version control). Move them into a
            # Secret and inject via valueFrom.secretKeyRef instead.
            - name: DATA_SOURCE_NAME
              value: exporter:zWqNdvG2NTZtjv8m@(mysql-server.dev.svc.cluster.local:3306)/
---
# ClusterIP Service so Prometheus can scrape mysqld_exporter on 9104.
apiVersion: v1
kind: Service
metadata:
  labels:
    app: mysqld-exporter
  name: mysqld-exporter
  namespace: prometheus
spec:
  type: ClusterIP
  ports:
    - name: mysqld-exporter
      port: 9104
      protocol: TCP
      targetPort: 9104
  selector:
    app: mysqld-exporter
grafana(告警不支持模板变量)
# Grafana with persistent dashboards (PVC) and config from a ConfigMap.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: grafana
  namespace: prometheus
  labels:
    app: grafana
spec:
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      securityContext:
        # 472 is the grafana user's group in the official image.
        fsGroup: 472
        supplementalGroups:
          - 0
      containers:
        - name: grafana
          # NOTE(review): untagged image → ":latest"; pin a version.
          image: grafana/grafana
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 3000
              name: http-grafana
              protocol: TCP
          readinessProbe:
            failureThreshold: 3
            httpGet:
              path: /robots.txt
              port: 3000
              scheme: HTTP
            initialDelaySeconds: 10
            periodSeconds: 30
            successThreshold: 1
            timeoutSeconds: 2
          livenessProbe:
            failureThreshold: 3
            initialDelaySeconds: 30
            periodSeconds: 10
            successThreshold: 1
            tcpSocket:
              port: 3000
            timeoutSeconds: 1
          resources:
            requests:
              cpu: 250m
              memory: 750Mi
          volumeMounts:
            - mountPath: /var/lib/grafana
              name: d-xxxxxxxxxxxxxxxx
            # NOTE(review): mounting a ConfigMap over /usr/share/grafana/conf
            # hides everything the image ships there (defaults.ini included),
            # so the "grafana-config" ConfigMap must provide a complete conf
            # directory — confirm before deploying.
            - mountPath: /usr/share/grafana/conf
              name: config
      volumes:
        # "d-xxxxxxxxxxxxxxxx" is a placeholder PVC name — replace with the
        # real claim before use.
        - name: d-xxxxxxxxxxxxxxxx
          persistentVolumeClaim:
            claimName: d-xxxxxxxxxxxxxxxx
        - name: config
          configMap:
            name: grafana-config
---
# Service fronting Grafana; targets the named container port.
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: prometheus
spec:
  ports:
    - port: 3000
      protocol: TCP
      targetPort: http-grafana
  selector:
    app: grafana
  # type: ClusterIP  (default; set NodePort/LoadBalancer to expose externally)
ConfigMap (grafana). Note: a bare `--dry-run` is deprecated in kubectl; use `--dry-run=client`:
kubectl create --save-config configmap grafana-config -n prometheus --from-file ./config -o yaml --dry-run=client | kubectl apply -f -
alertmanager
# Alertmanager with its config mounted from a ConfigMap; persistent alert
# storage is stubbed out (commented) and can be re-enabled with a PVC.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: alertmanager
  namespace: prometheus
  labels:
    app: alertmanager
spec:
  replicas: 1
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      containers:
        - name: prometheus-alertmanager
          # NOTE(review): untagged image → ":latest"; pin a version.
          image: prom/alertmanager
          imagePullPolicy: IfNotPresent
          args:
            - --config.file=/etc/config/alertmanager.yml
            # - --storage.path=/data
            # - --web.external-url=/
          ports:
            - containerPort: 9093
          readinessProbe:
            httpGet:
              # "/-/ready" is Alertmanager's readiness endpoint. The previous
              # "/#/status" relied on a URL fragment, which is a browser-side
              # route and is never transmitted in an HTTP request.
              path: /-/ready
              port: 9093
            initialDelaySeconds: 30
            timeoutSeconds: 30
          volumeMounts:
            - name: config-volume
              mountPath: /etc/config
            # - name: storage-volume
            #   mountPath: /data
            #   subPath: ""
          resources:
            # NOTE(review): a 10m CPU limit is very tight; raise it if the
            # container shows throttling.
            limits:
              cpu: 10m
              memory: 50Mi
            requests:
              cpu: 10m
              memory: 50Mi
      volumes:
        - name: config-volume
          configMap:
            name: alertmanager-config
        # - name: storage-volume
        #   persistentVolumeClaim:
        #     claimName: alertmanager
---
# ClusterIP Service for Alertmanager's web/API port.
apiVersion: v1
kind: Service
metadata:
  name: alertmanager
  namespace: prometheus
  labels:
    app: alertmanager
spec:
  type: ClusterIP
  ports:
    - name: http
      port: 9093
      protocol: TCP
      targetPort: 9093
  selector:
    app: alertmanager