# All Kubernetes (k8s) alerting rules
groups:
- name: kubernetes-alerts
rules:
# P0 级别告警
# P0: the Ready/"true" condition series of a node has been 0 for 5 minutes.
- alert: KubernetesNodeNotReady
  expr: kube_node_status_condition{condition="Ready",status="true", cluster_name=~".+"} == 0
  for: 5m
  labels:
    severity: P0
    cluster: "{{ $labels.cluster_name }}"
    alert_severity_level: P0
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P0 级别告警】Kubernetes 节点未就绪 (集群: {{ $labels.cluster_name }}, 节点: {{ $labels.node }})"
    description: "【P0 级别告警】集群 {{ $labels.cluster_name }} 中,节点 {{ $labels.node }} 持续 5 分钟未就绪"
# P0: kube_job_status_failed is above 0 for a Job; fires on the first evaluation.
- alert: KubernetesJobFailed
  expr: kube_job_status_failed{cluster_name=~".+"} > 0
  for: 0m
  labels:
    severity: P0
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P0
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P0 级别告警】Kubernetes Job 失败 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, Job: {{ $labels.job_name }})"
    description: "【P0 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,Job {{ $labels.job_name }} 执行失败"
# P0: a Job whose active, failed and succeeded counts are all 0 although its
# recorded start time is more than 600 seconds in the past.
- alert: KubernetesJobNotStarting
  expr: kube_job_status_active{cluster_name=~".+"} == 0 and kube_job_status_failed{cluster_name=~".+"} == 0 and kube_job_status_succeeded{cluster_name=~".+"} == 0 and (time() - kube_job_status_start_time{cluster_name=~".+"}) > 600
  for: 0m
  labels:
    severity: P0
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P0
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P0 级别告警】Kubernetes Job 未启动 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, Job: {{ $labels.job_name }})"
    description: "【P0 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,Job {{ $labels.job_name }} 超 10 分钟未启动"
# P0: kube_cronjob_spec_suspend is non-zero for a CronJob; fires immediately.
- alert: KubernetesCronjobSuspended
  expr: kube_cronjob_spec_suspend{cluster_name=~".+"} != 0
  for: 0m
  labels:
    severity: P0
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P0
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P0 级别告警】Kubernetes CronJob 挂起 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, CronJob: {{ $labels.cronjob }})"
    description: "【P0 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,CronJob {{ $labels.cronjob }} 被挂起"
# P0: per (instance, job, cluster_name), the share of 5xx responses in the
# 1-minute request rate stays above 3% for 2 minutes.
- alert: KubernetesApiServerErrors
  expr: sum(rate(apiserver_request_total{job="apiserver",code=~"(?:5..)", cluster_name=~".+"}[1m])) by (instance, job, cluster_name) / sum(rate(apiserver_request_total{job="apiserver", cluster_name=~".+"}[1m])) by (instance, job, cluster_name) * 100 > 3
  for: 2m
  labels:
    severity: P0
    cluster: "{{ $labels.cluster_name }}"
    alert_severity_level: P0
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P0 级别告警】Kubernetes API 服务器错误率过高 (集群: {{ $labels.cluster_name }}, 实例: {{ $labels.instance }})"
    description: "【P0 级别告警】集群 {{ $labels.cluster_name }} 中,API 服务器 {{ $labels.instance }} 的 5xx 错误率超 3%"
# P0: per (instance, job, cluster_name), the share of 4xx/5xx results in the
# 1-minute REST client request rate stays above 1% for 2 minutes.
- alert: KubernetesApiClientErrors
  expr: (sum(rate(rest_client_requests_total{code=~"(4|5)..", cluster_name=~".+"}[1m])) by (instance, job, cluster_name) / sum(rate(rest_client_requests_total{cluster_name=~".+"}[1m])) by (instance, job, cluster_name)) * 100 > 1
  for: 2m
  labels:
    severity: P0
    cluster: "{{ $labels.cluster_name }}"
    alert_severity_level: P0
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P0 级别告警】Kubernetes API 客户端错误率过高 (集群: {{ $labels.cluster_name }}, 实例: {{ $labels.instance }})"
    description: "【P0 级别告警】集群 {{ $labels.cluster_name }} 中,API 客户端 {{ $labels.instance }} 的 4xx/5xx 错误率超 1%"
# P0: the 1st percentile of client-certificate remaining lifetime seen by the
# API server is below 24 hours.
- alert: KubernetesClientCertificateExpiresSoon
  # The quantile side is aggregated to (job, cluster_name) only, while the
  # _count side still carries instance and other labels. A bare `and` needs
  # identical label sets and would never match, so match on the shared labels.
  expr: apiserver_client_certificate_expiration_seconds_count{job="apiserver", cluster_name=~".+"} > 0 and on (job, cluster_name) histogram_quantile(0.01, sum by (job, le, cluster_name) (rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver", cluster_name=~".+"}[5m]))) < 24*60*60
  for: 0m
  labels:
    severity: P0
    cluster: "{{ $labels.cluster_name }}"
    alert_severity_level: P0
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P0 级别告警】Kubernetes 客户端证书即将过期 (集群: {{ $labels.cluster_name }}, 实例: {{ $labels.instance }})"
    description: "【P0 级别告警】集群 {{ $labels.cluster_name }} 中,API 服务器客户端 {{ $labels.instance }} 的证书 24 小时内过期"
# P1 级别告警
# P1: a node reports the MemoryPressure condition as true for 2 minutes.
- alert: KubernetesNodeMemoryPressure
  expr: kube_node_status_condition{condition="MemoryPressure",status="true", cluster_name=~".+"} == 1
  for: 2m
  labels:
    severity: P1
    cluster: "{{ $labels.cluster_name }}"
    alert_severity_level: P1
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P1 级别告警】Kubernetes 节点内存压力 (集群: {{ $labels.cluster_name }}, 节点: {{ $labels.node }})"
    description: "【P1 级别告警】集群 {{ $labels.cluster_name }} 中,节点 {{ $labels.node }} 内存压力持续 2 分钟"
# P1: a node reports the DiskPressure condition as true for 2 minutes.
- alert: KubernetesNodeDiskPressure
  expr: kube_node_status_condition{condition="DiskPressure",status="true", cluster_name=~".+"} == 1
  for: 2m
  labels:
    severity: P1
    cluster: "{{ $labels.cluster_name }}"
    alert_severity_level: P1
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P1 级别告警】Kubernetes 节点磁盘压力 (集群: {{ $labels.cluster_name }}, 节点: {{ $labels.node }})"
    description: "【P1 级别告警】集群 {{ $labels.cluster_name }} 中,节点 {{ $labels.node }} 磁盘压力持续 2 分钟"
# P1: a node reports the NetworkUnavailable condition as true for 2 minutes.
- alert: KubernetesNodeNetworkUnavailable
  expr: kube_node_status_condition{condition="NetworkUnavailable",status="true", cluster_name=~".+"} == 1
  for: 2m
  labels:
    severity: P1
    cluster: "{{ $labels.cluster_name }}"
    alert_severity_level: P1
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P1 级别告警】Kubernetes 节点网络不可用 (集群: {{ $labels.cluster_name }}, 节点: {{ $labels.node }})"
    description: "【P1 级别告警】集群 {{ $labels.cluster_name }} 中,节点 {{ $labels.node }} 网络不可用持续 2 分钟"
# P1: running pods per node (node label joined in from kube_pod_info) exceed
# 90% of the node's allocatable pod count for 2 minutes.
- alert: KubernetesNodeOutOfPodCapacity
  expr: sum by (node, cluster_name) ((kube_pod_status_phase{phase="Running", cluster_name=~".+"} == 1) + on(uid, instance) group_left(node, cluster_name) (0 * kube_pod_info{pod_template_hash="", cluster_name=~".+"})) / sum by (node, cluster_name) (kube_node_status_allocatable{resource="pods", cluster_name=~".+"}) * 100 > 90
  for: 2m
  labels:
    severity: P1
    cluster: "{{ $labels.cluster_name }}"
    alert_severity_level: P1
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P1 级别告警】Kubernetes 节点 Pod 容量耗尽 (集群: {{ $labels.cluster_name }}, 节点: {{ $labels.node }})"
    description: "【P1 级别告警】集群 {{ $labels.cluster_name }} 中,节点 {{ $labels.node }} 的 Pod 容量超 90%"
# P1: a container's restart counter grew by at least 1 over the last 10 minutes
# while its last-terminated reason stayed OOMKilled for that whole window.
- alert: KubernetesContainerOomKiller
  expr: (kube_pod_container_status_restarts_total{cluster_name=~".+"} - kube_pod_container_status_restarts_total{cluster_name=~".+"} offset 10m >= 1) and ignoring (reason) min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled", cluster_name=~".+"}[10m]) == 1
  for: 0m
  labels:
    severity: P1
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P1
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P1 级别告警】Kubernetes 容器 OOM 杀死 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, 容器: {{ $labels.container }})"
    description: "【P1 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,容器 {{ $labels.container }} 10 分钟内至少 1 次因 OOM 重启"
# P1: a PersistentVolumeClaim has been in phase Pending for 2 minutes.
- alert: KubernetesPersistentvolumeclaimPending
  expr: kube_persistentvolumeclaim_status_phase{phase="Pending", cluster_name=~".+"} == 1
  for: 2m
  labels:
    severity: P1
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P1
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P1 级别告警】Kubernetes 持久卷声明挂起 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, 持久卷声明: {{ $labels.persistentvolumeclaim }})"
    description: "【P1 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,持久卷声明 {{ $labels.persistentvolumeclaim }} 挂起 2 分钟"
# P1: available bytes of a kubelet-reported volume are below 10% of its
# capacity for 2 minutes.
- alert: KubernetesVolumeOutOfDiskSpace
  expr: kubelet_volume_stats_available_bytes{cluster_name=~".+"} / kubelet_volume_stats_capacity_bytes{cluster_name=~".+"} * 100 < 10
  for: 2m
  labels:
    severity: P1
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P1
  annotations:
    # $labels exposes the labels of the series returned by expr, so the cluster
    # is read from cluster_name. kubelet_volume_stats_* series identify the
    # volume by `persistentvolumeclaim`; there is no `persistentvolume` label.
    summary: "【P1 级别告警】Kubernetes 卷磁盘空间不足 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, 卷: {{ $labels.persistentvolumeclaim }})"
    description: "【P1 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,卷 {{ $labels.persistentvolumeclaim }} 剩余空间不足 10%"
# P1: linear extrapolation of the last 6h of available bytes (5m resolution)
# predicts the volume reaches 0 free bytes within 4 days.
- alert: KubernetesVolumeFullInFourDays
  expr: predict_linear(kubelet_volume_stats_available_bytes{cluster_name=~".+"}[6h:5m], 4 * 24 * 3600) < 0
  for: 0m
  labels:
    severity: P1
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P1
  annotations:
    # $labels exposes the labels of the series returned by expr, so the cluster
    # is read from cluster_name. kubelet_volume_stats_* series identify the
    # volume by `persistentvolumeclaim`; there is no `persistentvolume` label.
    summary: "【P1 级别告警】Kubernetes 卷 4 天内填满 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, 卷: {{ $labels.persistentvolumeclaim }})"
    description: "【P1 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,卷 {{ $labels.persistentvolumeclaim }} 预计 4 天内填满"
# P1: a PersistentVolume is in phase Failed or Pending; fires immediately.
# NOTE(review): PersistentVolumes are cluster-scoped, so the `namespace` label
# here is presumably the scrape target's namespace — confirm before relying on it.
- alert: KubernetesPersistentvolumeError
  expr: kube_persistentvolume_status_phase{phase=~"Failed|Pending", job="kube-state-metrics", cluster_name=~".+"} > 0
  for: 0m
  labels:
    severity: P1
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P1
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P1 级别告警】Kubernetes 持久卷错误 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, 持久卷: {{ $labels.persistentvolume }})"
    description: "【P1 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,持久卷 {{ $labels.persistentvolume }} 状态异常"
# P1: a StatefulSet with a non-zero desired replica count whose ready replica
# count differs from it for 1 minute.
- alert: KubernetesStatefulsetDown
  expr: kube_statefulset_replicas{cluster_name=~".+"} != kube_statefulset_status_replicas_ready{cluster_name=~".+"} > 0
  for: 1m
  labels:
    severity: P1
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P1
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P1 级别告警】Kubernetes StatefulSet 故障 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, StatefulSet: {{ $labels.statefulset }})"
    description: "【P1 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,StatefulSet {{ $labels.statefulset }} 副本数不一致"
# P1: an HPA whose ScalingLimited condition is true and whose desired replica
# count equals its max replica count (max - desired == 0) for 2 minutes.
- alert: KubernetesHpaScaleInability
  expr: (kube_horizontalpodautoscaler_spec_max_replicas{cluster_name=~".+"} - kube_horizontalpodautoscaler_status_desired_replicas{cluster_name=~".+"}) * on (horizontalpodautoscaler,namespace, cluster_name) (kube_horizontalpodautoscaler_status_condition{condition="ScalingLimited", status="true", cluster_name=~".+"} == 1) == 0
  for: 2m
  labels:
    severity: P1
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P1
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P1 级别告警】Kubernetes HPA 无法扩展 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, HPA: {{ $labels.horizontalpodautoscaler }})"
    description: "【P1 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,HPA {{ $labels.horizontalpodautoscaler }} 无法扩容"
# P1: an HPA reports its ScalingActive condition as false; fires immediately.
- alert: KubernetesHpaMetricsUnavailability
  expr: kube_horizontalpodautoscaler_status_condition{status="false", condition="ScalingActive", cluster_name=~".+"} == 1
  for: 0m
  labels:
    severity: P1
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P1
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P1 级别告警】Kubernetes HPA 指标不可用 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, HPA: {{ $labels.horizontalpodautoscaler }})"
    description: "【P1 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,HPA {{ $labels.horizontalpodautoscaler }} 指标不可用"
# P1: a pod has been in phase Pending, Unknown or Failed for 15 minutes.
- alert: KubernetesPodNotHealthy
  expr: sum by (namespace, pod, cluster_name) (kube_pod_status_phase{phase=~"Pending|Unknown|Failed", cluster_name=~".+"}) > 0
  for: 15m
  labels:
    severity: P1
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P1
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P1 级别告警】Kubernetes Pod 不健康 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, Pod: {{ $labels.pod }})"
    description: "【P1 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,Pod {{ $labels.pod }} 状态异常超 15 分钟"
# P1: a DaemonSet reports a mis-scheduled pod count above 0 for 1 minute.
- alert: KubernetesDaemonsetMisscheduled
  expr: kube_daemonset_status_number_misscheduled{cluster_name=~".+"} > 0
  for: 1m
  labels:
    severity: P1
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P1
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P1 级别告警】Kubernetes DaemonSet 调度错误 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, DaemonSet: {{ $labels.daemonset }})"
    description: "【P1 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,DaemonSet {{ $labels.daemonset }} 的 Pod 调度错误"
# P1: a Job still has outstanding completions (spec completions minus
# succeeded minus failed is above 0) after 12 hours.
- alert: KubernetesJobSlowCompletion
  expr: kube_job_spec_completions{cluster_name=~".+"} - kube_job_status_succeeded{cluster_name=~".+"} - kube_job_status_failed{cluster_name=~".+"} > 0
  for: 12h
  labels:
    severity: P1
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P1
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P1 级别告警】Kubernetes Job 完成缓慢 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, Job: {{ $labels.job_name }})"
    description: "【P1 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,Job {{ $labels.job_name }} 未按时完成"
# P2 级别告警
# P2: a ReplicaSet's ready replicas differ from its spec replicas for 10 minutes.
- alert: KubernetesReplicasetReplicasMismatch
  expr: kube_replicaset_spec_replicas{cluster_name=~".+"} != kube_replicaset_status_ready_replicas{cluster_name=~".+"}
  for: 10m
  labels:
    severity: P2
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P2
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P2 级别告警】Kubernetes ReplicaSet 副本不匹配 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, ReplicaSet: {{ $labels.replicaset }})"
    description: "【P2 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,ReplicaSet {{ $labels.replicaset }} 副本数不一致"
# P2: a Deployment's available replicas differ from its spec replicas for
# 10 minutes.
- alert: KubernetesDeploymentReplicasMismatch
  expr: kube_deployment_spec_replicas{cluster_name=~".+"} != kube_deployment_status_replicas_available{cluster_name=~".+"}
  for: 10m
  labels:
    severity: P2
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P2
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P2 级别告警】Kubernetes Deployment 副本不匹配 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, Deployment: {{ $labels.deployment }})"
    description: "【P2 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,Deployment {{ $labels.deployment }} 副本数不一致"
# P2: a StatefulSet's ready replicas differ from its current replicas for
# 10 minutes.
- alert: KubernetesStatefulsetReplicasMismatch
  expr: kube_statefulset_status_replicas_ready{cluster_name=~".+"} != kube_statefulset_status_replicas{cluster_name=~".+"}
  for: 10m
  labels:
    severity: P2
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P2
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P2 级别告警】Kubernetes StatefulSet 副本不匹配 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, StatefulSet: {{ $labels.statefulset }})"
    description: "【P2 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,StatefulSet {{ $labels.statefulset }} 副本数不一致"
# P2: a Deployment's observed generation differs from its metadata generation
# for 10 minutes.
- alert: KubernetesDeploymentGenerationMismatch
  expr: kube_deployment_status_observed_generation{cluster_name=~".+"} != kube_deployment_metadata_generation{cluster_name=~".+"}
  for: 10m
  labels:
    severity: P2
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P2
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P2 级别告警】Kubernetes Deployment 版本不匹配 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, Deployment: {{ $labels.deployment }})"
    description: "【P2 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,Deployment {{ $labels.deployment }} 版本不一致"
# P2: a StatefulSet whose current revision differs from its update revision and
# whose updated replica count differs from the desired count for 10 minutes.
- alert: KubernetesStatefulsetUpdateNotRolledOut
  # Only `revision` is aggregated away. Also removing cluster_name would strip
  # it from the left operand while the right operand keeps it, so the
  # one-to-one `*` would find no matching pair and the alert could not fire.
  expr: max without (revision) (kube_statefulset_status_current_revision{cluster_name=~".+"} unless kube_statefulset_status_update_revision{cluster_name=~".+"}) * (kube_statefulset_replicas{cluster_name=~".+"} != kube_statefulset_status_replicas_updated{cluster_name=~".+"})
  for: 10m
  labels:
    severity: P2
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P2
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P2 级别告警】Kubernetes StatefulSet 更新未完成 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, StatefulSet: {{ $labels.statefulset }})"
    description: "【P2 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,StatefulSet {{ $labels.statefulset }} 更新未完成"
# P2: for 10 minutes, a DaemonSet either has fewer ready pods than desired or
# fewer currently scheduled pods than desired.
- alert: KubernetesDaemonsetRolloutStuck
  expr: kube_daemonset_status_number_ready{cluster_name=~".+"} / kube_daemonset_status_desired_number_scheduled{cluster_name=~".+"} * 100 < 100 or kube_daemonset_status_desired_number_scheduled{cluster_name=~".+"} - kube_daemonset_status_current_number_scheduled{cluster_name=~".+"} > 0
  for: 10m
  labels:
    severity: P2
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P2
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P2 级别告警】Kubernetes DaemonSet 发布卡住 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, DaemonSet: {{ $labels.daemonset }})"
    description: "【P2 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,DaemonSet {{ $labels.daemonset }} 发布卡住"
# P3 级别告警
# P3: an HPA's desired replicas have reached its max replicas for 2 minutes.
# HPAs with max <= 1 or with min == max are excluded.
- alert: KubernetesHpaScaleMaximum
  expr: (kube_horizontalpodautoscaler_status_desired_replicas{cluster_name=~".+"} >= kube_horizontalpodautoscaler_spec_max_replicas{cluster_name=~".+"}) and (kube_horizontalpodautoscaler_spec_max_replicas{cluster_name=~".+"} > 1) and (kube_horizontalpodautoscaler_spec_min_replicas{cluster_name=~".+"} != kube_horizontalpodautoscaler_spec_max_replicas{cluster_name=~".+"})
  for: 2m
  labels:
    severity: P3
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P3
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P3 级别告警】Kubernetes HPA 达到最大副本数 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, HPA: {{ $labels.horizontalpodautoscaler }})"
    description: "【P3 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,HPA {{ $labels.horizontalpodautoscaler }} 达最大副本数"
# P3: an HPA whose median desired replica count over the last day equals its
# min replicas, and that value is above 3.
- alert: KubernetesHpaUnderutilized
  # namespace is kept in the aggregation: the labels/annotations below render
  # it, and equally named HPAs in different namespaces must not be merged.
  expr: max by (cluster_name, namespace, horizontalpodautoscaler) (quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas{cluster_name=~".+"}[1d]) == kube_horizontalpodautoscaler_spec_min_replicas{cluster_name=~".+"}) > 3
  for: 0m
  labels:
    severity: P3
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P3
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P3 级别告警】Kubernetes HPA 资源未充分利用 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, HPA: {{ $labels.horizontalpodautoscaler }})"
    description: "【P3 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,HPA {{ $labels.horizontalpodautoscaler }} 资源利用率低"
# P3: a container's restart counter increased by more than 3 within a
# 1-minute window, sustained for 2 minutes.
- alert: KubernetesPodCrashLooping
  expr: increase(kube_pod_container_status_restarts_total{cluster_name=~".+"}[1m]) > 3
  for: 2m
  labels:
    severity: P3
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P3
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P3 级别告警】Kubernetes Pod 崩溃循环 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, Pod: {{ $labels.pod }})"
    description: "【P3 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,Pod {{ $labels.pod }} 1 分钟内重启超 3 次"
# P3: the current time is more than 3600 seconds past a CronJob's next
# schedule time.
- alert: KubernetesCronjobTooLong
  expr: time() - kube_cronjob_next_schedule_time{cluster_name=~".+"} > 3600
  for: 0m
  labels:
    severity: P3
    cluster: "{{ $labels.cluster_name }}"
    namespace: "{{ $labels.namespace }}"
    alert_severity_level: P3
  annotations:
    # $labels exposes the labels of the series returned by expr (not the labels
    # added by this rule), so the cluster must be read from cluster_name.
    summary: "【P3 级别告警】Kubernetes CronJob 执行超时 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, CronJob: {{ $labels.cronjob }})"
    description: "【P3 级别告警】集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,CronJob {{ $labels.cronjob }} 执行超 1 小时未完成"
---
groups:
  - name: kubernetes-alert-upgrades
    rules:
      # Escalates the P3 API server latency alert to P2 once the p99 request
      # latency (excluding long-running verbs) has stayed above 1s for 30 minutes.
      #
      # The duration is expressed with `for:`. The previous expression tried to
      # encode it as `time() - <latency> > 1800`, which subtracts a latency of
      # a few seconds from a Unix timestamp and is therefore always true, and
      # as `idelta(...) != 0`, which only says the value changed between the
      # last two samples. Neither measures how long the condition has held.
      - alert: KubernetesApiServerLatencySeverityEscalation
        expr: |
          histogram_quantile(
            0.99,
            sum without (subresource) (
              rate(apiserver_request_duration_seconds_bucket{verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)"}[10m])
            )
          ) > 1
        for: 30m
        labels:
          severity: P2
          original_severity: P3
        annotations:
          summary: "Kubernetes API 服务器延迟过高告警升级至 P2 ({{ $labels.namespace }}/{{ $labels.instance }})"
          description: "API 服务器 {{ $labels.namespace }}/{{ $labels.instance }} 延迟过高问题在 30 分钟内未恢复,告警升级至 P2。"
---
groups:
  - name: Kubernetesoom_restart_alerts
    rules:
      # P1: a container restarted at least 3 times in the last 10 minutes while
      # its last-terminated reason stayed OOMKilled for that whole window.
      #
      # kube_pod_container_status_restarts_total carries no `reason` label, so
      # selecting {reason="OOMKilled"} on it returns nothing. The OOM condition
      # is taken from kube_pod_container_status_last_terminated_reason instead,
      # matched while ignoring its extra `reason` label.
      - alert: ContainerOOMRestartsCritical
        expr: |
          (
            kube_pod_container_status_restarts_total
            - kube_pod_container_status_restarts_total offset 10m
          ) >= 3
          and ignoring (reason)
          min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
        # Hold for 1 minute to avoid firing on a momentary blip.
        for: 1m
        labels:
          severity: "P1"
        annotations:
          summary: "容器 {{ $labels.container }} OOM 重启次数达到 P1 级别(当前值: {{ $value }})"
          # $labels exposes the series labels from expr; the cluster is carried
          # by cluster_name, as in the other rules of this file.
          description: "集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,容器 {{ $labels.container }} 因 OOM 在 10 分钟内重启了 {{ $value }} 次。"
---
groups:
  # The alerts were listed directly under `groups:`. A Prometheus rule group
  # needs a `name` and a `rules` list, so both are added here.
  - name: kubernetes-hpa-apiserver-alerts
    rules:
      # P2: an HPA changed its current replica count more than 10 times within
      # 1 hour, sustained for 30 minutes.
      # cluster_name and namespace stay in the aggregation so the description
      # can render the cluster and equally named HPAs are not summed together.
      - alert: HPAFrequentScaleOperation
        expr: |
          sum by (cluster_name, namespace, horizontalpodautoscaler) (
            changes(kube_horizontalpodautoscaler_status_current_replicas[1h])
          ) > 10
        for: 30m
        labels:
          severity: P2
        annotations:
          summary: "HPA 扩缩操作过于频繁 (HPA: {{ $labels.horizontalpodautoscaler }})"
          description: "在Kubernetes集群 {{ $labels.cluster_name }} 中,Horizontal Pod Autoscaler (HPA) {{ $labels.horizontalpodautoscaler }} 在过去1小时内执行了超过10次扩缩操作,持续30分钟,可能导致服务不稳定。"

      # P1: the p95 pod start duration reported by the kubelet stays above 30
      # seconds for 10 minutes.
      # kube_pod_start_time_seconds is a gauge holding a Unix timestamp and has
      # no `le` buckets, so histogram_quantile over it never returned data. The
      # kubelet's pod start duration histogram is used instead.
      # NOTE(review): this histogram is not scoped to HPA-managed pods, so the
      # annotations now describe a cluster-wide signal — confirm this is acceptable.
      - alert: HPAPodColdStartLatency
        expr: |
          histogram_quantile(
            0.95,
            sum by (le, cluster_name) (
              rate(kubelet_pod_start_duration_seconds_bucket{cluster_name=~".+"}[5m])
            )
          ) > 30
        for: 10m
        labels:
          severity: P1
        annotations:
          summary: "HPA Pod 冷启动延迟过高 (集群: {{ $labels.cluster_name }})"
          description: "在Kubernetes集群 {{ $labels.cluster_name }} 中,Pod 冷启动P95延迟在过去5分钟内超过30秒,持续10分钟,可能影响 HPA 扩容出的Pods的服务响应时间。"

      # P3: the API server's p99 request latency (excluding long-running verbs)
      # stays above 1 second for 2 minutes.
      # Only `subresource` is aggregated away; dropping cluster_name as well
      # would leave {{ $labels.cluster_name }} empty in the annotations.
      - alert: KubernetesApiServerLatency
        expr: |
          histogram_quantile(
            0.99,
            sum without (subresource) (
              rate(
                apiserver_request_duration_seconds_bucket{
                  verb!~"(?:CONNECT|WATCHLIST|WATCH|PROXY)",
                  cluster_name=~".+"
                }[10m]
              )
            )
          ) > 1
        for: 2m
        labels:
          severity: "P3"
        annotations:
          summary: "Kubernetes API 服务器 99 百分位请求延迟超过 1 秒 (集群: {{ $labels.cluster_name }}, 实例: {{ $labels.instance }})"
          description: "在Kubernetes集群 {{ $labels.cluster_name }} 中,API服务器实例 {{ $labels.instance }} 的99百分位请求延迟在过去10分钟内超过1秒,持续2分钟,可能影响集群组件间通信效率。"
---
groups:
- name: kubernetes-alerts
rules:
# P0: a PVC in phase Pending combined (joined on namespace and
# persistentvolumeclaim) with a PV in phase Failed or Pending, for 20 minutes.
# NOTE(review): kube-state-metrics documents kube_persistentvolume_status_phase
# with a `persistentvolume` label rather than `persistentvolumeclaim` — confirm
# the right-hand side can ever match this join.
# NOTE(review): neither selector obviously carries a `daemonset` label, so
# {{ $labels.daemonset }} may render empty — confirm.
- alert: DaemonSetPVBindingFailure
  expr: |
    kube_persistentvolumeclaim_status_phase{phase="Pending", cluster_name=~".+"} == 1
    and on(namespace, persistentvolumeclaim)
    kube_persistentvolume_status_phase{phase=~"Failed|Pending"} == 1
  for: 20m
  labels:
    severity: P0
  annotations:
    summary: "DaemonSet 持久卷绑定失败 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }}, DaemonSet: {{ $labels.daemonset }})"
    description: "在Kubernetes集群 {{ $labels.cluster_name }} 中,命名空间 {{ $labels.namespace }} 下的DaemonSet {{ $labels.daemonset }} 存在持久卷声明处于Pending状态,且关联的持久卷处于Failed或Pending状态,持续20分钟,可能导致Pod无法正常启动。"
# P1: sums, per (daemonset, namespace), the working-set bytes of containers
# whose memory usage exceeds the memory limit matched on `pod`, for 10 minutes.
# NOTE(review): container_memory_working_set_bytes presumably has no
# `daemonset` label, in which case all series collapse into one group with an
# empty daemonset — confirm, or join against an owner metric.
# NOTE(review): matching on `pod` alone may be many-to-many for pods with
# several containers — verify the query evaluates without error.
# NOTE(review): the aggregation drops cluster_name, which the description
# renders — confirm.
- alert: DaemonSetResourceOvercommit
  expr: |
    sum by (daemonset, namespace) (
    container_memory_working_set_bytes{container!="POD"}
    > on(pod) kube_pod_container_resource_limits{resource="memory"}
    ) > 0
  for: 10m
  labels:
    severity: P1
  annotations:
    summary: "DaemonSet 内存资源超限 (命名空间: {{ $labels.namespace }}, DaemonSet: {{ $labels.daemonset }})"
    description: "在Kubernetes集群 {{ $labels.cluster_name }} 中,命名空间 {{ $labels.namespace }} 下的DaemonSet {{ $labels.daemonset }} 存在容器内存使用超过资源限制的情况,持续10分钟,可能导致容器因OOM被终止。"
# P1: compares (sum of the Ready/"true" node condition series minus the sum of
# allocatable pod slots) against the number of HPA desired-replica series, for
# 30 minutes.
# NOTE(review): the left side is negative whenever allocatable pod slots
# exceed the Ready sum, so the comparison looks true whenever any HPA
# exists — confirm the intended capacity formula.
# NOTE(review): sum()/count() without grouping drop every label, so
# {{ $labels.cluster_name }} renders empty here.
- alert: HPAUnderlyingNodePressure
  expr: |
    sum(kube_node_status_condition{condition="Ready", status="true"})
    - sum(kube_node_status_allocatable{resource="pods"})
    < count(kube_horizontalpodautoscaler_status_desired_replicas)
  for: 30m
  labels:
    severity: P1
  annotations:
    summary: "HPA 因节点资源压力无法调度 (集群: {{ $labels.cluster_name }})"
    description: "在Kubernetes集群 {{ $labels.cluster_name }} 中,可用节点的Pod容量不足以满足Horizontal Pod Autoscaler (HPA) 的期望副本数,持续30分钟,可能导致HPA无法正常扩缩容。"
# P2: a container restarted at least once in the last 10 minutes while its
# last-terminated reason stayed OOMKilled for that whole window; held for 1m.
#
# kube_pod_container_status_restarts_total carries no `reason` label, so
# selecting {reason="OOMKilled"} on it returns nothing. The OOM condition is
# taken from kube_pod_container_status_last_terminated_reason instead, matched
# while ignoring its extra `reason` label.
- alert: ContainerOOMRestartsWarning
  expr: |
    (
      kube_pod_container_status_restarts_total
      - kube_pod_container_status_restarts_total offset 10m
    ) >= 1
    and ignoring (reason)
    min_over_time(kube_pod_container_status_last_terminated_reason{reason="OOMKilled"}[10m]) == 1
  for: 1m
  labels:
    severity: "P2"
  annotations:
    summary: "容器 {{ $labels.container }} OOM 重启次数达到 P2 级别(当前值: {{ $value }})"
    description: "在Kubernetes集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,Pod {{ $labels.pod }} 内的容器 {{ $labels.container }} 因内存不足(OOM)在10分钟内重启了 {{ $value }} 次,持续1分钟,可能存在内存泄漏风险。"
# P2: a container's restart rate over the last hour is more than twice its
# rate over the same hour one day earlier. No `for:` is set, so it fires on the
# first evaluation where the condition holds.
#
# rate() yields restarts per second, but the description reports $value as
# restarts per minute. Both sides are scaled by 60, which leaves the firing
# condition unchanged and makes $value match the stated unit.
- alert: ContainerRestartRateSpike
  expr: |
    rate(kube_pod_container_status_restarts_total[1h]) * 60
    > (rate(kube_pod_container_status_restarts_total[1h] offset 1d) * 60 * 2)
  labels:
    severity: "P2"
  annotations:
    summary: "容器 {{ $labels.container }} 重启频率突增"
    description: "在Kubernetes集群 {{ $labels.cluster_name }} 的命名空间 {{ $labels.namespace }} 中,Pod {{ $labels.pod }} 内的容器 {{ $labels.container }} 过去1小时重启频率为 {{ $value }} 次/分钟,超过昨日同期的200%,可能表示容器存在稳定性问题。"
# P1: a container has been waiting with reason ImagePullBackOff or ErrImagePull
# for 5 minutes.
#
# Changes from the previous version:
# - both reasons go through one selector, so the cluster_name filter applies to
#   ErrImagePull as well (it was missing on that branch);
# - the waiting-reason series has no `image` label, so the annotations name the
#   container instead of rendering an empty image;
# - stray citation markers were removed from the `action` text, and the kubectl
#   hint now includes the namespace.
- alert: KubernetesImagePullBackOff
  expr: |
    kube_pod_container_status_waiting_reason{reason=~"ImagePullBackOff|ErrImagePull", cluster_name=~".+"} == 1
  for: 5m
  labels:
    severity: P1
  annotations:
    summary: "容器镜像拉取失败 (Pod: {{ $labels.pod }}, 容器: {{ $labels.container }})"
    description: "在Kubernetes集群 {{ $labels.cluster_name }} 中,命名空间 {{ $labels.namespace }} 下的Pod {{ $labels.pod }} 的容器 {{ $labels.container }} 拉取镜像失败,状态为ImagePullBackOff或ErrImagePull,持续5分钟,可能导致Pod无法正常启动。"
    action: "执行 kubectl describe pod {{ $labels.pod }} -n {{ $labels.namespace }} 查看事件详情"
# P0: a container whose last termination reason is ContainerCannotRun, joined
# on `pod` with a PVC info series whose `phase` label is not "Bound"; fires
# immediately.
# NOTE(review): kube_persistentvolumeclaim_info is not documented with `pod` or
# `phase` labels in kube-state-metrics; if they are absent the `on(pod)` join
# cannot match and this rule never fires — confirm against the exporter in use.
- alert: KubernetesVolumeMountFailure
  expr: |
    kube_pod_container_status_last_terminated_reason{reason="ContainerCannotRun", cluster_name=~".+"} == 1
    and on(pod) kube_persistentvolumeclaim_info{phase!="Bound"} == 1
  for: 0m
  labels:
    severity: P0
  annotations:
    summary: "持久卷挂载失败导致容器终止 (Pod: {{ $labels.pod }})"
    description: "在Kubernetes集群 {{ $labels.cluster_name }} 中,命名空间 {{ $labels.namespace }} 下的Pod {{ $labels.pod }} 因持久卷挂载失败导致容器终止,相关PVC未绑定,可能影响服务正常运行。"
# P0: any container info series labelled security_context_privileged="true";
# fires immediately.
# NOTE(review): the documented labels of kube_pod_container_info in
# kube-state-metrics do not appear to include `security_context_privileged`;
# if the label is absent this rule can never fire — confirm the metric source.
- alert: KubernetesPrivilegedContainer
  expr: |
    kube_pod_container_info{security_context_privileged="true"} == 1
  for: 0m
  labels:
    severity: P0
  annotations:
    summary: "检测到特权容器运行 (Pod: {{ $labels.pod }})"
    description: "在Kubernetes集群 {{ $labels.cluster_name }} 中,命名空间 {{ $labels.namespace }} 下的Pod {{ $labels.pod }} 中存在特权容器运行,这可能带来安全风险,请立即审查容器安全策略。"
    action: "立即审查容器安全策略"
---
groups:
- name: kubernetes-alerts
rules:
# ConfigMapSensitiveDataExposure: flags ConfigMaps that are annotated as
# sensitive and whose data matches password|secret|token.
#
# Changes from the previous version:
# - the two operands carry different extra labels (the annotation label vs.
#   the `data` label), so a bare `and` could not pair them; they are now
#   matched on the ConfigMap identity only;
# - cluster_name and namespace are kept in the aggregation because the
#   annotations render them.
# NOTE(review): kube_configmap_data is not among the ConfigMap metrics
# documented by kube-state-metrics — confirm which exporter provides it.
- alert: ConfigMapSensitiveDataExposure
  expr: |
    count by (cluster_name, namespace, configmap) (
      kube_configmap_annotations{annotation_prometheus_io_sensitive="true"} == 1
      and on (cluster_name, namespace, configmap)
      kube_configmap_data{data=~".*(password|secret|token).*"} == 1
    ) > 0
  # Fire as soon as the expression is true.
  for: 0m
  labels:
    # P0: security exposure that needs attention.
    severity: P0
  annotations:
    summary: "ConfigMap 包含敏感数据 (命名空间: {{ $labels.namespace }}, ConfigMap: {{ $labels.configmap }})"
    description: "在Kubernetes集群 {{ $labels.cluster_name }} 中,命名空间 {{ $labels.namespace }} 下的ConfigMap {{ $labels.configmap }} 包含敏感数据(如密码、密钥、令牌等),这可能导致安全风险,请立即整改。"
# OrphanedConfigMap: flags ConfigMaps that have no matching
# kube_pod_spec_volumes_configmap series (matched on configmap and namespace).
# NOTE(review): kube_pod_spec_volumes_configmap is not among the metrics
# documented by kube-state-metrics; if it is absent, every ConfigMap matches
# after 24h — confirm the metric source. ConfigMaps consumed through env /
# envFrom would presumably also be reported — verify.
- alert: OrphanedConfigMap
  # Keep ConfigMap info series for which no pod volume reference exists.
  expr: |
    kube_configmap_info
    unless on(configmap,namespace)
    kube_pod_spec_volumes_configmap
  for: 24h  # fire only after the condition has held for 24 hours
  labels:
    severity: P3  # P3: redundant resource that could be cleaned up
  annotations:
    summary: "存在未被引用的冗余 ConfigMap (命名空间: {{ $labels.namespace }}, ConfigMap: {{ $labels.configmap }})"
    description: "在Kubernetes集群 {{ $labels.cluster_name }} 中,命名空间 {{ $labels.namespace }} 下的ConfigMap {{ $labels.configmap }} 未被任何Pod引用,持续24小时,可能为冗余资源,建议清理。"
# ConfigMapSizeExceeded: flags ConfigMaps whose reported size exceeds
# 1048576 bytes (1 MiB).
# NOTE(review): kube_configmap_size_bytes is not among the ConfigMap metrics
# documented by kube-state-metrics — confirm which exporter provides it.
- alert: ConfigMapSizeExceeded
  # The trailing "# 1MB" sits inside the block scalar, so it is passed to
  # Prometheus as a PromQL comment rather than being a YAML comment.
  expr: |
    kube_configmap_size_bytes > 1048576 # 1MB
  for: 5m  # fire only after the condition has held for 5 minutes
  labels:
    severity: P2  # P2: may affect system performance
  annotations:
    summary: "ConfigMap 数据量超过 ETCD 限制 (命名空间: {{ $labels.namespace }}, ConfigMap: {{ $labels.configmap }})"
    description: "在Kubernetes集群 {{ $labels.cluster_name }} 中,命名空间 {{ $labels.namespace }} 下的ConfigMap {{ $labels.configmap }} 数据量超过1MB,持续5分钟,可能导致ETCD性能下降或API请求失败。"
# ConfigMapRollbackFailure: flags ConfigMaps whose resource version changed
# more than 3 times in 1 hour while metadata generation is more than 2 ahead of
# the observed generation.
# NOTE(review): kube_configmap_metadata_generation and
# kube_configmap_status_observed_generation are not among the ConfigMap metrics
# documented by kube-state-metrics — confirm they exist, otherwise this rule
# never fires.
- alert: ConfigMapRollbackFailure
  # Combines the change count with the generation gap.
  expr: |
    changes(kube_configmap_metadata_resource_version[1h]) > 3
    and
    kube_configmap_metadata_generation - kube_configmap_status_observed_generation > 2
  for: 30m  # fire only after the condition has held for 30 minutes
  labels:
    severity: P1  # P1: fairly severe
  annotations:
    summary: "ConfigMap 版本回滚异常 (命名空间: {{ $labels.namespace }}, ConfigMap: {{ $labels.configmap }})"
    description: "在Kubernetes集群 {{ $labels.cluster_name }} 中,命名空间 {{ $labels.namespace }} 下的ConfigMap {{ $labels.configmap }} 在1小时内发生超过3次变更,且当前版本与观察版本差异超过2个版本,持续30分钟,可能导致回滚失败或配置不一致。"
---
# P1 升级到 P0
# Escalates node memory pressure from P1 to P0 once the condition has held
# continuously for 2 hours.
# The duration is expressed with `for:`. The previous clause
# `time() - changes(...[2h]) > 7200` subtracted a small change count from a
# Unix timestamp, was always true, and so escalated immediately.
- alert: KubernetesNodeMemoryPressureP1toP0
  expr: |
    kube_node_status_condition{condition="MemoryPressure",status="true"} == 1
  for: 2h
  labels:
    severity: P0
    original_severity: P1
  annotations:
    summary: "Kubernetes 节点内存压力告警升级至 P0 ({{ $labels.namespace }}/{{ $labels.node }})"
    description: "节点 {{ $labels.namespace }}/{{ $labels.node }} 内存压力问题在 2 小时内未恢复,告警升级至 P0。"
# Escalates node disk pressure from P1 to P0 once the condition has held
# continuously for 2 hours.
# The duration is expressed with `for:`. The previous clause
# `time() - changes(...[2h]) > 7200` subtracted a small change count from a
# Unix timestamp, was always true, and so escalated immediately.
- alert: KubernetesNodeDiskPressureP1toP0
  expr: |
    kube_node_status_condition{condition="DiskPressure",status="true"} == 1
  for: 2h
  labels:
    severity: P0
    original_severity: P1
  annotations:
    summary: "Kubernetes 节点磁盘压力告警升级至 P0 ({{ $labels.namespace }}/{{ $labels.node }})"
    description: "节点 {{ $labels.namespace }}/{{ $labels.node }} 磁盘压力问题在 2 小时内未恢复,告警升级至 P0。"
# Escalates node network unavailability from P1 to P0 once the condition has
# held continuously for 2 hours.
# The duration is expressed with `for:`. The previous clause
# `time() - changes(...[2h]) > 7200` subtracted a small change count from a
# Unix timestamp, was always true, and so escalated immediately.
- alert: KubernetesNodeNetworkUnavailableP1toP0
  expr: |
    kube_node_status_condition{condition="NetworkUnavailable",status="true"} == 1
  for: 2h
  labels:
    severity: P0
    original_severity: P1
  annotations:
    summary: "Kubernetes 节点网络不可用告警升级至 P0 ({{ $labels.namespace }}/{{ $labels.node }})"
    description: "节点 {{ $labels.namespace }}/{{ $labels.node }} 网络不可用问题在 2 小时内未恢复,告警升级至 P0。"
# Escalates node pod-capacity exhaustion from P1 to P0 once running pods have
# exceeded 90% of the node's allocatable pod count continuously for 2 hours.
#
# Changes from the previous version:
# - kube_pod_status_phase has no `node` label, so `sum by (node)` over it gave
#   one series with an empty node that never matched the per-node denominator.
#   The node is now joined in from kube_pod_info, as in the P1 rule;
# - the 2-hour duration is expressed with `for:` instead of min_over_time over
#   the ratio's parts, which did not measure how long the ratio stayed high;
# - the result is aggregated per node, so annotations reference the node only.
- alert: KubernetesNodeOutOfPodCapacityP1toP0
  expr: |
    sum by (node, cluster_name) (
      (kube_pod_status_phase{phase="Running"} == 1)
      + on (uid, instance) group_left (node)
      (0 * kube_pod_info{pod_template_hash=""})
    )
    / sum by (node, cluster_name) (kube_node_status_allocatable{resource="pods"})
    * 100 > 90
  for: 2h
  labels:
    severity: P0
    original_severity: P1
  annotations:
    summary: "Kubernetes 节点 Pod 容量耗尽告警升级至 P0 ({{ $labels.node }})"
    description: "节点 {{ $labels.node }} Pod 容量耗尽问题在 2 小时内未恢复,告警升级至 P0。"
# P2 升级到 P1
# Escalates a ReplicaSet replica mismatch from P2 to P1 once it has held
# continuously for 1 hour.
# The duration is expressed with `for:`. The previous clause
# `time() - changes(...[1h]) > 3600` subtracted a small change count from a
# Unix timestamp, was always true, and so escalated immediately.
- alert: KubernetesReplicasetReplicasMismatchP2toP1
  expr: |
    kube_replicaset_spec_replicas != kube_replicaset_status_ready_replicas
  for: 1h
  labels:
    severity: P1
    original_severity: P2
  annotations:
    summary: "Kubernetes ReplicaSet 副本不匹配告警升级至 P1 ({{ $labels.namespace }}/{{ $labels.replicaset }})"
    description: "ReplicaSet {{ $labels.namespace }}/{{ $labels.replicaset }} 副本不匹配问题在 1 小时内未恢复,告警升级至 P1。"
# Escalates a Deployment replica mismatch from P2 to P1 once it has held
# continuously for 1 hour.
# The duration is expressed with `for:`. The previous clause
# `time() - changes(...[1h]) > 3600` subtracted a small change count from a
# Unix timestamp, was always true, and so escalated immediately.
- alert: KubernetesDeploymentReplicasMismatchP2toP1
  expr: |
    kube_deployment_spec_replicas != kube_deployment_status_replicas_available
  for: 1h
  labels:
    severity: P1
    original_severity: P2
  annotations:
    summary: "Kubernetes Deployment 副本不匹配告警升级至 P1 ({{ $labels.namespace }}/{{ $labels.deployment }})"
    description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} 副本不匹配问题在 1 小时内未恢复,告警升级至 P1。"
# Escalates a StatefulSet replica mismatch from P2 to P1 once it has held
# continuously for 1 hour.
# The duration is expressed with `for:`. The previous clause
# `time() - changes(...[1h]) > 3600` subtracted a small change count from a
# Unix timestamp, was always true, and so escalated immediately.
- alert: KubernetesStatefulsetReplicasMismatchP2toP1
  expr: |
    kube_statefulset_status_replicas_ready != kube_statefulset_status_replicas
  for: 1h
  labels:
    severity: P1
    original_severity: P2
  annotations:
    summary: "Kubernetes StatefulSet 副本不匹配告警升级至 P1 ({{ $labels.namespace }}/{{ $labels.statefulset }})"
    description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} 副本不匹配问题在 1 小时内未恢复,告警升级至 P1。"
# Escalates a Deployment generation mismatch from P2 to P1 once it has held
# continuously for 1 hour.
# The duration is expressed with `for:`. The previous clause
# `time() - changes(...[1h]) > 3600` subtracted a small change count from a
# Unix timestamp, was always true, and so escalated immediately.
- alert: KubernetesDeploymentGenerationMismatchP2toP1
  expr: |
    kube_deployment_status_observed_generation != kube_deployment_metadata_generation
  for: 1h
  labels:
    severity: P1
    original_severity: P2
  annotations:
    summary: "Kubernetes Deployment 版本不匹配告警升级至 P1 ({{ $labels.namespace }}/{{ $labels.deployment }})"
    description: "Deployment {{ $labels.namespace }}/{{ $labels.deployment }} 版本不匹配问题在 1 小时内未恢复,告警升级至 P1。"
# Escalates an unfinished StatefulSet rollout from P2 to P1 once it has held
# continuously for 1 hour.
# The duration is expressed with `for:`. The previous clause
# `time() - changes(...[1h]) > 3600` subtracted a small change count from a
# Unix timestamp, was always true, and so escalated immediately.
- alert: KubernetesStatefulsetUpdateNotRolledOutP2toP1
  expr: |
    max without (revision) (
      kube_statefulset_status_current_revision
      unless kube_statefulset_status_update_revision
    )
    * (kube_statefulset_replicas != kube_statefulset_status_replicas_updated)
  for: 1h
  labels:
    severity: P1
    original_severity: P2
  annotations:
    summary: "Kubernetes StatefulSet 更新未完成告警升级至 P1 ({{ $labels.namespace }}/{{ $labels.statefulset }})"
    description: "StatefulSet {{ $labels.namespace }}/{{ $labels.statefulset }} 更新未完成问题在 1 小时内未恢复,告警升级至 P1。"
# Escalates a stuck DaemonSet rollout from P2 to P1 once it has been stuck for
# one hour. A rollout counts as stuck when fewer than 100% of the desired pods
# are ready, or when some desired pods are not yet scheduled.
# Two defects in the earlier expression are fixed:
#   * `A or B and C` parses as `A or (B and C)` because `and` binds tighter
#     than `or`, so the not-ready branch bypassed the persistence guard;
#   * the guard itself, `(time() - changes(...[1h])) > 3600`, was always true
#     (Unix epoch minus a small count).
# Persistence is now enforced for both branches with `for: 1h`.
- alert: KubernetesDaemonsetRolloutStuckP2toP1
  expr: |
    (kube_daemonset_status_number_ready / kube_daemonset_status_desired_number_scheduled * 100 < 100)
    or
    (kube_daemonset_status_desired_number_scheduled - kube_daemonset_status_current_number_scheduled > 0)
  for: 1h
  labels:
    severity: P1
    original_severity: P2
  annotations:
    summary: "Kubernetes DaemonSet 发布卡住告警升级至 P1 ({{ $labels.namespace }}/{{ $labels.daemonset }})"
    description: "DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} 发布卡住问题在 1 小时内未恢复,告警升级至 P1。"
# P3 升级到 P2
# Escalates "HPA pinned at max replicas" from P3 to P2 once the condition has
# held for 30 minutes. HPAs with max_replicas <= 1 or with min == max are
# excluded, since they cannot scale anyway.
# The earlier `(time() - changes(...[30m])) > 1800` guard was always true
# (Unix epoch minus a small count), so the rule fired immediately; the
# 30-minute persistence is now enforced by `for: 30m`.
- alert: KubernetesHpaScaleMaximumP3toP2
  expr: |
    (kube_horizontalpodautoscaler_status_desired_replicas >= kube_horizontalpodautoscaler_spec_max_replicas)
    and (kube_horizontalpodautoscaler_spec_max_replicas > 1)
    and (kube_horizontalpodautoscaler_spec_min_replicas != kube_horizontalpodautoscaler_spec_max_replicas)
  for: 30m
  labels:
    severity: P2
    original_severity: P3
  annotations:
    summary: "Kubernetes HPA 达到最大副本数告警升级至 P2 ({{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }})"
    description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} 达到最大副本数问题在 30 分钟内未恢复,告警升级至 P2。"
# Escalates "HPA under-utilized" from P3 to P2: the median desired replica
# count over the last day equals min_replicas, min_replicas is above 3, and the
# last scale event is more than 12 hours (43200 s) old.
# Fixes relative to the earlier expression:
#   * the aggregation kept only `horizontalpodautoscaler`, dropping the
#     `namespace` label that the annotations render;
#   * the bare `and` compared that one-label result with fully labelled
#     series, so the label sets never matched and the rule could not fire.
# Both sides are now joined explicitly on the identifying labels.
# `cluster_name` is included because other rules in this file select on it;
# it is harmless if the label is absent.
# NOTE(review): confirm that kube_horizontalpodautoscaler_status_last_scale_time
# is actually exported in this environment.
- alert: KubernetesHpaUnderutilizedP3toP2
  expr: |
    max by (cluster_name, namespace, horizontalpodautoscaler) (
      quantile_over_time(0.5, kube_horizontalpodautoscaler_status_desired_replicas[1d])
        == kube_horizontalpodautoscaler_spec_min_replicas
    ) > 3
    and on (cluster_name, namespace, horizontalpodautoscaler)
    (time() - kube_horizontalpodautoscaler_status_last_scale_time) > 43200
  for: 0m
  labels:
    severity: P2
    original_severity: P3
  annotations:
    summary: "Kubernetes HPA 资源未充分利用告警升级至 P2 ({{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }})"
    description: "HPA {{ $labels.namespace }}/{{ $labels.horizontalpodautoscaler }} 资源未充分利用问题在 12 小时内未恢复,告警升级至 P2。"
# Escalates a crash-looping container from P3 to P2 when restarts keep
# accumulating for 30 minutes.
# The earlier rule and-ed `increase(...[1m]) > 3` with
# `(time() - changes(...[30m])) > 1800`; the latter compares the Unix epoch
# (minus a small count) with 1800 and is always true, so the rule fired as soon
# as the 1-minute burst was seen and never measured persistence.
# The rule now requires more than 3 restarts inside a 30-minute window, and
# that this stays true for 30 minutes (`for: 30m`).
# NOTE(review): the threshold of 3 is carried over from the 1-minute rule;
# tune it if slow crash loops should escalate as well.
- alert: KubernetesPodCrashLoopingP3toP2
  expr: increase(kube_pod_container_status_restarts_total[30m]) > 3
  for: 30m
  labels:
    severity: P2
    original_severity: P3
  annotations:
    summary: "Kubernetes Pod 崩溃循环告警升级至 P2 ({{ $labels.namespace }}/{{ $labels.pod }})"
    description: "Pod {{ $labels.namespace }}/{{ $labels.pod }} 崩溃循环问题在 30 分钟内未恢复,告警升级至 P2。"
# P3 -> P2 escalation for CronJobs: fires as soon as the CronJob's next
# scheduled time lies more than 7200 seconds (2 hours) in the past.
- alert: KubernetesCronjobTooLongP3toP2
  annotations:
    summary: "Kubernetes CronJob 执行超时告警升级至 P2 ({{ $labels.namespace }}/{{ $labels.cronjob }})"
    description: "CronJob {{ $labels.namespace }}/{{ $labels.cronjob }} 执行超时问题在 2 小时内未恢复,告警升级至 P2。"
  labels:
    original_severity: P3
    severity: P2
  # No pending period: the 2-hour delay is already part of the expression.
  for: 0m
  expr: time() - kube_cronjob_next_schedule_time > 7200
# Escalates "client certificate expires within a week" from P3 to P2 once the
# condition has held for one day. The 1st-percentile remaining lifetime of
# client certificates seen by the API server must be below 7 days.
# Fixes relative to the earlier expression:
#   * the quantile is aggregated down to (job), while the `_count` series keeps
#     its full label set, so the bare `and` never matched; the join is now
#     explicit via `on (...)`;
#   * the third clause, `time() - <seconds remaining> > 86400`, subtracted a
#     duration from the Unix epoch and was always true; the one-day persistence
#     is now enforced by `for: 1d`.
# `cluster_name` is kept in the aggregation because other rules in this file
# select on it; it is harmless if the label is absent.
# NOTE(review): the annotations render `namespace` and `client`; confirm these
# labels exist on apiserver_client_certificate_expiration_seconds_count.
- alert: KubernetesClientCertificateExpiresNextWeekP3toP2
  expr: |
    apiserver_client_certificate_expiration_seconds_count{job="apiserver"} > 0
    and on (cluster_name, job)
    histogram_quantile(
      0.01,
      sum by (cluster_name, job, le) (
        rate(apiserver_client_certificate_expiration_seconds_bucket{job="apiserver"}[5m])
      )
    ) < 7 * 24 * 60 * 60
  for: 1d
  labels:
    severity: P2
    original_severity: P3
  annotations:
    summary: "Kubernetes 客户端证书下周过期告警升级至 P2 ({{ $labels.namespace }}/{{ $labels.client }})"
    description: "API 服务器客户端证书 {{ $labels.namespace }}/{{ $labels.client }} 下周过期问题在 1 天内未恢复,告警升级至 P2。"
---
groups:
- name: kubernetes-secret-alerts
rules:
# Flags Secrets that are annotated `encrypted="false"` and also expose a data
# key whose name looks sensitive (password / token / secret).
# The two selectors carry different label sets (`annotation_encrypted` on one
# side, `data` on the other), so the earlier bare `and` could never pair them.
# The join is now restricted to the labels that identify a Secret.
# NOTE(review): kube_secret_data could not be confirmed as a metric exported by
# kube-state-metrics; verify that it exists in this environment.
- alert: SecretSensitiveDataExposure
  expr: |
    kube_secret_annotations{annotation_encrypted="false", cluster_name=~".+"} == 1
    and on (cluster_name, namespace, secret)
    kube_secret_data{data=~".*(password|token|secret).*", cluster_name=~".+"} == 1
  for: 0m
  labels:
    severity: P0
    cluster: "{{ $labels.cluster_name }}"  # cluster label
  annotations:
    summary: "Secret 包含明文敏感数据 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }})"
    description: "集群 {{ $labels.cluster_name }} 中,Secret {{ $labels.secret }} 包含未加密的敏感字段(如密码、令牌),存在数据泄露风险!"
# Flags Secrets that no Pod volume has referenced for 24 hours.
# The join previously matched on (secret, namespace) only. Because the rule
# spans every cluster (`cluster_name=~".+"`), a Pod in one cluster could hide
# an orphaned, same-named Secret in another. `cluster_name` is now part of the
# match.
# NOTE(review): only volume references are considered here; Secrets consumed
# via environment variables or imagePullSecrets would still be reported.
# Also confirm that kube_pod_spec_volumes_secret is exported in this setup.
- alert: OrphanedSecret
  expr: |
    kube_secret_info{cluster_name=~".+"}
    unless on (cluster_name, namespace, secret)
    kube_pod_spec_volumes_secret{cluster_name=~".+"}
  for: 24h
  labels:
    severity: P2
    cluster: "{{ $labels.cluster_name }}"  # cluster label
  annotations:
    summary: "存在未被引用的冗余 Secret (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }})"
    description: "集群 {{ $labels.cluster_name }} 中,Secret {{ $labels.secret }} 未被任何 Pod 使用,持续 24 小时,建议清理。"
# Fires when the selected certificate expires in less than 7 days.
# The earlier expression compared kube_secret_annotations, an info-style
# series whose value is 1, with `time() + 7 * 86400`. That comparison is
# always true, so every matching Secret alerted regardless of expiry.
# The rule now subtracts the current time from cert-manager's expiration
# timestamp and compares the remaining lifetime with 7 days.
# The description renders the certificate via the `name` label, because the
# cert-manager series carries no `secret` label.
# `cluster_name=~".+"` was added to match the other rules in this group, whose
# annotations also render `cluster_name`.
# NOTE(review): requires cert-manager metrics to be scraped. "my-cert" and
# "my-issuer" look like placeholders; replace them with real names or drop
# the matchers to cover all certificates.
- alert: SecretCertificateExpiringSoon
  expr: |
    certmanager_certificate_expiration_timestamp_seconds{name="my-cert", issuer_name="my-issuer", cluster_name=~".+"}
      - time() < 7 * 86400
  for: 0m
  labels:
    severity: P1
    cluster: "{{ $labels.cluster_name }}"  # cluster label
  annotations:
    summary: "Secret 证书即将过期 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }})"
    description: "集群 {{ $labels.cluster_name }} 中,证书 {{ $labels.name }} 将在 7 天内过期,请及时更新!"
# P0 alert for every Secret whose `encrypted` annotation is "false".
# Fires immediately (no pending period).
# NOTE(review): this relies on an annotation maintained on the Secret; confirm
# it reflects the real encryption-at-rest state.
- alert: SecretUnencryptedStorage
  annotations:
    summary: "Secret 未加密存储 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }})"
    description: "集群 {{ $labels.cluster_name }} 中,Secret {{ $labels.secret }} 未启用 etcd 加密,敏感数据可能以明文形式泄露!"
  labels:
    cluster: "{{ $labels.cluster_name }}"  # cluster label
    severity: P0
  for: 0m
  # The first line of the expression is a PromQL comment and is kept verbatim.
  expr: |
    # 检测未启用 etcd 加密的 Secret
    kube_secret_annotations{annotation_encrypted="false", cluster_name=~".+"} == 1
# P1 alert for Secrets whose resource version changed more than 5 times within
# the last hour, sustained for 5 minutes. The description renders the observed
# change count through `$value`.
- alert: SecretFrequentModification
  annotations:
    summary: "Secret 高频修改告警 (集群: {{ $labels.cluster_name }}, 命名空间: {{ $labels.namespace }})"
    description: "集群 {{ $labels.cluster_name }} 中,Secret {{ $labels.secret }} 在 1 小时内被修改 {{ $value }} 次,可能存在异常操作。"
  labels:
    cluster: "{{ $labels.cluster_name }}"  # cluster label
    severity: P1
  for: 5m
  # The first line of the expression is a PromQL comment and is kept verbatim.
  expr: |
    # 1 小时内修改次数超过 5 次
    changes(kube_secret_metadata_resource_version{cluster_name=~".+"}[1h]) > 5
# Flags Pods in the dev/test namespaces that mount a Secret whose name also
# exists in the prod namespace.
# The join previously matched on `secret` alone. Because the rule spans every
# cluster (`cluster_name=~".+"`), a dev Pod in one cluster could be paired with
# a prod Secret in a different cluster. `cluster_name` is now part of the
# match.
# NOTE(review): the match is by Secret name only, so a same-named but
# independent Secret in dev/test and prod also triggers this alert. Confirm
# this is the intended detection.
- alert: SecretCrossNamespaceAccess
  expr: |
    kube_pod_spec_volumes_secret{namespace=~"dev|test", cluster_name=~".+"}
    and on (cluster_name, secret)
    kube_secret_info{namespace="prod", cluster_name=~".+"}
  for: 0m
  labels:
    severity: P0
    cluster: "{{ $labels.cluster_name }}"  # cluster label
  annotations:
    summary: "非法跨命名空间访问 Secret (集群: {{ $labels.cluster_name }}, Pod: {{ $labels.pod }})"
    description: "集群 {{ $labels.cluster_name }} 中,Pod {{ $labels.pod }} 引用了生产环境命名空间 prod 下的 Secret {{ $labels.secret }},存在严重安全隐患!"
```yaml