helm 部署 Kube-Prometheus + Grafana + 钉钉告警部署 Kube-Prometheus

背景

角色IPK8S 版本容器运行时
k8s-master-1172.16.16.108v1.24.1containerd://1.6.8
k8s-node-1172.16.16.109v1.24.1containerd://1.6.8
k8s-node-2172.16.16.110v1.24.1containerd://1.6.8

安装 kube-prometheus

mkdir -p /data/yaml/kube-prometheus/prometheus && cd /data/yaml/kube-prometheus/prometheus

# 添加 bitnami charts 仓库
helm repo add bitnami https://charts.bitnami.com/bitnami

helm search repo kube-prometheus

helm pull bitnami/kube-prometheus --version 8.3.0

tar -zxvf kube-prometheus-8.3.0.tgz

cat > my-values.yaml << EOF
global:
  storageClass: "nfs-client"  # 默认 storageClass

prometheus:
  service:
    type: NodePort      # 配置 NodePort
    nodePorts: 
      http: 30090       # 配置 NodePort 端口
  persistence:
    enabled: true       # 开启持久化
    size: 9Gi           # 存储大小

alertmanager:
  service:
    type: NodePort      # 配置 NodePort
    nodePorts: 
      http: 30093       # 配置 NodePort 端口
  persistence:
    enabled: true       # 开启持久化
    size: 9Gi           # 存储大小
  config:
    route:
      receiver: 'devops'   # 告警接收者
      routes:
        - match:
          receiver: 'devops'
    receivers:
      - name: 'devops'       # 告警接收者
        webhook_configs:
        - url: 'http://prometheus-webhook-dingtalk.kube-prometheus:8060/dingtalk/devops/send'     # 注意这里的 devops 需要与 prometheus-webhook-dingtalk 中的 --ding.profile 值相同
          send_resolved: true
EOF

# 创建命名空间
kubectl create ns kube-prometheus

# 测试
helm install --namespace kube-prometheus prometheus -f my-values.yaml --dry-run  kube-prometheus

# 启动
helm install --namespace kube-prometheus prometheus -f my-values.yaml  kube-prometheus

# 查看
helm -n kube-prometheus ls

kubectl -n kube-prometheus get pod

访问 Prometheus

http://172.16.16.108:30090/

配置 Pod 告警策略

mkdir -p /data/yaml/kube-prometheus/prometheus/rules && cd /data/yaml/kube-prometheus/prometheus/rules

cat >> k8s-pod-rules.yaml << -'EOF'
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  labels:
    prometheus-name: kube-prometheus-prometheus
    managed-by: prometheus-operator          
  name: prometheus-k8s-pod-rules
  namespace:  kube-prometheus
spec:
  groups:
  - name: PodMemUsage
    rules:
    - alert: Pod内存使用率告警
      expr: sum by (pod, namespace, job, container) (container_memory_working_set_bytes{pod!="",container !=""}) / sum by (pod, namespace, job, container) (container_spec_memory_limit_bytes{pod!="",container !=""}) * 100 != +Inf > 95
      for: 1m
      labels:
        severity: 紧急告警
        service: pods
      annotations:
        description: "{{$labels.instance}}: 当前Pod内存使用率大于95% ,使用率为: {{ $value }}"
        summary: "Pod:{{ $labels.pod }} 检测到内存使用率超过limit值95%"  

  - name: Pod_cpu
    rules:
    - alert: Pod_CPU使用率告警
      expr: sum(irate(container_cpu_usage_seconds_total{pod!="",container !=""}[1m])) by (container, pod) / (sum(container_spec_cpu_quota{pod!="",container !=""}/100000) by (container, pod)) * 100 > 130
      for: 1m
      labels:
        severity: 严重告警
        service: pods
      annotations:
        description: "{{$labels.pod}}: 一分钟内Pod的cpu使用率大于130%,当前的使用率为: {{ $value }}"  


  - name: Pod_Network_rx
    rules:
    - alert: Pod网络IO(rx方向)告警
      expr: (sum (rate (container_network_receive_bytes_total{pod!=""}[1m])) by (pod)) / 1024  / 1024 > 200
      for: 1m
      labels:
        severity: 严重告警
        service: pods
      annotations:
        description: "{{$labels.instance}}: 一分钟内Pod的Pod网络IO(rx方向)大于200Mbps,当前的值为: {{ $value }} Mbps"
        summary: "Pod:{{ $labels.pod }} 检测到一分钟内网络IO(rx方向)过高"  

  - name: Pod_Network_tx
    rules:
    - alert: Pod网络IO(tx方向)告警
      expr: (sum (rate (container_network_transmit_bytes_total{pod!=""}[1m])) by (pod)) / 1024 / 1024 > 200
      for: 1m
      labels:
        severity: 严重告警
        service: pods
      annotations:
        description: "{{$labels.instance}}: 一分钟内Pod的Pod网络IO(tx方向)大于200Mbps,当前的值为: {{ $value }} Mbps"
        summary: "检测到一分钟内Pod网络IO(tx方向)过高"  

  - name: imagepullbackoff
    rules:
    - alert: 拉取镜像失败
      expr: kube_pod_container_status_waiting_reason{reason="ImagePullBackOff"} == 1
      for: 1m
      labels:
        severity: 紧急告警
      annotations:
        summary: "POD:{{ $labels.pod }} 拉取镜像失败,无法创建容器"
        description: "请确认镜像是否存在"
        
  - name: Pod_Start_Exception
    rules:
    - alert: POD 资源配置不正确
      expr: sum by (namespace, pod) (kube_pod_status_phase{ phase=~"Pending|Unknown"}) == 1
      for: 15s
      labels:
        severity: 紧急告警
      annotations:
        summary: "POD:{{ $labels.pod }} 启动失败,请及时查看"
        description: "POD 无法正常启动,请查看资源是否配置正确"

  - name: crashloopbackoff
    rules:
    - alert: POD启动失败
      expr: kube_pod_container_status_waiting_reason{reason="CrashLoopBackOff"} == 1
      for: 1m
      labels:
        severity: 紧急告警
      annotations:
        summary: "POD:{{ $labels.pod }} 启动失败,请查看程序日志"
        description: "确认配置参数是否正确" 
-EOF

kubectl apply -f k8s-pod-rules.yaml

# 检查
kubectl -n kube-prometheus get cm
  • 3
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 打赏
    打赏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包

打赏作者

zxj19880502

你的鼓励将是我创作的最大动力

¥1 ¥2 ¥4 ¥6 ¥10 ¥20
扫码支付:¥1
获取中
扫码支付

您的余额不足,请更换扫码支付或充值

打赏作者

实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值