kubernetes(k8s)搭建监控告警平台prometheus+grafana+alertmanager

prometheus+grafana+alertmanager

prometheus

apiVersion: v1
kind: "Service"
metadata:
  name: prometheus
  namespace: prometheus
  labels:
    name: prometheus
spec:
  ports:
  - name: prometheus
    protocol: TCP
    port: 9090
    targetPort: 9090
  selector:
     app: prometheus
---
apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    name: prometheus-deployment
  name: prometheus
  namespace: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      app: prometheus
  template:
    metadata:
      labels:
        app: prometheus
    spec:
      containers:
      - image: prom/prometheus:latest
        name: prometheus
        command:
        - "/bin/prometheus"
        args:
        - "--config.file=/etc/prometheus/prometheus.yml"
        - "--storage.tsdb.path=/prometheus"
        - "--storage.tsdb.retention=72h"
        ports:
        - containerPort: 9090
          protocol: TCP
        volumeMounts:
        - mountPath: "/prometheus"
          name: data
        - mountPath: "/etc/prometheus"
          name: config-volume
        - mountPath: "/etc/prometheus/rules"
          name: rules-volume
        resources:
          requests:
            cpu: 100m
            memory: 100Mi
          limits:
            cpu: 500m
            memory: 2500Mi
      serviceAccountName: prometheus
      imagePullSecrets: 
        - name: regsecret
      volumes:
      - name: data
        emptyDir: {}
      - name: config-volume
        configMap:
          name: prometheus-config
      - name: rules-volume
        configMap:
          name: rules
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: prometheus
rules:
- apiGroups: [""]
  resources:
  - nodes
  - nodes/metrics
  - services
  - endpoints
  - pods
  verbs: ["get", "list", "watch"]
- apiGroups:
  - extensions
  - networking.k8s.io
  resources:
  - ingresses
  verbs: ["get", "list", "watch"]
- nonResourceURLs: ["/metrics", "/metrics/cadvisor"]
  verbs: ["get"]
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: prometheus
  namespace: prometheus
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: prometheus
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: prometheus
subjects:
- kind: ServiceAccount
  name: prometheus
  namespace: prometheus

configmap(prometheus and rules)

kubectl create --save-config configmap rules -n prometheus --from-file ./config/rules -o yaml --dry-run | kubectl apply -f -
kubectl create --save-config configmap prometheus-config -n prometheus --from-file ./config -o yaml --dry-run | kubectl apply -f -

example: rules(node_exporter)

groups:
- name: node_health
  rules:
## 内存使用率
  - alert: HostOutOfMemory
    expr: node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes * 100 < 15
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: Host out of memory (instance {{ $labels.instance }})
      description: "节点内存 (< 15% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
## 节点内存压力
  - alert: HostMemoryUnderMemoryPressure
    expr: rate(node_vmstat_pgmajfault[1m]) > 1000
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: 内存压力下的主机内存 (instance {{ $labels.instance }})
      description: "节点内存压力过大,主要页面故障率高\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
## 网络吞吐量(接收)
  - alert: HostUnusualNetworkThroughputIn
    expr: sum by (instance) (rate(node_network_receive_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 主机不正常的网络吞吐量(接收) (instance {{ $labels.instance }})
      description: "主机网络接口 接收 的数据可能过多 (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
## 网络吞吐量(发送)
  - alert: HostUnusualNetworkThroughputOut
    expr: sum by (instance) (rate(node_network_transmit_bytes_total[2m])) / 1024 / 1024 > 100
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 主机不正常的网络吞吐量(发送)(instance {{ $labels.instance }})
      description: "主机网络接口可能 发送 了太多数据 (> 100 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
## 磁盘读取率
  - alert: HostUnusualDiskReadRate
    expr: sum by (instance) (rate(node_disk_read_bytes_total[2m])) / 1024 / 1024 > 50
    for: 5m
    labels:
      severity: warning
    annotations:
      summary: 主机磁盘读速异常 (instance {{ $labels.instance }})
      description: "磁盘可能读取了太多的数据 (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
## 磁盘写入率
  - alert: HostUnusualDiskWriteRate
    expr: sum by (instance) (rate(node_disk_written_bytes_total[2m])) / 1024 / 1024 > 50
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: 主机磁盘写速率异常 (instance {{ $labels.instance }})
      description: "磁盘可能写了太多的数据 (> 50 MB/s)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
## 磁盘空间
  - alert: HostOutOfDiskSpace
    expr: (node_filesystem_avail_bytes * 100) / node_filesystem_size_bytes < 5 and ON (instance, device, mountpoint) node_filesystem_readonly == 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: 主机磁盘空间不足 (instance {{ $labels.instance }})
      description: "磁盘空间 (< 5% left)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"
## 磁盘读取延迟
  - alert: HostUnusualDiskReadLatency
    expr: rate(node_disk_read_time_seconds_total[1m]) / rate(node_disk_reads_completed_total[1m]) > 0.5 and rate(node_disk_reads_completed_total[1m]) > 0
    for: 2m
    labels:
      severity: warning
    annotations:
      summary: 主机磁盘读取延迟高(instance {{ $labels.instance }})
      description: "磁盘延迟正在增长 (read operations > 500ms)\n  VALUE = {{ $value }}\n  LABELS = {{ $labels }}"

kube-state-metrics

apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app.kubernetes.io/name: kube-state-metrics
    app.kubernetes.io/version: 2.0.0
  name: kube-state-metrics
  namespace: prometheus
spec:
  replicas: 1
  selector:
    matchLabels:
      app.kubernetes.io/name: kube-state-metrics
  template:
    metadata:
      labels:
        app.kubernetes.io/name: kube-state-metrics
        app.kubernetes.io/version: 2.0.0
    spec:
      containers:
      - image: quay.io/coreos/kube-state-metrics:v2.0.0-alpha
        livenessProbe:
          httpGet:
            path: /healthz
            port: 8080
          initialDelaySeconds: 5
          timeoutSeconds: 5
        name: kube-state-metrics
        ports:
        - containerPort: 8080
          name: http-metrics
        - containerPort: 8081
          name: telemetry
        readinessProbe:
          httpGet:
            path: /
            port: 8081
          initialDelaySeconds: 5
          timeoutSeconds: 5
        securityContext:
          runAsUser: 65534
      nodeSelector:
        kubernetes.io/os: linux

node_exporter

apiVersion: apps/v1
kind: DaemonSet
metadata:
  labels:
    app.kubernetes.io/component: exporter
    app.kubernetes.io/name: node-exporter
    app.kubernetes.io/part-of: kube-prometheus
    app.kubernetes.io/version: 1.1.2
  name: node-exporter
  namespace: prometheus
spec:
  selector:
    matchLabels:
      app.kubernetes.io/component: exporter
      app.kubernetes.io/name: node-exporter
      app.kubernetes.io/part-of: kube-prometheus
  template:
    metadata:
      labels:
        app.kubernetes.io/component: exporter
        app.kubernetes.io/name: node-exporter
        app.kubernetes.io/part-of: kube-prometheus
        app.kubernetes.io/version: 1.1.2
    spec:
      containers:
      - args:
        - --web.listen-address=0.0.0.0:9200
        - --path.sysfs=/host/sys
        - --path.rootfs=/host/root
        - --no-collector.wifi
        - --no-collector.hwmon
        - --collector.filesystem.ignored-mount-points=^/(dev|proc|sys|var/lib/docker/.+|var/lib/kubelet/pods/.+)($|/)
        - --collector.netclass.ignored-devices=^(veth.*)$
        - --collector.netdev.device-exclude=^(veth.*)$
        image: quay.io/prometheus/node-exporter:v1.1.2
        name: node-exporter
        ports:
        - containerPort: 9200
          hostPort: 9200
          name: https
        resources:
          limits:
            cpu: 250m
            memory: 180Mi
          requests:
            cpu: 102m
            memory: 180Mi
        volumeMounts:
        - mountPath: /host/sys
          mountPropagation: HostToContainer
          name: sys
          readOnly: true
        - mountPath: /host/root
          mountPropagation: HostToContainer
          name: root
          readOnly: true
      hostNetwork: true
      hostPID: true
      nodeSelector:
        kubernetes.io/os: linux
      securityContext:
        runAsNonRoot: true
        runAsUser: 65534
      tolerations:
      - operator: Exists
      volumes:
      - hostPath:
          path: /sys
        name: sys
      - hostPath:
          path: /
        name: root
  updateStrategy:
    rollingUpdate:
      maxUnavailable: 10%
    type: RollingUpdate

mysql_exporter

apiVersion: apps/v1
kind: Deployment
metadata:
  labels:
    app: mysqld-exporter
  name: mysqld-exporter
  namespace: prometheus
spec:
  selector:
    matchLabels:
      app: mysqld-exporter
  template:
    metadata:
      labels:
        app: mysqld-exporter
      name: mysqld-exporter
    spec:
      containers:
      - env:
        - name: DATA_SOURCE_NAME
          value: exporter:zWqNdvG2NTZtjv8m@(mysql-server.dev.svc.cluster.local:3306)/
        image: prom/mysqld-exporter
        imagePullPolicy: Always
        name: mysqld-exporter
---
apiVersion: v1
kind: Service
metadata:
  labels:
    app: mysqld-exporter
  name: mysqld-exporter
  namespace: prometheus
spec:
  ports:
  - name: mysqld-exporter
    port: 9104
    protocol: TCP
    targetPort: 9104
  selector:
    app: mysqld-exporter
  type: ClusterIP

grafana(告警不支持模板变量)

apiVersion: apps/v1
kind: Deployment
metadata:
  namespace: prometheus     
  labels:
    app: grafana
  name: grafana
spec:
  selector:
    matchLabels:
      app: grafana
  template:
    metadata:
      labels:
        app: grafana
    spec:
      securityContext:
        fsGroup: 472
        supplementalGroups:
        - 0    
      containers:
        - name: grafana
          image: grafana/grafana
          imagePullPolicy: IfNotPresent
          ports:
            - containerPort: 3000
              name: http-grafana
              protocol: TCP
          readinessProbe:
            failureThreshold: 3
            httpGet:
              path: /robots.txt
              port: 3000
              scheme: HTTP
            initialDelaySeconds: 10
            periodSeconds: 30
            successThreshold: 1
            timeoutSeconds: 2
          livenessProbe:
            failureThreshold: 3
            initialDelaySeconds: 30
            periodSeconds: 10
            successThreshold: 1
            tcpSocket:
              port: 3000
            timeoutSeconds: 1            
          resources:
            requests:
              cpu: 250m
              memory: 750Mi
          volumeMounts:
            - mountPath: /var/lib/grafana
              name: d-xxxxxxxxxxxxxxxx
            - mountPath: /usr/share/grafana/conf
              name: config
      volumes:
        - name: d-xxxxxxxxxxxxxxxx
          persistentVolumeClaim:
            claimName: d-xxxxxxxxxxxxxxxx
        - name: config
          configMap:
            name: grafana-config
---
apiVersion: v1
kind: Service
metadata:
  name: grafana
  namespace: prometheus
spec:
  ports:
    - port: 3000
      protocol: TCP
      targetPort: http-grafana
  selector:
    app: grafana
    #type: ClusterIP

configmap(grafana)

kubectl create --save-config configmap grafana-config -n prometheus --from-file ./config -o yaml --dry-run | kubectl apply -f -

alertmanager

apiVersion: apps/v1
kind: Deployment
metadata:
  name: alertmanager
  namespace: prometheus
  labels:
    app: alertmanager
spec:
  replicas: 1
  selector:
    matchLabels:
      app: alertmanager
  template:
    metadata:
      labels:
        app: alertmanager
    spec:
      containers:
        - name: prometheus-alertmanager
          image: "prom/alertmanager"
          imagePullPolicy: "IfNotPresent"
          args:
            - --config.file=/etc/config/alertmanager.yml
#            - --storage.path=/data
#            - --web.external-url=/
          ports:
            - containerPort: 9093
          readinessProbe:
            httpGet:
              path: /#/status
              port: 9093
            initialDelaySeconds: 30
            timeoutSeconds: 30
          volumeMounts:
            - name: config-volume
              mountPath: /etc/config
#            - name: storage-volume
#              mountPath: "/data"
#              subPath: ""
          resources:
            limits:
              cpu: 10m
              memory: 50Mi
            requests:
              cpu: 10m
              memory: 50Mi
      volumes:
        - name: config-volume
          configMap:
            name: alertmanager-config
#        - name: storage-volume
#          persistentVolumeClaim:
#            claimName: alertmanager
---
apiVersion: v1
kind: Service
metadata:
  name: alertmanager
  namespace: prometheus
  labels:
    app: alertmanager
spec:
  ports:
    - name: http
      port: 9093
      protocol: TCP
      targetPort: 9093
  selector:
    app: alertmanager 
  type: "ClusterIP"
  • 7
    点赞
  • 3
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值