Extending Prometheus: using the Kubernetes SDK to build a project that fetches pod and node information

Build a Go project that imports the Kubernetes SDK and runs inside a Kubernetes cluster.

1 Requirements Analysis

  • Write a Go project that uses the Kubernetes SDK to fetch node and pod information
  • Expose the collected information as metrics via the Prometheus SDK
  • Write a Dockerfile to package the project into an image
  • Write Kubernetes YAML to run the project
  • Have Prometheus scrape the project's pod metrics

Go version: go version go1.22.4 linux/amd64

2 Writing the Code

mkdir jcrose-pod-metrics
cd jcrose-pod-metrics
go mod init jcrose-pod-metrics

2.1 Defining the metrics

const (
	namespace = "jcrose_pod_metrics"
	getNode   = "get_node"
	getPod    = "get_pod"
)

var (
	// export one series per node with its details
	k8sNodeDetail = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: prometheus.BuildFQName(namespace, getNode, "node_detail"),
		Help: "k8s node detail each",
	}, []string{"ip", "hostname", "containerRuntimeVersion", "kubeletVersion"})

	// export one series per pod, including the control-plane pods
	k8sPodDetail = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: prometheus.BuildFQName(namespace, getPod, "control_plane_pod_detail"),
		Help: "k8s pod detail of control plane",
	}, []string{"ip", "pod_name", "node_name", "namespace", "component"})

	// how long the last node listing took
	getNodeDuration = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: prometheus.BuildFQName(namespace, getNode, "last_duration_seconds"),
		Help: "get node last duration seconds",
	})

	// how long the last pod listing took
	getPodDuration = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: prometheus.BuildFQName(namespace, getPod, "last_duration_seconds"),
		Help: "get pod last duration seconds",
	})
)
  • Metric walkthrough

    • k8sNodeDetail exports one series per node with its details
    • getNodeDuration records how long the last node listing took
    • k8sPodDetail exports one series per pod, including the control-plane pods
    • getPodDuration records how long the last pod listing took
  • prometheus.BuildFQName(namespace, getNode, "node_detail") joins namespace, subsystem, and name with underscores, so all metrics share a common prefix; see the snippet below
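
A quick standalone check of what BuildFQName produces:

package main

import (
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
)

func main() {
	// BuildFQName joins its non-empty parts with underscores
	fmt.Println(prometheus.BuildFQName("jcrose_pod_metrics", "get_node", "node_detail"))
	// prints: jcrose_pod_metrics_get_node_node_detail
}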

2.2 Registering the metrics

func newMetrics() {
	prometheus.DefaultRegisterer.MustRegister(k8sNodeDetail)
	prometheus.DefaultRegisterer.MustRegister(k8sPodDetail)
	prometheus.DefaultRegisterer.MustRegister(getNodeDuration)
	prometheus.DefaultRegisterer.MustRegister(getPodDuration)
}
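
MustRegister is variadic, so the four calls can be collapsed into one; an equivalent form:

func newMetrics() {
	// MustRegister takes any number of collectors and panics on duplicate registration
	prometheus.DefaultRegisterer.MustRegister(
		k8sNodeDetail, k8sPodDetail, getNodeDuration, getPodDuration,
	)
}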

2.3 Initializing the k8s client

  • uses the package "k8s.io/client-go/kubernetes"
  • uses the package "k8s.io/client-go/rest"
  • works together with the ServiceAccount + ClusterRole + ClusterRoleBinding defined later
  • wrapped in an initK8sClient function
func initK8sClient() (*kubernetes.Clientset, error) {
	// creates the in-cluster config
	config, err := rest.InClusterConfig()
	if err != nil {
		fmt.Println(err.Error())
		return nil, err
	}
	// creates the clientset
	return kubernetes.NewForConfig(config)

}
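
For local testing outside the cluster, a common pattern is to fall back to a kubeconfig file when rest.InClusterConfig fails. A minimal sketch, assuming the default kubeconfig location (this variant is my own addition; it needs the extra imports os, path/filepath, and k8s.io/client-go/tools/clientcmd):

func initK8sClientWithFallback() (*kubernetes.Clientset, error) {
	// in-cluster first: uses the mounted serviceaccount token and ca.crt
	config, err := rest.InClusterConfig()
	if err != nil {
		// fall back to ~/.kube/config for local runs
		home, _ := os.UserHomeDir()
		config, err = clientcmd.BuildConfigFromFlags("", filepath.Join(home, ".kube", "config"))
		if err != nil {
			return nil, err
		}
	}
	return kubernetes.NewForConfig(config)
}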

2.4 Listing nodes with the k8s client

  • clientset.CoreV1().Nodes().List is the equivalent of kubectl get node

  • iterate over the nodes

    • the internal IP is the entry in node.Status.Addresses whose Type is apiv1.NodeInternalIP
    • containerRuntimeVersion and kubeletVersion live in node.Status.NodeInfo
  • at the end, log each node, record how long the listing took, and report that duration as a metric

  • the full code:

func doGetNode(ctx context.Context) {
	start := time.Now()

	clientset, err := initK8sClient()
	if err != nil {
		fmt.Println(err.Error())
		return
	}

	nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		fmt.Println(err.Error())
		return
	}

	for _, node := range nodes.Items {
		// pick the internal IP out of the node's address list
		var ip string
		for _, a := range node.Status.Addresses {
			if a.Type == apiv1.NodeInternalIP {
				ip = a.Address
			}
		}

		k8sNodeDetail.With(prometheus.Labels{
			"ip":                      ip,
			"hostname":                node.Name,
			"containerRuntimeVersion": node.Status.NodeInfo.ContainerRuntimeVersion,
			"kubeletVersion":          node.Status.NodeInfo.KubeletVersion,
		}).Set(1)

		klog.Infof("kubernetes Node %s", node.Name)
	}

	// report the duration once, after the whole listing
	getNodeDuration.Set(time.Since(start).Seconds())
}

2.5 Listing pods with the k8s client

  • iterate over all namespaces and list every pod in each one with clientset.CoreV1().Pods(ns.Name).List

  • iterate over the pods

    • emit one gauge sample per pod
  • the full code:

func doGetPods(ctx context.Context) {
	start := time.Now()

	clientset, err := initK8sClient()
	if err != nil {
		fmt.Println(err.Error())
		return
	}

	namespaces, err := clientset.CoreV1().Namespaces().List(ctx, metav1.ListOptions{})
	if err != nil {
		fmt.Println(err.Error())
		return
	}

	for _, ns := range namespaces.Items {
		pods, err := clientset.CoreV1().Pods(ns.Name).List(ctx, metav1.ListOptions{})
		if err != nil {
			fmt.Println(err.Error())
			continue
		}
		for _, pod := range pods.Items {
			k8sPodDetail.With(prometheus.Labels{
				"ip":        pod.Status.PodIP,
				"pod_name":  pod.Name,
				"node_name": pod.Spec.NodeName,
				"namespace": ns.Name,
				"component": pod.Labels["component"],
			}).Set(1)

			klog.Infof("kubernetes Pod %s", pod.Name)
		}
	}

	// report the duration once, after all namespaces are listed
	getPodDuration.Set(time.Since(start).Seconds())
}
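
One caveat with this collector style: series written into a GaugeVec stay there until the process restarts, so pods and nodes that have since been deleted keep being exported with value 1. A simple mitigation (my addition, not in the original code) is to clear each vector at the top of its refresh:

// at the start of doGetPods, before repopulating:
k8sPodDetail.Reset() // drops all previously written series
// and likewise k8sNodeDetail.Reset() at the start of doGetNode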

2.6 Writing the ticker loop

  • every 10 seconds, run the node and pod collectors and report the data
  • cancelling the outer ctx makes the for loop return
func getK8sObjTicker(ctx context.Context) {
	ticker := time.NewTicker(time.Second * 10)
	klog.Infof("GetK8sObjTicker start....")

	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			go doGetNode(ctx)
			go doGetPods(ctx)
		case <-ctx.Done():
			return
		}
	}
}
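
main below uses context.Background(), so the loop only stops when the process exits. If you want the ticker to shut down cleanly on SIGTERM (which Kubernetes sends before killing a pod), a small sketch using signal.NotifyContext (my addition; needs the extra imports os/signal and syscall):

// ctx is cancelled on SIGINT/SIGTERM, which fires the <-ctx.Done() case above
ctx, stop := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM)
defer stop()
go getK8sObjTicker(ctx)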

The complete code

package main

import (
	"context"
	"fmt"
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
	"k8s.io/klog/v2"
	"net/http"
	"time"
)

const (
	namespace = "jcrose_pod_metrics"
	getNode   = "get_node"
	getPod    = "get_pod"
)

var (
	// export one series per node with its details
	k8sNodeDetail = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: prometheus.BuildFQName(namespace, getNode, "node_detail"),
		Help: "k8s node detail each",
	}, []string{"ip", "hostname", "containerRuntimeVersion", "kubeletVersion"})

	// export one series per pod, including the control-plane pods
	k8sPodDetail = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: prometheus.BuildFQName(namespace, getPod, "control_plane_pod_detail"),
		Help: "k8s pod detail of control plane",
	}, []string{"ip", "pod_name", "node_name", "namespace", "component"})

	// how long the last node listing took
	getNodeDuration = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: prometheus.BuildFQName(namespace, getNode, "last_duration_seconds"),
		Help: "get node last duration seconds",
	})

	// how long the last pod listing took
	getPodDuration = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: prometheus.BuildFQName(namespace, getPod, "last_duration_seconds"),
		Help: "get pod last duration seconds",
	})
)

func newMetrics() {
	prometheus.DefaultRegisterer.MustRegister(k8sNodeDetail)
	prometheus.DefaultRegisterer.MustRegister(k8sPodDetail)
	prometheus.DefaultRegisterer.MustRegister(getNodeDuration)
	prometheus.DefaultRegisterer.MustRegister(getPodDuration)
}

func initK8sClient() (*kubernetes.Clientset, error) {
	// creates the in-cluster config
	config, err := rest.InClusterConfig()
	if err != nil {
		fmt.Println(err.Error())
		return nil, err
	}
	// creates the clientset
	return kubernetes.NewForConfig(config)

}

func doGetNode(ctx context.Context) {
	start := time.Now()

	clientset, err := initK8sClient()
	if err != nil {
		fmt.Println(err.Error())
		return
	}

	nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		fmt.Println(err.Error())
		return
	}

	for _, node := range nodes.Items {
		// pick the internal IP out of the node's address list
		var ip string
		for _, a := range node.Status.Addresses {
			if a.Type == apiv1.NodeInternalIP {
				ip = a.Address
			}
		}

		k8sNodeDetail.With(prometheus.Labels{
			"ip":                      ip,
			"hostname":                node.Name,
			"containerRuntimeVersion": node.Status.NodeInfo.ContainerRuntimeVersion,
			"kubeletVersion":          node.Status.NodeInfo.KubeletVersion,
		}).Set(1)

		klog.Infof("kubernetes Node %s", node.Name)
	}

	// report the duration once, after the whole listing
	getNodeDuration.Set(time.Since(start).Seconds())
}

func doGetPods(ctx context.Context) {
	start := time.Now()

	clientset, err := initK8sClient()
	if err != nil {
		fmt.Println(err.Error())
		return
	}

	namespaces, err := clientset.CoreV1().Namespaces().List(ctx, metav1.ListOptions{})
	if err != nil {
		fmt.Println(err.Error())
		return
	}

	for _, ns := range namespaces.Items {
		pods, err := clientset.CoreV1().Pods(ns.Name).List(ctx, metav1.ListOptions{})
		if err != nil {
			fmt.Println(err.Error())
			continue
		}
		for _, pod := range pods.Items {
			k8sPodDetail.With(prometheus.Labels{
				"ip":        pod.Status.PodIP,
				"pod_name":  pod.Name,
				"node_name": pod.Spec.NodeName,
				"namespace": ns.Name,
				"component": pod.Labels["component"],
			}).Set(1)

			klog.Infof("kubernetes Pod %s", pod.Name)
		}
	}

	// report the duration once, after all namespaces are listed
	getPodDuration.Set(time.Since(start).Seconds())
}

func getK8sObjTicker(ctx context.Context) {
	ticker := time.NewTicker(time.Second * 10)
	klog.Infof("GetK8sObjTicker start....")

	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			go doGetNode(ctx)
			go doGetPods(ctx)
		case <-ctx.Done():
			return
		}
	}
}

func main() {
	// register the metrics
	newMetrics()
	ctx := context.Background()
	// start the goroutine that fetches k8s objects
	go getK8sObjTicker(ctx)
	// expose the prometheus metrics endpoint
	http.Handle("/metrics", promhttp.Handler())
	err := http.ListenAndServe(":8080", nil)
	if err != nil {
		fmt.Println(err.Error())
	}
}


3 Building the Image

3.1 Packaging

Copy two files into the current directory:

  • /etc/ssl/certs/ca-certificates.crt
  • /usr/share/zoneinfo/Asia/Shanghai (the path may differ on your machine; find it yourself)

Compile the code (CGO_ENABLED=0 keeps the binary static so it can run in the busybox image):

CGO_ENABLED=0 go build -o jcrose-pod-metrics main.go

The final file layout

root@elk:/usr/local/git# tree jcrose-pod-metrics/
jcrose-pod-metrics/
├── ca-certificates.crt
├── Dockerfile
├── go.mod
├── go.sum
├── jcrose-pod-metrics
├── main.go
└── Shanghai

Dockerfile

FROM busybox:latest
COPY  Shanghai /etc/localtime
COPY  ca-certificates.crt /etc/ssl/certs/
COPY  jcrose-pod-metrics /opt/app/jcrose-pod-metrics
ENTRYPOINT [ "/opt/app/jcrose-pod-metrics" ]
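
If you would rather not copy the certificate and timezone files around by hand, a multi-stage build does the same job. A sketch, assuming a Debian-based golang:1.22 builder image that ships tzdata and ca-certificates:

FROM golang:1.22 AS builder
WORKDIR /src
COPY . .
# CGO_ENABLED=0 produces a static binary that runs on busybox
RUN CGO_ENABLED=0 go build -o /jcrose-pod-metrics main.go

FROM busybox:latest
COPY --from=builder /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
COPY --from=builder /jcrose-pod-metrics /opt/app/jcrose-pod-metrics
ENTRYPOINT [ "/opt/app/jcrose-pod-metrics" ]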

3.2 Building and pushing the image

Note that this image cannot simply be reused in other clusters: the baked-in ca certificate comes from my own cluster, so it can only talk to that cluster.

docker build -t registry.cn-zhangjiakou.aliyuncs.com/jcrose-k8s/jcrose-pod-metrics:v1 .
docker push registry.cn-zhangjiakou.aliyuncs.com/jcrose-k8s/jcrose-pod-metrics:v1

4 Deployment

4.1 Writing the RBAC YAML

---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: jcrose-pod-metrics
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: jcrose-pod-metrics
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
      - pods
      - namespaces
    verbs:
      - list
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: jcrose-pod-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: jcrose-pod-metrics
subjects:
  - kind: ServiceAccount
    name: jcrose-pod-metrics
    namespace: monitoring
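
After applying these objects, you can check that the ServiceAccount really has the permissions it needs (the rbac.yaml filename is my assumption; use whatever you saved the manifest as):

kubectl apply -f rbac.yaml
kubectl auth can-i list nodes --as=system:serviceaccount:monitoring:jcrose-pod-metrics
kubectl auth can-i list pods --all-namespaces --as=system:serviceaccount:monitoring:jcrose-pod-metrics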

4.2 Writing deployment.yaml

deployment.yaml

apiVersion: apps/v1
kind: Deployment
metadata:
  name: jcrose-pod-metrics
  namespace: monitoring
  labels:
    app: jcrose-pod-metrics
spec:
  replicas: 1
  selector:
    matchLabels:
      app: jcrose-pod-metrics
  template:
    metadata:
      labels:
        app: jcrose-pod-metrics
      annotations:
        prometheus.io/scrape: 'true'
        prometheus.io/port: '8080'
        prometheus.io/path: '/metrics'
    spec:
      containers:
        - name: jcrose-pod-metrics
          image: registry.cn-zhangjiakou.aliyuncs.com/jcrose-k8s/jcrose-pod-metrics:v1
          command:
            - /opt/app/jcrose-pod-metrics
          ports:
            - containerPort: 8080
          resources:
            requests:
              cpu: 100m
              memory: 100Mi
            limits:
              cpu: 200m
              memory: 800Mi
      serviceAccountName: jcrose-pod-metrics
---
apiVersion: v1
kind: Service
metadata:
  name: jcrose-pod-metrics
  namespace: monitoring
  labels:
    app: jcrose-pod-metrics
spec:
  selector:
    app: jcrose-pod-metrics
  ports:
    - protocol: TCP
      port: 8080
      targetPort: 8080
  type: ClusterIP
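
Apply the manifest and make sure the pod comes up (the filename is again my assumption):

kubectl apply -f deployment.yaml
kubectl get po -n monitoring -l app=jcrose-pod-metrics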

5 Checking That the Data Is Collected

5.1 Get the IP of the deployed pod

pod_ip=`kubectl get po -n monitoring -l app=jcrose-pod-metrics -o wide |awk 'NR==2{print $6}'`

root@elk:~/jcrose-pod-metrics# curl -s $pod_ip:8080/metrics | grep jcrose
# HELP jcrose_pod_metrics_get_node_last_duration_seconds get node last duration seconds
# TYPE jcrose_pod_metrics_get_node_last_duration_seconds gauge
jcrose_pod_metrics_get_node_last_duration_seconds 0.00405659
# HELP jcrose_pod_metrics_get_node_node_detail k8s node detail each
# TYPE jcrose_pod_metrics_get_node_node_detail gauge
jcrose_pod_metrics_get_node_node_detail{containerRuntimeVersion="containerd://1.6.8",hostname="elk",ip="192.168.44.129",kubeletVersion="v1.26.7"} 1
# HELP jcrose_pod_metrics_get_pod_control_plane_pod_detail k8s pod detail of control plane
# TYPE jcrose_pod_metrics_get_pod_control_plane_pod_detail gauge
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.150",namespace="ingress-nginx",node_name="elk",pod_name="ingress-nginx-admission-create-c7rp7"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.151",namespace="ingress-nginx",node_name="elk",pod_name="ingress-nginx-admission-patch-7wtgd"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.154",namespace="kube-system",node_name="elk",pod_name="coredns-5bbd96d687-g55lz"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.155",namespace="monitoring",node_name="elk",pod_name="prometheus-adapter-6c4cc5465b-clfsl"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.156",namespace="monitoring",node_name="elk",pod_name="alertmanager-main-1"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.157",namespace="monitoring",node_name="elk",pod_name="grafana-79f47474f7-cjhsd"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.158",namespace="monitoring",node_name="elk",pod_name="alertmanager-main-2"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.159",namespace="kube-system",node_name="elk",pod_name="calico-kube-controllers-57b57c56f-6rrsr"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.160",namespace="monitoring",node_name="elk",pod_name="prometheus-k8s-1"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.161",namespace="kube-system",node_name="elk",pod_name="coredns-5bbd96d687-gb5wq"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.162",namespace="monitoring",node_name="elk",pod_name="alertmanager-main-0"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.163",namespace="ingress-nginx",node_name="elk",pod_name="ingress-nginx-controller-775946b75b-wgzbc"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.164",namespace="monitoring",node_name="elk",pod_name="prometheus-adapter-6c4cc5465b-x4dps"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.165",namespace="monitoring",node_name="elk",pod_name="blackbox-exporter-59dddb7bb6-9jmt8"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.166",namespace="monitoring",node_name="elk",pod_name="prometheus-operator-57cf88fbcb-tjds9"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.167",namespace="monitoring",node_name="elk",pod_name="kube-state-metrics-5884fb96b-dz4hn"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.168",namespace="nfs",node_name="elk",pod_name="nfs-subdir-external-provisioner-6f5fcd7d8b-v54wb"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.169",namespace="monitoring",node_name="elk",pod_name="prometheus-k8s-0"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.175",namespace="monitoring",node_name="elk",pod_name="jcrose-pod-metrics-deployment-668dcb888f-6xhtx"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="192.168.44.129",namespace="kube-system",node_name="elk",pod_name="calico-node-x8x2v"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="192.168.44.129",namespace="kube-system",node_name="elk",pod_name="kube-proxy-rd2sk"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="192.168.44.129",namespace="monitoring",node_name="elk",pod_name="node-exporter-4gbh7"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="etcd",ip="192.168.44.129",namespace="kube-system",node_name="elk",pod_name="etcd-elk"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="kube-apiserver",ip="192.168.44.129",namespace="kube-system",node_name="elk",pod_name="kube-apiserver-elk"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="kube-controller-manager",ip="192.168.44.129",namespace="kube-system",node_name="elk",pod_name="kube-controller-manager-elk"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="kube-scheduler",ip="192.168.44.129",namespace="kube-system",node_name="elk",pod_name="kube-scheduler-elk"} 1
# HELP jcrose_pod_metrics_get_pod_last_duration_seconds get pod last duration seconds
# TYPE jcrose_pod_metrics_get_pod_last_duration_seconds gauge
jcrose_pod_metrics_get_pod_last_duration_seconds 0.109656193

The data is being collected successfully; next, wire it into Prometheus and Grafana.

6 Prometheus and Grafana

Because my Prometheus is deployed through the kube-prometheus operator, the job definition is written a little differently.

The file I modified: /usr/local/git/kube-prometheus/manifests/additional/jcrose/config/prometheus-additional-config.yaml

    - job_name: 'jcrose-custom-metrics'
      metrics_path: /metrics
      static_configs:
      - targets:
        - "jcrose-pod-metrics.monitoring.svc.cluster.local:8080"