Using Go with the Kubernetes SDK to build a project that runs in k8s
1 Requirements Analysis
- Write a Go project that uses the Kubernetes SDK to fetch node and pod information
- Report the collected information as metrics via the Prometheus SDK
- Write a Dockerfile to package the project into an image
- Write Kubernetes YAML to run the project
- Have Prometheus scrape the pod's metrics
Go version: go version go1.22.4 linux/amd64
2 Writing the Code
go mod init jcrose-pod-metrics
cd jcrose-pod-metrics
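The code below depends on client-go, the Prometheus Go client, and klog, so fetch them first. The version pin here is an assumption chosen to match the v1.26.7 cluster used later; any compatible versions should work:
go get k8s.io/client-go@v0.26.7
go get github.com/prometheus/client_golang/prometheus
go get k8s.io/klog/v2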
2.1 Defining the metrics
const (
	namespace = "jcrose_pod_metrics"
	getNode   = "get_node"
	getPod    = "get_pod"
)
var (
	// One series per node, carrying the node's details as labels
	k8sNodeDetail = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: prometheus.BuildFQName(namespace, getNode, "node_detail"),
		Help: "k8s node detail each",
	}, []string{"ip", "hostname", "containerRuntimeVersion", "kubeletVersion"})
	// One series per pod; control-plane pods carry a non-empty "component" label
	k8sPodDetail = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: prometheus.BuildFQName(namespace, getPod, "control_plane_pod_detail"),
		Help: "k8s pod detail of control plane",
	}, []string{"ip", "pod_name", "node_name", "namespace", "component"})
	// Duration of the last node listing
	getNodeDuration = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: prometheus.BuildFQName(namespace, getNode, "last_duration_seconds"),
		Help: "get node last duration seconds",
	})
	// Duration of the last pod listing
	getPodDuration = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: prometheus.BuildFQName(namespace, getPod, "last_duration_seconds"),
		Help: "get pod last duration seconds",
	})
)
Metric walkthrough
- k8sNodeDetail: exports one series per node with its details
- getNodeDuration: the time taken by the last node listing
- k8sPodDetail: exports one series per pod (the "component" label identifies control-plane pods)
- getPodDuration: the time taken by the last pod listing

prometheus.BuildFQName(namespace, getNode, "node_detail") builds the full metric name from the shared namespace + subsystem prefix.
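As a concrete example, BuildFQName simply joins its three arguments with underscores; a quick sketch using the constants above:
// BuildFQName(namespace, subsystem, name) => "namespace_subsystem_name"
name := prometheus.BuildFQName("jcrose_pod_metrics", "get_node", "node_detail")
// name == "jcrose_pod_metrics_get_node_node_detail"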
2.2 Registering the metrics
func newMetrics() {
	prometheus.DefaultRegisterer.MustRegister(k8sNodeDetail)
	prometheus.DefaultRegisterer.MustRegister(k8sPodDetail)
	prometheus.DefaultRegisterer.MustRegister(getNodeDuration)
	prometheus.DefaultRegisterer.MustRegister(getPodDuration)
}
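As a side note, prometheus.MustRegister is variadic and uses the DefaultRegisterer, so the four calls above can be collapsed into one; an equivalent sketch:
func newMetrics() {
	prometheus.MustRegister(k8sNodeDetail, k8sPodDetail, getNodeDuration, getPodDuration)
}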
2.3 Initializing the k8s client
- Uses the package "k8s.io/client-go/kubernetes"
- Uses the package "k8s.io/client-go/rest"
- Works together with the ServiceAccount + ClusterRole + ClusterRoleBinding defined later
- Wrapped in an initK8sClient function
func initK8sClient() (*kubernetes.Clientset, error) {
	// creates the in-cluster config from the pod's ServiceAccount
	config, err := rest.InClusterConfig()
	if err != nil {
		return nil, err
	}
	// creates the clientset
	return kubernetes.NewForConfig(config)
}
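rest.InClusterConfig only works inside a pod with a mounted ServiceAccount. For running the binary locally during development, a hedged variant can fall back to a kubeconfig file; this fallback is an assumption, not part of the original project (it needs the extra imports "os" and "k8s.io/client-go/tools/clientcmd"):
func initK8sClientWithFallback() (*kubernetes.Clientset, error) {
	// Prefer the in-cluster config (ServiceAccount token + CA).
	config, err := rest.InClusterConfig()
	if err != nil {
		// Outside the cluster: fall back to a local kubeconfig.
		kubeconfig := os.Getenv("KUBECONFIG")
		if kubeconfig == "" {
			kubeconfig = clientcmd.RecommendedHomeFile // ~/.kube/config
		}
		if config, err = clientcmd.BuildConfigFromFlags("", kubeconfig); err != nil {
			return nil, err
		}
	}
	return kubernetes.NewForConfig(config)
}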
2.4 Listing nodes with the k8s client
- clientset.CoreV1().Nodes().List is the programmatic equivalent of "kubectl get node"
- Iterate over the nodes:
  - To get the IP address, scan node.Status.Addresses; the entry whose type is apiv1.NodeInternalIP is the internal IP
  - containerRuntimeVersion and kubeletVersion live in node.Status.NodeInfo
- At the end, log the node count and the elapsed time, and report the elapsed time as a metric
- The full code is as follows
func doGetNode(ctx context.Context) {
	start := time.Now()
	clientset, err := initK8sClient()
	if err != nil {
		klog.Errorf("init k8s client failed: %v", err)
		return
	}
	nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		klog.Errorf("list nodes failed: %v", err)
		return
	}
	for _, node := range nodes.Items {
		// Find the internal IP among the node's addresses.
		var ip string
		for _, a := range node.Status.Addresses {
			if a.Type == apiv1.NodeInternalIP {
				ip = a.Address
			}
		}
		k8sNodeDetail.With(prometheus.Labels{
			"ip":                      ip,
			"hostname":                node.Name,
			"containerRuntimeVersion": node.Status.NodeInfo.ContainerRuntimeVersion,
			"kubeletVersion":          node.Status.NodeInfo.KubeletVersion,
		}).Set(1)
	}
	// Report the node count and how long the listing took.
	timeTook := time.Since(start).Seconds()
	getNodeDuration.Set(timeTook)
	klog.Infof("listed %d kubernetes nodes in %.3fs", len(nodes.Items), timeTook)
}
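One caveat: GaugeVec series are never removed automatically, so a node that leaves the cluster keeps exporting its last value of 1. A hedged way to avoid stale series (not in the original code) is to clear the vector at the top of doGetNode before repopulating it:
// Drop all previously exported node series so removed nodes do not linger.
k8sNodeDetail.Reset()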
2.5 Listing pods with the k8s client
- Iterate over every namespace and list the pods in each one with clientset.CoreV1().Pods(ns).List (a single-call alternative is sketched after the code)
- Iterate over the pods and set a metric for each
- The full code is as follows
func doGetPods(ctx context.Context) {
	start := time.Now()
	clientset, err := initK8sClient()
	if err != nil {
		klog.Errorf("init k8s client failed: %v", err)
		return
	}
	namespaces, err := clientset.CoreV1().Namespaces().List(ctx, metav1.ListOptions{})
	if err != nil {
		klog.Errorf("list namespaces failed: %v", err)
		return
	}
	podCount := 0
	for _, ns := range namespaces.Items {
		pods, err := clientset.CoreV1().Pods(ns.Name).List(ctx, metav1.ListOptions{})
		if err != nil {
			klog.Errorf("list pods in %s failed: %v", ns.Name, err)
			continue
		}
		for _, pod := range pods.Items {
			k8sPodDetail.With(prometheus.Labels{
				"ip":        pod.Status.PodIP,
				"pod_name":  pod.Name,
				"node_name": pod.Spec.NodeName,
				"namespace": ns.Name,
				"component": pod.Labels["component"],
			}).Set(1)
			podCount++
		}
	}
	// Report the pod count and how long the listing took.
	timeTook := time.Since(start).Seconds()
	getPodDuration.Set(timeTook)
	klog.Infof("listed %d kubernetes pods in %.3fs", podCount, timeTook)
}
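As an aside, client-go can also list pods across all namespaces in a single API call by passing metav1.NamespaceAll (the empty string), which avoids the namespace loop entirely; a minimal sketch:
// One List call for the whole cluster; each pod carries its own Namespace field.
pods, err := clientset.CoreV1().Pods(metav1.NamespaceAll).List(ctx, metav1.ListOptions{})
if err != nil {
	klog.Errorf("list pods failed: %v", err)
	return
}
for _, pod := range pods.Items {
	klog.Infof("pod %s/%s", pod.Namespace, pod.Name)
}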
2.6 Writing the ticker function
- Every 10 seconds, run doGetNode and doGetPods to report data
- Cancelling the outer ctx makes the for loop return
func getK8sObjTicker(ctx context.Context) {
	ticker := time.NewTicker(time.Second * 10)
	klog.Infof("getK8sObjTicker start....")
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			go doGetNode(ctx)
			go doGetPods(ctx)
		case <-ctx.Done():
			return
		}
	}
}
The complete code
package main

import (
	"context"
	"net/http"
	"time"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promhttp"
	apiv1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/client-go/kubernetes"
	"k8s.io/client-go/rest"
	"k8s.io/klog/v2"
)
const (
	namespace = "jcrose_pod_metrics"
	getNode   = "get_node"
	getPod    = "get_pod"
)
var (
	// One series per node, carrying the node's details as labels
	k8sNodeDetail = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: prometheus.BuildFQName(namespace, getNode, "node_detail"),
		Help: "k8s node detail each",
	}, []string{"ip", "hostname", "containerRuntimeVersion", "kubeletVersion"})
	// One series per pod; control-plane pods carry a non-empty "component" label
	k8sPodDetail = prometheus.NewGaugeVec(prometheus.GaugeOpts{
		Name: prometheus.BuildFQName(namespace, getPod, "control_plane_pod_detail"),
		Help: "k8s pod detail of control plane",
	}, []string{"ip", "pod_name", "node_name", "namespace", "component"})
	// Duration of the last node listing
	getNodeDuration = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: prometheus.BuildFQName(namespace, getNode, "last_duration_seconds"),
		Help: "get node last duration seconds",
	})
	// Duration of the last pod listing
	getPodDuration = prometheus.NewGauge(prometheus.GaugeOpts{
		Name: prometheus.BuildFQName(namespace, getPod, "last_duration_seconds"),
		Help: "get pod last duration seconds",
	})
)
func newMetrics() {
	prometheus.DefaultRegisterer.MustRegister(k8sNodeDetail)
	prometheus.DefaultRegisterer.MustRegister(k8sPodDetail)
	prometheus.DefaultRegisterer.MustRegister(getNodeDuration)
	prometheus.DefaultRegisterer.MustRegister(getPodDuration)
}
func initK8sClient() (*kubernetes.Clientset, error) {
	// creates the in-cluster config from the pod's ServiceAccount
	config, err := rest.InClusterConfig()
	if err != nil {
		return nil, err
	}
	// creates the clientset
	return kubernetes.NewForConfig(config)
}
func doGetNode(ctx context.Context) {
	start := time.Now()
	clientset, err := initK8sClient()
	if err != nil {
		klog.Errorf("init k8s client failed: %v", err)
		return
	}
	nodes, err := clientset.CoreV1().Nodes().List(ctx, metav1.ListOptions{})
	if err != nil {
		klog.Errorf("list nodes failed: %v", err)
		return
	}
	for _, node := range nodes.Items {
		// Find the internal IP among the node's addresses.
		var ip string
		for _, a := range node.Status.Addresses {
			if a.Type == apiv1.NodeInternalIP {
				ip = a.Address
			}
		}
		k8sNodeDetail.With(prometheus.Labels{
			"ip":                      ip,
			"hostname":                node.Name,
			"containerRuntimeVersion": node.Status.NodeInfo.ContainerRuntimeVersion,
			"kubeletVersion":          node.Status.NodeInfo.KubeletVersion,
		}).Set(1)
	}
	// Report the node count and how long the listing took.
	timeTook := time.Since(start).Seconds()
	getNodeDuration.Set(timeTook)
	klog.Infof("listed %d kubernetes nodes in %.3fs", len(nodes.Items), timeTook)
}
func doGetPods(ctx context.Context) {
	start := time.Now()
	clientset, err := initK8sClient()
	if err != nil {
		klog.Errorf("init k8s client failed: %v", err)
		return
	}
	namespaces, err := clientset.CoreV1().Namespaces().List(ctx, metav1.ListOptions{})
	if err != nil {
		klog.Errorf("list namespaces failed: %v", err)
		return
	}
	podCount := 0
	for _, ns := range namespaces.Items {
		pods, err := clientset.CoreV1().Pods(ns.Name).List(ctx, metav1.ListOptions{})
		if err != nil {
			klog.Errorf("list pods in %s failed: %v", ns.Name, err)
			continue
		}
		for _, pod := range pods.Items {
			k8sPodDetail.With(prometheus.Labels{
				"ip":        pod.Status.PodIP,
				"pod_name":  pod.Name,
				"node_name": pod.Spec.NodeName,
				"namespace": ns.Name,
				"component": pod.Labels["component"],
			}).Set(1)
			podCount++
		}
	}
	// Report the pod count and how long the listing took.
	timeTook := time.Since(start).Seconds()
	getPodDuration.Set(timeTook)
	klog.Infof("listed %d kubernetes pods in %.3fs", podCount, timeTook)
}
func getK8sObjTicker(ctx context.Context) {
	ticker := time.NewTicker(time.Second * 10)
	klog.Infof("getK8sObjTicker start....")
	defer ticker.Stop()
	for {
		select {
		case <-ticker.C:
			go doGetNode(ctx)
			go doGetPods(ctx)
		case <-ctx.Done():
			return
		}
	}
}
func main() {
	// register the metrics
	newMetrics()
	ctx := context.Background()
	// start the goroutine that fetches the k8s objects
	go getK8sObjTicker(ctx)
	// expose the Prometheus metrics path
	http.Handle("/metrics", promhttp.Handler())
	if err := http.ListenAndServe(":8080", nil); err != nil {
		klog.Fatalf("http server failed: %v", err)
	}
}
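Note that context.Background() is never cancelled, so the ticker goroutine only stops when the process exits. If you want the loop to shut down cleanly on SIGTERM (which is how Kubernetes stops a pod), a hedged sketch using os/signal (extra imports: "os", "os/signal", "syscall"):
// Cancel ctx when the process receives SIGINT or SIGTERM.
ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
defer stop()
go getK8sObjTicker(ctx)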
3 Building the Container
3.1 Packaging
Copy two files into the current directory:
- /etc/ssl/certs/ca-certificates.crt
- /usr/share/zoneinfo/Asia/Shanghai (the path may differ; find it on your own system)
Compile the code (CGO is disabled so the binary is statically linked and runs on busybox):
CGO_ENABLED=0 go build -o jcrose-pod-metrics main.go
The final file layout:
root@elk:/usr/local/git# tree jcrose-pod-metrics/
jcrose-pod-metrics/
├── ca-certificates.crt
├── Dockerfile
├── go.mod
├── go.sum
├── jcrose-pod-metrics
├── main.go
└── Shanghai
Dockerfile:
FROM busybox:latest
COPY Shanghai /etc/localtime
COPY ca-certificates.crt /etc/ssl/certs/
COPY jcrose-pod-metrics /opt/app/jcrose-pod-metrics
ENTRYPOINT [ "/opt/app/jcrose-pod-metrics" ]
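Alternatively, a multi-stage build compiles the binary inside the image and copies the CA bundle and zoneinfo out of the Debian-based builder image, so nothing has to be prepared by hand; a sketch (the builder tag is an assumption):
FROM golang:1.22 AS builder
WORKDIR /src
COPY . .
RUN CGO_ENABLED=0 go build -o /jcrose-pod-metrics main.go

FROM busybox:latest
COPY --from=builder /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/
COPY --from=builder /usr/share/zoneinfo/Asia/Shanghai /etc/localtime
COPY --from=builder /jcrose-pod-metrics /opt/app/jcrose-pod-metrics
ENTRYPOINT [ "/opt/app/jcrose-pod-metrics" ]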
3.2 Building and pushing the image
Note that this image cannot simply be reused in other clusters: the ca-certificates.crt baked into it comes from my own cluster, so it can only talk to my current cluster.
docker build -t registry.cn-zhangjiakou.aliyuncs.com/jcrose-k8s/jcrose-pod-metrics:v1 .
docker push registry.cn-zhangjiakou.aliyuncs.com/jcrose-k8s/jcrose-pod-metrics:v1
4 Deployment
4.1 Writing the RBAC YAML
---
apiVersion: v1
kind: ServiceAccount
metadata:
  name: jcrose-pod-metrics
  namespace: monitoring
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRole
metadata:
  name: jcrose-pod-metrics
rules:
  - apiGroups:
      - ""
    resources:
      - nodes
      - pods
      - namespaces
    verbs:
      - list
---
apiVersion: rbac.authorization.k8s.io/v1
kind: ClusterRoleBinding
metadata:
  name: jcrose-pod-metrics
roleRef:
  apiGroup: rbac.authorization.k8s.io
  kind: ClusterRole
  name: jcrose-pod-metrics
subjects:
  - kind: ServiceAccount
    name: jcrose-pod-metrics
    namespace: monitoring
4.2 Writing deployment.yaml
deployment.yaml
apiVersion: apps/v1
kind: Deployment
metadata:
  name: jcrose-pod-metrics
  namespace: monitoring
  labels:
    app: jcrose-pod-metrics
spec:
  replicas: 1
  selector:
    matchLabels:
      app: jcrose-pod-metrics
  template:
    metadata:
      labels:
        app: jcrose-pod-metrics
      annotations:
        prometheus.io/scrape: 'true'
        prometheus.io/port: '8080'
        prometheus.io/path: '/metrics'
    spec:
      containers:
        - name: jcrose-pod-metrics
          image: registry.cn-zhangjiakou.aliyuncs.com/jcrose-k8s/jcrose-pod-metrics:v1
          command:
            - /opt/app/jcrose-pod-metrics
          ports:
            - containerPort: 8080
          resources:
            requests:
              cpu: 100m
              memory: 100Mi
            limits:
              cpu: 200m
              memory: 800Mi
      serviceAccountName: jcrose-pod-metrics
---
apiVersion: v1
kind: Service
metadata:
  name: jcrose-pod-metrics
  namespace: monitoring
  labels:
    app: jcrose-pod-metrics
spec:
  selector:
    app: jcrose-pod-metrics
  ports:
    - protocol: TCP
      port: 8080
      targetPort: 8080
  type: ClusterIP
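With both manifests written, apply them with kubectl (the file names here are assumptions; use whatever you saved them as):
kubectl apply -f rbac.yaml
kubectl apply -f deployment.yaml
kubectl get po -n monitoring -l app=jcrose-pod-metrics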
5 Verifying the Data Is Collected
5.1 Get the deployed pod's IP
pod_ip=`kubectl get po -n monitoring -l app=jcrose-pod-metrics -o wide |awk 'NR==2{print $6}'`
root@elk:~/jcrose-pod-metrics# curl -s $pod_ip:8080/metrics | grep jcrose
# HELP jcrose_pod_metrics_get_node_last_duration_seconds get node last duration seconds
# TYPE jcrose_pod_metrics_get_node_last_duration_seconds gauge
jcrose_pod_metrics_get_node_last_duration_seconds 0.00405659
# HELP jcrose_pod_metrics_get_node_node_detail k8s node detail each
# TYPE jcrose_pod_metrics_get_node_node_detail gauge
jcrose_pod_metrics_get_node_node_detail{containerRuntimeVersion="containerd://1.6.8",hostname="elk",ip="192.168.44.129",kubeletVersion="v1.26.7"} 1
# HELP jcrose_pod_metrics_get_pod_control_plane_pod_detail k8s pod detail of control plane
# TYPE jcrose_pod_metrics_get_pod_control_plane_pod_detail gauge
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.150",namespace="ingress-nginx",node_name="elk",pod_name="ingress-nginx-admission-create-c7rp7"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.151",namespace="ingress-nginx",node_name="elk",pod_name="ingress-nginx-admission-patch-7wtgd"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.154",namespace="kube-system",node_name="elk",pod_name="coredns-5bbd96d687-g55lz"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.155",namespace="monitoring",node_name="elk",pod_name="prometheus-adapter-6c4cc5465b-clfsl"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.156",namespace="monitoring",node_name="elk",pod_name="alertmanager-main-1"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.157",namespace="monitoring",node_name="elk",pod_name="grafana-79f47474f7-cjhsd"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.158",namespace="monitoring",node_name="elk",pod_name="alertmanager-main-2"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.159",namespace="kube-system",node_name="elk",pod_name="calico-kube-controllers-57b57c56f-6rrsr"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.160",namespace="monitoring",node_name="elk",pod_name="prometheus-k8s-1"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.161",namespace="kube-system",node_name="elk",pod_name="coredns-5bbd96d687-gb5wq"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.162",namespace="monitoring",node_name="elk",pod_name="alertmanager-main-0"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.163",namespace="ingress-nginx",node_name="elk",pod_name="ingress-nginx-controller-775946b75b-wgzbc"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.164",namespace="monitoring",node_name="elk",pod_name="prometheus-adapter-6c4cc5465b-x4dps"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.165",namespace="monitoring",node_name="elk",pod_name="blackbox-exporter-59dddb7bb6-9jmt8"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.166",namespace="monitoring",node_name="elk",pod_name="prometheus-operator-57cf88fbcb-tjds9"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.167",namespace="monitoring",node_name="elk",pod_name="kube-state-metrics-5884fb96b-dz4hn"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.168",namespace="nfs",node_name="elk",pod_name="nfs-subdir-external-provisioner-6f5fcd7d8b-v54wb"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.169",namespace="monitoring",node_name="elk",pod_name="prometheus-k8s-0"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="10.244.142.175",namespace="monitoring",node_name="elk",pod_name="jcrose-pod-metrics-deployment-668dcb888f-6xhtx"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="192.168.44.129",namespace="kube-system",node_name="elk",pod_name="calico-node-x8x2v"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="192.168.44.129",namespace="kube-system",node_name="elk",pod_name="kube-proxy-rd2sk"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="",ip="192.168.44.129",namespace="monitoring",node_name="elk",pod_name="node-exporter-4gbh7"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="etcd",ip="192.168.44.129",namespace="kube-system",node_name="elk",pod_name="etcd-elk"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="kube-apiserver",ip="192.168.44.129",namespace="kube-system",node_name="elk",pod_name="kube-apiserver-elk"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="kube-controller-manager",ip="192.168.44.129",namespace="kube-system",node_name="elk",pod_name="kube-controller-manager-elk"} 1
jcrose_pod_metrics_get_pod_control_plane_pod_detail{component="kube-scheduler",ip="192.168.44.129",namespace="kube-system",node_name="elk",pod_name="kube-scheduler-elk"} 1
# HELP jcrose_pod_metrics_get_pod_last_duration_seconds get pod last duration seconds
# TYPE jcrose_pod_metrics_get_pod_last_duration_seconds gauge
jcrose_pod_metrics_get_pod_last_duration_seconds 0.109656193
The data is being collected successfully; next, wire it into Prometheus and Grafana.
6 Prometheus and Grafana
Because my Prometheus is deployed with the kube-prometheus operator, the job definition is written a bit differently.
The file I modified: /usr/local/git/kube-prometheus/manifests/additional/jcrose/config/prometheus-additional-config.yaml
- job_name: 'jcrose-custom-metrics'
  metrics_path: /metrics
  static_configs:
    - targets:
        - "jcrose-pod-metrics.monitoring.svc.cluster.local:8080"