服务
以GPU监控为例(参考Prometheus官方文档),这里用到了gonvml库,需要确保运行环境能找到libnvidia-ml.so.1库。如果没有该库,也可以调用nvidia-smi命令,手动解析其输出来获取监控指标,不过这样比较麻烦。
Gauge和GaugeVec的区别:Gauge表示单个监控指标;GaugeVec表示一组同名监控指标的集合,需要预先定义标签名列表(labels []string),再通过WithLabelValues(values...)按标签值选择具体指标,适用于多CPU、多GPU等场景。
package main
import (
"flag"
"fmt"
"github.com/mindprince/gonvml"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promhttp"
"net/http"
"strconv"
"sync"
)
// namespace is the value supplied as GaugeOpts.Namespace for every metric
// created in NewCollector.
const namespace = "nvidia_gpu"
// labels holds the label names shared by every GaugeVec created in
// NewCollector.
var labels = []string{"minor_number", "uuid", "name"}
// Collector groups the Prometheus gauges that describe the NVIDIA GPUs.
// Instances are built by NewCollector, which fills in every gauge field.
type Collector struct {
	// Embedded, so Lock and Unlock are promoted onto Collector itself.
	// NOTE(review): presumably guards metric collection; the locking site is
	// not visible in this part of the file — confirm.
	sync.Mutex

	// numDevices is a single, unlabelled gauge ("num_devices").
	numDevices prometheus.Gauge

	// The remaining gauges are vectors keyed by the package-level labels.
	usedMemory  *prometheus.GaugeVec // "memory_used_bytes"
	totalMemory *prometheus.GaugeVec // "memory_total_bytes"
	dutyCycle   *prometheus.GaugeVec // "duty_cycle"
	powerUsage  *prometheus.GaugeVec // "power_usage_milliwatts"
	temperature *prometheus.GaugeVec // "temperature_celsius"
}
// NewCollector returns a Collector with all of its gauges constructed.
//
// Every metric uses the package namespace; the GaugeVec metrics additionally
// use the package-level labels. This function only constructs the metrics and
// does not register them with any registry.
func NewCollector() *Collector {
	// labelledGauge builds one GaugeVec in the shared namespace with the
	// shared label names, so only the name and help text vary per metric.
	labelledGauge := func(name, help string) *prometheus.GaugeVec {
		opts := prometheus.GaugeOpts{
			Namespace: namespace,
			Name:      name,
			Help:      help,
		}
		return prometheus.NewGaugeVec(opts, labels)
	}

	deviceCount := prometheus.NewGauge(prometheus.GaugeOpts{
		Namespace: namespace,
		Name:      "num_devices",
		Help:      "Number of GPU devices",
	})

	collector := &Collector{numDevices: deviceCount}
	collector.usedMemory = labelledGauge(
		"memory_used_bytes",
		"Memory used by the GPU device in bytes",
	)
	collector.totalMemory = labelledGauge(
		"memory_total_bytes",
		"Total memory of the GPU device in bytes",
	)
	collector.dutyCycle = labelledGauge(
		"duty_cycle",
		"Percent of time over the past sample period during which one or more kernels were executing on the GPU device",
	)
	collector.powerUsage = labelledGauge(
		"power_usage_milliwatts",
		"Power usage of the GPU device in milliwatts",
	)
	collector.temperature = labelledGauge(
		"temperature_celsius",
		"Temperature of the GPU device in celsius",
	)
	return collector
}
//Describe return the descriptions
func (c *Collector) Describe(ch chan<- *prometheus.Desc) {
ch <- c.numDevices.Desc()
c.usedMemory