Prometheus collects metrics from its monitoring targets on a recurring schedule defined by scrape_interval (default 1m); scrape_interval can be set globally or overridden per scrape job. The collected samples are then persisted in Prometheus's local storage.
Prometheus evaluates alerting rules on a separate, independent schedule defined by evaluation_interval (default 1m); evaluation_interval is set globally, although a rule group may override it with its own interval. Each evaluation updates the alert states.
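For reference, a minimal prometheus.yml sketch showing where the two intervals live (the job name and target address are placeholders, not from the original):

global:
  scrape_interval: 1m      # how often targets are scraped
  evaluation_interval: 1m  # how often rule groups are evaluated

scrape_configs:
  - job_name: mysql                 # hypothetical job
    scrape_interval: 30s            # per-job override of the global value
    static_configs:
      - targets: ["localhost:9104"] # hypothetical exporter address

The demo below models this pipeline in Go.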
package main

import (
	"fmt"
	"strconv"
	"strings"
	"time"

	"github.com/cespare/xxhash/v2"
)

// Group models a Prometheus rule group: a set of rules evaluated together
// on a shared schedule.
type Group struct {
	name string
	// interval mirrors Prometheus's evaluation_interval: how often the
	// group's alerting rules are evaluated.
	interval time.Duration
	rules    []Rule
}
// Run evaluates every rule in the group once, standing in for one tick of
// Prometheus's rule-group evaluation loop, and reports any firing alerts.
func (g *Group) Run() {
	fmt.Println(time.Now(), g.name)
	for _, r := range g.rules {
		alerts, err := r.Eval(time.Now())
		if err != nil {
			fmt.Printf("g.name:%s, rule.name:%s, error:%v\n", g.name, r.GetName(), err)
			continue
		}
		// sendAlarm: print the values of all firing alerts.
		if len(alerts) > 0 {
			values := []float64{}
			for _, alert := range alerts {
				values = append(values, alert.Value)
			}
			fmt.Printf("time:%v, g.name:%s, rule.name:%s sendAlarm values:%v\n", time.Now(), g.name, r.GetName(), values)
		}
	}
}
type AlertState int

const (
	// StateInactive is the state of an alert that is neither firing nor pending.
	StateInactive AlertState = iota
	// StatePending is the state of an alert that has been active for less than
	// the configured threshold duration.
	StatePending
	// StateFiring is the state of an alert that has been active for longer than
	// the configured threshold duration.
	StateFiring
)

type Alert struct {
	State       AlertState
	Labels      Labels
	Annotations Labels
	// Value is the value at the last evaluation of the alerting expression.
	Value float64
	// ActiveAt, FiredAt, and ResolvedAt bound the interval during which the
	// condition of this alert held true. A zero ResolvedAt indicates a
	// still-active alert.
	ActiveAt   time.Time
	FiredAt    time.Time
	ResolvedAt time.Time
}
var seps = []byte{'\xff'}

type Labels []Label

type Label struct {
	Name, Value string
}

// Hash returns a hash value for the label set.
func (ls Labels) Hash() uint64 {
	// Use xxhash.Sum64(b) for the fast path as it's faster.
	b := make([]byte, 0, 1024)
	for i, v := range ls {
		if len(b)+len(v.Name)+len(v.Value)+2 >= cap(b) {
			// If the labels entry is 1KB+, do not allocate the whole entry.
			h := xxhash.New()
			_, _ = h.Write(b)
			for _, v := range ls[i:] {
				_, _ = h.WriteString(v.Name)
				_, _ = h.Write(seps)
				_, _ = h.WriteString(v.Value)
				_, _ = h.Write(seps)
			}
			return h.Sum64()
		}
		b = append(b, v.Name...)
		b = append(b, seps[0])
		b = append(b, v.Value...)
		b = append(b, seps[0])
	}
	return xxhash.Sum64(b)
}
// resolvedRetention is the duration for which a resolved alert instance
// is kept in memory state and consequently repeatedly sent to the AlertManager.
const resolvedRetention = 15 * time.Minute

type Rule interface {
	// Eval evaluates the rule at ts and returns the alerts that are firing.
	Eval(ts time.Time) (firing []*Alert, err error)
	GetName() string
}
// AlertingRule models a single Prometheus alerting rule.
type AlertingRule struct {
	name string
	// vector is the alerting expression, e.g. "mysql_cpu>10".
	vector string
	// holdDuration is how long the condition must keep holding before the
	// alert fires (the `for` clause of a Prometheus alerting rule).
	holdDuration time.Duration
	labels       Labels
	annotations  Labels
	// active is a map of alerts which are currently active (Pending or
	// Firing), keyed by the fingerprint of the labelset they correspond to.
	active map[uint64]*Alert
}

func (r *AlertingRule) GetName() string {
	return r.name
}
func (r *AlertingRule) Eval(ts time.Time) (firing []*Alert, err error) {
	res, err := query(r.vector, ts)
	if err != nil {
		return nil, err
	}
	alerts := make(map[uint64]*Alert, len(res))
	resultFPs := map[uint64]struct{}{}
	// Step 1: every sample that currently satisfies the alerting expression
	// becomes a candidate alert in StatePending.
	for _, smpl := range res {
		// expand is a toy stand-in for Prometheus's template expansion of
		// label and annotation values.
		expand := func(text string) string {
			return fmt.Sprintf(`
__alert_:%s,
__vector_:%s,
__value_:%s,
__smpl_:%v`,
				r.name, r.vector, text, smpl)
		}
		annotations := make(Labels, 0, len(r.annotations))
		for _, a := range r.annotations {
			annotations = append(annotations, Label{Name: a.Name, Value: expand(a.Value)})
		}
		labels := make(Labels, 0, len(r.labels))
		for _, l := range r.labels {
			labels = append(labels, Label{Name: l.Name, Value: expand(l.Value)})
		}
		h := labels.Hash()
		resultFPs[h] = struct{}{}
		if _, ok := alerts[h]; ok {
			return nil, fmt.Errorf("vector contains metrics with the same labelset after applying alert labels")
		}
		alerts[h] = &Alert{
			Labels:      labels,
			Annotations: annotations,
			ActiveAt:    ts,
			State:       StatePending,
			Value:       smpl,
		}
	}
	// Step 2: merge the candidates into the rule's active set. If we already
	// have alerting state for the identifying label set, update the last value
	// and annotations; otherwise create a new alert entry.
	for h, a := range alerts {
		if alert, ok := r.active[h]; ok && alert.State != StateInactive {
			alert.Value = a.Value
			alert.Annotations = a.Annotations
			continue
		}
		r.active[h] = a
	}
	// Step 3: check if any pending alerts should be removed or fire now.
	for fp, a := range r.active {
		if _, ok := resultFPs[fp]; !ok {
			// The condition no longer holds. If the alert was previously
			// firing, keep it around for a given retention time so it is
			// reported as resolved to the AlertManager.
			if a.State == StatePending || (!a.ResolvedAt.IsZero() && ts.Sub(a.ResolvedAt) > resolvedRetention) {
				delete(r.active, fp)
			}
			if a.State != StateInactive {
				a.State = StateInactive
				a.ResolvedAt = ts
			}
			continue
		}
		if a.State == StatePending && ts.Sub(a.ActiveAt) >= r.holdDuration {
			a.State = StateFiring
			a.FiredAt = ts
		}
		if a.State == StateFiring {
			firing = append(firing, a)
		}
	}
	return firing, nil
}
// SourceData simulates the samples that Prometheus scrapes from its targets
// every scrape_interval, keyed by metric name and then by minute-granularity
// timestamp. Note: query looks samples up by the current minute, so the demo
// only produces alerts when the wall clock falls within these timestamps;
// adjust them to the current time to reproduce the behavior.
var SourceData = map[string]map[string][]float64{
	"mysql_cpu": {
		"2022-05-17 16:26": {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20},
		"2022-05-17 16:27": {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20},
		"2022-05-17 16:28": {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20},
		"2022-05-17 16:29": {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20},
		"2022-05-17 16:30": {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
		"2022-05-17 16:31": {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20},
		"2022-05-17 16:32": {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
	},
}
// query is a toy stand-in for a PromQL instant query: it parses a vector of
// the form "<metric>><threshold>" and returns all samples scraped at ts that
// exceed the threshold.
func query(vector string, ts time.Time) (res []float64, err error) {
	ends := strings.Split(vector, ">")
	key := ends[0]
	target, err := strconv.ParseFloat(ends[1], 64)
	if err != nil {
		return nil, err
	}
	for _, v := range SourceData[key][ts.Format("2006-01-02 15:04")] {
		if v > target {
			res = append(res, v)
		}
	}
	return res, nil
}
func main() {
	rules1 := make([]Rule, 0)
	rules1 = append(rules1, &AlertingRule{
		name:         "mysql_cpu",
		vector:       "mysql_cpu>10",
		holdDuration: time.Minute * 2, // the rule's `for` duration
		labels: Labels{
			Label{Name: "level", Value: "critical"},
		},
		annotations: Labels{
			Label{Name: "detail", Value: "database CPU"},
		},
		active: map[uint64]*Alert{},
	})
	groups := []Group{
		{
			name:     "group1",
			interval: time.Minute,
			rules:    rules1,
		},
	}
	// Each group evaluates its rules once per interval, mirroring
	// Prometheus's per-group evaluation loop.
	for _, group := range groups {
		go func(group Group) {
			for {
				group.Run()
				time.Sleep(group.interval)
			}
		}(group)
	}
	select {}
}
Net result: the alert fires once the condition has held for the 2-minute hold duration, and stops firing as soon as the condition no longer holds.
Putting it together: after Prometheus scrapes the data, it evaluates the alerting rules against it. When the expression is true, the alert enters the pending state; once it has remained true for longer than the `for` duration, it transitions to firing. Firing alerts are pushed to Alertmanager, which sends the notification after group_wait has elapsed.
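The demo above stops at printing. As an illustrative sketch of the hand-off (not part of the original demo), firing alerts could be POSTed to Alertmanager's documented /api/v2/alerts endpoint. The function below reuses the Alert type from the code above, additionally needs the bytes, encoding/json, and net/http imports, and takes a placeholder amURL:

// sendToAlertmanager posts firing alerts to Alertmanager's /api/v2/alerts
// endpoint. Sketch only: amURL is a placeholder such as "http://localhost:9093".
func sendToAlertmanager(amURL string, alerts []*Alert) error {
	// amAlert matches the JSON shape Alertmanager's v2 API accepts.
	type amAlert struct {
		Labels      map[string]string `json:"labels"`
		Annotations map[string]string `json:"annotations"`
		StartsAt    time.Time         `json:"startsAt"`
	}
	payload := make([]amAlert, 0, len(alerts))
	for _, a := range alerts {
		labels, annotations := map[string]string{}, map[string]string{}
		for _, l := range a.Labels {
			labels[l.Name] = l.Value
		}
		for _, l := range a.Annotations {
			annotations[l.Name] = l.Value
		}
		payload = append(payload, amAlert{Labels: labels, Annotations: annotations, StartsAt: a.ActiveAt})
	}
	body, err := json.Marshal(payload)
	if err != nil {
		return err
	}
	resp, err := http.Post(amURL+"/api/v2/alerts", "application/json", bytes.NewReader(body))
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode/100 != 2 {
		return fmt.Errorf("alertmanager returned %s", resp.Status)
	}
	return nil
}

In Prometheus itself this push happens on every evaluation for firing (and recently resolved) alerts, which is why resolvedRetention keeps resolved alerts around for a while.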
Delayed or overly frequent alerts
Looking at the alerting pipeline end to end: once data reaches Alertmanager, the larger group_wait is, the longer it takes for the notification to arrive, i.e. the alert is delayed; conversely, if group_wait is set too small, notifications arrive too frequently. It therefore needs to be tuned to the concrete scenario.
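For illustration, these are the relevant knobs on a minimal alertmanager.yml route (the receiver name is a placeholder):

route:
  receiver: ops-team   # hypothetical receiver
  group_by: ["alertname"]
  group_wait: 30s      # wait before the first notification for a new group
  group_interval: 5m   # wait before notifying about new alerts in an existing group
  repeat_interval: 4h  # wait before re-sending a still-firing notification

group_interval and repeat_interval govern how often follow-up notifications go out, so they are tuned together with group_wait.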
Alerting when it should not
Prometheus pulls data from each target every scrape_interval and then evaluates the rules against it. In the meantime, the target's data may already have returned to normal: during the `for` window the metric recovered, but the recovery fell between two evaluations and was skipped, so the hold duration was considered satisfied, the alert fired, and a notification was sent. Viewed in Grafana, however, the data looks normal and no alert seems warranted. The reason is that Grafana, with Prometheus as its data source, issues range queries, which are far denser than the sparse instant samples used for alert evaluation.
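A toy, self-contained illustration of this mismatch (all timestamps and values are invented): rule evaluation samples the series only at evaluation instants, so a dip between two evaluations never reaches the `for` logic, whereas a range query returns every scraped sample.

package main

import "fmt"

func main() {
	// Hypothetical cpu samples scraped every 15s; alert condition: cpu > 10.
	times := []string{"16:26:00", "16:26:15", "16:26:30", "16:26:45", "16:27:00"}
	values := []float64{12, 13, 8, 9, 14}

	// Rule evaluation: instant samples at evaluation_interval = 1m.
	// Both evaluations see cpu > 10; the recovery at 16:26:30/16:26:45
	// is invisible to the rule.
	for _, i := range []int{0, 4} {
		fmt.Printf("rule eval %s: cpu=%v, condition true\n", times[i], values[i])
	}

	// Range query (what Grafana issues): every sample, dip included.
	for i, t := range times {
		fmt.Printf("range query %s: cpu=%v\n", t, values[i])
	}
}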