Prometheus collects metrics from its monitoring targets on a recurring schedule defined by scrape_interval (default 1m); scrape_interval can be set globally or overridden per scrape job. The collected samples are then persisted in Prometheus's local storage.
Prometheus evaluates alerting rules on a separate, independent schedule defined by evaluation_interval (default 1m); evaluation_interval is set globally, although a rule group may override it with its own interval. Each evaluation updates the alert states.
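For reference, a minimal prometheus.yml sketch showing where the two intervals live (the job name and target address are placeholders, not from the original):

global:
  scrape_interval: 1m      # how often targets are scraped
  evaluation_interval: 1m  # how often rule groups are evaluated

scrape_configs:
  - job_name: mysql                 # hypothetical job
    scrape_interval: 30s            # per-job override of the global value
    static_configs:
      - targets: ["localhost:9104"] # hypothetical exporter address

The demo below models this pipeline in Go.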
package main

import (
	"fmt"
	"strconv"
	"strings"
	"time"

	"github.com/cespare/xxhash/v2"
)

// Group models a Prometheus rule group: a set of rules evaluated together
// on a shared schedule.
type Group struct {
	name string
	// interval mirrors Prometheus's evaluation_interval: how often the
	// group's alerting rules are evaluated.
	interval time.Duration
	rules    []Rule
}
// Run evaluates every rule in the group once, standing in for one tick of
// Prometheus's rule-group evaluation loop, and reports any firing alerts.
func (g *Group) Run() {
	fmt.Println(time.Now(), g.name)
	for _, r := range g.rules {
		alerts, err := r.Eval(time.Now())
		if err != nil {
			fmt.Printf("g.name:%s, rule.name:%s, error:%v\n", g.name, r.GetName(), err)
			continue
		}
		// sendAlarm: print the values of all firing alerts.
		if len(alerts) > 0 {
			values := []float64{}
			for _, alert := range alerts {
				values = append(values, alert.Value)
			}
			fmt.Printf("time:%v, g.name:%s, rule.name:%s sendAlarm values:%v\n", time.Now(), g.name, r.GetName(), values)
		}
	}
}
type AlertState int

const (
	// StateInactive is the state of an alert that is neither firing nor pending.
	StateInactive AlertState = iota
	// StatePending is the state of an alert that has been active for less than
	// the configured threshold duration.
	StatePending
	// StateFiring is the state of an alert that has been active for longer than
	// the configured threshold duration.
	StateFiring
)

type Alert struct {
	State       AlertState
	Labels      Labels
	Annotations Labels
	// Value is the value at the last evaluation of the alerting expression.
	Value float64
	// ActiveAt, FiredAt, and ResolvedAt bound the interval during which the
	// condition of this alert held true. A zero ResolvedAt indicates a
	// still-active alert.
	ActiveAt   time.Time
	FiredAt    time.Time
	ResolvedAt time.Time
}
var seps = []byte{'\xff'}

type Labels []Label

type Label struct {
	Name, Value string
}

// Hash returns a hash value for the label set.
func (ls Labels) Hash() uint64 {
	// Use xxhash.Sum64(b) for the fast path as it's faster.
	b := make([]byte, 0, 1024)
	for i, v := range ls {
		if len(b)+len(v.Name)+len(v.Value)+2 >= cap(b) {
			// If the labels entry is 1KB+, do not allocate the whole entry.
			h := xxhash.New()
			_, _ = h.Write(b)
			for _, v := range ls[i:] {
				_, _ = h.WriteString(v.Name)
				_, _ = h.Write(seps)
				_, _ = h.WriteString(v.Value)
				_, _ = h.Write(seps)
			}
			return h.Sum64()
		}
		b = append(b, v.Name...)
		b = append(b, seps[0])
		b = append(b, v.Value...)
		b = append(b, seps[0])
	}
	return xxhash.Sum64(b)
}
// resolvedRetention is the duration for which a resolved alert instance
// is kept in memory state and consequently repeatedly sent to the AlertManager.
const resolvedRetention = 15 * time.Minute

type Rule interface {
	// Eval evaluates the rule at ts and returns the alerts that are firing.
	Eval(ts time.Time) (firing []*Alert, err error)
	GetName() string
}
// AlertingRule models a single Prometheus alerting rule.
type AlertingRule struct {
	name string
	// vector is the alerting expression, e.g. "mysql_cpu>10".
	vector string
	// holdDuration is how long the condition must keep holding before the
	// alert fires (the `for` clause of a Prometheus alerting rule).
	holdDuration time.Duration
	labels       Labels
	annotations  Labels
	// active is a map of alerts which are currently active (Pending or
	// Firing), keyed by the fingerprint of the labelset they correspond to.
	active map[uint64]*Alert
}

func (r *AlertingRule) GetName() string {
	return r.name
}
func (r *AlertingRule) Eval(ts time.Time) (firing []*Alert, err error) {
	res, err := query(r.vector, ts)
	if err != nil {
		return nil, err
	}
	alerts := make(map[uint64]*Alert, len(res))
	resultFPs := map[uint64]struct{}{}
	// Step 1: every sample that currently satisfies the alerting expression
	// becomes a candidate alert in StatePending.
	for _, smpl := range res {
		// expand is a toy stand-in for Prometheus's template expansion of
		// label and annotation values.
		expand := func(text string) string {
			return fmt.Sprintf(`
__alert_:%s,
__vector_:%s,
__value_:%s,
__smpl_:%v`,
				r.name, r.vector, text, smpl)
		}
		annotations := make(Labels, 0, len(r.annotations))
		for _, a := range r.annotations {
			annotations = append(annotations, Label{Name: a.Name, Value: expand(a.Value)})
		}
		labels := make(Labels, 0, len(r.labels))
		for _, l := range r.labels {
			labels = append(labels, Label{Name: l.Name, Value: expand(l.Value)})
		}
		h := labels.Hash()
		resultFPs[h] = struct{}{}
		if _, ok := alerts[h]; ok {
			return nil, fmt.Errorf("vector contains metrics with the same labelset after applying alert labels")
		}
		alerts[h] = &Alert{
			Labels:      labels,
			Annotations: annotations,
			ActiveAt:    ts,
			State:       StatePending,
			Value:       smpl,
		}
	}
	// Step 2: merge the candidates into the rule's active set. If we already
	// have alerting state for the identifying label set, update the last value
	// and annotations; otherwise create a new alert entry.
	for h, a := range alerts {
		if alert, ok := r.active[h]; ok && alert.State != StateInactive {
			alert.Value = a.Value
			alert.Annotations = a.Annotations
			continue
		}
		r.active[h] = a
	}
	// Step 3: check if any pending alerts should be removed or fire now.
	for fp, a := range r.active {
		if _, ok := resultFPs[fp]; !ok {
			// The condition no longer holds. If the alert was previously
			// firing, keep it around for a given retention time so it is
			// reported as resolved to the AlertManager.
			if a.State == StatePending || (!a.ResolvedAt.IsZero() && ts.Sub(a.ResolvedAt) > resolvedRetention) {
				delete(r.active, fp)
			}
			if a.State != StateInactive {
				a.State = StateInactive
				a.ResolvedAt = ts
			}
			continue
		}
		if a.State == StatePending && ts.Sub(a.ActiveAt) >= r.holdDuration {
			a.State = StateFiring
			a.FiredAt = ts
		}
		if a.State == StateFiring {
			firing = append(firing, a)
		}
	}
	return firing, nil
}
// SourceData simulates the samples that Prometheus scrapes from its targets
// every scrape_interval, keyed by metric name and then by minute-granularity
// timestamp. Note: query looks samples up by the current minute, so the demo
// only produces alerts when the wall clock falls within these timestamps;
// adjust them to the current time to reproduce the behavior.
var SourceData = map[string]map[string][]float64{
	"mysql_cpu": {
		"2022-05-17 16:26": {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20},
		"2022-05-17 16:27": {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20},
		"2022-05-17 16:28": {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20},
		"2022-05-17 16:29": {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20},
		"2022-05-17 16:30": {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
		"2022-05-17 16:31": {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20},
		"2022-05-17 16:32": {1, 2, 3, 4, 5, 6, 7, 8, 9, 10},
	},
}
// query is a toy stand-in for a PromQL instant query: it parses a vector of
// the form "<metric>><threshold>" and returns all samples scraped at ts that
// exceed the threshold.
func query(vector string, ts time.Time) (res []float64, err error) {
	ends := strings.Split(vector, ">")
	key := ends[0]
	target, err := strconv.ParseFloat(ends[1], 64)
	if err != nil {
		return nil, err
	}
	for _, v := range SourceData[key][ts.Format("2006-01-02 15:04")] {
		if v > target {
			res = append(res, v)
		}
	}
	return res, nil
}
func main() {
	rules1 := make([]Rule, 0)
	rules1 = append(rules1, &AlertingRule{
		name:         "mysql_cpu",
		vector:       "mysql_cpu>10",
		holdDuration: time.Minute * 2, // the rule's `for` duration
		labels: Labels{
			Label{Name: "level", Value: "critical"},
		},
		annotations: Labels{
			Label{Name: "detail", Value: "database CPU"},
		},
		active: map[uint64]*Alert{},
	})
	groups := []Group{
		{
			name:     "group1",
			interval: time.Minute,
			rules:    rules1,
		},
	}
	// Each group evaluates its rules once per interval, mirroring
	// Prometheus's per-group evaluation loop.
	for _, group := range groups {
		go func(group Group) {
			for {
				group.Run()
				time.Sleep(group.interval)
			}
		}(group)
	}
	select {}
}
Net result: the alert fires once the condition has held for the 2-minute hold duration, and stops firing as soon as the condition no longer holds.
Putting it together: after Prometheus scrapes the data, it evaluates the alerting rules against it. When the expression is true, the alert enters the pending state; once it has remained true for longer than the `for` duration, it transitions to firing. Firing alerts are pushed to Alertmanager, which sends the notification after group_wait has elapsed.
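The demo above stops at printing. As an illustrative sketch of the hand-off (not part of the original demo), firing alerts could be POSTed to Alertmanager's documented /api/v2/alerts endpoint. The function below reuses the Alert type from the code above, additionally needs the bytes, encoding/json, and net/http imports, and takes a placeholder amURL:

// sendToAlertmanager posts firing alerts to Alertmanager's /api/v2/alerts
// endpoint. Sketch only: amURL is a placeholder such as "http://localhost:9093".
func sendToAlertmanager(amURL string, alerts []*Alert) error {
	// amAlert matches the JSON shape Alertmanager's v2 API accepts.
	type amAlert struct {
		Labels      map[string]string `json:"labels"`
		Annotations map[string]string `json:"annotations"`
		StartsAt    time.Time         `json:"startsAt"`
	}
	payload := make([]amAlert, 0, len(alerts))
	for _, a := range alerts {
		labels, annotations := map[string]string{}, map[string]string{}
		for _, l := range a.Labels {
			labels[l.Name] = l.Value
		}
		for _, l := range a.Annotations {
			annotations[l.Name] = l.Value
		}
		payload = append(payload, amAlert{Labels: labels, Annotations: annotations, StartsAt: a.ActiveAt})
	}
	body, err := json.Marshal(payload)
	if err != nil {
		return err
	}
	resp, err := http.Post(amURL+"/api/v2/alerts", "application/json", bytes.NewReader(body))
	if err != nil {
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode/100 != 2 {
		return fmt.Errorf("alertmanager returned %s", resp.Status)
	}
	return nil
}

In Prometheus itself this push happens on every evaluation for firing (and recently resolved) alerts, which is why resolvedRetention keeps resolved alerts around for a while.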
Delayed or overly frequent alerts
Looking at the alerting pipeline end to end: once data reaches Alertmanager, the larger group_wait is, the longer it takes for the notification to arrive, i.e. the alert is delayed; conversely, if group_wait is set too small, notifications arrive too frequently. It therefore needs to be tuned to the concrete scenario.
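For illustration, these are the relevant knobs on a minimal alertmanager.yml route (the receiver name is a placeholder):

route:
  receiver: ops-team   # hypothetical receiver
  group_by: ["alertname"]
  group_wait: 30s      # wait before the first notification for a new group
  group_interval: 5m   # wait before notifying about new alerts in an existing group
  repeat_interval: 4h  # wait before re-sending a still-firing notification

group_interval and repeat_interval govern how often follow-up notifications go out, so they are tuned together with group_wait.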
Alerting when it should not
Prometheus pulls data from each target every scrape_interval and then evaluates the rules against it. In the meantime, the target's data may already have returned to normal: during the `for` window the metric recovered, but the recovery fell between two evaluations and was skipped, so the hold duration was considered satisfied, the alert fired, and a notification was sent. Viewed in Grafana, however, the data looks normal and no alert seems warranted. The reason is that Grafana, with Prometheus as its data source, issues range queries, which are far denser than the sparse instant samples used for alert evaluation.
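A toy, self-contained illustration of this mismatch (all timestamps and values are invented): rule evaluation samples the series only at evaluation instants, so a dip between two evaluations never reaches the `for` logic, whereas a range query returns every scraped sample.

package main

import "fmt"

func main() {
	// Hypothetical cpu samples scraped every 15s; alert condition: cpu > 10.
	times := []string{"16:26:00", "16:26:15", "16:26:30", "16:26:45", "16:27:00"}
	values := []float64{12, 13, 8, 9, 14}

	// Rule evaluation: instant samples at evaluation_interval = 1m.
	// Both evaluations see cpu > 10; the recovery at 16:26:30/16:26:45
	// is invisible to the rule.
	for _, i := range []int{0, 4} {
		fmt.Printf("rule eval %s: cpu=%v, condition true\n", times[i], values[i])
	}

	// Range query (what Grafana issues): every sample, dip included.
	for i, t := range times {
		fmt.Printf("range query %s: cpu=%v\n", t, values[i])
	}
}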