目录
本系列尝试分析prometheus中部分重要模块的静态结构。
prometheus在实际使用中较为典型的应用是采集环境信息,并提供监控告警功能;这个功能主要涉及规则和告警通知两个模块。单独看其中一个模块并不完整,所以这次将两个模块结合起来看。
单独看静态结构的话,本功能可以说是很简单了,其实只是ruleManager向notifyManager发送告警,然后notifyManager将具体告警分发给几个alertmanagerSet。
所以其实要看明白这个功能的流程,应该主要从数据结构上做一些分析。
规则数据结构
ruleManager
// The Manager manages recording and alerting rules.
//
// NOTE(review): none of Manager's methods are visible in this excerpt, so the
// per-field notes below are read off the field names and types only; confirm
// them against the implementation.
type Manager struct {
// Options this manager was configured with (see ManagerOptions).
opts *ManagerOptions
// Rule groups indexed by a string key; presumably derived from the group's
// name and/or file (confirm).
groups map[string]*Group
// Read/write lock; presumably guards groups (confirm).
mtx sync.RWMutex
// Signalling channels with an empty-struct payload. Presumably block gates
// start-up and done signals shutdown (confirm).
block chan struct{}
done chan struct{}
// Presumably records whether alert state has already been restored (confirm).
restored bool
logger log.Logger
}
ManagerOptions
// ManagerOptions bundles options for the Manager.
//
// A pointer to this struct is what rules.NewManager receives when the rule
// manager is constructed.
type ManagerOptions struct {
ExternalURL *url.URL
QueryFunc QueryFunc
// NotifyFunc is the hook through which alerts leave the rules package. The
// construction site in main sets it to the closure returned by sendAlerts.
NotifyFunc NotifyFunc
Context context.Context
Appendable storage.Appendable
Queryable storage.Queryable
Logger log.Logger
Registerer prometheus.Registerer
// The three durations below are filled from cfg.outageTolerance,
// cfg.forGracePeriod and cfg.resendDelay at the construction site.
// NOTE(review): their exact semantics are not visible in this excerpt.
OutageTolerance time.Duration
ForGracePeriod time.Duration
ResendDelay time.Duration
// NOTE(review): GroupLoader and Metrics are left unset by the construction
// site shown in this file; presumably the manager supplies defaults (confirm).
GroupLoader GroupLoader
Metrics *Metrics
}
我们可以看到,初始化ruleManager是要以ManagerOptions为入参的,这部分标识了一个规则管理结构体能够进行的操作。跟一下在main函数中初始化ruleManager的流程,我们可以发现这里面的NotifyFunc赋值的是sendAlerts的返回值。sendAlerts函数做了一件很重要的事情,它将rules.Alert转换成为notifier.Alert,然后调用notify这边的Send函数发送。下面就可以走到notify模块来看了。
// Build the rule manager. NotifyFunc is the bridge into the notifier module:
// it is set to the closure returned by sendAlerts, which forwards alerts to
// notifierManager. The three durations are converted from their config
// representation with time.Duration.
ruleManager = rules.NewManager(&rules.ManagerOptions{
	Appendable:      fanoutStorage,
	Queryable:       localStorage,
	QueryFunc:       rules.EngineQueryFunc(queryEngine, fanoutStorage),
	NotifyFunc:      sendAlerts(notifierManager, cfg.web.ExternalURL.String()),
	Context:         ctxRule,
	ExternalURL:     cfg.web.ExternalURL,
	Registerer:      prometheus.DefaultRegisterer,
	Logger:          log.With(logger, "component", "rule manager"),
	OutageTolerance: time.Duration(cfg.outageTolerance),
	ForGracePeriod:  time.Duration(cfg.forGracePeriod),
	ResendDelay:     time.Duration(cfg.resendDelay),
})
// sendAlerts implements the rules.NotifyFunc for a Notifier.
func sendAlerts(s sender, externalURL string) rules.NotifyFunc {
return func(ctx context.Context, expr string, alerts ...*rules.Alert) {
var res []*notifier.Alert
for _, alert := range alerts {
a := ¬ifier.Alert{
StartsAt: alert.FiredAt,
Labels: alert.Labels,
Annotations: alert.Annotations,
GeneratorURL: externalURL + strutil.TableLinkForExpression(expr),
}
if !alert.ResolvedAt.IsZero() {
a.EndsAt = alert.ResolvedAt
} else {
a.EndsAt = alert.ValidUntil
}
res = append(res, a)
}
if len(alerts) > 0 {
s.Send(res...)
}
}
}
通知数据结构
notifyManager
// Manager is responsible for dispatching alert notifications to an
// alert manager service.
type Manager struct {
// Alerts waiting to be sent. NOTE(review): presumably what nextBatch and
// queueLen operate on; those helpers are not visible here (confirm).
queue []*Alert
opts *Options
// Run adds the size of a batch to metrics.dropped when sendAll reports failure.
metrics *alertMetrics
// Wake-up signal for Run: a receive on more starts another send iteration.
more chan struct{}
mtx sync.RWMutex
// Run returns once ctx is done.
ctx context.Context
// NOTE(review): presumably cancels ctx (confirm).
cancel func()
// Alertmanager sets indexed by a string key.
alertmanagers map[string]*alertmanagerSet
logger log.Logger
}
这个结构体中要关注的是它将两个数据结构关联起来了:一个是notifier.Alert(queue字段),一个是alertmanagerSet(alertmanagers字段)。
// Alert is a generic representation of an alert in the Prometheus eco-system.
type Alert struct {
// Label value pairs for purpose of aggregation, matching, and disposition
// dispatching. This must minimally include an "alertname" label.
Labels labels.Labels `json:"labels"`
// Extra key/value information which does not define alert identity.
Annotations labels.Labels `json:"annotations"`
// The known time range for this alert. Both ends are optional.
// sendAlerts fills StartsAt from the rule alert's FiredAt, and EndsAt from
// ResolvedAt when that is set, otherwise from ValidUntil.
// NOTE(review): under the standard encoding/json rules, omitempty does not
// omit a zero time.Time; confirm whether custom marshalling is involved.
StartsAt time.Time `json:"startsAt,omitempty"`
EndsAt time.Time `json:"endsAt,omitempty"`
// sendAlerts sets this to the external URL followed by the table link for
// the rule's expression.
GeneratorURL string `json:"generatorURL,omitempty"`
}
// alertmanagerSet contains a set of Alertmanagers discovered via a group of service
// discovery definitions that have a common configuration on how alerts should be sent.
//
// NOTE(review): the methods of this type are not visible in this excerpt; the
// field notes below are read off names and types only.
type alertmanagerSet struct {
// The configuration shared by every Alertmanager in this set.
cfg *config.AlertmanagerConfig
// HTTP client; presumably used to deliver alerts to the members of ams (confirm).
client *http.Client
metrics *alertMetrics
// Read/write lock; presumably guards ams and droppedAms (confirm).
mtx sync.RWMutex
// Presumably the currently active Alertmanagers and the ones that were
// discovered but dropped, respectively (confirm).
ams []alertmanager
droppedAms []alertmanager
logger log.Logger
}
根据上面对rule结构体的分析,告警最后是调用notify这边的Send函数发送的。可以看到Send函数最后向管道里丢了一个消息,下面的Run循环收到后触发了sendAll(这名字起的。。。),sendAll里面就是填充HTTP请求,然后执行具体的发送。到此整个功能流程打通。
// Run dispatches notifications continuously.
//
// Each iteration blocks until one of three things happens: the manager's
// context is done (Run returns), a new set of target groups arrives on tsets
// (it is passed to reload), or a wake-up arrives on n.more. Except when
// returning, the iteration then takes the next batch of alerts and sends it.
// A batch for which sendAll reports failure is counted as dropped.
func (n *Manager) Run(tsets <-chan map[string][]*targetgroup.Group) {
	for {
		select {
		case <-n.ctx.Done():
			return
		case groups := <-tsets:
			n.reload(groups)
		case <-n.more:
		}

		batch := n.nextBatch()
		if sent := n.sendAll(batch...); !sent {
			n.metrics.dropped.Add(float64(len(batch)))
		}

		if n.queueLen() == 0 {
			continue
		}
		// Alerts are still queued: signal ourselves so the next iteration
		// starts without waiting for an outside trigger.
		n.setMore()
	}
}