笔者使用的alertmanager版本为0.9.1,
软件地址:https://github.com/prometheus/alertmanager
初始化
在命令行参数的解析上,alertmanager(下文简称 am)与 prometheus 不同:am 直接使用 Go 标准库的 flag 包来获取。
// configFile is the path of the Alertmanager configuration file
// (flag -config.file, default "alertmanager.yml").
configFile = flag.String("config.file", "alertmanager.yml", "Alertmanager configuration file name.")
// dataDir is the base directory for on-disk data
// (flag -storage.path, default "data/").
dataDir = flag.String("storage.path", "data/", "Base path for data storage.")
由于目前am压力还小,没有使用HA方案,基于分布式通信协议gossip的mesh及相关的功能暂时不进行深入阅读。
nlog初始化
先看下nlog的定义nflog/nflog.go
// nlog is the concrete notification log type built by New. New sets
// defaults for logger, now and st; the remaining fields are left for the
// Option functions applied in New.
type nlog struct {
logger log.Logger
metrics *metrics
// now is the clock used by the log; New defaults it to utcNow.
now func() time.Time
// presumably how long entries are kept (see WithRetention) — TODO confirm
retention time.Duration
// presumably the maintenance interval (see WithMaintenance) — TODO confirm
runInterval time.Duration
// presumably the snapshot file path (see WithSnapshot) — TODO confirm
snapf string
// presumably the stop channel and completion callback handed to
// WithMaintenance — TODO confirm
stopc chan struct{}
done func()
// NOTE(review): mtx looks like the guard for st — confirm against the methods.
mtx sync.RWMutex
// st is initialised by New to an empty map[string]*pb.MeshEntry.
st gossipData
}
接下来是 nlog 的初始化。[]nflog.Option 中的每个元素(由 nflog.WithXxx 返回)都是一个以 *nlog 为入参、返回 error 的函数。
// notificationLogOpts collects the options used to configure the
// notification log: the retention flag value, a snapshot path of
// <dataDir>/nflog, maintenance arguments (15 minutes, the stop channel and
// wg.Done), the default Prometheus registerer for metrics, and a logger
// tagged with component=nflog.
notificationLogOpts := []nflog.Option{
nflog.WithRetention(*retention),
nflog.WithSnapshot(filepath.Join(*dataDir, "nflog")),
nflog.WithMaintenance(15*time.Minute, stopc, wg.Done),
nflog.WithMetrics(prometheus.DefaultRegisterer),
nflog.WithLogger(log.Base().With("component", "nflog")),
}
// Build the notification log by applying all options.
notificationLog, err := nflog.New(notificationLogOpts...)
nflog.New() 会依次执行传入的各个 nflog.Option,最终完成 nlog 的初始化(nflog/nflog.go):
// New builds a notification log from the given options.
//
// It starts from defaults (a no-op logger, the utcNow clock and an empty
// state map), then applies every option in order, aborting on the first
// option that reports an error. Once configured, the log's run loop is
// started in a background goroutine.
//
// It returns the configured log, or nil together with the error reported by
// the failing option.
func New(opts ...Option) (Log, error) {
	l := &nlog{
		logger: log.NewNopLogger(),
		now:    utcNow,
		st:     map[string]*pb.MeshEntry{},
	}
	for _, o := range opts {
		if err := o(l); err != nil {
			return nil, err
		}
	}
	go l.run()

	// A function with result parameters must end in a return statement;
	// on success hand back the initialised log.
	return l, nil
}
marker与silences
// The marker is shared by the alert store, the API, the inhibitor, the
// pipeline and the dispatcher constructed further down.
marker := types.NewMarker()
// Silences use a snapshot file at <dataDir>/silences and the same retention
// flag value as the notification log.
silenceOpts := silence.Options{
SnapshotFile: filepath.Join(*dataDir, "silences"),
Retention: *retention,
Logger: log.Base().With("component", "silences"),
Metrics: prometheus.DefaultRegisterer,
}
// NOTE(review): err is not checked in this excerpt before silences is used.
silences, err := silence.New(silenceOpts)
// Run silence maintenance in the background; wg.Done is called once
// Maintenance returns.
go func() {
silences.Maintenance(15*time.Minute, filepath.Join(*dataDir, "silences"), stopc)
wg.Done()
}() // every 15 minutes the silences data is written to disk
// In-memory alert store; the second argument (30 minutes) is the GC interval.
alerts, err := mem.NewAlerts(marker, 30*time.Minute, *dataDir)
// NewAlerts returns an in-memory alert store that uses marker m and starts
// its garbage-collection loop in a background goroutine.
//
// intervalGC is stored on the store as the GC interval. path is accepted but
// not used in the code shown here. The returned error is always nil in this
// excerpt.
func NewAlerts(m types.Marker, intervalGC time.Duration, path string) (*Alerts, error) {
a := &Alerts{
alerts: map[model.Fingerprint]*types.Alert{},
marker: m,
intervalGC: intervalGC,
stopGC: make(chan struct{}),
listeners: map[int]chan *types.Alert{},
next: 0,
}
go a.runGC() // periodically (every 30 minutes as called by main) deletes alerts that are already resolved
return a, nil
}
api
// These are declared up front and only assigned later by reload, so that
// closures created before the first reload can refer to them.
var (
inhibitor *inhibit.Inhibitor
tmpl *template.Template
pipeline notify.Stage
disp *dispatch.Dispatcher
)
// Build the API on top of the alert store and silences. The closure reads
// disp at call time, so it always uses whichever dispatcher is currently
// assigned.
apiv := api.New(
alerts,
silences,
func(matchers []*labels.Matcher) dispatch.AlertOverview {
return disp.Groups(matchers)
},
marker.Status,
mrouter,
)
apiv.SetMarker(marker)
apiv.Register(router.WithPrefix("/api")) // prefix the API's URIs with /api
配置
定义 reload 闭包:读取配置文件,并据此更新 api、重建并启动 inhibitor 与 dispatcher;定义之后立即进行首次调用。
// reload loads the configuration file and rebuilds everything derived from
// it: it pushes the config and resolve timeout to the API, then creates a
// new inhibitor, notification pipeline and dispatcher, and starts the
// dispatcher and inhibitor in their own goroutines.
//
// It returns the first error encountered. If loading the file fails, nothing
// has been replaced yet and the currently assigned components stay in place.
//
// NOTE(review): the components being replaced are not stopped here, and tmpl
// is never assigned in this excerpt — confirm against the upstream source
// whether those steps were elided from the quotation.
reload := func() (err error) {
	// The second return value is not needed by the code shown here.
	conf, _, err := config.LoadFile(*configFile)
	if err != nil {
		// conf is not usable on error; bail out before dereferencing it.
		return err
	}

	if err = apiv.Update(conf, time.Duration(conf.Global.ResolveTimeout)); err != nil {
		return err
	}

	inhibitor = inhibit.NewInhibitor(alerts, conf.InhibitRules, marker)
	pipeline = notify.BuildPipeline(
		conf.Receivers,
		tmpl,
		waitFunc,
		inhibitor,
		silences,
		notificationLog,
		marker,
	)
	disp = dispatch.NewDispatcher(alerts, dispatch.NewRoute(conf.Route, nil), pipeline, marker, timeoutFunc)

	go disp.Run()
	go inhibitor.Run()

	return nil
}

// Initial load: exit with a non-zero status if the first reload fails.
if err := reload(); err != nil {
	os.Exit(1)
}
更新配置
定义 webReload channel,利用 channel 的阻塞特性等待触发:当调用 reload 的 HTTP 接口,或者向进程发送 SIGHUP 信号时,执行 reload() 操作。
// webReload is handed to the UI so that its reload endpoint can request a
// configuration reload.
webReload := make(chan struct{})
ui.Register(router, webReload)

var (
	// signal.Notify does not block when delivering a signal: if nobody is
	// receiving on an unbuffered channel at that moment (for example while
	// reload() is running), the signal is dropped. A buffer of one is
	// sufficient for channels used purely as a notification.
	hup      = make(chan os.Signal, 1)
	hupReady = make(chan bool)
	term     = make(chan os.Signal, 1)
)
signal.Notify(hup, syscall.SIGHUP)
signal.Notify(term, os.Interrupt, syscall.SIGTERM)

// Reload loop: runs for the lifetime of the process.
go func() {
	// Do not serve reload requests until hupReady is signalled.
	// NOTE(review): nothing in this excerpt sends on or closes hupReady —
	// confirm where the upstream code does so.
	<-hupReady
	for {
		// Block until either trigger fires; both lead to the same action.
		select {
		case <-hup: // SIGHUP received
		case <-webReload: // the reload HTTP endpoint writes to this channel
		}
		// The returned error is not acted upon here; the loop keeps
		// serving further reload requests either way.
		reload()
	}
}()
通过 HTTP 接口触发 reload 的代码
ui/web.go
// POST /-/reload: acknowledge the request, then ask for a configuration reload.
r.Post("/-/reload", func(w http.ResponseWriter, req *http.Request) {
// The acknowledgement is written before the reload is requested; the
// result of Write is not checked.
w.Write([]byte("Reloading configuration file..."))
// presumably reloadCh is the webReload channel passed to ui.Register; if it
// is unbuffered, this send blocks until the reload loop receives — confirm.
reloadCh <- struct{}{}
})