初始化
fanoutStorage = storage.NewFanout(logger, localStorage, remoteStorage)
targetManager = retrieval.NewTargetManager(fanoutStorage, log.With(logger, "component", "target manager"))
go targetManager.Run()
reloadConfig(filename string, logger log.Logger, rls ...Reloadable)
targetManager实现了Reloadable,在首次reloadConfig时会调用ApplyConfig方法:
// ApplyConfig updates the target manager's scrape configuration and, if
// the manager has already been started (ctx is non-nil), triggers a
// reload so the new configuration takes effect immediately.
// It implements the Reloadable interface used by reloadConfig.
func (tm *TargetManager) ApplyConfig(cfg *config.Config) error {
	tm.scrapeConfigs = cfg.ScrapeConfigs
	if tm.ctx != nil {
		tm.reload()
	}
	// NOTE(review): the excerpt declared an error return but contained no
	// return statement (it would not compile); applying the config cannot
	// fail here, so report success explicitly.
	return nil
}
接下来就是retrieval/targetmanager.go的reload()方法:
初始化targetSet,一个job对应一个targetSet:
ts, ok := tm.targetSets[scfg.JobName]
if !ok {
ctx, cancel := context.WithCancel(tm.ctx)
ts = &targetSet{
ctx: ctx,
cancel: cancel,
sp: newScrapePool(ctx, scfg, tm.append, log.With(tm.logger, "scrape_pool", scfg.JobName)),
}
ts.ts = discovery.NewTargetSet(ts.sp)
tm.targetSets[scfg.JobName] = ts
}
根据job配置,初始化TargetProvider,笔者测试使用的file方式:
配置:
scrape_configs:
- job_name: 'prometheus'
file_sd_configs:
- files:
- /home/xxxxx/Workspace/go/src/github.com/prometheus/prometheus/*.json
reload()方法最后一行:discovery.ProvidersFromConfig(scfg.ServiceDiscoveryConfig, tm.logger)就是初始化TargetProvider
ts.ts.UpdateProviders(TargetProvider)将刚初始化的TargetProvider写入到ts.providerCh。
reload()方法中go线程调用了discovery/discovery.go的Run方法
retrieval/targetmanager.go启动线程的地方:
go func(ts *targetSet) {
// Run target set, which blocks until its context is canceled.
// Gracefully shut down pending scrapes in the scrape pool afterwards.
ts.ts.Run(ctx)
ts.sp.stop()
tm.wg.Done()
}(ts)
discovery/discovery.go的Run方法中会接收写入到ts.providerCh的值,并调用discovery/discovery.go的updateProviders方法
启动TargetProvider
discovery/discovery.go
for name, prov := range providers {
go prov.Run(ctx, updates)
}
笔者的测试环境会调用到discovery/file/file.go的Run方法
监控目标管理
本案例中监控目标管理的大部分逻辑都在discovery/file/file.go文件中。
启动文件监听
定时监听文件变化,有变化时重新加载相关的target配置
watcher, err := fsnotify.NewWatcher()
读取target配置文件
d.refresh(ctx, ch)
// refresh re-reads every configured target file and forwards the parsed
// target groups to the discovery channel. Unreadable or unparsable files
// are skipped; the next refresh cycle will retry them.
func (d *Discovery) refresh(ctx context.Context, ch chan<- []*config.TargetGroup) {
	for _, p := range d.listFiles() {
		tgroups, err := readFile(p)
		if err != nil {
			// NOTE(review): the excerpt dropped this error (which would
			// not even compile: err declared and not used); skip the bad
			// file instead of sending garbage downstream.
			continue
		}
		// Respect cancellation instead of blocking forever on a send
		// to a consumer that has already shut down.
		select {
		case ch <- tgroups:
		case <-ctx.Done():
			return
		}
	}
}
监控
现在我们又回到discovery/discovery.go的updateProviders方法末尾,向ts.syncCh写入数据
select {
case ts.syncCh <- struct{}{}:
default:
}
再回到discovery/discovery.go的Run方法
case <-ts.syncCh:
ts.sync()
// sync hands the complete current set of target groups over to the
// syncer (the scrape pool) in one batch.
func (ts *TargetSet) sync() {
	var groups []*config.TargetGroup
	for _, group := range ts.tgroups {
		groups = append(groups, group)
	}
	ts.syncer.Sync(groups)
}
而ts.syncer是在retrieval/targetmanager.go中初始化的newScrapePool(ctx, scfg, tm.append, log.With(tm.logger, "scrape_pool", scfg.JobName))
接下来就是retrieval/scrape.go的Sync()方法
取出当前所有的target
// Sync converts the given target groups into concrete targets and
// reconciles the scrape pool's running loops with that set.
func (sp *scrapePool) Sync(tgs []*config.TargetGroup) {
	var all []*Target
	for _, tg := range tgs {
		targets, err := targetsFromGroup(tg, sp.config)
		if err != nil {
			// NOTE(review): the excerpt ignored this error (err declared
			// and not used does not compile); skip groups whose labels
			// cannot be resolved into valid targets so the rest still sync.
			continue
		}
		all = append(all, targets...)
	}
	sp.sync(all)
}
retrieval/scrape.go的sync()方法
该方法循环targets,对每一个target生成相应的scrape loop,并用go 启动线程进行抓取
// sync reconciles the pool's scrape loops with the desired target set:
// for each target a scrape loop is created via the newLoop factory and
// started in its own goroutine.
// (Excerpt: the scraper `s` and the interval/timeout values are derived
// from the target and config in lines elided from this quote.)
func (sp *scrapePool) sync(targets []*Target) {
	for _, t := range targets {
		l := sp.newLoop(t, s)
		go l.run(interval, timeout, nil)
	}
}
而newLoop对应的方法是在newScrapePool时初始化的
// newLoop is the factory used by sync() to build a scrape loop for one
// target; it wires in the pool context, a per-target logger, the label
// mutation hooks and the storage appender.
// (Excerpt: this assignment lives inside newScrapePool.)
sp.newLoop = func(t *Target, s scraper) loop {
	return newScrapeLoop(sp.ctx, s,
		log.With(logger, "target", t),
		buffers,
		// relabeling applied to scraped samples
		func(l labels.Labels) labels.Labels { return sp.mutateSampleLabels(l, t) },
		// relabeling applied to synthetic report samples (up, duration, ...)
		func(l labels.Labels) labels.Labels { return sp.mutateReportSampleLabels(l, t) },
		sp.appender,
	)
}
scrape loop的run()会调到该方法
// run drives the periodic scrape cycle for one target; on each tick it
// invokes the underlying scraper to fetch the metrics payload.
// (Excerpt: ticker setup, buffer management and error handling are
// elided; scrapeCtx and buf come from the omitted lines.)
func (sl *scrapeLoop) run(interval, timeout time.Duration, errc chan<- error) {
	scrapeErr := sl.scraper.scrape(scrapeCtx, buf)
}
最终,通过scraper.scrape()进行真正的metric接口调用
可以看下scraper的接口定义
// scraper abstracts one round of metric retrieval from a target.
type scraper interface {
	// scrape fetches the target's metrics and writes the raw payload to w.
	scrape(ctx context.Context, w io.Writer) error
	// report records the outcome (start time, duration, error) of a scrape.
	report(start time.Time, dur time.Duration, err error)
	// offset returns the initial delay used to spread scrapes over the interval.
	offset(interval time.Duration) time.Duration
}
其中scrape()方法
// scrape performs the HTTP GET against the target's metrics URL and
// copies the (gzip-decompressed) response body into w.
// (Excerpt: request caching, header setup, all error checks and the
// return statement are elided; s.req and s.gzipr are populated in the
// omitted lines.)
func (s *targetScraper) scrape(ctx context.Context, w io.Writer) error {
	req, err := http.NewRequest("GET", s.URL().String(), nil)
	resp, err := ctxhttp.Do(ctx, s.client, s.req)
	_, err = io.Copy(w, s.gzipr)
}
入库
数据抓取到之后,就会调用append方法。这里完成数据解析
// append parses the scraped payload and writes the resulting samples to
// storage, returning how many samples were seen (total) and stored (added).
// (Excerpt: the parsing loop and storage writes are elided.)
func (sl *scrapeLoop) append(b []byte, ts time.Time) (total, added int, err error) {
	p = textparse.New(b) // parse the response body; textparse handles the Prometheus text exposition format (not protobuf — TODO confirm against the quoted version)
}
更新target health状态
入库之后会调用report方法,更新部分metric的数据;同时,也会调用Target的report方法,更新本次抓取时间、状态等数据。
更新统计数据:
if err := sl.addReportSample(app, scrapeHealthMetricName, ts, health); err != nil {
}
if err := sl.addReportSample(app, scrapeDurationMetricName, ts, duration.Seconds()); err != nil {
}
if err := sl.addReportSample(app, scrapeSamplesMetricName, ts, float64(scraped)); err != nil {
}
if err := sl.addReportSample(app, samplesPostRelabelMetricName, ts, float64(appended)); err != nil {
}
更新target状态:
// report records the outcome of the most recent scrape attempt on the
// target: its health, the error (nil on success) and the start time.
func (t *Target) report(start time.Time, dur time.Duration, err error) {
	health := HealthBad
	if err == nil {
		health = HealthGood
	}
	t.health = health
	t.lastError = err
	t.lastScrape = start
}
至此,prometheus的target管理大部分流程已经走通,但发现该过程大量用到chan\select\go,个人觉得过于复杂,代码的可读性不高。由于接触golang时间尚短,不清楚是只有prometheus这样实现,还是所有基于go实现的系统代码都是这样的。