1. main 函数中初始化 scrapeManager 实例
// Create the scrape manager. fanoutStorage, passed as its storage argument,
// is a proxy that reads from and writes to several underlying storages.
scrapeManager = scrape.NewManager(log.With(logger, "component", "scrape manager"), fanoutStorage)
fanoutStorage 是读写多个底层存储的代理,实现了 storage.Appendable 接口。
scrape.Manager 结构体
// Manager maintains a set of scrape pools and manages start/stop cycles
// when receiving new target groups from the discovery manager.
type Manager struct {
logger log.Logger
// append is the storage append interface handed to newly created scrape pools.
append storage.Appendable
// graceShut signals shutdown: Run and reloader return once a receive from it succeeds.
graceShut chan struct{}
jitterSeed uint64 // Global jitterSeed seed is used to spread scrape workload across HA setup.
mtxScrape sync.Mutex // Guards the fields below.
// scrapeConfigs holds the scrape configuration of every job, keyed by job name.
scrapeConfigs map[string]*config.ScrapeConfig
// scrapePools holds the scrape pools, keyed by the same names as targetSets.
scrapePools map[string]*scrapePool
// targetSets holds the targets to scrape, as last received by Run.
targetSets map[string][]*targetgroup.Group
// triggerReload carries the signal telling reloader that targetSets was updated.
triggerReload chan struct{}
}
2. main 函数中初始化 scraper,其包裹 1 中初始化的 scrape.Manager
具体 scraper 做什么用还没看到
// Store the scrape manager in the scraper wrapper.
scraper.Set(scrapeManager)
scraper 是 readyScrapeManager 实例的指针
// scraper starts out as an empty readyScrapeManager; the wrapped manager is
// supplied afterwards through Set.
scraper = &readyScrapeManager{}
scraper 包裹 *scrape.Manager
// readyScrapeManager allows a scrape manager to be retrieved, even if it is
// set at a later point in time.
type readyScrapeManager struct {
// mtx is locked by Set while m is written.
mtx sync.RWMutex
// m is the wrapped scrape manager; it is assigned by Set.
m *scrape.Manager
}
// Set the scrape manager.
func (rm *readyScrapeManager) Set(m *scrape.Manager) {
rm.mtx.Lock()
defer rm.mtx.Unlock()
rm.m = m
3. 在 reloaders 中添加 scrapeManager.ApplyConfig 函数
// reloaders lists the components whose configuration is applied by
// reloadConfig. Only the scrape manager entry is shown in this excerpt.
reloaders := []reloader{
	{
		// The Scrape and notifier managers need to reload before the Discovery manager as
		// they need to read the most updated config when receiving the new targets list.
		name:     "scrape",
		reloader: scrapeManager.ApplyConfig,
	},
}
如果有一个 job 的配置加载失败就返回错误。
// ApplyConfig replaces the manager's per-job scrape configurations with the
// ones defined in cfg and brings the existing scrape pools in line with them:
// pools whose job no longer exists are stopped and removed, pools whose
// configuration changed are reloaded.
//
// It returns an error if the jitter seed cannot be set or if reloading any
// pool failed.
func (m *Manager) ApplyConfig(cfg *config.Config) error {
	m.mtxScrape.Lock()
	defer m.mtxScrape.Unlock()

	// Index the new scrape configurations by job name.
	byJob := make(map[string]*config.ScrapeConfig)
	for _, sc := range cfg.ScrapeConfigs {
		byJob[sc.JobName] = sc
	}
	m.scrapeConfigs = byJob

	if err := m.setJitterSeed(cfg.GlobalConfig.ExternalLabels); err != nil {
		return err
	}

	// Clean up pools without a configuration and reload the ones that changed.
	reloadFailed := false
	for job, pool := range m.scrapePools {
		newCfg, found := m.scrapeConfigs[job]
		switch {
		case !found:
			// The job is absent from the new configuration.
			pool.stop()
			delete(m.scrapePools, job)
		case !reflect.DeepEqual(pool.config, newCfg):
			// Same job name, different settings.
			if err := pool.reload(newCfg); err != nil {
				level.Error(m.logger).Log("msg", "error reloading scrape pool", "err", err, "scrape_pool", job)
				reloadFailed = true
			}
		}
	}

	if reloadFailed {
		return errors.New("failed to apply the new configuration")
	}
	return nil
}
每次 reload 配置文件的时候都会重新加载 scrape 的配置,config/config.go 中的 ScrapeConfig 结构体
// ScrapeConfig configures a scraping unit for Prometheus.
// The YAML key of each field is given by its yaml struct tag.
type ScrapeConfig struct {
// The job name to which the job label is set by default.
JobName string `yaml:"job_name"`
// Indicator whether the scraped metrics should remain unmodified.
HonorLabels bool `yaml:"honor_labels,omitempty"`
// Indicator whether the scraped timestamps should be respected.
HonorTimestamps bool `yaml:"honor_timestamps"`
// A set of query parameters with which the target is scraped.
Params url.Values `yaml:"params,omitempty"`
// How frequently to scrape the targets of this scrape config.
ScrapeInterval model.Duration `yaml:"scrape_interval,omitempty"`
// The timeout for scraping targets of this config.
ScrapeTimeout model.Duration `yaml:"scrape_timeout,omitempty"`
// The HTTP resource path on which to fetch metrics from targets.
MetricsPath string `yaml:"metrics_path,omitempty"`
// The URL scheme with which to fetch metrics from targets.
Scheme string `yaml:"scheme,omitempty"`
// More than this many samples post metric-relabeling will cause the scrape to fail.
SampleLimit uint `yaml:"sample_limit,omitempty"`
// More than this many targets after the target relabeling will cause the
// scrapes to fail.
TargetLimit uint `yaml:"target_limit,omitempty"`
// We cannot do proper Go type embedding below as the parser will then parse
// values arbitrarily into the overflow maps of further-down types.
// ServiceDiscoveryConfigs carries the "-" tag, HTTPClientConfig is tagged inline.
ServiceDiscoveryConfigs discovery.Configs `yaml:"-"`
HTTPClientConfig config.HTTPClientConfig `yaml:",inline"`
// List of target relabel configurations.
RelabelConfigs []*relabel.Config `yaml:"relabel_configs,omitempty"`
// List of metric relabel configurations.
MetricRelabelConfigs []*relabel.Config `yaml:"metric_relabel_configs,omitempty"`
}
4. 在 run.Group 中加入 scrapeManager 的启动方法
{
// Scrape manager.
g.Add(
func() error {
// When the scrape manager receives a new targets list
// it needs to read a valid config for each job.
// It depends on the config being in sync with the discovery manager so
// we wait until the config is fully loaded.
// The receive below blocks until reloadReady.C is closed.
<-reloadReady.C
err := scrapeManager.Run(discoveryManagerScrape.SyncCh())
level.Info(logger).Log("msg", "Scrape manager stopped")
return err
},
func(err error) {
// Scrape manager needs to be stopped before closing the local TSDB
// so that it doesn't try to write samples to a closed storage.
level.Info(logger).Log("msg", "Stopping scrape manager...")
scrapeManager.Stop()
},
)
}
此处关注一下 reloadReady
// closeOnce bundles a channel with a sync.Once.
// sync.Once is used to make sure we can close the channel at different execution stages(SIGTERM or when the config is loaded).
type closeOnce struct {
// C is the channel that waiting goroutines receive from.
C chan struct{}
// once is the sync.Once available to guard the closing of C.
once sync.Once
// Close is the function callers invoke to close C; it is not set by the
// struct itself and has to be assigned by the creator.
Close func()
}
// Wait until the server is ready to handle reloading.
// Goroutines receiving from reloadReady.C stay blocked until C is closed.
reloadReady := &closeOnce{
C: make(chan struct{}),
}
// Close closes C inside once.Do, so the close statement runs at most once
// no matter how many times Close is called.
reloadReady.Close = func() {
reloadReady.once.Do(func() {
close(reloadReady.C)
})
}
在收到 sigterm 时关闭 reloadReady 的通道,保证阻塞等待它的协程可以继续执行
// Termination handler: the first function returns once a termination signal
// or request has been received, or once cancel is closed.
g.Add(
func() error {
// Don't forget to release the reloadReady channel so that waiting blocks can exit normally.
select {
case <-term:
level.Warn(logger).Log("msg", "Received SIGTERM, exiting gracefully...")
reloadReady.Close()
// NOTE(review): unlike the other two cases, the web quit case below does
// not call reloadReady.Close() itself — confirm waiters are released elsewhere.
case <-webHandler.Quit():
level.Warn(logger).Log("msg", "Received termination request via web service, exiting gracefully...")
case <-cancel:
reloadReady.Close()
}
return nil
},
func(err error) {
// Closing cancel unblocks the select above.
close(cancel)
},
)
// NOTE(review): presumably closes an enclosing block opened outside this excerpt.
}
重加载配置之前,要等待 reloadReady.C
// Reload handler. This excerpt is truncated: the second function argument and
// the closing parenthesis of g.Add are not shown.
g.Add(
func() error {
// Do not serve any reload request before reloadReady.C has been closed.
<-reloadReady.C
for {
select {
// Reload requested through the hup channel: a failure is only logged.
case <-hup:
if err := reloadConfig(cfg.configFile, logger, noStepSubqueryInterval, reloaders...); err != nil {
level.Error(logger).Log("msg", "Error reloading config", "err", err)
}
// Reload requested through the web handler: the outcome (the error, or nil
// on success) is sent back on rc.
case rc := <-webHandler.Reload():
if err := reloadConfig(cfg.configFile, logger, noStepSubqueryInterval, reloaders...); err != nil {
level.Error(logger).Log("msg", "Error reloading config", "err", err)
rc <- err
} else {
rc <- nil
}
// A receive from cancel ends the loop.
case <-cancel:
return nil
}
}
},
初始化加载配置完成之后关闭 reloadReady
{
// Initial configuration loading.
// cancel is closed by the interrupt function below to release the first function.
cancel := make(chan struct{})
g.Add(
func() error {
// Wait for dbOpen before loading the configuration.
select {
case <-dbOpen:
// In case a shutdown is initiated before the dbOpen is released
case <-cancel:
reloadReady.Close()
return nil
}
if err := reloadConfig(cfg.configFile, logger, noStepSubqueryInterval, reloaders...); err != nil {
return errors.Wrapf(err, "error loading config from %q", cfg.configFile)
}
// The configuration is loaded: release every goroutine blocked on reloadReady.C.
reloadReady.Close()
webHandler.Ready()
level.Info(logger).Log("msg", "Server is ready to receive web requests.")
// Stay alive until the interrupt function closes cancel.
<-cancel
return nil
},
func(err error) {
close(cancel)
},
)
}
Rule manager 启动之前要等待 reloadReady
{
// Rule manager.
g.Add(
func() error {
// Start the rule manager only after reloadReady.C has been closed.
<-reloadReady.C
ruleManager.Run()
return nil
},
func(err error) {
ruleManager.Stop()
},
)
}
Notifier manager启动之前要等待 reloadReady
{
// Notifier.
// Calling notifier.Stop() before ruleManager.Stop() will cause a panic if the ruleManager isn't running,
// so keep this interrupt after the ruleManager.Stop().
g.Add(
func() error {
// When the notifier manager receives a new targets list
// it needs to read a valid config for each job.
// It depends on the config being in sync with the discovery manager
// so we wait until the config is fully loaded.
<-reloadReady.C
notifierManager.Run(discoveryManagerNotify.SyncCh())
level.Info(logger).Log("msg", "Notifier manager stopped")
return nil
},
func(err error) {
notifierManager.Stop()
},
)
}
// Run every function added to g; if g.Run returns an error, log it and exit
// with status 1.
if err := g.Run(); err != nil {
level.Error(logger).Log("err", err)
os.Exit(1)
}
level.Info(logger).Log("msg", "See you next time!")
5. scrape.Manager.Run()
参数是元素类型为 map[string][]*targetgroup.Group 的 channel,map 的键是 job name。Run() 先在新的 goroutine 中启动 Manager.reloader() 方法,然后进入死循环:每当从 channel 收到更新的抓取目标,就执行 Manager.updateTsets() 方法把它赋值给 Manager.targetSets,然后尝试(非阻塞地)向 Manager.triggerReload 发送信号。
// Run receives target set updates, stores them, and triggers a reload of the
// scraping loops. The reload itself runs in the background reloader goroutine
// so that receiving target updates is not blocked by it.
//
// Run returns nil once a receive from m.graceShut succeeds.
func (m *Manager) Run(tsets <-chan map[string][]*targetgroup.Group) error {
	go m.reloader()

	for {
		select {
		case <-m.graceShut:
			return nil

		case latest := <-tsets:
			m.updateTsets(latest)

			// Non-blocking notification: if the send cannot proceed right
			// now, the signal is dropped instead of stalling this loop.
			select {
			case m.triggerReload <- struct{}{}:
			default:
			}
		}
	}
}
// updateTsets replaces the stored target sets with tsets while holding
// m.mtxScrape.
func (m *Manager) updateTsets(tsets map[string][]*targetgroup.Group) {
	m.mtxScrape.Lock()
	defer m.mtxScrape.Unlock()

	m.targetSets = tsets
}
Manager.reloader() 方法每 5 秒被 ticker 唤醒一次,随后阻塞等待 Manager.triggerReload;收到信号就执行 Manager.reload() 方法,从 graceShut 收到信号则退出。
// reloader runs m.reload in response to signals on m.triggerReload, but only
// after a tick of a 5 second ticker, so reloads start at most once per tick.
// It returns once a receive from m.graceShut succeeds.
func (m *Manager) reloader() {
	tick := time.NewTicker(5 * time.Second)
	defer tick.Stop()

	for {
		select {
		case <-tick.C:
			// A tick elapsed: now wait for a reload request or for shutdown.
			select {
			case <-m.graceShut:
				return
			case <-m.triggerReload:
				m.reload()
			}

		case <-m.graceShut:
			return
		}
	}
}
Manager.reload() 方法
// reload walks over the stored target sets, creates a scrape pool for every
// job that does not have one yet, and runs Sync on each pool with the job's
// target groups. The Sync calls run in parallel; reload returns after all of
// them have finished.
func (m *Manager) reload() {
	var pending sync.WaitGroup

	m.mtxScrape.Lock()
	for job, tgroups := range m.targetSets {
		if _, exists := m.scrapePools[job]; !exists {
			// The job needs a scrape configuration before a pool can be
			// created for it; without one the job is skipped.
			jobCfg, known := m.scrapeConfigs[job]
			if !known {
				level.Error(m.logger).Log("msg", "error reloading target set", "err", "invalid config id:"+job)
				continue
			}

			created, err := newScrapePool(jobCfg, m.append, m.jitterSeed, log.With(m.logger, "scrape_pool", job))
			if err != nil {
				level.Error(m.logger).Log("msg", "error creating new scrape pool", "err", err, "scrape_pool", job)
				continue
			}
			m.scrapePools[job] = created
		}

		pending.Add(1)
		// Run the sync in parallel as these take a while and at high load can't catch up.
		// The pool is looked up in the map so that both pre-existing and
		// freshly created pools are covered.
		go func(sp *scrapePool, groups []*targetgroup.Group) {
			sp.Sync(groups)
			pending.Done()
		}(m.scrapePools[job], tgroups)
	}
	m.mtxScrape.Unlock()

	pending.Wait()
}
6. scrapePool 相关定义
scrapePool 结构体主要包含存储(storage.Appendable)、抓取配置、http client 和 targets,定义在 scrape/scrape.go。
// scrapePool manages scrapes for sets of targets.
type scrapePool struct {
// appendable is the storage that scraped samples are appended to.
appendable storage.Appendable
logger log.Logger
// cancel is the CancelFunc of the context handed to the pool's scrape loops.
cancel context.CancelFunc
// mtx must not be taken after targetMtx.
mtx sync.Mutex
config *config.ScrapeConfig
client *http.Client
// loops holds the scrape loops, keyed by target hash.
loops map[uint64]loop
targetLimitHit bool // Internal state to speed up the target_limit checks.
targetMtx sync.Mutex
// activeTargets and loops must always be synchronized to have the same
// set of hashes.
activeTargets map[uint64]*Target
droppedTargets []*Target
// Constructor for new scrape loops. This is settable for testing convenience.
newLoop func(scrapeLoopOptions) loop
}
工厂函数
// newScrapePool builds the scrape pool for one scrape configuration.
//
// It increments the targetScrapePools counter, creates the HTTP client
// described by cfg.HTTPClientConfig, and installs a newLoop constructor that
// wires every scrape loop to the pool's context, a shared buffer pool, the
// label mutators and an appender on app. A nil logger is replaced by a no-op
// logger. If the HTTP client cannot be created, targetScrapePoolsFailed is
// incremented and a wrapped error is returned.
func newScrapePool(cfg *config.ScrapeConfig, app storage.Appendable, jitterSeed uint64, logger log.Logger) (*scrapePool, error) {
	targetScrapePools.Inc()
	if logger == nil {
		logger = log.NewNopLogger()
	}

	httpClient, err := config_util.NewClientFromConfig(cfg.HTTPClientConfig, cfg.JobName, false, false)
	if err != nil {
		targetScrapePoolsFailed.Inc()
		return nil, errors.Wrap(err, "error creating HTTP client")
	}

	// Byte buffers shared by all loops of this pool.
	buffers := pool.New(1e3, 100e6, 3, func(sz int) interface{} { return make([]byte, 0, sz) })

	ctx, cancel := context.WithCancel(context.Background())
	sp := &scrapePool{
		appendable:    app,
		logger:        logger,
		cancel:        cancel,
		config:        cfg,
		client:        httpClient,
		loops:         map[uint64]loop{},
		activeTargets: map[uint64]*Target{},
	}

	sp.newLoop = func(opts scrapeLoopOptions) loop {
		// Update the targets retrieval function for metadata to a new scrape cache.
		cache := opts.cache
		if cache == nil {
			cache = newScrapeCache()
		}
		opts.target.SetMetadataStore(cache)

		sampleMutator := func(l labels.Labels) labels.Labels {
			return mutateSampleLabels(l, opts.target, opts.honorLabels, opts.mrc)
		}
		reportSampleMutator := func(l labels.Labels) labels.Labels {
			return mutateReportSampleLabels(l, opts.target)
		}
		newAppender := func(ctx context.Context) storage.Appender {
			return appender(app.Appender(ctx), opts.limit)
		}

		return newScrapeLoop(
			ctx,
			opts.scraper,
			log.With(logger, "target", opts.target),
			buffers,
			sampleMutator,
			reportSampleMutator,
			newAppender,
			cache,
			jitterSeed,
			opts.honorTimestamps,
		)
	}

	return sp, nil
}
其中 scrapeCache 跟踪暴露的指标字符串到标签集和存储引用(storage reference)的映射,此外它还跟踪相邻两次抓取之间时序的过期(staleness)情况。
// scrapeCache tracks mappings of exposed metric strings to label sets and
// storage references. Additionally, it tracks staleness of series between
// scrapes.
type scrapeCache struct {
iter uint64 // Current scrape iteration.
// How many series and metadata entries there were at the last success.
successfulCount int
// Parsed string to an entry with information about the actual label set
// and its storage reference.
series map[string]*cacheEntry
// Cache of dropped metric strings and their iteration. The iteration must
// be a pointer so we can update it without setting a new entry with an unsafe
// string in addDropped().
droppedSeries map[string]*uint64
// seriesCur and seriesPrev store the labels of series that were seen
// in the current and previous scrape.
// We hold two maps and swap them out to save allocations.
seriesCur map[uint64]labels.Labels
seriesPrev map[uint64]labels.Labels
// NOTE(review): metaMtx presumably guards metadata — confirm against the methods using it.
metaMtx sync.Mutex
metadata map[string]*metaEntry
}
7. 在 Manager.reload() 里针对每个 job 执行 sp.Sync(groups)
将 target group 转换为实际的抓取目标,并让当前运行的抓取循环与得到的目标集合保持同步。该方法本身没有返回值,被丢弃的目标记录在 sp.droppedTargets 中。
// Sync converts target groups into actual scrape targets and synchronizes
// the currently running scrape loops with the resulting set.
//
// Targets that carry labels are passed on to sync; targets without labels but
// with discovered labels are recorded in sp.droppedTargets. Groups for which
// target creation fails are logged and skipped. The duration of the whole
// call and the number of syncs are recorded per job.
func (sp *scrapePool) Sync(tgs []*targetgroup.Group) {
	sp.mtx.Lock()
	defer sp.mtx.Unlock()

	begin := time.Now()

	sp.targetMtx.Lock()
	var kept []*Target
	sp.droppedTargets = []*Target{}
	for _, group := range tgs {
		// Build the targets of this group from the group and the pool's config.
		fromGroup, err := targetsFromGroup(group, sp.config)
		if err != nil {
			level.Error(sp.logger).Log("msg", "creating targets failed", "err", err)
			continue
		}

		for _, target := range fromGroup {
			switch {
			case target.Labels().Len() > 0:
				kept = append(kept, target)
			case target.DiscoveredLabels().Len() > 0:
				// No labels left although labels were discovered: the
				// target counts as dropped.
				sp.droppedTargets = append(sp.droppedTargets, target)
			}
		}
	}
	sp.targetMtx.Unlock()

	sp.sync(kept)

	targetSyncIntervalLength.WithLabelValues(sp.config.JobName).Observe(
		time.Since(begin).Seconds(),
	)
	targetScrapePoolSyncsCounter.WithLabelValues(sp.config.JobName).Inc()
}
8. scrapePool.sync(targets []*Target)方法
参数是所有抓取目标。方法对目标去重,对于要抓取的目标,启动抓取循环,对于要丢弃的目标,停止其抓取循环。在全部要停止的循环终止以后返回。
// sync takes a list of potentially duplicated targets, deduplicates them, starts
// scrape loops for new targets, and stops scrape loops for disappeared targets.
// It returns after all stopped scrape loops terminated.
func (sp *scrapePool) sync(targets []*Target) {
var (
// uniqueLoops records every target hash seen in this call. The value is the
// loop to start for a new target, or nil for a target that was already active.
uniqueLoops = make(map[uint64]loop)
interval = time.Duration(sp.config.ScrapeInterval)
timeout = time.Duration(sp.config.ScrapeTimeout)
limit = int(sp.config.SampleLimit)
honorLabels = sp.config.HonorLabels
honorTimestamps = sp.config.HonorTimestamps
mrc = sp.config.MetricRelabelConfigs
)
sp.targetMtx.Lock()
for _, t := range targets {
hash := t.hash()
// A target that is not active yet gets a scraper and a loop, and is
// registered in activeTargets and loops.
if _, ok := sp.activeTargets[hash]; !ok {
s := &targetScraper{Target: t, client: sp.client, timeout: timeout}
l := sp.newLoop(scrapeLoopOptions{
target: t,
scraper: s,
limit: limit,
honorLabels: honorLabels,
honorTimestamps: honorTimestamps,
mrc: mrc,
})
sp.activeTargets[hash] = t
sp.loops[hash] = l
uniqueLoops[hash] = l
} else {
// This might be a duplicated target.
// The nil entry marks the hash as seen: it keeps the target from being
// stopped below and is skipped when the new loops are started.
if _, ok := uniqueLoops[hash]; !ok {
uniqueLoops[hash] = nil
}
// Need to keep the most updated labels information
// for displaying it in the Service Discovery web page.
sp.activeTargets[hash].SetDiscoveredLabels(t.DiscoveredLabels())
}
}
var wg sync.WaitGroup
// Stop and remove old targets and scraper loops.
// Each loop is stopped in its own goroutine; wg tracks their completion.
for hash := range sp.activeTargets {
if _, ok := uniqueLoops[hash]; !ok {
wg.Add(1)
go func(l loop) {
l.stop()
wg.Done()
}(sp.loops[hash])
delete(sp.loops, hash)
delete(sp.activeTargets, hash)
}
}
sp.targetMtx.Unlock()
targetScrapePoolTargetsAdded.WithLabelValues(sp.config.JobName).Set(float64(len(uniqueLoops)))
// Propagate the current target-limit error (as returned by
// refreshTargetLimitErr) to every remaining loop.
forcedErr := sp.refreshTargetLimitErr()
for _, l := range sp.loops {
l.setForcedError(forcedErr)
}
// Start the loops of the newly added targets, each in its own goroutine.
// nil entries belong to targets that were already active.
for _, l := range uniqueLoops {
if l != nil {
go l.run(interval, timeout, nil)
}
}
// Wait for all potentially stopped scrapers to terminate.
// This covers the case of flapping targets. If the server is under high load, a new scraper
// may be active and tries to insert. The old scraper that didn't terminate yet could still
// be inserting a previous sample set.
wg.Wait()
}
习得
- Manager 的 reloader() 作为重加载器,最多每 5 秒执行一次 reload(),并且只在收到 triggerReload 信号时才执行。
- 在 reload() 中,每个 targetgroup(job) 组创建一个 ScrapePool,启动一个 goroutine 异步执行 Sync() 并等待完成。
- 在 scrapePool.Sync() 中将 targetgroup 转换为具体的 target,作为参数调用 scrapePool.sync()。
- scrapePool.sync() 并发停止旧的抓取循环,并为每个新目标启动一个 goroutine 运行抓取循环;它只等待被停止的循环终止,不等待新启动的循环。
- 此时完成一轮 Manager.Run() 循环,每次更新抓取目标就执行一轮循环。
- 每一轮 Manager.Run() 都是一个闭环,严格管理每个 goroutine 的生命周期,会对不再抓取的目标停止抓取,对新的目标开启抓取。