Prometheus源码学习(8) scrape总体流程

最新推荐文章于 2024-05-18 00:38:36 发布

疯琴

最新推荐文章于 2024-05-18 00:38:36 发布

阅读量1.3k

点赞数

分类专栏： prometheus go 监控文章标签： prometheus go 源码

本文链接：https://blog.csdn.net/qq_35753140/article/details/117148565

版权

go 同时被 3 个专栏收录

27 篇文章 1 订阅

订阅专栏

prometheus

18 篇文章 3 订阅

订阅专栏

监控

13 篇文章 1 订阅

订阅专栏

在这里插入图片描述

1. main 函数中初始化 scrapeManager 实例

// Initialise scrapeManager. Per the article, fanoutStorage is a proxy that
// reads from and writes to several underlying storages.
scrapeManager = scrape.NewManager(log.With(logger, "component", "scrape manager"), fanoutStorage)

fanoutStorage 是读写多个底层存储的代理，实现了 storage.Appendable 接口。

scrape.Manager 结构体

// Manager maintains a set of scrape pools and manages start/stop cycles
// when receiving new target groups from the discovery manager.
type Manager struct {
	logger    log.Logger
	// Storage append interface handed to every scrape pool this manager creates.
	append    storage.Appendable
	// Signals Run and reloader to return when it becomes readable.
	graceShut chan struct{}

	jitterSeed    uint64     // Global jitterSeed seed is used to spread scrape workload across HA setup.
	mtxScrape     sync.Mutex // Guards the fields below.
	// Scrape configurations keyed by job name.
	scrapeConfigs map[string]*config.ScrapeConfig
	// Scrape pools keyed by the same name as scrapeConfigs / targetSets.
	scrapePools   map[string]*scrapePool
	// Latest target groups to scrape, as received by Run.
	targetSets    map[string][]*targetgroup.Group

	// Receives a signal from Run whenever the target sets were updated.
	triggerReload chan struct{}
}

2. main 函数中初始化 scraper，其包裹 1 中初始化的 scrape.Manager

具体 scraper 做什么用还没看到

// 设置 scraper 的 scrapeManager 字段值
scraper.Set(scrapeManager)

scraper 是 readyScrapeManager 实例的指针

scraper       = &readyScrapeManager{}

scraper 包裹 *scrape.Manager

// readyScrapeManager allows a scrape manager to be retrieved, even if it is
// set at a later point in time.
type readyScrapeManager struct {
	// mtx guards m.
	mtx sync.RWMutex
	// m is the wrapped scrape manager; nil until Set has been called.
	m   *scrape.Manager
}

// Set the scrape manager.
func (rm *readyScrapeManager) Set(m *scrape.Manager) {
	rm.mtx.Lock()
	defer rm.mtx.Unlock()

	rm.m = m

3. 在 reloaders 中添加 scrapeManager.ApplyConfig 函数

// reloaders lists the functions invoked when the configuration is (re)loaded.
// Only the scrape entry is shown here.
reloaders := []reloader{
	{
		// The Scrape and notifier managers need to reload before the Discovery manager as
		// they need to read the most updated config when receiving the new targets list.
		name:     "scrape",
		reloader: scrapeManager.ApplyConfig,
	},
}

如果有一个 job 的配置加载失败就返回错误。

// ApplyConfig replaces the manager's per-job scrape configurations with the
// ones in cfg and brings existing scrape pools in line with them: pools whose
// job disappeared are stopped and removed, pools whose configuration changed
// are reloaded. It returns an error if the jitter seed cannot be set or if
// reloading any pool failed.
func (m *Manager) ApplyConfig(cfg *config.Config) error {
	m.mtxScrape.Lock()
	defer m.mtxScrape.Unlock()

	// Index the scrape configurations by job name.
	byJob := make(map[string]*config.ScrapeConfig)
	for _, sc := range cfg.ScrapeConfigs {
		byJob[sc.JobName] = sc
	}
	m.scrapeConfigs = byJob

	if err := m.setJitterSeed(cfg.GlobalConfig.ExternalLabels); err != nil {
		return err
	}

	// Clean up pools that are gone and reload those whose config changed.
	reloadFailed := false
	for name, pool := range m.scrapePools {
		newCfg, found := m.scrapeConfigs[name]
		if !found {
			// The job is absent from the new configuration: stop and drop its pool.
			pool.stop()
			delete(m.scrapePools, name)
			continue
		}
		if reflect.DeepEqual(pool.config, newCfg) {
			// Same job, unchanged configuration: nothing to do.
			continue
		}
		// Same job, different configuration: reload the pool. Keep going on
		// failure so the remaining pools are still processed.
		if err := pool.reload(newCfg); err != nil {
			level.Error(m.logger).Log("msg", "error reloading scrape pool", "err", err, "scrape_pool", name)
			reloadFailed = true
		}
	}

	if !reloadFailed {
		return nil
	}
	return errors.New("failed to apply the new configuration")
}

每次 reload 配置文件的时候都会重新加载 scrape 的配置，config/config.go 中的 ScrapeConfig 结构体

// ScrapeConfig configures a scraping unit for Prometheus.
type ScrapeConfig struct {
	// The job name to which the job label is set by default.
	JobName string `yaml:"job_name"`
	// Indicator whether the scraped metrics should remain unmodified.
	HonorLabels bool `yaml:"honor_labels,omitempty"`
	// Indicator whether the scraped timestamps should be respected.
	HonorTimestamps bool `yaml:"honor_timestamps"`
	// A set of query parameters with which the target is scraped.
	Params url.Values `yaml:"params,omitempty"`
	// How frequently to scrape the targets of this scrape config.
	ScrapeInterval model.Duration `yaml:"scrape_interval,omitempty"`
	// The timeout for scraping targets of this config.
	ScrapeTimeout model.Duration `yaml:"scrape_timeout,omitempty"`
	// The HTTP resource path on which to fetch metrics from targets.
	MetricsPath string `yaml:"metrics_path,omitempty"`
	// The URL scheme with which to fetch metrics from targets.
	Scheme string `yaml:"scheme,omitempty"`
	// More than this many samples post metric-relabeling will cause the scrape to fail.
	SampleLimit uint `yaml:"sample_limit,omitempty"`
	// More than this many targets after the target relabeling will cause the
	// scrapes to fail.
	TargetLimit uint `yaml:"target_limit,omitempty"`

	// We cannot do proper Go type embedding below as the parser will then parse
	// values arbitrarily into the overflow maps of further-down types.

	// Service discovery configurations; the "-" tag keeps this field out of
	// the struct's default YAML field mapping.
	ServiceDiscoveryConfigs discovery.Configs       `yaml:"-"`
	// HTTP client settings; the ",inline" tag flattens its keys into this
	// struct's YAML mapping.
	HTTPClientConfig        config.HTTPClientConfig `yaml:",inline"`

	// List of target relabel configurations.
	RelabelConfigs []*relabel.Config `yaml:"relabel_configs,omitempty"`
	// List of metric relabel configurations.
	MetricRelabelConfigs []*relabel.Config `yaml:"metric_relabel_configs,omitempty"`
}

4. 在 run.Group 中加入 scrapeManager 的启动方法

{
	// Scrape manager.
	g.Add(
		func() error {
			// When the scrape manager receives a new targets list
			// it needs to read a valid config for each job.
			// It depends on the config being in sync with the discovery manager so
			// we wait until the config is fully loaded.
			<-reloadReady.C

			// Feed the manager with target-set updates from the scrape
			// discovery manager's sync channel.
			err := scrapeManager.Run(discoveryManagerScrape.SyncCh())
			level.Info(logger).Log("msg", "Scrape manager stopped")
			return err
		},
		func(err error) {
			// Scrape manager needs to be stopped before closing the local TSDB
			// so that it doesn't try to write samples to a closed storage.
			level.Info(logger).Log("msg", "Stopping scrape manager...")
			scrapeManager.Stop()
		},
	)
}

此处关注一下 reloadReady

	// closeOnce pairs a channel with a sync.Once.
	// sync.Once is used to make sure we can close the channel at different execution stages(SIGTERM or when the config is loaded).
	type closeOnce struct {
		C     chan struct{} // Closed to release everything blocked on it.
		once  sync.Once     // Ensures C is closed at most once.
		Close func()        // Closes C; assigned below.
	}
	// Wait until the server is ready to handle reloading.
	reloadReady := &closeOnce{
		C: make(chan struct{}),
	}
	// Close may be called from several places; once.Do makes every call after
	// the first a no-op, so the channel is never closed twice.
	reloadReady.Close = func() {
		reloadReady.once.Do(func() {
			close(reloadReady.C)
		})
	}

在收到 sigterm 时关闭 reloadReady 的通道，保证阻塞等待它的协程可以继续执行

		g.Add(
			// Execute function: block until a termination signal arrives, then return.
			func() error {
				// Don't forget to release the reloadReady channel so that waiting blocks can exit normally.
				select {
				case <-term:
					level.Warn(logger).Log("msg", "Received SIGTERM, exiting gracefully...")
					reloadReady.Close()
				case <-webHandler.Quit():
					level.Warn(logger).Log("msg", "Received termination request via web service, exiting gracefully...")
				case <-cancel:
					reloadReady.Close()
				}
				return nil
			},
			// Interrupt function: closing cancel unblocks the select above.
			func(err error) {
				close(cancel)
			},
		)
	}

重加载配置之前，要等待 reloadReady.C

		g.Add(
			func() error {
				// Do not handle reload requests before reloadReady has been closed.
				<-reloadReady.C

				for {
					select {
					case <-hup:
						// Reload requested via the hup channel: failures are only logged.
						if err := reloadConfig(cfg.configFile, logger, noStepSubqueryInterval, reloaders...); err != nil {
							level.Error(logger).Log("msg", "Error reloading config", "err", err)
						}
					case rc := <-webHandler.Reload():
						// Reload requested via the web handler: the outcome
						// (error or nil) is sent back on the channel it supplied.
						if err := reloadConfig(cfg.configFile, logger, noStepSubqueryInterval, reloaders...); err != nil {
							level.Error(logger).Log("msg", "Error reloading config", "err", err)
							rc <- err
						} else {
							rc <- nil
						}
					case <-cancel:
						return nil
					}
				}

			},

初始化加载配置完成之后关闭 reloadReady

	{
		// Initial configuration loading.
		cancel := make(chan struct{})
		g.Add(
			func() error {
				// Wait for dbOpen before loading the configuration.
				select {
				case <-dbOpen:
				// In case a shutdown is initiated before the dbOpen is released
				case <-cancel:
					// Release everything blocked on reloadReady so it can exit.
					reloadReady.Close()
					return nil
				}

				if err := reloadConfig(cfg.configFile, logger, noStepSubqueryInterval, reloaders...); err != nil {
					return errors.Wrapf(err, "error loading config from %q", cfg.configFile)
				}

				// The initial configuration is loaded: unblock the actors
				// waiting on reloadReady.C.
				reloadReady.Close()

				webHandler.Ready()
				level.Info(logger).Log("msg", "Server is ready to receive web requests.")
				// Stay alive until the interrupt function closes cancel.
				<-cancel
				return nil
			},
			func(err error) {
				close(cancel)
			},
		)
	}

Rule manager 启动之前要等待 reloadReady

	{
		// Rule manager.
		g.Add(
			func() error {
				// Start the rule manager only after reloadReady has been closed.
				<-reloadReady.C
				ruleManager.Run()
				return nil
			},
			func(err error) {
				ruleManager.Stop()
			},
		)
	}

Notifier manager启动之前要等待 reloadReady

	{
		// Notifier.

		// Calling notifier.Stop() before ruleManager.Stop() will cause a panic if the ruleManager isn't running,
		// so keep this interrupt after the ruleManager.Stop().
		g.Add(
			func() error {
				// When the notifier manager receives a new targets list
				// it needs to read a valid config for each job.
				// It depends on the config being in sync with the discovery manager
				// so we wait until the config is fully loaded.
				<-reloadReady.C

				notifierManager.Run(discoveryManagerNotify.SyncCh())
				level.Info(logger).Log("msg", "Notifier manager stopped")
				return nil
			},
			func(err error) {
				notifierManager.Stop()
			},
		)
	}
	// Run all actors added to the group; a non-nil error is logged and the
	// process exits with status 1.
	if err := g.Run(); err != nil {
		level.Error(logger).Log("err", err)
		os.Exit(1)
	}
	level.Info(logger).Log("msg", "See you next time!")

5. scrape.Manager.Run()

参数是 map[string][]*targetgroup.Group 的 channel，键是 job name。Run 先在新的 goroutine 中启动 Manager.reloader() 方法，然后进入死循环：如果从 channel 收到更新的抓取目标，就执行 Manager.updateTsets() 方法将更新的目标赋值给 Manager.targetSets，然后尝试（非阻塞地）向 Manager.triggerReload 发送信号。

// Run stores every target-set update received on tsets and then signals the
// background reloader, which is started here in its own goroutine. The
// reload itself happens in that goroutine so that receiving updates is never
// blocked by it. Run returns nil once graceShut becomes readable.
func (m *Manager) Run(tsets <-chan map[string][]*targetgroup.Group) error {
	go m.reloader()
	for {
		select {
		case <-m.graceShut:
			return nil

		case update := <-tsets:
			m.updateTsets(update)

			// Non-blocking notification: if the send cannot proceed right
			// now, skip it instead of waiting.
			select {
			case m.triggerReload <- struct{}{}:
			default:
			}
		}
	}
}

// updateTsets replaces the manager's target sets with tsets while holding
// mtxScrape.
func (m *Manager) updateTsets(tsets map[string][]*targetgroup.Group) {
	m.mtxScrape.Lock()
	defer m.mtxScrape.Unlock()

	m.targetSets = tsets
}

Manager.reloader() 方法每 5 秒由 ticker 唤醒一次，随后阻塞等待 Manager.triggerReload 的信号（或关闭信号）；收到重载信号后执行 Manager.reload() 方法，因此重载最多每 5 秒发生一次。

// reloader runs until graceShut becomes readable. On every 5-second tick it
// blocks until either a reload has been requested through triggerReload, in
// which case it calls reload, or shutdown is signalled. Reloads therefore
// happen at most once per tick.
func (m *Manager) reloader() {
	tick := time.NewTicker(5 * time.Second)
	defer tick.Stop()

	for {
		select {
		case <-tick.C:
			// Tick elapsed: wait for the next reload request or shutdown.
			select {
			case <-m.graceShut:
				return
			case <-m.triggerReload:
				m.reload()
			}
		case <-m.graceShut:
			return
		}
	}
}

Manager.reload() 方法

// reload makes sure a scrape pool exists for every job in targetSets and
// syncs each pool with its target groups. The syncs run concurrently; reload
// returns after all of them have finished. Jobs without a scrape
// configuration, or whose pool cannot be created, are logged and skipped.
func (m *Manager) reload() {
	var syncs sync.WaitGroup

	m.mtxScrape.Lock()
	for job, tgs := range m.targetSets {
		pool, exists := m.scrapePools[job]
		if !exists {
			// No pool for this job yet: look up its scrape configuration.
			jobCfg, known := m.scrapeConfigs[job]
			if !known {
				level.Error(m.logger).Log("msg", "error reloading target set", "err", "invalid config id:"+job)
				continue
			}
			created, err := newScrapePool(jobCfg, m.append, m.jitterSeed, log.With(m.logger, "scrape_pool", job))
			if err != nil {
				level.Error(m.logger).Log("msg", "error creating new scrape pool", "err", err, "scrape_pool", job)
				continue
			}
			m.scrapePools[job] = created
			pool = created
		}

		syncs.Add(1)
		// Run the sync in parallel as these take a while and at high load can't catch up.
		go func(sp *scrapePool, groups []*targetgroup.Group) {
			sp.Sync(groups)
			syncs.Done()
		}(pool, tgs)
	}
	m.mtxScrape.Unlock()

	// The lock is released before waiting for the syncs to complete.
	syncs.Wait()
}

6. scrapePool 相关定义

scrapePool 结构体主要包含存储（storage.Appendable）、抓取配置、http client 和 targets，定义在 scrape/scrape.go。

// scrapePool manages scrapes for sets of targets.
type scrapePool struct {
	// Storage the pool's scrape loops obtain their appenders from.
	appendable storage.Appendable
	logger     log.Logger
	// Cancels the context that newScrapePool passes to every scrape loop.
	cancel     context.CancelFunc

	// mtx must not be taken after targetMtx.
	mtx            sync.Mutex
	config         *config.ScrapeConfig
	// HTTP client built from config's HTTPClientConfig.
	client         *http.Client
	// Scrape loops keyed by target hash.
	loops          map[uint64]loop
	targetLimitHit bool // Internal state to speed up the target_limit checks.

	targetMtx sync.Mutex
	// activeTargets and loops must always be synchronized to have the same
	// set of hashes.
	activeTargets  map[uint64]*Target
	droppedTargets []*Target

	// Constructor for new scrape loops. This is settable for testing convenience.
	newLoop func(scrapeLoopOptions) loop
}

工厂函数

// newScrapePool builds a scrapePool for the job described by cfg. It creates
// the HTTP client from cfg, a shared buffer pool, and a cancellable context
// that is handed to every scrape loop the pool creates. It returns an error
// when the HTTP client cannot be created.
func newScrapePool(cfg *config.ScrapeConfig, app storage.Appendable, jitterSeed uint64, logger log.Logger) (*scrapePool, error) {
	// Count scrape pool creation attempts; one pool is created per job.
	targetScrapePools.Inc()
	if logger == nil {
		logger = log.NewNopLogger()
	}

	// Create the HTTP client from the job's configuration.
	client, err := config_util.NewClientFromConfig(cfg.HTTPClientConfig, cfg.JobName, false, false)
	if err != nil {
		targetScrapePoolsFailed.Inc()
		return nil, errors.Wrap(err, "error creating HTTP client")
	}

	// Byte-slice buffers shared by all loops of this pool.
	// NOTE(review): per the original annotation pool.New returns a bucketed sync.Pool — confirm.
	buffers := pool.New(1e3, 100e6, 3, func(sz int) interface{} { return make([]byte, 0, sz) })

	ctx, cancel := context.WithCancel(context.Background())
	sp := &scrapePool{
		cancel:        cancel,
		appendable:    app,
		config:        cfg,
		client:        client,
		activeTargets: map[uint64]*Target{},
		loops:         map[uint64]loop{},
		logger:        logger,
	}
	// Default loop constructor; it closes over ctx, buffers, app, logger and jitterSeed.
	sp.newLoop = func(opts scrapeLoopOptions) loop {
		// Update the targets retrieval function for metadata to a new scrape cache.
		cache := opts.cache
		if cache == nil {
			cache = newScrapeCache()
		}
		opts.target.SetMetadataStore(cache)

		return newScrapeLoop(
			ctx,
			opts.scraper,
			log.With(logger, "target", opts.target),
			buffers,
			func(l labels.Labels) labels.Labels {
				return mutateSampleLabels(l, opts.target, opts.honorLabels, opts.mrc)
			},
			func(l labels.Labels) labels.Labels { return mutateReportSampleLabels(l, opts.target) },
			func(ctx context.Context) storage.Appender { return appender(app.Appender(ctx), opts.limit) },
			cache,
			jitterSeed,
			opts.honorTimestamps,
		)
	}

	return sp, nil
}

其中 scrapeCache 跟踪暴露的指标字符串到标签集和存储引用（storage reference）的映射，此外它还跟踪时序在相邻两次抓取之间的过期（staleness）情况。

// scrapeCache tracks mappings of exposed metric strings to label sets and
// storage references. Additionally, it tracks staleness of series between
// scrapes.
type scrapeCache struct {
	iter uint64 // Current scrape iteration.

	// How many series and metadata entries there were at the last success.
	successfulCount int

	// Parsed string to an entry with information about the actual label set
	// and its storage reference.
	series map[string]*cacheEntry

	// Cache of dropped metric strings and their iteration. The iteration must
	// be a pointer so we can update it without setting a new entry with an unsafe
	// string in addDropped().
	droppedSeries map[string]*uint64

	// seriesCur and seriesPrev store the labels of series that were seen
	// in the current and previous scrape.
	// We hold two maps and swap them out to save allocations.
	seriesCur  map[uint64]labels.Labels
	seriesPrev map[uint64]labels.Labels

	// metaMtx presumably guards metadata — TODO confirm against the methods using it.
	metaMtx  sync.Mutex
	metadata map[string]*metaEntry
}

7. 在 Manager.reload() 里针对每个 job 执行 sp.Sync(groups)

将 target group 转换为实际的抓取目标，同步当前运行的 scraper 和结果集，返回全部抓取和丢弃的目标。

// Sync turns the given target groups into concrete scrape targets and hands
// the targets that still carry labels to sync, which reconciles the running
// scrape loops with them. Targets without labels but with discovered labels
// are recorded in droppedTargets. The duration of the whole operation and a
// sync counter are reported per job.
func (sp *scrapePool) Sync(tgs []*targetgroup.Group) {
	sp.mtx.Lock()
	defer sp.mtx.Unlock()
	begin := time.Now()

	sp.targetMtx.Lock()
	var keep []*Target
	sp.droppedTargets = []*Target{}
	for _, group := range tgs {
		// Build targets from the group using the pool's scrape configuration.
		built, err := targetsFromGroup(group, sp.config)
		if err != nil {
			level.Error(sp.logger).Log("msg", "creating targets failed", "err", err)
			continue
		}
		for _, target := range built {
			switch {
			case target.Labels().Len() > 0:
				keep = append(keep, target)
			case target.DiscoveredLabels().Len() > 0:
				// No labels left but discovered labels exist: treat the
				// target as dropped (presumably by relabeling).
				sp.droppedTargets = append(sp.droppedTargets, target)
			}
		}
	}
	sp.targetMtx.Unlock()
	sp.sync(keep)

	elapsed := time.Since(begin).Seconds()
	targetSyncIntervalLength.WithLabelValues(sp.config.JobName).Observe(elapsed)
	targetScrapePoolSyncsCounter.WithLabelValues(sp.config.JobName).Inc()
}

8. scrapePool.sync(targets []*Target)方法

参数是所有抓取目标。方法对目标去重，对于要抓取的目标，启动抓取循环，对于要丢弃的目标，停止其抓取循环。在全部要停止的循环终止以后返回。

// sync takes a list of potentially duplicated targets, deduplicates them, starts
// scrape loops for new targets, and stops scrape loops for disappeared targets.
// It returns after all stopped scrape loops terminated.
func (sp *scrapePool) sync(targets []*Target) {
	var (
		// Hashes seen in this call; the value is the new loop to start, or
		// nil when the target was already active.
		uniqueLoops     = make(map[uint64]loop)
		interval        = time.Duration(sp.config.ScrapeInterval)
		timeout         = time.Duration(sp.config.ScrapeTimeout)
		limit           = int(sp.config.SampleLimit)
		honorLabels     = sp.config.HonorLabels
		honorTimestamps = sp.config.HonorTimestamps
		mrc             = sp.config.MetricRelabelConfigs
	)

	sp.targetMtx.Lock()
	for _, t := range targets {
		hash := t.hash()

		// Target not active yet: create its scraper and loop and register both.
		if _, ok := sp.activeTargets[hash]; !ok {
			s := &targetScraper{Target: t, client: sp.client, timeout: timeout}
			l := sp.newLoop(scrapeLoopOptions{
				target:          t,
				scraper:         s,
				limit:           limit,
				honorLabels:     honorLabels,
				honorTimestamps: honorTimestamps,
				mrc:             mrc,
			})

			sp.activeTargets[hash] = t
			sp.loops[hash] = l

			uniqueLoops[hash] = l
		} else {
			// This might be a duplicated target.
			// Record the hash as seen without overwriting a loop created
			// earlier in this call.
			if _, ok := uniqueLoops[hash]; !ok {
				uniqueLoops[hash] = nil
			}
			// Need to keep the most updated labels information
			// for displaying it in the Service Discovery web page.
			sp.activeTargets[hash].SetDiscoveredLabels(t.DiscoveredLabels())
		}
	}

	var wg sync.WaitGroup

	// Stop and remove old targets and scraper loops.
	for hash := range sp.activeTargets {
		if _, ok := uniqueLoops[hash]; !ok {
			wg.Add(1)
			go func(l loop) {
				l.stop()
				wg.Done()
			}(sp.loops[hash])

			delete(sp.loops, hash)
			delete(sp.activeTargets, hash)
		}
	}

	sp.targetMtx.Unlock()

	// Report the number of distinct targets seen in this sync, propagate the
	// target-limit error state to all loops, then start the new loops.
	targetScrapePoolTargetsAdded.WithLabelValues(sp.config.JobName).Set(float64(len(uniqueLoops)))
	forcedErr := sp.refreshTargetLimitErr()
	for _, l := range sp.loops {
		l.setForcedError(forcedErr)
	}
	for _, l := range uniqueLoops {
		// nil entries belong to targets that were already active.
		if l != nil {
			go l.run(interval, timeout, nil)
		}
	}
	// Wait for all potentially stopped scrapers to terminate.
	// This covers the case of flapping targets. If the server is under high load, a new scraper
	// may be active and tries to insert. The old scraper that didn't terminate yet could still
	// be inserting a previous sample set.
	wg.Wait()
}

习得

Manager 的 reloader() 作为重加载器，每 5 秒被 ticker 唤醒一次，等到重载信号后才执行 reload()，即 reload() 最多每 5 秒执行一次。
在 reload() 中，每个 targetgroup(job) 组创建一个 ScrapePool，启动一个 goroutine 异步执行 Sync() 并等待完成。
在 scrapePool.Sync() 中将 targetgroup 转换为具体的 target 作为参数调用 scrapePool.sync()。
scrapePool.sync() 并发停止旧的抓取循环，在新的 goroutine 中启动新的抓取循环，并等待所有被停止的循环终止后返回。
此时完成一轮 Manager.Run() 循环，每次更新抓取目标就执行一轮循环。
每一轮 Manager.Run() 都是一个闭环，严格管理每个 goroutine 的生命周期，会对不再抓取的目标停止抓取，对新的目标开启抓取。

疯琴

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
Prometheus源码学习(8) scrape总体流程

1. main 函数中初始化 scrapeManager 实例// 初始化 scrapeManager，fanout Storage 是一个读写多个底层存储的代理scrapeManager = scrape.NewManager(log.With(logger, "component", "scrape manager"), fanoutStorage)fanoutStorage 是读写多个底层存储的代理，实现了 storage.Appendable 接口。scrape.Manager 结构体.
复制链接

扫一扫

专栏目录