Prometheus in Practice and Source Code Analysis: storage

Prometheus supports not only local storage but also remote storage. Let's start with remote storage: data is shipped to the remote end through a send queue. First, look at how the queue manager is constructed:

func NewQueueManager(cfg QueueManagerConfig) *QueueManager {
    if cfg.QueueCapacity == 0 {
        cfg.QueueCapacity = defaultQueueCapacity
    }
    if cfg.MaxShards == 0 {
        cfg.MaxShards = defaultMaxShards
    }
    if cfg.MaxSamplesPerSend == 0 {
        cfg.MaxSamplesPerSend = defaultMaxSamplesPerSend
    }
    if cfg.BatchSendDeadline == 0 {
        cfg.BatchSendDeadline = defaultBatchSendDeadline
    }

    t := &QueueManager{
        cfg:         cfg,
        queueName:   cfg.Client.Name(),
        logLimiter:  rate.NewLimiter(logRateLimit, logBurst),
        numShards:   1,
        reshardChan: make(chan int),
        quit:        make(chan struct{}),

        samplesIn:          newEWMARate(ewmaWeight, shardUpdateDuration),
        samplesOut:         newEWMARate(ewmaWeight, shardUpdateDuration),
        samplesOutDuration: newEWMARate(ewmaWeight, shardUpdateDuration),
    }
    t.shards = t.newShards(t.numShards)
    numShards.WithLabelValues(t.queueName).Set(float64(t.numShards))
    queueCapacity.WithLabelValues(t.queueName).Set(float64(t.cfg.QueueCapacity))

    return t
}
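
All four tuning knobs are optional: a zero value falls back to the corresponding default. As a hedged usage sketch (assuming a client c implementing the storage client interface the queue expects, i.e. Store and Name):

qm := NewQueueManager(QueueManagerConfig{
    Client: c, // remote-write client; its Name() labels the queue's metrics
    // QueueCapacity, MaxShards, MaxSamplesPerSend and BatchSendDeadline
    // are left at zero here, so the defaults above kick in.
})
qm.Start()      // start the shards and the resharding loop
defer qm.Stop() // flush and shut the shards down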

The queue allows at most 1000 shards, and each shard can send about 1000 samples per second, so on the order of 1000 * 1000 samples can be shipped per second. Every storage backend, whether local or remote, implements an Append method for writing data, and remote storage is no exception: its Append simply delegates to the queue's Append (a sketch follows).
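
As a hedged sketch (assuming the remote Storage type holds a read-write mutex and one *QueueManager per configured remote endpoint, consistent with the code in this post), the fan-out might look like:

func (s *Storage) Append(smpl *model.Sample) error {
    s.mtx.RLock()
    defer s.mtx.RUnlock()
    // Every remote endpoint sees the sample; each queue copies and
    // relabels it independently inside its own Append.
    for _, q := range s.queues {
        q.Append(smpl)
    }
    return nil
}

QueueManager.Append then copies the sample, merges in any configured external labels, applies relabeling, and enqueues the result: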

func (t *QueueManager) Append(s *model.Sample) error {
    var snew model.Sample
    snew = *s
    snew.Metric = s.Metric.Clone()

    for ln, lv := range t.cfg.ExternalLabels {
        if _, ok := s.Metric[ln]; !ok {
            snew.Metric[ln] = lv
        }
    }

    snew.Metric = model.Metric(
        relabel.Process(model.LabelSet(snew.Metric), t.cfg.RelabelConfigs...))

    if snew.Metric == nil {
        return nil
    }

    t.shardsMtx.Lock()
    enqueued := t.shards.enqueue(&snew)
    t.shardsMtx.Unlock()

    if enqueued {
        queueLength.WithLabelValues(t.queueName).Inc()
    } else {
        droppedSamplesTotal.WithLabelValues(t.queueName).Inc()
        if t.logLimiter.Allow() {
            log.Warn("Remote storage queue full, discarding sample. Multiple subsequent messages of this kind may be suppressed.")
        }
    }
    return nil
}
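
A small, self-contained illustration (not Prometheus source) of the external-label merge above: a label already present on the sample always wins over the configured ExternalLabels.

package main

import (
    "fmt"

    "github.com/prometheus/common/model"
)

func main() {
    metric := model.Metric{"__name__": "up", "job": "api"}
    external := model.LabelSet{"job": "scraper", "cluster": "eu-1"}
    for ln, lv := range external {
        if _, ok := metric[ln]; !ok {
            metric[ln] = lv // only "cluster" is added; "job" stays "api"
        }
    }
    fmt.Println(metric) // up{cluster="eu-1", job="api"}
}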

If relabeling drops the sample, snew.Metric comes back nil and the sample is silently skipped; otherwise it is pushed onto a shard queue via enqueued := t.shards.enqueue(&snew):

func (s *shards) enqueue(sample *model.Sample) bool {
    s.qm.samplesIn.incr(1)

    fp := sample.Metric.FastFingerprint()
    shard := uint64(fp) % uint64(len(s.queues))

    select {
    case s.queues[shard] <- sample:
        return true
    default:
        return false
    }
}

Inside, shard selection is a plain modulo over the label-set fingerprint, and the sample lands in that shard's channel. Would consistent hashing be a better fit here? With modulo, nearly every series moves to a different shard whenever the shard count changes during resharding; a sketch of the alternative follows.
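
A minimal consistent-hashing sketch (purely illustrative, not Prometheus code): each shard is placed on a ring at several virtual points, and a fingerprint is routed to the first point at or after it, so changing the shard count only remaps roughly 1/n of the series instead of nearly all of them.

package main

import (
    "fmt"
    "hash/fnv"
    "sort"
)

type ring struct {
    points []uint64       // sorted positions on the ring
    owner  map[uint64]int // ring position -> shard index
}

func newRing(shards, vnodes int) *ring {
    r := &ring{owner: make(map[uint64]int)}
    for s := 0; s < shards; s++ {
        for v := 0; v < vnodes; v++ {
            h := fnv.New64a()
            fmt.Fprintf(h, "shard-%d-%d", s, v)
            p := h.Sum64()
            r.points = append(r.points, p)
            r.owner[p] = s
        }
    }
    sort.Slice(r.points, func(i, j int) bool { return r.points[i] < r.points[j] })
    return r
}

func (r *ring) shardFor(fp uint64) int {
    i := sort.Search(len(r.points), func(i int) bool { return r.points[i] >= fp })
    if i == len(r.points) {
        i = 0 // wrap around the ring
    }
    return r.owner[r.points[i]]
}

func main() {
    r := newRing(4, 16)
    fmt.Println(r.shardFor(0xdeadbeef))
}

When the QueueManager starts, it launches one send goroutine per shard: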

func (s *shards) start() {
    for i := 0; i < len(s.queues); i++ {
        go s.runShard(i)
    }
}

Each of those goroutines runs runShard, which drains its queue, batching up to MaxSamplesPerSend samples and flushing whatever is pending once BatchSendDeadline elapses:

func (s *shards) runShard(i int) {
    defer s.wg.Done()
    queue := s.queues[i]

    // Send batches of at most MaxSamplesPerSend samples to the remote storage.
    // If we have fewer samples than that, flush them out after a deadline
    // anyways.
    pendingSamples := model.Samples{}

    for {
        select {
        case sample, ok := <-queue:
            if !ok {
                if len(pendingSamples) > 0 {
                    log.Debugf("Flushing %d samples to remote storage...", len(pendingSamples))
                    s.sendSamples(pendingSamples)
                    log.Debugf("Done flushing.")
                }
                return
            }

            queueLength.WithLabelValues(s.qm.queueName).Dec()
            pendingSamples = append(pendingSamples, sample)

            for len(pendingSamples) >= s.qm.cfg.MaxSamplesPerSend {
                s.sendSamples(pendingSamples[:s.qm.cfg.MaxSamplesPerSend])
                pendingSamples = pendingSamples[s.qm.cfg.MaxSamplesPerSend:]
            }
        case <-time.After(s.qm.cfg.BatchSendDeadline):
            if len(pendingSamples) > 0 {
                s.sendSamples(pendingSamples)
                pendingSamples = pendingSamples[:0]
            }
        }
    }
}

The actual delivery of a batch happens in sendSamples:

func (s *shards) sendSamples(samples model.Samples) {
    // Samples are sent to the remote storage on a best-effort basis. If a
    // sample isn't sent correctly the first time, it's simply dropped on the
    // floor.
    begin := time.Now()
    err := s.qm.cfg.Client.Store(samples)
    duration := time.Since(begin)

    if err != nil {
        log.Warnf("error sending %d samples to remote storage: %s", len(samples), err)
        failedSamplesTotal.WithLabelValues(s.qm.queueName).Add(float64(len(samples)))
    } else {
        sentSamplesTotal.WithLabelValues(s.qm.queueName).Add(float64(len(samples)))
    }
    sentBatchDuration.WithLabelValues(s.qm.queueName).Observe(duration.Seconds())

    s.qm.samplesOut.incr(int64(len(samples)))
    s.qm.samplesOutDuration.incr(int64(duration))
}

Delivery ultimately goes through the client's Store method:

func (c *Client) Store(samples model.Samples) error {
    req := &WriteRequest{
        Timeseries: make([]*TimeSeries, 0, len(samples)),
    }
    for _, s := range samples {
        ts := &TimeSeries{
            Labels: make([]*LabelPair, 0, len(s.Metric)),
        }
        for k, v := range s.Metric {
            ts.Labels = append(ts.Labels,
                &LabelPair{
                    Name:  string(k),
                    Value: string(v),
                })
        }
        ts.Samples = []*Sample{
            {
                Value:       float64(s.Value),
                TimestampMs: int64(s.Timestamp),
            },
        }
        req.Timeseries = append(req.Timeseries, ts)
    }

    data, err := proto.Marshal(req)
    if err != nil {
        return err
    }

    buf := bytes.Buffer{}
    if _, err := snappy.NewWriter(&buf).Write(data); err != nil {
        return err
    }

    httpReq, err := http.NewRequest("POST", c.url.String(), &buf)
    if err != nil {
        return err
    }
    httpReq.Header.Add("Content-Encoding", "snappy")

    ctx, _ := context.WithTimeout(context.Background(), c.timeout)
    httpResp, err := ctxhttp.Do(ctx, c.client, httpReq)
    if err != nil {
        return err
    }
    defer httpResp.Body.Close()
    if httpResp.StatusCode/100 != 2 {
        return fmt.Errorf("server returned HTTP status %s", httpResp.Status)
    }
    return nil
}

Store, then, marshals the batch into a protobuf WriteRequest, compresses it with snappy, and ships it in a single POST request.
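
For context, here is a hedged sketch (not Prometheus code) of an HTTP endpoint that could accept this payload: it simply undoes the snappy stream framing and the protobuf encoding that Store applies, assuming the same generated WriteRequest type is available on the server side.

package main

import (
    "io/ioutil"
    "log"
    "net/http"

    "github.com/golang/protobuf/proto"
    "github.com/golang/snappy"
)

func handleWrite(w http.ResponseWriter, r *http.Request) {
    // Reverse the framing produced by snappy.NewWriter above.
    data, err := ioutil.ReadAll(snappy.NewReader(r.Body))
    if err != nil {
        http.Error(w, err.Error(), http.StatusBadRequest)
        return
    }
    var req WriteRequest // assumed: same generated protobuf type as the client's
    if err := proto.Unmarshal(data, &req); err != nil {
        http.Error(w, err.Error(), http.StatusBadRequest)
        return
    }
    for _, ts := range req.Timeseries {
        log.Printf("received %d samples with %d labels", len(ts.Samples), len(ts.Labels))
    }
}

func main() {
    http.HandleFunc("/receive", handleWrite)
    log.Fatal(http.ListenAndServe(":9201", nil))
}

That covers remote storage; now on to local storage, whose design is considerably more complex. Samples are first held in memory and then flushed to disk in batches. Start with the in-memory storage manager: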

type MemorySeriesStorage struct {
    // archiveHighWatermark and numChunksToPersist have to be aligned for atomic operations.
    archiveHighWatermark model.Time    // No archived series has samples after this time.
    numChunksToPersist   int64         // The number of chunks waiting for persistence.
    maxChunksToPersist   int           // If numChunksToPersist reaches this threshold, ingestion will be throttled.
    rushed               bool          // Whether the storage is in rushed mode.
    rushedMtx            sync.Mutex    // Protects entering and exiting rushed mode.
    throttled            chan struct{} // This chan is sent to whenever NeedsThrottling() returns true (for logging).

    fpLocker   *fingerprintLocker
    fpToSeries *seriesMap

    options *MemorySeriesStorageOptions

    loopStopping, loopStopped  chan struct{}
    logThrottlingStopped       chan struct{}
    maxMemoryChunks            int
    dropAfter                  time.Duration
    checkpointInterval         time.Duration
    checkpointDirtySeriesLimit int

    persistence *persistence
    mapper      *fpMapper

    evictList                   *list.List
    evictRequests               chan chunk.EvictRequest
    evictStopping, evictStopped chan struct{}

    quarantineRequests                    chan quarantineRequest
    quarantineStopping, quarantineStopped chan struct{}

    persistErrors                 prometheus.Counter
    queuedChunksToPersist         prometheus.Counter
    numSeries                     prometheus.Gauge
    numHeadChunks                 prometheus.Gauge
    dirtySeries                   prometheus.Gauge
    seriesOps                     *prometheus.CounterVec
    ingestedSamplesCount          prometheus.Counter
    discardedSamplesCount         *prometheus.CounterVec
    nonExistentSeriesMatchesCount prometheus.Counter
    maintainSeriesDuration        *prometheus.SummaryVec
    persistenceUrgencyScore       prometheus.Gauge
    rushedMode                    prometheus.Gauge
}

MemorySeriesStorage is the in-memory storage manager. Like the remote storage, it implements Append to ingest samples:

func (s *MemorySeriesStorage) Append(sample *model.Sample) error {
    for ln, lv := range sample.Metric {
        if len(lv) == 0 {
            delete(sample.Metric, ln)
        }
    }
    rawFP := sample.Metric.FastFingerprint()
    s.fpLocker.Lock(rawFP)
    fp := s.mapper.mapFP(rawFP, sample.Metric)
    defer func() {
        s.fpLocker.Unlock(fp)
    }() // Func wrapper because fp might change below.
    if fp != rawFP {
        // Switch locks.
        s.fpLocker.Unlock(rawFP)
        s.fpLocker.Lock(fp)
    }
    series, err := s.getOrCreateSeries(fp, sample.Metric)
    if err != nil {
        return err // getOrCreateSeries took care of quarantining already.
    }

    if sample.Timestamp == series.lastTime {
        // Don't report "no-op appends", i.e. where timestamp and sample
        // value are the same as for the last append, as they are a
        // common occurrence when using client-side timestamps
        // (e.g. Pushgateway or federation).
        if sample.Timestamp == series.lastTime &&
            series.lastSampleValueSet &&
            sample.Value.Equal(series.lastSampleValue) {
            return nil
        }
        s.discardedSamplesCount.WithLabelValues(duplicateSample).Inc()
        return ErrDuplicateSampleForTimestamp // Caused by the caller.
    }
    if sample.Timestamp < series.lastTime {
        s.discardedSamplesCount.WithLabelValues(outOfOrderTimestamp).Inc()
        return ErrOutOfOrderSample // Caused by the caller.
    }
    completedChunksCount, err := series.add(model.SamplePair{
        Value:     sample.Value,
        Timestamp: sample.Timestamp,
    })
    if err != nil {
        s.quarantineSeries(fp, sample.Metric, err)
        return err
    }
    s.ingestedSamplesCount.Inc()
    s.incNumChunksToPersist(completedChunksCount)

    return nil
}

Append first fetches the series via getOrCreateSeries. A series can be understood as all samples sharing one label set kept together, which makes compression and lookup efficient; the sample itself is appended with series.add. A small illustration of how a sample finds its series follows.
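
This example uses the real github.com/prometheus/common/model package: the label set is hashed to a fingerprint, and that fingerprint keys into fpToSeries.

package main

import (
    "fmt"

    "github.com/prometheus/common/model"
)

func main() {
    m := model.Metric{
        model.MetricNameLabel: "http_requests_total",
        "job":                 "api",
    }
    // The same label set always yields the same fingerprint,
    // so all of its samples land in the same series.
    fmt.Println(m.FastFingerprint())
}

But series.add only writes to memory; how does the data reach disk? When MemorySeriesStorage starts: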

    p, err = newPersistence(
        s.options.PersistenceStoragePath,
        s.options.Dirty, s.options.PedanticChecks,
        syncStrategy,
        s.options.MinShrinkRatio,
    )
    if err != nil {
        return err
    }
    s.persistence = p
    // Persistence must start running before loadSeriesMapAndHeads() is called.
    go s.persistence.run()
    ...
    go s.loop()

This persistence object takes care of writing in-memory data to disk. Inside loop (note that the loop label on the for statement is elided in this excerpt):

    for {
        select {
        case <-s.loopStopping:
            break loop
        case fp := <-memoryFingerprints:
            if s.maintainMemorySeries(fp, model.Now().Add(-s.dropAfter)) {
                dirty := atomic.AddInt64(&dirtySeriesCount, 1)
                s.dirtySeries.Set(float64(dirty))
                // Check if we have enough "dirty" series so that we need an early checkpoint.
                // However, if we are already behind persisting chunks, creating a checkpoint
                // would be counterproductive, as it would slow down chunk persisting even more,
                // while in a situation like that, where we are clearly lacking speed of disk
                // maintenance, the best we can do for crash recovery is to persist chunks as
                // quickly as possible. So only checkpoint if the urgency score is < 1.
                if dirty >= int64(s.checkpointDirtySeriesLimit) &&
                    s.calculatePersistenceUrgencyScore() < 1 {
                    checkpointTimer.Reset(0)
                }
            }
        case fp := <-archivedFingerprints:
            s.maintainArchivedSeries(fp, model.Now().Add(-s.dropAfter))
        }
    }
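
The urgency score checked above is, roughly, how close the storage is to its persistence limits; at a score of 1 or more, chunks are piling up as fast as they can be written, and an extra checkpoint would only slow things down further. A deliberately simplified sketch of the idea (the real calculatePersistenceUrgencyScore also factors in memory-chunk eviction and toggles rushed mode):

// Simplified sketch, not the real implementation; atomic is sync/atomic.
func (s *MemorySeriesStorage) persistenceUrgency() float64 {
    backlog := float64(atomic.LoadInt64(&s.numChunksToPersist))
    score := backlog / float64(s.maxChunksToPersist)
    if score > 1 {
        return 1
    }
    return score
}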

maintainMemorySeries does the per-series housekeeping: closing head chunks, persisting, archiving, and evicting:

func (s *MemorySeriesStorage) maintainMemorySeries(
    fp model.Fingerprint, beforeTime model.Time,
) (becameDirty bool) {
    defer func(begin time.Time) {
        s.maintainSeriesDuration.WithLabelValues(maintainInMemory).Observe(
            time.Since(begin).Seconds(),
        )
    }(time.Now())

    s.fpLocker.Lock(fp)
    defer s.fpLocker.Unlock(fp)

    series, ok := s.fpToSeries.get(fp)
    if !ok {
        // Series is actually not in memory, perhaps archived or dropped in the meantime.
        return false
    }

    defer s.seriesOps.WithLabelValues(memoryMaintenance).Inc()

    closed, err := series.maybeCloseHeadChunk()
    if err != nil {
        s.quarantineSeries(fp, series.metric, err)
        s.persistErrors.Inc()
    }
    if closed {
        s.incNumChunksToPersist(1)
        s.numHeadChunks.Dec()
    }

    seriesWasDirty := series.dirty

    if s.writeMemorySeries(fp, series, beforeTime) {
        // Series is gone now, we are done.
        return false
    }

    iOldestNotEvicted := -1
    for i, cd := range series.chunkDescs {
        if !cd.IsEvicted() {
            iOldestNotEvicted = i
            break
        }
    }

    // Archive if all chunks are evicted. Also make sure the last sample has
    // an age of at least headChunkTimeout (which is very likely anyway).
    if iOldestNotEvicted == -1 && model.Now().Sub(series.lastTime) > headChunkTimeout {
        s.fpToSeries.del(fp)
        s.numSeries.Dec()
        s.persistence.archiveMetric(fp, series.metric, series.firstTime(), series.lastTime)
        s.seriesOps.WithLabelValues(archive).Inc()
        oldWatermark := atomic.LoadInt64((*int64)(&s.archiveHighWatermark))
        if oldWatermark < int64(series.lastTime) {
            if !atomic.CompareAndSwapInt64(
                (*int64)(&s.archiveHighWatermark),
                oldWatermark, int64(series.lastTime),
            ) {
                panic("s.archiveHighWatermark modified outside of maintainMemorySeries")
            }
        }
        return
    }
    // If we are here, the series is not archived, so check for Chunk.Desc
    // eviction next.
    series.evictChunkDescs(iOldestNotEvicted)

    return series.dirty && !seriesWasDirty
}

writeMemorySeries flushes the series' persistable chunks to disk; internally it calls persistChunks:

func (p *persistence) persistChunks(fp model.Fingerprint, chunks []chunk.Chunk) (index int, err error) {
    f, err := p.openChunkFileForWriting(fp)
    if err != nil {
        return -1, err
    }
    defer p.closeChunkFile(f)

    if err := p.writeChunks(f, chunks); err != nil {
        return -1, err
    }

    // Determine index within the file.
    offset, err := f.Seek(0, os.SEEK_CUR)
    if err != nil {
        return -1, err
    }
    index, err = chunkIndexForOffset(offset)
    if err != nil {
        return -1, err
    }

    return index - len(chunks), err
}
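
The index arithmetic works because every chunk occupies a fixed number of bytes in the series file; chunkIndexForOffset is, in essence, a division (sketched here under that assumption):

// Sketch: chunkLenWithHeader is the fixed on-disk size of one chunk.
func chunkIndexForOffset(offset int64) (int, error) {
    if int(offset)%chunkLenWithHeader != 0 {
        return -1, fmt.Errorf("offset %d is not a multiple of the chunk size", offset)
    }
    return int(offset) / chunkLenWithHeader, nil
}

Since f.Seek(0, os.SEEK_CUR) reports the position after the newly written chunks, subtracting len(chunks) yields the index of the first chunk that was just persisted.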

How are these series found again at query time? Prometheus keeps an index, stored in the well-known LevelDB, a key-value database, and lookups go through that index. The interface is defined in storage/local/index/interface.go:

type KeyValueStore interface {
    Put(key, value encoding.BinaryMarshaler) error
    // Get unmarshals the result into value. It returns false if no entry
    // could be found for key. If value is nil, Get behaves like Has.
    Get(key encoding.BinaryMarshaler, value encoding.BinaryUnmarshaler) (bool, error)
    Has(key encoding.BinaryMarshaler) (bool, error)
    // Delete returns (false, nil) if key does not exist.
    Delete(key encoding.BinaryMarshaler) (bool, error)

    NewBatch() Batch
    Commit(b Batch) error

    // ForEach iterates through the complete KeyValueStore and calls the
    // supplied function for each mapping.
    ForEach(func(kv KeyValueAccessor) error) error

    Close() error
}

The implementation lives in storage/local/index/leveldb.go; it is fairly long, so I won't paste all of it here, but the gist is sketched below.
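
To give the flavor, here is a condensed, hedged sketch of the LevelDB-backed implementation (names illustrative; see the real file for details), built on github.com/syndtr/goleveldb:

package index

import (
    "encoding"

    "github.com/syndtr/goleveldb/leveldb"
)

// LevelDB wraps a goleveldb handle behind the KeyValueStore interface.
type LevelDB struct {
    storage *leveldb.DB
}

func (l *LevelDB) Put(key, value encoding.BinaryMarshaler) error {
    k, err := key.MarshalBinary()
    if err != nil {
        return err
    }
    v, err := value.MarshalBinary()
    if err != nil {
        return err
    }
    return l.storage.Put(k, v, nil)
}

func (l *LevelDB) Get(key encoding.BinaryMarshaler, value encoding.BinaryUnmarshaler) (bool, error) {
    k, err := key.MarshalBinary()
    if err != nil {
        return false, err
    }
    raw, err := l.storage.Get(k, nil)
    if err == leveldb.ErrNotFound {
        return false, nil
    }
    if err != nil {
        return false, err
    }
    if value == nil {
        return true, nil // Get with a nil value behaves like Has.
    }
    return true, value.UnmarshalBinary(raw)
}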
