RocksDB:memtable flush流程详解

Memtable的关键配置项

  • write_buffer_size:某个CF中一个memtable的size
  • db_write_buffer_size:所有CF所有memtable的总size
  • max_write_buffer_number:某个CF最大memtable数量
  • min_write_buffer_number_to_merge:某个CF触发flush前至少需要积累的immutable memtable数量
  • max_total_wal_size:WAL的最大size。

Memtable flush框架

需要flush的CF加入队列

DBImpl中包含了flush_queue_,当某个CF满足flush条件时,将CF添加到队列中。

// Excerpt of DBImpl showing only the flush queue; all other members are elided.
class DBImpl {
  ...
  // Queue of column families that met the flush condition and are waiting to
  // be picked up by a background flush.
  // NOTE(review): the SchedulePendingFlush excerpt pushes a FlushRequest into
  // this queue, so the element type may differ between RocksDB versions.
  std::deque<ColumnFamilyData*> flush_queue_;
  ...
};

判断某个CF是否需要flush的条件如下:已明确请求flush(flush_requested_)且尚未开始flush的immutable memtable数量不少于1,或者尚未开始flush的immutable memtable数量不少于配置的min_write_buffer_number_to_merge。

// Returns true when a flush should be performed for this memtable list:
// either a flush was explicitly requested and at least one memtable has not
// started flushing, or the number of memtables that have not started flushing
// has reached min_write_buffer_number_to_merge_.
bool MemTableList::IsFlushPending() const {
  if ((flush_requested_ && num_flush_not_started_ >= 1) ||
      (num_flush_not_started_ >= min_write_buffer_number_to_merge_)) {
    // Sanity check: imm_flush_needed is expected to be set whenever a flush
    // is pending.
    assert(imm_flush_needed.load(std::memory_order_relaxed));
    return true;
  }
  return false;
}
void DBImpl::
(const FlushRequest& flush_req,
                                  FlushReason flush_reason) {
  ...
  if (!immutable_db_options_.atomic_flush) {
    ...
    if (!cfd->queued_for_flush() && cfd->imm()->IsFlushPending()) {
      cfd->Ref();
      cfd->set_queued_for_flush(true);
      cfd->SetFlushReason(flush_reason);
      ++unscheduled_flushes_;
      flush_queue_.push_back(flush_req);
    }
  } else {
    ...
  }
}

后台线程从flush_queue_中获取CF,将CF中的memtable刷盘

后台flush线程BGWorkFlush,会调用BackgroundFlush进行flush。

// Schedules background flush jobs for pending flushes (excerpt; the
// compaction scheduling part is elided).
void DBImpl::MaybeScheduleFlushOrCompaction() {
  ...
  // Schedule one BGWorkFlush job per unscheduled flush, as long as the flush
  // pool is not empty and the max_flushes limit has not been reached.
  while (!is_flush_pool_empty && unscheduled_flushes_ > 0 &&
         bg_flush_scheduled_ < bg_job_limits.max_flushes) {
    unscheduled_flushes_--;
    bg_flush_scheduled_++;
    // Start the background job BGWorkFlush with HIGH priority.
    env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH, this);
  }
  ...
}
// Takes a column family from flush_queue_ and flushes its memtables to disk
// (excerpt). Returns the status of the flush.
Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
                               LogBuffer* log_buffer) {
  ...
  // Pick a column family that satisfies the flush condition.
  while (!flush_queue_.empty()) {
    auto first_cfd = PopFirstFromFlushQueue();
    // Re-check IsFlushPending to decide whether a flush is really needed
    // (check elided).
    ...
    cfd = first_cfd;
    break;
  }

  if (cfd != nullptr) {
    ...
    // Write the memtable(s) to disk.
    status = FlushMemTableToOutputFile(cfd, mutable_cf_options, made_progress,
                                       job_context, log_buffer);
    ...
  }
  return status;
}

Memtable刷盘,memtable -> L0 sstable

// Flushes the memtables of `cfd` to an output file (excerpt): builds a
// FlushJob, picks the memtables, runs the job, then installs a new
// SuperVersion.
Status DBImpl::FlushMemTableToOutputFile(
    ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
    bool* made_progress, JobContext* job_context,
    SuperVersionContext* superversion_context,
    ...) {
  // Create the flush job.
  FlushJob flush_job(
      dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id, ...);
  // Pick the memtables of the column family that need to be flushed.
  flush_job.PickMemTable();
  // Run the memtable flush.
  s = flush_job.Run(&logs_with_prep_tracker_, &file_meta,
                      &switched_to_mempurge);
  // Bring the new SuperVersion online.
  InstallSuperVersionAndScheduleWork(cfd, superversion_context,
                                       mutable_cf_options);
}
// Runs the flush job (excerpt; only the call to WriteLevel0Table is shown).
Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta, bool* switched_to_mempurge) {
  // Convert the memtables into an L0 sstable.
  s = WriteLevel0Table();
}
// Writes the picked memtables (mems_) into a level-0 table (excerpt):
// collects per-memtable iterators, merges them, builds the table, and
// records the new file in the version edit.
Status FlushJob::WriteLevel0Table() {
  // Create iterators for every memtable.
  for (MemTable* m : mems_) {
    // Memtable (point entry) iterator.
    memtables.push_back(m->NewIterator(ro, &arena));
    // Range deletion iterator; only kept when the memtable returns one.
    auto* range_del_iter =
        m->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
    if (range_del_iter != nullptr) {
      range_del_iters.emplace_back(range_del_iter);
    }
  }
  // Create a merging iterator over the per-memtable iterators created above.
  ScopedArenaIterator iter(
          NewMergingIterator(&cfd_->internal_comparator(), memtables.data(),
                             static_cast<int>(memtables.size()), &arena));
  // Build the L0 sstable from the merging iterator and the range deletion
  // iterators.
  s = BuildTable(dbname_, versions_, db_options_, tboptions, file_options_,
          cfd_->table_cache(), iter.get(), std::move(range_del_iters), ...);
  // After the table is built, record the new level-0 file in the VersionEdit.
  edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(),
                   meta_.fd.GetFileSize(), meta_.smallest, meta_.largest,
                   ...);
}
// Builds a table file from the entries produced by `iter` and the range
// deletions in `range_del_iters` (excerpt; setup of `builder`,
// `range_del_agg`, `fs`, `fname`, etc. is elided).
Status BuildTable(
    const std::string& dbname, VersionSet* versions,
    const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
    const FileOptions& file_options, TableCache* table_cache,
    InternalIterator* iter,
    std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
        range_del_iters,...) {
  // Position iter at the first entry.
  iter->SeekToFirst();
  // Only produce a file when there is at least one entry or range deletion.
  if (iter->Valid() || !range_del_agg->IsEmpty()) {
    TableBuilder* builder;
    // Create the output file.
    IOStatus io_s = NewWritableFile(fs, fname, &file, file_options);
    // Wrap the current iter in a CompactionIterator used to walk the entries.
    CompactionIterator c_iter(
        iter, tboptions.internal_comparator.user_comparator(), &merge,
        kMaxSequenceNumber, &snapshots, ...);
    c_iter.SeekToFirst();
    // Write each key/value into the file through the table builder.
    for (; c_iter.Valid(); c_iter.Next()) {
      builder->Add(key, value);
    }
  }
}

触发memtable flush

根据上述框架,flush属于后台任务,因此触发memtable flush也就是调用SchedulePendingFlush函数将需要flush的CF添加到flush_queue_,再调用MaybeScheduleFlushOrCompaction调度后台flush线程即可。触发memtable flush的原因(FlushReason)包括:

// Reasons for which a memtable flush can be triggered.
enum class FlushReason : int {
  kOthers = 0x00,
  kGetLiveFiles = 0x01,
  kShutDown = 0x02,
  kExternalFileIngestion = 0x03,
  kManualCompaction = 0x04,
  kWriteBufferManager = 0x05, // memtable space across all CFs is full
  kWriteBufferFull = 0x06, // memtable size exceeds the CF's configured memtable size
  kTest = 0x07,
  kDeleteFiles = 0x08,
  kAutoCompaction = 0x09,
  kManualFlush = 0x0a,
  kErrorRecovery = 0xb,
  kErrorRecoveryRetryFlush = 0xc, 
  kWalFull = 0xd, // WAL is full, so the WAL is switched
};
  • kWriteBufferManager:所有CF memtable空间写满,需要触发flush。
  • kWalFull:切WAL,当WAL写满时需要切WAL,因为memtable的数据只能写入到一个WAL,因此会出现memtable切换,需要触发memtable的flush。
  • kWriteBufferFull:状态机触发,当memtable size大于CF配置的memtable size时,触发flush
  • 其他枚举值:流程(用户手动刷盘、故障恢复等)触发,部分流程需要将memtable刷盘。

所有memtable总空间写满(db_write_buffer_size)

// Sanitizes the DB options (excerpt). When no write_buffer_manager was
// supplied, one is created with db_write_buffer_size as its buffer size.
DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src,
                          bool read_only, Status* logger_creation_s) {
  ...
  if (!result.write_buffer_manager) {
    result.write_buffer_manager.reset(
        new WriteBufferManager(result.db_write_buffer_size));
  }
  ...
}
// Constructor excerpt: stores the buffer size and sets mutable_limit_ to 7/8
// of it. Handling of `cache` and `allow_stall` is not shown in this excerpt.
WriteBufferManager::WriteBufferManager(size_t _buffer_size,
                                       std::shared_ptr<Cache> cache,
                                       bool allow_stall)
    : buffer_size_(_buffer_size),
      mutable_limit_(buffer_size_ * 7 / 8) {}
// Returns true when the write buffer manager wants a memtable flush.
// Always returns false when the manager is not enabled().
bool ShouldFlush() const {
  if (enabled()) {
    // memory_usage(): total memory used by memtables.
    // mutable_memtable_memory_usage(): memory used by mutable memtables, i.e.
    // not yet scheduled to be freed (per upstream RocksDB; the accessors are
    // not shown in this excerpt).
    // Case 1: mutable memtable memory exceeds mutable_limit_ (set to 7/8 of
    // the buffer size in the constructor excerpt).
    // Case 2: total usage has reached buffer_size_ and at least half of
    // buffer_size_ is still held by mutable memtables.
    if (mutable_memtable_memory_usage() > mutable_limit_) {
      return true;
    }
    if (memory_usage() >= buffer_size_ &&
        mutable_memtable_memory_usage() >= buffer_size_ / 2) {
      // If the memory exceeds the buffer size, we trigger more aggressive
      // flush. But if already more than half memory is being flushed,
      // triggering more flush may not help. We will hold it instead.
      return true;
    }
  }
  return false;
}
// Pre-write checks (excerpt showing only the write-buffer-manager branch).
Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
                               bool* need_log_sync,
                               WriteContext* write_context) {
  ...
  // When the write buffer manager asks for a flush, take the DB mutex, wait
  // for pending writes, then switch memtables and schedule flushes.
  if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldFlush())) {
    InstrumentedMutexLock l(&mutex_);
    WaitForPendingWrites();
    status = HandleWriteBufferManagerFlush(write_context);
  }
}
// Handles a flush requested by the write buffer manager (excerpt): switches
// the memtable of each selected column family, then schedules flushes with
// reason kWriteBufferManager. Selection of `cfds` is elided.
Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) {
  ...
  // Switch memtables.
  for (const auto cfd : cfds) {
    ...
    status = SwitchMemtable(cfd, write_context);
    ...
  }
  // Trigger the flush.
  if (status.ok()) {
    ...
    for (const auto cfd : cfds) {
      cfd->imm()->FlushRequested();
      // Non-atomic flush: one flush request per column family.
      if (!immutable_db_options_.atomic_flush) {
        FlushRequest flush_req;
        GenerateFlushRequest({cfd}, &flush_req);
        SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager);
      }
    }
    // Atomic flush: a single flush request covering all selected column
    // families.
    if (immutable_db_options_.atomic_flush) {
      FlushRequest flush_req;
      GenerateFlushRequest(cfds, &flush_req);
      SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager);
    }
    MaybeScheduleFlushOrCompaction();
  }
  return status;
}

WAL空间写满(max_total_wal_size)

// Pre-write checks (excerpt showing only the WAL-size branch).
Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
                               bool* need_log_sync,
                               WriteContext* write_context) {
  ...
  // Switch the WAL when the total WAL size exceeds the configured maximum.
  // The check is skipped in single-column-family mode.
  if (UNLIKELY(status.ok() && !single_column_family_mode_ &&
               total_log_size_ > GetMaxTotalWalSize())) {
    status = SwitchWAL(write_context);
  }
}
// Called when the total WAL size exceeds the limit (excerpt). For every live
// column family whose oldest log to keep is not newer than oldest_alive_log,
// switches the memtable and schedules a flush with reason kWalFull.
// Computation of `oldest_alive_log` is elided.
Status DBImpl::SwitchWAL(WriteContext* write_context) {
  ...
  for (auto cfd : *versions_->GetColumnFamilySet()) {
    // Dropped column families are skipped.
    if (cfd->IsDropped()) {
      continue;
    }
    // Switch the memtable.
    if (cfd->OldestLogToKeep() <= oldest_alive_log) {
      status = SwitchMemtable(cfd, write_context);
      if (!status.ok()) {
        break;
      }
      cfd->imm()->FlushRequested();
      // Trigger the flush. The reason is kWalFull, since this flush is
      // caused by the WAL size limit rather than the write buffer manager.
      SchedulePendingFlush(cfd, FlushReason::kWalFull);
    }
  }
  MaybeScheduleFlushOrCompaction();
  return status;
}

CF memtable空间写满(write_buffer_size)

在memtable写和写完成callback中会调用MemTable::UpdateFlushState函数设置memtable的state,state的类型包括三种:

// Flush state of a memtable; transitions shown in this article are
// FLUSH_NOT_REQUESTED -> FLUSH_REQUESTED -> FLUSH_SCHEDULED.
enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED };

FLUSH_NOT_REQUESTED -> FLUSH_REQUESTED

// Decides whether this memtable has used enough memory that it should be
// flushed, comparing allocated memory against write_buffer_size with an
// allowance of kAllowOverAllocationRatio arena blocks.
bool MemTable::ShouldFlushNow() const {
  // write_buffer_size is the configured size of one memtable of the CF.
  // NOTE(review): kArenaBlockSize is defined outside this excerpt; the
  // original note said it "defaults to 0" — confirm, as upstream appears to
  // derive it from the arena_block_size option.
  // Returns true once the memtable's memory usage exceeds the threshold
  // computed below.
  size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed);
  const double kAllowOverAllocationRatio = 0.6;
  // Memory of the point-entry table, the range deletion table and the arena.
  auto allocated_memory = table_->ApproximateMemoryUsage() +
                          range_del_table_->ApproximateMemoryUsage() +
                          arena_.MemoryAllocatedBytes();

  // if we can still allocate one more block without exceeding the
  // over-allocation ratio, then we should not flush.
  if (allocated_memory + kArenaBlockSize <
      write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
    return false;
  }

  // if user keeps adding entries that exceeds write_buffer_size, we need to
  // flush earlier even though we still have much available memory left.
  if (allocated_memory >
      write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
    return true;
  }

  // Otherwise flush only when less than a quarter of an arena block remains
  // allocated-but-unused.
  return arena_.AllocatedAndUnused() < kArenaBlockSize / 4;
}
// Moves flush_state_ from FLUSH_NOT_REQUESTED to FLUSH_REQUESTED when
// ShouldFlushNow() reports that the memtable should be flushed.
void MemTable::UpdateFlushState() {
  auto state = flush_state_.load(std::memory_order_relaxed);
  if (state == FLUSH_NOT_REQUESTED && ShouldFlushNow()) {
    // The CAS only succeeds if the state is still FLUSH_NOT_REQUESTED; its
    // result is ignored.
    flush_state_.compare_exchange_strong(state, FLUSH_REQUESTED,
                                         std::memory_order_relaxed,
                                         std::memory_order_relaxed);
  }
}

FLUSH_REQUESTED -> FLUSH_SCHEDULED

在PUT流程PutCFImpl中会check memtable是否已经写满了,如果写满了,就需要将此memtable flush。

// Excerpt of the tail of the PUT path: after a successful memtable insert it
// advances the sequence number and checks whether the memtable is full.
// The code that performs the insert and computes ret_status is elided.
Status PutCFImpl(uint32_t column_family_id, const Slice& key,
                 const Slice& value, ValueType value_type,
                 const ProtectionInfoKVOS64* kv_prot_info) {
  ...
  if (UNLIKELY(ret_status.IsTryAgain())) {
    ...
  } else if (ret_status.ok()) {
    MaybeAdvanceSeq();
    // Schedule a flush if the memtable has requested one.
    CheckMemtableFull();
  }
  ...
  return ret_status;
}
// Atomically moves flush_state_ from FLUSH_REQUESTED to FLUSH_SCHEDULED.
// Returns true only for the caller whose compare-exchange succeeded.
bool MarkFlushScheduled() {
  auto before = FLUSH_REQUESTED;
  return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED,
                                              std::memory_order_relaxed,
                                              std::memory_order_relaxed);
}
// If a flush scheduler is present and the current column family's memtable
// should schedule a flush, marks it FLUSH_SCHEDULED and hands the column
// family to the flush scheduler.
void CheckMemtableFull() {
  if (flush_scheduler_ != nullptr) {
    auto* cfd = cf_mems_->current();
    assert(cfd != nullptr);
    // Update the memtable's flush state.
    if (cfd->mem()->ShouldScheduleFlush() &&
        cfd->mem()->MarkFlushScheduled()) {
      // MarkFlushScheduled only returns true if we are the one that
      // should take action, so no need to dedup further
      // Trigger the flush.
      flush_scheduler_->ScheduleWork(cfd);
    }
  }
}

进行flush

// Pushes `cfd` onto the front of the scheduler's linked list of column
// families awaiting a flush (excerpt), using a CAS loop on head_.
void FlushScheduler::ScheduleWork(ColumnFamilyData* cfd) {
  ...
  // Add the column family to the FlushScheduler.
  Node* node = new Node{cfd, head_.load(std::memory_order_relaxed)};
  while (!head_.compare_exchange_strong(
      node->next, node, std::memory_order_relaxed, std::memory_order_relaxed)) {
    // failing CAS updates the first param, so we are already set for
    // retry.  TakeNextColumnFamily won't happen until after another
    // inter-thread synchronization, so we don't even need release
    // semantics for this CAS
  }
  ...
}
// Drains the column families queued in flush_scheduler_, switches their
// memtables and schedules flushes with reason kWriteBufferFull (excerpt).
// NOTE(review): `cfd` and `flush_req` are not declared in this excerpt;
// presumably the loop over `cfds` and the flush request construction were
// elided — confirm against the upstream source.
Status DBImpl::ScheduleFlushes(WriteContext* context) {
  // Take the column families from flush_scheduler_.
  while ((tmp_cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
    cfds.push_back(tmp_cfd);
  }
  // Switch the memtable.
  status = SwitchMemtable(cfd, context);
  // Trigger the flush.
  SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull);
  MaybeScheduleFlushOrCompaction();
  return status;
}

流程触发flush

// Flushes the memtable of `cfd` for the given `flush_reason` (excerpt):
// switches the memtable, queues the column family for flush and lets the
// background scheduler pick it up. Code before and after the inner scope is
// elided.
Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
                             const FlushOptions& flush_options,
                             FlushReason flush_reason, bool writes_stopped) {
  {
    ...
    // Switch the memtable.
    s = SwitchMemtable(cfd, &context);
    // Trigger the flush.
    SchedulePendingFlush(cfd, flush_reason);
    MaybeScheduleFlushOrCompaction();
  }
  ...
  return s;
}

参考文献

https://github.com/EighteenZi/rocksdb_wiki/blob/master/MemTable.md
https://developer.aliyun.com/article/643754

评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值