Memtable的关键配置项
- write_buffer_size:某个CF中一个memtable的size
- db_write_buffer_size:所有CF所有memtable的总size
- max_write_buffer_number:某个CF最大memtable数量
- min_write_buffer_number_to_merge:某个CF触发flush所需的最少memtable数量(未开始flush的memtable达到该数量后才会被flush)
- max_total_wal_size:WAL的最大size。
Memtable flush框架
需要flush的CF加入队列
DBImpl中包含了flush_queue_,当某个CF满足flush条件时,将CF添加到队列中。
// Excerpt of DBImpl showing only the flush queue member.
class DBImpl {
...
// Queue of entries waiting for a background flush.
// NOTE(review): a later excerpt in this document calls
// flush_queue_.push_back(flush_req) with a FlushRequest, which does not match
// the ColumnFamilyData* element type shown here — the excerpts may come from
// different RocksDB versions; confirm.
std::deque<ColumnFamilyData*> flush_queue_;
...
};
判断某个CF是否需要flush的条件如下:已经请求过flush(flush_requested_)且未开始flush的memtable数量不小于1,或者未开始flush的memtable数量不小于配置的min_write_buffer_number_to_merge。
// Returns true when this memtable list has flush work pending: either a flush
// was requested and at least one memtable has not started flushing, or the
// number of not-yet-started memtables has reached
// min_write_buffer_number_to_merge_. Returns false otherwise.
bool MemTableList::IsFlushPending() const {
if ((flush_requested_ && num_flush_not_started_ >= 1) ||
(num_flush_not_started_ >= min_write_buffer_number_to_merge_)) {
// imm_flush_needed is expected to already be set when a flush is pending.
assert(imm_flush_needed.load(std::memory_order_relaxed));
return true;
}
return false;
}
void DBImpl::
(const FlushRequest& flush_req,
FlushReason flush_reason) {
...
if (!immutable_db_options_.atomic_flush) {
...
if (!cfd->queued_for_flush() && cfd->imm()->IsFlushPending()) {
cfd->Ref();
cfd->set_queued_for_flush(true);
cfd->SetFlushReason(flush_reason);
++unscheduled_flushes_;
flush_queue_.push_back(flush_req);
}
} else {
...
}
}
后台线程从flush_queue_中获取CF,将CF中的memtable刷盘
后台flush线程BGWorkFlush,会调用BackgroundFlush进行flush。
// Schedules background flush jobs for as long as the flush pool is not empty,
// there are unscheduled flushes, and the number of scheduled flush jobs is
// below bg_job_limits.max_flushes. The compaction part is elided.
void DBImpl::MaybeScheduleFlushOrCompaction() {
...
while (!is_flush_pool_empty && unscheduled_flushes_ > 0 &&
bg_flush_scheduled_ < bg_job_limits.max_flushes) {
unscheduled_flushes_--;
bg_flush_scheduled_++;
// Start the background flush work BGWorkFlush at HIGH priority.
env_->Schedule(&DBImpl::BGWorkFlush, this, Env::Priority::HIGH, this);
}
...
}
// Takes one column family from flush_queue_ and flushes its memtable(s) to
// disk via FlushMemTableToOutputFile. Returns the resulting status.
Status DBImpl::BackgroundFlush(bool* made_progress, JobContext* job_context,
LogBuffer* log_buffer) {
...
// Pick one column family that qualifies for flushing.
while (!flush_queue_.empty()) {
auto first_cfd = PopFirstFromFlushQueue();
// Check IsFlushPending to decide whether a flush is really needed (elided).
...
cfd = first_cfd;
break;
}
if (cfd != nullptr) {
...
// Write the memtable(s) to disk.
status = FlushMemTableToOutputFile(cfd, mutable_cf_options, made_progress,
job_context, log_buffer);
...
}
return status;
}
Memtable刷盘,memtable -> L0 sstable
// Flushes the memtables of `cfd` to an output file: builds a FlushJob, lets it
// pick the memtables, runs it, then installs a new SuperVersion.
Status DBImpl::FlushMemTableToOutputFile(
ColumnFamilyData* cfd, const MutableCFOptions& mutable_cf_options,
bool* made_progress, JobContext* job_context,
SuperVersionContext* superversion_context,
...) {
// Create the flush job.
FlushJob flush_job(
dbname_, cfd, immutable_db_options_, mutable_cf_options, max_memtable_id, ...);
// Pick the memtables to flush from the column family.
flush_job.PickMemTable();
// Run the memtable flush.
s = flush_job.Run(&logs_with_prep_tracker_, &file_meta,
&switched_to_mempurge);
// Install the new SuperVersion so it becomes visible.
InstallSuperVersionAndScheduleWork(cfd, superversion_context,
mutable_cf_options);
}
// Runs the flush job; in this excerpt it only delegates to WriteLevel0Table().
Status FlushJob::Run(LogsWithPrepTracker* prep_tracker, FileMetaData* file_meta, bool* switched_to_mempurge) {
// Convert the memtable(s) into an L0 sstable.
s = WriteLevel0Table();
}
// Builds one level-0 table from all picked memtables (mems_) and records the
// new file in the version edit.
Status FlushJob::WriteLevel0Table() {
// Create iterators for every memtable.
for (MemTable* m : mems_) {
// Point-data iterator of the memtable.
memtables.push_back(m->NewIterator(ro, &arena));
// Range-deletion (range tombstone) iterator; skipped when null.
auto* range_del_iter =
m->NewRangeTombstoneIterator(ro, kMaxSequenceNumber);
if (range_del_iter != nullptr) {
range_del_iters.emplace_back(range_del_iter);
}
}
// Combine the per-memtable iterators created above into a merging iterator.
ScopedArenaIterator iter(
NewMergingIterator(&cfd_->internal_comparator(), memtables.data(),
static_cast<int>(memtables.size()), &arena));
// Build the L0 sstable from the merging iterator and the range-del iterators.
s = BuildTable(dbname_, versions_, db_options_, tboptions, file_options_,
cfd_->table_cache(), iter.get(), std::move(range_del_iters), ...);
// Once the table is built, record the new file in the VersionEdit at level 0.
edit_->AddFile(0 /* level */, meta_.fd.GetNumber(), meta_.fd.GetPathId(),
meta_.fd.GetFileSize(), meta_.smallest, meta_.largest,
...);
}
// Writes the entries produced by `iter` (plus range deletions) into a new
// table file. Nothing is written when the iterator is invalid after
// SeekToFirst() and the range-del aggregator is empty.
Status BuildTable(
const std::string& dbname, VersionSet* versions,
const ImmutableDBOptions& db_options, const TableBuilderOptions& tboptions,
const FileOptions& file_options, TableCache* table_cache,
InternalIterator* iter,
std::vector<std::unique_ptr<FragmentedRangeTombstoneIterator>>
range_del_iters,...) {
// Position iter at the first entry.
iter->SeekToFirst();
if (iter->Valid() || !range_del_agg->IsEmpty()) {
TableBuilder* builder;
// Create the output file.
IOStatus io_s = NewWritableFile(fs, fname, &file, file_options);
// Wrap the input iterator in a CompactionIterator for traversal.
CompactionIterator c_iter(
iter, tboptions.internal_comparator.user_comparator(), &merge,
kMaxSequenceNumber, &snapshots, ...);
c_iter.SeekToFirst();
// Add every key/value to the table builder, i.e. write it to the file.
for (; c_iter.Valid(); c_iter.Next()) {
builder->Add(key, value);
}
}
}
触发memtable flush
根据上述框架,flush属于后台任务,因此触发memtable flush也就是调用SchedulePendingFlush函数,将需要flush的CF添加到flush_queue_,再调用MaybeScheduleFlushOrCompaction调度后台flush任务即可。触发memtable flush的原因由FlushReason枚举给出:
// Reasons for which a memtable flush can be triggered.
enum class FlushReason : int {
kOthers = 0x00,
kGetLiveFiles = 0x01,
kShutDown = 0x02,
kExternalFileIngestion = 0x03,
kManualCompaction = 0x04,
kWriteBufferManager = 0x05, // total memtable space across all CFs is full
kWriteBufferFull = 0x06, // memtable size exceeds the CF's memtable size
kTest = 0x07,
kDeleteFiles = 0x08,
kAutoCompaction = 0x09,
kManualFlush = 0x0a,
kErrorRecovery = 0xb,
kErrorRecoveryRetryFlush = 0xc,
kWalFull = 0xd, // WAL is full, so the WAL is switched
};
- kWriteBufferManager:所有CF memtable空间写满,需要触发flush。
- kWalFull:切WAL,当WAL写满时需要切WAL,因为memtable的数据只能写入到一个WAL,因此会出现memtable切换,需要触发memtable的flush。
- kWriteBufferFull:状态机触发,当memtable size大于CF配置的memtable size时,触发flush
- 其他枚举值:流程(用户手动刷盘、故障恢复等)触发,部分流程需要将memtable刷盘。
所有memtable总空间写满(db_write_buffer_size)
// Excerpt of SanitizeOptions: when no write_buffer_manager is configured, one
// is created with db_write_buffer_size as its buffer size.
DBOptions SanitizeOptions(const std::string& dbname, const DBOptions& src,
bool read_only, Status* logger_creation_s) {
...
if (!result.write_buffer_manager) {
result.write_buffer_manager.reset(
new WriteBufferManager(result.db_write_buffer_size));
}
...
}
// Stores the total buffer size and derives mutable_limit_ as 7/8 of it.
// `cache` and `allow_stall` are not used in this excerpt.
WriteBufferManager::WriteBufferManager(size_t _buffer_size,
std::shared_ptr<Cache> cache,
bool allow_stall)
: buffer_size_(_buffer_size),
mutable_limit_(buffer_size_ * 7 / 8) {}
// Returns true when the write buffer manager wants memtables to be flushed.
// Always returns false when the manager is not enabled().
bool ShouldFlush() const {
if (enabled()) {
// memory_usage(): total memory used by memtables.
// mutable_memtable_memory_usage(): presumably the memory held by mutable
// memtables, i.e. memory not yet being flushed (judging by the name and the
// upstream comment below) — confirm.
// Case 1: mutable memtable memory exceeds mutable_limit_ (7/8 of
// buffer_size_ per the constructor excerpt).
// Case 2: total usage has reached buffer_size_ and at least half of
// buffer_size_ is still held by mutable memtables.
if (mutable_memtable_memory_usage() > mutable_limit_) {
return true;
}
if (memory_usage() >= buffer_size_ &&
mutable_memtable_memory_usage() >= buffer_size_ / 2) {
// If the memory exceeds the buffer size, we trigger more aggressive
// flush. But if already more than half memory is being flushed,
// triggering more flush may not help. We will hold it instead.
return true;
}
}
return false;
}
// Excerpt of PreprocessWrite (write buffer manager part): when the write
// buffer manager reports ShouldFlush(), take the DB mutex, wait for pending
// writes, and run HandleWriteBufferManagerFlush.
Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
bool* need_log_sync,
WriteContext* write_context) {
...
if (UNLIKELY(status.ok() && write_buffer_manager_->ShouldFlush())) {
InstrumentedMutexLock l(&mutex_);
WaitForPendingWrites();
status = HandleWriteBufferManagerFlush(write_context);
}
}
// Switches the memtable of each selected column family (`cfds`, chosen in the
// elided code) and, if that succeeds, schedules flushes with reason
// kWriteBufferManager.
Status DBImpl::HandleWriteBufferManagerFlush(WriteContext* write_context) {
...
// Switch memtables.
for (const auto cfd : cfds) {
...
status = SwitchMemtable(cfd, write_context);
...
}
// Trigger the flush.
if (status.ok()) {
...
for (const auto cfd : cfds) {
cfd->imm()->FlushRequested();
// Without atomic flush, each column family gets its own flush request.
if (!immutable_db_options_.atomic_flush) {
FlushRequest flush_req;
GenerateFlushRequest({cfd}, &flush_req);
SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager);
}
}
// With atomic flush, one request covers all selected column families.
if (immutable_db_options_.atomic_flush) {
FlushRequest flush_req;
GenerateFlushRequest(cfds, &flush_req);
SchedulePendingFlush(flush_req, FlushReason::kWriteBufferManager);
}
MaybeScheduleFlushOrCompaction();
}
return status;
}
WAL空间写满(max_total_wal_size)
// Excerpt of PreprocessWrite (WAL part): switches the WAL when the total log
// size exceeds the configured maximum.
Status DBImpl::PreprocessWrite(const WriteOptions& write_options,
bool* need_log_sync,
WriteContext* write_context) {
...
// Switch the WAL when its total size exceeds the configured limit
// (skipped in single-column-family mode).
if (UNLIKELY(status.ok() && !single_column_family_mode_ &&
total_log_size_ > GetMaxTotalWalSize())) {
status = SwitchWAL(write_context);
}
}
// For every non-dropped column family whose OldestLogToKeep() is not newer
// than oldest_alive_log, switches its memtable and schedules a flush. Stops
// at the first SwitchMemtable failure and returns that status.
Status DBImpl::SwitchWAL(WriteContext* write_context) {
...
for (auto cfd : *versions_->GetColumnFamilySet()) {
if (cfd->IsDropped()) {
continue;
}
// Switch the memtable.
if (cfd->OldestLogToKeep() <= oldest_alive_log) {
status = SwitchMemtable(cfd, write_context);
if (!status.ok()) {
break;
}
cfd->imm()->FlushRequested();
// Trigger the flush.
// NOTE(review): this excerpt passes kWriteBufferManager, while the
// FlushReason enum in this document has a dedicated kWalFull value for
// WAL-full flushes — the excerpt may come from an older RocksDB version;
// confirm.
SchedulePendingFlush(cfd, FlushReason::kWriteBufferManager);
}
}
MaybeScheduleFlushOrCompaction();
return status;
}
CF memtable空间写满(write_buffer_size)
在memtable写和写完成callback中会调用MemTable::UpdateFlushState函数设置memtable的state,state的类型包括三种:
// Flush state of a memtable. The excerpts in this document move it from
// FLUSH_NOT_REQUESTED to FLUSH_REQUESTED and then to FLUSH_SCHEDULED.
enum FlushStateEnum { FLUSH_NOT_REQUESTED, FLUSH_REQUESTED, FLUSH_SCHEDULED };
FLUSH_NOT_REQUESTED -> FLUSH_REQUESTED
// Decides whether this memtable has used enough memory, relative to
// write_buffer_size, that it should be flushed.
bool MemTable::ShouldFlushNow() const {
// write_buffer_size is the configured memtable size for the column family.
// NOTE(review): the original note here said kArenaBlockSize defaults to 0;
// this excerpt does not establish that — confirm.
// Returns true once the memtable's memory usage goes beyond
// write_buffer_size (plus the allowed over-allocation).
size_t write_buffer_size = write_buffer_size_.load(std::memory_order_relaxed);
const double kAllowOverAllocationRatio = 0.6;
auto allocated_memory = table_->ApproximateMemoryUsage() +
range_del_table_->ApproximateMemoryUsage() +
arena_.MemoryAllocatedBytes();
// if we can still allocate one more block without exceeding the
// over-allocation ratio, then we should not flush.
if (allocated_memory + kArenaBlockSize <
write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
return false;
}
// if user keeps adding entries that exceeds write_buffer_size, we need to
// flush earlier even though we still have much available memory left.
if (allocated_memory >
write_buffer_size + kArenaBlockSize * kAllowOverAllocationRatio) {
return true;
}
// In between the two thresholds: flush only when less than a quarter of an
// arena block remains allocated but unused.
return arena_.AllocatedAndUnused() < kArenaBlockSize / 4;
}
// Moves flush_state_ from FLUSH_NOT_REQUESTED to FLUSH_REQUESTED when
// ShouldFlushNow() reports that the memtable should be flushed.
void MemTable::UpdateFlushState() {
auto state = flush_state_.load(std::memory_order_relaxed);
if (state == FLUSH_NOT_REQUESTED && ShouldFlushNow()) {
// The CAS only succeeds if the state is still FLUSH_NOT_REQUESTED; its
// result is ignored.
flush_state_.compare_exchange_strong(state, FLUSH_REQUESTED,
std::memory_order_relaxed,
std::memory_order_relaxed);
}
}
FLUSH_REQUESTED -> FLUSH_SCHEDULED
在PUT流程PutCFImpl中会check memtable是否已经写满了,如果写满了,就需要将此memtable flush。
// Excerpt of PutCFImpl on the PUT path: after a successful insert (ret_status
// is ok) it advances the sequence number and checks whether the memtable is
// full. The code that produces ret_status and the rest of the function are
// elided.
Status PutCFImpl(uint32_t column_family_id, const Slice& key,
const Slice& value, ValueType value_type,
const ProtectionInfoKVOS64* kv_prot_info) {
...
if (UNLIKELY(ret_status.IsTryAgain())) {
...
} else if (ret_status.ok()) {
MaybeAdvanceSeq();
// Check whether the memtable is full and needs a flush scheduled.
CheckMemtableFull();
}
...
}
// Atomically moves flush_state_ from FLUSH_REQUESTED to FLUSH_SCHEDULED.
// Returns true only for the caller whose compare-exchange succeeds.
bool MarkFlushScheduled() {
auto before = FLUSH_REQUESTED;
return flush_state_.compare_exchange_strong(before, FLUSH_SCHEDULED,
std::memory_order_relaxed,
std::memory_order_relaxed);
}
// If a flush scheduler is present and the current column family's memtable
// should be scheduled for flush, marks it FLUSH_SCHEDULED and hands the column
// family to the flush scheduler.
void CheckMemtableFull() {
if (flush_scheduler_ != nullptr) {
auto* cfd = cf_mems_->current();
assert(cfd != nullptr);
// Update the memtable's flush state.
if (cfd->mem()->ShouldScheduleFlush() &&
cfd->mem()->MarkFlushScheduled()) {
// MarkFlushScheduled only returns true if we are the one that
// should take action, so no need to dedup further
// Trigger the flush.
flush_scheduler_->ScheduleWork(cfd);
}
}
}
进行flush
// Adds `cfd` to the FlushScheduler by pushing a new node onto the head of its
// linked list with a compare-and-swap retry loop.
void FlushScheduler::ScheduleWork(ColumnFamilyData* cfd) {
...
// Add the column family to the FlushScheduler.
Node* node = new Node{cfd, head_.load(std::memory_order_relaxed)};
while (!head_.compare_exchange_strong(
node->next, node, std::memory_order_relaxed, std::memory_order_relaxed)) {
// failing CAS updates the first param, so we are already set for
// retry. TakeNextColumnFamily won't happen until after another
// inter-thread synchronization, so we don't even need release
// semantics for this CAS
}
...
}
// Drains the column families queued in flush_scheduler_, switches their
// memtables, and schedules flushes with reason kWriteBufferFull. The loop over
// the collected `cfds` and the construction of flush_req are elided.
Status DBImpl::ScheduleFlushes(WriteContext* context) {
// Take the column families from flush_scheduler_.
while ((tmp_cfd = flush_scheduler_.TakeNextColumnFamily()) != nullptr) {
cfds.push_back(tmp_cfd);
}
// Switch the memtable.
status = SwitchMemtable(cfd, context);
// Trigger the flush.
SchedulePendingFlush(flush_req, FlushReason::kWriteBufferFull);
MaybeScheduleFlushOrCompaction();
return status;
}
流程触发flush
// Flush entry point used by other workflows: switches the memtable of `cfd`
// and schedules a flush with the caller-supplied flush_reason.
Status DBImpl::FlushMemTable(ColumnFamilyData* cfd,
const FlushOptions& flush_options,
FlushReason flush_reason, bool writes_stopped) {
{
...
// Switch the memtable.
s = SwitchMemtable(cfd, &context);
// Trigger the flush.
SchedulePendingFlush(cfd, flush_reason);
MaybeScheduleFlushOrCompaction();
}
...
return s;
}
参考文献
https://github.com/EighteenZi/rocksdb_wiki/blob/master/MemTable.md
https://developer.aliyun.com/article/643754