文章目录
引言
RocksDB的LsmTree有哪些关键数据?
RocksDB采用了LsmTree用于存储kv数据,LsmTree包含了如下关键数据:
- memtable/immutable:存储在内存中,随着写入size和数量的增加会转换为sstable,存储在磁盘上。
- sstable:存储在磁盘上,存储kv数据,只读。
- WAL:存储在高性能介质,与memtable一一对应,存储与memtable中相同的kv数据,用于故障后重建memtable。
Manifest:描述LsmTree结构
Manifest就是用来描述LsmTree结构信息,其中包括:
- VersionEdit:LsmTree结构变化情况。
- WAL:已持久化到磁盘的WAL文件seq。
根据manifest描述的文件信息即可重建LsmTree结构。
- memtable/immutable:通过读取WAL重建memtable。
- sstable:根据VersionEdit记录重建LsmTree的sstable结构。
Manifest dump
为了便于后续理解,先看一下ldb工具dump出来的manifest。
VersionEdit {
Comparator: leveldb.BytewiseComparator
ColumnFamily: 0
}
VersionEdit {
LogNumber: 5024
AddFile: 6 5049 68211256 'Bblocks' seq:0, type:1 .. 'Mʧ.vdb_f4949979.file_head' seq:0, type:1 oldest_ancester_time:0 file_creation_time:0 file_checksum: file_checksum_func_name: Unknown
AddFile: 6 5050 68209671 'Mʧ.vdb_f4949980.file_head' seq:0, type:1 .. 'M�w.vdb_f46113.file_head' seq:0, type:1 oldest_ancester_time:0 file_creation_time:0 file_checksum: file_checksum_func_name: Unknown
AddFile: 6 5051 68209802 'M�w.vdb_f46114.file_head' seq:0, type:1 .. 'M�;.vdb_f2537719.file_head' seq:0, type:1 oldest_ancester_time:0 file_creation_time:0 file_checksum: file_checksum_func_name: Unknown
AddFile: 6 5052 68209026 'M�;.vdb_f2537720.file_head' seq:0, type:1 .. 'M��.vdb_f5027643.file_head' seq:0, type:1 oldest_ancester_time:0 file_creation_time:0 file_checksum: file_checksum_func_name: Unknown
...
}
VersionEdit {
LogNumber: 5025
PrevLogNumber: 0
NextFileNumber: 5193
LastSeq: 83912935
AddFile: 0 5191 66193 'LeB' seq:83912828, type:7 .. 'T' seq:83912771, type:2 oldest_ancester_time:0 file_creation_time:0 file_checksum: file_checksum_func_name: Unknown
ColumnFamily: 0
}
结合DumpManifestHandler::CheckIterationResult函数的代码,可以从上面的dump中得出如下信息:
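- 第一个包含AddFile的VersionEdit(LogNumber: 5024,即dump中的第二个VersionEdit;dump中最前面的VersionEdit只记录了Comparator和ColumnFamily)记录了多条AddFile记录,即经过整理后,某个version中全部的sstable。
- 随后的VersionEdit(LogNumber: 5025)记录了一条L0的AddFile记录,表示有一个memtable序列化到了L0。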
- 累加两次记录,就得到了LsmTree的结构。
想了解VersionEdit内容,可以直接看EncodeTo代码。
// Excerpt placeholder: the body of EncodeTo is elided (`...`) in this article.
// The surrounding text points to this function as the place to read which
// fields a VersionEdit carries.
bool VersionEdit::EncodeTo(std::string* dst) const {
...
}
Manifest代码详解
关键元数据
Version/VersionEdit/VersionSet
- Version表示某一时刻LsmTree的状态(包含所有sstable)。
- VersionSet表示多个时刻Version的集合。用来处理读sstable和compaction删除sstable的冲突问题。
- VersionEdit表示Version的某次修改,比如由于compaction,Version从v1变成了v2,那么VersionEdit1就记录了v1到v2的变化(L0 add/delete sstable,L1 add/delete sstable等),这样我们有v1和VersionEdit1、VersionEdit2…,就可以知道当前Version的状态。
- Manifest保存了所有的VersionEdit,因此在故障时候,可以通过一次遍历合并manifest中的记录恢复LsmTree的状态。
函数详解
写manifest:VersionSet::LogAndApply
// Manifest write entry point (excerpt; `...` marks elided code).
// Visible flow: wrap each column family's edit list in a ManifestWriter,
// enqueue it on manifest_writers_, wait until this batch is at the front of
// the queue (or has been marked done), then call ProcessManifestWrites.
Status VersionSet::LogAndApply(
...
// Create one writer per column family and submit it to the VersionSet's
// writer queue (manifest_writers_ stores pointers into the local deque).
std::deque<ManifestWriter> writers;
for (int i = 0; i < num_cfds; ++i) {
// Fall back to a no-op callback when no manifest write callbacks were given.
const auto wcb =
manifest_wcbs.empty() ? [](const Status&) {} : manifest_wcbs[i];
writers.emplace_back(mu, column_family_datas[i],
*mutable_cf_options_list[i], edit_lists[i], wcb);
manifest_writers_.push_back(&writers[i]);
}
// Wait until first_writer is at the front of the queue, i.e. all writers
// queued before it have finished, or until first_writer is marked done.
while (!first_writer.done && &first_writer != manifest_writers_.front()) {
first_writer.cv.Wait();
}
// Marked done while waiting (presumably handled as part of another writer's
// batch -- the code that sets `done` is not shown here): return its status.
if (first_writer.done) {
return first_writer.status;
}
// This writer is now at the front of the queue: perform the manifest write.
// ProcessManifestWrites signals the next queued writer before it returns.
return ProcessManifestWrites(writers, mu, db_directory, new_descriptor_log, new_cf_options);
}
// Applies the queued version edits and persists them to the manifest
// (excerpt; `...` marks elided code, so braces below do not balance).
// Visible steps: collect edits into batch_edits, decide whether a new manifest
// file is needed, sync the manifest, update CURRENT when a new manifest was
// created, remember the old manifest as obsolete, and signal the next writer.
Status VersionSet::ProcessManifestWrites(
std::deque<ManifestWriter>& writers, InstrumentedMutex* mu,
FSDirectory* db_directory, bool new_descriptor_log,
const ColumnFamilyOptions* new_cf_options) {
...
// Collect the edits to be written into batch_edits.
if (first_writer.edit_list.front()->IsColumnFamilyManipulation()) {
// A column-family manipulation edit is handled on its own via
// LogAndApplyCFHelper.
LogAndApplyCFHelper(first_writer.edit_list.front(), &max_last_sequence);
batch_edits.push_back(first_writer.edit_list.front());
} else {
// Otherwise walk the writer queue and batch the edits of each writer.
while (it != manifest_writers_.cend()) {
...
Version* version = nullptr;
VersionBuilder* builder = nullptr;
...
for (const auto& e : last_writer->edit_list) {
...
// Apply the edit through the builder, then queue it for the manifest.
Status s = LogAndApplyHelper(last_writer->cfd, builder, e,
&max_last_sequence, mu);
...
batch_edits.push_back(e);
}
}
// Save each builder's accumulated result into the corresponding new
// version's storage info.
for (int i = 0; i < static_cast<int>(versions.size()); ++i) {
...
Status s = builder->SaveTo(versions[i]->storage_info());
...
}
}
// Decide whether a new manifest file must be created:
//  - no descriptor log exists yet (per the article: the first LogAndApply), or
//  - the manifest size exceeds the configured max_manifest_file_size.
// Otherwise keep writing to the current manifest file number.
if (!descriptor_log_ ||
manifest_file_size_ > db_options_->max_manifest_file_size) {
new_descriptor_log = true;
} else {
pending_manifest_file_number_ = manifest_file_number_;
}
// If a new manifest is needed, allocate its file number and prepare it.
if (new_descriptor_log) {
pending_manifest_file_number_ = NewFileNumber();
batch_edits.back()->SetNextFile(next_file_number_.load());
// if we are writing out new snapshot make sure to persist max column
// family.
if (column_family_set_->GetMaxColumnFamily() > 0) {
first_writer.edit_list.front()->SetMaxColumnFamily(
column_family_set_->GetMaxColumnFamily());
}
...
}
...
if (s.ok()) {
// Sync the manifest file to disk.
io_s = SyncManifest(db_options_, descriptor_log_->file());
}
// Propagate an I/O failure into the returned status and log it.
if (!io_s.ok()) {
s = io_s;
ROCKS_LOG_ERROR(db_options_->info_log, "MANIFEST write %s\n",
s.ToString().c_str());
}
}
// Point the CURRENT file at the new manifest (only when one was created).
if (s.ok() && new_descriptor_log) {
io_s = SetCurrentFile(fs_.get(), dbname_, pending_manifest_file_number_,
db_directory);
}
}
// Record the previous manifest file name in obsolete_manifests_. Nothing is
// deleted here; presumably the file is removed later by obsolete-file cleanup.
if (s.ok() && new_descriptor_log) {
obsolete_manifests_.emplace_back(
DescriptorFileName("", manifest_file_number_));
}
// Wake the writer now at the front of the queue, if any.
if (!manifest_writers_.empty()) {
manifest_writers_.front()->cv.Signal();
}
return s;
}
DB recover流程
流程
流程图转自https://blog.csdn.net/xuhaitao23/article/details/121747616
函数详解
打开DB:DBImpl::Open
// Opens a DB (excerpt; `...` marks elided code).
// Visible flow: validate options, construct the in-memory DBImpl, create the
// WAL directory, run Recover, create a fresh WAL, persist the recovery edits
// via LogAndApplyForRecovery, install the super version, and delete obsolete
// files.
Status DBImpl::Open(const DBOptions& db_options, const std::string& dbname,
const std::vector<ColumnFamilyDescriptor>& column_families,
std::vector<ColumnFamilyHandle*>* handles, DB** dbptr,
const bool seq_per_batch, const bool batch_per_txn) {
// Validate the options.
Status s = ValidateOptionsByTable(db_options, column_families);
...
// Create the in-memory DB instance.
DBImpl* impl = new DBImpl(db_options, dbname, seq_per_batch, batch_per_txn);
...
// Create the directories. Note that they come from the options:
//  - WAL files go to wal_dir
//  - other files (sst) go to db_paths (that part is elided here)
s = impl->env_->CreateDirIfMissing(impl->immutable_db_options_.GetWalDir());
...
// Recover the DB. Per the Recover signature shown in this article, the three
// `false` arguments are read_only, error_if_wal_file_exists and
// error_if_data_exists_in_wals.
s = impl->Recover(column_families, false, false, false, &recovered_seq,
&recovery_ctx);
if (s.ok()) {
// Allocate a new file number from the recovered version set.
uint64_t new_log_number = impl->versions_->NewFileNumber();
...
// Create a new WAL. (Per the original note, an empty batch is then written
// and flushed; that code is elided here.)
s = impl->CreateWAL(new_log_number, 0 /*recycle_log_number*/,
preallocate_block_size, &new_log);
...
// Install the new version: persist the version edits gathered during
// recovery to the manifest.
s = impl->LogAndApplyForRecovery(recovery_ctx);
// Initialize the column families.
for (auto cf : column_families) {
...
}
}
// Once memtables and sst files are recovered, install the super version.
impl->InstallSuperVersionAndScheduleWork(
cfd, &sv_context, *cfd->GetLatestMutableCFOptions());
...
// Delete obsolete files.
impl->DeleteObsoleteFiles();
// Trigger background work such as compaction (elided).
...
}
恢复DB:DBImpl::Recover
// Recovers DB state (excerpt; `...` marks elided code, so braces below do not
// balance). Visible flow: locate CURRENT, recover versions from the manifest
// through VersionSet::Recover, then list, verify and replay the WAL files via
// RecoverLogFiles.
Status DBImpl::Recover(
const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
bool error_if_wal_file_exists, bool error_if_data_exists_in_wals,
uint64_t* recovered_seq, RecoveryContext* recovery_ctx) {
...
// Look for the CURRENT file in the DB directory.
//  - Without best_efforts_recovery: CURRENT must exist, otherwise the DB is
//    considered absent. The rest of this article covers this case.
//  - With best_efforts_recovery: finding any manifest is enough; this mode
//    tries to recover as much of the DB as possible.
if (!immutable_db_options_.best_efforts_recovery) {
s = env_->FileExists(current_fname);
} else {
...
}
...
// Check that the file is usable (elided).
...
}
// Recover the versions from the manifest.
if (!immutable_db_options_.best_efforts_recovery) {
s = versions_->Recover(column_families, read_only, &db_id_);
} else {
...
}
...
// Recover the WALs.
std::vector<std::string> files_in_wal_dir;
if (s.ok()) {
// List the files in the WAL directory.
if (!immutable_db_options_.best_efforts_recovery) {
s = env_->GetChildren(wal_dir, &files_in_wal_dir);
}
// Verify the WAL files on disk against the WALs recorded in the manifest.
if (immutable_db_options_.track_and_verify_wals_in_manifest) {
if (!immutable_db_options_.best_efforts_recovery) {
s = versions_->GetWalSet().CheckWals(env_, wal_files);
}
} else if (!versions_->GetWalSet().GetWals().empty()) {
...
}
// Replay the WAL data into the memtables.
if (!wal_files.empty()) {
...
s = RecoverLogFiles(wals, &next_sequence, read_only, &corrupted_wal_found,
recovery_ctx);
...
}
}
...
return s;
}
恢复version:VersionSet::Recover
// Rebuilds the version state from the manifest (excerpt; `...` marks elided
// code). Visible flow: resolve the current manifest path, open a sequential
// reader on it, replay every record through a VersionEditHandler, then read
// the log number, manifest size and DB id from the handler/reader on success.
Status VersionSet::Recover(
const std::vector<ColumnFamilyDescriptor>& column_families, bool read_only,
std::string* db_id) {
// Locate the manifest file. The CURRENT file is read first; it records the
// name of the current manifest, e.g. "MANIFEST-000011".
Status s = GetCurrentManifestPath(dbname_, fs_.get(), &manifest_path,
&manifest_file_number_);
...
// Create a SequentialFileReader on that manifest for reading it.
std::unique_ptr<SequentialFileReader> manifest_file_reader;
{
std::unique_ptr<FSSequentialFile> manifest_file;
s = fs_->NewSequentialFile(manifest_path,
fs_->OptimizeForManifestRead(file_options_),
&manifest_file, nullptr);
...
manifest_file_reader.reset(new SequentialFileReader(
std::move(manifest_file), manifest_path,
db_options_->log_readahead_size, io_tracer_, db_options_->listeners));
}
// Read and parse the manifest, and restore the versions from it.
...
{
// Replay the manifest record by record.
VersionEditHandler handler(read_only, column_families,
const_cast<VersionSet*>(this),
/*track_missing_files=*/false,
/*no_error_if_files_missing=*/false, io_tracer_);
// This call does the actual manifest processing.
handler.Iterate(reader, &log_read_status);
s = handler.status();
if (s.ok()) {
// Pick up the results of the replay: log number, bytes read, and DB id.
log_number = handler.GetVersionEditParams().log_number_;
current_manifest_file_size = reader.GetReadOffset();
handler.GetDbId(db_id);
}
}
...
return s;
}
// Replays manifest records from `reader` (excerpt; `...` marks elided code and
// the declarations of s, record, scratch, edit and cfd are not shown).
// Each record is decoded into a VersionEdit and then either handled as part of
// an atomic group or applied directly.
void VersionEditHandlerBase::Iterate(log::Reader& reader,
Status* log_read_status) {
// Read manifest records until the read-size limit is reached, an error
// occurs, or ReadRecord returns false.
while (reader.LastRecordEnd() < max_manifest_read_size_ && s.ok() &&
reader.ReadRecord(&record, &scratch) && log_read_status->ok()) {
// Decode the record into the temporary edit.
// NOTE(review): in this excerpt the DecodeFrom status is overwritten by the
// next assignment without being checked -- presumably the check was dropped
// when the code was abridged; confirm against the RocksDB source.
s = edit.DecodeFrom(record);
// Add the edit to the read buffer, which is used for batch (atomic group)
// application.
s = read_buffer_.AddEdit(&edit);
if (edit.is_in_atomic_group_) {
// Batch apply (elided).
...
} else {
// Apply the single edit to the version.
s = ApplyVersionEdit(edit, &cfd);
}
}
}
// Reads one logical record from the log (excerpt; every case body is elided
// with `...`, so braces below do not balance). It repeatedly reads physical
// records and dispatches on their type; a logical record may be a single full
// physical record or a first/middle/last sequence of fragments.
bool Reader::ReadRecord(Slice* record, std::string* scratch,
WALRecoveryMode wal_recovery_mode,
uint64_t* record_checksum) {
...
Slice fragment;
// Keep reading physical records; each one is returned in `fragment`.
while (true) {
// Offset of the next physical record: end of the buffered data minus what
// is still unread in the buffer.
uint64_t physical_record_offset = end_of_buffer_offset_ - buffer_.size();
size_t drop_size = 0;
// Read one physical record (or an error/EOF code).
const unsigned int record_type =
ReadPhysicalRecord(&fragment, &drop_size, record_checksum);
// Handle the fragment according to its record type.
switch (record_type) {
// A complete record in a single fragment.
case kFullType:
case kRecyclableFullType:
...
// First fragment of a record.
case kFirstType:
case kRecyclableFirstType:
...
// Middle fragment of a record.
case kMiddleType:
case kRecyclableMiddleType:
...
// Last fragment of a record.
case kLastType:
case kRecyclableLastType:
...
// Bad header.
case kBadHeader:
...
// End of file. Per the original note: this may mean the writer failed after
// writing only some fragments, so the partial data is invalid and recovery
// simply falls back to the previous version.
case kEof:
...
case kOldRecord:
...
// Error cases.
case kBadRecord:
...
break;
case kBadRecordLen:
...
break;
case kBadRecordChecksum:
...
break;
case kSetCompressionType: {
...
default: {
...
break;
}
}
}
return false;
}
// Reads one physical record from the buffer (excerpt; `...` marks elided
// code). Shown here: refilling the buffer until a full header is available and
// decoding the header fields.
unsigned int Reader::ReadPhysicalRecord(Slice* result, size_t* drop_size,
uint64_t* fragment_checksum) {
while (true) {
// The buffer holds less than a full header: read more data and retry.
if (buffer_.size() < static_cast<size_t>(kHeaderSize)) {
...
// When ReadMore fails, `r` carries the code returned to the caller.
if (!ReadMore(drop_size, &r)) {
return r;
}
continue;
}
// Parse the header. Layout per the original note: bytes 0-3 crc, bytes 4-5
// length, byte 6 type.
const char* header = buffer_.data();
// Mask to 8 bits so a negative `char` does not sign-extend.
const uint32_t a = static_cast<uint32_t>(header[4]) & 0xff;
const uint32_t b = static_cast<uint32_t>(header[5]) & 0xff;
const unsigned int type = header[6];
// Length is stored low byte first (byte 4), high byte second (byte 5).
const uint32_t length = a | (b << 8);
int header_size = kHeaderSize;
// Depending on the type: verify the crc, decompress, etc. (elided).
...
}
}
恢复WAL:DBImpl::RecoverLogFiles
// Replays WAL files into the memtables (excerpt; `...` marks elided code and
// several locals such as fname, reader, cfd, edit, job_id and flushed are
// declared in elided parts).
// Visible flow: for each WAL number, open a sequential reader, turn every
// record into a WriteBatch and insert it into the memtables, flushing to L0
// during replay; after all WALs are processed, optionally flush the last
// memtable.
Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
SequenceNumber* next_sequence, bool read_only,
bool* corrupted_wal_found,
RecoveryContext* recovery_ctx) {
...
// Iterate over all WALs; wal_numbers lists the WAL log files to replay.
for (auto wal_number : wal_numbers) {
// Create a reader for this WAL file.
std::unique_ptr<SequentialFileReader> file_reader;
{
std::unique_ptr<FSSequentialFile> file;
status = fs_->NewSequentialFile(
fname, fs_->OptimizeForLogRead(file_options_), &file, nullptr);
...
file_reader.reset(new SequentialFileReader(
std::move(file), fname, immutable_db_options_.log_readahead_size,
io_tracer_));
}
...
// Iterate over all records of this WAL, stopping on a WAL-filter request, a
// failed read, or a non-OK status.
while (!stop_replay_by_wal_filter &&
reader.ReadRecord(&record, &scratch,
immutable_db_options_.wal_recovery_mode,
&record_checksum) &&
status.ok()) {
...
// Parse the record: its key/values become the contents of a WriteBatch.
WriteBatch batch;
status = WriteBatchInternal::SetContents(&batch, record);
// Insert the WriteBatch into the memtable(s).
status = WriteBatchInternal::InsertInto(
&batch, column_family_memtables_.get(), &flush_scheduler_,
&trim_history_scheduler_, true, wal_number, this,
false /* concurrent_memtable_writes */, next_sequence,
&has_valid_writes, seq_per_batch_, batch_per_txn_);
// Flush the memtable to L0 and start a new memtable (only when not
// read-only; the code selecting which column family to flush is elided).
if (has_valid_writes && !read_only) {
...
status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
...
cfd->CreateNewMemtable(*cfd->GetLatestMutableCFOptions(),
*next_sequence);
}
}
// Handle the result, clear flush_scheduler, update the sequence number
// (elided).
...
}
bool data_seen = false;
if (!read_only) {
// Flush the last memtable: only if it holds data (first sequence number is
// non-zero), and only if something was already flushed or
// avoid_flush_during_recovery is off.
if (cfd->mem()->GetFirstSequenceNumber() != 0) {
if (flushed || !immutable_db_options_.avoid_flush_during_recovery) {
status = WriteLevel0TableForRecovery(job_id, cfd, cfd->mem(), edit);
}
}
}
return status;
}
参考文献
https://blog.csdn.net/xuhaitao23/article/details/121747616
https://vigourtyy-zhg.blog.csdn.net/article/details/106367782