概述
SSTable是一种文件的组织格式,存储了有序的key-value,并且持久化在磁盘等稳定存储中。当内存中的Memtable达到阈值后,会转为Immutable Memtable,LevelDB的后台线程执行Minor Compaction会将Immutable Memtable转储到磁盘上的SSTable中。
SSTable文件格式
SSTable的文件格式可以参考doc/table_format.md
,主要包含了Data Block、Filter Block、Meta Block、MetaIndex Block、Index Block和Footer
Data Block
关于Data Block相关的文件位于table/block.h
、table/block.cc
Block的定义如下:
// Block holds the contents of one block (pointer data_ plus length size_)
// and creates iterators over it. Copy construction and copy assignment are
// deleted, so a Block cannot be copied.
class Block {
public:
// Initialize the block with the specified contents.
explicit Block(const BlockContents& contents);
Block(const Block&) = delete;
Block& operator=(const Block&) = delete;
~Block();
// Returns size_, the stored size of the block.
size_t size() const { return size_; }
// Returns a new iterator over this block. The implementation is not shown
// here; "comparator" is presumably what the iterator uses to order keys.
Iterator* NewIterator(const Comparator* comparator);
private:
// Iterator implementation class (defined elsewhere).
class Iter;
// Implementation not shown here; presumably returns the number of entries
// in the restart array -- confirm against the .cc file.
uint32_t NumRestarts() const;
const char* data_;
size_t size_;
uint32_t restart_offset_; // Offset in data_ of restart array
bool owned_; // Block owns data_[]
};
// Helper routine: decode the next block entry starting at "p",
// storing the number of shared key bytes, non_shared key bytes,
// and the length of the value in "*shared", "*non_shared", and
// "*value_length", respectively. Will not dereference past "limit".
//
// If any errors are detected, returns nullptr. Otherwise, returns a
// pointer to the key delta (just past the three decoded values).
static inline const char* DecodeEntry(const char* p, const char* limit,
uint32_t* shared, uint32_t* non_shared,
uint32_t* value_length) {
// Three lengths are read below, one byte each at minimum, so fewer than
// three remaining bytes cannot hold an entry header.
if (limit - p < 3) return nullptr;
// Optimistically read each length as a single unsigned byte.
*shared = reinterpret_cast<const uint8_t*>(p)[0];
*non_shared = reinterpret_cast<const uint8_t*>(p)[1];
*value_length = reinterpret_cast<const uint8_t*>(p)[2];
// The OR of the three bytes is below 128 only if every one of them is below
// 128, i.e. none of them has its high bit set.
if ((*shared | *non_shared | *value_length) < 128) {
// Fast path: all three values are encoded in one byte each
p += 3;
} else {
// Slow path: re-decode all three lengths from the original "p" with
// GetVarint32Ptr(), bailing out as soon as one of the calls returns nullptr.
if ((p = GetVarint32Ptr(p, limit, shared)) == nullptr) return nullptr;
if ((p = GetVarint32Ptr(p, limit, non_shared)) == nullptr) return nullptr;
if ((p = GetVarint32Ptr(p, limit, value_length)) == nullptr) return nullptr;
}
// The bytes left before "limit" must be able to hold the key delta
// (non_shared bytes) followed by the value (value_length bytes).
if (static_cast<uint32_t>(limit - p) < (*non_shared + *value_length)) {
return nullptr;
}
return p;
}
首先看一下DecodeEntry()
函数,该函数将p
指针开始的数据解码出与前一条记录key共享部分的长度、与前一条记录key不共享部分的长度和value长度,得到这3个数据后就可以获取key和value。代码中首先判断limit - p < 3
,因为采用varint编码方式,所以这3个长度每一个至少需要一个字节。
后面依次获取一个字节的内容赋值给shared
、non_shared
和value_length
,如果这3个值相或小于128,说明每一个值都小于128,根据varint编码,说明一个值用一个字节存储即可,p直接加3;否则说明至少其中一个值大于等于128,需要多个字节存储,这时候调用GetVarint32Ptr()
获取对应的值。
其实直接调用GetVarint32Ptr()
也能实现,而且更加简洁,这里可能是出于效率的考虑,因为大部分的长度用一个字节还是可以存下的。
Block::Iter
的实现比较关键,比较复杂的是Prev()
和Seek()
函数,Prev()
函数需要递减restart_index_
来寻找上一个值,而Seek()
函数需要进行二分查找找到restart_index_
,然后再遍历查找。
// Moves the iterator to the entry that precedes the current one.
// Requires Valid(). When no restart point lies before the current entry,
// the iterator is parked at current_ = restarts_ and
// restart_index_ = num_restarts_ (presumably the "not valid" position --
// confirm against Valid(), which is not shown here).
void Prev() override {
assert(Valid());
// Scan backwards to a restart point before current_
const uint32_t original = current_;
while (GetRestartPoint(restart_index_) >= original) {
if (restart_index_ == 0) {
// No more entries
current_ = restarts_;
restart_index_ = num_restarts_;
return;
}
restart_index_--;
}
// Re-parse forward from the chosen restart point, stopping at the entry
// whose end coincides with the start of the original entry.
SeekToRestartPoint(restart_index_);
do {
// Loop until end of current entry hits the start of original entry
} while (ParseNextKey() && NextEntryOffset() < original);
}
// Positions the iterator at the first entry whose key compares >= target.
// Step 1: binary-search the restart array for the last restart point whose
// key is < target. Step 2: scan forward linearly from there.
// If ParseNextKey() returns false during the scan, the function returns
// without having found such an entry. If a restart entry cannot be decoded,
// CorruptionError() is called and the function returns early.
void Seek(const Slice& target) override {
// Binary search in restart array to find the last restart point
// with a key < target
uint32_t left = 0;
uint32_t right = num_restarts_ - 1;
int current_key_compare = 0;
if (Valid()) {
// If we're already scanning, use the current position as a starting
// point. This is beneficial if the key we're seeking to is ahead of the
// current position.
current_key_compare = Compare(key_, target);
if (current_key_compare < 0) {
// key_ is smaller than target
left = restart_index_;
} else if (current_key_compare > 0) {
// key_ is larger than target, so the current restart index bounds the
// search from above.
right = restart_index_;
} else {
// We're seeking to the key we're already at.
return;
}
}
while (left < right) {
// Round the midpoint up so that "left = mid" below always makes progress.
uint32_t mid = (left + right + 1) / 2;
uint32_t region_offset = GetRestartPoint(mid);
uint32_t shared, non_shared, value_length;
const char* key_ptr =
DecodeEntry(data_ + region_offset, data_ + restarts_, &shared,
&non_shared, &value_length);
// The entry at a restart point is required to have shared == 0, so that
// its full key is the non_shared bytes at key_ptr; anything else is
// reported as corruption.
if (key_ptr == nullptr || (shared != 0)) {
CorruptionError();
return;
}
Slice mid_key(key_ptr, non_shared);
if (Compare(mid_key, target) < 0) {
// Key at "mid" is smaller than "target". Therefore all
// blocks before "mid" are uninteresting.
left = mid;
} else {
// Key at "mid" is >= "target". Therefore all blocks at or
// after "mid" are uninteresting.
right = mid - 1;
}
}
// We might be able to use our current position within the restart block.
// This is true if we determined the key we desire is in the current block
// and is after than the current key.
assert(current_key_compare == 0 || Valid());
// Skip the reposition only when the search ended on the restart index the
// iterator is already in and the current key is still below target.
bool skip_seek = left == restart_index_ && current_key_compare < 0;
if (!skip_seek) {
SeekToRestartPoint(left);
}
// Linear search (within restart block) for first key >= target
while (true) {
if (!ParseNextKey()) {
return;
}
if (Compare(key_, target) >= 0) {
return;
}
}
}
Data Block构造相关的代码位于table/block_builder.h
和table/block_builder.cc
,主要内容就是根据restart的压缩逻辑把数据压入buffer_
中,核心代码是Add()
函数,代码如下:
// Appends one (key, value) entry to buffer_.
// The key is stored as a delta against the previously added key: only the
// bytes after the common prefix are written. Every
// options_->block_restart_interval entries the prefix sharing is reset
// (shared = 0) and the entry's offset is recorded in restarts_.
// Preconditions (asserted): the builder is not finished, and "key" compares
// greater than the previously added key unless buffer_ is empty.
void BlockBuilder::Add(const Slice& key, const Slice& value) {
Slice last_key_piece(last_key_);
assert(!finished_);
assert(counter_ <= options_->block_restart_interval);
assert(buffer_.empty() // No values yet?
|| options_->comparator->Compare(key, last_key_piece) > 0);
size_t shared = 0;
if (counter_ < options_->block_restart_interval) {
// See how much sharing to do with previous string
const size_t min_length = std::min(last_key_piece.size(), key.size());
while ((shared < min_length) && (last_key_piece[shared] == key[shared])) {
shared++;
}
} else {
// Restart compression
restarts_.push_back(buffer_.size());
counter_ = 0;
}
const size_t non_shared = key.size() - shared;
// Add "<shared><non_shared><value_size>" to buffer_
PutVarint32(&buffer_, shared);
PutVarint32(&buffer_, non_shared);
PutVarint32(&buffer_, value.size());
// Add string delta to buffer_ followed by value
buffer_.append(key.data() + shared, non_shared);
buffer_.append(value.data(), value.size());
// Update state
// Rebuild last_key_ in place: keep the shared prefix, then append the new
// suffix, so that last_key_ equals "key" afterwards (asserted below).
last_key_.resize(shared);
last_key_.append(key.data() + shared, non_shared);
assert(Slice(last_key_) == key);
counter_++;
}
Filter Block
Filter Block相关的代码是table/filter_block.h
、table/filter_block.cc
和filter_block_test.cc
主要有FilterBlockBuilder
和FilterBlockReader
这两个类,构造时都需要传入FilterPolicy
关于FilterBlockBuilder
,源码中这样解释:
A FilterBlockBuilder is used to construct all of the filters for a particular Table. It generates a single string which is stored as a special block in the Table.
The sequence of calls to FilterBlockBuilder must match the regexp: (StartBlock AddKey*)* Finish
解释一下FilterBlockBuilder
的成员变量:
const FilterPolicy* policy_; // Filter policy
std::string keys_; // Keys buffered for the next filter, concatenated into one string
std::vector<size_t> start_; // Starting index in keys_ of each key
std::string result_; // Filter data computed so far
std::vector<Slice> tmp_keys_; // policy_->CreateFilter() argument
std::vector<uint32_t> filter_offsets_; // The offset of Filter Data
然后看一下最主要的三个public函数StartBlock()
、AddKey()
、Finish()
和一个private函数GenerateFilter()
首先GenerateFilter()
顾名思义就是生成过滤数据,首先判断buffer的key数量,如果为0,直接在filter_offsets
插入当前Filter Data的offset就返回。否则还需要调用policy_->CreateFilter()
来生成Filter Data
然后看一下StartBlock()
,首先根据block_offset计算对应的filter_index,如果发现之前的block都没有计算Filter Data,那么就会循环调用GenerateFilter
将filter_offsets
都插入当前Filter Data的offset,代码如下:
// Called with the file offset of a block that is about to be started.
// Maps the offset to a filter index (block_offset / kFilterBase) and calls
// GenerateFilter() until filter_offsets_ has that many entries.
// Asserts that the filter index is not behind the filters already recorded.
void FilterBlockBuilder::StartBlock(uint64_t block_offset) {
uint64_t filter_index = (block_offset / kFilterBase);
assert(filter_index >= filter_offsets_.size());
// Relies on each GenerateFilter() call growing filter_offsets_ (not shown
// here); otherwise this loop would not terminate.
while (filter_index > filter_offsets_.size()) {
GenerateFilter();
}
}
AddKey()
就是向buffer的keys_和start_插入数据,每次GenerateFilter()
之后都会清空
Finish()
调用最后一次GenerateFilter()
后,将保存的filter data的offset array
和offset of beginning of offset array
插入到result_
中并返回
FilterBlockReader
比较简单,成员变量的含义如下:
// Member variables of FilterBlockReader.
const FilterPolicy* policy_; // Filter Policy
const char* data_; // Pointer to filter data (at block-start)
const char* offset_; // Pointer to beginning of offset array (at block-end)
size_t num_; // Number of entries in offset array
size_t base_lg_; // Encoding parameter (see kFilterBaseLg in .cc file)
KeyMayMatch
需要传入block_offset和key,如果block_offset对应的filter判断key可能存在(允许误判),则返回true;如果确定不存在,则返回false
// Returns whether "key" may be present in the block starting at
// "block_offset", according to the filter covering that offset.
// Returns true when the filter cannot be consulted (index out of range or
// inconsistent offsets), so errors are treated as potential matches.
bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, const Slice& key) {
// Map the block offset to an index into the offset array.
uint64_t index = block_offset >> base_lg_;
if (index < num_) {
// Offset-array entries are 4 bytes each; the entry at "index" and the
// 4 bytes after it give the [start, limit) range of this filter.
uint32_t start = DecodeFixed32(offset_ + index * 4);
uint32_t limit = DecodeFixed32(offset_ + index * 4 + 4);
// The range must be well-formed and end at or before the offset array.
if (start <= limit && limit <= static_cast<size_t>(offset_ - data_)) {
Slice filter = Slice(data_ + start, limit - start);
return policy_->KeyMayMatch(key, filter);
} else if (start == limit) {
// Empty filters do not match any keys
// (Reached only when the range check above failed.)
return false;
}
}
return true; // Errors are treated as potential matches
}
Format
table/format.h
和table/format.cc
保存了BlockHandle
和Footer
数据结构和它们的编码解码方法
重点看一下ReadBlock()
函数,给定一个RandomAccessFile、ReadOptions和BlockHandle,取出block内容放到BlockContents中,并进行crc验证和解压缩
注意一下BlockContents
// Holds the contents of a block together with two flags that tell the
// caller how the underlying bytes may be handled.
struct BlockContents {
Slice data; // Actual contents of data
bool cachable; // True iff data can be cached
bool heap_allocated; // True iff caller should delete[] data.data()
};
cachable
和heap_allocated
的含义暂不清楚
Table Builder
关于table builder相关的代码主要位于include/leveldb/table_builder.h
、table/table_builder.cc
,为什么先分析table builder呢,因为table稍微复杂一些,涉及到迭代器等操作,等分析完迭代器后,再看一下神秘的table
首先看一下table是如何构造的,也就是TableBuilder
的实现,TableBuilder中有一个结构体Rep,为什么这么设计呢?有两个原因:1. 不想暴露给使用者内部的实现,所以在头文件只定义Rep结构的指针;2. Rep的结构体内容可能会变化,如果放到头文件中,发生变化时用户需要重新编译,而如果放到实现类中,用户只需要链接最新的库即可,这种技巧叫pImpl idiom
。可以参考stackoverflow上的这个解释why-table-and-tablebuilder-in-leveldb-use-struct-rep
接下来我们解释一下TableBuilder中主要的成员函数:
Add()
:添加(key, value)到当前table中;
Flush()
:强制刷新buffer的数据到data block中,一般使用者不会调用这个,内部实现时当data block的size达到4KB时会执行flush操作;
Finish()
:TableBuilder的收尾工作,会把所有block和footer都写到文件中;
Abandon()
:放弃收尾;
WriteBlock()
:完成构造block,判断压缩类型,调用WriteRawBlock
WriteRawBlock()
:将data block的offset和size写入block handle,并将data block的数据写入文件中
熟悉sstable的文件格式后,结合源码看并不复杂,就不逐行代码分析了,贴一下Finish()
的代码记录一下
// Completes the table: calls Flush(), then writes the filter block (if a
// filter block builder exists), the metaindex block, the index block and
// the footer, in that order. Each stage runs only while ok() is true.
// Asserts that the builder was not already closed and marks it closed.
// Returns r->status.
Status TableBuilder::Finish() {
Rep* r = rep_;
Flush();
assert(!r->closed);
r->closed = true;
BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle;
// Write filter block
if (ok() && r->filter_block != nullptr) {
// The filter block goes through WriteRawBlock() with kNoCompression.
WriteRawBlock(r->filter_block->Finish(), kNoCompression,
&filter_block_handle);
}
// Write metaindex block
if (ok()) {
BlockBuilder meta_index_block(&r->options);
if (r->filter_block != nullptr) {
// Add mapping from "filter.Name" to location of filter data
std::string key = "filter.";
key.append(r->options.filter_policy->Name());
std::string handle_encoding;
filter_block_handle.EncodeTo(&handle_encoding);
meta_index_block.Add(key, handle_encoding);
}
// TODO(postrelease): Add stats and other meta blocks
WriteBlock(&meta_index_block, &metaindex_block_handle);
}
// Write index block
if (ok()) {
// An index entry is still pending: add it now, keyed by
// FindShortSuccessor() of last_key and pointing at pending_handle.
if (r->pending_index_entry) {
r->options.comparator->FindShortSuccessor(&r->last_key);
std::string handle_encoding;
r->pending_handle.EncodeTo(&handle_encoding);
r->index_block.Add(r->last_key, Slice(handle_encoding));
r->pending_index_entry = false;
}
WriteBlock(&r->index_block, &index_block_handle);
}
// Write footer
if (ok()) {
// The footer records the handles of the metaindex and index blocks.
Footer footer;
footer.set_metaindex_handle(metaindex_block_handle);
footer.set_index_handle(index_block_handle);
std::string footer_encoding;
footer.EncodeTo(&footer_encoding);
r->status = r->file->Append(footer_encoding);
if (r->status.ok()) {
r->offset += footer_encoding.size();
}
}
return r->status;
}
Iterator
关于迭代器相关的代码位于include/leveldb/iterator.h
、table/iterator.cc
、table/iterator_wrapper.h
、table/merger.h
、table/merger.cc
、table/two_level_iterator.h
、table/two_level_iterator.cc
Iterator
定义了Iterator的接口,任何Iterator都必须实现这些接口,需要注意的是Iterator中包含一个存储着CleanupFunction
的单链表,当Iterator析构时会依次调用单链表中的CleanupFunction
table/iterator.cc
定义了EmptyIterator
,table/iterator_wrapper.h
定义了IteratorWrapper
,关于IteratorWrapper
,源码中的解释很清楚:
A internal wrapper class with an interface similar to Iterator that caches the valid() and key() results for an underlying iterator. This can help avoid virtual function calls and also gives better cache locality.
leveldb中所有的迭代器都继承自Iterator
,实现了声明的虚函数,使用多态的方式进行调用。这当然很方便,但是虚函数调用的效率比普通函数要低,因为虚函数地址需要在运行期决议出来:通过访问对象中的虚表指针(通常位于对象起始处,32位平台占4字节,64位平台占8字节)所指向的虚表,取出目标函数地址再执行call指令。而普通函数调用的函数地址在编译完成时就已经确定下来了,显然效率更高。所以将 key 和 valid 存储下来,可以减少虚函数的调用次数。
MergingIterator
A merging iterator that provided the union of the data in children[0,n-1]. Takes ownership of the child iterators and will delete them when the result iterator is deleted.
The result does no duplicate suppression. I.e., if a particular key is present in K child iterators, it will be yielded K times.
MergingIterator
包含了多个IteratorWrapper类型的子迭代器,放到成员变量children_
中,direction_
是一个枚举值:包含kForward
和kReverse
,还有一个IteratorWrapper
类型的current_
指向某个迭代器,一个比较器comparator_
和子迭代器的数量n_
主要看一下Next()
,当direction_
为正向时,说明当前子迭代器指向的key为最小key,只需要执行子迭代器的next,然后调用FindSmallest()
获取所有子迭代器的最小值即可;当direction_
为反向时,当前子迭代器指向的key为最大key,所以直接调用next并不能保证值为下一个最小的值,因此需要让所有子迭代器都seek到当前子迭代器的key的位置,然后调用FindSmallest()
获取所有子迭代器的最小值。Prev()
同理
TwoLevelIterator
A two-level iterator contains an index iterator whose values point to a sequence of blocks where each block is itself a sequence of key,value pairs. The returned two-level iterator yields the concatenation of all key/value pairs in the sequence of blocks. Takes ownership of “index_iter” and will delete it when no longer needed.
TwoLevelIterator
设计的比较通用,但是实际上只为Table::Iterator
服务,这个跟sstable的格式有关系,sstable有index block和data block,index block存储了data block的索引信息,因此index block作为迭代器的第一级,data block作为迭代器的第二级
首先看一下TwoLevelIterator
的成员变量:
// Member variables of TwoLevelIterator.
BlockFunction block_function_; // Function that creates the second-level iterator
void* arg_; // Passed to block_function_
const ReadOptions options_; // Passed to block_function_
Status status_;
IteratorWrapper index_iter_; // First-level iterator
IteratorWrapper data_iter_; // May be nullptr
// If data_iter_ is non-null, then "data_block_handle_" holds the
// "index_value" passed to block_function_ to create the data_iter_.
std::string data_block_handle_;
block_function_
为获取第二级迭代器的函数,arg_
、options_
和data_block_handle
是block_function_
的参数,index_iter_
为第一级迭代器,data_iter_
为第二级迭代器
需要关注三个函数:InitDataBlock()
、SkipEmptyDataBlocksForward()
和SkipEmptyDataBlocksBackward()
InitDataBlock
其实就是利用一级迭代器生成二级迭代器,然后调用SetDataIterator()
释放申请二级迭代器的内存,析构迭代器时会调用CleanupFunction
进行清理
data_iter_
进行遍历时,可能会碰到data block的边缘或者变成invalid,这时需要调用SkipEmptyDataBlocksForward()
和SkipEmptyDataBlocksBackward()
跳到下一个data block接着遍历
Table
Table和TableBuilder类似,也定义了Rep结构体,其中的优点可以查阅TableBuilder部分
Table的核心是Open()
函数,其中读取了footer、index block、metaindex block、filter block
BlockReader()
将一个index iterator的block handle转换为一个迭代器,用在TwoLevelIterator()
中,这个函数里使用了block cache
,block cache使用table的cache_id_和block handle的offset作为key,value是cache handle,block cache根据cache handle获取block内容
// Convert an index iterator value (i.e., an encoded BlockHandle)
// into an iterator over the contents of the corresponding block.
//
// "arg" is cast to the Table* to read from. "index_value" is decoded into a
// BlockHandle. When options.block_cache is set, the block is looked up in
// the cache first and read from the file only on a miss. If decoding or
// reading fails, an error iterator carrying the failing Status is returned.
Iterator* Table::BlockReader(void* arg, const ReadOptions& options,
const Slice& index_value) {
Table* table = reinterpret_cast<Table*>(arg);
Cache* block_cache = table->rep_->options.block_cache;
Block* block = nullptr;
Cache::Handle* cache_handle = nullptr;
BlockHandle handle;
Slice input = index_value;
Status s = handle.DecodeFrom(&input);
// We intentionally allow extra stuff in index_value so that we
// can add more features in the future.
if (s.ok()) {
BlockContents contents;
if (block_cache != nullptr) {
// Cache key is 16 bytes: the table's cache_id followed by the block's
// offset, each written with EncodeFixed64.
char cache_key_buffer[16];
EncodeFixed64(cache_key_buffer, table->rep_->cache_id);
EncodeFixed64(cache_key_buffer + 8, handle.offset());
Slice key(cache_key_buffer, sizeof(cache_key_buffer));
cache_handle = block_cache->Lookup(key);
if (cache_handle != nullptr) {
// Cache hit: reuse the Block stored in the cache.
block = reinterpret_cast<Block*>(block_cache->Value(cache_handle));
} else {
// Cache miss: read the block from the file.
s = ReadBlock(table->rep_->file, options, handle, &contents);
if (s.ok()) {
block = new Block(contents);
// Insert into the cache only when the contents are cachable and the
// read options ask to fill the cache; otherwise cache_handle stays
// nullptr.
if (contents.cachable && options.fill_cache) {
cache_handle = block_cache->Insert(key, block, block->size(),
&DeleteCachedBlock);
}
}
}
} else {
// No block cache configured: always read from the file.
s = ReadBlock(table->rep_->file, options, handle, &contents);
if (s.ok()) {
block = new Block(contents);
}
}
}
Iterator* iter;
if (block != nullptr) {
iter = block->NewIterator(table->rep_->options.comparator);
// Register the cleanup matching how the block was obtained: DeleteBlock
// when there is no cache handle, ReleaseBlock (with the cache and its
// handle) when the block is held by the cache.
if (cache_handle == nullptr) {
iter->RegisterCleanup(&DeleteBlock, block, nullptr);
} else {
iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle);
}
} else {
// block is still nullptr, so decoding or reading failed; report "s".
iter = NewErrorIterator(s);
}
return iter;
}
InternalGet()
读取指定的(key, value),如果找到则执行handle_result()