leveldb源码解析系列—SSTable

概述

SSTable是一种文件的组织格式,存储了有序的key-value,并且持久化在磁盘等稳定存储中。当内存中的Memtable达到阈值后,会转为Immutable Memtable,LevelDB的后台线程执行Minor Compaction会将Immutable Memtable转储到磁盘上的SSTable中。

SSTable文件格式

SSTable的文件格式可以参考doc/table_format.md,主要包含了Data Block、Filter Block、Meta Block、MetaIndex Block、Index Block和Footer。

Data Block

Data Block相关的代码位于table/block.h和table/block.cc

Block的定义如下:

// Wraps the raw bytes of a single block (data_ / size_) and hands out
// iterators over them. Copying is disabled.
class Block {
 public:
  // Initialize the block with the specified contents.
  explicit Block(const BlockContents& contents);

  // Non-copyable.
  Block(const Block&) = delete;
  Block& operator=(const Block&) = delete;

  ~Block();

  // Number of bytes in the block.
  size_t size() const { return size_; }

  // Returns a new iterator over this block that uses "comparator".
  // NOTE(review): the caller presumably owns the returned iterator -- confirm.
  Iterator* NewIterator(const Comparator* comparator);

 private:
  // Iterator implementation; defined outside this declaration.
  class Iter;

  uint32_t NumRestarts() const;

  const char* data_;         // Start of the block bytes
  size_t size_;              // Length of data_ in bytes
  uint32_t restart_offset_;  // Offset in data_ of restart array
  bool owned_;               // Block owns data_[]
};
// Parses the header of the block entry that begins at "p": the number of key
// bytes shared with the previous key, the number of unshared key bytes, and
// the value length. The results are written to "*shared", "*non_shared" and
// "*value_length". Never reads at or beyond "limit".
//
// Returns a pointer to the unshared key bytes (immediately after the three
// lengths), or nullptr when the entry is malformed.
static inline const char* DecodeEntry(const char* p, const char* limit,
                                      uint32_t* shared, uint32_t* non_shared,
                                      uint32_t* value_length) {
  // Each of the three lengths occupies at least one byte.
  if (limit - p < 3) {
    return nullptr;
  }

  const uint8_t* header = reinterpret_cast<const uint8_t*>(p);
  *shared = header[0];
  *non_shared = header[1];
  *value_length = header[2];

  const bool all_single_byte = (*shared | *non_shared | *value_length) < 128;
  if (all_single_byte) {
    // Common case: none of the three leading bytes reaches 128, so each
    // length is fully contained in its own byte.
    p += 3;
  } else {
    // At least one leading byte is >= 128; decode all three lengths through
    // the general varint routine.
    p = GetVarint32Ptr(p, limit, shared);
    if (p == nullptr) return nullptr;
    p = GetVarint32Ptr(p, limit, non_shared);
    if (p == nullptr) return nullptr;
    p = GetVarint32Ptr(p, limit, value_length);
    if (p == nullptr) return nullptr;
  }

  // The unshared key bytes plus the value must fit before "limit".
  const uint32_t remaining = static_cast<uint32_t>(limit - p);
  if (remaining < (*non_shared + *value_length)) {
    return nullptr;
  }
  return p;
}

首先看一下DecodeEntry()函数,该函数将p指针开始的数据解码出与前一条记录key共享部分的长度、与前一条记录key不共享部分的长度和value长度,得到这3个数据后就可以获取key和value。代码中首先判断limit - p < 3,因为采用varint编码方式,所以这3个长度每一个至少需要一个字节。

后面依次取出3个字节的内容分别赋值给shared、non_shared、value_length,如果这3个值相或的结果小于128,说明每一个值都小于128,根据varint编码,每个值用一个字节存储即可,p直接加3;否则说明至少有一个字节的最高位为1(即对应的值不小于128),这时候调用GetVarint32Ptr()依次解码出这3个值。

其实直接调用GetVarint32Ptr()也能实现,而且更加简洁,这里可能是出于效率的考虑,因为大部分的长度用一个字节还是可以存下的。

Block::Iter的实现比较关键,比较复杂的是Prev()和Seek()函数,Prev()函数需要递减restart_index_来寻找上一个值,而Seek()函数需要进行二分查找找到restart_index_,然后再遍历查找。

// Moves the iterator to the entry that precedes the current one. Requires
// Valid(). The implementation backs up to a restart point located before the
// current entry and then re-parses forward with ParseNextKey() until it
// reaches the entry that ends where the original entry started.
void Prev() override {
    assert(Valid());

    // Scan backwards to a restart point before current_
    const uint32_t original = current_;
    while (GetRestartPoint(restart_index_) >= original) {
      if (restart_index_ == 0) {
        // No more entries
        // Nothing precedes the original entry: park the position at
        // restarts_ / num_restarts_ and stop.
        current_ = restarts_;
        restart_index_ = num_restarts_;
        return;
      }
      restart_index_--;
    }

    SeekToRestartPoint(restart_index_);
    do {
      // Loop until end of current entry hits the start of original entry
    } while (ParseNextKey() && NextEntryOffset() < original);
  }
// Positions the iterator at the first entry whose key is >= "target".
// Works in two phases: a binary search over the restart array narrows the
// search to one restart block, then a linear scan with ParseNextKey() finds
// the entry. If the iterator is already valid, its current position is used
// to shrink the binary-search range and, when possible, to skip re-seeking to
// the start of the restart block. Calls CorruptionError() and returns if a
// restart entry cannot be decoded.
void Seek(const Slice& target) override {
    // Binary search in restart array to find the last restart point
    // with a key < target
    uint32_t left = 0;
    uint32_t right = num_restarts_ - 1;
    int current_key_compare = 0;

    if (Valid()) {
      // If we're already scanning, use the current position as a starting
      // point. This is beneficial if the key we're seeking to is ahead of the
      // current position.
      current_key_compare = Compare(key_, target);
      if (current_key_compare < 0) {
        // key_ is smaller than target
        left = restart_index_;
      } else if (current_key_compare > 0) {
        right = restart_index_;
      } else {
        // We're seeking to the key we're already at.
        return;
      }
    }

    while (left < right) {
      // Round the midpoint up so that "left = mid" always makes progress.
      uint32_t mid = (left + right + 1) / 2;
      uint32_t region_offset = GetRestartPoint(mid);
      uint32_t shared, non_shared, value_length;
      const char* key_ptr =
          DecodeEntry(data_ + region_offset, data_ + restarts_, &shared,
                      &non_shared, &value_length);
      // The entry at a restart point must not share any bytes with a
      // previous key; anything else is treated as corruption.
      if (key_ptr == nullptr || (shared != 0)) {
        CorruptionError();
        return;
      }
      Slice mid_key(key_ptr, non_shared);
      if (Compare(mid_key, target) < 0) {
        // Key at "mid" is smaller than "target".  Therefore all
        // blocks before "mid" are uninteresting.
        left = mid;
      } else {
        // Key at "mid" is >= "target".  Therefore all blocks at or
        // after "mid" are uninteresting.
        right = mid - 1;
      }
    }

    // We might be able to use our current position within the restart block.
    // This is true if we determined the key we desire is in the current block
    // and is after than the current key.
    assert(current_key_compare == 0 || Valid());
    bool skip_seek = left == restart_index_ && current_key_compare < 0;
    if (!skip_seek) {
      SeekToRestartPoint(left);
    }
    // Linear search (within restart block) for first key >= target
    while (true) {
      if (!ParseNextKey()) {
        // ParseNextKey() reported that no further entry could be parsed.
        return;
      }
      if (Compare(key_, target) >= 0) {
        return;
      }
    }
  }

Data Block构造相关的代码位于table/block_builder.h和table/block_builder.cc,主要内容就是根据restart的压缩逻辑把数据压入buffer_中,核心代码是Add()函数,代码如下:

void BlockBuilder::Add(const Slice& key, const Slice& value) {
  Slice last_key_piece(last_key_);
  assert(!finished_);
  assert(counter_ <= options_->block_restart_interval);
  assert(buffer_.empty()  // No values yet?
         || options_->comparator->Compare(key, last_key_piece) > 0);
  size_t shared = 0;
  if (counter_ < options_->block_restart_interval) {
    // See how much sharing to do with previous string
    const size_t min_length = std::min(last_key_piece.size(), key.size());
    while ((shared < min_length) && (last_key_piece[shared] == key[shared])) {
      shared++;
    }
  } else {
    // Restart compression
    restarts_.push_back(buffer_.size());
    counter_ = 0;
  }
  const size_t non_shared = key.size() - shared;

  // Add "<shared><non_shared><value_size>" to buffer_
  PutVarint32(&buffer_, shared);
  PutVarint32(&buffer_, non_shared);
  PutVarint32(&buffer_, value.size());

  // Add string delta to buffer_ followed by value
  buffer_.append(key.data() + shared, non_shared);
  buffer_.append(value.data(), value.size());

  // Update state
  last_key_.resize(shared);
  last_key_.append(key.data() + shared, non_shared);
  assert(Slice(last_key_) == key);
  counter_++;
}

Filter Block

Filter Block相关的代码是table/filter_block.h、table/filter_block.cc和table/filter_block_test.cc

主要有FilterBlockBuilder和FilterBlockReader这两个类,构造时都需要传入FilterPolicy

关于FilterBlockBuilder,源码中这样解释:

A FilterBlockBuilder is used to construct all of the filters for a particular Table. It generates a single string which is stored as a special block in the Table.

The sequence of calls to FilterBlockBuilder must match the regexp: (StartBlock AddKey*)* Finish

解释一下FilterBlockBuilder的成员变量:

const FilterPolicy* policy_;   // Filter policy used to build the filters
std::string keys_;             // Keys waiting to be turned into filter data, concatenated into one string
std::vector<size_t> start_;    // Starting index in keys_ of each key
std::string result_;           // Filter data computed so far
std::vector<Slice> tmp_keys_;  // policy_->CreateFilter() argument
std::vector<uint32_t> filter_offsets_; // Offset of each piece of filter data

然后看一下最主要的三个public函数StartBlock()、AddKey()、Finish()和一个private函数GenerateFilter()

首先GenerateFilter()顾名思义就是生成过滤数据,首先判断buffer的key数量,如果为0,直接在filter_offsets插入当前Filter Data的offset就返回。否则还需要调用policy_->CreateFilter()来生成Filter Data

然后看一下StartBlock(),首先根据block_offset计算出对应的filter_index,如果发现filter_index之前还有filter没有生成Filter Data,那么就会循环调用GenerateFilter(),直到filter_offsets_中的条目数达到filter_index为止(没有key的filter只会在filter_offsets_中插入当前Filter Data的offset),代码如下:

// Announces that a block starting at "block_offset" is about to be built.
// Calls GenerateFilter() repeatedly until filter_offsets_ holds one entry for
// every kFilterBase-sized range that lies before this block's range.
void FilterBlockBuilder::StartBlock(uint64_t block_offset) {
  const uint64_t wanted_filters = block_offset / kFilterBase;
  // The target filter index must never be behind what was already generated.
  assert(wanted_filters >= filter_offsets_.size());
  while (filter_offsets_.size() < wanted_filters) {
    GenerateFilter();
  }
}

AddKey()就是向buffer的keys_和start_插入数据,每次GenerateFilter()之后都会清空

Finish()调用最后一次GenerateFilter()后,将保存的filter data的offset array和offset of beginning of offset array插入到result_中并返回

FilterBlockReader比较简单,成员变量的含义如下:

  const FilterPolicy* policy_;  // Policy whose KeyMayMatch() is asked about each filter
  const char* data_;            // Pointer to filter data (at block-start)
  const char* offset_;          // Pointer to beginning of offset array (at block-end)
  size_t num_;                  // Number of entries in offset array
  size_t base_lg_;              // Encoding parameter (see kFilterBaseLg in .cc file)

KeyMayMatch需要传入block_offset和key,如果在block_offset对应的filter内可能存在匹配的key,则返回true(filter数据出错时也当作可能匹配,返回true),否则返回false

// Reports whether "key" may be present in the data block that starts at
// "block_offset", according to the filter covering that offset.
//
// Returns false only when the filter says the key is absent or the filter is
// empty. An out-of-range filter index or inconsistent offsets are treated as
// a potential match and return true.
bool FilterBlockReader::KeyMayMatch(uint64_t block_offset, const Slice& key) {
  const uint64_t filter_index = block_offset >> base_lg_;
  if (filter_index >= num_) {
    return true;  // Errors are treated as potential matches
  }

  // Each offset-array entry is a fixed 32-bit value; entry i and entry i + 1
  // delimit filter i.
  const char* entry = offset_ + filter_index * 4;
  const uint32_t begin = DecodeFixed32(entry);
  const uint32_t end = DecodeFixed32(entry + 4);

  const bool range_ok =
      begin <= end && end <= static_cast<size_t>(offset_ - data_);
  if (range_ok) {
    return policy_->KeyMayMatch(key, Slice(data_ + begin, end - begin));
  }
  if (begin == end) {
    // Empty filters do not match any keys
    return false;
  }
  return true;  // Errors are treated as potential matches
}

Format

table/format.h和table/format.cc保存了BlockHandle、Footer数据结构和它们的编码解码方法

重点看一下ReadBlock()函数,给定一个RandomAccessFile、ReadOptions和BlockHandle,取出block内容放到BlockContents中,并进行crc验证和解压缩

注意一下BlockContents

// Result of reading one block: the block bytes plus two flags that tell the
// consumer how those bytes may be used (cached or not) and who must free them.
struct BlockContents {
  Slice data;           // Actual contents of data
  bool cachable;        // True iff data can be cached
  bool heap_allocated;  // True iff caller should delete[] data.data()
};

cachable和heap_allocated的具体使用场景暂不清楚,从注释看,cachable表示data是否可以放入cache,heap_allocated表示调用者是否需要对data.data()执行delete[]

Table Builder

table builder相关的代码主要位于include/leveldb/table_builder.h和table/table_builder.cc,为什么先分析table builder呢,因为table稍微复杂一些,涉及到迭代器等操作,等分析完迭代器后,再看一下神秘的table

首先看一下table是如何构造的,也就是TableBuilder的实现,TableBuilder中有一个结构体Rep,为什么这么设计呢?有两个原因:1. 不想暴露给使用者内部的实现,所以在头文件只定义Rep结构的指针;2. Rep的结构体内容可能会变化,如果放到头文件中,发生变化时用户需要重新编译,而如果放到实现类中,用户只需要链接最新的库即可,这种技巧叫pImpl idiom。可以参考stackoverflow上的这个解释why-table-and-tablebuilder-in-leveldb-use-struct-rep

接下来我们解释一下TableBuilder中主要的成员函数:

Add():添加(key, value)到当前table中;
Flush():强制刷新buffer的数据到data block中,一般使用者不会调用这个,内部实现时当data block的size达到4KB时会执行flush操作;
Finish():TableBuilder的收尾工作,会把所有block和footer都写到文件中;
Abandon():放弃收尾;
WriteBlock():完成构造block,判断压缩类型,调用WriteRawBlock
WriteRawBlock():将data block的offset和size写入block handle,并将data block的数据写入文件中

熟悉sstable的文件格式后,结合源码看并不复杂,就不逐行代码分析了,贴一下Finish()的代码记录一下

// Completes the table: flushes pending data, then writes, in this order, the
// filter block (if a filter block builder exists), the metaindex block, the
// index block and the footer. Each stage is guarded by ok(), so later stages
// are skipped once ok() reports failure. Returns r->status.
// Must not be called after the builder has been closed (asserted below).
Status TableBuilder::Finish() {
  Rep* r = rep_;
  Flush();
  assert(!r->closed);
  r->closed = true;

  // Filled in as each block is written; the footer and the metaindex block
  // refer to these locations.
  BlockHandle filter_block_handle, metaindex_block_handle, index_block_handle;

  // Write filter block
  if (ok() && r->filter_block != nullptr) {
    // Written via WriteRawBlock with kNoCompression.
    WriteRawBlock(r->filter_block->Finish(), kNoCompression,
                  &filter_block_handle);
  }

  // Write metaindex block
  if (ok()) {
    BlockBuilder meta_index_block(&r->options);
    if (r->filter_block != nullptr) {
      // Add mapping from "filter.Name" to location of filter data
      std::string key = "filter.";
      key.append(r->options.filter_policy->Name());
      std::string handle_encoding;
      filter_block_handle.EncodeTo(&handle_encoding);
      meta_index_block.Add(key, handle_encoding);
    }

    // TODO(postrelease): Add stats and other meta blocks
    WriteBlock(&meta_index_block, &metaindex_block_handle);
  }

  // Write index block
  if (ok()) {
    if (r->pending_index_entry) {
      // An index entry for the last data block is still outstanding: add it
      // under a key derived from last_key via FindShortSuccessor().
      r->options.comparator->FindShortSuccessor(&r->last_key);
      std::string handle_encoding;
      r->pending_handle.EncodeTo(&handle_encoding);
      r->index_block.Add(r->last_key, Slice(handle_encoding));
      r->pending_index_entry = false;
    }
    WriteBlock(&r->index_block, &index_block_handle);
  }

  // Write footer
  if (ok()) {
    Footer footer;
    footer.set_metaindex_handle(metaindex_block_handle);
    footer.set_index_handle(index_block_handle);
    std::string footer_encoding;
    footer.EncodeTo(&footer_encoding);
    r->status = r->file->Append(footer_encoding);
    if (r->status.ok()) {
      // Advance the tracked file offset only when the append succeeded.
      r->offset += footer_encoding.size();
    }
  }
  return r->status;
}

Iterator

迭代器相关的代码位于include/leveldb/iterator.h、table/iterator.cc、table/iterator_wrapper.h、table/merger.h、table/merger.cc、table/two_level_iterator.h和table/two_level_iterator.cc

Iterator定义了Iterator的接口,任何Iterator都必须实现这些接口,需要注意的是Iterator中包含一个存储着CleanupFunction的单链表,当Iterator析构时会依次调用单链表中的CleanupFunction

table/iterator.cc定义了EmptyIterator,table/iterator_wrapper.h定义了IteratorWrapper,关于IteratorWrapper,源码中的解释很清楚:

A internal wrapper class with an interface similar to Iterator that caches the valid() and key() results for an underlying iterator. This can help avoid virtual function calls and also gives better cache locality.

leveldb中所有的迭代器都继承自Iterator,实现了声明的虚函数,使用多态的方式进行调用。这当然很方便,但是虚函数调用的效率比普通函数要低,因为虚函数地址需要在运行期决议出来:通过对象起始处的虚表指针(32位平台上即对象的前四个字节)找到虚表,取出目标函数地址再执行call指令。而普通函数调用的函数地址在编译完成时就已经确定下来了,显然效率更高。所以将 key 和 valid 存储下来,可以减少虚函数的调用次数。

MergingIterator

A merging iterator that provided the union of the data in children[0,n-1]. Takes ownership of the child iterators and will delete them when the result iterator is deleted.

The result does no duplicate suppression. I.e., if a particular key is present in K child iterators, it will be yielded K times.

MergingIterator包含了多个IteratorWrapper类型的子迭代器,放到成员变量children_中,direction_是一个枚举值:包含kForward和kReverse,还有一个IteratorWrapper类型的current_指向某个迭代器,一个比较器comparator_和子迭代器的数量n_

主要看一下Next(),当direction_为正向时,说明当前子迭代器指向的key为最小key,只需要执行子迭代器的next,然后调用FindSmallest()获取所有子迭代器的最小值即可;当direction_为反向时,当前子迭代器指向的key为最大key,所以直接调用next并不能保证值为下一个最小的值,因此需要让所有子迭代器都seek到当前子迭代器的key的位置,然后调用FindSmallest()获取所有子迭代器的最小值。Prev()同理

TwoLevelIterator

A two-level iterator contains an index iterator whose values point to a sequence of blocks where each block is itself a sequence of key,value pairs. The returned two-level iterator yields the concatenation of all key/value pairs in the sequence of blocks. Takes ownership of “index_iter” and will delete it when no longer needed.

TwoLevelIterator设计的比较通用,但是实际上只为Table::Iterator服务,这个跟sstable的格式有关系,sstable有index block和data block,index block存储了data block的索引信息,因此index block作为迭代器的第一级,data block作为迭代器的第二级

首先看一下TwoLevelIterator的成员变量:

  BlockFunction block_function_;  // Creates the second-level (data) iterator
  void* arg_;                     // Passed through to block_function_
  const ReadOptions options_;     // Passed through to block_function_
  Status status_;                 // NOTE(review): presumably the first error seen -- confirm
  IteratorWrapper index_iter_;    // First-level iterator (over the index)
  IteratorWrapper data_iter_;  // May be nullptr
  // If data_iter_ is non-null, then "data_block_handle_" holds the
  // "index_value" passed to block_function_ to create the data_iter_.
  std::string data_block_handle_;

block_function_为获取第二级迭代器的函数,arg_和options_是block_function_的参数,data_block_handle_保存了创建当前data_iter_时传给block_function_的index_value,index_iter_为第一级迭代器,data_iter_为第二级迭代器

需要关注三个函数:InitDataBlock()、SkipEmptyDataBlocksForward()和SkipEmptyDataBlocksBackward()

InitDataBlock()其实就是利用一级迭代器生成二级迭代器,然后调用SetDataIterator()用新的二级迭代器替换旧的二级迭代器(旧迭代器占用的内存随之释放),析构迭代器时会调用CleanupFunction进行清理

data_iter_进行遍历时,可能会碰到data block的边缘或者变成invalid,这时需要调用SkipEmptyDataBlocksForward()或SkipEmptyDataBlocksBackward()跳到下一个data block接着遍历

Table

Table和TableBuilder类似,也定义了Rep结构体,其中的优点可以查阅TableBuilder部分

Table的核心是Open()函数,其中读取了footer、index block、metaindex block、filter block

BlockReader()将一个index iterator的block handle转换为一个迭代器,用在TwoLevelIterator()中,这个函数里使用了block cache,block cache使用table的cache_id和block handle的offset拼成的16字节作为key,Lookup()返回cache handle,再通过cache handle取得缓存的Block;如果没有命中,则调用ReadBlock()从文件中读取,并在允许的情况下插入block cache

// Convert an index iterator value (i.e., an encoded BlockHandle)
// into an iterator over the contents of the corresponding block.
//
// "arg" is the Table the block belongs to. When the table's options carry a
// block cache, the block is looked up there first and only read from the file
// on a miss. If decoding the handle or reading the block fails, the returned
// iterator is an error iterator carrying the failing status.
Iterator* Table::BlockReader(void* arg, const ReadOptions& options,
                             const Slice& index_value) {
  Table* table = reinterpret_cast<Table*>(arg);
  Cache* block_cache = table->rep_->options.block_cache;
  Block* block = nullptr;
  Cache::Handle* cache_handle = nullptr;

  BlockHandle handle;
  Slice input = index_value;
  Status s = handle.DecodeFrom(&input);
  // We intentionally allow extra stuff in index_value so that we
  // can add more features in the future.

  if (s.ok()) {
    BlockContents contents;
    if (block_cache != nullptr) {
      // Cache key: the table's cache_id followed by the block's file offset,
      // each encoded as a fixed 64-bit value.
      char cache_key_buffer[16];
      EncodeFixed64(cache_key_buffer, table->rep_->cache_id);
      EncodeFixed64(cache_key_buffer + 8, handle.offset());
      Slice key(cache_key_buffer, sizeof(cache_key_buffer));
      cache_handle = block_cache->Lookup(key);
      if (cache_handle != nullptr) {
        // Cache hit: reuse the cached Block.
        block = reinterpret_cast<Block*>(block_cache->Value(cache_handle));
      } else {
        // Cache miss: read the block from the file.
        s = ReadBlock(table->rep_->file, options, handle, &contents);
        if (s.ok()) {
          block = new Block(contents);
          // Insert only when the contents are cachable and the caller asked
          // for the cache to be filled; otherwise cache_handle stays null.
          if (contents.cachable && options.fill_cache) {
            cache_handle = block_cache->Insert(key, block, block->size(),
                                               &DeleteCachedBlock);
          }
        }
      }
    } else {
      // No block cache configured: always read from the file.
      s = ReadBlock(table->rep_->file, options, handle, &contents);
      if (s.ok()) {
        block = new Block(contents);
      }
    }
  }

  Iterator* iter;
  if (block != nullptr) {
    iter = block->NewIterator(table->rep_->options.comparator);
    // Register the cleanup that matches how the block was obtained: without
    // a cache handle DeleteBlock is registered on the block itself; with one,
    // ReleaseBlock is registered on the cache and its handle.
    if (cache_handle == nullptr) {
      iter->RegisterCleanup(&DeleteBlock, block, nullptr);
    } else {
      iter->RegisterCleanup(&ReleaseBlock, block_cache, cache_handle);
    }
  } else {
    iter = NewErrorIterator(s);
  }
  return iter;
}

InternalGet()读取指定的(key, value),如果找到则执行handle_result()

Reference

LevelDB源码解析26. 二级迭代器
3000-leveldb精读系列-IteratorWrapper

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值