【leveldb】SSTable(十四):SSTable 读写流程

SSTable是leveldb最终落盘存储的文件格式,关于SSTable的详细格式可参见《SSTable存储结构说明》一文。本篇主要研读SSTable读写流程的代码。

写流程

写流程就是按照SSTable的格式去写,阅读起来并不是太复杂。

namespace leveldb {

// Private state of a TableBuilder: the output file, the data/index blocks
// currently being built, and the bookkeeping needed to emit index entries.
struct TableBuilder::Rep {
  // Copies `opt` twice (once for data blocks, once for the index block) and
  // creates a filter block builder only when a filter policy is configured.
  Rep(const Options& opt, WritableFile* f)
      : options(opt),
        index_block_options(opt),
        file(f),
        offset(0),
        data_block(&options),
        index_block(&index_block_options),
        num_entries(0),
        closed(false),
        filter_block(opt.filter_policy == nullptr
                         ? nullptr
                         : new FilterBlockBuilder(opt.filter_policy)),
        pending_index_entry(false) {
    // The index block always uses a restart interval of 1.
    index_block_options.block_restart_interval = 1;
  }

  // Options used when building data blocks (data_block points at this).
  Options options;
  // Options used when building the index block; identical to `options`
  // except that block_restart_interval is forced to 1.
  Options index_block_options;
  // Destination file the table is appended to.
  WritableFile* file;
  // Number of bytes appended so far, i.e. the file offset at which the next
  // block will start.
  uint64_t offset;
  // Result of the most recent file operation; see TableBuilder::status().
  Status status;
  // Builder for the data block currently being filled.
  BlockBuilder data_block;
  // Builder for the index block (one entry per flushed data block).
  BlockBuilder index_block;
  // Most recently added key. Add()/Finish() may rewrite it into a shorter
  // separator before using it as the key of an index entry.
  std::string last_key;
  // Number of key/value pairs added to the table so far.
  int64_t num_entries;
  bool closed;  // Either Finish() or Abandon() has been called.
  // Filter block builder, or nullptr when no filter policy is configured.
  FilterBlockBuilder* filter_block;

  // We do not emit the index entry for a block until we have seen the
  // first key for the next data block.  This allows us to use shorter
  // keys in the index block.  For example, consider a block boundary
  // between the keys "the quick brown fox" and "the who".  We can use
  // "the r" as the key for the index block entry since it is >= all
  // entries in the first block and < all entries in subsequent
  // blocks.
  //
  // Invariant: r->pending_index_entry is true only if data_block is empty.
  // When true, an index entry for the previously flushed data block still
  // has to be written to index_block.
  bool pending_index_entry;
  // Offset and size of the last flushed data block, used for that entry.
  BlockHandle pending_handle;  // Handle to add to index block
  // Scratch buffer that temporarily holds compressed block contents.
  std::string compressed_output;
};

// Creates a builder that writes a table to `file` using `options`.
// If a filter block is in use, it is told that the first data block
// starts at file offset 0.
TableBuilder::TableBuilder(const Options& options, WritableFile* file)
    : rep_(new Rep(options, file)) {
  if (rep_->filter_block == nullptr) {
    return;  // No filter policy configured: nothing more to set up.
  }
  rep_->filter_block->StartBlock(0);
}

// Releases the builder's internal state. The caller must already have
// called Finish() or Abandon(); the assert catches a forgotten call.
TableBuilder::~TableBuilder() {
  auto* const rep = rep_;
  assert(rep->closed);  // Catch errors where caller forgot to call Finish()
  delete rep->filter_block;
  delete rep;
}

// Replaces the options used by this builder while a table is being built.
//
// Changing the comparator is rejected with InvalidArgument, because keys
// already added were ordered by the old comparator. All other fields are
// accepted as-is.
//
// Note: if more fields are added to Options, update this function to catch
// changes that should not be allowed in the middle of building a Table.
Status TableBuilder::ChangeOptions(const Options& options) {
  Rep* const rep = rep_;

  const bool same_comparator = (options.comparator == rep->options.comparator);
  if (!same_comparator) {
    return Status::InvalidArgument("changing comparator while building table");
  }

  // Any live BlockBuilders point to rep_->options / index_block_options and
  // therefore pick up the updated values automatically.
  rep->options = options;
  rep->index_block_options = options;
  // The index block always keeps a restart interval of 1.
  rep->index_block_options.block_restart_interval = 1;
  return Status::OK();
}

//<!添加一个KV数据>
void TableBuilder::Add(const Slice& key, const Slice& value) {
  Rep* r = rep_;
  //<!未关闭状态>
  assert(!r->closed);
  if (!ok()) return;
  /*
  <!如果当前num_entries > 0,表示已存储了key。
  上层传过来的key已保证从小到达的顺序,
  所以新加入的key肯定大于已存在key数据的最后一个key。
  >
  */
  if (r->num_entries > 0) {
    assert(r->options.comparator->Compare(key, Slice(r->last_key)) > 0);
  }

  /*
  <!pending_index_entry为true表示需要往index_block中写入一条DataBlock记录,
  数据格式就是| key | offset | size|。一般在写入了一个DataBlock
  之后,后续来了新key就要写一条DataBlock记录。
  >
  */
  if (r->pending_index_entry) {
    assert(r->data_block.empty());
	/*
	<!新写入到IndexBlock中的记录,对Key的要求是
	大于等于当前DataBlock数据块的最大,同时要小于接下来要写入的DataBlock数据块的最小Key;
	而当前这个key就是新的接下来要写入的DataBlock第一个key,也是其最小key。
	这个通过FindShortestSeparator()方法找到满足上述条件的key。
	>
	*/
    r->options.comparator->FindShortestSeparator(&r->last_key, key);
	//<!将上一个即当前已存在的DataBlock的Offset和Size,Var编码到handle_encoding>
    std::string handle_encoding;
    r->pending_handle.EncodeTo(&handle_encoding);
	//<!往IndexBlock中写入一条记录>
    r->index_block.Add(r->last_key, Slice(handle_encoding));
	//<!表示接下来不用再往IndexBlock中写入一条记录了>
    r->pending_index_entry = false;
  }

  // <!如果开启了过滤策略,则会有filter_block,则添加用于过滤的key>  
  if (r->filter_block != nullptr) {
    r->filter_block->AddKey(key);
  }

  // <!更新下最新的最后一个key>
  r->last_key.assign(key.data(), key.size());
  //<!KV次数累加>
  r->num_entries++;
  //<!往DataBlock中添加KV>
  r->data_block.Add(key, value);

  /*
  <!当DataBlock达到指定大小block_size,一般是4KB,用户可设置。
  就要将DataBlock数据落地到磁盘,即SSTable文件中。
  >
  */
  const size_t estimated_block_size = r->data_block.CurrentSizeEstimate();
  if (estimated_block_size >= r->options.block_size) {
    Flush();
  }
}

//<!这个方法主要是将数据刷到磁盘>
void TableBuilder::Flush() {

  //<!开始是一些条件检测>  
  Rep* r = rep_;
  assert(!r->closed);
  if (!ok()) return;
  if (r->data_block.empty()) return;

  /*
  <!pending_index_entry应为false,
  可以说明之前已一一对应写了一个
  DataBlock对应的记录到IndexBlock中
  >
  */
  assert(!r->pending_index_entry);
  //<!内部实现回刷数据>
  WriteBlock(&r->data_block, &r->pending_handle);
  if (ok()) {
    
    //<!如果写成功了,将pending_index_entry置位true,表示接下来要写一条记录到indexBlock>	
    r->pending_index_entry = true;
	//<!执行性flush,保证写的数据都正确落地到磁盘>
    r->status = r->file->Flush();
  }
  //<!这里检测下要不要在FilterBlock中生成一个新的Filter> 
  if (r->filter_block != nullptr) {
    r->filter_block->StartBlock(r->offset);
  }
}

// Finalizes `block`, optionally compresses it, writes it to the file via
// WriteRawBlock(), stores its location in `*handle`, and resets `block`.
//
// File format contains a sequence of blocks where each block has:
//    block_data: uint8[n]
//    type: uint8
//    crc: uint32
void TableBuilder::WriteBlock(BlockBuilder* block, BlockHandle* handle) {
  assert(ok());
  Rep* const rep = rep_;

  // Serialized, uncompressed block contents.
  Slice raw = block->Finish();

  Slice block_contents;
  CompressionType type = rep->options.compression;
  // TODO(postrelease): Support more compression options: zlib?
  if (type == kNoCompression) {
    block_contents = raw;
  } else if (type == kSnappyCompression) {
    std::string& scratch = rep->compressed_output;
    const bool worthwhile =
        port::Snappy_Compress(raw.data(), raw.size(), &scratch) &&
        scratch.size() < raw.size() - (raw.size() / 8u);
    if (worthwhile) {
      block_contents = scratch;
    } else {
      // Snappy not supported, or compressed less than 12.5%, so just
      // store uncompressed form
      block_contents = raw;
      type = kNoCompression;
    }
  }

  // Append contents plus trailer to the file.
  WriteRawBlock(block_contents, type, handle);

  // Clear scratch state so the builder can be reused.
  rep->compressed_output.clear();
  block->Reset();
}

void TableBuilder::WriteRawBlock(const Slice& block_contents,
                                 CompressionType type, BlockHandle* handle) {
  Rep* r = rep_;
  /*
  <!记下DataBlock在整个SSTable中的偏移位offset,
  以及DataBlock的大小size,用于上层写IndexBlock用。
  >
  */
  handle->set_offset(r->offset);
  handle->set_size(block_contents.size());
  //<!将DataBlock数据写入文件中>
  r->status = r->file->Append(block_contents);
  if (r->status.ok()) {

    /*
	<!因为每个DataBlock后面跟着 CompressionType(1Byte) + CRC32(4Byte)
	所以接下来就是写这些。
	>
	*/
    char trailer[kBlockTrailerSize];
    trailer[0] = type;
    uint32_t crc = crc32c::Value(block_contents.data(), block_contents.size());
    crc = crc32c::Extend(crc, trailer, 1);  // Extend crc to cover block type
    EncodeFixed32(trailer + 1, crc32c::Mask(crc));
    r->status = r->file->Append(Slice(trailer, kBlockTrailerSize));
    if (r->status.ok()) {
      //<!写成功之后,更新下SSTable接下来写新的DataBlock的偏移位offset> 
      r->offset += block_contents.size() + kBlockTrailerSize;
    }
  }
}

// Returns the builder's current status, i.e. the result of the most recent
// file operation recorded in the internal state.
Status TableBuilder::status() const {
  return rep_->status;
}

// Completes the table: flushes the last data block, then writes, in order,
// the filter block (if any), the metaindex block, the index block and the
// footer. Each step is skipped once the builder is in an error state.
//
// Marks the builder as closed and returns the final status.
Status TableBuilder::Finish() {
  Rep* const rep = rep_;
  Flush();
  assert(!rep->closed);
  rep->closed = true;

  BlockHandle filter_handle;
  BlockHandle metaindex_handle;
  BlockHandle index_handle;

  // Write filter block (always stored uncompressed).
  if (ok() && rep->filter_block != nullptr) {
    WriteRawBlock(rep->filter_block->Finish(), kNoCompression, &filter_handle);
  }

  // Write metaindex block.
  if (ok()) {
    BlockBuilder metaindex(&rep->options);
    if (rep->filter_block != nullptr) {
      // Add mapping from "filter.Name" to location of filter data
      std::string filter_key = "filter.";
      filter_key.append(rep->options.filter_policy->Name());
      std::string encoded;
      filter_handle.EncodeTo(&encoded);
      metaindex.Add(filter_key, encoded);
    }

    // TODO(postrelease): Add stats and other meta blocks
    WriteBlock(&metaindex, &metaindex_handle);
  }

  // Write index block.
  if (ok()) {
    if (rep->pending_index_entry) {
      // No next key exists for the final data block, so shorten last_key
      // to a short successor and emit the owed index entry.
      rep->options.comparator->FindShortSuccessor(&rep->last_key);
      std::string encoded;
      rep->pending_handle.EncodeTo(&encoded);
      rep->index_block.Add(rep->last_key, Slice(encoded));
      rep->pending_index_entry = false;
    }
    WriteBlock(&rep->index_block, &index_handle);
  }

  // Write footer, which records the metaindex and index block handles.
  if (ok()) {
    Footer footer;
    footer.set_metaindex_handle(metaindex_handle);
    footer.set_index_handle(index_handle);
    std::string encoded_footer;
    footer.EncodeTo(&encoded_footer);
    rep->status = rep->file->Append(encoded_footer);
    if (rep->status.ok()) {
      rep->offset += encoded_footer.size();
    }
  }

  return rep->status;
}

// Marks the builder as closed without writing anything further.
// Must not be called on a builder that is already closed.
void TableBuilder::Abandon() {
  assert(!rep_->closed);
  rep_->closed = true;
}

// Returns the number of key/value pairs added to the table so far.
uint64_t TableBuilder::NumEntries() const {
  return rep_->num_entries;
}

// Returns the number of bytes successfully appended to the file so far.
uint64_t TableBuilder::FileSize() const {
  return rep_->offset;
}

}  // namespace leveldb
读流程

读流程中涉及到了Table Cache等知识,待后续篇章去解读。
针对读流程中出现的二级迭代器,会在下篇文章中介绍。

namespace leveldb {

// Private state of an open Table.
struct Table::Rep {
  // Frees the filter reader, the filter data buffer (if one was retained)
  // and the index block.
  ~Rep() {
    delete filter;
    delete[] filter_data;
    delete index_block;
  }

  // Options the table was opened with.
  Options options;
  Status status;
  // File the table is read from.
  RandomAccessFile* file;
  // Id combined with a block offset to form block-cache keys
  // (0 when no block cache is configured).
  uint64_t cache_id;
  // Reader for the filter block, or nullptr when no filter was loaded.
  FilterBlockReader* filter;
  // Heap buffer backing the filter block, or nullptr when nothing needs to
  // be freed.
  const char* filter_data;

  BlockHandle metaindex_handle;  // Handle to metaindex_block: saved from footer
  // Index block, read when the table is opened.
  Block* index_block;
};

// Opens the table stored in the first `size` bytes of `file`.
//
// Reads and decodes the footer, then loads the index block so later lookups
// can consult it without re-reading it from the file. On success `*table`
// points to a new Table and the filter metadata has been loaded on a
// best-effort basis. On failure `*table` is nullptr and the error is
// returned.
Status Table::Open(const Options& options, RandomAccessFile* file,
                   uint64_t size, Table** table) {
  *table = nullptr;

  // The file must at least be large enough to hold a footer.
  if (size < Footer::kEncodedLength) {
    return Status::Corruption("file is too short to be an sstable");
  }

  // Read the footer from the tail of the file.
  char footer_buf[Footer::kEncodedLength];
  Slice footer_slice;
  Status s = file->Read(size - Footer::kEncodedLength, Footer::kEncodedLength,
                        &footer_slice, footer_buf);
  if (!s.ok()) return s;

  // Decode the metaindex and index block handles.
  Footer footer;
  s = footer.DecodeFrom(&footer_slice);
  if (!s.ok()) return s;

  // Read the index block, verifying checksums when paranoid checks are on.
  BlockContents index_contents;
  ReadOptions read_opts;
  if (options.paranoid_checks) {
    read_opts.verify_checksums = true;
  }
  s = ReadBlock(file, read_opts, footer.index_handle(), &index_contents);
  if (!s.ok()) return s;

  // We've successfully read the footer and the index block: we're
  // ready to serve requests.
  Block* index_block = new Block(index_contents);
  Rep* rep = new Table::Rep;
  rep->options = options;
  rep->file = file;
  rep->metaindex_handle = footer.metaindex_handle();
  rep->index_block = index_block;
  // Id used as a prefix for this table's block-cache keys.
  rep->cache_id = (options.block_cache ? options.block_cache->NewId() : 0);
  rep->filter_data = nullptr;
  rep->filter = nullptr;
  *table = new Table(rep);

  // Load the filter block, if any.
  (*table)->ReadMeta(footer);
  return s;
}

void Table::ReadMeta(const Footer& footer) {
  //过滤策略都没有,那就可以确定没必要读filter block了
  if (rep_->options.filter_policy == nullptr) {
    return;  // Do not need any metadata
  }

  // TODO(sanjay): Skip this if footer.metaindex_handle() size indicates
  // it is an empty block.
  ReadOptions opt;
  if (rep_->options.paranoid_checks) {
    opt.verify_checksums = true;
  }

  //根据metaindex_handle读取metaindex block
  BlockContents contents;
  if (!ReadBlock(rep_->file, opt, footer.metaindex_handle(), &contents).ok()) {
    // Do not propagate errors since meta info is not needed for operation
    return;
  }

  //这里是疑惑的地方!!!!!!
  Block* meta = new Block(contents);

  Iterator* iter = meta->NewIterator(BytewiseComparator());
  std::string key = "filter.";
  key.append(rep_->options.filter_policy->Name());
  iter->Seek(key);
  if (iter->Valid() && iter->key() == Slice(key)) {

    //根据metaindex的offset+size去读取filter block
    ReadFilter(iter->value());
  }
  delete iter;
  delete meta;
}

void Table::ReadFilter(const Slice& filter_handle_value) {
  Slice v = filter_handle_value;
  BlockHandle filter_handle;
  if (!filter_handle.DecodeFrom(&v).ok()) {
    return;
  }

  // We might want to unify with ReadBlock() if we start
  // requiring checksum verification in Table::Open.
  ReadOptions opt;
  if (rep_->options.paranoid_checks) {
    opt.verify_checksums = true;
  }

  //读取filter block 数据
  BlockContents block;
  if (!ReadBlock(rep_->file, opt, filter_handle, &block).ok()) {
    return;
  }

  //如果heap_allocated为true表示读取
  //filter block的时候new了内存,后续需要删除
  if (block.heap_allocated) {
    rep_->filter_data = block.data.data();  // Will need to delete later
  }

  //构造一个读取filter block的实例
  rep_->filter = new FilterBlockReader(rep_->options.filter_policy, block.data);
}

// Destroys the table together with its internal state.
Table::~Table() {
  delete rep_;
}

static void DeleteBlock(void* arg, void* ignored) {
  delete reinterpret_cast<Block*>(arg);
}

// Cache deleter callback: deletes the Block stored as the cache value.
// The cache key is unused.
static void DeleteCachedBlock(const Slice& key, void* value) {
  delete reinterpret_cast<Block*>(value);
}

static void ReleaseBlock(void* arg, void* h) {
  Cache* cache = reinterpret_cast<Cache*>(arg);
  Cache::Handle* handle = reinterpret_cast<Cache::Handle*>(h);
  cache->Release(handle);
}

// Convert an index iterator value (i.e., an encoded BlockHandle)
// into an iterator over the contents of the corresponding block.
//
// `arg` is the Table the block belongs to. When a block cache is configured
// the block is looked up there first and, on a miss, read from the file and
// inserted if it is cachable and options.fill_cache is set. Returns an error
// iterator when the handle cannot be decoded or the block cannot be read.
Iterator* Table::BlockReader(void* arg, const ReadOptions& options,
                             const Slice& index_value) {
  Table* const tbl = reinterpret_cast<Table*>(arg);
  Cache* const cache = tbl->rep_->options.block_cache;
  Block* blk = nullptr;
  Cache::Handle* pinned = nullptr;

  BlockHandle handle;
  Slice remaining = index_value;
  Status s = handle.DecodeFrom(&remaining);
  // We intentionally allow extra stuff in index_value so that we
  // can add more features in the future.

  if (s.ok()) {
    BlockContents contents;
    if (cache == nullptr) {
      // No block cache: read straight from the file.
      s = ReadBlock(tbl->rep_->file, options, handle, &contents);
      if (s.ok()) {
        blk = new Block(contents);
      }
    } else {
      // Cache key = table's cache id followed by the block offset.
      char cache_key_buffer[16];
      EncodeFixed64(cache_key_buffer, tbl->rep_->cache_id);
      EncodeFixed64(cache_key_buffer + 8, handle.offset());
      Slice key(cache_key_buffer, sizeof(cache_key_buffer));
      pinned = cache->Lookup(key);

      if (pinned == nullptr) {
        // Cache miss: read from the file and optionally populate the cache.
        s = ReadBlock(tbl->rep_->file, options, handle, &contents);
        if (s.ok()) {
          blk = new Block(contents);
          if (contents.cachable && options.fill_cache) {
            pinned =
                cache->Insert(key, blk, blk->size(), &DeleteCachedBlock);
          }
        }
      } else {
        // Cache hit: use the cached block.
        blk = reinterpret_cast<Block*>(cache->Value(pinned));
      }
    }
  }

  if (blk == nullptr) {
    // Decoding or reading failed: surface the error through the iterator.
    return NewErrorIterator(s);
  }

  Iterator* iter = blk->NewIterator(tbl->rep_->options.comparator);
  if (pinned != nullptr) {
    // Block is held through a cache handle: release the handle when the
    // iterator is cleaned up.
    iter->RegisterCleanup(&ReleaseBlock, cache, pinned);
  } else {
    // Block is not held through the cache: delete it when the iterator is
    // cleaned up.
    iter->RegisterCleanup(&DeleteBlock, blk, nullptr);
  }
  return iter;
}

// Returns a two-level iterator over the table: the first level walks the
// index block, and BlockReader turns each index value into an iterator
// over the corresponding block.
Iterator* Table::NewIterator(const ReadOptions& options) const {
  Iterator* index_iter =
      rep_->index_block->NewIterator(rep_->options.comparator);
  Table* self = const_cast<Table*>(this);
  return NewTwoLevelIterator(index_iter, &Table::BlockReader, self, options);
}

// Seeks to `k` in the table and, if an entry is found there, calls
// (*handle_result)(arg, found_key, found_value).
//
// The index block is consulted first to locate the candidate data block.
// When a filter is loaded and reports that the key cannot be in that block,
// the data block is not read at all. The callback receives whatever entry
// the seek lands on (presumably the caller compares the key itself).
// Returns the first non-OK status from the block or index iterator.
Status Table::InternalGet(const ReadOptions& options, const Slice& k, void* arg,
                          void (*handle_result)(void*, const Slice&,
                                                const Slice&)) {
  Status s;
  Iterator* index_iter =
      rep_->index_block->NewIterator(rep_->options.comparator);
  index_iter->Seek(k);

  if (index_iter->Valid()) {
    // The index value is the encoded handle of the candidate data block.
    Slice handle_value = index_iter->value();
    FilterBlockReader* filter = rep_->filter;
    BlockHandle handle;

    // Skip the block only if a filter exists, the handle decodes, and the
    // filter rules the key out for the block at that offset.
    const bool filtered_out = filter != nullptr &&
                              handle.DecodeFrom(&handle_value).ok() &&
                              !filter->KeyMayMatch(handle.offset(), k);
    if (!filtered_out) {
      // The filter (if any) says the key may be present: search the block.
      Iterator* block_iter = BlockReader(this, options, index_iter->value());
      block_iter->Seek(k);
      if (block_iter->Valid()) {
        (*handle_result)(arg, block_iter->key(), block_iter->value());
      }
      s = block_iter->status();
      delete block_iter;
    }
  }

  if (s.ok()) {
    s = index_iter->status();
  }
  delete index_iter;
  return s;
}


//预估key的大致偏移位。
//1、在index_block中查找到了就返回index_block中对应的DataBlock的offset。
//2、如果在index_block中查找到了但是无法解码出offset+size,就默认给metaindex_block的offset。
//3、Seek是查到大于等于这个key的值,若未找到,说明这个key比较大,默认给metaindex_block的offset。
uint64_t Table::ApproximateOffsetOf(const Slice& key) const {
  Iterator* index_iter =
      rep_->index_block->NewIterator(rep_->options.comparator);
  index_iter->Seek(key);
  uint64_t result;
  if (index_iter->Valid()) {
    BlockHandle handle;
    Slice input = index_iter->value();
    Status s = handle.DecodeFrom(&input);
    if (s.ok()) {
      result = handle.offset();
    } else {
      // Strange: we can't decode the block handle in the index block.
      // We'll just return the offset of the metaindex block, which is
      // close to the whole file size for this case.
      result = rep_->metaindex_handle.offset();
    }
  } else {
    // key is past the last key in the file.  Approximate the offset
    // by returning the offset of the metaindex block (which is
    // right near the end of the file).
    result = rep_->metaindex_handle.offset();
  }
  delete index_iter;
  return result;
}

}  // namespace leveldb
  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值