Rocksdb 代码学习 写流程2 (memtable写)

1.memtable 写

上一篇博客写到 RocksDB 将 WriteBatch 的内容写入 memtable,这一步主要用到的是 MemTableInserter 这个类的 SeekToColumnFamily 和 PutCF 这两个方法。

// WriteBatch::Handler implementation that applies the records of a write
// batch to the memtable of the column family each record targets.
//
// Only SeekToColumnFamily and PutCF are shown in this excerpt.
// NOTE(review): the excerpt closes the class with `}` and no trailing `;`;
// a real translation unit needs the terminating semicolon.
class MemTableInserter : public WriteBatch::Handler {
 public:
  // Sequence number assigned to the next record. PutCF increments it on
  // every call, including calls that skip the record.
  SequenceNumber sequence_;
  // Used to select the target column family (Seek) and to reach its memtable,
  // log number and column family handle.
  ColumnFamilyMemTables* cf_mems_;
  // When true, a record for an unknown column family is skipped with an OK
  // status instead of failing with InvalidArgument.
  bool ignore_missing_column_families_;
  // Non-zero only during recovery (see SeekToColumnFamily); compared against
  // the column family's log number to avoid applying an update twice.
  uint64_t log_number_;
  // Used by PutCF to read the previous value of a key when the in-place
  // callback path does not find the key in the memtable.
  DBImpl* db_;
  // Only consulted by the constructor's assertion in the code shown here.
  const bool dont_filter_deletes_;

  // Stores the arguments in the corresponding members.
  // cf_mems must be non-null; db must be non-null unless dont_filter_deletes
  // is true (both are checked with assert only).
  MemTableInserter(SequenceNumber sequence, ColumnFamilyMemTables* cf_mems,
                   bool ignore_missing_column_families, uint64_t log_number,
                   DB* db, const bool dont_filter_deletes)
      : sequence_(sequence),
        cf_mems_(cf_mems),
        ignore_missing_column_families_(ignore_missing_column_families),
        log_number_(log_number),
        db_(reinterpret_cast<DBImpl*>(db)),
        dont_filter_deletes_(dont_filter_deletes) {
    assert(cf_mems);
    if (!dont_filter_deletes_) {
      assert(db_);
    }
  }

  // Positions cf_mems_ on column_family_id.
  // Returns true when the record should be applied. Returns false when it
  // must be skipped; *s is then OK (missing-but-ignored column family, or the
  // column family already contains this log's updates) or InvalidArgument
  // (missing column family that is not ignored).
  bool SeekToColumnFamily(uint32_t column_family_id, Status* s) {
    // We are only allowed to call this from a single-threaded write thread
    // (or while holding DB mutex)
    // Look up this column family through cf_mems_.
    bool found = cf_mems_->Seek(column_family_id);
    if (!found) {
      if (ignore_missing_column_families_) {
        *s = Status::OK();
      } else {
        *s = Status::InvalidArgument(
            "Invalid column family specified in write batch");
      }
      return false;
    }
    if (log_number_ != 0 && log_number_ < cf_mems_->GetLogNumber()) {
      // This is true only in recovery environment (log_number_ is always 0 in
      // non-recovery, regular write code-path)
      // * If log_number_ < cf_mems_->GetLogNumber(), this means that column
      // family already contains updates from this log. We can't apply updates
      // twice because of update-in-place or merge workloads -- ignore the
      // update
      *s = Status::OK();
      return false;
    }
    return true;
  }
  // Applies one Put record. Returns the seek status when the record is
  // skipped, otherwise Status::OK(). sequence_ is incremented in both cases.
  virtual Status PutCF(uint32_t column_family_id, const Slice& key,
                       const Slice& value) override {
    Status seek_status;
    // If the column family cannot be used, return right away (the record
    // still consumes a sequence number). On success cf_mems_ is positioned on
    // the column family (presumably by updating its current ColumnFamilyData;
    // Seek itself is not shown here).
    if (!SeekToColumnFamily(column_family_id, &seek_status)) {
      ++sequence_;
      return seek_status;
    }
    // Memtable of the column family selected by the seek above.
    MemTable* mem = cf_mems_->GetMemTable();
    auto* moptions = mem->GetMemTableOptions();
    // In-place update is not enabled: add the record as a new entry.
    if (!moptions->inplace_update_support) {
      mem->Add(sequence_, kTypeValue, key, value);
      // In-place update is enabled but inplace_callback is null: just update
      // the record.
    } else if (moptions->inplace_callback == nullptr) {
      mem->Update(sequence_, key, value);
      RecordTick(moptions->statistics, NUMBER_KEYS_UPDATED);
    } else {
      // Otherwise update the record through the callback.
      if (mem->UpdateCallback(sequence_, key, value)) {
      } else {
        // In-place update is enabled but the key is not in the memtable:
        // read it through the DB, apply the callback, then add the result.
        // key not found in memtable. Do sst get, update, add
        // Set up a snapshot at the current sequence number for the read.
        SnapshotImpl read_from_snapshot;
        read_from_snapshot.number_ = sequence_;
        ReadOptions ropts;
        ropts.snapshot = &read_from_snapshot;

        std::string prev_value;
        std::string merged_value;

        auto cf_handle = cf_mems_->GetColumnFamilyHandle();
        if (cf_handle == nullptr) {
          cf_handle = db_->DefaultColumnFamily();
        }
        // Read the key's previous value into prev_value through the DB's Get,
        // using the snapshot configured above.
        Status s = db_->Get(ropts, cf_handle, key, &prev_value);

        // The callback receives null buffer/size pointers when the read did
        // not succeed.
        char* prev_buffer = const_cast<char*>(prev_value.c_str());
        uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
        auto status = moptions->inplace_callback(s.ok() ? prev_buffer : nullptr,
                                                 s.ok() ? &prev_size : nullptr,
                                                 value, &merged_value);
        if (status == UpdateStatus::UPDATED_INPLACE) {
          // The previous value was rewritten in place, i.e. the new value now
          // lives in the original buffer.
          // prev_value is updated in-place with final value.
          mem->Add(sequence_, kTypeValue, key, Slice(prev_buffer, prev_size));
          RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
        } else if (status == UpdateStatus::UPDATED) {
          // Not updated in place: store the merged value instead.
          // merged_value contains the final value.
          mem->Add(sequence_, kTypeValue, key, Slice(merged_value));
          RecordTick(moptions->statistics, NUMBER_KEYS_WRITTEN);
        }
      }
    }
    // Since all Puts are logged in transaction logs (if enabled), always bump
    // sequence number. Even if the update eventually fails and does not result
    // in memtable add/update.
    sequence_++;
    cf_mems_->CheckMemtableFull();
    return Status::OK();
  }
}

这个类在 WriteBatchInternal::InsertInto 这个方法中被构建,并作为参数传入 WriteBatch 的 Iterate 方法。

// Applies every record of batch `b` to `memtables`.
//
// Builds a MemTableInserter that starts numbering at the batch's own
// sequence number and hands it to WriteBatch::Iterate, whose status is
// returned unchanged. The remaining arguments are forwarded to the inserter
// as-is.
Status WriteBatchInternal::InsertInto(const WriteBatch* b,
                                      ColumnFamilyMemTables* memtables,
                                      bool ignore_missing_column_families,
                                      uint64_t log_number, DB* db,
                                      const bool dont_filter_deletes) {
  // Sequence number assigned to the first record of the batch.
  const auto first_sequence = WriteBatchInternal::Sequence(b);

  MemTableInserter inserter(first_sequence, memtables,
                            ignore_missing_column_families, log_number, db,
                            dont_filter_deletes);

  const Status result = b->Iterate(&inserter);
  return result;
}

而 WriteBatch::Iterate 这个方法做的事情,就是将 WriteBatch 的内容移除头部的12个字节后,一条条取出记录,然后根据记录的类型调用 handler(MemTableInserter)里对应的方法进行处理。

// Walks the serialized batch and dispatches each record to `handler`.
//
// Returns Corruption when the batch is shorter than its header, contains an
// unknown tag, or holds a different number of records than its stored count.
// A non-OK status from record parsing or from a handler callback is returned
// as-is. Iteration also stops as soon as handler->Continue() returns false.
Status WriteBatch::Iterate(Handler* handler) const {
  Slice input(rep_);
  // The batch must contain at least the fixed-size header.
  if (input.size() < kHeader) {
    return Status::Corruption("malformed WriteBatch (too small)");
  }
  // Skip the header; what follows is the sequence of records.
  input.remove_prefix(kHeader);

  Slice key, value, blob;
  Status s;
  int records_seen = 0;  // records compared against the batch count below

  while (s.ok() && !input.empty() && handler->Continue()) {
    char tag = 0;                // record type
    uint32_t column_family = 0;  // default

    s = ReadRecordFromWriteBatch(&input, &tag, &column_family, &key, &value,
                                 &blob);
    if (!s.ok()) {
      return s;
    }

    // Every record type except log data contributes to the record count.
    bool counted = true;
    switch (tag) {
      case kTypeColumnFamilyValue:
      case kTypeValue:
        s = handler->PutCF(column_family, key, value);
        break;

      case kTypeColumnFamilyDeletion:
      case kTypeDeletion:
        s = handler->DeleteCF(column_family, key);
        break;

      case kTypeColumnFamilySingleDeletion:
      case kTypeSingleDeletion:
        s = handler->SingleDeleteCF(column_family, key);
        break;

      case kTypeColumnFamilyMerge:
      case kTypeMerge:
        s = handler->MergeCF(column_family, key, value);
        break;

      case kTypeLogData:
        handler->LogData(blob);
        counted = false;
        break;

      default:
        return Status::Corruption("unknown WriteBatch tag");
    }

    if (counted) {
      ++records_seen;
    }
  }

  if (!s.ok()) {
    return s;
  }
  // The number of dispatched records must match the count stored in the
  // batch.
  if (records_seen == WriteBatchInternal::Count(this)) {
    return Status::OK();
  }
  return Status::Corruption("WriteBatch has wrong count");
}

因为是写操作,所以调用的是 MemTableInserter::PutCF

// Applies one Put record to the memtable of `column_family_id`.
//
// Returns the status produced by SeekToColumnFamily when the record is
// skipped, otherwise Status::OK(). sequence_ is advanced exactly once per
// call in either case.
virtual Status PutCF(uint32_t column_family_id, const Slice& key,
                     const Slice& value) override {
  Status seek_status;
  const bool usable = SeekToColumnFamily(column_family_id, &seek_status);
  if (!usable) {
    // A skipped record still consumes its sequence number.
    ++sequence_;
    return seek_status;
  }

  // Memtable of the column family selected by the seek above.
  MemTable* memtable = cf_mems_->GetMemTable();
  auto* mem_opts = memtable->GetMemTableOptions();

  if (!mem_opts->inplace_update_support) {
    // In-place update disabled: add the record as a new entry.
    memtable->Add(sequence_, kTypeValue, key, value);
  } else if (mem_opts->inplace_callback == nullptr) {
    // In-place update enabled without a callback: plain update.
    memtable->Update(sequence_, key, value);
    RecordTick(mem_opts->statistics, NUMBER_KEYS_UPDATED);
  } else if (!memtable->UpdateCallback(sequence_, key, value)) {
    // The callback-based update did not find the key in the memtable:
    // read the previous value through the DB, run the callback on it, and
    // add the outcome as a new entry.
    SnapshotImpl read_from_snapshot;
    read_from_snapshot.number_ = sequence_;
    ReadOptions read_opts;
    read_opts.snapshot = &read_from_snapshot;

    auto cf_handle = cf_mems_->GetColumnFamilyHandle();
    if (cf_handle == nullptr) {
      cf_handle = db_->DefaultColumnFamily();
    }

    std::string prev_value;
    Status get_status = db_->Get(read_opts, cf_handle, key, &prev_value);
    // When the read did not succeed the callback is given null pointers.
    const bool has_prev = get_status.ok();

    char* prev_buffer = const_cast<char*>(prev_value.c_str());
    uint32_t prev_size = static_cast<uint32_t>(prev_value.size());
    std::string merged_value;
    auto outcome = mem_opts->inplace_callback(
        has_prev ? prev_buffer : nullptr, has_prev ? &prev_size : nullptr,
        value, &merged_value);

    if (outcome == UpdateStatus::UPDATED_INPLACE) {
      // prev_value was rewritten in place and now holds the final value.
      memtable->Add(sequence_, kTypeValue, key,
                    Slice(prev_buffer, prev_size));
      RecordTick(mem_opts->statistics, NUMBER_KEYS_WRITTEN);
    } else if (outcome == UpdateStatus::UPDATED) {
      // merged_value holds the final value.
      memtable->Add(sequence_, kTypeValue, key, Slice(merged_value));
      RecordTick(mem_opts->statistics, NUMBER_KEYS_WRITTEN);
    }
  }

  // Every Put is logged in the transaction log (if enabled), so the sequence
  // number always advances, even when nothing was added to or updated in the
  // memtable.
  sequence_++;
  cf_mems_->CheckMemtableFull();
  return Status::OK();
}

而 PutCF 主要是调用 MemTable::Add 方法往 memtable 里面添加记录,添加时会先对记录进行编码(长度使用变长整型,以节省空间)。

// Encodes one (key, sequence, type, value) record and inserts it into the
// memtable, then updates the entry/size/delete counters, the optional prefix
// bloom filter, the first/earliest sequence numbers and the flush flag.
//
// `key` is the user key; the stored internal key is the user key followed by
// 8 bytes packing the sequence number and the value type.
void MemTable::Add(SequenceNumber s, ValueType type,
                   const Slice& key, /* user key */
                   const Slice& value) {
  // Format of an entry is concatenation of:
  //  key_size     : varint32 of internal_key.size()
  //  key bytes    : char[internal_key.size()]
  //  value_size   : varint32 of value.size()
  //  value bytes  : char[value.size()]
  //
  // Lengths are written as variable-length integers to save space.
  const uint32_t user_key_len = static_cast<uint32_t>(key.size());
  const uint32_t value_len = static_cast<uint32_t>(value.size());
  const uint32_t ikey_len = user_key_len + 8;

  // Total number of bytes the encoded entry occupies.
  const uint32_t entry_len = VarintLength(ikey_len) + ikey_len +
                             VarintLength(value_len) + value_len;

  // Obtain a buffer of entry_len bytes from the underlying table.
  char* entry = nullptr;
  KeyHandle handle = table_->Allocate(entry_len, &entry);
  assert(entry != nullptr);

  // Write, in order: internal key length, user key bytes, packed
  // sequence/type, value length, value bytes.
  char* cursor = EncodeVarint32(entry, ikey_len);
  memcpy(cursor, key.data(), user_key_len);
  cursor += user_key_len;
  EncodeFixed64(cursor, PackSequenceAndType(s, type));
  cursor += 8;
  cursor = EncodeVarint32(cursor, value_len);
  memcpy(cursor, value.data(), value_len);
  assert(static_cast<unsigned>(cursor + value_len - entry) ==
         static_cast<unsigned>(entry_len));

  // Insert the encoded entry.
  // Author's note: the in-memory memtable can be a skiplist, a hash-skiplist
  // (one skiplist per hash bucket) or a hash-linklist (one linked list per
  // hash bucket); which one is used is chosen through configuration.
  table_->Insert(handle);

  const auto entries_before = num_entries_.load(std::memory_order_relaxed);
  num_entries_.store(entries_before + 1, std::memory_order_relaxed);
  const auto bytes_before = data_size_.load(std::memory_order_relaxed);
  data_size_.store(bytes_before + entry_len, std::memory_order_relaxed);

  if (type == kTypeDeletion) {
    ++num_deletes_;
  }

  if (prefix_bloom_) {
    assert(prefix_extractor_);
    prefix_bloom_->Add(prefix_extractor_->Transform(key));
  }

  // The first sequence number inserted into the memtable
  assert(first_seqno_ == 0 || s > first_seqno_);
  if (first_seqno_ == 0) {
    first_seqno_ = s;

    if (earliest_seqno_ == kMaxSequenceNumber) {
      earliest_seqno_ = first_seqno_;
    }
    assert(first_seqno_ >= earliest_seqno_);
  }

  should_flush_ = ShouldFlushNow();
}

这样完成了memtable表的记录添加

参考
http://kernelmaker.github.io/Rocksdb_Study_4
http://www.cnblogs.com/KevinT/category/590804.html

  • 1
    点赞
  • 5
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值