leveldb:write(如何处理并发写操作)

Put与Delete操作

// Convenience entry point for a single insertion: the key/value pair is
// recorded in a one-entry WriteBatch, which is then handed to Write().
// Returns whatever status Write() reports.
Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
  WriteBatch single_put;
  single_put.Put(key, value);
  return Write(opt, &single_put);
}

// Convenience entry point for a single deletion: the key is recorded in a
// one-entry WriteBatch, which is then handed to Write().
// Returns whatever status Write() reports.
Status DB::Delete(const WriteOptions& opt, const Slice& key) {
  WriteBatch single_delete;
  single_delete.Delete(key);
  return Write(opt, &single_delete);
}

LevelDB对外暴露的写接口包括Put,Delete和Write,其中Write需要WriteBatch作为参数,而Put和Delete首先就是将当前的操作封装到一个WriteBatch对象,并调用Write接口。
opt是写选项。从上面的代码中看不出任何处理并发的逻辑,其实对多线程写入的处理都是在DBImpl::Write函数中完成的。

WriteBatch

WriteBatch可以记录许多个操作,每一个操作代表着要插入或删除相应数据

// Buffers a sequence of Put/Delete updates so that they can be handed to the
// database as one unit. All buffered updates are kept in a single
// serialized string (rep_).
class WriteBatch {
 public:
  WriteBatch();
  ~WriteBatch();

  // Store the mapping "key->value" in the database.
  void Put(const Slice& key, const Slice& value);

  // If the database contains a mapping for "key", erase it.  Else do nothing.
  void Delete(const Slice& key);

  // Clear all updates buffered in this batch.
  void Clear();

  // Support for iterating over the contents of a batch.
  // Callers subclass Handler and pass it to Iterate(); presumably Iterate()
  // invokes Put()/Delete() once per buffered record -- confirm against the
  // implementation, which is not shown here.
  class Handler {
   public:
    virtual ~Handler();
    virtual void Put(const Slice& key, const Slice& value) = 0;
    virtual void Delete(const Slice& key) = 0;
  };
  Status Iterate(Handler* handler) const;

 private:
  // WriteBatchInternal is granted access to rep_.
  friend class WriteBatchInternal;

  std::string rep_;  // The only data member: one string that stores every buffered operation.
};

每一个WriteBatch都是以一个固定长度的头部开始,然后后面接着许多连续的记录(插入或删除操作)
固定头部格式:
固定头部共12字节:前8字节为WriteBatch的序列号(也就是该Batch中第一个操作对应的全局序列号,后续操作的序列号依次递增),对应rep_[0]到rep_[7],只有在真正处理Batch中的记录时才会被设置;后4字节为当前Batch中的记录数,对应rep_[8]到rep_[11]。
后面的记录结构为:
插入数据时:type(取值为kTypeValue),Key_size,Key,Value_size,Value
删除数据时:type(取值为kTypeDeletion),Key_size,Key
(原文此处为WriteBatch中rep_存储格式的示意图:12字节固定头部,后接若干条记录)
WriteBatchInternal是WriteBatch的友元类,它提供了一系列静态接口(如SetSequence、Count、Contents、ByteSize、Append、InsertInto)来封装对WriteBatch内部数据rep_的操作,这样DB内部的代码就不需要直接读写rep_。

DBImpl::Write

// Applies my_batch to the database while serializing concurrent callers.
//
// Every caller wraps its batch in a Writer and appends it to the writers_
// queue. Only the writer at the front of the queue proceeds; it may merge the
// batches of the writers queued behind it (see BuildBatchGroup), append the
// merged batch to the log, insert it into the memtable, and then mark those
// other writers as done and wake them. A writer that wakes up with done set
// simply returns the status that was stored for it.
//
// options.sync: when true, the log file is synced after the record is added.
// my_batch:     the updates to apply. A NULL batch skips the log/memtable
//               step entirely and only runs MakeRoomForWrite(true) --
//               presumably used to force a memtable switch; confirm with the
//               callers, which are not shown here.
// Returns the status of the write, or the status recorded by the writer that
// completed this batch on the caller's behalf.
Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
/*struct DBImpl::Writer {
*  WriteBatch* batch;
*  bool sync;
*  bool done;
* port::CondVar cv;
*};
* Writer wraps a WriteBatch. Beyond the batch it carries a condition variable
* (cv) used to synchronize the queued threads and a done flag telling whether
* the batch has already been completed. (The code below also uses a status
* field that this abbreviated listing omits.)
*/
  Writer w(&mutex_);
  w.batch = my_batch;
  w.sync = options.sync;
  w.done = false;

  // Take the lock: w is about to be inserted into the shared queue writers_.
  MutexLock l(&mutex_);
  writers_.push_back(&w);
  // Keep waiting while w is neither finished nor at the front of the queue.
  while (!w.done && &w != writers_.front()) {
    w.cv.Wait();
  }
  // Another thread may already have completed this batch as part of the
  // merged group described below; in that case just report its result.
  if (w.done) {
    return w.status;
  }

  // May temporarily unlock and wait.
  Status status = MakeRoomForWrite(my_batch == NULL);
  uint64_t last_sequence = versions_->LastSequence();
  Writer* last_writer = &w;
  if (status.ok() && my_batch != NULL) {  // a NULL batch has nothing to log or apply
    // Merge the batches queued in writers_ into one batch; last_writer is
    // updated to the last writer whose batch was included.
    WriteBatch* updates = BuildBatchGroup(&last_writer);
    // Stamp the merged batch with the sequence number of its first operation.
    WriteBatchInternal::SetSequence(updates, last_sequence + 1);
    // Advance the sequence number by the number of operations in the batch.
    last_sequence += WriteBatchInternal::Count(updates);

    {
      // Writing the log is expensive, so the lock is released here to improve
      // concurrency: other threads can append new writers to writers_ in the
      // meantime. They will block in the wait loop above because &w is still
      // at the front of the queue.
      mutex_.Unlock();
      // Append the serialized contents of the batch to the log file log_.
      status = log_->AddRecord(WriteBatchInternal::Contents(updates));
      // NOTE(review): sync_error is assigned below but never read in this
      // excerpt. Upstream LevelDB presumably checks it after re-locking to
      // record a background error -- confirm against the full source.
      bool sync_error = false;
      if (status.ok() && options.sync) {
        // The caller asked for the log to be flushed to disk right away
        // (the file system keeps its own cache otherwise).
        status = logfile_->Sync();
        if (!status.ok()) {
          sync_error = true;
        }
      }
      if (status.ok()) {
        // Insert every operation of the batch into the memtable.
        status = WriteBatchInternal::InsertInto(updates, mem_);
      }
      // Re-acquire the lock before touching shared state again.
      mutex_.Lock();
    }
    // If the merged batch lives in the shared scratch batch tmp_batch_, empty
    // it now that the log/memtable step has run.
    if (updates == tmp_batch_) tmp_batch_->Clear();
    // Publish the new last sequence number.
    versions_->SetLastSequence(last_sequence);
  }

  while (true) {
    // The merged batch may have covered several entries of writers_, so this
    // thread has completed other threads' writers as well; pop them off the
    // queue and wake their owners.
    Writer* ready = writers_.front();
    // Remove the completed writer from the front of the queue.
    writers_.pop_front();
    if (ready != &w) {
      // The writer belongs to another thread: store the result, mark it done
      // and wake that thread, which will then take the
      // "if (w.done) return w.status;" path above.
      ready->status = status;
      ready->done = true;
      ready->cv.Signal();
    }
    // ready == last_writer means the last writer of the merged group has been
    // handled.
    if (ready == last_writer) break;
  }

  // Notify new head of write queue
  if (!writers_.empty()) {
    // The queue is not empty: wake the thread that owns the new front writer
    // so it can leave the wait loop "while (!w.done && &w != writers_.front())".
    writers_.front()->cv.Signal();
  }

  return status;
}

DBImpl::BuildBatchGroup


// Collects the batch of the front writer together with the batches of the
// writers queued behind it into one batch that can be committed in one go.
// On return *last_writer points at the last queued writer covered by the
// returned batch. The result is either the front writer's own batch (nothing
// was merged) or tmp_batch_ holding a copy of every merged batch.
//
// REQUIRES: Writer list must be non-empty
// REQUIRES: First writer must have a non-NULL batch
WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
  assert(!writers_.empty());
  Writer* leader = writers_.front();
  WriteBatch* group = leader->batch;
  assert(group != NULL);

  size_t group_bytes = WriteBatchInternal::ByteSize(leader->batch);

  // Upper bound for the merged batch. When the leading batch is small the
  // bound is lowered so that a small write is not held up by a large group.
  const size_t byte_limit = (group_bytes <= (128 << 10))
                                ? group_bytes + (128 << 10)
                                : (1 << 20);

  // Track the last writer that ends up in the group; Write() needs it to know
  // which queued writers to pop and wake.
  *last_writer = leader;
  std::deque<Writer*>::iterator pos = writers_.begin();
  for (++pos; pos != writers_.end(); ++pos) {  // start after the leader
    Writer* follower = *pos;

    // A sync write must not be folded into a group led by a non-sync write.
    if (!leader->sync && follower->sync) {
      break;
    }

    // A writer without a batch contributes no data but is still covered.
    if (follower->batch == NULL) {
      *last_writer = follower;
      continue;
    }

    group_bytes += WriteBatchInternal::ByteSize(follower->batch);
    if (group_bytes > byte_limit) {
      // Do not make batch too big
      break;
    }

    if (group == leader->batch) {
      // First real merge: switch to the DB-owned scratch batch tmp_batch_ and
      // seed it with the leader's batch, so no caller's batch is modified.
      group = tmp_batch_;
      assert(WriteBatchInternal::Count(group) == 0);
      WriteBatchInternal::Append(group, leader->batch);
    }
    WriteBatchInternal::Append(group, follower->batch);
    *last_writer = follower;
  }
  return group;
}
展开阅读全文

没有更多推荐了,返回首页