leveldb源码学习之 DBImpl 类（数据写入）

最新推荐文章于 2022-07-24 08:44:18 发布

空腹熊猫

最新推荐文章于 2022-07-24 08:44:18 发布

阅读量480

点赞数

分类专栏： leveldb

本文链接：https://blog.csdn.net/guangyacyb/article/details/105277410

版权

leveldb 专栏收录该内容

15 篇文章 0 订阅

订阅专栏

推荐结合leveldb-handbook 阅读源码

leveldb以其优秀的写性能著名

整体流程

leveldb的一次写入分为两部分：

将写操作写入日志；
将写操作应用到内存数据库中；

写类型

leveldb对外提供的写入接口有（1）Put和（2）Delete两种。二者在内部走的是同一条写入路径：Delete操作同样会被编码成WriteBatch中的一条记录，只是类型标记为删除（kTypeDeletion）且不携带value，因此可以近似理解为一次“value为空的Put”。

除此以外，leveldb还提供了一个批量处理的工具Batch，用户可以依据Batch来完成批量的数据库更新操作，且这些操作是原子性的。

合并写

leveldb在面对并发写入时做了一项优化。同一时刻只允许一个写入操作将内容写入日志文件和内存数据库；为了在并发写入线程较多的情况下减少对日志文件的小写入、提升整体写入性能，leveldb会将若干“小写入”合并成一个“大写入”。

leveldb 写入数据常用到的是put方法

db/db_impl.cc

// Convenience method: DBImpl adds no logic of its own here and forwards the
// call unchanged to the default implementation, DB::Put.
Status DBImpl::Put(const WriteOptions& o, const Slice& key, const Slice& val) {
  Status result = DB::Put(o, key, val);
  return result;
}

// Default implementation of Put, which subclasses of DB may also call.
// Wraps the single key/value pair in a WriteBatch and submits that batch
// through Write(), returning whatever status Write() reports.
Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
  WriteBatch single_put;
  single_put.Put(key, value);
  Status outcome = Write(opt, &single_put);
  return outcome;
}

这里用到WriteBatch 类的方法

db/write_batch.cc

// Appends one Put record to this batch: increments the stored record count,
// then appends the kTypeValue tag followed by the length-prefixed key and the
// length-prefixed value to the rep_ buffer.
void WriteBatch::Put(const Slice& key, const Slice& value) {
  const auto new_count = WriteBatchInternal::Count(this) + 1;
  WriteBatchInternal::SetCount(this, new_count);

  const char type_tag = static_cast<char>(kTypeValue);
  rep_.push_back(type_tag);
  PutLengthPrefixedSlice(&rep_, key);
  PutLengthPrefixedSlice(&rep_, value);
}

WriteBatch::Put 先将记录数加一，再把类型标记、key和value依次追加到WriteBatch自带的缓冲区rep_中；随后 DB::Put 调用db的Write方法：

// Applies `updates` to the database, queuing behind any writes already in
// flight.
//
// The calling thread enqueues a Writer on writers_ and blocks until it either
// reaches the front of the queue or another writer has already marked it done
// (its batch was carried out as part of that writer's group). The writer at
// the front calls MakeRoomForWrite(), merges queued batches into one group via
// BuildBatchGroup(), appends the group to the log (syncing when options.sync
// is set), inserts it into mem_, and finally wakes every writer covered by the
// group plus the new head of the queue.
//
// A null `updates` performs no log/memtable write; it is forwarded to
// MakeRoomForWrite() as force == true.
//
// Returns the status of the log/memtable update, or the status reported by
// MakeRoomForWrite() if that failed.
Status DBImpl::Write(const WriteOptions& options, WriteBatch* updates) {
  Writer w(&mutex_);
  w.batch = updates; // everything this call wants to write (may be null)
  w.sync = options.sync;
  w.done = false;

  MutexLock l(&mutex_);
  writers_.push_back(&w); // enqueue this write at the back of writers_
  while (!w.done && &w != writers_.front()) { // not finished and not yet at the head: wait
    w.cv.Wait();
  }
  if (w.done) {
    // Another writer already handled this batch and stored the result.
    return w.status;
  }

  // Make room for this write; may temporarily release the lock or wait.
  // A null `updates` (nothing to write) is passed as force == true.
  Status status = MakeRoomForWrite(updates == nullptr);
  uint64_t last_sequence = versions_->LastSequence();
  Writer* last_writer = &w;
  if (status.ok() && updates != nullptr) {  // null `updates` means there is nothing to log or insert
    WriteBatch* write_batch = BuildBatchGroup(&last_writer); // merge queued writers' batches into one group
    WriteBatchInternal::SetSequence(write_batch, last_sequence + 1);
    last_sequence += WriteBatchInternal::Count(write_batch); // the group may hold several records, so advance by its count

    // Add to the log and apply to the memtable. The lock can be released
    // during this phase because &w is currently responsible for logging and
    // protects against concurrent loggers and concurrent writes into mem_.
    {
      mutex_.Unlock();
      status = log_->AddRecord(WriteBatchInternal::Contents(write_batch)); // append the group to the log
      bool sync_error = false;
      if (status.ok() && options.sync) { // caller asked for the log to be synced
        status = logfile_->Sync();
        if (!status.ok()) {
          sync_error = true;
        }
      }
      if (status.ok()) {
        status = WriteBatchInternal::InsertInto(write_batch, mem_);
      }
      mutex_.Lock();
      if (sync_error) {
        // The state of the log file is indeterminate: the log record we
        // just added may or may not show up when the DB is re-opened.
        // So we force the DB into a mode where all future writes fail.
        RecordBackgroundError(status);
      }
    }
    // If the group was assembled in the shared scratch batch, empty it again.
    if (write_batch == tmp_batch_) tmp_batch_->Clear();

    versions_->SetLastSequence(last_sequence); // publish the new last sequence number
  }

  // Pop every writer covered by this group off the queue. For each writer
  // other than the current one, store the status, mark it done and signal
  // its condition variable so the waiting thread can return.
  while (true) {
    Writer* ready = writers_.front();
    writers_.pop_front();
    if (ready != &w) {
      ready->status = status;
      ready->done = true;
      ready->cv.Signal();
    }
    if (ready == last_writer) break;
  }

  // Notify new head of write queue
  if (!writers_.empty()) {
    writers_.front()->cv.Signal();
  }

  return status;
}

Writer 是db_impl中为每个等待的写操作保存的信息结构，保存了写入结果status、指向WriteBatch的指针、sync和done两个标志，以及一个与互斥锁绑定的条件变量cv：

// Bookkeeping for one pending write call. Each instance carries the batch to
// write, the caller's sync flag, a completion flag with the resulting status,
// and a condition variable constructed from the given mutex.
struct DBImpl::Writer {
  explicit Writer(port::Mutex* mu) : cv(mu) {}

  Status status;                // result of the write
  WriteBatch* batch = nullptr;  // batch to write; null until assigned
  bool sync = false;            // whether the log must be synced for this write
  bool done = false;            // true once the write has been carried out
  port::CondVar cv;             // condition variable constructed from `mu`
};

本次写操作会先放入写队列 writers_，并在条件变量上等待，直到轮到它写（或者它已经被排在前面的写操作合并完成）。

写之前先分配足够的写空间：

// Ensures there is room in the current memtable for the next write, delaying
// or blocking the caller when needed.
//
// When `force` is true the "memtable still has room" shortcut is skipped, so
// the current memtable is switched out even if it is not full.
//
// Returns an OK status once there is room, the stored background error if one
// exists, or the error from creating the new log file.
//
// REQUIRES: mutex_ is held
// REQUIRES: this thread is currently at the front of the writer queue
Status DBImpl::MakeRoomForWrite(bool force) {
  mutex_.AssertHeld();
  assert(!writers_.empty());
  bool allow_delay = !force;  // a non-forced write may be delayed once
  Status s;
  while (true) {
    if (!bg_error_.ok()) {
      // Yield the previously recorded background error.
      s = bg_error_;
      break;
    } else if (allow_delay && versions_->NumLevelFiles(0) >=
                                  config::kL0_SlowdownWritesTrigger) {
      // We are getting close to the limit on the number of level-0 files.
      // Rather than blocking a single write for several seconds once the
      // limit is hit, delay each write by 1ms to reduce latency variance.
      // If the writer shares a CPU core with the compaction thread, this
      // delay also hands the core over to it.
      mutex_.Unlock();
      env_->SleepForMicroseconds(1000); // sleep 1ms with mutex_ released; NOTE(review): background compaction presumably makes progress meanwhile - confirm
      allow_delay = false;  // do not delay a single write more than once
      mutex_.Lock();
    } else if (!force &&
               (mem_->ApproximateMemoryUsage() <= options_.write_buffer_size)) {
      // There is still room in the current memtable.
      break;
    } else if (imm_ != nullptr) {
      // The current memtable is full, but the previous one (the immutable
      // memtable) is still being compacted, so wait.
      Log(options_.info_log, "Current memtable full; waiting...\n");
      background_work_finished_signal_.Wait();
    } else if (versions_->NumLevelFiles(0) >= config::kL0_StopWritesTrigger) {
      // The number of level-0 files has reached the stop threshold.
      // NOTE(review): presumably this waits for a compaction to reduce the
      // level-0 file count - confirm.
      Log(options_.info_log, "Too many L0 files; waiting...\n");
      background_work_finished_signal_.Wait();
    } else {
      // The memtable is full: switch to a new memtable and trigger
      // compaction of the old one.
      assert(versions_->PrevLogNumber() == 0);// no previous log may be left over
      uint64_t new_log_number = versions_->NewFileNumber(); // file number for the new log
      WritableFile* lfile = nullptr;
      s = env_->NewWritableFile(LogFileName(dbname_, new_log_number), &lfile); // create the new log file
      if (!s.ok()) {
        // Give the file number back and leave the loop, to avoid consuming
        // file numbers in a tight loop.
        versions_->ReuseFileNumber(new_log_number);
        break;
      }
      delete log_;
      delete logfile_;
      // Point the log state at the newly created file.
      logfile_ = lfile;
      logfile_number_ = new_log_number;
      log_ = new log::Writer(lfile);
      imm_ = mem_; // the current memtable becomes the immutable memtable
      has_imm_.store(true, std::memory_order_release);
      mem_ = new MemTable(internal_comparator_);  // fresh memtable for subsequent writes
      mem_->Ref();
      force = false;  // do not force another switch if there is room
      MaybeScheduleCompaction(); // schedule a compaction task
    }
  }
  return s;
}

这里如果没有空间，会尝试压缩，可参考：leveldb源码学习之 DBImpl 类（压缩 compaction）

BuildBatchGroup 从写队列的队首开始，把后面排队的若干写操作的 WriteBatch 合并成一个批量写（group）一次性写入；写完后，Write 会逐个唤醒这些 writer 内部条件变量上的等待方：

// Merges the batch of the writer at the head of the queue with the batches of
// the writers queued behind it, and returns the batch to write. *last_writer
// is set to the last writer covered by the returned batch.
//
// REQUIRES: the writer queue is not empty
// REQUIRES: the first writer has a non-null batch
WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
  mutex_.AssertHeld();
  assert(!writers_.empty());
  Writer* const head = writers_.front();
  WriteBatch* group = head->batch;
  assert(group != nullptr);

  // Running total of the encoded size of every batch merged so far.
  size_t total_bytes = WriteBatchInternal::ByteSize(head->batch);

  // Allow the group to grow up to a maximum size, but if the original write
  // is small, limit the growth so the small write is not slowed down too much.
  const size_t byte_limit =
      (total_bytes <= (128 << 10)) ? total_bytes + (128 << 10) : (1 << 20);

  *last_writer = head;
  // Start from the second writer; the head is already accounted for.
  for (std::deque<Writer*>::iterator it = writers_.begin() + 1;
       it != writers_.end(); ++it) {
    Writer* const candidate = *it;

    // Do not include a sync write in a group led by a non-sync write.
    if (candidate->sync && !head->sync) {
      break;
    }

    // A writer without a batch contributes nothing but is still covered.
    if (candidate->batch == nullptr) {
      *last_writer = candidate;
      continue;
    }

    total_bytes += WriteBatchInternal::ByteSize(candidate->batch);
    if (total_bytes > byte_limit) {
      // Do not make the group too big.
      break;
    }

    if (group == head->batch) {
      // Switch to the temporary batch instead of disturbing the caller's
      // batch, and seed it with the head writer's records.
      group = tmp_batch_;
      assert(WriteBatchInternal::Count(group) == 0);
      WriteBatchInternal::Append(group, head->batch);
    }
    WriteBatchInternal::Append(group, candidate->batch);
    *last_writer = candidate;
  }
  return group;
}

空腹熊猫

关注

0
点赞
踩
0

收藏

觉得还不错? 一键收藏
0
评论
leveldb源码学习之 DBImpl 类（数据写入）

推荐结合leveldb-handbook 阅读源码leveldb以其优秀的写性能著名整体流程leveldb的一次写入分为两部分：将写操作写入日志；将写操作应用到内存数据库中；写类型leveldb对外提供的写入接口有：（1）Put（2）Delete两种。这两种本质对应同一种操作，Delete操作同样会被转换成一个value为空的Put操作。除此以外，leveldb还...
复制链接

扫一扫