RocksDB:WAL详解 + 创建删除WAL + WAL写 + WAL恢复memtable

WAL格式

WAL File格式

以kBlockSize为单位进行存储,每个Block中可以存放多个record。若Block末尾的剩余空间不足以写入一个record header,则剩余空间需要补0;若record超过当前Block的可用空间,则需要拆分成多个分片,写入后续的Block。

       +-----+-------------+--+----+----------+------+-- ... ----+
 File  | r0  |        r1   |P | r2 |    r3    |  r4  |           |
       +-----+-------------+--+----+----------+------+-- ... ----+
       <--- kBlockSize ------>|<-- kBlockSize ------>|

  rn = variable size records
  P = Padding

详细可见如下代码:

// Abridged excerpt of Writer::AddRecord: emits one logical record as one or
// more physical fragments, each sized to fit in the space left in the current
// kBlockSize block.
// The declarations of s, ptr, left, begin, header_size and compress_remaining
// are elided from this excerpt (presumably ptr/left track the not-yet-written
// remainder of `slice` -- confirm against the full source).
IOStatus Writer::AddRecord(const Slice& slice,
                           Env::IOPriority rate_limiter_priority) {
  do {
    // Bytes still free in the current block.
    const int64_t leftover = kBlockSize - block_offset_;
    if (leftover < header_size) {
      // Not enough room for a header: zero-fill the rest of this block and
      // continue at the start of the next one.
      if (leftover > 0) {
        // Zero padding; only the first `leftover` bytes of the literal are used.
        s = dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
                                static_cast<size_t>(leftover)),
                          0 /* crc32c_checksum */, rate_limiter_priority);
      }
      block_offset_ = 0;
    }
    // Payload capacity of the current block once the header is accounted for.
    const size_t avail = kBlockSize - block_offset_ - header_size;
    // The fragment is the smaller of what is left to write and what fits; the
    // record type below is derived from whether this fragment is the first
    // and/or the last one.
    const size_t fragment_length = (left < avail) ? left : avail;
    RecordType type;
    const bool end = (left == fragment_length && compress_remaining == 0);
    // recycle_log_files_ selects the kRecyclable* variant of each type.
    if (begin && end) {
      // The whole record fits in a single fragment.
      type = recycle_log_files_ ? kRecyclableFullType : kFullType;
    } else if (begin) {
      type = recycle_log_files_ ? kRecyclableFirstType : kFirstType;
    } else if (end) {
      type = recycle_log_files_ ? kRecyclableLastType : kLastType;
    } else {
      type = recycle_log_files_ ? kRecyclableMiddleType : kMiddleType;
    }
    // Write this fragment as a physical record, then advance past it.
    s = EmitPhysicalRecord(type, ptr, fragment_length, rate_limiter_priority);
    ptr += fragment_length;
    left -= fragment_length;
    begin = false;
    // Keep going while writes succeed and there is still data to emit.
  } while (s.ok() && (left > 0 || compress_remaining > 0));
}

WAL Record格式

Record由如下格式组成:

  • CRC:4字节,先计算Log number的crc和Payload的crc,再将两个crc合并成最终的crc。
  • Size:2字节,Payload的size。
  • Type:1字节,Record的类型,kZeroType, kFullType, kFirstType, kLastType, kMiddleType。
  • Log number:4字节,version的filenum,也就是memtable的filenum。
  • Payload:真正的kv数据。
+---------+-----------+-----------+----------------+--- ... ---+
|CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload   |
+---------+-----------+-----------+----------------+--- ... ---+
Same as above, with the addition of
Log number = 32bit log file number, so that we can distinguish between
records written by the most recent log writer vs a previous one.

详见如下代码

// Abridged excerpt of Writer::EmitPhysicalRecord: builds the record header in
// `buf` and appends the header followed by the n-byte payload at `ptr`.
// Header layout as written below: bytes 0-3 crc, 4-5 payload size, 6 type,
// 7-10 log number.
// The declaration/initial value of `crc` and the assignment of `header_size`
// are elided from this excerpt.
IOStatus Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n,
                                    Env::IOPriority rate_limiter_priority) {
  size_t header_size;
  char buf[kRecyclableHeaderSize];
  // Payload size in 2 bytes, low byte first.
  buf[4] = static_cast<char>(n & 0xff); 
  buf[5] = static_cast<char>(n >> 8);
  // Record type in 1 byte.
  buf[6] = static_cast<char>(t);
  // Log number in 4 bytes (truncated to 32 bits by the cast).
  EncodeFixed32(buf + 7, static_cast<uint32_t>(log_number_));
  // Extend the running crc over the 4 log-number bytes.
  crc = crc32c::Extend(crc, buf + 7, 4); 
  // Checksum of the payload on its own.
  uint32_t payload_crc = crc32c::Value(ptr, n);
  // Combine the header-side crc with the payload crc.
  crc = crc32c::Crc32cCombine(crc, payload_crc, n);
  // Store the final crc in the first 4 bytes of the header.
  EncodeFixed32(buf, crc);
  // Append the header.
  IOStatus s = dest_->Append(Slice(buf, header_size), 0 /* crc32c_checksum */,
                             rate_limiter_priority);
  // Append the payload, passing its checksum along.
  // NOTE(review): in this excerpt the status of the header append is
  // overwritten without being checked -- likely an abridgment; confirm.
  s = dest_->Append(Slice(ptr, n), payload_crc, rate_limiter_priority);
}

Payload格式

// WriteBatch::rep_ :=
//    sequence: fixed64
//    count: fixed32
//    data: record[count]
// record :=
//    kTypeValue varstring varstring
//    kTypeDeletion varstring
//    kTypeSingleDeletion varstring
//    kTypeRangeDeletion varstring varstring
//    kTypeMerge varstring varstring
//    kTypeColumnFamilyValue varint32 varstring varstring
//    kTypeColumnFamilyDeletion varint32 varstring
//    kTypeColumnFamilySingleDeletion varint32 varstring
//    kTypeColumnFamilyRangeDeletion varint32 varstring varstring
//    kTypeColumnFamilyMerge varint32 varstring varstring
//    kTypeBeginPrepareXID
//    kTypeEndPrepareXID varstring
//    kTypeCommitXID varstring
//    kTypeCommitXIDAndTimestamp varstring varstring
//    kTypeRollbackXID varstring
//    kTypeBeginPersistedPrepareXID
//    kTypeBeginUnprepareXID
//    kTypeWideColumnEntity varstring varstring
//    kTypeColumnFamilyWideColumnEntity varint32 varstring varstring
//    kTypeNoop
// varstring :=
//    len: varint32
//    data: uint8[len]

详细代码如下:

// Abridged excerpt of the write-group overload of DBImpl::WriteToWAL, showing
// only how the WAL payload is assembled. The declarations of io_s,
// write_with_wal and to_be_cached_state are elided.
IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
                            log::Writer* log_writer, uint64_t* log_used,
                            bool need_log_sync, bool need_log_dir_sync,
                            SequenceNumber sequence,
                            LogFileNumberSize& log_file_number_size) {
  WriteBatch* merged_batch;
  // Merge the batches of write_group into one batch; that batch is the payload.
  io_s = status_to_io_status(MergeBatch(write_group, &tmp_batch_, &merged_batch,
                                        &write_with_wal, &to_be_cached_state));
  // Once all kv data has been merged, stamp the batch with the sequence number.
  WriteBatchInternal::SetSequence(merged_batch, sequence);
}
// Abridged excerpt of DBImpl::MergeBatch: accumulates the batches of every
// writer in write_group into tmp_batch and hands it back via *merged_batch.
// NOTE(review): the brace structure is unbalanced here because surrounding
// branches were cut from the excerpt.
Status DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
                          WriteBatch* tmp_batch, WriteBatch** merged_batch,
                          size_t* write_with_wal,
                          WriteBatch** to_be_cached_state) {
    *merged_batch = tmp_batch;
    // Append each writer's batch to merged_batch (the payload), skipping
    // writers for which CallbackFailed() is true.
    for (auto writer : write_group) {
      if (!writer->CallbackFailed()) {
        Status s = WriteBatchInternal::Append(*merged_batch, writer->batch,
                                              /*WAL_only*/ true);
      }
    }
  }
// Abridged excerpt of WriteBatchInternal::Append: adds the records of `src` to
// `dst`. The computation of src_count and src_len is elided.
Status WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src,
                                  const bool wal_only) {
  // Bump dst's record count by the number of records being appended; this
  // happens on every call, i.e. once per batch the caller appends.
  SetCount(dst, Count(dst) + src_count);
  // Append src's record bytes, skipping the first kHeader bytes of src->rep_.
  dst->rep_.append(src->rep_.data() + WriteBatchInternal::kHeader, src_len);
  return Status::OK();
}
// Abridged excerpt of WriteBatchInternal::Put, showing only how the key and
// value bytes are serialized into the batch (count/type-tag handling and the
// return statement are elided).
Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
                               const Slice& key, const Slice& value) {
  // The kv data of a batch is stored here: the key is written first and the
  // value second, each as a length-prefixed slice.
  PutLengthPrefixedSlice(&b->rep_, key);
  PutLengthPrefixedSlice(&b->rep_, value);
}

WAL写流程

写流程

WAL的写流程如下:

// Abridged excerpt of the merged-batch overload of DBImpl::WriteToWAL: hands
// the serialized batch to the log writer, taking log_write_mutex_ only when
// required. The construction of `log_entry` and the return statement are
// elided from this excerpt.
IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
                            log::Writer* log_writer, uint64_t* log_used,
                            uint64_t* log_size,
                            Env::IOPriority rate_limiter_priority,
                            LogFileNumberSize& log_file_number_size) {
  // When two_write_queues_ WriteToWAL has to be protected from concurrent calls
  // from the two queues anyway and log_write_mutex_ is already held. Otherwise
  // if manual_wal_flush_ is enabled we need to protect log_writer->AddRecord
  // from possible concurrent calls via the FlushWAL by the application.
  const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
  // Due to performance concerns of missed branch prediction penalize the new
  // manual_wal_flush_ feature (by UNLIKELY) instead of the more common case
  // when we do not need any locking.
  if (UNLIKELY(needs_locking)) {
    log_write_mutex_.Lock();
  }
  // Write the record to the WAL through the log writer.
  IOStatus io_s = log_writer->AddRecord(log_entry, rate_limiter_priority);
  if (UNLIKELY(needs_locking)) {
    log_write_mutex_.Unlock();
  }
}
// Heavily abridged view of Writer::AddRecord that only highlights the final
// flush; the record-building part is replaced by the `...` placeholder.
IOStatus Writer::AddRecord(const Slice& slice,
                           Env::IOPriority rate_limiter_priority) {
  // Prepare the content to be written (elided).
  ...
  // Flush the destination file writer.
  s = dest_->Flush(rate_limiter_priority);
}

刷盘策略

直接刷盘

写入时配置了sync(WriteOptions::sync,对应代码中的need_log_sync),WAL写入后直接调用Sync刷盘。

// Abridged excerpt of the write-group overload of DBImpl::WriteToWAL, showing
// only the synchronous path: when need_log_sync is set, every log in logs_ is
// synced before returning. The code that produces io_s is elided.
IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
                            log::Writer* log_writer, uint64_t* log_used,
                            bool need_log_sync, bool need_log_dir_sync,
                            SequenceNumber sequence,
                            LogFileNumberSize& log_file_number_size) {
  // Sync the data to disk right away.
  if (io_s.ok() && need_log_sync) {
    for (auto& log : logs_) {
      io_s = log.writer->file()->Sync(immutable_db_options_.use_fsync);
      // Stop at the first failed sync; its status is what gets returned.
      if (!io_s.ok()) {
        break;
      }
    }
  }
  return io_s;
}

异步1M聚合刷盘

创建db的option配置了wal_bytes_per_sync时,数据会先写入buffer后返回,之后每累计bytes_per_sync字节的未刷盘数据才做一次范围刷盘(RangeSync),并且最近写入的1MB数据不参与刷盘。这种情况下如果刷盘不及时,故障恢复的时候可能会丢失数据。

// Abridged excerpt of WritableFileWriter::Flush: writes out the contents of
// buf_, then, when bytes_per_sync_ is set and direct I/O is not used, range-
// syncs older data while leaving the most recent 1MB unsynced.
// NOTE(review): the enclosing condition of the first section was cut from the
// excerpt, which is why the braces there do not balance.
IOStatus WritableFileWriter::Flush(Env::IOPriority op_rate_limiter_priority) {
  IOStatus s;
    // Write out the data currently held in buf_, via the checksum-carrying
    // path when both verification flags are set.
      if (perform_data_verification_ && buffered_data_with_checksum_) {
        s = WriteBufferedWithChecksum(buf_.BufferStart(), buf_.CurrentSize(),
                                      op_rate_limiter_priority);
      } else {
        s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize(),
                          op_rate_limiter_priority);
      }
    }
    // Bail out early if the buffered write failed.
    if (!s.ok()) {
      return s;
    }

  // sync OS cache to disk for every bytes_per_sync_
  // TODO: give log file and sst file different options (log
  // files could be potentially cached in OS for their whole
  // life time, thus we might not want to flush at all).

  // We try to avoid sync to the last 1MB of data. For two reasons:
  // (1) avoid rewrite the same page that is modified later.
  // (2) for older version of OS, write can block while writing out
  //     the page.
  // Xfs does neighbor page flushing outside of the specified ranges. We
  // need to make sure sync range is far from the write offset.
  if (!use_direct_io() && bytes_per_sync_) {
    const uint64_t kBytesNotSyncRange =
        1024 * 1024;                                // recent 1MB is not synced.
    const uint64_t kBytesAlignWhenSync = 4 * 1024;  // Align 4KB.
    uint64_t cur_size = filesize_.load(std::memory_order_acquire);
    if (cur_size > kBytesNotSyncRange) {
      // Sync target: everything except the last 1MB, rounded down to 4KB.
      uint64_t offset_sync_to = cur_size - kBytesNotSyncRange;
      offset_sync_to -= offset_sync_to % kBytesAlignWhenSync;
      assert(offset_sync_to >= last_sync_size_);
      // Only sync once at least bytes_per_sync_ bytes have accumulated since
      // the previous sync point.
      if (offset_sync_to > 0 &&
          offset_sync_to - last_sync_size_ >= bytes_per_sync_) {
        s = RangeSync(last_sync_size_, offset_sync_to - last_sync_size_);
        last_sync_size_ = offset_sync_to;
      }
    }
  }

  return s;
}

WAL创建

创建WAL有两种情况。

  • DB open时会创建一个WAL。
  • CF刷新到磁盘后,会切换一个新的memtable,同时切WAL。
// Abridged excerpt of DBImpl::CreateWAL: the only step shown is creating the
// writable file for the new WAL. The declarations of io_s, log_fname, lfile
// and opt_file_options, and the construction of *new_log, are elided.
IOStatus DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number,
                           size_t preallocate_block_size,
                           log::Writer** new_log) {
  // Create the underlying file at log_fname.
  io_s = NewWritableFile(fs_.get(), log_fname, &lfile, opt_file_options);
}

WAL删除

WAL在memtable转换成sstable后会进行后台删除。

// Abridged excerpt of DBImpl::FindObsoleteFiles: shows the loop that picks out
// WAL files that are no longer needed. The computation of min_log_number and
// the loop body are elided.
void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
                               bool no_full_scan) {
  // Walk alive_log_files_ from the front and queue obsolete files for deletion.
    if (!alive_log_files_.empty() && !logs_.empty()) {
      // A WAL whose file number is smaller than min_log_number (presumably the
      // log number of the oldest memtable not yet flushed -- confirm against
      // the full source) is no longer needed.
      while (alive_log_files_.begin()->number < min_log_number) {
        ...
      }
    }
}
// Abridged excerpt of DBImpl::PurgeObsoleteFiles: decides for each candidate
// WAL file whether to keep it, then schedules the purge in the background.
// The declarations of candidate_files, type, number and keep are elided, and
// the for-loop's closing brace was lost in the abridgment.
void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
  // Put the WAL numbers slated for recycling into a set for fast lookup.
  std::unordered_set<uint64_t> log_recycle_files_set(
      state.log_recycle_files.begin(), state.log_recycle_files.end());
  for (const auto& candidate_file : candidate_files) {
  switch (type) {
    case kWalFile:
      // Keep a WAL if it is not older than state.log_number, if it is
      // state.prev_log_number, or if it is in the recycle set.
      keep = ((number >= state.log_number) ||
              (number == state.prev_log_number) ||
              (log_recycle_files_set.find(number) !=
               log_recycle_files_set.end()));
      break;
  }
  // Delete in the background.
  SchedulePurge();
}

WAL恢复

在故障场景下,需要通过WAL重建memtable:遍历读取WAL中的kv数据,并插入memtable。主要的函数如下。

// Abridged excerpt of Reader::ReadRecord: repeatedly reads physical records
// and dispatches on their type. The per-type handling (replaced by `...`) and
// the declarations of fragment and drop_size are elided.
bool Reader::ReadRecord(Slice* record, std::string* scratch,
                        WALRecoveryMode wal_recovery_mode,
                        uint64_t* record_checksum) {
  while (true) {
    // Read one physical record; its type tells whether the logical record was
    // split across blocks.
    const unsigned int record_type =
        ReadPhysicalRecord(&fragment, &drop_size, record_checksum);
    switch (record_type) {
      case kFullType:
      case kRecyclableFullType:
      ...
    }
  }
}
// Abridged excerpt of DBImpl::RecoverLogFiles: for each WAL number, reads
// records, turns each one back into a WriteBatch and inserts it into the
// memtables. The declarations of reader, record, scratch, record_checksum,
// status, stop_replay_by_wal_filter and has_valid_writes are elided.
Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
                               SequenceNumber* next_sequence, bool read_only,
                               bool* corrupted_wal_found,
                               RecoveryContext* recovery_ctx) {
  // Iterate over every WAL.
  for (auto wal_number : wal_numbers) {
    // Read one complete logical record per iteration.
    // NOTE(review): the `{}` below gives this while loop an empty body in the
    // excerpt, so the statements that follow sit outside it -- this looks like
    // an abridgment artifact; confirm against the full source.
    while (!stop_replay_by_wal_filter &&
           reader.ReadRecord(&record, &scratch,
                             immutable_db_options_.wal_recovery_mode,
                             &record_checksum) &&
           status.ok()) {}
      WriteBatch batch;
      // Load the record bytes (the payload) into the batch.
      status = WriteBatchInternal::SetContents(&batch, record);
      // Read the batch's sequence number.
      SequenceNumber sequence = WriteBatchInternal::Sequence(&batch);
      // Apply the batch to the memtables.
      status = WriteBatchInternal::InsertInto(
          &batch, column_family_memtables_.get(), &flush_scheduler_,
          &trim_history_scheduler_, true, wal_number, this,
          false /* concurrent_memtable_writes */, next_sequence,
          &has_valid_writes, seq_per_batch_, batch_per_txn_);
    }
}

参考文献

https://kernelmaker.github.io/Rocksdb_WAL
http://mysql.taobao.org/monthly/2018/04/09/
https://hexiangyu.me/2022/06/12/rocksdb-wal/

  • 1
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值