RocksDB：WAL详解 + 创建删除WAL + WAL写 + WAL恢复memtable

最新推荐文章于 2023-05-10 14:17:07 发布

easonwx

最新推荐文章于 2023-05-10 14:17:07 发布

阅读量1.4k

点赞数 1

分类专栏： RocksDB 文章标签：分布式云计算 linux 数据库大数据

本文链接：https://blog.csdn.net/easonwx/article/details/126282167

版权

RocksDB 专栏收录该内容

5 篇文章 0 订阅

订阅专栏

文章目录

WAL格式
WAL写流程
WAL创建
WAL删除
WAL恢复
参考文献

WAL格式

WAL File格式

WAL文件以kBlockSize（32KB）为单位进行存储，每个Block中可以存放多个record。若Block尾部的剩余空间不足以容纳一个record header，则这部分空间用0填充（Padding），record从下一个Block开始写；若一条record超过当前Block的剩余空间，则需要拆分成多个fragment分别写入。

       +-----+-------------+--+----+----------+------+-- ... ----+
 File  | r0  |        r1   |P | r2 |    r3    |  r4  |           |
       +-----+-------------+--+----+----------+------+-- ... ----+
       <--- kBlockSize ------>|<-- kBlockSize ------>|

  rn = variable size records
  P = Padding

详细可见如下代码：

// Abridged excerpt of log::Writer::AddRecord: cuts one logical record into
// fragments that fit the space left in each kBlockSize block and emits them
// one at a time. The declarations of s, ptr, left, begin, header_size and
// compress_remaining are omitted from this excerpt.
IOStatus Writer::AddRecord(const Slice& slice,
                           Env::IOPriority rate_limiter_priority) {
  do {
    const int64_t leftover = kBlockSize - block_offset_;
    if (leftover < header_size) {
      // Not enough room left in this block for a header: zero-fill the tail
      // of the block and continue at the start of the next block.
      if (leftover > 0) {
        // Append `leftover` zero bytes as padding.
        s = dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
                                static_cast<size_t>(leftover)),
                          0 /* crc32c_checksum */, rate_limiter_priority);
      }
      block_offset_ = 0;
    }
    const size_t avail = kBlockSize - block_offset_ - header_size;
    // The fragment is the smaller of the remaining payload (left) and the
    // payload space available in this block (avail). The record type follows
    // from whether this fragment is the first and/or the last of the record;
    // the kRecyclable* variants are used when recycle_log_files_ is set.
    const size_t fragment_length = (left < avail) ? left : avail;
    RecordType type;
    const bool end = (left == fragment_length && compress_remaining == 0);
    if (begin && end) {
      type = recycle_log_files_ ? kRecyclableFullType : kFullType;
    } else if (begin) {
      type = recycle_log_files_ ? kRecyclableFirstType : kFirstType;
    } else if (end) {
      type = recycle_log_files_ ? kRecyclableLastType : kLastType;
    } else {
      type = recycle_log_files_ ? kRecyclableMiddleType : kMiddleType;
    }
    // Emit this fragment as one physical record, then advance past it.
    s = EmitPhysicalRecord(type, ptr, fragment_length, rate_limiter_priority);
    ptr += fragment_length;
    left -= fragment_length;
    begin = false;
    // Keep going while the last write succeeded and payload remains.
  } while (s.ok() && (left > 0 || compress_remaining > 0));
}

WAL Record格式

Record由如下格式组成：

CRC：4字节，先分别计算Log number的crc和Payload的crc，再将两个crc合并成最终的crc。
Size：2字节，Payload的size。
Type：1字节，Record的类型，kZeroType, kFullType, kFirstType, kLastType, kMiddleType。
Log number：4字节，当前WAL文件的编号（log file number，取低32位），用于区分当前log writer写入的record和之前的log writer写入的record。
Payload：真正的kv数据。

+---------+-----------+-----------+----------------+--- ... ---+
|CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload   |
+---------+-----------+-----------+----------------+--- ... ---+
Same as above, with the addition of
Log number = 32bit log file number, so that we can distinguish between
records written by the most recent log writer vs a previous one.

详见如下代码

// Abridged excerpt of log::Writer::EmitPhysicalRecord: builds the record
// header in buf and appends the header followed by the n payload bytes at ptr
// to dest_. The declaration/initialisation of crc and the assignment of
// header_size are omitted from this excerpt.
IOStatus Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n,
                                    Env::IOPriority rate_limiter_priority) {
  size_t header_size;
  char buf[kRecyclableHeaderSize];
  // Bytes 4-5: payload length, low byte first.
  buf[4] = static_cast<char>(n & 0xff); 
  buf[5] = static_cast<char>(n >> 8);
  // Byte 6: record type.
  buf[6] = static_cast<char>(t);
  // Bytes 7-10: log number, truncated to 32 bits.
  EncodeFixed32(buf + 7, static_cast<uint32_t>(log_number_));
  // Extend the running crc over the 4 log-number bytes.
  crc = crc32c::Extend(crc, buf + 7, 4); 
  // crc of the payload on its own.
  uint32_t payload_crc = crc32c::Value(ptr, n);
  // Combine the header-side crc with the payload crc.
  crc = crc32c::Crc32cCombine(crc, payload_crc, n);
  // Bytes 0-3: the final crc.
  EncodeFixed32(buf, crc);
  // Append the header.
  IOStatus s = dest_->Append(Slice(buf, header_size), 0 /* crc32c_checksum */,
                             rate_limiter_priority);
  // Append the payload, passing its crc along with the data.
  // NOTE(review): in this excerpt the status of the header Append is
  // overwritten here without being checked; the full source presumably checks
  // it in between - confirm.
  s = dest_->Append(Slice(ptr, n), payload_crc, rate_limiter_priority);
}

Payload格式

// WriteBatch::rep_ :=
//    sequence: fixed64
//    count: fixed32
//    data: record[count]
// record :=
//    kTypeValue varstring varstring
//    kTypeDeletion varstring
//    kTypeSingleDeletion varstring
//    kTypeRangeDeletion varstring varstring
//    kTypeMerge varstring varstring
//    kTypeColumnFamilyValue varint32 varstring varstring
//    kTypeColumnFamilyDeletion varint32 varstring
//    kTypeColumnFamilySingleDeletion varint32 varstring
//    kTypeColumnFamilyRangeDeletion varint32 varstring varstring
//    kTypeColumnFamilyMerge varint32 varstring varstring
//    kTypeBeginPrepareXID
//    kTypeEndPrepareXID varstring
//    kTypeCommitXID varstring
//    kTypeCommitXIDAndTimestamp varstring varstring
//    kTypeRollbackXID varstring
//    kTypeBeginPersistedPrepareXID
//    kTypeBeginUnprepareXID
//    kTypeWideColumnEntity varstring varstring
//    kTypeColumnFamilyWideColumnEntity varint32 varstring varstring
//    kTypeNoop
// varstring :=
//    len: varint32
//    data: uint8[len]

详细代码如下：

// Abridged excerpt of the write-group overload of DBImpl::WriteToWAL: merges
// the batches of a write group into one batch and stamps it with a sequence
// number. The declarations of io_s, write_with_wal and to_be_cached_state are
// omitted from this excerpt.
IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
                            log::Writer* log_writer, uint64_t* log_used,
                            bool need_log_sync, bool need_log_dir_sync,
                            SequenceNumber sequence,
                            LogFileNumberSize& log_file_number_size) {
  WriteBatch* merged_batch;
  // Merge the write batches of write_group into a single batch; merged_batch
  // is set by MergeBatch.
  io_s = status_to_io_status(MergeBatch(write_group, &tmp_batch_, &merged_batch,
                                        &write_with_wal, &to_be_cached_state));
  // Once merging is done, record the sequence number in the merged batch.
  WriteBatchInternal::SetSequence(merged_batch, sequence);
}

// Abridged excerpt of DBImpl::MergeBatch: appends the batch of every writer in
// the group into tmp_batch and returns it through merged_batch.
// NOTE(review): the extra indentation and the additional closing brace below
// suggest an enclosing branch was elided from this excerpt - confirm against
// the full source.
Status DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
                          WriteBatch* tmp_batch, WriteBatch** merged_batch,
                          size_t* write_with_wal,
                          WriteBatch** to_be_cached_state) {
    *merged_batch = tmp_batch;
    // Append each writer's batch to merged_batch, skipping writers for which
    // CallbackFailed() is true.
    for (auto writer : write_group) {
      if (!writer->CallbackFailed()) {
        Status s = WriteBatchInternal::Append(*merged_batch, writer->batch,
                                              /*WAL_only*/ true);
      }
    }
  }

// Abridged excerpt of WriteBatchInternal::Append: adds the records of src to
// dst. The computation of src_count and src_len is omitted from this excerpt.
Status WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src,
                                  const bool wal_only) {
  // Add src's record count to dst's count; this runs once per call, i.e. once
  // for every batch the caller appends.
  SetCount(dst, Count(dst) + src_count);
  // Append src_len bytes of src's data, starting after the first kHeader bytes
  // of src->rep_.
  dst->rep_.append(src->rep_.data() + WriteBatchInternal::kHeader, src_len);
  return Status::OK();
}

// Abridged excerpt of WriteBatchInternal::Put: shows how one key/value pair is
// serialised into the batch's rep_ buffer. The rest of the function is omitted
// from this excerpt.
Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
                               const Slice& key, const Slice& value) {
  // The key is written first and the value second, each through
  // PutLengthPrefixedSlice.
  PutLengthPrefixedSlice(&b->rep_, key);
  PutLengthPrefixedSlice(&b->rep_, value);
}

WAL写流程

写流程

WAL的写流程如下：

// Abridged excerpt of the merged-batch overload of DBImpl::WriteToWAL: hands
// the log entry to log_writer->AddRecord, taking log_write_mutex_ around the
// call only when needs_locking is true. The construction of log_entry is
// omitted from this excerpt.
IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
                            log::Writer* log_writer, uint64_t* log_used,
                            uint64_t* log_size,
                            Env::IOPriority rate_limiter_priority,
                            LogFileNumberSize& log_file_number_size) {
  // When two_write_queues_ WriteToWAL has to be protected from concurrent calls
  // from the two queues anyway and log_write_mutex_ is already held. Otherwise
  // if manual_wal_flush_ is enabled we need to protect log_writer->AddRecord
  // from possible concurrent calls via the FlushWAL by the application.
  const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
  // Due to performance concerns of missed branch prediction penalize the new
  // manual_wal_flush_ feature (by UNLIKELY) instead of the more common case
  // when we do not need any locking.
  if (UNLIKELY(needs_locking)) {
    log_write_mutex_.Lock();
  }
  IOStatus io_s = log_writer->AddRecord(log_entry, rate_limiter_priority);
  if (UNLIKELY(needs_locking)) {
    log_write_mutex_.Unlock();
  }
}

// Abridged excerpt of log::Writer::AddRecord showing only its final step: the
// destination writer is flushed after the record has been emitted.
IOStatus Writer::AddRecord(const Slice& slice,
                           Env::IOPriority rate_limiter_priority) {
  // Prepare and emit the record content (elided in this excerpt).
  ...
  // Flush the destination file writer.
  s = dest_->Flush(rate_limiter_priority);
}

刷盘策略

直接刷盘

写入时WriteOptions配置了sync=true（对应下面代码中的need_log_sync），则每次写完WAL后直接sync刷盘。

// Abridged excerpt of the write-group overload of DBImpl::WriteToWAL showing
// only the synchronous-sync path. The code that sets io_s before this point is
// omitted from this excerpt.
IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
                            log::Writer* log_writer, uint64_t* log_used,
                            bool need_log_sync, bool need_log_dir_sync,
                            SequenceNumber sequence,
                            LogFileNumberSize& log_file_number_size) {
  // When the write so far succeeded and need_log_sync is set, Sync() the file
  // of every log in logs_, stopping at the first failure.
  if (io_s.ok() && need_log_sync) {
    for (auto& log : logs_) {
      io_s = log.writer->file()->Sync(immutable_db_options_.use_fsync);
      if (!io_s.ok()) {
        break;
      }
    }
  }
  return io_s;
}

异步1M聚合刷盘

创建db的option配置了wal_bytes_per_sync，会将数据先写出去（未sync）后返回，然后每累计wal_bytes_per_sync字节的未刷盘数据做一次范围刷盘（RangeSync），并且最近写入的1MB数据不参与刷盘。这种情况下如果故障发生时数据还没有刷盘，会导致恢复的时候数据丢失。

// Abridged excerpt of WritableFileWriter::Flush: writes out the contents of
// buf_, then, when bytes_per_sync_ is set and direct I/O is not used,
// range-syncs the file up to a point 1MB behind the current file size.
// NOTE(review): the extra indentation and the unmatched closing brace after
// the first if/else suggest an enclosing branch was elided from this excerpt -
// confirm against the full source.
IOStatus WritableFileWriter::Flush(Env::IOPriority op_rate_limiter_priority) {
  IOStatus s;
    // Write out the current contents of buf_: through
    // WriteBufferedWithChecksum when both perform_data_verification_ and
    // buffered_data_with_checksum_ are set, otherwise through WriteBuffered.
      if (perform_data_verification_ && buffered_data_with_checksum_) {
        s = WriteBufferedWithChecksum(buf_.BufferStart(), buf_.CurrentSize(),
                                      op_rate_limiter_priority);
      } else {
        s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize(),
                          op_rate_limiter_priority);
      }
    }
    // Return early if writing the buffer failed.
    if (!s.ok()) {
      return s;
    }

  // sync OS cache to disk for every bytes_per_sync_
  // TODO: give log file and sst file different options (log
  // files could be potentially cached in OS for their whole
  // life time, thus we might not want to flush at all).

  // We try to avoid sync to the last 1MB of data. For two reasons:
  // (1) avoid rewrite the same page that is modified later.
  // (2) for older version of OS, write can block while writing out
  //     the page.
  // Xfs does neighbor page flushing outside of the specified ranges. We
  // need to make sure sync range is far from the write offset.
  if (!use_direct_io() && bytes_per_sync_) {
    const uint64_t kBytesNotSyncRange =
        1024 * 1024;                                // recent 1MB is not synced.
    const uint64_t kBytesAlignWhenSync = 4 * 1024;  // Align 4KB.
    uint64_t cur_size = filesize_.load(std::memory_order_acquire);
    if (cur_size > kBytesNotSyncRange) {
      // Sync target: current size minus 1MB, rounded down to a 4KB boundary.
      uint64_t offset_sync_to = cur_size - kBytesNotSyncRange;
      offset_sync_to -= offset_sync_to % kBytesAlignWhenSync;
      assert(offset_sync_to >= last_sync_size_);
      // Only sync once at least bytes_per_sync_ bytes have accumulated since
      // the previous sync point.
      if (offset_sync_to > 0 &&
          offset_sync_to - last_sync_size_ >= bytes_per_sync_) {
        s = RangeSync(last_sync_size_, offset_sync_to - last_sync_size_);
        last_sync_size_ = offset_sync_to;
      }
    }
  }

  return s;
}

WAL创建

创建WAL有两种情况。

DB open时会创建一个WAL。
CF刷新到磁盘后，会切换一个新的memtable，同时切WAL。

// Abridged excerpt of DBImpl::CreateWAL showing only the file-creation call.
// The declarations of io_s, log_fname, lfile and opt_file_options are omitted
// from this excerpt.
IOStatus DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number,
                           size_t preallocate_block_size,
                           log::Writer** new_log) {
  // Create the writable file at log_fname; the result is returned via lfile.
  io_s = NewWritableFile(fs_.get(), log_fname, &lfile, opt_file_options);
}

WAL删除

WAL在memtable转换成sstable后会进行后台删除。

// Abridged excerpt of DBImpl::FindObsoleteFiles showing only the WAL part. The
// computation of min_log_number and the body of the loop are omitted from this
// excerpt.
void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
                               bool no_full_scan) {
  // Walk alive_log_files_ from the front; per the original note, the WALs
  // found here are added to the deletion queue (loop body elided).
    if (!alive_log_files_.empty() && !logs_.empty()) {
      // A WAL whose number is below min_log_number is treated as obsolete.
      // Per the original note, min_log_number is the log number that still
      // belongs to memtables not yet flushed - TODO confirm.
      while (alive_log_files_.begin()->number < min_log_number) {
        ...
      }
    }
}

// Abridged excerpt of DBImpl::PurgeObsoleteFiles showing how the keep/delete
// decision is made for WAL files. The declarations of candidate_files, type,
// number and keep are omitted, and the braces of this excerpt are not
// balanced because surrounding code was elided.
void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
  // Build a lookup set from the WAL numbers in state.log_recycle_files.
  std::unordered_set<uint64_t> log_recycle_files_set(
      state.log_recycle_files.begin(), state.log_recycle_files.end());
  for (const auto& candidate_file : candidate_files) {
  switch (type) {
    case kWalFile:
      // A WAL file is kept when its number is >= state.log_number, equals
      // state.prev_log_number, or is present in the recycle set.
      keep = ((number >= state.log_number) ||
              (number == state.prev_log_number) ||
              (log_recycle_files_set.find(number) !=
               log_recycle_files_set.end()));
      break;
  }
  // Schedule the purge; per the original note, deletion runs in the
  // background.
  SchedulePurge();
}

WAL恢复

在故障恢复场景下，需要用WAL重建memtable：遍历读取WAL中的kv数据，并重新插入memtable。主要的函数如下。

// Abridged excerpt of log::Reader::ReadRecord: repeatedly reads physical
// records and dispatches on their type. The declarations of fragment and
// drop_size, and the handling of each case, are omitted from this excerpt.
bool Reader::ReadRecord(Slice* record, std::string* scratch,
                        WALRecoveryMode wal_recovery_mode,
                        uint64_t* record_checksum) {
  while (true) {
    // Read one physical record; its type tells whether the logical record was
    // split across blocks (full vs. first/middle/last fragments).
    const unsigned int record_type =
        ReadPhysicalRecord(&fragment, &drop_size, record_checksum);
    switch (record_type) {
      case kFullType:
      case kRecyclableFullType:
      ...
    }
  }
}

// Abridged excerpt of DBImpl::RecoverLogFiles: for every WAL in wal_numbers,
// reads the records one by one and replays each of them into the memtables.
// The declarations of reader, record, scratch, record_checksum, status,
// stop_replay_by_wal_filter and has_valid_writes are omitted from this
// excerpt.
Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
                               SequenceNumber* next_sequence, bool read_only,
                               bool* corrupted_wal_found,
                               RecoveryContext* recovery_ctx) {
  // Iterate over every WAL to recover.
  for (auto wal_number : wal_numbers) {
    // Read one complete record per iteration, until the reader has no more
    // records, the status turns bad, or stop_replay_by_wal_filter is set.
    while (!stop_replay_by_wal_filter &&
           reader.ReadRecord(&record, &scratch,
                             immutable_db_options_.wal_recovery_mode,
                             &record_checksum) &&
           status.ok()) {
      WriteBatch batch;
      // Load the record payload into the batch.
      status = WriteBatchInternal::SetContents(&batch, record);
      // Read the sequence number stored in the batch.
      SequenceNumber sequence = WriteBatchInternal::Sequence(&batch);
      // Replay the batch into the memtables.
      status = WriteBatchInternal::InsertInto(
          &batch, column_family_memtables_.get(), &flush_scheduler_,
          &trim_history_scheduler_, true, wal_number, this,
          false /* concurrent_memtable_writes */, next_sequence,
          &has_valid_writes, seq_per_batch_, batch_per_txn_);
    }
  }
}