WAL格式
WAL File格式
WAL文件以kBlockSize为单位进行存储,每个Block中可以包含多个record。若Block末尾的剩余空间不足以放下一个record header,则对剩余空间补0(Padding),并从下一个Block开始写;若record的payload超过Block的剩余空间,则需要拆分成多个fragment跨Block存储。
     +-----+-------------+--+----+----------+------+-- ... ----+
File | r0  |     r1      |P | r2 |    r3    |  r4  |           |
     +-----+-------------+--+----+----------+------+-- ... ----+
     <--- kBlockSize ------>|<-- kBlockSize ------>|
rn = variable size records
P = Padding
详细可见如下代码:
IOStatus Writer::AddRecord(const Slice& slice,
Env::IOPriority rate_limiter_priority) {
do {
const int64_t leftover = kBlockSize - block_offset_;
if (leftover < header_size) {
// 如果header空间不足,则需要写到下一个block,这个block补0.
if (leftover > 0) {
// 补0
s = dest_->Append(Slice("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00",
static_cast<size_t>(leftover)),
0 /* crc32c_checksum */, rate_limiter_priority);
}
block_offset_ = 0;
}
const size_t avail = kBlockSize - block_offset_ - header_size;
// 根据剩余的size和payloadsize计算type
const size_t fragment_length = (left < avail) ? left : avail;
RecordType type;
const bool end = (left == fragment_length && compress_remaining == 0);
if (begin && end) {
type = recycle_log_files_ ? kRecyclableFullType : kFullType;
} else if (begin) {
type = recycle_log_files_ ? kRecyclableFirstType : kFirstType;
} else if (end) {
type = recycle_log_files_ ? kRecyclableLastType : kLastType;
} else {
type = recycle_log_files_ ? kRecyclableMiddleType : kMiddleType;
}
// 记录record
s = EmitPhysicalRecord(type, ptr, fragment_length, rate_limiter_priority);
ptr += fragment_length;
left -= fragment_length;
begin = false;
} while (s.ok() && (left > 0 || compress_remaining > 0));
}
WAL Record格式
Record由如下格式组成:
- CRC:4字节,先计算Log number的crc和Payload的crc,再将两个crc合并成最终的crc。
- Size:2字节,Payload的size。
- Type:1字节,Record的类型,kZeroType, kFullType, kFirstType, kLastType, kMiddleType。
- Log number:4字节,version的filenum,也就是memtable的filenum。
- Payload:真正的kv数据。
+---------+-----------+-----------+----------------+--- ... ---+
|CRC (4B) | Size (2B) | Type (1B) | Log number (4B)| Payload |
+---------+-----------+-----------+----------------+--- ... ---+
Same as above, with the addition of
Log number = 32bit log file number, so that we can distinguish between
records written by the most recent log writer vs a previous one.
详见如下代码
IOStatus Writer::EmitPhysicalRecord(RecordType t, const char* ptr, size_t n,
Env::IOPriority rate_limiter_priority) {
size_t header_size;
char buf[kRecyclableHeaderSize];
// 2个字节记录payload的size
buf[4] = static_cast<char>(n & 0xff);
buf[5] = static_cast<char>(n >> 8);
// 1个字节记录type
buf[6] = static_cast<char>(t);
// 4个字节记录log_number
EncodeFixed32(buf + 7, static_cast<uint32_t>(log_number_));
// 计算log_number的crc
crc = crc32c::Extend(crc, buf + 7, 4);
// 计算payload的crc
uint32_t payload_crc = crc32c::Value(ptr, n);
// 合并log_number和payload的crc
crc = crc32c::Crc32cCombine(crc, payload_crc, n);
// 4字节记录crc
EncodeFixed32(buf, crc);
// append header
IOStatus s = dest_->Append(Slice(buf, header_size), 0 /* crc32c_checksum */,
rate_limiter_priority);
// append payload
s = dest_->Append(Slice(ptr, n), payload_crc, rate_limiter_priority);
}
Payload格式
// WriteBatch::rep_ :=
// sequence: fixed64
// count: fixed32
// data: record[count]
// record :=
// kTypeValue varstring varstring
// kTypeDeletion varstring
// kTypeSingleDeletion varstring
// kTypeRangeDeletion varstring varstring
// kTypeMerge varstring varstring
// kTypeColumnFamilyValue varint32 varstring varstring
// kTypeColumnFamilyDeletion varint32 varstring
// kTypeColumnFamilySingleDeletion varint32 varstring
// kTypeColumnFamilyRangeDeletion varint32 varstring varstring
// kTypeColumnFamilyMerge varint32 varstring varstring
// kTypeBeginPrepareXID
// kTypeEndPrepareXID varstring
// kTypeCommitXID varstring
// kTypeCommitXIDAndTimestamp varstring varstring
// kTypeRollbackXID varstring
// kTypeBeginPersistedPrepareXID
// kTypeBeginUnprepareXID
// kTypeWideColumnEntity varstring varstring
// kTypeColumnFamilyWideColumnEntity varint32 varstring varstring
// kTypeNoop
// varstring :=
// len: varint32
// data: uint8[len]
详细代码如下:
IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
log::Writer* log_writer, uint64_t* log_used,
bool need_log_sync, bool need_log_dir_sync,
SequenceNumber sequence,
LogFileNumberSize& log_file_number_size) {
WriteBatch* merged_batch;
// 将write_group中的writebatch添加到payload中
io_s = status_to_io_status(MergeBatch(write_group, &tmp_batch_, &merged_batch,
&write_with_wal, &to_be_cached_state));
// 所有payload的kv数据添加完成,记录seq
WriteBatchInternal::SetSequence(merged_batch, sequence);
}
Status DBImpl::MergeBatch(const WriteThread::WriteGroup& write_group,
WriteBatch* tmp_batch, WriteBatch** merged_batch,
size_t* write_with_wal,
WriteBatch** to_be_cached_state) {
*merged_batch = tmp_batch;
// 循环将writegroup中的batch添加到merged_batch(payload)中
for (auto writer : write_group) {
if (!writer->CallbackFailed()) {
Status s = WriteBatchInternal::Append(*merged_batch, writer->batch,
/*WAL_only*/ true);
}
}
}
Status WriteBatchInternal::Append(WriteBatch* dst, const WriteBatch* src,
const bool wal_only) {
// 记录batch的count,这里外层循环每一次进来都会更新一次
SetCount(dst, Count(dst) + src_count);
// append kv数据
dst->rep_.append(src->rep_.data() + WriteBatchInternal::kHeader, src_len);
return Status::OK();
}
Status WriteBatchInternal::Put(WriteBatch* b, uint32_t column_family_id,
const Slice& key, const Slice& value) {
// 每个batch的kv数据在这里,以slice的方式先写入key,后写入value。
PutLengthPrefixedSlice(&b->rep_, key);
PutLengthPrefixedSlice(&b->rep_, value);
}
WAL写流程
写流程
WAL的写流程如下:
IOStatus DBImpl::WriteToWAL(const WriteBatch& merged_batch,
log::Writer* log_writer, uint64_t* log_used,
uint64_t* log_size,
Env::IOPriority rate_limiter_priority,
LogFileNumberSize& log_file_number_size) {
// When two_write_queues_ WriteToWAL has to be protected from concurretn calls
// from the two queues anyway and log_write_mutex_ is already held. Otherwise
// if manual_wal_flush_ is enabled we need to protect log_writer->AddRecord
// from possible concurrent calls via the FlushWAL by the application.
const bool needs_locking = manual_wal_flush_ && !two_write_queues_;
// Due to performance cocerns of missed branch prediction penalize the new
// manual_wal_flush_ feature (by UNLIKELY) instead of the more common case
// when we do not need any locking.
if (UNLIKELY(needs_locking)) {
log_write_mutex_.Lock();
}
IOStatus io_s = log_writer->AddRecord(log_entry, rate_limiter_priority);
if (UNLIKELY(needs_locking)) {
log_write_mutex_.Unlock();
}
}
IOStatus Writer::AddRecord(const Slice& slice,
Env::IOPriority rate_limiter_priority) {
// 准备写盘内容
...
// 刷盘
s = dest_->Flush(rate_limiter_priority);
}
刷盘策略
直接刷盘
创建db的option配置了sync,直接写盘。
IOStatus DBImpl::WriteToWAL(const WriteThread::WriteGroup& write_group,
log::Writer* log_writer, uint64_t* log_used,
bool need_log_sync, bool need_log_dir_sync,
SequenceNumber sequence,
LogFileNumberSize& log_file_number_size) {
// 数据直接刷盘
if (io_s.ok() && need_log_sync) {
for (auto& log : logs_) {
io_s = log.writer->file()->Sync(immutable_db_options_.use_fsync);
if (!io_s.ok()) {
break;
}
}
}
return io_s;
}
异步1M聚合刷盘
创建db的option配置了wal_bytes_per_sync,会将数据先写到buffer后返回,之后每累积bytes_per_sync字节做一次范围刷盘(最近写入的1MB数据不参与刷盘)。这种情况下如果刷盘不及时,故障恢复时会丢失尚未落盘的数据。
IOStatus WritableFileWriter::Flush(Env::IOPriority op_rate_limiter_priority) {
IOStatus s;
// 数据先写入buffer
if (perform_data_verification_ && buffered_data_with_checksum_) {
s = WriteBufferedWithChecksum(buf_.BufferStart(), buf_.CurrentSize(),
op_rate_limiter_priority);
} else {
s = WriteBuffered(buf_.BufferStart(), buf_.CurrentSize(),
op_rate_limiter_priority);
}
}
// 写入buffer后返回
if (!s.ok()) {
return s;
}
// sync OS cache to disk for every bytes_per_sync_
// TODO: give log file and sst file different options (log
// files could be potentially cached in OS for their whole
// life time, thus we might not want to flush at all).
// We try to avoid sync to the last 1MB of data. For two reasons:
// (1) avoid rewrite the same page that is modified later.
// (2) for older version of OS, write can block while writing out
// the page.
// Xfs does neighbor page flushing outside of the specified ranges. We
// need to make sure sync range is far from the write offset.
if (!use_direct_io() && bytes_per_sync_) {
const uint64_t kBytesNotSyncRange =
1024 * 1024; // recent 1MB is not synced.
const uint64_t kBytesAlignWhenSync = 4 * 1024; // Align 4KB.
uint64_t cur_size = filesize_.load(std::memory_order_acquire);
if (cur_size > kBytesNotSyncRange) {
uint64_t offset_sync_to = cur_size - kBytesNotSyncRange;
offset_sync_to -= offset_sync_to % kBytesAlignWhenSync;
assert(offset_sync_to >= last_sync_size_);
if (offset_sync_to > 0 &&
offset_sync_to - last_sync_size_ >= bytes_per_sync_) {
s = RangeSync(last_sync_size_, offset_sync_to - last_sync_size_);
last_sync_size_ = offset_sync_to;
}
}
}
return s;
}
WAL创建
创建WAL有两种情况。
- DB open时会创建一个WAL。
- CF刷新到磁盘后,会切换一个新的memtable,同时切WAL。
IOStatus DBImpl::CreateWAL(uint64_t log_file_num, uint64_t recycle_log_number,
size_t preallocate_block_size,
log::Writer** new_log) {
io_s = NewWritableFile(fs_.get(), log_fname, &lfile, opt_file_options);
}
WAL删除
WAL在memtable转换成sstable后会进行后台删除。
void DBImpl::FindObsoleteFiles(JobContext* job_context, bool force,
bool no_full_scan) {
// 遍历所有alive_log_files_文件,并添加到删除队列
if (!alive_log_files_.empty() && !logs_.empty()) {
// 当file的num小于还未flush的memtable对应的lognum,就表示这个wal需要删除了
while (alive_log_files_.begin()->number < min_log_number) {
...
}
}
}
void DBImpl::PurgeObsoleteFiles(JobContext& state, bool schedule_only) {
// 获取需要删除的WAL
std::unordered_set<uint64_t> log_recycle_files_set(
state.log_recycle_files.begin(), state.log_recycle_files.end());
for (const auto& candidate_file : candidate_files) {
switch (type) {
case kWalFile:
keep = ((number >= state.log_number) ||
(number == state.prev_log_number) ||
(log_recycle_files_set.find(number) !=
log_recycle_files_set.end()));
break;
}
// 后台删除
SchedulePurge();
}
WAL恢复
在故障恢复场景下,需要通过WAL重建memtable:遍历读取WAL中的kv数据,并重新插入memtable。主要的函数如下。
bool Reader::ReadRecord(Slice* record, std::string* scratch,
WALRecoveryMode wal_recovery_mode,
uint64_t* record_checksum) {
while (true) {
// 解析type,根据type判断是否被block拆分过。
const unsigned int record_type =
ReadPhysicalRecord(&fragment, &drop_size, record_checksum);
switch (record_type) {
case kFullType:
case kRecyclableFullType:
...
}
}
}
Status DBImpl::RecoverLogFiles(const std::vector<uint64_t>& wal_numbers,
SequenceNumber* next_sequence, bool read_only,
bool* corrupted_wal_found,
RecoveryContext* recovery_ctx) {
// 遍历所有wal
for (auto wal_number : wal_numbers) {
// 读取一个完整的record
while (!stop_replay_by_wal_filter &&
reader.ReadRecord(&record, &scratch,
immutable_db_options_.wal_recovery_mode,
&record_checksum) &&
status.ok()) {}
WriteBatch batch;
// 获取payload
status = WriteBatchInternal::SetContents(&batch, record);
// 获取seq
SequenceNumber sequence = WriteBatchInternal::Sequence(&batch);
// 写memtable
status = WriteBatchInternal::InsertInto(
&batch, column_family_memtables_.get(), &flush_scheduler_,
&trim_history_scheduler_, true, wal_number, this,
false /* concurrent_memtable_writes */, next_sequence,
&has_valid_writes, seq_per_batch_, batch_per_txn_);
}
}
参考文献
https://kernelmaker.github.io/Rocksdb_WAL
http://mysql.taobao.org/monthly/2018/04/09/
https://hexiangyu.me/2022/06/12/rocksdb-wal/