Put与Delete操作
// Convenience form of Write() for a single insertion: wraps the key/value
// pair in a one-entry WriteBatch and forwards it, together with the caller's
// write options, to Write(). Returns whatever Write() returns.
Status DB::Put(const WriteOptions& opt, const Slice& key, const Slice& value) {
  WriteBatch single_put;
  single_put.Put(key, value);
  return Write(opt, &single_put);
}
// Convenience form of Write() for a single deletion: records the key as a
// delete in a one-entry WriteBatch and forwards it, together with the
// caller's write options, to Write(). Returns whatever Write() returns.
Status DB::Delete(const WriteOptions& opt, const Slice& key) {
  WriteBatch single_delete;
  single_delete.Delete(key);
  return Write(opt, &single_delete);
}
LevelDB对外暴露的写接口包括Put,Delete和Write,其中Write需要WriteBatch作为参数,而Put和Delete首先就是将当前的操作封装到一个WriteBatch对象,并调用Write接口。
opt是写选项。上面的代码中看不到任何处理并发的逻辑,多线程写入的同步实际上是在DBImpl::Write函数中完成的。
WriteBatch
WriteBatch可以记录许多个操作,每一个操作代表着要插入或删除相应数据
// Buffers a set of Put/Delete updates. All buffered updates live in the
// single string member rep_; WriteBatchInternal is a friend and can therefore
// reach that representation directly.
class WriteBatch {
public:
WriteBatch();
~WriteBatch();
// Store the mapping "key->value" in the database.
void Put(const Slice& key, const Slice& value);
// If the database contains a mapping for "key", erase it. Else do nothing.
void Delete(const Slice& key);
// Clear all updates buffered in this batch.
void Clear();
// Support for iterating over the contents of a batch.
// Handler is a callback interface with one pure-virtual method per kind of
// update (Put and Delete).
class Handler {
public:
virtual ~Handler();
virtual void Put(const Slice& key, const Slice& value) = 0;
virtual void Delete(const Slice& key) = 0;
};
// Walks the batch using the supplied handler and reports the outcome as a
// Status. Presumably invokes handler->Put / handler->Delete once per
// buffered update -- confirm against the implementation.
Status Iterate(Handler* handler) const;
private:
// Grants WriteBatchInternal access to rep_.
friend class WriteBatchInternal;
std::string rep_; // The only data member: a single string holding every buffered update.
};
每一个WriteBatch都是以一个固定长度的头部开始,然后后面接着许多连续的记录(插入或删除操作)
固定头部格式:
固定头部共12字节:前8字节为WriteBatch的起始序列号,即该Batch中第一个操作对应的全局序列号(其后的操作依次递增),对应rep_[0]到rep_[7],只有在真正处理该Batch时(见DBImpl::Write中的SetSequence调用)才会被设置;后4字节为当前Batch中的记录数,对应rep_[8]到rep_[11]。
后面的记录结构为:
插入数据时:type(kTypeValue),Key_size,Key,Value_size,Value
删除数据时:type(kTypeDeletion),Key_size,Key
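WriteBatchInternal是WriteBatch的友元类,它提供了一系列静态方法(如SetSequence、Count、Contents、ByteSize、Append、InsertInto)来读取和修改WriteBatch的内部表示rep_;这些内部操作没有放进WriteBatch的公开接口,而是统一通过WriteBatchInternal来完成。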
DBImpl::Write
// Applies my_batch to the database.
//
// Every caller enqueues a Writer on writers_ and waits until it is either at
// the front of the queue or has been marked done by another thread. The
// writer at the front merges its own batch with batches queued behind it
// (BuildBatchGroup), appends the merged batch to the log, optionally syncs
// the log file, inserts the updates into mem_, and finally wakes every writer
// whose batch it handled plus the new head of the queue.
//
// options.sync: when true, logfile_->Sync() is called after the log append.
// my_batch:     the updates to apply; when NULL the log/memtable phase is
//               skipped and MakeRoomForWrite is called with `true`.
// Returns the status of the write; for a writer completed by another thread
// this is the status that thread stored in w.status.
Status DBImpl::Write(const WriteOptions& options, WriteBatch* my_batch) {
/*struct DBImpl::Writer {
* WriteBatch* batch;
* bool sync;
* bool done;
* port::CondVar cv;
*};
* Writer wraps a WriteBatch for queuing: cv is a condition variable used to
* block and wake the owning thread, and done records whether the batch has
* already been completed. (The code below also uses a status member that is
* not shown in this listing.)
*/
Writer w(&mutex_);
w.batch = my_batch;
w.sync = options.sync;
w.done = false;
// Take the mutex: w is about to be pushed onto the shared writers_ queue.
MutexLock l(&mutex_);
writers_.push_back(&w);
// Keep waiting while w is neither done nor at the front of the queue.
while (!w.done && &w != writers_.front()) {
w.cv.Wait();
}
// Another thread may already have written this batch as part of its merged
// group (see the wake-up loop further down); in that case just report the
// status it recorded.
if (w.done) {
return w.status;
}
// May temporarily unlock and wait.
Status status = MakeRoomForWrite(my_batch == NULL);
uint64_t last_sequence = versions_->LastSequence();
Writer* last_writer = &w;
if (status.ok() && my_batch != NULL) {
// Merge batches queued in writers_ into a single batch; last_writer is
// updated to point at the last writer that was included.
WriteBatch* updates = BuildBatchGroup(&last_writer);
// Stamp the merged batch with the sequence number right after the last one
// handed out.
WriteBatchInternal::SetSequence(updates, last_sequence + 1);
// Advance the sequence number by the number of updates in the merged batch.
last_sequence += WriteBatchInternal::Count(updates);
{
// Drop the mutex for the log write and memtable insert so that other
// threads can push new writers onto writers_ in the meantime (presumably
// because the log I/O is comparatively slow).
mutex_.Unlock();
// Append the contents of the merged batch to the log.
status = log_->AddRecord(WriteBatchInternal::Contents(updates));
bool sync_error = false;
if (status.ok() && options.sync) {
// The caller asked for a synchronous write: sync the log file now rather
// than leaving the data in whatever buffering sits below it.
status = logfile_->Sync();
if (!status.ok()) {
sync_error = true;
}
}
// NOTE(review): sync_error is assigned above but never read anywhere in
// this function. Upstream LevelDB appears to act on it after re-locking
// (recording a background error so later writes fail) -- confirm whether
// that handling was dropped from this listing.
if (status.ok()) {
// Apply every update in the merged batch to the memtable.
status = WriteBatchInternal::InsertInto(updates, mem_);
}
// Re-acquire the mutex before touching shared state again.
mutex_.Lock();
}
// If the merged batch lives in the shared scratch batch tmp_batch_, empty it
// now that it has been handed to the log and the memtable.
if (updates == tmp_batch_) tmp_batch_->Clear();
// Publish the new last sequence number.
versions_->SetLastSequence(last_sequence);
}
while (true) {
// The merged batch may have covered several writers in writers_. Pop each
// of them off the queue and wake the threads that own them.
Writer* ready = writers_.front();
// Remove the handled writer from the front of the queue.
writers_.pop_front();
if (ready != &w) {
// This writer belongs to another thread: hand it the status, mark it done
// and signal it. Once woken, that thread sees w.done and returns w.status
// (the `if (w.done)` check near the top of this function).
ready->status = status;
ready->done = true;
ready->cv.Signal();
}
// last_writer is the final writer covered by the merged batch; stop there.
if (ready == last_writer) break;
}
// Notify new head of write queue
if (!writers_.empty()) {
// Wake the thread owning the new front writer so that it can leave the
// `while (!w.done && &w != writers_.front())` wait loop above.
writers_.front()->cv.Signal();
}
return status;
}
DBImpl::BuildBatchGroup
// Builds the batch that the writer at the front of writers_ will commit,
// folding in batches from the writers queued behind it where allowed.
//
// On return *last_writer points at the last writer in the queue that the
// returned batch accounts for (the front writer itself if nothing was merged).
// The result is the front writer's own batch when nothing was appended to it,
// otherwise tmp_batch_ holding a copy of every merged batch.
//
// REQUIRES: Writer list must be non-empty
// REQUIRES: First writer must have a non-NULL batch
WriteBatch* DBImpl::BuildBatchGroup(Writer** last_writer) {
  assert(!writers_.empty());
  Writer* const leader = writers_.front();
  WriteBatch* group = leader->batch;
  assert(group != NULL);

  size_t group_bytes = WriteBatchInternal::ByteSize(leader->batch);

  // Upper bound on the merged batch: 1 MiB by default. When the leading batch
  // is at most 128 KiB the bound shrinks to "leading size + 128 KiB", so that
  // a small write is not held up behind a large amount of merged data.
  const size_t kSmallBatchBytes = 128 << 10;
  const size_t byte_limit = (group_bytes <= kSmallBatchBytes)
                                ? group_bytes + kSmallBatchBytes
                                : (1 << 20);

  // Until something is merged, the group ends at the leader. Write() uses
  // this pointer to know which queued writers to wake.
  *last_writer = leader;

  std::deque<Writer*>::iterator pos = writers_.begin();
  for (++pos; pos != writers_.end(); ++pos) {  // start just past the leader
    Writer* const follower = *pos;

    // A sync write is never folded into a group led by a non-sync write.
    // (The reverse -- a non-sync write joining a sync leader -- is allowed.)
    if (follower->sync && !leader->sync) {
      break;
    }

    if (follower->batch != NULL) {
      group_bytes += WriteBatchInternal::ByteSize(follower->batch);
      if (group_bytes > byte_limit) {
        // Adding this batch would make the group too big; stop here.
        break;
      }

      // First merge: switch from the leader's own batch to the scratch batch
      // tmp_batch_ and seed it with a copy of the leader's updates, so the
      // caller's batch is left untouched.
      if (group == leader->batch) {
        group = tmp_batch_;
        assert(WriteBatchInternal::Count(group) == 0);
        WriteBatchInternal::Append(group, leader->batch);
      }
      WriteBatchInternal::Append(group, follower->batch);
    }

    // This writer is now covered by the group.
    *last_writer = follower;
  }
  return group;
}