本文着重讲解chekpoint刷脏环节中核心内容:CheckPointBuffers,这部分涉及共享缓冲区的内容,后续会将补充共享缓冲区的源码分析.
知识回顾:
postgres checkpoint源码学习-1
postgres checkpoint源码学习-2
postgres checkpoint源码学习-3
源码解析
1 BufferSync API介绍
该函数的功能是将缓冲池中的所有脏缓冲块刷至磁盘,在检查点的时候会调用.
/*
* BufferSync -- Write out all dirty buffers in the pool.
// 如果设置CHECKPOINT_IMMEDIATE标识,会立即执行写;如果设置CHECKPOINT_IS_SHUTDOWN/
CHECKPOINT_END_OF_RECOVERY / CHECKPOINT_FLUSH_ALL, 非日志缓冲块也要刷盘,反之
跳过非日志缓冲块,其他的标识无其他影响
* This is called at checkpoint time to write out all dirty shared buffers.
* The checkpoint request flags should be passed in. If CHECKPOINT_IMMEDIATE
* is set, we disable delays between writes; if CHECKPOINT_IS_SHUTDOWN,
* CHECKPOINT_END_OF_RECOVERY or CHECKPOINT_FLUSH_ALL is set, we write even
* unlogged buffers, which are otherwise skipped. The remaining flags
* currently have no effect here.
*/
static void
BufferSync(int flags)
2 源码分析
static void
BufferSync(int flags)
{
uint32 buf_state;
int buf_id;
int num_to_scan;
int num_spaces;
int num_processed;
int num_written;
CkptTsStatus *per_ts_stat = NULL;
Oid last_tsid;
binaryheap *ts_heap;
int i;
int mask = BM_DIRTY;
WritebackContext wb_context;
/* Make sure we can handle the pin inside SyncOneBuffer */
// 确保当前资源管理器中有足够的buffer,若不足,2倍扩容
ResourceOwnerEnlargeBuffers(CurrentResourceOwner);
/*
* Unless this is a shutdown checkpoint or we have been explicitly told,
* we write only permanent, dirty buffers. But at shutdown or end of
* recovery, we write all dirty buffers.
* 通过标识信息以确定刷哪些类型的脏缓冲区
*/
if (!((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
CHECKPOINT_FLUSH_ALL))))
mask |= BM_PERMANENT;
/*
* Loop over all buffers, and mark the ones that need to be written with
* BM_CHECKPOINT_NEEDED. Count them as we go (num_to_scan), so that we
* can estimate how much work needs to be done.
// 遍历所有的的缓冲块,将需要刷盘的标记上BM_CHECKPOINT_NEEDED信息,并计数
* This allows us to write only those pages that were dirty when the
* checkpoint began, and not those that get dirtied while it proceeds.
* Whenever a page with BM_CHECKPOINT_NEEDED is written out, either by us
* later in this function, or by normal backends or the bgwriter cleaning
* scan, the flag is cleared. Any buffer dirtied after this point won't
* have the flag set.
// 允许将检查点开始前的脏页刷盘,在检查点开始之后的脏页不算其中; 经过标记的脏页被写出后
标记会清除[也可以被其他进程或者bgwriter进程清除].
* Note that if we fail to write some buffer, we may leave buffers with
* BM_CHECKPOINT_NEEDED still set. This is OK since any such buffer would
* certainly need to be written for the next checkpoint attempt, too.
* 如果在写的过程中发生失败,BM_CHECKPOINT_NEEDED标记还遗留,没关系,未写入的脏buffer会在
* 下次的检查点继续进行
*/
num_to_scan = 0;
for (buf_id = 0; buf_id < NBuffers; buf_id++)
{
BufferDesc *bufHdr = GetBufferDescriptor(buf_id);
/*
* Header spinlock is enough to examine BM_DIRTY, see comment in
* SyncOneBuffer.
*/
// 持有 spinlock检查buffer是否为脏,并记录每个脏缓冲块对应的物理文件信息,包括
// 表空间 + 表名 + 分支属性 + 块号
buf_state = LockBufHdr(bufHdr);
if ((buf_state & mask) == mask)
{
CkptSortItem *item;
buf_state |= BM_CHECKPOINT_NEEDED;
item = &CkptBufferIds[num_to_scan++];
item->buf_id = buf_id;
item->tsId = bufHdr->tag.rnode.spcNode;
item->relNode = bufHdr->tag.rnode.relNode;
item->forkNum = bufHdr->tag.forkNum;
item->blockNum = bufHdr->tag.blockNum;
}
UnlockBufHdr(bufHdr, buf_state);
/* Check for barrier events in case NBuffers is large. */
if (ProcSignalBarrierPending)
ProcessProcSignalBarrier();
}
统计好信息后,将遍历的脏缓冲块按照表空间的进行分类排序,其目的是减少随机IO
if (num_to_scan == 0)
return; /* nothing to do */
WritebackContextInit(&wb_context, &checkpoint_flush_after);
TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_scan);
/*
* Sort buffers that need to be written to reduce the likelihood of random
* IO. The sorting is also important for the implementation of balancing
* writes between tablespaces. Without balancing writes we'd potentially
* end up writing to the tablespaces one-by-one; possibly overloading the
* underlying system.
*/
// 对需要写入的缓冲区进行排序,以减少随机 IO 的可能性。 排序对于实现表空间之间的平衡写入
// 也很重要。如果不平衡写入,我们可能最终会一个接一个地写入表空间[随机];可能会使底层系统过载。
sort_checkpoint_bufferids(CkptBufferIds, num_to_scan);
num_spaces = 0;
/*
// 为每个表空间需要刷脏的buffers分配一个进程status
* Allocate progress status for each tablespace with buffers that need to
* be flushed. This requires the to-be-flushed array to be sorted.
*/
last_tsid = InvalidOid;
for (i = 0; i < num_to_scan; i++)
{
CkptTsStatus *s;
Oid cur_tsid;
cur_tsid = CkptBufferIds[i].tsId;
/*
* Grow array of per-tablespace status structs, every time a new
* tablespace is found.
*/
if (last_tsid == InvalidOid || last_tsid != cur_tsid)
{
Size sz;
num_spaces++;
/*
* Not worth adding grow-by-power-of-2 logic here - even with a
* few hundred tablespaces this should be fine.
*/
sz = sizeof(CkptTsStatus) * num_spaces;
if (per_ts_stat == NULL)
per_ts_stat = (CkptTsStatus *) palloc(sz);
else
per_ts_stat = (CkptTsStatus *) repalloc(per_ts_stat, sz);
s = &per_ts_stat[num_spaces - 1];
memset(s, 0, sizeof(*s));
s->tsId = cur_tsid;
/*
* The first buffer in this tablespace. As CkptBufferIds is sorted
* by tablespace all (s->num_to_scan) buffers in this tablespace
* will follow afterwards.
*/
s->index = i;
/*
* progress_slice will be determined once we know how many buffers
* are in each tablespace, i.e. after this loop.
*/
last_tsid = cur_tsid;
}
else
{
s = &per_ts_stat[num_spaces - 1];
}
s->num_to_scan++;
/* Check for barrier events. */
if (ProcSignalBarrierPending)
ProcessProcSignalBarrier();
}
Assert(num_spaces > 0);
根据上述信息建立一个最小堆,处于顶端的表空间处理脏页的速度最慢,这样能够避免刷脏页的过程中
集中在某一个表空间文件上,进而避免IO集中
/*
* Build a min-heap over the write-progress in the individual tablespaces,
* and compute how large a portion of the total progress a single
* processed buffer is.
*/
ts_heap = binaryheap_allocate(num_spaces,
ts_ckpt_progress_comparator,
NULL);
for (i = 0; i < num_spaces; i++)
{
CkptTsStatus *ts_stat = &per_ts_stat[i];
ts_stat->progress_slice = (float8) num_to_scan / ts_stat->num_to_scan;
binaryheap_add_unordered(ts_heap, PointerGetDatum(ts_stat));
}
binaryheap_build(ts_heap); // 构建堆
准备工作完成之后,便进行实际工作,调用SyncOneBuffer将脏buffer进行刷盘
/*
* Iterate through to-be-checkpointed buffers and write the ones (still)
* marked with BM_CHECKPOINT_NEEDED. The writes are balanced between
* tablespaces; otherwise the sorting would lead to only one tablespace
* receiving writes at a time, making inefficient use of the hardware.
*/
num_processed = 0;
num_written = 0;
while (!binaryheap_empty(ts_heap)) //遍历最小堆中的每一个节点:表空间
{
BufferDesc *bufHdr = NULL;
CkptTsStatus *ts_stat = (CkptTsStatus *)
DatumGetPointer(binaryheap_first(ts_heap));
buf_id = CkptBufferIds[ts_stat->index].buf_id;
Assert(buf_id != -1);
bufHdr = GetBufferDescriptor(buf_id); //从CkptBufferIds获取buf_id,进而读取标识
num_processed++;
/*
* We don't need to acquire the lock here, because we're only looking
* at a single bit. It's possible that someone else writes the buffer
* and clears the flag right after we check, but that doesn't matter
* since SyncOneBuffer will then do nothing. However, there is a
* further race condition: it's conceivable that between the time we
* examine the bit here and the time SyncOneBuffer acquires the lock,
* someone else not only wrote the buffer but replaced it with another
* page and dirtied it. In that improbable case, SyncOneBuffer will
* write the buffer though we didn't need to. It doesn't seem worth
* guarding against this, though.
* 查看标识位无需获取锁.如果在此过程其他进程已经该buffer页写入并清理标识位,并不影响
* SyncOneBuffer操作,在检查标识位和SyncOneBuffer操作获取锁时.其他人不仅已经写入
* 该buffer而且被其他页置换并标脏.
*/
if (pg_atomic_read_u32(&bufHdr->state) & BM_CHECKPOINT_NEEDED)
{
if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN)
{
TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id);
BgWriterStats.m_buf_written_checkpoints++;
num_written++;
}
}
/*
* Measure progress independent of actually having to flush the buffer
* - otherwise writing become unbalanced.
*/
ts_stat->progress += ts_stat->progress_slice;
ts_stat->num_scanned++;
ts_stat->index++;
/* Have all the buffers from the tablespace been processed? */
if (ts_stat->num_scanned == ts_stat->num_to_scan)
{
binaryheap_remove_first(ts_heap);
}
else
{
// 更换下一个表空间
/* update heap with the new progress */
binaryheap_replace_first(ts_heap, PointerGetDatum(ts_stat));
}
/*
* Sleep to throttle our I/O rate.
// 控制检查点的速率
* (This will check for barrier events even if it doesn't sleep.)
*/
CheckpointWriteDelay(flags, (double) num_processed / num_to_scan);
}
/* issue all pending flushes */
// 处理回写请求
IssuePendingWritebacks(&wb_context);
// 释放资源
pfree(per_ts_stat);
per_ts_stat = NULL;
binaryheap_free(ts_heap);
/*
* Update checkpoint statistics. As noted above, this doesn't include
* buffers written by other backends or bgwriter scan.
* 更新检查点统计信息
*/
CheckpointStats.ckpt_bufs_written += num_written;
TRACE_POSTGRESQL_BUFFER_SYNC_DONE(NBuffers, num_written, num_to_scan);
}