PostgreSQL重启恢复---Log Buffer

obvious__

已于 2022-08-19 14:57:55 修改

阅读量1.3k

点赞数 4

分类专栏： postgresql 文章标签： postgresql 数据库

于 2021-08-01 12:18:21 首次发布

本文链接：https://blog.csdn.net/obvious__/article/details/119295527

版权

postgresql 专栏收录该内容

25 篇文章 28 订阅

订阅专栏

Log Buffer

预备知识

《PostgreSQL重启恢复—XLOG 1.0》

《PostgreSQL重启恢复—XLOG 2.0》

概述

在前面的文章中，我们解决了对于数据页的insert操作，需要组装一个怎样的XLOG以及XLOG写入log buffer的流程。接下来我们需要来更加深入的了解下log buffer，了解他的组织结构，了解log buffer的作用，更重要的是要了解XLOG如何落盘。还记得WAL的两大准则么：

XLOG落盘之后，对应的数据才能落盘。
一个事务相关的所有XLOG都落盘之后，事务才可以提交。

可见XLOG的落盘，在整个XLOG体系中占有至关重要的作用。

Log Buffer的组织结构

在《XLOG 2.0》中我们简要介绍过log buffer的组织结构，log buffer是按照段页式进行管理。log buffer被分为多个段（segment）每个段默认16MB，一个段又被分为多个页面（page）每个页默认大小为8KB。段内第一个页面包含了由XLogLongPageHeaderData定义的首部数据，其他页面包含了由XLogPageHeaderData定义的首部数据。如下图所示：

在这里插入图片描述

我们可以通过log buffer的初始化函数来观察log buffer的组织结构。log buffer的初始化函数为XLOGShmemInit。主要有两个步骤：

step1：从共享缓存中分配一块空间。
step2：将step1中得到的共享缓存进行初始化。

在step1中，首先调用XLOGShmemSize()来计算需要给log buffer分配的空间大小，然后从共享缓存中分配空间。XLOGShmemSize的实现如下：

/*
 * Initialization of shared memory for XLOG
 *
 * XLOGShmemSize returns the number of bytes to reserve for the "XLOG Ctl"
 * shared-memory area.  The total is the sum of five pieces: the XLogCtlData
 * struct, the xlblocks array (XLOGbuffers entries), the WAL insertion locks
 * (NUM_XLOGINSERT_LOCKS entries plus one spare for alignment), one
 * XLOG_BLCKSZ of alignment padding, and the page buffers themselves
 * (XLOGbuffers pages of XLOG_BLCKSZ bytes each).
 *
 * Side effect: when XLOGbuffers is still -1, the "wal_buffers" option is set
 * to the value chosen by XLOGChooseNumBuffers() before the size is computed.
 */
Size
XLOGShmemSize(void)
{
	Size		size;

	/*
	 * If the value of wal_buffers is -1, use the preferred auto-tune value.
	 * This isn't an amazingly clean place to do this, but we must wait till
	 * NBuffers has received its final value, and must do it before using the
	 * value of XLOGbuffers to do anything important.
	 */
	if (XLOGbuffers == -1)
	{
		char		buf[32];

		snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
		SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
	}
	Assert(XLOGbuffers > 0);

	/* XLogCtl -- part 1 of the layout */
	size = sizeof(XLogCtlData);

	/*
	 * WAL insertion locks, plus alignment -- part 3 of the layout (it is
	 * added here before part 2, but it really is the third piece in memory)
	 */
	size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
	/* xlblocks array -- part 2 of the layout */
	size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
	/* extra alignment padding for XLOG I/O buffers -- part 4 of the layout */
	size = add_size(size, XLOG_BLCKSZ);
	/* and the buffers themselves -- part 5 of the layout */
	size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));

	/*
	 * Note: we don't count ControlFileData, it comes out of the "slop factor"
	 * added by CreateSharedMemoryAndSemaphores.  This lets us use this
	 * routine again below to compute the actual allocation size.
	 */

	return size;
}

而XLOGShmemInit的具体实现如下：

/*
 * XLOGShmemInit obtains the "Control File" and "XLOG Ctl" shared-memory
 * areas and, when they did not exist yet, lays out and initializes the
 * "XLOG Ctl" area in five consecutive parts: the XLogCtlData struct, the
 * xlblocks array, the WAL insertion locks, alignment padding, and the WAL
 * page buffers.
 *
 * If the areas already existed, only the process-local WALInsertLocks
 * pointer and the lock tranche registration are set up before returning.
 */
void
XLOGShmemInit(void)
{
	bool		foundCFile,
				foundXLog;
	char	   *allocptr;
	int			i;

#ifdef WAL_DEBUG

	/*
	 * Create a memory context for WAL debugging that's exempt from the normal
	 * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
	 * an allocation fails, but wal_debug is not for production use anyway.
	 */
	if (walDebugCxt == NULL)
	{
		walDebugCxt = AllocSetContextCreate(TopMemoryContext,
											"WAL Debug",
											ALLOCSET_DEFAULT_SIZES);
		MemoryContextAllowInCriticalSection(walDebugCxt, true);
	}
#endif

	/*
	 * step 1: allocate the space (XLOGShmemSize() gives the size of the
	 * "XLOG Ctl" area)
	 */
	ControlFile = (ControlFileData *)
		ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
	XLogCtl = (XLogCtlData *)
		ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);

	if (foundCFile || foundXLog)
	{
		/* both should be present or neither */
		Assert(foundCFile && foundXLog);

		/* Initialize local copy of WALInsertLocks and register the tranche */
		WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
		LWLockRegisterTranche(LWTRANCHE_WAL_INSERT,
							  &XLogCtl->Insert.WALInsertLockTranche);
		return;
	}

	/*
	 * step 2: initialize the shared memory
	 */
	/* Part 1: the XLogCtl struct itself */
	memset(XLogCtl, 0, sizeof(XLogCtlData));

	/*
	 * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
	 * multiple of the alignment for same, so no extra alignment padding is
	 * needed here.
	 */

	/*
	 * Part 2: an array of XLOGbuffers XLogRecPtr entries, one per buffer
	 * page.  (XLogRecPtr is a 64-bit WAL position -- uint64 in upstream
	 * PostgreSQL -- not a plain int.)
	 */
	allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
	XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
	memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
	allocptr += sizeof(XLogRecPtr) * XLOGbuffers;


	/* WAL insertion locks. Ensure they're aligned to the full padded size */
	/*
	 * Part 3: NUM_XLOGINSERT_LOCKS entries of WALInsertLockPadded.
	 * Note: WALInsertLockAcquire, called later from XLogInsertRecord, works
	 * on this WALInsertLocks array.
	 * The adjustment below always moves allocptr forward (by a whole entry
	 * when it is already aligned), which is why XLOGShmemSize reserves
	 * NUM_XLOGINSERT_LOCKS + 1 entries.
	 */
	allocptr += sizeof(WALInsertLockPadded) -
		((uintptr_t) allocptr) %sizeof(WALInsertLockPadded);
	WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
		(WALInsertLockPadded *) allocptr;
	allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;

	XLogCtl->Insert.WALInsertLockTranche.name = "wal_insert";
	XLogCtl->Insert.WALInsertLockTranche.array_base = WALInsertLocks;
	XLogCtl->Insert.WALInsertLockTranche.array_stride = sizeof(WALInsertLockPadded);

	LWLockRegisterTranche(LWTRANCHE_WAL_INSERT, &XLogCtl->Insert.WALInsertLockTranche);
	for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
	{
		LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
		WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
	}

	/*
	 * Align the start of the page buffers to a full xlog block size boundary.
	 * This simplifies some calculations in XLOG insertion. It is also
	 * required for O_DIRECT.
	 * Part 4: alignment padding
	 */
	allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
	XLogCtl->pages = allocptr;
	/* Part 5: the log buffer (XLOGbuffers pages of XLOG_BLCKSZ bytes each) */
	memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);

	/*
	 * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
	 * in additional info.)
	 */
	/* XLogCacheBlck holds the highest buffer page index, i.e. page count - 1 */
	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
	XLogCtl->SharedRecoveryInProgress = true;
	XLogCtl->SharedHotStandbyActive = false;
	XLogCtl->WalWriterSleeping = false;

	SpinLockInit(&XLogCtl->Insert.insertpos_lck);
	SpinLockInit(&XLogCtl->info_lck);
	SpinLockInit(&XLogCtl->ulsn_lck);
	InitSharedLatch(&XLogCtl->recoveryWakeupLatch);

	/*
	 * If we are not in bootstrap mode, pg_control should already exist. Read
	 * and validate it immediately (see comments in ReadControlFile() for the
	 * reasons why).
	 */
	if (!IsBootstrapProcessingMode())
		ReadControlFile();
}

从上述代码中可以看出共享缓存被分了5个部分：

第一部分：XLogCtl
第二部分：LSN数组，数组元素个数和log buffer的页面数一致（XLOGbuffers）
第三部分：WALInsertLockPadded数组，数组元素个数为NUM_XLOGINSERT_LOCKS
第四部分：对齐位
第五部分：log buffer，数组元素个数为XLOGbuffers

在这里插入图片描述

XLogCtl是一个XLogCtlData结构体，这个结构体非常重要，用于控制XLOG的写入。这个结构体也非常复杂，所以我们打算逐步讲解。在上述代码中，我们需要重点关注的是XLogCtlData的pages成员。pages用于指向log buffer的起始地址。而第四部分的pad是为了让这个起始地址对齐为XLOG_BLCKSZ的整数倍：

在这里插入图片描述

这是为了方便定位，在GetXLogBuffer中会用到这个特性，这一点后面再来讲。

除了pages成员之外，我们还需要知道XLogCacheBlck成员

在这里插入图片描述

XLogCacheBlck用于存放最大的log buffer页面下标。也就是页面数量-1。

在这里插入图片描述

小结

现在我们来对log buffer的构造做一个小结

log buffer是一段连续的内存空间，组织结构为段页式。
log buffer的首地址存放在XLogCtlData的pages成员中，地址对齐为XLOG_BLCKSZ的整数倍。
log buffer中页面的数量为XLogCacheBlck + 1（XLogCacheBlck存放的是最大的页面下标）。

GetXLogBuffer—Round 1

在弄懂了log buffer的结构之后，我们需要解决的第一个问题就是XLOG如何写入log buffer。这个问题在《XLOG 2.0》中已经进行了描述。我们先来回顾一下，在XLogRecordAssemble组装好一条XLOG之后。会经历以下步骤：

调用ReserveXLogInsertLocation获取XLOG的物理写入位置，这个位置也是XLOG的LSN。LSN是一个单调递增的整数。
调用GetXLogBuffer，将上一步得到的LSN作为入参，获取这个LSN应该写入log buffer的哪个页面，以及写入的位置指针，currpos。
将XLOG写入currpos指向的log buffer。

这里的关键就在于如何将LSN转换为log buffer中的地址指针，也就是GetXLogBuffer的具体实现。由于GetXLogBuffer涉及的知识点比较多，需要分多次来讲，在本节中，我们只是简单的了解如何将LSN转换为log buffer的地址指针。

log buffer是由连续内存空间组成的循环队列，XLOG从前向后写log buffer，写满后循环到队头，再从头开始写。涉及这部分操作的核心代码如下：

/*
 * GetXLogBuffer (simplified excerpt): translate a WAL position into the
 * address inside the in-memory log buffer where that position lives.
 *
 * ptr: the WAL position (LSN) to locate.
 * Returns a pointer into XLogCtl->pages: the start of the buffer page chosen
 * by XLogRecPtrToBufIdx(ptr) plus the offset of ptr within its page.
 *
 * In this cut-down version endptr is declared but never used, and the static
 * cachedPage/cachedPos pair is updated on every call but never consulted.
 */
static char *
GetXLogBuffer(XLogRecPtr ptr)
{
	int			idx;
	XLogRecPtr	endptr;
	static uint64 cachedPage = 0;
	static char *cachedPos = NULL;


	/*
	 * The XLog buffer cache is organized so that a page is always loaded to a
	 * particular buffer.  That way we can easily calculate the buffer a given
	 * page must be loaded into, from the XLogRecPtr alone.
	 * Derive the buffer page index from the LSN.
	 */
	idx = XLogRecPtrToBufIdx(ptr);

	/*
	 * Found the buffer holding this page. Return a pointer to the right
	 * offset within the page.
	 */
	cachedPage = ptr / XLOG_BLCKSZ;
	cachedPos  = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;

	return cachedPos + ptr % XLOG_BLCKSZ;
}

XLogRecPtrToBufIdx的实现如下：

/*
 * Map a WAL position to the index of the buffer page that holds it: the
 * page number (recptr / XLOG_BLCKSZ) taken modulo the number of buffer pages
 * (XLogCtl->XLogCacheBlck + 1), i.e. ordinary circular-queue indexing.
 */
#define XLogRecPtrToBufIdx(recptr)	\
	(((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))

其中XLogCtl->XLogCacheBlck + 1就是log buffer中page的个数。这个其实就是循环队列的标准算法。

再来看下XLOG实际写入的代码：

在这里插入图片描述

内层循环用于处理XLOG长度大于当前page空闲空间的情况，此时需要先将XLOG的一部分存放到当前page的剩余空间中，然后调用GetXLogBuffer为XLOG的剩余部分寻找一个新的page进行写入，而这个新page实际就是当前page的下一个page。如果当前page是log buffer中的最后一个page，那么GetXLogBuffer就会循环到log buffer的第一个page。

由于是循环队列，那么当循环到队头后，队头page中的数据就会被新的XLOG覆盖。既然要覆盖，那么在覆盖之前需要先确保对应page中的数据已经落盘。所以GetXLogBuffer还有一个非常重要的功能就是在页面覆盖之前判断这个页面是否是脏页，如果是脏页就需要将脏页落盘。所以，接下来我们就需要了解，XLOG是如何落盘的。

XLOG落盘

在了解XLOG如何落盘之前，我们需要思考一个问题。XLOG什么时候需要落盘？

事务commit之前

依据WAL的定义，XLOG落盘之后事务才可以commit。所以在事务commit之前，必须将事务相关的XLOG落盘。
页面落盘之前
根据WAL的定义，XLOG落盘之后，相关数据才可以落盘，所以在数据页面落盘之前，需要先判断page lsn对应的XLOG是否落盘，如果没有就需要把这部分XLOG进行落盘。
log buffer被覆盖之前

这个就是前面说到的情况。
后台进程定期落盘

由于commit之前日志必须落盘，也就是说日志没有落盘，事务就不能commit。所以日志的落盘会导致commit的延迟，为了降低这种延迟，数据库通常都有专门的后台线程或者进程来定期对日志进行落盘。

在这四种情况中，最简单也是最可控的情况，应该就是commit之前的XLOG落盘，为了方便调试排除其他干扰因素，我们需要首先将后台落盘的进程挂起，以免在commit之前后台进程将XLOG进行落盘。后台进程挂起后，我们在commit之前，事务中的相关XLOG一定没有落盘，于是我们就可以很好的观察我们之前写入的XLOG是如何落盘的。

定期落盘的主要函数为：WalWriterMain > XLogBackgroundFlush > XLogWrite。后台进程如下：

在这里插入图片描述

用VS将这个进程挂起后，我们开始调试commit前XLOG的落盘。

如何知道哪个进程是后台日志进程？

一个简单粗暴的办法，在XLogBackgroundFlush()中打上断点，然后用VS挂起所有PostgreSQL相关的进程。等一会儿看看哪个进程进来执行XLogBackgroundFlush，哪个进程就是后台日志进程。

XLOG落盘之commit

我们使用的用例如下：

DROP TABLE IF EXISTS TEST;
CREATE TABLE TEST(a int);

begin;
insert into test values(1);
commit;

当执行完insert后，PostgreSQL产生一条XLOG并写入log buffer，该XLOG对应的lsn为24664584。然后我们执行commit，commit的执行同样也会产生一条XLOG，并写入log buffer，lsn为24664640（由此可以推断出commit对应的XLOG大小为56字节）。接下来PostgreSQL会调用XLogFlush() 函数，XlogFlush的原型如下：

void XLogFlush(XLogRecPtr record);

这个函数用于将record之前的所有XLOG全部落盘。对于当前用例，由于当前只有一个事务在执行操作，所以显然这里需要将24664640之前的所有XLOG全部落盘。好了，现在我们需要用到XLogCtlData的两个非常重要的成员：LogwrtRqst、LogwrtResult。我们来看看PostgreSQL对他们的定义：

/*----------
 * Shared-memory data structures for XLOG control
 *
 * LogwrtRqst indicates a byte position that we need to write and/or fsync
 * the log up to (all records before that point must be written or fsynced).
 * LogwrtResult indicates the byte positions we have already written/fsynced.
 * These structs are identical but are declared separately to indicate their
 * slightly different functions.
 *
 *
 * To read XLogCtl->LogwrtResult, you must hold either info_lck or
 * WALWriteLock.  To update it, you need to hold both locks.  The point of
 * this arrangement is that the value can be examined by code that already
 * holds WALWriteLock without needing to grab info_lck as well.  In addition
 * to the shared variable, each backend has a private copy of LogwrtResult,
 * which is updated when convenient.
 *
 * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
 * (protected by info_lck), but we don't need to cache any copies of it.
 *
 * info_lck is only held long enough to read/update the protected variables,
 * so it's a plain spinlock.  The other locks are held longer (potentially
 * over I/O operations), so we use LWLocks for them.  These locks are:
 *
 * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
 * It is only held while initializing and changing the mapping.  If the
 * contents of the buffer being replaced haven't been written yet, the mapping
 * lock is released while the write is done, and reacquired afterwards.
 *
 * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
 * XLogFlush).
 *
 * ControlFileLock: must be held to read/update control file or create
 * new log file.
 *
 * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
 * only one checkpointer at a time; currently, with all checkpoints done by
 * the checkpointer, this is just pro forma).
 *
 *----------
 *
 */

/*
 * Request side: the byte positions up to which the log needs to be written
 * and/or fsynced (all records before those points must be written/fsynced).
 */
typedef struct XLogwrtRqst
{
	XLogRecPtr	Write;			/* last byte + 1 to write out */
	XLogRecPtr	Flush;			/* last byte + 1 to flush */
} XLogwrtRqst;

/*
 * Result side: the byte positions that have already been written out and
 * fsynced.  Same layout as XLogwrtRqst, declared separately because the two
 * serve slightly different purposes.
 */
typedef struct XLogwrtResult
{
	XLogRecPtr	Write;			/* last byte + 1 written out */
	XLogRecPtr	Flush;			/* last byte + 1 flushed */
} XLogwrtResult;

这两个结构体非常相似，我们首先来介绍Write和Flush的区别：

Write

Write表示在此位置之前的XLOG已经写入操作系统缓存（不确定是否落盘）。
Flush

Flush表示在此位置之前的XLOG已经落盘。

注意

log buffer是数据库层面提供的缓存，用于合并I/O。而通常在操作系统层面，也会维护一个操作系统缓存，同样是用来合并I/O。从编程的角度来看，write函数返回成功不代表数据就一定落盘了；确保数据落盘有两种方式：

以O_DSYNC或O_SYNC方式打开文件，以这两种方式打开的文件，write返回后，就表示数据已经落盘。
write完成后，调用fsync，fsync返回则表示数据落盘。

PostgreSQL支持两种提交方式：同步和异步。其区别如下：

同步提交（synchronous_commit为ON）：事务提交时，对应的XLOG必须全部落盘才算提交。
异步提交（synchronous_commit为OFF）：事务提交时，立刻返回用户成功，同时更新asyncXactLSN。

同步提交，可以确保ACID，但会有一定的性能开销，而异步提交性能更快，但无法保证提交时XLOG一定落盘，所以数据崩溃重启后可能导致数据丢失。本文只关注同步提交。

由于，我们只关注同步提交，所以本文中Write和Flush一定是相等的。

接下来再来看看XLogwrtRqst与XLogwrtResult的区别：XLogwrtRqst表示我们需要落盘的XLOG lsn，XLogwrtResult表示已经落盘的XLOG lsn。

在上面代码的注释部分，还有一些值得关注的信息：

XLogwrtRqst与XLogwrtResult是存放于共享内存中被所有进程共享的，所以在读写时需要加锁。具体来说：对于LogwrtResult，读时需要info_lck锁或者WALWriteLock，写时两把锁都需要；而LogwrtRqst只由info_lck保护。
每一个处理进程都有一个本地的LogwrtResult，这是共享LogwrtResult的一个copy。而XLogwrtRqst并没有本地copy。

OK，我们继续我们的代码调试：

在这里插入图片描述

现在，我们来到了XLogFlush函数，需要将record之前（24664640）的所有XLOG进行落盘。首先，将record的值与本地缓存的XLogwrtResult.Flush相比较，以判断record之前的XLOG是否已经落盘，如果是则直接返回。

在这里插入图片描述

接下来，对info_lck加锁，然后获取全局XLogwrtResult，以更新本地XLogwrtResult。前面讲过对全局XLogwrtResult、XLogwrtRqst的读操作需要对info_lck加锁。更新本地XLogwrtResult后再次判断record之前的XLOG是否已经落盘。这是一个典型的乐观锁方式，以此提高并发性。由于我们挂起了XLOG落盘的后台进程，所以record之前显然还有尚未落盘的XLOG。

接下来我们需要“wait for all in-flight insertions to the pages we’re about to write to finish”。这是一个非常重要的操作，留到后面来讲。

在这里插入图片描述

然后，我们需要获取WALWriteLock锁，在对XLOG进行写盘之前，必须获取WALWriteLock锁，获取锁之后，需要再次读取全局XLogwrtResult，然后判断record之前的XLOG是否已经落盘。

在这里插入图片描述

接下来，将临时WriteRqst的Write和Flush修改为insertpos（当前用例insertpos与record一致），表示我们希望将insertpos（record）之前的XLOG落盘。然后调用XLogWrite进行真正的写盘操作。

在这里插入图片描述

XLogWrite

XLogWrite概述

XLogWrite是XLOG落盘的最底层函数，负责将XLOG真正写入磁盘。这个函数的细节较多，我们来慢慢分析。在前面的文章中，我们提到过log buffer的管理方式为段页式，对应到物理空间，每个段都是一个独立的文件，每个页面对应文件中的一个物理块。在实际落盘时，是以页面为单位进行落盘。在理解具体的代码实现之前，我们先来白话一下PostgreSQL如何进行落盘。

在这里插入图片描述

图1

上图展示了LSN、log buffer、物理文件的对应关系。其中绿色部分表示已经落盘的XLOG、蓝色部分表示尚未落盘的XLOG。XLOG的写入顺序为从左至右顺序写入。上图展示的实际上是一种相对复杂的情况：

LSN

LSN本质上是日志的物理偏移（日志本身大小+段\页管理信息），初始值为0，随着XLOG的不断写入，单调递增。
log buffer

XLOG首先会写入log buffer。前面讲过GetXLogBuffer会定位一个LSN对应的XLOG应该写入哪个buffer page。log buffer是一个循环队列，在上图中，1号page在log buffer的队尾，队尾写满之后，会循环到队头继续写入。在上图中，LSN的1号页面中的内容会写入log buffer的1号页面，LSN的2号页面中的内容会写入log buffer的2号页面…

（注意： LSN本身是没有页面这个概念的，这里只是为了表明对应关系）

在上述页面中，3号页面比较特殊，3号是一个partial page。partial page是指当前页面还未写满，这个页面也是当前的写入页面。由于log buffer是顺序写的，所以有且仅有一个partial page。由于落盘也是顺序落盘，所以当partial page落盘后，就说明相关的XLOG都已经落盘。对当前用例而言，3号页面落盘后，就说明record之前的XLOG都已经落盘。
disk

disk表示物理文件，log buffer中的XLOG最终都会落盘到物理文件。那么我们如何知道buffer page和物理块的对应关系呢？实际上也是通过LSN。一个segment对应一个单独的物理文件，一个page对应物理文件中的一个物理块，page和物理块大小相等一一对应。所以通过一个简单的映射规则就能知道一个LSN对应的XLOG应该写入哪个物理块。

段号 = LSN / XLogSegSize

段内块偏移 = LSN % XLogSegSize

当一个segment写满之后，会写下一个segment。在实际落盘时，是以页面为单位进行落盘，所以在落盘前需要找到每个页面的起始位置，然后将整个页面进行落盘。

小结

综上所述，XLOG的落盘流程如下：

从LogwrtResult.Write开始，依据LSN及映射规则在log buffer中获取对应的页面。
依据LSN及映射规则获取页面需要写入的段号及段内页号。
将buffer page写入对应的物理页面。

XLogWrite代码实现

接下来，我们来看看XLogWrite的代码实现，XLogWrite代码略长，总体可以分为三个部分：

part1：调用write，对WriteRqst.Write之前的XLOG进行写入。这个部分是XLogWrite的核心，包含上述所有流程。但是这个部分不保证XLOG落盘。
part2：调用flush，对part1写入的XLOG进行落盘。
part3：更新XLogCtl的LogwrtResult及LogwrtRqst。

现在我们逐一对各部分进行讲解。

part1

场景1

part1是整个XLogWrite的核心，这个部分细节也比较多，所以我们先讲解主要流程，然后再根据情况来添加代码。我们先对图1进行简化：

在这里插入图片描述

图2

在图2中，log buffer尚未写满，所以并未发生循环使用，同时当前segment也可以容纳所有的XLOG，这是最简单的情况。针对图2的情况，我们将XLogWrite的代码进行了简化，代码如下：

/*
 * XLogWrite (simplified excerpt): walk the log buffer pages from the
 * backend-local LogwrtResult.Write up to WriteRqst.Write, grouping
 * consecutive pages so they can be written together.  This cut covers only
 * the simplest scenario of the article (Figure 2); the actual write-out code
 * is omitted.
 *
 * WriteRqst: the requested position; only WriteRqst.Write is read here.
 * flexible:  when true, the loop stops after any iteration that leaves
 *            npages == 0.
 *
 * Must be called inside a critical section (asserted below).
 *
 * In this cut finishing_seg and use_existent are declared but unused, and
 * startidx/startoffset are computed but not consumed.  npages is only reset
 * by the omitted write-out code (shown later in the article); without that
 * code the final Assert would not hold once the loop body has run.
 */
static void
XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
{
	bool		ispartialpage;
	bool		last_iteration;
	bool		finishing_seg;
	bool		use_existent;
	int			curridx;
	int			npages;
	int			startidx;
	uint32		startoffset;

	/* We should always be inside a critical section here */
	Assert(CritSectionCount > 0);

	/*
	 * Update local LogwrtResult (caller probably did this already, but...)
	 * Refresh the backend-local LogwrtResult; as described earlier in this
	 * article, XLogFlush has already done this.
	 */
	LogwrtResult = XLogCtl->LogwrtResult;

	/*
	 * Since successive pages in the xlog cache are consecutively allocated,
	 * we can usually gather multiple pages together and issue just one
	 * write() call.  npages is the number of pages we have determined can be
	 * written together; startidx is the cache block index of the first one,
	 * and startoffset is the file offset at which it should go. The latter
	 * two variables are only valid when npages > 0, but we must initialize
	 * all of them to keep the compiler quiet.
	 *
	 * Initialize the key variables; see Figure 3 of the article.
	 */
	npages 		= 0;
	startidx 	= 0;
	startoffset = 0;

	/*
	 * Within the loop, curridx is the cache block index of the page to
	 * consider writing.  Begin at the buffer containing the next unwritten
	 * page, or last partially written page.
	 *
	 * XLogRecPtrToBufIdx converts LogwrtResult.Write into a buffer page
	 * index.  As Figure 2 shows, LogwrtResult.Write is the LSN of the first
	 * page that needs writing, so curridx starts at that page's array index.
	 */
	curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);

	while (LogwrtResult.Write < WriteRqst.Write)
	{
		/*
		 * Make sure we're not ahead of the insert process.  This could happen
		 * if we're passed a bogus WriteRqst.Write that is past the end of the
		 * last page that's been initialized by AdvanceXLInsertBuffer.
		 */
		XLogRecPtr	EndPtr = XLogCtl->xlblocks[curridx];

		if (LogwrtResult.Write >= EndPtr)
			elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
				 (uint32) (LogwrtResult.Write >> 32),
				 (uint32) LogwrtResult.Write,
				 (uint32) (EndPtr >> 32), (uint32) EndPtr);

        /* Advance LogwrtResult.Write to end of current buffer page */
        LogwrtResult.Write = EndPtr;
        ispartialpage = WriteRqst.Write < LogwrtResult.Write;

		/* Add current page to the set of pending pages-to-dump */
		if (npages == 0)
		{
			/* first of group */
			startidx 	= curridx;
			/* page start LSN reduced to an offset within the segment */
			startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
		}
		npages++;

		/*
		 * Dump the set if this will be the last loop iteration, or if we are
		 * at the last page of the cache area (since the next page won't be
		 * contiguous in memory), or if we are at the end of the logfile
		 * segment.
		 */
		last_iteration = WriteRqst.Write <= LogwrtResult.Write;
		if (last_iteration)
		{
			/* write-out code, omitted for now */
		}

		/*
		 * ispartialpage is true when the request ends inside the current
		 * page (the partial page).  Once that page has been handled,
		 * everything before WriteRqst.Write has been dealt with, so the loop
		 * ends.  Before breaking, LogwrtResult.Write is set back to
		 * WriteRqst.Write: it was advanced above to
		 * XLogCtl->xlblocks[curridx], but only content before WriteRqst.Write
		 * was actually requested.
		 */
		if (ispartialpage)
		{
			/* Only asked to write a partial page */
			LogwrtResult.Write = WriteRqst.Write;
			break;
		}
		curridx = NextBufIdx(curridx);
		/* If flexible, break out of loop as soon as we wrote something */
		if (flexible && npages == 0)
			break;
	}
	Assert(npages == 0);
}

上述代码，首先对三个非常重要的成员进行了初始化：npages、startidx、startoffset。这三个成员的作用如图3所示：

在这里插入图片描述

图3

npages用于记录需要落盘的页面数量；startidx表示第一个需要落盘的页面的下标；startoffset表示页面的起始写入偏移。

startidx

在“Log Buffer的组织结构”这个章节中，我们讲过log buffer以数组形式存放在XLogCtlData的pages成员中，数组元素为一个buffer page。而startidx、curridx均表示pages数组的下标。

接下来，通过XLogRecPtrToBufIdx函数找到LogwrtResult.Write对应的buffer page。这个page其实就是第一个需要落盘的page。

紧接着进入循环，每次循环对一个buffer page进行一系列处理，curridx表示当前正在处理的buffer page。在循环体内部首先从xlblocks获取一个页面当前可以存放的XLOG的最大lsn。

XLogRecPtr	EndPtr = XLogCtl->xlblocks[curridx];
		
if (LogwrtResult.Write >= EndPtr)
    elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
         (uint32) (LogwrtResult.Write >> 32),
         (uint32) LogwrtResult.Write,
         (uint32) (EndPtr >> 32), (uint32) EndPtr);

/* Advance LogwrtResult.Write to end of current buffer page */
LogwrtResult.Write = EndPtr;
ispartialpage = WriteRqst.Write < LogwrtResult.Write;//判断当前写入的页面是不是partial page

xlblocks

xlblocks是XLogCtlData的又一个成员。xlblocks是一个XLogRecPtr的数组，数组元素的个数为log buffer的页面数，xlblocks与buffer page一一对应。

前面讲过LSN到buffer page的转换，也就是说每个LSN都对应一个buffer page。反之每个buffer page都对应一个范围内的LSN。由于log buffer是循环队列，所以用xlblocks数组来表示某个buffer page当前可写入的XLOG的LSN的上限。由于buffer page的大小固定为XLOG_BLCKSZ，所以通过xlblocks-XLOG_BLCKSZ就可以得到该page可写入XLOG的LSN的下限。所以一个buffer page当前可以写入XLOG的LSN的范围为 [xlblocks-XLOG_BLCKSZ, xlblocks)（左闭右开，xlblocks本身已属于下一个页面）。这个范围主要用来判断当前写入的XLOG是否会覆盖页面中之前写入的XLOG。后面会详细讲解xlblocks的使用。

这里，在获取到EndPtr之后，首先进行校验，由于LogwrtResult.Write之后的页面都是需要落盘的，所以LogwrtResult.Write不可能>=EndPtr。校验之后将LogwrtResult.Write修改为EndPtr。注意：LogwrtResult是本地缓存的LogwrtResult而不是全局LogwrtResult。接着判断当前写入的页面是不是partial page。

接下来，在npages为0时获取startidx以及startoffset。

/* Add current page to the set of pending pages-to-dump */
if (npages == 0)
{
    /* first of group */
    startidx 	= curridx;
    startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
}

这里需要特别注意startoffset的运算。如图4所示：

在这里插入图片描述

图4

假设初始时LogwrtResult.Write对应一个页面的中间位置，通过前面的流程，已经将LogwrtResult.Write修改为页面结束位置对应的LSN。所以LogwrtResult.Write - XLOG_BLCKSZ对应页面起始位置的LSN。由于是以页面为单位进行落盘，所以很显然，我们需要获取log buffer对应的物理页偏移。而前面说过将LSN % XLogSegSize就可以实现LSN到物理偏移的转换。所以 startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;就得到了log buffer写盘的起始物理偏移。

接下来需要对页面数进行累加。

npages++;

然后判断当前页是否为最后一个需要落盘的页面，对于图2的场景，只有在这个时候才会开始真正的落盘操作。

last_iteration = WriteRqst.Write <= LogwrtResult.Write;
if (last_iteration)
{
    //落盘代码，暂时省略	
}

最后，判断当前页面是否为partial page，如果是则表示WriteRqst.Write之前的所有XLOG都已经落盘，循环结束。否则继续循环。

if (ispartialpage)
{
    /* Only asked to write a partial page */
    LogwrtResult.Write = WriteRqst.Write;
    break;
}
/*
 * 获取下一个buffer page
 * #define NextBufIdx(idx) (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
 */
curridx = NextBufIdx(curridx);
/* If flexible, break out of loop as soon as we wrote something */
if (flexible && npages == 0)
    break;

OK，我们现在已经讲完了while (LogwrtResult.Write < WriteRqst.Write)循环的主要框架。在这个循环中主要做以下事情：

每次获取一个buffer page
校验每个page的xlblocks与LogwrtResult.Write是否合法
第一次循环时获取startidx、startoffset
累加npages
最后一次循环时执行真正的写盘操作

千呼万唤始出来，下来我们就来看看真正写盘的代码：

if (last_iteration)
{
    char	   *from;
    Size		nbytes;
    Size		nleft;
    int			written;

    /* 
     * Need to seek in the file? 
     * step1:调用lseek，依据startoffset定位文件的写入位置
     */
    if (openLogOff != startoffset)
    {
        if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
            ereport(PANIC,
                    (errcode_for_file_access(),
                     errmsg("could not seek in log file %s to offset %u: %m",
                            XLogFileNameP(ThisTimeLineID, openLogSegNo),
                            startoffset)));
        openLogOff = startoffset;
    }

    /* 
     * OK to write the page(s) 
     * step2:开始真正写入
     */
    from 	= XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
    nbytes 	= npages * (Size) XLOG_BLCKSZ;
    nleft 	= nbytes;
    do
    {
        errno = 0;
        written = write(openLogFile, from, nleft);
        if (written <= 0)
        {
            if (errno == EINTR)
                continue;
            ereport(PANIC,
                    (errcode_for_file_access(),
                     errmsg("could not write to log file %s "
                            "at offset %u, length %zu: %m",
                            XLogFileNameP(ThisTimeLineID, openLogSegNo),
                            openLogOff, nbytes)));
        }
        nleft -= written;
        from += written;
    } while (nleft > 0);

    /* Update state for write 
     * step3:移动openLogOff、重置npages
     */
    openLogOff += nbytes;
    npages = 0;
}

上面的代码实现非常简单，主要有三个步骤：

step1：判断startoffset是否等于openLogOff，如果不相等就seek到startoffset。
step2：将startidx开始的npages个页面写入文件。
step3：将openLogOff向前移动nbytes，然后重置npages（后面我们会看到step3的作用）

这段代码有几个有意思的地方：

openLogOff

openLogOff是一个全局变量，用于记录当前文件的写入位置偏移。由于文件的seek操作通常存在很大的性能开销，而日志文件又是顺序写入的，所以没有必要在每次写入之前都调用seek。通过比较openLogOff与startoffset可以极大的减少seek的次数。
from、nbytes、npages

由于log buffer是一段连续的空间，所以对于图2的场景，可以一次性将from开始npages个页面直接写入物理文件。

OK，至此我们已经介绍分析了图2场景下，XLOG的落盘，后面还有两种场景，但都是基于这种最简单的场景。现在我们介绍第二种场景：

场景2

在这里插入图片描述

图5

如图5所示，当前segment也可以容纳所有的XLOG，但是log buffer已经写满一轮然后循环到了队头。此时page1和page2、page3不再连续，所以需要分两次进行落盘。也就是说当while (LogwrtResult.Write < WriteRqst.Write)循环到log buffer的最后一个页面时，需要先将LogwrtResult.Write到该页面之间的所有页面都写盘，然后再继续循环。因此除了last_iteration时需要写盘，在curridx == XLogCtl->XLogCacheBlck时也需要写盘。

if (last_iteration ||
	curridx == XLogCtl->XLogCacheBlck)
{
    //step1：省略
    //step2：省略
    
    /* Update state for write 
     * step3:移动openLogOff、重置npages
     */
    openLogOff += nbytes;
    npages = 0;
}

这里重点讲下step3。step3会将openLogOff前移，这无可厚非，因为write会前移文件指针。更重要的是需要将npages重置为0。针对图5的场景，page1落盘之后会获取下一个buffer page：

在这里插入图片描述

显然，此时的curridx值为0指向队头的页面page2，然后继续循环。再次循环时，由于npages重置为0，所以会重新获取startidx和startoffset，而startidx就指向了page2。如此当遍历到page3这个partial page时就会再次将page2和page3写盘。

在这里插入图片描述

至此场景2介绍完毕，现在我们来看最后一种场景。

场景3

最后一种场景就是图1中的场景，当前segment中空闲空间不足，buffer page需要写入到两个segment中。在这种情况下需要先将buffer page中的一部分写入当前segment，并且将这部分内容强制落盘（调用fsync）。这样做的目的是防止后面还需要重新打开这个segment再进行落盘。所以，我们需要添加如下代码：

//判断当前segment是否写满
finishing_seg = !ispartialpage &&
			(startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;

if (last_iteration ||
	curridx == XLogCtl->XLogCacheBlck ||
    finishing_seg)
{
    //step1：省略
    //step2：省略
    //step3：省略
    
    /*
	 * If we just wrote the whole last page of a logfile segment,
	 * fsync the segment immediately.  This avoids having to go back
	 * and re-open prior segments when an fsync request comes along
	 * later. Doing it here ensures that one and only one backend will
	 * perform this fsync.
	 *
	 * This is also the right place to notify the Archiver that the
	 * segment is ready to copy to archival storage, and to update the
	 * timer for archive_timeout, and to signal for a checkpoint if
	 * too many logfile segments have been used since the last
	 * checkpoint.
	 *
	 * 强制落盘
	 */
    if (finishing_seg)
    {
        issue_xlog_fsync(openLogFile, openLogSegNo);

        /* signal that we need to wakeup walsenders later */
        WalSndWakeupRequest();

        LogwrtResult.Flush = LogwrtResult.Write;		/* end of page */

        if (XLogArchivingActive())
            XLogArchiveNotifySeg(openLogSegNo);

        XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);

        /*
		 * Request a checkpoint if we've consumed too much xlog since
		 * the last one.  For speed, we first check using the local
		 * copy of RedoRecPtr, which might be out of date; if it looks
		 * like a checkpoint is needed, forcibly update RedoRecPtr and
		 * recheck.
		 */
        if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
        {
            (void) GetRedoRecPtr();
            if (XLogCheckpointNeeded(openLogSegNo))
                RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
        }
    }
}

至此，XLogWrite第一部分的核心功能我们已经全部阐述完毕。

part2

在第一部分，我们只是调用write将log buffer中的内容写入日志文件，但如果日志文件不是以O_SYNC或者O_DSYNC的方式打开的话，write函数无法保证成功即落盘（日志可能只是写入了操作系统缓存），所以还需要调用fsync来将系统缓存中的数据强制落盘。

/*
 * If asked to flush, do so
 */
if (LogwrtResult.Flush < WriteRqst.Flush &&
    LogwrtResult.Flush < LogwrtResult.Write)

{
    /*
		 * Could get here without iterating above loop, in which case we might
		 * have no open file or the wrong one.  However, we do not need to
		 * fsync more than one file.
		 */
    if (sync_method != SYNC_METHOD_OPEN &&
        sync_method != SYNC_METHOD_OPEN_DSYNC)
    {
        if (openLogFile >= 0 &&
            !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
            XLogFileClose();
        if (openLogFile < 0)
        {
            XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
            openLogFile = XLogFileOpen(openLogSegNo);
            openLogOff = 0;
        }

        issue_xlog_fsync(openLogFile, openLogSegNo);
    }

    /* signal that we need to wakeup walsenders later */
    WalSndWakeupRequest();

    LogwrtResult.Flush = LogwrtResult.Write;
}

part3

完成了落盘操作之后，最后就是修改全局LogwrtResult和LogwrtRqst。

/*
 * Update shared-memory status
 *
 * We make sure that the shared 'request' values do not fall behind the
 * 'result' values.  This is not absolutely essential, but it saves some
 * code in a couple of places.
 */
{
    SpinLockAcquire(&XLogCtl->info_lck);
    XLogCtl->LogwrtResult = LogwrtResult;
    if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
        XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
    if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
        XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
    SpinLockRelease(&XLogCtl->info_lck);
}

前面讲过，对于全局LogwrtResult和LogwrtRqst的修改需要同时持有WALWriteLock锁和info_lck锁，在XLogFlush调用XLogWrite之前就已经持有了WALWriteLock，所以这里只需要持有info_lck锁即可。

XLOG落盘之SyncOneBuffer

SyncOneBuffer函数用来负责数据页面的落盘，为了保证WAL，在SyncOneBuffer中，页面落盘之前会判断页面的page lsn之前的XLOG有没有落盘，如果没有则需要先将对应XLOG落盘。对应代码如下：

//代码位置：bufmgr.c line:2728
/*
	 * Force XLOG flush up to buffer's LSN.  This implements the basic WAL
	 * rule that log updates must hit disk before any of the data-file changes
	 * they describe do.
	 *
	 * However, this rule does not apply to unlogged relations, which will be
	 * lost after a crash anyway.  Most unlogged relation pages do not bear
	 * LSNs since we never emit WAL records for them, and therefore flushing
	 * up through the buffer LSN would be useless, but harmless.  However,
	 * GiST indexes use LSNs internally to track page-splits, and therefore
	 * unlogged GiST pages bear "fake" LSNs generated by
	 * GetFakeLSNForUnloggedRel.  It is unlikely but possible that the fake
	 * LSN counter could advance past the WAL insertion point; and if it did
	 * happen, attempting to flush WAL through that location would fail, with
	 * disastrous system-wide consequences.  To make sure that can't happen,
	 * skip the flush if the buffer isn't permanent.
	 */
	if (buf_state & BM_PERMANENT)
		XLogFlush(recptr);

XLOG落盘之后台进程

在PostgreSQL中会有一个后台进程来定期将log buffer中的日志进行落盘，这样可以减轻commit时XLOG落盘的压力，提高commit的效率。XLOG后台落盘的主要函数为：WalWriterMain > XLogBackgroundFlush > XLogWrite。XLogWrite我们在前面已经详细阐述过了，本节我们重点来看看XLogBackgroundFlush。XLogBackgroundFlush函数大致可以分为两个部分，我们逐一进行讲解。

XLogBackgroundFlush—Part1

/*
 * XLogBackgroundFlush (Part 1 excerpt): decide whether the background WAL
 * writer has anything to write out.
 *
 * Returns false when recovery is in progress, or when both the
 * page-aligned write request and the async-commit LSN are already covered
 * by LogwrtResult.Flush.  The code that performs the write when there is
 * work to do is not part of this excerpt, so the function body ends without
 * a return on that path; lastflush, now and flushbytes are declared for that
 * later part and are unused here.
 */
bool
XLogBackgroundFlush(void)
{
	XLogwrtRqst WriteRqst;
	bool		flexible = true;
	static TimestampTz lastflush;
	TimestampTz now;
	int			flushbytes;

	/* XLOG doesn't need flushing during recovery */
	if (RecoveryInProgress())
		return false;

	/*
	 * read LogwrtResult and update local state
	 * step 1: take info_lck and copy the shared LogwrtResult and LogwrtRqst
	 */
	SpinLockAcquire(&XLogCtl->info_lck);
	LogwrtResult = XLogCtl->LogwrtResult;
	WriteRqst 	 = XLogCtl->LogwrtRqst;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * back off to last completed page boundary
	 * step 2: round WriteRqst.Write down to a page boundary, so that on this
	 * path only completely filled buffer pages are requested
	 */
	WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;

	/*
	 * if we have already flushed that far, consider async commit records
	 * step 3: check whether the log buffer has anything left to write out;
	 * if everything up to that boundary is already flushed, fall back to the
	 * async-commit LSN to see whether async-commit records need flushing
	 */
	if (WriteRqst.Write <= LogwrtResult.Flush)
	{
		SpinLockAcquire(&XLogCtl->info_lck);
		WriteRqst.Write = XLogCtl->asyncXactLSN;
		SpinLockRelease(&XLogCtl->info_lck);
		flexible = false;		/* ensure it all gets written */
	}

	/*
	 * If already known flushed, we're done. Just need to check if we are
	 * holding an open file handle to a logfile that's no longer in use,
	 * preventing the file from being deleted.
	 * step 4: if both the log buffer and the async-commit records are
	 * already flushed, return immediately
	 */
	if (WriteRqst.Write <= LogwrtResult.Flush)
	{
		if (openLogFile >= 0)
		{
			if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
			{
				XLogFileClose();
			}
		}
		return false;
	}
}

第一部分代码的主要功能是判断log buffer是否有未落盘的XLOG。如果log buffer内的XLOG全部落盘那么就直接返回，否则才会执行下面的代码。那么我们如何构建用例来观察XLOG后台落盘呢？思路很简单，我们只需要开启一个事务，然后插入一条数据，插入完成后不要提交事务。插入操作会产生XLOG，由于事务没有提交，所以XLOG不会由于事务的提交而落盘，那么等待一段时间后就会由后台进程来将这条XLOG进行落盘。依据这样的思路，我构造了如下用例：

DROP TABLE IF EXISTS TEST;
CREATE TABLE TEST(a int);

begin;
insert into test values(1);

然而，事与愿违。后台日志进程执行上述Part1的代码时总是发现WriteRqst.Write小于LogwrtResult.Flush，即当前log buffer中的所有日志都已经落盘，但insert产生的XLOG明明就没有落盘啊？经过排查，我发现了一个有意思的现象，就是不论我执行多少次insert，XLogCtl->LogwrtRqst中Write成员的值都不会发生变化。回想一下前面阐述过的所有流程，除了commit落盘后会修改XLogCtl->LogwrtRqst.Write其他似乎没有地方会修改XLogCtl->LogwrtRqst.Write。于是我搜索了一下代码中对于XLogCtl->LogwrtRqst.Write的修改，然后在XLogInsertRecord函数中发现了这样一段代码：

//xlog.c line1073
/*
 * Update shared LogwrtRqst.Write, if we crossed page boundary.
 */
if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
{
    SpinLockAcquire(&XLogCtl->info_lck);
    /* advance global request to include new block(s) */
    if (XLogCtl->LogwrtRqst.Write < EndPos)
        XLogCtl->LogwrtRqst.Write = EndPos;
    /* update local result copy while I have the chance */
    LogwrtResult = XLogCtl->LogwrtResult;
    SpinLockRelease(&XLogCtl->info_lck);
}

这段代码在CopyXLogRecordToWAL之后执行，也就是说在XLOG写入log buffer后执行。注释写的很清楚，如果当前XLOG跨越了一个page的边界，就更新XLogCtl->LogwrtRqst.Write的值。XLOG跨越page的边界意味着什么？意味着StartPos对应的page已经写满了。在这个时候才去更新XLogCtl->LogwrtRqst.Write的值。这样设计的目的是什么？试想如果每写一条XLOG都去更新一下XLogCtl->LogwrtRqst.Write会怎样？

首先对于XLogCtl->LogwrtRqst.Write的更新需要加锁，所以每次插入都更新会降低系统的并发性。
其次，XLogCtl->LogwrtRqst.Write一旦更新，后台日志进程就会发现log buffer中有日志没有落盘，于是就会将这部分XLOG进行落盘。XLOG的落盘本身就是以buffer page为单位的，所以如果每写一条XLOG就触发一次落盘会有很大的I/O开销。既然XLOG的落盘是以buffer page为单位，那么何不等写满一个页后再触发后台落盘呢？此时再回头看看part1代码的step2，我们就更能明白这个步骤的用意了。

知道这样一个事实之后，我修改了测试用例：


drop table if exists test;
create table test(a varchar);

begin;
insert into test values('Completely destroy information stored without your knowledge or approval: Internet history, Web pages and pictures from sites visited on the Internet, unwanted cookies, chatroom conversations, deleted e-mail messages, temporary files, the Windows swap file, the Recycle Bin, previously deleted files, valuable corporate trade secrets, Business plans, personal files, photos or confidential letters, etc.East-Tec Eraser 2005 offers full support for popular browsers (Internet Explorer, Netscape Navigator, America Online, MSN Explorer, Opera), for Peer2Peer applications (Kazaa, Kazaa Lite, iMesh, Napster, Morpheus, Direct Connect, Limewire, Shareaza, etc.), and for other popular programs such as Windows Media Player, RealPlayer, Yahoo Messenger, ICQ, etc. Eraser has an intuitive interface and wizards that guide you through all the necessary steps needed to protect your privacy and sensitive information.Other features include support for custom privacy needs, user-defined erasure methods, command-line parameters, integration with Windows Explorer, and password protection.Direct Connect, Limewire, Shareaza, etc.), and for other popular programs such as Windows Media Player');

这次我将字段类型修改为了varchar，然后向其中插入了一段文章，这段文章的大小是1.15K，这样做的目的是让insert可以很快写满一个buffer page。然后多执行几次insert，直到触发XLogCtl->LogwrtRqst.Write更新为止。果然XLogCtl->LogwrtRqst.Write更新后，我顺利的进入了part2。

XLogBackgroundFlush—Part2

/*
 * XLogBackgroundFlush, part 2 of the walkthrough: decide how far WAL should
 * be flushed on this call, then write it out.
 *
 * Part 1 (omitted below) fills in LogwrtResult and WriteRqst.Write and
 * returns false early when there is nothing left to flush.  The code shown
 * here always returns true.
 */
bool
XLogBackgroundFlush(void)
{
	XLogwrtRqst WriteRqst;
	bool		flexible = true;
	static TimestampTz lastflush;	/* time of the last flush request; 0 until the first call */
	TimestampTz now;
	int			flushbytes;		/* despite the name, a count of XLOG_BLCKSZ blocks */

	// part 1 omitted: it computes WriteRqst.Write and LogwrtResult

	/*
	 * Determine how far to flush WAL, based on the wal_writer_delay and
	 * wal_writer_flush_after GUCs.
	 * Start by fetching the current time.
	 */
	now = GetCurrentTimestamp();
	/* number of whole blocks between the flushed position and the write request */
	flushbytes =
		WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;

	if (WalWriterFlushAfter == 0 || lastflush == 0)
	{
		/* first call, or block based limits disabled */
		WriteRqst.Flush = WriteRqst.Write;
		lastflush = now;
	}
	else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
	{
		/*
		 * More than WalWriterDelay ms have passed since the last flush, so
		 * flush now (this branch flushes; it does not skip).
		 * Flush the writes at least every WalWriteDelay ms. This is important
		 * to bound the amount of time it takes for an asynchronous commit to
		 * hit disk.
		 */
		WriteRqst.Flush = WriteRqst.Write;
		lastflush = now;
	}
	else if (flushbytes >= WalWriterFlushAfter)
	{
		/* exceeded wal_writer_flush_after blocks, flush */
		WriteRqst.Flush = WriteRqst.Write;
		lastflush = now;
	}
	else
	{
		/* no flushing, this time round */
		WriteRqst.Flush = 0;
	}

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
		elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
			 (uint32) (WriteRqst.Write >> 32), (uint32) WriteRqst.Write,
			 (uint32) (WriteRqst.Flush >> 32), (uint32) WriteRqst.Flush,
			 (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
		   (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
#endif

	START_CRIT_SECTION();

	/* now wait for any in-progress insertions to finish and get write lock */
	WaitXLogInsertionsToFinish(WriteRqst.Write);
	LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
	/* re-read the shared result now that WALWriteLock is held */
	LogwrtResult = XLogCtl->LogwrtResult;
	if (WriteRqst.Write > LogwrtResult.Write ||
		WriteRqst.Flush > LogwrtResult.Flush)
	{
        // this is the call that actually writes the WAL out
		XLogWrite(WriteRqst, flexible);
	}
	LWLockRelease(WALWriteLock);

	END_CRIT_SECTION();

	/* wake up walsenders now that we've released heavily contended locks */
	WalSndWakeupProcessRequests();

	/*
	 * Great, done. To take some work off the critical path, try to initialize
	 * as many of the no-longer-needed WAL buffers for future use as we can.
	 */
	AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);

	/*
	 * If we determined that we need to write data, but somebody else
	 * wrote/flushed already, it should be considered as being active, to
	 * avoid hibernating too early.
	 */
	return true;
}

part2会调用XLogWrite对XLOG进行落盘，在这里，我们第二次看到了WaitXLogInsertionsToFinish函数，但是现在我们依然不讲，除此之外part2最重要的就是AdvanceXLInsertBuffer函数。这个函数在当前的场景下不太好解释，我们留到下面一个章节来讲。

XLOG落盘之buffer page淘汰

前面讲到了两种XLOG的落盘场景，还有一种场景就是当log buffer写满后，会循环回到log buffer头部，然后覆盖头部的page。在覆盖之前如果这个page中的XLOG还没有落盘，则需要先落盘。而这一切都是在GetXLogBuffer中完成的。所以现在我们需要深入地学习下GetXLogBuffer。

GetXLogBuffer—Round2

我们现在来调试一下GetXLogBuffer，同时观察一个问题，当一个buffer page写满之后，是如何通过GetXLogBuffer切换到下一个buffer page的？在切换到下一个buffer page之前又需要做些什么？我们还是用上面插入文本的用例，多次执行。
在这里插入图片描述

当XLOG当前需要写入的buffer page不是cachedPage（上一次写入的buffer page）时，就会发生buffer page的切换。此时，首先会执行idx = XLogRecPtrToBufIdx(ptr);获取当前XLOG应该写入的buffer page。

接下来，会根据当前的写入位置ptr计算expectedEndPtr。expectedEndPtr是什么？还记得前面讲的xlblocks么？既然每个XLOG都要写入一个buffer page，那么每个buffer page都可以确定一个当前可以写入XLOG的LSN的范围。所以，既然ptr对应的XLOG要写入下标为idx的buffer page，那么这个page可以容纳XLOG的范围就应该是[expectedEndPtr-XLOG_BLCKSZ,expectedEndPtr]（这个expected就表示应该是），那么实际上是不是呢？我们将idx对应的xlblocks取出来比比看不就知道了。

在这里插入图片描述

上面的调试结果显示xlblocks[idx]的值为0。实际上xlblocks[idx]的值可以有三种情况：

为0

还记得系统启动时调用XLOGShmemInit函数将xlblocks数组元素都初始化为0了么？所以xlblocks[idx]为0表示当前buffer page还没人用过。
不为0且与expectedEndPtr不相等

既然为0表示buffer page没人用，那么不为0自然就表示之前被写入了东西。所以现在如果要写入数据，那么就会覆盖之前的内容。所以对于这样的buffer page需要判断里面的内容是否落盘，如果没有落盘就需要落盘。
等于expectedEndPtr

这个情况比较特殊，xlblocks[idx]与expectedEndPtr相等表示这个页面已经进行了初始化，这个后面再来讲。

由于xlblocks[idx]的当前值为0，所以显然满足expectedEndPtr != endptr，于是开始执行if中的内容。
在这里插入图片描述

WALInsertLockUpdateInsertingAt与前面见过的WaitXLogInsertionsToFinish是一对函数，所以这里也先跳过。我们现在重点来看看AdvanceXLInsertBuffer，这个函数是GetXLogBuffer的核心函数，有两大功能：

如果ptr对应的页面是一个没有初始化的页面，即对应的xlblocks值为0，那么就将其初始化。
如果ptr对应的页面中有历史数据，那么就需要根据情况对历史数据进行落盘，然后将这个页面重新初始化。

我们现在来调试一下AdvanceXLInsertBuffer。

AdvanceXLInsertBuffer

我们先来看看AdvanceXLInsertBuffer的两个参数upto、opportunistic。upto是我们当前需要写入的XLOG的LSN，opportunistic当前的值为0，所以先不去管。这里我们只需要知道满足upto >= XLogCtl->InitializedUpTo 或者opportunistic其中一个条件，就会执行while循环中的代码，而while循环会实现AdvanceXLInsertBuffer的两大功能。

在这里插入图片描述

在这里，我们先持有WALBufMappingLock锁，持有WALBufMappingLock锁是为了访问XLogCtlData的InitializedUpTo成员。那么InitializedUpTo是什么呢？

在这里插入图片描述

为了更加清楚的解释InitializedUpTo以及后续的一些知识点，我们重启数据库，然后从StartupXLOG开始调试，从而可以获取XLogCtlData中一些关键成员的初始值，观察这些值的后续变化，从而更好的理解整个流程。

在调用StartupXLOG之前，首先会调用XLOGShmemInit，这个函数我们在前面已经讲过，主要是为XLogCtlData中的一些成员分配空间。XLOGShmemInit结束后，XLogCtlData中各个成员的空间已经分配完毕，进行了基本的初始化。如下图所示：

在这里插入图片描述

图中列举了XLogCtlData中主要成员的值，当前大部分值都为0。接下来我们进入StartupXLOG函数。

在这里插入图片描述

此时XLogCtlData主要成员的值与之前一样。StartupXLOG的执行流程非常的长，我们直接在line7283打上断点。经过前面的流程，我们得到EndOfLog的值。EndOfLog表示我们在关闭数据库前最后写入的XLOG的结束位置，这个位置也是一个LSN。我们在《XLOG 2.0》中提到过的那个精妙的Insert->CurrBytePos在这里就是通过EndOfLog转换并初始化的。XLogRecPtrToBytePos与XLogBytePosToRecPtr是LSN到逻辑位置相互转换的一对函数。
在这里插入图片描述

EndOfLog是当前最后一条XLOG的结束位置，也就是说后面需要写入的XLOG都是从这个位置开始，而EndOfLog对应的buffer page，就是后面首先会写入数据的buffer page。那么需要先将这个page进行初始化。

在这里插入图片描述

上面的代码包含了很多关键信息，我们一一说明：

获取EndOfLog对应的buffer page下标

这个前面已经见过很多次了，直接调用XLogRecPtrToBufIdx即可。firstIdx就是后面产生的第一条XLOG写入的buffer page。此处firstIdx值为1。
获取pageBeginPtr
pageBeginPtr表示firstIdx对应的buffer page所能存放的XLOG的起始LSN。那么这个page存放LSN的范围为 [pageBeginPtr，pageBeginPtr + XLOG_BLCKSZ]，所以pageBeginPtr + XLOG_BLCKSZ自然就是XLogCtl->xlblocks[firstIdx]的值。

初始化当前page

page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
len = EndOfLog % XLOG_BLCKSZ;
memcpy(page, xlogreader->readBuf, len);
//这个memset感觉不是太必要，之前为page分配空间时已经对整个页面做了memset
memset(page + len, 0, XLOG_BLCKSZ - len);

这几行代码将日志文件中属于当前页面的XLOG拷贝到当前页面中。

获取xlblocks[firstIdx]、InitializedUpTo

前面已经讲过xlblocks[firstIdx]的值为pageBeginPtr + XLOG_BLCKSZ。而在这里InitializedUpTo与xlblocks[firstIdx]相等。InitializedUpTo表示当前已经初始化的所有页面中xlblocks的最大值。这里只初始化了1个页面所以InitializedUpTo自然就是这个页面的xlblocks。

上述代码执行完毕后，我们监视的几个关键成员的值，已经发生了变化。接下来就是获取LogwrtResult与LogwrtRqst的值了，这部分代码很简单：

在这里插入图片描述

既然EndOfLog是最后一条日志的结束位置，那么显然LogwrtResult、LogwrtRqst的值都应该与EndOfLog相同。OK，现在我们已经获取到了XLogCtlData中关键成员的初始状态，为了更加直观，我们用一张图来表示：

在这里插入图片描述

图6

其中1号页面已经初始化并写入了部分日志，其他页面都没有初始化。OK，我们已经完成了初始化工作，现在可以向数据库中写点东西了。首先还是挂起后台日志进程。然后开启事务，执行插入文本的语句，别提交、别提交、别提交，一会儿有惊喜。

这是数据库启动后进行的第一次插入操作，所以也是重启后第一次写入XLOG。我们来到了CopyXLogRecordToWAL，通过GetXLogBuffer获取当前XLOG需要插入的buffer page。

在这里插入图片描述

由于是重启后第一次写入XLOG，所以cachedPage为0（因为还没有cachedPage）。

在这里插入图片描述

于是需要定位插入位置对应的buffer page。显然当前XLOG应该插入page1，而由于page1已经进行了初始化，所以expectedEndPtr与xlblocks相等。

在这里插入图片描述

于是就可以直接返回XLOG的写入位置指针。现在进入到了XLOG写入log buffer的循环。由于page1中的空闲空间只有不到200个字节，而我们插入的元组有1K，所以显然会出现XLOG长度大于当前buffer page空闲空间的情况。

在这里插入图片描述

所以，需要先将XLOG的一部分写入当前buffer page。
在这里插入图片描述

接下来我们需要将XLOG的剩余部分写入下一个buffer page。此时log buffer的情况如下图所示：

在这里插入图片描述

图7

此时page1已经写满，CurrPos与InitializedUpTo相等。接下来，我们会再调用GetXLogBuffer获取下一个buffer page的地址。这里需要注意CurrPos的值正好是page1的结束位置，其实也就是page2的开始位置。通过CurrPos我们很自然的知道需要将XLOG的剩余内容写入page2，而由于page2尚未初始化，所以对应的xlblocks值为0。

在这里插入图片描述

于是expectedEndPtr与endptr不相等，从而进入AdvanceXLInsertBuffer。

在这里插入图片描述

在AdvanceXLInsertBuffer中，由于upto与XLogCtl->InitializedUpTo相等（upto就是前面的CurrPos），所以会进入while循环。while循环中首先会判断upto之前及upto对应的页面中的数据是否已经落盘。如果没有落盘，则调用XLogWrite对相应的数据页面进行落盘，落盘的流程和前面完全一致，这里就不再赘述了。
在这里插入图片描述

当前upto对应的页面为page2，由于page2还没有初始化，所以不存在需要落盘的数据。所以直接进入后续的页面初始化流程：

在这里插入图片描述

这里有个地方值得注意，从图7中不难看出InitializedUpTo既是page1的结束位置，也是page2的开始位置。NewPageBeginPtr就表示page2的开始位置，自然page2的结束位置就是NewPageBeginPtr + XLOG_BLCKSZ。

最后，既然page2已经初始化完毕，那么InitializedUpTo自然就应该变为page2的结束位置NewPageEndPtr。

在这里插入图片描述

此时log buffer的情况，如图8所示：

在这里插入图片描述

图8

至此AdvanceXLInsertBuffer的流程已经全部讲解完毕。GetXLogBuffer的核心代码也讲解完毕。

AdvanceXLInsertBuffer之后台进程

还记得我们第一次见到AdvanceXLInsertBuffer函数是在哪里么？是在后台日志进程XLogBackgroundFlush函数的part2，当时说在这个场景下不太讲得清楚AdvanceXLInsertBuffer的作用，现在在了解了AdvanceXLInsertBuffer的细节之后，我们再来看看XLogBackgroundFlush中的AdvanceXLInsertBuffer有什么特别之处。我们在前面的调试中，挂起了后台日志进程，并且插入操作也没有commit，再加上log buffer的page1已经写满了，所以我们放开后台日志进程后，一定会触发后台进程对page1的落盘。所以我们又顺利地见到了之前我们略过的那个AdvanceXLInsertBuffer：

在这里插入图片描述

我们先来观察下这个AdvanceXLInsertBuffer的两个实参，在前面讲AdvanceXLInsertBuffer时upto对应的实参是page1的结束位置，而opportunistic对应false。而在这里upto对应InvalidXLogRecPtr，这是一个非法的LSN，值为0。

#define InvalidXLogRecPtr	0

而opportunistic对应为true，opportunistic这个单词的中文意思是“机会主义的”，这个命名非常贴切。AdvanceXLInsertBuffer的两个参数也充分的说明了AdvanceXLInsertBuffer的两个使用场景：

upto >= XLogCtl->InitializedUpTo

这个场景前面已经讲过
opportunistic为true

opportunistic为true，表示“有机会”的时候就执行AdvanceXLInsertBuffer。那么什么时候是机会呢？从前面AdvanceXLInsertBuffer中不难看出，即便buffer page不需要落盘，只是对buffer page进行初始化，也是一件比较复杂的事情。那么这件事如果放在后台来做是不是会比较美好呢？所以当后台日志进程对log buffer中的页面进行落盘时，也是一个初始化buffer page的契机。

在opportunistic为true时，我们需要在确保不需要将页面落盘的前提下，尽可能初始化更多的页面（pre-initialize as much as we can without flushing）。

在这里插入图片描述

在while循环中，会遍历log buffer中所有的页面，对这些页面逐一进行初始化，直到发现某个尚未落盘的页面。

在这里插入图片描述

图9

当前，log buffer的情况如图9所示，由于做了insert操作，所以page2中已经写入了一些XLOG。而后台进程在调用AdvanceXLInsertBuffer之前，已经将page1落盘（由于page2没有写满，所以后台进程是不会将page2落盘的，详见XLogBackgroundFlush的step2），除了page1、page2之外的其他页面都没有初始化。所以AdvanceXLInsertBuffer会将除page2之外的所有页面进行初始化。对于page1由于已经落盘，所以循环到page1后，会将page1重新初始化，而page2由于没有落盘，所以无法初始化，循环到page2后就会跳出循环。

在这里插入图片描述

不难发现，在循环跳出前，XLogCtl->xlblocks数组中的每一个元素都有对应的值。XLogCtl->InitializedUpTo的值远大于XLogCtl->LogwrtRqst.Write，因为这是重启后XLOG的第一次写入，但是buffer page却已经全部初始化了。现在log buffer的状态如图10：

在这里插入图片描述

图10

从现在起至log buffer循环写入到page2之前，在GetXLogBuffer中if (expectedEndPtr != endptr)总是为false。

在这里插入图片描述

至此AdvanceXLInsertBuffer介绍完毕。

WaitXLogInsertionsToFinish

好了，最后我们应该直面我们之前一直回避的两个函数了WaitXLogInsertionsToFinish和WALInsertLockUpdateInsertingAt。在讨论这两个函数之前，我们来回顾一下XLOG的写入流程：

调用XLogRecordAssemble组装XLOG。
调用ReserveXLogInsertLocation为前面组装的XLOG分配空间，返回该XLOG的StartPos（起始位置）和EndPos（结束位置）。
调用CopyXLogRecordToWAL将XLOG写入log buffer。

这三个步骤中，步骤2是串行的所以可以确保不同XLOG的[StartPos,EndPos]不会有交集。而第三个步骤是并行的。现在考虑一个多线程场景。

事务A	事务B
调用ReserveXLogInsertLocation获取EndPos_A
	调用ReserveXLogInsertLocation获取EndPos_B
	调用CopyXLogRecordToWAL将XLOG写入log buffer
	执行commit，将EndPos_B之前的日志全部落盘
调用CopyXLogRecordToWAL将XLOG写入log buffer

上面的两个事务，事务A先于事务B调用ReserveXLogInsertLocation，所以很显然EndPos_B > EndPos_A。紧接着事务B将XLOG写入了log buffer，并执行了commit。commit会将EndPos_B之前的所有日志都落盘。但是此时，事务A还没来得及将XLOG写入log buffer。所以此时在EndPos_B之前的日志存在“空洞”，当然也就不能直接将EndPos_B之前的日志落盘，必须等到事务A将相关的日志写入log buffer后，才能执行落盘的操作。而WaitXLogInsertionsToFinish的功能就是判断指定位置之前是否还有XLOG没有写入log buffer，如果有就等待，直到这些XLOG都写入log buffer。

所以，在调用XLogWrite进行XLOG落盘之前，都需要调用WaitXLogInsertionsToFinish。

在这里插入图片描述

这里需要介绍XLogCtlData中的一个新成员：Insert.WALInsertLocks。WALInsertLocks是一个WALInsertLockPadded的数组，数组元素的个数为NUM_XLOGINSERT_LOCKS（值为8）。该数组在XLOGShmemInit中进行初始化，在前面Log Buffer的组织结构章节中我们提到过WALInsertLockPadded数组。WALInsertLockPadded数组的定义如下：

/*
 * One WAL insertion lock: an LWLock paired with the position its holder has
 * advertised.  WaitXLogInsertionsToFinish reads insertingAt through
 * LWLockWaitForVar; the holder sets it with LWLockUpdateVar.
 */
typedef struct
{
	LWLock		lock;			/* the lock is passed together with insertingAt to LWLockWaitForVar/LWLockUpdateVar */
	XLogRecPtr	insertingAt;	/* advertised insert position; 0 (InvalidXLogRecPtr) means not known yet */
} WALInsertLock;

/*
 * WALInsertLock overlaid with a byte array of PG_CACHE_LINE_SIZE bytes, so
 * each element of the WALInsertLocks array is at least that large.
 */
typedef union WALInsertLockPadded
{
	WALInsertLock l;			/* the actual lock */
	char		pad[PG_CACHE_LINE_SIZE];	/* padding only; never accessed by name in the code shown */
} WALInsertLockPadded;

WALInsertLockPadded的主要成员是WALInsertLock，而WALInsertLock有两个成员：

insertingAt

表示当前进程将XLOG写入到了什么位置
lock

在读写insertingAt需要持有lock锁

现在，我们来调试一下前面的多线程场景，来看看WaitXLogInsertionsToFinish是如何工作的，由于后台日志进程在buffer page落盘之前也会调用WaitXLogInsertionsToFinish，所以为了避免后台日志进程的干扰，我们还是先挂起后台日志进程。然后使用下面的用例：

drop table if exists test1;
create table test1(a int);

drop table if exists test2;
create table test2(a int);

##事务A
begin;
insert into test1 values(1);
commit;

##事务B
begin;
insert into test2 values(1);    
commit;

注意，一定要创建两张表。两个事务分别对两张表做insert。否则两个事务会写入同一张表的同一个数据页，而对数据页的写入是串行的（数据页面的锁直到日志写入log buffer才会释放）。执行insert时，我们会来到XLogInsertRecord函数，XLogInsertRecord会首先调用WALInsertLockAcquire：

在这里插入图片描述

我们来看看WALInsertLockAcquire干了什么：

在这里插入图片描述

WALInsertLockAcquire会用到我们前面讲过的WALInsertLockPadded数组，WALInsertLockAcquire会获取一个WALInsertLockPadded对象，然后对其进行加锁。获取的方式是用当前进程的pgprocno（MyProc->pgprocno）mod NUM_XLOGINSERT_LOCKS。

if (lockToTry == -1)
		lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
	MyLockNo = lockToTry;

由于NUM_XLOGINSERT_LOCKS的值为8，所以最多有8个进程可以同时并行执行XLogInsertRecord。接下来会先后执行ReserveXLogInsertLocation和CopyXLogRecordToWAL。在CopyXLogRecordToWAL执行完后才会调用WALInsertLockRelease释放WALInsertLock上的锁。现在，我们让两个事务先后执行ReserveXLogInsertLocation

在这里插入图片描述

现在，我们保持事务A的进程不动，让事务B执行完毕，然后执行commit。然后我们就来到了XLogFlush中调用WaitXLogInsertionsToFinish的位置。

在这里插入图片描述

现在可以来看看WaitXLogInsertionsToFinish做了什么，WaitXLogInsertionsToFinish的实现代码如下：

/*
 * Wait for any WAL insertions < upto to finish.
 *
 * Returns the location of the oldest insertion that is still in-progress;
 * all WAL before that point is fully copied into WAL buffers and may be
 * flushed.  Because insertions older than 'upto' are waited for, the result
 * is >= 'upto', except when 'upto' is clamped to the reserved end (below).
 *
 * Note: When you are about to write out WAL, you must call this function
 * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
 * need to wait for an insertion to finish (or at least advance to next
 * uninitialized page), and the inserter might need to evict an old WAL buffer
 * to make room for a new one, which in turn requires WALWriteLock.
 */
static XLogRecPtr
WaitXLogInsertionsToFinish(XLogRecPtr upto)
{
	uint64		bytepos;		/* snapshot of Insert->CurrBytePos */
	XLogRecPtr	reservedUpto;	/* bytepos converted with XLogBytePosToEndRecPtr */
	XLogRecPtr	finishedUpto;	/* return value, see the loop comment below */
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	int			i;

	if (MyProc == NULL)
		elog(PANIC, "cannot wait without a PGPROC structure");

	/* Read the current insert position */
	SpinLockAcquire(&Insert->insertpos_lck);
	bytepos = Insert->CurrBytePos;
	SpinLockRelease(&Insert->insertpos_lck);
	reservedUpto = XLogBytePosToEndRecPtr(bytepos);

	/*
	 * No-one should request to flush a piece of WAL that hasn't even been
	 * reserved yet. However, it can happen if there is a block with a bogus
	 * LSN on disk, for example. XLogFlush checks for that situation and
	 * complains, but only after the flush. Here we just assume that to mean
	 * that all WAL that has been reserved needs to be finished. In this
	 * corner-case, the return value can be smaller than 'upto' argument.
	 */
	if (upto > reservedUpto)
	{
		elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
			 (uint32) (upto >> 32), (uint32) upto,
			 (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
		upto = reservedUpto;	/* clamp the request to what has been reserved */
	}

	/*
	 * Loop through all the locks, sleeping on any in-progress insert older
	 * than 'upto'.
	 *
	 * finishedUpto is our return value, indicating the point upto which all
	 * the WAL insertions have been finished. Initialize it to the head of
	 * reserved WAL, and as we iterate through the insertion locks, back it
	 * out for any insertion that's still in progress.
	 */
	finishedUpto = reservedUpto;
	for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
	{
		XLogRecPtr	insertingat = InvalidXLogRecPtr;	/* last value seen for lock i */

		do
		{
			/*
			 * See if this insertion is in progress. LWLockWait will wait for
			 * the lock to be released, or for the 'value' to be set by a
			 * LWLockUpdateVar call.  When a lock is initially acquired, its
			 * value is 0 (InvalidXLogRecPtr), which means that we don't know
			 * where it's inserting yet.  We will have to wait for it.  If
			 * it's a small insertion, the record will most likely fit on the
			 * same page and the inserter will release the lock without ever
			 * calling LWLockUpdateVar.  But if it has to sleep, it will
			 * advertise the insertion point with LWLockUpdateVar before
			 * sleeping.
			 */
			if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
								 &WALInsertLocks[i].l.insertingAt,
								 insertingat, &insertingat))
			{
				/* the lock was free, so no insertion in progress */
				insertingat = InvalidXLogRecPtr;
				break;
			}

			/*
			 * This insertion is still in progress. Have to wait, unless the
			 * inserter has proceeded past 'upto'.
			 */
		} while (insertingat < upto);

		/* a still-held lock with a known position lowers the finished point */
		if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
			finishedUpto = insertingat;
	}
	return finishedUpto;
}

WaitXLogInsertionsToFinish的主要部分就是line59~line94的这个for循环（内层嵌套一个do-while循环），循环遍历WALInsertLocks数组然后调用LWLockWaitForVar获取insertingAt值。下面，我们来看看LWLockWaitForVar是如何实现的：

/*
 * LWLockWaitForVar - Wait until lock is free, or a variable is updated.
 *
 * If the lock is held and *valptr equals oldval, waits until the lock is
 * either freed, or the lock holder updates *valptr by calling
 * LWLockUpdateVar.  If the lock is free on exit (immediately or after
 * waiting), returns true.  If the lock is still held, but *valptr no longer
 * matches oldval, returns false and sets *newval to the current value in
 * *valptr.
 *
 * Note: this function ignores shared lock holders; if the lock is held
 * in shared mode, returns 'true'.
 *
 * Parameters:
 *	lock	- the LWLock to watch
 *	valptr	- variable associated with the lock that the holder may update
 *	oldval	- value the caller last saw in *valptr
 *	newval	- out: current *valptr, written only when it differs from oldval
 *
 * The return value is whatever LWLockConflictsWithVar last stored in
 * 'result'.
 */
bool
LWLockWaitForVar(LWLock *lock, uint64 *valptr, uint64 oldval, uint64 *newval)
{
	PGPROC	   *proc = MyProc;
	int			extraWaits = 0;	/* semaphore wakeups absorbed that were not meant for this wait */
	bool		result = false;
#ifdef LWLOCK_STATS
	lwlock_stats *lwstats;

	lwstats = get_lwlock_stats_entry(lock);
#endif

	PRINT_LWDEBUG("LWLockWaitForVar", lock, LW_WAIT_UNTIL_FREE);

	/*
	 * Lock out cancel/die interrupts while we sleep on the lock.  There is no
	 * cleanup mechanism to remove us from the wait queue if we got
	 * interrupted.
	 */
	HOLD_INTERRUPTS();

	/*
	 * Loop here to check the lock's status after each time we are signaled.
	 */
	for (;;)
	{
		bool		mustwait;

		/* first check, made before queueing ourselves */
		mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval,
										  &result);

		if (!mustwait)
			break;				/* the lock was free or value didn't match */

		/*
		 * Add myself to wait queue. Note that this is racy, somebody else
		 * could wakeup before we're finished queuing. NB: We're using nearly
		 * the same twice-in-a-row lock acquisition protocol as
		 * LWLockAcquire(). Check its comments for details. The only
		 * difference is that we also have to check the variable's values when
		 * checking the state of the lock.
		 */
		LWLockQueueSelf(lock, LW_WAIT_UNTIL_FREE);

		/*
		 * Set RELEASE_OK flag, to make sure we get woken up as soon as the
		 * lock is released.
		 */
		pg_atomic_fetch_or_u32(&lock->state, LW_FLAG_RELEASE_OK);

		/*
		 * We're now guaranteed to be woken up if necessary. Recheck the lock
		 * and variables state.
		 */
		mustwait = LWLockConflictsWithVar(lock, valptr, oldval, newval,
										  &result);

		/* Ok, no conflict after we queued ourselves. Undo queueing. */
		if (!mustwait)
		{
			LOG_LWDEBUG("LWLockWaitForVar", lock, "free, undoing queue");

			LWLockDequeueSelf(lock);
			break;
		}

		/*
		 * Wait until awakened.
		 *
		 * Since we share the process wait semaphore with the regular lock
		 * manager and ProcWaitForSignal, and we may need to acquire an LWLock
		 * while one of those is pending, it is possible that we get awakened
		 * for a reason other than being signaled by LWLockRelease. If so,
		 * loop back and wait again.  Once we've gotten the LWLock,
		 * re-increment the sema by the number of additional signals received,
		 * so that the lock manager or signal manager will see the received
		 * signal when it next waits.
		 *
		 * NOTE(review): this function only waits and never takes the lock, so
		 * "once we've gotten the LWLock" looks carried over from
		 * LWLockAcquire(); here the re-increment happens after the outer loop
		 * exits -- confirm.
		 */
		LOG_LWDEBUG("LWLockWaitForVar", lock, "waiting");

#ifdef LWLOCK_STATS
		lwstats->block_count++;
#endif

		LWLockReportWaitStart(lock);
		TRACE_POSTGRESQL_LWLOCK_WAIT_START(T_NAME(lock), T_ID(lock),
										   LW_EXCLUSIVE);

		/* sleep until lwWaiting has been cleared; count every other wakeup */
		for (;;)
		{
			PGSemaphoreLock(&proc->sem);
			if (!proc->lwWaiting)
				break;
			extraWaits++;
		}

#ifdef LOCK_DEBUG
		{
			/* not waiting anymore */
			uint32 nwaiters PG_USED_FOR_ASSERTS_ONLY = pg_atomic_fetch_sub_u32(&lock->nwaiters, 1);

			Assert(nwaiters < MAX_BACKENDS);
		}
#endif

		TRACE_POSTGRESQL_LWLOCK_WAIT_DONE(T_NAME(lock), T_ID(lock),
										  LW_EXCLUSIVE);
		LWLockReportWaitEnd();

		LOG_LWDEBUG("LWLockWaitForVar", lock, "awakened");

		/* Now loop back and check the status of the lock again. */
	}

	TRACE_POSTGRESQL_LWLOCK_ACQUIRE(T_NAME(lock), T_ID(lock), LW_EXCLUSIVE);

	/*
	 * Fix the process wait semaphore's count for any absorbed wakeups.
	 */
	while (extraWaits-- > 0)
		PGSemaphoreUnlock(&proc->sem);

	/*
	 * Now okay to allow cancel/die interrupts.
	 */
	RESUME_INTERRUPTS();

	return result;
}

函数开头的第一行注释已经很清楚地说明了LWLockWaitForVar函数的功能：等待，直到锁状态为free或者valptr的值发生变化（Wait until lock is free, or a variable is updated）。该函数的核心是LWLockConflictsWithVar函数，该函数依据Wait until lock is free, or a variable is updated的规则来判断是否需要等待，我们再来看看LWLockConflictsWithVar的实现。LWLockConflictsWithVar的函数原型如下：

static bool
LWLockConflictsWithVar(LWLock *lock,
					   uint64 *valptr, uint64 oldval, uint64 *newval,
					   bool *result)

该函数返回一个bool类型，用于通知上层，是否需要等待。result表示锁的状态是否为free。 该函数的执行步骤如下：

判断lock的状态，如果不是LW_VAL_EXCLUSIVE（互斥锁），则说明锁的状态为free，所以将result的值设为true。既然锁是free，那么也就无需等待，所以直接返回false。
锁的状态为LW_VAL_EXCLUSIVE，表明锁不是free，所以将result设为false。
加锁并获取valptr的值。
比较*valptr与oldval。如果*valptr不等于oldval那么说明*valptr的值发生了变化，此时也无需等待，所以将*newval的值设置为*valptr，然后返回false。
如果*valptr等于oldval那么就说明*valptr的值没有发生变化，所以必须等待，返回true。

LWLockConflictsWithVar的实现如下：

/*
 * Decide whether a LWLockWaitForVar caller still has to sleep.
 *
 * Returns true only when the lock is held exclusively AND *valptr still
 * equals oldval.  In every other case returns false:
 *   - lock not held exclusively: *result is set to true, *newval untouched;
 *   - lock held exclusively but the value changed: *result is set to false
 *     and the value that was read is stored in *newval.
 */
static bool
LWLockConflictsWithVar(LWLock *lock,
					   uint64 *valptr, uint64 oldval, uint64 *newval,
					   bool *result)
{
	uint64		current;
	bool		held_exclusive;

	/*
	 * Look at the lock state first.
	 *
	 * XXX: the caller uses a spinlock before this, so we don't need a memory
	 * barrier here as far as the current usage is concerned.  But that might
	 * not be safe in general.
	 */
	held_exclusive = (pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE) != 0;

	/* *result reports "the lock was free", i.e. not held exclusively */
	*result = !held_exclusive;
	if (!held_exclusive)
		return false;			/* free lock: nothing to wait for */

	/*
	 * The value is read under the lwlock's wait list lock, as we can't
	 * generally rely on atomic 64 bit reads/stores.  TODO: On platforms with
	 * a way to do atomic 64 bit reads/writes the spinlock should be optimized
	 * away.
	 */
	LWLockWaitListLock(lock);
	current = *valptr;
	LWLockWaitListUnlock(lock);

	/* unchanged value while the lock is still held: the caller must wait */
	if (current == oldval)
		return true;

	/* the holder moved the value: hand it back, no need to wait */
	*newval = current;
	return false;
}

在了解了LWLockWaitForVar的功能后，我们再来看看LWLockWaitForVar具体是如何实现WaitXLogInsertionsToFinish的功能的。WaitXLogInsertionsToFinish(XLogRecPtr upto)用于确保upto之前的所有日志都已写入log buffer。有两个方面可以确保这一点：

当前没有需要写入或正在写入log buffer的进程。
当前有进程正在写log buffer，写入的位置已经大于或等于upto了。

只要满足上面两种情况的任意一种情况WaitXLogInsertionsToFinish都会返回，否则WaitXLogInsertionsToFinish会挂起当前进程进入等待。

回到我们前面的用例，事务A在执行ReserveXLogInsertLocation之前调用WALInsertLockAcquire给WALInsertLocks数组中当前进程对应的WALInsertLock上了锁。当事务B提交时，在循环WALInsertLocks数组时就会发现事务A上的这把锁。由于事务A并没有修改insertingAt，所以insertingAt为0。而事务B的commit在调用LWLockWaitForVar时，给oldval传递的实参为InvalidXLogRecPtr也是0。所以底层的LWLockConflictsWithVar会返回true。从而让LWLockWaitForVar陷入等待：

在这里插入图片描述

我们现在让事务A继续执行，执行完CopyXLogRecordToWAL并通过WALInsertLockRelease释放锁。现在事务B被唤醒。由于锁状态已经为free，所以LWLockConflictsWithVar返回false。

在这里插入图片描述

上层函数LWLockWaitForVar返回的是锁的状态是否为free（即LWLockConflictsWithVar的result参数），所以上层直接跳出内循环，继续判断下一个的WALInsertLock的状态。

在这里插入图片描述

由于当前没有其他并发进程，所以最终WaitXLogInsertionsToFinish函数正常返回，commit继续执行最终完成。

WALInsertLockUpdateInsertingAt

我们再来看看WaitXLogInsertionsToFinish那个主要的循环：

do
{
    if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
                         &WALInsertLocks[i].l.insertingAt,
                         insertingat, &insertingat))
    {
        /* the lock was free, so no insertion in progress */
        insertingat = InvalidXLogRecPtr;
        break;
    }

    	/*
		 * This insertion is still in progress. Have to wait, unless the
		 * inserter has proceeded past 'upto'.
		 */
} while (insertingat < upto);

在前面的场景中，LWLockWaitForVar返回是由于WALInsertLocks[i].l.lock的状态变为了free。如果lock状态不是free，LWLockWaitForVar返回就说明WALInsertLocks[i].l.insertingAt的值发生了变化，且新值存放在insertingat中，所以我们需要判断insertingat是否大于等于upto，如果是则结束循环，否则继续循环等待。

而WALInsertLockUpdateInsertingAt函数就是用于修改当前进程对应的insertingAt，然后唤醒因为调用WaitXLogInsertionsToFinish而陷入等待的进程，让它们再次校验insertingAt是否满足要求。

/*
 * Advertise our insertion progress: store 'insertingAt' in the insertingAt
 * field of the insertion lock we hold, via LWLockUpdateVar, so that others
 * know we have finished inserting up to that point.
 */
static void
WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
{
	int			lockno;

	/*
	 * When all insertion locks are held, the last lock is the one that marks
	 * our actual position (see comments in WALInsertLockAcquireExclusive);
	 * otherwise use the single lock this backend holds.
	 */
	lockno = holdingAllLocks ? NUM_XLOGINSERT_LOCKS - 1 : MyLockNo;

	LWLockUpdateVar(&WALInsertLocks[lockno].l.lock,
					&WALInsertLocks[lockno].l.insertingAt,
					insertingAt);
}

/*
 * LWLockUpdateVar - Update a variable and wake up waiters atomically
 *
 * Sets *valptr to 'val', and wakes up all processes waiting for us with
 * LWLockWaitForVar().  Setting the value and waking up the processes happen
 * atomically so that any process calling LWLockWaitForVar() on the same lock
 * is guaranteed to see the new value, and act accordingly.
 *
 * The caller must be holding the lock in exclusive mode.
 *
 * Parameters:
 *	lock	- the LWLock the variable belongs to
 *	valptr	- the variable to update
 *	val		- the new value to store in *valptr
 */
void
LWLockUpdateVar(LWLock *lock, uint64 *valptr, uint64 val)
{
	dlist_head	wakeup;			/* local list of waiters to wake after unlocking */
	dlist_mutable_iter iter;

	PRINT_LWDEBUG("LWLockUpdateVar", lock, LW_EXCLUSIVE);

	dlist_init(&wakeup);

	/* the value and the wait queue are both changed under the wait list lock */
	LWLockWaitListLock(lock);

	Assert(pg_atomic_read_u32(&lock->state) & LW_VAL_EXCLUSIVE);

	/* Update the lock's value */
	*valptr = val;

	/*
	 * See if there are any LW_WAIT_UNTIL_FREE waiters that need to be woken
	 * up. They are always in the front of the queue.
	 */
	dlist_foreach_modify(iter, &lock->waiters)
	{
		PGPROC	   *waiter = dlist_container(PGPROC, lwWaitLink, iter.cur);

		/* stop at the first waiter of any other mode */
		if (waiter->lwWaitMode != LW_WAIT_UNTIL_FREE)
			break;

		/* move the waiter from the lock's queue to the local wakeup list */
		dlist_delete(&waiter->lwWaitLink);
		dlist_push_tail(&wakeup, &waiter->lwWaitLink);
	}

	/* We are done updating shared state of the lock itself. */
	LWLockWaitListUnlock(lock);

	/*
	 * Awaken any waiters I removed from the queue.
	 */
	dlist_foreach_modify(iter, &wakeup)
	{
		PGPROC	   *waiter = dlist_container(PGPROC, lwWaitLink, iter.cur);

		dlist_delete(&waiter->lwWaitLink);
		/* check comment in LWLockWakeup() about this barrier */
		pg_write_barrier();
		/* clear the flag before posting the semaphore; the waiter tests it after waking */
		waiter->lwWaiting = false;
		PGSemaphoreUnlock(&waiter->sem);
	}
}

遗憾的是，目前我还没有找到这个函数的应用场景。所以作为一个遗留问题，以后再来分析。

obvious__

关注

4
点赞
踩
3

收藏

觉得还不错? 一键收藏
4
评论
PostgreSQL重启恢复---Log Buffer

Log Buffer预备知识《PostgreSQL重启恢复—XLOG 1.0》《PostgreSQL重启恢复—XLOG 2.0》概述在前面的文章中，我们解决了对于数据页的insert操作，需要组装一个怎样的XLOG以及XLOG写入log buffer的流程。接下来我们需要来更加深入的了解下log buffer，了解他的组织结构，了解log buffer的作用，更重要的是要了解XLOG如何落盘。还记得WAL的两大准则么：XLOG落盘之后，对应的数据才能落盘。一个事务相关的所有XLOG都落盘之后
复制链接

扫一扫