postgre源码解析24 缓冲池管理--2

Serendipity_Shy

已于 2022-10-01 11:40:14 修改

阅读量321

点赞数

分类专栏： postgres 文章标签： postgresql 数据库

于 2022-09-11 08:28:59 首次发布

本文链接：https://blog.csdn.net/qq_52668274/article/details/126799392

版权

postgres 专栏收录该内容

54 篇文章 28 订阅

订阅专栏

本文将讲解后端进程从缓冲池中读取数据页流程，其入口函数为ReadBuffer，实际调用ReadBufferExtended函数。
知识回顾：postgres源码解析缓冲池管理器–1

总结

执行流程图：
在这里插入图片描述

源码解析

1 ReadBuffer

/*
 * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main
 *	fork with RBM_NORMAL mode and default strategy.
 */
Buffer
ReadBuffer(Relation reln, BlockNumber blockNum)
{
	return ReadBufferExtended(reln, MAIN_FORKNUM, blockNum, RBM_NORMAL, NULL);
}

2 ReadBufferExtended

/*
 * ReadBufferExtended -- returns a buffer containing the requested
 *		block of the requested relation.  If the blknum
 *		requested is P_NEW, extend the relation file and
 *		allocate a new block.  (Caller is responsible for
 *		ensuring that only one backend tries to extend a
 *		relation at the same time!)
 * 	返回含有请求数据页的缓冲块如果 blknum 为P_NEW,则徐扩展页并分配新块，必须确保同一个时间
 * 只有一个后台进程尝试扩展文件页
 *	
 */
Buffer
ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum,
				   ReadBufferMode mode, BufferAccessStrategy strategy)
{
	bool		hit;
	Buffer		buf;

	/* Open it at the smgr level if not already done */
	RelationOpenSmgr(reln);        // 打开磁盘管理器

	/*
	 * Reject attempts to read non-local temporary relations; we would be
	 * likely to get wrong data since we have no visibility into the owning
	 * session's local buffers.
	 */
	if (RELATION_IS_OTHER_TEMP(reln))			// 拒绝访问其他会话的临时表
		ereport(ERROR,
				(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
				 errmsg("cannot access temporary tables of other sessions")));

	/*
	 * Read the buffer, and update pgstat counters to reflect a cache hit or
	 * miss.
	 */
	pgstat_count_buffer_read(reln);        			// 更新统计信息
	buf = ReadBuffer_common(reln->rd_smgr, reln->rd_rel->relpersistence,     //真正执行函数
							forkNum, blockNum, mode, strategy, &hit);
	if (hit)
		pgstat_count_buffer_hit(reln);   	// 命中，则更新信息
	return buf;
}

3 ReadBuffer_common

*
 * ReadBuffer_common -- common logic for all ReadBuffer variants
 // 读取缓冲块的通用入口
 * *hit is set to true if the request was satisfied from shared buffer cache.
 */
static Buffer
ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
				  BlockNumber blockNum, ReadBufferMode mode,
				  BufferAccessStrategy strategy, bool *hit)
{
	BufferDesc *bufHdr;
	Block		bufBlock;
	bool		found;
	bool		isExtend;
	bool		isLocalBuf = SmgrIsTemp(smgr);			// 判断是否为本地缓冲块

	*hit = false;

	/* Make sure we will have room to remember the buffer pin */
	ResourceOwnerEnlargeBuffers(CurrentResourceOwner);

	isExtend = (blockNum == P_NEW); 					// 是否进行扩展

	TRACE_POSTGRESQL_BUFFER_READ_START(forkNum, blockNum,
									   smgr->smgr_rnode.node.spcNode,
									   smgr->smgr_rnode.node.dbNode,
									   smgr->smgr_rnode.node.relNode,
									   smgr->smgr_rnode.backend,
									   isExtend);

	/* Substitute proper block number if caller asked for P_NEW */
	if (isExtend)								
	{
		blockNum = smgrnblocks(smgr, forkNum);
		/* Fail if relation is already at maximum possible length */
		if (blockNum == P_NEW)
			ereport(ERROR,
					(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
					 errmsg("cannot extend relation %s beyond %u blocks",
							relpath(smgr->smgr_rnode, forkNum),
							P_NEW)));
	}

	if (isLocalBuf)			// 如果是本地缓冲块，则调用 LocalBufferAlloc
	{
		bufHdr = LocalBufferAlloc(smgr, forkNum, blockNum, &found);   
		if (found)
			pgBufferUsage.local_blks_hit++;
		else if (isExtend)
			pgBufferUsage.local_blks_written++;
		else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
				 mode == RBM_ZERO_ON_ERROR)
			pgBufferUsage.local_blks_read++;
	}
	else
	{	
		// 反之，调用 BufferAlloc
		/*
		 * lookup the buffer.  IO_IN_PROGRESS is set if the requested block is
		 * not currently in memory.
		 */
		bufHdr = BufferAlloc(smgr, relpersistence, forkNum, blockNum,
							 strategy, &found);
		if (found)
			pgBufferUsage.shared_blks_hit++;
		else if (isExtend)
			pgBufferUsage.shared_blks_written++;
		else if (mode == RBM_NORMAL || mode == RBM_NORMAL_NO_LOG ||
				 mode == RBM_ZERO_ON_ERROR)
			pgBufferUsage.shared_blks_read++;
	}

	/* At this point we do NOT hold any locks. */

如果读取的缓冲块已在缓冲池中，则直接返回


	/* if it was already in the buffer pool, we're done */
	if (found)
	{
		if (!isExtend)
		{
			/* Just need to update stats before we exit */
			*hit = true;
			VacuumPageHit++;

			if (VacuumCostActive)
				VacuumCostBalance += VacuumCostPageHit;

			TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
											  smgr->smgr_rnode.node.spcNode,
											  smgr->smgr_rnode.node.dbNode,
											  smgr->smgr_rnode.node.relNode,
											  smgr->smgr_rnode.backend,
											  isExtend,
											  found);

			/*
			 * In RBM_ZERO_AND_LOCK mode the caller expects the page to be
			 * locked on return.
			 */
			 // 非本地缓冲块，依据模式进行相关所的施加或者释放
			if (!isLocalBuf)
			{
				if (mode == RBM_ZERO_AND_LOCK)
					LWLockAcquire(BufferDescriptorGetContentLock(bufHdr),
								  LW_EXCLUSIVE);
				else if (mode == RBM_ZERO_AND_CLEANUP_LOCK)
					LockBufferForCleanup(BufferDescriptorGetBuffer(bufHdr));
			}
			// 返回数据页对应的缓冲块
			return BufferDescriptorGetBuffer(bufHdr);
		}

		/*
		 * We get here only in the corner case where we are trying to extend
		 * the relation but we found a pre-existing buffer marked BM_VALID.
		 * This can happen because mdread doesn't complain about reads beyond
		 * EOF (when zero_damaged_pages is ON) and so a previous attempt to
		 * read a block beyond EOF could have left a "valid" zero-filled
		 * buffer.  Unfortunately, we have also seen this case occurring
		 * because of buggy Linux kernels that sometimes return an
		 * lseek(SEEK_END) result that doesn't account for a recent write. In
		 * that situation, the pre-existing buffer would contain valid data
		 * that we don't want to overwrite.  Since the legitimate case should
		 * always have left a zero-filled buffer, complain if not PageIsNew.
		 *
		 *	在尝试扩展relation时发现一个预存在的缓冲块且标记为 BM_VALID 【极端情况】
		 *  发生此现象的原因可能是 mdread在读到的EOF时并没有声明（zero_damaged_pages is ON）
		 * 导致之前尝试读取超出 EOF 的块可能会留下一个“有效的”零填充缓冲区。
		 * 不幸的是这种情况可能发生，由于Linux kernels的bug导致返回写入 lseek 结果
		 */
		 
		bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);
		if (!PageIsNew((Page) bufBlock))
			ereport(ERROR,
					(errmsg("unexpected data beyond EOF in block %u of relation %s",
							blockNum, relpath(smgr->smgr_rnode, forkNum)),
					 errhint("This has been seen to occur with buggy kernels; consider updating your system.")));

		/*
		 * We *must* do smgrextend before succeeding, else the page will not
		 * be reserved by the kernel, and the next P_NEW call will decide to
		 * return the same page.  Clear the BM_VALID bit, do the StartBufferIO
		 * call that BufferAlloc didn't, and proceed.
		 *
		 *	成功读取到缓冲块需要进行相关操作 ，本地buffer,仅需调整标识位
		 */
		if (isLocalBuf)
		{
			/* Only need to adjust flags */
			uint32		buf_state = pg_atomic_read_u32(&bufHdr->state);

			Assert(buf_state & BM_VALID);     
			buf_state &= ~BM_VALID;                   // 取消  BM_VALID
			pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
		}
		else
		{
			/*
			 * Loop to handle the very small possibility that someone re-sets
			 * BM_VALID between our clearing it and StartBufferIO inspecting
			 * it.
			 *
			 * 如果是共享缓冲块，则需循环等待直接标记为 BM_IO_IN_PROGRESS，因为该缓冲块可能
			 * 被其他进程操作，进行IO，用循环判断
			 */
			do
			{
				uint32		buf_state = LockBufHdr(bufHdr);

				Assert(buf_state & BM_VALID);
				buf_state &= ~BM_VALID;
				UnlockBufHdr(bufHdr, buf_state);
			} while (!StartBufferIO(bufHdr, true));
		}
	}

如果读取的缓冲块不在缓冲区中，返回的只是新分配的缓冲块，其内容是不是有效的，对共享缓冲池，则仅仅设置为
IO_IN_PROGRESS，防止其他进程进行IO操作

	/*
	 * if we have gotten to this point, we have allocated a buffer for the
	 * page but its contents are not yet valid.  IO_IN_PROGRESS is set for it,
	 * if it's a shared buffer.
	 *
	 * Note: if smgrextend fails, we will end up with a buffer that is
	 * allocated but not marked BM_VALID.  P_NEW will still select the same
	 * block number (because the relation didn't get any longer on disk) and
	 * so future attempts to extend the relation will find the same buffer (if
	 * it's not been recycled) but come right back here to try smgrextend
	 * again.
	 *
	 * 如果 smgrextend失败，将终止分配而不是标记为 BM_VALID。 
	 * 指定P_NEW标识的话选择同一页（因为relation未持久化），
	 */
	Assert(!(pg_atomic_read_u32(&bufHdr->state) & BM_VALID));	/* spinlock not needed */

	bufBlock = isLocalBuf ? LocalBufHdrGetBlock(bufHdr) : BufHdrGetBlock(bufHdr);

	if (isExtend)
	{
		/* new buffers are zero-filled */
		MemSet((char *) bufBlock, 0, BLCKSZ);
		/* don't set checksum for all-zero page */
		smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false);  

		/*
		 * NB: we're *not* doing a ScheduleBufferTagForWriteback here;
		 * although we're essentially performing a write. At least on linux
		 * doing so defeats the 'delayed allocation' mechanism, leading to
		 * increased file fragmentation.
		 */
	}
	else
	{
		/*
		 * Read in the page, unless the caller intends to overwrite it and
		 * just wants us to allocate a buffer.
		 */
		if (mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK)  // 置零
			MemSet((char *) bufBlock, 0, BLCKSZ);
		else
		{
			instr_time	io_start,
						io_time;

			if (track_io_timing)
				INSTR_TIME_SET_CURRENT(io_start);

			smgrread(smgr, forkNum, blockNum, (char *) bufBlock);			// 底层实现，将磁盘文件读入缓冲块

			if (track_io_timing)
			{
				INSTR_TIME_SET_CURRENT(io_time);
				INSTR_TIME_SUBTRACT(io_time, io_start);
				pgstat_count_buffer_read_time(INSTR_TIME_GET_MICROSEC(io_time));
				INSTR_TIME_ADD(pgBufferUsage.blk_read_time, io_time);
			}
			
			// 垃圾数据检查，出现错误则报告相关信息
			/* check for garbage data */
			if (!PageIsVerifiedExtended((Page) bufBlock, blockNum,
										PIV_LOG_WARNING | PIV_REPORT_STAT))
			{
				if (mode == RBM_ZERO_ON_ERROR || zero_damaged_pages)
				{
					ereport(WARNING,
							(errcode(ERRCODE_DATA_CORRUPTED),
							 errmsg("invalid page in block %u of relation %s; zeroing out page",
									blockNum,
									relpath(smgr->smgr_rnode, forkNum))));
					MemSet((char *) bufBlock, 0, BLCKSZ);
				}
				else
					ereport(ERROR,
							(errcode(ERRCODE_DATA_CORRUPTED),
							 errmsg("invalid page in block %u of relation %s",
									blockNum,
									relpath(smgr->smgr_rnode, forkNum))));
			}
		}
	}

	/*
	 * In RBM_ZERO_AND_LOCK mode, grab the buffer content lock before marking
	 * the page as valid, to make sure that no other backend sees the zeroed
	 * page before the caller has had a chance to initialize it.
	 *
	 * Since no-one else can be looking at the page contents yet, there is no
	 * difference between an exclusive lock and a cleanup-strength lock. (Note
	 * that we cannot use LockBuffer() or LockBufferForCleanup() here, because
	 * they assert that the buffer is already valid.)
	 */
	 // 设置标识
	if ((mode == RBM_ZERO_AND_LOCK || mode == RBM_ZERO_AND_CLEANUP_LOCK) &&
		!isLocalBuf)
	{
		LWLockAcquire(BufferDescriptorGetContentLock(bufHdr), LW_EXCLUSIVE);
	}

	if (isLocalBuf)
	{
		/* Only need to adjust flags */
		uint32		buf_state = pg_atomic_read_u32(&bufHdr->state);

		buf_state |= BM_VALID;     									// BM_VALID，表明以读取完成，供后续使用
		pg_atomic_unlocked_write_u32(&bufHdr->state, buf_state);
	}
	else
	{
		/* Set BM_VALID, terminate IO, and wake up any waiters */
		TerminateBufferIO(bufHdr, false, BM_VALID);				/// 唤醒其他进程的IO操作
	}

	VacuumPageMiss++;
	if (VacuumCostActive)
		VacuumCostBalance += VacuumCostPageMiss;

	TRACE_POSTGRESQL_BUFFER_READ_DONE(forkNum, blockNum,
									  smgr->smgr_rnode.node.spcNode,
									  smgr->smgr_rnode.node.dbNode,
									  smgr->smgr_rnode.node.relNode,
									  smgr->smgr_rnode.backend,
									  isExtend,
									  found);

	return BufferDescriptorGetBuffer(bufHdr);				// 返回缓冲块块号