本文讲解缓冲块的选择策略BufferAlloc,同时该函数也是替换策略的核心函数,
知识回顾:
postgres源码解析 缓冲池管理–1
postgres源码解析 缓冲池管理–2
总结<执行流程图>
源码解析
1 数据结构
typedef struct SMgrRelationData
{
/* rnode is the hashtable lookup key, so it must be first! */
// hashkey,物理标识符
RelFileNodeBackend smgr_rnode; /* relation physical identifier */
/* pointer to owning pointer, or NULL if none */
struct SMgrRelationData **smgr_owner;
/*
* The following fields are reset to InvalidBlockNumber upon a cache flush
* event, and hold the last known size for each fork. This information is
* currently only reliable during recovery, since there is no cache
* invalidation for fork extension.
*/
BlockNumber smgr_targblock; /* current insertion target block */
BlockNumber smgr_cached_nblocks[MAX_FORKNUM + 1]; /* last known size */
/* additional public fields may someday exist here */
/*
* Fields below here are intended to be private to smgr.c and its
* submodules. Do not touch them from elsewhere.
*/
int smgr_which; /* storage manager selector */
/*
* for md.c; per-fork arrays of the number of open segments
* (md_num_open_segs) and the segments themselves (md_seg_fds).
*/
int md_num_open_segs[MAX_FORKNUM + 1];
struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1];
/* if unowned, list link in list of all unowned SMgrRelations */
dlist_node node;
} SMgrRelationData;
buffer访问策略数据结构体
typedef struct BufferAccessStrategyData
{
/* Overall strategy type */
BufferAccessStrategyType btype; // 策略类型
/* Number of elements in buffers[] array */
int ring_size; // 环大小
/*
* Index of the "current" slot in the ring, ie, the one most recently
* returned by GetBufferFromRing.
*/
int current; // 当前槽在缓冲的索引
/*
* True if the buffer just returned by StrategyGetBuffer had been in the
* ring already.
*/
bool current_was_in_ring; // buffer是否在环中
/*
* Array of buffer numbers. InvalidBuffer (that is, zero) indicates we
* have not yet selected a buffer for this ring slot. For allocation
* simplicity this is palloc'd together with the fixed fields of the
* struct.
*/
Buffer buffers[FLEXIBLE_ARRAY_MEMBER]; // buffer 数组
} BufferAccessStrategyData;
2 执行流程
/*
* BufferAlloc -- subroutine for ReadBuffer. Handles lookup of a shared
* buffer. If no buffer exists already, selects a replacement
* victim and evicts the old page, but does NOT read in new page.
*
// 该函数在共享内存中寻找目标快。如果不存在则次啊用页面置换算法选择牺牲页供后续读入
* "strategy" can be a buffer replacement strategy object, or NULL for
* the default strategy. The selected buffer's usage_count is advanced when
* using the default strategy, but otherwise possibly not (see PinBuffer).
// 当页面被使用时会自增usage_count[默认策略]
* The returned buffer is pinned and is already marked as holding the
* desired page. If it already did have the desired page, *foundPtr is
* set true. Otherwise, *foundPtr is set false and the buffer is marked
* as IO_IN_PROGRESS; ReadBuffer will now need to do I/O to fill it.
*
// 返回的buffrer 不仅被 pinned而且被标记持有目标数据页,如果缓冲区已存在目标也,则
// *foundPtr为TRUE. 否则为FALSE,且buffer标记为 IO_IN_PROGRESS
* *foundPtr is actually redundant with the buffer's BM_VALID flag, but
* we keep it for simplicity in ReadBuffer.
*
* No locks are held either at entry or exit.
*/
static BufferDesc *
BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
BlockNumber blockNum,
BufferAccessStrategy strategy,
bool *foundPtr)
- 首先在哈希表中查看是否已存在缓冲池中,如果是直接返回
static BufferDesc *
BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum,
BlockNumber blockNum,
BufferAccessStrategy strategy,
bool *foundPtr)
{
BufferTag newTag; /* identity of requested block */
uint32 newHash; /* hash value for newTag */
LWLock *newPartitionLock; /* buffer partition lock for it */
BufferTag oldTag; /* previous identity of selected buffer */
uint32 oldHash; /* hash value for oldTag */
LWLock *oldPartitionLock; /* buffer partition lock for it */
uint32 oldFlags;
int buf_id;
BufferDesc *buf;
bool valid;
uint32 buf_state;
/* create a tag so we can lookup the buffer */
// 根据入参确定hashkey
INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
/* determine its hash code and partition lock ID */
// 根据hash code确定施加 newPartitionLock
newHash = BufTableHashCode(&newTag);
newPartitionLock = BufMappingPartitionLock(newHash);
/* see if the block is in the buffer pool already */
// 施加共享 newPartitionLock,并在hash table查找
LWLockAcquire(newPartitionLock, LW_SHARED);
buf_id = BufTableLookup(&newTag, newHash);
if (buf_id >= 0)
{
/*
* Found it. Now, pin the buffer so no one can steal it from the
* buffer pool, and check to see if the correct data has been loaded
* into the buffer.
*/
buf = GetBufferDescriptor(buf_id); // 获取缓冲区描述符
valid = PinBuffer(buf, strategy); // pin buffer
/* Can release the mapping lock as soon as we've pinned it */
LWLockRelease(newPartitionLock); // 释放 newPartitionLock
*foundPtr = true;
if (!valid)
{
/*
* We can only get here if (a) someone else is still reading in
* the page, or (b) a previous read attempt failed. We have to
* wait for any active read attempt to finish, and then set up our
* own read attempt if the page is still not BM_VALID.
* StartBufferIO does it all.
*/
// 未pin,则意味其他正在读或者读失败。必须等待其他人完成IO操作
if (StartBufferIO(buf, true))
{
/*
* If we get here, previous attempts to read the buffer must
* have failed ... but we shall bravely try again.
*/
*foundPtr = false;
}
}
return buf; // 缓冲池描述符
}
2.2 缓冲池中未找到,则必须初始化一个新的buffer,执行时释放newPartitionLock
/*
* Didn't find it in the buffer pool. We'll have to initialize a new
* buffer. Remember to unlock the mapping lock while doing the work.
*/
LWLockRelease(newPartitionLock);
/* Loop here in case we have to try another victim buffer */
for (;;)
{
/*
* Ensure, while the spinlock's not yet held, that there's a free
* refcount entry.
*/
ReservePrivateRefCountEntry();
/*
* Select a victim buffer. The buffer is returned with its header
* spinlock still held!
*/
buf = StrategyGetBuffer(strategy, &buf_state); // 页面置换算法
Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
/* Must copy buffer flags while we still hold the spinlock */
oldFlags = buf_state & BUF_FLAG_MASK; // 更新状态标识
/* Pin the buffer and then release the buffer spinlock */
PinBuffer_Locked(buf); // pin住buffer
/*
* If the buffer was dirty, try to write it out. There is a race
* condition here, in that someone might dirty it after we released it
* above, or even while we are writing it out (since our share-lock
* won't prevent hint-bit updates). We will recheck the dirty bit
* after re-locking the buffer header.
*/
// buffer为脏,则需要将其写出。这会存在竞争情景:在释放该buffer时其他人将其置脏
// 或者正在写入[因为共享锁不会阻止辨识标识位的更改]。 在锁buffer header时重检
// 标识位
if (oldFlags & BM_DIRTY)
{
/*
* We need a share-lock on the buffer contents to write it out
* (else we might write invalid data, eg because someone else is
* compacting the page contents while we write). We must use a
* conditional lock acquisition here to avoid deadlock. Even
* though the buffer was not pinned (and therefore surely not
* locked) when StrategyGetBuffer returned it, someone else could
* have pinned and exclusive-locked it by the time we get here. If
* we try to get the lock unconditionally, we'd block waiting for
* them; if they later block waiting for us, deadlock ensues.
* (This has been observed to happen when two backends are both
* trying to split btree index pages, and the second one just
* happens to be trying to split the page the first one got from
* StrategyGetBuffer.)
*/
if (LWLockConditionalAcquire(BufferDescriptorGetContentLock(buf),
LW_SHARED))
{
/*
* If using a nondefault strategy, and writing the buffer
* would require a WAL flush, let the strategy decide whether
* to go ahead and write/reuse the buffer or to choose another
* victim. We need lock to inspect the page LSN, so this
* can't be done inside StrategyGetBuffer.
*/
// 非默认策略,刷脏时需要获取 WAL lsn
if (strategy != NULL)
{
XLogRecPtr lsn;
/* Read the LSN while holding buffer header lock */
buf_state = LockBufHdr(buf); // 上buffer header 锁
lsn = BufferGetLSN(buf); // 读取LSN
UnlockBufHdr(buf, buf_state);
if (XLogNeedsFlush(lsn) &&
StrategyRejectBuffer(strategy, buf)) // 拒绝
{
/* Drop lock/pin and loop around for another buffer */
LWLockRelease(BufferDescriptorGetContentLock(buf));
UnpinBuffer(buf, true);
continue;
}
}
/* OK, do the I/O */
TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_START(forkNum, blockNum,
smgr->smgr_rnode.node.spcNode,
smgr->smgr_rnode.node.dbNode,
smgr->smgr_rnode.node.relNode);
FlushBuffer(buf, NULL); // 刷脏,释放资源
LWLockRelease(BufferDescriptorGetContentLock(buf));
ScheduleBufferTagForWriteback(&BackendWritebackContext, // 回写调度
&buf->tag);
TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum,
smgr->smgr_rnode.node.spcNode,
smgr->smgr_rnode.node.dbNode,
smgr->smgr_rnode.node.relNode);
}
else
{
/*
* Someone else has locked the buffer, so give it up and loop
* back to get another one.
*/
UnpinBuffer(buf, true);
continue;
}
}
2.3 如果完成IO操作或者buffer处于有效状态,需要判断原buffer原分区是否与先buffer所在分区
相同,主要目的时防止死锁 [分区的目的在于提高并发,避免冲突,相应分区查找hash条目]
/*
* To change the association of a valid buffer, we'll need to have
* exclusive lock on both the old and new mapping partitions.
*/
if (oldFlags & BM_TAG_VALID) // 获取的buffer含有有效标识,非脏
{
/*
* Need to compute the old tag's hashcode and partition lock ID.
* XXX is it worth storing the hashcode in BufferDesc so we need
* not recompute it here? Probably not.
*/
// 根据 Tag获取该buffer对应的分区锁
oldTag = buf->tag;
oldHash = BufTableHashCode(&oldTag);
oldPartitionLock = BufMappingPartitionLock(oldHash);
/*
* Must lock the lower-numbered partition first to avoid
* deadlocks.
*/
// 调整获取顺序避免死锁
if (oldPartitionLock < newPartitionLock)
{
LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
}
else if (oldPartitionLock > newPartitionLock)
{
LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
LWLockAcquire(oldPartitionLock, LW_EXCLUSIVE);
}
else
{
/* only one partition, only one lock */
LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
}
}
else
{
/* if it wasn't valid, we need only the new partition */
LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
/* remember we have no old-partition lock or tag */
oldPartitionLock = NULL;
/* keep the compiler quiet about uninitialized variables */
oldHash = 0;
}
/*
* Try to make a hashtable entry for the buffer under its new tag.
* This could fail because while we were writing someone else
* allocated another buffer for the same block we want to read in.
* Note that we have not yet removed the hashtable entry for the old
* tag.
*/
// 向哈希表插入新条目,当我们读取时其他人已分配了此页,即可以重用此块
buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
if (buf_id >= 0)
{
/*
* Got a collision. Someone has already done what we were about to
* do. We'll just handle this as if it were found in the buffer
* pool in the first place. First, give up the buffer we were
* planning to use.
*/
// 其他进程已执行造成冲 释放锁,返回原位置,
UnpinBuffer(buf, true);
/* Can give up that buffer's mapping partition lock now */
if (oldPartitionLock != NULL &&
oldPartitionLock != newPartitionLock)
LWLockRelease(oldPartitionLock);
/* remaining code should match code at top of routine */
buf = GetBufferDescriptor(buf_id); // 获取缓冲区描述符
valid = PinBuffer(buf, strategy); // pin 住
/* Can release the mapping lock as soon as we've pinned it */
LWLockRelease(newPartitionLock); // 释放 newPartitionLock
*foundPtr = true;
if (!valid)
{
/*
* We can only get here if (a) someone else is still reading
* in the page, or (b) a previous read attempt failed. We
* have to wait for any active read attempt to finish, and
* then set up our own read attempt if the page is still not
* BM_VALID. StartBufferIO does it all.
*/
if (StartBufferIO(buf, true))
{
/*
* If we get here, previous attempts to read the buffer
* must have failed ... but we shall bravely try again.
*/
*foundPtr = false;
}
}
return buf;
}
/*
* Need to lock the buffer header too in order to change its tag.
*/
// 插入成功,更新状态信息
buf_state = LockBufHdr(buf);
/*
* Somebody could have pinned or re-dirtied the buffer while we were
* doing the I/O and making the new hashtable entry. If so, we can't
* recycle this buffer; we must undo everything we've done and start
* over with a new victim buffer.
*/
// 在执行IO和生成新hashtable条目时其他人已pinned住或者重定向,不能循环使用
// 退出循环,undo重新获取新的牺牲页面
oldFlags = buf_state & BUF_FLAG_MASK; // 更新标识信息
if (BUF_STATE_GET_REFCOUNT(buf_state) == 1 && !(oldFlags & BM_DIRTY))
break;
UnlockBufHdr(buf, buf_state); // 释放锁
BufTableDelete(&newTag, newHash); // 删除条目
if (oldPartitionLock != NULL &&
oldPartitionLock != newPartitionLock)
LWLockRelease(oldPartitionLock);
LWLockRelease(newPartitionLock);
UnpinBuffer(buf, true); // unpin
}
2.4 上述步骤完成后,进行善后工作,释放资源
/*
* Okay, it's finally safe to rename the buffer.
*
* Clearing BM_VALID here is necessary, clearing the dirtybits is just
* paranoia. We also reset the usage_count since any recency of use of
* the old content is no longer relevant. (The usage_count starts out at
* 1 so that the buffer can survive one clock-sweep pass.)
*
* Make sure BM_PERMANENT is set for buffers that must be written at every
* checkpoint. Unlogged buffers only need to be written at shutdown
* checkpoints, except for their "init" forks, which need to be treated
* just like permanent relations.
*/
buf->tag = newTag;
buf_state &= ~(BM_VALID | BM_DIRTY | BM_JUST_DIRTIED |
BM_CHECKPOINT_NEEDED | BM_IO_ERROR | BM_PERMANENT |
BUF_USAGECOUNT_MASK);
if (relpersistence == RELPERSISTENCE_PERMANENT || forkNum == INIT_FORKNUM)
buf_state |= BM_TAG_VALID | BM_PERMANENT | BUF_USAGECOUNT_ONE;
else
buf_state |= BM_TAG_VALID | BUF_USAGECOUNT_ONE;
UnlockBufHdr(buf, buf_state);
if (oldPartitionLock != NULL)
{
BufTableDelete(&oldTag, oldHash);
if (oldPartitionLock != newPartitionLock)
LWLockRelease(oldPartitionLock);
}
LWLockRelease(newPartitionLock);
/*
* Buffer contents are currently invalid. Try to obtain the right to
* start I/O. If StartBufferIO returns false, then someone else managed
* to read it before we did, so there's nothing left for BufferAlloc() to
* do.
*/
// buffer内容无效,则从物理文件读取信息之buffer
if (StartBufferIO(buf, true))
*foundPtr = false;
else
*foundPtr = true;
return buf; // 返回buffer
}