postgres 源码解析6 Page 与 Tuple 组织方式

Serendipity_Shy

已于 2022-10-01 12:03:38 修改

阅读量449

点赞数 3

分类专栏： postgres 文章标签：数据库

于 2022-06-09 20:58:03 首次发布

本文链接：https://blog.csdn.net/qq_52668274/article/details/125198448

版权

postgres 专栏收录该内容

54 篇文章 28 订阅

订阅专栏

1 Page 结构解析

在postgres中，表默认为堆表（Heap）；每张表的数据存放在若干个物理文件中，每个物理文件由多个页（Page）组成，数据页默认大小为8KB，最大为32KB，其结构如下图：
在这里插入图片描述
内部结构：

/*
 * A postgres disk page is an abstraction layered on top of a postgres
 * disk block (which is simply a unit of i/o, see block.h).
 *
 * specifically, while a disk block can be unformatted, a postgres
 * disk page is always a slotted page of the form:
 *
 * +----------------+---------------------------------+
 * | PageHeaderData | linp1 linp2 linp3 ...           |
 * +-----------+----+---------------------------------+
 * | ... linpN |                                      |
 * +-----------+--------------------------------------+
 * |           ^ pd_lower                             |
 * |                                                  |
 * |             v pd_upper                           |
 * +-------------+------------------------------------+
 * |             | tupleN ...                         |
 * +-------------+------------------+-----------------+
 * |       ... tuple3 tuple2 tuple1 | "special space" |
 * +--------------------------------+-----------------+
 *                                  ^ pd_special
 */

页由三部分组成：**页头 + 项指针 + 元组**

/*
 * PageHeaderData - fixed header placed at the start of a slotted page.
 *
 * The header ends with pd_linp, a flexible array member holding the page's
 * line pointers (ItemIdData entries).  pd_lower and pd_upper bracket the
 * free space between the line pointer array and the tuple data.
 */
typedef struct PageHeaderData
{
	/* XXX LSN is member of *any* block, not only page-organized ones */
	PageXLogRecPtr pd_lsn;		/* LSN: next byte after last byte of xlog
								 * record for last change to this page */
	uint16		pd_checksum;	/* checksum */
	uint16		pd_flags;		/* flag bits, see below */
	LocationIndex pd_lower;		/* offset to start of free space */
	LocationIndex pd_upper;		/* offset to end of free space */
	LocationIndex pd_special;	/* offset to start of special space */        // used by index pages
	uint16		pd_pagesize_version;	/* NOTE(review): name suggests page size
										 * combined with layout version --
										 * confirm against upstream bufpage.h */
	TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */
	ItemIdData	pd_linp[FLEXIBLE_ARRAY_MEMBER]; /* line pointer array */
} PageHeaderData;

/* Pointer alias for a page header. */
typedef PageHeaderData *PageHeader;

pd_lsn———— 本页面最近一次变更所写入的XLOG记录对应的LSN。其类型是PageXLogRecPtr，该结构由xlogid和xrecoff两个属性组成，前者表示WAL日志的逻辑id，后者表示在WAL日志中的偏移量，两者都是32位无符号数。因此pd_lsn是一个8B的无符号整数。
pd_checksum———— 本页面的校验和值（9.3版本以后才有），2个字节的无符号整型。
pd_flags———— 标志位，见下面的定义，2个字节的无符号整型。
pd_lower、pd_upper———— pd_lower指向行指针的末尾，表示空闲空间的起始位置。pd_upper指向最新堆元组的起始位置，表示空闲空间的结束位置。都是2个字节的无符号整型。
pd_special ———— 在索引页中会用到该字段，在堆表页中它指向页尾。2个字节无符号整型。
pd_pagesize_version ———— 同时记录页大小和页面布局版本号，2个字节。
pd_prune_xid ———— 字面意思是可剪枝的最老的事务ID，4个字节。
pd_linp[FLEXIBLE_ARRAY_MEMBER] ———— ItemIdData类型的数组。

/*
 * Bits that may appear in pd_flags.  Any bit not named here starts out as
 * zero and is reserved for possible future use.
 *
 * PD_HAS_FREE_LINES
 *		Some line pointer located before pd_lower is LP_UNUSED.  Treat this
 *		as a hint only: updates to the bit are not WAL-logged.
 *
 * PD_PAGE_FULL
 *		An UPDATE could not find enough free space on the page for its new
 *		tuple version, which suggests the page should be pruned.  Also only
 *		a hint.
 *
 * PD_ALL_VISIBLE
 *		All tuples on the page are visible to everyone.
 */
#define PD_HAS_FREE_LINES	(1 << 0)
#define PD_PAGE_FULL		(1 << 1)
#define PD_ALL_VISIBLE		(1 << 2)

/* Mask covering every currently defined pd_flags bit. */
#define PD_VALID_FLAG_BITS \
	(PD_HAS_FREE_LINES | PD_PAGE_FULL | PD_ALL_VISIBLE)

项指针：

/*
 * ItemIdData - one line pointer in a page's pd_linp array.
 *
 * Three bit-fields packed into a single unsigned word: 15 + 2 + 15 = 32
 * bits.  lp_off and lp_len locate the tuple inside the page; lp_flags holds
 * one of the LP_* states.
 */
typedef struct ItemIdData
{
	unsigned	lp_off:15,		/* offset to tuple (from start of page) */
				lp_flags:2,		/* state of line pointer, see below */
				lp_len:15;		/* byte length of tuple */
} ItemIdData;

/* Pointer alias for a line pointer. */
typedef ItemIdData *ItemId;
/*
 * lp_flags has these possible states.  An UNUSED line pointer is available
 * for immediate re-use, the other states are not.
 *
 * The four values 0..3 are exactly what the 2-bit lp_flags field of
 * ItemIdData can represent.
 */
#define LP_UNUSED		0		/* unused (should always have lp_len=0) */
#define LP_NORMAL		1		/* used (should always have lp_len>0) */
#define LP_REDIRECT		2		/* HOT redirect (should have lp_len=0) */
#define LP_DEAD			3		/* dead, may or may not have storage */

ItemIdData类型由lp_off、lp_flags、lp_len三个属性组成。每一个ItemIdData结构用来指向文件块中的一个元组，其中lp_off是元组在文件块（Page）中的偏移量，lp_len则说明了该元组的长度，lp_flags则表示该项指针的状态（分为未使用、正常使用、HOT重定向和死亡四种状态）。每个ItemIdData元素大小为4个字节。

2 元组结构解析

/*
 * HeapTupleHeaderData - header that precedes the data of a heap tuple.
 *
 * t_choice overlays two alternative layouts in the same storage:
 * HeapTupleFields and DatumTupleFields.  The FIELDNO_* macros interleaved
 * with the members give each member's ordinal position in the struct.  The
 * fixed-size part is 23 bytes (see the marker comment); the NULL bitmap
 * t_bits and then the tuple's data follow it.
 */
struct HeapTupleHeaderData
{
	union
	{
		HeapTupleFields t_heap;
		DatumTupleFields t_datum;
	}			t_choice;

	ItemPointerData t_ctid;		/* current TID of this or newer tuple (or a
								 * speculative insertion token) */

	/* Fields below here must match MinimalTupleData! */
#define FIELDNO_HEAPTUPLEHEADERDATA_INFOMASK2 2
	uint16		t_infomask2;	/* number of attributes + various flags */

#define FIELDNO_HEAPTUPLEHEADERDATA_INFOMASK 3
	uint16		t_infomask;		/* various flag bits, see below */

#define FIELDNO_HEAPTUPLEHEADERDATA_HOFF 4
	uint8		t_hoff;			/* sizeof header incl. bitmap, padding */

	/* ^ - 23 bytes - ^ */

#define FIELDNO_HEAPTUPLEHEADERDATA_BITS 5
	bits8		t_bits[FLEXIBLE_ARRAY_MEMBER];	/* bitmap of NULLs */

	/* MORE DATA FOLLOWS AT END OF STRUCT */
	// the tuple's actual data
};

/* Struct alias and pointer alias for a heap tuple header. */
typedef struct HeapTupleHeaderData HeapTupleHeaderData;
typedef HeapTupleHeaderData *HeapTupleHeader;

t_choice：是由以下两个成员构成的联合（union）类型：
t_heap：用于记录对元组执行插入/删除操作的事务ID和命令ID, 这些字段信息用于可见性判断。
t_datum：当一个新的元组在内存中形成的时候，并不关心事务可见性，因此t_choice中只需用DatumTupleFields来记录元组长度等信息，这是临时信息。在把该元组插入到表文件时，需要在元组头信息中记录插入该元组的事务和命令ID，此时会把t_choice转换为HeapTupleFields结构并填充相应数据后再进行元组的插入。
t_ctid：用于记录当前元组或新元组的物理位置，若元组被更新（删除旧版本元组，然后插入新版本元组），则记录的是新版本元组的物理位置。
t_infomask2：使用其低11位表示当前元组的属性个数，其余位用作HOT技术及元组可见性相关的标志位。
t_infomask：标识元组的当前状态，比如：是否具有OID、是否有空属性等，t_infomask的每一位对应不同的状态，共16种状态。
t_hoff：表示元组头的大小。
t_bits：用于标识该元组哪些字段为空。

/*
 * HeapTupleFields - transaction-related fields of a heap tuple header.
 *
 * Records the inserting and deleting/locking transaction IDs.  The third
 * slot, t_field3, is a union: it holds either a command ID (t_cid) or the
 * transaction ID of an old-style VACUUM FULL (t_xvac).
 */
typedef struct HeapTupleFields
{
	TransactionId t_xmin;		/* inserting xact ID */
	TransactionId t_xmax;		/* deleting or locking xact ID */

	union
	{
		CommandId	t_cid;		/* inserting or deleting command ID, or both */
		TransactionId t_xvac;	/* old-style VACUUM FULL xact ID */
	}			t_field3;
} HeapTupleFields;

/*
 * DatumTupleFields - alternative header fields for a tuple handled as a
 * composite datum: a varlena length word, a type modifier, and the type OID.
 */
typedef struct DatumTupleFields
{
	int32		datum_len_;		/* varlena header (do not touch directly!) */

	int32		datum_typmod;	/* -1, or identifier of a record type */

	Oid			datum_typeid;	/* composite type OID, or RECORDOID */

	/*
	 * datum_typeid cannot be a domain over composite, only plain composite,
	 * even if the datum is meant as a value of a domain-over-composite type.
	 * This is in line with the general principle that CoerceToDomain does not
	 * change the physical representation of the base type value.
	 *
	 * Note: field ordering is chosen with thought that Oid might someday
	 * widen to 64 bits.
	 */
} DatumTupleFields;

/*
 * ItemPointerData - physical location of a tuple: a block identifier plus
 * an offset number within that block.
 *
 * When the compiler provides both attribute macros, the struct is declared
 * packed and aligned to 2 bytes; the attributes sit between the closing
 * brace and the typedef name.
 */
typedef struct ItemPointerData
{
	BlockIdData ip_blkid;            // block number
	OffsetNumber ip_posid;			 // index of this tuple's entry in the page's ItemIdData array
}

/* If compiler understands packed and aligned pragmas, use those */
#if defined(pg_attribute_packed) && defined(pg_attribute_aligned)
			pg_attribute_packed()
			pg_attribute_aligned(2)
#endif
ItemPointerData;

/* Pointer alias for an item pointer. */
typedef ItemPointerData *ItemPointer;

注意：在插入元组时，t_ctid并不是在构建元组时就已设置好的，而是在元组写入文件块对应的共享内存页面（缓冲页）之后才设置的，其代码如下：

/*
 * RelationPutHeapTuple - store a heap tuple on the page held in a buffer.
 *
 * relation: not referenced by this function body.
 * buffer:   buffer whose page receives the tuple.
 * tuple:    tuple to store; its t_self is updated to the stored location.
 * token:    true when the tuple's CTID field carries a speculative-insertion
 *           token, in which case the stored CTID is left untouched.
 *
 * Reports a PANIC-level error through elog() if the page rejects the tuple.
 */
void
RelationPutHeapTuple(Relation relation,
					 Buffer buffer,
					 HeapTuple tuple,
					 bool token)
{
	Page		page;
	OffsetNumber slot;
	ItemId		lp;
	HeapTupleHeader stored;

	/* A token may only accompany a tuple that is marked speculative. */
	Assert(!token || HeapTupleHeaderIsSpeculative(tuple->t_data));

	page = BufferGetPage(buffer);

	/*
	 * Copy the tuple onto the page.  The t_ctid inside the tuple has not
	 * been given a valid value at this point.
	 */
	slot = PageAddItem(page, (Item) tuple->t_data, tuple->t_len,
					   InvalidOffsetNumber, false, true);
	if (slot == InvalidOffsetNumber)
		elog(PANIC, "failed to add tuple to page");

	/* Record in the caller's tuple where it actually landed. */
	ItemPointerSet(&(tuple->t_self), BufferGetBlockNumber(buffer), slot);

	/*
	 * For a speculative insertion the CTID field holds the token, so the
	 * stored copy must not be overwritten.
	 */
	if (token)
		return;

	/* Otherwise make the on-page copy's t_ctid point at its own location. */
	lp = PageGetItemId(page, slot);
	stored = (HeapTupleHeader) PageGetItem(page, lp);
	stored->t_ctid = tuple->t_self;
}