1 Page 结构解析
在 PostgreSQL 中,表默认为堆表(Heap);每张表的数据存放在若干个物理文件中,每个物理文件由多个页(Page)组成,页的默认大小为 8KB,最大可设置为 32KB(在编译时指定),其结构如下图:
内部结构:
/*
 * A postgres disk page is an abstraction layered on top of a postgres
 * disk block (which is simply a unit of i/o, see block.h).
 *
 * specifically, while a disk block can be unformatted, a postgres
 * disk page is always a slotted page of the form:
 *
 * +----------------+---------------------------------+
 * | PageHeaderData | linp1 linp2 linp3 ...           |
 * +-----------+----+---------------------------------+
 * | ... linpN |                                      |
 * +-----------+--------------------------------------+
 * |           ^ pd_lower                             |
 * |                                                  |
 * |             v pd_upper                           |
 * +-------------+------------------------------------+
 * |             | tupleN ...                         |
 * +-------------+------------------+-----------------+
 * |       ... tuple3 tuple2 tuple1 | "special space" |
 * +--------------------------------+-----------------+
 *                                  ^ pd_special
 */
页由三部分所组成: == 页头 + 项指针 + 元组 ==
/*
 * Header found at the start of every page. The fixed fields are followed
 * by the line pointer array (pd_linp). pd_lower and pd_upper are offsets
 * that bracket the free space lying between the line pointers and the
 * tuple data; pd_special is the offset of the trailing "special space".
 */
typedef struct PageHeaderData
{
/* XXX LSN is member of *any* block, not only page-organized ones */
PageXLogRecPtr pd_lsn; /* LSN: next byte after last byte of xlog
* record for last change to this page */
uint16 pd_checksum; /* checksum */
uint16 pd_flags; /* flag bits, see below */
LocationIndex pd_lower; /* offset to start of free space */
LocationIndex pd_upper; /* offset to end of free space */
LocationIndex pd_special; /* offset to start of special space */ // used by index pages
uint16 pd_pagesize_version; /* page size and page layout version number,
* packed into one field (per upstream
* bufpage.h) */
TransactionId pd_prune_xid; /* oldest prunable XID, or zero if none */
ItemIdData pd_linp[FLEXIBLE_ARRAY_MEMBER]; /* line pointer array */ // one ItemIdData entry per item on the page
} PageHeaderData;
/* Pointer alias for a page header. */
typedef PageHeaderData *PageHeader;
pd_lsn———— 本页面最近一次变更所写入的XLOG记录对应的LSN。其类型是PageXLogRecPtr,该结构由xlogid和xrecoff两个属性组成,前者表示wal日志的逻辑id,后者表示在wal日志中的偏移量,两者都是32位无符号数。因此pd_lsn是一个8字节的无符号整数。
pd_checksum———— 本页面的校验和值(9.3版本以后才有),2个字节的无符号整型。
pd_flags———— 标志位,见下面的定义,2个字节的无符号整型。
pd_lower、pd_upper———— pd_lower指向行指针的末尾,表示空闲空间的起始位置。pd_upper指向最新堆元组的起始位置,表示空闲空间的结束位置。都是2个字节的无符号整型。
pd_special ———— 在索引页中会用到该字段,在堆表页中它指向页尾。2个字节无符号整型。
pd_pagesize_version ———— 将页大小与页布局版本号合并存放在同一个字段中(页大小总是256的整数倍,因此低8位可以用来存放版本号),可用于检查页是否合法,2个字节。
pd_prune_xid ———— 字面意思是可剪枝的最老的事务ID,4个字节。
pd_linp[FLEXIBLE_ARRAY_MEMBER] ———— ItemIdData类型的数组。
/*
* pd_flags contains the following flag bits. Undefined bits are initialized
* to zero and may be used in the future.
*
* PD_HAS_FREE_LINES is set if there are any LP_UNUSED line pointers before
* pd_lower. This should be considered a hint rather than the truth, since
* changes to it are not WAL-logged.
*
* PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the
* page for its new tuple version; this suggests that a prune is needed.
* Again, this is just a hint.
*
* PD_ALL_VISIBLE is set when all tuples on the page are visible to
* everyone.
*
* PD_VALID_FLAG_BITS is the OR of the three flags above (0x0001 | 0x0002 |
* 0x0004); any other bit in pd_flags is currently undefined.
*/
#define PD_HAS_FREE_LINES 0x0001 /* are there any unused line pointers? */
#define PD_PAGE_FULL 0x0002 /* not enough free space for new tuple? */
#define PD_ALL_VISIBLE 0x0004 /* all tuples on page are visible to
* everyone */
#define PD_VALID_FLAG_BITS 0x0007 /* OR of all valid pd_flags bits */
项指针:
/*
 * A line pointer (item identifier): the element type of a page header's
 * pd_linp array. Three bit-fields totalling 15 + 2 + 15 = 32 bits record
 * where a tuple starts on the page, the state of the pointer, and the
 * tuple's length in bytes.
 */
typedef struct ItemIdData
{
unsigned lp_off:15, /* offset to tuple (from start of page) */
lp_flags:2, /* state of line pointer, see below */
lp_len:15; /* byte length of tuple */
} ItemIdData;
/* Pointer alias for a line pointer. */
typedef ItemIdData *ItemId;
/*
* lp_flags has these possible states. An UNUSED line pointer is available
* for immediate re-use, the other states are not.
*
* The four values fill the 2-bit lp_flags field exactly.
*/
#define LP_UNUSED 0 /* unused (should always have lp_len=0) */
#define LP_NORMAL 1 /* used (should always have lp_len>0) */
#define LP_REDIRECT 2 /* HOT redirect (should have lp_len=0) */
#define LP_DEAD 3 /* dead, may or may not have storage */
ItemIdData类型由lp_off、lp_flags、lp_len三个属性组成。每一个ItemIdData结构用来指向文件块中的一个元组,其中lp_off是元组在文件块(Page)中的偏移量,lp_len则说明了该元组的长度,lp_flags则表示元组的状态(分为未使用、正常使用、HOT重定向和死亡四种状态)。三个位域共占 15+2+15=32 位,因此每个ItemIdData元素大小为4个字节。
2 元组结构解析
/*
 * Header placed in front of a heap tuple's data. t_choice overlays two
 * alternative field sets (HeapTupleFields and DatumTupleFields). The
 * fixed fields are followed by the NULL bitmap (t_bits) and then the
 * tuple's actual data.
 */
struct HeapTupleHeaderData
{
union
{
HeapTupleFields t_heap; /* inserting/deleting xact IDs and command ID */
DatumTupleFields t_datum; /* varlena length word, typmod and type OID */
} t_choice;
ItemPointerData t_ctid; /* current TID of this or newer tuple (or a
* speculative insertion token) */
/* Fields below here must match MinimalTupleData! */
#define FIELDNO_HEAPTUPLEHEADERDATA_INFOMASK2 2
uint16 t_infomask2; /* number of attributes + various flags */
#define FIELDNO_HEAPTUPLEHEADERDATA_INFOMASK 3
uint16 t_infomask; /* various flag bits, see below */
#define FIELDNO_HEAPTUPLEHEADERDATA_HOFF 4
uint8 t_hoff; /* sizeof header incl. bitmap, padding */
/* ^ - 23 bytes - ^ */
#define FIELDNO_HEAPTUPLEHEADERDATA_BITS 5
bits8 t_bits[FLEXIBLE_ARRAY_MEMBER]; /* bitmap of NULLs */
/* MORE DATA FOLLOWS AT END OF STRUCT */
// the tuple's actual data follows the header
};
/* Convenience aliases for the struct and a pointer to it. */
typedef struct HeapTupleHeaderData HeapTupleHeaderData;
typedef HeapTupleHeaderData *HeapTupleHeader;
t_choice:是包含两个成员的联合类型:
t_heap:用于记录对元组执行插入/删除操作的事务ID和命令ID, 这些字段信息用于可见性判断。
t_datum:当一个新的元组在内存中形成的时候,并不关心事务可见性,因此t_choice中只需用DatumTupleFields来记录元组长度等信息,这是临时信息。在把该元组插入到表文件时,需要在元组头信息中记录插入该元组的事务和命令ID,此时会把t_choice转换为HeapTupleFields结构并填充相应数据后再进行元组的插入。
t_ctid:用于记录当前元组或新元组的物理位置,若元组被更新(删除旧版本元组,然后插入新版本元组),则记录的是新版本元组的物理位置。
t_infomask2:使用其低11位表示当前元组的属性个数,其余位用作HOT技术及元组可见性相关的标志位。
t_infomask:标识元组的当前状态,比如:是否具有OID、是否有空属性等,t_infomask的每一位对应不同的状态,共16种状态。
t_hoff:表示元组头的大小。
t_bits:用于标识该元组哪些字段为空。
/*
 * Transaction bookkeeping carried in a heap tuple header: the inserting
 * transaction, the deleting or locking transaction, and a third slot that
 * holds either a command ID or an old-style VACUUM FULL transaction ID.
 */
typedef struct HeapTupleFields
{
TransactionId t_xmin; /* inserting xact ID */
TransactionId t_xmax; /* deleting or locking xact ID */
union
{
CommandId t_cid; /* inserting or deleting command ID, or both */
TransactionId t_xvac; /* old-style VACUUM FULL xact ID */
} t_field3; /* the two members share one slot */
} HeapTupleFields;
/*
 * Alternative to HeapTupleFields inside HeapTupleHeaderData.t_choice:
 * a varlena length word plus the typmod and type OID describing the
 * tuple as a composite-type datum.
 */
typedef struct DatumTupleFields
{
int32 datum_len_; /* varlena header (do not touch directly!) */
int32 datum_typmod; /* -1, or identifier of a record type */
Oid datum_typeid; /* composite type OID, or RECORDOID */
/*
* datum_typeid cannot be a domain over composite, only plain composite,
* even if the datum is meant as a value of a domain-over-composite type.
* This is in line with the general principle that CoerceToDomain does not
* change the physical representation of the base type value.
*
* Note: field ordering is chosen with thought that Oid might someday
* widen to 64 bits.
*/
} DatumTupleFields;
/*
 * Identifies a tuple by the block it lives in plus its position in that
 * block's line pointer array.
 */
typedef struct ItemPointerData
{
BlockIdData ip_blkid; // block number
OffsetNumber ip_posid; // index of this tuple's entry in the page's ItemIdData array
}
/* If compiler understands packed and aligned pragmas, use those */
#if defined(pg_attribute_packed) && defined(pg_attribute_aligned)
pg_attribute_packed()
pg_attribute_aligned(2)
#endif
ItemPointerData;
/* Pointer alias for an item pointer. */
typedef ItemPointerData *ItemPointer;
注意:在插入元组时,t_ctid并不是在构建元组时就存在,而是在元组写入文件块对应的共享内存文件页后才设置的,其代码如下:
/*
 * RelationPutHeapTuple - store a tuple on the page held by "buffer".
 *
 * The tuple body (tuple->t_data, tuple->t_len bytes) is added to the page
 * with PageAddItem; if no offset can be obtained the failure is reported
 * through elog(PANIC). tuple->t_self is then set to the block number and
 * offset at which the tuple was stored.
 *
 * When "token" is false, that same position is also written into the
 * t_ctid of the copy that now lives on the page. When "token" is true
 * (speculative insertion) the on-page t_ctid is left untouched, because
 * it holds the speculative token instead.
 */
void
RelationPutHeapTuple(Relation relation,
					 Buffer buffer,
					 HeapTuple tuple,
					 bool token)
{
	Page		page;
	OffsetNumber slot;
	HeapTupleHeader onPage;

	/* A token may only accompany a tuple already marked speculative. */
	Assert(!token || HeapTupleHeaderIsSpeculative(tuple->t_data));

	page = BufferGetPage(buffer);

	/*
	 * Copy the tuple onto the page. Its t_ctid has not been given its final
	 * value at this point.
	 */
	slot = PageAddItem(page, (Item) tuple->t_data, tuple->t_len,
					   InvalidOffsetNumber, false, true);
	if (InvalidOffsetNumber == slot)
		elog(PANIC, "failed to add tuple to page");

	/* Record in t_self where the tuple actually ended up. */
	ItemPointerSet(&(tuple->t_self), BufferGetBlockNumber(buffer), slot);

	/* Speculative insertions keep their token in t_ctid; nothing more to do. */
	if (token)
		return;

	/* Point the stored copy's t_ctid at its own position. */
	onPage = (HeapTupleHeader) PageGetItem(page, PageGetItemId(page, slot));
	onPage->t_ctid = tuple->t_self;
}
参考:https://blog.csdn.net/obvious__/article/details/109328425