本文从源码角度分析 PostgreSQL 元组插入流程。知识回顾:postgres Page 与 Tuple 组织方式
1 示意图
根据用户输入的SQL语句,先进行词法、语法分析生成解析树,再对解析树进行语义分析和重写,生成查询树,最后生成执行计划交由执行器执行。用户插入的数据紧跟在 HeapTupleHeaderData 元组头之后存放(由 HeapTupleData 的 t_data 指针指向),如图所示
2 关键数据结构
2.1 HeapTupleData
/*
 * HeapTupleData - descriptor for one heap tuple.
 *
 * Holds the tuple's length, its own item pointer, the OID of the table it
 * came from, and a pointer to the tuple header (the tuple data follows that
 * header).
 */
typedef struct HeapTupleData
{
uint32 t_len; /* length of *t_data */
ItemPointerData t_self; /* SelfItemPointer */
Oid t_tableOid; /* table the tuple came from */
/* zero-based position of t_data among the fields of this struct */
#define FIELDNO_HEAPTUPLEDATA_DATA 3
HeapTupleHeader t_data; /* -> tuple header and data */
} HeapTupleData;
/* Pointer alias used wherever a heap tuple descriptor is passed around. */
typedef HeapTupleData *HeapTuple;
2.2 HeapTupleHeaderData
/*
 * HeapTupleHeaderData - header placed in front of a tuple's data.
 *
 * The struct ends with a flexible NULL bitmap and, as the trailing comment
 * notes, more data follows the end of the struct.  The FIELDNO_ macros give
 * the zero-based position of the field declared right after them.
 */
struct HeapTupleHeaderData
{
/*
 * Two alternative leading layouts sharing the same storage; the member
 * types HeapTupleFields and DatumTupleFields are declared elsewhere.
 */
union
{
HeapTupleFields t_heap;
DatumTupleFields t_datum;
} t_choice;
ItemPointerData t_ctid; /* current TID of this or newer tuple (or a
* speculative insertion token) */
/* Fields below here must match MinimalTupleData! */
#define FIELDNO_HEAPTUPLEHEADERDATA_INFOMASK2 2
uint16 t_infomask2; /* number of attributes + various flags */
#define FIELDNO_HEAPTUPLEHEADERDATA_INFOMASK 3
uint16 t_infomask; /* various flag bits, see below */
#define FIELDNO_HEAPTUPLEHEADERDATA_HOFF 4
uint8 t_hoff; /* sizeof header incl. bitmap, padding */
/* ^ - 23 bytes - ^ */
#define FIELDNO_HEAPTUPLEHEADERDATA_BITS 5
bits8 t_bits[FLEXIBLE_ARRAY_MEMBER]; /* bitmap of NULLs */
/* MORE DATA FOLLOWS AT END OF STRUCT */
};
2.3 源码解读
元组插入的入口函数为 heap_insert, 堆栈为:
#0 heap_insert (relation=0x7f910e908118, tup=0x11289c8, cid=0, options=0, bistate=0x0) at heapam.c:2063
#1 0x00000000004ec368 in heapam_tuple_insert (relation=0x7f910e908118, slot=0x11288b8, cid=0, options=0, bistate=0x0) at heapam_handler.c:252
#2 0x0000000000725514 in table_tuple_insert (rel=0x7f910e908118, slot=0x11288b8, cid=0, options=0, bistate=0x0) at ../../../src/include/access/tableam.h:1374
#3 0x0000000000726f38 in ExecInsert (mtstate=0x1127708, resultRelInfo=0x1127918, slot=0x11288b8, planSlot=0x1127eb8, estate=0x11274a8, canSetTag=true) at nodeModifyTable.c:934
#4 0x0000000000729190 in ExecModifyTable (pstate=0x1127708) at nodeModifyTable.c:2561
#5 0x00000000006f29c9 in ExecProcNodeFirst (node=0x1127708) at execProcnode.c:463
#6 0x00000000006e833d in ExecProcNode (node=0x1127708) at ../../../src/include/executor/executor.h:257
#7 0x00000000006ea711 in ExecutePlan (estate=0x11274a8, planstate=0x1127708, use_parallel_mode=false, operation=CMD_INSERT, sendTuples=false, numberTuples=0, direction=ForwardScanDirection, dest=0x1126dd0, execute_once=true) at execMain.c:1551
#8 0x00000000006e8874 in standard_ExecutorRun (queryDesc=0x10647c8, direction=ForwardScanDirection, count=0, execute_once=true) at execMain.c:361
#9 0x00000000006e870b in ExecutorRun (queryDesc=0x10647c8, direction=ForwardScanDirection, count=0, execute_once=true) at execMain.c:305
#10 0x000000000090128e in ProcessQuery (plan=0x1126cf0, sourceText=0x103f088 "insert into sweet values(5,28);", params=0x0, queryEnv=0x0, dest=0x1126dd0, qc=0x7ffe7c4e65c0) at pquery.c:160
#11 0x00000000009029bb in PortalRunMulti (portal=0x10e5ec8, isTopLevel=true, setHoldSnapshot=false, dest=0x1126dd0, altdest=0x1126dd0, qc=0x7ffe7c4e65c0) at pquery.c:1266
#12 0x0000000000902027 in PortalRun (portal=0x10e5ec8, count=9223372036854775807, isTopLevel=true, run_once=true, dest=0x1126dd0, altdest=0x1126dd0, qc=0x7ffe7c4e65c0) at pquery.c:786
#13 0x00000000008fc14f in exec_simple_query (query_string=0x103f088 "insert into sweet values(5,28);") at postgres.c:1214
#14 0x00000000009003f8 in PostgresMain (argc=1, argv=0x7ffe7c4e6850, dbname=0x106ac78 "postgres", username=0x106ac58 "postgres") at postgres.c:4486
#15 0x0000000000851ca3 in BackendRun (port=0x1062680) at postmaster.c:4506
#16 0x0000000000851629 in BackendStartup (port=0x1062680) at postmaster.c:4228
#17 0x000000000084dd48 in ServerLoop () at postmaster.c:1745
#18 0x000000000084d629 in PostmasterMain (argc=1, argv=0x1039bd0) at postmaster.c:1417
#19 0x000000000075ec02 in main (argc=1, argv=0x1039bd0) at main.c:209
heap_insert接口介绍
/*
* heap_insert - insert tuple into a heap
*
* The new tuple is stamped with current transaction ID and the specified
* command ID.
*
// 将含有当前事务号和命令id的元组插入堆表中
* See table_tuple_insert for comments about most of the input flags, except
* that this routine directly takes a tuple rather than a slot.
*
* There's corresponding HEAP_INSERT_ options to all the TABLE_INSERT_
* options, and there additionally is HEAP_INSERT_SPECULATIVE which is used to
* implement table_tuple_insert_speculative().
*
* On return the header fields of *tup are updated to match the stored tuple;
* in particular tup->t_self receives the actual TID where the tuple was
* stored. But note that any toasting of fields within the tuple data is NOT
* reflected into *tup.
*/
代码流程如下:
1) 准备阶段:填充HeapTuple结构体相关字段,主要为各种标识位infomask, xid等
/*
 * heap_insert, preparation stage.
 *
 * relation - heap relation receiving the tuple
 * tup      - caller's tuple; heaptup (below) is what is actually stored
 * cid      - command ID handed to heap_prepare_insert
 * options  - insert option flags, forwarded to heap_prepare_insert
 * bistate  - bulk-insert state, not used in this stage
 *
 * This stage obtains the current transaction ID, sanity-checks the tuple's
 * attribute count against the relation, and builds the tuple to store.
 */
void
heap_insert(Relation relation, HeapTuple tup, CommandId cid,
int options, BulkInsertState bistate)
{
TransactionId xid = GetCurrentTransactionId(); // fetch the current transaction ID
HeapTuple heaptup;
Buffer buffer;
Buffer vmbuffer = InvalidBuffer;
bool all_visible_cleared = false;
/* Cheap, simplistic check that the tuple matches the rel's rowtype. */
Assert(HeapTupleHeaderGetNatts(tup->t_data) <=
RelationGetNumberOfAttributes(relation));
/*
* Fill in tuple header fields and toast the tuple if necessary.
*
* Note: below this point, heaptup is the data we actually intend to store
* into the relation; tup is the caller's original untoasted data.
*/
heaptup = heap_prepare_insert(relation, tup, xid, cid, options); // preparation step: fill in the tuple header information
2)从共享缓冲池中找到可用的缓冲块,并在相应位置进行插入,完成后置 dirty 标识位;
共享缓冲区知识回顾:
postgres源码解析 缓冲池管理器–1
postgres源码解析 缓冲池管理器–2
postgres 源码解析 缓冲池管理器-3
/*
* Find buffer to insert this tuple into. If the page is all visible,
* this will also pin the requisite visibility map page.
*/
buffer = RelationGetBufferForTuple(relation, heaptup->t_len,
InvalidBuffer, options, bistate,
&vmbuffer, NULL);
/*
* We're about to do the actual insert -- but check for conflict first, to
* avoid possibly having to roll back work we've just done.
* (Annotation: check for conflicts before inserting so that, as far as
* possible, no completed work has to be rolled back.)
* This is safe without a recheck as long as there is no possibility of
* another process scanning the page between this check and the insert
* being visible to the scan (i.e., an exclusive buffer content lock is
* continuously held from this point until the tuple insert is visible).
* (Annotation: no recheck is needed because no other process can scan the
* page until the inserted tuple is visible; the exclusive buffer content
* lock stays held the whole time.)
* For a heap insert, we only need to check for table-level SSI locks. Our
* new tuple can't possibly conflict with existing tuple locks, and heap
* page locks are only consolidated versions of tuple locks; they do not
* lock "gaps" as index page locks do. So we don't need to specify a
* buffer when making the call, which makes for a faster check.
*/
// Only table-level SSI locks need checking: the new tuple cannot conflict
// with existing tuple locks, and heap page locks are merely consolidated
// tuple locks that never lock gaps the way index page locks do. Hence no
// buffer is passed, which makes the check faster.
CheckForSerializableConflictIn(relation, NULL, InvalidBlockNumber);
/* NO EREPORT(ERROR) from here till changes are logged */
// enter the critical section
START_CRIT_SECTION();
RelationPutHeapTuple(relation, buffer, heaptup, // the step that actually places the tuple into the buffer page
(options & HEAP_INSERT_SPECULATIVE) != 0);
if (PageIsAllVisible(BufferGetPage(buffer)))
{
all_visible_cleared = true; // the page now holds a newly inserted tuple, so it is no longer all-visible
PageClearAllVisible(BufferGetPage(buffer)); // the VM page is modified next, which is why it was pinned ahead of time
visibilitymap_clear(relation,
ItemPointerGetBlockNumber(&(heaptup->t_self)),
vmbuffer, VISIBILITYMAP_VALID_BITS);
}
/*
* XXX Should we set PageSetPrunable on this page ?
*
* The inserting transaction may eventually abort thus making this tuple
* DEAD and hence available for pruning. Though we don't want to optimize
* for aborts, if no other tuple in this page is UPDATEd/DELETEd, the
* aborted tuple will never be pruned until next vacuum is triggered.
* (Annotation: the inserting transaction may abort, leaving the tuple DEAD
* and prunable. If no other tuple on the page is updated or deleted, that
* tuple is not pruned until the next vacuum runs.)
* If you do add PageSetPrunable here, add it in heap_xlog_insert too.
*/
MarkBufferDirty(buffer); // mark the buffer as dirty
3)为本次插入操作构造 WAL 记录,并通过 XLogInsert 写入 WAL 缓冲区(此处并不直接刷盘),再把返回的 LSN 记录到数据页上,以便脏页尚未持久化时可以通过回放 WAL 恢复。
/* XLOG stuff */
/* Build and insert the WAL record only for relations that need WAL. */
if (RelationNeedsWAL(relation))
{
xl_heap_insert xlrec;
xl_heap_header xlhdr;
XLogRecPtr recptr;
Page page = BufferGetPage(buffer);
uint8 info = XLOG_HEAP_INSERT;
int bufflags = 0;
/*
* If this is a catalog, we need to transmit combo CIDs to properly
* decode, so log that as well.
*/
if (RelationIsAccessibleInLogicalDecoding(relation))
log_heap_new_cid(relation, heaptup);
/*
* If this is the single and first tuple on page, we can reinit the
* page instead of restoring the whole thing. Set flag, and hide
* buffer references from XLogInsert.
*/
// When the new tuple is the only tuple on the page, flag the record so the
// page can be re-initialized instead of restored as a whole.
if (ItemPointerGetOffsetNumber(&(heaptup->t_self)) == FirstOffsetNumber &&
PageGetMaxOffsetNumber(page) == FirstOffsetNumber)
{
info |= XLOG_HEAP_INIT_PAGE;
bufflags |= REGBUF_WILL_INIT;
}
xlrec.offnum = ItemPointerGetOffsetNumber(&heaptup->t_self); // offset number of the tuple within its page
xlrec.flags = 0;
if (all_visible_cleared)
xlrec.flags |= XLH_INSERT_ALL_VISIBLE_CLEARED; // record that the all-visible flag was cleared above
if (options & HEAP_INSERT_SPECULATIVE)
xlrec.flags |= XLH_INSERT_IS_SPECULATIVE;
Assert(ItemPointerGetBlockNumber(&heaptup->t_self) == BufferGetBlockNumber(buffer));
/*
* For logical decoding, we need the tuple even if we're doing a full
* page write, so make sure it's included even if we take a full-page
* image. (XXX We could alternatively store a pointer into the FPW).
*/
if (RelationIsLogicallyLogged(relation) && // logical decoding case
!(options & HEAP_INSERT_NO_LOGICAL))
{
xlrec.flags |= XLH_INSERT_CONTAINS_NEW_TUPLE;
bufflags |= REGBUF_KEEP_DATA;
if (IsToastRelation(relation)) // the relation is a TOAST table
xlrec.flags |= XLH_INSERT_ON_TOAST_RELATION;
}
XLogBeginInsert();
XLogRegisterData((char *) &xlrec, SizeOfHeapInsert); // register the xl_heap_insert struct
xlhdr.t_infomask2 = heaptup->t_data->t_infomask2; // fill in the xl_heap_header fields from the tuple header
xlhdr.t_infomask = heaptup->t_data->t_infomask;
xlhdr.t_hoff = heaptup->t_data->t_hoff;
/*
* note we mark xlhdr as belonging to buffer; if XLogInsert decides to
* write the whole page to the xlog, we don't need to store
* xl_heap_header in the xlog.
*/
XLogRegisterBuffer(0, buffer, REGBUF_STANDARD | bufflags);
XLogRegisterBufData(0, (char *) &xlhdr, SizeOfHeapHeader);
/* PG73FORMAT: write bitmap [+ padding] [+ oid] + data */
XLogRegisterBufData(0,
(char *) heaptup->t_data + SizeofHeapTupleHeader,
heaptup->t_len - SizeofHeapTupleHeader);
/* filtering by origin on a row level is much more efficient */
XLogSetRecordFlags(XLOG_INCLUDE_ORIGIN);
recptr = XLogInsert(RM_HEAP_ID, info); // insert this WAL record into the WAL buffer
PageSetLSN(page, recptr); // stamp the data page with the record's LSN
}
END_CRIT_SECTION();
UnlockReleaseBuffer(buffer); // release the buffer content lock
/* Drop the visibility map buffer if one was obtained earlier. */
if (vmbuffer != InvalidBuffer)
ReleaseBuffer(vmbuffer);
/*
* If tuple is cachable, mark it for invalidation from the caches in case
* we abort. Note it is OK to do this after releasing the buffer, because
* the heaptup data structure is all in local memory, not in the shared
* buffer.
*/
CacheInvalidateHeapTuple(relation, heaptup, NULL);
/* Note: speculative insertions are counted too, even if aborted later */
pgstat_count_heap_insert(relation, 1);
/*
* If heaptup is a private copy, release it. Don't forget to copy t_self
* back to the caller's image, too.
*/
if (heaptup != tup)
{
tup->t_self = heaptup->t_self;
heap_freetuple(heaptup);
}
}