相关
子事务的可见性判断、性能问题请看这篇:《Postgresql源码(25)子事务可见性判断和性能问题》
子事务的DDL和数据结构请看这篇:《Postgresql源码(71)子事务数据结构与DDL分析》
1 子事务的性能问题
测试SQL
BEGIN;
PREPARE sel(integer) AS
SELECT count(*)
FROM contend
WHERE id BETWEEN $1 AND $1 + 100;
PREPARE upd(integer) AS
UPDATE contend SET val = val + 1
WHERE id IN ($1, $1 + 10, $1 + 20, $1 + 30);
SAVEPOINT a;
\set rnd random(1,990)
EXECUTE sel(10 * :rnd + :client_id + 1);
EXECUTE upd(10 * :rnd + :client_id);
SAVEPOINT a;
\set rnd random(1,990)
EXECUTE sel(10 * :rnd + :client_id + 1);
EXECUTE upd(10 * :rnd + :client_id);
...
SAVEPOINT a;
\set rnd random(1,990)
EXECUTE sel(10 * :rnd + :client_id + 1);
EXECUTE upd(10 * :rnd + :client_id);
DEALLOCATE ALL;
COMMIT;
60 子事务
tps = 167.382164
pgbench -f subtrans.sql -n -c 6 -T 600
transaction type: subtrans.sql
scaling factor: 1
query mode: simple
number of clients: 6
number of threads: 1
duration: 600 s
number of transactions actually processed: 100434
latency average = 35.846 ms
tps = 167.382164 (including connections establishing)
tps = 167.383187 (excluding connections establishing)
perf top
+ 1.86% [.] tbm_iterate
+ 1.77% [.] hash_search_with_hash_value
1.75% [.] AllocSetAlloc
+ 1.36% [.] pg_qsort
+ 1.12% [.] base_yyparse
+ 1.10% [.] TransactionIdIsCurrentTransactionId
+ 0.96% [.] heap_hot_search_buffer
+ 0.96% [.] LWLockAttemptLock
+ 0.85% [.] HeapTupleSatisfiesVisibility
+ 0.82% [.] heap_page_prune
+ 0.81% [.] ExecInterpExpr
+ 0.80% [.] SearchCatCache1
+ 0.79% [.] BitmapHeapNext
+ 0.64% [.] LWLockRelease
+ 0.62% [.] MemoryContextAllocZeroAligned
+ 0.55% [.] _bt_checkkeys
0.54% [.] hash_any
+ 0.52% [.] _bt_compare
0.51% [.] ExecScan
90 子事务
tps = 68.993634
pgbench -f subtrans.sql -n -c 6 -T 600
transaction type: subtrans.sql
scaling factor: 1
query mode: simple
number of clients: 6
number of threads: 1
duration: 600 s
number of transactions actually processed: 41400
latency average = 86.965 ms
tps = 68.993634 (including connections establishing)
tps = 68.993993 (excluding connections establishing)
perf top
+ 10.59% [.] LWLockAttemptLock
+ 7.12% [.] LWLockRelease
+ 2.70% [.] LWLockAcquire
+ 2.40% [.] SimpleLruReadPage_ReadOnly
+ 1.30% [.] TransactionIdIsCurrentTransactionId
+ 1.26% [.] tbm_iterate
+ 1.22% [.] hash_search_with_hash_value
+ 1.08% [.] AllocSetAlloc
+ 0.77% [.] heap_hot_search_buffer
+ 0.72% [.] pg_qsort
+ 0.72% [.] base_yyparse
+ 0.66% [.] SubTransGetParent
+ 0.62% [.] HeapTupleSatisfiesVisibility
+ 0.54% [.] ExecInterpExpr
+ 0.51% [.] SearchCatCache1
2 可见性判断基础
《数据库系统实现》7.8.5 多版本时间戳
3 快照基础
postgres=# select txid_current_snapshot();
txid_current_snapshot
---------------------------------
1835847:1835853:1835847,1835849
postgres=# select txid_current();
txid_current
--------------
1835853
txid_current_snapshot is xmin:xmax:xip_list
理解这三个值得含义
- xmin:最小的活跃事务ID,比这个小的肯定都提交或回滚了
- xmax:下一个要分配的事务ID,比这个值大的或者等于得肯定都没分配
- xip_list:最容易理解,正在运行的事务ID
PG中使用行版本来判断可见性,每行数据都记录了创建、删除的XID;如果记录的XID是子事务的XID,还需要查询子事务的状态确认事务ID是否有效。
默认的RC隔离级别下,每条语句都会重新拿快照:
Snapshot
GetSnapshotData(Snapshot snapshot)
{
...
return snapshot;
}
对于事务可见性判断具体会拿到下面三个数据:
- xmax:最大未使用事务ID
- xip_list:活跃事务ID列表
- command number:最小的一个可见command number
typedef struct SnapshotData
{
...
TransactionId xmin; /* all XID < xmin are visible to me */
TransactionId xmax; /* all XID >= xmax are invisible to me */
TransactionId *xip;
...
CommandId curcid; /* in my xact, CID < curcid are visible */
...
}
快照的获取需要遍历整个process array(在共享内存中记录了所有正在运行的backend信息),每一个session在某一时刻只能有一个确定的XID,但由于子事务的原因,每个session可以额外记录64个活跃的子事务状态信息。
4 proc array
-
PRCO(816B)数组,维护链表结构方便申请释放,对应每个后台服务进程,PID为OS标识、PGPROCNO为内部标识
-
XACT(12B)数组,维护快照需要的xmin和xid,XACT从PROC拆出来是为了更高的cache line hit(64B放5个XACT),但是因为xmin的频繁更新,造成更严重的cache line invaild,PG14把这个结构完全优化掉了。
-
注意ProcArrayStruct后面跟的整形list保存的是运行中的 PROC和XACT的数组INDEX,按顺序存放
5 子事务可见性判断
执行SQL
begin; -- 1835915
insert into t1 values (1,1);
insert into t1 values (2,1);
delete from t1 where a = 2;
savepoint a;
insert into t1 values (3,1);
savepoint b;
insert into t1 values (4,1);
savepoint c;
insert into t1 values (5,1);
rollback to savepoint c;
select * from t1;
NOTICE: snapshot->curcid: 6
a | b
---+---
1 | 1
3 | 1
4 | 1
select txid_current();
txid_current
--------------
1835915
内存状态PGPROC
p *MyProc
{
pid = 10219,
pgprocno = 99,
backendId = 3,
databaseId = 13212,
roleId = 10,
subxids = {xids = {1835916, 1835917, 1835918, 1835883, 0 <repeats 60 times>}},
...
}
内存状态PGXACT
p *MyPgXact
{
xid = 1835915,
xmin = 1835915,
overflowed = 0,
nxids = 2
...
}
内存状态CurrentTransactionState
p *CurrentTransactionState
{
transactionId = 0,
subTransactionId = 5,
name = 0x2a9c3d0 "c",
state = TRANS_INPROGRESS,
blockState = TBLOCK_SUBINPROGRESS,
nestingLevel = 4,
parent = 0x2a9b980}
p *CurrentTransactionState->parent
{
transactionId = 1835917,
subTransactionId = 3,
name = 0x2a9c1f8 "b",
state = TRANS_INPROGRESS,
blockState = TBLOCK_SUBINPROGRESS,
nestingLevel = 3,
parent = 0x2a9b818
}
p *CurrentTransactionState->parent->parent
{
transactionId = 1835916,
subTransactionId = 2,
name = 0x2a9be30 "a",
state = TRANS_INPROGRESS,
blockState = TBLOCK_SUBINPROGRESS,
nestingLevel = 2,
parent = 0xf16580 <TopTransactionStateData>
}
p *CurrentTransactionState->parent->parent->parent
{
transactionId = 1835915,
subTransactionId = 1,
name = 0x0,
state = TRANS_INPROGRESS,
blockState = TBLOCK_INPROGRESS,
nestingLevel = 1,
parent = 0x0
}
MVCC判断位置
/* ----------------------------- */
/* 标准查询
/* ----------------------------- */
exec_simple_query
PortalRun
ExecScan
heapgettup_pagemode
heapgetpage
/* ----------------------------- */
/* 标准查询:MVCC判断
/* ----------------------------- */
heapgetpage
// 缓存读,返回pin & unlock状态的buffer
ReadBufferExtended
LockBuffer(buffer, BUFFER_LOCK_SHARE);
// 遍历所有元组!(所以表膨胀对性能会有影响)
for (lineoff = FirstOffsetNumber, lpp = PageGetItemId(dp, lineoff); lineoff <= lines; lineoff++, lpp++)
...
// 可见性判断入口函数
valid = HeapTupleSatisfiesVisibility
// 记录可见的元组
if (valid) scan->rs_vistuples[ntup++] = lineoff;
...
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
scan->rs_ntuples = ntup;
HeapTupleSatisfiesMVCC
/* ----------------------------- */
/* MVCC Satisfies
/* ----------------------------- */
HeapTupleSatisfiesVisibility
HeapTupleSatisfiesMVCC
if (!HeapTupleHeaderXminCommitted(tuple)) // 【判断1】创建者没有提交
if (HeapTupleHeaderXminInvalid(tuple)) // 【判断1.1】创建者的事务ID已经回滚了
return false;
...
else if (TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetRawXmin(tuple))) // 【判断1.2】当前事务产生的元组
... // 当前事务内部使用ctid判断可见性
if (HeapTupleHeaderGetCmin(tuple) >= snapshot->curcid) // 【判断1.2.1】insert发生在scan之后
return false;
if (tuple->t_infomask & HEAP_XMAX_INVALID) // 【判断1.2.2】没有被删除
return true;
if (HeapTupleHeaderGetCmax(tuple) >= snapshot->curcid) // 【判断1.2.3】删除了;删除和读谁比较早?
return true; // 先读 后删除
else
return false; // 先删除 后读
...
else if (XidInMVCCSnapshot...) // (性能问题)【判断1.3】非当前事务产生的元组,事务ID在当前快照中
else if (TransactionIdDidCommit(HeapTupleHeaderGetRawXmin(tuple))) // 【判断1.4】非当前事务产生的元组;创建者已经提交了
...
else
return false;
...
...
可见性判断过程:
a | b command id xmin xmax t_infomask 可见性判断流程
---+---
1 | 1 0 1835915 0 2048 = 0x800 = HEAP_XMAX_INVALID 可见:1 -> 1.2 -> 1.2.2
(2 | 1) 1 1835915 1835915 32 = 0x20 = HEAP_COMBOCID 不可见:1 -> 1.2 -> 1.2.3
3 | 1 2 1835916 0 2048 = 0x800 = HEAP_XMAX_INVALID 可见:1 -> 1.2 -> 1.2.2
4 | 1 3 1835917 0 2048 = 0x800 = HEAP_XMAX_INVALID 可见:1 -> 1.2 -> 1.2.2
(5 | 1) 4 1835918 0 2560 = 0xA00 = HEAP_XMIN_INVALID 不可见:1 -> 1.1
HEAP_XMAX_INVALID
TransactionIdIsCurrentTransactionId如何判断数据是当前事务产生的?
遍历CurrentTransactionState list,如果找到创建元组的xmin说明是当前事务产生的
性能问题的根源:
HeapTupleSatisfiesMVCC拿到一个元组的xmin时,并不知道该XID是子事务ID还是事务ID,例如上面数据(5|1)是事务1835918创建的:
(1)如果快照中保存了完整的事务信息(子事务少于64个),可以通过快照保存的子事务信息。
(2)如果快照的子事务信息溢出了,那么需要去遍历SLRU页面,通过自己的XID一层一层的找到顶层事务ID用户判断可见性。
1835918->1835917->1835916->1835915
XidInMVCCSnapshot
...
if (!snapshot->suboverflowed)
{
/* we have full data, so search subxip */
int32 j;
for (j = 0; j < snapshot->subxcnt; j++)
{
if (TransactionIdEquals(xid, snapshot->subxip[j]))
return true;
}
/* not there, fall through to search xip[] */
}
else
{
// SubTransGetTopmostTransaction --> SubTransGetParent
// --> SimpleLruReadPage_ReadOnly --> (control LW_EXCLUSIVE) --> SimpleLruReadPage
// --> SlruSelectLRUPage --> (bufer_locks LW_EXCLUSIVE) SimpleLruWaitIO --> ...
xid = SubTransGetTopmostTransaction(xid);
if (TransactionIdPrecedes(xid, snapshot->xmin))
return false;
}
...
6 SLRU页面
- SLRU页面在PG中有四类应用,CLOG、SUBTRANS、COMMIT_TS、MULTI_XACT
- SLRU:SIMPLE+最近最少使用=SLRU,常用的页面淘汰方法。
- slru实现:使用计数管理,使用后增加页面计数,淘汰最小的
- lru触发:页面超过CLOGShmemBuffers:
Min(128, Max(4, NBuffers / 512))
:4-128个
- lru触发:页面超过CLOGShmemBuffers:
- vfd实现:使用链表管理,使用后前调链表位置,淘汰最后的
- slru实现:使用计数管理,使用后增加页面计数,淘汰最小的
- 子事务页面:使用事务ID查询,事务ID作为OFFSET可以对应到某一个页面的某一个uint32的位置,该位置存放事务ID的父事务ID
- sharedbuffer中保存32个页面
- CLOG页面:使用事务ID查询,事务ID作为OFFSET对应到某个页面的某一个2BIT的位置,该位置存放事务ID的状态
- sharedbuffer中保存4-128个页面