**环境:**Linux openGauss115 4.19.90-2303.4.0.0193.oe1.aarch64 #1 SMP Wed Mar 22 03:04:16 UTC 2023 aarch64 aarch64 aarch64 GNU/Linux
问题:资源池化双集群debug版本,备集群首备build之后,重启,回放日志到LSN 4/1108(XLOG_SWITCH类型)后,一直不回放日志
1. gcore 进程号
gdb gaussdb core.123456进入
2. 查看所有线程,找出异常线程
3. 切到线程86分发线程
查看函数CheckBufHasSpaceToDispatch 和 GetLsnCheckInfo
查看代码DispatchRedoRecordToFile -> CheckBufHasSpaceToDispatch
#ifdef USE_ASSERT_CHECKING
uint64 waitCount = 0;
while (!CheckBufHasSpaceToDispatch(record->EndRecPtr)) {
RedoInterruptCallBack();
waitCount++;
if ((waitCount & PRINT_ALL_WAIT_COUNT) == PRINT_ALL_WAIT_COUNT) {
ereport(LOG, (errmodule(MOD_REDO), errcode(ERRCODE_LOG),
errmsg("DispatchRedoRecordToFile:replayedLsn:%lu, blockcnt:%lu, readEndLSN:%lu",
GetXLogReplayRecPtr(NULL, NULL), waitCount, record->EndRecPtr)));
}
}
#endif
这里不应该死循环在这里,说明CheckBufHasSpaceToDispatch返回false
查看代码 CheckBufHasSpaceToDispatch
bool CheckBufHasSpaceToDispatch(XLogRecPtr endRecPtr)
{
uint64 curPosition;
XLogRecPtr curLsn;
GetLsnCheckInfo(&curPosition, &curLsn);
XLogRecPtr endPtr = endRecPtr;
if (endPtr % XLogSegSize == 0) {
XLByteAdvance(endPtr, SizeOfXLogLongPHD);
} else if (endPtr % XLOG_BLCKSZ == 0) {
XLByteAdvance(endPtr, SizeOfXLogShortPHD);
}
uint32 len = (uint32)(endPtr - curLsn);
if (len < LSN_CHECK_BUF_SIZE) {
return true;
}
return false;
}
从代码可知应该是len > LSN_CHECK_BUF_SIZE才会返回false,切到函数1 (操作(gdb) f 1)
(gdb) f 1
#1 0x00000000021d0368 in extreme_rto::CheckBufHasSpaceToDispatch (endRecPtr=18253611008) at dispatcher.cpp:1679
1679 in dispatcher.cpp
(gdb) p len
$1 = 1073737504
(gdb) p/x g_dispatcher->lsnCheckCtl.curLsn
$2 = 0x400001108
(gdb) p/x endRecPtr
$3 = 0x440000000
(gdb) p *g_dispatcher->lsnCheckCtl
$4 = {curPosition = 288, curLsn = 17179873544, lsnCheckBuf = {0 <repeats 134217728 times>}}
(gdb) p/x 17179873544
$5 = 0x400001108
(gdb) p 0x440000000
$6 = 18253611008
len = endRecPtr - curLsn = 18253611008 - 17179873544
#define LSN_CHECK_BUF_SIZE (128*1024*1024)
LSN_CHECK_BUF_SIZE = 134217728
4.读线程
发现读线程队列满
5.回放线程–普通回放线程
6.事务回放线程
(gdb) t 120
[Switching to thread 120 (Thread 0xfffc6cbc9700 (LWP 2611290))]
#0 0x00000000021d4a74 in extreme_rto::HandlePageRedoInterruptsImpl (clearRedoFdCountInc=1) at page_redo.cpp:398
398 page_redo.cpp: No such file or directory.
(gdb) bt
#0 0x00000000021d4a74 in extreme_rto::HandlePageRedoInterruptsImpl (clearRedoFdCountInc=1) at page_redo.cpp:398
#1 0x00000000021d4b44 in extreme_rto::HandlePageRedoInterrupts () at page_redo.cpp:411
#2 0x00000000021d78a0 in extreme_rto::TrxnWorkerQueueCallBack () at page_redo.cpp:1350
#3 0x0000000001c82d08 in RedoInterruptCallBack () at knl_thread.cpp:1987
#4 0x00000000021d40d8 in extreme_rto::RedoWorkerQueueCallBack () at page_redo.cpp:187
#5 0x00000000021dfe0c in extreme_rto::SPSCBlockingQueueTop (queue=0xfffcd5de0060) at spsc_blocking_queue.cpp:251
#6 0x00000000021d7940 in extreme_rto::TrxnWorkMain () at page_redo.cpp:1366
#7 0x00000000021dc63c in extreme_rto::RedoMainLoop () at page_redo.cpp:2555
#8 0x00000000021dcaf0 in extreme_rto::ParallelRedoThreadMain () at page_redo.cpp:2664
#9 0x00000000022bb318 in ExtremeParallelRedoThreadMain () at extreme_rto_redo_api.cpp:244
#10 0x000000000223f7b0 in MultiRedoMain () at multi_redo_api.cpp:224
#11 0x0000000001b7cfe8 in GaussDbAuxiliaryThreadMain<(knl_thread_role)22> (arg=0xfffdf90ea260) at postmaster.cpp:13628
#12 0x0000000001b6990c in GaussDbThreadMain<(knl_thread_role)22> (arg=0xfffdf90ea260) at postmaster.cpp:14015
#13 0x0000000001b5bc04 in InternalThreadFunc (args=0xfffdf90ea260) at postmaster.cpp:14607
#14 0x0000000002915ba8 in ThreadStarterFunc (arg=0xfffdf90ea250) at gs_thread.cpp:380
#15 0x0000ffff55b588cc in start_thread (arg=0xfffd00a482cf) at pthread_create.c:486
#16 0x0000ffff55a9954c in thread_start () at ../sysdeps/unix/sysv/linux/aarch64/clone.S:78
分析可知:
(1)分发线程死循环,导致读日志线程队列满,回放线程队列空,一直没有回放
(2)分发线程在分发日志 endRecPtr = 0x440000000(4/40000000)时,CheckBufHasSpaceToDispatch 校验分发是否还有空间可以分发,意思就是g_dispatcher->lsnCheckCtl函数设置了上一次分发的位置=0x400001108,但现在需要分发的日志大小检验 = 当前分发位置 - 上一次分发的日志,发现这个差值大于LSN_CHECK_BUF_SIZE,所以说明g_dispatcher->lsnCheckCtl位置更新不对
(3)debug版本check问题
目前规避方案:(1) debug版本先用并行回放重启再用极致rto (2)release版本可能没这个问题
7.SetLsnCheckInfo
g_dispatcher->lsnCheckCtl的值是从ItemLsnCheck -> PushCheckLsn -> SetLsnCheckInfo获取
8. xlog日志
000000010000000400000000文件
REDO @ 4/EF8; LSN 4/F48: prev 4/EC8; xid 0; term 703; len 40; total 74; crc 3186837428; desc: Standby - XLOG_RUNNING_XACTS
REDO @ 4/F48; LSN 4/FE8: prev 4/EF8; xid 0; term 703; len 120; total 154; crc 2956845812; desc: XLOG - checkpoint: redo 4/EC8; len 120; next_csn 629133; recent_global_xmin 674500; tli 1; fpw false; xid 686730; oid 24705; multi 2; offset 0; oldest xid 12786 in DB 15186; oldest running xid 686730; oldest xid with epoch having undo 0; online at Tue Dec 12 20:35:18 2023; remove_seg 0/1
REDO @ 4/FE8; LSN 4/1018: prev 4/F48; xid 0; term 703; len 8; total 42; crc 1211992718; desc: Standby - XLOG_STANDBY_CSN
REDO @ 4/1018; LSN 4/1068: prev 4/FE8; xid 0; term 703; len 40; total 74; crc 416350291; desc: Standby - XLOG_RUNNING_XACTS
REDO @ 4/1068; LSN 4/1108: prev 4/1018; xid 0; term 703; len 120; total 154; crc 3714272456; desc: XLOG - checkpoint: redo 4/FE8; len 120; next_csn 629133; recent_global_xmin 674500; tli 1; fpw false; xid 686730; oid 24705; multi 2; offset 0; oldest xid 12786 in DB 15186; oldest running xid 686730; oldest xid with epoch having undo 0; online at Tue Dec 12 20:36:18 2023; remove_seg 0/1
REDO @ 4/1108; LSN 4/40000000: prev 4/1068; xid 0; term 703; len 0; total 32; crc 1075085097; desc: XLOG - xlog switch
000000010000000400000001文件
REDO @ 4/40000028; LSN 4/40000058: prev 4/1108; xid 0; term 703; len 8; total 42; crc 2686753266; desc: Standby - XLOG_STANDBY_CSN
REDO @ 4/40000058; LSN 4/400000A8: prev 4/40000028; xid 0; term 703; len 40; total 74; crc 581279890; desc: Standby - XLOG_RUNNING_XACTS
REDO @ 4/400000A8; LSN 4/40000148: prev 4/40000058; xid 0; term 703; len 120; total 154; crc 385920242; desc: XLOG - checkpoint: redo 4/40000028; len 120; next_csn 629133; recent_global_xmin 674500; tli 1; fpw false; xid 686730; oid 24705; multi 2; offset 0; oldest xid 12786 in DB 15186; oldest running xid 686730; oldest xid with epoch having undo 0; online at Tue Dec 12 20:37:31 2023; remove_seg 0/1
REDO @ 4/40000148; LSN 4/40000178: prev 4/400000A8; xid 0; term 703; len 8; total 42; crc 1075485289; desc: Standby - XLOG_STANDBY_CSN
REDO @ 4/40000178; LSN 4/400001C8: prev 4/40000148; xid 0; term 703; len 40; total 74; crc 3810559801; desc: Standby - XLOG_RUNNING_XACTS
REDO @ 4/400001C8; LSN 4/40000268: prev 4/40000178; xid 0; term 703; len 120; total 154; crc 200874704; desc: XLOG - checkpoint: redo 4/40000148; len 120; next_csn 629133; recent_global_xmin 674500; tli 1; fpw false; xid 686730; oid 24705; multi 2; offset 0; oldest xid 12786 in DB 15186; oldest running xid 686730; oldest xid with epoch having undo 0; online at Tue Dec 12 20:38:31 2023; remove_seg 0/1
REDO @ 4/40000268; LSN 4/40000298: prev 4/400001C8; xid 0; term 703; len 8; total 42; crc 2174729666; desc: Standby - XLOG_STANDBY_CSN
REDO @ 4/40000298; LSN 4/400002E8: prev 4/40000268; xid 0; term 703; len 40; total 74; crc 1829591469; desc: Standby - XLOG_RUNNING_XACTS
REDO @ 4/400002E8; LSN 4/40000388: prev 4/40000298; xid 0; term 703; len 120; total 154; crc 1834159785; desc: XLOG - checkpoint: redo 4/40000268; len 120; next_csn 629133; recent_global_xmin 674500; tli 1; fpw false; xid 686730; oid 24705; multi 2; offset 0; oldest xid 12786 in DB 15186; oldest running xid 686730; oldest xid with epoch having undo 0; online at Tue Dec 12 20:39:31 2023; remove_seg 0/1
REDO @ 4/40000388; LSN 4/400003B8: prev 4/400002E8; xid 0; term 703; len 8; total 42; crc 2922488873; desc: XLOG - backup end: 4/FE8
REDO @ 4/400003B8; LSN 4/80000000: prev 4/40000388; xid 0; term 703; len 0; total 32; crc 4018308493; desc: XLOG - xlog switch