PostgreSQL启动恢复读取checkpoint记录失败的条件
-
1、首先读取ControlFile->checkPoint指向的checkpoint
-
2、如果读取失败,slave直接abort退出,master再次读取ControlFile->prevCheckPoint指向的checkpoint
-
StartupXLOG->
-
|--checkPointLoc = ControlFile->checkPoint;
-
|--record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
-
|-- if (record != NULL){
-
...
-
}else if (StandbyMode){
-
ereport(PANIC,(errmsg("could not locate a valid checkpoint record")));
-
}else{
-
checkPointLoc = ControlFile->prevCheckPoint;
-
record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
-
if (record != NULL){
-
InRecovery = true;//标记下面进入recovery
-
}else{
-
ereport(PANIC,(errmsg("could not locate a valid checkpoint record")));
-
}
-
}
-
...
一、那么什么条件下读取的checkpoint记录record==NULL?
-
1、ControlFile->checkPoint % XLOG_BLCKSZ < SizeOfXLogShortPHD
-
2、ReadRecord(xlogreader, ControlFile->checkPoint, LOG, true)返回NULL
-
3、ReadRecord读到的record!=NULL && record->xl_rmid != RM_XLOG_ID
-
4、ReadRecord读到的record!=NULL && info != XLOG_CHECKPOINT_SHUTDOWN && info != XLOG_CHECKPOINT_ONLINE
-
5、ReadRecord读到的record!=NULL && record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint)
二、ReadRecord函数返回NULL的条件
-
ReadRecord(xlogreader, ControlFile->checkPoint, LOG, true)
-
|--record = XLogReadRecord(xlogreader, ControlFile->checkPoint, &errormsg);
-
|-- 2.1 record==NULL && !StandbyMode
-
|-- 2.2 record!=NULL && !tliInHistory(xlogreader->latestPageTLI, expectedTLEs)
-
/*-----
-
note:只要读取了一页xlog,xlogreader->latestPageTLI就会被赋值为该页页头中的xlp_tli(即该页第一个记录的时间线)
-
XLogReaderValidatePageHeader
-
-->xlogreader->latestPageTLI=hdr->xlp_tli;
-
------*/
三、XLogReadRecord读取checkpoint返回NULL的条件?
-
XLogReadRecord(xlogreader, ControlFile->checkPoint, &errormsg)
-
targetPagePtr = ControlFile->checkPoint - (ControlFile->checkPoint % XLOG_BLCKSZ);
-
targetRecOff = ControlFile->checkPoint % XLOG_BLCKSZ;
-
readOff = ReadPageInternal(state,targetPagePtr, Min(targetRecOff + SizeOfXLogRecord, XLOG_BLCKSZ));
-
pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) state->readBuf);
-
record = (XLogRecord *) (state->readBuf + RecPtr % XLOG_BLCKSZ);
-
total_len = record->xl_tot_len;
-
-------------
-
1、readOff < 0
-
2、0< targetRecOff < pageHeaderSize
-
3、(((XLogPageHeader) state->readBuf)->xlp_info & XLP_FIRST_IS_CONTRECORD) && targetRecOff == pageHeaderSize
-
page头有跨页的record并且checkpoint定位的偏移正好在页头尾部
-
4、targetRecOff <= XLOG_BLCKSZ - SizeOfXLogRecord &&
-
!ValidXLogRecordHeader(state, ControlFile->checkPoint, state->ReadRecPtr, record,randAccess)
-
---(record->xl_tot_len < SizeOfXLogRecord || record->xl_rmid > RM_MAX_ID || xl_prev校验失败:randAccess为true时(按指定位置读取checkpoint即属此情况)条件为 record->xl_prev >= RecPtr,否则条件为 record->xl_prev != state->ReadRecPtr)
-
5、targetRecOff > XLOG_BLCKSZ - SizeOfXLogRecord && total_len < SizeOfXLogRecord
-
6、total_len > state->readRecordBufSize && !allocate_recordbuf(state, total_len)
-
一旦该记录损坏,total_len的长度非常大的话,就需要allocate_recordbuf扩展state->readRecordBuf,可能因此分配失败abort
-
记录的checksum需要等待全部读取完整记录后才校验
-
-------------
三、ReadPageInternal返回值(readOff)小于0的条件
-
ReadPageInternal(state,targetPagePtr, Min(targetRecOff + SizeOfXLogRecord, XLOG_BLCKSZ))
-
1、第一次read wal文件,readLen = state->read_page:读取第一页。readLen < 0
-
2、readLen >= 0 && !XLogReaderValidatePageHeader(state, targetSegmentPtr, state->readBuf)
-
--
-
3、读取checkpoint所在页readLen = state->read_page: readLen < 0
-
4、readLen >= 0 && readLen <= SizeOfXLogShortPHD
-
5、!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr)
四、XLogPageRead何时返回值<0 ?
-
/*
-
1、WaitForWALToBecomeAvailable open失败
-
2、lseek 失败 && !StandbyMode
-
3、read失败 && !StandbyMode
-
4、校验page头失败 && !StandbyMode
-
如果是StandbyMode,则会retry:重新进入WaitForWALToBecomeAvailable,切换日志源后再次open
-
*/
-
!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,private->randAccess,1,targetRecPtr)//open
-
|-- return -1
-
readOff = targetPageOff;
-
if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0){
-
!StandbyMode:: return -1
-
}
-
if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ){
-
!StandbyMode:: return -1
-
}
-
XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf)
-
!StandbyMode:: return -1
五、WaitForWALToBecomeAvailable何时返回false?
-
--XLOG_FROM_ARCHIVE | XLOG_FROM_PG_WAL
-
1、先XLogFileReadAnyTLI open日志:
-
1、遍历时间线列表里的每一个时间线,从最新的开始
-
2、当读取checkpoint的时候,source是XLOG_FROM_ANY
-
3、先找归档的日志进行open;如果open失败再找WAL日志进行open
-
4、如果都没有open成功,则向前找时间线,open前一个时间线segno和文件号相同的文件进行open
-
5、open成功后expectedTLEs被赋值为当前时间线列表的所有值
-
2、如果open失败,则切换日志源:XLOG_FROM_ARCHIVE | XLOG_FROM_PG_WAL -> XLOG_FROM_STREAM
-
3、切换日志源时,若当前日志源为XLOG_FROM_ARCHIVE | XLOG_FROM_PG_WAL,则:
-
slave && promote :return false
-
!StandbyMode:return false
-
--XLOG_FROM_STREAM
-
1、!WalRcvStreaming()即receiver进程挂了,切换日志源
-
2、CheckForStandbyTrigger()切换日志源
-
3、XLOG_FROM_STREAM->XLOG_FROM_ARCHIVE
六、代码流程:
-
/*
 * ReadCheckpointRecord (abridged excerpt)
 *
 * Reads the WAL record located at RecPtr through ReadRecord() and returns it
 * only when it passes all of the checkpoint sanity checks below. Any failed
 * check returns NULL.
 *
 * xlogreader - reader state handed straight to ReadRecord()
 * RecPtr     - location of the candidate checkpoint record
 * whichChkpt - not used in the visible part of this excerpt; in the
 *              StartupXLOG outline of this file the caller passes 1 for
 *              ControlFile->checkPoint and 2 for ControlFile->prevCheckPoint
 * report     - not used in the visible part of this excerpt
 *
 * The "..." lines stand for code elided from the excerpt; the local
 * declarations of record and info are elided as well.
 */
static XLogRecord * ReadCheckpointRecord(
-
XLogReaderState *xlogreader,
-
XLogRecPtr RecPtr,
-
int whichChkpt,
-
bool report
-
)
-
{
-
// Check 1: per the original note, XRecOffIsValid(RecPtr) is ((RecPtr) % XLOG_BLCKSZ >= SizeOfXLogShortPHD)
-
if (!XRecOffIsValid(RecPtr)){
-
...
-
return NULL;
-
}
-
record = ReadRecord(xlogreader, RecPtr, LOG, true);
-
if (record == NULL){ /* Check 2: ReadRecord() did not return a record */
-
...
-
return NULL;
-
}
-
if (record->xl_rmid != RM_XLOG_ID){ /* Check 3: resource-manager id must be RM_XLOG_ID */
-
...
-
return NULL;
-
}
-
info = record->xl_info & ~XLR_INFO_MASK; /* clear the XLR_INFO_MASK bits; the remainder is compared with the checkpoint types */
-
if (info != XLOG_CHECKPOINT_SHUTDOWN && /* Check 4: must be a shutdown or an online checkpoint record */
-
info != XLOG_CHECKPOINT_ONLINE){
-
...
-
return NULL;
-
}
-
/* Check 5: total length must be exactly record header + short data header + CheckPoint payload */
if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint)){
-
...
-
return NULL;
-
}
-
return record; /* all checks passed */
-
}
-
/*
 * ReadPageInternal (abridged excerpt)
 *
 * Reads the WAL page that starts at pageptr into state->readBuf through the
 * state->read_page callback, asking for at least reqLen bytes (and never less
 * than SizeOfXLogShortPHD), then validates the page header.
 *
 * Returns the number of bytes read (readLen) on success. On any failure it
 * jumps to err, calls XLogReaderInvalReadState(state) and returns -1.
 *
 * Local declarations (targetSegNo, targetPageOff, readLen, hdr) are elided
 * from this excerpt.
 */
static int ReadPageInternal(
-
XLogReaderState *state,
-
XLogRecPtr pageptr,
-
int reqLen
-
)
-
{
-
XLByteToSeg(pageptr, targetSegNo);
-
targetPageOff = (pageptr % XLogSegSize);
-
/*
 * 1. The target page is in a segment other than state->readSegNo and is not
 *    that segment's first page: read the segment's first page and validate
 *    its header first. Fails when readLen < 0, or when readLen >= 0 but the
 *    page header does not validate.
 */
-
if (targetSegNo != state->readSegNo && targetPageOff != 0){
-
XLogRecPtr targetSegmentPtr = pageptr - targetPageOff; /* start of the segment containing pageptr */
-
readLen = state->read_page(state, targetSegmentPtr, XLOG_BLCKSZ,
-
state->currRecPtr,
-
state->readBuf, &state->readPageTLI);
-
if (readLen < 0)
-
goto err;
-
if (!XLogReaderValidatePageHeader(state, targetSegmentPtr, state->readBuf))
-
goto err;
-
}
-
/*
 * 2. Read the target page, requesting at least a short page header.
 *    Fails when:
 *      1) readLen < 0
 *      2) readLen <= SizeOfXLogShortPHD
 */
-
readLen = state->read_page(state, pageptr, Max(reqLen, SizeOfXLogShortPHD),
-
state->currRecPtr,
-
state->readBuf, &state->readPageTLI);
-
if (readLen < 0)
-
goto err;
-
if (readLen <= SizeOfXLogShortPHD)
-
goto err;
-
hdr = (XLogPageHeader) state->readBuf;
-
/*
 * 3. If what was read does not cover the whole page header
 *    (XLogPageHeaderSize(hdr)), read again asking for the full header size.
 */
-
if (readLen < XLogPageHeaderSize(hdr)){
-
readLen = state->read_page(state, pageptr, XLogPageHeaderSize(hdr),
-
state->currRecPtr,
-
state->readBuf, &state->readPageTLI);
-
if (readLen < 0)
-
goto err;
-
}
-
/*
 * Failure case 3): the complete page header does not validate. Per the
 * original note, a successful validation sets
 * state->latestPageTLI = hdr->xlp_tli.
 */
-
if (!XLogReaderValidatePageHeader(state, pageptr, (char *) hdr))
-
goto err;
-
// finally, record what was read in the reader state
-
state->readSegNo = targetSegNo;
-
state->readOff = targetPageOff;
-
state->readLen = readLen;
-
return readLen;
-
err:
-
XLogReaderInvalReadState(state); /* common failure exit: invalidate the read state */
-
return -1;
-
}
-
/*
 * WaitForWALToBecomeAvailable (abridged excerpt)
 *
 * Loops until WAL can be read from one of the sources XLOG_FROM_ARCHIVE,
 * XLOG_FROM_PG_WAL or XLOG_FROM_STREAM, moving to another source whenever
 * lastSourceFailed is set.
 *
 * Returns true when a segment file was opened from archive/pg_wal, or when
 * streamed data is available and readFile is already open.
 * Returns false when the archive/pg_wal source has failed and either
 * StandbyMode is off, or StandbyMode is on and CheckForStandbyTrigger()
 * returns true.
 *
 * RecPtr        - compared with receivedUpto to decide whether streamed data
 *                 is available; also the streaming start point when
 *                 fetching_ckpt is false
 * randAccess    - not used in the visible part of this excerpt
 * fetching_ckpt - when true, streaming is requested from RedoStartLSN on
 *                 ControlFile->checkPointCopy.ThisTimeLineID
 * tliRecPtr     - passed to tliOfPointInHistory() to choose the streaming
 *                 timeline when fetching_ckpt is false
 *
 * Local declarations (ptr, tli, now, secs, usecs, wait_time, havedata,
 * streaming_reply_sent, last_fail_time, ...) are elided from this excerpt.
 */
static bool WaitForWALToBecomeAvailable(
-
XLogRecPtr RecPtr,
-
bool randAccess,
-
bool fetching_ckpt,
-
XLogRecPtr tliRecPtr
-
)
-
{
-
/*
 * 1. Per the original note, currentSource is 0 while the checkpoint record
 *    is being read, so in archive recovery the first open attempt uses
 *    XLOG_FROM_ARCHIVE. Outside archive recovery the source is always
 *    XLOG_FROM_PG_WAL.
 */
-
if (!InArchiveRecovery)
-
currentSource = XLOG_FROM_PG_WAL;
-
else if (currentSource == 0)
-
currentSource = XLOG_FROM_ARCHIVE;
-
for (;;)
-
{
-
int oldSource = currentSource; /* not referenced again in the visible part of this excerpt */
-
/*
 * 2. The previous attempt failed: switch to the next WAL source.
 */
-
if (lastSourceFailed){
-
switch (currentSource){
-
case XLOG_FROM_ARCHIVE:
-
case XLOG_FROM_PG_WAL:
-
if (StandbyMode && CheckForStandbyTrigger()){ /* standby with trigger detected: give up */
-
ShutdownWalRcv();
-
return false;
-
}
-
/* only a standby moves on to streaming; otherwise give up */
-
if (!StandbyMode)
-
return false;
-
/* if recovery.conf configures connection info for the primary, compute the start point and request streaming */
-
if (PrimaryConnInfo){//to be analyzed separately in more detail later
-
if (fetching_ckpt){//reading the checkpoint record
-
ptr = RedoStartLSN;
-
tli = ControlFile->checkPointCopy.ThisTimeLineID;
-
}else{
-
ptr = RecPtr;
-
tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
-
}
-
curFileTLI = tli;
-
RequestXLogStreaming(tli, ptr, PrimaryConnInfo,PrimarySlotName);
-
receivedUpto = 0; /* reset; refreshed from GetWalRcvWriteRecPtr() in the stream branch below */
-
}
-
currentSource = XLOG_FROM_STREAM;
-
break;
-
case XLOG_FROM_STREAM:
-
if (WalRcvStreaming())
-
ShutdownWalRcv();
-
if (recoveryTargetIsLatest){
-
if (rescanLatestTimeLine()){ /* true: go back to the archive at once, skipping the retry wait below */
-
currentSource = XLOG_FROM_ARCHIVE;
-
break;
-
}
-
}
-
now = GetCurrentTimestamp();
-
/* wait out the remainder of wal_retrieve_retry_interval, measured from last_fail_time */
if (!TimestampDifferenceExceeds(last_fail_time, now,
-
wal_retrieve_retry_interval)){
-
TimestampDifference(last_fail_time, now, &secs, &usecs);
-
wait_time = wal_retrieve_retry_interval -
-
(secs * 1000 + usecs / 1000);
-
WaitLatch(&XLogCtl->recoveryWakeupLatch,
-
WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
-
wait_time, WAIT_EVENT_RECOVERY_WAL_STREAM);
-
ResetLatch(&XLogCtl->recoveryWakeupLatch);
-
now = GetCurrentTimestamp();
-
}
-
last_fail_time = now;
-
currentSource = XLOG_FROM_ARCHIVE; /* streaming failed: fall back to the archive */
-
break;
-
default:
-
elog(ERROR, "unexpected WAL source %d", currentSource);
-
}
-
}else if (currentSource == XLOG_FROM_PG_WAL){
-
if (InArchiveRecovery) /* no failure: in archive recovery, try the archive again rather than staying on pg_wal */
-
currentSource = XLOG_FROM_ARCHIVE;
-
}
-
/*
 * 3. Attempt to read from the current source. The original note says this
 *    is the step reached first on entry.
 */
-
lastSourceFailed = false;
-
switch (currentSource)
-
{
-
case XLOG_FROM_ARCHIVE:
-
case XLOG_FROM_PG_WAL:
-
/* for XLOG_FROM_ARCHIVE the lookup source is widened to XLOG_FROM_ANY */
readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
-
currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
-
currentSource);
-
if (readFile >= 0)
-
return true; /* success! */
-
/* open failed: the next loop iteration switches WAL source */
-
lastSourceFailed = true;
-
break;
-
case XLOG_FROM_STREAM:{
-
if (!WalRcvStreaming()){ /* streaming is not active: mark this source as failed */
-
lastSourceFailed = true;
-
break;
-
}
-
if (RecPtr < receivedUpto) /* RecPtr is already below the cached received position */
-
havedata = true;
-
else{
-
XLogRecPtr latestChunkStart;
-
receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI); /* refresh the cached position */
-
if (RecPtr < receivedUpto && receiveTLI == curFileTLI){ /* data must also be on the timeline being read */
-
havedata = true;
-
if (latestChunkStart <= RecPtr){
-
XLogReceiptTime = GetCurrentTimestamp();
-
SetCurrentChunkStartTime(XLogReceiptTime);
-
}
-
}
-
else
-
havedata = false;
-
}
-
if (havedata){
-
if (readFile < 0){ /* no file open yet: open it, then loop again via the break below */
-
if (!expectedTLEs)
-
expectedTLEs = readTimeLineHistory(receiveTLI);
-
readFile = XLogFileRead(readSegNo, PANIC,
-
receiveTLI,
-
XLOG_FROM_STREAM, false);
-
}else{
-
/* just make sure source info is correct... */
-
readSource = XLOG_FROM_STREAM;
-
XLogReceiptSource = XLOG_FROM_STREAM;
-
return true;
-
}
-
break;
-
}
-
if (CheckForStandbyTrigger()){ /* trigger detected: treat the stream as failed so the source is switched */
-
lastSourceFailed = true;
-
break;
-
}
-
if (!streaming_reply_sent){
-
WalRcvForceReply();
-
streaming_reply_sent = true;
-
}
-
/* no data yet: wait on the recovery wakeup latch with a timeout of 5000 (presumably milliseconds) */
WaitLatch(&XLogCtl->recoveryWakeupLatch,
-
WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
-
5000L, WAIT_EVENT_RECOVERY_WAL_ALL);
-
ResetLatch(&XLogCtl->recoveryWakeupLatch);
-
break;
-
}
-
default:
-
elog(ERROR, "unexpected WAL source %d", currentSource);
-
}
-
HandleStartupProcInterrupts();
-
}
-
return false; /* not reached */
-
}
-
/*
 * XLogFileReadAnyTLI (abridged excerpt)
 *
 * Tries to open WAL segment segno on each timeline of the timeline list and
 * returns the first file descriptor that XLogFileRead() opens successfully,
 * or -1 when no candidate could be opened.
 *
 * segno  - segment number to open
 * emode  - passed through to XLogFileRead()
 * source - XLOG_FROM_ANY, XLOG_FROM_ARCHIVE or XLOG_FROM_PG_WAL; selects
 *          which locations are tried for each timeline
 *
 * Local declarations (tles, cell, fd) are elided from this excerpt.
 */
static int
-
XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
-
{
-
/* use the known timeline list if there is one, else read the history of recoveryTargetTLI */
if (expectedTLEs)
-
tles = expectedTLEs;
-
else
-
tles = readTimeLineHistory(recoveryTargetTLI);
-
/*
 * 1. Walk every timeline in the timeline list, starting from the newest.
 * 2. When the checkpoint record is being read, source is XLOG_FROM_ANY.
 * 3. Try to open the archived segment first; if that fails, try the
 *    segment in pg_wal.
 * 4. If neither opens, move to the previous timeline and try the file
 *    with the same segno there.
 * 5. After a successful open, expectedTLEs is set to the whole timeline
 *    list if it was not set before.
 */
-
foreach(cell, tles){
-
TimeLineID tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;
-
if (tli < curFileTLI)
-
break; /* don't bother looking at too-old TLIs */
-
if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE){
-
fd = XLogFileRead(segno, emode, tli,XLOG_FROM_ARCHIVE, true);
-
if (fd != -1){
-
elog(DEBUG1, "got WAL segment from archive");
-
if (!expectedTLEs)
-
expectedTLEs = tles;
-
return fd;
-
}
-
}
-
if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
-
{
-
fd = XLogFileRead(segno, emode, tli,XLOG_FROM_PG_WAL, true);
-
if (fd != -1){
-
if (!expectedTLEs)
-
expectedTLEs = tles;
-
return fd;
-
}
-
}
-
}
-
return -1; /* no timeline yielded an openable segment */
-
}
-
static int
-
XLogFileRead(
-
XLogSegNo segno, /*IN:wal文件号*/
-
int emode, /*IN:log日志级别*/
-
TimeLineID tli, /*IN:时间线*/
-
int source, /*IN:XLOG_FROM_ARCHIVE or XLOG_FROM_PG_WAL or XLOG_FROM_STREAM*/
-
bool notfoundOk /*IN:XLOG_FROM_ARCHIVE or XLOG_FROM_PG_WAL时为TRUE,XLOG_FROM_STREAM:false*/
-
)
-
{
-
//通过tli、segno拼成日志文件名
-
XLogFileName(xlogfname, tli, segno);
-
switch (source){
-
case XLOG_FROM_ARCHIVE:
-
//InRedo:开始apply redo时为TRUE,结束则false
-
restoredFromArchive = RestoreArchivedFile(path, xlogfname, "RECOVERYXLOG", XLogSegSize,InRedo);
-
if (!restoredFromArchive)
-
return -1;
-
break;
-
case XLOG_FROM_PG_WAL:
-
case XLOG_FROM_STREAM:
-
//路径+wal 文件
-
XLogFilePath(path, tli, segno);
-
restoredFromArchive = false;
-
break;
-
default:
-
elog(ERROR, "invalid XLogFileRead source %d", source);
-
}
-
/*
-
* If the segment was fetched from archival storage, replace the existing
-
* xlog segment (if any) with the archival version.
-
*/
-
if (source == XLOG_FROM_ARCHIVE)
-
{
-
KeepFileRestoredFromArchive(path, xlogfname);
-
/*
-
* Set path to point at the new file in pg_wal.
-
*/
-
snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
-
}
-
//open
-
fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
-
if (fd >= 0)
-
{
-
/* Success! */
-
curFileTLI = tli;
-
/* Track source of data in assorted state variables */
-
readSource = source;
-
XLogReceiptSource = source;
-
/* In FROM_STREAM case, caller tracks receipt time, not me */
-
if (source != XLOG_FROM_STREAM)
-
XLogReceiptTime = GetCurrentTimestamp();
-
return fd;
-
}
-
if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
-
ereport(PANIC,
-
(errcode_for_file_access(),
-
errmsg("could not open file \"%s\": %m", path)));
-
return -1;