PostgreSQL 在流复制模式下，WAL 发生以下错误时的处理方法

最新推荐文章于 2024-08-21 15:49:25 发布

pg_edb

最新推荐文章于 2024-08-21 15:49:25 发布

阅读量1.4k

点赞数

本文链接：https://blog.csdn.net/pg_edb/article/details/88949039

版权

开发十年，就只剩下这套架构体系了！ >>>

PostgreSQL 在流复制（streaming replication）模式下运行时，备库（slave）一侧的日志中出现以下错误：

record with zero length at XXX

FATAL:terminating walreceiver process due to administrator command

错误

xlog.c

4069

	else if (record->xl_len == 0)
	{
		ereport(emode_for_corrupt_record(emode, *RecPtr),
				(errmsg("record with zero length at %X/%X",
						RecPtr->xlogid, RecPtr->xrecoff)));
		goto next_record_is_invalid;
	}



next_record_is_invalid:
	failedSources |= readSource;

	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}

	/*
	* If archive recovery was requested, but we were still doing crash
	* recovery, switch to archive recovery and retry using the offline
	* archive. We have now replayed all the valid WAL in pg_xlog, so
	* we are presumably now consistent.
	*
	* We require that there's at least some valid WAL present in
	* pg_xlog, however (!fetch_ckpt). We could recover using the WAL
	* from the archive, even if pg_xlog is completely empty, but we'd
	* have no idea how far we'd have to replay to reach consistency.
	* So err on the safe side and give up.
	*/
	if (!InArchiveRecovery && ArchiveRecoveryRequested && !fetching_ckpt)
	{
		ereport(DEBUG1,
				(errmsg_internal("reached end of WAL in pg_xlog, entering archive recovery")));
		InArchiveRecovery = true;
		if (StandbyModeRequested)
			StandbyMode = true;

		/* initialize minRecoveryPoint to this record */
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
		ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
		if (XLByteLT(ControlFile->minRecoveryPoint, EndRecPtr))
			ControlFile->minRecoveryPoint = EndRecPtr;

		/* update local copy */
		minRecoveryPoint = ControlFile->minRecoveryPoint;

		UpdateControlFile();
		LWLockRelease(ControlFileLock);

		CheckRecoveryConsistency();

		goto retry;
	}


	retry:
	/* See if we need to retrieve more data */
	if (readFile < 0 ||
		(readSource == XLOG_FROM_STREAM && !XLByteLT(*RecPtr, receivedUpto)))
	{
		if (StandbyMode)
		{
			/*
			* In standby mode, wait for the requested record to become
			* available, either via restore_command succeeding to restore the
			* segment, or via walreceiver having streamed the record.
			*/
			for (;;)
			{
				if (WalRcvInProgress())
				{
					bool

					/*
					* If we find an invalid record in the WAL streamed from
					* master, something is seriously wrong. There's little
					* chance that the problem will just go away, but PANIC is
					* not good for availability either, especially in hot
					* standby mode. Disconnect, and retry from
					* archive/pg_xlog again. The WAL in the archive should be
					* identical to what was streamed, so it's unlikely that
					* it helps, but one can hope...
					*/
					if (failedSources & XLOG_FROM_STREAM)
					{
						ShutdownWalRcv();
						continue;
					}

					/*
					* Walreceiver is active, so see if new data has arrived.
					*
					* We only advance XLogReceiptTime when we obtain fresh
					* WAL from walreceiver and observe that we had already
					* processed everything before the most recent "chunk"
					* that it flushed to disk. In steady state where we are
					* keeping up with the incoming data, XLogReceiptTime will
					* be updated on each cycle. When we are behind,
					* XLogReceiptTime will not advance, so the grace time
					* allotted to conflicting queries will decrease.
					*/
					if (XLByteLT(*RecPtr, receivedUpto))
						havedata = true;
					else
					{
						XLogRecPtr

						receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart);
						if (XLByteLT(*RecPtr, receivedUpto))
						{






						}
						else

					}
					if (havedata)
					{
						/*
						* Great, streamed far enough. Open the file if it's
						* not open already. Use XLOG_FROM_STREAM so that
						* source info is set correctly and XLogReceiptTime
						* isn't changed.
						*/
						if (readFile < 0)
						{






						}
						else
						{



						}
						break;
					}

					/*
					* Data not here yet, so check for trigger then sleep for
					* five seconds like in the WAL file polling case below.
					*/
					if (CheckForStandbyTrigger())
						goto retry;

					/*
					* Wait for more WAL to arrive, or timeout to be reached
					*/
					WaitLatch(&XLogCtl->recoveryWakeupLatch,


					ResetLatch(&XLogCtl->recoveryWakeupLatch);
				}
				else
				{
					int
					pg_time_t	now;

					/*
					* Until walreceiver manages to reconnect, poll the
					* archive.
					*/
					if (readFile >= 0)
					{
						close(readFile);
						readFile = -1;
					}
					/* Reset curFileTLI if random fetch. */
					if (randAccess)
						curFileTLI = 0;

					/*
					* Try to restore the file from archive, or read an
					* existing file from pg_xlog.
					*/
					sources = XLOG_FROM_ARCHIVE | XLOG_FROM_PG_XLOG;
					if (!(sources & ~failedSources))
					{
						/*
						* We've exhausted all options for retrieving the
						* file. Retry.
						*/
						failedSources = 0;

						/*
						* Before we sleep, re-scan for possible new timelines
						* if we were requested to recover to the latest
						* timeline.
						*/
						if (recoveryTargetIsLatest)
						{


						}

						/*
						* If it hasn't been long since last attempt, sleep to
						* avoid busy-waiting.
						*/
						now = (pg_time_t) time(NULL);
						if ((now - last_fail_time) < 5)
						{


						}
						last_fail_time = now;

						/*
						* If primary_conninfo is set, launch walreceiver to
						* try to stream the missing WAL, before retrying to
						* restore from archive/pg_xlog.
						*
						* If fetching_ckpt is TRUE, RecPtr points to the
						* initial checkpoint location. In that case, we use
						* RedoStartLSN as the streaming start position
						* instead of RecPtr, so that when we later jump
						* backwards to start redo at RedoStartLSN, we will
						* have the logs streamed already.
						*/
						if (PrimaryConnInfo)
						{




						}
					}
					/* Don't try to read from a source that just failed */
					sources &= ~failedSources;
					readFile = XLogFileReadAnyTLI(readId, readSeg, DEBUG2,

					switched_segment = true;
					if (readFile >= 0)
						break;

					/*
					* Nope, not found in archive and/or pg_xlog.
					*/
					failedSources |= sources;

					/*
					* Check to see if the trigger file exists. Note that we
					* do this only after failure, so when you create the
					* trigger file, we still finish replaying as much as we
					* can from archive and pg_xlog before failover.
					*/
					if (CheckForStandbyTrigger())
						goto triggered;
				}

				/*
				* This possibly-long loop needs to handle interrupts of
				* startup process.
				*/
				HandleStartupProcInterrupts();
			}
		}
		else
		{
			/* In archive or crash recovery. */
			if (readFile < 0)
			{
				int

				/* Reset curFileTLI if random fetch. */
				if (randAccess)
					curFileTLI = 0;

				sources = XLOG_FROM_PG_XLOG;
				if (InArchiveRecovery)
					sources |= XLOG_FROM_ARCHIVE;

				readFile = XLogFileReadAnyTLI(readId, readSeg, emode,

				switched_segment = true;
				if (readFile < 0)
					return false;
			}
		}
	}

	/*
	* At this point, we have the right segment open and if we're streaming we
	* know the requested record is in it.
	*/
	Assert(readFile != -1);

	/*
	* If the current segment is being streamed from master, calculate how
	* much of the current page we have received already. We know the
	* requested record has been received, but this is for the benefit of
	* future calls, to allow quick exit at the top of this function.
	*/
	if (readSource == XLOG_FROM_STREAM)
	{
		if (RecPtr->xlogid != receivedUpto.xlogid ||
			(RecPtr->xrecoff / XLOG_BLCKSZ) != (receivedUpto.xrecoff / XLOG_BLCKSZ))
		{
			readLen = XLOG_BLCKSZ;
		}
		else
			readLen = receivedUpto.xrecoff % XLogSegSize - targetPageOff;
	}
	else
		readLen = XLOG_BLCKSZ;

	if (switched_segment && targetPageOff != 0)
	{
		/*
		* Whenever switching to a new WAL segment, we read the first page of
		* the file and validate its header, even if that's not where the
		* target record is. This is so that we can check the additional
		* identification info that is present in the first page's "long"
		* header.
		*/
		readOff = 0;
		if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
		{
			ereport(emode_for_corrupt_record(emode, *RecPtr),
					(errcode_for_file_access(),
					errmsg("could not read from log file %u, segment %u, offset %u: %m",

			goto next_record_is_invalid;
		}
		if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode, true))
			goto next_record_is_invalid;
	}

	/* Read the requested page */
	readOff = targetPageOff;
	if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
	{
		ereport(emode_for_corrupt_record(emode, *RecPtr),
				(errcode_for_file_access(),
		errmsg("could not seek in log file %u, segment %u to offset %u: %m",
				readId, readSeg, readOff)));
		goto next_record_is_invalid;
	}
	if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
	{
		ereport(emode_for_corrupt_record(emode, *RecPtr),
				(errcode_for_file_access(),
		errmsg("could not read from log file %u, segment %u, offset %u: %m",
				readId, readSeg, readOff)));
		goto next_record_is_invalid;
	}
	if (!ValidXLOGHeader((XLogPageHeader) readBuf, emode, false))
		goto next_record_is_invalid;

	Assert(targetId == readId);
	Assert(targetSeg == readSeg);
	Assert(targetPageOff == readOff);
	Assert(targetRecOff < readLen);

	return true;

next_record_is_invalid:
	failedSources |= readSource;

	if (readFile >= 0)
		close(readFile);
	readFile = -1;
	readLen = 0;
	readSource = 0;

	/* In standby-mode, keep trying */
	if (StandbyMode)
		goto retry;
	else
		return false;

triggered:
	if (readFile >= 0)
		close(readFile);
	readFile = -1;
	readLen = 0;
	readSource = 0;

	return false;
}