greenplum 源码解析 FTS辅助进程工作主流程

1 简介

  FTS(Fault Tolerance Service)是Greenplum提供的针对segment节点的故障检测与恢复服务。它是master上的一个辅助子进程,通过定期轮询每个primary的状态来获取每个primary-mirror组的状态。该进程只在master上存在,进程名为ftsprobe process。本文将从源码角度讲解FTS辅助进程的工作原理。知识回顾:greenplum 源码解析 FTS辅助进程–ReadMe

2 故障恢复小总结

在这里插入图片描述

3 源码解析

3.1 关键数据结构
FtsMessageState:该枚举类型记录了FTS探活过程中每个segment当前所处的消息状态,FTS主循环根据该状态决定后续的处理动作(发送消息、等待重试、处理响应等)。

/*
 * States used by the FTS main loop for probing segments.
 *
 * The enumerators come in groups of three, one per kind of FTS message
 * (probe, syncrep-off, promote): "message to send", "waiting before a
 * retry", "succeeded" and "failed", followed by one final state.  The
 * numeric values are the implicit ones (starting at 0); the order of the
 * enumerators must therefore not be changed.
 */
typedef enum
{
	/* message that is to be sent to the segment */
	FTS_PROBE_SEGMENT,         /* send probe message */
	FTS_SYNCREP_OFF_SEGMENT,   /* turn off syncrep due to mirror down */
	FTS_PROMOTE_SEGMENT,       /* promote a mirror due to primary down */

	/* wait before making another retry attempt */
	FTS_PROBE_RETRY_WAIT,
	FTS_SYNCREP_OFF_RETRY_WAIT,
	FTS_PROMOTE_RETRY_WAIT,

	/* the message was answered successfully */
	FTS_PROBE_SUCCESS,         /* response to probe is ready for processing */
	FTS_SYNCREP_OFF_SUCCESS,   /* syncrep was turned off by the primary */
	FTS_PROMOTE_SUCCESS,       /* promotion was triggered on the mirror */

	/* the message failed */
	FTS_PROBE_FAILED,          /* the segment should be considered down */

	FTS_SYNCREP_OFF_FAILED,    /*
								* let the next probe cycle find out what
								* happened to the primary
								*/
	FTS_PROMOTE_FAILED,        /* double fault */

	FTS_RESPONSE_PROCESSED     /*
								* final state, nothing more needs to be done in
								* this probe cycle
								*/
} FtsMessageState;

/*
 * Helper macros that classify an FtsMessageState value.
 *
 * IsFtsMessageStateSuccess(state) is true for any of the *_SUCCESS states,
 * IsFtsMessageStateFailed(state) is true for any of the *_FAILED states.
 *
 * The argument is parenthesized in the expansion so that an expression
 * argument (e.g. "cond ? a : b") is compared as a whole.  The argument is
 * still evaluated up to three times, so do not pass an expression with
 * side effects.
 */
#define IsFtsMessageStateSuccess(state) ((state) == FTS_PROBE_SUCCESS || \
		(state) == FTS_SYNCREP_OFF_SUCCESS || (state) == FTS_PROMOTE_SUCCESS)
#define IsFtsMessageStateFailed(state) ((state) == FTS_PROBE_FAILED || \
		(state) == FTS_SYNCREP_OFF_FAILED || (state) == FTS_PROMOTE_FAILED)
	
/*
 * Per-segment probe state kept by FTS.  Each entry carries references to
 * both the primary and the mirror configuration, the current message state
 * and the libpq/poll bookkeeping needed to talk to the segment.
 */
typedef struct
{
	/*
	 * The primary_cdbinfo and mirror_cdbinfo are references to primary and
	 * mirror configuration at the beginning of a probe cycle.  They are used
	 * to start libpq connection to send a FTS message.  Their state/role/mode
	 * is not used and does remain unchanged even when configuration is updated
	 * in the middle of a probe cycle (e.g. mirror marked down in configuration
	 * before sending SYNCREP_OFF message).
	 */
	CdbComponentDatabaseInfo *primary_cdbinfo;
	CdbComponentDatabaseInfo *mirror_cdbinfo;
	/* result of the FTS message (fts_result is declared elsewhere) */
	fts_result result;
	/* current message state of this segment, see FtsMessageState */
	FtsMessageState state;
	short poll_events;				// events to poll for
	short poll_revents;				// events returned by poll
	int16 fd_index;               /* index into PollFds array */
	pg_time_t startTime;          /* probe start timestamp */
	pg_time_t retryStartTime;     /* time at which next retry attempt can start */
	int16 probe_errno;            /* saved errno from the latest system call */
	struct pg_conn *conn;         /* libpq connection object */
	/* NOTE(review): presumably the number of retries made so far in this probe cycle -- confirm */
	int retry_count;
	/*
	 * NOTE(review): the two fields below are not used in the code visible
	 * here; presumably they track the mirror's recovery progress between
	 * retries -- confirm against the code that sets them.
	 */
	XLogRecPtr xlogrecptr;
	bool recovery_making_progress;
} fts_segment_info;   // probe information for one segment (primary + mirror)

/*
 * State of one FTS probe cycle: the array of per-segment probe entries
 * together with its length.
 */
typedef struct
{
	int num_pairs; /* number of primary-mirror pairs FTS wants to probe */
	fts_segment_info *perSegInfos;            // array of per-segment info; iterated with indexes 0 .. num_pairs - 1
} fts_context;

3.2 FtsProbeMain
该函数为FTS进程工作的主函数,包含如下步骤:
1)注册信号处理函数;
2)调用FtsLoop进行轮询检测:采集集群中各segment节点的状态信息,若发现异常则执行相应的操作,以保证segment的高可用。

3.2.1 FtsLoop

/*
 * Main loop of the FTS probe process.  Never returns normally; the loop is
 * left only through proc_exit(1) when the postmaster has died.
 *
 * Each iteration:
 *   1. re-reads the configuration file if SIGHUP was received;
 *   2. increments ftsProbeInfo->start_count under the spinlock;
 *   3. inside a transaction, reads the segment configuration and checks
 *      whether any mirrors are configured;
 *   4. unless the probe is skipped (fault injector) or there are no mirrors,
 *      probes the segments via FtsWalRepMessageSegments(); if that reports
 *      an update, rewrites the segment configuration dump file and
 *      increments ftsProbeInfo->status_version;
 *   5. destroys the component info, publishes done_count = start_count;
 *   6. waits on the process latch for the remainder of
 *      gp_fts_probe_interval (or not at all if a probe was requested).
 */
static
void FtsLoop()
{
	bool	updated_probe_state;
	MemoryContext probeContext = NULL, oldContext = NULL;
	time_t elapsed,	probe_start_time, timeout;
	CdbComponentDatabases *cdbs = NULL;

	/* Create the memory context used while probing (child of TopMemoryContext). */
	probeContext = AllocSetContextCreate(TopMemoryContext,
										 "FtsProbeMemCtxt",
										 ALLOCSET_DEFAULT_INITSIZE,	/* always have some memory */
										 ALLOCSET_DEFAULT_INITSIZE,
										 ALLOCSET_DEFAULT_MAXSIZE);

	while (true)
	{
		bool		has_mirrors;
		int			rc;

		if (got_SIGHUP)         /* SIGHUP received: re-read the configuration file */
		{
			got_SIGHUP = false;
			ProcessConfigFile(PGC_SIGHUP); 
		}

		CHECK_FOR_INTERRUPTS();

		SIMPLE_FAULT_INJECTOR("ftsLoop_before_probe");

		/* remember when this cycle started; used below to compute the sleep time */
		probe_start_time = time(NULL);

		/* Increment the count of started probe cycles, under the spinlock. */
		SpinLockAcquire(&ftsProbeInfo->lock);
		ftsProbeInfo->start_count++;
		SpinLockRelease(&ftsProbeInfo->lock);

		/* Need a transaction to access the catalogs */
		StartTransactionCommand();

		/* Read the segment information from the catalog and update the status array. */
		cdbs = readCdbComponentInfoAndUpdateStatus(probeContext);

		/* Check here gp_segment_configuration if has mirror's */
		has_mirrors = gp_segment_config_has_mirrors();

		/* close the transaction we started above */
		CommitTransactionCommand();

		/* Reset this as we are performing the probe */
		probe_requested = false;
		skipFtsProbe = false;

		/* The "fts_probe" fault injector can ask us to skip this probe. */
		if (SIMPLE_FAULT_INJECTOR("fts_probe") == FaultInjectorTypeSkip)
			skipFtsProbe = true;

		if (skipFtsProbe || !has_mirrors)
		{
			elogif(gp_log_fts >= GPVARS_VERBOSITY_VERBOSE, LOG,
				   "skipping FTS probes due to %s",
				   !has_mirrors ? "no mirrors" : "fts_probe fault");

		}
		else
		{
			/*
			 * NOTE(review): probe_requested was reset to false just above,
			 * so the "full " prefix is never printed here.
			 */
			elogif(gp_log_fts == GPVARS_VERBOSITY_DEBUG, LOG,
				   "FTS: starting %s scan with %d segments and %d contents",
				   (probe_requested ? "full " : ""),
				   cdbs->total_segment_dbs,
				   cdbs->total_segments);
			/*
			 * We probe in a special context, some of the heap access
			 * stuff palloc()s internally
			 */
			oldContext = MemoryContextSwitchTo(probeContext);

			/* Probe the segments; the return value tells whether anything was updated. */
			updated_probe_state = FtsWalRepMessageSegments(cdbs);

			MemoryContextSwitchTo(oldContext);

			/* free any pallocs we made inside FtsWalRepMessageSegments() */
			MemoryContextReset(probeContext);

			/*
			 * Bump the version if configuration was updated.
			 *
			 * NOTE(review): the article's annotation states that the dump
			 * file is written to a temporary file and then renamed, so no
			 * lock is needed -- confirm in writeGpSegConfigToFTSFiles().
			 */
			if (updated_probe_state)
			{
				/*
				 * File GPSEGCONFIGDUMPFILE under $PGDATA is used by other
				 * components to fetch latest gp_segment_configuration outside
				 * of a transaction. FTS update this file in the first probe
				 * and every probe which updated gp_segment_configuration.
				 */
				StartTransactionCommand();
				writeGpSegConfigToFTSFiles();
				CommitTransactionCommand();

				ftsProbeInfo->status_version++;
			}
		}

		/* free current components info and free ip addr caches */	
		cdbcomponent_destroyCdbComponents();

		SIMPLE_FAULT_INJECTOR("ftsLoop_after_probe");

		/* Notify any waiting backends about probe cycle completion. */
		SpinLockAcquire(&ftsProbeInfo->lock);
		ftsProbeInfo->done_count = ftsProbeInfo->start_count;
		SpinLockRelease(&ftsProbeInfo->lock);


		/*
		 * check if we need to sleep before starting next iteration
		 *
		 * elapsed and timeout are in seconds (derived from time()); no sleep
		 * if the cycle already took gp_fts_probe_interval or longer.
		 */
		elapsed = time(NULL) - probe_start_time;
		timeout = elapsed >= gp_fts_probe_interval ? 0 : 
							gp_fts_probe_interval - elapsed;

		/*
		 * In above code we might update gp_segment_configuration and then wal
		 * is generated. While synchronizing wal to standby, we need to wait on
		 * MyLatch also in SyncRepWaitForLSN(). The set latch introduced by
		 * outside fts probe trigger (e.g. gp_request_fts_probe_scan() or
		 * FtsNotifyProber()) might be consumed by it so we do not WaitLatch()
		 * here with a long timeout here else we may block for that long
		 * timeout, so we recheck probe_requested here before waitLatch().
		 */
		if (probe_requested)
			timeout = 0;

		/* Wait for the next probe cycle; the timeout is scaled by 1000 for WaitLatch. */
		rc = WaitLatch(&MyProc->procLatch,
					   WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
					   timeout * 1000L);

		SIMPLE_FAULT_INJECTOR("ftsLoop_after_latch");

		ResetLatch(&MyProc->procLatch);

		/* emergency bailout if postmaster has died */
		if (rc & WL_POSTMASTER_DEATH)
			proc_exit(1);
	} /* end server loop */

	return;
}

3.3 readCdbComponentInfoAndUpdateStatus
该函数负责从系统表中读取segment节点信息,并据此更新共享内存(ftsProbeInfo->status)中各节点的状态信息。

/*
 * Fetch the segment configuration through cdbcomponent_getCdbComponents()
 * and record, for every segment database in it, an up/down status byte in
 * ftsProbeInfo->status[] (indexed by dbid).
 *
 * The first time this runs after FTS startup (status_version still 0),
 * status_version is incremented and the segment configuration is written
 * out with writeGpSegConfigToFTSFiles().
 *
 * Returns the component database set obtained from the catalog.
 *
 * NOTE(review): probeContext is not referenced in this body.  The original
 * header comment said it should be used instead of the current memory
 * context, because the current context is destroyed by
 * CommitTransactionCommand() -- confirm where the result is allocated.
 */
static
CdbComponentDatabases *readCdbComponentInfoAndUpdateStatus(MemoryContext probeContext)
{
	/*
	 * The set covers primaries as well as mirrors.  Per the original
	 * annotation, dbid identifies one entry uniquely while the entries of
	 * one primary/mirror pair share a content id.
	 */
	CdbComponentDatabases *cdbs = cdbcomponent_getCdbComponents();
	int			seg_no = 0;

	/* Walk over every segment database and record whether it is alive. */
	while (seg_no < cdbs->total_segment_dbs)
	{
		CdbComponentDatabaseInfo *segInfo = &cdbs->segment_db_info[seg_no];
		uint8		status_bits = 0;

		/* flag the entry as down when it is not alive */
		if (!SEGMENT_IS_ALIVE(segInfo))
		{
			FTS_STATUS_SET_DOWN(status_bits);
		}

		ftsProbeInfo->status[segInfo->config->dbid] = status_bits;
		seg_no++;
	}

	/* Nothing more to do once the version has been initialized. */
	if (ftsProbeInfo->status_version != 0)
		return cdbs;

	/*
	 * First pass after FTS startup: initialize the status version now that
	 * the config details have been populated, and write out the segment
	 * configuration file.
	 */
	ftsProbeInfo->status_version++;
	writeGpSegConfigToFTSFiles();

	return cdbs;
}

3.4 FtsWalRepMessageSegments
该函数为FTS进程的探活主流程,循环执行:建立连接 → poll事件检测 → 发送消息 → 接收结果 → 重试处理 → 响应处理,直到所有segment处理完毕或FTS不再处于活动状态。

/*
 * Run one FTS probe cycle over the segments described by cdbs.
 *
 * After initializing the probe context and the poll descriptors, the
 * connect / poll / send / receive / retry / response steps are repeated
 * until allDone() reports completion or FtsIsActive() turns false.
 *
 * If FTS is no longer active afterwards, any libpq connection still held
 * by a segment entry is closed.  In assert-enabled builds, a connection
 * that is still open at this point is reported with elog(ERROR).
 *
 * Returns true if any processResponse() call reported an update.
 */
bool
FtsWalRepMessageSegments(CdbComponentDatabases *cdbs)
{
	fts_context context;
	bool		is_updated = false;
	int			seg;

	FtsWalRepInitProbeContext(cdbs, &context);
	InitPollFds(cdbs->total_segments);

	for (;;)
	{
		/* stop once every segment is done or FTS has been deactivated */
		if (allDone(&context) || !FtsIsActive())
			break;

		ftsConnect(&context);
		ftsPoll(&context);
		ftsSend(&context);
		ftsReceive(&context);
		processRetry(&context);
		is_updated |= processResponse(&context);
	}

	/* FTS was deactivated: drop whatever connections are still around. */
	if (!FtsIsActive())
	{
		for (seg = 0; seg < context.num_pairs; seg++)
		{
			if (context.perSegInfos[seg].conn == NULL)
				continue;

			PQfinish(context.perSegInfos[seg].conn);
			context.perSegInfos[seg].conn = NULL;
		}
	}

#ifdef USE_ASSERT_CHECKING
	/*
	 * No libpq connection may remain active once the probe cycle has
	 * finished.
	 */
	for (seg = 0; seg < context.num_pairs; seg++)
	{
		if (context.perSegInfos[seg].conn == NULL)
			continue;

		elog(ERROR,
			 "FTS libpq connection left open (content=%d, dbid=%d)"
			 " state=%d, retry_count=%d, conn->status=%d",
			 context.perSegInfos[seg].primary_cdbinfo->config->segindex,
			 context.perSegInfos[seg].primary_cdbinfo->config->dbid,
			 context.perSegInfos[seg].state,
			 context.perSegInfos[seg].retry_count,
			 context.perSegInfos[seg].conn->status);
	}
#endif

	/* release the per-cycle arrays */
	pfree(context.perSegInfos);
	pfree(PollFds);

	return is_updated;
}

/* EOF */

参考:分布式数据库的高可用实现,Greenplum是如何做到的

  • 1
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值