1 简介
FTS(Fault Tolerance Service)是greenplum提供的对子节点进行故障检测与恢复的服务。它是master上的一个子进程,通过定期轮询每个primary的状态来获取每个primary-mirror组的状态。该进程只在master上存在,进程名为ftsprobe process。本文将从源码角度讲解FTS辅助进程的工作原理。知识回顾:greenplum 源码解析 FTS辅助进程–ReadMe
2 故障恢复小总结
3 源码解析
3.1 关键数据结构
FtsMessageState 该枚举类型记录了FTS探活过程中涉及的消息状态信息,根据类型后续进行对应的消息处理
/*
 * Message states used by the FTS main loop while probing segments.
 *
 * The enumerators are kept in their original order, so their numeric values
 * are unchanged.
 */
typedef enum
{
	/* Messages to be sent. */
	FTS_PROBE_SEGMENT,			/* send probe message */
	FTS_SYNCREP_OFF_SEGMENT,	/* turn off syncrep due to mirror down */
	FTS_PROMOTE_SEGMENT,		/* promote a mirror due to primary down */

	/* Wait before making another retry attempt. */
	FTS_PROBE_RETRY_WAIT,
	FTS_SYNCREP_OFF_RETRY_WAIT,
	FTS_PROMOTE_RETRY_WAIT,

	/* Success outcomes. */
	FTS_PROBE_SUCCESS,			/* response to probe is ready for processing */
	FTS_SYNCREP_OFF_SUCCESS,	/* syncrep was turned off by the primary */
	FTS_PROMOTE_SUCCESS,		/* promotion was triggered on the mirror */

	/* Failure outcomes. */
	FTS_PROBE_FAILED,			/* the segment should be considered down */
	FTS_SYNCREP_OFF_FAILED,		/* let the next probe cycle find out what
								 * happened to the primary */
	FTS_PROMOTE_FAILED,			/* double fault */

	/* Final state, nothing more needs to be done in this probe cycle. */
	FTS_RESPONSE_PROCESSED
} FtsMessageState;
#宏定义
/*
 * Classification helpers for FtsMessageState values.
 *
 * IsFtsMessageStateSuccess(state) is true for the three *_SUCCESS states and
 * IsFtsMessageStateFailed(state) is true for the three *_FAILED states.
 *
 * Every use of the parameter is parenthesized so that an expression argument
 * (for example a conditional expression) is compared as a whole instead of
 * being re-associated with the surrounding == and || operators.
 *
 * The argument is still evaluated more than once, so it must not have side
 * effects.
 */
#define IsFtsMessageStateSuccess(state) ((state) == FTS_PROBE_SUCCESS || \
	(state) == FTS_SYNCREP_OFF_SUCCESS || (state) == FTS_PROMOTE_SUCCESS)
#define IsFtsMessageStateFailed(state) ((state) == FTS_PROBE_FAILED || \
	(state) == FTS_SYNCREP_OFF_FAILED || (state) == FTS_PROMOTE_FAILED)
/*
 * Probe bookkeeping for one entry of fts_context.perSegInfos: the primary
 * and mirror configuration references plus the state of the message
 * currently being exchanged.
 */
typedef struct
{
	/*
	 * The primary_cdbinfo and mirror_cdbinfo are references to primary and
	 * mirror configuration at the beginning of a probe cycle.  They are used
	 * to start libpq connection to send a FTS message.  Their state/role/mode
	 * is not used and does remain unchanged even when configuration is
	 * updated in the middle of a probe cycle (e.g. mirror marked down in
	 * configuration before sending SYNCREP_OFF message).
	 */
	CdbComponentDatabaseInfo *primary_cdbinfo;
	CdbComponentDatabaseInfo *mirror_cdbinfo;

	fts_result	result;
	FtsMessageState state;

	short		poll_events;	/* events to poll for */
	short		poll_revents;	/* events returned by poll */
	int16		fd_index;		/* index into PollFds array */

	pg_time_t	startTime;		/* probe start timestamp */
	pg_time_t	retryStartTime; /* time at which next retry attempt can
								 * start */

	int16		probe_errno;	/* saved errno from the latest system call */
	struct pg_conn *conn;		/* libpq connection object */

	int			retry_count;
	XLogRecPtr	xlogrecptr;
	bool		recovery_making_progress;
} fts_segment_info;
/*
 * State of one probe cycle: the number of primary-mirror pairs to probe and
 * the array holding one fts_segment_info entry per pair.
 */
typedef struct
{
	int			num_pairs;		/* number of primary-mirror pairs FTS wants
								 * to probe */
	fts_segment_info *perSegInfos;	/* array of per-pair probe info */
} fts_context;
3.2 FtsProbeMain
该函数为FTS进程工作的主函数,包含如下步骤:
1)注册信号处理函数
2) FtsLoop轮询检测,采集集群Segment节点的状态信息,若出现异常会执行相应的操作保证segment的高可用。
3.2.1 FtsLoop
/*
 * FtsLoop
 *
 * Main loop of the FTS probe process.  Each iteration:
 *   - re-reads the configuration file if got_SIGHUP is set;
 *   - increments ftsProbeInfo->start_count under the spinlock;
 *   - inside a transaction, reads the segment configuration and checks
 *     whether mirrors are configured;
 *   - unless the probe is skipped, runs FtsWalRepMessageSegments() in
 *     probeContext and, if it reports an update, rewrites the segment
 *     configuration files and bumps ftsProbeInfo->status_version;
 *   - copies start_count into done_count under the spinlock;
 *   - waits on the process latch for the remainder of
 *     gp_fts_probe_interval (or not at all if probe_requested is set).
 *
 * The only explicit exit in this function is proc_exit(1) when WaitLatch()
 * reports postmaster death.
 */
static
void FtsLoop()
{
bool updated_probe_state;
MemoryContext probeContext = NULL, oldContext = NULL;
time_t elapsed, probe_start_time, timeout;
CdbComponentDatabases *cdbs = NULL;
/* Create the memory context that is used while probing. */
probeContext = AllocSetContextCreate(TopMemoryContext,
"FtsProbeMemCtxt",
ALLOCSET_DEFAULT_INITSIZE, /* always have some memory */
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
while (true)
{
bool has_mirrors;
int rc;
if (got_SIGHUP) /* SIGHUP was flagged: re-read the configuration file */
{
got_SIGHUP = false;
ProcessConfigFile(PGC_SIGHUP);
}
CHECK_FOR_INTERRUPTS();
SIMPLE_FAULT_INJECTOR("ftsLoop_before_probe");
probe_start_time = time(NULL);
/* Count the start of this probe cycle, under the spinlock. */
SpinLockAcquire(&ftsProbeInfo->lock);
ftsProbeInfo->start_count++;
SpinLockRelease(&ftsProbeInfo->lock);
/* Need a transaction to access the catalogs */
StartTransactionCommand();
/* Read the segment configuration and refresh ftsProbeInfo->status. */
cdbs = readCdbComponentInfoAndUpdateStatus(probeContext);
/* Check here gp_segment_configuration if has mirror's */
has_mirrors = gp_segment_config_has_mirrors();
/* close the transaction we started above */
CommitTransactionCommand();
/* Reset this as we are performing the probe */
probe_requested = false;
skipFtsProbe = false;
if (SIMPLE_FAULT_INJECTOR("fts_probe") == FaultInjectorTypeSkip)
skipFtsProbe = true;
if (skipFtsProbe || !has_mirrors)
{
elogif(gp_log_fts >= GPVARS_VERBOSITY_VERBOSE, LOG,
"skipping FTS probes due to %s",
!has_mirrors ? "no mirrors" : "fts_probe fault");
}
else
{
/*
 * NOTE(review): probe_requested was reset to false a few statements
 * above, so unless it is set again asynchronously in between, the
 * "full " variant below is never selected -- confirm whether that is
 * intended.
 */
elogif(gp_log_fts == GPVARS_VERBOSITY_DEBUG, LOG,
"FTS: starting %s scan with %d segments and %d contents",
(probe_requested ? "full " : ""),
cdbs->total_segment_dbs,
cdbs->total_segments);
/*
* We probe in a special context, some of the heap access
* stuff palloc()s internally
*/
oldContext = MemoryContextSwitchTo(probeContext);
/* Run the probe; the result says whether the configuration was updated. */
updated_probe_state = FtsWalRepMessageSegments(cdbs);
MemoryContextSwitchTo(oldContext);
/* free any pallocs we made inside probeSegments() */
MemoryContextReset(probeContext);
/* Bump the version if configuration was updated. */
if (updated_probe_state)
{
/*
* File GPSEGCONFIGDUMPFILE under $PGDATA is used by other
* components to fetch latest gp_segment_configuration outside
* of a transaction. FTS update this file in the first probe
* and every probe which updated gp_segment_configuration.
*/
StartTransactionCommand();
writeGpSegConfigToFTSFiles();
CommitTransactionCommand();
ftsProbeInfo->status_version++;
}
}
/* free current components info and free ip addr caches */
cdbcomponent_destroyCdbComponents();
SIMPLE_FAULT_INJECTOR("ftsLoop_after_probe");
/* Notify any waiting backends about probe cycle completion. */
SpinLockAcquire(&ftsProbeInfo->lock);
ftsProbeInfo->done_count = ftsProbeInfo->start_count;
SpinLockRelease(&ftsProbeInfo->lock);
/* check if we need to sleep before starting next iteration */
elapsed = time(NULL) - probe_start_time;
timeout = elapsed >= gp_fts_probe_interval ? 0 :
gp_fts_probe_interval - elapsed;
/*
* In above code we might update gp_segment_configuration and then wal
* is generated. While synchronizing wal to standby, we need to wait on
* MyLatch also in SyncRepWaitForLSN(). The set latch introduced by
* outside fts probe trigger (e.g. gp_request_fts_probe_scan() or
* FtsNotifyProber()) might be consumed by it, so we must not WaitLatch()
* here with a long timeout, else we may block for that long timeout.
* Therefore we recheck probe_requested here before WaitLatch().
*/
if (probe_requested)
timeout = 0;
/* Wait for the next probe cycle: latch set, timeout, or postmaster death. */
rc = WaitLatch(&MyProc->procLatch,
WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
timeout * 1000L);
SIMPLE_FAULT_INJECTOR("ftsLoop_after_latch");
ResetLatch(&MyProc->procLatch);
/* emergency bailout if postmaster has died */
if (rc & WL_POSTMASTER_DEATH)
proc_exit(1);
} /* end server loop */
return;
}
3.3 readCdbComponentInfoAndUpdateStatus
该函数负责从系统表中读取segment节点信息,并更新状态信息【内存形式】
/*
 * readCdbComponentInfoAndUpdateStatus
 *
 * Fetch the component database list via cdbcomponent_getCdbComponents() and
 * record, for every segment database, an up/down status byte in
 * ftsProbeInfo->status[] indexed by the segment's dbid.
 *
 * On the first call after FTS startup (status_version still 0), the version
 * is bumped and the segment configuration is written out through
 * writeGpSegConfigToFTSFiles().
 *
 * Returns the component database list obtained above.
 *
 * NOTE(review): the original header says probeContext is used instead of
 * the current memory context because the latter is destroyed by
 * CommitTransactionCommand(); the parameter is not referenced in the visible
 * body, so confirm where the allocation context is actually chosen.
 */
static CdbComponentDatabases *
readCdbComponentInfoAndUpdateStatus(MemoryContext probeContext)
{
	CdbComponentDatabases *components = cdbcomponent_getCdbComponents();
	int			seg_idx = 0;

	/* Walk every segment database and publish its liveness. */
	while (seg_idx < components->total_segment_dbs)
	{
		CdbComponentDatabaseInfo *seg = &components->segment_db_info[seg_idx];
		uint8		status_bits = 0;

		/* Flag the entry as down when the segment is not alive. */
		if (!SEGMENT_IS_ALIVE(seg))
			FTS_STATUS_SET_DOWN(status_bits);

		ftsProbeInfo->status[seg->config->dbid] = status_bits;
		seg_idx++;
	}

	/* Nothing more to do once the status version has been initialized. */
	if (ftsProbeInfo->status_version != 0)
		return components;

	/*
	 * Initialize the status version after populating the config details in
	 * shared memory for the first time after FTS startup.
	 */
	ftsProbeInfo->status_version++;
	writeGpSegConfigToFTSFiles();

	return components;
}
3.4 FtsWalRepMessageSegments
该函数为FTS进程的探活工作,流程为:建立连接 → 事件触发 → 发送消息 → 接收结果 → 结果处理【重试】 → 回复响应
/*
 * FtsWalRepMessageSegments
 *
 * Run one probe cycle over the primary-mirror pairs described by cdbs.
 *
 * After the probe context and the poll descriptor array are initialized,
 * the function repeats connect, poll, send, receive, retry handling and
 * response processing (in that order) until allDone() reports completion or
 * FtsIsActive() turns false.  If FTS is no longer active afterwards, any
 * libpq connection still held is closed.
 *
 * Returns true if any processResponse() call returned true.
 */
bool
FtsWalRepMessageSegments(CdbComponentDatabases *cdbs)
{
	fts_context context;
	bool		config_changed = false;
	int			pair;

	FtsWalRepInitProbeContext(cdbs, &context);
	InitPollFds(cdbs->total_segments);

	for (;;)
	{
		/* Stop when every pair is done or FTS has been deactivated. */
		if (allDone(&context) || !FtsIsActive())
			break;

		ftsConnect(&context);
		ftsPoll(&context);
		ftsSend(&context);
		ftsReceive(&context);
		processRetry(&context);
		config_changed |= processResponse(&context);
	}

	/* FTS was deactivated: close whatever connections remain. */
	if (!FtsIsActive())
	{
		for (pair = 0; pair < context.num_pairs; pair++)
		{
			fts_segment_info *seg = &context.perSegInfos[pair];

			if (seg->conn == NULL)
				continue;

			PQfinish(seg->conn);
			seg->conn = NULL;
		}
	}

#ifdef USE_ASSERT_CHECKING
	/*
	 * At the end of probe cycle, there shouldn't be any active libpq
	 * connections.
	 */
	for (pair = 0; pair < context.num_pairs; pair++)
	{
		fts_segment_info *seg = &context.perSegInfos[pair];

		if (seg->conn != NULL)
			elog(ERROR,
				 "FTS libpq connection left open (content=%d, dbid=%d)"
				 " state=%d, retry_count=%d, conn->status=%d",
				 seg->primary_cdbinfo->config->segindex,
				 seg->primary_cdbinfo->config->dbid,
				 seg->state,
				 seg->retry_count,
				 seg->conn->status);
	}
#endif

	/* Release the per-pair array and the poll descriptor array. */
	pfree(context.perSegInfos);
	pfree(PollFds);

	return config_changed;
}
/* EOF */