AntDB Cluster Reduce死锁与驱动

AntDB Reduce死锁与驱动

AntDB的集群计划(Cluster Plan)类似并行计划(Parallel Plan),通过序列化(Serialize)和反序列化(Restore)执行计划(Plan Statement),并发送到各个相关节点(Node),以保证各个节点的执行计划一致(基本一致,Restore时可能略微改动)。

AntDB引入Reduce Plan用于动态重分布数据,Reduce Plan的执行完成包括两部分:本地数据(通过eof_underlying标记本地数据是否扫描完毕)+网络数据(通过eof_network标记网络数据是否扫描完毕)。

实际上,尽管各个节点的执行计划一致,但是由于各个节点自身的情况(包括机器性能、数据大小、网络等因素),各个节点在执行过程中必然有先后之分。进而我们发现,由于Reduce Plan需要各个节点发送EOF消息来标记网络传输的完成,在一定情况下就会出现Reduce Plan在各个节点之间死锁的问题。因此,合理地驱动Plan Tree中的Reduce Plan发送EOF成为必要的措施。

以下列举目前已知的死锁用例。

常规死锁

问题:常规死锁发生在执行计划的执行过程中:一定条件下,某个节点的Plan Tree提前退出,使得该Plan Tree的一个或多个Reduce Node没有发送EOF消息通知其余节点,从而导致死锁。

方法:当ExecProcNode(PlanState *node)返回的Slot满足TupIsNull(slot)时,驱动以该node为顶点的Plan Tree,确保该Plan Tree下的Reduce Plan完成EOF的发送行为。

Agg死锁

问题:Agg Plan在执行过程中,当左树返回NULL时,Agg Plan返回的结果可能不满足TupIsNull(slot),而导致没有驱动。

方法:驱动条件新增一项:当 ((AggState *) node)->agg_done 为真时也进行驱动。

CteScan死锁

问题:CteScan是比较特殊的一种Plan,其真正执行的plan是其上层的某个Plan的initPlan,由于驱动ClusterReduce目的在于转发不属于本节点的实时数据,丢弃其余数据,那么在CteScan的执行上就不合理了,因为其他Plan可能还会用到该CteScan。例如:

-- The CTE t_onek is referenced by both branches of the UNION ALL below.
WITH t_onek AS (
    SELECT unique1, two, ten, hundred, twothousand,
           tenthous, even, stringu2
    FROM onek
    WHERE odd < 100
)
SELECT *
FROM t_onek
WHERE even = 1000
UNION ALL
SELECT *
FROM t_onek
WHERE even < 100;

方法:CteScan的驱动程序

/*
 * DriveCteScanState
 *
 * Drive a CteScan node on behalf of the ClusterReduce driver.
 *
 * Returns false immediately when IsThereClusterReduce() is false for this
 * node.  Otherwise the scan is run through ExecCteScan() until it yields an
 * empty slot, after which the plan states of the node's subPlan-s and then
 * its initPlan-s are handed to DriveClusterReduceWalker().  The first walker
 * call that returns true makes this function return true; if none does, the
 * result is false.
 */
static bool
DriveCteScanState(CteScanState *node)
{
	TupleTableSlot *tuple;
	ListCell	   *cell;
	SubPlanState   *subplan;

	Assert(node && IsA(node, CteScanState));

	if (!IsThereClusterReduce((PlanState *) node))
		return false;

	/*
	 * Execute the CteScan itself until it is exhausted, instead of only
	 * driving the ClusterReduce: other plan nodes may still need the
	 * results of this CteScan.
	 *
	 * The slot is kept in a local so that ExecCteScan() runs exactly once
	 * per iteration even if TupIsNull() evaluates its argument repeatedly.
	 */
	do
	{
		tuple = ExecCteScan(node);
	} while (!TupIsNull(tuple));

	/* subPlan-s attached to this node are driven first ... */
	foreach (cell, node->ss.ps.subPlan)
	{
		subplan = (SubPlanState *) lfirst(cell);
		Assert(IsA(subplan, SubPlanState));

		if (DriveClusterReduceWalker(subplan->planstate))
			return true;
	}

	/* ... followed by its initPlan-s. */
	foreach (cell, node->ss.ps.initPlan)
	{
		subplan = (SubPlanState *) lfirst(cell);
		Assert(IsA(subplan, SubPlanState));

		if (DriveClusterReduceWalker(subplan->planstate))
			return true;
	}

	return false;
}
Plan的执行与驱动之间互锁

问题:PlanState的驱动顺序是按照planstate_tree_walker的顺序驱动的,但这个顺序与实际上PlanState的执行顺序是不匹配的,故而,一定情况下会出现执行与驱动的互锁情况。例如:HashJoin死锁,HashJoin的左树(Left Tree)和右树(Right Tree)在执行时,可能先做左树,也可能先做右树,故导致执行与驱动死锁。planstate_tree_walker的顺序按照:

  1. initPlan-s
  2. left tree
  3. right tree
  4. special child plans
  5. subPlan-s

执行。

/*
 * planstate_tree_walker --- walk plan state trees
 *
 * The caller's walker has already visited the current node, so only the
 * node's direct children are handled here, either by calling
 * walker(child, context) or by delegating a whole list to
 * planstate_walk_subplans() / planstate_walk_members().
 *
 * Children are processed in this fixed order:
 *	 1. initPlan-s
 *	 2. lefttree
 *	 3. righttree
 *	 4. node-type-specific children (ModifyTable, Append, MergeAppend,
 *		BitmapAnd, BitmapOr, SubqueryScan, CustomScan)
 *	 5. subPlan-s
 *
 * Processing stops at the first step that reports true, and true is
 * returned; if no step does, the result is false.
 */
bool
planstate_tree_walker(PlanState *planstate,
					  bool (*walker) (),
					  void *context)
{
	Plan	   *plan = planstate->plan;
	ListCell   *lc;
	bool		stop = false;

	/* 1. initPlan-s */
	if (planstate_walk_subplans(planstate->initPlan, walker, context))
		return true;

	/* 2. lefttree, when present */
	if (outerPlanState(planstate) &&
		walker(outerPlanState(planstate), context))
		return true;

	/* 3. righttree, when present */
	if (innerPlanState(planstate) &&
		walker(innerPlanState(planstate), context))
		return true;

	/* 4. children that only certain node types have */
	switch (nodeTag(plan))
	{
		case T_ModifyTable:
			stop = planstate_walk_members(((ModifyTable *) plan)->plans,
										  ((ModifyTableState *) planstate)->mt_plans,
										  walker, context);
			break;
		case T_Append:
			stop = planstate_walk_members(((Append *) plan)->appendplans,
										  ((AppendState *) planstate)->appendplans,
										  walker, context);
			break;
		case T_MergeAppend:
			stop = planstate_walk_members(((MergeAppend *) plan)->mergeplans,
										  ((MergeAppendState *) planstate)->mergeplans,
										  walker, context);
			break;
		case T_BitmapAnd:
			stop = planstate_walk_members(((BitmapAnd *) plan)->bitmapplans,
										  ((BitmapAndState *) planstate)->bitmapplans,
										  walker, context);
			break;
		case T_BitmapOr:
			stop = planstate_walk_members(((BitmapOr *) plan)->bitmapplans,
										  ((BitmapOrState *) planstate)->bitmapplans,
										  walker, context);
			break;
		case T_SubqueryScan:
			stop = walker(((SubqueryScanState *) planstate)->subplan, context);
			break;
		case T_CustomScan:
			foreach(lc, ((CustomScanState *) planstate)->custom_ps)
			{
				if (walker((PlanState *) lfirst(lc), context))
				{
					stop = true;
					break;		/* leaves the foreach loop only */
				}
			}
			break;
		default:
			break;
	}
	if (stop)
		return true;

	/* 5. subPlan-s */
	if (planstate_walk_subplans(planstate->subPlan, walker, context))
		return true;

	return false;
}

方法:为了使得驱动顺序能与执行顺序保持一致,新增planstate_tree_exec_walker函数,walk顺序为:

  1. left tree/right tree/special child plans
  2. subPlan-s
  3. initPlan-s


/*
 * planstate_tree_exec_walker --- walk plan state trees in driving order
 *
 * Like planstate_tree_walker(), only the direct children of planstate are
 * handled here, but in a different order:
 *	 1. the node's child plans:
 *		- HashJoin is delegated to planstate_exec_walk_hashjoin();
 *		- ModifyTable, Append, MergeAppend, BitmapAnd and BitmapOr hand
 *		  their member lists to planstate_walk_members();
 *		- SubqueryScan, CustomScan and CteScan pass their own child plan
 *		  state(s) to the walker;
 *		- every other node type gets lefttree, then righttree.
 *		For the explicitly listed node types lefttree/righttree are not
 *		visited by this function.
 *	 2. subPlan-s
 *	 3. initPlan-s
 *
 * Processing stops at the first step that reports true, and true is
 * returned; if no step does, the result is false.
 */
bool
planstate_tree_exec_walker(PlanState *planstate,
						   bool (*walker) (),
						   void *context)
{
	Plan	   *plan = planstate->plan;
	ListCell   *lc;
	bool		stop = false;

	/* 1. child plans, dispatched on the node type */
	switch (nodeTag(plan))
	{
		case T_HashJoin:
			stop = planstate_exec_walk_hashjoin((HashJoinState *) planstate,
												walker,
												context);
			break;
		case T_ModifyTable:
			stop = planstate_walk_members(((ModifyTable *) plan)->plans,
										  ((ModifyTableState *) planstate)->mt_plans,
										  walker, context);
			break;
		case T_Append:
			stop = planstate_walk_members(((Append *) plan)->appendplans,
										  ((AppendState *) planstate)->appendplans,
										  walker, context);
			break;
		case T_MergeAppend:
			stop = planstate_walk_members(((MergeAppend *) plan)->mergeplans,
										  ((MergeAppendState *) planstate)->mergeplans,
										  walker, context);
			break;
		case T_BitmapAnd:
			stop = planstate_walk_members(((BitmapAnd *) plan)->bitmapplans,
										  ((BitmapAndState *) planstate)->bitmapplans,
										  walker, context);
			break;
		case T_BitmapOr:
			stop = planstate_walk_members(((BitmapOr *) plan)->bitmapplans,
										  ((BitmapOrState *) planstate)->bitmapplans,
										  walker, context);
			break;
		case T_SubqueryScan:
			stop = walker(((SubqueryScanState *) planstate)->subplan, context);
			break;
		case T_CustomScan:
			foreach(lc, ((CustomScanState *) planstate)->custom_ps)
			{
				if (walker((PlanState *) lfirst(lc), context))
				{
					stop = true;
					break;		/* leaves the foreach loop only */
				}
			}
			break;
		case T_CteScan:
			stop = walker(((CteScanState *) planstate)->cteplanstate, context);
			break;
		default:
			/* lefttree first; righttree only if lefttree did not stop us */
			stop = (outerPlanState(planstate) &&
					walker(outerPlanState(planstate), context)) ||
				(innerPlanState(planstate) &&
				 walker(innerPlanState(planstate), context));
			break;
	}
	if (stop)
		return true;

	/* 2. subPlan-s */
	if (planstate_walk_subplans(planstate->subPlan, walker, context))
		return true;

	/* 3. initPlan-s */
	if (planstate_walk_subplans(planstate->initPlan, walker, context))
		return true;

	return false;
}
AntDB Cluster Reduce死锁驱动
/*
 * DriveClusterReduceWalker
 *
 * Walker that drives the plan state tree rooted at "node".
 *
 * A NULL node, or a node whose plan id is already a member of
 * estate->es_reduce_drived_set, is skipped with a false result.  Otherwise:
 *	 - a ClusterReduceState is handed to DriveClusterReduceState();
 *	 - a CteScanState is handed to DriveCteScanState();
 *	 - any other node recurses into its children through
 *	   planstate_tree_exec_walker().
 * Afterwards the node's plan id is added to es_reduce_drived_set and the
 * result of the chosen step is returned.
 *
 * NOTE(review): planstate_tree_exec_walker() invokes its walker as
 * walker(child, context), while this function declares a single parameter.
 * Confirm that the mismatch is intentional/harmless on the supported
 * platforms.
 */
static bool
DriveClusterReduceWalker(PlanState *node)
{
	EState	   *estate;
	int			plan_id;
	bool		result;

	if (node == NULL)
		return false;

	estate = node->state;
	plan_id = PlanNodeID(node->plan);

	/* Nothing to do for a plan node that has been driven before. */
	if (bms_is_member(plan_id, estate->es_reduce_drived_set))
		return false;

	if (IsA(node, ClusterReduceState))
	{
		ClusterReduceState *reduce = (ClusterReduceState *) node;

		Assert(reduce->port);

		/* Log only when at least one of the two EOF flags is still unset. */
		if (!(reduce->eof_network && reduce->eof_underlying))
			elog(LOG, "Drive ClusterReduce(%d) to send EOF message", plan_id);

		/*
		 * Drive the ClusterReduce to send its slots; slots meant for local
		 * use are discarded.
		 */
		result = DriveClusterReduceState(reduce);
	}
	else if (IsA(node, CteScanState))
	{
		result = DriveCteScanState((CteScanState *) node);
	}
	else
	{
		result = planstate_tree_exec_walker(node, DriveClusterReduceWalker, NULL);
	}

	/* Remember this plan node so that a later visit skips it. */
	estate->es_reduce_drived_set = bms_add_member(estate->es_reduce_drived_set, plan_id);

	return result;
}

由于认知有限,驱动程序应该尚有hold不住的情况,后续遇到死锁case时再持续优化。

转载于:https://my.oschina.net/zaclu/blog/1612917

  • 0
    点赞
  • 0
    收藏
    觉得还不错? 一键收藏
  • 0
    评论
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值