PostgreSQL 流程---查询

查询

概述

查询是数据库中最常用也是最复杂的操作之一,关于查询涉及很多方面:SQL解析、成本评估、索引选择…等等,本文只讨论最简单的部分,即PostgreSQL如何执行全表遍历。考虑下面的SQL语句:

-- 建表
create table t1(id int);
-- 插入
insert into t1 values(1),(2),(3);
-- 查询
select * from t1 where id = 1;

对于select语句,由于我们并没有为t1表创建索引,所以只能通过全表遍历的方式来执行查询。全表遍历会遍历表的所有块,逐条获取块中的元组,判断元组是否符合条件,如果符合条件则放入结果集。下面我们详细阐述查询的相关内容。

查询框架

在介绍全表遍历前,我们先来看看PostgreSQL的查询框架。ExecutePlan是查询的主循环,该函数获取一条合法的元组(合法是指元组可见且满足where条件)。然后放入结果集(这部分不太清楚)。具体代码如下:

static void
ExecutePlan(EState *estate,
			PlanState *planstate,
			bool use_parallel_mode,
			CmdType operation,
			bool sendTuples,
			uint64 numberTuples,
			ScanDirection direction,
			DestReceiver *dest)
{
	TupleTableSlot *slot;
	uint64		current_tuple_count;

	/*
	 * initialize local variables
	 */
	current_tuple_count = 0;

	/*
	 * Set the direction.
	 */
	estate->es_direction = direction;

	/*
	 * If a tuple count was supplied, we must force the plan to run without
	 * parallelism, because we might exit early.  Also disable parallelism
	 * when writing into a relation, because no database changes are allowed
	 * in parallel mode.
	 */
	if (numberTuples || dest->mydest == DestIntoRel)
		use_parallel_mode = false;

	estate->es_use_parallel_mode = use_parallel_mode;
	if (use_parallel_mode)
		EnterParallelMode();

	/*
	 * Loop until we've processed the proper number of tuples from the plan.
	 */
	for (;;)
	{
		/* Reset the per-output-tuple exprcontext */
		ResetPerTupleExprContext(estate);

		/*
		 * Execute the plan and obtain a tuple
		 * 获取一条合法元组
		 */
		slot = ExecProcNode(planstate);
		
		/*
		 * if the tuple is null, then we assume there is nothing more to
		 * process so we just end the loop...
		 * 如果solt为空,说明已经不存在合法元组,则结束循环(也就是结束查询)
		 */
		if (TupIsNull(slot))
		{
			/* Allow nodes to release or shut down resources. */
			(void) ExecShutdownNode(planstate);
			break;
		}

		/*
		 * If we have a junk filter, then project a new tuple with the junk
		 * removed.
		 *
		 * Store this new "clean" tuple in the junkfilter's resultSlot.
		 * (Formerly, we stored it back over the "dirty" tuple, which is WRONG
		 * because that tuple slot has the wrong descriptor.)
		 */
		if (estate->es_junkFilter != NULL)
			slot = ExecFilterJunk(estate->es_junkFilter, slot);

		/*
		 * If we are supposed to send the tuple somewhere, do so. (In
		 * practice, this is probably always the case at this point.)
		 */
		if (sendTuples)
		{
			/*
			 * If we are not able to send the tuple, we assume the destination
			 * has closed and no more tuples can be sent. If that's the case,
			 * end the loop.
			 */
			if (!((*dest->receiveSlot) (slot, dest)))
				break;
		}

		/*
		 * Count tuples processed, if this is a SELECT.  (For other operation
		 * types, the ModifyTable plan node must count the appropriate
		 * events.)
		 */
		if (operation == CMD_SELECT)
			(estate->es_processed)++;

		/*
		 * check our tuple count.. if we've processed the proper number then
		 * quit, else loop again and process more tuples.  Zero numberTuples
		 * means no limit.
		 */
		current_tuple_count++;
		if (numberTuples && numberTuples == current_tuple_count)
			break;
	}

	if (use_parallel_mode)
		ExitParallelMode();
}

上述代码中关键函数为ExecProcNode,ExecProcNode负责获取一条合法元组。我来看看ExecProcNode的具体实现:

TupleTableSlot *
ExecProcNode(PlanState *node)
{
	TupleTableSlot *result;

	CHECK_FOR_INTERRUPTS();

	if (node->chgParam != NULL) /* something changed */
		ExecReScan(node);		/* let ReScan handle this */

	if (node->instrument)
		InstrStartNode(node->instrument);

	switch (nodeTag(node))
	{
			/*
			 * control nodes
			 */
		case T_ResultState:
			result = ExecResult((ResultState *) node);
			break;

		case T_ModifyTableState:
			result = ExecModifyTable((ModifyTableState *) node);
			break;

		case T_AppendState:
			result = ExecAppend((AppendState *) node);
			break;

		case T_MergeAppendState:
			result = ExecMergeAppend((MergeAppendState *) node);
			break;

		case T_RecursiveUnionState:
			result = ExecRecursiveUnion((RecursiveUnionState *) node);
			break;

			/* BitmapAndState does not yield tuples */

			/* BitmapOrState does not yield tuples */

			/*
			 * scan nodes
			 */
		case T_SeqScanState:
			result = ExecSeqScan((SeqScanState *) node);
			break;

		case T_SampleScanState:
			result = ExecSampleScan((SampleScanState *) node);
			break;

		case T_IndexScanState:
			result = ExecIndexScan((IndexScanState *) node);
			break;

		case T_IndexOnlyScanState:
			result = ExecIndexOnlyScan((IndexOnlyScanState *) node);
			break;

			/* BitmapIndexScanState does not yield tuples */

		case T_BitmapHeapScanState:
			result = ExecBitmapHeapScan((BitmapHeapScanState *) node);
			break;

		case T_TidScanState:
			result = ExecTidScan((TidScanState *) node);
			break;

		case T_SubqueryScanState:
			result = ExecSubqueryScan((SubqueryScanState *) node);
			break;

		case T_FunctionScanState:
			result = ExecFunctionScan((FunctionScanState *) node);
			break;

		case T_ValuesScanState:
			result = ExecValuesScan((ValuesScanState *) node);
			break;

		case T_CteScanState:
			result = ExecCteScan((CteScanState *) node);
			break;

		case T_WorkTableScanState:
			result = ExecWorkTableScan((WorkTableScanState *) node);
			break;

		case T_ForeignScanState:
			result = ExecForeignScan((ForeignScanState *) node);
			break;

		case T_CustomScanState:
			result = ExecCustomScan((CustomScanState *) node);
			break;

			/*
			 * join nodes
			 */
		case T_NestLoopState:
			result = ExecNestLoop((NestLoopState *) node);
			break;

		case T_MergeJoinState:
			result = ExecMergeJoin((MergeJoinState *) node);
			break;

		case T_HashJoinState:
			result = ExecHashJoin((HashJoinState *) node);
			break;

			/*
			 * materialization nodes
			 */
		case T_MaterialState:
			result = ExecMaterial((MaterialState *) node);
			break;

		case T_SortState:
			result = ExecSort((SortState *) node);
			break;

		case T_GroupState:
			result = ExecGroup((GroupState *) node);
			break;

		case T_AggState:
			result = ExecAgg((AggState *) node);
			break;

		case T_WindowAggState:
			result = ExecWindowAgg((WindowAggState *) node);
			break;

		case T_UniqueState:
			result = ExecUnique((UniqueState *) node);
			break;

		case T_GatherState:
			result = ExecGather((GatherState *) node);
			break;

		case T_HashState:
			result = ExecHash((HashState *) node);
			break;

		case T_SetOpState:
			result = ExecSetOp((SetOpState *) node);
			break;

		case T_LockRowsState:
			result = ExecLockRows((LockRowsState *) node);
			break;

		case T_LimitState:
			result = ExecLimit((LimitState *) node);
			break;

		default:
			elog(ERROR, "unrecognized node type: %d", (int) nodeTag(node));
			result = NULL;
			break;
	}

	if (node->instrument)
		InstrStopNode(node->instrument, TupIsNull(result) ? 0.0 : 1.0);

	return result;
}

ExecProcNode接收一个执行计划PlanState(目前并未明确PlanState是什么东西,看起来可能是执行计划),根据执行计划的类型调用对应函数,执行相关查询。例如,本文重点介绍的全表遍历就属于T_SeqScanState类型,会调用ExecSeqScan来执行。而ExecSeqScan会调用ExecScan,并将SeqNext作为参数,SeqNext是实际执行全表遍历的函数,后面我们会重点讲解,现在先来看看ExecSeqScan的实现:

TupleTableSlot *
ExecSeqScan(SeqScanState *node)
{
	return ExecScan((ScanState *) node,
					(ExecScanAccessMtd) SeqNext,
					(ExecScanRecheckMtd) SeqRecheck);
}

下面我们就来重点看下ExecScan的实现。

ExecScan

实际上不只是ExecSeqScan,几乎所有的单表查询操作都会调用ExecScan,例如使用索引查询的实现如下:

TupleTableSlot *
ExecIndexScan(IndexScanState *node)
{
	/*
	 * If we have runtime keys and they've not already been set up, do it now.
	 */
	if (node->iss_NumRuntimeKeys != 0 && !node->iss_RuntimeKeysReady)
		ExecReScan((PlanState *) node);

	if (node->iss_NumOrderByKeys > 0)
		return ExecScan(&node->ss,
						(ExecScanAccessMtd) IndexNextWithReorder,
						(ExecScanRecheckMtd) IndexRecheck);
	else
		return ExecScan(&node->ss,
						(ExecScanAccessMtd) IndexNext,
						(ExecScanRecheckMtd) IndexRecheck);
}

只不过在ExecIndexScan中,根据情况将IndexNext和IndexNextWithReorder作为实际执行查询的函数。

ExecScan根据不同的查询方式(比如:全表遍历,索引遍历等)调用对应的函数(如前面提到的SeqNext)获取一条可见的元组(由ExecScanFetch实现),判断元组的合法性(由ExecQual实现)。如果合法则返回该元组,否则获取下一条可见元组,直到获取完所有的可见元组为止,我们来看看具体的代码:

TupleTableSlot *
ExecScan(ScanState *node,
		 ExecScanAccessMtd accessMtd,	/* function returning a tuple */
		 ExecScanRecheckMtd recheckMtd)
{
	ExprContext *econtext;
	List	   *qual;
	ProjectionInfo *projInfo;
	ExprDoneCond isDone;
	TupleTableSlot *resultSlot;

	/*
	 * Fetch data from node
	 */
	qual = node->ps.qual;
	projInfo = node->ps.ps_ProjInfo;
	econtext = node->ps.ps_ExprContext;

	/*
	 * If we have neither a qual to check nor a projection to do, just skip
	 * all the overhead and return the raw scan tuple.
	 * 如果不需要判断合法性(比如:没有查询条件)则直接调用该函数。
	 */
	if (!qual && !projInfo)
	{
		ResetExprContext(econtext);
		return ExecScanFetch(node, accessMtd, recheckMtd);
	}

	/*
	 * Check to see if we're still projecting out tuples from a previous scan
	 * tuple (because there is a function-returning-set in the projection
	 * expressions).  If so, try to project another one.
	 */
	if (node->ps.ps_TupFromTlist)
	{
		Assert(projInfo);		/* can't get here if not projecting */
		resultSlot = ExecProject(projInfo, &isDone);
		if (isDone == ExprMultipleResult)
			return resultSlot;
		/* Done with that source tuple... */
		node->ps.ps_TupFromTlist = false;
	}

	/*
	 * Reset per-tuple memory context to free any expression evaluation
	 * storage allocated in the previous tuple cycle.  Note this can't happen
	 * until we're done projecting out tuples from a scan tuple.
	 */
	ResetExprContext(econtext);

	/*
	 * get a tuple from the access method.  Loop until we obtain a tuple that
	 * passes the qualification.
	 * 主循环,不断获取可见的元组
	 */
	for (;;)
	{
		TupleTableSlot *slot;

		CHECK_FOR_INTERRUPTS();
		/* 根据不同的查询方式,获取一条可见元组 */
		slot = ExecScanFetch(node, accessMtd, recheckMtd);

		/*
		 * if the slot returned by the accessMtd contains NULL, then it means
		 * there is nothing more to scan so we just return an empty slot,
		 * being careful to use the projection result slot so it has correct
		 * tupleDesc.
		 */
		if (TupIsNull(slot))
		{
			if (projInfo)
				return ExecClearTuple(projInfo->pi_slot);
			else
				return slot;
		}

		/*
		 * place the current tuple into the expr context
		 */
		econtext->ecxt_scantuple = slot;

		/*
		 * check that the current tuple satisfies the qual-clause
		 *
		 * check for non-nil qual here to avoid a function call to ExecQual()
		 * when the qual is nil ... saves only a few cycles, but they add up
		 * ...
		 * 判断元组的合法性
		 */
		if (!qual || ExecQual(qual, econtext, false))
		{
			/*
			 * Found a satisfactory scan tuple.
			 */
			if (projInfo)
			{
				/*
				 * Form a projection tuple, store it in the result tuple slot
				 * and return it --- unless we find we can project no tuples
				 * from this scan tuple, in which case continue scan.
				 */
				resultSlot = ExecProject(projInfo, &isDone);
				if (isDone != ExprEndResult)
				{
					node->ps.ps_TupFromTlist = (isDone == ExprMultipleResult);
					return resultSlot;
				}
			}
			else
			{
				/*
				 * Here, we aren't projecting, so just return scan tuple.
				 */
				return slot;
			}
		}
		else
			InstrCountFiltered1(node, 1);

		/*
		 * Tuple fails qual, so free per-tuple memory and try again.
		 */
		ResetExprContext(econtext);
	}
}

下面,我们来看看ExecScanFetch函数。

ExecScanFetch

static inline TupleTableSlot *
ExecScanFetch(ScanState *node,
			  ExecScanAccessMtd accessMtd,
			  ExecScanRecheckMtd recheckMtd)
{
	EState	   *estate = node->ps.state;

	if (estate->es_epqTuple != NULL)
	{
		/*
		 * We are inside an EvalPlanQual recheck.  Return the test tuple if
		 * one is available, after rechecking any access-method-specific
		 * conditions.
		 */
		Index		scanrelid = ((Scan *) node->ps.plan)->scanrelid;

		if (scanrelid == 0)
		{
			TupleTableSlot *slot = node->ss_ScanTupleSlot;

			/*
			 * This is a ForeignScan or CustomScan which has pushed down a
			 * join to the remote side.  The recheck method is responsible not
			 * only for rechecking the scan/join quals but also for storing
			 * the correct tuple in the slot.
			 */
			if (!(*recheckMtd) (node, slot))
				ExecClearTuple(slot);	/* would not be returned by scan */
			return slot;
		}
		else if (estate->es_epqTupleSet[scanrelid - 1])
		{
			TupleTableSlot *slot = node->ss_ScanTupleSlot;

			/* Return empty slot if we already returned a tuple */
			if (estate->es_epqScanDone[scanrelid - 1])
				return ExecClearTuple(slot);
			/* Else mark to remember that we shouldn't return more */
			estate->es_epqScanDone[scanrelid - 1] = true;

			/* Return empty slot if we haven't got a test tuple */
			if (estate->es_epqTuple[scanrelid - 1] == NULL)
				return ExecClearTuple(slot);

			/* Store test tuple in the plan node's scan slot */
			ExecStoreTuple(estate->es_epqTuple[scanrelid - 1],
						   slot, InvalidBuffer, false);

			/* Check if it meets the access-method conditions */
			if (!(*recheckMtd) (node, slot))
				ExecClearTuple(slot);	/* would not be returned by scan */

			return slot;
		}
	}

	/*
	 * Run the node-type-specific access method function to get the next tuple
	 */
	return (*accessMtd) (node);
}

ExecScanFetch的核心是最后一行代码,调用具体的函数来获取元组。所谓具体函数也就是前面调用ExecScan时的第二个参数,对于全表遍历就是SeqNext函数,下面我们来具体看看SeqNext的实现。

SeqNext

SeqNext是实际执行全表遍历的函数,该函数涉及相当多的细节,放在专门的文档《PostgreSQL流程—全表遍历》进行讲解。

ExecQual

该函数主要用于判断元组的合法性,即元组是否符合where条件

bool
ExecQual(List *qual, ExprContext *econtext, bool resultForNull)
{
	bool		result;
	MemoryContext oldContext;
	ListCell   *l;

	/*
	 * debugging stuff
	 */
	EV_printf("ExecQual: qual is ");
	EV_nodeDisplay(qual);
	EV_printf("\n");

	/*
	 * Run in short-lived per-tuple context while computing expressions.
	 */
	oldContext = MemoryContextSwitchTo(econtext->ecxt_per_tuple_memory);

	/*
	 * Evaluate the qual conditions one at a time.  If we find a FALSE result,
	 * we can stop evaluating and return FALSE --- the AND result must be
	 * FALSE.  Also, if we find a NULL result when resultForNull is FALSE, we
	 * can stop and return FALSE --- the AND result must be FALSE or NULL in
	 * that case, and the caller doesn't care which.
	 *
	 * If we get to the end of the list, we can return TRUE.  This will happen
	 * when the AND result is indeed TRUE, or when the AND result is NULL (one
	 * or more NULL subresult, with all the rest TRUE) and the caller has
	 * specified resultForNull = TRUE.
	 */
	result = true;

	foreach(l, qual)
	{
		ExprState  *clause = (ExprState *) lfirst(l);
		Datum		expr_value;
		bool		isNull;

        /* 关键函数,获取表达式值 */
		expr_value = ExecEvalExpr(clause, econtext, &isNull, NULL);

		if (isNull)
		{
			if (resultForNull == false)
			{
				result = false; /* treat NULL as FALSE */
				break;
			}
		}
		else
		{
			if (!DatumGetBool(expr_value))
			{
				result = false; /* definitely FALSE */
				break;
			}
		}
	}

	MemoryContextSwitchTo(oldContext);

	return result;
}

static Datum
ExecMakeFunctionResultNoSets(FuncExprState *fcache,
							 ExprContext *econtext,
							 bool *isNull,
							 ExprDoneCond *isDone)
{
	ListCell   *arg;
	Datum		result;
	FunctionCallInfo fcinfo;
	PgStat_FunctionCallUsage fcusage;
	int			i;

	/* Guard against stack overflow due to overly complex expressions */
	check_stack_depth();

	if (isDone)
		*isDone = ExprSingleResult;

	/* inlined, simplified version of ExecEvalFuncArgs */
	fcinfo = &fcache->fcinfo_data;
	i = 0;
	foreach(arg, fcache->args)
	{
		ExprState  *argstate = (ExprState *) lfirst(arg);
	
        /* 
         * 关键函数,获取参数
         * 比如:sname = 'pgsql'
         * 该函数返回结果为:fcinfo->arg[0] 为 sname的值
         				   fcinfo->arg[1] 为 'pgsql'
         */
		fcinfo->arg[i] = ExecEvalExpr(argstate,
									  econtext,
									  &fcinfo->argnull[i],
									  NULL);
		i++;
	}

	/*
	 * If function is strict, and there are any NULL arguments, skip calling
	 * the function and return NULL.
	 */
	if (fcache->func.fn_strict)
	{
		while (--i >= 0)
		{
			if (fcinfo->argnull[i])
			{
				*isNull = true;
				return (Datum) 0;
			}
		}
	}

	pgstat_init_function_usage(fcinfo, &fcusage);

	fcinfo->isnull = false;
    /* 真正的判断函数 */
	result = FunctionCallInvoke(fcinfo);
	*isNull = fcinfo->isnull;

	pgstat_end_function_usage(&fcusage, true);

	return result;
}

遗留问题

  1. ExecutePlan中获取到一条合法记录后是如何处理的?放入结果集还是其他?
  2. PlanState是什么?深入研究其相关细节。

补充

查询时需要将物理记录中的字段抽出来,流程:

ExecProject->slot_getsomeattrs->slot_deform_tuple

最后存放在Datum中

  • 0
    点赞
  • 2
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值