PostgreSQL 优化器的初步分析：query_planner()

最新推荐文章于 2024-07-08 07:27:17 发布

davidzwb

最新推荐文章于 2024-07-08 07:27:17 发布

阅读量1.7k

点赞数 1

分类专栏： PostgreSQL 内核源码研究文章标签： postgresql 优化器路径 planner

本文链接：https://blog.csdn.net/davidzwb/article/details/53374812

版权

PostgreSQL 内核源码研究专栏收录该内容

3 篇文章 0 订阅

订阅专栏

PostgreSQL 优化器的初步分析：

query_planner():

query_planner() 是优化器生成 path 的基础，所以首先从这里开始分析起。

query_planner() 负责生成一个基本的 path，这个 path 除了基本的扫描表的操作外，最后最多只包含连接操作，剩下的其他诸如聚合或者 limit 等操作，则会到更外层的 grouping_planner() 中进行。

进入 query_planner() 第一步就是对每个基本表生成 RelOptInfo 结构：

基本表就是 FROM 后指定的一个普通的表，而不是由连接等操作生成的表。
在优化生成 plan 的过程中，RelOptInfo 中包含了对一个表所需的信息；

typedef struct RelOptInfo
{
    NodeTag     type;

    RelOptKind  reloptkind;

    /* all relations included in this RelOptInfo */
    Relids      relids;         /* set of base relids (rangetable indexes) */

    /* size estimates generated by planner */
    double      rows;           /* estimated number of result tuples */

    /* per-relation planner control flags */
    bool        consider_startup;       /* keep cheap-startup-cost paths? */
    bool        consider_param_startup; /* ditto, for parameterized paths? */
    bool        consider_parallel;      /* consider parallel paths? */

    /* default result targetlist for Paths scanning this relation */
    struct PathTarget *reltarget;       /* list of Vars/Exprs, cost, width */

    /* materialization information */
    List       *pathlist;       /* Path structures */
    List       *ppilist;        /* ParamPathInfos used in pathlist */
    List       *partial_pathlist;       /* partial Paths */
    struct Path *cheapest_startup_path;
    struct Path *cheapest_total_path;
    struct Path *cheapest_unique_path;
    List       *cheapest_parameterized_paths;

    /* parameterization information needed for both base rels and join rels */
    /* (see also lateral_vars and lateral_referencers) */
    Relids      direct_lateral_relids;  /* rels directly laterally referenced */
    Relids      lateral_relids; /* minimum parameterization of rel */

    /* information about a base rel (not set for join rels!) */
    Index       relid;
    Oid         reltablespace;  /* containing tablespace */
    RTEKind     rtekind;        /* RELATION, SUBQUERY, or FUNCTION */
    AttrNumber  min_attr;       /* smallest attrno of rel (often <0) */
    AttrNumber  max_attr;       /* largest attrno of rel */
    Relids     *attr_needed;    /* array indexed [min_attr .. max_attr] */
    int32      *attr_widths;    /* array indexed [min_attr .. max_attr] */
    List       *lateral_vars;   /* LATERAL Vars and PHVs referenced by rel */
    Relids      lateral_referencers;    /* rels that reference me laterally */
    List       *indexlist;      /* list of IndexOptInfo */
    BlockNumber pages;          /* size estimates derived from pg_class */
    double      tuples;
    double      allvisfrac;
    PlannerInfo *subroot;       /* if subquery */
    List       *subplan_params; /* if subquery */
    int         rel_parallel_workers;   /* wanted number of parallel workers */

    /* Information about foreign tables and foreign joins */
    Oid         serverid;       /* identifies server for the table or join */
    Oid         userid;         /* identifies user to check access as */
    bool        useridiscurrent;    /* join is only valid for current user */
    /* use "struct FdwRoutine" to avoid including fdwapi.h here */
    struct FdwRoutine *fdwroutine;
    void       *fdw_private;

    /* used by various scans and joins: */
    List       *baserestrictinfo;       /* RestrictInfo structures (if base
                                         * rel) */
    QualCost    baserestrictcost;       /* cost of evaluating the above */
    List       *joininfo;       /* RestrictInfo structures for join clauses
                                 * involving this rel */
    bool        has_eclass_joins;       /* T means joininfo is incomplete */
} RelOptInfo;

以上的 RelOptInfo 结构中的元素，在接下来优化的过程中会逐渐用到。

接下来还需要介绍 PlannerInfo 结构，该结构是整个优化器执行过程中信息存放的地方，所以每个生成的 RelOptInfo 都存放在 PlannerInfo 中：

/*----------
 * PlannerInfo
 *      Per-query information for planning/optimization
 *
 * This struct is conventionally called "root" in all the planner routines.
 * It holds links to all of the planner's working state, in addition to the
 * original Query.  Note that at present the planner extensively modifies
 * the passed-in Query data structure; someday that should stop.
 *----------
 */
typedef struct PlannerInfo
{
    NodeTag     type;

    Query      *parse;          /* the Query being planned */

    PlannerGlobal *glob;        /* global info for current planner run */

    Index       query_level;    /* 1 at the outermost Query */

    struct PlannerInfo *parent_root;    /* NULL at outermost Query */

    /*
     * plan_params contains the expressions that this query level needs to
     * make available to a lower query level that is currently being planned.
     * outer_params contains the paramIds of PARAM_EXEC Params that outer
     * query levels will make available to this query level.
     */
    List       *plan_params;    /* list of PlannerParamItems, see below */
    Bitmapset  *outer_params;

    /*
     * simple_rel_array holds pointers to "base rels" and "other rels" (see
     * comments for RelOptInfo for more info).  It is indexed by rangetable
     * index (so entry 0 is always wasted).  Entries can be NULL when an RTE
     * does not correspond to a base relation, such as a join RTE or an
     * unreferenced view RTE; or if the RelOptInfo hasn't been made yet.
     */
    struct RelOptInfo **simple_rel_array;       /* All 1-rel RelOptInfos */
    int         simple_rel_array_size;  /* allocated size of array */

    /*
     * simple_rte_array is the same length as simple_rel_array and holds
     * pointers to the associated rangetable entries.  This lets us avoid
     * rt_fetch(), which can be a bit slow once large inheritance sets have
     * been expanded.
     */
    RangeTblEntry **simple_rte_array;   /* rangetable as an array */

先通过下面语句为每个要创建 RelOptInfo 的基本表申请空间：

一个 RelOptInfo 对应一个 RangeTableEntry，而这些 RTE 为了避免干扰，是从 Query 中的 jointree 中取出的。

  setup_simple_rel_arrays(PlannerInfo *root)
  {
    Index       rti;
    ListCell   *lc;

    /* Arrays are accessed using RT indexes (1..N) */
    /* 为 Query 中 RangeTable 中的每一个都创建一个 RelOptInfo，*/
    /* 由于 RangeTable 是从 1 到 N 的，所以大小 + 1*/
    root->simple_rel_array_size = list_length(root->parse->rtable) + 1;

    /* simple_rel_array is initialized to all NULLs */
    root->simple_rel_array = (RelOptInfo **)
        palloc0(root->simple_rel_array_size * sizeof(RelOptInfo *));

    /* simple_rte_array is an array equivalent of the rtable list */
    /* RangeTable 中每一个 RangeTableEntry(RTE) 的快捷入口 */
    root->simple_rte_array = (RangeTblEntry **)
        palloc0(root->simple_rel_array_size * sizeof(RangeTblEntry *));
    rti = 1;
    foreach(lc, root->parse->rtable)
    {
        RangeTblEntry *rte = (RangeTblEntry *) lfirst(lc);

        root->simple_rte_array[rti++] = rte;
    }
  }

然后在 add_base_rels_to_query() 中初始化每个 RelOptInfo：

add_base_rels_to_query(root, (Node *) parse->jointree);

从 Query.jointree 中而不是从 Query.rtable 中取出 RTE 可以仅取到不包含 Join 的基本表；

对于每个基本表进行下列操作：

/*
 * build_simple_rel
 *    Construct a new RelOptInfo for a base relation or 'other' relation.
 */
RelOptInfo *
build_simple_rel(PlannerInfo *root, int relid, RelOptKind reloptkind)
{
    RelOptInfo *rel;
    RangeTblEntry *rte;

    /* Rel should not exist already */
    Assert(relid > 0 && relid < root->simple_rel_array_size);
    if (root->simple_rel_array[relid] != NULL)
        elog(ERROR, "rel %d already exists", relid);

    /* Fetch RTE for relation */
    rte = root->simple_rte_array[relid];
    Assert(rte != NULL);

    rel = makeNode(RelOptInfo);
    rel->reloptkind = reloptkind;
    rel->relids = bms_make_singleton(relid);
    rel->rows = 0;
    /* cheap startup cost is interesting iff not all tuples to be retrieved */
    rel->consider_startup = (root->tuple_fraction > 0);
    rel->consider_param_startup = false;        /* might get changed later */
    rel->consider_parallel = false;     /* might get changed later */
    rel->reltarget = create_empty_pathtarget();
    rel->pathlist = NIL;
    rel->ppilist = NIL;
    rel->partial_pathlist = NIL;
    rel->cheapest_startup_path = NULL;
    rel->cheapest_total_path = NULL;
    rel->cheapest_unique_path = NULL;
    rel->cheapest_parameterized_paths = NIL;
    rel->direct_lateral_relids = NULL;
    rel->lateral_relids = NULL;
    rel->relid = relid;
    rel->rtekind = rte->rtekind;
    /* min_attr, max_attr, attr_needed, attr_widths are set below */
    rel->lateral_vars = NIL;
    rel->lateral_referencers = NULL;
    rel->indexlist = NIL;
    rel->pages = 0;
    rel->tuples = 0;
    rel->allvisfrac = 0;
    rel->subroot = NULL;
    rel->subplan_params = NIL;
    rel->rel_parallel_workers = -1;     /* set up in GetRelationInfo */
    rel->serverid = InvalidOid;
    rel->userid = rte->checkAsUser;
    rel->useridiscurrent = false;
    rel->fdwroutine = NULL;
    rel->fdw_private = NULL;
    rel->baserestrictinfo = NIL;
    rel->baserestrictcost.startup = 0;
    rel->baserestrictcost.per_tuple = 0;
    rel->joininfo = NIL;
    rel->has_eclass_joins = false;
    /* Check type of rtable entry */
    switch (rte->rtekind)
    {
        case RTE_RELATION:
        /* Table --- retrieve statistics from the system catalogs */
        /* 在下面函数中通过查询 Catalog，将目前所针对建立 RelOptInfo 的 RTE 的下列信息项填入：
         *  min_attr    lowest valid AttrNumber 最低有效属性号
         *  max_attr    highest valid AttrNumber 最高有效属性号
         *  indexlist   list of IndexOptInfos for relation's indexes
         *  serverid    if it's a foreign table, the server OID 所属外部表服务器的 OID
         *  fdwroutine  if it's a foreign table, the FDW function pointers 外部表处理函数指针
         *  pages       number of pages 该 RTE 所对应的表的页数
         *  tuples      number of tuples　该 RTE 所对应的表的元组数
        */
            get_relation_info(root, rte->relid, rte->inh, rel);
            break;
        case RTE_SUBQUERY:
        case RTE_FUNCTION:
        case RTE_VALUES:
        case RTE_CTE:

            /*
             * Subquery, function, or values list --- set up attr range and
             * arrays
             *
             * Note: 0 is included in range to support whole-row Vars
             */
            rel->min_attr = 0;
            rel->max_attr = list_length(rte->eref->colnames);
            rel->attr_needed = (Relids *)
                palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(Relids));
            rel->attr_widths = (int32 *)
                palloc0((rel->max_attr - rel->min_attr + 1) * sizeof(int32));
            break;
        default:
            elog(ERROR, "unrecognized RTE kind: %d",
                 (int) rte->rtekind);
            break;
    }

接下来会对 Query 中的 Jointree 进行处理，Jointree 代表一个 SQL 语句中的 From/Where/Join/On 部分。

该部分的语句可能是连接操作，也可能是一些对表的限制条件(如：attr>2)，所以在这里需要作一个区分的处理：

deconstruct_jointree() -> deconstruct_recurse() 中：

如果是针对某个表的限制条件，则调用 distribute_qual_to_rels() 将该限制条件下推到该表的 RelOptInfo->baserestrictinfo 中去；

如果是对一些表的连接操作，则先同样调用 distribute_qual_to_rels() 将连接的条件下推到该表的 RelOptInfo->joininfo 中去。同时，构造一个 RangeTblRef * 类型的链表，记录需要在一起连接的基本表。

deconstruct_jointree() 调用完成后，会返回一个上述的连接链表，同时针对每个基本表的扫描或连接限制条件，都存入了各自的 RelOptInfo 的 baserestrictinfo/joininfo 中，如下：

joinlist = deconstruct_jointree(root);

然后就可以利用所获取的信息，进行 Join 的构造了：

/*
     * Ready to do the primary planning.
     */
    final_rel = make_one_rel(root, joinlist);

在 make_one_rel() 中：

    /*
     * Compute size estimates and consider_parallel flags for each base rel,
     * then generate access paths.
     */
    set_base_rel_sizes(root); //估算该基本表的行数和宽度
    set_base_rel_pathlists(root); //生成该基本表的 path

进入后，会根据基本表类型的不同，调用不同的函数处理，对于外部表，就是根据之前 RelOptInfo 中初始化的 fdwroutine 调用 fdw 部分的代码的，如下：

rel->fdwroutine->GetForeignRelSize(root, rel, rte->relid);
以及
rel->fdwroutine->GetForeignPaths(root, rel, rte->relid);

针对每个基本表生成 path 后，利用已经生成的 joinlist 构造一个 RELOPT_JOINREL 类型的 RelOptInfo 以及对应的 path，最后返回这个 Join 的 RelOptInfo。

/*
     * Generate access paths for the entire join tree.
     */
    rel = make_rel_from_joinlist(root, joinlist);

实例：

对于下列语句（其中 test1 和 test2 都是外部表），

select * from test1 left join test2 on test1.no=test2.no limit 3 offset 3;

query_planner() 不会处理其中的 “limit 3 offset 3” 那是在 grouping_planner 中之后做的事。

首先在 PlannerInfo 中初始化存储 RelOptInfo 的空间：

PlannerInfo.simple_rel_array_size = list_length(root->parse->rtable) + 1; // = 4
PlannerInfo.simple_rel_array = (RelOptInfo **)
        palloc0(root->simple_rel_array_size * sizeof(RelOptInfo *));

由于 Query 中的 RangeTable 中会有 test1、test2 以及它们之间的 Join 存在，这里的大小为 4，也就是说最多之后可能会用到 4 个 RelOptInfo，但并不是一定会，而且因为 RangeTable 是从 1 开始计数的，第一个空间是不会被使用到。

针对 test1 和 test2 表生成对应的 RelOptInfo；

add_base_rels_to_query(root, (Node *) parse->jointree);

下面是 test1 的 RelOptInfo：

(gdb) p *(RelOptInfo *)root->simple_rel_array[1]
$32 = {
  type = T_RelOptInfo, 
  reloptkind = RELOPT_BASEREL, 
  relids = 0x245c1d0, 
  rows = 0, 
  consider_startup = 1 '\001', 
  consider_param_startup = 0 '\000', 
  consider_parallel = 0 '\000', 
  reltarget = 0x245c1e8, 
  pathlist = 0x0, 
  ppilist = 0x0, 
  partial_pathlist = 0x0, 
  cheapest_startup_path = 0x0, 
  cheapest_total_path = 0x0, 
  cheapest_unique_path = 0x0, 
  cheapest_parameterized_paths = 0x0, 
  direct_lateral_relids = 0x0, 
  lateral_relids = 0x0, 
  relid = 1, 
  reltablespace = 0, 
  rtekind = RTE_RELATION, 
  min_attr = -7, 
  max_attr = 2, 
  attr_needed = 0x245c238, 
  attr_widths = 0x245c2c8, 
  lateral_vars = 0x0, 
  lateral_referencers = 0x0, 
  indexlist = 0x0, 
  pages = 0, 
  tuples = 0, 
  allvisfrac = 0, 
  subroot = 0x0, 
  subplan_params = 0x0, 
  rel_parallel_workers = -1, 
  serverid = 16399, 
  userid = 0, 
  useridiscurrent = 0 '\000', 
  fdwroutine = 0x245c318, 
  fdw_private = 0x0, 
  baserestrictinfo = 0x0, 
  baserestrictcost = {
    startup = 0, 
    per_tuple = 0
  }, 
  joininfo = 0x0, 
  has_eclass_joins = 0 '\000'
}

处理 RestrictInfo，由于原句并未包含对表的 Where 限制，所以这里只会生成一个 joinlist；

joinlist = deconstruct_jointree(&PlannerInfo);

生成的 joinlist 是一个 RTE 链表，连接着 2 个节点，分别指向 test1 和 test2 的 RTE，表示将要进行 test1 和 test2 的连接操作；

(gdb) p *(RangeTblRef *)((List *)joinlist)->head->data->ptr_value 
$29 = {
  type = T_RangeTblRef, 
  rtindex = 1
}
(gdb) p *(RangeTblRef *)((List *)joinlist)->tail->data->ptr_value 
$30 = {
  type = T_RangeTblRef, 
  rtindex = 2
}

最后根据 joinlist 生成 join RelOptInfo：

final_rel = make_one_rel(root, joinlist);
final_rel = {
  type = T_RelOptInfo, 
  reloptkind = RELOPT_JOINREL, 
  relids = 0x245f3b8, 
  rows = 9316, 
  consider_startup = 1 '\001', 
  consider_param_startup = 0 '\000', 
  consider_parallel = 0 '\000', 
  reltarget = 0x245f3d0, 
  pathlist = 0x245f9d0, 
  ppilist = 0x0, 
  partial_pathlist = 0x0, 
  cheapest_startup_path = 0x246b490, 
  cheapest_total_path = 0x245f8a0, 
  cheapest_unique_path = 0x0, 
  cheapest_parameterized_paths = 0x246b6d0, 
  direct_lateral_relids = 0x0, 
  lateral_relids = 0x0, 
  relid = 0, 
  reltablespace = 0, 
  rtekind = RTE_JOIN, 
  min_attr = 0, 
  max_attr = 0, 
  attr_needed = 0x0, 
  attr_widths = 0x0, 
  lateral_vars = 0x0, 
  lateral_referencers = 0x0, 
  indexlist = 0x0, 
  pages = 0, 
  tuples = 0, 
  allvisfrac = 0, 
  subroot = 0x0, 
  subplan_params = 0x0, 
  rel_parallel_workers = -1, 
  serverid = 16399, 
  userid = 0, 
  useridiscurrent = 0 '\000', 
  fdwroutine = 0x245c318, 
  fdw_private = 0x245fc40, 
  baserestrictinfo = 0x0, 
  baserestrictcost = {
    startup = 0, 
    per_tuple = 0
  }, 
  joininfo = 0x0, 
  has_eclass_joins = 0 '\000'
}

在名为 final_rel 的 RelOptInfo 中的 pathlist 链表中生成了 3 个 path：

MergePath，先从远端取回数据后，在本地进行 MergeJoin 处理；

HashPath，先从远端取回数据后，在本地进行 HashJoin 处理；

MergePath 和 HashPath 的 outerjoinpath 和 innerjoinpath 则分别指向一个 ForeignPath，分别进行 test1 和 test2 的 ForeignScan。

ForeignPath，将连接语句也一并发送到远端执行，所以对于本地就相当于仅有一个 ForeignScan 即 ForeignPath。

MergePath = {
  jpath = {
    path = {
      type = T_MergePath, 
      pathtype = T_MergeJoin, 
      parent = 0x245f1a8, 
      pathtarget = 0x245f3d0, 
      param_info = 0x0, 
      parallel_aware = 0 '\000', 
      parallel_safe = 0 '\000', 
      parallel_workers = 0, 
      rows = 9316, 
      startup_cost = 444.06045346876857, 
      total_cost = 590.62545346876846, 
      pathkeys = 0x0
    }, 
    jointype = JOIN_LEFT, 
    outerjoinpath = 0x245ee90, 
    innerjoinpath = 0x245efc0, 
    joinrestrictinfo = 0x245f4f0
  }, 
  path_mergeclauses = 0x245f730, 
  outersortkeys = 0x245f6e0, 
  innersortkeys = 0x245f7d0, 
  materialize_inner = 0 '\000'
}

HashPath = {
  jpath = {
    path = {
      type = T_HashPath, 
      pathtype = T_HashJoin, 
      parent = 0x245f1a8, 
      pathtarget = 0x245f3d0, 
      param_info = 0x0, 
      parallel_aware = 0 '\000', 
      parallel_safe = 0 '\000', 
      parallel_workers = 0, 
      rows = 9316, 
      startup_cost = 268.01250000000005, 
      total_cost = 647.58500000000004, 
      pathkeys = 0x0
    }, 
    jointype = JOIN_LEFT, 
    outerjoinpath = 0x245ee90, 
    innerjoinpath = 0x245efc0, 
    joinrestrictinfo = 0x245f4f0
  }, 
  path_mergeclauses = 0x245fbf0, 
  outersortkeys = 0x1, 
  innersortkeys = 0x10, 
  materialize_inner = 32 ' '
}

ForeignPath = {
  path = {
    type = T_ForeignPath, 
    pathtype = T_ForeignScan, 
    parent = 0x245f1a8, 
    pathtarget = 0x245f3d0, 
    param_info = 0x0, 
    parallel_aware = 0 '\000', 
    parallel_safe = 0 '\000', 
    parallel_workers = 0, 
    rows = 9316, 
    startup_cost = 100, 
    total_cost = 4991.6824999999999, 
    pathkeys = 0x0
  }, 
  fdw_outerpath = 0x0, 
  fdw_private = 0x0
}