Impala 源代码目录结构
SQL 解析
Impala 的 SQL 解析与执行计划生成部分是由 impala-frontend(Java)实现的。impalad 的 Beeswax 服务监听端口是 21000,用户通过
Beeswax 接口 BeeswaxService.query() 提交一个请求,在 impalad 端的处理逻辑是由
void ImpalaServer::query(QueryHandle& query_handle, const Query& query) 这个函数(
ImpalaServer.h)完成的。
1 | void ImpalaServer::query(QueryHandle& query_handle, const Query& query) { |
2 | VLOG_QUERY << "query(): query=" << query.query; |
3 | ScopedSessionState session_handle( this ); |
4 | shared_ptr<SessionState> session; |
6 | session_handle.WithSession(ThriftServer::GetThreadConnectionId(), &session), |
7 | SQLSTATE_GENERAL_ERROR); |
11 | RAISE_IF_ERROR(QueryToTQueryContext(query, &query_ctx), SQLSTATE_GENERAL_ERROR); |
15 | shared_ptr<QueryExecState> exec_state; |
18 | RAISE_IF_ERROR(Execute(&query_ctx, session, &exec_state), |
19 | SQLSTATE_SYNTAX_ERROR_OR_ACCESS_VIOLATION); |
21 | exec_state->UpdateQueryState(QueryState::RUNNING); |
24 | exec_state->WaitAsync(); |
27 | Status status = SetQueryInflight(session, exec_state); |
29 | UnregisterQuery(exec_state->query_id(), false , &status); |
30 | RaiseBeeswaxException(status.GetDetail(), SQLSTATE_GENERAL_ERROR); |
32 | TUniqueIdToQueryHandle(exec_state->query_id(), &query_handle); |
其中 QueryToTQueryContext(query, &query_ctx) 将 Query 转换为 TQueryCtx。具体代码实现如下:
(ImpalaServer.h)
1 | Status ImpalaServer::QueryToTQueryContext( const Query& query, |
2 | TQueryCtx* query_ctx) { |
3 | query_ctx->request.stmt = query.query; |
4 | VLOG_QUERY << "query: " << ThriftDebugString(query); |
6 | shared_ptr<SessionState> session; |
7 | const TUniqueId& session_id = ThriftServer::GetThreadConnectionId(); |
8 | RETURN_IF_ERROR(GetSessionState(session_id, &session)); |
9 | DCHECK(session != NULL); |
14 | lock_guard<mutex> l(session->lock); |
15 | if (session->connected_user.empty()) session->connected_user = query.hadoop_user; |
16 | query_ctx->request.query_options = session->default_query_options; |
19 | session->ToThrift(session_id, &query_ctx->session); |
23 | if (query.__isset.configuration) { |
24 | BOOST_FOREACH( const string& option, query.configuration) { |
25 | RETURN_IF_ERROR(ParseQueryOptions(option, &query_ctx->request.query_options)); |
27 | VLOG_QUERY << "TClientRequest.queryOptions: " |
28 | << ThriftDebugString(query_ctx->request.query_options); |
内部调用 ImpalaServer::Execute()
(ImpalaServer.h)
函数将 TQueryCtx 转换为 TExecRequest,具体逻辑通过调用 ImpalaServer::ExecuteInternal() 实现。代码如下:
1 | Status ImpalaServer::Execute(TQueryCtx* query_ctx, |
2 | shared_ptr<SessionState> session_state, |
3 | shared_ptr<QueryExecState>* exec_state) { |
4 | PrepareQueryContext(query_ctx); |
5 | bool registered_exec_state; |
6 | ImpaladMetrics::IMPALA_SERVER_NUM_QUERIES->Increment(1L); |
9 | string stmt = replace_all_copy(query_ctx->request.stmt, "\n" , " " ); |
11 | query_ctx->request.__set_redacted_stmt(( const string) stmt); |
13 | Status status = ExecuteInternal(*query_ctx, session_state, &registered_exec_state, |
15 | if (!status.ok() && registered_exec_state) { |
16 | UnregisterQuery((*exec_state)->query_id(), false , &status); |
上面的函数调用 ImpalaServer::ExecuteInternal()
(ImpalaServer.h)
在这个函数里通过 JNI 接口调用 frontend.createExecRequest() 生成 TExecRequest,具体代码如下:
1 | Status ImpalaServer::ExecuteInternal( |
2 | const TQueryCtx& query_ctx, |
3 | shared_ptr<SessionState> session_state, |
4 | bool* registered_exec_state, |
5 | shared_ptr<QueryExecState>* exec_state) { |
6 | DCHECK(session_state != NULL); |
7 | *registered_exec_state = false ; |
9 | return Status( "This Impala server is offline. Please retry your query later." ); |
11 | exec_state->reset( new QueryExecState(query_ctx, exec_env_, exec_env_->frontend(), |
12 | this , session_state)); |
14 | (*exec_state)->query_events()->MarkEvent( "Start execution" ); |
29 | lock_guard<mutex> l(*(*exec_state)->lock()); |
34 | RETURN_IF_ERROR(RegisterQuery(session_state, *exec_state)); |
35 | *registered_exec_state = true ; |
37 | RETURN_IF_ERROR((*exec_state)->UpdateQueryStatus( |
39 | exec_env_->frontend()->GetExecRequest(query_ctx, &result))); |
40 | (*exec_state)->query_events()->MarkEvent( "Planning finished" ); |
41 | (*exec_state)->summary_profile()->AddEventSequence( |
42 | result.timeline.name, result.timeline); |
43 | if (result.__isset.result_set_metadata) { |
44 | (*exec_state)->set_result_metadata(result.result_set_metadata); |
47 | VLOG( 2 ) << "Execution request: " << ThriftDebugString(result); |
50 | RETURN_IF_ERROR((*exec_state)->Exec(&result)); |
51 | if (result.stmt_type == TStmtType::DDL) { |
52 | Status status = UpdateCatalogMetrics(); |
54 | VLOG_QUERY << "Couldn't update catalog metrics: " << status.GetDetail(); |
58 | if ((*exec_state)->coord() != NULL) { |
59 | const unordered_set<TNetworkAddress>& unique_hosts = |
60 | (*exec_state)->schedule()->unique_hosts(); |
61 | if (!unique_hosts.empty()) { |
62 | lock_guard<mutex> l(query_locations_lock_); |
63 | BOOST_FOREACH( const TNetworkAddress& port, unique_hosts) { |
64 | query_locations_[port].insert((*exec_state)->query_id()); |
Frontend::GetExecRequest()
(Frontend.h)
通过 JNI 接口调用 frontend.createExecRequest() 生成 TExecRequest。具体实现代码如下:
1 | Status Frontend::GetExecRequest( |
2 | const TQueryCtx& query_ctx, TExecRequest* result) { |
3 | return JniUtil::CallJniMethod(fe_, create_exec_request_id_, query_ctx, result); |
JniUtil::CallJniMethod()
(jni-util.h)
的具体实现代码如下:
4 | static Status CallJniMethod( const jobject& obj, const jmethodID& method, const T& arg) { |
5 | JNIEnv* jni_env = getJNIEnv(); |
6 | jbyteArray request_bytes; |
7 | JniLocalFrame jni_frame; |
8 | RETURN_IF_ERROR(jni_frame.push(jni_env)); |
9 | RETURN_IF_ERROR(SerializeThriftMsg(jni_env, &arg, &request_bytes)); |
10 | jni_env->CallObjectMethod(obj, method, request_bytes); |
11 | RETURN_ERROR_IF_EXC(jni_env); |
至此,请求经 Thrift 序列化后,通过 JNI 调用转到 Java Frontend 生成执行计划树。
public TExecRequest createExecRequest(TQueryCtx queryCtx, StringBuilder explainString)
(Frontend.java)
是最重要的方法,它根据提供的 TQueryCtx 创建 TExecRequest。具体代码(分析部分)如下:
2 | * Create a populated TExecRequest corresponding to the supplied TQueryCtx. |
4 | public TExecRequest createExecRequest(TQueryCtx queryCtx, StringBuilder explainString) |
5 | throws ImpalaException { |
7 | AnalysisContext.AnalysisResult analysisResult = analyzeStmt(queryCtx); |
8 | EventSequence timeline = analysisResult.getAnalyzer().getTimeline(); |
9 | timeline.markEvent( "Analysis finished" ); |
首先通过调用 analyzeStmt()
(Frontend.java)
方法分析提交的 SQL 语句。analyzeStmt() 的具体实现代码如下:
2 | * Analyzes the SQL statement included in queryCtx and returns the AnalysisResult. |
4 | private AnalysisContext.AnalysisResult analyzeStmt(TQueryCtx queryCtx) |
5 | throws AnalysisException, InternalException, AuthorizationException { |
6 | AnalysisContext analysisCtx = new AnalysisContext(dsqldCatalog_, queryCtx, |
8 | LOG.debug( "analyze query " + queryCtx.request.stmt); |
18 | analysisCtx.analyze(queryCtx.request.stmt); |
19 | Preconditions.checkState(analysisCtx.getAnalyzer().getMissingTbls().isEmpty()); |
20 | return analysisCtx.getAnalysisResult(); |
21 | } catch (AnalysisException e) { |
22 | Set<TableName> missingTbls = analysisCtx.getAnalyzer().getMissingTbls(); |
24 | if (missingTbls.isEmpty()) throw e; |
27 | if (!requestTblLoadAndWait(missingTbls, MISSING_TBL_LOAD_WAIT_TIMEOUT_MS)) { |
28 | LOG.info(String.format( "Missing tables were not received in %dms. Load " + |
29 | "request will be retried." , MISSING_TBL_LOAD_WAIT_TIMEOUT_MS)); |
37 | analysisCtx.getAnalyzer().authorize(getAuthzChecker()); |
AnalysisContext.AnalysisResult 中的 Analyzer 对象是存放这个 SQL 所涉及到的所有信息
(包含 Table、conjunct、slot、slotRefMap、eqJoinConjuncts 等)的知识库,所有跟这个
SQL 有关的东西都会存到 Analyzer 对象里面。该类的定义可以查看
Analyzer.java
AnalysisContext.analyze()
(AnalysisContext.java)
的具体实现代码如下:
2 | * Parse and analyze 'stmt'. If 'stmt' is a nested query (i.e. query that |
3 | * contains subqueries), it is also rewritten by performing subquery unnesting. |
4 | * The transformed stmt is then re-analyzed in a new analysis context. |
6 | public void analyze(String stmt) throws AnalysisException { |
7 | Analyzer analyzer = new Analyzer(catalog_, queryCtx_, authzConfig_); |
8 | analyze(stmt, analyzer); |
上面的 analyze() 函数通过调用同名的重载函数 analyze(String stmt, Analyzer analyzer)
(AnalysisContext.java)
实现具体的分析,代码如下:
2 | * Parse and analyze 'stmt' using a specified Analyzer. |
4 | public void analyze(String stmt, Analyzer analyzer) throws AnalysisException { |
5 | SqlScanner input = new SqlScanner( new StringReader(stmt)); |
6 | SqlParser parser = new SqlParser(input); |
8 | analysisResult_ = new AnalysisResult(); |
9 | analysisResult_.analyzer_ = analyzer; |
10 | if (analysisResult_.analyzer_ == null ) { |
11 | analysisResult_.analyzer_ = new Analyzer(catalog_, queryCtx_, authzConfig_); |
13 | analysisResult_.stmt_ = (StatementBase) parser.parse().value; |
14 | if (analysisResult_.stmt_ == null ) |
19 | if (analysisResult_.stmt_ instanceof CreateTableAsSelectStmt) { |
20 | analysisResult_.tmpCreateTableStmt_ = |
21 | ((CreateTableAsSelectStmt) analysisResult_.stmt_).getCreateStmt().clone(); |
24 | analysisResult_.stmt_.analyze(analysisResult_.analyzer_); |
25 | boolean isExplain = analysisResult_.isExplainStmt(); |
28 | if (analysisResult_.requiresRewrite()) { |
29 | StatementBase rewrittenStmt = StmtRewriter.rewrite(analysisResult_); |
31 | Preconditions.checkNotNull(rewrittenStmt); |
32 | analysisResult_ = new AnalysisResult(); |
33 | analysisResult_.analyzer_ = new Analyzer(catalog_, queryCtx_, authzConfig_); |
34 | analysisResult_.stmt_ = rewrittenStmt; |
35 | analysisResult_.stmt_.analyze(analysisResult_.analyzer_); |
36 | LOG.trace( "rewrittenStmt: " + rewrittenStmt.toSql()); |
38 | analysisResult_.stmt_.setIsExplain(); |
40 | } catch (AnalysisException e) { |
43 | } catch (Exception e) { |
44 | throw new AnalysisException(parser.getErrorMsg(stmt), e); |
上面的函数通过调用 SqlScanner 和 SqlParser 类实现具体的分析。可以查看
sql-scanner.flex
和
sql-parser.y
分析 SQL 语句的大概流程如下:
1. 处理这个 SQL 所涉及到的 Table(即 TableRefs),这些 Table 是在 from 从句中提取出来的(包含关键字
   from, join, on/using)。注意 JOIN 操作以及 on/using 条件是存储在参与 JOIN 操作的右边的表的 TableRef
   中并分析的。依次 analyze() 每个 TableRef,调用 registerBaseTableRef() 向 Analyzer 注册(填充 TupleDescriptor)。
   如果对应的 TableRef 涉及到 JOIN 操作,还要 analyzeJoin()。在 analyzeJoin() 时会调用 Analyzer 的 registerConjunct(),
   填充 Analyzer 的一些成员变量:conjuncts,tuplePredicates(TupleId 与 conjunct 的映射),slotPredicates(SlotId
   与 conjunct 的映射),eqJoinConjuncts。
2. 处理 select 从句(包含关键字 select, MAX(), AVG() 等聚集函数):分析这个 SQL 都 select 了哪几项,每一项都是个
   Expr 类型的子类对象,把这几项填入 resultExprs 数组和 colLabels。然后把 resultExprs 里面的 Expr 都递归 analyze
   一下,要分析到树的最底层,向 Analyzer 注册 SlotRef 等。
3. 分析 where 从句(关键字 where):首先递归 analyze 从句中 Expr 组成的树,然后调用 Analyzer 的 registerConjunct(),
   填充 Analyzer 的一些成员变量(同第 1 步,此外还要填充 whereClauseConjuncts)。
4. 处理 sort 相关信息(关键字 order by):先是解析 aliases 和 ordinals,然后从 order by 后面的从句中提取 Expr 填入
   orderingExprs,接着递归 analyze 从句中 Expr 组成的树,最后创建 SortInfo 对象。
5. 处理 aggregation 相关信息(关键字 group by, having, avg, max 等):首先递归分析 group by 从句里的 Expr,然后如果有
   having 从句,就像 where 从句一样,先是 analyze having 从句中 Expr 组成的树,然后调用 Analyzer 的 registerConjunct() 等。
6. 处理 InlineView。
至此,词法分析和语法分析都完成了,回到 frontend.createExecRequest()
(Frontend.java)
函数,开始填充 TExecRequest 内的成员变量。代码如下(部分):
2 | * Create a populated TExecRequest corresponding to the supplied TQueryCtx. |
4 | public TExecRequest createExecRequest(TQueryCtx queryCtx, StringBuilder explainString) |
5 | throws ImpalaException { |
7 | AnalysisContext.AnalysisResult analysisResult = analyzeStmt(queryCtx); |
8 | EventSequence timeline = analysisResult.getAnalyzer().getTimeline(); |
9 | timeline.markEvent( "Analysis finished" ); |
12 | Preconditions.checkNotNull(analysisResult.getStmt()); |
13 | TExecRequest result = new TExecRequest(); |
14 | result.setQuery_options(queryCtx.request.getQuery_options()); |
15 | result.setAccess_events(analysisResult.getAccessEvents()); |
16 | result.analysis_warnings = analysisResult.getAnalyzer().getWarnings(); |
18 | if (analysisResult.isCatalogOp()) { |
19 | result.stmt_type = TStmtType.DDL; |
20 | createCatalogOpRequest(analysisResult, result); |
21 | String jsonLineageGraph = analysisResult.getJsonLineageGraph(); |
22 | if (jsonLineageGraph != null && !jsonLineageGraph.isEmpty()) { |
23 | result.catalog_op_request.setLineage_graph(jsonLineageGraph); |
26 | if (!analysisResult.isCreateTableAsSelectStmt()) return result; |
27 | } else if (analysisResult.isLoadDataStmt()) { |
28 | result.stmt_type = TStmtType.LOAD; |
29 | result.setResult_set_metadata( new TResultSetMetadata(Arrays.asList( |
30 | new TColumn( "summary" , Type.STRING.toThrift())))); |
31 | result.setLoad_data_request(analysisResult.getLoadDataStmt().toThrift()); |
33 | } else if (analysisResult.isSetStmt()) { |
34 | result.stmt_type = TStmtType.SET; |
35 | result.setResult_set_metadata( new TResultSetMetadata(Arrays.asList( |
36 | new TColumn( "option" , Type.STRING.toThrift()), |
37 | new TColumn( "value" , Type.STRING.toThrift())))); |
38 | result.setSet_query_option_request(analysisResult.getSetStmt().toThrift()); |
如果是 DDL 命令(use, show tables, show databases, describe),那么调用 createCatalogOpRequest()。
如果是 Load Data 或者 Set 语句,就通过 setResult_set_metadata() 设置相应的结果集元数据,并把语句转换为 Thrift 结构填入请求。
执行计划生成
另外一种情况就是 Query 或者 DML 命令,那么就得创建和填充 TQueryExecRequest 了。该部分代码如下:
2 | * Create a populated TExecRequest corresponding to the supplied TQueryCtx. |
4 | public TExecRequest createExecRequest(TQueryCtx queryCtx, StringBuilder explainString) |
12 | Preconditions.checkState(analysisResult.isQueryStmt() || analysisResult.isDmlStmt() |
13 | || analysisResult.isCreateTableAsSelectStmt()); |
15 | TQueryExecRequest queryExecRequest = new TQueryExecRequest(); |
17 | LOG.debug( "create plan" ); |
18 | Planner planner = new Planner(analysisResult, queryCtx); |
21 | ArrayList<PlanFragment> fragments = planner.createPlan(); |
23 | List<ScanNode> scanNodes = Lists.newArrayList(); |
26 | Map<PlanFragment, Integer> fragmentIdx = Maps.newHashMap(); |
28 | for ( int fragmentId = 0 ; fragmentId < fragments.size(); ++fragmentId) { |
29 | PlanFragment fragment = fragments.get(fragmentId); |
30 | Preconditions.checkNotNull(fragment.getPlanRoot()); |
31 | fragment.getPlanRoot().collect(Predicates.instanceOf(ScanNode. class ), scanNodes); |
32 | fragmentIdx.put(fragment, fragmentId); |
上面的 createPlan() 函数是 frontend 最重要的函数:根据 SQL 解析的结果和 client 传入的 query options,
生成执行计划。执行计划是用 PlanFragment 的数组表示的,最后会序列化到 TQueryExecRequest.fragments
然后传给 backend 的 coordinator 去调度执行。现在让我们来看看 createPlan()
(Planner.java)
的具体实现:
2 | * Returns a list of plan fragments for executing an analyzed parse tree. |
3 | * May return a single-node or distributed executable plan. |
5 | public ArrayList<PlanFragment> createPlan() throws ImpalaException { |
6 | SingleNodePlanner singleNodePlanner = new SingleNodePlanner(ctx_); |
7 | DistributedPlanner distributedPlanner = new DistributedPlanner(ctx_); |
9 | PlanNode singleNodePlan = singleNodePlanner.createSingleNodePlan(); |
10 | ctx_.getRootAnalyzer().getTimeline().markEvent( "Single node plan created" ); |
11 | ArrayList<PlanFragment> fragments = null ; |
14 | MaxRowsProcessedVisitor visitor = new MaxRowsProcessedVisitor(); |
15 | singleNodePlan.accept(visitor); |
16 | long maxRowsProcessed = visitor.get() == - 1 ? Long.MAX_VALUE : visitor.get(); |
17 | boolean isSmallQuery = |
18 | maxRowsProcessed < ctx_.getQueryOptions().exec_single_node_rows_threshold; |
21 | ctx_.getQueryOptions().setNum_nodes( 1 ); |
22 | ctx_.getQueryOptions().setDisable_codegen( true ); |
23 | if (maxRowsProcessed < ctx_.getQueryOptions().batch_size || |
24 | maxRowsProcessed < 1024 && ctx_.getQueryOptions().batch_size == 0 ) { |
26 | ctx_.getQueryOptions().setNum_scanner_threads( 1 ); |
30 | if (ctx_.isSingleNodeExec()) { |
32 | fragments = Lists.newArrayList( new PlanFragment( |
33 | ctx_.getNextFragmentId(), singleNodePlan, DataPartition.UNPARTITIONED)); |
36 | fragments = distributedPlanner.createPlanFragments(singleNodePlan); |
39 | PlanFragment rootFragment = fragments.get(fragments.size() - 1 ); |
40 | if (ctx_.isInsertOrCtas()) { |
41 | InsertStmt insertStmt = ctx_.getAnalysisResult().getInsertStmt(); |
42 | if (!ctx_.isSingleNodeExec()) { |
44 | rootFragment = distributedPlanner.createInsertFragment( |
45 | rootFragment, insertStmt, ctx_.getRootAnalyzer(), fragments); |
48 | rootFragment.setSink(insertStmt.createDataSink()); |
51 | ColumnLineageGraph graph = ctx_.getRootAnalyzer().getColumnLineageGraph(); |
52 | List<Expr> resultExprs = null ; |
53 | Table targetTable = null ; |
54 | if (ctx_.isInsertOrCtas()) { |
55 | InsertStmt insertStmt = ctx_.getAnalysisResult().getInsertStmt(); |
56 | resultExprs = insertStmt.getResultExprs(); |
57 | targetTable = insertStmt.getTargetTable(); |
58 | graph.addTargetColumnLabels(targetTable); |
60 | resultExprs = ctx_.getQueryStmt().getResultExprs(); |
61 | graph.addTargetColumnLabels(ctx_.getQueryStmt().getColLabels()); |
63 | resultExprs = Expr.substituteList(resultExprs, |
64 | rootFragment.getPlanRoot().getOutputSmap(), ctx_.getRootAnalyzer(), true ); |
65 | rootFragment.setOutputExprs(resultExprs); |
66 | LOG.debug( "desctbl: " + ctx_.getRootAnalyzer().getDescTbl().debugString()); |
67 | LOG.debug( "resultexprs: " + Expr.debugString(rootFragment.getOutputExprs())); |
68 | LOG.debug( "finalize plan fragments" ); |
69 | for (PlanFragment fragment: fragments) { |
70 | fragment.finalize(ctx_.getRootAnalyzer()); |
73 | Collections.reverse(fragments); |
74 | ctx_.getRootAnalyzer().getTimeline().markEvent( "Distributed plan created" ); |
76 | if (RuntimeEnv.INSTANCE.computeLineage() || RuntimeEnv.INSTANCE.isTestEnv()) { |
78 | if (ctx_.isInsertOrCtas()) { |
79 | Preconditions.checkNotNull(targetTable); |
80 | List<Expr> exprs = Lists.newArrayList(); |
81 | if (targetTable instanceof HBaseTable) { |
82 | exprs.addAll(resultExprs); |
84 | exprs.addAll(ctx_.getAnalysisResult().getInsertStmt().getPartitionKeyExprs()); |
85 | exprs.addAll(resultExprs.subList( 0 , |
86 | targetTable.getNonClusteringColumns().size())); |
88 | graph.computeLineageGraph(exprs, ctx_.getRootAnalyzer()); |
90 | graph.computeLineageGraph(resultExprs, ctx_.getRootAnalyzer()); |
92 | LOG.trace( "lineage: " + graph.debugString()); |
93 | ctx_.getRootAnalyzer().getTimeline().markEvent( "Lineage info computed" ); |
createPlan 包括createSingleNodePlan 和 createPlanFragments
两个主要部分。其中第一个是单节点计划树,所有片段只能在一个节点(coordinator,即 coord)上执行,第二个是分布式执行计划树,片段可以分配到不同的节点中运行。我们先来看看 SingleNodePlanner.createSingleNodePlan()
(SingleNodePlanner.java)
该方法根据 Planner Context 中分析的语法树创建单节点执行计划树并返回根节点。规划过程递归遍历语法树并执行以下操作。在自上而下(top-down)阶段,对每个查询语句:
- materialize the slots required for evaluating expressions of that statement
- migrate conjuncts from parent blocks into inline views and union operands

In the bottom-up phase generate the plan tree for every query statement:
- perform join-order optimization when generating the plan of the FROM clause of a select statement; requires that all materialized slots are known for an accurate estimate of row sizes needed for cost-based join ordering
- assign conjuncts that can be evaluated at that node and compute the stats of that node (cardinality, etc.)
- apply combined expression substitution map of child plan nodes; if a plan node re-maps its input, set a substitution map to be applied by parents
具体代码如下:
2 | * Generates and returns the root of the single-node plan for the analyzed parse tree |
3 | * in the planner context. |
5 | public PlanNode createSingleNodePlan() throws ImpalaException { |
6 | QueryStmt queryStmt = ctx_.getQueryStmt(); |
9 | Analyzer analyzer = queryStmt.getAnalyzer(); |
10 | analyzer.computeEquivClasses(); |
11 | analyzer.getTimeline().markEvent( "Equivalence classes computed" ); |
22 | if (queryStmt.getBaseTblResultExprs() != null ) { |
23 | analyzer.materializeSlots(queryStmt.getBaseTblResultExprs()); |
26 | LOG.trace( "desctbl: " + analyzer.getDescTbl().debugString()); |
27 | PlanNode singleNodePlan = createQueryPlan(queryStmt, analyzer, |
28 | ctx_.getQueryOptions().isDisable_outermost_topn()); |
29 | Preconditions.checkNotNull(singleNodePlan); |
30 | return singleNodePlan; |
上面的函数通过调用私有的 createQueryPlan()
(SingleNodePlanner.java)
函数实现。该函数为单节点执行创建计划树。为查询语句中的
Select/Project/Join/Union [All]/Group by/Having/Order by
生成 PlanNode。具体实现代码如下:
2 | * Create plan tree for single-node execution. Generates PlanNodes for the |
3 | * Select/Project/Join/Union [All]/Group by/Having/Order by clauses of the query stmt. |
5 | private PlanNode createQueryPlan(QueryStmt stmt, Analyzer analyzer, boolean disableTopN) |
6 | throws ImpalaException { |
8 | if (analyzer.hasEmptyResultSet()) return createEmptyNode(stmt, analyzer); |
11 | if (stmt instanceof SelectStmt) { |
12 | SelectStmt selectStmt = (SelectStmt) stmt; |
14 | root = createSelectPlan(selectStmt, analyzer); |
17 | if (((SelectStmt) stmt).getAnalyticInfo() != null ) { |
18 | AnalyticInfo analyticInfo = selectStmt.getAnalyticInfo(); |
19 | ArrayList<TupleId> stmtTupleIds = Lists.newArrayList(); |
20 | stmt.getMaterializedTupleIds(stmtTupleIds); |
21 | AnalyticPlanner analyticPlanner = |
22 | new AnalyticPlanner(stmtTupleIds, analyticInfo, analyzer, ctx_); |
23 | List<Expr> inputPartitionExprs = Lists.newArrayList(); |
24 | AggregateInfo aggInfo = selectStmt.getAggInfo(); |
25 | root = analyticPlanner.createSingleNodePlan(root, |
26 | aggInfo != null ? aggInfo.getGroupingExprs() : null , inputPartitionExprs); |
27 | if (aggInfo != null && !inputPartitionExprs.isEmpty()) { |
29 | aggInfo.setPartitionExprs(inputPartitionExprs); |
33 | Preconditions.checkState(stmt instanceof UnionStmt); |
34 | root = createUnionPlan((UnionStmt) stmt, analyzer); |
38 | boolean sortHasMaterializedSlots = false ; |
39 | if (stmt.evaluateOrderBy()) { |
40 | for (SlotDescriptor sortSlotDesc: |
41 | stmt.getSortInfo().getSortTupleDescriptor().getSlots()) { |
42 | if (sortSlotDesc.isMaterialized()) { |
43 | sortHasMaterializedSlots = true ; |
49 | if (stmt.evaluateOrderBy() && sortHasMaterializedSlots) { |
50 | long limit = stmt.getLimit(); |
53 | boolean useTopN = stmt.hasLimit() && !disableTopN; |
55 | root = new SortNode(ctx_.getNextNodeId(), root, stmt.getSortInfo(), |
56 | useTopN, stmt.getOffset()); |
57 | Preconditions.checkState(root.hasValidStats()); |
61 | root.setLimit(stmt.getLimit()); |
62 | root.computeStats(analyzer); |
SingleNodePlanner.createSelectPlan()
(SingleNodePlanner.java)
函数创建实现 select 查询语句块中
Select/Project/Join/Group by/Having 等从句的 PlanNode 树。具体实现代码如下:
2 | * Create tree of PlanNodes that implements the Select/Project/Join/Group by/Having |
3 | * of the selectStmt query block. |
5 | private PlanNode createSelectPlan(SelectStmt selectStmt, Analyzer analyzer) |
6 | throws ImpalaException { |
9 | if (selectStmt.getTableRefs().isEmpty()) { |
10 | return createConstantSelectPlan(selectStmt, analyzer); |
25 | selectStmt.materializeRequiredSlots(analyzer); |
27 | ArrayList<TupleId> rowTuples = Lists.newArrayList(); |
29 | for (TableRef tblRef: selectStmt.getTableRefs()) { |
30 | rowTuples.addAll(tblRef.getMaterializedTupleIds()); |
37 | if (analyzer.hasEmptySpjResultSet()) { |
38 | PlanNode emptySetNode = new EmptySetNode(ctx_.getNextNodeId(), rowTuples); |
39 | emptySetNode.init(analyzer); |
40 | emptySetNode.setOutputSmap(selectStmt.getBaseTblSmap()); |
41 | return createAggregationPlan(selectStmt, analyzer, emptySetNode); |
46 | List<Pair<TableRef, PlanNode>> refPlans = Lists.newArrayList(); |
47 | for (TableRef ref: selectStmt.getTableRefs()) { |
48 | PlanNode plan = createTableRefNode(analyzer, ref); |
49 | Preconditions.checkState(plan != null ); |
50 | refPlans.add( new Pair(ref, plan)); |
53 | for (Pair<TableRef, PlanNode> entry: refPlans) { |
54 | entry.second.setAssignedConjuncts(analyzer.getAssignedConjuncts()); |
59 | if (!selectStmt.getSelectList().isStraightJoin()) { |
60 | Set<ExprId> assignedConjuncts = analyzer.getAssignedConjuncts(); |
61 | root = createCheapestJoinPlan(analyzer, refPlans); |
62 | if (root == null ) analyzer.setAssignedConjuncts(assignedConjuncts); |
65 | if (selectStmt.getSelectList().isStraightJoin() || root == null ) { |
68 | root = createFromClauseJoinPlan(analyzer, refPlans); |
69 | Preconditions.checkNotNull(root); |
73 | if (selectStmt.getAggInfo() != null ) { |
74 | root = createAggregationPlan(selectStmt, analyzer, root); |
上面函数中调用的主要私有方法有:
createTableRefNode()、createCheapestJoinPlan()、 createFromClauseJoinPlan()、 createAggregationPlan(),各个函数的具体实现如下:
createTableRefNode()
2 | * Create a tree of PlanNodes for the given tblRef, which can be a BaseTableRef, |
3 | * CollectionTableRef or an InlineViewRef. |
5 | private PlanNode createTableRefNode(Analyzer analyzer, TableRef tblRef) |
6 | throws ImpalaException { |
7 | if (tblRef instanceof BaseTableRef || tblRef instanceof CollectionTableRef) { |
9 | return createScanNode(analyzer, tblRef); |
10 | } else if (tblRef instanceof InlineViewRef) { |
12 | return createInlineViewPlan(analyzer, (InlineViewRef) tblRef); |
14 | throw new InternalException( |
15 | "Unknown TableRef node: " + tblRef.getClass().getSimpleName()); |
createCheapestJoinPlan()
2 | * 返回物化 join refPlans 中所有 TblRefs 开销最小的 plan |
3 | * 假设 refPlans 中的顺序和查询中的原始顺序相同 |
5 | * - the plan is executable, ie, all non-cross joins have equi-join predicates |
6 | * - the leftmost scan is over the largest of the inputs for which we can still |
7 | * construct an executable plan(左边的是最大表) |
8 | * - all rhs's(right hand side?) are in decreasing order of selectiveness (percentage of rows they |
10 | * - outer/cross/semi joins: rhs serialized size is < lhs serialized size;(右边的表比左边的小) |
11 | * enforced via join inversion, if necessary(否则通过 join 反转实现) |
12 | * Returns null if we can't create an executable plan. |
14 | private PlanNode createCheapestJoinPlan( |
15 | Analyzer analyzer, List<Pair<TableRef, PlanNode>> refPlans) |
16 | throws ImpalaException { |
17 | LOG.trace( "createCheapestJoinPlan" ); |
18 | if (refPlans.size() == 1 ) return refPlans.get( 0 ).second; |
22 | ArrayList<Pair<TableRef, Long>> candidates = Lists.newArrayList(); |
23 | for (Pair<TableRef, PlanNode> entry: refPlans) { |
24 | TableRef ref = entry.first; |
25 | JoinOperator joinOp = ref.getJoinOp(); |
35 | if (((joinOp.isOuterJoin() || joinOp.isSemiJoin() || joinOp.isCrossJoin()) && |
36 | ref != refPlans.get( 1 ).first) || joinOp.isNullAwareLeftAntiJoin()) { |
41 | PlanNode plan = entry.second; |
42 | if (plan.getCardinality() == - 1 ) { |
45 | candidates.add( new Pair(ref, new Long( 0 ))); |
46 | LOG.trace( "candidate " + ref.getUniqueAlias() + ": 0" ); |
49 | Preconditions.checkNotNull(ref.getDesc()); |
50 | long materializedSize = |
51 | ( long ) Math.ceil(plan.getAvgRowSize() * ( double ) plan.getCardinality()); |
52 | candidates.add( new Pair(ref, new Long(materializedSize))); |
53 | LOG.trace( "candidate " + ref.getUniqueAlias() + ": " + Long.toString(materializedSize)); |
55 | if (candidates.isEmpty()) return null ; |
59 | Collections.sort(candidates, |
60 | new Comparator<Pair<TableRef, Long>>() { |
61 | public int compare(Pair<TableRef, Long> a, Pair<TableRef, Long> b) { |
62 | long diff = b.second - a.second; |
63 | return (diff < 0 ? - 1 : (diff > 0 ? 1 : 0 )); |
68 | for (Pair<TableRef, Long> candidate: candidates) { |
69 | PlanNode result = createJoinPlan(analyzer, candidate.first, refPlans); |
70 | if (result != null ) return result; |
createFromClauseJoinPlan()
2 | * 返回按照 from 语句顺序的 JoinPlan |
4 | private PlanNode createFromClauseJoinPlan( |
5 | Analyzer analyzer, List<Pair<TableRef, PlanNode>> refPlans) |
6 | throws ImpalaException { |
8 | Preconditions.checkState(!refPlans.isEmpty()); |
9 | PlanNode root = refPlans.get( 0 ).second; |
10 | for ( int i = 1 ; i < refPlans.size(); ++i) { |
11 | TableRef innerRef = refPlans.get(i).first; |
12 | PlanNode innerPlan = refPlans.get(i).second; |
13 | root = createJoinNode(analyzer, root, innerPlan, null , innerRef); |
14 | root.setId(ctx_.getNextNodeId()); |
createAggregationPlan()
2 | * Returns a new AggregationNode that materializes the aggregation of the given stmt. |
3 | * Assigns conjuncts from the Having clause to the returned node. |
5 | private PlanNode createAggregationPlan(SelectStmt selectStmt, Analyzer analyzer, |
6 | PlanNode root) throws InternalException { |
7 | Preconditions.checkState(selectStmt.getAggInfo() != null ); |
9 | AggregateInfo aggInfo = selectStmt.getAggInfo(); |
10 | root = new AggregationNode(ctx_.getNextNodeId(), root, aggInfo); |
12 | Preconditions.checkState(root.hasValidStats()); |
15 | if (aggInfo.isDistinctAgg()) { |
16 | ((AggregationNode)root).unsetNeedsFinalize(); |
18 | ((AggregationNode)root).setIntermediateTuple(); |
19 | root = new AggregationNode(ctx_.getNextNodeId(), root, |
20 | aggInfo.getSecondPhaseDistinctAggInfo()); |
22 | Preconditions.checkState(root.hasValidStats()); |
25 | root.assignConjuncts(analyzer); |
上面的 createCheapestJoinPlan() 和 createFromClauseJoinPlan()
方法调用了 createJoinNode() 和 createJoinPlan() 两个方法。它们的具体实现如下:
createJoinNode()
2 | * 创建 join outer 和 inner 的 node。两者其中之一可能是一个根据 table ref 创建的 plan |
3 | * 但不能同时都是 plan。对应的 outer/inner tableRef 不能为空 |
5 | private PlanNode createJoinNode( |
6 | Analyzer analyzer, PlanNode outer, PlanNode inner, TableRef outerRef, |
7 | TableRef innerRef) throws ImpalaException { |
8 | Preconditions.checkState(innerRef != null ^ outerRef != null ); |
9 | TableRef tblRef = (innerRef != null ) ? innerRef : outerRef; |
11 | List<BinaryPredicate> eqJoinConjuncts = Lists.newArrayList(); |
12 | List<Expr> eqJoinPredicates = Lists.newArrayList(); |
15 | if (innerRef != null ) { |
16 | getHashLookupJoinConjuncts( |
17 | analyzer, outer.getTblRefIds(), innerRef, eqJoinConjuncts, eqJoinPredicates); |
19 | if (!innerRef.getJoinOp().isOuterJoin()) { |
20 | analyzer.createEquivConjuncts(outer.getTblRefIds(), innerRef.getId(), |
24 | getHashLookupJoinConjuncts( |
25 | analyzer, inner.getTblRefIds(), outerRef, eqJoinConjuncts, eqJoinPredicates); |
27 | if (!outerRef.getJoinOp().isOuterJoin()) { |
28 | analyzer.createEquivConjuncts(inner.getTblRefIds(), outerRef.getId(), |
32 | for (BinaryPredicate eqJoinConjunct: eqJoinConjuncts) { |
33 | Expr swapTmp = eqJoinConjunct.getChild( 0 ); |
34 | eqJoinConjunct.setChild( 0 , eqJoinConjunct.getChild( 1 )); |
35 | eqJoinConjunct.setChild( 1 , swapTmp); |
40 | if (eqJoinConjuncts.isEmpty()) { |
46 | if (tblRef.getJoinOp().isOuterJoin() || |
47 | tblRef.getJoinOp().isSemiJoin()) { |
48 | throw new NotImplementedException( |
49 | String.format( "%s join with '%s' without equi-join " + |
50 | "conjuncts is not supported." , |
51 | tblRef.getJoinOp().isOuterJoin() ? "Outer" : "Semi" , |
52 | innerRef.getUniqueAlias())); |
54 | CrossJoinNode result = |
55 | new CrossJoinNode(outer, inner, tblRef, Collections.<Expr>emptyList()); |
56 | result.init(analyzer); |
61 | if (tblRef.getJoinOp() == JoinOperator.CROSS_JOIN) { |
62 | tblRef.setJoinOp(JoinOperator.INNER_JOIN); |
65 | analyzer.markConjunctsAssigned(eqJoinPredicates); |
67 | List<Expr> otherJoinConjuncts = Lists.newArrayList(); |
68 | if (tblRef.getJoinOp().isOuterJoin()) { |
71 | otherJoinConjuncts = analyzer.getUnassignedOjConjuncts(tblRef); |
72 | } else if (tblRef.getJoinOp().isSemiJoin()) { |
77 | analyzer.getUnassignedConjuncts(tblRef.getAllTupleIds(), false ); |
78 | if (tblRef.getJoinOp().isNullAwareLeftAntiJoin()) { |
79 | boolean hasNullMatchingEqOperator = false ; |
83 | Iterator<BinaryPredicate> it = eqJoinConjuncts.iterator(); |
84 | while (it.hasNext()) { |
85 | BinaryPredicate conjunct = it.next(); |
86 | if (!conjunct.isNullMatchingEq()) { |
87 | otherJoinConjuncts.add(conjunct); |
91 | Preconditions.checkState(!hasNullMatchingEqOperator); |
92 | hasNullMatchingEqOperator = true ; |
95 | Preconditions.checkState(hasNullMatchingEqOperator); |
98 | analyzer.markConjunctsAssigned(otherJoinConjuncts); |
100 | HashJoinNode result = |
101 | new HashJoinNode(outer, inner, tblRef, eqJoinConjuncts, otherJoinConjuncts); |
102 | result.init(analyzer); |
createJoinPlan()
2 | * Returns a plan with leftmostRef's plan as its leftmost input; the joins |
3 | * are in decreasing order of selectiveness (percentage of rows they eliminate). |
4 | * The leftmostRef's join will be inverted if it is an outer/semi/cross join. |
6 | private PlanNode createJoinPlan( |
7 | Analyzer analyzer, TableRef leftmostRef, List<Pair<TableRef, PlanNode>> refPlans) |
8 | throws ImpalaException { |
10 | LOG.trace( "createJoinPlan: " + leftmostRef.getUniqueAlias()); |
12 | List<Pair<TableRef, PlanNode>> remainingRefs = Lists.newArrayList(); |
14 | for (Pair<TableRef, PlanNode> entry: refPlans) { |
15 | if (entry.first == leftmostRef) { |
18 | remainingRefs.add(entry); |
21 | Preconditions.checkNotNull(root); |
23 | Set<TableRef> joinedRefs = Sets.newHashSet(); |
24 | joinedRefs.add(leftmostRef); |
27 | boolean planHasInvertedJoin = false ; |
28 | if (leftmostRef.getJoinOp().isOuterJoin() |
29 | || leftmostRef.getJoinOp().isSemiJoin() |
30 | || leftmostRef.getJoinOp().isCrossJoin()) { |
35 | leftmostRef.invertJoin(refPlans, analyzer); |
36 | planHasInvertedJoin = true ; |
41 | while (!remainingRefs.isEmpty()) { |
43 | PlanNode newRoot = null ; |
44 | Pair<TableRef, PlanNode> minEntry = null ; |
45 | for (Pair<TableRef, PlanNode> entry: remainingRefs) { |
46 | TableRef ref = entry.first; |
47 | LOG.trace(Integer.toString(i) + " considering ref " + ref.getUniqueAlias()); |
56 | JoinOperator joinOp = ref.getJoinOp(); |
57 | if (joinOp.isOuterJoin() || joinOp.isSemiJoin()) { |
58 | List<TupleId> currentTids = Lists.newArrayList(root.getTblRefIds()); |
59 | currentTids.add(ref.getId()); |
64 | List<TupleId> tableRefTupleIds = ref.getAllTupleIds(); |
65 | if (!currentTids.containsAll(tableRefTupleIds) || |
66 | !tableRefTupleIds.containsAll(currentTids)) { |
71 | } else if (ref.getJoinOp().isCrossJoin()) { |
72 | if (!joinedRefs.contains(ref.getLeftTblRef())) continue ; |
75 | PlanNode rhsPlan = entry.second; |
76 | analyzer.setAssignedConjuncts(root.getAssignedConjuncts()); |
78 | boolean invertJoin = false ; |
79 | if (joinOp.isOuterJoin() || joinOp.isSemiJoin() || joinOp.isCrossJoin()) { |
85 | long lhsCard = root.getCardinality(); |
86 | long rhsCard = rhsPlan.getCardinality(); |
87 | if (lhsCard != - 1 && rhsCard != - 1 && |
88 | lhsCard * root.getAvgRowSize() < rhsCard * rhsPlan.getAvgRowSize() && |
89 | !joinOp.isNullAwareLeftAntiJoin()) { |
93 | PlanNode candidate = null ; |
95 | ref.setJoinOp(ref.getJoinOp().invert()); |
96 | candidate = createJoinNode(analyzer, rhsPlan, root, ref, null ); |
97 | planHasInvertedJoin = true ; |
99 | candidate = createJoinNode(analyzer, root, rhsPlan, null , ref); |
101 | if (candidate == null ) continue ; |
102 | LOG.trace( "cardinality=" + Long.toString(candidate.getCardinality())); |
106 | if (joinOp.isOuterJoin() || joinOp.isSemiJoin()) { |
114 | || (candidate.getClass().equals(newRoot.getClass()) |
115 | && candidate.getCardinality() < newRoot.getCardinality()) |
116 | || (candidate instanceof HashJoinNode && newRoot instanceof CrossJoinNode)) { |
121 | if (newRoot == null ) { |
129 | Preconditions.checkState(!planHasInvertedJoin); |
135 | long lhsCardinality = root.getCardinality(); |
136 | long rhsCardinality = minEntry.second.getCardinality(); |
137 | numOps += lhsCardinality + rhsCardinality; |
138 | LOG.debug(Integer.toString(i) + " chose " + minEntry.first.getUniqueAlias() |
139 | + " #lhs=" + Long.toString(lhsCardinality) |
140 | + " #rhs=" + Long.toString(rhsCardinality) |
141 | + " #ops=" + Long.toString(numOps)); |
142 | remainingRefs.remove(minEntry); |
143 | joinedRefs.add(minEntry.first); |
147 | root.setId(ctx_.getNextNodeId()); |
148 | analyzer.setAssignedConjuncts(root.getAssignedConjuncts()); |
至此我们已经大概介绍了 createSingleNodePlan 的过程。
现在让我们回到 createPlan() 函数,来看看创建分布式执行计划树的过程,即 createPlanFragments 过程。
DistributedPlanner.createPlanFragments()
(Planner.java)
该方法为单节点计划树(single-node plan)生成多个计划片段(fragment)。具体代码如下:
3 | * 片段通过 list 返回,list 中位置 i 的片段只能使用片段 j 的输出(j > i)。 |
5 | * TODO: 考虑计划片段中的数据分片; 尤其是要比 createQueryPlan() 更加注重协调 |
6 | * 聚集操作中 hash partitioning 以及分析计算中的 hash partitioning。 |
7 | * (只有在相同 select 块中进行聚集和分析计算时才会发生协调) |
9 | public ArrayList<PlanFragment> createPlanFragments( |
10 | PlanNode singleNodePlan) throws ImpalaException { |
11 | Preconditions.checkState(!ctx_.isSingleNodeExec()); |
12 | AnalysisContext.AnalysisResult analysisResult = ctx_.getAnalysisResult(); |
13 | QueryStmt queryStmt = ctx_.getQueryStmt(); |
14 | ArrayList<PlanFragment> fragments = Lists.newArrayList(); |
17 | boolean isPartitioned = false ; |
18 | if ((analysisResult.isInsertStmt() || analysisResult.isCreateTableAsSelectStmt()) |
19 | && !singleNodePlan.hasLimit()) { |
20 | Preconditions.checkState(!queryStmt.hasOffset()); |
23 | LOG.debug( "create plan fragments" ); |
24 | long perNodeMemLimit = ctx_.getQueryOptions().mem_limit; |
25 | LOG.debug( "memlimit=" + Long.toString(perNodeMemLimit)); |
27 | createPlanFragments(singleNodePlan, isPartitioned, perNodeMemLimit, fragments); |
上面的方法调用了同名的私有成员方法 DistributedPlanner.createPlanFragments()(四个参数的重载版本)。
DistributedPlanner.java
该方法返回生成 root 结果的 fragments。具体代码如下:
2 | * 返回生成 'root' 结果的 fragments; 递归创建所有 input fragments 到返回的 fragment |
3 | * 如果创建了一个新的 fragment,会被追加到 'fragments',这样 fragment 就会在所有需要 |
5 | * 如果 'isPartitioned' 为否,那么返回的 fragment 就是 unpartitioned; |
6 | * 否则就可能是 partitioned, 取决于它的输入是否 partitioned; |
7 | * the partition function is derived from the inputs. |
9 | private PlanFragment createPlanFragments( |
10 | PlanNode root, boolean isPartitioned, |
11 | long perNodeMemLimit, ArrayList<PlanFragment> fragments) |
12 | throws InternalException, NotImplementedException { |
13 | ArrayList<PlanFragment> childFragments = Lists.newArrayList(); |
14 | for (PlanNode child: root.getChildren()) { |
18 | boolean childIsPartitioned = !child.hasLimit(); |
22 | child, childIsPartitioned, perNodeMemLimit, fragments)); |
25 | PlanFragment result = null ; |
26 | if (root instanceof ScanNode) { |
27 | result = createScanFragment(root); |
28 | fragments.add(result); |
29 | } else if (root instanceof HashJoinNode) { |
30 | Preconditions.checkState(childFragments.size() == 2 ); |
31 | result = createHashJoinFragment( |
32 | (HashJoinNode) root, childFragments.get( 1 ), childFragments.get( 0 ), |
33 | perNodeMemLimit, fragments); |
34 | } else if (root instanceof CrossJoinNode) { |
35 | Preconditions.checkState(childFragments.size() == 2 ); |
36 | result = createCrossJoinFragment( |
37 | (CrossJoinNode) root, childFragments.get( 1 ), childFragments.get( 0 ), |
38 | perNodeMemLimit, fragments); |
39 | } else if (root instanceof SelectNode) { |
40 | result = createSelectNodeFragment((SelectNode) root, childFragments); |
41 | } else if (root instanceof UnionNode) { |
42 | result = createUnionNodeFragment((UnionNode) root, childFragments, fragments); |
43 | } else if (root instanceof AggregationNode) { |
44 | result = createAggregationFragment( |
45 | (AggregationNode) root, childFragments.get( 0 ), fragments); |
46 | } else if (root instanceof SortNode) { |
47 | if (((SortNode) root).isAnalyticSort()) { |
49 | result = createAnalyticFragment( |
50 | (SortNode) root, childFragments.get( 0 ), fragments); |
52 | result = createOrderByFragment( |
53 | (SortNode) root, childFragments.get( 0 ), fragments); |
55 | } else if (root instanceof AnalyticEvalNode) { |
56 | result = createAnalyticFragment(root, childFragments.get( 0 ), fragments); |
57 | } else if (root instanceof EmptySetNode) { |
58 | result = new PlanFragment( |
59 | ctx_.getNextFragmentId(), root, DataPartition.UNPARTITIONED); |
61 | throw new InternalException( |
62 | "Cannot create plan fragment for this node type: " + root.getExplainString()); |
65 | fragments.remove(result); |
66 | fragments.add(result); |
68 | if (!isPartitioned && result.isPartitioned()) { |
69 | result = createMergeFragment(result); |
70 | fragments.add(result); |
上面的方法调用了大量的 create*Fragment() 私有成员方法。这些成员方法的具体实现可以查看源文件:
DistributedPlanner.java
这些成员方法都返回了 PlanFragment 实例,关于该类的具体实现可以查看源代码:
PlanFragment.java
至此,我们大概介绍了 createPlanFragments 的过程。
由于 createSingleNodePlan 和 createPlanFragments 两个 createPlan 最重要的部分都已经介绍了,
createPlan 也就介绍到这里。现在让我们回到 frontend.createExecRequest()
继续来看剩下的内容。frontend.createExecRequest() 其余代码如下:
2 | * Create a populated TExecRequest corresponding to the supplied TQueryCtx. |
4 | public TExecRequest createExecRequest(TQueryCtx queryCtx, StringBuilder explainString) |
5 | throws ImpalaException { |
13 | for ( int i = 1 ; i < fragments.size(); ++i) { |
14 | PlanFragment dest = fragments.get(i).getDestFragment(); |
15 | Integer idx = fragmentIdx.get(dest); |
16 | Preconditions.checkState(idx != null ); |
17 | queryExecRequest.addToDest_fragment_idx(idx.intValue()); |
22 | LOG.debug( "get scan range locations" ); |
23 | Set<TTableName> tablesMissingStats = Sets.newTreeSet(); |
24 | for (ScanNode scanNode: scanNodes) { |
25 | queryExecRequest.putToPer_node_scan_ranges( |
26 | scanNode.getId().asInt(), |
27 | scanNode.getScanRangeLocations()); |
28 | if (scanNode.isTableMissingStats()) { |
29 | tablesMissingStats.add(scanNode.getTupleDesc().getTableName().toThrift()); |
33 | queryExecRequest.setHost_list(analysisResult.getAnalyzer().getHostIndex().getList()); |
34 | for (TTableName tableName: tablesMissingStats) { |
35 | queryCtx.addToTables_missing_stats(tableName); |
40 | if (queryCtx.request.query_options.isDisable_unsafe_spills() |
41 | && !tablesMissingStats.isEmpty() |
42 | && !analysisResult.getAnalyzer().hasPlanHints()) { |
43 | queryCtx.setDisable_spilling( true ); |
48 | planner.computeResourceReqs(fragments, true , queryExecRequest); |
49 | } catch (Exception e) { |
51 | LOG.error( "Failed to compute resource requirements for query\n" + |
52 | queryCtx.request.getStmt(), e); |
56 | for (PlanFragment fragment: fragments) { |
57 | TPlanFragment thriftFragment = fragment.toThrift(); |
58 | queryExecRequest.addToFragments(thriftFragment); |
62 | TExplainLevel explainLevel = TExplainLevel.VERBOSE; |
64 | if (analysisResult.isExplainStmt() || RuntimeEnv.INSTANCE.isTestEnv()) { |
65 | explainLevel = queryCtx.request.query_options.getExplain_level(); |
69 | queryExecRequest.setQuery_ctx(queryCtx); |
72 | planner.getExplainString(fragments, queryExecRequest, explainLevel)); |
73 | queryExecRequest.setQuery_plan(explainString.toString()); |
74 | queryExecRequest.setDesc_tbl(analysisResult.getAnalyzer().getDescTbl().toThrift()); |
76 | String jsonLineageGraph = analysisResult.getJsonLineageGraph(); |
77 | if (jsonLineageGraph != null && !jsonLineageGraph.isEmpty()) { |
78 | queryExecRequest.setLineage_graph(jsonLineageGraph); |
81 | if (analysisResult.isExplainStmt()) { |
83 | createExplainRequest(explainString.toString(), result); |
87 | result.setQuery_exec_request(queryExecRequest); |
89 | if (analysisResult.isQueryStmt()) { |
91 | LOG.debug( "create result set metadata" ); |
92 | result.stmt_type = TStmtType.QUERY; |
93 | result.query_exec_request.stmt_type = result.stmt_type; |
94 | TResultSetMetadata metadata = new TResultSetMetadata(); |
95 | QueryStmt queryStmt = analysisResult.getQueryStmt(); |
96 | int colCnt = queryStmt.getColLabels().size(); |
97 | for ( int i = 0 ; i < colCnt; ++i) { |
98 | TColumn colDesc = new TColumn(); |
99 | colDesc.columnName = queryStmt.getColLabels().get(i); |
100 | colDesc.columnType = queryStmt.getResultExprs().get(i).getType().toThrift(); |
101 | metadata.addToColumns(colDesc); |
103 | result.setResult_set_metadata(metadata); |
105 | Preconditions.checkState(analysisResult.isInsertStmt() || |
106 | analysisResult.isCreateTableAsSelectStmt()); |
111 | analysisResult.isCreateTableAsSelectStmt() ? TStmtType.DDL : TStmtType.DML; |
112 | result.query_exec_request.stmt_type = TStmtType.DML; |
115 | InsertStmt insertStmt = analysisResult.getInsertStmt(); |
116 | if (insertStmt.getTargetTable() instanceof HdfsTable) { |
117 | TFinalizeParams finalizeParams = new TFinalizeParams(); |
118 | finalizeParams.setIs_overwrite(insertStmt.isOverwrite()); |
119 | finalizeParams.setTable_name(insertStmt.getTargetTableName().getTbl()); |
120 | finalizeParams.setTable_id(insertStmt.getTargetTable().getId().asInt()); |
121 | String db = insertStmt.getTargetTableName().getDb(); |
122 | finalizeParams.setTable_db(db == null ? queryCtx.session.database : db); |
123 | HdfsTable hdfsTable = (HdfsTable) insertStmt.getTargetTable(); |
124 | finalizeParams.setHdfs_base_dir(hdfsTable.getHdfsBaseDir()); |
125 | finalizeParams.setStaging_dir( |
126 | hdfsTable.getHdfsBaseDir() + "/_impala_insert_staging" ); |
127 | queryExecRequest.setFinalize_params(finalizeParams); |
131 | validateTableIds(analysisResult.getAnalyzer(), result); |
133 | timeline.markEvent( "Planning finished" ); |
134 | result.setTimeline(analysisResult.getAnalyzer().getTimeline().toThrift()); |
至此,FE(frontend)的处理结束,返回 TExecRequest 类型的对象给 backend 执行。
由于笔者刚开始接触 Impala,分析可能存在某些谬误,有任何疑问或建议都欢迎讨论。