Analyzer主要职责就是将通过Sql Parser未能Resolved的Logical Plan给Resolved掉。
lazy val analyzed: LogicalPlan = analyzer.execute(logical) // the analyzed LogicalPlan: result of running the analyzer over `logical`
/**
 * Analyzer used by this context, created lazily on first access.
 *
 * It is an anonymous subclass of Analyzer built from `catalog`, `functionRegistry`
 * and `conf`, overriding the two extension points with extra resolution rules and
 * one extra check rule.
 */
protected[sql] lazy val analyzer: Analyzer =
  new Analyzer(catalog, functionRegistry, conf) {
    // Extra resolution rules, in this order: ExtractPythonUdfs, then PreInsertCastAndRename.
    override val extendedResolutionRules =
      List(ExtractPythonUdfs, sources.PreInsertCastAndRename)

    // Single extra check rule, constructed with the same catalog as the analyzer.
    override val extendedCheckRules =
      sources.PreWriteCheck(catalog) :: Nil
  }
/**
 * A RuleExecutor over LogicalPlan that also mixes in HiveTypeCoercion and CheckAnalysis.
 * The visible part of the class defines the name resolver, the fixed-point strategy
 * and the rule batches; the remainder of the body is elided in this excerpt.
 *
 * @param catalog       not referenced in the visible part of the class
 * @param registry      not referenced in the visible part of the class
 * @param conf          supplies `caseSensitiveAnalysis`, which selects the resolver
 * @param maxIterations iteration bound handed to FixedPoint (defaults to 100)
 */
class Analyzer(
catalog: Catalog,
registry: FunctionRegistry,
conf: CatalystConf,
maxIterations: Int = 100)
extends RuleExecutor[LogicalPlan] with HiveTypeCoercion with CheckAnalysis {
// Name resolver: case-sensitive or case-insensitive, depending on conf.caseSensitiveAnalysis.
def resolver: Resolver = {
if (conf.caseSensitiveAnalysis) {
caseSensitiveResolution
} else {
caseInsensitiveResolution
}
}
// Strategy shared by both batches below, bounded by maxIterations.
val fixedPoint = FixedPoint(maxIterations)
/**
* Override to provide additional rules for the "Resolution" batch.
*/
val extendedResolutionRules: Seq[Rule[LogicalPlan]] = Nil
// Each Batch is a named group of rules that is run with its own strategy.
lazy val batches: Seq[Batch] = Seq(
Batch("Substitution", fixedPoint,
CTESubstitution ::
WindowsSubstitution ::
Nil : _*),
Batch("Resolution", fixedPoint,
// Resolves table names through the catalog.
ResolveRelations ::
// Resolves attributes produced by child operators, typically referenced through an alias, e.g. a.id.
ResolveReferences ::
ResolveGroupingAnalytics ::
// In a SELECT statement the ORDER BY attributes are often absent from the select list; they still
// have to be fetched for sorting and are removed again once the sort is done.
ResolveSortReferences ::
ResolveGenerate ::
// Resolves functions.
ResolveFunctions ::
ExtractWindowExpressions ::
// Resolves global aggregate functions, e.g. select sum(score) from table.
GlobalAggregates ::
// Resolves aggregate filter conditions in a HAVING clause, e.g. having sum(score) > 400.
UnresolvedHavingClauseAttributes ::
TrimGroupingAliases ::
// typeCoercionRules are the Hive type-coercion rules; extendedResolutionRules come last.
typeCoercionRules ++
extendedResolutionRules : _*)
)
…
}
其中lazy val analyzed: LogicalPlan = analyzer.execute(logical),logical就是sqlparser解析出来的unresolved logical plan,analyzed就是analyzed logical plan。那么execute究竟是怎么样的过程呢?
/**
 * Runs every batch in `batches` over `plan`, in order, and returns the resulting tree.
 *
 * Within one pass of a batch the rules are applied left to right, each rule receiving
 * the output of the previous one. Passes are repeated until the plan no longer changes
 * (fixed point) or the pass count exceeds `batch.strategy.maxIterations`.
 *
 * @param plan the tree to transform
 * @return the tree produced after all batches have been run
 */
def execute(plan: TreeType): TreeType = {
  var curPlan = plan
  batches.foreach { batch => // process each Batch in turn
    val batchStartPlan = curPlan
    var iteration = 1
    var lastPlan = curPlan
    var continue = true

    // Run until fix point (or the max number of iterations as specified in the strategy.
    // The loop only stops once a full pass over the batch's rules leaves the plan
    // unchanged, or the iteration budget is exhausted.
    while (continue) {
      // foldLeft walks the rules from left to right, threading the plan through them
      // (foldRight would walk them from right to left).
      // Note: `plan` inside this fold shadows the method parameter of the same name.
      curPlan = batch.rules.foldLeft(curPlan) {
        case (plan, rule) =>
          val result = rule(plan) // rule.apply transforms the TreeNodes of this plan
          // Extra logging added for the walkthrough: shows the plan before and after every rule.
          logInfo(s"plan (${plan}) \n result (${result}) \n rule (${rule})")
          if (!result.fastEquals(plan)) {
            logTrace(
              s"""
                 |=== Applying Rule ${rule.ruleName} ===
                 |${sideBySide(plan.treeString, result.treeString).mkString("\n")}
                 |""".stripMargin)
          }
          result
      }
      iteration += 1
      if (iteration > batch.strategy.maxIterations) {
        // Only log if this is a rule that is supposed to run more than once.
        if (iteration != 2) {
          logInfo(s"Max iterations (${iteration - 1}) reached for batch ${batch.name}")
        }
        continue = false
      }

      if (curPlan.fastEquals(lastPlan)) {
        logTrace(
          s"Fixed point reached for batch ${batch.name} after ${iteration - 1} iterations.")
        continue = false
      }
      lastPlan = curPlan
    }

    if (!batchStartPlan.fastEquals(curPlan)) {
      // Diff against the plan this batch started from, not against the original argument
      // of execute; otherwise changes made by earlier batches would be attributed to
      // this batch in the log.
      logDebug(
        s"""
           |=== Result of Batch ${batch.name} ===
           |${sideBySide(batchStartPlan.treeString, curPlan.treeString).mkString("\n")}
           |""".stripMargin)
    } else {
      logTrace(s"Batch ${batch.name} has no effect.")
    }
  }

  curPlan
}
重点在于以下这个函数:
val result = rule(plan) // apply the rule (rule.apply) to this plan, transforming the TreeNodes inside it
rule(plan)调用的是对应的Rule[LogicalPlan]对象里面的apply函数,例如ResolveRelations和ResolveReferences
/**
 * Rule that replaces UnresolvedRelation nodes of a logical plan with the relation
 * returned by the catalog.
 */
object ResolveRelations extends Rule[LogicalPlan] {
  /**
   * Looks up `u` in the catalog by its table identifier and alias.
   * A NoSuchTableException from the lookup is reported through `u.failAnalysis`
   * with a "no such table" message.
   */
  def getTable(u: UnresolvedRelation): LogicalPlan = {
    try {
      catalog.lookupRelation(u.tableIdentifier, u.alias)
    } catch {
      case _: NoSuchTableException =>
        u.failAnalysis(s"no such table ${u.tableName}")
    }
  }

  // Takes a logical plan and returns a logical plan; transform visits the nodes of the
  // plan and applies the cases below to each of them.
  // NOTE(review): the original annotation states that transform delegates to
  // transformDown, i.e. a pre-order traversal of the tree; not verifiable from this excerpt.
  def apply(plan: LogicalPlan): LogicalPlan = plan transform {
    // Target table of an insert: resolve it and pass the result through EliminateSubQueries.
    case i @ InsertIntoTable(u: UnresolvedRelation, _, _, _, _) =>
      i.copy(table = EliminateSubQueries(getTable(u)))
    // Any other unresolved relation is replaced by the catalog lookup result.
    case u: UnresolvedRelation =>
      getTable(u)
  }
}
object ResolveReferences extends Rule[LogicalPlan] { def apply(plan: LogicalPlan): LogicalPlan = plan transformUp {// transformUp本质上就是二叉树的后序(post-order )遍历 case p: LogicalPlan if !p.childrenResolved => p // If the projection list contains Stars, expand it. case p @ Project(projectList, child) if containsStar(projectList) =&g