前言
本文隶属于专栏《大数据技术体系》,该专栏为笔者原创,引用请注明来源,不足和错误之处请在评论区帮忙指出,谢谢!
本专栏目录结构和参考文献请见大数据技术体系
关联
Spark SQL 工作流程源码解析(一)总览(基于 Spark 3.3.0)
Spark SQL 工作流程源码解析(二)parsing 阶段(基于 Spark 3.3.0)
Spark SQL 工作流程源码解析(三)analysis 阶段(基于 Spark 3.3.0)
Spark SQL 工作流程源码解析(四)optimization 阶段(基于 Spark 3.3.0)
Spark SQL 工作流程源码解析(五)planning 阶段(基于 Spark 3.3.0)
SparkSessionExtensions
用来给 SparkSession 提供注入点的容器。
注意,任何注入的构建器都应该假定 SparkSession 已完全初始化,同时不应该触及 SparkSession 的内部(例如 SessionState)。
当前支持下面的扩展:
- Analyzer Rules —— `analysis`阶段应用的规则
- Check Analysis Rules —— `analysis`阶段的检查规则
- Optimizer Rules —— `optimization`阶段应用的规则
- Pre CBO Rules —— `optimization`阶段应用的规则,在 CBO(基于成本的优化) 之前运行
- Planning Strategies —— `planning`阶段的策略
- Customized Parser —— 自定义`parsing`阶段使用的`ParserInterface`
- (External) Catalog listeners —— `catalog`监听器
- Columnar Rules —— 列式数据规则
- Adaptive Query Stage Preparation Rules —— AQE(自适应查询)准备期间运行的规则
源码下载
spark-examples 代码已开源,本项目致力于提供最具实践性的 Apache Spark 代码开发学习指南。
点击链接前往 github 下载源码:spark-examples
实践
package com.shockang.study.spark.sql.extensions
import com.shockang.study.spark.sql.extensions.SparkSessionExtensionExample.myFunction
import org.apache.spark.sql.catalyst.expressions.{Expression, ExpressionInfo, Literal}
import org.apache.spark.sql.catalyst.parser.ParserInterface
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.catalyst.rules.Rule
import org.apache.spark.sql.catalyst.{FunctionIdentifier, TableIdentifier}
import org.apache.spark.sql.execution.{SparkPlan, SparkStrategy}
import org.apache.spark.sql.internal.StaticSQLConf.SPARK_SESSION_EXTENSIONS
import org.apache.spark.sql.types.{DataType, IntegerType, StructType}
import org.apache.spark.sql.{SparkSession, SparkSessionExtensions}
/**
* Spark SQL 自定义扩展
*
* @author Shockang
*/
/**
 * Demonstrates how to plug custom extensions into Spark SQL via the
 * static configuration `spark.sql.extensions`, then verifies that every
 * injected component is visible on the resulting session state.
 *
 * @author Shockang
 */
object SparkSessionExtensionExample {

  /**
   * Registration triple for a custom function: its identifier, the
   * metadata describing it, and a builder that always produces the
   * integer literal 5.
   */
  val myFunction: (FunctionIdentifier, ExpressionInfo, Seq[Expression] => Literal) =
    (FunctionIdentifier("myFunction"),
      new ExpressionInfo(
        "noClass",
        "myDb",
        "myFunction",
        "usage",
        "extended usage",
        " Examples:",
        """
note
""",
        "",
        "3.0.0",
        """
deprecated
""",
        ""),
      (_: Seq[Expression]) => Literal(5, IntegerType))

  def main(args: Array[String]): Unit = {
    // MyExtensions is applied automatically because it is registered
    // through the spark.sql.extensions static configuration key.
    val session = SparkSession.builder()
      .master("local[1]")
      .appName("SparkSessionExtensionExample")
      .config(SPARK_SESSION_EXTENSIONS.key, classOf[MyExtensions].getCanonicalName)
      .getOrCreate()
    try {
      // Strategy injected for the planning phase.
      assert(session.sessionState.planner.strategies.contains(MySparkStrategy(session)))
      // Rule injected into the resolution batch of the analysis phase.
      assert(session.sessionState.analyzer.extendedResolutionRules.contains(MyRule(session)))
      // Rule injected into the post-hoc resolution batch of the analysis phase.
      assert(session.sessionState.analyzer.postHocResolutionRules.contains(MyRule(session)))
      // Check rule applied after analysis.
      assert(session.sessionState.analyzer.extendedCheckRules.contains(MyCheckRule(session)))
      // Rule injected into the optimization phase.
      assert(session.sessionState.optimizer.batches.flatMap(_.rules).contains(MyRule(session)))
      // Custom ParserInterface used during the parsing phase.
      assert(session.sessionState.sqlParser.isInstanceOf[MyParser])
      // The injected custom function is resolvable by its identifier.
      assert(session.sessionState.functionRegistry
        .lookupFunction(myFunction._1).isDefined)
    } finally {
      session.stop()
    }
  }
}
/**
 * Extension entry point registered through `spark.sql.extensions`.
 * When a new SparkSession is built, Spark invokes this function once
 * with the session's extension container so we can inject custom
 * rules, strategies, a parser and a function.
 */
class MyExtensions extends (SparkSessionExtensions => Unit) {
  override def apply(extensions: SparkSessionExtensions): Unit = {
    // Strategy used during the planning phase.
    extensions.injectPlannerStrategy(MySparkStrategy)
    // Rule for the resolution batch of the analysis phase.
    extensions.injectResolutionRule(MyRule)
    // Rule for the post-hoc resolution batch of the analysis phase.
    extensions.injectPostHocResolutionRule(MyRule)
    // Check rule applied after analysis.
    extensions.injectCheckRule(MyCheckRule)
    // Rule applied during the optimization phase.
    extensions.injectOptimizerRule(MyRule)
    // Custom ParserInterface used during the parsing phase.
    extensions.injectParser(MyParser)
    // Custom function registration (identifier, info, builder).
    extensions.injectFunction(myFunction)
  }
}
/**
 * A no-op planning strategy: it never claims a logical plan, so Spark
 * falls through to its built-in strategies.
 */
case class MySparkStrategy(spark: SparkSession) extends SparkStrategy {
  override def apply(plan: LogicalPlan): Seq[SparkPlan] = Nil
}
/**
 * A no-op logical-plan rule: the identity transformation, returning
 * every plan untouched. Used only to demonstrate rule injection.
 */
case class MyRule(spark: SparkSession) extends Rule[LogicalPlan] {
  override def apply(plan: LogicalPlan): LogicalPlan = {
    plan
  }
}
/**
 * A no-op analysis check rule: accepts every analyzed plan without
 * raising. Used only to demonstrate check-rule injection.
 */
case class MyCheckRule(spark: SparkSession) extends (LogicalPlan => Unit) {
  override def apply(plan: LogicalPlan): Unit = ()
}
/**
 * A pass-through parser: every request is forwarded unchanged to the
 * wrapped `delegate` parser. Demonstrates parser injection without
 * altering parsing behavior.
 */
case class MyParser(spark: SparkSession, delegate: ParserInterface) extends ParserInterface {
  override def parsePlan(sqlText: String): LogicalPlan = delegate.parsePlan(sqlText)
  override def parseExpression(sqlText: String): Expression = delegate.parseExpression(sqlText)
  override def parseTableIdentifier(sqlText: String): TableIdentifier = delegate.parseTableIdentifier(sqlText)
  override def parseFunctionIdentifier(sqlText: String): FunctionIdentifier = delegate.parseFunctionIdentifier(sqlText)
  override def parseMultipartIdentifier(sqlText: String): Seq[String] = delegate.parseMultipartIdentifier(sqlText)
  override def parseTableSchema(sqlText: String): StructType = delegate.parseTableSchema(sqlText)
  override def parseDataType(sqlText: String): DataType = delegate.parseDataType(sqlText)
  override def parseQuery(sqlText: String): LogicalPlan = delegate.parseQuery(sqlText)
}