DataSource Recap and the DeltaDataSource Entry Point
As a quick recap of the previous section: Spark performs file reads and writes through objects wrapped by DataSource, which the reader or writer for a given format loads internally. For the Delta format, the loaded class is DeltaDataSource. DeltaDataSource does not implement the DataSourceV2 interface, so below we look at how DataSource.planForWriting() wraps it and walk through Spark's data-write path.
In DataFrameWriter's runCommand, the plan is first analyzed into a QueryExecution, which is then executed under a new executionId via qe.toRdd.
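For reference, everything below is triggered by an ordinary write in delta format. A minimal sketch (assuming a SparkSession named spark with the delta-core jar on the classpath; the path is just an example):
val df = spark.range(0, 5).toDF("id")
df.write
  .format("delta")           // "delta" resolves to DeltaDataSource via the DataSource lookup
  .mode("append")            // becomes the SaveMode passed to planForWriting
  .save("/tmp/delta-demo")   // example path; save() goes through saveToV1Source/runCommand below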
Creating the SaveIntoDataSourceCommand object
In DataSource's planForWriting() method, the DeltaDataSource class passed in is a subclass of CreatableRelationProvider, so a SaveIntoDataSourceCommand object is created.
DataSource(
  sparkSession = df.sparkSession,
  className = source,
  partitionColumns = partitioningColumns.getOrElse(Nil),
  options = extraOptions.toMap).planForWriting(mode, df.logicalPlan)
def planForWriting(mode: SaveMode, data: LogicalPlan): LogicalPlan = {
  if (data.schema.map(_.dataType).exists(_.isInstanceOf[CalendarIntervalType])) {
    throw new AnalysisException("Cannot save interval data type into external storage.")
  }
  providingClass.newInstance() match {
    case dataSource: CreatableRelationProvider =>
      // DeltaDataSource instances implement the CreatableRelationProvider trait
      SaveIntoDataSourceCommand(data, dataSource, caseInsensitiveOptions, mode)
    case format: FileFormat =>
      DataSource.validateSchema(data.schema)
      planForWritingFileFormat(format, mode, data)
    case _ =>
      sys.error(s"${providingClass.getCanonicalName} does not allow create table as select.")
  }
}
// The generated command and its run() implementation
case class SaveIntoDataSourceCommand(
    query: LogicalPlan,
    dataSource: CreatableRelationProvider,
    options: Map[String, String],
    mode: SaveMode) extends RunnableCommand {

  override protected def innerChildren: Seq[QueryPlan[_]] = Seq(query)

  override def run(sparkSession: SparkSession): Seq[Row] = {
    dataSource.createRelation(
      sparkSession.sqlContext, mode, options, Dataset.ofRows(sparkSession, query))

    Seq.empty[Row]
  }
}
SaveIntoDataSourceCommand class hierarchy
// Plan class hierarchy in Spark, for reference:
// LogicalPlan
//   Command
//     RunnableCommand
//       SaveIntoDataSourceCommand
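To confirm this hierarchy yourself, a quick spark-shell check can be used (the imports below assume the Spark 2.4 package layout):
import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan}
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand

// each of these should print true
println(classOf[RunnableCommand].isAssignableFrom(classOf[SaveIntoDataSourceCommand]))
println(classOf[Command].isAssignableFrom(classOf[RunnableCommand]))
println(classOf[LogicalPlan].isAssignableFrom(classOf[Command]))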
How Spark runs the Command object
We have seen above how the SaveIntoDataSourceCommand object is created; now let's look at how Spark schedules and executes it. The overall call stack is fairly simple.
Call stack:
- saveToV1Source()
- runCommand()
- val qe = session.sessionState.executePlan(command)
- qe.toRdd
- executedPlan.execute()
- ExecutedCommandExec.doExecute()
- SaveIntoDataSourceCommand.run()
- CreatableRelationProvider.createRelation()
- DeltaDataSource.createRelation()
// DataFrameWriter.scala
private def saveToV1Source(): Unit = {
  //...
  // Code path for data source v1.
  runCommand(df.sparkSession, "save") {
    DataSource(
      sparkSession = df.sparkSession,
      className = source,
      partitionColumns = partitioningColumns.getOrElse(Nil),
      options = extraOptions.toMap).planForWriting(mode, df.logicalPlan)
  }
}
//
private def runCommand(session: SparkSession, name: String)(command: LogicalPlan): Unit = {
  val qe = session.sessionState.executePlan(command)
  try {
    val start = System.nanoTime()
    // call `QueryExecution.toRdd` to trigger the execution of commands.
    SQLExecution.withNewExecutionId(session, qe)(qe.toRdd)
    val end = System.nanoTime()
    session.listenerManager.onSuccess(name, qe, end - start)
  } catch {
    case e: Exception =>
      session.listenerManager.onFailure(name, qe, e)
      throw e
  }
}
In runCommand(), val qe = session.sessionState.executePlan(command) returns a QueryExecution object, which is then executed via qe.toRdd.
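The onSuccess/onFailure calls in runCommand feed the QueryExecutionListeners registered on the session, so a listener is a convenient way to watch the "save" command go by. A sketch, assuming the Spark 2.4 listener signatures:
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.util.QueryExecutionListener

spark.listenerManager.register(new QueryExecutionListener {
  override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit =
    println(s"$funcName succeeded, executedPlan root: ${qe.executedPlan.nodeName}")
  override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit =
    println(s"$funcName failed: ${exception.getMessage}")
})
// for the delta write above this prints something like
// "save succeeded, executedPlan root: Execute SaveIntoDataSourceCommand"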
The toRdd lazy val of QueryExecution:
// QueryExecution
lazy val toRdd: RDD[InternalRow] = executedPlan.execute()

// SparkPlan
final def execute(): RDD[InternalRow] = executeQuery {
  if (isCanonicalizedPlan) {
    throw new IllegalStateException("A canonicalized plan is not supposed to be executed.")
  }
  doExecute()
}
The QueryExecution generated for SaveIntoDataSourceCommand
// analyzed
SaveIntoDataSourceCommand org.apache.spark.sql.delta.sources.DeltaDataSource@65a90c60, Map(path -> file:/private/var/folders/tp/70k9mzq90gq0j2_0dcylf1cw0000gn/T/spark-d06781d3-baa3-4479-8466-4babcb86e8ee), Append
   +- Project [_1#353 AS key#356, _2#354 AS value#357]
      +- LocalRelation [_1#353, _2#354]

// optimizedPlan
SaveIntoDataSourceCommand org.apache.spark.sql.delta.sources.DeltaDataSource@65a90c60, Map(path -> file:/private/var/folders/tp/70k9mzq90gq0j2_0dcylf1cw0000gn/T/spark-d06781d3-baa3-4479-8466-4babcb86e8ee), Append
   +- Project [_1#353 AS key#356, _2#354 AS value#357]
      +- LocalRelation [_1#353, _2#354]

// sparkPlan
Execute SaveIntoDataSourceCommand
   +- SaveIntoDataSourceCommand org.apache.spark.sql.delta.sources.DeltaDataSource@65a90c60, Map(path -> file:/private/var/folders/tp/70k9mzq90gq0j2_0dcylf1cw0000gn/T/spark-d06781d3-baa3-4479-8466-4babcb86e8ee), Append
         +- Project [_1#353 AS key#356, _2#354 AS value#357]
            +- LocalRelation [_1#353, _2#354]

// executedPlan
Execute SaveIntoDataSourceCommand
   +- SaveIntoDataSourceCommand org.apache.spark.sql.delta.sources.DeltaDataSource@65a90c60, Map(path -> file:/private/var/folders/tp/70k9mzq90gq0j2_0dcylf1cw0000gn/T/spark-d06781d3-baa3-4479-8466-4babcb86e8ee), Append
         +- Project [_1#353 AS key#356, _2#354 AS value#357]
            +- LocalRelation [_1#353, _2#354]
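Dumps like the ones above are just the four stages that every QueryExecution exposes; the command plans here were captured from the qe built inside runCommand. The same stages can be printed for any Dataset (sketch, assuming spark.implicits._ is in scope):
import spark.implicits._

val demo = Seq(("a", 1), ("b", 2)).toDF("key", "value")
val qe = demo.queryExecution
println(qe.analyzed)       // analyzed logical plan
println(qe.optimizedPlan)  // after the optimizer
println(qe.sparkPlan)      // physical plan before preparations
println(qe.executedPlan)   // final physical plan that execute() runs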
All commands are executed through a side-effect mechanism (sideEffectResult), so execution enters SaveIntoDataSourceCommand's run() method:
// case class ExecutedCommandExec
protected override def doExecute(): RDD[InternalRow] = {
  sqlContext.sparkContext.parallelize(sideEffectResult, 1)
}

protected[sql] lazy val sideEffectResult: Seq[InternalRow] = {
  val converter = CatalystTypeConverters.createToCatalystConverter(schema)
  cmd.run(sqlContext.sparkSession).map(converter(_).asInstanceOf[InternalRow])
}

// SaveIntoDataSourceCommand
override def run(sparkSession: SparkSession): Seq[Row] = {
  dataSource.createRelation(
    sparkSession.sqlContext, mode, options, Dataset.ofRows(sparkSession, query))

  Seq.empty[Row]
}
// CreatableRelationProvider
trait CreatableRelationProvider {
  /**
   * Saves a DataFrame to a destination (using data source-specific parameters)
   *
   * @param sqlContext SQLContext
   * @param mode specifies what happens when the destination already exists
   * @param parameters data source-specific parameters
   * @param data DataFrame to save (i.e. the rows after executing the query)
   * @return Relation with a known schema
   *
   * @since 1.3.0
   */
  def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      data: DataFrame): BaseRelation
}
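To make the contract concrete, here is a toy CreatableRelationProvider (a hypothetical class, not part of Spark or Delta). Like DeltaDataSource below, it performs the write as a side effect and then returns a BaseRelation describing the result:
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider}
import org.apache.spark.sql.types.StructType

// hypothetical provider that "writes" by printing rows to the console
class ConsoleSinkProvider extends CreatableRelationProvider {
  override def createRelation(
      ctx: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      data: DataFrame): BaseRelation = {
    data.collect().foreach(println)   // the actual write happens here, as a side effect
    new BaseRelation {                // return a relation with a known schema
      override val sqlContext: SQLContext = ctx
      override val schema: StructType = data.schema
    }
  }
}
Such a class can be used with df.write.format("<fully qualified class name>").save(), going through exactly the same SaveIntoDataSourceCommand path as the Delta write.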
// DeltaDataSource
override def createRelation(
    sqlContext: SQLContext,
    mode: SaveMode,
    parameters: Map[String, String],
    data: DataFrame): BaseRelation = {
  val path = parameters.getOrElse("path", {
    throw DeltaErrors.pathNotSpecifiedException
  })
  val partitionColumns = parameters.get(DeltaSourceUtils.PARTITIONING_COLUMNS_KEY)
    .map(DeltaDataSource.decodePartitioningColumns)
    .getOrElse(Nil)

  val deltaLog = DeltaLog.forTable(sqlContext.sparkSession, path)
  WriteIntoDelta(
    deltaLog = deltaLog,
    mode = mode,
    new DeltaOptions(parameters, sqlContext.sparkSession.sessionState.conf),
    partitionColumns = partitionColumns,
    configuration = Map.empty,
    data = data).run(sqlContext.sparkSession)

  deltaLog.createRelation()
}
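The write itself is thus performed by WriteIntoDelta.run(), which commits to the table's transaction log, and deltaLog.createRelation() then returns a BaseRelation describing the table that was just written. Reusing the example path from earlier, the result can be checked like this (sketch):
// the transaction log lives under the table path:
//   /tmp/delta-demo/_delta_log/00000000000000000000.json, ...
spark.read.format("delta").load("/tmp/delta-demo").show()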