DataSource Recap and the DeltaDataSource Entry Point
As a quick recap of the previous section: Spark performs file reads and writes through objects wrapped by DataSource, which the reader or writer for a given format loads internally. For the Delta format, the loaded class is DeltaDataSource. DeltaDataSource does not implement the DataSourceV2 interface, so below we look at how DataSource.planForWriting() wraps it and walk through Spark's data-write path.
In DataFrameWriter's runCommand, the plan is first analyzed into a QueryExecution, which is then executed under a new executionId via qe.toRdd.
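For reference, everything below is triggered by an ordinary write in delta format. A minimal sketch (assuming a SparkSession named spark with the delta-core jar on the classpath; the path is just an example):
val df = spark.range(0, 5).toDF("id")
df.write
  .format("delta")           // "delta" resolves to DeltaDataSource via the DataSource lookup
  .mode("append")            // becomes the SaveMode passed to planForWriting
  .save("/tmp/delta-demo")   // example path; save() goes through saveToV1Source/runCommand below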
Creating the SaveIntoDataSourceCommand object
In DataSource's planForWriting() method, the DeltaDataSource class passed in is a subclass of CreatableRelationProvider, so a SaveIntoDataSourceCommand object is created.
DataSource(
  sparkSession = df.sparkSession,
  className = source,
  partitionColumns = partitioningColumns.getOrElse(Nil),
  options = extraOptions.toMap).planForWriting(mode, df.logicalPlan)
def planForWriting(mode: SaveMode, data: LogicalPlan): LogicalPlan = {
  if (data.schema.map(_.dataType).exists(_.isInstanceOf[CalendarIntervalType])) {
    throw new AnalysisException("Cannot save interval data type into external storage.")
  }
  providingClass.newInstance() match {
    case dataSource: CreatableRelationProvider =>
      // DeltaDataSource instances implement the CreatableRelationProvider trait
      SaveIntoDataSourceCommand(data, dataSource, caseInsensitiveOptions, mode)
    case format: FileFormat =>
      DataSource.validateSchema(data.schema)
      planForWritingFileFormat(format, mode, data)
    case _ =>
      sys.error(s"${providingClass.getCanonicalName} does not allow create table as select.")
  }
}
// The generated command and its run() implementation
case class SaveIntoDataSourceCommand(
    query: LogicalPlan,
    dataSource: CreatableRelationProvider,
    options: Map[String, String],
    mode: SaveMode) extends RunnableCommand {

  override protected def innerChildren: Seq[QueryPlan[_]] = Seq(query)

  override def run(sparkSession: SparkSession): Seq[Row] = {
    dataSource.createRelation(
      sparkSession.sqlContext, mode, options, Dataset.ofRows(sparkSession, query))

    Seq.empty[Row]
  }
}
SaveIntoDataSourceCommand class hierarchy
// Plan class hierarchy in Spark, for reference:
// LogicalPlan
//   Command
//     RunnableCommand
//       SaveIntoDataSourceCommand
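To confirm this hierarchy yourself, a quick spark-shell check can be used (the imports below assume the Spark 2.4 package layout):
import org.apache.spark.sql.catalyst.plans.logical.{Command, LogicalPlan}
import org.apache.spark.sql.execution.command.RunnableCommand
import org.apache.spark.sql.execution.datasources.SaveIntoDataSourceCommand

// each of these should print true
println(classOf[RunnableCommand].isAssignableFrom(classOf[SaveIntoDataSourceCommand]))
println(classOf[Command].isAssignableFrom(classOf[RunnableCommand]))
println(classOf[LogicalPlan].isAssignableFrom(classOf[Command]))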
How Spark runs the Command object
We have seen above how the SaveIntoDataSourceCommand object is created; now let's look at how Spark schedules and executes it. The overall call stack is fairly simple.
Call stack:
- saveToV1Source()
- runCommand()
- val qe = session.sessionState.executePlan(command)
- qe.toRdd
- executedPlan.execute()
- ExecutedCommandExec.doExecute()
- SaveIntoDataSourceCommand.run()
- CreatableRelationProvider.createRelation()
- DeltaDataSource.createRelation()
// DataFrameWriter.scala
private def saveToV1Source(): Unit = {
  //...
  // Code path for data source v1.
  runCommand(df.sparkSession, "save") {
    DataSource(
      sparkSession = df.sparkSession,
      className = source,
      partitionColumns = partitioningColumns.getOrElse(Nil),
      options = extraOptions.toMap).planForWriting(mode, df.logicalPlan)
  }
}
//
private def runCommand(session: SparkSession, name: String)(command: LogicalPlan): Unit = {
  val qe = session.sessionState.executePlan(command)
  try {
    val start = System.nanoTime()
    // call `QueryExecution.toRdd` to trigger the execution of commands.
    SQLExecution.withNewExecutionId(session, qe)(qe.toRdd)
    val end = System.nanoTime()
    session.listenerManager.onSuccess(name, qe, end - start)
  } catch {
    case e: Exception =>
      session.listenerManager.onFailure(name, qe, e)
      throw e
  }
}
In runCommand(), val qe = session.sessionState.executePlan(command) returns a QueryExecution object, which is then executed via qe.toRdd.
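The onSuccess/onFailure calls in runCommand feed the QueryExecutionListeners registered on the session, so a listener is a convenient way to watch the "save" command go by. A sketch, assuming the Spark 2.4 listener signatures:
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.util.QueryExecutionListener

spark.listenerManager.register(new QueryExecutionListener {
  override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit =
    println(s"$funcName succeeded, executedPlan root: ${qe.executedPlan.nodeName}")
  override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit =
    println(s"$funcName failed: ${exception.getMessage}")
})
// for the delta write above this prints something like
// "save succeeded, executedPlan root: Execute SaveIntoDataSourceCommand"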
The toRdd lazy val of QueryExecution:
// QueryExecution
lazy val toRdd: RDD[InternalRow] = executedPlan.execute()

// SparkPlan
final def execute(): RDD[InternalRow] = executeQuery {
  if (isCanonicalizedPlan) {
    throw new IllegalStateException("A canonicalized plan is not supposed to be executed.")
  }
  doExecute()
}
The QueryExecution generated for SaveIntoDataSourceCommand
// analyzed
SaveIntoDataSourceCommand org.apache.spark.sql.delta.sources.DeltaDataSource@65a90c60, Map(path -> file:/private/var/folders/tp/70k9mzq90gq0j2_0dcylf1cw0000gn/T/spark-d06781d3-baa3-4479-8466-4babcb86e8ee), Append
   +- Project [_1#353 AS key#356, _2#354 AS value#357]
      +- LocalRelation [_1#353, _2#354]

// optimizedPlan
SaveIntoDataSourceCommand org.apache.spark.sql.delta.sources.DeltaDataSource@65a90c60, Map(path -> file:/private/var/folders/tp/70k9mzq90gq0j2_0dcylf1cw0000gn/T/spark-d06781d3-baa3-4479-8466-4babcb86e8ee), Append
   +- Project [_1#353 AS key#356, _2#354 AS value#357]
      +- LocalRelation [_1#353, _2#354]

// sparkPlan
Execute SaveIntoDataSourceCommand
   +- SaveIntoDataSourceCommand org.apache.spark.sql.delta.sources.DeltaDataSource@65a90c60, Map(path -> file:/private/var/folders/tp/70k9mzq90gq0j2_0dcylf1cw0000gn/T/spark-d06781d3-baa3-4479-8466-4babcb86e8ee), Append
         +- Project [_1#353 AS key#356, _2#354 AS value#357]
            +- LocalRelation [_1#353, _2#354]

// executedPlan
Execute SaveIntoDataSourceCommand
   +- SaveIntoDataSourceCommand org.apache.spark.sql.delta.sources.DeltaDataSource@65a90c60, Map(path -> file:/private/var/folders/tp/70k9mzq90gq0j2_0dcylf1cw0000gn/T/spark-d06781d3-baa3-4479-8466-4babcb86e8ee), Append
         +- Project [_1#353 AS key#356, _2#354 AS value#357]
            +- LocalRelation [_1#353, _2#354]
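Dumps like the ones above are just the four stages that every QueryExecution exposes; the command plans here were captured from the qe built inside runCommand. The same stages can be printed for any Dataset (sketch, assuming spark.implicits._ is in scope):
import spark.implicits._

val demo = Seq(("a", 1), ("b", 2)).toDF("key", "value")
val qe = demo.queryExecution
println(qe.analyzed)       // analyzed logical plan
println(qe.optimizedPlan)  // after the optimizer
println(qe.sparkPlan)      // physical plan before preparations
println(qe.executedPlan)   // final physical plan that execute() runs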
All commands are executed through a side-effect mechanism (sideEffectResult), so execution enters SaveIntoDataSourceCommand's run() method:
// case class ExecutedCommandExec
protected override def doExecute(): RDD[InternalRow] = {
  sqlContext.sparkContext.parallelize(sideEffectResult, 1)
}

protected[sql] lazy val sideEffectResult: Seq[InternalRow] = {
  val converter = CatalystTypeConverters.createToCatalystConverter(schema)
  cmd.run(sqlContext.sparkSession).map(converter(_).asInstanceOf[InternalRow])
}

// SaveIntoDataSourceCommand
override def run(sparkSession: SparkSession): Seq[Row] = {
  dataSource.createRelation(
    sparkSession.sqlContext, mode, options, Dataset.ofRows(sparkSession, query))

  Seq.empty[Row]
}
// CreatableRelationProvider
trait CreatableRelationProvider {
  /**
   * Saves a DataFrame to a destination (using data source-specific parameters)
   *
   * @param sqlContext SQLContext
   * @param mode specifies what happens when the destination already exists
   * @param parameters data source-specific parameters
   * @param data DataFrame to save (i.e. the rows after executing the query)
   * @return Relation with a known schema
   *
   * @since 1.3.0
   */
  def createRelation(
      sqlContext: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      data: DataFrame): BaseRelation
}
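To make the contract concrete, here is a toy CreatableRelationProvider (a hypothetical class, not part of Spark or Delta). Like DeltaDataSource below, it performs the write as a side effect and then returns a BaseRelation describing the result:
import org.apache.spark.sql.{DataFrame, SQLContext, SaveMode}
import org.apache.spark.sql.sources.{BaseRelation, CreatableRelationProvider}
import org.apache.spark.sql.types.StructType

// hypothetical provider that "writes" by printing rows to the console
class ConsoleSinkProvider extends CreatableRelationProvider {
  override def createRelation(
      ctx: SQLContext,
      mode: SaveMode,
      parameters: Map[String, String],
      data: DataFrame): BaseRelation = {
    data.collect().foreach(println)   // the actual write happens here, as a side effect
    new BaseRelation {                // return a relation with a known schema
      override val sqlContext: SQLContext = ctx
      override val schema: StructType = data.schema
    }
  }
}
Such a class can be used with df.write.format("<fully qualified class name>").save(), going through exactly the same SaveIntoDataSourceCommand path as the Delta write.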
// DeltaDataSource
override def createRelation(
    sqlContext: SQLContext,
    mode: SaveMode,
    parameters: Map[String, String],
    data: DataFrame): BaseRelation = {
  val path = parameters.getOrElse("path", {
    throw DeltaErrors.pathNotSpecifiedException
  })
  val partitionColumns = parameters.get(DeltaSourceUtils.PARTITIONING_COLUMNS_KEY)
    .map(DeltaDataSource.decodePartitioningColumns)
    .getOrElse(Nil)

  val deltaLog = DeltaLog.forTable(sqlContext.sparkSession, path)
  WriteIntoDelta(
    deltaLog = deltaLog,
    mode = mode,
    new DeltaOptions(parameters, sqlContext.sparkSession.sessionState.conf),
    partitionColumns = partitionColumns,
    configuration = Map.empty,
    data = data).run(sqlContext.sparkSession)

  deltaLog.createRelation()
}
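The write itself is thus performed by WriteIntoDelta.run(), which commits to the table's transaction log, and deltaLog.createRelation() then returns a BaseRelation describing the table that was just written. Reusing the example path from earlier, the result can be checked like this (sketch):
// the transaction log lives under the table path:
//   /tmp/delta-demo/_delta_log/00000000000000000000.json, ...
spark.read.format("delta").load("/tmp/delta-demo").show()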