Test case
test("SPARK decoder without codegen") {
withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") {
spark.catalog.createTable("variance", "/mnt/DP_disk1/string_variance_value.gz.parquet", "parquet")
val df = sql("select * from variance")
df.show(4)
df.explain(false)
}
}
#without codegen
== Physical Plan ==
FileScan parquet default.variance[col0_str#218] Batched: false, DataFilters: [], Format: Parquet, Location: InMemoryFileIndex[file:/mnt/DP_disk1/string_variance_value.gz.parquet], PartitionFilters: [], PushedFilters: [], ReadSchema: struct<col0_str:string>
FileSourceScanExec
The scan operator in the plan above corresponds to FileSourceScanExec. Since the scan is not batched here (Batched: false), execution goes through its doExecute method:
protected override def doExecute(): RDD[InternalRow] = {
  val numOutputRows = longMetric("numOutputRows")
  if (needsUnsafeRowConversion) {
    inputRDD.mapPartitionsWithIndexInternal { (index, iter) =>
      val toUnsafe = UnsafeProjection.create(schema)
      toUnsafe.initialize(index)
      iter.map { row =>
        numOutputRows += 1
        toUnsafe(row)
      }
    }
  } else {
    inputRDD.mapPartitionsInternal { iter =>
      iter.map { row =>
        numOutputRows += 1
        row
      }
    }
  }
}
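A minimal sketch of exercising this row-based path, assuming the same test-suite context (spark, sql, withSQLConf helpers) and the variance table registered in the test above:

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.FileSourceScanExec
import org.apache.spark.sql.internal.SQLConf

withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") {
  val plan = sql("select * from variance").queryExecution.executedPlan
  val scan = plan.collectFirst { case s: FileSourceScanExec => s }.get
  // With codegen disabled the scan is not batched, so execute() routes to the
  // doExecute shown above and yields plain InternalRows.
  assert(!scan.supportsColumnar)
  val rows: RDD[InternalRow] = scan.execute()
  println(s"rows via the row-based path: ${rows.count()}")
}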
FileSourceScanExec supportsColumnar
// Note that some vals referring the file-based relation are lazy intentionally
// so that this plan can be canonicalized on executor side too. See SPARK-23731.
override lazy val supportsColumnar: Boolean = {
relation.fileFormat.supportBatch(relation.sparkSession, schema)
}
Note: when whole-stage codegen is disabled (conf.wholeStageEnabled is false), supportBatch, and therefore supportsColumnar, returns false. In Spark 3.1.1 supportBatch is defined in:
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/parquet/ParquetFileFormat.scala
/**
 * Returns whether the reader will return the rows as batch or not.
 */
override def supportBatch(sparkSession: SparkSession, schema: StructType): Boolean = {
  val conf = sparkSession.sessionState.conf
  conf.parquetVectorizedReaderEnabled && conf.wholeStageEnabled &&
    schema.length <= conf.wholeStageMaxNumFields &&
    schema.forall(_.dataType.isInstanceOf[AtomicType])
}
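The check can be called directly. A minimal sketch, assuming the same test-suite context and the single string column (col0_str) read in the test above:

import org.apache.spark.sql.execution.datasources.parquet.ParquetFileFormat
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.{StringType, StructField, StructType}

val schema = StructType(Seq(StructField("col0_str", StringType)))
val fmt = new ParquetFileFormat()
withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "false") {
  assert(!fmt.supportBatch(spark, schema)) // fails the wholeStageEnabled check
}
withSQLConf(SQLConf.WHOLESTAGE_CODEGEN_ENABLED.key -> "true") {
  assert(fmt.supportBatch(spark, schema)) // vectorized reader on by default, one atomic column
}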
When does FileSourceScanExec take the columnar path (doExecuteColumnar)?
The basic logic is:
- In the rule that inserts row-to-columnar (R2C) and columnar-to-row (C2R) transitions, Spark checks whether an R2C or C2R needs to be inserted between two adjacent operators.
- When the scan's supportsColumnar is true, a C2R (ColumnarToRowExec) is inserted above it.
- That C2R then calls the scan's doExecuteColumnar (a small check follows the insertTransitions code below).
case class ApplyColumnarRulesAndInsertTransitions
/**
 * Inserts RowToColumnarExecs and ColumnarToRowExecs where needed.
 */
private def insertTransitions(plan: SparkPlan, outputsColumnar: Boolean): SparkPlan = {
  if (outputsColumnar) {
    insertRowToColumnar(plan)
  } else if (plan.supportsColumnar && !plan.supportsRowBased) {
    // `outputsColumnar` is false but the plan only outputs columnar format, so add a
    // to-row transition here.
    ColumnarToRowExec(insertRowToColumnar(plan))
  } else if (!plan.isInstanceOf[ColumnarToRowTransition]) {
    plan.withNewChildren(plan.children.map(insertTransitions(_, outputsColumnar = false)))
  } else {
    plan
  }
}
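The effect of this rule can be checked on a planned query: when the scan is columnar, a ColumnarToRowExec shows up in the executed plan. A minimal sketch, again assuming the variance table and default (codegen-enabled) settings:

import org.apache.spark.sql.execution.ColumnarToRowExec

val plan = sql("select * from variance").queryExecution.executedPlan
// With a columnar scan the rule inserts a ColumnarToRowExec above it;
// with codegen disabled (row-based scan) no C2R is inserted.
val hasC2R = plan.collectFirst { case c: ColumnarToRowExec => c }.isDefined
println(s"ColumnarToRowExec present: $hasC2R")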
case class ColumnarToRowExec(child: SparkPlan) extends ColumnarToRowTransition with CodegenSupport
override def doExecute(): RDD[InternalRow] = {
  val numOutputRows = longMetric("numOutputRows")
  val numInputBatches = longMetric("numInputBatches")
  // This avoids calling `output` in the RDD closure, so that we don't need to include the entire
  // plan (this) in the closure.
  val localOutput = this.output
  child.executeColumnar().mapPartitionsInternal { batches =>
    val toUnsafe = UnsafeProjection.create(localOutput, localOutput)
    batches.flatMap { batch =>
      numInputBatches += 1
      numOutputRows += batch.numRows()
      batch.rowIterator().asScala.map(toUnsafe)
    }
  }
}
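What this conversion amounts to can be shown with a toy, self-contained sketch (not Spark source): build a small single-string-column ColumnarBatch with OnHeapColumnVector and project its rows to UnsafeRows, the same way the flatMap above does:

import scala.collection.JavaConverters._
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.sql.vectorized.{ColumnVector, ColumnarBatch}

val schema = StructType(Seq(StructField("col0_str", StringType)))
val vector = new OnHeapColumnVector(2, StringType)
vector.putByteArray(0, "a".getBytes("UTF-8"))
vector.putByteArray(1, "b".getBytes("UTF-8"))
val batch = new ColumnarBatch(Array[ColumnVector](vector))
batch.setNumRows(2)

val toUnsafe = UnsafeProjection.create(schema)
// copy() because both the batch row and the projection result are reused buffers
val unsafeRows = batch.rowIterator().asScala.map(r => toUnsafe(r).copy()).toArray
batch.close()
println(unsafeRows.map(_.getUTF8String(0)).mkString(", "))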
case class FileSourceScanExec
protected override def doExecuteColumnar(): RDD[ColumnarBatch] = {
  val numOutputRows = longMetric("numOutputRows")
  val scanTime = longMetric("scanTime")
  inputRDD.asInstanceOf[RDD[ColumnarBatch]].mapPartitionsInternal { batches =>
    new Iterator[ColumnarBatch] {
      override def hasNext: Boolean = {
        // The `FileScanRDD` returns an iterator which scans the file during the `hasNext` call.
        val startNs = System.nanoTime()
        val res = batches.hasNext
        scanTime += NANOSECONDS.toMillis(System.nanoTime() - startNs)
        res
      }
      override def next(): ColumnarBatch = {
        val batch = batches.next()
        numOutputRows += batch.numRows()
        batch
      }
    }
  }
}
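To exercise this columnar path directly, one can grab the scan node from a planned query and call executeColumnar() on it. A minimal sketch, assuming codegen and the vectorized Parquet reader are enabled so that the scan is columnar, and reusing the variance table:

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.execution.FileSourceScanExec
import org.apache.spark.sql.vectorized.ColumnarBatch

val maybeScan = sql("select * from variance").queryExecution.executedPlan.collectFirst {
  case s: FileSourceScanExec if s.supportsColumnar => s
}
maybeScan.foreach { scan =>
  // Each partition yields ColumnarBatches produced by the vectorized Parquet reader.
  val batches: RDD[ColumnarBatch] = scan.executeColumnar()
  val totalRows = batches.map(_.numRows().toLong).fold(0L)(_ + _)
  println(s"rows read in columnar batches: $totalRows")
}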