R2C/C2R
ColumnarOverrideRules
这是 Gazelle 的入口，通过 SparkSessionExtensions 隐式嵌入。
// Gazelle's entry point: registered through SparkSessionExtensions so the
// columnar override rules are injected into every session's query planner.
object ColumnarOverrides extends GazelleSparkExtensionsInjector {
override def inject(extensions: SparkSessionExtensions): Unit = {
// NOTE(review): the companion object is passed where a builder
// (SparkSession => ColumnarRule) is expected — presumably via the case-class
// companion's apply; confirm against injectColumnar's signature.
extensions.injectColumnar(ColumnarOverrideRules)
}
}
class ColumnarOverrideRules 继承的是 ColumnarRule
// The ColumnarRule implementation injected above; provides the pre/post
// columnar-transition hooks that swap row operators for Gazelle's columnar ones.
case class ColumnarOverrideRules(session: SparkSession) extends ColumnarRule with Logging {
// Master switch for the columnar overrides; read per invocation so it can be
// toggled at runtime through the SQL conf.
def columnarEnabled =
session.sqlContext.getConf("org.apache.spark.example.columnar.enabled", "true").trim.toBoolean
// One-shot flag disabling codegen for small shuffles; reset back to "false"
// in postColumnarTransitions after it has been consumed.
def codegendisable =
session.sqlContext.getConf("spark.oap.sql.columnar.codegendisableforsmallshuffles", "false").trim.toBoolean
// SparkConf of the underlying SparkContext.
def conf = session.sparkContext.getConf
所以 ColumnarOverrideRules 要实现两个方法
preColumnarTransitions
postColumnarTransitions
其中preColumnarTransitions 使用的是 val rule = preOverrides
// Hook that runs BEFORE Spark inserts row/columnar transitions: records the
// original plan, probes adaptive-execution support, then applies preOverrides.
override def preColumnarTransitions: Rule[SparkPlan] = plan => {
if (columnarEnabled) {
// According to Spark's Columnar.scala, the plan is tackled one by one.
// By recording the original plan, we can easily let the whole stage
// fallback at #postColumnarTransitions.
originalPlan = plan
// NOTE(review): originalPlan / isSupportAdaptive are vars declared outside
// this excerpt — this rule is stateful across the two transition hooks.
isSupportAdaptive = SparkShimLoader.getSparkShims.supportAdaptiveWithExchangeConsidered(plan)
val rule = preOverrides
rule.setAdaptiveSupport(isSupportAdaptive)
// Guard unsupported operators first (rowGuardOverrides), then rewrite.
rule(rowGuardOverrides(plan))
} else {
plan
}
}
其中 postColumnarTransitions 使用的是 val rule = postOverrides
// Hook that runs AFTER Spark has inserted transitions: either falls the whole
// stage back to the plan recorded in preColumnarTransitions, or applies
// postOverrides followed by collapseOverrides.
override def postColumnarTransitions: Rule[SparkPlan] = plan => {
if (columnarEnabled) {
if (isSupportAdaptive && fallbackWholeStage(plan)) {
// BatchScan with ArrowScan initialized can still connect
// to ColumnarToRow for transition.
insertTransitions(originalPlan, false)
} else {
val rule = postOverrides
rule.setAdaptiveSupport(isSupportAdaptive)
val tmpPlan = rule(plan)
val ret = collapseOverrides(tmpPlan)
// Reset the one-shot small-shuffle codegen-disable flag so it does not leak
// into the next query — presumably set elsewhere per query; verify setter.
if (codegendisable)
{
logDebug("postColumnarTransitions:" +
" resetting spark.oap.sql.columnar.codegendisableforsmallshuffles To false")
session.sqlContext.setConf(
"spark.oap.sql.columnar.codegendisableforsmallshuffles", "false")
}
ret
}
} else {
plan
}
}
这两个定义如下
// Factory for the pre-transition rule set (fresh instance per invocation).
def preOverrides = ColumnarPreOverrides(session)
// Factory for the post-transition rule set that rewrites R2C/C2R transitions.
def postOverrides = ColumnarPostOverrides()
如下,
这两个类都是继承 Rule[SparkPlan]。
都各自含有 独立的replaceWithColumnarPlan。
只有第二个 replaceWithColumnarPlan 才涉及 R2C/C2R conversion
// Pre-transition rewriter: replaces supported row-based operators with their
// columnar equivalents before Spark inserts transitions.
case class ColumnarPreOverrides(session: SparkSession) extends Rule[SparkPlan] {
val columnarConf: GazellePluginConfig = GazellePluginConfig.getSessionConf
// Updated via setAdaptiveSupport before apply() — see ColumnarOverrideRules.
var isSupportAdaptive: Boolean = true
// NOTE(review): the match cases of this method are elided in this excerpt.
def replaceWithColumnarPlan(plan: SparkPlan): SparkPlan = plan match {
/**
 * Post-transition rewriter: runs after Spark has inserted RowToColumnarExec /
 * ColumnarToRowExec transition nodes and replaces them with Gazelle's
 * Arrow-backed equivalents (ArrowRowToColumnarExec, ArrowColumnarToRowExec),
 * falling back to the plain Arrow transitions when construction fails.
 *
 * Fix: the fallback catches previously used `case _: Throwable`, which would
 * also swallow fatal errors (OutOfMemoryError, InterruptedException, ...) and
 * silently continue planning. They now catch only NonFatal exceptions so
 * fatal errors propagate.
 */
case class ColumnarPostOverrides() extends Rule[SparkPlan] {
  val columnarConf = GazellePluginConfig.getSessionConf
  // Updated via setAdaptiveSupport before apply() — see ColumnarOverrideRules.
  var isSupportAdaptive: Boolean = true

  /** Recursively rewrites transition nodes; structure-preserving otherwise. */
  def replaceWithColumnarPlan(plan: SparkPlan): SparkPlan = plan match {
    // To get ColumnarBroadcastExchangeExec back from the fallback that for DPP reuse.
    case RowToColumnarExec(broadcastQueryStageExec: BroadcastQueryStageExec)
        if (broadcastQueryStageExec.plan match {
          case BroadcastExchangeExec(_, _: DataToArrowColumnarExec) => true
          case _ => false
        }) =>
      logDebug(s"Due to a fallback of BHJ inserted into plan." +
        s" See above override in BroadcastQueryStageExec")
      val localBroadcastXchg = broadcastQueryStageExec.plan.asInstanceOf[BroadcastExchangeExec]
      val dataToArrowColumnar = localBroadcastXchg.child.asInstanceOf[DataToArrowColumnarExec]
      //ColumnarBroadcastExchangeExec(localBroadcastXchg.mode, dataToArrowColumnar)
      dataToArrowColumnar.child
    case plan: RowToColumnarExec =>
      val child = replaceWithColumnarPlan(plan.child)
      if (columnarConf.enableArrowRowToColumnar) {
        logDebug(s"ColumnarPostOverrides ArrowRowToColumnarExec(${child.getClass})")
        try {
          ArrowRowToColumnarExec(child)
        } catch {
          // Only non-fatal failures trigger the fallback; fatal errors propagate.
          case scala.util.control.NonFatal(_) =>
            logInfo("ArrowRowToColumnar: Falling back to RowToColumnar...")
            RowToArrowColumnarExec(child)
        }
      } else {
        logDebug(s"ColumnarPostOverrides RowToArrowColumnarExec(${child.getClass})")
        RowToArrowColumnarExec(child)
      }
    // The adaptors below are already columnar-compatible: drop the C2R wrapper.
    case ColumnarToRowExec(child: ColumnarShuffleExchangeAdaptor) =>
      replaceWithColumnarPlan(child)
    case ColumnarToRowExec(child: ColumnarBroadcastExchangeAdaptor) =>
      replaceWithColumnarPlan(child)
    case ColumnarToRowExec(child: CoalesceBatchesExec) =>
      plan.withNewChildren(Seq(replaceWithColumnarPlan(child.child)))
    case ColumnarToRowExec(child: ArrowCoalesceBatchesExec) =>
      plan.withNewChildren(Seq(replaceWithColumnarPlan(child.child)))
    case plan: ColumnarToRowExec =>
      if (columnarConf.enableArrowColumnarToRow) {
        val child = replaceWithColumnarPlan(plan.child)
        logDebug(s"ColumnarPostOverrides ArrowColumnarToRowExec(${child.getClass})")
        try {
          ArrowColumnarToRowExec(child)
        } catch {
          case scala.util.control.NonFatal(_) =>
            logInfo("ArrowColumnarToRowExec: Falling back to ColumnarToRow...")
            ColumnarToRowExec(child)
        }
      } else {
        val children = plan.children.map(replaceWithColumnarPlan)
        plan.withNewChildren(children)
      }
    case r: SparkPlan
        if !r.isInstanceOf[QueryStageExec] && !r.supportsColumnar && r.children.exists(c =>
          c.isInstanceOf[ColumnarToRowExec]) =>
      // This is a fix for when DPP and AQE both enabled,
      // ColumnarExchange maybe child as a Row SparkPlan.
      val children = r.children.map {
        case c: ColumnarToRowExec =>
          if (columnarConf.enableArrowColumnarToRow) {
            try {
              val child = replaceWithColumnarPlan(c.child)
              ArrowColumnarToRowExec(child)
            } catch {
              case scala.util.control.NonFatal(_) =>
                logInfo("ArrowColumnarToRow : Falling back to ColumnarToRow...")
                c.withNewChildren(c.children.map(replaceWithColumnarPlan))
            }
          } else {
            c.withNewChildren(c.children.map(replaceWithColumnarPlan))
          }
        case other =>
          replaceWithColumnarPlan(other)
      }
      r.withNewChildren(children)
    case p =>
      val children = p.children.map(replaceWithColumnarPlan)
      p.withNewChildren(children)
  }

  /** Called by ColumnarOverrideRules to propagate adaptive-execution support. */
  def setAdaptiveSupport(enable: Boolean): Unit = { isSupportAdaptive = enable }

  def apply(plan: SparkPlan): SparkPlan = {
    replaceWithColumnarPlan(plan)
  }
}
整个文件涉及三个类。结构上，三个类是平行的；逻辑上，ColumnarOverrideRules 使用并调用了 ColumnarPreOverrides 和 ColumnarPostOverrides：
case class ColumnarPreOverrides(session: SparkSession) extends Rule[SparkPlan]
case class ColumnarPostOverrides() extends Rule[SparkPlan]
case class ColumnarOverrideRules(session: SparkSession) extends ColumnarRule