1) Download the Spark source code from GitHub. (All of the code below needs to be added; just locate the corresponding file and add it there.)
2) Open the grammar file SqlBase.g4. OFFSET is used as the running example throughout.
2.1) Locate the following rule and add the OFFSET alternative:
queryOrganization
    : (ORDER BY order+=sortItem (',' order+=sortItem)*)?
      (CLUSTER BY clusterBy+=expression (',' clusterBy+=expression)*)?
      (DISTRIBUTE BY distributeBy+=expression (',' distributeBy+=expression)*)?
      (SORT BY sort+=sortItem (',' sort+=sortItem)*)?
      windows?
      (OFFSET offset=expression)?           // added: the OFFSET clause, similar to Oracle's keyword
      (LIMIT (ALL | limit=expression))?
    ;
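Once the grammar is modified and Spark is rebuilt (the build regenerates the ANTLR parser from SqlBase.g4), the new clause can be combined with ORDER BY and LIMIT. A minimal usage sketch, assuming a SparkSession named spark and a hypothetical table t:

// Skip the first 10 rows in id order, then return the next 5.
spark.sql("SELECT id FROM t ORDER BY id OFFSET 10 LIMIT 5").show()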
2.2) In the package org.apache.spark.sql.execution (file limit.scala), add the physical operators:
/**
 * Physical operator for OFFSET: skips the first `offset` rows of the child and
 * returns at most `limit` rows after the skip.
 * @param offset number of leading rows to skip
 * @param limit  maximum number of rows to return after the skip
 * @param child  the child physical plan
 */
case class OffsetExec(offset: Int, limit: Int, child: SparkPlan) extends UnaryExecNode {
  override def output: Seq[Attribute] = child.output

  override def outputPartitioning: Partitioning = SinglePartition

  // Collect path: fetch the first offset + limit rows, then drop the leading offset rows.
  override def executeCollect(): Array[InternalRow] =
    child.executeTake(offset + limit).drop(offset)

  private val serializer: Serializer = new UnsafeRowSerializer(child.output.size)

  protected override def doExecute(): RDD[InternalRow] = {
    // Each partition keeps at most its first offset + limit rows ...
    val locallyLimited = child.execute().mapPartitionsInternal(_.take(offset + limit))
    // ... the survivors are shuffled into a single partition ...
    val shuffled = new ShuffledRowRDD(
      ShuffleExchangeExec.prepareShuffleDependency(
        locallyLimited, child.output, SinglePartition, serializer))
    // ... where the first offset rows are dropped and the next limit rows returned.
    shuffled.mapPartitionsInternal(_.take(offset + limit).drop(offset))
  }
}
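Note the design here: both the collect path and the distributed path over-fetch the first offset + limit rows and only then drop the leading offset, mirroring the local-take / shuffle-to-one-partition / global-take scheme of Spark's built-in limit operators. A large OFFSET is therefore paid for in full.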
/**
 * Physical operator for ORDER BY ... OFFSET ... LIMIT: computes the ordered top
 * offset + limit rows, drops the first offset of them, and applies the projection.
 * @param offset      number of leading rows to skip
 * @param limit       maximum number of rows to return after the skip
 * @param sortOrder   the ordering to apply before skipping and limiting
 * @param projectList the columns to project in the result
 * @param child       the child physical plan
 */
case class SortOffsetLimitAndProjectExec(
    offset: Int,
    limit: Int,
    sortOrder: Seq[SortOrder],
    projectList: Seq[NamedExpression],
    child: SparkPlan) extends UnaryExecNode {

  override def output: Seq[Attribute] = projectList.map(_.toAttribute)
  override def executeCollect(): Array[InternalRow] = {
    val ord = new LazilyGeneratedOrdering(sortOrder, child.output)
    // takeOrdered takes the Ordering as its curried second argument;
    // drop(offset) must come after that, not before.
    val data = child.execute().map(_.copy()).takeOrdered(offset + limit)(ord).drop(offset)
    if (projectList != child.output) {
      val proj = UnsafeProjection.create(projectList, child.output)
      data.map(r => proj(r).copy())
    } else {
      data
    }
  }

  private val serializer: Serializer = new UnsafeRowSerializer(child.output.size)
  protected override def doExecute(): RDD[InternalRow] = {
    val ord = new LazilyGeneratedOrdering(sortOrder, child.output)
    // Each partition must keep its top offset + limit rows (not just limit),
    // otherwise rows still needed after the global drop(offset) could be lost.
    val localTopK: RDD[InternalRow] = {
      child.execute().map(_.copy()).mapPartitions { iter =>
        org.apache.spark.util.collection.Utils.takeOrdered(iter, offset + limit)(ord)
      }
    }
    val shuffled = new ShuffledRowRDD(
      ShuffleExchangeExec.prepareShuffleDependency(
        localTopK, child.output, SinglePartition, serializer))
    shuffled.mapPartitions { iter =>
      // As in executeCollect: take the ordered top offset + limit, then drop the leading offset.
      val topK = org.apache.spark.util.collection.Utils
        .takeOrdered(iter.map(_.copy()), offset + limit)(ord)
        .drop(offset)
      if (projectList != child.output) {
        val proj = UnsafeProjection.create(projectList, child.output)
        topK.map(r => proj(r))
      } else {
        topK
      }
    }
  }
  override def outputOrdering: Seq[SortOrder] = sortOrder

  override def outputPartitioning: Partitioning = SinglePartition

  override def simpleString: String = {
    val orderByString = Utils.truncatedString(sortOrder, "[", ",", "]")
    val outputString = Utils.truncatedString(output, "[", ",", "]")
    s"SortOffsetLimitAndProject(offset=$offset, limit=$limit, orderBy=$orderByString, output=$outputString)"
  }
}
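This operator is modeled on Spark's built-in TakeOrderedAndProjectExec: a per-partition ordered top-K, a shuffle into a single partition, and a final ordered merge. The only differences are that the top-K bound is offset + limit and that drop(offset) is applied after the final merge.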
2.3) In org.apache.spark.sql.catalyst.plans.logical (file basicLogicalOperators.scala), add the logical plan node:
/**
 * Logical plan node for OFFSET.
 * @param offsetExpr the offset expression
 * @param limitExpr  the limit expression
 * @param child      the child logical plan
 */
case class Offset(offsetExpr: Expression, limitExpr: Expression, child: LogicalPlan) extends UnaryNode {
  override def output: Seq[Attribute] = child.output
}
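Since output simply forwards child.output, the new node is transparent to the analyzer; the offset semantics live entirely in the physical operators from step 2.2.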
2.4) In org.apache.spark.sql.catalyst.parser.AstBuilder, extend withQueryResultClauses:
    // OFFSET: wrap the windowed plan in an Offset node when an OFFSET clause is present.
    // Note that Offset takes the child plan (withWindow) as its third argument.
    val withOffset = if (limit != null) {
      withWindow.optional(offset) {
        Offset(typedVisit(offset), typedVisit(limit), withWindow)
      }
    } else {
      // No LIMIT clause: reuse the offset expression as a placeholder limit expression.
      withWindow.optional(offset) {
        Offset(typedVisit(offset), typedVisit(offset), withWindow)
      }
    }

    // LIMIT
    withOffset.optional(limit) {
      Limit(typedVisit(limit), withOffset)
    }
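The steps above cover the grammar, the physical operators, the logical plan, and the parser, but the planner still has to map the logical Offset node onto OffsetExec; the original write-up does not show this step. A minimal sketch of what such a rule could look like, assuming it is added to the SpecialLimits strategy in org.apache.spark.sql.execution.SparkStrategies and that the offset and limit are integer literals:

// Hypothetical planner rule (not shown in the original text): plan the logical
// Offset node as the physical OffsetExec, reusing the IntegerLiteral extractor
// that SparkStrategies already applies to Limit.
case logical.Offset(IntegerLiteral(offset), IntegerLiteral(limit), child) =>
  execution.OffsetExec(offset, limit, planLater(child)) :: Nil

An analogous case matching Offset over a Sort could plan SortOffsetLimitAndProjectExec.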