主要原理
spark sql 核心:
ParseInterface:
专门负责解析外部数据源SQL的SqlParser。目前自带的parser已经能满足各种需求
RunnableCommand:
从反射的数据源中实例化relation,然后注册到temp table中。
Strategy:
将plan映射为物理计划。
RelationProvider:
提供一个Relation。
BaseRelation:
可提供sql的一些逻辑操作(insert等)。
Spark SQL解析SQL流程如下:
1、Analyzer通过Rule解析,将UnresolvedRelation解析为具体的Relation(例如本文中的TextRelation)。
2、经过Parser、Analyzer、Optimizer处理,最终得到解析后的LogicalRelation。
3、自定义的Strategy将LogicalPlan映射到物理计划PhysicalRDD。
4、PhysicalRDD里包含了如何查询外部数据的规则。
简单示例
使用自定义的relation,strategy,plan实现简单的select,insert语句
代码:
Relation
case class TextRelation(sqlContext: SQLContext, schema: StructType, path: String) extends BaseRelation with InsertableRelation {

  /**
   * Writes `data` to `path` as comma-separated text.
   *
   * Fix: the original ignored the `overwrite` flag entirely and silently
   * skipped the write whenever the target path already existed. Now an
   * existing target is removed first when `overwrite` is true.
   */
  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    val target = new File(path)
    if (overwrite && target.exists()) deleteRecursively(target)
    // saveAsTextFile fails if the output path already exists, so only write
    // when the target is absent (either initially or after the cleanup above).
    if (!target.exists())
      data.rdd.map(_.mkString(",")).saveAsTextFile(path)
  }

  // Removes a file, or a directory together with all of its contents.
  // listFiles() returns null for non-directories/IO errors, hence Option(...).
  private def deleteRecursively(f: File): Unit = {
    if (f.isDirectory) Option(f.listFiles()).toSeq.flatten.foreach(deleteRecursively)
    f.delete()
  }
}
Source
class TextSource extends SchemaRelationProvider {

  // Fallback location used when the caller supplies no `path` option.
  private val DefaultPath = "/home/wpy/tmp/external_sql/testSql"

  /**
   * Builds a TextRelation for the given user schema.
   *
   * The target location comes from the `path` entry of `parameters`;
   * when it is absent, a fixed demo location is used instead.
   */
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation = {
    val location = parameters.getOrElse("path", DefaultPath)
    TextRelation(sqlContext, schema, location)
  }
}
sql入口
class Text4SQLContext(sc: SparkContext, sqlContext: SQLContext){
  // Register the custom strategy so the planner can translate text-relation
  // logical plans into their physical counterparts.
  sqlContext.experimental.extraStrategies = List(new TextStrategies().TextStrategy)

  /** Runs `sqlText` through the wrapped SQLContext and returns the result. */
  def sql(sqlText: String): DataFrame = sqlContext.sql(sqlText)
}
Plan
/**
 * Logical plan node for reading a text file at `path` with the
 * attributes given by `output`.
 */
case class LogicalText(output: Seq[Attribute], path: String) extends LogicalPlan {
  // Leaf node: the data comes straight from the text source, no child plans.
  override def children: Seq[LogicalPlan] = Seq.empty
}
/**
 * Physical plan node that scans a comma-separated text file at `path`
 * and produces one InternalRow of UTF8Strings per line.
 */
case class PhysicalText(output: Seq[Attribute], path: String) extends SparkPlan {
  override protected def doExecute(): RDD[InternalRow] = {
    sparkContext.textFile(path).mapPartitions { lines =>
      // Fix: the original called UnsafeProjection.create(schema) for every
      // row, which triggers code generation per record. Create the projection
      // once per partition instead.
      val projection = UnsafeProjection.create(schema)
      lines.map { line =>
        val fields = line.split(",").map(UTF8String.fromString)
        // The projection reuses its output row buffer, so copy before
        // handing the row downstream.
        projection(InternalRow.fromSeq(fields)).copy()
      }
    }
  }

  // Leaf node: reads from the file system, no child plans.
  override def children: Seq[SparkPlan] = Nil
}
/**
 * Physical wrapper that runs a RunnableCommand by delegating to Spark's
 * built-in ExecutedCommandExec node.
 */
case class TextExecuteCommand(cmd: RunnableCommand) extends SparkPlan {
  override protected def doExecute(): RDD[InternalRow] = ExecutedCommandExec(cmd).execute()

  // The wrapped command defines the output attributes.
  override def output: Seq[Attribute] = cmd.output

  // Leaf node: wraps a command rather than child plans.
  override def children: Seq[SparkPlan] = Seq.empty
}
Strategy
/**
 * Minimal query planner whose single strategy maps text-source logical
 * plans onto physical plans.
 */
class TextStrategies extends QueryPlanner[SparkPlan] with PredicateHelper {

  /** The only strategy this planner knows about. */
  override def strategies: Seq[GenericStrategy[SparkPlan]] = TextStrategy :: Nil

  /**
   * Translation rules:
   *  - LogicalText, or a LogicalRelation over a TextRelation, becomes PhysicalText;
   *  - a non-partitioned insert into a TextRelation becomes an
   *    InsertIntoDataSourceCommand wrapped in ExecutedCommandExec;
   *  - anything else returns Nil so other strategies can handle it.
   *
   * Cleanup: removed the unused pattern bindings (`i@`, `t`) and the dead
   * commented-out code from the original.
   */
  object TextStrategy extends Strategy {
    override def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
      case LogicalText(output, path) =>
        PhysicalText(output, path) :: Nil
      case LogicalRelation(TextRelation(_, _, path), output, _) =>
        PhysicalText(output, path) :: Nil
      // Only non-partitioned inserts are supported by this data source.
      case InsertIntoTable(l @ LogicalRelation(_: TextRelation, _, _), part, query, overwrite, false) if part.isEmpty =>
        ExecutedCommandExec(InsertIntoDataSourceCommand(l, query, overwrite)) :: Nil
      case _ => Nil
    }
  }

  /** Collects PlanLater placeholders so the planner can fill them in later. */
  override protected def collectPlaceholders(plan: SparkPlan): Seq[(SparkPlan, LogicalPlan)] =
    plan.collect {
      case placeholder @ PlanLater(logicalPlan) => placeholder -> logicalPlan
    }

  /** No pruning: keep every candidate plan. */
  override protected def prunePlans(plans: Iterator[SparkPlan]): Iterator[SparkPlan] = plans
}
示例:
/**
 * Demo driver: registers two text-backed tables, copies test1 into test2
 * via INSERT ... SELECT, and prints both result sets.
 */
object TestTextSql {
  // Local Spark session shared by every statement below.
  val conf = new SparkConf().setMaster("local[*]").setAppName(getClass.getCanonicalName)
  val ss = SparkSession.builder().config(conf).getOrCreate()

  def main(args: Array[String]): Unit = {
    val runner = new Text4SQLContext(ss.sparkContext, ss.sqlContext)

    // Register the source table over an existing text file and show it.
    runner.sql(
      """create table test1(
        |word string,
        |num string
        |) using external.datasource.TextSource
        |options(
        |path '/home/wpy/tmp/external_sql/test1'
        |)
      """.stripMargin)
    runner.sql("select * from test1").show
    print("=============================================\n")

    // Register the destination table, copy test1 into it, then show it.
    runner.sql(
      """create table test2(
        |word string,
        |num string
        |) using external.datasource.TextSource
        |options(
        |path '/home/wpy/tmp/external_sql/test2'
        |)
      """.stripMargin)
    runner.sql(
      """
        |insert into table test2
        |select * from test1
      """.stripMargin)
    runner.sql("select * from test2 order by word").show
  }
}
数据源
test1表(文本文件)
1,a
2,b
3,c
测试结果如下:
+----+---+
|word|num|
+----+---+
|   1|  a|
|   2|  b|
|   3|  c|
+----+---+
=============================================
+----+---+
|word|num|
+----+---+
|   1|  a|
|   2|  b|
|   3|  c|
+----+---+
由于这部分代码较为简单,此处则不再赘述,理论上自定义数据源可以实现在任何能读取的数据上进行sql操作,其中还可以自定义filter方法从源头对数据进行过滤,从而实现对大量数据的快速查询,注意此处使用了spark自带的RDD,使用其他spark不支持的数据源时,需要自行定义合适的RDD(物理计划)去获取数据。代码能直接运行(spark版本 2.2.0-SNAPSHOT)。