Using Spark SQL with a Custom Data Source (Spark 2.0+), with an Example

How It Works

The Spark SQL extension points at the core of this approach:
ParserInterface:
The SQL parser responsible for parsing statements against external data sources. The built-in parser already covers everything needed here.
RunnableCommand:
Instantiates the relation from the reflectively loaded data source class and registers it as a temp table.
Strategy:
Maps a logical plan to a physical plan.
RelationProvider:
Provides a Relation.
BaseRelation:
Exposes the logical operations a relation supports in SQL (insert, etc.).
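
For orientation, here is a simplified sketch of the shapes of these interfaces as they appear in org.apache.spark.sql.sources (trimmed down for illustration only; the real traits carry more members):

import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.types.StructType

// Simplified shapes only; the actual definitions live in org.apache.spark.sql.sources.
trait SchemaRelationProvider {
  def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation
}
abstract class BaseRelation {
  def sqlContext: SQLContext
  def schema: StructType
}
trait InsertableRelation {
  def insert(data: DataFrame, overwrite: Boolean): Unit
}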

Spark SQL resolves a SQL statement roughly as follows:
1. The Analyzer applies its rules to resolve each UnresolvedRelation into a concrete relation (JsonRelation for the built-in JSON source; TextRelation in this example).
2. Parsing, analysis, and optimization then produce a fully resolved plan over those relations.
3. A custom Strategy maps the LogicalPlan to a physical plan (PhysicalText here, analogous to Spark's PhysicalRDD).
4. The physical plan encapsulates the logic for actually reading the external data.
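
Each of these phases can be observed by printing a query's plans, for instance once the tables in the example below have been registered (ts is the Text4SQLContext entry point defined further down):

// Prints the parsed, analyzed and optimized logical plans plus the chosen physical plan,
// which makes it easy to see where the custom relation and strategy take effect.
ts.sql("select * from test1").explain(true)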

A Simple Example

Use a custom relation, strategy, and plan to implement simple select and insert statements.

Code:

Relation


import java.io.File
import org.apache.spark.sql.{DataFrame, SQLContext}
import org.apache.spark.sql.sources.{BaseRelation, InsertableRelation}
import org.apache.spark.sql.types.StructType

// insert writes the rows out as comma-separated text, but only if the target path does not exist yet.
case class TextRelation(sqlContext: SQLContext, schema: StructType, path: String) extends BaseRelation with InsertableRelation {
  override def insert(data: DataFrame, overwrite: Boolean): Unit = {
    if (!new File(path).exists())
      data.rdd.map(_.mkString(",")).saveAsTextFile(path)
  }
}

Source


import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.sources.{BaseRelation, SchemaRelationProvider}
import org.apache.spark.sql.types.StructType

// Provider resolved by reflection from the `using` clause; builds a TextRelation from the path option.
class TextSource extends SchemaRelationProvider {
  override def createRelation(sqlContext: SQLContext, parameters: Map[String, String], schema: StructType): BaseRelation = {
    val path = parameters.getOrElse("path", "/home/wpy/tmp/external_sql/testSql")
    TextRelation(sqlContext, schema, path)
  }
}
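
As a side note, the same provider class can also be reached through the DataFrame reader API instead of CREATE TABLE. A hedged sketch (assuming sqlContext is the SQLContext used in the example below; the schema mirrors the word/num columns from that example, and scanning the result still requires the custom strategy to be registered, since TextRelation implements no built-in scan trait):

import org.apache.spark.sql.types.{StringType, StructField, StructType}

// Load the custom source directly through the reader API.
val df = sqlContext.read
  .format("external.datasource.TextSource")
  .schema(StructType(Seq(StructField("word", StringType), StructField("num", StringType))))
  .option("path", "/home/wpy/tmp/external_sql/test1")
  .load()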

SQL entry point


import org.apache.spark.SparkContext
import org.apache.spark.sql.{DataFrame, SQLContext}

// Thin SQL entry point: registers the custom strategy, then delegates to sqlContext.sql.
class Text4SQLContext(sc: SparkContext, sqlContext: SQLContext) {
  sqlContext.experimental.extraStrategies = new TextStrategies().TextStrategy :: Nil
  def sql(sqlText: String): DataFrame = {
    sqlContext.sql(sqlText)
  }
}
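
The experimental.extraStrategies field is the standard hook for injecting extra planning strategies; the same registration could equally be done on the SparkSession itself, for example:

// Equivalent registration on the SparkSession (ss is the session built in the example below).
ss.experimental.extraStrategies = new TextStrategies().TextStrategy :: Nil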

Plan


import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
// Leaf logical node carrying just the output attributes and the file path.
case class LogicalText(output: Seq[Attribute], path: String) extends LogicalPlan {
  override def children: Seq[LogicalPlan] = Nil
}


import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.unsafe.types.UTF8String

// Physical scan: read the text file and turn each comma-separated line into an UnsafeRow.
case class PhysicalText(output: Seq[Attribute], path: String) extends SparkPlan {
  override protected def doExecute(): RDD[InternalRow] = {
    sparkContext.textFile(path).map { line =>
      val fields = line.split(",").map(UTF8String.fromString)
      UnsafeProjection.create(schema)(InternalRow.fromSeq(fields))
    }
  }
  override def children: Seq[SparkPlan] = Nil
}


import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.command.{ExecutedCommandExec, RunnableCommand}

// Wrapper that runs a RunnableCommand through Spark's built-in ExecutedCommandExec.
case class TextExecuteCommand(cmd: RunnableCommand) extends SparkPlan {
  override protected def doExecute(): RDD[InternalRow] = {
    ExecutedCommandExec(cmd).execute()
  }
  override def output: Seq[Attribute] = cmd.output
  override def children: Seq[SparkPlan] = Nil
}

Strategy


import org.apache.spark.sql.Strategy
import org.apache.spark.sql.catalyst.expressions.PredicateHelper
import org.apache.spark.sql.catalyst.planning.{GenericStrategy, QueryPlanner}
import org.apache.spark.sql.catalyst.plans.logical.{InsertIntoTable, LogicalPlan}
import org.apache.spark.sql.execution.{PlanLater, SparkPlan}
import org.apache.spark.sql.execution.command.ExecutedCommandExec
import org.apache.spark.sql.execution.datasources.{InsertIntoDataSourceCommand, LogicalRelation}

class TextStrategies extends QueryPlanner[SparkPlan] with PredicateHelper {
  override def strategies: Seq[GenericStrategy[SparkPlan]] = TextStrategy :: Nil

  object TextStrategy extends Strategy {
    override def apply(plan: LogicalPlan): Seq[SparkPlan] = {
      plan match {
        // A fully custom logical node maps straight to our physical scan.
        case LogicalText(output, path) => PhysicalText(output, path) :: Nil
        // A resolved relation backed by TextRelation is scanned with PhysicalText as well.
        case LogicalRelation(TextRelation(_, _, path), output, _) => PhysicalText(output, path) :: Nil
        // Non-partitioned inserts go through Spark's InsertIntoDataSourceCommand,
        // which ends up calling TextRelation.insert.
        case i@InsertIntoTable(l@LogicalRelation(t: TextRelation, _, _), part, query, overwrite, false) if part.isEmpty =>
          ExecutedCommandExec(InsertIntoDataSourceCommand(l, query, overwrite)) :: Nil
        case _ => Nil
      }
    }
  }

  // Required by QueryPlanner: PlanLater marks child plans that still need to be planned.
  override protected def collectPlaceholders(plan: SparkPlan): Seq[(SparkPlan, LogicalPlan)] = {
    plan.collect {
      case placeholder@PlanLater(logicalPlan) => placeholder -> logicalPlan
    }
  }

  // prunePlans may drop candidate plans; here we keep them all.
  override protected def prunePlans(plans: Iterator[SparkPlan]): Iterator[SparkPlan] = {
    plans
  }
}

Example:


import org.apache.spark.SparkConf
import org.apache.spark.sql.SparkSession

object TestTextSql {
  val conf = new SparkConf().setMaster("local[*]").setAppName(getClass.getCanonicalName)
  val ss = SparkSession.builder().config(conf).getOrCreate()

  def main(args: Array[String]): Unit = {
    val sqlContext = ss.sqlContext
    val sparkContext = ss.sparkContext
    val ts = new Text4SQLContext(sparkContext, sqlContext)
    // Register test1 on an existing text file, then query it.
    ts.sql(
      """create table test1(
        |word string,
        |num string
        |) using external.datasource.TextSource
        |options(
        |path '/home/wpy/tmp/external_sql/test1'
        |)
      """.stripMargin)
    ts.sql("select * from test1").show
    print("=============================================\n")
    // Register test2 on a path that does not exist yet, then fill it from test1.
    ts.sql(
      """create table test2(
        |word string,
        |num string
        |) using external.datasource.TextSource
        |options(
        |path '/home/wpy/tmp/external_sql/test2'
        |)
      """.stripMargin)
    ts.sql(
      """
        |insert into table test2
        |select * from test1
      """.stripMargin)
    ts.sql("select * from test2 order by word").show
  }
}

Data source
Table test1 (a plain text file):
1,a
2,b
3,c

The test output is as follows:

+----+---+
|word|num|
+----+---+
|   1|  a|
|   2|  b|
|   3|  c|
+----+---+

=============================================

+----+---+
|word|num|
+----+---+
|   1|  a|
|   2|  b|
|   3|  c|
+----+---+

The code here is fairly simple, so I will not walk through it line by line. In principle, a custom data source lets you run SQL over anything you are able to read. You can also push filtering down to the source itself, so that data is pruned before Spark ever sees the rows, which makes queries over large inputs much faster; a hypothetical sketch of such a relation follows below. Note that this example leans on Spark's built-in RDDs (textFile); for data sources Spark cannot read natively, you need to define a suitable RDD (and physical plan) of your own to fetch the data. The code runs as-is against Spark 2.2.0-SNAPSHOT.
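
As a hedged sketch of that filtering idea (not part of the example above): mixing org.apache.spark.sql.sources.PrunedFilteredScan into the relation lets Spark push column pruning and filters down to the source, so unwanted lines can be skipped before they become rows. TextFilteredRelation and its filter handling are hypothetical:

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SQLContext}
import org.apache.spark.sql.sources.{BaseRelation, EqualTo, Filter, PrunedFilteredScan}
import org.apache.spark.sql.types.StructType

// Hypothetical relation that filters lines at the source and returns only the requested columns.
case class TextFilteredRelation(sqlContext: SQLContext, schema: StructType, path: String)
  extends BaseRelation with PrunedFilteredScan {
  override def buildScan(requiredColumns: Array[String], filters: Array[Filter]): RDD[Row] = {
    // Handle only the filters we understand (equality on the first column here);
    // Spark re-evaluates any filter the source does not apply itself.
    def keep(line: String): Boolean = filters.forall {
      case EqualTo("word", value) => line.split(",")(0) == value.toString
      case _ => true
    }
    val fieldIndex = schema.fieldNames.zipWithIndex.toMap
    sqlContext.sparkContext.textFile(path)
      .filter(keep)
      .map { line =>
        val fields = line.split(",")
        Row.fromSeq(requiredColumns.map(c => fields(fieldIndex(c))).toSeq)
      }
  }
}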
