Recently my work has involved a lot of back-and-forth with SQL engines, so over a free weekend I took some notes on how Spark parses SQL syntax and turns it into Spark jobs.

Spark SQL implements its own SQL parser, which frees it from having to track Hive releases. Let's look at the core of Spark SQL, starting from the API.

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.examples.sql

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._

// One method for defining the schema of an RDD is to make a case class with the desired column
// names and types.
case class Record(key: Int, value: String)

object RDDRelation {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("RDDRelation")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)

    // Importing the SQL context gives access to all the SQL functions and implicit conversions.
    import sqlContext.implicits._

    val df = sc.parallelize((1 to 100).map(i => Record(i, s"val_$i"))).toDF()

    // Any RDD containing case classes can be registered as a table. The schema of the table is
    // automatically inferred using scala reflection.
    df.registerTempTable("records")

    // Once tables have been registered, you can run SQL queries over them.
    println("Result of SELECT *:")
    sqlContext.sql("SELECT * FROM records").collect().foreach(println)

    // Aggregation queries are also supported.
    val count = sqlContext.sql("SELECT COUNT(*) FROM records").collect().head.getLong(0)
    println(s"COUNT(*): $count")

    // The results of SQL queries are themselves RDDs and support all normal RDD functions. The
    // items in the RDD are of type Row, which allows you to access each column by ordinal.
    val rddFromSql = sqlContext.sql("SELECT key, value FROM records WHERE key < 10")

    println("Result of RDD.map:")
    rddFromSql.map(row => s"Key: ${row(0)}, Value: ${row(1)}").collect().foreach(println)

    // Queries can also be written using a LINQ-like Scala DSL.
    df.where($"key" === 1).orderBy($"value".asc).select($"key").collect().foreach(println)

    // Write out an RDD as a parquet file.
    df.saveAsParquetFile("pair.parquet")

    // Read in parquet file. Parquet files are self-describing so the schema is preserved.
    val parquetFile = sqlContext.parquetFile("pair.parquet")

    // Queries can be run using the DSL on parquet files just like the original RDD.
    parquetFile.where($"key" === 1).select($"value".as("a")).collect().foreach(println)

    // These files can also be registered as tables.
    parquetFile.registerTempTable("parquetFile")
    sqlContext.sql("SELECT * FROM parquetFile").collect().foreach(println)

    sc.stop()
  }
}
Judging from the API, usage looks simple: the sql(...) calls go through Spark SQL's own parser, while the DataFrame calls (where/orderBy/select) are the DSL wrapping of the same machinery. The interesting part is how the SQL text itself gets parsed, starting from:
sqlContext.sql("SELECT * FROM records")
org.apache.spark.sql.SQLContext.scala
  /**
   * Executes a SQL query using Spark, returning the result as a [[DataFrame]]. The dialect that is
   * used for SQL parsing can be configured with 'spark.sql.dialect'.
   *
   * @group basic
   */
  def sql(sqlText: String): DataFrame = {
    if (conf.dialect == "sql") {
      DataFrame(this, parseSql(sqlText))
    } else {
      sys.error(s"Unsupported SQL dialect: ${conf.dialect}")
    }
  }
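As the scaladoc says, the dialect checked here comes from the 'spark.sql.dialect' setting. A minimal sketch of flipping that switch from user code, assuming the Spark 1.3-era SQLContext.setConf API (for a plain SQLContext the default is already "sql"):

// Sketch, not Spark source: the dialect tested in sql() is read from SQLConf.
sqlContext.setConf("spark.sql.dialect", "sql")   // "sql" routes the text through parseSql
sqlContext.sql("SELECT COUNT(*) FROM records")
// Any other value makes sql() fail with "Unsupported SQL dialect: ..."
// (HiveContext overrides this and defaults to the "hiveql" dialect.)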
001 - parseSql performs the SQL parsing and returns a LogicalPlan, the logical execution plan.
  protected[sql] def parseSql(sql: String): LogicalPlan = {
    ddlParser(sql, false).getOrElse(sqlParser(sql))
  }
org.apache.spark.sql.sources.DDLParser.scala
  def apply(input: String, exceptionOnError: Boolean): Option[LogicalPlan] = {
    try {
      Some(apply(input))
    } catch {
      case ddlException: DDLException => throw ddlException
      case _ if !exceptionOnError => None
      case x: Throwable => throw x
    }
  }
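So parseSql chains the two parsers through Option: the DDL parser is tried first and, because it is called with exceptionOnError = false, any parse failure collapses to None, which makes getOrElse fall through to the generic SQL parser. A toy sketch of that pattern in plain Scala (not Spark code; the parser names and the "plan" strings are invented for illustration):

// Toy illustration of the parseSql fallback: try the specialized parser first,
// turn its failures into None, and fall back to the general parser via getOrElse.
def tryDdlParse(sql: String): Option[String] =
  try {
    if (sql.trim.toUpperCase.startsWith("CREATE TEMPORARY TABLE")) Some(s"DDL plan: $sql")
    else throw new IllegalArgumentException("not a DDL statement")
  } catch {
    case _: IllegalArgumentException => None   // swallow the error, signal "no plan"
  }

def sqlParse(sql: String): String = s"SQL plan: $sql"

def parse(sql: String): String = tryDdlParse(sql).getOrElse(sqlParse(sql))

// parse("SELECT * FROM records")          // -> "SQL plan: SELECT * FROM records"
// parse("CREATE TEMPORARY TABLE t ...")   // -> "DDL plan: CREATE TEMPORARY TABLE t ..."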
scala.Option
final case class Some[+A](x: A) extends Option[A] {
  def isEmpty = false
  def get = x
}
org.apache.spark.sql.catalyst.AbstractSparkSQLParser
This is the function where the actual SQL parsing starts:
  def apply(input: String): LogicalPlan = {
    // Initialize the Keywords.
    lexical.initialize(reservedWords)
    phrase(start)(new lexical.Scanner(input)) match {
      case Success(plan, _) => plan
      case failureOrError => sys.error(failureOrError.toString)
    }
  }
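This apply is the standard Scala parser-combinator entry point: phrase(start) demands that the start production consume the entire token stream produced by lexical.Scanner, and a Success carries the resulting LogicalPlan. Here is a self-contained toy version of the same pattern, built on the standard library's StandardTokenParsers rather than Spark's grammar (the grammar below is made up for illustration):

import scala.util.parsing.combinator.syntactical.StandardTokenParsers

// Toy parser, not Spark's grammar: shows the phrase(start)(new lexical.Scanner(input)) pattern.
object ToySqlParser extends StandardTokenParsers {
  lexical.reserved ++= Seq("SELECT", "FROM")    // same role as SqlLexical.initialize(reservedWords)
  lexical.delimiters ++= Seq("*", ",")

  // start: recognizes "SELECT * FROM <table>" and returns the table name
  def start: Parser[String] = "SELECT" ~> "*" ~> "FROM" ~> ident

  def parse(input: String): String =
    phrase(start)(new lexical.Scanner(input)) match {
      case Success(table, _) => table
      case failureOrError    => sys.error(failureOrError.toString)
    }
}

// ToySqlParser.parse("SELECT * FROM records")   // -> "records"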
org.apache.spark.sql.catalyst.SqlLexical
  /* This is a work around to support the lazy setting */
  def initialize(keywords: Seq[String]): Unit = {
    reserved.clear()
    reserved ++= keywords
  }
/** This component provides a standard lexical parser for a simple,
* [[http://scala-lang.org Scala]]-like language. It parses keywords and
* identifiers, numeric literals (integers), strings, and delimiters.
*
* To distinguish between identifiers and keywords, it uses a set of
* reserved identifiers: every string contained in `reserved` is returned
* as a keyword token. (Note that `=>` is hard-coded as a keyword.)
* Additionally, the kinds of delimiters can be specified by the
* `delimiters` set.
*
* Usually this component is used to break character-based input into
* bigger tokens, which are then passed to a token-parser (see
* [[scala.util.parsing.combinator.syntactical.TokenParsers]].)
*
* @author Martin Odersky
* @author Iulian Dragos
* @author Adriaan Moors
*/
class StdLexical extends Lexical with StdTokens {

  /** The set of reserved identifiers: these will be returned as `Keyword`s. */
  val reserved = new mutable.HashSet[String]
scala.util.parsing.combinator.lexical.Scanners.Scanner
class Scanner(in: Reader[Char]) extends Reader[Token] {
  /** Convenience constructor (makes a character reader out of the given string) */
  def this(in: String) = this(new CharArrayReader(in.toCharArray()))

  private val (tok, rest1, rest2) = whitespace(in) match {
    case Success(_, in1) =>
      token(in1) match {
        case Success(tok, in2) => (tok, in1, in2)
        case ns: NoSuccess => (errorToken(ns.msg), ns.next, skip(ns.next))
      }
    case ns: NoSuccess => (errorToken(ns.msg), ns.next, skip(ns.next))
  }
  private def skip(in: Reader[Char]) = if (in.atEnd) in else in.rest

  override def source: java.lang.CharSequence = in.source
  override def offset: Int = in.offset
  def first = tok
  def rest = new Scanner(rest2)
  def pos = rest1.pos
  def atEnd = in.atEnd || (whitespace(in) match { case Success(_, in1) => in1.atEnd case _ => false })
}
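So the Scanner is a lazy Reader over tokens: each instance holds the current token (first) and builds the next Scanner on demand (rest), and anything registered in reserved comes back as a Keyword token. A small standalone sketch of walking that token stream with the standard library's StdLexical (the same mechanism as Spark's SqlLexical, just without Spark):

import scala.util.parsing.combinator.lexical.StdLexical

// Standalone sketch: dump the tokens a StdLexical Scanner produces for a SQL-ish string.
object TokenDump {
  def main(args: Array[String]): Unit = {
    val lexical = new StdLexical
    lexical.reserved ++= Seq("SELECT", "FROM", "WHERE")   // reserved words become Keyword tokens
    lexical.delimiters ++= Seq(",", "<", "*")

    var scanner = new lexical.Scanner("SELECT key, value FROM records WHERE key < 10")
    while (!scanner.atEnd) {
      println(scanner.first)   // e.g. keyword SELECT, identifier key, delimiter ",", numeric literal 10
      scanner = scanner.rest   // each rest builds a new Scanner over the remaining input
    }
  }
}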
In other words, the sql() method ends up returning a LogicalPlan, and the base classes below show that it is built on TreeNode, i.e. the plan is a tree structure.
abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging {

abstract class QueryPlan[PlanType <: TreeNode[PlanType]] extends TreeNode[PlanType]
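To look at that tree from the user side, you can ask the DataFrame for its query execution. A quick sketch, assuming the Spark 1.3-era API where queryExecution and explain are reachable from the sqlContext used in the example at the top:

val planDemo = sqlContext.sql("SELECT key, value FROM records WHERE key < 10")
println(planDemo.queryExecution.logical)    // the unresolved LogicalPlan produced by parseSql
println(planDemo.queryExecution.analyzed)   // the same tree after analysis resolves tables and columns
planDemo.explain(true)                      // prints the logical, optimized and physical plans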