Recently my work has involved a lot of back-and-forth with SQL engines, so over a free weekend I took some notes on how Spark parses SQL syntax and turns it into Spark jobs.

Spark SQL implements its own SQL parser, which frees it from having to track Hive releases. Let's look at the core of Spark SQL, starting from the API.

/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.spark.examples.sql

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.functions._

// One method for defining the schema of an RDD is to make a case class with the desired column
// names and types.
case class Record(key: Int, value: String)

object RDDRelation {
  def main(args: Array[String]) {
    val sparkConf = new SparkConf().setAppName("RDDRelation")
    val sc = new SparkContext(sparkConf)
    val sqlContext = new SQLContext(sc)

    // Importing the SQL context gives access to all the SQL functions and implicit conversions.
    import sqlContext.implicits._

    val df = sc.parallelize((1 to 100).map(i => Record(i, s"val_$i"))).toDF()

    // Any RDD containing case classes can be registered as a table. The schema of the table is
    // automatically inferred using scala reflection.
    df.registerTempTable("records")

    // Once tables have been registered, you can run SQL queries over them.
    println("Result of SELECT *:")
    sqlContext.sql("SELECT * FROM records").collect().foreach(println)

    // Aggregation queries are also supported.
    val count = sqlContext.sql("SELECT COUNT(*) FROM records").collect().head.getLong(0)
    println(s"COUNT(*): $count")

    // The results of SQL queries are themselves RDDs and support all normal RDD functions. The
    // items in the RDD are of type Row, which allows you to access each column by ordinal.
    val rddFromSql = sqlContext.sql("SELECT key, value FROM records WHERE key < 10")

    println("Result of RDD.map:")
    rddFromSql.map(row => s"Key: ${row(0)}, Value: ${row(1)}").collect().foreach(println)

    // Queries can also be written using a LINQ-like Scala DSL.
    df.where($"key" === 1).orderBy($"value".asc).select($"key").collect().foreach(println)

    // Write out an RDD as a parquet file.
    df.saveAsParquetFile("pair.parquet")

    // Read in parquet file. Parquet files are self-describing so the schema is preserved.
    val parquetFile = sqlContext.parquetFile("pair.parquet")

    // Queries can be run using the DSL on parquet files just like the original RDD.
    parquetFile.where($"key" === 1).select($"value".as("a")).collect().foreach(println)

    // These files can also be registered as tables.
    parquetFile.registerTempTable("parquetFile")
    sqlContext.sql("SELECT * FROM parquetFile").collect().foreach(println)

    sc.stop()
  }
}
Judging from the API, usage looks simple: the sql(...) calls go through Spark SQL's own parser, while the DataFrame calls (where/orderBy/select) are the DSL wrapping of the same machinery. The interesting part is how the SQL text itself gets parsed, starting from:
sqlContext.sql("SELECT * FROM records")
org.apache.spark.sql.SQLContext.scala
  /**
   * Executes a SQL query using Spark, returning the result as a [[DataFrame]]. The dialect that is
   * used for SQL parsing can be configured with 'spark.sql.dialect'.
   *
   * @group basic
   */
  def sql(sqlText: String): DataFrame = {
    if (conf.dialect == "sql") {
      DataFrame(this, parseSql(sqlText))
    } else {
      sys.error(s"Unsupported SQL dialect: ${conf.dialect}")
    }
  }
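As the scaladoc says, the dialect checked here comes from the 'spark.sql.dialect' setting. A minimal sketch of flipping that switch from user code, assuming the Spark 1.3-era SQLContext.setConf API (for a plain SQLContext the default is already "sql"):

// Sketch, not Spark source: the dialect tested in sql() is read from SQLConf.
sqlContext.setConf("spark.sql.dialect", "sql")   // "sql" routes the text through parseSql
sqlContext.sql("SELECT COUNT(*) FROM records")
// Any other value makes sql() fail with "Unsupported SQL dialect: ..."
// (HiveContext overrides this and defaults to the "hiveql" dialect.)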
001 - parseSql performs the SQL parsing and returns a LogicalPlan, the logical execution plan.
  protected[sql] def parseSql(sql: String): LogicalPlan = {
    ddlParser(sql, false).getOrElse(sqlParser(sql))
  }
org.apache.spark.sql.sources.DDLParser.scala
  def apply(input: String, exceptionOnError: Boolean): Option[LogicalPlan] = {
    try {
      Some(apply(input))
    } catch {
      case ddlException: DDLException => throw ddlException
      case _ if !exceptionOnError => None
      case x: Throwable => throw x
    }
  }
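So parseSql chains the two parsers through Option: the DDL parser is tried first and, because it is called with exceptionOnError = false, any parse failure collapses to None, which makes getOrElse fall through to the generic SQL parser. A toy sketch of that pattern in plain Scala (not Spark code; the parser names and the "plan" strings are invented for illustration):

// Toy illustration of the parseSql fallback: try the specialized parser first,
// turn its failures into None, and fall back to the general parser via getOrElse.
def tryDdlParse(sql: String): Option[String] =
  try {
    if (sql.trim.toUpperCase.startsWith("CREATE TEMPORARY TABLE")) Some(s"DDL plan: $sql")
    else throw new IllegalArgumentException("not a DDL statement")
  } catch {
    case _: IllegalArgumentException => None   // swallow the error, signal "no plan"
  }

def sqlParse(sql: String): String = s"SQL plan: $sql"

def parse(sql: String): String = tryDdlParse(sql).getOrElse(sqlParse(sql))

// parse("SELECT * FROM records")          // -> "SQL plan: SELECT * FROM records"
// parse("CREATE TEMPORARY TABLE t ...")   // -> "DDL plan: CREATE TEMPORARY TABLE t ..."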
scala.Option
final case class Some[+A](x: A) extends Option[A] {
  def isEmpty = false
  def get = x
}
org.apache.spark.sql.catalyst.AbstractSparkSQLParser
This is the function where the actual SQL parsing starts:
  def apply(input: String): LogicalPlan = {
    // Initialize the Keywords.
    lexical.initialize(reservedWords)
    phrase(start)(new lexical.Scanner(input)) match {
      case Success(plan, _) => plan
      case failureOrError => sys.error(failureOrError.toString)
    }
  }
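This apply is the standard Scala parser-combinator entry point: phrase(start) demands that the start production consume the entire token stream produced by lexical.Scanner, and a Success carries the resulting LogicalPlan. Here is a self-contained toy version of the same pattern, built on the standard library's StandardTokenParsers rather than Spark's grammar (the grammar below is made up for illustration):

import scala.util.parsing.combinator.syntactical.StandardTokenParsers

// Toy parser, not Spark's grammar: shows the phrase(start)(new lexical.Scanner(input)) pattern.
object ToySqlParser extends StandardTokenParsers {
  lexical.reserved ++= Seq("SELECT", "FROM")    // same role as SqlLexical.initialize(reservedWords)
  lexical.delimiters ++= Seq("*", ",")

  // start: recognizes "SELECT * FROM <table>" and returns the table name
  def start: Parser[String] = "SELECT" ~> "*" ~> "FROM" ~> ident

  def parse(input: String): String =
    phrase(start)(new lexical.Scanner(input)) match {
      case Success(table, _) => table
      case failureOrError    => sys.error(failureOrError.toString)
    }
}

// ToySqlParser.parse("SELECT * FROM records")   // -> "records"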
org.apache.spark.sql.catalyst.SqlLexical
  /* This is a work around to support the lazy setting */
  def initialize(keywords: Seq[String]): Unit = {
    reserved.clear()
    reserved ++= keywords
  }
/** This component provides a standard lexical parser for a simple,
* [[http://scala-lang.org Scala]]-like language. It parses keywords and
* identifiers, numeric literals (integers), strings, and delimiters.
*
* To distinguish between identifiers and keywords, it uses a set of
* reserved identifiers: every string contained in `reserved` is returned
* as a keyword token. (Note that `=>` is hard-coded as a keyword.)
* Additionally, the kinds of delimiters can be specified by the
* `delimiters` set.
*
* Usually this component is used to break character-based input into
* bigger tokens, which are then passed to a token-parser (see
* [[scala.util.parsing.combinator.syntactical.TokenParsers]].)
*
* @author Martin Odersky
* @author Iulian Dragos
* @author Adriaan Moors
*/
class StdLexical extends Lexical with StdTokens {

  /** The set of reserved identifiers: these will be returned as `Keyword`s. */
  val reserved = new mutable.HashSet[String]
scala.util.parsing.combinator.lexical.Scanners.Scanner
class Scanner(in: Reader[Char]) extends Reader[Token] {
  /** Convenience constructor (makes a character reader out of the given string) */
  def this(in: String) = this(new CharArrayReader(in.toCharArray()))

  private val (tok, rest1, rest2) = whitespace(in) match {
    case Success(_, in1) =>
      token(in1) match {
        case Success(tok, in2) => (tok, in1, in2)
        case ns: NoSuccess => (errorToken(ns.msg), ns.next, skip(ns.next))
      }
    case ns: NoSuccess => (errorToken(ns.msg), ns.next, skip(ns.next))
  }
  private def skip(in: Reader[Char]) = if (in.atEnd) in else in.rest

  override def source: java.lang.CharSequence = in.source
  override def offset: Int = in.offset
  def first = tok
  def rest = new Scanner(rest2)
  def pos = rest1.pos
  def atEnd = in.atEnd || (whitespace(in) match { case Success(_, in1) => in1.atEnd case _ => false })
}
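So the Scanner is a lazy Reader over tokens: each instance holds the current token (first) and builds the next Scanner on demand (rest), and anything registered in reserved comes back as a Keyword token. A small standalone sketch of walking that token stream with the standard library's StdLexical (the same mechanism as Spark's SqlLexical, just without Spark):

import scala.util.parsing.combinator.lexical.StdLexical

// Standalone sketch: dump the tokens a StdLexical Scanner produces for a SQL-ish string.
object TokenDump {
  def main(args: Array[String]): Unit = {
    val lexical = new StdLexical
    lexical.reserved ++= Seq("SELECT", "FROM", "WHERE")   // reserved words become Keyword tokens
    lexical.delimiters ++= Seq(",", "<", "*")

    var scanner = new lexical.Scanner("SELECT key, value FROM records WHERE key < 10")
    while (!scanner.atEnd) {
      println(scanner.first)   // e.g. keyword SELECT, identifier key, delimiter ",", numeric literal 10
      scanner = scanner.rest   // each rest builds a new Scanner over the remaining input
    }
  }
}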
In other words, the sql() method ends up returning a LogicalPlan, and the base classes below show that it is built on TreeNode, i.e. the plan is a tree structure.
abstract class LogicalPlan extends QueryPlan[LogicalPlan] with Logging {

abstract class QueryPlan[PlanType <: TreeNode[PlanType]] extends TreeNode[PlanType]
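To look at that tree from the user side, you can ask the DataFrame for its query execution. A quick sketch, assuming the Spark 1.3-era API where queryExecution and explain are reachable from the sqlContext used in the example at the top:

val planDemo = sqlContext.sql("SELECT key, value FROM records WHERE key < 10")
println(planDemo.queryExecution.logical)    // the unresolved LogicalPlan produced by parseSql
println(planDemo.queryExecution.analyzed)   // the same tree after analysis resolves tables and columns
planDemo.explain(true)                      // prints the logical, optimized and physical plans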