Spark 提交执行源码学习

SparkSubmit 执行后,执行环境准备工作

private def runDriver(): Unit = {
    addAmIpFilter(None, System.getenv(ApplicationConstants.APPLICATION_WEB_PROXY_BASE_ENV))
    userClassThread = startUserApplication()

    // This is a bit hacky, but we need to wait until the spark.driver.port property has
    // been set by the Thread executing the user class.
    logInfo("Waiting for spark context initialization...")
    val totalWaitTime = sparkConf.get(AM_MAX_WAIT_TIME)
      val sc = ThreadUtils.awaitResult(sparkContextPromise.future,
        Duration(totalWaitTime, TimeUnit.MILLISECONDS))
      if (sc != null) {
        val rpcEnv = sc.env.rpcEnv
        val userConf = sc.getConf
        val host = userConf.get(DRIVER_HOST_ADDRESS)
        val port = userConf.get(DRIVER_PORT)
        // Register the ApplicationMaster using the driver address read from the user's conf.
        // The fourth argument is the driver UI's web URL, if a UI exists (sc.ui is optional).
        registerAM(host, port, userConf, sc.ui.map(_.webUrl), appAttemptId)

        // Resolve a reference to the driver-side scheduler endpoint at host:port, then hand it
        // to the allocator.
        val driverRef = rpcEnv.setupEndpointRef(
          RpcAddress(host, port),
          YarnSchedulerBackend.ENDPOINT_NAME)
        createAllocator(driverRef, userConf, rpcEnv, appAttemptId, distCacheConf)
      } else {
        // Sanity check; should never happen in normal operation, since sc should only be null
        // if the user app did not create a SparkContext.
        throw new IllegalStateException("User did not initialize spark context!")
      // 等待用户线程执行完毕
   * 启动用户线程
   * Start the user class, which contains the spark driver, in a separate Thread.
   * If the main routine exits cleanly or exits with System.exit(N) for any N
   * we assume it was successful, for all other cases we assume failure.
   * Returns the user thread that was started.
  private def startUserApplication(): Thread = {
    logInfo("Starting the user application in a separate Thread")

    var userArgs = args.userArgs
    val mainMethod = userClassLoader.loadClass(args.userClass)
      .getMethod("main", classOf[Array[String]])

    val userThread = new Thread {
      override def run(): Unit = {
        try {
          if (!Modifier.isStatic(mainMethod.getModifiers)) {
            logError(s"Could not find static main method in object ${args.userClass}")
            finish(FinalApplicationStatus.FAILED, ApplicationMaster.EXIT_EXCEPTION_USER_CLASS)
          } else {
            mainMethod.invoke(null, userArgs.toArray)
            finish(FinalApplicationStatus.SUCCEEDED, ApplicationMaster.EXIT_SUCCESS)
            logDebug("Done running user class")
        } catch {  



# Build a Hive-enabled SparkSession for the test query.
# The builder chain has to end with getOrCreate(): without it the trailing line
# continuation runs into the next statement and `spark` is never bound to a session.
spark = SparkSession.builder \
    .config('spark.driver.memory','4g') \
    .config('spark.executor.memory','4g') \
    .config('spark.executor.instances',2) \
    .config('spark.executor.cores',2) \
    .config('mapreduce.input.fileinputformat.input.dir.recursive', 'true') \
    .config('mapred.input.dir.recursive', 'true') \
    .config('spark.sql.hive.convertMetastoreOrc', 'false') \
    .config('spark.yarn.queue', 'datawarehouse') \
    .appName('yqj test') \
    .enableHiveSupport() \
    .getOrCreate()
# Row count per cityid; spark.sql() returns a DataFrame for this query.
sql = "select count(*) from ods.check_hive2_not_delete group by cityid"
sql_run = spark.sql(sql)


     * Gets an existing [[SparkSession]] or, if there is no existing one, creates a new
     * one based on the options set in this builder.
     * This method first checks whether there is a valid thread-local SparkSession,
     * and if yes, return that one. It then checks whether there is a valid global
     * default SparkSession, and if yes, return that one. If no valid global default
     * SparkSession exists, the method creates a new SparkSession and assigns the
     * newly created SparkSession as the global default.
     * In case an existing SparkSession is returned, the non-static config options specified in
     * this builder will be applied to the existing SparkSession.
     *  SparkSession对象可以重用,在Cli模式中
     * @since 2.0.0
def getOrCreate(): SparkSession = synchronized {
      val sparkConf = new SparkConf()
      options.foreach {
    case (k, v) => sparkConf.set(k, v) }

      // Get the session from current thread's active session.
      var session = activeThreadSession.get()
      if ((session ne null) && !session.sparkContext.isStopped) {
        applyModifiableSettings(session, new java.util.HashMap[String, String](options.asJava))
        return session

      // Global synchronization so we will only set the default session once.
      SparkSession.synchronized {
        // If the current thread does not have an active session, get it from the global session.
        session = defaultSession.get()
        if ((session ne null) && !session.sparkContext.isStopped) {
          applyModifiableSettings(session, new java.util.HashMap[String, String](options.asJava))
          return session

        // No active nor global default session. Create a new one.
        val sparkContext = userSuppliedContext.getOrElse {
          // Set a random app name if "spark.app.name" was not given.
          if (!sparkConf.contains("spark.app.name")) {

          // Do not update `SparkConf` for existing `SparkContext`, as it's shared by all sessions.

        // 构造session对象,需要传入SparkContext对象和SparkConf对象
        session = new SparkSession(sparkContext, None, None, extensions, options.toMap)

      return session

再来看下 SparkSession 类的结构。SessionState 是其中的一个核心类,很多属性都从它获取。

class SparkSession private(
    @transient val sparkContext: SparkContext,
    @transient private val existingSharedState: Option[SharedState],
    @transient private val parentSessionState: Option[SessionState],
    @transient private[sql] val extensions: SparkSessionExtensions,
    @transient private[sql] val initialSessionOptions: Map[String, String])
extends Serializable with Closeable with Logging {
   * State shared across sessions, including the `SparkContext`, cached data, listener,
   * and a catalog that interacts with external systems.
   *  跨会话共享的对象。
   * This is internal to Spark and there is no guarantee on interface stability.
   * @since 2.2.0
  lazy val sharedState: SharedState = {
    existingSharedState.getOrElse(new SharedState(sparkContext, initialSessionOptions))
   * State isolated across sessions, including SQL configurations, temporary tables, registered
   * functions, and everything else that accepts a [[org.apache.spark.sql.internal.SQLConf]].
   * If `parentSessionState` is not null, the `SessionState` will be a copy of the parent.
   * 跨会话隔离的对象
   * This is internal to Spark and there is no guarantee on interface stability.
   * @since 2.2.0
  lazy val sessionState: SessionState = {
      .getOrElse {
        val state = SparkSession.instantiateSessionState(


 * A class that holds all session-specific state in a given [[SparkSession]].
 * @param sharedState The state shared across sessions, e.g. global view manager, external catalog.
 * @param conf SQL-specific key-value configurations.
 * @param experimentalMethods Interface to add custom planning strategies and optimizers.
 * @param functionRegistry Internal catalog for managing functions registered by the user.
 * @param udfRegistration Interface exposed to the user for registering user-defined functions.
 * @param catalogBuilder a function to create an internal catalog for managing table and database
 *                       states.
 * @param sqlParser Parser that extracts expressions, plans, table identifiers etc. from SQL texts.
 * @param analyzerBuilder A function to create the logical query plan analyzer for resolving
 *                        unresolved attributes and relations.
 * @param optimizerBuilder a function to create the logical query plan optimizer.
 * @param planner Planner that converts optimized logical plans to physical plans.
 * @param streamingQueryManagerBuilder A function to create a streaming query manager to
 *                                     start and stop streaming queries.
 * @param listenerManager Interface to register custom
 *                        [[org.apache.spark.sql.util.QueryExecutionListener]]s.
 * @param resourceLoaderBuilder a function to create a session shared resource loader to load JARs,
 *                              files, etc.
 * @param createQueryExecution Function used to create QueryExecution objects.
 * @param createClone Function used to create clones of the session state.

private[sql] class SessionState(
    sharedState: SharedState,
    val conf: SQLConf,
    val experimentalMethods: ExperimentalMethods,
    val functionRegistry: FunctionRegistry,
    // NOTE(review): no @param entry describes this one; presumably the table-function
    // counterpart of functionRegistry -- confirm.
    val tableFunctionRegistry: TableFunctionRegistry,
    val udfRegistration: UDFRegistration,
    // The *Builder parameters are zero-argument functions that produce the component,
    // not the component itself.
    catalogBuilder: () => SessionCatalog,
    val sqlParser: ParserInterface,
    analyzerBuilder: () => Analyzer,
    optimizerBuilder: () => Optimizer,
    val planner: SparkPlanner,
    val streamingQueryManagerBuilder: () => StreamingQueryManager,
    val listenerManager: ExecutionListenerManager,
    resourceLoaderBuilder: () => SessionResourceLoader,
    // Takes a logical plan plus a command execution mode and returns a QueryExecution.
    createQueryExecution: (LogicalPlan, CommandExecutionMode.Value) => QueryExecution,
    // Takes a SparkSession and a SessionState and returns a SessionState.
    createClone: (SparkSession, SessionState) => SessionState,
    // NOTE(review): the next two parameters have no @param entry; their types are a sequence
    // of ColumnarRule and a sequence of rules over SparkPlan -- confirm intended semantics.
    val columnarRules: Seq[ColumnarRule],
    val queryStagePrepRules: Seq[Rule[SparkPlan]])


   * Executes a SQL query using Spark, returning the result as a `DataFrame`.
   * This API eagerly runs DDL/DML commands, but not for SELECT queries.
   * @since 2.0.0
  def sql(sqlText: String): DataFrame = withActive {
    val tracker = new QueryPlanningTracker
    // LogicPlan
    val plan = tracker.measurePhase(QueryPlanningTracker.PARSING) {
    // 转化为DataFrame
    Dataset.ofRows(self, plan, tracker)

// tracker对象
 * A simple utility for tracking runtime and associated stats in query planning.
 * There are two separate concepts we track:
 * 1. Phases: These are broad scope phases in query planning, as listed below, i.e. analysis,
 * optimization and physical planning (just planning).
 * 2. Rules: These are the individual Catalyst rules that we track. In addition to time, we also
 * track the number of invocations and effective invocations.
object QueryPlanningTracker{
   // Define a list of common phases here.
  val PARSING = "parsing"
  val ANALYSIS = "analysis"
  val OPTIMIZATION = "optimization"
  val PLANNING = "planning"

SQL Parse --> plan
/** Creates LogicalPlan for a given SQL string. */
  override def parsePlan(sqlText: String): LogicalPlan = parse(sqlText) {
    parser =>
    astBuilder.visitSingleStatement(parser.singleStatement()) match {
      case plan: LogicalPlan => plan
      case _ =>
        val position = Origin(None, None)
        throw QueryParsingErrors.sqlStatementUnsupportedError(sqlText, position)

plan --> DataFrame
/** A variant of ofRows that allows passing in a tracker so we can track query parsing time. */
  def ofRows(sparkSession: SparkSession, logicalPlan: LogicalPlan, tracker: QueryPlanningTracker)
    : DataFrame = sparkSession.withActive {
    val qe = new QueryExecution(sparkSession, logicalPlan, tracker)
    new Dataset[Row](qe, RowEncoder(qe.analyzed.schema))


 * The primary workflow for executing relational queries using Spark.  Designed to allow easy
 * access to the intermediate phases of query execution for developers.
 * While this is not a public class, we should avoid changing the function names for the sake of
 * changing them, because a lot of developers use the feature for debugging.
class QueryExecution(
    val sparkSession: SparkSession,
    val logical: LogicalPlan,
    // Defaults to a fresh QueryPlanningTracker when the caller does not supply one.
    val tracker: QueryPlanningTracker = new QueryPlanningTracker,
    // Defaults to CommandExecutionMode.ALL.
    val mode: CommandExecutionMode.Value = CommandExecutionMode.ALL) extends Logging


 * A Dataset is a strongly typed collection of domain-specific objects that can be transformed
 * in parallel using functional or relational operations. Each Dataset also has an untyped view
 * called a `DataFrame`, which is a Dataset of [[Row]].
 * Operations available on Datasets are divided into transformations and actions. Transformations
 * are the ones that produce new Datasets, and actions are the ones that trigger computation and
 * return results. Example transformations include map, filter, select, and aggregate (`groupBy`).
 * Example actions count, show, or writing data out to file systems.
 * Datasets are "lazy", i.e. computations are only triggered when an action is invoked. Internally,
 * a Dataset represents a logical plan that describes the computation required to produce the data.
 * When an action is invoked, Spark's query optimizer optimizes the logical plan and generates a
 * physical plan for efficient execution in a parallel and distributed manner. To explore the
 * logical plan as well as optimized physical plan, use the `explain` function.
 * 数据集是“惰性的”,即只有在调用操作时才会触发计算。在内部,数据集表示描述生成数据所需的计算的逻辑计划。
 * 当一个action 被调用时,Spark 的查询优化器会优化逻辑计划并生成一个以并行和分布式方式有效执行的物理计划。
 * 为了探索逻辑计划以及优化的物理计划,使用`explain`功能。
 * @groupname basic Basic Dataset functions
 * @groupname action Actions
 * @groupname untypedrel Untyped transformations
 * @groupname typedrel Typed transformations
 * @since 1.6.0
class Dataset[T] private[sql](
    // The constructor is private[sql]. Both fields are @transient and carry the
    // @DeveloperApi / @Unstable annotations.
    @DeveloperApi @Unstable @transient val queryExecution: QueryExecution,
    @DeveloperApi @Unstable @transient val encoder: Encoder[T])

查看一个SQL Explain的结果

sql = "select count(*) from ods.check_hive2_not_delete group by cityid"
sql_run = spark.sql(sql)
# Extended explain prints the parsed, analyzed and optimized logical plans as well as
# the physical plan.
sql_run.explain(True)

== Parsed Logical Plan ==
'Aggregate ['cityid], [unresolvedalias('count(1), None)]
+- 'UnresolvedRelation `ods`.`check_hive2_not_delete`

== Analyzed Logical Plan ==
count(1): bigint
Aggregate [cityid#85], [count(1) AS count(1)#95L]
+- SubqueryAlias check_hive2_not_delete
   +- HiveTableRelation `ods`.`check_hive2_not_delete`,, [id#84, cityid#85, lng#86, lat#87, prob#88, order_cnt#89, user_cnt#90, ratio#91, load_ratio#92, unload_ratio#93, 10m_dist_ratio#94]

== Optimized Logical Plan ==
Aggregate [cityid#85], [count(1) AS count(1)#95L]
+- Project [cityid#85]
   +- HiveTableRelation `ods`.`check_hive2_not_delete`,, [id#84, cityid#85, lng#86, lat#87, prob#88, order_cnt#89, user_cnt#90, ratio#91, load_ratio#92, unload_ratio#93, 10m_dist_ratio#94]

== Physical Plan ==
*(2) HashAggregate(keys=[cityid#85], functions=[count(1)], output=[count(1)#95L])
+- Exchange hashpartitioning(cityid#85, 200)
   +- *(1) HashAggregate(keys=[cityid#85], functions=[partial_count(1)], output=[cityid#85, count#98L])
      +- HiveTableScan [cityid#85], HiveTableRelation `ods`.`check_hive2_not_delete`,, [id#84, cityid#85, lng#86, lat#87, prob#88, order_cnt#89, user_cnt#90, ratio#91, load_ratio#92, unload_ratio#93, 10m_dist_ratio#94]

Action 算子最终触发 SparkContext 的 runJob 方法

   * Run a function on a given set of partitions in an RDD and pass the results to the given
   * handler function. This is the main entry point for all actions in Spark.
   * @param rdd target RDD to run tasks on
   * @param func a function to run on each partition of the RDD
   * @param partitions set of partitions to run on; some jobs may not want to compute on all
   * partitions of the target RDD, e.g. for operations like `first()`
   * @param resultHandler callback to pass each result to
  def runJob[T, U: ClassTag](
      rdd: RDD[T],
      func: (TaskContext, Iterator[T]) => U,
      partitions: Seq[Int],
      resultHandler: (Int, U) => Unit): Unit = {
    if (stopped.get()) {
      throw new IllegalStateException("SparkContext has been shutdown")
    val callSite = getCallSite
    val cleanedFunc = clean(func)
    logInfo("Starting job: " + callSite.shortForm)
    if (conf.getBoolean("spark.logLineage", false)) {
      logInfo("RDD's recursive dependencies:\n" + rdd.toDebugString)
    dagScheduler.runJob(rdd, cleanedFunc, partitions, callSite, resultHandler, localProperties.get)


   * Submit an action job to the scheduler.
   * @param rdd target RDD to run tasks on
   * @param func a function to run on each partition of the RDD
   * @param partitions set of partitions to run on; some jobs may not want to compute on all
   *   partitions of the target RDD, e.g. for operations like first()
   * @param callSite where in the user program this job was called
   * @param resultHandler callback to pass each result to
   * @param properties scheduler properties to attach to this job, e.g. fair scheduler pool name
   * @return a 
