第7课：Spark Streaming源码解读之JobScheduler内幕实现和深度思考

最新推荐文章于 2024-11-13 20:58:16 发布

chinsun_1

最新推荐文章于 2024-11-13 20:58:16 发布

阅读量575

点赞数

本文链接：https://blog.csdn.net/chinsun_1/article/details/51541867

版权

Spark定制班同时被 2 个专栏收录

8 篇文章 0 订阅

订阅专栏

SparkStreaming

1 篇文章 0 订阅

订阅专栏

JobScheduler 是 Spark Streaming 调度的核心，其地位相当于 Spark Core 中作为调度中心的 DAGScheduler。

StreamingContext的start方法

/**
 * Start the execution of the streams.
 *
 * Behavior depends on the current `state`: an INITIALIZED context is validated and its
 * scheduler is started, an ACTIVE context only logs a warning, and a STOPPED context
 * causes an exception.
 *
 * @throws IllegalStateException if the StreamingContext is already stopped.
 */
def start(): Unit = synchronized {
state match {
case INITIALIZED =>
startSite.set(DStream.getCreationSite())
      StreamingContext.ACTIVATION_LOCK.synchronized {
        StreamingContext.assertNoOtherContextIsActive()
try {
          validate()

          // Invoke ThreadUtils.runInNewThread.
          // Start the streaming scheduler in a new thread, so that thread local properties
          // like call sites and job groups can be reset without affecting those of the
          // current thread.
            ThreadUtils.runInNewThread("streaming-start") {//*1
            sparkContext.setCallSite(startSite.get)
            sparkContext.clearJobGroup()
            sparkContext.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "false")
            scheduler.start()  //*2
          }
state = StreamingContextState.ACTIVE
} catch {
case NonFatal(e) =>
            // On a non-fatal failure: log it, stop the scheduler, mark the context as
            // STOPPED and rethrow the original exception to the caller.
            logError("Error starting the context, marking it as stopped", e)
scheduler.stop(false)
state = StreamingContextState.STOPPED
throw e
        }
        StreamingContext.setActiveContext(this)
      }
// Register stopOnShutdown as a shutdown hook and keep the returned reference.
shutdownHookRef = ShutdownHookManager.addShutdownHook(
        StreamingContext.SHUTDOWN_HOOK_PRIORITY)(stopOnShutdown)
// Registering Streaming Metrics at the start of the StreamingContext
assert(env.metricsSystem != null)
env.metricsSystem.registerSource(streamingSource)
uiTab.foreach(_.attach())
      logInfo("StreamingContext started")
case ACTIVE =>
      logWarning("StreamingContext has already been started")
case STOPPED =>
throw new IllegalStateException("StreamingContext has already been stopped")
  }
}

接下来从具体代码调用的print方法角度跟踪查看

val adsClickStreamFormatted = adsClickStream.map { ads => (ads.split(" ")(1), ads) }

 validClicked.map(validClick => {validClick._2._1})
}).print

DStream的print方法

/**
 * Print the first ten elements of each RDD generated in this DStream. This is an output
 * operator, so this DStream will be registered as an output stream and therefore materialized.
 * Delegates to `print(10)`.
 */
def print(): Unit = ssc.withScope {
print(10)
}

/**
 * Print the first num elements of each RDD generated in this DStream. This is an output
 * operator, so this DStream will be registered as an output stream and therefore materialized.
 *
 * @param num maximum number of elements printed per RDD; "..." is printed afterwards when
 *            the RDD yielded more than `num` elements
 */
def print(num: Int): Unit = ssc.withScope {
def foreachFunc: (RDD[T], Time) => Unit = {
    (rdd: RDD[T], time: Time) => {
    // Take one element more than requested so that we can tell whether more than `num`
    // elements are available without fetching anything beyond that.
    val firstNum = rdd.take(num + 1)
    // scalastyle:off println
    println("-------------------------------------------")
    println("Time: " + time)
    println("-------------------------------------------")
      firstNum.take(num).foreach(println)
   if (firstNum.length > num) println("...")
      println()
      // scalastyle:on println
   }
  }
// Hand the printing function to foreachRDD, which registers the output operation.
foreachRDD(context.sparkContext.clean(foreachFunc), displayInnerRDDOps = false)
}
/**
 * Apply a function to each RDD in this DStream. This is an output operator, so
 * 'this' DStream will be registered as an output stream and therefore materialized.
 * @param foreachFunc foreachRDD function
 * @param displayInnerRDDOps Whether the detailed callsites and scopes of the RDDs generated
 *                           in the `foreachFunc` are to be displayed in the UI. If `false`, then
 *                           only the scopes and callsites of `foreachRDD` will override those
 *                           of the RDDs on the display.
 */
private def foreachRDD(
    foreachFunc: (RDD[T], Time) => Unit,
displayInnerRDDOps: Boolean): Unit = {
// Wrap this stream and the cleaned function in a ForEachDStream and register it.
// NOTE(review): the second argument of `clean` is not defined in this excerpt —
// presumably it disables the serializability check; confirm against SparkContext.clean.
new ForEachDStream(this,
context.sparkContext.clean(foreachFunc, false), displayInnerRDDOps).register()
}

/**
 * An internal DStream used to represent output operations like DStream.foreachRDD.
 * @param parent        Parent DStream
 * @param foreachFunc   Function to apply on each RDD generated by the parent DStream
 * @param displayInnerRDDOps Whether the detailed callsites and scopes of the RDDs generated
 *                           by `foreachFunc` will be displayed in the UI; if `false`, only the
 *                           scope and callsite of `DStream.foreachRDD` will be displayed.
 */
private[streaming]
class ForEachDStream[T: ClassTag] (
    parent: DStream[T],
foreachFunc: (RDD[T], Time) => Unit,
displayInnerRDDOps: Boolean
) extends DStream[Unit](parent.ssc) {

// The only dependency is the parent stream whose RDDs are consumed.
override def dependencies: List[DStream[_]] = List(parent)

override def slideDuration: Duration = parent.slideDuration

// This stream never produces an RDD of its own; its work happens in generateJob.
override def compute(validTime: Time): Option[RDD[Unit]] = None

/**
 * Build the job for the given batch time: if the parent yields an RDD for `time`, return a
 * Job whose function applies `foreachFunc` to that RDD; otherwise return None.
 */
override def generateJob(time: Time): Option[Job] = {
    parent.getOrCompute(time) match {
case Some(rdd) =>
val jobFunc = () => createRDDWithLocalProperties(time, displayInnerRDDOps) {
          foreachFunc(rdd, time)
        }
Some(new Job(time, jobFunc))  // Returns an ordinary Job built from time and jobFunc; generateJob only defines the job logically here, jobFunc is not invoked at this point
case None => None
    }
  }
}

实际的调用是从JobGenerator开始的，

它会调用DStreamGraph中的generateJobs方法：

/**
 * Ask every registered output stream to generate its job for the given batch time and
 * return the jobs that were produced. Each generated job is tagged with the creation
 * site of the output stream it came from.
 *
 * @param time batch time for which jobs are generated
 * @return the jobs generated by the output streams (streams returning None contribute nothing)
 */
def generateJobs(time: Time): Seq[Job] = {
  logDebug("Generating jobs for time " + time)
val jobs = this.synchronized {
outputStreams.flatMap { outputStream =>
val jobOption = outputStream.generateJob(time) // For the print() example, outputStream here is the ForEachDStream
      jobOption.foreach(_.setCallSite(outputStream.creationSite))
      jobOption
    }
  }
  logDebug("Generated " + jobs.length + " jobs for time " + time)
  jobs
}

再看JobGenerator中的generateJobs方法(L240)

/**
 * Generate jobs and perform checkpoint for the given `time`.
 *
 * Received blocks are first allocated to the batch, then the DStream graph generates the
 * jobs. On success the jobs are submitted to the JobScheduler as one JobSet; on failure the
 * error is reported to the JobScheduler. A DoCheckpoint event is posted in either case.
 */
private def generateJobs(time: Time) {
// Set the SparkEnv in this thread, so that job generation code can access the environment
  // Example: BlockRDDs are created in this thread, and it needs to access BlockManager
  // Update: This is probably redundant after threadlocal stuff in SparkEnv has been removed.
SparkEnv.set(ssc.env)
Try {
    jobScheduler.receiverTracker.allocateBlocksToBatch(time) // allocate received blocks to batch
graph.generateJobs(time) // generate jobs using allocated block
} match {
case Success(jobs) =>
val streamIdToInputInfos = jobScheduler.inputInfoTracker.getInfo(time)       // streamIdToInputInfos is the input info tracked for this batch time, i.e. the data the jobs will process
      jobScheduler.submitJobSet(JobSet(time, jobs, streamIdToInputInfos))
case Failure(e) =>
      jobScheduler.reportError("Error generating jobs for time " + time, e)
  }
// Posted whether job generation succeeded or failed.
eventLoop.post(DoCheckpoint(time, clearCheckpointDataLater = false))
}
/**
 * Submit the jobs of one batch for execution.
 *
 * An empty JobSet is only logged. Otherwise a batch-submitted event is posted to the
 * listener bus, the JobSet is recorded in `jobSets` under its batch time, and every job is
 * wrapped in a JobHandler and handed to `jobExecutor`.
 *
 * @param jobSet the group of jobs that belong to the same batch
 */
def submitJobSet(jobSet: JobSet) {
  if (jobSet.jobs.isEmpty) {
    logInfo("No jobs added for time " + jobSet.time)
  } else {
    listenerBus.post(StreamingListenerBatchSubmitted(jobSet.toBatchInfo))
    // jobSets maps a batch time to its JobSet. NOTE(review): its declaration is not part of
    // this excerpt; the original annotation states it is a ConcurrentHashMap — confirm.
    jobSets.put(jobSet.time, jobSet)
    // Each job is wrapped in a JobHandler (a Runnable) and executed by the jobExecutor.
    jobSet.jobs.foreach(job => jobExecutor.execute(new JobHandler(job)))
    logInfo("Added jobs for time " + jobSet.time)
  }
}
JobHandler类

 /**
  * Runnable that executes a single streaming Job: it sets the job description and the
  * batch-time / output-op-id local properties on the SparkContext, posts JobStarted, runs the
  * job, posts JobCompleted, and finally clears the two local properties again.
  */
 private class JobHandler(job: Job) extends Runnable with Logging {
    import JobScheduler._

    def run() {
      try {
        val formattedTime = UIUtils.formatBatchTime(
          job.time.milliseconds, ssc.graph.batchDuration.milliseconds, showYYYYMMSS = false)
        val batchUrl = s"/streaming/batch/?id=${job.time.milliseconds}"
        val batchLinkText = s"[output operation ${job.outputOpId}, batch time ${formattedTime}]"

        ssc.sc.setJobDescription(
          s"""Streaming job from <a href="$batchUrl">$batchLinkText</a>""")
        ssc.sc.setLocalProperty(BATCH_TIME_PROPERTY_KEY, job.time.milliseconds.toString)
        ssc.sc.setLocalProperty(OUTPUT_OP_ID_PROPERTY_KEY, job.outputOpId.toString)

        // We need to assign `eventLoop` to a temp variable. Otherwise, because
        // `JobScheduler.stop(false)` may set `eventLoop` to null when this method is running, then
        // it's possible that when `post` is called, `eventLoop` happens to be null.
        var _eventLoop = eventLoop
        if (_eventLoop != null) {
          _eventLoop.post(JobStarted(job, clock.getTimeMillis())) // record that the job has started
          // Disable checks for existing output directories in jobs launched by the streaming
          // scheduler, since we may need to write output to an existing directory during checkpoint
          // recovery; see SPARK-4835 for more details.
          PairRDDFunctions.disableOutputSpecValidation.withValue(true) {
            job.run() // the job is actually executed here
          }
          // Re-read `eventLoop`: it may have been set to null while the job was running.
          _eventLoop = eventLoop
          if (_eventLoop != null) {
            _eventLoop.post(JobCompleted(job, clock.getTimeMillis()))// record that the job has finished
          }
        } else {
          // JobScheduler has been stopped.
        }
      } finally {
        // Always clear the per-job local properties set above.
        ssc.sc.setLocalProperty(JobScheduler.BATCH_TIME_PROPERTY_KEY, null)
        ssc.sc.setLocalProperty(JobScheduler.OUTPUT_OP_ID_PROPERTY_KEY, null)
      }
    }
  }
}
Job的run方法中调用了func

class Job(val time: Time, func: () => _) {
  private var _id: String = _
  private var _outputOpId: Int = _
  private var isSet = false
  private var _result: Try[_] = null
  private var _callSite: CallSite = null
  private var _startTime: Option[Long] = None
  private var _endTime: Option[Long] = None

  /**
   * Invoke `func` and store its outcome in `_result`; because the call is wrapped in `Try`,
   * the outcome is recorded as a `Try` value (either the returned value or the failure).
   */
  def run() {
    _result = Try(func())
  }
Try中的func()其实就是ForEachDStream类中的jobFunc(Line 49)，

其中foreachFunc其实就是DStream的print方法中定义的foreachFunc（DStream源码第766行）：

 def foreachFunc: (RDD[T], Time)

JobScheduler参数注意

jobExecutor是一个线程池，线程的个数由参数配置,如下

// Size of the job-executor pool, read from "spark.streaming.concurrentJobs" (defaults to 1).
private val numConcurrentJobs = ssc.conf.getInt("spark.streaming.concurrentJobs", 1)
// Pool created with numConcurrentJobs as its size argument and the name prefix
// "streaming-job-executor"; submitJobSet hands each JobHandler to it.
private val jobExecutor =
  ThreadUtils.newDaemonFixedThreadPool(numConcurrentJobs, "streaming-job-executor")