第7课:Spark Streaming源码解读之JobScheduler内幕实现和深度思考

本期内容:

1. JobScheduler内幕实现

2. JobScheduler深度思考

 

所有工作的关键都是jobScheduler

Spark Streaming 至少需要设置两条线程:一条用于接收数据,一条用于计算;同时,调度和执行是相互分离的。

源码

StreamingContext.scala
start()

/**
 * Start the execution of the streams.
 *
 * Behaviour depends on the current `state`:
 *  - `INITIALIZED`: validates the context, starts the scheduler on a separate
 *    "streaming-start" thread, marks the context as ACTIVE, then registers the shutdown hook,
 *    the streaming metrics source and the UI tab.
 *  - `ACTIVE`: only logs a warning.
 *  - `STOPPED`: throws.
 *
 * @throws IllegalStateException if the StreamingContext is already stopped.
 */
def start(): Unit = synchronized {
  state match {
    case INITIALIZED =>
      startSite.set(DStream.getCreationSite())
      StreamingContext.ACTIVATION_LOCK.synchronized {
        StreamingContext.assertNoOtherContextIsActive()
        try {
          validate()

          // Start the streaming scheduler in a new thread, so that thread local properties
          // like call sites and job groups can be reset without affecting those of the
          // current thread.
          // A new thread is started here at the scheduling level.
          ThreadUtils.runInNewThread("streaming-start") {
            sparkContext.setCallSite(startSite.get)
            sparkContext.clearJobGroup()
            sparkContext.setLocalProperty(SparkContext.SPARK_JOB_INTERRUPT_ON_CANCEL, "false")
            scheduler.start()
          }
          state = StreamingContextState.ACTIVE
        } catch {
          case NonFatal(e) =>
            // Starting failed: stop the scheduler, record the STOPPED state and propagate.
            logError("Error starting the context, marking it as stopped", e)
            scheduler.stop(false)
            state = StreamingContextState.STOPPED
            throw e
        }
        StreamingContext.setActiveContext(this)
      }
      shutdownHookRef = ShutdownHookManager.addShutdownHook(
        StreamingContext.SHUTDOWN_HOOK_PRIORITY)(stopOnShutdown)
      // Registering Streaming Metrics at the start of the StreamingContext
      // `assert` and its argument list must stay on the same line; a line break between them
      // makes Scala treat them as two separate statements.
      assert(env.metricsSystem != null)
      env.metricsSystem.registerSource(streamingSource)
      uiTab.foreach(_.attach())
      logInfo("StreamingContext started")
    case ACTIVE =>
      logWarning("StreamingContext has already been started")
    case STOPPED =>
      throw new IllegalStateException("StreamingContext has already been stopped")
  }
}

runInNewThread

/**
 * Runs `body` on a newly created thread and blocks the calling thread until it has finished.
 *
 * If `body` completes normally its value is returned. If it throws an exception matched by
 * `NonFatal`, that same exception is rethrown in the calling thread after its stack trace has
 * been rewritten: the frames belonging to this helper are dropped and replaced by a single
 * placeholder element, followed by the caller's own stack.
 *
 * Throwables not matched by `NonFatal` are not captured here; in that case `result` keeps its
 * initial `null` value, which is what gets returned.
 *
 * @param threadName name given to the new thread
 * @param isDaemon whether the new thread is marked as a daemon thread (defaults to `true`)
 * @param body code to evaluate on the new thread
 * @tparam T type of the value produced by `body`
 * @return the value produced by `body`
 */
def runInNewThread[T](
    threadName: String,
    isDaemon: Boolean = true)(body: => T): T = {
  // Written by the worker thread, read by the caller after `join()`.
  @volatile var exception: Option[Throwable] = None
  @volatile var result: T = null.asInstanceOf[T]

  val thread = new Thread(threadName) {
    override def run(): Unit = {
      try {
        result = body
      } catch {
        case NonFatal(e) =>
          exception = Some(e)
      }
    }
  }
  thread.setDaemon(isDaemon)
  thread.start()
  thread.join()

  exception match {
    case Some(realException) =>
      // Remove the part of the stack that shows method calls into this helper method
      // This means drop everything from the top until the stack element
      // ThreadUtils.runInNewThread(), and then drop that as well (hence the `drop(1)`).
      val baseStackTrace = Thread.currentThread().getStackTrace().dropWhile(
        ! _.getClassName.contains(this.getClass.getSimpleName)).drop(1)

      // Remove the part of the new thread stack that shows methods call from this helper method
      val extraStackTrace = realException.getStackTrace.takeWhile(
        ! _.getClassName.contains(this.getClass.getSimpleName))

      // Combine the two stack traces, with a place holder just specifying that there
      // was a helper method used, without any further details of the helper
      val placeHolderStackElem = new StackTraceElement(
        s"... run in separate thread using ${ThreadUtils.getClass.getName.stripSuffix("$")} ..",
        " ", "", -1)
      val finalStackTrace = extraStackTrace ++ Seq(placeHolderStackElem) ++ baseStackTrace

      // Update the stack trace and rethrow the exception in the caller thread
      realException.setStackTrace(finalStackTrace)
      throw realException
    case None =>
      result
  }
}

/** The job scheduler owned by this context; it is constructed with a reference to `this`. */
private[streaming] val scheduler: JobScheduler = new JobScheduler(this)

JobScheduler.scala 中:

// Fixed-size pool of `numConcurrentJobs` threads, created with the
// "streaming-job-executor" name prefix.
private val jobExecutor = ThreadUtils.newDaemonFixedThreadPool(
  numConcurrentJobs,
  "streaming-job-executor")

newDaemonFixedThreadPool

/**
 * Wrapper over `Executors.newFixedThreadPool`. Thread names are formatted as prefix-ID, where
 * ID is a unique, sequentially assigned integer.
 *
 * @param nThreads number of threads in the pool
 * @param prefix prefix handed to `namedThreadFactory` for naming the pool's threads
 * @return the created pool, cast to `ThreadPoolExecutor`
 */
def newDaemonFixedThreadPool(nThreads: Int, prefix: String): ThreadPoolExecutor = {
  val fixedPool = Executors.newFixedThreadPool(nThreads, namedThreadFactory(prefix))
  fixedPool.asInstanceOf[ThreadPoolExecutor]
}

 

scheduler.start()

/**
 * Starts the job scheduler. Does nothing if it has already been started.
 *
 * In order: creates and starts the scheduler event loop, registers the input streams' rate
 * controllers as streaming listeners, starts the listener bus, creates the receiver tracker and
 * the input info tracker, then starts the receiver tracker and the job generator.
 */
def start(): Unit = synchronized {
  // A non-null event loop means the scheduler has already been started.
  if (eventLoop == null) {
    logDebug("Starting JobScheduler")
    eventLoop = new EventLoop[JobSchedulerEvent]("JobScheduler") {
      override protected def onReceive(event: JobSchedulerEvent): Unit = {
        processEvent(event)
      }

      override protected def onError(e: Throwable): Unit = {
        reportError("Error in job scheduler", e)
      }
    }
    eventLoop.start()

    // attach rate controllers of input streams to receive batch completion updates
    ssc.graph.getInputStreams.foreach { inputDStream =>
      inputDStream.rateController.foreach { rateController =>
        ssc.addStreamingListener(rateController)
      }
    }

    listenerBus.start(ssc.sparkContext)
    receiverTracker = new ReceiverTracker(ssc)
    inputInfoTracker = new InputInfoTracker(ssc)
    receiverTracker.start()
    jobGenerator.start()
    logInfo("Started JobScheduler")
  }
}

 

JobGenerator.start

 

/**
 * Start generation of jobs. Does nothing if the generator has already been started.
 *
 * Creates and starts the generator's event loop, then either restarts from a checkpoint (when
 * `ssc.isCheckpointPresent`) or starts for the first time.
 */
def start(): Unit = synchronized {
  // A non-null event loop means the generator has already been started.
  if (eventLoop == null) {
    // Call checkpointWriter here to initialize it before eventLoop uses it to avoid a deadlock.
    // See SPARK-10125
    checkpointWriter

    eventLoop = new EventLoop[JobGeneratorEvent]("JobGenerator") {
      override protected def onReceive(event: JobGeneratorEvent): Unit = {
        processEvent(event)
      }

      override protected def onError(e: Throwable): Unit =
        jobScheduler.reportError("Error in job generator", e)
    }
    eventLoop.start()

    val resumeFromCheckpoint = ssc.isCheckpointPresent
    if (resumeFromCheckpoint) restart() else startFirstTime()
  }
}

 

ForEachDStream

/**
 * A DStream of `Unit` that applies `foreachFunc` to each RDD of its parent stream.
 * It computes no RDD of its own (`compute` always returns `None`); its work is packaged as a
 * [[Job]] by `generateJob`.
 *
 * @param parent the stream whose RDDs are processed
 * @param foreachFunc function applied to each parent RDD together with the batch time
 * @param displayInnerRDDOps flag forwarded to `createRDDWithLocalProperties`
 */
private[streaming]
class ForEachDStream[T: ClassTag] (
    parent: DStream[T],
    foreachFunc: (RDD[T], Time) => Unit,
    displayInnerRDDOps: Boolean
  ) extends DStream[Unit](parent.ssc) {

  override def dependencies: List[DStream[_]] = parent :: Nil

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[Unit]] = None

  /**
   * Wraps the parent's RDD for `time`, if there is one, into a job whose function invokes
   * `foreachFunc` on that RDD. Returns `None` when the parent yields no RDD for `time`.
   */
  override def generateJob(time: Time): Option[Job] = {
    parent.getOrCompute(time).map { rdd =>
      val jobFunc = () => createRDDWithLocalProperties(time, displayInnerRDDOps) {
        foreachFunc(rdd, time)
      }
      new Job(time, jobFunc)
    }
  }
}

DStreamGraph.scala

/**
 * Asks every registered output stream to generate its job for `time`.
 *
 * Each generated job gets the creation site of its output stream as call site. Streams that
 * generate no job for `time` contribute nothing to the result. Generation runs while holding
 * this object's lock.
 *
 * @param time the batch time to generate jobs for
 * @return the generated jobs
 */
def generateJobs(time: Time): Seq[Job] = {
  logDebug("Generating jobs for time " + time)
  val generated = this.synchronized {
    outputStreams.flatMap { stream =>
      stream.generateJob(time).map { job =>
        job.setCallSite(stream.creationSite)
        job
      }
    }
  }
  logDebug("Generated " + generated.length + " jobs for time " + time)
  generated
}

// Mutable buffer of the graph's output streams; `generateJobs` iterates over it.
private val outputStreams: ArrayBuffer[DStream[_]] = new ArrayBuffer[DStream[_]]()

JobGenerator.generateJobs

JobSet 是基于时间(batch time)生成的一组 Job;streamIdToInputInfos 描述这些 Job 要处理的输入数据。

/**
 * Generate jobs and perform checkpoint for the given `time`.
 *
 * Received blocks are first allocated to the batch, then the graph generates the jobs. On
 * success the jobs are submitted to the job scheduler as a `JobSet`; on failure the error is
 * reported to the job scheduler. In both cases a `DoCheckpoint` event is posted afterwards.
 */
private def generateJobs(time: Time): Unit = {
  // Set the SparkEnv in this thread, so that job generation code can access the environment
  // Example: BlockRDDs are created in this thread, and it needs to access BlockManager
  // Update: This is probably redundant after threadlocal stuff in SparkEnv has been removed.
  SparkEnv.set(ssc.env)

  val generation = Try {
    // allocate received blocks to batch
    jobScheduler.receiverTracker.allocateBlocksToBatch(time)
    // generate jobs using allocated block
    graph.generateJobs(time)
  }

  generation match {
    case Failure(e) =>
      jobScheduler.reportError("Error generating jobs for time " + time, e)
    case Success(jobs) =>
      val streamIdToInputInfos = jobScheduler.inputInfoTracker.getInfo(time)
      val jobSet = JobSet(time, jobs, streamIdToInputInfos)
      jobScheduler.submitJobSet(jobSet)
  }

  eventLoop.post(DoCheckpoint(time, clearCheckpointDataLater = false))
}

/**
 * Submits all jobs of `jobSet` for execution.
 *
 * An empty job set is only logged. Otherwise a `StreamingListenerBatchSubmitted` event is
 * posted, the job set is recorded in `jobSets` under its time, and every job is handed to
 * `jobExecutor` wrapped in a `JobHandler`.
 */
def submitJobSet(jobSet: JobSet): Unit = {
  if (jobSet.jobs.nonEmpty) {
    listenerBus.post(StreamingListenerBatchSubmitted(jobSet.toBatchInfo))
    jobSets.put(jobSet.time, jobSet)
    for (job <- jobSet.jobs) {
      jobExecutor.execute(new JobHandler(job))
    }
    logInfo("Added jobs for time " + jobSet.time)
  } else {
    logInfo("No jobs added for time " + jobSet.time)
  }
}

JobScheduler

/**
 * Runnable that executes a single streaming [[Job]].
 *
 * It sets the job description and the batch-time / output-op-id local properties on the
 * SparkContext, posts `JobStarted` to the scheduler's event loop, runs the job, and posts
 * `JobCompleted`. The two local properties are always reset to null afterwards.
 */
private class JobHandler(job: Job) extends Runnable with Logging {
  import JobScheduler._

  def run(): Unit = {
    try {
      val formattedTime = UIUtils.formatBatchTime(
        job.time.milliseconds, ssc.graph.batchDuration.milliseconds, showYYYYMMSS = false)
      val batchUrl = s"/streaming/batch/?id=${job.time.milliseconds}"
      val batchLinkText = s"[output operation ${job.outputOpId}, batch time ${formattedTime}]"
      val description = s"""Streaming job from <a href="$batchUrl">$batchLinkText</a>"""

      ssc.sc.setJobDescription(description)
      ssc.sc.setLocalProperty(BATCH_TIME_PROPERTY_KEY, job.time.milliseconds.toString)
      ssc.sc.setLocalProperty(OUTPUT_OP_ID_PROPERTY_KEY, job.outputOpId.toString)

      // `eventLoop` must be read into a local before use: `JobScheduler.stop(false)` may set
      // it to null while this method is running, so checking the field and then calling
      // `post` on the field could hit a null.
      val loopAtStart = eventLoop
      if (loopAtStart != null) {
        loopAtStart.post(JobStarted(job, clock.getTimeMillis()))
        // Disable checks for existing output directories in jobs launched by the streaming
        // scheduler, since we may need to write output to an existing directory during
        // checkpoint recovery; see SPARK-4835 for more details.
        PairRDDFunctions.disableOutputSpecValidation.withValue(true) {
          job.run()
        }
        // Re-read the field: the scheduler may have been stopped while the job was running.
        val loopAtEnd = eventLoop
        if (loopAtEnd != null) {
          loopAtEnd.post(JobCompleted(job, clock.getTimeMillis()))
        }
      }
      // Otherwise the JobScheduler has been stopped and there is nothing to do.
    } finally {
      ssc.sc.setLocalProperty(BATCH_TIME_PROPERTY_KEY, null)
      ssc.sc.setLocalProperty(OUTPUT_OP_ID_PROPERTY_KEY, null)
    }
  }
}

// Event loop of the scheduler. It is null until `start()` assigns it; other code checks it
// against null to tell whether the scheduler is running.
private var eventLoop: EventLoop[JobSchedulerEvent] = null

 

// Excerpt of JobScheduler.start(): only the part that creates and starts the event loop is
// quoted here; the remainder of the method body is not shown in this excerpt.
def start(): Unit = synchronized {
 
if (eventLoop != null) return // scheduler has already been started

 
logDebug("Starting JobScheduler")
 
// Every event received by the loop is dispatched to processEvent; errors raised by the loop
// are forwarded to reportError.
eventLoop = new EventLoop[JobSchedulerEvent]("JobScheduler") {
   
override protected def onReceive(event: JobSchedulerEvent): Unit = processEvent(event)

   
override protected def onError(e: Throwable): Unit = reportError("Error in job scheduler", e)
 
}
  eventLoop.start()

 

/**
 * Dispatches a scheduler event to its handler.
 *
 * Any `Throwable` raised while handling the event is passed to `reportError` instead of
 * propagating to the caller.
 */
private def processEvent(event: JobSchedulerEvent): Unit = {
  try {
    event match {
      case JobStarted(startedJob, startedAt) =>
        handleJobStart(startedJob, startedAt)
      case JobCompleted(finishedJob, finishedAt) =>
        handleJobCompletion(finishedJob, finishedAt)
      case ErrorReported(message, cause) =>
        handleError(message, cause)
    }
  } catch {
    case failure: Throwable =>
      reportError("Error in job scheduler", failure)
  }
}

/**
 * Handles the start of `job`: updates its job set, records the job's start time and notifies
 * the listeners.
 *
 * `StreamingListenerBatchStarted` is posted only when the job set had not started before this
 * job; `StreamingListenerOutputOperationStarted` is posted for every job.
 *
 * @param job the job that started
 * @param startTime the start time to record on the job
 */
private def handleJobStart(job: Job, startTime: Long): Unit = {
  val jobSet = jobSets.get(job.time)
  // Must be captured before `jobSet.handleJobStart` is called.
  val batchAlreadyStarted = jobSet.hasStarted
  jobSet.handleJobStart(job)
  if (!batchAlreadyStarted) {
    // "StreamingListenerBatchStarted" should be posted after calling "handleJobStart" to get
    // the correct "jobSet.processingStartTime".
    val batchStarted = StreamingListenerBatchStarted(jobSet.toBatchInfo)
    listenerBus.post(batchStarted)
  }
  job.setStartTime(startTime)
  val outputOpStarted = StreamingListenerOutputOperationStarted(job.toOutputOperationInfo)
  listenerBus.post(outputOpStarted)
  logInfo("Starting job " + job.id + " from job set of time " + jobSet.time)
}

/**
 * Records the processing start time of this job set when a job starts.
 * A negative `processingStartTime` is treated as "not set yet"; once it has been set, later
 * calls leave it unchanged.
 */
def handleJobStart(job: Job): Unit = {
  val notYetStarted = processingStartTime < 0
  if (notYetStarted) {
    processingStartTime = System.currentTimeMillis()
  }
}

回到 JobScheduler 的内部类 JobHandler 的 run 方法中调用的 job.run():

Job 把业务逻辑封装在构造时传入的函数 func 里面,调用 run() 时才真正执行它。

// Excerpt of class Job: a unit of work for one batch `time`, wrapping the function `func`
// that holds the actual logic. Only the fields, `run()` and `result` are quoted here; the
// rest of the class (including its closing brace) is not shown in this excerpt.
class Job(val time: Time, func: () => _) {
 
private var _id: String = _
 
private var _outputOpId: Int = _
 
private var isSet = false
 
// Outcome of `func`; stays null until `run()` has been called.
private var
_result: Try[_] = null
 
private var
_callSite: CallSite = null
 
private var
_startTime: Option[Long] = None
 
private var _endTime: Option[Long] = None

 
// Executes `func` and stores its outcome (value or thrown exception) as a Try in `_result`.
def run() {
   
_result = Try(func())
 
}

  // Returns the stored outcome of `run()`.
  // Throws IllegalStateException if `run()` has not stored a result yet.
  def result: Try[_] = {
   
if (_result == null) {
     
throw new IllegalStateException("Cannot access result before job finishes")
   
}
    _result
 
}

ForEachDStream.scala

/**
 * Builds the job for `time`, if the parent stream has an RDD for it.
 *
 * The job's function calls `foreachFunc` on that RDD inside `createRDDWithLocalProperties`;
 * nothing is executed until the job itself is run.
 *
 * @param time the batch time
 * @return the job, or `None` when the parent yields no RDD for `time`
 */
override def generateJob(time: Time): Option[Job] = {
  val parentRdd = parent.getOrCompute(time)
  parentRdd.map { rdd =>
    val jobFunc = () => createRDDWithLocalProperties(time, displayInnerRDDOps) {
      foreachFunc(rdd, time)
    }
    new Job(time, jobFunc)
  }
}

如果想让作业并发执行,可以调整其并发度(JobScheduler 中的 spark.streaming.concurrentJobs 配置项,默认值为 1):

// Number of streaming jobs allowed to run concurrently; read from the
// "spark.streaming.concurrentJobs" setting, defaulting to 1 when it is not set.
private val numConcurrentJobs: Int =
  ssc.conf.getInt("spark.streaming.concurrentJobs", 1)

 

 

 

 

 

 

 

 

博主:罗白莲
资料来源于:王家林(Spark版本定制班课程)
新浪微博:http://www.weibo.com/ilovepains

 

  • 0
    点赞
  • 1
    收藏
    觉得还不错? 一键收藏
  • 0
    评论

“相关推荐”对你有帮助么?

  • 非常没帮助
  • 没帮助
  • 一般
  • 有帮助
  • 非常有帮助
提交
评论
添加红包

请填写红包祝福语或标题

红包个数最小为10个

红包金额最低5元

当前余额3.43前往充值 >
需支付:10.00
成就一亿技术人!
领取后你会自动成为博主和红包主的粉丝 规则
hope_wisdom
发出的红包
实付
使用余额支付
点击重新获取
扫码支付
钱包余额 0

抵扣说明:

1.余额是钱包充值的虚拟货币,按照1:1的比例进行支付金额的抵扣。
2.余额无法直接购买下载,可以购买VIP、付费专栏及课程。

余额充值